#-------------------------------------------------------------
# Leave-one-out Cross-validation (LOOCV) for regression models
# in R
#-------------------------------------------------------------

# 1. Generate artificial data
set.seed(2023)
n <- 300
x <- rnorm(n = n, mean = 5, sd = 3)
y <- x^2 + rnorm(n = n, mean = 0, sd = 8)
data <- data.frame(x = x, y = y)

# 2. Plot the two candidate models
par(mfrow = c(1, 2), pty = "s")

model1 <- lm(y ~ x, data = data)
plot(x = x, y = y, main = 'Linear model', cex = 1.1, pch = 1, lwd = 1.2)
yhat1 <- model1$coef[1] + model1$coef[2] * x
lines(x[order(x)], yhat1[order(x)], lwd = 2.5, col = 'red')

model2 <- lm(y ~ x + I(x^2), data = data)
plot(x = x, y = y, main = 'Quadratic model', cex = 1.1, pch = 1, lwd = 1.2)
yhat2 <- model2$coef[1] + model2$coef[2] * x + model2$coef[3] * x^2
lines(x[order(x)], yhat2[order(x)], lwd = 2.5, col = 'red')

# 3. Cross-validation
# Refit each model on the n leave-one-out samples and predict
# the held-out observation
pred.cv.mod1 <- pred.cv.mod2 <- numeric(n)

for (i in 1:n) {

  # linear model
  mod1 <- lm(y ~ x, data = data, subset = -i)
  pred.cv.mod1[i] <- predict(mod1, newdata = data[i, ])

  # quadratic model
  mod2 <- lm(y ~ x + I(x^2), data = data, subset = -i)
  pred.cv.mod2[i] <- predict(mod2, newdata = data[i, ])
}

# LOOCV estimates of the prediction error (theta_hat_pe)
MSE1 <- (1/n) * sum((y - pred.cv.mod1)^2)
MSE2 <- (1/n) * sum((y - pred.cv.mod2)^2)
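
# Optional sanity check (a sketch, not part of the original workflow):
# for models fit by ordinary least squares, the LOOCV residual for
# observation i has the closed form (y_i - yhat_i) / (1 - h_ii), where
# h_ii are the hat (leverage) values, so the loop above can be verified
# from a single fit on the full data:
h1 <- hatvalues(model1)
h2 <- hatvalues(model2)
MSE1.check <- mean(((y - fitted(model1)) / (1 - h1))^2)
MSE2.check <- mean(((y - fitted(model2)) / (1 - h2))^2)
# MSE1.check and MSE2.check should match MSE1 and MSE2 up to
# floating-point precision.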

# Root Mean Squared Error (RMSE)
sqrt(c(MSE1, MSE2))
# [1] 15.68599 7.99332
# The second (quadratic) model has the lower RMSE and is therefore preferred.
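
# Alternative route (assumes the 'boot' package is installed):
# boot::cv.glm() performs LOOCV when K is left at its default of n;
# the first element of its $delta component is the raw
# cross-validation estimate of the MSE.
library(boot)
cv1 <- cv.glm(data, glm(y ~ x, data = data))
cv2 <- cv.glm(data, glm(y ~ x + I(x^2), data = data))
sqrt(c(cv1$delta[1], cv2$delta[1]))  # should agree with the RMSEs above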

#----
# end
#----