---
title: 'Homework #4'
author: "Spencer Pease"
date: "3/08/2021"
output: pdf_document
---
```{r setup, include=FALSE}
# Global knitr chunk options: hide code, warnings, and messages in the
# rendered report (analysis code is echoed in the Appendix via ref.label).
knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)
# Render NA cells in kable tables as "-" instead of "NA".
options(knitr.kable.NA = '-')
```
```{r}
# Prep work ---------------------------------------------------------------
library(dplyr)
library(ggplot2)
library(splines)
library(boot)
library(tree)
library(randomForest)
library(gbm)
# Project helpers (source in Appendix): evaluate_grid() appears to build a
# comparison plot of fitted models over a grid, and misclass_table() a table
# of train/test misclassification rates -- confirm against functions/.
source("functions/evaluate_grid.R")
source("functions/misclass_table.R")
# heart.RData is assumed to provide a data frame named `full`.
load("data/heart.RData")
data_heart <- as_tibble(full)
rm(full)
```
# Question 1
## Part (a, b)
```{r}
# Question 1a,b -----------------------------------------------------------

#' Simulate n observations from Y = f(X) + eps, where
#' X ~ Uniform(-1, 1), eps ~ Normal(0, noise_sd^2), and
#' f(x) = 3 - 2x + 3x^3.
#'
#' @param n Number of observations to generate.
#' @param noise_sd Standard deviation of the additive noise. Defaults to
#'   0.1, matching the original homework specification, but is now a
#'   parameter so other noise levels can be simulated.
#' @return A list with elements X (predictor), Y (response), and f (the
#'   true regression function).
gen_sim_data <- function(n, noise_sd = 0.1) {
  X <- runif(n = n, min = -1, max = 1)
  noise <- rnorm(n = n, mean = 0, sd = noise_sd)
  f <- function(x) 3 - 2 * x + 3 * x^3
  Y <- f(X) + noise
  list(X = X, Y = Y, f = f)
}

set.seed(2)
data_sim <- gen_sim_data(n = 50)
```
For this exercise, $50$ observations are generated from the function
$$ Y = f(X) + \epsilon $$
where
$$
\begin{aligned}
X &\sim \text{Uniform}(-1, 1) \\
\epsilon &\sim \text{Normal}(0, 0.1^2) \\
f(X) &= 3 - 2X + 3X^3
\end{aligned}
$$
## Part (c)
```{r}
# Question 1c -------------------------------------------------------------

# Fit two smoothing splines to the simulated data: one heavily smoothed
# (lambda = 1e-3) and one nearly interpolating (lambda = 1e-7).
model_spline3 <- smooth.spline(data_sim$X, data_sim$Y, lambda = 1e-3)
model_spline7 <- smooth.spline(data_sim$X, data_sim$Y, lambda = 1e-7)
```
Two smoothing spline models, with $\lambda = 10^{-3}$ and $\lambda = 10^{-7}$,
are then fit to the generated dataset.
## Part (d)
```{r}
# Question 1d -------------------------------------------------------------
# Compare both spline fits against the true function f over the data
# (evaluate_grid() is a project helper; its source is in the Appendix).
pred_sim_spline <- evaluate_grid(
data_sim$X, data_sim$Y,
pred_list = list(
"Reference" = data_sim$f,
# predict() on a smooth.spline object returns a list; $y holds the fits
"Spline (lambda = 1e-3)" = function(x) predict(model_spline3, x)$y,
"Spline (lambda = 1e-7)" = function(x) predict(model_spline7, x)$y
)
)
```
```{r q1d-results}
pred_sim_spline$plot + labs(title = "Spline Model Evaluation")
```
## Part (e)
```{r}
# Question 1e -------------------------------------------------------------

# Select lambda automatically via leave-one-out cross-validation (cv = TRUE).
model_spline_cv <- smooth.spline(data_sim$X, data_sim$Y, cv = TRUE)
```
```{r q1e-results}
# Optimal lambda formatted in scientific notation, reported inline below.
q1e <- list(sprintf("%0.3e", model_spline_cv$lambda))
```
Fitting a cross-validated smoothing spline model to the generated dataset shows
that the optimal value of $\lambda$ is `r q1e[[1]]`.
## Part (f)
```{r}
# Question 1f -------------------------------------------------------------
# Compare the CV-selected spline against the true function
# (evaluate_grid() is a project helper; its source is in the Appendix).
pred_sim_spline_cv <- evaluate_grid(
data_sim$X, data_sim$Y,
pred_list = list(
"Reference" = data_sim$f,
"CV Spline" = function(x) predict(model_spline_cv, x)$y
)
)
```
```{r q1f-results}
pred_sim_spline_cv$plot + labs(title = "CV Spline Model Evaluation")
```
## Part (g)
```{r}
# Question 1g -------------------------------------------------------------

#' boot() statistic: refit both smoothing splines on a bootstrap resample
#' and return each model's prediction at `pred_point`.
#'
#' @param data Data frame with columns X and Y.
#' @param indices Row indices selected by boot() for this replicate.
#' @param pred_point Point at which to evaluate the fitted splines.
#' @return Named numeric vector of the two predictions.
estimate_spline <- function(data, indices, pred_point) {
  data_boot <- data[indices, ]
  spline3 <- with(data_boot, smooth.spline(X, Y, lambda = 1e-3))
  spline7 <- with(data_boot, smooth.spline(X, Y, lambda = 1e-7))
  c(
    "spline3" = predict(spline3, pred_point)$y,
    "spline7" = predict(spline7, pred_point)$y
  )
}

# 1000 bootstrap replicates of the fitted value at X = 0 for each lambda.
model_spline_boot <-
  with(data_sim, tibble(X = X, Y = Y)) %>%
  boot(estimate_spline, R = 1000, pred_point = 0)

# Bootstrap variance of each estimator at X = 0. Note: `.cols` is spelled
# out as everything() -- calling across() without `.cols` relies on a
# deprecated dplyr default.
summary_spline_boot <- model_spline_boot$t %>%
  as_tibble(.name_repair = "unique") %>%
  rename(spline3 = 1, spline7 = 2) %>%
  summarise(across(everything(), var, .names = "{.col}_var"))
```
```{r q1g-results}
# Bootstrap variances at X = 0, pre-formatted as scientific-notation
# strings (so kable's numeric `digits` option is not needed). The stray
# `eval = FALSE` chunk option that had been passed as a kable() argument
# (where it is silently swallowed by `...`) is removed.
knitr::kable(
  summary_spline_boot %>% mutate(across(everything(), ~sprintf("%0.3e", .x))),
  caption = "Bootstrapped variance at X = 0",
  col.names = c(
    "$\\hat{f}_{\\lambda = 10^{-3}}$", "$\\hat{f}_{\\lambda = 10^{-7}}$"
  )
)
```
Looking at the estimated variance of the fitted value at $X = 0$ across $1000$ bootstrapped datasets
for each spline model, we see that the model with greater smoothing had a
lower variance. This is consistent with our understanding of how a less smooth
model will overfit to the data more, leading to greater variation in estimates
when fit to many bootstrapped datasets.
# Question 2
## Part (a)
```{r}
# Question 2a -------------------------------------------------------------

# Class balance of the outcome variable.
obs_by_class <- data_heart %>%
  count(Disease, name = "observations")

# Reproducible train/test split: shuffle all rows once, then take the
# first 200 for training and the remainder for testing.
train_obs <- 200
set.seed(2)
df_shuffled <- data_heart[sample(nrow(data_heart)), ]
train_idx <- seq_len(train_obs)
df_train <- df_shuffled[train_idx, ]
df_test <- df_shuffled[-train_idx, ]
```
```{r q2a-results}
knitr::kable(obs_by_class, caption = "Observations by disease class")
```
The *heart* dataset has a sample size of $n =$ `r nrow(data_heart)` with
$p =$ `r ncol(data_heart) - 1` predictors. *Table 2* shows a summary of the number
of observations in each class.
For model training, the *heart* dataset is split into a training set of
`r nrow(df_train)` observations and test set of `r nrow(df_test)` observations.
## Part (b, c)
```{r}
# Question 2b -------------------------------------------------------------
# Grow a classification tree on the training set using all predictors
# (default tree() stopping rules yield the "overgrown" tree).
model_tree_overgrown <- tree(Disease ~ ., data = df_train)
# Question 2c -------------------------------------------------------------
# Train/test misclassification rates (misclass_table() is a project helper;
# `type = "class"` is presumably forwarded to predict() -- see Appendix).
misclass_tree_overgrown <- misclass_table(
model_tree_overgrown,
df_train,
df_test,
type = "class"
)
```
```{r q2c-results}
# Misclassification table for the overgrown tree.
knitr::kable(
misclass_tree_overgrown,
caption = "Misclassification error rates for the overgrown model",
digits = 3
)
```
The overgrown tree model has a lower misclassification rate when predicting the
training data class than when predicting the test data, which is expected since
the model is overfit to the training data.
## Part (d)
```{r}
# Question 2d -------------------------------------------------------------

set.seed(2)

# Cross-validated cost-complexity pruning, scored by misclassification.
model_tree_cv <- cv.tree(model_tree_overgrown, FUN = prune.misclass)

# Size with the smallest CV error; on ties which.min() keeps the first
# entry, and cv.tree() lists sizes in decreasing order.
optimal_tree_size <- with(model_tree_cv, size[which.min(dev)])

# Prune with prune.misclass() so the pruning criterion matches the one
# used in cross-validation above (plain prune.tree() defaults to pruning
# by deviance, which is inconsistent with FUN = prune.misclass).
model_tree_prune <- prune.misclass(model_tree_overgrown, best = optimal_tree_size)

# CV error curve as a function of subtree size.
plot_tree_prune <-
  with(model_tree_cv, tibble(size = size, misclass_err = dev)) %>%
  ggplot(aes(x = size, y = misclass_err)) +
  geom_step() +
  geom_point(color = "red3") +
  theme_bw(base_family = "serif") +
  labs(
    title = "Misclassification Error vs Subtree Size",
    x = "Subtree Size",
    # NOTE(review): cv.tree's `dev` here is a CV misclassification count,
    # not a rate -- the axis label keeps the report's original wording.
    y = "Misclassification Error"
  )
```
```{r q2d1-results, fig.asp=1.1}
# Draw the overgrown tree, then overlay the pruned tree in translucent red
# on the same device via par(new = TRUE) with adjusted outer margins.
plot(model_tree_overgrown)
text(model_tree_overgrown, pretty = 0, cex = .6)
title("Overgrown and Pruned Tree")
par(new = TRUE, oma = c(1, 0, .1, 2.5))
plot(model_tree_prune, col = alpha("red", .4))
# Pruned-tree labels intentionally suppressed to avoid overplotting:
# text(model_tree_prune, pretty = 0, cex = .8)
```
*Key: Black = overgrown tree; Red = pruned tree branches*
```{r q2d2-results}
plot_tree_prune
```
## Part (e)
```{r}
# Question 2e -------------------------------------------------------------

# Train/test misclassification rates for the pruned (optimal-size) tree.
misclass_tree_opt <- misclass_table(
  model_tree_prune, df_train, df_test, type = "class"
)
```
```{r q2e-results}
# Misclassification table for the pruned tree.
knitr::kable(
misclass_tree_opt,
caption = "Misclassification error rates for the optimal subtree model",
digits = 3
)
```
The training misclassification rate of the optimal subtree model is higher than
that of the overgrown tree, which is expected as there are fewer splits to
overfit on. The test misclassification rate is higher than its training
counterpart (as expected) and that of the overgrown model. This may be an
artifact of sampling, but it also shows how fitting the optimal subtree will
only sacrifice a little accuracy, if any, for a corresponding reduction in
variance.
## Part (f)
```{r}
# Question 2f -------------------------------------------------------------

set.seed(2)

# Bagging = random forest with all p predictors available at every split
# (mtry = p), so trees differ only through the bootstrap resamples.
model_tree_bag <- randomForest(
  Disease ~ .,
  data = df_train,
  mtry = ncol(df_train) - 1,
  importance = TRUE
)

# Train/test misclassification rates via the project helper. (A dead,
# commented-out hand-rolled version of this computation was removed.)
misclass_tree_bag <- misclass_table(model_tree_bag, df_train, df_test)
```
```{r q2f-results}
# Misclassification table for the bagged ensemble.
knitr::kable(
misclass_tree_bag,
caption = "Misclassification error rates for the bagged tree model",
digits = 3
)
```
The bagging model is able to reduce the training misclassification to zero, as
well as reduce the test misclassification compared to the optimal pruned tree
model. This shows the power bootstrapping data has for reducing variance in
a tree model.
## Part (g)
```{r}
# Question 2g -------------------------------------------------------------

set.seed(2)

# Random forest: decorrelate the bagged trees by sampling roughly one
# third of the predictors at each split (mtry = floor(p / 3)).
model_tree_rf <- randomForest(
  Disease ~ .,
  data = df_train,
  mtry = floor((ncol(df_train) - 1) / 3),
  importance = TRUE
)

# Train/test misclassification rates via the project helper. (An unused
# duplicate list `misclass_tree_rf2` was removed -- it was computed but
# never referenced anywhere in the report.)
misclass_tree_rf <- misclass_table(model_tree_rf, df_train, df_test)
```
```{r q2g-results}
# Misclassification table for the random forest.
knitr::kable(
misclass_tree_rf,
caption = "Misclassification error rates for the random forest model",
digits = 3
)
```
The random forest model maintains the zero misclassification error for the
training set, and reduces the test misclassification further. This shows that
sampling a subset of predictors when bootstrapping is another way to create a
model more robust to new observations.
## Part (h)
Both bagged trees and random forests use bootstrapped data to fit their models.
Since bootstrapping involves randomly sampling training data, there is no
innate guarantee that a model will be fit using the same data sample each time,
leading to randomness in the final results.
In addition, random forests select a subset of predictors for each
bagged tree. The selection process also has elements of randomness, so again
fitting the same model will produce different results.
## Part (i)
```{r}
# Question 2i -------------------------------------------------------------
# gbm() with a Bernoulli loss needs a 0/1 numeric outcome, so recode
# Disease (assumed to be a two-level factor: first level -> 0, second -> 1).
df_train_int <- df_train %>% mutate(Disease = as.integer(Disease) - 1L)
df_test_int <- df_test %>% mutate(Disease = as.integer(Disease) - 1L)
set.seed(2)
# Boosted classification trees: 500 depth-2 learners, shrinkage 0.1.
model_tree_boost <- gbm(
Disease ~ .,
data = df_train_int,
distribution = "bernoulli",
n.trees = 500,
shrinkage = 0.1,
interaction.depth = 2
)
# Classify by thresholding predicted probabilities at 0.5. NOTE(review):
# bayes_thresh/type are presumably handled inside misclass_table(), and
# gbm predictions need an n.trees argument -- assumed handled there too;
# confirm against functions/misclass_table.R.
misclass_tree_boost <- misclass_table(
model_tree_boost,
df_train_int,
df_test_int,
bayes_thresh = 0.5,
type = "response"
)
```
```{r q2i-results}
# Misclassification table for the boosted ensemble.
knitr::kable(
misclass_tree_boost,
caption = "Misclassification error rates for the boosted tree model",
digits = 3
)
```
The boosted tree model has the best performance of all the tree models, with
zero training classification error and the lowest test misclassification error.
By sequentially building off of the residuals of previous splits, boosted
trees target difficult classification cases for improvement and perform better
than other methods.
\newpage
# Appendix
## Analysis
```{r getlabels, include=FALSE}
# Collect every chunk label except the housekeeping chunks and the
# display-only "*results*" chunks, so the Appendix echoes analysis code only.
labs <- knitr::all_labels()
labs <- labs[!(labs %in% c("setup", "toc", "getlabels", "allcode") | grepl("results", labs))]
```
```{r allcode, ref.label=labs, eval=FALSE, echo=TRUE}
```
## Helper Functions
```{r code=readLines("functions/evaluate_grid.R"), eval=FALSE, echo=TRUE}
```
```{r code=readLines("functions/misclass_table.R"), eval=FALSE, echo=TRUE}
```