-
Notifications
You must be signed in to change notification settings - Fork 27
/
004_InOutSampleErrors.R
41 lines (34 loc) · 1.31 KB
/
004_InOutSampleErrors.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
library(kernlab)
data("spam")
set.seed(333)
# Draw 10 random rows of spam to serve as a tiny training sample.
smallSpam <- spam[sample(nrow(spam), size = 10), ]
# Plot colour index per sampled row: 2 when type is "spam", 1 otherwise.
spamLabel <- as.numeric(smallSpam$type == "spam") + 1
plot(smallSpam$capitalAve, col = spamLabel)
# First predictor: label messages from their capitalAve value using cutoffs
# tailored to the small sample, so it is accurate there but overfits it.
#
# x: vector of capitalAve values.
# Returns a character vector as long as x holding "spam" or "nonspam";
# entries whose comparisons evaluate to NA (e.g. NA input) stay NA.
rule1 <- function(x) {
  # The narrow [2.40, 2.45] band is a special case added for one specific
  # spam message that the plain 2.7 cutoff cannot capture.
  spam_hits <- x > 2.7 | (x >= 2.40 & x <= 2.45)
  nonspam_hits <- x < 2.4 | (x > 2.45 & x <= 2.70)

  labels <- rep(NA_character_, length(x))
  labels[spam_hits] <- "spam"
  labels[nonspam_hits] <- "nonspam"
  labels
}
# Show how rule1 performs on the small training sample as a cross-tabulation:
# rows are the labels predicted by rule1, columns the actual smallSpam$type.
table(rule1(smallSpam$capitalAve), smallSpam$type)
# Second predictor: a single capitalAve cutoff at 2.8, without the
# point-specific special case used by the first predictor.
#
# x: vector of capitalAve values.
# Returns a character vector as long as x: "spam" above 2.8, "nonspam" at or
# below it; entries whose comparison evaluates to NA stay NA.
rule2 <- function(x) {
  above_cutoff <- x > 2.8

  labels <- rep(NA_character_, length(x))
  labels[above_cutoff] <- "spam"
  labels[!above_cutoff] <- "nonspam"
  labels
}
# Cross-tabulate rule2's predictions (rows) against the actual labels
# (columns) on the same small sample, for comparison with rule1.
table(rule2(smallSpam$capitalAve), smallSpam$type)
# Apply both predictors to the whole dataset and display, for each one, the
# table of predicted labels (rows) against actual labels (columns).
table(rule1(spam$capitalAve), spam$type)
table(rule2(spam$capitalAve), spam$type)
# Proportion of messages in the whole dataset that rule1 labels correctly.
mean(rule1(spam$capitalAve)==spam$type)
# Number of messages each predictor labels correctly on the whole dataset
# (these sums count matches with spam$type, not errors).
sum(rule1(spam$capitalAve)==spam$type)
sum(rule2(spam$capitalAve)==spam$type)