# Classification_Algorithms.R
library(MASS)
library(smallstuff)
library(boot)
library(class)
library(leaps)
library(broom)
library(glmnet)
library(data.table)
library(gam)
library(ggplot2)
#library(tidyr)
# The following code is based on the 9 steps mentioned in the final project
# instructions file
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# 1)
data = fread('../data/HTRU_2.csv')
setDT(data)  # fread already returns a data.table, so this is just a safeguard
# Our target variable indicates whether a star is a pulsar (1) or not (0);
# we will be using all columns in our analysis
summary(data)
# No missing values were found
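# As an explicit check (a small sketch; equivalent to scanning summary() above),
# we can count the NAs in each column:
colSums(is.na(data))  # all zeros confirms no missing values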
# We only have one categorical variable, which is the target class
class(data$target_class)
# We can convert it to a factor as follows:
data[, target_class:=factor(target_class)]
class(data$target_class)
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# 2)
# Splitting the data into training and test sets,
# with 80% of the data used for training and 20% for testing
set.seed(42069)
te=sort(sample(nrow(data),round2(nrow(data)/5)))
PulsarTrain=data[-te,]
PulsarTest=data[te,]
dim(PulsarTest) # 3580 x 9
dim(PulsarTrain) # 14318 x 9
summary(PulsarTest$target_class) # 334/3246
summary(PulsarTrain$target_class) # 1305/13013
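# As a sanity check on the split (a quick sketch), we can compare the class
# proportions in the two sets; both should show roughly 9% pulsars:
prop.table(table(PulsarTrain$target_class))
prop.table(table(PulsarTest$target_class))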
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# 3)
# Data Source:
# https://archive.ics.uci.edu/ml/datasets/HTRU2#
# The data set shared here contains 16,259 spurious examples caused by RFI/noise,
# and 1,639 real pulsar examples. These examples have all been checked by human annotators.
# We chose it because we are passionate about astronomy and physics.
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# 4) Creating all models and evaluating their performance
error_rates = rep(0, 7)
# LDA model - All predictors
ldamod=lda(target_class~., PulsarTrain)
predl=predict(ldamod)
mean(predl$class!=PulsarTrain$target_class) # Training set: 2.49% error rate
set.seed(42069);CVerror(ldamod,5) #5-fold CV error rate=2.535
set.seed(42069);CVerror(ldamod,10) #10-fold CV error rate=2.493
predl_test=predict(ldamod, PulsarTest)
mean(predl_test$class!=PulsarTest$target_class)
# Test set: 2.63% error rate
error_rates[1] = mean(predl_test$class!=PulsarTest$target_class)
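# For a closer look at where the LDA errors occur (an optional sketch, not part
# of the original analysis), we can tabulate predictions against the truth:
table(Predicted=predl_test$class, Actual=PulsarTest$target_class)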
#-------------------------------------------------------------------------------
# QDA model - All predictors
qdamod=qda(target_class~., PulsarTrain)
predq=predict(qdamod)
mean(predq$class!=PulsarTrain$target_class) # Training set: 3.23% error rate
set.seed(42069);CVerror(qdamod,5) #5-fold CV error rate=3.255
set.seed(42069);CVerror(qdamod,10) #10-fold CV error rate=3.248
predq_test=predict(qdamod, PulsarTest)
mean(predq_test$class!=PulsarTest$target_class)
# Test set: 3.29% error rate
error_rates[2] = mean(predq_test$class!=PulsarTest$target_class)
#-------------------------------------------------------------------------------
# Logistic Regression - All predictors
gmod=glm(target_class~., binomial, PulsarTrain)
logistErrorRate(gmod, PulsarTrain) # 1.98% Training error rate
set.seed(42069);CVerror(gmod,5) #5-fold CV error rate=2.018
set.seed(42069);CVerror(gmod,10) #10-fold CV error rate=2.011
logistErrorRate(gmod, PulsarTest)
# 2.291% Testing error rate
summary(gmod)
# This shows that 2 predictors are not significant:
# Excess kurtosis of the DM-SNR curve, and
# Skewness of the DM-SNR curve
# Let's see how it performs without these predictors:
#-------------------------------------------------------------------------------
# Logistic Regression - Reduced model 1
gmod2=glm(target_class~., binomial, PulsarTrain[,-"Excess kurtosis of the DM-SNR curve"])
logistErrorRate(gmod2, PulsarTrain) # 1.99% Training error rate
set.seed(42069);CVerror(gmod2,5) #5-fold CV error rate=2.025
set.seed(42069);CVerror(gmod2,10) #10-fold CV error rate=2.032
logistErrorRate(gmod2, PulsarTest)
# 2.291% Testing error rate
# Removing this predictor did not impact our testing error rate
error_rates[3] = logistErrorRate(gmod2, PulsarTest)$errorRate/100
#-------------------------------------------------------------------------------
# Logistic Regression - Reduced model 2
gmod3=glm(target_class~., binomial, PulsarTrain[,-c("Excess kurtosis of the DM-SNR curve",
"Skewness of the DM-SNR curve")])
logistErrorRate(gmod3, PulsarTrain) # 1.99% Training error rate
set.seed(42069);CVerror(gmod3,5) #5-fold CV error rate=1.997%
set.seed(42069);CVerror(gmod3,10) #10-fold CV error rate=1.997%
logistErrorRate(gmod3, PulsarTest)
# 2.37% Testing error rate
summary(gmod3)
# All predictors are significant; testing accuracy is only slightly worse
#-------------------------------------------------------------------------------
# Ridge Regression
X1=model.matrix(target_class~.,PulsarTrain)[,-1]
X1t=model.matrix(target_class~.,PulsarTest)[,-1]
rmod=glmnet(X1,PulsarTrain$target_class, "binomial", alpha=0)
rmod$lambda
lambdas=c(300:2,300:0/1000)
rmod=glmnet(X1,PulsarTrain$target_class, family = "binomial", alpha=0,lambda=lambdas)
set.seed(42069)
(lamr=cv.glmnet(X1,PulsarTrain$target_class, family = "binomial", alpha=0,lambda=lambdas)$lambda.min)
# Lambda is 0
#Test error rate for ridge regression
pred=predict(rmod,X1t,s=lamr,type="response")
yhat=rep(levels(PulsarTest$target_class)[1],nrow(PulsarTest))
yhat[pred[,1]>.5]=levels(PulsarTest$target_class)[2]
mean(PulsarTest$target_class!=yhat)
# Error Rate: 2.291%
error_rates[4] = mean(PulsarTest$target_class!=yhat)
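# Since the selected lambda is 0, the ridge fit should essentially reproduce the
# plain logistic regression coefficients; a quick sketch to verify:
coef(rmod, s=lamr)  # compare with coef(gmod)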
#-------------------------------------------------------------------------------
# Lasso Regression
lassomod=glmnet(X1,PulsarTrain$target_class, family="binomial", lambda=lambdas)
set.seed(42069)
(laml=cv.glmnet(X1,PulsarTrain$target_class, family="binomial", lambda=lambdas)$lambda.min)
#Test error rate for lasso regression
pred=predict(lassomod,X1t,s=laml,type="response")
yhat=rep(levels(PulsarTest$target_class)[1],nrow(PulsarTest))
yhat[pred[,1]>.5]=levels(PulsarTest$target_class)[2]
mean(PulsarTest$target_class!=yhat)
# Error Rate: 2.291%
error_rates[5] = mean(PulsarTest$target_class!=yhat)
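# A sketch to inspect which coefficients the lasso shrinks to zero; with the
# selected lambda of 0, we expect none to be dropped:
coef(lassomod, s=laml)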
#-------------------------------------------------------------------------------
# K-nearest neighbors (KNN) models
(Ks=c(seq(1,20,by=5),seq(25,round2(nrow(PulsarTrain)/29),by=100)))
# Note: grab a coffee while the following loop runs (~5 min)
knntrain=NULL;knn5err=NULL;knn10err=NULL
for (i in (1:length(Ks))) {
set.seed(42069)
knntrain[i]=mean(knn(PulsarTrain[,1:8],PulsarTrain[,1:8],PulsarTrain$target_class,k=Ks[i])!=PulsarTrain$target_class)*100
set.seed(42069);knn5err[i]=CVerrorknn(PulsarTrain[,1:8],PulsarTrain$target_class,K=Ks[i],k=5)
set.seed(42069);knn10err[i]=CVerrorknn(PulsarTrain[,1:8],PulsarTrain$target_class,K=Ks[i],k=10)
}
par(mfrow=c(2,2))
plot(knntrain~Ks,type='b',main="Training",xlab="K",ylab="% Error Rate")
plot(knn5err~Ks,type='b',main="5-CV",xlab="K",ylab="% Error Rate")
plot(knn10err~Ks,type='b',main="10-CV",xlab="K",ylab="% Error Rate")
par(mfrow=c(1,1))
Ks[c(which.min(knntrain),which.min(knn5err),which.min(knn10err))]
round2(c(min(knntrain),min(knn5err),min(knn10err)),2)
# Our best k is 11, which has a training error rate of 2.67%
set.seed(42069)
yhat1=knn(PulsarTrain[,1:8],PulsarTest[,1:8],PulsarTrain$target_class,11,prob=T)
mean(yhat1!=PulsarTest$target_class)
# Testing error rate: 2.598%
error_rates[6] = mean(yhat1!=PulsarTest$target_class)
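# KNN is distance-based and our predictors are on very different scales, so a
# possible refinement (an untested sketch, not reflected in the results above)
# is to standardize the predictors using the training means and SDs:
Xtr=scale(PulsarTrain[,1:8])
Xte=scale(PulsarTest[,1:8], center=attr(Xtr,"scaled:center"),
          scale=attr(Xtr,"scaled:scale"))
set.seed(42069)
mean(knn(Xtr, Xte, PulsarTrain$target_class, k=11)!=PulsarTest$target_class)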
#-------------------------------------------------------------------------------
# GAMs
gamod=gam(target_class~., data=PulsarTrain, family = "binomial")
summary(gamod)
# Note: predict() here returns values on the link (logit) scale, so the 0.5
# cutoff corresponds to a probability of about 0.62, not 0.5
pred=predict(gamod,PulsarTest[,-'target_class'])
yhat=rep(levels(PulsarTest$target_class)[1],nrow(PulsarTest))
yhat[pred>.5]=levels(PulsarTest$target_class)[2]
mean(PulsarTest$target_class!=yhat)
# Test error 2.430%
error_rates[7] = mean(PulsarTest$target_class!=yhat)
# GAMs - Removing all of the insignificant predictors
# We removed predictors one at a time, least significant first; in the resulting
# model, all remaining predictors are significant
gamod=gam(target_class~., data=PulsarTrain[,-c('Skewness of the DM-SNR curve',
'Mean of the DM-SNR curve',
'Standard deviation of the integrated profile',
'Excess kurtosis of the DM-SNR curve',
'Standard deviation of the DM-SNR curve')], family = "binomial")
summary(gamod)
pred=predict(gamod,PulsarTest[,-'target_class'])
yhat=rep(levels(PulsarTest$target_class)[1],nrow(PulsarTest))
yhat[pred>.5]=levels(PulsarTest$target_class)[2]
mean(PulsarTest$target_class!=yhat)
# Test error 2.514%
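# The gam() calls above use only linear terms, so they are effectively logistic
# regressions; a natural extension (an untested sketch, assuming these column
# names match the CSV) is to add smoothing splines for the three predictors
# kept in the reduced model:
gamod_s=gam(target_class~s(`Mean of the integrated profile`,4)+
              s(`Excess kurtosis of the integrated profile`,4)+
              s(`Skewness of the integrated profile`,4),
            data=PulsarTrain, family="binomial")
summary(gamod_s)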
#-------------------------------------------------------------------------------
# Dummy Classifier
table(PulsarTrain$target_class)
# Our dummy classifier always predicts the majority class, "0" (not a pulsar)
table(PulsarTest$target_class)
# 3246 negative examples and 334 positive examples
# Testing Error Rate
(1-(3246/(3246+334)))
# 9.33%
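# The same error rate, computed directly (a small sketch):
mean(PulsarTest$target_class!="0")  # 0.0933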
#-------------------------------------------------------------------------------
# Final Remarks
# -All our models outperform the dummy classifier
# -Logistic Regression (Full and Reduced 1 Models) had the best performance (lowest test ER)
# -Our Lasso and Ridge Regression models had a best lambda of 0, so both reduce to plain logistic regression
# -The GAM model was our second best model, followed by LDA
# -It is important to note that our reduced GAM model, which used only 3 predictors
#  (mean, excess kurtosis, and skewness of the integrated profile), still
#  achieved high performance
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# 5) Picking our best model and running its diagnostics
# We chose logistic regression (reduced model 1) as our final model due to its
# low testing error rate (2.291%). We will now perform a deeper analysis of its results
logistErrorRate(gmod2, PulsarTest)
ler=logistErrorRate(gmod2, PulsarTest)
lr=ler$result
lr[2,2]/lr[3,2] #Sensitivity=.820
# This means we correctly identified 82.0% of pulsars in the test set
lr[1,1]/lr[3,1] #Specificity=.993
# This means we correctly identified 99.3% of all non-pulsar objects in the test set
lr[2,1]/lr[3,1] #False positive rate=.007
lr[1,2]/lr[3,2] #False negative rate=.179
lr[2,2]/lr[2,3] #Precision=.926
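# The same metrics can be recomputed from an explicit confusion matrix
# (a sketch using predicted probabilities from gmod2):
prob=predict(gmod2, PulsarTest, type="response")
cm=table(Actual=PulsarTest$target_class, Predicted=ifelse(prob>.5,"1","0"))
cm["1","1"]/sum(cm["1",])  # sensitivity
cm["0","0"]/sum(cm["0",])  # specificity
cm["1","1"]/sum(cm[,"1"])  # precision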
# Interpretation of the coefficients
summary(gmod2)$coefficients
# The three coefficients with highest magnitude are:
# Excess kurtosis of the integrated profile (6.76),
# Skewness of the integrated profile (-0.64)
# Mean of the DM-SNR curve (-0.0325)
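# On the odds scale (a brief interpretive sketch): exponentiating a coefficient
# gives the multiplicative change in the odds of being a pulsar per one-unit
# increase in that predictor, holding the others fixed:
exp(coef(gmod2))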
#-------------------------------------------------------------------------------
# Make plot to visualize performance of all models
models = c("LDA", "QDA", "Logistic", "Ridge", "Lasso", "KNN - k=11", "GAM")
error_rates = 100*error_rates
df = data.frame(models, error_rates)
df$models = reorder(df$models, df$error_rates)
ggplot(df, aes(x = models, y = error_rates)) +
geom_col(fill = "blue") +
ggtitle("Error Rates by Model") +
xlab("Model") +
ylab("Error Rate (in %)") +
theme(panel.background = element_rect(fill = "white"))