-
Notifications
You must be signed in to change notification settings - Fork 25
/
LogisticRegression.R
106 lines (79 loc) · 3.82 KB
/
LogisticRegression.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# CLASSIFICATION TECHNIQUES USING R
# 1) LOGISTIC REGRESSION using glm() (Generalized Linear Models) with
#    family = binomial

# library() stops with an error when the package is missing, whereas require()
# only returns FALSE and lets the script fail later on an unknown `Smarket`.
library(ISLR) # package with the data sets; Smarket is used below

# Explore the Smarket (stock market) data set
names(Smarket)
?Smarket
# R is case-sensitive: the data viewer is View(), not view(). It needs an
# interactive session, so it is skipped when the script runs in batch mode.
if (interactive()) {
  View(Smarket)
}
summary(Smarket)

# Direction is used as the binary response variable: predict whether the
# market will move Up or Down on a given day.

# Scatterplot matrix of all variables, coloured by Direction
pairs(Smarket, col = Smarket$Direction)
# The scatterplot matrix shows no obvious correlations between the
# variables - no big surprise for stock market data.
# Fitting a logistic regression model on the full data set
mod1 <- glm(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  data = Smarket,
  family = binomial
)
# The model estimates the probability of Direction given these predictors.
summary(mod1)
# None of the predictors has a significant p-value, so there is no clear
# evidence that any of them is associated with the response (Direction).
# Failing to reject H0 does not prove H0, but for this kind of data the
# result is not a big surprise.

# We can still make predictions and compute the fitted probabilities.
# type = "response" returns probabilities rather than log-odds.
probs <- predict(mod1, type = "response")
# probs is a vector of fitted Pr(Direction | predictors) values
head(probs)
# The probabilities are all close to 0.5.

# With a factor response glm() models the probability of the non-first
# level, so a probability >= 0.5 is classified as "Up", otherwise "Down".
# NOTE(review): confirm the coding with contrasts(Smarket$Direction).
# The labels are kept in their own vector (like mod2.pred / mod3.pred)
# instead of overwriting the probabilities in `probs`.
mod1.pred <- ifelse(probs >= 0.5, "Up", "Down")

# Columns are referenced through Smarket$ rather than attach(Smarket), which
# would leave the data frame on the search path and can mask other objects.

# CONFUSION MATRIX to count the mismatches (misclassifications)
table(Predicted = mod1.pred, True = Smarket$Direction)
# Lots of mismatches.

# Accuracy of the model (share of correct classifications)
mean(mod1.pred == Smarket$Direction)
# About 52 % matches / 47 % mismatches: only slightly better than chance.
# This is accuracy on the TRAINING data, i.e. the same rows the model was
# fitted on, so it is likely to be optimistic.

# Now split into training and test data and see whether we do any better.
train <- Smarket$Year < 2005 # logical mask: TRUE for rows before 2005
mod2 <- glm(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  data = Smarket,
  subset = train,
  family = binomial
)
summary(mod2)

# The model is fitted on the training rows only; predict Direction on the
# held-out TEST rows (!train) to see how well it generalizes.
prob1 <- predict(mod2, newdata = subset(Smarket, !train), type = "response")
mod2.pred <- ifelse(prob1 >= 0.5, "Up", "Down")
head(mod2.pred)

# Data frame of true and predicted Direction on the TEST data
pred.df <- data.frame(
  True_Direction = subset(Smarket, !train)$Direction,
  Predicted = mod2.pred
)

# CONFUSION MATRIX on the test data
table(Predicted = mod2.pred, True = Smarket$Direction[!train])

# Accuracy of model 2 on the test data (share of correct classifications)
mean(mod2.pred == subset(Smarket, !train)$Direction)
# Accuracy drops to about 48 %: on unseen data the model does worse than
# the training accuracy of the first model suggested, i.e. it overfits.
# Smaller model with fewer predictors, fitted on the training rows only
mod3 <- glm(
  Direction ~ Lag1 + Lag2,
  data = Smarket,
  subset = train,
  family = binomial
)
summary(mod3)

# Compare the two training-set models by AIC (lower is better).
AIC(mod2, mod3)
# Output recorded by the original author:
#      df      AIC
# mod2  7 1395.105
# mod3  3 1387.402
# mod3 has the lower AIC, so by this criterion it is the better model
# (less information is lost).

# PREDICTIONS ON THE TEST DATA
prob1 <- predict(mod3, newdata = subset(Smarket, !train), type = "response")
mod3.pred <- ifelse(prob1 >= 0.5, "Up", "Down")
head(mod3.pred)

# Data frame of predictors, true Direction and predicted Direction on the
# TEST data (column names kept exactly as in the original script).
pred.df2 <- data.frame(
  lag1 = Smarket$Lag1[!train],
  Lag2 = Smarket$Lag2[!train],
  True_direction = Smarket$Direction[!train],
  Predicted = mod3.pred
)

# CONFUSION MATRIX on the test data
table(Predicted = mod3.pred, True = Smarket$Direction[!train])

# Accuracy of model 3 on the test data (share of correct classifications)
mean(mod3.pred == subset(Smarket, !train)$Direction)
# With fewer predictors the test accuracy improves to about 56 %.

# Predicted probability for a single new observation
predict(
  mod3,
  newdata = data.frame(Lag1 = 0.967, Lag2 = 0.600),
  type = "response"
)