-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDA.R
370 lines (296 loc) · 11.7 KB
/
DA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
#Importing the libraries we will be using in the project
library("tidyr")
library('ggvis')
library('tidyverse')
library('ggplot2')
library('dplyr')
library('reshape')
library('lattice')
library('heatmaply')
library('ggcorrplot')
library('patchwork')
library('GGally')
library('data.table')
library('cowplot')
library('knitr')
library('gmodels')
library('reshape2')
#Reading the "previous_app.csv" into the dataframe prev_data
prev_data <- read.csv(file = '/Users/khushijainmahendra/Downloads/previous_app.csv')
#Reading the "current_app.csv" into the dataframe prev_data
curr_data <- read.csv(file = '/Users/khushijainmahendra/Downloads/current_app.csv')
#Printing the head of the dataframe to view the data available
head(prev_data,n=5)
head(curr_data,n=5)
#finding sum of null values in every column
colSums(is.na(prev_data))
#finding sum of null values in every column
colSums(is.na(curr_data))
dim(curr_data)
#getting elaborate details about each attribute in the data
summary(curr_data)
summary(prev_data)
## Remove columns with more than 90% NA
curr_data <- curr_data[, which(colMeans(!is.na(curr_data)) >= 0.9)]
head(curr_data)
#To check the null values in the dataset
is.null(curr_data)
#The number of columns has gone down to 84 from 122 which means that 38 columns had more than 50% rows with null values.
# Using a box plot to see if we have outliers in the CNT_FAM_MEMBERS column of the curr_data dataset
ggplot(curr_data, aes(x=CNT_FAM_MEMBERS, y = "CNT_FAM_MEMBERS")) +
geom_boxplot()
summary(curr_data['CNT_FAM_MEMBERS'])
# use a box plot to see if we have outliers in the AMT_CREDIT column
# of the curr_data dataset
ggplot(curr_data, aes(x=AMT_CREDIT, y = "AMT_CREDIT")) +
geom_boxplot()
ggplot(data = curr_data) +
geom_histogram(mapping = aes(x = CNT_FAM_MEMBERS), binwidth = 0.5)
summary(curr_data["AMT_CREDIT"])
# checking for percentage of null values
colMeans(is.na(curr_data))
#Checking for data imbalance for the Target Column
Defaulter_sum <- sum(curr_data$TARGET == 1)
Defaulter_sum
Defaulter <- round(Defaulter_sum/length(curr_data),1)
Defaulter
Non_Defaulter_sum <- sum(curr_data$TARGET==0)
Non_Defaulter_sum
Non_Defaulter <- round(Non_Defaulter_sum/length(curr_data),1)
Non_Defaulter
#Piechart
x <- c(Defaulter,Non_Defaulter)
labels<- c("Defaulter", "Non_Defaulter")
piepercent<- round(100*x/sum(x),1)
piepercent
pie(x,labels = piepercent, col= rainbow(length(x)))
legend("topright",c(labels), cex=0.8, fill=rainbow((length(x))))
# We can see from the map that there is a severe imbalance in the client list for the TARGET columns, with an Imbalance ratio of 11.5 (i.e., 8% of clients are defaulters compared to 92% of non-defaulters).
# We now analyze the dataset with respect to Inferential Statistics
#splitting the dataframe into two data sets
OriginalDT <- data.table(curr_data)
curr_data0 <- OriginalDT[TARGET == 0]
curr_data1 <- OriginalDT[TARGET == 1]
curr_data0
dt2 <- prev_data %>%
rownames_to_column() %>%
gather(colname, value, -rowname)
head(dt2)
#correlation matrix
cormat <- cor(prev_data[sapply(prev_data,is.numeric)])
melted_data <- melt(cormat)
head(melted_data)
melted_data
ggplot(data= melted_data, aes(x=Var1, y=Var2, fill=value)) +
geom_tile()
#closer at the distribution of TARGET column
# Histogram with kernel density curve
ggplot(curr_data,aes(x=TARGET)) +
geom_density(alpha=.2, fill="#FF6666")
##plotting the graph
melted_data_num <- data.frame(
lapply(curr_data, as.integer))
ggcorr(curr_data,
method=c("pairwise","spearman"),
nbreaks=6,
hjust = 0.5,
label = TRUE,
label_size=2,
color="grey50")
#lower triangle of the correlation matrix
get_lower_tri <- function(cormat)
{
cormat[upper.tri(cormat)] <- NA
return(cormat)
}
#upper triangle of the correlation matrix
get_upper_tri <-function(cormat) {
cormat[lower.tri(cormat)]<- NA
return (cormat)
}
upper_tri<- get_upper_tri(cormat)
upper_tri
#correlation heatmap
melted_data <- melt(upper_tri, na.rm=TRUE)
ggplot(data=melted_data, aes(Var2,Var1, fill=value)) +
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1))+
coord_fixed()
# Univariate Categorical Features
plot_unnivariate_categorical <- function(i) {
g1 <-ggplot(curr_data0, aes_string(x=i)) +geom_bar()
g2 <-ggplot(curr_data1, aes_string(x=i)) +geom_bar()
print(cowplot::plot_grid(g1,g2))
}
plot_unnivariate_categorical('CODE_GENDER')
plot_unnivariate_categorical('NAME_CONTRACT_TYPE')
plot_unnivariate_categorical('NAME_FAMILY_STATUS')
plot_unnivariate_categorical('NAME_HOUSING_TYPE')
plot_unnivariate_categorical('NAME_EDUCATION_TYPE')
#Univariate Continuous Features
plot_unnivariate_continuous <- function(i)
{
h1<- ggplot(curr_data0, aes_string(x=i)) +geom_histogram(alpha=0.5, position="identity",bins='30')
h2<- ggplot(curr_data1, aes_string(x=i)) +geom_histogram(alpha=0.5, position="identity",bins='30')
print(cowplot::plot_grid(h1,h2))
}
#Count of Children
plot_unnivariate_continuous('CNT_CHILDREN')
#Inference: The number of people taking out loans before they have children are more than the number of people taking loans after having children
#Total Income
plot_unnivariate_continuous('AMT_INCOME_TOTAL')
#Credit Amount
plot_unnivariate_continuous('AMT_CREDIT')
#Annuity Amount
plot_unnivariate_continuous('AMT_ANNUITY')
#Age
plot_unnivariate_continuous('DAYS_BIRTH')
#Bivariate-categorical-categorical
plot_Bivariate_cat_cat <- function(i,j)
{
g3<- ggplot(curr_data0, aes_string(x=i, fill=j)) +geom_bar(position = position_dodge(preserve = "single"))
g4<- ggplot(curr_data1, aes_string(x=i,fill=j))+geom_bar(position = position_dodge(preserve = "single"))
print(cowplot::plot_grid(g3,g4))
}
#Following plots were plotted:
#Family Status VS Education
plot_Bivariate_cat_cat('NAME_FAMILY_STATUS','NAME_EDUCATION_TYPE')
#Housing Type VS Family Status
plot_Bivariate_cat_cat('NAME_HOUSING_TYPE','NAME_FAMILY_STATUS')
#Education VS Gender
plot_Bivariate_cat_cat('NAME_EDUCATION_TYPE','CODE_GENDER')
#Education VS Gender
plot_Bivariate_cat_cat('NAME_HOUSING_TYPE','NAME_EDUCATION_TYPE')
#Housing Type VS Gender
plot_Bivariate_cat_cat('NAME_FAMILY_STATUS','CODE_GENDER')
#Bivariate Categorical-Continuous
plot_Bivariate_cat_con <- function(i,j)
{
b1<- ggplot(curr_data0, aes_string(x=i, y=j)) +geom_boxplot()
b2<- ggplot(curr_data1, aes_string(x=i,y=j))+geom_boxplot()
print(cowplot::plot_grid(b1,b2))
}
#Education VS Income
plot_Bivariate_cat_con('NAME_EDUCATION_TYPE','AMT_INCOME_TOTAL')
#Education VS Credit Amount
plot_Bivariate_cat_con('NAME_EDUCATION_TYPE','AMT_CREDIT')
# Inference: The overall mean for the Credit Amount is higher for people with Academic degree.
#Gender VS Credit Amount
plot_Bivariate_cat_con('CODE_GENDER','AMT_CREDIT')
#Inference: Both Men and Women have the same anount of credit.
#Housing Type VS Count of Children
plot_Bivariate_cat_con('NAME_HOUSING_TYPE','CNT_CHILDREN')
#Education VS Count of Children
plot_Bivariate_cat_con('NAME_EDUCATION_TYPE','CNT_CHILDREN')
#Bivariate Continuous-Continuous
plot_Bivariate_con_con <- function(i,j)
{
s1<- ggplot(curr_data0, aes_string(x=i, y=j)) +geom_point(color="cornflowerblue",
size = 2,
alpha=.8)
s2<- ggplot(curr_data1, aes_string(x=i,y=j))+geom_point(color="cornflowerblue",
size = 2,
alpha=.8)
print(cowplot::plot_grid(s1,s2))
}
#Credit Amount VS Age
plot_Bivariate_con_con('AMT_CREDIT','DAYS_BIRTH')
#Income VS Credit Amount
plot_Bivariate_con_con('AMT_INCOME_TOTAL','AMT_CREDIT')
#Income VS Annuity
plot_Bivariate_con_con('AMT_INCOME_TOTAL','AMT_ANNUITY')
#Income VS Age
plot_Bivariate_con_con('AMT_INCOME_TOTAL','DAYS_BIRTH')
#Age VS Annuity
plot_Bivariate_con_con('DAYS_BIRTH','AMT_ANNUITY')
#correlation heatmap for non defaulters
subset_data0 <- select(curr_data0, CNT_CHILDREN, AMT_INCOME_TOTAL, AMT_CREDIT, AMT_ANNUITY, DAYS_BIRTH, DAYS_EMPLOYED, AMT_GOODS_PRICE)
corr_mat0 <- round(cor(subset_data0),2)
# reduce the size of correlation matrix
melted_corr_mat0 <- melt(corr_mat0)
head(melted_corr_mat0)
# plotting the correlation heatmap
ggplot(data = melted_corr_mat0, aes(x=Var1, y=Var2,
fill=value)) +
geom_tile()
#correlation heatmap for defaulters
subset_data1 <- select(curr_data1, CNT_CHILDREN, AMT_INCOME_TOTAL, AMT_CREDIT, AMT_ANNUITY, DAYS_BIRTH, DAYS_EMPLOYED, AMT_GOODS_PRICE)
corr_mat1 <- round(cor(subset_data1),2)
# reduce the size of correlation matrix
melted_corr_mat1 <- melt(corr_mat1)
head(melted_corr_mat1)
# plotting the correlation heatmap
ggplot(data = melted_corr_mat1, aes(x=Var1, y=Var2,
fill=value)) +
geom_tile()
#Decision tree
library(party)
output.tree <- ctree(
TARGET ~ AMT_ANNUITY,
data = curr_data)
plot(output.tree)
#RegressionTree
#test_curr <- sample(1:nrow(curr_data), size = 102503)
#train_curr<-(1:nrow(curr_data))[-test_curr]
#set.seed(1)
library(rpart)
#Treemodel<-rpart(TARGET~ ., data=curr_data[train_curr, ],method = "class")
library(rpart.plot)
#rpart.plot(Treemodel, extra = 106)
# Splitting the data into train and test
split1<- sample(c(rep(0, 0.7 * nrow(encoded_data)), rep(1, 0.3 * nrow(encoded_data))))
train <- encoded_data[split1 == 0, ]
test <- encoded_data[split1== 1, ]
# rpart(): Function to fit the model. The arguments are:
# survived ~.: Formula of the Decision Trees
# data = data_train: Dataset
# method = ‘class’: Fit a binary model
fit <- rpart(TARGET~., data = train, method = 'class')
rpart.plot(fit, extra = 106)
predict_unseen <-predict(fit, test, type = 'class')
#Confusion Matrix
table_mat <- table(test$TARGET, predict_unseen)
table_mat
#Calculating the accuracy
accuracy_Test <- sum(diag(table_mat)) / sum(table_mat)
print(paste('Accuracy for test', accuracy_Test*100))
#rpart.plot(Treemodel,extra =1)
#library(ROCR)
#prp(Treemodel, type=2, extra=1)
#Fitting the model using glm() function
model<- glm(TARGET ~., family = binomial(link='logit'),data=train)
summary(model)
train_prev <- prev_data
# printing the first 3 elements
lapply(model,class)[1:3]
model$aic
#Accessing the performance of the model:
predict<- predict(model,test, type='response')
test_new <-test
test_new$CODE_GENDER[which(!(test_new$CODE_GENDER %in% unique(train$CODE_GENDER)))]<- NA
test_new$NAME_INCOME_TYPE[which(!(test_new$NAME_INCOME_TYPE %in% unique(train$NAME_INCOME_TYPE)))]<- NA
test_new$NAME_FAMILY_STATUS[which(!(test_new$NAME_FAMILY_STATUS %in% unique(train$NAME_FAMILY_STATUS)))]<- NA
test_new$ORGANIZATION_TYPE[which(!(test_new$ORGANIZATION_TYPE %in% unique(train$ORGANIZATION_TYPE)))]<- NA
test_new$ORGANIZATION_TYPE[which(!(test_new$ORGANIZATION_TYPE %in% unique(test$ORGANIZATION_TYPE)))]<- NA
test_new
predict<- predict(model,test_new, type='response')
summary(model)
#Now we can run the anova() function on the model to analyze the table of deviance
anova(model, test="Chisq")
#While no exact equivalent to the R2 of linear regression exists, the McFadden R2 index can be used to assess the model fit.
pR2(model)
levels(as.factor(train$CODE_GENDER))
#Assessing the predictive ability of the model
#a <- predict(model,newdata = test_new)
#fitted.results <- predict(model,newdata=subset(test_new,select=c(2)),type='response')
#fitted.results <- ifelse(fitted.results > 0.5,1,0)
#misClasificError <- mean(fitted.results != test$TARGET)
#print(paste('Accuracy',1-misClasificError))
#confusion matrix
table_mat<-table(test$TARGET, predict >0.5)
table_mat