-
Notifications
You must be signed in to change notification settings - Fork 0
/
mckinsey_hc.R
218 lines (170 loc) · 10.5 KB
/
mckinsey_hc.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
library(ggplot2)
library(tidyr)
library(rpart)
library(randomForest)
library(party)
library(mice)
library(party)
train <- read.csv("train_mk_hc.csv", header = TRUE, stringsAsFactors = TRUE)
test <- read.csv("test_mk_hc.csv", header = TRUE, stringsAsFactors = TRUE)
test$stroke <- NA
all <- rbind(train,test)
all$hypertension <- as.factor(all$hypertension)
all$heart_disease <- as.factor(all$heart_disease)
all$stroke <- as.factor(all$stroke)
#-----EXPLORATORY ANALYSIS-----
table(summary(is.na(all)))
all[all$smoking_status=="", "smoking_status"] <- NA
#bmi & smoking_status have NA / blank values. They respectively have about 3.3% and 31% missing value ratio.
#Beware, because my initial doubt is people with unhealthy bmi might have chosen not to disclose this information, indicating a case of 'Missing Not at Random' (MNAR).
all[all$gender == "Other",]
#Among people with the 'Other' gender we have 4 children which makes me think if 'Other' partially or fully used to indicate missing gender data, rather than a gender besides male/female.
ggplot(all, aes(x=age)) +
geom_density(data=all[all$gender=="Female",], fill="green", alpha=0.2) +
geom_density(data=all[all$gender=="Male",], fill="blue", alpha=0.2)
#In 20-50 age bracket we have more cases for Female patients while until ~20 the amount of cases for Male patients is superior.
prop.table(table(all$heart_disease, all$smoking_status),2)
prop.table(table(all$hypertension, all$smoking_status),2)
#Looking at the relation between smoking status and having hypertension/heart_disease, results are pretty counter-intuitive.
#For both health problems, former smokers seem to have the highest ratio of sick people. One explanation of this could be a high number of smokers who had to quit after diagnosis.
#For hypertension, people who never smoked and people who smoke have a very similar ratio of sick people.
prop.table(table(train$smoking_status, train$stroke),1)
#Former smokers have the highest ratio of stroke patients as well
prop.table(table(train$smoking_status, train$stroke),1)
prop.table(table(train$ever_married, train$stroke),1)
prop.table(table(all$ever_married, all$smoking_status),1)
ggplot(all, aes(x=age)) +
geom_density(data=all[all$smoking_status=="formerly smoked",], fill="orange", alpha=0.2) +
geom_density(data=all[all$smoking_status=="never smoked",], fill="green", alpha=0.2) +
geom_density(data=all[all$smoking_status=="smokes",], fill="red", alpha=0.2)
#Above graph shows that amount of smokers tend to fall with age after age of 50 while the amount of former smokers reach it's peak.
ggplot(train, aes(x=age)) +
geom_density(data=train[train$stroke=="1",], fill="red", alpha=0.2) +
geom_density(data=train[train$stroke=="0",], fill="green", alpha=0.2)
#This plot shows that stroke possibility dramatically increases with age. However, interestingly, it starts falling after about 76 years old.
ggplot(train, aes(x=bmi)) +
geom_density(data=train[train$stroke=="1",], fill="red", alpha=0.2) +
geom_density(data=train[train$stroke=="0",], fill="green", alpha=0.2)
#This plot shows that stroke possibility dramatically increases with age. However, interestingly, it starts falling after about 76 years old.
ggplot(all, aes(x=age)) +
geom_density(data=all[all$ever_married=="No",], fill="red", alpha=0.2) +
geom_density(data=all[all$ever_married=="Yes",], fill="green", alpha=0.2)
#Amount of patient who were ever married surpass who didn't by the age of 30.
ggplot(all, aes(x=age)) +
geom_density(data=all[all$hypertension=="1",], fill="red", alpha=0.2) +
geom_density(data=all[all$hypertension=="0",], fill="green", alpha=0.2)
ggplot(all, aes(x=age)) +
geom_density(data=all[all$heart_disease=="1",], fill="red", alpha=0.2) +
geom_density(data=all[all$heart_disease=="0",], fill="green", alpha=0.2)
#As expected hypertension & heart disease is more prominent among older people
ggplot(all, aes(x=bmi, y=avg_glucose_level)) +
geom_point()
#No obvious relation between avg_glucose_level vs bmi
#Also, there are several very high bmi values, up to 97.60 (~260 kg, assuming 165 cm height).
#I will be leaving these values in the dataset as is, because I believe them to be plausible.
#----MISSING VALUES----
#looking at bmi
prop.table(table(is.na(all$bmi), all$gender),2)
prop.table(table(is.na(all$bmi), all$ever_married),2)
prop.table(table(is.na(all$bmi), all$work_type),2)
prop.table(table(is.na(all$bmi), all$smoking_status),2)
ggplot(all, aes(x=age)) +
geom_density(data=all[is.na(all$bmi),], fill="red", alpha=0.2) +
geom_density(data=all[!is.na(all$bmi),], fill="green", alpha=0.2)
#Missing bmi gets more frequent with increasing age, thus I conclude again on a missing-at-random case.
temp_bmi <- mice(all[, !names(all) %in% c("stroke", "id", "name", "bmi_rpart")], m=5, maxit = 5, method="pmm", seed=17)
ggplot(all, aes(x=bmi)) +
geom_density(fill="black", alpha=0.2) +
geom_density(data=data.frame(all$id, mice::complete(temp_bmi,1)), fill="blue", alpha=0.2) +
geom_density(data=data.frame(all$id, mice::complete(temp_bmi,2)), fill="red", alpha=0.2) +
geom_density(data=data.frame(all$id, mice::complete(temp_bmi,3)), fill="green", alpha=0.2) +
geom_density(data=data.frame(all$id, mice::complete(temp_bmi,4)), fill="yellow", alpha=0.2) +
geom_density(data=data.frame(all$id, mice::complete(temp_bmi,5)), fill="purple", alpha=0.2)
all$bmi <- mice::complete(temp_bmi,2)$bmi
#use rpart to impute bmi to create bmi2
#bmi_fit <- rpart(bmi ~ gender + age + hypertension + heart_disease + ever_married + work_type + Residence_type + avg_glucose_level + smoking_status, data=all[!is.na(all$b)])
#looking at smoking_status
prop.table(table(is.na(all$smoking_status), all$gender),2)
prop.table(table(is.na(all$smoking_status), all$Residence_type),2)
prop.table(table(is.na(all$smoking_status), all$work_type),2)
hist(all[all$work_type=="Never_worked", "age"])
hist(all[is.na(all$smoking_status), "age"])
#The missing value ratio for smoking_status is higher for people under 20. This indicates a missing-at-random case.
smoking_fit <- rpart(smoking_status ~ gender + age + hypertension + heart_disease + ever_married + work_type + Residence_type + avg_glucose_level + bmi, data=all[!is.na(all$smoking_status),], method="class")
all$smoking_status[is.na(all$smoking_status)] <- predict(smoking_fit, all[is.na(all$smoking_status),], type="class")
#----FEATURE ENGINEERING----
#Let's create a feature indicating people who has both heart_disease and hypertension
all$hasboth <- FALSE
all[train$heart_disease=="1" & train$hypertension=="1", "hasboth"] <- TRUE
all$hasboth <- as.factor(all$hasboth)
#Likewise, let's tag obese people with hypertension
all$obese_hyper <- FALSE
all[all$bmi_category=="obese" & all$hypertension=="1", "obese_hyper"] <- TRUE
all$obese_hyper <- as.factor(all$obese_hyper)
#Create isObese
all$isObese <- FALSE
all[all$bmi_category=="obese", "isObese"] <- TRUE
all$isObese <- as.factor(all$isObese)
all$isOverweight <- FALSE
all[all$bmi_category %in% c("obese","overweight"), "isOverweight"] <- TRUE
all$isOverweight <- as.factor(all$isOverweight)
#Create age_brackets
all$age_bracket <- ""
all[all$age >= 55, "age_bracket"] <- "Elder"
all[all$age < 55, "age_bracket"] <- "Middle Aged"
all[all$age < 35, "age_bracket"] <- "Young"
all[all$age < 20, "age_bracket"] <- "Teenager"
all[all$age < 7, "age_bracket"] <- "Kid"
all$age_bracket <- as.factor(all$age_bracket)
#Create isElder
all$isElder <- FALSE
all[all$age_bracket == "Elder", "isElder"] <- TRUE
#Creating BMI categories
#BMI interval for each category is taken from U.S. Department of Health & Human Serices - National Heart, Lung and Blood Institute
all$bmi_category <- ""
all[all$bmi >= 30, "bmi_category"] <- "obese"
all[all$bmi < 30, "bmi_category"] <- "overweight"
all[all$bmi < 25, "bmi_category"] <- "normal"
all[all$bmi < 18.5, "bmi_category"] <- "underweight"
all$bmi_category <- as.factor(all$bmi_category)
#Although I'm no medical expert, after some Google search, I believe it would be safe to diagnose people with >=200 avg_glucose_level with diabetes.
all$diabetes <- FALSE
all[all$avg_glucose_level >= 200, "diabetes"] <- TRUE
all$diabetes <- as.factor(all$diabetes)
#Create high_stress
all$high_stress <- FALSE
all[all$work_type %in% c("private", "Self-employed") & all$Residence_type=="Rural", "high_stress"] <- TRUE
all$high_stress <-as.factor(all$high_stress)
#Create condition_count
all$condition_count <- 0
all[all$hypertension==1, "condition_count"] <- all[all$hypertension==1,"condition_count"]+1
all[all$heart_disease==1, "condition_count"] <- all[all$heart_disease==1, "condition_count"]+1
all[all$diabetes==1, "condition_count"] <- all[all$diabetes==1, "condition_count"]+1
all[all$smoking_status %in% c("formerly smoked", "smokes"), "condition_count"] <- all[all$smoking_status %in% c("formerly smoked", "smokes"), "condition_count"]+1
all$condition_count <- as.factor(all$condition_count)
#----PREDICTION-----
train <- all[1:43400,]
test <- all[43401:62001,]
#---Random Forest--- #0.7994
rf <- randomForest(stroke ~ gender + age + ever_married + work_type +
avg_glucose_level + bmi + hypertension + heart_disease +
Residence_type +smoking_status + diabetes + isOverweight,
data=train, ntree=1000, importance=TRUE)
rf <- randomForest(stroke ~ gender + age + ever_married + work_type +
avg_glucose_level + bmi + hypertension + heart_disease +
Residence_type + smoking_status + diabetes + isOverweight +
condition_count, data=train, ntree=1000, importance=TRUE)
Prediction <- predict(rf,test, type="prob")[,2]
submission_file <- data.frame(id = test$id, stroke = Prediction)
write.csv(submission_file, file = "rf.csv", row.names = FALSE)
#---/RandomForest---
**
#---Conditional Random Forest--- #0.81744
crf <- cforest(stroke ~ gender + age + ever_married + work_type +
avg_glucose_level + bmi + hypertension + heart_disease +
Residence_type + smoking_status + diabetes + isOverweight +
condition_count, data=train, controls=cforest_unbiased(ntree=50, mtry=3))
Prediction_crf <- predict(crf, test, OOB=TRUE, type = "prob")
submission_file_crf <- data.frame(id = test$id, stroke = sapply(Prediction_crf, "[[", 2))
write.csv(submission_file_crf, file = "crf.csv", row.names = FALSE)
#---/Conditional Random Forest---