Skip to content

Commit

Permalink
Added models trained on raw dataset (without binning) for both Ranger…
Browse files Browse the repository at this point in the history
… and XGBoost models. Automated the code in "rand_forest.R"; it now trains two models by default.
  • Loading branch information
Jakub Cierocki committed May 9, 2020
1 parent 4c6fcd0 commit 9b23293
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 20 deletions.
Binary file modified data/split.RDS
Binary file not shown.
Binary file added data/split_raw.RDS
Binary file not shown.
4 changes: 4 additions & 0 deletions dataset_prep.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ data1 <- data_raw %>%
HasCrCard = factor(HasCrCard) %>% `levels<-`(c("No", "Yes"))) %>%
dplyr::select(-RowNumber, -CustomerId, -Surname)

# Build the "raw" (un-binned) train/test split: data1 is passed through
# filter_vars_by_iv() with significance_thres = 0.02, split with prop = 0.75,
# and the resulting split object is written to data/split_raw.RDS.
# NOTE(review): filter_vars_by_iv() is defined outside this view; presumably it
# drops predictors whose information value is below the threshold -- confirm.
# NOTE(review): no set.seed() call is visible in this hunk, so the split may
# differ between runs -- confirm a seed is set earlier in the script.
data1 %>% filter_vars_by_iv(significance_thres = 0.02) %>%
initial_split(prop = 0.75) %>%
saveRDS("data/split_raw.RDS")

data2 <- data1 %>%
factorize(bin_methods = "tree") %>%
as_tibble() %>%
Expand Down
8 changes: 5 additions & 3 deletions gbm.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,17 @@ rm(list = ls())
# source("dataset_prep.R")

dataset_split <- readRDS("data/split.RDS")
dataset_split$data <- dataset_split$data %>%
mutate_if(~ length(levels(.x)) > 3, as.integer) %>%
dataset_split$data <- dataset_split$data %>%
mutate_if(~ length(levels(.x)) > 3, as.integer) %>%
mutate_at(vars(Balance), as.integer)

# dataset_split <- readRDS("data/split_raw.RDS")

df_train <- dataset_split %>% training()
df_test <- dataset_split %>% testing()

gbm_model_1 <- boost_tree(mode = "classification",
mtry = 2,
mtry = 3,
trees = 500,
min_n = 5,
# tree_depth = 5,
Expand Down
54 changes: 37 additions & 17 deletions rand_forest.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,32 +10,52 @@ rm(list = ls())

# source("dataset_prep.R")

dataset_split <- readRDS("data/split.RDS")
dataset_split1 <- readRDS("data/split.RDS")
dataset_split2 <- readRDS("data/split_raw.RDS")

df_train <- dataset_split %>% training()
df_test <- dataset_split %>% testing()
df_train1 <- dataset_split1 %>% training()
df_test1 <- dataset_split1 %>% testing()
df_train2 <- dataset_split2 %>% training()
df_test2 <- dataset_split2 %>% testing()

ranger_model_1 <- rand_forest("classification", 2, 1000, 5) %>%
set_engine("ranger", num.threads = 8, replace = F, sample.fraction = 0.8, importance = "impurity") %>%
# set_engine("ranger", num.threads = 8, replace = F, sample.fraction = 0.8, importance = "permutation", local.importance = T) %>%
# set_engine("ranger", num.threads = 8) %>%
fit(Exited ~ ., data = df_train)
ranger_model_specs <- rand_forest("classification", 2, 1000, 5) %>%
# set_engine("ranger", num.threads = 8, replace = F, sample.fraction = 0.8, importance = "impurity") %>%
set_engine("ranger", num.threads = 8, replace = F, sample.fraction = 0.8, importance = "permutation", local.importance = T)

ranger_model_1 <- ranger_model_specs %>% fit(Exited ~ ., data = df_train1)

ranger_model_2 <- ranger_model_specs %>% fit(Exited ~ ., data = df_train2)

df_pred <- ranger_model_1 %>%
predict(df_test) %>%
bind_cols(df_test)
df_pred1 <- ranger_model_1 %>%
predict(df_test1) %>%
bind_cols(df_test1)

df_pred2 <- ranger_model_2 %>%
predict(df_test2) %>%
bind_cols(df_test2)

df_pred %>% metrics(Exited, .pred_class)
df_pred1 %>% metrics(Exited, .pred_class)
df_pred2 %>% metrics(Exited, .pred_class)

df_pred_probs <- ranger_model_1 %>%
predict(df_test, type = "prob") %>%
bind_cols(df_test)
df_pred_probs1 <- ranger_model_1 %>%
predict(df_test1, type = "prob") %>%
bind_cols(df_test1)

df_pred_probs %>% roc_auc(Exited, .pred_No)
df_pred_probs %>% roc_curve(Exited, .pred_No) %>% autoplot()
df_pred_probs2 <- ranger_model_2 %>%
predict(df_test2, type = "prob") %>%
bind_cols(df_test2)

df_pred_probs1 %>% roc_auc(Exited, .pred_No)
df_pred_probs2 %>% roc_auc(Exited, .pred_No)

df_pred_probs1 %>% roc_curve(Exited, .pred_No) %>% autoplot()
df_pred_probs2 %>% roc_curve(Exited, .pred_No) %>% autoplot()

vi(ranger_model_1)
vi(ranger_model_2)

vip(ranger_model_1)
vip(ranger_model_2)



0 comments on commit 9b23293

Please sign in to comment.