Skip to content

Commit

Permalink
Added basic XGBoost model (with simple validation), analogous to prev…
Browse files Browse the repository at this point in the history
…iously trained Ranger one. Generalized the "factorize" function to work around a bug in the scorecard package, which ignores the "bin_num_limit" argument when using the "chimerge" method.
  • Loading branch information
Jakub Cierocki committed May 9, 2020
1 parent 2aee0c6 commit 4c6fcd0
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 13 deletions.
Binary file modified data/split.RDS
Binary file not shown.
7 changes: 4 additions & 3 deletions dataset_prep.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,11 @@ data1 <- data_raw %>%
HasCrCard = factor(HasCrCard) %>% `levels<-`(c("No", "Yes"))) %>%
dplyr::select(-RowNumber, -CustomerId, -Surname)

data2 <- data1 %>% factorize() %>% as_tibble() %>% filter_vars_by_iv()
data2 <- data1 %>%
factorize(bin_methods = "tree") %>%
as_tibble() %>%
filter_vars_by_iv(significance_thres = 0.02)

dataset_split <- initial_split(data2, prop = 0.75) %>% saveRDS("data/split.RDS")

rm(list = ls())


24 changes: 15 additions & 9 deletions funs_preproc.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,23 @@ merge_factor_vars.tbl <- function(var1, ...) {
do.call(function(...) factor(str_c(...)), as.list(var1))
}

factorize <- function(df, y_name = "Exited", y_pos = "No") {
# For each variable (one row of `binnings_df`), select the candidate binning
# with the highest total information value (IV).
#
# binnings_df: tibble with one list-column per binning method; each row holds
#   that variable's woebin() result under each method.
# Returns a list, one element per row, containing the winning binning table.
choose_best_binning <- function(binnings_df) {
  binnings_df %>% pmap(function(...) {
    candidates <- list(...)
    # total_iv is repeated on every row of a woebin() table, so the first
    # entry is sufficient for the comparison.
    ivs <- candidates %>% map_dbl(~ .x$total_iv[1])
    candidates[[which.max(ivs)]]
  })
}

factorize <- function(df, y_name = "Exited", y_pos = "No", bin_limit = 6, bin_methods = c("tree", "chimerge")) {
fct_cols <- colnames(df)[data1 %>% map_lgl(~ !is.factor(.x)) & colnames(df) != y_name]
bins_tree <- df %>% woebin(y = y_name, x = fct_cols, positive = y_pos, bin_num_limit = 5, method = "tree")
bins_chimerge <- df %>% woebin(y = y_name, x = fct_cols, positive = y_pos, bin_num_limit = 5, method = "chimerge")
binnings <- bin_methods %>%
map(~ df %>% woebin(y = y_name, x = fct_cols, positive = y_pos, bin_num_limit = bin_limit, method = .x)) %>%
`names<-`(bin_methods) %>%
as_tibble()

bins_best <- map2(bins_tree, bins_chimerge, function(x, y) {
if(x$total_iv[1] > y$total_iv[1])
return(x)
else
return(y)
})
bins_best <- choose_best_binning(binnings)

df %>% woebin_ply(bins = bins_best, to = "bin") %>%
mutate_if(~ !is.factor(.x), as.factor) %>%
Expand Down
30 changes: 30 additions & 0 deletions gbm.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,36 @@ rm(list = ls())
# source("dataset_prep.R")

# Load the train/test split persisted by dataset_prep.R.
dataset_split <- readRDS("data/split.RDS")
# xgboost needs numeric predictors: recode factors with more than 3 levels,
# plus Balance, to their underlying integer codes.
# NOTE(review): assumes the remaining low-cardinality factors are acceptable
# to the engine/formula interface — confirm.
dataset_split$data <- dataset_split$data %>%
mutate_if(~ length(levels(.x)) > 3, as.integer) %>%
mutate_at(vars(Balance), as.integer)

# rsample accessors for the two partitions of the split.
df_train <- dataset_split %>% training()
df_test <- dataset_split %>% testing()

# Gradient-boosted tree classifier (parsnip front end, xgboost engine) with
# hand-picked hyperparameters; tree_depth deliberately left at the engine
# default (commented out below).
gbm_model_1 <- boost_tree(mode = "classification",
mtry = 2,
trees = 500,
min_n = 5,
# tree_depth = 5,
learn_rate = .1,
loss_reduction = 0,
sample_size = 0.7) %>%
set_engine("xgboost", objective = "binary:logistic") %>%
fit(Exited ~ ., data = df_train)

# Hard-class predictions on the held-out set, joined back onto the test data
# so metrics can compare .pred_class against the Exited truth column.
df_pred <- gbm_model_1 %>%
predict(df_test) %>%
bind_cols(df_test)

# Overall classification metrics (yardstick::metrics) on the test set.
df_pred %>% metrics(Exited, .pred_class)

# Class-probability predictions for threshold-free evaluation.
df_pred_probs <- gbm_model_1 %>%
predict(df_test, type = "prob") %>%
bind_cols(df_test)

# ROC AUC and ROC curve scored on the "No" class probability
# (presumably "No" is the positive class, matching y_pos in dataset prep —
# confirm).
df_pred_probs %>% roc_auc(Exited, .pred_No)
df_pred_probs %>% roc_curve(Exited, .pred_No) %>% autoplot()

# Variable-importance table and plot (vip package).
vi(gbm_model_1)
vip(gbm_model_1)
3 changes: 2 additions & 1 deletion rand_forest.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@ rm(list = ls())
# source("dataset_prep.R")

dataset_split <- readRDS("data/split.RDS")

df_train <- dataset_split %>% training()
df_test <- dataset_split %>% testing()

ranger_model_1 <- rand_forest("classification", 2, 500, 5) %>%
ranger_model_1 <- rand_forest("classification", 2, 1000, 5) %>%
set_engine("ranger", num.threads = 8, replace = F, sample.fraction = 0.8, importance = "impurity") %>%
# set_engine("ranger", num.threads = 8, replace = F, sample.fraction = 0.8, importance = "permutation", local.importance = T) %>%
# set_engine("ranger", num.threads = 8) %>%
Expand Down

0 comments on commit 4c6fcd0

Please sign in to comment.