Skip to content

Commit

Permalink
Added basic XGBoost model (with simple validation), analogous to prev…
Browse files Browse the repository at this point in the history
…iously trained Ranger one. Generalized the "factorize" function to work around a bug in the scorecard package, which ignores the "bin_num_limit" argument when using the "chimerge" method.
  • Loading branch information
Jakub Cierocki committed May 9, 2020
1 parent 2aee0c6 commit 4c6fcd0
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 13 deletions.
Binary file modified data/split.RDS
Binary file not shown.
7 changes: 4 additions & 3 deletions dataset_prep.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,11 @@ data1 <- data_raw %>%
HasCrCard = factor(HasCrCard) %>% `levels<-`(c("No", "Yes"))) %>%
dplyr::select(-RowNumber, -CustomerId, -Surname)

data2 <- data1 %>% factorize() %>% as_tibble() %>% filter_vars_by_iv()
data2 <- data1 %>%
factorize(bin_methods = "tree") %>%
as_tibble() %>%
filter_vars_by_iv(significance_thres = 0.02)

dataset_split <- initial_split(data2, prop = 0.75) %>% saveRDS("data/split.RDS")

rm(list = ls())


24 changes: 15 additions & 9 deletions funs_preproc.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,23 @@ merge_factor_vars.tbl <- function(var1, ...) {
do.call(function(...) factor(str_c(...)), as.list(var1))
}

factorize <- function(df, y_name = "Exited", y_pos = "No") {
# For each variable (one row of `binnings_df`), select the candidate binning
# with the highest total information value (IV).
#
# binnings_df: tibble with one list-column per binning method; each row holds
#   that variable's woebin() result under each method.
# Returns a list, one element per row, containing the winning binning table.
choose_best_binning <- function(binnings_df) {
  binnings_df %>% pmap(function(...) {
    candidates <- list(...)
    # total_iv is repeated on every row of a woebin() table, so the first
    # entry is sufficient for the comparison.
    ivs <- candidates %>% map_dbl(~ .x$total_iv[1])
    candidates[[which.max(ivs)]]
  })
}

factorize <- function(df, y_name = "Exited", y_pos = "No", bin_limit = 6, bin_methods = c("tree", "chimerge")) {
fct_cols <- colnames(df)[data1 %>% map_lgl(~ !is.factor(.x)) & colnames(df) != y_name]
bins_tree <- df %>% woebin(y = y_name, x = fct_cols, positive = y_pos, bin_num_limit = 5, method = "tree")
bins_chimerge <- df %>% woebin(y = y_name, x = fct_cols, positive = y_pos, bin_num_limit = 5, method = "chimerge")
binnings <- bin_methods %>%
map(~ df %>% woebin(y = y_name, x = fct_cols, positive = y_pos, bin_num_limit = bin_limit, method = .x)) %>%
`names<-`(bin_methods) %>%
as_tibble()

bins_best <- map2(bins_tree, bins_chimerge, function(x, y) {
if(x$total_iv[1] > y$total_iv[1])
return(x)
else
return(y)
})
bins_best <- choose_best_binning(binnings)

df %>% woebin_ply(bins = bins_best, to = "bin") %>%
mutate_if(~ !is.factor(.x), as.factor) %>%
Expand Down
30 changes: 30 additions & 0 deletions gbm.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,36 @@ rm(list = ls())
# source("dataset_prep.R")

# Load the train/test split persisted by dataset_prep.R.
dataset_split <- readRDS("data/split.RDS")
# xgboost needs numeric predictors: recode factors with more than 3 levels,
# plus Balance, to their underlying integer codes.
# NOTE(review): assumes the remaining low-cardinality factors are acceptable
# to the engine/formula interface — confirm.
dataset_split$data <- dataset_split$data %>%
mutate_if(~ length(levels(.x)) > 3, as.integer) %>%
mutate_at(vars(Balance), as.integer)

# rsample accessors for the two partitions of the split.
df_train <- dataset_split %>% training()
df_test <- dataset_split %>% testing()

# Gradient-boosted tree classifier (parsnip front end, xgboost engine) with
# hand-picked hyperparameters; tree_depth deliberately left at the engine
# default (commented out below).
gbm_model_1 <- boost_tree(mode = "classification",
mtry = 2,
trees = 500,
min_n = 5,
# tree_depth = 5,
learn_rate = .1,
loss_reduction = 0,
sample_size = 0.7) %>%
set_engine("xgboost", objective = "binary:logistic") %>%
fit(Exited ~ ., data = df_train)

# Hard-class predictions on the held-out set, joined back onto the test data
# so metrics can compare .pred_class against the Exited truth column.
df_pred <- gbm_model_1 %>%
predict(df_test) %>%
bind_cols(df_test)

# Overall classification metrics (yardstick::metrics) on the test set.
df_pred %>% metrics(Exited, .pred_class)

# Class-probability predictions for threshold-free evaluation.
df_pred_probs <- gbm_model_1 %>%
predict(df_test, type = "prob") %>%
bind_cols(df_test)

# ROC AUC and ROC curve scored on the "No" class probability
# (presumably "No" is the positive class, matching y_pos in dataset prep —
# confirm).
df_pred_probs %>% roc_auc(Exited, .pred_No)
df_pred_probs %>% roc_curve(Exited, .pred_No) %>% autoplot()

# Variable-importance table and plot (vip package).
vi(gbm_model_1)
vip(gbm_model_1)
3 changes: 2 additions & 1 deletion rand_forest.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@ rm(list = ls())
# source("dataset_prep.R")

dataset_split <- readRDS("data/split.RDS")

df_train <- dataset_split %>% training()
df_test <- dataset_split %>% testing()

ranger_model_1 <- rand_forest("classification", 2, 500, 5) %>%
ranger_model_1 <- rand_forest("classification", 2, 1000, 5) %>%
set_engine("ranger", num.threads = 8, replace = F, sample.fraction = 0.8, importance = "impurity") %>%
# set_engine("ranger", num.threads = 8, replace = F, sample.fraction = 0.8, importance = "permutation", local.importance = T) %>%
# set_engine("ranger", num.threads = 8) %>%
Expand Down

0 comments on commit 4c6fcd0

Please sign in to comment.