diff --git a/data/split.RDS b/data/split.RDS index 0f9618e..f76c7c5 100644 Binary files a/data/split.RDS and b/data/split.RDS differ diff --git a/data/split_raw.RDS b/data/split_raw.RDS new file mode 100644 index 0000000..eba7e53 Binary files /dev/null and b/data/split_raw.RDS differ diff --git a/dataset_prep.R b/dataset_prep.R index 81dd35b..6b8f5a5 100644 --- a/dataset_prep.R +++ b/dataset_prep.R @@ -17,6 +17,10 @@ data1 <- data_raw %>% HasCrCard = factor(HasCrCard) %>% `levels<-`(c("No", "Yes"))) %>% dplyr::select(-RowNumber, -CustomerId, -Surname) +data1 %>% filter_vars_by_iv(significance_thres = 0.02) %>% + initial_split(prop = 0.75) %>% + saveRDS("data/split_raw.RDS") + data2 <- data1 %>% factorize(bin_methods = "tree") %>% as_tibble() %>% diff --git a/gbm.R b/gbm.R index 891249f..95ae8a8 100644 --- a/gbm.R +++ b/gbm.R @@ -11,15 +11,17 @@ rm(list = ls()) # source("dataset_prep.R") dataset_split <- readRDS("data/split.RDS") -dataset_split$data <- dataset_split$data %>% - mutate_if(~ length(levels(.x)) > 3, as.integer) %>% +dataset_split$data <- dataset_split$data %>% + mutate_if(~ length(levels(.x)) > 3, as.integer) %>% mutate_at(vars(Balance), as.integer) +# dataset_split <- readRDS("data/split_raw.RDS") + df_train <- dataset_split %>% training() df_test <- dataset_split %>% testing() gbm_model_1 <- boost_tree(mode = "classification", - mtry = 2, + mtry = 3, trees = 500, min_n = 5, # tree_depth = 5, diff --git a/rand_forest.R b/rand_forest.R index 73fe9be..a566fa9 100644 --- a/rand_forest.R +++ b/rand_forest.R @@ -10,32 +10,52 @@ rm(list = ls()) # source("dataset_prep.R") -dataset_split <- readRDS("data/split.RDS") +dataset_split1 <- readRDS("data/split.RDS") +dataset_split2 <- readRDS("data/split_raw.RDS") -df_train <- dataset_split %>% training() -df_test <- dataset_split %>% testing() +df_train1 <- dataset_split1 %>% training() +df_test1 <- dataset_split1 %>% testing() +df_train2 <- dataset_split2 %>% training() +df_test2 <- dataset_split2 %>% testing() -ranger_model_1 <- rand_forest("classification", 2, 1000, 5) %>% - set_engine("ranger", num.threads = 8, replace = F, sample.fraction = 0.8, importance = "impurity") %>% - # set_engine("ranger", num.threads = 8, replace = F, sample.fraction = 0.8, importance = "permutation", local.importance = T) %>% - # set_engine("ranger", num.threads = 8) %>% - fit(Exited ~ ., data = df_train) +ranger_model_specs <- rand_forest("classification", 2, 1000, 5) %>% + # set_engine("ranger", num.threads = 8, replace = F, sample.fraction = 0.8, importance = "impurity") %>% + set_engine("ranger", num.threads = 8, replace = F, sample.fraction = 0.8, importance = "permutation", local.importance = T) + +ranger_model_1 <- ranger_model_specs %>% fit(Exited ~ ., data = df_train1) + +ranger_model_2 <- ranger_model_specs %>% fit(Exited ~ ., data = df_train2) -df_pred <- ranger_model_1 %>% - predict(df_test) %>% - bind_cols(df_test) +df_pred1 <- ranger_model_1 %>% + predict(df_test1) %>% + bind_cols(df_test1) + +df_pred2 <- ranger_model_2 %>% + predict(df_test2) %>% + bind_cols(df_test2) -df_pred %>% metrics(Exited, .pred_class) +df_pred1 %>% metrics(Exited, .pred_class) +df_pred2 %>% metrics(Exited, .pred_class) -df_pred_probs <- ranger_model_1 %>% - predict(df_test, type = "prob") %>% - bind_cols(df_test) +df_pred_probs1 <- ranger_model_1 %>% + predict(df_test1, type = "prob") %>% + bind_cols(df_test1) -df_pred_probs %>% roc_auc(Exited, .pred_No) -df_pred_probs %>% roc_curve(Exited, .pred_No) %>% autoplot() +df_pred_probs2 <- ranger_model_2 %>% + predict(df_test2, type = "prob") %>% + bind_cols(df_test2) + +df_pred_probs1 %>% roc_auc(Exited, .pred_No) +df_pred_probs2 %>% roc_auc(Exited, .pred_No) + +df_pred_probs1 %>% roc_curve(Exited, .pred_No) %>% autoplot() +df_pred_probs2 %>% roc_curve(Exited, .pred_No) %>% autoplot() vi(ranger_model_1) +vi(ranger_model_2) + vip(ranger_model_1) +vip(ranger_model_2)