diff --git a/data/fitted_models.RDS b/data/fitted_models.RDS
new file mode 100644
index 0000000..f8e7bf7
Binary files /dev/null and b/data/fitted_models.RDS differ
diff --git a/data/predictions.RDS b/data/predictions.RDS
new file mode 100644
index 0000000..f98956a
Binary files /dev/null and b/data/predictions.RDS differ
diff --git a/dataset_prep.R b/dataset_prep.R
index 190cf40..cc81e26 100644
--- a/dataset_prep.R
+++ b/dataset_prep.R
@@ -30,13 +30,13 @@ data2 <- data2 %>%
 data2 %>%
   filter_vars_by_iv(significance_thres = 0.01) %>%
   initial_split(prop = 0.75) %>%
-  saveRDS("data/split_raw.RDS")
+  write_rds("data/split_raw.RDS", compress = "gz")
 
 data3 <- data2 %>%
   factorize(bin_methods = "tree") %>%
   as_tibble() %>%
   filter_vars_by_iv(significance_thres = 0.01)
 
-dataset_split <- data3 %>% initial_split(prop = 0.75) %>% saveRDS("data/split.RDS")
+dataset_split <- data3 %>% initial_split(prop = 0.75) %>% write_rds("data/split.RDS", compress = "gz")
 
 rm(list = ls())
diff --git a/funs_valid.R b/funs_valid.R
new file mode 100644
index 0000000..4b67024
--- /dev/null
+++ b/funs_valid.R
@@ -0,0 +1,16 @@
+#### validation and visualisation automating functions
+
+get_all_metrics <- function(pred_dfs, spec = 1L) {
+  f <- function(df) metrics(df, Exited, .pred_class, .pred_No)
+  model1_metrics <- f(pred_dfs[[1]][[spec]])
+
+  all_metrics <- do.call(
+    function(...) bind_cols(model1_metrics, ...),
+    pred_dfs %>% select(-1) %>% map(~ f(.x[[spec]])$.estimate)) %>%
+    rename(model_1 = .estimate) %>%
+    return
+}
+
+exportable_conf_matrix <- function(df) {
+  conf_mat(df, Exited, .pred_class)$table
+}
\ No newline at end of file
diff --git a/rand_forest.R b/rand_forest.R
index a566fa9..47e598b 100644
--- a/rand_forest.R
+++ b/rand_forest.R
@@ -10,52 +10,35 @@ rm(list = ls())
 
 # source("dataset_prep.R")
 
-dataset_split1 <- readRDS("data/split.RDS")
-dataset_split2 <- readRDS("data/split_raw.RDS")
-
-df_train1 <- dataset_split1 %>% training()
-df_test1 <- dataset_split1 %>% testing()
-df_train2 <- dataset_split2 %>% training()
-df_test2 <- dataset_split2 %>% testing()
-
-ranger_model_specs <- rand_forest("classification", 2, 1000, 5) %>%
-  # set_engine("ranger", num.threads = 8, replace = F, sample.fraction = 0.8, importance = "impurity") %>%
-  set_engine("ranger", num.threads = 8, replace = F, sample.fraction = 0.8, importance = "permutation", local.importance = T)
-
-ranger_model_1 <- ranger_model_specs %>% fit(Exited ~ ., data = df_train1)
-
-ranger_model_2 <- ranger_model_specs %>% fit(Exited ~ ., data = df_train2)
+dataset_splits <- list(
+  read_rds("data/split.RDS"),
+  read_rds("data/split_raw.RDS")
+  )
+
+testing_sets <- dataset_splits %>% map(~ .x %>% testing())
+
+models_specs <- list(
+  rand_forest("classification", 2, 1000, 5) %>%
+    # set_engine("ranger", num.threads = 8, replace = F, sample.fraction = 0.8, importance = "impurity") %>%
+    set_engine("ranger", num.threads = 8, replace = F, sample.fraction = 0.8, importance = "permutation", local.importance = T)
+  )
+
+spec_names <- str_c("model_", 1:length(dataset_splits))
+fitted_models <- dataset_splits %>%
+  map(~ .x %>% training()) %>%
+  map2_dfc(spec_names, function(df, col_name) {
+    tibble(!!col_name := models_specs %>% map(~ .x %>% fit(Exited ~ ., data = df)))
+  })
 
-df_pred1 <- ranger_model_1 %>%
-  predict(df_test1) %>%
-  bind_cols(df_test1)
-
-df_pred2 <- ranger_model_2 %>%
-  predict(df_test2) %>%
-  bind_cols(df_test2)
-
-df_pred1 %>% metrics(Exited, .pred_class)
-df_pred2 %>% metrics(Exited, .pred_class)
-
-df_pred_probs1 <- ranger_model_1 %>%
-  predict(df_test1, type = "prob") %>%
-  bind_cols(df_test1)
-
-df_pred_probs2 <- ranger_model_2 %>%
-  predict(df_test2, type = "prob") %>%
-  bind_cols(df_test2)
-
-df_pred_probs1 %>% roc_auc(Exited, .pred_No)
-df_pred_probs2 %>% roc_auc(Exited, .pred_No)
-
-df_pred_probs1 %>% roc_curve(Exited, .pred_No) %>% autoplot()
-df_pred_probs2 %>% roc_curve(Exited, .pred_No) %>% autoplot()
-
-vi(ranger_model_1)
-vi(ranger_model_2)
-
-vip(ranger_model_1)
-vip(ranger_model_2)
-
-
-
+pred_dfs <- list(fitted_models, testing_sets, spec_names) %>% pmap_dfc(function(models_by_spec, df, spec_name) {
+  tibble(!!spec_name :=
+           models_by_spec %>% map(function(model) {
+             df %>% bind_cols(
+               model %>% predict(df),
+               model %>% predict(df, type = "prob")
+             )
+           }))
+})
+
+fitted_models %>% write_rds("data/fitted_models.RDS", compress = "bz2")
+pred_dfs %>% write_rds("data/predictions.RDS", compress = "bz2")
diff --git a/validation_rf.R b/validation_rf.R
new file mode 100644
index 0000000..7e0e93c
--- /dev/null
+++ b/validation_rf.R
@@ -0,0 +1,29 @@
+#### validation
+
+library(tidyverse)
+library(stringr)
+library(tidymodels)
+library(ranger)
+library(vip)
+library(knitr)
+library(kableExtra)
+
+rm(list = ls())
+
+fitted_models <- read_rds("data/fitted_models.RDS")
+pred_dfs <- read_rds("data/predictions.RDS")
+
+source("funs_valid.R")
+
+all_metrics <- get_all_metrics(pred_dfs)
+
+pred_dfs[[1]][[1]] %>% exportable_conf_matrix %>% kable(format = "markdown")
+
+# df_pred_probs1 %>% roc_curve(Exited, .pred_No) %>% autoplot()
+# df_pred_probs2 %>% roc_curve(Exited, .pred_No) %>% autoplot()
+#
+# vi(ranger_model_1)
+# vi(ranger_model_2)
+#
+# vip(ranger_model_1)
+# vip(ranger_model_2)
\ No newline at end of file