Skip to content

Commit

Permalink
Merge pull request #17 from jcierocki/dummy-vars
Browse files Browse the repository at this point in the history
Dummy variables for Geography
  • Loading branch information
jcierocki authored May 9, 2020
2 parents 7fe84f5 + f9a0864 commit 6fe28ec
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 6 deletions.
Binary file modified data/split.RDS
Binary file not shown.
Binary file modified data/split_raw.RDS
Binary file not shown.
19 changes: 15 additions & 4 deletions dataset_prep.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ library(tidyverse)
library(tidymodels)
library(stringr)
library(scorecard)
library(recipes)

rm(list = ls())

Expand All @@ -17,15 +18,25 @@ data1 <- data_raw %>%
HasCrCard = factor(HasCrCard) %>% `levels<-`(c("No", "Yes"))) %>%
dplyr::select(-RowNumber, -CustomerId, -Surname)

data1 %>% filter_vars_by_iv(significance_thres = 0.02) %>%
# Replace Geography with indicator columns via recipes::step_dummy(), using
# Exited as the outcome and every other column of data1 as a predictor.
data2 <- recipe(Exited ~ ., data = data1) %>%
  step_dummy(Geography) %>%
  prep() %>%
  bake(new_data = data1)

# Columns whose name is "Geography" or starts with "Geography_", i.e. the
# indicator columns produced by the dummy step above.
changed_cols_idx <- str_detect(colnames(data2), "^Geography(_|$)")
changed_cols <- colnames(data2)[changed_cols_idx]

data2 <- data2 %>%
  # Map the indicator values 0/1 to "No"/"Yes" by value rather than by level
  # position, so a constant indicator column cannot be mislabelled.
  mutate_at(
    changed_cols,
    ~ factor(.x, levels = c(0, 1), labels = c("No", "Yes"))
  ) %>%
  # Drop the first underscore from each indicator name.
  # NOTE(review): presumably needed because factorize() keeps only the part of
  # a column name before the first "_" -- confirm against funs_preproc.R.
  rename_at(changed_cols, ~ str_remove(.x, "_"))

data2 %>% filter_vars_by_iv(significance_thres = 0.01) %>%
initial_split(prop = 0.75) %>%
saveRDS("data/split_raw.RDS")

data2 <- data1 %>%
data3 <- data2 %>%
factorize(bin_methods = "tree") %>%
as_tibble() %>%
filter_vars_by_iv(significance_thres = 0.02)
filter_vars_by_iv(significance_thres = 0.01)

dataset_split <- initial_split(data2, prop = 0.75) %>% saveRDS("data/split.RDS")
dataset_split <- data3 %>% initial_split(prop = 0.75) %>% saveRDS("data/split.RDS")

rm(list = ls())
4 changes: 2 additions & 2 deletions funs_preproc.R
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ choose_best_binning <- function(binnings_df) {
}

factorize <- function(df, y_name = "Exited", y_pos = "No", bin_limit = 6, bin_methods = c("tree", "chimerge")) {
fct_cols <- colnames(df)[data1 %>% map_lgl(~ !is.factor(.x)) & colnames(df) != y_name]
fct_cols <- colnames(df)[(df %>% map_lgl(~ !is.factor(.x))) & colnames(df) != y_name]
binnings <- bin_methods %>%
map(~ df %>% woebin(y = y_name, x = fct_cols, positive = y_pos, bin_num_limit = bin_limit, method = .x)) %>%
`names<-`(bin_methods) %>%
Expand All @@ -51,7 +51,7 @@ factorize <- function(df, y_name = "Exited", y_pos = "No", bin_limit = 6, bin_me

df %>% woebin_ply(bins = bins_best, to = "bin") %>%
mutate_if(~ !is.factor(.x), as.factor) %>%
rename_all(function(x) map_chr(str_split(x, "_"), ~ .x[1])) %>%
rename_all(function(x) str_split(x, "_") %>% map_chr(~ .x[1])) %>%
return
}

Expand Down

0 comments on commit 6fe28ec

Please sign in to comment.