From 1bb3c53e71ef811048e1e03ce3311803f3b4baa0 Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Thu, 14 May 2020 18:56:38 +0200 Subject: [PATCH 01/19] catboost-plot-pimp-shapimp --- boruta/boruta_py.py | 358 +++++++++++++++++++++++++++++-------- boruta/test/modif_tests.py | 154 ++++++++++++++++ 2 files changed, 435 insertions(+), 77 deletions(-) create mode 100644 boruta/test/modif_tests.py diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index fd3f20b..c346708 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -1,3 +1,18 @@ +from __future__ import print_function, division +import numpy as np +import scipy as sp +from sklearn.utils import check_random_state, check_X_y +from sklearn.base import TransformerMixin, BaseEstimator, is_classifier, is_regressor +from sklearn.utils import check_random_state, check_X_y +from sklearn.base import TransformerMixin, BaseEstimator, is_regressor, is_classifier +from sklearn.model_selection import RepeatedKFold, train_test_split +from sklearn.inspection import permutation_importance +import shap +import pandas as pd +import time +import matplotlib +import matplotlib.pyplot as plt +from matplotlib.lines import Line2D #!/usr/bin/env python # -*- coding: utf-8 -*- """ @@ -8,17 +23,11 @@ License: BSD 3 clause """ -from __future__ import print_function, division -import numpy as np -import scipy as sp -from sklearn.utils import check_random_state, check_X_y -from sklearn.base import TransformerMixin, BaseEstimator class BorutaPy(BaseEstimator, TransformerMixin): """ Improved Python implementation of the Boruta R package. - The improvements of this implementation include: - Faster run times: Thanks to scikit-learn's fast implementation of the ensemble methods. @@ -48,40 +57,33 @@ class BorutaPy(BaseEstimator, TransformerMixin): - Ranking of features: After fitting BorutaPy it provides the user with ranking of features. Confirmed ones are 1, Tentatives are 2, and the rejected are ranked - starting from 3, based on their feautre importance history through + starting from 3, based on their feature importance history through the iterations. - + - Using either the native variable importance, scikit permutation importance, + SHAP importance. We highly recommend using pruned trees with a depth between 3-7. - For more, see the docs of these functions, and the examples below. - Original code and method by: Miron B Kursa, https://m2.icm.edu.pl/boruta/ - Boruta is an all relevant feature selection method, while most other are minimal optimal; this means it tries to find all features carrying information usable for prediction, rather than finding a possibly compact subset of features on which some classifier has a minimal error. - Why bother with all relevant feature selection? When you try to understand the phenomenon that made your data, you should care about all factors that contribute to it, not just the bluntest signs of it in context of your methodology (yes, minimal optimal set of features by definition depends on your classifier choice). - Parameters ---------- - estimator : object A supervised learning estimator, with a 'fit' method that returns the feature_importances_ attribute. Important features must correspond to high absolute values in the feature_importances_. - n_estimators : int or string, default = 1000 If int sets the number of estimators in the chosen ensemble method. If 'auto' this is determined automatically based on the size of the dataset. The other parameters of the used estimators need to be set with initialisation. 
- perc : int, default = 100 Instead of the max we use the percentile defined by the user, to pick our threshold for comparison between shadow and real features. The max @@ -89,93 +91,81 @@ class BorutaPy(BaseEstimator, TransformerMixin): lower perc is the more false positives will be picked as relevant but also the less relevant features will be left out. The usual trade-off. The default is essentially the vanilla Boruta corresponding to the max. - alpha : float, default = 0.05 Level at which the corrected p-values will get rejected in both correction steps. - + importance : str, default = 'shap' + The kind of variable importance used to compare and discriminate original + vs shadow predictors. two_step : Boolean, default = True If you want to use the original implementation of Boruta with Bonferroni correction only set this to False. - max_iter : int, default = 100 The number of maximum iterations to perform. - random_state : int, RandomState instance or None; default=None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - + weight : pd.Series verbose : int, default=0 Controls verbosity of output: - 0: no output - 1: displays iteration number - 2: which features have been selected already - Attributes ---------- - n_features_ : int The number of selected features. - support_ : array of shape [n_features] - The mask of selected features - only confirmed ones are True. - support_weak_ : array of shape [n_features] - The mask of selected tentative features, which haven't gained enough support during the max_iter number of iterations.. - ranking_ : array of shape [n_features] - The feature ranking, such that ``ranking_[i]`` corresponds to the ranking position of the i-th feature. Selected (i.e., estimated best) features are assigned rank 1 and tentative features are assigned rank 2. - Examples -------- - + import pandas as pd from sklearn.ensemble import RandomForestClassifier from boruta import BorutaPy - + # load X and y # NOTE BorutaPy accepts numpy arrays only, hence the .values attribute X = pd.read_csv('examples/test_X.csv', index_col=0).values y = pd.read_csv('examples/test_y.csv', header=None, index_col=0).values y = y.ravel() - + # define random forest classifier, with utilising all cores and # sampling in proportion to y labels rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5) - + # define Boruta feature selection method feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1) - + # find all relevant features - 5 features should be selected feat_selector.fit(X, y) - + # check selected features - first 5 features are selected feat_selector.support_ - + # check ranking of features feat_selector.ranking_ - + # call transform() on X to filter it down to selected features X_filtered = feat_selector.transform(X) - References ---------- - [1] Kursa M., Rudnicki W., "Feature Selection with the Boruta Package" Journal of Statistical Software, Vol. 
36, Issue 11, Sep 2010 """ - def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05, - two_step=True, max_iter=100, random_state=None, verbose=0): + def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05, importance='shap', + two_step=True, max_iter=100, random_state=None, weight=None, verbose=0): self.estimator = estimator self.n_estimators = n_estimators self.perc = perc @@ -183,39 +173,48 @@ def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05, self.two_step = two_step self.max_iter = max_iter self.random_state = random_state + self.random_state_instance = None self.verbose = verbose + self.weight = weight + self.importance = importance + self.cat_name = None + self.cat_idx = None + # Catboost doesn't allow to change random seed after fitting + self.is_cat = False + # plotting + self.imp_real_hist = None + self.sha_max = None + self.col_names = None def fit(self, X, y): """ Fits the Boruta feature selection with the provided estimator. - Parameters ---------- X : array-like, shape = [n_samples, n_features] The training input samples. - y : array-like, shape = [n_samples] The target values. """ - + self.imp_real_hist = np.empty((0, X.shape[1]), float) + if isinstance(X, pd.DataFrame) is not True: + X = pd.DataFrame(X) + self.col_names = X.columns.to_list() return self._fit(X, y) def transform(self, X, weak=False, return_df=False): """ Reduces the input X to the features selected by Boruta. - Parameters ---------- X : array-like, shape = [n_samples, n_features] The training input samples. - weak: boolean, default = False If set to true, the tentative features are also used to reduce X. - + return_df : boolean, default = False If ``X`` if a pandas dataframe and this parameter is set to True, the transformed data will also be a dataframe. - Returns ------- X : array-like, shape = [n_samples, n_features_] @@ -228,22 +227,17 @@ def transform(self, X, weak=False, return_df=False): def fit_transform(self, X, y, weak=False, return_df=False): """ Fits Boruta, then reduces the input X to the selected features. - Parameters ---------- X : array-like, shape = [n_samples, n_features] The training input samples. - y : array-like, shape = [n_samples] The target values. - weak: boolean, default = False If set to true, the tentative features are also used to reduce X. - return_df : boolean, default = False If ``X`` if a pandas dataframe and this parameter is set to True, the transformed data will also be a dataframe. - Returns ------- X : array-like, shape = [n_samples, n_features_] @@ -254,6 +248,63 @@ def fit_transform(self, X, y, weak=False, return_df=False): self._fit(X, y) return self._transform(X, weak, return_df) + def plot_importance(self): + """ + Boxplot of the variable importance, ordered by magnitude + The max shadow variable importance illustrated by the dashed line. + Requires to apply the fit method first. 
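+        Typical use (mirroring boruta/test/modif_tests.py): call feat_selector.fit(X, y) first, then feat_selector.plot_importance().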
+ :return: boxplot + """ + if self.imp_real_hist is None: + raise ValueError("Use the fit method first to compute the var.imp") + + color = {'boxes': 'gray', 'whiskers': 'gray', 'medians': '#404040', 'caps': 'gray'} + vimp_df = pd.DataFrame(self.imp_real_hist, columns=self.col_names) + vimp_df = vimp_df.reindex(vimp_df.mean().sort_values(ascending=True).index, axis=1) + bp = vimp_df.boxplot(color=color, + boxprops=dict(linestyle='-', linewidth=1.5), + flierprops=dict(linestyle='-', linewidth=1.5), + medianprops=dict(linestyle='-', linewidth=1.5), + whiskerprops=dict(linestyle='-', linewidth=1.5), + capprops=dict(linestyle='-', linewidth=1.5), + showfliers=False, grid=True, rot=0, vert=False, patch_artist=True, + figsize=(10, 3 * len(self.support_) / 10) + ) + + box_face_col = ["tab:blue"] * sum(self.support_) + ["#FDD023"] * sum(self.support_weak_) + for c in range(len(box_face_col)): + bp.findobj(matplotlib.patches.Patch)[len(self.support_) - c - 1].set_facecolor(box_face_col[c]) + bp.findobj(matplotlib.patches.Patch)[len(self.support_) - c - 1].set_facecolor(box_face_col[c]) + + # xt = bp.get_xticks() + # xt = np.round(np.append(xt, self.sha_max), 2) + # xtl = xt.tolist() + # xtl[-1] = "sha. max" + # bp.set_xticks(xt) + # bp.set_xticklabels(xtl) + bp.set_xlim(left=vimp_df.min(skipna=True).min(skipna=True)) + custom_lines = [Line2D([0], [0], color="tab:blue", lw=5), + Line2D([0], [0], color="#FDD023", lw=5), + Line2D([0], [0], color="gray", lw=5), + Line2D([0], [0], linestyle='--', color="gray", lw=2)] + bp.legend(custom_lines, ['confirmed', 'tentative', 'rejected', 'sha. max'], loc="lower right") + plt.axvline(x=self.sha_max, linestyle='--', color='gray') + fig = bp.get_figure() + fig.suptitle('Boruta importance and selected predictors') + plt.show() + + def _is_catboost(self): + self.is_cat = 'catboost' in str(type(self.estimator)) + + def _is_tree_based(self): + """ + checking if the estimator is tree-based (kernel SAP is too slow to be used here, unless using sampling) + :return: + """ + tree_based_models = ['lightgbm', 'xgboost', 'catboost', '_forest'] + condition = any(i in str(type(self.estimator)) for i in tree_based_models) + return condition + def _validate_pandas_input(self, arg): try: return arg.values @@ -262,16 +313,48 @@ def _validate_pandas_input(self, arg): "input needs to be a numpy array or pandas data frame." 
) - def _fit(self, X, y): + def _fit(self, X_raw, y): + self._is_catboost() + + start_time = time.time() + # basic cat features encoding + # First, let's store "object" columns as categorical columns + # obj_feat = X_raw.dtypes.loc[X_raw.dtypes == 'object'].index.tolist() + obj_feat = list(set(list(X_raw.columns)) - set(list(X_raw.select_dtypes(include=[np.number])))) + + if obj_feat: + X_raw.loc[:, obj_feat] = X_raw[obj_feat].astype('str').astype('category') + cat_feat = X_raw.dtypes.loc[X_raw.dtypes == 'category'].index.tolist() + cat_idx = [X_raw.columns.get_loc(c) for c in cat_feat if c in cat_feat] + if cat_feat: + # a way without loop but need to re-do astype + Cat = X_raw[cat_feat].stack().astype('category').cat.codes.unstack() + + if self.is_cat is False: + if cat_feat: + X = pd.concat([X_raw[X_raw.columns.difference(cat_feat)], Cat], axis=1) + else: + X = X_raw + else: + X = X_raw + + X = np.nan_to_num(X) + y = np.nan_to_num(y) + # w = w.fillna(0) + + self.cat_name = cat_feat + self.cat_idx = cat_idx + # check input params self._check_params(X, y) if not isinstance(X, np.ndarray): - X = self._validate_pandas_input(X) + X = self._validate_pandas_input(X) + if not isinstance(y, np.ndarray): y = self._validate_pandas_input(y) - self.random_state = check_random_state(self.random_state) + self.random_state_instance = check_random_state(self.random_state) # setup variables for Boruta n_sample, n_feat = X.shape _iter = 1 @@ -301,7 +384,9 @@ def _fit(self, X, y): self.estimator.set_params(n_estimators=n_tree) # make sure we start with a new tree in each iteration - self.estimator.set_params(random_state=self.random_state) + # Catboost doesn't allow to change random seed after fitting + if self.is_cat is False: + self.estimator.set_params(random_state=self.random_state) # add shadow attributes, shuffle them and train estimator, get imps cur_imp = self._add_shadows_get_imps(X, y, dec_reg) @@ -342,6 +427,11 @@ def _fit(self, X, y): self.support_[confirmed] = 1 self.support_weak_ = np.zeros(n_feat, dtype=np.bool) self.support_weak_[tentative] = 1 + # for plotting + self.imp_real_hist = imp_history + self.sha_max = imp_sha_max + if isinstance(X_raw, pd.DataFrame): + self.support_names_ = [X_raw.columns[i] for i, x in enumerate(self.support_) if x] # ranking, confirmed variables are rank 1 self.ranking_ = np.ones(n_feat, dtype=np.int) @@ -356,18 +446,18 @@ def _fit(self, X, y): # update rank for not_selected features if not_selected.shape[0] > 0: - # calculate ranks in each iteration, then median of ranks across feats - iter_ranks = self._nanrankdata(imp_history_rejected, axis=1) - rank_medians = np.nanmedian(iter_ranks, axis=0) - ranks = self._nanrankdata(rank_medians, axis=0) - - # set smallest rank to 3 if there are tentative feats - if tentative.shape[0] > 0: - ranks = ranks - np.min(ranks) + 3 - else: - # and 2 otherwise - ranks = ranks - np.min(ranks) + 2 - self.ranking_[not_selected] = ranks + # calculate ranks in each iteration, then median of ranks across feats + iter_ranks = self._nanrankdata(imp_history_rejected, axis=1) + rank_medians = np.nanmedian(iter_ranks, axis=0) + ranks = self._nanrankdata(rank_medians, axis=0) + + # set smallest rank to 3 if there are tentative feats + if tentative.shape[0] > 0: + ranks = ranks - np.min(ranks) + 3 + else: + # and 2 otherwise + ranks = ranks - np.min(ranks) + 2 + self.ranking_[not_selected] = ranks else: # all are selected, thus we set feature supports to True self.support_ = np.ones(n_feat, dtype=np.bool) @@ -375,6 +465,10 @@ def _fit(self, 
X, y): # notify user if self.verbose > 0: self._print_results(dec_reg, _iter, 1) + self.running_time = time.time() - start_time + hours, rem = divmod(self.running_time, 3600) + minutes, seconds = divmod(rem, 60) + print("All relevant predictors selected in {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds)) return self def _transform(self, X, weak=False, return_df=False): @@ -408,7 +502,16 @@ def _get_tree_num(self, n_feat): def _get_imp(self, X, y): try: - self.estimator.fit(X, y) + if self.is_cat: + X = pd.DataFrame(X) + obj_feat = X.dtypes.loc[(X.dtypes == 'object') | (X.dtypes == 'category')].index.tolist() + if obj_feat: + X[obj_feat] = X[obj_feat].astype('str').astype('category') + cat_feat = X.dtypes.loc[X.dtypes == 'category'].index.tolist() + self.estimator.fit(X, y, sample_weight=self.weight, cat_features=cat_feat) + else: + self.estimator.fit(X, y, sample_weight=self.weight) + except Exception as e: raise ValueError('Please check your X and y variable. The provided ' 'estimator cannot be fitted to your data.\n' + str(e)) @@ -419,8 +522,100 @@ def _get_imp(self, X, y): 'are currently supported in BorutaPy.') return imp + def _get_shap_imp(self, X, y): + + if self.weight is not None: + w = self.weight + if is_regressor(self.estimator): + X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, random_state=42) + else: + X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, stratify=y, random_state=42) + else: + if is_regressor(self.estimator): + X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, random_state=42) + else: + X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, stratify=y, random_state=42) + w_tr, w_tt = None, None + + X_tr = pd.DataFrame(X_tr) + X_tt = pd.DataFrame(X_tt) + obj_feat = list(set(list(X_tr.columns)) - set(list(X_tr.select_dtypes(include=[np.number])))) + obj_idx = None + + # if obj_feat: + # X_tr[obj_feat] = X_tr[obj_feat].astype('str').astype('category') + # X_tt[obj_feat] = X_tt[obj_feat].astype('str').astype('category') + # obj_idx = np.argwhere(X_tr.columns.isin(obj_feat)).ravel() + + if self._is_tree_based(): + try: + if self.is_cat: + model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_idx) + else: + model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr) + + except Exception as e: + raise ValueError('Please check your X and y variable. 
The provided ' + 'estimator cannot be fitted to your data.\n' + str(e)) + # build the explainer + explainer = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent") + shap_values = explainer.shap_values(X_tt) + # flatten to 2D if classification + if is_classifier(self.estimator): + shap_values = np.array(shap_values) + shap_values = shap_values.reshape(-1, shap_values.shape[-1]) + + shap_imp = np.mean(np.abs(shap_values), axis=0) + else: + raise ValueError('Not a tree based model') + + return shap_imp + + def _get_perm_imp(self, X, y): + + if self.weight is not None: + w = self.weight + if is_regressor(self.estimator): + X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, random_state=42) + else: + X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, stratify=y, random_state=42) + else: + if is_regressor(self.estimator): + X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, random_state=42) + else: + X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, stratify=y, random_state=42) + w_tr, w_tt = None, None + + X_tr = pd.DataFrame(X_tr) + X_tt = pd.DataFrame(X_tt) + obj_feat = list(set(list(X_tr.columns)) - set(list(X_tr.select_dtypes(include=[np.number])))) + obj_idx = None + + if obj_feat: + X_tr[obj_feat] = X_tr[obj_feat].astype('str').astype('category') + X_tt[obj_feat] = X_tt[obj_feat].astype('str').astype('category') + obj_idx = np.argwhere(X_tr.columns.isin(obj_feat)).ravel() + + if self._is_tree_based(): + try: + if self.is_cat: + model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_feat) + else: + model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr) + + except Exception as e: + raise ValueError('Please check your X and y variable. The provided ' + 'estimator cannot be fitted to your data.\n' + str(e)) + + perm_imp = permutation_importance(self.estimator, X_tt, y_tt, n_repeats=5, random_state=42, n_jobs=-1) + imp = perm_imp.importances_mean.ravel() + else: + raise ValueError('Not a tree based model') + + return imp + def _get_shuffle(self, seq): - self.random_state.shuffle(seq) + self.random_state_instance.shuffle(seq) return seq def _add_shadows_get_imps(self, X, y, dec_reg): @@ -436,12 +631,19 @@ def _add_shadows_get_imps(self, X, y, dec_reg): # shuffle xSha x_sha = np.apply_along_axis(self._get_shuffle, 0, x_sha) # get importance of the merged matrix - imp = self._get_imp(np.hstack((x_cur, x_sha)), y) + if self.importance == 'shap': + imp = self._get_shap_imp(np.hstack((x_cur, x_sha)), y) + elif self.importance == 'pimp': + imp = self._get_perm_imp(np.hstack((x_cur, x_sha)), y) + else: + imp = self._get_imp(np.hstack((x_cur, x_sha)), y) + # separate importances of real and shadow features imp_sha = imp[x_cur_w:] imp_real = np.zeros(X.shape[1]) imp_real[:] = np.nan imp_real[x_cur_ind] = imp[:x_cur_w] + return imp_real, imp_sha def _assign_hits(self, hit_reg, cur_imp, imp_sha_max): @@ -492,14 +694,12 @@ def _fdrcorrection(self, pvals, alpha=0.05): """ Benjamini/Hochberg p-value correction for false discovery rate, from statsmodels package. Included here for decoupling dependency on statsmodels. - Parameters ---------- pvals : array_like set of p-values of the individual tests. alpha : float error rate - Returns ------- rejected : array, bool @@ -541,7 +741,7 @@ def _check_params(self, X, y): Check hyperparameters as well as X and y before proceeding with fit. 
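Note: the check_X_y call below relaxes scikit-learn's validation (dtype=None, force_all_finite=False) so that encoded categorical columns and any remaining non-finite values are not rejected before reaching estimators such as CatBoost or LightGBM that handle them natively.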
""" # check X and y are consistent len, X is Array and y is column - X, y = check_X_y(X, y) + X, y = check_X_y(X, y, dtype=None, force_all_finite=False) if self.perc <= 0 or self.perc > 100: raise ValueError('The percentile should be between 0 and 100.') @@ -568,5 +768,9 @@ def _print_results(self, dec_reg, _iter, flag): n_tentative = np.sum(self.support_weak_) content = map(str, [n_iter, n_confirmed, n_tentative, n_rejected]) result = '\n'.join([x[0] + '\t' + x[1] for x in zip(cols, content)]) - output = "\n\nBorutaPy finished running.\n\n" + result + if self.importance in ['shap', 'pimp']: + vimp = str(self.importance) + else: + vimp = 'native' + output = "\n\nBorutaPy finished running using " + vimp + " var. imp.\n\n" + result print(output) diff --git a/boruta/test/modif_tests.py b/boruta/test/modif_tests.py new file mode 100644 index 0000000..507db6f --- /dev/null +++ b/boruta/test/modif_tests.py @@ -0,0 +1,154 @@ +%matplotlib inline +from sklearn.datasets import fetch_openml +from sklearn.ensemble import RandomForestClassifier +from sklearn.impute import SimpleImputer +from sklearn.inspection import permutation_importance +from sklearn.compose import ColumnTransformer +from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder +from sklearn.datasets import fetch_openml +from sklearn.inspection import permutation_importance +import catboost +from boruta import BorutaPy as bp +from sklearn.datasets import load_boston, load_diabetes +import numpy as np +import pandas as pd +import matplotlib as mpl +import matplotlib.pyplot as plt +import gc +import shap +# lightgbm and catboost +import lightgbm as lgb +from lightgbm import LGBMRegressor, LGBMClassifier +from xgboost import XGBRegressor, XGBClassifier +from catboost import CatBoostRegressor, CatBoostClassifier +from sys import getsizeof, path +plt.style.use('fivethirtyeight') +rng = np.random.RandomState(seed=42) + + + +# Convert the cat. pred. for boosting +def cat_var(df, col_excl=None, return_cat=True): + """Identify categorical features. 
+ + Parameters + ---------- + df: original df after missing operations + + Returns + ------- + cat_var_df: summary df with col index and col name for all categorical vars + :param return_cat: Boolean, return encoded cols as type 'category' + :param df: pd.DF, the encoded data-frame + :param col_excl: list, colums not to be encoded + """ + + if col_excl is None: + non_num_cols = list(set(list(df.columns)) - set(list(df.select_dtypes(include=[np.number])))) + else: + non_num_cols = list( + set(list(df.columns)) - set(list(df.select_dtypes(include=[np.number]))) - set(col_excl)) + + # cat_var_index = [i for i, x in enumerate(df[col_names].dtypes.tolist()) if isinstance(x, pd.CategoricalDtype) or x == 'object'] + # cat_var_name = [x for i, x in enumerate(col_names) if i in cat_var_index] + + cat_var_index = [df.columns.get_loc(c) for c in non_num_cols if c in df] + + cat_var_df = pd.DataFrame({'cat_ind': cat_var_index, + 'cat_name': non_num_cols}) + + cols_need_mapped = cat_var_df.cat_name.to_list() + inv_mapper = {col: dict(enumerate(df[col].astype('category').cat.categories)) for col in df[cols_need_mapped]} + mapper = {col: {v: k for k, v in inv_mapper[col].items()} for col in df[cols_need_mapped]} + + for c in cols_need_mapped: + df.loc[:, c] = df.loc[:, c].map(mapper[c]).fillna(0).astype(int) + + if return_cat: + df[non_num_cols] = df[non_num_cols].astype('category') + return df, cat_var_df, inv_mapper + + +def get_titanic_data(): + # Fetch Titanic data and add random cat and numbers + # Example taken from https://scikit-learn.org/stable/auto_examples/inspection/ + # plot_permutation_importance.html#sphx-glr-auto-examples-inspection-plot-permutation-importance-py + X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True) + rng = np.random.RandomState(seed=42) + X['random_cat'] = rng.randint(3, size=X.shape[0]) + X['random_cat'] = X['random_cat'].astype('str') + X['random_num'] = rng.randn(X.shape[0]) + + categorical_columns = ['pclass', 'sex', 'embarked', 'random_cat'] + numerical_columns = ['age', 'sibsp', 'parch', 'fare', 'random_num'] + X = X[categorical_columns + numerical_columns] + # Impute + categorical_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='missing'))]) + numerical_pipe = Pipeline([('imputer', SimpleImputer(strategy='mean'))]) + preprocessing = ColumnTransformer([('cat', categorical_pipe, categorical_columns), ('num', numerical_pipe, numerical_columns)]) + X_trans = preprocessing.fit_transform(X) + X = pd.DataFrame(X_trans, columns = X.columns) + # encode + X, cat_var_df, inv_mapper = cat_var(X) + return X, y + +def get_boston_data(): + boston = load_boston() + X = pd.DataFrame(boston.data) + X['random_num1'] = rng.randn(X.shape[0]) + X['random_num2'] = np.random.poisson(1, X.shape[0]) + y = pd.Series(boston.target) + return X, y + +# Testing the changes with rnd cat. and num. 
predictors added to the set of genuine predictors +def testing_estimators(varimp, models, X, y): + for model in models: + print('testing ' + str(type(model)) + ' for var.imp: ' + varimp) + feat_selector = bp(model, n_estimators = 100, verbose= 1, max_iter= 10, random_state=42, weight=None, importance=varimp) + feat_selector.fit(X, y) + print(feat_selector.support_names_) + feat_selector.plot_importance() + gc.enable() + del(feat_selector, model) + gc.collect() + +def testing_clf_all_varimp(X, y): + for varimp in ['shap', 'pimp', 'native']: + models = [catboost.CatBoostClassifier(random_state=42, verbose=0), XGBClassifier(random_state=42, verbose=0), LGBMClassifier(random_state=42, verbose=0)] + testing_estimators(varimp=varimp, models=models, X=X, y=y) + gc.enable() + del(models) + gc.collect() + +def testing_regr_all_varimp(X, y): + for varimp in ['shap', 'pimp', 'native']: + models = [catboost.CatBoostRegressor(random_state=42, verbose=0), XGBRegressor(random_state=42, verbose=0), LGBMRegressor(random_state=42, verbose=0)] + testing_estimators(varimp=varimp, models=models, X=X, y=y) + gc.enable() + del(models) + gc.collect() + + +X, y = get_titanic_data() +X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42) +# lightgbm faster and better than RF +lgb_model = LGBMClassifier(n_jobs= 4) +lgb_model.fit(X_train, y_train) +result = permutation_importance(lgb_model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2) +sorted_idx = result.importances_mean.argsort() +# Plot +fig, ax = plt.subplots() +ax.boxplot(result.importances[sorted_idx].T, vert=False, labels=X_test.columns[sorted_idx]) +ax.set_title("Permutation Importances (test set)") +fig.tight_layout() +plt.show() + +if __name__ == '__main__': + # classification and cat. pred + X, y = get_titanic_data() + testing_clf_all_varimp(X=X, y=y) + # regression + X, y = get_boston_data() + testing_regr_all_varimp(X=X, y=y) \ No newline at end of file From bbea1b745c51eac0031e2b7b854c406480df52d0 Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Sat, 16 May 2020 12:54:56 +0200 Subject: [PATCH 02/19] correcting typos --- boruta/boruta_py.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index c346708..1cb1416 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -542,15 +542,15 @@ def _get_shap_imp(self, X, y): obj_feat = list(set(list(X_tr.columns)) - set(list(X_tr.select_dtypes(include=[np.number])))) obj_idx = None - # if obj_feat: - # X_tr[obj_feat] = X_tr[obj_feat].astype('str').astype('category') - # X_tt[obj_feat] = X_tt[obj_feat].astype('str').astype('category') - # obj_idx = np.argwhere(X_tr.columns.isin(obj_feat)).ravel() + if obj_feat: + X_tr[obj_feat] = X_tr[obj_feat].astype('str').astype('category') + X_tt[obj_feat] = X_tt[obj_feat].astype('str').astype('category') + obj_idx = np.argwhere(X_tr.columns.isin(obj_feat)).ravel() if self._is_tree_based(): try: if self.is_cat: - model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_idx) + model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_feat) else: model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr) @@ -607,7 +607,7 @@ def _get_perm_imp(self, X, y): raise ValueError('Please check your X and y variable. 
The provided ' 'estimator cannot be fitted to your data.\n' + str(e)) - perm_imp = permutation_importance(self.estimator, X_tt, y_tt, n_repeats=5, random_state=42, n_jobs=-1) + perm_imp = permutation_importance(model, X_tt, y_tt, n_repeats=5, random_state=42, n_jobs=-1) imp = perm_imp.importances_mean.ravel() else: raise ValueError('Not a tree based model') From e8062d7cd58c1f2e13fa1c989a1afe7770782f06 Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Mon, 18 May 2020 17:54:23 +0200 Subject: [PATCH 03/19] fix the fig size + tight layout --- boruta/boruta_py.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index 1cb1416..7d943b7 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -267,8 +267,7 @@ def plot_importance(self): medianprops=dict(linestyle='-', linewidth=1.5), whiskerprops=dict(linestyle='-', linewidth=1.5), capprops=dict(linestyle='-', linewidth=1.5), - showfliers=False, grid=True, rot=0, vert=False, patch_artist=True, - figsize=(10, 3 * len(self.support_) / 10) + showfliers=False, grid=True, rot=0, vert=False, patch_artist=True ) box_face_col = ["tab:blue"] * sum(self.support_) + ["#FDD023"] * sum(self.support_weak_) @@ -276,13 +275,8 @@ def plot_importance(self): bp.findobj(matplotlib.patches.Patch)[len(self.support_) - c - 1].set_facecolor(box_face_col[c]) bp.findobj(matplotlib.patches.Patch)[len(self.support_) - c - 1].set_facecolor(box_face_col[c]) - # xt = bp.get_xticks() - # xt = np.round(np.append(xt, self.sha_max), 2) - # xtl = xt.tolist() - # xtl[-1] = "sha. max" - # bp.set_xticks(xt) - # bp.set_xticklabels(xtl) - bp.set_xlim(left=vimp_df.min(skipna=True).min(skipna=True)) + xrange = vimp_df.max(skipna=True).max(skipna=True)-vimp_df.min(skipna=True).min(skipna=True) + bp.set_xlim(left=vimp_df.min(skipna=True).min(skipna=True)-0.10*xrange) custom_lines = [Line2D([0], [0], color="tab:blue", lw=5), Line2D([0], [0], color="#FDD023", lw=5), Line2D([0], [0], color="gray", lw=5), @@ -290,7 +284,9 @@ def plot_importance(self): bp.legend(custom_lines, ['confirmed', 'tentative', 'rejected', 'sha. 
max'], loc="lower right") plt.axvline(x=self.sha_max, linestyle='--', color='gray') fig = bp.get_figure() - fig.suptitle('Boruta importance and selected predictors') + plt.title('Boruta importance and selected predictors') + fig.set_size_inches((10, 2 * np.rint(max(vimp_df.shape) / 10))) + plt.tight_layout() plt.show() def _is_catboost(self): From 6a05d378c6c2a91eb0ea643c255ad685afe4fd87 Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Thu, 28 May 2020 16:16:55 +0200 Subject: [PATCH 04/19] fixing lgb clf shap imp --- boruta/boruta_py.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index 7d943b7..3136d47 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -292,6 +292,9 @@ def plot_importance(self): def _is_catboost(self): self.is_cat = 'catboost' in str(type(self.estimator)) + def _is_lightgbm(self): + self.is_lgb = 'lightgbm' in str(type(self.estimator)) + def _is_tree_based(self): """ checking if the estimator is tree-based (kernel SAP is too slow to be used here, unless using sampling) @@ -556,12 +559,20 @@ def _get_shap_imp(self, X, y): # build the explainer explainer = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent") shap_values = explainer.shap_values(X_tt) - # flatten to 2D if classification + # flatten to 2D if classification and lightgbm if is_classifier(self.estimator): - shap_values = np.array(shap_values) - shap_values = shap_values.reshape(-1, shap_values.shape[-1]) - - shap_imp = np.mean(np.abs(shap_values), axis=0) + if isinstance(shap_values, list): + # for lightgbm clf sklearn, shap return list of arrays + # https://github.com/slundberg/shap/issues/526 + class_inds = range(len(shap_values)) + shap_imp = np.zeros(shap_values[0].shape[1]) + for i, ind in enumerate(class_inds): + shap_imp = np.abs(shap_values[ind]).mean(0) + shap_imp /= len(shap_values) + else: + shap_imp = np.abs(shap_values).mean(0) + else: + shap_imp = np.abs(shap_values).mean(0) else: raise ValueError('Not a tree based model') From 2081d391cdf883742d3ddf439dccd2ec3cc6a63f Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Wed, 8 Jul 2020 11:35:28 +0200 Subject: [PATCH 05/19] changes for lightgbm seed --- boruta/boruta_py.py | 42 +- .../Madalon_Data_Set-checkpoint.ipynb | 405 ++++++++++++++++++ boruta/examples/Madalon_Data_Set.ipynb | 21 +- .../.ipynb_checkpoints/__init__-checkpoint.py | 0 .../modif_tests-checkpoint.py | 154 +++++++ .../unit_tests-checkpoint.py | 57 +++ 6 files changed, 645 insertions(+), 34 deletions(-) create mode 100644 boruta/examples/.ipynb_checkpoints/Madalon_Data_Set-checkpoint.ipynb create mode 100644 boruta/test/.ipynb_checkpoints/__init__-checkpoint.py create mode 100644 boruta/test/.ipynb_checkpoints/modif_tests-checkpoint.py create mode 100644 boruta/test/.ipynb_checkpoints/unit_tests-checkpoint.py diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index 3136d47..eaa79ba 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -1,17 +1,15 @@ from __future__ import print_function, division import numpy as np import scipy as sp -from sklearn.utils import check_random_state, check_X_y -from sklearn.base import TransformerMixin, BaseEstimator, is_classifier, is_regressor -from sklearn.utils import check_random_state, check_X_y -from sklearn.base import TransformerMixin, BaseEstimator, is_regressor, is_classifier -from sklearn.model_selection import RepeatedKFold, train_test_split -from sklearn.inspection import permutation_importance import shap import pandas 
as pd import time import matplotlib import matplotlib.pyplot as plt +from sklearn.utils import check_random_state, check_X_y +from sklearn.base import TransformerMixin, BaseEstimator, is_classifier, is_regressor +from sklearn.model_selection import RepeatedKFold, train_test_split +from sklearn.inspection import permutation_importance from matplotlib.lines import Line2D #!/usr/bin/env python # -*- coding: utf-8 -*- @@ -180,7 +178,9 @@ def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05, importanc self.cat_name = None self.cat_idx = None # Catboost doesn't allow to change random seed after fitting - self.is_cat = False + self.is_cat = 'catboost' in str(type(self.estimator)) + # Random state throws an error with lightgbm + self.is_lgb = 'lightgbm' in str(type(self.estimator)) # plotting self.imp_real_hist = None self.sha_max = None @@ -289,12 +289,6 @@ def plot_importance(self): plt.tight_layout() plt.show() - def _is_catboost(self): - self.is_cat = 'catboost' in str(type(self.estimator)) - - def _is_lightgbm(self): - self.is_lgb = 'lightgbm' in str(type(self.estimator)) - def _is_tree_based(self): """ checking if the estimator is tree-based (kernel SAP is too slow to be used here, unless using sampling) @@ -313,7 +307,6 @@ def _validate_pandas_input(self, arg): ) def _fit(self, X_raw, y): - self._is_catboost() start_time = time.time() # basic cat features encoding @@ -329,7 +322,7 @@ def _fit(self, X_raw, y): # a way without loop but need to re-do astype Cat = X_raw[cat_feat].stack().astype('category').cat.codes.unstack() - if self.is_cat is False: + if not self.is_cat: if cat_feat: X = pd.concat([X_raw[X_raw.columns.difference(cat_feat)], Cat], axis=1) else: @@ -353,7 +346,7 @@ def _fit(self, X_raw, y): if not isinstance(y, np.ndarray): y = self._validate_pandas_input(y) - self.random_state_instance = check_random_state(self.random_state) + self.random_state = check_random_state(self.random_state) # setup variables for Boruta n_sample, n_feat = X.shape _iter = 1 @@ -384,8 +377,13 @@ def _fit(self, X_raw, y): # make sure we start with a new tree in each iteration # Catboost doesn't allow to change random seed after fitting - if self.is_cat is False: - self.estimator.set_params(random_state=self.random_state) + if not self.is_cat: + if self._is_lightgbm: + # https://github.com/scikit-learn-contrib/boruta_py/pull/78 + self.estimator.set_params(random_state=self.random_state.randint(0, 10000)) + else: + self.estimator.set_params(random_state=self.random_state) + # add shadow attributes, shuffle them and train estimator, get imps cur_imp = self._add_shadows_get_imps(X, y, dec_reg) @@ -522,7 +520,7 @@ def _get_imp(self, X, y): return imp def _get_shap_imp(self, X, y): - + # SHAP and permutation importances must be computed on unseen data if self.weight is not None: w = self.weight if is_regressor(self.estimator): @@ -562,12 +560,12 @@ def _get_shap_imp(self, X, y): # flatten to 2D if classification and lightgbm if is_classifier(self.estimator): if isinstance(shap_values, list): - # for lightgbm clf sklearn, shap return list of arrays + # for lightgbm clf sklearn api, shap returns list of arrays # https://github.com/slundberg/shap/issues/526 class_inds = range(len(shap_values)) shap_imp = np.zeros(shap_values[0].shape[1]) for i, ind in enumerate(class_inds): - shap_imp = np.abs(shap_values[ind]).mean(0) + shap_imp += np.abs(shap_values[ind]).mean(0) shap_imp /= len(shap_values) else: shap_imp = np.abs(shap_values).mean(0) @@ -622,7 +620,7 @@ def _get_perm_imp(self, X, y): return 
imp def _get_shuffle(self, seq): - self.random_state_instance.shuffle(seq) + self.random_state.shuffle(seq) return seq def _add_shadows_get_imps(self, X, y, dec_reg): diff --git a/boruta/examples/.ipynb_checkpoints/Madalon_Data_Set-checkpoint.ipynb b/boruta/examples/.ipynb_checkpoints/Madalon_Data_Set-checkpoint.ipynb new file mode 100644 index 0000000..45fb14a --- /dev/null +++ b/boruta/examples/.ipynb_checkpoints/Madalon_Data_Set-checkpoint.ipynb @@ -0,0 +1,405 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using Boruta on the Madalon Data Set\n", + "Author: [Mike Bernico](mike.bernico@gmail.com)\n", + "\n", + "This example demonstrates using Boruta to find all relevant features in the Madalon dataset, which is an artificial dataset used in NIPS2003 and cited in the [Boruta paper](https://www.jstatsoft.org/article/view/v036i11/v36i11.pdf)\n", + "\n", + "This dataset has 2000 observations and 500 features. We will use Boruta to identify the features that are relevant to the classification task.\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Installation\n", + "#!pip install boruta" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.datasets import load_iris\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from boruta import BorutaPy" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [], + "source": [ + "def load_data():\n", + " # URLS for dataset via UCI\n", + " train_data_url='https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.data'\n", + " train_label_url='https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.labels'\n", + "\n", + " X_data = pd.read_csv(train_data_url, sep=\" \", header=None)\n", + " y_data = pd.read_csv(train_label_url, sep=\" \", header=None)\n", + " data = X_data.loc[:, :499]\n", + " data['target'] = y_data[0]\n", + " return data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "data = load_data()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...491492493494495496497498499target
0485477537479452471491476475473...481477485511485481479475496-1
1483458460487587475526479485469...478487338513486483492510517-1
2487542499468448471442478480477...481492650506501480489499498-1
3480491510485495472417474502476...4804745724544694754824944611
4484502528489466481402478487468...4794524354865084815044955111
\n", + "

5 rows × 501 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 9 ... 491 492 493 \\\n", + "0 485 477 537 479 452 471 491 476 475 473 ... 481 477 485 \n", + "1 483 458 460 487 587 475 526 479 485 469 ... 478 487 338 \n", + "2 487 542 499 468 448 471 442 478 480 477 ... 481 492 650 \n", + "3 480 491 510 485 495 472 417 474 502 476 ... 480 474 572 \n", + "4 484 502 528 489 466 481 402 478 487 468 ... 479 452 435 \n", + "\n", + " 494 495 496 497 498 499 target \n", + "0 511 485 481 479 475 496 -1 \n", + "1 513 486 483 492 510 517 -1 \n", + "2 506 501 480 489 499 498 -1 \n", + "3 454 469 475 482 494 461 1 \n", + "4 486 508 481 504 495 511 1 \n", + "\n", + "[5 rows x 501 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "y = data.pop('target')\n", + "X = data.copy().values" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Boruta conforms to the sklearn api and can be used in a Pipeline as well as on it's own. Here we will demonstrate stand alone operation.\n", + "\n", + "First we will instantiate an estimator that Boruta will use. Then we will instantiate a Boruta Object." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "rf = RandomForestClassifier(n_jobs=-1, class_weight=None, max_depth=7, random_state=0)\n", + "# Define Boruta feature selection method\n", + "feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once built, we can use this object to identify the relevant features in our dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "feat_selector.fit(X, y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Boruta has confirmed only a few features as useful. When our run ended, Boruta was undecided on 2 features. '\n", + "\n", + "We can interrogate .support_ to understand which features were selected. .support_ returns an array of booleans that we can use to slice our feature matrix to include only relevant columns. Of course, .transform can also be used, as expected in the scikit API." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check selected features\n", + "print(feat_selector.support_)\n", + "# Select the chosen features from our dataframe.\n", + "selected = X[:, feat_selector.support_]\n", + "print (\"\")\n", + "print (\"Selected Feature Matrix Shape\")\n", + "print (selected.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also interrogate the ranking of the unselected features with .ranking_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "feat_selector.ranking_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/boruta/examples/Madalon_Data_Set.ipynb b/boruta/examples/Madalon_Data_Set.ipynb index 435eba0..45fb14a 100644 --- a/boruta/examples/Madalon_Data_Set.ipynb +++ b/boruta/examples/Madalon_Data_Set.ipynb @@ -18,9 +18,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# Installation\n", @@ -43,7 +41,10 @@ "cell_type": "code", "execution_count": 3, "metadata": { - "collapsed": true + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } }, "outputs": [], "source": [ @@ -279,9 +280,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "rf = RandomForestClassifier(n_jobs=-1, class_weight=None, max_depth=7, random_state=0)\n", @@ -348,9 +347,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [] } @@ -371,7 +368,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.7.7" }, "varInspector": { "cols": { @@ -404,5 +401,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } diff --git a/boruta/test/.ipynb_checkpoints/__init__-checkpoint.py b/boruta/test/.ipynb_checkpoints/__init__-checkpoint.py new file mode 100644 index 0000000..e69de29 diff --git a/boruta/test/.ipynb_checkpoints/modif_tests-checkpoint.py b/boruta/test/.ipynb_checkpoints/modif_tests-checkpoint.py new file mode 100644 index 0000000..507db6f --- /dev/null +++ b/boruta/test/.ipynb_checkpoints/modif_tests-checkpoint.py @@ -0,0 +1,154 @@ +%matplotlib inline +from sklearn.datasets import fetch_openml +from sklearn.ensemble import RandomForestClassifier +from 
sklearn.impute import SimpleImputer +from sklearn.inspection import permutation_importance +from sklearn.compose import ColumnTransformer +from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder +from sklearn.datasets import fetch_openml +from sklearn.inspection import permutation_importance +import catboost +from boruta import BorutaPy as bp +from sklearn.datasets import load_boston, load_diabetes +import numpy as np +import pandas as pd +import matplotlib as mpl +import matplotlib.pyplot as plt +import gc +import shap +# lightgbm and catboost +import lightgbm as lgb +from lightgbm import LGBMRegressor, LGBMClassifier +from xgboost import XGBRegressor, XGBClassifier +from catboost import CatBoostRegressor, CatBoostClassifier +from sys import getsizeof, path +plt.style.use('fivethirtyeight') +rng = np.random.RandomState(seed=42) + + + +# Convert the cat. pred. for boosting +def cat_var(df, col_excl=None, return_cat=True): + """Identify categorical features. + + Parameters + ---------- + df: original df after missing operations + + Returns + ------- + cat_var_df: summary df with col index and col name for all categorical vars + :param return_cat: Boolean, return encoded cols as type 'category' + :param df: pd.DF, the encoded data-frame + :param col_excl: list, colums not to be encoded + """ + + if col_excl is None: + non_num_cols = list(set(list(df.columns)) - set(list(df.select_dtypes(include=[np.number])))) + else: + non_num_cols = list( + set(list(df.columns)) - set(list(df.select_dtypes(include=[np.number]))) - set(col_excl)) + + # cat_var_index = [i for i, x in enumerate(df[col_names].dtypes.tolist()) if isinstance(x, pd.CategoricalDtype) or x == 'object'] + # cat_var_name = [x for i, x in enumerate(col_names) if i in cat_var_index] + + cat_var_index = [df.columns.get_loc(c) for c in non_num_cols if c in df] + + cat_var_df = pd.DataFrame({'cat_ind': cat_var_index, + 'cat_name': non_num_cols}) + + cols_need_mapped = cat_var_df.cat_name.to_list() + inv_mapper = {col: dict(enumerate(df[col].astype('category').cat.categories)) for col in df[cols_need_mapped]} + mapper = {col: {v: k for k, v in inv_mapper[col].items()} for col in df[cols_need_mapped]} + + for c in cols_need_mapped: + df.loc[:, c] = df.loc[:, c].map(mapper[c]).fillna(0).astype(int) + + if return_cat: + df[non_num_cols] = df[non_num_cols].astype('category') + return df, cat_var_df, inv_mapper + + +def get_titanic_data(): + # Fetch Titanic data and add random cat and numbers + # Example taken from https://scikit-learn.org/stable/auto_examples/inspection/ + # plot_permutation_importance.html#sphx-glr-auto-examples-inspection-plot-permutation-importance-py + X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True) + rng = np.random.RandomState(seed=42) + X['random_cat'] = rng.randint(3, size=X.shape[0]) + X['random_cat'] = X['random_cat'].astype('str') + X['random_num'] = rng.randn(X.shape[0]) + + categorical_columns = ['pclass', 'sex', 'embarked', 'random_cat'] + numerical_columns = ['age', 'sibsp', 'parch', 'fare', 'random_num'] + X = X[categorical_columns + numerical_columns] + # Impute + categorical_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='missing'))]) + numerical_pipe = Pipeline([('imputer', SimpleImputer(strategy='mean'))]) + preprocessing = ColumnTransformer([('cat', categorical_pipe, categorical_columns), ('num', numerical_pipe, numerical_columns)]) + X_trans = 
preprocessing.fit_transform(X) + X = pd.DataFrame(X_trans, columns = X.columns) + # encode + X, cat_var_df, inv_mapper = cat_var(X) + return X, y + +def get_boston_data(): + boston = load_boston() + X = pd.DataFrame(boston.data) + X['random_num1'] = rng.randn(X.shape[0]) + X['random_num2'] = np.random.poisson(1, X.shape[0]) + y = pd.Series(boston.target) + return X, y + +# Testing the changes with rnd cat. and num. predictors added to the set of genuine predictors +def testing_estimators(varimp, models, X, y): + for model in models: + print('testing ' + str(type(model)) + ' for var.imp: ' + varimp) + feat_selector = bp(model, n_estimators = 100, verbose= 1, max_iter= 10, random_state=42, weight=None, importance=varimp) + feat_selector.fit(X, y) + print(feat_selector.support_names_) + feat_selector.plot_importance() + gc.enable() + del(feat_selector, model) + gc.collect() + +def testing_clf_all_varimp(X, y): + for varimp in ['shap', 'pimp', 'native']: + models = [catboost.CatBoostClassifier(random_state=42, verbose=0), XGBClassifier(random_state=42, verbose=0), LGBMClassifier(random_state=42, verbose=0)] + testing_estimators(varimp=varimp, models=models, X=X, y=y) + gc.enable() + del(models) + gc.collect() + +def testing_regr_all_varimp(X, y): + for varimp in ['shap', 'pimp', 'native']: + models = [catboost.CatBoostRegressor(random_state=42, verbose=0), XGBRegressor(random_state=42, verbose=0), LGBMRegressor(random_state=42, verbose=0)] + testing_estimators(varimp=varimp, models=models, X=X, y=y) + gc.enable() + del(models) + gc.collect() + + +X, y = get_titanic_data() +X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42) +# lightgbm faster and better than RF +lgb_model = LGBMClassifier(n_jobs= 4) +lgb_model.fit(X_train, y_train) +result = permutation_importance(lgb_model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2) +sorted_idx = result.importances_mean.argsort() +# Plot +fig, ax = plt.subplots() +ax.boxplot(result.importances[sorted_idx].T, vert=False, labels=X_test.columns[sorted_idx]) +ax.set_title("Permutation Importances (test set)") +fig.tight_layout() +plt.show() + +if __name__ == '__main__': + # classification and cat. pred + X, y = get_titanic_data() + testing_clf_all_varimp(X=X, y=y) + # regression + X, y = get_boston_data() + testing_regr_all_varimp(X=X, y=y) \ No newline at end of file diff --git a/boruta/test/.ipynb_checkpoints/unit_tests-checkpoint.py b/boruta/test/.ipynb_checkpoints/unit_tests-checkpoint.py new file mode 100644 index 0000000..5d5ce9f --- /dev/null +++ b/boruta/test/.ipynb_checkpoints/unit_tests-checkpoint.py @@ -0,0 +1,57 @@ +import unittest +from boruta import BorutaPy +import pandas as pd +from sklearn.ensemble import RandomForestClassifier +import numpy as np + + +class BorutaTestCases(unittest.TestCase): + + def test_get_tree_num(self): + rfc = RandomForestClassifier(max_depth=10) + bt = BorutaPy(rfc) + self.assertEqual(bt._get_tree_num(10), 44, "Tree Est. Math Fail") + self.assertEqual(bt._get_tree_num(100), 141, "Tree Est. 
Math Fail") + + def test_if_boruta_extracts_relevant_features(self): + np.random.seed(42) + y = np.random.binomial(1, 0.5, 1000) + X = np.zeros((1000, 10)) + + z = y - np.random.binomial(1, 0.1, 1000) + np.random.binomial(1, 0.1, 1000) + z[z == -1] = 0 + z[z == 2] = 1 + + # 5 relevant features + X[:, 0] = z + X[:, 1] = y * np.abs(np.random.normal(0, 1, 1000)) + np.random.normal(0, 0.1, 1000) + X[:, 2] = y + np.random.normal(0, 1, 1000) + X[:, 3] = y ** 2 + np.random.normal(0, 1, 1000) + X[:, 4] = np.sqrt(y) + np.random.binomial(2, 0.1, 1000) + + # 5 irrelevant features + X[:, 5] = np.random.normal(0, 1, 1000) + X[:, 6] = np.random.poisson(1, 1000) + X[:, 7] = np.random.binomial(1, 0.3, 1000) + X[:, 8] = np.random.normal(0, 1, 1000) + X[:, 9] = np.random.poisson(1, 1000) + + rfc = RandomForestClassifier() + bt = BorutaPy(rfc) + bt.fit(X, y) + + # make sure that only all the relevant features are returned + self.assertListEqual(list(range(5)), list(np.where(bt.support_)[0])) + + # test if this works as expected for dataframe input + X_df, y_df = pd.DataFrame(X), pd.Series(y) + bt.fit(X_df, y_df) + self.assertListEqual(list(range(5)), list(np.where(bt.support_)[0])) + + # check it dataframe is returned when return_df=True + self.assertIsInstance(bt.transform(X_df, return_df=True), pd.DataFrame) + +if __name__ == '__main__': + unittest.main() + + From a6c4b8ad85166aee10f07e04b6facdac2277257c Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Wed, 8 Jul 2020 11:43:51 +0200 Subject: [PATCH 06/19] naming of var consistent with master --- boruta/boruta_py.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index eaa79ba..49b2ce7 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -1,3 +1,13 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Author: Daniel Homola + +Original code and method by: Miron B Kursa, https://m2.icm.edu.pl/boruta/ + +License: BSD 3 clause +""" + from __future__ import print_function, division import numpy as np import scipy as sp @@ -11,16 +21,6 @@ from sklearn.model_selection import RepeatedKFold, train_test_split from sklearn.inspection import permutation_importance from matplotlib.lines import Line2D -#!/usr/bin/env python -# -*- coding: utf-8 -*- -""" -Author: Daniel Homola - -Original code and method by: Miron B Kursa, https://m2.icm.edu.pl/boruta/ - -License: BSD 3 clause -""" - class BorutaPy(BaseEstimator, TransformerMixin): @@ -171,20 +171,20 @@ def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05, importanc self.two_step = two_step self.max_iter = max_iter self.random_state = random_state - self.random_state_instance = None self.verbose = verbose self.weight = weight self.importance = importance self.cat_name = None self.cat_idx = None # Catboost doesn't allow to change random seed after fitting - self.is_cat = 'catboost' in str(type(self.estimator)) + self._is_catboost = 'catboost' in str(type(self.estimator)) # Random state throws an error with lightgbm - self.is_lgb = 'lightgbm' in str(type(self.estimator)) + self._is_lightgbm = 'lightgbm' in str(type(self.estimator)) # plotting self.imp_real_hist = None self.sha_max = None self.col_names = None + self.__version__ = '0.3' def fit(self, X, y): """ @@ -322,7 +322,7 @@ def _fit(self, X_raw, y): # a way without loop but need to re-do astype Cat = X_raw[cat_feat].stack().astype('category').cat.codes.unstack() - if not self.is_cat: + if not self._is_catboost: if cat_feat: X = 
pd.concat([X_raw[X_raw.columns.difference(cat_feat)], Cat], axis=1) else: @@ -377,7 +377,7 @@ def _fit(self, X_raw, y): # make sure we start with a new tree in each iteration # Catboost doesn't allow to change random seed after fitting - if not self.is_cat: + if not self._is_catboost: if self._is_lightgbm: # https://github.com/scikit-learn-contrib/boruta_py/pull/78 self.estimator.set_params(random_state=self.random_state.randint(0, 10000)) @@ -499,7 +499,7 @@ def _get_tree_num(self, n_feat): def _get_imp(self, X, y): try: - if self.is_cat: + if self._is_catboost: X = pd.DataFrame(X) obj_feat = X.dtypes.loc[(X.dtypes == 'object') | (X.dtypes == 'category')].index.tolist() if obj_feat: @@ -546,7 +546,7 @@ def _get_shap_imp(self, X, y): if self._is_tree_based(): try: - if self.is_cat: + if self._is_catboost: model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_feat) else: model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr) @@ -603,7 +603,7 @@ def _get_perm_imp(self, X, y): if self._is_tree_based(): try: - if self.is_cat: + if self._is_catboost: model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_feat) else: model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr) From e278764fc4c4a86eaec1bbeee841e559c51cad23 Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Wed, 8 Jul 2020 11:52:16 +0200 Subject: [PATCH 07/19] splitting import into different lines --- boruta/boruta_py.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index 49b2ce7..ab773e5 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -17,7 +17,8 @@ import matplotlib import matplotlib.pyplot as plt from sklearn.utils import check_random_state, check_X_y -from sklearn.base import TransformerMixin, BaseEstimator, is_classifier, is_regressor +from sklearn.base import TransformerMixin, BaseEstimator +from sklearn.base import is_classifier, is_regressor from sklearn.model_selection import RepeatedKFold, train_test_split from sklearn.inspection import permutation_importance from matplotlib.lines import Line2D @@ -176,15 +177,14 @@ def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05, importanc self.importance = importance self.cat_name = None self.cat_idx = None - # Catboost doesn't allow to change random seed after fitting - self._is_catboost = 'catboost' in str(type(self.estimator)) - # Random state throws an error with lightgbm - self._is_lightgbm = 'lightgbm' in str(type(self.estimator)) # plotting self.imp_real_hist = None self.sha_max = None self.col_names = None - self.__version__ = '0.3' + # Catboost doesn't allow to change random seed after fitting + self._is_catboost = 'catboost' in str(type(self.estimator)) + # Random state throws an error with lightgbm + self._is_lightgbm = 'lightgbm' in str(type(self.estimator)) def fit(self, X, y): """ From 132e265677f456cb92956c800c9b637c27a8a052 Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Wed, 8 Jul 2020 11:54:32 +0200 Subject: [PATCH 08/19] import warnings --- boruta/boruta_py.py | 1 + 1 file changed, 1 insertion(+) diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index ab773e5..6a1de8d 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -22,6 +22,7 @@ from sklearn.model_selection import RepeatedKFold, train_test_split from sklearn.inspection import permutation_importance from matplotlib.lines import Line2D +import warnings class BorutaPy(BaseEstimator, TransformerMixin): From 504abf50b63ba7fb2e5429124043018b53cf09ef 
Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Fri, 28 Aug 2020 15:05:50 +0200 Subject: [PATCH 09/19] move weight in fit method --- boruta/boruta_py.py | 572 +++++++++++++++++++++++++------------ boruta/test/modif_tests.py | 37 +-- 2 files changed, 412 insertions(+), 197 deletions(-) diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index 6a1de8d..e1e14f7 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -1,30 +1,3 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -""" -Author: Daniel Homola - -Original code and method by: Miron B Kursa, https://m2.icm.edu.pl/boruta/ - -License: BSD 3 clause -""" - -from __future__ import print_function, division -import numpy as np -import scipy as sp -import shap -import pandas as pd -import time -import matplotlib -import matplotlib.pyplot as plt -from sklearn.utils import check_random_state, check_X_y -from sklearn.base import TransformerMixin, BaseEstimator -from sklearn.base import is_classifier, is_regressor -from sklearn.model_selection import RepeatedKFold, train_test_split -from sklearn.inspection import permutation_importance -from matplotlib.lines import Line2D -import warnings - - class BorutaPy(BaseEstimator, TransformerMixin): """ Improved Python implementation of the Boruta R package. @@ -51,8 +24,8 @@ class BorutaPy(BaseEstimator, TransformerMixin): crucial parameter. For more info, please read about the perc parameter. - Automatic tree number: Setting the n_estimator to 'auto' will calculate the number of trees - in each itartion based on the number of features under investigation. - This way more trees are used when the training data has many feautres + in each iteration based on the number of features under investigation. + This way more trees are used when the training data has many features and less when most of the features have been rejected. - Ranking of features: After fitting BorutaPy it provides the user with ranking of features. @@ -73,6 +46,32 @@ class BorutaPy(BaseEstimator, TransformerMixin): care about all factors that contribute to it, not just the bluntest signs of it in context of your methodology (yes, minimal optimal set of features by definition depends on your classifier choice). + + + Summary + ------- + - Loop over n_iter or until dec_reg == 0 + - add shadows + o find features that are tentative + o make sure that at least 5 columns are added + o shuffle shadows + o get feature importance + * fit the estimator + * extract feature importance (native, shap or permutation) + * return feature importance + o separate the importance of shadows and real + + - Calculate the maximum shadow importance and append to the previous run + - Assign hits using the imp_sha_max of this run + o find all the feat imp > imp_sha_max + o tag them as hits + o add +1 to the previous tag vector + - Perform a test + o select non rejected features yet + o get a binomial p-values (nbr of times the feat has been tagged as important on the n_iter done so far) + o reject or not according the (corrected) p-val + + Parameters ---------- estimator : object @@ -96,7 +95,9 @@ class BorutaPy(BaseEstimator, TransformerMixin): correction steps. importance : str, default = 'shap' The kind of variable importance used to compare and discriminate original - vs shadow predictors. + vs shadow predictors. Note that the builtin tree importance (gini/impurity based + importance) is biased towards numerical and large cardinality predictors, even + if they are random. Shapley values and permutation imp. are robust w.r.t those predictors. 
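For illustration, a minimal sketch of picking the importance backend at construction time (the LightGBM estimator and the in-memory X, y are assumptions borrowed from the test script; 'shap', 'pimp' and 'native' are the accepted values):

    from lightgbm import LGBMClassifier
    from boruta import BorutaPy

    # 'shap' (default), 'pimp' (scikit-learn permutation importance) or 'native' (impurity based)
    feat_selector = BorutaPy(LGBMClassifier(max_depth=5), n_estimators=100,
                             importance='shap', max_iter=10, random_state=42, verbose=1)
    feat_selector.fit(X, y)
    print(feat_selector.support_names_)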
two_step : Boolean, default = True If you want to use the original implementation of Boruta with Bonferroni correction only set this to False. @@ -107,12 +108,13 @@ class BorutaPy(BaseEstimator, TransformerMixin): If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - weight : pd.Series verbose : int, default=0 Controls verbosity of output: - 0: no output - 1: displays iteration number - 2: which features have been selected already + + Attributes ---------- n_features_ : int @@ -127,6 +129,20 @@ class BorutaPy(BaseEstimator, TransformerMixin): ranking position of the i-th feature. Selected (i.e., estimated best) features are assigned rank 1 and tentative features are assigned rank 2. + cat_name : list of str + the name of the categorical columns + cat_idx : list of int + the index of the categorical columns + imp_real_hist : array + array of the historical feature importance of the real predictors + sha_max : float + the maximum feature importance of the shadow predictors + col_names : list of str + the names of the real predictors + tag_df : dataframe + the df with the details (accepted or rejected) of the feature selection + + Examples -------- @@ -164,7 +180,7 @@ class BorutaPy(BaseEstimator, TransformerMixin): Journal of Statistical Software, Vol. 36, Issue 11, Sep 2010 """ - def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05, importance='shap', + def __init__(self, estimator, n_estimators=1000, perc=90, alpha=0.05, importance='shap', two_step=True, max_iter=100, random_state=None, weight=None, verbose=0): self.estimator = estimator self.n_estimators = n_estimators @@ -173,21 +189,22 @@ def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05, importanc self.two_step = two_step self.max_iter = max_iter self.random_state = random_state + self.random_state_instance = None self.verbose = verbose self.weight = weight self.importance = importance self.cat_name = None self.cat_idx = None + # Catboost doesn't allow to change random seed after fitting + self.is_cat = is_catboost(estimator) + self.is_lgb = is_lightgbm(estimator) # plotting self.imp_real_hist = None self.sha_max = None self.col_names = None - # Catboost doesn't allow to change random seed after fitting - self._is_catboost = 'catboost' in str(type(self.estimator)) - # Random state throws an error with lightgbm - self._is_lightgbm = 'lightgbm' in str(type(self.estimator)) + self.tag_df = None - def fit(self, X, y): + def fit(self, X, y, sample_weight=None): """ Fits the Boruta feature selection with the provided estimator. Parameters @@ -196,12 +213,15 @@ def fit(self, X, y): The training input samples. y : array-like, shape = [n_samples] The target values. + sample_weight : array-like, shape = [n_samples], default=None + Individual weights for each sample """ self.imp_real_hist = np.empty((0, X.shape[1]), float) if isinstance(X, pd.DataFrame) is not True: X = pd.DataFrame(X) self.col_names = X.columns.to_list() - return self._fit(X, y) + + return self._fit(X, y, sample_weight=sample_weight) def transform(self, X, weak=False, return_df=False): """ @@ -225,15 +245,18 @@ def transform(self, X, weak=False, return_df=False): return self._transform(X, weak, return_df) - def fit_transform(self, X, y, weak=False, return_df=False): + def fit_transform(self, X, y, sample_weight=None, weak=False, return_df=False): """ Fits Boruta, then reduces the input X to the selected features. 
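For example, a minimal sketch of the sample_weight argument added to fit and fit_transform here (rf is the random forest from the Examples section; the uniform sample_weight vector is a dummy, as in the test script):

    import numpy as np

    sample_weight = np.random.uniform(0, 1, len(y))   # dummy per-sample weights
    feat_selector = BorutaPy(rf, n_estimators='auto', random_state=1)
    X_filtered = feat_selector.fit_transform(X, y, sample_weight=sample_weight)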
+ Parameters ---------- X : array-like, shape = [n_samples, n_features] The training input samples. y : array-like, shape = [n_samples] The target values. + sample_weight : array-like, shape = [n_samples], default=None + Individual weights for each sample weak: boolean, default = False If set to true, the tentative features are also used to reduce X. return_df : boolean, default = False @@ -244,18 +267,42 @@ def fit_transform(self, X, y, weak=False, return_df=False): X : array-like, shape = [n_samples, n_features_] The input matrix X's columns are reduced to the features which were selected by Boruta. + + Summary + ------- + - Loop over n_iter or until dec_reg == 0 + - add shadows + o find features that are tentative + o make sure that at least 5 columns are added + o shuffle shadows + o get feature importance + * fit the estimator + * extract feature importance (native, shap or permutation) + * return feature importance + o separate the importance of shadows and real + + - Calculate the maximum shadow importance and append to the previous run + - Assign hits using the imp_sha_max of this run + o find all the feat imp > imp_sha_max + o tag them as hits + o add +1 to the previous tag vector + - Perform a test + o select non rejected features yet + o get a binomial p-values (nbr of times the feat has been tagged as important on the n_iter done so far) + o reject or not according the (corrected) p-val """ - self._fit(X, y) + self._fit(X, y, sample_weight=sample_weight) return self._transform(X, weak, return_df) - def plot_importance(self): + def plot_importance(self, n_feat_per_inch=5): """ Boxplot of the variable importance, ordered by magnitude The max shadow variable importance illustrated by the dashed line. Requires to apply the fit method first. 
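A short usage sketch (assumes X, y and a BorutaPy instance as above; n_feat_per_inch only scales the figure height):

    feat_selector.fit(X, y)
    feat_selector.plot_importance(n_feat_per_inch=3)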
:return: boxplot """ + set_my_plt_style() if self.imp_real_hist is None: raise ValueError("Use the fit method first to compute the var.imp") @@ -268,38 +315,33 @@ def plot_importance(self): medianprops=dict(linestyle='-', linewidth=1.5), whiskerprops=dict(linestyle='-', linewidth=1.5), capprops=dict(linestyle='-', linewidth=1.5), - showfliers=False, grid=True, rot=0, vert=False, patch_artist=True + showfliers=False, grid=True, rot=0, vert=False, patch_artist=True, + figsize=(10, vimp_df.shape[1] / n_feat_per_inch), fontsize=9 ) - - box_face_col = ["tab:blue"] * sum(self.support_) + ["#FDD023"] * sum(self.support_weak_) + blue_color = "#2590fa" + yellow_color = "#f0be00" + box_face_col = [blue_color] * sum(self.support_) + [yellow_color] * sum(self.support_weak_) for c in range(len(box_face_col)): bp.findobj(matplotlib.patches.Patch)[len(self.support_) - c - 1].set_facecolor(box_face_col[c]) - bp.findobj(matplotlib.patches.Patch)[len(self.support_) - c - 1].set_facecolor(box_face_col[c]) + bp.findobj(matplotlib.patches.Patch)[len(self.support_) - c - 1].set_color(box_face_col[c]) + + xrange = vimp_df.max(skipna=True).max(skipna=True) - vimp_df.min(skipna=True).min(skipna=True) + bp.set_xlim(left=vimp_df.min(skipna=True).min(skipna=True) - 0.10 * xrange) - xrange = vimp_df.max(skipna=True).max(skipna=True)-vimp_df.min(skipna=True).min(skipna=True) - bp.set_xlim(left=vimp_df.min(skipna=True).min(skipna=True)-0.10*xrange) - custom_lines = [Line2D([0], [0], color="tab:blue", lw=5), - Line2D([0], [0], color="#FDD023", lw=5), + custom_lines = [Line2D([0], [0], color=blue_color, lw=5), + Line2D([0], [0], color=yellow_color, lw=5), Line2D([0], [0], color="gray", lw=5), Line2D([0], [0], linestyle='--', color="gray", lw=2)] bp.legend(custom_lines, ['confirmed', 'tentative', 'rejected', 'sha. max'], loc="lower right") plt.axvline(x=self.sha_max, linestyle='--', color='gray') fig = bp.get_figure() plt.title('Boruta importance and selected predictors') - fig.set_size_inches((10, 2 * np.rint(max(vimp_df.shape) / 10))) - plt.tight_layout() + fig.set_size_inches((10, 1.5 * np.rint(max(vimp_df.shape) / 10))) + # plt.tight_layout() plt.show() - def _is_tree_based(self): - """ - checking if the estimator is tree-based (kernel SAP is too slow to be used here, unless using sampling) - :return: - """ - tree_based_models = ['lightgbm', 'xgboost', 'catboost', '_forest'] - condition = any(i in str(type(self.estimator)) for i in tree_based_models) - return condition - - def _validate_pandas_input(self, arg): + @staticmethod + def _validate_pandas_input(arg): try: return arg.values except AttributeError: @@ -307,7 +349,45 @@ def _validate_pandas_input(self, arg): "input needs to be a numpy array or pandas data frame." ) - def _fit(self, X_raw, y): + def _fit(self, X_raw, y, sample_weight=None): + """ + Private method. 
+ Chaining: + - Loop over n_iter or until dec_reg == 0 + - add shadows + o find features that are tentative + o make sure that at least 5 columns are added + o shuffle shadows + o get feature importance + * fit the estimator + * extract feature importance (native, shap or permutation) + * return feature importance + o separate the importance of shadows and real + + - Calculate the maximum shadow importance and append to the previous run + - Assign hits using the imp_sha_max of this run + o find all the feat imp > imp_sha_max + o tag them as hits + o add +1 to the previous tag vector + - Perform a test + o select non rejected features yet + o get a binomial p-values (nbr of times the feat has been tagged as important on the n_iter done so far) + o reject or not according the (corrected) p-val + + Parameters + ---------- + X_raw : array-like, shape = [n_samples, n_features] + The training input samples. + y : array-like, shape = [n_samples] + The target values. + sample_weight : array-like, shape = [n_samples], default=None + Individual weights for each sample + + :return: + self : object + Nothing but attributes + """ + # self.is_cat = is_catboost(self.estimator) start_time = time.time() # basic cat features encoding @@ -323,7 +403,7 @@ def _fit(self, X_raw, y): # a way without loop but need to re-do astype Cat = X_raw[cat_feat].stack().astype('category').cat.codes.unstack() - if not self._is_catboost: + if self.is_cat is False: if cat_feat: X = pd.concat([X_raw[X_raw.columns.difference(cat_feat)], Cat], axis=1) else: @@ -347,6 +427,10 @@ def _fit(self, X_raw, y): if not isinstance(y, np.ndarray): y = self._validate_pandas_input(y) + if sample_weight is not None: + if not isinstance(sample_weight, np.ndarray): + sample_weight = self._validate_pandas_input(sample_weight) + self.random_state = check_random_state(self.random_state) # setup variables for Boruta n_sample, n_feat = X.shape @@ -378,16 +462,14 @@ def _fit(self, X_raw, y): # make sure we start with a new tree in each iteration # Catboost doesn't allow to change random seed after fitting - if not self._is_catboost: - if self._is_lightgbm: - # https://github.com/scikit-learn-contrib/boruta_py/pull/78 + if self.is_cat is False: + if self.is_lgb: self.estimator.set_params(random_state=self.random_state.randint(0, 10000)) else: self.estimator.set_params(random_state=self.random_state) - # add shadow attributes, shuffle them and train estimator, get imps - cur_imp = self._add_shadows_get_imps(X, y, dec_reg) + cur_imp = self._add_shadows_get_imps(X, y, sample_weight, dec_reg) # get the threshold of shadow importances we will use for rejection imp_sha_max = np.percentile(cur_imp[1], self.perc) @@ -428,10 +510,23 @@ def _fit(self, X_raw, y): # for plotting self.imp_real_hist = imp_history self.sha_max = imp_sha_max + + if isinstance(X_raw, np.ndarray): + X_raw = pd.DataFrame(X_raw) + if isinstance(X_raw, pd.DataFrame): self.support_names_ = [X_raw.columns[i] for i, x in enumerate(self.support_) if x] + self.tag_df = pd.DataFrame({'predictor': list(X_raw.columns)}) + self.tag_df['Boruta'] = 1 + self.tag_df['Boruta'] = np.where(self.tag_df['predictor'].isin(list(self.support_names_)), 1, 0) - # ranking, confirmed variables are rank 1 + if isinstance(X_raw, pd.DataFrame): + self.support_weak_names_ = [X_raw.columns[i] for i, x in enumerate(self.support_weak_) if x] + self.tag_df['Boruta_weak_incl'] = np.where(self.tag_df['predictor'].isin( + list(self.support_names_ + self.support_weak_names_) + ), 1, 0) + + # ranking, confirmed variables are 
rank 1 self.ranking_ = np.ones(n_feat, dtype=np.int) # tentative variables are rank 2 self.ranking_[tentative] = 2 @@ -470,6 +565,20 @@ def _fit(self, X_raw, y): return self def _transform(self, X, weak=False, return_df=False): + """ + Private method + + transform the predictor matrix by dropping the rejected and (optional) the undecided predictors + :param X: pd.DataFrame + predictor matrix + :param weak: bool + whether to drop or not the undecided predictors + :param return_df: bool + return a pandas dataframe or not + :return: + X: np.array or pd.DataFrame + the transformed predictors matrix + """ # sanity check try: self.ranking_ @@ -489,7 +598,7 @@ def _transform(self, X, weak=False, return_df=False): def _get_tree_num(self, n_feat): depth = self.estimator.get_params()['max_depth'] - if depth == None: + if depth is None: depth = 10 # how many times a feature should be considered on average f_repr = 100 @@ -498,17 +607,36 @@ def _get_tree_num(self, n_feat): n_estimators = int(multi * f_repr) return n_estimators - def _get_imp(self, X, y): + def _get_imp(self, X, y, sample_weight=None): + """ + Get the native feature importance (impurity based for instance) + This is know to return biased and uninformative results. + e.g. + https://scikit-learn.org/stable/auto_examples/inspection/ + plot_permutation_importance.html#sphx-glr-auto-examples-inspection-plot-permutation-importance-py + + or + https://explained.ai/rf-importance/ + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + The training input samples. + y : array-like, shape = [n_samples] + The target values. + sample_weight : array-like, shape = [n_samples], default=None + Individual weights for each sample + """ try: - if self._is_catboost: + if self.is_cat: X = pd.DataFrame(X) obj_feat = X.dtypes.loc[(X.dtypes == 'object') | (X.dtypes == 'category')].index.tolist() if obj_feat: X[obj_feat] = X[obj_feat].astype('str').astype('category') cat_feat = X.dtypes.loc[X.dtypes == 'category'].index.tolist() - self.estimator.fit(X, y, sample_weight=self.weight, cat_features=cat_feat) + self.estimator.fit(X, y, sample_weight=sample_weight, cat_features=cat_feat) else: - self.estimator.fit(X, y, sample_weight=self.weight) + self.estimator.fit(X, y, sample_weight=sample_weight) except Exception as e: raise ValueError('Please check your X and y variable. 
The provided ' @@ -520,111 +648,28 @@ def _get_imp(self, X, y): 'are currently supported in BorutaPy.') return imp - def _get_shap_imp(self, X, y): - # SHAP and permutation importances must be computed on unseen data - if self.weight is not None: - w = self.weight - if is_regressor(self.estimator): - X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, random_state=42) - else: - X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, stratify=y, random_state=42) - else: - if is_regressor(self.estimator): - X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, random_state=42) - else: - X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, stratify=y, random_state=42) - w_tr, w_tt = None, None - - X_tr = pd.DataFrame(X_tr) - X_tt = pd.DataFrame(X_tt) - obj_feat = list(set(list(X_tr.columns)) - set(list(X_tr.select_dtypes(include=[np.number])))) - obj_idx = None - - if obj_feat: - X_tr[obj_feat] = X_tr[obj_feat].astype('str').astype('category') - X_tt[obj_feat] = X_tt[obj_feat].astype('str').astype('category') - obj_idx = np.argwhere(X_tr.columns.isin(obj_feat)).ravel() - - if self._is_tree_based(): - try: - if self._is_catboost: - model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_feat) - else: - model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr) - - except Exception as e: - raise ValueError('Please check your X and y variable. The provided ' - 'estimator cannot be fitted to your data.\n' + str(e)) - # build the explainer - explainer = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent") - shap_values = explainer.shap_values(X_tt) - # flatten to 2D if classification and lightgbm - if is_classifier(self.estimator): - if isinstance(shap_values, list): - # for lightgbm clf sklearn api, shap returns list of arrays - # https://github.com/slundberg/shap/issues/526 - class_inds = range(len(shap_values)) - shap_imp = np.zeros(shap_values[0].shape[1]) - for i, ind in enumerate(class_inds): - shap_imp += np.abs(shap_values[ind]).mean(0) - shap_imp /= len(shap_values) - else: - shap_imp = np.abs(shap_values).mean(0) - else: - shap_imp = np.abs(shap_values).mean(0) - else: - raise ValueError('Not a tree based model') - - return shap_imp - - def _get_perm_imp(self, X, y): - - if self.weight is not None: - w = self.weight - if is_regressor(self.estimator): - X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, random_state=42) - else: - X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, stratify=y, random_state=42) - else: - if is_regressor(self.estimator): - X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, random_state=42) - else: - X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, stratify=y, random_state=42) - w_tr, w_tt = None, None - - X_tr = pd.DataFrame(X_tr) - X_tt = pd.DataFrame(X_tt) - obj_feat = list(set(list(X_tr.columns)) - set(list(X_tr.select_dtypes(include=[np.number])))) - obj_idx = None - - if obj_feat: - X_tr[obj_feat] = X_tr[obj_feat].astype('str').astype('category') - X_tt[obj_feat] = X_tt[obj_feat].astype('str').astype('category') - obj_idx = np.argwhere(X_tr.columns.isin(obj_feat)).ravel() - - if self._is_tree_based(): - try: - if self._is_catboost: - model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_feat) - else: - model = self.estimator.fit(X_tr, y_tr, sample_weight=w_tr) - - except Exception as e: - raise ValueError('Please check your X and y variable. 
The provided ' - 'estimator cannot be fitted to your data.\n' + str(e)) - - perm_imp = permutation_importance(model, X_tt, y_tt, n_repeats=5, random_state=42, n_jobs=-1) - imp = perm_imp.importances_mean.ravel() - else: - raise ValueError('Not a tree based model') - - return imp - def _get_shuffle(self, seq): self.random_state.shuffle(seq) return seq - def _add_shadows_get_imps(self, X, y, dec_reg): + def _add_shadows_get_imps(self, X, y, sample_weight, dec_reg): + """ + Add a shuffled copy of the columns (shadows) and get the feature importance of the augmented data set + + :param X: pd.DataFrame of shape [n_samples, n_features] + predictor matrix + :param y: pd.series of shape [n_samples] + target + :param sample_weight: array-like, shape = [n_samples], default=None + Individual weights for each sample + :param dec_reg: array + holds the decision about each feature 1, 0, -1 (accepted, undecided, rejected) + :return: + imp_real: array + feature importance of the real predictors + imp_sha: array + feature importance of the shadow predictors + """ # find features that are tentative still x_cur_ind = np.where(dec_reg >= 0)[0] x_cur = np.copy(X[:, x_cur_ind]) @@ -638,9 +683,9 @@ def _add_shadows_get_imps(self, X, y, dec_reg): x_sha = np.apply_along_axis(self._get_shuffle, 0, x_sha) # get importance of the merged matrix if self.importance == 'shap': - imp = self._get_shap_imp(np.hstack((x_cur, x_sha)), y) + imp = _get_shap_imp(self.estimator, np.hstack((x_cur, x_sha)), y, sample_weight) elif self.importance == 'pimp': - imp = self._get_perm_imp(np.hstack((x_cur, x_sha)), y) + imp = _get_perm_imp(self.estimator, np.hstack((x_cur, x_sha)), y, sample_weight) else: imp = self._get_imp(np.hstack((x_cur, x_sha)), y) @@ -652,7 +697,18 @@ def _add_shadows_get_imps(self, X, y, dec_reg): return imp_real, imp_sha - def _assign_hits(self, hit_reg, cur_imp, imp_sha_max): + @staticmethod + def _assign_hits(hit_reg, cur_imp, imp_sha_max): + """ + count how many times a given feature was more important than the best of the shadow features + :param hit_reg: array + count how many times a given feature was more important than the best of the shadow features + :param cur_imp: array + current importance + :param imp_sha_max: array + importance of the best shadow predictor + :return: + """ # register hits for features that did better than the best of shadows cur_imp_no_nan = cur_imp[0] cur_imp_no_nan[np.isnan(cur_imp_no_nan)] = 0 @@ -661,6 +717,20 @@ def _assign_hits(self, hit_reg, cur_imp, imp_sha_max): return hit_reg def _do_tests(self, dec_reg, hit_reg, _iter): + """ + Perform the test of whether the feature should be tagged as relevant (confirmed), not relevant (rejected) + or undecided. The test is based on the binomial distribution of the hit counts accumulated over the iterations. + I.e. count how many times a given feature was more important than the best of the shadow features + and test whether the associated probability lies below, between or above the rejection or acceptance + threshold. 
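As a rough numerical illustration of the hit test (made-up counts; the actual decision logic, including the corrections described further down, is in the method body):

    from scipy import stats as sps

    n_iter, hits, alpha = 20, 17, 0.05                 # hypothetical: 17 hits out of 20 iterations
    p_accept = sps.binom.sf(hits - 1, n_iter, 0.5)     # P[X >= 17 | p=0.5], roughly 0.0013
    p_reject = sps.binom.cdf(hits, n_iter, 0.5)        # P[X <= 17 | p=0.5], close to 1
    # a small p_accept means the feature beats the shadows more often than chance (candidate for confirmation),
    # while a small p_reject would instead point towards rejection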
+ + :param dec_reg: array + holds the decision about each feature 1, 0, -1 (accepted, undecided, rejected) + :param hit_reg: array + counts how many times a given feature was more important than the best of the shadow features + :param _iter: + :return: + """ active_features = np.where(dec_reg >= 0)[0] hits = hit_reg[active_features] # get uncorrected p values based on hit_reg @@ -696,7 +766,8 @@ def _do_tests(self, dec_reg, hit_reg, _iter): dec_reg[active_features[to_reject]] = -1 return dec_reg - def _fdrcorrection(self, pvals, alpha=0.05): + @staticmethod + def _fdrcorrection(pvals, alpha=0.05): """ Benjamini/Hochberg p-value correction for false discovery rate, from statsmodels package. Included here for decoupling dependency on statsmodels. @@ -734,7 +805,8 @@ def _fdrcorrection(self, pvals, alpha=0.05): reject_[pvals_sortind] = reject return reject_, pvals_corrected_ - def _nanrankdata(self, X, axis=1): + @staticmethod + def _nanrankdata(X, axis=1): """ Replaces bottleneck's nanrankdata with scipy and numpy alternative. """ @@ -744,7 +816,12 @@ def _nanrankdata(self, X, axis=1): def _check_params(self, X, y): """ + Private method Check hyperparameters as well as X and y before proceeding with fit. + :param X: pd.DataFrame + predictor matrix + :param y: pd.series + target """ # check X and y are consistent len, X is Array and y is column X, y = check_X_y(X, y, dtype=None, force_all_finite=False) @@ -755,6 +832,19 @@ def _check_params(self, X, y): raise ValueError('Alpha should be between 0 and 1.') def _print_results(self, dec_reg, _iter, flag): + """ + Private method + printing the result + :param dec_reg: array + if the feature as been tagged as relevant (confirmed), not relevant (rejected) or undecided + :param _iter: int + the iteration number + :param flag: int + is still in the feature selection process or not + :return: + output: str + the output to be printed out + """ n_iter = str(_iter) + ' / ' + str(self.max_iter) n_confirmed = np.where(dec_reg == 1)[0].shape[0] n_rejected = np.where(dec_reg == -1)[0].shape[0] @@ -780,3 +870,125 @@ def _print_results(self, dec_reg, _iter, flag): vimp = 'native' output = "\n\nBorutaPy finished running using " + vimp + " var. 
imp.\n\n" + result print(output) + + +def _split_fit_estimator(estimator, X, y, sample_weight=None): + """ + Private function + split the train, test and fit the model + + :param estimator: sklearn estimator + :param X: pd.DataFrame of shape [n_samples, n_features] + predictor matrix + :param y: pd.series of shape [n_samples] + target + :param sample_weight: array-like, shape = [n_samples], default=None + Individual weights for each sample + + :return: + model + fitted model + X_tt: array [n_samples, n_features] + the test split, predictors + y_tt: array [n_samples] + the test split, target + """ + if sample_weight is not None: + w = sample_weight + if is_regressor(estimator): + X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, random_state=42) + else: + X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, stratify=y, random_state=42) + else: + if is_regressor(estimator): + X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, random_state=42) + else: + X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, stratify=y, random_state=42) + w_tr, w_tt = None, None + + X_tr = pd.DataFrame(X_tr) + X_tt = pd.DataFrame(X_tt) + obj_feat = list(set(list(X_tr.columns)) - set(list(X_tr.select_dtypes(include=[np.number])))) + obj_idx = None + + if obj_feat: + X_tr[obj_feat] = X_tr[obj_feat].astype('str').astype('category') + X_tt[obj_feat] = X_tt[obj_feat].astype('str').astype('category') + obj_idx = np.argwhere(X_tr.columns.isin(obj_feat)).ravel() + + if check_if_tree_based(estimator): + try: + if is_catboost(estimator): + model = estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_feat) + else: + model = estimator.fit(X_tr, y_tr, sample_weight=w_tr) + + except Exception as e: + raise ValueError('Please check your X and y variable. The provided ' + 'estimator cannot be fitted to your data.\n' + str(e)) + else: + raise ValueError('Not a tree based model') + + return model, X_tt, y_tt, w_tt + + +def _get_shap_imp(estimator, X, y, sample_weight=None): + """ + Private function + Get the SHAP feature importance + + :param estimator: sklearn estimator + :param X: pd.DataFrame of shape [n_samples, n_features] + predictor matrix + :param y: pd.series of shape [n_samples] + target + :param sample_weight: array-like, shape = [n_samples], default=None + Individual weights for each sample + + :return: + shap_imp, array + the SHAP importance array + """ + model, X_tt, y_tt, w_tt = _split_fit_estimator(estimator, X, y, sample_weight=sample_weight) + # build the explainer + explainer = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent") + shap_values = explainer.shap_values(X_tt) + # flatten to 2D if classification and lightgbm + if is_classifier(estimator): + if isinstance(shap_values, list): + # for lightgbm clf sklearn api, shap returns list of arrays + # https://github.com/slundberg/shap/issues/526 + class_inds = range(len(shap_values)) + shap_imp = np.zeros(shap_values[0].shape[1]) + for i, ind in enumerate(class_inds): + shap_imp += np.abs(shap_values[ind]).mean(0) + shap_imp /= len(shap_values) + else: + shap_imp = np.abs(shap_values).mean(0) + else: + shap_imp = np.abs(shap_values).mean(0) + + return shap_imp + + +def _get_perm_imp(estimator, X, y, sample_weight): + """ + Private function + Get the permutation feature importance + + :param estimator: sklearn estimator + :param X: pd.DataFrame of shape [n_samples, n_features] + predictor matrix + :param y: pd.series of shape [n_samples] + target + :param sample_weight: array-like, shape = [n_samples], default=None + 
Individual weights for each sample + + :return: + imp, array + the permutation importance array + """ + model, X_tt, y_tt, w_tt = _split_fit_estimator(estimator, X, y, sample_weight=sample_weight) + perm_imp = permutation_importance(model, X_tt, y_tt, n_repeats=5, random_state=42, n_jobs=-1) + imp = perm_imp.importances_mean.ravel() + return imp diff --git a/boruta/test/modif_tests.py b/boruta/test/modif_tests.py index 507db6f..3bc4a5d 100644 --- a/boruta/test/modif_tests.py +++ b/boruta/test/modif_tests.py @@ -92,7 +92,9 @@ def get_titanic_data(): X = pd.DataFrame(X_trans, columns = X.columns) # encode X, cat_var_df, inv_mapper = cat_var(X) - return X, y + # sample weight is just a dummy random vector for testing purpose + sample_weight = np.random.uniform(0,1, len(y)) + return X, y, sample_weight def get_boston_data(): boston = load_boston() @@ -103,23 +105,23 @@ def get_boston_data(): return X, y # Testing the changes with rnd cat. and num. predictors added to the set of genuine predictors -def testing_estimators(varimp, models, X, y): +def testing_estimators(varimp, models, X, y, sample_weight=None): for model in models: - print('testing ' + str(type(model)) + ' for var.imp: ' + varimp) - feat_selector = bp(model, n_estimators = 100, verbose= 1, max_iter= 10, random_state=42, weight=None, importance=varimp) - feat_selector.fit(X, y) + print('='*20 +' testing: {mod:>55} for var.imp: {vimp:<15} '.format(mod=str(model), vimp=varimp)+'='*20 ) + feat_selector = noglmgroot.BorutaPy(model, n_estimators = 100, verbose= 1, max_iter= 10, random_state=42, weight=None, importance=varimp) + feat_selector.fit(X, y, sample_weight) print(feat_selector.support_names_) - feat_selector.plot_importance() + feat_selector.plot_importance(n_feat_per_inch=3) gc.enable() del(feat_selector, model) gc.collect() -def testing_clf_all_varimp(X, y): +def testing_clf_all_varimp(X, y, sample_weight=None): for varimp in ['shap', 'pimp', 'native']: - models = [catboost.CatBoostClassifier(random_state=42, verbose=0), XGBClassifier(random_state=42, verbose=0), LGBMClassifier(random_state=42, verbose=0)] - testing_estimators(varimp=varimp, models=models, X=X, y=y) + models = [RandomForestClassifier(n_jobs= 4, oob_score= True), catboost.CatBoostClassifier(random_state=42, verbose=0), XGBClassifier(random_state=42, verbose=0), LGBMClassifier(random_state=42, verbose=0)] + testing_estimators(varimp=varimp, models=models, X=X, y=y, sample_weight=sample_weight) gc.enable() - del(models) + del models gc.collect() def testing_regr_all_varimp(X, y): @@ -129,13 +131,13 @@ def testing_regr_all_varimp(X, y): gc.enable() del(models) gc.collect() - -X, y = get_titanic_data() -X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42) +print('='*20 + ' Benchmarking using sklearn permutation importance ' + '='*20 ) +X, y, sample_weight = get_titanic_data() +X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(X, y, sample_weight, stratify=y, random_state=42) # lightgbm faster and better than RF lgb_model = LGBMClassifier(n_jobs= 4) -lgb_model.fit(X_train, y_train) +lgb_model.fit(X_train, y_train, sample_weight=w_train) result = permutation_importance(lgb_model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2) sorted_idx = result.importances_mean.argsort() # Plot @@ -146,9 +148,10 @@ def testing_regr_all_varimp(X, y): plt.show() if __name__ == '__main__': - # classification and cat. pred - X, y = get_titanic_data() - testing_clf_all_varimp(X=X, y=y) + # classification and cat. 
pred, sample weight is just a dummy random vector for testing purpose + X, y, sample_weight = get_titanic_data() #get_titanic_data() + testing_clf_all_varimp(X=X, y=y, sample_weight=sample_weight) + # regression X, y = get_boston_data() testing_regr_all_varimp(X=X, y=y) \ No newline at end of file From dfd1d00d15e46deabd5c8cf14c1445a1a30355be Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Fri, 12 Feb 2021 09:43:24 +0100 Subject: [PATCH 10/19] typo --- .idea/.gitignore | 3 + .idea/boruta_py.iml | 12 + .../inspectionProfiles/profiles_settings.xml | 6 + .idea/misc.xml | 4 + .idea/modules.xml | 8 + .idea/vcs.xml | 6 + .../boruta_py-checkpoint.py | 993 ++++++++++++++++++ boruta/boruta_py.py | 1 - 8 files changed, 1032 insertions(+), 1 deletion(-) create mode 100644 .idea/.gitignore create mode 100644 .idea/boruta_py.iml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml create mode 100644 boruta/.ipynb_checkpoints/boruta_py-checkpoint.py diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..0e40fe8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ + +# Default ignored files +/workspace.xml \ No newline at end of file diff --git a/.idea/boruta_py.iml b/.idea/boruta_py.iml new file mode 100644 index 0000000..7c9d48f --- /dev/null +++ b/.idea/boruta_py.iml @@ -0,0 +1,12 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..a2e120d --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..b8ad864 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/boruta/.ipynb_checkpoints/boruta_py-checkpoint.py b/boruta/.ipynb_checkpoints/boruta_py-checkpoint.py new file mode 100644 index 0000000..91286ad --- /dev/null +++ b/boruta/.ipynb_checkpoints/boruta_py-checkpoint.py @@ -0,0 +1,993 @@ +class BorutaPy(BaseEstimator, TransformerMixin): + """ + Improved Python implementation of the Boruta R package. + The improvements of this implementation include: + - Faster run times: + Thanks to scikit-learn's fast implementation of the ensemble methods. + - Scikit-learn like interface: + Use BorutaPy just like any other scikit learner: fit, fit_transform and + transform are all implemented in a similar fashion. + - Modularity: + Any ensemble method could be used: random forest, extra trees + classifier, even gradient boosted trees. + - Two step correction: + The original Boruta code corrects for multiple testing in an overly + conservative way. In this implementation, the Benjamini Hochberg FDR is + used to correct in each iteration across active features. This means + only those features are included in the correction which are still in + the selection process. 
Following this, each that passed goes through a + regular Bonferroni correction to check for the repeated testing over + the iterations. + - Percentile: + Instead of using the max values of the shadow features the user can + specify which percentile to use. This gives a finer control over this + crucial parameter. For more info, please read about the perc parameter. + - Automatic tree number: + Setting the n_estimator to 'auto' will calculate the number of trees + in each iteration based on the number of features under investigation. + This way more trees are used when the training data has many features + and less when most of the features have been rejected. + - Ranking of features: + After fitting BorutaPy it provides the user with ranking of features. + Confirmed ones are 1, Tentatives are 2, and the rejected are ranked + starting from 3, based on their feature importance history through + the iterations. + - Using either the native variable importance, scikit permutation importance, + SHAP importance. + We highly recommend using pruned trees with a depth between 3-7. + For more, see the docs of these functions, and the examples below. + Original code and method by: Miron B Kursa, https://m2.icm.edu.pl/boruta/ + Boruta is an all relevant feature selection method, while most other are + minimal optimal; this means it tries to find all features carrying + information usable for prediction, rather than finding a possibly compact + subset of features on which some classifier has a minimal error. + Why bother with all relevant feature selection? + When you try to understand the phenomenon that made your data, you should + care about all factors that contribute to it, not just the bluntest signs + of it in context of your methodology (yes, minimal optimal set of features + by definition depends on your classifier choice). + + + Summary + ------- + - Loop over n_iter or until dec_reg == 0 + - add shadows + o find features that are tentative + o make sure that at least 5 columns are added + o shuffle shadows + o get feature importance + * fit the estimator + * extract feature importance (native, shap or permutation) + * return feature importance + o separate the importance of shadows and real + + - Calculate the maximum shadow importance and append to the previous run + - Assign hits using the imp_sha_max of this run + o find all the feat imp > imp_sha_max + o tag them as hits + o add +1 to the previous tag vector + - Perform a test + o select non rejected features yet + o get a binomial p-values (nbr of times the feat has been tagged as important on the n_iter done so far) + o reject or not according the (corrected) p-val + + + Parameters + ---------- + estimator : object + A supervised learning estimator, with a 'fit' method that returns the + feature_importances_ attribute. Important features must correspond to + high absolute values in the feature_importances_. + n_estimators : int or string, default = 1000 + If int sets the number of estimators in the chosen ensemble method. + If 'auto' this is determined automatically based on the size of the + dataset. The other parameters of the used estimators need to be set + with initialisation. + perc : int, default = 100 + Instead of the max we use the percentile defined by the user, to pick + our threshold for comparison between shadow and real features. The max + tend to be too stringent. This provides a finer control over this. 
The + lower perc is the more false positives will be picked as relevant but + also the less relevant features will be left out. The usual trade-off. + The default is essentially the vanilla Boruta corresponding to the max. + alpha : float, default = 0.05 + Level at which the corrected p-values will get rejected in both + correction steps. + importance : str, default = 'shap' + The kind of variable importance used to compare and discriminate original + vs shadow predictors. Note that the builtin tree importance (gini/impurity based + importance) is biased towards numerical and large cardinality predictors, even + if they are random. Shapley values and permutation imp. are robust w.r.t those predictors. + two_step : Boolean, default = True + If you want to use the original implementation of Boruta with Bonferroni + correction only set this to False. + max_iter : int, default = 100 + The number of maximum iterations to perform. + random_state : int, RandomState instance or None; default=None + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + verbose : int, default=0 + Controls verbosity of output: + - 0: no output + - 1: displays iteration number + - 2: which features have been selected already + + + Attributes + ---------- + n_features_ : int + The number of selected features. + support_ : array of shape [n_features] + The mask of selected features - only confirmed ones are True. + support_weak_ : array of shape [n_features] + The mask of selected tentative features, which haven't gained enough + support during the max_iter number of iterations.. + ranking_ : array of shape [n_features] + The feature ranking, such that ``ranking_[i]`` corresponds to the + ranking position of the i-th feature. Selected (i.e., estimated + best) features are assigned rank 1 and tentative features are assigned + rank 2. 
+ cat_name : list of str + the name of the categorical columns + cat_idx : list of int + the index of the categorical columns + imp_real_hist : array + array of the historical feature importance of the real predictors + sha_max : float + the maximum feature importance of the shadow predictors + col_names : list of str + the names of the real predictors + tag_df : dataframe + the df with the details (accepted or rejected) of the feature selection + + + Examples + -------- + + import pandas as pd + from sklearn.ensemble import RandomForestClassifier + from boruta import BorutaPy + + # load X and y + # NOTE BorutaPy accepts numpy arrays only, hence the .values attribute + X = pd.read_csv('examples/test_X.csv', index_col=0).values + y = pd.read_csv('examples/test_y.csv', header=None, index_col=0).values + y = y.ravel() + + # define random forest classifier, with utilising all cores and + # sampling in proportion to y labels + rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5) + + # define Boruta feature selection method + feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1) + + # find all relevant features - 5 features should be selected + feat_selector.fit(X, y) + + # check selected features - first 5 features are selected + feat_selector.support_ + + # check ranking of features + feat_selector.ranking_ + + # call transform() on X to filter it down to selected features + X_filtered = feat_selector.transform(X) + References + ---------- + [1] Kursa M., Rudnicki W., "Feature Selection with the Boruta Package" + Journal of Statistical Software, Vol. 36, Issue 11, Sep 2010 + """ + + def __init__(self, estimator, n_estimators=1000, perc=90, alpha=0.05, importance='shap', + two_step=True, max_iter=100, random_state=None, weight=None, verbose=0): + self.estimator = estimator + self.n_estimators = n_estimators + self.perc = perc + self.alpha = alpha + self.two_step = two_step + self.max_iter = max_iter + self.random_state = random_state + self.random_state_instance = None + self.verbose = verbose + self.weight = weight + self.importance = importance + self.cat_name = None + self.cat_idx = None + # Catboost doesn't allow to change random seed after fitting + self.is_cat = is_catboost(estimator) + self.is_lgb = is_lightgbm(estimator) + # plotting + self.imp_real_hist = None + self.sha_max = None + self.col_names = None + self.tag_df = None + + def fit(self, X, y, sample_weight=None): + """ + Fits the Boruta feature selection with the provided estimator. + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + The training input samples. + y : array-like, shape = [n_samples] + The target values. + sample_weight : array-like, shape = [n_samples], default=None + Individual weights for each sample + """ + self.imp_real_hist = np.empty((0, X.shape[1]), float) + if isinstance(X, pd.DataFrame) is not True: + X = pd.DataFrame(X) + self.col_names = X.columns.to_list() + + return self._fit(X, y, sample_weight=sample_weight) + + def transform(self, X, weak=False, return_df=False): + """ + Reduces the input X to the features selected by Boruta. + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + The training input samples. + weak: boolean, default = False + If set to true, the tentative features are also used to reduce X. + + return_df : boolean, default = False + If ``X`` if a pandas dataframe and this parameter is set to True, + the transformed data will also be a dataframe. 
+ Returns + ------- + X : array-like, shape = [n_samples, n_features_] + The input matrix X's columns are reduced to the features which were + selected by Boruta. + """ + + return self._transform(X, weak, return_df) + + def fit_transform(self, X, y, sample_weight=None, weak=False, return_df=False): + """ + Fits Boruta, then reduces the input X to the selected features. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + The training input samples. + y : array-like, shape = [n_samples] + The target values. + sample_weight : array-like, shape = [n_samples], default=None + Individual weights for each sample + weak: boolean, default = False + If set to true, the tentative features are also used to reduce X. + return_df : boolean, default = False + If ``X`` if a pandas dataframe and this parameter is set to True, + the transformed data will also be a dataframe. + Returns + ------- + X : array-like, shape = [n_samples, n_features_] + The input matrix X's columns are reduced to the features which were + selected by Boruta. + + Summary + ------- + - Loop over n_iter or until dec_reg == 0 + - add shadows + o find features that are tentative + o make sure that at least 5 columns are added + o shuffle shadows + o get feature importance + * fit the estimator + * extract feature importance (native, shap or permutation) + * return feature importance + o separate the importance of shadows and real + + - Calculate the maximum shadow importance and append to the previous run + - Assign hits using the imp_sha_max of this run + o find all the feat imp > imp_sha_max + o tag them as hits + o add +1 to the previous tag vector + - Perform a test + o select non rejected features yet + o get a binomial p-values (nbr of times the feat has been tagged as important on the n_iter done so far) + o reject or not according the (corrected) p-val + """ + + self._fit(X, y, sample_weight=sample_weight) + return self._transform(X, weak, return_df) + + def plot_importance(self, n_feat_per_inch=5): + """ + Boxplot of the variable importance, ordered by magnitude + The max shadow variable importance illustrated by the dashed line. + Requires to apply the fit method first. 
+ :return: boxplot + """ + if self.imp_real_hist is None: + raise ValueError("Use the fit method first to compute the var.imp") + + color = {'boxes': 'gray', 'whiskers': 'gray', 'medians': '#404040', 'caps': 'gray'} + vimp_df = pd.DataFrame(self.imp_real_hist, columns=self.col_names) + vimp_df = vimp_df.reindex(vimp_df.mean().sort_values(ascending=True).index, axis=1) + bp = vimp_df.boxplot(color=color, + boxprops=dict(linestyle='-', linewidth=1.5), + flierprops=dict(linestyle='-', linewidth=1.5), + medianprops=dict(linestyle='-', linewidth=1.5), + whiskerprops=dict(linestyle='-', linewidth=1.5), + capprops=dict(linestyle='-', linewidth=1.5), + showfliers=False, grid=True, rot=0, vert=False, patch_artist=True, + figsize=(10, vimp_df.shape[1] / n_feat_per_inch), fontsize=9 + ) + blue_color = "#2590fa" + yellow_color = "#f0be00" + box_face_col = [blue_color] * sum(self.support_) + [yellow_color] * sum(self.support_weak_) + for c in range(len(box_face_col)): + bp.findobj(matplotlib.patches.Patch)[len(self.support_) - c - 1].set_facecolor(box_face_col[c]) + bp.findobj(matplotlib.patches.Patch)[len(self.support_) - c - 1].set_color(box_face_col[c]) + + xrange = vimp_df.max(skipna=True).max(skipna=True) - vimp_df.min(skipna=True).min(skipna=True) + bp.set_xlim(left=vimp_df.min(skipna=True).min(skipna=True) - 0.10 * xrange) + + custom_lines = [Line2D([0], [0], color=blue_color, lw=5), + Line2D([0], [0], color=yellow_color, lw=5), + Line2D([0], [0], color="gray", lw=5), + Line2D([0], [0], linestyle='--', color="gray", lw=2)] + bp.legend(custom_lines, ['confirmed', 'tentative', 'rejected', 'sha. max'], loc="lower right") + plt.axvline(x=self.sha_max, linestyle='--', color='gray') + fig = bp.get_figure() + plt.title('Boruta importance and selected predictors') + fig.set_size_inches((10, 1.5 * np.rint(max(vimp_df.shape) / 10))) + # plt.tight_layout() + plt.show() + + @staticmethod + def _validate_pandas_input(arg): + try: + return arg.values + except AttributeError: + raise ValueError( + "input needs to be a numpy array or pandas data frame." + ) + + def _fit(self, X_raw, y, sample_weight=None): + """ + Private method. + Chaining: + - Loop over n_iter or until dec_reg == 0 + - add shadows + o find features that are tentative + o make sure that at least 5 columns are added + o shuffle shadows + o get feature importance + * fit the estimator + * extract feature importance (native, shap or permutation) + * return feature importance + o separate the importance of shadows and real + + - Calculate the maximum shadow importance and append to the previous run + - Assign hits using the imp_sha_max of this run + o find all the feat imp > imp_sha_max + o tag them as hits + o add +1 to the previous tag vector + - Perform a test + o select non rejected features yet + o get a binomial p-values (nbr of times the feat has been tagged as important on the n_iter done so far) + o reject or not according the (corrected) p-val + + Parameters + ---------- + X_raw : array-like, shape = [n_samples, n_features] + The training input samples. + y : array-like, shape = [n_samples] + The target values. 
+ sample_weight : array-like, shape = [n_samples], default=None + Individual weights for each sample + + :return: + self : object + Nothing but attributes + """ + # self.is_cat = is_catboost(self.estimator) + + start_time = time.time() + # basic cat features encoding + # First, let's store "object" columns as categorical columns + # obj_feat = X_raw.dtypes.loc[X_raw.dtypes == 'object'].index.tolist() + obj_feat = list(set(list(X_raw.columns)) - set(list(X_raw.select_dtypes(include=[np.number])))) + + if obj_feat: + X_raw.loc[:, obj_feat] = X_raw[obj_feat].astype('str').astype('category') + cat_feat = X_raw.dtypes.loc[X_raw.dtypes == 'category'].index.tolist() + cat_idx = [X_raw.columns.get_loc(c) for c in cat_feat if c in cat_feat] + if cat_feat: + # a way without loop but need to re-do astype + Cat = X_raw[cat_feat].stack().astype('category').cat.codes.unstack() + + if self.is_cat is False: + if cat_feat: + X = pd.concat([X_raw[X_raw.columns.difference(cat_feat)], Cat], axis=1) + else: + X = X_raw + else: + X = X_raw + + X = np.nan_to_num(X) + y = np.nan_to_num(y) + # w = w.fillna(0) + + self.cat_name = cat_feat + self.cat_idx = cat_idx + + # check input params + self._check_params(X, y) + + if not isinstance(X, np.ndarray): + X = self._validate_pandas_input(X) + + if not isinstance(y, np.ndarray): + y = self._validate_pandas_input(y) + + if sample_weight is not None: + if not isinstance(sample_weight, np.ndarray): + sample_weight = self._validate_pandas_input(sample_weight) + + self.random_state = check_random_state(self.random_state) + # setup variables for Boruta + n_sample, n_feat = X.shape + _iter = 1 + # holds the decision about each feature: + # 0 - default state = tentative in original code + # 1 - accepted in original code + # -1 - rejected in original code + dec_reg = np.zeros(n_feat, dtype=np.int) + # counts how many times a given feature was more important than + # the best of the shadow features + hit_reg = np.zeros(n_feat, dtype=np.int) + # these record the history of the iterations + imp_history = np.zeros(n_feat, dtype=np.float) + sha_max_history = [] + + # set n_estimators + if self.n_estimators != 'auto': + self.estimator.set_params(n_estimators=self.n_estimators) + + # main feature selection loop + while np.any(dec_reg == 0) and _iter < self.max_iter: + # find optimal number of trees and depth + if self.n_estimators == 'auto': + # number of features that aren't rejected + not_rejected = np.where(dec_reg >= 0)[0].shape[0] + n_tree = self._get_tree_num(not_rejected) + self.estimator.set_params(n_estimators=n_tree) + + # make sure we start with a new tree in each iteration + # Catboost doesn't allow to change random seed after fitting + if self.is_cat is False: + if self.is_lgb: + self.estimator.set_params(random_state=self.random_state.randint(0, 10000)) + else: + self.estimator.set_params(random_state=self.random_state) + + # add shadow attributes, shuffle them and train estimator, get imps + cur_imp = self._add_shadows_get_imps(X, y, sample_weight, dec_reg) + + # get the threshold of shadow importances we will use for rejection + imp_sha_max = np.percentile(cur_imp[1], self.perc) + + # record importance history + sha_max_history.append(imp_sha_max) + imp_history = np.vstack((imp_history, cur_imp[0])) + + # register which feature is more imp than the max of shadows + hit_reg = self._assign_hits(hit_reg, cur_imp, imp_sha_max) + + # based on hit_reg we check if a feature is doing better than + # expected by chance + dec_reg = self._do_tests(dec_reg, hit_reg, _iter) + + # 
print out confirmed features + if self.verbose > 0 and _iter < self.max_iter: + self._print_results(dec_reg, _iter, 0) + if _iter < self.max_iter: + _iter += 1 + + # we automatically apply R package's rough fix for tentative ones + confirmed = np.where(dec_reg == 1)[0] + tentative = np.where(dec_reg == 0)[0] + # ignore the first row of zeros + tentative_median = np.median(imp_history[1:, tentative], axis=0) + # which tentative to keep + tentative_confirmed = np.where(tentative_median + > np.median(sha_max_history))[0] + tentative = tentative[tentative_confirmed] + + # basic result variables + self.n_features_ = confirmed.shape[0] + self.support_ = np.zeros(n_feat, dtype=np.bool) + self.support_[confirmed] = 1 + self.support_weak_ = np.zeros(n_feat, dtype=np.bool) + self.support_weak_[tentative] = 1 + # for plotting + self.imp_real_hist = imp_history + self.sha_max = imp_sha_max + + if isinstance(X_raw, np.ndarray): + X_raw = pd.DataFrame(X_raw) + + if isinstance(X_raw, pd.DataFrame): + self.support_names_ = [X_raw.columns[i] for i, x in enumerate(self.support_) if x] + self.tag_df = pd.DataFrame({'predictor': list(X_raw.columns)}) + self.tag_df['Boruta'] = 1 + self.tag_df['Boruta'] = np.where(self.tag_df['predictor'].isin(list(self.support_names_)), 1, 0) + + if isinstance(X_raw, pd.DataFrame): + self.support_weak_names_ = [X_raw.columns[i] for i, x in enumerate(self.support_weak_) if x] + self.tag_df['Boruta_weak_incl'] = np.where(self.tag_df['predictor'].isin( + list(self.support_names_ + self.support_weak_names_) + ), 1, 0) + + # ranking, confirmed variables are rank 1 + self.ranking_ = np.ones(n_feat, dtype=np.int) + # tentative variables are rank 2 + self.ranking_[tentative] = 2 + # selected = confirmed and tentative + selected = np.hstack((confirmed, tentative)) + # all rejected features are sorted by importance history + not_selected = np.setdiff1d(np.arange(n_feat), selected) + # large importance values should rank higher = lower ranks -> *(-1) + imp_history_rejected = imp_history[1:, not_selected] * -1 + + # update rank for not_selected features + if not_selected.shape[0] > 0: + # calculate ranks in each iteration, then median of ranks across feats + iter_ranks = self._nanrankdata(imp_history_rejected, axis=1) + rank_medians = np.nanmedian(iter_ranks, axis=0) + ranks = self._nanrankdata(rank_medians, axis=0) + + # set smallest rank to 3 if there are tentative feats + if tentative.shape[0] > 0: + ranks = ranks - np.min(ranks) + 3 + else: + # and 2 otherwise + ranks = ranks - np.min(ranks) + 2 + self.ranking_[not_selected] = ranks + else: + # all are selected, thus we set feature supports to True + self.support_ = np.ones(n_feat, dtype=np.bool) + + # notify user + if self.verbose > 0: + self._print_results(dec_reg, _iter, 1) + self.running_time = time.time() - start_time + hours, rem = divmod(self.running_time, 3600) + minutes, seconds = divmod(rem, 60) + print("All relevant predictors selected in {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds)) + return self + + def _transform(self, X, weak=False, return_df=False): + """ + Private method + + transform the predictor matrix by dropping the rejected and (optional) the undecided predictors + :param X: pd.DataFrame + predictor matrix + :param weak: bool + whether to drop or not the undecided predictors + :param return_df: bool + return a pandas dataframe or not + :return: + X: np.array or pd.DataFrame + the transformed predictors matrix + """ + # sanity check + try: + self.ranking_ + except AttributeError: + raise 
ValueError('You need to call the fit(X, y) method first.')
+
+ if weak:
+ indices = self.support_ + self.support_weak_
+ else:
+ indices = self.support_
+
+ if return_df:
+ X = X.iloc[:, indices]
+ else:
+ X = X[:, indices]
+ return X
+
+ def _get_tree_num(self, n_feat):
+ depth = self.estimator.get_params()['max_depth']
+ if depth is None:
+ depth = 10
+ # how many times a feature should be considered on average
+ f_repr = 100
+ # n_feat * 2 because the training matrix is extended with n shadow features
+ multi = ((n_feat * 2) / (np.sqrt(n_feat * 2) * depth))
+ n_estimators = int(multi * f_repr)
+ return n_estimators
+
+ def _get_imp(self, X, y, sample_weight=None):
+ """
+ Get the native feature importance (impurity-based, for instance).
+ This is known to return biased and potentially uninformative results,
+ see e.g.
+ https://scikit-learn.org/stable/auto_examples/inspection/
+ plot_permutation_importance.html#sphx-glr-auto-examples-inspection-plot-permutation-importance-py
+
+ or
+ https://explained.ai/rf-importance/
+
+ Parameters
+ ----------
+ X : array-like, shape = [n_samples, n_features]
+ The training input samples.
+ y : array-like, shape = [n_samples]
+ The target values.
+ sample_weight : array-like, shape = [n_samples], default=None
+ Individual weights for each sample
+ """
+ try:
+ if self.is_cat:
+ X = pd.DataFrame(X)
+ obj_feat = X.dtypes.loc[(X.dtypes == 'object') | (X.dtypes == 'category')].index.tolist()
+ if obj_feat:
+ X[obj_feat] = X[obj_feat].astype('str').astype('category')
+ cat_feat = X.dtypes.loc[X.dtypes == 'category'].index.tolist()
+ self.estimator.fit(X, y, sample_weight=sample_weight, cat_features=cat_feat)
+ else:
+ self.estimator.fit(X, y, sample_weight=sample_weight)
+
+ except Exception as e:
+ raise ValueError('Please check your X and y variable. 
The provided '
+ 'estimator cannot be fitted to your data.\n' + str(e))
+ try:
+ imp = self.estimator.feature_importances_
+ except Exception:
+ raise ValueError('Only methods with a feature_importances_ attribute '
+ 'are currently supported in BorutaPy.')
+ return imp
+
+ def _get_shuffle(self, seq):
+ self.random_state.shuffle(seq)
+ return seq
+
+ def _add_shadows_get_imps(self, X, y, sample_weight, dec_reg):
+ """
+ Add a shuffled copy of the columns (shadows) and get the feature importance of the augmented data set
+
+ :param X: pd.DataFrame of shape [n_samples, n_features]
+ predictor matrix
+ :param y: pd.series of shape [n_samples]
+ target
+ :param sample_weight: array-like, shape = [n_samples], default=None
+ Individual weights for each sample
+ :param dec_reg: array
+ holds the decision about each feature 1, 0, -1 (accepted, undecided, rejected)
+ :return:
+ imp_real: array
+ feature importance of the real predictors
+ imp_sha: array
+ feature importance of the shadow predictors
+ """
+ # find the features that have not been rejected yet
+ x_cur_ind = np.where(dec_reg >= 0)[0]
+ x_cur = np.copy(X[:, x_cur_ind])
+ x_cur_w = x_cur.shape[1]
+ # deep copy the matrix for the shadow matrix
+ x_sha = np.copy(x_cur)
+ # make sure there are at least 5 columns in the shadow matrix (duplicate them if needed)
+ while (x_sha.shape[1] < 5):
+ x_sha = np.hstack((x_sha, x_sha))
+ # shuffle the shadow matrix
+ x_sha = np.apply_along_axis(self._get_shuffle, 0, x_sha)
+ # get importance of the merged matrix
+ if self.importance == 'shap':
+ imp = _get_shap_imp(self.estimator, np.hstack((x_cur, x_sha)), y, sample_weight)
+ elif self.importance == 'pimp':
+ imp = _get_perm_imp(self.estimator, np.hstack((x_cur, x_sha)), y, sample_weight)
+ else:
+ imp = self._get_imp(np.hstack((x_cur, x_sha)), y)
+
+ # separate importances of real and shadow features
+ imp_sha = imp[x_cur_w:]
+ imp_real = np.zeros(X.shape[1])
+ imp_real[:] = np.nan
+ imp_real[x_cur_ind] = imp[:x_cur_w]
+
+ return imp_real, imp_sha
+
+ @staticmethod
+ def _assign_hits(hit_reg, cur_imp, imp_sha_max):
+ """
+ Count how many times a given feature was more important than the best of the shadow features
+ :param hit_reg: array
+ holds, for each feature, how many times it beat the shadow threshold so far
+ :param cur_imp: tuple of arrays
+ current importance of the real and shadow predictors
+ :param imp_sha_max: float
+ threshold derived from the shadow importances (percentile defined by perc)
+ :return:
+ """
+ # register hits for features that did better than the best of shadows
+ cur_imp_no_nan = cur_imp[0]
+ cur_imp_no_nan[np.isnan(cur_imp_no_nan)] = 0
+ hits = np.where(cur_imp_no_nan > imp_sha_max)[0]
+ hit_reg[hits] += 1
+ return hit_reg
+
+ def _do_tests(self, dec_reg, hit_reg, _iter):
+ """
+ Perform the statistical test deciding whether a feature should be tagged as relevant (confirmed),
+ not relevant (rejected) or left undecided. The test is binomial: count how many times a given
+ feature was more important than the best of the shadow features over the iterations run so far,
+ and check whether that hit count is significantly larger (acceptance) or smaller (rejection)
+ than expected by chance, based on the corrected p-values. 
+ + :param dec_reg: array + holds the decision about each feature 1, 0, -1 (accepted, undecided, rejected) + :param hit_reg: array + counts how many times a given feature was more important than the best of the shadow features + :param _iter: + :return: + """ + active_features = np.where(dec_reg >= 0)[0] + hits = hit_reg[active_features] + # get uncorrected p values based on hit_reg + to_accept_ps = sp.stats.binom.sf(hits - 1, _iter, .5).flatten() + to_reject_ps = sp.stats.binom.cdf(hits, _iter, .5).flatten() + + if self.two_step: + # two step multicor process + # first we correct for testing several features in each round using FDR + to_accept = self._fdrcorrection(to_accept_ps, alpha=self.alpha)[0] + to_reject = self._fdrcorrection(to_reject_ps, alpha=self.alpha)[0] + + # second we correct for testing the same feature over and over again + # using bonferroni + to_accept2 = to_accept_ps <= self.alpha / float(_iter) + to_reject2 = to_reject_ps <= self.alpha / float(_iter) + + # combine the two multi corrections, and get indexes + to_accept *= to_accept2 + to_reject *= to_reject2 + else: + # as in th original Boruta, we simply do bonferroni correction + # with the total n_feat in each iteration + to_accept = to_accept_ps <= self.alpha / float(len(dec_reg)) + to_reject = to_reject_ps <= self.alpha / float(len(dec_reg)) + + # find features which are 0 and have been rejected or accepted + to_accept = np.where((dec_reg[active_features] == 0) * to_accept)[0] + to_reject = np.where((dec_reg[active_features] == 0) * to_reject)[0] + + # updating dec_reg + dec_reg[active_features[to_accept]] = 1 + dec_reg[active_features[to_reject]] = -1 + return dec_reg + + @staticmethod + def _fdrcorrection(pvals, alpha=0.05): + """ + Benjamini/Hochberg p-value correction for false discovery rate, from + statsmodels package. Included here for decoupling dependency on statsmodels. + Parameters + ---------- + pvals : array_like + set of p-values of the individual tests. + alpha : float + error rate + Returns + ------- + rejected : array, bool + True if a hypothesis is rejected, False if not + pvalue-corrected : array + pvalues adjusted for multiple hypothesis testing to limit FDR + """ + pvals = np.asarray(pvals) + pvals_sortind = np.argsort(pvals) + pvals_sorted = np.take(pvals, pvals_sortind) + nobs = len(pvals_sorted) + ecdffactor = np.arange(1, nobs + 1) / float(nobs) + + reject = pvals_sorted <= ecdffactor * alpha + if reject.any(): + rejectmax = max(np.nonzero(reject)[0]) + reject[:rejectmax] = True + + pvals_corrected_raw = pvals_sorted / ecdffactor + pvals_corrected = np.minimum.accumulate(pvals_corrected_raw[::-1])[::-1] + pvals_corrected[pvals_corrected > 1] = 1 + # reorder p-values and rejection mask to original order of pvals + pvals_corrected_ = np.empty_like(pvals_corrected) + pvals_corrected_[pvals_sortind] = pvals_corrected + reject_ = np.empty_like(reject) + reject_[pvals_sortind] = reject + return reject_, pvals_corrected_ + + @staticmethod + def _nanrankdata(X, axis=1): + """ + Replaces bottleneck's nanrankdata with scipy and numpy alternative. + """ + ranks = sp.stats.mstats.rankdata(X, axis=axis) + ranks[np.isnan(X)] = np.nan + return ranks + + def _check_params(self, X, y): + """ + Private method + Check hyperparameters as well as X and y before proceeding with fit. 
+ :param X: pd.DataFrame + predictor matrix + :param y: pd.series + target + """ + # check X and y are consistent len, X is Array and y is column + X, y = check_X_y(X, y, dtype=None, force_all_finite=False) + if self.perc <= 0 or self.perc > 100: + raise ValueError('The percentile should be between 0 and 100.') + + if self.alpha <= 0 or self.alpha > 1: + raise ValueError('Alpha should be between 0 and 1.') + + def _print_results(self, dec_reg, _iter, flag): + """ + Private method + printing the result + :param dec_reg: array + if the feature as been tagged as relevant (confirmed), not relevant (rejected) or undecided + :param _iter: int + the iteration number + :param flag: int + is still in the feature selection process or not + :return: + output: str + the output to be printed out + """ + n_iter = str(_iter) + ' / ' + str(self.max_iter) + n_confirmed = np.where(dec_reg == 1)[0].shape[0] + n_rejected = np.where(dec_reg == -1)[0].shape[0] + cols = ['Iteration: ', 'Confirmed: ', 'Tentative: ', 'Rejected: '] + + # still in feature selection + if flag == 0: + n_tentative = np.where(dec_reg == 0)[0].shape[0] + content = map(str, [n_iter, n_confirmed, n_tentative, n_rejected]) + if self.verbose == 1: + output = cols[0] + n_iter + elif self.verbose > 1: + output = '\n'.join([x[0] + '\t' + x[1] for x in zip(cols, content)]) + + # Boruta finished running and tentatives have been filtered + else: + n_tentative = np.sum(self.support_weak_) + content = map(str, [n_iter, n_confirmed, n_tentative, n_rejected]) + result = '\n'.join([x[0] + '\t' + x[1] for x in zip(cols, content)]) + if self.importance in ['shap', 'pimp']: + vimp = str(self.importance) + else: + vimp = 'native' + output = "\n\nBorutaPy finished running using " + vimp + " var. imp.\n\n" + result + print(output) + + +def _split_fit_estimator(estimator, X, y, sample_weight=None): + """ + Private function + split the train, test and fit the model + + :param estimator: sklearn estimator + :param X: pd.DataFrame of shape [n_samples, n_features] + predictor matrix + :param y: pd.series of shape [n_samples] + target + :param sample_weight: array-like, shape = [n_samples], default=None + Individual weights for each sample + + :return: + model + fitted model + X_tt: array [n_samples, n_features] + the test split, predictors + y_tt: array [n_samples] + the test split, target + """ + if sample_weight is not None: + w = sample_weight + if is_regressor(estimator): + X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, random_state=42) + else: + X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(X, y, w, stratify=y, random_state=42) + else: + if is_regressor(estimator): + X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, random_state=42) + else: + X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, stratify=y, random_state=42) + w_tr, w_tt = None, None + + X_tr = pd.DataFrame(X_tr) + X_tt = pd.DataFrame(X_tt) + obj_feat = list(set(list(X_tr.columns)) - set(list(X_tr.select_dtypes(include=[np.number])))) + obj_idx = None + + if obj_feat: + X_tr[obj_feat] = X_tr[obj_feat].astype('str').astype('category') + X_tt[obj_feat] = X_tt[obj_feat].astype('str').astype('category') + obj_idx = np.argwhere(X_tr.columns.isin(obj_feat)).ravel() + + if check_if_tree_based(estimator): + try: + if is_catboost(estimator): + model = estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_feat) + else: + model = estimator.fit(X_tr, y_tr, sample_weight=w_tr) + + except Exception as e: + raise ValueError('Please check your X and y variable. 
The provided ' + 'estimator cannot be fitted to your data.\n' + str(e)) + else: + raise ValueError('Not a tree based model') + + return model, X_tt, y_tt, w_tt + + +def _get_shap_imp(estimator, X, y, sample_weight=None): + """ + Private function + Get the SHAP feature importance + + :param estimator: sklearn estimator + :param X: pd.DataFrame of shape [n_samples, n_features] + predictor matrix + :param y: pd.series of shape [n_samples] + target + :param sample_weight: array-like, shape = [n_samples], default=None + Individual weights for each sample + + :return: + shap_imp, array + the SHAP importance array + """ + model, X_tt, y_tt, w_tt = _split_fit_estimator(estimator, X, y, sample_weight=sample_weight) + # build the explainer + explainer = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent") + shap_values = explainer.shap_values(X_tt) + # flatten to 2D if classification and lightgbm + if is_classifier(estimator): + if isinstance(shap_values, list): + # for lightgbm clf sklearn api, shap returns list of arrays + # https://github.com/slundberg/shap/issues/526 + class_inds = range(len(shap_values)) + shap_imp = np.zeros(shap_values[0].shape[1]) + for i, ind in enumerate(class_inds): + shap_imp += np.abs(shap_values[ind]).mean(0) + shap_imp /= len(shap_values) + else: + shap_imp = np.abs(shap_values).mean(0) + else: + shap_imp = np.abs(shap_values).mean(0) + + return shap_imp + + +def _get_perm_imp(estimator, X, y, sample_weight): + """ + Private function + Get the permutation feature importance + + :param estimator: sklearn estimator + :param X: pd.DataFrame of shape [n_samples, n_features] + predictor matrix + :param y: pd.series of shape [n_samples] + target + :param sample_weight: array-like, shape = [n_samples], default=None + Individual weights for each sample + + :return: + imp, array + the permutation importance array + """ + model, X_tt, y_tt, w_tt = _split_fit_estimator(estimator, X, y, sample_weight=sample_weight) + perm_imp = permutation_importance(model, X_tt, y_tt, n_repeats=5, random_state=42, n_jobs=-1) + imp = perm_imp.importances_mean.ravel() + return imp diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index e1e14f7..91286ad 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -302,7 +302,6 @@ def plot_importance(self, n_feat_per_inch=5): Requires to apply the fit method first. 
:return: boxplot """ - set_my_plt_style() if self.imp_real_hist is None: raise ValueError("Use the fit method first to compute the var.imp") From 8cab61019f9962051ca46812c45996f3e90e8123 Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Fri, 12 Feb 2021 09:45:33 +0100 Subject: [PATCH 11/19] sync --- .idea/workspace.xml | 56 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 .idea/workspace.xml diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 0000000..2e8498f --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,56 @@ + + + + + + + + + + + + + + + + + + + + + + + 1613117955488 + + + + + + + \ No newline at end of file From cc9f4e54c465e41dcbb20b5a2f363617ea247966 Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Fri, 12 Feb 2021 09:54:13 +0100 Subject: [PATCH 12/19] sync --- boruta/boruta_py.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index 91286ad..5e078b7 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -1,3 +1,20 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Author: Daniel Homola +Original code and method by: Miron B Kursa, https://m2.icm.edu.pl/boruta/ +License: BSD 3 clause +""" + +from __future__ import print_function, division +import numpy as np +import scipy as sp +from sklearn.utils import check_random_state, check_X_y +from sklearn.base import TransformerMixin, BaseEstimator +import warnings + + + class BorutaPy(BaseEstimator, TransformerMixin): """ Improved Python implementation of the Boruta R package. From 0e5b44bb90d56987bd8b2ef91579272e63c33ada Mon Sep 17 00:00:00 2001 From: Thomas Bury Date: Fri, 20 Aug 2021 17:13:08 +0200 Subject: [PATCH 13/19] add docstrings and refactor --- .gitignore | 171 +++ .idea/boruta_py.iml | 2 +- .idea/misc.xml | 5 +- .idea/workspace.xml | 6 + boruta/__init__.py | 2 +- boruta/boruta_py.py | 931 +++++++++++++-- .../Madalon_Data_Set-checkpoint.ipynb | 226 +++- boruta/examples/Madalon_Data_Set.ipynb | 226 +++- boruta/examples/Whats_new.ipynb | 783 +++++++++++++ boruta/examples/Xgboost_boruta_pimp.ipynb | 814 +++++++++++++ .../catboost_info/catboost_training.json | 1004 +++++++++++++++++ .../catboost_info/learn/events.out.tfevents | Bin 0 -> 48870 bytes boruta/examples/catboost_info/learn_error.tsv | 1001 ++++++++++++++++ boruta/examples/catboost_info/time_left.tsv | 1001 ++++++++++++++++ setup.py | 9 +- 15 files changed, 6015 insertions(+), 166 deletions(-) create mode 100644 .gitignore create mode 100644 boruta/examples/Whats_new.ipynb create mode 100644 boruta/examples/Xgboost_boruta_pimp.ipynb create mode 100644 boruta/examples/catboost_info/catboost_training.json create mode 100644 boruta/examples/catboost_info/learn/events.out.tfevents create mode 100644 boruta/examples/catboost_info/learn_error.tsv create mode 100644 boruta/examples/catboost_info/time_left.tsv diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f691a7b --- /dev/null +++ b/.gitignore @@ -0,0 +1,171 @@ +### Python template +# example NB +examples/catboost_info/ +examples/.ipynb_checkpoints/ +examples/cb_model.json + +# example NB +boruta/catboost_info/ +boruta/.ipynb_checkpoints/ +boruta/cb_model.json/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Pycharm +.idea/ + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg 
+*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/dictionaries +.idea/**/shelf + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# CMake +cmake-build-debug/ +cmake-build-release/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests diff --git a/.idea/boruta_py.iml b/.idea/boruta_py.iml index 7c9d48f..432992a 100644 --- a/.idea/boruta_py.iml +++ b/.idea/boruta_py.iml @@ -2,7 +2,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index a2e120d..6f35e9f 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,7 @@ - + + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 0ada83b..2f96d0e 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -2,8 +2,14 @@ + + + + + +
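For orientation, here is a minimal usage sketch of the extended interface that the patches above introduce (the importance option, plot_importance, support_names_ and tag_df). The synthetic data, the estimator settings and the assumption that the patched fit accepts a pandas DataFrame directly (as the private _fit above does) are illustrative only, not part of the patch itself.

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

# synthetic data: a handful of informative predictors hidden among noise
X, y = make_classification(n_samples=500, n_features=20, n_informative=5,
                           n_redundant=2, random_state=42)
X = pd.DataFrame(X, columns=["var_" + str(i) for i in range(X.shape[1])])

# shallow trees, as assumed throughout when SHAP or permutation importance is used
rf = RandomForestClassifier(max_depth=5, n_jobs=-1)

# importance='shap' uses SHAP values computed on a held-out split,
# 'pimp' uses scikit-learn's permutation importance, anything else
# falls back to the estimator's native feature_importances_
feat_selector = BorutaPy(rf, n_estimators='auto', importance='shap',
                         max_iter=50, verbose=1, random_state=1)
feat_selector.fit(X, y)

print(feat_selector.support_names_)   # names of the confirmed predictors
print(feat_selector.tag_df.head())    # per-predictor confirmed / tentative flags

# boxplot of the importance history: confirmed (blue), tentative (yellow),
# rejected (gray), shadow-max threshold as a dashed vertical line
feat_selector.plot_importance(n_feat_per_inch=5)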
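Likewise, a small numeric sketch of the acceptance/rejection test implemented in _do_tests above. The hit counts, alpha and iteration number are made-up values, and only the per-iteration Bonferroni step of the two-step correction is shown.

import numpy as np
from scipy import stats

alpha, n_iter = 0.05, 20
hits = np.array([17, 10, 3])   # hypothetical hit counts for three features

# probability of observing at least `hits` successes under the null
# hypothesis that a feature is no better than the best shadow (p = 0.5)
to_accept_ps = stats.binom.sf(hits - 1, n_iter, .5)
# probability of observing at most `hits` successes under the same null
to_reject_ps = stats.binom.cdf(hits, n_iter, .5)

# Bonferroni-type correction for testing the same feature over the iterations
# (the FDR step across features of the two-step correction is omitted here)
print(np.round(to_accept_ps, 4), to_accept_ps <= alpha / n_iter)
# [0.0013 0.5881 0.9998] [ True False False]  -> the first feature gets confirmed
print(np.round(to_reject_ps, 4), to_reject_ps <= alpha / n_iter)
# [0.9998 0.5881 0.0013] [False False  True]  -> the last feature gets rejected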