From f68cfcd3aa05704dd89ee98717eec04a2c3a7708 Mon Sep 17 00:00:00 2001 From: Mark Peng Date: Thu, 27 Dec 2018 22:13:02 +0800 Subject: [PATCH 1/2] Fixed "tuple index out of range" IndexError in _fit and the unit test. --- boruta/boruta_py.py | 2 +- boruta/test/unit_tests.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py index 99e80d1..bfeb373 100644 --- a/boruta/boruta_py.py +++ b/boruta/boruta_py.py @@ -333,7 +333,7 @@ def _fit(self, X, y): imp_history_rejected = imp_history[1:, not_selected] * -1 # update rank for not_selected features - if not_selected.shape[0] > 0 and not_selected.shape[1] > 0: + if not_selected.shape[0] > 0: # calculate ranks in each iteration, then median of ranks across feats iter_ranks = self._nanrankdata(imp_history_rejected, axis=1) rank_medians = np.nanmedian(iter_ranks, axis=0) diff --git a/boruta/test/unit_tests.py b/boruta/test/unit_tests.py index 7f4889b..e15e0b6 100644 --- a/boruta/test/unit_tests.py +++ b/boruta/test/unit_tests.py @@ -40,7 +40,7 @@ def test_if_boruta_extracts_relevant_features(self): bt.fit(X, y) # make sure that only all the relevant features are returned - self.assertItemsEqual(range(5), list(np.where(bt.support_)[0])) + self.assertListEqual(list(range(5)), list(np.where(bt.support_)[0])) if __name__ == '__main__': From badc33e852cf7123cc44ae5fae88b9995c5b7fc8 Mon Sep 17 00:00:00 2001 From: Mark Peng Date: Thu, 27 Dec 2018 22:46:39 +0800 Subject: [PATCH 2/2] Fixed compatibility issues in example notebook for recent versions of Python 3 and pandas. 
--- boruta/examples/Madalon_Data_Set.ipynb | 86 +++++++++++++++----------- 1 file changed, 50 insertions(+), 36 deletions(-) diff --git a/boruta/examples/Madalon_Data_Set.ipynb b/boruta/examples/Madalon_Data_Set.ipynb index af6404b..435eba0 100644 --- a/boruta/examples/Madalon_Data_Set.ipynb +++ b/boruta/examples/Madalon_Data_Set.ipynb @@ -30,9 +30,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", @@ -53,21 +51,18 @@ " # URLS for dataset via UCI\n", " train_data_url='https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.data'\n", " train_label_url='https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.labels'\n", - " \n", - " \n", + "\n", " X_data = pd.read_csv(train_data_url, sep=\" \", header=None)\n", " y_data = pd.read_csv(train_label_url, sep=\" \", header=None)\n", - " data = X_data.ix[:,0:499]\n", - " data['target'] = y_data[0] \n", + " data = X_data.loc[:, :499]\n", + " data['target'] = y_data[0]\n", " return data" ] }, { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "data = load_data()" @@ -76,9 +71,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -267,13 +260,11 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ - "y=data.pop('target')\n", - "X=data.copy()" + "y = data.pop('target')\n", + "X = data.copy().values" ] }, { @@ -293,9 +284,9 @@ }, "outputs": [], "source": [ - "rf = RandomForestClassifier(n_jobs=-1, class_weight='auto', max_depth=7)\n", - "# define Boruta feature selection method\n", - "feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2)" + "rf = RandomForestClassifier(n_jobs=-1, class_weight=None, 
max_depth=7, random_state=0)\n", + "# Define Boruta feature selection method\n", + "feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=0)" ] }, { @@ -308,12 +299,10 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ - "feat_selector.fit(X,y)" + "feat_selector.fit(X, y)" ] }, { @@ -328,15 +317,13 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ - "# check selected features\n", + "# Check selected features\n", "print(feat_selector.support_)\n", - "#select the chosen features from our dataframe.\n", - "selected = X.ix[:,feat_selector.support_]\n", + "# Select the chosen features from our dataframe.\n", + "selected = X[:, feat_selector.support_]\n", "print (\"\")\n", "print (\"Selected Feature Matrix Shape\")\n", "print (selected.shape)" @@ -352,9 +339,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "feat_selector.ranking_" @@ -386,9 +371,38 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.6.5" + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 }