scikit-learn-contrib · ThomasBury · May 14, 2020 · May 16, 2020 · May 18, 2020 · May 28, 2020
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,172 @@
+### Python template
+# example NB
+examples/catboost_info/
+examples/.ipynb_checkpoints/
+examples/cb_model.json
+
+# example NB
+boruta/catboost_info/
+boruta/.ipynb_checkpoints/
+boruta/cb_model.json
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Pycharm
+.idea/
+.idea/*
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# CMake
+cmake-build-debug/
+cmake-build-release/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
diff --git a/README.md b/README.md
@@ -118,6 +118,17 @@ dividing the p-value threshold with the current iteration index.
 If this two step correction is not required, the two_step parameter has to be
 set to False, then (with perc=100) BorutaPy behaves exactly as the R version.
 
+
+## Differences with the original Boruta scheme ##
+
+   - Allow using sample_weight, for applications like Poisson regression or
+       any other application requiring weights
+   - 3 different feature importances: native, SHAP (if installed) and permutation.
+       Native is the fastest of the 3 but the least consistent,
+       because native importance is biased towards numerical and
+       large-cardinality categorical features
+       (see [Beware Default Random Forest Importances](https://explained.ai/rf-importance/#5)).
+
 ## Parameters ##
 
 __estimator__ : object
@@ -175,36 +186,35 @@ __verbose__ : int, default=0
 
 ## Examples ##
 
-```python
-import pandas as pd
-from sklearn.ensemble import RandomForestClassifier
-from boruta import BorutaPy
-
-# load X and y
-# NOTE BorutaPy accepts numpy arrays only, hence the .values attribute
-X = pd.read_csv('examples/test_X.csv', index_col=0).values
-y = pd.read_csv('examples/test_y.csv', header=None, index_col=0).values
-y = y.ravel()
-
-# define random forest classifier, with utilising all cores and
-# sampling in proportion to y labels
-rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
-
-# define Boruta feature selection method
-feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)
-
-# find all relevant features - 5 features should be selected
-feat_selector.fit(X, y)
-
-# check selected features - first 5 features are selected
-feat_selector.support_
-
-# check ranking of features
-feat_selector.ranking_
-
-# call transform() on X to filter it down to selected features
-X_filtered = feat_selector.transform(X)
-```
+    import pandas as pd
+    from sklearn.ensemble import RandomForestClassifier
+    from boruta import BorutaPy
+
+    # load X and y
+    # NOTE BorutaPy accepts numpy arrays only, hence the .values attribute
+    X = pd.read_csv('examples/test_X.csv', index_col=0).values
+    y = pd.read_csv('examples/test_y.csv', header=None, index_col=0).values
+    y = y.ravel()
+
+    # define random forest classifier, utilising all cores and
+    # sampling in proportion to y labels
+    rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
+
+    # define Boruta feature selection method
+    feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)
+
+    # find all relevant features - 5 features should be selected
+    feat_selector.fit(X, y)
+
+    # check selected features - first 5 features are selected
+    feat_selector.support_
+
+    # check ranking of features
+    feat_selector.ranking_
+
+    # call transform() on X to filter it down to selected features
+    X_filtered = feat_selector.transform(X)
+
 
 ## References ##
 

diff --git a/boruta/__init__.py b/boruta/__init__.py
@@ -1 +1 @@
-from .boruta_py import  BorutaPy
+from .boruta_py import BorutaPy
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		from .boruta_py import BorutaPy
		from .boruta_py import BorutaPy