Merge pull request #1 from bnriiitb/v0.1.0

V0.1.0
bnriiitb · Nov 9, 2021 · 06cbfba · 06cbfba
2 parents 777cd56 + 451d4c6
commit 06cbfba
Show file tree

Hide file tree

Showing 20 changed files with 286 additions and 2 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/.idea/.gitignore b/.idea/.gitignore
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
diff --git a/.idea/key-driver-analysis.iml b/.idea/key-driver-analysis.iml
diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/.idea/modules.xml b/.idea/modules.xml
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,3 @@
+# Extra (non-Python) files to ship in the source distribution.
+include README.md
+# NOTE(review): no infer_gender/ directory appears in this changeset; these two
+# patterns look copied from another project -- confirm and drop or retarget them.
+include infer_gender/data/*
+include infer_gender/data/py37/*
diff --git a/README.md b/README.md
@@ -1,2 +1,91 @@
-# key-driver-analysis
-Key Driver Analysis
+# Key Driver Analysis
+---
+
+Key Driver Analysis is also known as Importance Analysis and Relative Importance Analysis. The goal of this analysis is to quantify the relative importance of each of the predictor variables in predicting the target variable. Each of the predictors is commonly referred to as a driver.
+
+For more information on key driver analysis refer to [this blog post](https://bnriiitb.github.io/blog/key-driver-analysis/driver-analysis/importance-analysis/relative-importance-analysis/johnson-relative-weights/shapley-regression/2021/05/04/key-driver-analysis.html) 
+
+
+## Installation
+---
+### Using pip
+[![PyPi Version](https://badge.fury.io/py/key-driver-analysis.svg)](https://pypi.org/project/key-driver-analysis/)
+
+You can install using the pip package manager by running
+```sh
+pip install key-driver-analysis
+```
+
+Alternatively, you could install the latest version directly from Github:
+```sh
+pip install https://github.com/bnriiitb/key-driver-analysis/archive/master.zip
+```
+
+### Using conda
+
+You can install using the conda package manager by running
+```sh
+conda install -c conda-forge key-driver-analysis
+```
+### From source
+
+Download the source code by cloning the repository or by pressing 'Download ZIP' on this page.
+
+Install by navigating to the proper directory and running:
+```sh
+python setup.py install
+```
+
+
+## Usage
+---
+
+```python
+import pandas as pd
+import key_driver_analysis as kda
+
+df = pd.DataFrame(data={
+    'age': [40, 50, 60, 10, 20, 30, 7, 80, 90],
+    'salary': [123, 4423, 56563, 75545, 2345, 2346, 5534, 775, 34345],
+    'no_of_cars_owned': [1, 3, 4, 2, 1, 3, 5, 3, 2],
+    'no_of_mobiles_purchased': [10, 3, 5, 65, 34, 6, 21, 76, 9]
+})
+print(df)
+target = 'no_of_mobiles_purchased'
+features=set(df.columns.tolist()).difference(set([target]))
+print(f'target --> {target}')
+print(f'features --> {features}')
+rw_df = kda.relative_importance(df,
+                                target=target,
+                                features=features,
+                                verbose=True)
+print(rw_df)
+```
+
+```text
+
+   age  salary  no_of_cars_owned  no_of_mobiles_purchased
+0   40     123                 1                       10
+1   50    4423                 3                        3
+2   60   56563                 4                        5
+3   10   75545                 2                       65
+4   20    2345                 1                       34
+5   30    2346                 3                        6
+6    7    5534                 5                       21
+7   80     775                 3                       76
+8   90   34345                 2                        9
+target --> no_of_mobiles_purchased
+features --> {'salary', 'no_of_cars_owned', 'age'}
+(9, 4)
+Dataset size before dropping nulls --> (9, 4)
+Dataset size after dropping nulls --> (9, 4)
+r2 score --> 0.05963122389990851
+            feature  raw_rel_imp  norm_rel_imp
+0            salary     0.035140     58.928857
+1  no_of_cars_owned     0.019415     32.558853
+2               age     0.005076      8.512289
+```
+## References
+* [RWA Web: A Free, Comprehensive, Web-Based, and User-Friendly Tool for Relative Weight Analyses by Scott Tonidandel and James M. LeBreton](https://link.springer.com/article/10.1007/s10869-014-9351-z)
+* [Relative Importance Analysis: A Useful Supplement to Regression Analysis by Scott Tonidandel and James M. LeBreton](https://link.springer.com/article/10.1007/s10869-010-9204-3)
+* [Determining the Statistical Significance of Relative Weights by Scott Tonidandel et al](https://pubmed.ncbi.nlm.nih.gov/19968399/)
diff --git a/build.sh b/build.sh
@@ -0,0 +1,9 @@
+#!/bin/sh
+# Regenerate requirements.txt, rebuild the sdist/wheel and validate them with twine.
+# Stop at the first failing command (or unset variable) so a broken step never
+# reaches the packaging/validation stage.
+set -eu
+
+# -f: a clean checkout has none of these outputs yet; that must not be an error.
+rm -f requirements.txt
+pipreqs .
+rm -rf build key_driver_analysis.egg-info dist
+python setup.py sdist bdist_wheel
+twine check dist/*
+# Upload commands are kept commented out; uncomment one to publish.
+#twine upload --repository testpypi dist/*
+#twine upload --repository pypi dist/*
diff --git a/docs/.gitkeep b/docs/.gitkeep
diff --git a/key_driver_analysis/__init__.py b/key_driver_analysis/__init__.py
@@ -0,0 +1 @@
+from .kda import relative_importance
diff --git a/key_driver_analysis/kda.py b/key_driver_analysis/kda.py
@@ -0,0 +1,74 @@
+import numpy as np
+import pandas as pd
+from sklearn.feature_selection import VarianceThreshold
+
+
+def relative_importance(df, target, features, drop_features_with_nulls=True, drop_invariant_features=True,
+                        invariance_threshold=0.03, verbose=False):
+    """Compute the relative weight (RW) of each feature for predicting ``target``.
+
+    The weights are derived from the correlation matrix of the selected columns:
+    the feature correlation matrix is replaced by its symmetric square root,
+    the target is regressed on that orthogonalised basis, and the squared
+    coefficients are mapped back onto the original features.
+
+    Args:
+        df: pandas DataFrame holding ``target`` and every column in ``features``.
+            It is copied, never modified.
+        target: Name of the target column. Rows where it is null are dropped.
+        features: Iterable of feature column names (list, tuple, set, Index, ...).
+        drop_features_with_nulls: If True, features containing any null value
+            (after the target-null rows were removed) are excluded.
+        drop_invariant_features: If True, features whose variance is at or below
+            ``invariance_threshold`` are excluded.
+        invariance_threshold: Variance cut-off used when
+            ``drop_invariant_features`` is True.
+        verbose: If True, progress information is printed to stdout.
+
+    Returns:
+        pandas DataFrame with columns ``feature``, ``raw_rel_imp`` and
+        ``norm_rel_imp`` (raw weight as a percentage of the summed squared
+        coefficients), sorted by ``raw_rel_imp`` in descending order. The frame
+        is empty when no feature survives the filters.
+
+    Raises:
+        ValueError: If the variance filter cannot be fitted at all (for example
+            on non-numeric feature data).
+    """
+    _df = df.copy()
+    # Materialise as a list: works for any iterable, keeps a stable order, and
+    # avoids indexing a DataFrame with a set (rejected by recent pandas versions).
+    _features = list(features)
+    if verbose:
+        print(f"Dataset size before dropping nulls --> {_df.shape}")
+    # drop if there are any nulls in the target
+    _df = _df.dropna(subset=[target])
+    if verbose:
+        print(f"Dataset size after dropping nulls --> {_df.shape}")
+
+    # drop features with null data
+    if drop_features_with_nulls and _features:
+        has_nulls = _df[_features].isna().any()
+        features_with_null_data = [col for col in _features if bool(has_nulls[col])]
+        if features_with_null_data and verbose:
+            print(
+                f'{" ,".join(features_with_null_data)} will be ignored to compute RW due to the presence of nulls'
+            )
+        _features = [col for col in _features if col not in features_with_null_data]
+
+    if drop_invariant_features and _features:
+        feat_selector = VarianceThreshold(threshold=invariance_threshold)
+        try:
+            feat_selector.fit(_df[_features])
+        except ValueError as ve:
+            # fit() may raise after variances_ has already been populated (e.g.
+            # when no feature clears the threshold); keep the best-effort
+            # behaviour then. If variances_ is missing the fit failed outright,
+            # so surface that error instead of an unrelated AttributeError below.
+            if not hasattr(feat_selector, "variances_"):
+                raise
+            print(ve)
+        invariant_features = [
+            feat
+            for feat, var in zip(_features, feat_selector.variances_)
+            if var <= invariance_threshold
+        ]
+        if verbose and invariant_features:
+            print(
+                f'{" ,".join(invariant_features)} will be ignored to compute RW due to invariance'
+            )
+        # exclude the invariant features
+        _features = [col for col in _features if col not in invariant_features]
+
+    if not _features:
+        # Nothing left to weigh: return an empty result with the usual columns.
+        return pd.DataFrame(columns=["feature", "raw_rel_imp", "norm_rel_imp"])
+
+    # Target first, so row/column 0 of the correlation matrix belongs to it.
+    all_features = [target] + _features
+    corr_all = _df[all_features].apply(pd.to_numeric, errors="coerce").corr()
+    corr_xx = corr_all.iloc[1:, 1:].to_numpy()
+    corr_xy = corr_all.iloc[1:, 0].to_numpy()
+
+    # The correlation matrix is symmetric, so use eigh: it guarantees real
+    # eigenvalues and orthonormal eigenvectors, unlike the general-purpose eig.
+    eigvals, eigvecs = np.linalg.eigh(corr_xx)
+    # Round-off can push a zero eigenvalue slightly negative; clip before sqrt.
+    delta = np.diag(np.sqrt(np.clip(eigvals, 0.0, None)))
+    # Symmetric square root of corr_xx: links features to their orthogonal basis.
+    coef_xz = eigvecs @ delta @ eigvecs.transpose()
+    # Coefficients of the target on the orthogonal basis.
+    coef_yz = np.linalg.solve(coef_xz, corr_xy)
+    rsquare = np.sum(np.square(coef_yz))
+    if verbose:
+        print(f"r2 score --> {rsquare}")
+    raw_weights = np.square(coef_xz) @ np.square(coef_yz)
+    normalized_weights = (raw_weights / rsquare) * 100
+    rw_df = pd.DataFrame(
+        data={
+            "feature": _features,
+            "raw_rel_imp": raw_weights,
+            "norm_rel_imp": normalized_weights,
+        }
+    )
+    return rw_df.sort_values(["raw_rel_imp"], ascending=False)
diff --git a/key_driver_analysis/tests/test_relative_importance.py b/key_driver_analysis/tests/test_relative_importance.py
@@ -0,0 +1,20 @@
+import pandas as pd
+
+from key_driver_analysis import relative_importance
+
+TARGET = 'no_of_mobiles_purchased'
+
+
+def _build_sample_frame():
+    """Return the small demo dataset shared by the test and the script entry point."""
+    return pd.DataFrame(data={
+        'age': [40, 50, 60, 10, 20, 30, 7, 80, 90],
+        'salary': [123, 4423, 56563, 75545, 2345, 2346, 5534, 775, 34345],
+        'no_of_cars_owned': [1, 3, 4, 2, 1, 3, 5, 3, 2],
+        'no_of_mobiles_purchased': [10, 3, 5, 65, 34, 6, 21, 76, 9]
+    })
+
+
+def test_relative_importance():
+    """Every feature gets a weight, rows are sorted, and percentages add up to 100."""
+    df = _build_sample_frame()
+    # A list (not a set) keeps column order deterministic and is a valid indexer.
+    features = [col for col in df.columns if col != TARGET]
+    rw_df = relative_importance(df, target=TARGET, features=features)
+    assert sorted(rw_df['feature']) == sorted(features)
+    assert rw_df['raw_rel_imp'].is_monotonic_decreasing
+    assert abs(rw_df['norm_rel_imp'].sum() - 100) < 1e-6
+
+
+if __name__ == "__main__":
+    # Manual run: print the inputs and the resulting weights for inspection.
+    df = _build_sample_frame()
+    print(df)
+    features = [col for col in df.columns if col != TARGET]
+    print(f'target --> {TARGET}')
+    print(f'features --> {features}')
+    rw_df = relative_importance(df,
+                                target=TARGET,
+                                features=features,
+                                verbose=True)
+    print(rw_df)
diff --git a/key_driver_analysis/version.py b/key_driver_analysis/version.py
@@ -0,0 +1 @@
+# Package version string; setup.py executes this file to read it.
+__version__ = "0.1.2"
diff --git a/notebooks/.gitkeep b/notebooks/.gitkeep
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,4 @@
+numpy==1.21.3
+pandas==1.3.4
+scikit_learn==1.0.1
+setuptools==58.0.4
diff --git a/setup.py b/setup.py
@@ -0,0 +1,35 @@
+import pathlib
+from setuptools import setup, find_packages
+import os
+
+# The directory containing this file
+HERE = pathlib.Path(__file__).parent
+
+# The text of the README file, used verbatim as the long description.
+# Decode explicitly as UTF-8 so the build does not depend on the platform's
+# default locale encoding.
+README = (HERE / "README.md").read_text(encoding="utf-8")
+
+# Get the code version by executing version.py in an isolated namespace; this
+# avoids importing the package (and its numpy/pandas/sklearn imports) at build time.
+version = {}
+with open(os.path.join(HERE, "key_driver_analysis/version.py"), encoding="utf-8") as fp:
+    exec(fp.read(), version)
+__version__ = version["__version__"]
+
+setup(
+    name="key-driver-analysis",
+    version=__version__,
+    description="Key Driver Analysis",
+    long_description=README,
+    long_description_content_type="text/markdown",
+    url="https://github.com/bnriiitb/key-driver-analysis",
+    author="Nagaraju Budigam",
+    author_email="[email protected]",
+    license="MIT",
+    packages=find_packages(),
+    install_requires=["numpy>=1.21.3", "pandas>=1.3.4", "scikit_learn>=1.0.1", "setuptools>=58.0.4"],
+    classifiers=[
+        "License :: OSI Approved :: MIT License",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+    ]
+)
diff --git a/test/.gitkeep b/test/.gitkeep