-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
V0.1.0
- Loading branch information
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
include README.md | ||
include infer_gender/data/* | ||
include infer_gender/data/py37/* |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,91 @@ | ||
# key-driver-analysis | ||
Key Driver Analysis | ||
# Key Driver Analysis | ||
--- | ||
|
||
Key Driver Analysis is also known as Importance Analysis and Relative Importance Analysis. The goal of this analysis is to quantify the relative importance of each predictor variable in predicting the target variable. Each predictor is commonly referred to as a driver. | ||
|
||
For more information on key driver analysis refer to [this blog post](https://bnriiitb.github.io/blog/key-driver-analysis/driver-analysis/importance-analysis/relative-importance-analysis/johnson-relative-weights/shapley-regression/2021/05/04/key-driver-analysis.html) | ||
|
||
|
||
## Installation | ||
--- | ||
### Using pip | ||
[![PyPi Version](https://badge.fury.io/py/key-driver-analysis.svg)](https://pypi.org/project/key-driver-analysis/) | ||
|
||
You can install using the pip package manager by running | ||
```sh | ||
pip install key-driver-analysis | ||
``` | ||
|
||
Alternatively, you can install the latest version directly from GitHub: | ||
```sh | ||
pip install https://github.com/TVS-Motor-Company/key-driver-analysis/archive/master.zip | ||
``` | ||
|
||
### Using conda | ||
|
||
You can install using the conda package manager by running | ||
```sh | ||
conda install -c conda-forge key-driver-analysis | ||
``` | ||
### From source | ||
|
||
Download the source code by cloning the repository or by pressing 'Download ZIP' on this page. | ||
|
||
Install by navigating to the proper directory and running: | ||
```sh | ||
python setup.py install | ||
``` | ||
|
||
|
||
## Usage | ||
--- | ||
|
||
```python | ||
import pandas as pd | ||
import key_driver_analysis as kda | ||
|
||
df = pd.DataFrame(data={ | ||
'age': [40, 50, 60, 10, 20, 30, 7, 80, 90], | ||
'salary': [123, 4423, 56563, 75545, 2345, 2346, 5534, 775, 34345], | ||
'no_of_cars_owned': [1, 3, 4, 2, 1, 3, 5, 3, 2], | ||
'no_of_mobiles_purchased': [10, 3, 5, 65, 34, 6, 21, 76, 9] | ||
}) | ||
print(df) | ||
target = 'no_of_mobiles_purchased' | ||
features=set(df.columns.tolist()).difference(set([target])) | ||
print(f'target --> {target}') | ||
print(f'features --> {features}') | ||
rw_df = kda.relative_importance(df, | ||
target=target, | ||
features=features, | ||
verbose=True) | ||
print(rw_df) | ||
``` | ||
|
||
```text | ||
age salary no_of_cars_owned no_of_mobiles_purchased | ||
0 40 123 1 10 | ||
1 50 4423 3 3 | ||
2 60 56563 4 5 | ||
3 10 75545 2 65 | ||
4 20 2345 1 34 | ||
5 30 2346 3 6 | ||
6 7 5534 5 21 | ||
7 80 775 3 76 | ||
8 90 34345 2 9 | ||
target --> no_of_mobiles_purchased | ||
features --> {'salary', 'no_of_cars_owned', 'age'} | ||
(9, 4) | ||
Dataset size before dropping nulls --> (9, 4) | ||
Dataset size after dropping nulls --> (9, 4) | ||
r2 score --> 0.05963122389990851 | ||
feature raw_rel_imp norm_rel_imp | ||
0 salary 0.035140 58.928857 | ||
1 no_of_cars_owned 0.019415 32.558853 | ||
2 age 0.005076 8.512289 | ||
``` | ||
## References | ||
* [RWA Web: A Free, Comprehensive, Web-Based, and User-Friendly Tool for Relative Weight Analyses by Scott Tonidandel and James M. LeBreton](https://link.springer.com/article/10.1007/s10869-014-9351-z) | ||
* [Relative Importance Analysis: A Useful Supplement to Regression Analysis by Scott Tonidandel and James M. LeBreton](https://link.springer.com/article/10.1007/s10869-010-9204-3) | ||
* [Determining the Statistical Significance of Relative Weights by Scott Tonidandel et al](https://pubmed.ncbi.nlm.nih.gov/19968399/) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# Rebuild the distribution artefacts and validate them with twine.

# pipreqs writes a fresh requirements.txt; remove the old one first.
# -f keeps this quiet when the file does not exist yet.
rm -f requirements.txt
pipreqs .

# Clear artefacts from previous builds. -rf so a fresh checkout (where none of
# these directories exist) does not emit "No such file or directory" errors.
rm -rf build
rm -rf key_driver_analysis.egg-info
rm -rf dist

python setup.py sdist bdist_wheel
twine check dist/*

# Upload commands, left disabled here:
#twine upload --repository testpypi dist/*
#twine upload --repository pypi dist/*
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .kda import relative_importance |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
import numpy as np | ||
import pandas as pd | ||
from sklearn.feature_selection import VarianceThreshold | ||
|
||
|
||
def relative_importance(df, target, features, drop_features_with_nulls=True, drop_invariant_features=True,
                        invariance_threshold=0.03, verbose=False):
    """Compute relative weights (RW) of ``features`` for predicting ``target``.

    The input frame is copied, rows whose target is null are dropped, and the
    feature list is optionally pruned before the weights are derived from the
    correlation matrix of the target and the remaining features.

    Args:
        df: pandas DataFrame holding the target and feature columns. It is
            not modified.
        target: Name of the target column.
        features: Iterable of feature column names (list, tuple, set, ...).
            It is not modified.
        drop_features_with_nulls: When true, features containing any null
            value (after null-target rows are removed) are excluded.
        drop_invariant_features: When true, features whose variance is less
            than or equal to ``invariance_threshold`` are excluded.
        invariance_threshold: Variance cut-off used for the invariance check.
        verbose: When true, progress information is printed to stdout.

    Returns:
        A DataFrame with the columns ``feature``, ``raw_rel_imp`` and
        ``norm_rel_imp`` (raw weight as a percentage of the summed squared
        coefficients), sorted by ``raw_rel_imp`` in descending order.

    Raises:
        ValueError: If the variance selector fails before it could compute
            any variances (for example on input it cannot validate).
    """
    _df = df.copy()
    # Materialise as a list: callers may pass a set or tuple, but the code
    # below needs a stable order and pandas >= 2.0 rejects set indexers.
    _features = list(features)
    if verbose:
        print(_df.shape)
        print(f"Dataset size before dropping nulls --> {_df.shape}")
    # drop if there are any nulls in the target
    _df = _df.dropna(subset=[target])
    if verbose:
        print(f"Dataset size after dropping nulls --> {_df.shape}")

    # drop features with null data
    if drop_features_with_nulls:
        feature_frame = _df[_features]
        features_with_null_data = feature_frame.columns[feature_frame.isna().any()].tolist()
        if len(features_with_null_data) > 0 and verbose:
            print(
                f'{" ,".join(features_with_null_data)} will be ignored to compute RW due to the presence of nulls'
            )
        _features = [col for col in _features if col not in features_with_null_data]

    if drop_invariant_features:
        feat_selector = VarianceThreshold(threshold=invariance_threshold)
        try:
            feat_selector.fit(_df[_features])
        except ValueError as ve:
            # The selector also raises when no feature clears the threshold,
            # but by then it has already stored the variances we need. Any
            # failure that leaves ``variances_`` unset is a genuine error and
            # must not be masked by an AttributeError further down.
            if not hasattr(feat_selector, "variances_"):
                raise
            print(ve)
        invariant_features = [
            feat
            for feat, var in zip(_features, feat_selector.variances_)
            if var <= invariance_threshold
        ]
        if verbose and len(invariant_features) > 0:
            print(
                f'{" ,".join(invariant_features)} will be ignored to compute RW due to invariance'
            )
        # exclude the invariant features
        _features = [col for col in _features if col not in invariant_features]

    # Correlation matrix with the target in row/column 0 and features after it.
    all_features = [target, *_features]
    corr_all = _df[all_features].apply(pd.to_numeric, errors="coerce").corr()
    corr_xx = corr_all.iloc[1:, 1:].to_numpy()  # feature-feature correlations
    corr_xy = corr_all.iloc[1:, 0].to_numpy()   # feature-target correlations

    # Rebuild the feature correlation matrix from its eigen-decomposition with
    # square-rooted eigenvalues: coef_xz = V * sqrt(D) * V^T.
    w_corr_xx, v_corr_xx = np.linalg.eig(corr_xx)
    delta = np.sqrt(np.diag(w_corr_xx))
    coef_xz = v_corr_xx @ delta @ v_corr_xx.transpose()
    coef_yz = np.linalg.inv(coef_xz) @ corr_xy

    rsquare = np.square(coef_yz).sum()
    if verbose:
        print(f"r2 score --> {rsquare}")

    raw_weights = np.square(coef_xz) @ np.square(coef_yz)
    normalized_weights = (raw_weights / rsquare) * 100
    rw_df = pd.DataFrame(
        data={
            "feature": _features,
            "raw_rel_imp": raw_weights,
            "norm_rel_imp": normalized_weights,
        }
    )
    return rw_df.sort_values(["raw_rel_imp"], ascending=False)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Demo: run the relative-importance analysis on a small hand-made dataset.
if __name__ == "__main__":
    import pandas as pd
    from key_driver_analysis import relative_importance

    df = pd.DataFrame(data={
        'age': [40, 50, 60, 10, 20, 30, 7, 80, 90],
        'salary': [123, 4423, 56563, 75545, 2345, 2346, 5534, 775, 34345],
        'no_of_cars_owned': [1, 3, 4, 2, 1, 3, 5, 3, 2],
        'no_of_mobiles_purchased': [10, 3, 5, 65, 34, 6, 21, 76, 9]
    })
    print(df)
    target = 'no_of_mobiles_purchased'
    # Every column except the target, kept as an ordered list: a set has no
    # stable order and pandas >= 2.0 refuses sets as column indexers.
    features = [col for col in df.columns if col != target]
    print(f'target --> {target}')
    print(f'features --> {features}')
    rw_df = relative_importance(df,
                                target=target,
                                features=features,
                                verbose=True)
    print(rw_df)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
__version__ = "0.1.2" |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
numpy==1.21.3 | ||
pandas==1.3.4 | ||
scikit_learn==1.0.1 | ||
setuptools==58.0.4 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
import pathlib | ||
from setuptools import setup, find_packages | ||
import os | ||
|
||
# The directory containing this file
HERE = pathlib.Path(__file__).parent

# The text of the README file, used as the long description.
# Decode explicitly as UTF-8 so the build does not depend on the locale's
# default encoding.
README = (HERE / "README.md").read_text(encoding="utf-8")

# Get the code version by executing version.py in an isolated namespace, so
# the package (and its dependencies) need not be importable at build time.
version = {}
with open(os.path.join(HERE, "key_driver_analysis/version.py"), encoding="utf-8") as fp:
    exec(fp.read(), version)
__version__ = version["__version__"]

setup(
    name="key-driver-analysis",
    version=__version__,
    description="Key Driver Analysis",
    long_description=README,
    long_description_content_type="text/markdown",
    url="https://github.com/bnriiitb/key-driver-analysis",
    author="Nagaraju Budigam",
    # NOTE(review): this value looks like an obfuscation placeholder rather
    # than a real address - confirm before publishing.
    author_email="[email protected]",
    license="MIT",
    packages=find_packages(),
    install_requires=["numpy>=1.21.3", "pandas>=1.3.4", "scikit_learn>=1.0.1", "setuptools>=58.0.4"],
    classifiers=[
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
    ]
)