This notebook contains the following sections:
- Data Understanding
- Data Exploration
- Model Building
- Hyperparameter tuning
- Final submission
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
raw_train=pd.read_csv('./Financial_Risk_Participants_Data/Train.csv')
raw_train.head()
| | City | Location_Score | Internal_Audit_Score | External_Audit_Score | Fin_Score | Loss_score | Past_Results | IsUnderRisk |
|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 8.032 | 14 | 8 | 3 | 6 | 0 | 1 |
| 1 | 31 | 77.730 | 8 | 3 | 3 | 8 | 1 | 0 |
| 2 | 40 | 59.203 | 3 | 12 | 11 | 3 | 0 | 1 |
| 3 | 12 | 73.080 | 4 | 5 | 7 | 6 | 0 | 0 |
| 4 | 4 | 15.666 | 13 | 15 | 6 | 7 | 2 | 1 |
raw_test=pd.read_csv('./Financial_Risk_Participants_Data/Test.csv')
raw_test.head()
| | City | Location_Score | Internal_Audit_Score | External_Audit_Score | Fin_Score | Loss_score | Past_Results |
|---|---|---|---|---|---|---|---|
| 0 | 41 | 18.272 | 13 | 12 | 9 | 7 | 0 |
| 1 | 17 | 64.799 | 6 | 10 | 7 | 4 | 1 |
| 2 | 31 | 68.890 | 3 | 3 | 7 | 8 | 0 |
| 3 | 3 | 16.492 | 15 | 10 | 7 | 4 | 1 |
| 4 | 9 | 17.178 | 7 | 3 | 8 | 3 | 1 |
raw_train.shape
(543, 8)
raw_train.isnull().sum()
City 0
Location_Score 0
Internal_Audit_Score 0
External_Audit_Score 0
Fin_Score 0
Loss_score 0
Past_Results 0
IsUnderRisk 0
dtype: int64
raw_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 543 entries, 0 to 542
Data columns (total 8 columns):
City 543 non-null int64
Location_Score 543 non-null float64
Internal_Audit_Score 543 non-null int64
External_Audit_Score 543 non-null int64
Fin_Score 543 non-null int64
Loss_score 543 non-null int64
Past_Results 543 non-null int64
IsUnderRisk 543 non-null int64
dtypes: float64(1), int64(7)
memory usage: 34.0 KB
plt.figure(figsize=(12,9))
sns.heatmap(raw_train.corr(),annot=True)
plt.show()
# discrete: numeric features (excluding the target) with fewer than 10 unique values
discrete = [var for var in raw_train.columns if raw_train[var].dtype!='O' and var!='IsUnderRisk' and raw_train[var].nunique()<10]
# continuous: the remaining numeric features
continuous = [var for var in raw_train.columns if raw_train[var].dtype!='O' and var!='IsUnderRisk' and var not in discrete]
# categorical: object-typed features
categorical = [var for var in raw_train.columns if raw_train[var].dtype=='O']
print('There are {} discrete variables'.format(len(discrete)))
print('There are {} continuous variables'.format(len(continuous)))
print('There are {} categorical variables'.format(len(categorical)))
There are 1 discrete variables
There are 6 continuous variables
There are 0 categorical variables
discrete
['Past_Results']
continuous
['City',
'Location_Score',
'Internal_Audit_Score',
'External_Audit_Score',
'Fin_Score',
'Loss_score']
sns.pairplot(raw_train)  # pairplot creates its own figure, so no plt.figure() call is needed
plt.show()
def sub_file(filename, preds):
    # write one column per class ('0' and '1'), e.g. the output of predict_proba
    sub_df = pd.DataFrame(preds, columns=['0', '1'])
    sub_df.to_excel(filename, index=False)
    print(sub_df.head())
X=raw_train.drop('IsUnderRisk',axis=1)
y=raw_train.IsUnderRisk
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier, AdaBoostClassifier,VotingClassifier
from sklearn.tree import DecisionTreeClassifier,plot_tree
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,roc_auc_score,log_loss,make_scorer
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV, RandomizedSearchCV
log_loss_scorer = make_scorer(log_loss)  # note: scores hard predict() labels; pass needs_proba=True to score probabilities instead
def build_model(model, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)
    # log loss here is computed on hard class labels, not probabilities
    print("Training Log Loss:", log_loss(y_train, y_train_pred))
    print("Testing Log Loss:", log_loss(y_test, y_pred))
    print("Training accuracy:", accuracy_score(y_train, y_train_pred))
    print("Testing accuracy:", accuracy_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    cv_results = cross_val_score(model, X, y, cv=10, n_jobs=-1, scoring=log_loss_scorer)
    print("Cross validation Log Loss:", cv_results.mean())
from sklearn.preprocessing import StandardScaler
# note: the scaler is fit on all of X here, before build_model's train/test split,
# so the held-out rows leak into the scaling statistics (see the pipeline sketch below)
std = StandardScaler()
X_std = std.fit_transform(X)
lr = LogisticRegression()
build_model(lr, X_std, y)
Training Log Loss: 6.36666658039777
Testing Log Loss: 4.4362171294801
Training accuracy: 0.815668202764977
Testing accuracy: 0.8715596330275229
[[35 6]
[ 8 60]]
Cross validation Log Loss: 6.67873178982795
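One way to avoid that scaling leakage is to bundle the scaler and classifier in a Pipeline, so the scaler is refit on each training split only; a minimal sketch:

from sklearn.pipeline import Pipeline

lr_pipe = Pipeline([("scale", StandardScaler()), ("clf", LogisticRegression())])
build_model(lr_pipe, X, y)  # the scaler now sees only each training split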
lr_params = {
    "penalty": ['l1', 'l2'],  # note: the default lbfgs solver supports only 'l2'; 'l1' requires solver='liblinear' or 'saga'
    "C": [0.001, 0.01, 0.1, 1, 10, 100]
}
lr_grid=GridSearchCV(estimator=lr,
param_grid=lr_params,
cv=10,verbose=2,n_jobs=-1)
lr_grid.fit(X_std,y)
Fitting 10 folds for each of 12 candidates, totalling 120 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 33 tasks | elapsed: 6.8s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 7.5s finished
GridSearchCV(cv=10, error_score=nan,
estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
fit_intercept=True,
intercept_scaling=1, l1_ratio=None,
max_iter=100, multi_class='auto',
n_jobs=None, penalty='l2',
random_state=None, solver='lbfgs',
tol=0.0001, verbose=0,
warm_start=False),
iid='deprecated', n_jobs=-1,
param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
'penalty': ['l1', 'l2']},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=2)
print("Grid_search best score:",lr_grid.best_score_)
Grid_search best score: 0.8067676767676769
lr_grid.best_params_
{'C': 0.01, 'penalty': 'l2'}
build_model(lr_grid.best_estimator_, X, y)  # note: the grid was tuned on X_std, but this evaluates on unscaled X
Training Log Loss: 6.685000001537178
Testing Log Loss: 5.069963481656719
Training accuracy: 0.8064516129032258
Testing accuracy: 0.8532110091743119
[[34 7]
[ 9 59]]
Cross validation Log Loss: 6.486852633294605
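The grid searches in this notebook optimize the default scorer (accuracy) even though the competition metric is log loss. scikit-learn's built-in 'neg_log_loss' scorer, which uses predicted probabilities, could be passed instead to tune directly for the metric; a sketch:

lr_grid_ll = GridSearchCV(estimator=lr,
                          param_grid=lr_params,
                          scoring='neg_log_loss',  # probability-based log loss, negated so higher is better
                          cv=10, n_jobs=-1)
lr_grid_ll.fit(X_std, y)
print("Best CV log loss:", -lr_grid_ll.best_score_)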
std=StandardScaler()
X_std=std.fit_transform(X)
svc=SVC(probability=True)
build_model(svc,X_std,y)
Training Log Loss: 4.377059668396568
Testing Log Loss: 4.4361951222113785
Training accuracy: 0.8732718894009217
Testing accuracy: 0.8715596330275229
[[38 3]
[11 57]]
Cross validation Log Loss: 5.090145364653574
svc_params={
"C":[0.001,0.01,0.1,1,10,100,1000],
"gamma":['auto','scale'],
"class_weight":['balanced',None]
}
svc_grid=GridSearchCV(estimator=svc,
param_grid=svc_params,
cv=10,verbose=2,n_jobs=-1)
svc_grid.fit(X_std,y)
Fitting 10 folds for each of 28 candidates, totalling 280 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 58 tasks | elapsed: 2.0s
[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed: 11.4s finished
GridSearchCV(cv=10, error_score=nan,
estimator=SVC(C=1.0, break_ties=False, cache_size=200,
class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3,
gamma='scale', kernel='rbf', max_iter=-1,
probability=True, random_state=None, shrinking=True,
tol=0.001, verbose=False),
iid='deprecated', n_jobs=-1,
param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
'class_weight': ['balanced', None],
'gamma': ['auto', 'scale']},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=2)
svc_grid.best_score_
0.8636363636363636
svc_grid.best_params_
{'C': 10, 'class_weight': 'balanced', 'gamma': 'auto'}
build_model(svc_grid.best_estimator_,X_std,y)
Training Log Loss: 3.103722299058151
Testing Log Loss: 4.436187786455138
Training accuracy: 0.9101382488479263
Testing accuracy: 0.8715596330275229
[[39 2]
[12 56]]
Cross validation Log Loss: 4.709853740451932
rf=RandomForestClassifier()
build_model(rf,X,y)
Training Log Loss: 9.992007221626413e-16
Testing Log Loss: 3.4855792618245687
Training accuracy: 1.0
Testing accuracy: 0.8990825688073395
[[39 2]
[ 9 59]]
Cross validation Log Loss: 5.2785444787755615
rf_params = {
    "n_estimators": [120, 300, 500, 800, 1200],
    "max_depth": [5, 8, 15, 25, 30, None],
    # note: 1 and None are not valid min_samples_split values; those candidates fail to fit and score nan
    "min_samples_split": [1, 2, 5, 10, 15, None],
    "min_samples_leaf": [1, 2, 5, 10],
    "max_features": ["log2", "sqrt", None]
}
rf_random=RandomizedSearchCV(estimator=rf,
param_distributions=rf_params,
n_iter=50,cv=10,verbose=2,random_state=4,n_jobs=-1)
rf_random.fit(X,y)
Fitting 10 folds for each of 50 candidates, totalling 500 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 33 tasks | elapsed: 51.3s
[Parallel(n_jobs=-1)]: Done 154 tasks | elapsed: 2.3min
[Parallel(n_jobs=-1)]: Done 357 tasks | elapsed: 4.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 5.8min finished
RandomizedSearchCV(cv=10, error_score=nan,
estimator=RandomForestClassifier(bootstrap=True,
ccp_alpha=0.0,
class_weight=None,
criterion='gini',
max_depth=None,
max_features='auto',
max_leaf_nodes=None,
max_samples=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators=100,
n_job...
warm_start=False),
iid='deprecated', n_iter=50, n_jobs=-1,
param_distributions={'max_depth': [5, 8, 15, 25, 30, None],
'max_features': ['log2', 'sqrt', None],
'min_samples_leaf': [1, 2, 5, 10],
'min_samples_split': [1, 2, 5, 10, 15,
None],
'n_estimators': [120, 300, 500, 800,
1200]},
pre_dispatch='2*n_jobs', random_state=4, refit=True,
return_train_score=False, scoring=None, verbose=2)
rf_random.best_score_
0.861919191919192
rf_random.best_params_
{'n_estimators': 500,
'min_samples_split': 2,
'min_samples_leaf': 2,
'max_features': 'sqrt',
'max_depth': 5}
build_model(rf_random.best_estimator_,X,y)
Training Log Loss: 3.342469601327119
Testing Log Loss: 4.1193256140011885
Training accuracy: 0.9032258064516129
Testing accuracy: 0.8807339449541285
[[38 3]
[10 58]]
Cross validation Log Loss: 4.831973690775467
plt.barh(y=X.columns, width=rf_random.best_estimator_.feature_importances_*100)
plt.show()
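For easier reading, the importances can be paired with the column names and sorted before plotting; a small sketch:

importances = pd.Series(rf_random.best_estimator_.feature_importances_ * 100, index=X.columns)
importances.sort_values().plot.barh(figsize=(12, 9), title="Random forest feature importances (%)")
plt.show()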
gb=GradientBoostingClassifier()
build_model(gb,X,y)
Training Log Loss: 1.3529087490857206
Testing Log Loss: 3.4855865975808085
Training accuracy: 0.9608294930875576
Testing accuracy: 0.8990825688073395
[[38 3]
[ 8 60]]
Cross validation Log Loss: 4.516827815637886
gb.get_params()
{'ccp_alpha': 0.0,
'criterion': 'friedman_mse',
'init': None,
'learning_rate': 0.1,
'loss': 'deviance',
'max_depth': 3,
'max_features': None,
'max_leaf_nodes': None,
'min_impurity_decrease': 0.0,
'min_impurity_split': None,
'min_samples_leaf': 1,
'min_samples_split': 2,
'min_weight_fraction_leaf': 0.0,
'n_estimators': 100,
'n_iter_no_change': None,
'presort': 'deprecated',
'random_state': None,
'subsample': 1.0,
'tol': 0.0001,
'validation_fraction': 0.1,
'verbose': 0,
'warm_start': False}
gb_params = {
    "learning_rate": [0.01, 0.015, 0.025, 0.05, 0.1],
    "max_depth": [3, 5, 7, 9, 12, 14],
    "subsample": [0.6, 0.7, 0.8, 0.9, 1.0],
    # note: 1 and None are not valid min_samples_split values; those candidates fail to fit and score nan
    "min_samples_split": [1, 2, 5, 10, 15, None],
    "min_samples_leaf": [1, 2, 5, 10],
    "max_features": ["log2", "sqrt", None],
    "n_estimators": [120, 300, 500, 800, 1200],
}
gb_random=RandomizedSearchCV(estimator=gb,
param_distributions=gb_params,
n_iter=25,cv=10,verbose=2,random_state=4,n_jobs=-1)
gb_random.fit(X,y)
Fitting 10 folds for each of 25 candidates, totalling 250 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 33 tasks | elapsed: 35.9s
[Parallel(n_jobs=-1)]: Done 154 tasks | elapsed: 2.6min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 3.3min finished
RandomizedSearchCV(cv=10, error_score=nan,
estimator=GradientBoostingClassifier(ccp_alpha=0.0,
criterion='friedman_mse',
init=None,
learning_rate=0.1,
loss='deviance',
max_depth=3,
max_features=None,
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators=100,
n_it...
param_distributions={'learning_rate': [0.01, 0.015, 0.025,
0.05, 0.1],
'max_depth': [3, 5, 7, 9, 12, 14],
'max_features': ['log2', 'sqrt', None],
'min_samples_leaf': [1, 2, 5, 10],
'min_samples_split': [1, 2, 5, 10, 15,
None],
'n_estimators': [120, 300, 500, 800,
1200],
'subsample': [0.6, 0.7, 0.8, 0.9, 1.0]},
pre_dispatch='2*n_jobs', random_state=4, refit=True,
return_train_score=False, scoring=None, verbose=2)
gb_random.best_score_
0.8654545454545455
gb_random.best_params_
{'subsample': 0.7,
'n_estimators': 300,
'min_samples_split': 2,
'min_samples_leaf': 1,
'max_features': 'sqrt',
'max_depth': 3,
'learning_rate': 0.1}
build_model(gb_random.best_estimator_,X,y)
Training Log Loss: 0.15916671056970505
Testing Log Loss: 4.11934762126991
Training accuracy: 0.9953917050691244
Testing accuracy: 0.8807339449541285
[[35 6]
[ 7 61]]
Cross validation Log Loss: 5.087831178555844
# default voting='hard' takes a majority vote of predicted labels;
# voting='soft' would average predicted probabilities instead
vote_clf = VotingClassifier([("randomForest", rf_random.best_estimator_), ("GradientBoost", gb_random.best_estimator_)])
build_model(vote_clf, X, y)
Training Log Loss: 3.103714929496583
Testing Log Loss: 4.753057294665327
Training accuracy: 0.9101382488479263
Testing accuracy: 0.8623853211009175
[[39 2]
[13 55]]
Cross validation Log Loss: 4.707530750705729
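Because the metric is probability-based, a soft-voting ensemble that averages the two models' predicted probabilities may also be worth trying; a sketch (not the configuration evaluated above):

soft_vote = VotingClassifier([("randomForest", rf_random.best_estimator_),
                              ("GradientBoost", gb_random.best_estimator_)],
                             voting='soft')
build_model(soft_vote, X, y)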
models=["Logistic","SVM","RandomForest","GradientBoosting","VotingClassifier"]
cross_vals=[6.67873178982795,5.090145364653574,5.2785444787755615,4.516827815637886,4.707530750705729]
plt.figure(figsize=(12,9))
plt.bar(x=models,height=cross_vals)
plt.title("Log Loss of cross validation")
plt.show()
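With the models compared, the sub_file helper defined earlier can turn predicted probabilities on the test set into the submission file. A minimal sketch, assuming the gradient-boosting model with the lowest cross-validation log loss above is the final choice (the output filename is an assumption):

final_model = gb
final_model.fit(X, y)                              # refit on the full training data
test_preds = final_model.predict_proba(raw_test)   # class probabilities for the test rows
sub_file('Submission.xlsx', test_preds)            # hypothetical filename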