# magicloops.py
# Forked from rayidghani/magicloops.
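"""Magic loop: run a set of scikit-learn classifiers over their full
hyperparameter grids on a dataset, printing precision at the top 5% of
scores for each fit and saving a precision-recall curve per
model/parameter combination to results/.
"""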
from __future__ import division  # no-op on Python 3; kept for Python 2 compatibility
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm, metrics
from sklearn.ensemble import (AdaBoostClassifier, ExtraTreesClassifier,
                              GradientBoostingClassifier, RandomForestClassifier)
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import precision_recall_curve

np.random.seed(524)  # fix the global seed for reproducibility
def define_clfs_params():
    """Return the candidate classifiers and their hyperparameter grids."""
    clfs = {
        'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
        'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
        'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                                 algorithm="SAMME", n_estimators=200),
        # liblinear is required for l1-penalized logistic regression in current scikit-learn
        'LR': LogisticRegression(penalty='l1', C=1e5, solver='liblinear'),
        'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
        'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5,
                                         max_depth=6, n_estimators=10),
        'NB': GaussianNB(),
        'DT': DecisionTreeClassifier(),
        'SGD': SGDClassifier(loss="hinge", penalty="l2"),
        'KNN': KNeighborsClassifier(n_neighbors=3),
    }
    grid = {
        'RF': {'n_estimators': [1, 10, 100, 1000, 10000], 'max_depth': [1, 5, 10, 20, 50, 100],
               'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 5, 10]},
        'LR': {'penalty': ['l1', 'l2'], 'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]},
        # 'log' was renamed 'log_loss' in scikit-learn 1.1
        'SGD': {'loss': ['hinge', 'log_loss', 'perceptron'], 'penalty': ['l2', 'l1', 'elasticnet']},
        'ET': {'n_estimators': [1, 10, 100, 1000, 10000], 'criterion': ['gini', 'entropy'],
               'max_depth': [1, 5, 10, 20, 50, 100], 'max_features': ['sqrt', 'log2'],
               'min_samples_split': [2, 5, 10]},
        'AB': {'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1, 10, 100, 1000, 10000]},
        'GB': {'n_estimators': [1, 10, 100, 1000, 10000], 'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.5],
               'subsample': [0.1, 0.5, 1.0], 'max_depth': [1, 3, 5, 10, 20, 50, 100]},
        'NB': {},
        'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1, 5, 10, 20, 50, 100],
               'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 5, 10]},
        'SVM': {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10], 'kernel': ['linear']},
        'KNN': {'n_neighbors': [1, 5, 10, 25, 50, 100], 'weights': ['uniform', 'distance'],
                'algorithm': ['auto', 'ball_tree', 'kd_tree']},
    }
    return clfs, grid
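# Usage sketch: fetch a single model and its grid, e.g.
#   clfs, grid = define_clfs_params()
#   rf, rf_grid = clfs['RF'], grid['RF']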
def clf_loop(models_to_run, clfs, grid, X, y):
    """Fit every requested model over its full parameter grid and evaluate it."""
    for n in range(1, 2):  # single split; widen the range for repeated runs
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
        for index, clf in enumerate([clfs[x] for x in models_to_run]):
            print(models_to_run[index])
            parameter_values = grid[models_to_run[index]]
            count = 0
            for p in ParameterGrid(parameter_values):
                try:
                    count += 1
                    file_short = "{}_{}_{}".format(str(clf)[0:15], str(index), count)
                    clf.set_params(**p)
                    print(clf)
                    # score the held-out set with the probability of the positive class
                    y_pred_probs = clf.fit(X_train, y_train).predict_proba(X_test)[:, 1]
                    print(precision_at_k(y_test, y_pred_probs, .05))
                    plot_precision_recall_n(y_test, y_pred_probs, clf, file_short)
                except IndexError as e:  # skip bad parameter/score combinations
                    print("Error: {}".format(e))
                    continue
def plot_precision_recall_n(y_true, y_prob, model_name, filename_short):
    """Plot precision and recall against the fraction of the population flagged."""
    y_score = y_prob
    precision_curve, recall_curve, pr_thresholds = precision_recall_curve(y_true, y_score)
    # drop the final threshold-less endpoint so the curves align with pr_thresholds
    precision_curve = precision_curve[:-1]
    recall_curve = recall_curve[:-1]
    # for each threshold, compute the share of the population scored at or above it
    pct_above_per_thresh = []
    number_scored = len(y_score)
    for value in pr_thresholds:
        num_above_thresh = len(y_score[y_score >= value])
        pct_above_thresh = num_above_thresh / float(number_scored)
        pct_above_per_thresh.append(pct_above_thresh)
    pct_above_per_thresh = np.array(pct_above_per_thresh)
    plt.clf()
    # set figure defaults before creating the figure so they take effect
    plt.rcParams.update({'font.size': 9, 'figure.dpi': 300, 'figure.figsize': (16, 12)})
    fig, ax1 = plt.subplots()
    ax1.plot(pct_above_per_thresh, precision_curve, 'b')
    ax1.set_xlabel('percent of population')
    ax1.set_ylabel('precision', color='b')
    ax2 = ax1.twinx()
    ax2.plot(pct_above_per_thresh, recall_curve, 'r')
    ax2.set_ylabel('recall', color='r')
    plt.title(model_name)
    os.makedirs('results', exist_ok=True)  # ensure the output directory exists
    filename = "results/PR_curve_{}.png".format(filename_short)
    plt.savefig(filename)
    plt.close('all')
def precision_at_k(y_true, y_scores, k):
    """Precision when everything scored in (roughly) the top k fraction is labeled positive."""
    # threshold at the score ranked int(k * n) from the top; ">=" includes the boundary score
    threshold = np.sort(y_scores)[::-1][int(k * len(y_scores))]
    y_pred = np.asarray([1 if i >= threshold else 0 for i in y_scores])
    return metrics.precision_score(y_true, y_pred)
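# Worked example (hypothetical inputs): with four rows and k=0.25 the threshold
# is the second-highest score, so the top two rows are labeled positive:
#   precision_at_k(np.array([1, 1, 0, 0]), np.array([0.9, 0.8, 0.3, 0.2]), 0.25)
#   -> 1.0  (both flagged rows are true positives)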
def main(dataset, outcome, features, all_features=False):
    # set outcome and features
    df = pd.read_csv(dataset, index_col=0)
    if all_features:
        # all columns except the outcome (df.ix was removed from pandas; use df.loc)
        features = list(df.loc[:, df.columns != outcome].columns)
    X = df[features]
    y = df[outcome]
    # print a run header
    print("-" * 70)
    print("Running scikit-learn magic loop on dataset {} ...".format(dataset))
    print("Dependent Variable: {}".format(outcome))
    print("Features: {}".format(features))
    print("-" * 70)
    # main loop
    clfs, grid = define_clfs_params()
    models_to_run = ['KNN', 'RF', 'LR', 'ET', 'AB', 'GB', 'NB', 'DT']
    clf_loop(models_to_run, clfs, grid, X, y)
if __name__ == '__main__':
    # main() requires arguments; these are placeholder values.
    # Substitute your own CSV path, outcome column, and feature list.
    main('dataset.csv', 'outcome', ['feature_1', 'feature_2'])