# advcl_rulebased.py
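"""Rule-based classification of track metadata with RIPPER (wittgenstein).

Loads the discretised tracks dataset, binarises the (album, type) target into
Album vs. NotAlbum, and tunes a RIPPER rule learner with a grid search.
"""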
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import wittgenstein as lw
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV, train_test_split

import utils

def draw_confusion_matrix(clf, X, y):
    """Plot the raw and the row-normalised confusion matrix for a fitted classifier."""
    titles_options = [
        ("Confusion matrix, without normalization", None),
        ("Rules Based Confusion matrix", "true"),
    ]
    for title, normalize in titles_options:
        disp = ConfusionMatrixDisplay.from_estimator(
            clf, X, y, cmap="Reds", normalize=normalize
        )
        disp.ax_.set_title(title)
        plt.show()

def report(results, n_top=3):
    """Print the parameters and CV scores of the top n_top grid-search candidates."""
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results["rank_test_score"] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print(
                "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                    results["mean_test_score"][candidate],
                    results["std_test_score"][candidate],
                )
            )
            print("Parameters: {0}".format(results["params"][candidate]))
            print("")

class_name = ("album", "type")

df = utils.load_tracks(buckets="discrete")
# df = df.head(1000)

column2drop = [
    ("track", "license"),
    ("track", "language_code"),
]
df.drop(column2drop, axis=1, inplace=True)
df.info()
df["album", "type"] = df["album", "type"].replace(
["Single Tracks", "Live Performance", "Radio Program"],
["NotAlbum", "NotAlbum", "NotAlbum"],
)
df["album", "type"] = df["album", "type"].replace(["Album", "NotAlbum"], [True, False])
"""
# feature to reshape
label_encoders = dict()
column2encode = [
("album", "type"),
]
for col in column2encode:
le = LabelEncoder()
df[col] = le.fit_transform(df[col])
label_encoders[col] = le
"""
attributes = [col for col in df.columns if col != class_name]
X = df[attributes].values
y = df[class_name]

# One-hot view of the features, built here only for inspection;
# training uses the raw discretised values already captured in X.
dfX = pd.get_dummies(df[attributes], prefix_sep="=")
dfY = df[class_name]
df = pd.concat([dfX, dfY], axis=1)
df.info()

# stratify=y keeps the Album/NotAlbum class ratio identical in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=100, stratify=y
)
ripper_clf = lw.RIPPER()
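# RIPPER's two main hyperparameters, both tuned in the grid search below:
#   k          - number of ruleset optimisation passes
#   prune_size - fraction of the training split held out for rule pruning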
"""
ripper_clf.fit(X_train, y_train, class_feat=("album", "type"), pos_class=0, k=1, prune_size=0.33)
print(ripper_clf)
# Collect performance metrics
precision = ripper_clf.score(X_test, y_test, precision_score)
recall = ripper_clf.score(X_test, y_test, recall_score)
cond_count = ripper_clf.ruleset_.count_conds()
print(ripper_clf.ruleset_.out_pretty())
print(f'precision: {precision} recall: {recall} conds: {cond_count}')
# Apply the decision tree on the training set
print("Apply the decision tree on the training set: \n")
y_pred = ripper_clf.predict(X_train)
y_train = y_train.apply(lambda x: 1 - x)
print("Accuracy %s" % accuracy_score(y_train, y_pred))
print("F1-score %s" % f1_score(y_train, y_pred, average=None))
print(classification_report(y_train, y_pred))
confusion_matrix(y_train, y_pred)
# Apply the decision tree on the test set and evaluate the performance
print("Apply the decision tree on the test set and evaluate the performance: \n")
y_pred = ripper_clf.predict(X_test)
y_test = y_test.apply(lambda x: 1 - x)
print("Accuracy %s" % accuracy_score(y_test, y_pred))
print("F1-score %s" % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)
draw_confusion_matrix(ripper_clf, X_test, y_test)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
fpr, tpr, _ = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
print(roc_auc)
roc_auc = roc_auc_score(y_test, y_pred, average=None)
plt.figure(figsize=(8, 5))
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % (roc_auc))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.legend(loc="lower right", fontsize=14, frameon=False)
plt.show()
"""
print("GRID SEARCH:")
# grid search
param_grid = {
"prune_size": [0.33, 0.5, 0.77],
"k": [1, 2],
"class_feat": ["album", "type"],
"pos_class": [0],
}
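# 3 prune sizes x 2 k values = 6 candidate configurations, each scored with
# GridSearchCV's default 5-fold cross-validation.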
grid = GridSearchCV(estimator=ripper_clf, param_grid=param_grid)
grid.fit(X_train, y_train)
clf = grid.best_estimator_
report(grid.cv_results_, n_top=3)
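
# Final check of the tuned ruleset on the held-out test set -- a minimal
# sketch, not part of the original run; it assumes the best estimator exposes
# a sklearn-compatible predict(), as the commented-out block above already
# relies on.
from sklearn.metrics import accuracy_score, classification_report

y_pred = clf.predict(X_test)
print("Best parameters: %s" % grid.best_params_)
print("Accuracy %s" % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
draw_confusion_matrix(clf, X_test, y_test)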