# -*- coding: utf-8 -*-
# Author: Antoine DELPLACE
# Last update: 17/01/2020
"""
Use a Support Vector Machine to predict which flows are malware.
Tune different hyperparameters: gamma, degree and the regularization penalty.
Try a kernel approximation with hinge loss and an l2 penalty.

Parameters
----------
data_window.h5 : extracted data from preprocessing1.py
data_window3.h5 : extracted data from preprocessing2.py
data_window_labels.npy : label numpy array from preprocessing1.py

Returns
----------
Print and plot the results of the hyperparameter tuning (precision, recall and f1)
Print train and test balanced accuracy, precision, recall, f1 and support for the kernel approximation
"""
import numpy as np
import pandas as pd
from scipy.sparse import csc_matrix
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import h5py
from sklearn import model_selection, feature_selection, kernel_approximation, ensemble, linear_model, metrics
print("Import data")
X = pd.read_hdf('data_window.h5', key='data')
X.reset_index(drop=True, inplace=True)
X2 = pd.read_hdf('data_window3.h5', key='data')
X2.reset_index(drop=True, inplace=True)
X = X.join(X2)
X.drop('window_id', axis=1, inplace=True)
y = X['Label_<lambda>']
X.drop('Label_<lambda>', axis=1, inplace=True)
labels = np.load("data_window_labels.npy")
print(X.columns.values)
print(labels)
print(np.where(labels == 'flow=From-Botne')[0][0])
y_bin6 = y==np.where(labels == 'flow=From-Botne')[0][0]
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y_bin6, test_size=0.33, random_state=123456)
print("y", np.unique(y, return_counts=True))
print("y_train", np.unique(y_train, return_counts=True))
print("y_test", np.unique(y_test, return_counts=True))

## Train
def apply_svm_cross_validation(X, y, svc_args={'loss':'hinge', 'penalty':'elasticnet', 'max_iter':1000, 'alpha':1e-9, 'tol':1e-3, 'random_state':123456, 'class_weight':None}, kernel_args={'kernel':'rbf', 'gamma':None, 'degree':None, 'n_components':100, 'random_state':123456}):
    # Map the samples into an approximate kernel feature space (Nystroem), then
    # cross-validate a linear SVM trained with SGD (hinge loss) on the mapped data.
    # The kernel approximation must run here for the gamma/degree tuning below to have any effect.
    print("kernel_approx")
    feature_map_nystroem = kernel_approximation.Nystroem(**kernel_args)
    feature_map_nystroem.fit(X)
    X_new = feature_map_nystroem.transform(X)

    print("SVM")
    clf = linear_model.SGDClassifier(**svc_args)
    cv = model_selection.ShuffleSplit(n_splits=10, test_size=0.1, random_state=123456)
    scores = model_selection.cross_validate(clf, X_new, y, cv=cv, scoring=['precision', 'recall', 'f1'], return_train_score=True)
    print(scores)
    return [np.mean(scores['test_precision']), np.mean(scores['test_recall']), np.mean(scores['test_f1'])]

# Hyperparameter tuning: gamma (Nystroem 'rbf' kernel)
gamma_scale = 1/X_train.shape[1]  # default heuristic: 1 / n_features
print("gamma_scale=", gamma_scale)
tab_gamma = np.concatenate((np.linspace(0.001, 0.04, 10), [gamma_scale]))
print(tab_gamma)
tab_score = np.array([apply_svm_cross_validation(X_train, y_train, kernel_args={'kernel':'rbf', 'gamma':gamma, 'degree':None, 'n_components':200, 'random_state':123456}) for gamma in tab_gamma])
print(tab_score)
plt.plot(tab_gamma, tab_score[:, 0])
plt.plot(tab_gamma, tab_score[:, 1])
plt.plot(tab_gamma, tab_score[:, 2])
plt.legend(["test_precision", "test_recall", "test_f1"])
plt.xlabel("Gamma")
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.savefig("cross_validation_svm_gamma.pdf", format="pdf")
plt.show()

# Hyperparameter tuning: polynomial degree (Nystroem 'poly' kernel)
tab_degree = np.linspace(2, 4, 3)
print(tab_degree)
tab_score = np.array([apply_svm_cross_validation(X_train, y_train, kernel_args={'kernel':'poly', 'gamma':None, 'degree':degree, 'n_components':200, 'random_state':123456}) for degree in tab_degree])
print(tab_score)
plt.plot(tab_degree, tab_score[:, 0])
plt.plot(tab_degree, tab_score[:, 1])
plt.plot(tab_degree, tab_score[:, 2])
plt.legend(["test_precision", "test_recall", "test_f1"])
plt.xlabel("Degree")
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.savefig("cross_validation_svm_degree.pdf", format="pdf")
plt.show()

# Hyperparameter tuning: regularization penalty of the SGD classifier
tab_penalty = ['l1', 'l2', 'elasticnet']
print(tab_penalty)
tab_score = np.array([apply_svm_cross_validation(X_train, y_train, {'loss':'hinge', 'penalty':penalty, 'max_iter':1000, 'alpha':1e-9, 'tol':1e-3, 'random_state':123456, 'class_weight':None}) for penalty in tab_penalty])
print(tab_score)
plt.plot(tab_penalty, tab_score[:, 0])
plt.plot(tab_penalty, tab_score[:, 1])
plt.plot(tab_penalty, tab_score[:, 2])
plt.legend(["test_precision", "test_recall", "test_f1"])
plt.xlabel("Regularization Penalty")
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.savefig("cross_validation_svm_penalty.pdf", format="pdf")
plt.show()
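
# Final model: Nystroem kernel approximation (polynomial kernel, degree 2) followed by
# a linear SVM trained with SGD (hinge loss, l2 penalty). The Nystroem map lets a
# linear classifier approximate a kernel SVM while staying tractable on a large number
# of flows; degree 2 and the l2 penalty are the values used below, presumably chosen
# from the tuning runs above.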
feature_map_nystroem = kernel_approximation.Nystroem(kernel='poly', gamma=None, degree=2, n_components=200, random_state=123456)
feature_map_nystroem.fit(X_train)
X_train_new = feature_map_nystroem.transform(X_train)
X_test_new = feature_map_nystroem.transform(X_test)
clf = linear_model.SGDClassifier(loss='hinge', penalty='l2', max_iter=1000, alpha=1e-9, tol=1e-3, random_state=123456, class_weight=None, verbose=1)
clf.fit(X_train_new, y_train)
print("Train")
y_pred_train = clf.predict(X_train_new)
print("accuracy score = ", metrics.balanced_accuracy_score(y_train, y_pred_train))
precision, recall, fbeta_score, support = metrics.precision_recall_fscore_support(y_train, y_pred_train)
print("precision = ", precision[1])
print("recall = ", recall[1])
print("fbeta_score = ", fbeta_score[1])
print("support = ", support[1])
print("Test")
y_pred_test = clf.predict(X_test_new)
print("accuracy score = ", metrics.balanced_accuracy_score(y_test, y_pred_test))
precision, recall, fbeta_score, support = metrics.precision_recall_fscore_support(y_test, y_pred_test)
print("precision = ", precision[1])
print("recall = ", recall[1])
print("fbeta_score = ", fbeta_score[1])
print("support = ", support[1])