-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtraining.py
205 lines (166 loc) · 7.77 KB
/
training.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
import joblib # To save the models
from river import ensemble
from river.tree import HoeffdingTreeClassifier, HoeffdingAdaptiveTreeClassifier
# Load data
print("Loading data...")
df_ids2018 = pd.read_csv('cleaned_ids2018_sampled.csv')

# Map string labels to the numeric class ids used by the rest of the script.
# The test is "not numeric" rather than "dtype == 'object'" so that a text
# column stored with pandas' dedicated string dtype is still mapped.
if not pd.api.types.is_numeric_dtype(df_ids2018['Label']):
    label_dict = {
        'Benign': 1,
        'FTP-BruteForce': 2,
        'SSH-Bruteforce': 3,
        'DDOS attack-HOIC': 4,
        'Bot': 5,
        'DoS attacks-GoldenEye': 6,
        'DoS attacks-Slowloris': 7,
        'DDOS attack-LOIC-UDP': 8,
        'Brute Force -Web': 9,
        'Brute Force -XSS': 10,
        'SQL Injection': 11,
    }
    print("Mapping string labels to numeric labels...")
    mapped_labels = df_ids2018['Label'].map(label_dict)

    # Series.map() yields NaN for every value that is not a key of label_dict.
    # Fail here with the offending names instead of letting NaN labels flow
    # into the stratified split and the resamplers further down.
    unmapped = df_ids2018.loc[mapped_labels.isna(), 'Label'].unique()
    if len(unmapped) > 0:
        raise ValueError(
            "Labels without a numeric mapping in 'Label' column: "
            f"{sorted(str(value) for value in unmapped)}"
        )
    df_ids2018['Label'] = mapped_labels.astype('int64')

# Log original label distribution
print("Original Label distribution:\n", df_ids2018['Label'].value_counts())
# Feature subset (12 columns) that every model in this script is trained on
selected_features = [
    'Tot Fwd Pkts',
    'TotLen Fwd Pkts',
    'Bwd Pkt Len Max',
    'Flow Pkts/s',
    'Fwd IAT Mean',
    'Bwd IAT Tot',
    'Bwd IAT Mean',
    'RST Flag Cnt',
    'URG Flag Cnt',
    'Init Fwd Win Byts',
    'Fwd Seg Size Min',
    'Idle Max',
]
y = df_ids2018['Label']
X = df_ids2018[selected_features]

# 70/30 split, stratified on the label, with a fixed seed
print("Splitting data into train and test sets...")
split_options = {'test_size': 0.3, 'stratify': y, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report how many training rows each class has before any resampling
train_label_counts = Counter(y_train)
print("Label distribution in training set before resampling:\n", train_label_counts)
# Apply undersampling to reduce major attack classes.
# Targets: benign is capped at half of the training rows, every attack class
# (labels 2..11) at one tenth of the remaining half.
train_counts = Counter(y_train)  # counted once instead of once per class

benign_count = int(len(y_train) * 0.5)
attack_count = len(y_train) - benign_count
per_attack_cap = int(attack_count / 10)

under_strategy = {}
# An undersampling target may not exceed the rows that actually exist, so the
# benign target is capped the same way the attack targets are.
if train_counts[1] > 0:
    under_strategy[1] = min(benign_count, train_counts[1])
for attack_label in range(2, 12):
    available = train_counts[attack_label]
    # Classes missing from the training split are left out entirely: the
    # sampler rejects strategy keys that do not occur in the data.
    if available > 0:
        under_strategy[attack_label] = min(per_attack_cap, available)

under_sampler = RandomUnderSampler(sampling_strategy=under_strategy, random_state=42)
X_train_under, y_train_under = under_sampler.fit_resample(X_train, y_train)

# Log label distribution after undersampling
print("Label distribution after undersampling:\n", Counter(y_train_under))
# Apply SMOTE for oversampling minority attack classes.
# The previous strategy used min(target, current_count) for every attack
# class; after undersampling current_count never exceeds the target, so every
# class was asked to stay at its current size and SMOTE generated nothing.
# Using max() raises small classes up to the per-class target while leaving
# classes that are already at (or above) the target untouched.
under_counts = Counter(y_train_under)  # counted once instead of once per class
per_attack_target = int(attack_count / 10)

smote_strategy = {}
for class_label in range(1, 12):
    current = under_counts.get(class_label, 0)
    if current == 0:
        # Absent classes cannot be listed as strategy keys.
        continue
    target = benign_count if class_label == 1 else per_attack_target
    if current < 2:
        # A single sample has no neighbour to interpolate with; keep it as is.
        target = current
    # An oversampling target must never be below the current class size.
    smote_strategy[class_label] = max(target, current)

# SMOTE needs k_neighbors + 1 samples in every class it has to grow. Shrink
# k_neighbors (library default: 5) just enough for the smallest such class.
growing_sizes = [
    under_counts[class_label]
    for class_label, target in smote_strategy.items()
    if target > under_counts[class_label]
]
smote_k_neighbors = min([5] + [size - 1 for size in growing_sizes])

smote = SMOTE(
    sampling_strategy=smote_strategy,
    k_neighbors=smote_k_neighbors,
    random_state=42,
)
X_train_res, y_train_res = smote.fit_resample(X_train_under, y_train_under)
# XGBoost gets its own copies of the targets, shifted down by one so the
# class ids used in this script (which start at 1) start at 0 instead
y_train_res_xgb, y_test_xgb = (
    labels - 1 for labels in (y_train_res, y_test)
)
# Fit the Random Forest on the resampled training data
print("Training Random Forest model...")
rf_params = {
    'n_estimators': 200,
    'max_depth': 20,
    'random_state': 42,
    'n_jobs': -1,
}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train_res, y_train_res)

# Persist the fitted forest to disk
joblib.dump(rf_model, 'rf_model.joblib')
print("Random Forest model saved as 'rf_model.joblib'")

# Score the forest on the untouched test split
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)
print("RF Accuracy:", rf_accuracy)
print("Random Forest classification report:\n", rf_report)
print("Random Forest confusion matrix:\n", rf_confusion)
# Fit XGBoost on the resampled training data, using the 0-based label copies
print("Training XGBoost model...")
xgb_params = {
    'n_estimators': 100,
    'n_jobs': -1,
    'random_state': 42,
    'use_label_encoder': False,
}
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train_res, y_train_res_xgb)

# Persist the fitted booster to disk
joblib.dump(xgb_model, 'xgb_model.joblib')
print("XGBoost model saved as 'xgb_model.joblib'")

# Score against the 0-based test labels so both sides use the same encoding
y_pred_xgb = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test_xgb, y_pred_xgb)
xgb_report = classification_report(y_test_xgb, y_pred_xgb)
xgb_confusion = confusion_matrix(y_test_xgb, y_pred_xgb)
print("XGB Accuracy:", xgb_accuracy)
print("XGBoost classification report:\n", xgb_report)
print("XGBoost confusion matrix:\n", xgb_confusion)
# River (online learning library) consumes one sample at a time as a dict,
# so turn both feature tables into lists of per-row records
X_train_dict, X_test_dict = (
    frame.to_dict(orient='records') for frame in (X_train_res, X_test)
)
# Train and evaluate Online Random Forest (ORF) with streaming data.
# NOTE: the messages and file name say "ORF", but the estimator built here is
# a single HoeffdingAdaptiveTreeClassifier, not an ensemble.
print("Training Online Random Forest (ORF) model incrementally...")
# Fixed seed, in line with random_state=42 on the other models in this script,
# so repeated runs are comparable.
orf_model = HoeffdingAdaptiveTreeClassifier(seed=42)
for xi, yi in zip(X_train_dict, y_train_res):
    orf_model.learn_one(xi, yi)

# Stream data to ORF and predict.
# Each test sample is predicted first and learned afterwards, so by the end of
# this loop the model has been updated with every test sample.
print("Evaluating ORF model with streaming data...")
orf_preds = []
for xi, yi in zip(X_test_dict, y_test):  # Stream test data one by one
    orf_pred = orf_model.predict_one(xi)
    orf_preds.append(orf_pred)
    orf_model.learn_one(xi, yi)  # Update the model incrementally after each prediction

print("ORF Accuracy:", accuracy_score(y_test, orf_preds))
print("ORF Confusion Matrix:\n", confusion_matrix(y_test, orf_preds))

# Save the ORF model (this is the state after the test stream was learned)
joblib.dump(orf_model, 'orf_model.joblib')
print("ORF model saved as 'orf_model.joblib'")
# Hoeffding Tree (HT): learn the resampled training records one at a time
print("Training Hoeffding Tree (HT) model incrementally...")
ht_model = HoeffdingTreeClassifier()
for train_record, train_label in zip(X_train_dict, y_train_res):
    ht_model.learn_one(train_record, train_label)

# Test-then-train pass over the test split: predict a record, store the
# prediction, then let the tree learn that record's true label
print("Evaluating HT model with streaming data...")
ht_preds = []
for test_record, test_label in zip(X_test_dict, y_test):
    ht_pred = ht_model.predict_one(test_record)
    ht_preds.append(ht_pred)
    ht_model.learn_one(test_record, test_label)

ht_accuracy = accuracy_score(y_test, ht_preds)
ht_confusion = confusion_matrix(y_test, ht_preds)
print("HT Accuracy:", ht_accuracy)
print("HT Confusion Matrix:\n", ht_confusion)

# Persist the tree in the state it has after the pass above
joblib.dump(ht_model, 'ht_model.joblib')
print("HT model saved as 'ht_model.joblib'")
# Custom evaluation: pass one record per label to all four models and print predictions
print("Performing custom evaluation for all models...")
unique_labels = y_test.unique()
for label in unique_labels:
    # Keep the sample as a one-row DataFrame so the batch models receive the
    # same named columns they were fitted on, rather than a bare list of values.
    sample_frame = X_test[y_test == label].iloc[[0]]
    # The river models take a single sample as a feature dict.
    sample_record = sample_frame.iloc[0].to_dict()
    print(f"Record for label {label}:\n", sample_record)

    # Prediction using Random Forest
    rf_pred = rf_model.predict(sample_frame)
    print(f"Random Forest prediction for label {label}: {rf_pred}")

    # Prediction using XGBoost. The model was fitted on labels shifted down by
    # one, so shift the output back up to the same scale as `label`.
    xgb_pred = xgb_model.predict(sample_frame) + 1
    print(f"XGBoost prediction for label {label}: {xgb_pred}")

    # Prediction using ORF
    orf_pred = orf_model.predict_one(sample_record)
    print(f"ORF prediction for label {label}: {orf_pred}")

    # Prediction using HT
    ht_pred = ht_model.predict_one(sample_record)
    print(f"HT prediction for label {label}: {ht_pred}")

# End of process
print("Process complete.")