-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathclassifier_kaggle.py
124 lines (106 loc) · 4.76 KB
/
classifier_kaggle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
"""
Classifier - CNN
Classify images from different families with a CNN for the public Kaggle dataset (https://www.kaggle.com/c/malware-classification/overview)
Author: Benoît Michel
Date : June 2021
"""
#from numpy.random import seed
#seed(1)
#from tensorflow import set_random_seed
#set_random_seed(2)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import keras
import seaborn as sns
from keras.models import Sequential, Input, Model
from keras.layers import Dense, Dropout, Flatten, Softmax
from keras.layers import Conv2D, MaxPooling2D
from keras.layers.normalization import BatchNormalization
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn import metrics
import time
# Root folder of the image dataset; flow_from_directory is pointed at it below.
path_root = """\\Documents\\Unif\\MQ4\\LINGI2990_TFE\\Code\\Images2_RGB\\""" # TO ADAPT ACCORDING TO YOUR DATASET LOCATION

# A single very large batch is requested so that one next() call returns the
# images and their labels as two in-memory arrays.
# NOTE(review): 10868 is presumably the number of images in the dataset -
# confirm, and adapt it if the dataset changes.
batches = ImageDataGenerator().flow_from_directory(directory=path_root, target_size=(224,224), batch_size=10868)
#print(batches.class_indices)
imgs, labels = next(batches)
#print(imgs.shape)
#print(labels.shape)

# Class names and the percentage of loaded samples belonging to each class.
classes = batches.class_indices.keys()
# Column-wise vectorised sum instead of the builtin sum(), which iterates
# over every row in Python.
perc = (labels.sum(axis=0)/labels.shape[0])*100
# Optional bar chart of the class distribution:
#plt.xticks(rotation='vertical')
#plt.bar(classes,perc)
#plt.show()

num_classes = 9 # TO ADAPT ACCORDING TO THE REAL NUMBER OF MALWARE FAMILIES
def malware_model(i):
    """Build and compile the CNN used to classify the malware images.

    The network stacks two convolution + max-pooling stages, a dropout,
    a flatten step, two dense ReLU layers (with dropout after the first)
    and a softmax output with ``num_classes`` units. It is compiled with
    the categorical cross-entropy loss, the Adam optimizer and the
    accuracy metric.

    Arguments
    ---------
    i:
        Identifier of the current run; it is only printed.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    print(i)
    layer_stack = [
        Conv2D(30, kernel_size=(3, 3),
               activation='relu',
               input_shape=(224,224,3)),
        MaxPooling2D(pool_size=(2, 2), padding='same'),
        Conv2D(15, (3, 3), activation='relu'),
        MaxPooling2D(pool_size=(2, 2), padding='same'),
        Dropout(0.25),
        Flatten(),
        Dense(1024, activation='relu'),
        Dropout(0.5),
        Dense(512, activation='relu'),
        # One output unit per malware family (module-level num_classes).
        Dense(num_classes, activation='softmax'),
    ]
    network = Sequential()
    for layer in layer_stack:
        network.add(layer)
    network.compile(loss='categorical_crossentropy', optimizer = 'adam', metrics=['accuracy'])
    return network
# Test accuracy of every run of the loop below.
accuracies = []
for i in range(1, 2): # CHANGE LIST FOR TESTS ON PARAMETERS
    # New random 70/30 split for each run; pixel values are divided by 255.
    X_train, X_test, y_train, y_test = train_test_split(imgs/255.,labels, test_size=0.3)
    print(X_train.shape)
    print(X_test.shape)
    print(y_train.shape)
    print(y_test.shape)
    Malware_model = malware_model(i)
    Malware_model.summary()
    # Stop training once the validation loss has not improved for 2 epochs.
    # NOTE(review): the test split is also used as validation data here, so
    # the accuracy reported below is measured on data that steered early
    # stopping - consider a separate validation split.
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)
    Malware_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, callbacks=[es])
    scores = Malware_model.evaluate(X_test, y_test)
    # POSSIBILITY TO DEFINE A THRESHOLD OF CERTAINTY OF THE CLASSIFICATION AND MAKE MORE COMPLEX ANALYSIS IF NOT ACHIEVED
    probas = Malware_model.predict(X_test)
    print(probas)
    print('Final CNN accuracy: ', scores[1])
    accuracies.append(scores[1])
# NOTE(review): the indentation of the original was lost; everything from here
# on is assumed to run once after the loop, using the model and split of the
# last run. With the single-iteration range above both readings are equivalent.
print("Accuracies", accuracies)
# Confusion matrix
# Sequential.predict_classes() no longer exists in recent Keras releases; the
# predicted class is the index of the highest softmax probability, so the
# probabilities computed above are reused instead of predicting a second time.
y_pred = np.argmax(probas, axis=1)
y_test2 = np.argmax(y_test, axis=1)
# Passing every class index explicitly keeps the matrix square with one
# row/column per class even when a class is absent from this test split.
c_matrix = metrics.confusion_matrix(y_test2, y_pred, labels=np.arange(y_test.shape[1]))
def confusion_matrix(confusion_matrix, class_names, figsize = (10,7), fontsize=14):
    """Shows a confusion matrix, as returned by sklearn.metrics.confusion_matrix, as a heatmap.

    Arguments
    ---------
    confusion_matrix: numpy.ndarray
        The numpy.ndarray object returned from a call to sklearn.metrics.confusion_matrix.
        Similarly constructed ndarrays can also be used.
    class_names: iterable
        The class names, in the order they index the given confusion matrix.
        Any iterable is accepted (list, dict keys view, ...).
    figsize: tuple
        A 2-long tuple, the first value determining the horizontal size of the outputted figure,
        the second determining the vertical size. Defaults to (10,7).
    fontsize: int
        Font size for axes labels. Defaults to 14.

    Raises
    ------
    ValueError
        If the heatmap cannot be drawn with integer annotations; the
        underlying error is chained as the cause.
    """
    # Materialise the names once: they are used for both the index and the
    # columns, which would exhaust a one-shot iterator.
    class_names = list(class_names)
    df_cm = pd.DataFrame(
        confusion_matrix, index=class_names, columns=class_names,
    )
    # Open a new figure of the requested size; the heatmap is drawn on it.
    plt.figure(figsize=figsize)
    try:
        heatmap = sns.heatmap(df_cm, annot=True, fmt="d")
    except ValueError as err:
        raise ValueError("Confusion matrix values must be integers.") from err
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
# The keys of the generator's class_indices are used as the axis labels of
# the confusion-matrix heatmap.
class_names = batches.class_indices.keys()
heatmap_options = dict(figsize=(20, 7), fontsize=14)
confusion_matrix(c_matrix, class_names, **heatmap_options)