# classifier.py

"""
	Classifier - CNN
	Classify images from different families with a CNN for the private Cisco dataset

	Author: Benoît Michel
	Date : June 2021
"""

#from numpy.random import seed
#seed(1)
#from tensorflow import set_random_seed
#set_random_seed(2)

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import keras
import seaborn as sns
from keras.models import Sequential, Input, Model
from keras.layers import Dense, Dropout, Flatten, Softmax
from keras.layers import Conv2D, MaxPooling2D
from keras.layers.normalization import BatchNormalization
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn import metrics
from PIL import Image
import time

# Lift Pillow's pixel-count limit to avoid warnings with big images.
Image.MAX_IMAGE_PIXELS = None

# Root folder of the image dataset.
# TO ADAPT ACCORDING TO YOUR DATASET LOCATION
path_root = """\\Documents\\Unif\\MQ4\\LINGI2990_TFE\\Code\\Images_TrueRGB\\"""

# Read the images (resized to 224x224) together with their labels.
# NOTE(review): batch_size is hard-coded to 13059, presumably the number of
# images in the original dataset, so that a single next() call returns
# everything. Adapt it if your dataset has a different size.
batches = ImageDataGenerator().flow_from_directory(directory=path_root, target_size=(224,224), batch_size=13059)
#print(batches.class_indices)

imgs, labels = next(batches)
#print(imgs.shape)
#print(labels.shape)

# Share (in percent) of the loaded samples per class; assumes one-hot labels,
# which is the default output of flow_from_directory. Only used by the
# optional bar plot below.
classes = batches.class_indices.keys()
perc = (sum(labels)/labels.shape[0])*100
#plt.xticks(rotation='vertical')
#plt.bar(classes,perc)
#plt.show()

# Derive the number of malware families from the classes the generator
# actually found instead of a hand-maintained constant, so the size of the
# model's output layer always matches the width of the label vectors.
num_classes = len(batches.class_indices)
def malware_model(i):
    """Build and compile the CNN used to classify the malware images.

    The network stacks two convolution / max-pooling stages, a dropout,
    and two dense layers before a softmax output with ``num_classes``
    units (``num_classes`` is read from module scope). It is compiled
    with categorical cross-entropy, the Adam optimizer and the accuracy
    metric.

    Arguments
    ---------
    i:
        Value of the parameter currently being tested. It is only printed;
        the architecture does not depend on it.

    Returns the compiled ``Sequential`` model.
    """
    print(i)
    layers = [
        Conv2D(30, kernel_size=(3, 3), activation='relu', input_shape=(224,224,3)),
        MaxPooling2D(pool_size=(2, 2), padding='same'),
        Conv2D(15, (3, 3), activation='relu'),
        MaxPooling2D(pool_size=(2, 2), padding='same'),
        Dropout(0.25),
        Flatten(),
        Dense(1024, activation='relu'),
        Dropout(0.5),
        Dense(512, activation='relu'),
        # Dropout(j),  # optional extra dropout, currently disabled
        Dense(num_classes, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

# One accuracy per tested parameter value.
accuracies = []
for i in [1]: # CHANGE LIST FOR TESTS ON PARAMETERS
    # Scale pixel values to [0, 1] and keep 30% of the samples for testing.
    X_train, X_test, y_train, y_test = train_test_split(imgs/255.,labels, test_size=0.3)
    for split in (X_train, X_test, y_train, y_test):
        print(split.shape)

    Malware_model = malware_model(i)
    Malware_model.summary()

    # TRY TO PUT WEIGHTS ON THE TRAINING ACCORDING TO THE NUMBER OF MALWARES IN THE FAMILY
    #y_train_new = np.argmax(y_train, axis=1)
    #print(y_train_new)
    #class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train_new), y_train_new) # hidden ?
    #print("Test : ", class_weights)

    # Stop training once the validation loss has not improved for 2 epochs.
    # NOTE(review): the test split is also used as validation data here, so
    # the accuracy reported below is not measured on fully held-out data.
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)
    Malware_model.fit(
        X_train,
        y_train,
        validation_data=(X_test, y_test),
        epochs=20,
        verbose=1,
        callbacks=[es],
    ) #class_weight=class_weights
    scores = Malware_model.evaluate(X_test, y_test)

    # POSSIBILITY TO DEFINE A THRESHOLD OF CERTAINTY OF THE CLASSIFICATION AND MAKE MORE COMPLEX ANALYSIS IF NOT ACHIEVED
    #probas = Malware_model.predict(X_test)
    #print(probas)

    # scores is [loss, accuracy] because the model is compiled with a single metric.
    final_accuracy = scores[1]
    print('Final CNN accuracy: ', final_accuracy)
    accuracies.append(final_accuracy)

print("Accuracies", accuracies)

# Confusion matrix (computed for the model and split of the last loop iteration)
# Sequential.predict_classes() is no longer available in recent
# TensorFlow/Keras releases; for a softmax output, the argmax over the
# predicted probabilities gives the same class indices.
y_pred = np.argmax(Malware_model.predict(X_test, verbose=0), axis=-1)
y_test2 = np.argmax(y_test, axis=1)
# Pass the full label range explicitly so the matrix always has one
# row/column per family, even for families that are absent from the test
# split and never predicted. Without it the matrix can be smaller than the
# list of class names used for plotting.
c_matrix = metrics.confusion_matrix(y_test2, y_pred, labels=np.arange(y_test.shape[1]))

def confusion_matrix(confusion_matrix, class_names, figsize = (10,7), fontsize=14):
    """Plot a confusion matrix, as returned by sklearn.metrics.confusion_matrix, as a heatmap.

    Arguments
    ---------
    confusion_matrix: numpy.ndarray
        The numpy.ndarray object returned from a call to sklearn.metrics.confusion_matrix.
        Similarly constructed ndarrays can also be used.
    class_names: iterable
        Class names, in the order they index the given confusion matrix.
        Any iterable is accepted (list, dict keys view, ...).
    figsize: tuple
        A 2-long tuple, the first value determining the horizontal size of the outputted figure,
        the second determining the vertical size. Defaults to (10,7).
    fontsize: int
        Font size for axes labels. Defaults to 14.

    Raises
    ------
    ValueError
        If the heatmap cannot be drawn with the integer annotation format
        (the original error is chained as the cause). pandas also raises
        ValueError when the matrix shape does not match the number of
        class names.
    """
    # Materialise the names once: they label both axes, so a one-shot
    # iterable would otherwise be exhausted after the first use.
    labels = list(class_names)
    df_cm = pd.DataFrame(confusion_matrix, index=labels, columns=labels)
    plt.figure(figsize=figsize)
    try:
        # fmt="d" prints the cell annotations as integers.
        heatmap = sns.heatmap(df_cm, annot=True, fmt="d")
    except ValueError as err:
        # Keep the original exception as the cause to ease debugging.
        raise ValueError("Confusion matrix values must be integers.") from err
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

# Draw the confusion matrix, labelling both axes with the class names
# known to the image generator.
class_names = batches.class_indices.keys()
confusion_matrix(
    confusion_matrix=c_matrix,
    class_names=class_names,
    figsize=(20, 7),
    fontsize=14,
)