Malware analysis #11

Open · wants to merge 2 commits into base: master
107 changes: 107 additions & 0 deletions machine_learning/Malware analysis.py
@@ -0,0 +1,107 @@
#!/usr/bin/env python3
# pandas provides the DataFrame, a two-dimensional table whose columns can
# hold different data types.
import pandas as pd
# NumPy provides efficient data structures for computing on
# multi-dimensional arrays and matrices.
import numpy as np
import os
import pickle
import joblib
import sklearn.ensemble as ske
from sklearn import model_selection, tree
from sklearn.feature_selection import SelectFromModel
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

data = pd.read_csv('data.csv', sep='|')  # load the dataset into a DataFrame
# Drop the identifier columns and the label (axis=1 drops columns, not rows);
# the remaining values form the feature matrix.
X = data.drop(['Name', 'md5', 'legitimate'], axis=1).values
y = data['legitimate'].values  # label vector from the 'legitimate' column

print('Researching important features based on %i total features\n' %
      X.shape[1])  # X.shape is (rows, columns), so shape[1] is the feature count

# Feature selection using a tree-based classifier
fsel = ske.ExtraTreesClassifier().fit(X, y)
model = SelectFromModel(fsel, prefit=True)
X_new = model.transform(X)
nb_features = X_new.shape[1]  # e.g. 13 selected features when X_new.shape is (138047, 13)
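
# With no explicit threshold, SelectFromModel keeps the features whose
# importance is at least the mean importance. The check below reproduces that
# default "mean" threshold by hand (purely illustrative, not used later):
assert (fsel.feature_importances_ >= fsel.feature_importances_.mean()).sum() == nb_features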

# Split into training and testing data, holding out 20% for testing
# (with 138047 samples in total, the test set holds about 27609 of them).
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X_new, y, test_size=0.2)
features = []

print('%i features identified as important:' % nb_features)


# Indices of the important features, sorted by importance, highest first
# ([::-1] steps backwards through argsort's ascending order, so the largest
# importance comes first; a tiny worked example of the idiom follows below).
indices = np.argsort(fsel.feature_importances_)[::-1][:nb_features]
for f in range(nb_features):
    print("%d. feature %s (%f)" % (
        f + 1, data.columns[2 + indices[f]], fsel.feature_importances_[indices[f]]))

# Append the names of the important features to the 'features' list; the +2
# offset skips the 'Name' and 'md5' columns that were dropped from X.
for f in sorted(np.argsort(fsel.feature_importances_)[::-1][:nb_features]):
    features.append(data.columns[2 + f])

print(features)
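
# Worked example of the argsort/[::-1] idiom used above (illustrative only):
# np.argsort([0.1, 0.5, 0.2]) returns [0, 2, 1], the indices in ascending
# order of value; [::-1] reverses that to [1, 2, 0], so the index of the
# largest importance comes first.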

# Algorithm comparison
algorithms = {
    # A random forest is an ensemble of randomly built decision trees; each
    # tree is a single classifier, and the target prediction is taken by
    # majority vote over the trees (see the note after this dict).
    # n_estimators is the number of trees in the forest.
    "RandomForest": ske.RandomForestClassifier(n_estimators=50),

    # Boosting algorithms turn a set of weak learners into a single strong
    # learner: they initialize a strong learner (usually a decision tree) and
    # iteratively create weak learners that are added to it. Gradient boosting
    # and AdaBoost ("Adaptive Boosting") differ in how they create the weak
    # learners during that iterative process.
    "GradientBoosting": ske.GradientBoostingClassifier(n_estimators=50),
    "AdaBoost": ske.AdaBoostClassifier(n_estimators=100),

    # Naive Bayes rests on Bayes' theorem of conditional probability,
    # P(A|B) = P(B|A) * P(A) / P(B), which gives the probability of one event
    # given that another event has occurred.
    "GNB": GaussianNB(),

    # max_depth caps the maximum depth of the decision tree.
    "DecisionTree": tree.DecisionTreeClassifier(max_depth=10)
}
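
# Majority voting, illustrated: if three trees in a forest predict [1, 0, 1]
# for a sample, the forest predicts 1 (two votes against one).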
results = {}
print("\nNow testing algorithms")
for algo in algorithms:
    clf = algorithms[algo]
    clf.fit(X_train, y_train)  # fit == train the classifier on the training data
    score = clf.score(X_test, y_test)  # mean accuracy on the held-out test set
    print("%s : %f %%" % (algo, score * 100))
    results[algo] = score

# Plot the last classifier fitted in the loop; with the dict order above that
# is the DecisionTree, the only estimator here that tree.plot_tree can draw.
tree.plot_tree(clf)

winner = max(results, key=results.get)  # algorithm with the highest accuracy
print('\nWinner algorithm is %s with a %f %% success' %
      (winner, results[winner] * 100))


# Save the winning algorithm and the feature list for later predictions
print('Saving algorithm and feature list in classifier directory...')
os.makedirs('classifier', exist_ok=True)  # make sure the target directory exists
# joblib persists an arbitrary Python object into a single file. It works
# especially well with the NumPy arrays used by sklearn, so depending on the
# classifier type you may see size and speed benefits over pickle. Plain
# pickle also works correctly: a trained classifier saved and reloaded with
# either library will produce the same results.
joblib.dump(algorithms[winner], 'classifier/classifier.pkl')
with open('classifier/features.pkl', 'wb') as fh:
    fh.write(pickle.dumps(features))
print('Saved')
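
# A minimal sketch of loading the saved artifacts back for later predictions
# (assumes the same classifier/ directory; kept commented out so this
# training script's behaviour is unchanged):
# clf_loaded = joblib.load('classifier/classifier.pkl')
# with open('classifier/features.pkl', 'rb') as fh:
#     features_loaded = pickle.loads(fh.read())
# clf_loaded.predict(X_test[:1])  # label for one held-out sample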

# Identify false and true positive rates
clf = algorithms[winner]
res = clf.predict(X_test)
mt = confusion_matrix(y_test, res)
print(mt)
# A confusion matrix, also known as an error matrix, is a table layout that
# makes the performance of an algorithm easy to visualize, typically in
# supervised learning.
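# For the binary labels [0, 1], sklearn lays the matrix out with actual
# classes as rows and predicted classes as columns:
#     [[TN, FP],
#      [FN, TP]]
# so mt[0][1] counts class-0 samples predicted as class 1, and mt[1][0]
# counts class-1 samples predicted as class 0.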
print("False positive rate : %f %%" % ((mt[0][1] / float(sum(mt[0])))*100))
print('False negative rate : %f %%' % ((mt[1][0] / float(sum(mt[1]))*100)))
Binary file added machine_learning/data.zip
Binary file not shown.