-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnaive_bayes
88 lines (79 loc) · 3.07 KB
/
naive_bayes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#Import libraries
import pandas as pd
import numpy as np
import random
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings("ignore")
def getfscore(cm):
    """Return the F1 score for the positive class (label 1) from a 2x2
    sklearn confusion matrix.

    sklearn's ``confusion_matrix`` puts true labels on rows and predicted
    labels on columns, classes sorted ascending, so:
        cm[0][0] = TN, cm[0][1] = FP, cm[1][0] = FN, cm[1][1] = TP.
    The original indexing read cm[0][0] as TP, which scored the majority
    (non-fraud) class instead of the fraud class this script cares about.

    Returns 0.0 when the F1 score is undefined (no true positives), which
    also avoids a ZeroDivisionError on degenerate matrices.
    """
    TN = cm[0][0]
    FP = cm[0][1]
    FN = cm[1][0]
    TP = cm[1][1]
    if TP == 0:
        # precision and/or recall are 0 or undefined; F1 is conventionally 0
        return 0.0
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    return (2 * precision * recall) / (precision + recall)
# Load the credit-card fraud dataset (expects creditcard.csv in the CWD).
# The last column "Class" is the label: 0 = legitimate, 1 = fraud.
dataset = pd.read_csv("creditcard.csv")
# Feature column names: drops the first column (presumably Time — verify
# against the CSV) and the last column (the Class label).
colName = [i for i in dataset.columns]
colName = colName[1:-1]
dataset = dataset.sample(frac = 1) #shuffle all rows
x = dataset.iloc[:,1:-1].values.tolist() #features (all but first/last column)
y = dataset.iloc[:,-1].values.tolist() #labels
# Class balance: count legitimate (0) vs fraudulent (1) rows.
negData = y.count(0)
posData = y.count(1)
negPer = (negData/len(y))*100  # NOTE(review): computed but never used below
posPer = (posData/len(y))*100
print("Fraud data percentage: {0}".format(posPer))
print("Dataset is highly unbalanced. We will have to balance the dataset")
# Per-class shuffled subsets, used below to build balanced samples.
x_neg = dataset.loc[dataset['Class']==0].sample(frac=1)
x_pos = dataset.loc[dataset['Class']==1].sample(frac=1)
# Generate 5 balanced datasets: each combines every fraud row (x_pos) with
# a random contiguous run of `posData` non-fraud rows taken from x_neg.
xnegArr = []
for i in range(5):
    # Pick a start offset so the slice [k:k+posData] stays fully in bounds.
    # BUG FIX: the original used `random.randrange(negData) - posData`,
    # which can be negative; a negative k slices from the END of the frame
    # and can yield an empty or truncated negative-class sample, silently
    # unbalancing the "balanced" dataset.
    k = random.randrange(negData - posData)
    xnegArr.append(x_neg[k:k + posData])
# Concatenate fraud + sampled non-fraud rows and shuffle each dataset.
new_dsArr = []
for i in range(5):
    new_dsArr.append(pd.concat([x_pos, xnegArr[i]]).sample(frac=1))
print("Method 1: training using Naive Bayes....")
#training for one dataset: 70/30 split of the first balanced sample
x_set = new_dsArr[0].iloc[:,1:-1].values.tolist()
y_set = new_dsArr[0].iloc[:,-1].values.tolist()
x_train, x_test, y_train, y_test = train_test_split(x_set, y_set, test_size=0.30, random_state=1)
cls_nb = GaussianNB()
cls_nb.fit(x_train, y_train)
y_pred = cls_nb.predict(x_test)
#Confusion matrix on the held-out 30% of balanced dataset 0
cm = confusion_matrix(y_test, y_pred)
print("For dataset {0}, f-measure score is: {1}".format(1, getfscore(cm)))
print("using classifier on remaining test dataset")
# Evaluate the classifier (fitted on dataset 0 only) on the other four
# balanced datasets. NOTE(review): every balanced set contains ALL of
# x_pos, so these sets overlap dataset 0 in the fraud rows — the scores
# below are optimistic, not independent test results.
for i in range(1,5):
    x_set = (new_dsArr[i].iloc[:,1:-1].values.tolist())
    y_set = new_dsArr[i].iloc[:,-1].values.tolist()
    y_pred = cls_nb.predict(x_set)
    cm = confusion_matrix(y_set, y_pred)
    print("For dataset {0}, f-measure score is: {1}".format(i+1,getfscore(cm)))
# Larger evaluation splits: first 350 fraud rows + first 190000 non-fraud
# rows vs the remainder. NOTE(review): the hard-coded 350/190000 assumes
# the dataset has at least that many rows per class — verify against the
# actual creditcard.csv. The classifier is NOT retrained here; both splits
# are only scored with the model fitted on balanced dataset 0 above.
ds = pd.concat([x_pos[0:350], x_neg[0:190000]]).sample(frac = 1) #Complete training dataset
ds_test = pd.concat([x_pos[350:], x_neg[190000:]]).sample(frac = 1) #Complete test dataset
x_train = (ds.iloc[:,1:-1].values.tolist())
y_train = ds.iloc[:,-1].values.tolist()
x_test = (ds_test.iloc[:,1:-1].values.tolist())
y_test= ds_test.iloc[:,-1].values.tolist()
#x_train, x_test, y_train, y_test = train_test_split(x_set, y_set, test_size=0.30, random_state=1)
y_pred = cls_nb.predict(x_train)
cm = confusion_matrix(y_train, y_pred)
print("For train dataset, f-measure score is: {0}".format(getfscore(cm)))
y_pred = cls_nb.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print("For test dataset, f-measure score is: {0}".format(getfscore(cm)))
print("")
print("testing on complete dataset")
# Score on every row of the original dataset, including rows the model
# was trained on — an in-sample measure, not a generalization estimate.
y_pred = cls_nb.predict(x)
cm = confusion_matrix(y, y_pred)
print("For original dataset, f-measure score is: {0}".format(getfscore(cm)))