-
Notifications
You must be signed in to change notification settings - Fork 0
/
GaussianNaiveBayes.py
97 lines (73 loc) · 4.23 KB
/
GaussianNaiveBayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import pandas as pd
import numpy as np
import math
from math import sqrt, exp, pi, pow
class GaussianNaiveBayes:
class ColumnGaussianAttributes:
def __init__(self, col_name, mu, sigma, max_val, min_val):
self.col_name = col_name
self.mu = mu
self.sigma = sigma
self.max_val = max_val
self.min_val = min_val
def calc_maximum_likelihood(self, x, mu, std):
# use the pdf to calculate a probability for each value
# given the feature exhibits a Gaussian distribution
# x = feature mean
# mu = population mean
# what is the probability that a feature = x, where does this lie in the dataset
# distribution? Probabilities closer to the mean, will have more of an impact
# we are using the priors to create a gaussian distribution in order
# to calculate the present, this returns the likelihood
return 1 / (std * sqrt(2 * pi)) * exp(-1 / 2 * pow(((x - mu) / std), 2))
def __init__(self, x_train, y_train):
# finds the gaussian distribution of each column
# find the mean/std of each feature
self.classifiers = {}
self.prior_prob_list = {}
# get the unique target values in y
self.target_vals = pd.unique(y_train)
total_train_size = len(x_train)
# for each target value find the feature set std, mu and prior associated with that target value
for val in self.target_vals:
self.classifiers[val] = []
# get all rows associated with the current target value
clf_rows = x_train[y_train.to_numpy() == val]
# store the prior for each target value
self.prior_prob_list[val] = len(clf_rows) / total_train_size
# get the std and mu of each column in the current list of rows
for col in clf_rows:
row_col = clf_rows[col]
self.classifiers[val].append(self.ColumnGaussianAttributes(col,
row_col.mean(),
row_col.std(),
row_col.max(),
row_col.min()))
def predict(self, x):
prediction_list = []
# Create a prediction for each row associated with each classification
# the high association based on the PDF and Bayes' Theorem wins
# Grab the current row to classify
for index, row in x.iterrows():
current_guess = None
predicted_classifier = None
# Get each classifier name from the classifier dict and calculate the probability of this row matching each
# target classifier using the earlier sigma and mean calculations
for clf_name in self.classifiers.keys():
# Initial guess for classification of this row if it were to be this type of classification
clf_probability = math.log(self.prior_prob_list[clf_name])
# Grab column info list in the classifier dict and iterate through each column
for gaussian_col_attr in self.classifiers[clf_name]:
# Get the column value from the current row and calculate the probability it belongs to
# this classifier based on the pdf and the prior
x = row[gaussian_col_attr.col_name]
# Calculate the rest of the probabilities based on X columns values
clf_probability += math.log(self.calc_maximum_likelihood(x,
gaussian_col_attr.mu,
gaussian_col_attr.sigma))
# Check the current prediction and update it with the current calculations
if current_guess is None or clf_probability > current_guess:
current_guess = clf_probability
predicted_classifier = clf_name
prediction_list.append(predicted_classifier)
return np.array(prediction_list)