Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
iretes committed Dec 30, 2023
2 parents 91f0f4e + 37601d2 commit fb750e3
Show file tree
Hide file tree
Showing 3 changed files with 677 additions and 49 deletions.
274 changes: 225 additions & 49 deletions TASK_3/naive_bayes.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,35 +33,24 @@
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import OneHotEncoder
from classification_utils import compute_clf_scores, plot_confusion_matrix

# %% [markdown]
# We load data

# %%
# load the data
incidents_train_df = pd.read_csv('../data/clf_indicators_train.csv', index_col=0)
incidents_test_df = pd.read_csv('../data/clf_indicators_test.csv', index_col=0)
true_labels_train_df = pd.read_csv('../data/clf_y_train.csv', index_col=0)
true_labels_train = true_labels_train_df.values.ravel()
true_labels_test_df = pd.read_csv('../data/clf_y_test.csv', index_col=0)
true_labels_test = true_labels_test_df.values.ravel()

# load the names of the features to use for the classification task
features_for_clf = [
'location_imp', 'latitude', 'longitude', 'state_code', 'congressional_district',
'age_range', 'avg_age', 'n_child_prop', 'n_teen_prop', 'n_males_prop', 'n_participants',
'day', 'day_of_week', 'month', 'poverty_perc', 'democrat',
'aggression', 'accidental', 'defensive', 'suicide', 'road', 'house', 'school', 'business',
'illegal_holding', 'drug_alcohol', 'officers', 'organized', 'social_reasons', 'abduction'
]
incidents_train_df = pd.read_csv('../data/clf_indicators_train.csv')
incidents_test_df = pd.read_csv('../data/clf_indicators_test.csv')

# project on the features_to_use
indicators_train_df = incidents_train_df[features_for_clf]
indicators_test_df = incidents_test_df[features_for_clf]
# %%
incidents_train_df.head(2)

# %%
indicators_train_df.head(2)
true_labels_train = np.where(incidents_train_df['n_killed'] > 0, True, False)
true_labels_test = np.where(incidents_test_df['n_killed'] > 0, True, False)

# %%
print(f'Number of label True in train set: {np.sum(true_labels_train)}, ({np.sum(true_labels_train)/len(true_labels_train)*100}%)')
Expand All @@ -75,6 +64,8 @@
# %% [markdown]
# GaussianNB implements the Gaussian Naive Bayes algorithm for classification. The likelihood of the features is assumed to be Gaussian.
#
# Bernulli (requires samples to be represented as binary-valued feature vectors)
#
# Multi-variate Bernoulli does not capture the number of
# times each word occurs, and that it explicitly includes
# the non-occurrence probability of words that do not appear in the document.
Expand All @@ -87,6 +78,49 @@
# %% [markdown]
# ### Gaussian Naive Bayes Classifier

# %% [markdown]
# Choose features for classification:

# %%
features_for_clf = [
'latitude', 'longitude', 'state_code', 'congressional_district',
'poverty_perc', 'location_imp', 'year', 'month',
'age_range', 'max_age', 'avg_age',
'n_participants', 'n_child_prop', 'n_teen_prop', 'n_males_prop',
# tag
#'aggression', 'accidental', 'defensive', 'suicide', 'road', 'house', 'school', 'business',
#'illegal_holding', 'drug_alcohol', 'officers', 'organized', 'social_reasons', 'abduction'
]

indicators_train_df = incidents_train_df[features_for_clf]
indicators_test_df = incidents_test_df[features_for_clf]

# %% [markdown]
# Prepare Train set and Test set:

# %%
# train set
indicators_train_df.dropna(inplace=True)
true_labels_train = np.where(incidents_train_df['n_killed'][indicators_train_df.index] > 0, True, False)

#test set
indicators_test_df.dropna(inplace=True)
true_labels_test = np.where(incidents_test_df['n_killed'][indicators_test_df.index] > 0, True, False)

# %% [markdown]
# Oversampling of the minority class:

# %%
ros = RandomOverSampler(random_state=42)
indicators_train_df, true_labels_train = ros.fit_resample(indicators_train_df, true_labels_train)

# %%
print(f'Number of label True in train set: {np.sum(true_labels_train)}, ({np.sum(true_labels_train)/len(true_labels_train)*100}%)')
print(f'Number of label False in train set: {len(true_labels_train)-np.sum(true_labels_train)}, ({(len(true_labels_train)-np.sum(true_labels_train))/len(true_labels_train)*100}%)')

# %% [markdown]
# Classification:

# %%
gnb = GaussianNB()
gnb.fit(indicators_train_df, true_labels_train)
Expand Down Expand Up @@ -114,48 +148,72 @@
)

# %% [markdown]
# ### Bernulli Naive Bayes Classifier
# ### Multinomial Naive Bayes Classifier

# %% [markdown]
# Choose features for classification:

# %%
bnb = BernoulliNB()
bnb.fit(indicators_train_df, true_labels_train)
y_pred = bnb.predict(indicators_test_df)
features_for_clf = [
'state_code', 'congressional_district', 'month',
'age_range', 'max_age',
'n_participants', 'n_adult',
#'n_injured', 'n_arrested', 'n_unharmed',
# tag
'aggression', 'accidental', 'defensive', 'suicide', 'road', 'house', 'school', 'business',
'illegal_holding', 'drug_alcohol', 'officers', 'organized', 'social_reasons', 'abduction'
]

print("Number of mislabeled points out of a total %d points : %d" % (indicators_test_df.shape[0], (true_labels_test != y_pred).sum()))
indicators_train_df = incidents_train_df[features_for_clf]
indicators_test_df = incidents_test_df[features_for_clf]

# %% [markdown]
# Discretize latitude and longitude:

# %%
test_scores = compute_clf_scores(
y_true=true_labels_test,
y_pred=y_pred,
params=None,
train_time=None,
score_time=None,
prob_pred=None,
clf_name='BernoulliNB',
)
test_scores
print(f'Latidude: max value {max(incidents_train_df["latitude"].max(), incidents_test_df["latitude"].max())}, \
min value {min(incidents_train_df["latitude"].min(), incidents_test_df["latitude"].min())}')
print(f'Longitude: max value {max(incidents_train_df["longitude"].max(), incidents_test_df["longitude"].max())}, \
min value {min(incidents_train_df["longitude"].min(), incidents_test_df["longitude"].min())}')

# %%
plot_confusion_matrix(
y_true=true_labels_test,
y_pred=y_pred,
title='BernoulliNB'
)
# discetize lat and long
lat_bins = np.linspace(19, 72, 72-19)
long_bins = np.linspace(-166, -67, 166-67)

# train set
indicators_train_df.loc[:, 'latitude_disc'] = pd.cut(incidents_train_df['latitude'], bins=lat_bins, labels=False)
indicators_train_df.loc[:, 'longitude_disc'] = pd.cut(incidents_train_df['longitude'], bins=long_bins, labels=False)

# test set
indicators_test_df.loc[:, 'latitude_disc'] = pd.cut(incidents_test_df['latitude'], bins=lat_bins, labels=False)
indicators_test_df.loc[:, 'longitude_disc'] = pd.cut(incidents_test_df['longitude'], bins=long_bins, labels=False)

# %% [markdown]
# ### Multinomial Naive Bayes Classifier
# Prepare Train set and Test set:

# %%
# remuving not positive features
features_for_clf = [
'location_imp', 'state_code', 'congressional_district',
'age_range', 'avg_age', 'n_child_prop', 'n_teen_prop', 'n_males_prop', 'n_participants',
'day', 'day_of_week', 'month', 'poverty_perc', 'democrat', 'aggression', 'accidental',
'defensive', 'suicide', 'road', 'house', 'school', 'business',
'illegal_holding', 'drug_alcohol', 'officers', 'organized', 'social_reasons', 'abduction']
# train set
indicators_train_df.dropna(inplace=True)
true_labels_train = np.where(incidents_train_df['n_killed'][indicators_train_df.index] > 0, True, False)

indicators_train_df = incidents_train_df[features_for_clf]
indicators_test_df = incidents_test_df[features_for_clf]
#test set
indicators_test_df.dropna(inplace=True)
true_labels_test = np.where(incidents_test_df['n_killed'][indicators_test_df.index] > 0, True, False)

# %% [markdown]
# Oversampling of the minority class:

# %%
ros = RandomOverSampler(random_state=42)
indicators_train_df, true_labels_train = ros.fit_resample(indicators_train_df, true_labels_train)

# %%
print(f'Number of label True in train set: {np.sum(true_labels_train)}, ({np.sum(true_labels_train)/len(true_labels_train)*100}%)')
print(f'Number of label False in train set: {len(true_labels_train)-np.sum(true_labels_train)}, ({(len(true_labels_train)-np.sum(true_labels_train))/len(true_labels_train)*100}%)')

# %% [markdown]
# Classification:

# %%
mnb = MultinomialNB()
Expand Down Expand Up @@ -212,6 +270,124 @@
title='ComplementNB'
)

# %% [markdown]
# ### Bernulli Naive Bayes Classifier

# %%
features_for_clf = [
# tag: (binary data)
'aggression', 'accidental', 'defensive', 'suicide', 'road', 'house', 'school', 'business',
'illegal_holding', 'drug_alcohol', 'officers', 'organized', 'social_reasons', 'abduction'
]

indicators_train_df = incidents_train_df[features_for_clf]
indicators_test_df = incidents_test_df[features_for_clf]

# %%
def one_hot_encoder(data_train, data_test):
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe.fit(data_train)
return ohe.transform(data_train), ohe.transform(data_test)

# %%
# binarize lat and long: one hot encoding
lat_bins = np.linspace(19, 72, 10)
long_bins = np.linspace(-166, -67, 10)

lat_train, lat_test = one_hot_encoder(
data_train=np.array(pd.cut(incidents_train_df['latitude'], bins=lat_bins, labels=False)).reshape(-1, 1),
data_test=np.array(pd.cut(incidents_test_df['latitude'], bins=lat_bins, labels=False)).reshape(-1, 1))
long_train, long_test = one_hot_encoder(
data_train=np.array(pd.cut(incidents_train_df['longitude'], bins=long_bins, labels=False)).reshape(-1, 1),
data_test=np.array(pd.cut(incidents_test_df['longitude'], bins=long_bins, labels=False)).reshape(-1, 1))

for i in range(9):
# train set
indicators_train_df.loc[:, f'latitude_{i}'] = lat_train[:, i]
indicators_train_df.loc[:, f'longitude_{i}'] = long_train[:, i]
# test set
indicators_test_df.loc[:, f'latitude_{i}'] = lat_test[:, i]
indicators_test_df.loc[:, f'longitude_{i}'] = long_test[:, i]

# %%
# binarize age_range
age_range_train, age_range_test = one_hot_encoder(
data_train=np.array(incidents_train_df['age_range']).reshape(-1, 1),
data_test=np.array(incidents_test_df['age_range']).reshape(-1, 1))
for i in range(age_range_train.shape[1]):
indicators_train_df.loc[:, f'age_range_{i}'] = age_range_train[:, i] # train set
indicators_test_df.loc[:, f'age_range_{i}'] = age_range_test[:, i] # test set

# %%
# binarize n_participants
n_participants_train, n_participants_test = one_hot_encoder(
data_train=np.array(incidents_train_df['n_participants']).reshape(-1, 1),
data_test=np.array(incidents_test_df['n_participants']).reshape(-1, 1))
for i in range(n_participants_train.shape[1]):
indicators_train_df.loc[:, f'n_participants_{i}'] = n_participants_train[:, i] # train set
indicators_test_df.loc[:, f'n_participants_{i}'] = n_participants_test[:, i] # test set

"""# binarize n_adult
n_adult_train, n_adult_test = one_hot_encoder(
data_train=np.array(incidents_train_df['n_adult']).reshape(-1, 1),
data_test=np.array(incidents_test_df['n_adult']).reshape(-1, 1))
for i in range(n_adult_train.shape[1]):
indicators_train_df.loc[:, f'n_adult_{i}'] = n_adult_train[:, i] # train set
indicators_test_df.loc[:, f'n_adult_{i}'] = n_adult_test[:, i] # test set"""


# %% [markdown]
# Prepare train set and Test set:

# %%
# train set
indicators_train_df.dropna(inplace=True)
true_labels_train = np.where(incidents_train_df['n_killed'][indicators_train_df.index] > 0, True, False)

#test set
indicators_test_df.dropna(inplace=True)
true_labels_test = np.where(incidents_test_df['n_killed'][indicators_test_df.index] > 0, True, False)

# %% [markdown]
# Oversampling of the minority class:

# %%
ros = RandomOverSampler(random_state=42)
indicators_train_df, true_labels_train = ros.fit_resample(indicators_train_df, true_labels_train)

# %%
print(f'Number of label True in train set: {np.sum(true_labels_train)}, ({np.sum(true_labels_train)/len(true_labels_train)*100}%)')
print(f'Number of label False in train set: {len(true_labels_train)-np.sum(true_labels_train)}, ({(len(true_labels_train)-np.sum(true_labels_train))/len(true_labels_train)*100}%)')

# %% [markdown]
# Classification:

# %%
bnb = BernoulliNB()
bnb.fit(indicators_train_df, true_labels_train)
y_pred = bnb.predict(indicators_test_df)

print("Number of mislabeled points out of a total %d points : %d" % (indicators_test_df.shape[0], (true_labels_test != y_pred).sum()))

# %%
test_scores = compute_clf_scores(
y_true=true_labels_test,
y_pred=y_pred,
params=None,
train_time=None,
score_time=None,
prob_pred=None,
clf_name='BernoulliNB',
)
test_scores

# %%
plot_confusion_matrix(
y_true=true_labels_test,
y_pred=y_pred,
title='BernoulliNB'
)

# %% [markdown]
# ### Bernulli NB using Normalized Data

Expand Down
Loading

0 comments on commit fb750e3

Please sign in to comment.