diff --git a/TASK_3/naive_bayes.py b/TASK_3/naive_bayes.py
index 36f41da..9a3badf 100644
--- a/TASK_3/naive_bayes.py
+++ b/TASK_3/naive_bayes.py
@@ -33,6 +33,8 @@
 import pandas as pd
 import numpy as np
 from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB
+from imblearn.over_sampling import RandomOverSampler
+from sklearn.preprocessing import OneHotEncoder
 from classification_utils import compute_clf_scores, plot_confusion_matrix

 # %% [markdown]
@@ -40,28 +42,15 @@

 # %%
 # load the data
-incidents_train_df = pd.read_csv('../data/clf_indicators_train.csv', index_col=0)
-incidents_test_df = pd.read_csv('../data/clf_indicators_test.csv', index_col=0)
-true_labels_train_df = pd.read_csv('../data/clf_y_train.csv', index_col=0)
-true_labels_train = true_labels_train_df.values.ravel()
-true_labels_test_df = pd.read_csv('../data/clf_y_test.csv', index_col=0)
-true_labels_test = true_labels_test_df.values.ravel()
-
-# load the names of the features to use for the classification task
-features_for_clf = [
-    'location_imp', 'latitude', 'longitude', 'state_code', 'congressional_district',
-    'age_range', 'avg_age', 'n_child_prop', 'n_teen_prop', 'n_males_prop', 'n_participants',
-    'day', 'day_of_week', 'month', 'poverty_perc', 'democrat',
-    'aggression', 'accidental', 'defensive', 'suicide', 'road', 'house', 'school', 'business',
-    'illegal_holding', 'drug_alcohol', 'officers', 'organized', 'social_reasons', 'abduction'
-]
+incidents_train_df = pd.read_csv('../data/clf_indicators_train.csv')
+incidents_test_df = pd.read_csv('../data/clf_indicators_test.csv')

-# project on the features_to_use
-indicators_train_df = incidents_train_df[features_for_clf]
-indicators_test_df = incidents_test_df[features_for_clf]
+# %%
+incidents_train_df.head(2)

 # %%
-indicators_train_df.head(2)
+true_labels_train = np.where(incidents_train_df['n_killed'] > 0, True, False)
+true_labels_test = np.where(incidents_test_df['n_killed'] > 0, True, False)

 # %%
 print(f'Number of label True in train set: {np.sum(true_labels_train)}, ({np.sum(true_labels_train)/len(true_labels_train)*100}%)')
@@ -75,6 +64,8 @@
 # %% [markdown]
 # GaussianNB implements the Gaussian Naive Bayes algorithm for classification. The likelihood of the features is assumed to be Gaussian.
 #
+# Bernoulli Naive Bayes requires samples to be represented as binary-valued feature vectors.
+#
 # Multi-variate Bernoulli does not capture the number of
 # times each word occurs, and that it explicitly includes
 # the non-occurrence probability of words that do not appear in the document.
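+#
+# For reference, the per-feature likelihood that GaussianNB assumes is the usual normal density, with a mean $\mu_y$ and variance $\sigma_y^2$ estimated for each class $y$ from the training data:
+#
+# $P(x_i \mid y) = \frac{1}{\sqrt{2 \pi \sigma_y^2}} \exp\left(-\frac{(x_i - \mu_y)^2}{2 \sigma_y^2}\right)$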
@@ -87,6 +78,49 @@
 # %% [markdown]
 # ### Gaussian Naive Bayes Classifier

+# %% [markdown]
+# Choose features for classification:
+
+# %%
+features_for_clf = [
+    'latitude', 'longitude', 'state_code', 'congressional_district',
+    'poverty_perc', 'location_imp', 'year', 'month',
+    'age_range', 'max_age', 'avg_age',
+    'n_participants', 'n_child_prop', 'n_teen_prop', 'n_males_prop',
+    # tag
+    #'aggression', 'accidental', 'defensive', 'suicide', 'road', 'house', 'school', 'business',
+    #'illegal_holding', 'drug_alcohol', 'officers', 'organized', 'social_reasons', 'abduction'
+    ]
+
+indicators_train_df = incidents_train_df[features_for_clf]
+indicators_test_df = incidents_test_df[features_for_clf]
+
+# %% [markdown]
+# Prepare Train set and Test set:
+
+# %%
+# train set
+indicators_train_df.dropna(inplace=True)
+true_labels_train = np.where(incidents_train_df['n_killed'][indicators_train_df.index] > 0, True, False)
+
+# test set
+indicators_test_df.dropna(inplace=True)
+true_labels_test = np.where(incidents_test_df['n_killed'][indicators_test_df.index] > 0, True, False)
+
+# %% [markdown]
+# Oversampling of the minority class:
+
+# %%
+ros = RandomOverSampler(random_state=42)
+indicators_train_df, true_labels_train = ros.fit_resample(indicators_train_df, true_labels_train)
+
+# %%
+print(f'Number of label True in train set: {np.sum(true_labels_train)}, ({np.sum(true_labels_train)/len(true_labels_train)*100}%)')
+print(f'Number of label False in train set: {len(true_labels_train)-np.sum(true_labels_train)}, ({(len(true_labels_train)-np.sum(true_labels_train))/len(true_labels_train)*100}%)')
+
+# %% [markdown]
+# Classification:
+
 # %%
 gnb = GaussianNB()
 gnb.fit(indicators_train_df, true_labels_train)
@@ -114,48 +148,72 @@
 )

 # %% [markdown]
-# ### Bernulli Naive Bayes Classifier
+# ### Multinomial Naive Bayes Classifier
+
+# %% [markdown]
+# Choose features for classification:

 # %%
-bnb = BernoulliNB()
-bnb.fit(indicators_train_df, true_labels_train)
-y_pred = bnb.predict(indicators_test_df)
+features_for_clf = [
+    'state_code', 'congressional_district', 'month',
+    'age_range', 'max_age',
+    'n_participants', 'n_adult',
+    #'n_injured', 'n_arrested', 'n_unharmed',
+    # tag
+    'aggression', 'accidental', 'defensive', 'suicide', 'road', 'house', 'school', 'business',
+    'illegal_holding', 'drug_alcohol', 'officers', 'organized', 'social_reasons', 'abduction'
+    ]

-print("Number of mislabeled points out of a total %d points : %d" % (indicators_test_df.shape[0], (true_labels_test != y_pred).sum()))
+indicators_train_df = incidents_train_df[features_for_clf]
+indicators_test_df = incidents_test_df[features_for_clf]
+
+# %% [markdown]
+# Discretize latitude and longitude:

 # %%
-test_scores = compute_clf_scores(
-    y_true=true_labels_test,
-    y_pred=y_pred,
-    params=None,
-    train_time=None,
-    score_time=None,
-    prob_pred=None,
-    clf_name='BernoulliNB',
-)
-test_scores
+print(f'Latitude: max value {max(incidents_train_df["latitude"].max(), incidents_test_df["latitude"].max())}, \
+    min value {min(incidents_train_df["latitude"].min(), incidents_test_df["latitude"].min())}')
+print(f'Longitude: max value {max(incidents_train_df["longitude"].max(), incidents_test_df["longitude"].max())}, \
+    min value {min(incidents_train_df["longitude"].min(), incidents_test_df["longitude"].min())}')

 # %%
-plot_confusion_matrix(
-    y_true=true_labels_test,
-    y_pred=y_pred,
-    title='BernoulliNB'
-)
+# discretize lat and long
+lat_bins = np.linspace(19, 72, 72-19)
+long_bins = np.linspace(-166, -67, 166-67)
+
+# train set
+indicators_train_df.loc[:, 'latitude_disc'] = pd.cut(incidents_train_df['latitude'], bins=lat_bins, labels=False) +indicators_train_df.loc[:, 'longitude_disc'] = pd.cut(incidents_train_df['longitude'], bins=long_bins, labels=False) + +# test set +indicators_test_df.loc[:, 'latitude_disc'] = pd.cut(incidents_test_df['latitude'], bins=lat_bins, labels=False) +indicators_test_df.loc[:, 'longitude_disc'] = pd.cut(incidents_test_df['longitude'], bins=long_bins, labels=False) # %% [markdown] -# ### Multinomial Naive Bayes Classifier +# Prepare Train set and Test set: # %% -# remuving not positive features -features_for_clf = [ - 'location_imp', 'state_code', 'congressional_district', - 'age_range', 'avg_age', 'n_child_prop', 'n_teen_prop', 'n_males_prop', 'n_participants', - 'day', 'day_of_week', 'month', 'poverty_perc', 'democrat', 'aggression', 'accidental', - 'defensive', 'suicide', 'road', 'house', 'school', 'business', - 'illegal_holding', 'drug_alcohol', 'officers', 'organized', 'social_reasons', 'abduction'] +# train set +indicators_train_df.dropna(inplace=True) +true_labels_train = np.where(incidents_train_df['n_killed'][indicators_train_df.index] > 0, True, False) -indicators_train_df = incidents_train_df[features_for_clf] -indicators_test_df = incidents_test_df[features_for_clf] +#test set +indicators_test_df.dropna(inplace=True) +true_labels_test = np.where(incidents_test_df['n_killed'][indicators_test_df.index] > 0, True, False) + +# %% [markdown] +# Oversampling of the minority class: + +# %% +ros = RandomOverSampler(random_state=42) +indicators_train_df, true_labels_train = ros.fit_resample(indicators_train_df, true_labels_train) + +# %% +print(f'Number of label True in train set: {np.sum(true_labels_train)}, ({np.sum(true_labels_train)/len(true_labels_train)*100}%)') +print(f'Number of label False in train set: {len(true_labels_train)-np.sum(true_labels_train)}, ({(len(true_labels_train)-np.sum(true_labels_train))/len(true_labels_train)*100}%)') + +# %% [markdown] +# Classification: # %% mnb = MultinomialNB() @@ -212,6 +270,124 @@ title='ComplementNB' ) +# %% [markdown] +# ### Bernulli Naive Bayes Classifier + +# %% +features_for_clf = [ + # tag: (binary data) + 'aggression', 'accidental', 'defensive', 'suicide', 'road', 'house', 'school', 'business', + 'illegal_holding', 'drug_alcohol', 'officers', 'organized', 'social_reasons', 'abduction' + ] + +indicators_train_df = incidents_train_df[features_for_clf] +indicators_test_df = incidents_test_df[features_for_clf] + +# %% +def one_hot_encoder(data_train, data_test): + ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False) + ohe.fit(data_train) + return ohe.transform(data_train), ohe.transform(data_test) + +# %% +# binarize lat and long: one hot encoding +lat_bins = np.linspace(19, 72, 10) +long_bins = np.linspace(-166, -67, 10) + +lat_train, lat_test = one_hot_encoder( + data_train=np.array(pd.cut(incidents_train_df['latitude'], bins=lat_bins, labels=False)).reshape(-1, 1), + data_test=np.array(pd.cut(incidents_test_df['latitude'], bins=lat_bins, labels=False)).reshape(-1, 1)) +long_train, long_test = one_hot_encoder( + data_train=np.array(pd.cut(incidents_train_df['longitude'], bins=long_bins, labels=False)).reshape(-1, 1), + data_test=np.array(pd.cut(incidents_test_df['longitude'], bins=long_bins, labels=False)).reshape(-1, 1)) + +for i in range(9): + # train set + indicators_train_df.loc[:, f'latitude_{i}'] = lat_train[:, i] + indicators_train_df.loc[:, f'longitude_{i}'] = long_train[:, i] + # test set + 
indicators_test_df.loc[:, f'latitude_{i}'] = lat_test[:, i] + indicators_test_df.loc[:, f'longitude_{i}'] = long_test[:, i] + +# %% +# binarize age_range +age_range_train, age_range_test = one_hot_encoder( + data_train=np.array(incidents_train_df['age_range']).reshape(-1, 1), + data_test=np.array(incidents_test_df['age_range']).reshape(-1, 1)) +for i in range(age_range_train.shape[1]): + indicators_train_df.loc[:, f'age_range_{i}'] = age_range_train[:, i] # train set + indicators_test_df.loc[:, f'age_range_{i}'] = age_range_test[:, i] # test set + +# %% +# binarize n_participants +n_participants_train, n_participants_test = one_hot_encoder( + data_train=np.array(incidents_train_df['n_participants']).reshape(-1, 1), + data_test=np.array(incidents_test_df['n_participants']).reshape(-1, 1)) +for i in range(n_participants_train.shape[1]): + indicators_train_df.loc[:, f'n_participants_{i}'] = n_participants_train[:, i] # train set + indicators_test_df.loc[:, f'n_participants_{i}'] = n_participants_test[:, i] # test set + +"""# binarize n_adult +n_adult_train, n_adult_test = one_hot_encoder( + data_train=np.array(incidents_train_df['n_adult']).reshape(-1, 1), + data_test=np.array(incidents_test_df['n_adult']).reshape(-1, 1)) +for i in range(n_adult_train.shape[1]): + indicators_train_df.loc[:, f'n_adult_{i}'] = n_adult_train[:, i] # train set + indicators_test_df.loc[:, f'n_adult_{i}'] = n_adult_test[:, i] # test set""" + + +# %% [markdown] +# Prepare train set and Test set: + +# %% +# train set +indicators_train_df.dropna(inplace=True) +true_labels_train = np.where(incidents_train_df['n_killed'][indicators_train_df.index] > 0, True, False) + +#test set +indicators_test_df.dropna(inplace=True) +true_labels_test = np.where(incidents_test_df['n_killed'][indicators_test_df.index] > 0, True, False) + +# %% [markdown] +# Oversampling of the minority class: + +# %% +ros = RandomOverSampler(random_state=42) +indicators_train_df, true_labels_train = ros.fit_resample(indicators_train_df, true_labels_train) + +# %% +print(f'Number of label True in train set: {np.sum(true_labels_train)}, ({np.sum(true_labels_train)/len(true_labels_train)*100}%)') +print(f'Number of label False in train set: {len(true_labels_train)-np.sum(true_labels_train)}, ({(len(true_labels_train)-np.sum(true_labels_train))/len(true_labels_train)*100}%)') + +# %% [markdown] +# Classification: + +# %% +bnb = BernoulliNB() +bnb.fit(indicators_train_df, true_labels_train) +y_pred = bnb.predict(indicators_test_df) + +print("Number of mislabeled points out of a total %d points : %d" % (indicators_test_df.shape[0], (true_labels_test != y_pred).sum())) + +# %% +test_scores = compute_clf_scores( + y_true=true_labels_test, + y_pred=y_pred, + params=None, + train_time=None, + score_time=None, + prob_pred=None, + clf_name='BernoulliNB', +) +test_scores + +# %% +plot_confusion_matrix( + y_true=true_labels_test, + y_pred=y_pred, + title='BernoulliNB' +) + # %% [markdown] # ### Bernulli NB using Normalized Data diff --git a/TASK_4/time_series.py b/TASK_4/time_series.py new file mode 100644 index 0000000..ca9a47f --- /dev/null +++ b/TASK_4/time_series.py @@ -0,0 +1,451 @@ +# %% [markdown] +# # Time Series Analysis + +# %% +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +from tslearn.clustering import TimeSeriesKMeans +from tslearn.preprocessing import TimeSeriesScalerMeanVariance +import plotly.express as px + +# %% +incidents_df = pd.read_csv( + '../data/incidents_indicators.csv', + index_col=0, + parse_dates=['date', 
'date_original'],
+    date_parser=lambda x: pd.to_datetime(x, format='%Y-%m-%d')
+)
+
+# Drop rows where year is outside of 2014-2017
+incidents_df = incidents_df[incidents_df['year'].between(2014, 2017)]
+
+# %%
+incidents_df.head(2)
+
+# %%
+incidents_df['year'].unique()
+
+# %% [markdown]
+# First Monday of 2014: Monday, January 6th
+#
+# Weeks start from Monday
+
+# %%
+# Add a week-number column, starting from 0 for the first week of 2014
+incidents_df['week'] = (incidents_df['date'] - pd.to_datetime('2013-12-30')).dt.days // 7
+
+# %% [markdown]
+# Weeks are numbered from 0 (Wednesday, January 1st 2014 to Sunday, January 5th 2014: a first week with only 5 days of data) to 208
+#
+# 2017 ends on a Sunday
+
+# %%
+# number of weeks in the dataset
+incidents_df['week'].max()
+
+# %%
+((365*4+1) - 5 ) / 7 # 208 full weeks after the 5-day first week: consistent with the maximum above
+
+# %%
+incidents_df['week'].unique().shape # all weeks are present
+
+# %%
+# group by week and count incidents
+plt.figure(figsize=(20, 5))
+plt.bar(
+    incidents_df.groupby('week').size().index,
+    incidents_df.groupby('week').size().values
+)
+plt.title('Number of incidents per week');
+
+# %% [markdown]
+# ### Group incidents by City and Week
+
+# %%
+# group by week, city and state
+incidents_df.groupby(['week', 'city', 'state']).count()
+
+# %% [markdown]
+# We consider only cities with a number of weeks with incidents greater than 15% of the total number of the weeks of the 4 years.
+
+# %%
+0.15 * 209 # consider only cities with incidents in more than 30 weeks
+
+# %%
+incidents_df.groupby(['city', 'state'])['week'].count() # 10200 distinct cities
+
+# %%
+(incidents_df.groupby(['city', 'state'])['week'].count() > 30).to_list().count(True) # 664 cities with incidents in more than 30 weeks
+
+# %%
+# indices of incidents in cities with incidents in more than 30 weeks
+index_list = np.where(incidents_df.groupby(['city', 'state'])['week'].transform('count') > 30)
+
+# %%
+# keep only the incidents whose index is in index_list
+incidents_df = incidents_df.iloc[index_list]
+incidents_df.head(2)
+
+# %%
+incidents_df.groupby(['week', 'city', 'state']).count()
+
+# %%
+incidents_df['state'].unique().shape # 51: all states are present
+
+# %%
+# group by week and count incidents
+plt.figure(figsize=(20, 5))
+plt.bar(
+    incidents_df.groupby('week').size().index,
+    incidents_df.groupby('week').size().values
+)
+plt.title('Number of incidents per week');
+
+# %% [markdown]
+# ## Create Time series
+
+# %%
+# Model each city as a sequence of incidents
+incidents_df.groupby(['city', 'state'])['week'].count().sort_values(ascending=False) # 664 time series
+
+# %% [markdown]
+# Time series: mean number of participants per incident per week in each city
+#
+# 0 if we have no incidents in the week or NaN values (i.e. incidents where we do not know the number of participants)
+
+# %%
+# create a dataset with the series of the mean number of participants per incident per week in each city
+incidents_df['n_participants'] = incidents_df['n_participants'].fillna(0) # substitute NaN with 0
+incidents_by_city_df = incidents_df.groupby(['city', 'state', 'week'])['n_participants'].mean().reset_index()
+incidents_by_city_df = incidents_by_city_df.pivot(index=['city', 'state'], columns='week', values='n_participants')
+incidents_by_city_df = incidents_by_city_df.fillna(0) # substitute NaN with 0
+incidents_by_city_df
+
+# %%
+incidents_by_city_df.groupby('state')[0].count().sort_values(ascending=False)
+
+# %%
+# plot time series for the cities in the state of ALASKA
+plt.figure(figsize=(20, 5))
+plt.plot(incidents_by_city_df[incidents_by_city_df.index.get_level_values('state') == 'ALASKA'].values.T, '.--')
+plt.title('Number of participants per incident per week')
+plt.legend(incidents_by_city_df[incidents_by_city_df.index.get_level_values('state') == 'ALASKA'
+    ].index.get_level_values('city'), loc='upper left', bbox_to_anchor=(1, 1));
+
+# %%
+# Offset translation
+plt.figure(figsize=(20, 5))
+plt.plot(incidents_by_city_df[incidents_by_city_df.index.get_level_values('state') == 'ALASKA'].values.T -
+    incidents_by_city_df[incidents_by_city_df.index.get_level_values('state') == 'ALASKA'].values.mean(axis=1), '.--')
+plt.title('Number of participants per incident per week, offset translation')
+plt.legend(incidents_by_city_df[incidents_by_city_df.index.get_level_values('state') == 'ALASKA'].index.get_level_values('city'),
+    loc='upper left', bbox_to_anchor=(1, 1));
+
+# %%
+# Amplitude translation
+plt.figure(figsize=(20, 5))
+plt.plot((incidents_by_city_df[incidents_by_city_df.index.get_level_values('state') == 'ALASKA'].values.T -
+    incidents_by_city_df[incidents_by_city_df.index.get_level_values('state') == 'ALASKA'].values.mean(axis=1)) /
+    incidents_by_city_df[incidents_by_city_df.index.get_level_values('state') == 'ALASKA'].values.std(axis=1), '.--')
+plt.title('Number of participants per incident per week, amplitude translation')
+plt.legend(incidents_by_city_df[incidents_by_city_df.index.get_level_values('state') == 'ALASKA'].index.get_level_values('city'),
+    loc='upper left', bbox_to_anchor=(1, 1));
+
+# %%
+# mean of all time series
+plt.figure(figsize=(20, 5))
+plt.plot(incidents_by_city_df.values.mean(axis=0), '.--')
+plt.title('Mean of number of participants per incident per week');
+
+# %% [markdown]
+# The plot is consistent with the data: the dataset contains many zeros and most incidents involved a single participant
+
+# %% [markdown]
+# ## Clustering
+
+# %% [markdown]
+# ### Shape-based clustering: k-means
+
+# %% [markdown]
+# Choose best k:
+
+# %%
+X = TimeSeriesScalerMeanVariance().fit_transform(incidents_by_city_df.values) # scale time series
+inertia_list = [] # sum of distances of samples to their closest cluster center
+
+for k in range(2, 20):
+    km = TimeSeriesKMeans(n_clusters=k, metric="dtw", max_iter=100, random_state=42)
+    km.fit(X)
+    pred = km.predict(X)
+    print("n clusters = ", k, "\t Clusters =", np.unique(pred,return_counts=True)[1], "\t Inertia =", km.inertia_)
+    inertia_list.append(km.inertia_)
+
+# %%
+plt.figure(figsize=(20, 5))
+plt.plot(inertia_list, '.--')
+plt.xticks(range(len(inertia_list)), range(2, 20))
+plt.xlabel('Number of clusters')
+plt.ylabel('Inertia')
+plt.title('Inertia for different number of clusters');
+
+# %% [markdown]
+# Fit chosen model
+
+# %%
+best_k = 11
+km = TimeSeriesKMeans(n_clusters=best_k, metric="dtw", max_iter=100, random_state=42)
+km.fit(X)
+
+# %%
+plt.figure(figsize=(20, 5))
+plt.plot(km.cluster_centers_.squeeze().T) # one line per centroid
+plt.title('Centroids of clusters')
+plt.legend(range(best_k), loc='upper left', bbox_to_anchor=(1, 1));
+
+# %%
+km.inertia_ # Sum of distances of samples to their closest cluster center
+
+# %%
+cluster = km.labels_ # cluster labels from the model fitted on the scaled series
+
+# %% [markdown]
+# Visualize clusters
+
+# %%
+cluster_df = incidents_df.groupby(['city', 'state'])[['latitude', 'longitude']].mean().reset_index()
+cluster_df['cluster'] = cluster
+cluster_df.head(2)
+
+# %%
+fig = px.scatter_mapbox(
+    lat=cluster_df['latitude'],
+    lon=cluster_df['longitude'],
+    zoom=2,
+    color=cluster_df['cluster'],
+    height=400,
+    width=1000,
+    text=cluster_df['city'] + ', ' + cluster_df['state']
+)
+fig.update_layout(mapbox_style="open-street-map")
+fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
+fig.show()
+
+# %%
+plt.figure(figsize=(20, 5))
+plt.bar(
+    cluster_df.groupby('cluster').size().index,
+    cluster_df.groupby('cluster').size().values
+)
+plt.title('Number of cities per cluster');
+
+# %%
+# visualize time series for each cluster (mean)
+plt.figure(figsize=(20, 5))
+plt.plot(incidents_by_city_df.groupby(cluster).mean().values.T, '.--')
+plt.title('Number of participants per incident per week, mean for each cluster')
+plt.legend(incidents_by_city_df.groupby(cluster).mean().index, loc='upper left', bbox_to_anchor=(1, 1));
+
+# %% [markdown]
+# ### Compression-based clustering
+
+# %%
+from sklearn.metrics import pairwise_distances
+import zlib
+from sklearn.cluster import DBSCAN
+from tslearn.piecewise import PiecewiseAggregateApproximation
+from sklearn import metrics
+from scipy.spatial.distance import pdist, squareform
+
+# %% [markdown]
+# #### DBSCAN on the pairwise compression-based distances between time series
+
+# %%
+def cdm_dist(x, y):
+    # compression-based dissimilarity measure (CDM)
+    x_str = (' '.join([str(v) for v in x.ravel()])).encode('utf-8')
+    y_str = (' '.join([str(v) for v in y.ravel()])).encode('utf-8')
+    return len(zlib.compress(x_str + y_str)) / (len(zlib.compress(x_str)) + len(zlib.compress(y_str)))
+
+X = incidents_by_city_df.values
+M = pairwise_distances(X.reshape(X.shape[0], X.shape[1]), metric=cdm_dist)
+
+# %%
+# TODO: hierarchical clustering using the distance matrix M
+
+# %%
+def find_best_eps(X, min_samples_range=[3, 5, 9, 15]):
+    dist = pdist(X, 'euclidean') # pairwise distances (condensed)
+    dist = squareform(dist) # square distance matrix from the condensed vector
+
+    # Calculate the sorted list of distances for each k in min_samples_range
+    # and plot the graph of the distance from the k-th nearest neighbour
+    fig, ax = plt.subplots(int(np.ceil(len(min_samples_range)/3)), 3, figsize=(20, 8))
+
+    for i, k in enumerate(min_samples_range):
+        kth_distances = list()
+        for d in dist:
+            index_kth_distance = np.argsort(d)[k]
+            kth_distances.append(d[index_kth_distance])
+
+        # Plot the graph of the distance from the k-th nearest neighbour
+        ax[int(i/3), int(i%3)].plot(range(0, len(kth_distances)), sorted(kth_distances))
+        ax[int(i/3), int(i%3)].set_ylabel('%sth nearest neighbor distance' %k)
+        ax[int(i/3), int(i%3)].set_xlabel('Points sorted by distance from %sth nearest neighbor' %k)
+        ax[int(i/3), int(i%3)].tick_params(axis='both', which='major', labelsize=8)
+        ax[int(i/3), int(i%3)].grid(linestyle='--', linewidth=0.5, alpha=0.6)
+
+    plt.show()
+
+def dbscan(X, eps=0.1, min_samples=10):
+    # Compute DBSCAN
+    db = DBSCAN(eps=eps,
min_samples=min_samples, metric='precomputed').fit(X) + labels = db.labels_ + + # Number of clusters in labels, ignoring noise if present. + n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) + n_noise_ = list(labels).count(-1) + + return {'eps': eps, 'min_samples': min_samples, + '#clusters': len(set(labels)) - (1 if -1 in labels else 0), + '#noise': list(labels).count(-1), '%noise': list(labels).count(-1)/X.shape[0]*100, + 'silhouette_coef': metrics.silhouette_score(X, labels) if n_clusters_ > 1 else None, + '#cluster0': list(labels).count(0), '#cluster1': list(labels).count(1), + '#cluster2': list(labels).count(2), '#cluster3': list(labels).count(3), + '#cluster4': list(labels).count(4), '#cluster5': list(labels).count(5), + '#cluster6': list(labels).count(6), '#cluster7': list(labels).count(7)} + +# %% +find_best_eps(M, min_samples_range=[3, 5, 9, 15, 20, 30]) + +# %% +eps = [0.55, 0.6, 0.65] +# eps: maximum distance between two samples for one to be considered as in the neighborhood of the other. +min_samples = [2, 5, 7] + +dbscan_df = pd.DataFrame(columns=['eps', 'min_samples', '#clusters', '#noise', '%noise', 'silhouette_coef', + '#cluster0', '#cluster1', '#cluster2', '#cluster3', '#cluster4', '#cluster5', '#cluster6', '#cluster7']) + +for e in eps: + for k in min_samples: + db = dbscan(M, eps=e, min_samples=k) + dbscan_df = pd.concat([dbscan_df, pd.DataFrame(db, index=[0])], ignore_index=True) + +# %% +dbscan_df + +# %% +dbscan = DBSCAN(eps=0.63, min_samples=2, metric='precomputed') +dbscan.fit(M) + +# %% +cluster = dbscan.labels_ + +cluster_df = incidents_df.groupby(['city', 'state'])[['latitude', 'longitude']].mean().reset_index() +cluster_df['cluster'] = cluster +cluster_df.head(2) + +# %% +fig = px.scatter_mapbox( + lat=cluster_df['latitude'], + lon=cluster_df['longitude'], + zoom=2, + color=cluster_df['cluster'], + height=400, + width=1000, + text=cluster_df['city'] + ', ' + cluster_df['state'] +) +fig.update_layout(mapbox_style="open-street-map") +fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0}) +fig.show() + +# %% +plt.figure(figsize=(20, 5)) +plt.bar( + cluster_df.groupby('cluster').size().index, + cluster_df.groupby('cluster').size().values +) +plt.title('Number of cities per cluster'); + +# %% [markdown] +# #### K-means using Piecewise Aggregate Approximation of time series + +# %% [markdown] +# Piecewise Aggregate Approximation (PAA) is a technique used in time series analysis to reduce the dimensionality of a time series while preserving its essential characteristics. 
+#
+# PAA approximates a time-series $X$ of length $n$ into a vector $\hat{X}=(\hat{x}_1,…,\hat{x}_M)$
+# of any arbitrary length $M\leq n$
+#
+# $\hat{x}_i = \frac{M}{n} \sum_{j=\frac{n}{M}(i-1)+1}^{\frac{n}{M}i} X_j$
+
+# %%
+n_paa_segments = 100
+paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
+X_paa = paa.fit_transform(X) # PAA transformation
+
+# %%
+plt.figure(figsize=(20, 5))
+plt.plot(X_paa.squeeze().T) # one line per approximated time series
+plt.title('PAA representation of time series');
+
+# %%
+km = TimeSeriesKMeans(n_clusters=11, metric="dtw", max_iter=5, random_state=0)
+km.fit(X_paa)
+
+# %%
+plt.figure(figsize=(20, 5))
+plt.plot(km.cluster_centers_.squeeze().T) # one line per centroid
+plt.title('Centroids of clusters')
+plt.legend(range(11), loc='upper left', bbox_to_anchor=(1, 1));
+
+# %%
+plt.figure(figsize=(20, 5))
+for i in range(11):
+    plt.plot(np.mean(X[np.where(km.labels_ == i)[0]], axis=0), '.--')
+plt.title('Number of participants per incident per week, mean for each cluster')
+plt.legend(range(11), loc='upper left', bbox_to_anchor=(1, 1));
+
+# %%
+cluster = km.labels_
+
+cluster_df = incidents_df.groupby(['city', 'state'])[['latitude', 'longitude']].mean().reset_index()
+cluster_df['cluster'] = cluster
+cluster_df.head(2)
+
+# %%
+fig = px.scatter_mapbox(
+    lat=cluster_df['latitude'],
+    lon=cluster_df['longitude'],
+    zoom=2,
+    color=cluster_df['cluster'],
+    height=400,
+    width=1000,
+    text=cluster_df['city'] + ', ' + cluster_df['state']
+)
+fig.update_layout(mapbox_style="open-street-map")
+fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
+fig.show()
+
+# %%
+plt.figure(figsize=(20, 5))
+plt.bar(
+    cluster_df.groupby('cluster').size().index,
+    cluster_df.groupby('cluster').size().values
+)
+plt.title('Number of cities per cluster');
+
+# %% [markdown]
+# ## Matrix profile
+
+# %%
+import sys
+sys.path.append('../matrixprofile-ts') # make the matrixprofile-ts submodule importable
+from matrixprofile import matrixProfile
+
+# %%
+# stomp expects a univariate series: use the week-by-week mean over all cities
+w = 3
+mp, mpi = matrixProfile.stomp(incidents_by_city_df.values.mean(axis=0), w)
+
+plt.plot(mp)
+plt.title('Matrix Profile');

diff --git a/matrixprofile-ts b/matrixprofile-ts
new file mode 160000
index 0000000..6a6f62b
--- /dev/null
+++ b/matrixprofile-ts
@@ -0,0 +1 @@
+Subproject commit 6a6f62b72d351f958118a127e81b20469f13b0e7
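A minimal usage sketch, assuming the `matrixprofile` package shipped in the matrixprofile-ts submodule is importable as set up in `time_series.py` above: the same `matrixProfile.stomp` call can be pointed at a single city's weekly series to look for recurring motifs in that city alone.

```python
# Sketch: matrix profile of one city's weekly series.
# Assumes incidents_by_city_df from time_series.py (one row per (city, state),
# one column per week) and the matrixprofile-ts submodule on sys.path.
import sys
import matplotlib.pyplot as plt

sys.path.append('../matrixprofile-ts')
from matrixprofile import matrixProfile

city_ts = incidents_by_city_df.iloc[0].values.astype(float)  # any row works; the first city is used here
w = 4  # subsequence length in weeks

mp, mpi = matrixProfile.stomp(city_ts, w)  # matrix profile and matrix profile index

plt.figure(figsize=(20, 4))
plt.plot(mp)
plt.title(f'Matrix profile (w={w}) for {incidents_by_city_df.index[0]}');
```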