diff --git a/TASK_3/naive_bayes.py b/TASK_3/naive_bayes.py
index 36f41da..9a3badf 100644
--- a/TASK_3/naive_bayes.py
+++ b/TASK_3/naive_bayes.py
@@ -33,6 +33,8 @@
 import pandas as pd
 import numpy as np
 from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB
+from imblearn.over_sampling import RandomOverSampler
+from sklearn.preprocessing import OneHotEncoder
 from classification_utils import compute_clf_scores, plot_confusion_matrix

 # %% [markdown]
@@ -40,28 +42,15 @@

 # %%
 # load the data
-incidents_train_df = pd.read_csv('../data/clf_indicators_train.csv', index_col=0)
-incidents_test_df = pd.read_csv('../data/clf_indicators_test.csv', index_col=0)
-true_labels_train_df = pd.read_csv('../data/clf_y_train.csv', index_col=0)
-true_labels_train = true_labels_train_df.values.ravel()
-true_labels_test_df = pd.read_csv('../data/clf_y_test.csv', index_col=0)
-true_labels_test = true_labels_test_df.values.ravel()
-
-# load the names of the features to use for the classification task
-features_for_clf = [
-    'location_imp', 'latitude', 'longitude', 'state_code', 'congressional_district',
-    'age_range', 'avg_age', 'n_child_prop', 'n_teen_prop', 'n_males_prop', 'n_participants',
-    'day', 'day_of_week', 'month', 'poverty_perc', 'democrat',
-    'aggression', 'accidental', 'defensive', 'suicide', 'road', 'house', 'school', 'business',
-    'illegal_holding', 'drug_alcohol', 'officers', 'organized', 'social_reasons', 'abduction'
-]
+incidents_train_df = pd.read_csv('../data/clf_indicators_train.csv')
+incidents_test_df = pd.read_csv('../data/clf_indicators_test.csv')

-# project on the features_to_use
-indicators_train_df = incidents_train_df[features_for_clf]
-indicators_test_df = incidents_test_df[features_for_clf]
+# %%
+incidents_train_df.head(2)

 # %%
-indicators_train_df.head(2)
+true_labels_train = np.where(incidents_train_df['n_killed'] > 0, True, False)
+true_labels_test = np.where(incidents_test_df['n_killed'] > 0, True, False)

 # %%
 print(f'Number of label True in train set: {np.sum(true_labels_train)}, ({np.sum(true_labels_train)/len(true_labels_train)*100}%)')
@@ -75,6 +64,8 @@
 # %% [markdown]
 # GaussianNB implements the Gaussian Naive Bayes algorithm for classification. The likelihood of the features is assumed to be Gaussian.
 #
+# Bernoulli Naive Bayes requires samples to be represented as binary-valued feature vectors.
+#
 # Multi-variate Bernoulli does not capture the number of
 # times each word occurs, and that it explicitly includes
 # the non-occurrence probability of words that do not appear in the document.
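+#
+# For reference, the per-feature likelihood that GaussianNB assumes is the usual normal density, with a mean $\mu_y$ and variance $\sigma_y^2$ estimated for each class $y$ from the training data:
+#
+# $P(x_i \mid y) = \frac{1}{\sqrt{2 \pi \sigma_y^2}} \exp\left(-\frac{(x_i - \mu_y)^2}{2 \sigma_y^2}\right)$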
@@ -87,6 +78,49 @@
 # %% [markdown]
 # ### Gaussian Naive Bayes Classifier

+# %% [markdown]
+# Choose features for classification:
+
+# %%
+features_for_clf = [
+    'latitude', 'longitude', 'state_code', 'congressional_district',
+    'poverty_perc', 'location_imp', 'year', 'month',
+    'age_range', 'max_age', 'avg_age',
+    'n_participants', 'n_child_prop', 'n_teen_prop', 'n_males_prop',
+    # tag
+    #'aggression', 'accidental', 'defensive', 'suicide', 'road', 'house', 'school', 'business',
+    #'illegal_holding', 'drug_alcohol', 'officers', 'organized', 'social_reasons', 'abduction'
+    ]
+
+indicators_train_df = incidents_train_df[features_for_clf]
+indicators_test_df = incidents_test_df[features_for_clf]
+
+# %% [markdown]
+# Prepare Train set and Test set:
+
+# %%
+# train set
+indicators_train_df.dropna(inplace=True)
+true_labels_train = np.where(incidents_train_df['n_killed'][indicators_train_df.index] > 0, True, False)
+
+# test set
+indicators_test_df.dropna(inplace=True)
+true_labels_test = np.where(incidents_test_df['n_killed'][indicators_test_df.index] > 0, True, False)
+
+# %% [markdown]
+# Oversampling of the minority class:
+
+# %%
+ros = RandomOverSampler(random_state=42)
+indicators_train_df, true_labels_train = ros.fit_resample(indicators_train_df, true_labels_train)
+
+# %%
+print(f'Number of label True in train set: {np.sum(true_labels_train)}, ({np.sum(true_labels_train)/len(true_labels_train)*100}%)')
+print(f'Number of label False in train set: {len(true_labels_train)-np.sum(true_labels_train)}, ({(len(true_labels_train)-np.sum(true_labels_train))/len(true_labels_train)*100}%)')
+
+# %% [markdown]
+# Classification:
+
 # %%
 gnb = GaussianNB()
 gnb.fit(indicators_train_df, true_labels_train)
@@ -114,48 +148,72 @@
 )

 # %% [markdown]
-# ### Bernulli Naive Bayes Classifier
+# ### Multinomial Naive Bayes Classifier
+
+# %% [markdown]
+# Choose features for classification:

 # %%
-bnb = BernoulliNB()
-bnb.fit(indicators_train_df, true_labels_train)
-y_pred = bnb.predict(indicators_test_df)
+features_for_clf = [
+    'state_code', 'congressional_district', 'month',
+    'age_range', 'max_age',
+    'n_participants', 'n_adult',
+    #'n_injured', 'n_arrested', 'n_unharmed',
+    # tag
+    'aggression', 'accidental', 'defensive', 'suicide', 'road', 'house', 'school', 'business',
+    'illegal_holding', 'drug_alcohol', 'officers', 'organized', 'social_reasons', 'abduction'
+    ]

-print("Number of mislabeled points out of a total %d points : %d" % (indicators_test_df.shape[0], (true_labels_test != y_pred).sum()))
+indicators_train_df = incidents_train_df[features_for_clf]
+indicators_test_df = incidents_test_df[features_for_clf]
+
+# %% [markdown]
+# Discretize latitude and longitude:

 # %%
-test_scores = compute_clf_scores(
-    y_true=true_labels_test,
-    y_pred=y_pred,
-    params=None,
-    train_time=None,
-    score_time=None,
-    prob_pred=None,
-    clf_name='BernoulliNB',
-)
-test_scores
+print(f'Latitude: max value {max(incidents_train_df["latitude"].max(), incidents_test_df["latitude"].max())}, \
+    min value {min(incidents_train_df["latitude"].min(), incidents_test_df["latitude"].min())}')
+print(f'Longitude: max value {max(incidents_train_df["longitude"].max(), incidents_test_df["longitude"].max())}, \
+    min value {min(incidents_train_df["longitude"].min(), incidents_test_df["longitude"].min())}')

 # %%
-plot_confusion_matrix(
-    y_true=true_labels_test,
-    y_pred=y_pred,
-    title='BernoulliNB'
-)
+# discretize lat and long
+lat_bins = np.linspace(19, 72, 72-19)
+long_bins = np.linspace(-166, -67, 166-67)
+
+# train set
+indicators_train_df.loc[:, 'latitude_disc'] = pd.cut(incidents_train_df['latitude'], bins=lat_bins, labels=False) +indicators_train_df.loc[:, 'longitude_disc'] = pd.cut(incidents_train_df['longitude'], bins=long_bins, labels=False) + +# test set +indicators_test_df.loc[:, 'latitude_disc'] = pd.cut(incidents_test_df['latitude'], bins=lat_bins, labels=False) +indicators_test_df.loc[:, 'longitude_disc'] = pd.cut(incidents_test_df['longitude'], bins=long_bins, labels=False) # %% [markdown] -# ### Multinomial Naive Bayes Classifier +# Prepare Train set and Test set: # %% -# remuving not positive features -features_for_clf = [ - 'location_imp', 'state_code', 'congressional_district', - 'age_range', 'avg_age', 'n_child_prop', 'n_teen_prop', 'n_males_prop', 'n_participants', - 'day', 'day_of_week', 'month', 'poverty_perc', 'democrat', 'aggression', 'accidental', - 'defensive', 'suicide', 'road', 'house', 'school', 'business', - 'illegal_holding', 'drug_alcohol', 'officers', 'organized', 'social_reasons', 'abduction'] +# train set +indicators_train_df.dropna(inplace=True) +true_labels_train = np.where(incidents_train_df['n_killed'][indicators_train_df.index] > 0, True, False) -indicators_train_df = incidents_train_df[features_for_clf] -indicators_test_df = incidents_test_df[features_for_clf] +#test set +indicators_test_df.dropna(inplace=True) +true_labels_test = np.where(incidents_test_df['n_killed'][indicators_test_df.index] > 0, True, False) + +# %% [markdown] +# Oversampling of the minority class: + +# %% +ros = RandomOverSampler(random_state=42) +indicators_train_df, true_labels_train = ros.fit_resample(indicators_train_df, true_labels_train) + +# %% +print(f'Number of label True in train set: {np.sum(true_labels_train)}, ({np.sum(true_labels_train)/len(true_labels_train)*100}%)') +print(f'Number of label False in train set: {len(true_labels_train)-np.sum(true_labels_train)}, ({(len(true_labels_train)-np.sum(true_labels_train))/len(true_labels_train)*100}%)') + +# %% [markdown] +# Classification: # %% mnb = MultinomialNB() @@ -212,6 +270,124 @@ title='ComplementNB' ) +# %% [markdown] +# ### Bernulli Naive Bayes Classifier + +# %% +features_for_clf = [ + # tag: (binary data) + 'aggression', 'accidental', 'defensive', 'suicide', 'road', 'house', 'school', 'business', + 'illegal_holding', 'drug_alcohol', 'officers', 'organized', 'social_reasons', 'abduction' + ] + +indicators_train_df = incidents_train_df[features_for_clf] +indicators_test_df = incidents_test_df[features_for_clf] + +# %% +def one_hot_encoder(data_train, data_test): + ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False) + ohe.fit(data_train) + return ohe.transform(data_train), ohe.transform(data_test) + +# %% +# binarize lat and long: one hot encoding +lat_bins = np.linspace(19, 72, 10) +long_bins = np.linspace(-166, -67, 10) + +lat_train, lat_test = one_hot_encoder( + data_train=np.array(pd.cut(incidents_train_df['latitude'], bins=lat_bins, labels=False)).reshape(-1, 1), + data_test=np.array(pd.cut(incidents_test_df['latitude'], bins=lat_bins, labels=False)).reshape(-1, 1)) +long_train, long_test = one_hot_encoder( + data_train=np.array(pd.cut(incidents_train_df['longitude'], bins=long_bins, labels=False)).reshape(-1, 1), + data_test=np.array(pd.cut(incidents_test_df['longitude'], bins=long_bins, labels=False)).reshape(-1, 1)) + +for i in range(9): + # train set + indicators_train_df.loc[:, f'latitude_{i}'] = lat_train[:, i] + indicators_train_df.loc[:, f'longitude_{i}'] = long_train[:, i] + # test set + 
indicators_test_df.loc[:, f'latitude_{i}'] = lat_test[:, i] + indicators_test_df.loc[:, f'longitude_{i}'] = long_test[:, i] + +# %% +# binarize age_range +age_range_train, age_range_test = one_hot_encoder( + data_train=np.array(incidents_train_df['age_range']).reshape(-1, 1), + data_test=np.array(incidents_test_df['age_range']).reshape(-1, 1)) +for i in range(age_range_train.shape[1]): + indicators_train_df.loc[:, f'age_range_{i}'] = age_range_train[:, i] # train set + indicators_test_df.loc[:, f'age_range_{i}'] = age_range_test[:, i] # test set + +# %% +# binarize n_participants +n_participants_train, n_participants_test = one_hot_encoder( + data_train=np.array(incidents_train_df['n_participants']).reshape(-1, 1), + data_test=np.array(incidents_test_df['n_participants']).reshape(-1, 1)) +for i in range(n_participants_train.shape[1]): + indicators_train_df.loc[:, f'n_participants_{i}'] = n_participants_train[:, i] # train set + indicators_test_df.loc[:, f'n_participants_{i}'] = n_participants_test[:, i] # test set + +"""# binarize n_adult +n_adult_train, n_adult_test = one_hot_encoder( + data_train=np.array(incidents_train_df['n_adult']).reshape(-1, 1), + data_test=np.array(incidents_test_df['n_adult']).reshape(-1, 1)) +for i in range(n_adult_train.shape[1]): + indicators_train_df.loc[:, f'n_adult_{i}'] = n_adult_train[:, i] # train set + indicators_test_df.loc[:, f'n_adult_{i}'] = n_adult_test[:, i] # test set""" + + +# %% [markdown] +# Prepare train set and Test set: + +# %% +# train set +indicators_train_df.dropna(inplace=True) +true_labels_train = np.where(incidents_train_df['n_killed'][indicators_train_df.index] > 0, True, False) + +#test set +indicators_test_df.dropna(inplace=True) +true_labels_test = np.where(incidents_test_df['n_killed'][indicators_test_df.index] > 0, True, False) + +# %% [markdown] +# Oversampling of the minority class: + +# %% +ros = RandomOverSampler(random_state=42) +indicators_train_df, true_labels_train = ros.fit_resample(indicators_train_df, true_labels_train) + +# %% +print(f'Number of label True in train set: {np.sum(true_labels_train)}, ({np.sum(true_labels_train)/len(true_labels_train)*100}%)') +print(f'Number of label False in train set: {len(true_labels_train)-np.sum(true_labels_train)}, ({(len(true_labels_train)-np.sum(true_labels_train))/len(true_labels_train)*100}%)') + +# %% [markdown] +# Classification: + +# %% +bnb = BernoulliNB() +bnb.fit(indicators_train_df, true_labels_train) +y_pred = bnb.predict(indicators_test_df) + +print("Number of mislabeled points out of a total %d points : %d" % (indicators_test_df.shape[0], (true_labels_test != y_pred).sum())) + +# %% +test_scores = compute_clf_scores( + y_true=true_labels_test, + y_pred=y_pred, + params=None, + train_time=None, + score_time=None, + prob_pred=None, + clf_name='BernoulliNB', +) +test_scores + +# %% +plot_confusion_matrix( + y_true=true_labels_test, + y_pred=y_pred, + title='BernoulliNB' +) + # %% [markdown] # ### Bernulli NB using Normalized Data diff --git a/TASK_4/time_series.py b/TASK_4/time_series.py new file mode 100644 index 0000000..ca9a47f --- /dev/null +++ b/TASK_4/time_series.py @@ -0,0 +1,451 @@ +# %% [markdown] +# # Time Series Analysis + +# %% +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +from tslearn.clustering import TimeSeriesKMeans +from tslearn.preprocessing import TimeSeriesScalerMeanVariance +import plotly.express as px + +# %% +incidents_df = pd.read_csv( + '../data/incidents_indicators.csv', + index_col=0, + parse_dates=['date', 
'date_original'],
+    date_parser=lambda x: pd.to_datetime(x, format='%Y-%m-%d')
+)
+
+# Drop rows where year is outside of 2014-2017
+incidents_df = incidents_df[incidents_df['year'].between(2014, 2017)]
+
+# %%
+incidents_df.head(2)
+
+# %%
+incidents_df['year'].unique()
+
+# %% [markdown]
+# First Monday of 2014: Monday, January 6th
+#
+# Weeks start from Monday
+
+# %%
+# Add a week-number column, starting from 0 for the first week of 2014
+incidents_df['week'] = (incidents_df['date'] - pd.to_datetime('2013-12-30')).dt.days // 7
+
+# %% [markdown]
+# Weeks are numbered from 0 (Wednesday, January 1st 2014 to Sunday, January 5th 2014: a first week with only 5 days of data) to 208
+#
+# 2017 ends on a Sunday
+
+# %%
+# number of weeks in the dataset
+incidents_df['week'].max()
+
+# %%
+((365*4+1) - 5 ) / 7 # 208 full weeks after the 5-day first week: consistent with the maximum above
+
+# %%
+incidents_df['week'].unique().shape # all weeks are present
+
+# %%
+# group by week and count incidents
+plt.figure(figsize=(20, 5))
+plt.bar(
+    incidents_df.groupby('week').size().index,
+    incidents_df.groupby('week').size().values
+)
+plt.title('Number of incidents per week');
+
+# %% [markdown]
+# ### Group incidents by City and Week
+
+# %%
+# group by week, city and state
+incidents_df.groupby(['week', 'city', 'state']).count()
+
+# %% [markdown]
+# We consider only cities with a number of weeks with incidents greater than 15% of the total number of the weeks of the 4 years.
+
+# %%
+0.15 * 209 # consider only cities with incidents in more than 30 weeks
+
+# %%
+incidents_df.groupby(['city', 'state'])['week'].count() # 10200 distinct cities
+
+# %%
+(incidents_df.groupby(['city', 'state'])['week'].count() > 30).to_list().count(True) # 664 cities with incidents in more than 30 weeks
+
+# %%
+# indices of incidents in cities with incidents in more than 30 weeks
+index_list = np.where(incidents_df.groupby(['city', 'state'])['week'].transform('count') > 30)
+
+# %%
+# keep only the incidents whose index is in index_list
+incidents_df = incidents_df.iloc[index_list]
+incidents_df.head(2)
+
+# %%
+incidents_df.groupby(['week', 'city', 'state']).count()
+
+# %%
+incidents_df['state'].unique().shape # 51: all states are present
+
+# %%
+# group by week and count incidents
+plt.figure(figsize=(20, 5))
+plt.bar(
+    incidents_df.groupby('week').size().index,
+    incidents_df.groupby('week').size().values
+)
+plt.title('Number of incidents per week');
+
+# %% [markdown]
+# ## Create Time series
+
+# %%
+# Model each city as a sequence of incidents
+incidents_df.groupby(['city', 'state'])['week'].count().sort_values(ascending=False) # 664 time series
+
+# %% [markdown]
+# Time series: mean number of participants per incident per week in each city
+#
+# 0 if we have no incidents in the week or NaN values (i.e. incidents where we do not know the number of participants)
+
+# %%
+# create a dataset with the series of the mean number of participants per incident per week in each city
+incidents_df['n_participants'] = incidents_df['n_participants'].fillna(0) # substitute NaN with 0
+incidents_by_city_df = incidents_df.groupby(['city', 'state', 'week'])['n_participants'].mean().reset_index()
+incidents_by_city_df = incidents_by_city_df.pivot(index=['city', 'state'], columns='week', values='n_participants')
+incidents_by_city_df = incidents_by_city_df.fillna(0) # substitute NaN with 0
+incidents_by_city_df
+
+# %%
+incidents_by_city_df.groupby('state')[0].count().sort_values(ascending=False)
+
+# %%
+# plot time series for the cities in the state of ALASKA
+plt.figure(figsize=(20, 5))
+plt.plot(incidents_by_city_df[incidents_by_city_df.index.get_level_values('state') == 'ALASKA'].values.T, '.--')
+plt.title('Number of participants per incident per week')
+plt.legend(incidents_by_city_df[incidents_by_city_df.index.get_level_values('state') == 'ALASKA'
+    ].index.get_level_values('city'), loc='upper left', bbox_to_anchor=(1, 1));
+
+# %%
+# Offset translation
+plt.figure(figsize=(20, 5))
+plt.plot(incidents_by_city_df[incidents_by_city_df.index.get_level_values('state') == 'ALASKA'].values.T -
+    incidents_by_city_df[incidents_by_city_df.index.get_level_values('state') == 'ALASKA'].values.mean(axis=1), '.--')
+plt.title('Number of participants per incident per week, offset translation')
+plt.legend(incidents_by_city_df[incidents_by_city_df.index.get_level_values('state') == 'ALASKA'].index.get_level_values('city'),
+    loc='upper left', bbox_to_anchor=(1, 1));
+
+# %%
+# Amplitude translation
+plt.figure(figsize=(20, 5))
+plt.plot((incidents_by_city_df[incidents_by_city_df.index.get_level_values('state') == 'ALASKA'].values.T -
+    incidents_by_city_df[incidents_by_city_df.index.get_level_values('state') == 'ALASKA'].values.mean(axis=1)) /
+    incidents_by_city_df[incidents_by_city_df.index.get_level_values('state') == 'ALASKA'].values.std(axis=1), '.--')
+plt.title('Number of participants per incident per week, amplitude translation')
+plt.legend(incidents_by_city_df[incidents_by_city_df.index.get_level_values('state') == 'ALASKA'].index.get_level_values('city'),
+    loc='upper left', bbox_to_anchor=(1, 1));
+
+# %%
+# mean of all time series
+plt.figure(figsize=(20, 5))
+plt.plot(incidents_by_city_df.values.mean(axis=0), '.--')
+plt.title('Mean of number of participants per incident per week');
+
+# %% [markdown]
+# The plot is consistent with the data: the dataset contains many zeros and most incidents involved a single participant
+
+# %% [markdown]
+# ## Clustering
+
+# %% [markdown]
+# ### Shape-based clustering: k-means
+
+# %% [markdown]
+# Choose best k:
+
+# %%
+X = TimeSeriesScalerMeanVariance().fit_transform(incidents_by_city_df.values) # scale time series
+inertia_list = [] # sum of distances of samples to their closest cluster center
+
+for k in range(2, 20):
+    km = TimeSeriesKMeans(n_clusters=k, metric="dtw", max_iter=100, random_state=42)
+    km.fit(X)
+    pred = km.predict(X)
+    print("n clusters = ", k, "\t Clusters =", np.unique(pred,return_counts=True)[1], "\t Inertia =", km.inertia_)
+    inertia_list.append(km.inertia_)
+
+# %%
+plt.figure(figsize=(20, 5))
+plt.plot(inertia_list, '.--')
+plt.xticks(range(len(inertia_list)), range(2, 20))
+plt.xlabel('Number of clusters')
+plt.ylabel('Inertia')
+plt.title('Inertia for different number of clusters');
+
+# %% [markdown]
+# Fit chosen model
+
+# %%
+best_k = 11
+km = TimeSeriesKMeans(n_clusters=best_k, metric="dtw", max_iter=100, random_state=42)
+km.fit(X)
+
+# %%
+plt.figure(figsize=(20, 5))
+plt.plot(km.cluster_centers_.squeeze().T) # one line per centroid
+plt.title('Centroids of clusters')
+plt.legend(range(best_k), loc='upper left', bbox_to_anchor=(1, 1));
+
+# %%
+km.inertia_ # Sum of distances of samples to their closest cluster center
+
+# %%
+cluster = km.labels_ # cluster labels from the model fitted on the scaled series
+
+# %% [markdown]
+# Visualize clusters
+
+# %%
+cluster_df = incidents_df.groupby(['city', 'state'])[['latitude', 'longitude']].mean().reset_index()
+cluster_df['cluster'] = cluster
+cluster_df.head(2)
+
+# %%
+fig = px.scatter_mapbox(
+    lat=cluster_df['latitude'],
+    lon=cluster_df['longitude'],
+    zoom=2,
+    color=cluster_df['cluster'],
+    height=400,
+    width=1000,
+    text=cluster_df['city'] + ', ' + cluster_df['state']
+)
+fig.update_layout(mapbox_style="open-street-map")
+fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
+fig.show()
+
+# %%
+plt.figure(figsize=(20, 5))
+plt.bar(
+    cluster_df.groupby('cluster').size().index,
+    cluster_df.groupby('cluster').size().values
+)
+plt.title('Number of cities per cluster');
+
+# %%
+# visualize time series for each cluster (mean)
+plt.figure(figsize=(20, 5))
+plt.plot(incidents_by_city_df.groupby(cluster).mean().values.T, '.--')
+plt.title('Number of participants per incident per week, mean for each cluster')
+plt.legend(incidents_by_city_df.groupby(cluster).mean().index, loc='upper left', bbox_to_anchor=(1, 1));
+
+# %% [markdown]
+# ### Compression-based clustering
+
+# %%
+from sklearn.metrics import pairwise_distances
+import zlib
+from sklearn.cluster import DBSCAN
+from tslearn.piecewise import PiecewiseAggregateApproximation
+from sklearn import metrics
+from scipy.spatial.distance import pdist, squareform
+
+# %% [markdown]
+# #### DBSCAN on the pairwise compression-based distances between time series
+
+# %%
+def cdm_dist(x, y):
+    # compression-based dissimilarity measure (CDM)
+    x_str = (' '.join([str(v) for v in x.ravel()])).encode('utf-8')
+    y_str = (' '.join([str(v) for v in y.ravel()])).encode('utf-8')
+    return len(zlib.compress(x_str + y_str)) / (len(zlib.compress(x_str)) + len(zlib.compress(y_str)))
+
+X = incidents_by_city_df.values
+M = pairwise_distances(X.reshape(X.shape[0], X.shape[1]), metric=cdm_dist)
+
+# %%
+# TODO: hierarchical clustering using the distance matrix M
+
+# %%
+def find_best_eps(X, min_samples_range=[3, 5, 9, 15]):
+    dist = pdist(X, 'euclidean') # pairwise distances (condensed)
+    dist = squareform(dist) # square distance matrix from the condensed vector
+
+    # Calculate the sorted list of distances for each k in min_samples_range
+    # and plot the graph of the distance from the k-th nearest neighbour
+    fig, ax = plt.subplots(int(np.ceil(len(min_samples_range)/3)), 3, figsize=(20, 8))
+
+    for i, k in enumerate(min_samples_range):
+        kth_distances = list()
+        for d in dist:
+            index_kth_distance = np.argsort(d)[k]
+            kth_distances.append(d[index_kth_distance])
+
+        # Plot the graph of the distance from the k-th nearest neighbour
+        ax[int(i/3), int(i%3)].plot(range(0, len(kth_distances)), sorted(kth_distances))
+        ax[int(i/3), int(i%3)].set_ylabel('%sth nearest neighbor distance' %k)
+        ax[int(i/3), int(i%3)].set_xlabel('Points sorted by distance from %sth nearest neighbor' %k)
+        ax[int(i/3), int(i%3)].tick_params(axis='both', which='major', labelsize=8)
+        ax[int(i/3), int(i%3)].grid(linestyle='--', linewidth=0.5, alpha=0.6)
+
+    plt.show()
+
+def dbscan(X, eps=0.1, min_samples=10):
+    # Compute DBSCAN
+    db = DBSCAN(eps=eps,
min_samples=min_samples, metric='precomputed').fit(X) + labels = db.labels_ + + # Number of clusters in labels, ignoring noise if present. + n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) + n_noise_ = list(labels).count(-1) + + return {'eps': eps, 'min_samples': min_samples, + '#clusters': len(set(labels)) - (1 if -1 in labels else 0), + '#noise': list(labels).count(-1), '%noise': list(labels).count(-1)/X.shape[0]*100, + 'silhouette_coef': metrics.silhouette_score(X, labels) if n_clusters_ > 1 else None, + '#cluster0': list(labels).count(0), '#cluster1': list(labels).count(1), + '#cluster2': list(labels).count(2), '#cluster3': list(labels).count(3), + '#cluster4': list(labels).count(4), '#cluster5': list(labels).count(5), + '#cluster6': list(labels).count(6), '#cluster7': list(labels).count(7)} + +# %% +find_best_eps(M, min_samples_range=[3, 5, 9, 15, 20, 30]) + +# %% +eps = [0.55, 0.6, 0.65] +# eps: maximum distance between two samples for one to be considered as in the neighborhood of the other. +min_samples = [2, 5, 7] + +dbscan_df = pd.DataFrame(columns=['eps', 'min_samples', '#clusters', '#noise', '%noise', 'silhouette_coef', + '#cluster0', '#cluster1', '#cluster2', '#cluster3', '#cluster4', '#cluster5', '#cluster6', '#cluster7']) + +for e in eps: + for k in min_samples: + db = dbscan(M, eps=e, min_samples=k) + dbscan_df = pd.concat([dbscan_df, pd.DataFrame(db, index=[0])], ignore_index=True) + +# %% +dbscan_df + +# %% +dbscan = DBSCAN(eps=0.63, min_samples=2, metric='precomputed') +dbscan.fit(M) + +# %% +cluster = dbscan.labels_ + +cluster_df = incidents_df.groupby(['city', 'state'])[['latitude', 'longitude']].mean().reset_index() +cluster_df['cluster'] = cluster +cluster_df.head(2) + +# %% +fig = px.scatter_mapbox( + lat=cluster_df['latitude'], + lon=cluster_df['longitude'], + zoom=2, + color=cluster_df['cluster'], + height=400, + width=1000, + text=cluster_df['city'] + ', ' + cluster_df['state'] +) +fig.update_layout(mapbox_style="open-street-map") +fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0}) +fig.show() + +# %% +plt.figure(figsize=(20, 5)) +plt.bar( + cluster_df.groupby('cluster').size().index, + cluster_df.groupby('cluster').size().values +) +plt.title('Number of cities per cluster'); + +# %% [markdown] +# #### K-means using Piecewise Aggregate Approximation of time series + +# %% [markdown] +# Piecewise Aggregate Approximation (PAA) is a technique used in time series analysis to reduce the dimensionality of a time series while preserving its essential characteristics. 
+#
+# PAA approximates a time-series $X$ of length $n$ into a vector $\hat{X}=(\hat{x}_1,…,\hat{x}_M)$
+# of any arbitrary length $M\leq n$
+#
+# $\hat{x}_i = \frac{M}{n} \sum_{j=\frac{n}{M}(i-1)+1}^{\frac{n}{M}i} X_j$
+
+# %%
+n_paa_segments = 100
+paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
+X_paa = paa.fit_transform(X) # PAA transformation
+
+# %%
+plt.figure(figsize=(20, 5))
+plt.plot(X_paa.squeeze().T) # one line per approximated time series
+plt.title('PAA representation of time series');
+
+# %%
+km = TimeSeriesKMeans(n_clusters=11, metric="dtw", max_iter=5, random_state=0)
+km.fit(X_paa)
+
+# %%
+plt.figure(figsize=(20, 5))
+plt.plot(km.cluster_centers_.squeeze().T) # one line per centroid
+plt.title('Centroids of clusters')
+plt.legend(range(11), loc='upper left', bbox_to_anchor=(1, 1));
+
+# %%
+plt.figure(figsize=(20, 5))
+for i in range(11):
+    plt.plot(np.mean(X[np.where(km.labels_ == i)[0]], axis=0), '.--')
+plt.title('Number of participants per incident per week, mean for each cluster')
+plt.legend(range(11), loc='upper left', bbox_to_anchor=(1, 1));
+
+# %%
+cluster = km.labels_
+
+cluster_df = incidents_df.groupby(['city', 'state'])[['latitude', 'longitude']].mean().reset_index()
+cluster_df['cluster'] = cluster
+cluster_df.head(2)
+
+# %%
+fig = px.scatter_mapbox(
+    lat=cluster_df['latitude'],
+    lon=cluster_df['longitude'],
+    zoom=2,
+    color=cluster_df['cluster'],
+    height=400,
+    width=1000,
+    text=cluster_df['city'] + ', ' + cluster_df['state']
+)
+fig.update_layout(mapbox_style="open-street-map")
+fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
+fig.show()
+
+# %%
+plt.figure(figsize=(20, 5))
+plt.bar(
+    cluster_df.groupby('cluster').size().index,
+    cluster_df.groupby('cluster').size().values
+)
+plt.title('Number of cities per cluster');
+
+# %% [markdown]
+# ## Matrix profile
+
+# %%
+import sys
+sys.path.append('../matrixprofile-ts') # make the matrixprofile-ts submodule importable
+from matrixprofile import matrixProfile
+
+# %%
+# stomp expects a univariate series: use the week-by-week mean over all cities
+w = 3
+mp, mpi = matrixProfile.stomp(incidents_by_city_df.values.mean(axis=0), w)
+
+plt.plot(mp)
+plt.title('Matrix Profile');

diff --git a/matrixprofile-ts b/matrixprofile-ts
new file mode 160000
index 0000000..6a6f62b
--- /dev/null
+++ b/matrixprofile-ts
@@ -0,0 +1 @@
+Subproject commit 6a6f62b72d351f958118a127e81b20469f13b0e7
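A minimal usage sketch, assuming the `matrixprofile` package shipped in the matrixprofile-ts submodule is importable as set up in `time_series.py` above: the same `matrixProfile.stomp` call can be pointed at a single city's weekly series to look for recurring motifs in that city alone.

```python
# Sketch: matrix profile of one city's weekly series.
# Assumes incidents_by_city_df from time_series.py (one row per (city, state),
# one column per week) and the matrixprofile-ts submodule on sys.path.
import sys
import matplotlib.pyplot as plt

sys.path.append('../matrixprofile-ts')
from matrixprofile import matrixProfile

city_ts = incidents_by_city_df.iloc[0].values.astype(float)  # any row works; the first city is used here
w = 4  # subsequence length in weeks

mp, mpi = matrixProfile.stomp(city_ts, w)  # matrix profile and matrix profile index

plt.figure(figsize=(20, 4))
plt.plot(mp)
plt.title(f'Matrix profile (w={w}) for {incidents_by_city_df.index[0]}');
```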