Skip to content

Commit

Permalink
new pca based indicators
Browse files Browse the repository at this point in the history
  • Loading branch information
GiacomoAru committed Nov 16, 2023
1 parent 40b2ee8 commit 5debe25
Show file tree
Hide file tree
Showing 3 changed files with 131,927 additions and 103 deletions.
216 changes: 114 additions & 102 deletions TASK_1/indicators.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
%matplotlib inline

# %%
incidents_df = pd.read_csv('../data/incidents_cleaned.csv')
incidents_df = pd.read_csv('../data/incidents_cleaned.csv', index_col=0)
incidents_df['date'] = pd.to_datetime(incidents_df['date'], format='%Y-%m-%d')

# %%
Expand Down Expand Up @@ -119,7 +119,7 @@ def log_normalization(df, new_df, columns):
log_ratios_wrt_center = ['log_'+x for x in ratios_wrt_center]

# %%
incidents_df.tail(10)
incidents_df.tail(3)

# %%
log_ratios.tail(10)
Expand Down Expand Up @@ -331,7 +331,6 @@ def compute_entropy_indicator(df, col_aggr, col_group, lab=''):

# %%
local_outlier_factors = pd.DataFrame(index=incidents_numeric.index, data=X_scores).rename(columns={0:'local_outlier_factor'})
local_outlier_factors['log_inv_local_outlier_factor'] = np.log2(local_outlier_factors['local_outlier_factor']*([-1]*len(local_outlier_factors['local_outlier_factor'])))
local_outlier_factors

# %%
Expand All @@ -347,15 +346,6 @@ def compute_entropy_indicator(df, col_aggr, col_group, lab=''):
figsize=(10, 5)
)

# %%
hist_box_plot(
local_outlier_factors,
'log_inv_local_outlier_factor',
title='log_inv_local_outlier_factor',
bins=int(np.log(local_outlier_factors.shape[0])), # Sturger's rule
figsize=(10, 5)
)

# %%
#import matplotlib.pyplot as plt
from matplotlib.legend_handler import HandlerPathCollection
Expand Down Expand Up @@ -387,69 +377,6 @@ def update_legend_marker_size(handle, orig):
plt.title("Local Outlier Factor (LOF)")
plt.show()

# %%
import matplotlib.pyplot as mplt
import plotly.express as px
from sklearn.decomposition import PCA
pd.set_option('display.max_columns', None)
# FIX: the bare 'max_colwidth' option key was deprecated and later removed
# from pandas; the namespaced 'display.max_colwidth' key is required.
pd.set_option('display.max_colwidth', None)

# %%
# Peek at the numeric frame that PCA is fitted on below.
incidents_numeric

# %%
pca = PCA(n_components=4)
X_pca = pca.fit_transform(incidents_numeric)

# Grid of scatter plots of the first two principal components, one panel per
# original feature, with points coloured by that feature's values.
nrows = 4
ncols = 6
fig, axs = mplt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 20), sharex=True, sharey=True)
for i, feature in enumerate(incidents_numeric.columns):
    panel = axs[i // ncols][i % ncols]
    panel.scatter(X_pca[:, 0], X_pca[:, 1], edgecolor='k', s=40, c=incidents_numeric[feature])
    panel.set_title(feature)
    panel.set_xlabel("1st eigenvector")
    panel.set_ylabel("2nd eigenvector")

# %%
# 3D view of principal components 1, 3 and 2 — first a static matplotlib
# scatter, then an interactive plotly version of the same points.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

first, second, third = X_pca[:, 0], X_pca[:, 1], X_pca[:, 2]

ax.set_xlabel("1st eigenvector")
ax.set_ylabel("3rd eigenvector")
ax.set_zlabel("2nd eigenvector")

ax.scatter(first, third, second)

fig = px.scatter_3d(x=first, y=third, z=second, labels={'x': '1st eigenvector', 'y': '3rd eigenvector', 'z': '2nd eigenvector'})
fig.show()

# %%
X_reconstructed = pca.inverse_transform(X_pca)
PCA_errors = pd.DataFrame(index=incidents_numeric.index)
# FIX: the reconstruction error must compare the reconstruction against the
# ORIGINAL data, not against the PCA scores. X_pca has n_components columns
# while X_reconstructed has one column per original feature, so the previous
# `X_pca - X_reconstructed` was shape-mismatched / meaningless.
PCA_errors['reconstruction_error'] = np.sum(np.square(incidents_numeric.values - X_reconstructed), axis=1)

#incidents_df['pca_reconstruction_error'] = square_error

# %%
hist_box_plot(
    PCA_errors,
    'reconstruction_error',
    title='reconstruction_error',
    # FIX: bin count now derives from the plotted frame's own row count
    # (previously used the unrelated `ratios` frame) — Sturges' rule.
    bins=int(np.log(PCA_errors.shape[0])),
    figsize=(10, 5)
)

# %%


# %%
'''population_df = pd.read_csv('../data/external_data/population.csv')
population_df['n_males'] = population_df['male_child'] + population_df['male_teen'] + population_df['male_adult']
Expand Down Expand Up @@ -513,7 +440,6 @@ def compute_simple_subtraction(df, col_1, col_2):
# %%
indicators = pd.DataFrame(index=incidents_df.index, data=incidents_df[['latitude', 'longitude', 'location_importance',
'avg_age_participants',

'n_participants']].copy(deep=True))

# %%
Expand Down Expand Up @@ -553,7 +479,7 @@ def compute_simple_subtraction(df, col_1, col_2):
plt.xticks(rotation=90, ha='right');

# %%
indicators = indicators.join(log_ratios[['log_n_males_n_males_tot_semest_congd_ratio','log_n_killed_n_killed_tot_semest_congd_ratio','log_n_injured_n_injured_tot_semest_congd_ratio']])
indicators = indicators.join(log_ratios[['log_n_males_n_males_mean_semest_congd_ratio','log_n_killed_n_killed_mean_semest_congd_ratio','log_n_injured_n_injured_mean_semest_congd_ratio']])

# %%
indicators
Expand Down Expand Up @@ -658,25 +584,20 @@ def compute_simple_subtraction(df, col_1, col_2):
indicators.dropna().describe()

# %%
sns.heatmap(indicators.corr())
sns.heatmap(indicators.corr(), vmin=-1, vmax=1)

# %%
col_to_drop = ['n_participants_adult_prop', 'n_females_prop', 'log_n_killed_n_killed_tot_semest_congd_ratio', 'log_n_injured_n_injured_tot_semest_congd_ratio']
col_to_drop = ['n_participants_adult_prop', 'n_females_prop', 'log_n_killed_n_killed_mean_semest_congd_ratio', 'log_n_injured_n_injured_mean_semest_congd_ratio']
final_indicators = indicators.drop(columns=col_to_drop)

# %%
sns.heatmap(final_indicators.corr())
sns.heatmap(final_indicators.corr(), vmin=-1, vmax=1)

# %%
fig, ax = plt.subplots(figsize=(15, 5))
sns.violinplot(data=final_indicators,ax=ax)
plt.xticks(rotation=90, ha='right');

# %%
# FIX: the original plotted `data=ciao`, an undefined name (NameError).
# Plotting the pre-drop `indicators` here complements the `final_indicators`
# violin plot in the previous cell — NOTE(review): confirm this was the intent.
fig, ax = plt.subplots(figsize=(15, 5))
sns.violinplot(data=indicators, ax=ax)
plt.xticks(rotation=90, ha='right');

# %%
a = {}
for c in final_indicators.columns:
Expand All @@ -702,20 +623,36 @@ def compute_simple_subtraction(df, col_1, col_2):
# %%
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
# Min-max scaling maps every indicator into [0, 1].
scaler_obj = MinMaxScaler()#, MinMaxScaler(), RobustScaler()]
# NOTE(review): the next two lines look like a leftover diff pair — the first
# overwrites final_indicators with its scaled values, so normalized_indicators
# then re-scales already-scaled data. Confirm only one assignment is intended.
final_indicators = pd.DataFrame(data=scaler_obj.fit_transform(final_indicators.values), columns=final_indicators.columns)
normalized_indicators = pd.DataFrame(data=scaler_obj.fit_transform(final_indicators.values), columns=final_indicators.columns)

# %%
# Sanity peek at two random scaled rows.
normalized_indicators.sample(2)

# %%
final_indicators
# Violin plot of each normalized indicator's distribution.
fig, ax = plt.subplots(figsize=(15, 5))
sns.violinplot(data=normalized_indicators,ax=ax)
plt.xticks(rotation=90, ha='right');

# %%
DATA_FOLDER_PATH = '../data/'
# NOTE(review): both frames are written to the SAME path, so the second write
# overwrites the first — likely another leftover diff pair; confirm which
# frame should be persisted.
final_indicators.to_csv(DATA_FOLDER_PATH +'incidents_cleaned_indicators.csv')
normalized_indicators.to_csv(DATA_FOLDER_PATH +'incidents_cleaned_indicators.csv')

# %%
# Summary statistics of the normalized indicators.
normalized_indicators.describe()

# %%
import matplotlib.pyplot as mplt
import plotly.express as px
from sklearn.decomposition import PCA
pd.set_option('display.max_columns', None)
# FIX: the bare 'max_colwidth' option key was deprecated and later removed
# from pandas; the namespaced 'display.max_colwidth' key is required.
pd.set_option('display.max_colwidth', None)

# %%
# Fit PCA on the rows with no missing values; keep ALL components so the
# explained-variance plot below can inform how many to retain.
dummy = final_indicators.dropna()
pca = PCA()
X_pca = pca.fit_transform(dummy)
# NOTE(review): pca_df is never used later in this view — confirm before deleting.
pca_df = pd.DataFrame(index=incidents_df.index)

# %%
nrows=4
ncols=6
row=0
Expand All @@ -729,24 +666,99 @@ def compute_simple_subtraction(df, col_1, col_2):
axs[row][i % ncols].set_ylabel("2nd eigenvector")

# %%
fig = plt.figure()
ax = fig.add_subplot(111, projection = '3d')
nrows = 4
ncols = 6
# Scatter of the two LAST principal components (0-based columns 18 and 19),
# coloured by each original feature — low-variance components should show
# little structure.
fig, axs = mplt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 20), sharex=True, sharey=True)
for i, feature in enumerate(dummy.columns):
    panel = axs[i // ncols][i % ncols]
    panel.scatter(X_pca[:, 18], X_pca[:, 19], edgecolor='k', s=40, c=dummy[feature])
    panel.set_title(feature)
    # FIX: the plotted columns are the 19th and 20th eigenvectors; the axes
    # were previously mislabeled "1st"/"2nd eigenvector".
    panel.set_xlabel("19th eigenvector")
    panel.set_ylabel("20th eigenvector")

x = X_pca[:, 0]
y = X_pca[:, 2]
z = X_pca[:, 1]
# %%
fig = px.scatter_3d(x=x, y=y, z=z, labels={'x': '1st eigenvector', 'y': '3rd eigenvector', 'z': '2nd eigenvector'})
fig.show()

ax.set_xlabel("1st eigenvector")
ax.set_ylabel("3rd eigenvector")
ax.set_zlabel("2nd eigenvector")
# %%
# Scree plot: fraction of total variance explained by each principal
# component, used to choose how many components to keep downstream.
exp_var_pca = pca.explained_variance_ratio_
plt.bar(range(0,len(exp_var_pca)), exp_var_pca, align='center')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component')
plt.title('Explained variance by principal component')
plt.xticks(np.arange(0,len(exp_var_pca),1.0));

ax.scatter(x, y, z)
# %%
def get_reconstruction_error(x_pca, x_orig, pca, n_comp):
    """Per-row squared reconstruction error using the first n_comp components.

    Reconstructs the data from the first n_comp principal-component scores
    (adding back the PCA mean) and returns the row-wise sum of squared
    differences from the original values, as a single unnamed-column
    DataFrame indexed like x_orig.
    """
    truncated = x_pca[:, :n_comp] @ pca.components_[:n_comp, :]
    reconstructed = truncated + pca.mean_
    errors = np.square(reconstructed - x_orig.values).sum(axis=1)
    return pd.DataFrame(index=x_orig.index, data=errors)

fig = px.scatter_3d(x=x, y=y, z=z, labels={'x': '1st eigenvector', 'y': '3rd eigenvector', 'z': '2nd eigenvector'})
fig.show()

final_indicators['PCA_reconstruction_e_5C'] = get_reconstruction_error(X_pca, dummy, pca, 5)
final_indicators.sample(3)

# %%
hist_box_plot(
    final_indicators,
    'PCA_reconstruction_e_5C',
    title='PCA_reconstruction_e_5C',
    # FIX: bin count now derives from the plotted frame's own row count
    # (previously used the unrelated `ratios` frame) — Sturges' rule.
    bins=int(np.log(final_indicators.shape[0])),
    figsize=(10, 5)
)

# %%
# Column names for the PCA indicator frame: one ordinal name per principal
# component, plus the reconstruction-error columns appended afterwards.
# `col[:-2]` below must yield exactly the 20 component names.
# FIX: the original list contained a bare `,` line (a SyntaxError) where the
# 'PCA_rec_error_5C' entry evidently belonged.
col = ['1st_comp', '2nd_comp', '3rd_comp', '4th_comp', '5th_comp',
       '6th_comp', '7th_comp', '8th_comp', '9th_comp', '10th_comp',
       '11th_comp', '12th_comp', '13th_comp', '14th_comp', '15th_comp',
       '16th_comp', '17th_comp', '18th_comp', '19th_comp', '20th_comp',
       'PCA_rec_error_5C',
       'PCA_rec_error_11C']


# %%
# Assemble the PCA indicator frame: one column per principal component,
# indexed by the rows that survived dropna() above.
pca_indicators = pd.DataFrame(index=dummy.index, data=X_pca, columns=col[:-2])

# %%
# Reconstruction error using 5, 11 and 20 components — fewer components
# means a lossier reconstruction and therefore larger errors.
pca_indicators['PCA_rec_error_5C'] = get_reconstruction_error(X_pca, dummy, pca, 5)
pca_indicators['PCA_rec_error_11C'] = get_reconstruction_error(X_pca, dummy, pca, 11)
pca_indicators['PCA_rec_error_20C'] = get_reconstruction_error(X_pca, dummy, pca, 20)

# %%
# Sanity peek at three random rows.
pca_indicators.sample(3)

# %%
# Distribution of each raw PCA indicator.
fig, ax = plt.subplots(figsize=(15, 5))
sns.violinplot(data=pca_indicators,ax=ax)
plt.xticks(rotation=90, ha='right');

# %%
# Re-use the MinMaxScaler instance defined earlier to map the PCA
# indicators into [0, 1].
pca_normalized_indicators = pd.DataFrame(data=scaler_obj.fit_transform(pca_indicators.values), columns=pca_indicators.columns)

# %%
# Distribution of each normalized PCA indicator.
fig, ax = plt.subplots(figsize=(15, 5))
sns.violinplot(data=pca_normalized_indicators,ax=ax)
plt.xticks(rotation=90, ha='right');

# %%
# NOTE(review): the describe() call below looks like a leftover diff line from
# the pre-PCA version of this cell — confirm before removing.
final_indicators.describe()
pca_normalized_indicators.to_csv(DATA_FOLDER_PATH +'incidents_cleaned_indicators_PCA.csv')

# %% [markdown]
# ['date', 'date_original', 'year', 'month', 'day', 'day_of_week', 'state',
Expand Down
2 changes: 1 addition & 1 deletion data/incidents_cleaned_indicators.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
,latitude,longitude,location_importance,avg_age_participants,n_participants,age_range,n_participants_child_prop,n_participants_teen_prop,n_males_pr,n_killed_pr,n_injured_pr,n_arrested_pr,n_unharmed_pr,log_males_mean_ratio,log_avg_age_mean_SD,avg_age_entropy,city_entropy,address_entropy,n_participants_adult_entropy,tags_entropy
,latitude,longitude,location_importance,avg_age_participants,n_participants,age_range,n_participants_child_prop,n_participants_teen_prop,n_males_pr,n_killed_pr,n_injured_pr,n_arrested_pr,n_unharmed_pr,log_n_males_n_males_mean_semest_congd_ratio,log_avg_age_mean_SD,avg_age_entropy,city_entropy,address_entropy,n_participants_adult_entropy,tags_entropy
0,0.39675713350208913,0.8088232183169319,0.1355420702853075,0.18811881188118812,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.6051888747164211,0.8384832385328498,0.7382471440159952,0.0067198160392350345,0.38342031184183417,0.12140717341689686,0.18852673595841474
1,0.4318415943200052,0.8840081434553676,0.1355420702853075,0.6138613861386139,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.6108056193917699,0.9242420670625573,0.44181546633397134,0.44181546633397134,0.2029536644406963,0.056942726736323654,0.2769677978740386
2,0.44628858754552386,0.8411582967243848,0.0,,0.00980392156862745,,,,,0.0,0.5,0.0,0.5,,,,0.05877067040450571,0.0400267815037745,,0.169074731486928
Expand Down
Loading

0 comments on commit 5debe25

Please sign in to comment.