Skip to content

Commit

Permalink
plot changes
Browse files Browse the repository at this point in the history
  • Loading branch information
GiacomoAru committed Nov 12, 2023
1 parent b37cb32 commit bdd8350
Show file tree
Hide file tree
Showing 4 changed files with 133 additions and 39 deletions.
35 changes: 35 additions & 0 deletions TASK_1/incidents_understanding_preparation.py
Original file line number Diff line number Diff line change
Expand Up @@ -795,6 +795,41 @@ def group_by_day(df, date_col):
# %%
info_city.loc[info_city['tot_points'] > 1].info()

# %%
#import matplotlib.pyplot as plt
from matplotlib.legend_handler import HandlerPathCollection

def plot_info_city(df, lat, lon, info_circle, radius_scale=10):
    """Scatter the rows of ``df`` by coordinates and overlay a red circle per row.

    Every row is drawn as a small black dot at ``(df[lon], df[lat])``. On top
    of each dot an unfilled red circle is drawn whose marker size is the
    inverted min-max scaling of ``df[info_circle]`` multiplied by
    ``radius_scale``: the row holding the column maximum gets size 0 and the
    row holding the column minimum gets size ``radius_scale``.

    Parameters:
        df: frame holding the coordinate columns and the value column.
        lat: name of the column plotted on the y axis.
        lon: name of the column plotted on the x axis.
        info_circle: name of the numeric column that drives the circle size;
            also used as legend label and in the title.
        radius_scale: multiplier applied to the scaled values before they are
            passed as the scatter ``s`` argument (default 10, the value that
            was previously hard-coded).

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    def update_legend_marker_size(handle, orig):
        "Customize size of the legend marker"
        handle.update_from(orig)
        handle.set_sizes([20])

    plt.scatter(df[lon], df[lat], color="k", s=3.0, label="Data points")

    # Circle size decreases as the value grows: (max - value) / (max - min).
    values = df[info_circle]
    highest = values.max()
    value_range = highest - values.min()
    distance_from_max = highest - values
    if value_range == 0:
        # Constant column: every value equals the maximum, so the numerator
        # is already 0 everywhere. Skip the division to avoid 0/0 = NaN sizes.
        radius = distance_from_max
    else:
        radius = distance_from_max / value_range

    scatter = plt.scatter(
        df[lon],
        df[lat],
        s=radius * radius_scale,
        edgecolors="r",
        facecolors="none",
        label=info_circle,
    )
    plt.axis("tight")
    plt.xlabel('longitude')
    plt.ylabel('latitude')
    # Use a fixed-size marker in the legend instead of the per-point sizes.
    plt.legend(
        handler_map={scatter: HandlerPathCollection(update_func=update_legend_marker_size)}
    )
    plt.title(f"coordinates of city centroids + '{info_circle}'")
    plt.show()


# %%
plot_info_city(info_city, 'centroid_lat', 'centroid_lon', '75')

# %%
plot_scattermap_plotly(info_city, 'tot_points', x_column='centroid_lat',
y_column='centroid_lon', hover_name=False, zoom=2, title='Number of points per city')
Expand Down
133 changes: 96 additions & 37 deletions TASK_1/indicators.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
# %%
import pandas as pd
import numpy as np
Expand All @@ -8,7 +7,7 @@
import seaborn as sns
sys.path.append(os.path.abspath('..'))
from plot_utils import *
# %matplotlib inline
%matplotlib inline

# %%
incidents_df = pd.read_csv('../data/incidents_cleaned.csv')
Expand Down Expand Up @@ -137,55 +136,47 @@ def compute_ratio_indicator(df, new_df, ext_df, gby, num, den, suffix, agg_fun):
ratios_wrt_tot.append(att)

# %%
ratios[ratios_wrt_tot].describe() # females quantiles are 0 (that's why they suggested to do it for males only)
def log_normalization(df, new_df, columns):
    """Store a shifted natural logarithm of selected columns of ``df`` in ``new_df``.

    For each name in ``columns`` the values ``log(df[name] + shift)`` are
    written to ``new_df['log_' + name]``, where ``shift`` is 1% of the smallest
    non-zero value found in that column. ``new_df`` is modified in place and
    nothing is returned.
    """
    for name in columns:
        series = df[name]
        smallest_nonzero = series[series != 0].min()
        shift = smallest_nonzero / 100  # 1% of the minimum non-zero value
        new_df['log_' + name] = np.log(series + shift)

# %%
ratios[ratios_wrt_center].describe()
# Build the log-transformed counterpart of `ratios` on the same index, then
# derive the 'log_'-prefixed column names for the two groups of ratio columns.
log_ratios = pd.DataFrame(index=ratios.index)
log_normalization(ratios, log_ratios, ratios.columns)
log_ratios_wrt_tot = ['log_'+x for x in ratios_wrt_tot]
log_ratios_wrt_center = ['log_'+x for x in ratios_wrt_center]


# %%
ratios.boxplot(
column=ratios_wrt_tot,
rot=90,
figsize=(20, 10)
);
ratios[ratios_wrt_tot].describe() # females quantiles are 0 (that's why they suggested to do it for males only)

# %%
log_ratio_wrt_tot = ['log_'+col for col in ratios_wrt_tot]
log_ratio_wrt_center = ['log_'+col for col in ratios_wrt_center]
log_ratios = pd.DataFrame(index=ratios.index)
for col in ratios.columns:
c = (ratios[ratios[col]!=0][col].min())/100
log_ratios['log_'+col] = np.log(ratios[col] + c) # 1% of the minimum value
log_ratios.boxplot(
column=log_ratio_wrt_tot,
rot=90,
figsize=(20, 10)
);
ratios[ratios_wrt_center].describe()

# %%
fig, ax = plt.subplots(figsize=(25, 10))
sns.violinplot(data=ratios[ratios_wrt_tot],ax=ax)
plt.xticks(rotation=90, ha='right');


# %%
fig, ax = plt.subplots()
sns.violinplot(data=ratios[ratios_wrt_center],ax=ax)
plt.xticks(rotation=90, ha='right');

# %%
fig, ax = plt.subplots(figsize=(25, 10))
sns.violinplot(data=log_ratios[log_ratio_wrt_tot],ax=ax)
sns.violinplot(data=log_ratios[log_ratios_wrt_tot],ax=ax)
plt.xticks(rotation=90, ha='right');

# %% [markdown]
# A logarithmic transformation is meant to make the data less spread out, but here it is being used for the opposite purpose...
#
#
# We cannot turn barely significant data into significant data this way, so be careful: I would recommend not applying the logarithm to values in [0, 1].

# %%
fig, ax = plt.subplots(figsize=(15, 10))
sns.violinplot(data=log_ratios[log_ratio_wrt_center],ax=ax)
sns.violinplot(data=log_ratios[log_ratios_wrt_center],ax=ax)
plt.xticks(rotation=90, ha='right');

# %%
Expand All @@ -206,7 +197,7 @@ def compute_ratio_indicator(df, new_df, ext_df, gby, num, den, suffix, agg_fun):
ratios,
'n_males_n_males_tot_year_city_ratio',
title='n_males_n_males_tot_year_city_ratio',
bins=int(np.log(incidents_df.shape[0])), # Sturger's rule
bins=int(np.log(ratios.shape[0])), # Sturger's rule
figsize=(10, 5)
)

Expand All @@ -215,11 +206,10 @@ def compute_ratio_indicator(df, new_df, ext_df, gby, num, den, suffix, agg_fun):
log_ratios,
'log_n_males_n_males_mean_year_city_ratio',
title='log_n_males_n_males_mean_year_city_ratio',
bins=int(np.log(incidents_df.shape[0])), # Sturger's rule
bins=int(np.log(log_ratios.shape[0])), # Sturger's rule
figsize=(10, 5)
)


# %%
def compute_square_distance_indicator(df, new_df, ext_df, gby, minuend, subtrahend, suffix, agg_fun):
grouped_df = ext_df.groupby(gby)[subtrahend].agg(agg_fun)
Expand All @@ -228,11 +218,6 @@ def compute_square_distance_indicator(df, new_df, ext_df, gby, minuend, subtrahe
#df.drop(columns=[den+suffix], inplace=True)
#return df

def log_normalization(df, new_df, columns):
    """Add shifted natural-log versions of the given ``df`` columns to ``new_df``.

    Each column ``name`` in ``columns`` produces ``new_df['log_' + name]``,
    computed as ``log(df[name] + offset)`` with ``offset`` equal to 1% of the
    smallest non-zero value of that column. The function mutates ``new_df``
    and returns nothing.
    """
    for name in columns:
        column_values = df[name]
        nonzero_min = column_values[column_values != 0].min()
        offset = nonzero_min / 100  # 1% of the minimum non-zero value
        new_df['log_' + name] = np.log(column_values + offset)

square_distances = pd.DataFrame(index=incidents_df.index)

# %%
Expand All @@ -242,7 +227,6 @@ def log_normalization(df, new_df, columns):
compute_square_distance_indicator(incidents_df, square_distances, incidents_df, ['year', 'state'], l, l, '_mean_year_state', 'mean')
compute_square_distance_indicator(incidents_df, square_distances, incidents_df, ['year', 'state', 'congressional_district'], l, l, '_mean_year_congdist', 'mean')


# %%
square_distances.sample(5)

Expand All @@ -255,7 +239,7 @@ def log_normalization(df, new_df, columns):
square_distances,
'n_killed_n_killed_mean_year_state_SD',
title='n_killed_n_killed_mean_year_state_SD',
bins=int(np.log(incidents_df.shape[0])), # Sturger's rule
bins=int(np.log(square_distances.shape[0])), # Sturger's rule
figsize=(10, 5)
)
square_distances.sample(10, random_state=1)
Expand Down Expand Up @@ -308,7 +292,7 @@ def compute_entropy_indicator(df, new_df, col_aggr, col_group, lab=''):
entropies,
'mix_col_3',
title='mix_col_3',
bins=int(np.log(incidents_df.shape[0])), # Sturger's rule
bins=int(np.log(entropies.shape[0])), # Sturger's rule
figsize=(10, 5)
)

Expand All @@ -325,10 +309,84 @@ def compute_entropy_indicator(df, new_df, col_aggr, col_group, lab=''):
entropies,
'mix_col_2',
title='mix_col_2',
bins=int(np.log(incidents_df.shape[0])), # Sturger's rule
bins=int(np.log(entropies.shape[0])), # Sturger's rule
figsize=(10, 5)
)

# %%
from sklearn.neighbors import LocalOutlierFactor
# Keep only the columns with one of the listed numeric dtypes and drop every
# row that contains a missing value; the estimator below is fit on this frame.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
incidents_numeric = incidents_df.select_dtypes(include=numerics).dropna(axis=0)
print(incidents_numeric.shape)

# Reference labels: every row is labelled 1. The line that would relabel the
# last rows as -1 is disabled, so no row is marked as a known outlier.
ground_truth = np.ones(incidents_numeric.shape[0], dtype=int)
#ground_truth[-n_outliers:] = -1

# Hyper-parameters passed to the Local Outlier Factor estimator.
N_NEIGHBORS = 20
CONTAMINATION = 0.1
clf = LocalOutlierFactor(n_neighbors=N_NEIGHBORS, contamination=CONTAMINATION)


# %%
y_pred = clf.fit_predict(incidents_numeric)
# ground_truth is all ones, so this counts the rows whose predicted label is
# different from 1.
n_errors = (y_pred != ground_truth).sum()
# Per-row scores exposed by the fitted estimator; displayed as the cell output.
X_scores = clf.negative_outlier_factor_
X_scores

# %%
# Wrap the LOF scores in a frame aligned with the rows that were scored,
# naming the column directly instead of renaming the default column 0.
local_outlier_factors = pd.DataFrame(
    {'local_outlier_factor': X_scores},
    index=incidents_numeric.index,
)
# Flip the sign of the scores before taking log2 (the scores come from
# `negative_outlier_factor_`, so the flipped values are presumably positive).
# Unary minus replaces the element-wise product with a Python list of -1s.
local_outlier_factors['log_inv_local_outlier_factor'] = np.log2(
    -local_outlier_factors['local_outlier_factor']
)
local_outlier_factors

# %%
# Distribution of the raw LOF scores, drawn with the project's hist_box_plot helper.
# NOTE(review): the bin count is int(ln(n)); Sturges' rule is usually stated
# as ceil(log2(n)) + 1 -- confirm which one is intended.
hist_box_plot(
local_outlier_factors,
'local_outlier_factor',
title='local_outlier_factor',
bins=int(np.log(local_outlier_factors.shape[0])), # Sturges' rule (see note above)
figsize=(10, 5)
)

# %%
# Same plot for the log2 of the sign-flipped scores.
hist_box_plot(
local_outlier_factors,
'log_inv_local_outlier_factor',
title='log_inv_local_outlier_factor',
bins=int(np.log(local_outlier_factors.shape[0])), # Sturges' rule (same formula as the previous cell)
figsize=(10, 5)
)

# %%
#import matplotlib.pyplot as plt
from matplotlib.legend_handler import HandlerPathCollection


def update_legend_marker_size(handle, orig):
    """Copy the properties of ``orig`` onto ``handle``, then set its sizes to ``[20]``."""
    legend_marker_sizes = [20]
    handle.update_from(orig)
    handle.set_sizes(legend_marker_sizes)

# Base layer: one small black dot per scored row at (longitude, latitude).
plt.scatter(incidents_numeric['longitude'], incidents_numeric['latitude'], color="k", s=3.0, label="Data points")
# Overlay circles sized by the inverted min-max scaling of the scores: the row
# with the highest score gets 0 and the row with the lowest score gets 1.
# NOTE(review): if all scores were equal the denominator would be 0 -- confirm
# this cannot happen for the data at hand.
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
scatter = plt.scatter(
incidents_numeric['longitude'],
incidents_numeric['latitude'],
s=1000 * radius,
edgecolors="r",
facecolors="none",
label="Outlier scores",
)
plt.axis("tight")
#plt.xlim((-5, 5))
#plt.ylim((-5, 5))
# n_errors is the number of rows whose predicted label differs from ground_truth.
plt.xlabel("prediction errors: %d" % (n_errors))
# Draw the legend entry for the circles with a fixed marker size instead of
# the per-point sizes.
plt.legend(
handler_map={scatter: HandlerPathCollection(update_func=update_legend_marker_size)}
)
plt.title("Local Outlier Factor (LOF)")
plt.show()

# %%
population_df = pd.read_csv('../data/external_data/population.csv')
population_df['n_males'] = population_df['male_child'] + population_df['male_teen'] + population_df['male_adult']
Expand Down Expand Up @@ -373,9 +431,10 @@ def compute_entropy_indicator(df, new_df, col_aggr, col_group, lab=''):
# - uccisi, feriti ecc.. rispetto alla media, con norm. logaritmica
# - rapporto degli uccisi/totali o feriti/totali dell'incidente (magari sostituiti)
# - entropie pazzerelle (su tutti i tag o combinazioni di tag)
#
#

# %%




2 changes: 1 addition & 1 deletion html/incidents_per_day.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion plot_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def hist_box_plot(
if kde: df[col].plot.kde(bw_method=bw_method, ax=ax_hist, secondary_y=True)
df.boxplot(ax=ax_box, column=col, vert=False, grid=False)
ax_box.set(yticks=[])
plt.suptitle(title)
plt.suptitle(title +' (#NotNanVal/#TotVal: ' + str(len(df[col].dropna())) + '/' + str(len(df[col])) + ')')
plt.xlabel(xlabel)
plt.ylabel(ylabel)

Expand Down

0 comments on commit bdd8350

Please sign in to comment.