
Commit 8d2e7dc

GiuliaGhisolfi committed Nov 13, 2023
2 parents 4f7d429 + adefba3

Showing 1 changed file with 8 additions and 34 deletions.

TASK_1/incidents_understanding_preparation.py: 8 additions & 34 deletions
@@ -168,7 +168,7 @@

# %%
print(f"# of rows before dropping duplicates: {incidents_df.shape[0]}")
-incidents_df.drop_duplicates(inplace=True) #, ignore_index=True) # TODO: geopy assumes it has not been reset here??
+incidents_df.drop_duplicates(inplace=True)
print(f"# of rows after dropping duplicates: {incidents_df.shape[0]}")

# %% [markdown]
@@ -518,7 +518,7 @@ def group_by_day(df, date_col):
# To fix these inconsistencies we used the library [GeoPy](https://geopy.readthedocs.io/en/stable/). This library retrieves the address (state, county, suburb, city, town, village, location name, and other features) corresponding to a given latitude and longitude. We queried it with the latitude and longitude of every point in the dataset and saved the results in the CSV file we now load:

# %%
-geopy_path = os.path.join(DATA_FOLDER_PATH, 'external_data/geopy.csv') # TODO: this could become geopy (delete the old geopy)
+geopy_path = os.path.join(DATA_FOLDER_PATH, 'external_data/geopy.csv')
geopy_df = pd.read_csv(geopy_path, index_col=['index'], low_memory=False, dtype={})
geopy_df.head(n=2)
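For reference, a file like geopy.csv can be produced with GeoPy's reverse geocoding. The sketch below is an assumption about how such a query loop might look (Nominatim geocoder, rate-limited as its usage policy requires); it is not the code from this repository:

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

geolocator = Nominatim(user_agent='incidents_preparation')  # hypothetical app name
reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1)

def reverse_geocode(row):
    # returns a dict with keys such as 'state', 'county', 'city', ...
    location = reverse((row['latitude'], row['longitude']), language='en')
    return location.raw['address'] if location else {}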

@@ -571,7 +571,7 @@ def group_by_day(df, date_col):
incidents_df = load_checkpoint('checkpoint_1', date_cols=['date', 'date_original'])
else:
geo_df = incidents_df[['state', 'city_or_county', 'address', 'latitude', 'longitude']]
-geo_df = pd.concat([geo_df, geopy_df.loc[incidents_df.index]], axis=1) # TODO: geopy has more rows because it also keeps those of the old duplicates????
+geo_df = pd.concat([geo_df, geopy_df.loc[incidents_df.index]], axis=1)
geo_df = geo_df.apply(lambda row: check_geographical_data_consistency(row, additional_data=counties_df), axis=1)
incidents_df[geo_df.columns] = geo_df[geo_df.columns]
save_checkpoint(incidents_df, 'checkpoint_1')
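save_checkpoint and load_checkpoint come from the project's utility module; their implementation is not shown in this diff. A minimal sketch of what they are assumed to do (CSV files under checkpoints/, with the index preserved so that later cells can re-align on it):

import os
import pandas as pd

CHECKPOINT_FOLDER = 'checkpoints/'

def save_checkpoint(df, name):
    # keep the index: later cells re-align external data via .loc[df.index]
    df.to_csv(os.path.join(CHECKPOINT_FOLDER, name + '.csv'), index=True)

def load_checkpoint(name, date_cols=None):
    return pd.read_csv(os.path.join(CHECKPOINT_FOLDER, name + '.csv'),
        index_col=0, parse_dates=date_cols, low_memory=False)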
@@ -1243,7 +1243,7 @@ def geodesic_distance(point1, point2):
# We plot the boundaries of the classifier:

# %%
-alabama_color_map = { # TODO: I don't know where it breaks, y_train now has more than 7 values
+alabama_color_map = {
1:'red',
2:'orange',
3:'yellow',
@@ -1333,7 +1333,7 @@ def geodesic_distance(point1, point2):
# %% [markdown]
# #### Features
# %%
-incidents_df.groupby(['address']).size().sort_values(ascending=False)[:50].plot( # TODO: remove this(?)
+incidents_df.groupby(['address']).size().sort_values(ascending=False)[:50].plot(
kind='bar',
figsize=(10,6),
title='Counts of the 50 addresses with the highest number of incidents'
@@ -1687,7 +1687,7 @@ def max_min_value(attribute): # FIXME: convert to float, exclude <= 122 and >

if LOAD_DATA_FROM_CHECKPOINT:
with zipfile.ZipFile('checkpoints/checkpoint_4.csv.zip', 'r') as zip_ref:
-        zip_ref.extractall('checkpoints/') # TODO: maybe add a single call at the start that unzips all the *.zip files
+        zip_ref.extractall('checkpoints/')
incidents_df = load_checkpoint('checkpoint_4', date_cols=['date', 'date_original'])
else:
new_age_df = age_temporary_df.apply(lambda row: set_gender_age_consistent_data(row), axis=1)
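The removed TODO suggested decompressing every checkpoint archive once, up front, instead of opening one ZipFile per cell. A sketch of such a helper (hypothetical, not part of the commit):

import glob
import zipfile

def extract_all_checkpoints(folder='checkpoints/'):
    for zip_path in glob.glob(folder + '*.zip'):
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(folder)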
@@ -1957,31 +1957,6 @@ def max_min_value(attribute): # FIXME: convert to float, exclude <= 122 and >
# %%
from TASK_1.data_preparation_utils import set_tags_consistent_data

-CASTING = {'latitude':'Float64',
-    'longitude':'Float64',
-    'participant_age1':'Int64',
-    'min_age_participants':'Int64',
-    'avg_age_participants':'Int64',
-    'max_age_participants':'Int64',
-    'n_participants_child':'Int64',
-    'n_participants_teen':'Int64',
-    'n_participants_adult':'Int64',
-    'n_males':'Int64',
-    'n_females':'Int64',
-    'n_killed':'Int64',
-    'n_injured':'Int64',
-    'n_arrested':'Int64',
-    'n_unharmed':'Int64',
-    'n_participants':'Int64',
-    'year':'Int64',
-    'month':'Int64',
-    'day':'Int64',
-    'day_of_week':'Int64',
-    'location_importance':'Float64',
-    'state_house_district':'Int64',
-    'state_senate_district':'Int64',
-    'congressional_district':'Int64'
-}
if LOAD_DATA_FROM_CHECKPOINT:
with zipfile.ZipFile('checkpoints/checkpoint_6.csv.zip', 'r') as zip_ref:
zip_ref.extractall('checkpoints/')
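The CASTING dict removed above mapped columns to pandas' nullable extension dtypes: 'Int64' (capital I) keeps a column integer even when it contains missing values, whereas a plain numpy int64 column is upcast to float as soon as a NaN appears. Such a mapping is typically applied with astype (hypothetical usage, since the call site is not shown in this diff):

incidents_df = incidents_df.astype(CASTING)  # NaN-safe integer and float columns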
@@ -2315,10 +2290,9 @@ def max_min_value(attribute): # FIXME: convert to float, exclude <= 122 and >
'firearm', 'air_gun', 'shots', 'aggression', 'suicide', 'injuries',
'death', 'road', 'illegal_holding', 'house', 'school', 'children',
'drugs', 'officers', 'organized', 'social_reasons', 'defensive',
-    'workplace', 'abduction', 'unintentional'] #TODO: add tag_consistency
+    'workplace', 'abduction', 'unintentional']

external_columns = ['povertyPercentage', 'party', 'candidatevotes', 'totalvotes', 'candidateperc', 'population_state_2010']
-#TODO: majority state party?

# %% [markdown]
# We re-order the columns and save the cleaned dataset:
@@ -2344,4 +2318,4 @@ def max_min_value(attribute): # FIXME: convert to float, exclude <= 122 and >
sns.heatmap(corr_matrix, mask=np.triu(corr_matrix))

# %%
-incidents_df.to_csv(DATA_FOLDER_PATH +'incidents_cleaned.csv', index=False)
+incidents_df.to_csv(DATA_FOLDER_PATH +'incidents_cleaned.csv', index=True)
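With index=True the row labels survive the round trip, consistent with the index-based alignment used throughout the notebook; downstream tasks can then re-load the cleaned file with the same labels (a sketch, assuming the default unnamed index column):

incidents_df = pd.read_csv(DATA_FOLDER_PATH + 'incidents_cleaned.csv', index_col=0)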
