Skip to content

Commit

Permalink
plot to chose best state for clustering
Browse files Browse the repository at this point in the history
  • Loading branch information
GiuliaGhisolfi committed Nov 13, 2023
1 parent b6e3e0b commit 4f7d429
Showing 1 changed file with 55 additions and 0 deletions.
55 changes: 55 additions & 0 deletions TASK_2/density_based.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,48 @@ def get_color(row):
incidents_grouped_by_state['nan_entries_date_ratio'] = incidents_grouped_by_state['nan_entries_date'
] / incidents_grouped_by_state['not_nan_entries']

# %%
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Per-state NaN-ratio columns to map, paired index-by-index with the label
# used as the title of the corresponding subplot.
attribute_list = ['nan_entries_city_ratio', 'nan_entries_county_ratio', 'nan_entries_lat_long_ratio',
    'nan_entries_n_participants_ratio', 'nan_entries_date_ratio']
label_list = ['City', 'County', 'Latitude and Longitude', 'Number of Participants', 'Date']

# Grid geometry: fixed number of columns, as many rows as needed.
n_cols = 3
n_rows = -(-len(attribute_list) // n_cols)  # ceiling division

# Choropleth traces must be placed on geo-type subplots: the default 'xy'
# cells created by make_subplots reject them with a ValueError. Cells beyond
# the last attribute are left blank (None).
specs = [
    [
        {'type': 'choropleth'} if r * n_cols + c < len(attribute_list) else None
        for c in range(n_cols)
    ]
    for r in range(n_rows)
]
fig = make_subplots(rows=n_rows, cols=n_cols, specs=specs, subplot_titles=label_list)

for i, attribute in enumerate(attribute_list):
    # Plotly Express is used only to build the trace (colours, hover text).
    # The layout of this temporary figure is discarded, so layout-only options
    # (scope, title, colour-bar title) are applied to `fig` below instead.
    frame = px.choropleth(
        incidents_grouped_by_state,
        color=attribute,
        locations='px_code',
        locationmode="USA-states",
        hover_name='state',
        hover_data={
            'px_code': False,
            'not_nan_entries': True,
            'nan_entries_city': True,
            'nan_entries_county': True,
            'nan_entries_lat_long': True,
            'nan_entries_n_participants': True,
            'nan_entries_date': True,
        },
    )

    # Fill the grid in row-major order (plotly rows/cols are 1-based).
    row, col = divmod(i, n_cols)
    fig.add_trace(frame.data[0], row=row + 1, col=col + 1)

# Restrict every geo subplot to the USA; the scope passed to px.choropleth
# would only have affected the discarded per-attribute figure.
fig.update_geos(scope='usa')

# All traces share the figure's default colour axis, so a single colour bar
# describes every map and the ratios are directly comparable across subplots.
fig.update_layout(
    title_text="Number of NaN entries by state for different attributes",
    showlegend=False,
    coloraxis_colorbar=dict(title='Ratio NaN entries'),
)

fig.show()

# %%
attribute_list = ['nan_entries_city_ratio', 'nan_entries_county_ratio', 'nan_entries_lat_long_ratio',
'nan_entries_n_participants_ratio', 'nan_entries_date_ratio']
Expand Down Expand Up @@ -256,6 +298,19 @@ def create_dataset(state):
# %% [markdown]
# # Density clustering

# %% [markdown]
# DBSCAN is a density-based clustering algorithm: it defines a cluster as a dense region of objects.
#
# It is a partitional clustering method in which the number of clusters is detected automatically by the algorithm.
# Points in low-density regions are classified as noise.
#
# Pros: it can handle irregular clusters of arbitrary shape and size, and works well when noise or outliers are present.
# It can find many clusters that K-means could not find.
#
# Cons: it is not able to classify points correctly when the clusters have widely varying densities, and it has trouble with high-dimensional data because density is difficult to define.
#
#

# %% [markdown]
# ## Indices correlation

Expand Down

0 comments on commit 4f7d429

Please sign in to comment.