quick notebook to load and cluster GLM v24 dff #833

Status: Open. Wants to merge 25 commits into base: master.

Changes from all commits (25 commits):
70dab03
fixing clustering instance
yavorska-iryna Sep 23, 2022
f131e21
adding option to edit abbreviations in plot
yavorska-iryna Sep 23, 2022
64cfac6
quick notebook to check familiar clsuters
yavorska-iryna Sep 23, 2022
50033fe
updated
yavorska-iryna Sep 28, 2022
862d48c
editing plot_cluster_size to plot abs difference
yavorska-iryna Sep 28, 2022
3720fb7
adding variability df functions
yavorska-iryna Oct 7, 2022
2ff3ca9
started notebook to make summary panels
yavorska-iryna Oct 7, 2022
7336c67
notebooks for familiar and novel controls
yavorska-iryna Oct 7, 2022
f3eaec0
updated panels E and F for figure 4
yavorska-iryna Oct 13, 2022
692cd91
improving cumulative plot
yavorska-iryna Oct 14, 2022
5588d6f
updated notebook
yavorska-iryna Nov 7, 2022
a02f890
small edit
yavorska-iryna Nov 8, 2022
3c41dcb
small edit
yavorska-iryna Nov 8, 2022
92d9d88
updated notebooks
yavorska-iryna Nov 8, 2022
edaa89b
notebook to cluster glm output on dff traces
yavorska-iryna Nov 8, 2022
7e28e33
adding GLM code to load across normalized rspm
yavorska-iryna Nov 9, 2022
b29b19f
linting
yavorska-iryna Nov 9, 2022
baddb92
behavior object is not defined!
yavorska-iryna Nov 9, 2022
9dc9942
multi session def traces
yavorska-iryna Nov 9, 2022
54201af
changed path in cache dir to work with pc and mac
yavorska-iryna Nov 9, 2022
f0abdbc
adding more figure panels
yavorska-iryna Nov 12, 2022
da83ff6
cleaning up the plots
yavorska-iryna Dec 2, 2022
7e39bac
figure updates
yavorska-iryna Jan 10, 2023
5a87520
delete summary figures notebook
yavorska-iryna Jan 11, 2023
8f52f23
delete novel control notebook
yavorska-iryna Jan 11, 2023
310 changes: 106 additions & 204 deletions notebooks/220920_Figure4_presentation.ipynb

Large diffs are not rendered by default.

2,462 changes: 2,462 additions & 0 deletions notebooks/221004_Figure4_summary_panels.ipynb

Large diffs are not rendered by default.

2,138 changes: 2,138 additions & 0 deletions notebooks/221107_figure_4_clustering_dff_control.ipynb

Large diffs are not rendered by default.

14,995 changes: 14,995 additions & 0 deletions notebooks/221109_multi_session_df_loading_for_VB_platform_paper.html

Large diffs are not rendered by default.

679 changes: 679 additions & 0 deletions notebooks/221109_multi_session_df_loading_for_VB_platform_paper.ipynb

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions notebooks/dabase_functions_example_useage.ipynb
@@ -1239,9 +1239,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "visual_behavior_analysis",
"display_name": "vba",
"language": "python",
"name": "visual_behavior_analysis"
"name": "vba"
},
"language_info": {
"codemirror_mode": {
@@ -1257,5 +1257,5 @@
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}
2 changes: 1 addition & 1 deletion visual_behavior/data_access/loading.py
@@ -97,7 +97,7 @@ def get_platform_analysis_cache_dir():
This is the cache directory to use for all platform paper analysis
This cache contains NWB files downloaded directly from AWS
"""
return r'/allen/programs/braintv/workgroups/nc-ophys/visual_behavior/platform_paper_cache'
return '//allen/programs/braintv/workgroups/nc-ophys/visual_behavior/platform_paper_cache'
# return r'\\allen\programs\braintv\workgroups\nc-ophys\visual_behavior\platform_paper_cache'


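The only change here is the cache path literal: per the commit message, the double-forward-slash form was chosen so one string works on both PC and Mac. A minimal consumption sketch; only get_platform_analysis_cache_dir comes from this diff, and the pathlib handling plus the assumption that the Allen share is mounted are illustrative:

from pathlib import Path

from visual_behavior.data_access import loading

# '//allen/...' is read as a UNC-style share root on Windows and as an ordinary
# absolute path on Mac/Linux where the share is mounted at /allen, so one literal serves both.
cache_dir = Path(loading.get_platform_analysis_cache_dir())
print(cache_dir, cache_dir.exists())  # exists() is True only where the share is reachable
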
42 changes: 26 additions & 16 deletions visual_behavior/dimensionality_reduction/clustering/plotting.py
@@ -445,7 +445,7 @@ def plot_coclustering_matrix_sorted_by_cluster_size(coclustering_matrices, clust
fig, ax = plt.subplots(figsize=figsize)
ax = sns.heatmap(sorted_coclustering_matrix, cmap="Greys", ax=ax, square=True,
cbar=True, cbar_kws={"drawedges": False, "label": 'probability of\nco-clustering', 'shrink': 0.7, },)
ax.set_title(get_cell_type_for_cre_line(cre_line, cluster_meta))
ax.set_title(processing.get_cre_line_map(cre_line))
ax.set_title('')
ax.set_yticks((0, sorted_coclustering_matrix.shape[0]))
ax.set_yticklabels((0, sorted_coclustering_matrix.shape[0]), fontsize=20)
@@ -502,7 +502,7 @@ def plot_umap_for_clusters(cluster_meta, feature_matrix, label_col='cluster_id',

label_col: column in cluster_meta to colorize points by
"""
import umap
# import umap
figsize = (15, 4)
fig, ax = plt.subplots(1, 3, figsize=figsize)
for i, cre_line in enumerate(get_cre_lines(cluster_meta)):
@@ -1231,11 +1231,14 @@ def get_cluster_color_map(labels):

def plot_N_clusters_by_cre_line(labels_cre, ax=None, palette=None):
if ax is None:
fig, ax = plt.subplots(1, 1, figsize=(7, 7))
fig, ax = plt.subplots(1, 1, figsize=(6, 3))

if palette is None:
palette = [(1.0, 0.596078431372549, 0.5882352941176471),
(0.6196078431372549, 0.8549019607843137, 0.8980392156862745),
(0.7725490196078432, 0.6901960784313725, 0.8352941176470589)]
markersize = [14, 10, 10]
linewidth = [4, 2, 2]
for i, cre_line in enumerate(labels_cre.keys()):
labels = labels_cre[cre_line]
(unique, counts) = np.unique(labels, return_counts=True)
@@ -1244,11 +1247,14 @@ def plot_N_clusters_by_cre_line(labels_cre, ax=None, palette=None):
cumulative_sum = [0]
for freq in frequencies:
cumulative_sum.append(cumulative_sum[-1] + freq)
ax.grid()

ax.plot(range(0, len(cumulative_sum)), cumulative_sum, 'o-', color=palette[i],
linewidth=4, markersize=10)
linewidth=linewidth[i], markersize=markersize[i])
ax.hlines(y=80, xmin=-0.5, xmax=11, linestyle='--', color='Grey')
ax.set_xlim([-0.5, 11])
ax.set_xticks(np.arange(0, len(cumulative_sum)))
ax.set_xlabel('Cluster number')
ax.set_ylabel('Cells per cluster (%)')
ax.set_ylabel('Cumulative % of all cells')
ax.legend(['Excitatory', 'SST inhibitory', 'VIP inhibitory'])

return ax
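
The reworked plot now shows the cumulative percentage of cells captured as clusters are added, with a dashed 80% reference line. A quick numpy sketch of that bookkeeping on toy labels; frequencies is assumed to be the per-cluster counts converted to percent and sorted largest-first, which is implied by the percentage axis but not shown in this hunk:

import numpy as np

labels = np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 4])  # toy cluster assignments for one cre line
unique, counts = np.unique(labels, return_counts=True)
frequencies = 100 * np.sort(counts)[::-1] / counts.sum()  # % of cells, largest cluster first (assumed)
cumulative_sum = [0]
for freq in frequencies:
    cumulative_sum.append(cumulative_sum[-1] + freq)
print(cumulative_sum)  # [0, 40.0, 70.0, 90.0, 100.0] -> three clusters already clear the 80% line
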
@@ -1611,7 +1617,7 @@ def plot_population_average_response_for_cluster(cluster_mdf, cre_line, cluster_


def plot_clusters_row(cluster_meta, feature_matrix, cre_line,
sort_order=None, rename_clusters=False, save_dir=None, folder=None, suffix='', formats=['.png', '.pdf']):
sort_order=None, rename_clusters=False, save_dir=None, folder=None, suffix='', abbreviate_experience=True, formats=['.png', '.pdf']):
"""
For each cluster in a given cre_line, plots dropout heatmaps, fraction cells per area/depth relative to chance,
fraction cells per cluster per area/depth, and population average omission response.
@@ -1649,7 +1655,7 @@ def plot_clusters_row(cluster_meta, feature_matrix, cre_line,
for i, cluster_id in enumerate(cluster_ids):
# plot mean dropout heatmap for this cluster
ax[i] = plot_dropout_heatmap(cluster_meta, feature_matrix, cre_line, cluster_id,
abbreviate_experience=True, abbreviate_features=True, ax=ax[i])
abbreviate_experience=abbreviate_experience, abbreviate_features=True, ax=ax[i])

# # population average for this cluster
# ax[i + (n_clusters * 1)] = plot_population_average_response_for_cluster(cluster_mdf, cre_line, cluster_id,
@@ -1997,7 +2003,7 @@ def plot_gap_statistic(gap_statistic, cre_lines=None, n_clusters_cre=None, tag='
n_clusters = n_clusters_cre[cre_line]
x = len(gap_statistic[cre_line]['gap'])

figsize = (10, 4)
figsize = (8, 4)
fig, ax = plt.subplots(1, 2, figsize=figsize)

ax[0].plot(np.arange(1, x + 1), gap_statistic[cre_line]['reference_inertia'], 'o-')
@@ -2011,6 +2017,7 @@
ax[1].set_ylabel('Gap statistic')
ax[1].set_xlabel('Number of clusters')
ax[1].axvline(x=n_clusters, ymin=0, ymax=1, linestyle='--', color='gray')
ax[1].set_ylim([0,0.3])

title = processing.get_cre_line_map(cre_line) # get a more interpretable cell type name
plt.suptitle(title)
@@ -2071,7 +2078,10 @@ def plot_eigengap_values(eigenvalues_cre, cre_lines, n_clusters_cre=None, save_d
n_clusters_cre = processing.get_n_clusters_cre()

for cre_line in cre_lines:
eigenvalues = eigenvalues_cre[cre_line][1]
if len(eigenvalues_cre[cre_line]) < 4:  # patchwork here
eigenvalues = eigenvalues_cre[cre_line][1]
else:
eigenvalues = eigenvalues_cre[cre_line]
n_clusters = n_clusters_cre[cre_line]
suffix = cre_line
title = processing.get_cre_line_map(cre_line) # get a more interpretable cell type name
@@ -2706,7 +2716,7 @@ def plot_cluster_proportions_treemaps(location_fractions, cluster_meta, color_co


# plot cluster sizes
def plot_cluster_size_for_cluster(cluster_size_df, cluster_id, stats_table, ax=None):
def plot_cluster_size_for_cluster(cluster_size_df, cluster_id, stats_table, diff_column, ax=None):
if ax is None:
fig, ax = plt.subplots()

@@ -2716,9 +2726,9 @@ def plot_cluster_size_for_cluster(cluster_size_df, cluster_id, stats_table, ax=N
ax.spines['right'].set_visible(False)
# plot size diff
ax = sns.barplot(data=cluster_size_df[cluster_size_df['cluster_id'] == cluster_id], x='cluster_id',
y='cluster_size_diff', color=color1, ax=ax)
y=diff_column, color=color1, ax=ax)
if stats_table is not None:
y = 1
y = cluster_size_df[[diff_column, 'cluster_id']].groupby('cluster_id').mean().max()
if stats_table[stats_table.cluster_id == cluster_id]['significant'].values[0] == True:
color = 'DarkRed'
pvalue = round(stats_table[stats_table.cluster_id == cluster_id]['imq'].values[0], 4)
@@ -2741,7 +2751,7 @@ def plot_cluster_size_for_cluster(cluster_size_df, cluster_id, stats_table, ax=N

ax.axhline(0, color='gray')
ax.set_xlabel('')
ax.set_ylim([-0.4, 1])
# ax.set_ylim([-0.4, 1])
ax.set_xlim([-1, 1])
ax.set_xticklabels('', fontsize=12)
ax.set_yticklabels(ax.get_yticklabels(), fontsize=12)
@@ -2757,7 +2767,7 @@ def plot_cluster_size_for_cluster(cluster_size_df, cluster_id, stats_table, ax=N
return ax


def plot_cluster_size(cluster_size_df, cre_line=None, shuffle_type=None, stats_table=None,
def plot_cluster_size(cluster_size_df, cre_line=None, shuffle_type=None, stats_table=None, diff_column='cluster_size_diff',
ax=None, figsize=None, save_dir=None, folder=None):
if cre_line is not None:
if isinstance(cre_line, str):
@@ -2779,7 +2789,7 @@ def plot_cluster_size(cluster_size_df, cre_line=None, shuffle_type=None, stats_t

# plot cluster size first
for i, cluster_id in enumerate(cluster_ids):
ax[i] = plot_cluster_size_for_cluster(cluster_size_df, cluster_id, stats_table=stats_table, ax=ax[i])
ax[i] = plot_cluster_size_for_cluster(cluster_size_df, cluster_id, stats_table=stats_table, diff_column=diff_column, ax=ax[i])

fig.subplots_adjust(hspace=1.2, wspace=0.6)
plt.suptitle(processing.get_shuffle_label(shuffle_type), x=0.52, y=1.15)
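
Threading diff_column through plot_cluster_size and plot_cluster_size_for_cluster lets the same panels plot either the original relative size change or the new absolute difference, with the significance marker and y-limits now following the data. A small pandas sketch of what such columns might contain; every column name and value below is invented for illustration, and only the diff_column keyword itself comes from this diff:

import pandas as pd

cluster_size_df = pd.DataFrame({'cluster_id': [1, 2, 3],
                                'data_size': [120, 80, 40],          # invented cells per cluster
                                'shuffle_mean_size': [90, 95, 55]})  # invented shuffle expectation

# signed relative change (a guess at what the default 'cluster_size_diff' holds)
cluster_size_df['cluster_size_diff'] = ((cluster_size_df['data_size'] - cluster_size_df['shuffle_mean_size'])
                                        / cluster_size_df['shuffle_mean_size'])
# absolute difference, in the spirit of the commit "editing plot_cluster_size to plot abs difference"
cluster_size_df['abs_cluster_size_diff'] = (cluster_size_df['data_size'] - cluster_size_df['shuffle_mean_size']).abs()

# either column can then be selected via diff_column in plot_cluster_size(...)
print(cluster_size_df)
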
108 changes: 93 additions & 15 deletions visual_behavior/dimensionality_reduction/clustering/processing.py
@@ -21,6 +21,8 @@
import visual_behavior.data_access.utilities as utilities

import visual_behavior_glm.GLM_analysis_tools as gat
import visual_behavior_glm.GLM_across_session as gas

import visual_behavior_glm.GLM_clustering as glm_clust
import visual_behavior_glm.GLM_params as glm_params
from statsmodels.stats.proportion import proportion_confint
@@ -213,24 +215,23 @@ def pivot_df(df, dropna=True, drop_duplicated_cells=True):
# loading & processing ###

def get_glm_results_pivoted_for_clustering(glm_version='24_events_all_L2_optimize_by_session',
model_output_type='adj_fraction_change_from_full', save_dir=None):
model_output_type='adj_fraction_change_from_full',
across_sessions_normalized=True, save_dir=None):
"""
loads GLM results pivoted where columns are dropout scores and rows are cells in specific experiments
filters to limit to cells matched in all experience levels
loads from file if file exists in save_dir, otherwise generates from mongo and saves to save dir
"""
if across_sessions_normalized and save_dir:
results_file_path = os.path.join(save_dir, glm_version + '_across_normalized_results_pivoted.h5')
elif save_dir:
results_file_path = os.path.join(save_dir, glm_version + '_results_pivoted.h5')

results_file_path = os.path.join(save_dir, glm_version + '_results_pivoted.h5')
if os.path.exists(results_file_path):
if os.path.exists(results_file_path) is False:
print('loading results_pivoted from', results_file_path)
results_pivoted = pd.read_hdf(results_file_path, key='df')

else: # if it doesnt exist, then load it and save (note this is slow)
results_pivoted = gat.build_pivoted_results_summary(value_to_use=model_output_type, results_summary=None,
glm_version=glm_version, cutoff=None)

# get rid of passive sessions
results_pivoted = results_pivoted[results_pivoted.passive == False]

# get cells table and limit to cells matched in closest familiar and novel active
cells_table = loading.get_cell_table()
@@ -244,16 +245,26 @@ def get_glm_results_pivoted_for_clustering(glm_version='24_events_all_L2_optimiz
matched_experiments = cells_table.ophys_experiment_id.unique()
matched_cells = cells_table.cell_specimen_id.unique()

if across_sessions_normalized is False:
results_pivoted = gat.build_pivoted_results_summary(value_to_use=model_output_type,
results_summary=None,
glm_version=glm_version, cutoff=None)

# get rid of passive sessions
results_pivoted = results_pivoted[results_pivoted.passive == False]
elif across_sessions_normalized is True:
results_pivoted, _ = gas.load_cells(glm_version=glm_version)

# limit results pivoted to to last familiar and second novel
results_pivoted = results_pivoted[results_pivoted.ophys_experiment_id.isin(matched_experiments)]
results_pivoted = results_pivoted[results_pivoted.cell_specimen_id.isin(matched_cells)]
print(len(results_pivoted.cell_specimen_id.unique()),
'cells in results_pivoted after limiting to strictly matched cells')

if save_dir:
# save filtered results to save_dir
results_pivoted.to_hdf(os.path.join(save_dir, glm_version + '_results_pivoted.h5'), key='df')
print('results_pivoted saved')
# if save_dir:
# # save filtered results to save_dir
# results_pivoted.to_hdf(results_file_path, key='df')
# print('results_pivoted saved')
return results_pivoted
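
With across_sessions_normalized=True the loader pulls the across-session-normalized dropouts via GLM_across_session.load_cells and looks for a '_across_normalized_results_pivoted.h5' cache; with False it falls back to the mongo-built pivoted results as before. A hedged call sketch, with a placeholder save_dir:

from visual_behavior.dimensionality_reduction.clustering import processing

save_dir = '/path/to/platform_paper_cache/glm_results'  # placeholder directory holding the cached .h5

results_pivoted = processing.get_glm_results_pivoted_for_clustering(
    glm_version='24_events_all_L2_optimize_by_session',
    model_output_type='adj_fraction_change_from_full',
    across_sessions_normalized=True,  # use GLM_across_session.load_cells instead of mongo
    save_dir=save_dir)
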


@@ -748,6 +759,8 @@ def get_labels_for_coclust_matrix(X, model=SpectralClustering, nboot=np.arange(1
___________
:return: labels: matrix of labels, n repeats by n observations
'''
if model is SpectralClustering:
model = model()
labels = []
if n_clusters is not None:
model.n_clusters = n_clusters
@@ -767,6 +780,7 @@ def get_coClust_matrix(X, model=SpectralClustering, nboot=np.arange(150), n_clus
______________
returns: coClust_matrix: (ndarray) probability matrix of co-clustering together.
'''
#model = model()
labels = get_labels_for_coclust_matrix(X=X,
model=model,
nboot=nboot,
@@ -1588,7 +1602,7 @@ def shuffle_dropout_score(df_dropout, shuffle_type='all'):
Returns:
df_shuffled (pd. Dataframe) of shuffled dropout scores
'''
df_shuffled = df_dropout.copy(deep=True)
df_shuffled = df_dropout.copy()
regressors = df_dropout.columns.levels[0].values
experience_levels = df_dropout.columns.levels[1].values
if shuffle_type == 'all':
@@ -2422,5 +2436,69 @@ def get_shuffle_label(shuffle_type):
return shuffle_type_dict[shuffle_type]


def get_cluster_size_statistics_df():
return 10
def compute_sse(feature_matrix):
'''
Computes Sum of Squared Error between each cell in feature matrix and the mean.

INPUT:
feature_matrix (pd.DataFrame) dropout scores, rows are cell specimen ids

Returns:
SSE (list) of sse values between each cell and their mean.
'''

mean_values = feature_matrix.mean().values

SSE = np.sum(np.subtract(feature_matrix.values, mean_values) ** 2, axis=1)

return SSE


def get_variability_df(feature_matrix, cluster_df, columns=['cluster_id', 'cre_line', 'clustered'], metric='sse'):
'''
INPUT:
feature_matrix:
cluster_df: (pd.DataFrame) dataframe with columns ['cre_line', 'cluster_id'] and cell specimen id as an index
metric: (string)

Returns:
variability_df
'''

variability_df = pd.DataFrame(columns=columns)
cre_lines = np.sort(vba_clust.get_cre_lines(cluster_df))

columns = [*columns, metric]

if 'cell_specimen_id' in cluster_df.keys():
cluster_df.set_index('cell_specimen_id', inplace=True)

for cre_line in cre_lines:
print(cre_line)
cre_cluster_df = cluster_df[cluster_df.cre_line == cre_line]
cre_cell_ids = cre_cluster_df.index.values
cre_feature_matrix = feature_matrix.loc[cre_cell_ids]

cluster_ids = np.sort(cre_cluster_df['cluster_id'].values)
# compute values for each cluster id
for cluster_id in cluster_ids:
cluster_cids = cre_cluster_df[cre_cluster_df.cluster_id == cluster_id].index.values
cluster_feature_matrix = cre_feature_matrix.loc[cluster_cids]
if metric is 'sse':
values = compute_sse(cluster_feature_matrix)

variability_df = variability_df.append(pd.DataFrame({'cre_line': [cre_line] * len(values),
'cluster_id': [cluster_id] * len(values),
'clustered': [True] * len(values),
metric: values}, index=np.arange(len(values))),
ignore_index=True)

if metric is 'sse':
values = compute_sse(cre_feature_matrix)
variability_df = variability_df.append(pd.DataFrame({'cre_line': [cre_line] * len(values),
'cluster_id': [np.nan] * len(values),
'clustered': [False] * len(values),
metric: values}, index=np.arange(len(values))),
ignore_index=True)

return variability_df
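
compute_sse measures how far each cell's dropout profile sits from the column-wise mean, and get_variability_df tabulates that per cluster against the unclustered cre-line pool. A quick numeric check of the SSE definition on an invented feature matrix:

import pandas as pd
from visual_behavior.dimensionality_reduction.clustering import processing

# invented dropout-score matrix: rows are cell_specimen_ids, columns are regressors
feature_matrix = pd.DataFrame([[0.1, 0.4], [0.3, 0.2], [0.2, 0.3]],
                              index=[101, 102, 103], columns=['omissions', 'images'])

sse = processing.compute_sse(feature_matrix)
# column means are [0.2, 0.3]; first row -> (0.1 - 0.2)**2 + (0.4 - 0.3)**2 = 0.02
print(sse)  # approximately [0.02, 0.02, 0.0]
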
5 changes: 4 additions & 1 deletion visual_behavior/utilities.py
@@ -1336,7 +1336,10 @@ def cache_response_probability(behavior_session_id, engaged_only=True):
stimulus_presentations = stimulus_presentations[stimulus_presentations.engagement_state == 'engaged']

# compute response probability
response_matrix = behavior.calculate_response_matrix(stimulus_presentations, aggfunc=np.mean, sort_by_column=True,
# BEHAVIOR OBJECT IS NOT DEFINED. CHANGED IT TO DATASET, BUT NOT SURE IF THAT'S CORRECT ONE. IRYNA NOV 11,2022
# response_matrix = behavior.calculate_response_matrix(stimulus_presentations, aggfunc=np.mean, sort_by_column=True,
# engaged_only=engaged_only)
response_matrix = dataset.calculate_response_matrix(stimulus_presentations, aggfunc=np.mean, sort_by_column=True,
engaged_only=engaged_only)

filename = 'behavior_session_id={}.h5'.format(behavior_session_id)
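
The final change swaps the undefined behavior object for dataset when computing the response matrix, with the author's in-line note flagging the uncertainty. A hedged usage sketch of the caching helper; the session id is made up, and only the function name, arguments, and output filename pattern come from this diff:

from visual_behavior.utilities import cache_response_probability

behavior_session_id = 951410079  # made-up id
# engaged_only=True keeps only presentations with engagement_state == 'engaged'
cache_response_probability(behavior_session_id, engaged_only=True)
# per the diff, the result is written to 'behavior_session_id={}.h5'.format(behavior_session_id)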