quick notebook to load and cluster GLM v24 dff #833

Status: Open. Wants to merge 25 commits into base: master.

Changes from all commits (25 commits):
70dab03
fixing clustering instance
yavorska-iryna Sep 23, 2022
f131e21
adding option to edit abbreviations in plot
yavorska-iryna Sep 23, 2022
64cfac6
quick notebook to check familiar clsuters
yavorska-iryna Sep 23, 2022
50033fe
updated
yavorska-iryna Sep 28, 2022
862d48c
editing plot_cluster_size to plot abs difference
yavorska-iryna Sep 28, 2022
3720fb7
adding variability df functions
yavorska-iryna Oct 7, 2022
2ff3ca9
started notebook to make summary panels
yavorska-iryna Oct 7, 2022
7336c67
notebooks for familiar and novel controls
yavorska-iryna Oct 7, 2022
f3eaec0
updated panels E and F for figure 4
yavorska-iryna Oct 13, 2022
692cd91
improving cumulative plot
yavorska-iryna Oct 14, 2022
5588d6f
updated notebook
yavorska-iryna Nov 7, 2022
a02f890
small edit
yavorska-iryna Nov 8, 2022
3c41dcb
small edit
yavorska-iryna Nov 8, 2022
92d9d88
updated notebooks
yavorska-iryna Nov 8, 2022
edaa89b
notebook to cluster glm output on dff traces
yavorska-iryna Nov 8, 2022
7e28e33
adding GLM code to load across normalized rspm
yavorska-iryna Nov 9, 2022
b29b19f
linting
yavorska-iryna Nov 9, 2022
baddb92
behavior object is not defined!
yavorska-iryna Nov 9, 2022
9dc9942
multi session def traces
yavorska-iryna Nov 9, 2022
54201af
changed path in cache dir to work with pc and mac
yavorska-iryna Nov 9, 2022
f0abdbc
adding more figure panels
yavorska-iryna Nov 12, 2022
da83ff6
cleaning up the plots
yavorska-iryna Dec 2, 2022
7e39bac
figure updates
yavorska-iryna Jan 10, 2023
5a87520
delete summary figures notebook
yavorska-iryna Jan 11, 2023
8f52f23
delete novel control notebook
yavorska-iryna Jan 11, 2023
310 changes: 106 additions & 204 deletions notebooks/220920_Figure4_presentation.ipynb

Large diffs are not rendered by default.

2,462 changes: 2,462 additions & 0 deletions notebooks/221004_Figure4_summary_panels.ipynb

Large diffs are not rendered by default.

2,138 changes: 2,138 additions & 0 deletions notebooks/221107_figure_4_clustering_dff_control.ipynb

Large diffs are not rendered by default.

14,995 changes: 14,995 additions & 0 deletions notebooks/221109_multi_session_df_loading_for_VB_platform_paper.html

Large diffs are not rendered by default.

679 changes: 679 additions & 0 deletions notebooks/221109_multi_session_df_loading_for_VB_platform_paper.ipynb

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions notebooks/dabase_functions_example_useage.ipynb
@@ -1239,9 +1239,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "visual_behavior_analysis",
"display_name": "vba",
"language": "python",
"name": "visual_behavior_analysis"
"name": "vba"
},
"language_info": {
"codemirror_mode": {
@@ -1257,5 +1257,5 @@
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}
2 changes: 1 addition & 1 deletion visual_behavior/data_access/loading.py
@@ -97,7 +97,7 @@ def get_platform_analysis_cache_dir():
This is the cache directory to use for all platform paper analysis
This cache contains NWB files downloaded directly from AWS
"""
return r'/allen/programs/braintv/workgroups/nc-ophys/visual_behavior/platform_paper_cache'
return '//allen/programs/braintv/workgroups/nc-ophys/visual_behavior/platform_paper_cache'
# return r'\\allen\programs\braintv\workgroups\nc-ophys\visual_behavior\platform_paper_cache'


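The only change here is the cache path literal: per the commit message, the double-forward-slash form was chosen so one string works on both PC and Mac. A minimal consumption sketch; only get_platform_analysis_cache_dir comes from this diff, and the pathlib handling plus the assumption that the Allen share is mounted are illustrative:

from pathlib import Path

from visual_behavior.data_access import loading

# '//allen/...' is read as a UNC-style share root on Windows and as an ordinary
# absolute path on Mac/Linux where the share is mounted at /allen, so one literal serves both.
cache_dir = Path(loading.get_platform_analysis_cache_dir())
print(cache_dir, cache_dir.exists())  # exists() is True only where the share is reachable
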
42 changes: 26 additions & 16 deletions visual_behavior/dimensionality_reduction/clustering/plotting.py
@@ -445,7 +445,7 @@ def plot_coclustering_matrix_sorted_by_cluster_size(coclustering_matrices, clust
fig, ax = plt.subplots(figsize=figsize)
ax = sns.heatmap(sorted_coclustering_matrix, cmap="Greys", ax=ax, square=True,
cbar=True, cbar_kws={"drawedges": False, "label": 'probability of\nco-clustering', 'shrink': 0.7, },)
ax.set_title(get_cell_type_for_cre_line(cre_line, cluster_meta))
ax.set_title(processing.get_cre_line_map(cre_line))
ax.set_title('')
ax.set_yticks((0, sorted_coclustering_matrix.shape[0]))
ax.set_yticklabels((0, sorted_coclustering_matrix.shape[0]), fontsize=20)
@@ -502,7 +502,7 @@ def plot_umap_for_clusters(cluster_meta, feature_matrix, label_col='cluster_id',

label_col: column in cluster_meta to colorize points by
"""
import umap
# import umap
figsize = (15, 4)
fig, ax = plt.subplots(1, 3, figsize=figsize)
for i, cre_line in enumerate(get_cre_lines(cluster_meta)):
@@ -1231,11 +1231,14 @@ def get_cluster_color_map(labels):

def plot_N_clusters_by_cre_line(labels_cre, ax=None, palette=None):
if ax is None:
fig, ax = plt.subplots(1, 1, figsize=(7, 7))
fig, ax = plt.subplots(1, 1, figsize=(6, 3))

if palette is None:
palette = [(1.0, 0.596078431372549, 0.5882352941176471),
(0.6196078431372549, 0.8549019607843137, 0.8980392156862745),
(0.7725490196078432, 0.6901960784313725, 0.8352941176470589)]
markersize = [14, 10, 10]
linewidth = [4, 2, 2]
for i, cre_line in enumerate(labels_cre.keys()):
labels = labels_cre[cre_line]
(unique, counts) = np.unique(labels, return_counts=True)
@@ -1244,11 +1247,14 @@ def plot_N_clusters_by_cre_line(labels_cre, ax=None, palette=None):
cumulative_sum = [0]
for freq in frequencies:
cumulative_sum.append(cumulative_sum[-1] + freq)
ax.grid()

ax.plot(range(0, len(cumulative_sum)), cumulative_sum, 'o-', color=palette[i],
linewidth=4, markersize=10)
linewidth=linewidth[i], markersize=markersize[i])
ax.hlines(y=80, xmin=-0.5, xmax=11, linestyle='--', color='Grey')
ax.set_xlim([-0.5, 11])
ax.set_xticks(np.arange(0, len(cumulative_sum)))
ax.set_xlabel('Cluster number')
ax.set_ylabel('Cells per cluster (%)')
ax.set_ylabel('Cumulative % of all cells')
ax.legend(['Excitatory', 'SST inhibitory', 'VIP inhibitory'])

return ax
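
The reworked plot now shows the cumulative percentage of cells captured as clusters are added, with a dashed 80% reference line. A quick numpy sketch of that bookkeeping on toy labels; frequencies is assumed to be the per-cluster counts converted to percent and sorted largest-first, which is implied by the percentage axis but not shown in this hunk:

import numpy as np

labels = np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 4])  # toy cluster assignments for one cre line
unique, counts = np.unique(labels, return_counts=True)
frequencies = 100 * np.sort(counts)[::-1] / counts.sum()  # % of cells, largest cluster first (assumed)
cumulative_sum = [0]
for freq in frequencies:
    cumulative_sum.append(cumulative_sum[-1] + freq)
print(cumulative_sum)  # [0, 40.0, 70.0, 90.0, 100.0] -> three clusters already clear the 80% line
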
@@ -1611,7 +1617,7 @@ def plot_population_average_response_for_cluster(cluster_mdf, cre_line, cluster_


def plot_clusters_row(cluster_meta, feature_matrix, cre_line,
sort_order=None, rename_clusters=False, save_dir=None, folder=None, suffix='', formats=['.png', '.pdf']):
sort_order=None, rename_clusters=False, save_dir=None, folder=None, suffix='', abbreviate_experience=True, formats=['.png', '.pdf']):
"""
For each cluster in a given cre_line, plots dropout heatmaps, fraction cells per area/depth relative to chance,
fraction cells per cluster per area/depth, and population average omission response.
@@ -1649,7 +1655,7 @@ def plot_clusters_row(cluster_meta, feature_matrix, cre_line,
for i, cluster_id in enumerate(cluster_ids):
# plot mean dropout heatmap for this cluster
ax[i] = plot_dropout_heatmap(cluster_meta, feature_matrix, cre_line, cluster_id,
abbreviate_experience=True, abbreviate_features=True, ax=ax[i])
abbreviate_experience=abbreviate_experience, abbreviate_features=True, ax=ax[i])

# # population average for this cluster
# ax[i + (n_clusters * 1)] = plot_population_average_response_for_cluster(cluster_mdf, cre_line, cluster_id,
@@ -1997,7 +2003,7 @@ def plot_gap_statistic(gap_statistic, cre_lines=None, n_clusters_cre=None, tag='
n_clusters = n_clusters_cre[cre_line]
x = len(gap_statistic[cre_line]['gap'])

figsize = (10, 4)
figsize = (8, 4)
fig, ax = plt.subplots(1, 2, figsize=figsize)

ax[0].plot(np.arange(1, x + 1), gap_statistic[cre_line]['reference_inertia'], 'o-')
@@ -2011,6 +2017,7 @@
ax[1].set_ylabel('Gap statistic')
ax[1].set_xlabel('Number of clusters')
ax[1].axvline(x=n_clusters, ymin=0, ymax=1, linestyle='--', color='gray')
ax[1].set_ylim([0,0.3])

title = processing.get_cre_line_map(cre_line) # get a more interpretable cell type name
plt.suptitle(title)
@@ -2071,7 +2078,10 @@ def plot_eigengap_values(eigenvalues_cre, cre_lines, n_clusters_cre=None, save_d
n_clusters_cre = processing.get_n_clusters_cre()

for cre_line in cre_lines:
eigenvalues = eigenvalues_cre[cre_line][1]
if len(eigenvalues_cre[cre_line]) < 4:  # patchwork here
eigenvalues = eigenvalues_cre[cre_line][1]
else:
eigenvalues = eigenvalues_cre[cre_line]
n_clusters = n_clusters_cre[cre_line]
suffix = cre_line
title = processing.get_cre_line_map(cre_line) # get a more interpretable cell type name
@@ -2706,7 +2716,7 @@ def plot_cluster_proportions_treemaps(location_fractions, cluster_meta, color_co


# plot cluster sizes
def plot_cluster_size_for_cluster(cluster_size_df, cluster_id, stats_table, ax=None):
def plot_cluster_size_for_cluster(cluster_size_df, cluster_id, stats_table, diff_column, ax=None):
if ax is None:
fig, ax = plt.subplots()

@@ -2716,9 +2726,9 @@ def plot_cluster_size_for_cluster(cluster_size_df, cluster_id, stats_table, ax=N
ax.spines['right'].set_visible(False)
# plot size diff
ax = sns.barplot(data=cluster_size_df[cluster_size_df['cluster_id'] == cluster_id], x='cluster_id',
y='cluster_size_diff', color=color1, ax=ax)
y=diff_column, color=color1, ax=ax)
if stats_table is not None:
y = 1
y = cluster_size_df[[diff_column, 'cluster_id']].groupby('cluster_id').mean().max()
if stats_table[stats_table.cluster_id == cluster_id]['significant'].values[0] == True:
color = 'DarkRed'
pvalue = round(stats_table[stats_table.cluster_id == cluster_id]['imq'].values[0], 4)
@@ -2741,7 +2751,7 @@ def plot_cluster_size_for_cluster(cluster_size_df, cluster_id, stats_table, ax=N

ax.axhline(0, color='gray')
ax.set_xlabel('')
ax.set_ylim([-0.4, 1])
# ax.set_ylim([-0.4, 1])
ax.set_xlim([-1, 1])
ax.set_xticklabels('', fontsize=12)
ax.set_yticklabels(ax.get_yticklabels(), fontsize=12)
@@ -2757,7 +2767,7 @@ def plot_cluster_size_for_cluster(cluster_size_df, cluster_id, stats_table, ax=N
return ax


def plot_cluster_size(cluster_size_df, cre_line=None, shuffle_type=None, stats_table=None,
def plot_cluster_size(cluster_size_df, cre_line=None, shuffle_type=None, stats_table=None, diff_column='cluster_size_diff',
ax=None, figsize=None, save_dir=None, folder=None):
if cre_line is not None:
if isinstance(cre_line, str):
@@ -2779,7 +2789,7 @@ def plot_cluster_size(cluster_size_df, cre_line=None, shuffle_type=None, stats_t

# plot cluster size first
for i, cluster_id in enumerate(cluster_ids):
ax[i] = plot_cluster_size_for_cluster(cluster_size_df, cluster_id, stats_table=stats_table, ax=ax[i])
ax[i] = plot_cluster_size_for_cluster(cluster_size_df, cluster_id, stats_table=stats_table, diff_column=diff_column, ax=ax[i])

fig.subplots_adjust(hspace=1.2, wspace=0.6)
plt.suptitle(processing.get_shuffle_label(shuffle_type), x=0.52, y=1.15)
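
Threading diff_column through plot_cluster_size and plot_cluster_size_for_cluster lets the same panels plot either the original relative size change or the new absolute difference, with the significance marker and y-limits now following the data. A small pandas sketch of what such columns might contain; every column name and value below is invented for illustration, and only the diff_column keyword itself comes from this diff:

import pandas as pd

cluster_size_df = pd.DataFrame({'cluster_id': [1, 2, 3],
                                'data_size': [120, 80, 40],          # invented cells per cluster
                                'shuffle_mean_size': [90, 95, 55]})  # invented shuffle expectation

# signed relative change (a guess at what the default 'cluster_size_diff' holds)
cluster_size_df['cluster_size_diff'] = ((cluster_size_df['data_size'] - cluster_size_df['shuffle_mean_size'])
                                        / cluster_size_df['shuffle_mean_size'])
# absolute difference, in the spirit of the commit "editing plot_cluster_size to plot abs difference"
cluster_size_df['abs_cluster_size_diff'] = (cluster_size_df['data_size'] - cluster_size_df['shuffle_mean_size']).abs()

# either column can then be selected via diff_column in plot_cluster_size(...)
print(cluster_size_df)
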
108 changes: 93 additions & 15 deletions visual_behavior/dimensionality_reduction/clustering/processing.py
@@ -21,6 +21,8 @@
import visual_behavior.data_access.utilities as utilities

import visual_behavior_glm.GLM_analysis_tools as gat
import visual_behavior_glm.GLM_across_session as gas

import visual_behavior_glm.GLM_clustering as glm_clust
import visual_behavior_glm.GLM_params as glm_params
from statsmodels.stats.proportion import proportion_confint
@@ -213,24 +215,23 @@ def pivot_df(df, dropna=True, drop_duplicated_cells=True):
# loading & processing ###

def get_glm_results_pivoted_for_clustering(glm_version='24_events_all_L2_optimize_by_session',
model_output_type='adj_fraction_change_from_full', save_dir=None):
model_output_type='adj_fraction_change_from_full',
across_sessions_normalized=True, save_dir=None):
"""
loads GLM results pivoted where columns are dropout scores and rows are cells in specific experiments
filters to limit to cells matched in all experience levels
loads from file if file exists in save_dir, otherwise generates from mongo and saves to save dir
"""
if across_sessions_normalized and save_dir:
results_file_path = os.path.join(save_dir, glm_version + '_across_normalized_results_pivoted.h5')
elif save_dir:
results_file_path = os.path.join(save_dir, glm_version + '_results_pivoted.h5')

results_file_path = os.path.join(save_dir, glm_version + '_results_pivoted.h5')
if os.path.exists(results_file_path):
if os.path.exists(results_file_path) is False:
print('loading results_pivoted from', results_file_path)
results_pivoted = pd.read_hdf(results_file_path, key='df')

else: # if it doesnt exist, then load it and save (note this is slow)
results_pivoted = gat.build_pivoted_results_summary(value_to_use=model_output_type, results_summary=None,
glm_version=glm_version, cutoff=None)

# get rid of passive sessions
results_pivoted = results_pivoted[results_pivoted.passive == False]

# get cells table and limit to cells matched in closest familiar and novel active
cells_table = loading.get_cell_table()
@@ -244,16 +245,26 @@ def get_glm_results_pivoted_for_clustering(glm_version='24_events_all_L2_optimiz
matched_experiments = cells_table.ophys_experiment_id.unique()
matched_cells = cells_table.cell_specimen_id.unique()

if across_sessions_normalized is False:
results_pivoted = gat.build_pivoted_results_summary(value_to_use=model_output_type,
results_summary=None,
glm_version=glm_version, cutoff=None)

# get rid of passive sessions
results_pivoted = results_pivoted[results_pivoted.passive == False]
elif across_sessions_normalized is True:
results_pivoted, _ = gas.load_cells(glm_version=glm_version)

# limit results pivoted to to last familiar and second novel
results_pivoted = results_pivoted[results_pivoted.ophys_experiment_id.isin(matched_experiments)]
results_pivoted = results_pivoted[results_pivoted.cell_specimen_id.isin(matched_cells)]
print(len(results_pivoted.cell_specimen_id.unique()),
'cells in results_pivoted after limiting to strictly matched cells')

if save_dir:
# save filtered results to save_dir
results_pivoted.to_hdf(os.path.join(save_dir, glm_version + '_results_pivoted.h5'), key='df')
print('results_pivoted saved')
# if save_dir:
# # save filtered results to save_dir
# results_pivoted.to_hdf(results_file_path, key='df')
# print('results_pivoted saved')
return results_pivoted
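
With across_sessions_normalized=True the loader pulls the across-session-normalized dropouts via GLM_across_session.load_cells and looks for a '_across_normalized_results_pivoted.h5' cache; with False it falls back to the mongo-built pivoted results as before. A hedged call sketch, with a placeholder save_dir:

from visual_behavior.dimensionality_reduction.clustering import processing

save_dir = '/path/to/platform_paper_cache/glm_results'  # placeholder directory holding the cached .h5

results_pivoted = processing.get_glm_results_pivoted_for_clustering(
    glm_version='24_events_all_L2_optimize_by_session',
    model_output_type='adj_fraction_change_from_full',
    across_sessions_normalized=True,  # use GLM_across_session.load_cells instead of mongo
    save_dir=save_dir)
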


@@ -748,6 +759,8 @@ def get_labels_for_coclust_matrix(X, model=SpectralClustering, nboot=np.arange(1
___________
:return: labels: matrix of labels, n repeats by n observations
'''
if model is SpectralClustering:
model = model()
labels = []
if n_clusters is not None:
model.n_clusters = n_clusters
@@ -767,6 +780,7 @@ def get_coClust_matrix(X, model=SpectralClustering, nboot=np.arange(150), n_clus
______________
returns: coClust_matrix: (ndarray) probability matrix of co-clustering together.
'''
#model = model()
labels = get_labels_for_coclust_matrix(X=X,
model=model,
nboot=nboot,
@@ -1588,7 +1602,7 @@ def shuffle_dropout_score(df_dropout, shuffle_type='all'):
Returns:
df_shuffled (pd. Dataframe) of shuffled dropout scores
'''
df_shuffled = df_dropout.copy(deep=True)
df_shuffled = df_dropout.copy()
regressors = df_dropout.columns.levels[0].values
experience_levels = df_dropout.columns.levels[1].values
if shuffle_type == 'all':
@@ -2422,5 +2436,69 @@ def get_shuffle_label(shuffle_type):
return shuffle_type_dict[shuffle_type]


def get_cluster_size_statistics_df():
return 10
def compute_sse(feature_matrix):
'''
Computes Sum of Squared Error between each cell in feature matrix and the mean.

INPUT:
feature_matrix (pd.DataFrame) dropout scores, rows are cell specimen ids

Returns:
SSE (list) of sse values between each cell and their mean.
'''

mean_values = feature_matrix.mean().values

SSE = np.sum(np.subtract(feature_matrix.values, mean_values) ** 2, axis=1)

return SSE


def get_variability_df(feature_matrix, cluster_df, columns=['cluster_id', 'cre_line', 'clustered'], metric='sse'):
'''
INPUT:
feature_matrix:
cluster_df: (pd.DataFrame) dataframe with columns ['cre_line', 'cluster_id'] and cell specimen id as an index
metric: (string)

Returns:
variability_df
'''

variability_df = pd.DataFrame(columns=columns)
cre_lines = np.sort(vba_clust.get_cre_lines(cluster_df))

columns = [*columns, metric]

if 'cell_specimen_id' in cluster_df.keys():
cluster_df.set_index('cell_specimen_id', inplace=True)

for cre_line in cre_lines:
print(cre_line)
cre_cluster_df = cluster_df[cluster_df.cre_line == cre_line]
cre_cell_ids = cre_cluster_df.index.values
cre_feature_matrix = feature_matrix.loc[cre_cell_ids]

cluster_ids = np.sort(cre_cluster_df['cluster_id'].values)
# compute values for each cluster id
for cluster_id in cluster_ids:
cluster_cids = cre_cluster_df[cre_cluster_df.cluster_id == cluster_id].index.values
cluster_feature_matrix = cre_feature_matrix.loc[cluster_cids]
if metric is 'sse':
values = compute_sse(cluster_feature_matrix)

variability_df = variability_df.append(pd.DataFrame({'cre_line': [cre_line] * len(values),
'cluster_id': [cluster_id] * len(values),
'clustered': [True] * len(values),
metric: values}, index=np.arange(len(values))),
ignore_index=True)

if metric is 'sse':
values = compute_sse(cre_feature_matrix)
variability_df = variability_df.append(pd.DataFrame({'cre_line': [cre_line] * len(values),
'cluster_id': [np.nan] * len(values),
'clustered': [False] * len(values),
metric: values}, index=np.arange(len(values))),
ignore_index=True)

return variability_df
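
compute_sse measures how far each cell's dropout profile sits from the column-wise mean, and get_variability_df tabulates that per cluster against the unclustered cre-line pool. A quick numeric check of the SSE definition on an invented feature matrix:

import pandas as pd
from visual_behavior.dimensionality_reduction.clustering import processing

# invented dropout-score matrix: rows are cell_specimen_ids, columns are regressors
feature_matrix = pd.DataFrame([[0.1, 0.4], [0.3, 0.2], [0.2, 0.3]],
                              index=[101, 102, 103], columns=['omissions', 'images'])

sse = processing.compute_sse(feature_matrix)
# column means are [0.2, 0.3]; first row -> (0.1 - 0.2)**2 + (0.4 - 0.3)**2 = 0.02
print(sse)  # approximately [0.02, 0.02, 0.0]
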
5 changes: 4 additions & 1 deletion visual_behavior/utilities.py
@@ -1336,7 +1336,10 @@ def cache_response_probability(behavior_session_id, engaged_only=True):
stimulus_presentations = stimulus_presentations[stimulus_presentations.engagement_state == 'engaged']

# compute response probability
response_matrix = behavior.calculate_response_matrix(stimulus_presentations, aggfunc=np.mean, sort_by_column=True,
# BEHAVIOR OBJECT IS NOT DEFINED. CHANGED IT TO DATASET, BUT NOT SURE IF THAT'S CORRECT ONE. IRYNA NOV 11,2022
# response_matrix = behavior.calculate_response_matrix(stimulus_presentations, aggfunc=np.mean, sort_by_column=True,
# engaged_only=engaged_only)
response_matrix = dataset.calculate_response_matrix(stimulus_presentations, aggfunc=np.mean, sort_by_column=True,
engaged_only=engaged_only)

filename = 'behavior_session_id={}.h5'.format(behavior_session_id)
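
The final change swaps the undefined behavior object for dataset when computing the response matrix, with the author's in-line note flagging the uncertainty. A hedged usage sketch of the caching helper; the session id is made up, and only the function name, arguments, and output filename pattern come from this diff:

from visual_behavior.utilities import cache_response_probability

behavior_session_id = 951410079  # made-up id
# engaged_only=True keeps only presentations with engagement_state == 'engaged'
cache_response_probability(behavior_session_id, engaged_only=True)
# per the diff, the result is written to 'behavior_session_id={}.h5'.format(behavior_session_id)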