From 6fe8d4c0571a4761e370a4fbc057eff2356b3180 Mon Sep 17 00:00:00 2001 From: Henrique Musseli Cezar Date: Tue, 26 Sep 2023 15:37:50 +0200 Subject: [PATCH] Improved output of aggregates --- utils/aggregates.py | 82 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 70 insertions(+), 12 deletions(-) diff --git a/utils/aggregates.py b/utils/aggregates.py index dbefa930..596aeecc 100644 --- a/utils/aggregates.py +++ b/utils/aggregates.py @@ -5,6 +5,7 @@ import matplotlib.pyplot as plt from matplotlib.ticker import MaxNLocator import os +import sys import argparse from tqdm import tqdm import warnings @@ -114,6 +115,7 @@ def compute_clusters( print_sel.write(f"./colored_pdbs/snap_{frame}.pdb") + plt.close() return clusters @@ -132,6 +134,7 @@ def aggregates_clustering( plot_dendrograms, traj_in_memory, save_solvent, + summary_fig_size=(12, 8), ): u = mda.Universe(grofile, h5mdfile, in_memory=traj_in_memory) @@ -178,38 +181,85 @@ def aggregates_clustering( clusters = dask.compute(job_list, num_workers=nworkers) n_clusters = [] - clust_sizes = [] all_sizes = [] + total_clust_by_size = {} for c in clusters[0]: # get the number of clusters and sizes unique_clusts, clust_counts = np.unique(c, return_counts=True) + for size in clust_counts: + if size not in total_clust_by_size: + total_clust_by_size[size] = 1 + else: + total_clust_by_size[size] += 1 n_clusters.append(len(unique_clusts)) - clust_sizes.append(clust_counts) all_sizes += clust_counts.tolist() # based on cluster sizes get occurence of each size sizes, freq = np.unique(all_sizes, return_counts=True) freq = freq / len(u.trajectory[skip:end:stride]) - # write sizes and freq to file + # overall average number of aggregates + avg_n_aggs = np.average(n_clusters) + + # compute probability of picking cluster of size n + prob_by_size = {} + for k, v in total_clust_by_size.items(): + prob_by_size[k] = v / np.sum(n_clusters) + prob_by_size = dict(sorted(prob_by_size.items())) + + # compute probability of picking a random molecule and + # it belonging to a cluster of size n + prob_mol_size = {} + norm = len(at_sel) * len(u.trajectory[skip:end:stride]) + for k, v in total_clust_by_size.items(): + prob_mol_size[k] = k * v / norm + prob_mol_size = dict(sorted(prob_mol_size.items())) + + # write summary to file with open("summary_clustering.dat", "w") as of: + of.write("Executed command: " + " ".join(sys.argv) + "\n") + + of.write(f"\nAverage number of aggregates: {avg_n_aggs}\n") + + of.write("\nsize\tfrequency\n") for s, f in zip(sizes, freq): of.write(f"{s}\t{f}\n") + + of.write("\nsize\tprobability\n") + for s, p in prob_by_size.items(): + of.write(f"{s}\t{p}\n") + + of.write("\nsize\tprob molecule\n") + for s, p in prob_mol_size.items(): + of.write(f"{s}\t{p}\n") # plot results - fig, (ax1, ax2) = plt.subplots(2, 1) + _, axs = plt.subplots(2, 2, figsize=summary_fig_size) - ax1.plot(frames, n_clusters) - ax1.set_ylabel("Number of aggregates") - ax1.set_xlabel("Frame") - ax1.yaxis.set_major_locator(MaxNLocator(integer=True)) + axs[0, 0].plot(frames, n_clusters) + axs[0, 0].axhline(avg_n_aggs, linestyle="--") + axs[0, 0].set_ylabel("Num. of aggregates") + axs[0, 0].set_xlabel("Frame") + axs[0, 0].yaxis.set_major_locator(MaxNLocator(integer=True)) xticklabels = [f"{sizes[i]}" for i in range(len(sizes))] - ax2.bar(xticklabels, freq, width=0.8) - ax2.set_ylabel("Frequency") - ax2.set_xlabel("Aggregate size") - ax2.tick_params("x", labelrotation=60) + axs[0, 1].bar(xticklabels, freq, width=0.8) + axs[0, 1].set_ylabel("Avg. num. per snapshot") + axs[0, 1].set_xlabel("Aggregate size") + axs[0, 1].tick_params("x", labelrotation=60) + + xticklabels = [f"{k}" for k in prob_by_size.keys()] + axs[1, 0].bar(xticklabels, prob_by_size.values(), width=0.8) + axs[1, 0].set_ylabel("Prob.") + axs[1, 0].set_xlabel("Aggregate size") + axs[1, 0].tick_params("x", labelrotation=60) + + xticklabels = [f"{k}" for k in prob_mol_size.keys()] + axs[1, 1].bar(xticklabels, prob_mol_size.values(), width=0.8) + axs[1, 1].set_ylabel("Prob. molecule in agg.") + axs[1, 1].set_xlabel("Aggregate size") + axs[1, 1].tick_params("x", labelrotation=60) plt.tight_layout() plt.savefig("summary_clustering.pdf", bbox_inches="tight") @@ -294,6 +344,13 @@ def aggregates_clustering( default=False, help="plot the dendrograms (saved in ./dendrograms) (use with stride because its ~10x slower)", ) + parser.add_argument( + "--summary-fig-size", + type=int, + nargs=2, + default=(8, 6), + help="two integers to define the size of the summary figure (default = 8 6)", + ) parser.add_argument( "--traj-in-memory", action="store_true", @@ -347,4 +404,5 @@ def aggregates_clustering( args.plot_dendrograms, args.traj_in_memory, args.save_solvent, + tuple(args.summary_fig_size), )