-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcluster_benchmark.py
153 lines (112 loc) · 3.78 KB
/
cluster_benchmark.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/python3
"""
Wrapper for running the conformational state clustering on the benchmark dataset.
"""
# Standard library and third party imports
import argparse
import logging
import pathlib
import sys
import pandas as pd
from matplotlib import pyplot as plt
# Custom imports
from cluster_conformers import cluster_monomers
from cluster_conformers.utils import download_utils
# Global variables
# Root of every benchmark input and output; a relative path.
PATH_BASE = pathlib.Path("benchmark_data")

# Benchmark CSV; read by main() and queried for "UNP_ACC", "PDBe_ID" and
# "label_asym_id".
PATH_BENCHMARK = PATH_BASE / "benchmark_monomeric_open_closed_conformers.csv"

# Directory where benchmark_cluster() expects "<PDBe_ID>_updated.cif" files.
PATH_MMCIFS = PATH_BASE / "updated_mmcifs"

# Output locations handed to the clustering object.
PATH_SAVE_CA = PATH_BASE / "all_uniprots" / "ca_distances"
PATH_SAVE_DD_MATXS = (
    PATH_BASE / "all_uniprots" / "distance_differences" / "all_conformers"
)
PATH_SAVE_CLUSTER_RESULTS = PATH_BASE / "all_uniprots" / "clustering_results"

# Only referenced by the disabled make_dd_maps() call in benchmark_cluster().
PATH_SAVE_DD_MAPS = PATH_BASE / "all_uniprots" / "distance_difference_maps"

# Passed to ClusterConformations as `path_save_alphafold`.
PATH_ALPHAFOLD_MMCIFS = PATH_BASE / "alphafold_mmcifs"
def benchmark_parser(args):
    """
    Build the command line interface of this script and parse `args` with it.

    `args` is the list of argument strings to parse, e.g. `sys.argv[1:]`. The
    returned namespace has one attribute, `single_acc`: the value supplied with
    `-s` / `--single-acc`, or None when that option is not given.
    """
    single_acc_help = (
        "UniProt accession code. Only cluster this code from the benchmark "
        " dataset. "
    )

    cli = argparse.ArgumentParser()
    cli.add_argument(
        "-s", "--single-acc", type=str, default=None, help=single_acc_help
    )
    return cli.parse_args(args)
def benchmark_cluster(benchmark_df, unp):
    """
    Run the conformer clustering pipeline for a single UniProt accession.

    The rows of `benchmark_df` whose "UNP_ACC" column equals `unp` are split by
    "PDBe_ID". For each entry, its "label_asym_id" values are collected and keyed by
    the path `PATH_MMCIFS/<PDBe_ID>_updated.cif`. That mapping initialises a
    `cluster_monomers.ClusterConformations` object, on which CA distances are
    generated and clustering is run. A dendrogram is then rendered with both `png`
    and `svg` enabled. Nothing is returned.
    """
    unp_rows = benchmark_df[benchmark_df["UNP_ACC"] == unp]

    # mmCIF path (as str) -> chains listed for that entry, in order of first
    # appearance of each PDBe_ID in the benchmark CSV
    mmcif_to_chains = {
        str(PATH_MMCIFS.joinpath(f"{pdb_id}_updated.cif")): list(
            unp_rows.loc[unp_rows["PDBe_ID"] == pdb_id, "label_asym_id"]
        )
        for pdb_id in unp_rows["PDBe_ID"].unique()
    }

    conformers = cluster_monomers.ClusterConformations(
        unp=unp,
        mmcifs_and_chains=mmcif_to_chains,
        path_save_alphafold=PATH_ALPHAFOLD_MMCIFS,  # Include AFDB structures
    )

    # CA matrices first, then the clustering itself
    conformers.ca_distance(PATH_SAVE_CA)
    conformers.cluster(
        path_save_dd_matx=PATH_SAVE_DD_MATXS,
        path_save_cluster_results=PATH_SAVE_CLUSTER_RESULTS,
    )

    # Optional post-processing, currently disabled:
    # conformers.select_representatives()
    # conformers.make_dd_maps(PATH_SAVE_DD_MAPS)

    # Rendering section: remove or comment out this call to skip the dendrogram.
    dendrogram_dir = PATH_SAVE_CLUSTER_RESULTS.joinpath("dendrograms")
    cluster_monomers.render_dendrogram(
        unp=unp,
        path_results=PATH_SAVE_CLUSTER_RESULTS,
        path_save=dendrogram_dir,
        png=True,
        svg=True,
    )

    # Flush objects before the next accession is processed.
    # NOTE(review): clf() clears only the current figure; if render_dendrogram
    # leaves figures open, plt.close("all") may be needed here -- confirm.
    del conformers
    plt.clf()
def main(unp=None):
    """
    Cluster one UniProt accession from the benchmark dataset, or all of them.

    The benchmark CSV at PATH_BENCHMARK is loaded and
    `download_utils.fetch_benchmark_mmcifs` is called with the CSV path and
    PATH_MMCIFS (presumably to download the structures -- confirm against
    download_utils). If `unp` is truthy only that accession is clustered; otherwise
    every unique value of the "UNP_ACC" column is clustered in turn.

    E.g. :
    - `python cluster_benchmark.py` for whole dataset
    - `python cluster_benchmark.py -s <uniprot>` for specific segment in dataset
    """
    benchmark_df = pd.read_csv(PATH_BENCHMARK)
    download_utils.fetch_benchmark_mmcifs(PATH_BENCHMARK, PATH_MMCIFS)

    # A single requested accession, or every accession present in the CSV
    accessions = [unp] if unp else benchmark_df["UNP_ACC"].unique()
    for accession in accessions:
        benchmark_cluster(benchmark_df, accession)
if __name__ == "__main__":
    # Command line entry point: set up INFO-level logging, then cluster either the
    # accession given with -s/--single-acc or, when it is absent, the whole dataset.
    logging.basicConfig(
        format="%(asctime)s %(name)-12s %(levelname)-8s %(message)s",
        datefmt="%m-%d %H:%M",
        level=logging.INFO,
    )
    main(benchmark_parser(sys.argv[1:]).single_acc)