-
Notifications
You must be signed in to change notification settings - Fork 0
/
dihedral_tica_featurize.py
124 lines (107 loc) · 4.59 KB
/
dihedral_tica_featurize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import os
from glob import glob
import jug
import mdtraj as md
import numpy as np
import pyemma
from pathlib import Path
@jug.TaskGenerator
def featurize_write(featurizer, traj_name, stride, traj_feature_suffix='.h5', parent=None,
rename_depth=-4, chunksize=2000):
features = pyemma.coordinates.load(traj_name, features=featurizer, stride=stride, chunksize=chunksize)
if parent:
tp = Path(traj_name).with_suffix(traj_feature_suffix)
traj_specific_feature_p = (parent / Path(*tp.parts[rename_depth:])).resolve()
traj_specific_feature_fn = str(traj_specific_feature_p)
traj_specific_feature_p.parent.mkdir(exist_ok=True, parents=True)
else:
traj_specific_feature_fn = traj_name.replace('.xtc', traj_feature_suffix)
print(traj_specific_feature_fn)
np.save(traj_specific_feature_fn, features)
return traj_specific_feature_fn
@jug.TaskGenerator
def write_feature_traj_fns(feature_traj_list_name, features_list):
with open(feature_traj_list_name, 'w') as f:
f.write('\n'.join(features_list))
topologies = {
't4l-1': 'prot_masses.pdb',
't4l-2': 'prot_masses.pdb',
't4l-3': 'prot_masses.pdb',
}
trajectory_paths = {
't4l-1': [
'traj-list-1.txt'
],
't4l-2': [
'traj-list-2.txt'
],
't4l-3': [
'traj-list-3.txt'
]
}
pocket_resids = np.loadtxt('pocket-resids-5.txt', dtype=int)
to_cluster = {}
for protein in trajectory_paths.keys():
print(protein)
to_cluster[protein] = {
'traj_paths': trajectory_paths[protein],
'top_path': topologies[protein],
'stride': 1,
'selstr': ' or '.join(f'residue {r}' for r in pocket_resids),
'chi_selstr': ' or '.join(f'residue {r}' for r in pocket_resids),
'which_chis': "all",
'description': 'pocket',
'tica-lags': [100, 200, 500, 1000],
'k': [25, 50, 75, 100, 150, 200, 300, 400, 500],
}
var_cutoff = 0.9
feat_traj_suff = '.npy'
overwrite = True
OUT_STEM = "recluster"
for protein, specs in to_cluster.items():
traj_paths = specs['traj_paths']
top_path = specs['top_path']
selstr = specs['selstr']
stride = specs['stride']
description = specs['description']
if 'chi_selstr' in specs.keys():
include_sidechains = True
chi_selstr = specs['chi_selstr']
output_filename = f"{OUT_STEM}/{protein}-backbone-{specs['which_chis']}-dihedrals-{description}-feature-fns.txt"
if 'which_chis' in specs.keys():
which_chis = specs['which_chis']
else:
which_chis = 'chi1'
else:
chi_selstr = selstr
output_filename = f'{OUT_STEM}/{protein}-backbone-dihedrals-{description}-feature-fns.txt'
include_sidechains = False
# If feature filename already exists, continue to next component of pipeline.
if os.path.exists(output_filename):
if overwrite:
print('Features filename list found, but overwrite set to "True", so overwriting.')
else:
print('Feature file already exists; refusing to redo.')
exit(1)
pdb = md.load(top_path)
feat = pyemma.coordinates.featurizer(pdb)
feat.add_backbone_torsions(selstr=selstr, cossin=True, periodic=False)
if include_sidechains:
feat.add_sidechain_torsions(selstr=chi_selstr, cossin=True, periodic=False, which=which_chis)
# save out description of features
os.makedirs(OUT_STEM, exist_ok=True)
np.save(output_filename.replace('feature-fns.txt', 'feature-descriptions.npy'), feat.describe())
if Path(traj_paths[0]).suffix == '.txt':
traj_list = []
for traj_list_path in traj_paths:
traj_list += Path(traj_list_path).read_text().strip().split()
else: # assume these are trajectory filenames
traj_list = sorted(list(np.concatenate([glob(traj_path) for traj_path in traj_paths])))
print('Beginning to generate tasks for reading, featurizing, and writing per trajectory.')
# spawn a bunch of jug tasks to read and featurize individual trajectories, saving them to independent h5 files
# NOTE: if parent is defined, will write to new directory structure rename_depth below top levels of traj files
# within parent. If not, it will write a 'feature.h5' in the same path as the parent of the trajectory file.
features_list = [featurize_write(feat, traj_name, stride, traj_feature_suffix=feat_traj_suff, parent=OUT_STEM)
for traj_name in traj_list]
# write the file names for each feature traj to a list that can be read by subsequent (clustering) scripts.
write_feature_traj_fns(output_filename, features_list)