diff --git a/scripts/AssignOuter.py b/scripts/AssignOuter.py new file mode 100644 index 00000000..657c7e92 --- /dev/null +++ b/scripts/AssignOuter.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python +# This file is part of MSMBuilder. +# +# Copyright 2014 Stanford University +# +# MSMBuilder is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +from msmbuilder import arglib, assigning +from mdtraj import io +from msmbuilder.metrics import solvent +import logging +logger = logging.getLogger('msmbuilder.scripts.AssignOuter') + +parser = arglib.ArgumentParser(description= + """Combine the results of two clusterings. Specifically, if a conformation + is in state i after clustering with metric 1, and it is in state j + after clustering with metric 2, we assign it to a new state (ij) + + This is a naive way of combining two metrics to give a singular + assignment of states.""") +parser.add_argument('assignment1', default='./Data1/Assignments.h5', + help='First assignment file') +parser.add_argument('assignment2', default='./Data2/Assignments.h5', + help='Second assignment file') +parser.add_argument('assignment_out', default='OuterProductAssignments.h5', + help='Output file') + + +def main(assign1_fn, assign2_fn, out_fn): + assign1 = io.loadh(assign1_fn, 'arr_0') + assign2 = io.loadh(assign2_fn, 'arr_0') + + new_assignments = assigning.outer_product_assignment(assign1, assign2) + io.saveh(out_fn, new_assignments) + logger.info('Saved outer product assignments to %s', out_fn) + + +if __name__ == "__main__": + args = parser.parse_args() + arglib.die_if_path_exists(args.assignment_out) + + main(args.assignment1, args.assignment2, args.assignment_out) diff --git a/scripts/CreateAtomIndices.py b/scripts/CreateAtomIndices.py index 98956932..7f51feef 100755 --- a/scripts/CreateAtomIndices.py +++ b/scripts/CreateAtomIndices.py @@ -25,21 +25,26 @@ logger = logging.getLogger('msmbuilder.scripts.CreateAtomIndices') + parser = arglib.ArgumentParser( description="Creates an atom indices file for RMSD from a PDB.") parser.add_argument('pdb') parser.add_argument('output', default='AtomIndices.dat') parser.add_argument('atom_type', help='''Atoms to include in index file. + One of four options: (1) minimal (CA, CB, C, N, O, recommended), (2) heavy, - (3) alpha (carbons), or (4) all. Use "all" in cases where protein - nomenclature may be inapproprate, although you may want to define your own - indices in such situations. Note that "heavy" keeps all heavy atoms that + (3) alpha (carbons), (4) water (water oxygens), or (5) all. Use "all" + in cases where protein nomenclature may be inapproprate, although you + may want to define your own indices in such situations. + Note that "heavy" keeps all heavy atoms that are not symmetry equivalent. By symmetry equivalent, we mean atoms identical under an exchange of labels. For example, heavy will exclude - the two pairs of equivalent carbons (CD, CE) in a PHE ring. - Note that AtomIndices.dat should be zero-indexed--that is, a 0 + the two pairs of equivalent carbons (CD, CE) in a PHE ring. + Note that AtomIndices.dat should be zero-indexed--that is, a 0 in AtomIndices.dat corresponds to the first atom in your PDB''', - choices=['minimal', 'heavy', 'alpha', 'all'], default='minimal') + choices=['minimal', 'heavy', 'alpha', 'all', 'water'], + default='minimal') + def run(PDBfn, atomtype): @@ -122,6 +127,7 @@ def run(PDBfn, atomtype): "CNLE": ["N", "CA", "CB", "C", "O", "CG", "CD", "CE"], "NNLE": ["N", "CA", "CB", "C", "O", "CG", "CD", "CE"], "SOL": [], + "HOH": [], "Cl-": [], "Na+": [] } @@ -135,6 +141,13 @@ def run(PDBfn, atomtype): elif atomtype == 'alpha': for key in toKeepDict.keys(): toKeepDict[key] = ["CA"] + elif atomtype == 'water': + # Note: we only keep water oxygens + for key in toKeepDict.keys(): + if key == "SOL" or key == 'HOH': + toKeepDict[key] = ["O"] + else: + toKeepDict[key] = [] elif atomtype == "all": pass else: diff --git a/src/python/assigning.py b/src/python/assigning.py index fc458160..247b6b6b 100644 --- a/src/python/assigning.py +++ b/src/python/assigning.py @@ -3,6 +3,7 @@ import numpy as np import tables import warnings +from msmbuilder import MSMLib as msmlib from mdtraj import io import logging logger = logging.getLogger(__name__) @@ -236,3 +237,52 @@ def streaming_assign_with_checkpoint(metric, project, generators, assignments_pa "-- this function is deprecated"), DeprecationWarning) assign_with_checkpoint(metric, project, generators, assignments_path, distances_path, chunk_size, atom_indices_to_load) + + +def outer_product_assignment(assign1, assign2): + """Combine the results of two clusterings. + + Specifically, if a conformation is in state i after clustering + with metric 1, and it is in state j after clustering with metric 2, + we assign it to a new state (ij) + + This is a naive way of combining two metrics to give a singular + assignment of states. + + Parameters + ---------- + assign1 : np.ndarray + Assignment array 1 + assign2 : np.ndarray + Assignment array 2 + + Returns + ---------- + new_assign : np.ndarray + New assignments array, renumbered from 0 ... N-1, N <= n1 * n2 + """ + + assert assign1.shape == assign2.shape, """Assignments must be + for the same set of trajectories.""" + new_assign = -1 * np.ones_like(assign1, dtype=np.int) + + nstates1 = np.max(assign1) + 1 + nstates2 = np.max(assign2) + 1 + + translations = np.reshape(np.arange(nstates1 * nstates2), + (nstates1, nstates2)) + + ass_shape = assign1.shape + for i in xrange(ass_shape[0]): + for j in xrange(ass_shape[1]): + #TODO: vectorize this loop + if assign1[i, j] == -1: + # No assignment here + assert assign2[i, j] == -1, """Assignments must be for + the same set of trajectories.""" + else: + new_assign[i, j] = translations[assign1[i, j], assign2[i, j]] + + # Renumber states in place + msmlib.renumber_states(new_assign) + return new_assign diff --git a/src/python/metrics/__init__.py b/src/python/metrics/__init__.py index 6d860cd0..70007637 100644 --- a/src/python/metrics/__init__.py +++ b/src/python/metrics/__init__.py @@ -19,3 +19,4 @@ from hybrid import Hybrid, HybridPNorm from projection import RedDimPNorm from positions import Positions +from solvent import SolventFp diff --git a/src/python/metrics/parsers.py b/src/python/metrics/parsers.py index 4bc79632..66856ea0 100644 --- a/src/python/metrics/parsers.py +++ b/src/python/metrics/parsers.py @@ -9,7 +9,7 @@ from msmbuilder.metrics import (RMSD, Dihedral, BooleanContact, AtomPairs, ContinuousContact, AbstractDistanceMetric, Vectorized, - RedDimPNorm, Positions) + RedDimPNorm, Positions, SolventFp) def add_argument(group, *args, **kwargs): if 'default' in kwargs: @@ -20,7 +20,6 @@ def add_argument(group, *args, **kwargs): kwargs['help'] = d group.add_argument(*args, **kwargs) -################################################################################ def locate_metric_plugins(name): if not name in ['add_metric_parser', 'construct_metric', 'metric_class']: @@ -32,7 +31,7 @@ def locate_metric_plugins(name): def add_metric_parsers(parser): metric_parser_list = [] - + metric_subparser = parser.add_subparsers(dest='metric', description ='Available metrics to use.') #metrics_parsers = parser.add_subparsers(description='Available metrics to use.',dest='metric') @@ -42,7 +41,7 @@ def add_metric_parsers(parser): between two structures, first they are rotated and translated with respect to one another to achieve maximum coincidence. This code is executed in parallel on multiple cores (but not multiple boxes) using OMP.''') - add_argument(rmsd, '-a', dest='rmsd_atom_indices', help='Atom indices to use in RMSD calculation. Pass "all" to use all atoms.', + add_argument(rmsd, '-a', dest='rmsd_atom_indices', help='Atom indices to use in RMSD calculation. Pass "all" to use all atoms.', default='AtomIndices.dat') metric_parser_list.append(rmsd) @@ -54,7 +53,7 @@ def add_metric_parsers(parser): frames are computed by taking distances between these vectors in R^n. The euclidean distance is recommended, but other distance metrics are available (cityblock, etc). This code is executed in parallel on multiple cores (but - not multiple boxes) using OMP. ''') + not multiple boxes) using OMP. ''') add_argument(dihedral, '-a', dest='dihedral_angles', default='phi/psi', help='which dihedrals. Choose from phi, psi, chi (to choose multiple, seperate them with a slash), or user') add_argument(dihedral, '-f', dest='dihedral_userfilename', default='DihedralIndices.dat', help='filename for dihedral indices, N lines of 4 space-separated indices (otherwise ignored)') @@ -74,7 +73,7 @@ def add_metric_parsers(parser): Furthermore, the sense in which the distance between two residues is computed can be either specified as "CA", "closest", or "closest-heavy", which will respectively compute ("CA") the distance between the residues' alpha carbons, ("closest"), the closest distance between any pair of - atoms i and j such that i belongs to one residue and j to the other residue, ("closest-heavy"), + atoms i and j such that i belongs to one residue and j to the other residue, ("closest-heavy"), or the closest distance between any pair of NON-HYDROGEN atoms i and j such that i belongs to one residue and j to the other residue. This code is executed in parallel on multiple cores (but not multiple boxes) using OMP.''') @@ -89,7 +88,7 @@ def add_metric_parsers(parser): atompairs = metric_subparser.add_parser('atompairs',description='''ATOMPAIRS: For each frame, we represent the conformation as a vector of particular atom-atom distances. Then the distance between frames is calculated using a specified norm on these vectors. This code is executed in - parallel (but not multiple boxes) using OMP.''') + parallel (but not multiple boxes) using OMP.''') add_argument(atompairs, '-a', dest='atompairs_which', help='path to file with 2D array of which atompairs to use.', default='AtomPairs.dat') add_argument(atompairs, '-p', dest='atompairs_p', default=2, help='p used for metric=minkowski (otherwise ignored)') @@ -97,6 +96,20 @@ def add_metric_parsers(parser): help='which distance metric', choices=AtomPairs.allowable_scipy_metrics) metric_parser_list.append(atompairs) + solventfp = metric_subparser.add_parser('solventfp', description="""SOLVENTFP: For each frame + calculate a 'solvent fingerprint' by considering the weighted pairwise distances + between solute and solvent atoms as in as in Gu et al. BMC Bioinformatics 2013, 14(Suppl 2):S8.""") + add_argument(solventfp, '-w', dest='solventfp_wateri', default='WaterIndices.dat', + help='Indices of the solvent (water) atoms') + add_argument(solventfp, '-v', dest='solventfp_proti', default='ProteinIndices.dat', + help='Indices of the solute (protein) atoms') + add_argument(solventfp, '-s', dest='solventfp_sigma', default=0.5, + type=float, help='std. dev. of gaussian kernel') + add_argument(solventfp, '-p', dest='solventfp_p', default=2, help='p used for metric=minkowski (otherwise ignored)') + add_argument(solventfp, '-m', dest='solventfp_metric', default='euclidean', + help='which distance metric', choices=SolventFp.allowable_scipy_metrics) + metric_parser_list.append(solventfp) + positions = metric_subparser.add_parser('positions', description="""POSITIONS: For each frame we represent the conformation as a vector of atom positions, where the atoms have been aligned to a target structure.""") @@ -111,13 +124,13 @@ def add_metric_parsers(parser): tica = metric_subparser.add_parser( 'tica', description=''' tICA: This metric is based on a variation of PCA which looks for the slowest d.o.f. in the simulation data. See (Schwantes, C.R., Pande, V.S. JCTC 2013, 9 (4), 2000-09.) - for more details. In addition to these options, you must provide an additional + for more details. In addition to these options, you must provide an additional metric you used to prepare the trajectories in the training step.''') add_argument(tica,'-p', dest='p', help='p value for p-norm') add_argument(tica,'-m', dest='projected_metric', help='metric to use in the projected space', choices=Vectorized.allowable_scipy_metrics, default='euclidean') - add_argument(tica, '-f', dest='tica_fn', + add_argument(tica, '-f', dest='tica_fn', help='tICA Object which was prepared by tICA_train.py') add_argument(tica, '-n', dest='num_vecs', type=int, help='Choose the top <-n> eigenvectors based on their eigenvalues') @@ -130,14 +143,12 @@ def add_metric_parsers(parser): add_argument(picklemetric, '-i', dest='picklemetric_input', required=True, help="Path to pickle file for the metric") metric_parser_list.append(picklemetric) - + for add_parser in locate_metric_plugins('add_metric_parser'): plugin_metric_parser = add_parser(metric_subparser, add_argument) metric_parser_list.append(plugin_metric_parser) - - return metric_parser_list - ################################################################################ + return metric_parser_list def construct_metric(args): @@ -154,27 +165,27 @@ def construct_metric(args): metric = Dihedral(metric=args.dihedral_metric, p=args.dihedral_p, angles=args.dihedral_angles, userfilename=args.dihedral_userfilename) - + elif metric_name == 'contact': if args.contact_which != 'all': contact_which = np.loadtxt(args.contact_which, np.int) else: contact_which = 'all' - if args.contact_cutoff_file != None: - contact_cutoff = np.loadtxt(args.contact_cutoff_file, np.float) + if args.contact_cutoff_file != None: + contact_cutoff = np.loadtxt(args.contact_cutoff_file, np.float) elif args.contact_cutoff != None: contact_cutoff = float( args.contact_cutoff ) else: contact_cutoff = None - + if contact_cutoff != None and contact_cutoff < 0: metric = ContinuousContact(contacts=contact_which, scheme=args.contact_scheme) else: metric = BooleanContact(contacts=contact_which, cutoff=contact_cutoff, scheme=args.contact_scheme) - + elif metric_name == 'atompairs': if args.atompairs_which != None: pairs = np.loadtxt(args.atompairs_which, np.int) @@ -186,7 +197,7 @@ def construct_metric(args): elif metric_name == 'positions': target = md.load(args.target) - + if args.pos_atom_indices != None: atom_indices = np.loadtxt(args.pos_atom_indices, np.int) else: @@ -196,16 +207,24 @@ def construct_metric(args): align_indices = np.loadtxt(args.align_indices, np.int) else: align_indices = None - + metric = Positions(target, atom_indices=atom_indices, align_indices=align_indices, metric=args.positions_metric, p=args.positions_p) + elif metric_name == 'solventfp': + proti = np.loadtxt(args.solventfp_proti, np.int) + wateri = np.loadtxt(args.solventfp_wateri, np.int) + metric = SolventFp(solute_indices=proti, + solvent_indices=wateri, + sigma=args.solventfp_sigma, + metric=args.solventfp_metric, + p=args.solventfp_p) + elif metric_name == "tica": tica_obj = tICA.load(args.tica_fn) - metric = RedDimPNorm(tica_obj, num_vecs=args.num_vecs, + metric = RedDimPNorm(tica_obj, num_vecs=args.num_vecs, metric=args.projected_metric, p=args.p) - elif metric_name == 'custom': with open(args.picklemetric_input) as f: metric = pickle.load(f) @@ -223,7 +242,7 @@ def construct_metric(args): except StopIteration: # This means that none of the plugins acceptedthe metric raise RuntimeError("Bad metric. Could not be constructed by any built-in or plugin metric. Perhaps you have a poorly written plugin?") - + if not isinstance(metric, AbstractDistanceMetric): return ValueError("%s is not a AbstractDistanceMetric" % metric) diff --git a/src/python/metrics/solvent.py b/src/python/metrics/solvent.py new file mode 100644 index 00000000..19f61d53 --- /dev/null +++ b/src/python/metrics/solvent.py @@ -0,0 +1,105 @@ +import logging + +import mdtraj as md + +from baseclasses import Vectorized +import numpy as np + + +logger = logging.getLogger(__name__) + + +class SolventFp(Vectorized): + """Distance metric for calculating distances between frames based on their + solvent signature as in Gu et al. BMC Bioinformatics 2013, 14(Suppl 2):S8. + """ + + allowable_scipy_metrics = ['braycurtis', 'canberra', 'chebyshev', + 'cityblock', 'correlation', 'cosine', + 'euclidean', 'minkowski', 'sqeuclidean', + 'seuclidean', 'mahalanobis', 'sqmahalanobis'] + + def __init__(self, solute_indices, solvent_indices, sigma, + metric='euclidean', p=2, V=None, VI=None): + """Create a distance metric to capture solvent degrees of freedom + + Parameters + ---------- + solute_indices : ndarray + atom indices of the solute atoms + solvent_indices : ndarray + atom indices of the solvent atoms + sigma : float + width of gaussian kernel + metric : {'braycurtis', 'canberra', 'chebyshev', 'cityblock', + 'correlation', 'cosine', 'euclidean', 'minkowski', + 'sqeuclidean', 'seuclidean', 'mahalanobis', 'sqmahalanobis'} + Distance metric to equip the vector space with. + p : int, optional + p-norm order, used for metric='minkowski' + V : ndarray, optional + variances, used for metric='seuclidean' + VI : ndarray, optional + inverse covariance matrix, used for metric='mahalanobi' + + """ + + # Check input indices + md.utils.ensure_type(solute_indices, dtype=np.int, ndim=1, + name='solute', can_be_none=False) + md.utils.ensure_type(solvent_indices, dtype=np.int, ndim=1, + name='solvent', can_be_none=False) + + super(SolventFp, self).__init__(metric, p, V, VI) + self.solute_indices = solute_indices + self.solvent_indices = solvent_indices + self.sigma = sigma + + def __repr__(self): + "String representation of the object" + return ('metrics.SolventFp(metric=%s, p=%s, sigma=%s)' + % (self.metric, self.p, self.sigma)) + + def prepare_trajectory(self, trajectory): + """Calculate solvent fingerprints + Parameters + ---------- + trajectory : msmbuilder.Trajectory + An MSMBuilder trajectory to prepare + + Returns + ------- + fingerprints : ndarray + A 2D array of fingerprint vectors of + shape (traj_length, protein_atom) + """ + + # Give shorter names to these things + prot_indices = self.solute_indices + water_indices = self.solvent_indices + sigma = self.sigma + + # The result vector + fingerprints = np.zeros((trajectory.n_frames, len(prot_indices))) + + # Check for periodic information + if trajectory.unitcell_lengths is None: + logging.warn('No periodic information found for computing solventfp.') + + for i, prot_i in enumerate(prot_indices): + # For each protein atom, calculate distance to all water + # molecules + atom_pairs = np.empty((len(water_indices), 2)) + atom_pairs[:, 0] = prot_i + atom_pairs[:, 1] = water_indices + # Get a traj_length x n_water_indices vector of distances + distances = md.compute_distances(trajectory, + atom_pairs, + periodic=True) + # Calculate guassian kernel + distances = np.exp(-distances / (2 * sigma * sigma)) + + # Sum over water atoms for all frames + fingerprints[:, i] = np.sum(distances, axis=1) + + return fingerprints