From 32de146683e7f91317d005a788e1e5fb775de1f2 Mon Sep 17 00:00:00 2001 From: richard Date: Wed, 22 Feb 2023 18:24:43 +0000 Subject: [PATCH 1/6] WIP Use PDBInf --- environment.yml | 1 + gufe/components/proteincomponent.py | 157 ++++------------------------ setup.cfg | 0 3 files changed, 24 insertions(+), 134 deletions(-) create mode 100644 setup.cfg diff --git a/environment.yml b/environment.yml index 7adbed8d..1d0ff40e 100644 --- a/environment.yml +++ b/environment.yml @@ -10,6 +10,7 @@ dependencies: - openff-units ==0.2.0 # https://github.com/openforcefield/openff-units/issues/69 - pint <0.22 # https://github.com/openforcefield/openff-units/issues/69 - openff-models >=0.0.5 + - pdbinf - pip - pydantic >1 - pytest diff --git a/gufe/components/proteincomponent.py b/gufe/components/proteincomponent.py index e2697ccf..87366b21 100644 --- a/gufe/components/proteincomponent.py +++ b/gufe/components/proteincomponent.py @@ -1,15 +1,11 @@ # This code is part of OpenFE and is licensed under the MIT license. # For details, see https://github.com/OpenFreeEnergy/gufe -import ast -import json import io import numpy as np from os import PathLike -from typing import Union, Optional -from collections import defaultdict +from typing import Union -from openmm import app -from openmm import unit as omm_unit +import pdbinf from rdkit import Chem from rdkit.Chem.rdchem import Mol, Atom, Conformer, EditableMol, BondType @@ -51,10 +47,6 @@ } -negative_ions = ["F", "CL", "Br", "I"] -positive_ions = ["NA", "MG", "ZN"] - - class ProteinComponent(ExplicitMoleculeComponent): """ ``Component`` representing the contents of a PDB file, such as a protein. @@ -84,7 +76,6 @@ def __init__(self, rdkit: RDKitMol, name=""): "Consider loading via rdkit.Chem.MolFromPDBFile or similar.") super().__init__(rdkit=rdkit, name=name) - # FROM @classmethod def from_pdb_file(cls, pdb_file: str, name: str = ""): """ @@ -102,9 +93,10 @@ def from_pdb_file(cls, pdb_file: str, name: str = ""): ProteinComponent the deserialized molecule """ - openmm_PDBFile = PDBFile(pdb_file) - return cls._from_openmmPDBFile( - openmm_PDBFile=openmm_PDBFile, name=name + return cls( + rdkit=pdbinf.load_pdb_file(pdb_file, + templates=[pdbinf.STANDARD_AA_DOC]), + name=name, ) @classmethod @@ -114,7 +106,7 @@ def from_pdbx_file(cls, pdbx_file: str, name=""): Parameters ---------- - pdbxfile : str + pdbx_file : str path to the pdb file. name : str, optional name of the input protein, by default "" @@ -124,117 +116,12 @@ def from_pdbx_file(cls, pdbx_file: str, name=""): ProteinComponent the deserialized molecule """ - openmm_PDBxFile = PDBxFile(pdbx_file) - return cls._from_openmmPDBFile( - openmm_PDBFile=openmm_PDBxFile, name=name + return cls( + rdkit=pdbinf.load_pdbx_file(pdbx_file, + templates=[pdbinf.STANDARD_AA_DOC]), + name=name, ) - @classmethod - def _from_openmmPDBFile(cls, openmm_PDBFile: Union[PDBFile, PDBxFile], - name: str = ""): - """Converts to our internal representation (rdkit Mol) - - Parameters - ---------- - openmm_PDBFile : PDBFile or PDBxFile - object of the protein - name : str - name of the protein - - Returns - ------- - ProteinComponent - the deserialized molecule - """ - periodicTable = Chem.GetPeriodicTable() - mol_topology = openmm_PDBFile.getTopology() - - rd_mol = Mol() - editable_rdmol = EditableMol(rd_mol) - - # Add Atoms - for atom in mol_topology.atoms(): - a = Atom(atom.element.atomic_number) - - atom_monomerInfo = Chem.AtomPDBResidueInfo() - atom_monomerInfo.SetChainId(atom.residue.chain.id) - atom_monomerInfo.SetSerialNumber(int(atom.id)) - atom_monomerInfo.SetSegmentNumber(int(atom.residue.chain.index)) - atom_monomerInfo.SetInsertionCode(atom.residue.insertionCode) - atom_monomerInfo.SetName(atom.name) - atom_monomerInfo.SetResidueName(atom.residue.name) - atom_monomerInfo.SetResidueNumber(int(atom.residue.id)) - atom_monomerInfo.SetIsHeteroAtom(False) # TODO: Do hetatoms - - a.SetMonomerInfo(atom_monomerInfo) - - # additonally possible: - # atom_monomerInfo.SetSecondaryStructure - # atom_monomerInfo.SetMonomerType - # atom_monomerInfo.SetAltLoc - - editable_rdmol.AddAtom(a) - - # Add Bonds - for bond in mol_topology.bonds(): - bond_order = _BONDORDERS_OPENMM_TO_RDKIT[bond.order] - editable_rdmol.AddBond( - beginAtomIdx=bond.atom1.index, - endAtomIdx=bond.atom2.index, - order=bond_order, - ) - - # Set Positions - rd_mol = editable_rdmol.GetMol() - positions = np.array( - openmm_PDBFile.positions.value_in_unit(omm_unit.angstrom), ndmin=3 - ) - - for frame_id, frame in enumerate(positions): - conf = Conformer(frame_id) - for atom_id, atom_pos in enumerate(frame): - conf.SetAtomPosition(atom_id, atom_pos) - rd_mol.AddConformer(conf) - - # Add Additionals - # Formal Charge - netcharge = 0 - for a in rd_mol.GetAtoms(): - atomic_num = a.GetAtomicNum() - atom_name = a.GetMonomerInfo().GetName() - - connectivity = sum( - int(bond.GetBondType()) for bond in a.GetBonds() - ) - default_valence = periodicTable.GetDefaultValence(atomic_num) - - if connectivity == 0: # ions: - if atom_name in positive_ions: - fc = default_valence # e.g. Sodium ions - elif atom_name in negative_ions: - fc = - default_valence # e.g. Chlorine ions - else: # -no-cov- - resn = a.GetMonomerInfo().GetResidueName() - resind = int(a.GetMonomerInfo().GetResidueNumber()) - raise ValueError( - "I don't know this Ion or something really went " - f"wrong! \t{atom_name}\t{resn}\t-{resind}\t" - f"connectivity{connectivity}" - ) - elif default_valence > connectivity: - fc = - (default_valence - connectivity) # negative charge - elif default_valence < connectivity: - fc = + (connectivity - default_valence) # positive charge - else: - fc = 0 # neutral - - a.SetFormalCharge(fc) - a.UpdatePropertyCache(strict=True) - - netcharge += fc - - return cls(rdkit=rd_mol, name=name) - @classmethod def _from_dict(cls, ser_dict: dict, name: str = ""): """Deserialize from dict representation""" @@ -296,8 +183,7 @@ def _from_dict(cls, ser_dict: dict, name: str = ""): return cls(rdkit=rd_mol, name=name) - # TO - def to_openmm_topology(self) -> app.Topology: + def to_openmm_topology(self) -> "app.Topology": """Convert to an openmm Topology object Returns @@ -305,6 +191,8 @@ def to_openmm_topology(self) -> app.Topology: openmm.app.Topology resulting topology obj. """ + from openmm import app + def reskey(m): """key for defining when a residue has changed from previous @@ -372,9 +260,9 @@ def chainkey(m): return top - def to_openmm_positions(self) -> omm_unit.Quantity: - """ - serialize the positions to openmm.unit.Quantity + def to_openmm_positions(self) -> "openmm.app.unit.Quantity": + """serialize the positions to openmm.unit.Quantity + ! only one frame at the moment! Returns @@ -382,6 +270,8 @@ def to_openmm_positions(self) -> omm_unit.Quantity: omm_unit.Quantity Quantity containing protein atom positions """ + from openmm import unit as omm_unit + np_pos = deserialize_numpy(self.to_dict()["conformers"][0]) openmm_pos = ( list(map(lambda x: np.array(x), np_pos)) * omm_unit.angstrom @@ -390,8 +280,7 @@ def to_openmm_positions(self) -> omm_unit.Quantity: return openmm_pos def to_pdb_file(self, out_path: Union[str, bytes, PathLike[str], PathLike[bytes], io.TextIOBase]) -> str: - """ - serialize protein to pdb file. + """Write protein to pdb file. Parameters ---------- @@ -436,8 +325,7 @@ def to_pdb_file(self, out_path: Union[str, bytes, PathLike[str], PathLike[bytes] def to_pdbx_file( self, out_path: Union[str, bytes, PathLike[str], PathLike[bytes], io.TextIOBase] ) -> str: - """ - serialize protein to pdbx file. + """Write protein to pdbx file. Parameters ---------- @@ -449,6 +337,8 @@ def to_pdbx_file( str string path to the resulting pdbx. """ + from openmm import unit as omm_unit + # get top: top = self.to_openmm_topology() @@ -474,7 +364,6 @@ def to_pdbx_file( PDBxFile.writeFile(topology=top, positions=openmm_pos, file=out_file) - if must_close: # we only close the file if we had to open it out_file.close() diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 00000000..e69de29b From 1c2bc2a8f2cc11d217761effd60190f8482420c0 Mon Sep 17 00:00:00 2001 From: richard gowers Date: Wed, 9 Aug 2023 07:56:33 +0100 Subject: [PATCH 2/6] move SMC serialization into ExplicitMolecule (to later share with Protein) --- gufe/components/explicitmoleculecomponent.py | 143 ++++++++++++++++- gufe/components/smallmoleculecomponent.py | 153 ++----------------- 2 files changed, 151 insertions(+), 145 deletions(-) diff --git a/gufe/components/explicitmoleculecomponent.py b/gufe/components/explicitmoleculecomponent.py index fe5e7c59..86b97dcc 100644 --- a/gufe/components/explicitmoleculecomponent.py +++ b/gufe/components/explicitmoleculecomponent.py @@ -2,12 +2,12 @@ import numpy as np import warnings from rdkit import Chem -from typing import Optional +from typing import Any, Optional from .component import Component -# typing from ..custom_typing import RDKitMol +from ..molhashing import deserialize_numpy, serialize_numpy def _ensure_ofe_name(mol: RDKitMol, name: str) -> str: @@ -37,6 +37,143 @@ def _ensure_ofe_name(mol: RDKitMol, name: str) -> str: return name +_INT_TO_ATOMCHIRAL = { + 0: Chem.rdchem.ChiralType.CHI_UNSPECIFIED, + 1: Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW, + 2: Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW, + 3: Chem.rdchem.ChiralType.CHI_OTHER, +} +# support for non-tetrahedral stereo requires rdkit 2022.09.1+ +if hasattr(Chem.rdchem.ChiralType, 'CHI_TETRAHEDRAL'): + _INT_TO_ATOMCHIRAL.update({ + 4: Chem.rdchem.ChiralType.CHI_TETRAHEDRAL, + 5: Chem.rdchem.ChiralType.CHI_ALLENE, + 6: Chem.rdchem.ChiralType.CHI_SQUAREPLANAR, + 7: Chem.rdchem.ChiralType.CHI_TRIGONALBIPYRAMIDAL, + 8: Chem.rdchem.ChiralType.CHI_OCTAHEDRAL, + }) +_ATOMCHIRAL_TO_INT = {v: k for k, v in _INT_TO_ATOMCHIRAL.items()} + + +_INT_TO_BONDTYPE = { + 0: Chem.rdchem.BondType.UNSPECIFIED, + 1: Chem.rdchem.BondType.SINGLE, + 2: Chem.rdchem.BondType.DOUBLE, + 3: Chem.rdchem.BondType.TRIPLE, + 4: Chem.rdchem.BondType.QUADRUPLE, + 5: Chem.rdchem.BondType.QUINTUPLE, + 6: Chem.rdchem.BondType.HEXTUPLE, + 7: Chem.rdchem.BondType.ONEANDAHALF, + 8: Chem.rdchem.BondType.TWOANDAHALF, + 9: Chem.rdchem.BondType.THREEANDAHALF, + 10: Chem.rdchem.BondType.FOURANDAHALF, + 11: Chem.rdchem.BondType.FIVEANDAHALF, + 12: Chem.rdchem.BondType.AROMATIC, + 13: Chem.rdchem.BondType.IONIC, + 14: Chem.rdchem.BondType.HYDROGEN, + 15: Chem.rdchem.BondType.THREECENTER, + 16: Chem.rdchem.BondType.DATIVEONE, + 17: Chem.rdchem.BondType.DATIVE, + 18: Chem.rdchem.BondType.DATIVEL, + 19: Chem.rdchem.BondType.DATIVER, + 20: Chem.rdchem.BondType.OTHER, + 21: Chem.rdchem.BondType.ZERO} +_BONDTYPE_TO_INT = {v: k for k, v in _INT_TO_BONDTYPE.items()} +_INT_TO_BONDSTEREO = { + 0: Chem.rdchem.BondStereo.STEREONONE, + 1: Chem.rdchem.BondStereo.STEREOANY, + 2: Chem.rdchem.BondStereo.STEREOZ, + 3: Chem.rdchem.BondStereo.STEREOE, + 4: Chem.rdchem.BondStereo.STEREOCIS, + 5: Chem.rdchem.BondStereo.STEREOTRANS} +_BONDSTEREO_TO_INT = {v: k for k, v in _INT_TO_BONDSTEREO.items()} + + +def _setprops(obj, d: dict) -> None: + # add props onto rdkit "obj" (atom/bond/mol/conformer) + # props are guaranteed one of Bool, Int, Float or String type + for k, v in d.items(): + if isinstance(v, bool): + obj.SetBoolProp(k, v) + elif isinstance(v, int): + obj.SetIntProp(k, v) + elif isinstance(v, float): + obj.SetDoubleProp(k, v) + else: # isinstance(v, str): + obj.SetProp(k, v) + + +def _mol_from_dict(d: dict) -> Chem.Mol: + m = Chem.Mol() + em = Chem.EditableMol(m) + + for atom in d['atoms']: + a = Chem.Atom(atom[0]) + a.SetIsotope(atom[1]) + a.SetFormalCharge(atom[2]) + a.SetIsAromatic(atom[3]) + a.SetChiralTag(_INT_TO_ATOMCHIRAL[atom[4]]) + a.SetAtomMapNum(atom[5]) + _setprops(a, atom[6]) + em.AddAtom(a) + + for bond in d['bonds']: + em.AddBond(bond[0], bond[1], _INT_TO_BONDTYPE[bond[2]]) + # other fields are applied onto the ROMol + + m = em.GetMol() + + for bond, b in zip(d['bonds'], m.GetBonds()): + b.SetStereo(_INT_TO_BONDSTEREO[bond[3]]) + _setprops(b, bond[4]) + + pos = deserialize_numpy(d['conformer'][0]) + c = Chem.Conformer(m.GetNumAtoms()) + for i, p in enumerate(pos): + c.SetAtomPosition(i, p) + _setprops(c, d['conformer'][1]) + m.AddConformer(c) + + _setprops(m, d['molprops']) + + m.UpdatePropertyCache() + + return m + + +def _mol_to_dict(m: Chem.Mol) -> dict[str, Any]: + # in a perfect world we'd use ToBinary() + # but this format slowly evolves, so the future hash of a SMC could change if rdkit were updated + # this is based on that method, with some irrelevant fields cut out + + output: dict[str, Any] = {} + + atoms = [] + for atom in m.GetAtoms(): + atoms.append(( + atom.GetAtomicNum(), atom.GetIsotope(), atom.GetFormalCharge(), atom.GetIsAromatic(), + _ATOMCHIRAL_TO_INT[atom.GetChiralTag()], atom.GetAtomMapNum(), + atom.GetPropsAsDict(includePrivate=False), + )) + output['atoms'] = atoms + + bonds = [] + for bond in m.GetBonds(): + bonds.append(( + bond.GetBeginAtomIdx(), bond.GetEndAtomIdx(), _BONDTYPE_TO_INT[bond.GetBondType()], + _BONDSTEREO_TO_INT[bond.GetStereo()], + bond.GetPropsAsDict(includePrivate=False) + )) + output['bonds'] = bonds + + conf = m.GetConformer() + output['conformer'] = (serialize_numpy(conf.GetPositions()), conf.GetPropsAsDict(includePrivate=False)) + + output['molprops'] = m.GetPropsAsDict(includePrivate=False) + + return output + + def _check_partial_charges(mol: RDKitMol) -> None: """ Checks for the presence of partial charges. @@ -131,7 +268,6 @@ def smiles(self) -> str: def total_charge(self): return Chem.GetFormalCharge(self._rdkit) - # TO def to_rdkit(self) -> RDKitMol: """Return an RDKit copied representation of this molecule""" return Chem.Mol(self._rdkit) @@ -139,7 +275,6 @@ def to_rdkit(self) -> RDKitMol: def to_json(self): return json.dumps(self.to_dict()) - # From @classmethod def from_json(cls, json_str): dct = json.loads(json_str) diff --git a/gufe/components/smallmoleculecomponent.py b/gufe/components/smallmoleculecomponent.py index 849097b5..85db3bce 100644 --- a/gufe/components/smallmoleculecomponent.py +++ b/gufe/components/smallmoleculecomponent.py @@ -5,78 +5,13 @@ # openff complains about oechem being missing, shhh logger = logging.getLogger('openff.toolkit') logger.setLevel(logging.ERROR) -from typing import Any from rdkit import Chem -from .explicitmoleculecomponent import ExplicitMoleculeComponent -from ..molhashing import deserialize_numpy, serialize_numpy - - -_INT_TO_ATOMCHIRAL = { - 0: Chem.rdchem.ChiralType.CHI_UNSPECIFIED, - 1: Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW, - 2: Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW, - 3: Chem.rdchem.ChiralType.CHI_OTHER, -} -# support for non-tetrahedral stereo requires rdkit 2022.09.1+ -if hasattr(Chem.rdchem.ChiralType, 'CHI_TETRAHEDRAL'): - _INT_TO_ATOMCHIRAL.update({ - 4: Chem.rdchem.ChiralType.CHI_TETRAHEDRAL, - 5: Chem.rdchem.ChiralType.CHI_ALLENE, - 6: Chem.rdchem.ChiralType.CHI_SQUAREPLANAR, - 7: Chem.rdchem.ChiralType.CHI_TRIGONALBIPYRAMIDAL, - 8: Chem.rdchem.ChiralType.CHI_OCTAHEDRAL, - }) -_ATOMCHIRAL_TO_INT = {v: k for k, v in _INT_TO_ATOMCHIRAL.items()} - - -_INT_TO_BONDTYPE = { - 0: Chem.rdchem.BondType.UNSPECIFIED, - 1: Chem.rdchem.BondType.SINGLE, - 2: Chem.rdchem.BondType.DOUBLE, - 3: Chem.rdchem.BondType.TRIPLE, - 4: Chem.rdchem.BondType.QUADRUPLE, - 5: Chem.rdchem.BondType.QUINTUPLE, - 6: Chem.rdchem.BondType.HEXTUPLE, - 7: Chem.rdchem.BondType.ONEANDAHALF, - 8: Chem.rdchem.BondType.TWOANDAHALF, - 9: Chem.rdchem.BondType.THREEANDAHALF, - 10: Chem.rdchem.BondType.FOURANDAHALF, - 11: Chem.rdchem.BondType.FIVEANDAHALF, - 12: Chem.rdchem.BondType.AROMATIC, - 13: Chem.rdchem.BondType.IONIC, - 14: Chem.rdchem.BondType.HYDROGEN, - 15: Chem.rdchem.BondType.THREECENTER, - 16: Chem.rdchem.BondType.DATIVEONE, - 17: Chem.rdchem.BondType.DATIVE, - 18: Chem.rdchem.BondType.DATIVEL, - 19: Chem.rdchem.BondType.DATIVER, - 20: Chem.rdchem.BondType.OTHER, - 21: Chem.rdchem.BondType.ZERO} -_BONDTYPE_TO_INT = {v: k for k, v in _INT_TO_BONDTYPE.items()} -_INT_TO_BONDSTEREO = { - 0: Chem.rdchem.BondStereo.STEREONONE, - 1: Chem.rdchem.BondStereo.STEREOANY, - 2: Chem.rdchem.BondStereo.STEREOZ, - 3: Chem.rdchem.BondStereo.STEREOE, - 4: Chem.rdchem.BondStereo.STEREOCIS, - 5: Chem.rdchem.BondStereo.STEREOTRANS} -_BONDSTEREO_TO_INT = {v: k for k, v in _INT_TO_BONDSTEREO.items()} - - -def _setprops(obj, d: dict) -> None: - # add props onto rdkit "obj" (atom/bond/mol/conformer) - # props are guaranteed one of Bool, Int, Float or String type - for k, v in d.items(): - if isinstance(v, bool): - obj.SetBoolProp(k, v) - elif isinstance(v, int): - obj.SetIntProp(k, v) - elif isinstance(v, float): - obj.SetDoubleProp(k, v) - else: # isinstance(v, str): - obj.SetProp(k, v) +from .explicitmoleculecomponent import ( + ExplicitMoleculeComponent, + _mol_to_dict, _mol_from_dict, +) class SmallMoleculeComponent(ExplicitMoleculeComponent): @@ -106,6 +41,14 @@ class SmallMoleculeComponent(ExplicitMoleculeComponent): name : str, optional A human readable tag for this molecule. This name will be used in the hash. """ + @classmethod + def _from_dict(cls, d: dict): + m = _mol_from_dict(d) + + return cls(rdkit=m) + + def _to_dict(self) -> dict: + return _mol_to_dict(self._rdkit) def to_sdf(self) -> str: """Create a string based on SDF. @@ -192,75 +135,3 @@ def to_openff(self): def from_openff(cls, openff, name: str = ""): """Construct from an OpenFF toolkit Molecule""" return cls(openff.to_rdkit(), name=name) - - def _to_dict(self) -> dict: - """Serialize to dict representation""" - # in a perfect world we'd use ToBinary() - # but this format slowly evolves, so the future hash of a SMC could change if rdkit were updated - # this is based on that method, with some irrelevant fields cut out - - output: dict[str, Any] = {} - - atoms = [] - for atom in self._rdkit.GetAtoms(): - atoms.append(( - atom.GetAtomicNum(), atom.GetIsotope(), atom.GetFormalCharge(), atom.GetIsAromatic(), - _ATOMCHIRAL_TO_INT[atom.GetChiralTag()], atom.GetAtomMapNum(), - atom.GetPropsAsDict(includePrivate=False), - )) - output['atoms'] = atoms - - bonds = [] - for bond in self._rdkit.GetBonds(): - bonds.append(( - bond.GetBeginAtomIdx(), bond.GetEndAtomIdx(), _BONDTYPE_TO_INT[bond.GetBondType()], - _BONDSTEREO_TO_INT[bond.GetStereo()], - bond.GetPropsAsDict(includePrivate=False) - )) - output['bonds'] = bonds - - conf = self._rdkit.GetConformer() - output['conformer'] = (serialize_numpy(conf.GetPositions()), conf.GetPropsAsDict(includePrivate=False)) - - output['molprops'] = self._rdkit.GetPropsAsDict(includePrivate=False) - - return output - - @classmethod - def _from_dict(cls, d: dict): - """Deserialize from dict representation""" - m = Chem.Mol() - em = Chem.EditableMol(m) - - for atom in d['atoms']: - a = Chem.Atom(atom[0]) - a.SetIsotope(atom[1]) - a.SetFormalCharge(atom[2]) - a.SetIsAromatic(atom[3]) - a.SetChiralTag(_INT_TO_ATOMCHIRAL[atom[4]]) - a.SetAtomMapNum(atom[5]) - _setprops(a, atom[6]) - em.AddAtom(a) - - for bond in d['bonds']: - em.AddBond(bond[0], bond[1], _INT_TO_BONDTYPE[bond[2]]) - # other fields are applied onto the ROMol - - m = em.GetMol() - - for bond, b in zip(d['bonds'], m.GetBonds()): - b.SetStereo(_INT_TO_BONDSTEREO[bond[3]]) - _setprops(b, bond[4]) - - pos = deserialize_numpy(d['conformer'][0]) - c = Chem.Conformer(m.GetNumAtoms()) - for i, p in enumerate(pos): - c.SetAtomPosition(i, p) - _setprops(c, d['conformer'][1]) - m.AddConformer(c) - - _setprops(m, d['molprops']) - - m.UpdatePropertyCache() - - return cls(rdkit=m) From ea1271cb6fec7b8b9035cd502cbecd38327b8565 Mon Sep 17 00:00:00 2001 From: richard gowers Date: Wed, 9 Aug 2023 07:57:18 +0100 Subject: [PATCH 3/6] use ExplicitMolecule serialization in Protein --- gufe/components/proteincomponent.py | 131 +++++++--------------------- 1 file changed, 32 insertions(+), 99 deletions(-) diff --git a/gufe/components/proteincomponent.py b/gufe/components/proteincomponent.py index 87366b21..a28610b3 100644 --- a/gufe/components/proteincomponent.py +++ b/gufe/components/proteincomponent.py @@ -11,7 +11,10 @@ from rdkit.Chem.rdchem import Mol, Atom, Conformer, EditableMol, BondType from ..custom_typing import RDKitMol -from .explicitmoleculecomponent import ExplicitMoleculeComponent +from .explicitmoleculecomponent import ( + ExplicitMoleculeComponent, + _mol_from_dict, _mol_to_dict, +) from ..vendor.pdb_file.pdbfile import PDBFile from ..vendor.pdb_file.pdbxfile import PDBxFile @@ -19,20 +22,6 @@ from ..molhashing import deserialize_numpy, serialize_numpy -_BONDORDERS_OPENMM_TO_RDKIT = { - 1: BondType.SINGLE, - 2: BondType.DOUBLE, - 3: BondType.TRIPLE, - app.Single: BondType.SINGLE, - app.Double: BondType.DOUBLE, - app.Triple: BondType.TRIPLE, - app.Aromatic: BondType.AROMATIC, - None: BondType.UNSPECIFIED, -} -_BONDORDERS_RDKIT_TO_OPENMM = { - v: k for k, v in _BONDORDERS_OPENMM_TO_RDKIT.items() -} - # builtin dict of strings to enum members, boy I hope this is stable _BONDORDER_STR_TO_RDKIT = Chem.BondType.names _BONDORDER_RDKIT_TO_STR = {v: k for k, v in _BONDORDER_STR_TO_RDKIT.items()} @@ -123,65 +112,27 @@ def from_pdbx_file(cls, pdbx_file: str, name=""): ) @classmethod - def _from_dict(cls, ser_dict: dict, name: str = ""): + def _from_dict(cls, ser_dict: dict): """Deserialize from dict representation""" + monomer_infos = ser_dict.pop('monomer_info') - # Mol - rd_mol = Mol() - editable_rdmol = EditableMol(rd_mol) + m = _mol_from_dict(ser_dict) - # Add Atoms - for atom in ser_dict["atoms"]: - atomic_num = int(atom[0]) - - a = Atom(atomic_num) + for atom, info in zip(m.GetAtoms(), monomer_infos): mi = Chem.AtomPDBResidueInfo() - mi.SetChainId(atom[1]) - mi.SetSerialNumber(atom[2]) - mi.SetSegmentNumber(atom[3]) - mi.SetInsertionCode(atom[4]) - mi.SetName(atom[5]) - mi.SetResidueName(atom[6]) - mi.SetResidueNumber(int(atom[7])) - mi.SetIsHeteroAtom(atom[8] == 'Y') - a.SetFormalCharge(atom[9]) - - a.SetMonomerInfo(mi) - - editable_rdmol.AddAtom(a) - - # Add Bonds - for bond in ser_dict["bonds"]: - atomBeginIdx = int(bond[0]) - atomEndIdx = int(bond[1]) - bondType = _BONDORDER_STR_TO_RDKIT[bond[2]] - editable_rdmol.AddBond( - beginAtomIdx=atomBeginIdx, - endAtomIdx=atomEndIdx, - order=bondType, - ) - - # Set Positions - rd_mol = editable_rdmol.GetMol() - positions = ser_dict["conformers"] - for i, frame_pos in enumerate(positions): - frame_pos = deserialize_numpy(frame_pos) - conf = Conformer(i) - for atom_id, atom_pos in enumerate(frame_pos): - conf.SetAtomPosition(atom_id, atom_pos) # unit: nm - rd_mol.AddConformer(conf) + mi.SetChainId(mi[0]) + mi.SetSerialNumber(mi[1]) + mi.SetSegmentNumber(mi[2]) + mi.SetInsertionCode(mi[3]) + mi.SetName(mi[4]) + mi.SetResidueName(mi[5]) + mi.SetResidueNumber(mi[6]) + mi.SetIsHeteroAtom(mi[7] == 'Y') - # Adding missing bond info - for bond_id, bond in enumerate(rd_mol.GetBonds()): - # Can't set these on an editable mol, go round a second time - _, _, _, arom = ser_dict["bonds"][bond_id] - bond.SetIsAromatic(arom == 'Y') + atom.SetMonomerInfo(mi) - if "name" in ser_dict: - name = ser_dict["name"] - - return cls(rdkit=rd_mol, name=name) + return cls(rdkit=m) def to_openmm_topology(self) -> "app.Topology": """Convert to an openmm Topology object @@ -193,6 +144,14 @@ def to_openmm_topology(self) -> "app.Topology": """ from openmm import app + BONDORDERS_RDKIT_TO_OPENMM = { + BondType.SINGLE: app.Single, + BondType.DOUBLE: app.Double, + BondType.TRIPLE: app.Triple, + BondType.AROMATIC: app.Aromatic, + BondType.UNSPECIFIED: None, + } + def reskey(m): """key for defining when a residue has changed from previous @@ -255,7 +214,7 @@ def chainkey(m): a1 = atom_lookup[bond.GetBeginAtomIdx()] a2 = atom_lookup[bond.GetEndAtomIdx()] top.addBond(a1, a2, - order=_BONDORDERS_RDKIT_TO_OPENMM.get( + order=BONDORDERS_RDKIT_TO_OPENMM.get( bond.GetBondType(), None)) return top @@ -372,15 +331,13 @@ def to_pdbx_file( def _to_dict(self) -> dict: """Serialize to dict representation""" + d = _mol_to_dict(self._rdkit) - atoms = [] + monomer_infos = [] for atom in self._rdkit.GetAtoms(): mi = atom.GetMonomerInfo() - # TODO: Stereo? - atoms.append( - ( - atom.GetAtomicNum(), + monomer_infos.append(( mi.GetChainId(), mi.GetSerialNumber(), mi.GetSegmentNumber(), @@ -389,32 +346,8 @@ def _to_dict(self) -> dict: mi.GetResidueName(), mi.GetResidueNumber(), 'Y' if mi.GetIsHeteroAtom() else 'N', - atom.GetFormalCharge(), - ) - ) + )) - bonds = [ - ( - bond.GetBeginAtomIdx(), - bond.GetEndAtomIdx(), - _BONDORDER_RDKIT_TO_STR[bond.GetBondType()], - 'Y' if bond.GetIsAromatic() else 'N', - # bond.GetStereo() or "", do we need this? i.e. are openff ffs going to use cis/trans SMARTS? - ) - for bond in self._rdkit.GetBonds() - ] - - conformers = [ - serialize_numpy(conf.GetPositions()) # .m_as(unit.angstrom) - for conf in self._rdkit.GetConformers() - ] - - # Result - d = { - "atoms": atoms, - "bonds": bonds, - "name": self.name, - "conformers": conformers, - } + d['monomer_info'] = monomer_infos return d From 4e91467503a929195db5723e8dfacc344ac5072c Mon Sep 17 00:00:00 2001 From: richard gowers Date: Thu, 10 Aug 2023 12:08:44 +0100 Subject: [PATCH 4/6] fixup Protein deserialization --- gufe/components/proteincomponent.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/gufe/components/proteincomponent.py b/gufe/components/proteincomponent.py index a28610b3..85d12992 100644 --- a/gufe/components/proteincomponent.py +++ b/gufe/components/proteincomponent.py @@ -121,14 +121,14 @@ def _from_dict(cls, ser_dict: dict): for atom, info in zip(m.GetAtoms(), monomer_infos): mi = Chem.AtomPDBResidueInfo() - mi.SetChainId(mi[0]) - mi.SetSerialNumber(mi[1]) - mi.SetSegmentNumber(mi[2]) - mi.SetInsertionCode(mi[3]) - mi.SetName(mi[4]) - mi.SetResidueName(mi[5]) - mi.SetResidueNumber(mi[6]) - mi.SetIsHeteroAtom(mi[7] == 'Y') + mi.SetChainId(info[0]) + mi.SetSerialNumber(info[1]) + mi.SetSegmentNumber(info[2]) + mi.SetInsertionCode(info[3]) + mi.SetName(info[4]) + mi.SetResidueName(info[5]) + mi.SetResidueNumber(info[6]) + mi.SetIsHeteroAtom(info[7] == 'Y') atom.SetMonomerInfo(mi) From 8658083f95580ede16c398c1f7bce4235db32872 Mon Sep 17 00:00:00 2001 From: richard gowers Date: Thu, 10 Aug 2023 12:08:55 +0100 Subject: [PATCH 5/6] fixup to_openmm_positions --- gufe/components/proteincomponent.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/gufe/components/proteincomponent.py b/gufe/components/proteincomponent.py index 85d12992..143ac1a0 100644 --- a/gufe/components/proteincomponent.py +++ b/gufe/components/proteincomponent.py @@ -231,12 +231,9 @@ def to_openmm_positions(self) -> "openmm.app.unit.Quantity": """ from openmm import unit as omm_unit - np_pos = deserialize_numpy(self.to_dict()["conformers"][0]) - openmm_pos = ( - list(map(lambda x: np.array(x), np_pos)) * omm_unit.angstrom - ) + np_pos = self._rdkit.GetConformer().GetPositions() - return openmm_pos + return np_pos * omm_unit.angstrom def to_pdb_file(self, out_path: Union[str, bytes, PathLike[str], PathLike[bytes], io.TextIOBase]) -> str: """Write protein to pdb file. From 215dace8826ec0b224b5e31ae7c30b5aac5701ff Mon Sep 17 00:00:00 2001 From: richard gowers Date: Thu, 10 Aug 2023 13:40:28 +0100 Subject: [PATCH 6/6] use pooch for PLB files previous solution used io.StringIO which rdkit doesn't like (and wasn't documented as accepted) --- gufe/tests/conftest.py | 82 ++++++++++++----------------- gufe/tests/test_proteincomponent.py | 50 ++++++------------ 2 files changed, 50 insertions(+), 82 deletions(-) diff --git a/gufe/tests/conftest.py b/gufe/tests/conftest.py index 034fac04..8ca4d444 100644 --- a/gufe/tests/conftest.py +++ b/gufe/tests/conftest.py @@ -4,8 +4,8 @@ import importlib.resources import urllib.request from urllib.error import URLError -import io import functools +import pooch import pytest from rdkit import Chem from rdkit.Chem import AllChem @@ -20,58 +20,42 @@ else: HAS_INTERNET = True +PLB_files = pooch.create( + path=pooch.os_cache('pdbinf'), + base_url='https://github.com/openforcefield/protein-ligand-benchmark/raw/d3387602bbeb0167abf00dfb81753d8936775dd2/data/', + version=None, + registry={ + 'p38/01_protein/crd/protein.pdb': '3f0bf718644e7c29f5200cd3def4240ac25ef5fb1948b2e64deb5015d8a45aa4', + 'mcl1/01_protein/crd/protein.pdb': 'f80ff9dd93a5d9dd6e90091e9631a8ce7fe0dc931e16543e22c1f92009660306', + 'cdk2/01_protein/crd/protein.pdb': '15d1e509d7951ca45ea266d51a627d5f452dcf0bb5bd48751ae57eb29e28ab69', + 'shp2/01_protein/crd/protein.pdb': 'd6759cbd135aaddaa658446064df4095d978d3681c014a0528b542d60b2c8770', + 'pde2/01_protein/crd/protein.pdb': '3b7967c1717789215452cdf919520625602d5438a9d2a18620726b8b1b3a8ef0', + 'cmet/01_protein/crd/protein.pdb': '155ec32941a9082dbdbbfde460ff97c88d4fe7e100e9a9577edb5a9e7b6467ae', + 'ptp1b/01_protein/crd/protein.pdb': 'bfa0f9204e96aa463b80946b788c4153cd24701291007eb77638a16fd156634e', + 'thrombin/01_protein/crd/protein.pdb': 'eb4ea18bef9c4c71dcdc922616d6719ee918112be87a0bd6b274c856eff1dd59', + 'cdk8/01_protein/crd/protein.pdb': 'b058774526a19775d8f438b14e9d6da331b6de74e0ef9e96db575f6c0bb067b2', + 'pfkfb3/01_protein/crd/protein.pdb': '4367710db0dbf284cc715ae9a8dd82d06bd77dcc3fb0885678e16632a2732dcc', + 'tyk2/01_protein/crd/protein.pdb': '9090684f4bdae90afbe5f2698a14c778396c024c19ceb6333de4808d9e29fae6', + 'syk/01_protein/crd/protein.pdb': 'f6199d0c1818eb5bb24e164426789cf39cae7aa32c8ca2e98f5f44d299a6f82f', + 'tnks2/01_protein/crd/protein.pdb': 'fc7681a05dbf07590aa8de133f981b6d8ae9cebcc23d54addc2c4fe80be80299', + 'eg5/01_protein/crd/protein.pdb': 'f2964a785c922502dc86fb4e2e5295d32d41d5b68b8c3246e989de5234c3fd0f', + 'hif2a/01_protein/crd/protein.pdb': '5bbf520e7c102a65cc7ba0253fd66f43562f77284c82b3b9613e997b7ac76c93', + + }, +) -class URLFileLike: - def __init__(self, url, encoding='utf-8'): - self.url = url - self.encoding = encoding - self.data = None - def __call__(self): +@pytest.fixture(params=['p38', 'mcl1', 'cdk2', 'shp2', 'pde2', 'cmet', 'ptp1b', + 'thrombin', 'cdk8', 'pfkfb3', 'tyk2', 'syk', 'tnks2', + 'eg5', 'hif2a', '181l']) +def PDB_files(request): + if request.param == '181l': + with importlib.resources.path('gufe.tests.data', '181l.pdb') as file: + return str(file) + else: if not HAS_INTERNET: # pragma: no-cover pytest.skip("Skipping because internet seems faulty") - - if self.data is None: - req = urllib.request.urlopen(self.url) - self.data = req.read().decode(self.encoding) - - return io.StringIO(self.data) - - -def get_test_filename(filename): - with importlib.resources.path('gufe.tests.data', filename) as file: - return str(file) - - -_benchmark_pdb_names = [ - "cmet_protein", - "hif2a_protein", - "mcl1_protein", - "p38_protein", - "ptp1b_protein", - "syk_protein", - "thrombin_protein", - "tnsk2_protein", - "tyk2_protein", - ] - - -_pl_benchmark_url_pattern = ( - "https://github.com/OpenFreeEnergy/openfe-benchmarks/blob/main/openfe_benchmarks/data/{name}.pdb?raw=true" -) - - -PDB_BENCHMARK_LOADERS = { - name: URLFileLike(url=_pl_benchmark_url_pattern.format(name=name)) - for name in _benchmark_pdb_names -} - -PDB_FILE_LOADERS = { - name: lambda: get_test_filename(name) - for name in ["181l.pdb"] -} - -ALL_PDB_LOADERS = dict(**PDB_BENCHMARK_LOADERS, **PDB_FILE_LOADERS) + return PLB_files.fetch('{}/01_protein/crd/protein.pdb'.format(request.param)) @pytest.fixture diff --git a/gufe/tests/test_proteincomponent.py b/gufe/tests/test_proteincomponent.py index 934095a7..90161c4a 100644 --- a/gufe/tests/test_proteincomponent.py +++ b/gufe/tests/test_proteincomponent.py @@ -14,7 +14,7 @@ from openmm import unit from numpy.testing import assert_almost_equal -from .conftest import ALL_PDB_LOADERS +from .conftest import PLB_files @pytest.fixture @@ -94,11 +94,8 @@ class TestProteinComponent(GufeTokenizableTestsMixin): def instance(self, PDB_181L_path): return self.cls.from_pdb_file(PDB_181L_path, name="Steve") - # From - @pytest.mark.parametrize('in_pdb_path', ALL_PDB_LOADERS.keys()) - def test_from_pdb_file(self, in_pdb_path): - in_pdb_io = ALL_PDB_LOADERS[in_pdb_path]() - p = self.cls.from_pdb_file(in_pdb_io, name="Steve") + def test_from_pdb_file(self, PDB_files): + p = self.cls.from_pdb_file(PDB_files, name="Steve") assert isinstance(p, ProteinComponent) assert p.name == "Steve" @@ -177,21 +174,16 @@ def test_to_pdb_input_types(self, PDB_181L_OpenMMClean_path, tmp_path, output_func=p.to_pdb_file ) - @pytest.mark.parametrize('in_pdb_path', ALL_PDB_LOADERS.keys()) - def test_to_pdb_round_trip(self, in_pdb_path, tmp_path): - in_pdb_io = ALL_PDB_LOADERS[in_pdb_path]() - - p = self.cls.from_pdb_file(in_pdb_io, name="Wuff") - out_file_name = "tmp_"+in_pdb_path+".pdb" + def test_to_pdb_round_trip(self, PDB_files, tmp_path): + p = self.cls.from_pdb_file(PDB_files, name="Wuff") + out_file_name = "tmp_foo.pdb" out_file = tmp_path / out_file_name p.to_pdb_file(str(out_file)) - ref_in_pdb_io = ALL_PDB_LOADERS[in_pdb_path]() - # generate openMM reference file: - openmm_pdb = pdbfile.PDBFile(ref_in_pdb_io) - out_ref_file_name = "tmp_"+in_pdb_path+"_openmm_ref.pdb" + openmm_pdb = pdbfile.PDBFile(PDB_files) + out_ref_file_name = "tmp_foo_openmm_ref.pdb" out_ref_file = tmp_path / out_ref_file_name pdbfile.PDBFile.writeFile(openmm_pdb.topology, openmm_pdb.positions, file=open(str(out_ref_file), "w")) @@ -213,16 +205,11 @@ def test_dummy_from_dict(self, PDB_181L_OpenMMClean_path): assert p == p2 - # parametrize - @pytest.mark.parametrize('in_pdb_path', ALL_PDB_LOADERS.keys()) - def test_to_openmm_positions(self, in_pdb_path): - in_pdb_io = ALL_PDB_LOADERS[in_pdb_path]() - ref_in_pdb_io = ALL_PDB_LOADERS[in_pdb_path]() - - openmm_pdb = pdbfile.PDBFile(ref_in_pdb_io) + def test_to_openmm_positions(self, PDB_files): + openmm_pdb = pdbfile.PDBFile(PDB_files) openmm_pos = openmm_pdb.positions - p = self.cls.from_pdb_file(in_pdb_io, name="Bob") + p = self.cls.from_pdb_file(PDB_files, name="Bob") gufe_openmm_pos = p.to_openmm_positions() v1 = gufe_openmm_pos.value_in_unit(unit.nanometer) @@ -230,16 +217,11 @@ def test_to_openmm_positions(self, in_pdb_path): assert_almost_equal(actual=v1, desired=v2, decimal=6) - # parametrize - @pytest.mark.parametrize('in_pdb_path', ALL_PDB_LOADERS.keys()) - def test_to_openmm_topology(self, in_pdb_path): - in_pdb_io = ALL_PDB_LOADERS[in_pdb_path]() - ref_in_pdb_io = ALL_PDB_LOADERS[in_pdb_path]() - - openmm_pdb = pdbfile.PDBFile(ref_in_pdb_io) + def test_to_openmm_topology(self, PDB_files): + openmm_pdb = pdbfile.PDBFile(PDB_files) openmm_top = openmm_pdb.topology - p = self.cls.from_pdb_file(in_pdb_io, name="Bob") + p = self.cls.from_pdb_file(PDB_files, name="Bob") gufe_openmm_top = p.to_openmm_topology() assert_topology_equal(openmm_top, gufe_openmm_top) @@ -290,7 +272,9 @@ def test_protein_total_charge(self, PDB_181L_path): assert m1.total_charge == 7 def test_protein_total_charge_thromb(self): - m1 = self.cls.from_pdb_file(ALL_PDB_LOADERS["thrombin_protein"]()) + f = PLB_files.fetch('thrombin/01_protein/crd/protein.pdb') + + m1 = self.cls.from_pdb_file(f) assert m1.total_charge == 6