From 32de146683e7f91317d005a788e1e5fb775de1f2 Mon Sep 17 00:00:00 2001
From: richard <richardjgowers@gmail.com>
Date: Wed, 22 Feb 2023 18:24:43 +0000
Subject: [PATCH 1/6] WIP Use PDBInf

---
 environment.yml                     |   1 +
 gufe/components/proteincomponent.py | 157 ++++------------------------
 setup.cfg                           |   0
 3 files changed, 24 insertions(+), 134 deletions(-)
 create mode 100644 setup.cfg

diff --git a/environment.yml b/environment.yml
index 7adbed8d..1d0ff40e 100644
--- a/environment.yml
+++ b/environment.yml
@@ -10,6 +10,7 @@ dependencies:
   - openff-units ==0.2.0  # https://github.com/openforcefield/openff-units/issues/69
   - pint <0.22  # https://github.com/openforcefield/openff-units/issues/69
   - openff-models >=0.0.5
+  - pdbinf
   - pip
   - pydantic >1
   - pytest
diff --git a/gufe/components/proteincomponent.py b/gufe/components/proteincomponent.py
index e2697ccf..87366b21 100644
--- a/gufe/components/proteincomponent.py
+++ b/gufe/components/proteincomponent.py
@@ -1,15 +1,11 @@
 # This code is part of OpenFE and is licensed under the MIT license.
 # For details, see https://github.com/OpenFreeEnergy/gufe
-import ast
-import json
 import io
 import numpy as np
 from os import PathLike
-from typing import Union, Optional
-from collections import defaultdict
+from typing import Union
 
-from openmm import app
-from openmm import unit as omm_unit
+import pdbinf
 
 from rdkit import Chem
 from rdkit.Chem.rdchem import Mol, Atom, Conformer, EditableMol, BondType
@@ -51,10 +47,6 @@
 }
 
 
-negative_ions = ["F", "CL", "Br", "I"]
-positive_ions = ["NA", "MG", "ZN"]
-
-
 class ProteinComponent(ExplicitMoleculeComponent):
     """
     ``Component`` representing the contents of a PDB file, such as a protein.
@@ -84,7 +76,6 @@ def __init__(self, rdkit: RDKitMol, name=""):
                             "Consider loading via rdkit.Chem.MolFromPDBFile or similar.")
         super().__init__(rdkit=rdkit, name=name)
 
-    # FROM
     @classmethod
     def from_pdb_file(cls, pdb_file: str, name: str = ""):
         """
@@ -102,9 +93,10 @@ def from_pdb_file(cls, pdb_file: str, name: str = ""):
         ProteinComponent
             the deserialized molecule
         """
-        openmm_PDBFile = PDBFile(pdb_file)
-        return cls._from_openmmPDBFile(
-            openmm_PDBFile=openmm_PDBFile, name=name
+        return cls(
+            rdkit=pdbinf.load_pdb_file(pdb_file,
+                                 templates=[pdbinf.STANDARD_AA_DOC]),
+            name=name,
         )
 
     @classmethod
@@ -114,7 +106,7 @@ def from_pdbx_file(cls, pdbx_file: str, name=""):
 
         Parameters
         ----------
-        pdbxfile : str
+        pdbx_file : str
             path to the pdb file.
         name : str, optional
             name of the input protein, by default ""
@@ -124,117 +116,12 @@ def from_pdbx_file(cls, pdbx_file: str, name=""):
         ProteinComponent
             the deserialized molecule
         """
-        openmm_PDBxFile = PDBxFile(pdbx_file)
-        return cls._from_openmmPDBFile(
-            openmm_PDBFile=openmm_PDBxFile, name=name
+        return cls(
+            rdkit=pdbinf.load_pdbx_file(pdbx_file,
+                                  templates=[pdbinf.STANDARD_AA_DOC]),
+            name=name,
         )
 
-    @classmethod
-    def _from_openmmPDBFile(cls, openmm_PDBFile: Union[PDBFile, PDBxFile],
-                            name: str = ""):
-        """Converts to our internal representation (rdkit Mol)
-
-        Parameters
-        ----------
-        openmm_PDBFile : PDBFile or PDBxFile
-            object of the protein
-        name : str
-            name of the protein
-
-        Returns
-        -------
-        ProteinComponent
-            the deserialized molecule
-        """
-        periodicTable = Chem.GetPeriodicTable()
-        mol_topology = openmm_PDBFile.getTopology()
-
-        rd_mol = Mol()
-        editable_rdmol = EditableMol(rd_mol)
-
-        # Add Atoms
-        for atom in mol_topology.atoms():
-            a = Atom(atom.element.atomic_number)
-
-            atom_monomerInfo = Chem.AtomPDBResidueInfo()
-            atom_monomerInfo.SetChainId(atom.residue.chain.id)
-            atom_monomerInfo.SetSerialNumber(int(atom.id))
-            atom_monomerInfo.SetSegmentNumber(int(atom.residue.chain.index))
-            atom_monomerInfo.SetInsertionCode(atom.residue.insertionCode)
-            atom_monomerInfo.SetName(atom.name)
-            atom_monomerInfo.SetResidueName(atom.residue.name)
-            atom_monomerInfo.SetResidueNumber(int(atom.residue.id))
-            atom_monomerInfo.SetIsHeteroAtom(False)  # TODO: Do hetatoms
-
-            a.SetMonomerInfo(atom_monomerInfo)
-
-            # additonally possible:
-            # atom_monomerInfo.SetSecondaryStructure
-            # atom_monomerInfo.SetMonomerType
-            # atom_monomerInfo.SetAltLoc
-
-            editable_rdmol.AddAtom(a)
-
-        # Add Bonds
-        for bond in mol_topology.bonds():
-            bond_order = _BONDORDERS_OPENMM_TO_RDKIT[bond.order]
-            editable_rdmol.AddBond(
-                beginAtomIdx=bond.atom1.index,
-                endAtomIdx=bond.atom2.index,
-                order=bond_order,
-            )
-
-        # Set Positions
-        rd_mol = editable_rdmol.GetMol()
-        positions = np.array(
-            openmm_PDBFile.positions.value_in_unit(omm_unit.angstrom), ndmin=3
-        )
-
-        for frame_id, frame in enumerate(positions):
-            conf = Conformer(frame_id)
-            for atom_id, atom_pos in enumerate(frame):
-                conf.SetAtomPosition(atom_id, atom_pos)
-            rd_mol.AddConformer(conf)
-
-        # Add Additionals
-        # Formal Charge
-        netcharge = 0
-        for a in rd_mol.GetAtoms():
-            atomic_num = a.GetAtomicNum()
-            atom_name = a.GetMonomerInfo().GetName()
-
-            connectivity = sum(
-                int(bond.GetBondType()) for bond in a.GetBonds()
-            )
-            default_valence = periodicTable.GetDefaultValence(atomic_num)
-
-            if connectivity == 0:  # ions:
-                if atom_name in positive_ions:
-                    fc = default_valence  # e.g. Sodium ions
-                elif atom_name in negative_ions:
-                    fc = - default_valence  # e.g. Chlorine ions
-                else:  # -no-cov-
-                    resn = a.GetMonomerInfo().GetResidueName()
-                    resind = int(a.GetMonomerInfo().GetResidueNumber())
-                    raise ValueError(
-                        "I don't know this Ion or something really went "
-                        f"wrong! \t{atom_name}\t{resn}\t-{resind}\t"
-                        f"connectivity{connectivity}"
-                    )
-            elif default_valence > connectivity:
-                fc = - (default_valence - connectivity)  # negative charge
-            elif default_valence < connectivity:
-                fc = + (connectivity - default_valence)  # positive charge
-            else:
-                fc = 0  # neutral
-
-            a.SetFormalCharge(fc)
-            a.UpdatePropertyCache(strict=True)
-
-            netcharge += fc
-
-        return cls(rdkit=rd_mol, name=name)
-
     @classmethod
     def _from_dict(cls, ser_dict: dict, name: str = ""):
         """Deserialize from dict representation"""
@@ -296,8 +183,7 @@ def _from_dict(cls, ser_dict: dict, name: str = ""):
 
         return cls(rdkit=rd_mol, name=name)
 
-    # TO
-    def to_openmm_topology(self) -> app.Topology:
+    def to_openmm_topology(self) -> "app.Topology":
         """Convert to an openmm Topology object
 
         Returns
@@ -305,6 +191,8 @@ def to_openmm_topology(self) -> app.Topology:
         openmm.app.Topology
             resulting topology obj.
         """
+        from openmm import app
+
         def reskey(m):
             """key for defining when a residue has changed from previous
 
@@ -372,9 +260,9 @@ def chainkey(m):
 
         return top
 
-    def to_openmm_positions(self) -> omm_unit.Quantity:
-        """
-        serialize the positions to openmm.unit.Quantity
+    def to_openmm_positions(self) -> "openmm.app.unit.Quantity":
+        """serialize the positions to openmm.unit.Quantity
+
         ! only one frame at the moment!
 
         Returns
@@ -382,6 +270,8 @@ def to_openmm_positions(self) -> omm_unit.Quantity:
         omm_unit.Quantity
             Quantity containing protein atom positions
         """
+        from openmm import unit as omm_unit
+
         np_pos = deserialize_numpy(self.to_dict()["conformers"][0])
         openmm_pos = (
             list(map(lambda x: np.array(x), np_pos)) * omm_unit.angstrom
@@ -390,8 +280,7 @@ def to_openmm_positions(self) -> omm_unit.Quantity:
         return openmm_pos
 
     def to_pdb_file(self, out_path: Union[str, bytes, PathLike[str], PathLike[bytes], io.TextIOBase]) -> str:
-        """
-        serialize protein to pdb file.
+        """Write protein to pdb file.
 
         Parameters
         ----------
@@ -436,8 +325,7 @@ def to_pdb_file(self, out_path: Union[str, bytes, PathLike[str], PathLike[bytes]
     def to_pdbx_file(
         self, out_path: Union[str, bytes, PathLike[str], PathLike[bytes], io.TextIOBase]
     ) -> str:
-        """
-        serialize protein to pdbx file.
+        """Write protein to pdbx file.
 
         Parameters
         ----------
@@ -449,6 +337,8 @@ def to_pdbx_file(
         str
             string path to the resulting pdbx.
         """
+        from openmm import unit as omm_unit
+
         # get top:
         top = self.to_openmm_topology()
 
@@ -474,7 +364,6 @@ def to_pdbx_file(
 
         PDBxFile.writeFile(topology=top, positions=openmm_pos, file=out_file)
 
-
         if must_close:
             # we only close the file if we had to open it
             out_file.close()
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 00000000..e69de29b

From 1c2bc2a8f2cc11d217761effd60190f8482420c0 Mon Sep 17 00:00:00 2001
From: richard gowers <richardjgowers@gmail.com>
Date: Wed, 9 Aug 2023 07:56:33 +0100
Subject: [PATCH 2/6] move SMC serialization into ExplicitMolecule

(to later share with Protein)
---
 gufe/components/explicitmoleculecomponent.py | 143 ++++++++++++++++-
 gufe/components/smallmoleculecomponent.py    | 153 ++-----------------
 2 files changed, 151 insertions(+), 145 deletions(-)

diff --git a/gufe/components/explicitmoleculecomponent.py b/gufe/components/explicitmoleculecomponent.py
index fe5e7c59..86b97dcc 100644
--- a/gufe/components/explicitmoleculecomponent.py
+++ b/gufe/components/explicitmoleculecomponent.py
@@ -2,12 +2,12 @@
 import numpy as np
 import warnings
 from rdkit import Chem
-from typing import Optional
+from typing import Any, Optional
 
 from .component import Component
 
-# typing
 from ..custom_typing import RDKitMol
+from ..molhashing import deserialize_numpy, serialize_numpy
 
 
 def _ensure_ofe_name(mol: RDKitMol, name: str) -> str:
@@ -37,6 +37,143 @@ def _ensure_ofe_name(mol: RDKitMol, name: str) -> str:
     return name
 
 
+_INT_TO_ATOMCHIRAL = {
+    0: Chem.rdchem.ChiralType.CHI_UNSPECIFIED,
+    1: Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW,
+    2: Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW,
+    3: Chem.rdchem.ChiralType.CHI_OTHER,
+}
+# support for non-tetrahedral stereo requires rdkit 2022.09.1+
+if hasattr(Chem.rdchem.ChiralType, 'CHI_TETRAHEDRAL'):
+    _INT_TO_ATOMCHIRAL.update({
+        4: Chem.rdchem.ChiralType.CHI_TETRAHEDRAL,
+        5: Chem.rdchem.ChiralType.CHI_ALLENE,
+        6: Chem.rdchem.ChiralType.CHI_SQUAREPLANAR,
+        7: Chem.rdchem.ChiralType.CHI_TRIGONALBIPYRAMIDAL,
+        8: Chem.rdchem.ChiralType.CHI_OCTAHEDRAL,
+    })
+_ATOMCHIRAL_TO_INT = {v: k for k, v in _INT_TO_ATOMCHIRAL.items()}
+
+
+_INT_TO_BONDTYPE = {
+    0: Chem.rdchem.BondType.UNSPECIFIED,
+    1: Chem.rdchem.BondType.SINGLE,
+    2: Chem.rdchem.BondType.DOUBLE,
+    3: Chem.rdchem.BondType.TRIPLE,
+    4: Chem.rdchem.BondType.QUADRUPLE,
+    5: Chem.rdchem.BondType.QUINTUPLE,
+    6: Chem.rdchem.BondType.HEXTUPLE,
+    7: Chem.rdchem.BondType.ONEANDAHALF,
+    8: Chem.rdchem.BondType.TWOANDAHALF,
+    9: Chem.rdchem.BondType.THREEANDAHALF,
+    10: Chem.rdchem.BondType.FOURANDAHALF,
+    11: Chem.rdchem.BondType.FIVEANDAHALF,
+    12: Chem.rdchem.BondType.AROMATIC,
+    13: Chem.rdchem.BondType.IONIC,
+    14: Chem.rdchem.BondType.HYDROGEN,
+    15: Chem.rdchem.BondType.THREECENTER,
+    16: Chem.rdchem.BondType.DATIVEONE,
+    17: Chem.rdchem.BondType.DATIVE,
+    18: Chem.rdchem.BondType.DATIVEL,
+    19: Chem.rdchem.BondType.DATIVER,
+    20: Chem.rdchem.BondType.OTHER,
+    21: Chem.rdchem.BondType.ZERO}
+_BONDTYPE_TO_INT = {v: k for k, v in _INT_TO_BONDTYPE.items()}
+_INT_TO_BONDSTEREO = {
+    0: Chem.rdchem.BondStereo.STEREONONE,
+    1: Chem.rdchem.BondStereo.STEREOANY,
+    2: Chem.rdchem.BondStereo.STEREOZ,
+    3: Chem.rdchem.BondStereo.STEREOE,
+    4: Chem.rdchem.BondStereo.STEREOCIS,
+    5: Chem.rdchem.BondStereo.STEREOTRANS}
+_BONDSTEREO_TO_INT = {v: k for k, v in _INT_TO_BONDSTEREO.items()}
+
+
+def _setprops(obj, d: dict) -> None:
+    # add props onto rdkit "obj" (atom/bond/mol/conformer)
+    # props are guaranteed one of Bool, Int, Float or String type
+    for k, v in d.items():
+        if isinstance(v, bool):
+            obj.SetBoolProp(k, v)
+        elif isinstance(v, int):
+            obj.SetIntProp(k, v)
+        elif isinstance(v, float):
+            obj.SetDoubleProp(k, v)
+        else:  # isinstance(v, str):
+            obj.SetProp(k, v)
+
+
+def _mol_from_dict(d: dict) -> Chem.Mol:
+    m = Chem.Mol()
+    em = Chem.EditableMol(m)
+
+    for atom in d['atoms']:
+        a = Chem.Atom(atom[0])
+        a.SetIsotope(atom[1])
+        a.SetFormalCharge(atom[2])
+        a.SetIsAromatic(atom[3])
+        a.SetChiralTag(_INT_TO_ATOMCHIRAL[atom[4]])
+        a.SetAtomMapNum(atom[5])
+        _setprops(a, atom[6])
+        em.AddAtom(a)
+
+    for bond in d['bonds']:
+        em.AddBond(bond[0], bond[1], _INT_TO_BONDTYPE[bond[2]])
+        # other fields are applied onto the ROMol
+
+    m = em.GetMol()
+
+    for bond, b in zip(d['bonds'], m.GetBonds()):
+        b.SetStereo(_INT_TO_BONDSTEREO[bond[3]])
+        _setprops(b, bond[4])
+
+    pos = deserialize_numpy(d['conformer'][0])
+    c = Chem.Conformer(m.GetNumAtoms())
+    for i, p in enumerate(pos):
+        c.SetAtomPosition(i, p)
+    _setprops(c, d['conformer'][1])
+    m.AddConformer(c)
+
+    _setprops(m, d['molprops'])
+
+    m.UpdatePropertyCache()
+
+    return m
+
+
+def _mol_to_dict(m: Chem.Mol) -> dict[str, Any]:
+    # in a perfect world we'd use ToBinary()
+    # but this format slowly evolves, so the future hash of a SMC could change if rdkit were updated
+    # this is based on that method, with some irrelevant fields cut out
+
+    output: dict[str, Any] = {}
+
+    atoms = []
+    for atom in m.GetAtoms():
+        atoms.append((
+            atom.GetAtomicNum(), atom.GetIsotope(), atom.GetFormalCharge(), atom.GetIsAromatic(),
+            _ATOMCHIRAL_TO_INT[atom.GetChiralTag()], atom.GetAtomMapNum(),
+            atom.GetPropsAsDict(includePrivate=False),
+        ))
+    output['atoms'] = atoms
+
+    bonds = []
+    for bond in m.GetBonds():
+        bonds.append((
+            bond.GetBeginAtomIdx(), bond.GetEndAtomIdx(), _BONDTYPE_TO_INT[bond.GetBondType()],
+            _BONDSTEREO_TO_INT[bond.GetStereo()],
+            bond.GetPropsAsDict(includePrivate=False)
+        ))
+    output['bonds'] = bonds
+
+    conf = m.GetConformer()
+    output['conformer'] = (serialize_numpy(conf.GetPositions()), conf.GetPropsAsDict(includePrivate=False))
+
+    output['molprops'] = m.GetPropsAsDict(includePrivate=False)
+
+    return output
+
+
 def _check_partial_charges(mol: RDKitMol) -> None:
     """
     Checks for the presence of partial charges.
@@ -131,7 +268,6 @@ def smiles(self) -> str:
     def total_charge(self):
         return Chem.GetFormalCharge(self._rdkit)
 
-    # TO
     def to_rdkit(self) -> RDKitMol:
         """Return an RDKit copied representation of this molecule"""
         return Chem.Mol(self._rdkit)
@@ -139,7 +275,6 @@ def to_rdkit(self) -> RDKitMol:
     def to_json(self):
         return json.dumps(self.to_dict())
 
-    # From
     @classmethod
     def from_json(cls, json_str):
         dct = json.loads(json_str)
diff --git a/gufe/components/smallmoleculecomponent.py b/gufe/components/smallmoleculecomponent.py
index 849097b5..85db3bce 100644
--- a/gufe/components/smallmoleculecomponent.py
+++ b/gufe/components/smallmoleculecomponent.py
@@ -5,78 +5,13 @@
 # openff complains about oechem being missing, shhh
 logger = logging.getLogger('openff.toolkit')
 logger.setLevel(logging.ERROR)
-from typing import Any
 
 from rdkit import Chem
 
-from .explicitmoleculecomponent import ExplicitMoleculeComponent
-from ..molhashing import deserialize_numpy, serialize_numpy
-
-
-_INT_TO_ATOMCHIRAL = {
-    0: Chem.rdchem.ChiralType.CHI_UNSPECIFIED,
-    1: Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW,
-    2: Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW,
-    3: Chem.rdchem.ChiralType.CHI_OTHER,
-}
-# support for non-tetrahedral stereo requires rdkit 2022.09.1+
-if hasattr(Chem.rdchem.ChiralType, 'CHI_TETRAHEDRAL'):
-    _INT_TO_ATOMCHIRAL.update({
-        4: Chem.rdchem.ChiralType.CHI_TETRAHEDRAL,
-        5: Chem.rdchem.ChiralType.CHI_ALLENE,
-        6: Chem.rdchem.ChiralType.CHI_SQUAREPLANAR,
-        7: Chem.rdchem.ChiralType.CHI_TRIGONALBIPYRAMIDAL,
-        8: Chem.rdchem.ChiralType.CHI_OCTAHEDRAL,
-    })
-_ATOMCHIRAL_TO_INT = {v: k for k, v in _INT_TO_ATOMCHIRAL.items()}
-
-
-_INT_TO_BONDTYPE = {
-    0: Chem.rdchem.BondType.UNSPECIFIED,
-    1: Chem.rdchem.BondType.SINGLE,
-    2: Chem.rdchem.BondType.DOUBLE,
-    3: Chem.rdchem.BondType.TRIPLE,
-    4: Chem.rdchem.BondType.QUADRUPLE,
-    5: Chem.rdchem.BondType.QUINTUPLE,
-    6: Chem.rdchem.BondType.HEXTUPLE,
-    7: Chem.rdchem.BondType.ONEANDAHALF,
-    8: Chem.rdchem.BondType.TWOANDAHALF,
-    9: Chem.rdchem.BondType.THREEANDAHALF,
-    10: Chem.rdchem.BondType.FOURANDAHALF,
-    11: Chem.rdchem.BondType.FIVEANDAHALF,
-    12: Chem.rdchem.BondType.AROMATIC,
-    13: Chem.rdchem.BondType.IONIC,
-    14: Chem.rdchem.BondType.HYDROGEN,
-    15: Chem.rdchem.BondType.THREECENTER,
-    16: Chem.rdchem.BondType.DATIVEONE,
-    17: Chem.rdchem.BondType.DATIVE,
-    18: Chem.rdchem.BondType.DATIVEL,
-    19: Chem.rdchem.BondType.DATIVER,
-    20: Chem.rdchem.BondType.OTHER,
-    21: Chem.rdchem.BondType.ZERO}
-_BONDTYPE_TO_INT = {v: k for k, v in _INT_TO_BONDTYPE.items()}
-_INT_TO_BONDSTEREO = {
-    0: Chem.rdchem.BondStereo.STEREONONE,
-    1: Chem.rdchem.BondStereo.STEREOANY,
-    2: Chem.rdchem.BondStereo.STEREOZ,
-    3: Chem.rdchem.BondStereo.STEREOE,
-    4: Chem.rdchem.BondStereo.STEREOCIS,
-    5: Chem.rdchem.BondStereo.STEREOTRANS}
-_BONDSTEREO_TO_INT = {v: k for k, v in _INT_TO_BONDSTEREO.items()}
-
-
-def _setprops(obj, d: dict) -> None:
-    # add props onto rdkit "obj" (atom/bond/mol/conformer)
-    # props are guaranteed one of Bool, Int, Float or String type
-    for k, v in d.items():
-        if isinstance(v, bool):
-            obj.SetBoolProp(k, v)
-        elif isinstance(v, int):
-            obj.SetIntProp(k, v)
-        elif isinstance(v, float):
-            obj.SetDoubleProp(k, v)
-        else:  # isinstance(v, str):
-            obj.SetProp(k, v)
+from .explicitmoleculecomponent import (
+    ExplicitMoleculeComponent,
+    _mol_to_dict, _mol_from_dict,
+)
 
 
 class SmallMoleculeComponent(ExplicitMoleculeComponent):
@@ -106,6 +41,14 @@ class SmallMoleculeComponent(ExplicitMoleculeComponent):
     name : str, optional
         A human readable tag for this molecule.  This name will be used in the hash.
     """
+    @classmethod
+    def _from_dict(cls, d: dict):
+        m = _mol_from_dict(d)
+
+        return cls(rdkit=m)
+
+    def _to_dict(self) -> dict:
+        return _mol_to_dict(self._rdkit)
 
     def to_sdf(self) -> str:
         """Create a string based on SDF.
@@ -192,75 +135,3 @@ def to_openff(self):
     def from_openff(cls, openff, name: str = ""):
         """Construct from an OpenFF toolkit Molecule"""
         return cls(openff.to_rdkit(), name=name)
-
-    def _to_dict(self) -> dict:
-        """Serialize to dict representation"""
-        # in a perfect world we'd use ToBinary()
-        # but this format slowly evolves, so the future hash of a SMC could change if rdkit were updated
-        # this is based on that method, with some irrelevant fields cut out
-
-        output: dict[str, Any] = {}
-
-        atoms = []
-        for atom in self._rdkit.GetAtoms():
-            atoms.append((
-                atom.GetAtomicNum(), atom.GetIsotope(), atom.GetFormalCharge(), atom.GetIsAromatic(),
-                _ATOMCHIRAL_TO_INT[atom.GetChiralTag()], atom.GetAtomMapNum(),
-                atom.GetPropsAsDict(includePrivate=False),
-            ))
-        output['atoms'] = atoms
-
-        bonds = []
-        for bond in self._rdkit.GetBonds():
-            bonds.append((
-                bond.GetBeginAtomIdx(), bond.GetEndAtomIdx(), _BONDTYPE_TO_INT[bond.GetBondType()],
-                _BONDSTEREO_TO_INT[bond.GetStereo()],
-                bond.GetPropsAsDict(includePrivate=False)
-            ))
-        output['bonds'] = bonds
-
-        conf = self._rdkit.GetConformer()
-        output['conformer'] = (serialize_numpy(conf.GetPositions()), conf.GetPropsAsDict(includePrivate=False))
-
-        output['molprops'] = self._rdkit.GetPropsAsDict(includePrivate=False)
-
-        return output
-
-    @classmethod
-    def _from_dict(cls, d: dict):
-        """Deserialize from dict representation"""
-        m = Chem.Mol()
-        em = Chem.EditableMol(m)
-
-        for atom in d['atoms']:
-            a = Chem.Atom(atom[0])
-            a.SetIsotope(atom[1])
-            a.SetFormalCharge(atom[2])
-            a.SetIsAromatic(atom[3])
-            a.SetChiralTag(_INT_TO_ATOMCHIRAL[atom[4]])
-            a.SetAtomMapNum(atom[5])
-            _setprops(a, atom[6])
-            em.AddAtom(a)
-
-        for bond in d['bonds']:
-            em.AddBond(bond[0], bond[1], _INT_TO_BONDTYPE[bond[2]])
-            # other fields are applied onto the ROMol
-
-        m = em.GetMol()
-
-        for bond, b in zip(d['bonds'], m.GetBonds()):
-            b.SetStereo(_INT_TO_BONDSTEREO[bond[3]])
-            _setprops(b, bond[4])
-
-        pos = deserialize_numpy(d['conformer'][0])
-        c = Chem.Conformer(m.GetNumAtoms())
-        for i, p in enumerate(pos):
-            c.SetAtomPosition(i, p)
-        _setprops(c, d['conformer'][1])
-        m.AddConformer(c)
-
-        _setprops(m, d['molprops'])
-
-        m.UpdatePropertyCache()
-
-        return cls(rdkit=m)

From ea1271cb6fec7b8b9035cd502cbecd38327b8565 Mon Sep 17 00:00:00 2001
From: richard gowers <richardjgowers@gmail.com>
Date: Wed, 9 Aug 2023 07:57:18 +0100
Subject: [PATCH 3/6] use ExplicitMolecule serialization in Protein

---
 gufe/components/proteincomponent.py | 131 +++++++---------------------
 1 file changed, 32 insertions(+), 99 deletions(-)

diff --git a/gufe/components/proteincomponent.py b/gufe/components/proteincomponent.py
index 87366b21..a28610b3 100644
--- a/gufe/components/proteincomponent.py
+++ b/gufe/components/proteincomponent.py
@@ -11,7 +11,10 @@
 from rdkit.Chem.rdchem import Mol, Atom, Conformer, EditableMol, BondType
 
 from ..custom_typing import RDKitMol
-from .explicitmoleculecomponent import ExplicitMoleculeComponent
+from .explicitmoleculecomponent import (
+    ExplicitMoleculeComponent,
+    _mol_from_dict, _mol_to_dict,
+)
 from ..vendor.pdb_file.pdbfile import PDBFile
 from ..vendor.pdb_file.pdbxfile import PDBxFile
 
@@ -19,20 +22,6 @@
 from ..molhashing import deserialize_numpy, serialize_numpy
 
 
-_BONDORDERS_OPENMM_TO_RDKIT = {
-    1: BondType.SINGLE,
-    2: BondType.DOUBLE,
-    3: BondType.TRIPLE,
-    app.Single: BondType.SINGLE,
-    app.Double: BondType.DOUBLE,
-    app.Triple: BondType.TRIPLE,
-    app.Aromatic: BondType.AROMATIC,
-    None: BondType.UNSPECIFIED,
-}
-_BONDORDERS_RDKIT_TO_OPENMM = {
-    v: k for k, v in _BONDORDERS_OPENMM_TO_RDKIT.items()
-}
-
 # builtin dict of strings to enum members, boy I hope this is stable
 _BONDORDER_STR_TO_RDKIT = Chem.BondType.names
 _BONDORDER_RDKIT_TO_STR = {v: k for k, v in _BONDORDER_STR_TO_RDKIT.items()}
@@ -123,65 +112,27 @@ def from_pdbx_file(cls, pdbx_file: str, name=""):
         )
 
     @classmethod
-    def _from_dict(cls, ser_dict: dict, name: str = ""):
+    def _from_dict(cls, ser_dict: dict):
         """Deserialize from dict representation"""
+        monomer_infos = ser_dict.pop('monomer_info')
 
-        # Mol
-        rd_mol = Mol()
-        editable_rdmol = EditableMol(rd_mol)
+        m = _mol_from_dict(ser_dict)
 
-        # Add Atoms
-        for atom in ser_dict["atoms"]:
-            atomic_num = int(atom[0])
-
-            a = Atom(atomic_num)
+        for atom, info in zip(m.GetAtoms(), monomer_infos):
             mi = Chem.AtomPDBResidueInfo()
 
-            mi.SetChainId(atom[1])
-            mi.SetSerialNumber(atom[2])
-            mi.SetSegmentNumber(atom[3])
-            mi.SetInsertionCode(atom[4])
-            mi.SetName(atom[5])
-            mi.SetResidueName(atom[6])
-            mi.SetResidueNumber(int(atom[7]))
-            mi.SetIsHeteroAtom(atom[8] == 'Y')
-            a.SetFormalCharge(atom[9])
-
-            a.SetMonomerInfo(mi)
-
-            editable_rdmol.AddAtom(a)
-
-        # Add Bonds
-        for bond in ser_dict["bonds"]:
-            atomBeginIdx = int(bond[0])
-            atomEndIdx = int(bond[1])
-            bondType = _BONDORDER_STR_TO_RDKIT[bond[2]]
-            editable_rdmol.AddBond(
-                beginAtomIdx=atomBeginIdx,
-                endAtomIdx=atomEndIdx,
-                order=bondType,
-            )
-
-        # Set Positions
-        rd_mol = editable_rdmol.GetMol()
-        positions = ser_dict["conformers"]
-        for i, frame_pos in enumerate(positions):
-            frame_pos = deserialize_numpy(frame_pos)
-            conf = Conformer(i)
-            for atom_id, atom_pos in enumerate(frame_pos):
-                conf.SetAtomPosition(atom_id, atom_pos)  # unit: nm
-            rd_mol.AddConformer(conf)
+            mi.SetChainId(mi[0])
+            mi.SetSerialNumber(mi[1])
+            mi.SetSegmentNumber(mi[2])
+            mi.SetInsertionCode(mi[3])
+            mi.SetName(mi[4])
+            mi.SetResidueName(mi[5])
+            mi.SetResidueNumber(mi[6])
+            mi.SetIsHeteroAtom(mi[7] == 'Y')
 
-        # Adding missing bond info
-        for bond_id, bond in enumerate(rd_mol.GetBonds()):
-            # Can't set these on an editable mol, go round a second time
-            _, _, _, arom = ser_dict["bonds"][bond_id]
-            bond.SetIsAromatic(arom == 'Y')
+            atom.SetMonomerInfo(mi)
 
-        if "name" in ser_dict:
-            name = ser_dict["name"]
-
-        return cls(rdkit=rd_mol, name=name)
+        return cls(rdkit=m)
 
     def to_openmm_topology(self) -> "app.Topology":
         """Convert to an openmm Topology object
@@ -193,6 +144,14 @@ def to_openmm_topology(self) -> "app.Topology":
         """
         from openmm import app
 
+        BONDORDERS_RDKIT_TO_OPENMM = {
+            BondType.SINGLE: app.Single,
+            BondType.DOUBLE: app.Double,
+            BondType.TRIPLE: app.Triple,
+            BondType.AROMATIC: app.Aromatic,
+            BondType.UNSPECIFIED: None,
+        }
+
         def reskey(m):
             """key for defining when a residue has changed from previous
 
@@ -255,7 +214,7 @@ def chainkey(m):
             a1 = atom_lookup[bond.GetBeginAtomIdx()]
             a2 = atom_lookup[bond.GetEndAtomIdx()]
             top.addBond(a1, a2,
-                        order=_BONDORDERS_RDKIT_TO_OPENMM.get(
+                        order=BONDORDERS_RDKIT_TO_OPENMM.get(
                             bond.GetBondType(), None))
 
         return top
@@ -372,15 +331,13 @@ def to_pdbx_file(
 
     def _to_dict(self) -> dict:
         """Serialize to dict representation"""
+        d = _mol_to_dict(self._rdkit)
 
-        atoms = []
+        monomer_infos = []
         for atom in self._rdkit.GetAtoms():
             mi = atom.GetMonomerInfo()
 
-            # TODO: Stereo?
-            atoms.append(
-                (
-                    atom.GetAtomicNum(),
+            monomer_infos.append((
                     mi.GetChainId(),
                     mi.GetSerialNumber(),
                     mi.GetSegmentNumber(),
@@ -389,32 +346,8 @@ def _to_dict(self) -> dict:
                     mi.GetResidueName(),
                     mi.GetResidueNumber(),
                     'Y' if mi.GetIsHeteroAtom() else 'N',
-                    atom.GetFormalCharge(),
-                )
-            )
+            ))
 
-        bonds = [
-            (
-                bond.GetBeginAtomIdx(),
-                bond.GetEndAtomIdx(),
-                _BONDORDER_RDKIT_TO_STR[bond.GetBondType()],
-                'Y' if bond.GetIsAromatic() else 'N',
-                # bond.GetStereo() or "",  do we need this? i.e. are openff ffs going to use cis/trans SMARTS?
-            )
-            for bond in self._rdkit.GetBonds()
-        ]
-
-        conformers = [
-            serialize_numpy(conf.GetPositions())  # .m_as(unit.angstrom)
-            for conf in self._rdkit.GetConformers()
-        ]
-
-        # Result
-        d = {
-            "atoms": atoms,
-            "bonds": bonds,
-            "name": self.name,
-            "conformers": conformers,
-        }
+        d['monomer_info'] = monomer_infos
 
         return d

From 4e91467503a929195db5723e8dfacc344ac5072c Mon Sep 17 00:00:00 2001
From: richard gowers <richardjgowers@gmail.com>
Date: Thu, 10 Aug 2023 12:08:44 +0100
Subject: [PATCH 4/6] fixup Protein deserialization

---
 gufe/components/proteincomponent.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/gufe/components/proteincomponent.py b/gufe/components/proteincomponent.py
index a28610b3..85d12992 100644
--- a/gufe/components/proteincomponent.py
+++ b/gufe/components/proteincomponent.py
@@ -121,14 +121,14 @@ def _from_dict(cls, ser_dict: dict):
         for atom, info in zip(m.GetAtoms(), monomer_infos):
             mi = Chem.AtomPDBResidueInfo()
 
-            mi.SetChainId(mi[0])
-            mi.SetSerialNumber(mi[1])
-            mi.SetSegmentNumber(mi[2])
-            mi.SetInsertionCode(mi[3])
-            mi.SetName(mi[4])
-            mi.SetResidueName(mi[5])
-            mi.SetResidueNumber(mi[6])
-            mi.SetIsHeteroAtom(mi[7] == 'Y')
+            mi.SetChainId(info[0])
+            mi.SetSerialNumber(info[1])
+            mi.SetSegmentNumber(info[2])
+            mi.SetInsertionCode(info[3])
+            mi.SetName(info[4])
+            mi.SetResidueName(info[5])
+            mi.SetResidueNumber(info[6])
+            mi.SetIsHeteroAtom(info[7] == 'Y')
 
             atom.SetMonomerInfo(mi)
 

From 8658083f95580ede16c398c1f7bce4235db32872 Mon Sep 17 00:00:00 2001
From: richard gowers <richardjgowers@gmail.com>
Date: Thu, 10 Aug 2023 12:08:55 +0100
Subject: [PATCH 5/6] fixup to_openmm_positions

---
 gufe/components/proteincomponent.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/gufe/components/proteincomponent.py b/gufe/components/proteincomponent.py
index 85d12992..143ac1a0 100644
--- a/gufe/components/proteincomponent.py
+++ b/gufe/components/proteincomponent.py
@@ -231,12 +231,9 @@ def to_openmm_positions(self) -> "openmm.app.unit.Quantity":
         """
         from openmm import unit as omm_unit
 
-        np_pos = deserialize_numpy(self.to_dict()["conformers"][0])
-        openmm_pos = (
-            list(map(lambda x: np.array(x), np_pos)) * omm_unit.angstrom
-        )
+        np_pos = self._rdkit.GetConformer().GetPositions()
 
-        return openmm_pos
+        return np_pos * omm_unit.angstrom
 
     def to_pdb_file(self, out_path: Union[str, bytes, PathLike[str], PathLike[bytes], io.TextIOBase]) -> str:
         """Write protein to pdb file.

From 215dace8826ec0b224b5e31ae7c30b5aac5701ff Mon Sep 17 00:00:00 2001
From: richard gowers <richardjgowers@gmail.com>
Date: Thu, 10 Aug 2023 13:40:28 +0100
Subject: [PATCH 6/6] use pooch for PLB files

previous solution used io.StringIO which rdkit doesn't like (and wasn't documented as accepted)
---
 gufe/tests/conftest.py              | 82 ++++++++++++-----------------
 gufe/tests/test_proteincomponent.py | 50 ++++++------------
 2 files changed, 50 insertions(+), 82 deletions(-)

diff --git a/gufe/tests/conftest.py b/gufe/tests/conftest.py
index 034fac04..8ca4d444 100644
--- a/gufe/tests/conftest.py
+++ b/gufe/tests/conftest.py
@@ -4,8 +4,8 @@
 import importlib.resources
 import urllib.request
 from urllib.error import URLError
-import io
 import functools
+import pooch
 import pytest
 from rdkit import Chem
 from rdkit.Chem import AllChem
@@ -20,58 +20,42 @@
 else:
     HAS_INTERNET = True
 
+PLB_files = pooch.create(
+    path=pooch.os_cache('pdbinf'),
+    base_url='https://github.com/openforcefield/protein-ligand-benchmark/raw/d3387602bbeb0167abf00dfb81753d8936775dd2/data/',
+    version=None,
+    registry={
+        'p38/01_protein/crd/protein.pdb': '3f0bf718644e7c29f5200cd3def4240ac25ef5fb1948b2e64deb5015d8a45aa4',
+        'mcl1/01_protein/crd/protein.pdb': 'f80ff9dd93a5d9dd6e90091e9631a8ce7fe0dc931e16543e22c1f92009660306',
+        'cdk2/01_protein/crd/protein.pdb': '15d1e509d7951ca45ea266d51a627d5f452dcf0bb5bd48751ae57eb29e28ab69',
+        'shp2/01_protein/crd/protein.pdb': 'd6759cbd135aaddaa658446064df4095d978d3681c014a0528b542d60b2c8770',
+        'pde2/01_protein/crd/protein.pdb': '3b7967c1717789215452cdf919520625602d5438a9d2a18620726b8b1b3a8ef0',
+        'cmet/01_protein/crd/protein.pdb': '155ec32941a9082dbdbbfde460ff97c88d4fe7e100e9a9577edb5a9e7b6467ae',
+        'ptp1b/01_protein/crd/protein.pdb': 'bfa0f9204e96aa463b80946b788c4153cd24701291007eb77638a16fd156634e',
+        'thrombin/01_protein/crd/protein.pdb': 'eb4ea18bef9c4c71dcdc922616d6719ee918112be87a0bd6b274c856eff1dd59',
+        'cdk8/01_protein/crd/protein.pdb': 'b058774526a19775d8f438b14e9d6da331b6de74e0ef9e96db575f6c0bb067b2',
+        'pfkfb3/01_protein/crd/protein.pdb': '4367710db0dbf284cc715ae9a8dd82d06bd77dcc3fb0885678e16632a2732dcc',
+        'tyk2/01_protein/crd/protein.pdb': '9090684f4bdae90afbe5f2698a14c778396c024c19ceb6333de4808d9e29fae6',
+        'syk/01_protein/crd/protein.pdb': 'f6199d0c1818eb5bb24e164426789cf39cae7aa32c8ca2e98f5f44d299a6f82f',
+        'tnks2/01_protein/crd/protein.pdb': 'fc7681a05dbf07590aa8de133f981b6d8ae9cebcc23d54addc2c4fe80be80299',
+        'eg5/01_protein/crd/protein.pdb': 'f2964a785c922502dc86fb4e2e5295d32d41d5b68b8c3246e989de5234c3fd0f',
+        'hif2a/01_protein/crd/protein.pdb': '5bbf520e7c102a65cc7ba0253fd66f43562f77284c82b3b9613e997b7ac76c93',
+
+    },
+)
 
-class URLFileLike:
-    def __init__(self, url, encoding='utf-8'):
-        self.url = url
-        self.encoding = encoding
-        self.data = None
 
-    def __call__(self):
+@pytest.fixture(params=['p38', 'mcl1', 'cdk2', 'shp2', 'pde2', 'cmet', 'ptp1b',
+                        'thrombin', 'cdk8', 'pfkfb3', 'tyk2', 'syk', 'tnks2',
+                        'eg5', 'hif2a', '181l'])
+def PDB_files(request):
+    if request.param == '181l':
+        with importlib.resources.path('gufe.tests.data', '181l.pdb') as file:
+            return str(file)
+    else:
         if not HAS_INTERNET:  # pragma: no-cover
             pytest.skip("Skipping because internet seems faulty")
-
-        if self.data is None:
-            req = urllib.request.urlopen(self.url)
-            self.data = req.read().decode(self.encoding)
-
-        return io.StringIO(self.data)
-
-
-def get_test_filename(filename):
-    with importlib.resources.path('gufe.tests.data', filename) as file:
-        return str(file)
-
-
-_benchmark_pdb_names = [
-        "cmet_protein",
-        "hif2a_protein",
-        "mcl1_protein",
-        "p38_protein",
-        "ptp1b_protein",
-        "syk_protein",
-        "thrombin_protein",
-        "tnsk2_protein",
-        "tyk2_protein",
-        ]
-
-
-_pl_benchmark_url_pattern = (
-    "https://github.com/OpenFreeEnergy/openfe-benchmarks/blob/main/openfe_benchmarks/data/{name}.pdb?raw=true"
-)
-
-
-PDB_BENCHMARK_LOADERS = {
-    name: URLFileLike(url=_pl_benchmark_url_pattern.format(name=name))
-    for name in _benchmark_pdb_names
-}
-
-PDB_FILE_LOADERS = {
-    name: lambda: get_test_filename(name)
-    for name in ["181l.pdb"]
-}
-
-ALL_PDB_LOADERS = dict(**PDB_BENCHMARK_LOADERS, **PDB_FILE_LOADERS)
+        return PLB_files.fetch('{}/01_protein/crd/protein.pdb'.format(request.param))
 
 
 @pytest.fixture
diff --git a/gufe/tests/test_proteincomponent.py b/gufe/tests/test_proteincomponent.py
index 934095a7..90161c4a 100644
--- a/gufe/tests/test_proteincomponent.py
+++ b/gufe/tests/test_proteincomponent.py
@@ -14,7 +14,7 @@
 from openmm import unit
 from numpy.testing import assert_almost_equal
 
-from .conftest import ALL_PDB_LOADERS
+from .conftest import PLB_files
 
 
 @pytest.fixture
@@ -94,11 +94,8 @@ class TestProteinComponent(GufeTokenizableTestsMixin):
     def instance(self, PDB_181L_path):
         return self.cls.from_pdb_file(PDB_181L_path, name="Steve")
 
-    # From
-    @pytest.mark.parametrize('in_pdb_path', ALL_PDB_LOADERS.keys())
-    def test_from_pdb_file(self, in_pdb_path):
-        in_pdb_io = ALL_PDB_LOADERS[in_pdb_path]()
-        p = self.cls.from_pdb_file(in_pdb_io, name="Steve")
+    def test_from_pdb_file(self, PDB_files):
+        p = self.cls.from_pdb_file(PDB_files, name="Steve")
 
         assert isinstance(p, ProteinComponent)
         assert p.name == "Steve"
@@ -177,21 +174,16 @@ def test_to_pdb_input_types(self, PDB_181L_OpenMMClean_path, tmp_path,
             output_func=p.to_pdb_file
         )
 
-    @pytest.mark.parametrize('in_pdb_path', ALL_PDB_LOADERS.keys())
-    def test_to_pdb_round_trip(self, in_pdb_path, tmp_path):
-        in_pdb_io = ALL_PDB_LOADERS[in_pdb_path]()
-
-        p = self.cls.from_pdb_file(in_pdb_io, name="Wuff")
-        out_file_name = "tmp_"+in_pdb_path+".pdb"
+    def test_to_pdb_round_trip(self, PDB_files, tmp_path):
+        p = self.cls.from_pdb_file(PDB_files, name="Wuff")
+        out_file_name = "tmp_foo.pdb"
         out_file = tmp_path / out_file_name
 
         p.to_pdb_file(str(out_file))
 
-        ref_in_pdb_io = ALL_PDB_LOADERS[in_pdb_path]()
-
         # generate openMM reference file:
-        openmm_pdb = pdbfile.PDBFile(ref_in_pdb_io)
-        out_ref_file_name = "tmp_"+in_pdb_path+"_openmm_ref.pdb"
+        openmm_pdb = pdbfile.PDBFile(PDB_files)
+        out_ref_file_name = "tmp_foo_openmm_ref.pdb"
         out_ref_file = tmp_path / out_ref_file_name
 
         pdbfile.PDBFile.writeFile(openmm_pdb.topology, openmm_pdb.positions, file=open(str(out_ref_file), "w"))
@@ -213,16 +205,11 @@ def test_dummy_from_dict(self, PDB_181L_OpenMMClean_path):
 
         assert p == p2
 
-    # parametrize
-    @pytest.mark.parametrize('in_pdb_path', ALL_PDB_LOADERS.keys())
-    def test_to_openmm_positions(self, in_pdb_path):
-        in_pdb_io = ALL_PDB_LOADERS[in_pdb_path]()
-        ref_in_pdb_io =  ALL_PDB_LOADERS[in_pdb_path]()
-
-        openmm_pdb = pdbfile.PDBFile(ref_in_pdb_io)
+    def test_to_openmm_positions(self, PDB_files):
+        openmm_pdb = pdbfile.PDBFile(PDB_files)
         openmm_pos = openmm_pdb.positions
 
-        p = self.cls.from_pdb_file(in_pdb_io, name="Bob")
+        p = self.cls.from_pdb_file(PDB_files, name="Bob")
         gufe_openmm_pos = p.to_openmm_positions()
 
         v1 = gufe_openmm_pos.value_in_unit(unit.nanometer)
@@ -230,16 +217,11 @@ def test_to_openmm_positions(self, in_pdb_path):
 
         assert_almost_equal(actual=v1, desired=v2, decimal=6)
 
-    # parametrize
-    @pytest.mark.parametrize('in_pdb_path', ALL_PDB_LOADERS.keys())
-    def test_to_openmm_topology(self, in_pdb_path):
-        in_pdb_io =  ALL_PDB_LOADERS[in_pdb_path]()
-        ref_in_pdb_io =  ALL_PDB_LOADERS[in_pdb_path]()
-
-        openmm_pdb = pdbfile.PDBFile(ref_in_pdb_io)
+    def test_to_openmm_topology(self, PDB_files):
+        openmm_pdb = pdbfile.PDBFile(PDB_files)
         openmm_top = openmm_pdb.topology
 
-        p = self.cls.from_pdb_file(in_pdb_io, name="Bob")
+        p = self.cls.from_pdb_file(PDB_files, name="Bob")
         gufe_openmm_top = p.to_openmm_topology()
         assert_topology_equal(openmm_top, gufe_openmm_top)
 
@@ -290,7 +272,9 @@ def test_protein_total_charge(self, PDB_181L_path):
         assert m1.total_charge == 7
 
     def test_protein_total_charge_thromb(self):
-        m1 = self.cls.from_pdb_file(ALL_PDB_LOADERS["thrombin_protein"]())
+        f = PLB_files.fetch('thrombin/01_protein/crd/protein.pdb')
+
+        m1 = self.cls.from_pdb_file(f)
 
         assert m1.total_charge == 6