From e6e9658c589416b36ef9f0b83c6d5f41a710690a Mon Sep 17 00:00:00 2001
From: Alex Morehead <acmwhb@missouri.edu>
Date: Fri, 18 Aug 2023 13:22:37 -0700
Subject: [PATCH 1/3] Add the ability for the `PDBManager` to perform
 interface-based chain filtering

---
 graphein/ml/datasets/pdb_data.py | 221 ++++++++++++++++++++++++++++++-
 1 file changed, 216 insertions(+), 5 deletions(-)

diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py
index 9082746a0..c1c82612e 100644
--- a/graphein/ml/datasets/pdb_data.py
+++ b/graphein/ml/datasets/pdb_data.py
@@ -1,3 +1,4 @@
+import copy
 import gzip
 import os
 import shutil
@@ -13,6 +14,7 @@
 from biopandas.pdb import PandasPdb
 from loguru import logger as log
 from pandas.core.groupby.generic import DataFrameGroupBy
+from scipy.spatial.distance import cdist
 from tqdm import tqdm
 
 from graphein.protein.utils import (
@@ -23,6 +25,11 @@
 )
 from graphein.utils.dependencies import is_tool
 
+PRIMARY_INTERCHAIN_CONTACT_ATOMS_FOR_FILTERING: List[str] = ["CA", "C4'"]
+SECONDARY_INTERCHAIN_CONTACT_ATOMS_NOT_FOR_FILTERING: List[str] = ["H"]
+PRIMARY_HYDROGEN_BOND_ATOMS_FOR_FILTERING: List[str] = ["N", "O", "N1", "N9", "N3", "C2", "C4", "C5", "C6"]
+SECONDARY_HYDROGEN_BOND_ATOMS_FOR_FILTERING: List[str] = ["N", "O", "N1", "N9", "N3", "C2", "C4", "C5", "C6"]
+
 
 class PDBManager:
     """A utility for creating selections of experimental PDB structures."""
@@ -1818,6 +1825,99 @@ def select_pdb_by_criterion(
                 pdb.df[key] = filtered_pdb
         return pdb
 
+    def filter_chains_by_interface_criteria(
+        self,
+        pdb: PandasPdb,
+        primary_interchain_contact_atoms_for_filtering: List[str] = PRIMARY_INTERCHAIN_CONTACT_ATOMS_FOR_FILTERING,
+        secondary_interchain_contact_atoms_not_for_filtering: List[str] = SECONDARY_INTERCHAIN_CONTACT_ATOMS_NOT_FOR_FILTERING,
+        primary_hydrogen_bond_atoms_for_filtering: List[str] = PRIMARY_HYDROGEN_BOND_ATOMS_FOR_FILTERING,
+        secondary_hydrogen_bond_atoms_for_filtering: List[str] = SECONDARY_HYDROGEN_BOND_ATOMS_FOR_FILTERING,
+        interface_contact_criterion: float = 7.0,
+        hydrogen_bond_criterion: float = 3.5,
+        interface_contact_count: int = 16,
+        hydrogen_bond_count: int = 10,
+        chain_id_col: str = "chain_id",
+        atom_name_col: str = "atom_name",
+        atom_df_name: str = "ATOM",
+    ) -> PandasPdb:
+        """Filter a PDB using interface criteria.
+
+        :param pdb: The PDB object to filter by interface criteria.
+        :type pdb: PandasPdb
+        :param primary_interchain_contact_atoms_for_filtering: The main atoms in each residue with
+            which to measure inter-chain pairwise residue distances.
+        :type primary_interchain_contact_atoms_for_filtering: List[str], optional
+        :param secondary_interchain_contact_atoms_not_for_filtering: Atom names to exclude from the
+            other chains' atoms when measuring inter-chain pairwise residue distances.
+        :type secondary_interchain_contact_atoms_not_for_filtering: List[str], optional
+        :param primary_hydrogen_bond_atoms_for_filtering: The main atoms in each residue with
+            which to measure inter-chain atom distances for hydrogen bonding.
+        :type primary_hydrogen_bond_atoms_for_filtering: List[str], optional
+        :param secondary_hydrogen_bond_atoms_for_filtering: The secondary atoms in each residue with
+            which to measure inter-chain atom distances for hydrogen bonding.
+        :type secondary_hydrogen_bond_atoms_for_filtering: List[str], optional
+        :param interface_contact_criterion: Distance between two inter-chain
+            residues at which to classify a residue pair as interface contacts.
+        :type interface_contact_criterion: float, optional
+        :param hydrogen_bond_criterion: Distance between two inter-chain
+            atoms at which to classify an atom pair as hydrogen-bonded.
+        :type hydrogen_bond_criterion: float, optional
+        :param interface_contact_count: Number of interface contacts required
+            to select a chain to be exported.
+        :type interface_contact_count: int, optional
+        :param hydrogen_bond_count: Number of hydrogen bonds required
+            to select a chain to be exported.
+        :type hydrogen_bond_count: int, optional
+        :param chain_id_col: Name of the chain ID DataFrame column.
+        :type chain_id_col: str, optional
+        :param atom_name_col: Name of the atom name DataFrame column.
+        :type atom_name_col: str, optional
+        :param atom_df_name: Name of the DataFrame by which to access
+            ATOM entries within a PandasPdb object.
+        :type atom_df_name: str, defaults to ``ATOM``
+
+        :return: The filtered PDB object.
+        :rtype: PandasPdb
+        """
+        filtered_pdb = copy.deepcopy(pdb)
+    
+        atom_data = pdb.df[atom_df_name]
+        unique_chain_ids = atom_data[chain_id_col].unique()
+        
+        interface_contact_atom_mask = atom_data[atom_name_col].isin(primary_interchain_contact_atoms_for_filtering)
+        interface_contact_other_atom_mask = ~atom_data[atom_name_col].isin(secondary_interchain_contact_atoms_not_for_filtering)
+        hydrogen_bond_atom_mask = atom_data[atom_name_col].isin(primary_hydrogen_bond_atoms_for_filtering)
+        hydrogen_bond_other_atom_mask = atom_data[atom_name_col].isin(secondary_hydrogen_bond_atoms_for_filtering)
+        
+        for chain1 in unique_chain_ids:
+            interface_contact_chain1_mask = (atom_data[chain_id_col] == chain1) & interface_contact_atom_mask
+            hydrogen_bond_chain1_mask = (atom_data[chain_id_col] == chain1) & hydrogen_bond_atom_mask
+            interface_contact_chain1_residues = atom_data[interface_contact_chain1_mask]
+            hydrogen_bond_chain1_atoms = atom_data[hydrogen_bond_chain1_mask]
+            
+            if np.sum(interface_contact_chain1_mask) == 0 or np.sum(hydrogen_bond_chain1_mask) == 0:
+                continue
+            
+            interface_contact_chain1_coords = interface_contact_chain1_residues[["x_coord", "y_coord", "z_coord"]].to_numpy()
+            interface_contact_non_chain1_coords = atom_data.loc[interface_contact_other_atom_mask & (atom_data[chain_id_col] != chain1), ["x_coord", "y_coord", "z_coord"]].to_numpy()
+            hydrogen_bond_chain1_coords = hydrogen_bond_chain1_atoms[["x_coord", "y_coord", "z_coord"]].to_numpy()
+            hydrogen_bond_non_chain1_coords = atom_data.loc[hydrogen_bond_other_atom_mask & (atom_data[chain_id_col] != chain1), ["x_coord", "y_coord", "z_coord"]].to_numpy()
+
+            interface_contact_distances = cdist(interface_contact_chain1_coords, interface_contact_non_chain1_coords, metric="euclidean")
+            hydrogen_bond_distances = cdist(hydrogen_bond_chain1_coords, hydrogen_bond_non_chain1_coords, metric="euclidean")
+            
+            num_interface_contacts = np.sum(interface_contact_distances <= interface_contact_criterion, axis=1).sum()
+            chain_within_interface = (num_interface_contacts >= interface_contact_count).item()
+
+            num_hydrogen_bonds = np.sum(hydrogen_bond_distances <= hydrogen_bond_criterion, axis=1).sum()
+            chain_with_sufficient_bond_count = (num_hydrogen_bonds >= hydrogen_bond_count).item()
+            
+            if not chain_within_interface or not chain_with_sufficient_bond_count:
+                log.info(f"Filtering out chain {chain1} within PDB {pdb.pdb_path}, as it contains {num_interface_contacts} (of {interface_contact_count} required) interface contacts and {num_hydrogen_bonds} (of {hydrogen_bond_count} required) hydrogen bonds")
+                filtered_pdb.df[atom_df_name] = filtered_pdb.df[atom_df_name][filtered_pdb.df[atom_df_name][chain_id_col] != chain1]
+        
+        return filtered_pdb
+
     def write_out_pdb_chain_groups(
         self,
         df: pd.DataFrame,
@@ -1828,6 +1928,11 @@ def write_out_pdb_chain_groups(
         atom_df_name: str = "ATOM",
         max_num_chains_per_pdb_code: int = -1,
         models: List[int] = [1],
+        filter_for_interface_contacts: bool = False,
+        interface_contact_criterion: float = 7.0,
+        hydrogen_bond_criterion: float = 3.5,
+        interface_contact_count: int = 16,
+        hydrogen_bond_count: int = 10,
     ):
         """Record groups of PDB codes and associated chains
         as collated PDB files.
@@ -1852,6 +1957,28 @@ def write_out_pdb_chain_groups(
         :param models: List of indices of models from which to extract chains,
             defaults to ``[1]``.
         :type models: List[int], optional
+        :param filter_for_interface_contacts: Whether to filter for complex
+            chains that constitute at least one inter-chain interface, as
+            defined by the subsequent parameters ``interface_contact_criterion``,
+            ``hydrogen_bond_criterion``, ``interface_contact_count``,
+            and ``hydrogen_bond_count``.
+        :type filter_for_interface_contacts: bool, optional
+        :param interface_contact_criterion: Distance between two inter-chain
+            residues at which to classify a residue pair as interface contacts.
+            Only referenced if ``filter_for_interface_contacts`` is ``True``.
+        :type interface_contact_criterion: float, optional
+        :param hydrogen_bond_criterion: Distance between two inter-chain
+            atoms at which to classify an atom pair as hydrogen-bonded.
+            Only referenced if ``filter_for_interface_contacts`` is ``True``.
+        :type hydrogen_bond_criterion: float, optional
+        :param interface_contact_count: Number of interface contacts required
+            to select a chain to be exported. Only referenced if
+            ``filter_for_interface_contacts`` is ``True``.
+        :type interface_contact_count: int, optional
+        :param hydrogen_bond_count: Number of hydrogen bonds required
+            to select a chain to be exported. Only referenced if
+            ``filter_for_interface_contacts`` is ``True``.
+        :type hydrogen_bond_count: int, optional
         """
         if len(df) > 0:
             split_dir = Path(out_dir) / split
@@ -1896,14 +2023,29 @@ def write_out_pdb_chain_groups(
                         for chain in entry_chains
                         if chain in pdb_atom_chains
                     ]
-                    chains = (
-                        chains
-                        if max_num_chains_per_pdb_code == -1
-                        else chains[:max_num_chains_per_pdb_code]
-                    )
+                    if not filter_for_interface_contacts:
+                        chains = (
+                            chains
+                            if max_num_chains_per_pdb_code == -1
+                            else chains[:max_num_chains_per_pdb_code]
+                        )
                     pdb_chains = self.select_pdb_by_criterion(
                         pdb, "chain_id", chains, entry_pdb_code
                     )
+                    num_pdb_chains = len(pdb_chains.df[atom_df_name].chain_id.unique().tolist())
+                    if filter_for_interface_contacts and num_pdb_chains > 1:
+                        pdb_chains = self.filter_chains_by_interface_criteria(
+                            pdb=pdb_chains,
+                            interface_contact_criterion=interface_contact_criterion,
+                            hydrogen_bond_criterion=hydrogen_bond_criterion,
+                            interface_contact_count=interface_contact_count,
+                            hydrogen_bond_count=hydrogen_bond_count,
+                        )
+                        pdb_chains = (
+                            pdb_chains
+                            if max_num_chains_per_pdb_code == -1
+                            else pdb_chains[:max_num_chains_per_pdb_code]
+                        )
                     # export selected chains within the same PDB file
                     pdb_chains.to_pdb(str(output_pdb_filepath))
 
@@ -1915,6 +2057,11 @@ def write_df_pdbs(
         splits: Optional[List[str]] = None,
         max_num_chains_per_pdb_code: int = -1,
         models: List[int] = [1],
+        filter_for_interface_contacts: bool = False,
+        interface_contact_criterion: float = 7.0,
+        hydrogen_bond_criterion: float = 3.5,
+        interface_contact_count: int = 16,
+        hydrogen_bond_count: int = 10,
     ):
         """Write the given selection as a collection of PDB files.
 
@@ -1935,6 +2082,28 @@ def write_df_pdbs(
         :param models: List of indices of models from which to extract chains,
             defaults to ``[1]``.
         :type models: List[int], optional
+        :param filter_for_interface_contacts: Whether to filter for complex
+            chains that constitute at least one inter-chain interface, as
+            defined by the subsequent parameters ``interface_contact_criterion``,
+            ``hydrogen_bond_criterion``, ``interface_contact_count``,
+            and ``hydrogen_bond_count``.
+        :type filter_for_interface_contacts: bool, optional
+        :param interface_contact_criterion: Distance between two inter-chain
+            residues at which to classify a residue pair as interface contacts.
+            Only referenced if ``filter_for_interface_contacts`` is ``True``.
+        :type interface_contact_criterion: float, optional
+        :param hydrogen_bond_criterion: Distance between two inter-chain
+            atoms at which to classify an atom pair as hydrogen-bonded.
+            Only referenced if ``filter_for_interface_contacts`` is ``True``.
+        :type hydrogen_bond_criterion: float, optional
+        :param interface_contact_count: Number of interface contacts required
+            to select a chain to be exported. Only referenced if
+            ``filter_for_interface_contacts`` is ``True``.
+        :type interface_contact_count: int, optional
+        :param hydrogen_bond_count: Number of hydrogen bonds required
+            to select a chain to be exported. Only referenced if
+            ``filter_for_interface_contacts`` is ``True``.
+        :type hydrogen_bond_count: int, optional
         """
         out_dir = Path(pdb_dir) / out_dir
         os.makedirs(out_dir, exist_ok=True)
@@ -1950,6 +2119,11 @@ def write_df_pdbs(
                     merge_fn=self.merge_pdb_chain_groups,
                     max_num_chains_per_pdb_code=max_num_chains_per_pdb_code,
                     models=models,
+                    filter_for_interface_contacts=filter_for_interface_contacts,
+                    interface_contact_criterion=interface_contact_criterion,
+                    hydrogen_bond_criterion=hydrogen_bond_criterion,
+                    interface_contact_count=interface_contact_count,
+                    hydrogen_bond_count=hydrogen_bond_count,
                 )
         else:
             self.write_out_pdb_chain_groups(
@@ -1960,6 +2134,11 @@ def write_df_pdbs(
                 merge_fn=self.merge_pdb_chain_groups,
                 max_num_chains_per_pdb_code=max_num_chains_per_pdb_code,
                 models=models,
+                filter_for_interface_contacts=filter_for_interface_contacts,
+                interface_contact_criterion=interface_contact_criterion,
+                hydrogen_bond_criterion=hydrogen_bond_criterion,
+                interface_contact_count=interface_contact_count,
+                hydrogen_bond_count=hydrogen_bond_count,
             )
 
     def export_pdbs(
@@ -1968,6 +2147,11 @@ def export_pdbs(
         splits: Optional[List[str]] = None,
         max_num_chains_per_pdb_code: int = -1,
         models: List[int] = [1],
+        filter_for_interface_contacts: bool = False,
+        interface_contact_criterion: float = 7.0,
+        hydrogen_bond_criterion: float = 3.5,
+        interface_contact_count: int = 16,
+        hydrogen_bond_count: int = 10,
         force: bool = False,
     ):
         """Write the selection as a collection of PDB files.
@@ -1983,6 +2167,28 @@ def export_pdbs(
         :param models: List of indices of models from which to extract chains,
             defaults to ``[1]``.
         :type models: List[int], optional
+        :param filter_for_interface_contacts: Whether to filter for complex
+            chains that constitute at least one inter-chain interface, as
+            defined by the subsequent parameters ``interface_contact_criterion``,
+            ``hydrogen_bond_criterion``, ``interface_contact_count``,
+            and ``hydrogen_bond_count``.
+        :type filter_for_interface_contacts: bool, optional
+        :param interface_contact_criterion: Distance between two inter-chain
+            residues at which to classify a residue pair as interface contacts.
+            Only referenced if ``filter_for_interface_contacts`` is ``True``.
+        :type interface_contact_criterion: float, optional
+        :param hydrogen_bond_criterion: Distance between two inter-chain
+            atoms at which to classify an atom pair as hydrogen-bonded.
+            Only referenced if ``filter_for_interface_contacts`` is ``True``.
+        :type hydrogen_bond_criterion: float, optional
+        :param interface_contact_count: Number of interface contacts required
+            to select a chain to be exported. Only referenced if
+            ``filter_for_interface_contacts`` is ``True``.
+        :type interface_contact_count: int, optional
+        :param hydrogen_bond_count: Number of hydrogen bonds required
+            to select a chain to be exported. Only referenced if
+            ``filter_for_interface_contacts`` is ``True``.
+        :type hydrogen_bond_count: int, optional
         :param force: Whether to raise an error if the download selection
             contains PDBs which are not available in PDB format.
         """
@@ -1999,5 +2205,10 @@ def export_pdbs(
             splits=splits,
             max_num_chains_per_pdb_code=max_num_chains_per_pdb_code,
             models=models,
+            filter_for_interface_contacts=filter_for_interface_contacts,
+            interface_contact_criterion=interface_contact_criterion,
+            hydrogen_bond_criterion=hydrogen_bond_criterion,
+            interface_contact_count=interface_contact_count,
+            hydrogen_bond_count=hydrogen_bond_count,
         )
         log.info("Done writing selection of PDB chains")

From 8c821f0cc1aff959fe4aad7f224b7c569c75017c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 18 Aug 2023 20:26:23 +0000
Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 graphein/ml/datasets/pdb_data.py | 161 +++++++++++++++++++++++--------
 1 file changed, 123 insertions(+), 38 deletions(-)

diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py
index c1c82612e..24065aec2 100644
--- a/graphein/ml/datasets/pdb_data.py
+++ b/graphein/ml/datasets/pdb_data.py
@@ -27,8 +27,28 @@
 
 PRIMARY_INTERCHAIN_CONTACT_ATOMS_FOR_FILTERING: List[str] = ["CA", "C4'"]
 SECONDARY_INTERCHAIN_CONTACT_ATOMS_NOT_FOR_FILTERING: List[str] = ["H"]
-PRIMARY_HYDROGEN_BOND_ATOMS_FOR_FILTERING: List[str] = ["N", "O", "N1", "N9", "N3", "C2", "C4", "C5", "C6"]
-SECONDARY_HYDROGEN_BOND_ATOMS_FOR_FILTERING: List[str] = ["N", "O", "N1", "N9", "N3", "C2", "C4", "C5", "C6"]
+PRIMARY_HYDROGEN_BOND_ATOMS_FOR_FILTERING: List[str] = [
+    "N",
+    "O",
+    "N1",
+    "N9",
+    "N3",
+    "C2",
+    "C4",
+    "C5",
+    "C6",
+]
+SECONDARY_HYDROGEN_BOND_ATOMS_FOR_FILTERING: List[str] = [
+    "N",
+    "O",
+    "N1",
+    "N9",
+    "N3",
+    "C2",
+    "C4",
+    "C5",
+    "C6",
+]
 
 
 class PDBManager:
@@ -1828,10 +1848,18 @@ def select_pdb_by_criterion(
     def filter_chains_by_interface_criteria(
         self,
         pdb: PandasPdb,
-        primary_interchain_contact_atoms_for_filtering: List[str] = PRIMARY_INTERCHAIN_CONTACT_ATOMS_FOR_FILTERING,
-        secondary_interchain_contact_atoms_not_for_filtering: List[str] = SECONDARY_INTERCHAIN_CONTACT_ATOMS_NOT_FOR_FILTERING,
-        primary_hydrogen_bond_atoms_for_filtering: List[str] = PRIMARY_HYDROGEN_BOND_ATOMS_FOR_FILTERING,
-        secondary_hydrogen_bond_atoms_for_filtering: List[str] = SECONDARY_HYDROGEN_BOND_ATOMS_FOR_FILTERING,
+        primary_interchain_contact_atoms_for_filtering: List[
+            str
+        ] = PRIMARY_INTERCHAIN_CONTACT_ATOMS_FOR_FILTERING,
+        secondary_interchain_contact_atoms_not_for_filtering: List[
+            str
+        ] = SECONDARY_INTERCHAIN_CONTACT_ATOMS_NOT_FOR_FILTERING,
+        primary_hydrogen_bond_atoms_for_filtering: List[
+            str
+        ] = PRIMARY_HYDROGEN_BOND_ATOMS_FOR_FILTERING,
+        secondary_hydrogen_bond_atoms_for_filtering: List[
+            str
+        ] = SECONDARY_HYDROGEN_BOND_ATOMS_FOR_FILTERING,
         interface_contact_criterion: float = 7.0,
         hydrogen_bond_criterion: float = 3.5,
         interface_contact_count: int = 16,
@@ -1880,42 +1908,97 @@ def filter_chains_by_interface_criteria(
         :rtype: PandasPdb
         """
         filtered_pdb = copy.deepcopy(pdb)
-    
+
         atom_data = pdb.df[atom_df_name]
         unique_chain_ids = atom_data[chain_id_col].unique()
-        
-        interface_contact_atom_mask = atom_data[atom_name_col].isin(primary_interchain_contact_atoms_for_filtering)
-        interface_contact_other_atom_mask = ~atom_data[atom_name_col].isin(secondary_interchain_contact_atoms_not_for_filtering)
-        hydrogen_bond_atom_mask = atom_data[atom_name_col].isin(primary_hydrogen_bond_atoms_for_filtering)
-        hydrogen_bond_other_atom_mask = atom_data[atom_name_col].isin(secondary_hydrogen_bond_atoms_for_filtering)
-        
+
+        interface_contact_atom_mask = atom_data[atom_name_col].isin(
+            primary_interchain_contact_atoms_for_filtering
+        )
+        interface_contact_other_atom_mask = ~atom_data[atom_name_col].isin(
+            secondary_interchain_contact_atoms_not_for_filtering
+        )
+        hydrogen_bond_atom_mask = atom_data[atom_name_col].isin(
+            primary_hydrogen_bond_atoms_for_filtering
+        )
+        hydrogen_bond_other_atom_mask = atom_data[atom_name_col].isin(
+            secondary_hydrogen_bond_atoms_for_filtering
+        )
+
         for chain1 in unique_chain_ids:
-            interface_contact_chain1_mask = (atom_data[chain_id_col] == chain1) & interface_contact_atom_mask
-            hydrogen_bond_chain1_mask = (atom_data[chain_id_col] == chain1) & hydrogen_bond_atom_mask
-            interface_contact_chain1_residues = atom_data[interface_contact_chain1_mask]
+            interface_contact_chain1_mask = (
+                atom_data[chain_id_col] == chain1
+            ) & interface_contact_atom_mask
+            hydrogen_bond_chain1_mask = (
+                atom_data[chain_id_col] == chain1
+            ) & hydrogen_bond_atom_mask
+            interface_contact_chain1_residues = atom_data[
+                interface_contact_chain1_mask
+            ]
             hydrogen_bond_chain1_atoms = atom_data[hydrogen_bond_chain1_mask]
-            
-            if np.sum(interface_contact_chain1_mask) == 0 or np.sum(hydrogen_bond_chain1_mask) == 0:
+
+            if (
+                np.sum(interface_contact_chain1_mask) == 0
+                or np.sum(hydrogen_bond_chain1_mask) == 0
+            ):
                 continue
-            
-            interface_contact_chain1_coords = interface_contact_chain1_residues[["x_coord", "y_coord", "z_coord"]].to_numpy()
-            interface_contact_non_chain1_coords = atom_data.loc[interface_contact_other_atom_mask & (atom_data[chain_id_col] != chain1), ["x_coord", "y_coord", "z_coord"]].to_numpy()
-            hydrogen_bond_chain1_coords = hydrogen_bond_chain1_atoms[["x_coord", "y_coord", "z_coord"]].to_numpy()
-            hydrogen_bond_non_chain1_coords = atom_data.loc[hydrogen_bond_other_atom_mask & (atom_data[chain_id_col] != chain1), ["x_coord", "y_coord", "z_coord"]].to_numpy()
-
-            interface_contact_distances = cdist(interface_contact_chain1_coords, interface_contact_non_chain1_coords, metric="euclidean")
-            hydrogen_bond_distances = cdist(hydrogen_bond_chain1_coords, hydrogen_bond_non_chain1_coords, metric="euclidean")
-            
-            num_interface_contacts = np.sum(interface_contact_distances <= interface_contact_criterion, axis=1).sum()
-            chain_within_interface = (num_interface_contacts >= interface_contact_count).item()
-
-            num_hydrogen_bonds = np.sum(hydrogen_bond_distances <= hydrogen_bond_criterion, axis=1).sum()
-            chain_with_sufficient_bond_count = (num_hydrogen_bonds >= hydrogen_bond_count).item()
-            
-            if not chain_within_interface or not chain_with_sufficient_bond_count:
-                log.info(f"Filtering out chain {chain1} within PDB {pdb.pdb_path}, as it contains {num_interface_contacts} (of {interface_contact_count} required) interface contacts and {num_hydrogen_bonds} (of {hydrogen_bond_count} required) hydrogen bonds")
-                filtered_pdb.df[atom_df_name] = filtered_pdb.df[atom_df_name][filtered_pdb.df[atom_df_name][chain_id_col] != chain1]
-        
+
+            interface_contact_chain1_coords = (
+                interface_contact_chain1_residues[
+                    ["x_coord", "y_coord", "z_coord"]
+                ].to_numpy()
+            )
+            interface_contact_non_chain1_coords = atom_data.loc[
+                interface_contact_other_atom_mask
+                & (atom_data[chain_id_col] != chain1),
+                ["x_coord", "y_coord", "z_coord"],
+            ].to_numpy()
+            hydrogen_bond_chain1_coords = hydrogen_bond_chain1_atoms[
+                ["x_coord", "y_coord", "z_coord"]
+            ].to_numpy()
+            hydrogen_bond_non_chain1_coords = atom_data.loc[
+                hydrogen_bond_other_atom_mask
+                & (atom_data[chain_id_col] != chain1),
+                ["x_coord", "y_coord", "z_coord"],
+            ].to_numpy()
+
+            interface_contact_distances = cdist(
+                interface_contact_chain1_coords,
+                interface_contact_non_chain1_coords,
+                metric="euclidean",
+            )
+            hydrogen_bond_distances = cdist(
+                hydrogen_bond_chain1_coords,
+                hydrogen_bond_non_chain1_coords,
+                metric="euclidean",
+            )
+
+            num_interface_contacts = np.sum(
+                interface_contact_distances <= interface_contact_criterion,
+                axis=1,
+            ).sum()
+            chain_within_interface = (
+                num_interface_contacts >= interface_contact_count
+            ).item()
+
+            num_hydrogen_bonds = np.sum(
+                hydrogen_bond_distances <= hydrogen_bond_criterion, axis=1
+            ).sum()
+            chain_with_sufficient_bond_count = (
+                num_hydrogen_bonds >= hydrogen_bond_count
+            ).item()
+
+            if (
+                not chain_within_interface
+                or not chain_with_sufficient_bond_count
+            ):
+                log.info(
+                    f"Filtering out chain {chain1} within PDB {pdb.pdb_path}, as it contains {num_interface_contacts} (of {interface_contact_count} required) interface contacts and {num_hydrogen_bonds} (of {hydrogen_bond_count} required) hydrogen bonds"
+                )
+                filtered_pdb.df[atom_df_name] = filtered_pdb.df[atom_df_name][
+                    filtered_pdb.df[atom_df_name][chain_id_col] != chain1
+                ]
+
         return filtered_pdb
 
     def write_out_pdb_chain_groups(
@@ -2032,7 +2115,9 @@ def write_out_pdb_chain_groups(
                     pdb_chains = self.select_pdb_by_criterion(
                         pdb, "chain_id", chains, entry_pdb_code
                     )
-                    num_pdb_chains = len(pdb_chains.df[atom_df_name].chain_id.unique().tolist())
+                    num_pdb_chains = len(
+                        pdb_chains.df[atom_df_name].chain_id.unique().tolist()
+                    )
                     if filter_for_interface_contacts and num_pdb_chains > 1:
                         pdb_chains = self.filter_chains_by_interface_criteria(
                             pdb=pdb_chains,

From 4b39a4b25c7f3cae2975c859af52d0dc5dda7701 Mon Sep 17 00:00:00 2001
From: Alex Morehead <acmwhb@missouri.edu>
Date: Sat, 19 Aug 2023 16:04:48 -0600
Subject: [PATCH 3/3] Remove C atoms from hydrogen bond calculation

---
 graphein/ml/datasets/pdb_data.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py
index 24065aec2..a79ef582d 100644
--- a/graphein/ml/datasets/pdb_data.py
+++ b/graphein/ml/datasets/pdb_data.py
@@ -33,10 +33,6 @@
     "N1",
     "N9",
     "N3",
-    "C2",
-    "C4",
-    "C5",
-    "C6",
 ]
 SECONDARY_HYDROGEN_BOND_ATOMS_FOR_FILTERING: List[str] = [
     "N",
@@ -44,10 +40,6 @@
     "N1",
     "N9",
     "N3",
-    "C2",
-    "C4",
-    "C5",
-    "C6",
 ]