Skip to content

Commit

Permalink
[feat] added get_model and get_models fct to mmcif (#145)
Browse files Browse the repository at this point in the history
* [feat] added get_model and get_models fct to mmcif

* [docs] add CHANGELOG.md

* [feat] added tests for multiple models

* [feat] added tests for multiple models

* [feat] rename df to biopandas_structure

* bump changelog + changelog workflow

* Delete docs/sources/CHANGELOG.md

---------

Co-authored-by: Arian Jamasb <[email protected]>
  • Loading branch information
kierandidi and a-r-j authored Jul 8, 2024
1 parent 67aa2f2 commit 3e26557
Show file tree
Hide file tree
Showing 7 changed files with 143 additions and 50 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/changelog-enforcer.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ jobs:
- uses: actions/checkout@v3
- uses: dangoslen/changelog-enforcer@v3
with:
skipLabels: 'skip-changelog'
skipLabels: 'skip-changelog'
changeLogPath: 'docs/CHANGELOG.md'
65 changes: 63 additions & 2 deletions biopandas/mmcif/pandas_mmcif.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
# License: BSD 3 clause
# Project Website: http://rasbt.github.io/biopandas/
# Code Repository: https://github.com/rasbt/biopandas

from __future__ import annotations
import gzip
import sys
import copy
import warnings
from typing import Dict, List, Optional
from urllib.error import HTTPError, URLError
Expand Down Expand Up @@ -69,6 +70,66 @@ def read_mmcif(self, path):
# self.header, self.code = self._parse_header_code() #TODO: implement
self.code = self.data["entry"]["id"][0].lower()
return self

def label_models(self):
    """Copy the model number into a "model_id" column.

    For each of the "ATOM" and "HETATM" DataFrames present in ``self.df``,
    a "model_id" column is added (or overwritten) with the values of that
    frame's "pdbx_PDB_model_num" column. The frames are modified in place.

    Returns
    ---------
    self : the same object, so calls can be chained.
    """
    for record_name in ("ATOM", "HETATM"):
        if record_name in self.df.keys():
            frame = self.df[record_name]
            frame["model_id"] = frame["pdbx_PDB_model_num"]
    return self

def get_model(self, model_index: int) -> PandasMmcif:
    """Return a copy of this structure restricted to a single model.

    The object is deep-copied first, so the DataFrames held by ``self``
    are left untouched.

    Parameters
    ----------
    model_index : int
        Model number to keep; it is compared against the
        "pdbx_PDB_model_num" column.

    Returns
    ---------
    PandasMmcif : A new object whose "ATOM" and "HETATM" DataFrames
        (when present) contain only the rows of the requested model.
    """
    subset = copy.deepcopy(self)
    for record_name in ("ATOM", "HETATM"):
        if record_name not in subset.df.keys():
            continue
        frame = subset.df[record_name]
        in_model = frame["pdbx_PDB_model_num"] == model_index
        subset.df[record_name] = frame.loc[in_model]
    return subset

def get_models(self, model_indices: List[int]) -> PandasMmcif:
    """Return a copy of this structure restricted to several models.

    The object is deep-copied first, so the DataFrames held by ``self``
    are left untouched.

    Parameters
    ----------
    model_indices : List[int]
        Model numbers to keep; rows whose "pdbx_PDB_model_num" value is
        not among them are dropped.

    Returns
    ---------
    PandasMmcif : A new object whose "ATOM" and "HETATM" DataFrames
        (when present) contain only the rows of the requested models.
    """
    # Materialise once so that any iterable of model numbers is accepted.
    wanted = list(model_indices)
    biopandas_structure = copy.deepcopy(self)

    for record_name in ("ATOM", "HETATM"):
        if record_name not in biopandas_structure.df:
            continue
        frame = biopandas_structure.df[record_name]
        # Vectorised membership test instead of a per-row Python loop.
        keep = frame["pdbx_PDB_model_num"].isin(wanted)
        biopandas_structure.df[record_name] = frame.loc[keep]
    return biopandas_structure

def fetch_mmcif(
self,
Expand Down Expand Up @@ -583,4 +644,4 @@ def convert_to_pandas_pdb(
pandaspdb.df["HETATM"]["atom_number"] + hetatom_offset
)

return pandaspdb
return pandaspdb
Binary file added biopandas/mmcif/tests/data/2jyf.cif.gz
Binary file not shown.
30 changes: 30 additions & 0 deletions biopandas/mmcif/tests/test_multiple_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# BioPandas
# Author: Sebastian Raschka <[email protected]>
# Author: Arian Jamasb <[email protected]>
# License: BSD 3 clause
# Project Website: http://rasbt.github.io/biopandas/
# Code Repository: https://github.com/rasbt/biopandas
import os

from biopandas.mmcif import PandasMmcif

TESTDATA_FILENAME = os.path.join(os.path.dirname(__file__), "data", "2jyf.cif.gz")

def test_label_models():
    """label_models() must add a "model_id" column to the ATOM records."""
    structure = PandasMmcif().read_mmcif(TESTDATA_FILENAME)
    structure.label_models()
    atom_columns = structure.df["ATOM"].columns
    assert "model_id" in atom_columns

def test_get_model():
    """get_model() must keep only ATOM rows of the requested model."""
    biopandas_structure = PandasMmcif().read_mmcif(TESTDATA_FILENAME)
    MODEL_INDEX = 1
    new_biopandas_structure = biopandas_structure.get_model(MODEL_INDEX)
    model_numbers = new_biopandas_structure.df["ATOM"]["pdbx_PDB_model_num"]
    # Guard against a vacuous pass: .all() on an empty Series is True.
    assert not model_numbers.empty
    # Compare element-wise first, then reduce. `series.all() == 1` would
    # only compare a bool with the index and pass whenever values are truthy.
    assert (model_numbers == MODEL_INDEX).all()


def test_get_models():
    """get_models() must keep only ATOM rows of the requested models."""
    biopandas_structure = PandasMmcif().read_mmcif(TESTDATA_FILENAME)
    MODEL_INDICES = [1, 3, 5]

    new_biopandas_structure = biopandas_structure.get_models(MODEL_INDICES)
    model_numbers = new_biopandas_structure.df["ATOM"]["pdbx_PDB_model_num"]
    # Guard against a vacuous pass: .all() on an empty Series is True.
    assert not model_numbers.empty
    # Check membership per row. `series.all() in MODEL_INDICES` would test
    # `True in [1, 3, 5]`, which passes only because True == 1.
    assert model_numbers.isin(MODEL_INDICES).all()
44 changes: 22 additions & 22 deletions biopandas/mmtf/pandas_mmtf.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,21 +438,21 @@ def get_model(self, model_index: int) -> PandasMmtf:
structure subsetted to the given model.
"""

df = copy.deepcopy(self)
biopandas_structure = copy.deepcopy(self)

if "ATOM" in df.df.keys():
df.df["ATOM"] = df.df["ATOM"].loc[
df.df["ATOM"]["model_id"] == model_index
if "ATOM" in biopandas_structure.df.keys():
biopandas_structure.df["ATOM"] = biopandas_structure.df["ATOM"].loc[
biopandas_structure.df["ATOM"]["model_id"] == model_index
]
if "HETATM" in df.df.keys():
df.df["HETATM"] = df.df["HETATM"].loc[
df.df["HETATM"]["model_id"] == model_index
if "HETATM" in biopandas_structure.df.keys():
biopandas_structure.df["HETATM"] = biopandas_structure.df["HETATM"].loc[
biopandas_structure.df["HETATM"]["model_id"] == model_index
]
if "ANISOU" in df.df.keys():
df.df["ANISOU"] = df.df["ANISOU"].loc[
df.df["ANISOU"]["model_id"] == model_index
if "ANISOU" in biopandas_structure.df.keys():
biopandas_structure.df["ANISOU"] = biopandas_structure.df["ANISOU"].loc[
biopandas_structure.df["ANISOU"]["model_id"] == model_index
]
return df
return biopandas_structure

def get_models(self, model_indices: List[int]) -> PandasMmtf:
"""Returns a new PandasMmtf object with the dataframes subset to the
Expand All @@ -469,30 +469,30 @@ def get_models(self, model_indices: List[int]) -> PandasMmtf:
containing the structure subsetted to the given model.
"""

df = copy.deepcopy(self)
biopandas_structure = copy.deepcopy(self)

if "ATOM" in df.df.keys():
df.df["ATOM"] = df.df["ATOM"].loc[
if "ATOM" in biopandas_structure.df.keys():
biopandas_structure.df["ATOM"] = biopandas_structure.df["ATOM"].loc[
[
x in model_indices
for x in df.df["ATOM"]["model_id"].tolist()
for x in biopandas_structure.df["ATOM"]["model_id"].tolist()
]
]
if "HETATM" in df.df.keys():
df.df["HETATM"] = df.df["HETATM"].loc[
if "HETATM" in biopandas_structure.df.keys():
biopandas_structure.df["HETATM"] = biopandas_structure.df["HETATM"].loc[
[
x in model_indices
for x in df.df["HETATM"]["model_id"].tolist()
for x in biopandas_structure.df["HETATM"]["model_id"].tolist()
]
]
if "ANISOU" in df.df.keys():
df.df["ANISOU"] = df.df["ANISOU"].loc[
if "ANISOU" in biopandas_structure.df.keys():
biopandas_structure.df["ANISOU"] = biopandas_structure.df["ANISOU"].loc[
[
x in model_indices
for x in df.df["ANISOU"]["model_id"].tolist()
for x in biopandas_structure.df["ANISOU"]["model_id"].tolist()
]
]
return df
return biopandas_structure


def fetch_mmtf(pdb_code: str) -> pd.DataFrame:
Expand Down
48 changes: 24 additions & 24 deletions biopandas/pdb/pandas_pdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -843,20 +843,20 @@ def get_model(self, model_index: int) -> PandasPdb:
structure subsetted to the given model.
"""

df = deepcopy(self)
df.label_models()

if "ATOM" in df.df.keys():
df.df["ATOM"] = df.df["ATOM"].loc[df.df["ATOM"]["model_id"] == model_index]
if "HETATM" in df.df.keys():
df.df["HETATM"] = df.df["HETATM"].loc[
df.df["HETATM"]["model_id"] == model_index
biopandas_structure = deepcopy(self)
biopandas_structure.label_models()

if "ATOM" in biopandas_structure.df.keys():
biopandas_structure.df["ATOM"] = biopandas_structure.df["ATOM"].loc[biopandas_structure.df["ATOM"]["model_id"] == model_index]
if "HETATM" in biopandas_structure.df.keys():
biopandas_structure.df["HETATM"] = biopandas_structure.df["HETATM"].loc[
biopandas_structure.df["HETATM"]["model_id"] == model_index
]
if "ANISOU" in df.df.keys():
df.df["ANISOU"] = df.df["ANISOU"].loc[
df.df["ANISOU"]["model_id"] == model_index
if "ANISOU" in biopandas_structure.df.keys():
biopandas_structure.df["ANISOU"] = biopandas_structure.df["ANISOU"].loc[
biopandas_structure.df["ANISOU"]["model_id"] == model_index
]
return df
return biopandas_structure

def get_models(self, model_indices: List[int]) -> PandasPdb:
"""Returns a new PandasPDB object with the dataframes subset to the given model index.
Expand All @@ -872,22 +872,22 @@ def get_models(self, model_indices: List[int]) -> PandasPdb:
containing the structure subsetted to the given model.
"""

df = deepcopy(self)
df.label_models()
biopandas_structure = deepcopy(self)
biopandas_structure.label_models()

if "ATOM" in df.df.keys():
df.df["ATOM"] = df.df["ATOM"].loc[
[x in model_indices for x in df.df["ATOM"]["model_id"].tolist()]
if "ATOM" in biopandas_structure.df.keys():
biopandas_structure.df["ATOM"] = biopandas_structure.df["ATOM"].loc[
[x in model_indices for x in biopandas_structure.df["ATOM"]["model_id"].tolist()]
]
if "HETATM" in df.df.keys():
df.df["HETATM"] = df.df["HETATM"].loc[
[x in model_indices for x in df.df["HETATM"]["model_id"].tolist()]
if "HETATM" in biopandas_structure.df.keys():
biopandas_structure.df["HETATM"] = biopandas_structure.df["HETATM"].loc[
[x in model_indices for x in biopandas_structure.df["HETATM"]["model_id"].tolist()]
]
if "ANISOU" in df.df.keys():
df.df["ANISOU"] = df.df["ANISOU"].loc[
[x in model_indices for x in df.df["ANISOU"]["model_id"].tolist()]
if "ANISOU" in biopandas_structure.df.keys():
biopandas_structure.df["ANISOU"] = biopandas_structure.df["ANISOU"].loc[
[x in model_indices for x in biopandas_structure.df["ANISOU"]["model_id"].tolist()]
]
return df
return biopandas_structure

def to_pdb_stream(self, records: tuple[str] = ("ATOM", "HETATM")) -> StringIO:
"""Writes a PDB dataframe to a stream.
Expand Down
3 changes: 2 additions & 1 deletion docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ The CHANGELOG for the current development version is available at

### 0.5.1dev1 (UNRELEASED)

- Dev: switched testing framework entirely to pytest. Drops nose dependency due to version conflicts with Python 3.12 (`nose`) and 3.8 (`nose`)
- Feature: added methods to `PandasMmcif` that allow selecting structures by model ID. (PR #[145](https://github.com/BioPandas/biopandas/pull/145))
- Dev: switched testing framework entirely to pytest. Drops nose dependency due to version conflicts with Python 3.12 (`nose`) and 3.8 (`nose`). (PR #[146](https://github.com/BioPandas/biopandas/pull/146))


### 0.5.0dev1 (31/7/2023)
Expand Down

0 comments on commit 3e26557

Please sign in to comment.