Commit a6cd5ed

Merge branch 'main' into stack_tmalign

a-r-j authored Jul 8, 2024 · 2 parents 7b52570 + 67aa2f2
Showing 36 changed files with 1,785 additions and 509 deletions.
4 changes: 2 additions & 2 deletions .appveyor.yml
@@ -14,8 +14,8 @@ install:
   - conda config --set always_yes yes --set changeps1 no
   - conda update -q conda
   - conda info -a
-  - conda create -q -n test-environment --channel=conda-forge mmtf-python numpy scipy pandas nose looseversion python=%PYTHON_VERSION%
+  - conda create -q -n test-environment --channel=conda-forge mmtf-python numpy scipy pandas pytest looseversion python=%PYTHON_VERSION%
   - activate test-environment

 test_script:
-  - nosetests -s -v
+  - pytest -s -v
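The nose-to-pytest switch above only changes the runner invocation; plain assert-style tests are collected by both. A minimal sketch of a test both runners would pick up, in a hypothetical file that is not part of this diff:

    # test_example.py (hypothetical)
    # pytest auto-discovers test_*.py modules and test_* functions,
    # so most nose-style suites run unchanged under `pytest -s -v`.
    def test_partition():
        assert "a.b".partition(".") == ("a", ".", "b")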
16 changes: 16 additions & 0 deletions .github/workflows/changelog-enforcer.yaml
@@ -0,0 +1,16 @@
+name: Changelog Enforcer
+
+on: # yamllint disable-line rule:truthy
+  pull_request:
+    types: [opened, synchronize, reopened, ready_for_review, labeled, unlabeled]
+
+jobs:
+
+  changelog:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v3
+      - uses: dangoslen/changelog-enforcer@v3
+        with:
+          skipLabels: 'skip-changelog'
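This workflow fails a pull request's checks unless the PR touches the changelog or carries the `skip-changelog` label (the `skipLabels` value above). A hypothetical CHANGELOG.md entry of the kind the enforcer expects:

    ## Unreleased
    - Add stacked TM-align support (hypothetical entry, for illustration only).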
1,213 changes: 1,039 additions & 174 deletions biopandas/constants.py

Large diffs are not rendered by default.

37 changes: 24 additions & 13 deletions biopandas/mmcif/mmcif_parser.py
@@ -22,19 +22,28 @@ def __init__(self, parser_obj):
         self.names_defined = False

     def add_name(self, name):
-        cat_name = type(name) == str and partition_string(name, ".") or ["", "", ""]
+        cat_name = (
+            isinstance(name, str) and partition_string(name, ".") or ["", "", ""]
+        )
         if cat_name[1]:
             if cat_name[0] not in self.parser_obj.current_target[-2]:
                 self.parser_obj.current_target[-2][cat_name[0]] = {}
-            if cat_name[2] not in self.parser_obj.current_target[-2][cat_name[0]]:
-                self.parser_obj.current_target[-2][cat_name[0]][cat_name[2]] = []
+            if (
+                cat_name[2]
+                not in self.parser_obj.current_target[-2][cat_name[0]]
+            ):
+                self.parser_obj.current_target[-2][cat_name[0]][
+                    cat_name[2]
+                ] = []
             self.ref_list.append(
                 self.parser_obj.current_target[-2][cat_name[0]][cat_name[2]]
             )
         else:
             if cat_name[0] not in self.parser_obj.current_target[-2]:
                 self.parser_obj.current_target[-2][cat_name[0]] = []
-            self.ref_list.append(self.parser_obj.current_target[-2][cat_name[0]])
+            self.ref_list.append(
+                self.parser_obj.current_target[-2][cat_name[0]]
+            )
         self.length = len(self.ref_list)

     def push_value(self, value):
@@ -218,16 +227,16 @@ def __repr__(self):
 def __cif_float_range__(inp):
     try:
         pos = inp.index("-", 1)
-        return (__CIFFloat__(inp[:pos]), __CIFFloat__(inp[pos + 1 :]))
-    except:
+        return (__CIFFloat__(inp[:pos]), __CIFFloat__(inp[pos + 1:]))
+    except Exception:
         return (__CIFFloat__(inp),)


 def __cif_int_range__(inp):
     try:
         pos = inp.index("-", 1)
-        return (__CIFInt__(inp[:pos]), __CIFInt__(inp[pos + 1 :]))
-    except:
+        return (__CIFInt__(inp[:pos]), __CIFInt__(inp[pos + 1:]))
+    except Exception:
         return (__CIFInt__(inp),)

@@ -239,12 +248,12 @@ def __load_cif_dic__(dic_file, force=False):
         if force:
             throw
         dic = json.loads(open(jsf).read())
-    except:
+    except Exception:
         parser = CIFParser()
         parser.parse(open(dic_file))
         json.dump(parser.data, open(jsf_dic, "w"))
         for k, v in parser.data["data_mmcif_pdbx.dic"].items():
-            if type(v) != dict or "item_type" not in v:
+            if not isinstance(v, dict) or "item_type" not in v:
                 continue
             name = partition_string(k[6:], ".")
             if name[0] not in dic:
@@ -285,11 +294,13 @@ def __dump_cif__(jso):
 def __dump_str__(inp):
     if inp is None:
         return "?"
-    if type(inp) is not str:
+    if not isinstance(inp, str):
         return str(inp)
     if re.search(__CIF_STR_NL_CHECK__, inp) is not None:
         return "\n;%s\n;" % inp
-    return "'%s'" % inp if re.search(__CIF_STR_CHECK__, inp) is not None else inp
+    return (
+        "'%s'" % inp if re.search(__CIF_STR_CHECK__, inp) is not None else inp
+    )


 def __pad_string__(inp, flength):
@@ -354,7 +365,7 @@ def __dump_part__(jso):

 def load_cif_data(data, do_clean=True, do_type=True):
     parser = CIFParser()
-    if type(data) == str:
+    if isinstance(data, str):
         parser.parse_string(data)
     else:
         parser.parse(data)  # fileobj
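Several hunks above replace exact-type checks like `type(x) == str` with `isinstance`, and bare `except:` clauses with `except Exception:` (which no longer swallows `KeyboardInterrupt` or `SystemExit`). A standalone sketch of the `isinstance` difference, using a hypothetical subclass:

    class TaggedStr(str):  # hypothetical subclass, for illustration
        pass

    s = TaggedStr("data_block")
    print(type(s) == str)      # False: exact-type comparison rejects subclasses
    print(isinstance(s, str))  # True: isinstance honors inheritance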
144 changes: 98 additions & 46 deletions biopandas/mmcif/pandas_mmcif.py
@@ -1,4 +1,5 @@
 """Class for working with MMCIF files."""
+
 # BioPandas
 # Authors: Arian Jamasb <[email protected]>,
 # Authors: Sebastian Raschka <[email protected]>
@@ -69,56 +70,76 @@ def read_mmcif(self, path):
         self.code = self.data["entry"]["id"][0].lower()
         return self

-    def fetch_mmcif(self, pdb_code: Optional[str] = None, uniprot_id: Optional[str] = None, source: str = "pdb"):
+    def fetch_mmcif(
+        self,
+        pdb_code: Optional[str] = None,
+        uniprot_id: Optional[str] = None,
+        source: str = "pdb",
+    ):
         """Fetches mmCIF file contents from the Protein Databank at rcsb.org or AlphaFold database at https://alphafold.ebi.ac.uk/.
-        .
-        Parameters
-        ----------
-        pdb_code : str, optional
-            A 4-letter PDB code, e.g., `"3eiy"` to retrieve structures from the PDB. Defaults to `None`.
-        uniprot_id : str, optional
-            A UniProt Identifier, e.g., `"Q5VSL9"` to retrieve structures from the AF2 database. Defaults to `None`.
-        source : str
-            The source to retrieve the structure from
-            (`"pdb"`, `"alphafold2-v3"` or `"alphafold2-v4"`). Defaults to `"pdb"`.
-        Returns
-        ---------
-        self
+        .
+        Parameters
+        ----------
+        pdb_code : str, optional
+            A 4-letter PDB code, e.g., `"3eiy"` to retrieve structures from the PDB. Defaults to `None`.
+        uniprot_id : str, optional
+            A UniProt Identifier, e.g., `"Q5VSL9"` to retrieve structures from the AF2 database. Defaults to `None`.
+        source : str
+            The source to retrieve the structure from
+            (`"pdb"`, `"alphafold2-v3"` or `"alphafold2-v4"`). Defaults to `"pdb"`.
+        Returns
+        ---------
+        self
         """
         # Sanitize input
         invalid_input_identifier_1 = pdb_code is None and uniprot_id is None
-        invalid_input_identifier_2 = pdb_code is not None and uniprot_id is not None
-        invalid_input_combination_1 = uniprot_id is not None and source == "pdb"
+        invalid_input_identifier_2 = (
+            pdb_code is not None and uniprot_id is not None
+        )
+        invalid_input_combination_1 = (
+            uniprot_id is not None and source == "pdb"
+        )
         invalid_input_combination_2 = pdb_code is not None and source in {
-            "alphafold2-v3", "alphafold2-v4"}
+            "alphafold2-v3",
+            "alphafold2-v4",
+        }

         if invalid_input_identifier_1 or invalid_input_identifier_2:
             raise ValueError(
-                "Please provide either a PDB code or a UniProt ID.")
+                "Please provide either a PDB code or a UniProt ID."
+            )

         if invalid_input_combination_1:
             raise ValueError(
-                "Please use a 'pdb_code' instead of 'uniprot_id' for source='pdb'.")
+                "Please use a 'pdb_code' instead of 'uniprot_id' for source='pdb'."
+            )
         elif invalid_input_combination_2:
             raise ValueError(
-                f"Please use a 'uniprot_id' instead of 'pdb_code' for source={source}.")
+                f"Please use a 'uniprot_id' instead of 'pdb_code' for source={source}."
+            )

         if source == "pdb":
             self.mmcif_path, self.mmcif_text = self._fetch_mmcif(pdb_code)
         elif source == "alphafold2-v3":
             af2_version = 3
-            self.mmcif_path, self.mmcif_text = self._fetch_af2(uniprot_id, af2_version)
+            self.mmcif_path, self.mmcif_text = self._fetch_af2(
+                uniprot_id, af2_version
+            )
         elif source == "alphafold2-v4":
             af2_version = 4
-            self.mmcif_path, self.mmcif_text = self._fetch_af2(uniprot_id, af2_version)
+            self.mmcif_path, self.mmcif_text = self._fetch_af2(
+                uniprot_id, af2_version
+            )
         else:
-            raise ValueError(f"Invalid source: {source}."
-                             " Please use one of 'pdb', 'alphafold2-v3' or 'alphafold2-v4'.")
+            raise ValueError(
+                f"Invalid source: {source}."
+                " Please use one of 'pdb', 'alphafold2-v3' or 'alphafold2-v4'."
+            )

         self._df = self._construct_df(text=self.mmcif_text)
         return self
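Taken together, the checks above enforce exactly one identifier per source: a PDB code for "pdb", a UniProt ID for the AlphaFold sources. A usage sketch matching the documented parameters (network access assumed):

    from biopandas.mmcif import PandasMmcif

    # Experimental structure from the PDB, by 4-letter code.
    pmmcif = PandasMmcif().fetch_mmcif(pdb_code="3eiy")

    # Predicted structure from the AlphaFold DB, by UniProt ID;
    # passing pdb_code with this source raises the ValueError above.
    af2 = PandasMmcif().fetch_mmcif(uniprot_id="Q5VSL9", source="alphafold2-v4")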
@@ -129,7 +150,8 @@ def _construct_df(self, text: str):
         self.data = data
         df: Dict[str, pd.DataFrame] = {}
         full_df = pd.DataFrame.from_dict(
-            data["atom_site"], orient="index").transpose()
+            data["atom_site"], orient="index"
+        ).transpose()
         full_df = full_df.astype(mmcif_col_types, errors="ignore")
         df["ATOM"] = pd.DataFrame(full_df[full_df.group_PDB == "ATOM"])
         df["HETATM"] = pd.DataFrame(full_df[full_df.group_PDB == "HETATM"])
@@ -148,8 +170,9 @@ def _fetch_mmcif(pdb_code):
             response = urlopen(url)
             txt = response.read()
             txt = (
-                txt.decode(
-                    "utf-8") if sys.version_info[0] >= 3 else txt.encode("ascii")
+                txt.decode("utf-8")
+                if sys.version_info[0] >= 3
+                else txt.encode("ascii")
             )
         except HTTPError as e:
             print(f"HTTP Error {e.code}")
@@ -166,11 +189,15 @@ def _fetch_af2(uniprot_id: str, af2_version: int = 3):
         try:
             response = urlopen(url)
             txt = response.read()
-            txt = txt.decode('utf-8') if sys.version_info[0] >= 3 else txt.encode('ascii')
+            txt = (
+                txt.decode("utf-8")
+                if sys.version_info[0] >= 3
+                else txt.encode("ascii")
+            )
         except HTTPError as e:
-            print(f'HTTP Error {e.code}')
+            print(f"HTTP Error {e.code}")
         except URLError as e:
-            print(f'URL Error {e.args}')
+            print(f"URL Error {e.args}")
         return url, txt

     @staticmethod
@@ -184,7 +211,8 @@ def _read_mmcif(path):
             openf = gzip.open
         else:
             allowed_formats = ", ".join(
-                (".cif", ".cif.gz", ".mmcif", ".mmcif.gz"))
+                (".cif", ".cif.gz", ".mmcif", ".mmcif.gz")
+            )
             raise ValueError(
                 f"Wrong file format; allowed file formats are {allowed_formats}"
             )
@@ -194,8 +222,9 @@ def _read_mmcif(path):

         if path.endswith(".gz"):
             txt = (
-                txt.decode(
-                    "utf-8") if sys.version_info[0] >= 3 else txt.encode("ascii")
+                txt.decode("utf-8")
+                if sys.version_info[0] >= 3
+                else txt.encode("ascii")
             )
         return path, txt

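The extension check above accepts plain or gzipped mmCIF files; a local-file sketch (hypothetical path):

    # .gz files are opened with gzip.open and decoded to text.
    pmmcif = PandasMmcif().read_mmcif("3eiy.cif.gz")
    print(pmmcif.df["ATOM"].shape)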
@@ -271,14 +300,19 @@ def _get_mainchain(
     def _get_hydrogen(df, invert):
         """Return only hydrogen atom entries from a DataFrame"""
         return (
-            df[(df["type_symbol"] != "H")] if invert else df[(
-                df["type_symbol"] == "H")]
+            df[(df["type_symbol"] != "H")]
+            if invert
+            else df[(df["type_symbol"] == "H")]
         )

     @staticmethod
     def _get_heavy(df, invert):
         """Return only heavy atom entries from a DataFrame"""
-        return df[df["type_symbol"] == "H"] if invert else df[df["type_symbol"] != "H"]
+        return (
+            df[df["type_symbol"] == "H"]
+            if invert
+            else df[df["type_symbol"] != "H"]
+        )

     @staticmethod
     def _get_calpha(df, invert, atom_col: str = "auth_atom_id"):
@@ -288,7 +322,11 @@ def _get_calpha(df, invert, atom_col: str = "auth_atom_id"):
     @staticmethod
     def _get_carbon(df, invert):
         """Return carbon atom entries from a DataFrame"""
-        return df[df["type_symbol"] != "C"] if invert else df[df["type_symbol"] == "C"]
+        return (
+            df[df["type_symbol"] != "C"]
+            if invert
+            else df[df["type_symbol"] == "C"]
+        )

     def amino3to1(
         self,
@@ -339,8 +377,9 @@ def amino3to1(
                 indices.append(ind)
             cmp = num

-        transl = tmp.iloc[indices][residue_col].map(
-            amino3to1dict).fillna(fillna)
+        transl = (
+            tmp.iloc[indices][residue_col].map(amino3to1dict).fillna(fillna)
+        )

         return pd.concat((tmp.iloc[indices][chain_col], transl), axis=1)

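The method returns a two-column frame: the chain column plus the mapped one-letter residue codes, one row per residue. A sketch using the default auth_* column names (an assumption; pass residue_col/chain_col to override):

    pmmcif = PandasMmcif().fetch_mmcif(pdb_code="3eiy")
    seq = pmmcif.amino3to1()
    # Join chain A's one-letter codes into a sequence string.
    print("".join(seq.loc[seq["auth_asym_id"] == "A", "auth_comp_id"]))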
@@ -425,7 +464,9 @@ def distance(self, xyz=(0.00, 0.00, 0.00), records=("ATOM", "HETATM")):

         return np.sqrt(
             np.sum(
-                df[["Cartn_x", "Cartn_y", "Cartn_z"]].subtract(xyz, axis=1) ** 2, axis=1
+                df[["Cartn_x", "Cartn_y", "Cartn_z"]].subtract(xyz, axis=1)
+                ** 2,
+                axis=1,
             )
         )

@@ -451,7 +492,9 @@ def distance_df(df, xyz=(0.00, 0.00, 0.00)):
         """
         return np.sqrt(
             np.sum(
-                df[["Cartn_x", "Cartn_y", "Cartn_z"]].subtract(xyz, axis=1) ** 2, axis=1
+                df[["Cartn_x", "Cartn_y", "Cartn_z"]].subtract(xyz, axis=1)
+                ** 2,
+                axis=1,
            )
         )

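Both variants compute the same Euclidean distance over the Cartn_x/Cartn_y/Cartn_z columns; `distance` filters by record type first, while `distance_df` is the static form for a DataFrame you already hold. A sketch (assuming a structure fetched as above):

    pmmcif = PandasMmcif().fetch_mmcif(pdb_code="3eiy")

    # Distances of ATOM records from a reference point (pandas Series).
    d = pmmcif.distance(xyz=(0.0, 0.0, 0.0), records=("ATOM",))

    # Same computation via the static helper on an explicit DataFrame.
    d_df = PandasMmcif.distance_df(pmmcif.df["ATOM"], xyz=(0.0, 0.0, 0.0))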
@@ -485,7 +528,11 @@ def read_mmcif_from_list(self, mmcif_lines):
         self.code = self.data["entry"]["id"][0].lower()
         return self

-    def convert_to_pandas_pdb(self, offset_chains: bool = True, records: List[str] = ["ATOM", "HETATM"]) -> PandasPdb:
+    def convert_to_pandas_pdb(
+        self,
+        offset_chains: bool = True,
+        records: List[str] = ["ATOM", "HETATM"],
+    ) -> PandasPdb:
         """Returns a PandasPdb object with the same data as the PandasMmcif
         object.
@@ -525,10 +572,15 @@ def convert_to_pandas_pdb(self, offset_chains: bool = True, records: List[str] =

         # Update atom numbers
         if offset_chains:
-            offsets = pandaspdb.df["ATOM"]["chain_id"].astype(
-                "category").cat.codes
-            pandaspdb.df["ATOM"]["atom_number"] = pandaspdb.df["ATOM"]["atom_number"] + offsets
+            offsets = (
+                pandaspdb.df["ATOM"]["chain_id"].astype("category").cat.codes
+            )
+            pandaspdb.df["ATOM"]["atom_number"] = (
+                pandaspdb.df["ATOM"]["atom_number"] + offsets
+            )
             hetatom_offset = offsets.max() + 1
-            pandaspdb.df["HETATM"]["atom_number"] = pandaspdb.df["HETATM"]["atom_number"] + hetatom_offset
+            pandaspdb.df["HETATM"]["atom_number"] = (
+                pandaspdb.df["HETATM"]["atom_number"] + hetatom_offset
+            )

         return pandaspdb
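The offset logic above renumbers atoms so chains do not reuse atom numbers: ATOM records are shifted by their chain's categorical code, and HETATM records are shifted by one more than the largest chain offset. A conversion sketch:

    pmmcif = PandasMmcif().fetch_mmcif(pdb_code="3eiy")
    ppdb = pmmcif.convert_to_pandas_pdb(offset_chains=True)
    print(ppdb.df["ATOM"][["atom_number", "chain_id"]].head())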