Skip to content

Commit

Permalink
scope: map invalid amino acids to "X"
Browse files Browse the repository at this point in the history
  • Loading branch information
aditya0by0 committed Jan 27, 2025
1 parent 191c979 commit b7ca0e5
Showing 1 changed file with 8 additions and 3 deletions.
11 changes: 8 additions & 3 deletions chebai/preprocessing/datasets/scope/scope.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -12,6 +12,7 @@

import gzip
import os
+import re
import shutil
from abc import ABC, abstractmethod
from tempfile import NamedTemporaryFile
Expand Down Expand Up @@ -441,14 +442,18 @@ def _parse_pdb_sequence_file(self) -> Dict[str, Dict[str, str]]:
and values are dictionaries mapping chain IDs (lowercase) to their corresponding sequences.
"""
pdb_chain_seq_mapping: Dict[str, Dict[str, str]] = {}
+valid_amino_acids = "".join(ProteinDataReader.AA_LETTER)
+
for record in SeqIO.parse(
os.path.join(self.scope_root_dir, self.raw_file_names_dict["PDB"]), "fasta"
):
pdb_id, chain = record.id.split("_")
if str(record.seq):
-pdb_chain_seq_mapping.setdefault(pdb_id.lower(), {})[chain.lower()] = (
-str(record.seq)
-)
+sequence = re.sub(f"[^{valid_amino_acids}]", "X", str(record.seq))
+
+pdb_chain_seq_mapping.setdefault(pdb_id.lower(), {})[
+chain.lower()
+] = sequence
return pdb_chain_seq_mapping

@staticmethod
Expand Down

0 comments on commit b7ca0e5

Please sign in to comment.