From b7ca0e54cdd31405150aa7e75dd3b00aeda4f1bc Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Mon, 27 Jan 2025 11:24:33 +0100 Subject: [PATCH] scope: map invalid amino acids to "X" --- chebai/preprocessing/datasets/scope/scope.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/chebai/preprocessing/datasets/scope/scope.py b/chebai/preprocessing/datasets/scope/scope.py index 7108170a..99840448 100644 --- a/chebai/preprocessing/datasets/scope/scope.py +++ b/chebai/preprocessing/datasets/scope/scope.py @@ -12,6 +12,7 @@ import gzip import os +import re import shutil from abc import ABC, abstractmethod from tempfile import NamedTemporaryFile @@ -441,14 +442,18 @@ def _parse_pdb_sequence_file(self) -> Dict[str, Dict[str, str]]: and values are dictionaries mapping chain IDs (lowercase) to their corresponding sequences. """ pdb_chain_seq_mapping: Dict[str, Dict[str, str]] = {} + valid_amino_acids = "".join(ProteinDataReader.AA_LETTER) + for record in SeqIO.parse( os.path.join(self.scope_root_dir, self.raw_file_names_dict["PDB"]), "fasta" ): pdb_id, chain = record.id.split("_") if str(record.seq): - pdb_chain_seq_mapping.setdefault(pdb_id.lower(), {})[chain.lower()] = ( - str(record.seq) - ) + sequence = re.sub(f"[^{valid_amino_acids}]", "X", str(record.seq)) + + pdb_chain_seq_mapping.setdefault(pdb_id.lower(), {})[ + chain.lower() + ] = sequence return pdb_chain_seq_mapping @staticmethod