From 7f6e095422ce0e34f3158d8b8680450f45679075 Mon Sep 17 00:00:00 2001
From: NatureGeorge <414731811@qq.com>
Date: Wed, 24 Feb 2021 09:31:07 +0800
Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A6v0.2.7a1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pdb_profiling/__init__.py               |   2 +-
 pdb_profiling/commands/__init__.py      |  19 +++++
 pdb_profiling/commands/command.py       |  15 ++--
 pdb_profiling/data.py                   | 106 ++++++++++++++++++++++-
 pdb_profiling/processors/pdbe/record.py | 109 ++++++++++++++++++------
 pdb_profiling/utils.py                  |   3 +-
 pdb_profiling/warnings.py               |   6 +-
 setup.py                                |   2 +-
 8 files changed, 224 insertions(+), 38 deletions(-)

diff --git a/pdb_profiling/__init__.py b/pdb_profiling/__init__.py
index 67b26c4..23475f9 100644
--- a/pdb_profiling/__init__.py
+++ b/pdb_profiling/__init__.py
@@ -4,7 +4,7 @@
 # @Author: ZeFeng Zhu
 # @Last Modified: 2020-05-13 08:54:09 pm
 # @Copyright (c) 2020 MinghuiGroup, Soochow University
-__version__ = '0.2.6'
+__version__ = '0.2.7'
 
 
 def default_config(folder='./'):
diff --git a/pdb_profiling/commands/__init__.py b/pdb_profiling/commands/__init__.py
index d6b5101..35f0033 100644
--- a/pdb_profiling/commands/__init__.py
+++ b/pdb_profiling/commands/__init__.py
@@ -28,7 +28,26 @@ class IDMapping(orm.Model):
             Entry = orm.String(max_length=50, primary_key=True)
             isoform = orm.String(max_length=50, primary_key=True)
             is_canonical = orm.Boolean()
+
+        class ResidueMapping(orm.Model):
+            __tablename__ = 'ResidueMapping'
+            __metadata__ = self.metadata
+            __database__ = self.database
+            UniProt = orm.String(max_length=50, primary_key=True)
+            author_insertion_code = orm.String(max_length=50, allow_null=True, allow_blank=True, default='')
+            author_residue_number = orm.Integer()
+            chain_id = orm.String(max_length=10)
+            struct_asym_id = orm.String(max_length=10, primary_key=True)
+            entity_id = orm.Integer(primary_key=True)
+            pdb_id = orm.String(max_length=4, primary_key=True)
+            residue_number = orm.Integer(primary_key=True)
+            unp_residue_number = orm.Integer(primary_key=True)
+            residue_name = orm.String(max_length=10)
+            observed_ratio = orm.Float()
+            multiple_conformers = orm.JSON(allow_null=True)
+            conflict_code = orm.String(max_length=3, allow_null=True)
 
         self.Mutation = Mutation
         self.IDMapping = IDMapping
+        self.ResidueMapping = ResidueMapping
diff --git a/pdb_profiling/commands/command.py b/pdb_profiling/commands/command.py
index 1205003..33c53ab 100644
--- a/pdb_profiling/commands/command.py
+++ b/pdb_profiling/commands/command.py
@@ -129,7 +129,7 @@ def id_mapping(ctx, input, column, sep, chunksize):
 @click.option('--func', type=str, default='pipe_select_mo')
 @click.option('--kwargs', type=str, default='{}')
 @click.option('--chunksize', type=int, help="the chunksize parameter", default=200)
-@click.option('--entry_filter', type=str, default='(release_date < "20201020") and ((experimental_method in ["X-ray diffraction", "Electron Microscopy"] and resolution <= 3) or experimental_method == "Solution NMR")')
+@click.option('--entry_filter', type=str, default='(release_date < "20210101") and ((experimental_method in ["X-ray diffraction", "Electron Microscopy"] and resolution <= 3) or experimental_method == "Solution NMR")')
 @click.option('--chain_filter', type=str, default="UNK_COUNT < SEQRES_COUNT and ca_p_only == False and identity >=0.9 and repeated == False and reversed == False and OBS_COUNT > 20")
 @click.option('--skip_pdbs', type=str, default='1fc2,6wrg,5jm5,6vnn,2i6l,4zai,5jn1,6bj0,6yth,4fc3,7acu,6lsd,6llc,6xoz,6xp0,6xp1,6xp2,6xp3,6xp4,6xp5,6xp6,6xp7,6xp8,6xpa,6zqz,6t5h,6xwd,6xxc')
 @click.option('--omit', type=int, default=0)
@@ -204,9 +204,9 @@ def get_unp_id(args):
 @Interface.command("residue-mapping")
 @click.option('--input', type=click.Path())
 @click.option('--chunksize', type=int, help="the chunksize parameter", default=10000)
-@click.option('--output', type=str)
-def residue_mapping(input, chunksize, output):
-    output = Path(output)
+@click.option('--output', type=str, default=None)
+@click.pass_context
+def residue_mapping(ctx, input, chunksize, output):
     dfs = read_csv(input, sep='\t', keep_default_na=False,
                    na_values=['NULL', 'null'], chunksize=chunksize)
     for df in dfs:
@@ -222,7 +222,12 @@ def residue_mapping(input, chunksize, output):
     with Progress(*progress_bar_args) as p:
         res = ob.run(p.track).result()
     res_mapping_df = concat(res, sort=False, ignore_index=True)
-    res_mapping_df[sorted(res_mapping_df.columns)].to_csv(output, sep='\t', mode='a+', index=False, header=not output.exists())
+    if output is not None:
+        output = Path(output)
+        res_mapping_df[sorted(res_mapping_df.columns)].to_csv(output, sep='\t', mode='a+', index=False, header=not output.exists())
+    else:
+        sqlite_api = ctx.obj['custom_db']
+        sqlite_api.sync_insert(sqlite_api.ResidueMapping, res_mapping_df.to_dict('records'))
     sleep(uniform(0, 1))
diff --git a/pdb_profiling/data.py b/pdb_profiling/data.py
index e097ce3..149c3bc 100644
--- a/pdb_profiling/data.py
+++ b/pdb_profiling/data.py
@@ -4,12 +4,37 @@
 # @Author: ZeFeng Zhu
 # @Last Modified: 2020-10-10 04:06:06 pm
 # @Copyright (c) 2020 MinghuiGroup, Soochow University
+# from copy import deepcopy
+
+'''
+class SwapKeyDict(dict):
+
+    def __missing__(self, key):
+        swap = key[::-1]
+        if swap not in self:
+            if hasattr(self, 'gap') and None in key:
+                return self.gap
+            raise KeyError(key)
+        else:
+            return self[swap]
+
+    def set_gap(self, gap):
+        cur = deepcopy(self)
+        cur.gap = gap
+        return cur
+'''
+
+def store_swap_key(data_from, data_to):
+    for key, value in data_from.items():
+        data_to[key] = value
+        data_to[key[::-1]] = value
+    return data_to
 
 '''
 Matrix Data From: https://github.com/biopython/biopython/blob/master/Bio/SubsMat/MatrixInfo.py
 '''
-blosum62 = {
+blosum62 = store_swap_key({
     ('W', 'F'): 1, ('L', 'R'): -2, ('S', 'P'): -1, ('V', 'T'): 0,
     ('Q', 'Q'): 5, ('N', 'A'): -2, ('Z', 'Y'): -2, ('W', 'R'): -3,
     ('Q', 'A'): -1, ('S', 'D'): 0, ('H', 'H'): 8, ('S', 'H'): -1,
@@ -79,7 +104,79 @@
     ('Z', 'N'): 0, ('X', 'A'): 0, ('B', 'R'): -1, ('B', 'N'): 3,
     ('F', 'D'): -3, ('X', 'Y'): -1, ('Z', 'R'): 0, ('F', 'H'): -1,
     ('B', 'F'): -3, ('F', 'L'): 0, ('X', 'Q'): -1, ('B', 'B'): 4
-}
+}, dict())
+
+blosum95 = store_swap_key({
+    ("W", "F"): 0, ("L", "R"): -3, ("S", "P"): -2, ("V", "T"): -1,
+    ("Q", "Q"): 7, ("N", "A"): -2, ("Z", "Y"): -4, ("W", "R"): -4,
+    ("Q", "A"): -1, ("S", "D"): -1, ("H", "H"): 9, ("S", "H"): -2,
+    ("H", "D"): -2, ("L", "N"): -5, ("W", "A"): -4, ("Y", "M"): -3,
+    ("G", "R"): -4, ("Y", "I"): -2, ("Y", "E"): -4, ("B", "Y"): -4,
+    ("Y", "A"): -3, ("V", "D"): -5, ("B", "S"): -1, ("Y", "Y"): 8,
+    ("G", "N"): -1, ("E", "C"): -6, ("Y", "Q"): -3, ("Z", "Z"): 4,
+    ("V", "A"): -1, ("C", "C"): 9, ("M", "R"): -2, ("V", "E"): -3,
+    ("T", "N"): -1, ("P", "P"): 8, ("V", "I"): 3, ("V", "S"): -3,
+    ("Z", "P"): -2, ("V", "M"): 0, ("T", "F"): -3, ("V", "Q"): -3,
+    ("K", "K"): 6, ("P", "D"): -3, ("I", "H"): -4, ("I", "D"): -5,
+    ("T", "R"): -2, ("P", "L"): -4, ("K", "G"): -3, ("M", "N"): -3,
+    ("P", "H"): -3, ("F", "Q"): -4, ("Z", "G"): -3, ("X", "L"): -2,
-3, ("F", "Q"): -4, ("Z", "G"): -3, ("X", "L"): -2, + ("T", "M"): -1, ("Z", "C"): -5, ("X", "H"): -2, ("D", "R"): -3, + ("B", "W"): -6, ("X", "D"): -2, ("Z", "K"): 0, ("F", "A"): -3, + ("Z", "W"): -4, ("F", "E"): -5, ("D", "N"): 1, ("B", "K"): -1, + ("X", "X"): -2, ("F", "I"): -1, ("B", "G"): -2, ("X", "T"): -1, + ("F", "M"): -1, ("B", "C"): -4, ("Z", "I"): -4, ("Z", "V"): -3, + ("S", "S"): 5, ("L", "Q"): -3, ("W", "E"): -5, ("Q", "R"): 0, + ("N", "N"): 7, ("W", "M"): -2, ("Q", "C"): -4, ("W", "I"): -4, + ("S", "C"): -2, ("L", "A"): -2, ("S", "G"): -1, ("L", "E"): -4, + ("W", "Q"): -3, ("H", "G"): -3, ("S", "K"): -1, ("Q", "N"): 0, + ("N", "R"): -1, ("H", "C"): -5, ("Y", "N"): -3, ("G", "Q"): -3, + ("Y", "F"): 3, ("C", "A"): -1, ("V", "L"): 0, ("G", "E"): -3, + ("G", "A"): -1, ("K", "R"): 2, ("E", "D"): 1, ("Y", "R"): -3, + ("M", "Q"): -1, ("T", "I"): -2, ("C", "D"): -5, ("V", "F"): -2, + ("T", "A"): 0, ("T", "P"): -2, ("B", "P"): -3, ("T", "E"): -2, + ("V", "N"): -4, ("P", "G"): -4, ("M", "A"): -2, ("K", "H"): -1, + ("V", "R"): -4, ("P", "C"): -5, ("M", "E"): -3, ("K", "L"): -3, + ("V", "V"): 5, ("M", "I"): 1, ("T", "Q"): -1, ("I", "G"): -6, + ("P", "K"): -2, ("M", "M"): 7, ("K", "D"): -2, ("I", "C"): -2, + ("Z", "D"): 0, ("F", "R"): -4, ("X", "K"): -1, ("Q", "D"): -1, + ("X", "G"): -3, ("Z", "L"): -4, ("X", "C"): -3, ("Z", "H"): 0, + ("B", "L"): -5, ("B", "H"): -1, ("F", "F"): 7, ("X", "W"): -4, + ("B", "D"): 4, ("D", "A"): -3, ("S", "L"): -3, ("X", "S"): -1, + ("F", "N"): -4, ("S", "R"): -2, ("W", "D"): -6, ("V", "Y"): -3, + ("W", "L"): -3, ("H", "R"): -1, ("W", "H"): -3, ("H", "N"): 0, + ("W", "T"): -4, ("T", "T"): 6, ("S", "F"): -3, ("W", "P"): -5, + ("L", "D"): -5, ("B", "I"): -5, ("L", "H"): -4, ("S", "N"): 0, + ("B", "T"): -1, ("L", "L"): 5, ("Y", "K"): -3, ("E", "Q"): 2, + ("Y", "G"): -5, ("Z", "S"): -1, ("Y", "C"): -4, ("G", "D"): -2, + ("B", "V"): -5, ("E", "A"): -1, ("Y", "W"): 2, ("E", "E"): 6, + ("Y", "S"): -3, ("C", "N"): -4, ("V", "C"): -2, ("T", "H"): -2, + ("P", "R"): -3, ("V", "G"): -5, ("T", "L"): -2, ("V", "K"): -3, + ("K", "Q"): 1, ("R", "A"): -2, ("I", "R"): -4, ("T", "D"): -2, + ("P", "F"): -5, ("I", "N"): -4, ("K", "I"): -4, ("M", "D"): -5, + ("V", "W"): -3, ("W", "W"): 11, ("M", "H"): -3, ("P", "N"): -3, + ("K", "A"): -1, ("M", "L"): 2, ("K", "E"): 0, ("Z", "E"): 4, + ("X", "N"): -2, ("Z", "A"): -1, ("Z", "M"): -2, ("X", "F"): -2, + ("K", "C"): -5, ("B", "Q"): -1, ("X", "B"): -2, ("B", "M"): -4, + ("F", "C"): -3, ("Z", "Q"): 4, ("X", "Z"): -1, ("F", "G"): -5, + ("B", "E"): 0, ("X", "V"): -2, ("F", "K"): -4, ("B", "A"): -3, + ("X", "R"): -2, ("D", "D"): 7, ("W", "G"): -5, ("Z", "F"): -4, + ("S", "Q"): -1, ("W", "C"): -4, ("W", "K"): -5, ("H", "Q"): 1, + ("L", "C"): -3, ("W", "N"): -5, ("S", "A"): 1, ("L", "G"): -5, + ("W", "S"): -4, ("S", "E"): -1, ("H", "E"): -1, ("S", "I"): -3, + ("H", "A"): -3, ("S", "M"): -3, ("Y", "L"): -2, ("Y", "H"): 1, + ("Y", "D"): -5, ("E", "R"): -1, ("X", "P"): -3, ("G", "G"): 6, + ("G", "C"): -5, ("E", "N"): -1, ("Y", "T"): -3, ("Y", "P"): -5, + ("T", "K"): -1, ("A", "A"): 5, ("P", "Q"): -2, ("T", "C"): -2, + ("V", "H"): -4, ("T", "G"): -3, ("I", "Q"): -4, ("Z", "T"): -2, + ("C", "R"): -5, ("V", "P"): -4, ("P", "E"): -2, ("M", "C"): -3, + ("K", "N"): 0, ("I", "I"): 5, ("P", "A"): -1, ("M", "G"): -4, + ("T", "S"): 1, ("I", "E"): -4, ("P", "M"): -3, ("M", "K"): -2, + ("I", "A"): -2, ("P", "I"): -4, ("R", "R"): 7, ("X", "M"): -2, + ("L", "I"): 1, ("X", "I"): -2, ("Z", "B"): 0, ("X", "E"): -2, + ("Z", "N"): -1, ("X", "A"): -1, ("B", "R"): 
-2, ("B", "N"): 4, + ("F", "D"): -5, ("X", "Y"): -2, ("Z", "R"): -1, ("F", "H"): -2, + ("B", "F"): -5, ("F", "L"): 0, ("X", "Q"): -1, ("B", "B"): 4 +}, dict()) ''' @@ -95,7 +192,8 @@ After transform the distance matrix into a similarity matrix, normalize each value by the highest_similarity. NOTE: now the maximum similarity is 1 ''' -miyata_similarity_matrix = { + +miyata_similarity_matrix = store_swap_key({ ('A', 'A'): 1.0, ('A', 'C'): -0.112, ('A', 'P'): 0.952, @@ -305,4 +403,4 @@ ('Y', 'S'): -1.664, ('Y', 'T'): -0.96, ('Y', 'V'): -0.216, - ('Y', 'Y'): 1.0} + ('Y', 'Y'): 1.0}, dict()) diff --git a/pdb_profiling/processors/pdbe/record.py b/pdb_profiling/processors/pdbe/record.py index e4c92ed..d4fc7ff 100644 --- a/pdb_profiling/processors/pdbe/record.py +++ b/pdb_profiling/processors/pdbe/record.py @@ -5,6 +5,7 @@ # @Last Modified: 2020-08-11 10:48:11 pm # @Copyright (c) 2020 MinghuiGroup, Soochow University from typing import Iterable, Union, Callable, Optional, Hashable, Dict, Coroutine, List, Tuple +from inspect import isawaitable from numpy import array, where as np_where, count_nonzero, nan, dot, exp, square from pathlib import Path from pandas import isna, concat, DataFrame, Series, merge @@ -49,7 +50,7 @@ from pdb_profiling.warnings import (WithoutCifKeyWarning, PISAErrorWarning, ConflictChainIDWarning, PossibleObsoletedUniProtWarning, PossibleObsoletedPDBEntryWarning, SkipAssemblyWarning, - PeptideLinkingWarning, MultiWrittenWarning) + PeptideLinkingWarning, MultiWrittenWarning, WithoutRCSBClusterMembershipWarning) from pdb_profiling.ensure import aio_file_exists_stat from textdistance import sorensen from warnings import warn @@ -137,6 +138,11 @@ def get_db_semaphore(cls): @classmethod def set_folder(cls, folder: Union[Path, str]): + """Set your folder path + + Args: + folder (Union[Path, str]): the path to set + """ folder = Path(folder) assert folder.exists(), "Folder not exist! Please create it or input a valid folder!" cls.folder = folder @@ -154,6 +160,17 @@ def check_folder(cls): raise ValueError(f"Please set folder via {cls.__name__}.set_folder(folder: Union[Path, str])") def fetch_from_pdbe_api(self, api_suffix: str, then_func: Optional[Callable[[Unfuture], Unfuture]] = None, json: bool = False, mask_id: str = None) -> Unfuture: + """fetch data from PDBe API + + Args: + api_suffix (str): the suffix of the API that you want to retrieve info. + then_func (Optional[Callable[[Unfuture], Unfuture]], optional): function arg that pass to Unfuture.then(). Defaults to None. + json (bool, optional): whether the data is treated and returned as JSON. Defaults to False. + mask_id (str, optional): Defaults to None. + + Returns: + Unfuture: Unfuture object + """ assert api_suffix in API_SET, f"Invlaid API SUFFIX! 
         identifier = self.get_id() if mask_id is None else mask_id
         task = self.tasks.get((self.__class__.__name__, api_suffix, then_func, json, identifier), None)
@@ -229,7 +246,7 @@ async def to_dataframe_with_kwargs(cls, path, **kwargs):
     @staticmethod
     @unsync
     async def result_set_to_dataframe(data):
-        if isinstance(data, (Unfuture, Coroutine)):
+        if isawaitable(data):
             data = await data
         if data is None:
             return
@@ -384,7 +401,7 @@ def fetch_from_PDBArchive(self, api_suffix: str, then_func: Optional[Callable[[U
     @classmethod
     @unsync
     async def cif2atom_sites_df(cls, path: Union[Unfuture, Coroutine, str, Path]):
-        if isinstance(path, (Unfuture, Coroutine)):
+        if isawaitable(path):
             path = await path
         async with aiofiles_open(path, 'rt') as file_io:
             handle = await file_io.read()
@@ -414,7 +431,7 @@ async def cif2residue_listing(cls, path: Union[Unfuture, Coroutine, str, Path]):
                 'authore_residue_number',
                 'chain_id',
                 'author_insertion_code')
-        if isinstance(path, (Unfuture, Coroutine)):
+        if isawaitable(path):
             path = await path
         with gzip_open(path, 'rt') as handle:
             mmcif_dict = MMCIF2DictPlus(handle, cols)
@@ -490,7 +507,7 @@ def to_rank(rank_dict, assembly_id, struct_asym_id):
             assert var[1] <= var[0]
             return var[1]
 
-        if isinstance(path, (Unfuture, Coroutine)):
+        if isawaitable(path):
             path = await path
         path = Path(path)
         if path.suffix == '.cif':
@@ -1264,7 +1281,7 @@ async def pipe_interface_res_dict(self, chain_pairs=None, au2bu:bool=False, focu
     async def expand_multiple_conformers(dfrm: Union[DataFrame, Unfuture, Coroutine]):
         '''for residue_listing dataframe'''
         '''
-        if isinstance(dfrm, (Coroutine, Unfuture)):
+        if isawaitable(dfrm):
             dfrm = await dfrm
         '''
         pass
@@ -1387,6 +1404,15 @@ async def rcsb_cluster_membership(self, entity_id, identity_cutoff:int=100):
         }
         '''
         dfs = []
+        try:
+            assert res['data']['polymer_entity']['rcsb_cluster_membership'] is not None
+        except Exception as e:
+            info = f"polymer_entity(entry_id: \"{self.pdb_id}\", entity_id: \"{entity_id}\") -> {res}"
+            if isinstance(e, AssertionError):
+                warn(info, WithoutRCSBClusterMembershipWarning)
+                return
+            else:
+                raise ValueError(info)
         for i in res['data']['polymer_entity']['rcsb_cluster_membership']:
             if i['identity'] != identity_cutoff:
                 continue
@@ -1431,7 +1457,7 @@ def __init__(self, pdb_ass_id, pdb_ob: Optional[PDB]=None):
         NOTE: reference:
         '''
         self.interface_filters = {
-            'symmetry_operator': ('isin', ('1_555', '1555'))  # 1555 for api%pisa%asiscomponent%+6e4h%0%interfaces
+            'symmetry_operator': ('isin', ('1_555', '1555', 1555))  # 1555 for api%pisa%asiscomponent%+6e4h%0%interfaces
         }  # 'structure_2.symmetry_id': ('eq', '1_555'),'css': ('ge', 0)
 
     def set_id(self, pdb_ass_id: str):
@@ -1533,7 +1559,7 @@ def to_interface_id(pdb_assembly_id, focus_interface_ids):
         if interfacelist_df is None:
             interfacelist_df, use_au = await self.get_interfacelist_df(
                 'api/pisa/interfacelist/', PDBAssemble.to_interfacelist_df)
-            self.interface_filters['structure_2.symmetry_id'] = ('isin', ('1_555', '1555'))
+            self.interface_filters['structure_2.symmetry_id'] = ('isin', ('1_555', '1555', 1555))
             del self.interface_filters['symmetry_operator']
         else:
             interfacelist_df = interfacelist_df.rename(columns={'complex_formation_score': 'css'})
@@ -1860,8 +1886,19 @@ async def get_interface_res_dict(self, **kwargs):
 
 
 class SIFTS(PDB):
+    '''
+    TODO
+
+    1. Better OligoState
+        * RAW (both from wwPDB and self assigned)
+        * FILTERED
+    2. Define Best Isoform
+    3. UniProt Isoform Interaction
+    4. PDBChain Instance Interaction (Biological Relevance)
+    '''
 
     tasks = LRUCache(maxsize=1024)
+    sa_cache = LRUCache(maxsize=100)
 
     EntityChain = namedtuple('EntityChain', 'pdb_id entity_chain_info entity_count chain_count')
     UniProtEntity = namedtuple('UniProtEntity', 'pdb_id unp_entity_info entity_unp_info entity_with_unp_count min_unp_count')
@@ -1906,7 +1943,7 @@ def fetch_unp_fasta(cls, identifier):
     @classmethod
     @unsync
     async def complete_chains(cls, dfrm: Union[DataFrame, Unfuture, Coroutine]):
-        if isinstance(dfrm, (Coroutine, Unfuture)):
+        if isawaitable(dfrm):
             dfrm = await dfrm
         if cls.complete_chains_run_as_completed:
             res = await SIFTSs(dfrm.pdb_id.unique()).fetch('fetch_from_pdbe_api',
@@ -2136,7 +2173,7 @@ async def add_residue_conflict(cls, dfrm: Union[DataFrame, Tuple, Unfuture, Coro
         '''
         TODO: optimization
         '''
-        if isinstance(dfrm, (Coroutine, Unfuture)):
+        if isawaitable(dfrm):
             dfrm = await dfrm
         if isinstance(dfrm, Tuple):
             dfrm = dfrm[0]
@@ -2188,40 +2225,54 @@ async def renew_sifts_mapping_from_graph_api(cls, UniProt, pdb_id, entity_id, pd
     @staticmethod
     @unsync
     async def deal_with_identical_entity_seq(dfrm):
-        if isinstance(dfrm, (Coroutine, Unfuture)):
+        if isawaitable(dfrm):
             dfrm = await dfrm
         already = set()
         cluster_dfs = []
-        # dfrm = dfrm.copy()
-        # dfrm['pdb_sequence'] = ''
+        dfrm = dfrm.copy()
+        dfrm['pdb_sequence'] = b''
         dfrm_nr = dfrm[['pdb_id', 'entity_id']].drop_duplicates()
         for pdb_id, entity_id in zip(dfrm_nr.pdb_id, dfrm_nr.entity_id):
-            # dfrm.loc[dfrm[dfrm.pdb_sequence.eq('') & dfrm.pdb_id.eq(pdb_id) & dfrm.entity_id.eq(entity_id)].index, 'pdb_sequence'] = await PDB(pdb_id).get_sequence(entity_id=entity_id, mode='raw_pdb_seq')
+            dfrm.loc[dfrm[dfrm.pdb_sequence.eq(b'') & dfrm.pdb_id.eq(pdb_id) & dfrm.entity_id.eq(entity_id)].index, 'pdb_sequence'] = compress(bytes(await PDB(pdb_id).get_sequence(entity_id=entity_id, mode='raw_pdb_seq'), encoding='utf-8'))
             if (pdb_id, entity_id) in already:
                 continue
             cur_cluster_df = await PDB(pdb_id).rcsb_cluster_membership(entity_id=entity_id, identity_cutoff=100)
-            already |= set(zip(cur_cluster_df.pdb_id, cur_cluster_df.entity_id))
+            try:
+                assert cur_cluster_df is not None
+                already |= set(zip(cur_cluster_df.pdb_id, cur_cluster_df.entity_id))
+            except AssertionError:
+                cur_cluster_df = DataFrame([dict(pdb_id=pdb_id, entity_id=entity_id, cluster_id=-1)])
             cluster_dfs.append(cur_cluster_df)
         cluster_df = concat(cluster_dfs, sort=False, ignore_index=True)
         assert not any(cluster_df.duplicated())
         dfrm = dfrm.merge(cluster_df[['pdb_id', 'entity_id', 'cluster_id']], how='left')
         assert not any(dfrm.cluster_id.isnull()), f"{dfrm[dfrm.cluster_id.isnull()]}"
-        return dfrm
+        dfrm['fix_cluster_id'] = dfrm.groupby(['cluster_id', 'pdb_sequence']).ngroup().astype(str) + '_' + dfrm.cluster_id.astype(str)
+        # ignore/override cases like (P00720,2b7x,B vs. P00720,2b7x,A)
+        return dfrm.drop(columns=['pdb_sequence'])
 
     @classmethod
     @unsync
     async def double_check_conflict_and_range(cls, dfrm: Union[DataFrame, Unfuture, Coroutine]):
-        if isinstance(dfrm, (Coroutine, Unfuture)):
+        if isawaitable(dfrm):
             dfrm = await dfrm
         focus_part = dfrm[
             dfrm.sifts_range_tag.isin(('Deletion', 'Insertion_Undivided', 'InDel_2', 'InDel_3')) &
             (dfrm.conflict_pdb_index.apply(get_str_dict_len)/dfrm.new_pdb_range.apply(range_len)).ge(0.1)]
         if len(focus_part) == 0:
             return dfrm
-        tasks = tuple(map(cls.renew_sifts_mapping_from_graph_api, focus_part.UniProt, focus_part.pdb_id, focus_part.entity_id, focus_part.pdb_range, focus_part.unp_range, focus_part.range_diff))
-        dfrm.loc[focus_part.index, ['new_unp_range', 'new_pdb_range']] = [await task for task in tasks]
-        res = await cls.add_residue_conflict(dfrm.loc[focus_part.index].drop(columns=['conflict_pdb_index', 'raw_pdb_index', 'conflict_pdb_range', 'conflict_unp_range']))
+        focus_part_iden = await cls.deal_with_identical_entity_seq(focus_part)
+        focus_part_iden_dd = focus_part_iden.drop_duplicates(subset=['UniProt', 'fix_cluster_id']).copy()
+        tasks = tuple(map(cls.renew_sifts_mapping_from_graph_api, focus_part_iden_dd.UniProt, focus_part_iden_dd.pdb_id, focus_part_iden_dd.entity_id, focus_part_iden_dd.pdb_range, focus_part_iden_dd.unp_range, focus_part_iden_dd.range_diff))
+        focus_part_iden_dd[['new_unp_range', 'new_pdb_range']] = [await task for task in tasks]
+        focus_part_iden_dd = await cls.add_residue_conflict(focus_part_iden_dd.drop(columns=['conflict_pdb_index', 'raw_pdb_index', 'conflict_pdb_range', 'conflict_unp_range']))
+        focus_cols = ['UniProt', 'fix_cluster_id', 'new_unp_range', 'new_pdb_range',
+                      'conflict_pdb_index', 'raw_pdb_index', 'conflict_pdb_range', 'conflict_unp_range']
+        res = focus_part_iden.drop(columns=focus_cols[2:]).merge(focus_part_iden_dd[focus_cols], how='left')
+        assert res.isnull().sum().sum() == 0
+        res = res.drop(columns=['fix_cluster_id', 'cluster_id'])
+        assert res.shape == focus_part.shape, f"{res.shape}, {focus_part.shape}"
         res.index = focus_part.index
         dfrm.loc[focus_part.index] = res
         return dfrm
@@ -2297,7 +2348,7 @@ def check_range_tail(new_pdb_range, new_unp_range, pdb_range):
     @classmethod
     @unsync
     async def fix_range(cls, dfrm: Union[DataFrame, Tuple, Unfuture, Coroutine]):
-        if isinstance(dfrm, (Coroutine, Unfuture)):
+        if isawaitable(dfrm):
             dfrm = await dfrm
         if isinstance(dfrm, Tuple):
             dfrm = dfrm[0]
@@ -2325,9 +2376,17 @@ async def fix_range(cls, dfrm: Union[DataFrame, Tuple, Unfuture, Coroutine]):
 
     @staticmethod
     def sliding_alignment_score(range_diff, pdb_seq, pdb_range, unp_seq, unp_range, **kwargs):
+        '''
+        TODO: improve code
+        '''
+        def generate_seq_item(seq, gap_index, gap_num):
+            yield from seq[:gap_index]
+            for _ in range(gap_num):
+                yield '-'
+            yield from seq[gap_index:]
+
         def get_optimal_range(abs_diff, seg_to_add, seg_to_ori, lstart, lend, rstart, rend, on_left):
-            gap_seg = '-' * abs_diff
-            res = tuple(sum(blosum62.get((l, r), blosum62.get((r, l), 0)) for l, r in zip(seg_to_add[:i] + gap_seg + seg_to_add[i:], seg_to_ori)) for i in range(len(seg_to_add)+1))
+            res = tuple(sum(blosum62.get((l, r), 0) for l, r in zip(generate_seq_item(seg_to_add, i, abs_diff), seg_to_ori)) for i in range(len(seg_to_add)+1))
             max_val = max(res)
             index = res.index(max_val)
             assert index >= 0  # ???
@@ -2431,7 +2490,7 @@ def bs_score_aligned_part(new_pdb_range, conflict_pdb_range, conflict_pdb_index,
             pdb_aa = raw_pdb_index.get(i, None)
             if (i not in non_set) and (unp_aa is not None) and (unp_aa != pdb_aa):
                 # NOT Modified & Conflict Residues are fall into here
-                theta = miyata_similarity_matrix.get((unp_aa, pdb_aa), miyata_similarity_matrix.get((pdb_aa, unp_aa), -3.104))
+                theta = miyata_similarity_matrix.get((unp_aa, pdb_aa), -3.104)
             else:
                 # UNK | Modified Residue
                 theta = -3.104
@@ -2724,7 +2783,7 @@ def parallel_interact_df(sifts_df, i3d_df, common_cols=('revision_date', 'deposi
         rename_dict['pdb_id_1'] = 'pdb_id'
     sifts_df_ = sifts_df.add_suffix('_1').rename(columns=rename_dict)
     i3d_df = i3d_df.merge(sifts_df_)
-    sifts_df_ = sifts_df.drop(columns=sifts_df.columns & set(common_cols)).add_suffix('_2').rename(columns={'pdb_id_2': 'pdb_id'})
+    sifts_df_ = sifts_df.drop(columns=sifts_df.columns.intersection(common_cols)).add_suffix('_2').rename(columns={'pdb_id_2': 'pdb_id'})
    i3d_df = i3d_df.merge(sifts_df_)
     swap_index = i3d_df[
         (i3d_df.struct_asym_id_1 > i3d_df.struct_asym_id_2) |
diff --git a/pdb_profiling/utils.py b/pdb_profiling/utils.py
index 9186b8e..bcba34d 100644
--- a/pdb_profiling/utils.py
+++ b/pdb_profiling/utils.py
@@ -7,6 +7,7 @@
 import os
 import gzip
 import shutil
+from inspect import isawaitable
 from typing import Optional, Union, Dict, Tuple, Iterable, Iterator, List, Coroutine, NamedTuple, Callable, Generator
 from logging import Logger
 from pandas import read_csv, DataFrame, isna, Series, concat
@@ -578,7 +579,7 @@ async def get_seqs_from_parser(res, identifiers:Optional[Iterable[str]]=None):
 
 
 async def a_seq_parser(path: Union[Unfuture, Coroutine, Path, str]):
-    if isinstance(path, (Unfuture, Coroutine)):
+    if isawaitable(path):
         path = await path
     async with aiofiles_open(path, 'rt') as handle:
         header, content = None, ''
diff --git a/pdb_profiling/warnings.py b/pdb_profiling/warnings.py
index c3fcc1f..358f4c3 100644
--- a/pdb_profiling/warnings.py
+++ b/pdb_profiling/warnings.py
@@ -76,4 +76,8 @@ class FileExistsWarning(UserWarning):
 
 
 class InvalidFileContentWarning(UserWarning):
-    pass
\ No newline at end of file
+    pass
+
+
+class WithoutRCSBClusterMembershipWarning(UserWarning):
+    pass
diff --git a/setup.py b/setup.py
index 118c640..b2f9bc3 100644
--- a/setup.py
+++ b/setup.py
@@ -25,7 +25,7 @@
         'tenacity>=6.3.0',
         'orjson>=3.0.2',
         'pyexcel>=0.6.4',
-        'pandas>=1.0.3',
+        'pandas>=1.2.2',
         'numpy>=1.18.1',
         'textdistance>=4.1.5',
         'databases>=0.3.2',
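
Notes:

The data.py change above replaces plain dict literals with store_swap_key, which eagerly writes each (a, b) key and its reversed (b, a) twin into the target dict. That is what lets get_optimal_range drop the nested blosum62.get((r, l), 0) fallback and bs_score_aligned_part drop the nested miyata_similarity_matrix.get((pdb_aa, unp_aa), -3.104) fallback later in the patch. A minimal doctest-style sketch of the intended behaviour (restated from the patch, not part of the diff):

    >>> def store_swap_key(data_from, data_to):
    ...     for key, value in data_from.items():
    ...         data_to[key] = value
    ...         data_to[key[::-1]] = value
    ...     return data_to
    >>> m = store_swap_key({('W', 'F'): 1}, dict())
    >>> m[('F', 'W')]  # the swapped key is now a first-class entry
    1

Each matrix roughly doubles in size (symmetric self-pairs aside), but every lookup becomes a single dict hit instead of a chained .get() fallback.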
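In the same spirit, the new generate_seq_item helper in sliding_alignment_score yields the gap-padded candidate sequence lazily instead of building seg_to_add[:i] + '-' * abs_diff + seg_to_add[i:] as a fresh string for every candidate gap position. A standalone copy of the inner helper, restated here only for illustration (in the patch it is nested inside sliding_alignment_score):

    def generate_seq_item(seq, gap_index, gap_num):
        # characters before the gap, then gap_num dashes, then the rest
        yield from seq[:gap_index]
        for _ in range(gap_num):
            yield '-'
        yield from seq[gap_index:]

    assert ''.join(generate_seq_item('ACDE', 2, 3)) == 'AC---DE'

Paired with the swap-key matrices above, the per-position score reduces to sum(blosum62.get((l, r), 0) for l, r in zip(...)) with no intermediate string allocations.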