From a7841d4033b3d5be6d17d225133e40912cf000ea Mon Sep 17 00:00:00 2001
From: NatureGeorge <414731811@qq.com>
Date: Thu, 5 Nov 2020 16:23:17 +0800
Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=B6Fix=20Bugs=20&=20Prepare=20for=20do?=
 =?UTF-8?q?cs?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docs/figs/greedy_algorithm.drawio.svg       | 171 +++++
 docs/figs/sifts_mapped_out_1.drawio.svg     | 254 +++++++
 docs/figs/sifts_mapped_out_2.drawio.svg     | 288 ++++++++
 docs/figs/sifts_repeated_mapping.drawio.svg | 524 +++++++++++++
 docs/figs/sifts_reversed_mapping.drawio.svg | 770 ++++++++++++++++++++
 pdb_profiling/__init__.py                   |   5 +-
 pdb_profiling/processors/pdbe/api.py        | 395 ++--
 pdb_profiling/processors/pdbe/record.py     | 535 +++++++-------
 pdb_profiling/processors/proteins/api.py    |   7 +-
 pdb_profiling/processors/swissmodel/api.py  | 113 ++-
 pdb_profiling/utils.py                      |  15 +-
 pdb_profiling/warnings.py                   |   3 +
 12 files changed, 2459 insertions(+), 621 deletions(-)
 create mode 100644 docs/figs/greedy_algorithm.drawio.svg
 create mode 100644 docs/figs/sifts_mapped_out_1.drawio.svg
 create mode 100644 docs/figs/sifts_mapped_out_2.drawio.svg
 create mode 100644 docs/figs/sifts_repeated_mapping.drawio.svg
 create mode 100644 docs/figs/sifts_reversed_mapping.drawio.svg

diff --git a/docs/figs/greedy_algorithm.drawio.svg b/docs/figs/greedy_algorithm.drawio.svg
new file mode 100644
index 0000000..f912cc9
--- /dev/null
+++ b/docs/figs/greedy_algorithm.drawio.svg
@@ -0,0 +1,171 @@
+[drawio flowchart, SVG markup omitted: "Sorted Candidate Chain List" -> "pop new chain" -> "Compared with all elements in the selected list, whether or not it passes the user-defined criteria" -> True: "Add to Selected List", False: next candidate; criteria boxes: "Mo: Compare the coverage of the whole mapped range", "Ho/He: Compare the coverage pair of the mapped interface range"]
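The greedy_algorithm figure documents the chain-selection loop implemented by the select_mo / select_ho / select_he refactor later in this patch. As a reading aid, here is a minimal sketch of that loop; greedy_select, passes, low_overlap and overlap_coefficient are illustrative names, and the overlap-coefficient criterion merely stands in for the figure's "user-defined criteria" box (the shipped code ranks candidate rows and applies select_range / select_ho_range / select_he_range with OC_cutoff, wasserstein_distance_cutoff and DSC_cutoff).

from typing import Callable, Iterable, List, Tuple, TypeVar

T = TypeVar('T')

def greedy_select(sorted_candidates: Iterable[T],
                  passes: Callable[[T, List[T]], bool]) -> List[T]:
    # Walk the ranked candidates ("pop new chain") and keep a chain only
    # if it passes the user-defined criterion against everything already
    # kept ("Add to Selected List"); otherwise move on to the next one.
    selected: List[T] = []
    for candidate in sorted_candidates:
        if passes(candidate, selected):
            selected.append(candidate)
    return selected

def overlap_coefficient(a: Tuple[int, int], b: Tuple[int, int]) -> float:
    # |A & B| / min(|A|, |B|) for two inclusive index ranges
    inter = min(a[1], b[1]) - max(a[0], b[0]) + 1
    return max(inter, 0) / min(a[1] - a[0] + 1, b[1] - b[0] + 1)

def low_overlap(candidate: Tuple[int, int], selected: List[Tuple[int, int]],
                cutoff: float = 0.2) -> bool:
    # Hypothetical monomer ("Mo") criterion: keep a chain only if its
    # mapped range adds coverage beyond every chain kept so far, in the
    # spirit of the patch's select_range(..., cutoff=OC_cutoff).
    return all(overlap_coefficient(candidate, kept) < cutoff for kept in selected)

For example, greedy_select([(1, 400), (10, 390), (380, 700)], low_overlap) keeps (1, 400) and (380, 700) but drops the redundant (10, 390).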
diff --git a/docs/figs/sifts_mapped_out_1.drawio.svg b/docs/figs/sifts_mapped_out_1.drawio.svg
new file mode 100644
index 0000000..df27455
--- /dev/null
+++ b/docs/figs/sifts_mapped_out_1.drawio.svg
@@ -0,0 +1,254 @@
+[drawio diagram, SVG markup omitted: "PDB Chain Sequence Index" (1-695) aligned to "UniProt Isoform Sequence Index" (span 589-1283 of 1-1289); unaligned positions are marked "Mapped Out"]
diff --git a/docs/figs/sifts_mapped_out_2.drawio.svg b/docs/figs/sifts_mapped_out_2.drawio.svg
new file mode 100644
index 0000000..7852318
--- /dev/null
+++ b/docs/figs/sifts_mapped_out_2.drawio.svg
@@ -0,0 +1,288 @@
+[drawio diagram, SVG markup omitted: "PDB Chain Sequence Index" vs "UniProt Isoform Sequence Index" with internal unaligned stretches marked "Mapped Out"; labelled positions include 1, 68, 148, 211, 233, 313]
diff --git a/docs/figs/sifts_repeated_mapping.drawio.svg b/docs/figs/sifts_repeated_mapping.drawio.svg
new file mode 100644
index 0000000..bcb08b5
--- /dev/null
+++ b/docs/figs/sifts_repeated_mapping.drawio.svg
@@ -0,0 +1,524 @@
+[drawio diagram, SVG markup omitted: repeated mapping between "PDB Chain Sequence Index" and "UniProt Isoform Sequence Index" - overlapping PDB segments (labels 616, 617, 619, 620, 621, 624) map onto UniProt spans 616-726 and 1616-1726]
diff --git a/docs/figs/sifts_reversed_mapping.drawio.svg b/docs/figs/sifts_reversed_mapping.drawio.svg
new file mode 100644
index 0000000..c0592e0
--- /dev/null
+++ b/docs/figs/sifts_reversed_mapping.drawio.svg
@@ -0,0 +1,770 @@
+[drawio diagram, SVG markup omitted: reversed mapping - segment order inverted between "UniProt Isoform Sequence Index" and "PDB Chain Sequence Index"; labelled positions include 1, 2, 3, 23, 26, 27, 28, 29, 49, 68, 72, 84, 85, 90, 114, 124, 141, 154]
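The four sifts_* figures illustrate the SIFTS mapping anomalies this patch guards against: residues mapped out of the isoform sequence, one stretch mapped repeatedly, and reversed segment order. Below is a minimal sketch of the segment-wise index conversion in the spirit of SIFTS.convert_index (the argument order matches the patch's calls such as SIFTS.convert_index(pdb_range, unp_range, your_sites); the equal-length, monotonic-segment assumption and all names here are illustrative).

from typing import Iterable, List, Optional, Tuple

Range = List[Tuple[int, int]]  # paired segments, e.g. [(1, 120), (130, 200)]

def convert_index(target_range: Range, source_range: Range,
                  sites: Iterable[int]) -> List[Optional[int]]:
    # Locate each site inside a source segment and shift it by the same
    # offset into the paired target segment; None marks a site that falls
    # outside every segment, i.e. the "Mapped Out" case drawn above.
    converted: List[Optional[int]] = []
    for site in sites:
        for (t_start, _), (s_start, s_end) in zip(target_range, source_range):
            if s_start <= site <= s_end:
                converted.append(t_start + (site - s_start))
                break
        else:
            converted.append(None)
    return converted

# With the ranges from the first diagram (PDB 1-695 <-> UniProt 589-1283):
# convert_index([(1, 695)], [(589, 1283)], [589, 600, 1284]) -> [1, 12, None]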
diff --git a/pdb_profiling/__init__.py b/pdb_profiling/__init__.py
index 4a92f11..945b602 100644
--- a/pdb_profiling/__init__.py
+++ b/pdb_profiling/__init__.py
@@ -6,7 +6,7 @@
 # @Copyright (c) 2020 MinghuiGroup, Soochow University
 from re import compile as re_compile

-__version__ = '0.1.14'
+__version__ = '0.1.16'

 common_pat = r'^(?=.*[A-Za-z])(?=.*\d)[A-Za-z\d]'
@@ -39,6 +39,7 @@ def default_config(folder='./'):
     from pdb_profiling.processors.proteins.record import Identifier
     from pdb_profiling.processors import UniProtFASTA
     from pdb_profiling.processors.i3d.api import Interactome3D
+    from pdb_profiling.processors.swissmodel.api import SMR
     # Use Existing Handled PDBe API Results (e.g. tsv format results)
     ProcessPDBe.use_existing = True
     # Use Existing API Results (e.g.
json format results downloaded from web) @@ -50,9 +51,11 @@ def default_config(folder='./'): Identifier.set_web_semaphore(30).result() UniProtFASTA.set_web_semaphore(30).result() Interactome3D.set_web_semaphore(30).result() + SMR.set_web_semaphore(30).result() # Set Folder that store downloaded and handled files Base.set_folder(folder) PDB.set_folder(folder) Identifier.set_folder(folder) UniProtFASTA.set_folder(folder) Interactome3D.set_folder(folder) + SMR.set_folder(folder) diff --git a/pdb_profiling/processors/pdbe/api.py b/pdb_profiling/processors/pdbe/api.py index 4504a7f..5f912f9 100644 --- a/pdb_profiling/processors/pdbe/api.py +++ b/pdb_profiling/processors/pdbe/api.py @@ -5,10 +5,10 @@ # @Last Modified: 2020-02-11 04:22:22 pm # @Copyright (c) 2020 MinghuiGroup, Soochow University import os -import numpy as np +from numpy import array, nan, count_nonzero import pandas as pd from tablib import Dataset, InvalidDimensions, UnsupportedFormat -from typing import Union, Optional, Iterator, Iterable, Set, Dict, List, Any, Generator, Callable, Tuple +from typing import Union, Optional, Iterator, Iterable, Dict, List, Any, Generator, Callable, Tuple from json import JSONDecodeError import orjson as json from pathlib import Path @@ -22,13 +22,6 @@ from pdb_profiling.processors.transformer import Dict2Tabular from pdb_profiling.exceptions import WithoutExpectedKeyError -API_LYST: List = sorted(['summary', 'molecules', 'experiment', 'ligand_monomers', - 'modified_AA_or_NA', 'mutated_AA_or_NA', 'status', - 'polymer_coverage', 'secondary_structure', - 'residue_listing', 'binding_sites', 'files', 'observed_residues_ratio', - 'assembly', 'electron_density_statistics', - 'cofactor', 'drugbank', 'related_experiment_data']) - BASE_URL: str = 'https://www.ebi.ac.uk/pdbe/' FTP_URL: str = 'ftp://ftp.ebi.ac.uk/' @@ -53,7 +46,7 @@ def str_number_converter(x): return -1 -def dispatch_on_set(keys: Set): +def dispatch_on_set(*keys): ''' Decorator to add new dispatch functions ''' @@ -71,43 +64,6 @@ def traverseSuffixes(query: Any, *args): raise ValueError(f'Invalid query: {query}') -def convertJson2other( - data: Union[List, str, None], - append_data: Union[Iterable, Iterator], - converter: Optional[Dataset] = None, - export_format: str = 'tsv', - ignore_headers: Union[bool, int] = False, - log_func=print) -> Any: - ''' - Convert valid json-string/dict into specified format via `tablib.Dataset.export` - ''' - if converter is None: - converter = Dataset() - try: - if isinstance(data, str): - converter.json = data - elif isinstance(data, List): - converter.dict = data - elif data is None: - pass - else: - log_func(f'Invalid type for data`: {type(data)}') - return None - for data_to_append in append_data: - converter.append_col(*data_to_append) - if ignore_headers: - converter.headers = None - return converter.export(export_format) - except KeyError: - log_func('Not a valid json-string/dict to convert format via `tablib.Dataset.export`') - except JSONDecodeError: - log_func(f'Invalid json string') - except InvalidDimensions: - log_func('Invalid data or append_data') - except UnsupportedFormat: - log_func(f'Invalid export_format: {export_format}') - - class ProcessPDBe(Abclog): converters = { @@ -209,281 +165,12 @@ async def process(cls, path: Union[str, Path, Unfuture]): else: cls.logger.debug(f"Without Expected Data ({suffix}): {data}") return None - - -class ProcessSIFTS(ProcessPDBe): - @classmethod - def related_UNP_PDB(cls, filePath: Union[str, Path], related_unp: Optional[Iterable] = None, related_pdb: 
Optional[Iterable] = None): - ''' - Reference - - * http://www.ebi.ac.uk/pdbe/docs/sifts/quick.html - * A summary of the UniProt to PDB mappings showing the UniProt accession - followed by a semicolon-separated list of PDB four letter codes. - ''' - filePath = Path(filePath) - if filePath.is_dir(): - url = FTP_URL+FTP_DEFAULT_PATH - task = ('ftp', {'url': url}, str(filePath)) - res = UnsyncFetch.multi_tasks([task]).result() - filePath = decompression(res[0], remove=False, logger=cls.logger) - elif filePath.is_file() and filePath.exists(): - filePath = str(filePath) - else: - raise ValueError('Invalid value for filePath') - - dfrm = pd.read_csv(filePath, sep='\t', header=1) - pdb_list = list() - if related_unp is not None: - dfrm = dfrm[dfrm['SP_PRIMARY'].isin(related_unp)] - for i in dfrm.index: - pdb_list.extend(dfrm.loc[i, 'PDB'].split(';')) - if related_pdb is not None: - return set(pdb_list) & set(related_pdb), set(dfrm['SP_PRIMARY']) - else: - return set(pdb_list), set(dfrm['SP_PRIMARY']) - - @classmethod - def reformat(cls, path: Optional[str]=None, dfrm:Optional[pd.DataFrame]=None) -> pd.DataFrame: - if path is not None: - dfrm = pd.read_csv(path, sep='\t', converters=cls.converters) - group_info_col = ['pdb_id', 'chain_id', 'UniProt'] - range_info_col = ['pdb_start', 'pdb_end', 'unp_start', 'unp_end'] - reader = SeqRangeReader(group_info_col) - dfrm[['pdb_range', 'unp_range']] = pd.DataFrame(dfrm.apply( - lambda x: reader.check(tuple(x[i] for i in group_info_col), tuple( - x[i] for i in range_info_col)), - axis=1).values.tolist(), index=dfrm.index) - dfrm = dfrm.drop(columns=range_info_col).drop_duplicates( - subset=group_info_col, keep='last').reset_index(drop=True) - dfrm["Entry"] = dfrm["UniProt"].apply(lambda x: x.split('-')[0]) - return dfrm - - @classmethod - def dealWithInDel(cls, dfrm: pd.DataFrame, sort_by_unp:bool=True) -> pd.DataFrame: - def get_gap_list(li: List): - return [li[i+1][0] - li[i][1] - 1 for i in range(len(li)-1)] - - def get_range_diff(lyst_a: List, lyst_b: List): - array_a = np.array([right - left + 1 for left, right in lyst_a]) - array_b = np.array([right - left + 1 for left, right in lyst_b]) - return array_a - array_b - - def add_tage_to_range(df: pd.DataFrame, tage_name: str): - # ADD TAGE FOR SIFTS - df[tage_name] = 'Safe' - # No Insertion But Deletion[Pure Deletion] - df.loc[df[(df['group_info'] == 1) & ( - df['diff+'] > 0)].index, tage_name] = 'Deletion' - # Insertion & No Deletion - df.loc[df[ - (df['group_info'] == 1) & - (df['diff-'] > 0)].index, tage_name] = 'Insertion_Undivided' - df.loc[df[ - (df['group_info'] > 1) & - (df['diff0'] == df['group_info']) & - (df['unp_gaps0'] == (df['group_info'] - 1))].index, tage_name] = 'Insertion' - # Insertion & Deletion - df.loc[df[ - (df['group_info'] > 1) & - (df['diff0'] == df['group_info']) & - (df['unp_gaps0'] != (df['group_info'] - 1))].index, tage_name] = 'InDel_1' - df.loc[df[ - (df['group_info'] > 1) & - (df['diff0'] != df['group_info']) & - (df['unp_gaps0'] != (df['group_info'] - 1))].index, tage_name] = 'InDel_2' - df.loc[df[ - (df['group_info'] > 1) & - (df['diff0'] != df['group_info']) & - (df['unp_gaps0'] == (df['group_info'] - 1))].index, tage_name] = 'InDel_3' - - dfrm.pdb_range = dfrm.pdb_range.apply(json.loads) - dfrm.unp_range = dfrm.unp_range.apply(json.loads) - dfrm['group_info'] = dfrm.apply(lambda x: len( - x['pdb_range']), axis=1) - - focus_index = dfrm[dfrm.group_info.gt(1)].index - if sort_by_unp and (len(focus_index) > 0): - focus_df = dfrm.loc[focus_index].apply(lambda x: 
sort_2_range( - x['unp_range'], x['pdb_range']), axis=1, result_type='expand') - focus_df.index = focus_index - focus_df.columns = ['unp_range', 'pdb_range'] - dfrm.loc[focus_index, ['unp_range', 'pdb_range']] = focus_df - - dfrm['pdb_gaps'] = dfrm.pdb_range.apply(get_gap_list) - dfrm['unp_gaps'] = dfrm.unp_range.apply(get_gap_list) - dfrm['range_diff'] = dfrm.apply(lambda x: get_range_diff(x['unp_range'], x['pdb_range']), axis=1) - dfrm['diff0'] = dfrm.range_diff.apply(lambda x: np.count_nonzero(x == 0)) - dfrm['diff+'] = dfrm.range_diff.apply(lambda x: np.count_nonzero(x > 0)) - dfrm['diff-'] = dfrm.range_diff.apply(lambda x: np.count_nonzero(x < 0)) - dfrm['unp_gaps0'] = dfrm.unp_gaps.apply(lambda x: x.count(0)) - add_tage_to_range(dfrm, tage_name='sifts_range_tag') - dfrm['repeated'] = dfrm.apply( - lambda x: x['diff-'] > 0 and x['sifts_range_tag'] != 'Insertion_Undivided', axis=1) - dfrm['repeated'] = dfrm.apply( - lambda x: True if any(i < 0 for i in x['unp_gaps']) else x['repeated'], axis=1) - dfrm['reversed'] = dfrm.pdb_gaps.apply(lambda x: any(i < 0 for i in x)) - dfrm.pdb_range = dfrm.pdb_range.apply(lambda x: json.dumps(x).decode('utf-8')) - dfrm.unp_range = dfrm.unp_range.apply(lambda x: json.dumps(x).decode('utf-8')) - temp_cols = ['start', 'end', 'group_info', 'pdb_gaps', 'unp_gaps', 'range_diff', - 'diff0', 'diff+', 'diff-', 'unp_gaps0'] - return dfrm.drop(columns=temp_cols), dfrm[temp_cols] - - ''' - @staticmethod - def update_range(dfrm: pd.DataFrame, fasta_col: str, unp_fasta_files_folder: str, new_range_cols=('new_sifts_unp_range', 'new_sifts_pdb_range')) -> pd.DataFrame: - def getSeq(fasta_path: str): - unpSeq = None - try: - unpSeqOb = SeqIO.read(fasta_path, "fasta") - unpSeq = unpSeqOb.seq - except ValueError: - unpSeqOb = SeqIO.parse(fasta_path, "fasta") - for record in unpSeqOb: - if unp_id in record.id.split('|'): - unpSeq = record.seq - return unpSeq - - focus = ('Deletion', 'Insertion & Deletion') - focus_index = dfrm[dfrm['sifts_range_tage'].isin(focus)].index - updated_pdb_range, updated_unp_range = list(), list() - seqAligner = SeqPairwiseAlign() - for index in focus_index: - pdbSeq = dfrm.loc[index, fasta_col] - unp_entry = dfrm.loc[index, "Entry"] - unp_id = dfrm.loc[index, "UniProt"] - try: - fasta_path = os.path.join( - unp_fasta_files_folder, f'{unp_id}.fasta') - unpSeq = getSeq(fasta_path) - except FileNotFoundError: - try: - fasta_path = os.path.join( - unp_fasta_files_folder, f'{unp_entry}.fasta') - unpSeq = getSeq(fasta_path) - except FileNotFoundError: - unpSeq = None - res = seqAligner.makeAlignment(unpSeq, pdbSeq) - updated_unp_range.append(res[0]) - updated_pdb_range.append(res[1]) - - updated_range_df = pd.DataFrame( - {new_range_cols[0]: updated_unp_range, new_range_cols[1]: updated_pdb_range}, index=focus_index) - dfrm = pd.merge(dfrm, updated_range_df, left_index=True, - right_index=True, how='left') - dfrm[new_range_cols[0]] = dfrm.apply(lambda x: x['sifts_unp_range'] if pd.isna( - x[new_range_cols[0]]) else x[new_range_cols[0]], axis=1) - dfrm[new_range_cols[1]] = dfrm.apply(lambda x: x['sifts_pdb_range'] if pd.isna( - x[new_range_cols[1]]) else x[new_range_cols[1]], axis=1) - return dfrm - - @classmethod - def main(cls, filePath: Union[str, Path], folder: str, related_unp: Optional[Iterable] = None, related_pdb: Optional[Iterable] = None): - pdbs, _ = cls.related_UNP_PDB(filePath, related_unp, related_pdb) - res = cls.retrieve(pdbs, 'mappings/all_isoforms/', 'get', folder) - # return pd.concat((cls.dealWithInDe(cls.reformat(route)) for 
route in res if route is not None), sort=False, ignore_index=True) - return res - ''' - - -class ProcessEntryData(ProcessPDBe): - @staticmethod - def related_PDB(pdb_col: str, **kwargs) -> pd.Series: - dfrm = related_dataframe(**kwargs) - return dfrm[pdb_col].drop_duplicates() - - @classmethod - def main(cls, **kwargs): - pdbs = cls.related_PDB(**kwargs) - if len(pdbs) > 0: - res = cls.retrieve(pdbs, **kwargs) - try: - return pd.concat((pd.read_csv(route, sep=kwargs.get('sep', '\t'), converters=cls.converters) for route in res if route is not None), sort=False, ignore_index=True) - except ValueError: - cls.logger.error('Non-value to concat') - else: - return None - - @classmethod - def unit(cls, pdbs, **kwargs): - if len(pdbs) > 0: - res = cls.retrieve(pdbs, **kwargs) - try: - return pd.concat((pd.read_csv(route, sep=kwargs.get('sep', '\t'), converters=cls.converters) for route in res if route is not None), sort=False, ignore_index=True) - except ValueError: - cls.logger.warning('Non-value to concat') - else: - return None - - @staticmethod - def yieldObserved(dfrm: pd.DataFrame) -> Generator: - groups = dfrm.groupby(['pdb_id', 'entity_id', 'chain_id']) - for i, j in groups: - mod = j.dropna(subset=['chem_comp_id']) - yield i, len(j[j.observed_ratio.gt(0)]), len(mod[mod.observed_ratio.gt(0)]) - - @staticmethod - def traverse(data: Dict, cols: Tuple, cutoff=50): - ''' - temp - ''' - observed_res_count, observed_modified_res_count = cols - for pdb in data: - count = 0 - cleaned = 0 - for entity in data[pdb].values(): - for chain in entity.values(): - if chain[observed_res_count] - chain[observed_modified_res_count] < cutoff: - cleaned += 1 - count += 1 - yield pdb, count, cleaned - - @classmethod - def pipeline(cls, pdbs: Iterable, folder: str, chunksize: int = 1000): - for i in range(0, len(pdbs), chunksize): - related_pdbs = pdbs[i:i+chunksize] - molecules_dfrm = ProcessEntryData.unit( - related_pdbs, - suffix='api/pdb/entry/molecules/', - method='post', - folder=folder, - task_id=i) - res_listing_dfrm = ProcessEntryData.unit( - related_pdbs, - suffix='api/pdb/entry/residue_listing/', - method='get', - folder=folder, - task_id=i) - modified_AA_dfrm = ProcessEntryData.unit( - related_pdbs, - suffix='api/pdb/entry/modified_AA_or_NA/', - method='post', - folder=folder, - task_id=i) - if modified_AA_dfrm is not None: - res_listing_dfrm.drop(columns=['author_insertion_code'], inplace=True) - modified_AA_dfrm.drop(columns=['author_insertion_code'], inplace=True) - res_listing_mod_dfrm = pd.merge(res_listing_dfrm, modified_AA_dfrm, how='left') - else: - res_listing_mod_dfrm = res_listing_dfrm - res_listing_mod_dfrm['chem_comp_id'] = np.nan - pro_dfrm = molecules_dfrm[molecules_dfrm.molecule_type.isin(['polypeptide(L)', 'polypeptide(D)'])][['pdb_id', 'entity_id']].reset_index(drop=True) - pro_res_listing_mod_dfrm = pd.merge(res_listing_mod_dfrm, pro_dfrm) - data = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) - for (pdb_id, entity_id, chain_id), observed_res_count, observed_modified_res_count in cls.yieldObserved(pro_res_listing_mod_dfrm): - data[pdb_id][entity_id][chain_id]['ob_res'] = observed_res_count - data[pdb_id][entity_id][chain_id]['ob_moded_res'] = observed_modified_res_count - with Path(folder, f'clean_pdb_statistic+{i}.tsv').open(mode='w+') as outFile: - for tup in cls.traverse(data, ('ob_res', 'ob_moded_res')): - outFile.write('%s\t%s\t%s\n' % tup) - with Path(folder, f'clean_pdb_statistic+{i}.json').open(mode='w+') as outFile: - json.dump(data, outFile) class 
PDBeDecoder(object): @staticmethod - @dispatch_on_set({'api/pdb/entry/status/', 'api/pdb/entry/summary/', 'api/pdb/entry/modified_AA_or_NA/', + @dispatch_on_set('api/pdb/entry/status/', 'api/pdb/entry/summary/', 'api/pdb/entry/modified_AA_or_NA/', 'api/pdb/entry/mutated_AA_or_NA/', 'api/pdb/entry/cofactor/', 'api/pdb/entry/molecules/', 'api/pdb/entry/ligand_monomers/', 'api/pdb/entry/experiment/', 'api/pdb/entry/carbohydrate_polymer/', 'api/pdb/entry/electron_density_statistics/', 'api/pdb/entry/related_experiment_data/', @@ -493,7 +180,7 @@ class PDBeDecoder(object): 'graph-api/compound/bonds/', 'graph-api/compound/summary/', 'graph-api/compound/cofactors/', 'graph-api/pdb/funpdbe/', 'graph-api/pdb/bound_excluding_branched/', - 'graph-api/pdb/bound_molecules/', 'graph-api/pdb/ligand_monomers/'}) + 'graph-api/pdb/bound_molecules/', 'graph-api/pdb/ligand_monomers/') def yieldCommon(data: Dict) -> Generator: for pdb in data: values = data[pdb] @@ -504,7 +191,7 @@ def yieldCommon(data: Dict) -> Generator: yield values, (default_id_tag(pdb, '_code_'),), (pdb,) @staticmethod - @dispatch_on_set({'api/pdb/entry/polymer_coverage/'}) + @dispatch_on_set('api/pdb/entry/polymer_coverage/') def yieldPolymerCoverage(data: Dict) -> Generator: for pdb in data: molecules = data[pdb]['molecules'] @@ -518,14 +205,14 @@ def yieldPolymerCoverage(data: Dict) -> Generator: yield observed, ('chain_id', 'struct_asym_id', 'entity_id', 'pdb_id'), (chain['chain_id'], chain['struct_asym_id'], entity['entity_id'], pdb) @staticmethod - @dispatch_on_set({'api/pdb/entry/observed_residues_ratio/'}) + @dispatch_on_set('api/pdb/entry/observed_residues_ratio/') def yieldObservedResiduesRatio(data: Dict) -> Generator: for pdb in data: for entity_id, entity in data[pdb].items(): yield entity, ('entity_id', 'pdb_id'), (entity_id, pdb) @staticmethod - @dispatch_on_set({'api/pdb/entry/residue_listing/'}) + @dispatch_on_set('api/pdb/entry/residue_listing/') def yieldResidues(data: Dict) -> Generator: for pdb in data: molecules = data[pdb]['molecules'] @@ -541,7 +228,7 @@ def yieldResidues(data: Dict) -> Generator: yield residues, ('chain_id', 'struct_asym_id', 'entity_id', 'pdb_id'), (chain['chain_id'], chain['struct_asym_id'], entity['entity_id'], pdb) @staticmethod - @dispatch_on_set({'api/pdb/entry/secondary_structure/', 'graph-api/pdb/secondary_structure/'}) + @dispatch_on_set('api/pdb/entry/secondary_structure/', 'graph-api/pdb/secondary_structure/') def yieldSecondaryStructure(data: Dict) -> Generator: for pdb in data: molecules = data[pdb]['molecules'] @@ -560,7 +247,7 @@ def yieldSecondaryStructure(data: Dict) -> Generator: yield fragment, ('secondary_structure', 'chain_id', 'struct_asym_id', 'entity_id', 'pdb_id'), (name, chain['chain_id'], chain['struct_asym_id'], entity['entity_id'], pdb) @staticmethod - @dispatch_on_set({'api/pdb/entry/binding_sites/'}) + @dispatch_on_set('api/pdb/entry/binding_sites/') def yieldBindingSites(data: Dict) -> Generator: for pdb in data: for site in data[pdb]: @@ -572,7 +259,7 @@ def yieldBindingSites(data: Dict) -> Generator: yield residues, ('residues_type', 'details', 'evidence_code', 'site_id', 'pdb_id'), (tage, site['details'], site['evidence_code'], site['site_id'], pdb) @staticmethod - @dispatch_on_set({'api/pdb/entry/assembly/'}) + @dispatch_on_set('api/pdb/entry/assembly/') def yieldAssembly(data: Dict) -> Generator: for pdb in data: for biounit in data[pdb]: @@ -586,7 +273,7 @@ def yieldAssembly(data: Dict) -> Generator: yield entities, tuple(keys)+('pdb_id',), tuple(biounit[key] 
for key in keys)+(pdb, ) @staticmethod - @dispatch_on_set({'api/pdb/entry/files/'}) + @dispatch_on_set('api/pdb/entry/files/') def yieldAssociatedFiles(data: Dict) -> Generator: for pdb in data: for key in data[pdb]: @@ -598,7 +285,7 @@ def yieldAssociatedFiles(data: Dict) -> Generator: continue @staticmethod - @dispatch_on_set({'api/mappings/all_isoforms/', 'api/mappings/uniprot/', + @dispatch_on_set('api/mappings/all_isoforms/', 'api/mappings/uniprot/', 'api/mappings/uniprot_segments/', 'api/mappings/isoforms/', 'api/mappings/uniref90/', 'api/mappings/homologene_uniref90/', 'api/mappings/interpro/', 'api/mappings/pfam/', @@ -611,9 +298,10 @@ def yieldAssociatedFiles(data: Dict) -> Generator: 'graph-api/mappings/uniprot/', 'graph-api/mappings/uniprot_segments/', 'graph-api/mappings/all_isoforms/', 'graph-api/mappings/', 'graph-api/mappings/isoforms/', 'graph-api/mappings/ensembl/', - 'graph-api/mappings/homologene/', 'graph-api/mappings/sequence_domains/' + 'graph-api/mappings/homologene/', 'graph-api/mappings/sequence_domains/', + 'api/mappings/' # 'graph-api/uniprot/' - }) + ) def yieldSIFTSAnnotation(data: Dict) -> Generator: valid_annotation_set = {'UniProt', 'Ensembl', 'Pfam', 'CATH', 'CATH-B', 'SCOP', 'InterPro', 'GO', 'EC', 'Homologene', 'HMMER'} for top_root in data: @@ -650,7 +338,7 @@ def yieldSIFTSAnnotation(data: Dict) -> Generator: raise ValueError(f'Unexpected data structure for inputted data: {data}') @staticmethod - @dispatch_on_set({'api/pisa/interfacelist/'}) + @dispatch_on_set('api/pisa/interfacelist/') def yieldPISAInterfaceList(data: Dict): for pdb in data: try: @@ -665,7 +353,7 @@ def yieldPISAInterfaceList(data: Dict): yield records, cols, tuple(data[pdb][col] for col in cols) @staticmethod - @dispatch_on_set({'api/pisa/interfacedetail/'}) + @dispatch_on_set('api/pisa/interfacedetail/') def yieldPISAInterfaceDetail(data: Dict): usecols = ( 'pdb_code', 'assemble_code', 'interface_number', @@ -690,7 +378,7 @@ def yieldPISAInterfaceDetail(data: Dict): yield data[pdb]['interface_detail.residues']['residue2']['residue']['residue_array'], usecols, tuple(data[pdb][col] for col in usecols) @staticmethod - @dispatch_on_set({'graph-api/residue_mapping/'}) + @dispatch_on_set('graph-api/residue_mapping/') def graph_api_residue_mapping(data: Dict): ''' * @@ -714,6 +402,48 @@ def graph_api_residue_mapping(data: Dict): residue['author_residue_number'], residue['author_insertion_code'], residue['observed'], feature_tag))), **feature} for feature_tag, feature in residue['features']['UniProt'].items()), None + @staticmethod + @dispatch_on_set('graph-api/pdb/sequence_conservation/') + def sequence_conservation(data: Dict): + for pdb in data: + yield [{ + 'pdb_id': pdb, + 'entity_id': data[pdb]['entity_id'], + 'length': data[pdb]['length'], + 'residue_number': val['start'], + 'conservation_score': val['conservation_score'], + 'letter_array': json.dumps(tuple(i['letter'] for i in val['amino'])).decode('utf-8'), + 'proba_array': json.dumps(tuple(i['proba'] for i in val['amino'])).decode('utf-8')} + for val in data[pdb]['data']], None + # letter_array, proba_array = zip(*((i['letter'], i['proba']) for i in val['amino'])) + + @staticmethod + @dispatch_on_set('graph-api/pdb/funpdbe_annotation/depth/', + 'graph-api/pdb/funpdbe_annotation/cath-funsites/', + 'graph-api/pdb/funpdbe_annotation/3Dcomplex/', + 'graph-api/pdb/funpdbe_annotation/akid/', + 'graph-api/pdb/funpdbe_annotation/3dligandsite/', + 'graph-api/pdb/funpdbe_annotation/camkinet/', + 'graph-api/pdb/funpdbe_annotation/canSAR/', + 
'graph-api/pdb/funpdbe_annotation/ChannelsDB/', + 'graph-api/pdb/funpdbe_annotation/dynamine/', + 'graph-api/pdb/funpdbe_annotation/FoldX/', + 'graph-api/pdb/funpdbe_annotation/MetalPDB/', + 'graph-api/pdb/funpdbe_annotation/M-CSA/', + 'graph-api/pdb/funpdbe_annotation/p2rank/', + 'graph-api/pdb/funpdbe_annotation/Missense3D/', + 'graph-api/pdb/funpdbe_annotation/POPScomp_PDBML/', + 'graph-api/pdb/funpdbe_annotation/ProKinO/', + 'graph-api/pdb/funpdbe_annotation/14-3-3-pred/', + 'graph-api/pdb/funpdbe_annotation/' + ) + def funpdbe_resources(data: Dict): + for pdb in data: + info = data[pdb] + for val in info: + for annotation in val['annotations']: + yield annotation['site_residues'], ('pdb_id', 'origin', 'evidence_codes', 'site_id', 'label'), (pdb, val['origin'], val['evidence_codes'], annotation['site_id'], annotation['label']) + class PDBeModelServer(Abclog): ''' @@ -831,6 +561,3 @@ def task_unit(cls, pdb_with_version: Tuple, suffix: str, file_suffix: str, folde args = dict(url=f'{cls.root}{suffix}{pdb[1:3]}/pdb_0000{pdb}/{file_name}') return 'get', args, folder/file_name - -# TODO: Chain UniProt ID Mapping -> ProcessSIFTS -> ProcessPDBe -# TODO: Deal with oligomeric PDB diff --git a/pdb_profiling/processors/pdbe/record.py b/pdb_profiling/processors/pdbe/record.py index e540141..f961789 100644 --- a/pdb_profiling/processors/pdbe/record.py +++ b/pdb_profiling/processors/pdbe/record.py @@ -14,7 +14,7 @@ from smart_open import open as smart_open from re import compile as re_compile import orjson as json -from collections import defaultdict, namedtuple +from collections import defaultdict, namedtuple, OrderedDict from itertools import product, combinations_with_replacement, combinations from pdb_profiling import default_id_tag from pdb_profiling.exceptions import WithoutExpectedKeyError @@ -36,10 +36,11 @@ from pdb_profiling.processors.pdbe.api import ProcessPDBe, PDBeModelServer, PDBArchive, FUNCS as API_SET from pdb_profiling.processors.uniprot.api import UniProtFASTA from pdb_profiling.processors.pdbe import PDBeDB +from pdb_profiling.processors.swissmodel.api import SMR from pdb_profiling.data import blosum62 from pdb_profiling import cif_gz_stream from pdb_profiling.processors.i3d.api import Interactome3D -from pdb_profiling.warnings import PISAErrorWarning, MultiWrittenWarning +from pdb_profiling.warnings import PISAErrorWarning, MultiWrittenWarning, WithoutCifKeyWarning from textdistance import sorensen from warnings import warn from cachetools import LRUCache @@ -174,7 +175,7 @@ class PDB(Base): tasks = LRUCache(maxsize=1024) protein_sequence_pat = re_compile(r'([A-Z]{1}|\([A-Z0-9]+\))') - nucleotide_sequence_pat = re_compile(r'([AUCG]{1}|\(DA\)|\(DT\)|\(DC\)|\(DG\)|\(UNK\))') + nucleotide_sequence_pat = re_compile(r'([AUCGI]{1}|\(DA\)|\(DT\)|\(DC\)|\(DG\)|\(DI\)|\(UNK\))') StatsProteinEntitySeq = namedtuple( 'StatsProteinEntitySeq', 'pdb_id molecule_type entity_id ca_p_only SEQRES_COUNT STD_INDEX STD_COUNT NON_INDEX NON_COUNT UNK_INDEX UNK_COUNT ARTIFACT_INDEX') StatsNucleotideEntitySeq = namedtuple( @@ -456,7 +457,7 @@ async def assembly2eec(cls, path: Unfuture): return eec_df @unsync - async def set_assembly(self, focus_assembly_ids:Optional[Iterable[int]]=None, discard_multimer_chains_cutoff:int=16): + async def set_assembly(self, focus_assembly_ids:Optional[Iterable[int]]=None, discard_multimer_chains_cutoff:int=16, discard_multimer_chains_cutoff_for_au:Optional[int]=None): ''' NOTE even for NMR structures (e.g 1oo9), there exists assembly 1 for that entry NOTE Discard 
`details` is NaN -> not author_defined_assembly OR software_defined_assembly @@ -472,8 +473,11 @@ def to_assembly_id(pdb_id, assemblys): mol_df = mol_df[~mol_df.molecule_type.isin(('water', 'bound', 'carbohydrate_polymer'))] ass_eec_df = ass_eec_df[ass_eec_df.details.notnull() & ass_eec_df.entity_id.isin(mol_df.entity_id)] check_chain_num = ass_eec_df.groupby('assembly_id').in_chains.apply(lambda x: sum(i.count(',')+1 for i in x)) - assemblys = set(ass_eec_df.assembly_id) & set(check_chain_num[check_chain_num 0: + return res_map_df_full.merge(cf, how='left') + else: + return res_map_df_full + + @staticmethod + def convert_conflict_pdb_index(conflict_pdb_index): + if isinstance(conflict_pdb_index, str): + conflict_pdb_index = json.loads(conflict_pdb_index) + conflict_pdb_index = OrderedDict(conflict_pdb_index) + return DataFrame({'residue_number': (int(i) for i in conflict_pdb_index.keys()), 'conflict_code': (i for i in conflict_pdb_index.values())}) + + @unsync + async def get_map_res_df(self, UniProt, unp_range, pdb_range, your_sites, conflict_pdb_index=None, unp2pdb:bool=True,**kwargs): + assert kwargs.keys() <= frozenset({'chain_id', 'entity_id', 'struct_asym_id', 'pdb_id'}) + if unp2pdb: + info = { + 'unp_residue_number': your_sites, + 'residue_number': (SIFTS.convert_index(pdb_range, unp_range, your_sites)), + 'UniProt': UniProt} + else: + info = { + 'residue_number': your_sites, + 'unp_residue_number': SIFTS.convert_index(unp_range, pdb_range, your_sites), + 'UniProt': UniProt} + your_sites_df = DataFrame({**info, **kwargs}) + your_sites_df = your_sites_df[(~your_sites_df.residue_number.isnull()) | (~your_sites_df.unp_residue_number.isnull())] + if len(your_sites_df) == 0: + return + ret = (await self.fetch_from_web_api('api/pdb/entry/residue_listing/', Base.to_dataframe)).merge(your_sites_df) + if conflict_pdb_index is None: + return ret + else: + cf = self.convert_conflict_pdb_index(conflict_pdb_index) + if len(cf) > 0: + return ret.merge(cf, how='left') + else: + return ret - async def pipe_interface_res_dict(self, chain_pairs=None, au2bu:bool=False, focus_assembly_ids=None, func='set_interface', discard_multimer_chains_cutoff=16, **kwargs): - await self.set_assembly(focus_assembly_ids=focus_assembly_ids, discard_multimer_chains_cutoff=discard_multimer_chains_cutoff) + async def pipe_interface_res_dict(self, chain_pairs=None, au2bu:bool=False, focus_assembly_ids=None, func='set_interface', discard_multimer_chains_cutoff=16, discard_multimer_chains_cutoff_for_au=None, **kwargs): + await self.set_assembly(focus_assembly_ids=focus_assembly_ids, discard_multimer_chains_cutoff=discard_multimer_chains_cutoff, discard_multimer_chains_cutoff_for_au=discard_multimer_chains_cutoff_for_au) for assembly in self.assembly.values(): await getattr(assembly, func)(**kwargs) if len(assembly.interface) == 0: @@ -1180,7 +1233,7 @@ async def set_interface_res(self, keep_interface_res_df:bool=False): return elif self.info['chains'] != struct_sele_set: # NOTE: Exception example: 2beq assembly_id 1 interface_id 32 - warn(f"interfacedetail({struct_sele_set}) inconsistent with interfacelist({set(self.info['chains'])}) ! May miss some data.", PISAErrorWarning) + warn(f"{repr(self)}: interfacedetail({struct_sele_set}) inconsistent with interfacelist({set(self.info['chains'])}) ! 
May miss some data.", PISAErrorWarning) eec_as_df = await self.pdbAssemble_ob.get_assemble_eec_as_df() res_df = await self.pdbAssemble_ob.pdb_ob.fetch_from_web_api('api/pdb/entry/residue_listing/', Base.to_dataframe) interfacedetail_df = interfacedetail_df.merge(eec_as_df, how="left") @@ -1281,6 +1334,8 @@ class SIFTS(PDB): complete_chains_run_as_completed = False + weight = array([1, -1, -1, -1.79072623, -2.95685934, -4.6231746]) + def set_id(self, identifier: str): tag = default_id_tag(identifier, None) if tag == 'pdb_id': @@ -1299,6 +1354,14 @@ def get_id(self): def __repr__(self): return f"<{self.__class__.__name__} {self.level} {self.get_id()}>" + @classmethod + def fetch_unp_fasta(cls, identifier): + task = cls.tasks.get((identifier, 'UniProtFASTA.single_retrieve(identifier).then(a_seq_reader)'), None) + if task is None: + task = UniProtFASTA.single_retrieve(identifier).then(a_seq_reader) + cls.register_task((identifier, 'UniProtFASTA.single_retrieve(identifier).then(a_seq_reader)'), task) + return task + @classmethod @unsync async def complete_chains(cls, dfrm: Union[DataFrame, Unfuture, Coroutine]): @@ -1402,7 +1465,7 @@ def add_tage_to_range(df: DataFrame, tage_name: str): dfrm['unp_gaps0'] = dfrm.unp_gaps.apply(lambda x: x.count(0)) add_tage_to_range(dfrm, tage_name='sifts_range_tag') dfrm['repeated'] = dfrm.apply( - lambda x: x['diff-'] > 0 and x['sifts_range_tag'] != 'Insertion (Specail Case)', axis=1) + lambda x: x['diff-'] > 0 and x['sifts_range_tag'] != 'Insertion_Undivided', axis=1) dfrm['repeated'] = dfrm.apply( lambda x: True if any(i < 0 for i in x['unp_gaps']) else x['repeated'], axis=1) dfrm['reversed'] = dfrm.pdb_gaps.apply(lambda x: any(i < 0 for i in x)) @@ -1478,12 +1541,21 @@ def unit(lrange, rrange, site): rrange = json.loads(rrange) if isinstance(rrange, str) else rrange return tuple(unit(lrange, rrange, site) for site in sites) - @staticmethod + @classmethod @unsync - async def get_residue_conflict(pdb_id, entity_id, pdb_range, Entry, UniProt, unp_range, on_pdb: bool = True): + async def get_residue_conflict(cls, pdb_id, entity_id, pdb_range, Entry, UniProt, unp_range): pdb_seq = await PDB(pdb_id).get_sequence(entity_id=entity_id) - _, unp_seq = await UniProtFASTA.single_retrieve(UniProt).then(a_seq_reader) - return json.dumps(to_interval(get_diff_index(pdb_seq, pdb_range, unp_seq, unp_range, on_pdb))).decode('utf-8'), len(unp_seq) + _, unp_seq = await cls.fetch_unp_fasta(UniProt) + indexes = tuple(get_diff_index(pdb_seq, pdb_range, unp_seq, unp_range)) + if len(indexes) > 0: + pdb_diff_index, unp_diff_index = zip(*indexes) + return ( + json.dumps(dict(zip((str(i) for i in pdb_diff_index), (unp_seq[i-1] for i in unp_diff_index)))).decode('utf-8'), + json.dumps(to_interval(pdb_diff_index)).decode('utf-8'), + json.dumps(to_interval(unp_diff_index)).decode('utf-8'), + len(unp_seq)) + else: + return ('{}', '[]', '[]', len(unp_seq)) @classmethod @unsync @@ -1508,7 +1580,7 @@ async def add_residue_conflict(cls, dfrm: Union[DataFrame, Tuple, Unfuture, Coro f_dfrm = dfrm[focus+[pdb_range_col, 'Entry', unp_range_col]].drop_duplicates(subset=focus).reset_index(drop=True) tasks = f_dfrm.apply(lambda x: cls.get_residue_conflict( x['pdb_id'], x['entity_id'], x[pdb_range_col], x['Entry'], x['UniProt'], x[unp_range_col]), axis=1) - f_dfrm['conflict_pdb_range'], f_dfrm['unp_len'] = zip(*[await i for i in tasks]) + f_dfrm['conflict_pdb_index'], f_dfrm['conflict_pdb_range'], f_dfrm['conflict_unp_range'], f_dfrm['unp_len'] = zip(*[await i for i in tasks]) dfrm_ed = merge(dfrm, 
f_dfrm) assert dfrm_ed.shape[0] == dfrm.shape[0], f"\n{dfrm_ed.shape}\n{dfrm.shape}" return dfrm_ed @@ -1638,14 +1710,14 @@ def get_optimal_range(abs_diff, seg_to_add, seg_to_ori, lstart, lend, rstart, re @unsync async def a_sliding_alignment(cls, range_diff, pdb_range, unp_range, pdb_id, entity_id, UniProt): pdb_seq = await PDB(pdb_id).get_sequence(entity_id=entity_id) - _, unp_seq = await UniProtFASTA.single_retrieve(UniProt).then(a_seq_reader) + _, unp_seq = await cls.fetch_unp_fasta(UniProt) pdb_range = json.loads(pdb_range) if isinstance(pdb_range, str) else pdb_range unp_range = json.loads(unp_range) if isinstance(unp_range, str) else unp_range return cls.sliding_alignment_score(range_diff, pdb_seq, pdb_range, unp_seq, unp_range, pdb_id=pdb_id,entity_id=entity_id,UniProt=UniProt) @unsync - async def pipe_score(self, sifts_df=None, complete_chains:bool=False, weight=array([1, -1, -1, -1.79072623, -2.95685934, -4.6231746])): + async def pipe_score(self, sifts_df=None, complete_chains:bool=False): if sifts_df is None: init_task = await self.fetch_from_web_api('api/mappings/all_isoforms/', Base.to_dataframe) if init_task is None: return @@ -1660,12 +1732,14 @@ async def pipe_score(self, sifts_df=None, complete_chains:bool=False, weight=arr 'experimental_method', 'multi_method'] if self.level == 'PDB Entry': - return await self.pipe_score_for_pdb_entry(sifts_df, exp_cols, weight) + return await self.pipe_score_for_pdb_entry(sifts_df, exp_cols) else: - return await self.pipe_score_for_unp_isoform(sifts_df, exp_cols, weight) + return await self.pipe_score_for_unp_isoform(sifts_df, exp_cols) @unsync - async def pipe_score_base(self, sifts_df, pec_df, weight): + async def pipe_score_base(self, sifts_df, pec_df): + weight = self.weight + def raw_score(vec): return dot(vec, weight) weight_ig3 = array([i for i in weight[:2]]+[i for i in weight[3:]]) @@ -1692,7 +1766,7 @@ def raw_score_ig3(vec): return dot(vec, weight_ig3) return full_df, s_df @unsync - async def pipe_score_for_pdb_entry(self, sifts_df, exp_cols, weight): + async def pipe_score_for_pdb_entry(self, sifts_df, exp_cols): exp_df = DataFrame(await PDBs.fetch_exp_pipe(self), columns=exp_cols) summary_dict = await self.fetch_from_web_api('api/pdb/entry/summary/', a_load_json, json=True) d = summary_dict[self.pdb_id][0] @@ -1701,12 +1775,12 @@ async def pipe_score_for_pdb_entry(self, sifts_df, exp_cols, weight): pec_df, _ = await self.stats_chain() - full_df, s_df = await self.pipe_score_base(sifts_df, pec_df, weight) + full_df, s_df = await self.pipe_score_base(sifts_df, pec_df) return full_df, s_df, exp_df @unsync - async def pipe_score_for_unp_isoform(self, sifts_df, exp_cols, weight): + async def pipe_score_for_unp_isoform(self, sifts_df, exp_cols): cur_pdbs = sifts_df.pdb_id.unique() pdbs = PDBs(cur_pdbs) @@ -1726,7 +1800,7 @@ async def pipe_score_for_unp_isoform(self, sifts_df, exp_cols, weight): # pec_df, _ = zip(*tasks) # pec_df = concat(pec_df, sort=False, ignore_index=True) - full_df, s_df = await self.pipe_score_base(sifts_df, pec_df, weight) + full_df, s_df = await self.pipe_score_base(sifts_df, pec_df) return full_df, s_df, exp_df @@ -1751,25 +1825,30 @@ async def pipe_select_base(self, exclude_pdbs=frozenset(), **kwargs): sele_df['select_rank'] = -1 return sele_df - @unsync - async def pipe_select_mo(self, exclude_pdbs=frozenset(), OC_cutoff=0.2, **kwargs): - sele_df = await self.pipe_select_base(exclude_pdbs, **kwargs) - if sele_df is None: - return - allow_sele_df = sele_df[sele_df.RAW_BS > 0] + @staticmethod + def 
select_mo(sele_df, OC_cutoff=0.2, sort_cols=['RAW_BS', '1/resolution', 'revision_date', 'id_score'], ascending=False, allow_mask=None): + allow_sele_df = sele_df[sele_df.RAW_BS > 0] if allow_mask is None else sele_df[allow_mask] def sele_func(dfrm): - rank_index = dfrm.sort_values(by=['RAW_BS', '1/resolution', 'revision_date', 'id_score'], ascending=False).index + rank_index = dfrm.sort_values(by=sort_cols, ascending=ascending).index sele_df.loc[rank_index, 'select_rank'] = range(1, len(rank_index)+1) return select_range(dfrm.new_unp_range, rank_index, cutoff=OC_cutoff) - if kwargs.get('complete_chains', False): + if len(allow_sele_df.UniProt.unique()) > 1: sele_indexes = allow_sele_df.groupby('UniProt').apply(sele_func) sele_df.loc[[j for i in sele_indexes for j in i], 'select_tag'] = True else: sele_df.loc[sele_func(allow_sele_df), 'select_tag'] = True return sele_df + @unsync + async def pipe_select_mo(self, exclude_pdbs=frozenset(), OC_cutoff=0.2, **kwargs): + sele_df = await self.pipe_select_base(exclude_pdbs, **kwargs) + if sele_df is None: + return + else: + return self.select_mo(sele_df, OC_cutoff) + @staticmethod @unsync async def search_partner_from_i3d(Entry, interaction_types, columns='pdb_id,Entry_1,Entry_2,assembly_id,model_id_1,chain_id_1,model_id_2,chain_id_2,organism,interaction_type'): @@ -1832,118 +1911,82 @@ async def pipe_interface_res_dict(p_df, pdb_id): pass @unsync - async def pipe_select_ho(self, exclude_pdbs=frozenset(), interface_mapped_cov_cutoff=0.8, unp_range_DSC_cutoff=0.8, wasserstein_distance_cutoff=0.2, run_as_completed:bool=False, progress_bar=None,**kwargs): - sele_df = await self.pipe_select_mo(exclude_pdbs) + async def pipe_select_ho_base(self, exclude_pdbs=frozenset(), run_as_completed: bool=False, progress_bar=None, **kwargs): + sele_df = await self.pipe_select_mo(exclude_pdbs=exclude_pdbs) if sele_df is None: return chain_pairs = sele_df.groupby('pdb_id').struct_asym_id.apply( lambda x: frozenset(combinations_with_replacement(x, 2))) - # res = [i for pdb_id in chain_pairs.index async for i in PDB(pdb_id).pipe_interface_res_dict(chain_pairs=chain_pairs[pdb_id], au2bu=True, func='pipe_protein_protein_interface')] - # interact_df = DataFrame(i for i in res if i is not None) - tasks = [self.pird_task_unit(pdb_id, chain_pairs[pdb_id], **kwargs) for pdb_id in chain_pairs.index] - if run_as_completed: - if progress_bar is not None: - res = [await i for i in progress_bar(as_completed(tasks), total=len(tasks))] - else: - res = [await i for i in as_completed(tasks)] - else: - res = [await i for i in tasks] - interact_df = DataFrame(j for i in res for j in i if j is not None) - if len(interact_df) == 0: - return - ''' - NOTE: exception case: 5b0y (see api%pisa%interfacelist%+5b0y%0.tsv) - - # assert all((~interact_df.interface_range_1.isnull()) & (~interact_df.interface_range_2.isnull())), f"{interact_df[interact_df.interface_range_1.isnull() | interact_df.interface_range_2.isnull()]}" - ''' - interact_df = interact_df[(~interact_df.interface_range_1.isnull()) & (~interact_df.interface_range_2.isnull())] - p_a_df = self.parallel_interact_df(sele_df, interact_df) - i3d_df = await self.search_partner_from_i3d(self.identifier.split('-')[0], 'ho') - if len(i3d_df) == 0: - p_df = p_a_df - p_df['in_i3d'] = False - else: - p_b_df = self.parallel_interact_df(sele_df[['UniProt', 'pdb_id', 'chain_id', 'struct_asym_id']], i3d_df) - p_df = p_a_df.merge(p_b_df, how='left') - p_df['in_i3d'] = p_df.organism.apply(lambda x: False if isna(x) else True) - 
p_df.drop(columns=['organism', 'interaction_type'], inplace=True) - - p_df['best_select_rank'] = p_df[['select_rank_1', - 'select_rank_2']].apply(lambda x: min(x), axis=1) + + p_df = await self.integrate_i3d_with_pisa_interact(sele_df, chain_pairs, 'ho', run_as_completed, progress_bar, **kwargs) + if p_df is None: + return None p_df['unp_range_DSC'] = p_df.apply(lambda x: sorensen.similarity( lyst2range(x['new_unp_range_1']), lyst2range(x['new_unp_range_2'])), axis=1) - p_df['unp_interface_range_1'] = p_df.apply(lambda x: to_interval(self.convert_index(x['new_unp_range_1'], x['new_pdb_range_1'], expand_interval(x['interface_range_1']))), axis=1) - p_df['unp_interface_range_2'] = p_df.apply(lambda x: to_interval(self.convert_index(x['new_unp_range_2'], x['new_pdb_range_2'], expand_interval(x['interface_range_2']))), axis=1) + return self.add_interact_common_cols(p_df) + + @staticmethod + def select_ho(p_df, interface_mapped_cov_cutoff=0.8, unp_range_DSC_cutoff=0.8, wasserstein_distance_cutoff=0.2, sort_cols=['best_select_rank_score', 'second_select_rank_score', 'in_i3d'], ascending=False, allow_mask=None): p_df['i_select_tag'] = False p_df['i_select_rank'] = -1 - allow_p_df = p_df[ - (p_df.best_select_rank > 0) & - (p_df.unp_range_DSC>=unp_range_DSC_cutoff) & - ((p_df.unp_interface_range_1.apply(range_len)/p_df.interface_range_1.apply(range_len)) >= interface_mapped_cov_cutoff) & - ((p_df.unp_interface_range_2.apply(range_len)/p_df.interface_range_2.apply(range_len)) >= interface_mapped_cov_cutoff)] + if allow_mask is None: + allow_p_df = p_df[ + (p_df.best_select_rank_score > 0) & + (p_df.unp_range_DSC >= unp_range_DSC_cutoff) & + ((p_df.unp_interface_range_1.apply(range_len)/p_df.interface_range_1.apply(range_len)) >= interface_mapped_cov_cutoff) & + ((p_df.unp_interface_range_2.apply(range_len)/p_df.interface_range_2.apply(range_len)) >= interface_mapped_cov_cutoff)] + else: + allow_p_df = p_df[ + allow_mask & + (p_df.unp_range_DSC >= unp_range_DSC_cutoff) & + ((p_df.unp_interface_range_1.apply(range_len)/p_df.interface_range_1.apply(range_len)) >= interface_mapped_cov_cutoff) & + ((p_df.unp_interface_range_2.apply(range_len)/p_df.interface_range_2.apply(range_len)) >= interface_mapped_cov_cutoff)] def sele_func(dfrm): - rank_index = dfrm.sort_values(by='best_select_rank', ascending=False).index + rank_index = dfrm.sort_values(by=sort_cols, ascending=ascending).index p_df.loc[rank_index, 'i_select_rank'] = range(1, len(rank_index)+1) return select_ho_range(dfrm.unp_interface_range_1, dfrm.unp_interface_range_2, rank_index, cutoff=wasserstein_distance_cutoff) - - p_df.loc[sele_func(allow_p_df), 'i_select_tag'] = True + + if len(p_df.i_group.unique()) == 1: + p_df.loc[sele_func(allow_p_df), 'i_select_tag'] = True + else: + sele_indexes = allow_p_df.groupby('i_group').apply(sele_func) + p_df.loc[[j for i in sele_indexes for j in i], 'i_select_tag'] = True return p_df @unsync - async def pipe_select_ho_iso(self, exclude_pdbs=frozenset(), interface_mapped_cov_cutoff=0.8, DSC_cutoff=0.2, run_as_completed:bool=False, progress_bar=None, then_sort_interact:bool=True, **kwargs): - entry = self.identifier.split('-')[0] - sele_df = await self.pipe_select_mo(exclude_pdbs, complete_chains=True) + async def pipe_select_ho(self, interface_mapped_cov_cutoff=0.8, unp_range_DSC_cutoff=0.8, wasserstein_distance_cutoff=0.2, **kwargs): + p_df = await self.pipe_select_ho_base(**kwargs) + if p_df is None: + return + else: + return self.select_ho(p_df, interface_mapped_cov_cutoff, unp_range_DSC_cutoff, 
wasserstein_distance_cutoff) + + @unsync + async def pipe_select_ho_iso_base(self, exclude_pdbs=frozenset(), run_as_completed:bool=False, progress_bar=None, **kwargs): + sele_df = await self.pipe_select_mo(exclude_pdbs=exclude_pdbs, complete_chains=True) if sele_df is None: return - sele_df = sele_df[sele_df.Entry.eq(entry)] + sele_df = sele_df[sele_df.Entry.eq(self.identifier.split('-')[0])] chain_pairs = sele_df.groupby('pdb_id').struct_asym_id.apply( lambda x: frozenset(combinations_with_replacement(x, 2))) - tasks = [self.pird_task_unit(pdb_id, chain_pairs[pdb_id], **kwargs) for pdb_id in chain_pairs.index] - if run_as_completed: - if progress_bar is not None: - res = [await i for i in progress_bar(as_completed(tasks), total=len(tasks))] - else: - res = [await i for i in as_completed(tasks)] - else: - res = [await i for i in tasks] - interact_df = DataFrame(j for i in res for j in i if j is not None) - if len(interact_df) == 0: - return - interact_df = interact_df[(~interact_df.interface_range_1.isnull()) & (~interact_df.interface_range_2.isnull())] - p_a_df = self.parallel_interact_df(sele_df, interact_df) - p_a_df = p_a_df[(p_a_df.UniProt_1.eq(self.identifier) | p_a_df.UniProt_2.eq(self.identifier)) & (p_a_df.UniProt_1 != p_a_df.UniProt_2) & (p_a_df.struct_asym_id_in_assembly_1 != p_a_df.struct_asym_id_in_assembly_2)].reset_index(drop=True) - i3d_df = await self.search_partner_from_i3d(entry, 'ho') - if len(i3d_df) == 0: - p_df = p_a_df - p_df['in_i3d'] = False - else: - p_b_df = self.parallel_interact_df(sele_df[['UniProt', 'pdb_id', 'chain_id', 'struct_asym_id']], i3d_df) - p_df = p_a_df.merge(p_b_df, how='left') - p_df['in_i3d'] = p_df.organism.apply(lambda x: False if isna(x) else True) - p_df.drop(columns=['organism', 'interaction_type'], inplace=True) - p_df['best_select_rank'] = p_df[['select_rank_1', - 'select_rank_2']].apply(lambda x: min(x), axis=1) - p_df['unp_interface_range_1'] = p_df.apply(lambda x: to_interval(self.convert_index(x['new_unp_range_1'], x['new_pdb_range_1'], expand_interval(x['interface_range_1']))), axis=1) - p_df['unp_interface_range_2'] = p_df.apply(lambda x: to_interval(self.convert_index(x['new_unp_range_2'], x['new_pdb_range_2'], expand_interval(x['interface_range_2']))), axis=1) - p_df['i_select_tag'] = False - p_df['i_select_rank'] = -1 - p_df['i_group'] = p_df.apply(lambda x: tuple( - sorted((x['UniProt_1'], x['UniProt_2']))), axis=1) - allow_p_df = p_df[ - (p_df.best_select_rank > 0) & - ((p_df.unp_interface_range_1.apply(range_len)/p_df.interface_range_1.apply(range_len)) >= interface_mapped_cov_cutoff) & - ((p_df.unp_interface_range_2.apply(range_len)/p_df.interface_range_2.apply(range_len)) >= interface_mapped_cov_cutoff)] - def sele_func(dfrm): - rank_index = dfrm.sort_values(by=['RAW_BS_1', 'RAW_BS_2', '1/resolution', 'revision_date', 'id_score_1', 'id_score_2'], ascending=False).index - p_df.loc[rank_index, 'i_select_rank'] = range(1, len(rank_index)+1) - return select_he_range(dfrm.UniProt_1, dfrm.UniProt_2, dfrm.unp_interface_range_1, dfrm.unp_interface_range_2, rank_index, cutoff=DSC_cutoff) + p_df = await self.integrate_i3d_with_pisa_interact(sele_df, chain_pairs, 'ho_iso', run_as_completed, progress_bar, **kwargs) + return None if p_df is None else self.add_interact_common_cols(p_df) + + @classmethod + def select_ho_iso(cls, p_df, interface_mapped_cov_cutoff=0.8, DSC_cutoff=0.2, sort_cols=['RAW_BS_1', 'RAW_BS_2', '1/resolution', 'revision_date', 'in_i3d', 'id_score_1', 'id_score_2'], ascending=False, allow_mask=None): + return 
cls.select_he(p_df, interface_mapped_cov_cutoff, DSC_cutoff, sort_cols, ascending, allow_mask) - sele_indexes = allow_p_df.groupby('i_group').apply(sele_func) - p_df.loc[[j for i in sele_indexes for j in i], 'i_select_tag'] = True - return (await self.sort_interact_cols(p_df)) if then_sort_interact else p_df + @unsync + async def pipe_select_ho_iso(self, interface_mapped_cov_cutoff=0.8, DSC_cutoff=0.2, then_sort_interact:bool=True, **kwargs): + p_df = await self.pipe_select_ho_iso_base(**kwargs) + if p_df is None: + return + else: + p_df = self.select_ho_iso(p_df, interface_mapped_cov_cutoff, DSC_cutoff) + return (await self.sort_interact_cols(p_df)) if then_sort_interact else p_df @staticmethod @unsync @@ -1951,148 +1994,51 @@ async def pird_task_unit(pdb_id, chain_pairs, **kwargs): return [i async for i in PDB(pdb_id).pipe_interface_res_dict(chain_pairs=chain_pairs, au2bu=True, func='pipe_protein_protein_interface', **kwargs)] @unsync - async def pipe_select_he(self, exclude_pdbs=frozenset(), interface_mapped_cov_cutoff=0.8, DSC_cutoff=0.2, run_as_completed:bool=False, progress_bar=None, then_sort_interact:bool=True, **kwargs): - sele_df = await self.pipe_select_mo(exclude_pdbs, complete_chains=True) + async def pipe_select_he_base(self, exclude_pdbs=frozenset(), run_as_completed:bool=False, progress_bar=None, **kwargs): + sele_df = await self.pipe_select_mo(exclude_pdbs=exclude_pdbs, complete_chains=True) if sele_df is None: return if len(sele_df.Entry.unique()) == 1: return chain_pairs = sele_df.groupby(['pdb_id', 'entity_id']).struct_asym_id.apply(frozenset).groupby('pdb_id').apply(tuple).apply(lambda x: frozenset(res for i,j in combinations(range(len(x)), 2) for res in product(x[i], x[j]))) chain_pairs = chain_pairs[chain_pairs.apply(len) > 0] - tasks = [self.pird_task_unit(pdb_id, chain_pairs[pdb_id], **kwargs) for pdb_id in chain_pairs.index] - if run_as_completed: - if progress_bar is not None: - res = [await i for i in progress_bar(as_completed(tasks), total=len(tasks))] - else: - res = [await i for i in as_completed(tasks)] - else: - res = [await i for i in tasks] - interact_df = DataFrame(j for i in res for j in i if j is not None) - if len(interact_df) == 0: - return - interact_df = interact_df[(~interact_df.interface_range_1.isnull()) & (~interact_df.interface_range_2.isnull())] - p_a_df = self.parallel_interact_df(sele_df, interact_df) - p_a_df = p_a_df[(p_a_df.UniProt_1.eq(self.identifier) | p_a_df.UniProt_2.eq(self.identifier)) & (p_a_df.Entry_1 != p_a_df.Entry_2)].reset_index(drop=True) - if len(p_a_df) == 0: - return - i3d_df = await self.search_partner_from_i3d(self.identifier.split('-')[0], "he") - if len(i3d_df) == 0: - p_df = p_a_df - p_df['in_i3d'] = False - else: - p_b_df = self.parallel_interact_df(sele_df[['UniProt', 'pdb_id', 'chain_id', 'struct_asym_id']], i3d_df) - p_df = p_a_df.merge(p_b_df, how='left') - p_df['in_i3d'] = p_df.organism.apply(lambda x: False if isna(x) else True) - p_df.drop(columns=['organism', 'interaction_type'], inplace=True) - p_df['best_select_rank'] = p_df[['select_rank_1', - 'select_rank_2']].apply(lambda x: min(x), axis=1) - p_df['unp_interface_range_1'] = p_df.apply(lambda x: to_interval(self.convert_index(x['new_unp_range_1'], x['new_pdb_range_1'], expand_interval(x['interface_range_1']))), axis=1) - p_df['unp_interface_range_2'] = p_df.apply(lambda x: to_interval(self.convert_index(x['new_unp_range_2'], x['new_pdb_range_2'], expand_interval(x['interface_range_2']))), axis=1) + + p_df = await 
-        tasks = [self.pird_task_unit(pdb_id, chain_pairs[pdb_id], **kwargs) for pdb_id in chain_pairs.index]
-        if run_as_completed:
-            if progress_bar is not None:
-                res = [await i for i in progress_bar(as_completed(tasks), total=len(tasks))]
-            else:
-                res = [await i for i in as_completed(tasks)]
-        else:
-            res = [await i for i in tasks]
-        interact_df = DataFrame(j for i in res for j in i if j is not None)
-        if len(interact_df) == 0:
-            return
-        interact_df = interact_df[(~interact_df.interface_range_1.isnull()) & (~interact_df.interface_range_2.isnull())]
-        p_a_df = self.parallel_interact_df(sele_df, interact_df)
-        p_a_df = p_a_df[(p_a_df.UniProt_1.eq(self.identifier) | p_a_df.UniProt_2.eq(self.identifier)) & (p_a_df.Entry_1 != p_a_df.Entry_2)].reset_index(drop=True)
-        if len(p_a_df) == 0:
-            return
-        i3d_df = await self.search_partner_from_i3d(self.identifier.split('-')[0], "he")
-        if len(i3d_df) == 0:
-            p_df = p_a_df
-            p_df['in_i3d'] = False
-        else:
-            p_b_df = self.parallel_interact_df(sele_df[['UniProt', 'pdb_id', 'chain_id', 'struct_asym_id']], i3d_df)
-            p_df = p_a_df.merge(p_b_df, how='left')
-            p_df['in_i3d'] = p_df.organism.apply(lambda x: False if isna(x) else True)
-            p_df.drop(columns=['organism', 'interaction_type'], inplace=True)
-        p_df['best_select_rank'] = p_df[['select_rank_1',
-                                         'select_rank_2']].apply(lambda x: min(x), axis=1)
-        p_df['unp_interface_range_1'] = p_df.apply(lambda x: to_interval(self.convert_index(x['new_unp_range_1'], x['new_pdb_range_1'], expand_interval(x['interface_range_1']))), axis=1)
-        p_df['unp_interface_range_2'] = p_df.apply(lambda x: to_interval(self.convert_index(x['new_unp_range_2'], x['new_pdb_range_2'], expand_interval(x['interface_range_2']))), axis=1)
+
+        p_df = await self.integrate_i3d_with_pisa_interact(sele_df, chain_pairs, 'he', run_as_completed, progress_bar, **kwargs)
+        return None if p_df is None else self.add_interact_common_cols(p_df)
+
+    @staticmethod
+    def select_he(p_df, interface_mapped_cov_cutoff=0.8, DSC_cutoff=0.2, sort_cols=['RAW_BS_1', 'RAW_BS_2', '1/resolution', 'revision_date', 'in_i3d', 'id_score_1', 'id_score_2'], ascending=False, allow_mask=None):
         p_df['i_select_tag'] = False
         p_df['i_select_rank'] = -1
-        p_df['i_group'] = p_df.apply(lambda x: tuple(sorted((x['UniProt_1'], x['UniProt_2']))), axis=1)
-        allow_p_df = p_df[
-            (p_df.best_select_rank > 0) &
-            ((p_df.unp_interface_range_1.apply(range_len)/p_df.interface_range_1.apply(range_len)) >= interface_mapped_cov_cutoff) &
-            ((p_df.unp_interface_range_2.apply(range_len)/p_df.interface_range_2.apply(range_len)) >= interface_mapped_cov_cutoff)]
+
+        if allow_mask is None:
+            allow_p_df = p_df[
+                (p_df.best_select_rank_score > 0) &
+                ((p_df.unp_interface_range_1.apply(range_len)/p_df.interface_range_1.apply(range_len)) >= interface_mapped_cov_cutoff) &
+                ((p_df.unp_interface_range_2.apply(range_len)/p_df.interface_range_2.apply(range_len)) >= interface_mapped_cov_cutoff)]
+        else:
+            allow_p_df = p_df[
+                allow_mask &
+                ((p_df.unp_interface_range_1.apply(range_len)/p_df.interface_range_1.apply(range_len)) >= interface_mapped_cov_cutoff) &
+                ((p_df.unp_interface_range_2.apply(range_len)/p_df.interface_range_2.apply(range_len)) >= interface_mapped_cov_cutoff)]
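+        # Coverage gate: both interfaces must map onto UniProt with at least
+        # interface_mapped_cov_cutoff of their residues; allow_mask, when
+        # given, replaces the default best_select_rank_score > 0 pre-filter.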
 
         def sele_func(dfrm):
-            rank_index = dfrm.sort_values(by=['RAW_BS_1', 'RAW_BS_2', '1/resolution', 'revision_date', 'id_score_1', 'id_score_2'], ascending=False).index
+            rank_index = dfrm.sort_values(by=sort_cols, ascending=ascending).index
             p_df.loc[rank_index, 'i_select_rank'] = range(1, len(rank_index)+1)
             return select_he_range(dfrm.UniProt_1, dfrm.UniProt_2, dfrm.unp_interface_range_1, dfrm.unp_interface_range_2, rank_index, cutoff=DSC_cutoff)
 
         sele_indexes = allow_p_df.groupby('i_group').apply(sele_func)
         p_df.loc[[j for i in sele_indexes for j in i], 'i_select_tag'] = True
-        return (await self.sort_interact_cols(p_df)) if then_sort_interact else p_df
+        return p_df
-
-    @unsync
-    async def pipe_select_ho_(self, exclude_pdbs=frozenset(), consider_interface:bool=False, unp_range_DSC_cutoff=0.8, wasserstein_distance_cutoff=0.2):
-        i3d_df = await self.search_partner_from_i3d(self.identifier.split('-')[0], 'ho')
-        if len(i3d_df) == 0:
-            return
-        sele_df = await self.pipe_select_mo(exclude_pdbs)
-        if sele_df is None:
-            return
-        p_df = self.parallel_interact_df(sele_df, i3d_df)
-        p_df['best_select_rank'] = p_df[['select_rank_1', 'select_rank_2']].apply(lambda x: min(x), axis=1)
-        p_df['unp_range_DSC'] = p_df.apply(lambda x: sorensen.similarity(
-            lyst2range(x['new_unp_range_1']), lyst2range(x['new_unp_range_2'])), axis=1)
-        pdbs = p_df.pdb_id.unique()
-        if consider_interface:
-            ii_df = DataFrame([i for pdb_id in pdbs async for i in self.pipe_interface_res_dict(p_df, pdb_id)])
-            if len(ii_df) == 0:
-                consider_interface = False
-            else:
-                p_df = p_df.merge(ii_df)
-                p_df['unp_interface_range_1'] = p_df.apply(lambda x: to_interval(self.convert_index(x['new_unp_range_1'], x['new_pdb_range_1'], expand_interval(x['interface_range_1']))), axis=1)
-                p_df['unp_interface_range_2'] = p_df.apply(lambda x: to_interval(self.convert_index(x['new_unp_range_2'], x['new_pdb_range_2'], expand_interval(x['interface_range_2']))), axis=1)
-                p_df['i_select_tag'] = False
-                p_df['i_select_rank'] = -1
-                allow_p_df = p_df[(p_df.best_select_rank > 0) & (p_df.unp_range_DSC>=unp_range_DSC_cutoff)]
-
-                def sele_func(dfrm):
-                    rank_index = dfrm.sort_values(by='best_select_rank', ascending=False).index
-                    p_df.loc[rank_index, 'i_select_rank'] = range(1, len(rank_index)+1)
-                    return select_ho_range(dfrm.unp_interface_range_1, dfrm.unp_interface_range_2, rank_index, cutoff=wasserstein_distance_cutoff)
-
-        if not consider_interface:
-            p_df['i_select_tag'] = False
-            p_df['i_select_rank'] = -1
-            allow_p_df = p_df[(p_df.best_select_rank > 0) & (p_df.unp_range_DSC>=unp_range_DSC_cutoff)]
-
-            def sele_func(dfrm):
-                rank_index = dfrm.sort_values(by='best_select_rank', ascending=False).index
-                p_df.loc[rank_index, 'i_select_rank'] = range(1, len(rank_index)+1)
-                return select_ho_range(dfrm.new_unp_range_1, dfrm.new_unp_range_2, rank_index, cutoff=wasserstein_distance_cutoff)
-
-        p_df.loc[sele_func(allow_p_df), 'i_select_tag'] = True
-        return p_df
 
     @unsync
-    async def pipe_select_he_(self, exclude_pdbs=frozenset(), consider_interface:bool=False, DSC_cutoff=0.2):
-        i3d_df = await self.search_partner_from_i3d(self.identifier.split('-')[0], 'he')
-        if len(i3d_df) == 0:
-            return
-        sele_df = await self.pipe_select_mo(exclude_pdbs, complete_chains=True)
-        if sele_df is None:
+    async def pipe_select_he(self, interface_mapped_cov_cutoff=0.8, DSC_cutoff=0.2, then_sort_interact:bool=True, **kwargs):
+        p_df = await self.pipe_select_he_base(**kwargs)
+        if p_df is None:
             return
-        p_df = self.parallel_interact_df(sele_df, i3d_df)
-        p_df = p_df[p_df.UniProt_1.eq(self.identifier) | p_df.UniProt_2.eq(self.identifier)].reset_index(drop=True)
-        p_df['best_select_rank'] = p_df[['select_rank_1', 'select_rank_2']].apply(lambda x: min(x), axis=1)
-        pdbs = p_df.pdb_id.unique()
-        if consider_interface:
-            ii_df = DataFrame([i for pdb_id in pdbs async for i in self.pipe_interface_res_dict(p_df, pdb_id)])
-            if len(ii_df) == 0:
-                consider_interface = False
-            p_df = p_df.merge(ii_df)
-            p_df['unp_interface_range_1'] = p_df.apply(lambda x: to_interval(self.convert_index(x['new_unp_range_1'], x['new_pdb_range_1'], expand_interval(x['interface_range_1']))), axis=1)
-            p_df['unp_interface_range_2'] = p_df.apply(lambda x: to_interval(self.convert_index(x['new_unp_range_2'], x['new_pdb_range_2'], expand_interval(x['interface_range_2']))), axis=1)
-            p_df['i_select_tag'] = False
-            p_df['i_select_rank'] = -1
-            p_df['i_group'] = p_df.apply(lambda x: tuple(sorted((x['UniProt_1'], x['UniProt_2']))), axis=1)
-            allow_p_df = p_df[p_df.best_select_rank > 0]
-
-            def sele_func(dfrm):
-                rank_index = dfrm.sort_values(by=['RAW_BS_1', 'RAW_BS_2', '1/resolution', 'revision_date', 'id_score_1', 'id_score_2'], ascending=False).index
-                p_df.loc[rank_index, 'i_select_rank'] = range(1, len(rank_index)+1)
-                return select_he_range(dfrm.UniProt_1, dfrm.UniProt_2, dfrm.unp_interface_range_1, dfrm.unp_interface_range_2, rank_index, cutoff=DSC_cutoff)
-
-        if not consider_interface:
-            p_df['i_select_tag'] = False
-            p_df['i_select_rank'] = -1
-            p_df['i_group'] = p_df.apply(lambda x: tuple(sorted((x['Entry_1'], x['Entry_2']))), axis=1)
-            allow_p_df = p_df[p_df.best_select_rank > 0]
-
-            def sele_func(dfrm):
-                rank_index = dfrm.sort_values(by=['RAW_BS_1', 'RAW_BS_2', '1/resolution', 'revision_date', 'id_score_1', 'id_score_2'], ascending=False).index
-                p_df.loc[rank_index, 'i_select_rank'] = range(1, len(rank_index)+1)
-                return select_he_range(dfrm.Entry_1, dfrm.Entry_2, dfrm.new_unp_range_1, dfrm.new_unp_range_2, rank_index, cutoff=DSC_cutoff)
-
-        sele_indexes = allow_p_df.groupby('i_group').apply(sele_func)
-        p_df.loc[[j for i in sele_indexes for j in i], 'i_select_tag'] = True
-
-        return p_df
+        else:
+            p_df = self.select_he(p_df, interface_mapped_cov_cutoff, DSC_cutoff)
+            return (await self.sort_interact_cols(p_df)) if then_sort_interact else p_df
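+    # The *_1/*_2 column blocks of an interaction pair are order-sensitive;
+    # sort_interact_cols (below) swaps them for rows flagged by swap_index
+    # and rebuilds i_group from the reordered (UniProt_1, UniProt_2) tuple.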
 
     @unsync
     def sort_interact_cols(self, dfrm):
@@ -2106,8 +2052,97 @@ def sort_interact_cols(self, dfrm):
         store_2 = dfrm.loc[swap_index, cols_2].rename(columns=dict(zip(cols_2, [col.replace('_2', '_1') for col in cols_2])))
         dfrm.loc[swap_index, cols_1] = store_2
         dfrm.loc[swap_index, cols_2] = store_1
+        dfrm['i_group'] = dfrm.apply(lambda x: (x['UniProt_1'], x['UniProt_2']), axis=1)
         return dfrm
+
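+    # SMR model selection: rank the models in an allowed oligo-state, then
+    # greedily keep those whose unp_range overlaps already accepted ranges
+    # (experimental SIFTS selections included) by no more than OC_cutoff.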
+    @staticmethod
+    def select_smr_mo(smr_df, allow_oligo_state=('monomer',), selected_sifts_unp_ranges=list(), smr_sort_cols=None, ascending=False, OC_cutoff=0.2):
+        smr_df['select_rank'] = -1
+        smr_df['select_tag'] = False
+        allow_smr_df = smr_df[smr_df['oligo-state'].isin(allow_oligo_state)]
+
+        if smr_sort_cols is None:
+            rank_index = allow_smr_df.index
+        else:
+            rank_index = allow_smr_df.sort_values(
+                by=smr_sort_cols, ascending=ascending).index
+
+        smr_df.loc[rank_index, 'select_rank'] = range(1, len(rank_index)+1)
+        smr_df.loc[select_range(smr_df.unp_range, rank_index, cutoff=OC_cutoff, selected_ranges=list(selected_sifts_unp_ranges)), 'select_tag'] = True
+        return smr_df
+
+    @unsync
+    async def pipe_select_smr_mo(self, smr_df=None, **kwargs):
+        if 'sifts_mo_df' in kwargs:
+            sifts_mo_df = kwargs['sifts_mo_df']
+        else:
+            sifts_mo_df = await self.pipe_select_mo(**kwargs)
+        smr_df = (await SMR.single_retrieve(self.identifier).then(SMR.to_dataframe)) if smr_df is None else smr_df
+        if smr_df is None or len(smr_df) == 0:
+            return
+        return self.select_smr_mo(
+            smr_df,
+            kwargs.get('allow_oligo_state', ('monomer',)),
+            sifts_mo_df[sifts_mo_df.select_tag.eq(True)].new_unp_range,
+            kwargs.get('smr_sort_cols', None),
+            kwargs.get('ascending', False),
+            kwargs.get('OC_cutoff', 0.2))
+
+    @classmethod
+    def add_interact_common_cols(cls, p_df):
+        p_df['best_select_rank_score'] = p_df[['select_rank_1',
+                                               'select_rank_2']].apply(lambda x: 1/min(x), axis=1)
+        p_df['second_select_rank_score'] = p_df[['select_rank_1',
+                                                 'select_rank_2']].apply(lambda x: 1/max(x), axis=1)
+        p_df['unp_interface_range_1'] = p_df.apply(lambda x: to_interval(cls.convert_index(x['new_unp_range_1'], x['new_pdb_range_1'], expand_interval(x['interface_range_1']))), axis=1)
+        p_df['unp_interface_range_2'] = p_df.apply(lambda x: to_interval(cls.convert_index(x['new_unp_range_2'], x['new_pdb_range_2'], expand_interval(x['interface_range_2']))), axis=1)
+        p_df['i_group'] = p_df.apply(lambda x: tuple(sorted((x['UniProt_1'], x['UniProt_2']))), axis=1)
+        return p_df
+
+    @unsync
+    async def integrate_i3d_with_pisa_interact(self, sele_df, chain_pairs, interaction_type:str, run_as_completed:bool=False, progress_bar=None, **kwargs):
+        tasks = [self.pird_task_unit(pdb_id, chain_pairs[pdb_id], **kwargs) for pdb_id in chain_pairs.index]
+        if run_as_completed:
+            if progress_bar is not None:
+                iter_ob = progress_bar(as_completed(tasks), total=len(tasks))
+            else:
+                iter_ob = as_completed(tasks)
+        else:
+            iter_ob = tasks
+        res = [await i for i in iter_ob]
+        interact_df = DataFrame(j for i in res for j in i if j is not None)
+        if len(interact_df) == 0:
+            return
+        '''
+        NOTE: exception case: 5b0y (see api%pisa%interfacelist%+5b0y%0.tsv)
+
+        # assert all((~interact_df.interface_range_1.isnull()) & (~interact_df.interface_range_2.isnull())), f"{interact_df[interact_df.interface_range_1.isnull() | interact_df.interface_range_2.isnull()]}"
+        '''
+        interact_df = interact_df[(~interact_df.interface_range_1.isnull()) & (~interact_df.interface_range_2.isnull())]
+        if len(interact_df) == 0:
+            return
+        p_a_df = self.parallel_interact_df(sele_df, interact_df)
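+        # interaction_type dispatch: 'ho' expects identical UniProt partners,
+        # 'ho_iso' keeps isoform pairs of self.identifier on distinct assembly
+        # chains, and 'he' keeps pairs whose UniProt entries differ.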
+        if interaction_type == 'ho':
+            assert len(p_a_df) > 0
+        elif interaction_type == 'ho_iso':
+            p_a_df = p_a_df[(p_a_df.UniProt_1.eq(self.identifier) | p_a_df.UniProt_2.eq(self.identifier)) & (p_a_df.UniProt_1 != p_a_df.UniProt_2) & (p_a_df.struct_asym_id_in_assembly_1 != p_a_df.struct_asym_id_in_assembly_2)].reset_index(drop=True)
+        elif interaction_type == 'he':
+            p_a_df = p_a_df[(p_a_df.UniProt_1.eq(self.identifier) | p_a_df.UniProt_2.eq(self.identifier)) & (p_a_df.Entry_1 != p_a_df.Entry_2)].reset_index(drop=True)
+        else:
+            raise ValueError(f"Invalid interaction_type: {interaction_type}!")
+        if len(p_a_df) == 0:
+            return
+        i3d_df = await self.search_partner_from_i3d(self.identifier.split('-')[0], interaction_type[:2])
+
+        if len(i3d_df) == 0:
+            p_df = p_a_df
+            p_df['in_i3d'] = False
+        else:
+            p_b_df = self.parallel_interact_df(sele_df[['UniProt', 'pdb_id', 'chain_id', 'struct_asym_id']], i3d_df)
+            p_df = p_a_df.merge(p_b_df, how='left')
+            p_df['in_i3d'] = p_df.organism.apply(lambda x: False if isna(x) else True)
+            p_df.drop(columns=['organism', 'interaction_type'], inplace=True)
+        return p_df
 
 
 class Compounds(Base):
diff --git a/pdb_profiling/processors/proteins/api.py b/pdb_profiling/processors/proteins/api.py
index 6843edc..673f085 100644
--- a/pdb_profiling/processors/proteins/api.py
+++ b/pdb_profiling/processors/proteins/api.py
@@ -10,6 +10,7 @@
 from pandas import DataFrame
 from pdb_profiling.log import Abclog
 from pdb_profiling.fetcher.webfetch import UnsyncFetch
+from pdb_profiling.utils import dumpsParams
 from urllib.parse import quote
 from slugify import slugify
 
@@ -38,17 +39,13 @@ def get_file_suffix(cls) -> str:
         assert res in ('json', 'xml', 'x-gff'), f"Unexpected Case: {cls.headers}"
         return res
 
-    @classmethod
-    def dumpsParams(cls, params: Dict) -> str:
-        return '&'.join(f'{key}={value}' for key, value in params.items())
-
     @classmethod
     def task_unit(cls, suffix: str, params: Dict, folder: Path, identifier:Optional[str]=None) -> Tuple:
         args = dict(
             url=f'{BASE_URL}{suffix}' if identifier is None else f'{BASE_URL}{suffix}{quote(identifier)}',
             headers=cls.headers,
             params=params)
-        return 'get', args, folder/f'{slugify(identifier)+"_"+cls.dumpsParams(params) if identifier is not None else cls.dumpsParams(params)}.{cls.get_file_suffix()}'
+        return 'get', args, folder/f'{slugify(identifier)+"_"+dumpsParams(params) if identifier is not None else dumpsParams(params)}.{cls.get_file_suffix()}'
 
     @classmethod
     def yieldTasks(cls, suffix: str, params_collection: Iterable[Dict], folder: Path, identifiers: Optional[Iterable[str]]) -> Generator:
diff --git a/pdb_profiling/processors/swissmodel/api.py b/pdb_profiling/processors/swissmodel/api.py
index 5e72e74..e77bd72 100644
--- a/pdb_profiling/processors/swissmodel/api.py
+++ b/pdb_profiling/processors/swissmodel/api.py
@@ -5,19 +5,20 @@
 # @Last Modified: 2020-08-27 10:35:41 am
 # @Copyright (c) 2020 MinghuiGroup, Soochow University
 from pdb_profiling.fetcher.webfetch import UnsyncFetch
-from pdb_profiling.utils import pipe_out
+from pdb_profiling.utils import pipe_out, init_folder_from_suffix, init_semaphore, a_read_csv, dumpsParams
 from pdb_profiling.processors.transformer import Dict2Tabular
-from typing import Union, Dict, Generator, Set, Any
+from typing import Union, Dict, Generator, Set, Any, Optional, List
 from pathlib import Path
-import logging
+from pdb_profiling.log import Abclog
 from unsync import unsync
 from aiofiles import open as aiofiles_open
-from orjson import loads as orjson_loads
+import orjson as json
+from pandas import Series, concat, isna
 
 BASE_URL: str = 'https://swissmodel.expasy.org/'
 
-class SMR(object):
+class SMR(Abclog):
     '''
     Implement SWISS-MODEL Repository API
 
@@ -33,6 +34,15 @@ class SMR(object):
     headers = {'accept': 'text/plain'}
     use_existing: bool = True
 
+    @classmethod
+    def set_folder(cls, folder: Union[Path, str]):
+        cls.folder = init_folder_from_suffix(Path(folder), 'swissmodel/repository/uniprot/')
+
+    @classmethod
+    @unsync
+    async def set_web_semaphore(cls, web_semaphore_values):
+        cls.web_semaphore = await init_semaphore(web_semaphore_values)
+
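+    # Class-level defaults: set_folder/set_web_semaphore are intended to be
+    # called once up front (presumably at package initialisation) so that
+    # retrieve/single_retrieve can omit their folder/semaphore arguments.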
     @staticmethod
     def yieldSMR(data: Dict):
         cols = ('sequence_length',
@@ -44,44 +54,54 @@ def yieldSMR(data: Dict):
 
         for col in ('ac', 'id', 'isoid'):
             data['result'][col] = uniprot_entries[0].get(col, None)
+
+        for val in data['result']['structures']:
+            keys = tuple(val.keys())
+            for key in keys:
+                if isinstance(val[key], (List, Dict)):
+                    val[key] = json.dumps(val[key]).decode('utf-8')
 
         yield data['result']['structures'], cols, tuple(data['result'][col] for col in cols)
 
+    @classmethod
+    def task_unit(cls, unp, params, file_format, folder):
+        args = dict(
+            url=f'{BASE_URL}{cls.root}{unp}.{file_format}',
+            headers=cls.headers,
+            params=params)
+        return 'get', args, Path(folder)/f'{unp}_{dumpsParams(params)}.{file_format}'
+
     @classmethod
     def yieldTasks(cls, unps, params: Dict, file_format: str, folder: Union[Path, str]) -> Generator:
         for unp in unps:
-            args = dict(
-                url=f'{BASE_URL}{cls.root}{unp}.{file_format}',
-                headers=cls.headers,
-                params=params)
-            yield 'get', args, Path(folder)/f'{unp}.{file_format}'
+            yield cls.task_unit(unp, params, file_format, folder)
 
     @classmethod
-    def retrieve(cls, unps, folder: Union[Path, str], params: Dict = dict(provider='swissmodel'), concur_req: int = 20, rate: float = 1.5, file_format: str = 'json', ret_res: bool = True, **kwargs):
+    def retrieve(cls, unps, folder: Optional[Union[Path, str]]=None, params: Dict = dict(provider='swissmodel'), concur_req: int = 20, rate: float = 1.5, file_format: str = 'json', ret_res: bool = True, **kwargs):
         assert file_format in ('json', 'pdb'), "Invalid file format"
         res = UnsyncFetch.multi_tasks(
-            cls.yieldTasks(unps, params, file_format, folder),
+            cls.yieldTasks(unps, params, file_format, cls.folder if folder is None else folder),
             cls.process,
             concur_req=concur_req,
             rate=rate,
             ret_res=ret_res,
-            semaphore=kwargs.get('semaphore', None))
+            semaphore=kwargs.get('semaphore', cls.web_semaphore))
         return res
 
     @classmethod
-    def single_retrieve(cls, unp: str, folder: Union[Path, str], semaphore, params: Dict = dict(provider='swissmodel'), rate: float = 1.5, file_format: str = 'json'):
+    def single_retrieve(cls, unp: str, folder: Optional[Union[Path, str]]=None, semaphore=None, params: Dict = dict(provider='swissmodel'), rate: float = 1.5, file_format: str = 'json'):
         assert file_format in ('json', 'pdb'), "Invalid file format"
         return UnsyncFetch.single_task(
-            task=next(cls.yieldTasks((unp, ), params, file_format, folder)),
-            semaphore=semaphore,
+            task=cls.task_unit(unp, params, file_format, cls.folder if folder is None else folder),
+            semaphore=cls.web_semaphore if semaphore is None else semaphore,
             to_do_func=cls.process,
             rate=rate)
 
     @classmethod
     @unsync
     async def process(cls, path):
-        logging.debug('Start to decode SMR JSON')
+        cls.logger.debug('Start to decode SMR JSON')
         if not isinstance(path, (str, Path)):
             path = await path
         if path is None or str(path).endswith('.pdb'):
@@ -91,25 +111,66 @@ async def process(cls, path):
             return new_path
         async with aiofiles_open(path) as inFile:
             try:
-                data = orjson_loads(await inFile.read())
+                data = json.loads(await inFile.read())
             except Exception as e:
-                logging.error(f"Error in {path}")
+                cls.logger.error(f"Error in {path}")
                 raise e
         res = Dict2Tabular.pyexcel_io(cls.yieldSMR(data))
         if res is not None:
             if isinstance(res, Generator):
-                one = False
+                count = 0
                 for r in res:
                     if r is not None:
-                        await pipe_out(df=r, path=new_path, format='tsv', mode='a')
-                        one = True
-                if not one:
-                    logging.debug(f"Without Expected Data (swissmodel/repository/uniprot/): {data}")
+                        await pipe_out(df=r, path=new_path, format='tsv', mode='a' if count else 'w')
+                        count += 1
+                if not count:
+                    cls.logger.debug(f"Without Expected Data (swissmodel/repository/uniprot/): {data}")
                     return None
             else:
                 await pipe_out(df=res, path=new_path, format='tsv', mode='w')
-            logging.debug(f'Decoded file in {new_path}')
+            cls.logger.debug(f'Decoded file in {new_path}')
             return new_path
         else:
-            logging.warning(f"Without Expected Data (swissmodel/repository/uniprot/): {data}")
+            cls.logger.warning(f"Without Expected Data (swissmodel/repository/uniprot/): {data}")
             return None
+
+    @classmethod
+    @unsync
+    async def to_dataframe(cls, path):
+        path = await path
+        if path is None:
+            return None
+        df = await a_read_csv(path, sep="\t", keep_default_na=False, na_values=['NULL', 'null', ''])
+        df.rename(columns={'ac': 'UniProt', 'sequence_length': 'unp_len', 'id': 'identifier'}, inplace=True)
+        df['unp_range'] = df.apply(lambda x: f"[[{x['from']},{x['to']}]]", axis=1)
+        if 'identity' in df.columns:
+            df.identity = df.identity/100
+        if 'qmean' in df.columns:
+            return concat((
+                df.drop(columns=['from', 'to', 'qmean']),
+                df.qmean.apply(lambda x: json.loads(x) if not isna(x) else default_qmean).apply(Series)), axis=1)
+            # add_prefix('qmean.')
+        else:
+            return df.drop(columns=['from', 'to'])
+
+
+default_qmean = {
+    "acc_agreement_norm_score": None,
+    "acc_agreement_z_score": None,
+    "avg_local_score": None,
+    "avg_local_score_error": None,
+    "cbeta_norm_score": None,
+    "cbeta_z_score": None,
+    "interaction_norm_score": None,
+    "interaction_z_score": None,
+    "packing_norm_score": None,
+    "packing_z_score": None,
+    "qmean4_norm_score": None,
+    "qmean4_z_score": None,
+    "qmean6_norm_score": None,
+    "qmean6_z_score": None,
+    "ss_agreement_norm_score": None,
+    "ss_agreement_z_score": None,
+    "torsion_norm_score": None,
+    "torsion_z_score": None
+}
diff --git a/pdb_profiling/utils.py b/pdb_profiling/utils.py
index bf6ae2d..a8f7245 100644
--- a/pdb_profiling/utils.py
+++ b/pdb_profiling/utils.py
@@ -770,13 +770,13 @@ def get_seq_seg(seq, ranges):
         yield start, seq[start-1:end]
 
 
-def get_diff_index(lseq, lrange, rseq, rrange, on_left:bool=True):
+def get_diff_index(lseq, lrange, rseq, rrange):
     if isinstance(lrange, str):
         lrange = json.loads(lrange)
     if isinstance(rrange, str):
         rrange = json.loads(rrange)
     for (lstart, lseg), (rstart, rseg) in zip(get_seq_seg(lseq, lrange), get_seq_seg(rseq, rrange)):
-        yield from (lstart+index if on_left else rstart+index for index, (r1, r2) in enumerate(zip(lseg, rseg)) if r1 != r2)
+        yield from ((lstart+index, rstart+index) for index, (r1, r2) in enumerate(zip(lseg, rseg)) if r1 != r2)
 
 
 def red_seq_seg(seq, ranges):
@@ -819,21 +819,22 @@ def get_range_diff(lyst_a: Union[str, List, Tuple], lyst_b: Union[str, List, Tuple]):
     return array_a - array_b
 
 
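+# selected_ranges seeds the greedy overlap filter with ranges accepted
+# elsewhere, e.g. the tagged SIFTS ranges passed in by select_smr_mo.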
-def select_range(ranges, indexes, cutoff=0.2, skip_index=[]):
+def select_range(ranges, indexes, cutoff=0.2, skip_index=[], selected_ranges=None):
     select_index = []
+    selected_ranges = [] if selected_ranges is None else selected_ranges
 
     def unit(cur_index):
         if cur_index in skip_index:
             return
         cur_range = ranges[cur_index]
         cur_range = json.loads(cur_range) if isinstance(cur_range, str) else cur_range
-        for selected in select_index:
-            selected_range = ranges[selected]
+        for selected_range in selected_ranges:
             selected_range = json.loads(selected_range) if isinstance(selected_range, str) else selected_range
             score = overlap.similarity(lyst2range(cur_range), lyst2range(selected_range))
             if score > cutoff:
                 return
         select_index.append(cur_index)
+        selected_ranges.append(cur_range)
 
     for index in indexes:
         unit(index)
@@ -901,3 +902,7 @@ def unit(cur_index):
     for index in indexes:
         unit(index)
     return select_index
+
+
+def dumpsParams(params: Dict) -> str:
+    return '&'.join(f'{key}={value}' for key, value in params.items())
diff --git a/pdb_profiling/warnings.py b/pdb_profiling/warnings.py
index b6084a7..4601a43 100644
--- a/pdb_profiling/warnings.py
+++ b/pdb_profiling/warnings.py
@@ -10,6 +10,9 @@ class MultiWrittenWarning(UserWarning):
     pass
 
 
+class WithoutCifKeyWarning(UserWarning):
+    pass
+
 
 class PISAErrorWarning(UserWarning):
     pass