From 6df4d11bac965ad05568732f15de718b4fee7135 Mon Sep 17 00:00:00 2001 From: Zefeng Zhu <414731811@qq.com> Date: Sun, 25 Apr 2021 20:43:24 +0800 Subject: [PATCH] =?UTF-8?q?=F0=9F=8C=8Ctowards=20v0.3.2=20(#15)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 🦹‍♂️deal with old edge case * 🧜‍♂️add 1d-coordinates * 👨‍🌾impl dataclass * 👮‍♂️fix unchange * 🏃‍♂️add funcs for Identifier * 👨‍🎨add pipe_scheduled_ranged_map_res_df * ⚡change pytest-timeout * 🦼add demo files * ☔fix test path * 🌬fix command.py output path * 🦽add time for test_single_select * add auto_assign * add time --- README.md | 2 +- pdb_profiling/__init__.py | 2 +- pdb_profiling/commands/command.py | 10 +- pdb_profiling/cython/cyrange.pyx | 2 +- pdb_profiling/processors/__init__.py | 5 +- pdb_profiling/processors/pdbe/__init__.py | 21 - pdb_profiling/processors/pdbe/api.py | 17 +- pdb_profiling/processors/pdbe/record.py | 430 +++++++++++------- pdb_profiling/processors/proteins/api.py | 2 +- pdb_profiling/processors/proteins/record.py | 208 ++++++--- pdb_profiling/processors/rcsb/api.py | 16 +- pdb_profiling/processors/recordbase.py | 61 +++ pdb_profiling/utils.py | 8 +- setup.py | 5 +- ...api%pdb%sequence_conservation%+1cbs%1.json | 177 +++++++ ...h-api%residue_mapping%+3pg7%1%251%256.json | 1 + test/pytest/test_command.py | 13 +- test/pytest/test_sele.py | 61 ++- 18 files changed, 728 insertions(+), 313 deletions(-) create mode 100644 pdb_profiling/processors/recordbase.py create mode 100644 test/pytest/demo_dir/graph-api/pdb/sequence_conservation/graph-api%pdb%sequence_conservation%+1cbs%1.json create mode 100644 test/pytest/demo_dir/graph-api/residue_mapping/graph-api%residue_mapping%+3pg7%1%251%256.json diff --git a/README.md b/README.md index cc1a73f..331bc88 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ Profiling Protein Structures from Protein Data Bank and integrate various resour ## Install -> Notice: require Python Environment >= 3.6, Platform Independent +> Notice: require Python Environment >= 3.7, Platform Independent Install by `pip` command. diff --git a/pdb_profiling/__init__.py b/pdb_profiling/__init__.py index fb8e837..0900a28 100644 --- a/pdb_profiling/__init__.py +++ b/pdb_profiling/__init__.py @@ -4,7 +4,7 @@ # @Author: ZeFeng Zhu # @Last Modified: 2020-05-13 08:54:09 pm # @Copyright (c) 2020 MinghuiGroup, Soochow University -__version__ = '0.2.12' +__version__ = '0.3.2' def default_config(folder='./'): diff --git a/pdb_profiling/commands/command.py b/pdb_profiling/commands/command.py index b3f6462..0f148cf 100644 --- a/pdb_profiling/commands/command.py +++ b/pdb_profiling/commands/command.py @@ -100,7 +100,7 @@ def do_nothing(dfrm): @click.option('--column', type=str, default=None) @click.option('--sep', type=str, default='\t') @click.option('--chunksize', type=int, help="the chunksize parameter", default=50) -@click.option('--auto_assign/--no-auto_assign', default=True, is_flag=True) +@click.option('--auto_assign/--no-auto_assign', default=False, is_flag=True) @click.option('--sleep/--no-sleep', default=True, is_flag=True) @click.pass_context def id_mapping(ctx, input, column, sep, chunksize, auto_assign, sleep): @@ -283,7 +283,7 @@ def residue_mapping(ctx, input, chunksize, output, sleep): na_values=['NULL', 'null', ''], chunksize=chunksize) sqlite_api = ctx.obj['custom_db'] if output is not None: - output = Path(output) + output = ctx.obj['folder']/output done = 0 for df in dfs: for col in ('new_pdb_range_raw', 'new_unp_range_raw', 'conflict_pdb_index'): @@ -294,7 +294,7 @@ def residue_mapping(ctx, input, chunksize, output, sleep): row.new_unp_range_raw, row.new_pdb_range_raw, conflict_pdb_index=row.conflict_pdb_index, - struct_asym_id=row.struct_asym_id) for _, row in df.iterrows()] + struct_asym_id=row.struct_asym_id) for row in df.to_records()] with Progress(*progress_bar_args) as p: res = ob.run(p.track).result() res_mapping_df = concat(res, sort=False, ignore_index=True) @@ -442,7 +442,7 @@ def export_residue_remapping(ctx, with_id, sele, output): if df.shape[0] == 0: continue df.rename(columns={'edUniProt': 'UniProt'}).to_csv( - output, index=False, mode='a+', sep='\t', header=not output_path.exists()) + output_path, index=False, mode='a+', sep='\t', header=not output_path.exists()) console.log(f'result saved in {output_path}') @@ -524,7 +524,7 @@ def export_smr_residue_remapping(ctx, identity_cutoff, length_cutoff, with_id, s if df.shape[0] == 0: continue df.rename(columns={'edUniProt': 'UniProt'}).to_csv( - output, index=False, mode='a+', sep='\t',header=not output_path.exists()) + output_path, index=False, mode='a+', sep='\t', header=not output_path.exists()) console.log(f'result saved in {output_path}') #full_df = read_csv(output_path, sep='\t', keep_default_na=False) #best_indexes = full_df.groupby(['UniProt','Pos', 'Alt']).select_rank.idxmin() diff --git a/pdb_profiling/cython/cyrange.pyx b/pdb_profiling/cython/cyrange.pyx index e7cf0b3..c28169e 100644 --- a/pdb_profiling/cython/cyrange.pyx +++ b/pdb_profiling/cython/cyrange.pyx @@ -235,7 +235,7 @@ cpdef bint isin_range(object input_range, int value): return False -cpdef int convert_index(object lrange, object rrange, int site): +cdef int convert_index(object lrange, object rrange, int site) except *: # convert from rrange to lrange cdef int lstart, rstart, lend, rend for (lstart, lend), (rstart, rend) in zip(lrange, rrange): diff --git a/pdb_profiling/processors/__init__.py b/pdb_profiling/processors/__init__.py index cab52bc..7db671f 100644 --- a/pdb_profiling/processors/__init__.py +++ b/pdb_profiling/processors/__init__.py @@ -7,12 +7,13 @@ from pdb_profiling.processors.pdbe.record import ( Base, PDB, - PDBAssemble, + PDBAssembly, PDBInterface, SIFTS, Compounds, PDBs, - SIFTSs + SIFTSs, + RCSB1DCoordinates, ) from pdb_profiling.processors.pdbe.api import PDBeModelServer, PDBArchive, PDBVersioned from pdb_profiling.processors.uniprot.api import UniProtINFO, UniProtAPI diff --git a/pdb_profiling/processors/pdbe/__init__.py b/pdb_profiling/processors/pdbe/__init__.py index 1a5de9d..e1113dc 100644 --- a/pdb_profiling/processors/pdbe/__init__.py +++ b/pdb_profiling/processors/pdbe/__init__.py @@ -8,27 +8,6 @@ from re import compile as re_compile from pdb_profiling.processors.database import SqliteDB -common_pat = r'^(?=.*[A-Za-z])(?=.*\d)[A-Za-z\d]' - - -pats = dict(pdb_id=re_compile(common_pat+r'{4}$'), - pdb_entity_id=re_compile(common_pat+r'{4}_[0-9]+$'), - UniProt=re_compile(common_pat+r'{6,}[\-]*[0-9]*$'), - pdb_complex_id=re_compile(r'PDB-CPX-[0-9]+')) - - -def default_id_tag(identifier: str, default: str = '', raise_error: bool = False): - try: - for pat_name, pat in pats.items(): - if bool(pat.fullmatch(identifier)): - return pat_name - except Exception: - raise ValueError(f"Invalid Identifier: {identifier} !") - if raise_error: - raise ValueError(f'Unexpected Identifiers: {identifier}') - else: - return default - class PDBeDB(SqliteDB): diff --git a/pdb_profiling/processors/pdbe/api.py b/pdb_profiling/processors/pdbe/api.py index 4f3d86b..4736d28 100644 --- a/pdb_profiling/processors/pdbe/api.py +++ b/pdb_profiling/processors/pdbe/api.py @@ -14,7 +14,7 @@ from unsync import unsync, Unfuture from random import choice from hashlib import sha1 -from pdb_profiling.processors.pdbe import default_id_tag +from pdb_profiling.processors.recordbase import IdentifierBase from pdb_profiling.utils import related_dataframe, flatten_dict, pipe_out, dumpsParams from pdb_profiling.log import Abclog from pdb_profiling.fetcher.webfetch import UnsyncFetch @@ -43,6 +43,16 @@ FUNCS = [] +def mask_ib(i, default='', raise_error=False): + if i.source == 'PDB' and i.level == 'entry': + return 'pdb_id' + elif i.source == 'UniProt': + return 'UniProt' + elif raise_error: + raise AssertionError('Unexpected Case!') + else: + return default + def str_number_converter(x): try: return int(x) @@ -207,7 +217,7 @@ def yieldCommon(data: Dict) -> Generator: for key in value: if isinstance(value[key], (Dict, List)): value[key] = json.dumps(value[key]).decode('utf-8') - yield values, (default_id_tag(pdb, '_code_'),), (pdb,) + yield values, (mask_ib(IdentifierBase(pdb), '_code_'),), (pdb,) @staticmethod @dispatch_on_set('api/pdb/entry/polymer_coverage/') @@ -348,8 +358,7 @@ def yieldSIFTSAnnotation(data: Dict) -> Generator: continue chain[key] = json.dumps(value).decode( 'utf-8') if isinstance(value, Dict) else value - chain[default_id_tag( - top_root, raise_error=True)] = top_root + chain[mask_ib(IdentifierBase(top_root), raise_error=True)] = top_root chain[sec_root] = annotation yield chains, None elif len(data[top_root].keys()) == 1 and 'PDB' in data[top_root].keys(): diff --git a/pdb_profiling/processors/pdbe/record.py b/pdb_profiling/processors/pdbe/record.py index 65424c5..d476dea 100644 --- a/pdb_profiling/processors/pdbe/record.py +++ b/pdb_profiling/processors/pdbe/record.py @@ -22,7 +22,7 @@ from collections import defaultdict, namedtuple, OrderedDict from itertools import product, combinations_with_replacement, combinations from operator import itemgetter -from pdb_profiling.processors.pdbe import default_id_tag +from pdb_profiling.processors.recordbase import IdentifierBase from pdb_profiling.processors.transformer import Dict2Tabular from pdb_profiling.exceptions import * from pdb_profiling.cython.cyrange import to_interval, lyst22interval, lyst32interval, range_len, interval2set, subtract_range, add_range, overlap_range, outside_range, trim_range @@ -41,7 +41,7 @@ from pdb_profiling.processors.uniprot.api import UniProtINFO from pdb_profiling.processors.pdbe import PDBeDB from pdb_profiling.processors.rcsb import RCSBDB -from pdb_profiling.processors.rcsb.api import RCSBDataAPI, RCSBSearchAPI +from pdb_profiling.processors.rcsb.api import RCSBDataAPI, RCSBSearchAPI, RCSB1DCoordinatesAPI from pdb_profiling.processors.swissmodel.api import SMR from pdb_profiling.data import miyata_similarity_matrix from pdb_profiling import cif_gz_stream @@ -87,7 +87,16 @@ def __call__(self, that): return getattr(that, f'_{self._name}') -class Base(object): +class Base(IdentifierBase): + ''' + Impl + * PDBe Entry-Based API + * PDBe Graph API + * RCSB Data API + * RCSB Search API + * RCSB 1D-Coordinates API + * ... + ''' folder = None tasks = LRUCache(maxsize=1024) @@ -110,12 +119,6 @@ def pipe_register_task(cls, key, task_func, **kwargs): cls.tasks[key] = task return task - def set_neo4j_connection(self, api): - pass - - def set_sqlite_connection(self, api): - pass - @classmethod @unsync async def set_web_semaphore(cls, web_semaphore_value: int): @@ -132,17 +135,6 @@ async def set_rcsb_web_semaphore(cls, web_semaphore_value: int): def get_web_semaphore(cls): return cls.web_semaphore - ''' - @classmethod - @unsync - async def set_db_semaphore(cls, db_semaphore_value): - cls.db_semaphore = await init_semaphore(db_semaphore_value) - - @classmethod - def get_db_semaphore(cls): - return cls.db_semaphore - ''' - @classmethod def set_folder(cls, folder: Union[Path, str]): """Set your folder path @@ -154,7 +146,9 @@ def set_folder(cls, folder: Union[Path, str]): assert folder.exists(), "Folder not exist! Please create it or input a valid folder!" cls.folder = folder tuple(init_folder_from_suffixes(cls.folder, API_SET)) - tuple(init_folder_from_suffixes(cls.folder/'data_rcsb', RCSBDataAPI.api_set | {'graphql', 'search'})) + tuple(init_folder_from_suffixes(cls.folder/'data_rcsb', RCSBDataAPI.api_set | {'graphql', 'search', '1d_coordinates'})) + cls.sqlite_api = PDBeDB("sqlite:///%s" % (init_folder_from_suffix(cls.folder, 'local_db')/"PDBeDB.db")) + cls.rcsb_sqlite_api = RCSBDB("sqlite:///%s" % (init_folder_from_suffix(cls.folder, 'local_db')/"RCSBDB.db")) @classmethod def get_folder(cls) -> Path: @@ -179,7 +173,7 @@ def r_task(cls, task, then_func, api_suffix, identifier, json): cls.register_task((cls.__class__.__name__, api_suffix, then_func, json, identifier), task) return task - def fetch_from_pdbe_api(self, api_suffix: str, then_func: Optional[Callable[[Unfuture], Unfuture]] = None, json: bool = False, mask_id: str = None, infer_path: bool = True) -> Unfuture: + def fetch_from_pdbe_api(self, api_suffix: str, then_func: Optional[Callable[[Unfuture], Unfuture]] = None, json: bool = False, mask_id: str = None, infer_path: bool = True, **kwargs) -> Unfuture: """fetch data from PDBe API Args: @@ -197,7 +191,7 @@ def fetch_from_pdbe_api(self, api_suffix: str, then_func: Optional[Callable[[Unf if task is not None: return task - if infer_path: + if infer_path and not json: infer = self.infer_ret_from_args(api_suffix, identifier) if infer.exists(): task = unsync_wrap(infer) @@ -208,12 +202,13 @@ def fetch_from_pdbe_api(self, api_suffix: str, then_func: Optional[Callable[[Unf method='get', folder=self.get_folder()/api_suffix, semaphore=self.get_web_semaphore()) + args = {**kwargs, **args} if json: args['to_do_func'] = None task = ProcessPDBe.single_retrieve(**args) return self.r_task(task, then_func, api_suffix, identifier, json) - def fetch_from_rcsb_api(self, api_suffix: str, query=None, then_func: Optional[Callable[[Unfuture], Unfuture]] = None, json: bool = False, mask_id: str = None): + def fetch_from_rcsb_api(self, api_suffix: str, query=None, then_func: Optional[Callable[[Unfuture], Unfuture]] = None, json: bool = False, mask_id: str = None, **kwargs): task = self.tasks.get((repr(self), api_suffix, query, then_func, json, mask_id), None) if task is not None: return task @@ -229,8 +224,12 @@ def fetch_from_rcsb_api(self, api_suffix: str, query=None, then_func: Optional[C elif api_suffix == 'search': args = dict(query=query, folder=self.get_folder()/'data_rcsb/search', semaphore=self.rcsb_semaphore) task_func = RCSBSearchAPI.single_retrieve + elif api_suffix == '1d_coordinates': + args = dict(query=query, folder=self.get_folder()/'data_rcsb/1d_coordinates', semaphore=self.rcsb_semaphore) + task_func = RCSB1DCoordinatesAPI.graphql_retrieve else: raise AssertionError(f"Invlaid API SUFFIX! Valid set:\n{RCSBDataAPI.api_set} or graphql or search") + args = {**kwargs, **args} if json: args['to_do_func'] = None task = task_func(**args) @@ -304,8 +303,6 @@ class PDB(Base): @classmethod def set_folder(cls, folder: Union[Path, str]): super().set_folder(folder) - cls.sqlite_api = PDBeDB("sqlite:///%s" % (init_folder_from_suffix(cls.folder, 'local_db')/"PDBeDB.db")) - cls.rcsb_sqlite_api = RCSBDB("sqlite:///%s" % (init_folder_from_suffix(cls.folder, 'local_db')/"RCSBDB.db")) cls.assembly_cif_folder = cls.folder/'pdbe_assembly_cif' cls.assembly_cif_folder.mkdir(parents=True, exist_ok=True) tuple(init_folder_from_suffixes(cls.folder/'model-server', PDBeModelServer.api_set)) @@ -357,20 +354,21 @@ def status(self): """ pass - def __init__(self, pdb_id: str): + def __post_init__(self): + super().__post_init__() self.check_folder() - self.set_id(pdb_id) + self.set_id() self.pdb_ob = self self.properties_inited = False - def set_id(self, pdb_id: str): - assert default_id_tag(pdb_id) == 'pdb_id', f"Invalid PDB ID: {pdb_id} !" - self.pdb_id = pdb_id.lower() + def set_id(self): + assert self.source == 'PDB' + self.pdb_id = self.identifier.lower() def get_id(self): return self.pdb_id - def fetch_from_coordinateServer_api(self, api_suffix: str, then_func: Optional[Callable[[Unfuture], Unfuture]] = None, root='random', **params): + def fetch_from_coordinateServer_api(self, api_suffix: str, then_func: Optional[Callable[[Unfuture], Unfuture]] = None, root='random', kwargs=dict(), **params): assert api_suffix in PDBeCoordinateServer.api_set, f"Invlaid API SUFFIX! Valid set:\n{PDBeCoordinateServer.api_set}" dparams = dumpsParams(params) task = self.tasks.get((repr(self), 'PDBeCoordinateServer', root, api_suffix, dparams, then_func), None) @@ -381,13 +379,14 @@ def fetch_from_coordinateServer_api(self, api_suffix: str, then_func: Optional[C suffix=api_suffix, params=params, folder=self.get_folder()/'coordinate-server'/api_suffix, - semaphore=self.get_web_semaphore()) + semaphore=self.get_web_semaphore(), + **kwargs) if then_func is not None: task = task.then(then_func) self.register_task((repr(self), 'PDBeCoordinateServer', root, api_suffix, dparams, then_func), task) return task - def fetch_from_modelServer_api(self, api_suffix: str, method: str = 'post', data_collection=None, then_func: Optional[Callable[[Unfuture], Unfuture]] = None, filename='subset', **params) -> Unfuture: + def fetch_from_modelServer_api(self, api_suffix: str, method: str = 'post', data_collection=None, then_func: Optional[Callable[[Unfuture], Unfuture]] = None, filename='subset', kwargs=dict(), **params) -> Unfuture: assert api_suffix in PDBeModelServer.api_set, f"Invlaid API SUFFIX! Valid set:\n{PDBeModelServer.api_set}" dparams = dumpsParams(params) if len(params) > 0 else None task = self.tasks.get((repr(self), PDBeModelServer.root, api_suffix, method, data_collection, dparams, then_func), None) @@ -401,7 +400,8 @@ def fetch_from_modelServer_api(self, api_suffix: str, method: str = 'post', data semaphore=self.get_web_semaphore(), params=params, data_collection=data_collection, - filename=filename) + filename=filename, + **kwargs) if then_func is not None: task = task.then(then_func) self.register_task((repr(self), PDBeModelServer.root, api_suffix, method, data_collection, dparams, then_func), task) @@ -732,7 +732,7 @@ async def set_focus_assembly(self, focus_assembly_ids:Optional[Iterable[int]]=No assemblys = sorted(assemblys) if not hasattr(self, 'assembly'): await self.set_assembly() - self.focus_assembly: Dict[int, PDBAssemble] = {ass_id: ass_ob for ass_id, ass_ob in self.assembly.items() if ass_id in assemblys} + self.focus_assembly: Dict[int, PDBAssembly] = {ass_id: ass_ob for ass_id, ass_ob in self.assembly.items() if ass_id in assemblys} @unsync async def set_assembly(self): @@ -743,9 +743,9 @@ async def set_assembly(self): ass_eec_df = await self.fetch_from_pdbe_api('api/pdb/entry/assembly/', Base.to_dataframe) ass_eec_df = ass_eec_df[ass_eec_df.details.notnull()] assemblys = set(ass_eec_df.assembly_id) | {0} - self.assembly: Dict[int, PDBAssemble] = dict(zip( + self.assembly: Dict[int, PDBAssembly] = dict(zip( assemblys, - (PDBAssemble(ass_id, self) for ass_id in self.to_assembly_id(self.pdb_id, assemblys)))) + (PDBAssembly(ass_id).add_args(pdb_ob=self) for ass_id in self.to_assembly_id(self.pdb_id, assemblys)))) def get_assembly(self, assembly_id): return self.assembly[assembly_id] @@ -1133,14 +1133,14 @@ async def ms_source_ass_oper_df(self, struct_asym_id, residue_number): data_collection=json.dumps(dict(atom_site=[dict( label_asym_id=struct_asym_id, label_seq_id=int(residue_number))])).decode('utf-8'), - then_func=PDB.to_assg_oper_df) + then_func=self.to_assg_oper_df) return assg_oper_df @unsync async def cs_source_ass_oper_df(self, struct_asym_id, residue_number): assg_oper_df = await self.fetch_from_coordinateServer_api( 'residues', - then_func=PDB.to_assg_oper_df, + then_func=self.to_assg_oper_df, root='ebi', asymId=struct_asym_id, seqNumber=int(residue_number), @@ -1427,7 +1427,7 @@ async def pipe_interface_res_dict_ic(self, include_chains=None, use_copies:bool= return res @unsync - async def pipe_interface_res_dict(self, chain_pairs=None, au2bu:bool=False, focus_assembly_ids=None, func='set_interface', discard_multimer_chains_cutoff=21, discard_multimer_chains_cutoff_for_au=None, omit_peptide_length:int=20, css_cutoff=-1, **kwargs): + async def pipe_interface_res_dict(self, chain_pairs=None, au2bu:bool=True, focus_assembly_ids=None, func='set_interface', discard_multimer_chains_cutoff=21, discard_multimer_chains_cutoff_for_au=None, omit_peptide_length:int=20, css_cutoff=-1, **kwargs): # maybe the name `au2bu` should be changed since its actual behavior is to use copied chains if chain_pairs is not None: chain_pairs = chain_pairs[self.pdb_id] @@ -1459,16 +1459,6 @@ async def pipe_interface_res_dict(self, chain_pairs=None, au2bu:bool=False, focu if ((cur_chain_pairs is None) or (interface.info['chains'] in cur_chain_pairs)) and interface.info['css'] > css_cutoff: res.append(interface) return res - - @staticmethod - @unsync - async def expand_multiple_conformers(dfrm: Union[DataFrame, Unfuture, Coroutine]): - '''for residue_listing dataframe''' - ''' - if isawaitable(dfrm): - dfrm = await dfrm - ''' - pass @unsync async def get_binding_sites(self): @@ -1615,28 +1605,30 @@ async def rcsb_cluster_membership(self, entity_id, identity_cutoff:int=100): return df -class PDBAssemble(PDB): +class PDBAssembly(PDB): tasks = LRUCache(maxsize=1024) - id_pattern = re_compile(r"([a-z0-9]{4})/([0-9]+)") struct_range_pattern = re_compile(r"\[.+\]([A-Z]+[_0-9]*):-?[0-9]+[\?A-Z]*") # e.g. [FMN]B:149 [C2E]A:301 [ACE]H:-8? [BR]BA:957A rare_pat = re_compile(r"([A-Z]+)_([0-9]+)") # e.g. 2rde assembly 1 A_1, B_1... interface_structures_pat = re_compile(r"(\[.+\])?([A-Z]+)(:-?[0-9]+[\?A-Z]*)?\+(\[.+\])?([A-Z]+)(:-?[0-9]+[\?A-Z]*)?") # [4CA]BB:170+AB [ZN]D:154A+[CU]C:154 @property - def assemble_summary(self) -> Dict: + def assembly_summary(self) -> Dict: for ass in self.summary['assemblies']: if int(ass['assembly_id']) == self.assembly_id: return ass raise ValueError(f"{repr(self)}: Without expected assemble info\n{self.summary['assemblies']}") - def __init__(self, pdb_ass_id, pdb_ob: Optional[PDB]=None): - super().__init__(pdb_ass_id) + def add_args(self, pdb_ob=None): if pdb_ob is None: self.pdb_ob = PDB(self.pdb_id) else: self.pdb_ob = pdb_ob + return self + + def __post_init__(self): + super().__post_init__() ''' NOTE: reference: ''' @@ -1644,12 +1636,10 @@ def __init__(self, pdb_ass_id, pdb_ob: Optional[PDB]=None): 'symmetry_operator': ('isin', ('1_555', '1555', 1555)) # 1555 for api%pisa%asiscomponent%+6e4h%0%interfaces } # 'structure_2.symmetry_id': ('eq', '1_555'),'css': ('ge', 0) - def set_id(self, pdb_ass_id: str): - self.pdb_ass_id = pdb_ass_id.lower() - try: - self.pdb_id, self.assembly_id = self.id_pattern.fullmatch(self.pdb_ass_id).groups() - except AttributeError: - raise ValueError(f"Invalid ID: {self.pdb_ass_id}") + def set_id(self): + assert self.level == 'entry_like' and self.raw_identifier.count('/') == 1 + self.pdb_ass_id = self.raw_identifier.lower() + self.pdb_id, self.assembly_id = self.pdb_ass_id.split('/') self.assembly_id = int(self.assembly_id) def get_id(self): @@ -1738,10 +1728,10 @@ def to_interface_id(pdb_assembly_id, focus_interface_ids): yield f"{pdb_assembly_id}/{interface_id}" interfacelist_df, use_au = await self.get_interfacelist_df( - 'api/pisa/asiscomponent/', PDBAssemble.to_asiscomponent_interfaces_df) + 'api/pisa/asiscomponent/', PDBAssembly.to_asiscomponent_interfaces_df) if interfacelist_df is None: interfacelist_df, use_au = await self.get_interfacelist_df( - 'api/pisa/interfacelist/', PDBAssemble.to_interfacelist_df) + 'api/pisa/interfacelist/', PDBAssembly.to_interfacelist_df) self.interface_filters['structure_2.symmetry_id'] = ('isin', ('1_555', '1555', 1555)) del self.interface_filters['symmetry_operator'] else: @@ -1777,7 +1767,7 @@ def to_interface_id(pdb_assembly_id, focus_interface_ids): focus_interface_df.struct_asym_id_in_assembly_2) self.interface: Dict[int, PDBInterface] = dict(zip( - focus_interface_ids, (PDBInterface(if_id, self, use_au).store(chains=frozenset(chains), css=css) for if_id, chains, css in zip(to_interface_id(self.get_id(), focus_interface_ids), focus_interface_chains, focus_interface_df.css)))) + focus_interface_ids, (PDBInterface(if_id).add_args(PDBAssembly_ob=self, use_au=use_au).store(chains=frozenset(chains), css=css) for if_id, chains, css in zip(to_interface_id(self.get_id(), focus_interface_ids), focus_interface_chains, focus_interface_df.css)))) def get_interface(self, interface_id): return self.interface[interface_id] @@ -1864,19 +1854,18 @@ async def pipe_protein_nucleotide_interface(self, molecule_types=None): 'polydeoxyribonucleotide/polyribonucleotide hybrid') if molecule_types is None else molecule_types, False) -class PDBInterface(PDBAssemble): +class PDBInterface(PDBAssembly): tasks = LRUCache(maxsize=1024) - id_pattern = re_compile(r"([a-z0-9]{4})/([0-9]+)/([0-9]+)") - - def __init__(self, pdb_ass_int_id, pdbAssemble_ob: Optional[PDBAssemble]=None, use_au:bool=False): - super().__init__(pdb_ass_int_id) + def add_args(self, PDBAssembly_ob: Optional[PDBAssembly]=None, use_au:bool=False): self.use_au = use_au - if pdbAssemble_ob is None: - self.pdbAssemble_ob = PDBAssemble(f"{self.pdb_id}/{self.assembly_id}") + if PDBAssembly_ob is None: + self.PDBAssembly_ob = PDBAssembly( + f"{self.pdb_id}/{self.assembly_id}").add_args() else: - self.pdbAssemble_ob = pdbAssemble_ob + self.PDBAssembly_ob = PDBAssembly_ob + return self def __repr__(self): if hasattr(self, 'info'): @@ -1884,13 +1873,10 @@ def __repr__(self): else: return f"<{self.__class__.__name__} {self.get_id()}>" - def set_id(self, pdb_ass_int_id: str): - self.pdb_ass_int_id = pdb_ass_int_id.lower() - try: - self.pdb_id, self.assembly_id, self.interface_id = self.id_pattern.fullmatch( - self.pdb_ass_int_id).groups() - except AttributeError: - raise ValueError(f"Invalid ID: {self.pdb_ass_int_id}") + def set_id(self): + assert self.level == 'entry_like' and self.raw_identifier.count('/') == 2 + self.pdb_ass_int_id = self.raw_identifier.lower() + self.pdb_id, self.assembly_id, self.interface_id = self.raw_identifier.split('/') self.assembly_id = int(self.assembly_id) self.interface_id = int(self.interface_id) @@ -1988,8 +1974,8 @@ async def set_interface_res(self, keep_interface_res_df:bool=False): # NOTE: Exception example: 2beq assembly_id 1 interface_id 32 warn(f"{repr(self)}: interfacedetail({struct_sele_set}) inconsistent with interfacelist({set(self.info['chains'])}) ! May miss some data.", PISAErrorWarning) return - eec_as_df = await self.pdbAssemble_ob.get_assemble_eec_as_df() - res_df = await self.pdbAssemble_ob.pdb_ob.fetch_from_pdbe_api('api/pdb/entry/residue_listing/', Base.to_dataframe) + eec_as_df = await self.PDBAssembly_ob.get_assemble_eec_as_df() + res_df = await self.PDBAssembly_ob.pdb_ob.fetch_from_pdbe_api('api/pdb/entry/residue_listing/', Base.to_dataframe) interfacedetail_df = interfacedetail_df.merge(eec_as_df, how="left") interfacedetail_df = interfacedetail_df.merge(res_df, how="left") if keep_interface_res_df: @@ -2085,6 +2071,78 @@ async def get_interface_res_dict(self, **kwargs): return +class RCSB1DCoordinates(Base): + '''RCSB 1D-Coordinates''' + tasks = LRUCache(maxsize=1024) + + sequence_reference = { + ('RefSeq', 'genome'): 'NCBI_GENOME', + ('RefSeq', 'protein'): 'NCBI_PROTEIN', + ('RefSeq', 'model_protein'): 'NCBI_PROTEIN', + ('UniProt', 'isoform'): 'UNIPROT', + ('PDB', 'entity'): 'PDB_ENTITY', + ('PDB', 'instance'): 'PDB_INSTANCE', + } + + def __post_init__(self): + super().__post_init__() + self.check_folder() + self.seq_ref_type = self.sequence_reference[self.source, self.level] + + @property + def seq_ref_id(self): + if self.source in ('RefSeq', 'UniProt'): + return self.identifier + elif self.source == 'PDB': + return self.raw_identifier.upper() + + def get_id(self): + return self.seq_ref_id + + def alignment(self, seq_ref_type, with_seq:bool=False, **kwargs): + assert seq_ref_type in self.sequence_reference.values() + args = (self.seq_ref_type, seq_ref_type, self.seq_ref_id, 'query_sequence', 'target_sequence') if with_seq else (self.seq_ref_type, seq_ref_type, self.seq_ref_id, '', '') + return self.fetch_from_rcsb_api(api_suffix='1d_coordinates', query=''' + { + alignment(from:%s, to:%s, queryId:"%s"){ + %s + target_alignment { + target_id + %s + coverage{ + query_length + target_length + } + aligned_regions { + query_begin + query_end + target_begin + target_end + } + orientation + } + } + } + ''' % args, **kwargs) + + def yield_mapping(self, data): + for mapping in data['data']['alignment']['target_alignment']: + info = dict(query_id=self.seq_ref_id, + target_id=mapping['target_id'], + query_length=mapping['coverage']['query_length'], + target_length=mapping['coverage']['target_length'], + orientation=mapping['orientation']) + for region in mapping['aligned_regions']: + yield {**region,**info} + + @unsync + async def alignment_df(self, seq_ref_type, **kwargs): + try: + return DataFrame(self.yield_mapping(await self.alignment(seq_ref_type, **kwargs).then(a_load_json))) + except TypeError: + pass + + class SIFTS(PDB): ''' TODO @@ -2092,13 +2150,12 @@ class SIFTS(PDB): 1. Better OligoState * RAW (both from wwPDB and self assigned) * FILTERED - 2. Define Best Isoform - 3. UniProt Isoform Interaction + ~~2. Define Best Isoform~~ + ~~3. UniProt Isoform Interaction~~ ~~4. PDBChain Instance Interaction (Biological Relevance)~~ ''' tasks = LRUCache(maxsize=1024) - sa_cache = LRUCache(maxsize=100) EntityChain = namedtuple('EntityChain', 'pdb_id entity_chain_info entity_count chain_count') UniProtEntity = namedtuple('UniProtEntity', 'pdb_id unp_entity_info entity_unp_info entity_with_unp_count min_unp_count') @@ -2118,23 +2175,20 @@ class SIFTS(PDB): UniProtFASTA = UniProtINFO('fasta') - def set_id(self, identifier: str): - tag = default_id_tag(identifier, None) - if tag == 'pdb_id': - self.level = 'PDB Entry' - self.identifier = identifier.lower() - self.pdb_id = self.identifier - elif tag == 'UniProt': - self.level = tag - self.identifier = identifier.upper() + def set_id(self): + if self.source == 'PDB': + self.for_get_id = self.identifier.lower() + self.pdb_id = self.for_get_id + elif self.source == 'UniProt': + self.for_get_id = self.raw_identifier.upper() else: - raise AssertionError(f"Invalid identifier: <{identifier}, {tag}>") + raise AssertionError(f'Invalid identifier: {self.raw_identifier} for {self.__class__.__name__}!') def get_id(self): - return self.identifier + return self.for_get_id def __repr__(self): - return f"<{self.__class__.__name__} {self.level} {self.get_id()}>" + return f"<{self.__class__.__name__} {self.source} {self.level} {self.get_id()}>" @classmethod def fetch_unp_fasta(cls, identifier): @@ -2188,9 +2242,12 @@ def generate_new_identity(dfrm): if 'new_pdb_range_raw' in dfrm.columns: pdb_range_col = 'new_pdb_range_raw' unp_range_col = 'new_unp_range_raw' + else: + pdb_range_col = 'new_pdb_range' + unp_range_col = 'new_unp_range' new_pdb_range_len = dfrm[pdb_range_col].apply(range_len) - new_unp_range_len = dfrm[unp_range_col].apply(range_len) - assert all(new_pdb_range_len==new_unp_range_len) + new_unp_range_len = dfrm[unp_range_col].apply(range_len) # TODO: drop + assert all(new_pdb_range_len==new_unp_range_len) # TODO: drop conflict_range_len = dfrm.conflict_pdb_range.apply(range_len) pdb_gaps = dfrm[pdb_range_col].apply(get_gap_list) unp_gaps = dfrm[unp_range_col].apply(get_gap_list) @@ -2211,7 +2268,7 @@ def check_identity(dfrm: Union[DataFrame, Unfuture]): @staticmethod @unsync - def reformat(dfrm: Union[DataFrame, Unfuture]) -> DataFrame: + def reformat(dfrm: Union[DataFrame, Unfuture], drop_non_sequencial:bool=True) -> DataFrame: if isinstance(dfrm, Unfuture): dfrm = dfrm.result() if 'pdb_start' not in dfrm.columns: @@ -2220,6 +2277,18 @@ def reformat(dfrm: Union[DataFrame, Unfuture]) -> DataFrame: dfrm.rename(columns={ 'start.residue_number': 'pdb_start', 'end.residue_number': 'pdb_end'}, inplace=True) + + if drop_non_sequencial: + """ + NOTE: + {'pdb_id': '5o32', 'entity_id': 3, 'UniProt': 'P08603'} + [[1, 246], [260, 383]] + [[19, 264], [1107, 387]] + """ + pass_mask = (dfrm.pdb_start <= dfrm.pdb_end) & (dfrm.unp_start <= dfrm.unp_end) + if not all(pass_mask): + warn(f"Drop:\n{dfrm[~pass_mask][['UniProt','pdb_id','struct_asym_id','pdb_start','pdb_end','unp_start','unp_end']]}") + dfrm = dfrm[pass_mask].reset_index(drop=True) ''' NOTE: sort for cases like P00441 5j0c (chain A,B) NOTE: hasn't handle multiple identity values! @@ -2403,9 +2472,9 @@ async def add_residue_conflict(cls, dfrm: Union[DataFrame, Tuple, Unfuture, Coro dfrm = await dfrm if isinstance(dfrm, Tuple): dfrm = dfrm[0] - pdb_range_col = 'new_pdb_range' if 'new_pdb_range' in dfrm.columns else 'pdb_range' - unp_range_col = 'new_unp_range' if 'new_unp_range' in dfrm.columns else 'unp_range' - focus = ['UniProt', 'entity_id', 'pdb_id', 'pdb_range'] + pdb_range_col = 'new_pdb_range'# if 'new_pdb_range' in dfrm.columns else 'pdb_range' + unp_range_col = 'new_unp_range'# if 'new_unp_range' in dfrm.columns else 'unp_range' + focus = ['UniProt', 'entity_id', 'pdb_id', 'pdb_range', 'unp_range'] ''' (UniProt chain_id entity_id identifier identity is_canonical name pdb_id struct_asym_id pdb_range unp_range) NOTE: add pdb_range because of (P00720 B 1 ENLYS_BPT4 0.94 True ENLYS_BPT4 2b7x B [[1,24],[31,170]] [[1,24],[25,164]]) @@ -2561,24 +2630,6 @@ async def fetch_residue_mapping(self, entity_id:int, start:int, end:int, columns return df[columns] else: return - - @staticmethod - def check_range_tail(new_pdb_range, new_unp_range, pdb_range): - pdb_range = json.loads(pdb_range) if isinstance(pdb_range, str) else pdb_range - new_tail = new_pdb_range[-1][-1] - ori_tail = pdb_range[-1][-1] - tail_gap = new_tail - ori_tail - return tail_gap <= 0 - """if tail_gap > 0: - new_pdb_range = list(list(i) for i in new_pdb_range) - new_unp_range = list(list(i) for i in new_unp_range) - new_pdb_range[-1][-1] -= tail_gap - new_unp_range[-1][-1] -= tail_gap - new_pdb_range = tuple(tuple(i) for i in new_pdb_range) - new_unp_range = tuple(tuple(i) for i in new_unp_range) - return new_pdb_range, new_unp_range - """ - @classmethod @unsync @@ -2587,9 +2638,20 @@ async def fix_range(cls, dfrm: Union[DataFrame, Tuple, Unfuture, Coroutine]): dfrm = await dfrm if isinstance(dfrm, Tuple): dfrm = dfrm[0] - - focus = ['UniProt', 'entity_id', 'pdb_id'] - f_dfrm = dfrm[focus+['pdb_range', 'unp_range', 'Entry', 'range_diff', 'sifts_range_tag']].drop_duplicates(subset=focus) + ''' + NOTE: same entity with different range + UniProt P42262 P42262 + chain_id A B + entity_id 1 1 + identity 0.66 0.66 + is_canonical True True + pdb_id 2xhd 2xhd + struct_asym_id A B + pdb_range [[3,263]] [[2,263]] + unp_range [[413,796]] [[412,796]] + ''' + focus = ['UniProt', 'entity_id', 'pdb_id', 'pdb_range', 'unp_range'] + f_dfrm = dfrm[focus+['Entry', 'range_diff', 'sifts_range_tag']].drop_duplicates(subset=focus) f_dfrm = f_dfrm[f_dfrm.sifts_range_tag.isin(('Deletion', 'Insertion_Undivided', 'InDel_2', 'InDel_3'))].reset_index(drop=True) if len(f_dfrm) > 0: @@ -2597,7 +2659,7 @@ async def fix_range(cls, dfrm: Union[DataFrame, Tuple, Unfuture, Coroutine]): x['range_diff'], x['pdb_range'], x['unp_range'], x['pdb_id'], x['entity_id'], x['UniProt']), axis=1) res = [await i for i in tasks] f_dfrm[['new_pdb_range', 'new_unp_range']] = DataFrame(list(zip(*i)) for i in res) - assert all(cls.check_range_tail(*args) for args in zip(f_dfrm.new_pdb_range, f_dfrm.new_unp_range, f_dfrm.pdb_range)) + #assert all(cls.check_range_tail(*args) for args in zip(f_dfrm.new_pdb_range, f_dfrm.new_unp_range, f_dfrm.pdb_range)) #f_dfrm[['new_pdb_range', 'new_unp_range']] = DataFrame([cls.check_range_tail(*args) for args in zip(f_dfrm.new_pdb_range, f_dfrm.new_unp_range, f_dfrm.pdb_range)]) dfrm_ed = merge(dfrm, f_dfrm.drop(columns=['range_diff']), how='left') assert dfrm_ed.shape[0] == dfrm.shape[0] @@ -2638,8 +2700,15 @@ def get_optimal_range(lseq, rseq, lbeg, lend, rbeg, rend): cur_rbeg += seg_len else: raise ValueError(f'Unexpected type: {seg_type}') + """ + NOTE: + {'pdb_id': '5o32', 'entity_id': 3, 'UniProt': 'P08603'} + [[1, 246], [260, 383]] + [[19, 264], [1107, 387]] + """ + for diff, (lbeg, lseg), (rbeg, rseg) in zip(range_diff, get_seq_seg(pdb_seq, pdb_range, **kwargs), get_seq_seg(unp_seq, unp_range, **kwargs)): - for diff, (lbeg, lseg), (rbeg, rseg) in zip(range_diff, get_seq_seg(pdb_seq, pdb_range), get_seq_seg(unp_seq, unp_range)): + # assert bool(lseg) and bool(rseg), f"{pdb_range}\n{unp_range}" lend = lbeg + len(lseg) - 1 rend = rbeg + len(rseg) - 1 @@ -2701,7 +2770,7 @@ async def pipe_score(self, sifts_df=None, complete_chains:bool=False, check_pdb_ exp_cols = ['pdb_id', 'resolution', 'experimental_method_class', 'experimental_method', 'multi_method', '-r_factor', '-r_free'] - if self.level == 'PDB Entry': + if self.source == 'PDB': return await self.pipe_score_for_pdb_entry(sifts_df, exp_cols) else: return await self.pipe_score_for_unp_isoform(sifts_df, exp_cols) @@ -2936,7 +3005,7 @@ async def pipe_select_base(self, exclude_pdbs=frozenset(), **kwargs): res = await self.pipe_score(**kwargs) if res is None: return full_df, exp_df = res - if self.level == 'UniProt': + if self.source == 'UniProt': m_df = full_df[~full_df.pdb_id.isin(exclude_pdbs)] sele_df = merge( m_df.query(self.chain_filter) if self.chain_filter else m_df, @@ -2949,13 +3018,12 @@ async def pipe_select_base(self, exclude_pdbs=frozenset(), **kwargs): assert sele_df.experimental_method_class.isnull().sum() == 0 sele_df['1/resolution'] = 1 / sele_df.resolution sele_df['id_score'] = sele_df.chain_id.apply(id2score) - sele_df['select_tag'] = False - sele_df['select_rank'] = -1 return sele_df @staticmethod def select_mo(sele_df, OC_cutoff=0.2, sort_cols=['bs_score', '1/resolution', 'revision_date', 'id_score'], infer_new_col:bool=False, ascending=False, allow_mask=None): - sele_df.select_tag = False + sele_df['select_tag'] = False + sele_df['select_rank'] = -1 if infer_new_col: for col in sort_cols: if (col not in sele_df.columns) and (col[1:] in sele_df.columns) and (col[0] == '-'): @@ -3108,7 +3176,7 @@ async def pipe_select_ho_iso_base(self, exclude_pdbs=frozenset(), run_as_complet sele_df = await self.pipe_select_mo(exclude_pdbs=exclude_pdbs, complete_chains=True, check_pdb_status=check_pdb_status, skip_pdbs=skip_pdbs, select_mo_kwargs=select_mo_kwargs) if sele_df is None: return - sele_df = sele_df[sele_df.Entry.eq(self.identifier.split('-')[0])] + sele_df = sele_df[sele_df.Entry.eq(self.get_id().split('-')[0])] chain_pairs = sele_df.groupby('pdb_id').struct_asym_id.apply( lambda x: frozenset(combinations_with_replacement(x, 2))) @@ -3222,10 +3290,10 @@ async def pipe_select_else(self, interface_mapped_cov_cutoff=0.8, DSC_cutoff=0.2 @unsync def sort_interact_cols(self, dfrm): - assert self.level == 'UniProt' + assert self.source == 'UniProt' if isinstance(dfrm, Unfuture): dfrm = dfrm.result() - swap_index = dfrm[dfrm.UniProt_1.ne(self.identifier)].index + swap_index = dfrm[dfrm.UniProt_1.ne(self.get_id())].index cols_1 = [col for col in dfrm.columns if '_1' in col] cols_2 = [col for col in dfrm.columns if '_2' in col] store_1 = dfrm.loc[swap_index, cols_1].rename(columns=dict(zip(cols_1, [col.replace('_1', '_2') for col in cols_1]))) @@ -3260,7 +3328,7 @@ async def pipe_select_smr_mo(self, smr_df=None, **kwargs): sifts_mo_df = kwargs['sifts_mo_df'] else: sifts_mo_df = await self.pipe_select_mo(**kwargs) - smr_df = (await SMR.single_retrieve(self.identifier).then(SMR.to_dataframe)) if smr_df is None else smr_df + smr_df = (await SMR.single_retrieve(self.get_id()).then(SMR.to_dataframe)) if smr_df is None else smr_df if smr_df is None or len(smr_df) == 0: return return self.select_smr_mo( @@ -3282,21 +3350,57 @@ def add_interact_common_cols(cls, p_df): p_df['i_group'] = p_df.apply(lambda x: tuple(sorted((x['UniProt_1'], x['UniProt_2']))), axis=1) return p_df + @unsync + async def pipe_scheduled_interface_res_dict_for_pdb(self, **kwargs): + assert self.source == 'PDB' + return await self.schedule_interface_tasks((await self.pipe_interface_res_dict(**kwargs)),False,None) + @staticmethod async def schedule_interface_tasks(ob, run_as_completed, progress_bar): if run_as_completed: + assert isinstance(ob, (SIFTSs,PDBs)) res = await ob.run(tqdm=progress_bar) ob.tasks = [i.get_interface_res_dict() for interfaces in res for i in interfaces] res = await ob.run(tqdm=progress_bar) else: - res = [await i for i in ob.tasks] - inteface_lyst = [i for interfaces in res for i in interfaces if not i.use_au] # TODO: check whether 'not use_au' will affect selected results + if isinstance(ob, (SIFTSs,PDBs)): + res = [await i for i in ob.tasks] + inteface_lyst = [i for interfaces in res for i in interfaces if not i.use_au] # TODO: check whether 'not use_au' will affect selected results + else: + assert isinstance(ob, list) + inteface_lyst = [i for i in ob if not i.use_au] res = [] for index in range(0, len(inteface_lyst), 100): - ob.tasks = [i.get_interface_res_dict() for i in inteface_lyst[index:index+100]] - res.extend([await i for i in ob.tasks]) + tasks = [i.get_interface_res_dict() for i in inteface_lyst[index:index+100]] + res.extend([await i for i in tasks]) return DataFrame(j for j in res if j is not None) + @unsync + async def pipe_scheduled_ranged_map_res_df(self, chunksize=100, func_for_unp='pipe_select_mo', default_mask=True, with_sele_cols_for_unp=False, **kwargs): + if self.source == 'UniProt': + df = await getattr(self, func_for_unp)(**kwargs) + if default_mask: + records = df[df.select_rank.ne(-1)].to_records() + else: + records = df.to_records() + elif self.source == 'PDB': + df = await self.pipe_select_base(**kwargs) + records = df.to_records() + res = [] + for index in range(0, len(records), chunksize): + tasks = [PDB(row.pdb_id).get_ranged_map_res_df( + UniProt=row.UniProt, + unp_range=row.new_unp_range_raw, + pdb_range=row.new_pdb_range_raw, + conflict_pdb_index=row.conflict_pdb_index, + struct_asym_id=row.struct_asym_id) for row in records[index:index+chunksize]] + res.extend([await i for i in tasks]) + ret = concat(res, sort=False, ignore_index=True) + if with_sele_cols_for_unp and (self.source == 'UniProt') and (func_for_unp == 'pipe_select_mo'): + ret = ret.merge(df[['UniProt','pdb_id','struct_asym_id','bs_score','select_tag','select_rank','after_select_rank']], how='left') + assert ret.select_tag.isnull().sum() == 0 + return ret + @staticmethod def get_id_score_for_assembly(args): struct_asym_id, asym_id_rank, assembly_id = args @@ -3349,14 +3453,14 @@ async def pisa_interact_integrate_with_i3d(self, sele_df, chain_pairs, interacti if interaction_type == 'ho': assert len(p_a_df) > 0 elif interaction_type == 'ho_iso': - p_a_df = p_a_df[(p_a_df.UniProt_1.eq(self.identifier) | p_a_df.UniProt_2.eq(self.identifier)) & (p_a_df.UniProt_1 != p_a_df.UniProt_2) & (p_a_df.struct_asym_id_in_assembly_1 != p_a_df.struct_asym_id_in_assembly_2)].reset_index(drop=True) + p_a_df = p_a_df[(p_a_df.UniProt_1.eq(self.get_id()) | p_a_df.UniProt_2.eq(self.get_id())) & (p_a_df.UniProt_1 != p_a_df.UniProt_2) & (p_a_df.struct_asym_id_in_assembly_1 != p_a_df.struct_asym_id_in_assembly_2)].reset_index(drop=True) elif interaction_type == 'he': - p_a_df = p_a_df[(p_a_df.UniProt_1.eq(self.identifier) | p_a_df.UniProt_2.eq(self.identifier)) & (p_a_df.Entry_1 != p_a_df.Entry_2)].reset_index(drop=True) + p_a_df = p_a_df[(p_a_df.UniProt_1.eq(self.get_id()) | p_a_df.UniProt_2.eq(self.get_id())) & (p_a_df.Entry_1 != p_a_df.Entry_2)].reset_index(drop=True) else: raise ValueError(f"Invalid interaction_type: {interaction_type}!") if len(p_a_df) == 0: return - i3d_df = await self.search_partner_from_i3d(self.identifier.split('-')[0], interaction_type[:2]) + i3d_df = await self.search_partner_from_i3d(self.get_id().split('-')[0], interaction_type[:2]) if len(i3d_df) == 0: p_df = p_a_df @@ -3370,30 +3474,20 @@ async def pisa_interact_integrate_with_i3d(self, sele_df, chain_pairs, interacti @unsync async def unp_is_canonical(self): - if self.level != 'UniProt': + if self.source != 'UniProt': return None - if '-' not in self.identifier: + if '-' not in self.get_id(): return True try: - header = (await self.fetch_unp_fasta(self.identifier))[0] + header = (await self.fetch_unp_fasta(self.get_id()))[0] except TypeError: - warn(self.identifier, PossibleObsoletedUniProtWarning) + warn(self.get_id(), PossibleObsoletedUniProtWarning) return None - return self.identifier != self.unp_head.match(header).group(1) + return self.get_id() != self.unp_head.match(header).group(1) @unsync async def unp_is_canonical_with_id(self): - return self.identifier, (await self.unp_is_canonical()) - - """@classmethod - @unsync - def meta_pdbekb_annotaion(cls, dfrm): - if isinstance(dfrm, Unfuture): - dfrm = dfrm.result() - dfrm = dfrm.rename(columns={'data_resource': 'resource', 'residue_number': 'pdb_start'}) - dfrm['pdb_end'] = dfrm.pdb_start - dfrm['resource_id'] = dfrm.pdb_start.astype(str) - return dfrm[['pdb_id', 'entity_id', 'struct_asym_id', 'chain_id', 'resource', 'resource_id', 'pdb_start', 'pdb_end']]""" + return self.get_id(), (await self.unp_is_canonical()) @classmethod @unsync @@ -3439,7 +3533,7 @@ async def get_mapped_pdbekb_annotaions_task(cls, pdb_id, sub_sifts_df, api_suffi res_df = await pdb_ob.pipe_pdbekb_annotations(api_suffix, **kwargs) if res_df is None: return rets - for _, record in sub_sifts_df.iterrows(): + for record in sub_sifts_df.to_records(): res = await cls.get_mapped_pdbekb_annotaions_task_unit(pdb_ob, record, res_df) if res is not None: rets.append(res) @@ -3468,21 +3562,17 @@ class Compounds(Base): tasks = LRUCache(maxsize=1024) - def set_id(self, identifier: str): - assert len(identifier) > 0, "Empty string is not a valid identifier!" - self.identifier = identifier.upper() - - def get_id(self): - return self.identifier - - def __init__(self, identifier:str): + def __post_init__(self, **kwargs): + super().__post_init__(**kwargs) + assert self.level == 'compounds' self.check_folder() - self.set_id(identifier) + self.raw_identifier = self.raw_identifier.upper() class PDBs(tuple): + '''immutable iterable class (tuple-like)''' - def __new__(cls, iterable:Iterable): + def __new__(cls, iterable:Iterable=tuple()): return super(PDBs, cls).__new__(cls, (PDB(i) if isinstance(i, str) else i for i in iterable)) def __getitem__(self, slice): @@ -3570,7 +3660,7 @@ async def stats_chain(self, stats_nucleotide=False): class SIFTSs(PDBs): - def __new__(cls, iterable: Iterable): + def __new__(cls, iterable: Iterable=tuple()): return super(SIFTSs, cls).__new__(cls, (SIFTS(i) if isinstance(i, str) else i for i in iterable)) ''' diff --git a/pdb_profiling/processors/proteins/api.py b/pdb_profiling/processors/proteins/api.py index 9e98de7..b9d8d38 100644 --- a/pdb_profiling/processors/proteins/api.py +++ b/pdb_profiling/processors/proteins/api.py @@ -50,7 +50,7 @@ def task_unit(cls, suffix: str, params: Dict, folder: Path, identifier:Optional[ url=f'{BASE_URL}{suffix}' if identifier is None else f'{BASE_URL}{suffix}{quote(identifier)}', headers=cls.headers, params=params) - return 'get', args, folder/f'{slugify(identifier)+"_"+dumpsParams(params) if identifier is not None else dumpsParams(params)}.{cls.get_file_suffix()}' + return 'get', args, folder/(suffix if suffix[-1] == '/' else suffix+'_')/f'{slugify(identifier)+"_"+dumpsParams(params) if identifier is not None else dumpsParams(params)}.{cls.get_file_suffix()}' @classmethod def yieldTasks(cls, suffix: str, params_collection: Iterable[Dict], folder: Path, identifiers: Optional[Iterable[str]]) -> Generator: diff --git a/pdb_profiling/processors/proteins/record.py b/pdb_profiling/processors/proteins/record.py index a50b469..7b47917 100644 --- a/pdb_profiling/processors/proteins/record.py +++ b/pdb_profiling/processors/proteins/record.py @@ -4,13 +4,14 @@ # @Author: ZeFeng Zhu # @Last Modified: 2020-09-28 05:43:34 pm # @Copyright (c) 2020 MinghuiGroup, Soochow University +from pdb_profiling.processors.recordbase import IdentifierBase from pdb_profiling.processors.ensembl.api import EnsemblAPI from pdb_profiling.processors.eutils.api import EutilsAPI from pdb_profiling.processors.proteins.api import ProteinsAPI from pdb_profiling.processors.proteins import ProteinsDB from pdb_profiling.log import Abclog from pdb_profiling.warnings import PossibleObsoletedUniProtWarning, SequenceConflictWarning -from pdb_profiling.utils import init_folder_from_suffix, a_seq_reader, a_load_json, init_semaphore, unsync_wrap, unsync_run +from pdb_profiling.utils import init_folder_from_suffix, init_folder_from_suffixes, a_seq_reader, a_load_json, init_semaphore, unsync_wrap, unsync_run, flatten_dict from re import compile as re_compile from pathlib import Path from typing import Union, Optional, Tuple, Iterable, Callable, List @@ -19,32 +20,18 @@ from pandas import DataFrame, concat from collections import OrderedDict from warnings import warn +from orm.models import NoMatch -class Identifier(Abclog): - suffix = r'[0-9]+)[\.]*([0-9]*)' - pats = OrderedDict({ - ('RefSeq', 'model'): re_compile('(X[A-Z]{1}_%s' % suffix), - ('RefSeq', 'transcript'): re_compile(f'(NM_{suffix}'), - ('RefSeq', 'protein'): re_compile(f'(NP_{suffix}'), - ('Ensembl', 'gene'): re_compile(f'(ENS[A-Z]*G{suffix}'), - ('Ensembl', 'transcript'): re_compile(f'(ENS[A-Z]*T{suffix}'), - ('Ensembl', 'protein'): re_compile(f'(ENS[A-Z]*P{suffix}'), - ('UniProt', 'isoform'): re_compile(r'^((?=.*[A-Za-z])(?=.*\d)[A-Za-z\d]{6,})[\-]*([0-9]*)$') - }) - +class Identifier(Abclog, IdentifierBase): + ''' + Impl EBI Proteins API + ''' auto_assign_when_seq_conflict = False @classmethod @unsync - async def set_web_semaphore(cls, *web_semaphore_values): - if len(web_semaphore_values) == 0: - proteins_api, ensembl_api, eutils_api = 20, 20, 20 - elif len(web_semaphore_values) < 3: - num = web_semaphore_values[0] - proteins_api, ensembl_api, eutils_api = num, num, num - else: - proteins_api, ensembl_api, eutils_api = web_semaphore_values[:3] + async def set_web_semaphore(cls, proteins_api=20, ensembl_api=20, eutils_api=20): cls.proteins_api_web_semaphore = await init_semaphore(proteins_api) cls.ensembl_api_web_semaphore = await init_semaphore(ensembl_api) cls.eutils_api_web_semaphore = await init_semaphore(eutils_api) @@ -54,38 +41,23 @@ def set_folder(cls, folder: Union[Path, str]): cls.folder = Path(folder) cls.sqlite_api = ProteinsDB( "sqlite:///%s" % (init_folder_from_suffix(cls.folder, 'local_db')/"proteinsAPI.db")) - cls.proteins_api_folder = init_folder_from_suffix( - cls.folder, 'proteins/api/proteins') + cls.proteins_api_folder = cls.folder/'proteins/api/' + tuple(init_folder_from_suffixes( + cls.proteins_api_folder, ((i if i[-1] == '/' else i+'_') for i in ProteinsAPI.api_set))) cls.seq_folder = dict( RefSeq=init_folder_from_suffix(cls.folder, 'eutils/efetch'), Ensembl=init_folder_from_suffix(cls.folder, 'ensembl/sequence/id')) cls.ensembl_archive_folder = init_folder_from_suffix( cls.folder, 'ensembl/archive/id') - @classmethod - def get_type(cls, identifier: str): - for key, pat in cls.pats.items(): - res = pat.fullmatch(identifier) - if bool(res): - return key, res.groups() - - def __init__(self, identifier: str, folder: Optional[Union[Path, str]] = None): - try: - (self.source, self.level), (self.identifier, - self.version) = self.get_type(identifier) - self.raw_identifier = identifier - if folder is not None: - self.set_folder(folder) - getattr(self, 'sqlite_api') - except TypeError: - raise ValueError(f"Unexpected identifier type: {identifier}") - except AttributeError: - raise AttributeError( - "Please specify class variable `folder` via set_folder() first or pass `folder` in this method!") + def __post_init__(self): + super().__post_init__() + if not hasattr(self, 'sqlite_api'): + raise AttributeError("Please specify class variable `folder` via set_folder() first or pass `folder` in this method!") self.status = None def __repr__(self): - return f'<{self.source} {self.level} {self.identifier} {self.version}>' + return f'<{self.source} {self.level} {self.identifier} {self.identifier_suffix}>' @unsync async def set_status(self): @@ -176,25 +148,59 @@ async def query_from_DB_with_unp(self, table_name:str, columns: str = '*', exist return dbReferences_df, other_dbReferences_df, iso_df, features_df, int_df ''' - @classmethod @unsync - async def fetch_from_proteins_api(cls, suffix, identifier=None, params={}, rate=1.5): + async def fetch_from_proteins_api(self, suffix, id_suffix='', with_source:bool=False, params={}, rate=1.5): return await ProteinsAPI.single_retrieve( suffix=suffix, params=params, - folder=cls.proteins_api_folder, - semaphore=cls.proteins_api_web_semaphore, - identifier=identifier, + folder=self.proteins_api_folder, + semaphore=self.proteins_api_web_semaphore, + identifier=(f"{self.source}:" if with_source else '')+(self.raw_identifier if self.source in ('UniProt', 'Taxonomy') else self.identifier)+id_suffix, rate=rate) + @classmethod + def yield_mapping(cls, data): + if isinstance(data, list): + for sub in data: + yield from cls.yield_mapping_unit(sub) + elif isinstance(data, dict): + yield from cls.yield_mapping_unit(data) + + @staticmethod + def yield_mapping_unit(data): + for gnCoordinate in data['gnCoordinate']: + info = dict(accession=data['accession'], + ensemblGeneId=gnCoordinate['ensemblGeneId'], + ensemblTranscriptId=gnCoordinate['ensemblTranscriptId'], + ensemblTranslationId=gnCoordinate['ensemblTranslationId'], + chromosome=gnCoordinate['genomicLocation']['chromosome'], + #start=gnCoordinate['genomicLocation']['start'], + #end=gnCoordinate['genomicLocation']['end'], + reverseStrand=gnCoordinate['genomicLocation']['reverseStrand']) + for record in gnCoordinate['genomicLocation']['exon']: + to_flat = record.copy() + flatten_dict(to_flat, 'proteinLocation') + flatten_dict(to_flat, 'proteinLocation.begin') + flatten_dict(to_flat, 'proteinLocation.end') + flatten_dict(to_flat, 'genomeLocation') + flatten_dict(to_flat, 'genomeLocation.begin') + flatten_dict(to_flat, 'genomeLocation.end') + to_flat.update(info) + yield to_flat + @unsync - async def fetch_proteins_from_ProteinsAPI(self, reviewed='true', isoform=0): - res = await ProteinsAPI.pipe_summary(await ProteinsAPI.single_retrieve( + async def alignment_df(self, **kwargs): + assert self.source in ('Taxonomy', 'UniProt') + return DataFrame(self.yield_mapping( + await self.fetch_from_proteins_api('coordinates/', **kwargs).then(a_load_json))).rename(columns={'id': 'ensemblExonId'}) + + @unsync + async def fetch_proteins_from_ProteinsAPI(self, reviewed='true', isoform=0, **kwargs): + res = await ProteinsAPI.pipe_summary(await self.fetch_from_proteins_api( 'proteins/', - dict(offset=0, size=-1, reviewed=reviewed, isoform=isoform), - self.proteins_api_folder, - self.proteins_api_web_semaphore, - identifier=f'{self.source}:{self.identifier}' + with_source=True, + params=dict(offset=0, size=-1, reviewed=reviewed, isoform=isoform), + **kwargs ).then(a_load_json)) if res is None: return @@ -202,6 +208,52 @@ async def fetch_proteins_from_ProteinsAPI(self, reviewed='true', isoform=0): self.save_ProteinsAPI_data_to_DB(res, identifier=self.identifier) return res + @unsync + async def get_isoform_ob(self): + assert self.level == 'isoform' + try: + if self.identifier_suffix == '': + query_ob = await self.get_canonical_isoform_ob() + if query_ob is None: + return + else: + query_id = query_ob.isoform + else: + query_id = self.raw_identifier + return await self.sqlite_api.ALTERNATIVE_PRODUCTS.objects.get(isoform=query_id, sequenceStatus__in=('displayed', 'described')) + except NoMatch: + pass + + @unsync + async def get_canonical_isoform_ob(self): + assert self.level == 'isoform' + try: + return await self.sqlite_api.ALTERNATIVE_PRODUCTS.objects.get(accession=self.identifier, sequenceStatus='displayed') + except NoMatch: + pass + + @unsync + async def get_all_ref_identifiers(self, to_dataframe:bool=True, **kwargs): + if self.level == 'isoform': + if self.identifier_suffix == '': + c_ob = await self.get_canonical_isoform_ob() + if c_ob is None: + query_args = {'accession': self.identifier} + else: + query_args = {'isoform': c_ob.isoform} + else: + query_args = {'isoform': self.raw_identifier} + else: + query_args = {f"{self.level}__contains": self.identifier, 'type': self.source} + query_args.update(kwargs) + ret = await self.sqlite_api.DB_REFERENCES.objects.filter(**query_args).all() + if len(ret) == 0: + return None + if to_dataframe: + return DataFrame(ret) + else: + return ret + @unsync async def get_all_level_identifiers(self): try: @@ -210,7 +262,9 @@ async def get_all_level_identifiers(self): return dict(zip(('protein', 'transcript', 'gene'), await self.sqlite_api.database.fetch_one( query=f""" SELECT protein,transcript,gene FROM dbReferences - WHERE type == '{self.source}' AND ({self.level} == '{cur_id}' OR {self.level} LIKE '{cur_id}%')"""))) + WHERE type == '{self.source}' AND ({self.level} == '{cur_id}' + OR substr({self.level}, 0, instr({self.level}, '.')) == '{cur_id}' + )"""))) except TypeError: return @@ -224,7 +278,9 @@ async def map2unp_from_DB(self): res = await self.sqlite_api.database.fetch_one( query=f""" SELECT accession,isoform FROM dbReferences - WHERE type == '{self.source}' AND ({self.level} == '{cur_id}' OR {self.level} LIKE '{cur_id}%')""") + WHERE type == '{self.source}' AND ({self.level} == '{cur_id}' + OR substr({self.level}, 0, instr({self.level}, '.')) == '{cur_id}' + )""") if res is None: return else: @@ -292,12 +348,38 @@ async def fetch_sequence(self, newest: bool = True): self.seq_folder['Ensembl'], self.ensembl_api_web_semaphore).then(a_seq_reader) + @unsync + async def unp_is_canonical(self): + res = await self.query_from_DB_with_unp('ALTERNATIVE_PRODUCTS', columns='isoform,sequenceStatus') + if self.identifier_suffix == '': + return True + if res is None or res.shape[0] == 0: + if self.identifier_suffix != '1': + self.logger.warning(f'Possbile invalid isoform identifier {self.raw_identifier}') + return True + else: + focus = res[res.isoform.eq(self.raw_identifier) & res.sequenceStatus.isin(('described','displayed'))] + if focus.shape[0] == 0: + self.logger.error(f'Invalid isoform identifier {self.raw_identifier}') + return False + else: + assert focus.shape[0] == 1 + return focus.iloc[0]['sequenceStatus'] == 'displayed' + + @unsync + async def init(self): + if hasattr(self, 'inited'): + return self + else: + self.inited = True + await self.map2unp() + return self + @unsync async def map2unp(self, **kwargs): - if self.level == 'model': - return self.raw_identifier, 'NaN', 'NaN', False - elif self.source == 'UniProt': - return self.raw_identifier, self.identifier, self.raw_identifier, (self.raw_identifier == self.identifier) + if self.source == 'UniProt': + is_canonical = await self.unp_is_canonical() + return self.raw_identifier, self.identifier, (self.identifier if is_canonical else self.raw_identifier), is_canonical try: res = await self.map2unp_from_DB() except AssertionError: @@ -316,7 +398,9 @@ async def map2unp(self, **kwargs): class Identifiers(tuple): - def __new__(cls, iterable: Iterable): + '''immutable iterable class (tuple-like)''' + + def __new__(cls, iterable: Iterable=tuple()): return super(Identifiers, cls).__new__(cls, (Identifier(i) if isinstance(i, str) else i for i in iterable)) def __getitem__(self, slice): diff --git a/pdb_profiling/processors/rcsb/api.py b/pdb_profiling/processors/rcsb/api.py index 735bb30..5dcfaa4 100644 --- a/pdb_profiling/processors/rcsb/api.py +++ b/pdb_profiling/processors/rcsb/api.py @@ -4,14 +4,13 @@ # @Author: ZeFeng Zhu # @Last Modified: 2020-12-24 01:28:34 pm # @Copyright (c) 2020 MinghuiGroup, Soochow University -from pdb_profiling.log import Abclog from pdb_profiling.fetcher.webfetch import UnsyncFetch from hashlib import sha1 from pathlib import Path from typing import Union -class RCSBDataAPI(Abclog): +class RCSBDataAPI(object): root = 'https://data.rcsb.org/' rest_api_root = f'{root}rest/v1/core/' graphql_root = f'{root}graphql' @@ -32,10 +31,21 @@ def graphql_retrieve(cls, query, folder, semaphore, to_do_func=None, rate: float return UnsyncFetch.single_task(task=('get', dict(url=cls.graphql_root, params=dict(query=query), headers=cls.headers), Path(folder)/f'{sha1(bytes(query, encoding="utf-8")).hexdigest()}.json'), semaphore=semaphore, to_do_func=to_do_func, rate=rate) -class RCSBSearchAPI(Abclog): +class RCSBSearchAPI(object): root = 'https://search.rcsb.org/rcsbsearch/v1/query' headers = {'Connection': 'close', 'Content-Type': 'application/json;charset=UTF-8'} @classmethod def single_retrieve(cls, query, folder, semaphore, to_do_func=None, rate: float = 1.5): return UnsyncFetch.single_task(task=('get', dict(url=cls.root, params=dict(json=query), headers=cls.headers), Path(folder)/f'{sha1(bytes(query, encoding="utf-8")).hexdigest()}.json'), semaphore=semaphore, to_do_func=to_do_func, rate=rate) + + +class RCSB1DCoordinatesAPI(object): + root = 'https://1d-coordinates.rcsb.org/' + graphql_root = f'{root}graphql' + + headers = {'Connection': 'close', 'Content-Type': 'application/json;charset=UTF-8'} + + @classmethod + def graphql_retrieve(cls, query, folder, semaphore, to_do_func=None, rate: float = 1.5): + return UnsyncFetch.single_task(task=('get', dict(url=cls.graphql_root, params=dict(query=query), headers=cls.headers), Path(folder)/f'{sha1(bytes(query, encoding="utf-8")).hexdigest()}.json'), semaphore=semaphore, to_do_func=to_do_func, rate=rate) diff --git a/pdb_profiling/processors/recordbase.py b/pdb_profiling/processors/recordbase.py new file mode 100644 index 0000000..6c09f88 --- /dev/null +++ b/pdb_profiling/processors/recordbase.py @@ -0,0 +1,61 @@ +# @Created Date: 2021-04-22 07:23:06 pm +# @Filename: recordbase.py +# @Email: 1730416009@stu.suda.edu.cn +# @Author: ZeFeng Zhu +# @Last Modified: 2021-04-22 09:21:16 pm +# @Copyright (c) 2021 MinghuiGroup, Soochow University +from dataclasses import dataclass +from collections import OrderedDict +from re import compile as re_compile + +ID_SUFFIX = r'[0-9]+)[\.]*([0-9]*)' +PDB_COMMON_PREFIX = r'^((?=.*[A-Za-z])(?=.*\d)[A-Za-z\d]' +#RANGE_SUFFIX = r':([0-9]+)-*([0-9]*)$' + + +@dataclass +class IdentifierBase: + PATS = OrderedDict({ + re_compile(f'(NC_{ID_SUFFIX}'): ('RefSeq', 'genome'), + re_compile(r'([NX]{1}M_'+ID_SUFFIX): ('RefSeq', 'transcript'), + re_compile(r'([NX]{1}P_'+ID_SUFFIX): ('RefSeq', 'protein'), + re_compile(f'(ENS[A-Z]*E{ID_SUFFIX}'): ('Ensembl', 'exon'), + re_compile(f'(ENS[A-Z]*G{ID_SUFFIX}'): ('Ensembl', 'gene'), + re_compile(f'(ENS[A-Z]*T{ID_SUFFIX}'): ('Ensembl', 'transcript'), + re_compile(f'(ENS[A-Z]*P{ID_SUFFIX}'): ('Ensembl', 'protein'), + re_compile(r'(CCDS[0-9]+)'): ('CCDS', 'CCDS'), + re_compile(r'(rs[0-9]+)'): ('dbSNP', 'mutation'), + re_compile(r'^((?=.*[A-Za-z])(?=.*\d)[A-Za-z\d]{6,})[\-]*([0-9]*)$'): ('UniProt', 'isoform'), + re_compile(PDB_COMMON_PREFIX+r'{4})$'): ('PDB', 'entry'), + re_compile(PDB_COMMON_PREFIX+r'{4})-([0-9]+)$'): ('PDB', 'assembly'), + re_compile(PDB_COMMON_PREFIX+r'{4})_([0-9]+)$'): ('PDB', 'entity'), + re_compile(PDB_COMMON_PREFIX+r'{4})\.([A-Z]+)$'): ('PDB', 'instance'), + re_compile(PDB_COMMON_PREFIX+r'{4})/([0-9/]+)$'): ('PDB', 'entry_like'), + re_compile(r'(PDB-CPX-[0-9]+)'): ('PDB', 'complex'), + re_compile(r'([0-9]+)/([0-9]+)'): ('Taxonomy', 'genome'), + re_compile(r'([0-9]+)'): ('HGNC', 'HGNC'), + re_compile(r'([A-z0-9\-]+)'): ('PDB', 'compounds'), + }) + + raw_identifier: str + renew:bool = True + source: str = '' + level: str = '' + identifier: str = '' + identifier_suffix: str = '' + + @classmethod + def get_type(cls, raw_identifier: str): + for pat, group in cls.PATS.items(): + res = pat.fullmatch(raw_identifier) + if bool(res): + return group, res.groups() + raise AssertionError(f"Unexpected identifier type: {raw_identifier}") + + def __post_init__(self): + if self.renew: + (self.source, self.level), identifier_tp = self.get_type(self.raw_identifier) + if len(identifier_tp) == 2: + self.identifier, self.identifier_suffix = identifier_tp + else: + self.identifier, = identifier_tp diff --git a/pdb_profiling/utils.py b/pdb_profiling/utils.py index 04637a0..8ec302e 100644 --- a/pdb_profiling/utils.py +++ b/pdb_profiling/utils.py @@ -25,6 +25,7 @@ from operator import itemgetter from textdistance import overlap, sorensen from collections import Counter, OrderedDict +from warnings import warn """def to_interval(lyst: Union[Iterable, Iterator]) -> List: @@ -798,9 +799,12 @@ def unit(i1,i2): return tuple(unit(obs_range, unk_range))""" -def get_seq_seg(seq, ranges): +def get_seq_seg(seq, ranges, **kwargs): for start,end in ranges: - yield start, seq[start-1:end] + if end >= start: + yield start, seq[start-1:end] + else: + warn(f"{kwargs} -> Invalid Order: {ranges}, skip") def get_diff_index(lseq, lrange, rseq, rrange): diff --git a/setup.py b/setup.py index 3d10a8f..fe131a0 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ setup( name="pdb_profiling", - version='0.2.12', + version='0.3.2', packages=find_namespace_packages(), entry_points={'console_scripts': ['pdb_profiling=pdb_profiling.commands.command:Interface']}, @@ -46,10 +46,9 @@ long_description=readme, long_description_content_type="text/markdown", url="https://github.com/NatureGeorge/pdb-profiling", - python_requires=">=3.6.*", + python_requires=">=3.7.*", classifiers=[ "Operating System :: OS Independent", - "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8" ], diff --git a/test/pytest/demo_dir/graph-api/pdb/sequence_conservation/graph-api%pdb%sequence_conservation%+1cbs%1.json b/test/pytest/demo_dir/graph-api/pdb/sequence_conservation/graph-api%pdb%sequence_conservation%+1cbs%1.json new file mode 100644 index 0000000..0c744b6 --- /dev/null +++ b/test/pytest/demo_dir/graph-api/pdb/sequence_conservation/graph-api%pdb%sequence_conservation%+1cbs%1.json @@ -0,0 +1,177 @@ +{ + "1cbs": { + "entity_id": 1, + "length": 137, + "data": [ + { + "start": 1, + "end": 1, + "conservation_score": 1, + "tooltipContent": "Conservation score:1", + "amino": [ + { + "end": 1, + "letter": "P", + "proba": 0.428, + "start": 1, + "color": "#c0c000", + "tooltipContent": "Amino acid:PRO
Probability:42.80%" + }, + { + "end": 1, + "letter": "A", + "proba": 0.065, + "start": 1, + "color": "#80a0f0", + "tooltipContent": "Amino acid:ALA
Probability:6.50%" + }, + { + "end": 1, + "letter": "S", + "proba": 0.055, + "start": 1, + "color": "#15c015", + "tooltipContent": "Amino acid:SER
Probability:5.50%" + }, + { + "end": 1, + "letter": "E", + "proba": 0.042, + "start": 1, + "color": "#c048c0", + "tooltipContent": "Amino acid:GLU
Probability:4.20%" + }, + { + "end": 1, + "letter": "T", + "proba": 0.042, + "start": 1, + "color": "#15c015", + "tooltipContent": "Amino acid:THR
Probability:4.20%" + }, + { + "end": 1, + "letter": "G", + "proba": 0.041, + "start": 1, + "color": "#f09048", + "tooltipContent": "Amino acid:GLY
Probability:4.10%" + }, + { + "end": 1, + "letter": "K", + "proba": 0.039, + "start": 1, + "color": "#f01505", + "tooltipContent": "Amino acid:LYS
Probability:3.90%" + }, + { + "end": 1, + "letter": "D", + "proba": 0.035, + "start": 1, + "color": "#c048c0", + "tooltipContent": "Amino acid:ASP
Probability:3.50%" + }, + { + "end": 1, + "letter": "L", + "proba": 0.035, + "start": 1, + "color": "#80a0f0", + "tooltipContent": "Amino acid:LEU
Probability:3.50%" + }, + { + "end": 1, + "letter": "V", + "proba": 0.035, + "start": 1, + "color": "#80a0f0", + "tooltipContent": "Amino acid:VAL
Probability:3.50%" + }, + { + "end": 1, + "letter": "N", + "proba": 0.031, + "start": 1, + "color": "#15c015", + "tooltipContent": "Amino acid:ASN
Probability:3.10%" + }, + { + "end": 1, + "letter": "R", + "proba": 0.031, + "start": 1, + "color": "#f01505", + "tooltipContent": "Amino acid:ARG
Probability:3.10%" + }, + { + "end": 1, + "letter": "Q", + "proba": 0.027, + "start": 1, + "color": "#15c015", + "tooltipContent": "Amino acid:GLN
Probability:2.70%" + }, + { + "end": 1, + "letter": "I", + "proba": 0.024, + "start": 1, + "color": "#80a0f0", + "tooltipContent": "Amino acid:ILE
Probability:2.40%" + }, + { + "end": 1, + "letter": "F", + "proba": 0.015, + "start": 1, + "color": "#80a0f0", + "tooltipContent": "Amino acid:PHE
Probability:1.50%" + }, + { + "end": 1, + "letter": "H", + "proba": 0.014, + "start": 1, + "color": "#15a4a4", + "tooltipContent": "Amino acid:HIS
Probability:1.40%" + }, + { + "end": 1, + "letter": "Y", + "proba": 0.014, + "start": 1, + "color": "#15a4a4", + "tooltipContent": "Amino acid:TYR
Probability:1.40%" + }, + { + "end": 1, + "letter": "M", + "proba": 0.012, + "start": 1, + "color": "#80a0f0", + "tooltipContent": "Amino acid:MET
Probability:1.20%" + }, + { + "end": 1, + "letter": "C", + "proba": 0.011, + "start": 1, + "color": "#f08080", + "tooltipContent": "Amino acid:CYS
Probability:1.10%" + }, + { + "end": 1, + "letter": "W", + "proba": 0.005, + "start": 1, + "color": "#80a0f0", + "tooltipContent": "Amino acid:TRP
Probability:0.50%" + } + ], + "labelColor": "rgb(211,211,211)" + } + ] + } +} \ No newline at end of file diff --git a/test/pytest/demo_dir/graph-api/residue_mapping/graph-api%residue_mapping%+3pg7%1%251%256.json b/test/pytest/demo_dir/graph-api/residue_mapping/graph-api%residue_mapping%+3pg7%1%251%256.json new file mode 100644 index 0000000..7ae2cde --- /dev/null +++ b/test/pytest/demo_dir/graph-api/residue_mapping/graph-api%residue_mapping%+3pg7%1%251%256.json @@ -0,0 +1 @@ +{"3pg7":[{"entity_id":1,"chains":[{"auth_asym_id":"A","struct_asym_id":"A","residues":[{"residue_number":251,"author_residue_number":1810,"author_insertion_code":"","observed":"Y","features":{"UniProt":{"P21359-6":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1811,"unp_one_letter_code":"E","pdb_one_letter_code":"E"},"PRO_0000010773":{"identifier":"Neurofibromin","name":"Neurofibromin","unp_residue_number":1832,"unp_one_letter_code":"E","pdb_one_letter_code":"E"},"P21359":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1832,"unp_one_letter_code":"E","pdb_one_letter_code":"E"},"P21359-2":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1811,"unp_one_letter_code":"E","pdb_one_letter_code":"E"}},"Pfam":{},"InterPro":{"IPR011993":{"identifier":"PH-like domain superfamily","name":"PH-like domain superfamily"}},"CATH":{"2.30.29.30":{"homology":"Pleckstrin-homology domain (PH domain)/Phosphotyrosine-binding domain (PTB)","topology":"PH-domain like","architecture":"Roll","identifier":"PH-domain like","class":"Mainly Beta","name":"Neurofibromin. Chain: a, b. Synonym: neurofibromatosis-related proteinnf-1, neurofibromin truncated. Engineered: yes. Mutation: yes"}},"SCOP":{},"binding_sites":{},"FunPDBe":[{"origin":"POPScomp_PDBML","label":"hydrophilic SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":12.4057,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"POPScomp_PDBML","label":"total SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":46.0869,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"POPScomp_PDBML","label":"hydrophobic SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":33.6811,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"WEBnma","label":"fluctuations","url":"http://apps.cbu.uib.no/webnma3","raw_score":0.058,"confidence_score":null,"confidence_classification":"","evidence_codes":["ECO_0006139"]},{"origin":"p2rank","label":"pocket16","url":"http://prankweb.cz/analyze/id_noconser/3PG7","raw_score":0.2534,"confidence_score":0.0026,"confidence_classification":"low","evidence_codes":["ECO_0000246"]},{"origin":"depth","label":"monomeric_residue_depth","url":"http://cospi.iiserpune.ac.in/cospi/depth","raw_score":0.397,"confidence_score":0.074,"confidence_classification":"high","evidence_codes":["ECO_0000362"]},{"origin":"depth","label":"complex_residue_depth","url":"http://cospi.iiserpune.ac.in/cospi/depth","raw_score":0.44,"confidence_score":0.098,"confidence_classification":"high","evidence_codes":["ECO_0000362"]},{"origin":"dynamine","label":"efoldmine","url":"http://dynamine.ibsquare.be/","raw_score":0.033,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]},{"origin":"dynamine","label":"backbone","url":"http://dynamine.ibsquare.be/","raw_score":0.813,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]},{"origin":"dynamine","label":"sidechain","url":"http://dynamine.ibsquare.be/","raw_score":0.322,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]}]}},{"residue_number":252,"author_residue_number":1811,"author_insertion_code":"","observed":"Y","features":{"UniProt":{"P21359-6":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1812,"unp_one_letter_code":"L","pdb_one_letter_code":"L"},"PRO_0000010773":{"identifier":"Neurofibromin","name":"Neurofibromin","unp_residue_number":1833,"unp_one_letter_code":"L","pdb_one_letter_code":"L"},"P21359":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1833,"unp_one_letter_code":"L","pdb_one_letter_code":"L"},"P21359-2":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1812,"unp_one_letter_code":"L","pdb_one_letter_code":"L"}},"Pfam":{},"InterPro":{"IPR011993":{"identifier":"PH-like domain superfamily","name":"PH-like domain superfamily"}},"CATH":{"2.30.29.30":{"homology":"Pleckstrin-homology domain (PH domain)/Phosphotyrosine-binding domain (PTB)","topology":"PH-domain like","architecture":"Roll","identifier":"PH-domain like","class":"Mainly Beta","name":"Neurofibromin. Chain: a, b. Synonym: neurofibromatosis-related proteinnf-1, neurofibromin truncated. Engineered: yes. Mutation: yes"}},"SCOP":{},"binding_sites":{},"FunPDBe":[{"origin":"POPScomp_PDBML","label":"total SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":50.9769,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"POPScomp_PDBML","label":"hydrophobic SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":7.7873,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"POPScomp_PDBML","label":"hydrophilic SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":43.1895,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"WEBnma","label":"fluctuations","url":"http://apps.cbu.uib.no/webnma3","raw_score":0.046,"confidence_score":null,"confidence_classification":"","evidence_codes":["ECO_0006139"]},{"origin":"p2rank","label":"pocket16","url":"http://prankweb.cz/analyze/id_noconser/3PG7","raw_score":0.6461,"confidence_score":0.0147,"confidence_classification":"low","evidence_codes":["ECO_0000246"]},{"origin":"depth","label":"complex_residue_depth","url":"http://cospi.iiserpune.ac.in/cospi/depth","raw_score":0.488,"confidence_score":0.094,"confidence_classification":"high","evidence_codes":["ECO_0000362"]},{"origin":"depth","label":"monomeric_residue_depth","url":"http://cospi.iiserpune.ac.in/cospi/depth","raw_score":0.346,"confidence_score":0.039,"confidence_classification":"high","evidence_codes":["ECO_0000362"]},{"origin":"dynamine","label":"efoldmine","url":"http://dynamine.ibsquare.be/","raw_score":0.027,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]},{"origin":"dynamine","label":"sidechain","url":"http://dynamine.ibsquare.be/","raw_score":0.587,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]},{"origin":"dynamine","label":"backbone","url":"http://dynamine.ibsquare.be/","raw_score":0.783,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]}]}},{"residue_number":253,"author_residue_number":1812,"author_insertion_code":"","observed":"Y","features":{"UniProt":{"P21359-6":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1813,"unp_one_letter_code":"S","pdb_one_letter_code":"S"},"PRO_0000010773":{"identifier":"Neurofibromin","name":"Neurofibromin","unp_residue_number":1834,"unp_one_letter_code":"S","pdb_one_letter_code":"S"},"P21359":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1834,"unp_one_letter_code":"S","pdb_one_letter_code":"S"},"P21359-2":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1813,"unp_one_letter_code":"S","pdb_one_letter_code":"S"}},"Pfam":{},"InterPro":{"IPR011993":{"identifier":"PH-like domain superfamily","name":"PH-like domain superfamily"}},"CATH":{"2.30.29.30":{"homology":"Pleckstrin-homology domain (PH domain)/Phosphotyrosine-binding domain (PTB)","topology":"PH-domain like","architecture":"Roll","identifier":"PH-domain like","class":"Mainly Beta","name":"Neurofibromin. Chain: a, b. Synonym: neurofibromatosis-related proteinnf-1, neurofibromin truncated. Engineered: yes. Mutation: yes"}},"SCOP":{},"binding_sites":{},"FunPDBe":[{"origin":"POPScomp_PDBML","label":"total SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":40.6305,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"POPScomp_PDBML","label":"hydrophobic SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":18.6429,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"POPScomp_PDBML","label":"hydrophilic SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":21.9876,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"WEBnma","label":"fluctuations","url":"http://apps.cbu.uib.no/webnma3","raw_score":0.043,"confidence_score":null,"confidence_classification":"","evidence_codes":["ECO_0006139"]},{"origin":"p2rank","label":"pocket5","url":"http://prankweb.cz/analyze/id_noconser/3PG7","raw_score":1.2581,"confidence_score":0.0712,"confidence_classification":"low","evidence_codes":["ECO_0000246"]},{"origin":"depth","label":"complex_residue_depth","url":"http://cospi.iiserpune.ac.in/cospi/depth","raw_score":0.596,"confidence_score":0.022,"confidence_classification":"high","evidence_codes":["ECO_0000362"]},{"origin":"depth","label":"monomeric_residue_depth","url":"http://cospi.iiserpune.ac.in/cospi/depth","raw_score":0.347,"confidence_score":0.034,"confidence_classification":"high","evidence_codes":["ECO_0000362"]},{"origin":"dynamine","label":"efoldmine","url":"http://dynamine.ibsquare.be/","raw_score":0.02,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]},{"origin":"dynamine","label":"sidechain","url":"http://dynamine.ibsquare.be/","raw_score":0.513,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]},{"origin":"dynamine","label":"backbone","url":"http://dynamine.ibsquare.be/","raw_score":0.754,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]}]}},{"residue_number":254,"author_residue_number":1813,"author_insertion_code":"","observed":"Y","features":{"UniProt":{"P21359-2":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1814,"unp_one_letter_code":"Q","pdb_one_letter_code":"Q"},"P21359-6":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1814,"unp_one_letter_code":"Q","pdb_one_letter_code":"Q"},"PRO_0000010773":{"identifier":"Neurofibromin","name":"Neurofibromin","unp_residue_number":1835,"unp_one_letter_code":"Q","pdb_one_letter_code":"Q"},"P21359":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1835,"unp_one_letter_code":"Q","pdb_one_letter_code":"Q"}},"Pfam":{},"InterPro":{"IPR011993":{"identifier":"PH-like domain superfamily","name":"PH-like domain superfamily"}},"CATH":{"2.30.29.30":{"homology":"Pleckstrin-homology domain (PH domain)/Phosphotyrosine-binding domain (PTB)","topology":"PH-domain like","architecture":"Roll","identifier":"PH-domain like","class":"Mainly Beta","name":"Neurofibromin. Chain: a, b. Synonym: neurofibromatosis-related proteinnf-1, neurofibromin truncated. Engineered: yes. Mutation: yes"}},"SCOP":{},"binding_sites":{},"FunPDBe":[{"origin":"POPScomp_PDBML","label":"total SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":66.0982,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"POPScomp_PDBML","label":"hydrophilic SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":18.2528,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"POPScomp_PDBML","label":"hydrophobic SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":47.8453,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"WEBnma","label":"fluctuations","url":"http://apps.cbu.uib.no/webnma3","raw_score":0.053,"confidence_score":null,"confidence_classification":"","evidence_codes":["ECO_0006139"]},{"origin":"p2rank","label":"pocket5","url":"http://prankweb.cz/analyze/id_noconser/3PG7","raw_score":0.8951,"confidence_score":0.0305,"confidence_classification":"low","evidence_codes":["ECO_0000246"]},{"origin":"depth","label":"complex_residue_depth","url":"http://cospi.iiserpune.ac.in/cospi/depth","raw_score":0.439,"confidence_score":0.08,"confidence_classification":"high","evidence_codes":["ECO_0000362"]},{"origin":"depth","label":"monomeric_residue_depth","url":"http://cospi.iiserpune.ac.in/cospi/depth","raw_score":0.382,"confidence_score":0.038,"confidence_classification":"high","evidence_codes":["ECO_0000362"]},{"origin":"dynamine","label":"backbone","url":"http://dynamine.ibsquare.be/","raw_score":0.743,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]},{"origin":"dynamine","label":"sidechain","url":"http://dynamine.ibsquare.be/","raw_score":0.433,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]},{"origin":"dynamine","label":"efoldmine","url":"http://dynamine.ibsquare.be/","raw_score":0.009,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]}]}},{"residue_number":255,"author_residue_number":1814,"author_insertion_code":"","observed":"Y","features":{"UniProt":{"P21359-6":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1815,"unp_one_letter_code":"P","pdb_one_letter_code":"P"},"PRO_0000010773":{"identifier":"Neurofibromin","name":"Neurofibromin","unp_residue_number":1836,"unp_one_letter_code":"P","pdb_one_letter_code":"P"},"P21359":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1836,"unp_one_letter_code":"P","pdb_one_letter_code":"P"},"P21359-2":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1815,"unp_one_letter_code":"P","pdb_one_letter_code":"P"}},"Pfam":{},"InterPro":{"IPR011993":{"identifier":"PH-like domain superfamily","name":"PH-like domain superfamily"}},"CATH":{"2.30.29.30":{"homology":"Pleckstrin-homology domain (PH domain)/Phosphotyrosine-binding domain (PTB)","topology":"PH-domain like","architecture":"Roll","identifier":"PH-domain like","class":"Mainly Beta","name":"Neurofibromin. Chain: a, b. Synonym: neurofibromatosis-related proteinnf-1, neurofibromin truncated. Engineered: yes. Mutation: yes"}},"SCOP":{},"binding_sites":{},"FunPDBe":[{"origin":"POPScomp_PDBML","label":"hydrophilic SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":43.9323,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"POPScomp_PDBML","label":"hydrophobic SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":12.4365,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"POPScomp_PDBML","label":"total SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":56.3689,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"WEBnma","label":"fluctuations","url":"http://apps.cbu.uib.no/webnma3","raw_score":0.05,"confidence_score":null,"confidence_classification":"","evidence_codes":["ECO_0006139"]},{"origin":"p2rank","label":"pocket3","url":"http://prankweb.cz/analyze/id_noconser/3PG7","raw_score":1.0973,"confidence_score":0.0508,"confidence_classification":"low","evidence_codes":["ECO_0000246"]},{"origin":"depth","label":"monomeric_residue_depth","url":"http://cospi.iiserpune.ac.in/cospi/depth","raw_score":0.349,"confidence_score":0.033,"confidence_classification":"high","evidence_codes":["ECO_0000362"]},{"origin":"depth","label":"complex_residue_depth","url":"http://cospi.iiserpune.ac.in/cospi/depth","raw_score":0.391,"confidence_score":0.046,"confidence_classification":"high","evidence_codes":["ECO_0000362"]},{"origin":"dynamine","label":"efoldmine","url":"http://dynamine.ibsquare.be/","raw_score":0.009,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]},{"origin":"dynamine","label":"sidechain","url":"http://dynamine.ibsquare.be/","raw_score":0.51,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]},{"origin":"dynamine","label":"backbone","url":"http://dynamine.ibsquare.be/","raw_score":0.732,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]}]}},{"residue_number":256,"author_residue_number":1815,"author_insertion_code":"","observed":"Y","features":{"UniProt":{"P21359-6":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1816,"unp_one_letter_code":"D","pdb_one_letter_code":"D"},"PRO_0000010773":{"identifier":"Neurofibromin","name":"Neurofibromin","unp_residue_number":1837,"unp_one_letter_code":"D","pdb_one_letter_code":"D"},"P21359":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1837,"unp_one_letter_code":"D","pdb_one_letter_code":"D"},"P21359-2":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1816,"unp_one_letter_code":"D","pdb_one_letter_code":"D"}},"Pfam":{},"InterPro":{"IPR011993":{"identifier":"PH-like domain superfamily","name":"PH-like domain superfamily"}},"CATH":{"2.30.29.30":{"homology":"Pleckstrin-homology domain (PH domain)/Phosphotyrosine-binding domain (PTB)","topology":"PH-domain like","architecture":"Roll","identifier":"PH-domain like","class":"Mainly Beta","name":"Neurofibromin. Chain: a, b. Synonym: neurofibromatosis-related proteinnf-1, neurofibromin truncated. Engineered: yes. Mutation: yes"}},"SCOP":{},"binding_sites":{},"FunPDBe":[{"origin":"POPScomp_PDBML","label":"hydrophilic SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":15.7707,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"POPScomp_PDBML","label":"hydrophobic SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":42.0577,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"POPScomp_PDBML","label":"total SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":57.8285,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"WEBnma","label":"fluctuations","url":"http://apps.cbu.uib.no/webnma3","raw_score":0.054,"confidence_score":null,"confidence_classification":"","evidence_codes":["ECO_0006139"]},{"origin":"p2rank","label":"pocket3","url":"http://prankweb.cz/analyze/id_noconser/3PG7","raw_score":1.8683,"confidence_score":0.1807,"confidence_classification":"low","evidence_codes":["ECO_0000246"]},{"origin":"depth","label":"monomeric_residue_depth","url":"http://cospi.iiserpune.ac.in/cospi/depth","raw_score":0.326,"confidence_score":0.021,"confidence_classification":"high","evidence_codes":["ECO_0000362"]},{"origin":"depth","label":"complex_residue_depth","url":"http://cospi.iiserpune.ac.in/cospi/depth","raw_score":0.429,"confidence_score":0.056,"confidence_classification":"high","evidence_codes":["ECO_0000362"]},{"origin":"dynamine","label":"sidechain","url":"http://dynamine.ibsquare.be/","raw_score":0.288,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]},{"origin":"dynamine","label":"efoldmine","url":"http://dynamine.ibsquare.be/","raw_score":0.021,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]},{"origin":"dynamine","label":"backbone","url":"http://dynamine.ibsquare.be/","raw_score":0.742,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]}]}}]},{"auth_asym_id":"B","struct_asym_id":"B","residues":[{"residue_number":251,"author_residue_number":1810,"author_insertion_code":"","observed":"Y","features":{"UniProt":{"P21359-6":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1811,"unp_one_letter_code":"E","pdb_one_letter_code":"E"},"PRO_0000010773":{"identifier":"Neurofibromin","name":"Neurofibromin","unp_residue_number":1832,"unp_one_letter_code":"E","pdb_one_letter_code":"E"},"P21359":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1832,"unp_one_letter_code":"E","pdb_one_letter_code":"E"},"P21359-2":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1811,"unp_one_letter_code":"E","pdb_one_letter_code":"E"}},"Pfam":{},"InterPro":{"IPR011993":{"identifier":"PH-like domain superfamily","name":"PH-like domain superfamily"}},"CATH":{"2.30.29.30":{"homology":"Pleckstrin-homology domain (PH domain)/Phosphotyrosine-binding domain (PTB)","topology":"PH-domain like","architecture":"Roll","identifier":"PH-domain like","class":"Mainly Beta","name":"Neurofibromin. Chain: a, b. Synonym: neurofibromatosis-related proteinnf-1, neurofibromin truncated. Engineered: yes. Mutation: yes"}},"SCOP":{},"binding_sites":{},"FunPDBe":[{"origin":"POPScomp_PDBML","label":"hydrophilic SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":20.5566,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"POPScomp_PDBML","label":"total SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":65.6964,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"POPScomp_PDBML","label":"hydrophobic SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":45.1398,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"WEBnma","label":"fluctuations","url":"http://apps.cbu.uib.no/webnma3","raw_score":0.066,"confidence_score":null,"confidence_classification":"","evidence_codes":["ECO_0006139"]},{"origin":"p2rank","label":"pocket10","url":"http://prankweb.cz/analyze/id_noconser/3PG7","raw_score":0.3915,"confidence_score":0.0054,"confidence_classification":"low","evidence_codes":["ECO_0000246"]},{"origin":"depth","label":"complex_residue_depth","url":"http://cospi.iiserpune.ac.in/cospi/depth","raw_score":0.379,"confidence_score":0.06,"confidence_classification":"high","evidence_codes":["ECO_0000362"]},{"origin":"depth","label":"monomeric_residue_depth","url":"http://cospi.iiserpune.ac.in/cospi/depth","raw_score":0.378,"confidence_score":0.058,"confidence_classification":"high","evidence_codes":["ECO_0000362"]},{"origin":"3Dcomplex","label":"ASA_alone","url":"http://shmoo.weizmann.ac.il/elevy/3dcomplexV6/Home.cgi","raw_score":96.7,"confidence_score":null,"confidence_classification":"curated","evidence_codes":["ECO_0000053","ECO_0007194"]},{"origin":"3Dcomplex","label":"Surface_residue","url":"http://shmoo.weizmann.ac.il/elevy/3dcomplexV6/Home.cgi","raw_score":null,"confidence_score":null,"confidence_classification":"curated","evidence_codes":["ECO_0000053","ECO_0007194"]},{"origin":"3Dcomplex","label":"ASA_BiologicalUnit","url":"http://shmoo.weizmann.ac.il/elevy/3dcomplexV6/Home.cgi","raw_score":96.7,"confidence_score":null,"confidence_classification":"curated","evidence_codes":["ECO_0000053","ECO_0007194"]},{"origin":"dynamine","label":"efoldmine","url":"http://dynamine.ibsquare.be/","raw_score":0.033,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]},{"origin":"dynamine","label":"backbone","url":"http://dynamine.ibsquare.be/","raw_score":0.813,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]},{"origin":"dynamine","label":"sidechain","url":"http://dynamine.ibsquare.be/","raw_score":0.322,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]}]}},{"residue_number":252,"author_residue_number":1811,"author_insertion_code":"","observed":"Y","features":{"UniProt":{"P21359-6":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1812,"unp_one_letter_code":"L","pdb_one_letter_code":"L"},"PRO_0000010773":{"identifier":"Neurofibromin","name":"Neurofibromin","unp_residue_number":1833,"unp_one_letter_code":"L","pdb_one_letter_code":"L"},"P21359":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1833,"unp_one_letter_code":"L","pdb_one_letter_code":"L"},"P21359-2":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1812,"unp_one_letter_code":"L","pdb_one_letter_code":"L"}},"Pfam":{},"InterPro":{"IPR011993":{"identifier":"PH-like domain superfamily","name":"PH-like domain superfamily"}},"CATH":{"2.30.29.30":{"homology":"Pleckstrin-homology domain (PH domain)/Phosphotyrosine-binding domain (PTB)","topology":"PH-domain like","architecture":"Roll","identifier":"PH-domain like","class":"Mainly Beta","name":"Neurofibromin. Chain: a, b. Synonym: neurofibromatosis-related proteinnf-1, neurofibromin truncated. Engineered: yes. Mutation: yes"}},"SCOP":{},"binding_sites":{},"FunPDBe":[{"origin":"POPScomp_PDBML","label":"hydrophilic SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":44.623,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"POPScomp_PDBML","label":"total SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":54.4701,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"POPScomp_PDBML","label":"hydrophobic SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":9.847,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"WEBnma","label":"fluctuations","url":"http://apps.cbu.uib.no/webnma3","raw_score":0.059,"confidence_score":null,"confidence_classification":"","evidence_codes":["ECO_0006139"]},{"origin":"p2rank","label":"pocket10","url":"http://prankweb.cz/analyze/id_noconser/3PG7","raw_score":1.0748,"confidence_score":0.0484,"confidence_classification":"low","evidence_codes":["ECO_0000246"]},{"origin":"depth","label":"complex_residue_depth","url":"http://cospi.iiserpune.ac.in/cospi/depth","raw_score":0.44,"confidence_score":0.054,"confidence_classification":"high","evidence_codes":["ECO_0000362"]},{"origin":"depth","label":"monomeric_residue_depth","url":"http://cospi.iiserpune.ac.in/cospi/depth","raw_score":0.346,"confidence_score":0.038,"confidence_classification":"high","evidence_codes":["ECO_0000362"]},{"origin":"3Dcomplex","label":"ASA_alone","url":"http://shmoo.weizmann.ac.il/elevy/3dcomplexV6/Home.cgi","raw_score":144.6,"confidence_score":null,"confidence_classification":"curated","evidence_codes":["ECO_0000053","ECO_0007194"]},{"origin":"3Dcomplex","label":"Surface_residue","url":"http://shmoo.weizmann.ac.il/elevy/3dcomplexV6/Home.cgi","raw_score":null,"confidence_score":null,"confidence_classification":"curated","evidence_codes":["ECO_0000053","ECO_0007194"]},{"origin":"3Dcomplex","label":"ASA_BiologicalUnit","url":"http://shmoo.weizmann.ac.il/elevy/3dcomplexV6/Home.cgi","raw_score":144.6,"confidence_score":null,"confidence_classification":"curated","evidence_codes":["ECO_0000053","ECO_0007194"]},{"origin":"dynamine","label":"sidechain","url":"http://dynamine.ibsquare.be/","raw_score":0.587,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]},{"origin":"dynamine","label":"backbone","url":"http://dynamine.ibsquare.be/","raw_score":0.783,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]},{"origin":"dynamine","label":"efoldmine","url":"http://dynamine.ibsquare.be/","raw_score":0.027,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]}]}},{"residue_number":253,"author_residue_number":1812,"author_insertion_code":"","observed":"Y","features":{"UniProt":{"P21359-6":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1813,"unp_one_letter_code":"S","pdb_one_letter_code":"S"},"PRO_0000010773":{"identifier":"Neurofibromin","name":"Neurofibromin","unp_residue_number":1834,"unp_one_letter_code":"S","pdb_one_letter_code":"S"},"P21359":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1834,"unp_one_letter_code":"S","pdb_one_letter_code":"S"},"P21359-2":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1813,"unp_one_letter_code":"S","pdb_one_letter_code":"S"}},"Pfam":{},"InterPro":{"IPR011993":{"identifier":"PH-like domain superfamily","name":"PH-like domain superfamily"}},"CATH":{"2.30.29.30":{"homology":"Pleckstrin-homology domain (PH domain)/Phosphotyrosine-binding domain (PTB)","topology":"PH-domain like","architecture":"Roll","identifier":"PH-domain like","class":"Mainly Beta","name":"Neurofibromin. Chain: a, b. Synonym: neurofibromatosis-related proteinnf-1, neurofibromin truncated. Engineered: yes. Mutation: yes"}},"SCOP":{},"binding_sites":{},"FunPDBe":[{"origin":"POPScomp_PDBML","label":"total SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":46.0521,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"POPScomp_PDBML","label":"hydrophilic SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":23.2131,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"POPScomp_PDBML","label":"hydrophobic SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":22.839,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"WEBnma","label":"fluctuations","url":"http://apps.cbu.uib.no/webnma3","raw_score":0.048,"confidence_score":null,"confidence_classification":"","evidence_codes":["ECO_0006139"]},{"origin":"p2rank","label":"pocket5","url":"http://prankweb.cz/analyze/id_noconser/3PG7","raw_score":1.6788,"confidence_score":0.1421,"confidence_classification":"low","evidence_codes":["ECO_0000246"]},{"origin":"depth","label":"monomeric_residue_depth","url":"http://cospi.iiserpune.ac.in/cospi/depth","raw_score":0.356,"confidence_score":0.039,"confidence_classification":"high","evidence_codes":["ECO_0000362"]},{"origin":"depth","label":"complex_residue_depth","url":"http://cospi.iiserpune.ac.in/cospi/depth","raw_score":0.609,"confidence_score":0.029,"confidence_classification":"high","evidence_codes":["ECO_0000362"]},{"origin":"3Dcomplex","label":"ASA_alone","url":"http://shmoo.weizmann.ac.il/elevy/3dcomplexV6/Home.cgi","raw_score":89.1,"confidence_score":null,"confidence_classification":"curated","evidence_codes":["ECO_0000053","ECO_0007194"]},{"origin":"3Dcomplex","label":"Surface_residue","url":"http://shmoo.weizmann.ac.il/elevy/3dcomplexV6/Home.cgi","raw_score":null,"confidence_score":null,"confidence_classification":"curated","evidence_codes":["ECO_0000053","ECO_0007194"]},{"origin":"3Dcomplex","label":"ASA_BiologicalUnit","url":"http://shmoo.weizmann.ac.il/elevy/3dcomplexV6/Home.cgi","raw_score":89.1,"confidence_score":null,"confidence_classification":"curated","evidence_codes":["ECO_0000053","ECO_0007194"]},{"origin":"dynamine","label":"backbone","url":"http://dynamine.ibsquare.be/","raw_score":0.754,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]},{"origin":"dynamine","label":"efoldmine","url":"http://dynamine.ibsquare.be/","raw_score":0.02,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]},{"origin":"dynamine","label":"sidechain","url":"http://dynamine.ibsquare.be/","raw_score":0.513,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]}]}},{"residue_number":254,"author_residue_number":1813,"author_insertion_code":"","observed":"Y","features":{"UniProt":{"P21359-2":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1814,"unp_one_letter_code":"Q","pdb_one_letter_code":"Q"},"P21359-6":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1814,"unp_one_letter_code":"Q","pdb_one_letter_code":"Q"},"PRO_0000010773":{"identifier":"Neurofibromin","name":"Neurofibromin","unp_residue_number":1835,"unp_one_letter_code":"Q","pdb_one_letter_code":"Q"},"P21359":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1835,"unp_one_letter_code":"Q","pdb_one_letter_code":"Q"}},"Pfam":{},"InterPro":{"IPR011993":{"identifier":"PH-like domain superfamily","name":"PH-like domain superfamily"}},"CATH":{"2.30.29.30":{"homology":"Pleckstrin-homology domain (PH domain)/Phosphotyrosine-binding domain (PTB)","topology":"PH-domain like","architecture":"Roll","identifier":"PH-domain like","class":"Mainly Beta","name":"Neurofibromin. Chain: a, b. Synonym: neurofibromatosis-related proteinnf-1, neurofibromin truncated. Engineered: yes. Mutation: yes"}},"SCOP":{},"binding_sites":{},"FunPDBe":[{"origin":"POPScomp_PDBML","label":"hydrophobic SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":32.0514,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"POPScomp_PDBML","label":"total SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":45.5333,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"POPScomp_PDBML","label":"hydrophilic SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":13.4818,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"WEBnma","label":"fluctuations","url":"http://apps.cbu.uib.no/webnma3","raw_score":0.076,"confidence_score":null,"confidence_classification":"","evidence_codes":["ECO_0006139"]},{"origin":"p2rank","label":"pocket5","url":"http://prankweb.cz/analyze/id_noconser/3PG7","raw_score":1.0221,"confidence_score":0.0426,"confidence_classification":"low","evidence_codes":["ECO_0000246"]},{"origin":"depth","label":"monomeric_residue_depth","url":"http://cospi.iiserpune.ac.in/cospi/depth","raw_score":0.388,"confidence_score":0.041,"confidence_classification":"high","evidence_codes":["ECO_0000362"]},{"origin":"depth","label":"complex_residue_depth","url":"http://cospi.iiserpune.ac.in/cospi/depth","raw_score":0.417,"confidence_score":0.05,"confidence_classification":"high","evidence_codes":["ECO_0000362"]},{"origin":"3Dcomplex","label":"ASA_BiologicalUnit","url":"http://shmoo.weizmann.ac.il/elevy/3dcomplexV6/Home.cgi","raw_score":73.6,"confidence_score":null,"confidence_classification":"curated","evidence_codes":["ECO_0000053","ECO_0007194"]},{"origin":"3Dcomplex","label":"Surface_residue","url":"http://shmoo.weizmann.ac.il/elevy/3dcomplexV6/Home.cgi","raw_score":null,"confidence_score":null,"confidence_classification":"curated","evidence_codes":["ECO_0000053","ECO_0007194"]},{"origin":"3Dcomplex","label":"ASA_alone","url":"http://shmoo.weizmann.ac.il/elevy/3dcomplexV6/Home.cgi","raw_score":73.6,"confidence_score":null,"confidence_classification":"curated","evidence_codes":["ECO_0000053","ECO_0007194"]},{"origin":"dynamine","label":"backbone","url":"http://dynamine.ibsquare.be/","raw_score":0.743,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]},{"origin":"dynamine","label":"sidechain","url":"http://dynamine.ibsquare.be/","raw_score":0.433,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]},{"origin":"dynamine","label":"efoldmine","url":"http://dynamine.ibsquare.be/","raw_score":0.009,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]}]}},{"residue_number":255,"author_residue_number":1814,"author_insertion_code":"","observed":"Y","features":{"UniProt":{"P21359-6":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1815,"unp_one_letter_code":"P","pdb_one_letter_code":"P"},"PRO_0000010773":{"identifier":"Neurofibromin","name":"Neurofibromin","unp_residue_number":1836,"unp_one_letter_code":"P","pdb_one_letter_code":"P"},"P21359":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1836,"unp_one_letter_code":"P","pdb_one_letter_code":"P"},"P21359-2":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1815,"unp_one_letter_code":"P","pdb_one_letter_code":"P"}},"Pfam":{},"InterPro":{"IPR011993":{"identifier":"PH-like domain superfamily","name":"PH-like domain superfamily"}},"CATH":{"2.30.29.30":{"homology":"Pleckstrin-homology domain (PH domain)/Phosphotyrosine-binding domain (PTB)","topology":"PH-domain like","architecture":"Roll","identifier":"PH-domain like","class":"Mainly Beta","name":"Neurofibromin. Chain: a, b. Synonym: neurofibromatosis-related proteinnf-1, neurofibromin truncated. Engineered: yes. Mutation: yes"}},"SCOP":{},"binding_sites":{},"FunPDBe":[{"origin":"POPScomp_PDBML","label":"hydrophilic SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":94.6952,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"POPScomp_PDBML","label":"total SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":116.052,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"POPScomp_PDBML","label":"hydrophobic SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":21.3567,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"WEBnma","label":"fluctuations","url":"http://apps.cbu.uib.no/webnma3","raw_score":0.119,"confidence_score":null,"confidence_classification":"","evidence_codes":["ECO_0006139"]},{"origin":"p2rank","label":"pocket5","url":"http://prankweb.cz/analyze/id_noconser/3PG7","raw_score":0.4546,"confidence_score":0.0072,"confidence_classification":"low","evidence_codes":["ECO_0000246"]},{"origin":"depth","label":"monomeric_residue_depth","url":"http://cospi.iiserpune.ac.in/cospi/depth","raw_score":0.348,"confidence_score":0.03,"confidence_classification":"high","evidence_codes":["ECO_0000362"]},{"origin":"depth","label":"complex_residue_depth","url":"http://cospi.iiserpune.ac.in/cospi/depth","raw_score":0.379,"confidence_score":0.051,"confidence_classification":"high","evidence_codes":["ECO_0000362"]},{"origin":"3Dcomplex","label":"Surface_residue","url":"http://shmoo.weizmann.ac.il/elevy/3dcomplexV6/Home.cgi","raw_score":null,"confidence_score":null,"confidence_classification":"curated","evidence_codes":["ECO_0000053","ECO_0007194"]},{"origin":"3Dcomplex","label":"ASA_alone","url":"http://shmoo.weizmann.ac.il/elevy/3dcomplexV6/Home.cgi","raw_score":113.1,"confidence_score":null,"confidence_classification":"curated","evidence_codes":["ECO_0000053","ECO_0007194"]},{"origin":"3Dcomplex","label":"ASA_BiologicalUnit","url":"http://shmoo.weizmann.ac.il/elevy/3dcomplexV6/Home.cgi","raw_score":113.1,"confidence_score":null,"confidence_classification":"curated","evidence_codes":["ECO_0000053","ECO_0007194"]},{"origin":"dynamine","label":"efoldmine","url":"http://dynamine.ibsquare.be/","raw_score":0.009,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]},{"origin":"dynamine","label":"sidechain","url":"http://dynamine.ibsquare.be/","raw_score":0.51,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]},{"origin":"dynamine","label":"backbone","url":"http://dynamine.ibsquare.be/","raw_score":0.732,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]}]}},{"residue_number":256,"author_residue_number":1815,"author_insertion_code":"","observed":"Y","features":{"UniProt":{"P21359-6":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1816,"unp_one_letter_code":"D","pdb_one_letter_code":"D"},"PRO_0000010773":{"identifier":"Neurofibromin","name":"Neurofibromin","unp_residue_number":1837,"unp_one_letter_code":"D","pdb_one_letter_code":"D"},"P21359":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1837,"unp_one_letter_code":"D","pdb_one_letter_code":"D"},"P21359-2":{"identifier":"NF1_HUMAN","name":"NF1_HUMAN","unp_residue_number":1816,"unp_one_letter_code":"D","pdb_one_letter_code":"D"}},"Pfam":{},"InterPro":{"IPR011993":{"identifier":"PH-like domain superfamily","name":"PH-like domain superfamily"}},"CATH":{"2.30.29.30":{"homology":"Pleckstrin-homology domain (PH domain)/Phosphotyrosine-binding domain (PTB)","topology":"PH-domain like","architecture":"Roll","identifier":"PH-domain like","class":"Mainly Beta","name":"Neurofibromin. Chain: a, b. Synonym: neurofibromatosis-related proteinnf-1, neurofibromin truncated. Engineered: yes. Mutation: yes"}},"SCOP":{},"binding_sites":{},"FunPDBe":[{"origin":"POPScomp_PDBML","label":"hydrophobic SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":52.2278,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"POPScomp_PDBML","label":"hydrophilic SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":25.1625,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"POPScomp_PDBML","label":"total SASA [A^2]","url":"https://github.com/Fraternalilab/POPScomp","raw_score":77.3903,"confidence_score":0.9,"confidence_classification":"high","evidence_codes":["ECO_0000246"]},{"origin":"WEBnma","label":"fluctuations","url":"http://apps.cbu.uib.no/webnma3","raw_score":0.216,"confidence_score":null,"confidence_classification":"","evidence_codes":["ECO_0006139"]},{"origin":"p2rank","label":"pocket4","url":"http://prankweb.cz/analyze/id_noconser/3PG7","raw_score":1.2144,"confidence_score":0.0654,"confidence_classification":"low","evidence_codes":["ECO_0000246"]},{"origin":"depth","label":"monomeric_residue_depth","url":"http://cospi.iiserpune.ac.in/cospi/depth","raw_score":0.328,"confidence_score":0.031,"confidence_classification":"high","evidence_codes":["ECO_0000362"]},{"origin":"depth","label":"complex_residue_depth","url":"http://cospi.iiserpune.ac.in/cospi/depth","raw_score":0.401,"confidence_score":0.05,"confidence_classification":"high","evidence_codes":["ECO_0000362"]},{"origin":"3Dcomplex","label":"ASA_BiologicalUnit","url":"http://shmoo.weizmann.ac.il/elevy/3dcomplexV6/Home.cgi","raw_score":205.9,"confidence_score":null,"confidence_classification":"curated","evidence_codes":["ECO_0000053","ECO_0007194"]},{"origin":"3Dcomplex","label":"Surface_residue","url":"http://shmoo.weizmann.ac.il/elevy/3dcomplexV6/Home.cgi","raw_score":null,"confidence_score":null,"confidence_classification":"curated","evidence_codes":["ECO_0000053","ECO_0007194"]},{"origin":"3Dcomplex","label":"ASA_alone","url":"http://shmoo.weizmann.ac.il/elevy/3dcomplexV6/Home.cgi","raw_score":205.9,"confidence_score":null,"confidence_classification":"curated","evidence_codes":["ECO_0000053","ECO_0007194"]},{"origin":"dynamine","label":"backbone","url":"http://dynamine.ibsquare.be/","raw_score":0.742,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]},{"origin":"dynamine","label":"sidechain","url":"http://dynamine.ibsquare.be/","raw_score":0.288,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]},{"origin":"dynamine","label":"efoldmine","url":"http://dynamine.ibsquare.be/","raw_score":0.021,"confidence_score":0.5,"confidence_classification":"","evidence_codes":["ECO_0000364","ECO_0000203"]}]}}]}]}]} \ No newline at end of file diff --git a/test/pytest/test_command.py b/test/pytest/test_command.py index 1e7ba3d..b1be4b6 100644 --- a/test/pytest/test_command.py +++ b/test/pytest/test_command.py @@ -9,16 +9,17 @@ def test_command(): runner = CliRunner() + dargs = ['--folder', 'test/pytest/demo_dir'] for task in ('insert-mutation --input test/pytest/data/mutation.tsv --usecols Alt,Pos,Ref,ftId', - 'id-mapping', + 'id-mapping --auto_assign', 'check-muta-conflict', 'sifts-mapping --chunksize 15', - 'insert-sele-mapping --input pipe_select_mo.tsv', - 'residue-mapping --input pipe_select_mo.tsv', + 'insert-sele-mapping --input test/pytest/demo_dir/pipe_select_mo.tsv', + 'residue-mapping --input test/pytest/demo_dir/pipe_select_mo.tsv', 'export-mutation-mapping -o e_resmap.tsv --sele', - 'insert-sele-mutation-mapping -i e_resmap.tsv', + 'insert-sele-mutation-mapping -i test/pytest/demo_dir/e_resmap.tsv', 'sifts-mapping --func pipe_select_smr_mo --chunksize 10', - 'insert-smr-mapping -i pipe_select_smr_mo.tsv', + 'insert-smr-mapping -i test/pytest/demo_dir/pipe_select_smr_mo.tsv', 'export-smr-mutation-mapping -o e_smr_resmap.tsv --sele', 'insert-sifts-meta --input test/pytest/data/pdb_demo.tsv --api_suffix api/mappings/pfam/', 'insert-isoform-range', @@ -32,5 +33,5 @@ def test_command(): '-m', 'post', '-t', 'A_90_B_10'], ): - result = runner.invoke(Interface, task.split(' ') if not isinstance(task, list) else task) + result = runner.invoke(Interface, dargs+task.split(' ') if not isinstance(task, list) else dargs+task) assert result.exit_code == 0, str(task) diff --git a/test/pytest/test_sele.py b/test/pytest/test_sele.py index 950f2e0..b0fa7ac 100644 --- a/test/pytest/test_sele.py +++ b/test/pytest/test_sele.py @@ -5,13 +5,16 @@ # @Last Modified: 2021-03-15 09:07:11 pm # @Copyright (c) 2021 MinghuiGroup, Soochow University from pdb_profiling import default_config +from pdb_profiling.utils import a_load_json, a_concat +from pdb_profiling.processors import * from rich.progress import track +from pandas import DataFrame import pytest -default_config() +default_config('test/pytest/demo_dir') -@pytest.mark.timeout(90) +@pytest.mark.timeout(60) def test_init(): from pdb_profiling.processors.i3d.api import Interactome3D Interactome3D.pipe_init_interaction_meta().result() @@ -19,22 +22,18 @@ def test_init(): @pytest.mark.timeout(240) def test_single_select(): - from pdb_profiling.processors import SIFTS # SIFTS.chain_filter, SIFTS.entry_filter = '', '' demo = SIFTS('P21359-2') - demo.unp_is_canonical().result() demo.pipe_base().then(SIFTS.double_check_conflict_and_range).result() - demo.pipe_select_mo().result() - #demo.pipe_select_smr_mo(sifts_mo_df=df1).result() + demo.pipe_scheduled_ranged_map_res_df().result() demo.pipe_select_ho(run_as_completed=True, progress_bar=track).result() demo.pipe_select_he(run_as_completed=True, progress_bar=track).result() demo.pipe_select_ho_iso(run_as_completed=True).result() demo.pipe_select_else(func='pipe_protein_ligand_interface', css_cutoff=0.5, run_as_completed=True).result() -@pytest.mark.timeout(120) +@pytest.mark.timeout(90) def test_identifiers(): - from pdb_profiling.processors import Identifiers, Identifier demo = Identifiers([ 'ENSP00000491589', 'ENST00000379268', 'ENSP00000427757', 'ENSP00000266732', @@ -43,22 +42,22 @@ def test_identifiers(): 'ENST00000371100', 'ENST00000401731', 'ENSP00000387612']) demo.fetch('map2unp').run().result() - Identifier('P21359-3').fetch_from_proteins_api('coordinates/location/', identifier='P21359-3:550').result() + Identifier('P21359-3').fetch_from_proteins_api('coordinates/location/', ':550').result() + Identifier('P21359-2').init().result().get_isoform_ob().result() + Identifier('P21359').get_all_ref_identifiers().result() + Identifier('P21359').alignment_df().result() -@pytest.mark.timeout(120) +@pytest.mark.timeout(60) def test_uniprots_alt(): - from pdb_profiling.processors import UniProts, Identifiers - from pdb_profiling.utils import a_concat UniProts.fetch_VAR_SEQ_from_DB(('Q5VST9', 'Q5JWF2', 'P08631', 'O92972'), via_txt=True).result() demo_unps = ('Q5VST9', 'Q5JWF2', 'P21359', 'P68871', 'P63092', 'Q29960') Identifiers(demo_unps).query_from_DB_with_unps('ALTERNATIVE_PRODUCTS').run().then(a_concat).result() -@pytest.mark.timeout(180) +@pytest.mark.timeout(120) def test_other_api(): - from pdb_profiling.processors import PDB, SIFTS from pdb_profiling.processors.pdbe.api import PDBVersioned, PDBeKBAnnotations pdb_ob = PDB('1a01') pdb_ob.status @@ -68,7 +67,7 @@ def test_other_api(): pdb_ob.fetch_from_pdbe_api('api/pdb/entry/secondary_structure/').result() pdb_ob.fetch_from_pdbe_api('api/pdb/entry/files/').result() pdb_ob.fetch_from_pdbe_api('graph-api/pdb/funpdbe_annotation/').result() - pdb_ob.fetch_from_pdbe_api('graph-api/pdb/sequence_conservation/').result() + pdb_ob.fetch_from_pdbe_api('graph-api/pdb/sequence_conservation/', mask_id='1cbs/1').result() pdb_ob.fetch_from_pdbe_api('api/validation/RNA_pucker_suite_outliers/entry/').result() pdb_ob.fetch_from_pdbe_api('api/validation/rama_sidechain_listing/entry/').result() PDB('4zai').fetch_from_PDBArchive('obsolete/mmCIF/', PDB.cif2residue_listing).result() @@ -78,11 +77,11 @@ def test_other_api(): bm_df = pdb_ob.get_bound_molecules().result() [pdb_ob.get_bound_molecule_interaction(bm_id).result() for bm_id in bm_df.bm_id.unique()[:2]] SIFTS('P21359-2').fetch_from_pdbe_api('graph-api/uniprot/superposition/', SIFTS.to_dataframe).result() + PDBAssembly('1a01/1').add_args().assembly_summary -@pytest.mark.timeout(65) +@pytest.mark.timeout(70) def test_pdbekdb_self_annotation(): - from pdb_profiling.processors import SIFTS """from pdb_profiling.processors.pdbe.api import PDBeKBAnnotations PDBeKBAnnotations.root = PDBeKBAnnotations.ftp_root assert PDB('12ca').pipe_pdbekb_annotations('MetalPDB/').result() is not None @@ -92,16 +91,13 @@ def test_pdbekdb_self_annotation(): @pytest.mark.timeout(60) def test_fetch_residue_mapping(): - from pdb_profiling.processors import SIFTS - pdb_ob = SIFTS('1a01') - pdb_ob.fetch_residue_mapping(entity_id=1, start=20, end=25).result() - pdb_ob.fetch_residue_mapping(entity_id=1, start=24, end=27).result() + pdb_ob = SIFTS('3pg7') + pdb_ob.fetch_residue_mapping(entity_id=1, start=251, end=256).result() + pdb_ob.fetch_residue_mapping(entity_id=1, start=252, end=255).result() -@pytest.mark.timeout(90) +@pytest.mark.timeout(60) def test_rcsb_data_api(): - from pdb_profiling.processors import PDB, PDBAssemble - from pdb_profiling.utils import a_load_json pdb_id = '3hl2' ob = PDB(pdb_id) assembly_ids = ob.fetch_from_rcsb_api( @@ -111,7 +107,7 @@ def test_rcsb_data_api(): json=True).result()['data']['entry']['rcsb_entry_container_identifiers']['assembly_ids'] for assembly_id in assembly_ids: - data = PDBAssemble(f'{pdb_id}/{assembly_id}').fetch_from_rcsb_api('assembly/', then_func=a_load_json, json=True).result() + data = PDBAssembly(f'{pdb_id}/{assembly_id}').fetch_from_rcsb_api('assembly/', then_func=a_load_json, json=True).result() data['pdbx_struct_assembly_gen'] data['pdbx_struct_oper_list'] @@ -121,16 +117,14 @@ def test_rcsb_data_api(): assert (df1.merge(df2).shape) == df1.shape -@pytest.mark.timeout(60) +@pytest.mark.timeout(40) def test_rcsb_cluster_membership(): - from pdb_profiling.processors import PDB PDB('2d4q').rcsb_cluster_membership(entity_id=1, identity_cutoff=100).result() PDB('2e2x').rcsb_cluster_membership(entity_id=1, identity_cutoff=100).result() -@pytest.mark.timeout(60) +@pytest.mark.timeout(40) def test_other_SIFTS_func(): - from pdb_profiling.processors import SIFTS try: SIFTS('P21359').fetch_from_pdbe_api('api/mappings/all_isoforms/' ).then(SIFTS.to_dataframe @@ -142,9 +136,8 @@ def test_other_SIFTS_func(): pass -@pytest.mark.timeout(60) +@pytest.mark.timeout(30) def test_get_sequence(): - from pdb_profiling.processors import PDB ob = PDB('4u2v') ob.get_sequence(entity_id=1).result() ob.get_sequence(mode='raw_seq', entity_id=1).result() @@ -152,3 +145,9 @@ def test_get_sequence(): ob.get_sequence(mode='mod_x_seq', entity_id=1).result() ob.get_sequence(struct_asym_id='A').result() ob.get_sequence(chain_id='A').result() + + +@pytest.mark.timeout(20) +def test_show_rcsb_error(): + #assert RCSB1DCoordinates('6OB3.B').alignment_df('NCBI_GENOME').result() is not None + assert RCSB1DCoordinates('P21359').alignment_df('PDB_INSTANCE').result() is not None