From b3331fab03ad46d3db100a3208de9b1a5bc4f080 Mon Sep 17 00:00:00 2001 From: Zefeng Zhu <414731811@qq.com> Date: Thu, 20 May 2021 17:15:27 +0800 Subject: [PATCH] =?UTF-8?q?=F0=9F=A4=BFtowards=20v0.3.3=20(#16)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 💦opt for pisa * loose time * 🚋add export-interaction-mapping command * pdb_start -> pdb_beg --- pdb_profiling/__init__.py | 2 +- pdb_profiling/commands/__init__.py | 29 +- pdb_profiling/commands/command.py | 112 ++++- pdb_profiling/processors/pdbe/__init__.py | 4 +- pdb_profiling/processors/pdbe/api.py | 5 +- pdb_profiling/processors/pdbe/record.py | 505 +++++++++++++------- pdb_profiling/processors/proteins/record.py | 10 +- pdb_profiling/processors/swissmodel/api.py | 11 +- setup.py | 4 +- test/pytest/test_command.py | 3 + test/pytest/test_sele.py | 9 +- 11 files changed, 487 insertions(+), 207 deletions(-) diff --git a/pdb_profiling/__init__.py b/pdb_profiling/__init__.py index 0900a28..951d201 100644 --- a/pdb_profiling/__init__.py +++ b/pdb_profiling/__init__.py @@ -4,7 +4,7 @@ # @Author: ZeFeng Zhu # @Last Modified: 2020-05-13 08:54:09 pm # @Copyright (c) 2020 MinghuiGroup, Soochow University -__version__ = '0.3.2' +__version__ = '0.3.3' def default_config(folder='./'): diff --git a/pdb_profiling/commands/__init__.py b/pdb_profiling/commands/__init__.py index ff8387d..7b09cfb 100644 --- a/pdb_profiling/commands/__init__.py +++ b/pdb_profiling/commands/__init__.py @@ -108,7 +108,7 @@ class ResidueAnnotation(orm.Model): chain_id = orm.String(max_length=10) resource = orm.String(max_length=100, primary_key=True) resource_id = orm.String(max_length=200, primary_key=True) - pdb_start = orm.Integer() + pdb_beg = orm.Integer() pdb_end = orm.Integer() class UniProtAnnotation(orm.Model): @@ -118,7 +118,7 @@ class UniProtAnnotation(orm.Model): UniProt = orm.String(max_length=50, primary_key=True) resource = orm.String(max_length=100, primary_key=True) resource_id = orm.String(max_length=200, primary_key=True) - unp_start = orm.Integer() + unp_beg = orm.Integer() unp_end = orm.Integer() class SMRModel(orm.Model): @@ -145,7 +145,29 @@ class MappedMutation(orm.Model): Ref = orm.String(max_length=3, primary_key=True) Pos = orm.Integer(primary_key=True) Alt = orm.String(max_length=3, primary_key=True) - + + + class PI(orm.Model): + __tablename__ = 'PI' + __metadata__ = self.metadata + __database__ = self.database + + UniProt = orm.String(max_length=50, primary_key=True) + pdb_id = orm.String(max_length=4, primary_key=True) + entity_id = orm.Integer(primary_key=True) + struct_asym_id = orm.String(max_length=10, primary_key=True) + chain_id = orm.String(max_length=10) + assembly_id = orm.Integer(primary_key=True) + model_id = orm.Integer() + struct_asym_id_in_assembly = orm.String(max_length=10, primary_key=True) + interface_id = orm.Integer(primary_key=True) + css = orm.Float() + i_select_tag = orm.Boolean() + i_select_rank = orm.Integer() + pdb_beg = orm.Integer() + pdb_end = orm.Integer() + + self.AAThree2one = AAThree2one self.UniProtSeq = UniProtSeq self.Mutation = Mutation @@ -156,3 +178,4 @@ class MappedMutation(orm.Model): self.ResidueAnnotation = ResidueAnnotation self.SMRModel = SMRModel self.MappedMutation = MappedMutation + self.PI = PI diff --git a/pdb_profiling/commands/command.py b/pdb_profiling/commands/command.py index 0f148cf..02c935a 100644 --- a/pdb_profiling/commands/command.py +++ b/pdb_profiling/commands/command.py @@ -145,6 +145,7 @@ def id_mapping(ctx, input, column, sep, chunksize, auto_assign, sleep): if sleep: tsleep(uniform(1, 10)) + @Interface.command('check-muta-conflict') @click.option('--chunksize', type=int, default=100000) @click.pass_context @@ -198,7 +199,7 @@ def get_unp_id(args): Entry, isoform, is_canonical = args return Entry if is_canonical else isoform - kwargs = dict(sub.split('=') for item in kwargs for sub in item.split(',')) + kwargs = dict(sub.split('=') for item in kwargs for sub in item.split(';')) if len(kwargs) > 0: for key,value in kwargs.items(): kwargs[key] = eval(value) @@ -305,7 +306,7 @@ def residue_mapping(ctx, input, chunksize, output, sleep): sqlite_api.sync_insert(sqlite_api.ResidueMappingRange, res_mapping_df.to_dict('records')) console.log(f'Done: {done}') if sleep: - tsleep(uniform(0, 3)) + tsleep(uniform(0, 2)) @Interface.command('insert-sele-mapping') @@ -383,6 +384,52 @@ def expand_iso_range(res): console.log(f'Done: {len(res)+chunksize*i}') +def pi2records(dfrm: DataFrame, usecols: list, pair_cols: list): + yield from yield_interact_records(dfrm[usecols[:13]].rename(columns=dict(zip(usecols[6:13], pair_cols)))) + yield from yield_interact_records(dfrm[usecols[:6]+usecols[13:]].rename(columns=dict(zip(usecols[13:], pair_cols)))) + +def yield_interact_records(dfrm: DataFrame): + if 'UniProt' in dfrm.columns: + for row in dfrm.itertuples(index=False): + for beg, end in eval(row.interface_range): + yield dict(UniProt=row.UniProt, pdb_id=row.pdb_id, entity_id=row.entity_id, + struct_asym_id=row.struct_asym_id, chain_id=row.chain_id, + assembly_id=row.assembly_id, model_id=row.model_id, + struct_asym_id_in_assembly=row.struct_asym_id_in_assembly, + interface_id=row.interface_id, css=row.css, + i_select_tag=row.i_select_tag, i_select_rank=row.i_select_rank, + pdb_beg=beg, pdb_end=end) + else: + for row in dfrm.itertuples(index=False): + for beg, end in eval(row.interface_range): + yield dict(UniProt='NaN', pdb_id=row.pdb_id, entity_id=row.entity_id, + struct_asym_id=row.struct_asym_id, chain_id=row.chain_id, + assembly_id=row.assembly_id, model_id=row.model_id, + struct_asym_id_in_assembly=row.struct_asym_id_in_assembly, + interface_id=row.interface_id, css=row.css, + i_select_tag=row.i_select_tag, i_select_rank=row.i_select_rank, + pdb_beg=beg, pdb_end=end) + +@Interface.command('insert-interaction') +@click.option('-i', '--input', type=click.Path()) +@click.option('--chunksize', type=int, help="the chunksize parameter", default=5000) +@click.option('--ppi/--no-ppi', is_flag=True, default=True) +@click.pass_context +def insert_interaction(ctx, input, chunksize, ppi): + custom_db = ctx.obj['custom_db'] + common_cols = ['pdb_id', 'assembly_id', 'interface_id', 'css', 'i_select_tag', 'i_select_rank'] + pair_cols = ['entity_id', 'struct_asym_id', 'chain_id', 'model_id', 'struct_asym_id_in_assembly', 'interface_range', 'UniProt'] + usecols = common_cols + [col+'_1' for col in pair_cols] + [col+'_2' for col in pair_cols] + df_usecols = usecols if ppi else usecols[:-1] + dfs = read_csv(input, sep='\t', keep_default_na=False, na_values=[''], chunksize=chunksize, usecols=df_usecols) + done: int = 0 + with console.status("[bold green]inserting..."): + for df in dfs: + custom_db.sync_insert(custom_db.PI, list(pi2records(df[df.i_select_rank.ne(-1)], usecols, pair_cols))) + done += df.shape[0] + console.log(f'Done: {done}') + + @Interface.command('export-mutation-mapping') @click.option('--with_id/--no-with_id', is_flag=True, default=False) @click.option('--sele/--no-sele', is_flag=True, default=True) @@ -446,6 +493,67 @@ def export_residue_remapping(ctx, with_id, sele, output): console.log(f'result saved in {output_path}') +@Interface.command('export-interaction-mapping') +@click.option('--with_id/--no-with_id', is_flag=True, default=False) +@click.option('-o', '--output', type=str, help='filename of output file') +@click.pass_context +def export_interface_mapping(ctx, with_id, output): + output_path = ctx.obj['folder']/output + query = """ + SELECT DISTINCT + %s + CASE IDMapping.is_canonical + WHEN 1 + THEN IDMapping.Entry + ELSE IDMapping.isoform + END edUniProt, Mutation.Ref, Mutation.Pos, Mutation.Alt, + Mutation.Pos - ResidueMappingRange.unp_beg + ResidueMappingRange.pdb_beg AS residue_number, + Mutation.Pos - ResidueMappingRange.unp_beg + ResidueMappingRange.auth_pdb_beg AS author_residue_number, + ResidueMappingRange.author_insertion_code, + ResidueMappingRange.observed_ratio, + ResidueMappingRange.pdb_id, + ResidueMappingRange.entity_id, + ResidueMappingRange.struct_asym_id, + ResidueMappingRange.chain_id, + PI.assembly_id, + PI.model_id, + PI.struct_asym_id_in_assembly, + PI.interface_id, + PI.css, + PI.i_select_tag, + PI.i_select_rank, + (Mutation.Pos - ResidueMappingRange.unp_beg + ResidueMappingRange.pdb_beg >= PI.pdb_beg AND Mutation.Pos - ResidueMappingRange.unp_beg + ResidueMappingRange.pdb_beg <= PI.pdb_end) AS is_interface_residue + FROM Mutation,ResidueMappingRange + INNER JOIN IDMapping ON Mutation.ftId = IDMapping.ftId + INNER JOIN UniProtSeq ON UniProtSeq.isoform = IDMapping.isoform + AND UniProtSeq.Pos = Mutation.Pos + AND UniProtSeq.Ref = Mutation.Ref + INNER JOIN SelectedMappingMeta ON SelectedMappingMeta.UniProt = ResidueMappingRange.UniProt + AND SelectedMappingMeta.pdb_id = ResidueMappingRange.pdb_id + AND SelectedMappingMeta.struct_asym_id = ResidueMappingRange.struct_asym_id + INNER JOIN PI ON PI.UniProt = ResidueMappingRange.UniProt + AND PI.pdb_id = ResidueMappingRange.pdb_id + AND PI.struct_asym_id = ResidueMappingRange.struct_asym_id + WHERE ResidueMappingRange.UniProt = edUniProt + AND Mutation.Pos >= ResidueMappingRange.unp_beg + AND Mutation.Pos <= ResidueMappingRange.unp_end + AND ResidueMappingRange.conflict_code IS NULL + AND ResidueMappingRange.observed_ratio > 0 + AND (ResidueMappingRange.residue_name = '' OR ResidueMappingRange.residue_name IN (SELECT three_letter_code FROM AAThree2one)) + AND SelectedMappingMeta.select_rank != -1 + ; + """ + query = query % ('Mutation.ftId,' if with_id else '') + with console.status("[bold green]query..."): + dfs = read_sql_query(query, ctx.obj['custom_db'].engine, chunksize=10000) + for df in dfs: + if df.shape[0] == 0: + continue + df.rename(columns={'edUniProt': 'UniProt'}).to_csv( + output_path, index=False, mode='a+', sep='\t', header=not output_path.exists()) + console.log(f'result saved in {output_path}') + + @Interface.command('insert-sele-mutation-mapping') @click.option('-i', '--input', type=click.Path()) @click.option('--chunksize', type=int, help="the chunksize parameter", default=10000) diff --git a/pdb_profiling/processors/pdbe/__init__.py b/pdb_profiling/processors/pdbe/__init__.py index e1113dc..9b0a14d 100644 --- a/pdb_profiling/processors/pdbe/__init__.py +++ b/pdb_profiling/processors/pdbe/__init__.py @@ -101,7 +101,7 @@ class profile_id(orm.Model): class PISAInterfaceDict(orm.Model): - __tablename__ = 'PISAInterface' + __tablename__ = 'PISAInterfaceDict' __metadata__ = self.metadata __database__ = self.database entity_id_1 = orm.Integer() @@ -127,6 +127,8 @@ class PISAInterfaceDict(orm.Model): interface_id = orm.Integer(primary_key=True) use_au = orm.Boolean() css = orm.Float() + is_polymer_1 = orm.Boolean() + is_polymer_2 = orm.Boolean() self.ResidueMapping = ResidueMapping self.StatsProteinEntitySeq = StatsProteinEntitySeq diff --git a/pdb_profiling/processors/pdbe/api.py b/pdb_profiling/processors/pdbe/api.py index 4736d28..d73043d 100644 --- a/pdb_profiling/processors/pdbe/api.py +++ b/pdb_profiling/processors/pdbe/api.py @@ -401,10 +401,7 @@ def yieldPISAInterfaceList(data: Dict): for record in records: flatten_dict(record, 'structure_1') flatten_dict(record, 'structure_2') - flatten_dict(data[pdb], 'page_title', False) - cols = sorted(i for i in data[pdb].keys() - if i != 'interfaceentries') - yield records, cols, tuple(data[pdb][col] for col in cols) + yield records, ('pdb_id', 'assembly_id'), (pdb, data[pdb]['page_title']['assemble_code']) @staticmethod @dispatch_on_set('api/pisa/interfacedetail/') diff --git a/pdb_profiling/processors/pdbe/record.py b/pdb_profiling/processors/pdbe/record.py index d476dea..0cbf4df 100644 --- a/pdb_profiling/processors/pdbe/record.py +++ b/pdb_profiling/processors/pdbe/record.py @@ -25,7 +25,7 @@ from pdb_profiling.processors.recordbase import IdentifierBase from pdb_profiling.processors.transformer import Dict2Tabular from pdb_profiling.exceptions import * -from pdb_profiling.cython.cyrange import to_interval, lyst22interval, lyst32interval, range_len, interval2set, subtract_range, add_range, overlap_range, outside_range, trim_range +from pdb_profiling.cython.cyrange import to_interval, lyst22interval, lyst32interval, range_len, subtract_range, add_range, overlap_range, outside_range, trim_range from pdb_profiling.utils import (init_semaphore, init_folder_from_suffix, a_read_csv, split_df_by_chain, unsync_wrap, related_dataframe, slice_series, @@ -519,8 +519,7 @@ async def cif2residue_listing(cls, path: Union[Unfuture, Coroutine, str, Path]): col_dict = dict(zip(cols, new_cols)) col_dict['data_'] = 'pdb_id' dfrm.rename(columns=col_dict, inplace=True) - assert all(dfrm['residue_number'] == dfrm['residue_number?'] - ), f"Unexpectd Cases: _pdbx_poly_seq_scheme.seq_id != _pdbx_poly_seq_scheme.ndb_seq_num\n{dfrm[dfrm['residue_number'] != dfrm['residue_number?']]}" + assert (dfrm['residue_number'] == dfrm['residue_number?']).all(), f"Unexpectd Cases: _pdbx_poly_seq_scheme.seq_id != _pdbx_poly_seq_scheme.ndb_seq_num\n{dfrm[dfrm['residue_number'] != dfrm['residue_number?']]}" dfrm.drop(columns=['residue_number?'], inplace=True) for col in ('residue_number', 'authore_residue_number'): dfrm[col] = dfrm[col].astype(int) @@ -718,7 +717,11 @@ async def set_focus_assembly(self, focus_assembly_ids:Optional[Iterable[int]]=No mol_df = await self.fetch_from_pdbe_api('api/pdb/entry/molecules/', Base.to_dataframe) mol_df = mol_df[~mol_df.molecule_type.isin(('water', 'bound', 'carbohydrate polymer'))] mol_df = mol_df[~(mol_df.molecule_type.isin(('polypeptide(L)', 'polypeptide(D)')) & mol_df.length.le(omit_peptide_length))] - ass_eec_df = ass_eec_df[ass_eec_df.details.notnull() & ass_eec_df.entity_id.isin(mol_df.entity_id)] + try: + ass_eec_df = ass_eec_df[ass_eec_df.details.notnull() & ass_eec_df.entity_id.isin(mol_df.entity_id)] + except AttributeError as e: + warn(f"{repr(self)}: None assembly") + raise e check_chain_num = ass_eec_df.groupby('assembly_id').in_chains.apply(lambda x: sum(i.count(',')+1 for i in x)) if discard_multimer_chains_cutoff is not None: assemblys = set(ass_eec_df.assembly_id) & set(check_chain_num[check_chain_num {"68":{"three_letter_code":"CR2","parent_chem_comp_ids":["GLY","TYR","GLY"],"one_letter_code":"GYG"}} + ''' + if isinstance(data, float): + warn(f'{repr(self)}: Unexpected float datatype for `pdb_sequence_indices_with_multiple_residues`') + assert '(' not in pdb_sequence, repr(self) + return pdb_sequence + elif isinstance(data, str): + if data == '{}': + return pdb_sequence + else: + data = json.loads(data) + for val_dict in data.values(): + one_letter_code = val_dict["one_letter_code"] + three_letter_code = val_dict["three_letter_code"] + if len(one_letter_code) != 1: + warn(f"Possible Peptide Linking: {repr(self)}, {kwargs}, {val_dict}; select the first code", PeptideLinkingWarning) + pdb_sequence = pdb_sequence.replace(f'({three_letter_code})', one_letter_code[0]) + return pdb_sequence + @unsync async def get_sequence(self, mode='fix_seq', **kwargs): ''' Get true SEQRES Sequence via entity_id | chain_id (default protein) | struct_asym_id ''' - - def deal_with_pdb_sequence_indices_with_multiple_residues(pdb_sequence, data): - ''' - Deal with pdb_sequence_indices_with_multiple_residues - 4u2v entity 1 -> {"68":{"three_letter_code":"CR2","parent_chem_comp_ids":["GLY","TYR","GLY"],"one_letter_code":"GYG"}} - ''' - data = json.loads(data) if isinstance(data, str) else data - for val_dict in data.values(): - one_letter_code = val_dict["one_letter_code"] - three_letter_code = val_dict["three_letter_code"] - if len(one_letter_code) != 1: - warn(f"Possible Peptide Linking: {repr(self)}, {kwargs}, {val_dict}; select the first code", PeptideLinkingWarning) - pdb_sequence = pdb_sequence.replace(f'({three_letter_code})', one_letter_code[0]) - return pdb_sequence - mol_df = await self.fetch_from_pdbe_api('api/pdb/entry/molecules/', Base.to_dataframe) if mol_df is None: - raise PossibleObsoletedPDBEntryError(f"None dataframe: {repr(self)}") + raise PossibleObsoletedPDBEntryError(f"None dataframe: {repr(self)}, either obsoleted or API lag update") if 'entity_id' in kwargs: cur_record = mol_df.loc[mol_df.entity_id.eq(kwargs['entity_id']).idxmax()] elif 'struct_asym_id' in kwargs: @@ -1031,8 +1041,8 @@ def deal_with_pdb_sequence_indices_with_multiple_residues(pdb_sequence, data): else: raise ValueError(f"Cannot get sequence with specified information: {kwargs}") if mode == 'fix_seq': - return deal_with_pdb_sequence_indices_with_multiple_residues( - cur_record['pdb_sequence'], cur_record['pdb_sequence_indices_with_multiple_residues']) + return self.deal_with_pdb_sequence_indices_with_multiple_residues( + cur_record['pdb_sequence'], cur_record['pdb_sequence_indices_with_multiple_residues'], kwargs) elif mode == 'raw_seq': return cur_record['sequence'] elif mode == 'raw_pdb_seq': @@ -1150,7 +1160,7 @@ async def cs_source_ass_oper_df(self, struct_asym_id, residue_number): @unsync async def set_subset_assembly_from_df(self, profile_id_df): mask = profile_id_df.assembly_id.ne(0) - if not any(mask): + if not mask.any(): self.subset_assembly = frozenset() else: if not hasattr(self, 'assembly'): @@ -1205,7 +1215,7 @@ async def profile_id(self): if assg_oper_df is not None: focus_assg_oper_df = assg_oper_df[assg_oper_df.struct_asym_id.isin(focus_res2eec_df.struct_asym_id)] new_focus_assg_oper_df = concat([add_0_assg_oper_df, focus_assg_oper_df.merge(focus_res2eec_df, how='left')], ignore_index=True, sort=False) - assert any(new_focus_assg_oper_df.isnull().sum()) is False, f"Unexpected Cases {new_focus_assg_oper_df}" + assert not new_focus_assg_oper_df.isnull().sum().any(), f"Unexpected Cases {new_focus_assg_oper_df}" if not hasattr(self, 'assembly'): await self.set_assembly() new_focus_assg_oper_df = new_focus_assg_oper_df[new_focus_assg_oper_df.assembly_id.isin(self.assembly.keys())].reset_index(drop=True) @@ -1301,7 +1311,7 @@ async def get_ranged_map_res_df(self, UniProt, unp_range, pdb_range, conflict_pd (~res_map_df_full.residue_name.isin(SEQ_DICT)) | res_map_df_full.conflict_code.notnull()) demo_record = res_map_df_full.iloc[0] - if all(~mask): + if (~mask).all(): range_df = self.three_range2range_df(*lyst32interval( res_map_df_full.unp_residue_number, res_map_df_full.residue_number, res_map_df_full.author_residue_number)) for col in ('pdb_id', 'entity_id', 'struct_asym_id', 'chain_id'): @@ -1313,7 +1323,7 @@ async def get_ranged_map_res_df(self, UniProt, unp_range, pdb_range, conflict_pd range_df['multiple_conformers'] = nan range_df['conflict_code'] = nan return range_df - elif all(mask): + elif mask.all(): res_map_df_full['UniProt'] = UniProt final_df = res_map_df_full.rename( columns={'unp_residue_number': 'unp_beg', 'residue_number': 'pdb_beg', 'author_residue_number': 'auth_pdb_beg'}) @@ -1402,7 +1412,7 @@ async def get_map_res_df(self, UniProt, unp_range, pdb_range, your_sites, confli return ret @unsync - async def pipe_interface_res_dict_ic(self, include_chains=None, use_copies:bool=True, focus_assembly_ids=None, func='set_interface', discard_multimer_chains_cutoff=21, discard_multimer_chains_cutoff_for_au=None, omit_peptide_length:int=20, css_cutoff=-1, **kwargs): + async def pipe_interface_res_dict_ic(self, include_chains=None, focus_assembly_ids=None, func='set_interface', discard_multimer_chains_cutoff=21, discard_multimer_chains_cutoff_for_au=None, omit_peptide_length:int=20, css_cutoff=-1, chains_count_cutoff=2, allow_not_polymer=False, **kwargs): # ic: include_chains version if include_chains is not None: include_chains = include_chains[self.pdb_id] @@ -1416,18 +1426,24 @@ async def pipe_interface_res_dict_ic(self, include_chains=None, use_copies:bool= continue if len(assembly.interface) == 0: continue - if use_copies and include_chains is not None: + if include_chains is not None: tr = await assembly.get_assemble_eec_as_df() cur_include_chains = frozenset(tr[tr.struct_asym_id.isin(include_chains)].struct_asym_id_in_assembly) else: cur_include_chains = include_chains for interface in assembly.interface.values(): - if ((cur_include_chains is None) or bool(interface.info['chains'] & cur_include_chains)) and interface.info['css'] > css_cutoff: + if ((cur_include_chains is None) or bool(interface.chain_set & cur_include_chains)) and interface.info['css'] > css_cutoff and len(interface.chain_set) >= chains_count_cutoff: + if func == 'pipe_protein_protein_interface' or func == 'pipe_protein_nucleotide_interface': + if not allow_not_polymer and not (interface.info['chains'][0][1] is None and interface.info['chains'][1][1] is None): + continue + elif func == 'pipe_protein_ligand_interface': + if not allow_not_polymer and not (interface.info['chains'][0][1] is None or interface.info['chains'][1][1] is None): + continue res.append(interface) return res @unsync - async def pipe_interface_res_dict(self, chain_pairs=None, au2bu:bool=True, focus_assembly_ids=None, func='set_interface', discard_multimer_chains_cutoff=21, discard_multimer_chains_cutoff_for_au=None, omit_peptide_length:int=20, css_cutoff=-1, **kwargs): + async def pipe_interface_res_dict(self, chain_pairs=None, focus_assembly_ids=None, func='set_interface', discard_multimer_chains_cutoff=21, discard_multimer_chains_cutoff_for_au=None, omit_peptide_length:int=20, css_cutoff=-1, chains_count_cutoff=2, allow_not_polymer=False, **kwargs): # maybe the name `au2bu` should be changed since its actual behavior is to use copied chains if chain_pairs is not None: chain_pairs = chain_pairs[self.pdb_id] @@ -1441,7 +1457,7 @@ async def pipe_interface_res_dict(self, chain_pairs=None, au2bu:bool=True, focus continue if len(assembly.interface) == 0: continue - if au2bu and chain_pairs is not None: + if chain_pairs is not None: tr = await assembly.get_assemble_eec_as_df() tr_info = tr.groupby('struct_asym_id').struct_asym_id_in_assembly.apply(frozenset).to_dict() tr_info = defaultdict(frozenset, tr_info) @@ -1456,7 +1472,13 @@ async def pipe_interface_res_dict(self, chain_pairs=None, au2bu:bool=True, focus else: cur_chain_pairs = chain_pairs for interface in assembly.interface.values(): - if ((cur_chain_pairs is None) or (interface.info['chains'] in cur_chain_pairs)) and interface.info['css'] > css_cutoff: + if ((cur_chain_pairs is None) or (interface.chain_set in cur_chain_pairs)) and interface.info['css'] > css_cutoff and len(interface.chain_set) >= chains_count_cutoff: + if func == 'pipe_protein_protein_interface' or func == 'pipe_protein_nucleotide_interface': + if not allow_not_polymer and not (interface.info['chains'][0][1] is None and interface.info['chains'][1][1] is None): + continue + elif func == 'pipe_protein_ligand_interface': + if not allow_not_polymer and not (interface.info['chains'][0][1] is None or interface.info['chains'][1][1] is None): + continue res.append(interface) return res @@ -1609,9 +1631,10 @@ class PDBAssembly(PDB): tasks = LRUCache(maxsize=1024) - struct_range_pattern = re_compile(r"\[.+\]([A-Z]+[_0-9]*):-?[0-9]+[\?A-Z]*") # e.g. [FMN]B:149 [C2E]A:301 [ACE]H:-8? [BR]BA:957A - rare_pat = re_compile(r"([A-Z]+)_([0-9]+)") # e.g. 2rde assembly 1 A_1, B_1... - interface_structures_pat = re_compile(r"(\[.+\])?([A-Z]+)(:-?[0-9]+[\?A-Z]*)?\+(\[.+\])?([A-Z]+)(:-?[0-9]+[\?A-Z]*)?") # [4CA]BB:170+AB [ZN]D:154A+[CU]C:154 + struct_range_pattern = re_compile(r"\[.+\]([A-Z]+[_0-9]*):(-?[0-9]+[\?A-Z]*)") # e.g. [FMN]B:149 [C2E]A:301 [ACE]H:-8? [BR]BA:957A + #rare_pat = re_compile(r"([A-Z]+)_([0-9]+)") # e.g. 2rde assembly 1 A_1, B_1... + interface_structures_pat = re_compile(r"(\[.+\])?([A-Z]+):?(-?[0-9]+[\?A-Z]*)?\+(\[.+\])?([A-Z]+):?(-?[0-9]+[\?A-Z]*)?") # [4CA]BB:170+AB [ZN]D:154A+[CU]C:154 [ACE]B:0?+B + seq_pat = re_compile(r"(-?[0-9]+)([\?A-Z]*)?") @property def assembly_summary(self) -> Dict: @@ -1645,7 +1668,7 @@ def set_id(self): def get_id(self): return self.pdb_ass_id - @classmethod + """@classmethod def transform(cls, x): res = cls.rare_pat.search(x) assert bool(res), f"Unexpected Case: {x}" @@ -1654,7 +1677,15 @@ def transform(cls, x): if num == 1: return chain else: - return chain+chr(63+num) + return chain+chr(63+num)""" + + @classmethod + def fix_auth_seq_id(cls, seq_id): + if seq_id is None: + return None,None + else: + auth_res_num, insertion_code = cls.seq_pat.fullmatch(seq_id).groups() + return auth_res_num, '' if insertion_code == '?' else insertion_code @classmethod async def to_asiscomponent_interfaces_df(cls, path: Unfuture): @@ -1663,41 +1694,49 @@ async def to_asiscomponent_interfaces_df(cls, path: Unfuture): return None interfacelist_df.rename(columns={"interface_number": "interface_id"}, inplace=True) try: - interfacelist_df[['struct_asym_id_in_assembly_1', 'struct_asym_id_in_assembly_2'] - ] = interfacelist_df.interface_structures.apply( - lambda x: cls.interface_structures_pat.fullmatch(x).group(2,5)).apply(Series) - except AttributeError: + interfacelist_df[[ + 'struct_asym_id_in_assembly_1', 'auth_seq_id_1', + 'struct_asym_id_in_assembly_2', 'auth_seq_id_2'] + ] = interfacelist_df.interface_structures.apply( + lambda x: cls.interface_structures_pat.fullmatch(x).group(2,3,5,6)).apply(Series) + interfacelist_df[['author_residue_number_1', 'author_insertion_code_1']] = interfacelist_df.auth_seq_id_1.apply(cls.fix_auth_seq_id).apply(Series) + interfacelist_df[['author_residue_number_2', 'author_insertion_code_2']] = interfacelist_df.auth_seq_id_2.apply(cls.fix_auth_seq_id).apply(Series) + except AttributeError as e: check = interfacelist_df.interface_structures.apply(lambda x: bool(cls.interface_structures_pat.fullmatch(x))) - warn(str(interfacelist_df[check.eq(False)])) - raise - return interfacelist_df + warn(str(interfacelist_df[~check])) + raise e + return interfacelist_df.drop(columns=['auth_seq_id_1', 'auth_seq_id_2']) @classmethod async def to_interfacelist_df(cls, path: Unfuture): interfacelist_df = await path.then(cls.to_dataframe) if interfacelist_df is None: return None - interfacelist_df.rename(columns={ - "id": "interface_id", - "pdb_code": "pdb_id", - "assemble_code": "assembly_id" - }, inplace=True) - assert all((~interfacelist_df['structure_1.range'].isnull()) & (~interfacelist_df['structure_2.range'].isnull())), str(interfacelist_df[interfacelist_df['structure_1.range'].isnull() | interfacelist_df['structure_2.range'].isnull()]) - if any('_' in i for i in interfacelist_df['structure_1.range']): - interfacelist_df['structure_1.range'] = interfacelist_df['structure_1.range'].apply(lambda x: cls.transform(x) if '_' in x else x) - interfacelist_df['struct_asym_id_in_assembly_1'] = interfacelist_df['structure_1.range'] - else: - check_m = interfacelist_df.apply(lambda x: bool(cls.struct_range_pattern.match(x['structure_1.range']) )if x['structure_1.original_range'] != '{-}' else True, axis=1) - assert len(check_m[~check_m]) == 0, f"{interfacelist_df.loc[check_m[~check_m].index].T}" - interfacelist_df['struct_asym_id_in_assembly_1'] = interfacelist_df.apply( - lambda x: cls.struct_range_pattern.match(x['structure_1.range']).group(1) if x['structure_1.original_range'] != '{-}' else x['structure_1.range'], axis=1) - if any('_' in i for i in interfacelist_df['structure_2.range']): - interfacelist_df['structure_2.range'] = interfacelist_df['structure_2.range'].apply(lambda x: cls.transform(x) if '_' in x else x) - interfacelist_df['struct_asym_id_in_assembly_2'] = interfacelist_df['structure_2.range'] - else: - interfacelist_df['struct_asym_id_in_assembly_2'] = interfacelist_df.apply( - lambda x: cls.struct_range_pattern.match(x['structure_2.range']).group(1) if x['structure_2.original_range'] != '{-}' else x['structure_2.range'], axis=1) - return interfacelist_df + interfacelist_df.rename(columns={"id": "interface_id"}, inplace=True) + assert ((interfacelist_df['structure_1.range'].notnull()) & (interfacelist_df['structure_2.range'].notnull())).all(), str(interfacelist_df[interfacelist_df['structure_1.range'].isnull() | interfacelist_df['structure_2.range'].isnull()]) + for col in ('struct_asym_id_in_assembly_1', 'auth_seq_id_1', 'struct_asym_id_in_assembly_2', 'auth_seq_id_2'): + interfacelist_df[col] = None + if (interfacelist_df['structure_1.range'].str.contains('_') | interfacelist_df['structure_2.range'].str.contains('_')).any(): + raise AssertionError(f'Error structure.range in \n{interfacelist_df}') + else: + range_check = interfacelist_df['structure_1.range'].apply(cls.struct_range_pattern.fullmatch) + range_check_yes = range_check.apply(bool) + check_m = range_check_yes | (interfacelist_df['structure_1.original_range']=='{-}') + assert check_m.all(), f"{interfacelist_df[~check_m].T}" + to_df_index = interfacelist_df[range_check_yes].index + interfacelist_df.loc[to_df_index, ['struct_asym_id_in_assembly_1', 'auth_seq_id_1']] = range_check.loc[to_df_index].apply(lambda x: x.groups()).tolist() + interfacelist_df.loc[interfacelist_df[~range_check_yes].index, 'struct_asym_id_in_assembly_1'] = interfacelist_df.loc[interfacelist_df[~range_check_yes].index, 'structure_1.range'] + # + range_check = interfacelist_df['structure_2.range'].apply(cls.struct_range_pattern.fullmatch) + range_check_yes = range_check.apply(bool) + check_m = range_check_yes | (interfacelist_df['structure_2.original_range']=='{-}') + assert check_m.all(), f"{interfacelist_df[~check_m].T}" + to_df_index = interfacelist_df[range_check_yes].index + interfacelist_df.loc[to_df_index, ['struct_asym_id_in_assembly_2', 'auth_seq_id_2']] = range_check.loc[to_df_index].apply(lambda x: x.groups()).tolist() + interfacelist_df.loc[interfacelist_df[~range_check_yes].index, 'struct_asym_id_in_assembly_2'] = interfacelist_df.loc[interfacelist_df[~range_check_yes].index, 'structure_2.range'] + interfacelist_df[['author_residue_number_1', 'author_insertion_code_1']] = interfacelist_df.auth_seq_id_1.apply(cls.fix_auth_seq_id).apply(Series) + interfacelist_df[['author_residue_number_2', 'author_insertion_code_2']] = interfacelist_df.auth_seq_id_2.apply(cls.fix_auth_seq_id).apply(Series) + return interfacelist_df.drop(columns=['auth_seq_id_1', 'auth_seq_id_2']) @unsync async def get_interfacelist_df(self, api_suffix, func): @@ -1763,11 +1802,15 @@ def to_interface_id(pdb_assembly_id, focus_interface_ids): self.interface_filters, interfacelist_df) focus_interface_ids = focus_interface_df.interface_id focus_interface_chains = zip( - focus_interface_df.struct_asym_id_in_assembly_1, - focus_interface_df.struct_asym_id_in_assembly_2) + focus_interface_df.struct_asym_id_in_assembly_1, + focus_interface_df.author_residue_number_1, + focus_interface_df.author_insertion_code_1, + focus_interface_df.struct_asym_id_in_assembly_2, + focus_interface_df.author_residue_number_2, + focus_interface_df.author_insertion_code_2) self.interface: Dict[int, PDBInterface] = dict(zip( - focus_interface_ids, (PDBInterface(if_id).add_args(PDBAssembly_ob=self, use_au=use_au).store(chains=frozenset(chains), css=css) for if_id, chains, css in zip(to_interface_id(self.get_id(), focus_interface_ids), focus_interface_chains, focus_interface_df.css)))) + focus_interface_ids, (PDBInterface(if_id).add_args(PDBAssembly_ob=self, use_au=use_au).store(chains=(chains[:3], chains[3:]), css=css) for if_id, chains, css in zip(to_interface_id(self.get_id(), focus_interface_ids), focus_interface_chains, focus_interface_df.css)))) def get_interface(self, interface_id): return self.interface[interface_id] @@ -1885,16 +1928,17 @@ def get_id(self): def store(self, **kwargs): self.info = kwargs + self.chain_set = frozenset(i[0] for i in self.info['chains']) return self @classmethod async def to_interfacedetail_df(cls, path: Unfuture): - def check_struct_selection(interfacedetail_df, colName): + """def check_struct_selection(interfacedetail_df, colName): sele = next(iter(interfacedetail_df[colName])) sele_m = cls.struct_range_pattern.fullmatch(sele) if bool(sele_m): - interfacedetail_df[colName] = sele_m.group(1) + interfacedetail_df[colName] = sele_m.group(1)""" interfacedetail_df = await cls.to_dataframe_with_kwargs(path, #usecols=['pdb_code', 'assemble_code', 'interface_number', 'chain_id', @@ -1917,12 +1961,6 @@ def check_struct_selection(interfacedetail_df, colName): 'interface_detail.interface_structure_2.structure.selection': "s2_selection"}, inplace=True) interfacedetail_df.author_insertion_code.fillna('', inplace=True) - check_struct_selection(interfacedetail_df, 's1_selection') - check_struct_selection(interfacedetail_df, 's2_selection') - if any('_' in i for i in interfacedetail_df['struct_asym_id_in_assembly']): - interfacedetail_df['struct_asym_id_in_assembly'] = interfacedetail_df['struct_asym_id_in_assembly'].apply(lambda x: cls.transform(x) if '_' in x else x) - interfacedetail_df['s1_selection'] = interfacedetail_df['s1_selection'].apply(lambda x: cls.transform(x) if '_' in x else x) - interfacedetail_df['s2_selection'] = interfacedetail_df['s2_selection'].apply(lambda x: cls.transform(x) if '_' in x else x) return interfacedetail_df @staticmethod @@ -1936,6 +1974,10 @@ def molecule_type_score(m_type): @unsync async def set_interface_res(self, keep_interface_res_df:bool=False): + # [ ]X-X_mod interaction + # [ ]X-Y_mod interaction + # [ ]X-X interaction -> error? + # [-]X-Y interaction run_following = False task = self.tasks.get((repr(self), 'PISAInterfaceDict'), None) if task is None: @@ -1963,21 +2005,66 @@ async def set_interface_res(self, keep_interface_res_df:bool=False): except Exception as e: raise AssertionError(e) interfacedetail_df['assembly_id'] = self.assembly_id - struct_sele_set = set(interfacedetail_df.head(1)[['s1_selection', 's2_selection']].to_records(index=False)[0]) - if len(struct_sele_set) != 2: + + assert not interfacedetail_df.struct_asym_id_in_assembly.str.contains('_').any(), f"Error chain_id in {repr(self)}" + s1_selection = interfacedetail_df.loc[0, 's1_selection'] + s2_selection = interfacedetail_df.loc[0, 's2_selection'] + s1_match = self.struct_range_pattern.fullmatch(s1_selection) + if bool(s1_match): + struct_asym_id_in_assembly_1, auth_seq_id_1 = s1_match.groups() + author_residue_number_1, author_insertion_code_1 = self.fix_auth_seq_id(auth_seq_id_1) + else: + struct_asym_id_in_assembly_1 = s1_selection + assert '_' not in struct_asym_id_in_assembly_1, f"Error s1_selection in {repr(self)}" + author_residue_number_1, author_insertion_code_1 = None, None + s2_match = self.struct_range_pattern.fullmatch(s2_selection) + if bool(s2_match): + struct_asym_id_in_assembly_2, auth_seq_id_2 = s2_match.groups() + author_residue_number_2, author_insertion_code_2 = self.fix_auth_seq_id(auth_seq_id_2) + else: + struct_asym_id_in_assembly_2 = s2_selection + assert '_' not in struct_asym_id_in_assembly_2, f"Error s2_selection in {repr(self)}" + author_residue_number_2, author_insertion_code_2 = None, None + map_dict = {s1_selection: struct_asym_id_in_assembly_1, s2_selection: struct_asym_id_in_assembly_2} + interfacedetail_df.struct_asym_id_in_assembly = interfacedetail_df.struct_asym_id_in_assembly.map(map_dict) + #if len(struct_sele_set) != 2: # NOTE: Exception example: 2beq assembly_id 1 interface_id 32 - warn(f"\n{interfacedetail_df.head(1)[['pdb_id', 'assembly_id', 'interface_id', 's1_selection', 's2_selection']].to_dict('records')[0]}", PISAErrorWarning) - return - elif not hasattr(self, 'info'): + #warn(f"skip possible modified_residue <-> standard_residues interaction within the same chain:\n{interfacedetail_df.head(1)[['pdb_id', 'assembly_id', 'interface_id', 's1_selection']].to_dict('records')[0]}") + #return + if not hasattr(self, 'info'): pass - elif self.info['chains'] != struct_sele_set: + elif self.info['chains'] != ((struct_asym_id_in_assembly_1, author_residue_number_1, author_insertion_code_1), (struct_asym_id_in_assembly_2, author_residue_number_2, author_insertion_code_2)): # NOTE: Exception example: 2beq assembly_id 1 interface_id 32 - warn(f"{repr(self)}: interfacedetail({struct_sele_set}) inconsistent with interfacelist({set(self.info['chains'])}) ! May miss some data.", PISAErrorWarning) + warn(f"{repr(self)}: interfacedetail({((struct_asym_id_in_assembly_1, author_residue_number_1, author_insertion_code_1), (struct_asym_id_in_assembly_2, author_residue_number_2, author_insertion_code_2))}) inconsistent with interfacelist({self.info['chains']}) ! May miss some data.", PISAErrorWarning) return eec_as_df = await self.PDBAssembly_ob.get_assemble_eec_as_df() res_df = await self.PDBAssembly_ob.pdb_ob.fetch_from_pdbe_api('api/pdb/entry/residue_listing/', Base.to_dataframe) interfacedetail_df = interfacedetail_df.merge(eec_as_df, how="left") interfacedetail_df = interfacedetail_df.merge(res_df, how="left") + if author_residue_number_1 is not None: + try: + struct_asym_id_1 = eec_as_df[eec_as_df.struct_asym_id_in_assembly.eq(struct_asym_id_in_assembly_1)].struct_asym_id.iloc[0] + residue_number_1 = res_df[res_df.struct_asym_id.eq(struct_asym_id_1) & + res_df.author_residue_number.eq(int(author_residue_number_1)) & + res_df.author_insertion_code.eq(author_insertion_code_1) + ].residue_number.iloc[0] + except IndexError as e: + warn(f"{repr(self)}: {struct_asym_id_in_assembly_1}, {author_residue_number_1}, {author_insertion_code_1}") + raise e + else: + residue_number_1 = None + if author_residue_number_2 is not None: + try: + struct_asym_id_2 = eec_as_df[eec_as_df.struct_asym_id_in_assembly.eq(struct_asym_id_in_assembly_2)].struct_asym_id.iloc[0] + residue_number_2 = res_df[res_df.struct_asym_id.eq(struct_asym_id_2) & + res_df.author_residue_number.eq(int(author_residue_number_2)) & + res_df.author_insertion_code.eq(author_insertion_code_2) + ].residue_number.iloc[0] + except IndexError as e: + warn(f"{repr(self)}: {struct_asym_id_in_assembly_2}, {author_residue_number_2}, {author_insertion_code_2}") + raise e + else: + residue_number_2 = None if keep_interface_res_df: self.interface_res_df = interfacedetail_df if hasattr(self, 'interface_res_dict'): @@ -1985,7 +2072,7 @@ async def set_interface_res(self, keep_interface_res_df:bool=False): # NOTE: Not rigorous filter for multiple_conformers check_merge = interfacedetail_df.residue_number.isnull() check_mc = interfacedetail_df.author_residue_number < 0 - if any(check_merge): + if check_merge.any(): # raise ValueError(f"Unexpected Data in Residue DataFrame: {check.head(1).to_dict('records')[0]}") example = interfacedetail_df[check_merge & check_mc].head(3) if example.shape[0]: @@ -2001,13 +2088,13 @@ async def set_interface_res(self, keep_interface_res_df:bool=False): 'struct_asym_id_in_assembly', 'asym_id_rank', 'model_id', 'assembly_id', 'interface_id', 'molecule_type', 'residue_number', 'buried_surface_area', 'solvent_accessible_area'] + asa_col = 12 # focus_cols.index('solvent_accessible_area') + bsa_col = 11 # focus_cols.index('buried_surface_area') + res_col = 10 # focus_cols.index('residue_number') nda = interfacedetail_df[focus_cols].to_numpy() def yield_record(): for _, (start, end) in slice_series(interfacedetail_df.struct_asym_id_in_assembly).items(): - asa_col = focus_cols.index('solvent_accessible_area') - bsa_col = focus_cols.index('buried_surface_area') - res_col = focus_cols.index('residue_number') asa_index = start+np_where(nda[start:end, asa_col] > 0)[0] asa_res = nda[asa_index, res_col] bsa_index = asa_index[np_where(nda[asa_index, bsa_col] > 0)] @@ -2022,20 +2109,32 @@ def yield_record(): common_keys = ('pdb_id', 'assembly_id', 'interface_id') record1 = {f"{key}_1": value for key, value in records[0].items() if key not in common_keys} - struct_sele_set = struct_sele_set - {record1['struct_asym_id_in_assembly_1']} + assert 'interface_range_1' in record1, f"Error in {repr(self)}" + record1['is_polymer_1'] = True + if record1['struct_asym_id_in_assembly_1'] == struct_asym_id_in_assembly_1: + the_other_sele = struct_asym_id_in_assembly_2 + the_other_interface_range = f'[[{residue_number_2},{residue_number_2}]]' if residue_number_2 is not None else None + else: + the_other_sele = struct_asym_id_in_assembly_1 + the_other_interface_range = f'[[{residue_number_1},{residue_number_1}]]'if residue_number_1 is not None else None if len(records) == 2: record2 = {f"{key}_2": value for key, value in records[1].items() if key not in common_keys} + assert record2['struct_asym_id_in_assembly_2'] == the_other_sele, f"Error in {repr(self)}" + assert 'interface_range_2' in record2, f"Error in {repr(self)}" + record2['is_polymer_2'] = True else: - saiia2 = struct_sele_set.pop() - record2 = {'struct_asym_id_in_assembly_2': saiia2} + record2 = {'struct_asym_id_in_assembly_2': the_other_sele} cur_keys = list(set(focus_cols[:-3])-set(common_keys)-{'struct_asym_id_in_assembly'}) try: - cur_record = eec_as_df[eec_as_df.struct_asym_id_in_assembly.eq(saiia2)][cur_keys].to_dict('records')[0] + cur_record = eec_as_df[eec_as_df.struct_asym_id_in_assembly.eq(the_other_sele)][cur_keys].to_dict('records')[0] except Exception: - raise ValueError(f"\n{self.get_id()},\n{saiia2},\n{eec_as_df}") + raise ValueError(f"\n{self.get_id()},\n{the_other_sele},\n{eec_as_df}") for key, value in cur_record.items(): record2[f"{key}_2"] = value + assert the_other_interface_range is not None, f"Error in {repr(self)}" + record2['interface_range_2'] = the_other_interface_range + record2['is_polymer_2'] = False record_dict = {**record1, **record2} for key in common_keys: @@ -2165,7 +2264,7 @@ class SIFTS(PDB): chain_filter = 'UNK_COUNT < SEQRES_COUNT and ca_p_only == False and new_identity >=0.9 and repeated == False and reversed == False and OBS_STD_COUNT >= 20' entry_filter = '(experimental_method in ["X-ray diffraction", "Electron Microscopy"] and resolution <= 3) or experimental_method == "Solution NMR"' - complete_chains_run_as_completed = False + # complete_chains_run_as_completed = False # weight = array([1, -1, -1, -1.79072623, -2.95685934, -4.6231746]) @@ -2197,31 +2296,62 @@ def fetch_unp_fasta(cls, identifier): task = cls.UniProtFASTA.single_retrieve(identifier).then(a_seq_reader) cls.register_task((identifier, 'UniProtFASTA.single_retrieve(identifier).then(a_seq_reader)'), task) return task + + @unsync + async def get_sequence(self, **kwargs): + if self.source == 'UniProt': + return (await SIFTS.fetch_unp_fasta(self.get_id()))[1] + else: + return await super().get_sequence(**kwargs) @classmethod @unsync async def complete_chains(cls, dfrm: Union[DataFrame, Unfuture, Coroutine]): if isawaitable(dfrm): dfrm = await dfrm - if cls.complete_chains_run_as_completed: - res = await SIFTSs(dfrm.pdb_id.unique()).fetch('fetch_from_pdbe_api', - api_suffix='api/mappings/all_isoforms/', - then_func=Base.to_dataframe).run() - else: - res = [await task for task in SIFTSs(dfrm.pdb_id.unique()).fetch('fetch_from_pdbe_api', - api_suffix='api/mappings/all_isoforms/', - then_func=Base.to_dataframe).tasks] + #if cls.complete_chains_run_as_completed: + # res = await SIFTSs(dfrm.pdb_id.unique()).fetch('fetch_from_pdbe_api', + # api_suffix='api/mappings/all_isoforms/', + # then_func=Base.to_dataframe).run() + #else: + res = [await task for task in SIFTSs(frozenset(dfrm.pdb_id)).fetch('fetch_from_pdbe_api', + api_suffix='api/mappings/all_isoforms/', + then_func=Base.to_dataframe).tasks] return concat(res, sort=False, ignore_index=True) @staticmethod @unsync + async def check_whether_pdbe_api_lag(dfrm): + if isawaitable(dfrm): + dfrm = await dfrm + if dfrm is None: + return + elif isinstance(dfrm, DataFrame): + pass + else: + return + pdbs = PDBs(frozenset(dfrm.pdb_id)) + tasks = [await task for task in pdbs.fetch('fetch_from_pdbe_api', api_suffix='api/pdb/entry/molecules/').tasks] + pass_pdbs = [i.name[-8:-4] for i in tasks if i is not None] + mask = dfrm.pdb_id.isin(pass_pdbs) + if mask.any(): + if (~mask).any(): + warn(f"{dfrm[~mask].pdb_id.tolist()}: either obsoleted or API lag update", PossibleObsoletedPDBEntryWarning) + return dfrm[mask].reset_index(drop=True) + else: + return + + """@staticmethod + @unsync async def check_pdb_status(dfrm): if isinstance(dfrm, Unfuture): dfrm = await dfrm - if isinstance(dfrm, Tuple): - dfrm = dfrm[0] elif dfrm is None: return + elif isinstance(dfrm, DataFrame): + pass + else: + return pdbs = PDBs(dfrm.pdb_id.unique()) tasks = [await task for task in pdbs.fetch('fetch_from_pdbe_api', api_suffix='api/pdb/entry/status/', then_func=a_load_json, json=True).tasks] pass_pdbs = [next(iter(i)) for i in tasks if next(iter(i.values()))[0]['status_code'] == 'REL'] @@ -2229,13 +2359,13 @@ async def check_pdb_status(dfrm): if len(res) > 0: return res.reset_index(drop=True) else: - return + return""" @staticmethod @unsync def generate_new_identity(dfrm): ''' - new_identity(Seq_A, Seq_B)= identical_characters / length(trimed_alignment) + new_identity = #identical characters / #aligned columns including columns containing a gap in either sequence ''' if isinstance(dfrm, Unfuture): dfrm = dfrm.result() @@ -2247,7 +2377,7 @@ def generate_new_identity(dfrm): unp_range_col = 'new_unp_range' new_pdb_range_len = dfrm[pdb_range_col].apply(range_len) new_unp_range_len = dfrm[unp_range_col].apply(range_len) # TODO: drop - assert all(new_pdb_range_len==new_unp_range_len) # TODO: drop + assert (new_pdb_range_len==new_unp_range_len).all() # TODO: drop conflict_range_len = dfrm.conflict_pdb_range.apply(range_len) pdb_gaps = dfrm[pdb_range_col].apply(get_gap_list) unp_gaps = dfrm[unp_range_col].apply(get_gap_list) @@ -2286,7 +2416,7 @@ def reformat(dfrm: Union[DataFrame, Unfuture], drop_non_sequencial:bool=True) -> [[19, 264], [1107, 387]] """ pass_mask = (dfrm.pdb_start <= dfrm.pdb_end) & (dfrm.unp_start <= dfrm.unp_end) - if not all(pass_mask): + if not pass_mask.all(): warn(f"Drop:\n{dfrm[~pass_mask][['UniProt','pdb_id','struct_asym_id','pdb_start','pdb_end','unp_start','unp_end']]}") dfrm = dfrm[pass_mask].reset_index(drop=True) ''' @@ -2307,39 +2437,40 @@ def reformat(dfrm: Union[DataFrame, Unfuture], drop_non_sequencial:bool=True) -> return dfrm @staticmethod + def add_tage_to_range(df: DataFrame, tage_name: str): + # ADD TAGE FOR SIFTS + df[tage_name] = 'Safe' + # No Insertion But Deletion[Pure Deletion] + df.loc[df[(df['group_info'] == 1) & ( + df['diff+'] > 0)].index, tage_name] = 'Deletion' + # Insertion & No Deletion + df.loc[df[ + (df['group_info'] == 1) & + (df['diff-'] > 0)].index, tage_name] = 'Insertion_Undivided' + df.loc[df[ + (df['group_info'] > 1) & + (df['diff0'] == df['group_info']) & + (df['unp_gaps0'] == (df['group_info'] - 1))].index, tage_name] = 'Insertion' + # Insertion & Deletion + df.loc[df[ + (df['group_info'] > 1) & + (df['diff0'] == df['group_info']) & + (df['unp_gaps0'] != (df['group_info'] - 1))].index, tage_name] = 'InDel_1' + df.loc[df[ + (df['group_info'] > 1) & + (df['diff0'] != df['group_info']) & + (df['unp_gaps0'] != (df['group_info'] - 1))].index, tage_name] = 'InDel_2' + df.loc[df[ + (df['group_info'] > 1) & + (df['diff0'] != df['group_info']) & + (df['unp_gaps0'] == (df['group_info'] - 1))].index, tage_name] = 'InDel_3' + + @classmethod @unsync - def dealWithInDel(dfrm: Union[DataFrame, Unfuture], sort_by_unp: bool = True) -> DataFrame: + def dealWithInDel(cls, dfrm: Union[DataFrame, Unfuture], sort_by_unp: bool = True) -> DataFrame: if isinstance(dfrm, Unfuture): dfrm = dfrm.result() - def add_tage_to_range(df: DataFrame, tage_name: str): - # ADD TAGE FOR SIFTS - df[tage_name] = 'Safe' - # No Insertion But Deletion[Pure Deletion] - df.loc[df[(df['group_info'] == 1) & ( - df['diff+'] > 0)].index, tage_name] = 'Deletion' - # Insertion & No Deletion - df.loc[df[ - (df['group_info'] == 1) & - (df['diff-'] > 0)].index, tage_name] = 'Insertion_Undivided' - df.loc[df[ - (df['group_info'] > 1) & - (df['diff0'] == df['group_info']) & - (df['unp_gaps0'] == (df['group_info'] - 1))].index, tage_name] = 'Insertion' - # Insertion & Deletion - df.loc[df[ - (df['group_info'] > 1) & - (df['diff0'] == df['group_info']) & - (df['unp_gaps0'] != (df['group_info'] - 1))].index, tage_name] = 'InDel_1' - df.loc[df[ - (df['group_info'] > 1) & - (df['diff0'] != df['group_info']) & - (df['unp_gaps0'] != (df['group_info'] - 1))].index, tage_name] = 'InDel_2' - df.loc[df[ - (df['group_info'] > 1) & - (df['diff0'] != df['group_info']) & - (df['unp_gaps0'] == (df['group_info'] - 1))].index, tage_name] = 'InDel_3' - dfrm.pdb_range = dfrm.pdb_range.apply(json.loads) dfrm.unp_range = dfrm.unp_range.apply(json.loads) dfrm['group_info'] = dfrm.apply(lambda x: len( @@ -2363,7 +2494,7 @@ def add_tage_to_range(df: DataFrame, tage_name: str): dfrm['diff-'] = dfrm.range_diff.apply( lambda x: count_nonzero(x < 0)) dfrm['unp_gaps0'] = dfrm.unp_gaps.apply(lambda x: x.count(0)) - add_tage_to_range(dfrm, tage_name='sifts_range_tag') + cls.add_tage_to_range(dfrm, tage_name='sifts_range_tag') dfrm['repeated'] = dfrm.apply( lambda x: x['diff-'] > 0 and x['sifts_range_tag'] != 'Insertion_Undivided', axis=1) dfrm['repeated'] = dfrm.apply( @@ -2391,7 +2522,7 @@ def reverse_dict(info_dict): return res def min_unp(info_dict): - return min(len(set(res)) for res in product(*info_dict.values())) + return min(len(frozenset(res)) for res in product(*info_dict.values())) mol_df = await self.fetch_from_pdbe_api('api/pdb/entry/molecules/', Base.to_dataframe) mol_df = mol_df[mol_df.molecule_type.eq('polypeptide(L)')] @@ -2732,12 +2863,8 @@ async def a_re_align(cls, range_diff, pdb_range, unp_range, pdb_id, entity_id, U pdb_id=pdb_id,entity_id=entity_id,UniProt=UniProt) @unsync - async def pipe_base(self, complete_chains:bool=False, check_pdb_status:bool=False, skip_pdbs=None, only_canonical:bool=False): - init_task = self.fetch_from_pdbe_api('api/mappings/all_isoforms/', Base.to_dataframe) - if check_pdb_status: - init_task = await self.check_pdb_status(init_task) - else: - init_task = await init_task + async def pipe_base(self, complete_chains:bool=False, skip_pdbs=None, only_canonical:bool=False): + init_task = await self.fetch_from_pdbe_api('api/mappings/all_isoforms/', Base.to_dataframe).then(self.check_whether_pdbe_api_lag) if init_task is None: return elif skip_pdbs is not None: @@ -2762,9 +2889,9 @@ async def pipe_base(self, complete_chains:bool=False, check_pdb_status:bool=Fals return sifts_df @unsync - async def pipe_score(self, sifts_df=None, complete_chains:bool=False, check_pdb_status:bool=False, skip_pdbs=None): + async def pipe_score(self, sifts_df=None, complete_chains:bool=False, skip_pdbs=None): if sifts_df is None: - sifts_df = await self.pipe_base(complete_chains=complete_chains, check_pdb_status=check_pdb_status, skip_pdbs=skip_pdbs) + sifts_df = await self.pipe_base(complete_chains=complete_chains, skip_pdbs=skip_pdbs) if sifts_df is None: return exp_cols = ['pdb_id', 'resolution', 'experimental_method_class', @@ -3022,17 +3149,23 @@ async def pipe_select_base(self, exclude_pdbs=frozenset(), **kwargs): @staticmethod def select_mo(sele_df, OC_cutoff=0.2, sort_cols=['bs_score', '1/resolution', 'revision_date', 'id_score'], infer_new_col:bool=False, ascending=False, allow_mask=None): + sele_df = sele_df.sort_values(by=sort_cols, ascending=ascending).reset_index(drop=True) sele_df['select_tag'] = False sele_df['select_rank'] = -1 if infer_new_col: for col in sort_cols: if (col not in sele_df.columns) and (col[1:] in sele_df.columns) and (col[0] == '-'): sele_df[col] = -sele_df[col[1:]] - - allow_sele_df = sele_df[sele_df.bs_score > 0] if allow_mask is None else sele_df[allow_mask] + + if allow_mask is True: + allow_sele_df = sele_df + elif allow_mask is None: + allow_sele_df = sele_df[sele_df.bs_score > 0] + else: + sele_df[allow_mask] def sele_func(dfrm): - rank_index = dfrm.sort_values(by=sort_cols, ascending=ascending).index + rank_index = dfrm.index sele_df.loc[rank_index, 'select_rank'] = range(1, len(rank_index)+1) return select_range(dfrm.new_unp_range, rank_index, cutoff=OC_cutoff) @@ -3097,7 +3230,7 @@ def parallel_interact_df(sifts_df, i3d_df, common_cols=('revision_date', 'deposi store_2 = i3d_df.loc[swap_index, cols_2].rename(columns=dict(zip(cols_2, [col.replace('_2', '_1') for col in cols_2]))) i3d_df.loc[swap_index, cols_1] = store_2 i3d_df.loc[swap_index, cols_2] = store_1 - assert all((i3d_df.struct_asym_id_1 < i3d_df.struct_asym_id_2) | ((i3d_df.struct_asym_id_1 == i3d_df.struct_asym_id_2) & (i3d_df.model_id_1 < i3d_df.model_id_2))) + assert ((i3d_df.struct_asym_id_1 < i3d_df.struct_asym_id_2) | ((i3d_df.struct_asym_id_1 == i3d_df.struct_asym_id_2) & (i3d_df.model_id_1 < i3d_df.model_id_2))).all() return i3d_df ''' @@ -3117,8 +3250,8 @@ async def pipe_interface_res_dict(p_df, pdb_id): ''' @unsync - async def pipe_select_ho_base(self, exclude_pdbs=frozenset(), run_as_completed: bool=False, progress_bar=None, check_pdb_status:bool=False, skip_pdbs=None, select_mo_kwargs={}, **kwargs): - sele_df = await self.pipe_select_mo(exclude_pdbs=exclude_pdbs, check_pdb_status=check_pdb_status, skip_pdbs=skip_pdbs, select_mo_kwargs=select_mo_kwargs) + async def pipe_select_ho_base(self, exclude_pdbs=frozenset(), run_as_completed: bool=False, progress_bar=None, skip_pdbs=None, select_mo_kwargs={}, **kwargs): + sele_df = await self.pipe_select_mo(exclude_pdbs=exclude_pdbs, skip_pdbs=skip_pdbs, select_mo_kwargs=select_mo_kwargs) if sele_df is None: return chain_pairs = sele_df.groupby('pdb_id').struct_asym_id.apply( @@ -3134,7 +3267,8 @@ async def pipe_select_ho_base(self, exclude_pdbs=frozenset(), run_as_completed: return self.add_interact_common_cols(p_df) @staticmethod - def select_ho(p_df, interface_mapped_cov_cutoff=0.8, unp_range_DSC_cutoff=0.8, DSC_cutoff=0.2, sort_cols=['best_select_rank_score', 'second_select_rank_score', 'in_i3d'], ascending=False, allow_mask=None): + def select_ho(p_df, interface_mapped_cov_cutoff=0.8, unp_range_DSC_cutoff=0.8, DSC_cutoff=0.2, sort_cols=['css','best_select_rank_score', 'second_select_rank_score', 'in_i3d'], ascending=False, allow_mask=None): + p_df = p_df.sort_values(by=sort_cols, ascending=ascending).reset_index(drop=True) p_df['i_select_tag'] = False p_df['i_select_rank'] = -1 if allow_mask is None: @@ -3143,6 +3277,11 @@ def select_ho(p_df, interface_mapped_cov_cutoff=0.8, unp_range_DSC_cutoff=0.8, D (p_df.unp_range_DSC >= unp_range_DSC_cutoff) & ((p_df.unp_interface_range_1.apply(range_len)/p_df.interface_range_1.apply(range_len)) >= interface_mapped_cov_cutoff) & ((p_df.unp_interface_range_2.apply(range_len)/p_df.interface_range_2.apply(range_len)) >= interface_mapped_cov_cutoff)] + elif allow_mask is True: + allow_p_df = p_df[ + (p_df.unp_range_DSC >= unp_range_DSC_cutoff) & + ((p_df.unp_interface_range_1.apply(range_len)/p_df.interface_range_1.apply(range_len)) >= interface_mapped_cov_cutoff) & + ((p_df.unp_interface_range_2.apply(range_len)/p_df.interface_range_2.apply(range_len)) >= interface_mapped_cov_cutoff)] else: allow_p_df = p_df[ allow_mask & @@ -3151,7 +3290,7 @@ def select_ho(p_df, interface_mapped_cov_cutoff=0.8, unp_range_DSC_cutoff=0.8, D ((p_df.unp_interface_range_2.apply(range_len)/p_df.interface_range_2.apply(range_len)) >= interface_mapped_cov_cutoff)] def sele_func(dfrm): - rank_index = dfrm.sort_values(by=sort_cols, ascending=ascending).index + rank_index = dfrm.index p_df.loc[rank_index, 'i_select_rank'] = range(1, len(rank_index)+1) return select_ho_max_range(dfrm.unp_interface_range_1, dfrm.unp_interface_range_2, rank_index, cutoff=DSC_cutoff) @@ -3172,8 +3311,8 @@ async def pipe_select_ho(self, interface_mapped_cov_cutoff=0.8, unp_range_DSC_cu return self.select_ho(p_df, interface_mapped_cov_cutoff, unp_range_DSC_cutoff, DSC_cutoff) @unsync - async def pipe_select_ho_iso_base(self, exclude_pdbs=frozenset(), run_as_completed:bool=False, progress_bar=None, check_pdb_status:bool=False, skip_pdbs=None, select_mo_kwargs={}, **kwargs): - sele_df = await self.pipe_select_mo(exclude_pdbs=exclude_pdbs, complete_chains=True, check_pdb_status=check_pdb_status, skip_pdbs=skip_pdbs, select_mo_kwargs=select_mo_kwargs) + async def pipe_select_ho_iso_base(self, exclude_pdbs=frozenset(), run_as_completed:bool=False, progress_bar=None, skip_pdbs=None, select_mo_kwargs={}, **kwargs): + sele_df = await self.pipe_select_mo(exclude_pdbs=exclude_pdbs, complete_chains=True, skip_pdbs=skip_pdbs, select_mo_kwargs=select_mo_kwargs) if sele_df is None: return sele_df = sele_df[sele_df.Entry.eq(self.get_id().split('-')[0])] @@ -3189,30 +3328,29 @@ async def pipe_select_ho_iso_base(self, exclude_pdbs=frozenset(), run_as_complet return self.add_interact_common_cols(p_df) @classmethod - def select_ho_iso(cls, p_df, interface_mapped_cov_cutoff=0.8, DSC_cutoff=0.2, sort_cols=['bs_score_1', 'bs_score_2', '1/resolution', 'revision_date', 'in_i3d', 'id_score_1', 'id_score_2'], ascending=False, allow_mask=None): + def select_ho_iso(cls, p_df, interface_mapped_cov_cutoff=0.8, DSC_cutoff=0.2, sort_cols=['css', 'bs_score_1', 'bs_score_2', '1/resolution', 'revision_date', 'in_i3d', 'id_score_1', 'id_score_2'], ascending=False, allow_mask=None): return cls.select_he(p_df, interface_mapped_cov_cutoff, DSC_cutoff, sort_cols, ascending, allow_mask) @unsync - async def pipe_select_ho_iso(self, interface_mapped_cov_cutoff=0.8, DSC_cutoff=0.2, then_sort_interact:bool=True, **kwargs): + async def pipe_select_ho_iso(self, interface_mapped_cov_cutoff=0.8, DSC_cutoff=0.2, **kwargs): p_df = await self.pipe_select_ho_iso_base(**kwargs) if p_df is None: return else: - p_df = self.select_ho_iso(p_df, interface_mapped_cov_cutoff, DSC_cutoff) - return (await self.sort_interact_cols(p_df)) if then_sort_interact else p_df + return self.select_ho_iso(self.sort_interact_cols(p_df), interface_mapped_cov_cutoff, DSC_cutoff) @unsync - async def pipe_select_else_base(self, func:str, exclude_pdbs=frozenset(), run_as_completed:bool=False, progress_bar=None, check_pdb_status:bool=False, skip_pdbs=None, select_mo_kwargs={}, **kwargs): + async def pipe_select_else_base(self, func:str, exclude_pdbs=frozenset(), run_as_completed:bool=False, progress_bar=None, skip_pdbs=None, select_mo_kwargs={}, **kwargs): assert func != 'pipe_protein_protein_interface' - sele_df = await self.pipe_select_mo(exclude_pdbs=exclude_pdbs, check_pdb_status=check_pdb_status, skip_pdbs=skip_pdbs, select_mo_kwargs=select_mo_kwargs) + sele_df = await self.pipe_select_mo(exclude_pdbs=exclude_pdbs, skip_pdbs=skip_pdbs, select_mo_kwargs=select_mo_kwargs) if sele_df is None: return include_chains = sele_df.groupby('pdb_id').struct_asym_id.apply(frozenset) return await self.pisa_interact_protein_else(sele_df, include_chains, func, run_as_completed, progress_bar, **kwargs) @unsync - async def pipe_select_he_base(self, exclude_pdbs=frozenset(), run_as_completed:bool=False, progress_bar=None, check_pdb_status:bool=False, skip_pdbs=None, select_mo_kwargs={}, **kwargs): - sele_df = await self.pipe_select_mo(exclude_pdbs=exclude_pdbs, complete_chains=True, check_pdb_status=check_pdb_status, skip_pdbs=skip_pdbs, select_mo_kwargs=select_mo_kwargs) + async def pipe_select_he_base(self, exclude_pdbs=frozenset(), run_as_completed:bool=False, progress_bar=None, skip_pdbs=None, select_mo_kwargs={}, **kwargs): + sele_df = await self.pipe_select_mo(exclude_pdbs=exclude_pdbs, complete_chains=True, skip_pdbs=skip_pdbs, select_mo_kwargs=select_mo_kwargs) if sele_df is None: return if len(sele_df.Entry.unique()) == 1: @@ -3229,16 +3367,19 @@ async def pipe_select_he_base(self, exclude_pdbs=frozenset(), run_as_completed:b return self.add_interact_common_cols(p_df) @staticmethod - def select_else(p_df, interface_mapped_cov_cutoff=0.8, DSC_cutoff=0.2, sort_cols=['bs_score_1', '1/resolution', 'revision_date', 'id_score_1'], ascending=False, allow_mask=None): + def select_else(p_df, interface_mapped_cov_cutoff=0.8, DSC_cutoff=0.2, sort_cols=['css', 'bs_score_1', '1/resolution', 'revision_date', 'id_score_1'], ascending=False, allow_mask=None): + p_df = p_df.sort_values(by=sort_cols, ascending=ascending).reset_index(drop=True) p_df['i_select_tag'] = False p_df['i_select_rank'] = -1 if allow_mask is None: allow_mask = (p_df.bs_score_1 > 0) + elif allow_mask is True: + allow_mask = p_df.bs_score_1.notnull() allow_p_df = p_df[allow_mask & ( (p_df.unp_interface_range_1.apply(range_len)/p_df.interface_range_1.apply(range_len)) >= interface_mapped_cov_cutoff)] def sele_func(dfrm): - rank_index = dfrm.sort_values(by=sort_cols, ascending=ascending).index + rank_index = dfrm.index p_df.loc[rank_index, 'i_select_rank'] = range(1, len(rank_index)+1) return select_range(dfrm.unp_interface_range_1, rank_index, cutoff=DSC_cutoff, similarity_func=sorensen.similarity) @@ -3247,7 +3388,9 @@ def sele_func(dfrm): return p_df @staticmethod - def select_he(p_df, interface_mapped_cov_cutoff=0.8, DSC_cutoff=0.2, sort_cols=['bs_score_1', 'bs_score_2', '1/resolution', 'revision_date', 'in_i3d', 'id_score_1', 'id_score_2'], ascending=False, allow_mask=None): + def select_he(p_df, interface_mapped_cov_cutoff=0.8, DSC_cutoff=0.2, sort_cols=['css', 'bs_score_1', 'bs_score_2', '1/resolution', 'revision_date', 'in_i3d', 'id_score_1', 'id_score_2'], ascending=False, allow_mask=None): + p_df = p_df.sort_values(by=['i_group']+sort_cols, ascending=ascending).reset_index(drop=True) + # Groupby preserves the order of rows within each group. p_df['i_select_tag'] = False p_df['i_select_rank'] = -1 @@ -3256,14 +3399,18 @@ def select_he(p_df, interface_mapped_cov_cutoff=0.8, DSC_cutoff=0.2, sort_cols=[ (p_df.best_select_rank_score > 0) & ((p_df.unp_interface_range_1.apply(range_len)/p_df.interface_range_1.apply(range_len)) >= interface_mapped_cov_cutoff) & ((p_df.unp_interface_range_2.apply(range_len)/p_df.interface_range_2.apply(range_len)) >= interface_mapped_cov_cutoff)] + elif allow_mask is True: + allow_p_df = p_df[ + ((p_df.unp_interface_range_1.apply(range_len)/p_df.interface_range_1.apply(range_len)) >= interface_mapped_cov_cutoff) & + ((p_df.unp_interface_range_2.apply(range_len)/p_df.interface_range_2.apply(range_len)) >= interface_mapped_cov_cutoff)] else: allow_p_df = p_df[ allow_mask & ((p_df.unp_interface_range_1.apply(range_len)/p_df.interface_range_1.apply(range_len)) >= interface_mapped_cov_cutoff) & ((p_df.unp_interface_range_2.apply(range_len)/p_df.interface_range_2.apply(range_len)) >= interface_mapped_cov_cutoff)] - + def sele_func(dfrm): - rank_index = dfrm.sort_values(by=sort_cols, ascending=ascending).index + rank_index = dfrm.index p_df.loc[rank_index, 'i_select_rank'] = range(1, len(rank_index)+1) return select_he_range(dfrm.UniProt_1, dfrm.UniProt_2, dfrm.unp_interface_range_1, dfrm.unp_interface_range_2, rank_index, cutoff=DSC_cutoff) @@ -3272,13 +3419,12 @@ def sele_func(dfrm): return p_df @unsync - async def pipe_select_he(self, interface_mapped_cov_cutoff=0.8, DSC_cutoff=0.2, then_sort_interact:bool=True, **kwargs): + async def pipe_select_he(self, interface_mapped_cov_cutoff=0.8, DSC_cutoff=0.2, **kwargs): p_df = await self.pipe_select_he_base(**kwargs) if p_df is None: return else: - p_df = self.select_he(p_df, interface_mapped_cov_cutoff, DSC_cutoff) - return (await self.sort_interact_cols(p_df)) if then_sort_interact else p_df + return self.select_he(self.sort_interact_cols(p_df), interface_mapped_cov_cutoff, DSC_cutoff) @unsync async def pipe_select_else(self, interface_mapped_cov_cutoff=0.8, DSC_cutoff=0.2, **kwargs): @@ -3288,7 +3434,7 @@ async def pipe_select_else(self, interface_mapped_cov_cutoff=0.8, DSC_cutoff=0.2 else: return self.select_else(p_df, interface_mapped_cov_cutoff, DSC_cutoff) - @unsync + #@unsync def sort_interact_cols(self, dfrm): assert self.source == 'UniProt' if isinstance(dfrm, Unfuture): @@ -3300,7 +3446,7 @@ def sort_interact_cols(self, dfrm): store_2 = dfrm.loc[swap_index, cols_2].rename(columns=dict(zip(cols_2, [col.replace('_2', '_1') for col in cols_2]))) dfrm.loc[swap_index, cols_1] = store_2 dfrm.loc[swap_index, cols_2] = store_1 - dfrm['i_group'] = dfrm.apply(lambda x: (x['UniProt_1'], x['UniProt_2']), axis=1) + dfrm['i_group'] = dfrm.apply(lambda x: "('%s','%s')" % (x['UniProt_1'], x['UniProt_2']), axis=1) return dfrm @staticmethod @@ -3347,7 +3493,7 @@ def add_interact_common_cols(cls, p_df): 'select_rank_2']].apply(lambda x: 1/max(x), axis=1) p_df['unp_interface_range_1'] = p_df.apply(lambda x: to_interval(cls.convert_index(x['new_unp_range_1'], x['new_pdb_range_1'], expand_interval(x['interface_range_1']))), axis=1) p_df['unp_interface_range_2'] = p_df.apply(lambda x: to_interval(cls.convert_index(x['new_unp_range_2'], x['new_pdb_range_2'], expand_interval(x['interface_range_2']))), axis=1) - p_df['i_group'] = p_df.apply(lambda x: tuple(sorted((x['UniProt_1'], x['UniProt_2']))), axis=1) + p_df['i_group'] = p_df.apply(lambda x: "('%s','%s')" % tuple(sorted((x['UniProt_1'], x['UniProt_2']))), axis=1) return p_df @unsync @@ -3408,7 +3554,8 @@ def get_id_score_for_assembly(args): @unsync async def pisa_interact_protein_else(self, sele_df, include_chains, func:str, run_as_completed:bool=False, progress_bar=None, **kwargs): - ob = PDBs(include_chains.index).fetch('pipe_interface_res_dict_ic', include_chains=include_chains, use_copies=True, func=func, **kwargs) + # TODO: check + ob = PDBs(include_chains.index).fetch('pipe_interface_res_dict_ic', include_chains=include_chains, func=func, **kwargs) interact_df = await self.schedule_interface_tasks(ob, run_as_completed, progress_bar) if len(interact_df) == 0: return @@ -3417,7 +3564,7 @@ async def pisa_interact_protein_else(self, sele_df, include_chains, func:str, ru if col not in interact_df.columns: interact_df[col] = nan check_mask = interact_df.molecule_type_1.isin(('polypeptide(L)', 'polypeptide(D)')) - if not all(check_mask): + if not check_mask.all(): # EXAMPLE: 5b0y/0/78 warn('Outdated PISA chain identifier! Current data could be ligand related: ' + str(interact_df[~check_mask].head(1).to_dict('records')[0]), PISAErrorWarning) @@ -3436,7 +3583,7 @@ async def pisa_interact_protein_else(self, sele_df, include_chains, func:str, ru @unsync async def pisa_interact_integrate_with_i3d(self, sele_df, chain_pairs, interaction_type:str, run_as_completed:bool=False, progress_bar=None, **kwargs): - ob = PDBs(chain_pairs.index).fetch('pipe_interface_res_dict', chain_pairs=chain_pairs, au2bu=True, func='pipe_protein_protein_interface', **kwargs) + ob = PDBs(chain_pairs.index).fetch('pipe_interface_res_dict', chain_pairs=chain_pairs, func='pipe_protein_protein_interface', **kwargs) interact_df = await self.schedule_interface_tasks(ob, run_as_completed, progress_bar) if len(interact_df) == 0: return @@ -3506,7 +3653,7 @@ def meta_sifts_annotation(cls, path): df['resource'] = resource_col return df[['pdb_id', 'entity_id', 'struct_asym_id', 'chain_id', 'resource', resource_col, 'start', 'end']].rename(columns={ resource_col: 'resource_id', - 'start': 'pdb_start', + 'start': 'pdb_beg', 'end': 'pdb_end'}) @staticmethod diff --git a/pdb_profiling/processors/proteins/record.py b/pdb_profiling/processors/proteins/record.py index 7b47917..fc02bcb 100644 --- a/pdb_profiling/processors/proteins/record.py +++ b/pdb_profiling/processors/proteins/record.py @@ -149,9 +149,9 @@ async def query_from_DB_with_unp(self, table_name:str, columns: str = '*', exist ''' @unsync - async def fetch_from_proteins_api(self, suffix, id_suffix='', with_source:bool=False, params={}, rate=1.5): + async def fetch_from_proteins_api(self, api_suffix, id_suffix='', with_source:bool=False, params={}, rate=1.5): return await ProteinsAPI.single_retrieve( - suffix=suffix, + suffix=api_suffix, params=params, folder=self.proteins_api_folder, semaphore=self.proteins_api_web_semaphore, @@ -189,10 +189,10 @@ def yield_mapping_unit(data): yield to_flat @unsync - async def alignment_df(self, **kwargs): - assert self.source in ('Taxonomy', 'UniProt') + async def alignment_df(self, api_suffix='coordinates/', **kwargs): + #assert self.source in ('Taxonomy', 'UniProt') return DataFrame(self.yield_mapping( - await self.fetch_from_proteins_api('coordinates/', **kwargs).then(a_load_json))).rename(columns={'id': 'ensemblExonId'}) + await self.fetch_from_proteins_api(api_suffix, **kwargs).then(a_load_json))).rename(columns={'id': 'ensemblExonId'}) @unsync async def fetch_proteins_from_ProteinsAPI(self, reviewed='true', isoform=0, **kwargs): diff --git a/pdb_profiling/processors/swissmodel/api.py b/pdb_profiling/processors/swissmodel/api.py index 43ccd78..ee99f49 100644 --- a/pdb_profiling/processors/swissmodel/api.py +++ b/pdb_profiling/processors/swissmodel/api.py @@ -49,14 +49,15 @@ def set_folder(cls, folder: Union[Path, str]): async def set_web_semaphore(cls, web_semaphore_values): cls.web_semaphore = await init_semaphore(web_semaphore_values) - @staticmethod - def yieldSMR(data: Dict): + @classmethod + def yieldSMR(cls, data: Dict): cols = ('sequence_length', 'ac', 'id', 'isoid') uniprot_entries = data['result']['uniprot_entries'] - - assert len( - uniprot_entries) == 1, f"Unexpected length of uniprot_entries: {uniprot_entries}" + if len(uniprot_entries) == 0: + cls.logger.warning(f'{data}: Zero length of uniprot_entries') + return + assert len(uniprot_entries) == 1, f"Unexpected length of uniprot_entries: {uniprot_entries}" for col in ('ac', 'id', 'isoid'): data['result'][col] = uniprot_entries[0].get(col, None) diff --git a/setup.py b/setup.py index fe131a0..f678a9a 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ setup( name="pdb_profiling", - version='0.3.2', + version='0.3.3', packages=find_namespace_packages(), entry_points={'console_scripts': ['pdb_profiling=pdb_profiling.commands.command:Interface']}, @@ -28,7 +28,7 @@ 'pandas>=1.1.5', 'numpy>=1.19.2', 'textdistance>=4.2.0', - 'databases[sqlite]>=0.3.2', + 'databases[sqlite]>=0.4.3', 'rich>=9.5.0', 'orm>=0.1.5', 'scikit-learn>=0.23.2', diff --git a/test/pytest/test_command.py b/test/pytest/test_command.py index b1be4b6..ae5a686 100644 --- a/test/pytest/test_command.py +++ b/test/pytest/test_command.py @@ -32,6 +32,9 @@ def test_command(): '-d', 'label_asym_id=B,label_seq_id=100', '-m', 'post', '-t', 'A_90_B_10'], + 'sifts-mapping --func pipe_select_ho --chunksize 5', + 'insert-interaction -i test/pytest/demo_dir/pipe_select_ho.tsv', + 'export-interaction-mapping -o e_interaction_resmap.tsv', ): result = runner.invoke(Interface, dargs+task.split(' ') if not isinstance(task, list) else dargs+task) assert result.exit_code == 0, str(task) diff --git a/test/pytest/test_sele.py b/test/pytest/test_sele.py index b0fa7ac..2b64714 100644 --- a/test/pytest/test_sele.py +++ b/test/pytest/test_sele.py @@ -20,13 +20,12 @@ def test_init(): Interactome3D.pipe_init_interaction_meta().result() -@pytest.mark.timeout(240) +@pytest.mark.timeout(300) def test_single_select(): # SIFTS.chain_filter, SIFTS.entry_filter = '', '' demo = SIFTS('P21359-2') demo.pipe_base().then(SIFTS.double_check_conflict_and_range).result() demo.pipe_scheduled_ranged_map_res_df().result() - demo.pipe_select_ho(run_as_completed=True, progress_bar=track).result() demo.pipe_select_he(run_as_completed=True, progress_bar=track).result() demo.pipe_select_ho_iso(run_as_completed=True).result() demo.pipe_select_else(func='pipe_protein_ligand_interface', css_cutoff=0.5, run_as_completed=True).result() @@ -80,7 +79,7 @@ def test_other_api(): PDBAssembly('1a01/1').add_args().assembly_summary -@pytest.mark.timeout(70) +@pytest.mark.timeout(80) def test_pdbekdb_self_annotation(): """from pdb_profiling.processors.pdbe.api import PDBeKBAnnotations PDBeKBAnnotations.root = PDBeKBAnnotations.ftp_root @@ -96,7 +95,7 @@ def test_fetch_residue_mapping(): pdb_ob.fetch_residue_mapping(entity_id=1, start=252, end=255).result() -@pytest.mark.timeout(60) +@pytest.mark.timeout(80) def test_rcsb_data_api(): pdb_id = '3hl2' ob = PDB(pdb_id) @@ -128,7 +127,7 @@ def test_other_SIFTS_func(): try: SIFTS('P21359').fetch_from_pdbe_api('api/mappings/all_isoforms/' ).then(SIFTS.to_dataframe - ).then(SIFTS.check_pdb_status + #).then(SIFTS.check_pdb_status ).then(SIFTS.check_identity ).then(SIFTS.reformat ).then(SIFTS.deal_with_identical_entity_seq).result()