Skip to content

Commit

Permalink
🤹‍♂️towards v0.2.12 (#13)
Browse files Browse the repository at this point in the history
* 🐎standardize e-r-m output

* 🐃fix bug report

* 🧶add header for unp api

* strange

* 🦺Impl UniProtTXT

* 🎫add SMRModel table

* 🛒add MappedMutation table

* 🧦fix err

* 🎃fix stream_txt retry

* 🧶check whether uniprot api recover

* 🤹‍♂️towards v0.2.12
  • Loading branch information
NatureGeorge authored Apr 10, 2021
1 parent 4f8cb48 commit 76c9de2
Show file tree
Hide file tree
Showing 15 changed files with 343 additions and 147 deletions.
20 changes: 5 additions & 15 deletions .github/ISSUE_TEMPLATE/bug_report.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,29 +10,19 @@ assignees: ''
**Describe the bug**
A clear and concise description of what the bug is.


**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error


**Expected behavior**
A clear and concise description of what you expected to happen.

**Screenshots**
If applicable, add screenshots to help explain your problem.

**Desktop (please complete the following information):**
- OS: [e.g. iOS]
- Browser [e.g. chrome, safari]
- Version [e.g. 22]
**INFO (please complete the following information):**
- OS: [e.g. Debian]
- Version: [e.g. 0.2.10]

**Smartphone (please complete the following information):**
- Device: [e.g. iPhone6]
- OS: [e.g. iOS8.1]
- Browser [e.g. stock browser, safari]
- Version [e.g. 22]

**Additional context**
Add any other context about the problem here.
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# pdb-profiling

[![DOI](https://img.shields.io/badge/DOI-10.5281/zenodo.4596476%20-gray.svg?colorB=5A65B3&style=flat)](https://zenodo.org/badge/latestdoi/247475852)
[![DOI](https://zenodo.org/badge/247475852.svg)](https://zenodo.org/badge/latestdoi/247475852)
[![License](https://img.shields.io/badge/License-MIT-blue.svg?style=flat&logo=github&colorB=5A65B3)](https://github.com/naturegeorge/pdb-profiling/blob/master/LICENSE)
[![SupportPythonVersion](https://img.shields.io/pypi/pyversions/pdb-profiling.svg?style=flat&logo=python&colorB=5A65B3)](https://pypi.org/project/pdb-profiling/)
[![Version](https://img.shields.io/pypi/v/pdb-profiling?style=flat&logo=PYPI&colorB=5A65B3)](https://github.com/naturegeorge/pdb-profiling/blob/master/pdb_profiling/__init__.py)
Expand Down Expand Up @@ -82,7 +82,6 @@ python setup.py install # or "sudo python setup.py install" or "pyt
* SWISS-MODEL Repository API
* UniProt API
* EBI Proteins API
* Interactome3D API
* RCSB Data API
* RCSB Search API
* ...
Expand Down
8 changes: 4 additions & 4 deletions pdb_profiling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# @Author: ZeFeng Zhu
# @Last Modified: 2020-05-13 08:54:09 pm
# @Copyright (c) 2020 MinghuiGroup, Soochow University
__version__ = '0.2.10'
__version__ = '0.2.12'


def default_config(folder='./'):
Expand All @@ -13,7 +13,7 @@ def default_config(folder='./'):
from pdb_profiling.processors.pdbe.record import Base, PDB
# from pdb_profiling.processors.pdbe import api as pdbe_api
from pdb_profiling.processors.proteins.record import Identifier
from pdb_profiling.processors.uniprot.api import UniProtFASTA, UniProtAPI
from pdb_profiling.processors.uniprot.api import UniProtINFO, UniProtAPI
from pdb_profiling.processors.uniprot.record import UniProts
from pdb_profiling.processors.i3d.api import Interactome3D
from pdb_profiling.processors.swissmodel.api import SMR
Expand All @@ -27,15 +27,15 @@ def default_config(folder='./'):
Base.set_web_semaphore(30).result()
Base.set_rcsb_web_semaphore(6).result()
Identifier.set_web_semaphore(25).result()
UniProtFASTA.set_web_semaphore(30).result()
UniProtINFO.set_web_semaphore(30).result()
UniProtAPI.set_web_semaphore(30).result()
Interactome3D.set_web_semaphore(30).result()
SMR.set_web_semaphore(30).result()
# Set Folder that store downloaded and handled files
Base.set_folder(folder)
PDB.set_folder(folder)
Identifier.set_folder(folder)
UniProtFASTA.set_folder(folder)
UniProtINFO.set_folder(folder)
UniProtAPI.set_folder(folder)
UniProts.set_folder(folder)
Interactome3D.set_folder(folder)
Expand Down
28 changes: 27 additions & 1 deletion pdb_profiling/commands/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,12 +121,38 @@ class UniProtAnnotation(orm.Model):
unp_start = orm.Integer()
unp_end = orm.Integer()

class SMRModel(orm.Model):
    # ORM table 'SMRModel', bound to the metadata/database of the enclosing
    # object (`self`). Rows are inserted by the `insert-smr-mapping` command
    # and read by `export-smr-mutation-mapping` in commands/command.py.
    # NOTE(review): presumably one SWISS-MODEL Repository model per row -- confirm.
    __tablename__ = 'SMRModel'
    __metadata__ = self.metadata
    __database__ = self.database
    # Composite primary key: (UniProt, coordinates).
    UniProt = orm.String(max_length=50, primary_key=True)
    coordinates = orm.String(max_length=500, primary_key=True)
    # Filled from the 'from' / 'to' columns of the inserted file; the export
    # query keeps mutations with unp_beg <= Pos <= unp_end.
    unp_beg = orm.Integer()
    unp_end = orm.Integer()
    # The export query filters on `identity >= identity_cutoff`.
    identity = orm.Float()
    similarity = orm.Float()
    coverage = orm.Float()
    # Filled from the 'oligo-state' column of the inserted file.
    oligo_state = orm.String(max_length=50)
    # Derived on insert as `ligand_chains.notnull()`.
    with_ligand = orm.Boolean()
    # The export query only considers rows with select_rank > 0 and, with
    # --sele, keeps MIN(select_rank) per (UniProt, Pos, Alt) group.
    select_rank = orm.Integer()
    select_tag = orm.Boolean()

class MappedMutation(orm.Model):
    # ORM table 'MappedMutation', bound to the metadata/database of the
    # enclosing object (`self`). Rows are inserted by the
    # `insert-sele-mutation-mapping` command; `export-smr-mutation-mapping`
    # excludes any mutation whose (UniProt, Pos, Alt) already exists here.
    __tablename__ = 'MappedMutation'
    __metadata__ = self.metadata
    __database__ = self.database
    # All four columns together form the composite primary key.
    UniProt = orm.String(max_length=50, primary_key=True)
    # NOTE(review): max_length=3 suggests residue codes of up to three
    # letters for Ref/Alt -- confirm against the inserted files.
    Ref = orm.String(max_length=3, primary_key=True)
    Pos = orm.Integer(primary_key=True)
    Alt = orm.String(max_length=3, primary_key=True)

self.AAThree2one = AAThree2one
self.UniProtSeq = UniProtSeq
self.Mutation = Mutation
self.IDMapping = IDMapping
self.UniProtAnnotation = UniProtAnnotation
#self.ResidueMapping = ResidueMapping
self.ResidueMappingRange = ResidueMappingRange
self.SelectedMappingMeta = SelectedMappingMeta
self.ResidueAnnotation = ResidueAnnotation
self.SMRModel = SMRModel
self.MappedMutation = MappedMutation
150 changes: 130 additions & 20 deletions pdb_profiling/commands/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def init_folder():


@Interface.command("insert-mutation")
@click.option("--input", help="the file that contains sites info", type=click.Path())
@click.option("-i", "--input", help="the file that contains sites info", type=click.Path())
@click.option("--sep", default="\t", help="the seperator of input file", type=str)
@click.option("--usecols", default='from_id,Ref,Pos,Alt', help="The comma-sep columns of site info", type=str)
@click.option("--headers/--no-headers", default=True, is_flag=True)
Expand Down Expand Up @@ -96,15 +96,17 @@ def do_nothing(dfrm):


@Interface.command("id-mapping")
@click.option('--input', type=click.Path(), default=None)
@click.option('-i', '--input', type=click.Path(), default=None)
@click.option('--column', type=str, default=None)
@click.option('--sep', type=str, default='\t')
@click.option('--chunksize', type=int, help="the chunksize parameter", default=200)
@click.option('--chunksize', type=int, help="the chunksize parameter", default=50)
@click.option('--auto_assign/--no-auto_assign', default=True, is_flag=True)
@click.option('--sleep/--no-sleep', default=True, is_flag=True)
@click.pass_context
def id_mapping(ctx, input, column, sep, chunksize, sleep):
def id_mapping(ctx, input, column, sep, chunksize, auto_assign, sleep):
sqlite_api = ctx.obj['custom_db']
cols = ('ftId', 'Entry', 'isoform', 'is_canonical')
Identifier.auto_assign_when_seq_conflict = auto_assign
if input is None:
total = unsync_run(sqlite_api.database.fetch_one(
query="SELECT COUNT(DISTINCT ftId) FROM Mutation WHERE ftId NOT IN (SELECT DISTINCT ftId FROM IDMapping)"))[0]
Expand Down Expand Up @@ -139,7 +141,7 @@ def id_mapping(ctx, input, column, sep, chunksize, sleep):
values = [dict(zip(cols, i)) for i in res]
if values:
sqlite_api.sync_insert(sqlite_api.IDMapping, values)
console.log(f'Done: {len(res)+chunksize*index}')
console.log(f'Done: {len(res)+index}')
if sleep:
tsleep(uniform(1, 10))

Expand Down Expand Up @@ -177,17 +179,17 @@ def get_seq(seq_dict, iso, pos):


@Interface.command("sifts-mapping")
@click.option('--input', type=click.Path(), default=None)
@click.option('-i', '--input', type=click.Path(), default=None)
@click.option('--column', type=str, default=None)
@click.option('--sep', type=str, default='\t')
@click.option('--func', type=str, default='pipe_select_mo')
@click.option('--kwargs', type=str, default='{}')
@click.option('--chunksize', type=int, help="the chunksize parameter", default=50)
@click.option('--entry_filter', type=str, default='(release_date < "20210101") and ((experimental_method in ["X-ray diffraction", "Electron Microscopy"] and resolution <= 3) or experimental_method == "Solution NMR")')
@click.option('--chain_filter', type=str, default="UNK_COUNT < SEQRES_COUNT and ca_p_only == False and identity >=0.9 and repeated == False and reversed == False and OBS_COUNT > 20")
@click.option('--chain_filter', type=str, default="UNK_COUNT < SEQRES_COUNT and ca_p_only == False and identity >=0.9 and repeated == False and reversed == False and OBS_STD_COUNT >= 20")
@click.option('--skip_pdbs', type=str, default='')
@click.option('--omit', type=int, default=0)
@click.option('--output', type=str, default='')
@click.option('-o', '--output', type=str, default='')
@click.option('--iteroutput/--no-iteroutput', default=True, is_flag=True)
@click.option('--sleep/--no-sleep', default=True, is_flag=True)
@click.pass_context
Expand Down Expand Up @@ -265,9 +267,9 @@ def get_unp_id(args):


@Interface.command("residue-mapping")
@click.option('--input', type=click.Path())
@click.option('-i', '--input', type=click.Path())
@click.option('--chunksize', type=int, help="the chunksize parameter", default=500)
@click.option('--output', type=str, default=None)
@click.option('-o', '--output', type=str, default=None)
@click.option('--sleep/--no-sleep', default=True, is_flag=True)
@click.pass_context
def residue_mapping(ctx, input, chunksize, output, sleep):
Expand Down Expand Up @@ -301,7 +303,7 @@ def residue_mapping(ctx, input, chunksize, output, sleep):


@Interface.command('insert-sele-mapping')
@click.option('--input', type=click.Path())
@click.option('-i', '--input', type=click.Path())
@click.option('--chunksize', type=int, help="the chunksize parameter", default=10000)
@click.pass_context
def sele_mapping(ctx, input, chunksize):
Expand All @@ -322,7 +324,7 @@ def sele_mapping(ctx, input, chunksize):


@Interface.command('insert-sifts-meta')
@click.option('--input', type=click.Path())
@click.option('-i', '--input', type=click.Path())
@click.option('--chunksize', type=int, help="the chunksize parameter", default=500)
@click.option('--func', type=str, default='fetch_from_pdbe_api')
@click.option('--api_suffix', type=str)
Expand All @@ -338,7 +340,7 @@ async def insert_meta(pdb):
if df is not None:
await custom_db.async_insert(custom_db.ResidueAnnotation, df.to_dict('records'))

df = read_csv(input, header=None, chunksize=chunksize)
df = read_csv(input, header=None, chunksize=chunksize, keep_default_na=False, na_values=[''])
done = 0
for ids in df:
pdbs = PDBs(ids[0].unique())
Expand Down Expand Up @@ -375,22 +377,24 @@ def expand_iso_range(res):
console.log(f'Done: {len(res)+chunksize*i}')


@Interface.command('export-residue-mapping')
@Interface.command('export-mutation-mapping')
@click.option('--with_id/--no-with_id', is_flag=True, default=False)
@click.option('--sele/--no-sele', is_flag=True, default=True)
@click.option('-o', '--output', type=str, help='filename of output file')
@click.option("--sep", default="\t", help="the seperator of output file", type=str)
@click.pass_context
def export_residue_remapping(ctx, sele, output, sep):
def export_residue_remapping(ctx, with_id, sele, output):
output_path = ctx.obj['folder']/output
query = """
SELECT DISTINCT
%s
CASE IDMapping.is_canonical
WHEN 1
THEN IDMapping.Entry
ELSE IDMapping.isoform
END edUniProt, Mutation.Ref, Mutation.Pos, Mutation.Alt,
Mutation.Pos - ResidueMappingRange.unp_beg + ResidueMappingRange.pdb_beg AS residue_number,
(Mutation.Pos - ResidueMappingRange.unp_beg + ResidueMappingRange.auth_pdb_beg)||ResidueMappingRange.author_insertion_code AS auth_res_num,
Mutation.Pos - ResidueMappingRange.unp_beg + ResidueMappingRange.auth_pdb_beg AS author_residue_number,
ResidueMappingRange.author_insertion_code,
ResidueMappingRange.observed_ratio,
ResidueMappingRange.pdb_id,
ResidueMappingRange.entity_id,
Expand All @@ -413,20 +417,126 @@ def export_residue_remapping(ctx, sele, output, sep):
AND ResidueMappingRange.observed_ratio > 0
AND (ResidueMappingRange.residue_name = '' OR ResidueMappingRange.residue_name IN (SELECT three_letter_code FROM AAThree2one))
AND SelectedMappingMeta.select_rank != -1
{}
;"""
{} ;"""
if with_id:
query = query % 'Mutation.ftId,'
else:
query = query % ''
if sele:
query = query.format('MIN(SelectedMappingMeta.after_select_rank)', 'GROUP BY ResidueMappingRange.UniProt, Mutation.Pos, Mutation.Alt')
else:
query = query.format('SelectedMappingMeta.after_select_rank', '')
with console.status("[bold green]query..."):
dfs = read_sql_query(query, ctx.obj['custom_db'].engine, chunksize=10000)
for df in dfs:
if df.shape[0] == 0:
continue
df.rename(columns={'edUniProt': 'UniProt'}).to_csv(
output, index=False, sep=sep, mode='a+', header=not output_path.exists())
output, index=False, mode='a+', sep='\t', header=not output_path.exists())
console.log(f'result saved in {output_path}')


# CLI command: load (UniProt, Ref, Pos, Alt) rows from a tab-separated file and
# insert them chunk by chunk into the MappedMutation table of the custom DB.
# (Kept as comments rather than a docstring so the click --help text is unchanged.)
@Interface.command('insert-sele-mutation-mapping')
@click.option('-i', '--input', type=click.Path())
@click.option('--chunksize', type=int, help="the chunksize parameter", default=10000)
@click.pass_context
def insert_mapped_resmap(ctx, input, chunksize):
    database = ctx.obj['custom_db']
    wanted_columns = ['UniProt', 'Ref', 'Pos', 'Alt']
    # Only the empty string is treated as missing; literal values such as
    # "NA" are kept as-is.
    chunks = read_csv(
        input,
        sep='\t',
        usecols=wanted_columns,
        chunksize=chunksize,
        keep_default_na=False,
        na_values=[''],
    )
    inserted = 0
    for chunk in chunks:
        records = chunk.to_dict('records')
        database.sync_insert(database.MappedMutation, records)
        inserted += chunk.shape[0]
        # Running total of rows handed to the database so far.
        console.log(f'Done: {inserted}')


@Interface.command('export-smr-mutation-mapping')
@click.option('--identity_cutoff', type=float, default=0)
@click.option('--length_cutoff', type=int, default=0)
@click.option('--with_id/--no-with_id', is_flag=True, default=False)
@click.option('--sele/--no-sele', is_flag=True, default=True)
@click.option('--allow_oligo_state', type=str, default=None)
@click.option('-o', '--output', type=str, help='filename of output file')
@click.pass_context
def export_smr_residue_remapping(ctx, identity_cutoff, length_cutoff, with_id, sele, allow_oligo_state, output):
    """Export mutations covered by SMRModel rows but absent from MappedMutation."""
    # Parameters (all supplied by click):
    #   identity_cutoff   -- minimum SMRModel.identity to keep a model
    #   length_cutoff     -- minimum covered length (unp_end - unp_beg + 1)
    #   with_id           -- prepend Mutation.ftId to the selected columns
    #   sele              -- keep only MIN(select_rank) per (UniProt, Pos, Alt)
    #   allow_oligo_state -- SQL list literal for `oligo_state IN ...`, or None
    #   output            -- file name, resolved inside ctx.obj['folder']
    output_path = ctx.obj['folder']/output
    query = """
        SELECT DISTINCT
        %s
        CASE IDMapping.is_canonical
        WHEN 1
        THEN IDMapping.Entry
        ELSE IDMapping.isoform
        END edUniProt, Mutation.Ref, Mutation.Pos, Mutation.Alt,
        SMRModel.oligo_state,SMRModel.select_tag,SMRModel.coordinates,
        {}
        FROM Mutation, SMRModel
        INNER JOIN IDMapping ON Mutation.ftId = IDMapping.ftId
        INNER JOIN UniProtSeq ON UniProtSeq.isoform = IDMapping.isoform
        AND UniProtSeq.Pos = Mutation.Pos
        AND UniProtSeq.Ref = Mutation.Ref
        WHERE SMRModel.UniProt = edUniProt
        AND Mutation.Pos >= SMRModel.unp_beg
        AND Mutation.Pos <= SMRModel.unp_end
        AND SMRModel.identity >= %s
        AND SMRModel.select_rank > 0
        AND SMRModel.unp_end - SMRModel.unp_beg + 1 >= %s
        %s
        AND NOT EXISTS (SELECT * FROM MappedMutation
        WHERE edUniProt = MappedMutation.UniProt
        AND MappedMutation.Pos = Mutation.Pos
        AND MappedMutation.Alt = Mutation.Alt LIMIT 1)
        {};
        """
    id_column = 'Mutation.ftId,' if with_id else ''
    # NOTE(review): allow_oligo_state is spliced into the SQL verbatim (the
    # caller passes a full list literal such as "('monomer')"). That is only
    # acceptable because it comes from the local command line; never feed it
    # untrusted input.
    if allow_oligo_state is None:
        oligo_clause = ''
    else:
        oligo_clause = f"AND SMRModel.oligo_state IN {allow_oligo_state}"
    # First pass fills the %s slots; the {} slots are filled afterwards.
    query = query % (id_column, identity_cutoff, length_cutoff, oligo_clause)
    if sele:
        query = query.format('MIN(SMRModel.select_rank)', 'GROUP BY SMRModel.UniProt, Mutation.Pos, Mutation.Alt')
    else:
        query = query.format('SMRModel.select_rank', '')
    with console.status("[bold green]query..."):
        dfs = read_sql_query(query, ctx.obj['custom_db'].engine, chunksize=10000)
        for df in dfs:
            if df.shape[0] == 0:
                continue
            # Write to the same path that the header check and the final log
            # message refer to; writing to the bare `output` name put the file
            # in the current directory and repeated the header on every chunk
            # whenever the folder was not the current directory.
            df.rename(columns={'edUniProt': 'UniProt'}).to_csv(
                output_path, index=False, mode='a+', sep='\t', header=not output_path.exists())
    console.log(f'result saved in {output_path}')


# CLI command: read a tab-separated table of models and insert it chunk by
# chunk into the SMRModel table, deriving `with_ligand` from `ligand_chains`
# and renaming file columns to the table's column names.
# (Kept as comments rather than a docstring so the click --help text is unchanged.)
@Interface.command('insert-smr-mapping')
@click.option('-i', '--input', type=click.Path())
@click.option('--chunksize', type=int, help="the chunksize parameter", default=10000)
@click.pass_context
def insert_smr_mapping(ctx, input, chunksize):
    database = ctx.obj['custom_db']
    source_columns = [
        'UniProt', 'coordinates', 'from', 'to', 'identity', 'similarity',
        'coverage', 'oligo-state', 'ligand_chains', 'select_rank', 'select_tag',
    ]
    # File column name -> SMRModel column name.
    column_renames = {'oligo-state': 'oligo_state', 'from': 'unp_beg', 'to': 'unp_end'}
    total = 0
    # Only the empty string counts as missing, so an empty `ligand_chains`
    # cell yields with_ligand == False.
    for chunk in read_csv(input, sep='\t', usecols=source_columns, chunksize=chunksize,
                          keep_default_na=False, na_values=['']):
        chunk['with_ligand'] = chunk.ligand_chains.notnull()
        table = chunk.drop(columns=['ligand_chains'])
        table = table.rename(columns=column_renames)
        database.sync_insert(database.SMRModel, table.to_dict('records'))
        total += table.shape[0]
        # Running total of rows handed to the database so far.
        console.log(f'Done: {total}')


@Interface.command('fetch1pdb')
@click.option('-i', '--pdb', type=str, help="PDB Identifier")
@click.option('-a', '--api', type=str, help="API Name")
Expand Down
2 changes: 1 addition & 1 deletion pdb_profiling/fetcher/webfetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ async def http_download(cls, semaphore, method: str, info: Dict, path: str, rate
await asyncio.sleep(rate)
return path
elif resp.status in (204, 300, 400, 403, 404, 405, 406):
cls.logger.debug(f"204|300|400|403|404|405|406 for: {info}")
cls.logger.debug(f"{resp.status} for: {info}")
return None
else:
mes = "code={resp.status}, message={resp.reason}, headers={resp.headers}".format(resp=resp)
Expand Down
Loading

0 comments on commit 76c9de2

Please sign in to comment.