diff --git a/README.md b/README.md
index 26a449b..e43bdd1 100644
--- a/README.md
+++ b/README.md
@@ -58,9 +58,9 @@ Profiling Protein Structures from Protein Data Bank and integrate various resour
*
* NOTE: currently only support minimum use
* Download data from PDB Archive against unexpected needs
- * wwwPDB&RCSB:
+ * wwPDB&RCSB:
* EBI:
- * wwwPDB Versioned:
+ * wwPDB Versioned:
## Install
diff --git a/docs/figs/ToUniProt_ali.svg b/docs/figs/ToUniProt_ali.svg
new file mode 100644
index 0000000..6873d05
--- /dev/null
+++ b/docs/figs/ToUniProt_ali.svg
@@ -0,0 +1,72986 @@
+
+
+
+
\ No newline at end of file
diff --git a/examples/ToUniProt.ipynb b/examples/ToUniProt.ipynb
new file mode 100644
index 0000000..f59615b
--- /dev/null
+++ b/examples/ToUniProt.ipynb
@@ -0,0 +1,247 @@
+{
+ "metadata": {
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.1-final"
+ },
+ "orig_nbformat": 2,
+ "kernelspec": {
+ "name": "Python 3.7.1 64-bit ('base': conda)",
+ "display_name": "Python 3.7.1 64-bit ('base': conda)",
+ "metadata": {
+ "interpreter": {
+ "hash": "2266c607543d224cb119288ea55888d6fda87cc9a4c78c02ed099d39082a76ce"
+ }
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2,
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": "Wall time: 4.99 ms\nWall time: 2.47 s\nWall time: 0 ns\nWall time: 82.1 ms\n"
+ }
+ ],
+ "source": [
+ "%time from pdb_profiling import default_config\n",
+ "%time from pdb_profiling.processors import Identifier, UniProtFASTA\n",
+ "%time from pdb_profiling.utils import a_seq_reader\n",
+ "\n",
+ "%time default_config('C:/GitWorks/pdb-profiling/test/demo')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": ""
+ },
+ "metadata": {},
+ "execution_count": 2
+ }
+ ],
+ "source": [
+ "demo = Identifier('NP_001291289.1')\n",
+ "demo"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": "('Q9C0B2', None)"
+ },
+ "metadata": {},
+ "execution_count": 3
+ }
+ ],
+ "source": [
+ "entry, isoform = demo.map2unp().result()\n",
+ "entry, isoform"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": "{'protein': 'NP_001291289.1', 'transcript': 'NM_001304360.1', 'gene': None}"
+ },
+ "metadata": {},
+ "execution_count": 4
+ }
+ ],
+ "source": [
+ "demo.get_all_level_identifiers().result()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": "('>NP_001291289.1 cilia- and flagella-associated protein 74 [Homo sapiens]',\n 'MEDDGSLLPEDELLADALLLEDERDELEDPEFDIKCLLQEAEDDVDPGHSSSVKELDTDADKLKKKTAEDRTQAFHLRQNLSALDKMHEEQELFTEKMRGELRACRQRRDLIDKQQEAVAAEIATEEEAGNMAAVGRLQAVSRRLFAELENERDLQSRTEAVLKESENTMWHIEIQEGRLEAFRTADREEVEATGRRLQVRAAEQLCREQEALGKVERNRLLRIRKSLNTQKELGLRHQKLLEDARKNHKVAVRFLKASLGRIREQEKKEEMECHEYMRRRMDAVVALKGSISANRDTLRKFQAWDRAKAELAEQRVQAEKKAILAQGRDAFRHLVHQRRRQELEAQKRAFEEEQKLRKQEIISRILKEEAEEEKRKKQHPPTSARHRLTLRDKTWNYISDFCKKTTVPTNTYTLDYEAAAGPGPSRLLEVVSSELIQGDPGASSEEETLAEPEISGLWNEDYKPYQVPKEDVDRKPVGGTKMDKDILERTVERLRSRVVHKQVVWGREFQGRPFNSKPELLHFQDFDIGKVYKKKITLVNTTYTINYCKLVGVEEHLRDFIHVDFDPPGPLSAGMSCEVLVTFKPMINKDLEGNISFLAQTGEFSVPLKCSTKKCSLSLDKELIDFGSYVVGETTSRTITLTNVGGLGTTFKFLPASEPCEMDDSQSALKLSSLLTYEDKSLYDKAATSFSEQQLEGTESSQADMQSRKELEKLDKEQEEEQPAEPERLTTVIPPSEEQTEITLGEVTEGEIGPFSSIKVPIVFTPVVPGDVQARFKVTFKNPQCPTLHFRVVGVAIDVPVWVPKPSVDLKICMYDRLYQDSVLVHTRSKAALRLKFEVCKELRAHLELLPKTGYIQAQSSYSVQLKFLPRHSLPEDAGRYFDKETRVLEAPMTIWVADQNKPVGFTVHAIVTTSDLELSPSEVDFGYCTIYEAIRTEISLHNHSLLPQEFGFVRLPKFVDVQPNDGFGTILPLETLQFCVIFQPTKAEEHRFQLTCKSEINRCFKLSCRAVGVHPPLELSHYQIKFAATALYDTSVATVYVINSHLSMSSPTHSKPRIGSEDASPMGPTSFEFLLPPDSPITISPSVGTVWPGKRCLVQVAFRPVLPEKLIRQEALPLLNKEMETKSFRKNMAPQRKDLHGLSFSVLRAQNRDKLFKVSVPHVLEMRKRELRPSSDEYQAARATLLRAFQAKFDTFVVPCVVASGDIKDRKGSEPLSFSPHNTLYLELWCPTVAPSVVVTSHKGKTIFNFGDVAVGHRSIKKISIQNVSPEDLALDFSLLNPNGPFVLLNHSSLLRAGGTQVLVLSFSPHESILAQETLDIITKRGTLTLTLMGTGVASMITCSIEGSVLNMGYVIAGESVSSGFKLQNNSLLPIKFSMHLDSLSSTRGRGQQQLPQFLSSPSQRTEVVGTQNLNGQSVFSVAPVKGVMDPGKTQDFTVTFSPDHESLYFSDKLQVVLFEKKISHQILLKGAACQHMMFVEGGDPLDVPVESLTAIPVFDPRHREASSRPGPLSPEAEELRPILVTLDYIQFDTDTPAPPATRELQVGCIRTTQPSPKKTVEFSIDSVASLQHKGFSIEPSRGSVERGQTKTISISWVPPADFDPDHPLMVSALLQLRGDVKETYKVIFVAQVLTGP')"
+ },
+ "metadata": {},
+ "execution_count": 5
+ }
+ ],
+ "source": [
+ "'''\n",
+ "Identifier(\n",
+ " demo.get_all_level_identifiers().result()['protein']\n",
+ ").fetch_sequence().result()\n",
+ "'''\n",
+ "np_header, np_seq = demo.fetch_sequence().result()\n",
+ "np_header, np_seq"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": "True"
+ },
+ "metadata": {},
+ "execution_count": 6
+ }
+ ],
+ "source": [
+ "demo.status"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": "('>sp|Q9C0B2|CFA74_HUMAN Cilia- and flagella-associated protein 74 OS=Homo sapiens OX=9606 GN=CFAP74 PE=2 SV=3',\n 'MEDDGSLLPEDELLADALLLEDERDELEDPEFDIKCLLQEAEDDVDPGHSSSVKELDTDADKLKKKTAEDRTQAFHLRQNLSALDKMHEEQELFTEKMRGELRACRQRRDLIDKQQEAVAAEIATEEEAGNMAAVGRLQAVSRRLFAELENERDLQSRTEAVLKESENTMWHIEIQEGRLEAFRTADREEVEATGRRLQVRAAEQLCREQEALGKVERNRLLRIRKSLNTQKELGLRHQKLLEDARKNHKVAVRFLKASLGRIREQEKKEEMECHEYMRRRMDAVVALKGSISANRDTLRKFQAWDRAKAELAEQRVQAEKKAILAQGRDAFRHLVHQRRRQELEAQKRAFEEEQKLRKQEIISRILKEEAEEEKRKKQHPPTSARHRLTLRDKTWNYISDFCKKTTVPTNTYTLDYEAAAGPGPSRLLEVVSSELIQGDPGASSEEETLAEPEISGLWNEDYKPYQVPKEDVDRKPVGGTKMDKDILERTVERLRSRVVHKQVVWGREFQGRPFNSKPELLHFQDFDIGKVYKKKITLVNTTYTINYCKLVGVEEHLRDFIHVDFDPPGPLSAGMSCEVLVTFKPMINKDLEGNISFLAQTGEFSVPLKCSTKKCSLSLDKELIDFGSYVVGETTSRTITLTNVGGLGTTFKFLPASEPCEMDDSQSALKLSSLLTYEDKSLYDKAATSFSEQQLEGTESSQADMQSRKELEKLDKEQEEEQPAEPERLTTVIPPSEEQTEITLGEVTEGEIGPFSSIKVPIVFTPVVPGDVQARFKVTFKNPQCPTLHFRVVGVAIDVPVWVPKPSVDLKICMYDRLYQDSVLVHTRSKAALRLKFEVCKELRAHLELLPKTGYIQAQSSYSVQLKFLPRHSLPEDAGRYFDKETRVLEAPMTIWVADQNKPVGFTVHAIVTTSDLELSPSEVDFGYCTIYEAIRTEISLHNHSLLPQEFGFVRLPKFVDVQPNDGFGTILPLETLQFCVIFQPTKAEEHRFQLTCKSEINRCFKLSCRAVGVHPPLELSHYQIKFAATALYDTSVATVYVINSHLSMSSPTHSKPRIGSEDASPMGPTSFEFLLPPDSPITISPSVGTVWPGKRCLVQVAFRPVLPEKLIRQEALPLLNKEMETKSFRKNMAPQRKDLHGLSFSVLRAQNRDKLFKVSVPHVLEMRKRELRPSSDEYQAARATLLRAFQAKFDTFVVPCVVASGDIKDRKGSEPLSFSPHNTLYLELWCPTVAPSVVVTSHKGKTIFNFGDVAVGHRSIKKISIQNVSPEDLALDFSLLNPNGPFVLLNHSSLLRAGGTQVLVLSFSPHESILAQETLDIITKRGTLTLTLMGTGVASMITCSIEGSVLNMGYVIAGESVSSGFKLQNNSLLPIKFSMHLDSLSSTRGRGQQQLPQFLSSPSQRTEVVGTQNLNGQSVFSVAPVKGVMDPGKTQDFTVTFSPDHESLYFSDKLQVVLFEKKISHQILLKGAACQHMMFVEGGDPLDVPVESLTAIPVFDPRHREEAEELRPILVTLDYIQFDTDTPAPPATRELQVGCIRTTQPSPKKPDHPLMVSALLQLRGDVKETYKVIFVAQVLTGP')"
+ },
+ "metadata": {},
+ "execution_count": 7
+ }
+ ],
+ "source": [
+ "unp_header, unp_seq = UniProtFASTA.single_retrieve(entry, UniProtFASTA.folder, UniProtFASTA.web_semaphore).then(a_seq_reader).result()\n",
+ "unp_header, unp_seq"
+ ]
+ },
+ {
+ "source": [
+ "```py\n",
+ "from dtaidistance import alignment\n",
+ "%time value, matrix = alignment.needleman_wunsch(unp_seq, np_seq)\n",
+ "algn, s1a, s2a = alignment.best_alignment(matrix, unp_seq, np_seq, gap='-')\n",
+ "print(''.join(s1a[1500:]))\n",
+ "print(''.join(s2a[1500:]))\n",
+ "\n",
+ "import seaborn as sns\n",
+ "import matplotlib.pyplot as plt\n",
+ "plt.style.use('ggplot')\n",
+ "plt.figure(figsize=(10,8))\n",
+ "sns.heatmap(matrix[1500:,1500:],cmap='icefire')\n",
+ "plt.show()\n",
+ "```\n",
+ "\n",
+ "![ToUniProt_ali](../docs/figs/ToUniProt_ali.svg)"
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": "Wall time: 0 ns\nWall time: 20.9 ms\nWall time: 0 ns\n"
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": "(((0, 1507), (1507, 1551), (1551, 1552), (1552, 1553), (1553, 1584)),\n ((0, 1507), (1517, 1561), (1581, 1582), (1603, 1604), (1608, 1639)))"
+ },
+ "metadata": {},
+ "execution_count": 8
+ }
+ ],
+ "source": [
+ "from Bio import Align\n",
+ "%time aligner = Align.PairwiseAligner()\n",
+ "%time alignments = aligner.align(unp_seq, np_seq)\n",
+ "%time alignments[0].aligned"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": "FDPRHRE----------EAEELRPILVTLDYIQFDTDTPAPPATRELQVGCIRTTQPSPKK--------------------P---------------------D----HPLMVSALLQLRGDVKETYKVIFVAQVLTGP\n|||||||----------||||||||||||||||||||||||||||||||||||||||||||--------------------|---------------------|----|||||||||||||||||||||||||||||||\nFDPRHREASSRPGPLSPEAEELRPILVTLDYIQFDTDTPAPPATRELQVGCIRTTQPSPKKTVEFSIDSVASLQHKGFSIEPSRGSVERGQTKTISISWVPPADFDPDHPLMVSALLQLRGDVKETYKVIFVAQVLTGP\n\n"
+ }
+ ],
+ "source": [
+ "for i in str(alignments[0]).split('\\n'):\n",
+ " print(i[1500:])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ]
+}
\ No newline at end of file
diff --git a/pdb_profiling/__init__.py b/pdb_profiling/__init__.py
index 34d878a..b27a3a6 100644
--- a/pdb_profiling/__init__.py
+++ b/pdb_profiling/__init__.py
@@ -6,7 +6,7 @@
# @Copyright (c) 2020 MinghuiGroup, Soochow University
from re import compile as re_compile
-__version__ = '0.1.6'
+__version__ = '0.1.7'
common_pat = r'^(?=.*[A-Za-z])(?=.*\d)[A-Za-z\d]'
@@ -34,6 +34,7 @@ def default_config(folder='./'):
from pdb_profiling.processors.pdbe.record import Base
from pdb_profiling.processors.pdbe.api import ProcessPDBe
from pdb_profiling.processors.proteins.record import Identifier
+ from pdb_profiling.processors import UniProtFASTA
# Use Existing Handled PDBe API Results (e.g. tsv format results)
ProcessPDBe.use_existing = True
# Use Existing API Results (e.g. json format results downloaded from web)
@@ -43,6 +44,8 @@ def default_config(folder='./'):
# Set WebFetcher's Semaphore
Base.set_web_semaphore(30).result()
Identifier.set_web_semaphore(30).result()
+ UniProtFASTA.set_web_semaphore(30).result()
# Set Folder that store downloaded and handled files
Base.set_folder(folder)
Identifier.set_folder(folder)
+ UniProtFASTA.set_folder(folder)
diff --git a/pdb_profiling/processors/eutils/api.py b/pdb_profiling/processors/eutils/api.py
index 676e79b..4d39377 100644
--- a/pdb_profiling/processors/eutils/api.py
+++ b/pdb_profiling/processors/eutils/api.py
@@ -22,7 +22,7 @@ class EutilsAPI(Abclog):
'''
headers = {"Content-Type": "text/plain"}
api_set = frozenset(('efetch.fcgi', 'einfo.fcgi', 'esearch.fcgi',
- 'epost.fcgi', 'esummary.fcgi'))
+ 'epost.fcgi', 'esummary.fcgi', 'egquery.fcgi'))
@classmethod
def dumpsParams(cls, params: Dict) -> str:
@@ -34,7 +34,7 @@ def task_unit(cls, suffix: str, params: Dict, folder: Path) -> Tuple:
url=f'{BASE_URL}{suffix}',
headers=cls.headers,
params=params)
- return 'get', args, folder/f'{cls.dumpsParams(params)}.{params.get("retmode", params.get("rettype", "txt"))}'
+ return 'get', args, folder/f'{cls.dumpsParams(params)}.{params.get("retmode", params.get("rettype", "xml"))}'
@classmethod
def yieldTasks(cls, suffix: str, params_collection: Iterable[Dict], folder: Path) -> Generator:
diff --git a/pdb_profiling/processors/pdbe/api.py b/pdb_profiling/processors/pdbe/api.py
index 3d47164..ad471b7 100644
--- a/pdb_profiling/processors/pdbe/api.py
+++ b/pdb_profiling/processors/pdbe/api.py
@@ -115,8 +115,8 @@ def __init__(self, name_group):
def output(self):
+ '''
+ Return the pair `(pdb_range, unp_range)` for this reader.
+
+ When `self.pdb_range` is truthy, both `self.pdb_range` and `self.unp_range`
+ are serialized with `json.dumps` and decoded to text; otherwise the
+ `self.default_pdb_range` / `self.default_unp_range` values are returned as-is.
+ '''
if self.pdb_range:
- pdb_range = json.dumps(self.pdb_range)
- unp_range = json.dumps(self.unp_range)
+ # NOTE(review): `.decode('utf-8')` only works if `json.dumps` returns bytes --
+ # presumably `json` is bound to a bytes-returning serializer (e.g. orjson); confirm
+ pdb_range = json.dumps(self.pdb_range).decode('utf-8')
+ unp_range = json.dumps(self.unp_range).decode('utf-8')
return pdb_range, unp_range
else:
return self.default_pdb_range, self.default_unp_range
@@ -261,12 +261,13 @@ def related_UNP_PDB(cls, filePath: Union[str, Path], related_unp: Optional[Itera
return set(pdb_list), set(dfrm['SP_PRIMARY'])
@classmethod
- def reformat(cls, path: str) -> pd.DataFrame:
- dfrm = pd.read_csv(path, sep='\t', converters=cls.converters)
+ def reformat(cls, path: Optional[str]=None, dfrm:Optional[pd.DataFrame]=None) -> pd.DataFrame:
+ if path is not None:
+ dfrm = pd.read_csv(path, sep='\t', converters=cls.converters)
group_info_col = ['pdb_id', 'chain_id', 'UniProt']
range_info_col = ['pdb_start', 'pdb_end', 'unp_start', 'unp_end']
reader = SeqRangeReader(group_info_col)
- dfrm[['sifts_pdb_range', 'sifts_unp_range']] = pd.DataFrame(dfrm.apply(
+ dfrm[['pdb_range', 'unp_range']] = pd.DataFrame(dfrm.apply(
lambda x: reader.check(tuple(x[i] for i in group_info_col), tuple(
x[i] for i in range_info_col)),
axis=1).values.tolist(), index=dfrm.index)
@@ -276,52 +277,72 @@ def reformat(cls, path: str) -> pd.DataFrame:
return dfrm
@staticmethod
- def dealWithInDe(dfrm: pd.DataFrame) -> pd.DataFrame:
+ def sort_2_range(unp_range: List, pdb_range: List):
+     '''
+     Re-order two parallel segment lists by the start of each UniProt segment.
+
+     The i-th items of `unp_range` and `pdb_range` are paired, the pairs are
+     sorted (stably) on the first element of the UniProt segment, and the two
+     re-ordered sequences are returned as tuples `(unp_range, pdb_range)`.
+     '''
+     paired = sorted(zip(unp_range, pdb_range), key=lambda pair: pair[0][0])
+     sorted_unp, sorted_pdb = zip(*paired)
+     return sorted_unp, sorted_pdb
+
+ @classmethod
+ def dealWithInDel(cls, dfrm: pd.DataFrame, sort_by_unp:bool=True) -> 'Tuple[pd.DataFrame, pd.DataFrame]':
+ '''
+ Tag every row of `dfrm` according to how its PDB and UniProt segment ranges differ.
+
+ The `pdb_range` / `unp_range` columns are read as JSON text, decoded, optionally
+ re-ordered by UniProt segment start (`sort_by_unp`, only for rows with more than
+ one segment), analysed, and finally written back as JSON text.
+
+ Columns added to `dfrm` (in place):
+ * `sifts_range_tag`: one of 'Safe', 'Deletion', 'Insertion (Specail Case)' (sic),
+ 'Insertion', 'Insertion & Deletion'
+ * `repeated`: a segment is longer on the PDB side (outside the single-segment
+ special case) or two consecutive UniProt segments overlap (negative gap)
+ * `reversed`: two consecutive PDB segments overlap or go backwards (negative gap)
+
+ Returns a 2-tuple: `dfrm` without the temporary columns, and a frame holding
+ only those temporary columns (`temp_cols`).
+ '''
def get_gap_list(li: List):
+ # distance between consecutive segments: next start - previous end - 1
return [li[i+1][0] - li[i][1] - 1 for i in range(len(li)-1)]
def get_range_diff(lyst_a: List, lyst_b: List):
- array_a = np.array([ran[1] - ran[0] + 1 for ran in lyst_a])
- array_b = np.array([ran[1] - ran[0] + 1 for ran in lyst_b])
- return (array_a - array_b).tolist()
+ # per-segment length of lyst_a minus per-segment length of lyst_b (numpy array)
+ array_a = np.array([right - left + 1 for left, right in lyst_a])
+ array_b = np.array([right - left + 1 for left, right in lyst_b])
+ return array_a - array_b
def add_tage_to_range(df: pd.DataFrame, tage_name: str):
# ADD TAGE FOR SIFTS
df[tage_name] = 'Safe'
# No Insertion But Deletion[Pure Deletion]
df.loc[df[(df['group_info'] == 1) & (
- df['sifts_unp_pdb_var'] > 0)].index, tage_name] = 'Deletion'
+ df['diff+'] > 0)].index, tage_name] = 'Deletion'
# Insertion & No Deletion
+ # single-segment row whose PDB segment is longer than its UniProt segment
+ # NOTE(review): 'Specail' is a misspelling of 'Special'; it is kept because the same
+ # literal is compared again when `repeated` is computed and is emitted as data
df.loc[df[
- (df['group_info'] != 1) &
- (df['var_0_count'] == df['group_info']) &
- (df['unp_GAP_0_count'] == (df['group_info'] - 1))].index, tage_name] = 'Insertion'
+ (df['group_info'] == 1) &
+ (df['diff-'] > 0)].index, tage_name] = 'Insertion (Specail Case)'
+ df.loc[df[
+ (df['group_info'] > 1) &
+ (df['diff0'] == df['group_info']) &
+ (df['unp_gaps0'] == (df['group_info'] - 1))].index, tage_name] = 'Insertion'
# Insertion & Deletion
df.loc[df[
- (df['group_info'] != 1) &
- ((df['var_0_count'] != df['group_info']) |
- (df['unp_GAP_0_count'] != (df['group_info'] - 1)))].index, tage_name] = 'Insertion & Deletion'
-
- dfrm['pdb_GAP_list'] = dfrm.apply(lambda x: json.dumps(
- get_gap_list(json.loads(x['sifts_pdb_range']))), axis=1)
- dfrm['unp_GAP_list'] = dfrm.apply(lambda x: json.dumps(
- get_gap_list(json.loads(x['sifts_unp_range']))), axis=1)
- dfrm['var_list'] = dfrm.apply(lambda x: json.dumps(get_range_diff(
- json.loads(x['sifts_unp_range']), json.loads(x['sifts_pdb_range']))), axis=1)
- dfrm['delete'] = dfrm.apply(
- lambda x: '-' in x['var_list'], axis=1)
- dfrm['delete'] = dfrm.apply(
- lambda x: True if '-' in x['unp_GAP_list'] else x['delete'], axis=1)
- dfrm['var_0_count'] = dfrm.apply(
- lambda x: json.loads(x['var_list']).count(0), axis=1)
- dfrm['unp_GAP_0_count'] = dfrm.apply(
- lambda x: json.loads(x['unp_GAP_list']).count(0), axis=1)
+ (df['group_info'] > 1) &
+ ((df['diff0'] != df['group_info']) |
+ (df['unp_gaps0'] != (df['group_info'] - 1)))].index, tage_name] = 'Insertion & Deletion'
+
+ # decode the JSON-encoded range columns so they can be analysed as Python lists
+ dfrm.pdb_range = dfrm.pdb_range.apply(json.loads)
+ dfrm.unp_range = dfrm.unp_range.apply(json.loads)
+ # group_info = number of segments in the row's pdb_range
dfrm['group_info'] = dfrm.apply(lambda x: len(
- json.loads(x['sifts_pdb_range'])), axis=1)
- dfrm['sifts_unp_pdb_var'] = dfrm.apply(
- lambda x: json.loads(x['var_list'])[0], axis=1)
- add_tage_to_range(dfrm, tage_name='sifts_range_tage')
- return dfrm
+ x['pdb_range']), axis=1)
+
+ # only rows with more than one segment can be out of order
+ focus_index = dfrm[dfrm.group_info.gt(1)].index
+ if sort_by_unp and (len(focus_index) > 0):
+ focus_df = dfrm.loc[focus_index].apply(lambda x: cls.sort_2_range(
+ x['unp_range'], x['pdb_range']), axis=1, result_type='expand')
+ focus_df.index = focus_index
+ focus_df.columns = ['unp_range', 'pdb_range']
+ dfrm.loc[focus_index, ['unp_range', 'pdb_range']] = focus_df
+
+ # temporary per-row measurements that the tagging rules read
+ dfrm['pdb_gaps'] = dfrm.pdb_range.apply(get_gap_list)
+ dfrm['unp_gaps'] = dfrm.unp_range.apply(get_gap_list)
+ dfrm['range_diff'] = dfrm.apply(lambda x: get_range_diff(x['unp_range'], x['pdb_range']), axis=1)
+ dfrm['diff0'] = dfrm.range_diff.apply(lambda x: np.count_nonzero(x == 0))
+ dfrm['diff+'] = dfrm.range_diff.apply(lambda x: np.count_nonzero(x > 0))
+ dfrm['diff-'] = dfrm.range_diff.apply(lambda x: np.count_nonzero(x < 0))
+ dfrm['unp_gaps0'] = dfrm.unp_gaps.apply(lambda x: x.count(0))
+ add_tage_to_range(dfrm, tage_name='sifts_range_tag')
+ # repeated: some PDB segment is longer than its UniProt segment (special case excluded) ...
+ dfrm['repeated'] = dfrm.apply(
+ lambda x: x['diff-'] > 0 and x['sifts_range_tag'] != 'Insertion (Specail Case)', axis=1)
+ # ... or consecutive UniProt segments overlap (negative gap)
+ dfrm['repeated'] = dfrm.apply(
+ lambda x: True if any(i < 0 for i in x['unp_gaps']) else x['repeated'], axis=1)
+ dfrm['reversed'] = dfrm.pdb_gaps.apply(lambda x: any(i < 0 for i in x))
+ # serialize the ranges back to text
+ # NOTE(review): `.decode('utf-8')` requires `json.dumps` to return bytes (e.g. orjson) -- confirm
+ dfrm.pdb_range = dfrm.pdb_range.apply(lambda x: json.dumps(x).decode('utf-8'))
+ dfrm.unp_range = dfrm.unp_range.apply(lambda x: json.dumps(x).decode('utf-8'))
+ # NOTE(review): 'start' and 'end' are not created in this method -- presumably they
+ # already exist on the incoming frame; `drop` raises KeyError otherwise. Confirm.
+ temp_cols = ['start', 'end', 'group_info', 'pdb_gaps', 'unp_gaps', 'range_diff',
+ 'diff0', 'diff+', 'diff-', 'unp_gaps0']
+ return dfrm.drop(columns=temp_cols), dfrm[temp_cols]
'''
@staticmethod
@@ -761,7 +782,7 @@ class PDBArchive(Abclog):
'''
Download files from PDB Archive
- * wwwPDB/RCSB: PDB_ARCHIVE_URL_WWPDB: str = 'https://ftp.wwpdb.org/pub/pdb/data/structures/'
+ * wwPDB/RCSB: PDB_ARCHIVE_URL_WWPDB: str = 'https://ftp.wwpdb.org/pub/pdb/data/structures/'
* EBI: PDB_ARCHIVE_URL_EBI: str = 'http://ftp.ebi.ac.uk/pub/databases/pdb/data/structures/'
'''
root = PDB_ARCHIVE_URL_EBI
@@ -801,7 +822,7 @@ class PDBVersioned(PDBArchive):
'''
Download files from PDB Versioned
- * wwwPDB Versioned: PDB_ARCHIVE_VERSIONED_URL: str = 'http://ftp-versioned.wwpdb.org/pdb_versioned/data/entries/'
+ * wwPDB Versioned: PDB_ARCHIVE_VERSIONED_URL: str = 'http://ftp-versioned.wwpdb.org/pdb_versioned/data/entries/'
>>> PDBVersioned.single_retrieve(
('2wmg', '_v1-2'), 'entries/',
diff --git a/pdb_profiling/processors/proteins/record.py b/pdb_profiling/processors/proteins/record.py
index 285e06d..5c0c324 100644
--- a/pdb_profiling/processors/proteins/record.py
+++ b/pdb_profiling/processors/proteins/record.py
@@ -73,8 +73,7 @@ def __init__(self, identifier: str, folder: Optional[Union[Path, str]] = None):
except AttributeError:
raise AttributeError(
"Please specify class variable `folder` via set_folder() first or pass `folder` in this method!")
- self.ensembl_status = None
- self.refseq_status = None
+ self.status = None
def __repr__(self):
return f'<{self.source} {self.level} {self.identifier} {self.version}>'
@@ -89,9 +88,9 @@ async def set_status(self):
self.ensembl_api_web_semaphore,
headers={'Content-Type': 'application/json'})
if res is None:
- self.ensembl_status = False
+ self.status = False
else:
- self.ensembl_status = await a_load_json(res)
+ self.status = await a_load_json(res)
@unsync
async def fetch_from_ProteinsAPI(self):
@@ -116,7 +115,17 @@ async def fetch_from_ProteinsAPI(self):
f"Can't find dbReference with {self.identifier}")
@unsync
- async def map2unp(self):
+ async def get_all_level_identifiers(self):
+ '''
+ Fetch the `protein`, `transcript` and `gene` columns of the `dbReferences` row
+ whose `type` equals `self.source` and whose `self.level` column equals
+ `self.raw_identifier`.
+
+ Returns a dict keyed by 'protein', 'transcript' and 'gene', or `None` when
+ building that dict raises `TypeError`.
+ '''
+ # NOTE(review): the SQL text is assembled by f-string interpolation of self.source,
+ # self.level and self.raw_identifier; if these can carry untrusted input, the two
+ # values should be passed as bound parameters and self.level checked against a whitelist
+ try:
+ return dict(zip(('protein', 'transcript', 'gene'), await self.sqlite_api.database.fetch_one(
+ query=f"""
+ SELECT protein,transcript,gene FROM dbReferences
+ WHERE type == '{self.source}' AND {self.level} == '{self.raw_identifier}'""")))
+ except TypeError:
+ # presumably reached when fetch_one() finds no row and yields None, which zip() rejects -- confirm
+ return
+
+ @unsync
+ async def map2unp_from_localDB(self):
try:
entry, isoform = await self.sqlite_api.database.fetch_one(
query=f"""
@@ -149,19 +158,21 @@ async def fetch_sequence(self, newest: bool = True):
self.seq_folder['RefSeq'],
self.eutils_api_web_semaphore)
if res is not None:
+ self.status = True
return await a_seq_reader(res)
else:
+ self.status = False
self.logger.warning(f'Invalid Identifier!')
elif self.source == 'Ensembl':
- if self.ensembl_status is None:
+ if self.status is None:
await self.set_status()
- if self.ensembl_status is False:
+ if self.status is False:
self.logger.warning(f'Invalid Identifier!')
return
- elif self.ensembl_status['is_current'] != '1':
+ elif self.status['is_current'] != '1':
self.logger.warning(
- f'Not exists in current archive: \n{self.ensembl_status}')
+ f'Not exists in current archive: \n{self.status}')
return
if not newest:
self.logger.warning(
@@ -171,3 +182,11 @@ async def fetch_sequence(self, newest: bool = True):
dict(type='protein'),
self.seq_folder['Ensembl'],
self.ensembl_api_web_semaphore).then(a_seq_reader)
+
+ @unsync
+ async def map2unp(self):
+     '''
+     Resolve this identifier to its UniProt mapping.
+
+     The local lookup (`map2unp_from_localDB`) is tried first. If it yields
+     `None`, `fetch_from_ProteinsAPI` is awaited and the local lookup is
+     repeated once; the result of that second lookup (possibly still `None`)
+     is returned.
+     '''
+     mapping = await self.map2unp_from_localDB()
+     if mapping is not None:
+         return mapping
+     # NOTE(review): presumably the fetch stores new records in the local DB -- confirm
+     await self.fetch_from_ProteinsAPI()
+     return await self.map2unp_from_localDB()
diff --git a/pdb_profiling/processors/uniprot/api.py b/pdb_profiling/processors/uniprot/api.py
index aa0f606..13a0baa 100644
--- a/pdb_profiling/processors/uniprot/api.py
+++ b/pdb_profiling/processors/uniprot/api.py
@@ -18,6 +18,7 @@
from pdb_profiling.log import Abclog
from pdb_profiling.fetcher.webfetch import UnsyncFetch
from pdb_profiling.processors.uniprot.process import ExtractIsoAlt
+from pdb_profiling.utils import init_semaphore, init_folder_from_suffix
QUERY_COLUMNS: List[str] = [
@@ -385,6 +386,15 @@ class UniProtFASTA(Abclog):
params = {'include': 'yes'}
obj = {}
+ @classmethod
+ @unsync
+ async def set_web_semaphore(cls, web_semaphore_value:int):
+     '''
+     Set the class-level `web_semaphore`.
+
+     Awaits `init_semaphore(web_semaphore_value)` and stores the result on
+     `cls.web_semaphore`.
+     '''
+     semaphore = await init_semaphore(web_semaphore_value)
+     cls.web_semaphore = semaphore
+
+ @classmethod
+ def set_folder(cls, folder: Union[Path, str]):
+     '''
+     Set the class-level `folder`.
+
+     Stores the result of `init_folder_from_suffix(folder, 'UniProt/fasta/')`
+     on `cls.folder`.
+     '''
+     fasta_folder = init_folder_from_suffix(folder, 'UniProt/fasta/')
+     cls.folder = fasta_folder
+
@classmethod
@unsync
async def process(cls, path: Union[str, Path, Unfuture]):