-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdbapi_*.py
166 lines (146 loc) · 8.08 KB
/
dbapi_*.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/python3
"""
---------Created by Manthos Pitoulias---------------
This API is performing 2 major tasks. 1) Conneting to db and fetching all the data
2) Converting the data to callable pandas fo the middle layer
Create locus_entries list for the middle layer to use in dataframe format
The format as of now 20/03/2021 is for the middle layer to call 8 pieces of
information:
0:Genebank_accession,
1:gene_identifier,
2:chromosomal_location,
3:protein_product_name,
4:coding_sequence_boundaries,
5:complete_DNA_sequence,
6:aminoacid_sequence,
7:is_reverse_copliment
This is the database API - it needs to access the MySQL database for real values.
"""
# Add the directory above to the module path to import the config file
#import sys
#sys.path.insert(0, "../")
import sys
sys.path.insert(0, "../db/")
sys.path.insert(0, "../")
#import dbapi # Import the database api
#import config # Import configuration information (if needed)
#import config # Import configuration information (e.g. database connection)
##################################### FILTERING FUNCTIONS ##############################################
def dframe_abnormalCDS_clean(df,column_name):
"""Accepts a dataframe as argument.
Some entries (very few) contain abnormal cds boundaries
such as (1-7|9) (one nucleotide exon?).
These entries are cleaned. However if the user wants to keep them, simply don't run the function"""
#indexes_cds_boundaries = []
column_name
for i in df.index:
x = df.loc[i,column_name]
if x.count('-') != x.count('|') + 1:
df = df[df.index != i]
else:
df = df
#indexes_cds_boundaries.append(i)
#print(indexes_cds_boundaries)
return(df)
###################################################################################
def dframe_non3exons_clean(df,column_name):
"""Accepts a dataframe.
CDS must be in number-number or number-number|number-number..... format
The combines length of exons should be a multiple of 3 to allow aminoacid conversion. Any other
value is perhaps the result of alternative splicing or some other biological modification which
we will not account for. The user can simply not-apply this function if he wishes to or change the CDS
format"""
import re
#indexes_triplet_exons = []
for i in df.index:
x = df.loc[i,column_name]
m1 = re.sub(r'(\d+)-(\d+)',lambda m: str(int(m.group(2)) - int(m.group(1)) + 1),x)
m2 = (re.findall(r'\d+',m1))
m2 = sum([int(ele) for ele in m2])
if m2 % 3 != 0:
df = df[df.index != i]
else:
df = df
#indexes_triplet_exons.append(i)
#print(indexes_triplet_exons))
return(df)
##################################### CONNECT TO DB ##############################################
import pymysql as mdb
def connect2db():
db_conn = mdb.connect(host ='****', user = '****', password = '****', db = '****')
cursor = db_conn.cursor()
return(cursor)
###################################### CALLABLE FUNCTIONS FOR BL #############################################
def get_all():
"""
This function will create a list of locus_entries and generate a pandas dataframe
out of it named df.
You can assign it to a vector with the form: x = get_all()
x will become the df dataframe
"""
import re
x = connect2db()
query = 'SELECT gbk_accession, gene_name, chrom_loc, protein_product, CDS, DNA_seq , translation, complement_status FROM genebank_entries;'
x.execute(query)
row = x.fetchall()
standard_bases = []
for ele in row:
atcg_only = re.search(r'[^ATCG]',ele[5])
if not atcg_only:
standard_bases.append(ele)
locus_entries = standard_bases
#locus_entries.append(['AC001564','ann3','southwest','anna dehydroxylase','20-52|80-202','CGCCCACTCTCCCATTTGTCCGCGCAAGCGGATGCGATGCGATTGCCCGCTAAGATATTCTTACGCGTAACGCAGCTGAGTATTCTACAGAGCTGGCGTACGCGTTGAACACTTCACGGATGATAGGGATTCGGGTAAAGAGCGCGTCATTGGGGGCTTATACAGGCGTAGACTACAACGGGCCCGGCTCAATCACAGCTCGAGCGCCTTGAATAACATACTCATCTCTATACATTCTCGACAGTCTATCGAGCGAGTCGATTACCAACGGGTGCGTTGCAGTTTTAATCTCTCGCCAGCACTGTAATAGCCACCAAGAGACTGATGATAGTCATGGGTGCTGAGCTGAG','QLSTHDYHQSLGGYYSAGERLKLQRTRW-STRSIDCRECIEMSMLFKALEL-LSRARCSLRLYKPPMTRSLPESLSSVKCSTRTPAL-NTQLRYA-EYLSGQSHRIRLRGQMGEWA',1])
#locus_entries.append(['AB000321','har2','center','harry kinase','19-88','ATTGTTATCTAGGTTGAGATAGTCATGTGCTCACGGAATTTATTGTATGAGTAGTGATTTGAAAGAATTGTTAGTTTGCTGGTTCAAGTAAAAATTCTTT','LLSRLR-SCAHGIYCMSSDLKELLVCWFK-KFF',1])
#locus_entries.append(['AB000123','man1','northeast','manthase','14-30|70-150','AATTCTTACATTTAAGACCCTAATATCACATCATTAGTGATTAATTGCCACTGCCAAAATTCTGTCCAGAAGCGTTTTAGTTCGCTCCACTAAAGTTGTTTAAAACGACTACCAAATCCGCATGTTAGGGGATTTCTTATTAATTCTTTTATCGTGAGGAACAGCGGATCTTAATGGATGGCCGCAGGTGGTATGGAAGCTAATAGCGCGGGTGAGAGGGTAATCAGCCGTGTTCACCTACACAACGCTAACGGGCGATTCTATAAGATTCCGCATTGCGTCTACTTATAAGATGTCTCAACGGTATCCGCAACTTGTGAAGTGCCTACTATCCTTAAACGCATATCTCGCCCAGTAGCTTCCCAATATGTGAGCATCA','NSYI-DPNITSLVINCHCQNSVQKRFSSLH-SCLKRLPNPHVRGFLINSFIVRNSGS-WMAAGGMEANSAGERVISRVHLHNANGRFYKIPHCVYL-DVSTVSATCEVPTILKRISRPVASQYVSI',0])
import pandas as pd
df = pd.DataFrame(locus_entries)
df.columns = ['Genbank_accession','gene_identifier','chromosomal_location','protein_product_name','coding_sequence_boundaries','complete_DNA_sequence','aminoacid_sequence','is_reverse_complement']
df = dframe_abnormalCDS_clean(df,'coding_sequence_boundaries')
df = dframe_non3exons_clean(df,'coding_sequence_boundaries')
df = df.reset_index(drop=True)
return(df)
###################################################################################
def get_summary(n,par = ''):
"""
Returns a data frame for the rows in the database that satisfy a certain condition. The columns have the following names and data types:
Genbank_accession: string
gene_identifier: string
chromosomal_location: string
protein_product_name: string
If it is called with paramter 0 it returns the above information for all the rows in the database. If it is called with parameter 1, 2, 3, and 4, it returns respectively the data that have a specific Genbank accession, gene identifier, chromosomal location, and protein product name.
"""
if n == 0:
df_slice = get_all().iloc[:,0:4]
return(df_slice)
if n ==1:
df_slice = get_all().iloc[:,0:4]
df_slice_ga = df_slice[df_slice['Genbank_accession'] == par]
return(df_slice_ga)
if n==2:
df_slice = get_all().iloc[:,0:4]
df_slice_gi = df_slice[df_slice['gene_identifier'] == par]
return(df_slice_gi)
if n==3:
df_slice = get_all().iloc[:,0:4]
df_slice_cl = df_slice[df_slice['chromosomal_location'] == par]
return(df_slice_cl)
if n==4:
df_slice = get_all().iloc[:,0:4]
df_slice_ppn = df_slice[df_slice['protein_product_name'] == par]
return(df_slice_ppn)
###################################################################################
def get_details(user_input):
""" returns a dictionary for a single row in the database. The row has the Genbank accession given as input to the function. The keys are the following and the values have the following data types:
complete_DNA_sequence: string
coding_sequence_boundaries: string (for example: "1-92|168-320")
aminoacid_sequence: string
is_reverse_complement: integer (value 0 or 1)
GENEBANK ACCESSION NUMBERS MUST BE UNIQUE FOR THE DICTIONARY TO WORK, otherwise turn to dataframe
"""
df_slice = get_all().iloc[:,[0,4,5,6,7]]
df_slice_get_det = df_slice[df_slice['Genbank_accession'] == user_input].iloc[:,[1,2,3,4]]
d = df_slice_get_det.to_dict('list')
d = {i:str(j[0]) for i,j in d.items() }
d = {k:int(v) if v.isdigit() else v for k,v in d.items()}
return(d)
###################################################################################