forked from MOZI-AI/knowledge-import
-
Notifications
You must be signed in to change notification settings - Fork 0
/
biogrid_gene2uniprot.py
executable file
·75 lines (66 loc) · 3.01 KB
/
biogrid_gene2uniprot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
__author__ = "Hedra"
__email__ = "[email protected]"
# The following script imports the biogrid_genes mapped to their coding uniprots through biogrid_id
# Requires: uniprot to biogrid_id mapping file uniprot2biogrid.csv and
# Biogrid gene to biogrid_id mapping file gene2biogrid.csv (run biogrid_genes.py to get this file)
import os
from datetime import date
import pandas as pd
import metadata
# URL of this script; to_atomese() passes it to metadata.update_meta() when recording the import.
script = "https://github.com/MOZI-AI/knowledge-import/biogrid_gene2uniprot.py"
def to_atomese(data):
    """Write gene/protein/BioGRID-id relations from *data* as Atomese links.

    Rows of *data* containing any missing value are dropped. For every
    remaining row with a non-empty gene symbol and UniProt accession, three
    EvaluationLinks are appended to ``dataset/biogridgene2uniprot_<today>.scm``:
    gene "expresses" protein, protein "has_biogridID" id, and gene
    "has_biogridID" id. Afterwards the number of distinct genes and proteins
    is reported through ``metadata.update_meta``.

    Args:
        data: pandas DataFrame with the columns ``gene_symbol``,
            ``biogrid_id`` and ``uniprot``.
    """
    print("importing the data")
    df = data.dropna()
    # Sets give O(1) membership checks; only the distinct counts are needed.
    genes = set()
    proteins = set()
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs('dataset', exist_ok=True)
    output_file = "dataset/biogridgene2uniprot_{}.scm".format(str(date.today()))
    template = (
        '(EvaluationLink \n'
        '\t(PredicateNode "expresses")\n'
        '\t(ListLink \n'
        '\t\t(GeneNode "{gene}")\n'
        '\t\t(MoleculeNode "Uniprot:{prot}")))\n\n'
        '(EvaluationLink \n'
        '\t(PredicateNode "has_biogridID")\n'
        '\t(ListLink \n'
        '\t\t(MoleculeNode "Uniprot:{prot}")\n'
        '\t\t(ConceptNode "Bio:{biogrid_id}")))\n\n'
        '(EvaluationLink \n'
        '\t(PredicateNode "has_biogridID")\n'
        '\t(ListLink \n'
        '\t\t(GeneNode "{gene}")\n'
        '\t\t(ConceptNode "Bio:{biogrid_id}")))\n\n'
    )
    with open(output_file, 'w') as f:
        # One row Series per iteration instead of three df.iloc[i] lookups.
        for _, row in df.iterrows():
            gene = row['gene_symbol'].upper().strip()
            biogrid_id = str(row['biogrid_id'])
            prot = row['uniprot'].strip()
            if gene and biogrid_id and prot:
                genes.add(gene)
                proteins.add(prot)
                f.write(template.format(gene=gene, prot=prot, biogrid_id=biogrid_id))
    # NOTE(review): genes is passed as a string but prot as an int, exactly as
    # before; confirm which type update_meta expects.
    metadata.update_meta("Biogrid-Gene2uniprot:latest",
                         "uniprot2biogrid.csv, gene2biogrid.csv", script,
                         genes=str(len(genes)), prot=len(proteins))
    print("Done, check {}".format(output_file))
if __name__ == "__main__":
    # Requires: uniprot to biogrid_id mapping file uniprot2biogrid.csv and
    # Biogrid gene symbol to biogrid_id mapping file gene2biogrid.csv
    # (run biogrid_genes2id.py to get gene2biogrid.csv)
    print("imports the biogrid_genes mapped to their coding uniprots though biogrid_id\n")
    try:
        bio = pd.read_csv("raw_data/gene2biogrid.csv", sep="\t")
        uniprot = pd.read_csv("raw_data/uniprot2biogrid.csv", sep=",")
    except Exception as e:
        # Without both input tables nothing can be imported; stop here instead
        # of failing later with an unrelated NameError.
        print(e)
        raise SystemExit(1)
    # Attach each protein to the gene rows that share one of its biogrid ids.
    # Rows are processed in file order, so a later mapping for the same id
    # overwrites an earlier one.
    for _, row in uniprot.iterrows():
        biogrid_ids = row['biogrid']
        prot = row['uniprot']
        if pd.isna(biogrid_ids) or pd.isna(prot):
            continue
        # Some uniprots have more than one biogrid id, separated by commas.
        # str() also covers a column that pandas parsed as numeric.
        for b in str(biogrid_ids).split(","):
            b = b.strip()
            if not b:
                continue
            # A numeric column holding missing values is parsed as float, so
            # an id may be rendered as "123.0"; go through float() to accept it.
            bio.loc[bio['biogrid_id'] == int(float(b)), 'uniprot'] = prot
    to_atomese(bio)