# coronavirus_biogrid.py -- forked from MOZI-AI/knowledge-import
__author__ = "Hedra & Abdulrahman"
__email__ = "[email protected] & [email protected]"
# The following script imports SARS-CoV-2 (COVID-19) and coronavirus-related interactions from BioGRID (thebiogrid.org)
# Requires: BIOGRID-CORONAVIRUS-3.5.183.tab3.zip
# from https://downloads.thebiogrid.org/File/BioGRID/Release-Archive/BIOGRID-3.5.183/BIOGRID-CORONAVIRUS-3.5.183.tab3.zip
# Version 3.5.183 is the first release (March 25, 2020); any later release with the same format can also be used
import argparse
import os
import zipfile
from datetime import date
import pandas as pd
import wget
import metadata
from atomwrappers import *
def evaLink(node1, node1_type, node2, node2_type, predicate, prefix1="", prefix2="", symmetric=False, stv=""):
    """Render an EvaluationLink between two nodes as text.

    Args:
        node1: Name of the first node (converted with ``str``).
        node1_type: Atom type written for the first node, e.g. "ConceptNode".
        node2: Name of the second node (converted with ``str``).
        node2_type: Atom type written for the second node.
        predicate: Name placed inside the PredicateNode.
        prefix1: Text prepended to the first node's name.
        prefix2: Text prepended to the second node's name.
        symmetric: When true the pair is wrapped in a SetLink, otherwise in
            a ListLink.
        stv: Text inserted right after "(EvaluationLink "; empty by default.

    Returns:
        The rendered link, or an empty string when either node name is one
        of the skipped placeholders "-" or "nan".
    """
    # Nothing is emitted for placeholder names ("nan" is also what str()
    # yields for a float NaN).
    if str(node1) in ("-", "nan") or str(node2) in ("-", "nan"):
        return ""

    list_type = "SetLink" if symmetric else "ListLink"
    pieces = (
        "(EvaluationLink {}\n".format(stv),
        "\t (PredicateNode \"{}\")\n".format(predicate),
        "\t ({} \n".format(list_type),
        "\t\t ({} \"{}{}\")\n".format(node1_type, prefix1, str(node1)),
        "\t\t ({} \"{}{}\")))\n".format(node2_type, prefix2, str(node2)),
    )
    return "".join(pieces)
def member(node1, node1_type, node2, node2_type, prefix1="", prefix2=""):
    """Render a MemberLink between two nodes as text.

    Args:
        node1: Name of the member node (converted with ``str``).
        node1_type: Atom type written for the member node.
        node2: Name of the containing node (converted with ``str``).
        node2_type: Atom type written for the containing node.
        prefix1: Text prepended to the first node's name.
        prefix2: Text prepended to the second node's name.

    Returns:
        The rendered link, or an empty string when either node name is one
        of the skipped placeholders "-" or "nan".
    """
    # Nothing is emitted for placeholder names ("nan" is also what str()
    # yields for a float NaN).
    if str(node1) in ("-", "nan") or str(node2) in ("-", "nan"):
        return ""

    return "".join((
        '(MemberLink\n',
        '\t({} "{}{}")\n'.format(node1_type, prefix1, str(node1)),
        '\t({} "{}{}"))\n'.format(node2_type, prefix2, str(node2)),
    ))
def add_taxonomy(taxonomy_id, node, fp):
    """Write a MemberLink placing ``node`` in the concept ``ncbi:<taxonomy_id>``.

    Args:
        taxonomy_id: Taxonomy identifier; converted with ``str`` and prefixed
            with "ncbi:" to form the ConceptNode name.
        node: Atom wrapper (from ``atomwrappers``) to be made a member.
        fp: Writable text file object that receives the printed link
            followed by a newline.
    """
    taxonomy_concept = CConceptNode("ncbi:" + str(taxonomy_id))
    member_ln = CMemberLink(node, taxonomy_concept)
    fp.write(member_ln.recursive_print() + "\n")
def define_protein_lns(gene_node, prot_node, bio_id, fp):
    """Write the links that tie a gene to its protein and BioGRID ID.

    Three EvaluationLinks are written to ``fp``, one per line group, in this
    order: ``expresses(gene, protein)``, ``has_biogridID(gene, Bio:<id>)``
    and ``has_biogridID(protein, Bio:<id>)``.

    Args:
        gene_node: Atom wrapper for the gene.
        prot_node: Atom wrapper for the protein.
        bio_id: BioGRID identifier; appended to the "Bio:" prefix.
        fp: Writable text file object.
    """
    bio_name = "Bio:" + str(bio_id)
    links = [CEvaluationLink(CPredicateNode("expresses"), CListLink(gene_node, prot_node))]
    # The gene and the protein both get the same BioGRID identifier.
    for node in (gene_node, prot_node):
        links.append(CEvaluationLink(CPredicateNode("has_biogridID"),
                                     CListLink(node, CConceptNode(bio_name))))
    for link in links:
        fp.write(link.recursive_print() + "\n")
def add_protein_interaction(proteins_lst, prot_node_1, gene_node_1, prot_node_2, gene_node_2, bio_id_1, bio_id_2, fp):
    """Write the defining links for each protein not yet recorded.

    For each of the two interactors, if the protein node's ``name`` is not
    already in ``proteins_lst``, its gene/protein/BioGRID links are written
    through ``define_protein_lns`` and the name is appended to the list.

    Args:
        proteins_lst: List of protein names already written; mutated in place.
        prot_node_1: Protein atom wrapper of the first interactor.
        gene_node_1: Gene atom wrapper of the first interactor.
        prot_node_2: Protein atom wrapper of the second interactor.
        gene_node_2: Gene atom wrapper of the second interactor.
        bio_id_1: BioGRID ID of the first interactor.
        bio_id_2: BioGRID ID of the second interactor.
        fp: Writable text file object that receives the links.
    """
    interactors = (
        (prot_node_1, gene_node_1, bio_id_1),
        (prot_node_2, gene_node_2, bio_id_2),
    )
    for prot_node, gene_node, bio_id in interactors:
        # Checked one at a time so a self-interaction is only defined once.
        if prot_node.name not in proteins_lst:
            define_protein_lns(gene_node, prot_node, bio_id, fp)
            proteins_lst.append(prot_node.name)
def process_data(version, file_path):
    """Load a BioGRID coronavirus table and hand it to ``import_data``.

    Args:
        version: BioGRID release to download, e.g. "3.5.183". An empty value
            selects the "LATEST" archive. Ignored when ``file_path`` is set;
            in both modes the version finally used is parsed from the data
            file name.
        file_path: Path to a local tab-separated file. When empty the
            archive is downloaded instead.

    Raises:
        Exception: Any error raised while downloading or reading the archive
            is re-raised after a message is printed. Errors in the local-file
            mode are printed and not re-raised.
    """
    if file_path:
        # NOTE(review): this branch deliberately reports the error and returns
        # normally, unlike the download branch below which re-raises.
        try:
            data = pd.read_csv(file_path, low_memory=False, delimiter='\t')
            version = file_path.split('-')[-1].replace(".tab3.txt", "")
            import_data(data, file_path, version, gene_level=True)
        except Exception as e:
            print(e)
        return

    if version:
        source = 'https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-' + version + '/BIOGRID-CORONAVIRUS-' + version + '.tab3.zip'
    else:
        source = 'https://downloads.thebiogrid.org/Download/BioGRID/Latest-Release/BIOGRID-CORONAVIRUS-LATEST.tab3.zip'

    try:
        # wget only treats "raw_data" as a directory if it already exists;
        # otherwise the archive is saved as a file literally named "raw_data",
        # which has no ".zip" suffix for pandas to infer compression from.
        if not os.path.exists("raw_data"):
            os.makedirs("raw_data")
        dataset = wget.download(source, "raw_data")
        # The release number is taken from the first member of the archive.
        with zipfile.ZipFile(dataset) as archive:
            version = archive.namelist()[0].split('-')[-1].replace(".tab3.txt", "")
        print(version)
        data = pd.read_csv(dataset, low_memory=False, delimiter='\t')
    except Exception:
        print("Error processing biogrid version {0}".format(version))
        raise
    import_data(data, source, version, gene_level=True)
def import_data(data, source, version, gene_level=False, form='tab2'):
    """Convert a BioGRID coronavirus table into .scm files.

    Writes ``dataset/COVID-19-biogrid_<version>_<date>.scm`` with gene and
    protein interactions, entrez IDs, taxonomy membership and protein
    definitions. When ``gene_level`` is true, a second file
    ``gene-level/COVID-19-biogrid_<version>_gene-level_<date>.scm`` receives
    only the gene-gene interactions and gene taxonomy links. The names of
    genes whose organism ID is 2697049 are written to ``Covid19-genes`` and
    summary counts are passed to ``metadata.update_meta``.

    Args:
        data: pandas DataFrame with the columns 'Official Symbol Interactor
            A/B', 'SWISS-PROT Accessions Interactor A/B', 'Entrez Gene
            Interactor A/B', 'Organism ID Interactor A/B', 'BioGRID ID
            Interactor A/B' and 'Score'.
        source: Origin of the data (URL or path), recorded in the metadata.
        version: Release label used in file names and the metadata entry.
        gene_level: Set to True to also get the gene-level file (only the
            GGI, without extra entrez and pubmedID info).
        form: Unused; kept so existing callers keep working.
    """
    # 2697049 is given the name "SARS-CoV-2" by the has_name link below.
    sars_cov_2_taxonomy_id = 2697049

    print("started importing")
    if not os.path.exists(os.path.join(os.getcwd(), 'dataset')):
        os.makedirs('dataset')
    # Computed once so the reported path always matches the file written.
    today = str(date.today())
    dataset_path = 'dataset/COVID-19-biogrid_' + version + "_" + today + '.scm'

    # ``g`` stays None when no gene-level output is requested, so every
    # gene-level write below is guarded instead of hitting an unbound name.
    g = None
    if gene_level:
        if not os.path.exists(os.path.join(os.getcwd(), 'gene-level')):
            os.makedirs('gene-level')
        g = open('gene-level/COVID-19-biogrid_' + version + "_gene-level_" + today + '.scm', 'w')

    gene_pairs = set()
    protein_pairs = set()
    entrez = set()
    covid_genes = []
    # Must remain a list: add_protein_interaction appends to it.
    proteins = []

    try:
        with open(dataset_path, 'w') as f:
            for _, row in data.iterrows():
                if (pd.isnull(row['Official Symbol Interactor A'])
                        or pd.isnull(row['Official Symbol Interactor B'])):
                    continue

                gene1 = str(row['Official Symbol Interactor A']).upper().strip()
                gene2 = str(row['Official Symbol Interactor B']).upper().strip()
                prot1 = str(row['SWISS-PROT Accessions Interactor A']).strip()
                prot2 = str(row['SWISS-PROT Accessions Interactor B']).strip()
                score = row['Score']
                entrez1 = str(row['Entrez Gene Interactor A']).strip()
                entrez2 = str(row['Entrez Gene Interactor B']).strip()
                taxonomy_id_1 = int(row['Organism ID Interactor A'])
                taxonomy_id_2 = int(row['Organism ID Interactor B'])

                gene_node_1 = CGeneNode(gene1)
                gene_node_2 = CGeneNode(gene2)
                prot_node_1 = CMoleculeNode("Uniprot:" + prot1)
                prot_node_2 = CMoleculeNode("Uniprot:" + prot2)

                # Rows without a usable score get no truth value.
                stv_node = None
                if str(score) not in ["-", "nan"]:
                    stv_node = CStv(1.0, round(float(score), 3))

                if (gene1, gene2) not in gene_pairs:
                    for gene, gene_node, entrez_id in ((gene1, gene_node_1, entrez1),
                                                       (gene2, gene_node_2, entrez2)):
                        if gene not in entrez:
                            entrez_ln = CEvaluationLink(
                                CPredicateNode("has_entrez_id"),
                                CListLink(gene_node, CConceptNode("entrez:" + entrez_id)))
                            f.write(entrez_ln.recursive_print() + "\n")
                            entrez.add(gene)

                    interacts_ln = CEvaluationLink(CPredicateNode("interacts_with"),
                                                   CSetLink(gene_node_1, gene_node_2), stv=stv_node)
                    f.write(interacts_ln.recursive_print() + "\n")
                    if g is not None:
                        g.write(interacts_ln.recursive_print() + "\n")

                    for taxonomy_id, gene, gene_node, prot_node in (
                            (taxonomy_id_1, gene1, gene_node_1, prot_node_1),
                            (taxonomy_id_2, gene2, gene_node_2, prot_node_2)):
                        if taxonomy_id == sars_cov_2_taxonomy_id:
                            covid_genes.append(gene)
                            add_taxonomy(taxonomy_id, gene_node, f)
                            add_taxonomy(taxonomy_id, prot_node, f)
                            if g is not None:
                                add_taxonomy(taxonomy_id, gene_node, g)
                    gene_pairs.add((gene1, gene2))

                if (prot1, prot2) not in protein_pairs:
                    interacts_ln = CEvaluationLink(CPredicateNode("interacts_with"),
                                                   CSetLink(prot_node_1, prot_node_2), stv=stv_node)
                    f.write(interacts_ln.recursive_print() + "\n")
                    bio_1 = str(row['BioGRID ID Interactor A']).strip()
                    bio_2 = str(row['BioGRID ID Interactor B']).strip()
                    add_protein_interaction(proteins, prot_node_1, gene_node_1, prot_node_2, gene_node_2,
                                            bio_1, bio_2, f)
                    protein_pairs.add((prot1, prot2))

            name_ln = evaLink(str(sars_cov_2_taxonomy_id), "ConceptNode", "SARS-CoV-2", "ConceptNode",
                              "has_name", prefix1="ncbi:")
            f.write(name_ln)
            if g is not None:
                g.write(name_ln)
    finally:
        if g is not None:
            g.close()

    # (A, B) and (B, A) are the same interaction for the summary count.
    number_of_interactions = len(set((a, b) if a <= b else (b, a) for a, b in gene_pairs))
    script = "https://github.com/MOZI-AI/knowledge-import/coronavirus_biogrid.py"
    metadata.update_meta("Coronavirus Biogrid:" + version, source, script, genes=str(len(entrez)),
                         prot=len(set(proteins)), interactions=str(number_of_interactions))
    print("Done, check " + dataset_path)
    with open("Covid19-genes", "w") as co:
        # Sorted so repeated runs produce the same file.
        co.write("\n".join(sorted(set(covid_genes))))
def parse_args(argv=None):
    """Parse the command-line options of the script.

    Args:
        argv: Optional list of argument strings. When None, argparse reads
            the process's command line (``sys.argv[1:]``).

    Returns:
        An ``argparse.Namespace`` with ``path`` (str, default ''),
        ``download`` (bool) and ``version`` (str, default '').
    """
    parser = argparse.ArgumentParser(description='convert biogrid db to atomese')
    parser.add_argument('--path', type=str, default='',
                        help='process local file in biogrid format')
    # NOTE(review): store_true combined with default=True means this option
    # is True whether or not the flag is given.
    parser.add_argument('--download', action='store_true', default=True,
                        help='download and process db from biogrid')
    parser.add_argument('--version', type=str, default='',
                        help='version to download(by default latest is used)')
    return parser.parse_args(argv)
if __name__ == "__main__":
    # Usage:
    #   Process a source file that has already been downloaded:
    #       python coronavirus_biogrid.py --path=path/to/the/source_data
    #   Or download a release; pass --version to pick one, or omit it to
    #   get the latest:
    #       python coronavirus_biogrid.py --version=3.5.183
    arguments = parse_args()
    process_data(arguments.version, arguments.path)