string_PPI.py (forked from MOZI-AI/knowledge-import)
# Uniprot to string mapping https://string-db.org/mapping_files/uniprot/human.uniprot_2_string.2018.tsv.gz
# String PPI dataset https://stringdb-static.org/download/protein.actions.v11.0/9606.protein.actions.v11.0.txt.gz
# Columns definition http://www.string-db.org/help/faq/#what-does-the-columns-in-proteinsactions-file-mean
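# Note on the input layout (as used by the code below, not an exhaustive description):
# the actions file is read as a tab-separated table and this script uses its columns
# item_id_a, item_id_b, mode, is_directional, a_is_acting and score (see the FAQ link
# above for the full column definitions); the mapping file is read without a header and
# its columns are named code, uniprot, ensembl, num1, num2 when loaded below.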

import pandas as pd
import wget
import os
import sys
import metadata
import datetime

source = "https://stringdb-static.org/download/protein.actions.v11.0/9606.protein.actions.v11.0.txt.gz"
mapping = "https://string-db.org/mapping_files/uniprot/human.uniprot_2_string.2018.tsv.gz"


def evaLink(term1, term2, predicate, link_type="ListLink", stv="", ppi=True):
    """Return an Atomese EvaluationLink for a protein (Uniprot) or gene pair, or "" if either term is NaN."""
    if not (str(term1) == "nan" or str(term2) == "nan"):
        if ppi:
            return ("(EvaluationLink" + stv + "\n" +
                    "\t (PredicateNode \"" + predicate + "\")\n" +
                    "\t (" + link_type + " \n" +
                    "\t\t (MoleculeNode" + " \"Uniprot:" + term1 + "\")\n" +
                    "\t\t (MoleculeNode" + " \"Uniprot:" + term2 + "\")))\n")
        else:
            return ("(EvaluationLink" + stv + "\n" +
                    "\t (PredicateNode \"" + predicate + "\")\n" +
                    "\t (" + link_type + " \n" +
                    "\t\t (GeneNode" + " \"" + term1.upper() + "\")\n" +
                    "\t\t (GeneNode" + " \"" + term2.upper() + "\")))\n")
    else:
        return ""


def import_string():
    print("started at " + str(datetime.datetime.now()))
    # make sure the download directory exists before fetching the source files
    if not os.path.exists("raw_data"):
        os.makedirs("raw_data")
    if not os.path.exists('raw_data/9606.protein.actions.v11.0.txt.gz'):
        wget.download(source, "raw_data/")
    if not os.path.exists('raw_data/human.uniprot_2_string.2018.tsv.gz'):
        wget.download(mapping, "raw_data/")
    df_data = pd.read_csv("raw_data/9606.protein.actions.v11.0.txt.gz", dtype=str, sep="\t")
    df_data_symmetric = df_data[df_data['is_directional'] == "f"]
    df_data_asymmetric = df_data[df_data['is_directional'] == "t"]
    df_mapping = pd.read_csv("raw_data/human.uniprot_2_string.2018.tsv.gz", dtype=str, sep="\t",
                             names=["code", "uniprot", "ensembl", "num1", "num2"])
    # create a dictionary from STRING (Ensembl protein) ids to Uniprot entries,
    # keeping the first Uniprot match per id
    mapping_dict = {}
    for e in df_mapping["ensembl"]:
        if e not in mapping_dict:
            mapping_dict[e] = df_mapping[df_mapping["ensembl"] == e]["uniprot"].values[0]
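    # A faster, behavior-equivalent way to build the dictionary above (a sketch; it keeps the
    # first uniprot entry per ensembl id, which is what the loop above effectively does):
    #   mapping_dict = (df_mapping.drop_duplicates("ensembl")
    #                             .set_index("ensembl")["uniprot"].to_dict())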
print("Done with the Dict, importing into atomese")
print(len(df_data))
notmapped = []
"""
If the directionality of the interaction is true and a is acting, use ListLink and keep the order. Otherwise use SetLink
* is_directional - describes if the diractionality of the particular interaction is known.
* a_is_acting - the directionality of the action if applicable ('t' gives that item_id_a is acting upon item_id_b)
Example:
item_id_a item_id_b mode is_directional a_is_acting
ENSP00000000233 ENSP00000216366 reaction f f
<=> EvaluationLink
PredicateNode "reaction"
SetLink ENSP00000000233 ENSP00000216366
ENSP00000000233 ENSP00000216366 reaction t f
<=> EvaluationLink
PredicateNode "reaction"
ListLink ENSP00000216366 ENSP00000000233
ENSP00000000233 ENSP00000216366 reaction t t
<=> EvaluationLink
PredicateNode "reaction"
ListLink ENSP00000000233 ENSP00000216366
Keep symmetric relations and ignore if the same relation happens to be asymmetric
"""
    symmetric = {}
    if not os.path.exists(os.path.join(os.getcwd(), 'string_dataset')):
        os.makedirs('string_dataset')
    with open("string_dataset/string_ppi_{}.scm".format(str(datetime.date.today())), "w") as f, \
         open("string_dataset/string_ggi_{}.scm".format(str(datetime.date.today())), "w") as g:
        # symmetric interactions: order does not matter, so both members go into a SetLink
        for i in range(len(df_data_symmetric)):
            try:
                prot1 = df_data_symmetric.iloc[i]['item_id_a']
                prot2 = df_data_symmetric.iloc[i]['item_id_b']
                mode = df_data_symmetric.iloc[i]['mode']
                score = int(df_data_symmetric.iloc[i]['score'])
                if prot1 in mapping_dict and prot2 in mapping_dict:
                    prot1 = mapping_dict[prot1]
                    prot2 = mapping_dict[prot2]
                else:
                    if prot1 not in mapping_dict:
                        notmapped.append(prot1)
                    else:
                        notmapped.append(prot2)
                    continue
                # mapping values look like "<accession>|<entry name>": take the accession as the
                # protein id and the prefix of the entry name as the gene-level symbol
                protein1 = prot1.split("|")[0]
                gene1 = prot1.split("|")[1].split("_")[0]
                protein2 = prot2.split("|")[0]
                gene2 = prot2.split("|")[1].split("_")[0]
                f.write(evaLink(protein1, protein2, mode, stv="(stv {} {})".format(1.0, score / 1000), link_type="SetLink"))
                g.write(evaLink(gene1, gene2, mode, stv="(stv {} {})".format(1.0, score / 1000), link_type="SetLink", ppi=False))
                symmetric[gene1 + gene2] = mode
            except Exception as e:
                print(e)
        # directional interactions: ListLink with the acting protein first
        for i in range(len(df_data_asymmetric)):
            try:
                prot1 = df_data_asymmetric.iloc[i]['item_id_a']
                prot2 = df_data_asymmetric.iloc[i]['item_id_b']
                mode = df_data_asymmetric.iloc[i]['mode']
                a_is_acting = df_data_asymmetric.iloc[i]['a_is_acting']
                score = int(df_data_asymmetric.iloc[i]['score'])
                if prot1 in mapping_dict and prot2 in mapping_dict:
                    prot1 = mapping_dict[prot1]
                    prot2 = mapping_dict[prot2]
                else:
                    if prot1 not in mapping_dict:
                        notmapped.append(prot1)
                    else:
                        notmapped.append(prot2)
                    continue
                protein1 = prot1.split("|")[0]
                gene1 = prot1.split("|")[1].split("_")[0]
                protein2 = prot2.split("|")[0]
                gene2 = prot2.split("|")[1].split("_")[0]
                # skip pairs already exported above as symmetric relations with the same mode
                if not (gene1 + gene2 in symmetric and symmetric[gene1 + gene2] == mode):
                    if a_is_acting == "t":
                        # item a acts on item b: keep the order
                        f.write(evaLink(protein1, protein2, mode, stv="(stv {} {})".format(1.0, score / 1000)))
                        g.write(evaLink(gene1, gene2, mode, stv="(stv {} {})".format(1.0, score / 1000), ppi=False))
                    else:
                        # item b acts on item a: reverse the order
                        f.write(evaLink(protein2, protein1, mode, stv="(stv {} {})".format(1.0, score / 1000)))
                        g.write(evaLink(gene2, gene1, mode, ppi=False, stv="(stv {} {})".format(1.0, score / 1000)))
            except Exception as e:
                print(e)
print("Done " + str(datetime.datetime.now()))
with open("string_dataset/notmapped_ensembles.txt", "w") as n:
n.write("\n".join(set(notmapped)))
if __name__ == "__main__":
import_string()
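
# Usage sketch (assumes the pandas and wget packages are installed and network access to
# the STRING download servers is available):
#   $ python string_PPI.py
# Outputs are written to string_dataset/: string_ppi_<date>.scm, string_ggi_<date>.scm and
# notmapped_ensembles.txt; the raw source files are cached under raw_data/.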