forked from MOZI-AI/knowledge-import
-
Notifications
You must be signed in to change notification settings - Fork 0
/
GO_scm.py
executable file
·149 lines (137 loc) · 5.72 KB
/
GO_scm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/env python2.7
# 2017-12-03
# Edited March 2020
# Script to convert go.obo to atomspace representation in scheme
# Requires: file go.obo from http://www.berkeleybop.org/ontologies/go.obo or http://snapshot.geneontology.org/ontology/go.obo
import re
import wget
import metadata
import os
from datetime import date
# URL of the Gene Ontology OBO snapshot that gets converted.
source = "http://snapshot.geneontology.org/ontology/go.obo"
# Scheme output file; the name carries today's date.
output_data = 'dataset/GO_{}.scm'.format(str(date.today()))

# Remove leftovers from a previous run so the download and the appended
# output both start from scratch.
if os.path.exists(os.path.join(os.getcwd(), output_data)):
    os.remove(output_data)
if os.path.exists('raw_data/go.obo'):
    os.remove('raw_data/go.obo')

# Make sure the download directory exists before handing it to wget.
if not os.path.isdir('raw_data'):
    os.makedirs('raw_data')
dataset = wget.download(source, "raw_data/")

# Read the whole ontology into memory; the context manager closes the
# handle as soon as the lines have been read.
with open(dataset) as f:
    lines = f.readlines()

print("\nStarted importing\n")

# 1-based line numbers of every "[Term]" / "[Typedef]" stanza header.
# enumerate() yields them in ascending order, so no sort is required.
line_no = [
    num
    for num, line in enumerate(lines, 1)
    if "[Term]" in line or "[Typedef]" in line
]
# function to write on file
def inLink(node1, node2):
    """Write an InheritanceLink between two ConceptNodes to ``f_go``.

    node1 -- name of the first (child) ConceptNode
    node2 -- name of the second (parent) ConceptNode

    Double quotes inside either name are backslash-escaped so the emitted
    Scheme string literals stay well-formed.  Output goes to the
    module-level ``f_go`` file object, which must be open for writing.
    """
    f_go.write(
        '(InheritanceLink \n'
        '\t (ConceptNode "{}")\n'
        '\t (ConceptNode "{}")\n'
        ')\n\n'.format(
            node1.replace('"', '\\"'),
            node2.replace('"', '\\"'),
        )
    )
#
def evaLink(predicateName, node1, node2, node1_type, node2_type):
    """Write an EvaluationLink relating two nodes to ``f_go``.

    predicateName -- name of the PredicateNode
    node1, node2  -- names of the two nodes placed in the ListLink
    node1_type, node2_type -- atom type written for each node
                              (e.g. "ConceptNode")

    Double quotes inside the predicate or node names are
    backslash-escaped so the emitted Scheme string literals stay
    well-formed.  Output goes to the module-level ``f_go`` file object,
    which must be open for writing.
    """
    f_go.write(
        '(EvaluationLink \n'
        '\t (PredicateNode "{}")\n'
        '\t (ListLink \n'
        '\t\t ({} "{}")\n'
        '\t\t ({} "{}")\n'
        '\t )\n'
        ')\n\n'.format(
            predicateName.replace('"', '\\"'),
            node1_type, node1.replace('"', '\\"'),
            node2_type, node2.replace('"', '\\"'),
        )
    )
#
def go_term(idd):
    """Emit an InheritanceLink from ``idd`` to the "GO_term" concept."""
    inLink(node1=idd, node2="GO_term")
def go_name(idd, name):
    """Emit a "GO_name" EvaluationLink pairing ``idd`` with ``name``."""
    evaLink(
        predicateName="GO_name",
        node1=idd,
        node2=name,
        node1_type="ConceptNode",
        node2_type="ConceptNode",
    )
def go_namespace(idd, namespace):
    """Emit a "GO_namespace" EvaluationLink pairing ``idd`` with ``namespace``."""
    evaLink(
        predicateName="GO_namespace",
        node1=idd,
        node2=namespace,
        node1_type="ConceptNode",
        node2_type="ConceptNode",
    )
def go_definition(idd, definition):
    """Emit a "GO_definition" EvaluationLink pairing ``idd`` with ``definition``.

    Every double-quote character is removed from ``definition`` before
    it is written.
    """
    unquoted = "".join(definition.split('"'))
    evaLink(
        predicateName="GO_definition",
        node1=idd,
        node2=unquoted,
        node1_type="ConceptNode",
        node2_type="ConceptNode",
    )
# def go_synonyms(idd,synonyms,synonym_type):
# evaLink(("GO_synonym_" +synonym_type),idd ,synonyms, "ConceptNode", "ConceptNode")
def go_isa(idd, isa_id):
    """Emit an InheritanceLink from ``idd`` to its ``isa_id`` parent."""
    inLink(node1=idd, node2=isa_id)
#
def go_altid(idd, alt_id):
    """Emit a "GO_alt_id" EvaluationLink pairing ``idd`` with ``alt_id``."""
    evaLink(
        predicateName="GO_alt_id",
        node1=idd,
        node2=alt_id,
        node1_type="ConceptNode",
        node2_type="ConceptNode",
    )
def go_relationship(idd, relate_id, relation_type):
    """Emit an EvaluationLink pairing ``idd`` with ``relate_id``.

    The predicate name is "GO_" followed by ``relation_type``.
    """
    predicate = "".join(("GO_", relation_type))
    evaLink(
        predicateName=predicate,
        node1=idd,
        node2=relate_id,
        node1_type="ConceptNode",
        node2_type="ConceptNode",
    )
# Matches a square-bracketed span; used to drop the bracketed part of
# "def:" lines.
_BRACKETED = re.compile(r'\[.*?\]')


def _obo_value(raw):
    """Return the value part of a "tag: value" line.

    ``raw`` is the text following the first ':' of the line.  The result
    is whatever follows the first space on the first line of ``raw``,
    with every backslash doubled.
    """
    return raw.partition('\n')[0].partition(' ')[2].replace('\\', '\\\\')


def _obo_ref(raw):
    """Like ``_obo_value`` but drop a trailing "! comment" and strip."""
    head = raw.partition('\n')[0].partition('!')[0]
    return head.partition(' ')[2].replace('\\', '\\\\').strip()


# Make sure the output directory exists before opening the file.
_out_dir = os.path.dirname(output_data)
if _out_dir and not os.path.isdir(_out_dir):
    os.makedirs(_out_dir)

# Ids of the exported (non-obsolete) terms, grouped by GO namespace.
goterm = {"biological_process": [], "molecular_function": [], "cellular_component": []}

# ``f_go`` is the module-level handle that inLink/evaLink write to; the
# context manager guarantees it is closed even if parsing fails.
with open(output_data, 'a') as f_go:
    for i, start in enumerate(line_no):
        # line_no holds 1-based header line numbers, so used as a 0-based
        # index ``start`` is the first line after the stanza header.  The
        # stanza ends on the line before the next header, or at end of
        # file for the last stanza.
        if i + 1 == len(line_no):
            end = len(lines)
        else:
            end = line_no[i + 1] - 1

        is_a = []
        alt_id = []
        relationship = []
        relationship_type = []
        idd = ""
        name = ""
        namespace = ""
        obsolete = ""
        definition = ""  # reset per stanza so no stale value carries over

        for stanza_line in lines[start:end]:
            tag, _, raw = stanza_line.partition(':')
            if tag == 'is_obsolete':
                obsolete = _obo_value(raw)
            elif tag == 'id':
                idd = _obo_value(raw)
            elif tag == 'name':
                name = _obo_value(raw)
            elif tag == 'namespace':
                namespace = _obo_value(raw)
            elif tag == 'def':
                text = raw.partition('\n')[0].partition(' ')[2]
                definition = _BRACKETED.sub("", text).replace('\\', '')
            elif tag == 'alt_id':
                alt_id.append(_obo_value(raw))
            elif tag == 'relationship':
                text = raw.partition('\n')[0]
                # The relation type is the word preceding the first "GO".
                rel_type = text.partition('GO')[0].split(' ')[1].replace('\\', '\\\\')
                relationship_type.append(rel_type)
                # The related id is what follows the relation type, up
                # to an optional "! comment".
                target = text.partition(rel_type)[2].partition('!')[0]
                relationship.append(
                    target.partition(' ')[2].replace('\\', '\\\\').strip()
                )
            elif tag == 'is_a':
                is_a.append(_obo_ref(raw))

        # Only non-obsolete stanzas whose id contains "GO:" are exported.
        if obsolete != 'true' and "GO:" in idd:
            go_name(idd, name)
            go_namespace(idd, namespace)
            if namespace in goterm:
                goterm[namespace].append(idd)
            for parent in is_a:
                go_isa(idd, parent)
            # NOTE: export of go_term, definition, alt_id and relationship
            # links is currently disabled.  The values are still parsed
            # above so they can be re-enabled through go_term,
            # go_definition, go_altid and go_relationship.

# Number of distinct exported term ids per namespace, for the metadata.
ns = {space: len(set(ids)) for space, ids in goterm.items()}
script = "https://github.com/MOZI-AI/knowledge-import/GO_scm.py"
metadata.update_meta("GO Obo:latest", source, script, goterms=ns)
# Report the file that was actually written (its name carries the date).
print("Done, check " + output_data)