-
Notifications
You must be signed in to change notification settings - Fork 4
/
gene_ontology.py
116 lines (94 loc) · 4.05 KB
/
gene_ontology.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from collections import defaultdict
class GeneOntology(object):
    """In-memory index of the GO terms read from an OBO-style ontology file.

    Attributes:
        all_go: ``defaultdict`` mapping every GO id (primary ids and their
            ``alt_id`` aliases) to ``{'name': str, 'go': str, 'parents': set}``.
            ``'go'`` is ``'bpo'``, ``'mfo'`` or ``'cco'`` (``''`` when the
            namespace is missing or unrecognised); ``'parents'`` holds all
            ``is_a`` ancestors of the term, not only the direct ones.
        mfo, bpo, cco: the entries of ``all_go`` whose ``'go'`` value equals
            the attribute name. The entry dicts are shared with ``all_go``.
    """

    # Namespace values as written in the ontology file -> short codes stored
    # under the 'go' key of each entry.
    _NAMESPACE_CODES = {
        'biological_process': 'bpo',
        'molecular_function': 'mfo',
        'cellular_component': 'cco',
    }

    def __init__(self, onto_file):
        """Parse *onto_file* and build the per-namespace views.

        Args:
            onto_file: path of the ontology file to read.
        """
        self.all_go = defaultdict(dict)
        self._parse_go(onto_file)
        self.mfo = self._get_go_annotations('mfo')
        self.bpo = self._get_go_annotations('bpo')
        self.cco = self._get_go_annotations('cco')

    @staticmethod
    def _first_token(value):
        """Return the leading identifier in *value*.

        Anything after a ``!`` comment marker and any whitespace-separated
        trailer is dropped; an empty string is returned when nothing is left.
        """
        tokens = value.split('!', 1)[0].split()
        return tokens[0] if tokens else ''

    def _store_term(self, term):
        """Register a parsed term (and its alternative ids) in ``all_go``.

        Terms without a GO id are ignored so no entry is created under ''.
        """
        if not term or not term['id']:
            return
        go_id = term['id']
        if go_id in self.all_go:
            # Same notice the parser has always emitted for repeated ids.
            print(go_id)
        for key in [go_id] + sorted(term['alt_ids']):
            # Each id gets its own parent set so later in-place updates of
            # one entry cannot leak into another.
            self.all_go[key] = {
                'name': term['name'],
                'go': term['go'],
                'parents': set(term['parents']),
            }

    def _parse_go(self, onto_file):
        """Fill ``all_go`` from the ``[Term]`` stanzas of *onto_file*.

        Only the tags ``id``, ``alt_id``, ``name``, ``namespace`` and ``is_a``
        are read. After all terms are stored, every entry's ``'parents'`` set
        is extended with its transitive ``is_a`` ancestors.
        """
        current = None  # term under construction; None outside [Term] stanzas
        with open(onto_file, encoding='utf-8') as read_in:
            for raw_line in read_in:
                line = raw_line.strip()
                if line.startswith('['):
                    # Any stanza header closes the previous term; only
                    # [Term] opens a new one (e.g. [Typedef] is skipped).
                    self._store_term(current)
                    if line.startswith('[Term]'):
                        current = {'id': '', 'name': '', 'go': '',
                                   'parents': set(), 'alt_ids': set()}
                    else:
                        current = None
                    continue
                if current is None:
                    continue
                # Split on the first colon only, so values that themselves
                # contain ':' (GO ids, many term names) stay intact.
                tag, sep, value = line.partition(':')
                if not sep:
                    continue
                tag = tag.strip()
                value = value.strip()
                if tag == 'id':
                    term_id = self._first_token(value)
                    if term_id.startswith('GO:'):
                        current['id'] = term_id
                elif tag == 'alt_id':
                    alt_id = self._first_token(value)
                    if alt_id.startswith('GO:'):
                        current['alt_ids'].add(alt_id)
                elif tag == 'name':
                    current['name'] = value
                elif tag == 'namespace':
                    code = self._NAMESPACE_CODES.get(value)
                    if code:
                        current['go'] = code
                elif tag == 'is_a':
                    parent_id = self._first_token(value)
                    if parent_id:
                        current['parents'].add(parent_id)
        # The last stanza has no following header to trigger its storage.
        self._store_term(current)

        # include all parents (also grandparents,...)
        for go_term in list(self.all_go):
            self.all_go[go_term]['parents'].update(self._set_parents(go_term))

    def _set_parents(self, term):
        """Return the set of all ancestors reachable from *term*.

        The walk is iterative and tracks visited ids, so shared ancestors are
        expanded once. Parent ids that have no entry in ``all_go`` are kept in
        the result but not expanded. An unknown *term* yields an empty set.
        """
        entry = self.all_go.get(term)  # .get avoids defaultdict auto-insert
        if not entry:
            return set()
        ancestors = set()
        pending = list(entry.get('parents', ()))
        while pending:
            parent = pending.pop()
            if parent in ancestors:
                continue
            ancestors.add(parent)
            parent_entry = self.all_go.get(parent)
            if parent_entry:
                pending.extend(parent_entry.get('parents', ()))
        return ancestors

    def _get_go_annotations(self, onto):
        """Return the entries of ``all_go`` whose ``'go'`` code equals *onto*.

        Returns:
            A ``defaultdict(dict)`` keyed by GO id; values are the same dict
            objects stored in ``all_go``.
        """
        ontology = defaultdict(dict)
        for go_id, entry in self.all_go.items():
            if entry.get('go') == onto:
                ontology[go_id] = entry
        return ontology

    def get_parent_terms(self, go_term):
        """Return the ancestor set stored for *go_term*, or an empty set if unknown."""
        entry = self.all_go.get(go_term)
        if not entry:
            return set()
        return entry.get('parents', set())

    def get_all_terms(self, leaf_annotations):
        """Expand each key's GO terms with all of their ancestors.

        Args:
            leaf_annotations: mapping of key -> iterable of GO ids.

        Returns:
            A ``defaultdict(set)`` mapping each key that has at least one GO
            id to those ids plus their ancestors. Keys with no ids are absent.
        """
        all_annotations = defaultdict(set)
        for key, go_terms in leaf_annotations.items():
            for go_term in go_terms:
                all_annotations[key].add(go_term)
                all_annotations[key].update(self.get_parent_terms(go_term))
        return all_annotations

    def get_ontology(self, go_term):
        """Return the namespace code of *go_term*, or '' if the term is unknown."""
        entry = self.all_go.get(go_term)
        if not entry:
            return ''
        return entry.get('go', '')

    def get_name(self, go_term):
        """Return the name of *go_term*.

        Raises:
            KeyError: if *go_term* is unknown. The lookup does not insert a
                placeholder entry into ``all_go``.
        """
        entry = self.all_go.get(go_term)
        if not entry:
            raise KeyError(go_term)
        return entry['name']