-
Notifications
You must be signed in to change notification settings - Fork 4
/
file_utils.py
105 lines (88 loc) · 3.04 KB
/
file_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from collections import defaultdict

import h5py
import numpy as np
def read_config_file(file_in):
    """
    Read config file specifying all needed parameters

    Every non-empty line is expected to look like ``key: value``. Only the
    first colon separates key from value, so the value itself may contain
    colons (e.g. URLs or Windows paths). Blank lines are ignored.

    :param file_in: path to the config file
    :return: dict mapping each key to its whitespace-stripped value (str)
    :raises IndexError: if a non-empty line does not contain a ':'
    """
    config_data = dict()
    with open(file_in) as read_in:
        for line in read_in:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no setting.
                continue
            # maxsplit=1 keeps any further ':' as part of the value.
            splitted_line = stripped.split(':', 1)
            config_data[splitted_line[0]] = splitted_line[1].strip()
    return config_data
def read_go_annotations(file_in):
    """
    Read known GO annotations from file

    Each line is expected to hold an identifier followed by whitespace and a
    comma-separated list of GO terms. Lines with fewer than two
    whitespace-separated fields (blank lines, identifier-only lines) are
    skipped. If an identifier occurs on several lines, its GO terms are
    merged instead of the last line silently replacing the earlier ones.

    :param file_in: path to the annotation file
    :return: defaultdict(set) mapping identifier -> set of GO terms
    """
    go_annotations = defaultdict(set)
    with open(file_in) as read_in:
        for line in read_in:
            splitted_line = line.strip().split()
            if len(splitted_line) < 2:
                # Nothing to annotate: blank line or identifier without terms.
                continue
            identifier = splitted_line[0]
            # Drop empty strings produced by stray commas such as "GO:1,".
            go_terms = {term for term in splitted_line[1].split(',') if term}
            go_annotations[identifier].update(go_terms)
    return go_annotations
def read_embeddings(embeddings_in):
    """
    Read embeddings from h5 file generated by bio_embeddings pipeline

    :param embeddings_in: path to the h5 file
    :return: dict mapping the 'original_id' attribute of every top-level
             entry in the file to that entry's data as a numpy array
    """
    embeddings = dict()
    with h5py.File(embeddings_in, 'r') as f:
        # The entry names in the file are not used as keys; the identifier is
        # taken from each entry's 'original_id' attribute instead.
        for embedding in f.values():
            original_id = embedding.attrs['original_id']
            # Copy the data into memory while the file is still open.
            embeddings[original_id] = np.array(embedding)
    return embeddings
def write_predictions_cafa(predictions, out_file, model_num, team_name):
    """
    Write predictions in CAFA format

    The file starts with AUTHOR / MODEL / KEYWORDS header lines, followed by
    one tab-separated line per (target, GO term) pair with the score printed
    with two decimals, and ends with a final 'END' (no trailing newline).

    :param predictions: mapping target -> mapping GO term -> score
    :param out_file: output file
    :param model_num: number of model that is used
    :param team_name: Team name to use in output file
    :return:
    """
    with open(out_file, 'w') as handle:
        handle.write('AUTHOR\t{}\nMODEL\t{}\nKEYWORDS\thomolog, machine learning, natural language processing.'
                     '\n'.format(team_name, model_num))
        for target, term_scores in predictions.items():
            for go_term, score in term_scores.items():
                row_start = '{}\t{}\t'.format(target, go_term)
                row_end = '{:0.2f}\n'.format(float(score))
                handle.write(row_start + row_end)
        handle.write('END')
def write_predictions(predictions, out_file):
    """
    Write predictions as a tab-separated table 'target, GO term, RI'

    A header line is written first; every following line holds one
    (target, GO term) pair with its RI printed with two decimals.

    :param predictions: mapping target -> mapping GO term -> RI
    :param out_file: file to write predictions to
    :return:
    """
    with open(out_file, 'w') as handle:
        handle.write('Target ID\tGO Term\tRI\n')
        for target, term_scores in predictions.items():
            for go_term, score in term_scores.items():
                row_start = '{}\t{}\t'.format(target, go_term)
                row_end = '{:0.2f}\n'.format(float(score))
                handle.write(row_start + row_end)
def write_hits(hits, out_file):
    """
    Write the hits found for each query as a tab-separated table

    A header line is written first; every following line holds the query,
    the identifier of one of its hits and the associated RI value.

    :param hits: mapping query -> mapping hit identifier -> RI
    :param out_file: file to write the hits to
    :return:
    """
    with open(out_file, 'w') as handle:
        handle.write('Query\tHit\tRI\n')
        handle.writelines(
            '{}\t{}\t{}\n'.format(query, hit_id, ri)
            for query, query_hits in hits.items()
            for hit_id, ri in query_hits.items()
        )