-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathefo_compress.py
74 lines (62 loc) · 2.56 KB
/
efo_compress.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import sys
import json
# Path of the compressed JSON file that compress() writes; being a bare
# file name, it is resolved against the current working directory.
output_filename = 'efo-graph.min.json'
def compress(filename, output_path=None):
    '''
    Compress a disease-graph JSON file into a compact, index-based
    JSON file.

    Input should be a JSON file containing a list of objects,
    each containing id, name and parentIds fields.

    Compression does the following:
    1. A words list is built from the whitespace-separated words
       used across all disease names (ordered by frequency
       descending; on Python 3.7+ ties keep first-seen order). The
       words in each name are then replaced with the index of each
       word in the words list.
    2. A diseases list is built, ordered by how often each id is
       referenced as a parent (descending). Ids inside parent lists
       are replaced by the index of that disease in the diseases
       list.
    3. The output is a JSON object with three keys:
       'words'    - the words list,
       'diseases' - one [id, compressedName] pair per disease,
       'parents'  - one list of parent indexes per disease, aligned
                    by position with 'diseases'.

    Args:
        filename: path of the JSON input file (read as UTF-8).
        output_path: path of the file to write; when omitted, the
            module-level output_filename is used.

    Raises:
        KeyError: if an object lacks id, name or parentIds, or if a
            parentIds entry is not the id of any object in the input.
    '''
    if output_path is None:
        output_path = output_filename

    # JSON text is UTF-8; do not depend on the platform locale encoding,
    # otherwise non-ASCII names are mis-decoded on some systems.
    with open(filename, 'r', encoding='utf-8') as f_input:
        diseases = json.load(f_input)

    # build word lookup (word -> number of occurrences across all names)
    word_frequencies = {}
    for disease in diseases:
        for word in disease['name'].split():
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

    # most frequent words first, so they receive the smallest indexes
    word_list = sorted(word_frequencies, key=word_frequencies.get, reverse=True)
    word_to_id = {word: i for i, word in enumerate(word_list)}

    # build efo_id lookup (efo_id -> number of times used as a parent)
    efo_frequencies = {disease['id']: 0 for disease in diseases}
    for disease in diseases:
        for parent_id in disease['parentIds']:
            efo_frequencies[parent_id] += 1

    # most referenced ids first, so they receive the smallest indexes
    efo_list = sorted(efo_frequencies, key=efo_frequencies.get, reverse=True)
    efo_to_id = {efo: i for i, efo in enumerate(efo_list)}

    # build diseases list (and parents); both are indexed by the integer id
    ids_and_names = [[] for _ in efo_list]
    parents = [[] for _ in efo_list]
    for disease in diseases:
        int_id = efo_to_id[disease['id']]
        compressed_name = [word_to_id[word] for word in disease['name'].split()]
        ids_and_names[int_id] = [disease['id'], compressed_name]
        parents[int_id] = [efo_to_id[efo] for efo in disease['parentIds']]

    # build top level object
    compressed = {
        'words': word_list,
        'diseases': ids_and_names,
        'parents': parents,
    }

    # compact separators: no whitespace is emitted between JSON tokens
    with open(output_path, 'w', encoding='utf-8') as f_output:
        json.dump(compressed, f_output, separators=(',', ':'), sort_keys=True)
if __name__ == '__main__':
    # The first command-line argument is the path of the JSON file to
    # compress. Exit with a usage message (written to stderr, exit status 1)
    # instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: {} <input.json>'.format(sys.argv[0]))
    filename = sys.argv[1]
    compress(filename)