-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathgo_enrichment.py
99 lines (75 loc) · 2.97 KB
/
go_enrichment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import goatools
import collections
import mygene
from goatools.base import download_go_basic_obo
GO_KEYS_FULL = 'go.BP', 'go.MF', 'go.CC'
GO_KEYS_SPLIT = [x.split('.')[1] for x in GO_KEYS_FULL]
def parse_mygene_output(mygene_output):
"""Convert mygene.querymany output to a gene id to go term mapping (dictionary)
Parameters
----------
mygene_output : dict or list
Dictionary (returnall=True) or list (returnall=False) of
output from mygene.querymany
Output
------
gene_name_to_go : dict
Mapping of gene name to a set of GO ids
"""
# if "returnall=True" was specified, need to get just the "out" key
if isinstance(mygene_output, dict):
mygene_output = mygene_output['out']
gene_name_to_go = collections.defaultdict(set)
for line in mygene_output:
gene_name = line['query']
try:
go_output = line['go']
except KeyError:
continue
for go_key in GO_KEYS_SPLIT:
try:
go_terms = go_output[go_key]
except KeyError:
continue
if isinstance(go_terms, dict):
go_ids = set([go_terms['id']])
else:
go_ids = set(x['id'] for x in go_terms)
gene_name_to_go[gene_name] |= go_ids
return gene_name_to_go
def gene_ids_to_go(gene_ids, species='human,mouse,rat',
scopes='entrezgene,ensemblgene,retired,symbol',
fields=GO_KEYS_FULL,
**kwargs):
"""Get associated GO terms for each gene ID
gene_ids : iterable of ids
List of gene ids that you want to map
species : str
Comma-separated species to limit search. Default is "human,mouse,rat"
scopes : str
Comma-separated type of gene ids that you are giving.
Default is "entrezgene,ensemblgene,retired,symbol"
fields : iterable
GO terms to use. Default is ['go.BP', 'go.MF', 'go.CC']
Returns
-------
gene_to_go : dict
Mapping of each provided gene id to a set object of GO terms
"""
mg = mygene.MyGeneInfo()
mygene_output = mg.querymany(gene_ids, fields=fields, scopes=scopes,
species=species, **kwargs)
gene_name_to_go = parse_mygene_output(mygene_output)
return gene_name_to_go
obo_fname = download_go_basic_obo()
obo_dag = goatools.obo_parser.GODag(obo_file=obo_fname)
def get_go_ids_from_ensembl_ids(ensembl_ids):
"""Get the GO term mapping from gene id to go term using mygene"""
mg = mygene.MyGeneInfo()
mygene_output = mg.querymany(ensembl_ids, scopes='ensemblgene', fields=['go.BP', 'go.MF', 'go.CC'], species='human',
returnall=True)
gene_name_to_go = parse_mygene_output(mygene_output)
return gene_name_to_go
def make_go_enricher(all_genes):
gene_name_to_go = get_go_ids_from_ensembl_ids(all_genes)
return goatools.GOEnrichmentStudy(all_genes, gene_name_to_go, obo_dag)