-
Notifications
You must be signed in to change notification settings - Fork 4
/
GeminiTable2CVL.py
executable file
·247 lines (202 loc) · 7.68 KB
/
GeminiTable2CVL.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
#########################################
# Author: Phillip Richmond #
# Contact: [email protected] #
# License: open source GNU #
#########################################
# The purpose of this python script is to take in a gemini table file, and add some annotations.
# The annotations are all gene-based at this point, and are simple to add from text files. The heavy lifting of finding the gene has already been done
# This script also assumes only 1 gene being in the annotation column. For cases of overlapping genes, or intergenic variants, this script will not handle those annotations well (as of April 23rd 2018)
###########
# Imports #
###########
import sys, argparse
import re
########################
# Function Definitions #
########################
def GetOptions():
parser = argparse.ArgumentParser()
parser.add_argument("-i","--InFile",help="This is a file that has come as an output from the GEMINI query/built-in functions (e.g. de novo)",type=str)
parser.add_argument("-o","--OutFile",help="Output Gemini Table file with Additional Annotations",type=str)
args = parser.parse_args()
infilename=args.InFile
outfilename=args.OutFile
return infilename,outfilename
# This script relies on a few different files in order to add some manual annotations. Those are defined in the main function at the bottom.
## Get a dictionary for the RefSeq Gene Summaries, gene is key, value is summary
def GetSummaryDict(SummaryFileName):
SummaryFile = open(SummaryFileName,'r')
SummaryForGenes = {}
for line in SummaryFile:
Number,Symbol,Summary = line.strip('\n').split("\t")
SummaryForGenes[Symbol]=Summary
return SummaryForGenes
## Get a dictionary for the OMIM gene map, gene is key, value is Mim#
def GetOMIM_Gene2MimDict(OMIM_Gene2MimFileName):
OMIMfile = open(OMIM_Gene2MimFileName,'r')
GenesToMim = {}
for line in OMIMfile:
if line[0]=='#':
continue
cols=line.strip('\n').split('\t')
mimNumber= cols[0]
if cols[1]!='gene':
continue
GeneSymbol=cols[3]
GenesToMim[GeneSymbol]=mimNumber
return GenesToMim
## Get a dictionary for the OMIM gene to phenotypes, gene is key, phenotypes are the values, with multiple phenos split by '|'
def GetOMIM_Gene2PhenoDict(OMIM_Gene2PhenoFileName):
OMIMfile = open(OMIM_Gene2PhenoFileName,'r')
GeneToPheno = {}
for line in OMIMfile:
if line[0]=='#':
continue
cols = line.strip('\n').split('\t')
gene = cols[0]
phenos = "|".join(cols[1:])
GeneToPheno[gene]=phenos
return GeneToPheno
####
# RVIS file looks like this:
#GENE ALL_0.01% %ALL_0.01%
#A1BG -0.35 29.49
def GetRVISDict(RVISFILE):
infile = open(RVISFILE,'r')
Gene2RVIS = {}
headerline = infile.readline()
for line in infile:
gene,rvis_score,rvis_pct = line.strip('\n').split('\t')
Gene2RVIS[gene] = [rvis_score,rvis_pct]
#print Gene2RVIS
return Gene2RVIS
# PLI file looks like this:
# gene pLI pRec
#AGRN 0.17335234048116 0.826647657926747
def GetPLIDict(PLIFILE):
infile = open(PLIFILE,'r')
Gene2PLI = {}
headerline = infile.readline()
for line in infile:
gene,pli_score,pli_pct = line.strip('\n').split('\t')
Gene2PLI[gene] = [pli_score,pli_pct]
#print Gene2PLI
return Gene2PLI
def GetHPODict(HPOFILE):
infile = open(HPOFILE,'r')
Gene2HPO={}
headerline = infile.readline()
for line in infile:
entrez,gene,pheno,hpo = line.strip('\n').split('\t')
if Gene2HPO.has_key(gene):
Gene2HPO[gene].append("%s|%s"%(pheno,hpo))
else:
Gene2HPO[gene]=["%s|%s"%(pheno,hpo)]
#print Gene2HPO
return Gene2HPO
def GetMESHOPDict(MESHOPFILE):
infile = open(MESHOPFILE,'r')
Gene2MESHOP={}
for line in infile:
gene,meshop = line.strip('\n').split('\t')
if Gene2MESHOP.has_key(gene):
Gene2MESHOP[gene].append(meshop)
else:
Gene2MESHOP[gene]=[meshop]
#print Gene2MESHOP
return Gene2MESHOP
# This function takes in the dictionaries desribed above, reads in the gemini infile, and outputs the gemini outfile
def AddColumnsToTable(GeminiInFileName,GeminiOutFileName,Gene2Pheno,Gene2Mim,GeneSummary,Gene2PLI,Gene2RVIS,Gene2HPO,Gene2MESHOP):
infile = open(GeminiInFileName,'r')
outfile = open(GeminiOutFileName,'w')
header = infile.readline()
header = header.strip('\n')
outfile.write("%s\tOMIM_Entry\tOMIM_Phenotypes\tRVIS_Score\tRVIS_Pct\tpLI_Score\tpLI_Pct\tHPO\tMeSHOP\tGeneSummary\n"%header)
for line in infile:
line = line.strip('\n')
cols = line.split("\t")
gene = cols[5]
# Check for Omim phenotype, add if there, if not make it '.'
if Gene2Pheno.has_key(gene):
omim_pheno=Gene2Pheno[gene]
else:
omim_pheno='.'
# Check for Omim gene, add hyperlink for excel if there, if not make it '.'
if Gene2Mim.has_key(gene):
omim_key=Gene2Mim[gene]
omim_hyperlink='=HYPERLINK(\"http://www.omim.org/entry/%s\")'%omim_key
else:
omim_hyperlink='.'
# HPO
if Gene2HPO.has_key(gene):
hpo=";".join(Gene2HPO[gene])
else:
hpo = '.'
# PLI
if Gene2PLI.has_key(gene):
pLI_score = Gene2PLI[gene][0]
pLI_pct = Gene2PLI[gene][1]
else:
pLI_score = '.'
pLI_pct = '.'
# RVIS
if Gene2RVIS.has_key(gene):
rvis_score = Gene2RVIS[gene][0]
rvis_pct = Gene2RVIS[gene][1]
else:
rvis_score = '.'
rvis_pct = '.'
# MESHOP
if Gene2MESHOP.has_key(gene):
meshop=";".join(Gene2MESHOP[gene])
else:
meshop = '.'
# Check for gene summary, if there, add it, if not, make it '.'
if GeneSummary.has_key(gene):
gene_summary=GeneSummary[gene]
else:
gene_summary='.'
# Fix some formatting stuff for Excel
# Exon number
cols[6]="\'%s"%cols[6]
# Gene name
#cols[5]="\'%s"%cols[5]
# join the cols
newline = "\t".join(cols)
outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"%(newline,omim_pheno,omim_hyperlink,rvis_score,rvis_pct,pLI_score,pLI_pct,hpo,meshop,gene_summary))
#reset variables
gene_summary = '.'
omim_pheno = '.'
omim_hyperlink='.'
rvis_score = '.'
rvis_pct = '.'
pLI_score = '.'
pLI_pct = '.'
hpo = '.'
meshop = '.'
#-------------#
# Main #
#-------------#
if __name__ == "__main__":
# These are hard coded locations for database files. These files are small, so they are all just text files.
# They contain gene-based information
SummaryFileName = '/mnt/causes-data01/data/Databases/RefSeqGene_Summaries_270316.txt'
Gene2MimFileName = '/mnt/causes-data01/data/Databases/OMIM_mim2gene'
Gene2Disease = '/mnt/causes-data01/data/Databases/OMIM_phenotype_genelist'
PLI = '/mnt/causes-data01/data/Databases/TOLERANCE/PLI_March2016.txt'
RVIS = '/mnt/causes-data01/data/Databases/TOLERANCE/RVIS_March2016.txt'
MESHOP = '/mnt/causes-data01/data/Databases/gene2pubmedBG-hum-gene2pubmed-gene-mesh-p_ONLYMESHDISEAS_P-valuecorrected_withGeneSymbols.txt'
HPO = '/mnt/causes-data01/data/Databases/ALL_SOURCES_FREQUENT_FEATURES_genes_to_phenotype.txt'
# Read in the annotations
GeneSummaries = GetSummaryDict(SummaryFileName)
Gene2Mim = GetOMIM_Gene2MimDict(Gene2MimFileName)
Gene2Pheno = GetOMIM_Gene2PhenoDict(Gene2Disease)
Gene2PLI = GetPLIDict(PLI)
Gene2RVIS = GetRVISDict(RVIS)
Gene2HPO = GetHPODict(HPO)
Gene2MESHOP = GetMESHOPDict(MESHOP)
GeminiInfile,GeminiOutfile=GetOptions()
#GeminiInfile='/mnt/causes-data01/data/RICHMOND/AnnotateVariants/T008_compoundHet.txt'
#GeminiOutfile='/mnt/causes-data01/data/RICHMOND/AnnotateVariants/T008_compoundHet_annotated.txt'
AddColumnsToTable(GeminiInfile,GeminiOutfile,Gene2Pheno,Gene2Mim,GeneSummaries,Gene2PLI,Gene2RVIS,Gene2HPO,Gene2MESHOP)
sys.exit()