-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathFisherTest.py
executable file
·135 lines (106 loc) · 2.59 KB
/
FisherTest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env python
import sys
import re
import scipy.stats
back_terms=dict()
back_genes=0
generated_terms=dict()
generated_genes=0
## command line arguments
# taking the bionex filename
narg = len(sys.argv)
if narg<4:
print "...failed! enter background_file, generated_file, output_filename"
sys.exit()
else:
background_file=sys.argv[1]
generated_file=sys.argv[2]
fisher_value_file=sys.argv[3]
def loadBackground():
global back_terms
global back_genes
genes=dict()
fh=open(background_file, "r")
for line in fh:
if line.strip()!='':
gene =line.split('\t')[0].strip()
term =line.split('\t')[1].strip()
if back_terms.has_key(term):
back_terms[term]=back_terms[term]+1
else:
back_terms[term]=1
genes[gene]=gene
fh.close()
back_genes=len(genes)
#print "Terms in background set: "+str(len(back_terms))
#print "Genes in background set: "+str(len(genes))
'''
for k in back_terms.keys():
if back_terms[k]>50:
print k, back_terms[k]
'''
def loadGenerated():
global generated_terms
global generated_genes
genes=dict()
#generated_file="data/id_1452818557_90d53fd484242ffe7fba8c6aff7a9260_gene_iterm.txt"
fh=open(generated_file, "r")
for line in fh:
if line.strip()!='':
gene =line.split('\t')[0].strip()
term =line.split('\t')[1].strip()
if generated_terms.has_key(term):
generated_terms[term]=generated_terms[term]+1
else:
generated_terms[term]=1
genes[gene]=gene
fh.close()
generated_genes=len(genes)
#print "Terms in generated set: "+str(len(generated_terms))
#print "Genes in generated set: "+str(len(genes))
'''
for k in generated_terms.keys():
if generated_terms[k]>0:
print k, generated_terms[k]
'''
def FisherExactTest():
global back_terms
global back_genes
global generated_terms
global generated_genes
global fisher_value_file
outlines=list()
for t in generated_terms.keys():
a=0
b=0
c=0
d=0
in_generated=0
if generated_terms.has_key(t):
a+=generated_terms[t]
in_generated=generated_terms[t]
in_background=0
if back_terms.has_key(t):
b+=back_terms[t]
in_background=back_terms[t]
c+= (generated_genes-in_generated)
d+= (back_genes-in_background)
#break
#print a
#print b
#print c
#print d
#pvalue=0.000008234798237
oddsratio, pvalue = scipy.stats.fisher_exact([[a, b], [c, d]])
#print t+'\t'+str(pvalue)
outlines.append(t+'\t'+str(pvalue))
#print '0.000008234798237'
#print generated_file
fh=open(fisher_value_file, "w")
for line in outlines:
fh.write(line+'\n')
fh.close()
if __name__ == '__main__':
loadBackground()
loadGenerated()
FisherExactTest()