-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathInPheRNo_step2.py
166 lines (125 loc) · 8.34 KB
/
InPheRNo_step2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
"""
As input, this scripts takes in 3 files:
1) the file containing p-values of gene-phenotype associations for ALL the genes
(and not just genes of interest). If "None", it is assumed that the genes of interest
are all the genes.
Arguments -id and -igpa can be used to determine the address of this file.
2) the cleaned-up file containing p-values of gene-phenotype associations, only
for genes of interestm which was generated by "InPheRNo_step1" and as defaul was
placed in directory "Results" with the name "Pvalue_gene_phenotype_interest_tmp.csv".
3) the gene-TF pseudo p-value file generated in "InPheRNo_step1" and as defaul was
placed in directory "Results" with the name "Pvalue_gene_tf_tmp.csv".
If you have changed the name of directory and the name
of these files while running "InPheRNo_step1.py", you can provide the new names
using -ido, -igp, and -itg arguments.
"""
import os
import argparse
import pandas as pd
from scipy.stats import beta, uniform
import numpy as np
from scipy.stats import rv_continuous
import sys
import warnings
###############################################################################
def parse_args():
"""
Parse the arguments.
Parse the command line arguments/options using the argparse module
and return the parsed arguments (as an argparse.Namespace object,
as returned by argparse.parse_args()).
Returns:
argparse.Namespace: the parsed arguments
"""
parser = argparse.ArgumentParser()
# parser.add_argument('-l', '--location', default='cloud9', help='laptop or cloud9')
# parser.add_argument('-s', '--seed', default=1011, help='seed used for random generator')
parser.add_argument('-id', '--input_dir', default='./Data', help='address of directory containing gene-pheno association for ALL genes')
parser.add_argument('-igpa', '--input_gene_pheno_all', default='Pvalue_gene_phenotype_all.csv', help='name of file containing gene-phenotype association p-values for all genes, only used to estimate a_gp. If None, it is assumed to be the same as -igp')
parser.add_argument('-ido', '--input_dir_step_one', default='./Results', help='address of directory containing outputs of InPheRNo_step1.py')
parser.add_argument('-igp', '--input_gene_pheno', default='Pvalue_gene_phenotype_interest_tmp.csv', help='name of file containing gene-phenotype association p-values')
parser.add_argument('-itg', '--input_TF_gene', default='Pvalue_gene_tf_tmp.csv', help='name of file containing TF-gene association pseudo p-values generated by InPheRNo_step1.py')
parser.add_argument('-od', '--output_dir', default='./tmp', help='address of directory for results of different repeats')
parser.add_argument('-of', '--output_file', default="None", help='prefix to be added to output files')
parser.add_argument('-atg0', '--A_TF_gene_h0', default="None", help='alpha in beta distribution of TF-gene under Null hypothesis (TF does not regulate gene). If None, it is learnt from data. Otherwise a number must be given.')
parser.add_argument('-atg1', '--A_TF_gene_h1', default="None", help='alpha in beta distribution of TF-gene under alternative hypothesis (TF regulates gene). If None, it is learnt from data. Otherwise a number must be given.')
parser.add_argument('-agp', '--A_gene_pheno', default="None", help='alpha in beta distribution of gene-pheno. If None, it is estimated from data (p-value of gene phenotype for all genes is required). Otherwise a number must be given.')
parser.add_argument('-pt', '--Prior_T', default="None", help='Prior probability of T=1.')
parser.add_argument('-ptm', '--Prior_T_method', default='fixed', help='Method to set prior probability of T=1. If fixed is chosen and args.priot_T is None, we use 1/(10*n_TF).')
parser.add_argument('-rtg', '--R_TF_gene', default="None", help='mixing parameter. If None is selected, it is learnt from the data.')
parser.add_argument('-mnt', '--max_num_TF', default=15, help='Max number of TFs which was used as a parameter in InPheRNo_step1.py.')
parser.add_argument('-nr', '--num_repeat', default=100, help='Number of the repeats. Repeats are used to ensure stability of results. At least 100 repeats are recommended.')
parser.add_argument('-sr', '--start_repeat', default=0, help='Start index of the repeat. Useful if program gets killed in the middle or for parallelization.')
parser.add_argument('-er', '--end_repeat', default='None', help='End index of the repeat. Useful if program gets killed in the middle or for parallelization.')
parser.add_argument('-ni', '--num_iteration', default=200, help='number of iterations for the PGM.')
parser.add_argument('-nb', '--num_burn', default=100, help='number of iterations to burn.')
parser.add_argument('-nt', '--num_thin', default=1, help='number of iterations to thin by')
# parser.add_argument('-si', '--start_index', default=0, help='start index of the gene to consider')
# parser.add_argument('-ei', '--end_index', default='None', help='end index of the gene to consider')
parser.add_argument('-bs', '--batch_size', default=1000, help='number of target genes to run before saving to disk. Smaller value protects against high usage of memroy but increases the run time.')
args = parser.parse_args()
return(args)
###############################################################################
args = parse_args()
delim_gp = ','
delim_tg = ','
delim_gp_a = ','
if args.input_gene_pheno[-3:] in ['tsv', 'txt']:
delim_gp = '\t'
if args.input_gene_pheno_all[-3:] in ['tsv', 'txt']:
delim_gp_a = '\t'
if args.input_TF_gene[-3:] in ['tsv', 'txt']:
delim_tg = '\t'
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
address_TF_gene = os.path.join(args.input_dir_step_one, args.input_TF_gene)
pvalue_TF_gene = pd.read_csv(address_TF_gene, sep=delim_gp, index_col=0, header=0).T
n_TF = len(pvalue_TF_gene.index)
n_target_gene = len(pvalue_TF_gene.columns)
batch_size = int(args.batch_size)
n_batch = ((n_target_gene - 1) // batch_size) + 1
# end_index = args.end_index
# if int(args.start_index) > n_target_gene:
# sys.exit('Start index exceeds maximum number of genes in the data.')
# if end_index == 'None':
# end_index = n_target_gene
# elif int(end_index) > n_target_gene:
# end_index = n_target_gene
#############################################################################
###Estimate a_gp
class beta_unif_gen(rv_continuous):
def _pdf(self, x, a0, b0, c0):
return c0 * beta.pdf(x, a=a0, b=b0) + (1-c0) * uniform.pdf(x)
##########################################################################################
if args.input_gene_pheno_all not in ['None', 'NONE']:
address_gene_pheno_all = os.path.join(args.input_dir, args.input_gene_pheno_all)
pvalue_gene_pheno_all = pd.read_csv(address_gene_pheno_all, sep=delim_gp_a, index_col=0, header=0).T
if args.A_gene_pheno in ['None', 'NONE']:
np.seterr(divide='ignore', invalid='ignore')
beta_unif_rv = beta_unif_gen(name='beta_unif', a=0., b=1.)
a_gp_tmp, _, c, _, _= beta_unif_rv.fit(pvalue_gene_pheno_all, fb0=1, floc=0, fscale=1) #fb0 or fc0 fixes b0 and c0, respectively
if c > 1:
a_gp_tmp, _, _, _ = beta.fit(pvalue_gene_pheno_all, fb=1, floc=0, fscale=1)
A_gene = a_gp_tmp
else:
A_gene = float(args.A_gene_pheno)
##########################################################################################
if args.end_repeat in ['None', 'NONE']:
end_repeat = int(args.num_repeat)
else:
end_repeat = int(args.end_repeat)
for i_r in range(int(args.start_repeat), end_repeat):
for i_b in range(n_batch):
start_ind = i_b * batch_size
if i_b == (n_batch-1):
end_ind = n_target_gene
else:
end_ind = (i_b + 1) * batch_size
print('a_gp = ', A_gene)
command2run = "python TRN_twostage.py -atg0 %s -atg1 %s -agp %s -pt %s -ptm %s \
-rtg %s -igp %s -itg %s -od %s -of %s -mnt %s -ir %s -ni %s -nb %s -nt %s -si %s -ei %s" \
%(args.A_TF_gene_h0, args.A_TF_gene_h1, A_gene, args.Prior_T, args.Prior_T_method, \
args.R_TF_gene, os.path.join(args.input_dir_step_one, args.input_gene_pheno), os.path.join(args.input_dir_step_one, args.input_TF_gene), args.output_dir, \
args.output_file, args.max_num_TF, i_r, args.num_iteration, args.num_burn, args.num_thin, start_ind, end_ind)
print(command2run)
os.system(command2run)