-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathInPheRNo_step1.py
199 lines (146 loc) · 8.41 KB
/
InPheRNo_step1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# -*- coding: utf-8 -*-
"""
@author: emad2
This is the first step of InPheRNo. This script uses the normalized transcriptomic
data to generate P-value of gene-TF associations using ElasticNet. The results
generated for each (TF,gene) pair are pseudo-pvalues that approximately follow
a Beta distribution under the Null hypothesis.
As input, this script takes in 3 files: 1) A list of ALL TFs, 2) a file containing
p-values of gene-phenotype associations only for genes of interest, and 3) the
expression profiles of genes and TFs. If there are shared gene name sbetween list
of TFs and gene-phenotype pvalue file, the script drops those.
The script generates two outputs: a gene-phenotype p-value file and a gene-TF
pseudo p-value file such that the gene names match in the two files. These files
only include genes of interest and not all genes. The matrix of TF-gene p-values
contains values of -1 for entries which were assigned a coefficient of 0 in
ElasticNet.
"""
#import scipy.special as ssp
import numpy as np
#import glob
import pandas as pd
import time
import os
import scipy.stats as ss
from sklearn.linear_model import ElasticNetCV
import statsmodels.api as sm
#import pickle
import argparse
###############################################################################
# Parse command line options/arguments
parser = argparse.ArgumentParser()
parser.add_argument('-id', '--input_directory', default = './Data', help = 'Address of directory containing input files')
parser.add_argument('-od', '--output_directory', default = './Results', help = 'output directory adddress')
parser.add_argument('-it', '--input_tf', default = 'TF_Ensemble.csv', help = 'Name of the file containing list of TFs in a csv file. The file should not have a header.')
parser.add_argument('-ie', '--input_expression', default = 'expr_sample.csv', help = 'A file containing gene and TF expression data (gene x samples). The file has a header (sample names).')
parser.add_argument('-igp', '--input_gene_phenotype_interest', default = 'Pvalue_gene_phenotype_interest.csv', help = 'A file (gene x pvalue) containing p-values of gene-phenotype only for genes of interest (and not all genes), sorted in an ascending order based on the p-value (smallest p-values appear first). Only include genes of interest to reduce computation time. The file has a header.')
parser.add_argument('-mt', '--max_num_tf', default = 15, help = 'Maximum number of Tfs recovered for each gene using EN')
parser.add_argument('-lr', '--l1_ratio', default = 0.5, help = 'l1 ratio in EN model' )
parser.add_argument('-ogp', '--output_gene_phenotype', default = 'Pvalue_gene_phenotype_interest_tmp.csv', help = 'A file (gene x pvalue) containing p-values of gene-phenotype, sorted in an ascending order based on the p-value (smallest p-values appear first). The file has a header.')
parser.add_argument('-tgt', '--output_gene_tf', default = 'Pvalue_gene_tf_tmp.csv', help = 'A file (gene x tf) containing p-values of gene-tf, sorted in an ascending order based on gene-pheno file. The file has a header.')
args = parser.parse_args()
###############################################################################
delim_tl = ','
delim_ex = ','
delim_gp = ','
if args.input_tf[-3:] in ['tsv', 'txt']:
delim_tl = '\t'
if args.input_expression[-3:] in ['tsv', 'txt']:
delim_ex = '\t'
if args.input_gene_phenotype_interest[-3:] in ['tsv', 'txt']:
delim_gp = '\t'
address_TF = os.path.join(args.input_directory, args.input_tf)
address_out_dir = args.output_directory
if not os.path.exists(address_out_dir):
os.makedirs(address_out_dir)
address_in_expr = os.path.join(args.input_directory, args.input_expression)
address_in_gene_pheno = os.path.join(args.input_directory, args.input_gene_phenotype_interest)
address_out_pvalue_gp= os.path.join(args.output_directory, args.output_gene_phenotype)
address_out_pvalue_gt = os.path.join(args.output_directory, args.output_gene_tf)
max_num_coefs = args.max_num_tf #since we want to limit number of nonzero coefs, this max value is useful
l1_rat = args.l1_ratio #l1_ratio used for ElasticNet
np.random.seed(1011)
eps = 3e-308
##########%%%%%%%%%%@@@@@@@@@@!!!!!!!!!!@@@@@@@@@@%%%%%%%%%%##########
TF_list = list(pd.read_csv(address_TF, sep=delim_tl, header=None, index_col=0).index.values)
expr_all = pd.read_csv(address_in_expr, sep=delim_ex, index_col=0) #this is IQN of log2(RPKM+1) values for all cancers (not sorted)
gene_pheno_pval = pd.read_csv(address_in_gene_pheno, sep=delim_gp, index_col=0)
##########%%%%%%%%%%@@@@@@@@@@!!!!!!!!!!@@@@@@@@@@%%%%%%%%%%##########
gene_pheno_list = list(gene_pheno_pval.index.values)
gene_TF_list = list(expr_all.index.values)
#gene_TF_list_GTEX = list(expr_IQN_GTEX.index.values)
#gene_TF_list = list(set(gene_TF_list).intersection(gene_TF_list_GTEX))
only_TF_list = list(set(TF_list).intersection(set(gene_TF_list)))
only_TF_list.sort()
only_gene_list = list(set(gene_TF_list) - set(TF_list))
only_gene_list = list(set(only_gene_list).intersection(gene_pheno_list))
only_gene_list.sort()
#find intersetion of gene_pheno_pval file and genes with expression values
gene_pheno_pval = gene_pheno_pval.loc[only_gene_list]
pvalue_gp_df = gene_pheno_pval.sort_values(by=gene_pheno_pval.columns[0]) #sort
only_gene_list = list(pvalue_gp_df.index.values)
expr_gene = expr_all.loc[only_gene_list] #this contains expression of all samples
expr_TF = expr_all.loc[only_TF_list]
#Form a dataframe of gene x TF for pvalue_gt. This DF will be row-sorted depending on the cancer
pvalue_gt_array = (-1) * np.ones((len(only_gene_list), len(only_TF_list))) #A gene x TF matrix
X_features = expr_TF.values.T
start_time = time.clock()
for i in range(len(only_gene_list)):
print('Pvalue_gene_TF', i)
y = expr_gene.iloc[i].values
EN_model = ElasticNetCV(l1_ratio=l1_rat)
####make sure that number of nonzero coefs do not exceed max_num_coefs
alphas1, coefs1, _ = EN_model.path(X_features, y, eps=0.01, n_alphas=10)
num_coefs = np.sum(coefs1 != 0, axis=0)
#print(num_coefs)
#print(num_coefs[num_coefs <= max_num_coefs][-1])
rep_EN = 0
if num_coefs[-1] < max_num_coefs:
EN_coef = coefs1[:, -1]
selected_ind = np.array(range(len(only_TF_list)))[EN_coef!=0]
else:
while (num_coefs[0] != num_coefs[-1]) and (max(num_coefs[num_coefs <= max_num_coefs]) != max_num_coefs) and (rep_EN < 10):
rep_EN += 1
alpha_min = alphas1[(num_coefs <= max_num_coefs)][-1]
alpha_max = alphas1[(num_coefs > max_num_coefs)][0]
alphas3 = np.linspace(alpha_min, alpha_max, 10)
alphas1, coefs1, _ = EN_model.path(X_features, y, alphas=alphas3)
num_coefs = np.sum(coefs1 != 0, axis=0)
#print(num_coefs)
#print(num_coefs[num_coefs <= max_num_coefs][-1])
print('repeat', rep_EN )
if num_coefs[0] == num_coefs[-1]:
EN_coef = coefs1[:, 0]
selected_ind = np.array(range(len(only_TF_list)))[EN_coef!=0]
else:
EN_coef = coefs1[:, len(num_coefs[num_coefs <= max_num_coefs])-1]
selected_ind = np.array(range(len(only_TF_list)))[EN_coef!=0]
print(len(selected_ind))
n_selected_TF = len(selected_ind)
X_features_new = X_features[:, selected_ind]
model = sm.OLS(y, X_features_new)
results = model.fit()
ts_b = results.tvalues
#####prcise estimation of p-vals
N = np.shape(X_features)[0]-1 #degrees of freedom in ttest for regression
pvals_precise = []
x1_bar = 0 #mean of the first distribution
n1 = (N+2)//2 #num obs first distribution
n2 = N+2-n1 #num obs second distribution
x2_std = 0
t_precise = []
for j in range(len(ts_b)):
T = ts_b[j] #T statistic
x2_bar = -np.sign(T) #mean of the second distribution
x1_std=1/T*np.sqrt((n1+n2-2)/(n1-1)/(1/n1+1/n2)) #std of first distribution
(t, p) = ss.ttest_ind_from_stats(x1_bar, x1_std, n1, x2_bar, x2_std, n2)
if p < eps:
p = eps
pvals_precise.append(p)
t_precise.append(t)
pvalue_gt_array[i, selected_ind] = pvals_precise
pvalue_gt_df = pd.DataFrame(pvalue_gt_array, index=only_gene_list, columns=only_TF_list)
##########%%%%%%%%%%@@@@@@@@@@!!!!!!!!!!@@@@@@@@@@%%%%%%%%%%##########
pvalue_gt_df.to_csv(address_out_pvalue_gt) #sorted gene-TF pseudo p-values
pvalue_gp_df.to_csv(address_out_pvalue_gp)
##########%%%%%%%%%%@@@@@@@@@@!!!!!!!!!!@@@@@@@@@@%%%%%%%%%%##########