-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathchiprnacombiner.py
executable file
·215 lines (198 loc) · 10.2 KB
/
chiprnacombiner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
from copy import deepcopy
import numpy as np
import pandas as pd
import tqdm
from typing import List,Dict
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
import argparse
from version import __version__
def compute_specifity_statistics(df:pd.DataFrame)->Dict[str,Dict[int,int]]:
"""
Compute specifity statistics.
"""
#Get Unique values of 'Conditions' column
conditions = df['Conditions'].unique()
#iterate over conditions
result = {}
for condition in conditions:
df_tmp = deepcopy(df[df['Conditions'] == condition])
#for each row compute number of samples in samples list ['sample1','sample2','sample3'] in Samples column
df_tmp.loc[:,'Number_of_samples'] = df_tmp['Samples'].apply(lambda x: len(x.split(',')))
nos_unique = df_tmp['Number_of_samples'].unique()
nos_list = {}
for nos in nos_unique:
nos_list[nos] = len(df_tmp[df_tmp['Number_of_samples'] == nos])
result[condition] = nos_list
return result
def compute_specifity_statistics_genes(df:pd.DataFrame)->Dict[str,Dict[int,int]]:
"""
Compute specifity statistics.
"""
#Get Unique values of 'Conditions' column
conditions = df['Conditions'].unique()
#iterate over conditions
result = {}
for condition in conditions:
df_tmp = deepcopy(df[df['Conditions'] == condition])
#for each row compute number of samples in samples list ['sample1','sample2','sample3'] in Samples column
df_tmp['Number_of_samples'] = df_tmp['Samples'].apply(lambda x: len(x.split(',')))
nos_unique = df_tmp['Number_of_samples'].unique()
nos_list = {}
for nos in nos_unique:
df_tmp_tmp = df_tmp[df_tmp['Number_of_samples'] == nos]
genes = []
for index,row in df_tmp_tmp.iterrows():
try:
genes+=row['Genes'].split(',')
except AttributeError:
continue
nos_list[nos] = len(set(genes))
result[condition] = nos_list
return result
def plot_specifity_statistics(specifity_statistics:Dict[str,Dict[int,int]],title=""):
fig, ax = plt.subplots()
#plot each conditions separately in different bar on the same bar plot
i=0
i_inc = 0.7/len(specifity_statistics)
cond_list = list(specifity_statistics.keys())
for cond in sorted(cond_list):
values = specifity_statistics[cond]
ax.bar([x-i_inc*1.3+i+i_inc for x in values.keys()], values.values(), width=i_inc*1.3,label=cond)
i+=i_inc
#set xticks with step 1
ax.set_xticks(range(1,max([max(x.keys()) for x in specifity_statistics.values()])+1))
ax.set_xlabel('Number of samples')
ax.set_ylabel(f'Number of {title}')
ax.grid()
ax.legend()
return fig,ax
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description=f"ChipRnaCombiner(ver {__version__}) - Tool for combine ChipSeq and RNASeq results to find genes that are controled by Super Enhancers\ Enhancers and\or have enrichment in promotor regions and are upregulated in RNASeq",
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("-indir", help="Directory with input files", type=str,
required=True)
parser.add_argument("-chipseq_file", help="File with Super Enhancers", type=str,
required=True)
parser.add_argument("-chipseq_promoter_file", help="File with Promoters", type=str,
default="")
parser.add_argument("-chipseq_promoter_condition", help="Conditions of differencial enrichment", type=str,
default="tumor")
parser.add_argument("-chipseq_promoter_condition_min_samples", help="Min number of samples that fit condition", type=int,
default=1)
parser.add_argument("-chipseq_promoter_condition_max_samples", help="Max number of samples that fit condition", type=int,
default=5)
parser.add_argument("-rna_enrich_file", help="File with RNASeq results", type=str,
default="")
parser.add_argument("-rna_celllines_file", help="File with RNASeq results for cell lines", type=str,
default="")
parser.add_argument("-lfc_thresh", help="Treshold odf absolute value of Log Fold Change (which genes are significant)", type=float,
default=1.0)
parser.add_argument("-pvalue_thresh",
help="Treshold of pValue (which genes are significant)", type=float,
default=0.05)
parser.add_argument("-gene_name_col",
help="Column name if RNA file with gene name", type=str,
required=True)
parser.add_argument("-output_file", help="Output file", type=str,
required=True)
parser.add_argument("-output_pdf", help="Output pdf file", type=str,
default="")
args = parser.parse_args()
dirname = args.indir
chipseq_se_file = args.chipseq_file
chipseq_promoter_file = args.chipseq_promoter_file
promoter_condition = args.chipseq_promoter_condition
rna_enrich_file = args.rna_enrich_file
rna_celllines_file = args.rna_celllines_file.split(',')
output_file = args.output_file
output_pdf = args.output_pdf
logfoldchangeCol = 'log2FoldChange'
pvalueCol = 'padj'
geneNameCol = args.gene_name_col
if len(rna_enrich_file) == 0 and len(rna_celllines_file) == 0 and len(chipseq_promoter_file) == 0:
raise RuntimeError("At least one of rna_enrich_file,rna_celllines_file,chipseq_promoter_file should be set")
if len(output_pdf) > 0:
pp = PdfPages(output_pdf)
df_se = pd.read_csv(dirname + chipseq_se_file)
spec_stat = compute_specifity_statistics(df_se)
fig1,ax1=plot_specifity_statistics(spec_stat,title="Super Enhancers")
if len(output_pdf) > 0:
pp.savefig(fig1)
spec_stat_genes= compute_specifity_statistics_genes(df_se)
fig2,ax2=plot_specifity_statistics(spec_stat_genes,title="Super Enhancers controled Genes")
if len(output_pdf) > 0:
pp.savefig(fig2)
if len(chipseq_promoter_file) > 0:
df_promoter = pd.read_csv(dirname + chipseq_promoter_file)
spec_stat = compute_specifity_statistics(df_promoter)
fig3,ax3=plot_specifity_statistics(spec_stat,title="Promoters")
if len(output_pdf) > 0:
pp.savefig(fig3)
enrichment_promotor = df_promoter['nearest_gene'].to_list()
if len(promoter_condition)>0:
enrichment_cond_promotor = {}
counts_list = list(np.linspace(args.chipseq_promoter_condition_min_samples,args.chipseq_promoter_condition_max_samples,args.chipseq_promoter_condition_max_samples-args.chipseq_promoter_condition_min_samples+1,dtype=int))
for counts in counts_list:
enrichment_cond_promotor[counts] = \
df_promoter[(df_promoter['Conditions'] == promoter_condition) & (df_promoter['Count'] >= counts)]['nearest_gene'].to_list()
if len(rna_enrich_file) > 0:
df_rna = pd.read_csv(dirname + rna_enrich_file)
diff_expressed = df_rna[df_rna[pvalueCol] > args.pvalue_thresh]
upreg = diff_expressed[np.abs(diff_expressed[logfoldchangeCol]) > args.lfc_thresh][geneNameCol].to_list()
else:
diff_expressed = None
if len(rna_celllines_file) > 0:
df_rna_celllines = {}
cellines_upreg = {}
for rna_cellline_file in rna_celllines_file:
df_rna_celllines[rna_cellline_file] = pd.read_csv(dirname + rna_cellline_file)
diff_expressed = df_rna_celllines[rna_cellline_file][df_rna_celllines[rna_cellline_file][pvalueCol] > args.pvalue_thresh]
cellines_upreg[rna_cellline_file] = diff_expressed[np.abs(diff_expressed[logfoldchangeCol]) > args.lfc_thresh][geneNameCol].to_list()
else:
cellines_upreg = {}
#print(df_rna)
#add column genes_promotor to df_se
df_se['Genes_promotor'] = None
df_se['Genes_tumor_promotor'] = None
df_se['Genes_upreg'] = None
for index,row in tqdm.tqdm(df_se.iterrows(),desc="Iterating over ChipSeqs regions",total=df_se.shape[0],unit="SE(s)"):
#print(row['Genes'])
if row['Genes'] == [] or row['Genes'] is np.nan:
continue
list_of_genes = row['Genes'].split(',')
# Find intersection of list_of_genes and enrichment_promotor
genes_promotor = list(set(list_of_genes) & set(enrichment_promotor))
df_se.at[index, f'Genes_promotor'] = ','.join(genes_promotor)
df_se.at[index, f'Genes_upreg'] = ','.join(set(list_of_genes) & set(upreg))
for rna_cellline_file in rna_celllines_file:
df_se.at[index, f'Genes_{rna_cellline_file}'] = ','.join(set(list_of_genes) & set(cellines_upreg[rna_cellline_file]))
for counts in counts_list:
genes_cond_promotor = list(set(genes_promotor) & set(enrichment_cond_promotor[counts]))
genes_cond_promotor_upreg = list(set(genes_cond_promotor) & set(upreg))
df_se.at[index, f'Genes_{promoter_condition}_promotor_{counts}'] = ','.join(genes_cond_promotor)
df_se.at[index, f'Genes_{promoter_condition}_promotor_upreg_{counts}'] = ','.join(genes_cond_promotor_upreg)
gtp = {}
gtpe = {}
for counts in counts_list:
gtp[counts] = len(df_se[f'Genes_{promoter_condition}_promotor_{counts}'].unique().tolist())
gtpe[counts] = len(df_se[f'Genes_{promoter_condition}_promotor_upreg_{counts}'].unique().tolist())
#bar plot of gtp and gtpe
fig, ax = plt.subplots()
ax.bar(gtp.keys(), gtp.values(), width=0.4, label=f'Genes with {promoter_condition} specified \n enrichment in promotor')
ax.bar(gtpe.keys(), gtpe.values(), width=0.4, label=f'Genes with {promoter_condition} specified \n enrichment in promotor and upregulation')
ax.set_xticks(counts_list)
ax.set_xticklabels(counts_list)
ax.set_xlabel('Number of samples')
ax.set_ylabel('Number of genes')
ax.grid()
ax.legend()
if len(output_pdf) > 0:
pp.savefig(fig)
df_se.to_csv(dirname + output_file, index=False)
#print(df_se)
#print(df_promoter)
#print(df_rna)
if len(output_pdf) > 0:
pp.close()