forked from yuz207/atac-seq-pipeline
-
Notifications
You must be signed in to change notification settings - Fork 4
/
call-atac.py
executable file
·299 lines (239 loc) · 10.1 KB
/
call-atac.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
import numpy as np
import os
import subprocess
import sys
from multiprocessing import Pool, Queue, Process, Manager, cpu_count
# Set global parameters
if len(sys.argv) == 5:
# Chromosome sizes for clipping and bigwig generation
mm_chr_sz = sys.argv[4]
# Input location for nodup bam alignments
nodup_bam_loc = sys.argv[1]
# Specify experiment condition
# location further processing and analysis for experiment
expt_prefix = sys.argv[2]
pipe_out_loc = sys.argv[3]
# If single argument '-m', use the following manually input
# parameters
#elif len(sys.argv) == 2 and sys.argv[-1] == '-m':
elif len(sys.argv) > 1 and sys.argv[1] == '-m':
# Chromosome sizes for clipping and bigwig generation
mm_chr_sz = '~/annotation/chromosomes/mm10/mm10.chrom.sizes'
# Input location for nodup bam alignments
nodup_bam_loc = ''
# Specify experiment condition
# location further processing and analysis for experiment
expt_prefix = ''
pipe_out_loc = ''
# If no args or incorrect number input, break and print help
else:
print 'Error: Missing arguments. Exiting.\n\n' + \
'Usage: python call-atac.py [options] <input dir> <expt prefix> <output dir> <genome sz>\n\n' + \
'Options: \n' + \
' -m : Use parameters manually specified in script.\n' + \
' This option will ignore following arguments.'
exit()
# Print verbose params
print 'Running script: \'call-atac.py\'. \n\n' + \
'- Experiment prefix : %s\n' % expt_prefix + \
'- Input alignment dir : %s\n' % nodup_bam_loc + \
'- Peak call output dir : %s\n' % pipe_out_loc + \
'- Genome size : %s\n' % mm_chr_sz
'''
#################################
# LOAD MODULES AND INIT FUNCTIONS
#################################
'''
def pipe(cmd_list):
return ' | '.join(cmd_list)
# Function to run shell command -- crude approach to parallelization;
# argument is entire shell command, so multi_arg is list of commands to run
def run_bash(command):
print command
#subprocess.call(command, shell=True)
# Set analysis output directory in run_bash before every command
subprocess.call('cd %s; %s' % (pipe_out_loc, command), shell=True)
return None
def get_numpeaks(file_loc):
with open(file_loc) as in_file:
for num, line in enumerate(in_file):
pass
return num
# Remove control_lambda and treat_pileups, if no other bdgcmp operations
# will be performed
def remove_bdg(outdirs):
cmd_rm = ['rm -f %s/*.bdg ' % outdir for outdir in outdirs]
return '; '.join(cmd_rm)
# Remove mitochondrial reads and adjust reads for Tn5 insertion
def cmd_procalign(in_bam_loc, expt_prefix, in_bam_rep):
prefix = expt_prefix + '_' + str(in_bam_rep)
prefix2 = expt_prefix + '.' + str(in_bam_rep)
'''Faster method might be to awk the header to get all non-chrM headers,
and then just use samtools view chr*.'''
cmd_remove_mito = pipe([
'samtools view -h %s/%s_sorted_nodup.bam' % (in_bam_loc, prefix),
'awk -F "\\t" \'$3 != "chrM" { print $0 }\'',
'samtools view -bS - > ' +
'%s.sorted.nodup.non-chrM.bam' % prefix2,
])
# print cmd_remove_mito
cmd_bam2bed = pipe([
'bamToBed -i ' +
'%s.sorted.nodup.non-chrM.bam' % prefix2,
'awk -F "\\t" \'BEGIN {OFS=FS} ' +
'$6 == "+" {$2 = $2 + 4} $6 == "-" {$3 = $3 - 5}; 1\'',
'gzip -c > ' +
'%s.sorted.nodup.non-chrM.tn5.bed.gz ' % prefix2
])
# print cmd_bam2bed
return '; '.join([cmd_remove_mito, cmd_bam2bed])
# Generate flagstat summary
def cmd_flagstat(expt_prefix):
flagstat = ['samtools flagstat ' + '%s.%s.sorted.nodup.non-chrM.bam' % (expt_prefix, rep_no) +
' > flagstat%s.txt ' % rep_no for rep_no in ['1', '2']]
return '; '.join(flagstat)
# Get counts
def cmd_readcounts(flagstat_loc):
# Load flagstat counts; if pooled, add them together for scale_factor
counts = {}
for curr_no in [1, 2]:
with open('%s/flagstat%d.txt' % (flagstat_loc, curr_no)) as flagstat:
num_mapped = flagstat.readlines()[4].split(' ')[0]
num_scaled = int(num_mapped) / 1000000.
# print num_mapped, num_scaled
counts[curr_no] = num_scaled
# for pooled obvi, will be R1 + R2
return counts
# Call narrowPeaks, and generate treat_pileup and control_lambda bdg's
def cmd_callpeak(prefix, t_files, outdir, p_val, shift_val, ext_val):
command = 'macs2 callpeak ' + \
'-t %s ' % t_files + \
'-f BED -n %s ' % prefix + \
'--outdir %s ' % outdir + \
'-g mm -p %s --nomodel --shift -%s --extsize %s ' % (p_val, shift_val, ext_val) + \
'--keep-dup all -B --SPMR --call-summits '
return command
# Fold-enrichment
def cmd_FE(mm_chr_sz, outdir, prefix):
cmd_bdgcmp = 'macs2 bdgcmp ' + \
'-t %s/%s_treat_pileup.bdg ' % (outdir, prefix) + \
'-c %s/%s_control_lambda.bdg ' % (outdir, prefix) + \
'--outdir %s -o %s.FE.bdg ' % (outdir, prefix) + \
'-m FE'
# print cmd_bdgcmp
cmd_trimsort = pipe([
'sort -k1,1 -k2,2n %s/%s.FE.bdg' % (outdir, prefix),
'slopBed -i stdin -g %s -b 0' % mm_chr_sz,
'bedClip stdin %s %s/%s.FE.trim.bdg ' % (mm_chr_sz, outdir, prefix)
])
# print cmd_trimsort
cmd_bw = 'bedGraphToBigWig %s/%s.FE.trim.bdg %s %s/%s.FE.bw ' % (
outdir, prefix, mm_chr_sz, outdir, prefix)
# print cmd_bw
return '; '.join([cmd_bdgcmp, cmd_trimsort, cmd_bw])
# P-value
def cmd_ppois(mm_chr_sz, outdir, prefix, scale_factor):
cmd_bdgcmp = 'macs2 bdgcmp ' + \
'-t %s/%s_treat_pileup.bdg ' % (outdir, prefix) + \
'-c %s/%s_control_lambda.bdg ' % (outdir, prefix) + \
'--outdir %s -o %s.pval.bdg ' % (outdir, prefix) + \
'-m ppois -S %s' % str(scale_factor)
# print cmd_bdgcmp
cmd_trimsort = pipe([
'sort -k1,1 -k2,2n %s/%s.pval.bdg' % (outdir, prefix),
'slopBed -i stdin -g %s -b 0' % mm_chr_sz,
'bedClip stdin %s %s/%s.pval.trim.bdg ' % (mm_chr_sz, outdir, prefix)
])
# print cmd_trimsort
cmd_bw = 'bedGraphToBigWig %s/%s.pval.trim.bdg %s %s/%s.pval.bw ' % (
outdir, prefix, mm_chr_sz, outdir, prefix)
# print cmd_bw
return '; '.join([cmd_bdgcmp, cmd_trimsort, cmd_bw])
'''
################################
# READ-ALIGNMENT POST-PROCESSING
################################
First, remove mitochondrial reads by checking header or for ChrM.
Convert bam to bed s.t. paired reads are treated as individual
single reads, so both ends (cut sites) will pileup instead of just
the 5' end of a read pair. (-bampe will retain mate information.)
Adjust the ends to account for actual cut site due to Tn5 insertion
position: add 4 to '+' strands and subtract 5 from '-' strands.
'''
# Post-process read alignments for both replicates
procalign_cmds = [cmd_procalign(
nodup_bam_loc, expt_prefix, rep_num) for rep_num in [1, 2]]
# print '\n'.join(procalign_cmds) + '\n'
if __name__ == '__main__':
pool = Pool(processes=2)
pool.map(run_bash, procalign_cmds)
pool.close()
# Generate flagstat summary statistics for both reps, load into counts dict
# print cmd_flagstat(expt_prefix) + '\n'
run_bash(cmd_flagstat(expt_prefix))
counts = cmd_readcounts(os.path.expanduser(pipe_out_loc))
print counts, '\n'
'''
##########################################
# PEAK CALLING AND SIGNAL TRACK GENERATION
##########################################
Call peaks on individual and pooled replicates. Generate FE and
-log10(p-val) signal tracks.
'''
# Params for peak calling
shift_val = '37'
ext_val = '73'
p_val = '1e-2'
# File locations of post-processed alignments for both replicates
R1_bedgz = '%s.%d.sorted.nodup.non-chrM.tn5.bed.gz' % (expt_prefix, 1)
R2_bedgz = '%s.%d.sorted.nodup.non-chrM.tn5.bed.gz' % (expt_prefix, 2)
pooled_bedgz = ' '.join([R1_bedgz, R2_bedgz])
# Call peaks on both replicates and pooled reads
callpeak1 = cmd_callpeak('%s.%d' % (expt_prefix, 1),
R1_bedgz, 'R1', p_val, shift_val, ext_val)
callpeak2 = cmd_callpeak('%s.%d' % (expt_prefix, 2),
R2_bedgz, 'R2', p_val, shift_val, ext_val)
callpeakp = cmd_callpeak('%s.%s' % (expt_prefix, 'pooled'),
pooled_bedgz, 'pooled', p_val, shift_val, ext_val)
# print '\n'.join([callpeak1, callpeak2, callpeakp]) + '\n'
if __name__ == '__main__':
pool = Pool(processes=3)
pool.map(run_bash, [callpeak1, callpeak2, callpeakp])
pool.close()
'''
Generate comparative bedgraph from treat_pileup and control_lambda,
trim the coordinates outside of chromosome size limits (due to MACS bug),
then convert bedgraph to bigWig. Delete the intermediate files.
'''
# Generate FE signal tracks
FE1 = cmd_FE(mm_chr_sz, 'R1', '%s.%d' % (expt_prefix, 1))
FE2 = cmd_FE(mm_chr_sz, 'R2', '%s.%d' % (expt_prefix, 2))
FEp = cmd_FE(mm_chr_sz, 'pooled', '%s.%s' % (expt_prefix, 'pooled'))
# print '\n'.join([FE1, FE2, FEp]) + '\n'
if __name__ == '__main__':
pool = Pool(processes=3)
pool.map(run_bash, [FE1, FE2, FEp])
pool.close()
'''
Since --SPMR was used during peak calling, to generate the p-value track,
we must apply a scaling factor of the # of reads per million of the input
BED files. (Normally, we would take the lower of this value for treatment
and control, but there is no control for ATAC-seq.)
Use flagstat to count the number of total mapped reads in the non-chrM bam
file. For the pooled peaks, add the # of reads for each replicate. (An
alternate way to calculate this value is to simply count the number of lines
in the compressed BED file; zcat * | wc -l)
'''
# Generate p-value signal tracks
pv1 = cmd_ppois(mm_chr_sz, 'R1', '%s.%d' % (expt_prefix, 1), counts[1])
pv2 = cmd_ppois(mm_chr_sz, 'R2', '%s.%d' % (expt_prefix, 2), counts[2])
pvp = cmd_ppois(mm_chr_sz, 'pooled', '%s.%s' %
(expt_prefix, 'pooled'), counts[1] + counts[2])
# print '\n'.join([pv1, pv2, pvp]) + '\n'
if __name__ == '__main__':
pool = Pool(processes=3)
pool.map(run_bash, [pv1, pv2, pvp])
pool.close()
# Remove temporary files e.g. bedgraphs
print remove_bdg(['R1', 'R2', 'pooled', 'pseudoreps'])
# run_bash(remove_bdg(['R1','R2','pooled', 'pseudoreps']))