#!/usr/bin/env snakemake
import os
import gzip
import datetime
import pysam
import math
import logging
configpath = "config/cappseq_umi_config.yaml"
configfile: configpath
# Ensure config file is correct, and all required attributes are present
pathkeys = {"refgenome", "captureregions", "captureregionsil", "dbsnpvcf", "samplefile", "baseoutdir"} # config elements that are filepaths
for ckey, attribute in config["cappseq_umi_workflow"].items():
if attribute == "__UPDATE__":
raise AttributeError(f"\'__UPDATE__\' found for \'{ckey}\' in config file \'{configpath}\'. Please ensure the config file is updated with parameters relevant for your analysis")
# Check that required filepaths exist
if ckey in pathkeys:
if not os.path.exists(attribute):
raise AttributeError(f"Unable to locate \'{attribute}\' for key \'{ckey}\' in config file \'{configpath}\': No such file or directory")
# Load samples
# This assumes the sample file has four tab-delimited columns:
# sample_id    R1_fastq_path    R2_fastq_path    run_id
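# For illustration only, a single (hypothetical, tab-separated) entry might look like:
# TUMOR-001    /path/to/TUMOR-001.R1.fastq.gz    /path/to/TUMOR-001.R2.fastq.gz    RUN01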
samplefile = config["cappseq_umi_workflow"]["samplefile"]
samplelist = []
r1_fastqs = {}
r2_fastqs = {}
sample_to_runid = {}
def is_gzipped(filepath):
with open(filepath, "rb") as f:
magicnum = f.read(2)
return magicnum == b'\x1f\x8b' # Magic number of gzipped files
# Process sample file
with open(samplefile) as f:
i = 0 # Line counter
for line in f:
i += 1
line = line.rstrip("\n").rstrip("\r")
if line.startswith("#"): # Ignore comment lines
continue
try:
cols = line.split("\t")
sample_id = cols[0]
r1_fastq = cols[1]
r2_fastq = cols[2]
run_id = cols[3]
except IndexError as e:
raise AttributeError(f"Unable to parse line {i} of sample file \'{samplefile}\'. Expecting three tab-delineated columns corresponding to \'sample_id\', \'R1_fastq\', \'R2_fastq\', and \'run_id\'") from e
# Check that the specified FASTQs exist
if not os.path.exists(r1_fastq):
raise AttributeError(f"Unable to parse line {i} of sample file \'{samplefile}\': Unable to locate \'{r1_fastq}\': No such file or directory")
if not os.path.exists(r2_fastq):
raise AttributeError(f"Unable to parse line {i} of sample file \'{samplefile}\': Unable to locate \'{r2_fastq}\': No such file or directory")
# Check that this sample doesn't already exist
if sample_id in r1_fastqs:
raise AttributeError(f"Duplicate sample ID \'{sample_id}\' in sample file \'{samplefile}")
samplelist.append(sample_id)
r1_fastqs[sample_id] = r1_fastq
r2_fastqs[sample_id] = r2_fastq
# Store the runID for this sample
sample_to_runid[sample_id] = run_id
outdir = config["cappseq_umi_workflow"]["baseoutdir"]
def generate_read_group(fastq, sample):
# Parses flowcell, lane, and barcode information from FASTQ read names
# Uses this information (and config file info) to generate a read group line
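    # For reference, a typical Illumina header line looks something like this (hypothetical example):
    #   @A00000:123:000000000-ABCDE:1:1101:1234:5678 1:N:0:ACGTACGT+TGCATGCA
    # Splitting on ":" yields the instrument (cols[0]), the flowcell (cols[2], keeping only the part
    # after any "-" prefix), the lane (cols[3]), and the sample barcode as the final field (cols[-1])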
    if is_gzipped(fastq):
        with gzip.open(fastq, "rt") as readfile:
            readname = readfile.readline()
    else:
        with open(fastq, "r") as readfile:
            readname = readfile.readline()
# Parse out the attributes for the read group from the read name
readname = readname.rstrip("\n").rstrip("\r")
cols = readname.split(":")
sequencer = cols[0]
sequencer = sequencer.replace("@","")
flowcell = cols[2]
flowcell = flowcell.split("-")[-1]
lane = cols[3]
barcode = cols[-1]
# From the config (generic and should be consistent between runs)
description = config["cappseq_umi_workflow"]["readgroup"]["description"]
centre = config["cappseq_umi_workflow"]["readgroup"]["centre"]
platform = config["cappseq_umi_workflow"]["readgroup"]["platformunit"]
platformmodel = config["cappseq_umi_workflow"]["readgroup"]["platformmodel"]
# Get the date this sample was generated.
# This isn't perfect, but it should be relatively close to the sequencing date.
origpath = os.path.realpath(fastq)
date = datetime.date.fromtimestamp(os.path.getctime(origpath)).isoformat() # Get creation date of FASTQ file
platformunit = flowcell + "-" + lane + ":" + barcode
readgroup = f"@RG\\tID:{sample}\\tBC:{barcode}\\tCN:{centre}\\tDS:\'{description}\'\\tDT:{date}\\tLB:{sample}\\tPL:{platform}\\tPM:{platformmodel}\\tPU:{platformunit}\\tSM:{sample}"
return readgroup
# Adapted from the multiQC module
# Used to calculate the estimated total number of molecules in a library
def estimateLibrarySize(readPairs, uniqueReadPairs):
"""
Picard calculation to estimate library size
Taken & translated from the Picard codebase:
https://github.com/broadinstitute/picard/blob/78ea24466d9bcab93db89f22e6a6bf64d0ad7782/src/main/java/picard/sam/DuplicationMetrics.java#L153-L164
Note: Optical duplicates are contained in duplicates and therefore do not enter the calculation here.
See also the computation of READ_PAIR_NOT_OPTICAL_DUPLICATES.
* Estimates the size of a library based on the number of paired end molecules observed
* and the number of unique pairs observed.
* <p>
* Based on the Lander-Waterman equation that states:
* C/X = 1 - exp( -N/X )
* where
* X = number of distinct molecules in library
* N = number of read pairs
* C = number of distinct fragments observed in read pairs
"""
readPairDuplicates = readPairs - uniqueReadPairs
if readPairs > 0 and readPairDuplicates > 0:
m = 1.0
M = 100.0
if uniqueReadPairs >= readPairs or f(m * uniqueReadPairs, uniqueReadPairs, readPairs) < 0:
logging.warning("Picard recalculation of ESTIMATED_LIBRARY_SIZE skipped - metrics look wrong")
return None
# find value of M, large enough to act as other side for bisection method
while f(M * uniqueReadPairs, uniqueReadPairs, readPairs) > 0:
M *= 10.0
# use bisection method (no more than 40 times) to find solution
for i in range(40):
r = (m + M) / 2.0
u = f(r * uniqueReadPairs, uniqueReadPairs, readPairs)
if u == 0:
break
elif u > 0:
m = r
elif u < 0:
M = r
return uniqueReadPairs * (m + M) / 2.0
else:
return None
def f(x, c, n):
"""
Picard calculation used when estimating library size
Taken & translated from the Picard codebase:
https://github.com/broadinstitute/picard/blob/78ea24466d9bcab93db89f22e6a6bf64d0ad7782/src/main/java/picard/sam/DuplicationMetrics.java#L172-L177
* Method that is used in the computation of estimated library size.
"""
return c / x - 1 + math.exp(-n / x)
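# A quick sanity check of the estimator (hypothetical numbers, not from a real run):
# with readPairs = 100 and uniqueReadPairs = 50, the bisection solves 50/X = 1 - exp(-100/X),
# giving an estimated library size of roughly 63 distinct molecules, i.e. somewhat larger than
# the number of unique pairs actually observed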
rule trim_umi:
"""
Trim the UMI from the front (and end) of each read pair
Also does some FASTQ cleanup (polyG tails etc)
    Note that we are using --trim_front here instead of --umi, as we do not want the UMI stored in the
    read name: that would make the read names differ from the FASTQ files and break fgbio AnnotateBamWithUmis.
    (In theory we could use CopyUmiFromReadName, but that requires a "-" delimiter between the forward and
    reverse UMIs, while fastp uses a "_", so ¯\_(ツ)_/¯ )
"""
input:
r1 = lambda w: r1_fastqs[w.samplename],
r2 = lambda w: r2_fastqs[w.samplename]
output:
r1 = temp(os.path.join(outdir, "01-trimmedfastqs", "{samplename}.R1.trimmed.fastq.gz")),
r2 = temp(os.path.join(outdir, "01-trimmedfastqs", "{samplename}.R2.trimmed.fastq.gz")),
fastp_report = os.path.join(outdir, "01-trimmedfastqs", "{samplename}.fastp.json")
params:
barcodelength = config["cappseq_umi_workflow"]["barcodelength"],
outdir = os.path.join(outdir, "01-trimmedfastqs")
threads: 4
conda:
"envs/fastp.yaml"
log:
os.path.join(outdir, "logs", "{samplename}.fastp.log")
shell: """
fastp --overrepresentation_analysis --detect_adapter_for_pe --trim_front1 {params.barcodelength} \
--trim_front2 {params.barcodelength} --in1 {input.r1} --in2 {input.r2} --thread {threads} --out1 {output.r1} --out2 {output.r2} \
--report_title "fastp {wildcards.samplename}" --json {output.fastp_report} --trim_poly_x \
--qualified_quality_phred 20 2> {log}
"""
rule bwa_align_unsorted:
input:
r1 = rules.trim_umi.output.r1,
r2 = rules.trim_umi.output.r2,
refgenome = config["cappseq_umi_workflow"]["refgenome"]
output:
bam = temp(outdir + os.sep + "02-BWA" + os.sep + "{samplename}.bwa.unsort.bam")
params:
readgroup = lambda w: generate_read_group(r1_fastqs[w.samplename], w.samplename),
threads:
config["cappseq_umi_workflow"]["bwa_threads"]
conda:
"envs/bwa_picard_fgbio.yaml"
log:
outdir + os.sep + "logs" + os.sep + "{samplename}.bwa_allreads.log"
shell:
"bwa mem -t {threads} -R \"{params.readgroup}\" {input.refgenome} {input.r1} {input.r2} 2> {log} | samtools view -b > {output.bam} 2>> {log}"
# Add UMI tag
rule fgbio_annotate_umis:
input:
bam = rules.bwa_align_unsorted.output.bam,
r1 = lambda w: r1_fastqs[w.samplename],
r2 = lambda w: r2_fastqs[w.samplename]
output:
bam = temp(outdir + os.sep + "03-withumis" + os.sep + "{samplename}.bwa.umi.namesort.bam")
params:
umiloc = config["cappseq_umi_workflow"]["barcodelocation"]
conda:
"envs/bwa_picard_fgbio.yaml"
log:
outdir + os.sep + "logs" + os.sep + "{samplename}.annotateumis.log"
shell:
"fgbio AnnotateBamWithUmis --input {input.bam} --fastq {input.r1} --fastq {input.r2} --read-structure {params.umiloc} --output {output.bam} &> {log}"
# Group reads by UMI into families
rule fgbio_group_umis:
input:
bam = rules.fgbio_annotate_umis.output.bam,
refgenome = config["cappseq_umi_workflow"]["refgenome"]
output:
bam = outdir + os.sep + "04-umigrouped" + os.sep + "{samplename}.umigrouped.sort.bam",
txt = outdir + os.sep + "04-umigrouped" + os.sep + "{samplename}.umigrouped.famsize.txt"
params:
maxedits = config["cappseq_umi_workflow"]["umiedits"],
outdir = outdir + os.sep + "04-umigrouped"
threads:
config["cappseq_umi_workflow"]["samtools_sort_threads"]
resources:
mem_mb = "2G"
conda:
"envs/bwa_picard_fgbio.yaml"
log:
outdir + os.sep + "logs" + os.sep + "{samplename}.groupumis.log"
shell:
"samtools sort -m {resources.mem_mb} -@ {threads} -n {input.bam} | fgbio SetMateInformation --ref {input.refgenome} 2> {log} | fgbio GroupReadsByUmi --edits {params.maxedits} --family-size-histogram {output.txt} --strategy paired > {output.bam} 2>> {log}"
# Generate a consensus of these families
rule fgbio_duplex_consensus:
input:
bam = rules.fgbio_group_umis.output.bam
output:
bam = temp(outdir + os.sep + "05-duplexconsensus" + os.sep + "{samplename}.consensus.unmapped.bam")
params:
minreads = config["cappseq_umi_workflow"]["minreads"],
sampleid = "{samplename}"
threads:
config["cappseq_umi_workflow"]["duplexconsensus_threads"]
conda:
"envs/bwa_picard_fgbio.yaml"
log:
outdir + os.sep + "logs" + os.sep + "{samplename}.duplexconsensus.log"
shell:
"fgbio CallDuplexConsensusReads --input {input.bam} --output {output.bam} --threads {threads} --read-group-id {params.sampleid} --min-reads {params.minreads} &> {log}"
# Because CallDuplexConsensusReads recalculates base qualities, and the recalculated values can exceed
# what some downstream tools support, set an upper limit on base qualities.
# Also, remove all the extra space-consuming tags
rule sanitize_bam:
input:
bam = rules.fgbio_duplex_consensus.output.bam
output:
bam = temp(outdir + os.sep + "06-sanitizebam" + os.sep + "{samplename}.consensus.unmapped.capqual.bam")
params:
max_base_qual = int(config["cappseq_umi_workflow"]["max_base_qual"]), # Bases with quality scores above this are capped at this
tagstoremove = config["cappseq_umi_workflow"]["tags_to_remove"],
min_base_qual = int(config["cappseq_umi_workflow"]["min_base_qual"]) # Bases with quality scores below this are masked
threads:
config["cappseq_umi_workflow"]["basequal_threads"]
run:
inFile = pysam.AlignmentFile(input.bam, check_sq=False, mode = "rb", threads=threads) # We have to provide check_sq=False in case this is an unaligned BAM
tagstoremove = set(params.tagstoremove)
# Remove the entries for these tags from the BAM header
inHeader = inFile.header.to_dict() # Returns a multi-level dictionary
outHeader = {}
for level, attributes in inHeader.items():
# If we are removing the RG tag, remove this level
if "RG" in tagstoremove and level == "RG":
continue
outHeader[level] = attributes
outFile = pysam.AlignmentFile(output.bam, header=outHeader, mode = "wb")
# Process reads
for read in inFile.fetch(until_eof=True):
# Cap the upper limit of base qualities
outqual = list(qual if qual <= params.max_base_qual else params.max_base_qual for qual in read.query_qualities)
            # Mask bases with quality scores below the specified threshold
            masked_seq = "".join(read.query_sequence[i] if read.query_qualities[i] >= params.min_base_qual else "N" for i in range(0, read.query_length))
read.query_sequence = masked_seq
read.query_qualities = outqual
# Remove the unwanted tags
# pysam returns read tags as a list of tuples
# ex. [(NM, 2), (RG, "GJP00TM04")]
outtags = list(tag for tag in read.get_tags() if tag[0] not in tagstoremove)
read.set_tags(outtags)
outFile.write(read)
# For safety, manually close files
inFile.close()
outFile.close()
# Convert unaligned BAM back to FASTQ and map reads
rule bwa_realign_bam:
input:
bam = rules.sanitize_bam.output.bam,
refgenome = config["cappseq_umi_workflow"]["refgenome"]
output:
bam = temp(outdir + os.sep + "07-consensus_aligned" + os.sep + "{samplename}.consensus.mapped.namesort.bam")
threads:
config["cappseq_umi_workflow"]["bwa_threads"]
params:
readgroup = lambda w: generate_read_group(r1_fastqs[w.samplename], w.samplename)
conda:
"envs/bwa_picard_fgbio.yaml"
log:
outdir + os.sep + "logs" + os.sep + "{samplename}.bwa_realign.log"
shell:
"samtools fastq -@ {threads} -N {input.bam} 2> {log} | bwa mem -R \"{params.readgroup}\" -p -t {threads} {input.refgenome} - 2>> {log} | samtools view -b | "
"picard SortSam -SO queryname -I /dev/stdin -O /dev/stdout 2>> {log} | fgbio SetMateInformation --ref {input.refgenome} --output {output.bam} &> {log}"
# Add back in family information from the unaligned consensus BAM
rule picard_annotate_bam:
input:
unaligned_bam = rules.sanitize_bam.output.bam,
aligned_bam = rules.bwa_realign_bam.output.bam,
refgenome = config["cappseq_umi_workflow"]["refgenome"]
output:
bam = outdir + os.sep + "99-final" + os.sep + "{samplename}.consensus.mapped.annot.bam"
conda:
"envs/bwa_picard_fgbio.yaml"
log:
outdir + os.sep + "logs" + os.sep + "{samplename}.picardannotate.log"
shell:
"picard SortSam -I {input.unaligned_bam} -O /dev/stdout -SO queryname --REFERENCE_SEQUENCE {input.refgenome} | picard MergeBamAlignment --ALIGNED_BAM {input.aligned_bam} --UNMAPPED_BAM /dev/stdin --REFERENCE_SEQUENCE {input.refgenome} -O {output.bam} &> {log} && "
"samtools index {output.bam}"
# Output sentinel confirming that the final BAMs are valid
rule picard_validate_sam:
input:
bam = rules.picard_annotate_bam.output.bam,
refgenome = config["cappseq_umi_workflow"]["refgenome"]
output:
txt = outdir + os.sep + "09-validoutput" + os.sep + "{samplename}.consensus.mapped.ValidateSamFile.is_valid"
params:
outdir = outdir + os.sep + "09-validoutput"
conda:
"envs/bwa_picard_fgbio.yaml"
log:
outdir + os.sep + "logs" + os.sep + "{samplename}.picardvalidatesam.log"
shell:
"picard ValidateSamFile -I {input.bam} -R {input.refgenome} > {output.txt} 2> {log}"
### QUALITY CONTROL RULES ###
### Currently supported tools:
### FASTQC
### PICARD
rule qc_fastqc:
input:
bam = rules.bwa_align_unsorted.output.bam
output:
qc = outdir + os.sep + "Q1-fastqc" + os.sep + "{samplename}.bwa.unsort_fastqc.html"
params:
outdir = outdir + os.sep + "Q1-fastqc"
conda:
"envs/picard_fastqc.yaml"
log:
outdir + os.sep + "logs" + os.sep + "{samplename}.fastqc.log"
shell:
"fastqc -o {params.outdir} --nogroup -f bam {input.bam} 2> {log}"
rule qc_picard_hsmetrics:
input:
bam = rules.picard_annotate_bam.output.bam,
refgenome = config["cappseq_umi_workflow"]["refgenome"]
output:
hsmet = outdir + os.sep + "Q2-hs_metrics" + os.sep + "{samplename}.hs_metrics.txt",
tarcov = outdir + os.sep + "Q2-hs_metrics" + os.sep + "{samplename}.target_coverage.txt"
params:
capture_reg_il = config["cappseq_umi_workflow"]["captureregionsil"],
outdir = outdir + os.sep + "Q2-hs_metrics",
max_ram_records = "5000000",
cov_cap_sens = "20000"
conda:
"envs/picard_fastqc.yaml"
log:
outdir + os.sep + "logs" + os.sep + "{samplename}.picard_hsmet.log"
shell:
"picard CollectHsMetrics -R {input.refgenome} -TI {params.capture_reg_il} -BI {params.capture_reg_il} -I {input.bam} -O {output.hsmet} --PER_TARGET_COVERAGE {output.tarcov} --MAX_RECORDS_IN_RAM {params.max_ram_records} --COVERAGE_CAP {params.cov_cap_sens} 2> {log}"
rule qc_picard_oxog:
input:
bam = rules.picard_annotate_bam.output.bam,
refgenome = config["cappseq_umi_workflow"]["refgenome"]
output:
txt = outdir + os.sep + "Q3-oxog_metrics" + os.sep + "{samplename}.oxoG_metrics.txt"
params:
outdir = outdir + os.sep + "Q3-oxog_metrics"
conda:
"envs/picard_fastqc.yaml"
log:
outdir + os.sep + "logs" + os.sep + "{samplename}.picard_oxoG.log"
shell:
"picard CollectOxoGMetrics -I {input.bam} -R {input.refgenome} -O {output.txt} 2> {log}"
rule qc_picard_insertsize:
input:
bam = rules.picard_annotate_bam.output.bam
output:
txt = outdir + os.sep + "Q4-insert_size" + os.sep + "{samplename}.insert_size_metrics.txt",
pdf = outdir + os.sep + "Q4-insert_size" + os.sep + "{samplename}.insert_size_histogram.pdf"
params:
outdir = outdir + os.sep + "Q4-insert_size"
conda:
"envs/picard_fastqc.yaml"
log:
outdir + os.sep + "logs" + os.sep + "{samplename}.picard_insertsize.log"
shell:
"picard CollectInsertSizeMetrics -I {input.bam} -O {output.txt} -H {output.pdf} 2> {log}"
rule qc_fgbio_errorrate:
input:
bam = rules.picard_annotate_bam.output.bam,
refgenome = config["cappseq_umi_workflow"]["refgenome"]
output:
txt = outdir + os.sep + "Q5-error_rate" + os.sep + "{samplename}.error_rate_by_read_position.txt"
params:
outprefix = outdir + os.sep + "Q5-error_rate" + os.sep + "{samplename}",
outdir = outdir + os.sep + "Q5-error_rate",
capture_reg_il = config["cappseq_umi_workflow"]["captureregionsil"],
dbsnpvcf = config["cappseq_umi_workflow"]["dbsnpvcf"]
conda:
"envs/bwa_picard_fgbio.yaml"
log:
outdir + os.sep + "logs" + os.sep + "{samplename}.error_rate_by_position.log"
shell:
"fgbio ErrorRateByReadPosition -i {input.bam} -r {input.refgenome} -v {params.dbsnpvcf} -l {params.capture_reg_il} --collapse -o {params.outprefix} 2> {log}"
rule qc_calc_dupl:
input:
collapsed_bam = rules.picard_annotate_bam.output.bam,
all_reads_bam = rules.bwa_align_unsorted.output.bam
output:
txt = outdir + os.sep + "Q6-dupl_rate" + os.sep + "{samplename}.dup_metrics.txt"
params:
outdir = outdir + os.sep + "Q6-dupl_rate"
run:
# This definitely isn't the most efficient way to calculate duplicate rate, but it works
# We are going to generate a Picard-style output file
# First, parse the final (collapsed) BAM, and calculate the total number of reads
col_bam = pysam.AlignmentFile(input.collapsed_bam)
consensus_reads = col_bam.count(until_eof=True, read_callback=lambda x: not x.is_duplicate and x.is_mapped and x.is_paired and not x.is_supplementary and not x.is_secondary)
col_read_pairs = consensus_reads / 2
col_unpaired_reads = col_bam.count(until_eof=True, read_callback=lambda x: not x.is_paired and not x.is_supplementary and not x.is_secondary)
# Now, parse the original (non-consensus) BAM, and calculate the total number
# of read pairs, unmapped reads, and unpaired reads
orig_bam = pysam.AlignmentFile(input.all_reads_bam, require_index=False)
orig_reads = orig_bam.count(until_eof=True, read_callback=lambda x: not x.is_duplicate and x.is_mapped and x.is_paired and not x.is_supplementary and not x.is_secondary)
orig_pairs = orig_reads / 2
unmapped_reads = orig_bam.count(until_eof=True, read_callback=lambda x: x.is_unmapped and not x.is_supplementary and not x.is_secondary)
unpaired_reads = orig_bam.count(until_eof=True, read_callback=lambda x: not x.is_paired and not x.is_supplementary and not x.is_secondary)
secondary_reads = orig_bam.count(until_eof=True, read_callback=lambda x: x.is_supplementary or x.is_secondary)
# Now, calculate the output stats
library = wildcards.samplename
read_pairs_examined = int(orig_pairs)
secondary_or_supplemental = secondary_reads
unpaired_dups = int(unpaired_reads - col_unpaired_reads)
read_pair_dups = int(orig_pairs - col_read_pairs)
optical_dup = 0 # In this consensus approach (via UMIs) we lose info on which are optical duplicates
per_dupl = read_pair_dups / orig_pairs
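        # For example (hypothetical numbers): 1,000,000 original pairs collapsing to 250,000 consensus
        # pairs gives 750,000 "duplicate" pairs and a duplication rate of 0.75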
# Custom added by Chris. Calculate the total size of this library, and how much we have sequenced
estimated_library_size = int(estimateLibrarySize(orig_pairs, col_read_pairs)) # Use MultiQC's function to calculate the library size
prop_library_seq = col_read_pairs / estimated_library_size
# Close input
col_bam.close()
orig_bam.close()
# Write output
with open(output.txt, "w") as o:
# To trick MultiQC into thinking this is a Picard output
o.write("##picard.sam.markduplicates.MarkDuplicates BUT NOT ACTUALLY PICARD")
o.write(os.linesep)
header = ["LIBRARY", "UNPAIRED_READS_EXAMINED", "READ_PAIRS_EXAMINED", "SECONDARY_OR_SUPPLEMENTARY_RDS", "UNMAPPED_READS", "UNPAIRED_READ_DUPLICATES", "READ_PAIR_DUPLICATES",
"READ_PAIR_OPTICAL_DUPLICATES", "PERCENT_DUPLICATION", "ESTIMATED_LIBRARY_SIZE", "PERCENT_LIBRARY_SEQUENCED"]
o.write("\t".join(header))
o.write(os.linesep)
# Write out stats for this sample
out_values = [library, str(unpaired_reads), str(read_pairs_examined), str(secondary_or_supplemental), str(unmapped_reads),
str(unpaired_dups), str(read_pair_dups), str(optical_dup), str(per_dupl), str(estimated_library_size), str(prop_library_seq)]
o.write("\t".join(out_values))
o.write(os.linesep)
# Output sentinel confirming that the final BAMs are valid
rule qc_validate_sam:
input:
bam = rules.picard_annotate_bam.output.bam,
refgenome = config["cappseq_umi_workflow"]["refgenome"]
output:
txt = outdir + os.sep + "Q7-validatesam" + os.sep + "{samplename}.consensus.mapped.ValidateSamFile.is_valid"
params:
outdir = outdir + os.sep + "Q7-validatesam"
conda:
"envs/bwa_picard_fgbio.yaml"
log:
outdir + os.sep + "logs" + os.sep + "{samplename}.picardvalidatesam.log"
shell:
"picard ValidateSamFile -I {input.bam} -R {input.refgenome} > {output.txt} 2> {log}"
# Merge QC results via multiqc
checkpoint qc_multiqc:
input:
# Run multiqc once per run, and merge all samples from that run
dupl = lambda w: list(os.path.join(outdir, "Q6-dupl_rate", x + ".dup_metrics.txt") for x in samplelist if sample_to_runid[x] == w.runid),
errorate = lambda w: list(os.path.join(outdir, "Q5-error_rate", x + ".error_rate_by_read_position.txt") for x in samplelist if sample_to_runid[x] == w.runid),
insertsize = lambda w: list(os.path.join(outdir,"Q4-insert_size", x + ".insert_size_metrics.txt") for x in samplelist if sample_to_runid[x] == w.runid),
oxog = lambda w: list(os.path.join(outdir, "Q3-oxog_metrics", x + ".oxoG_metrics.txt") for x in samplelist if sample_to_runid[x] == w.runid),
hsmet = lambda w: list(os.path.join(outdir, "Q2-hs_metrics", x + ".hs_metrics.txt") for x in samplelist if sample_to_runid[x] == w.runid),
fastp = lambda w: list(os.path.join(outdir, "01-trimmedfastqs", x + ".fastp.json") for x in samplelist if sample_to_runid[x] == w.runid),
fastqc = lambda w: list(os.path.join(outdir, "Q1-fastqc", x + ".bwa.unsort_fastqc.html") for x in samplelist if sample_to_runid[x] == w.runid),
validatesam = lambda w: list(os.path.join(outdir, "Q7-validatesam", x + ".consensus.mapped.ValidateSamFile.is_valid") for x in samplelist if sample_to_runid[x] == w.runid),
famsizehist = lambda w: list(os.path.join(outdir, "04-umigrouped", x + ".umigrouped.famsize.txt") for x in samplelist if sample_to_runid[x] == w.runid)
output:
html = outdir + os.sep + "Q9-multiqc" + os.sep + "multiqc_report.{runid}.html",
params:
outdir = outdir + os.sep + "Q9-multiqc",
outname = lambda w: "multiqc_report." + w.runid + ".html",
        ignoresamples = lambda w: " ".join("--ignore-samples \'" + x + "*\'" for x in samplelist if sample_to_runid[x] != w.runid),  # Skip samples belonging to other runs; empty if this is the only run
modules = "-m picard -m fastqc -m fgbio -m fastp", # Should start with -m flag
config = config["cappseq_umi_workflow"]["multiqc_config"],
dupl_dir = rules.qc_calc_dupl.params.outdir,
errorrate_dir = rules.qc_fgbio_errorrate.params.outdir,
insertsize_dir = rules.qc_picard_insertsize.params.outdir,
oxog_dir = rules.qc_picard_oxog.params.outdir,
hsmet_dir = rules.qc_picard_hsmetrics.params.outdir,
fastp_dir = rules.trim_umi.params.outdir,
fastqc_dir = rules.qc_fastqc.params.outdir,
validsam_dir = rules.qc_validate_sam.params.outdir,
famsize_dir = rules.fgbio_group_umis.params.outdir
conda:
"envs/picard_fastqc.yaml"
log:
outdir + os.sep + "logs" + os.sep + "multiqc_{runid}.log"
shell:
"multiqc --no-data-dir --interactive --config {params.config} --outdir {params.outdir} --filename {params.outname} --force {params.modules} {params.dupl_dir} {params.errorrate_dir} {params.insertsize_dir} {params.oxog_dir} {params.hsmet_dir} {params.fastqc_dir} {params.validsam_dir} {params.famsize_dir} {params.fastp_dir} --ignore-samples \'{params.ignoresamples}*\' > {log}"
rule all:
input:
expand([str(rules.picard_annotate_bam.output.bam),
str(rules.qc_multiqc.output.html)],
samplename=samplelist,
runid=set(sample_to_runid.values()))
default_target: True