---
title: "Nanopore summary statistics and basic QC"
date: "Report created: `r Sys.Date()`"
output:
  html_document:
    keep_md: no
    number_sections: yes
    self_contained: yes
    theme: default
    highlight: null
    css: Static/ont_tutorial.css
    toc: yes
    toc_depth: 2
    toc_float:
      collapsed: yes
      smooth_scroll: yes
    df_print: paged
classoption: a4paper
link-citations: yes
bibliography: Static/Bibliography.bib
---
```{r setup, include=FALSE}
# call with:
# R --slave -e 'rmarkdown::render("Nanopore_SumStatQC.Rmd", "html_document")'
# R --slave -e 'rmarkdown::render("Nanopore_SumStatQC.Rmd", "pdf_document")'
knitr::opts_chunk$set(fig.width=9, fig.height=6, warning=FALSE, message=FALSE, tidy = FALSE, cache.extra = packageVersion('tufte'), cache.path = "Results/KnitR")
options(htmltools.dir.version = FALSE)
library(R.utils)
library(data.table)
library(digest)
library(dplyr)
library(plyr)
library(emojifont)
library(extrafont)
library(ggplot2)
library(knitr)
library(RColorBrewer)
library(tufte)
library(caTools)
library(yaml)
library(fastmatch) # added to accommodate the guppy barcode content
config <- yaml.load_file("config.yaml")
inputFile <- config$inputFile
flowcellId <- config$flowcellId
basecaller <- config$basecaller
tutorialText <- config$tutorialText
expRef <- config$expRef
Organism <- config$Organism
FlowCellType <- config$flowcellType
FlowCellID <- config$flowcellId
SequencingKit <- config$seqKit
sampleHours <- config$sampleHours ;# 48
sampleIntervalMinutes <- config$sampleIntervalMinutes ;# 60
readlenpercentile <- ifelse(is.null(config$readlenpercentile), 0.975, config$readlenpercentile) # 0.975
# core parameters used for the presentation and configuration of report
# not to be moved to config.yaml (yet)
qcThreshold <- 7
binFilter <- 5
scaling <- 1
reportDPI <- 120
# make a results directory
dir.create("Results", showWarnings = FALSE)
slurpContent <- function(filename) {
  include = as.logical(tutorialText)
  if (include) {
    paste(readLines(filename), collapse="\n")
  }
}
```
# Experiment: `r expRef`
* Organism: `r Organism`
* FlowCell type: `r FlowCellType`
* FlowCell ID: `r FlowCellID`
* Sequencing Kit: `r SequencingKit`
`r slurpContent("Static/TutorialPreamble.md")`
# Executive summary
```{r executivesummary, echo=FALSE}
# Using fread for fast and friendly import of sequence_summary file
# no definition of column types to speed import and allow for merged files
# could be worthwhile to fread(select=) to select only a subset of columns - this could
# preclude e.g. barcode data or different versions?
sequencedata <- data.table::fread(inputFile, stringsAsFactors=FALSE)
# remove the redundant headers from merged files
if (length(which(sequencedata[,1]=="filename")) > 0) {
  sequencedata <- sequencedata[-which(sequencedata[,1]=="filename"),]
}
# coerce the columns used in analytics into more appropriate data-types
sequencedata$channel<-as.numeric(sequencedata$channel)
sequencedata$start_time<-as.numeric(sequencedata$start_time)
sequencedata$duration<-as.numeric(sequencedata$duration)
sequencedata$num_events<-as.numeric(sequencedata$num_events)
sequencedata$sequence_length_template<-as.numeric(sequencedata$sequence_length_template)
sequencedata$mean_qscore_template<-as.numeric(sequencedata$mean_qscore_template)
# passes_filtering is a useful flag; but there are examples of sequencing_summary.txt where this
# is not present - https://github.com/a-slide/pycoQC/blob/master/pycoQC/data/sequencing_summary_1D_DNA_Albacore_1.2.1.txt
if (! "passes_filtering" %in% colnames(sequencedata)) {
  # set all of the reads to pass? apply a cutoff?
  sequencedata$passes_filtering <- TRUE
} else {
  sequencedata$passes_filtering <- as.logical(sequencedata$passes_filtering)
}
# create a convenient separation of pass and fail ...
passedSeqs <- sequencedata[which(sequencedata$passes_filtering), ]
failedSeqs <- sequencedata[which(!sequencedata$passes_filtering), ]
```
```{r pseudoValueBox, include=TRUE, echo=FALSE, fig.align="center", fig.width=2}
# calculate some basic, but key, metrics
readCount <- formatC(nrow(sequencedata), big.mark=",")
totalBases = sum(sequencedata$sequence_length_template,na.rm=T)/10^9
passedBases = sum(passedSeqs$sequence_length_template,na.rm=T)/10^9
gigabases <- round(totalBases,2)
# prepare a data object to render a summary graphic
figures <- 3
df <- data.frame(
x = cumsum(c(2, rep(6.5, figures-1))),
y = rep(2, figures),
h = rep(4, figures),
w = rep(6, figures))
df$info <- c("flowcell", readCount, gigabases)
df$key <- c(flowcellId,"Reads produced", "gigabases called")
df$icon <- fontawesome(c('fa-qrcode', 'fa-filter', 'fa-file-text-o'))
df$colour <- rep("steelblue", figures)
# and display the plot
ExecutiveSummaryValueBoxes <- ggplot(df, aes(x, y, height = h, width = w, label = key, fill = colour)) +
geom_tile(fill = brewer.pal(9,"Blues")[7]) +
geom_text(color = brewer.pal(9,"Blues")[3], hjust="left", nudge_y=-1.5, nudge_x=-2.6, size=5) +
geom_text(label=df$info, size=10, color = brewer.pal(9,"Blues")[2], fontface = "bold", nudge_x=-2.6, hjust="left") +
geom_text(label=df$icon, family='fontawesome-webfont', colour=brewer.pal(9,"Blues")[5], size=23, hjust="right", nudge_x=2.85, nudge_y=0.8) +
coord_fixed() +
theme_void() +
guides(fill = F)
#ExecutiveSummaryValueBoxes
# some juggling here to prepare a plot that can be rendered across platforms = windows/linux/osx
ggsave(file.path("Results", "ExecutiveSummaryValueBoxes.png"), plot=ExecutiveSummaryValueBoxes, device="png", units="cm", width=30, height=5, dpi=reportDPI)
knitr::include_graphics(file.path("Results", "ExecutiveSummaryValueBoxes.png"), dpi=NA)
```
Basecalling was performed using the **`r config$basecaller`** software. Called reads were classified as either pass or fail depending on their mean quality score. For this analysis, a total of `r formatC(nrow(sequencedata), big.mark=",")` reads were basecalled and of these `r formatC(nrow(passedSeqs), big.mark=",")` (`r round(nrow(passedSeqs) / nrow(sequencedata) * 100, 1)`%) were passed as satisfying the quality metric. The passed reads contain a total of `r round(passedBases, 2)` Gb of DNA sequence. This passed-fraction amounts to `r round(passedBases / totalBases * 100, 1)`% of the total DNA nucleotide bases sequenced.
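Some older `sequencing_summary.txt` files lack the `passes_filtering` column; in that case the setup chunk of this report simply marks every read as passing. A minimal alternative sketch is shown below (not evaluated), assuming that the report's `qcThreshold` of 7 is an acceptable stand-in for the basecaller's own pass/fail cutoff.
```{r passFlagSketch, eval=FALSE, echo=TRUE}
# Sketch only: approximate a pass/fail flag from mean read quality when the
# basecaller did not emit a passes_filtering column.
# The qcThreshold value (7) is the one defined at the top of this report; it is
# an assumption and may differ from the cutoff used by your basecaller.
if (! "passes_filtering" %in% colnames(sequencedata)) {
  sequencedata$passes_filtering <- sequencedata$mean_qscore_template >= qcThreshold
}
```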
```{r qcPassGauge, echo=FALSE, include=TRUE, fig.align="center", fig.width=4}
df <- data.frame(matrix(nrow=1, ncol = 3))
names(df) <- c("variable", "percentage","label")
df$variable <- c("pass")
df$percentage <- c(round(length(which(sequencedata$passes_filtering==TRUE)) / nrow(sequencedata), 3))
df <- df %>% mutate(group=ifelse(percentage <0.6, "red",
ifelse(percentage>=0.6 & percentage<0.8, "orange","green")),
label=paste0(df$percentage*100, "%"))
title="Percentage of reads\npassing QC filter"
ggplot(df, aes(fill = group, ymax = percentage, ymin = 0, xmax = 2, xmin = 1)) +
geom_rect(aes(ymax=1, ymin=0, xmax=2, xmin=1), fill ="#ece8bd") +
geom_rect() +
coord_polar(theta = "y",start=-pi/2) + xlim(c(0, 2)) + ylim(c(0,2)) +
guides(fill=FALSE) +
guides(colour=FALSE) +
theme_void() +
theme(strip.background = element_blank(), strip.text.x = element_blank()) +
geom_text(aes(x = 0, y = 0, label = label), size=10) +
geom_text(aes(x=1.5, y=1.5, label=title), size=8) +
scale_fill_manual(values = c("red"="#C9146C", "orange"="#DA9112", "green"="#129188")) +
scale_colour_manual(values = c("red"="#C9146C", "orange"="#DA9112", "green"="#129188"))
```
\pagebreak
# Sequencing channel activity plot
The nanopores through which DNA is passed, and signal collected, are arrayed as a 2-dimensional matrix. A heatmap can be plotted showing channel productivity against spatial position on the matrix. Such a plot enables the identification of spatial artifacts that could result from membrane damage through e.g. the introduction of an air bubble. This heatmap representation of spatial activity shows only gross spatial aberrations. Since each channel can address four different pores (mux), the activity plot below shows the number of sequences produced per channel, not per pore.
```{r channel_plot, include=TRUE, echo=FALSE, fig.fullwidth = TRUE}
# create an empty read count container ... MinION or PromethION??
# https://gist.github.com/roblanf/df47b9748c3aae00809cc675aca79989
# build the map for R9.5 flowcell, as a long-form dataframe that translates
# channels into rows and columns on the flowcell. Good for plotting in R.
p1 = data.frame(channel=33:64, row=rep(1:4, each=8), col=rep(1:8, 4))
p2 = data.frame(channel=481:512, row=rep(5:8, each=8), col=rep(1:8, 4))
p3 = data.frame(channel=417:448, row=rep(9:12, each=8), col=rep(1:8, 4))
p4 = data.frame(channel=353:384, row=rep(13:16, each=8), col=rep(1:8, 4))
p5 = data.frame(channel=289:320, row=rep(17:20, each=8), col=rep(1:8, 4))
p6 = data.frame(channel=225:256, row=rep(21:24, each=8), col=rep(1:8, 4))
p7 = data.frame(channel=161:192, row=rep(25:28, each=8), col=rep(1:8, 4))
p8 = data.frame(channel=97:128, row=rep(29:32, each=8), col=rep(1:8, 4))
q1 = data.frame(channel=1:32, row=rep(1:4, each=8), col=rep(16:9, 4))
q2 = data.frame(channel=449:480, row=rep(5:8, each=8), col=rep(16:9, 4))
q3 = data.frame(channel=385:416, row=rep(9:12, each=8), col=rep(16:9, 4))
q4 = data.frame(channel=321:352, row=rep(13:16, each=8), col=rep(16:9, 4))
q5 = data.frame(channel=257:288, row=rep(17:20, each=8), col=rep(16:9, 4))
q6 = data.frame(channel=193:224, row=rep(21:24, each=8), col=rep(16:9, 4))
q7 = data.frame(channel=129:160, row=rep(25:28, each=8), col=rep(16:9, 4))
q8 = data.frame(channel=65:96, row=rep(29:32, each=8), col=rep(16:9, 4))
# long form as a data frame, i.e. map$channel[[1]] returns 33
channelMap = rbind(p1, p2, p3, p4, p5, p6, p7, p8, q1, q2, q3, q4, q5, q6, q7, q8)
hm.palette <- colorRampPalette(brewer.pal(9, 'Blues'), space='Lab') #RdPu, Oranges, Greens, YlOrRd, Purples
channelCounts <- as.data.frame(matrix(rep(0, 512), ncol=1))
channelCountRaw <- as.data.frame(table(unlist(sequencedata[, "channel"])), row.names=1)
channelCounts[row.names(channelCountRaw),] <- channelCountRaw[,1]
#channelMap <- cbind(channelMap[channelMap$channel,], frequency=channelCounts[channelMap$channel,])
channelMap <- merge(channelMap, channelCounts, by.x="channel", by.y=0)
colnames(channelMap)[4]<-"count"
channelMapMatrix <- reshape2::acast(channelMap, col ~ row, value.var = "count")
theme_update(plot.title = element_text(hjust = 0.5))
ggplot(channelMap, aes(x = row, y = col, fill = count)) +
geom_tile() +
geom_text(data=channelMap,aes(x=row, y=col,label=count,color=count),show.legend = F, size=2.5) +
scale_x_discrete(breaks=NULL) +
scale_y_discrete(breaks=NULL) +
coord_equal() +
scale_fill_gradientn(colours = hm.palette(100)) +
scale_color_gradient2(low = hm.palette(100), high = hm.palette(1)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(title="Channel activity plot showing number of reads per flowcell channel") +
theme(panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
legend.position="bottom",
legend.key.width=unit(5.6,"cm"))
```
\pagebreak
# Quality and length
The distribution of base-called DNA sequence lengths and their accompanying qualities are key metrics for the review of a sequencing library. This section of the QC review tutorial assesses the length and quality distributions for reads from this flowcell. We will review the total collection of sequences, including those that fail the mean quality filter.
```{r summaryStatMeasures, include=FALSE, echo=FALSE}
lenSorted <- rev(sort(passedSeqs$sequence_length_template))
passedMeanLength = round(mean(lenSorted), digits = 0)
N50 <- lenSorted[cumsum(lenSorted) >= sum(lenSorted)*0.5][1]
passedMeanQ = round(mean(passedSeqs$mean_qscore_template), digits = 1)
failedMeanQ = round(mean(failedSeqs$mean_qscore_template), digits = 1)
#N50 length is the length of the shortest contig such that the sum of contigs of equal length or longer is at least 50% of the total length of all contigs
```
```{r seqInfoPlot, include=TRUE, echo=FALSE, fig.fullwidth = TRUE, dpi=360, fig.width=9, fig.height=3}
figures <- 5
df <- data.frame(
x = cumsum(c(2, rep(6.5, figures-1))),
y = rep(2, figures),
h = rep(4, figures),
w = rep(6, figures))
df$info <- c(passedMeanLength, N50, passedMeanQ, failedMeanQ, prettyNum(max(passedSeqs$sequence_length_template), big.mark=","))
df$key <- c("Mean Read Length (nt)","N50","Mean Read Quality (QV)","Mean Failed QV","Longest Read")
df$icon <- fontawesome(c("fa-bar-chart", "fa-play", "fa-area-chart", "fa-bug", "fa-sort"))
df$colour <- rep("steelblue", figures)
ReadCharacteristicsValueBoxes <- ggplot(df, aes(x, y, height = h, width = w, label = key, fill = colour)) +
geom_tile(fill = brewer.pal(9,"Blues")[7]) +
geom_text(color = brewer.pal(9,"Blues")[3], hjust="left", nudge_y=-1.5, nudge_x=-2.6, size=3.5) +
geom_text(label=df$info, size=5.5, color = brewer.pal(9,"Blues")[2], fontface = "bold", nudge_x=-2.6, hjust="left") +
geom_text(label=df$icon, family='fontawesome-webfont', colour=brewer.pal(9,"Blues")[5], size=13.3, hjust="right", nudge_x=2.85, nudge_y=0.8) +
coord_fixed() +
scale_fill_brewer(type = "qual",palette = "Dark2") +
theme_void() +
guides(fill = F)
ggsave(file.path("Results", "ReadCharacteristicsValueBoxes.png"), plot=ReadCharacteristicsValueBoxes, device="png", units="cm", width=20, height=5, dpi=reportDPI)
knitr::include_graphics(file.path("Results", "ReadCharacteristicsValueBoxes.png"), dpi=NA)
```
The distribution of sequence lengths will be dependent on the protocols that have been used to extract and/or prepare the starting DNA. Sequences from amplicon DNA will have a tight distribution of read lengths, while sequences from genomic DNA will have a broader distribution. The distribution will be further influenced if a size-selection step has been used, and will also be dependent on the choice of sequencing library preparation kits. The read-length distribution should be assessed to see if the distribution is concordant with that expected.
\hfill\break
<!-- weighted read length histogram -->
```{r weightedreadlength, echo=FALSE, include=TRUE}
# https://stackoverflow.com/questions/6461209/how-to-round-up-to-the-nearest-10-or-100-or-x
roundUpNice <- function(x, nice=seq(from=1, to=10, by=0.25)) {
  if(length(x) != 1) stop("'x' must be of length 1")
  10^floor(log10(x)) * nice[[which(x <= 10^floor(log10(x)) * nice)[[1]]]]
}
# pick a friendly upper limit to render sequence lengths into a histogram
# here we're aiming for a robustly rounded up 97.5 quantile of the data (skip a few outliers ...)
# upperLimit <- roundUpNice(as.numeric(quantile(x=sequencedata$sequence_length_template, probs=c(0.975))))
upperLimit <- roundUpNice(as.numeric(quantile(x=sequencedata$sequence_length_template, probs=c(readlenpercentile))))
# an ideal histogram will have 40 or so bins
histogramBinCount <- 40
breakVal = roundUpNice(upperLimit / histogramBinCount)
breaks <- seq(0, to=upperLimit, by=breakVal)
binAssignments <- cut(sequencedata$sequence_length_template, breaks, include.lowest=TRUE, right=FALSE)
scrapeBinnedBases <- function(level, qcpass) {
  sum(subset(sequencedata[which(binAssignments == level), ], passes_filtering==qcpass)$sequence_length_template)
}
passedBinnedBases <- unlist(lapply(levels(binAssignments), scrapeBinnedBases, qcpass=TRUE))
failedBinnedBases <- unlist(lapply(levels(binAssignments), scrapeBinnedBases, qcpass=FALSE))
binnedBaseDist <- data.frame(length=head(breaks, -1), pass=passedBinnedBases, fail=failedBinnedBases)
binnedBaseMelt <- reshape2::melt(binnedBaseDist, id.vars=c("length"))
ggplot(binnedBaseMelt, aes(x=length, fill=variable, y=value)) +
geom_bar(stat="identity") +
xlab("Read length\n") + ylab("Number of bases sequenced\n") +
scale_fill_manual("QC", values=c("fail"=brewer.pal(6, "Paired")[1], "pass"=brewer.pal(6, "Paired")[2])) +
scale_x_continuous(limits=c(-breakVal,upperLimit), breaks=pretty(passedSeqs$sequence_length_template,n=40)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(title="Histogram showing the number of sequenced bases against sequence length", fill="QV filter")+
geom_vline(xintercept = N50, size = 1) +
annotate("text", x=N50, y=max(passedBinnedBases + failedBinnedBases), label = " N50", hjust=0, colour="SteelBlue") +
geom_vline(xintercept = passedMeanLength, size = 1) +
annotate("text", x=passedMeanLength, y=max(passedBinnedBases + failedBinnedBases), label = " Mean", hjust=0, colour="SteelBlue")
```
The weighted read length histogram above shows the binned distribution of sequence length against the number of sequenced nucleotides contained within each bin. This plot will show clear peaks if, for example, amplicons are sequenced or if size selection has been performed. The histogram includes annotations for the N50 and mean sequence lengths. N50 describes the sequence length where 50% of the sequenced bases are contained within reads of this length, or longer. The mean sequence length is the average sequence length across the whole sequence collection.
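To make the N50 definition concrete, the short sketch below (not evaluated) computes N50 for a toy vector of read lengths using the same logic applied to the real data earlier in this report.
```{r n50ToyExample, eval=FALSE, echo=TRUE}
# Toy illustration of the N50 calculation: sort lengths longest-first and take
# the first length at which the running total reaches half of all sequenced bases.
lengths <- c(100, 200, 300, 400, 1000)  # toy read lengths; 2,000 bases in total
sorted <- rev(sort(lengths))            # 1000, 400, 300, 200, 100
n50 <- sorted[cumsum(sorted) >= sum(sorted) * 0.5][1]
n50                                     # 1000 - this read alone contains >= 50% of the bases
```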
```{r lengthdistribution, include=TRUE, cache=FALSE, fig.fullwidth=FALSE, echo=FALSE}
scrapeBinnedReads <- function(level, qcpass) {
  length(subset(sequencedata[which(binAssignments == level), ], passes_filtering==qcpass)$sequence_length_template)
}
passedBinnedReads <- unlist(lapply(levels(binAssignments), scrapeBinnedReads, qcpass=TRUE))
failedBinnedReads <- unlist(lapply(levels(binAssignments), scrapeBinnedReads, qcpass=FALSE))
binnedReadDist <- data.frame(length=head(breaks, -1), pass=passedBinnedReads, fail=failedBinnedReads)
binnedReadMelt <- reshape2::melt(binnedReadDist, id.vars=c("length"))
ggplot(binnedReadMelt, aes(x=length, fill=variable, y=value)) +
geom_bar(stat="identity") +
xlab("Read length\n") + ylab("Number of reads\n") +
scale_fill_manual("QC", values=c("fail"=brewer.pal(6, "Paired")[1], "pass"=brewer.pal(6, "Paired")[2])) +
scale_x_continuous(limits=c(-breakVal,upperLimit), breaks=pretty(passedSeqs$sequence_length_template,n=40)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(title="Histogram showing distribution of read lengths across quality passing sequences", fill="QV filter")+
geom_vline(xintercept = N50, size = 1) +
annotate("text", x=N50, y=max(passedBinnedReads + failedBinnedReads), label = " N50", hjust=0, colour="SteelBlue") +
geom_vline(xintercept = passedMeanLength, size = 1) +
annotate("text", x=passedMeanLength, y=max(passedBinnedReads + failedBinnedReads), label = " Mean", hjust=0, colour="SteelBlue")
```
\hfill\break
The read-length histogram above, like the weighted version, is annotated with the N50 and mean sequence lengths. A histogram of mean QV scores, shown next, reveals the relative abundance of sequences of different qualities. The distribution of sequence qualities is shaded by the QV filter pass status. This QV filter is applied during the base-calling process as a modifiable parameter. For downstream analyses we recommend processing only the higher-quality sequence reads.
\hfill\break
```{r QvalueDistributions, include=TRUE, cache=FALSE, echo=FALSE, fig.margin=FALSE}
ggplot(sequencedata, aes(x=mean_qscore_template, fill=passes_filtering)) +
geom_histogram(breaks=seq(from=0, to=15, by=0.1)) +
scale_fill_manual(name="QC", values=c("TRUE"=brewer.pal(6, "Paired")[2], "FALSE"=brewer.pal(6, "Paired")[1]), labels=c( "pass", "fail"), breaks=c("TRUE", "FALSE")) +
labs(title="Plot showing distribution of quality scores across all reads") +
xlab("Mean Q score of read") +
ylab("Number of reads")
```
\hfill\break
The plot above shows the distribution of mean read quality scores across the whole sequence collection. The distribution has been shaded according to whether the sequence reads passed or failed the base-caller's quality filter.
\hfill\break
```{r lengthQualityDensityPlots, include=TRUE, cache=FALSE, echo=FALSE, warning=FALSE}
# prepare the density plot, but do not render
lq_dens <- ggplot(sequencedata, aes(log10(sequence_length_template), mean_qscore_template)) + geom_bin2d(bins=100)
# extract the density map from the plot
lq_dens_counts <- ggplot_build(lq_dens)$data[[1]]
if (binFilter > 0) {
  # remove the bins from the density map that do not contain sequence count above threshold
  lq_dens_counts <- lq_dens_counts[-which(lq_dens_counts$count <= binFilter),]
}
# directly plot this modified density map (stat=="identity")
ggplot(lq_dens_counts) +
geom_bin2d(aes(x,y,fill=count), stat="identity") +
scale_fill_distiller(palette="Blues", trans="reverse") +
geom_hline(yintercept = qcThreshold, size = 1) +
xlab("log10(read length)") +
ylab("read mean quality") +
scale_x_continuous(breaks = c(1,2,3,4,5), label = c("10", "100", "1000", "10,000", "100,000")) +
annotation_logticks(base = 10, sides = "b", scaled = TRUE) +
labs(title="Contour Plot showing distribution of quality scores against log10 read lengths (all reads)")
```
\hfill\break
The density plot of mean sequence quality against log10 sequence length is a useful graphic for showing patterns within the broader sequence collection. The density plot shown in the figure above has been de-speckled by omitting the rarer bins containing only `r binFilter` reads or fewer. This is mainly aesthetic and masks some speckle around the periphery of the main density map.
\hfill\break
\pagebreak
# Time/duty performance
Another key metric in the quality review of a sequencing run is an analysis of the temporal performance of the run. During a run each sequencing channel will address a number of different pores (mux), and individual pores may become temporarily or permanently blocked. It is therefore expected that sequencing productivity will decrease during a run. It is useful to consider whether the observed productivity decline is normal or whether it happens more rapidly than expected. A rapid pore decline could be indicative of contaminants within the sequencing library.
\hfill\break
Plotting the number of bases generated per unit time over the course of a sequencing run can reveal unexpected behaviours. In an ideal experiment there should not be any sudden decreases in performance.
```{r basesPerTimeBin, include=TRUE, cache=FALSE, echo=FALSE}
sequencedata$start_time <- sequencedata$start_time - min(sequencedata$start_time)
sequencedata$start_time <- sequencedata$start_time / scaling
# assuming a 48 hour run, 5 minute intervals
# SP:2019-07-24 both now defined from config.yaml, see top of this script
# sampleHours = 48
# sampleIntervalMinutes = 60
breaks = seq(0, sampleHours*60*60, by=60*sampleIntervalMinutes)
binass <- findInterval(sequencedata$start_time, breaks)
mergeItPerHour <- function(interval, binnedAssignments, filter) {
  totalbases = 0
  if (length(which(binnedAssignments==interval))>0) {
    subset <- sequencedata[which(binnedAssignments==interval), ]
    if (length(which(subset$passes_filtering == filter)) > 0) {
      totalbases = sum(subset[which(subset$passes_filtering == filter), "sequence_length_template"])
    }
  }
  # need to scale what is being returned - totalbases value is total bases within an interval (sampleIntervalMinutes)
  return(totalbases / 1e9 / sampleIntervalMinutes * 60)
}
binnedTemporalDataPerHour <- data.frame(
cbind(
time=breaks,
pass=unlist(lapply(seq(breaks), mergeItPerHour, binnedAssignments=binass,filter=TRUE)),
fail=unlist(lapply(seq(breaks), mergeItPerHour, binnedAssignments=binass, filter=FALSE))
)
)
binnedTemporalDataPerHour$time <- binnedTemporalDataPerHour$time / 60 / 60
ggplot(binnedTemporalDataPerHour, aes(time)) +
geom_line(aes(y = fail, colour = "fail"), size=1) +
geom_line(aes(y = pass, colour = "pass"), size=1) +
scale_color_manual(name="QV", values=c("fail"=brewer.pal(6, "Paired")[1], "pass"=brewer.pal(6, "Paired")[2])) +
xlab("Time (hours)") +
ylab("Gigabases sequenced per hour") +
labs(title="Plot showing sequence throughput against time")
```
The temporal data presented in the figure above has been scaled to gigabases of sequence produced per hour. For a finer resolution of performance, the **`R`** code that prepares this report could be modified to use a more frequent sample interval.
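For example, re-binning the run at 15-minute intervals only requires changing the interval used to construct the time breaks; a sketch (not evaluated) is shown below. In practice `sampleIntervalMinutes` is read from `config.yaml` at the top of this report, so the cleanest route is to edit the value there and re-knit.
```{r finerSamplingSketch, eval=FALSE, echo=TRUE}
# Sketch only: bin the run into 15-minute windows instead of hourly ones.
# sampleIntervalMinutes is normally taken from config.yaml; overriding it here
# is for illustration.
sampleIntervalMinutes <- 15
breaks <- seq(0, sampleHours * 60 * 60, by = 60 * sampleIntervalMinutes)
binass <- findInterval(sequencedata$start_time, breaks)
# the downstream per-interval summaries then use these finer bins unchanged
```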
```{r temporalPerformance, echo=FALSE}
# binnedTemporalDataPerHour is scaled to Gbp per hour - rescale to raw for cumulative plotting
binnedTemporalDataPerHour$pass <- binnedTemporalDataPerHour$pass / 60 * sampleIntervalMinutes
binnedTemporalDataPerHour$fail <- binnedTemporalDataPerHour$fail / 60 * sampleIntervalMinutes
# https://stackoverflow.com/questions/31404679/can-ggplot2-find-the-intersections-or-is-there-any-other-neat-way
acquireTimePoints <- which(binnedTemporalDataPerHour$pass > 0)
targetInterpolate <- approxfun(x=binnedTemporalDataPerHour[acquireTimePoints, "time"], y=cumsum(binnedTemporalDataPerHour[acquireTimePoints, "pass"]))
base50 <- sum(passedSeqs$sequence_length_template)/1e9*0.5
base90 <- sum(passedSeqs$sequence_length_template)/1e9*0.9
T50 <- optimize(function(t0) abs(targetInterpolate(t0) - base50),
interval = range(binnedTemporalDataPerHour[acquireTimePoints, "time"]))
T90 <- optimize(function(t0) abs(targetInterpolate(t0) - base90),
interval = range(binnedTemporalDataPerHour[acquireTimePoints, "time"]))
```
```{r cumulativeSequencePlotBP, include=TRUE, cache=FALSE, echo=FALSE, fig.margin=FALSE}
ggplot(binnedTemporalDataPerHour, aes(time)) +
geom_line(aes(y = cumsum(fail), colour = "fail"), size=1) +
geom_line(aes(y = cumsum(pass), colour = "pass"), size=1) +
scale_color_manual(name="QV", values=c("fail"=brewer.pal(6, "Paired")[1], "pass"=brewer.pal(6, "Paired")[2])) +
geom_segment(x=T50$minimum, y=0, xend=T50$minimum, yend=base50, colour="darkgray", size=1) +
geom_segment(x=0, y=base50, xend=T50$minimum, yend=base50, colour="darkgray", size=1) +
annotate("text", x=T50$minimum, y=base50, label=" T50", vjust=1, hjust=0, colour="SteelBlue") +
geom_segment(x=T90$minimum, y=0, xend=T90$minimum, yend=base90, colour="darkgray", size=1) +
geom_segment(x=0, y=base90, xend=T90$minimum, yend=base90, colour="darkgray", size=1) +
annotate("text", x=T90$minimum, y=base90, label=" T90", vjust=1, hjust=0, colour="SteelBlue") +
xlab("Time (hours)") +
ylab("Number of bases sequenced (Gigabases)") +
labs(title="Plot showing cumulative bases sequenced against time")
```
In addition to plotting the temporal production of data, the cumulative plot shown above shows how data are accumulated during the run. From this dataset, we have measured a total of `r round(sum(passedSeqs$sequence_length_template)/1e9, 2)` Gb of quality-passing sequence. We can identify the timepoint T50, the time by which 50% of the sequenced bases have been collected - `r round(T50$minimum, 2)` hours in this example. This is displayed on the graph along with T90, the time at which 90% of the sequenced bases have been acquired.
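An equivalent way to read T50 and T90 off the cumulative curve, without the optimisation step used above, is to interpolate time as a function of cumulative yield; the sketch below (not evaluated) illustrates the idea.
```{r t50InterpolationSketch, eval=FALSE, echo=TRUE}
# Sketch only: interpolate time as a function of cumulative passed yield, then
# evaluate at 50% and 90% of the total to obtain T50 and T90 directly.
acq <- which(binnedTemporalDataPerHour$pass > 0)          # ignore empty time bins
cumYield <- cumsum(binnedTemporalDataPerHour$pass[acq])   # cumulative gigabases
timeAtYield <- approxfun(x = cumYield, y = binnedTemporalDataPerHour$time[acq])
T50_hours <- timeAtYield(max(cumYield) * 0.5)
T90_hours <- timeAtYield(max(cumYield) * 0.9)
```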
```{r cumulativeSequencePlotReads, include=TRUE, cache=FALSE, echo=FALSE, fig.margin=TRUE}
mergeItReadsPerHour <- function(interval, binnedAssignments, filter) {
  totalreads = 0
  if (length(which(binnedAssignments==interval))>0) {
    subset <- sequencedata[which(binnedAssignments==interval), ]
    if (length(which(subset$passes_filtering == filter)) > 0) {
      totalreads = nrow(subset[which(subset$passes_filtering == filter),])
    }
  }
  # scale results to mean millions of reads per hour
  return(totalreads / 1e6 / sampleIntervalMinutes * 60)
}
binnedTemporalDataReadsPerHour <- data.frame(
cbind(time=breaks,
pass=unlist(lapply(seq(breaks), mergeItReadsPerHour, binnedAssignments=binass, filter=TRUE)),
fail=unlist(lapply(seq(breaks), mergeItReadsPerHour, binnedAssignments=binass, filter=FALSE))
)
)
binnedTemporalDataReadsPerHour$time <- binnedTemporalDataReadsPerHour$time / 60 / 60
# binnedTemporalDataReadsPerHour is scaled to millions of reads per hour - rescale to raw counts for cumulative plotting
binnedTemporalDataReadsPerHour$pass <- binnedTemporalDataReadsPerHour$pass / 60 * sampleIntervalMinutes
binnedTemporalDataReadsPerHour$fail <- binnedTemporalDataReadsPerHour$fail / 60 * sampleIntervalMinutes
ggplot(binnedTemporalDataReadsPerHour, aes(time)) +
geom_line(aes(y = cumsum(fail), colour = "fail"), size=1) +
geom_line(aes(y = cumsum(pass), colour = "pass"), size=1) +
scale_color_manual(name="QV", values=c("fail"=brewer.pal(6, "Paired")[1], "pass"=brewer.pal(6, "Paired")[2])) +
xlab("Time (hours)") +
ylab("Number of reads sequenced (Millions)") +
labs(title="Plot showing cumulative reads sequenced against time")
```
In addition to the cumulative plot of sequenced bases, an equivalent plot can be drawn for the number of sequenced reads - this is shown in the figure above. It is usually not too dissimilar in structure to the cumulative base plot.
\hfill\break
The speed/time plot is a useful tool to observe any substantial changes in sequencing speed. A marked slow-down in sequencing speed can indicate challenges within the sequencing chemistry that could have been caused by the method of DNA isolation or an abundance of small DNA fragments. Please contact our technical team if you see a profound slowdown with your sequencing.
```{r timeSpeedPlot, include=TRUE, cache=FALSE, echo=FALSE}
speedTime <- data.frame(segment=binass, rate=sequencedata$sequence_length_template / (sequencedata$duration/scaling))
ggplot(speedTime, aes(x=segment, y=rate, group=segment)) +
geom_boxplot(fill="steelblue", outlier.shape=NA) +
scale_x_continuous(name="Time (hours)") +
ylab("Sequencing rate (bases per second)") +
labs(title="boxplot showing distribution of translocation speed against time")
```
The data points in the speed-time plot above have been filtered to mask outlying sequences (those beyond the 95% range); the distributions of the boxplots and their 'whiskers' are unchanged.
\hfill\break
The quality/time plot is a useful tool to observe any substantial changes in sequencing quality (mean_qscore_template). A marked decay in sequencing quality can indicate challenges within the sequencing chemistry. Please contact our technical team if you see a profound quality loss with your sequencing.
```{r qualitySpeedPlot, include=TRUE, cache=FALSE, echo=FALSE}
qualityTime <- data.frame(segment=binass, pass=sequencedata$passes_filtering, avgqual=sequencedata$mean_qscore_template)
ggplot(qualityTime, aes(x=segment, y=avgqual, group=segment)) +
geom_boxplot(fill="steelblue", outlier.shape=NA) +
scale_x_continuous(name="Time (hours)") +
scale_y_continuous(limits = c(0,15)) +
ylab("read mean quality") +
labs(title="boxplot showing distribution of 'mean_qscore_template' values against time (from All reads)")
```
The equivalent plot, restricted to reads that pass the QC filter, is shown next.
```{r qualitySpeedPlotPass, include=TRUE, cache=FALSE, echo=FALSE}
qualityTimePass <- subset(qualityTime, qualityTime$pass==TRUE)
ggplot(qualityTimePass, aes(x=segment, y=avgqual, group=segment)) +
geom_boxplot(fill="steelblue", outlier.shape=NA) +
scale_x_continuous(name="Time (hours)") +
scale_y_continuous(limits = c(0,15)) +
ylab("read mean quality") +
labs(title="boxplot showing distribution of 'mean_qscore_template' values against time (from 'Pass reads' only)") +
geom_hline(yintercept = qcThreshold, size = 1)
```
The black line corresponds to the quality cutoff used to define 'Pass reads'.
```{r activeChannelsPlot, include=TRUE, cache=FALSE, echo=FALSE}
mergeActiveChannels <- function(interval, binnedAssignments) {
  totalChannels = 0
  if (length(which(binnedAssignments==interval))>0) {
    subset <- sequencedata[which(binnedAssignments==interval), ]
    totalChannels = length(unique(subset$channel))
  }
  return(totalChannels)
}
binnedTemporalChannels <- data.frame(time=breaks,
channels=unlist(lapply(seq(breaks), mergeActiveChannels, binnedAssignments=binass)
)
)
binnedTemporalChannels$time <- binnedTemporalChannels$time / 60 / 60
ggplot(binnedTemporalChannels, aes(time)) +
geom_step(aes(y = channels), size=1, colour = "Steelblue") +
xlab("Time (hours)") +
ylab("Number of channels producing data") +
labs(title="Plot showing number of functional channels against time")
```
The graph presented above shows the number of sequencing channels that are actively producing data over time. A channel is defined as active if one or more sequence reads are observed within a given time window (one hour for the default graph). Over the course of the run pores will block and the number of channels producing data is expected to decrease. Because each channel can switch between pores (mux) and strategies are applied to unblock pores, the number of functional channels may rise or fall at a given timepoint, but in general it will decline over time.
\pagebreak
# Demultiplexing
```{r demultiplex, cache=FALSE, echo=FALSE}
barcodes = 0
barcodeUnass = 0
barcodeRange <- c(0,0)
# if barcode_arrangement is lacking this could still be guppy called sequence?
if (!"barcode_arrangement" %in% colnames(passedSeqs)) {
  if ("barcodeFile" %in% names(config) && file.exists(config$barcodeFile)) {
    barcodedata <- data.table::fread(config$barcodeFile, select=c("read_id", "barcode_arrangement"), showProgress=TRUE, stringsAsFactors=FALSE)
    pso <- order(passedSeqs$read_id, method="radix")
    passedSeqs <- passedSeqs[pso, ]
    bco <- order(barcodedata$read_id, method="radix")
    barcodedata <- barcodedata[bco, ]
    barcodeMapping <- fmatch(passedSeqs$read_id, barcodedata$read_id)
    passedSeqs$barcode_arrangement <- barcodedata[barcodeMapping,c("barcode_arrangement")]
  }
}
```
```{r demultiplexB, cache=FALSE, echo=FALSE}
if ("barcode_arrangement" %in% names(passedSeqs)) {
  barcodedata=plyr::count(passedSeqs$barcode_arrangement)
  barcodedata=subset(barcodedata, freq > 150)
  names(barcodedata) <- gsub("x", "barcode", names(barcodedata))
  if ("unclassified" %in% barcodedata$barcode) {
    barcodes <- nrow(barcodedata[-which(barcodedata$barcode=="unclassified"),])
    barcodeUnass <- sum(barcodedata[-which(barcodedata$barcode=="unclassified"),"freq"]) / sum(barcodedata$freq) * 100
    barcodeRange <- range(subset(barcodedata, barcode!="unclassified")$freq)
  } else {
    barcodes <- nrow(barcodedata)
    barcodeUnass = 100
    barcodeRange <- range(barcodedata$freq)
  }
}
figures <- 3
df <- data.frame(
x = cumsum(c(2, rep(6.5, figures-1))),
y = rep(2, figures),
h = rep(4, figures),
w = rep(6, figures))
df$info <- c(round(barcodeUnass, digits = 1), barcodes, paste(barcodeRange,collapse="\n"))
df$key <- c("Reads with barcode (%)","Barcoded libraries", "barcode variance")
df$icon <- fontawesome(c('fa-pie-chart', 'fa-barcode', 'fa-sliders'))
df$colour <- rep("steelblue", figures)
```
```{r pseudoValueBarcodes, include=TRUE, echo=FALSE, fig.fullwidth = TRUE, dpi=360, fig.width=9, fig.height=2.5}
if (barcodes > 0) {
  MultiplexCharacteristicsValueBoxes <- ggplot(df, aes(x, y, height = h, width = w, label = key, fill = colour)) +
    geom_tile(fill = brewer.pal(9,"Blues")[7]) +
    geom_text(color = brewer.pal(9,"Blues")[3], hjust="left", nudge_y=-1.5, nudge_x=-2.6, size=5) +
    geom_text(label=df$info, size=10, color = brewer.pal(9,"Blues")[2], fontface = "bold", nudge_x=-2.6, hjust="left") +
    geom_text(label=df$icon, family='fontawesome-webfont', colour=brewer.pal(9,"Blues")[5], size=23, hjust="right", nudge_x=2.85, nudge_y=0.8) +
    coord_fixed() +
    scale_fill_brewer(type = "qual",palette = "Dark2") +
    theme_void() +
    guides(fill = F)
  ggsave(file.path("Results", "MultiplexCharacteristicsValueBoxes.png"), plot=MultiplexCharacteristicsValueBoxes, device="png", units="cm", width=30, height=5, dpi=reportDPI)
  knitr::include_graphics(file.path("Results", "MultiplexCharacteristicsValueBoxes.png"), dpi=NA)
}
```
```{r convenienceScoringLibraries, echo=FALSE}
if (barcodes > 0) {
  # https://www.ncbi.nlm.nih.gov/assembly/help/
  ncalc <- function(len.vector, n) {
    # N50 - length such that scaffolds of this length or longer include half the bases of the assembly
    len.sorted <- rev(sort(len.vector))
    len.sorted[cumsum(len.sorted) >= sum(len.sorted)*n][1]
  }
  lcalc <- function(len.vector, n) {
    # L50 - number of scaffolds that are longer than, or equal to, the N50 length and therefore include half the bases of the assembly
    len.sorted <- rev(sort(len.vector))
    which(cumsum(len.sorted) >= sum(len.sorted)*n)[1]
  }
  N50 <- ncalc(passedSeqs$sequence_length_template, 0.5)
  N90 <- ncalc(passedSeqs$sequence_length_template, 0.9)
  L50 <- lcalc(passedSeqs$sequence_length_template, 0.5)
  L90 <- lcalc(passedSeqs$sequence_length_template, 0.9)
  seqSummary <- function(barcodeId, myBarcode, myVector, myMethod, xlist=NA) {
    subVector <- myVector[which(myBarcode == barcodeId)]
    params <- list(subVector)
    if (!is.na(xlist)) {
      params <- append(params, xlist)
    }
    do.call(myMethod, params)
  }
  barcodedata <- cbind(barcodedata, "%"=round(barcodedata$freq / sum(barcodedata$freq) * 100, digits=1))
  barcodedata <- cbind(barcodedata, Mb=round(unlist(lapply(as.character(barcodedata$barcode), seqSummary, myBarcode=passedSeqs$barcode_arrangement, myVector=passedSeqs$sequence_length_template, myMethod="sum")) / 1e06, digits=0))
  barcodedata <- cbind(barcodedata, min=unlist(lapply(as.character(barcodedata$barcode), seqSummary, myBarcode=passedSeqs$barcode_arrangement, myVector=passedSeqs$sequence_length_template, myMethod="min")))
  barcodedata <- cbind(barcodedata, max=unlist(lapply(as.character(barcodedata$barcode), seqSummary, myBarcode=passedSeqs$barcode_arrangement, myVector=passedSeqs$sequence_length_template, myMethod="max")))
  barcodedata <- cbind(barcodedata, mean=round(unlist(lapply(as.character(barcodedata$barcode), seqSummary, myBarcode=passedSeqs$barcode_arrangement, myVector=passedSeqs$sequence_length_template, myMethod="mean")), digits=0))
  barcodedata <- cbind(barcodedata, N50=unlist(lapply(as.character(barcodedata$barcode), seqSummary, myBarcode=passedSeqs$barcode_arrangement, myVector=passedSeqs$sequence_length_template, myMethod="ncalc", xlist=list(n=0.5))))
  barcodedata <- cbind(barcodedata, L50=unlist(lapply(as.character(barcodedata$barcode), seqSummary, myBarcode=passedSeqs$barcode_arrangement, myVector=passedSeqs$sequence_length_template, myMethod="lcalc", xlist=list(n=0.5))))
  kable(barcodedata, format="markdown", caption="Table summarising barcode content")
}
```
`r if (barcodes > 0) {"The table above shows summary statistics for the barcode assignments within this sequence collection. The annotated barcode is presented along with the number of sequence reads assigned to it (freq), the percentage of reads assigned to the barcode (%), the megabases of DNA sequence (Mb), shortest read in nucleotides (min), longest read in nucleotides (max), mean sequence length in nucleotides (mean) and N50 and L50 values, again in nucleotides."}`
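The same per-barcode summary could also be expressed with `dplyr`; the sketch below (not evaluated) is offered only as a compact alternative and assumes that `passedSeqs` carries a `barcode_arrangement` column and that the `ncalc()`/`lcalc()` helpers defined above are in scope.
```{r barcodeDplyrSketch, eval=FALSE, echo=TRUE}
# Sketch only: per-barcode summary with dplyr. dplyr:: is used explicitly
# because plyr is loaded after dplyr in the setup chunk and masks summarise().
passedSeqs %>%
  dplyr::group_by(barcode_arrangement) %>%
  dplyr::summarise(freq = dplyr::n(),
                   Mb   = round(sum(sequence_length_template) / 1e6),
                   min  = min(sequence_length_template),
                   max  = max(sequence_length_template),
                   mean = round(mean(sequence_length_template)),
                   N50  = ncalc(sequence_length_template, 0.5),
                   L50  = lcalc(sequence_length_template, 0.5))
```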
```{r demultiplex_plot, include=TRUE, echo=FALSE, cache=FALSE, fig.margin=TRUE}
if (barcodes > 0) {
  if ("barcode_arrangement" %in% names(passedSeqs)) {
    # it's a run that used the --barcoding flag
    #library(extrafont)
    #loadfonts(device = "win")
    ggplot(barcodedata, aes(barcode, freq, fill=barcode)) +
      geom_bar(stat="identity", width=0.5, fill="#9ecae1") +
      xlab("\nDemultiplexed barcodes") +
      ylab("\nFrequency") +
      scale_y_continuous(expand = c(0,0)) +
      labs(title="Histogram showing abundance of different barcodes") +
      theme(axis.text.x = element_text(angle=45, hjust=1))
  }
}
```
`r if (barcodes > 0) {"The histogram above shows the abundance of different barcodes within the sequence collection. The size of the bar corresponds to the frequency of the observation - this is the number of sequence reads observed."}`
\pagebreak
# Glossary of terms
* __knit__ is the command to render an Rmarkdown file. The knitr package is used to embed code, the results of R analyses and their figures within the typeset text of the document
* __L50__ describes the number of sequences (or contigs) that are longer than, or equal to, the N50 length and therefore include half the bases of the assembly
* __N50__ describes the length (read length, contig length etc) where half the bases of the sequence collection are contained within reads/contigs of this length or longer
* __QV__ the quality value, -10log10(p), where p is the probability that a given base call is incorrect. QV may be reported either at the individual base level, or averaged across whole sequences (a short worked example follows this glossary)
* __sequencing_summary.txt__ is a summary file describing sequence characteristics following base calling with the Guppy software.
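As a short worked example of the QV definition above, the sketch below (not evaluated) converts Phred-style quality values into the per-base error probabilities they imply.
```{r qvErrorProbExample, eval=FALSE, echo=TRUE}
# QV = -10 * log10(p), so the implied per-base error probability is p = 10^(-QV/10).
qv_to_perror <- function(qv) 10^(-qv / 10)
qv_to_perror(7)    # ~0.20 - the default pass threshold used in this report
qv_to_perror(10)   # 0.10
qv_to_perror(20)   # 0.01
```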
\fontsize{8}{12}
```{r sessionInfo, eval=FALSE, echo=FALSE, comment=NA}
options(width = 100)
utils:::print.sessionInfo(sessionInfo()[-7], locale=FALSE)
```
\fontsize{10}{14}
`r slurpContent("Static/TutorialPostamble.md")`