From 27ba81cd309e13d8271d23531ca6edfa91c2e4d6 Mon Sep 17 00:00:00 2001
From: Jeremiah Wala <jwala@broadinstitute.org>
Date: Tue, 7 Jul 2015 12:11:01 -0400
Subject: [PATCH] updated BamQCPlot

---
 R/BamQCPlot.R | 104 +++++++++++++++-----------------------------------
 1 file changed, 31 insertions(+), 73 deletions(-)

diff --git a/R/BamQCPlot.R b/R/BamQCPlot.R
index 4ed5c04..150ae74 100644
--- a/R/BamQCPlot.R
+++ b/R/BamQCPlot.R
@@ -4,7 +4,8 @@ library(optparse)
 
 option_list = list(
     make_option(c("-i", "--input"),  type = "character", default = "qcreport.txt",  help = "Input txt file from a snowman preprocess qcreport.txt"),
-    make_option(c("-o", "--output"), type = "character", default = "qcreport.pdf",  help = "Output pdf to generate")
+    make_option(c("-o", "--output"), type = "character", default = "qcreport.pdf",  help = "Output pdf to generate"),
+    make_option(c("-r", "--readgroup"), type = "character", default = NULL,  help = "Read group")
 )
 
 parseobj = OptionParser(option_list=option_list)
@@ -26,99 +27,56 @@ require(gridExtra)
 ## read the table
 con  <- file(opt$input, open = "r")
 
-df.mapq <- df.nm <- df.isize <- df.as <- df.xp <- df.len <- df.phred <- df.clip <- data.frame()
-rg <- list()
+rg <- df.mapq <- df.nm <- df.isize <- df.as <- df.xp <- df.len <- df.phred <- df.clip <- data.frame()
 
 ## function for parsing histogram strings
-function(ll, id) {
+parse_hist <- function(ll, id, rg) {
   ss <- strsplit(ll[id],',')[[1]]
-  df <- data.frame(matrix(unlist(strsplit(ss, '_')), nrow=length(ss), byrow=T))
+  df <- data.frame(matrix(as.numeric(unlist(strsplit(ss, '_'))), nrow=length(ss), byrow=T), stringsAsFactors=FALSE)
   colnames(df) <- c("Min", "Max", "Count")
+  df$readgroup = rg
   return (df)
 }
 
+line <- readLines(con, n = 1, warn = FALSE)
 while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) {
 
   ll <- strsplit(line, '\t')[[1]]
   
   thisrg = ll[1]
-  rg[[thisrg]] <- data.frame(readgroup = thisrg, ReadCount = as.numeric(ll[2]), Supplementary = as.numeric(ll[3]),
+  rg <- rbind(rg, data.frame(readgroup = thisrg, ReadCount = as.numeric(ll[2]), Supplementary = as.numeric(ll[3]),
                              Unmapped = as.numeric(ll[4]), MateUnmapped = as.numeric(ll[5]),
-                             QCFailed = as.numeric(ll[6]), Duplicate = as.numeric(ll[7]))
-
-
-  ss <- strsplit(ll[8],',')[[1]]
-  df.mapq <- data.frame(matrix(unlist(strsplit(ss, '_')), nrow=length(ss), byrow=T))
-  colnames(df.mapq) <- c("Min", "Max", "Count")
-  
-  ## found a new one
-  #if (grepl("ReadGroup", line)) {
-  #  thisrg = gsub("READGROUP:BI:(.*)", "\\1", line)
-  #  rg[[thisrg]] <- data.frame(readgroup = thisrg)
-  #} else if (grepl("total", line)) {
-  #  rg[[thisrg]]$total = as.numeric(gsub("total,([0-9]+)", "\\1", line))
-  #} else if (grepl("unmap", line)) {
-  #  rg[[thisrg]]$unmap = as.numeric(gsub("unmap,([0-9]+)", "\\1", line))
-  #} else if (grepl("qcfail", line)) {
-  #  rg[[thisrg]]$qcfail = as.numeric(gsub("qcfail,([0-9]+)", "\\1", line))
-  #} else if (grepl("duplicate", line)) {
-  #  rg[[thisrg]]$duplicate = as.numeric(gsub("duplicate,([0-9]+)", "\\1", line))
-  #} else if (grepl("supplementary", line)) {
-  #  rg[[thisrg]]$supp = as.numeric(gsub("supplementary,([0-9]+)", "\\1", line))    
-  #} else if (grepl("mapq", line)) {
-  #  df.mapq <- rbind(df.mapq, as.numeric(strsplit(gsub("mapq,([0-9]+)", "\\1", line), ",")[[1]]))
-  #} else if (grepl("nm", line)) {
-  #  df.nm <- rbind(df.nm, as.numeric(strsplit(gsub("nm,([0-9]+)", "\\1", line), ",")[[1]]))
-  #} else if (grepl("isize", line)) {
-  #  df.isize <- rbind(df.isize, as.numeric(strsplit(gsub("isize,([0-9]+)", "\\1", line), ",")[[1]]))
-  #} else if (grepl("as", line)) {
-  #  df.as <- rbind(df.as, as.numeric(strsplit(gsub("as,([0-9]+)", "\\1", line), ",")[[1]]))
-  #} else if (grepl("xp", line)) {
-  #  df.xp <- rbind(df.xp, as.numeric(strsplit(gsub("xp,([0-9]+)", "\\1", line), ",")[[1]]))
-  #} else if (grepl("clip", line)) {
-  #  df.clip <- rbind(df.clip, as.numeric(strsplit(gsub("clip,([0-9]+)", "\\1", line), ",")[[1]]))
-  #} else if (grepl("len", line)) {
-  #  df.len <- rbind(df.len, as.numeric(strsplit(gsub("len,([0-9]+)", "\\1", line), ",")[[1]]))
-  #} else if (grepl("phred", line)) {
-  #  df.phred <- rbind(df.phred, as.numeric(strsplit(gsub("phred,([0-9]+)", "\\1", line), ",")[[1]]))
-  #} else {
-  #  stop(paste("Failed to read file at line:", line))
-  #}
+                             QCFailed = as.numeric(ll[6]), Duplicate = as.numeric(ll[7])))
+
+  df.mapq <-  rbind(df.mapq, parse_hist(ll, 8, thisrg))
+  df.nm <-    rbind(df.nm, parse_hist(ll, 9, thisrg))
+  df.isize <- rbind(df.isize, parse_hist(ll, 10, thisrg))
+  df.clip <-  rbind(df.clip, parse_hist(ll, 11, thisrg))
+  #df.phred <- rbind(df.phred, parse_hist(ll, 12, thisrg))
+  df.length <- rbind(df.length, parse_hist(ll, 13, thisrg))
   
 }
 
 close(con)
 
-colnames(df.mapq) <- seq(from=0,to=60)
-colnames(df.nm) <- seq(from=0,to=ncol(df.nm)-1)
-colnames(df.isize) <- seq(from=0,to=ncol(df.isize)-1)
-colnames(df.xp) <- seq(from=0, to=ncol(df.xp)-1)
-colnames(df.as) <- seq(from=0, to=ncol(df.as)-1)
-colnames(df.len) <- seq(from=0, to=ncol(df.len)-1)
-colnames(df.phred) <- seq(from=0, to=ncol(df.phred)-1)
-colnames(df.clip) <- seq(from=0, to=ncol(df.clip)-1)
-readg <- sapply(rg, function(x) x$readgroup)
-
-df.mapq$readgroup  <- readg
-df.nm$readgroup    <- readg
-df.isize$readgroup <- readg
-df.xp$readgroup    <- readg
-df.as$readgroup    <- readg
-df.phred$readgroup <- readg
-df.len$readgroup   <- readg
-df.clip$readgroup  <- readg
-
-g.mapq  <- ggplot(df.mapq.m  <- melt(df.mapq, id='readgroup'),  aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(30,61))    + ylab('Reads') + xlab('Mapping Quality') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) 
-g.nm    <- ggplot(df.nm.m    <- melt(df.nm, id='readgroup'),    aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(0,50))     + ylab('Reads') + xlab('NM Tag') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) 
-g.isize <- ggplot(df.isize.m <- melt(df.isize, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(10,2001))  + ylab('Reads') + xlab('InsertSize') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) 
+if (is.null(opt$readgroup))
+  opt$readgroup = unique(rg$readgroup)
+
+
+cbbPalette <- rep(c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"), 50)
+
+#g.mapq  <- ggplot(df.mapq, aes(x=as.numeric(Min), y=pmax(log(Count,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(30,61))    + ylab('Reads') + xlab('Mapping Quality')
+g.mapq  <- ggplot(df.mapq[df.mapq$readgroup %in% opt$readgroup, ], aes(x=Min, y=pmax(log(Count, 10),0), group=readgroup, color=readgroup, fill=readgroup)) + geom_bar(stat='identity', position='dodge') + theme_bw() + ylab('Reads') + xlab('Mapping Quality') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) 
+g.nm    <- ggplot(df.nm.m    <- melt(df.nm, id='readgroup'),    aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(0,50))     + ylab('Reads') + xlab('NM Tag') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^')))
+g.isize <- ggplot(df.isize[df.isize$readgroup %in% opt$readgroup, ], aes(x=Min, y=pmax(log(Count,10),0), group=readgroup, color=readgroup, fill=readgroup)) + geom_line() + theme_bw() + ylab('Reads') + xlab('InsertSize') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^')))  + scale_color_manual("Read Group", values=cbbPalette) + theme(legend.position = 'none') + coord_cartesian(xlim=c(200,500))
 g.xp    <- ggplot(df.xp.m    <- melt(df.xp, id='readgroup'),    aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(0,100))    + ylab('Reads') + xlab('XP Tag') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) 
 g.as    <- ggplot(df.as.m    <- melt(df.as, id='readgroup'),    aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(0,100))    + ylab('Reads') + xlab('AS Tag') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) 
-g.len   <- ggplot(df.len.m   <- melt(df.len, id='readgroup'),   aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(20,102))   + ylab('Reads') + xlab('Read Length') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) 
-g.clip  <- ggplot(df.clip.m  <- melt(df.clip, id='readgroup'),  aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(0,85))     + ylab('Reads') + xlab('Clipped bases') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) 
+g.len   <- ggplot(df.len.m   <- melt(df.len, id='readgroup'),   aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(20,102))   + ylab('Reads') + xlab('Read Length') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^')))
+g.clip <- ggplot(df.clip[df.clip$readgroup %in% opt$readgroup, ], aes(x=Min, y=pmax(log(Count,10),0), group=readgroup, color=readgroup, fill=readgroup)) + geom_line() + theme_bw() + ylab('Reads') + xlab('Clipped Bases') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^')))  + scale_color_manual("Read Group", values=cbbPalette)
 g.phred <- ggplot(df.phred.m <- melt(df.phred, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(0,43))     + ylab('Reads') + xlab('Mean read Phred quality') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) 
 
 
-pdf(opt$output, width=15, height=8)
-print(grid.arrange(g.mapq, g.nm, g.isize, g.len, g.clip, g.phred, ncol=3))
-dev.off()
+#pdf(opt$output, width=15, height=8)
+ppdf(print(grid.arrange(g.isize, g.clip, ncol=2)), width=4, height=6)
+#dev.off()