From 27ba81cd309e13d8271d23531ca6edfa91c2e4d6 Mon Sep 17 00:00:00 2001 From: Jeremiah Wala Date: Tue, 7 Jul 2015 12:11:01 -0400 Subject: [PATCH] updated BamQCPlot --- R/BamQCPlot.R | 104 +++++++++++++++----------------------------------- 1 file changed, 31 insertions(+), 73 deletions(-) diff --git a/R/BamQCPlot.R b/R/BamQCPlot.R index 4ed5c04..150ae74 100644 --- a/R/BamQCPlot.R +++ b/R/BamQCPlot.R @@ -4,7 +4,8 @@ library(optparse) option_list = list( make_option(c("-i", "--input"), type = "character", default = "qcreport.txt", help = "Input txt file from a snowman preprocess qcreport.txt"), - make_option(c("-o", "--output"), type = "character", default = "qcreport.pdf", help = "Output pdf to generate") + make_option(c("-o", "--output"), type = "character", default = "qcreport.pdf", help = "Output pdf to generate"), + make_option(c("-r", "--readgroup"), type = "character", default = NULL, help = "Read group") ) parseobj = OptionParser(option_list=option_list) @@ -26,99 +27,56 @@ require(gridExtra) ## read the table con <- file(opt$input, open = "r") -df.mapq <- df.nm <- df.isize <- df.as <- df.xp <- df.len <- df.phred <- df.clip <- data.frame() -rg <- list() +rg <- df.mapq <- df.nm <- df.isize <- df.as <- df.xp <- df.len <- df.phred <- df.clip <- data.frame() ## function for parsing histogram strings -function(ll, id) { +parse_hist <- function(ll, id, rg) { ss <- strsplit(ll[id],',')[[1]] - df <- data.frame(matrix(unlist(strsplit(ss, '_')), nrow=length(ss), byrow=T)) + df <- data.frame(matrix(as.numeric(unlist(strsplit(ss, '_'))), nrow=length(ss), byrow=T), stringsAsFactors=FALSE) colnames(df) <- c("Min", "Max", "Count") + df$readgroup = rg return (df) } +line <- readLines(con, n = 1, warn = FALSE) while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) { ll <- strsplit(line, '\t')[[1]] thisrg = ll[1] - rg[[thisrg]] <- data.frame(readgroup = thisrg, ReadCount = as.numeric(ll[2]), Supplementary = as.numeric(ll[3]), + rg <- rbind(rg, data.frame(readgroup = thisrg, ReadCount = as.numeric(ll[2]), Supplementary = as.numeric(ll[3]), Unmapped = as.numeric(ll[4]), MateUnmapped = as.numeric(ll[5]), - QCFailed = as.numeric(ll[6]), Duplicate = as.numeric(ll[7])) - - - ss <- strsplit(ll[8],',')[[1]] - df.mapq <- data.frame(matrix(unlist(strsplit(ss, '_')), nrow=length(ss), byrow=T)) - colnames(df.mapq) <- c("Min", "Max", "Count") - - ## found a new one - #if (grepl("ReadGroup", line)) { - # thisrg = gsub("READGROUP:BI:(.*)", "\\1", line) - # rg[[thisrg]] <- data.frame(readgroup = thisrg) - #} else if (grepl("total", line)) { - # rg[[thisrg]]$total = as.numeric(gsub("total,([0-9]+)", "\\1", line)) - #} else if (grepl("unmap", line)) { - # rg[[thisrg]]$unmap = as.numeric(gsub("unmap,([0-9]+)", "\\1", line)) - #} else if (grepl("qcfail", line)) { - # rg[[thisrg]]$qcfail = as.numeric(gsub("qcfail,([0-9]+)", "\\1", line)) - #} else if (grepl("duplicate", line)) { - # rg[[thisrg]]$duplicate = as.numeric(gsub("duplicate,([0-9]+)", "\\1", line)) - #} else if (grepl("supplementary", line)) { - # rg[[thisrg]]$supp = as.numeric(gsub("supplementary,([0-9]+)", "\\1", line)) - #} else if (grepl("mapq", line)) { - # df.mapq <- rbind(df.mapq, as.numeric(strsplit(gsub("mapq,([0-9]+)", "\\1", line), ",")[[1]])) - #} else if (grepl("nm", line)) { - # df.nm <- rbind(df.nm, as.numeric(strsplit(gsub("nm,([0-9]+)", "\\1", line), ",")[[1]])) - #} else if (grepl("isize", line)) { - # df.isize <- rbind(df.isize, as.numeric(strsplit(gsub("isize,([0-9]+)", "\\1", line), ",")[[1]])) - #} else if (grepl("as", line)) { - # df.as <- rbind(df.as, as.numeric(strsplit(gsub("as,([0-9]+)", "\\1", line), ",")[[1]])) - #} else if (grepl("xp", line)) { - # df.xp <- rbind(df.xp, as.numeric(strsplit(gsub("xp,([0-9]+)", "\\1", line), ",")[[1]])) - #} else if (grepl("clip", line)) { - # df.clip <- rbind(df.clip, as.numeric(strsplit(gsub("clip,([0-9]+)", "\\1", line), ",")[[1]])) - #} else if (grepl("len", line)) { - # df.len <- rbind(df.len, as.numeric(strsplit(gsub("len,([0-9]+)", "\\1", line), ",")[[1]])) - #} else if (grepl("phred", line)) { - # df.phred <- rbind(df.phred, as.numeric(strsplit(gsub("phred,([0-9]+)", "\\1", line), ",")[[1]])) - #} else { - # stop(paste("Failed to read file at line:", line)) - #} + QCFailed = as.numeric(ll[6]), Duplicate = as.numeric(ll[7]))) + + df.mapq <- rbind(df.mapq, parse_hist(ll, 8, thisrg)) + df.nm <- rbind(df.nm, parse_hist(ll, 9, thisrg)) + df.isize <- rbind(df.isize, parse_hist(ll, 10, thisrg)) + df.clip <- rbind(df.clip, parse_hist(ll, 11, thisrg)) + #df.phred <- rbind(df.phred, parse_hist(ll, 12, thisrg)) + df.length <- rbind(df.length, parse_hist(ll, 13, thisrg)) } close(con) -colnames(df.mapq) <- seq(from=0,to=60) -colnames(df.nm) <- seq(from=0,to=ncol(df.nm)-1) -colnames(df.isize) <- seq(from=0,to=ncol(df.isize)-1) -colnames(df.xp) <- seq(from=0, to=ncol(df.xp)-1) -colnames(df.as) <- seq(from=0, to=ncol(df.as)-1) -colnames(df.len) <- seq(from=0, to=ncol(df.len)-1) -colnames(df.phred) <- seq(from=0, to=ncol(df.phred)-1) -colnames(df.clip) <- seq(from=0, to=ncol(df.clip)-1) -readg <- sapply(rg, function(x) x$readgroup) - -df.mapq$readgroup <- readg -df.nm$readgroup <- readg -df.isize$readgroup <- readg -df.xp$readgroup <- readg -df.as$readgroup <- readg -df.phred$readgroup <- readg -df.len$readgroup <- readg -df.clip$readgroup <- readg - -g.mapq <- ggplot(df.mapq.m <- melt(df.mapq, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(30,61)) + ylab('Reads') + xlab('Mapping Quality') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) -g.nm <- ggplot(df.nm.m <- melt(df.nm, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(0,50)) + ylab('Reads') + xlab('NM Tag') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) -g.isize <- ggplot(df.isize.m <- melt(df.isize, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(10,2001)) + ylab('Reads') + xlab('InsertSize') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) +if (is.null(opt$readgroup)) + opt$readgroup = unique(rg$readgroup) + + +cbbPalette <- rep(c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"), 50) + +#g.mapq <- ggplot(df.mapq, aes(x=as.numeric(Min), y=pmax(log(Count,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(30,61)) + ylab('Reads') + xlab('Mapping Quality') +g.mapq <- ggplot(df.mapq[df.mapq$readgroup %in% opt$readgroup, ], aes(x=Min, y=pmax(log(Count, 10),0), group=readgroup, color=readgroup, fill=readgroup)) + geom_bar(stat='identity', position='dodge') + theme_bw() + ylab('Reads') + xlab('Mapping Quality') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) +g.nm <- ggplot(df.nm.m <- melt(df.nm, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(0,50)) + ylab('Reads') + xlab('NM Tag') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) +g.isize <- ggplot(df.isize[df.isize$readgroup %in% opt$readgroup, ], aes(x=Min, y=pmax(log(Count,10),0), group=readgroup, color=readgroup, fill=readgroup)) + geom_line() + theme_bw() + ylab('Reads') + xlab('InsertSize') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) + scale_color_manual("Read Group", values=cbbPalette) + theme(legend.position = 'none') + coord_cartesian(xlim=c(200,500)) g.xp <- ggplot(df.xp.m <- melt(df.xp, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(0,100)) + ylab('Reads') + xlab('XP Tag') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) g.as <- ggplot(df.as.m <- melt(df.as, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(0,100)) + ylab('Reads') + xlab('AS Tag') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) -g.len <- ggplot(df.len.m <- melt(df.len, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(20,102)) + ylab('Reads') + xlab('Read Length') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) -g.clip <- ggplot(df.clip.m <- melt(df.clip, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(0,85)) + ylab('Reads') + xlab('Clipped bases') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) +g.len <- ggplot(df.len.m <- melt(df.len, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(20,102)) + ylab('Reads') + xlab('Read Length') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) +g.clip <- ggplot(df.clip[df.clip$readgroup %in% opt$readgroup, ], aes(x=Min, y=pmax(log(Count,10),0), group=readgroup, color=readgroup, fill=readgroup)) + geom_line() + theme_bw() + ylab('Reads') + xlab('Clipped Bases') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) + scale_color_manual("Read Group", values=cbbPalette) g.phred <- ggplot(df.phred.m <- melt(df.phred, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(0,43)) + ylab('Reads') + xlab('Mean read Phred quality') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) -pdf(opt$output, width=15, height=8) -print(grid.arrange(g.mapq, g.nm, g.isize, g.len, g.clip, g.phred, ncol=3)) -dev.off() +#pdf(opt$output, width=15, height=8) +ppdf(print(grid.arrange(g.isize, g.clip, ncol=2)), width=4, height=6) +#dev.off()