Skip to content

Commit

Permalink
updated BamQCPlot
Browse files Browse the repository at this point in the history
  • Loading branch information
Jeremiah Wala committed Jul 7, 2015
1 parent 902bbd2 commit 27ba81c
Showing 1 changed file with 31 additions and 73 deletions.
104 changes: 31 additions & 73 deletions R/BamQCPlot.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ library(optparse)

option_list = list(
make_option(c("-i", "--input"), type = "character", default = "qcreport.txt", help = "Input txt file from a snowman preprocess qcreport.txt"),
make_option(c("-o", "--output"), type = "character", default = "qcreport.pdf", help = "Output pdf to generate")
make_option(c("-o", "--output"), type = "character", default = "qcreport.pdf", help = "Output pdf to generate"),
make_option(c("-r", "--readgroup"), type = "character", default = NULL, help = "Read group")
)

parseobj = OptionParser(option_list=option_list)
Expand All @@ -26,99 +27,56 @@ require(gridExtra)
## read the table
con <- file(opt$input, open = "r")

df.mapq <- df.nm <- df.isize <- df.as <- df.xp <- df.len <- df.phred <- df.clip <- data.frame()
rg <- list()
rg <- df.mapq <- df.nm <- df.isize <- df.as <- df.xp <- df.len <- df.phred <- df.clip <- data.frame()

## function for parsing histogram strings
function(ll, id) {
parse_hist <- function(ll, id, rg) {
ss <- strsplit(ll[id],',')[[1]]
df <- data.frame(matrix(unlist(strsplit(ss, '_')), nrow=length(ss), byrow=T))
df <- data.frame(matrix(as.numeric(unlist(strsplit(ss, '_'))), nrow=length(ss), byrow=T), stringsAsFactors=FALSE)
colnames(df) <- c("Min", "Max", "Count")
df$readgroup = rg
return (df)
}

line <- readLines(con, n = 1, warn = FALSE)
while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) {

ll <- strsplit(line, '\t')[[1]]

thisrg = ll[1]
rg[[thisrg]] <- data.frame(readgroup = thisrg, ReadCount = as.numeric(ll[2]), Supplementary = as.numeric(ll[3]),
rg <- rbind(rg, data.frame(readgroup = thisrg, ReadCount = as.numeric(ll[2]), Supplementary = as.numeric(ll[3]),
Unmapped = as.numeric(ll[4]), MateUnmapped = as.numeric(ll[5]),
QCFailed = as.numeric(ll[6]), Duplicate = as.numeric(ll[7]))


ss <- strsplit(ll[8],',')[[1]]
df.mapq <- data.frame(matrix(unlist(strsplit(ss, '_')), nrow=length(ss), byrow=T))
colnames(df.mapq) <- c("Min", "Max", "Count")

## found a new one
#if (grepl("ReadGroup", line)) {
# thisrg = gsub("READGROUP:BI:(.*)", "\\1", line)
# rg[[thisrg]] <- data.frame(readgroup = thisrg)
#} else if (grepl("total", line)) {
# rg[[thisrg]]$total = as.numeric(gsub("total,([0-9]+)", "\\1", line))
#} else if (grepl("unmap", line)) {
# rg[[thisrg]]$unmap = as.numeric(gsub("unmap,([0-9]+)", "\\1", line))
#} else if (grepl("qcfail", line)) {
# rg[[thisrg]]$qcfail = as.numeric(gsub("qcfail,([0-9]+)", "\\1", line))
#} else if (grepl("duplicate", line)) {
# rg[[thisrg]]$duplicate = as.numeric(gsub("duplicate,([0-9]+)", "\\1", line))
#} else if (grepl("supplementary", line)) {
# rg[[thisrg]]$supp = as.numeric(gsub("supplementary,([0-9]+)", "\\1", line))
#} else if (grepl("mapq", line)) {
# df.mapq <- rbind(df.mapq, as.numeric(strsplit(gsub("mapq,([0-9]+)", "\\1", line), ",")[[1]]))
#} else if (grepl("nm", line)) {
# df.nm <- rbind(df.nm, as.numeric(strsplit(gsub("nm,([0-9]+)", "\\1", line), ",")[[1]]))
#} else if (grepl("isize", line)) {
# df.isize <- rbind(df.isize, as.numeric(strsplit(gsub("isize,([0-9]+)", "\\1", line), ",")[[1]]))
#} else if (grepl("as", line)) {
# df.as <- rbind(df.as, as.numeric(strsplit(gsub("as,([0-9]+)", "\\1", line), ",")[[1]]))
#} else if (grepl("xp", line)) {
# df.xp <- rbind(df.xp, as.numeric(strsplit(gsub("xp,([0-9]+)", "\\1", line), ",")[[1]]))
#} else if (grepl("clip", line)) {
# df.clip <- rbind(df.clip, as.numeric(strsplit(gsub("clip,([0-9]+)", "\\1", line), ",")[[1]]))
#} else if (grepl("len", line)) {
# df.len <- rbind(df.len, as.numeric(strsplit(gsub("len,([0-9]+)", "\\1", line), ",")[[1]]))
#} else if (grepl("phred", line)) {
# df.phred <- rbind(df.phred, as.numeric(strsplit(gsub("phred,([0-9]+)", "\\1", line), ",")[[1]]))
#} else {
# stop(paste("Failed to read file at line:", line))
#}
QCFailed = as.numeric(ll[6]), Duplicate = as.numeric(ll[7])))

df.mapq <- rbind(df.mapq, parse_hist(ll, 8, thisrg))
df.nm <- rbind(df.nm, parse_hist(ll, 9, thisrg))
df.isize <- rbind(df.isize, parse_hist(ll, 10, thisrg))
df.clip <- rbind(df.clip, parse_hist(ll, 11, thisrg))
#df.phred <- rbind(df.phred, parse_hist(ll, 12, thisrg))
df.length <- rbind(df.length, parse_hist(ll, 13, thisrg))

}

close(con)

colnames(df.mapq) <- seq(from=0,to=60)
colnames(df.nm) <- seq(from=0,to=ncol(df.nm)-1)
colnames(df.isize) <- seq(from=0,to=ncol(df.isize)-1)
colnames(df.xp) <- seq(from=0, to=ncol(df.xp)-1)
colnames(df.as) <- seq(from=0, to=ncol(df.as)-1)
colnames(df.len) <- seq(from=0, to=ncol(df.len)-1)
colnames(df.phred) <- seq(from=0, to=ncol(df.phred)-1)
colnames(df.clip) <- seq(from=0, to=ncol(df.clip)-1)
readg <- sapply(rg, function(x) x$readgroup)

df.mapq$readgroup <- readg
df.nm$readgroup <- readg
df.isize$readgroup <- readg
df.xp$readgroup <- readg
df.as$readgroup <- readg
df.phred$readgroup <- readg
df.len$readgroup <- readg
df.clip$readgroup <- readg

g.mapq <- ggplot(df.mapq.m <- melt(df.mapq, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(30,61)) + ylab('Reads') + xlab('Mapping Quality') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^')))
g.nm <- ggplot(df.nm.m <- melt(df.nm, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(0,50)) + ylab('Reads') + xlab('NM Tag') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^')))
g.isize <- ggplot(df.isize.m <- melt(df.isize, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(10,2001)) + ylab('Reads') + xlab('InsertSize') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^')))
if (is.null(opt$readgroup))
opt$readgroup = unique(rg$readgroup)


cbbPalette <- rep(c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"), 50)

#g.mapq <- ggplot(df.mapq, aes(x=as.numeric(Min), y=pmax(log(Count,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(30,61)) + ylab('Reads') + xlab('Mapping Quality')
g.mapq <- ggplot(df.mapq[df.mapq$readgroup %in% opt$readgroup, ], aes(x=Min, y=pmax(log(Count, 10),0), group=readgroup, color=readgroup, fill=readgroup)) + geom_bar(stat='identity', position='dodge') + theme_bw() + ylab('Reads') + xlab('Mapping Quality') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^')))
g.nm <- ggplot(df.nm.m <- melt(df.nm, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(0,50)) + ylab('Reads') + xlab('NM Tag') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^')))
g.isize <- ggplot(df.isize[df.isize$readgroup %in% opt$readgroup, ], aes(x=Min, y=pmax(log(Count,10),0), group=readgroup, color=readgroup, fill=readgroup)) + geom_line() + theme_bw() + ylab('Reads') + xlab('InsertSize') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) + scale_color_manual("Read Group", values=cbbPalette) + theme(legend.position = 'none') + coord_cartesian(xlim=c(200,500))
g.xp <- ggplot(df.xp.m <- melt(df.xp, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(0,100)) + ylab('Reads') + xlab('XP Tag') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^')))
g.as <- ggplot(df.as.m <- melt(df.as, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(0,100)) + ylab('Reads') + xlab('AS Tag') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^')))
g.len <- ggplot(df.len.m <- melt(df.len, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(20,102)) + ylab('Reads') + xlab('Read Length') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^')))
g.clip <- ggplot(df.clip.m <- melt(df.clip, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(0,85)) + ylab('Reads') + xlab('Clipped bases') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^')))
g.len <- ggplot(df.len.m <- melt(df.len, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(20,102)) + ylab('Reads') + xlab('Read Length') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^')))
g.clip <- ggplot(df.clip[df.clip$readgroup %in% opt$readgroup, ], aes(x=Min, y=pmax(log(Count,10),0), group=readgroup, color=readgroup, fill=readgroup)) + geom_line() + theme_bw() + ylab('Reads') + xlab('Clipped Bases') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^'))) + scale_color_manual("Read Group", values=cbbPalette)
g.phred <- ggplot(df.phred.m <- melt(df.phred, id='readgroup'), aes(x=as.numeric(variable), y=pmax(log(value,10),0), group=readgroup, color=readgroup)) + geom_line() + theme_bw() + scale_x_continuous(limits=c(0,43)) + ylab('Reads') + xlab('Mean read Phred quality') + scale_y_continuous(breaks=seq(0,8), labels=parse(text=paste('10', seq(0,8), sep='^')))


pdf(opt$output, width=15, height=8)
print(grid.arrange(g.mapq, g.nm, g.isize, g.len, g.clip, g.phred, ncol=3))
dev.off()
#pdf(opt$output, width=15, height=8)
ppdf(print(grid.arrange(g.isize, g.clip, ncol=2)), width=4, height=6)
#dev.off()

0 comments on commit 27ba81c

Please sign in to comment.