diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml index 95605be..f3390cb 100644 --- a/.github/workflows/check-bioc.yml +++ b/.github/workflows/check-bioc.yml @@ -52,10 +52,11 @@ jobs: fail-fast: false matrix: config: - - { os: ubuntu-latest, r: '4.2', bioc: '3.16', cont: "bioconductor/bioconductor_docker:RELEASE_3_16", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } - - { os: ubuntu-latest, r: 'next', bioc: '3.17', cont: "bioconductor/bioconductor_docker:RELEASE_3_17", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } - - { os: macOS-latest, r: '4.2', bioc: '3.16'} - - { os: windows-latest, r: '4.2', bioc: '3.16'} + # This needs to be updated after each Bioconductor release. Please make sure we have the matching R and Bioc versions in it. + - { os: ubuntu-latest, r: '4.3', bioc: '3.18', cont: "bioconductor/bioconductor_docker:RELEASE_3_18", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } + - { os: ubuntu-latest, r: 'next', bioc: '3.19', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } + - { os: macOS-latest, r: '4.3', bioc: '3.18'} + - { os: windows-latest, r: '4.3', bioc: '3.18'} ## Check https://github.com/r-lib/actions/tree/master/examples ## for examples using the http-user-agent env: @@ -106,16 +107,16 @@ jobs: uses: actions/cache@v3 with: path: ${{ env.R_LIBS_USER }} - key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_16-r-4.2-${{ hashFiles('.github/depends.Rds') }} - restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_16-r-4.2- + key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-${{ runner.bioc }}-r-${{ runner.r }}-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-${{ runner.bioc }}-r-${{ runner.r }}- - name: Cache R packages on Linux if: "!contains(github.event.head_commit.message, '/nocache') && runner.os == 'Linux' " uses: actions/cache@v3 with: path: /home/runner/work/_temp/Library - key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_16-r-4.2-${{ hashFiles('.github/depends.Rds') }} - restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_16-r-4.2- + key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-${{ runner.bioc }}-r-${{ runner.r }}-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-${{ runner.bioc }}-r-${{ runner.r }}- - name: Install Linux system dependencies if: runner.os == 'Linux' @@ -249,7 +250,7 @@ jobs: rcmdcheck::rcmdcheck( args = c("--no-manual", "--no-vignettes", "--timings"), build_args = c("--no-manual", "--keep-empty-dirs", "--no-resave-data"), - error_on = "error", + error_on = "warning", check_dir = "check" ) shell: Rscript {0} @@ -313,7 +314,7 @@ jobs: if: failure() uses: actions/upload-artifact@master with: - name: ${{ runner.os }}-biocversion-RELEASE_3_16-r-4.2-results + name: ${{ runner.os }}-biocversion-${{ runner.bioc }}-r-${{ runner.r }}-results path: check ## Note that DOCKER_PASSWORD is really a token for your dockerhub diff --git a/DESCRIPTION b/DESCRIPTION index 82a444d..ed5e74e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -42,6 +42,7 @@ Imports: generics, GenomicRanges, ggplot2, + ggrepel, grDevices, heatmaply, pheatmap, @@ -69,7 +70,10 @@ Suggests: RMariaDB, AnnotationDbi, beeswarm, - covr + covr, + GenomeInfoDb, + ggbio, + biovizBase LinkingTo: Rcpp, RcppArmadillo diff --git a/NAMESPACE b/NAMESPACE index aa2cc67..266312d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -35,6 +35,7 @@ export(plotExpectedVsObservedCounts) export(plotExpressedGenes) export(plotExpressionRank) export(plotFPKM) +export(plotManhattan) export(plotPowerAnalysis) export(plotQQ) export(plotSizeFactors) @@ -53,6 +54,7 @@ exportMethods(plotAberrantPerSample) exportMethods(plotCountCorHeatmap) exportMethods(plotDispEsts) exportMethods(plotEncDimSearch) +exportMethods(plotManhattan) exportMethods(plotQQ) exportMethods(plotVolcano) exportMethods(results) @@ -73,6 +75,7 @@ importFrom(BiocGenerics,plotDispEsts) importFrom(BiocParallel,SerialParam) importFrom(BiocParallel,bpisup) importFrom(BiocParallel,bplapply) +importFrom(BiocParallel,bpmapply) importFrom(BiocParallel,bpparam) importFrom(BiocParallel,bpstart) importFrom(BiocParallel,bpstop) @@ -95,14 +98,22 @@ importFrom(DESeq2,sizeFactors) importFrom(GenomicFeatures,exonsBy) importFrom(GenomicFeatures,makeTxDbFromGFF) importFrom(GenomicRanges,GRanges) +importFrom(GenomicRanges,end) +importFrom(GenomicRanges,findOverlaps) importFrom(GenomicRanges,reduce) +importFrom(GenomicRanges,start) importFrom(GenomicRanges,width) importFrom(IRanges,IRanges) importFrom(PRROC,pr.curve) importFrom(RColorBrewer,brewer.pal) importFrom(S4Vectors,"metadata<-") +importFrom(S4Vectors,"values<-") importFrom(S4Vectors,DataFrame) +importFrom(S4Vectors,endoapply) importFrom(S4Vectors,metadata) +importFrom(S4Vectors,queryHits) +importFrom(S4Vectors,subjectHits) +importFrom(S4Vectors,values) importFrom(SummarizedExperiment,"assay<-") importFrom(SummarizedExperiment,"assays<-") importFrom(SummarizedExperiment,"colData<-") @@ -114,10 +125,13 @@ importFrom(SummarizedExperiment,assays) importFrom(SummarizedExperiment,colData) importFrom(SummarizedExperiment,mcols) importFrom(SummarizedExperiment,rowData) +importFrom(SummarizedExperiment,rowRanges) importFrom(generics,fit) importFrom(ggplot2,aes) importFrom(ggplot2,annotate) importFrom(ggplot2,element_blank) +importFrom(ggplot2,facet_grid) +importFrom(ggplot2,facet_wrap) importFrom(ggplot2,geom_abline) importFrom(ggplot2,geom_bar) importFrom(ggplot2,geom_histogram) @@ -130,6 +144,7 @@ importFrom(ggplot2,geom_vline) importFrom(ggplot2,ggplot) importFrom(ggplot2,ggtitle) importFrom(ggplot2,labs) +importFrom(ggplot2,quo_name) importFrom(ggplot2,scale_color_brewer) importFrom(ggplot2,scale_color_identity) importFrom(ggplot2,scale_color_manual) @@ -143,6 +158,7 @@ importFrom(ggplot2,theme_bw) importFrom(ggplot2,xlab) importFrom(ggplot2,ylab) importFrom(ggplot2,ylim) +importFrom(ggrepel,geom_text_repel) importFrom(grDevices,colorRampPalette) importFrom(graphics,abline) importFrom(graphics,axis) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 6eaad0b..98ad193 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -27,6 +27,11 @@ setGeneric("plotAberrantPerSample", function(object, ...) setGeneric("plotCountCorHeatmap", function(object, ...) standardGeneric("plotCountCorHeatmap")) +#' @rdname plotFunctions +#' @export +setGeneric("plotManhattan", function(object, ...) + standardGeneric("plotManhattan")) + #' @importFrom DESeq2 plotDispEsts #' @noRd #' @export diff --git a/R/ZscoreMatrix.R b/R/ZscoreMatrix.R index 38513d1..0e51c39 100644 --- a/R/ZscoreMatrix.R +++ b/R/ZscoreMatrix.R @@ -1,14 +1,13 @@ #' #' Z score computation #' -#' Computes the z scores for every count in the matrix. -#' The z score is defined in the log2 space as follows: -#' \ifelse{html}{ -#' \out{zij = (lij - mujl)/ -#' sigmajl}}{ -#' \deqn{z_{ij} = \frac{l_{ij} - \mu_j^l}{\sigma_j^l}}}, -#' where l is the log2 transformed normalized count and mu and sigma the -#' mean and standard deviation for gene j, respectively. +#' Computes the z scores for every count in the matrix. +#' The z score is defined in the log\eqn{_2}{[2]} space as follows: +#' \eqn{z_{ij} = \frac{l_{ij} - \mu_j^l}{ +#' \sigma_j^l}}{z_ij = (l[ij] - mu[j]^l)/sigma[ij]} +#' where \code{l} is the log\eqn{_2}{[2]} transformed normalized count and +#' \eqn{\mu}{mu} and \eqn{\sigma}{sigma} the mean and standard deviation +#' for gene \code{j} and sample \code{i}, respectively. #' #' @param ods OutriderDataSet #' @param ... Further arguments passed on to \code{ZscoreMatrix}. diff --git a/R/inputCheckerFunctions.R b/R/inputCheckerFunctions.R index 2e41689..27e9db5 100644 --- a/R/inputCheckerFunctions.R +++ b/R/inputCheckerFunctions.R @@ -15,6 +15,8 @@ checkCountRequirements <- function(ods, test=FALSE){ stop("There are genes without any read. Please filter first ", "the data with: ods <- filterExpression(ods)") } + # Filter out genes that have on average less than 1 count per 100 samples. + # Ony affects datasets with more than 100 samples. filterGenes <- filterGenes | rowSums(counts(ods)) < ncol(ods)/100 if(any(filterGenes) & isFALSE(test)){ stop("The model requires for each gene at least 1 read ", diff --git a/R/method-evaluation.R b/R/method-evaluation.R index 0cdd431..51b6785 100644 --- a/R/method-evaluation.R +++ b/R/method-evaluation.R @@ -28,7 +28,7 @@ aberrant.OUTRIDER <- function(object, padjCutoff=0.05, zScoreCutoff=0, #' @param by if the results should be summarized by 'sample', #' 'gene' or not at all (default). #' @param subsetName The name of a subset of genes of interest for which FDR -#' corected pvalues have been previously computed. Those FDR values +#' corected pvalues were previously computed. Those FDR values #' on the subset will then be used to determine aberrant status. #' Default is NULL (using transcriptome-wide FDR corrected pvalues). #' @param ... Currently not in use. diff --git a/R/method-plot.R b/R/method-plot.R index ccf8567..0ccb96d 100644 --- a/R/method-plot.R +++ b/R/method-plot.R @@ -32,6 +32,10 @@ #' Can also be a vector. Integers are treated as indices. #' @param padjCutoff,zScoreCutoff Significance or Z-score cutoff #' to mark outliers +#' @param label Indicates which genes or samples should be labeled. By default +#' all aberrant genes/samples are labelled. Can be set to NULL for +#' no labels. Provide a vector of geneIDs/sampleIDs to label +#' specific genes/samples. #' @param global Flag to plot a global Q-Q plot, default FALSE #' @param outlierRatio The fraction to be used for the outlier sample filtering #' @param normalized If TRUE, the normalized counts are used, the default, @@ -39,6 +43,20 @@ #' @param compareDisp If TRUE, the default, and if the autoCorrect normalization #' was used it computes the dispersion without autoCorrect and #' plots it for comparison. +#' @param xaxis Indicates which assay should be shown on the x-axis of the +#' volcano plot. Defaults to 'zscore'. Other options are 'fc' and +#' 'log2fc' for the fold-change or log2 fold-change. +#' @param value Indicates which assay is shown in the manhattan plot. Defaults +#' to 'pvalue'. Other options are 'zScore' and 'log2fc'. +#' @param featureRanges A GRanges object of the same length as the +#' OutriderDataSet object that contains the genomic positions of +#' features that are shown in the manhattan plot. +#' @param subsetName The name of a subset of genes of interest for which FDR +#' corrected pvalues were previously computed. Those FDR values +#' on the subset will then be used to determine aberrant status. +#' Default is NULL (using transcriptome-wide FDR corrected pvalues). +#' @param chr The chromosomes to be displayed in the \code{plotManhattan} +#' function. Default is \code{NULL}, i.e. all chromosomes are shown. #### Graphical parameters #' @param main Title for the plot, if missing a default title will be used. #' @param groups A character vector containing either group assignments of @@ -79,6 +97,9 @@ #' @param bins Number of bins used in the histogram. Defaults to 100. #' @param yadjust Option to adjust position of Median and 90 percentile labels. #' @param ylab The y axis label +#' @param chrColor A vector of length 2 giving the two colors used for +#' coloring alternating chromosomes in the manhattan plot. Default +#' colors are 'black' and 'darkgrey'. #' #### Additional ... parameter #' @param ... Additional parameters passed to plot() or plot_ly() if not stated @@ -143,6 +164,11 @@ #' precision-recall curve). From this plot the optimum should be choosen for #' the \code{q} in fitting process. #' +#' \code{plotManhattan}: Visualizes different metrics for each gene (pvalue, +#' log2 fold-change, z-score) along with the genomic coordinates of the +#' respective gene as a manhattan plot. Detected outlier genes are highlighted +#' in red. +#' #' @return If base R graphics are used nothing is returned else the plotly or #' the gplot object is returned. #' @@ -157,22 +183,35 @@ #' #' mcols(ods)$basepairs <- 300 # assign pseudo gene length for filtering #' ods <- filterExpression(ods) -#' ods <- OUTRIDER(ods, implementation=implementation) -#' +#' # restrict FDR correction to set of genes of interest per sample +#' genesOfInterest <- list(MUC1372 = c("ATPIF1", "UROD", "YBX1", +#' sample(rownames(ods), 25)), +#' MUC1360 = sample(rownames(ods), 50), +#' MUC1350 = sample(rownames(ods), 75), +#' X76619 = sample(rownames(ods), 20), +#' X74172 = sample(rownames(ods), 150)) +#' ods <- OUTRIDER(ods, implementation=implementation, subsets=list("exampleGenes"=genesOfInterest)) +#' #' plotAberrantPerSample(ods) +#' plotAberrantPerSample(ods, subsetName="exampleGenes") #' #' plotVolcano(ods, 49) #' plotVolcano(ods, 'MUC1365', basePlot=TRUE) +#' plotVolcano(ods, 'MUC1351', basePlot=TRUE, xaxis="log2fc", label=c("NBPF16")) +#' plotVolcano(ods, 'MUC1372', basePlot=TRUE, subsetName="exampleGenes") #' #' plotExpressionRank(ods, 35) -#' plotExpressionRank(ods, "NDUFS5", normalized=FALSE, +#' plotExpressionRank(ods, 35, subsetName="exampleGenes") +#' plotExpressionRank(ods, "NDUFS5", normalized=FALSE, label="MUC1372", #' log=FALSE, main="Over expression outlier", basePlot=TRUE) #' #' plotQQ(ods, 149) +#' plotQQ(ods, 149, subsetName="exampleGenes") #' plotQQ(ods, global=TRUE, outlierRatio=0.001) #' #' plotExpectedVsObservedCounts(ods, 149) #' plotExpectedVsObservedCounts(ods, "ATAD3C", basePlot=TRUE) +#' plotExpectedVsObservedCounts(ods, "UROD", subsetName="exampleGenes") #' #' plotExpressedGenes(ods) #' @@ -202,19 +241,35 @@ #' implementation=implementation) #' plotEncDimSearch(ods) #' } +#' +#' # To show the pvalues of a sample in a manhattan plot, rowRanges(ods) must +#' # contain the genomic position of each feature or a GRanges object must +#' # be provided +#' \dontrun{ +#' # in case rowRanges(ods) is a GRangesList, run this first once to speed up: +#' rowRanges(ods) <- unlist(endoapply(rowRanges(ods), range)) +#' } +#' gr <- GRanges( +#' seqnames=sample(paste0("chr", 1:22), nrow(ods), replace=TRUE), +#' ranges=IRanges(start=runif(nrow(ods), min=0, max=1e5), width=100)) +#' plotManhattan(ods, "MUC1350", value="pvalue", featureRanges=gr) +#' plotManhattan(ods, "MUC1350", value="l2fc", featureRanges=gr) +#' plotManhattan(ods, "MUC1372", featureRanges=gr, subsetName="exampleGenes") #' #' @rdname plotFunctions #' @aliases plotFunctions plotVolcano plotQQ plotExpectedVsObservedCounts #' plotExpressionRank plotCountCorHeatmap plotCountGeneSampleHeatmap #' plotAberrantPerSample plotFPKM plotDispEsts plotPowerAnalysis -#' plotEncDimSearch plotExpressedGenes plotSizeFactors +#' plotEncDimSearch plotExpressedGenes plotSizeFactors plotManhattan #' NULL plotVolcano.OUTRIDER <- function(object, sampleID, main, padjCutoff=0.05, - zScoreCutoff=0, - pch=16, basePlot=FALSE, col=c("gray", "firebrick")){ + zScoreCutoff=0, label="aberrant", + xaxis=c("zscore", "log2fc", "fc"), + pch=16, basePlot=FALSE, + col=c("gray", "firebrick"), subsetName=NULL){ if(missing(sampleID)){ stop("specify which sample should be plotted, sampleID = 'sample5'") } @@ -237,7 +292,8 @@ plotVolcano.OUTRIDER <- function(object, sampleID, main, padjCutoff=0.05, if(length(sampleID) > 1){ ans <- lapply(sampleID, plotVolcano, object=object, padjCutoff=padjCutoff, - zScoreCutoff=zScoreCutoff, basePlot=basePlot) + zScoreCutoff=zScoreCutoff, basePlot=basePlot, + subsetName=subsetName) return(ans) } if(missing(main)){ @@ -247,25 +303,61 @@ plotVolcano.OUTRIDER <- function(object, sampleID, main, padjCutoff=0.05, if(is.null(rownames(object))){ rownames(object) <- paste("feature", seq_len(nrow(object)), sep="_") } + + xaxis <- match.arg(xaxis) + if(xaxis == "zscore"){ + if("zScore" %in% assayNames(object)){ + xaxis <- zScore(object) + xaxis_name <- "Z-Score" + base_x_lab <- xaxis_name + } else{ + stop("Calculate zScores first or choose other xaxis type.") + } + + } else if(xaxis == "fc"){ + if("l2fc" %in% assayNames(object)){ + xaxis <- 2^assay(object, "l2fc") + xaxis_name <- "Fold change" + base_x_lab <- xaxis_name + } else{ + stop("Calculate log2fc first or choose other xaxis type.") + } + } else if(xaxis == "log2fc"){ + if("l2fc" %in% assayNames(object)){ + xaxis <- assay(object, "l2fc") + xaxis_name <- expression(paste(log[2], "(fold-change)")) + base_x_lab <- paste("log2(fold-change)") + } else{ + stop("Calculate log2fc first or choose other xaxis type.") + } + } else { + stop("Unknown xaxis type, choose one of zscore, fc, log2fc.") + } dt <- data.table( GENE_ID = rownames(object), pValue = pValue(object)[,sampleID], - padjust = padj(object)[,sampleID], - zScore = zScore(object)[,sampleID], + padjust = padj(object, subsetName=subsetName)[,sampleID], + # zScore = zScore(object)[,sampleID], + xaxis = xaxis[,sampleID], normCts = counts(object, normalized=TRUE)[,sampleID], medianCts = rowMedians(counts(object, normalized=TRUE)), expRank = apply( counts(object, normalized=TRUE), 2, rank)[,sampleID], aberrant = aberrant(object, padjCutoff=padjCutoff, - zScoreCutoff=zScoreCutoff)[,sampleID], + zScoreCutoff=zScoreCutoff, subsetName=subsetName)[,sampleID], color = col[1]) dt[aberrant == TRUE, color:=col[2]] + dt[is.na(padjust), color:=col[3]] + dt[aberrant == TRUE, aberrantLabel:="aberrant"] + dt[aberrant == FALSE, aberrantLabel:="not aberrant"] + dt[is.na(aberrant), aberrantLabel:="not in tested group"] # remove the NAs from the zScores for plotting - dt[is.na(zScore),zScore:=0] + dt[is.na(xaxis),xaxis:=0] - p <- ggplot(dt, aes(zScore, -log10(pValue), color=color, text=paste0( + p <- ggplot(dt, aes(xaxis, -log10(pValue), color=aberrantLabel, + label=GENE_ID, text=paste0( "Gene ID: ", GENE_ID, "
Sample ID: ", sampleID, "
Median normcount: ", round(medianCts, 2), @@ -273,14 +365,52 @@ plotVolcano.OUTRIDER <- function(object, sampleID, main, padjCutoff=0.05, "
expression rank: ", as.integer(expRank), "
nominal P-value: ", signif(pValue,3), "
adj. P-value: ", signif(padjust,3), - "
Z-score: ", signif(zScore,2)))) + + "
", xaxis_name, ": ", signif(xaxis,2)))) + geom_point() + theme_bw() + - xlab("Z-score") + + xlab(xaxis_name) + ylab(expression(paste(-log[10], "(", italic(P), "-value)"))) + ggtitle(main) + - scale_color_identity() + - theme(legend.position = 'none') + scale_color_manual(values=c("not aberrant"=col[1], "aberrant"=col[2], + "not in tested group"="gray90")) + + theme(legend.position = 'bottom', + legend.title=element_blank()) + + if(!is.null(subsetName)){ + p <- p + labs(subtitle=paste0("FDR across genes in the ", subsetName, + " group (N=", dt[!is.na(padjust), .N], ")")) + } + + # Log scale if fold change is plotted + if(isTRUE(basePlot)){ + if(!is(xaxis_name, "expression") && xaxis_name == 'Fold change'){ + p <- p + scale_x_log10(labels = scales::trans_format( + "log10", scales::math_format(10^.x))) + } + if(!is.null(label)){ + if(isScalarCharacter(label) && label == "aberrant"){ + if(nrow(dt[aberrant == TRUE,]) > 0){ + p <- p + + geom_text_repel(data=dt[aberrant == TRUE,], + aes(col=aberrantLabel), fontface='bold', + hjust=-.2, vjust=.2) + } + } + else{ + if(nrow(dt[GENE_ID %in% label]) > 0){ + p <- p + + geom_text_repel(data=subset(dt, GENE_ID %in% label), + aes(col=aberrantLabel), fontface='bold', + hjust=-.2, vjust=.2) + } + if(any(!(label %in% dt[,GENE_ID]))){ + warning("Did not find gene(s) ", + paste(label[!(label %in% dt[,GENE_ID])], + collapse=", "), " to label.") + } + } + } + } if(isFALSE(basePlot)){ p <- p + ylab(paste("-log10(P-value)")) @@ -297,7 +427,7 @@ setMethod("plotVolcano", signature(object="OutriderDataSet"), plotQQ.OUTRIDER <- function(object, geneID, main, global=FALSE, padjCutoff=0.05, zScoreCutoff=0, samplePoints=TRUE, legendPos="topleft", - outlierRatio=0.001, conf.alpha=0.05, + outlierRatio=0.001, conf.alpha=0.05, subsetName=NULL, pch=16, xlim=NULL, ylim=NULL, col=NULL){ checkOutriderDataSet(object) stopifnot(isScalarLogical(global)) @@ -318,6 +448,10 @@ plotQQ.OUTRIDER <- function(object, geneID, main, global=FALSE, padjCutoff=0.05, if(missing(main)){ main <- paste0('Q-Q plot for gene: ', geneID) } + if(!is.null(subsetName)){ + main <- paste0(main, "\nFDR across genes in the ", subsetName, + " group") + } if(is.null(col)){ col <- c('black', 'firebrick') } @@ -326,7 +460,7 @@ plotQQ.OUTRIDER <- function(object, geneID, main, global=FALSE, padjCutoff=0.05, pointCex <- 1 #data table with expected and observerd log10(pValues) aberrantEvent <- aberrant(object[geneID,], padjCutoff=padjCutoff, - zScoreCutoff=zScoreCutoff, by='sample') + zScoreCutoff=zScoreCutoff, subsetName=subsetName, by='sample') df <- data.table(obs= -log10(pVal), pch=pch, subset=FALSE, col=ifelse(aberrantEvent, col[2], col[1])) @@ -373,7 +507,7 @@ plotQQ.OUTRIDER <- function(object, geneID, main, global=FALSE, padjCutoff=0.05, if(is.null(ylim)){ ylim=range(df[,obs], na.rm=TRUE) } - plot(NA, xlim=xlim, ylim=ylim, main=main, + plot(NA, xlim=xlim, ylim=ylim, main=main, xlab=expression( paste(-log[10], " (expected ", italic(P), "-value)")), ylab=expression( @@ -449,7 +583,8 @@ setMethod("plotQQ", signature(object="OutriderDataSet"), plotQQ.OUTRIDER) #' @rdname plotFunctions #' @export plotExpectedVsObservedCounts <- function(ods, geneID, main, basePlot=FALSE, - log=TRUE, groups=c(), groupColSet='Set1', ...){ + log=TRUE, groups=c(), groupColSet='Set1', label="aberrant", + subsetName=NULL, ...){ # check user input checkOutriderDataSet(ods) @@ -463,6 +598,11 @@ plotExpectedVsObservedCounts <- function(ods, geneID, main, basePlot=FALSE, if (missing(main)) { main <- paste("Predicted expression plot:", geneID) } + if(is.null(subsetName)){ + subtitle <- NULL + } else{ + subtitle <- paste0("FDR across genes in the ", subsetName, " group") + } ods <- ods[geneID] cnts <- data.table( @@ -470,7 +610,8 @@ plotExpectedVsObservedCounts <- function(ods, geneID, main, basePlot=FALSE, sampleID = colnames(ods), observed = as.vector(counts(ods)) + isTRUE(log), expected = as.vector(normalizationFactors(ods)) + isTRUE(log), - aberrant = as.vector(aberrant(ods))) + aberrant = as.vector(aberrant(ods, subsetName=subsetName))) + cnts[is.na(aberrant), aberrant:=FALSE] # group assignment if(length(groups) != ncol(ods)) { @@ -483,14 +624,15 @@ plotExpectedVsObservedCounts <- function(ods, geneID, main, basePlot=FALSE, groups[is.na(groups)] <- 'NA' cnts[, group := groups] - g <- ggplot(cnts, aes(expected, observed, text=paste0( + g <- ggplot(cnts, aes(expected, observed, label=sampleID, + text=paste0( "Gene ID: ", feature_id, "
", "Sample ID: ", sampleID, "
", "Raw count: ", observed, "
", "Expected count: ", round(expected, 2), "
"))) + theme_bw() + geom_abline(slope = 1, intercept = 0) + - labs(title = main, + labs(title = main, subtitle=subtitle, x=paste('Expected counts', ifelse(isTRUE(log), '+ 1', '')), y=paste('Raw counts', ifelse(isTRUE(log), '+ 1', ''))) @@ -512,11 +654,35 @@ plotExpectedVsObservedCounts <- function(ods, geneID, main, basePlot=FALSE, } else { # no grouping given point_mapping$colour <- substitute(aberrant) g <- g + geom_point(point_mapping) + - scale_color_manual(values = c("gray", "firebrick")) + + scale_color_manual(values = c("FALSE"="gray", + "TRUE"="firebrick")) + theme(legend.position = 'none') } if (isTRUE(basePlot)) { + if(!is.null(label)){ + if(isScalarCharacter(label) && label == "aberrant"){ + if(nrow(cnts[aberrant == TRUE,]) > 0){ + g <- g + + geom_text_repel(data=cnts[aberrant == TRUE,], + aes(col=aberrant), fontface='bold', + hjust=-.2, vjust=.5) + } + } + else{ + if(nrow(cnts[sampleID %in% label]) > 0){ + g <- g + + geom_text_repel(data=subset(cnts, sampleID %in% label), + aes(col=aberrant), fontface='bold', + hjust=-.2, vjust=.5) + } + if(any(!(label %in% cnts[,sampleID]))){ + warning("Did not find sample(s) ", + paste(label[!(label %in% cnts[,sampleID])], + collapse=", "), " to label.") + } + } + } return(g) } ggplotly(g, tooltip="text") @@ -528,7 +694,8 @@ plotExpectedVsObservedCounts <- function(ods, geneID, main, basePlot=FALSE, plotExpressionRank <- function(ods, geneID, main, padjCutoff=0.05, zScoreCutoff=0, normalized=TRUE, basePlot=FALSE, log=TRUE, col=c("gray", "firebrick"), groups=c(), - groupColSet='Accent'){ + groupColSet='Accent', label="aberrant", + subsetName=NULL){ # check user input checkOutriderDataSet(ods) if(isTRUE(normalized) & is.null(sizeFactors(ods))){ @@ -557,7 +724,7 @@ plotExpressionRank <- function(ods, geneID, main, padjCutoff=0.05, if(length(geneID) > 1){ ans <- lapply(geneID, plotExpressionRank, ods=ods, padjCutoff=padjCutoff, zScoreCutoff=zScoreCutoff, - basePlot=basePlot, normalized=normalized) + basePlot=basePlot, normalized=normalized, subsetName=subsetName) return(ans) } stopifnot(isScalarValue(geneID)) @@ -575,10 +742,11 @@ plotExpressionRank <- function(ods, geneID, main, padjCutoff=0.05, dt[, norm_rank:= rank(counts, ties.method = 'first')] if(all(c('pValue', 'padjust', 'zScore') %in% assayNames(ods))){ dt[, pValue := assay(ods, 'pValue')[1,]] - dt[, padjust := assay(ods, 'padjust')[1,]] + dt[, padjust := padj(ods, subsetName=subsetName)[1,]] dt[, zScore := assay(ods, 'zScore')[1,]] dt[, aberrant := aberrant(ods, padjCutoff=padjCutoff, - zScoreCutoff=zScoreCutoff)[1,]] + zScoreCutoff=zScoreCutoff, subsetName=subsetName)[1,]] + dt[is.na(aberrant), aberrant:=FALSE] } else { dt[,aberrant:=FALSE] } @@ -587,9 +755,15 @@ plotExpressionRank <- function(ods, geneID, main, padjCutoff=0.05, if(missing(main)){ main <- paste("Expression rank plot:", geneID) } + if(is.null(subsetName)){ + subtitle <- NULL + } else{ + subtitle <- paste0("FDR across genes in the ", subsetName, " group") + } # create ggplot object - g <- ggplot(data=dt, aes(x = norm_rank, y = counts, text = paste0( + g <- ggplot(data=dt, aes(x = norm_rank, y = counts, label = sampleID, + text = paste0( "Gene ID: ", geneID, "
", "Sample ID: ", sampleID, "
", ifelse(uniqueN(groups) == 1, "", paste0("Group: ", group, "
")), @@ -601,7 +775,7 @@ plotExpressionRank <- function(ods, geneID, main, padjCutoff=0.05, "nominal P-value: ", sprintf("%1.1E", pValue), "
", "Z-score: ", round(zScore, digits = 1), "
") ))) + - labs(title = main, x = 'Sample rank', y = ylab) + + labs(title = main, subtitle=subtitle, x = 'Sample rank', y = ylab) + theme_bw() if(isTRUE(log)){ @@ -628,6 +802,29 @@ plotExpressionRank <- function(ods, geneID, main, padjCutoff=0.05, } if(isTRUE(basePlot)){ + if(!is.null(label)){ + if(isScalarCharacter(label) && label == "aberrant"){ + if(nrow(dt[aberrant == TRUE,]) > 0){ + g <- g + + geom_text_repel(data=dt[aberrant == TRUE,], + aes(col=aberrant), fontface='bold', + hjust=-.2, vjust=.5) + } + } + else{ + if(nrow(dt[sampleID %in% label]) > 0){ + g <- g + + geom_text_repel(data=subset(dt, sampleID %in% label), + aes(col=aberrant), fontface='bold', + hjust=-.2, vjust=.5) + } + if(any(!(label %in% dt[,sampleID]))){ + warning("Did not find sample(s) ", + paste(label[!(label %in% dt[,sampleID])], + collapse=", "), " to label.") + } + } + } return(g) } return(ggplotly(g)) @@ -865,9 +1062,15 @@ plotHeatmap <- function(ods, mtx, annotation_row=NULL, annotation_col=NULL, plotAberrantPerSample.OUTRIDER <- function(object, main='Aberrant Genes per Sample', outlierRatio=0.001, col='Dark2', yadjust=1.2, - ylab="#Aberrantly expressed genes", ...){ + ylab="Aberrantly expressed genes", + subsetName=NULL, ...){ oneOffset <- 0.8 - count_vector <- sort(aberrant(object, by="sample", ...)) + count_vector <- aberrant(object, by="sample", + subsetName=subsetName, ...) + testedSamples <- colnames(object)[which( + colSums(is.na(padj(object, subsetName=subsetName))) != nrow(object))] + count_vector <- sort(count_vector[testedSamples]) + hlines = quantile(count_vector, c(0.5, 0.9)) hlines[hlines == 0] <- oneOffset @@ -876,13 +1079,19 @@ plotAberrantPerSample.OUTRIDER <- function(object, y=c(count_vector, c(1, oneOffset)[(count_vector != 0) + 1]), fill=!count_vector <= max(1, length(object)*outlierRatio)) + if(is.null(subsetName)){ + subtitle <- NULL + } else{ + subtitle <- paste0("FDR across genes in the ", subsetName, " group") + } + g <- ggplot(dt2p, aes(x=x, y=y, fill=fill)) + geom_bar(stat = "Identity") + theme_bw() + scale_y_log10(limits=c(oneOffset, NA)) + ylab(ylab) + xlab("Sample rank") + - ggtitle(main) + + ggtitle(main, subtitle=subtitle) + theme(legend.position="none") + geom_hline(yintercept=hlines) + annotate("text", label=c("Median", "90^th ~ percentile"), @@ -1128,3 +1337,247 @@ checkDeprication <- function(names2check, ...){ "\t'", names2check[i], "' --> '", names(names2check[i]), "'") } } + +plotManhattan.OUTRIDER <- function(object, sampleID, value="pvalue", + chr=NULL, main=paste0("Sample: ", sampleID), + featureRanges=rowRanges(object), + subsetName=NULL, + chrColor = c("black", "darkgrey")){ + requireNamespace("ggbio") + requireNamespace("GenomeInfoDb") + + # check user input + checkOutriderDataSet(object) + stopifnot("Sample not in ods" = sampleID %in% colnames(object)) + stopifnot("Value should be either pvalue, zscore or l2fc" = + value %in% c('pvalue', 'pValue', 'pv', 'zscore', 'zScore', + 'l2fc', 'L2FC', 'log2fc')) + + # get granges from rowRanges(ods) or provided ranges, check dimension + if(is.null(featureRanges)){ + if(is.null(rowRanges(object))){ + stop("No rowRanges(ods) found. Assign them first to use ", + "this function or provide a GRanges object.") + } + } else if(length(featureRanges) != nrow(object)){ + stop("The provided feature ranges must be of the same length as the ", + "ods object.") + } else if(is(featureRanges, "GRanges")){ + gr <- featureRanges + } else if(is(featureRanges, "GRangesList")){ + gr <- unlist(endoapply(featureRanges, range)) + # faster than range but gives error for empty GRangesList: + # gr <- unlist(endoapply(featureRanges, function(rr) rr[1,])) + if(length(gr) != nrow(object)){ + stop("The provided gene ranges do not contain ranges for all rows ", + "of the ods object") + } + } else{ + stop("The provided feature_ranges must a a GRanges or GRangesList ", + "object.") + } + + GenomeInfoDb::seqlevelsStyle(gr) <- 'NCBI' + + # Add values to granges + if(value %in% c('pvalue', 'pValue', 'pv')){ + gr$value <- -log10(pValue(object)[, sampleID]) + # value <- '-log10(pvalue)' + value <- expression(paste(-log[10], "(P-value)")) + } + if(value %in% c('zscore', 'zScore')){ + gr$value <- zScore(object)[, sampleID] + } + if(value %in% c('l2fc', 'L2FC', 'log2fc')){ + if(!"l2fc" %in% assayNames(object)){ + stop("Please compute the log2 fold changes first before ", + "retrieving them.") + } + gr$value <- assay(object, "l2fc")[, sampleID] + value <- expression(paste(log[2], "(fold-change)")) + } + gr$aberrant <- aberrant(object, subsetName=subsetName)[,sampleID] + + # Sort granges for plot + gr <- GenomeInfoDb::sortSeqlevels(gr) + gr <- sort(gr) + + # subset to chromosomes in chrSubset if requested + if(!is.null(chr)){ + # check input + if(any(grepl("chr", chr))){ + chr <- gsub("chr", "", chr) + } + if(!all(chr %in% unique(GenomeInfoDb::seqnames(gr)))){ + stop("Not all chromosomes selected for subsetting are present ", + "in the GRanges object.") + } + + # subset + gr <- gr[as.character(GenomeInfoDb::seqnames(gr)) %in% chr] + + # add chr to plot title if only one chr given + if(length(chr) == 1){ + main <- paste0(main, "; ", + paste("chr", chr, collapse=", ", sep="")) + } + } + if(is.null(subsetName)){ + subtitle <- NULL + } else{ + subtitle <- paste0("FDR across genes in the ", subsetName, " group") + } + + p <- plotGrandLinear.adapted(gr, aes(y = value), + color = chrColor, + highlight.gr = gr[which(gr$aberrant == TRUE)], + highlight.col = 'firebrick', + highlight.overlap = 'equal', + use.genome.coords=is.null(chr)) + + labs(x="Chromosome", y = value, title=main, subtitle=subtitle) + + return(p) +} + +#' @rdname plotFunctions +#' @export +setMethod("plotManhattan", signature="OutriderDataSet", + plotManhattan.OUTRIDER) + +#' +#' Adapted function from ggbio package to create manhattan plot. +#' Adapted to allow manhattan plots creation for a subset of chromomes only, +#' as well as highlighting only ranges that exactly match. Uses functions +#' from package biovizBase. +#' +#' @noRd +plotGrandLinear.adapted <- function (obj, ..., facets, space.skip = 0.01, + geom = NULL, cutoff = NULL, cutoff.color = "red", cutoff.size = 1, + legend = FALSE, xlim, ylim, xlab, ylab, main, highlight.gr = NULL, + highlight.name = NULL, highlight.col = "red", highlight.label = TRUE, + highlight.label.size = 5, highlight.label.offset = 0.05, + highlight.label.col = "black", + highlight.overlap = c("any", "start", "end", "within", "equal"), + spaceline = FALSE, + use.genome.coords=TRUE){ + requireNamespace("biovizBase") + if (is.null(geom)) + geom <- "point" + args <- list(...) + args.aes <- biovizBase::parseArgsForAes(args) + args.non <- biovizBase::parseArgsForNonAes(args) + two.color <- c("#0080FF", "#4CC4FF") + .is.seq <- FALSE + if (!"colour" %in% names(args.aes)) { + if (!any(c("color", "colour") %in% names(args.non))) { + .color <- two.color + args.aes$color <- as.name("seqnames") + .is.seq <- TRUE + } + else { + if (length(args.non$color) > 1) { + .color <- args.non$color + args.aes$color <- as.name("seqnames") + .is.seq <- TRUE + args.non <- args.non[!names(args.non) %in% c("colour", + "color")] + } + } + } + else { + if (quo_name(args.aes$colour) == "seqnames") + args.aes$colour <- as.name("seqnames") + } + if (!"y" %in% names(args.aes)) + stop("need to provide y") + if(isTRUE(use.genome.coords)){ + args.non$coord <- "genome" + } + args.non$space.skip <- space.skip + args.non$geom <- geom + args.non$object <- obj + aes.res <- do.call(aes, args.aes) + p <- do.call(ggbio::autoplot, c(list(aes.res), args.non)) + if (!legend) + p <- p + theme(legend.position = "none") + if (!missing(ylab)) + p <- p + ylab(ylab) + if (!is.null(cutoff)) + p <- p + geom_hline(yintercept = cutoff, color = cutoff.color, + size = cutoff.size) + chrs <- names(GenomeInfoDb::seqlengths(obj)) + if (.is.seq) { + N <- length(chrs) + cols <- rep(.color, round(N/length(.color)) + 1)[1:N] + names(cols) <- chrs + p <- p + scale_color_manual(values = cols) + } + if (!missing(facets)) { + args$facets <- facets + args.facets <- biovizBase::subsetArgsByFormals(args, facet_grid, + facet_wrap) + facet <- ggbio:::.buildFacetsFromArgs(obj, args.facets) + p <- p + facet + } + p <- p + theme(panel.grid.minor = element_blank()) + if (!is.null(highlight.gr)) { + highlight.overlap <- match.arg(highlight.overlap) + idx <- findOverlaps(obj, highlight.gr, type=highlight.overlap) + .h.pos <- lapply(split(queryHits(idx), subjectHits(idx)), function(id) { + gr <- GRanges(as.character(GenomeInfoDb::seqnames(p@data))[id][1], + IRanges(start = min(start(p@data[id])), + end = max(end(p@data[id])))) + val <- max(as.numeric(values(p@data[id])[, quo_name(args.aes$y)])) + val <- val * (1 + highlight.label.offset) + values(gr)$val <- val + gr + }) + .h.pos <- suppressWarnings(do.call("c", unname(.h.pos))) + if (length(.h.pos)) { + if (is.null(highlight.name)) { + highlight.name <- names(highlight.gr) + } + else { + highlight.name <- values(highlight.gr)[, highlight.name] + } + p <- p + geom_point(data = biovizBase::mold(p@data[queryHits(idx)]), + do.call(aes, list(x = substitute(midpoint), y = args.aes$y)), + color = highlight.col) + if (!is.null(highlight.name)) { + GenomeInfoDb::seqlevels(.h.pos, pruning.mode = "coarse") <- + GenomeInfoDb::seqlevels(obj) + suppressWarnings(GenomeInfoDb::seqinfo(.h.pos) <- + GenomeInfoDb::seqinfo(obj)) + .trans <- biovizBase::transformToGenome(.h.pos, space.skip = space.skip) + values(.trans)$mean <- (start(.trans) + end(.trans))/2 + values(.trans)$names <- highlight.name + p <- p + geom_text(data = biovizBase::mold(.trans), + size = highlight.label.size, + vjust = 0, color = highlight.label.col, + do.call(aes, + list(x = substitute(mean), y = as.name("val"), + label = as.name("names")))) + } + } + } + if (spaceline) { + vline.df <- p@ggplot$data + vline.df <- do.call(rbind, by(vline.df, vline.df$seqnames, + function(dd) { + data.frame(start = min(dd$start), end = max(dd$end)) + })) + gap <- (vline.df$start[-1] + vline.df$end[-nrow(vline.df)])/2 + p <- p + geom_vline(xintercept = gap, alpha = 0.5, color = "gray70") + + theme(panel.grid = element_blank()) + } + if (!missing(main)) + p <- p + labs(title = main) + if (!missing(xlim)) + p <- p + xlim(xlim) + if (!missing(ylim)) + p <- p + ylim(ylim) + if (missing(xlab)) + xlab <- "" + p <- p + ggplot2::xlab(xlab) + p +} diff --git a/R/method-results.R b/R/method-results.R index d1e68c3..144682a 100644 --- a/R/method-results.R +++ b/R/method-results.R @@ -20,13 +20,17 @@ compileResults.OUTRIDER <- function(object, padjCutoff=0.05, zScoreCutoff=0, } meanCorrectedCounts <- rowMeans(counts(object, normalized=TRUE)) + meanRawCounts <- rowMeans(counts(object, normalized=FALSE)) if(isFALSE(all)){ abByGene <- aberrant(object, - padjCutoff=padjCutoff, zScoreCutoff=zScoreCutoff, by="gene") + padjCutoff=padjCutoff, zScoreCutoff=zScoreCutoff, by="gene", + subsetName=subsetName) abBySample <- aberrant(object, - padjCutoff=padjCutoff, zScoreCutoff=zScoreCutoff, by="sample") + padjCutoff=padjCutoff, zScoreCutoff=zScoreCutoff, by="sample", + subsetName=subsetName) object <- object[abByGene > 0, abBySample > 0] meanCorrectedCounts <- meanCorrectedCounts[abByGene > 0] + meanRawCounts <- meanRawCounts[abByGene > 0] } # warning if no rows to return @@ -44,6 +48,7 @@ compileResults.OUTRIDER <- function(object, padjCutoff=0.05, zScoreCutoff=0, zScore=NA_real_, l2fc=NA_real_, rawcounts=NA_integer_, + meanRawcounts=NA_real_, normcounts=NA_real_, meanCorrected=NA_real_, theta=NA_real_, @@ -65,6 +70,7 @@ compileResults.OUTRIDER <- function(object, padjCutoff=0.05, zScoreCutoff=0, zScore = c(zScore(object)), l2fc = c(assay(object, "l2fc")), rawcounts = c(counts(object)), + meanRawcounts = meanRawCounts, normcounts = c(counts(object, normalized=TRUE)), meanCorrected = meanCorrectedCounts, theta = theta(object), @@ -77,15 +83,15 @@ compileResults.OUTRIDER <- function(object, padjCutoff=0.05, zScoreCutoff=0, AberrantByGene = aberrant(object, padjCutoff=padjCutoff, zScoreCutoff=zScoreCutoff, by="gene", subsetName=subsetName), - padj_rank = c(apply(padj(object), 2, rank)), + padj_rank = c(apply(padj(object, subsetName=subsetName), 2, rank)), FDR_set = ifelse(is.null(subsetName), "transcriptome-wide", subsetName) ) # round columns if requested if(is.numeric(round)){ - devNull <- lapply( - c("normcounts", "zScore", "l2fc", "theta", "meanCorrected"), + devNull <- lapply(c("normcounts", "zScore", "l2fc", "theta", + "meanRawcounts", "meanCorrected"), function(x) ans[,c(x):=round(get(x), as.integer(round))] ) } @@ -140,10 +146,49 @@ compileResultsAll.OUTRIDER <- function(object, padjCutoff=0.05, zScoreCutoff=0, return(resSub) }) res <- rbindlist(resSubsets) - + # sort it if existing - if(length(res) > 0){ - res <- res[order(padjust)] + if(nrow(res) > 0){ + # get aberrant columns from transcriptome-wide results + res_tw <- res[FDR_set == "transcriptome-wide", + .(geneID, sampleID, padj_rank, aberrant, + AberrantBySample, AberrantByGene)] + + # dcast to have only one row per gene-sample combination + res <- dcast(res[, !c("padj_rank", "aberrant", "AberrantBySample", + "AberrantByGene"), with=FALSE], + ... ~ FDR_set, value.var="padjust") + + # merge again with aberrant columns for transcriptome-wide results + if(isTRUE(returnTranscriptomewideResults)){ + # rename column back to padjust for tw results after dcast + if("transcriptome-wide" %in% colnames(res)){ + setnames(res, "transcriptome-wide", "padjust") + } else{ + res[, padjust := NA] + } + + # merge with aberrant columns + res <- merge(res, res_tw, by=c("geneID", "sampleID"), all.x=TRUE) + + # set aberrant to FALSE if only significant in subset results + res[is.na(aberrant), aberrant := FALSE] + res[is.na(padjust), padjust := padj(object)[cbind(geneID,sampleID)]] + + # restore previous column order + setcolorder(res, colnames(resSubsets[[1]][,!"FDR_set", with=FALSE])) + + # order rows based on transcriptome-wide FDR + res <- res[order(padjust)] + } + + # rename padjust columns for other gene sets to padjust_setname + for(subsetAssayName in padjAssays[padjAssays != "padjust"]){ + setnames(res, gsub("padjust_", "", subsetAssayName), + subsetAssayName, skip_absent=TRUE) + } + } else{ + res <- res[, !"FDR_set", with=FALSE] } return(res) @@ -164,9 +209,7 @@ compileResultsAll.OUTRIDER <- function(object, padjCutoff=0.05, zScoreCutoff=0, #' more user friendly #' @param all By default FALSE, only significant read counts are listed in the #' results. If TRUE all results are assembled resulting in a -#' data.table of length samples x genes. If FDR corrected pvalues -#' for subsets of genes of interest have been calculated, this -#' indicates whether the whole subset will be listed in the results. +#' data.table of length samples x genes. #' @param returnTranscriptomewideResults If FDR corrected pvalues for subsets #' of genes of interest have been calculated, this parameter #' indicates whether additionally the transcriptome-wide results @@ -174,7 +217,34 @@ compileResultsAll.OUTRIDER <- function(object, padjCutoff=0.05, zScoreCutoff=0, #' for those subsets should be retrieved. #' @param ... Additional arguments, currently not used #' @return A data.table where each row is an outlier event and the columns -#' contain additional information about this event. Eg padj, l2fc +#' contain additional information about this event. In details the +#' table contains: +#' \itemize{ +#' \item{sampleID / geneID: }{The gene or sample ID as provided by the +#' user, e.g. \code{rowData(ods)} and \code{colData(ods)}, +#' respectively.} +#' \item{pValue / padjust: }{The nominal P-value and the FDR corrected +#' P-value (transcriptome-wide) indicating the outlier status.} +#' \item{zScore / l2fc: }{The z score and log\eqn{_2}{[2]} fold change +#' as computed by \code{\link[OUTRIDER]{computeZscores}}.} +#' \item{rawcounts: }{The observed read counts.} +#' \item{normcounts: }{The expected count given the fitted +#' autoencoder model for the given gene-sample combination.} +#' \item{meanRawcounts / meanCorrected: }{For this gene, the mean of the +#' observed or expected counts, respectively, given the fitted +#' autoencoder model.} +#' \item{theta: }{The dispersion parameter of the NB distribution +#' for the given gene.} +#' \item{aberrant: }{The transcriptome-wide outlier status of this event: +#' \code{TRUE} or \code{FALSE}.} +#' \item{AberrantBySample / AberrantByGene: }{Number of outliers for the +#' given sample or gene (transcriptome-wide), respectively.} +#' \item{padj_rank: }{Rank of this outlier event within the given sample.} +#' \item{padjust_FDRset: }{The FDR corrected P-value with respect to the +#' gene subset called 'FDRset', if gene subsets were specified +#' during the P-value computation. Find more details at +#' \code{\link[OUTRIDER]{computePvalues}}.} +#' } #' #' @examples #' diff --git a/R/pValMatrix.R b/R/pValMatrix.R index f011e80..b786964 100644 --- a/R/pValMatrix.R +++ b/R/pValMatrix.R @@ -2,9 +2,10 @@ #' Calculate P-values #' #' This function computes the P-values based on the fitted negative binomial -#' model. It computes two matrices with the same dimension as the count matrix -#' (samples x genes), which contain the corresponding P-values and -#' adjusted P-values of every count. +#' model being an outlier for the given sample gene combination. It computes +#' two matrices with the same dimension as the count matrix (samples x genes), +#' which contain the corresponding P-values and adjusted P-values of every +#' count. The adjusted P-values are computed across all genes per sample. #' #' @param object An OutriderDataSet #' @param alternative Can be one of "two.sided", "greater" or "less" to specify diff --git a/R/package-OUTRIDER.R b/R/package-OUTRIDER.R index 0d39f9e..5b57410 100644 --- a/R/package-OUTRIDER.R +++ b/R/package-OUTRIDER.R @@ -18,23 +18,27 @@ #' estimateSizeFactorsForMatrix replaceOutliers dispersions #' #' @importFrom SummarizedExperiment colData colData<- rowData rowData<- -#' assays assays<- assayNames mcols mcols<- assay assay<- +#' assays assays<- assayNames mcols mcols<- assay assay<- rowRanges #' #' @importFrom BBmisc isScalarLogical isScalarNumeric isScalarCharacter isFALSE #' isScalarValue isScalarNA chunk seq_col seq_row #' #' @importFrom BiocParallel bplapply bpparam SerialParam bpisup bpstart bpstop +#' bpmapply #' #' @importFrom GenomicFeatures makeTxDbFromGFF exonsBy #' -#' @importFrom GenomicRanges GRanges reduce width +#' @importFrom GenomicRanges GRanges reduce width start end findOverlaps #' #' @importFrom ggplot2 ggplot aes annotate geom_bar geom_histogram #' geom_hline geom_smooth geom_point labs scale_x_log10 #' scale_y_log10 scale_fill_manual scale_color_manual #' scale_fill_brewer scale_color_brewer theme ylim #' ggtitle geom_vline geom_text scale_linetype_manual geom_line -#' geom_abline theme_bw element_blank xlab ylab scale_color_identity +#' geom_abline theme_bw element_blank xlab ylab scale_color_identity +#' facet_grid facet_wrap quo_name +#' +#' @importFrom ggrepel geom_text_repel #' #' @importFrom grDevices colorRampPalette #' @@ -59,7 +63,8 @@ #' #' @importFrom reshape2 melt #' -#' @importFrom S4Vectors DataFrame metadata metadata<- +#' @importFrom S4Vectors DataFrame metadata metadata<- endoapply queryHits +#' subjectHits values values<- #' #' @importFrom scales math_format trans_format #' @@ -125,5 +130,10 @@ globalVariables(package="OUTRIDER", c( "Var2", ".x", "y", - "z")) + "z", + "AberrantByGene", + "AberrantBySample", + "FDR_set", + "aberrantLabel", + "geneID")) diff --git a/man/aberrant.Rd b/man/aberrant.Rd index 5ce6152..4174fb8 100644 --- a/man/aberrant.Rd +++ b/man/aberrant.Rd @@ -30,7 +30,7 @@ if NA or NULL no Z-score cutoff is used} 'gene' or not at all (default).} \item{subsetName}{The name of a subset of genes of interest for which FDR -corected pvalues have been previously computed. Those FDR values +corected pvalues were previously computed. Those FDR values on the subset will then be used to determine aberrant status. Default is NULL (using transcriptome-wide FDR corrected pvalues).} } diff --git a/man/computePvalues.Rd b/man/computePvalues.Rd index 6d48e60..74583e2 100644 --- a/man/computePvalues.Rd +++ b/man/computePvalues.Rd @@ -42,9 +42,10 @@ An OutriderDataSet object with computed nominal and adjusted P-values } \description{ This function computes the P-values based on the fitted negative binomial -model. It computes two matrices with the same dimension as the count matrix -(samples x genes), which contain the corresponding P-values and -adjusted P-values of every count. +model being an outlier for the given sample gene combination. It computes +two matrices with the same dimension as the count matrix (samples x genes), +which contain the corresponding P-values and adjusted P-values of every +count. The adjusted P-values are computed across all genes per sample. } \examples{ ods <- makeExampleOutriderDataSet() diff --git a/man/computeZscores.Rd b/man/computeZscores.Rd index 8677b36..cb87d01 100644 --- a/man/computeZscores.Rd +++ b/man/computeZscores.Rd @@ -22,14 +22,13 @@ An OutriderDataSet containing the Z score matrix "zScore" and the log2 fold changes "l2fc" as asasys. } \description{ -Computes the z scores for every count in the matrix. -The z score is defined in the log2 space as follows: -\ifelse{html}{ - \out{zij = (lij - mujl)/ - sigmajl}}{ - \deqn{z_{ij} = \frac{l_{ij} - \mu_j^l}{\sigma_j^l}}}, -where l is the log2 transformed normalized count and mu and sigma the -mean and standard deviation for gene j, respectively. +Computes the z scores for every count in the matrix. +The z score is defined in the log\eqn{_2}{[2]} space as follows: +\eqn{z_{ij} = \frac{l_{ij} - \mu_j^l}{ + \sigma_j^l}}{z_ij = (l[ij] - mu[j]^l)/sigma[ij]} +where \code{l} is the log\eqn{_2}{[2]} transformed normalized count and +\eqn{\mu}{mu} and \eqn{\sigma}{sigma} the mean and standard deviation +for gene \code{j} and sample \code{i}, respectively. } \examples{ ods <- makeExampleOutriderDataSet() diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd index b5bd579..0fa518b 100644 --- a/man/plotFunctions.Rd +++ b/man/plotFunctions.Rd @@ -3,6 +3,7 @@ \name{plotAberrantPerSample} \alias{plotAberrantPerSample} \alias{plotCountCorHeatmap} +\alias{plotManhattan} \alias{plotEncDimSearch} \alias{plotQQ} \alias{plotVolcano} @@ -22,12 +23,15 @@ \alias{plotAberrantPerSample,OutriderDataSet-method} \alias{plotDispEsts,OutriderDataSet-method} \alias{plotEncDimSearch,OutriderDataSet-method} +\alias{plotManhattan,OutriderDataSet-method} \title{Visualization functions for OUTRIDER} \usage{ plotAberrantPerSample(object, ...) plotCountCorHeatmap(object, ...) +plotManhattan(object, ...) + plotEncDimSearch(object, ...) plotQQ(object, ...) @@ -40,9 +44,12 @@ plotVolcano(object, ...) main, padjCutoff = 0.05, zScoreCutoff = 0, + label = "aberrant", + xaxis = c("zscore", "log2fc", "fc"), pch = 16, basePlot = FALSE, - col = c("gray", "firebrick") + col = c("gray", "firebrick"), + subsetName = NULL ) \S4method{plotQQ}{OutriderDataSet}( @@ -56,6 +63,7 @@ plotVolcano(object, ...) legendPos = "topleft", outlierRatio = 0.001, conf.alpha = 0.05, + subsetName = NULL, pch = 16, xlim = NULL, ylim = NULL, @@ -70,6 +78,8 @@ plotExpectedVsObservedCounts( log = TRUE, groups = c(), groupColSet = "Set1", + label = "aberrant", + subsetName = NULL, ... ) @@ -84,7 +94,9 @@ plotExpressionRank( log = TRUE, col = c("gray", "firebrick"), groups = c(), - groupColSet = "Accent" + groupColSet = "Accent", + label = "aberrant", + subsetName = NULL ) \S4method{plotCountCorHeatmap}{OutriderDataSet}( @@ -128,7 +140,8 @@ plotCountGeneSampleHeatmap( outlierRatio = 0.001, col = "Dark2", yadjust = 1.2, - ylab = "#Aberrantly expressed genes", + ylab = "Aberrantly expressed genes", + subsetName = NULL, ... ) @@ -150,6 +163,17 @@ plotPowerAnalysis(ods) plotExpressedGenes(ods, main = "Statistics of expressed genes") plotSizeFactors(ods, basePlot = TRUE) + +\S4method{plotManhattan}{OutriderDataSet}( + object, + sampleID, + value = "pvalue", + chr = NULL, + main = paste0("Sample: ", sampleID), + featureRanges = rowRanges(object), + subsetName = NULL, + chrColor = c("black", "darkgrey") +) } \arguments{ \item{...}{Additional parameters passed to plot() or plot_ly() if not stated @@ -163,6 +187,15 @@ Can also be a vector. Integers are treated as indices.} \item{padjCutoff, zScoreCutoff}{Significance or Z-score cutoff to mark outliers} +\item{label}{Indicates which genes or samples should be labeled. By default +all aberrant genes/samples are labelled. Can be set to NULL for +no labels. Provide a vector of geneIDs/sampleIDs to label +specific genes/samples.} + +\item{xaxis}{Indicates which assay should be shown on the x-axis of the +volcano plot. Defaults to 'zscore'. Other options are 'fc' and +'log2fc' for the fold-change or log2 fold-change.} + \item{pch}{Integer or character to be used for plotting the points} \item{basePlot}{if \code{TRUE}, use the R base plot version, else use the @@ -172,6 +205,11 @@ plotly framework, which is the default} of length 2. (1. normal point; 2. outlier point) or a single character referring to a color palette from \code{RColorBrewer}.} +\item{subsetName}{The name of a subset of genes of interest for which FDR +corrected pvalues were previously computed. Those FDR values +on the subset will then be used to determine aberrant status. +Default is NULL (using transcriptome-wide FDR corrected pvalues).} + \item{global}{Flag to plot a global Q-Q plot, default FALSE} \item{samplePoints}{Sample points for Q-Q plot, defaults to max 30k points} @@ -234,6 +272,20 @@ top n genes based on the BCV.} \item{compareDisp}{If TRUE, the default, and if the autoCorrect normalization was used it computes the dispersion without autoCorrect and plots it for comparison.} + +\item{value}{Indicates which assay is shown in the manhattan plot. Defaults +to 'pvalue'. Other options are 'zScore' and 'log2fc'.} + +\item{chr}{The chromosomes to be displayed in the \code{plotManhattan} +function. Default is \code{NULL}, i.e. all chromosomes are shown.} + +\item{featureRanges}{A GRanges object of the same length as the +OutriderDataSet object that contains the genomic positions of +features that are shown in the manhattan plot.} + +\item{chrColor}{A vector of length 2 giving the two colors used for +coloring alternating chromosomes in the manhattan plot. Default +colors are 'black' and 'darkgrey'.} } \value{ If base R graphics are used nothing is returned else the plotly or @@ -319,7 +371,12 @@ levels. The curves are smooths to make the reading of the plot easier. \code{plotEncDimSearch}: Visualization of the hyperparameter optimization. It plots the encoding dimension against the achieved loss (area under the precision-recall curve). From this plot the optimum should be choosen for -the \code{q} in fitting process. +the \code{q} in fitting process. + +\code{plotManhattan}: Visualizes different metrics for each gene (pvalue, +log2 fold-change, z-score) along with the genomic coordinates of the +respective gene as a manhattan plot. Detected outlier genes are highlighted +in red. } \examples{ ods <- makeExampleOutriderDataSet(dataset="Kremer") @@ -332,22 +389,35 @@ implementation <- 'autoencoder' mcols(ods)$basepairs <- 300 # assign pseudo gene length for filtering ods <- filterExpression(ods) -ods <- OUTRIDER(ods, implementation=implementation) +# restrict FDR correction to set of genes of interest per sample +genesOfInterest <- list(MUC1372 = c("ATPIF1", "UROD", "YBX1", + sample(rownames(ods), 25)), + MUC1360 = sample(rownames(ods), 50), + MUC1350 = sample(rownames(ods), 75), + X76619 = sample(rownames(ods), 20), + X74172 = sample(rownames(ods), 150)) +ods <- OUTRIDER(ods, implementation=implementation, subsets=list("exampleGenes"=genesOfInterest)) plotAberrantPerSample(ods) +plotAberrantPerSample(ods, subsetName="exampleGenes") plotVolcano(ods, 49) plotVolcano(ods, 'MUC1365', basePlot=TRUE) +plotVolcano(ods, 'MUC1351', basePlot=TRUE, xaxis="log2fc", label=c("NBPF16")) +plotVolcano(ods, 'MUC1372', basePlot=TRUE, subsetName="exampleGenes") plotExpressionRank(ods, 35) -plotExpressionRank(ods, "NDUFS5", normalized=FALSE, +plotExpressionRank(ods, 35, subsetName="exampleGenes") +plotExpressionRank(ods, "NDUFS5", normalized=FALSE, label="MUC1372", log=FALSE, main="Over expression outlier", basePlot=TRUE) plotQQ(ods, 149) +plotQQ(ods, 149, subsetName="exampleGenes") plotQQ(ods, global=TRUE, outlierRatio=0.001) plotExpectedVsObservedCounts(ods, 149) plotExpectedVsObservedCounts(ods, "ATAD3C", basePlot=TRUE) +plotExpectedVsObservedCounts(ods, "UROD", subsetName="exampleGenes") plotExpressedGenes(ods) @@ -378,4 +448,18 @@ ods <- findEncodingDim(ods, params=c(3, 10, 20, 35, 50), plotEncDimSearch(ods) } +# To show the pvalues of a sample in a manhattan plot, rowRanges(ods) must +# contain the genomic position of each feature or a GRanges object must +# be provided +\dontrun{ +# in case rowRanges(ods) is a GRangesList, run this first once to speed up: +rowRanges(ods) <- unlist(endoapply(rowRanges(ods), range)) +} +gr <- GRanges( + seqnames=sample(paste0("chr", 1:22), nrow(ods), replace=TRUE), + ranges=IRanges(start=runif(nrow(ods), min=0, max=1e5), width=100)) +plotManhattan(ods, "MUC1350", value="pvalue", featureRanges=gr) +plotManhattan(ods, "MUC1350", value="l2fc", featureRanges=gr) +plotManhattan(ods, "MUC1372", featureRanges=gr, subsetName="exampleGenes") + } diff --git a/man/results.Rd b/man/results.Rd index 9702ab2..78e0bd6 100644 --- a/man/results.Rd +++ b/man/results.Rd @@ -32,9 +32,7 @@ more user friendly} \item{all}{By default FALSE, only significant read counts are listed in the results. If TRUE all results are assembled resulting in a -data.table of length samples x genes. If FDR corrected pvalues -for subsets of genes of interest have been calculated, this -indicates whether the whole subset will be listed in the results.} +data.table of length samples x genes.} \item{returnTranscriptomewideResults}{If FDR corrected pvalues for subsets of genes of interest have been calculated, this parameter @@ -44,7 +42,34 @@ for those subsets should be retrieved.} } \value{ A data.table where each row is an outlier event and the columns - contain additional information about this event. Eg padj, l2fc + contain additional information about this event. In details the + table contains: + \itemize{ + \item{sampleID / geneID: }{The gene or sample ID as provided by the + user, e.g. \code{rowData(ods)} and \code{colData(ods)}, + respectively.} + \item{pValue / padjust: }{The nominal P-value and the FDR corrected + P-value (transcriptome-wide) indicating the outlier status.} + \item{zScore / l2fc: }{The z score and log\eqn{_2}{[2]} fold change + as computed by \code{\link[OUTRIDER]{computeZscores}}.} + \item{rawcounts: }{The observed read counts.} + \item{normcounts: }{The expected count given the fitted + autoencoder model for the given gene-sample combination.} + \item{meanRawcounts / meanCorrected: }{For this gene, the mean of the + observed or expected counts, respectively, given the fitted + autoencoder model.} + \item{theta: }{The dispersion parameter of the NB distribution + for the given gene.} + \item{aberrant: }{The transcriptome-wide outlier status of this event: + \code{TRUE} or \code{FALSE}.} + \item{AberrantBySample / AberrantByGene: }{Number of outliers for the + given sample or gene (transcriptome-wide), respectively.} + \item{padj_rank: }{Rank of this outlier event within the given sample.} + \item{padjust_FDRset: }{The FDR corrected P-value with respect to the + gene subset called 'FDRset', if gene subsets were specified + during the P-value computation. Find more details at + \code{\link[OUTRIDER]{computePvalues}}.} + } } \description{ This function assembles a results table of significant outlier events based diff --git a/tests/testthat/test_methods.R b/tests/testthat/test_methods.R index 6c06910..31add16 100644 --- a/tests/testthat/test_methods.R +++ b/tests/testthat/test_methods.R @@ -26,7 +26,7 @@ test_that("result method", { set.seed(42) ods <- makeExampleOutriderDataSet(100, 50) expect_error(results(ods), "Please calculate..*") - ods <- OUTRIDER(ods, iteration=2) + ods <- OUTRIDER(ods, q=2, iteration=2) expect_warning(res <- results(ods, padj=1e-10), "No significant events:") expect_equal(colnames(res), colnames(results(ods))) diff --git a/tests/testthat/test_plotting.R b/tests/testthat/test_plotting.R index c9d27df..9656804 100644 --- a/tests/testthat/test_plotting.R +++ b/tests/testthat/test_plotting.R @@ -6,33 +6,51 @@ test_that("plotting", { mcols(ods)[['basepairs']] <- rnbinom(nrow(ods), mu=1e4, size=2) ods <- filterExpression(ods, filterGenes=FALSE, fpkmCutoff=1e3) - ods <- OUTRIDER(ods, implementation='pca', q=2) + genesOfInterest <- list("sample_1"=sample(rownames(ods), 10), + "sample_3"=sample(rownames(ods), 8), + "sample_10"=sample(rownames(ods), 20)) + ods <- OUTRIDER(ods, implementation='pca', q=2, + subsets=list("testSet"=genesOfInterest)) sex <- sample(c("female", "male"), dim(ods)[2], replace=TRUE) colData(ods)$Sex <- sex expect_is(plotAberrantPerSample(ods), "ggplot") - + expect_is(plotAberrantPerSample(ods, subsetName="testSet"), "ggplot") expect_equal(class(plotVolcano(ods, 30)), c("plotly", "htmlwidget")) expect_is(plotVolcano(ods, "sample_10", basePlot=TRUE), "ggplot") + expect_is(plotVolcano(ods, "sample_10", subsetName="testSet", + basePlot=TRUE), "ggplot") + expect_is(plotVolcano(ods, "sample_10", label="feature_3", basePlot=TRUE), + "ggplot") expect_equal(class(plotExpressionRank(ods, 23)), c("plotly", "htmlwidget")) expect_is(plotExpressionRank(ods, "feature_13", normalized=FALSE, log=FALSE, basePlot=TRUE), 'ggplot') + expect_is(plotExpressionRank(ods, "feature_13", normalized=TRUE, + subsetName="testSet", basePlot=TRUE), 'ggplot') + expect_is(plotExpressionRank(ods, "feature_13", normalized=FALSE, + label=c("sample_1", "sample_5"), + log=FALSE, basePlot=TRUE), 'ggplot') expect_equal(class(plotExpectedVsObservedCounts(ods, 39)), c("plotly", "htmlwidget")) expect_is(plotExpectedVsObservedCounts(ods, "feature_32", basePlot=TRUE), "ggplot") + expect_is(plotExpectedVsObservedCounts(ods, "feature_32", + subsetName="testSet", basePlot=TRUE), "ggplot") + expect_is(plotExpectedVsObservedCounts(ods, "feature_32", label="aberrant", + basePlot=TRUE), "ggplot") expect_is(plotExpressedGenes(ods), "ggplot") expect_null(plotQQ(ods, 1)) expect_null(plotQQ(ods, "feature_20")) + expect_null(plotQQ(ods, "feature_20", subsetName="testSet")) expect_null(plotQQ(ods, global=TRUE)) expect_null(plotQQ(ods, global=TRUE, outlierRatio=0.001)) expect_null(plotQQ(ods, global=TRUE, outlierRatio=NULL)) @@ -62,6 +80,15 @@ test_that("plotting", { plotPowerAnalysis(ods) + expect_error(plotManhattan(ods, "sample_1")) + gr <- GRanges( + seqnames=sample(paste0("chr", 1:22), nrow(ods), replace=TRUE), + ranges=IRanges(start=runif(nrow(ods), min=0, max=1e5), width=100)) + expect_is(plotManhattan(ods, "sample_1", featureRanges=gr), "GGbio") + expect_is(plotManhattan(ods, "sample_1", featureRanges=gr, chr=10), "GGbio") + expect_is(plotManhattan(ods, "sample_26", featureRanges=gr, + subsetName="testSet"), "GGbio") + }) diff --git a/vignettes/OUTRIDER.Rnw b/vignettes/OUTRIDER.Rnw index 36bcc11..b1367d9 100644 --- a/vignettes/OUTRIDER.Rnw +++ b/vignettes/OUTRIDER.Rnw @@ -513,6 +513,31 @@ head(res) dim(res) @ +In details the table contains: +\itemize{ + \item{sampleID / geneID: }{The gene or sample ID as provided by the + user, e.g. rowData(ods) and colData(ods) respectively.} + \item{pValue / padjust: }{The nominal P-value and the FDR corrected + P-value indicating the outlier status. Find more details at + computePvalues.} + \item{zScore / l2fc: }{The z score and $\log_{2}$ fold change + as computed by computeZscores.} + \item{rawcounts: }{The observed read counts.} + \item{normcounts: }{The expected count given the fitted + autoencoder model for the given gene-sample combination.} + \item{meanRawcounts / meanCorrected: }{For this gene, the mean of the + observed or expected counts, respectively, given the fitted + autoencoder model.} + \item{theta: }{The dispersion parameter of the NB distribution + for the given gene.} + \item{aberrant: }{The outlier status of this event: TRUE or FALSE.} + \item{AberrantBySample / AberrantByGene: }{Number of outliers for the + given sample or gene, respectively.} + \item{padj rank: }{Rank of this outlier event within the given sample.} + \item{FDR set: }{The subset-name used for the P-value computation.} +} + + \subsection{Number of aberrant genes per sample}