From 8107537e6793c284a85fe7e6f2ef2e6d2fbf78eb Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Tue, 12 Oct 2021 09:32:10 +0200 Subject: [PATCH 01/80] include filters in pvalue calculation & cleanup --- R/AllGenerics.R | 352 ++++++++++++++++++++++-------------------- R/countRNAseqData.R | 23 +-- R/example_functions.R | 6 +- R/getNSetterFuns.R | 91 ++++++++--- R/helper-functions.R | 43 ++++++ R/pvalsNzscore.R | 151 +++++++++++------- src/RcppExports.cpp | 5 + 7 files changed, 412 insertions(+), 259 deletions(-) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 42986760..8298e03f 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -578,42 +578,70 @@ setAs("DataFrame", "matrix", function(from){ as.matrix(as(from, "data.table")) }) #' -#' retrieve a single sample result object -#' @noRd -resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores, psivals, - rawCts, rawTotalCts, deltaPsiVals, muPsi, psiType, fdrCut, - zscoreCut, dPsiCut, rowMeansK, rowMeansN, minCount, - additionalColumns){ - - zscore <- zscores[,sampleID] - dpsi <- deltaPsiVals[,sampleID] - pval <- pvals[,sampleID] - padj <- padjs[,sampleID] - - goodCut <- !logical(length(zscore)) - if(!is.na(zscoreCut)){ - goodCut <- goodCut & na2default(abs(zscore) >= zscoreCut, TRUE) - } - if(!is.na(dPsiCut)){ - goodCut <- goodCut & na2default(abs(dpsi) >= dPsiCut, TRUE) - } - if(!is.na(fdrCut)){ - goodCut <- goodCut & na2false(padj <= fdrCut) +#' Mapping of chromosome names +#' +#' @param fds FraserDataSet +#' @param style The style of the chromosome names. +#' @param ... Further parameters. For mapSeqLevels: further parameters +#' passed to GenomeInfoDb::mapSeqlevels(). +#' +#' @rdname fds-methods +#' @export +mapSeqlevels <- function(fds, style="UCSC", ...){ + + mappings <- na.omit(GenomeInfoDb::mapSeqlevels(seqlevels(fds), style, ...)) + # fix missing names() when fds has only a single chromosome + if(is.null(names(mappings))){ + names(mappings) <- seqlevels(fds) } - if(!is.na(minCount)){ - goodCut <- goodCut & rawTotalCts[,sampleID] >= minCount + + if(length(mappings) != length(seqlevels(fds))){ + message(date(), ": Drop non standard chromosomes for compatibility.") + fds <- keepStandardChromosomes(fds) + nonSplicedReads(fds) <- keepStandardChromosomes(nonSplicedReads(fds)) + validObject(fds) } + fds <- fds[as.vector(seqnames(fds)) %in% names(mappings)] + + seqlevels(fds) <- as.vector(mappings) + seqlevels(nonSplicedReads(fds)) <- as.vector(mappings) + + return(fds) +} +#' +#' retrieve a single sample result object +#' @noRd +resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores, + psivals, rawCts, rawTotalCts, deltaPsiVals, + psiType, rowMeansK, rowMeansN, aberrant, + aggregate, rho, pvalsGene=NULL, padjsGene=NULL, + additionalColumns, filters=list()){ + + # if gene level results, find the most aberrant junction per gene first + if(isTRUE(aggregate)){ + goodGenes <- rownames(aberrant)[aberrant[,sampleID]] + geneJunctions <- findJunctionsForAberrantGenes(gr=gr, genes=goodGenes, + pvals=pvals[,sampleID], + dpsi=deltaPsiVals[,sampleID], + minCount=rawTotalCts[,sampleID], rho=rho, + filters=filters) + goodCut <- rep(FALSE, nrow(pvals)) + goodCut[geneJunctions] <- TRUE + } else{ + goodCut <- aberrant[,sampleID] + } + ans <- granges(gr[goodCut]) - + if(!any(goodCut)){ return(ans) } - + if(!"hgnc_symbol" %in% colnames(mcols(gr))){ mcols(gr)$hgnc_symbol <- NA_character_ } - + # extract data mcols(ans)$sampleID <- Rle(sampleID) if("hgnc_symbol" %in% colnames(mcols(gr))){ @@ -623,57 +651,57 @@ resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores, psivals, mcols(ans)$addHgncSymbols <- Rle(mcols(gr[goodCut])$other_hgnc_symbol) } mcols(ans)$type <- Rle(psiType) - mcols(ans)$pValue <- signif(pval[goodCut], 5) - mcols(ans)$padjust <- signif(padj[goodCut], 5) - mcols(ans)$zScore <- Rle(round(zscore[goodCut], 2)) + mcols(ans)$pValue <- signif(pvals[goodCut,sampleID], 5) + mcols(ans)$padjust <- signif(padjs[goodCut,sampleID], 5) + mcols(ans)$zScore <- Rle(round(zscores[goodCut,sampleID], 2)) mcols(ans)$psiValue <- Rle(round(psivals[goodCut,sampleID], 2)) - mcols(ans)$deltaPsi <- Rle(round(dpsi[goodCut], 2)) + mcols(ans)$deltaPsi <- Rle(round(deltaPsiVals[goodCut,sampleID], 2)) mcols(ans)$meanCounts <- Rle(round(rowMeansK[goodCut], 2)) mcols(ans)$meanTotalCounts <- Rle(round(rowMeansN[goodCut], 2)) mcols(ans)$counts <- Rle(rawCts[goodCut, sampleID]) mcols(ans)$totalCounts <- Rle(rawTotalCts[goodCut, sampleID]) + if(isTRUE(aggregate)){ + mcols(ans)$pValueGene <- signif(pvalsGene[goodCut,sampleID], 5) + mcols(ans)$padjustGene <- signif(padjsGene[goodCut,sampleID], 5) + } + if(!is.null(additionalColumns)){ for(column in additionalColumns){ mcols(ans)[,column] <- Rle(mcols(gr[goodCut])[,column]) } } - + return(ans[order(mcols(ans)$pValue)]) } FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, - dPsiCutoff, psiType, BPPARAM=bpparam(), maxCols=20, - minCount, additionalColumns=NULL){ - - # check input - checkNaAndRange(fdrCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE) - checkNaAndRange(dPsiCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE) - checkNaAndRange(zscoreCutoff, min=0, max=100, scalar=TRUE, na.ok=TRUE) - checkNaAndRange(minCount, min=0, max=Inf, scalar=TRUE, na.ok=TRUE) - + dPsiCutoff, minCount, rhoCutoff, psiType, + maxCols=20, aggregate=FALSE, + BPPARAM=bpparam(), additionalColumns=NULL){ + stopifnot(is(object, "FraserDataSet")) stopifnot(all(sampleIDs %in% samples(object))) - + resultsls <- bplapply(psiType, BPPARAM=BPPARAM, function(type){ message(date(), ": Collecting results for: ", type) currentType(object) <- type gr <- rowRanges(object, type=type) - + # first get row means rowMeansK <- rowMeans(K(object, type=type)) rowMeansN <- rowMeans(N(object, type=type)) - + # then iterate by chunk chunkCols <- getMaxChunks2Read(fds=object, assayName=type, max=maxCols) sampleChunks <- getSamplesByChunk(fds=object, sampleIDs=sampleIDs, - chunkSize=chunkCols) - + chunkSize=chunkCols) + ans <- lapply(seq_along(sampleChunks), function(idx){ message(date(), ": Process chunk: ", idx, " for: ", type) sc <- sampleChunks[[idx]] tmp_x <- object[,sc] - + # extract values rawCts <- as.matrix(K(tmp_x)) rawTotalCts <- as.matrix(N(tmp_x)) @@ -683,9 +711,24 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, psivals <- as.matrix(assay(tmp_x, type)) muPsi <- as.matrix(predictedMeans(tmp_x)) psivals_pc <- (rawCts + pseudocount()) / - (rawTotalCts + 2*pseudocount()) + (rawTotalCts + 2*pseudocount()) deltaPsiVals <- psivals_pc - muPsi - + rho <- rho(tmp_x, type) + aberrant <- aberrant.FRASER(tmp_x, type=type, + padjCutoff=fdrCutoff, + zScoreCutoff=zscoreCutoff, + deltaPsiCutoff=dPsiCutoff, + minCount=minCount, + rhoCutoff=rhoCutoff, + aggregate=aggregate) + if(isTRUE(aggregate)){ + pvalsGene <- as.matrix(pVals(tmp_x, level="gene")) + padjsGene <- as.matrix(padjVals(tmp_x, level="gene")) + } else{ + pvalsGene <- NULL + padjsGene <- NULL + } + if(length(sc) == 1){ colnames(pvals) <- sc colnames(padjs) <- sc @@ -695,33 +738,39 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, # create result table sampleRes <- lapply(sc, - resultsSingleSample, gr=gr, pvals=pvals, padjs=padjs, - zscores=zscores, psiType=type, psivals=psivals, - deltaPsiVals=deltaPsiVals, muPsi=muPsi, rawCts=rawCts, - rawTotalCts=rawTotalCts, fdrCut=fdrCutoff, - zscoreCut=zscoreCutoff, dPsiCut=dPsiCutoff, - rowMeansK=rowMeansK, rowMeansN=rowMeansN, - minCount=minCount, additionalColumns=additionalColumns) - + resultsSingleSample, gr=gr, pvals=pvals, + padjs=padjs, zscores=zscores, psiType=type, + psivals=psivals, deltaPsiVals=deltaPsiVals, + rawCts=rawCts, rawTotalCts=rawTotalCts, + rowMeansK=rowMeansK, rowMeansN=rowMeansN, + aberrant=aberrant, aggregate=aggregate, + rho=rho, + pvalsGene=pvalsGene, padjsGene=padjsGene, + additionalColumns=additionalColumns, + filters=list(dpsi=dPsiCutoff, + minCount=minCount, + rho=rhoCutoff)) + # return combined result return(unlist(GRangesList(sampleRes))) }) - + unlist(GRangesList(ans)) }) - + # merge results ans <- unlist(GRangesList(resultsls)) - + # sort it if existing if(length(ans) > 0){ ans <- ans[order(ans$pValue)] } - + # return only the results return(ans) } + #' #' Extracting results and aberrant splicing events #' @@ -794,94 +843,56 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, setMethod("results", "FraserDataSet", function(object, sampleIDs=samples(object), padjCutoff=0.05, zScoreCutoff=NA, deltaPsiCutoff=0.3, + rhoCutoff=0.1, aggregate=FALSE, minCount=5, psiType=c("psi3", "psi5", "theta"), additionalColumns=NULL, BPPARAM=bpparam(), ...){ FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff, - zscoreCutoff=zScoreCutoff, dPsiCutoff=deltaPsiCutoff, - minCount=minCount, psiType=match.arg(psiType, several.ok=TRUE), + zscoreCutoff=zScoreCutoff, dPsiCutoff=deltaPsiCutoff, + rhoCutoff=rhoCutoff, minCount=minCount, + psiType=match.arg(psiType, several.ok=TRUE), + aggregate=aggregate, additionalColumns=additionalColumns, BPPARAM=BPPARAM) }) #' @rdname results #' @export -resultsByGenes <- function(res, geneColumn="hgncSymbol", method="BY"){ - # sort by pvalue - res <- res[order(res$pValue)] +resultsByGenes <- function(res, geneColumn="hgncSymbol"){ + # sort by gene pvalue + res <- res[order(res$pValueGene)] # extract subset if(is(res, "GRanges")){ - ans <- as.data.table(mcols(res)[,c(geneColumn, "pValue", "sampleID")]) - colnames(ans) <- c("features", "pval", "sampleID") + ans <- as.data.table(mcols(res)[,c(geneColumn, "sampleID")]) + colnames(ans) <- c("features", "sampleID") } else { ans <- featureNames <- res[,.( - features=get(geneColumn), pval=pValue, sampleID=sampleID)] + features=get(geneColumn), sampleID=sampleID)] } - # remove NAs + # remove NAs and + # keep only one row per gene with lowest pvalue over all psiTypes naIdx <- ans[,is.na(features)] ansNoNA <- ans[!is.na(features)] - - # compute pvalues by gene - ansNoNA[,pByFeature:=min(p.adjust(pval, method="holm")), - by="sampleID,features"] - - # subset to lowest pvalue by gene dupIdx <- duplicated(ansNoNA[,.(features,sampleID)]) - ansGenes <- ansNoNA[!dupIdx] - - # compute FDR - ansGenes[,fdrByFeature:=p.adjust(pByFeature, method=method), - by="sampleID"] # get final result table finalAns <- res[!naIdx][!dupIdx] - finalAns$pValueGene <- ansGenes$pByFeature - finalAns$padjustGene <- ansGenes$fdrByFeature finalAns } -#' -#' Mapping of chromosome names -#' -#' @param fds FraserDataSet -#' @param style The style of the chromosome names. -#' @param ... Further parameters. For mapSeqLevels: further parameters -#' passed to GenomeInfoDb::mapSeqlevels(). -#' -#' @rdname fds-methods -#' @export -mapSeqlevels <- function(fds, style="UCSC", ...){ - - mappings <- na.omit(GenomeInfoDb::mapSeqlevels(seqlevels(fds), style, ...)) - # fix missing names() when fds has only a single chromosome - if(is.null(names(mappings))){ - names(mappings) <- seqlevels(fds) - } - - if(length(mappings) != length(seqlevels(fds))){ - message(date(), ": Drop non standard chromosomes for compatibility.") - fds <- keepStandardChromosomes(fds) - nonSplicedReads(fds) <- keepStandardChromosomes(nonSplicedReads(fds)) - validObject(fds) - } - fds <- fds[as.vector(seqnames(fds)) %in% names(mappings)] - - seqlevels(fds) <- as.vector(mappings) - seqlevels(nonSplicedReads(fds)) <- as.vector(mappings) - - return(fds) -} - - -aberrant.FRASER <- function(object, type=currentType(object), padjCutoff=0.05, - deltaPsiCutoff=0.3, zScoreCutoff=NA, minCount=5, - by=c("none", "sample", "feature"), aggregate=FALSE, ...){ - - checkNaAndRange(zScoreCutoff, min=0, max=Inf, na.ok=TRUE) - checkNaAndRange(padjCutoff, min=0, max=1, na.ok=TRUE) - checkNaAndRange(deltaPsiCutoff, min=0, max=1, na.ok=TRUE) +aberrant.FRASER <- function(object, type=currentType(object), + padjCutoff=0.05, deltaPsiCutoff=0.3, + zScoreCutoff=NA, minCount=5, rhoCutoff=0.1, + by=c("none", "sample", "feature"), + aggregate=FALSE, ...){ + + checkNaAndRange(padjCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE) + checkNaAndRange(zScoreCutoff, min=0, max=Inf, scalar=TRUE, na.ok=TRUE) + checkNaAndRange(deltaPsiCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE) + checkNaAndRange(rhoCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE) + checkNaAndRange(minCount, min=0, max=Inf, scalar=TRUE, na.ok=TRUE) by <- match.arg(by) - + dots <- list(...) if("n" %in% names(dots)){ n <- dots[['n']] @@ -896,60 +907,73 @@ aberrant.FRASER <- function(object, type=currentType(object), padjCutoff=0.05, if("padjVals" %in% names(dots)){ padj <- dots[['padjVals']] } else { - padj <- padjVals(object, type=type) + # check if padj values are available for the given filters + pvalsAvailable <- checkPadjAvailableForFilters(object, type=type, + filters=list(rho=rhoCutoff), + aggregate=aggregate) + if(isFALSE(pvalsAvailable)){ + stop("For the given filters, pvalues are not yet computed. \n", + "Please compute them first by running the ", + "calculatePadjValues function with the requested filters.") + } + pvalLevel <- ifelse(isTRUE(aggregate), "gene", "site") + padj <- padjVals(object, type=type, level=pvalLevel, + filters=list(rho=rhoCutoff)) } if("dPsi" %in% names(dots)){ dpsi <- dots[['dPsi']] } else { dpsi <- deltaPsiValue(object, type=type) } + if("rhoVals" %in% names(dots)){ + rho <- dots[['rhoVals']] + } else { + rho <- matrix(rho(object, type=type), + nrow=nrow(dpsi), ncol=ncol(dpsi)) + } - - # create cutoff matrix - goodCutoff <- matrix(TRUE, nrow=nrow(zscores), ncol=ncol(zscores), - dimnames=dimnames(zscores)) - if("hgnc_symbol" %in% colnames(mcols(object, type=type)) & - nrow(mcols(object, type=type)) == nrow(goodCutoff)){ - rownames(goodCutoff) <- mcols(object, type=type)[,"hgnc_symbol"] - } else if(isTRUE(aggregate)){ - stop("Please provide hgnc symbols to compute gene p values!") - } - - # check each cutoff if in use (not NA) - if(!is.na(minCount)){ - goodCutoff <- goodCutoff & as.matrix(n >= minCount) - } - if(!is.na(zScoreCutoff)){ - goodCutoff <- goodCutoff & as.matrix(abs(zscores) > zScoreCutoff) + if(is.na(padjCutoff)){ + padjCutoff <- 1 } - if(!is.na(deltaPsiCutoff)){ - goodCutoff <- goodCutoff & as.matrix(abs(dpsi) > deltaPsiCutoff) - } - if(!is.na(padjCutoff)){ - goodCutoff <- goodCutoff & as.matrix(padj < padjCutoff) - } - goodCutoff[is.na(goodCutoff)] <- FALSE - # check if we should go for aggregation - # TODO to speed it up we only use any hit within a feature - # but should do a holm's + BY correction per gene and genome wide - if(isTRUE(aggregate)){ - goodCutoff <- as.matrix(data.table(goodCutoff, keep.rownames=TRUE)[, - as.data.table(t(colAnys(as.matrix(.SD)))), by=rn][,-1]) - rownames(goodCutoff) <- unique(mcols(object, type=type)[,"hgnc_symbol"]) - colnames(goodCutoff) <- colnames(zscores) + if(isFALSE(aggregate)){ + aberrantEvents <- padj <= padjCutoff + + # check each cutoff if in use (not NA) + if(!is.na(minCount)){ + aberrantEvents <- aberrantEvents & as.matrix(n >= minCount) + } + if(!is.na(zScoreCutoff)){ + aberrantEvents <- aberrantEvents & + as.matrix(abs(zscores) > zScoreCutoff) + } + if(!is.na(deltaPsiCutoff)){ + aberrantEvents <- aberrantEvents & + as.matrix(abs(dpsi) > deltaPsiCutoff) + } + if(!is.na(rhoCutoff)){ + aberrantEvents <- aberrantEvents & as.matrix(rho < rhoCutoff) + } + aberrantEvents[is.na(aberrantEvents)] <- FALSE + + } else{ + dt <- data.table(geneID=mcols(fds, type=type)$hgnc_symbol, padj) + dt <- dt[!duplicated(dt, by="geneID") & !is.na(geneID)] + padj <- as.matrix(dt[, -1]) + rownames(padj) <- dt[,geneID] + + aberrantEvents <- padj <= padjCutoff } - # return results - if(by == "feature"){ - return(rowSums(goodCutoff)) - } - if(by == "sample"){ - return(colSums(goodCutoff)) - } - return(goodCutoff) + return(switch(match.arg(by), + none = aberrantEvents, + sample = colSums(aberrantEvents, na.rm=TRUE), + feature = rowSums(aberrantEvents, na.rm=TRUE) + )) } #' @rdname results #' @export setMethod("aberrant", "FraserDataSet", aberrant.FRASER) + + diff --git a/R/countRNAseqData.R b/R/countRNAseqData.R index 3578fdef..2f278832 100644 --- a/R/countRNAseqData.R +++ b/R/countRNAseqData.R @@ -965,15 +965,20 @@ readJunctionMap <- function(junctionMap){ #' @noRd extractSpliceSiteCoordinates <- function(junctions, fds){ - if(strandSpecific(fds) >= 1L){ - spliceSiteCoords <- unlist(GRangesList( - extractSpliceSiteCoordsPerStrand(junctions, "+"), - extractSpliceSiteCoordsPerStrand(junctions, "-") - )) - } else { - strand(junctions) <- "*" - spliceSiteCoords <- extractSpliceSiteCoordsPerStrand(junctions, "*") - } + # if(strandSpecific(fds) >= 1L){ + # spliceSiteCoords <- unlist(GRangesList( + # extractSpliceSiteCoordsPerStrand(junctions, "+"), + # extractSpliceSiteCoordsPerStrand(junctions, "-") + # )) + # } else { + # strand(junctions) <- "*" + # spliceSiteCoords <- extractSpliceSiteCoordsPerStrand(junctions, "*") + # } + + spliceSiteCoords <- unlist(GRangesList( + lapply(unique(strand(junctions)), extractSpliceSiteCoordsPerStrand, + junctions=junctions) + )) return(unique(sort(spliceSiteCoords))) } diff --git a/R/example_functions.R b/R/example_functions.R index 16640805..90bf817e 100644 --- a/R/example_functions.R +++ b/R/example_functions.R @@ -81,12 +81,12 @@ createTestFraserDataSet <- function(workingDir=tempdir(), rerun=FALSE){ fds <- filterExpressionAndVariability(fds, minExpressionInOneSample=5, minDeltaPsi=0, quantileMinExpression=0) - # run FRASER pipeline - fds <- FRASER(fds, q=c(psi5=2, psi3=2, theta=2), iterations=2) - # annotate it suppressMessages({ fds <- annotateRangesWithTxDb(fds) }) + # run FRASER pipeline + fds <- FRASER(fds, q=c(psi5=2, psi3=2, theta=2), iterations=2) + # save data for later fds <- saveFraserDataSet(fds) diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R index 94e8ca5e..26f72951 100644 --- a/R/getNSetterFuns.R +++ b/R/getNSetterFuns.R @@ -248,8 +248,9 @@ zScores <- function(fds, type=currentType(fds), byGroup=FALSE, ...){ #' @describeIn getter_setter_functions This returns the calculated p-values. #' @export pVals <- function(fds, type=currentType(fds), level="site", + filters=list(rho=0.1), dist="BetaBinomial", ...){ - level <- match.arg(level, choices=c("site", "junction")) + level <- match.arg(level, choices=c("site", "junction", "gene")) dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) aname <- paste0("pvalues", dist) if(level == "junction"){ @@ -260,17 +261,39 @@ pVals <- function(fds, type=currentType(fds), level="site", warning("Did not find junction-level p values. ", "Using site-level p values instead.") } + } else{ + aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname) + # add information on used filters + for(n in sort(names(filters))){ + aname <- paste0(aname, "_", n, filters[[n]]) + } + if(level == "gene"){ + if(!paste(aname, type, sep="_") %in% assayNames(fds)){ + stop("Did not find gene-level p values. ", + "Please compute them first.") + } + } } + getAssayMatrix(fds, aname, type=type, ...) } `pVals<-` <- function(fds, type=currentType(fds), level="site", + filters=list(rho=0.1), dist="BetaBinomial", ..., value){ - level <- match.arg(level, choices=c("site", "junction")) + level <- match.arg(level, choices=c("site", "junction", "gene")) dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) aname <- paste0("pvalues", dist) if(level == "junction"){ aname <- paste0(aname, "_junction") + setAssayMatrix(fds, name=aname, type=type, ...) <- value + return(fds) + } else if(level == "gene"){ + aname <- paste0(aname, "_gene") + } + # add information on used filters + for(n in sort(names(filters))){ + aname <- paste0(aname, "_", n, filters[[n]]) } setAssayMatrix(fds, name=aname, type=type, ...) <- value return(fds) @@ -278,15 +301,36 @@ pVals <- function(fds, type=currentType(fds), level="site", #' @describeIn getter_setter_functions This returns the adjusted p-values. #' @export -padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"), ...){ +padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"), + level="site", filters=list(rho=0.1), ...){ + level <- match.arg(level, choices=c("site", "gene")) dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) - return(getAssayMatrix(fds, paste0("padj", dist), type=type, ...)) + aname <- paste0("padj", dist) + aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname) + # add information on used filters + for(n in sort(names(filters))){ + aname <- paste0(aname, "_", n, filters[[n]]) + } + if(level == "gene"){ + if(!paste(aname, type, sep="_") %in% assayNames(fds)){ + stop("Did not find gene-level p values. ", + "Please compute them first.") + } + } + return(getAssayMatrix(fds, aname, type=type, ...)) } -`padjVals<-` <- function(fds, type=currentType(fds), - dist="BetaBinomial", ..., value){ +`padjVals<-` <- function(fds, type=currentType(fds), level="site", + dist="BetaBinomial", filters=list(rho=0.1), ..., value){ + level <- match.arg(level, choices=c("site", "gene")) dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) - setAssayMatrix(fds, name=paste0("padj", dist), type=type, ...) <- value + aname <- paste0("padj", dist) + aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname) + # add information on used filters + for(n in sort(names(filters))){ + aname <- paste0(aname, "_", n, filters[[n]]) + } + setAssayMatrix(fds, name=aname, type=type, ...) <- value return(fds) } @@ -589,32 +633,29 @@ getPlottingDT <- function(fds, axis=c("row", "col"), type=NULL, result=NULL, padj = c(padjVals(fds, type=type)[idxrow, idxcol]), zscore = c(zScores(fds, type=type)[idxrow, idxcol]), obsPsi = c((k + pseudocount())/(n + 2*pseudocount())), - predPsi = c(predictedMeans(fds, type)[idxrow, idxcol])) + predPsi = c(predictedMeans(fds, type)[idxrow, idxcol]), + rho = rep(rho(fds, type=type)[idxrow], + ifelse(isTRUE(idxcol), ncol(fds), length(idxcol))) + ) dt[, deltaPsi:=obsPsi - predPsi] # add aberrant information to it aberrantVec <- aberrant(fds, ..., padjVals=dt[,.(padj)], - dPsi=dt[,.(deltaPsi)], zscores=dt[,.(zscore)], n=dt[,.(n)]) + dPsi=dt[,.(deltaPsi)], zscores=dt[,.(zscore)], n=dt[,.(n)], + rhoVals=dt[,.(rho)]) dt[,aberrant:=aberrantVec] # if requested return gene p values (correct for multiple testing again) if(isTRUE(aggregate)){ + dt[, pval:=c(pVals(fds, type=type, + level="gene")[idxrow, idxcol])] + dt[, padj:=c(padjVals(fds, type=type, + level="gene")[idxrow, idxcol]),] dt <- dt[!is.na(featureID)] - - # correct by gene and take the smallest p value - dt <- rbindlist(bplapply(unique(dt[,sampleID]), - BPPARAM=getBPParam(Ncpus, length(unique(dt[,sampleID]))), - FUN=function(x){ - dttmp <- dt[sampleID == x] - dttmp[, pval:=p.adjust(pval, method="holm"), - by="sampleID,featureID,type"] - dttmp <- dttmp[order(sampleID, featureID, type, -aberrant, - pval, -abs(deltaPsi))][ - !duplicated(data.table(sampleID, featureID, type))] - dttmp <- dttmp[, padj:=p.adjust(pval, method="BY"), - by="sampleID,type"] - dttmp - })) + + dt <- dt[order(sampleID, featureID, type, -aberrant, + padj, -abs(deltaPsi))][ + !duplicated(data.table(sampleID, featureID, type))] } # return object @@ -622,7 +663,7 @@ getPlottingDT <- function(fds, axis=c("row", "col"), type=NULL, result=NULL, } -#' @describeIn getter_setter_functions Dependend on the level of verbosity +#' @describeIn getter_setter_functions Dependent on the level of verbosity #' the algorithm reports more or less to the user. 0 means being quiet #' and 10 means everything. #' @export diff --git a/R/helper-functions.R b/R/helper-functions.R index 88a7cbae..17a843a3 100644 --- a/R/helper-functions.R +++ b/R/helper-functions.R @@ -548,3 +548,46 @@ getStrandString <- function(fds){ strand <- switch(strandSpecific(fds)+1L, "no", "yes", "reverse") return(strand) } + +#' +#' Check if adjusted pvalues have been computed for a given set of filters. +#' @noRd +checkPadjAvailableForFilters <- function(fds, type, filters=list(), + dist="BetaBinomial", aggregate=FALSE){ + dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) + aname <- paste0("padj", dist) + aname <- ifelse(isTRUE(aggregate), paste0(aname, "_gene"), aname) + # add information on used filters + for(n in sort(names(filters))){ + aname <- paste0(aname, "_", n, filters[[n]]) + } + pvalsAvailable <- paste(aname, type, sep="_") %in% assayNames(fds) + return(pvalsAvailable) +} + +#' +#' Find most aberrant junction for each gene +#' @noRd +findJunctionsForAberrantGenes <- function(gr, genes, pvals, dpsi, minCount, + rho, filters=list()){ + geneJunctions <- mcols(gr)$hgnc_symbol %in% genes + dt <- data.table(idx=which(geneJunctions == TRUE), + geneID=mcols(gr)$hgnc_symbol[geneJunctions], + pval=pvals[geneJunctions], + dpsi=abs(dpsi[geneJunctions]), + minCount=minCount[geneJunctions], + rho=rho[geneJunctions] ) + + # mask junctions that don't pass filters (minCount, dPsi, rho) + for(n in names(filters)){ + if(n == "rho"){ + dt[rho > filters[["rho"]], pval:=NA] + } else{ + dt[get(n) < filters[[n]], pval:=NA] + } + } + + # sort per gene by lowest pvalue / highest deltaPsi and return index + dt <- dt[order(geneID, pval, -dpsi)] + return(dt[!duplicated(dt, by="geneID"),idx]) +} diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R index deac8b59..99b5754b 100644 --- a/R/pvalsNzscore.R +++ b/R/pvalsNzscore.R @@ -109,11 +109,6 @@ calculatePvalues <- function(fds, type=currentType(fds), pvals <- 2 * pmin(pval, 1 - pval + dval, 0.5) pVals(fds, dist="BetaBinomial", level="junction", withDimnames=FALSE) <- pvals - fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues, - pvals=pvals, index, BPPARAM=BPPARAM) - fwer_pvals <- do.call(cbind, fwer_pval) - pVals(fds, dist="BetaBinomial", level="site", - withDimnames=FALSE) <- fwer_pvals } if("binomial" %in% distributions){ @@ -125,11 +120,6 @@ calculatePvalues <- function(fds, type=currentType(fds), pvals <- 2 * pmin(pval, 1 - pval + dval, 0.5) pVals(fds, dist="Binomial", level="junction", withDimnames=FALSE) <- pvals - fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues, - pvals=pvals, index, BPPARAM=BPPARAM) - fwer_pvals <- do.call(cbind, fwer_pval) - pVals(fds, dist="Binomial", level="site", - withDimnames=FALSE) <- fwer_pvals } if("normal" %in% distributions){ @@ -142,19 +132,17 @@ calculatePvalues <- function(fds, type=currentType(fds), pvals <- 2 * pmin(pval, 1 - pval, 0.5) pVals(fds, dist="Normal", level="junction", withDimnames=FALSE) <- pvals - fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues, - pvals=pvals, index, BPPARAM=BPPARAM) - fwer_pvals <- do.call(cbind, fwer_pval) - pVals(fds, dist="Normal", level="site", - withDimnames=FALSE) <- fwer_pvals } fds } -adjust_FWER_PValues <- function(i, pvals=pvals, index=index){ - dt <- data.table(p=pvals[,i], idx=index) - dt2 <- dt[,.(pa=min(p.adjust(p, method="holm"), na.rm=TRUE)),by=idx] +adjust_FWER_PValues <- function(i, pvals, index, rho, rhoCutoff, + method="holm"){ + dt <- data.table(p=pvals[,i], idx=index, rho=rho) + dt[rho > rhoCutoff, p:=NA] + dt2 <- dt[,.(pa=min(p.adjust(p, method=method), na.rm=TRUE)),by=idx] + dt2[is.infinite(pa), pa:=NA] setkey(dt2, "idx")[J(index)][,pa] } @@ -193,27 +181,100 @@ singlePvalueBinomial <- function(idx, k, n, mu){ #' @param method The p.adjust method that should be used. #' #' @export -calculatePadjValues <- function(fds, type=currentType(fds), method="BY"){ +calculatePadjValues <- function(fds, type=currentType(fds), method="BY", + rhoCutoff=0.1, + BPPARAM=bpparam()){ currentType(fds) <- type index <- getSiteIndex(fds, type=type) idx <- !duplicated(index) for(i in c("BetaBinomial", "Binomial", "Normal")){ # only do it if it exists - if(!paste0("pvalues", i, "_", type) %in% assayNames(fds)){ + if(!paste0("pvalues", i, "_junction_", type) %in% assayNames(fds)){ next } - pvals <- pVals(fds, dist=i) - padj <- apply(pvals[idx,], 2, p.adjust, method=method) + pvals <- pVals(fds, dist=i, level="junction") + rho <- rho(fds, type=type) + + # splice site-level pval correction + message(date(), ": adjusting junction-level pvalues ...") + fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues, + pvals=pvals, index, BPPARAM=BPPARAM, + method="holm", rho=rho, rhoCutoff=rhoCutoff) + fwer_pvals <- do.call(cbind, fwer_pval) + pVals(fds, dist=i, level="site", filters=list(rho=rhoCutoff), + withDimnames=FALSE) <- fwer_pvals + + # junction-level FDR correction + padj <- apply(fwer_pvals[idx,], 2, p.adjust, method=method) padjDT <- data.table(cbind(i=unique(index), padj), key="i")[J(index)] padjDT[,i:=NULL] - padjVals(fds, dist=i, withDimnames=FALSE) <- as.matrix(padjDT) + padjVals(fds, dist=i, level="site", filters=list(rho=rhoCutoff), + withDimnames=FALSE) <- as.matrix(padjDT) + + # gene-level pval correction and FDR + if("hgnc_symbol" %in% colnames(mcols(fds, type=type))){ + message(date(), ": calculating gene-level pvalues ...") + gene_pvals <- getPvalsPerGene(fds=fds, type=type, pvals=fwer_pvals, + method="holm", FDRmethod=method, + BPPARAM=BPPARAM) + pVals(fds, dist=i, level="gene", filters=list(rho=rhoCutoff), + withDimnames=FALSE) <- gene_pvals[["pvals"]] + padjVals(fds, dist=i, level="gene", filters=list(rho=rhoCutoff), + withDimnames=FALSE) <- gene_pvals[["padj"]] + } } return(fds) } +getPvalsPerGene <- function(fds, type, + pvals=pVals(fds, type=type, level="site"), + sampleID=NULL, method="holm", FDRmethod="BY", + BPPARAM=bpparam()){ + # extract data and take only the first index of per site + samples <- samples(fds) + if(is.null(colnames(pvals))){ + colnames(pvals) <- samples + } + dt <- data.table( + idx=getSiteIndex(fds, type=type), + geneID=getGeneIDs(fds, type=type, unique=FALSE), + as.data.table(pvals)) + dt <- dt[!is.na(geneID)] + geneIDs <- dt[,unique(geneID)] + setkey(dt, geneID) + + # extract samples + if(!is.null(sampleID)){ + samples <- sampleID + } + + # aggregate pvalues to gene level per sample + pvalsPerGene <- matrix(unlist(bplapply(samples, BPPARAM=BPPARAM, + function(i){ + dttmp <- dt[,min(p.adjust(.SD[!duplicated(idx),get(i)], + method=method), na.rm=TRUE), + by=geneID] + setkey(dttmp, geneID) + dttmp[J(geneIDs), V1] + })), ncol=length(samples)) + + colnames(pvalsPerGene) <- samples + rownames(pvalsPerGene) <- geneIDs + + # compute FDR + gene_padj <- apply(pvalsPerGene, 2, p.adjust, method=FDRmethod) + + # blow up back to original assay size + pvalsPerGene <- mapGeneToJunctionAssay(fds, type, pvalsPerGene) + padjPerGene <- mapGeneToJunctionAssay(fds, type, gene_padj) + + return(list(pvals=pvalsPerGene, padj=padjPerGene)) + +} + getSiteIndex <- function(fds, type){ if(type == "theta"){ return(mcols(fds, type=type)[['spliceSiteID']]) @@ -242,39 +303,13 @@ getGeneIDs <- function(fds, type, unique=TRUE){ geneIDs } -getPvalsPerGene <- function(fds, type, pvals=pVals(fds, type=type), - sampleID=NULL, method="holm", BPPARAM=bpparam()){ - # extract data and take only the first index of per site - dt <- data.table( - idx=getSiteIndex(fds, type=type), - geneID=getGeneIDs(fds, type=type, unique=FALSE), - as.data.table(pvals)) - dt <- dt[!duplicated(idx) & !is.na(geneID)] - setkey(dt, geneID) - - samples <- samples(fds) - if(!is.null(sampleID)){ - samples <- sampleID - } - - pvalsPerGene <- matrix(unlist(bplapply(samples, BPPARAM=BPPARAM, - function(i){ - dttmp <- dt[,min(p.adjust(get(i), method=method)),by=geneID] - setkey(dttmp, geneID) - dttmp[J(getGeneIDs(fds, type=type)), V1] - })), ncol=length(samples)) - - colnames(pvalsPerGene) <- samples - rownames(pvalsPerGene) <- getGeneIDs(fds, type=type) - - return(pvalsPerGene) - -} - -getPadjPerGene <- function(pvals, method="BY"){ - - padjPerGene <- apply(pvals, 2, p.adjust, method=method) - - return(padjPerGene) - +mapGeneToJunctionAssay <- function(fds, type, pvalMat){ + # blow up back to original assay size + pvalDT <- data.table(geneID=rownames(pvalMat), + as.data.table(pvalMat) ) + setkey(pvalDT, "geneID") + pvalDT <- pvalDT[J(getGeneIDs(fds, type=type, unique=FALSE))] + fullMat <- as.matrix(pvalDT[, -1]) + rownames(fullMat) <- pvalDT[, geneID] + return(fullMat) } diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 1a7c62e3..461280c0 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -6,6 +6,11 @@ using namespace Rcpp; +#ifdef RCPP_USE_GLOBAL_ROSTREAM +Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); +Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); +#endif + // setPseudoCount double setPseudoCount(double pseudoCount); RcppExport SEXP _FRASER_setPseudoCount(SEXP pseudoCountSEXP) { From 420e0d5c860b3c1e32b93af253d1b3b7d4512985 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Tue, 12 Oct 2021 18:32:01 +0200 Subject: [PATCH 02/80] update documentation & tests --- DESCRIPTION | 2 +- NAMESPACE | 1 - R/AllGenerics.R | 61 ++++++++++++--------------- R/countRNAseqData.R | 12 +----- R/find_encoding_dimensions.R | 4 +- R/getNSetterFuns.R | 5 ++- R/helper-functions.R | 26 +++++++++++- R/pvalsNzscore.R | 30 +++++++++---- man/FRASER.Rd | 26 ++++++++++-- man/getter_setter_functions.Rd | 28 +++++++++--- man/results.Rd | 36 ++++++++++------ tests/testthat/test_fraser_pipeline.R | 3 +- tests/testthat/test_stats.R | 50 ++++++++++++++++++++++ 13 files changed, 204 insertions(+), 80 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index a8ebe376..c7704274 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -28,7 +28,7 @@ biocViews: License: MIT + file LICENSE URL: https://github.com/gagneurlab/FRASER BugRepots: https://github.com/gagneurlab/FRASER/issues -RoxygenNote: 7.1.1 +RoxygenNote: 7.1.2 Encoding: UTF-8 VignetteBuilder: knitr Depends: diff --git a/NAMESPACE b/NAMESPACE index 35e2d66a..0625f5fc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -72,7 +72,6 @@ export(predictedMeans) export(pseudocount) export(psiTypes) export(results) -export(resultsByGenes) export(rho) export(samples) export(saveFraserDataSet) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 8298e03f..d1dba092 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -677,11 +677,12 @@ resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores, FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, dPsiCutoff, minCount, rhoCutoff, psiType, - maxCols=20, aggregate=FALSE, + maxCols=20, aggregate=FALSE, collapse=TRUE, BPPARAM=bpparam(), additionalColumns=NULL){ stopifnot(is(object, "FraserDataSet")) stopifnot(all(sampleIDs %in% samples(object))) + message(date(), " collapse = ", collapse) resultsls <- bplapply(psiType, BPPARAM=BPPARAM, function(type){ message(date(), ": Collecting results for: ", type) @@ -766,6 +767,19 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, ans <- ans[order(ans$pValue)] } + # collapse into one row per gene if requested + if(isTRUE(aggregate) && isTRUE(collapse) && length(ans) > 0){ + + ans <- ans[order(ans$pValueGene, ans$pValue)] + naIdx <- is.na(ans$hgncSymbol) + ansNoNA <- ans[!is.na(ans$hgncSymbol),] + + # get final result table + dupIdx <- duplicated(data.table(as.vector(ansNoNA$hgncSymbol), + as.vector(ansNoNA$sampleID))) + ans <- ans[!naIdx,][!dupIdx,] + } + # return only the results return(ans) } @@ -787,6 +801,9 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, #' @param minCount The minimum count value of the total coverage of an intron #' to be considered as significant. #' result +#' @param rhoCutoff The cutoff value on the fitted rho value +#' (overdispersion parameter of the betabinomial) above which +#' junctions are filtered #' @param psiType The psi types for which the results should be retrieved. #' @param additionalColumns Character vector containing the names of additional #' columns from mcols(fds) that should appear in the result table @@ -802,8 +819,11 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, #' @param by By default \code{none} which means no grouping. But if #' \code{sample} or \code{feature} is specified the sum by #' sample or feature is returned -#' @param aggregate If TRUE the returned object is based on the grouped -#' features +#' @param aggregate If TRUE the returned object is aggregated to the feature +#' level (i.e. gene level). +#' @param collapse Only takes effect if \code{aggregate=TRUE}. +#' If TRUE (default), collapses results across the different psi +#' types to return only one row per feature (gene) and sample. #' @param ... Further arguments can be passed to the method. If "zscores", #' "padjVals" or "dPsi" is given, the values of those arguments #' are used to define the aberrant events. @@ -826,7 +846,8 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, #' #' # aggregate the results by genes (gene symbols need to be annotated first #' # using annotateRanges() function) -#' resultsByGenes(res) +#' results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05, +#' aggregate=TRUE) #' #' # get aberrant events per sample: on the example data, nothing is aberrant #' # based on the adjusted p-value @@ -843,43 +864,17 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, setMethod("results", "FraserDataSet", function(object, sampleIDs=samples(object), padjCutoff=0.05, zScoreCutoff=NA, deltaPsiCutoff=0.3, - rhoCutoff=0.1, aggregate=FALSE, + rhoCutoff=0.1, aggregate=FALSE, collapse=TRUE, minCount=5, psiType=c("psi3", "psi5", "theta"), additionalColumns=NULL, BPPARAM=bpparam(), ...){ FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff, zscoreCutoff=zScoreCutoff, dPsiCutoff=deltaPsiCutoff, rhoCutoff=rhoCutoff, minCount=minCount, psiType=match.arg(psiType, several.ok=TRUE), - aggregate=aggregate, + aggregate=aggregate, collapse=collapse, additionalColumns=additionalColumns, BPPARAM=BPPARAM) }) -#' @rdname results -#' @export -resultsByGenes <- function(res, geneColumn="hgncSymbol"){ - # sort by gene pvalue - res <- res[order(res$pValueGene)] - - # extract subset - if(is(res, "GRanges")){ - ans <- as.data.table(mcols(res)[,c(geneColumn, "sampleID")]) - colnames(ans) <- c("features", "sampleID") - } else { - ans <- featureNames <- res[,.( - features=get(geneColumn), sampleID=sampleID)] - } - - # remove NAs and - # keep only one row per gene with lowest pvalue over all psiTypes - naIdx <- ans[,is.na(features)] - ansNoNA <- ans[!is.na(features)] - dupIdx <- duplicated(ansNoNA[,.(features,sampleID)]) - - # get final result table - finalAns <- res[!naIdx][!dupIdx] - finalAns -} - aberrant.FRASER <- function(object, type=currentType(object), padjCutoff=0.05, deltaPsiCutoff=0.3, zScoreCutoff=NA, minCount=5, rhoCutoff=0.1, @@ -957,7 +952,7 @@ aberrant.FRASER <- function(object, type=currentType(object), aberrantEvents[is.na(aberrantEvents)] <- FALSE } else{ - dt <- data.table(geneID=mcols(fds, type=type)$hgnc_symbol, padj) + dt <- data.table(geneID=mcols(object, type=type)$hgnc_symbol, padj) dt <- dt[!duplicated(dt, by="geneID") & !is.na(geneID)] padj <- as.matrix(dt[, -1]) rownames(padj) <- dt[,geneID] diff --git a/R/countRNAseqData.R b/R/countRNAseqData.R index 2f278832..54bee9e3 100644 --- a/R/countRNAseqData.R +++ b/R/countRNAseqData.R @@ -965,19 +965,9 @@ readJunctionMap <- function(junctionMap){ #' @noRd extractSpliceSiteCoordinates <- function(junctions, fds){ - # if(strandSpecific(fds) >= 1L){ - # spliceSiteCoords <- unlist(GRangesList( - # extractSpliceSiteCoordsPerStrand(junctions, "+"), - # extractSpliceSiteCoordsPerStrand(junctions, "-") - # )) - # } else { - # strand(junctions) <- "*" - # spliceSiteCoords <- extractSpliceSiteCoordsPerStrand(junctions, "*") - # } - spliceSiteCoords <- unlist(GRangesList( lapply(unique(strand(junctions)), extractSpliceSiteCoordsPerStrand, - junctions=junctions) + junctions=junctions) )) return(unique(sort(spliceSiteCoords))) diff --git a/R/find_encoding_dimensions.R b/R/find_encoding_dimensions.R index 212440cd..6653fcaa 100644 --- a/R/find_encoding_dimensions.R +++ b/R/find_encoding_dimensions.R @@ -27,6 +27,8 @@ predict_outliers <- function(fds, type, implementation, BPPARAM){ fds <- calculatePvalues(fds, type=type, implementation=implementation, BPPARAM=BPPARAM) + fds <- calculatePadjValues(fds, type=type, geneLevel=FALSE, + BPPARAM=BPPARAM) return(fds) } @@ -50,7 +52,7 @@ eval_prot <- function(fds, type){ }, FUN.VALUE=logical(length(unique(index))) ) ) + 0 if(any(is.na(scores))){ - warning(sum(is.na(scores)), " P-values where NAs.") + # warning(sum(is.na(scores)), " P-values where NAs.") scores[is.na(scores)] <- min(scores, na.rm=TRUE)-1 } pr <- pr.curve(scores, weights.class0=labels) diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R index 26f72951..22644e80 100644 --- a/R/getNSetterFuns.R +++ b/R/getNSetterFuns.R @@ -10,11 +10,14 @@ #' @param level Indicates if the retrieved p values should be adjusted on the #' donor/acceptor site-level (default) or if unadjusted junction-level #' p values should be returned. +#' @param filters A named list giving the filters that were applied for masking +#' during p value correction. Used for storing and retrieving the +#' correct set of requested p values. #' @param value The new value to be assigned. #' @param all Logical value indicating whether \code{hyperParams(fds)} should #' return the results of all evaluated parameter combinations or only #' for the optimal parameter combination. -#' @param ... Internally used parameteres. +#' @param ... Internally used parameters. #' @return A (delayed) matrix or vector dependent on the type of data retrieved. #' #' @name getter_setter_functions diff --git a/R/helper-functions.R b/R/helper-functions.R index 17a843a3..ca7db0a5 100644 --- a/R/helper-functions.R +++ b/R/helper-functions.R @@ -567,6 +567,18 @@ checkPadjAvailableForFilters <- function(fds, type, filters=list(), #' #' Find most aberrant junction for each gene +#' +#' @param gr GRanges object with information about junctions. +#' @param genes Significant genes for which the corresponding junction should +#' be extracted. +#' @param pvals Vector of pvalues (for one sample). +#' @param dpsi Vector of delta psi values (for one sample). +#' @param minCount Vector of total counts (N) to which minCount filter will be +#' applied. +#' @param rho Vector of rho values (for all junctions). +#' @param filters The filters which will be used for masked junctions during +#' extraction (possible options: dpsi, minCount, rho). +#' #' @noRd findJunctionsForAberrantGenes <- function(gr, genes, pvals, dpsi, minCount, rho, filters=list()){ @@ -581,13 +593,25 @@ findJunctionsForAberrantGenes <- function(gr, genes, pvals, dpsi, minCount, # mask junctions that don't pass filters (minCount, dPsi, rho) for(n in names(filters)){ if(n == "rho"){ + if(is.na(filters[["rho"]])){ + filters[["rho"]] <- 1 + } dt[rho > filters[["rho"]], pval:=NA] } else{ + if(is.na(filters[[n]])){ + filters[[n]] <- 0 + } dt[get(n) < filters[[n]], pval:=NA] } } # sort per gene by lowest pvalue / highest deltaPsi and return index dt <- dt[order(geneID, pval, -dpsi)] - return(dt[!duplicated(dt, by="geneID"),idx]) + dt <- dt[!duplicated(dt, by="geneID"),] + + # remove gene-level significant result if no junction in that gene passed + # the filters + dt <- dt[!is.na(pval),] + + return(dt[,idx]) } diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R index 99b5754b..16eae792 100644 --- a/R/pvalsNzscore.R +++ b/R/pvalsNzscore.R @@ -33,7 +33,7 @@ calculateZscore <- function(fds, type=currentType(fds), logit=TRUE){ #' @describeIn FRASER This function calculates two-sided p-values based on #' the beta-binomial distribution (or binomial or normal if desired). The -#' returned p values are already adjusted with Holm's method per donor or +#' returned p values are not yet adjusted with Holm's method per donor or #' acceptor site, respectively. #' #' @param distributions The distribution based on which the p-values are @@ -141,7 +141,8 @@ adjust_FWER_PValues <- function(i, pvals, index, rho, rhoCutoff, method="holm"){ dt <- data.table(p=pvals[,i], idx=index, rho=rho) dt[rho > rhoCutoff, p:=NA] - dt2 <- dt[,.(pa=min(p.adjust(p, method=method), na.rm=TRUE)),by=idx] + suppressWarnings(dt2 <- dt[,.(pa=min(p.adjust(p, method=method), + na.rm=TRUE)),by=idx]) dt2[is.infinite(pa), pa:=NA] setkey(dt2, "idx")[J(index)][,pa] } @@ -176,13 +177,22 @@ singlePvalueBinomial <- function(idx, k, n, mu){ } #' @describeIn FRASER This function adjusts the previously calculated -#' p-values per sample for multiple testing. +#' p-values per sample for multiple testing. First, the previoulsy calculated +#' junction-level p values are adjusted with Holm's method per donor or +#' acceptor site, respectively. Then, if gene symbols have been annotated to +#' junctions (and not otherwise requested), gene-level p values are computed. #' -#' @param method The p.adjust method that should be used. +#' @param method The p.adjust method that should be used for genome-wide +#' multiple testing correction. +#' @param rhoCutoff The cutoff value on the fitted rho value +#' (overdispersion parameter of the betabinomial) above which junctions are +#' masked with NA during p value adjustment. +#' @param geneLevel Logical value indiciating whether gene-level p values +#' should be calculated. Defaults to TRUE. #' #' @export calculatePadjValues <- function(fds, type=currentType(fds), method="BY", - rhoCutoff=0.1, + rhoCutoff=0.1, geneLevel=TRUE, BPPARAM=bpparam()){ currentType(fds) <- type index <- getSiteIndex(fds, type=type) @@ -214,7 +224,8 @@ calculatePadjValues <- function(fds, type=currentType(fds), method="BY", withDimnames=FALSE) <- as.matrix(padjDT) # gene-level pval correction and FDR - if("hgnc_symbol" %in% colnames(mcols(fds, type=type))){ + if("hgnc_symbol" %in% colnames(mcols(fds, type=type)) && + isTRUE(geneLevel)){ message(date(), ": calculating gene-level pvalues ...") gene_pvals <- getPvalsPerGene(fds=fds, type=type, pvals=fwer_pvals, method="holm", FDRmethod=method, @@ -254,11 +265,14 @@ getPvalsPerGene <- function(fds, type, # aggregate pvalues to gene level per sample pvalsPerGene <- matrix(unlist(bplapply(samples, BPPARAM=BPPARAM, function(i){ + suppressWarnings( dttmp <- dt[,min(p.adjust(.SD[!duplicated(idx),get(i)], method=method), na.rm=TRUE), by=geneID] - setkey(dttmp, geneID) - dttmp[J(geneIDs), V1] + ) + dttmp[is.infinite(V1), V1:=NA] + setkey(dttmp, geneID) + dttmp[J(geneIDs), V1] })), ncol=length(samples)) colnames(pvalsPerGene) <- samples diff --git a/man/FRASER.Rd b/man/FRASER.Rd index 22f3fff5..58f9fb10 100644 --- a/man/FRASER.Rd +++ b/man/FRASER.Rd @@ -28,7 +28,14 @@ calculatePvalues( capN = 5 * 1e+05 ) -calculatePadjValues(fds, type = currentType(fds), method = "BY") +calculatePadjValues( + fds, + type = currentType(fds), + method = "BY", + rhoCutoff = 0.1, + geneLevel = TRUE, + BPPARAM = bpparam() +) } \arguments{ \item{fds}{A \code{\link{FraserDataSet}} object} @@ -62,7 +69,15 @@ calculated. Possible are beta-binomial, binomial and normal.} \item{capN}{Counts are capped at this value to speed up the p-value calculation} -\item{method}{The p.adjust method that should be used.} +\item{method}{The p.adjust method that should be used for genome-wide +multiple testing correction.} + +\item{rhoCutoff}{The cutoff value on the fitted rho value +(overdispersion parameter of the betabinomial) above which junctions are +masked with NA during p value adjustment.} + +\item{geneLevel}{Logical value indiciating whether gene-level p values +should be calculated. Defaults to TRUE.} } \value{ FraserDataSet @@ -104,11 +119,14 @@ psi. \item \code{calculatePvalues}: This function calculates two-sided p-values based on the beta-binomial distribution (or binomial or normal if desired). The -returned p values are already adjusted with Holm's method per donor or +returned p values are not yet adjusted with Holm's method per donor or acceptor site, respectively. \item \code{calculatePadjValues}: This function adjusts the previously calculated -p-values per sample for multiple testing. +p-values per sample for multiple testing. First, the previoulsy calculated +junction-level p values are adjusted with Holm's method per donor or +acceptor site, respectively. Then, if gene symbols have been annotated to +junctions (and not otherwise requested), gene-level p values are computed. }} \examples{ diff --git a/man/getter_setter_functions.Rd b/man/getter_setter_functions.Rd index bf56e32c..79306a75 100644 --- a/man/getter_setter_functions.Rd +++ b/man/getter_setter_functions.Rd @@ -35,9 +35,23 @@ rho(fds, type = currentType(fds)) zScores(fds, type = currentType(fds), byGroup = FALSE, ...) -pVals(fds, type = currentType(fds), level = "site", dist = "BetaBinomial", ...) - -padjVals(fds, type = currentType(fds), dist = c("BetaBinomial"), ...) +pVals( + fds, + type = currentType(fds), + level = "site", + filters = list(rho = 0.1), + dist = "BetaBinomial", + ... +) + +padjVals( + fds, + type = currentType(fds), + dist = c("BetaBinomial"), + level = "site", + filters = list(rho = 0.1), + ... +) predictedMeans(fds, type = currentType(fds)) @@ -70,12 +84,16 @@ verbose(fds) <- value \item{byGroup}{If TRUE, aggregation by donor/acceptor site will be done.} -\item{...}{Internally used parameteres.} +\item{...}{Internally used parameters.} \item{level}{Indicates if the retrieved p values should be adjusted on the donor/acceptor site-level (default) or if unadjusted junction-level p values should be returned.} +\item{filters}{A named list giving the filters that were applied for masking +during p value correction. Used for storing and retrieving the +correct set of requested p values.} + \item{dist}{Distribution for which the p-values should be extracted.} \item{all}{Logical value indicating whether \code{hyperParams(fds)} should @@ -140,7 +158,7 @@ assays should be stored as hdf5 files. \item \code{dontWriteHDF5<-}: Sets whether the assays should be stored as hdf5 files. -\item \code{verbose}: Dependend on the level of verbosity +\item \code{verbose}: Dependent on the level of verbosity the algorithm reports more or less to the user. 0 means being quiet and 10 means everything. diff --git a/man/results.Rd b/man/results.Rd index 57682616..330aef7d 100644 --- a/man/results.Rd +++ b/man/results.Rd @@ -2,7 +2,6 @@ % Please edit documentation in R/AllGenerics.R \name{results,FraserDataSet-method} \alias{results,FraserDataSet-method} -\alias{resultsByGenes} \alias{aberrant,FraserDataSet-method} \title{Extracting results and aberrant splicing events} \usage{ @@ -12,6 +11,9 @@ padjCutoff = 0.05, zScoreCutoff = NA, deltaPsiCutoff = 0.3, + rhoCutoff = 0.1, + aggregate = FALSE, + collapse = TRUE, minCount = 5, psiType = c("psi3", "psi5", "theta"), additionalColumns = NULL, @@ -19,8 +21,6 @@ ... ) -resultsByGenes(res, geneColumn = "hgncSymbol", method = "BY") - \S4method{aberrant}{FraserDataSet}( object, type = currentType(object), @@ -28,6 +28,7 @@ resultsByGenes(res, geneColumn = "hgncSymbol", method = "BY") deltaPsiCutoff = 0.3, zScoreCutoff = NA, minCount = 5, + rhoCutoff = 0.1, by = c("none", "sample", "feature"), aggregate = FALSE, ... @@ -45,6 +46,17 @@ retrieved} \item{deltaPsiCutoff}{The cutoff on delta psi or NA if not requested.} +\item{rhoCutoff}{The cutoff value on the fitted rho value +(overdispersion parameter of the betabinomial) above which +junctions are filtered} + +\item{aggregate}{If TRUE the returned object is aggregated to the feature +level (i.e. gene level).} + +\item{collapse}{Only takes effect if \code{aggregate=TRUE}. +If TRUE (default), collapses results across the different psi +types to return only one row per feature (gene) and sample.} + \item{minCount}{The minimum count value of the total coverage of an intron to be considered as significant. result} @@ -62,6 +74,12 @@ are included.} "padjVals" or "dPsi" is given, the values of those arguments are used to define the aberrant events.} +\item{type}{Splicing type (psi5, psi3 or theta)} + +\item{by}{By default \code{none} which means no grouping. But if +\code{sample} or \code{feature} is specified the sum by +sample or feature is returned} + \item{res}{Result as created with \code{results()}} \item{geneColumn}{The name of the column in \code{mcols(res)} that contains @@ -69,15 +87,6 @@ the gene symbols.} \item{method}{The p.adjust method that is being used to adjust p values per sample.} - -\item{type}{Splicing type (psi5, psi3 or theta)} - -\item{by}{By default \code{none} which means no grouping. But if -\code{sample} or \code{feature} is specified the sum by -sample or feature is returned} - -\item{aggregate}{If TRUE the returned object is based on the grouped -features} } \value{ For \code{results}: GRanges object containing significant results. @@ -102,7 +111,8 @@ res # aggregate the results by genes (gene symbols need to be annotated first # using annotateRanges() function) -resultsByGenes(res) +results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05, + aggregate=TRUE) # get aberrant events per sample: on the example data, nothing is aberrant # based on the adjusted p-value diff --git a/tests/testthat/test_fraser_pipeline.R b/tests/testthat/test_fraser_pipeline.R index 069a217d..2bb6e2cc 100644 --- a/tests/testthat/test_fraser_pipeline.R +++ b/tests/testthat/test_fraser_pipeline.R @@ -4,7 +4,8 @@ test_that("FRASER function", { fds <- createTestFraserDataSet() expect_is(fds, "FraserDataSet") anames <- c(psiTypes, paste0(c("delta", "predictedMeans", - "pvaluesBetaBinomial", "padjBetaBinomial", "zScores"), "_", + "pvaluesBetaBinomial_rho0.1", "padjBetaBinomial_rho0.1", + "zScores"), "_", rep(psiTypes, 5))) expect_equal(anames %in% assayNames(fds), !logical(length(anames))) }) diff --git a/tests/testthat/test_stats.R b/tests/testthat/test_stats.R index 8e76cc19..496ffd64 100644 --- a/tests/testthat/test_stats.R +++ b/tests/testthat/test_stats.R @@ -30,3 +30,53 @@ test_that("Zscore calculation", { expect_equal(zscores, zScores(fds, "psi5")) }) + +test_that("Gene p value calculation with NAs", { + fds <- getFraser() + fds <- fds[15:24,] + mcols(fds, type="j")$hgnc_symbol <- rep(c("geneA", "geneB", "geneC"), + times=c(3, 4, 3)) + mcols(fds, type="ss")$hgnc_symbol <- rep(c("geneA", "geneB", "geneC"), + times=c(4, 6, 4)) + + # simulate junction with bad rho fit + rho_5 <- rho(fds, type="psi5") + rho_5[c(1, 4:7)] <- 0.5 + rho(fds, type="psi5") <- rho_5 + + rho_3 <- rho(fds, type="psi3") + rho_3 <- rep(0.5, length(rho_3)) + rho(fds, type="psi3") <- rho_3 + + # calc p values + fds <- calculatePadjValues(fds, type="psi5", rhoCutoff=0.1) + fds <- calculatePadjValues(fds, type="psi3", rhoCutoff=0.1) + + # check psi5 pvals are partly NAs + expect_equal(pVals(fds, type="psi5", level="site", + filters=list(rho=0.1))[4:7,1], + as.double(rep(NA, 4))) + expect_equal(pVals(fds, type="psi5", level="gene", + filters=list(rho=0.1))[4:7,2], + as.double(rep(NA, 4))) + expect_equal(padjVals(fds, type="psi5", level="site", + filters=list(rho=0.1))[4:7,1], + as.double(rep(NA, 4))) + expect_equal(padjVals(fds, type="psi5", level="gene", + filters=list(rho=0.1))[4:7,2], + as.double(rep(NA, 4))) + + # check psi3 pvals are all NAs + expect_equal(pVals(fds, type="psi3", level="site", + filters=list(rho=0.1))[,1], + as.double(rep(NA, nrow(fds)))) + expect_equal(pVals(fds, type="psi3", level="gene", + filters=list(rho=0.1))[,2], + as.double(rep(NA, nrow(fds)))) + expect_equal(padjVals(fds, type="psi3", level="site", + filters=list(rho=0.1))[,1], + as.double(rep(NA, nrow(fds)))) + expect_equal(padjVals(fds, type="psi3", level="gene", + filters=list(rho=0.1))[,2], + as.double(rep(NA, nrow(fds)))) +}) From f2b992ad1510dbd9f335d4bd1585e78dc29c026c Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Wed, 13 Oct 2021 16:57:22 +0200 Subject: [PATCH 03/80] fix plotting examples --- R/plotMethods.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/plotMethods.R b/R/plotMethods.R index 0afa2fd0..43a66403 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -213,7 +213,7 @@ plotVolcano.FRASER <- function(object, sampleID, } if(!is.na(padjCutoff)){ - if(dt[,any(padj < padjCutoff)]){ + if(dt[padj < padjCutoff, .N] > 0){ padj_line <- min(dt[padj < padjCutoff, -log10(pval)]) } if(!"padj_line" %in% ls() || padj_line > 10 || is.na(padj_line)){ From ad726273f39eeb1600e97ec1ff0878e94e62a433 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Tue, 2 Nov 2021 15:57:49 +0100 Subject: [PATCH 04/80] calculation of gene pvals for all annotated genes --- R/AllGenerics.R | 202 +++++++++++++++++++++--------------- R/FRASER-package.R | 3 +- R/FraserDataSet-class.R | 15 +-- R/annotationOfRanges.R | 120 +++++++++++---------- R/getNSetterFuns.R | 74 ++++++++++--- R/helper-functions.R | 117 ++++++++++++++------- R/makeSimulatedDataset.R | 3 +- R/plotMethods.R | 4 + R/pvalsNzscore.R | 131 +++++++++++++++-------- man/FRASER.Rd | 4 + man/results.Rd | 39 +++---- tests/testthat/test_stats.R | 48 ++++----- 12 files changed, 482 insertions(+), 278 deletions(-) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index d1dba092..ba1ce932 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -616,16 +616,19 @@ resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores, psivals, rawCts, rawTotalCts, deltaPsiVals, psiType, rowMeansK, rowMeansN, aberrant, aggregate, rho, pvalsGene=NULL, padjsGene=NULL, - additionalColumns, filters=list()){ - + aberrantGene, additionalColumns, + geneColumn="hgnc_symbol"){ + mcols(gr)$idx <- seq_along(gr) # if gene level results, find the most aberrant junction per gene first if(isTRUE(aggregate)){ - goodGenes <- rownames(aberrant)[aberrant[,sampleID]] - geneJunctions <- findJunctionsForAberrantGenes(gr=gr, genes=goodGenes, + goodGenes <- rownames(aberrantGene)[aberrantGene[,sampleID] & + !is.na(aberrantGene[,sampleID])] + geneJunctions <- findJunctionsForAberrantGenes(gr=gr, + aberrantGenes=goodGenes, pvals=pvals[,sampleID], dpsi=deltaPsiVals[,sampleID], - minCount=rawTotalCts[,sampleID], rho=rho, - filters=filters) + geneColumn=geneColumn, + aberrantJunctions=aberrant[,sampleID]) goodCut <- rep(FALSE, nrow(pvals)) goodCut[geneJunctions] <- TRUE } else{ @@ -637,33 +640,43 @@ resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores, if(!any(goodCut)){ return(ans) } + mcols(ans)$idx <- mcols(gr)$idx[goodCut] - if(!"hgnc_symbol" %in% colnames(mcols(gr))){ - mcols(gr)$hgnc_symbol <- NA_character_ + if(!geneColumn %in% colnames(mcols(gr))){ + mcols(gr)[,geneColumn] <- NA_character_ } # extract data mcols(ans)$sampleID <- Rle(sampleID) if("hgnc_symbol" %in% colnames(mcols(gr))){ - mcols(ans)$hgncSymbol <- Rle(mcols(gr[goodCut])$hgnc_symbol) - } - if("other_hgnc_symbol" %in% colnames(mcols(gr))){ - mcols(ans)$addHgncSymbols <- Rle(mcols(gr[goodCut])$other_hgnc_symbol) + mcols(ans)$hgncSymbol <- Rle(mcols(gr[goodCut])[,geneColumn]) } + mcols(ans)$type <- Rle(psiType) mcols(ans)$pValue <- signif(pvals[goodCut,sampleID], 5) mcols(ans)$padjust <- signif(padjs[goodCut,sampleID], 5) mcols(ans)$zScore <- Rle(round(zscores[goodCut,sampleID], 2)) mcols(ans)$psiValue <- Rle(round(psivals[goodCut,sampleID], 2)) - mcols(ans)$deltaPsi <- Rle(round(deltaPsiVals[goodCut,sampleID], 2)) - mcols(ans)$meanCounts <- Rle(round(rowMeansK[goodCut], 2)) - mcols(ans)$meanTotalCounts <- Rle(round(rowMeansN[goodCut], 2)) + mcols(ans)$deltaPsi <- round(deltaPsiVals[goodCut,sampleID], 2) mcols(ans)$counts <- Rle(rawCts[goodCut, sampleID]) mcols(ans)$totalCounts <- Rle(rawTotalCts[goodCut, sampleID]) + mcols(ans)$meanCounts <- Rle(round(rowMeansK[goodCut], 2)) + mcols(ans)$meanTotalCounts <- Rle(round(rowMeansN[goodCut], 2)) if(isTRUE(aggregate)){ - mcols(ans)$pValueGene <- signif(pvalsGene[goodCut,sampleID], 5) - mcols(ans)$padjustGene <- signif(padjsGene[goodCut,sampleID], 5) + # report junction more than once if it is significant for several genes + nrGenesPerJunction <- table(geneJunctions) + ans <- rep(ans, nrGenesPerJunction[as.character(mcols(ans)$idx)]) + mcols(ans)$hgncSymbol <- + as.data.table(ans)[, names(geneJunctions)[geneJunctions == idx], + by = eval(colnames(mcols(ans)))][,V1] + + # add gene level pvalue + mcols(ans)$pValueGene <- + signif(pvalsGene[mcols(ans)$hgncSymbol,sampleID], 5) + mcols(ans)$padjustGene <- + signif(padjsGene[mcols(ans)$hgncSymbol,sampleID], 5) + mcols(ans)$hgncSymbol <- Rle(mcols(ans)$hgncSymbol) } if(!is.null(additionalColumns)){ @@ -672,17 +685,21 @@ resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores, } } - return(ans[order(mcols(ans)$pValue)]) + # remove helper column + mcols(ans)$idx <- NULL + + + return(ans[order(mcols(ans)$pValue, -abs(mcols(ans)$deltaPsi))]) } FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, dPsiCutoff, minCount, rhoCutoff, psiType, - maxCols=20, aggregate=FALSE, collapse=TRUE, - BPPARAM=bpparam(), additionalColumns=NULL){ + maxCols=20, aggregate=FALSE, collapse=FALSE, + geneColumn="hgnc_symbol", BPPARAM=bpparam(), + additionalColumns=NULL){ stopifnot(is(object, "FraserDataSet")) stopifnot(all(sampleIDs %in% samples(object))) - message(date(), " collapse = ", collapse) resultsls <- bplapply(psiType, BPPARAM=BPPARAM, function(type){ message(date(), ": Collecting results for: ", type) @@ -713,7 +730,7 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, muPsi <- as.matrix(predictedMeans(tmp_x)) psivals_pc <- (rawCts + pseudocount()) / (rawTotalCts + 2*pseudocount()) - deltaPsiVals <- psivals_pc - muPsi + deltaPsiVals <- deltaPsiValue(tmp_x, type) rho <- rho(tmp_x, type) aberrant <- aberrant.FRASER(tmp_x, type=type, padjCutoff=fdrCutoff, @@ -721,13 +738,23 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, deltaPsiCutoff=dPsiCutoff, minCount=minCount, rhoCutoff=rhoCutoff, - aggregate=aggregate) + aggregate=FALSE, + geneColumn=geneColumn) if(isTRUE(aggregate)){ pvalsGene <- as.matrix(pVals(tmp_x, level="gene")) padjsGene <- as.matrix(padjVals(tmp_x, level="gene")) + aberrantGene <- aberrant.FRASER(tmp_x, type=type, + padjCutoff=fdrCutoff, + zScoreCutoff=zscoreCutoff, + deltaPsiCutoff=dPsiCutoff, + minCount=minCount, + rhoCutoff=rhoCutoff, + aggregate=TRUE, + geneColumn=geneColumn) } else{ pvalsGene <- NULL padjsGene <- NULL + aberrantGene <- NULL } if(length(sc) == 1){ @@ -745,12 +772,10 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, rawCts=rawCts, rawTotalCts=rawTotalCts, rowMeansK=rowMeansK, rowMeansN=rowMeansN, aberrant=aberrant, aggregate=aggregate, - rho=rho, + rho=rho, geneColumn=geneColumn, pvalsGene=pvalsGene, padjsGene=padjsGene, - additionalColumns=additionalColumns, - filters=list(dpsi=dPsiCutoff, - minCount=minCount, - rho=rhoCutoff)) + aberrantGene=aberrantGene, + additionalColumns=additionalColumns) # return combined result return(unlist(GRangesList(sampleRes))) @@ -768,16 +793,8 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, } # collapse into one row per gene if requested - if(isTRUE(aggregate) && isTRUE(collapse) && length(ans) > 0){ - - ans <- ans[order(ans$pValueGene, ans$pValue)] - naIdx <- is.na(ans$hgncSymbol) - ansNoNA <- ans[!is.na(ans$hgncSymbol),] - - # get final result table - dupIdx <- duplicated(data.table(as.vector(ansNoNA$hgncSymbol), - as.vector(ansNoNA$sampleID))) - ans <- ans[!naIdx,][!dupIdx,] + if(isTRUE(aggregate) && isTRUE(collapse)){ + ans <- collapseResTablePerGene(ans) } # return only the results @@ -810,11 +827,6 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, #' (e.g. ensembl_gene_id). Default is \code{NULL}, so no additional columns #' are included. #' @param BPPARAM The BiocParallel parameter. -#' @param res Result as created with \code{results()} -#' @param geneColumn The name of the column in \code{mcols(res)} that contains -#' the gene symbols. -#' @param method The p.adjust method that is being used to adjust p values per -#' sample. #' @param type Splicing type (psi5, psi3 or theta) #' @param by By default \code{none} which means no grouping. But if #' \code{sample} or \code{feature} is specified the sum by @@ -822,11 +834,13 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, #' @param aggregate If TRUE the returned object is aggregated to the feature #' level (i.e. gene level). #' @param collapse Only takes effect if \code{aggregate=TRUE}. -#' If TRUE (default), collapses results across the different psi +#' If TRUE, collapses results across the different psi #' types to return only one row per feature (gene) and sample. -#' @param ... Further arguments can be passed to the method. If "zscores", -#' "padjVals" or "dPsi" is given, the values of those arguments -#' are used to define the aberrant events. +#' @param geneColumn The column name of the column that has the gene annotation +#' that will be used for gene-level pvalue computation. +#' @param ... Further arguments can be passed to the method. If "n", "zscores", +#' "padjVals", "dPsi" or "rhoVals" are given, the values of those +#' arguments are used to define the aberrant events. #' #' @return For \code{results}: GRanges object containing significant results. #' For \code{aberrant}: Either a of logical values of size @@ -839,16 +853,21 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, #' # get data, fit and compute p-values and z-scores #' fds <- createTestFraserDataSet() #' -#' # extract results: for this example dataset, z score cutoff of 2 is used to -#' # get at least one result and show the output -#' res <- results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05) +#' # extract results: for this example dataset, z score cutoff of 1 and delta +#' # psi cutoff of 0.2 is used to get at least one result and show the output +#' res <- results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=0.2) #' res #' #' # aggregate the results by genes (gene symbols need to be annotated first #' # using annotateRanges() function) -#' results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05, +#' results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=0.1, #' aggregate=TRUE) #' +#' # aggregate the results by genes and collapse over all psi types to obtain +#' # only one row per gene in the results table +#' results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=0.1, +#' aggregate=TRUE, collapse=TRUE) +#' #' # get aberrant events per sample: on the example data, nothing is aberrant #' # based on the adjusted p-value #' aberrant(fds, type="psi5", by="sample") @@ -864,14 +883,15 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, setMethod("results", "FraserDataSet", function(object, sampleIDs=samples(object), padjCutoff=0.05, zScoreCutoff=NA, deltaPsiCutoff=0.3, - rhoCutoff=0.1, aggregate=FALSE, collapse=TRUE, + rhoCutoff=0.1, aggregate=FALSE, collapse=FALSE, minCount=5, psiType=c("psi3", "psi5", "theta"), - additionalColumns=NULL, BPPARAM=bpparam(), ...){ + geneColumn="hgnc_symbol", + additionalColumns=NULL, BPPARAM=bpparam()){ FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff, zscoreCutoff=zScoreCutoff, dPsiCutoff=deltaPsiCutoff, rhoCutoff=rhoCutoff, minCount=minCount, psiType=match.arg(psiType, several.ok=TRUE), - aggregate=aggregate, collapse=collapse, + aggregate=aggregate, collapse=collapse, geneColumn=geneColumn, additionalColumns=additionalColumns, BPPARAM=BPPARAM) }) @@ -879,7 +899,7 @@ aberrant.FRASER <- function(object, type=currentType(object), padjCutoff=0.05, deltaPsiCutoff=0.3, zScoreCutoff=NA, minCount=5, rhoCutoff=0.1, by=c("none", "sample", "feature"), - aggregate=FALSE, ...){ + aggregate=FALSE, geneColumn="hgnc_symbol", ...){ checkNaAndRange(padjCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE) checkNaAndRange(zScoreCutoff, min=0, max=Inf, scalar=TRUE, na.ok=TRUE) @@ -911,8 +931,7 @@ aberrant.FRASER <- function(object, type=currentType(object), "Please compute them first by running the ", "calculatePadjValues function with the requested filters.") } - pvalLevel <- ifelse(isTRUE(aggregate), "gene", "site") - padj <- padjVals(object, type=type, level=pvalLevel, + padj <- padjVals(object, type=type, level="site", filters=list(rho=rhoCutoff)) } if("dPsi" %in% names(dots)){ @@ -926,38 +945,59 @@ aberrant.FRASER <- function(object, type=currentType(object), rho <- matrix(rho(object, type=type), nrow=nrow(dpsi), ncol=ncol(dpsi)) } + if(isTRUE(aggregate)){ + if("padjGeneVals" %in% names(dots)){ + padj_gene <- dots[['padjGeneVals']] + } else{ + padj_gene <- padjVals(object, type=type, level="gene", + filters=list(rho=rhoCutoff)) + } + + } if(is.na(padjCutoff)){ padjCutoff <- 1 } - if(isFALSE(aggregate)){ - aberrantEvents <- padj <= padjCutoff + aberrantEvents <- as.matrix(padj) <= padjCutoff + + # check each cutoff if in use (not NA) + if(!is.na(minCount)){ + aberrantEvents <- aberrantEvents & as.matrix(n >= minCount) + } + if(!is.na(zScoreCutoff)){ + aberrantEvents <- aberrantEvents & + as.matrix(abs(zscores) > zScoreCutoff) + } + if(!is.na(deltaPsiCutoff)){ + aberrantEvents <- aberrantEvents & + as.matrix(abs(dpsi) > deltaPsiCutoff) + } + if(!is.na(rhoCutoff)){ + aberrantEvents <- aberrantEvents & as.matrix(rho < rhoCutoff) + } + aberrantEvents[is.na(aberrantEvents)] <- FALSE - # check each cutoff if in use (not NA) - if(!is.na(minCount)){ - aberrantEvents <- aberrantEvents & as.matrix(n >= minCount) - } - if(!is.na(zScoreCutoff)){ - aberrantEvents <- aberrantEvents & - as.matrix(abs(zscores) > zScoreCutoff) - } - if(!is.na(deltaPsiCutoff)){ - aberrantEvents <- aberrantEvents & - as.matrix(abs(dpsi) > deltaPsiCutoff) - } - if(!is.na(rhoCutoff)){ - aberrantEvents <- aberrantEvents & as.matrix(rho < rhoCutoff) + if(isTRUE(aggregate)){ + if(is.null(rownames(padj_gene))){ + stop("Missing rownames for gene-level padj values.") } - aberrantEvents[is.na(aberrantEvents)] <- FALSE - - } else{ - dt <- data.table(geneID=mcols(object, type=type)$hgnc_symbol, padj) - dt <- dt[!duplicated(dt, by="geneID") & !is.na(geneID)] - padj <- as.matrix(dt[, -1]) - rownames(padj) <- dt[,geneID] + # reduce aberrant matrix to one row per gene + # (TRUE if any junction is aberrant for each sample) + ab_dt <- data.table(geneID=getGeneIDs(object, type=type, unique=FALSE, + geneColumn=geneColumn), + aberrantEvents) + ab_dt[, dt_idx:=seq_len(.N)] + dt_tmp <- ab_dt[!is.na(geneID), splitGenes(geneID), by="dt_idx"] + ab_dt <- ab_dt[dt_tmp$dt_idx] + ab_dt[,`:=`(geneID=dt_tmp$V1, dt_idx=NULL)] + ab_dt <- ab_dt[,lapply(.SD, any), by="geneID"] + aberrantEvents <- as.matrix(ab_dt[,-1]) + rownames(aberrantEvents) <- ab_dt[,geneID] - aberrantEvents <- padj <= padjCutoff + aberrantEvents <- aberrantEvents & as.matrix( + padj_gene[rownames(aberrantEvents),colnames(aberrantEvents)] + ) <= padjCutoff } return(switch(match.arg(by), diff --git a/R/FRASER-package.R b/R/FRASER-package.R index 3cecc264..35bf0016 100644 --- a/R/FRASER-package.R +++ b/R/FRASER-package.R @@ -128,5 +128,6 @@ globalVariables(c(".", "J", ".N", ".asDataFrame", "End", "first_feature", "model", "mu", "n", ",nsubset", "o3", "o5", "obsPsi", "os", "pa", "padj", "passed", "pByFeature", "pointNr", "predPsi", "psi3", "psi5", "psiType", "psiValue", "seqlength", "seqlevel", "Step", "traceNr", - "uniqueID", "V1", "value", "zscore", "maxDTheta"), + "uniqueID", "V1", "value", "zscore", "maxDTheta", "genes_donor", + "genes_acceptor", "gene_pval", "gene_padj", "dt_idx"), package="FRASER") diff --git a/R/FraserDataSet-class.R b/R/FraserDataSet-class.R index 8857751b..1ddea660 100644 --- a/R/FraserDataSet-class.R +++ b/R/FraserDataSet-class.R @@ -22,7 +22,8 @@ setClass("FraserDataSet", name = "Data Analysis", bamParam = ScanBamParam(mapqFilter=0), strandSpecific = 0L, - workingDir = file.path(tempdir(), "FRASER"), + workingDir = getwd(), + # file.path(tempdir(), "FRASER"), nonSplicedReads = SummarizedExperiment(rowRanges=GRanges()) ) ) @@ -192,13 +193,13 @@ showFraserDataSet <- function(object) { cat("\n") cat("-------------------- BAM parameters --------------------\n") - if(identical(scanBamParam(FraserDataSet()), scanBamParam(object))){ - cat(paste0("Default used with: ", - "bamMapqFilter=", bamMapqFilter(scanBamParam(object)) - )) - } else { + # if(identical(scanBamParam(FraserDataSet()), scanBamParam(object))){ + # cat(paste0("Default used with: ", + # "bamMapqFilter=", bamMapqFilter(scanBamParam(object)) + # )) + # } else { show(scanBamParam(object)) - } + # } cat("\n\n") } diff --git a/R/annotationOfRanges.R b/R/annotationOfRanges.R index 62f6c377..da036f5b 100644 --- a/R/annotationOfRanges.R +++ b/R/annotationOfRanges.R @@ -59,9 +59,6 @@ annotateRanges <- function(fds, feature="hgnc_symbol", featureName=feature, if(length(fds) == 0) return(fds) # useEnsembl only understands GRCh=37 or GRCh=NULL (uses 38 then) - if(is.null(GRCh)){ - GRCh <- 38 - } if(GRCh == 38){ GRCh <- NULL } @@ -89,17 +86,16 @@ annotateRanges <- function(fds, feature="hgnc_symbol", featureName=feature, annotation <- getFeatureAsGRange(ensembl, feature, featureName, biotype, useUSCS) - # annotate split reads - for(i in c("psi3", "theta")){ - gr <- rowRanges(fds, type=i) - if(any(strand(gr) == "*")){ - strand(annotation) <- "*" - } - annos <- getAnnotationFeature(data=gr, featureName, annotation) - mcols(fds, type=i)[[featureName]] <- annos[["feature"]] - mcols(fds, type=i)[[paste0("other_", featureName)]] <- - annos[["other_features"]] + # annotate splice sites first + gr <- rowRanges(fds, type="theta") + if(any(strand(gr) == "*")){ + strand(annotation) <- "*" } + annos <- getAnnotationFeature(data=gr, featureName, annotation) + mcols(fds, type="theta")[[featureName]] <- annos + + # annotate junctions with genes at donor and acceptor sites + fds <- annotateFeatureFromSpliceSite(fds, featureName) return(fds) } @@ -132,41 +128,40 @@ annotateRangesWithTxDb <- function(fds, feature="SYMBOL", } } - for(i in c("psi3", "theta")){ - # get GRanges object with the split reads which should be annotated - gr <- rowRanges(fds, type=i) - - # get the annotation to compare to - anno <- genes(txdb) - if(is.data.table(orgDb)){ - tmp <- merge(x=as.data.table(anno)[,.(gene_id)], y=orgDb, - by.y=keytype, by.x="gene_id", all.x=TRUE, sort=FALSE)[, - .(gene_id, feature=get(feature))] - setnames(tmp, "feature", feature) - } else { - tmp <- as.data.table(select(orgDb, keys=mcols(anno)[,"gene_id"], - columns=feature, keytype=keytype)) - } - - # add the new feature to the annotation - tmp[, uniqueID := .GRP, by=keytype] - anno <- anno[tmp[,uniqueID]] - mcols(anno)[[featureName]] <- tmp[,get(feature)] + # get GRanges object with the splice sites which should be annotated + gr <- rowRanges(fds, type="theta") + + # get the annotation to compare to + anno <- genes(txdb) + if(is.data.table(orgDb)){ + tmp <- merge(x=as.data.table(anno)[,.(gene_id)], y=orgDb, + by.y=keytype, by.x="gene_id", all.x=TRUE, sort=FALSE)[, + .(gene_id, feature=get(feature))] + setnames(tmp, "feature", feature) + } else { + tmp <- as.data.table(select(orgDb, keys=mcols(anno)[,"gene_id"], + columns=feature, keytype=keytype)) + } - # clean up of NA and "" ids - anno <- anno[!is.na(mcols(anno)[,featureName]),] - anno <- anno[mcols(anno)[,featureName] != "",] - if(any(strand(gr) == "*")){ - strand(anno) <- "*" - } + # add the new feature to the annotation + tmp[, uniqueID := .GRP, by=keytype] + anno <- anno[tmp[,uniqueID]] + mcols(anno)[[featureName]] <- tmp[,get(feature)] - # retrieve the feature of interest for the split reads - annos <- getAnnotationFeature(data=gr, featureName, anno) - mcols(fds, type=i)[[featureName]] <- annos[["feature"]] - mcols(fds, type=i)[[paste0("other_", featureName)]] <- - annos[["other_features"]] + # clean up of NA and "" ids + anno <- anno[!is.na(mcols(anno)[,featureName]),] + anno <- anno[mcols(anno)[,featureName] != "",] + if(any(strand(gr) == "*")){ + strand(anno) <- "*" } + # retrieve the feature of interest for the splice sites + annos <- getAnnotationFeature(data=gr, featureName, anno) + mcols(fds, type="theta")[[featureName]] <- annos + + # transfer annoated features for splice sites to junctions + fds <- annotateFeatureFromSpliceSite(fds, featureName) + return(fds) } @@ -228,14 +223,12 @@ getAnnotationFeature <- function(data, feature, annotation){ } # extract only the feature and group them with a ";" - featureDT <- featureDT[, - list(first_feature=unique(feature)[1], - other_features=paste(unique(feature)[-1], collapse = ";")), - by="from" - ] + featureDT <- featureDT[,feature:=paste(unique(feature), collapse = ";"), + by="from"] + featureDT <- featureDT[!duplicated(featureDT),] + featureDT[feature == "NA", feature:=NA] - return(list(feature=featureDT[order(from),first_feature], - other_features=featureDT[order(from),other_features])) + return(featureDT[order(from),feature]) } @@ -314,4 +307,27 @@ findAnnotatedJunction <- function(fds, annotation, annotateNames=TRUE, fds } - +#' annotate junctions with genes at donor and acceptor sites +#' @noRd +annotateFeatureFromSpliceSite <- function(fds, featureName){ + ssdt <- data.table(spliceSiteID=mcols(fds, type="theta")$spliceSiteID, + genes=mcols(fds, type="theta")[[featureName]] + ) + junction_dt <- data.table(startID=mcols(fds, type="psi3")$startID, + endID=mcols(fds, type="psi3")$endID + ) + junction_dt <- merge(junction_dt, ssdt, all.x=TRUE, + by.x="startID", by.y="spliceSiteID", sort=FALSE) + setnames(junction_dt, "genes", "genes_donor") + junction_dt <- merge(junction_dt, ssdt, all.x=TRUE, + by.x="endID", by.y="spliceSiteID", sort=FALSE) + setnames(junction_dt, "genes", "genes_acceptor") + + junction_dt[,genes:=paste(uniqueIgnoreNA( + c(splitGenes(genes_donor), splitGenes(genes_acceptor))), + collapse=";"), + by="startID,endID"] + junction_dt[genes == "NA", genes:=NA] + mcols(fds, type="j")[[featureName]] <- junction_dt[,genes] + return(fds) +} diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R index 22644e80..8e91710f 100644 --- a/R/getNSetterFuns.R +++ b/R/getNSetterFuns.R @@ -271,10 +271,11 @@ pVals <- function(fds, type=currentType(fds), level="site", aname <- paste0(aname, "_", n, filters[[n]]) } if(level == "gene"){ - if(!paste(aname, type, sep="_") %in% assayNames(fds)){ + if(!paste(aname, type, sep="_") %in% names(metadata(fds))){ stop("Did not find gene-level p values. ", "Please compute them first.") } + return(metadata(fds)[[paste(aname, type, sep="_")]]) } } @@ -298,7 +299,15 @@ pVals <- function(fds, type=currentType(fds), level="site", for(n in sort(names(filters))){ aname <- paste0(aname, "_", n, filters[[n]]) } - setAssayMatrix(fds, name=aname, type=type, ...) <- value + + if(level == "gene"){ + if(is.null(rownames(value))){ + stop("Missing rownames when storing gene-level pvalues.") + } + metadata(fds)[[paste(aname, type, sep="_")]] <- value + } else{ + setAssayMatrix(fds, name=aname, type=type, ...) <- value + } return(fds) } @@ -315,10 +324,11 @@ padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"), aname <- paste0(aname, "_", n, filters[[n]]) } if(level == "gene"){ - if(!paste(aname, type, sep="_") %in% assayNames(fds)){ - stop("Did not find gene-level p values. ", + if(!paste(aname, type, sep="_") %in% names(metadata(fds))){ + stop("Did not find gene-level padj values. ", "Please compute them first.") } + return(metadata(fds)[[paste(aname, type, sep="_")]]) } return(getAssayMatrix(fds, aname, type=type, ...)) } @@ -333,7 +343,14 @@ padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"), for(n in sort(names(filters))){ aname <- paste0(aname, "_", n, filters[[n]]) } - setAssayMatrix(fds, name=aname, type=type, ...) <- value + if(level == "gene"){ + if(is.null(rownames(value))){ + stop("Missing rownames when storing gene-level pvalues.") + } + metadata(fds)[[paste(aname, type, sep="_")]] <- value + } else{ + setAssayMatrix(fds, name=aname, type=type, ...) <- value + } return(fds) } @@ -589,7 +606,8 @@ getIndexFromResultTable <- function(fds, resultTable, padj.method="holm"){ } getPlottingDT <- function(fds, axis=c("row", "col"), type=NULL, result=NULL, - idx=NULL, aggregate=FALSE, pvalLevel="site", Ncpus=3, ...){ + idx=NULL, aggregate=FALSE, pvalLevel="site", Ncpus=3, + geneColumn="hgnc_symbol", ...){ if(!is.null(result)){ type <- as.character(result$type) idx <- getIndexFromResultTable(fds, result) @@ -611,8 +629,8 @@ getPlottingDT <- function(fds, axis=c("row", "col"), type=NULL, result=NULL, spliceID <- getSiteIndex(fds, type=type)[idxrow] feature_names <- rownames(mcols(fds, type=type))[idxrow] - if("hgnc_symbol" %in% colnames(mcols(fds, type=type))){ - feature_names <- mcols(fds, type=type)[idxrow,"hgnc_symbol"] + if(geneColumn %in% colnames(mcols(fds, type=type))){ + feature_names <- mcols(fds, type=type)[idxrow, geneColumn] } if(is.null(feature_names)){ feature_names <- as.character(seq_row(mcols(fds, type=type)))[idxrow] @@ -638,29 +656,53 @@ getPlottingDT <- function(fds, axis=c("row", "col"), type=NULL, result=NULL, obsPsi = c((k + pseudocount())/(n + 2*pseudocount())), predPsi = c(predictedMeans(fds, type)[idxrow, idxcol]), rho = rep(rho(fds, type=type)[idxrow], - ifelse(isTRUE(idxcol), ncol(fds), length(idxcol))) + ifelse(isTRUE(idxcol), ncol(fds), sum(idxcol))) ) dt[, deltaPsi:=obsPsi - predPsi] # add aberrant information to it aberrantVec <- aberrant(fds, ..., padjVals=dt[,.(padj)], dPsi=dt[,.(deltaPsi)], zscores=dt[,.(zscore)], n=dt[,.(n)], - rhoVals=dt[,.(rho)]) + rhoVals=dt[,.(rho)], aggregate=FALSE) dt[,aberrant:=aberrantVec] - # if requested return gene p values (correct for multiple testing again) + # if requested return gene p values if(isTRUE(aggregate)){ - dt[, pval:=c(pVals(fds, type=type, - level="gene")[idxrow, idxcol])] - dt[, padj:=c(padjVals(fds, type=type, - level="gene")[idxrow, idxcol]),] dt <- dt[!is.na(featureID)] + # split featureID into several rows if more than one + dt[, dt_idx:=seq_len(.N)] + dt_tmp <- dt[, splitGenes(featureID), by="dt_idx"] + dt <- dt[dt_tmp$dt_idx,] + dt[,`:=`(featureID=dt_tmp$V1, dt_idx=NULL)] + + # get gene-level pvalue matrices + pvalsGene <- lapply(c("pval", "padj"), function(x){ + if(x == "pval"){ + pvalsGene <- pVals(fds, type=type, + level="gene")[,idxcol,drop=FALSE] + } else { + pvalsGene <- padjVals(fds, type=type, + level="gene")[,idxcol,drop=FALSE] + } + pvalsGene <- data.table(featureID=rownames(pvalsGene), pvalsGene) + pvalsGene <- melt(pvalsGene, value.name=paste0("gene_", x), + id.vars="featureID", variable.name="sampleID") + return(pvalsGene) + }) + pvalsGene <- merge(pvalsGene[[1]], pvalsGene[[2]], + by=c("featureID", "sampleID")) + # merge with gene pval matrix + dt <- merge(dt, pvalsGene, by=c("featureID", "sampleID")) + dt[,`:=`(pval=gene_pval, padj=gene_padj, + gene_pval=NULL, gene_padj=NULL)] + + # sort dt <- dt[order(sampleID, featureID, type, -aberrant, padj, -abs(deltaPsi))][ !duplicated(data.table(sampleID, featureID, type))] } - + # return object dt } diff --git a/R/helper-functions.R b/R/helper-functions.R index ca7db0a5..573ba2c7 100644 --- a/R/helper-functions.R +++ b/R/helper-functions.R @@ -561,57 +561,102 @@ checkPadjAvailableForFilters <- function(fds, type, filters=list(), for(n in sort(names(filters))){ aname <- paste0(aname, "_", n, filters[[n]]) } - pvalsAvailable <- paste(aname, type, sep="_") %in% assayNames(fds) + aname <- paste(aname, type, sep="_") + if(isTRUE(aggregate)){ + pvalsAvailable <- aname %in% names(metadata(fds)) + } else{ + pvalsAvailable <- aname %in% assayNames(fds) + } return(pvalsAvailable) } #' -#' Find most aberrant junction for each gene +#' Find most aberrant junction for each aberrant gene #' #' @param gr GRanges object with information about junctions. -#' @param genes Significant genes for which the corresponding junction should -#' be extracted. +#' @param aberrantGenes Significant genes for which the corresponding junction +#' should be extracted. #' @param pvals Vector of pvalues (for one sample). #' @param dpsi Vector of delta psi values (for one sample). -#' @param minCount Vector of total counts (N) to which minCount filter will be -#' applied. -#' @param rho Vector of rho values (for all junctions). -#' @param filters The filters which will be used for masked junctions during -#' extraction (possible options: dpsi, minCount, rho). -#' +#' @param aberrantJunctions Vector indicating which junctions are considered +#' aberrant. +#' @param geneColumn Name of the column in mcols(fds) that has gene annotation. #' @noRd -findJunctionsForAberrantGenes <- function(gr, genes, pvals, dpsi, minCount, - rho, filters=list()){ - geneJunctions <- mcols(gr)$hgnc_symbol %in% genes - dt <- data.table(idx=which(geneJunctions == TRUE), - geneID=mcols(gr)$hgnc_symbol[geneJunctions], - pval=pvals[geneJunctions], - dpsi=abs(dpsi[geneJunctions]), - minCount=minCount[geneJunctions], - rho=rho[geneJunctions] ) - - # mask junctions that don't pass filters (minCount, dPsi, rho) - for(n in names(filters)){ - if(n == "rho"){ - if(is.na(filters[["rho"]])){ - filters[["rho"]] <- 1 - } - dt[rho > filters[["rho"]], pval:=NA] - } else{ - if(is.na(filters[[n]])){ - filters[[n]] <- 0 - } - dt[get(n) < filters[[n]], pval:=NA] - } - } +findJunctionsForAberrantGenes <- function(gr, aberrantGenes, pvals, dpsi, + aberrantJunctions, geneColumn="hgnc_symbol"){ + dt <- data.table(idx=mcols(gr)$idx, + geneID=mcols(gr)[,geneColumn], + pval=pvals, + dpsi=abs(dpsi), + aberrant=aberrantJunctions) + dt[, dt_idx:=seq_len(.N)] + dt_tmp <- dt[, splitGenes(geneID), by="dt_idx"] + dt <- dt[dt_tmp$dt_idx,] + dt[,`:=`(geneID=dt_tmp$V1, dt_idx=NULL)] + dt <- dt[geneID %in% aberrantGenes,] + dt <- dt[!is.na(aberrant) & aberrant == TRUE,] # sort per gene by lowest pvalue / highest deltaPsi and return index - dt <- dt[order(geneID, pval, -dpsi)] + dt <- dt[order(geneID, -aberrant, pval, -dpsi)] dt <- dt[!duplicated(dt, by="geneID"),] # remove gene-level significant result if no junction in that gene passed # the filters dt <- dt[!is.na(pval),] - return(dt[,idx]) + junctionsToReport <- dt[,idx] + names(junctionsToReport) <- dt[,geneID] + junctionsToReport <- sort(junctionsToReport) + return(junctionsToReport) } + +collapseResTablePerGene <- function(res, geneColumn="hgncSymbol"){ + if(length(res) == 0){ + return(res) + } + if(!is.data.table(res)){ + res <- as.data.table(res) + } + + if(any(!c("pValue", "pValueGene", geneColumn) %in% colnames(res))){ + stop("For collapsing per gene, the results table needs to contain ", + "the columns pValue, pValueGene and ", geneColumn, ".") + } + + res <- res[order(res$pValueGene, res$pValue)] + naIdx <- is.na(res[, get(geneColumn)]) + ansNoNA <- res[!is.na(res[, get(geneColumn)]),] + + # get final result table + dupIdx <- duplicated(data.table(as.vector(ansNoNA[, get(geneColumn)]), + as.vector(ansNoNA$sampleID))) + ans <- res[!naIdx,][!dupIdx,] + return(ans) +} + +#' ignores NA in unique if other values than NA are present +#' @noRd +uniqueIgnoreNA <- function(x){ + uniq <- unique(x) + if(length(uniq) > 1) uniq <- uniq[!is.na(uniq)] + return(uniq) +} + +#' split string of gene names into vector +#' @noRd +splitGenes <- function(x, sep=";"){ + return(unlist(strsplit(as.character(x), sep, fixed=TRUE))) +} + +#' cap string of gene names to show max 3 gene names +#' @noRd +limitGeneNamesList <- function(gene_names, maxLength=3){ + gene_names <- as.character(gene_names) + numFeatures <- unlist(lapply(gene_names, function(x) length(splitGenes(x)))) + gene_names[numFeatures > maxLength] <- + unlist(lapply(gene_names[numFeatures > maxLength], function(x){ + paste(c(splitGenes(x)[seq_len(maxLength)], "..."), + collapse=";") + } )) + return(gene_names) +} \ No newline at end of file diff --git a/R/makeSimulatedDataset.R b/R/makeSimulatedDataset.R index 6e29adf8..95331d19 100644 --- a/R/makeSimulatedDataset.R +++ b/R/makeSimulatedDataset.R @@ -500,7 +500,8 @@ injectOutliers <- function(fds, type=c("psi5", "psi3", "theta"), dt[,groupSize:=.N, by=groupID] # Get groups where outlier can be injected - available_groups <- dt[groupSize > ifelse(type == "theta", 0, 1), unique(groupID)] + available_groups <- dt[groupSize > ifelse(type == "theta", 0, 1), + unique(groupID)] # e.g. for psi3/5: no donor/acceptor # groups with at least 2 junctions (e.g in simulationBB) diff --git a/R/plotMethods.R b/R/plotMethods.R index 43a66403..ccf99417 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -352,6 +352,8 @@ plotExpression <- function(fds, type=c("psi5", "psi3", "theta"), } dt <- getPlottingDT(fds, axis="row", type=type, idx=site, ...) + dt[,featureID:=limitGeneNamesList(featureID, maxLength=3)] + if(!is.null(colGroup)){ if(all(colGroup %in% samples(fds))){ colGroup <- samples(fds) %in% colGroup @@ -441,6 +443,7 @@ plotExpectedVsObservedPsi <- function(fds, type=c("psi5", "psi3", "theta"), idx=idx, ...) type <- as.character(unique(dt$type)) idx <- unique(dt$idx) + dt[,featureID:=limitGeneNamesList(featureID, maxLength=3)] if(is.null(main)){ if(isTRUE(basePlot)){ @@ -562,6 +565,7 @@ plotQQ.FRASER <- function(object, type=NULL, idx=NULL, result=NULL, } else { type <- as.character(dt[,unique(type)]) featureID <- as.character(dt[,unique(featureID)]) + featureID <- limitGeneNamesList(featureID, maxLength=3) if(isTRUE(basePlot)){ main <- as.expression(bquote(bold(paste( .(ggplotLabelPsi(type)[[1]]), diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R index 16eae792..58d60a8b 100644 --- a/R/pvalsNzscore.R +++ b/R/pvalsNzscore.R @@ -147,6 +147,42 @@ adjust_FWER_PValues <- function(i, pvals, index, rho, rhoCutoff, setkey(dt2, "idx")[J(index)][,pa] } +adjust_FWER_PValues_per_idx <- function(i, pvals, index, rho, rhoCutoff, + method="holm"){ + pvals[rho > rhoCutoff,] <- NA + dttmp <- data.table(idx=index, rho=rho, + apply(pvals, 2, as.numeric))[idx == i,] + suppressWarnings( + pa <- apply(as.matrix(dttmp[,-c("idx", "rho")]), 2, + function(x) min(p.adjust(x, method=method), + na.rm = TRUE) ) + ) + pa[is.infinite(pa)] <- NA + return(pa) +} + +getFWERpvals_bySample <- function(pvals, index, rho, method="holm", + rhoCutoff=0.1, BPPARAM=bpparam()){ + fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues, + pvals=pvals, index, BPPARAM=BPPARAM, + method=method, rho=rho, rhoCutoff=rhoCutoff) + fwer_pvals <- do.call(cbind, fwer_pval) + return(fwer_pvals) +} + +getFWERpvals_byIdx <- function(pvals, index, rho, method="holm", + rhoCutoff=0.1, BPPARAM=bpparam()){ + unique_idx <- unique(index) + fwer_pval <- bplapply(unique_idx, adjust_FWER_PValues_per_idx, + pvals=pvals, index, BPPARAM=BPPARAM, + method=method, rho=rho, rhoCutoff=rhoCutoff) + fwer_pvals <- do.call(rbind, fwer_pval) + fwer_pvals <- as.matrix( + setkey(data.table(idx=unique_idx, fwer_pvals), + "idx")[J(index)][,-c("idx")]) + return(fwer_pvals) +} + singlePvalueBetaBinomial <- function(idx, k, n, mu, rho){ ki <- k[idx,] @@ -189,11 +225,13 @@ singlePvalueBinomial <- function(idx, k, n, mu){ #' masked with NA during p value adjustment. #' @param geneLevel Logical value indiciating whether gene-level p values #' should be calculated. Defaults to TRUE. +#' @param geneColumn The column name of the column that has the gene annotation +#' that will be used for gene-level pvalue computation. #' #' @export calculatePadjValues <- function(fds, type=currentType(fds), method="BY", - rhoCutoff=0.1, geneLevel=TRUE, - BPPARAM=bpparam()){ + rhoCutoff=0.1, geneLevel=TRUE, + geneColumn="hgnc_symbol", BPPARAM=bpparam()){ currentType(fds) <- type index <- getSiteIndex(fds, type=type) idx <- !duplicated(index) @@ -209,14 +247,15 @@ calculatePadjValues <- function(fds, type=currentType(fds), method="BY", # splice site-level pval correction message(date(), ": adjusting junction-level pvalues ...") - fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues, - pvals=pvals, index, BPPARAM=BPPARAM, - method="holm", rho=rho, rhoCutoff=rhoCutoff) - fwer_pvals <- do.call(cbind, fwer_pval) + # fwer_pvals <- getFWERpvals_byIdx(pvals, index, rho, method="holm", + # rhoCutoff=rhoCutoff, BPPARAM=BPPARAM) + fwer_pvals <- getFWERpvals_bySample(pvals, index, rho, method="holm", + rhoCutoff=rhoCutoff, BPPARAM=BPPARAM) pVals(fds, dist=i, level="site", filters=list(rho=rhoCutoff), withDimnames=FALSE) <- fwer_pvals # junction-level FDR correction + message(date(), ": genome-wide FDR for junction-level pvalues ...") padj <- apply(fwer_pvals[idx,], 2, p.adjust, method=method) padjDT <- data.table(cbind(i=unique(index), padj), key="i")[J(index)] padjDT[,i:=NULL] @@ -224,16 +263,22 @@ calculatePadjValues <- function(fds, type=currentType(fds), method="BY", withDimnames=FALSE) <- as.matrix(padjDT) # gene-level pval correction and FDR - if("hgnc_symbol" %in% colnames(mcols(fds, type=type)) && + if(geneColumn %in% colnames(mcols(fds, type=type)) && isTRUE(geneLevel)){ message(date(), ": calculating gene-level pvalues ...") gene_pvals <- getPvalsPerGene(fds=fds, type=type, pvals=fwer_pvals, method="holm", FDRmethod=method, + geneColumn=geneColumn, BPPARAM=BPPARAM) pVals(fds, dist=i, level="gene", filters=list(rho=rhoCutoff), withDimnames=FALSE) <- gene_pvals[["pvals"]] padjVals(fds, dist=i, level="gene", filters=list(rho=rhoCutoff), withDimnames=FALSE) <- gene_pvals[["padj"]] + } else{ + warning("Gene-level pvalues could not be calculated as column ", + geneColumn, "\nwas not found for the given fds object. ", + "Please annotate gene symbols \nfirst using the ", + "annotateRanges function.") } } @@ -243,18 +288,27 @@ calculatePadjValues <- function(fds, type=currentType(fds), method="BY", getPvalsPerGene <- function(fds, type, pvals=pVals(fds, type=type, level="site"), sampleID=NULL, method="holm", FDRmethod="BY", - BPPARAM=bpparam()){ + geneColumn="hgnc_symbol", BPPARAM=bpparam()){ # extract data and take only the first index of per site + message(date(), ": starting gene-level pval computation for type ", type) samples <- samples(fds) if(is.null(colnames(pvals))){ colnames(pvals) <- samples } dt <- data.table( idx=getSiteIndex(fds, type=type), - geneID=getGeneIDs(fds, type=type, unique=FALSE), + geneID=getGeneIDs(fds, type=type, unique=FALSE, + geneColumn=geneColumn), as.data.table(pvals)) dt <- dt[!is.na(geneID)] - geneIDs <- dt[,unique(geneID)] + geneIDs <- getGeneIDs(fds, type=type, unique=TRUE, + geneColumn=geneColumn) + + # separate geneIDs into rows + dt[, dt_idx:=seq_len(.N)] + dt_tmp <- dt[, splitGenes(geneID), by="dt_idx"] + dt <- dt[dt_tmp$dt_idx,] + dt[,`:=`(geneID=dt_tmp$V1, dt_idx=NULL)] setkey(dt, geneID) # extract samples @@ -263,28 +317,16 @@ getPvalsPerGene <- function(fds, type, } # aggregate pvalues to gene level per sample - pvalsPerGene <- matrix(unlist(bplapply(samples, BPPARAM=BPPARAM, - function(i){ - suppressWarnings( - dttmp <- dt[,min(p.adjust(.SD[!duplicated(idx),get(i)], - method=method), na.rm=TRUE), - by=geneID] - ) - dttmp[is.infinite(V1), V1:=NA] - setkey(dttmp, geneID) - dttmp[J(geneIDs), V1] - })), ncol=length(samples)) - - colnames(pvalsPerGene) <- samples - rownames(pvalsPerGene) <- geneIDs + message(date(), ": gene-level pval computation per gene (n=", + length(geneIDs), ")") + pvalsPerGene <- genePvalsByGeneID(dt, samples=samples, geneIDs=geneIDs, + method=method, BPPARAM=BPPARAM) # compute FDR - gene_padj <- apply(pvalsPerGene, 2, p.adjust, method=FDRmethod) - - # blow up back to original assay size - pvalsPerGene <- mapGeneToJunctionAssay(fds, type, pvalsPerGene) - padjPerGene <- mapGeneToJunctionAssay(fds, type, gene_padj) + message(date(), ": genome-wide FDR for gene-level pvals for type ", type) + padjPerGene <- apply(pvalsPerGene, 2, p.adjust, method=FDRmethod) + message(date(), ": finished gene-level pval computation for type ", type) return(list(pvals=pvalsPerGene, padj=padjPerGene)) } @@ -308,22 +350,29 @@ getSiteIndex <- function(fds, type){ ans[selectionMat] } -getGeneIDs <- function(fds, type, unique=TRUE){ - geneIDs <- mcols(fds, type=type)$hgnc_symbol +getGeneIDs <- function(fds, type, unique=TRUE, geneColumn="hgnc_symbol"){ + geneIDs <- mcols(fds, type=type)[[geneColumn]] if(isTRUE(unique)){ - geneIDs <- unique(geneIDs) + geneIDs <- unique(unlist(lapply(geneIDs, FUN=function(g){ + unlist(strsplit(g, ";"))}) )) geneIDs <- geneIDs[!is.na(geneIDs)] } geneIDs } -mapGeneToJunctionAssay <- function(fds, type, pvalMat){ - # blow up back to original assay size - pvalDT <- data.table(geneID=rownames(pvalMat), - as.data.table(pvalMat) ) - setkey(pvalDT, "geneID") - pvalDT <- pvalDT[J(getGeneIDs(fds, type=type, unique=FALSE))] - fullMat <- as.matrix(pvalDT[, -1]) - rownames(fullMat) <- pvalDT[, geneID] - return(fullMat) +genePvalsByGeneID <- function(dt, samples, geneIDs, method, BPPARAM){ + pvalsPerGene <- bplapply(geneIDs, BPPARAM=BPPARAM, + FUN=function(g) { + dttmp <- dt[geneID == g][!duplicated(idx)] + suppressWarnings( + pval_g <- apply(as.matrix(dttmp[,-c("idx", "geneID")]), 2, + function(x) min(p.adjust(x, method=method), na.rm = TRUE) ) + ) + pval_g[is.infinite(pval_g)] <- NA + pval_g + }) + pvalsPerGene <- do.call(rbind, pvalsPerGene) + rownames(pvalsPerGene) <- geneIDs + return(pvalsPerGene) } + diff --git a/man/FRASER.Rd b/man/FRASER.Rd index 58f9fb10..0931ec25 100644 --- a/man/FRASER.Rd +++ b/man/FRASER.Rd @@ -34,6 +34,7 @@ calculatePadjValues( method = "BY", rhoCutoff = 0.1, geneLevel = TRUE, + geneColumn = "hgnc_symbol", BPPARAM = bpparam() ) } @@ -78,6 +79,9 @@ masked with NA during p value adjustment.} \item{geneLevel}{Logical value indiciating whether gene-level p values should be calculated. Defaults to TRUE.} + +\item{geneColumn}{The column name of the column that has the gene annotation +that will be used for gene-level pvalue computation.} } \value{ FraserDataSet diff --git a/man/results.Rd b/man/results.Rd index 330aef7d..7f78ffe0 100644 --- a/man/results.Rd +++ b/man/results.Rd @@ -13,12 +13,12 @@ deltaPsiCutoff = 0.3, rhoCutoff = 0.1, aggregate = FALSE, - collapse = TRUE, + collapse = FALSE, minCount = 5, psiType = c("psi3", "psi5", "theta"), + geneColumn = "hgnc_symbol", additionalColumns = NULL, - BPPARAM = bpparam(), - ... + BPPARAM = bpparam() ) \S4method{aberrant}{FraserDataSet}( @@ -31,6 +31,7 @@ rhoCutoff = 0.1, by = c("none", "sample", "feature"), aggregate = FALSE, + geneColumn = "hgnc_symbol", ... ) } @@ -54,7 +55,7 @@ junctions are filtered} level (i.e. gene level).} \item{collapse}{Only takes effect if \code{aggregate=TRUE}. -If TRUE (default), collapses results across the different psi +If TRUE, collapses results across the different psi types to return only one row per feature (gene) and sample.} \item{minCount}{The minimum count value of the total coverage of an intron @@ -63,6 +64,9 @@ result} \item{psiType}{The psi types for which the results should be retrieved.} +\item{geneColumn}{The column name of the column that has the gene annotation +that will be used for gene-level pvalue computation.} + \item{additionalColumns}{Character vector containing the names of additional columns from mcols(fds) that should appear in the result table (e.g. ensembl_gene_id). Default is \code{NULL}, so no additional columns @@ -70,23 +74,15 @@ are included.} \item{BPPARAM}{The BiocParallel parameter.} -\item{...}{Further arguments can be passed to the method. If "zscores", -"padjVals" or "dPsi" is given, the values of those arguments -are used to define the aberrant events.} - \item{type}{Splicing type (psi5, psi3 or theta)} \item{by}{By default \code{none} which means no grouping. But if \code{sample} or \code{feature} is specified the sum by sample or feature is returned} -\item{res}{Result as created with \code{results()}} - -\item{geneColumn}{The name of the column in \code{mcols(res)} that contains -the gene symbols.} - -\item{method}{The p.adjust method that is being used to adjust p values per -sample.} +\item{...}{Further arguments can be passed to the method. If "n", "zscores", +"padjVals", "dPsi" or "rhoVals" are given, the values of those +arguments are used to define the aberrant events.} } \value{ For \code{results}: GRanges object containing significant results. @@ -104,16 +100,21 @@ aberrant splicing events based on the given cutoffs. # get data, fit and compute p-values and z-scores fds <- createTestFraserDataSet() -# extract results: for this example dataset, z score cutoff of 2 is used to -# get at least one result and show the output -res <- results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05) +# extract results: for this example dataset, z score cutoff of 1 and delta +# psi cutoff of 0.2 is used to get at least one result and show the output +res <- results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=0.2) res # aggregate the results by genes (gene symbols need to be annotated first # using annotateRanges() function) -results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05, +results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=0.1, aggregate=TRUE) +# aggregate the results by genes and collapse over all psi types to obtain +# only one row per gene in the results table +results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=0.1, + aggregate=TRUE, collapse=TRUE) + # get aberrant events per sample: on the example data, nothing is aberrant # based on the adjusted p-value aberrant(fds, type="psi5", by="sample") diff --git a/tests/testthat/test_stats.R b/tests/testthat/test_stats.R index 496ffd64..cdc098df 100644 --- a/tests/testthat/test_stats.R +++ b/tests/testthat/test_stats.R @@ -52,31 +52,31 @@ test_that("Gene p value calculation with NAs", { fds <- calculatePadjValues(fds, type="psi5", rhoCutoff=0.1) fds <- calculatePadjValues(fds, type="psi3", rhoCutoff=0.1) + # check dimension of junction-, site- and gene-level pval matrices + expect_equal(nrow(pVals(fds, type="psi5", level="junction", + filters=list(rho=0.1))), nrow(fds)) + expect_equal(nrow(pVals(fds, type="psi5", level="site", + filters=list(rho=0.1))), nrow(fds)) + expect_equal(nrow(pVals(fds, type="psi5", level="gene", + filters=list(rho=0.1))), 3) + # check psi5 pvals are partly NAs - expect_equal(pVals(fds, type="psi5", level="site", - filters=list(rho=0.1))[4:7,1], - as.double(rep(NA, 4))) - expect_equal(pVals(fds, type="psi5", level="gene", - filters=list(rho=0.1))[4:7,2], - as.double(rep(NA, 4))) - expect_equal(padjVals(fds, type="psi5", level="site", - filters=list(rho=0.1))[4:7,1], - as.double(rep(NA, 4))) - expect_equal(padjVals(fds, type="psi5", level="gene", - filters=list(rho=0.1))[4:7,2], - as.double(rep(NA, 4))) + expect_true(all(is.na(pVals(fds, type="psi5", level="site", + filters=list(rho=0.1))[4:7,]))) + expect_true(all(is.na(pVals(fds, type="psi5", level="gene", + filters=list(rho=0.1))["geneB",]))) + expect_true(all(is.na(padjVals(fds, type="psi5", level="site", + filters=list(rho=0.1))[4:7,]))) + expect_true(all(is.na(padjVals(fds, type="psi5", level="gene", + filters=list(rho=0.1))["geneB",]))) # check psi3 pvals are all NAs - expect_equal(pVals(fds, type="psi3", level="site", - filters=list(rho=0.1))[,1], - as.double(rep(NA, nrow(fds)))) - expect_equal(pVals(fds, type="psi3", level="gene", - filters=list(rho=0.1))[,2], - as.double(rep(NA, nrow(fds)))) - expect_equal(padjVals(fds, type="psi3", level="site", - filters=list(rho=0.1))[,1], - as.double(rep(NA, nrow(fds)))) - expect_equal(padjVals(fds, type="psi3", level="gene", - filters=list(rho=0.1))[,2], - as.double(rep(NA, nrow(fds)))) + expect_true(all(is.na(pVals(fds, type="psi3", level="site", + filters=list(rho=0.1))))) + expect_true(all(is.na(pVals(fds, type="psi3", level="gene", + filters=list(rho=0.1))))) + expect_true(all(is.na(padjVals(fds, type="psi3", level="site", + filters=list(rho=0.1))))) + expect_true(all(is.na(padjVals(fds, type="psi3", level="gene", + filters=list(rho=0.1))))) }) From 1b4a0bd224715e3f5c017c34ccaeed186d3b0eb6 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Mon, 17 Jan 2022 10:30:15 +0100 Subject: [PATCH 05/80] first impleion of intron jaccard index --- R/AllGenerics.R | 5 +- R/calculatePSIValue.R | 157 ++++++++++++++++++++++++ R/filterExpression.R | 272 ++++++++++++++++++++++++++++++++++++++++++ R/helper-functions.R | 7 +- R/plotMethods.R | 8 +- R/variables.R | 3 +- 6 files changed, 442 insertions(+), 10 deletions(-) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index ba1ce932..9349f8b2 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -536,8 +536,9 @@ setMethod("counts", "FraserDataSet", function(object, type=NULL, # extract psi value from type type <- whichPSIType(type) if(length(type) == 0 | length(type) > 1){ - stop(paste0("Please provide a correct psi type: psi5, psi3, or ", - "theta. Not the given one: '", type, "'.")) + stop(paste0("Please provide a correct psi type: psi5, psi3, ", + "theta or intron_jaccard. Not the given one: '", + type, "'.")) } aname <- paste0("rawOtherCounts_", type) if(!aname %in% assayNames(object)){ diff --git a/R/calculatePSIValue.R b/R/calculatePSIValue.R index 90894279..be0b8dc2 100644 --- a/R/calculatePSIValue.R +++ b/R/calculatePSIValue.R @@ -36,6 +36,9 @@ calculatePSIValues <- function(fds, types=psiTypes, overwriteCts=FALSE, overwriteCts=overwriteCts, BPPARAM=BPPARAM) } + # calculate intron jaccard index + fds <- calculateIntronJaccardIndex(fds) + # calculate the delta psi value for(psiType in types){ assayName <- paste0("delta_", psiType) @@ -321,3 +324,157 @@ getOtherCountsCacheFolder <- function(fds){ # return it return(cachedir) } + +#' +#' calculates the intron jaccard index values for all junctions +#' @inheritParams countRNA +#' +calculateIntronJaccardIndex <- function(fds){ + # retrieve junction and splice site annotation with count information + junction_dt <- data.table( + as.data.table(rowRanges(fds, type="j"))[,.(seqnames, start, end, + strand, startID, endID)], + as.matrix(K(fds, type="psi5"))) + ss_dt <- data.table( + as.data.table(rowRanges(fds, type="ss"))[,.(seqnames, start, end, + strand, spliceSiteID)], + as.matrix(N(fds, type="theta"))) + + # melt to have one row per sample - junction combination + junction_dt <- melt(junction_dt, variable.name="sampleID", value.name="k", + id.vars=c("seqnames", "start", "end", "strand", + "startID", "endID")) + ss_dt <- melt(ss_dt, variable.name="sampleID", value.name="n", + id.vars=c("seqnames", "start", "end", "strand", + "spliceSiteID")) + + # merge junction information with splice site annotation (theta) + junction_dt <- merge(junction_dt, ss_dt[,.(spliceSiteID, sampleID, n)], + all.x=TRUE, by.x=c("startID", "sampleID"), + by.y=c("spliceSiteID", "sampleID"), sort=FALSE) + setnames(junction_dt, "n", "n_donor") + junction_dt <- merge(junction_dt, ss_dt[,.(spliceSiteID, sampleID, n)], + all.x=TRUE, by.x=c("endID", "sampleID"), + by.y=c("spliceSiteID", "sampleID"), sort=FALSE) + setnames(junction_dt, "n", "n_acceptor") + rm(ss_dt) + gc() + + # TODO deal with missing endIDs (why do we have them? lost in filtering?) + # for now: replace n_donor with N from psi5 (and same for n_acceptor and psi3) + junction_dt_nacceptor <- data.table( + as.data.table(rowRanges(fds, type="j"))[,.(startID, endID)], + as.matrix(N(fds, type="psi3"))) + junction_dt_nacceptor <- melt(junction_dt_nacceptor, + variable.name="sampleID", value.name="n_psi3", + id.vars=c("startID", "endID")) + junction_dt[, n_psi3:=junction_dt_nacceptor[,n_psi3]] + rm(junction_dt_nacceptor) + gc() + junction_dt_ndonor <- data.table( + as.data.table(rowRanges(fds, type="j"))[,.(startID, endID)], + as.matrix(N(fds, type="psi5"))) + junction_dt_ndonor <- melt(junction_dt_ndonor, + variable.name="sampleID", value.name="n_psi5", + id.vars=c("startID", "endID")) + junction_dt[, n_psi5:=junction_dt_ndonor[,n_psi5]] + rm(junction_dt_ndonor) + gc() + + # replace n (with non-split counts) by n_psi3/n_psi5 if NA + junction_dt[is.na(n_acceptor), n_acceptor:=n_psi3] + junction_dt[is.na(n_donor), n_donor:=n_psi5] + junction_dt[n_acceptor < n_psi3, n_acceptor:=n_psi3] + junction_dt[n_donor < n_psi5, n_donor:=n_psi5] + + # calculate intron_jaccard + junction_dt[, denominator:=(n_donor + n_acceptor - k)] + junction_dt[, intron_jaccard:= k / denominator] + junction_dt[is.nan(intron_jaccard), intron_jaccard:=1] # n_donor = n_acceptor = 0 + + # convert to matrix to store it as assay in the fds + intron_jaccard <- matrix(junction_dt[,intron_jaccard], nrow=nrow(fds), ncol=ncol(fds), byrow=FALSE) + rownames(intron_jaccard) <- rownames(fds) + colnames(intron_jaccard) <- colnames(fds) + assay(fds, type="j", "intron_jaccard", withDimnames=FALSE) <- intron_jaccard + + # store denominator + jaccard_denom <- matrix(junction_dt[,denominator], nrow=nrow(fds), ncol=ncol(fds), byrow=FALSE) + rownames(jaccard_denom) <- rownames(fds) + colnames(jaccard_denom) <- colnames(fds) + assay(fds, type="j", "rawOtherCounts_intron_jaccard", + withDimnames=FALSE) <- jaccard_denom - matrix(junction_dt[,k], + nrow=nrow(fds), + ncol=ncol(fds), + byrow=FALSE) + return(fds) +} + +calculatePhi <- function(fds){ + # retrieve junction and splice site annotation with count information + junction_dt <- data.table( + as.data.table(rowRanges(fds, type="j"))[,.(seqnames, start, end, + strand, startID, endID)], + as.matrix(K(fds, type="psi5"))) + ss_dt <- data.table( + as.data.table(rowRanges(fds, type="ss"))[,.(seqnames, start, end, + strand, spliceSiteID)], + as.matrix(N(fds, type="theta"))) + + # melt to have one row per sample - junction combination + junction_dt <- melt(junction_dt, variable.name="sampleID", value.name="k", + id.vars=c("seqnames", "start", "end", "strand", + "startID", "endID")) + ss_dt <- melt(ss_dt, variable.name="sampleID", value.name="n", + id.vars=c("seqnames", "start", "end", "strand", + "spliceSiteID")) + + # merge junction information with splice site annotation (theta) + junction_dt <- merge(junction_dt, ss_dt[,.(spliceSiteID, sampleID, n)], + all.x=TRUE, by.x=c("startID", "sampleID"), + by.y=c("spliceSiteID", "sampleID"), sort=FALSE) + setnames(junction_dt, "n", "n_donor") + junction_dt <- merge(junction_dt, ss_dt[,.(spliceSiteID, sampleID, n)], + all.x=TRUE, by.x=c("endID", "sampleID"), + by.y=c("spliceSiteID", "sampleID"), sort=FALSE) + setnames(junction_dt, "n", "n_acceptor") + rm(ss_dt) + gc() + + # deal with missing endIDs + junction_dt_nacceptor <- data.table( + as.data.table(rowRanges(fds, type="j"))[,.(startID, endID)], + as.matrix(N(fds, type="psi3"))) + junction_dt_nacceptor <- melt(junction_dt_nacceptor, + variable.name="sampleID", value.name="n_psi3", + id.vars=c("startID", "endID")) + junction_dt[, n_psi3:=junction_dt_nacceptor[,n_psi3]] + rm(junction_dt_nacceptor) + gc() + junction_dt_ndonor <- data.table( + as.data.table(rowRanges(fds, type="j"))[,.(startID, endID)], + as.matrix(N(fds, type="psi5"))) + junction_dt_ndonor <- melt(junction_dt_ndonor, + variable.name="sampleID", value.name="n_psi5", + id.vars=c("startID", "endID")) + junction_dt[, n_psi5:=junction_dt_ndonor[,n_psi5]] + rm(junction_dt_ndonor) + gc() + + # replace n (with non-split counts) by n_psi3/n_psi5 if NA + junction_dt[is.na(n_acceptor), n_acceptor:=n_psi3] + junction_dt[is.na(n_donor), n_donor:=n_psi5] + junction_dt[n_acceptor < n_psi3, n_acceptor:=n_psi3] + junction_dt[n_donor < n_psi5, n_donor:=n_psi5] + + # calculate phi + junction_dt[, phi:= k / ((n_donor + n_acceptor)/2)] + junction_dt[is.nan(phi), phi:=0] # n_donor = n_acceptor = 0 + + # convert to matrix to store it as assay in the fds + phi <- matrix(junction_dt[,phi], nrow=nrow(fds), ncol=ncol(fds), byrow=FALSE) + rownames(phi) <- rownames(fds) + colnames(phi) <- colnames(fds) + assay(fds, "phi", type="psi5", withDimnames=FALSE) <- phi + return(fds) +} diff --git a/R/filterExpression.R b/R/filterExpression.R index 3dbab8e2..7d8d2806 100644 --- a/R/filterExpression.R +++ b/R/filterExpression.R @@ -353,3 +353,275 @@ applyVariabilityFilters <- function(fds, minDeltaPsi){ return(fds) } + +#' @describeIn filtering This functions filters out both introns with low +#' read support and introns that are not variable across samples. +#' @export +filterExpressionAndVariability_jaccard <- function(object, + minExpressionInOneSample=20, + quantile=0.95, quantileMinExpression=1, minDelta=0.05, + filter=TRUE, + delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), + BPPARAM=bpparam()){ + # filter introns with low read support and corresponding splice sites + object <- filterExpression_jaccard(object, + minExpressionInOneSample=minExpressionInOneSample, + quantile=quantile, + quantileMinExpression=quantileMinExpression, + filter=filter, delayed=delayed, + BPPARAM=BPPARAM) + + # filter introns that are not variable across samples + object <- filterVariability_jaccard(object, minDelta=minDelta, + filter=filter, + delayed=delayed, BPPARAM=BPPARAM) + + # return fds + message(date(), ": Filtering done!") + return(object) + +} + +filterExpression_jaccard <- function(object, minExpressionInOneSample=20, + quantile=0.95, quantileMinExpression=1, filter=TRUE, + delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), + BPPARAM=bpparam()){ + + stopifnot(is(object, "FraserDataSet")) + + message(date(), ": Filtering out introns with low read support ...") + + # extract counts + cts <- K(object, type="j") + ctsN <- N(object, type="intron_jaccard") + + if(isFALSE(delayed)){ + cts <- as.matrix(cts) + ctsN <- as.matrix(ctsN) + } + + # cutoff functions + f1 <- function(cts, ...){ + rowMaxs(cts) } + f2 <- function(cts, ctsN, quantile, ...){ + rowQuantiles(ctsN, probs=quantile, drop=FALSE)[,1] } + + funs <- c(maxCount=f1, quantileValueN=f2) + + # run it in parallel + cutoffs <- bplapply(funs, function(f, ...) f(...), BPPARAM=BPPARAM, + cts=cts, ctsN=ctsN, quantile=quantile) + + # add annotation to object + for(n in names(cutoffs)){ + mcols(object, type="j")[n] <- cutoffs[[n]] + } + + mcols(object, type="j")[['passedExpression']] <- + cutoffs$maxCount >= minExpressionInOneSample & + cutoffs$quantileValueN >= quantileMinExpression + if("passedVariability" %in% colnames(mcols(object, type="j"))){ + mcols(object, type="j")[['passed']] <- + mcols(object, type="j")[['passedExpression']] & + mcols(object, type="j")[['passedVariability']] + } else{ + mcols(object, type="j")[['passed']] <- + mcols(object, type="j")[['passedExpression']] + } + + # filter if requested + if(isTRUE(filter)){ + object <- applyExpressionFilters_jaccard(object, + minExpressionInOneSample, + quantileMinExpression) + } + + validObject(object) + return(object) +} + +#' @describeIn filtering This function filters out introns and corresponding +#' splice sites which do not show variablity across samples. +#' @export +filterVariability_jaccard <- function(object, minDelta=0, filter=TRUE, + delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), + BPPARAM=bpparam()){ + + message(date(), ": Filtering out non-variable introns ...") + + # extract counts + cts <- K(object, type="j") + ctsN <- N(object, type="intron_jaccard") + + if(isFALSE(delayed)){ + cts <- as.matrix(cts) + ctsN <- as.matrix(ctsN5) + } + + # cutoff functions + f1 <- function(cts, ctsN3, ...) { + intron_jaccard <- cts/ctsN + rowMaxs(abs(intron_jaccard - rowMeans2(intron_jaccard, na.rm=TRUE)), + na.rm=TRUE) } + + funs <- c(maxDJaccard=f1) + + # run it in parallel + cutoffs <- bplapply(funs, function(f, ...) f(...), BPPARAM=BPPARAM, + cts=cts, ctsN=ctsN) + + # add annotation to object + for(n in names(cutoffs)){ + mcols(object, type="j")[n] <- cutoffs[[n]] + } + + # add annotation of theta on splice sites of introns to mcols + intron_dt <- as.data.table(rowRanges(object, type="j")) + + # check which introns pass the filter + mcols(object, type="j")[['passedVariability']] <- pmax(na.rm=TRUE, + cutoffs$maxDJaccard, + 0) >= minDelta + if("passedExpression" %in% colnames(mcols(object, type="j"))){ + mcols(object, type="j")[['passed']] <- + mcols(object, type="j")[['passedExpression']] & + mcols(object, type="j")[['passedVariability']] + } else{ + mcols(object, type="j")[['passed']] <- + mcols(object, type="j")[['passedVariability']] + } + + # filter if requested + if(isTRUE(filter)){ + object <- applyVariabilityFilters(object, minDelta) + } + + validObject(object) + return(object) +} + +#' Applies previously calculated filters for expression filters +#' @noRd +applyExpressionFilters_jaccard <- function(fds, minExpressionInOneSample, + quantileMinExpression){ + + maxCount <- mcols(fds, type="j")[['maxCount']] + quantileValueN <- mcols(fds, type="j")[['quantileValueN']] + + # report rare junctions that passed minExpression filter but not + # quantileFilter as SE obj + junctionsToReport <- maxCount >= minExpressionInOneSample & + !(quantileValueN >= quantileMinExpression) + outputDir <- file.path(workingDir(fds), "savedObjects", nameNoSpace(fds)) + + if(any(junctionsToReport)){ + # get SE object of junctions to report + rareJunctions <- asSE(fds[junctionsToReport, by="j"]) + for(aname in assayNames(rareJunctions)){ + if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5", + "rawOtherCounts_psi3", "psi5", "psi3", + "delta_psi5", "delta_psi3", "intron_jaccard", + "rawOtherCounts_intron_jaccard"))){ + assay(rareJunctions, aname) <- NULL + } + } + rareJunctions <- saveHDF5SummarizedExperiment(rareJunctions, + dir=file.path(tempdir(), "tmp_rJ"), + replace=TRUE) + + # check if folder already exists from previous filtering + rareJctsDir <- file.path(outputDir, "rareJunctions") + if(dir.exists(rareJctsDir)){ + warning("Filtering has already been applied previously. Introns ", + "that were already filtered out but should be kept now ", + "cannot be restored.") + rJ_stored <- loadHDF5SummarizedExperiment(dir=rareJctsDir) + toReport <- mcols(rJ_stored)$maxCount >= minExpressionInOneSample & + !(mcols(rJ_stored)$quantileValueN >= quantileMinExpression) + + rJ_tmp <- rbind(rJ_stored[toReport,], rareJunctions) + + for(aname in assayNames(rJ_tmp)){ + assay(rJ_tmp, aname) <- + rbind(as.matrix(assay(rareJunctions, aname)), + as.matrix(assay(rJ_stored[toReport,], aname)) ) + } + rareJunctions <- rJ_tmp + rm(rJ_tmp) + } + + rareJunctions <- saveHDF5SummarizedExperiment(rareJunctions, + dir=rareJctsDir, replace=TRUE) + } + + # apply filter + numFilt <- sum(mcols(fds, type="j")[['passedExpression']]) + message(paste0("Keeping ", numFilt, " junctions out of ", length(fds), + ". This is ", signif(numFilt/length(fds)*100, 3), + "% of the junctions")) + fds <- fds[mcols(fds, type="j")[['passedExpression']], by="psi5"] + + return(fds) + +} + + +#' Applies previously calculated variablilty filters +#' @noRd +applyVariabilityFilters <- function(fds, minDelta){ + + # + passedVariability <- mcols(fds, type="j")[['passedVariability']] + + # store information of non-variable junctions + filtered <- !passedVariability + + outputDir <- file.path(workingDir(fds), "savedObjects", nameNoSpace(fds)) + if(any(filtered)){ + # get SE object of junctions to report + nonVariableJunctions <- asSE(fds[filtered, by="j"]) + for(aname in assayNames(nonVariableJunctions)){ + if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5", + "rawOtherCounts_psi3", "psi5", "psi3", + "delta_psi5", "delta_psi3", "intron_jaccard", + "rawOtherCounts_intron_jaccard"))){ + assay(nonVariableJunctions, aname) <- NULL + } + } + nonVariableJunctions <- saveHDF5SummarizedExperiment(replace=TRUE, + nonVariableJunctions, + dir=file.path(tempdir(), "tmp_nvJ")) + + # check if folder already exists from previous filtering + nonVarJctsDir <- file.path(outputDir, "nonVariableJunctions") + if(dir.exists(nonVarJctsDir)){ + warning("Filtering has already been applied previously. Introns ", + "that were already filtered out but should be kept now ", + "cannot be restored.") + nV_stored <- loadHDF5SummarizedExperiment(dir=nonVarJctsDir) + toReport <- mcols(nV_stored)$maxDJaccard < minDelta + + nVJunctions <- rbind(nonVariableJunctions, nV_stored[toReport,]) + for(aname in assayNames(nVJunctions)){ + assay(nVJunctions, aname) <- + rbind(as.matrix(assay(nonVariableJunctions, aname)), + as.matrix(assay(nV_stored[toReport,], aname)) ) + } + nonVariableJunctions <- nVJunctions + rm(nVJunctions) + } + + nonVariableJunctions <- saveHDF5SummarizedExperiment(dir=nonVarJctsDir, + x=nonVariableJunctions, replace=TRUE) + + } + + # apply filtering + numFilt <- sum(passedVariability) + message(paste0("Keeping ", numFilt, " junctions out of ", length(fds), + ". This is ", signif(numFilt/length(fds)*100, 3), + "% of the junctions")) + fds <- fds[mcols(fds, type="j")[['passedVariability']], by="psi5"] + return(fds) + +} diff --git a/R/helper-functions.R b/R/helper-functions.R index 573ba2c7..69466b74 100644 --- a/R/helper-functions.R +++ b/R/helper-functions.R @@ -79,7 +79,7 @@ checkReadType <- function(fds, type){ } type <- unique(type) stopifnot(isScalarCharacter(type)) - correctTypes <- c(psi3="j", psi5="j", theta="ss") + correctTypes <- c(psi3="j", psi5="j", theta="ss", intron_jaccard="j") # check if it is already the correct type if(type %in% correctTypes) return(type) @@ -109,7 +109,7 @@ checkReadType <- function(fds, type){ #' #' @noRd whichPSIType <- function(type){ - unlist(regmatches(type, gregexpr("psi(3|5)|theta", type, perl=TRUE))) + unlist(regmatches(type, gregexpr("psi(3|5)|theta|intron_jaccard", type, perl=TRUE))) } #' @@ -122,7 +122,8 @@ whichReadType <- function(fds, name){ # check writing if(name == "ss" | endsWith(name, "theta")) return("ss") - if(name == "j" | endsWith(name, "psi5") | endsWith(name, "psi3")) + if(name == "j" | endsWith(name, "psi5") | endsWith(name, "psi3") | + endsWith(name, "intron_jaccard")) return("j") # check assay names diff --git a/R/plotMethods.R b/R/plotMethods.R index ccf99417..13e22bc3 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -178,7 +178,7 @@ NULL plotVolcano.FRASER <- function(object, sampleID, - type=c("psi3", "psi5", "theta"), basePlot=TRUE, + type=c("psi3", "psi5", "theta", "intron_jaccard"), basePlot=TRUE, aggregate=FALSE, main=NULL, label=NULL, deltaPsiCutoff=0.3, padjCutoff=0.1, ...){ @@ -278,7 +278,7 @@ setMethod("plotVolcano", signature="FraserDataSet", plotVolcano.FRASER) plotAberrantPerSample.FRASER <- function(object, main, - type=c("psi3", "psi5", "theta"), + type=c("psi3", "psi5", "theta", "intron_jaccard"), padjCutoff=0.1, zScoreCutoff=NA, deltaPsiCutoff=0.3, aggregate=TRUE, BPPARAM=bpparam(), ...){ @@ -341,7 +341,7 @@ setMethod("plotAberrantPerSample", signature="FraserDataSet", #' #' @rdname plotFunctions #' @export -plotExpression <- function(fds, type=c("psi5", "psi3", "theta"), +plotExpression <- function(fds, type=c("psi5", "psi3", "theta", "intron_jaccard"), site=NULL, result=NULL, colGroup=NULL, basePlot=TRUE, main=NULL, label="aberrant", ...){ if(!is.null(result)){ @@ -433,7 +433,7 @@ plotExpression <- function(fds, type=c("psi5", "psi3", "theta"), #' #' @rdname plotFunctions #' @export -plotExpectedVsObservedPsi <- function(fds, type=c("psi5", "psi3", "theta"), +plotExpectedVsObservedPsi <- function(fds, type=c("psi5", "psi3", "theta", "intron_jaccard"), idx=NULL, result=NULL, colGroup=NULL, main=NULL, basePlot=TRUE, label="aberrant", ...){ type <- match.arg(type) diff --git a/R/variables.R b/R/variables.R index ccba65f5..74bb6d91 100644 --- a/R/variables.R +++ b/R/variables.R @@ -6,6 +6,7 @@ #' psiTypes #' #' @export -psiTypes <- c("psi5", "psi3", "theta") +psiTypes <- c("psi5", "psi3", "theta", "intron_jaccard") +# psiTypes <- c("psi5", "psi3", "theta") names(psiTypes) <- psiTypes From fdece76ac075fecae4a8919edd5de4e650685ae4 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Tue, 18 Jan 2022 13:29:30 +0100 Subject: [PATCH 06/80] included splice type annotation --- R/resultAnnotations.R | 818 ++++++++++++++++++ .../hg19-blacklist.v2.bed.gz | Bin 0 -> 7731 bytes .../hg38-blacklist.v2.bed.gz | Bin 0 -> 5867 bytes 3 files changed, 818 insertions(+) create mode 100644 R/resultAnnotations.R create mode 100644 inst/extdata/blacklist_regions/hg19-blacklist.v2.bed.gz create mode 100644 inst/extdata/blacklist_regions/hg38-blacklist.v2.bed.gz diff --git a/R/resultAnnotations.R b/R/resultAnnotations.R new file mode 100644 index 00000000..ff9268c9 --- /dev/null +++ b/R/resultAnnotations.R @@ -0,0 +1,818 @@ +#' +#' @title Result annotations +#' +#' @description These functions work on the result table and add additional +#' annotations to the reported junctions: the type of splice event (e.g. +#' exon skipping, intron retention, ...), as well as a flag for junctions +#' that are located in blacklist regions of the genome. +#' +#' \code{\link{annotateSpliceEventType}} annotates each junction in the results +#' table with the type of splice event: exon skipping, +#' +#' \code{\link{flagBlacklistRegions}} flags junction in the results table on +#' whether or not they are located in a blacklist region of the genome. +#' +#' @param result A result table as generated by FRASER. +#' @param fds The FraserDataSet, which was used to generate the results table +#' @param txdb A txdb object providing the reference annotation. +#' @params blacklist_regions BED file that contains the blacklist regions. +#' If \code{NULL} (default), the BED files that are packaged with FRASER are used. +#' @param assemblyVersion Indicates the genome assembly of the junction +#' coordinates. Only used if blacklist_regions is NULL. +#' @param BPPARAM For controlling parallelization behavior. +#' Defaults to \code{bpparam()}. +#' @return annotated results table +#' +#' @name spliceTypeAnnotations +#' @rdname spliceTypeAnnotations +#' +#' @examples +#' # get data, fit and compute p-values and z-scores +#' fds <- createTestFraserDataSet() +#' +#' # extract results: for this example dataset, z score cutoff of 2 is used to +#' # get at least one result +#' res <- results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05) +#' res_dt <- as.data.table(res) +#' +#' # annotate type of splice event +#' requireNamespace("TxDb.Hsapiens.UCSC.hg19.knownGene") +#' txdb <- +#' TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene +#' res_dt <- annotateSpliceEventType(fds, res_dt, txdb) +#' +#' # annotate blacklist regions +#' res_dt <- flagBlacklistRegions(res_dt, assemblyVersion="hg19") +#' +#' # show annoated results table +#' res_dt +#' +NULL + +#' @describeIn spliceTypeAnnotations This method annotates the splice event +#' type to junctions in the given results table. +#' @export +annotateSpliceEventType <- function(result, fds, txdb, BPPARAM=bpparam()){ + + # Create basic annotation of overlap with reference + fds <- annotateJunctionReferenceOverlap(fds, txdb, BPPARAM) + + # Calculate splice types and frameshift + result <- aberrantSpliceType(result, fds, txdb) + + # Add the subtypes for exonSkipping and inconclusive + result <- checkExonSkipping(result, txdb) + result <- checkInconclusive(result, txdb) + + # Add UTR labels + result <- addUTRLabels(result, txdb) + + return(result) +} + +#' @describeIn spliceTypeAnnotations This method flags blacklist regions in +#' the given results table. +#' @export +flagBlacklistRegions <- function(result, blacklist_regions=NULL, + assemblyVersion=c('hg19', 'hg38')){ + + assemblyVersion <- match.arg(assemblyVersion) + if(is.null(blacklist_regions)){ + blacklist_regions <- + system.file("extdata", "blacklist_regions", + paste0(assemblyVersion, "-blacklist.v2.bed.gz"), + package = "FRASER") + } + if(!file.exists(blacklist_regions)){ + stop("BED file with blacklist regions does not exist: ", + blacklist_regions) + } + print("Importing blacklist regions ...") + blacklist_gr <- import(blacklist_regions, format = "BED") + result <- addBlacklistLabels(result, blacklist_gr, "splicing") + return(result) +} + +############# helper functions ############################## + +#' blacklist annotation for aberrant splicing events +#' @noRd +addBlacklistLabels <- function(junctions_dt, blacklist_gr){ + # add the blacklist information + print("Set up aberrant splicing granges") + psi_positions <- which(junctions_dt$type != "theta") + colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" + junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions]) + + # set to the same seqlevelsstyle + seqlevelsStyle(blacklist_gr) <- seqlevelsStyle(junctions_gr) + + ## create overlap with blacklist and annotate extra column + print("find blacklist overlap") + black_hits <- unique(from(findOverlaps(junctions_gr, blacklist_gr))) + junctions_dt[, blacklist := FALSE] + + junctions_dt[psi_positions[black_hits], blacklist := TRUE] + colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND" + + print("blacklist labels done") + return(junctions_dt) +} + +#' UTR annotation to results table +#' @noRd +addUTRLabels <- function(junctions_dt, txdb){ + psi_positions <- which(junctions_dt$type != "theta") + colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" + junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions]) + seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr) + ### UTR labels based on txdb file + ### add 5' 3' UTR labels + print("find UTR overlap") + threes <- unique(from(findOverlaps(junctions_gr, + threeUTRsByTranscript(txdb, use.names = TRUE)))) + fives <- unique(from(findOverlaps(junctions_gr, + fiveUTRsByTranscript(txdb, use.names = TRUE)))) + junctions_dt[psi_positions, UTR := "no"] + junctions_dt[psi_positions[threes], UTR := "3"] + junctions_dt[psi_positions[fives], UTR := "5"] + colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND" + print("UTR labels done") + return(junctions_dt) +} + +#' Basic annotations of overlap with reference (start, end, none, both) for +#' full fds. +#' @noRd +annotateJunctionReferenceOverlap <- function(fds, txdb, BPPARAM){ + print("loading introns") + #seqlevelsStyle(fds) <- seqlevelsStyle(txdb)[1] + introns <- unique(unlist(intronsByTranscript(txdb))) + # reduce the introns to only the actually expressed introns + fds_known <- fds[unique(to(findOverlaps(introns, + rowRanges(fds, type = "j"), type = "equal"))),] + anno_introns <- as.data.table(rowRanges(fds_known, + type="j"))[,.(seqnames, start, end, strand)] + + # calculate extra columns with mean/median intron expression count + # add the new columns + print("adding median count to introns") + sampleCounts <- K(fds_known, type = "j") + anno_introns[, meanCount := rowMeans(sampleCounts)] + anno_introns[, medianCount := rowMedians(as.matrix(sampleCounts))] + # order by medianCount (highest first) + setorderv(anno_introns, "medianCount", order=-1) + anno_introns_ranges <- makeGRangesFromDataFrame(anno_introns, + keep.extra.columns = TRUE) + + # get all fds junctions + fds_junctions <- rowRanges(fds, type = "j") + + # Do the annotation just for the intron with highest median expression + print("start calculating annotations") + overlaps <- findOverlaps(fds_junctions, anno_introns_ranges, select="first") + annotations <- bplapply(1:length(fds_junctions), function(i){ + # only select first intron as already ordered by medianCount beforehand + overlap <- overlaps[i] + if(is.na(overlap)) return("none") #no overlap with any intron + + hit_equal <- from(findOverlaps(fds_junctions[i], + anno_introns_ranges[overlap], + type="equal")) + if(length(hit_equal) > 0) return("both") + + hit_start <- from(findOverlaps(fds_junctions[i], + anno_introns_ranges[overlap], + type="start")) + if(length(hit_start) > 0) return("start") + hit_end <- from(findOverlaps(fds_junctions[i], + anno_introns_ranges[overlap], + type="end")) + if(length(hit_end) > 0) return("end") + + # overlaps but no start/end match + return("none") + }, BPPARAM=BPPARAM) + annotations <- unlist(annotations) + + rowRanges(fds)$annotatedJunction <- annotations + mcols(fds, type="ss")$annotatedJunction <- "none" + print("annotations done") + return(fds) +} + +#' +#' @noRd +aberrantSpliceType <- function(junctions_dt, fds, txdb){ + print("preparing ...") + psi_positions <- which(junctions_dt$type != "theta") + colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" + junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions], + keep.extra.columns = T) + seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr) + + introns_tmp <- unique(unlist(intronsByTranscript(txdb))) + exons <- exons(txdb) + + # seqlevelsStyle(fds) <- seqlevelsStyle(txdb)[1] + fds_known <- fds[unique(to(findOverlaps(introns_tmp, + rowRanges(fds, type = "j"), + type = "equal"))),] + grIntrons <- rowRanges(fds_known, type="j") + introns <- as.data.table(grIntrons) + introns <- introns[,.(seqnames, start, end, strand)] + + sampleCounts <- K(fds_known, type = "j") + introns[, "meanCount" := rowMeans(sampleCounts)] + introns[, "medianCount" := rowMedians(as.matrix(sampleCounts))] + intron_ranges <- makeGRangesFromDataFrame(introns, + keep.extra.columns = TRUE) + + # prepare the results column + junctions_dt[, aberrantSpliceType := "NA"] + junctions_dt[, causesFrameshift := "NA"] + junctions_dt[which(junctions_dt$annotatedJunction=="both"), + aberrantSpliceType := "sameIntron"] + junctions_dt[which(junctions_dt$annotatedJunction=="both"), + causesFrameshift := "unlikely"] + + starts <- which(junctions_dt[psi_positions]$annotatedJunction=="start") + ends <- which(junctions_dt[psi_positions]$annotatedJunction=="end") + nones <- which(junctions_dt[psi_positions]$annotatedJunction=="none") + + print("calculating aberrant splice types") + print("start junctions") + start_results <- sapply(starts, function(i){ + # find the most freq intron that overlaps again + overlap <- to(findOverlaps(junctions_gr[i], intron_ranges, + type = "start")) + expre <- sapply(overlap, function(j){ + elementMetadata(intron_ranges[j])$medianCount + }) + maxExpr <- which.max(expre) + return(compareEnds(junctions_gr, i, overlap[maxExpr], F, + intron_ranges, exons)) + }) + junctions_dt[psi_positions[starts], + causesFrameshift:=start_results[2,]] + junctions_dt[psi_positions[starts], + aberrantSpliceType := start_results[1,]] + + print("end junctions") + end_results <- sapply(ends, function(i){ + # find the most freq intron that overlaps again + overlap <- to(findOverlaps(junctions_gr[i], intron_ranges, + type = "end")) + expre <- sapply(overlap, function(j){ + elementMetadata(intron_ranges[j])$medianCount + }) + maxExpr <- which.max(expre) + return(compareStarts(junctions_gr, i, overlap[maxExpr], F, + intron_ranges, exons)) + + }) + junctions_dt[psi_positions[ends], causesFrameshift:=end_results[2,]] + junctions_dt[psi_positions[ends], aberrantSpliceType := end_results[1,]] + + print("none junctions pt1") + none_results <- sapply(nones, function(i){ + # find most freq intron + # check start and end + + # find the most freq intron that overlaps again + overlap <- to(findOverlaps(junctions_gr[i], intron_ranges)) + if(length(overlap) == 0) return(c("noOverlap", "inconclusive")) + expre <- sapply(overlap, function(j){ + elementMetadata(intron_ranges[j])$medianCount + }) + maxExpr <- which.max(expre) + + # returns type of exon splicing, frameshift T/F, amount of shift + st = compareStarts(junctions_gr, i, overlap[maxExpr], T, + intron_ranges, exons) + en = compareEnds(junctions_gr, i, overlap[maxExpr], T, + intron_ranges, exons) + + # merge, start and end results + # merge exon elongation/truncation + # if both likely/unlikely fine + # if one is likely -> return likely + # if one is notYet -> return notYet + if((st[1] == "singleExonSkipping" & !(en[1] %in% + c("singleExonSkipping", "exonSkipping"))) || + (en[1] == "singleExonSkipping" & !(st[1] %in% + c("singleExonSkipping", "exonSkipping")))){ + ## only one is single exonSkipping, the other is trunc/elong + if((as.integer(st[3])+as.integer(en[3])) %% 3 != 0){ + frs = "likely" + }else{ frs = "unlikely"} + return(c("singleExonSkipping", frs)) + } + + if(st[1] %in% c("exonSkipping", "singleExonSkipping") || en[1] %in% + c("exonSkipping", "singleExonSkipping")){ + return(c("exonSkipping", "inconclusive")) + } + + if((as.integer(st[3])+as.integer(en[3]))%%3 != 0){ + frs = "likely" + }else{ frs = "unlikely"} + if( st[1] != en[1]){ + combined = "trunc, elong" + }else{combined = st[1]} + return(c(combined,frs)) + + }) + junctions_dt[psi_positions[nones], causesFrameshift:=none_results[2,]] + junctions_dt[psi_positions[nones], aberrantSpliceType := none_results[1,]] + + noLaps <-which(junctions_dt[psi_positions]$aberrantSpliceType=="noOverlap") + refseq.genes<- genes(txdb) + + print("none junctions pt2") + noLaps_results <- sapply(noLaps, function(i){ + overlap <- to(findOverlaps(junctions_gr[i], exons)) + # no overlap with an intron or an exon + if(length(overlap) == 0){ + return(checkIntergenic(junctions_gr, i, refseq.genes)) + } + + # for the exons, check if splice site is contained in the exon + for(j in overlap){ + exon_start = start(exons[j]) + exon_end = end(exons[j]) + if(exon_start <= start(junctions_gr[i]) & + exon_end >= end(junctions_gr[i])){ + if((end(junctions_gr[i]) - + start(junctions_gr[i]) + 1) %% 3 != 0){ + frs = "likely" + }else{ frs = "unlikely"} + return(c("exonTruncation", frs)) + } + } + + return(c("inconclusive","inconclusive")) + }) + junctions_dt[psi_positions[noLaps], + causesFrameshift:=noLaps_results[2,]] + junctions_dt[psi_positions[noLaps], + aberrantSpliceType := noLaps_results[1,]] + + + # add distance to closest neighbour gene for intergenic results + print("adding distances to nearest gene") + up <- which(junctions_dt[psi_positions]$aberrantSpliceType == "upstream") + down <- which(junctions_dt[psi_positions + ]$aberrantSpliceType == "downstream") + print("Calculate distances") + + if(length(up) > 0){ + distanceNearestGene_up <- sapply(up, function(i){ + min(distance(junctions_gr[i], refseq.genes), na.rm = T)}) + if(length(distanceNearestGene_up > 0)){ + junctions_dt[psi_positions[up], + distNearestGene := distanceNearestGene_up] + } else{ + junctions_dt[psi_positions[up], distNearestGene := NA] + print("No distances found for upstream") + } + }else{print("No upstream targets")} + + if(length(down) > 0){ + distanceNearestGene_down <- sapply(down, function(i){ + min(distance(junctions_gr[i], refseq.genes), na.rm = T)}) + if(length(distanceNearestGene_down > 0)){ + junctions_dt[psi_positions[down], + distNearestGene := distanceNearestGene_down] + }else{ + junctions_dt[psi_positions[down], distNearestGene := NA] + print("No distances found for downstream") + } + }else{print("No downstream targets")} + + colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND" + print("done calculating aberrant splice types") + return(junctions_dt) +} + +#' +#' @noRd +compareStarts <- function(junctions_gr, i, max_lap, shift_needed, + intron_ranges, exons){ + intron_start = start(intron_ranges[max_lap]) + ss_start = start(junctions_gr[i]) + + # found the most freq intron with same end again + # check if intron starts before splice site -> exon elongation -> FRS + if(intron_start < ss_start){ + if(((ss_start - intron_start) %% 3) != 0){ + frs = "likely" + }else{ frs = "unlikely"} + + ifelse(shift_needed, + return(c("exonElongation", frs, + (ss_start - intron_start))), + return(c("exonElongation", frs))) + } + + # check if splice site ends in following exon -> exon truncation -> FRS + if(intron_start > ss_start){ + + # create dummy exon find all exons starting from that intron end + dummy_exon <- GRanges( + seqnames = toString(seqnames(intron_ranges[max_lap])), + ranges = IRanges(intron_start-2, end = intron_start -1), + strand = toString(strand(intron_ranges[max_lap])) + ) + exonChoices <- to(findOverlaps(dummy_exon, exons, type = "end")) + for(j in exonChoices){ + exon_start = start(exons[j]) + if(exon_start < ss_start){ + if((end(exons[j]) - ss_start + 1)%%3 != 0){ + frs = "likely" + }else{frs = "unlikely"} + ifelse(shift_needed, + return(c("exonTruncation", frs, + (-1)*(end(exons[j]) - ss_start + 1))), + return(c("exonTruncation", frs))) + } + } + + # check for single exon skipping + if(length(exonChoices) == 1){ + + # check if there is no other exon within the first intron: + # splice site end until exon end + dummyFirstItr <- GRanges( + seqnames = toString(seqnames(intron_ranges[max_lap])), + ranges = IRanges(end(exons[exonChoices[1]]) + 1, + end(junctions_gr[i])), + strand = toString(strand(intron_ranges[max_lap])) + ) + + if(length(findOverlaps(exons, dummyFirstItr, + type = "within")) > 0){ + # another exon is contained within the most freq used intron + ifelse(shift_needed, + return(c("exonSkipping", "inconclusive", 0)), + return(c("exonSkipping", "inconclusive"))) + } + + + secondItr <- GRanges( + seqnames = toString(intron_ranges[max_lap]@seqnames@values), + strand = toString(intron_ranges[max_lap]@strand@values), + ranges = IRanges(ss_start, start(exons[exonChoices[1]]) - 1) + # end of exon + 1, end of aberrant junction + ) + secItrChoices <- to(findOverlaps(secondItr, intron_ranges, + type = "end")) + # only look at most used one + expre <- sapply(secItrChoices, function(j){ + elementMetadata(intron_ranges[j])$medianCount + }) + maxExpr <- which.max(expre) + + if(length(secItrChoices) == 0){ + ifelse(shift_needed, + return(c("exonSkipping", "inconclusive", 0)), + return(c("exonSkipping", "inconclusive"))) + } + + if(ss_start >= start(intron_ranges[secItrChoices[maxExpr]])){ + # check if there is no other exon in that range + if(length(findOverlaps(exons, + intron_ranges[secItrChoices[maxExpr]], + type = "within")) == 0){ + # clear exon skipping, only exon is skipped + # calculate frameshift, skipped exon plus possible exon + # elongation + + shift = (-1)*(end(exons[exonChoices[1]]) - + start(exons[exonChoices[1]]) + 1) + + ss_start - start(intron_ranges[secItrChoices[maxExpr]]) + + frs = ifelse(shift %% 3 == 0,"unlikely","likely") + ifelse(shift_needed, + return(c("singleExonSkipping", "inconclusive", + shift)), + return(c("singleExonSkipping", frs))) + } + } + } # single exon skipping end + + } + + # splice site longer than one intron + exon -> not defined for now + ifelse(shift_needed, + return(c("exonSkipping", "inconclusive", 0)), + return(c("exonSkipping", "inconclusive"))) +} + +#' +#' @noRd +compareEnds <- function(junctions_gr, i, max_lap, shift_needed, + intron_ranges, exons){ + intron_end = end(intron_ranges[max_lap]) + ss_end = end(junctions_gr[i]) + + # found the most freq intron with same start again + # check if intron ends after splice site -> exon elongation -> FRS -> done + if(intron_end > ss_end){ + if(((intron_end - ss_end) %% 3) != 0){ + frs = "likely" + }else{ frs = "unlikely"} + + ifelse(shift_needed, + return(c("exonElongation", frs, (intron_end - ss_end))), + return(c("exonElongation", frs))) + } + + # check if splice site ends in following exon -> exon truncation -> FRS + if(intron_end < ss_end){ + + # create dummy exon find all exons starting from that intron end + dummy_exon <- GRanges( + seqnames = toString(intron_ranges[max_lap]@seqnames@values), + ranges = IRanges(intron_end + 1, end = intron_end + 2), + strand = toString(intron_ranges[max_lap]@strand@values) + ) + exonChoices <- to(findOverlaps(dummy_exon, exons, type = "start")) + for(j in exonChoices){ + exon_end = end(exons[j]) + if(exon_end > ss_end){ + if((ss_end - start(exons[j]) + 1)%%3 != 0){ + frs = "likely" + }else{frs = "unlikely"} + ifelse(shift_needed, + return(c("exonTruncation",frs, + (-1)*(ss_end - start(exons[j]) + 1))), + return(c("exonTruncation",frs))) + } + } + + # check for single exon skipping + if(length(exonChoices) == 1){ + + # check if there is no other exon within the first intron: + # splice site end until exon end + dummyFirstItr <- GRanges( + seqnames = toString(seqnames(intron_ranges[max_lap])), + ranges = IRanges(start(junctions_gr[i]), + start(exons[exonChoices[1]]) - 1), + strand = toString(strand(intron_ranges[max_lap])) + ) + + if(length(findOverlaps(exons, dummyFirstItr, + type = "within")) > 0){ + # another exon is contained within the most freq used intron + ifelse(shift_needed, + return(c("exonSkipping", "inconclusive", 0)), + return(c("exonSkipping", "inconclusive"))) + } + + + secondItr <- GRanges( + seqnames = toString(intron_ranges[max_lap]@seqnames@values), + strand = toString(intron_ranges[max_lap]@strand@values), + ranges = IRanges(end(exons[exonChoices[1]]) + 1, ss_end) + # end of exon + 1, end of aberrant junction + ) + secItrChoices <- to(findOverlaps(secondItr, intron_ranges, + type = "start")) + # only look at most used one + expre <- sapply(secItrChoices, function(j){ + elementMetadata(intron_ranges[j])$medianCount + }) + maxExpr <- which.max(expre) + + if(length(secItrChoices) == 0){ + ifelse(shift_needed, + return(c("exonSkipping", "inconclusive", 0)), + return(c("exonSkipping", "inconclusive"))) + } + + if(ss_end <= end(intron_ranges[secItrChoices[maxExpr]])){ + # check if there is no other exon in that range + if(length(findOverlaps(exons, + intron_ranges[secItrChoices[maxExpr]], + type = "within")) == 0){ + # clear exon skipping, only exon is skipped + # calculate frameshift, skipped exon plus possible exon + # elongation at end + shift = (-1)*(end(exons[exonChoices[1]]) - + start(exons[exonChoices[1]]) + 1) + + end(intron_ranges[secItrChoices[maxExpr]]) - ss_end + frs = ifelse(shift%%3 == 0,"unlikely","likely") + ifelse(shift_needed, + return(c("singleExonSkipping", "inconclusive", + shift)), + return(c("singleExonSkipping", frs))) + } + } + } # single exon skipping end + + + } + + # splice site longer than one intron + exon -> not defined for now + ifelse(shift_needed, + return(c("exonSkipping", "inconclusive", 0)), + return(c("exonSkipping", "inconclusive"))) +} + +#' +#' @noRd +checkIntergenic <- function(junctions_gr, i, refseq.genes){ + # check if start > 1000 + # start - 1000, end + 1000 + start = start(junctions_gr[i]) + # ifelse(start > 1000, start = start - 1000, start = 1) + # if(start > 1000){ + # start = start - 1000 + # }else{start = 1} + + end = end(junctions_gr[i]) #+ 1000 + if(start + 2 < end){ + start = start + 1 + end = end - 1 + } + + test_junction <- GRanges( + seqnames = seqnames(junctions_gr[i]), + ranges = IRanges(start, end), + strand = strand(junctions_gr[i]) + ) + + # overlap with introns and exon + # IGNORE STRANDS? -> decided its not necessary + + # check if distance to nearest is > 1000 -> intergenic + # otherwise up/downstream + dist = min(distance(test_junction, refseq.genes), na.rm = T) + if(dist > 0){ + # if(dist > 1000){ + # print("intergenic") + # return(c("intergenic", "unlikely")) + # }else{ + + # find nearest and compare starts + if(start(refseq.genes[nearest(junctions_gr[i], + refseq.genes)]) > start){ + ifelse(strand(junctions_gr[i]) == "+", + return(c("upstream", "unlikely")), + return(c("downstream", "unlikely"))) + }else{ + ifelse(strand(junctions_gr[i]) == "+", + return(c("downstream", "unlikely")), + return(c("upstream", "unlikely"))) + } + } + return(c("inconclusive", "inconclusive")) + + # if both lists == 0 return intergenic else inconclusive + # return(c("inconclusive", "inconclusive")) +} + +#' +#' @noRd +checkExonSkipping <- function(junctions_dt, txdb){ + psi_positions <- which(junctions_dt$type != "theta") + colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" + junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions], + keep.extra.columns = T) + seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr) + + refseq.genes<- genes(txdb) + + exonSkip <- which(junctions_dt[psi_positions]$aberrantSpliceType %in% + c("exonSkipping", "singleExonSkipping")) + + print("start checking exonSkipping") + newSkip_results <- sapply(exonSkip, function(i){ + start = start(junctions_gr[i]) + end = end(junctions_gr[i]) + + # reduce the junction width so adjacent genes have a distance of 1 + if(start + 2 < end){ + start = start + 1 + end = end - 1 + } + + test_start <- GRanges( + seqnames = seqnames(junctions_gr[i]), + strand = strand(junctions_gr[i]), + ranges = IRanges(start, start + 1) + ) + + test_end <- GRanges( + seqnames = seqnames(junctions_gr[i]), + strand = strand(junctions_gr[i]), + ranges = IRanges(end - 1, end) + ) + + # check for which genes distance to start is 0 + start_genes <- which(distance(test_start, refseq.genes) == 0) + # start is not in a gene + if(length(start_genes) == 0) return("beyondGene") + + # start is in a gene -> is end in same gene + for(to in start_genes){ + # end is in same gene + if(distance(test_end, refseq.genes[to]) == 0){ + return("exonSkipping") + } + } + + end_genes <- which(distance(test_end, refseq.genes) == 0) + # end is not in a gene + if(length(end_genes) == 0) return("beyondGene") + # end is in a different gene + return("multigenic") + }) + + print("checking exonSkipping done") + if(length(exonSkip) > 0){ + junctions_dt[psi_positions[exonSkip], + aberrantSpliceType2 := newSkip_results] + junctions_dt[which(junctions_dt$aberrantSpliceType2 == "beyondGene"), + aberrantSpliceType := "beyondGene"] + junctions_dt[which(junctions_dt$aberrantSpliceType2 == "beyondGene"), + causesFrameshift := "inconclusive"] + junctions_dt[which(junctions_dt$aberrantSpliceType2 == "multigenic"), + aberrantSpliceType := "multigenic"] + junctions_dt[which(junctions_dt$aberrantSpliceType2 == "multigenic"), + causesFrameshift := "inconclusive"] + junctions_dt[, aberrantSpliceType2 := NULL] + } + + colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" + return(junctions_dt) +} + +#' +#' @noRd +checkInconclusive <- function(junctions_dt, txdb){ + psi_positions <- which(junctions_dt$type != "theta") + colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" + junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions], + keep.extra.columns = T) + seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr) + + refseq.genes<- genes(txdb) + + inconclusive <- which(junctions_dt[psi_positions + ]$aberrantSpliceType == "inconclusive") + print("start checking inconclusive") + + inconclusive_results <- sapply(inconclusive, function(i){ + start = start(junctions_gr[i]) + end = end(junctions_gr[i]) + + # reduce the junction width so adjacent genes have a distance of 1 + if(start + 2 < end){ + start = start + 1 + end = end - 1 + } + + test_start <- GRanges( + seqnames = seqnames(junctions_gr[i]), + strand = strand(junctions_gr[i]), + ranges = IRanges(start, start + 1) + ) + + test_end <- GRanges( + seqnames = seqnames(junctions_gr[i]), + strand = strand(junctions_gr[i]), + ranges = IRanges(end - 1, end) + ) + + # check for which genes distance to start is 0 + start_genes <- which(distance(test_start, refseq.genes) == 0) + # start is not in a gene + if(length(start_genes) == 0) return("beyondGene") + + # start is in a gene -> is end in same gene + for(to in start_genes){ + # end is in same gene + if(distance(test_end, refseq.genes[to]) == 0){ + return("inconclusive") + } + } + + end_genes <- which(distance(test_end, refseq.genes) == 0) + # end is not in a gene + if(length(end_genes) == 0) return("beyondGene") + # end is in a different gene + return("multigenic") + }) + + colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND" + print("done checking inconclusive") + if(length(inconclusive) > 0){ + junctions_dt[psi_positions[inconclusive], + aberrantSpliceType := inconclusive_results] + } + + return(junctions_dt) +} diff --git a/inst/extdata/blacklist_regions/hg19-blacklist.v2.bed.gz b/inst/extdata/blacklist_regions/hg19-blacklist.v2.bed.gz new file mode 100644 index 0000000000000000000000000000000000000000..5d87eb135971a4b5e1010045ab5c5bbc4736e1ab GIT binary patch literal 7731 zcmV-39?an%iwFo<=jvMk188S4IW1yrVPk7-X>)Wgb}}wvWn=(_eMzouNpjryY8ADF z=$jpIKuxGg*059%iU|-TK?o4Edz!h$n?1LS9DDGi{{4UaJ*cD2Hf~T)z0FbYnX8N<(59o- zK1biH1e$&9oLoO3-PIjqp26RpXPIBEpS!@q>#fBh|MP$RzrXy?zy0Tb{_TJN`EP&z z-~azOwbaSC!M|=|NId5R@eccPwT{Xs)mA;RuRTb_zw4$ixK1zpUHseZwAC`(nem2G z)MUi9t<8lcxOM`^!DBdu*fnb$yCenwae?tq>%&tQHUX&>uVE)0jG?8j?cCz2bH%Qi zw045>Ew~)h_4W6FUuzjg4OIWSv=etPntKQnQEqT`^ezXV}%k zOmYrl2b5}0B+0Gei9v3S^X145lHkNK0kX5(czk2qlMS*w|Z18=1-D_Qpn!)vnvlN4@9NSQP+2F|F(pXNO># zeL?DlddiWQ^$gKv&~7xZKFrdzU_7!bw{Gjb=70uDPE;&1xqMx4~ImIt_&5@j{?u zuatN)&>lvyBDr^ttO!j%X`Kgah>ZIr8(j~*s$Tcg9SjJGP!8^ZnD*2)o?Kg$gr2(h z5%oCbfYe=M`xA7t13DyM!KEJ}uF92|jYC`I0aft$Srxkz-VI3Q< zbtuO@pMM_v`1NsnGm-_=`2-5Vm!(=`#i9f3W>$aS9nuoFWOqji)B|F?=-IvdCB1uv zq$0TXMSE#7F#yKS#1wh`#)<3DJDkEp>Xs$}aGeHC>XBgsU?mBtFkGlg5RBr-RA&q6 zNZ=22$IplD*=bJNNd;raXb7AJ4Jx)=C=&qHsdGUccZhoY_Jgn;(v$!~u}GE*g0IJm z0t|bgA-?Ci(x}##`G@Hg7aG9s{zdVuIGg~>S(&6_|A$J4|KZD|E~wUyJ~)! z;Llgb1QAe*>b!Aoce`& zXWZdfsV@Qt_QMH>0KoM^6uVM79>DZhBQ(AF+-sqCSnBR!!vHM5`V+e(9tL2YF`(8W z^?_h#?#ke;E010Maz3HF_mCxY-T6Z@A((m#>YDz-#i^7N3>wOIt_f0`lnCfBwc5=; z>0^pvVZ}y%@F(oX(9992DxuN850 z=7bmm7+-D3`KC#|+Tj4zImi2F!_tpC7+|%0 zA8pvta*qP|1G(!~{D;5=;z)%GFoSwmx4Uz<9SSADDc*;JRaDdy;1;`1xb<4>CV=iW zZ+arbjss}LzboXW-gTVn{S%lfIq?XVpRD-;Yy>BZ(*|$GK_l>jY0o1Y3P3TEoVuDk zrh7qS^^>}$;w%7PRQkQn2pLP@Pbsc0xo0+A>A4VPl`l>p?y&VRa9#qtSeuKAtzEG`1C^Udu+BtW7e-E3?04bq&&}m9Xh!N zjtBb_j9W>rZQe>Q!U*cBaupJ|u`igW@s?{!)DUmuXkiKtw=8lmD_J&=D|e3?ec_tdFqnd0U!k^8U9)~ zB->ptnjmSr%@&vkV`uVa+~ER4{3ghPJ^Od63EXjC{!s$ zp37_3wcwZzJsFC&zi>d=GFAfl-2jufPQ~Yme5{bEdu1tI0?3$cl@W;h(*4$#NI58X zR=opI&xLEn$Z=BylP(3zghHK?c^;-5&@FI{mHkiY? zUKS_py_-vhS{Sv|egHat591AYC}%|1STU8}QO!eTikDwCaQYG%$lIrtoUM0&=G(uK zXN`F!YG0%TER=a4`K8oa@(uMR&@fVh`LRdIKCR9#8@T@XUx~W+B9XG<9*tDj^6kl2 zK~JLnB{`)+N?ff-f0?O{3`_lGGxg?@iGqx$Y59PBc!k+f%I7Bfq8)>>*V~*xUh4+s z#^wn$O(cv__apgJNB!cvAmu7osE!utp_bf2dkJ*BQv#GRmD^Rv{9?zEqK!thy~iJv zm&RKeK=sZ6#yZA%uW@WP&iF)jQE`r;@qy^Mv4z{NHpg##paj2Ugyv_9)UK2v0yiR3 zZx07Kz)T~ZgSiSN)DnQ6CZB=FWU)xC0C05$pfVH~SpXOInjO&Ol^t$!fMsWW?NBr5 z=bSc!d3@;654n4IwVEw8Xrw*k21==mvV#hFAJ*xix6#hPfd+YH*q!T=zCk@rK3M=8 zWgnl*+~Uw60i69iLU9{Ex)V;JIj|!Xkn^KmY^!q)DH!8v0`)kho8(<3gow7&Bxf9} zx||X{KVi5;+7@PQnqxJ|P-b&$d)(|w2e0F{iG;bnls=U|66@5IsE)VDtT*t^>~YyO zNLZHyIGHpiUvr0=;+!;~LZpY*&Wx#41q6>77FQ>~y7ieK#xG;f5x5ZM{H)t%Je9yD z*US$#&!!&X!s7JFA{6mlK9O-1WUbjc(;gWv%NtA8Klb=gdIU8*F3$n%rlLuZm;ntleb%XZu^1Ww#-7Q1YvNRCree=B&a zDX_$)bT`aq_FaStKjE-5n|(+2UHmA3)ww~11XoI3-&QpSb^c`EF&1fd-fUG#P_rGU z6p)HrD{}wl+BK63o9R3fU!~=QS28W%`4d-XgF2zpR=V8{?0ozRYSzl49$4w z6D1qwNSx1Ak;1iV#9`ih?6B6XO$y@C5y0t9-Ek)>e`aW%#i_D~G9~J>0GiD{1(oT? zVg6Aw^HgkQa(O?MF5bx3;J5 zSB*{JQgfn4zx9XZj2L zys|K;iQ0!#dJmJSo8A?`86M}uYyjO^0N<#qY!!;H>Wm~%2aB~x1#rPhQW@9?YD7jo zb;OQlmg|>yoqa#K+4FSm!xUH59dbsU)lKT$67XwvcWZ)5<+Mr>DE?rD4ILR`rQ0CO zVWwe~=?n1XsVosw`e3)o`k*q!SQ&)<`W;A(KhDz%ud1ao!923m=;*UYV4be2{>Y)Q z@49ibDkEd8JM9eYRbOp3K$^u!UXweYz}Y^e!roWv$SKYtc~~ke7bBBAoYRohrhpZJ zQ}IacJ2VRb>m(p`?U!W&OWn>t^G2F%nr%zdFOz$QGIwlg>HQpNcj3*r)nz%~59Fja zDs%Om)=KIuD*l7*T{dL6UHRb4W+Q#LlrCM5c`Xs}AMR-2`u?e*oNfny zXcEUAzOJ)~p%*Vn+HOmoBDGlQ!rh4@7=fqBzJ)w?={Fx<7Fy$DGD|*?n9>^nrn@$v z+ZCo;hn824?Q%lKRoYRdvf->9DUS-d;)=VjEeaTW=#r1ZASVvUI9=f!rZ%~Je(I0) z%%^KU`#tB<_ZOacB4;0BxuC*^ssl8vFOo~!>lXKfKiq9oT!!i!0d$>YqES%ytDqfT zH3Q>poin22-vK*y#~KGp-HiD2eo>r2+buAZy{iazAUk0&_?vVyqG`15Myr4KkTuMw znTfY8c=v=2*jDIpDGy5@C%Qp^nsmVyj1h;r-PkvV3e8MepJTnVmQ!i*hG1r`c`kz* z)5nj>8gOP?mWbCN3oteEZiibweIh%sj28+nQJH()NK zTXbnzOzCl)N(WYxpuAZ*Z16Au3DlX|^5mU9)kWseS8jir@szNJaYjwWdTS9(8?rEA z73LZXsj%5}+8B;BTZ7GKP7AGu;3gU)c%3aPLSvxQ(yUISr%CZSf-F50JSY>Y)u6Y> z1f+x76!Q#!kok4UqN|WXS1tZ;KmCF6p}r~*&SuU(wI97RRnHjKFkSuDVMmD-Ebh;<@CY-aE;Or=AH8Xo_mim zPTJ>|rrHnuAI(_uO<8@RXD|~kog(%-I4@JoWva6~3+8aVu4bp2$X$yV{@5>uKZZ$X zh;AqQd8j(04%uGkpRaO4nJm$qIHCr!6c!^=)b;m=uF!@;31E(i+a#QxG3GlM!zfPR zGSNu>_tEP$3K@%@LSlL8*#Fm&uM}#`KSN-i;30Kiok|=b^Hn z!~scb@&K_+&m<~!UL;ouCcD(0tTlUkK%j?OH22Ijo#^YKQvfh#zIL7I0jduK=tkXS zjrD9>F80~8QU_L<4jpO&Q{lo=vFEe~XvR$yWjiYM18}!@RJiO&9pmg6+mp;IwG!YO z$;N_=tAu+3bB(gb0b}Qg2wJ*2huLtE7+D-Ddi#iB&~ zKqo(k_(eS@P=0XOGHWV=Kx90k!R>Nz5;c>LLoOtpIt2oCqkL0nU`MJ2;rvN<$hq9U zk}wd3#`K_w!5yyrs>KA-B=i-ssNZJ*-r~clTEB;*VO^E$Fx(0?I&?V)cwxBFr@cuVYXd{_)lLlO~SSw zi{SP)HMafAj}3P#BZYI&Od2&42VJYF!Em5bNQdi2H$L|%FSomp;P{NvZk!4zTf>xD z54OeFU7ZxxF`11cN`m8=tYkI;*1_(ER|T&CW)4Sq;FYoiVB{V%`G8?xctB(pjOrqYj2n6V{xI?Kh{9lgoatC)SS&e8Lcbd3Y?mtegQEQpjcg%sGv;)H&oMwYg8XD{MNHlp; zlO6*35_pCjFIriK*UT89lYv|*`A;CEqqy?vE!+4E;}|Qd^9W|vl>#)fl<@TY!s!El zqBr5i$af=|OIkKTc3H*@$~Vw(CX7}-KaqD6K8ZL9ZWdctr)q#}TzZqaWpYS##=PCY zag8W*4Z!(5$-G_NjOKNM6-+i7mFuxMM*^7`$xkz-ENwby^7^uwkUb`Ntr<*$EK~v7 z;om~2E|=ssKbtx7VPsGMy0K7sQfq`!is^s?=6!x+0#gJuS)Hw0oJ||;2|MHVr9$D? zb|GV4TKfFa=;4$rCt%WRAv>e_oyo{m2ws9L4QNJDo@!gMigshL{6o!5iCppAWINj- z4Ts2CmlS}o$BsNL%w4l59fwEu1Ch&u3DmM@^;8Q{q@l&G64?? z#kQ0dNOZG^1BtyT`!k|cy(Pu2VhZj@0)f&La?!{fJjW&AL zoO&@kHx3N5bQ2%j`mjYy__6!lD40&S8#|c)G}WCZfdfoz+Hml*PRVWqT)C?L?rcSGpLv>G5)seKWfIQL zDD~wNmSOBoGGD^_7&c*jF{!#M^>41!!9>~%!w}@V8v3XDNfBff925dwiaJ5om}AAq zT@-LTTip+Tr`_kR2T-r$--SxKYD7nE!MJ-@w0*3~B%eKN>$muIg&lj9z(hgG>`Im5 zH->VFHphuTTYMlU>l*^vf=L=FnCq&oU`A892|*U%TFgn3twcYQNuMF}<=+$JTZLT) zMPpCYT+twNE6oR>XBI)w^evj@`*Ty;Lo1Fup|z8xf2?TES9L3}pL0%o_X5B%(L^v| zyHEAyYw~^m{=oU}5Q408!s6F6)B2C8Khl(+=alTb1fFLDKJ2g zUk$q*wTmGz+CP{)+u`+sG#u$2$B4DkMEPNdbQ`ewMI>AR^n8-{em%elvgSZ55>C~! zYp=8$C}~@j223_Jv@>*aqXStgF*Y`(0%*)P&1v^ls}k&VLf9EM(%K2E{9$EE99UlE zS-0cG%0H#F@IziR9qgf99{Li*K-MyxT9y140Pl+Vopy`*JHvoUn{r zs{Ijj9hmt|n@ykk32$V(K!gd(* z_4v7C=T=)~mpZw!fp*qbcd=ziQ*BS6EFvAmBX=#)HtSQzmOng@8BBC=(N6eLDM%`0_7^t|rsF-d?gUkhw6% zd>@#Kio>j(F^Q{PdG%NmlB@i%HQSa*AUbA-4lD@G5KvZ+LPrW)n!Kd4eRCy<{!w*g z4rE%e2e;3kB7we4y?Joy0TGiHXP8@P?N={X`Kj3KV~9cAiV_jc-1z37 zkJg7pdu9_am{Gcgh;mDWP1%%09w@h%8Qy$X>bm92Hi8s*_0o$~yZNtE(YcqGFf6A>da9OWQpaplpBvl_apko3lJ-LKN0Xqd1mxu@$Bet}FhI=D+E1VFFo`*&`%!8j&a`_TOr z!#TyZ$~vFu-uT*4bW;?{#uZJSH7Y+@;(ojKSF=lCvcwKQucb2d(Q7psU1V;>@@e>) zX=KIzm2nu=;%-FIu!oaGU}6S`u5;<61SULt=`Ia}&a^VwqjR^0i_^VA{P-IXA)XA8uf0@*~y@j~v{>g7k| zzBY2{?2ePf5+}!F{7{X$=DX-KCd2U|kr|CPY4Mi@*~*UL$8C&gWLf)B#~V5>#<+1Won zUEk@LX`tf|X&*%ynfWIBgDnYE!7_{adi)=e%p85-@U(o1|A6}XKG{0hC_O(2e|*^X zG`t~mzJ71N4g~kNzL*TT-i;h2bEutYgxcKv@oJt5wG}6WFg>u()!8r-9^3?7Rf`5N z#jji%$eq`PyzXI&D9B}NL%FlhtK&(zbS%!-(0aP28Gl69^-tGnwwSeCww9Ff5-s2J zJ;#rvNSlk#{n$z9lvzrc8`m`pCYR~QN^GN8aGRhq-%6z{6O(LrARV^)h%3+whyTv~ tD?3|0@7TDe9n)X=iu#@%(~gXM|0_R(t$JX7$THj6{|&+E3w>gU000{4D_sBp literal 0 HcmV?d00001 diff --git a/inst/extdata/blacklist_regions/hg38-blacklist.v2.bed.gz b/inst/extdata/blacklist_regions/hg38-blacklist.v2.bed.gz new file mode 100644 index 0000000000000000000000000000000000000000..a4ec858104cbe907e9056e873cec278cee11066a GIT binary patch literal 5867 zcmV)Wgb}}wvWn=(_TuZKHxp5pHucDX0 zKa)GqK%Q8W+rzeCvxJmnd4m-77a z@BjPDfByE5fBg2}KmPjTU;pwkX# z@%O*|KXpf2qMo&nrJhqX>ZrRmOC2>Z9;vOC-7U4XeTJyLH9ObZ))|YV5zRhc7ohgO z+KeK!&mQl?R;*<`W4?@4pIP-Kvz5062PX!9VuoAn`kXZ?{(cq~{B5XOs&*Ft9)sJsBZ6%g>vWs}&K00-JL;ialL0XNXVn6cj@*3wK=J$4 zs-{qc`2TX`UU9gm&O=>Mt&~-a!tIVlE2-=%EA#bl3;$gGRsRZ>viHdrn54&`I>^ez>46kEO zT!TJ^KHE~gefd#W#jsV!Xl!@lKwV&Pl?bNYMQ9@om+Ge0IqkSSzxzmlfULBE+{WnI z1ft=Ls1*>>7hIw{$pB9HRh8Zmsnk8Hb?T zV*tvSvnF{!cYAW@dZd+WIr|rKf9fi!)at71J8ElbW&@Vq%oo&XHLVc9GArDl$)UK< z0NWY*PLoEhnZ|82w+bNF(PyqN&>H>a7!MlyfVncB%OKVwPz<4N|4w1vpM% zKwYcJngjGX@t9x%PR+=86OGoI5IwYLN!( z(>Vg;!O3otYl3Yyl+;}=6`N%jYs|6rSpeZ=3e*)T_5{vN8}L=UTie+f7?0RMATIGb`b3A&S5P*=wp zCl$c?Uh{z;*L4N~V%x34k}pa2#>X`)02@b9D*$kgIf|g}m6m98!|YrUx;5!;fbBmk z8p8M{DkfqDnsKnSEL2PzmWuO_59+B2pa|ksu<-Xd?JFogE;wEAyJ58zAe^~$4u3Z2 zf*~wRz6~O+=A0Z-WwA(Iz1v?0W2W&O%Q`ut9<4j#fbArJ)aaR{f!MZhCUAyS0E4*p z79fEKjg}`62gQYEq_%Xy#v08<6pM@@Zx~>?IBq>sWeTC;Ma7%bwy7e!3&uBh{9J#@ z(Z*1k1fGSZL2K7k^~?FQn?Z%NB0_-n7Gi=0tCjhj!*gc8qwYw565;4Di?<(bh_p9= z%de5ThjvdssL>vv!X7Uj9cU5fqhT?jG_@;(F^HW2;~A|eh{2hqDG%7&6WlOS})Rg9nXgA_~G0v+oD;({{Ct4MQW73vFN~2qF}oGLbrDh8UO|M~q0d_e#WZ z>SDs^?BS`qr)~tXIpuO-7kcPy1DtM0?F=T<+Qf2h)S+p}qCtmm)QuD$CAm^h<7_*P zaiFPt#xw(-p9?EN#jndG0Zf;NIS;=1Wd5O4n}`K9X3FoxiFd|_>CfZL{Bt1bvau5hVk%u5kzAU*{Dwi3DFjks0%h z&Vc|UBvK`j^Wh(6iEFxVM9zhHSbM12d?9%~KV955;$#k$qb}=r|_GYxKTV~WhVBOa%x6ZXLpkoY!p#w z?VA!2%{9Q%)#$(JN zOULxJ;US#K6NfO>>=4t{S#BjTX&vV7cg<2C?4d~#aeGd=+jEN3T@Q}Ml6Vj}sRsea zUeOCg1=Q%aY%6Aja-n5L}MyT|j-i zF%N@F1v>~t-*PRaNyD*PJ=d`V&PdM}2gq7XMOit?ji}?B<5N$h7j#;&q5%ec?<_Kw z;!@w##-+8B5h14X);K~UoS>eAGTL~_pOD!4SUF-FuVJlfq4104DAm5Qi2JS+5CE_2 zS}D8Qv(8##$;um07$;ENe!mK6*|S*(#Zz-2YIeI;Z98ozkW2Z>bRj~~xJ1j1+Zua8 zr1BcivKtrM5P3X_jx)7j9M~keZbYUZ23!WF(hfw`M5*Mm2&@bN&@okMa`F38(nPM& zMqK5CNzlCyCZp|5BzMnQSnPC?b;lBO# z1I?(q4dNOT`#?Gi7E&PqbJlJ*p-9u((q*?2KkJwhOMqo^IP75PO;Sf-QepjB6IMY@ zH~=P%Or|R6MZoy9WUg#vBrvUkU_vV8t`Zo-2xgQl!4lZB39ld{4uG^J@66jsLts)e z{n>M6rfbjbH3JA;^8}bC^$R9!fLwHdX(c4GABOqDIQG0lx2LfH*mlfQYNi(si(8&K z6l}rcYRBpjc5=-Grv6n4!zV%wdKWa=dHTr5;}hxWOR+fVL9C?<1GWZD+VWiM6w zolwHwP9q-9_)^GPtJ-7k+ua@WOwUC~k^U@^Ny5?ppwyonUr4#|5&!Pd(6m zvQtFE#K!W@c{s%#-StqC%eFwxfmt|{C>2+ASaG4L{e0(6t4?&h>_9E$$w81%TKGAv zmbnz7S^c-IIP#T*yk0+SR?jtKo7p$qo{l{byY4)fjhC0Z$~WA3Q#ywsoBuZa{p7dd zFd6@ey}ZtzjAv%c)SSi(83W#5X24y^0LrEw8o$pdQ;B+pX=BvOLc_*a1 zO3GlEbCmFA4kFz5;y|d2h}V@|BQ{s2X9F&QT|=Cqbc`u)cAixSSi4Ne)ATcN&0zzx z#htlJQ$-l8jf|F;1Bi^pAQuKGH9JterR0VZP(8jJ~!7s&1t%R9M~M9bt6Cu8$oG7`{rGCK#jTzL^i=o{`indcQU)c|lCn{qOu z2QsZO)Qn%bbD?9ujmIz749I2hC#~hmRH2YPF)Z%pEkGetIVG3oGNVZ5WbAzDE2p;% zFj+DaizXWRU{^ebkQsmk6OlQ3CX51dy;m0^N(auM&R zY2iv((Wf#?B}O#tW`PFp`zDU+FzxZ;_S^oobGnv0Qt1kr1Uv6S3=@X2*MDRQ(Fo{f z=|Qn)*)QU;Vde7PZ)I}Y3sz%elcnJY~uKxXHD z`Y})%nU7oLxTioipPHb@h>YGRw<8|U zaVgk-TyEx7XZ8}~d3rtBxNCMFtiI+-2V74`87Aa7Z;Z~{>HErzP){1%>gVmi+#a4?su=Is0F z?vW8*gezI@CopOmOnkg-0f8}x=Lx=uBQ*kGZqR@^*#|4tRJJkQWY)YwPGD61dDL3k zGN1u6vHM{Lr)Iw_QJ9>|Q6vAYz5;c-Q()|xSGa&$dvPE|j9a!RS}%>;B!-4jPegZp z4xdMD|L-FA;kCYO@iXq)hxd8q(7R&y2R=$(_3Xy|{VpEUGUe&hD!!6`RMxN~$HSu9POh%H!71KX#!Opu;_<6U+Ect1E z;X;Sc)9$qygc!yn0B>eir0r=GBjn_K^=&Tu9ngjOg2o7QG(T{ZvNs?9+VS9m$(nB4 zZ2ft737fn#T;786vg5uSg9W!cf;cKWLdtPX>>u6>s99v>rKOiho3zNMY6y(QpfG$% z^{6hEB(vp1lmzym!|1Nt6qFu?VU9Eudx&HzkcLV?^n zpGKJ5J%|J*jgM+?yx8vM*~?xE_L(3zp$V9fkDb@VN_@kdH6Im$(`EWCz~t5DypKJa z3G%3n?>le+?$$b(FuRct3a)Nk-Vc4Boo-iyLUk(--#Hr?;hZ-+=48*9UO_Fqd+3>2 zq?yG&vH>Q(SMy{WweeHP9QlwCfa@#+v)y+yC)A-IvyFR&Q59Xsq&KH8>O1V&obVuJ z;|NSOA4~+88N5PF-xUZBM9n1vu)2#C3q{yWFK={o1h zJX6jsfV-miVadGQDti=rE;gTQApsyvL;|O>%s#TpKD%9|Xt!ktATON$RN$lUnVAyi zu$_3sDq7Bb!Ou=|_3U&bu*)9mk{9#16#9pkQnPD{z+7wwlgC>rzf)X$0hv(68WX@& zDL#DZqAR>3u-|@oRam!8`@NOT$jorbE@#Vy>yfcX zY0DHU$>kHg_qnpMts1F?UgAxqk1*|ck7%O^_%%pm3-Z@Z`N_FY$&}~M}SPCg|2vsdYeZ| zczLYsFIl%KWw?d~a(m5Z^MuUhDRL3H*Svr{>;tsq&fW5zZWS2(xJ(Wumk)({?v6cf z{2Z=;uJEPY>Tu<|OFklsvcF=TvaPm3@YGh%ryjm@{m3{sKBtAV-^K^ivK_gpS;=z8 zjC+j-$m+b>SPZ!D?3`{cbdd{>{(c*a&h`woU9U{?$toLsuvnX|NkUdjRs;YyYcyKO=~i#4DWTGm#8Y+#t?`qp=U-+w1P#403h%OxwATj?{|x|yOPCUA001Y{ BczOT; literal 0 HcmV?d00001 From 022965b30939989e8805d12fe3b4e4981145be98 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Fri, 21 Jan 2022 17:29:25 +0100 Subject: [PATCH 07/80] integration of spliceType and blacklist annotation to results table --- DESCRIPTION | 8 +- NAMESPACE | 7 + R/FRASER-package.R | 5 +- R/resultAnnotations.R | 401 ++++++++++++++++++++++------------- man/spliceTypeAnnotations.Rd | 127 +++++++++++ src/RcppExports.cpp | 5 + 6 files changed, 398 insertions(+), 155 deletions(-) create mode 100644 man/spliceTypeAnnotations.Rd diff --git a/DESCRIPTION b/DESCRIPTION index a8ebe376..f5c8403f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -28,7 +28,7 @@ biocViews: License: MIT + file LICENSE URL: https://github.com/gagneurlab/FRASER BugRepots: https://github.com/gagneurlab/FRASER/issues -RoxygenNote: 7.1.1 +RoxygenNote: 7.1.2 Encoding: UTF-8 VignetteBuilder: knitr Depends: @@ -82,9 +82,10 @@ Suggests: covr, TxDb.Hsapiens.UCSC.hg19.knownGene, org.Hs.eg.db, + rtracklayer LinkingTo: - Rcpp, - RcppArmadillo + RcppArmadillo, + Rcpp Collate: variables.R getNSetterFuns.R @@ -114,3 +115,4 @@ Collate: fitCorrectionMethods.R plotMethods.R zzz.R + resultAnnotations.R diff --git a/NAMESPACE b/NAMESPACE index 35e2d66a..cd7b6b50 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,8 +22,10 @@ export(K) export(N) export(aberrant) export(addCountsToFraserDataSet) +export(annotateIntronReferenceOverlap) export(annotateRanges) export(annotateRangesWithTxDb) +export(annotateSpliceEventType) export(bamFile) export(bestQ) export(calculatePSIValues) @@ -44,6 +46,7 @@ export(filterExpression) export(filterExpressionAndVariability) export(filterVariability) export(fit) +export(flagBlacklistRegions) export(getNonSplitReadCountsForAllSamples) export(getSplitReadCountsForAllSamples) export(hyperParams) @@ -163,9 +166,12 @@ importFrom(GenomicAlignments,junctions) importFrom(GenomicAlignments,readGAlignmentPairs) importFrom(GenomicAlignments,readGAlignments) importFrom(GenomicAlignments,summarizeJunctions) +importFrom(GenomicFeatures,exons) +importFrom(GenomicFeatures,fiveUTRsByTranscript) importFrom(GenomicFeatures,genes) importFrom(GenomicFeatures,intronsByTranscript) importFrom(GenomicFeatures,makeTxDbFromGFF) +importFrom(GenomicFeatures,threeUTRsByTranscript) importFrom(GenomicRanges,GRanges) importFrom(GenomicRanges,GRangesList) importFrom(GenomicRanges,findOverlaps) @@ -301,6 +307,7 @@ importFrom(rhdf5,H5Pget_chunk) importFrom(rhdf5,H5Pget_layout) importFrom(rhdf5,h5delete) importFrom(rhdf5,h5ls) +importFrom(rtracklayer,import) importFrom(stats,cor) importFrom(stats,cutree) importFrom(stats,dbinom) diff --git a/R/FRASER-package.R b/R/FRASER-package.R index 3cecc264..843f45d0 100644 --- a/R/FRASER-package.R +++ b/R/FRASER-package.R @@ -22,7 +22,8 @@ #' ### GRange/Experiment/bamFile packages #' @importFrom BiocGenerics updateObject counts counts<- strand strand<- which -#' @importFrom GenomicFeatures makeTxDbFromGFF intronsByTranscript genes +#' @importFrom GenomicFeatures makeTxDbFromGFF intronsByTranscript genes exons +#' fiveUTRsByTranscript threeUTRsByTranscript #' @importFrom GenomicAlignments junctions readGAlignments summarizeJunctions #' readGAlignmentPairs #' @importFrom SummarizedExperiment assay assay<- assays assays<- assayNames @@ -40,7 +41,7 @@ #' #' @importFrom biomaRt useEnsembl getBM #' @importFrom AnnotationDbi select -#' +#' @importFrom rtracklayer import #' ### Plotting #' diff --git a/R/resultAnnotations.R b/R/resultAnnotations.R index ff9268c9..971069e9 100644 --- a/R/resultAnnotations.R +++ b/R/resultAnnotations.R @@ -1,27 +1,52 @@ #' -#' @title Result annotations +#' @title Additional result annotations #' #' @description These functions work on the result table and add additional -#' annotations to the reported junctions: the type of splice event (e.g. -#' exon skipping, intron retention, ...), as well as a flag for junctions -#' that are located in blacklist regions of the genome. +#' annotations to the reported introns: the type of splice event (e.g. +#' exon skipping, exon truncation, ...), expected occurence of frameshift, +#' overlap with UTR regions as well as a flag for introns that are +#' located in blacklist regions of the genome. +#' +#' \code{\link{annotateIntronReferenceOverlap}} adds basic annotations to the +#' fds for each intron based on the overlap of the intron's location with +#' the reference annotation. Has to be run before the result table is +#' created so that the new column can be included in it (see examples). #' -#' \code{\link{annotateSpliceEventType}} annotates each junction in the results -#' table with the type of splice event: exon skipping, +#' \code{\link{annotateSpliceEventType}} annotates each intron in the results +#' table with the type of splice event and expected occurence of frameshift +#' (likely, unlikely, inconclusive). Can also calculate overlap with +#' annotated UTR regions. Splice types can be one of: +#' annotatedIntron_increasedUsage, annotatedIntron_reducedUsage, +#' exonTruncation, exonElongation, exonTruncation&Elongation, +#' singleExonSkipping, exonSkipping, splicingBeyondGene, +#' multigenicSplicing, downstreamOfNearestGene, upstreamOfNearestGene, +#' complex (everything else). +#' Splice sites (theta metric) annotations indicate how the splice site is +#' located with respect to the reference annotation. The annotated types +#' are: annotatedSpliceSite, exonicRegion, intronicRegion. #' -#' \code{\link{flagBlacklistRegions}} flags junction in the results table on -#' whether or not they are located in a blacklist region of the genome. +#' \code{\link{flagBlacklistRegions}} flags introns in the results table on +#' whether or not they are located in a blacklist region of the genome. By +#' default, the blacklist regions as reported in \cite{...} are used. #' -#' @param result A result table as generated by FRASER. -#' @param fds The FraserDataSet, which was used to generate the results table +#' @param fds A FraserDataSet #' @param txdb A txdb object providing the reference annotation. -#' @params blacklist_regions BED file that contains the blacklist regions. -#' If \code{NULL} (default), the BED files that are packaged with FRASER are used. -#' @param assemblyVersion Indicates the genome assembly of the junction -#' coordinates. Only used if blacklist_regions is NULL. -#' @param BPPARAM For controlling parallelization behavior. -#' Defaults to \code{bpparam()}. -#' @return annotated results table +#' @param result A result table as generated by FRASER, including the column +#' \code{annotatedJunction} as generated by the function +#' \code{annotateIntronReferenceOverlap}. +#' @param addSpliceType Logical, indicating if the type of the splice event +#' should be added to the results table. Defaults to \code{TRUE}. +#' @param addUTRoverlap Logical, indicating if the overlap with UTR regions +#' should checked and added to the results table. Defaults to \code{TRUE}. +#' @param blacklist_regions A BED file that contains the blacklist regions. +#' If \code{NULL} (default), the BED files that are packaged with FRASER +#' are used. +#' @param assemblyVersion Indicates the genome assembly version of the intron +#' coordinates. Only used if blacklist_regions is NULL. For other versions, +#' please provide the BED file containing the blacklist regions directly. +#' @param BPPARAM For controlling parallelization behavior. Defaults to +#' \code{bpparam()}. +#' @return An annotated FraserDataSet or results table, respectively #' #' @name spliceTypeAnnotations #' @rdname spliceTypeAnnotations @@ -29,43 +54,125 @@ #' @examples #' # get data, fit and compute p-values and z-scores #' fds <- createTestFraserDataSet() -#' -#' # extract results: for this example dataset, z score cutoff of 2 is used to -#' # get at least one result -#' res <- results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05) -#' res_dt <- as.data.table(res) #' -#' # annotate type of splice event +#' # load reference annotation #' requireNamespace("TxDb.Hsapiens.UCSC.hg19.knownGene") #' txdb <- #' TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene -#' res_dt <- annotateSpliceEventType(fds, res_dt, txdb) +#' +#' # add basic annotations for overlap with the reference annotation +#' # run this function before creating the results table +#' fds <- annotateIntronReferenceOverlap(fds, txdb) +#' +#' # extract results: for this small example dataset, only a z score cutoff +#' # of 1 is used to get at least one result. +#' # Make sure to include the additional column in the results table +#' res <- results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=NA, +#' additionalColumns = 'annotatedJunction') +#' res_dt <- as.data.table(res) +#' +#' # annotate the type of splice event and UTR overlap +#' res_dt <- annotateSpliceEventType(result=res_dt, txdb=txdb) #' -#' # annotate blacklist regions -#' res_dt <- flagBlacklistRegions(res_dt, assemblyVersion="hg19") +#' # annotate overlap with blacklist regions +#' res_dt <- flagBlacklistRegions(result=res_dt, assemblyVersion="hg19") #' -#' # show annoated results table +#' # show results table containing additional annotations #' res_dt #' NULL +#' @describeIn spliceTypeAnnotations This method calculates basic annotations +#' based on overlap with the reference annotation (start, end, none, both) +#' for the full fds. The overlap type is added as a new column +#' \code{annotatedJunction} in \code{mcols(fds)}. +#' @export +annotateIntronReferenceOverlap <- function(fds, txdb, BPPARAM=bpparam()){ + print("loading introns") + #seqlevelsStyle(fds) <- seqlevelsStyle(txdb)[1] + introns <- unique(unlist(intronsByTranscript(txdb))) + # reduce the introns to only the actually expressed introns + fds_known <- fds[unique(to(findOverlaps(introns, + rowRanges(fds, type = "j"), type = "equal"))),] + anno_introns <- as.data.table(rowRanges(fds_known, + type="j"))[,.(seqnames, start, end, strand)] + + # calculate extra columns with mean/median intron expression count + # add the new columns + print("adding median count to introns") + sampleCounts <- K(fds_known, type = "j") + anno_introns[, meanCount := rowMeans(sampleCounts)] + anno_introns[, medianCount := rowMedians(as.matrix(sampleCounts))] + # order by medianCount (highest first) + setorderv(anno_introns, "medianCount", order=-1) + anno_introns_ranges <- makeGRangesFromDataFrame(anno_introns, + keep.extra.columns = TRUE) + + # get all fds junctions + fds_junctions <- rowRanges(fds, type = "j") + + # Do the annotation just for the intron with highest median expression + print("start calculating annotations") + overlaps <- findOverlaps(fds_junctions, anno_introns_ranges, select="first") + annotations <- bplapply(1:length(fds_junctions), function(i){ + # only select first intron as already ordered by medianCount beforehand + overlap <- overlaps[i] + if(is.na(overlap)) return("none") #no overlap with any intron + + hit_equal <- from(findOverlaps(fds_junctions[i], + anno_introns_ranges[overlap], + type="equal")) + if(length(hit_equal) > 0) return("both") + + hit_start <- from(findOverlaps(fds_junctions[i], + anno_introns_ranges[overlap], + type="start")) + if(length(hit_start) > 0) return("start") + hit_end <- from(findOverlaps(fds_junctions[i], + anno_introns_ranges[overlap], + type="end")) + if(length(hit_end) > 0) return("end") + + # overlaps but no start/end match + return("none") + }, BPPARAM=BPPARAM) + annotations <- unlist(annotations) + + rowRanges(fds)$annotatedJunction <- annotations + mcols(fds, type="ss")$annotatedJunction <- "not computed" + print("annotations done") + return(fds) +} + #' @describeIn spliceTypeAnnotations This method annotates the splice event #' type to junctions in the given results table. #' @export -annotateSpliceEventType <- function(result, fds, txdb, BPPARAM=bpparam()){ +annotateSpliceEventType <- function(result, txdb, addSpliceType=TRUE, + addUTRoverlap=TRUE, BPPARAM=bpparam()){ # Create basic annotation of overlap with reference - fds <- annotateJunctionReferenceOverlap(fds, txdb, BPPARAM) + if(!("annotatedJunction" %in% colnames(result))){ + stop("Column 'annotatedJunction' not found in the results table!\n", + "Please run fds <- annotateIntronReferenceOverlap(fds, txdb) ", + "first and add it \nto the results table with ", + "results(..., additionalColumns = 'annotatedJunction')\n", + "(see examples) before calling this function.") + } - # Calculate splice types and frameshift - result <- aberrantSpliceType(result, fds, txdb) + # convert to data.table if not already + if(!is.data.table(result)){ + result <- as.data.table(result) + } - # Add the subtypes for exonSkipping and inconclusive - result <- checkExonSkipping(result, txdb) - result <- checkInconclusive(result, txdb) + # Calculate splice types and frameshift + if(isTRUE(addSpliceType)){ + result <- addSpliceTypeLabels(result, fds, txdb) + } # Add UTR labels - result <- addUTRLabels(result, txdb) + if(isTRUE(addUTRoverlap)){ + result <- addUTRLabels(result, txdb) + } return(result) } @@ -76,6 +183,11 @@ annotateSpliceEventType <- function(result, fds, txdb, BPPARAM=bpparam()){ flagBlacklistRegions <- function(result, blacklist_regions=NULL, assemblyVersion=c('hg19', 'hg38')){ + # convert to data.table if not already + if(!is.data.table(result)){ + result <- as.data.table(result) + } + assemblyVersion <- match.arg(assemblyVersion) if(is.null(blacklist_regions)){ blacklist_regions <- @@ -89,7 +201,7 @@ flagBlacklistRegions <- function(result, blacklist_regions=NULL, } print("Importing blacklist regions ...") blacklist_gr <- import(blacklist_regions, format = "BED") - result <- addBlacklistLabels(result, blacklist_gr, "splicing") + result <- addBlacklistLabels(result, blacklist_gr) return(result) } @@ -100,9 +212,8 @@ flagBlacklistRegions <- function(result, blacklist_regions=NULL, addBlacklistLabels <- function(junctions_dt, blacklist_gr){ # add the blacklist information print("Set up aberrant splicing granges") - psi_positions <- which(junctions_dt$type != "theta") colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" - junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions]) + junctions_gr <- makeGRangesFromDataFrame(junctions_dt) # set to the same seqlevelsstyle seqlevelsStyle(blacklist_gr) <- seqlevelsStyle(junctions_gr) @@ -112,19 +223,18 @@ addBlacklistLabels <- function(junctions_dt, blacklist_gr){ black_hits <- unique(from(findOverlaps(junctions_gr, blacklist_gr))) junctions_dt[, blacklist := FALSE] - junctions_dt[psi_positions[black_hits], blacklist := TRUE] + junctions_dt[black_hits, blacklist := TRUE] colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND" print("blacklist labels done") return(junctions_dt) } -#' UTR annotation to results table +#' adds UTR overlap annotation to results table #' @noRd addUTRLabels <- function(junctions_dt, txdb){ - psi_positions <- which(junctions_dt$type != "theta") colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" - junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions]) + junctions_gr <- makeGRangesFromDataFrame(junctions_dt) seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr) ### UTR labels based on txdb file ### add 5' 3' UTR labels @@ -133,77 +243,19 @@ addUTRLabels <- function(junctions_dt, txdb){ threeUTRsByTranscript(txdb, use.names = TRUE)))) fives <- unique(from(findOverlaps(junctions_gr, fiveUTRsByTranscript(txdb, use.names = TRUE)))) - junctions_dt[psi_positions, UTR := "no"] - junctions_dt[psi_positions[threes], UTR := "3"] - junctions_dt[psi_positions[fives], UTR := "5"] + junctions_dt[, UTR_overlap := "no"] + junctions_dt[threes, UTR_overlap := "3'-UTR"] + junctions_dt[fives, UTR_overlap := "5'-UTR"] colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND" print("UTR labels done") return(junctions_dt) } -#' Basic annotations of overlap with reference (start, end, none, both) for -#' full fds. -#' @noRd -annotateJunctionReferenceOverlap <- function(fds, txdb, BPPARAM){ - print("loading introns") - #seqlevelsStyle(fds) <- seqlevelsStyle(txdb)[1] - introns <- unique(unlist(intronsByTranscript(txdb))) - # reduce the introns to only the actually expressed introns - fds_known <- fds[unique(to(findOverlaps(introns, - rowRanges(fds, type = "j"), type = "equal"))),] - anno_introns <- as.data.table(rowRanges(fds_known, - type="j"))[,.(seqnames, start, end, strand)] - - # calculate extra columns with mean/median intron expression count - # add the new columns - print("adding median count to introns") - sampleCounts <- K(fds_known, type = "j") - anno_introns[, meanCount := rowMeans(sampleCounts)] - anno_introns[, medianCount := rowMedians(as.matrix(sampleCounts))] - # order by medianCount (highest first) - setorderv(anno_introns, "medianCount", order=-1) - anno_introns_ranges <- makeGRangesFromDataFrame(anno_introns, - keep.extra.columns = TRUE) - - # get all fds junctions - fds_junctions <- rowRanges(fds, type = "j") - - # Do the annotation just for the intron with highest median expression - print("start calculating annotations") - overlaps <- findOverlaps(fds_junctions, anno_introns_ranges, select="first") - annotations <- bplapply(1:length(fds_junctions), function(i){ - # only select first intron as already ordered by medianCount beforehand - overlap <- overlaps[i] - if(is.na(overlap)) return("none") #no overlap with any intron - - hit_equal <- from(findOverlaps(fds_junctions[i], - anno_introns_ranges[overlap], - type="equal")) - if(length(hit_equal) > 0) return("both") - - hit_start <- from(findOverlaps(fds_junctions[i], - anno_introns_ranges[overlap], - type="start")) - if(length(hit_start) > 0) return("start") - hit_end <- from(findOverlaps(fds_junctions[i], - anno_introns_ranges[overlap], - type="end")) - if(length(hit_end) > 0) return("end") - - # overlaps but no start/end match - return("none") - }, BPPARAM=BPPARAM) - annotations <- unlist(annotations) - - rowRanges(fds)$annotatedJunction <- annotations - mcols(fds, type="ss")$annotatedJunction <- "none" - print("annotations done") - return(fds) -} -#' + +#' adds type of splicing to each intron in the results table #' @noRd -aberrantSpliceType <- function(junctions_dt, fds, txdb){ +addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ print("preparing ...") psi_positions <- which(junctions_dt$type != "theta") colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" @@ -229,12 +281,15 @@ aberrantSpliceType <- function(junctions_dt, fds, txdb){ keep.extra.columns = TRUE) # prepare the results column - junctions_dt[, aberrantSpliceType := "NA"] - junctions_dt[, causesFrameshift := "NA"] - junctions_dt[which(junctions_dt$annotatedJunction=="both"), - aberrantSpliceType := "sameIntron"] - junctions_dt[which(junctions_dt$annotatedJunction=="both"), - causesFrameshift := "unlikely"] + junctions_dt[, spliceType := "complex"] + junctions_dt[, causesFrameshift := "inconclusive"] + junctions_dt[annotatedJunction == "both" & deltaPsi >= 0, + spliceType := "annotatedIntron_increasedUsage"] + junctions_dt[annotatedJunction == "both" & deltaPsi < 0, + spliceType := "annotatedIntron_reducedUsage"] + junctions_dt[annotatedJunction == "both", causesFrameshift := "unlikely"] + + # TODO check for intron retention starts <- which(junctions_dt[psi_positions]$annotatedJunction=="start") ends <- which(junctions_dt[psi_positions]$annotatedJunction=="end") @@ -256,7 +311,7 @@ aberrantSpliceType <- function(junctions_dt, fds, txdb){ junctions_dt[psi_positions[starts], causesFrameshift:=start_results[2,]] junctions_dt[psi_positions[starts], - aberrantSpliceType := start_results[1,]] + spliceType := start_results[1,]] print("end junctions") end_results <- sapply(ends, function(i){ @@ -272,7 +327,7 @@ aberrantSpliceType <- function(junctions_dt, fds, txdb){ }) junctions_dt[psi_positions[ends], causesFrameshift:=end_results[2,]] - junctions_dt[psi_positions[ends], aberrantSpliceType := end_results[1,]] + junctions_dt[psi_positions[ends], spliceType := end_results[1,]] print("none junctions pt1") none_results <- sapply(nones, function(i){ @@ -318,15 +373,15 @@ aberrantSpliceType <- function(junctions_dt, fds, txdb){ frs = "likely" }else{ frs = "unlikely"} if( st[1] != en[1]){ - combined = "trunc, elong" + combined = "exonTruncation&Elongation" }else{combined = st[1]} return(c(combined,frs)) }) junctions_dt[psi_positions[nones], causesFrameshift:=none_results[2,]] - junctions_dt[psi_positions[nones], aberrantSpliceType := none_results[1,]] + junctions_dt[psi_positions[nones], spliceType := none_results[1,]] - noLaps <-which(junctions_dt[psi_positions]$aberrantSpliceType=="noOverlap") + noLaps <-which(junctions_dt[psi_positions]$spliceType=="noOverlap") refseq.genes<- genes(txdb) print("none junctions pt2") @@ -351,21 +406,65 @@ aberrantSpliceType <- function(junctions_dt, fds, txdb){ } } - return(c("inconclusive","inconclusive")) + return(c("complex","inconclusive")) }) junctions_dt[psi_positions[noLaps], causesFrameshift:=noLaps_results[2,]] junctions_dt[psi_positions[noLaps], - aberrantSpliceType := noLaps_results[1,]] + spliceType := noLaps_results[1,]] + + # theta annotations + print("Annotate theta junctions") + thetas <- which(junctions_dt$type == "theta") + junctions_gr <- makeGRangesFromDataFrame(junctions_dt[thetas,], + keep.extra.columns = TRUE) + + # specify default type for theta results as NA + junctions_dt[thetas, spliceType := NA] + junctions_dt[thetas, causesFrameshift := NA] + + # label all as intronic first if they have any intron overlap + intronic <- unique(from(findOverlaps(junctions_gr, introns_tmp))) + junctions_dt[thetas[intronic], spliceType := "intronicRegion"] + + # for exonic, check if theta is fully contained in an exon + # if one end is in an intron and the other in an exon it is a splice site + exonic <- unique(from(findOverlaps(junctions_gr, exons))) + within <- findOverlaps(junctions_gr, exons, type = "within") + all <- findOverlaps(junctions_gr, exons) + exonic_results <- sapply(exonic, function(i){ + w <- unique(to(within)[which(from(within) == i)]) + a <- unique(to(all)[which(from(all) == i)]) + if(length(a) == length(w)) return("exonicRegion") + return("annotatedSpliceSite") + }) + junctions_dt[thetas[exonic], spliceType := exonic_results] + # check cases that don't overlap with an exon/intron + print("nones") + nones <- which(is.na(junctions_dt[thetas,]$spliceType)) + none_results <- sapply(nones, function(i){ + if(length(findOverlaps(junctions_gr[i], refseq.genes)) > 0) return(NA) + #return("intergenic") + if(start(refseq.genes[nearest(junctions_gr[i], refseq.genes)]) > start(junctions_gr[i])){ + ifelse(strand(junctions_gr[i]) == "+", return("upstreamOfNearestGene"), return("downstreamOfNearestGene")) + }else{ + ifelse(strand(junctions_gr[i]) == "+", return("downstreamOfNearestGene"), return("upstreamOfNearestGene")) + } + }) + junctions_dt[thetas[nones], spliceType := none_results] + print("thetas done") - # add distance to closest neighbour gene for intergenic results + # add distance to closest neighbour gene for intergenic results + # (both psi and theta) print("adding distances to nearest gene") - up <- which(junctions_dt[psi_positions]$aberrantSpliceType == "upstream") - down <- which(junctions_dt[psi_positions - ]$aberrantSpliceType == "downstream") - print("Calculate distances") + up <- which(junctions_dt$spliceType == "upstreamOfNearestGene") + down <- which(junctions_dt$spliceType == "downstreamOfNearestGene") + # create full grange object containing psi and theta + junctions_gr <- makeGRangesFromDataFrame(junctions_dt, keep.extra.columns = T) + + print("Calculate distances") if(length(up) > 0){ distanceNearestGene_up <- sapply(up, function(i){ min(distance(junctions_gr[i], refseq.genes), na.rm = T)}) @@ -392,6 +491,11 @@ aberrantSpliceType <- function(junctions_dt, fds, txdb){ colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND" print("done calculating aberrant splice types") + + # Add the subtypes for exonSkipping and inconclusive + junctions_dt <- checkExonSkipping(junctions_dt, txdb) + junctions_dt <- checkInconclusive(junctions_dt, txdb) + return(junctions_dt) } @@ -660,18 +764,15 @@ checkIntergenic <- function(junctions_gr, i, refseq.genes){ if(start(refseq.genes[nearest(junctions_gr[i], refseq.genes)]) > start){ ifelse(strand(junctions_gr[i]) == "+", - return(c("upstream", "unlikely")), - return(c("downstream", "unlikely"))) + return(c("upstreamOfNearestGene", "unlikely")), + return(c("downstreamOfNearestGene", "unlikely"))) }else{ ifelse(strand(junctions_gr[i]) == "+", - return(c("downstream", "unlikely")), - return(c("upstream", "unlikely"))) + return(c("downstreamOfNearestGene", "unlikely")), + return(c("upstreamOfNearestGene", "unlikely"))) } } - return(c("inconclusive", "inconclusive")) - - # if both lists == 0 return intergenic else inconclusive - # return(c("inconclusive", "inconclusive")) + return(c("complex", "inconclusive")) } #' @@ -685,7 +786,7 @@ checkExonSkipping <- function(junctions_dt, txdb){ refseq.genes<- genes(txdb) - exonSkip <- which(junctions_dt[psi_positions]$aberrantSpliceType %in% + exonSkip <- which(junctions_dt[psi_positions]$spliceType %in% c("exonSkipping", "singleExonSkipping")) print("start checking exonSkipping") @@ -714,7 +815,7 @@ checkExonSkipping <- function(junctions_dt, txdb){ # check for which genes distance to start is 0 start_genes <- which(distance(test_start, refseq.genes) == 0) # start is not in a gene - if(length(start_genes) == 0) return("beyondGene") + if(length(start_genes) == 0) return("splicingBeyondGene") # start is in a gene -> is end in same gene for(to in start_genes){ @@ -726,24 +827,24 @@ checkExonSkipping <- function(junctions_dt, txdb){ end_genes <- which(distance(test_end, refseq.genes) == 0) # end is not in a gene - if(length(end_genes) == 0) return("beyondGene") + if(length(end_genes) == 0) return("splicingBeyondGene") # end is in a different gene - return("multigenic") + return("multigenicSplicing") }) print("checking exonSkipping done") if(length(exonSkip) > 0){ junctions_dt[psi_positions[exonSkip], - aberrantSpliceType2 := newSkip_results] - junctions_dt[which(junctions_dt$aberrantSpliceType2 == "beyondGene"), - aberrantSpliceType := "beyondGene"] - junctions_dt[which(junctions_dt$aberrantSpliceType2 == "beyondGene"), + spliceType2 := newSkip_results] + junctions_dt[spliceType2 == "splicingBeyondGene", + spliceType := "splicingBeyondGene"] + junctions_dt[spliceType2 == "splicingBeyondGene", causesFrameshift := "inconclusive"] - junctions_dt[which(junctions_dt$aberrantSpliceType2 == "multigenic"), - aberrantSpliceType := "multigenic"] - junctions_dt[which(junctions_dt$aberrantSpliceType2 == "multigenic"), + junctions_dt[spliceType2 == "multigenicSplicing", + spliceType := "multigenicSplicing"] + junctions_dt[spliceType2 == "multigenicSplicing", causesFrameshift := "inconclusive"] - junctions_dt[, aberrantSpliceType2 := NULL] + junctions_dt[, spliceType2 := NULL] } colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" @@ -762,7 +863,7 @@ checkInconclusive <- function(junctions_dt, txdb){ refseq.genes<- genes(txdb) inconclusive <- which(junctions_dt[psi_positions - ]$aberrantSpliceType == "inconclusive") + ]$spliceType == "complex") print("start checking inconclusive") inconclusive_results <- sapply(inconclusive, function(i){ @@ -790,28 +891,28 @@ checkInconclusive <- function(junctions_dt, txdb){ # check for which genes distance to start is 0 start_genes <- which(distance(test_start, refseq.genes) == 0) # start is not in a gene - if(length(start_genes) == 0) return("beyondGene") + if(length(start_genes) == 0) return("splicingBeyondGene") # start is in a gene -> is end in same gene for(to in start_genes){ # end is in same gene if(distance(test_end, refseq.genes[to]) == 0){ - return("inconclusive") + return("complex") } } end_genes <- which(distance(test_end, refseq.genes) == 0) # end is not in a gene - if(length(end_genes) == 0) return("beyondGene") + if(length(end_genes) == 0) return("splicingBeyondGene") # end is in a different gene - return("multigenic") + return("multigenicSplicing") }) colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND" print("done checking inconclusive") if(length(inconclusive) > 0){ junctions_dt[psi_positions[inconclusive], - aberrantSpliceType := inconclusive_results] + spliceType := inconclusive_results] } return(junctions_dt) diff --git a/man/spliceTypeAnnotations.Rd b/man/spliceTypeAnnotations.Rd new file mode 100644 index 00000000..17f8332b --- /dev/null +++ b/man/spliceTypeAnnotations.Rd @@ -0,0 +1,127 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/resultAnnotations.R +\name{spliceTypeAnnotations} +\alias{spliceTypeAnnotations} +\alias{annotateIntronReferenceOverlap} +\alias{annotateSpliceEventType} +\alias{flagBlacklistRegions} +\title{Additional result annotations} +\usage{ +annotateIntronReferenceOverlap(fds, txdb, BPPARAM = bpparam()) + +annotateSpliceEventType( + result, + txdb, + addSpliceType = TRUE, + addUTRoverlap = TRUE, + BPPARAM = bpparam() +) + +flagBlacklistRegions( + result, + blacklist_regions = NULL, + assemblyVersion = c("hg19", "hg38") +) +} +\arguments{ +\item{fds}{A FraserDataSet} + +\item{txdb}{A txdb object providing the reference annotation.} + +\item{BPPARAM}{For controlling parallelization behavior. Defaults to +\code{bpparam()}.} + +\item{result}{A result table as generated by FRASER, including the column +\code{annotatedJunction} as generated by the function +\code{annotateIntronReferenceOverlap}.} + +\item{addSpliceType}{Logical, indicating if the type of the splice event +should be added to the results table. Defaults to \code{TRUE}.} + +\item{addUTRoverlap}{Logical, indicating if the overlap with UTR regions +should checked and added to the results table. Defaults to \code{TRUE}.} + +\item{blacklist_regions}{A BED file that contains the blacklist regions. +If \code{NULL} (default), the BED files that are packaged with FRASER +are used.} + +\item{assemblyVersion}{Indicates the genome assembly version of the intron +coordinates. Only used if blacklist_regions is NULL. For other versions, +please provide the BED file containing the blacklist regions directly.} +} +\value{ +An annotated FraserDataSet or results table, respectively +} +\description{ +These functions work on the result table and add additional + annotations to the reported introns: the type of splice event (e.g. + exon skipping, exon truncation, ...), expected occurence of frameshift, + overlap with UTR regions as well as a flag for introns that are + located in blacklist regions of the genome. + +\code{\link{annotateIntronReferenceOverlap}} adds basic annotations to the + fds for each intron based on the overlap of the intron's location with + the reference annotation. Has to be run before the result table is + created so that the new column can be included in it (see examples). + +\code{\link{annotateSpliceEventType}} annotates each intron in the results + table with the type of splice event and expected occurence of frameshift + (likely, unlikely, inconclusive). Can also calculate overlap with + annotated UTR regions. Splice types can be one of: + annotatedIntron_increasedUsage, annotatedIntron_reducedUsage, + exonTruncation, exonElongation, exonTruncation&Elongation, + singleExonSkipping, exonSkipping, splicingBeyondGene, + multigenicSplicing, downstreamOfNearestGene, upstreamOfNearestGene, + complex (everything else). + Splice sites (theta metric) annotations indicate how the splice site is + located with respect to the reference annotation. The annotated types + are: annotatedSpliceSite, exonicRegion, intronicRegion. + +\code{\link{flagBlacklistRegions}} flags introns in the results table on + whether or not they are located in a blacklist region of the genome. By + default, the blacklist regions as reported in \cite{...} are used. +} +\section{Functions}{ +\itemize{ +\item \code{annotateIntronReferenceOverlap}: This method calculates basic annotations +based on overlap with the reference annotation (start, end, none, both) +for the full fds. The overlap type is added as a new column +\code{annotatedJunction} in \code{mcols(fds)}. + +\item \code{annotateSpliceEventType}: This method annotates the splice event +type to junctions in the given results table. + +\item \code{flagBlacklistRegions}: This method flags blacklist regions in +the given results table. +}} + +\examples{ + # get data, fit and compute p-values and z-scores + fds <- createTestFraserDataSet() + + # load reference annotation + requireNamespace("TxDb.Hsapiens.UCSC.hg19.knownGene") + txdb <- + TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene + + # add basic annotations for overlap with the reference annotation + # run this function before creating the results table + fds <- annotateIntronReferenceOverlap(fds, txdb) + + # extract results: for this small example dataset, only a z score cutoff + # of 1 is used to get at least one result. + # Make sure to include the additional column in the results table + res <- results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=NA, + additionalColumns = 'annotatedJunction') + res_dt <- as.data.table(res) + + # annotate the type of splice event and UTR overlap + res_dt <- annotateSpliceEventType(result=res_dt, txdb=txdb) + + # annotate overlap with blacklist regions + res_dt <- flagBlacklistRegions(result=res_dt, assemblyVersion="hg19") + + # show results table containing additional annotations + res_dt + +} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 1a7c62e3..461280c0 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -6,6 +6,11 @@ using namespace Rcpp; +#ifdef RCPP_USE_GLOBAL_ROSTREAM +Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); +Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); +#endif + // setPseudoCount double setPseudoCount(double pseudoCount); RcppExport SEXP _FRASER_setPseudoCount(SEXP pseudoCountSEXP) { From 49fd0687d151bf3a265523046f32af92d3c35f54 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Tue, 25 Jan 2022 10:46:58 +0100 Subject: [PATCH 08/80] cleanup for bioccheck --- NAMESPACE | 4 +++- R/FRASER-package.R | 10 ++++---- R/resultAnnotations.R | 46 ++++++++++++++++++++---------------- man/spliceTypeAnnotations.Rd | 3 ++- 4 files changed, 37 insertions(+), 26 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index cd7b6b50..96deb2b5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -184,7 +184,9 @@ importFrom(HDF5Array,path) importFrom(HDF5Array,saveHDF5SummarizedExperiment) importFrom(HDF5Array,writeHDF5Array) importFrom(IRanges,IRanges) +importFrom(IRanges,distance) importFrom(IRanges,from) +importFrom(IRanges,nearest) importFrom(IRanges,ranges) importFrom(IRanges,subsetByOverlaps) importFrom(IRanges,to) @@ -213,6 +215,7 @@ importFrom(S4Vectors,"metadata<-") importFrom(S4Vectors,DataFrame) importFrom(S4Vectors,Rle) importFrom(S4Vectors,SimpleList) +importFrom(S4Vectors,elementMetadata) importFrom(S4Vectors,end) importFrom(S4Vectors,mcols) importFrom(S4Vectors,metadata) @@ -307,7 +310,6 @@ importFrom(rhdf5,H5Pget_chunk) importFrom(rhdf5,H5Pget_layout) importFrom(rhdf5,h5delete) importFrom(rhdf5,h5ls) -importFrom(rtracklayer,import) importFrom(stats,cor) importFrom(stats,cutree) importFrom(stats,dbinom) diff --git a/R/FRASER-package.R b/R/FRASER-package.R index 843f45d0..f01fbc3f 100644 --- a/R/FRASER-package.R +++ b/R/FRASER-package.R @@ -31,7 +31,7 @@ #' rbind Assays #' @importFrom GenomicRanges findOverlaps granges GRanges GRangesList #' makeGRangesFromDataFrame invertStrand -#' @importFrom IRanges subsetByOverlaps from to IRanges ranges +#' @importFrom IRanges subsetByOverlaps from to IRanges ranges nearest distance #' @importFrom Rsamtools ScanBamParam scanBamHeader bamMapqFilter #' bamWhich bamWhich<- BamFile idxstatsBam #' @importFrom Rsubread featureCounts @@ -41,7 +41,6 @@ #' #' @importFrom biomaRt useEnsembl getBM #' @importFrom AnnotationDbi select -#' @importFrom rtracklayer import #' ### Plotting #' @@ -82,7 +81,7 @@ ### To be added into the functions above #' #' @importFrom S4Vectors DataFrame metadata Rle SimpleList mcols mcols<- -#' start end metadata metadata<- subjectHits queryHits +#' start end metadata metadata<- subjectHits queryHits elementMetadata #' @importFrom grDevices colorRampPalette #' @importFrom GenomeInfoDb keepStandardChromosomes seqlevels<- seqlevels #' seqlengths seqlengths<- seqlevelsStyle<- seqlevelsStyle seqnames @@ -129,5 +128,8 @@ globalVariables(c(".", "J", ".N", ".asDataFrame", "End", "first_feature", "model", "mu", "n", ",nsubset", "o3", "o5", "obsPsi", "os", "pa", "padj", "passed", "pByFeature", "pointNr", "predPsi", "psi3", "psi5", "psiType", "psiValue", "seqlength", "seqlevel", "Step", "traceNr", - "uniqueID", "V1", "value", "zscore", "maxDTheta"), + "uniqueID", "V1", "value", "zscore", "maxDTheta", + "blacklist", "spliceType", "causesFrameshift", "annotatedJunction", + "distNearestGene", "UTR_overlap", "meanCount", "medianCount", + "spliceType2"), package="FRASER") diff --git a/R/resultAnnotations.R b/R/resultAnnotations.R index 971069e9..87295444 100644 --- a/R/resultAnnotations.R +++ b/R/resultAnnotations.R @@ -72,7 +72,7 @@ #' res_dt <- as.data.table(res) #' #' # annotate the type of splice event and UTR overlap -#' res_dt <- annotateSpliceEventType(result=res_dt, txdb=txdb) +#' res_dt <- annotateSpliceEventType(result=res_dt, txdb=txdb, fds=fds) #' #' # annotate overlap with blacklist regions #' res_dt <- flagBlacklistRegions(result=res_dt, assemblyVersion="hg19") @@ -114,7 +114,7 @@ annotateIntronReferenceOverlap <- function(fds, txdb, BPPARAM=bpparam()){ # Do the annotation just for the intron with highest median expression print("start calculating annotations") overlaps <- findOverlaps(fds_junctions, anno_introns_ranges, select="first") - annotations <- bplapply(1:length(fds_junctions), function(i){ + annotations <- bplapply(seq_len(length(fds_junctions)), function(i){ # only select first intron as already ordered by medianCount beforehand overlap <- overlaps[i] if(is.na(overlap)) return("none") #no overlap with any intron @@ -147,7 +147,7 @@ annotateIntronReferenceOverlap <- function(fds, txdb, BPPARAM=bpparam()){ #' @describeIn spliceTypeAnnotations This method annotates the splice event #' type to junctions in the given results table. #' @export -annotateSpliceEventType <- function(result, txdb, addSpliceType=TRUE, +annotateSpliceEventType <- function(result, txdb, fds, addSpliceType=TRUE, addUTRoverlap=TRUE, BPPARAM=bpparam()){ # Create basic annotation of overlap with reference @@ -200,7 +200,7 @@ flagBlacklistRegions <- function(result, blacklist_regions=NULL, blacklist_regions) } print("Importing blacklist regions ...") - blacklist_gr <- import(blacklist_regions, format = "BED") + blacklist_gr <- rtracklayer::import(blacklist_regions, format = "BED") result <- addBlacklistLabels(result, blacklist_gr) return(result) } @@ -260,7 +260,7 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ psi_positions <- which(junctions_dt$type != "theta") colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions], - keep.extra.columns = T) + keep.extra.columns = TRUE) seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr) introns_tmp <- unique(unlist(intronsByTranscript(txdb))) @@ -286,7 +286,7 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ junctions_dt[annotatedJunction == "both" & deltaPsi >= 0, spliceType := "annotatedIntron_increasedUsage"] junctions_dt[annotatedJunction == "both" & deltaPsi < 0, - spliceType := "annotatedIntron_reducedUsage"] + spliceType := "annotatedIntron_reducedUsage"] junctions_dt[annotatedJunction == "both", causesFrameshift := "unlikely"] # TODO check for intron retention @@ -342,10 +342,10 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ }) maxExpr <- which.max(expre) - # returns type of exon splicing, frameshift T/F, amount of shift - st = compareStarts(junctions_gr, i, overlap[maxExpr], T, + # returns type of exon splicing, frameshift TRUE/FALSE, amount of shift + st = compareStarts(junctions_gr, i, overlap[maxExpr], TRUE, intron_ranges, exons) - en = compareEnds(junctions_gr, i, overlap[maxExpr], T, + en = compareEnds(junctions_gr, i, overlap[maxExpr], TRUE, intron_ranges, exons) # merge, start and end results @@ -355,7 +355,7 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ # if one is notYet -> return notYet if((st[1] == "singleExonSkipping" & !(en[1] %in% c("singleExonSkipping", "exonSkipping"))) || - (en[1] == "singleExonSkipping" & !(st[1] %in% + (en[1] == "singleExonSkipping" & !(st[1] %in% c("singleExonSkipping", "exonSkipping")))){ ## only one is single exonSkipping, the other is trunc/elong if((as.integer(st[3])+as.integer(en[3])) %% 3 != 0){ @@ -446,10 +446,15 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ none_results <- sapply(nones, function(i){ if(length(findOverlaps(junctions_gr[i], refseq.genes)) > 0) return(NA) #return("intergenic") - if(start(refseq.genes[nearest(junctions_gr[i], refseq.genes)]) > start(junctions_gr[i])){ - ifelse(strand(junctions_gr[i]) == "+", return("upstreamOfNearestGene"), return("downstreamOfNearestGene")) + if(start(refseq.genes[nearest(junctions_gr[i], + refseq.genes)]) > start(junctions_gr[i])){ + ifelse(strand(junctions_gr[i]) == "+", + return("upstreamOfNearestGene"), + return("downstreamOfNearestGene")) }else{ - ifelse(strand(junctions_gr[i]) == "+", return("downstreamOfNearestGene"), return("upstreamOfNearestGene")) + ifelse(strand(junctions_gr[i]) == "+", + return("downstreamOfNearestGene"), + return("upstreamOfNearestGene")) } }) junctions_dt[thetas[nones], spliceType := none_results] @@ -462,12 +467,13 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ down <- which(junctions_dt$spliceType == "downstreamOfNearestGene") # create full grange object containing psi and theta - junctions_gr <- makeGRangesFromDataFrame(junctions_dt, keep.extra.columns = T) + junctions_gr <- makeGRangesFromDataFrame(junctions_dt, + keep.extra.columns = TRUE) print("Calculate distances") if(length(up) > 0){ distanceNearestGene_up <- sapply(up, function(i){ - min(distance(junctions_gr[i], refseq.genes), na.rm = T)}) + min(distance(junctions_gr[i], refseq.genes), na.rm = TRUE)}) if(length(distanceNearestGene_up > 0)){ junctions_dt[psi_positions[up], distNearestGene := distanceNearestGene_up] @@ -479,7 +485,7 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ if(length(down) > 0){ distanceNearestGene_down <- sapply(down, function(i){ - min(distance(junctions_gr[i], refseq.genes), na.rm = T)}) + min(distance(junctions_gr[i], refseq.genes), na.rm = TRUE)}) if(length(distanceNearestGene_down > 0)){ junctions_dt[psi_positions[down], distNearestGene := distanceNearestGene_down] @@ -593,7 +599,7 @@ compareStarts <- function(junctions_gr, i, max_lap, shift_needed, # elongation shift = (-1)*(end(exons[exonChoices[1]]) - - start(exons[exonChoices[1]]) + 1) + + start(exons[exonChoices[1]]) + 1) + ss_start - start(intron_ranges[secItrChoices[maxExpr]]) frs = ifelse(shift %% 3 == 0,"unlikely","likely") @@ -753,7 +759,7 @@ checkIntergenic <- function(junctions_gr, i, refseq.genes){ # check if distance to nearest is > 1000 -> intergenic # otherwise up/downstream - dist = min(distance(test_junction, refseq.genes), na.rm = T) + dist = min(distance(test_junction, refseq.genes), na.rm = TRUE) if(dist > 0){ # if(dist > 1000){ # print("intergenic") @@ -781,7 +787,7 @@ checkExonSkipping <- function(junctions_dt, txdb){ psi_positions <- which(junctions_dt$type != "theta") colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions], - keep.extra.columns = T) + keep.extra.columns = TRUE) seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr) refseq.genes<- genes(txdb) @@ -857,7 +863,7 @@ checkInconclusive <- function(junctions_dt, txdb){ psi_positions <- which(junctions_dt$type != "theta") colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions], - keep.extra.columns = T) + keep.extra.columns = TRUE) seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr) refseq.genes<- genes(txdb) diff --git a/man/spliceTypeAnnotations.Rd b/man/spliceTypeAnnotations.Rd index 17f8332b..17fea92c 100644 --- a/man/spliceTypeAnnotations.Rd +++ b/man/spliceTypeAnnotations.Rd @@ -12,6 +12,7 @@ annotateIntronReferenceOverlap(fds, txdb, BPPARAM = bpparam()) annotateSpliceEventType( result, txdb, + fds, addSpliceType = TRUE, addUTRoverlap = TRUE, BPPARAM = bpparam() @@ -116,7 +117,7 @@ the given results table. res_dt <- as.data.table(res) # annotate the type of splice event and UTR overlap - res_dt <- annotateSpliceEventType(result=res_dt, txdb=txdb) + res_dt <- annotateSpliceEventType(result=res_dt, txdb=txdb, fds=fds) # annotate overlap with blacklist regions res_dt <- flagBlacklistRegions(result=res_dt, assemblyVersion="hg19") From e893f2cd2ced9071df2a6ef24741d4535b05c891 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Tue, 25 Jan 2022 23:08:29 +0100 Subject: [PATCH 09/80] cleanup --- DESCRIPTION | 1 + R/AllGenerics.R | 5 + R/resultAnnotations.R | 191 ++++++++++++++++++++--------------- man/spliceTypeAnnotations.Rd | 37 ++++--- 4 files changed, 140 insertions(+), 94 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index f5c8403f..8273914a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -7,6 +7,7 @@ Authors@R: c( person("Christian", "Mertes", role=c("aut", "cre"), email="mertes@in.tum.de"), person("Ines", "Scheller", role=c("aut"), email="scheller@in.tum.de"), + person("Karoline", "Lutz", role=c("aut")) person("Vicente", "Yepez", role=c("ctb"), email="yepez@in.tum.de"), person("Julien", "Gagneur", role=c("aut"), email="gagneur@in.tum.de")) Description: Detection of rare aberrant splicing events in transcriptome diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 42986760..68ed8f38 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -654,6 +654,11 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, stopifnot(is(object, "FraserDataSet")) stopifnot(all(sampleIDs %in% samples(object))) + + if("annotatedJunction" %in% colnames(mcols(fds, type="j")) && + !("annotatedJunction" %in% additionalColumns)){ + additionalColumns <- c(additionalColumns, "annotatedJunction") + } resultsls <- bplapply(psiType, BPPARAM=BPPARAM, function(type){ message(date(), ": Collecting results for: ", type) diff --git a/R/resultAnnotations.R b/R/resultAnnotations.R index 87295444..6ed35e65 100644 --- a/R/resultAnnotations.R +++ b/R/resultAnnotations.R @@ -27,7 +27,10 @@ #' #' \code{\link{flagBlacklistRegions}} flags introns in the results table on #' whether or not they are located in a blacklist region of the genome. By -#' default, the blacklist regions as reported in \cite{...} are used. +#' default, the blacklist regions as reported in +#' \cite{Amemiya, Kundaje & Boyle (2019)} and downloaded from +#' \href{https://www.encodeproject.org/annotations/ENCSR636HFF/}{here} +#' are used. #' #' @param fds A FraserDataSet #' @param txdb A txdb object providing the reference annotation. @@ -38,9 +41,12 @@ #' should be added to the results table. Defaults to \code{TRUE}. #' @param addUTRoverlap Logical, indicating if the overlap with UTR regions #' should checked and added to the results table. Defaults to \code{TRUE}. +#' @param minoverlap Integer value defining the number of base pairs around the +#' splice site that need to overlap with UTR or blacklist region, +#' respectivly, to be considered matching. Defaults to 5 bp. #' @param blacklist_regions A BED file that contains the blacklist regions. #' If \code{NULL} (default), the BED files that are packaged with FRASER -#' are used. +#' are used (see Details for more information). #' @param assemblyVersion Indicates the genome assembly version of the intron #' coordinates. Only used if blacklist_regions is NULL. For other versions, #' please provide the BED file containing the blacklist regions directly. @@ -56,9 +62,8 @@ #' fds <- createTestFraserDataSet() #' #' # load reference annotation -#' requireNamespace("TxDb.Hsapiens.UCSC.hg19.knownGene") -#' txdb <- -#' TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene +#' library(TxDb.Hsapiens.UCSC.hg19.knownGene) +#' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene #' #' # add basic annotations for overlap with the reference annotation #' # run this function before creating the results table @@ -66,19 +71,16 @@ #' #' # extract results: for this small example dataset, only a z score cutoff #' # of 1 is used to get at least one result. -#' # Make sure to include the additional column in the results table -#' res <- results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=NA, -#' additionalColumns = 'annotatedJunction') -#' res_dt <- as.data.table(res) +#' res <- results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=NA) #' #' # annotate the type of splice event and UTR overlap -#' res_dt <- annotateSpliceEventType(result=res_dt, txdb=txdb, fds=fds) +#' res <- annotateSpliceEventType(result=res, txdb=txdb, fds=fds) #' #' # annotate overlap with blacklist regions -#' res_dt <- flagBlacklistRegions(result=res_dt, assemblyVersion="hg19") +#' res <- flagBlacklistRegions(result=res, assemblyVersion="hg19") #' #' # show results table containing additional annotations -#' res_dt +#' res #' NULL @@ -88,7 +90,7 @@ NULL #' \code{annotatedJunction} in \code{mcols(fds)}. #' @export annotateIntronReferenceOverlap <- function(fds, txdb, BPPARAM=bpparam()){ - print("loading introns") + message("loading introns ...") #seqlevelsStyle(fds) <- seqlevelsStyle(txdb)[1] introns <- unique(unlist(intronsByTranscript(txdb))) # reduce the introns to only the actually expressed introns @@ -99,10 +101,9 @@ annotateIntronReferenceOverlap <- function(fds, txdb, BPPARAM=bpparam()){ # calculate extra columns with mean/median intron expression count # add the new columns - print("adding median count to introns") - sampleCounts <- K(fds_known, type = "j") + sampleCounts <- as.matrix(K(fds_known, type = "j")) anno_introns[, meanCount := rowMeans(sampleCounts)] - anno_introns[, medianCount := rowMedians(as.matrix(sampleCounts))] + anno_introns[, medianCount := rowMedians(sampleCounts)] # order by medianCount (highest first) setorderv(anno_introns, "medianCount", order=-1) anno_introns_ranges <- makeGRangesFromDataFrame(anno_introns, @@ -112,9 +113,10 @@ annotateIntronReferenceOverlap <- function(fds, txdb, BPPARAM=bpparam()){ fds_junctions <- rowRanges(fds, type = "j") # Do the annotation just for the intron with highest median expression - print("start calculating annotations") + message("start calculating basic annotations ...") overlaps <- findOverlaps(fds_junctions, anno_introns_ranges, select="first") - annotations <- bplapply(seq_len(length(fds_junctions)), function(i){ + annotations <- bplapply(seq_len(length(fds_junctions)), + function(i, overlaps, fds_junctions, anno_introns_ranges){ # only select first intron as already ordered by medianCount beforehand overlap <- overlaps[i] if(is.na(overlap)) return("none") #no overlap with any intron @@ -135,12 +137,13 @@ annotateIntronReferenceOverlap <- function(fds, txdb, BPPARAM=bpparam()){ # overlaps but no start/end match return("none") - }, BPPARAM=BPPARAM) + }, overlaps=overlaps, fds_junctions=fds_junctions, + anno_introns_ranges=anno_introns_ranges, BPPARAM=BPPARAM) annotations <- unlist(annotations) rowRanges(fds)$annotatedJunction <- annotations mcols(fds, type="ss")$annotatedJunction <- "not computed" - print("annotations done") + message("basic annotations done") return(fds) } @@ -148,44 +151,56 @@ annotateIntronReferenceOverlap <- function(fds, txdb, BPPARAM=bpparam()){ #' type to junctions in the given results table. #' @export annotateSpliceEventType <- function(result, txdb, fds, addSpliceType=TRUE, - addUTRoverlap=TRUE, BPPARAM=bpparam()){ - - # Create basic annotation of overlap with reference - if(!("annotatedJunction" %in% colnames(result))){ - stop("Column 'annotatedJunction' not found in the results table!\n", - "Please run fds <- annotateIntronReferenceOverlap(fds, txdb) ", - "first and add it \nto the results table with ", - "results(..., additionalColumns = 'annotatedJunction')\n", - "(see examples) before calling this function.") - } + addUTRoverlap=TRUE, minoverlap=5, + BPPARAM=bpparam()){ # convert to data.table if not already if(!is.data.table(result)){ - result <- as.data.table(result) + annoResult <- as.data.table(result) + } else{ + annoResult <- result + } + + # Create basic annotation of overlap with reference + if(!("annotatedJunction" %in% colnames(annoResult))){ + stop("Column 'annotatedJunction' not found in the results table!\n", + "Please run 'fds <- annotateIntronReferenceOverlap(fds, txdb)' ", + "first and then extract \nthe results table using the ", + "'results(fds, ...)' function before calling this function.") } # Calculate splice types and frameshift if(isTRUE(addSpliceType)){ - result <- addSpliceTypeLabels(result, fds, txdb) + annoResult <- addSpliceTypeLabels(annoResult, fds, txdb) } # Add UTR labels if(isTRUE(addUTRoverlap)){ - result <- addUTRLabels(result, txdb) + annoResult <- addUTRLabels(annoResult, txdb) } - return(result) + if(is(result, "GenomicRanges")){ + annoResult <- makeGRangesFromDataFrame(annoResult, + keep.extra.columns=TRUE) + } + + return(annoResult) } -#' @describeIn spliceTypeAnnotations This method flags blacklist regions in -#' the given results table. +#' @describeIn spliceTypeAnnotations This method flags all introns and +#' splice sites in the given results table for which at least one splice +#' site (donor or acceptor) is located in a blacklist region. Blacklist +#' regions of the genome are determined from the provided BED file. #' @export flagBlacklistRegions <- function(result, blacklist_regions=NULL, - assemblyVersion=c('hg19', 'hg38')){ + assemblyVersion=c('hg19', 'hg38'), + minoverlap=5){ # convert to data.table if not already if(!is.data.table(result)){ - result <- as.data.table(result) + annoResult <- as.data.table(result) + } else{ + annoResult <- result } assemblyVersion <- match.arg(assemblyVersion) @@ -199,55 +214,82 @@ flagBlacklistRegions <- function(result, blacklist_regions=NULL, stop("BED file with blacklist regions does not exist: ", blacklist_regions) } - print("Importing blacklist regions ...") + message("Importing blacklist regions ...") blacklist_gr <- rtracklayer::import(blacklist_regions, format = "BED") - result <- addBlacklistLabels(result, blacklist_gr) - return(result) + annoResult <- addBlacklistLabels(annoResult, blacklist_gr) + + if(is(result, "GenomicRanges")){ + annoResult <- makeGRangesFromDataFrame(annoResult, + keep.extra.columns=TRUE) + } + + return(annoResult) } ############# helper functions ############################## #' blacklist annotation for aberrant splicing events #' @noRd -addBlacklistLabels <- function(junctions_dt, blacklist_gr){ +addBlacklistLabels <- function(junctions_dt, blacklist_gr, minoverlap=5){ # add the blacklist information - print("Set up aberrant splicing granges") colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" junctions_gr <- makeGRangesFromDataFrame(junctions_dt) + # get gr with start/end positions of each intron + gr_start_ss <- junctions_gr + end(gr_start_ss) <- start(gr_start_ss) + minoverlap - 1 + start(gr_start_ss) <- start(gr_start_ss) - minoverlap + gr_end_ss <- junctions_gr + start(gr_end_ss) <- end(gr_end_ss) - minoverlap + 1 + end(gr_end_ss) <- end(gr_end_ss) + minoverlap + # set to the same seqlevelsstyle seqlevelsStyle(blacklist_gr) <- seqlevelsStyle(junctions_gr) ## create overlap with blacklist and annotate extra column - print("find blacklist overlap") - black_hits <- unique(from(findOverlaps(junctions_gr, blacklist_gr))) + message("finding blacklist overlap ...") + black_hits_start_ss <- unique(from(findOverlaps(gr_start_ss, blacklist_gr))) + black_hits_end_ss <- unique(from(findOverlaps(gr_end_ss, blacklist_gr))) junctions_dt[, blacklist := FALSE] - junctions_dt[black_hits, blacklist := TRUE] + junctions_dt[black_hits_start_ss | black_hits_end_ss, blacklist := TRUE] colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND" - print("blacklist labels done") + message("blacklist labels done") return(junctions_dt) } #' adds UTR overlap annotation to results table #' @noRd -addUTRLabels <- function(junctions_dt, txdb){ +addUTRLabels <- function(junctions_dt, txdb, minoverlap=5){ colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" junctions_gr <- makeGRangesFromDataFrame(junctions_dt) seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr) + + # get gr with start/end positions of each intron + gr_start_ss <- junctions_gr + end(gr_start_ss) <- start(gr_start_ss) + minoverlap - 1 + start(gr_start_ss) <- start(gr_start_ss) - minoverlap + gr_end_ss <- junctions_gr + start(gr_end_ss) <- end(gr_end_ss) - minoverlap + 1 + end(gr_end_ss) <- end(gr_end_ss) + minoverlap + ### UTR labels based on txdb file ### add 5' 3' UTR labels - print("find UTR overlap") - threes <- unique(from(findOverlaps(junctions_gr, + message("finding UTR overlap ...") + threes_start <- unique(from(findOverlaps(gr_start_ss, + threeUTRsByTranscript(txdb, use.names = TRUE)))) + threes_end <- unique(from(findOverlaps(gr_end_ss, threeUTRsByTranscript(txdb, use.names = TRUE)))) - fives <- unique(from(findOverlaps(junctions_gr, + fives_start <- unique(from(findOverlaps(gr_start_ss, + fiveUTRsByTranscript(txdb, use.names = TRUE)))) + fives_end <- unique(from(findOverlaps(gr_end_ss, fiveUTRsByTranscript(txdb, use.names = TRUE)))) junctions_dt[, UTR_overlap := "no"] - junctions_dt[threes, UTR_overlap := "3'-UTR"] - junctions_dt[fives, UTR_overlap := "5'-UTR"] + junctions_dt[threes_start | threes_end, UTR_overlap := "3'-UTR"] + junctions_dt[fives_start | fives_end, UTR_overlap := "5'-UTR"] colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND" - print("UTR labels done") + message("UTR labels done") return(junctions_dt) } @@ -256,7 +298,7 @@ addUTRLabels <- function(junctions_dt, txdb){ #' adds type of splicing to each intron in the results table #' @noRd addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ - print("preparing ...") + message("preparing ...") psi_positions <- which(junctions_dt$type != "theta") colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions], @@ -295,8 +337,8 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ ends <- which(junctions_dt[psi_positions]$annotatedJunction=="end") nones <- which(junctions_dt[psi_positions]$annotatedJunction=="none") - print("calculating aberrant splice types") - print("start junctions") + message("calculating splice types ...") + # start junctions start_results <- sapply(starts, function(i){ # find the most freq intron that overlaps again overlap <- to(findOverlaps(junctions_gr[i], intron_ranges, @@ -313,7 +355,7 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ junctions_dt[psi_positions[starts], spliceType := start_results[1,]] - print("end junctions") + # end junctions end_results <- sapply(ends, function(i){ # find the most freq intron that overlaps again overlap <- to(findOverlaps(junctions_gr[i], intron_ranges, @@ -329,7 +371,7 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ junctions_dt[psi_positions[ends], causesFrameshift:=end_results[2,]] junctions_dt[psi_positions[ends], spliceType := end_results[1,]] - print("none junctions pt1") + # none junctions pt1 none_results <- sapply(nones, function(i){ # find most freq intron # check start and end @@ -384,7 +426,7 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ noLaps <-which(junctions_dt[psi_positions]$spliceType=="noOverlap") refseq.genes<- genes(txdb) - print("none junctions pt2") + # none junctions pt2 noLaps_results <- sapply(noLaps, function(i){ overlap <- to(findOverlaps(junctions_gr[i], exons)) # no overlap with an intron or an exon @@ -414,7 +456,6 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ spliceType := noLaps_results[1,]] # theta annotations - print("Annotate theta junctions") thetas <- which(junctions_dt$type == "theta") junctions_gr <- makeGRangesFromDataFrame(junctions_dt[thetas,], keep.extra.columns = TRUE) @@ -441,7 +482,6 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ junctions_dt[thetas[exonic], spliceType := exonic_results] # check cases that don't overlap with an exon/intron - print("nones") nones <- which(is.na(junctions_dt[thetas,]$spliceType)) none_results <- sapply(nones, function(i){ if(length(findOverlaps(junctions_gr[i], refseq.genes)) > 0) return(NA) @@ -458,11 +498,10 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ } }) junctions_dt[thetas[nones], spliceType := none_results] - print("thetas done") # add distance to closest neighbour gene for intergenic results # (both psi and theta) - print("adding distances to nearest gene") + message("adding distances to nearest gene ...") up <- which(junctions_dt$spliceType == "upstreamOfNearestGene") down <- which(junctions_dt$spliceType == "downstreamOfNearestGene") @@ -470,7 +509,7 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ junctions_gr <- makeGRangesFromDataFrame(junctions_dt, keep.extra.columns = TRUE) - print("Calculate distances") + # Calculate distances if(length(up) > 0){ distanceNearestGene_up <- sapply(up, function(i){ min(distance(junctions_gr[i], refseq.genes), na.rm = TRUE)}) @@ -479,9 +518,9 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ distNearestGene := distanceNearestGene_up] } else{ junctions_dt[psi_positions[up], distNearestGene := NA] - print("No distances found for upstream") + message("No distances found for upstream") } - }else{print("No upstream targets")} + }else{message("No upstream targets")} if(length(down) > 0){ distanceNearestGene_down <- sapply(down, function(i){ @@ -491,12 +530,12 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ distNearestGene := distanceNearestGene_down] }else{ junctions_dt[psi_positions[down], distNearestGene := NA] - print("No distances found for downstream") + message("No distances found for downstream") } - }else{print("No downstream targets")} + }else{message("No downstream targets")} colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND" - print("done calculating aberrant splice types") + message("done calculating splice types") # Add the subtypes for exonSkipping and inconclusive junctions_dt <- checkExonSkipping(junctions_dt, txdb) @@ -761,11 +800,6 @@ checkIntergenic <- function(junctions_gr, i, refseq.genes){ # otherwise up/downstream dist = min(distance(test_junction, refseq.genes), na.rm = TRUE) if(dist > 0){ - # if(dist > 1000){ - # print("intergenic") - # return(c("intergenic", "unlikely")) - # }else{ - # find nearest and compare starts if(start(refseq.genes[nearest(junctions_gr[i], refseq.genes)]) > start){ @@ -795,7 +829,7 @@ checkExonSkipping <- function(junctions_dt, txdb){ exonSkip <- which(junctions_dt[psi_positions]$spliceType %in% c("exonSkipping", "singleExonSkipping")) - print("start checking exonSkipping") + message("start checking exonSkipping") newSkip_results <- sapply(exonSkip, function(i){ start = start(junctions_gr[i]) end = end(junctions_gr[i]) @@ -838,7 +872,7 @@ checkExonSkipping <- function(junctions_dt, txdb){ return("multigenicSplicing") }) - print("checking exonSkipping done") + # checking exonSkipping done if(length(exonSkip) > 0){ junctions_dt[psi_positions[exonSkip], spliceType2 := newSkip_results] @@ -870,7 +904,6 @@ checkInconclusive <- function(junctions_dt, txdb){ inconclusive <- which(junctions_dt[psi_positions ]$spliceType == "complex") - print("start checking inconclusive") inconclusive_results <- sapply(inconclusive, function(i){ start = start(junctions_gr[i]) @@ -915,7 +948,7 @@ checkInconclusive <- function(junctions_dt, txdb){ }) colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND" - print("done checking inconclusive") + if(length(inconclusive) > 0){ junctions_dt[psi_positions[inconclusive], spliceType := inconclusive_results] diff --git a/man/spliceTypeAnnotations.Rd b/man/spliceTypeAnnotations.Rd index 17fea92c..b28e3c33 100644 --- a/man/spliceTypeAnnotations.Rd +++ b/man/spliceTypeAnnotations.Rd @@ -15,13 +15,15 @@ annotateSpliceEventType( fds, addSpliceType = TRUE, addUTRoverlap = TRUE, + minoverlap = 5, BPPARAM = bpparam() ) flagBlacklistRegions( result, blacklist_regions = NULL, - assemblyVersion = c("hg19", "hg38") + assemblyVersion = c("hg19", "hg38"), + minoverlap = 5 ) } \arguments{ @@ -42,9 +44,13 @@ should be added to the results table. Defaults to \code{TRUE}.} \item{addUTRoverlap}{Logical, indicating if the overlap with UTR regions should checked and added to the results table. Defaults to \code{TRUE}.} +\item{minoverlap}{Integer value defining the number of base pairs around the +splice site that need to overlap with UTR or blacklist region, +respectivly, to be considered matching. Defaults to 5 bp.} + \item{blacklist_regions}{A BED file that contains the blacklist regions. If \code{NULL} (default), the BED files that are packaged with FRASER -are used.} +are used (see Details for more information).} \item{assemblyVersion}{Indicates the genome assembly version of the intron coordinates. Only used if blacklist_regions is NULL. For other versions, @@ -80,7 +86,10 @@ These functions work on the result table and add additional \code{\link{flagBlacklistRegions}} flags introns in the results table on whether or not they are located in a blacklist region of the genome. By - default, the blacklist regions as reported in \cite{...} are used. + default, the blacklist regions as reported in + \cite{Amemiya, Kundaje & Boyle (2019)} and downloaded from + \href{https://www.encodeproject.org/annotations/ENCSR636HFF/}{here} + are used. } \section{Functions}{ \itemize{ @@ -92,8 +101,10 @@ for the full fds. The overlap type is added as a new column \item \code{annotateSpliceEventType}: This method annotates the splice event type to junctions in the given results table. -\item \code{flagBlacklistRegions}: This method flags blacklist regions in -the given results table. +\item \code{flagBlacklistRegions}: This method flags all introns and +splice sites in the given results table for which at least one splice +site (donor or acceptor) is located in a blacklist region. Blacklist +regions of the genome are determined from the provided BED file. }} \examples{ @@ -101,9 +112,8 @@ the given results table. fds <- createTestFraserDataSet() # load reference annotation - requireNamespace("TxDb.Hsapiens.UCSC.hg19.knownGene") - txdb <- - TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene + library(TxDb.Hsapiens.UCSC.hg19.knownGene) + txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene # add basic annotations for overlap with the reference annotation # run this function before creating the results table @@ -111,18 +121,15 @@ the given results table. # extract results: for this small example dataset, only a z score cutoff # of 1 is used to get at least one result. - # Make sure to include the additional column in the results table - res <- results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=NA, - additionalColumns = 'annotatedJunction') - res_dt <- as.data.table(res) + res <- results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=NA) # annotate the type of splice event and UTR overlap - res_dt <- annotateSpliceEventType(result=res_dt, txdb=txdb, fds=fds) + res <- annotateSpliceEventType(result=res, txdb=txdb, fds=fds) # annotate overlap with blacklist regions - res_dt <- flagBlacklistRegions(result=res_dt, assemblyVersion="hg19") + res <- flagBlacklistRegions(result=res, assemblyVersion="hg19") # show results table containing additional annotations - res_dt + res } From 735ed8c55bce40d069549a7fcce4a624f1103c49 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Wed, 26 Jan 2022 00:09:37 +0100 Subject: [PATCH 10/80] fix bioccheck --- DESCRIPTION | 2 +- NAMESPACE | 4 ++++ R/FRASER-package.R | 2 +- R/resultAnnotations.R | 4 ++-- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8273914a..c008fecb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -7,7 +7,7 @@ Authors@R: c( person("Christian", "Mertes", role=c("aut", "cre"), email="mertes@in.tum.de"), person("Ines", "Scheller", role=c("aut"), email="scheller@in.tum.de"), - person("Karoline", "Lutz", role=c("aut")) + person("Karoline", "Lutz", role=c("aut")), person("Vicente", "Yepez", role=c("ctb"), email="yepez@in.tum.de"), person("Julien", "Gagneur", role=c("aut"), email="gagneur@in.tum.de")) Description: Detection of rare aberrant splicing events in transcriptome diff --git a/NAMESPACE b/NAMESPACE index 96deb2b5..861b7c2c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -172,12 +172,16 @@ importFrom(GenomicFeatures,genes) importFrom(GenomicFeatures,intronsByTranscript) importFrom(GenomicFeatures,makeTxDbFromGFF) importFrom(GenomicFeatures,threeUTRsByTranscript) +importFrom(GenomicRanges,"end<-") +importFrom(GenomicRanges,"start<-") importFrom(GenomicRanges,GRanges) importFrom(GenomicRanges,GRangesList) +importFrom(GenomicRanges,end) importFrom(GenomicRanges,findOverlaps) importFrom(GenomicRanges,granges) importFrom(GenomicRanges,invertStrand) importFrom(GenomicRanges,makeGRangesFromDataFrame) +importFrom(GenomicRanges,start) importFrom(HDF5Array,HDF5Array) importFrom(HDF5Array,loadHDF5SummarizedExperiment) importFrom(HDF5Array,path) diff --git a/R/FRASER-package.R b/R/FRASER-package.R index f01fbc3f..bb2e57a5 100644 --- a/R/FRASER-package.R +++ b/R/FRASER-package.R @@ -30,7 +30,7 @@ #' colData colData<- rowData rowRanges rowRanges<- SummarizedExperiment #' rbind Assays #' @importFrom GenomicRanges findOverlaps granges GRanges GRangesList -#' makeGRangesFromDataFrame invertStrand +#' makeGRangesFromDataFrame invertStrand start end start<- end<- #' @importFrom IRanges subsetByOverlaps from to IRanges ranges nearest distance #' @importFrom Rsamtools ScanBamParam scanBamHeader bamMapqFilter #' bamWhich bamWhich<- BamFile idxstatsBam diff --git a/R/resultAnnotations.R b/R/resultAnnotations.R index 6ed35e65..5afb81e3 100644 --- a/R/resultAnnotations.R +++ b/R/resultAnnotations.R @@ -347,7 +347,7 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ elementMetadata(intron_ranges[j])$medianCount }) maxExpr <- which.max(expre) - return(compareEnds(junctions_gr, i, overlap[maxExpr], F, + return(compareEnds(junctions_gr, i, overlap[maxExpr], FALSE, intron_ranges, exons)) }) junctions_dt[psi_positions[starts], @@ -364,7 +364,7 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ elementMetadata(intron_ranges[j])$medianCount }) maxExpr <- which.max(expre) - return(compareStarts(junctions_gr, i, overlap[maxExpr], F, + return(compareStarts(junctions_gr, i, overlap[maxExpr], FALSE, intron_ranges, exons)) }) From 4a5b72ade085a8c88ef05a58351f0c9aa435dc9f Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Wed, 26 Jan 2022 01:01:28 +0100 Subject: [PATCH 11/80] bugfix --- R/resultAnnotations.R | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/R/resultAnnotations.R b/R/resultAnnotations.R index 5afb81e3..52f1c70b 100644 --- a/R/resultAnnotations.R +++ b/R/resultAnnotations.R @@ -252,7 +252,8 @@ addBlacklistLabels <- function(junctions_dt, blacklist_gr, minoverlap=5){ black_hits_end_ss <- unique(from(findOverlaps(gr_end_ss, blacklist_gr))) junctions_dt[, blacklist := FALSE] - junctions_dt[black_hits_start_ss | black_hits_end_ss, blacklist := TRUE] + junctions_dt[black_hits_start_ss, blacklist := TRUE] + junctions_dt[black_hits_end_ss, blacklist := TRUE] colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND" message("blacklist labels done") @@ -286,8 +287,10 @@ addUTRLabels <- function(junctions_dt, txdb, minoverlap=5){ fives_end <- unique(from(findOverlaps(gr_end_ss, fiveUTRsByTranscript(txdb, use.names = TRUE)))) junctions_dt[, UTR_overlap := "no"] - junctions_dt[threes_start | threes_end, UTR_overlap := "3'-UTR"] - junctions_dt[fives_start | fives_end, UTR_overlap := "5'-UTR"] + junctions_dt[threes_start, UTR_overlap := "3'-UTR"] + junctions_dt[threes_end, UTR_overlap := "3'-UTR"] + junctions_dt[fives_start, UTR_overlap := "5'-UTR"] + junctions_dt[fives_end, UTR_overlap := "5'-UTR"] colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND" message("UTR labels done") return(junctions_dt) From 8da822ae6685f21e1c1d3355a619e97852f021ce Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Wed, 26 Jan 2022 01:37:27 +0100 Subject: [PATCH 12/80] minor bugfix --- R/AllGenerics.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 68ed8f38..e0a00fd6 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -655,7 +655,7 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, stopifnot(is(object, "FraserDataSet")) stopifnot(all(sampleIDs %in% samples(object))) - if("annotatedJunction" %in% colnames(mcols(fds, type="j")) && + if("annotatedJunction" %in% colnames(mcols(object, type="j")) && !("annotatedJunction" %in% additionalColumns)){ additionalColumns <- c(additionalColumns, "annotatedJunction") } From 7346d333c91f6ff58dcbd44b1865e5f61a66910d Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Wed, 26 Jan 2022 16:01:42 +0100 Subject: [PATCH 13/80] cleaner jaccard index computation --- R/AllGenerics.R | 3 +- R/calculatePSIValue.R | 239 ++++++++++++++++-------------------------- 2 files changed, 93 insertions(+), 149 deletions(-) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 9349f8b2..17553bc4 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -885,7 +885,8 @@ setMethod("results", "FraserDataSet", function(object, sampleIDs=samples(object), padjCutoff=0.05, zScoreCutoff=NA, deltaPsiCutoff=0.3, rhoCutoff=0.1, aggregate=FALSE, collapse=FALSE, - minCount=5, psiType=c("psi3", "psi5", "theta"), + minCount=5, psiType=c("psi3", "psi5", "theta", + "intron_jaccard"), geneColumn="hgnc_symbol", additionalColumns=NULL, BPPARAM=bpparam()){ FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff, diff --git a/R/calculatePSIValue.R b/R/calculatePSIValue.R index be0b8dc2..6cf2f17b 100644 --- a/R/calculatePSIValue.R +++ b/R/calculatePSIValue.R @@ -37,7 +37,8 @@ calculatePSIValues <- function(fds, types=psiTypes, overwriteCts=FALSE, } # calculate intron jaccard index - fds <- calculateIntronJaccardIndex(fds) + fds <- calculateIntronNonsplitSum(fds, overwriteCts=overwriteCts) + fds <- calculateJaccardIntronIndex(fds, overwriteCts=overwriteCts) # calculate the delta psi value for(psiType in types){ @@ -326,155 +327,97 @@ getOtherCountsCacheFolder <- function(fds){ } #' -#' calculates the intron jaccard index values for all junctions -#' @inheritParams countRNA -#' -calculateIntronJaccardIndex <- function(fds){ - # retrieve junction and splice site annotation with count information - junction_dt <- data.table( - as.data.table(rowRanges(fds, type="j"))[,.(seqnames, start, end, - strand, startID, endID)], - as.matrix(K(fds, type="psi5"))) - ss_dt <- data.table( - as.data.table(rowRanges(fds, type="ss"))[,.(seqnames, start, end, - strand, spliceSiteID)], - as.matrix(N(fds, type="theta"))) - - # melt to have one row per sample - junction combination - junction_dt <- melt(junction_dt, variable.name="sampleID", value.name="k", - id.vars=c("seqnames", "start", "end", "strand", - "startID", "endID")) - ss_dt <- melt(ss_dt, variable.name="sampleID", value.name="n", - id.vars=c("seqnames", "start", "end", "strand", - "spliceSiteID")) - - # merge junction information with splice site annotation (theta) - junction_dt <- merge(junction_dt, ss_dt[,.(spliceSiteID, sampleID, n)], - all.x=TRUE, by.x=c("startID", "sampleID"), - by.y=c("spliceSiteID", "sampleID"), sort=FALSE) - setnames(junction_dt, "n", "n_donor") - junction_dt <- merge(junction_dt, ss_dt[,.(spliceSiteID, sampleID, n)], - all.x=TRUE, by.x=c("endID", "sampleID"), - by.y=c("spliceSiteID", "sampleID"), sort=FALSE) - setnames(junction_dt, "n", "n_acceptor") - rm(ss_dt) - gc() - - # TODO deal with missing endIDs (why do we have them? lost in filtering?) - # for now: replace n_donor with N from psi5 (and same for n_acceptor and psi3) - junction_dt_nacceptor <- data.table( - as.data.table(rowRanges(fds, type="j"))[,.(startID, endID)], - as.matrix(N(fds, type="psi3"))) - junction_dt_nacceptor <- melt(junction_dt_nacceptor, - variable.name="sampleID", value.name="n_psi3", - id.vars=c("startID", "endID")) - junction_dt[, n_psi3:=junction_dt_nacceptor[,n_psi3]] - rm(junction_dt_nacceptor) - gc() - junction_dt_ndonor <- data.table( - as.data.table(rowRanges(fds, type="j"))[,.(startID, endID)], - as.matrix(N(fds, type="psi5"))) - junction_dt_ndonor <- melt(junction_dt_ndonor, - variable.name="sampleID", value.name="n_psi5", - id.vars=c("startID", "endID")) - junction_dt[, n_psi5:=junction_dt_ndonor[,n_psi5]] - rm(junction_dt_ndonor) - gc() - - # replace n (with non-split counts) by n_psi3/n_psi5 if NA - junction_dt[is.na(n_acceptor), n_acceptor:=n_psi3] - junction_dt[is.na(n_donor), n_donor:=n_psi5] - junction_dt[n_acceptor < n_psi3, n_acceptor:=n_psi3] - junction_dt[n_donor < n_psi5, n_donor:=n_psi5] - - # calculate intron_jaccard - junction_dt[, denominator:=(n_donor + n_acceptor - k)] - junction_dt[, intron_jaccard:= k / denominator] - junction_dt[is.nan(intron_jaccard), intron_jaccard:=1] # n_donor = n_acceptor = 0 - - # convert to matrix to store it as assay in the fds - intron_jaccard <- matrix(junction_dt[,intron_jaccard], nrow=nrow(fds), ncol=ncol(fds), byrow=FALSE) - rownames(intron_jaccard) <- rownames(fds) - colnames(intron_jaccard) <- colnames(fds) - assay(fds, type="j", "intron_jaccard", withDimnames=FALSE) <- intron_jaccard - - # store denominator - jaccard_denom <- matrix(junction_dt[,denominator], nrow=nrow(fds), ncol=ncol(fds), byrow=FALSE) - rownames(jaccard_denom) <- rownames(fds) - colnames(jaccard_denom) <- colnames(fds) - assay(fds, type="j", "rawOtherCounts_intron_jaccard", - withDimnames=FALSE) <- jaccard_denom - matrix(junction_dt[,k], - nrow=nrow(fds), - ncol=ncol(fds), - byrow=FALSE) +#' calculates the jaccard intron value for the given junctions +#' +#' @noRd +calculateJaccardIntronIndex <- function(fds, overwriteCts){ + stopifnot(is(fds, "FraserDataSet")) + + message(date(), ": Calculate the Jaccard Intron values ...") + + # check if we have computed N_psi3, N_psi5 and K_nonsplit already + if(!all(c(paste0("rawOtherCounts_psi", c(5, 3)), "rawCountsJnonsplit") %in% + assayNames(fds))){ + stop("Please calculate N_psi3, N_psi5 and K_nonsplit first before ", + "calling this function.") + } + + # calculate intron jaccard value + jaccard_denom <- N(fds, "psi3") + N(fds, "psi5") + + assay(fds, "rawCountsJnonsplit") - K(fds, type="j") + jaccardValues <- K(fds, type="j") / jaccard_denom + otherCounts_jaccard <- jaccard_denom - K(fds, type="j") + + # TODO also calculate it with nonsplit counts in the nominator + + # assign it to our object + assay(fds, type="j", "intron_jaccard", withDimnames=FALSE) <- jaccardValues + + if(isTRUE(overwriteCts) || + !("rawOtherCounts_intron_jaccard" %in% assayNames(fds))){ + assay(fds, type="j", "rawOtherCounts_intron_jaccard", + withDimnames=FALSE) <- otherCounts_jaccard + } + return(fds) } -calculatePhi <- function(fds){ - # retrieve junction and splice site annotation with count information - junction_dt <- data.table( - as.data.table(rowRanges(fds, type="j"))[,.(seqnames, start, end, - strand, startID, endID)], - as.matrix(K(fds, type="psi5"))) - ss_dt <- data.table( - as.data.table(rowRanges(fds, type="ss"))[,.(seqnames, start, end, - strand, spliceSiteID)], - as.matrix(N(fds, type="theta"))) - - # melt to have one row per sample - junction combination - junction_dt <- melt(junction_dt, variable.name="sampleID", value.name="k", - id.vars=c("seqnames", "start", "end", "strand", - "startID", "endID")) - ss_dt <- melt(ss_dt, variable.name="sampleID", value.name="n", - id.vars=c("seqnames", "start", "end", "strand", - "spliceSiteID")) - - # merge junction information with splice site annotation (theta) - junction_dt <- merge(junction_dt, ss_dt[,.(spliceSiteID, sampleID, n)], - all.x=TRUE, by.x=c("startID", "sampleID"), - by.y=c("spliceSiteID", "sampleID"), sort=FALSE) - setnames(junction_dt, "n", "n_donor") - junction_dt <- merge(junction_dt, ss_dt[,.(spliceSiteID, sampleID, n)], - all.x=TRUE, by.x=c("endID", "sampleID"), - by.y=c("spliceSiteID", "sampleID"), sort=FALSE) - setnames(junction_dt, "n", "n_acceptor") - rm(ss_dt) - gc() - - # deal with missing endIDs - junction_dt_nacceptor <- data.table( - as.data.table(rowRanges(fds, type="j"))[,.(startID, endID)], - as.matrix(N(fds, type="psi3"))) - junction_dt_nacceptor <- melt(junction_dt_nacceptor, - variable.name="sampleID", value.name="n_psi3", - id.vars=c("startID", "endID")) - junction_dt[, n_psi3:=junction_dt_nacceptor[,n_psi3]] - rm(junction_dt_nacceptor) - gc() - junction_dt_ndonor <- data.table( - as.data.table(rowRanges(fds, type="j"))[,.(startID, endID)], - as.matrix(N(fds, type="psi5"))) - junction_dt_ndonor <- melt(junction_dt_ndonor, - variable.name="sampleID", value.name="n_psi5", - id.vars=c("startID", "endID")) - junction_dt[, n_psi5:=junction_dt_ndonor[,n_psi5]] - rm(junction_dt_ndonor) - gc() - - # replace n (with non-split counts) by n_psi3/n_psi5 if NA - junction_dt[is.na(n_acceptor), n_acceptor:=n_psi3] - junction_dt[is.na(n_donor), n_donor:=n_psi5] - junction_dt[n_acceptor < n_psi3, n_acceptor:=n_psi3] - junction_dt[n_donor < n_psi5, n_donor:=n_psi5] - - # calculate phi - junction_dt[, phi:= k / ((n_donor + n_acceptor)/2)] - junction_dt[is.nan(phi), phi:=0] # n_donor = n_acceptor = 0 - - # convert to matrix to store it as assay in the fds - phi <- matrix(junction_dt[,phi], nrow=nrow(fds), ncol=ncol(fds), byrow=FALSE) - rownames(phi) <- rownames(fds) - colnames(phi) <- colnames(fds) - assay(fds, "phi", type="psi5", withDimnames=FALSE) <- phi +#' Calculates the sum of nonsplit reads overlapping either the donor or acceptor +#' splice site and stores it as a new assay (one value for each junction and +#' sample). +#' +#' @noRd +calculateIntronNonsplitSum <- function(fds, overwriteCts){ + stopifnot(is(fds, "FraserDataSet")) + + message(date(), ": Calculate the total nonsplict counts for each intron ", + "...") + + + # get splice site nonsplit counts + nsr_ss <- K(fds, "theta") + + # retrieve junction and splice site annotation + junction_dt <- as.data.table(rowRanges(fds, type="j"))[, + .(seqnames, start, end, + strand, startID, endID)] + ss_map <- data.table(spliceSiteID=rowRanges(fds, type="ss")$spliceSiteID, + nsr_idx=seq_len(nrow(nonSplicedReads(fds)))) + + junction_dt <- merge(junction_dt, ss_map, + by.x="startID", by.y="spliceSiteID", + all.x=TRUE) + setnames(junction_dt, "nsr_idx", "start_idx") + junction_dt <- merge(junction_dt, ss_map, + by.x="endID", by.y="spliceSiteID", + all.x=TRUE) + setnames(junction_dt, "nsr_idx", "end_idx") + + # for each junction, find the two rows in K_theta corresponding to its + # donor and acceptor splice site + donor_sites <- junction_dt$start_idx + acc_sites <- junction_dt$end_idx + nsr_donor <- nsr_ss[donor_sites,] + nsr_acc <- nsr_ss[acc_sites,] + + # set nsr counts to 0 for junctions for which no mapping by spliceSiteID + # could be found + nsr_donor[is.na(nsr_donor)] <- 0 + nsr_acc[is.na(nsr_acc)] <- 0 + + # sum them + nsr_j <- nsr_donor + nsr_acc + + if(nrow(nsr_j) != nrow(fds)){ + warning("Unequal number of junctions in fds and junctions with ", + "computed nonsplit count sum!") + } + + # assign it to our object + if(isTRUE(overwriteCts) || + !("rawCountsJnonsplit" %in% assayNames(fds))){ + assay(fds, type="j", "rawCountsJnonsplit", withDimnames=FALSE) <- nsr_j + } + return(fds) } From f1a9d099effd799ca9bd3a70efa58af3d40c057c Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Tue, 29 Mar 2022 16:14:57 +0200 Subject: [PATCH 14/80] adaptions to using jaccard metric --- R/AllGenerics.R | 10 ++++++---- R/calculatePSIValue.R | 22 +++++++++++++--------- R/filterExpression.R | 19 +++++++++++-------- R/find_encoding_dimensions.R | 2 +- R/helper-functions.R | 6 +++--- R/makeSimulatedDataset.R | 8 ++++++-- R/plotMethods.R | 13 ++++++++----- R/pvalsNzscore.R | 4 ++++ R/variables.R | 2 +- 9 files changed, 53 insertions(+), 33 deletions(-) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 17553bc4..73ebb087 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -537,7 +537,7 @@ setMethod("counts", "FraserDataSet", function(object, type=NULL, type <- whichPSIType(type) if(length(type) == 0 | length(type) > 1){ stop(paste0("Please provide a correct psi type: psi5, psi3, ", - "theta or intron_jaccard. Not the given one: '", + "theta or jaccard. Not the given one: '", type, "'.")) } aname <- paste0("rawOtherCounts_", type) @@ -724,8 +724,10 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, # extract values rawCts <- as.matrix(K(tmp_x)) rawTotalCts <- as.matrix(N(tmp_x)) - pvals <- as.matrix(pVals(tmp_x)) - padjs <- as.matrix(padjVals(tmp_x)) + pvals <- as.matrix(pVals(tmp_x, + filters=list(rho=rhoCutoff))) + padjs <- as.matrix(padjVals(tmp_x, + filters=list(rho=rhoCutoff))) zscores <- as.matrix(zScores(tmp_x)) psivals <- as.matrix(assay(tmp_x, type)) muPsi <- as.matrix(predictedMeans(tmp_x)) @@ -886,7 +888,7 @@ setMethod("results", "FraserDataSet", function(object, zScoreCutoff=NA, deltaPsiCutoff=0.3, rhoCutoff=0.1, aggregate=FALSE, collapse=FALSE, minCount=5, psiType=c("psi3", "psi5", "theta", - "intron_jaccard"), + "jaccard"), geneColumn="hgnc_symbol", additionalColumns=NULL, BPPARAM=bpparam()){ FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff, diff --git a/R/calculatePSIValue.R b/R/calculatePSIValue.R index 6cf2f17b..5fb07fad 100644 --- a/R/calculatePSIValue.R +++ b/R/calculatePSIValue.R @@ -351,11 +351,11 @@ calculateJaccardIntronIndex <- function(fds, overwriteCts){ # TODO also calculate it with nonsplit counts in the nominator # assign it to our object - assay(fds, type="j", "intron_jaccard", withDimnames=FALSE) <- jaccardValues + assay(fds, type="j", "jaccard", withDimnames=FALSE) <- jaccardValues if(isTRUE(overwriteCts) || - !("rawOtherCounts_intron_jaccard" %in% assayNames(fds))){ - assay(fds, type="j", "rawOtherCounts_intron_jaccard", + !("rawOtherCounts_jaccard" %in% assayNames(fds))){ + assay(fds, type="j", "rawOtherCounts_jaccard", withDimnames=FALSE) <- otherCounts_jaccard } @@ -381,6 +381,7 @@ calculateIntronNonsplitSum <- function(fds, overwriteCts){ junction_dt <- as.data.table(rowRanges(fds, type="j"))[, .(seqnames, start, end, strand, startID, endID)] + junction_dt[, j_idx:=seq_len(.N)] ss_map <- data.table(spliceSiteID=rowRanges(fds, type="ss")$spliceSiteID, nsr_idx=seq_len(nrow(nonSplicedReads(fds)))) @@ -395,15 +396,18 @@ calculateIntronNonsplitSum <- function(fds, overwriteCts){ # for each junction, find the two rows in K_theta corresponding to its # donor and acceptor splice site - donor_sites <- junction_dt$start_idx - acc_sites <- junction_dt$end_idx - nsr_donor <- nsr_ss[donor_sites,] - nsr_acc <- nsr_ss[acc_sites,] + donor_sites <- junction_dt[!is.na(start_idx),] + acc_sites <- junction_dt[!is.na(end_idx),] # set nsr counts to 0 for junctions for which no mapping by spliceSiteID # could be found - nsr_donor[is.na(nsr_donor)] <- 0 - nsr_acc[is.na(nsr_acc)] <- 0 + nsr_donor <- matrix(0, nrow=nrow(fds), ncol=ncol(fds)) + nsr_acc <- matrix(0, nrow=nrow(fds), ncol=ncol(fds)) + + nsr_donor[donor_sites[,j_idx],] <- + as.matrix(nsr_ss[donor_sites[,start_idx],]) + nsr_acc[acc_sites[,j_idx],] <- + as.matrix(nsr_ss[acc_sites[,end_idx],]) # sum them nsr_j <- nsr_donor + nsr_acc diff --git a/R/filterExpression.R b/R/filterExpression.R index 7d8d2806..1dee404b 100644 --- a/R/filterExpression.R +++ b/R/filterExpression.R @@ -382,6 +382,9 @@ filterExpressionAndVariability_jaccard <- function(object, } +#' @describeIn filtering This function filters out introns and corresponding +#' splice sites which are expressed at very low levels across samples. +#' @export filterExpression_jaccard <- function(object, minExpressionInOneSample=20, quantile=0.95, quantileMinExpression=1, filter=TRUE, delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), @@ -393,7 +396,7 @@ filterExpression_jaccard <- function(object, minExpressionInOneSample=20, # extract counts cts <- K(object, type="j") - ctsN <- N(object, type="intron_jaccard") + ctsN <- N(object, type="jaccard") if(isFALSE(delayed)){ cts <- as.matrix(cts) @@ -451,17 +454,17 @@ filterVariability_jaccard <- function(object, minDelta=0, filter=TRUE, # extract counts cts <- K(object, type="j") - ctsN <- N(object, type="intron_jaccard") + ctsN <- N(object, type="jaccard") if(isFALSE(delayed)){ cts <- as.matrix(cts) - ctsN <- as.matrix(ctsN5) + ctsN <- as.matrix(ctsN) } # cutoff functions - f1 <- function(cts, ctsN3, ...) { - intron_jaccard <- cts/ctsN - rowMaxs(abs(intron_jaccard - rowMeans2(intron_jaccard, na.rm=TRUE)), + f1 <- function(cts, ctsN, ...) { + jaccard <- cts/ctsN + rowMaxs(abs(jaccard - rowMeans2(jaccard, na.rm=TRUE)), na.rm=TRUE) } funs <- c(maxDJaccard=f1) @@ -520,7 +523,7 @@ applyExpressionFilters_jaccard <- function(fds, minExpressionInOneSample, for(aname in assayNames(rareJunctions)){ if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5", "rawOtherCounts_psi3", "psi5", "psi3", - "delta_psi5", "delta_psi3", "intron_jaccard", + "delta_psi5", "delta_psi3", "jaccard", "rawOtherCounts_intron_jaccard"))){ assay(rareJunctions, aname) <- NULL } @@ -583,7 +586,7 @@ applyVariabilityFilters <- function(fds, minDelta){ for(aname in assayNames(nonVariableJunctions)){ if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5", "rawOtherCounts_psi3", "psi5", "psi3", - "delta_psi5", "delta_psi3", "intron_jaccard", + "delta_psi5", "delta_psi3", "jaccard", "rawOtherCounts_intron_jaccard"))){ assay(nonVariableJunctions, aname) <- NULL } diff --git a/R/find_encoding_dimensions.R b/R/find_encoding_dimensions.R index 6653fcaa..af5be75a 100644 --- a/R/find_encoding_dimensions.R +++ b/R/find_encoding_dimensions.R @@ -125,7 +125,7 @@ findEncodingDim <- function(i, fds, type, params, implementation, optimHyperParams <- function(fds, type, implementation="PCA", q_param=seq(2, min(40, ncol(fds)), by=3), noise_param=0, minDeltaPsi=0.1, - iterations=5, setSubset=15000, injectFreq=1e-2, + iterations=5, setSubset=50000, injectFreq=1e-2, BPPARAM=bpparam(), internalThreads=1, plot=TRUE, delayed=ifelse(ncol(fds) <= 300, FALSE, TRUE), ...){ if(isFALSE(needsHyperOpt(implementation))){ diff --git a/R/helper-functions.R b/R/helper-functions.R index 69466b74..007390f9 100644 --- a/R/helper-functions.R +++ b/R/helper-functions.R @@ -79,7 +79,7 @@ checkReadType <- function(fds, type){ } type <- unique(type) stopifnot(isScalarCharacter(type)) - correctTypes <- c(psi3="j", psi5="j", theta="ss", intron_jaccard="j") + correctTypes <- c(psi3="j", psi5="j", theta="ss", jaccard="j") # check if it is already the correct type if(type %in% correctTypes) return(type) @@ -109,7 +109,7 @@ checkReadType <- function(fds, type){ #' #' @noRd whichPSIType <- function(type){ - unlist(regmatches(type, gregexpr("psi(3|5)|theta|intron_jaccard", type, perl=TRUE))) + unlist(regmatches(type, gregexpr("psi(3|5)|theta|jaccard", type, perl=TRUE))) } #' @@ -123,7 +123,7 @@ whichReadType <- function(fds, name){ if(name == "ss" | endsWith(name, "theta")) return("ss") if(name == "j" | endsWith(name, "psi5") | endsWith(name, "psi3") | - endsWith(name, "intron_jaccard")) + endsWith(name, "jaccard")) return("j") # check assay names diff --git a/R/makeSimulatedDataset.R b/R/makeSimulatedDataset.R index 95331d19..5b6f7531 100644 --- a/R/makeSimulatedDataset.R +++ b/R/makeSimulatedDataset.R @@ -437,7 +437,7 @@ makeSimulatedFraserDataSet_Multinomial <- function(m=200, j=1000, q=10, #' fds <- makeSimulatedFraserDataSet() #' fds <- injectOutliers(fds, minDpsi=0.2, freq=1E-3) #' @export -injectOutliers <- function(fds, type=c("psi5", "psi3", "theta"), +injectOutliers <- function(fds, type=c("psi5", "psi3", "theta", "jaccard"), freq=1E-3, minDpsi=0.2, minCoverage=2, deltaDistr="uniformDistr", verbose=FALSE, method=c('samplePSI', 'meanPSI', 'simulatedPSI'), @@ -472,6 +472,9 @@ injectOutliers <- function(fds, type=c("psi5", "psi3", "theta"), setAssayMatrix(fds, type="psi3", "originalOtherCounts", withDimnames=FALSE) <- counts(fds, type="psi3", side="other") + setAssayMatrix(fds, type="jaccard", "originalOtherCounts", + withDimnames=FALSE) <- + counts(fds, type="jaccard", side="other") } # get infos from the fds @@ -500,7 +503,8 @@ injectOutliers <- function(fds, type=c("psi5", "psi3", "theta"), dt[,groupSize:=.N, by=groupID] # Get groups where outlier can be injected - available_groups <- dt[groupSize > ifelse(type == "theta", 0, 1), + available_groups <- dt[groupSize > ifelse(type == "theta" | + type == "jaccard", 0, 1), unique(groupID)] # e.g. for psi3/5: no donor/acceptor diff --git a/R/plotMethods.R b/R/plotMethods.R index 13e22bc3..e18abe55 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -178,7 +178,7 @@ NULL plotVolcano.FRASER <- function(object, sampleID, - type=c("psi3", "psi5", "theta", "intron_jaccard"), basePlot=TRUE, + type=c("psi3", "psi5", "theta", "jaccard"), basePlot=TRUE, aggregate=FALSE, main=NULL, label=NULL, deltaPsiCutoff=0.3, padjCutoff=0.1, ...){ @@ -278,7 +278,7 @@ setMethod("plotVolcano", signature="FraserDataSet", plotVolcano.FRASER) plotAberrantPerSample.FRASER <- function(object, main, - type=c("psi3", "psi5", "theta", "intron_jaccard"), + type=c("psi3", "psi5", "theta", "jaccard"), padjCutoff=0.1, zScoreCutoff=NA, deltaPsiCutoff=0.3, aggregate=TRUE, BPPARAM=bpparam(), ...){ @@ -341,7 +341,7 @@ setMethod("plotAberrantPerSample", signature="FraserDataSet", #' #' @rdname plotFunctions #' @export -plotExpression <- function(fds, type=c("psi5", "psi3", "theta", "intron_jaccard"), +plotExpression <- function(fds, type=c("psi5", "psi3", "theta", "jaccard"), site=NULL, result=NULL, colGroup=NULL, basePlot=TRUE, main=NULL, label="aberrant", ...){ if(!is.null(result)){ @@ -433,7 +433,7 @@ plotExpression <- function(fds, type=c("psi5", "psi3", "theta", "intron_jaccard" #' #' @rdname plotFunctions #' @export -plotExpectedVsObservedPsi <- function(fds, type=c("psi5", "psi3", "theta", "intron_jaccard"), +plotExpectedVsObservedPsi <- function(fds, type=c("psi5", "psi3", "theta", "jaccard"), idx=NULL, result=NULL, colGroup=NULL, main=NULL, basePlot=TRUE, label="aberrant", ...){ type <- match.arg(type) @@ -696,7 +696,8 @@ plotQQ.FRASER <- function(object, type=NULL, idx=NULL, result=NULL, setMethod("plotQQ", signature="FraserDataSet", plotQQ.FRASER) -plotEncDimSearch.FRASER <- function(object, type=c("psi3", "psi5", "theta"), +plotEncDimSearch.FRASER <- function(object, + type=c("psi3", "psi5", "theta", "jaccard"), plotType=c("auc", "loss")){ type <- match.arg(type) plotType <- match.arg(plotType) @@ -1085,6 +1086,7 @@ ggplotLabelPsi <- function(type, asCharacter=FALSE){ if(isFALSE(asCharacter)){ vapply(type, FUN=function(x) switch (x, + jaccard = c(bquote(jaccard~intron~index)), psi5 = c(bquote(psi[5])), psi3 = c(bquote(psi[3])), theta = c(bquote(theta))), @@ -1092,6 +1094,7 @@ ggplotLabelPsi <- function(type, asCharacter=FALSE){ } else{ vapply(type, FUN=function(x) switch (x, + jaccard = "jaccard-intron-index", psi5 = "psi[5]", psi3 = "psi[3]", theta = "theta"), diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R index 58d60a8b..b04f0494 100644 --- a/R/pvalsNzscore.R +++ b/R/pvalsNzscore.R @@ -336,6 +336,10 @@ getSiteIndex <- function(fds, type){ return(mcols(fds, type=type)[['spliceSiteID']]) } + if(type == "jaccard"){ + return(seq_len(nrow(fds))) + } + startId <- mcols(fds, type=type)[,"startID"] endId <- mcols(fds, type=type)[,"endID"] strand <- strand(rowRanges(fds, type=type)) diff --git a/R/variables.R b/R/variables.R index 74bb6d91..a159d2ee 100644 --- a/R/variables.R +++ b/R/variables.R @@ -6,7 +6,7 @@ #' psiTypes #' #' @export -psiTypes <- c("psi5", "psi3", "theta", "intron_jaccard") +psiTypes <- c("psi5", "psi3", "theta", "jaccard") # psiTypes <- c("psi5", "psi3", "theta") names(psiTypes) <- psiTypes From 14f0dbb28d2ce4d4a2071551d017c708feb33d16 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Thu, 31 Mar 2022 20:40:17 +0200 Subject: [PATCH 15/80] updated c++ code --- R/RcppExports.R | 4 +++ R/updateRho.R | 33 ++++++++++++++++++++----- src/RcppExports.cpp | 17 ++++++++++++- src/loss_n_gradient_functions.cpp | 41 ++++++++++++++++++++++++++++++- 4 files changed, 87 insertions(+), 8 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index 659f3b4c..42ec8fbc 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -41,6 +41,10 @@ truncNLL_rho <- function(rho, yi, ki, ni) { .Call('_FRASER_truncNLL_rho', PACKAGE = 'FRASER', rho, yi, ki, ni) } +truncNLL_rho_penalized <- function(logit_rho, yi, ki, ni, lambda) { + .Call('_FRASER_truncNLL_rho_penalized', PACKAGE = 'FRASER', logit_rho, yi, ki, ni, lambda) +} + fullNLL <- function(y, rho, k, n, D, lambda, byRows = FALSE) { .Call('_FRASER_fullNLL', PACKAGE = 'FRASER', y, rho, k, n, D, lambda, byRows) } diff --git a/R/updateRho.R b/R/updateRho.R index f5a0d429..2348c059 100644 --- a/R/updateRho.R +++ b/R/updateRho.R @@ -8,11 +8,14 @@ updateRho <- function(fds, type, rhoRange, BPPARAM, verbose){ n <- N(fds) y <- predictY(fds, noiseAlpha=currentNoiseAlpha(fds)) - fitparameters <- bplapply(seq_len(nrow(k)), estRho, nll=truncNLL_rho, - k=k, n=n, y=y, rhoRange=rhoRange, BPPARAM=BPPARAM) + # fitparameters <- bplapply(seq_len(nrow(k)), estRho, nll=truncNLL_rho, + # k=k, n=n, y=y, rhoRange=rhoRange, BPPARAM=BPPARAM) + fitparameters <- bplapply(seq_len(nrow(k)), estRho, nll=trunc_negLogLikelihoodRho_penalized, + k=k, n=n, y=y, rhoRange=rhoRange, lambda=0, + BPPARAM=BPPARAM) - rho(fds) <- vapply(fitparameters, "[[", - double(1), "minimum") + rho(fds) <- plogis(vapply(fitparameters, "[[", + double(1), "minimum")) if(isTRUE(verbose)){ stxt <- capture.output(summary(rho(fds))) @@ -23,12 +26,15 @@ updateRho <- function(fds, type, rhoRange, BPPARAM, verbose){ return(fds) } -estRho <- function(idx, k, n, y, rhoRange, nll, control=list()){ +estRho <- function(idx, k, n, y, rhoRange, nll, control=list(), lambda=0){ ki <- k[idx,] ni <- n[idx,] yi <- y[idx,] - est <- optimize(f=nll, interval=rhoRange, yi=yi, ki=ki, ni=ni, + # est <- optimize(f=nll, interval=rhoRange, yi=yi, ki=ki, ni=ni, + # maximum=FALSE, tol=0.0000001) + # est + est <- optimize(f=nll, interval=c(-40, 40), mui=plogis(yi), ki=ki, ni=ni, lambda=lambda, maximum=FALSE, tol=0.0000001) est } @@ -63,6 +69,21 @@ trunc_negLogLikelihoodRho <- function(rho, ki, ni, mui){ mean(alpha + beta - alphaK - betaNK ) } +trunc_negLogLikelihoodRho_penalized <- function(logit_rho, ki, ni, mui, lambda){ + #-mean(dbetabinom(ki + 0.5, ni + 1, mu, rho, log=TRUE)) + + rho <- plogis(logit_rho) + r <- (1-rho)/rho + eps <- 0.5 + alpha <- lgamma(mui*r) + alphaK <- lgamma(mui*r + ki + eps) + beta <- lgamma((mui-1)*(-r)) + betaNK <- lgamma((mui-1)*(-r) + (ni - ki + eps)) + + #mean negative log likelihood with pseudocounts + mean(alpha + beta - alphaK - betaNK ) + lambda * sqrt(logit_rho) +} + methodOfMomentsRho <- function(k, n, rhoRange=c(1e-5, 1 - 1e-5)){ # taken from wiki: diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 461280c0..0d444179 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -1,7 +1,6 @@ // Generated by using Rcpp::compileAttributes() -> do not edit by hand // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -#include #include using namespace Rcpp; @@ -152,6 +151,21 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// truncNLL_rho_penalized +double truncNLL_rho_penalized(double logit_rho, arma::vec yi, arma::vec ki, arma::vec ni, double lambda); +RcppExport SEXP _FRASER_truncNLL_rho_penalized(SEXP logit_rhoSEXP, SEXP yiSEXP, SEXP kiSEXP, SEXP niSEXP, SEXP lambdaSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< double >::type logit_rho(logit_rhoSEXP); + Rcpp::traits::input_parameter< arma::vec >::type yi(yiSEXP); + Rcpp::traits::input_parameter< arma::vec >::type ki(kiSEXP); + Rcpp::traits::input_parameter< arma::vec >::type ni(niSEXP); + Rcpp::traits::input_parameter< double >::type lambda(lambdaSEXP); + rcpp_result_gen = Rcpp::wrap(truncNLL_rho_penalized(logit_rho, yi, ki, ni, lambda)); + return rcpp_result_gen; +END_RCPP +} // fullNLL arma::vec fullNLL(arma::mat y, arma::mat rho, arma::mat k, arma::mat n, arma::mat D, double lambda, bool byRows); RcppExport SEXP _FRASER_fullNLL(SEXP ySEXP, SEXP rhoSEXP, SEXP kSEXP, SEXP nSEXP, SEXP DSEXP, SEXP lambdaSEXP, SEXP byRowsSEXP) { @@ -215,6 +229,7 @@ static const R_CallMethodDef CallEntries[] = { {"_FRASER_truncNLL_e", (DL_FUNC) &_FRASER_truncNLL_e, 7}, {"_FRASER_truncGrad_e", (DL_FUNC) &_FRASER_truncGrad_e, 7}, {"_FRASER_truncNLL_rho", (DL_FUNC) &_FRASER_truncNLL_rho, 4}, + {"_FRASER_truncNLL_rho_penalized", (DL_FUNC) &_FRASER_truncNLL_rho_penalized, 5}, {"_FRASER_fullNLL", (DL_FUNC) &_FRASER_fullNLL, 7}, {"_FRASER_truncWeightedNLL_db", (DL_FUNC) &_FRASER_truncWeightedNLL_db, 7}, {"_FRASER_truncWeightedGrad_db", (DL_FUNC) &_FRASER_truncWeightedGrad_db, 7}, diff --git a/src/loss_n_gradient_functions.cpp b/src/loss_n_gradient_functions.cpp index 07eadcca..6cde1bf1 100644 --- a/src/loss_n_gradient_functions.cpp +++ b/src/loss_n_gradient_functions.cpp @@ -6,7 +6,7 @@ using namespace Rcpp; const double MAX_EXP_VALUE = 700; -double PSEUDO_COUNT = 1; +double PSEUDO_COUNT = 0; // [[Rcpp::export(.setPseudoCount)]] double setPseudoCount(double pseudoCount){ @@ -142,6 +142,12 @@ double truncNLL_db(arma::vec par, arma::mat H, arma::vec k, arma::vec n, double infPosB = arma::find_nonfinite(beta); // beta.elem( infPosB ) = abs.elem( infPosB ); beta.elem( infPosB ) = estLgammaBeta(y, infPosB, rhob); + + arma::uvec infPosA2, infPosB2; + infPosA2 = arma::find_nonfinite(alphaK); + alpha.elem( infPosA2 ) = estLgammaAlpha(y, infPosA2, rhoa); + infPosB2 = arma::find_nonfinite(betaNK); + beta.elem( infPosB2 ) = estLgammaBeta(y, infPosB2, rhob); nll = arma::accu(alpha + beta - alphaK - betaNK)/k.n_elem; @@ -366,6 +372,39 @@ double truncNLL_rho(double rho, arma::vec yi, arma::vec ki, arma::vec ni){ return arma::as_scalar(nll); } +// [[Rcpp::export()]] +double truncNLL_rho_penalized(double logit_rho, arma::vec yi, arma::vec ki, arma::vec ni, double lambda){ + arma::vec mui, u, alpha, alphaK, beta, betaNK, alphaBeta, nll; + double rho, rhoa, rhob; + + rho = exp(logit_rho)/(1 + exp(logit_rho)); + rhoa = (1 - rho)/rho; + rhob = (rho - 1)/rho; + mui = predictMuCpp(yi); + u = (mui-1) * rhob; + + alpha = arma::lgamma(mui * rhoa); + alphaK = arma::lgamma(mui * rhoa + ki + PSEUDO_COUNT); + beta = arma::lgamma(u); + betaNK = arma::lgamma(u + ni - ki + PSEUDO_COUNT); + alphaBeta = arma::lgamma(rhoa + ni + (2*PSEUDO_COUNT)) - lgamma(rhoa); + + // arma::vec abs; + arma::uvec infPosA, infPosB; + // abs = arma::abs(yi); + infPosA = arma::find_nonfinite(alpha); + // alpha.elem( infPosA ) = abs.elem( infPosA ); + alpha.elem( infPosA ) = estLgammaAlpha(yi, infPosA, rhoa); + infPosB = arma::find_nonfinite(beta); + // beta.elem( infPosB ) = abs.elem( infPosB ); + beta.elem( infPosB ) = estLgammaBeta(yi, infPosB, rhob); + + nll = arma::accu(alpha + beta - alphaK - betaNK + alphaBeta)/ki.n_elem; + nll = nll + lambda * (logit_rho*logit_rho); + + return arma::as_scalar(nll); +} + // [[Rcpp::export()]] arma::vec fullNLL(arma::mat y, arma::mat rho, arma::mat k, arma::mat n, arma::mat D, double lambda, bool byRows=false){ arma::mat rhoa, rhob; From 1fd5839882aced027868fdeae2333288118eff78 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Thu, 31 Mar 2022 21:00:43 +0200 Subject: [PATCH 16/80] compiled and updated documentation --- NAMESPACE | 3 +++ man/filtering.Rd | 41 +++++++++++++++++++++++++++++++++++++++++ man/injectOutliers.Rd | 2 +- man/optimHyperParams.Rd | 2 +- man/plotFunctions.Rd | 10 +++++----- man/psiTypes.Rd | 2 +- man/results.Rd | 2 +- src/RcppExports.cpp | 1 + 8 files changed, 54 insertions(+), 9 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 0625f5fc..ec7d82f2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -42,7 +42,10 @@ export(dontWriteHDF5) export(featureExclusionMask) export(filterExpression) export(filterExpressionAndVariability) +export(filterExpressionAndVariability_jaccard) +export(filterExpression_jaccard) export(filterVariability) +export(filterVariability_jaccard) export(fit) export(getNonSplitReadCountsForAllSamples) export(getSplitReadCountsForAllSamples) diff --git a/man/filtering.Rd b/man/filtering.Rd index 3c2bace9..f81b1267 100644 --- a/man/filtering.Rd +++ b/man/filtering.Rd @@ -5,6 +5,9 @@ \alias{filterExpressionAndVariability} \alias{filterExpression,FraserDataSet-method} \alias{filterVariability} +\alias{filterExpressionAndVariability_jaccard} +\alias{filterExpression_jaccard} +\alias{filterVariability_jaccard} \title{Filtering FraserDataSets} \usage{ filterExpressionAndVariability( @@ -35,6 +38,35 @@ filterVariability( delayed = ifelse(ncol(object) <= 300, FALSE, TRUE), BPPARAM = bpparam() ) + +filterExpressionAndVariability_jaccard( + object, + minExpressionInOneSample = 20, + quantile = 0.95, + quantileMinExpression = 1, + minDelta = 0.05, + filter = TRUE, + delayed = ifelse(ncol(object) <= 300, FALSE, TRUE), + BPPARAM = bpparam() +) + +filterExpression_jaccard( + object, + minExpressionInOneSample = 20, + quantile = 0.95, + quantileMinExpression = 1, + filter = TRUE, + delayed = ifelse(ncol(object) <= 300, FALSE, TRUE), + BPPARAM = bpparam() +) + +filterVariability_jaccard( + object, + minDelta = 0, + filter = TRUE, + delayed = ifelse(ncol(object) <= 300, FALSE, TRUE), + BPPARAM = bpparam() +) } \arguments{ \item{object}{A \code{\link{FraserDataSet}} object} @@ -80,6 +112,15 @@ splice sites that have low read support in all samples. \item \code{filterVariability}: This function filters out introns and corresponding splice sites which do not show variablity across samples. + +\item \code{filterExpressionAndVariability_jaccard}: This functions filters out both introns with low +read support and introns that are not variable across samples. + +\item \code{filterExpression_jaccard}: This function filters out introns and corresponding +splice sites which are expressed at very low levels across samples. + +\item \code{filterVariability_jaccard}: This function filters out introns and corresponding +splice sites which do not show variablity across samples. }} \examples{ diff --git a/man/injectOutliers.Rd b/man/injectOutliers.Rd index 1f85338e..b32ed74a 100644 --- a/man/injectOutliers.Rd +++ b/man/injectOutliers.Rd @@ -6,7 +6,7 @@ \usage{ injectOutliers( fds, - type = c("psi5", "psi3", "theta"), + type = c("psi5", "psi3", "theta", "jaccard"), freq = 0.001, minDpsi = 0.2, minCoverage = 2, diff --git a/man/optimHyperParams.Rd b/man/optimHyperParams.Rd index 3894d9d7..f3d10622 100644 --- a/man/optimHyperParams.Rd +++ b/man/optimHyperParams.Rd @@ -12,7 +12,7 @@ optimHyperParams( noise_param = 0, minDeltaPsi = 0.1, iterations = 5, - setSubset = 15000, + setSubset = 50000, injectFreq = 0.01, BPPARAM = bpparam(), internalThreads = 1, diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd index cbeddee6..f6b36c41 100644 --- a/man/plotFunctions.Rd +++ b/man/plotFunctions.Rd @@ -21,7 +21,7 @@ \S4method{plotVolcano}{FraserDataSet}( object, sampleID, - type = c("psi3", "psi5", "theta"), + type = c("psi3", "psi5", "theta", "jaccard"), basePlot = TRUE, aggregate = FALSE, main = NULL, @@ -34,7 +34,7 @@ \S4method{plotAberrantPerSample}{FraserDataSet}( object, main, - type = c("psi3", "psi5", "theta"), + type = c("psi3", "psi5", "theta", "jaccard"), padjCutoff = 0.1, zScoreCutoff = NA, deltaPsiCutoff = 0.3, @@ -45,7 +45,7 @@ plotExpression( fds, - type = c("psi5", "psi3", "theta"), + type = c("psi5", "psi3", "theta", "jaccard"), site = NULL, result = NULL, colGroup = NULL, @@ -57,7 +57,7 @@ plotExpression( plotExpectedVsObservedPsi( fds, - type = c("psi5", "psi3", "theta"), + type = c("psi5", "psi3", "theta", "jaccard"), idx = NULL, result = NULL, colGroup = NULL, @@ -85,7 +85,7 @@ plotExpectedVsObservedPsi( \S4method{plotEncDimSearch}{FraserDataSet}( object, - type = c("psi3", "psi5", "theta"), + type = c("psi3", "psi5", "theta", "jaccard"), plotType = c("auc", "loss") ) diff --git a/man/psiTypes.Rd b/man/psiTypes.Rd index f07be433..43ce2ea6 100644 --- a/man/psiTypes.Rd +++ b/man/psiTypes.Rd @@ -5,7 +5,7 @@ \alias{psiTypes} \title{Available psi types} \format{ -An object of class \code{character} of length 3. +An object of class \code{character} of length 4. } \usage{ psiTypes diff --git a/man/results.Rd b/man/results.Rd index 7f78ffe0..392fff9f 100644 --- a/man/results.Rd +++ b/man/results.Rd @@ -15,7 +15,7 @@ aggregate = FALSE, collapse = FALSE, minCount = 5, - psiType = c("psi3", "psi5", "theta"), + psiType = c("psi3", "psi5", "theta", "jaccard"), geneColumn = "hgnc_symbol", additionalColumns = NULL, BPPARAM = bpparam() diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 0d444179..b03ef489 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -1,6 +1,7 @@ // Generated by using Rcpp::compileAttributes() -> do not edit by hand // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 +#include #include using namespace Rcpp; From 3c4574f27568843be7d7659f92c479f1f2716ba3 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Fri, 1 Apr 2022 12:38:46 +0200 Subject: [PATCH 17/80] small bug fix --- R/updateRho.R | 9 ++++---- src/loss_n_gradient_functions.cpp | 35 +++++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 7 deletions(-) diff --git a/R/updateRho.R b/R/updateRho.R index 2348c059..d90d7aed 100644 --- a/R/updateRho.R +++ b/R/updateRho.R @@ -70,18 +70,17 @@ trunc_negLogLikelihoodRho <- function(rho, ki, ni, mui){ } trunc_negLogLikelihoodRho_penalized <- function(logit_rho, ki, ni, mui, lambda){ - #-mean(dbetabinom(ki + 0.5, ni + 1, mu, rho, log=TRUE)) + #-mean(dbetabinom(ki, ni, mui, rho, log=TRUE)) rho <- plogis(logit_rho) r <- (1-rho)/rho - eps <- 0.5 alpha <- lgamma(mui*r) - alphaK <- lgamma(mui*r + ki + eps) + alphaK <- lgamma(mui*r + ki) beta <- lgamma((mui-1)*(-r)) - betaNK <- lgamma((mui-1)*(-r) + (ni - ki + eps)) + betaNK <- lgamma((mui-1)*(-r) + (ni - ki)) #mean negative log likelihood with pseudocounts - mean(alpha + beta - alphaK - betaNK ) + lambda * sqrt(logit_rho) + mean(alpha + beta - alphaK - betaNK ) + lambda * (logit_rho*logit_rho) } diff --git a/src/loss_n_gradient_functions.cpp b/src/loss_n_gradient_functions.cpp index 6cde1bf1..0078a270 100644 --- a/src/loss_n_gradient_functions.cpp +++ b/src/loss_n_gradient_functions.cpp @@ -145,9 +145,9 @@ double truncNLL_db(arma::vec par, arma::mat H, arma::vec k, arma::vec n, double arma::uvec infPosA2, infPosB2; infPosA2 = arma::find_nonfinite(alphaK); - alpha.elem( infPosA2 ) = estLgammaAlpha(y, infPosA2, rhoa); + alphaK.elem( infPosA2 ) = estLgammaAlpha(y, infPosA2, rhoa); infPosB2 = arma::find_nonfinite(betaNK); - beta.elem( infPosB2 ) = estLgammaBeta(y, infPosB2, rhob); + betaNK.elem( infPosB2 ) = estLgammaBeta(y, infPosB2, rhob); nll = arma::accu(alpha + beta - alphaK - betaNK)/k.n_elem; @@ -272,6 +272,12 @@ double truncNLL_e(arma::vec par, arma::mat x, arma::mat D, arma::vec b, beta.elem( infPosB ) = abs.elem( infPosB ); // beta.elem( infPosB ) = estLgammaBeta(y, infPosB, rhob); + arma::uvec infPosA2, infPosB2; + infPosA2 = arma::find_nonfinite(alphaK); + alphaK.elem( infPosA2 ) = abs.elem( infPosA2 ); + infPosB2 = arma::find_nonfinite(betaNK); + betaNK.elem( infPosB2 ) = abs.elem( infPosB2 ); + nll = arma::accu(alpha + beta - alphaK - betaNK)/k.n_elem; return arma::as_scalar(nll); @@ -367,6 +373,13 @@ double truncNLL_rho(double rho, arma::vec yi, arma::vec ki, arma::vec ni){ // beta.elem( infPosB ) = abs.elem( infPosB ); beta.elem( infPosB ) = estLgammaBeta(yi, infPosB, rhob); + arma::uvec infPosA2, infPosB2; + infPosA2 = arma::find_nonfinite(alphaK); + alphaK.elem( infPosA2 ) = estLgammaAlpha(yi, infPosA2, rhoa); + infPosB2 = arma::find_nonfinite(betaNK); + betaNK.elem( infPosB2 ) = estLgammaBeta(yi, infPosB2, rhob); + + nll = arma::accu(alpha + beta - alphaK - betaNK + alphaBeta)/ki.n_elem; return arma::as_scalar(nll); @@ -399,6 +412,13 @@ double truncNLL_rho_penalized(double logit_rho, arma::vec yi, arma::vec ki, arma // beta.elem( infPosB ) = abs.elem( infPosB ); beta.elem( infPosB ) = estLgammaBeta(yi, infPosB, rhob); + arma::uvec infPosA2, infPosB2; + infPosA2 = arma::find_nonfinite(alphaK); + alphaK.elem( infPosA2 ) = estLgammaAlpha(yi, infPosA2, rhoa); + infPosB2 = arma::find_nonfinite(betaNK); + betaNK.elem( infPosB2 ) = estLgammaBeta(yi, infPosB2, rhob); + + nll = arma::accu(alpha + beta - alphaK - betaNK + alphaBeta)/ki.n_elem; nll = nll + lambda * (logit_rho*logit_rho); @@ -434,6 +454,11 @@ arma::vec fullNLL(arma::mat y, arma::mat rho, arma::mat k, arma::mat n, arma::ma infPosB = arma::find_nonfinite(beta); beta.elem( infPosB ) = abs.elem( infPosB ); // beta.elem( infPosB ) = estLgammaBeta(y, infPosB, rhob); + arma::uvec infPosA2, infPosB2; + infPosA2 = arma::find_nonfinite(alphaK); + alphaK.elem( infPosA2 ) = abs.elem( infPosA2 ); + infPosB2 = arma::find_nonfinite(betaNK); + betaNK.elem( infPosB2 ) = abs.elem( infPosB2 ); if(byRows){ nll = rowMeans(alpha + beta - alphaK - betaNK + nonTruncTerms); @@ -510,6 +535,12 @@ double truncWeightedNLL_db(arma::vec par, arma::mat H, arma::vec k, arma::vec n, // beta.elem( infPosB ) = abs.elem( infPosB ); beta.elem( infPosB ) = estLgammaBeta(y, infPosB, rhob); + arma::uvec infPosA2, infPosB2; + infPosA2 = arma::find_nonfinite(alphaK); + alphaK.elem( infPosA2 ) = estLgammaAlpha(y, infPosA2, rhoa); + infPosB2 = arma::find_nonfinite(betaNK); + betaNK.elem( infPosB2 ) = estLgammaBeta(y, infPosB2, rhob); + nll = arma::accu((alpha + beta - alphaK - betaNK)%w)/k.n_elem; nll = nll + (lambda/k.n_elem) * arma::accu(d % d); From b671b34089907ff3799e2e00d967579f051dd1ec Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Thu, 28 Apr 2022 10:51:40 +0200 Subject: [PATCH 18/80] updated rho fit --- DESCRIPTION | 2 +- R/autoencoder.R | 2 +- R/fitCorrectionMethods.R | 2 +- R/getNSetterFuns.R | 2 +- R/plotMethods.R | 8 ++++---- R/pvalsNzscore.R | 2 +- R/updateRho.R | 19 ++++++++++++++----- 7 files changed, 23 insertions(+), 14 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c7704274..f7a853c7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: FRASER Type: Package Title: Find RAre Splicing Events in RNA-Seq Data -Version: 1.2.2 +Version: 1.99.0 Date: 2021-01-27 Authors@R: c( person("Christian", "Mertes", role=c("aut", "cre"), diff --git a/R/autoencoder.R b/R/autoencoder.R index bf1f4fd1..f59b4661 100644 --- a/R/autoencoder.R +++ b/R/autoencoder.R @@ -3,7 +3,7 @@ #' #' @noRd fitAutoencoder <- function(fds, q, type="psi3", noiseAlpha=1, minDeltaPsi=0.1, - rhoRange=c(1e-5, 1-1e-5), lambda=0, convergence=1e-5, + rhoRange=c(-30, 30), lambda=0, convergence=1e-5, iterations=15, initialize=TRUE, control=list(), BPPARAM=bpparam(), verbose=FALSE, nrDecoderBatches=5, weighted=FALSE, nSubset=15000, multiRho=FALSE, diff --git a/R/fitCorrectionMethods.R b/R/fitCorrectionMethods.R index f9b67599..38f5d634 100644 --- a/R/fitCorrectionMethods.R +++ b/R/fitCorrectionMethods.R @@ -40,7 +40,7 @@ fit.FraserDataSet <- function(object, implementation=c("PCA", "PCA-BB-Decoder", "AE", "AE-weighted", "PCA-BB-full", "fullAE", "PCA-regression", "PCA-reg-full", "PCA-BB-Decoder-no-weights", "BB"), - q, type="psi3", rhoRange=c(1e-8, 1-1e-8), + q, type="psi3", rhoRange=c(-30, 30), weighted=FALSE, noiseAlpha=1, convergence=1e-5, iterations=15, initialize=TRUE, control=list(), BPPARAM=bpparam(), nSubset=15000, diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R index 8e91710f..63c8d604 100644 --- a/R/getNSetterFuns.R +++ b/R/getNSetterFuns.R @@ -653,7 +653,7 @@ getPlottingDT <- function(fds, axis=c("row", "col"), type=NULL, result=NULL, level=pvalLevel)[idxrow, idxcol]), padj = c(padjVals(fds, type=type)[idxrow, idxcol]), zscore = c(zScores(fds, type=type)[idxrow, idxcol]), - obsPsi = c((k + pseudocount())/(n + 2*pseudocount())), + obsPsi = c(k/n), predPsi = c(predictedMeans(fds, type)[idxrow, idxcol]), rho = rep(rho(fds, type=type)[idxrow], ifelse(isTRUE(idxcol), ncol(fds), sum(idxcol))) diff --git a/R/plotMethods.R b/R/plotMethods.R index e18abe55..5d567691 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -25,7 +25,7 @@ #' @param type The psi type: either psi5, psi3 or theta (for SE). #' @param sampleID A sample ID which should be plotted. Can also be a vector. #' Integers are treated as indices. -#' @param idx,site A junction site ID or gene ID or one of both, which +#' @param idx A junction site ID or gene ID or one of both, which #' should be plotted. Can also be a vector. Integers are treated #' as indices. #' @param padjCutoff,zScoreCutoff,deltaPsiCutoff Significance, Z-score or delta @@ -342,16 +342,16 @@ setMethod("plotAberrantPerSample", signature="FraserDataSet", #' @rdname plotFunctions #' @export plotExpression <- function(fds, type=c("psi5", "psi3", "theta", "jaccard"), - site=NULL, result=NULL, colGroup=NULL, + idx=NULL, result=NULL, colGroup=NULL, basePlot=TRUE, main=NULL, label="aberrant", ...){ if(!is.null(result)){ type <- as.character(result$type) - site <- getIndexFromResultTable(fds, result) + idx <- getIndexFromResultTable(fds, result) } else { type <- match.arg(type) } - dt <- getPlottingDT(fds, axis="row", type=type, idx=site, ...) + dt <- getPlottingDT(fds, axis="row", type=type, idx=idx, ...) dt[,featureID:=limitGeneNamesList(featureID, maxLength=3)] if(!is.null(colGroup)){ diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R index b04f0494..749d192c 100644 --- a/R/pvalsNzscore.R +++ b/R/pvalsNzscore.R @@ -196,7 +196,7 @@ singlePvalueBetaBinomial <- function(idx, k, n, mu, rho){ pvals <- pmin(1, pbbinom(ki, ni, alphai, betai)) if(any(is.na(pvals))){ - message(date(), " : ", idx) + message(date(), ": obtained NA pvalues for junction ", idx) } return (pvals) diff --git a/R/updateRho.R b/R/updateRho.R index d90d7aed..628b21fd 100644 --- a/R/updateRho.R +++ b/R/updateRho.R @@ -10,9 +10,10 @@ updateRho <- function(fds, type, rhoRange, BPPARAM, verbose){ # fitparameters <- bplapply(seq_len(nrow(k)), estRho, nll=truncNLL_rho, # k=k, n=n, y=y, rhoRange=rhoRange, BPPARAM=BPPARAM) - fitparameters <- bplapply(seq_len(nrow(k)), estRho, nll=trunc_negLogLikelihoodRho_penalized, - k=k, n=n, y=y, rhoRange=rhoRange, lambda=0, - BPPARAM=BPPARAM) + fitparameters <- bplapply(seq_len(nrow(k)), estRho, + nll=fullNLLRho_penalized, + k=k, n=n, y=y, rhoRange=rhoRange, lambda=1e-4, + BPPARAM=BPPARAM) rho(fds) <- plogis(vapply(fitparameters, "[[", double(1), "minimum")) @@ -26,7 +27,7 @@ updateRho <- function(fds, type, rhoRange, BPPARAM, verbose){ return(fds) } -estRho <- function(idx, k, n, y, rhoRange, nll, control=list(), lambda=0){ +estRho <- function(idx, k, n, y, rhoRange, nll, control=list(), lambda=1e-4){ ki <- k[idx,] ni <- n[idx,] yi <- y[idx,] @@ -34,11 +35,19 @@ estRho <- function(idx, k, n, y, rhoRange, nll, control=list(), lambda=0){ # est <- optimize(f=nll, interval=rhoRange, yi=yi, ki=ki, ni=ni, # maximum=FALSE, tol=0.0000001) # est - est <- optimize(f=nll, interval=c(-40, 40), mui=plogis(yi), ki=ki, ni=ni, lambda=lambda, + est <- optimize(f=nll, interval=rhoRange, + mui=plogis(yi), ki=ki, ni=ni, lambda=lambda, maximum=FALSE, tol=0.0000001) est } +fullNLLRho_penalized <- function(logit_rho, ki, ni, mui, lambda=1e-4){ + rho <- plogis(logit_rho) + nll <- -mean(dbetabinom(ki, ni, mui, rho, log=TRUE)) + nll <- nll + lambda * (logit_rho^2) + return(nll) +} + negLogLikelihoodRho <- function(rho, ki, ni, mui){ #-mean(dbetabinom(ki + 0.5, ni + 1, mu, rho, log=TRUE)) From 783a88c24332ca86f3cadf7baef6b97c9d066f8a Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Mon, 4 Jul 2022 13:05:10 +0200 Subject: [PATCH 19/80] minor changes --- R/getNSetterFuns.R | 16 +++++++++++----- R/plotMethods.R | 1 + 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R index 63c8d604..d5914185 100644 --- a/R/getNSetterFuns.R +++ b/R/getNSetterFuns.R @@ -406,7 +406,7 @@ pseudocount <- function(value=NULL){ # set pseudo count if provided stopifnot(isScalarNumeric(value)) stopifnot(value >= 0) - value <- as.integer(value) + # value <- as.integer(value) options('FRASER.pseudoCount'=value) devNULL <- .setPseudoCount(value) stopifnot(value == devNULL) @@ -557,8 +557,9 @@ calcFraserWeights <- function(fds, psiType){ n <- as.matrix(N(fds, psiType)) mu <- t(predictMu(fds, psiType)) rho <- rho(fds, psiType) - dataPsi <- plogis(t( - x(fds, type=psiType, all=TRUE, center=FALSE, noiseAlpha=NULL))) + # dataPsi <- plogis(t( + # x(fds, type=psiType, all=TRUE, center=FALSE, noiseAlpha=NULL))) + dataPsi <- k / n # pearson residuals for BB # on counts of success k @@ -567,13 +568,18 @@ calcFraserWeights <- function(fds, psiType){ # (1+((n+2*pseudocount())-1)*rho)) # on probability of success mu r <- (dataPsi - mu) / sqrt( - mu * (1-mu) * (1+((n+2*pseudocount())-1)*rho) / - (n+2*pseudocount())) + # mu * (1-mu) * (1+((n+2*pseudocount())-1)*rho) / + # (n+2*pseudocount())) + mu * (1-mu) * (1+(n-1)*rho) / n + ) # weights according to Huber function (as in edgeR) c <- 1.345; # constant, as suggested in edgeR paper w <- ifelse(abs(r) > c, c/abs(r) , 1) + # set weights to 0 if NA (i.e. N=0) + w[is.na(w)] <- 0 + return(w) } diff --git a/R/plotMethods.R b/R/plotMethods.R index 5d567691..6b68fc26 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -487,6 +487,7 @@ plotExpectedVsObservedPsi <- function(fds, type=c("psi5", "psi3", "theta", "jacc geom_point(alpha=ifelse(dt$aberrant, 1, 0.5), color=c("gray70", "firebrick")[dt$aberrant + 1]) + geom_abline(intercept = 0, slope=1) + + xlim(c(0,1)) + ylim(c(0,1)) + theme_bw() + theme(legend.position="none") + xlab(xlab) + From 75acec3f144f2ac76360f140f3081751ce71149c Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Tue, 12 Jul 2022 11:47:30 +0200 Subject: [PATCH 20/80] fix bug in spliceSiteID mapping and intron retention annotation --- NAMESPACE | 1 + R/AllGenerics.R | 44 ++++++++++++++++++++++++++++++++--------- R/FRASER-package.R | 2 +- R/FraserDataSet-class.R | 2 +- R/countRNAseqData.R | 22 +++++++++++++-------- R/getNSetterFuns.R | 2 +- R/resultAnnotations.R | 12 ++++++++++- man/fit.Rd | 2 +- man/plotFunctions.Rd | 10 +++++----- 9 files changed, 70 insertions(+), 27 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index ab6011bb..075aa04a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -286,6 +286,7 @@ importFrom(ggplot2,theme_bw) importFrom(ggplot2,xlab) importFrom(ggplot2,xlim) importFrom(ggplot2,ylab) +importFrom(ggplot2,ylim) importFrom(ggrepel,geom_text_repel) importFrom(grDevices,colorRampPalette) importFrom(matrixStats,colAnys) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 7e8055a2..334db5a5 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -614,9 +614,11 @@ mapSeqlevels <- function(fds, style="UCSC", ...){ #' retrieve a single sample result object #' @noRd resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores, - psivals, rawCts, rawTotalCts, deltaPsiVals, - psiType, rowMeansK, rowMeansN, aberrant, - aggregate, rho, pvalsGene=NULL, padjsGene=NULL, + psivals, rawCts, rawTotalCts, rawNonsplitCts, + rawNsProportion, nsProportion_99quantile, + deltaPsiVals, psiType, rowMeansK, rowMeansN, + aberrant, aggregate, rho, + pvalsGene=NULL, padjsGene=NULL, aberrantGene, additionalColumns, geneColumn="hgnc_symbol"){ mcols(gr)$idx <- seq_along(gr) @@ -664,6 +666,21 @@ resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores, mcols(ans)$meanCounts <- Rle(round(rowMeansK[goodCut], 2)) mcols(ans)$meanTotalCounts <- Rle(round(rowMeansN[goodCut], 2)) + if(psiType == "jaccard"){ + mcols(ans)$nonsplitCounts <- + Rle(round(rawNonsplitCts[goodCut, sampleID], 2)) + mcols(ans)$nonsplitProportion <- + Rle(round(rawNsProportion[goodCut, sampleID], 2)) + mcols(ans)$nonsplitProportion_99quantile <- + Rle(round(nsProportion_99quantile[goodCut], 2)) + } + + if(!is.null(additionalColumns)){ + for(column in additionalColumns){ + mcols(ans)[,column] <- Rle(mcols(gr[goodCut])[,column]) + } + } + if(isTRUE(aggregate)){ # report junction more than once if it is significant for several genes nrGenesPerJunction <- table(geneJunctions) @@ -680,12 +697,6 @@ resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores, mcols(ans)$hgncSymbol <- Rle(mcols(ans)$hgncSymbol) } - if(!is.null(additionalColumns)){ - for(column in additionalColumns){ - mcols(ans)[,column] <- Rle(mcols(gr[goodCut])[,column]) - } - } - # remove helper column mcols(ans)$idx <- NULL @@ -772,12 +783,27 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, colnames(deltaPsiVals) <- sc } + if(type == "jaccard"){ + rawNonsplitCts <- as.matrix(assay(tmp_x, "rawCountsJnonsplit")) + rawNsProportion <- rawNonsplitCts / rawTotalCts + nsProportion_99quantile <- + rowQuantiles(rawNsProportion, probs=0.99) + } else{ + rawNonsplitCts <- NULL + rawNsProportion <- NULL + nsProportion_99quantile <- NULL + } + + # create result table sampleRes <- lapply(sc, resultsSingleSample, gr=gr, pvals=pvals, padjs=padjs, zscores=zscores, psiType=type, psivals=psivals, deltaPsiVals=deltaPsiVals, rawCts=rawCts, rawTotalCts=rawTotalCts, + rawNonsplitCts=rawNonsplitCts, + rawNsProportion=rawNsProportion, + nsProportion_99quantile=nsProportion_99quantile, rowMeansK=rowMeansK, rowMeansN=rowMeansN, aberrant=aberrant, aggregate=aggregate, rho=rho, geneColumn=geneColumn, diff --git a/R/FRASER-package.R b/R/FRASER-package.R index e70f894e..c6511822 100644 --- a/R/FRASER-package.R +++ b/R/FRASER-package.R @@ -100,7 +100,7 @@ #' scale_y_log10 scale_color_gradientn labs theme_bw theme #' scale_color_brewer scale_color_discrete scale_linetype_manual #' annotate geom_histogram scale_fill_manual xlim scale_colour_manual -#' element_blank annotation_logticks +#' element_blank annotation_logticks ylim #' #' @importFrom tibble as_tibble #' diff --git a/R/FraserDataSet-class.R b/R/FraserDataSet-class.R index a0e9d7f5..a92e27e2 100644 --- a/R/FraserDataSet-class.R +++ b/R/FraserDataSet-class.R @@ -103,7 +103,7 @@ validateWorkingDir <- function(object) { } if(!dir.exists(object@workingDir)){ message(date(), ": The given working directory '", object@workingDir, - "' does not exists. We will create it." + "' does not exist. We will create it." ) dir.create(object@workingDir, recursive = TRUE) } diff --git a/R/countRNAseqData.R b/R/countRNAseqData.R index eafb79d6..1ed57401 100644 --- a/R/countRNAseqData.R +++ b/R/countRNAseqData.R @@ -363,7 +363,7 @@ getNonSplitReadCountsForAllSamples <- function(fds, splitCountRanges, " splice junctions are found.") # extract donor and acceptor sites - spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges, fds) + spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges) message(date(), ": In total ", length(spliceSiteCoords), " splice sites (acceptor/donor) will be counted ...") @@ -860,7 +860,7 @@ countNonSplicedReads <- function(sampleID, splitCountRanges, fds, } # extract donor and acceptor sites - spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges, fds) + spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges) } @@ -982,7 +982,7 @@ readJunctionMap <- function(junctionMap){ #' extracts the splice site coordinates from a junctions GRange object ( #' @noRd -extractSpliceSiteCoordinates <- function(junctions, fds){ +extractSpliceSiteCoordinates <- function(junctions){ spliceSiteCoords <- unlist(GRangesList( lapply(unique(strand(junctions)), extractSpliceSiteCoordsPerStrand, @@ -1052,15 +1052,21 @@ annotateSpliceSite <- function(gr){ dt <- GRanges2SAF(gr) # extract donor/acceptor annotation - startSideDT <- dt[,.(End=Start, type="start"),by="Chr,Start,Strand"] - endSideDT <- dt[,.(Start=End, type="end" ),by="Chr,End,Strand"] + startSiteDT <- dt[,.(End=Start, type="start"),by="Chr,Start,Strand"] + endSiteDT <- dt[,.(Start=End, type="end" ),by="Chr,End,Strand"] + startSiteDT[,Start:=Start-1] + endSiteDT[,End:=End+1] # annotate and enumerate donor/acceptor - annotadedDT <- rbind(startSideDT, endSideDT) - annotadedDT[,id:=seq_len(nrow(annotadedDT))] + annotatedDT <- rbind(startSiteDT, endSiteDT) + annotatedDT[,id:=.GRP, by="Chr,Start,End,Strand"] + + # set back start / end positions for merging with junction ranges + annotatedDT[type == "start", Start:=End] + annotatedDT[type == "end", End:=Start] # convert back to granges - annogr <- makeGRangesFromDataFrame(annotadedDT, keep.extra.columns=TRUE) + annogr <- makeGRangesFromDataFrame(annotatedDT, keep.extra.columns=TRUE) ids <- lapply(c("start", "end"), function(type){ # reduce annogr to only the specific type to prevent overlap diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R index d5914185..608b0171 100644 --- a/R/getNSetterFuns.R +++ b/R/getNSetterFuns.R @@ -406,7 +406,7 @@ pseudocount <- function(value=NULL){ # set pseudo count if provided stopifnot(isScalarNumeric(value)) stopifnot(value >= 0) - # value <- as.integer(value) + value <- as.numeric(value) options('FRASER.pseudoCount'=value) devNULL <- .setPseudoCount(value) stopifnot(value == devNULL) diff --git a/R/resultAnnotations.R b/R/resultAnnotations.R index 52f1c70b..5cafc8ea 100644 --- a/R/resultAnnotations.R +++ b/R/resultAnnotations.R @@ -334,7 +334,17 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ spliceType := "annotatedIntron_reducedUsage"] junctions_dt[annotatedJunction == "both", causesFrameshift := "unlikely"] - # TODO check for intron retention + if(all(c("nonsplitProportion", "nonsplitProportion_99quantile") %in% + colnames(junctions_dt))){ + junctions_dt[spliceType == "annotatedIntron_reducedUsage" & + type == "jaccard" & + nonsplitProportion >= nonsplitProportion_99quantile, + spliceType := "intron_retention"] + + # TODO check frameshift for intron retention + junctions_dt[spliceType == "intron_retention", + causesFrameshift := "inconclusive"] + } starts <- which(junctions_dt[psi_positions]$annotatedJunction=="start") ends <- which(junctions_dt[psi_positions]$annotatedJunction=="end") diff --git a/man/fit.Rd b/man/fit.Rd index eb68296c..35f7966e 100644 --- a/man/fit.Rd +++ b/man/fit.Rd @@ -11,7 +11,7 @@ "fullAE", "PCA-regression", "PCA-reg-full", "PCA-BB-Decoder-no-weights", "BB"), q, type = "psi3", - rhoRange = c(1e-08, 1 - 1e-08), + rhoRange = c(-30, 30), weighted = FALSE, noiseAlpha = 1, convergence = 1e-05, diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd index f6b36c41..4cfeb0e6 100644 --- a/man/plotFunctions.Rd +++ b/man/plotFunctions.Rd @@ -46,7 +46,7 @@ plotExpression( fds, type = c("psi5", "psi3", "theta", "jaccard"), - site = NULL, + idx = NULL, result = NULL, colGroup = NULL, basePlot = TRUE, @@ -158,14 +158,14 @@ otherwise in the details for each plot function} \item{BPPARAM}{BiocParallel parameter to use.} +\item{idx}{A junction site ID or gene ID or one of both, which +should be plotted. Can also be a vector. Integers are treated +as indices.} + \item{result}{The result table to be used by the method.} \item{colGroup}{Group of samples that should be colored.} -\item{idx, site}{A junction site ID or gene ID or one of both, which -should be plotted. Can also be a vector. Integers are treated -as indices.} - \item{global}{Flag to plot a global Q-Q plot, default FALSE} \item{conf.alpha}{If set, a confidence interval is plotted, defaults to 0.05} From 62d90c16249cd5155f3f0fc79f6b83ca0e0487fa Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Tue, 12 Jul 2022 11:55:59 +0200 Subject: [PATCH 21/80] fix missing spliceSiteID bug --- R/countRNAseqData.R | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/R/countRNAseqData.R b/R/countRNAseqData.R index f68c4bdc..d2b1a8ae 100644 --- a/R/countRNAseqData.R +++ b/R/countRNAseqData.R @@ -976,15 +976,10 @@ readJunctionMap <- function(junctionMap){ #' @noRd extractSpliceSiteCoordinates <- function(junctions, fds){ - if(strandSpecific(fds) >= 1L){ - spliceSiteCoords <- unlist(GRangesList( - extractSpliceSiteCoordsPerStrand(junctions, "+"), - extractSpliceSiteCoordsPerStrand(junctions, "-") - )) - } else { - strand(junctions) <- "*" - spliceSiteCoords <- extractSpliceSiteCoordsPerStrand(junctions, "*") - } + spliceSiteCoords <- unlist(GRangesList( + lapply(unique(strand(junctions)), extractSpliceSiteCoordsPerStrand, + junctions=junctions) + )) return(unique(sort(spliceSiteCoords))) } @@ -1049,12 +1044,18 @@ annotateSpliceSite <- function(gr){ dt <- GRanges2SAF(gr) # extract donor/acceptor annotation - startSideDT <- dt[,.(End=Start, type="start"),by="Chr,Start,Strand"] - endSideDT <- dt[,.(Start=End, type="end" ),by="Chr,End,Strand"] + startSiteDT <- dt[,.(End=Start, type="start"),by="Chr,Start,Strand"] + endSiteDT <- dt[,.(Start=End, type="end" ),by="Chr,End,Strand"] + startSiteDT[,Start:=Start-1] + endSiteDT[,End:=End+1] # annotate and enumerate donor/acceptor - annotadedDT <- rbind(startSideDT, endSideDT) - annotadedDT[,id:=seq_len(nrow(annotadedDT))] + annotatedDT <- rbind(startSiteDT, endSiteDT) + annotatedDT[,id:=.GRP, by="Chr,Start,End,Strand"] + + # set back start / end positions for merging with junction ranges + annotatedDT[type == "start", Start:=End] + annotatedDT[type == "end", End:=Start] # convert back to granges annogr <- makeGRangesFromDataFrame(annotadedDT, keep.extra.columns=TRUE) From d930ca1af06a0291a3849e694c507a9d4dab5277 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Tue, 12 Jul 2022 14:22:28 +0200 Subject: [PATCH 22/80] fix missing spliceSiteID bug part2 --- R/countRNAseqData.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/countRNAseqData.R b/R/countRNAseqData.R index d2b1a8ae..b5b442df 100644 --- a/R/countRNAseqData.R +++ b/R/countRNAseqData.R @@ -361,7 +361,7 @@ getNonSplitReadCountsForAllSamples <- function(fds, splitCountRanges, " splice junctions are found.") # extract donor and acceptor sites - spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges, fds) + spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges) message(date(), ": In total ", length(spliceSiteCoords), " splice sites (acceptor/donor) will be counted ...") @@ -852,7 +852,7 @@ countNonSplicedReads <- function(sampleID, splitCountRanges, fds, } # extract donor and acceptor sites - spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges, fds) + spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges) } @@ -974,7 +974,7 @@ readJunctionMap <- function(junctionMap){ #' extracts the splice site coordinates from a junctions GRange object ( #' @noRd -extractSpliceSiteCoordinates <- function(junctions, fds){ +extractSpliceSiteCoordinates <- function(junctions){ spliceSiteCoords <- unlist(GRangesList( lapply(unique(strand(junctions)), extractSpliceSiteCoordsPerStrand, @@ -1058,7 +1058,7 @@ annotateSpliceSite <- function(gr){ annotatedDT[type == "end", End:=Start] # convert back to granges - annogr <- makeGRangesFromDataFrame(annotadedDT, keep.extra.columns=TRUE) + annogr <- makeGRangesFromDataFrame(annotatedDT, keep.extra.columns=TRUE) ids <- lapply(c("start", "end"), function(type){ # reduce annogr to only the specific type to prevent overlap From cc4868d8aa64ec84b73f4a476620920dc635888a Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Tue, 12 Jul 2022 16:28:33 +0200 Subject: [PATCH 23/80] add functionality to plot coverage from bam files --- DESCRIPTION | 1 + NAMESPACE | 7 + R/FRASER-package.R | 10 +- R/plotMethods.R | 349 ++++++++++++++++++++++++++++++++++++++++++- man/plotFunctions.Rd | 155 +++++++++++++++++++ 5 files changed, 513 insertions(+), 9 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 854339e6..fc5ca357 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -82,6 +82,7 @@ Suggests: covr, TxDb.Hsapiens.UCSC.hg19.knownGene, org.Hs.eg.db, + SGSeq LinkingTo: Rcpp, RcppArmadillo diff --git a/NAMESPACE b/NAMESPACE index 35e2d66a..78006b7d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -60,6 +60,8 @@ export(pVals) export(padjVals) export(pairedEnd) export(plotAberrantPerSample) +export(plotBamCoverage) +export(plotBamCoverageFromResultTable) export(plotCountCorHeatmap) export(plotEncDimSearch) export(plotExpectedVsObservedPsi) @@ -166,6 +168,7 @@ importFrom(GenomicAlignments,summarizeJunctions) importFrom(GenomicFeatures,genes) importFrom(GenomicFeatures,intronsByTranscript) importFrom(GenomicFeatures,makeTxDbFromGFF) +importFrom(GenomicFeatures,seqlevels0) importFrom(GenomicRanges,GRanges) importFrom(GenomicRanges,GRangesList) importFrom(GenomicRanges,findOverlaps) @@ -177,6 +180,9 @@ importFrom(HDF5Array,loadHDF5SummarizedExperiment) importFrom(HDF5Array,path) importFrom(HDF5Array,saveHDF5SummarizedExperiment) importFrom(HDF5Array,writeHDF5Array) +importFrom(IRanges,"%over%") +importFrom(IRanges,"end<-") +importFrom(IRanges,"start<-") importFrom(IRanges,IRanges) importFrom(IRanges,from) importFrom(IRanges,ranges) @@ -328,5 +334,6 @@ importFrom(tibble,as_tibble) importFrom(tools,file_path_as_absolute) importFrom(utils,capture.output) importFrom(utils,packageVersion) +importFrom(utils,tail) importMethodsFrom(OUTRIDER,results) useDynLib(FRASER) diff --git a/R/FRASER-package.R b/R/FRASER-package.R index 87e5ad96..560de2f9 100644 --- a/R/FRASER-package.R +++ b/R/FRASER-package.R @@ -22,7 +22,8 @@ #' ### GRange/Experiment/bamFile packages #' @importFrom BiocGenerics updateObject counts counts<- strand strand<- which -#' @importFrom GenomicFeatures makeTxDbFromGFF intronsByTranscript genes +#' @importFrom GenomicFeatures makeTxDbFromGFF intronsByTranscript genes +#' seqlevels0 #' @importFrom GenomicAlignments junctions readGAlignments summarizeJunctions #' readGAlignmentPairs #' @importFrom SummarizedExperiment assay assay<- assays assays<- assayNames @@ -30,7 +31,8 @@ #' rbind Assays #' @importFrom GenomicRanges findOverlaps granges GRanges GRangesList #' makeGRangesFromDataFrame invertStrand -#' @importFrom IRanges subsetByOverlaps from to IRanges ranges +#' @importFrom IRanges subsetByOverlaps from to IRanges ranges %over% +#' start<- end<- #' @importFrom Rsamtools ScanBamParam scanBamHeader bamMapqFilter #' bamWhich bamWhich<- BamFile idxstatsBam #' @importFrom Rsubread featureCounts @@ -74,7 +76,7 @@ #' @importFrom R.utils renameFile withTimeout #' @importFrom tools file_path_as_absolute #' @importFrom methods as callNextMethod is new show slot slot<- validObject -#' @importFrom utils capture.output packageVersion +#' @importFrom utils capture.output packageVersion tail #' #' #' @@ -128,5 +130,5 @@ globalVariables(c(".", "J", ".N", ".asDataFrame", "End", "first_feature", "model", "mu", "n", ",nsubset", "o3", "o5", "obsPsi", "os", "pa", "padj", "passed", "pByFeature", "pointNr", "predPsi", "psi3", "psi5", "psiType", "psiValue", "seqlength", "seqlevel", "Step", "traceNr", - "uniqueID", "V1", "value", "zscore", "maxDTheta"), + "uniqueID", "V1", "value", "zscore", "maxDTheta", "par"), package="FRASER") diff --git a/R/plotMethods.R b/R/plotMethods.R index 6d824392..4c0ef708 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -15,6 +15,8 @@ #' \item plotFilterExpression() #' \item plotFilterVariability() #' \item plotEncDimSearch() +#' \item plotBamCoverage() +#' \item plotBamCoverageFromResultTable() #' } #' #' For a detailed description of each plot function please see the details. @@ -52,11 +54,59 @@ #' sample-sample correlation heatmap or \code{"junctionSample"} #' for a junction-sample correlation heatmap. #' @param onlyVariableIntrons Logical value indicating whether to show only -#' introns that also pass the variability filter. Defaults to -#' FALSE. +#' introns that also pass the variability filter. Defaults to +#' FALSE. #' @param onlyExpressedIntrons Logical value indicating whether to show only -#' introns that also pass the expression filter. Defaults to -#' FALSE. +#' introns that also pass the expression filter. Defaults to +#' FALSE. +#' @param control_samples The sampleIDs of the samples used as control in +#' \code{plotBamCoverage}. +#' @param min_junction_count The minimal junction count across samples required +#' for a junction to appear in the splicegraph and coverage tracks +#' of \code{plotBamCoverage}. +#' @param txdb A TxDb object giving the gene/transcript annotation to use. +#' @param orgDb A OrgDb object giving the mapping of gene ids and symbols. +#' @param show_full_gene Should the full genomic range of the gene be shown in +#' \code{plotBamCoverageFromResultTable} (default: FALSE)? +#' If FALSE, only a certain region (see parameters left_extension +#' and right_extension) around the outlier junction is shown. +#' @param left_extension Indicating how far the plotted range around the outlier +#' junction should be extended to the left in +#' \code{plotBamCoverageFromResultTable}. +#' @param right_extension Indicating how far the plotted range around the +#' outlier junction should be extended to the right in +#' \code{plotBamCoverageFromResultTable}. +#' @param res_gene_col The column name in the given results table that +#' contains the gene annotation. +#' @param res_gene_type The type of gene annotation in \code{res_gene_col} +#' (e.g. SYMBOL or ENTREZID etc.). This information is needed for +#' mapping between the results table and the provided annotation +#' in the txdb object. +#' @param txdb_geneid_type The type of gene_id present in \code{genes(txdb)} +#' (e.g. ENTREZID). This information is needed for +#' mapping between the results table and the provided annotation +#' in the txdb object. +#' @param highlight_range A \code{GenomicRanges} or \code{GenomicRangesList} +#' object of ranges to be highlighted in the splicegraph of +#' \code{plotBamCoverage}. +#' @param highlight_range_color The color of highlighted ranges in +#' the splicegraph of \code{plotBamCoverage}. +#' @param toscale In \code{plotBamCoverage}, indicates which part of the +#' plotted region should be drawn to scale. Possible values are +#' 'exon' (exonic regions are drawn to scale), +#' 'gene' (both exonic and intronic regions are drawn to scale) or +#' 'none' (exonic and intronic regions have constant length) +#' (see SGSeq package). +#' @param splicegraph_labels Indicated the format of exon/splice junction +#' labels in the splicegraph of \code{plotBamCoverage}. +#' Possible values are 'genomic_range' (gives the start position +#' of the first exon and the end position of the last exon that +#' are shown), 'id' (format E1,... J1,...), 'name' (format +#' type:chromosome:start-end:strand for each feature), +#' 'none' for no labels (see SGSeq package). +#' @param splicegraph_position The position of the splicegraph relative to the +#' coverage tracks in \code{plotBamCoverage}. Possible values +#' are 'top' (default) and 'bottom'. #' #### Graphical parameters #' @param main Title for the plot, if missing a default title will be used. @@ -89,6 +139,26 @@ #' @param bins Set the number of bins to be used in the histogram. #' @param legend.position Set legend position (x and y coordinate), defaults to #' the top right corner. +#' @param color_annotated The color for exons and junctions present in +#' the given annotation (in the splicegraph of +#' \code{plotBamCoverage}). +#' @param color_novel The color for novel exons and junctions not present in +#' the given annotation (in the splicegraph of +#' \code{plotBamCoverage}). +#' @param color_sample_interest The color in \code{plotBamCoverage} for the +#' sample of interest. +#' @param color_control_samples The color in \code{plotBamCoverage} for the +#' samples used as controls. +#' @param curvature_splicegraph The curvature of the junction arcs in the +#' splicegraph in \code{plotBamCoverage}. Decrease this value +#' for flatter arcs and increase it for steeper arcs. +#' @param curvature_coverage The curvature of the junction arcs in the +#' coverage tracks of \code{plotBamCoverage}. Decrease this +#' value for flatter arcs and increase it for steeper arcs. +#' @param mar The margin of the plot area for \code{plotBamCoverage} +#' (b,l,t,r). +#' @param cex For controlling the size of text and numbers in +#' \code{plotBamCoverage}. #' #### Additional ... parameter #' @param ... Additional parameters passed to plot() or plot_ly() if not stated @@ -173,7 +243,30 @@ #' plotQQ(fds, result=res[1]) #' plotExpectedVsObservedPsi(fds, type="psi5", res=res[1]) #' -#' +#' # plot splice graph and coverage from bam files in a given region +#' fds <- createTestFraserSettings() +#' gr <- GRanges(seqnames="chr19", +#' IRanges(start=7587496, end=7598895), +#' strand="+") +#' plotBamCoverage(fds, gr=gr, sampleID="sample3", +#' control_samples="sample2", min_junction_count=5, +#' curvature_splicegraph=1, curvature_coverage=1, +#' mar=c(1, 7, 0.1, 3)) +#' +#' # plot coverage from bam file for a row in the result table +#' \donttest{ +#' fds <- createTestFraserDataSet() +#' require(TxDb.Hsapiens.UCSC.hg19.knownGene) +#' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene +#' require(org.Hs.eg.db) +#' orgDb <- org.Hs.eg.db +#' plotBamCoverageFromResultTable(fds, result=res[1,], show_full_gene=TRUE, +#' txdb=txdb, orgDb=orgDb, control_samples="sample3") +#' plotBamCoverageFromResultTable(fds, result=res[1,], show_full_gene=FALSE, +#' control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb, +#' curvature_coverage=0.5, right_extension=5000, left_extension=5000) +#' } +#' NULL @@ -1056,6 +1149,213 @@ plotCountCorHeatmap.FRASER <- function(object, setMethod("plotCountCorHeatmap", signature="FraserDataSet", plotCountCorHeatmap.FRASER) +#' +#' Plot coverage from bam files for given genomic range and sample ids +#' +#' @rdname plotFunctions +#' @export +plotBamCoverage <- function(fds, gr, sampleID, + control_samples=sample( + samples(fds[, which(samples(fds) != sampleID)]), + min(3, ncol(fds)-length(sampleID))), + txdb=NULL, min_junction_count=20, + highlight_range=NULL, highlight_range_color="firebrick", + color_annotated="gray", color_novel="goldenrod3", + color_sample_interest="firebrick", color_control_samples="dodgerblue4", + toscale=c("exon", "gene", "none"), mar=c(2, 10, 0.1, 5), + curvature_splicegraph=1, curvature_coverage=1, cex=1, + splicegraph_labels=c("genomic_range", "id", "name", "none"), + splicegraph_position=c("top", "bottom")){ + + require(SGSeq) + if(missing(fds)){ + stop("Missing input: fds (FraserDataSet object)") + } else{ + stopifnot(is(fds, "FraserDataSet")) + } + if(missing(gr)){ + stop("Missing input gr (genomic range to plot).") + } else{ + stopifnot(is(gr, "GenomicRanges")) + } + if(missing(sampleID)){ + stop("Missing input: sample_of_interest") + } + toscale <- match.arg(toscale) + splicegraph_labels <- match.arg(splicegraph_labels) + splicegraph_position <- match.arg(splicegraph_position) + + # extract bam info for sample ids to plot + all_sids <- c(sampleID, control_samples) + si_out <- getSGSeqSI(fds, all_sids) + sgseq_si <- si_out[[1]] + fds <- si_out[[2]] + + # collapse input ranges if several + gr <- range(gr) + gr <- keepSeqlevels(gr, as.character(seqnames(gr))) + if(all(strand(gr) == "*")){ + # seems to throw an error with * strand so guessing + strand instead + strand(gr) <- "+" + } + + # convert highlight_range to GRangesList if not + if(!is.null(highlight_range) && !is(highlight_range, "GRangesList")){ + stopifnot(is(highlight_range, "GRanges")) + highlight_range <- GRangesList(highlight_range) + } + + # extract splice graph + sgfc_pred <- analyzeFeatures(sgseq_si, which = gr, + min_junction_count=min_junction_count, psi=0) + + # overlap detected junctions with annotation + if(!is.null(txdb)){ + # subset to chr of interest + seqlevels(txdb) <- as.character(seqnames(gr)) + + # extract transcript features with SGSeq package + txf <- convertToTxFeatures(txdb) + txf <- txf[txf %over% gr] + + # restore seqlevels of txdb object + seqlevels(txdb) <- seqlevels0(txdb) + + # annotate splice junctions with annotation features + sgfc_pred <- SGSeq::annotate(sgfc_pred, txf) + } else{ + # when no annotation is given, show everything in the same color + color_novel <- color_annotated + } + + # get genomic positions for first and last exon in given range + if(splicegraph_labels == "genomic_range"){ + # tell plotSpliceGraph function to use custom labels + splicegraph_labels <- "label" + # create custom labels (only for first and last exon for readability) + mcols(sgfc_pred)$label <- "" + exons <- which(type(sgfc_pred) == "E" & rowRanges(sgfc_pred) %over% gr) + exons <- unique(c(exons[1], tail(exons, n=1))) + if(length(exons) == 1){ + mcols(sgfc_pred)$label[exons] <- + paste(seqnames(sgfc_pred), + paste(start(sgfc_pred), end(sgfc_pred), sep="-"), + strand(sgfc_pred), sep=":")[exons] + } + if(length(exons) == 2){ + mcols(sgfc_pred)$label[exons[1]] <- + paste(seqnames(sgfc_pred), + start(sgfc_pred), + strand(sgfc_pred), sep=":")[exons[1]] + mcols(sgfc_pred)$label[exons[2]] <- + paste(seqnames(sgfc_pred), + end(sgfc_pred), + strand(sgfc_pred), sep=":")[exons[2]] + } + } + + # plot splice graph and coverage of junctions from bam + nr_sa2p <- length(all_sids) + par(mfrow = c(nr_sa2p+1, 1), mar=mar, cex=cex) + if(splicegraph_position == "top"){ + plotSpliceGraph(rowRanges(sgfc_pred), + which=gr, + toscale=toscale, + color=color_annotated, + color_novel=color_novel, + ypos=c(0.25, 0.1), + ranges=highlight_range, + ranges_color=highlight_range_color, + ranges_ypos=c(0.01, 0.02), + curvature=curvature_splicegraph, + label=splicegraph_labels) + } + for (j in seq_along(sampleID)) { + plotCoverage( + sgfc_pred[, which(colnames(sgfc_pred) == sampleID[j])], + which = gr, + toscale = toscale, + label=sampleID[j], + color=color_sample_interest, + curvature=curvature_coverage) + } + for (j in seq_along(control_samples)) { + plotCoverage( + sgfc_pred[, which(colnames(sgfc_pred) == control_samples[j])], + which = gr, + toscale = toscale, + label=control_samples[j], + color=color_control_samples, + curvature=curvature_coverage) + } + if(splicegraph_position == "bottom"){ + plotSpliceGraph(rowRanges(sgfc_pred), + which=gr, + toscale=toscale, + color_novel=color_novel, + ypos=c(0.25, 0.1), + ranges=highlight_range, + ranges_color=highlight_range_color, + ranges_ypos=c(0.01, 0.02), + curvature=curvature_splicegraph, + label=splicegraph_labels) + } + + return(invisible(fds)) +} + +#' +#' Plot coverage from bam files for given row of results table +#' +#' @rdname plotFunctions +#' @export +plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE, + txdb=NULL, orgDb=NULL, res_gene_col="hgncSymbol", + res_gene_type="SYMBOL", txdb_geneid_type="ENTREZID", + left_extension=1000, right_extension=1000, ...){ + stopifnot(is(fds, "FraserDataSet")) + + if(is(result, "GenomicRanges")){ + result <- as.data.table(result) + } + + stopifnot(is.data.table(result)) + stopifnot(result[,.N] == 1) + + sid <- result[,sampleID] + jidx <- getIndexFromResultTable(fds, result) + outlier_range <- rowRanges(fds, type=result[,type])[jidx,] + + # showing either full range of the gene in which the outlier occured + if(show_full_gene == TRUE){ + if(missing(txdb)){ + stop("Missing input: txdb (for extracting gene range)") + } + if(missing(orgDb)){ + stop("Missing input: orgDb (for mapping of IDs to txdb)") + } + txdb_geneid <- select(orgDb, + keys=as.character(result[,res_gene_col, with=FALSE]), + columns=txdb_geneid_type, + keytype=res_gene_type)[1] + gr <- genes(txdb, filter=list("gene_id"=txdb_geneid)) + } else{ + # or just showing a certain region around the outlier junction + gr <- outlier_range + start(gr) <- start(gr) - left_extension + end(gr) <- end(gr) + right_extension + } + + # create the coverage plot for the given outlier + fds <- plotBamCoverage(fds, + gr=gr, + sampleID=sid, + txdb=txdb, + highlight_range=outlier_range, + ...) + return(invisible(fds)) +} + #' #' helper function to get the annotation as data frame from the col data object #' @@ -1112,3 +1412,42 @@ ggplotLabelPsi <- function(type, asCharacter=FALSE){ FUN.VALUE=character(1)) } } + +#' +#' Extract info from bam files needed for SGSeq functions to work +#' +#' @noRd +getSGSeqSI <- function(fds, sample_ids){ + + require(SGSeq) + # check if bam info is already stored in fds for given samples + if("SGSeq_sampleinfo" %in% names(metadata(fds))){ + si <- metadata(fds)[["SGSeq_sampleinfo"]] + si <- si[si$sample_name %in% sample_ids,] + if(nrow(si) != length(sample_ids)){ + # add bam info for missing sample_ids + missing_ids <- sample_ids[!sample_ids %in% si$sample_name] + message("Extracting SGSeq sample info from BAM files for samples ", + paste(missing_ids, collapse=", "), " ...") + df_missing <- data.frame( + sample_name=samples(fds)[samples(fds) %in% missing_ids], + file_bam=bamFile(fds)[samples(fds) %in% missing_ids]) + si_new <- getBamInfo(df_missing, yieldSize=1e6) + si_new$lib_size <- 50e6 # dummy value to speed up this part + si <- rbind(si, si_new) + metadata(fds)[["SGSeq_sampleinfo"]] <- + rbind(metadata(fds)[["SGSeq_sampleinfo"]], si_new) + } + return(list(si, fds)) + } else{ + message("Extracting SGSeq sample info from BAM files for samples ", + paste(sample_ids, collapse=", "), " ...") + df <- data.frame( + sample_name=samples(fds)[samples(fds) %in% sample_ids], + file_bam=bamFile(fds)[samples(fds) %in% sample_ids]) + si <- getBamInfo(df, yieldSize=1e6) + si$lib_size <- 50e6 # dummy value to speed up this part + metadata(fds)[["SGSeq_sampleinfo"]] <- si + return(list(si, fds)) + } +} diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd index cbeddee6..2fc50b2b 100644 --- a/man/plotFunctions.Rd +++ b/man/plotFunctions.Rd @@ -16,6 +16,8 @@ \alias{plotEncDimSearch,FraserDataSet-method} \alias{plotFilterVariability} \alias{plotCountCorHeatmap,FraserDataSet-method} +\alias{plotBamCoverage} +\alias{plotBamCoverageFromResultTable} \title{Visualization functions for FRASER} \usage{ \S4method{plotVolcano}{FraserDataSet}( @@ -126,6 +128,43 @@ plotFilterVariability( plotCov = TRUE, ... ) + +plotBamCoverage( + fds, + gr, + sampleID, + control_samples = sample(samples(fds[, which(samples(fds) != sampleID)]), min(3, + ncol(fds) - length(sampleID))), + txdb = NULL, + min_junction_count = 20, + highlight_range = NULL, + highlight_range_color = "firebrick", + color_annotated = "gray", + color_novel = "goldenrod3", + color_sample_interest = "firebrick", + color_control_samples = "dodgerblue4", + toscale = c("exon", "gene", "none"), + mar = c(2, 10, 0.1, 5), + curvature_splicegraph = 1, + curvature_coverage = 1, + cex = 1, + splicegraph_labels = c("genomic_range", "id", "name", "none"), + splicegraph_position = c("top", "bottom") +) + +plotBamCoverageFromResultTable( + fds, + result, + show_full_gene = FALSE, + txdb = NULL, + orgDb = NULL, + res_gene_col = "hgncSymbol", + res_gene_type = "SYMBOL", + txdb_geneid_type = "ENTREZID", + left_extension = 1000, + right_extension = 1000, + ... +) } \arguments{ \item{object, fds}{An \code{\link{FraserDataSet}} object.} @@ -228,6 +267,97 @@ annotation of the heatmap.} \item{plotMeanPsi, plotCov}{If \code{TRUE}, then the heatmap is annotated with the mean psi values or the junction coverage.} + +\item{control_samples}{The sampleIDs of the samples used as control in +\code{plotBamCoverage}.} + +\item{txdb}{A TxDb object giving the gene/transcript annotation to use.} + +\item{min_junction_count}{The minimal junction count across samples required +for a junction to appear in the splicegraph and coverage tracks +of \code{plotBamCoverage}.} + +\item{highlight_range}{A \code{GenomicRanges} or \code{GenomicRangesList} +object of ranges to be highlighted in the splicegraph of +\code{plotBamCoverage}.} + +\item{highlight_range_color}{The color of highlighted ranges in +the splicegraph of \code{plotBamCoverage}.} + +\item{color_annotated}{The color for exons and junctions present in +the given annotation (in the splicegraph of +\code{plotBamCoverage}).} + +\item{color_novel}{The color for novel exons and junctions not present in +the given annotation (in the splicegraph of +\code{plotBamCoverage}).} + +\item{color_sample_interest}{The color in \code{plotBamCoverage} for the +sample of interest.} + +\item{color_control_samples}{The color in \code{plotBamCoverage} for the +samples used as controls.} + +\item{toscale}{In \code{plotBamCoverage}, indicates which part of the +plotted region should be drawn to scale. Possible values are +'exon' (exonic regions are drawn to scale), +'gene' (both exonic and intronic regions are drawn to scale) or +'none' (exonic and intronic regions have constant length) +(see SGSeq package).} + +\item{mar}{The margin of the plot area for \code{plotBamCoverage} +(b,l,t,r).} + +\item{curvature_splicegraph}{The curvature of the junction arcs in the +splicegraph in \code{plotBamCoverage}. Decrease this value +for flatter arcs and increase it for steeper arcs.} + +\item{curvature_coverage}{The curvature of the junction arcs in the +coverage tracks of \code{plotBamCoverage}. Decrease this +value for flatter arcs and increase it for steeper arcs.} + +\item{cex}{For controlling the size of text and numbers in +\code{plotBamCoverage}.} + +\item{splicegraph_labels}{Indicated the format of exon/splice junction +labels in the splicegraph of \code{plotBamCoverage}. +Possible values are 'genomic_range' (gives the start position +of the first exon and the end position of the last exon that +are shown), 'id' (format E1,... J1,...), 'name' (format +type:chromosome:start-end:strand for each feature), +'none' for no labels (see SGSeq package).} + +\item{splicegraph_position}{The position of the splicegraph relative to the +coverage tracks in \code{plotBamCoverage}. Possible values +are 'top' (default) and 'bottom'.} + +\item{show_full_gene}{Should the full genomic range of the gene be shown in +\code{plotBamCoverageFromResultTable} (default: FALSE)? +If FALSE, only a certain region (see parameters left_extension +and right_extension) around the outlier junction is shown.} + +\item{orgDb}{A OrgDb object giving the mapping of gene ids and symbols.} + +\item{res_gene_col}{The column name in the given results table that +contains the gene annotation.} + +\item{res_gene_type}{The type of gene annotation in \code{res_gene_col} +(e.g. SYMBOL or ENTREZID etc.). This information is needed for +mapping between the results table and the provided annotation +in the txdb object.} + +\item{txdb_geneid_type}{The type of gene_id present in \code{genes(txdb)} +(e.g. ENTREZID). This information is needed for +mapping between the results table and the provided annotation +in the txdb object.} + +\item{left_extension}{Indicating how far the plotted range around the outlier +junction should be extended to the left in +\code{plotBamCoverageFromResultTable}.} + +\item{right_extension}{Indicating how far the plotted range around the +outlier junction should be extended to the right in +\code{plotBamCoverageFromResultTable}.} } \value{ If base R graphics are used nothing is returned else the plotly or @@ -269,6 +399,8 @@ This is the list of all plotting function provided by FRASER: \item plotFilterExpression() \item plotFilterVariability() \item plotEncDimSearch() + \item plotBamCoverage() + \item plotBamCoverageFromResultTable() } For a detailed description of each plot function please see the details. @@ -345,5 +477,28 @@ plotExpression(fds, result=res[1]) plotQQ(fds, result=res[1]) plotExpectedVsObservedPsi(fds, type="psi5", res=res[1]) +# plot splice graph and coverage from bam files in a given region +fds <- createTestFraserSettings() +gr <- GRanges(seqnames="chr19", + IRanges(start=7587496, end=7598895), + strand="+") +plotBamCoverage(fds, gr=gr, sampleID="sample3", + control_samples="sample2", min_junction_count=5, + curvature_splicegraph=1, curvature_coverage=1, + mar=c(1, 7, 0.1, 3)) + +# plot coverage from bam file for a row in the result table +\donttest{ + fds <- createTestFraserDataSet() + require(TxDb.Hsapiens.UCSC.hg19.knownGene) + txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene + require(org.Hs.eg.db) + orgDb <- org.Hs.eg.db + plotBamCoverageFromResultTable(fds, result=res[1,], show_full_gene=TRUE, + txdb=txdb, orgDb=orgDb, control_samples="sample3") + plotBamCoverageFromResultTable(fds, result=res[1,], show_full_gene=FALSE, + control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb, + curvature_coverage=0.5, right_extension=5000, left_extension=5000) +} } From 3959c71f72b85be510ed83c7a5d26053227ca077 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Tue, 12 Jul 2022 19:54:35 +0200 Subject: [PATCH 24/80] fix examples --- R/AllGenerics.R | 4 ++-- R/plotMethods.R | 40 ++++++++++++++++++++------------ man/plotFunctions.Rd | 55 +++----------------------------------------- man/results.Rd | 4 ++-- 4 files changed, 32 insertions(+), 71 deletions(-) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index cfb73078..a285f411 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -770,9 +770,9 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, #' # get data, fit and compute p-values and z-scores #' fds <- createTestFraserDataSet() #' -#' # extract results: for this example dataset, z score cutoff of 2 is used to +#' # extract results: for this example dataset, no cutoffs are used to #' # get at least one result and show the output -#' res <- results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05) +#' res <- results(fds, padjCutoff=NA, zScoreCutoff=NA, deltaPsiCutoff=NA) #' res #' #' # aggregate the results by genes (gene symbols need to be annotated first diff --git a/R/plotMethods.R b/R/plotMethods.R index 4c0ef708..f363ee55 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -59,6 +59,8 @@ #' @param onlyExpressedIntrons Logical value indicating whether to show only #' introns that also pass the expression filter. Defaults to #' FALSE. +#' @param gr A GRanges object indicating the genomic range that should be shown +#' in \code{plotBamCoverage}. #' @param control_samples The sampleIDs of the samples used as control in #' \code{plotBamCoverage}. #' @param min_junction_count The minimal junction count across samples required @@ -254,17 +256,24 @@ #' mar=c(1, 7, 0.1, 3)) #' #' # plot coverage from bam file for a row in the result table -#' \donttest{ -#' fds <- createTestFraserDataSet() -#' require(TxDb.Hsapiens.UCSC.hg19.knownGene) -#' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene -#' require(org.Hs.eg.db) -#' orgDb <- org.Hs.eg.db -#' plotBamCoverageFromResultTable(fds, result=res[1,], show_full_gene=TRUE, -#' txdb=txdb, orgDb=orgDb, control_samples="sample3") -#' plotBamCoverageFromResultTable(fds, result=res[1,], show_full_gene=FALSE, -#' control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb, -#' curvature_coverage=0.5, right_extension=5000, left_extension=5000) +#' fds <- createTestFraserDataSet() +#' require(TxDb.Hsapiens.UCSC.hg19.knownGene) +#' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene +#' require(org.Hs.eg.db) +#' orgDb <- org.Hs.eg.db +#' +#' res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA, zScoreCutoff=NA) +#' res_dt <- as.data.table(res) +#' res_dt <- res_dt[sampleID == "sample2",] +#' +#' # plot full range of gene containing outlier junction +#' plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=TRUE, +#' txdb=txdb, orgDb=orgDb, control_samples="sample3") +#' +#' # plot only certain range around outlier junction +#' plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE, +#' control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb, +#' curvature_coverage=0.5, right_extension=5000, left_extension=5000) #' } #' NULL @@ -1335,10 +1344,11 @@ plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE, stop("Missing input: orgDb (for mapping of IDs to txdb)") } txdb_geneid <- select(orgDb, - keys=as.character(result[,res_gene_col, with=FALSE]), - columns=txdb_geneid_type, - keytype=res_gene_type)[1] - gr <- genes(txdb, filter=list("gene_id"=txdb_geneid)) + keys=as.character(result[,res_gene_col, with=FALSE]), + columns=txdb_geneid_type, + keytype=res_gene_type)[1,] + gr <- genes(txdb, + filter=list("gene_id"=txdb_geneid[,txdb_geneid_type])) } else{ # or just showing a certain region around the outlier junction gr <- outlier_range diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd index 2fc50b2b..7248f24f 100644 --- a/man/plotFunctions.Rd +++ b/man/plotFunctions.Rd @@ -268,6 +268,9 @@ annotation of the heatmap.} \item{plotMeanPsi, plotCov}{If \code{TRUE}, then the heatmap is annotated with the mean psi values or the junction coverage.} +\item{gr}{A GRanges object indicating the genomic range that should be shown +in \code{plotBamCoverage}.} + \item{control_samples}{The sampleIDs of the samples used as control in \code{plotBamCoverage}.} @@ -450,55 +453,3 @@ It plots the encoding dimension against the achieved loss (area under the precision-recall curve). From this plot the optimum should be choosen for the \code{q} in fitting process. } -\examples{ -# create full FRASER object -fds <- makeSimulatedFraserDataSet(m=40, j=200) -fds <- calculatePSIValues(fds) -fds <- filterExpressionAndVariability(fds, filter=FALSE) -# this step should be done for all splicing metrics and more dimensions -fds <- optimHyperParams(fds, "psi5", q_param=c(2,5,10,25)) -fds <- FRASER(fds) - -# QC plotting -plotFilterExpression(fds) -plotFilterVariability(fds) -plotCountCorHeatmap(fds, "theta") -plotCountCorHeatmap(fds, "theta", normalized=TRUE) -plotEncDimSearch(fds, type="psi5") - -# extract results -plotAberrantPerSample(fds, aggregate=FALSE) -plotVolcano(fds, "sample1", "psi5") - -# dive into gene/sample level results -res <- results(fds) -res -plotExpression(fds, result=res[1]) -plotQQ(fds, result=res[1]) -plotExpectedVsObservedPsi(fds, type="psi5", res=res[1]) - -# plot splice graph and coverage from bam files in a given region -fds <- createTestFraserSettings() -gr <- GRanges(seqnames="chr19", - IRanges(start=7587496, end=7598895), - strand="+") -plotBamCoverage(fds, gr=gr, sampleID="sample3", - control_samples="sample2", min_junction_count=5, - curvature_splicegraph=1, curvature_coverage=1, - mar=c(1, 7, 0.1, 3)) - -# plot coverage from bam file for a row in the result table -\donttest{ - fds <- createTestFraserDataSet() - require(TxDb.Hsapiens.UCSC.hg19.knownGene) - txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene - require(org.Hs.eg.db) - orgDb <- org.Hs.eg.db - plotBamCoverageFromResultTable(fds, result=res[1,], show_full_gene=TRUE, - txdb=txdb, orgDb=orgDb, control_samples="sample3") - plotBamCoverageFromResultTable(fds, result=res[1,], show_full_gene=FALSE, - control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb, - curvature_coverage=0.5, right_extension=5000, left_extension=5000) -} - -} diff --git a/man/results.Rd b/man/results.Rd index 57682616..7061f285 100644 --- a/man/results.Rd +++ b/man/results.Rd @@ -95,9 +95,9 @@ aberrant splicing events based on the given cutoffs. # get data, fit and compute p-values and z-scores fds <- createTestFraserDataSet() -# extract results: for this example dataset, z score cutoff of 2 is used to +# extract results: for this example dataset, no cutoffs are used to # get at least one result and show the output -res <- results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05) +res <- results(fds, padjCutoff=NA, zScoreCutoff=NA, deltaPsiCutoff=NA) res # aggregate the results by genes (gene symbols need to be annotated first From 7c208125a27d23882636b0c0ff72200d2c668f7d Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Wed, 13 Jul 2022 13:14:48 +0200 Subject: [PATCH 25/80] minor improvements --- R/plotMethods.R | 37 ++++++++++++++++-------- man/plotFunctions.Rd | 68 ++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 88 insertions(+), 17 deletions(-) diff --git a/R/plotMethods.R b/R/plotMethods.R index f363ee55..316c43dc 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -80,10 +80,10 @@ #' \code{plotBamCoverageFromResultTable}. #' @param res_gene_col The column name in the given results table that #' contains the gene annotation. -#' @param res_gene_type The type of gene annotation in \code{res_gene_col} -#' (e.g. SYMBOL or ENTREZID etc.). This information is needed for -#' mapping between the results table and the provided annotation -#' in the txdb object. +#' @param res_geneid_type The type of gene annotation in the results table in +#' \code{res_gene_col} (e.g. SYMBOL or ENTREZID etc.). This +#' information is needed for mapping between the results table and +#' the provided annotation in the txdb object. #' @param txdb_geneid_type The type of gene_id present in \code{genes(txdb)} #' (e.g. ENTREZID). This information is needed for #' mapping between the results table and the provided annotation @@ -274,7 +274,6 @@ #' plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE, #' control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb, #' curvature_coverage=0.5, right_extension=5000, left_extension=5000) -#' } #' NULL @@ -1320,7 +1319,7 @@ plotBamCoverage <- function(fds, gr, sampleID, #' @export plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE, txdb=NULL, orgDb=NULL, res_gene_col="hgncSymbol", - res_gene_type="SYMBOL", txdb_geneid_type="ENTREZID", + res_geneid_type="SYMBOL", txdb_geneid_type="ENTREZID", left_extension=1000, right_extension=1000, ...){ stopifnot(is(fds, "FraserDataSet")) @@ -1343,12 +1342,26 @@ plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE, if(missing(orgDb)){ stop("Missing input: orgDb (for mapping of IDs to txdb)") } - txdb_geneid <- select(orgDb, - keys=as.character(result[,res_gene_col, with=FALSE]), - columns=txdb_geneid_type, - keytype=res_gene_type)[1,] - gr <- genes(txdb, - filter=list("gene_id"=txdb_geneid[,txdb_geneid_type])) + result_gene <- result[,get(res_gene_col)] + if(is.data.table(orgDb)){ + tmp <- merge(x=as.data.table(genes(txdb))[,.(gene_id)], y=orgDb, + by.y=txdb_geneid_type, by.x="gene_id", all.x=TRUE, + sort=FALSE)[,.(gene_id, feature=get(res_geneid_type))] + setnames(tmp, "feature", res_geneid_type) + txdb_geneid <- tmp[get(res_geneid_type) %in% result_gene, gene_id] + } else { + tmp <- as.data.table( + select(orgDb, + keys=result_gene, + columns=txdb_geneid_type, + keytype=res_geneid_type) + ) + txdb_geneid <- tmp[, get(txdb_geneid_type)] + } + gr <- genes(txdb, filter=list("gene_id"=txdb_geneid)) + if(length(gr) == 0){ + stop("Could not extract genomic coordinates for input gene.") + } } else{ # or just showing a certain region around the outlier junction gr <- outlier_range diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd index 7248f24f..14958dae 100644 --- a/man/plotFunctions.Rd +++ b/man/plotFunctions.Rd @@ -159,7 +159,7 @@ plotBamCoverageFromResultTable( txdb = NULL, orgDb = NULL, res_gene_col = "hgncSymbol", - res_gene_type = "SYMBOL", + res_geneid_type = "SYMBOL", txdb_geneid_type = "ENTREZID", left_extension = 1000, right_extension = 1000, @@ -344,10 +344,10 @@ and right_extension) around the outlier junction is shown.} \item{res_gene_col}{The column name in the given results table that contains the gene annotation.} -\item{res_gene_type}{The type of gene annotation in \code{res_gene_col} -(e.g. SYMBOL or ENTREZID etc.). This information is needed for -mapping between the results table and the provided annotation -in the txdb object.} +\item{res_geneid_type}{The type of gene annotation in the results table in +\code{res_gene_col} (e.g. SYMBOL or ENTREZID etc.). This +information is needed for mapping between the results table and +the provided annotation in the txdb object.} \item{txdb_geneid_type}{The type of gene_id present in \code{genes(txdb)} (e.g. ENTREZID). This information is needed for @@ -453,3 +453,61 @@ It plots the encoding dimension against the achieved loss (area under the precision-recall curve). From this plot the optimum should be choosen for the \code{q} in fitting process. } +\examples{ +# create full FRASER object +fds <- makeSimulatedFraserDataSet(m=40, j=200) +fds <- calculatePSIValues(fds) +fds <- filterExpressionAndVariability(fds, filter=FALSE) +# this step should be done for all splicing metrics and more dimensions +fds <- optimHyperParams(fds, "psi5", q_param=c(2,5,10,25)) +fds <- FRASER(fds) + +# QC plotting +plotFilterExpression(fds) +plotFilterVariability(fds) +plotCountCorHeatmap(fds, "theta") +plotCountCorHeatmap(fds, "theta", normalized=TRUE) +plotEncDimSearch(fds, type="psi5") + +# extract results +plotAberrantPerSample(fds, aggregate=FALSE) +plotVolcano(fds, "sample1", "psi5") + +# dive into gene/sample level results +res <- results(fds) +res +plotExpression(fds, result=res[1]) +plotQQ(fds, result=res[1]) +plotExpectedVsObservedPsi(fds, type="psi5", res=res[1]) + +# plot splice graph and coverage from bam files in a given region +fds <- createTestFraserSettings() +gr <- GRanges(seqnames="chr19", + IRanges(start=7587496, end=7598895), + strand="+") +plotBamCoverage(fds, gr=gr, sampleID="sample3", + control_samples="sample2", min_junction_count=5, + curvature_splicegraph=1, curvature_coverage=1, + mar=c(1, 7, 0.1, 3)) + +# plot coverage from bam file for a row in the result table +fds <- createTestFraserDataSet() +require(TxDb.Hsapiens.UCSC.hg19.knownGene) +txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene +require(org.Hs.eg.db) +orgDb <- org.Hs.eg.db + +res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA, zScoreCutoff=NA) +res_dt <- as.data.table(res) +res_dt <- res_dt[sampleID == "sample2",] + +# plot full range of gene containing outlier junction +plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=TRUE, + txdb=txdb, orgDb=orgDb, control_samples="sample3") + +# plot only certain range around outlier junction +plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE, + control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb, + curvature_coverage=0.5, right_extension=5000, left_extension=5000) + +} From 8910fc13e62791f9f57156b1e1f687a1e34a7f03 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Wed, 13 Jul 2022 14:14:47 +0200 Subject: [PATCH 26/80] minor fix --- R/plotMethods.R | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/R/plotMethods.R b/R/plotMethods.R index ef0551cb..b920b863 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -1192,6 +1192,7 @@ plotBamCoverage <- function(fds, gr, sampleID, stop("Missing input gr (genomic range to plot).") } else{ stopifnot(is(gr, "GenomicRanges")) + stopifnot(length(gr) > 0) } if(missing(sampleID)){ stop("Missing input: sample_of_interest") @@ -1207,12 +1208,25 @@ plotBamCoverage <- function(fds, gr, sampleID, fds <- si_out[[2]] # collapse input ranges if several - gr <- range(gr) - gr <- keepSeqlevels(gr, as.character(seqnames(gr))) - if(all(strand(gr) == "*")){ - # seems to throw an error with * strand so guessing + strand instead - strand(gr) <- "+" + if(any(strand(gr) == "*")){ + # seems to throw an error with * strand so guessing strand instead + if(all(strand(gr) == "*")){ + guessStrand <- "+" + } else{ + guessStrand <- strand(gr[strand(gr) != "*"])[1] + } + strand(gr)[strand(gr) == "*"] <- guessStrand + warning("Input genomic ranges contained unstranded ranges.\n", + "This function needs strand information, guessing strand to ", + "be ", guessStrand, ".") } + if(!all(strand(gr) == strand(gr[1,]))){ + warning("Input genomic ranges contained ranges on different strands,\n", + "only showing coverage for the ", strand(gr[1,]), " strand.") + strand(gr) <- rep(strand(gr[1,]), length(gr)) + } + gr <- range(gr) + gr <- keepSeqlevels(gr, unique(as.character(seqnames(gr)))) # convert highlight_range to GRangesList if not if(!is.null(highlight_range) && !is(highlight_range, "GRangesList")){ @@ -1227,7 +1241,7 @@ plotBamCoverage <- function(fds, gr, sampleID, # overlap detected junctions with annotation if(!is.null(txdb)){ # subset to chr of interest - seqlevels(txdb) <- as.character(seqnames(gr)) + seqlevels(txdb) <- unique(as.character(seqnames(gr))) # extract transcript features with SGSeq package txf <- convertToTxFeatures(txdb) @@ -1350,6 +1364,7 @@ plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE, stop("Missing input: orgDb (for mapping of IDs to txdb)") } result_gene <- result[,get(res_gene_col)] + result_gene <- strsplit(result_gene, ";", fixed=TRUE)[[1]] if(is.data.table(orgDb)){ tmp <- merge(x=as.data.table(genes(txdb))[,.(gene_id)], y=orgDb, by.y=txdb_geneid_type, by.x="gene_id", all.x=TRUE, @@ -1376,6 +1391,9 @@ plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE, end(gr) <- end(gr) + right_extension } + # if several genes overlap, only show those on same strand as outlier + gr <- gr[strand(gr) == strand(outlier_range),] + # create the coverage plot for the given outlier fds <- plotBamCoverage(fds, gr=gr, From 22f579b92c16784027a87113f60f7fad0fe0b2dc Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Wed, 13 Jul 2022 14:25:03 +0200 Subject: [PATCH 27/80] deal with unstranded input ranges --- R/plotMethods.R | 45 ++++++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/R/plotMethods.R b/R/plotMethods.R index 316c43dc..c74a8c5c 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -1175,7 +1175,6 @@ plotBamCoverage <- function(fds, gr, sampleID, splicegraph_labels=c("genomic_range", "id", "name", "none"), splicegraph_position=c("top", "bottom")){ - require(SGSeq) if(missing(fds)){ stop("Missing input: fds (FraserDataSet object)") } else{ @@ -1200,12 +1199,25 @@ plotBamCoverage <- function(fds, gr, sampleID, fds <- si_out[[2]] # collapse input ranges if several + if(any(strand(gr) == "*")){ + # seems to throw an error with * strand so guessing strand instead + if(all(strand(gr) == "*")){ + guessStrand <- "+" + } else{ + guessStrand <- strand(gr[strand(gr) != "*"])[1] + } + strand(gr)[strand(gr) == "*"] <- guessStrand + warning("Input genomic ranges contained unstranded ranges.\n", + "This function needs strand information, guessing strand to ", + "be ", guessStrand, ".") + } + if(!all(strand(gr) == strand(gr[1,]))){ + warning("Input genomic ranges contained ranges on different strands,\n", + "only showing coverage for the ", strand(gr[1,]), " strand.") + strand(gr) <- rep(strand(gr[1,]), length(gr)) + } gr <- range(gr) gr <- keepSeqlevels(gr, as.character(seqnames(gr))) - if(all(strand(gr) == "*")){ - # seems to throw an error with * strand so guessing + strand instead - strand(gr) <- "+" - } # convert highlight_range to GRangesList if not if(!is.null(highlight_range) && !is(highlight_range, "GRangesList")){ @@ -1214,7 +1226,7 @@ plotBamCoverage <- function(fds, gr, sampleID, } # extract splice graph - sgfc_pred <- analyzeFeatures(sgseq_si, which = gr, + sgfc_pred <- SGSeq::analyzeFeatures(sgseq_si, which = gr, min_junction_count=min_junction_count, psi=0) # overlap detected junctions with annotation @@ -1223,7 +1235,7 @@ plotBamCoverage <- function(fds, gr, sampleID, seqlevels(txdb) <- as.character(seqnames(gr)) # extract transcript features with SGSeq package - txf <- convertToTxFeatures(txdb) + txf <- SGSeq::convertToTxFeatures(txdb) txf <- txf[txf %over% gr] # restore seqlevels of txdb object @@ -1242,7 +1254,8 @@ plotBamCoverage <- function(fds, gr, sampleID, splicegraph_labels <- "label" # create custom labels (only for first and last exon for readability) mcols(sgfc_pred)$label <- "" - exons <- which(type(sgfc_pred) == "E" & rowRanges(sgfc_pred) %over% gr) + exons <- which(SGSeq::type(sgfc_pred) == "E" & + rowRanges(sgfc_pred) %over% gr) exons <- unique(c(exons[1], tail(exons, n=1))) if(length(exons) == 1){ mcols(sgfc_pred)$label[exons] <- @@ -1266,7 +1279,7 @@ plotBamCoverage <- function(fds, gr, sampleID, nr_sa2p <- length(all_sids) par(mfrow = c(nr_sa2p+1, 1), mar=mar, cex=cex) if(splicegraph_position == "top"){ - plotSpliceGraph(rowRanges(sgfc_pred), + SGSeq::plotSpliceGraph(rowRanges(sgfc_pred), which=gr, toscale=toscale, color=color_annotated, @@ -1279,7 +1292,7 @@ plotBamCoverage <- function(fds, gr, sampleID, label=splicegraph_labels) } for (j in seq_along(sampleID)) { - plotCoverage( + SGSeq::plotCoverage( sgfc_pred[, which(colnames(sgfc_pred) == sampleID[j])], which = gr, toscale = toscale, @@ -1288,7 +1301,7 @@ plotBamCoverage <- function(fds, gr, sampleID, curvature=curvature_coverage) } for (j in seq_along(control_samples)) { - plotCoverage( + SGSeq::plotCoverage( sgfc_pred[, which(colnames(sgfc_pred) == control_samples[j])], which = gr, toscale = toscale, @@ -1297,7 +1310,7 @@ plotBamCoverage <- function(fds, gr, sampleID, curvature=curvature_coverage) } if(splicegraph_position == "bottom"){ - plotSpliceGraph(rowRanges(sgfc_pred), + SGSeq::plotSpliceGraph(rowRanges(sgfc_pred), which=gr, toscale=toscale, color_novel=color_novel, @@ -1369,6 +1382,9 @@ plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE, end(gr) <- end(gr) + right_extension } + # if several genes overlap, only show those on same strand as outlier + gr <- gr[strand(gr) == strand(outlier_range),] + # create the coverage plot for the given outlier fds <- plotBamCoverage(fds, gr=gr, @@ -1442,7 +1458,6 @@ ggplotLabelPsi <- function(type, asCharacter=FALSE){ #' @noRd getSGSeqSI <- function(fds, sample_ids){ - require(SGSeq) # check if bam info is already stored in fds for given samples if("SGSeq_sampleinfo" %in% names(metadata(fds))){ si <- metadata(fds)[["SGSeq_sampleinfo"]] @@ -1455,7 +1470,7 @@ getSGSeqSI <- function(fds, sample_ids){ df_missing <- data.frame( sample_name=samples(fds)[samples(fds) %in% missing_ids], file_bam=bamFile(fds)[samples(fds) %in% missing_ids]) - si_new <- getBamInfo(df_missing, yieldSize=1e6) + si_new <- SGSeq::getBamInfo(df_missing, yieldSize=1e6) si_new$lib_size <- 50e6 # dummy value to speed up this part si <- rbind(si, si_new) metadata(fds)[["SGSeq_sampleinfo"]] <- @@ -1468,7 +1483,7 @@ getSGSeqSI <- function(fds, sample_ids){ df <- data.frame( sample_name=samples(fds)[samples(fds) %in% sample_ids], file_bam=bamFile(fds)[samples(fds) %in% sample_ids]) - si <- getBamInfo(df, yieldSize=1e6) + si <- gSGSeq::getBamInfo(df, yieldSize=1e6) si$lib_size <- 50e6 # dummy value to speed up this part metadata(fds)[["SGSeq_sampleinfo"]] <- si return(list(si, fds)) From 51bfdfb5521b0d7125d84c9fde89b6f72a57392d Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Wed, 13 Jul 2022 15:14:43 +0200 Subject: [PATCH 28/80] fix typo --- R/plotMethods.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/plotMethods.R b/R/plotMethods.R index c74a8c5c..4deeb120 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -1483,7 +1483,7 @@ getSGSeqSI <- function(fds, sample_ids){ df <- data.frame( sample_name=samples(fds)[samples(fds) %in% sample_ids], file_bam=bamFile(fds)[samples(fds) %in% sample_ids]) - si <- gSGSeq::getBamInfo(df, yieldSize=1e6) + si <- SGSeq::getBamInfo(df, yieldSize=1e6) si$lib_size <- 50e6 # dummy value to speed up this part metadata(fds)[["SGSeq_sampleinfo"]] <- si return(list(si, fds)) From f6bdef66fa027bb1fb22f2c377b574481c29efda Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Wed, 13 Jul 2022 16:28:30 +0200 Subject: [PATCH 29/80] slightly improved intron_retention detection --- R/resultAnnotations.R | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/R/resultAnnotations.R b/R/resultAnnotations.R index 5cafc8ea..2dbd9a5b 100644 --- a/R/resultAnnotations.R +++ b/R/resultAnnotations.R @@ -337,12 +337,13 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ if(all(c("nonsplitProportion", "nonsplitProportion_99quantile") %in% colnames(junctions_dt))){ junctions_dt[spliceType == "annotatedIntron_reducedUsage" & - type == "jaccard" & - nonsplitProportion >= nonsplitProportion_99quantile, - spliceType := "intron_retention"] + type == "jaccard" & + nonsplitProportion >= nonsplitProportion_99quantile + 0.05 & + nonsplitCounts >= 10, + spliceType := "(partial)intronRetention"] # TODO check frameshift for intron retention - junctions_dt[spliceType == "intron_retention", + junctions_dt[spliceType == "(partial)intronRetention", causesFrameshift := "inconclusive"] } From 1aa4052d64a0ae0b1c5908181e1512ab4c20891a Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Wed, 13 Jul 2022 16:40:02 +0200 Subject: [PATCH 30/80] require sgseq package to be available for running examples --- R/plotMethods.R | 60 +++++++++++++++++++++++--------------------- man/plotFunctions.Rd | 54 ++++++++++++++++++++------------------- 2 files changed, 60 insertions(+), 54 deletions(-) diff --git a/R/plotMethods.R b/R/plotMethods.R index 4deeb120..eef9ce4e 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -246,34 +246,36 @@ #' plotExpectedVsObservedPsi(fds, type="psi5", res=res[1]) #' #' # plot splice graph and coverage from bam files in a given region -#' fds <- createTestFraserSettings() -#' gr <- GRanges(seqnames="chr19", -#' IRanges(start=7587496, end=7598895), -#' strand="+") -#' plotBamCoverage(fds, gr=gr, sampleID="sample3", -#' control_samples="sample2", min_junction_count=5, -#' curvature_splicegraph=1, curvature_coverage=1, -#' mar=c(1, 7, 0.1, 3)) +#' if(require(SGSeq)){ +#' fds <- createTestFraserSettings() +#' gr <- GRanges(seqnames="chr19", +#' IRanges(start=7587496, end=7598895), +#' strand="+") +#' plotBamCoverage(fds, gr=gr, sampleID="sample3", +#' control_samples="sample2", min_junction_count=5, +#' curvature_splicegraph=1, curvature_coverage=1, +#' mar=c(1, 7, 0.1, 3)) #' -#' # plot coverage from bam file for a row in the result table -#' fds <- createTestFraserDataSet() -#' require(TxDb.Hsapiens.UCSC.hg19.knownGene) -#' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene -#' require(org.Hs.eg.db) -#' orgDb <- org.Hs.eg.db +#' # plot coverage from bam file for a row in the result table +#' fds <- createTestFraserDataSet() +#' require(TxDb.Hsapiens.UCSC.hg19.knownGene) +#' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene +#' require(org.Hs.eg.db) +#' orgDb <- org.Hs.eg.db +#' +#' res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA, zScoreCutoff=NA) +#' res_dt <- as.data.table(res) +#' res_dt <- res_dt[sampleID == "sample2",] #' -#' res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA, zScoreCutoff=NA) -#' res_dt <- as.data.table(res) -#' res_dt <- res_dt[sampleID == "sample2",] +#' # plot full range of gene containing outlier junction +#' plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=TRUE, +#' txdb=txdb, orgDb=orgDb, control_samples="sample3") #' -#' # plot full range of gene containing outlier junction -#' plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=TRUE, -#' txdb=txdb, orgDb=orgDb, control_samples="sample3") -#' -#' # plot only certain range around outlier junction -#' plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE, -#' control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb, -#' curvature_coverage=0.5, right_extension=5000, left_extension=5000) +#' # plot only certain range around outlier junction +#' plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE, +#' control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb, +#' curvature_coverage=0.5, right_extension=5000, left_extension=5000) +#' } #' NULL @@ -1217,7 +1219,7 @@ plotBamCoverage <- function(fds, gr, sampleID, strand(gr) <- rep(strand(gr[1,]), length(gr)) } gr <- range(gr) - gr <- keepSeqlevels(gr, as.character(seqnames(gr))) + gr <- keepSeqlevels(gr, unique(as.character(seqnames(gr)))) # convert highlight_range to GRangesList if not if(!is.null(highlight_range) && !is(highlight_range, "GRangesList")){ @@ -1232,7 +1234,7 @@ plotBamCoverage <- function(fds, gr, sampleID, # overlap detected junctions with annotation if(!is.null(txdb)){ # subset to chr of interest - seqlevels(txdb) <- as.character(seqnames(gr)) + seqlevels(txdb) <- unique(as.character(seqnames(gr))) # extract transcript features with SGSeq package txf <- SGSeq::convertToTxFeatures(txdb) @@ -1383,7 +1385,9 @@ plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE, } # if several genes overlap, only show those on same strand as outlier - gr <- gr[strand(gr) == strand(outlier_range),] + if(as.character(strand(outlier_range)) != "*"){ + gr <- gr[strand(gr) == strand(outlier_range),] + } # create the coverage plot for the given outlier fds <- plotBamCoverage(fds, diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd index 14958dae..3ee980cf 100644 --- a/man/plotFunctions.Rd +++ b/man/plotFunctions.Rd @@ -481,33 +481,35 @@ plotQQ(fds, result=res[1]) plotExpectedVsObservedPsi(fds, type="psi5", res=res[1]) # plot splice graph and coverage from bam files in a given region -fds <- createTestFraserSettings() -gr <- GRanges(seqnames="chr19", - IRanges(start=7587496, end=7598895), - strand="+") -plotBamCoverage(fds, gr=gr, sampleID="sample3", - control_samples="sample2", min_junction_count=5, - curvature_splicegraph=1, curvature_coverage=1, - mar=c(1, 7, 0.1, 3)) - -# plot coverage from bam file for a row in the result table -fds <- createTestFraserDataSet() -require(TxDb.Hsapiens.UCSC.hg19.knownGene) -txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene -require(org.Hs.eg.db) -orgDb <- org.Hs.eg.db +if(require(SGSeq)){ + fds <- createTestFraserSettings() + gr <- GRanges(seqnames="chr19", + IRanges(start=7587496, end=7598895), + strand="+") + plotBamCoverage(fds, gr=gr, sampleID="sample3", + control_samples="sample2", min_junction_count=5, + curvature_splicegraph=1, curvature_coverage=1, + mar=c(1, 7, 0.1, 3)) + + # plot coverage from bam file for a row in the result table + fds <- createTestFraserDataSet() + require(TxDb.Hsapiens.UCSC.hg19.knownGene) + txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene + require(org.Hs.eg.db) + orgDb <- org.Hs.eg.db + + res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA, zScoreCutoff=NA) + res_dt <- as.data.table(res) + res_dt <- res_dt[sampleID == "sample2",] -res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA, zScoreCutoff=NA) -res_dt <- as.data.table(res) -res_dt <- res_dt[sampleID == "sample2",] + # plot full range of gene containing outlier junction + plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=TRUE, + txdb=txdb, orgDb=orgDb, control_samples="sample3") -# plot full range of gene containing outlier junction -plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=TRUE, - txdb=txdb, orgDb=orgDb, control_samples="sample3") - -# plot only certain range around outlier junction -plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE, - control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb, - curvature_coverage=0.5, right_extension=5000, left_extension=5000) + # plot only certain range around outlier junction + plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE, + control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb, + curvature_coverage=0.5, right_extension=5000, left_extension=5000) +} } From 8a83664ac5f3503221c82a31a1392fdd6526c3ad Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Wed, 13 Jul 2022 19:13:56 +0200 Subject: [PATCH 31/80] fix example --- R/plotMethods.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/plotMethods.R b/R/plotMethods.R index eef9ce4e..ad241e25 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -274,7 +274,8 @@ #' # plot only certain range around outlier junction #' plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE, #' control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb, -#' curvature_coverage=0.5, right_extension=5000, left_extension=5000) +#' curvature_coverage=0.5, right_extension=5000, left_extension=5000, +#' splicegraph_labels="id") #' } #' NULL From 78efd11c36f3ba71625d785a003dd7c25c7b2e07 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Thu, 14 Jul 2022 14:06:21 +0200 Subject: [PATCH 32/80] add updated man/plotFunctions.Rd --- man/plotFunctions.Rd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd index 3ee980cf..f4cc5c47 100644 --- a/man/plotFunctions.Rd +++ b/man/plotFunctions.Rd @@ -509,7 +509,8 @@ if(require(SGSeq)){ # plot only certain range around outlier junction plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE, control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb, - curvature_coverage=0.5, right_extension=5000, left_extension=5000) + curvature_coverage=0.5, right_extension=5000, left_extension=5000, + splicegraph_labels="id") } } From d775fe4740bf015a309ce997e6914d24085b4189 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Tue, 19 Jul 2022 15:42:51 +0200 Subject: [PATCH 33/80] allow for small variants in nonsplit anchor region --- R/countRNAseqData.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/countRNAseqData.R b/R/countRNAseqData.R index b5b442df..ee9f908b 100644 --- a/R/countRNAseqData.R +++ b/R/countRNAseqData.R @@ -895,7 +895,8 @@ countNonSplicedReads <- function(sampleID, splitCountRanges, fds, # extract the counts with Rsubread tmp_ssc <- checkSeqLevelStyle(spliceSiteCoords, fds, sampleID, TRUE) - anno <- GRanges2SAF(tmp_ssc, minAnchor=minAnchor) + # use minAnchor+1 here to allow for small variants in the anchor region + anno <- GRanges2SAF(tmp_ssc, minAnchor=(minAnchor+1)) rsubreadCounts <- featureCounts(files=bamFile, annot.ext=anno, minOverlap=minAnchor*2, allowMultiOverlap=TRUE, From 580234b05ec346b98bf6179f9719ebfd535c366b Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Tue, 19 Jul 2022 17:48:41 +0200 Subject: [PATCH 34/80] adapt test case to compare to correct result when allowing short mismatches in nonsplit anchor --- tests/testthat/test_counting.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test_counting.R b/tests/testthat/test_counting.R index 4fcd0d61..1bdbfc37 100644 --- a/tests/testthat/test_counting.R +++ b/tests/testthat/test_counting.R @@ -51,7 +51,7 @@ test_that("test minAnchor", { "sample3", features, fds, minAnchor=25, recount=TRUE)) }) expect_equivalent(c(7, 8, 0, 0, 7), ctnNS5[,1]) - expect_equivalent(c(5, 8, 0, 0, 6), ctnNS25[,1]) + expect_equivalent(c(5, 8, 0, 0, 7), ctnNS25[,1]) }) test_that("Test psi values", { From df77b899a73f65bf9fe43d43971d7871b3292794 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Thu, 21 Jul 2022 15:20:18 +0200 Subject: [PATCH 35/80] fix minor issue --- R/plotMethods.R | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/R/plotMethods.R b/R/plotMethods.R index ad241e25..f78a41c0 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -1176,7 +1176,7 @@ plotBamCoverage <- function(fds, gr, sampleID, toscale=c("exon", "gene", "none"), mar=c(2, 10, 0.1, 5), curvature_splicegraph=1, curvature_coverage=1, cex=1, splicegraph_labels=c("genomic_range", "id", "name", "none"), - splicegraph_position=c("top", "bottom")){ + splicegraph_position=c("top", "bottom"), ...){ if(missing(fds)){ stop("Missing input: fds (FraserDataSet object)") @@ -1230,7 +1230,8 @@ plotBamCoverage <- function(fds, gr, sampleID, # extract splice graph sgfc_pred <- SGSeq::analyzeFeatures(sgseq_si, which = gr, - min_junction_count=min_junction_count, psi=0) + min_junction_count=min_junction_count, psi=0, + ...) # overlap detected junctions with annotation if(!is.null(txdb)){ @@ -1386,7 +1387,8 @@ plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE, } # if several genes overlap, only show those on same strand as outlier - if(as.character(strand(outlier_range)) != "*"){ + if(as.character(strand(outlier_range)) != "*" & + length(gr[strand(gr) == strand(outlier_range),]) > 0){ gr <- gr[strand(gr) == strand(outlier_range),] } From 211da6094221923a2379cdaedb112ce3165aec94 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Wed, 17 Aug 2022 11:01:57 +0200 Subject: [PATCH 36/80] small fix to plot cor heatmap for jaccard --- R/plotMethods.R | 4 ++-- man/plotFunctions.Rd | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/R/plotMethods.R b/R/plotMethods.R index cb392593..5021edf0 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -971,8 +971,8 @@ plotFilterVariability <- function(fds, bins=200, legend.position=c(0.8, 0.8), plotCountCorHeatmap.FRASER <- function(object, - type=c("psi5", "psi3", "theta"), logit=FALSE, topN=50000, - topJ=5000, minMedian=1, minCount=10, + type=c("psi5", "psi3", "theta", "jaccard"), logit=FALSE, + topN=50000, topJ=5000, minMedian=1, minCount=10, main=NULL, normalized=FALSE, show_rownames=FALSE, show_colnames=FALSE, minDeltaPsi=0.1, annotation_col=NA, annotation_row=NA, border_color=NA, nClust=5, diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd index a2862aed..57b42b60 100644 --- a/man/plotFunctions.Rd +++ b/man/plotFunctions.Rd @@ -107,7 +107,7 @@ plotFilterVariability( \S4method{plotCountCorHeatmap}{FraserDataSet}( object, - type = c("psi5", "psi3", "theta"), + type = c("psi5", "psi3", "theta", "jaccard"), logit = FALSE, topN = 50000, topJ = 5000, @@ -149,7 +149,8 @@ plotBamCoverage( curvature_coverage = 1, cex = 1, splicegraph_labels = c("genomic_range", "id", "name", "none"), - splicegraph_position = c("top", "bottom") + splicegraph_position = c("top", "bottom"), + ... ) plotBamCoverageFromResultTable( From 90ae558c5a70f51a15554fa9d7f05c223a56d50a Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Wed, 23 Nov 2022 14:08:51 +0100 Subject: [PATCH 37/80] add manhattan plot function and update documentation --- DESCRIPTION | 4 +- NAMESPACE | 5 + R/AllGenerics-definitions.R | 6 ++ R/FRASER-package.R | 2 + R/plotMethods.R | 207 ++++++++++++++++++++++++++++++++++++ man/plotFunctions.Rd | 38 ++++++- 6 files changed, 256 insertions(+), 6 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 1961b62e..a38de791 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -84,7 +84,9 @@ Suggests: TxDb.Hsapiens.UCSC.hg19.knownGene, org.Hs.eg.db, rtracklayer, - SGSeq + SGSeq, + ggbio, + biovizBase LinkingTo: RcppArmadillo, Rcpp diff --git a/NAMESPACE b/NAMESPACE index 0e6eb40a..4d813d74 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -74,6 +74,7 @@ export(plotExpectedVsObservedPsi) export(plotExpression) export(plotFilterExpression) export(plotFilterVariability) +export(plotManhattan) export(plotQQ) export(plotVolcano) export(predictedMeans) @@ -112,6 +113,7 @@ exportMethods(pairedEnd) exportMethods(plotAberrantPerSample) exportMethods(plotCountCorHeatmap) exportMethods(plotEncDimSearch) +exportMethods(plotManhattan) exportMethods(plotQQ) exportMethods(plotVolcano) exportMethods(results) @@ -255,11 +257,14 @@ importFrom(VGAM,rbetabinom) importFrom(VGAM,vglm) importFrom(biomaRt,getBM) importFrom(biomaRt,useEnsembl) +importFrom(biovizBase,parseArgsForAes) +importFrom(biovizBase,parseArgsForNonAes) importFrom(cowplot,theme_cowplot) importFrom(extraDistr,dbbinom) importFrom(extraDistr,pbbinom) importFrom(extraDistr,rdirmnom) importFrom(generics,fit) +importFrom(ggbio,autoplot) importFrom(ggplot2,aes) importFrom(ggplot2,annotate) importFrom(ggplot2,annotation_logticks) diff --git a/R/AllGenerics-definitions.R b/R/AllGenerics-definitions.R index ced4142a..0a364ac2 100644 --- a/R/AllGenerics-definitions.R +++ b/R/AllGenerics-definitions.R @@ -146,3 +146,9 @@ setGeneric("nonSplicedReads", #' @export setGeneric("nonSplicedReads<-", signature = "object", function(object, value) standardGeneric("nonSplicedReads<-")) + +#' @rdname plotFunctions +#' @export +setGeneric("plotManhattan", function(object, ...) + standardGeneric("plotManhattan")) + diff --git a/R/FRASER-package.R b/R/FRASER-package.R index 642dc225..22cee146 100644 --- a/R/FRASER-package.R +++ b/R/FRASER-package.R @@ -50,6 +50,8 @@ #' @importFrom RColorBrewer brewer.pal #' @importFrom cowplot theme_cowplot #' @importFrom ggrepel geom_text_repel +#' @importFrom biovizBase parseArgsForAes parseArgsForNonAes +#' @importFrom ggbio autoplot #' #' ### Data handling diff --git a/R/plotMethods.R b/R/plotMethods.R index 5021edf0..1b4ae279 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -17,6 +17,7 @@ #' \item plotEncDimSearch() #' \item plotBamCoverage() #' \item plotBamCoverageFromResultTable() +#' \item plotManhattan() #' } #' #' For a detailed description of each plot function please see the details. @@ -209,6 +210,17 @@ #' It plots the encoding dimension against the achieved loss (area under the #' precision-recall curve). From this plot the optimum should be choosen for #' the \code{q} in fitting process. +#' +#' \code{plotManhattan}: A Manhattan plot showing the junction pvalues by +#' genomic position. Useful to identify if outliers cluster by genomic position. +#' +#' \code{plotBamCoverage}: A sashimi plot showing the read coverage from +#' the underlying bam files for a given genomic range and sampleIDs. +#' +#' \code{plotBamCoverageFromResultTable}: A sashimi plot showing the read +#' coverage from the underlying bam files for a row in the results table. Can +#' either show the full range of the gene with the outlier junction or only a +#' certain region around the outlier. #' #' @return If base R graphics are used nothing is returned else the plotly or #' the gplot object is returned. @@ -245,6 +257,9 @@ #' plotQQ(fds, result=res[1]) #' plotExpectedVsObservedPsi(fds, type="psi5", res=res[1]) #' +#' # create manhattan plot of pvalues by genomic position +#' plotManhattan(fds, type="jaccard", sampleID="sample10") +#' #' # plot splice graph and coverage from bam files in a given region #' if(require(SGSeq)){ #' fds <- createTestFraserSettings() @@ -1411,6 +1426,69 @@ plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE, return(invisible(fds)) } +plotManhattan.FRASER <- function(object, sampleID, + type=c("psi5", "psi3", "theta", "jaccard"), + main=paste0("sampleID = ", sampleID), + color=c("black", "darkgrey"), + ...){ + # check arguments + stopifnot(sampleID %in% samples(object)) + type <- match.arg(type) + additional_args <- list(...) + padjCutoff <- 0.05 + if("padjCutoff" %in% names(additional_args)){ + padjCutoff <- additional_args$padjCutoff + } + deltaPsiCutoff <- 0.3 + if("deltaPsiCutoff" %in% names(additional_args)){ + deltaPsiCutoff <- additional_args$deltaPsiCutoff + } + + # extract neccessary informations + gr_sample <- rowRanges(object, type=type) + seqlevelsStyle(gr_sample) <- seqlevelsStyle(object) + mcols(gr_sample)[,"pvalue"] <- -log10( + pVals(object, type=type, level="junction")[,sampleID]) + mcols(gr_sample)[,"padjust"] <- -log10( + padjVals(object, type=type, level="site")[,sampleID]) + mcols(gr_sample)[,"delta"] <- deltaPsiValue(object, type=type)[,sampleID] + + # only one point per donor/acceptor site (relevant only for psi5 and psi3) + index <- FRASER:::getSiteIndex(object, type=type) + nonDup <- !duplicated(index) + gr_sample <- gr_sample[nonDup,] + + # Sort granges for plot + gr_sample <- sortSeqlevels(gr_sample) + gr_sample <- sort(gr_sample) + + # find outlier indices + if(!type %in% c("psi3", "psi5")){ + outlier_idx <- which(gr_sample$padjust >= -log10(padjCutoff) & + abs(gr_sample$delta) >= deltaPsiCutoff) + } else{ + outlier_idx <- which(gr_sample$padjust >= -log10(padjCutoff)) + } + message("highlighting ", length(gr_sample[outlier_idx,]), " outliers ...") + + # plot manhattan plot + plotGrandLinear.adapted(gr_sample, aes(y=pvalue), + color=color, + highlight.gr=gr_sample[outlier_idx,], + highlight.overlap="equal") + + labs(title=main) + +} + +#' +#' Plot manhattan plot of junction pvalues +#' +#' @rdname plotFunctions +#' @export +setMethod("plotManhattan", signature="FraserDataSet", + plotManhattan.FRASER) + + #' #' helper function to get the annotation as data frame from the col data object #' @@ -1507,3 +1585,132 @@ getSGSeqSI <- function(fds, sample_ids){ return(list(si, fds)) } } + +#' +#' Adapted function from ggbio package to create manhattan plot. +#' Adapted to allow highlighting only ranges that exactly match. Uses functions +#' from package biovizBase. +#' +#' @noRd +plotGrandLinear.adapted <- function (obj, ..., facets, space.skip = 0.01, + geom = NULL, cutoff = NULL, cutoff.color = "red", cutoff.size = 1, + legend = FALSE, xlim, ylim, xlab, ylab, main, highlight.gr = NULL, + highlight.name = NULL, highlight.col = "red", highlight.label = TRUE, + highlight.label.size = 5, highlight.label.offset = 0.05, + highlight.label.col = "black", + highlight.overlap = c("any", "start", "end", "within", "equal"), + spaceline = FALSE){ + if (is.null(geom)) + geom <- "point" + args <- list(...) + args.aes <- parseArgsForAes(args) + args.non <- parseArgsForNonAes(args) + two.color <- c("#0080FF", "#4CC4FF") + .is.seq <- FALSE + if (!"colour" %in% names(args.aes)) { + if (!any(c("color", "colour") %in% names(args.non))) { + .color <- two.color + args.aes$color <- as.name("seqnames") + .is.seq <- TRUE + } + else { + if (length(args.non$color) > 1) { + .color <- args.non$color + args.aes$color <- as.name("seqnames") + .is.seq <- TRUE + args.non <- args.non[!names(args.non) %in% c("colour", + "color")] + } + } + } + else { + if (quo_name(args.aes$colour) == "seqnames") + args.aes$colour <- as.name("seqnames") + } + if (!"y" %in% names(args.aes)) + stop("need to provide y") + args.non$coord <- "genome" + args.non$space.skip <- space.skip + args.non$geom <- geom + args.non$object <- obj + aes.res <- do.call(aes, args.aes) + p <- do.call(autoplot, c(list(aes.res), args.non)) + if (!legend) + p <- p + theme(legend.position = "none") + if (!missing(ylab)) + p <- p + ylab(ylab) + if (!is.null(cutoff)) + p <- p + geom_hline(yintercept = cutoff, color = cutoff.color, + size = cutoff.size) + chrs <- names(seqlengths(obj)) + if (.is.seq) { + N <- length(chrs) + cols <- rep(.color, round(N/length(.color)) + 1)[1:N] + names(cols) <- chrs + p <- p + scale_color_manual(values = cols) + } + if (!missing(facets)) { + args$facets <- facets + args.facets <- subsetArgsByFormals(args, facet_grid, + facet_wrap) + facet <- .buildFacetsFromArgs(obj, args.facets) + p <- p + facet + } + p <- p + theme(panel.grid.minor = element_blank()) + if (!is.null(highlight.gr)) { + highlight.overlap <- match.arg(highlight.overlap) + idx <- findOverlaps(obj, highlight.gr, type=highlight.overlap) + .h.pos <- lapply(split(queryHits(idx), subjectHits(idx)), + function(id) { + gr <- GRanges(as.character(seqnames(p@data))[id][1], + IRanges(start = min(start(p@data[id])), end = max(end(p@data[id])))) + val <- max(as.numeric(values(p@data[id])[, quo_name(args.aes$y)])) + val <- val * (1 + highlight.label.offset) + values(gr)$val <- val + gr + }) + .h.pos <- suppressWarnings(do.call("c", unname(.h.pos))) + if (length(.h.pos)) { + if (is.null(highlight.name)) { + highlight.name <- names(highlight.gr) + } + else { + highlight.name <- values(highlight.gr)[, highlight.name] + } + p <- p + geom_point(data = mold(p@data[queryHits(idx)]), + do.call(aes, list(x = substitute(midpoint), y = args.aes$y)), + color = highlight.col) + if (!is.null(highlight.name)) { + seqlevels(.h.pos, pruning.mode = "coarse") <- seqlevels(obj) + suppressWarnings(seqinfo(.h.pos) <- seqinfo(obj)) + .trans <- transformToGenome(.h.pos, space.skip = space.skip) + values(.trans)$mean <- (start(.trans) + end(.trans))/2 + values(.trans)$names <- highlight.name + p <- p + geom_text(data = mold(.trans), size = highlight.label.size, + vjust = 0, color = highlight.label.col, do.call(aes, + list(x = substitute(mean), y = as.name("val"), + label = as.name("names")))) + } + } + } + if (spaceline) { + vline.df <- p@ggplot$data + vline.df <- do.call(rbind, by(vline.df, vline.df$seqnames, + function(dd) { + data.frame(start = min(dd$start), end = max(dd$end)) + })) + gap <- (vline.df$start[-1] + vline.df$end[-nrow(vline.df)])/2 + p <- p + geom_vline(xintercept = gap, alpha = 0.5, color = "gray70") + + theme(panel.grid = element_blank()) + } + if (!missing(main)) + p <- p + labs(title = main) + if (!missing(xlim)) + p <- p + xlim(xlim) + if (!missing(ylim)) + p <- p + ylim(ylim) + if (missing(xlab)) + xlab <- "" + p <- p + ggplot2::xlab(xlab) + p +} diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd index 57b42b60..9f591ebb 100644 --- a/man/plotFunctions.Rd +++ b/man/plotFunctions.Rd @@ -1,6 +1,7 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plotMethods.R -\name{plotFunctions} +% Please edit documentation in R/AllGenerics-definitions.R, R/plotMethods.R +\name{plotManhattan} +\alias{plotManhattan} \alias{plotFunctions} \alias{plotAberrantPerSample} \alias{plotVolcano} @@ -18,8 +19,11 @@ \alias{plotCountCorHeatmap,FraserDataSet-method} \alias{plotBamCoverage} \alias{plotBamCoverageFromResultTable} +\alias{plotManhattan,FraserDataSet-method} \title{Visualization functions for FRASER} \usage{ +plotManhattan(object, ...) + \S4method{plotVolcano}{FraserDataSet}( object, sampleID, @@ -166,10 +170,22 @@ plotBamCoverageFromResultTable( right_extension = 1000, ... ) + +\S4method{plotManhattan}{FraserDataSet}( + object, + sampleID, + type = c("psi5", "psi3", "theta", "jaccard"), + main = paste0("sampleID = ", sampleID), + color = c("black", "darkgrey"), + ... +) } \arguments{ \item{object, fds}{An \code{\link{FraserDataSet}} object.} +\item{...}{Additional parameters passed to plot() or plot_ly() if not stated +otherwise in the details for each plot function} + \item{sampleID}{A sample ID which should be plotted. Can also be a vector. Integers are treated as indices.} @@ -193,9 +209,6 @@ list of gene symbols or sampleIDs.} \item{padjCutoff, zScoreCutoff, deltaPsiCutoff}{Significance, Z-score or delta psi cutoff to mark outliers} -\item{...}{Additional parameters passed to plot() or plot_ly() if not stated -otherwise in the details for each plot function} - \item{BPPARAM}{BiocParallel parameter to use.} \item{idx}{A junction site ID or gene ID or one of both, which @@ -405,6 +418,7 @@ This is the list of all plotting function provided by FRASER: \item plotEncDimSearch() \item plotBamCoverage() \item plotBamCoverageFromResultTable() + \item plotManhattan() } For a detailed description of each plot function please see the details. @@ -453,6 +467,17 @@ introns and for the filtered (i.e. non-variable) introns. It plots the encoding dimension against the achieved loss (area under the precision-recall curve). From this plot the optimum should be choosen for the \code{q} in fitting process. + +\code{plotManhattan}: A Manhattan plot showing the junction pvalues by +genomic position. Useful to identify if outliers cluster by genomic position. + +\code{plotBamCoverage}: A sashimi plot showing the read coverage from +the underlying bam files for a given genomic range and sampleIDs. + +\code{plotBamCoverageFromResultTable}: A sashimi plot showing the read +coverage from the underlying bam files for a row in the results table. Can +either show the full range of the gene with the outlier junction or only a +certain region around the outlier. } \examples{ # create full FRASER object @@ -481,6 +506,9 @@ plotExpression(fds, result=res[1]) plotQQ(fds, result=res[1]) plotExpectedVsObservedPsi(fds, type="psi5", res=res[1]) +# create manhattan plot of pvalues by genomic position +plotManhattan(fds, type="jaccard", sampleID="sample10") + # plot splice graph and coverage from bam files in a given region if(require(SGSeq)){ fds <- createTestFraserSettings() From 0a6addc6991957fdc764be207ae7939fa9188aba Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Thu, 24 Nov 2022 13:21:24 +0100 Subject: [PATCH 38/80] code cleanup & adaption of documentation, examples and tests for FRASER2 --- NAMESPACE | 18 +- NEWS | 21 ++ R/AllGenerics-definitions.R | 5 + R/AllGenerics.R | 32 ++- R/FRASER-package.R | 18 +- R/Fraser-pipeline.R | 16 +- R/annotationOfRanges.R | 6 +- R/autoencoder.R | 3 +- R/calculatePSIValue.R | 18 +- R/example_functions.R | 2 +- R/filterExpression.R | 358 ++++++++++++------------- R/find_encoding_dimensions.R | 29 +- R/fitCorrectionMethods.R | 2 +- R/getNSetterFuns.R | 78 ++++-- R/helper-functions.R | 32 ++- R/makeSimulatedDataset.R | 3 +- R/plotMethods.R | 47 ++-- R/pvalsNzscore.R | 96 ++++++- R/variables.R | 20 +- R/zzz.R | 2 +- man/FRASER.Rd | 42 ++- man/annotateRanges.Rd | 6 +- man/calculatePSIValues.Rd | 6 +- man/counts.Rd | 17 +- man/filtering.Rd | 70 ++--- man/fit.Rd | 4 +- man/getter_setter_functions.Rd | 14 +- man/injectOutliers.Rd | 3 +- man/optimHyperParams.Rd | 13 +- man/plotFunctions.Rd | 40 +-- man/psiTypes.Rd | 8 +- man/psiTypes_avail.Rd | 21 ++ man/results.Rd | 16 +- tests/testthat/test_fraser_pipeline.R | 2 +- tests/testthat/test_hyperParams.R | 1 + tests/testthat/test_plotJunctionDist.R | 8 +- tests/testthat/test_stats.R | 62 ++--- 37 files changed, 659 insertions(+), 480 deletions(-) create mode 100644 man/psiTypes_avail.Rd diff --git a/NAMESPACE b/NAMESPACE index 4d813d74..139fcd83 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -30,6 +30,7 @@ export(bamFile) export(bestQ) export(calculatePSIValues) export(calculatePadjValues) +export(calculatePadjValuesOnSubset) export(calculatePvalues) export(calculateZscore) export(condition) @@ -44,10 +45,7 @@ export(dontWriteHDF5) export(featureExclusionMask) export(filterExpression) export(filterExpressionAndVariability) -export(filterExpressionAndVariability_jaccard) -export(filterExpression_jaccard) export(filterVariability) -export(filterVariability_jaccard) export(fit) export(flagBlacklistRegions) export(getNonSplitReadCountsForAllSamples) @@ -80,6 +78,7 @@ export(plotVolcano) export(predictedMeans) export(pseudocount) export(psiTypes) +export(psiTypes_avail) export(results) export(rho) export(samples) @@ -106,6 +105,7 @@ exportMethods(assays) exportMethods(bamFile) exportMethods(condition) exportMethods(filterExpression) +exportMethods(filterVariability) exportMethods(length) exportMethods(name) exportMethods(nonSplicedReads) @@ -167,6 +167,7 @@ importFrom(GenomeInfoDb,seqlengths) importFrom(GenomeInfoDb,seqlevels) importFrom(GenomeInfoDb,seqlevelsStyle) importFrom(GenomeInfoDb,seqnames) +importFrom(GenomeInfoDb,sortSeqlevels) importFrom(GenomeInfoDb,standardChromosomes) importFrom(GenomicAlignments,junctions) importFrom(GenomicAlignments,readGAlignmentPairs) @@ -180,6 +181,7 @@ importFrom(GenomicFeatures,makeTxDbFromGFF) importFrom(GenomicFeatures,seqlevels0) importFrom(GenomicFeatures,threeUTRsByTranscript) importFrom(GenomicRanges,"end<-") +importFrom(GenomicRanges,"seqinfo<-") importFrom(GenomicRanges,"start<-") importFrom(GenomicRanges,GRanges) importFrom(GenomicRanges,GRangesList) @@ -224,6 +226,7 @@ importFrom(Rsamtools,scanBamHeader) importFrom(Rsubread,featureCounts) importFrom(S4Vectors,"mcols<-") importFrom(S4Vectors,"metadata<-") +importFrom(S4Vectors,"values<-") importFrom(S4Vectors,DataFrame) importFrom(S4Vectors,Rle) importFrom(S4Vectors,SimpleList) @@ -234,6 +237,7 @@ importFrom(S4Vectors,metadata) importFrom(S4Vectors,queryHits) importFrom(S4Vectors,start) importFrom(S4Vectors,subjectHits) +importFrom(S4Vectors,values) importFrom(SummarizedExperiment,"assay<-") importFrom(SummarizedExperiment,"assays<-") importFrom(SummarizedExperiment,"colData<-") @@ -257,18 +261,17 @@ importFrom(VGAM,rbetabinom) importFrom(VGAM,vglm) importFrom(biomaRt,getBM) importFrom(biomaRt,useEnsembl) -importFrom(biovizBase,parseArgsForAes) -importFrom(biovizBase,parseArgsForNonAes) importFrom(cowplot,theme_cowplot) importFrom(extraDistr,dbbinom) importFrom(extraDistr,pbbinom) importFrom(extraDistr,rdirmnom) importFrom(generics,fit) -importFrom(ggbio,autoplot) importFrom(ggplot2,aes) importFrom(ggplot2,annotate) importFrom(ggplot2,annotation_logticks) importFrom(ggplot2,element_blank) +importFrom(ggplot2,facet_grid) +importFrom(ggplot2,facet_wrap) importFrom(ggplot2,geom_abline) importFrom(ggplot2,geom_histogram) importFrom(ggplot2,geom_hline) @@ -277,10 +280,12 @@ importFrom(ggplot2,geom_point) importFrom(ggplot2,geom_ribbon) importFrom(ggplot2,geom_segment) importFrom(ggplot2,geom_smooth) +importFrom(ggplot2,geom_text) importFrom(ggplot2,geom_vline) importFrom(ggplot2,ggplot) importFrom(ggplot2,ggtitle) importFrom(ggplot2,labs) +importFrom(ggplot2,quo_name) importFrom(ggplot2,scale_color_brewer) importFrom(ggplot2,scale_color_discrete) importFrom(ggplot2,scale_color_gradientn) @@ -349,6 +354,7 @@ importFrom(stats,rnbinom) importFrom(stats,rnorm) importFrom(stats,runif) importFrom(stats,sd) +importFrom(tibble,"%>%") importFrom(tibble,as_tibble) importFrom(tools,file_path_as_absolute) importFrom(utils,capture.output) diff --git a/NEWS b/NEWS index 15ca2c35..6728c4a5 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,24 @@ +CHANGES IN VERSION 2.0.0 +------------------------- + o Major update to FRASER2: + o Introduction of new & more robust splice metric Intron Jaccard Index + o Only Intron Jaccard Index metric used by default + o Improved gene level pvalue calculation and internal storage + o Introduction of option to limit FDR correction to user-defined + subsets of genes per sample (e.g. OMIM genes with rare variant) + o Updated internal pseudocount parameter and default delta Jaccard + cutoff + o Junction filtering adapted to usage of Intron Jaccard Index metric + o Require min expression of N >= 10 in 25% of the samples + o Results table: + o Functionality to flag outliers in blacklist regions of the genome + o Functionality to annotate the predicted type of aberrantSplicing + (e.g. exon skipping, intron retention etc.) + o Several updates in the plotting functions + o introduction of manhattan plot functionality + o possibility to create sashimi plots to visualize read coverage in + the bam files for outliers + CHANGES IN VERSION 1.8.1 ------------------------- o Bugfix in merging splicing counts (#41) diff --git a/R/AllGenerics-definitions.R b/R/AllGenerics-definitions.R index 0a364ac2..0015b143 100644 --- a/R/AllGenerics-definitions.R +++ b/R/AllGenerics-definitions.R @@ -152,3 +152,8 @@ setGeneric("nonSplicedReads<-", signature = "object", setGeneric("plotManhattan", function(object, ...) standardGeneric("plotManhattan")) +#' @rdname filtering +#' @export +setGeneric("filterVariability", function(object, ...) + standardGeneric("filterVariability")) + diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 1161e26a..4bfb89fe 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -66,6 +66,9 @@ NULL #' @rdname fds-methods #' @export setMethod("samples", "FraserDataSet", function(object) { + if(!is.null(colnames(object))){ + return(colnames(object)) + } return(as.character(colData(object)[,"sampleID"])) }) @@ -74,6 +77,7 @@ setMethod("samples", "FraserDataSet", function(object) { setReplaceMethod("samples", "FraserDataSet", function(object, value) { colData(object)[,"sampleID"] <- as.character(value) rownames(colData(object)) <- colData(object)[,"sampleID"] + colnames(object) <- as.character(value) validObject(object) return(object) }) @@ -515,12 +519,14 @@ setReplaceMethod("rowRanges", "FraserDataSet", FRASER.rowRanges.replace) #' @examples #' fds <- createTestFraserDataSet() #' -#' counts(fds, type="psi5", side="ofInterest") -#' counts(fds, type="psi5", side="other") +#' counts(fds, side="ofInterest") +#' counts(fds, type="jaccard", side="other") +#' head(K(fds)) +#' head(K(fds, type="psi5")) #' head(K(fds, type="psi3")) #' head(N(fds, type="theta")) #' -setMethod("counts", "FraserDataSet", function(object, type=NULL, +setMethod("counts", "FraserDataSet", function(object, type=currentType(object), side=c("ofInterest", "otherSide")){ side <- match.arg(side) if(side=="ofInterest"){ @@ -552,7 +558,8 @@ setMethod("counts", "FraserDataSet", function(object, type=NULL, #' setter for count data #' #' @rdname counts -setReplaceMethod("counts", "FraserDataSet", function(object, type=NULL, +setReplaceMethod("counts", "FraserDataSet", function(object, + type=currentType(object), side=c("ofInterest", "otherSide"), ..., value){ side <- match.arg(side) @@ -904,22 +911,21 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, #' #' # get aberrant events per sample: on the example data, nothing is aberrant #' # based on the adjusted p-value -#' aberrant(fds, type="psi5", by="sample") +#' aberrant(fds, type="jaccard", by="sample") #' #' # get aberrant events per gene (first annotate gene symbols) #' fds <- annotateRangesWithTxDb(fds) -#' aberrant(fds, type="psi5", by="feature", zScoreCutoff=2, padjCutoff=NA, +#' aberrant(fds, type="jaccard", by="feature", zScoreCutoff=2, padjCutoff=NA, #' aggregate=TRUE) #' #' # find aberrant junctions/splice sites -#' aberrant(fds, type="psi5") +#' aberrant(fds, type="jaccard") #' @export setMethod("results", "FraserDataSet", function(object, sampleIDs=samples(object), padjCutoff=0.05, - zScoreCutoff=NA, deltaPsiCutoff=0.3, - rhoCutoff=0.1, aggregate=FALSE, collapse=FALSE, - minCount=5, psiType=c("psi3", "psi5", "theta", - "jaccard"), + zScoreCutoff=NA, deltaPsiCutoff=0.1, + rhoCutoff=1, aggregate=FALSE, collapse=FALSE, + minCount=5, psiType=currentType(object), geneColumn="hgnc_symbol", additionalColumns=NULL, BPPARAM=bpparam()){ FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff, @@ -931,8 +937,8 @@ setMethod("results", "FraserDataSet", function(object, }) aberrant.FRASER <- function(object, type=currentType(object), - padjCutoff=0.05, deltaPsiCutoff=0.3, - zScoreCutoff=NA, minCount=5, rhoCutoff=0.1, + padjCutoff=0.05, deltaPsiCutoff=0.1, + zScoreCutoff=NA, minCount=5, rhoCutoff=1, by=c("none", "sample", "feature"), aggregate=FALSE, geneColumn="hgnc_symbol", ...){ diff --git a/R/FRASER-package.R b/R/FRASER-package.R index 22cee146..0426bb26 100644 --- a/R/FRASER-package.R +++ b/R/FRASER-package.R @@ -31,6 +31,7 @@ #' rbind Assays #' @importFrom GenomicRanges findOverlaps granges GRanges GRangesList #' makeGRangesFromDataFrame invertStrand start end start<- end<- +#' seqinfo<- #' @importFrom IRanges subsetByOverlaps from to IRanges ranges nearest distance #' %over% #' @importFrom Rsamtools ScanBamParam scanBamHeader bamMapqFilter @@ -50,9 +51,6 @@ #' @importFrom RColorBrewer brewer.pal #' @importFrom cowplot theme_cowplot #' @importFrom ggrepel geom_text_repel -#' @importFrom biovizBase parseArgsForAes parseArgsForNonAes -#' @importFrom ggbio autoplot -#' #' ### Data handling #' @@ -85,10 +83,12 @@ #' #' @importFrom S4Vectors DataFrame metadata Rle SimpleList mcols mcols<- #' start end metadata metadata<- subjectHits queryHits elementMetadata +#' values values<- #' @importFrom grDevices colorRampPalette #' @importFrom GenomeInfoDb keepStandardChromosomes seqlevels<- seqlevels #' seqlengths seqlengths<- seqlevelsStyle<- seqlevelsStyle seqnames -#' seqinfo standardChromosomes dropSeqlevels keepSeqlevels +#' seqinfo standardChromosomes dropSeqlevels keepSeqlevels +#' sortSeqlevels #' @importFrom DelayedArray rowMaxs rowMeans path<- cbind plogis qlogis #' DelayedArray #' @importFrom DelayedMatrixStats colSds rowMedians rowSds colMeans2 rowMeans2 @@ -103,9 +103,10 @@ #' scale_y_log10 scale_color_gradientn labs theme_bw theme #' scale_color_brewer scale_color_discrete scale_linetype_manual #' annotate geom_histogram scale_fill_manual xlim scale_colour_manual -#' element_blank annotation_logticks ylim +#' element_blank annotation_logticks ylim quo_name facet_grid +#' facet_wrap geom_text #' -#' @importFrom tibble as_tibble +#' @importFrom tibble as_tibble %>% #' #' @useDynLib FRASER #' @@ -135,5 +136,8 @@ globalVariables(c(".", "J", ".N", ".asDataFrame", "End", "first_feature", "genes_acceptor", "gene_pval", "gene_padj", "dt_idx", "blacklist", "spliceType", "causesFrameshift", "annotatedJunction", "distNearestGene", "UTR_overlap", "meanCount", "medianCount", - "spliceType2"), + "spliceType2", "nonsplitProportion", "nonsplitCounts", + "nonsplitProportion_99quantile", "startID", "endID", "j_idx", + "start_idx", "end_idx", "pval_gene", "FDR_subset_gene", "gene_id", + "pvalue"), package="FRASER") diff --git a/R/Fraser-pipeline.R b/R/Fraser-pipeline.R index c3b9b967..b7960dad 100644 --- a/R/Fraser-pipeline.R +++ b/R/Fraser-pipeline.R @@ -31,7 +31,7 @@ #' splicing types. #' @param implementation The method that should be used to correct for #' confounders. -#' @param type The type of PSI (psi5, psi3 or theta for theta/splicing +#' @param type The type of PSI (jaccard, psi5, psi3 or theta for theta/splicing #' efficiency) #' @param iterations The maximal number of iterations. When the autoencoder has #' not yet converged after these number of iterations, the fit stops anyway. @@ -61,16 +61,16 @@ #' # The functions run inside the FRASER function can also be directly #' # run themselves. #' # To directly run the fit function: -#' fds <- fit(fds, implementation="PCA", q=2, type="psi5") +#' fds <- fit(fds, implementation="PCA", q=2, type="jaccard") #' #' # To directly run the nomial and adjusted p value and z score #' # calculation, the following functions can be used: -#' fds <- calculatePvalues(fds, type="psi5") -#' head(pVals(fds, type="psi5")) -#' fds <- calculatePadjValues(fds, type="psi5", method="BY") -#' head(padjVals(fds, type="psi5")) -#' fds <- calculateZscore(fds, type="psi5") -#' head(zScores(fds, type="psi5")) +#' fds <- calculatePvalues(fds, type="jaccard") +#' head(pVals(fds, type="jaccard")) +#' fds <- calculatePadjValues(fds, type="jaccard", method="BY") +#' head(padjVals(fds, type="jaccard")) +#' fds <- calculateZscore(fds, type="jaccard") +#' head(zScores(fds, type="jaccard")) #' #' @seealso \code{\link[FRASER]{fit}} #' diff --git a/R/annotationOfRanges.R b/R/annotationOfRanges.R index da036f5b..13939780 100644 --- a/R/annotationOfRanges.R +++ b/R/annotationOfRanges.R @@ -32,13 +32,13 @@ #' # either using biomart with GRCh38 #' try({ #' fds <- annotateRanges(fds, GRCh=38) -#' rowRanges(fds, type="psi5")[,c("hgnc_symbol")] +#' rowRanges(fds, type="j")[,c("hgnc_symbol")] #' }) #' #' # either using biomart with GRCh37 #' try({ #' fds <- annotateRanges(fds, featureName="hgnc_symbol_37", GRCh=37) -#' rowRanges(fds, type="psi5")[,c("hgnc_symbol_37")] +#' rowRanges(fds, type="j")[,c("hgnc_symbol_37")] #' }) #' #' # or with a provided TxDb object @@ -47,7 +47,7 @@ #' require(org.Hs.eg.db) #' orgDb <- org.Hs.eg.db #' fds <- annotateRangesWithTxDb(fds, txdb=txdb, orgDb=orgDb) -#' rowRanges(fds, type="psi5")[,"hgnc_symbol"] +#' rowRanges(fds, type="j")[,"hgnc_symbol"] #' #' @rdname annotateRanges #' @export diff --git a/R/autoencoder.R b/R/autoencoder.R index f640fc7e..d9d0e7a2 100644 --- a/R/autoencoder.R +++ b/R/autoencoder.R @@ -2,7 +2,8 @@ #' Main autoencoder fit function #' #' @noRd -fitAutoencoder <- function(fds, q, type="psi3", noiseAlpha=1, minDeltaPsi=0.1, +fitAutoencoder <- function(fds, q, type=currentType(fds), noiseAlpha=1, + minDeltaPsi=0.1, rhoRange=c(-30, 30), lambda=0, convergence=1e-5, iterations=15, initialize=TRUE, control=list(), BPPARAM=bpparam(), verbose=FALSE, nrDecoderBatches=5, diff --git a/R/calculatePSIValue.R b/R/calculatePSIValue.R index fafc4f33..c0e9d425 100644 --- a/R/calculatePSIValue.R +++ b/R/calculatePSIValue.R @@ -14,18 +14,18 @@ #' #' @inheritParams countRNA #' @param types A vector with the psi types which should be calculated. Default -#' is all of psi5, psi3 and theta. +#' is all of jaccard, psi5, psi3 and theta. #' @param overwriteCts FALSE or TRUE (the default) the total counts (aka N) will #' be recalculated based on the existing junction counts (aka K) #' @return FraserDataSet #' @export #' @examples #' fds <- createTestFraserDataSet() -#' fds <- calculatePSIValues(fds, types="psi5") +#' fds <- calculatePSIValues(fds, types="jaccard") #' #' ### usually one would run this function for all psi types by using: #' # fds <- calculatePSIValues(fds) -calculatePSIValues <- function(fds, types=psiTypes, overwriteCts=FALSE, +calculatePSIValues <- function(fds, types=psiTypes_avail, overwriteCts=FALSE, BPPARAM=bpparam()){ # check input stopifnot(is(fds, "FraserDataSet")) @@ -187,7 +187,7 @@ calculateSitePSIValue <- function(fds, overwriteCts, BPPARAM){ # check input stopifnot(is(fds, "FraserDataSet")) - message(date(), ": Calculate the PSI site values ...") + message(date(), ": Calculate the theta values ...") psiName <- "theta" psiROCName <- "rawOtherCounts_theta" @@ -346,8 +346,6 @@ calculateJaccardIntronIndex <- function(fds, overwriteCts){ jaccardValues <- K(fds, type="j") / jaccard_denom otherCounts_jaccard <- jaccard_denom - K(fds, type="j") - # TODO also calculate it with nonsplit counts in the nominator - # assign it to our object assay(fds, type="j", "jaccard", withDimnames=FALSE) <- jaccardValues @@ -360,15 +358,15 @@ calculateJaccardIntronIndex <- function(fds, overwriteCts){ return(fds) } -#' Calculates the sum of nonsplit reads overlapping either the donor or acceptor -#' splice site and stores it as a new assay (one value for each junction and -#' sample). +#' Calculates the sum of nonsplit reads overlapping either the donor or +#' acceptor splice site and stores it as a new assay (one value for each +#' junction and sample). #' #' @noRd calculateIntronNonsplitSum <- function(fds, overwriteCts){ stopifnot(is(fds, "FraserDataSet")) - message(date(), ": Calculate the total nonsplict counts for each intron ", + message(date(), ": Calculate the total nonsplit counts for each intron ", "...") diff --git a/R/example_functions.R b/R/example_functions.R index 9d8bbf21..cec001f6 100644 --- a/R/example_functions.R +++ b/R/example_functions.R @@ -84,7 +84,7 @@ createTestFraserDataSet <- function(workingDir="FRASER_output", rerun=FALSE){ suppressMessages({ fds <- annotateRangesWithTxDb(fds) }) # run FRASER pipeline - fds <- FRASER(fds, q=c(psi5=2, psi3=2, theta=2), iterations=2) + fds <- FRASER(fds, q=c(jaccard=2, psi5=2, psi3=2, theta=2), iterations=2) # save data for later fds <- saveFraserDataSet(fds) diff --git a/R/filterExpression.R b/R/filterExpression.R index a62ae794..23729ee9 100644 --- a/R/filterExpression.R +++ b/R/filterExpression.R @@ -27,8 +27,8 @@ #' @examples #' fds <- createTestFraserDataSet() #' fds <- filterExpressionAndVariability(fds, minDeltaPsi=0.1, filter=FALSE) -#' mcols(fds, type="psi5")[, c( -#' "maxCount", "passedExpression", "maxDPsi3", "passedVariability")] +#' mcols(fds, type="jaccard")[, c( +#' "maxCount", "passedExpression", "maxDJaccard", "passedVariability")] #' #' plotFilterExpression(fds) #' plotFilterVariability(fds) @@ -42,7 +42,7 @@ NULL #' read support and introns that are not variable across samples. #' @export filterExpressionAndVariability <- function(object, minExpressionInOneSample=20, - quantile=0.95, quantileMinExpression=10, minDeltaPsi=0.05, + quantile=0.75, quantileMinExpression=10, minDeltaPsi=0.0, filter=TRUE, delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), BPPARAM=bpparam()){ @@ -64,49 +64,47 @@ filterExpressionAndVariability <- function(object, minExpressionInOneSample=20, } -filterExpression.FRASER <- function(object, minExpressionInOneSample=20, - quantile=0.95, quantileMinExpression=10, filter=TRUE, +#' This function filters out introns and corresponding +#' splice sites which are expressed at very low levels across samples. +#' @noRd +filterExpression_jaccard <- function(object, minExpressionInOneSample=20, + quantile=0.75, quantileMinExpression=10, filter=TRUE, delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), BPPARAM=bpparam()){ - + stopifnot(is(object, "FraserDataSet")) message(date(), ": Filtering out introns with low read support ...") # extract counts cts <- K(object, type="j") - ctsN5 <- N(object, type="psi5") - ctsN3 <- N(object, type="psi3") + ctsN <- N(object, type="jaccard") if(isFALSE(delayed)){ cts <- as.matrix(cts) - ctsN5 <- as.matrix(ctsN5) - ctsN3 <- as.matrix(ctsN3) + ctsN <- as.matrix(ctsN) } - + # cutoff functions f1 <- function(cts, ...){ - rowMaxs(cts) } - f2 <- function(cts, ctsN5, quantile, ...){ - rowQuantiles(ctsN5, probs=quantile, drop=FALSE)[,1] } - f3 <- function(cts, ctsN3, quantile, ...) { - rowQuantiles(ctsN3, probs=quantile, drop=FALSE)[,1] } - - funs <- c(maxCount=f1, quantileValue5=f2, quantileValue3=f3) - + rowMaxs(cts) } + f2 <- function(cts, ctsN, quantile, ...){ + rowQuantiles(ctsN, probs=quantile, drop=FALSE)[,1] } + + funs <- c(maxCount=f1, quantileValueN=f2) + # run it in parallel cutoffs <- bplapply(funs, function(f, ...) f(...), BPPARAM=BPPARAM, - cts=cts, ctsN3=ctsN3, ctsN5=ctsN5, quantile=quantile) - + cts=cts, ctsN=ctsN, quantile=quantile) + # add annotation to object for(n in names(cutoffs)){ mcols(object, type="j")[n] <- cutoffs[[n]] } mcols(object, type="j")[['passedExpression']] <- - cutoffs$maxCount >= minExpressionInOneSample & - (cutoffs$quantileValue5 >= quantileMinExpression & - cutoffs$quantileValue3 >= quantileMinExpression) + cutoffs$maxCount >= minExpressionInOneSample & + cutoffs$quantileValueN >= quantileMinExpression if("passedVariability" %in% colnames(mcols(object, type="j"))){ mcols(object, type="j")[['passed']] <- mcols(object, type="j")[['passedExpression']] & @@ -118,10 +116,11 @@ filterExpression.FRASER <- function(object, minExpressionInOneSample=20, # filter if requested if(isTRUE(filter)){ - object <- applyExpressionFilters(object, minExpressionInOneSample, - quantileMinExpression) + object <- applyExpressionFilters_jaccard(object, + minExpressionInOneSample, + quantileMinExpression) } - + validObject(object) return(object) } @@ -130,78 +129,51 @@ filterExpression.FRASER <- function(object, minExpressionInOneSample=20, #' splice sites that have low read support in all samples. #' @export setMethod("filterExpression", signature="FraserDataSet", - filterExpression.FRASER) + filterExpression_jaccard) -#' @describeIn filtering This function filters out introns and corresponding + +#' This function filters out introns and corresponding #' splice sites which do not show variablity across samples. -#' @export -filterVariability <- function(object, minDeltaPsi=0.05, filter=TRUE, - delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), - BPPARAM=bpparam()){ +#' @noRd +filterVariability_jaccard <- function(object, minDeltaPsi=0, filter=TRUE, + delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), + BPPARAM=bpparam()){ message(date(), ": Filtering out non-variable introns ...") # extract counts cts <- K(object, type="j") - ctsSE <- K(object, type="ss") - ctsN5 <- N(object, type="psi5") - ctsN3 <- N(object, type="psi3") - ctsNSE <- N(object, type="theta") + ctsN <- N(object, type="jaccard") if(isFALSE(delayed)){ cts <- as.matrix(cts) - ctsN5 <- as.matrix(ctsN5) - ctsN3 <- as.matrix(ctsN3) - ctsSE <- as.matrix(ctsSE) - ctsNSE <- as.matrix(ctsNSE) + ctsN <- as.matrix(ctsN) } # cutoff functions - f1 <- function(cts, ctsN3, ...) { - psi <- cts/ctsN3 - rowMaxs(abs(psi - rowMeans2(psi, na.rm=TRUE)), na.rm=TRUE) } - f2 <- function(cts, ctsN5, ...) { - psi <- cts/ctsN5 - rowMaxs(abs(psi - rowMeans2(psi, na.rm=TRUE)), na.rm=TRUE) } - f3 <- function(ctsSE, ctsNSE, ...) { - theta <- ctsSE/ctsNSE - dTheta <- rowMaxs(abs(theta - rowMeans2(theta, na.rm=TRUE)), - na.rm=TRUE) } - + f1 <- function(cts, ctsN, ...) { + jaccard <- cts/ctsN + rowMaxs(abs(jaccard - rowMeans2(jaccard, na.rm=TRUE)), + na.rm=TRUE) } - funs <- c(maxDPsi3=f1, maxDPsi5=f2, maxDTheta=f3) + funs <- c(maxDJaccard=f1) # run it in parallel cutoffs <- bplapply(funs, function(f, ...) f(...), BPPARAM=BPPARAM, - cts=cts, ctsN3=ctsN3, ctsN5=ctsN5, - ctsSE=ctsSE, ctsNSE=ctsNSE) + cts=cts, ctsN=ctsN) # add annotation to object for(n in names(cutoffs)){ - if(n == "maxDTheta"){ - mcols(object, type="ss")[n] <- cutoffs[[n]] - } else{ - mcols(object, type="j")[n] <- cutoffs[[n]] - } + mcols(object, type="j")[n] <- cutoffs[[n]] } # add annotation of theta on splice sites of introns to mcols intron_dt <- as.data.table(rowRanges(object, type="j")) - ss_dt <- as.data.table(rowRanges(object, type="ss")) - mcols(object, type="j")["maxDThetaDonor"] <- - merge(intron_dt, ss_dt, by.x="startID", by.y="spliceSiteID", - all.x=TRUE, sort=FALSE)[,maxDTheta] - mcols(object, type="j")["maxDThetaAcceptor"] <- - merge(intron_dt, ss_dt, by.x="endID", by.y="spliceSiteID", - all.x=TRUE, sort=FALSE)[,maxDTheta] - + # check which introns pass the filter mcols(object, type="j")[['passedVariability']] <- pmax(na.rm=TRUE, - cutoffs$maxDPsi3, - cutoffs$maxDPsi5, - mcols(object, type="j")$maxDThetaDonor, - mcols(object, type="j")$maxDThetaAcceptor, - 0) >= minDeltaPsi + cutoffs$maxDJaccard, + 0) >= minDeltaPsi if("passedExpression" %in% colnames(mcols(object, type="j"))){ mcols(object, type="j")[['passed']] <- mcols(object, type="j")[['passedExpression']] & @@ -213,27 +185,31 @@ filterVariability <- function(object, minDeltaPsi=0.05, filter=TRUE, # filter if requested if(isTRUE(filter)){ - object <- applyVariabilityFilters(object, minDeltaPsi) + object <- applyVariabilityFilters_jaccard(object, minDeltaPsi) } validObject(object) return(object) } +#' @describeIn filtering This function filters out introns and corresponding +#' splice sites that have low read support in all samples. +#' @export +setMethod("filterVariability", signature="FraserDataSet", + filterVariability_jaccard) + #' Applies previously calculated filters for expression filters #' @noRd -applyExpressionFilters <- function(fds, minExpressionInOneSample, - quantileMinExpression){ +applyExpressionFilters_jaccard <- function(fds, minExpressionInOneSample, + quantileMinExpression){ maxCount <- mcols(fds, type="j")[['maxCount']] - quantileValue5 <- mcols(fds, type="j")[['quantileValue5']] - quantileValue3 <- mcols(fds, type="j")[['quantileValue3']] + quantileValueN <- mcols(fds, type="j")[['quantileValueN']] # report rare junctions that passed minExpression filter but not # quantileFilter as SE obj junctionsToReport <- maxCount >= minExpressionInOneSample & - !(quantileValue5 >= quantileMinExpression & - quantileValue3 >= quantileMinExpression) + !(quantileValueN >= quantileMinExpression) outputDir <- file.path(workingDir(fds), "savedObjects", nameNoSpace(fds)) if(any(junctionsToReport)){ @@ -242,13 +218,14 @@ applyExpressionFilters <- function(fds, minExpressionInOneSample, for(aname in assayNames(rareJunctions)){ if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5", "rawOtherCounts_psi3", "psi5", "psi3", - "delta_psi5", "delta_psi3"))){ + "delta_psi5", "delta_psi3", "jaccard", + "rawOtherCounts_intron_jaccard"))){ assay(rareJunctions, aname) <- NULL } } rareJunctions <- saveHDF5SummarizedExperiment(rareJunctions, - dir=file.path(tempdir(), "tmp_rJ"), - replace=TRUE) + dir=file.path(tempdir(), "tmp_rJ"), + replace=TRUE) # check if folder already exists from previous filtering rareJctsDir <- file.path(outputDir, "rareJunctions") @@ -258,8 +235,7 @@ applyExpressionFilters <- function(fds, minExpressionInOneSample, "cannot be restored.") rJ_stored <- loadHDF5SummarizedExperiment(dir=rareJctsDir) toReport <- mcols(rJ_stored)$maxCount >= minExpressionInOneSample & - !(mcols(rJ_stored)$quantileValue5 >= quantileMinExpression & - mcols(rJ_stored)$quantileValue3 >= quantileMinExpression) + !(mcols(rJ_stored)$quantileValueN >= quantileMinExpression) rJ_tmp <- rbind(rJ_stored[toReport,], rareJunctions) @@ -273,7 +249,7 @@ applyExpressionFilters <- function(fds, minExpressionInOneSample, } rareJunctions <- saveHDF5SummarizedExperiment(rareJunctions, - dir=rareJctsDir, replace=TRUE) + dir=rareJctsDir, replace=TRUE) } # apply filter @@ -284,24 +260,19 @@ applyExpressionFilters <- function(fds, minExpressionInOneSample, fds <- fds[mcols(fds, type="j")[['passedExpression']], by="psi5"] return(fds) - } + #' Applies previously calculated variablilty filters #' @noRd -applyVariabilityFilters <- function(fds, minDeltaPsi){ +applyVariabilityFilters_jaccard <- function(fds, minDeltaPsi){ # passedVariability <- mcols(fds, type="j")[['passedVariability']] - # maxDPsi3 <- mcols(fds, type="j")[['maxDPsi3']] - # maxDPsi5 <- mcols(fds, type="j")[['maxDPsi5']] - # maxDThetaDonor <- mcols(fds, type="j")[['maxDThetaDonor']] - # maxDThetaAcceptor <- mcols(fds, type="j")[['maxDThetaAcceptor']] # store information of non-variable junctions filtered <- !passedVariability - # filtered <- (pmax(maxDPsi3, maxDPsi5, maxDThetaDonor, maxDThetaAcceptor) - # < minDeltaPsi) + outputDir <- file.path(workingDir(fds), "savedObjects", nameNoSpace(fds)) if(any(filtered)){ # get SE object of junctions to report @@ -309,7 +280,8 @@ applyVariabilityFilters <- function(fds, minDeltaPsi){ for(aname in assayNames(nonVariableJunctions)){ if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5", "rawOtherCounts_psi3", "psi5", "psi3", - "delta_psi5", "delta_psi3"))){ + "delta_psi5", "delta_psi3", "jaccard", + "rawOtherCounts_intron_jaccard"))){ assay(nonVariableJunctions, aname) <- NULL } } @@ -324,16 +296,13 @@ applyVariabilityFilters <- function(fds, minDeltaPsi){ "that were already filtered out but should be kept now ", "cannot be restored.") nV_stored <- loadHDF5SummarizedExperiment(dir=nonVarJctsDir) - toReport <- mcols(nV_stored)$maxDPsi5 < minDeltaPsi & - mcols(nV_stored)$maxDPsi3 < minDeltaPsi & - mcols(nV_stored)$maxDThetaDonor < minDeltaPsi & - mcols(nV_stored)$maxDThetaAcceptor < minDeltaPsi + toReport <- mcols(nV_stored)$maxDJaccard < minDeltaPsi nVJunctions <- rbind(nonVariableJunctions, nV_stored[toReport,]) for(aname in assayNames(nVJunctions)){ assay(nVJunctions, aname) <- - rbind(as.matrix(assay(nonVariableJunctions, aname)), - as.matrix(assay(nV_stored[toReport,], aname)) ) + rbind(as.matrix(assay(nonVariableJunctions, aname)), + as.matrix(assay(nV_stored[toReport,], aname)) ) } nonVariableJunctions <- nVJunctions rm(nVJunctions) @@ -351,44 +320,15 @@ applyVariabilityFilters <- function(fds, minDeltaPsi){ "% of the junctions")) fds <- fds[mcols(fds, type="j")[['passedVariability']], by="psi5"] return(fds) - } -#' @describeIn filtering This functions filters out both introns with low -#' read support and introns that are not variable across samples. -#' @export -filterExpressionAndVariability_jaccard <- function(object, - minExpressionInOneSample=20, - quantile=0.95, quantileMinExpression=1, minDelta=0.05, - filter=TRUE, + +#' Old FRASER1 filtering functions +#' @noRd +filterExpression.FRASER <- function(object, minExpressionInOneSample=20, + quantile=0.95, quantileMinExpression=10, filter=TRUE, delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), BPPARAM=bpparam()){ - # filter introns with low read support and corresponding splice sites - object <- filterExpression_jaccard(object, - minExpressionInOneSample=minExpressionInOneSample, - quantile=quantile, - quantileMinExpression=quantileMinExpression, - filter=filter, delayed=delayed, - BPPARAM=BPPARAM) - - # filter introns that are not variable across samples - object <- filterVariability_jaccard(object, minDelta=minDelta, - filter=filter, - delayed=delayed, BPPARAM=BPPARAM) - - # return fds - message(date(), ": Filtering done!") - return(object) - -} - -#' @describeIn filtering This function filters out introns and corresponding -#' splice sites which are expressed at very low levels across samples. -#' @export -filterExpression_jaccard <- function(object, minExpressionInOneSample=20, - quantile=0.95, quantileMinExpression=1, filter=TRUE, - delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), - BPPARAM=bpparam()){ stopifnot(is(object, "FraserDataSet")) @@ -396,24 +336,28 @@ filterExpression_jaccard <- function(object, minExpressionInOneSample=20, # extract counts cts <- K(object, type="j") - ctsN <- N(object, type="jaccard") + ctsN5 <- N(object, type="psi5") + ctsN3 <- N(object, type="psi3") if(isFALSE(delayed)){ cts <- as.matrix(cts) - ctsN <- as.matrix(ctsN) + ctsN5 <- as.matrix(ctsN5) + ctsN3 <- as.matrix(ctsN3) } # cutoff functions f1 <- function(cts, ...){ rowMaxs(cts) } - f2 <- function(cts, ctsN, quantile, ...){ - rowQuantiles(ctsN, probs=quantile, drop=FALSE)[,1] } + f2 <- function(cts, ctsN5, quantile, ...){ + rowQuantiles(ctsN5, probs=quantile, drop=FALSE)[,1] } + f3 <- function(cts, ctsN3, quantile, ...) { + rowQuantiles(ctsN3, probs=quantile, drop=FALSE)[,1] } - funs <- c(maxCount=f1, quantileValueN=f2) + funs <- c(maxCount=f1, quantileValue5=f2, quantileValue3=f3) # run it in parallel cutoffs <- bplapply(funs, function(f, ...) f(...), BPPARAM=BPPARAM, - cts=cts, ctsN=ctsN, quantile=quantile) + cts=cts, ctsN3=ctsN3, ctsN5=ctsN5, quantile=quantile) # add annotation to object for(n in names(cutoffs)){ @@ -422,7 +366,8 @@ filterExpression_jaccard <- function(object, minExpressionInOneSample=20, mcols(object, type="j")[['passedExpression']] <- cutoffs$maxCount >= minExpressionInOneSample & - cutoffs$quantileValueN >= quantileMinExpression + (cutoffs$quantileValue5 >= quantileMinExpression & + cutoffs$quantileValue3 >= quantileMinExpression) if("passedVariability" %in% colnames(mcols(object, type="j"))){ mcols(object, type="j")[['passed']] <- mcols(object, type="j")[['passedExpression']] & @@ -434,57 +379,83 @@ filterExpression_jaccard <- function(object, minExpressionInOneSample=20, # filter if requested if(isTRUE(filter)){ - object <- applyExpressionFilters_jaccard(object, - minExpressionInOneSample, - quantileMinExpression) + object <- applyExpressionFilters(object, minExpressionInOneSample, + quantileMinExpression) } validObject(object) return(object) } -#' @describeIn filtering This function filters out introns and corresponding -#' splice sites which do not show variablity across samples. -#' @export -filterVariability_jaccard <- function(object, minDelta=0, filter=TRUE, - delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), - BPPARAM=bpparam()){ +#' Old FRASER1 filtering functions +#' @noRd +filterVariability.FRASER <- function(object, minDeltaPsi=0.05, filter=TRUE, + delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), + BPPARAM=bpparam()){ message(date(), ": Filtering out non-variable introns ...") # extract counts cts <- K(object, type="j") - ctsN <- N(object, type="jaccard") + ctsSE <- K(object, type="ss") + ctsN5 <- N(object, type="psi5") + ctsN3 <- N(object, type="psi3") + ctsNSE <- N(object, type="theta") if(isFALSE(delayed)){ cts <- as.matrix(cts) - ctsN <- as.matrix(ctsN) + ctsN5 <- as.matrix(ctsN5) + ctsN3 <- as.matrix(ctsN3) + ctsSE <- as.matrix(ctsSE) + ctsNSE <- as.matrix(ctsNSE) } # cutoff functions - f1 <- function(cts, ctsN, ...) { - jaccard <- cts/ctsN - rowMaxs(abs(jaccard - rowMeans2(jaccard, na.rm=TRUE)), - na.rm=TRUE) } + f1 <- function(cts, ctsN3, ...) { + psi <- cts/ctsN3 + rowMaxs(abs(psi - rowMeans2(psi, na.rm=TRUE)), na.rm=TRUE) } + f2 <- function(cts, ctsN5, ...) { + psi <- cts/ctsN5 + rowMaxs(abs(psi - rowMeans2(psi, na.rm=TRUE)), na.rm=TRUE) } + f3 <- function(ctsSE, ctsNSE, ...) { + theta <- ctsSE/ctsNSE + dTheta <- rowMaxs(abs(theta - rowMeans2(theta, na.rm=TRUE)), + na.rm=TRUE) } - funs <- c(maxDJaccard=f1) + + funs <- c(maxDPsi3=f1, maxDPsi5=f2, maxDTheta=f3) # run it in parallel cutoffs <- bplapply(funs, function(f, ...) f(...), BPPARAM=BPPARAM, - cts=cts, ctsN=ctsN) + cts=cts, ctsN3=ctsN3, ctsN5=ctsN5, + ctsSE=ctsSE, ctsNSE=ctsNSE) # add annotation to object for(n in names(cutoffs)){ + if(n == "maxDTheta"){ + mcols(object, type="ss")[n] <- cutoffs[[n]] + } else{ mcols(object, type="j")[n] <- cutoffs[[n]] + } } # add annotation of theta on splice sites of introns to mcols intron_dt <- as.data.table(rowRanges(object, type="j")) + ss_dt <- as.data.table(rowRanges(object, type="ss")) + mcols(object, type="j")["maxDThetaDonor"] <- + merge(intron_dt, ss_dt, by.x="startID", by.y="spliceSiteID", + all.x=TRUE, sort=FALSE)[,maxDTheta] + mcols(object, type="j")["maxDThetaAcceptor"] <- + merge(intron_dt, ss_dt, by.x="endID", by.y="spliceSiteID", + all.x=TRUE, sort=FALSE)[,maxDTheta] # check which introns pass the filter mcols(object, type="j")[['passedVariability']] <- pmax(na.rm=TRUE, - cutoffs$maxDJaccard, - 0) >= minDelta + cutoffs$maxDPsi3, + cutoffs$maxDPsi5, + mcols(object, type="j")$maxDThetaDonor, + mcols(object, type="j")$maxDThetaAcceptor, + 0) >= minDeltaPsi if("passedExpression" %in% colnames(mcols(object, type="j"))){ mcols(object, type="j")[['passed']] <- mcols(object, type="j")[['passedExpression']] & @@ -496,7 +467,7 @@ filterVariability_jaccard <- function(object, minDelta=0, filter=TRUE, # filter if requested if(isTRUE(filter)){ - object <- applyVariabilityFilters(object, minDelta) + object <- applyVariabilityFilters(object, minDeltaPsi) } validObject(object) @@ -505,16 +476,18 @@ filterVariability_jaccard <- function(object, minDelta=0, filter=TRUE, #' Applies previously calculated filters for expression filters #' @noRd -applyExpressionFilters_jaccard <- function(fds, minExpressionInOneSample, - quantileMinExpression){ +applyExpressionFilters <- function(fds, minExpressionInOneSample, + quantileMinExpression){ maxCount <- mcols(fds, type="j")[['maxCount']] - quantileValueN <- mcols(fds, type="j")[['quantileValueN']] + quantileValue5 <- mcols(fds, type="j")[['quantileValue5']] + quantileValue3 <- mcols(fds, type="j")[['quantileValue3']] # report rare junctions that passed minExpression filter but not # quantileFilter as SE obj junctionsToReport <- maxCount >= minExpressionInOneSample & - !(quantileValueN >= quantileMinExpression) + !(quantileValue5 >= quantileMinExpression & + quantileValue3 >= quantileMinExpression) outputDir <- file.path(workingDir(fds), "savedObjects", nameNoSpace(fds)) if(any(junctionsToReport)){ @@ -522,15 +495,14 @@ applyExpressionFilters_jaccard <- function(fds, minExpressionInOneSample, rareJunctions <- asSE(fds[junctionsToReport, by="j"]) for(aname in assayNames(rareJunctions)){ if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5", - "rawOtherCounts_psi3", "psi5", "psi3", - "delta_psi5", "delta_psi3", "jaccard", - "rawOtherCounts_intron_jaccard"))){ + "rawOtherCounts_psi3", "psi5", "psi3", + "delta_psi5", "delta_psi3"))){ assay(rareJunctions, aname) <- NULL } } rareJunctions <- saveHDF5SummarizedExperiment(rareJunctions, - dir=file.path(tempdir(), "tmp_rJ"), - replace=TRUE) + dir=file.path(tempdir(), "tmp_rJ"), + replace=TRUE) # check if folder already exists from previous filtering rareJctsDir <- file.path(outputDir, "rareJunctions") @@ -540,60 +512,63 @@ applyExpressionFilters_jaccard <- function(fds, minExpressionInOneSample, "cannot be restored.") rJ_stored <- loadHDF5SummarizedExperiment(dir=rareJctsDir) toReport <- mcols(rJ_stored)$maxCount >= minExpressionInOneSample & - !(mcols(rJ_stored)$quantileValueN >= quantileMinExpression) + !(mcols(rJ_stored)$quantileValue5 >= quantileMinExpression & + mcols(rJ_stored)$quantileValue3 >= quantileMinExpression) rJ_tmp <- rbind(rJ_stored[toReport,], rareJunctions) for(aname in assayNames(rJ_tmp)){ assay(rJ_tmp, aname) <- rbind(as.matrix(assay(rareJunctions, aname)), - as.matrix(assay(rJ_stored[toReport,], aname)) ) + as.matrix(assay(rJ_stored[toReport,], aname)) ) } rareJunctions <- rJ_tmp rm(rJ_tmp) } rareJunctions <- saveHDF5SummarizedExperiment(rareJunctions, - dir=rareJctsDir, replace=TRUE) + dir=rareJctsDir, replace=TRUE) } # apply filter numFilt <- sum(mcols(fds, type="j")[['passedExpression']]) message(paste0("Keeping ", numFilt, " junctions out of ", length(fds), - ". This is ", signif(numFilt/length(fds)*100, 3), - "% of the junctions")) + ". This is ", signif(numFilt/length(fds)*100, 3), + "% of the junctions")) fds <- fds[mcols(fds, type="j")[['passedExpression']], by="psi5"] return(fds) - } - #' Applies previously calculated variablilty filters #' @noRd -applyVariabilityFilters <- function(fds, minDelta){ +applyVariabilityFilters <- function(fds, minDeltaPsi){ # passedVariability <- mcols(fds, type="j")[['passedVariability']] + # maxDPsi3 <- mcols(fds, type="j")[['maxDPsi3']] + # maxDPsi5 <- mcols(fds, type="j")[['maxDPsi5']] + # maxDThetaDonor <- mcols(fds, type="j")[['maxDThetaDonor']] + # maxDThetaAcceptor <- mcols(fds, type="j")[['maxDThetaAcceptor']] # store information of non-variable junctions filtered <- !passedVariability - + # filtered <- (pmax(maxDPsi3, maxDPsi5, maxDThetaDonor, maxDThetaAcceptor) + # < minDeltaPsi) outputDir <- file.path(workingDir(fds), "savedObjects", nameNoSpace(fds)) if(any(filtered)){ # get SE object of junctions to report nonVariableJunctions <- asSE(fds[filtered, by="j"]) for(aname in assayNames(nonVariableJunctions)){ if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5", - "rawOtherCounts_psi3", "psi5", "psi3", - "delta_psi5", "delta_psi3", "jaccard", - "rawOtherCounts_intron_jaccard"))){ + "rawOtherCounts_psi3", "psi5", "psi3", + "delta_psi5", "delta_psi3"))){ assay(nonVariableJunctions, aname) <- NULL } } nonVariableJunctions <- saveHDF5SummarizedExperiment(replace=TRUE, - nonVariableJunctions, - dir=file.path(tempdir(), "tmp_nvJ")) + nonVariableJunctions, + dir=file.path(tempdir(), "tmp_nvJ")) # check if folder already exists from previous filtering nonVarJctsDir <- file.path(outputDir, "nonVariableJunctions") @@ -602,28 +577,31 @@ applyVariabilityFilters <- function(fds, minDelta){ "that were already filtered out but should be kept now ", "cannot be restored.") nV_stored <- loadHDF5SummarizedExperiment(dir=nonVarJctsDir) - toReport <- mcols(nV_stored)$maxDJaccard < minDelta + toReport <- mcols(nV_stored)$maxDPsi5 < minDeltaPsi & + mcols(nV_stored)$maxDPsi3 < minDeltaPsi & + mcols(nV_stored)$maxDThetaDonor < minDeltaPsi & + mcols(nV_stored)$maxDThetaAcceptor < minDeltaPsi nVJunctions <- rbind(nonVariableJunctions, nV_stored[toReport,]) for(aname in assayNames(nVJunctions)){ assay(nVJunctions, aname) <- rbind(as.matrix(assay(nonVariableJunctions, aname)), - as.matrix(assay(nV_stored[toReport,], aname)) ) + as.matrix(assay(nV_stored[toReport,], aname)) ) } nonVariableJunctions <- nVJunctions rm(nVJunctions) } nonVariableJunctions <- saveHDF5SummarizedExperiment(dir=nonVarJctsDir, - x=nonVariableJunctions, replace=TRUE) + x=nonVariableJunctions, replace=TRUE) } # apply filtering numFilt <- sum(passedVariability) message(paste0("Keeping ", numFilt, " junctions out of ", length(fds), - ". This is ", signif(numFilt/length(fds)*100, 3), - "% of the junctions")) + ". This is ", signif(numFilt/length(fds)*100, 3), + "% of the junctions")) fds <- fds[mcols(fds, type="j")[['passedVariability']], by="psi5"] return(fds) diff --git a/R/find_encoding_dimensions.R b/R/find_encoding_dimensions.R index af5be75a..f3747eab 100644 --- a/R/find_encoding_dimensions.R +++ b/R/find_encoding_dimensions.R @@ -113,17 +113,18 @@ findEncodingDim <- function(i, fds, type, params, implementation, #' @examples #' # generate data #' fds <- makeSimulatedFraserDataSet(m=15, j=20) +#' fds <- calculatePSIValues(fds) #' #' # run hyperparameter optimization -#' fds <- optimHyperParams(fds, type="psi5", q_param=c(2, 5)) +#' fds <- optimHyperParams(fds, type="jaccard", q_param=c(2, 5)) #' #' # get estimated optimal dimension of the latent space -#' bestQ(fds, type="psi5") -#' hyperParams(fds, type="psi5") +#' bestQ(fds, type="jaccard") +#' hyperParams(fds, type="jaccard") #' #' @export -optimHyperParams <- function(fds, type, implementation="PCA", - q_param=seq(2, min(40, ncol(fds)), by=3), +optimHyperParams <- function(fds, type=currentType(fds), implementation="PCA", + q_param=getEncDimRange(fds), noise_param=0, minDeltaPsi=0.1, iterations=5, setSubset=50000, injectFreq=1e-2, BPPARAM=bpparam(), internalThreads=1, plot=TRUE, @@ -190,7 +191,7 @@ optimHyperParams <- function(fds, type, implementation="PCA", # remove unneeded blocks to save memory a2rm <- paste(sep="_", c("originalCounts", "originalOtherCounts"), - rep(psiTypes, 2)) + rep(psiTypes_avail, 2)) for(a in a2rm){ assay(fds_copy, a) <- NULL } @@ -229,3 +230,19 @@ optimHyperParams <- function(fds, type, implementation="PCA", return(fds) } +#' Get default range of latent space dimensions to test during hyper param opt +#' @noRd +getEncDimRange <- function(fds, mp=3){ + # Get range for latent space dimension + a <- 2 + b <- min(ncol(fds), nrow(fds)) / mp # N/mp + + maxSteps <- 12 + if(mp < 6){ + maxSteps <- 15 + } + + Nsteps <- min(maxSteps, b) + pars_q <- round(exp(seq(log(a),log(b),length.out = Nsteps))) %>% unique + return(pars_q) +} diff --git a/R/fitCorrectionMethods.R b/R/fitCorrectionMethods.R index 38f5d634..5b87d1ce 100644 --- a/R/fitCorrectionMethods.R +++ b/R/fitCorrectionMethods.R @@ -40,7 +40,7 @@ fit.FraserDataSet <- function(object, implementation=c("PCA", "PCA-BB-Decoder", "AE", "AE-weighted", "PCA-BB-full", "fullAE", "PCA-regression", "PCA-reg-full", "PCA-BB-Decoder-no-weights", "BB"), - q, type="psi3", rhoRange=c(-30, 30), + q, type=currentType(object), rhoRange=c(-30, 30), weighted=FALSE, noiseAlpha=1, convergence=1e-5, iterations=15, initialize=TRUE, control=list(), BPPARAM=bpparam(), nSubset=15000, diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R index 608b0171..cd35cbf5 100644 --- a/R/getNSetterFuns.R +++ b/R/getNSetterFuns.R @@ -32,7 +32,7 @@ #' dontWriteHDF5 <- TRUE #' #' # get/set the splice metric for which results should be retrieved -#' currentType(fds) <- "psi5" +#' currentType(fds) <- "jaccard" #' currentType(fds) #' #' # get fitted parameters @@ -50,9 +50,9 @@ #' pseudocount() #' #' # retrieve or set a mask to exclude certain junctions in the fitting step -#' featureExclusionMask(fds, type="theta") <- sample( -#' c(FALSE, TRUE), nrow(mcols(fds, type="theta")), replace=TRUE) -#' featureExclusionMask(fds, type="theta") +#' featureExclusionMask(fds, type="jaccard") <- sample( +#' c(FALSE, TRUE), nrow(mcols(fds, type="jaccard")), replace=TRUE) +#' featureExclusionMask(fds, type="jaccard") #' #' # controlling the verbosity level of the output of some algorithms #' verbose(fds) <- 2 @@ -201,7 +201,7 @@ predictY <- function(fds, type=currentType(fds), noiseAlpha=NULL){ } -`setAssayMatrix<-` <- function(fds, name, type, ..., value){ +`setAssayMatrix<-` <- function(fds, name, type=currentType(fds), ..., value){ if(!is.matrix(value)){ value <- matrix(value, ncol=ncol(fds), nrow=nrow(mcols(fds, type=type))) } @@ -220,7 +220,7 @@ predictY <- function(fds, type=currentType(fds), noiseAlpha=NULL){ fds } -getAssayMatrix <- function(fds, name, type, byGroup=FALSE){ +getAssayMatrix <- function(fds, name, type=currentType(fds), byGroup=FALSE){ if(missing(name)){ name <- type } else { @@ -251,8 +251,7 @@ zScores <- function(fds, type=currentType(fds), byGroup=FALSE, ...){ #' @describeIn getter_setter_functions This returns the calculated p-values. #' @export pVals <- function(fds, type=currentType(fds), level="site", - filters=list(rho=0.1), - dist="BetaBinomial", ...){ + filters=list(rho=1), dist="BetaBinomial", ...){ level <- match.arg(level, choices=c("site", "junction", "gene")) dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) aname <- paste0("pvalues", dist) @@ -268,7 +267,14 @@ pVals <- function(fds, type=currentType(fds), level="site", aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname) # add information on used filters for(n in sort(names(filters))){ - aname <- paste0(aname, "_", n, filters[[n]]) + aname_new <- paste0(aname, "_", n, filters[[n]]) + if(n == "rho" && filters[[n]] == 1){ + if(any(grepl(aname_new, assayNames(fds)))){ + aname <- aname_new + } + }else{ + aname <- aname_new + } } if(level == "gene"){ if(!paste(aname, type, sep="_") %in% names(metadata(fds))){ @@ -283,7 +289,7 @@ pVals <- function(fds, type=currentType(fds), level="site", } `pVals<-` <- function(fds, type=currentType(fds), level="site", - filters=list(rho=0.1), + filters=list(rho=1), dist="BetaBinomial", ..., value){ level <- match.arg(level, choices=c("site", "junction", "gene")) dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) @@ -297,7 +303,9 @@ pVals <- function(fds, type=currentType(fds), level="site", } # add information on used filters for(n in sort(names(filters))){ - aname <- paste0(aname, "_", n, filters[[n]]) + if(!(n == "rho" && filters[[n]] == 1)){ + aname <- paste0(aname, "_", n, filters[[n]]) + } } if(level == "gene"){ @@ -314,14 +322,21 @@ pVals <- function(fds, type=currentType(fds), level="site", #' @describeIn getter_setter_functions This returns the adjusted p-values. #' @export padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"), - level="site", filters=list(rho=0.1), ...){ + level="site", filters=list(rho=1), ...){ level <- match.arg(level, choices=c("site", "gene")) dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) aname <- paste0("padj", dist) aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname) # add information on used filters for(n in sort(names(filters))){ - aname <- paste0(aname, "_", n, filters[[n]]) + aname_new <- paste0(aname, "_", n, filters[[n]]) + if(n == "rho" && filters[[n]] == 1){ + if(any(grepl(aname_new, assayNames(fds)))){ + aname <- aname_new + } + }else{ + aname <- aname_new + } } if(level == "gene"){ if(!paste(aname, type, sep="_") %in% names(metadata(fds))){ @@ -334,14 +349,16 @@ padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"), } `padjVals<-` <- function(fds, type=currentType(fds), level="site", - dist="BetaBinomial", filters=list(rho=0.1), ..., value){ + dist="BetaBinomial", filters=list(rho=1), ..., value){ level <- match.arg(level, choices=c("site", "gene")) dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) aname <- paste0("padj", dist) aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname) # add information on used filters for(n in sort(names(filters))){ - aname <- paste0(aname, "_", n, filters[[n]]) + if(!(n == "rho" && filters[[n]] == 1)){ + aname <- paste0(aname, "_", n, filters[[n]]) + } } if(level == "gene"){ if(is.null(rownames(value))){ @@ -375,10 +392,14 @@ deltaPsiValue <- function(fds, type=currentType(fds)){ #' @describeIn getter_setter_functions Returns the psi type that is used -#' within several methods in the FRASER package. +#' within several methods in the FRASER package (defaults to jaccard). #' @export currentType <- function(fds){ - return(metadata(fds)[['currentType']]) + curType <- metadata(fds)[['currentType']] + if(is.null(curType)){ + curType <- "jaccard" + } + return(curType) } #' @describeIn getter_setter_functions Sets the psi type that is to be used @@ -497,7 +518,7 @@ dontWriteHDF5 <- function(fds){ return(fds) } -getTrueOutliers <- function(fds, type, byGroup=FALSE, ...){ +getTrueOutliers <- function(fds, type=currentType(fds), byGroup=FALSE, ...){ ans <- getAssayMatrix(fds, "trueOutliers", type) if(isTRUE(byGroup)){ ans <- getAbsMaxByGroup(fds, type, ans, ...) @@ -507,7 +528,7 @@ getTrueOutliers <- function(fds, type, byGroup=FALSE, ...){ pmin(pmax(ans, -1), 1) } -getTrueDeltaPsi <- function(fds, type, byGroup=FALSE, ...){ +getTrueDeltaPsi <- function(fds, type=currentType(fds), byGroup=FALSE, ...){ ans <- getAssayMatrix(fds, "trueDeltaPSI", type) if(isTRUE(byGroup)){ ans <- getAbsMaxByGroup(fds, type, ans, ...) @@ -515,7 +536,8 @@ getTrueDeltaPsi <- function(fds, type, byGroup=FALSE, ...){ ans } -getAbsMaxByGroup <- function(fds, type, mat, index=NULL, BPPARAM=bpparam()){ +getAbsMaxByGroup <- function(fds, type=currentType(fds), mat, index=NULL, + BPPARAM=bpparam()){ if(is.null(index)){ index <- getSiteIndex(fds, type) } @@ -534,13 +556,13 @@ getAbsMaxByGroup <- function(fds, type, mat, index=NULL, BPPARAM=bpparam()){ return(values) } -getByGroup <- function(fds, type, value){ +getByGroup <- function(fds, type=currentType(fds), value){ index <- getSiteIndex(fds, type) idx <- !duplicated(index) return(value[idx,]) } -getDeltaPsi <- function(fds, type, byGroup=FALSE, ...){ +getDeltaPsi <- function(fds, type=currentType(fds), byGroup=FALSE, ...){ mu <- predictedMeans(fds, type) dataPsi <- (K(fds, type) + pseudocount())/(N(fds, type) + 2*pseudocount()) deltaPSI <- dataPsi - mu @@ -552,7 +574,7 @@ getDeltaPsi <- function(fds, type, byGroup=FALSE, ...){ # calculate FRASER weights -calcFraserWeights <- function(fds, psiType){ +calcFraserWeights <- function(fds, psiType=currentType(fds)){ k <- as.matrix(K(fds, psiType)) n <- as.matrix(N(fds, psiType)) mu <- t(predictMu(fds, psiType)) @@ -584,7 +606,7 @@ calcFraserWeights <- function(fds, psiType){ } # get FRASER weights -weights <- function(fds, type){ +weights <- function(fds, type=currentType(fds)){ return(getAssayMatrix(fds, "weights", type)) } @@ -594,7 +616,7 @@ weights <- function(fds, type){ return(fds) } -getIndexFromResultTable <- function(fds, resultTable, padj.method="holm"){ +getIndexFromResultTable <- function(fds, resultTable){ type <- as.character(resultTable$type) target <- makeGRangesFromDataFrame(resultTable) if(type == "theta"){ @@ -611,9 +633,9 @@ getIndexFromResultTable <- function(fds, resultTable, padj.method="holm"){ ov } -getPlottingDT <- function(fds, axis=c("row", "col"), type=NULL, result=NULL, - idx=NULL, aggregate=FALSE, pvalLevel="site", Ncpus=3, - geneColumn="hgnc_symbol", ...){ +getPlottingDT <- function(fds, axis=c("row", "col"), type=currentType(fds), + result=NULL, idx=NULL, aggregate=FALSE, pvalLevel="site", + Ncpus=3, geneColumn="hgnc_symbol", ...){ if(!is.null(result)){ type <- as.character(result$type) idx <- getIndexFromResultTable(fds, result) diff --git a/R/helper-functions.R b/R/helper-functions.R index 21b117b2..ed73f639 100644 --- a/R/helper-functions.R +++ b/R/helper-functions.R @@ -22,7 +22,7 @@ checkCountData <- function(fds, stop=TRUE){ if(isFALSE(stop)) return(invisible(FALSE)) stop("No counts detected! Please provide counts first.") } - if(!all(paste0("rawOtherCounts_", psiTypes) %in% assayNames(fds))){ + if(!all(paste0("rawOtherCounts_", psiTypes_avail) %in% assayNames(fds))){ if(isFALSE(stop)) return(invisible(FALSE)) stop("Please compute first the total expression at each junction.") } @@ -71,10 +71,10 @@ checkReadType <- function(fds, type){ # check if type is null or missing if(missing(type) | is.null(type)){ - if(verbose(fds) > 0){ - warning("Read type was not specified!", - "We will assume the default: 'j'") - } + # if(verbose(fds) > 0){ + # warning("Read type was not specified!", + # "We will assume the default: 'j'") + # } return("j") } type <- unique(type) @@ -356,18 +356,18 @@ assayExists <- function(fds, assayName){ return(aexists) } -getAssayAsVector <- function(fds, prefix, psiType, sampleID){ +getAssayAsVector <- function(fds, prefix, psiType=currentType(fds), sampleID){ as.vector(assay(fds, paste0(prefix, psiType))[,sampleID]) } -variableJunctions <- function(fds, type, minDeltaPsi=0.1){ +variableJunctions <- function(fds, type=currentType(fds), minDeltaPsi=0.1){ psi <- K(fds, type=type)/N(fds, type=type) j2keep <- rowMaxs(abs(psi - rowMeans(psi, na.rm=TRUE)), na.rm=TRUE) j2keep >= minDeltaPsi } -subsetKMostVariableJunctions <- function(fds, type, n){ +subsetKMostVariableJunctions <- function(fds, type=currentType(fds), n){ curX <- x(fds, type=type, all=TRUE, center=FALSE, noiseAlpha=NULL) xsd <- colSds(curX) nMostVarJuncs <- which(xsd >= sort(xsd, TRUE)[min(length(xsd), n*2)]) @@ -376,7 +376,8 @@ subsetKMostVariableJunctions <- function(fds, type, n){ ans } -getSubsetVector <- function(fds, type, minDeltaPsi=0.1, nSubset=15000){ +getSubsetVector <- function(fds, type=currentType(fds), minDeltaPsi=0.1, + nSubset=15000){ # get any variable intron ans <- variableJunctions(fds, type, minDeltaPsi=minDeltaPsi) @@ -554,14 +555,21 @@ getStrandString <- function(fds){ #' #' Check if adjusted pvalues have been computed for a given set of filters. #' @noRd -checkPadjAvailableForFilters <- function(fds, type, filters=list(), - dist="BetaBinomial", aggregate=FALSE){ +checkPadjAvailableForFilters <- function(fds, type=currentType(fds), + filters=list(), dist="BetaBinomial", aggregate=FALSE){ dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) aname <- paste0("padj", dist) aname <- ifelse(isTRUE(aggregate), paste0(aname, "_gene"), aname) # add information on used filters for(n in sort(names(filters))){ - aname <- paste0(aname, "_", n, filters[[n]]) + aname_new <- paste0(aname, "_", n, filters[[n]]) + if(n == "rho" && filters[[n]] == 1){ + if(any(grepl(aname_new, assayNames(fds)))){ + aname <- aname_new + } + }else{ + aname <- aname_new + } } aname <- paste(aname, type, sep="_") if(isTRUE(aggregate)){ diff --git a/R/makeSimulatedDataset.R b/R/makeSimulatedDataset.R index 5b6f7531..f451d5cf 100644 --- a/R/makeSimulatedDataset.R +++ b/R/makeSimulatedDataset.R @@ -435,9 +435,10 @@ makeSimulatedFraserDataSet_Multinomial <- function(m=200, j=1000, q=10, #' @examples #' # A generic dataset #' fds <- makeSimulatedFraserDataSet() +#' fds <- calculatePSIValues(fds) #' fds <- injectOutliers(fds, minDpsi=0.2, freq=1E-3) #' @export -injectOutliers <- function(fds, type=c("psi5", "psi3", "theta", "jaccard"), +injectOutliers <- function(fds, type=currentType(fds), freq=1E-3, minDpsi=0.2, minCoverage=2, deltaDistr="uniformDistr", verbose=FALSE, method=c('samplePSI', 'meanPSI', 'simulatedPSI'), diff --git a/R/plotMethods.R b/R/plotMethods.R index 1b4ae279..3fd7ea50 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -162,6 +162,7 @@ #' (b,l,t,r). #' @param cex For controlling the size of text and numbers in #' \code{plotBamCoverage}. +#' @param color_chr Interchanging colors by chromosome for \code{plotManhattan}. #' #### Additional ... parameter #' @param ... Additional parameters passed to plot() or plot_ly() if not stated @@ -229,33 +230,34 @@ #' @rdname plotFunctions #' @aliases plotFunctions plotAberrantPerSample plotVolcano plotQQ #' plotExpression plotCountCorHeatmap plotFilterExpression -#' plotExpectedVsObservedPsi plotEncDimSearch +#' plotExpectedVsObservedPsi plotEncDimSearch plotManhattan +#' plotBamCoverage plotBamCoverageFromResultTable #' @examples #' # create full FRASER object #' fds <- makeSimulatedFraserDataSet(m=40, j=200) #' fds <- calculatePSIValues(fds) #' fds <- filterExpressionAndVariability(fds, filter=FALSE) -#' # this step should be done for all splicing metrics and more dimensions -#' fds <- optimHyperParams(fds, "psi5", q_param=c(2,5,10,25)) +#' # this step should be done for more dimensions in practice +#' fds <- optimHyperParams(fds, "jaccard", q_param=c(2,5,10,25)) #' fds <- FRASER(fds) #' #' # QC plotting #' plotFilterExpression(fds) #' plotFilterVariability(fds) -#' plotCountCorHeatmap(fds, "theta") -#' plotCountCorHeatmap(fds, "theta", normalized=TRUE) -#' plotEncDimSearch(fds, type="psi5") +#' plotCountCorHeatmap(fds, "jaccard") +#' plotCountCorHeatmap(fds, "jaccard", normalized=TRUE) +#' plotEncDimSearch(fds, type="jaccard") #' #' # extract results #' plotAberrantPerSample(fds, aggregate=FALSE) -#' plotVolcano(fds, "sample1", "psi5") +#' plotVolcano(fds, "sample1", "jaccard") #' #' # dive into gene/sample level results #' res <- results(fds) #' res #' plotExpression(fds, result=res[1]) #' plotQQ(fds, result=res[1]) -#' plotExpectedVsObservedPsi(fds, type="psi5", res=res[1]) +#' plotExpectedVsObservedPsi(fds, res=res[1]) #' #' # create manhattan plot of pvalues by genomic position #' plotManhattan(fds, type="jaccard", sampleID="sample10") @@ -297,9 +299,9 @@ NULL plotVolcano.FRASER <- function(object, sampleID, - type=c("psi3", "psi5", "theta", "jaccard"), basePlot=TRUE, + type=currentType(object), basePlot=TRUE, aggregate=FALSE, main=NULL, label=NULL, - deltaPsiCutoff=0.3, padjCutoff=0.1, ...){ + deltaPsiCutoff=0.1, padjCutoff=0.1, ...){ type <- match.arg(type) @@ -398,8 +400,8 @@ setMethod("plotVolcano", signature="FraserDataSet", plotVolcano.FRASER) plotAberrantPerSample.FRASER <- function(object, main, - type=c("psi3", "psi5", "theta", "jaccard"), - padjCutoff=0.1, zScoreCutoff=NA, deltaPsiCutoff=0.3, + type=psiTypes, + padjCutoff=0.1, zScoreCutoff=NA, deltaPsiCutoff=0.1, aggregate=TRUE, BPPARAM=bpparam(), ...){ type <- match.arg(type, several.ok=TRUE) @@ -461,7 +463,7 @@ setMethod("plotAberrantPerSample", signature="FraserDataSet", #' #' @rdname plotFunctions #' @export -plotExpression <- function(fds, type=c("psi5", "psi3", "theta", "jaccard"), +plotExpression <- function(fds, type=currentType(fds), idx=NULL, result=NULL, colGroup=NULL, basePlot=TRUE, main=NULL, label="aberrant", ...){ if(!is.null(result)){ @@ -553,7 +555,7 @@ plotExpression <- function(fds, type=c("psi5", "psi3", "theta", "jaccard"), #' #' @rdname plotFunctions #' @export -plotExpectedVsObservedPsi <- function(fds, type=c("psi5", "psi3", "theta", "jaccard"), +plotExpectedVsObservedPsi <- function(fds, type=currentType(fds), idx=NULL, result=NULL, colGroup=NULL, main=NULL, basePlot=TRUE, label="aberrant", ...){ type <- match.arg(type) @@ -818,7 +820,7 @@ setMethod("plotQQ", signature="FraserDataSet", plotQQ.FRASER) plotEncDimSearch.FRASER <- function(object, - type=c("psi3", "psi5", "theta", "jaccard"), + type=currentType(object), plotType=c("auc", "loss")){ type <- match.arg(type) plotType <- match.arg(plotType) @@ -986,7 +988,7 @@ plotFilterVariability <- function(fds, bins=200, legend.position=c(0.8, 0.8), plotCountCorHeatmap.FRASER <- function(object, - type=c("psi5", "psi3", "theta", "jaccard"), logit=FALSE, + type=currentType(object), logit=FALSE, topN=50000, topJ=5000, minMedian=1, minCount=10, main=NULL, normalized=FALSE, show_rownames=FALSE, show_colnames=FALSE, minDeltaPsi=0.1, annotation_col=NA, @@ -1427,11 +1429,12 @@ plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE, } plotManhattan.FRASER <- function(object, sampleID, - type=c("psi5", "psi3", "theta", "jaccard"), + type=currentType(object), main=paste0("sampleID = ", sampleID), - color=c("black", "darkgrey"), + color_chr=c("black", "darkgrey"), ...){ # check arguments + stopifnot(is(object, "FraserDataSet")) stopifnot(sampleID %in% samples(object)) type <- match.arg(type) additional_args <- list(...) @@ -1473,7 +1476,7 @@ plotManhattan.FRASER <- function(object, sampleID, # plot manhattan plot plotGrandLinear.adapted(gr_sample, aes(y=pvalue), - color=color, + color=color_chr, highlight.gr=gr_sample[outlier_idx,], highlight.overlap="equal") + labs(title=main) @@ -1603,8 +1606,8 @@ plotGrandLinear.adapted <- function (obj, ..., facets, space.skip = 0.01, if (is.null(geom)) geom <- "point" args <- list(...) - args.aes <- parseArgsForAes(args) - args.non <- parseArgsForNonAes(args) + args.aes <- biovizBase::parseArgsForAes(args) + args.non <- biovizBase::parseArgsForNonAes(args) two.color <- c("#0080FF", "#4CC4FF") .is.seq <- FALSE if (!"colour" %in% names(args.aes)) { @@ -1634,7 +1637,7 @@ plotGrandLinear.adapted <- function (obj, ..., facets, space.skip = 0.01, args.non$geom <- geom args.non$object <- obj aes.res <- do.call(aes, args.aes) - p <- do.call(autoplot, c(list(aes.res), args.non)) + p <- do.call(ggbio::autoplot, c(list(aes.res), args.non)) if (!legend) p <- p + theme(legend.position = "none") if (!missing(ylab)) diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R index 749d192c..f1dbcb45 100644 --- a/R/pvalsNzscore.R +++ b/R/pvalsNzscore.R @@ -162,7 +162,7 @@ adjust_FWER_PValues_per_idx <- function(i, pvals, index, rho, rhoCutoff, } getFWERpvals_bySample <- function(pvals, index, rho, method="holm", - rhoCutoff=0.1, BPPARAM=bpparam()){ + rhoCutoff, BPPARAM=bpparam()){ fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues, pvals=pvals, index, BPPARAM=BPPARAM, method=method, rho=rho, rhoCutoff=rhoCutoff) @@ -171,7 +171,7 @@ getFWERpvals_bySample <- function(pvals, index, rho, method="holm", } getFWERpvals_byIdx <- function(pvals, index, rho, method="holm", - rhoCutoff=0.1, BPPARAM=bpparam()){ + rhoCutoff, BPPARAM=bpparam()){ unique_idx <- unique(index) fwer_pval <- bplapply(unique_idx, adjust_FWER_PValues_per_idx, pvals=pvals, index, BPPARAM=BPPARAM, @@ -230,7 +230,7 @@ singlePvalueBinomial <- function(idx, k, n, mu){ #' #' @export calculatePadjValues <- function(fds, type=currentType(fds), method="BY", - rhoCutoff=0.1, geneLevel=TRUE, + rhoCutoff=1, geneLevel=TRUE, geneColumn="hgnc_symbol", BPPARAM=bpparam()){ currentType(fds) <- type index <- getSiteIndex(fds, type=type) @@ -263,8 +263,8 @@ calculatePadjValues <- function(fds, type=currentType(fds), method="BY", withDimnames=FALSE) <- as.matrix(padjDT) # gene-level pval correction and FDR - if(geneColumn %in% colnames(mcols(fds, type=type)) && - isTRUE(geneLevel)){ + if(isTRUE(geneLevel) && + geneColumn %in% colnames(mcols(fds, type=type))){ message(date(), ": calculating gene-level pvalues ...") gene_pvals <- getPvalsPerGene(fds=fds, type=type, pvals=fwer_pvals, method="holm", FDRmethod=method, @@ -274,7 +274,7 @@ calculatePadjValues <- function(fds, type=currentType(fds), method="BY", withDimnames=FALSE) <- gene_pvals[["pvals"]] padjVals(fds, dist=i, level="gene", filters=list(rho=rhoCutoff), withDimnames=FALSE) <- gene_pvals[["padj"]] - } else{ + } else if(isTRUE(geneLevel)){ warning("Gene-level pvalues could not be calculated as column ", geneColumn, "\nwas not found for the given fds object. ", "Please annotate gene symbols \nfirst using the ", @@ -285,7 +285,7 @@ calculatePadjValues <- function(fds, type=currentType(fds), method="BY", return(fds) } -getPvalsPerGene <- function(fds, type, +getPvalsPerGene <- function(fds, type=currentType(fds), pvals=pVals(fds, type=type, level="site"), sampleID=NULL, method="holm", FDRmethod="BY", geneColumn="hgnc_symbol", BPPARAM=bpparam()){ @@ -331,7 +331,7 @@ getPvalsPerGene <- function(fds, type, } -getSiteIndex <- function(fds, type){ +getSiteIndex <- function(fds, type=currentType(fds)){ if(type == "theta"){ return(mcols(fds, type=type)[['spliceSiteID']]) } @@ -354,7 +354,8 @@ getSiteIndex <- function(fds, type){ ans[selectionMat] } -getGeneIDs <- function(fds, type, unique=TRUE, geneColumn="hgnc_symbol"){ +getGeneIDs <- function(fds, type=currentType(fds), unique=TRUE, + geneColumn="hgnc_symbol"){ geneIDs <- mcols(fds, type=type)[[geneColumn]] if(isTRUE(unique)){ geneIDs <- unique(unlist(lapply(geneIDs, FUN=function(g){ @@ -380,3 +381,80 @@ genePvalsByGeneID <- function(dt, samples, geneIDs, method, BPPARAM){ return(pvalsPerGene) } +#' @describeIn FRASER This function does FDR correction only for all junctions +#' in a certain subset of genes which can differ per sample. Requires gene +#' symbols to have been annotated to junctions. As with the full FDR +#' correction across all junctions, first the previously calculated +#' junction-level p values are adjusted with Holm's method per donor or +#' acceptor site, respectively. Then, gene-level p values are computed. +#' +#' @param genesToTest A named list with the subset of genes to test per sample. +#' The names must correspond to the sampleIDs in the given fds object. +#' @param subsetName The name under which the resulting FDR corrected pvalues +#' will be stored in metadata(fds). +#' +#' @export +calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds), + subsetName="FDR_subset", method="BY", + geneColumn="hgnc_symbol", BPPARAM=bpparam()){ + + # check input + currentType(fds) <- type + stopifnot(!is.null(genesToTest)) + stopifnot(is.list(genesToTest)) + stopifnot(!is.null(names(genesToTest))) + if(!all(names(genesToTest) %in% samples(fds))){ + stop("names(genesToTest) need to be sampleIDs in the given fds object.") + } + if(!all(samples(fds) %in% names(genesToTest))){ + stop("All sampleIDs of the given fds object need to be in ", + "names(geneToTest).") + } + + # check if genes have been annotated + if(!geneColumn %in% colnames(mcols(fds, type=type))){ + stop(paste0("'", geneColumn, "' is not found in mcols(fds). ", + "Please annotate gene symbols \nfirst using the ", + "annotateRanges or annotateRangesWithTxDb function.")) + } + + # compute FDR on the given subsets of genes + message(date(), ": starting FDR calculation on subset of genes...") + FDR_subset <- rbindlist(bpmapply(names(genesToTest), genesToTest, + FUN=function(sample_id, genes_to_test_sample){ + + # message(date(), ": FDR subset calculation for sample = ", sample_id) + # get idx of junctions corresponding to genes with var + jidx <- unlist(lapply(genes_to_test_sample, function(gene){ + idx <- which(grepl(gene, mcols(fds, type="j")[, geneColumn])) + names(idx) <- rep(gene, length(idx)) + return(idx) + })) + jidx <- sort(jidx[!duplicated(jidx)]) + + # retrieve pvalues of junctions to test + p <- pVals(fds, type=type, level="junction")[jidx, sample_id] + + # FDR correction + pa <- p.adjust(p, method=method) + + # gene level pvals + dt <- data.table(sampleID=sample_id, pval=p, FDR_subset=pa, + gene=names(jidx), jidx=jidx) + dt[, pval_gene:=min(p.adjust(pval, method="holm")), by="gene"] + + # gene level FDR + dt2 <- dt[, unique(pval_gene), by="gene"] + dt2[, FDR_subset_gene := p.adjust(V1, method=method)] + dt <- merge(dt, dt2[, .(gene, FDR_subset_gene)], by="gene", all.x=TRUE) + + # return new FDR + return(dt) + }, SIMPLIFY=FALSE, BPPARAM=BPPARAM)) + message(date(), ": finished FDR calculation on subset of genes.") + + # add FDR subset info to fds object and return + metadata(fds)[[subsetName]] <- FDR_subset + return(fds) +} + diff --git a/R/variables.R b/R/variables.R index a159d2ee..61f821e1 100644 --- a/R/variables.R +++ b/R/variables.R @@ -1,12 +1,22 @@ #' -#' Available psi types +#' Available splice metrics #' #' @examples -#' # to show available psi types: +#' # to show available splice metrics: +#' psiTypes_avail +#' +#' @export +psiTypes_avail <- c("psi5", "psi3", "theta", "jaccard") +names(psiTypes_avail) <- c("psi5", "psi3", "theta", "Intron Jaccard Index") + +#' +#' Splice metrics that are run by default +#' +#' @examples +#' # to show splice metrics selected to be fitted: #' psiTypes #' #' @export -psiTypes <- c("psi5", "psi3", "theta", "jaccard") -# psiTypes <- c("psi5", "psi3", "theta") -names(psiTypes) <- psiTypes +psiTypes <- c("jaccard") +names(psiTypes) <- c("Intron Jaccard Index") diff --git a/R/zzz.R b/R/zzz.R index 8f087a08..7f0b67d5 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -8,7 +8,7 @@ op.fraser <- list( `FRASER-hdf5-chunk-nrow` = 30000, `FRASER-hdf5-chunk-ncol` = 20, - `FRASER.pseudoCount` = 1, + `FRASER.pseudoCount` = 0.1, `FRASER.minSamplesForDelayed` = 1000, `FRASER.maxSamplesNoHDF5` = 20, `FRASER.maxJunctionsNoHDF5` = 1000) diff --git a/man/FRASER.Rd b/man/FRASER.Rd index 0931ec25..170a6271 100644 --- a/man/FRASER.Rd +++ b/man/FRASER.Rd @@ -5,6 +5,7 @@ \alias{calculateZscore} \alias{calculatePvalues} \alias{calculatePadjValues} +\alias{calculatePadjValuesOnSubset} \title{FRASER: Find RAre Splicing Events in RNA-seq data} \usage{ FRASER( @@ -32,11 +33,21 @@ calculatePadjValues( fds, type = currentType(fds), method = "BY", - rhoCutoff = 0.1, + rhoCutoff = 1, geneLevel = TRUE, geneColumn = "hgnc_symbol", BPPARAM = bpparam() ) + +calculatePadjValuesOnSubset( + fds, + genesToTest, + type = currentType(fds), + subsetName = "FDR_subset", + method = "BY", + geneColumn = "hgnc_symbol", + BPPARAM = bpparam() +) } \arguments{ \item{fds}{A \code{\link{FraserDataSet}} object} @@ -58,7 +69,7 @@ not yet converged after these number of iterations, the fit stops anyway.} \item{...}{Additional parameters passed on to the internal fit function} -\item{type}{The type of PSI (psi5, psi3 or theta for theta/splicing +\item{type}{The type of PSI (jaccard, psi5, psi3 or theta for theta/splicing efficiency)} \item{logit}{Indicates if z scores are computed on the logit scale (default) @@ -82,6 +93,12 @@ should be calculated. Defaults to TRUE.} \item{geneColumn}{The column name of the column that has the gene annotation that will be used for gene-level pvalue computation.} + +\item{genesToTest}{A named list with the subset of genes to test per sample. +The names must correspond to the sampleIDs in the given fds object.} + +\item{subsetName}{The name under which the resulting FDR corrected pvalues +will be stored in metadata(fds).} } \value{ FraserDataSet @@ -131,6 +148,13 @@ p-values per sample for multiple testing. First, the previoulsy calculated junction-level p values are adjusted with Holm's method per donor or acceptor site, respectively. Then, if gene symbols have been annotated to junctions (and not otherwise requested), gene-level p values are computed. + +\item \code{calculatePadjValuesOnSubset}: This function does FDR correction only for all junctions +in a certain subset of genes which can differ per sample. Requires gene +symbols to have been annotated to junctions. As with the full FDR +correction across all junctions, first the previously calculated +junction-level p values are adjusted with Holm's method per donor or +acceptor site, respectively. Then, gene-level p values are computed. }} \examples{ @@ -154,16 +178,16 @@ fds <- FRASER(fds, q=2, implementation="PCA") # The functions run inside the FRASER function can also be directly # run themselves. # To directly run the fit function: -fds <- fit(fds, implementation="PCA", q=2, type="psi5") +fds <- fit(fds, implementation="PCA", q=2, type="jaccard") # To directly run the nomial and adjusted p value and z score # calculation, the following functions can be used: -fds <- calculatePvalues(fds, type="psi5") -head(pVals(fds, type="psi5")) -fds <- calculatePadjValues(fds, type="psi5", method="BY") -head(padjVals(fds, type="psi5")) -fds <- calculateZscore(fds, type="psi5") -head(zScores(fds, type="psi5")) +fds <- calculatePvalues(fds, type="jaccard") +head(pVals(fds, type="jaccard")) +fds <- calculatePadjValues(fds, type="jaccard", method="BY") +head(padjVals(fds, type="jaccard")) +fds <- calculateZscore(fds, type="jaccard") +head(zScores(fds, type="jaccard")) } \seealso{ diff --git a/man/annotateRanges.Rd b/man/annotateRanges.Rd index c82a3211..4d1639f9 100644 --- a/man/annotateRanges.Rd +++ b/man/annotateRanges.Rd @@ -67,13 +67,13 @@ fds <- createTestFraserDataSet() # either using biomart with GRCh38 try({ fds <- annotateRanges(fds, GRCh=38) - rowRanges(fds, type="psi5")[,c("hgnc_symbol")] + rowRanges(fds, type="j")[,c("hgnc_symbol")] }) # either using biomart with GRCh37 try({ fds <- annotateRanges(fds, featureName="hgnc_symbol_37", GRCh=37) - rowRanges(fds, type="psi5")[,c("hgnc_symbol_37")] + rowRanges(fds, type="j")[,c("hgnc_symbol_37")] }) # or with a provided TxDb object @@ -82,6 +82,6 @@ txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene require(org.Hs.eg.db) orgDb <- org.Hs.eg.db fds <- annotateRangesWithTxDb(fds, txdb=txdb, orgDb=orgDb) -rowRanges(fds, type="psi5")[,"hgnc_symbol"] +rowRanges(fds, type="j")[,"hgnc_symbol"] } diff --git a/man/calculatePSIValues.Rd b/man/calculatePSIValues.Rd index 0128221d..9086c048 100644 --- a/man/calculatePSIValues.Rd +++ b/man/calculatePSIValues.Rd @@ -6,7 +6,7 @@ \usage{ calculatePSIValues( fds, - types = psiTypes, + types = psiTypes_avail, overwriteCts = FALSE, BPPARAM = bpparam() ) @@ -15,7 +15,7 @@ calculatePSIValues( \item{fds}{A \code{\link{FraserDataSet}} object} \item{types}{A vector with the psi types which should be calculated. Default -is all of psi5, psi3 and theta.} +is all of jaccard, psi5, psi3 and theta.} \item{overwriteCts}{FALSE or TRUE (the default) the total counts (aka N) will be recalculated based on the existing junction counts (aka K)} @@ -31,7 +31,7 @@ based on the FraserDataSet object } \examples{ fds <- createTestFraserDataSet() - fds <- calculatePSIValues(fds, types="psi5") + fds <- calculatePSIValues(fds, types="jaccard") ### usually one would run this function for all psi types by using: # fds <- calculatePSIValues(fds) diff --git a/man/counts.Rd b/man/counts.Rd index 1ddac4d5..79beb47f 100644 --- a/man/counts.Rd +++ b/man/counts.Rd @@ -11,9 +11,14 @@ K(fds, type = currentType(fds)) N(fds, type = currentType(fds)) -\S4method{counts}{FraserDataSet}(object, type = NULL, side = c("ofInterest", "otherSide")) - -\S4method{counts}{FraserDataSet,ANY}(object, type = NULL, side = c("ofInterest", "otherSide"), ...) <- value +\S4method{counts}{FraserDataSet}(object, type = currentType(object), side = c("ofInterest", "otherSide")) + +\S4method{counts}{FraserDataSet,ANY}( + object, + type = currentType(object), + side = c("ofInterest", "otherSide"), + ... +) <- value } \arguments{ \item{fds, object}{FraserDataSet} @@ -39,8 +44,10 @@ setter for count data \examples{ fds <- createTestFraserDataSet() - counts(fds, type="psi5", side="ofInterest") - counts(fds, type="psi5", side="other") + counts(fds, side="ofInterest") + counts(fds, type="jaccard", side="other") + head(K(fds)) + head(K(fds, type="psi5")) head(K(fds, type="psi3")) head(N(fds, type="theta")) diff --git a/man/filtering.Rd b/man/filtering.Rd index e075761a..5d604036 100644 --- a/man/filtering.Rd +++ b/man/filtering.Rd @@ -1,21 +1,21 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/filterExpression.R -\name{filtering} +% Please edit documentation in R/AllGenerics-definitions.R, R/filterExpression.R +\name{filterVariability} +\alias{filterVariability} \alias{filtering} \alias{filterExpressionAndVariability} \alias{filterExpression,FraserDataSet-method} -\alias{filterVariability} -\alias{filterExpressionAndVariability_jaccard} -\alias{filterExpression_jaccard} -\alias{filterVariability_jaccard} +\alias{filterVariability,FraserDataSet-method} \title{Filtering FraserDataSets} \usage{ +filterVariability(object, ...) + filterExpressionAndVariability( object, minExpressionInOneSample = 20, - quantile = 0.95, + quantile = 0.75, quantileMinExpression = 10, - minDeltaPsi = 0.05, + minDeltaPsi = 0, filter = TRUE, delayed = ifelse(ncol(object) <= 300, FALSE, TRUE), BPPARAM = bpparam() @@ -24,45 +24,16 @@ filterExpressionAndVariability( \S4method{filterExpression}{FraserDataSet}( object, minExpressionInOneSample = 20, - quantile = 0.95, + quantile = 0.75, quantileMinExpression = 10, filter = TRUE, delayed = ifelse(ncol(object) <= 300, FALSE, TRUE), BPPARAM = bpparam() ) -filterVariability( +\S4method{filterVariability}{FraserDataSet}( object, - minDeltaPsi = 0.05, - filter = TRUE, - delayed = ifelse(ncol(object) <= 300, FALSE, TRUE), - BPPARAM = bpparam() -) - -filterExpressionAndVariability_jaccard( - object, - minExpressionInOneSample = 20, - quantile = 0.95, - quantileMinExpression = 1, - minDelta = 0.05, - filter = TRUE, - delayed = ifelse(ncol(object) <= 300, FALSE, TRUE), - BPPARAM = bpparam() -) - -filterExpression_jaccard( - object, - minExpressionInOneSample = 20, - quantile = 0.95, - quantileMinExpression = 1, - filter = TRUE, - delayed = ifelse(ncol(object) <= 300, FALSE, TRUE), - BPPARAM = bpparam() -) - -filterVariability_jaccard( - object, - minDelta = 0, + minDeltaPsi = 0, filter = TRUE, delayed = ifelse(ncol(object) <= 300, FALSE, TRUE), BPPARAM = bpparam() @@ -71,6 +42,8 @@ filterVariability_jaccard( \arguments{ \item{object}{A \code{\link{FraserDataSet}} object} +\item{...}{Further parameters passed on to Rsubread::featureCounts.} + \item{minExpressionInOneSample}{The minimal read count in at least one sample that is required for an intron to pass the filter.} @@ -110,24 +83,15 @@ read support and introns that are not variable across samples. \item \code{filterExpression,FraserDataSet-method}: This function filters out introns and corresponding splice sites that have low read support in all samples. -\item \code{filterVariability}: This function filters out introns and corresponding -splice sites which do not show variablity across samples. - -\item \code{filterExpressionAndVariability_jaccard}: This functions filters out both introns with low -read support and introns that are not variable across samples. - -\item \code{filterExpression_jaccard}: This function filters out introns and corresponding -splice sites which are expressed at very low levels across samples. - -\item \code{filterVariability_jaccard}: This function filters out introns and corresponding -splice sites which do not show variablity across samples. +\item \code{filterVariability,FraserDataSet-method}: This function filters out introns and corresponding +splice sites that have low read support in all samples. }} \examples{ fds <- createTestFraserDataSet() fds <- filterExpressionAndVariability(fds, minDeltaPsi=0.1, filter=FALSE) -mcols(fds, type="psi5")[, c( - "maxCount", "passedExpression", "maxDPsi3", "passedVariability")] +mcols(fds, type="jaccard")[, c( + "maxCount", "passedExpression", "maxDJaccard", "passedVariability")] plotFilterExpression(fds) plotFilterVariability(fds) diff --git a/man/fit.Rd b/man/fit.Rd index 35f7966e..659baa3b 100644 --- a/man/fit.Rd +++ b/man/fit.Rd @@ -10,7 +10,7 @@ implementation = c("PCA", "PCA-BB-Decoder", "AE", "AE-weighted", "PCA-BB-full", "fullAE", "PCA-regression", "PCA-reg-full", "PCA-BB-Decoder-no-weights", "BB"), q, - type = "psi3", + type = currentType(object), rhoRange = c(-30, 30), weighted = FALSE, noiseAlpha = 1, @@ -35,7 +35,7 @@ Should be fitted using \code{\link{optimHyperParams}} if unknown. If a named vector is provided it is used for the different splicing types.} -\item{type}{The type of PSI (psi5, psi3 or theta for theta/splicing +\item{type}{The type of PSI (jaccard, psi5, psi3 or theta for theta/splicing efficiency)} \item{rhoRange}{Defines the range of values that rho parameter from the diff --git a/man/getter_setter_functions.Rd b/man/getter_setter_functions.Rd index 79306a75..3b9c108a 100644 --- a/man/getter_setter_functions.Rd +++ b/man/getter_setter_functions.Rd @@ -39,7 +39,7 @@ pVals( fds, type = currentType(fds), level = "site", - filters = list(rho = 0.1), + filters = list(rho = 1), dist = "BetaBinomial", ... ) @@ -49,7 +49,7 @@ padjVals( type = currentType(fds), dist = c("BetaBinomial"), level = "site", - filters = list(rho = 0.1), + filters = list(rho = 1), ... ) @@ -135,7 +135,7 @@ values. observed and the fitted psi values. \item \code{currentType}: Returns the psi type that is used -within several methods in the FRASER package. +within several methods in the FRASER package (defaults to jaccard). \item \code{currentType<-}: Sets the psi type that is to be used within several methods in the FRASER package. @@ -174,7 +174,7 @@ dontWriteHDF5(fds) dontWriteHDF5 <- TRUE # get/set the splice metric for which results should be retrieved -currentType(fds) <- "psi5" +currentType(fds) <- "jaccard" currentType(fds) # get fitted parameters @@ -192,9 +192,9 @@ pseudocount(4L) pseudocount() # retrieve or set a mask to exclude certain junctions in the fitting step -featureExclusionMask(fds, type="theta") <- sample( - c(FALSE, TRUE), nrow(mcols(fds, type="theta")), replace=TRUE) -featureExclusionMask(fds, type="theta") +featureExclusionMask(fds, type="jaccard") <- sample( + c(FALSE, TRUE), nrow(mcols(fds, type="jaccard")), replace=TRUE) +featureExclusionMask(fds, type="jaccard") # controlling the verbosity level of the output of some algorithms verbose(fds) <- 2 diff --git a/man/injectOutliers.Rd b/man/injectOutliers.Rd index b32ed74a..64728eae 100644 --- a/man/injectOutliers.Rd +++ b/man/injectOutliers.Rd @@ -6,7 +6,7 @@ \usage{ injectOutliers( fds, - type = c("psi5", "psi3", "theta", "jaccard"), + type = currentType(fds), freq = 0.001, minDpsi = 0.2, minCoverage = 2, @@ -50,5 +50,6 @@ Inject artificial outliers in an existing fds \examples{ # A generic dataset fds <- makeSimulatedFraserDataSet() +fds <- calculatePSIValues(fds) fds <- injectOutliers(fds, minDpsi=0.2, freq=1E-3) } diff --git a/man/optimHyperParams.Rd b/man/optimHyperParams.Rd index f3d10622..96ca78ba 100644 --- a/man/optimHyperParams.Rd +++ b/man/optimHyperParams.Rd @@ -6,9 +6,9 @@ \usage{ optimHyperParams( fds, - type, + type = currentType(fds), implementation = "PCA", - q_param = seq(2, min(40, ncol(fds)), by = 3), + q_param = getEncDimRange(fds), noise_param = 0, minDeltaPsi = 0.1, iterations = 5, @@ -24,7 +24,7 @@ optimHyperParams( \arguments{ \item{fds}{A \code{\link{FraserDataSet}} object} -\item{type}{The type of PSI (psi5, psi3 or theta for theta/splicing +\item{type}{The type of PSI (jaccard, psi5, psi3 or theta for theta/splicing efficiency)} \item{implementation}{The method that should be used to correct for @@ -71,13 +71,14 @@ ratios while maximizing the precision-recall curve. \examples{ # generate data fds <- makeSimulatedFraserDataSet(m=15, j=20) + fds <- calculatePSIValues(fds) # run hyperparameter optimization - fds <- optimHyperParams(fds, type="psi5", q_param=c(2, 5)) + fds <- optimHyperParams(fds, type="jaccard", q_param=c(2, 5)) # get estimated optimal dimension of the latent space - bestQ(fds, type="psi5") - hyperParams(fds, type="psi5") + bestQ(fds, type="jaccard") + hyperParams(fds, type="jaccard") } \seealso{ diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd index 9f591ebb..1acf58c6 100644 --- a/man/plotFunctions.Rd +++ b/man/plotFunctions.Rd @@ -11,14 +11,14 @@ \alias{plotFilterExpression} \alias{plotExpectedVsObservedPsi} \alias{plotEncDimSearch} +\alias{plotBamCoverage} +\alias{plotBamCoverageFromResultTable} \alias{plotVolcano,FraserDataSet-method} \alias{plotAberrantPerSample,FraserDataSet-method} \alias{plotQQ,FraserDataSet-method} \alias{plotEncDimSearch,FraserDataSet-method} \alias{plotFilterVariability} \alias{plotCountCorHeatmap,FraserDataSet-method} -\alias{plotBamCoverage} -\alias{plotBamCoverageFromResultTable} \alias{plotManhattan,FraserDataSet-method} \title{Visualization functions for FRASER} \usage{ @@ -27,12 +27,12 @@ plotManhattan(object, ...) \S4method{plotVolcano}{FraserDataSet}( object, sampleID, - type = c("psi3", "psi5", "theta", "jaccard"), + type = currentType(object), basePlot = TRUE, aggregate = FALSE, main = NULL, label = NULL, - deltaPsiCutoff = 0.3, + deltaPsiCutoff = 0.1, padjCutoff = 0.1, ... ) @@ -40,10 +40,10 @@ plotManhattan(object, ...) \S4method{plotAberrantPerSample}{FraserDataSet}( object, main, - type = c("psi3", "psi5", "theta", "jaccard"), + type = psiTypes, padjCutoff = 0.1, zScoreCutoff = NA, - deltaPsiCutoff = 0.3, + deltaPsiCutoff = 0.1, aggregate = TRUE, BPPARAM = bpparam(), ... @@ -51,7 +51,7 @@ plotManhattan(object, ...) plotExpression( fds, - type = c("psi5", "psi3", "theta", "jaccard"), + type = currentType(fds), idx = NULL, result = NULL, colGroup = NULL, @@ -63,7 +63,7 @@ plotExpression( plotExpectedVsObservedPsi( fds, - type = c("psi5", "psi3", "theta", "jaccard"), + type = currentType(fds), idx = NULL, result = NULL, colGroup = NULL, @@ -91,7 +91,7 @@ plotExpectedVsObservedPsi( \S4method{plotEncDimSearch}{FraserDataSet}( object, - type = c("psi3", "psi5", "theta", "jaccard"), + type = currentType(object), plotType = c("auc", "loss") ) @@ -111,7 +111,7 @@ plotFilterVariability( \S4method{plotCountCorHeatmap}{FraserDataSet}( object, - type = c("psi5", "psi3", "theta", "jaccard"), + type = currentType(object), logit = FALSE, topN = 50000, topJ = 5000, @@ -174,9 +174,9 @@ plotBamCoverageFromResultTable( \S4method{plotManhattan}{FraserDataSet}( object, sampleID, - type = c("psi5", "psi3", "theta", "jaccard"), + type = currentType(object), main = paste0("sampleID = ", sampleID), - color = c("black", "darkgrey"), + color_chr = c("black", "darkgrey"), ... ) } @@ -375,6 +375,8 @@ junction should be extended to the left in \item{right_extension}{Indicating how far the plotted range around the outlier junction should be extended to the right in \code{plotBamCoverageFromResultTable}.} + +\item{color_chr}{Interchanging colors by chromosome for \code{plotManhattan}.} } \value{ If base R graphics are used nothing is returned else the plotly or @@ -484,27 +486,27 @@ certain region around the outlier. fds <- makeSimulatedFraserDataSet(m=40, j=200) fds <- calculatePSIValues(fds) fds <- filterExpressionAndVariability(fds, filter=FALSE) -# this step should be done for all splicing metrics and more dimensions -fds <- optimHyperParams(fds, "psi5", q_param=c(2,5,10,25)) +# this step should be done for more dimensions in practice +fds <- optimHyperParams(fds, "jaccard", q_param=c(2,5,10,25)) fds <- FRASER(fds) # QC plotting plotFilterExpression(fds) plotFilterVariability(fds) -plotCountCorHeatmap(fds, "theta") -plotCountCorHeatmap(fds, "theta", normalized=TRUE) -plotEncDimSearch(fds, type="psi5") +plotCountCorHeatmap(fds, "jaccard") +plotCountCorHeatmap(fds, "jaccard", normalized=TRUE) +plotEncDimSearch(fds, type="jaccard") # extract results plotAberrantPerSample(fds, aggregate=FALSE) -plotVolcano(fds, "sample1", "psi5") +plotVolcano(fds, "sample1", "jaccard") # dive into gene/sample level results res <- results(fds) res plotExpression(fds, result=res[1]) plotQQ(fds, result=res[1]) -plotExpectedVsObservedPsi(fds, type="psi5", res=res[1]) +plotExpectedVsObservedPsi(fds, res=res[1]) # create manhattan plot of pvalues by genomic position plotManhattan(fds, type="jaccard", sampleID="sample10") diff --git a/man/psiTypes.Rd b/man/psiTypes.Rd index 43ce2ea6..773152c5 100644 --- a/man/psiTypes.Rd +++ b/man/psiTypes.Rd @@ -3,18 +3,18 @@ \docType{data} \name{psiTypes} \alias{psiTypes} -\title{Available psi types} +\title{Splice metrics that are run by default} \format{ -An object of class \code{character} of length 4. +An object of class \code{character} of length 1. } \usage{ psiTypes } \description{ -Available psi types +Splice metrics that are run by default } \examples{ - # to show available psi types: + # to show splice metrics selected to be fitted: psiTypes } diff --git a/man/psiTypes_avail.Rd b/man/psiTypes_avail.Rd new file mode 100644 index 00000000..6f8455a9 --- /dev/null +++ b/man/psiTypes_avail.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/variables.R +\docType{data} +\name{psiTypes_avail} +\alias{psiTypes_avail} +\title{Available splice metrics} +\format{ +An object of class \code{character} of length 4. +} +\usage{ +psiTypes_avail +} +\description{ +Available splice metrics +} +\examples{ + # to show available splice metrics: + psiTypes_avail + +} +\keyword{datasets} diff --git a/man/results.Rd b/man/results.Rd index ef76b666..c7dec171 100644 --- a/man/results.Rd +++ b/man/results.Rd @@ -10,12 +10,12 @@ sampleIDs = samples(object), padjCutoff = 0.05, zScoreCutoff = NA, - deltaPsiCutoff = 0.3, - rhoCutoff = 0.1, + deltaPsiCutoff = 0.1, + rhoCutoff = 1, aggregate = FALSE, collapse = FALSE, minCount = 5, - psiType = c("psi3", "psi5", "theta", "jaccard"), + psiType = currentType(object), geneColumn = "hgnc_symbol", additionalColumns = NULL, BPPARAM = bpparam() @@ -25,10 +25,10 @@ object, type = currentType(object), padjCutoff = 0.05, - deltaPsiCutoff = 0.3, + deltaPsiCutoff = 0.1, zScoreCutoff = NA, minCount = 5, - rhoCutoff = 0.1, + rhoCutoff = 1, by = c("none", "sample", "feature"), aggregate = FALSE, geneColumn = "hgnc_symbol", @@ -117,13 +117,13 @@ results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=0.1, # get aberrant events per sample: on the example data, nothing is aberrant # based on the adjusted p-value -aberrant(fds, type="psi5", by="sample") +aberrant(fds, type="jaccard", by="sample") # get aberrant events per gene (first annotate gene symbols) fds <- annotateRangesWithTxDb(fds) -aberrant(fds, type="psi5", by="feature", zScoreCutoff=2, padjCutoff=NA, +aberrant(fds, type="jaccard", by="feature", zScoreCutoff=2, padjCutoff=NA, aggregate=TRUE) # find aberrant junctions/splice sites -aberrant(fds, type="psi5") +aberrant(fds, type="jaccard") } diff --git a/tests/testthat/test_fraser_pipeline.R b/tests/testthat/test_fraser_pipeline.R index 2bb6e2cc..761330ec 100644 --- a/tests/testthat/test_fraser_pipeline.R +++ b/tests/testthat/test_fraser_pipeline.R @@ -4,7 +4,7 @@ test_that("FRASER function", { fds <- createTestFraserDataSet() expect_is(fds, "FraserDataSet") anames <- c(psiTypes, paste0(c("delta", "predictedMeans", - "pvaluesBetaBinomial_rho0.1", "padjBetaBinomial_rho0.1", + "pvaluesBetaBinomial", "padjBetaBinomial", "zScores"), "_", rep(psiTypes, 5))) expect_equal(anames %in% assayNames(fds), !logical(length(anames))) diff --git a/tests/testthat/test_hyperParams.R b/tests/testthat/test_hyperParams.R index 26331409..fd8a961b 100644 --- a/tests/testthat/test_hyperParams.R +++ b/tests/testthat/test_hyperParams.R @@ -2,6 +2,7 @@ context("Test hyper param optimization") test_that("Test hyper param testing", { fds <- makeSimulatedFraserDataSet(m=15, j=20, dist="BB") + fds <- calculatePSIValues(fds) # test BB no hyper params and accessors fds <- optimHyperParams(fds, type="psi3", implementation="BB") diff --git a/tests/testthat/test_plotJunctionDist.R b/tests/testthat/test_plotJunctionDist.R index 85a4b9d3..1394f13c 100644 --- a/tests/testthat/test_plotJunctionDist.R +++ b/tests/testthat/test_plotJunctionDist.R @@ -7,11 +7,11 @@ test_that("Main junction distribution plot", { # plot distributions expect_silent(plotExpression(fds, result=res[1])) - expect_silent(plotVolcano(fds, "sample1", "psi5")) + expect_silent(plotVolcano(fds, "sample1", "jaccard")) expect_silent(plotExpectedVsObservedPsi(fds, result=res[2])) - expect_is(plotCountCorHeatmap(fds, "psi5", norm=FALSE), "pheatmap") - expect_is(plotCountCorHeatmap(fds, "psi5", norm=TRUE), "pheatmap") - expect_is(plotCountCorHeatmap(fds, "psi5", norm=TRUE, topN=10), "pheatmap") + expect_is(plotCountCorHeatmap(fds, "jaccard", norm=FALSE), "pheatmap") + expect_is(plotCountCorHeatmap(fds, "jaccard", norm=TRUE), "pheatmap") + expect_is(plotCountCorHeatmap(fds, "jaccard", norm=TRUE, topN=10), "pheatmap") }) diff --git a/tests/testthat/test_stats.R b/tests/testthat/test_stats.R index cdc098df..e333b220 100644 --- a/tests/testthat/test_stats.R +++ b/tests/testthat/test_stats.R @@ -21,14 +21,14 @@ test_that("Zscore calculation", { fds <- getFraser(clean = TRUE) # prepare zScore input for logit scale - psiVal <- (K(fds, "psi5") + pseudocount())/(N(fds, "psi5") + 2*pseudocount()) - mu <- predictedMeans(fds, "psi5") + psiVal <- (K(fds, "jaccard") + pseudocount())/(N(fds, "jaccard") + 2*pseudocount()) + mu <- predictedMeans(fds, "jaccard") residual <- qlogis(psiVal) - qlogis(mu) # compute zscore zscores <- (residual - rowMeans(residual)) / rowSds(residual) - expect_equal(zscores, zScores(fds, "psi5")) + expect_equal(zscores, zScores(fds, "jaccard")) }) test_that("Gene p value calculation with NAs", { @@ -39,44 +39,44 @@ test_that("Gene p value calculation with NAs", { mcols(fds, type="ss")$hgnc_symbol <- rep(c("geneA", "geneB", "geneC"), times=c(4, 6, 4)) - # simulate junction with bad rho fit - rho_5 <- rho(fds, type="psi5") - rho_5[c(1, 4:7)] <- 0.5 - rho(fds, type="psi5") <- rho_5 - - rho_3 <- rho(fds, type="psi3") - rho_3 <- rep(0.5, length(rho_3)) - rho(fds, type="psi3") <- rho_3 + # simulate junction with bad rho fit to create partly NAs + rho <- rho(fds, type="jaccard") + rho[c(1, 4:7)] <- 0.5 + rho(fds, type="jaccard") <- rho # calc p values - fds <- calculatePadjValues(fds, type="psi5", rhoCutoff=0.1) - fds <- calculatePadjValues(fds, type="psi3", rhoCutoff=0.1) + fds <- calculatePadjValues(fds, type="jaccard", rhoCutoff=0.1) # check dimension of junction-, site- and gene-level pval matrices - expect_equal(nrow(pVals(fds, type="psi5", level="junction", - filters=list(rho=0.1))), nrow(fds)) - expect_equal(nrow(pVals(fds, type="psi5", level="site", + expect_equal(nrow(pVals(fds, type="jaccard", level="junction")), nrow(fds)) + expect_equal(nrow(pVals(fds, type="jaccard", level="site", filters=list(rho=0.1))), nrow(fds)) - expect_equal(nrow(pVals(fds, type="psi5", level="gene", + expect_equal(nrow(pVals(fds, type="jaccard", level="gene", filters=list(rho=0.1))), 3) - # check psi5 pvals are partly NAs - expect_true(all(is.na(pVals(fds, type="psi5", level="site", - filters=list(rho=0.1))[4:7,]))) - expect_true(all(is.na(pVals(fds, type="psi5", level="gene", - filters=list(rho=0.1))["geneB",]))) - expect_true(all(is.na(padjVals(fds, type="psi5", level="site", - filters=list(rho=0.1))[4:7,]))) - expect_true(all(is.na(padjVals(fds, type="psi5", level="gene", - filters=list(rho=0.1))["geneB",]))) + # check jaccard pvals are partly NAs + expect_true(all(is.na(pVals(fds, type="jaccard", level="site", + filters=list(rho=0.1))[4:7,]))) + expect_true(all(is.na(pVals(fds, type="jaccard", level="gene", + filters=list(rho=0.1))["geneB",]))) + expect_true(all(is.na(padjVals(fds, type="jaccard", level="site", + filters=list(rho=0.1))[4:7,]))) + expect_true(all(is.na(padjVals(fds, type="jaccard", level="gene", + filters=list(rho=0.1))["geneB",]))) + + # simulate junction with bad rho fit to create partly NAs + rho <- rho(fds, type="jaccard") + rho <- rep(0.5, length(rho)) + rho(fds, type="jaccard") <- rho + fds <- calculatePadjValues(fds, type="jaccard", rhoCutoff=0.1) - # check psi3 pvals are all NAs - expect_true(all(is.na(pVals(fds, type="psi3", level="site", + # check jaccard pvals are all NAs + expect_true(all(is.na(pVals(fds, type="jaccard", level="site", filters=list(rho=0.1))))) - expect_true(all(is.na(pVals(fds, type="psi3", level="gene", + expect_true(all(is.na(pVals(fds, type="jaccard", level="gene", filters=list(rho=0.1))))) - expect_true(all(is.na(padjVals(fds, type="psi3", level="site", + expect_true(all(is.na(padjVals(fds, type="jaccard", level="site", filters=list(rho=0.1))))) - expect_true(all(is.na(padjVals(fds, type="psi3", level="gene", + expect_true(all(is.na(padjVals(fds, type="jaccard", level="gene", filters=list(rho=0.1))))) }) From 7d272883781edcf8aec33a179b04e508e440caea Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Thu, 24 Nov 2022 13:51:34 +0100 Subject: [PATCH 39/80] fix install of dependencies for github actions --- .github/helperScripts/setupEnv.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/helperScripts/setupEnv.R b/.github/helperScripts/setupEnv.R index 9702e9b5..a141aac6 100644 --- a/.github/helperScripts/setupEnv.R +++ b/.github/helperScripts/setupEnv.R @@ -24,7 +24,7 @@ BiocManager::install("BiocVersion", version=BIOC_VERSION,ask = FALSE) # add testthat to pre installation dependencies due to: https://github.com/r-lib/pkgload/issues/89 for(p in c("getopt", "testthat", "devtools", "covr", "roxygen2", "BiocCheck", "R.utils", "rtracklayer")){ - installIfReq(p=p, Ncpus=NCPUS) + installIfReq(p=p, Ncpus=NCPUS, update=TRUE) } # because of https://github.com/r-windows/rtools-installer/issues/3 @@ -35,10 +35,10 @@ if("windows" == .Platform$OS.type){ } print_log("Update Packages") -BiocManager::install(ask=FALSE, Ncpus=NCPUS, version=BIOC_VERSION) +BiocManager::install(ask=FALSE, Ncpus=NCPUS, version=BIOC_VERSION, update=TRUE) print_log("Install dev package") -devtools::install(".", dependencies=TRUE) +devtools::install(".", dependencies=TRUE, update=TRUE) # to get FRASER session info try({ library(FRASER) }) From 86f6691cd04d464fcda68ed39a75f6e02875910b Mon Sep 17 00:00:00 2001 From: Christian Mertes Date: Thu, 24 Nov 2022 14:44:44 +0100 Subject: [PATCH 40/80] pkgload problem is fixed ==> parallel install --- .github/helperScripts/setupEnv.R | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/helperScripts/setupEnv.R b/.github/helperScripts/setupEnv.R index a141aac6..979169ca 100644 --- a/.github/helperScripts/setupEnv.R +++ b/.github/helperScripts/setupEnv.R @@ -22,10 +22,11 @@ BiocManager::install("BiocVersion", version=BIOC_VERSION,ask = FALSE) # install needed packages # add testthat to pre installation dependencies due to: https://github.com/r-lib/pkgload/issues/89 -for(p in c("getopt", "testthat", "devtools", "covr", "roxygen2", "BiocCheck", - "R.utils", "rtracklayer")){ - installIfReq(p=p, Ncpus=NCPUS, update=TRUE) -} +# for(p in c("getopt", "testthat", "devtools", "covr", "roxygen2", "BiocCheck", +# "R.utils", "rtracklayer")){ +# installIfReq(p=p, Ncpus=NCPUS, update=TRUE) +# } +BiocManager::install(c("getopt", "testthat", "devtools", "covr", "roxygen2", "BiocCheck", "R.utils", "rtracklayer"), Ncpus=NCPUS) # because of https://github.com/r-windows/rtools-installer/issues/3 if("windows" == .Platform$OS.type){ From 7d9abd45850f5143e29594aeff50a321cd0acb34 Mon Sep 17 00:00:00 2001 From: Christian Mertes Date: Thu, 24 Nov 2022 14:49:15 +0100 Subject: [PATCH 41/80] Add update=FALSE to install --- .github/helperScripts/setupEnv.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/helperScripts/setupEnv.R b/.github/helperScripts/setupEnv.R index 979169ca..b58d864d 100644 --- a/.github/helperScripts/setupEnv.R +++ b/.github/helperScripts/setupEnv.R @@ -26,7 +26,7 @@ BiocManager::install("BiocVersion", version=BIOC_VERSION,ask = FALSE) # "R.utils", "rtracklayer")){ # installIfReq(p=p, Ncpus=NCPUS, update=TRUE) # } -BiocManager::install(c("getopt", "testthat", "devtools", "covr", "roxygen2", "BiocCheck", "R.utils", "rtracklayer"), Ncpus=NCPUS) +BiocManager::install(c("getopt", "testthat", "devtools", "covr", "roxygen2", "BiocCheck", "R.utils", "rtracklayer"), Ncpus=NCPUS, update=FALSE) # because of https://github.com/r-windows/rtools-installer/issues/3 if("windows" == .Platform$OS.type){ From e60ae0bf09cdb947050979d47f4b2310982a58e9 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Thu, 24 Nov 2022 21:09:48 +0100 Subject: [PATCH 42/80] update vignette to FRASER2 and set all available metrics in plot functions --- R/Fraser-pipeline.R | 7 +- R/plotMethods.R | 17 ++-- R/variables.R | 15 ++-- man/FRASER.Rd | 1 + man/plotFunctions.Rd | 18 ++-- man/psiTypes.Rd | 19 ++++- man/psiTypes_avail.Rd | 21 ----- vignettes/FRASER.Rnw | 105 ++++++++++++++++++------ vignettes/IntronJaccardIndex_sketch.png | Bin 0 -> 35651 bytes 9 files changed, 121 insertions(+), 82 deletions(-) delete mode 100644 man/psiTypes_avail.Rd create mode 100644 vignettes/IntronJaccardIndex_sketch.png diff --git a/R/Fraser-pipeline.R b/R/Fraser-pipeline.R index b7960dad..d6a106bf 100644 --- a/R/Fraser-pipeline.R +++ b/R/Fraser-pipeline.R @@ -85,8 +85,9 @@ NULL #' the beta-binomial fit, the computation of Z scores and p values as well as #' the computation of delta-PSI values. #' @export -FRASER <- function(fds, q, implementation=c("PCA", "PCA-BB-Decoder", - "AE-weighted", "AE", "BB"), +FRASER <- function(fds, q, types=psiTypes, + implementation=c("PCA", "PCA-BB-Decoder", "AE-weighted", + "AE", "BB"), iterations=15, BPPARAM=bpparam(), correction, ...){ # Check input implementation <- match.arg(implementation) @@ -103,7 +104,7 @@ FRASER <- function(fds, q, implementation=c("PCA", "PCA-BB-Decoder", } # fit each splicing type separately - for(i in psiTypes){ + for(i in types){ # get type specific q if(missing(q)){ diff --git a/R/plotMethods.R b/R/plotMethods.R index 3fd7ea50..2c38562f 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -299,7 +299,7 @@ NULL plotVolcano.FRASER <- function(object, sampleID, - type=currentType(object), basePlot=TRUE, + type=psiTypes_avail, basePlot=TRUE, aggregate=FALSE, main=NULL, label=NULL, deltaPsiCutoff=0.1, padjCutoff=0.1, ...){ @@ -400,7 +400,7 @@ setMethod("plotVolcano", signature="FraserDataSet", plotVolcano.FRASER) plotAberrantPerSample.FRASER <- function(object, main, - type=psiTypes, + type=psiTypes_avail, padjCutoff=0.1, zScoreCutoff=NA, deltaPsiCutoff=0.1, aggregate=TRUE, BPPARAM=bpparam(), ...){ @@ -463,7 +463,7 @@ setMethod("plotAberrantPerSample", signature="FraserDataSet", #' #' @rdname plotFunctions #' @export -plotExpression <- function(fds, type=currentType(fds), +plotExpression <- function(fds, type=psiTypes_avail, idx=NULL, result=NULL, colGroup=NULL, basePlot=TRUE, main=NULL, label="aberrant", ...){ if(!is.null(result)){ @@ -555,7 +555,7 @@ plotExpression <- function(fds, type=currentType(fds), #' #' @rdname plotFunctions #' @export -plotExpectedVsObservedPsi <- function(fds, type=currentType(fds), +plotExpectedVsObservedPsi <- function(fds, type=psiTypes_avail, idx=NULL, result=NULL, colGroup=NULL, main=NULL, basePlot=TRUE, label="aberrant", ...){ type <- match.arg(type) @@ -819,8 +819,7 @@ plotQQ.FRASER <- function(object, type=NULL, idx=NULL, result=NULL, setMethod("plotQQ", signature="FraserDataSet", plotQQ.FRASER) -plotEncDimSearch.FRASER <- function(object, - type=currentType(object), +plotEncDimSearch.FRASER <- function(object, type=psiTypes_avail, plotType=c("auc", "loss")){ type <- match.arg(type) plotType <- match.arg(plotType) @@ -988,7 +987,7 @@ plotFilterVariability <- function(fds, bins=200, legend.position=c(0.8, 0.8), plotCountCorHeatmap.FRASER <- function(object, - type=currentType(object), logit=FALSE, + type=psiTypes_avail, logit=FALSE, topN=50000, topJ=5000, minMedian=1, minCount=10, main=NULL, normalized=FALSE, show_rownames=FALSE, show_colnames=FALSE, minDeltaPsi=0.1, annotation_col=NA, @@ -1429,7 +1428,7 @@ plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE, } plotManhattan.FRASER <- function(object, sampleID, - type=currentType(object), + type=psiTypes_avail, main=paste0("sampleID = ", sampleID), color_chr=c("black", "darkgrey"), ...){ @@ -1543,7 +1542,7 @@ ggplotLabelPsi <- function(type, asCharacter=FALSE){ } else{ vapply(type, FUN=function(x) switch (x, - jaccard = "jaccard-intron-index", + jaccard = "Intron-Jaccard-Index", psi5 = "psi[5]", psi3 = "psi[3]", theta = "theta"), diff --git a/R/variables.R b/R/variables.R index 61f821e1..214c283d 100644 --- a/R/variables.R +++ b/R/variables.R @@ -5,18 +5,15 @@ #' # to show available splice metrics: #' psiTypes_avail #' -#' @export -psiTypes_avail <- c("psi5", "psi3", "theta", "jaccard") -names(psiTypes_avail) <- c("psi5", "psi3", "theta", "Intron Jaccard Index") - -#' -#' Splice metrics that are run by default -#' -#' @examples #' # to show splice metrics selected to be fitted: #' psiTypes #' +#' @rdname psiTypes +#' @export +psiTypes_avail <- c("jaccard", "psi5", "psi3", "theta") +names(psiTypes_avail) <- c("Intron Jaccard Index", "psi5", "psi3", "theta") + +#' @describeIn psiTypes Splice metrics that are run by default #' @export psiTypes <- c("jaccard") names(psiTypes) <- c("Intron Jaccard Index") - diff --git a/man/FRASER.Rd b/man/FRASER.Rd index 170a6271..a9a0ca97 100644 --- a/man/FRASER.Rd +++ b/man/FRASER.Rd @@ -11,6 +11,7 @@ FRASER( fds, q, + types = psiTypes, implementation = c("PCA", "PCA-BB-Decoder", "AE-weighted", "AE", "BB"), iterations = 15, BPPARAM = bpparam(), diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd index 1acf58c6..686bf15f 100644 --- a/man/plotFunctions.Rd +++ b/man/plotFunctions.Rd @@ -27,7 +27,7 @@ plotManhattan(object, ...) \S4method{plotVolcano}{FraserDataSet}( object, sampleID, - type = currentType(object), + type = psiTypes_avail, basePlot = TRUE, aggregate = FALSE, main = NULL, @@ -40,7 +40,7 @@ plotManhattan(object, ...) \S4method{plotAberrantPerSample}{FraserDataSet}( object, main, - type = psiTypes, + type = psiTypes_avail, padjCutoff = 0.1, zScoreCutoff = NA, deltaPsiCutoff = 0.1, @@ -51,7 +51,7 @@ plotManhattan(object, ...) plotExpression( fds, - type = currentType(fds), + type = psiTypes_avail, idx = NULL, result = NULL, colGroup = NULL, @@ -63,7 +63,7 @@ plotExpression( plotExpectedVsObservedPsi( fds, - type = currentType(fds), + type = psiTypes_avail, idx = NULL, result = NULL, colGroup = NULL, @@ -89,11 +89,7 @@ plotExpectedVsObservedPsi( ... ) -\S4method{plotEncDimSearch}{FraserDataSet}( - object, - type = currentType(object), - plotType = c("auc", "loss") -) +\S4method{plotEncDimSearch}{FraserDataSet}(object, type = psiTypes_avail, plotType = c("auc", "loss")) plotFilterExpression( fds, @@ -111,7 +107,7 @@ plotFilterVariability( \S4method{plotCountCorHeatmap}{FraserDataSet}( object, - type = currentType(object), + type = psiTypes_avail, logit = FALSE, topN = 50000, topJ = 5000, @@ -174,7 +170,7 @@ plotBamCoverageFromResultTable( \S4method{plotManhattan}{FraserDataSet}( object, sampleID, - type = currentType(object), + type = psiTypes_avail, main = paste0("sampleID = ", sampleID), color_chr = c("black", "darkgrey"), ... diff --git a/man/psiTypes.Rd b/man/psiTypes.Rd index 773152c5..dfff5dce 100644 --- a/man/psiTypes.Rd +++ b/man/psiTypes.Rd @@ -1,19 +1,32 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/variables.R \docType{data} -\name{psiTypes} +\name{psiTypes_avail} +\alias{psiTypes_avail} \alias{psiTypes} -\title{Splice metrics that are run by default} +\title{Available splice metrics} \format{ +An object of class \code{character} of length 4. + An object of class \code{character} of length 1. } \usage{ +psiTypes_avail + psiTypes } \description{ -Splice metrics that are run by default +Available splice metrics } +\section{Functions}{ +\itemize{ +\item \code{psiTypes}: Splice metrics that are run by default +}} + \examples{ + # to show available splice metrics: + psiTypes_avail + # to show splice metrics selected to be fitted: psiTypes diff --git a/man/psiTypes_avail.Rd b/man/psiTypes_avail.Rd deleted file mode 100644 index 6f8455a9..00000000 --- a/man/psiTypes_avail.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/variables.R -\docType{data} -\name{psiTypes_avail} -\alias{psiTypes_avail} -\title{Available splice metrics} -\format{ -An object of class \code{character} of length 4. -} -\usage{ -psiTypes_avail -} -\description{ -Available splice metrics -} -\examples{ - # to show available splice metrics: - psiTypes_avail - -} -\keyword{datasets} diff --git a/vignettes/FRASER.Rnw b/vignettes/FRASER.Rnw index 11a7a599..4a28fbbe 100644 --- a/vignettes/FRASER.Rnw +++ b/vignettes/FRASER.Rnw @@ -46,6 +46,7 @@ opts_chunk$set( \begin{document} +\SweaveOpts{concordance=TRUE} <>= opts_chunk$set(concordance=TRUE) @@ -175,6 +176,32 @@ intron-exon boundary of acceptor A. While we calculate $\theta$ for the 5' and between $\theta_5$ and $\theta_3$ and hence call it jointly $\theta$ in the following. +From \fraser{}2 on, only one metric - the Intron Jaccard Index (Figure +\ref{IntronJaccardIndex_sketch}.) - is used by default. The Intron Jaccard +Index is more robust and allows to focus more on functionally relevant +aberrant splicing events. It allows to detect all types of aberrant splicing +previously detected using the three metrics ($\psi_5$, $\psi_3$, $\theta$) +within a single metric. + +\incfig{IntronJaccardIndex_sketch}{1\textwidth}{Overview over the Intron +Jaccard Index, the splice metric used in \fraser{}2.}{ +The Intron Jaccard Index considers both split and nonsplit reads and is +defined as the jaccard index of the set of donor reads (reads sharing a donor +site with the intron of interest and nonsplit reads at that donor site) and +acceptor reads (reads sharing an acceptor site with the intron of interest and +nonsplit reads at that acceptor site). } + +The Intron Jaccard Index considers both split and nonsplit reads and is +defined as the jaccard index of the set of donor reads (reads sharing a donor +site with the intron of interest and nonsplit reads at that donor site) and +acceptor reads (reads sharing an acceptor site with the intron of interest and +nonsplit reads at that acceptor site): + +\begin{equation} + J(D,A) = \frac{n(D,A)}{\sum_{A'} n(D,A') + \sum_{D'} n(D',A) + n(D) + n(A) - n(D,A)}\label{eq:jaccard} +\end{equation} + + \section{Quick guide to \fraser{}} Here we quickly show how to do an analysis with \fraser{}, starting from a @@ -204,10 +231,6 @@ fds <- calculatePSIValues(fds) fds <- filterExpressionAndVariability(fds, minExpressionInOneSample=20, minDeltaPsi=0.0, filter=TRUE) -# fit the splicing model for each metric -# with a specific latentsapce dimension -fds <- FRASER(fds, q=c(psi5=2, psi3=3, theta=3)) - # we provide two ways to anntoate introns with the corresponding gene symbols: # the first way uses TxDb-objects provided by the user as shown here library(TxDb.Hsapiens.UCSC.hg19.knownGene) @@ -216,6 +239,10 @@ txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene orgDb <- org.Hs.eg.db fds <- annotateRangesWithTxDb(fds, txdb=txdb, orgDb=orgDb) +# fit the splicing model for each metric +# with a specific latentsapce dimension +fds <- FRASER(fds, q=c(psi5=2, psi3=3, theta=3)) + # alternatively, we also provide a way to use biomart for the annotation: # fds <- annotateRanges(fds) @@ -226,7 +253,7 @@ res <- results(fds, zScoreCutoff=NA, padjCutoff=NA, deltaPsiCutoff=NA) res # result visualization -plotVolcano(fds, sampleID="sample1", type="psi5", aggregate=TRUE) +plotVolcano(fds, sampleID="sample1", type="jaccard", aggregate=TRUE) @ @@ -439,7 +466,7 @@ transformed $\psi$ values to compute the correlation. <>= # Heatmap of the sample correlation -plotCountCorHeatmap(fds, type="psi5", logit=TRUE, normalized=FALSE) +plotCountCorHeatmap(fds, type="jaccard", logit=TRUE, normalized=FALSE) @ It is also possible to visualize the correlation structure of the logit @@ -447,7 +474,7 @@ transformed $\psi$ values of the $topJ$ most variable introns for all samples: <>= # Heatmap of the intron/sample expression -plotCountCorHeatmap(fds, type="psi5", logit=TRUE, normalized=FALSE, +plotCountCorHeatmap(fds, type="jaccard", logit=TRUE, normalized=FALSE, plotType="junctionSample", topJ=100, minDeltaPsi = 0.01) @ @@ -470,14 +497,14 @@ p-values and z-scores for all $\psi$ types. For more details see section <>= # This is computational heavy on real size datasets and can take awhile -fds <- FRASER(fds, q=c(psi5=3, psi3=5, theta=2)) +fds <- FRASER(fds, q=c(jaccard=3)) @ To check whether the correction worked, we can have a look at the correlation heatmap using the normalized $\psi$ values from the fit. <>= -plotCountCorHeatmap(fds, type="psi5", normalized=TRUE, logit=TRUE) +plotCountCorHeatmap(fds, type="jaccard", normalized=TRUE, logit=TRUE) @ \subsubsection{Calling splicing outliers} @@ -495,6 +522,10 @@ $\psi$ values or both. <>= # annotate introns with the HGNC symbols of the corresponding gene +if (!require("TxDb.Hsapiens.UCSC.hg19.knownGene", quietly = TRUE)) + BiocManager::install("TxDb.Hsapiens.UCSC.hg19.knownGene") +if (!require("org.Hs.eg.db", quietly = TRUE)) + BiocManager::install("org.Hs.eg.db") library(TxDb.Hsapiens.UCSC.hg19.knownGene) library(org.Hs.eg.db) @@ -519,9 +550,12 @@ the following additional information: \item hgncSymbol: the gene symbol of the gene that contains the splice junction or site if available \item type: the metric for which the aberrant event was detected (either - psi5 for $\psi_5$, psi3 for $\psi_3$ or theta for $\theta$) + jaccard for Intron Jaccard Index or psi5 for $\psi_5$, psi3 for $\psi_3$ or + theta for $\theta$) \item pValue, padjust, zScore: the p-value, adjusted p-value and z-score of - this event + this event (at intron or splice site level depending on metric) + \item pValueGene, padjustGene: only present in the gene-level results table, + gives the p-value and FDR adjusted p-value at gene-level \item psiValue: the value of $\psi_5$, $\psi_3$ or $\theta$ metric (depending on the type column) of this junction or splice site for the sample in which it is detected as aberrant @@ -533,20 +567,39 @@ the following additional information: same donor or acceptor site as this junction or site over all samples \item counts, totalCounts: the count (k) and total count (n) of the splice junction or site for the sample where it is detected as aberrant + \item nonsplitCounts, nonsplitProportion: only present for the Intron + Jaccard Index. States the sum of nonsplit counts overlapping either the + donor or acceptor site of the outlier intron for the sample where it is + detected as aberrant; and their proportion out of the total counts (N). + A high nonsplitProportion indicates possible (partial) intron retention. \end{itemize} Please refer to section \ref{sec:Introduction} for more information about the -metrics $\psi_5$, $\psi_3$ and $\theta$ and their definition. In general, an +Intron Jaccard Index metric (or the previous metrics $\psi_5$, $\psi_3$ and +$\theta$) and their definition. In general, an aberrant $\psi_5$ value might indicate aberrant acceptor site usage of the junction where the event is detected; an aberrant $\psi_3$ value might indicate aberrant donor site usage of the junction where the event is detected; and an aberrant $\theta$ value might indicate partial or full intron retention, or -exon truncation or elongation. We recommend using a genome browser to -investigate interesting detected events in more detail. +exon truncation or elongation. As the Intron Jaccard Index combines the +previously described metrics, an aberrant Intron Jaccard value can indicate any +of the above described cases. We recommend using a genome browser to +investigate interesting detected events in more detail. \fraser{}2 also +provides the function \Rfunction{plotBamCoverageFromResultTable} to create a +sashimi plot for an outlier in the results table directly in R (if paths to +bam files are available in the \fds{} object). <>= # to show result visualization functions for this tuturial, zScore cutoff used res <- results(fds, zScoreCutoff=2, padjCutoff=NA, deltaPsiCutoff=0.1) res + +# for the gene level pvalues, gene symbols need to be annotated the fds object +# before calling the calculatePadjValues function (part of FRASER() function) +# as we previously called FRASER() before annotating genes, we run it again here +fds <- calculatePadjValues(fds, type="jaccard", geneLevel=TRUE) +# generate gene-level results table (if gene symbols have been annotated) +res_gene <- results(fds, aggregate=TRUE, padjCutoff=NA, deltaPsiCutoff=0.1) +res_gene @ \subsection{Finding splicing candidates in patients} @@ -555,7 +608,7 @@ Let's hava a look at sample 10 and check if we got some splicing candidates for this sample. <>= -plotVolcano(fds, type="psi5", "sample10") +plotVolcano(fds, type="jaccard", "sample10") @ Which are the splicing events in detail? @@ -568,7 +621,7 @@ sampleRes To have a closer look at the junction level, use the following functions: <>= -plotExpression(fds, type="psi5", result=sampleRes[1]) +plotExpression(fds, type="jaccard", result=sampleRes[1]) plotExpectedVsObservedPsi(fds, result=sampleRes[1]) @ @@ -619,7 +672,7 @@ confounders in the data. Currently the following methods are implemented: # Using an alternative way to correct splicing ratios # here: only 2 iteration to speed the calculation up # for the vignette, the default is 15 iterations -fds <- fit(fds, q=3, type="psi5", implementation="PCA-BB-Decoder", +fds <- fit(fds, q=3, type="jaccard", implementation="PCA-BB-Decoder", iterations=2) @ @@ -638,17 +691,17 @@ for a subset of the dataset: <>= set.seed(42) # hyperparameter opimization -fds <- optimHyperParams(fds, type="psi5", plot=FALSE) +fds <- optimHyperParams(fds, type="jaccard", plot=FALSE) # retrieve the estimated optimal dimension of the latent space -bestQ(fds, type="psi5") +bestQ(fds, type="jaccard") @ The results from this hyper parameter optimization can be visualized with the function \Rfunction{plotEncDimSearch}. <>= -plotEncDimSearch(fds, type="psi5") +plotEncDimSearch(fds, type="jaccard") @ \subsection{P-value calculation} @@ -669,8 +722,8 @@ computed as the product of the fitted correction values from the autoencoder and the fitted mean adjustements. <>= -fds <- calculatePvalues(fds, type="psi5") -head(pVals(fds, type="psi5")) +fds <- calculatePvalues(fds, type="jaccard") +head(pVals(fds, type="jaccard")) @ Afterwards, adjusted p-values can be calculated. Multiple testing correction is @@ -680,8 +733,8 @@ methods supported by \Rfunction{p.adjust} can be used via the \Robject{method} argument. <>= -fds <- calculatePadjValues(fds, type="psi5", method="BY") -head(padjVals(fds,type="psi5")) +fds <- calculatePadjValues(fds, type="jaccard", method="BY") +head(padjVals(fds,type="jaccard")) @ \subsection{Z-score calculation} @@ -704,8 +757,8 @@ counts and the counts after correction for confounders and $\bar{\delta_j}$ is the mean of intron $j$. <>= -fds <- calculateZscore(fds, type="psi5") -head(zScores(fds, type="psi5")) +fds <- calculateZscore(fds, type="jaccard") +head(zScores(fds, type="jaccard")) @ \subsection{Result visualization} diff --git a/vignettes/IntronJaccardIndex_sketch.png b/vignettes/IntronJaccardIndex_sketch.png new file mode 100644 index 0000000000000000000000000000000000000000..79cca6a809755fb30d0b103f6385d126dba3f472 GIT binary patch literal 35651 zcmd?RWk8hQ+b%kYw17$}Y0)WN0us{QUDBnbDzyFSR z@BRMY59gduhY!v;FzZ=sJ$GE!b>Hg=RFD(L#w5msKp@zX5~4~F2y!C?a*q`K9{8kO zP=Nsg@rFo>zE%08w>^vIfiHF^xX&&OV|pe*aB|n-n~a~15k!M~k8UUeNASL=kWkKx zuQW(Me+`NHdHZwmUc-P*9>&&&`G~I#Cp~fNZ zbkf9m{4F-PA_M|kZB+mJ#@i5m;P0DvLf)u|H!Sz=u_Io+A3aP)Lc9s_hW!1G!TtN@ zi2p-gJ`+MlygorfB1XJ6qW}MFh|zFX=7LvD-wRM6S*(an$?{jRzu4iL+jnr5(<&6w zRA}TvhNvJRHd2MlRZq__f%k%kd*Jan13 z;=(Z?$(D$nPu5bP8?La$wx0O$%Ry*os+hS!1s1XISt_--bU_D!7$fe2kkb#B6FR;F zkG1@YAY-HRRo1TB+GNw-7w%_$HTHgX$dD}3zZ0YiBV_8~b&DwQ)VTddf}gB8@||3u zmWCl@4+*kQgSdYN`Vp?K#j3gYDBMF>OB+fYu1B&dB_%zmA&_Roe1zOEeSPNC{CQ1=ZX3Ae5Zya8oT~J{ubgo#$aVlzg$YD>7oj6VXyk@;AGDr`9{(ZYN4(`O2`!!-STA z7ha#vc0YZ2?oIahD%xOwP0vDxXZ9BB&ZHjWU630QaG79|obHV4Gw+|R#UUkM{_}@Q zgoOU3b`(QJ#(f*Z9&we~LBYyn`B>H?5_qX(WX5o412>ib-hIgEqraEeQ2fO0r&zD{ z0zW3kg@_p^OI6yTf=_+Cg~uW7N)wVChA@i8oCq;757;s66;+#T%;MfQ>0~A@xXGh9 zh%qz*U$C(mj^z!QC1yuHjRp2Vh>CVp22t2S*seFn`|U}*JImTV{H=%WwTR)o|6vV!muierj|P!=_;*H97E)3MzNoi-eSr?EG{q$s zrdP=fG(Yk+MXuiBr6$BLQpqs{hOPlyNc%fS6qB11^|wM0eJkkl)2D$+9*cXMIWZ`D zC8mi{M8NLTkP){rPgGMuA?!!v_*8gUW#z=dk_fM(0&rFv$TPCPzC}KHca6to=%bas zNYd4zmXDdG%jepPX`)9CHXx z%wK{v?lI{SE96F6kLFxkjla8$0hW)zkRr@SNZUAyJS1fe!KExIxVdSZW-^|@kVlnEQcp48J=m60W2s! zKEQpFL)F6KGE@+i!>aA{{qeNj^k{xDQW`qgAfW`ug0w=7Z0(~O@wRZhdo&_0v98@m zghP5fQjZ@`U?+;)9VKBVK34L$0#U_E>F_GiRg@}Ithou7xbh}NGf z1~oOmygD()d(y<~(pUH4^PZvE`qvk>P4`{hc!A^po%Cr3AHQAW;s?S6G{(?M*P37N zJ?MWY_cyU@UTpNAy5kyO)VZHI?<`t5p{rxQ3w`pZLRD^cq4_x(Z{ZV9vx&lASv0_= zy}5ynde6pGyi62Z>npJ(tM*8;W0L=vZMpOl-!`$d4rCCQZFI|-iT=T&nYFd-W&2)^ z4<_oEZQm*GIwZ;b&388zRtYs4>P^W%d?UW#vPIZeVuj1ZQNLLdZz~tXd-OTt3X`sM zJDnIvTXzwLT1PJl&M(A{w>$eV;pF_4=Y3?qMaGJZo4#?AyQUjZb`B3MX3A&oC^@f9 zZ)^0d58|F4c_VR`LJAo{R4?SS_oL$&Zc$FFZhDNlXR0!d#!6pY%&uO-!>4RiGg|Fb z6)G+n%@Hid+cjVc#rfKqS(Hn!Dk-_y_U)}~Y@sJP2%rZ$(;zDqVuJCqR)32vKlLgv z-3XG3c(dwPo`8QIpw8j(OV2_Ivxcz<`i)pJnDD9Fq4VzmafRWG4Z+*2u^td7g8 zy0MQlpLysYI8KZ)6#EPAN}U6}6Gv4uF?_CX@-(QYCz(q2T36?>X>S<3rk!g#W@vf& zd}G+cZssX(1jxqSAi;QBejVB=o(=46v$Pc)cUGEDJc{hn#_zSbDTSG3q9vX5ug0az z{EW-JQgTS2FSm&>a-z?dWJ`a4YJIvM+31h0%w??rQ<=VCgQ?&@N)en#A4x32eVZ|{ zrP?>2SmN%Jg0wI;nCgLBqyF$=a(PP{C1l|5E}*5a)hAinPRMlg<)$`0&SS6fUhFEz z%VrcnH)ngz@?6&_Q!LmHUVqa@a?4>A>CRL#zuKF1d*ersQW8qB<~||Z7sHm)|EB$& z)81Org(;ufVtLNfP(Ld=d2f_XFk3E;jP|KBaU_*G*bW>@3x- z-J$d#YZB5Q-WKbX5=Sht@4ihCaXHUaE>39F<3UugL3ox>4b*YE_MhlKCqL;_31a96 zwhN3PAR+C+)(~a?blhGFtjJ7$`}~0N;K0MTaz(H5$3;7ZNB+%EctcXD`J<~f6sY>R zhrT}W-oF@7;<4Boksyu*d)J=Sx(HYBLs%Wr?jZVI#S2z}{Ajz9=jp81twhLMfo`A&lEJgre~M6=beY{4(QC*xZ`lM| z=9Lus9N2s;;`6CdtMv;<@{eT{@X7V)!ZDb!Z0ADaxQlrCFXXdpE3DK%9IaZP-t){F zs&TmxI2BiA$XMd?wnSk3S1%cJkC$u?WZzl+a%j(f+mI-&{?*3qOJbf!YiVy0!)~Zk zjR*w@bgl&Prek$VES}Sx^_sExRaGugjK(EI_cq61`C5Kh*gR)vd;R<#(FG{pQV6JJ zA<@ZT^G5NgBW!i=s$e26LfL=?KWO05Ue9Sbhi~T0UBA;-?{vm`e7UPbhqC;JAVlm) zbUj@f*>W`Jw>oBIOKWQgd2farbzdBCw%5Fk2K^SHpElBUc3DQ(8$WIl59t>k{GmqpC@FTIBQFIE`li{d@cu3cn-|#N66!<32JNpA+P?BMWtZ< zKFa=%uSvWx*&g(bf#+8(7_(|p;YdzfO3MG_wN1{?Hwvq3-SwArge7S1{BP!SuOlj$#Lg%LFKT`lN95yqJkdy6=!|jOI6FJJnl+SntS=y|lDrxNW{{hOSL`o9J zVvPEo2117m%%Md%U5R49$KcE})nsaQVmah-z4D2$IM(dy7j}}ZhTTFL+YB@sQwqUE z*Y$_=CG;qJ*LzT0mm-&o@dEMF-Ck2jK*PQ;0>mMg(p5sSyb}<%pjb)x^K5DekrJ7L zE1xgc@Wm;9%sym0d*y5_a5*wqY@eYW?O&r**a%PBGd$bfgF|y|v+R zaDW!jJ$I|Np|=Yp5JW_5nYTA*3hqlVox5zN7cogxE(-fd$$oYbYCG6&$0IPGIc(uk zVkl;42%^EJAcS&j;`OPJfDWz3YZJwv*1fR+cTc1d)lb&XKIIJY#1cE*GFx}ugE%9& zhjr-VqXC$rZNPiKR%hPl92E+$-K|LZfXZUA3zWY`Iy@F9ola7*grKlTLaEPQa&o|GGw#k)rI^ND@w{n)<8|tL z{E-{Fs;LCLC52}WrO+LWXJDiEyh^G`>-B;@EYOn^6>#lkcJnH!#2L&I6x~~rE?`Hp0u7K8hJ&mmuH-qzwaVjK44H>!GD8o!Av1xX*^{(=2c%63VVq95qPk=LwX zvhBhL=xDQ;9%U(0ZUqimeI_q2AN{%+-s(qIlx8#e3tB{#RVE+OURU}D2LWG9Itg-g z&4!XX?hi!OV&eu5 z%@qD*v_K!s?$VLb915Fmwhc{M2lVF{oy#?KMoRT-CyM zeNXR3x2tNTqZXy!S`Ez%+jK|94`*JW)@0H#$;}pi?(NH@H0_CAMtSl{u+^y z*7BW%y60)f(c$A*rW=py;k)mO{FWHR--*Ic}=1N9g}${|zLE z2t+xSI3+*Jt%DJ)&x>`;EwLO{dY&h$^=hOlm||0rjGwWI74BnXDO0a(6{u2P&nC0o z6Ct{e5PEtU>?x^(r;${g{LHe^JFnq(kz`wSW47jCI794Yub&H|!iTu^S!_G{arj$bQ~m zA6kAt8%0;ef74Fk8_PKw-|graLMB-MN_;lk8V;UOGD z2q955I%dbDD|5Dqb_ZRce$*&_k3m_hSz;w@2r5W{tkMCn2=^Zi0 zVSzqWAzexAc=Ke>dMb{K!cBzo+L1SZ56)9mxp$M3|rkvPR9(vkwt|ezaEOsGQav88zlOkCqmJi4Xn7tICFkKy7n! z+^hN-6Em5m_;MO`9jF3?Grv$w(4j<8r4(KdH-|ZGFg;Hv^NiaaFNvwO)Yw00-ZxpN zO1%+qkf&oRllbL%wNrtHOP;4K(oT(^ZNeI3ODR;fe6~l;sI#tLLUn_lW;55rg)D&v z99AW0+iN3_V#&w2yMg%&x^C^KpFXo=`PxqOaOj<t(hCd}E69VI4u$7Gs9`k``Q~d|9#1BR;DmfkMdXl$` zSSGi=)I}na9v(nnyL7R*|KlO>%1>XKjMAr{YtrvL`25Jq_NdpW%wN%>D_WPGi!mow!7bo$)%76{ODm@{~MP{0sn`tK)b*oh>!ored?>T;y zk?E>{D+!#RNxg+a>%-Um{QZZR5?BmqP582!tC5>Ld z+v{h^*}v|@f7PoyWYS-aLlQ*=Uf9Lu0b8A<>F-4sn2&ir2zA~aOcf8Ol6?#`3~7CR zI5CH%hM*A9Os!;KjGr%^9<4d+CFXCQA3Cj)Wa9<;nR)y+Q|u{JEG&i2J5wr8A3aJL ze$&6e@lL<>SDnv&X*}r#FGL!VcGVF8(GB5XVv$_K3av&xk>wRH*-PE?gnf ziPB0~731r49|eWOcINz`o#O6zSl+BZQMW*^4u_0agI8!RUV!ZM`lv}a9vIYiBJ#fw zf<*~G0j7zHgx0$$-`s($?Y)F7_SG1Un!dE*1%<25&*HUg7v)kQxBwrrcXm$Xb=sUN zv&xiDlB+2NHwHhsKS{9v3>8^wZ?3KZDyWa8)phMLxjFjzWNSDy1gOSMAx}MUAs<08 z!F!W|ipq4V)S_ST?%Ep(`E+O6Wqlx}Go1P;p(pLt>({GwXA>7CEv`pjE^DMCXTVMS znUFtRp;Wv_c=?8_*$;`&9M4*$iI(f4MA0@oRb=_au6PBqO` z%QDlXaVDF}c1lW0rhV~}ud6{^#G{hgKRJ<=kVw{bJKr~~r+*)EZy;jQ8+~ z&}wSbU!O;;W%F6?ynoOiUa>)$d$&I8E0|wHOc2&qkj}qLv|E<=q{DPo=Rwy(+#ly( z;vrwXu55a?22@G$UiNp)TOqXoTGIHtHG@$8oHI5dTBn{m(9U0~MA*xobI|FSE^<`R#oWvbR;_J{z^prLFE+kyj78 z8O{4QyeD*X*5$}l@^ji09i&c^jVoMBGJDMnZx7g0BZoYQ$8&3uj-R=(qtsxn0yxVHYgl-L#B7Y(mb9Kb9Vxlxj>_ zfZx;4SikV}ZmZ1-tDPd|^>t%U#a%~d&gM&G7^bQ%Cp6eLCwpGFxg+hDQgglmp5NQI ziUsvuIL*RI?l>+Xt6=3sLn+O^&>hzm7|=e?$suDt5Nrb%8qSkwBB>46i5wb?H?i7Z z@Fyq#$=SiT@xxFG*YtV6hL_{a0`h^H(UaE~0qwZk$K737BYxg(mlm_CsZ8wme|GS? zv8Ne}UuJZj?9WecC+4KR0thU*odRs0Tg%s99TN3;-YWn(OuQ`kF|)+Q(m!MlN)O?L zOe)K(#Zgs>iLnif-@g|vwPT&-=Rzp8FncLN`AaXba50B1I&sI0F);m8+`EfP9sx;cC zX%|O+o69hIK=em8GJoR7IbD9x(C`k)4GpYx>E^0Rn`qJQi1Dr&P;j6R4X^0_*3p{Y zb!VA0DPXOtV#I5kP4x~qw<@iv_DwE3C8c3o$oiv~=cs>0tgJpMf*c#_p-YAb^bv`vv|6p_&)ujJcW#SS4!X?21#8muc%q;p{nIsHRlu&^yu;EW9&Tv0q* zNjKZa23-R-)F``WEj!Jk)E0kyK&7Db^2)CMPan)(6x{$(n;adpXxQF$;BZK=xR^zD z?Z5|OGl&^LWn(Xiw;HcM8q)@fv{qzv$w5hQi8w!Zw=|ZCxzx~euj{Ag(@JzIw{w96 z^2?P;^huS^{XP3-&N(mjcoh_C)>3*jhfl~uFw3fw=$446p7$;6n_ zG#+M7_`z@A2x7b;QW%8UC?NIjaFCK-C&X*`Kb*{9Z1$-bPzFmL~I)|;g()VBM7ut>Zt_Aq^D*DS^Ed&6D#10Sd>pK|E>>j4$ zB`uZ*f)UWqhL|w-ra0ENHS3=?$^a+HfdPqynr|uY`{2xM70A>HnM2E!sxjF#1ue4C z^ZKc%HA(|_zE920bf!;FOm5a-Ur;(Qv0m};lw~Oh*lj0W5)se!(xey{W@e(Kn@Jx! zs2T&6vPE9N5FY6$>SDJhS09j|OviS*T{W8V?L&DzfY*D-3MPn+$3$5NIF|!ZdTW;# zunW_yQb2!%lRQuTOST40b<1&Yw@V~rBRV64%-%}KRS^*@6^`+*QTjDhuzwX*R2fMy zY^Hgpq_I5h{oN}#Sh?YbGwY+V^DzmZv-L+ub~r{dCSrwgCAX7<#W@1e%>pa&Z*rCR zxi^z_ipz%2L;ZIN?2nob#Jy%e?f9G3ipq1U%bW>|Xw3+`X+MJ;bk?{LuzAuacXSQS zWY;;{^#1g(;e$U=Kml_fQqu_%Ofpp<#d83a(e(oBsWZ4>mJ%H<<5j}1+<-3Y%F|jk$sKhkq#`lZ zXDkT{DjZNaba)USrG7^=@H+$<0r~av8CsU8>4n9KE3-LOQ0}pNALGMl#_wp|?S&!f zF@zN7Xl?T$$+Z0Pvy+@2y{*-Z6Rq#Wr_Wm7U_Vf#QwgS~xB}V7!8F0$d70p{&Hd=8 zV?w3p2VnxxoF~tY@;PmG?@meJ4@`#2tub6LH+p>0E>OL@0Cf8iEfTfol9=vlM9PTA z)zb33C3;gzu$M!l^wX%VeV*(1{G+9Fas?``8jUmIW{f?ujn|99H3wcm`dh2<$zO4G z^=mkPaW(h-)5Xf)$eOGse!6vLKyMo=pby!xMx_5Z?JK2)EH~Fq`vqkgW`qgd~?wFV{XTW-v%F`MbvjBkOx3Ea*v3MtAXBmxojj44n-R)1;sV&-6i4rLeDxL zYQO(E#`NZHGuP|;_ha>=cXe)=RPueDJj_y;!|-6giTkOi<>WD@p)7IT1MHA^(=u+L!VXOAO*4zxY^85s># z3j15kJFn*~)D03L<6+WPQPocCA7(o(L~5MvPWA)~t>HdJVR*$AJN(B}jc>1S%W#!0 z%4sDf6bQ|k!XGx{W^=p5^1pluQubXe$K+4{sj(}mcy|nL^Z|p=b($nENk)xAZ^2U{ zN@D77W4be3ubrYsm}bw0<*yp9`gC7hsrAfIo;Yz41$s?aP9FzT-X;K*J+sH1g}HC` z-Idg}JH~y)P%~u>r0Iv!Qjb$%EzeFQ9VZqIllT;VB~6t#m+y>yGszvCvaB(R5T&d7^x_ zy2zSCOWTG>22}D+CA1mslc zI|bBwqHy8n>WsoGoR;BCy6f@3S?o%AjmL%kTJWaOB?TwY` z3Ol=zEJZD?sb)K$pl3q9U*iQlW4)R}$=`HwnXjy@l-bV4;-p@nE@|#YMNmro=Ub0u zQ<0pU42CYKJB?MqBO~iQEFJiKTT6$JnS#v!_#XS$4gmi*#Cc!8{%JRG2}lO?YKN=of9rOESuv-84;ozH9ljUtJ{vKAKE;o-v_KVMFEQ%Xlhd!~+pmW;0*(XWkxj*pn@`SsO1`oOoF;p>I1%>pjTqJm{M0XEb5B;9JMWmC>v zwlnMV^MzL9O!Y`~K#db!jxkI8=;V~%yx5n(2WZA;uUZlgOUIolj(YtVuz@b&3>t-v zybWnRO}ba~u(lrPgV!KB5L8MV!h>Q&8+KKfnR@L=roD=0D9eVe%L-;>XQhwFf9OXc z4QI%1jpfIclpOsM3s#{cfIoowL$Hk$Ra8_Oyga)Ge*XNKx7riKR%SKsb=XOry|GY_ zTWJLykM5b>Y#^0YZVk1NRhmq{hq0smI(%llF6h;bjd3n>B)&`R$r-HEJv&UKx1(bn z5X2+_PaR#|BP97O@$cjQm_uwxF=9%l+e%uz%Rrrl)Tvn7N6(q(tU_(swZFz z^qS4>(2a$2mfsjvNn2)4TIyHDRY%c5QDZT%u&jpCK>d>u6BF~C6VVCuu*|sqyQ25) z?bY6ef6mA8+c5izOJ?!me`6w6wk|7Ou;aCpmOwlYWEIfZ@m0*x@i_5~`+A{9#o-xH zJd-|@+;JAIGH2(6|L4JgaLU%&di(NZOGH@Mk=|u{;w@AUmy|nM8v_RVJ&YEXmP7=5 zD7?;7Y**a(>ejZmNhv4_v^AanYfHlh{&si7>vn9O^8gojJpW4FfrBgG~No%v6=NP*Qmw(S4KADjP~Hk{Nm!_QNLjK3UI|splA&xjlVW- z+rLR3dYJrHG%PI4?Rb45*%**q{EQ1PRnsbcrO0RhdE8{Im_0r|R@ZYCJL})7{J8W1 z#ZVQv@A@Zq_r1xI059K`mX>`RQ=|WCn7Y}m@q*!`anK|%SMM2Thy3!nknfw~{~~hz ze-e(}U;Xb&!GDPjM?+EpvVQ_%wu27(tF*Ied%Q>Y!G$rkf z^FLDXw+@O*x>jAfhZn#rB%!mp683OTqjMuklgvqHeMuAMNghOEtE#HHI@?}@VAm>Bjkv^O#kbxDjmJ|KoHOc=|GB{bD?N^lddqH~K<5wZp77Q( z7qY@ELQ`hfSj2630KNLtH*i*)B7q($BE)O!=G3X=BcG4$LbkeEg2{HXs0M;DFy z5-Zv3p)>xga;|ZoRE0tj91j$TH_gHWcA?Rjl{Iuh4#+CTz`?;m?r~*JO=eo!zHV;K zz<|Q#Hi%Fj$dG7~Ul8x2_p9eoq?(g~Hjo;VM99v@#)d*7(5>=xs+7`|4(zh(fqW!N zaSehHD55pa^gKpk@k2JLp<61%Yh)lj#|$ro+)SD!VQ^$;XJ^VKY|d1%jVk-!R#-U& zKx|B;(uby}`KYM;?I3)~6ex>S@}@_8^bKjYQ!NEV$(KWS*{;hzl@F5pm5^dPdwQ&= z%ky({t&EIbjH_s9XlQA5sczCSF-3N-c-X|mgQAfXT3u9uTeMU<9Z2_d4h{}g#Jb0k z+uPRl^8vy1-97Hk31LDKt=|Ea_T~2$XU+xnrDM1{`0%Z#4gq)=({g4 zh_A=j;Tya0~wmtoQ!S$Eh)52EqjSU$|u(Cm)C8R9~KFh?RIhk#bFzX-F;M4%=}|Lt!9w@$;im!IBkd@K77Xhx^x4G zd#ipOKJCHuGuoLdlNY>UCy@tgYuh&xpsc+~ zY5qpSh5F!$kZkFoVY9!@Te*0!(7sF*o zUT4x4@)XFrcUOB276!-m%N;-SU;<>=-u^`(6}=SyUVD4F#bGsufA2V7yFy1tC+iR( zDHV~qOYiv7uw|+lf{hlG$DqZa&J-A&+G@Yz z)7aRQ{Hz;h0bO--)A}9nbZidVk|OENw&xOx%tBYXK)YwU&75_W{b3iZxEQZ(CDMWM z14&I6L)&dK(${!oeZw%!dyclYzkN?dSz&uKRU)6y^DE{aGto29(z-7A5p9mJvcD!`rGrbIE{oL^dc={R2qd1KOrpW<)N zIGU$dr8x{EaljWa(eC|z(> zY9z}@Zo%hGKC@D`LVZ(Fx?&5p31JM18HIWZr?{b!G$g$HEb5a3pTn!VLf_|7urE-f zC%YAX^6u;F%b<$6?tEtW>jWm}};H4aqN#KQ9PE4cNHQbyUhrXfeDI0hb zo)8@^k(3V5=Jv``sNh_IzF>-~Zseh+x4?&a2n-E}R6?79hGt;L6gSTUHFm%y*w!(T z$04Jpre@cJ|a%JH2G>PK{#H{bgTaJ3CcwL(l$N-}8jP$)JRrM(AvFGLoBciTvhieZ1>N!@E6D9M$}>_Cj50}MN%ie0^` zv2g^;0mfk#+#Tm;_`5hIn`o7eB=jpDNrsbF9`c~DwN2T{;|E(CP;=@jelEg<-0 zGsrNTPPhZ?o5h4RP)%)B1)0r|B}oaqIP7N5_HAfr&>~6JcU>X)?aO2<=cna9YdBl& zbQTeCSfrFq%KwR_ZUw=1Op1HsxUMcQL7O*2z_4wA>soNYb5sGHvN~eIPDs=5A`w@U;marzFxTQ(NPgwfUxBRhK^we~&7s zQnk%=IdYc* zBt2OYh9Gjb&rI2vvh2ry>jI_-J!mCLh>xF{oz=8zMbb)9Km=TT$xPX}a!@b=L5Rz4 zF23pa4{=%a0Xmo^sL-hJfC3=E&UgFGxTTMYFE)lVvpINqr$$H1Kz58s=}&x91B!%G zI)D~s3)a|3ynBc0x#^Eh0)Td;|0T$c?3N=E=mT%xz6F8Z8uSJNk|2+KyBN&8PU(5y za0_C98?5@>^y_*}kg)z#H$Zmo1j(GHhP_T#q@F=!|p9N{1}K0FP8 ze+h6>ubo$%gCzHFixXS_Ie1rQmQ=qAo=iAb&tQO{N~iIn)(U(9LODH$ zL`;~<+zMc%I@a&~DMo+6py>LYAKSuX5gk!<$mKD#PQ~=CQ8@+BLlrzbLIkPFOfuL8 z{B5PpdW%!E$2Ci=*H4|t7}kg@Q4%@2{cXele!hYD+}VAMA+GIV8ZA=K=_%5M}sQN7eHW1wUlKEEg`pdaV?O4vN|j zQDP+T475&Kb^b1JWm6?`a-A!TV)e7c7sOr}V3qU2eZ${Ck_+h0K94orlqh0((MlrD zEvcYUt=T!;CfPsW;v9EIg;a{hhWtMgvuD2`s4$xg1M=wH9!&YYYm{(R93CK(%L z-CKNnz1kIFqRNfTn1`A38m;~wM94?x){4j^gO=0+bvMKiY}-TQUi-+;e8w6j)-)qp z;GY?7uwBpE!Nt%Jdqy=rVr(CCfz$oA=kKmzuwE39-Hsr63xq?{pJ?spX2c>2=d_P@ z^f^S#8uBzb9SH}(m(4`-HOJyHIN9H%vaTHV(U53!vgbF&n)oWShA@(xkNbB;V9Vwf znrcGb-kEfP2%y+K@FI^Q2kq}PTY7HQv6Z=d{`Do1lrb>m!W?2!Jm7BdPL!BVvR4_s zQA7LrVV)iwJ9z*dg+Lltfhi%#P00VV6JA2KJ+~A1c!c~GC=dwHEFsVISRDi_r@x4) zofg`H@e#H27A!NwA)+q`>(yy_7;Dne3(g!p0r%`{u+05o&TAG2aHtF+=|VncFFswb z>BDkQ57d^uU`=fZtXaJB^TRTX+w{?9hQQ4mAtyauL&_(2>` z^ZpdL;KavL0lSIjjknc`n13!Z(m$t!Bv4Hq z!O6ELUr+*eH@&4#?zpu1>2!W6TF$l8g#|BDo&xFC^5kH8t zbl#R?52*7^xb47^ihMx!gvwsv{VBB&?1={}8T-4EKTpAo3)%mdc_Y63AFle}M?<{+ z&y#sQ6v^V>Nhd+9l8xg$z%IulA|iV9=+Ai21khJ3D5$8ZkFcN7stY^J;&k?uc^n&) z-bCfcj20fAhFg^K7hiWzI;zd>#k`enT?wz(1fOgZgmrfvOIl=Cf^!JokwM)*CB-l{ zDD!0e#3M_N2uf9S)N&^7!18PjT-pnnUIK=C$S zVtZFaSb_fyNo+S`(f|hBzf0Vc)$ie5O?$)VIR)40z*4eIa?)2%=nD84QvG0{nN_gZ z+WbwmiOAAx70zQky_joo+- zbJH0M#2Y<^O^6=W1zL#CcBZ+QnF|17wyQfKgp1H~bAJNacXxMJQBhGx+0t^80($*6 zA3)nhIRhdBf&_louvi4HgPqKWQZB8m5D*cmYiLN&#_)ocELrnkmv;3R{!Qyq`d*T= zUoHWc0|ZRL%nb|X=jJE8u(0qGkZv+E+5GzYdg1VA3=BQ}{RN=FMhuL(Mv%Ce7bbh&>z^VFm_|>#CM61kF2MBRWm5?Imk1{)dvHq;-MKoVk0wn zoIWlzp#-@B31N6gXd~vu1|a(pQYMOug8=Ym$WRH45#{I#&m=D{(%wCzBCRmwXY-oMm z+)eUsXEc{+RH3dK%sexi)uanx$FuFp=DGCXd+bi1R%6V-L9G6r>5AdC^v|O?s=1&* z;<7uFf+Hy@88Hv~D0_eF0GqP|6gL2eOe0r^$xK;iy}6iRNbgsazQ+Ly7OUmXzl2 ztnJaLj2rxCuaUjeVi<+Y-oIDi2E3TtJzFUoaLMUfR|mu2!sg~Qlnp@AQTYU#4=@XC zZEaODJZ6fF#r+ol?Afx~&GBP)sfgI6aG?U(NEXOL2=@Ql=)SuoQZ)e8O)8ke2T{V* z!G}8DN!dbA3v zEt{uAnORv`OKPI+Y>wN1SW8F4O5-${@$x>sc6^lb+}}r0wF|?a+Tsaa6v??`w?CD! zO1ejDq=k){KKCw#lOTd*Pr`X;*fGT>g~={h3o0neo|b2mb@=3TVmR~NK)!{ zx-I!}a>)?Lau#EGPq&1SVjq|6YMZB)3P);&3X&!RhnT6b)1U48QoIfHg$zK5m)gE9 z@va*o#U|{SPF2{!0X4S$;&?rftxTP8mc60zf}j6-ui;Lw$Vd$EJm6OOOnkqSJ=3R; zfYt)!Cy>0PRzOj3;FiaEJNgTi-9X~_w-U}$pzS!bn>c4AxU<@KGB;F`i&yx;4&DG`M{IQowXYnC)h(PX?w-U%Y%tb}XFc9Y)ij#7Ip|?VK2W z@Az<~+X-&JJHxYys9&8lBY_rqUS8h%vpM(bgwd%^>XCqZU^z`a+xA7uP1PJgiu3$r zj*I>%35VqJctZr)^aPNz%!)#YtRj#GUX*#0S)*r`BBYHJ5wz1eP*WQ^$HT*Ox(W_| z6aSer2BVW?ZCWT^=G>IC+b0PfQuzGnzBv>M?Q#|gz%g+NdZ?h3)<{f9_|CKbYL{K*5wmjVKbv4jj7$?&wXzFmaxI>vG1CnKo1wgYIY(|S%xS2p)_9kd8OAZC}S zU~@g{<)V#NA~i#D12>@#VF2n-kZRxtJ56^8ore7K1^lyksWaBGrevuX<=ROz7Z?Ft*`)v ztO#XVZDj)|d;7({Dy69m(Y`MHWcQQJ^y$k^>Li5^W_E^1v3?f9MX+*JmE-wUP0Jk> z>l|0Wp+`-WxkIoAkkN3`h-c%rI#=wcsQH|?6|B%iw(yx;U&X}401?f4&H*$a1N-uS z@bpD#*a7J7LYnIBwdsxJsLr>mJwPoRB5|VMiRsvDQnal-UvIBWp-&CbDo>D5-MZQ0 z>7ofv$%EFacZz9XK)g$%p9_^hw08Ym@DG7}J5XrY?BjEOpO;Z_cFGyJvu4BPO%t{sDBP=fU+l2s%tW_YmnKDo6a!<_icL4pzNuI4_ zzTWl7^wb^zisQr7Pyr(%h9PM7wgM--xfCMXoZ+Ha_hBr)_bx+4dxN2XV_O3=jz&r^{_7r61mTCg(n8 zb|ey+T={7tQZbSRGCU(k2;mX!5%GVJtzG~QZ1+xhZgl_+m#kDdC;*#eEM1yt>{}C)ReB`? zNP`;4F;D{S+3M2Lm|BiO+qn|NX~2j&Q1E(R1Z^ju(;t_jZYY?>fk1<3@y!R#1}TZc;aYs^pFkyH zM3fAn8Xdo<(fLmPhL?uM97y~lhV9?UW_%$)S1Eem83r6UJ;U<$QT!)2H*nB1fR|aP z@`sjRBEG9dDHxK_@)*$!9;f{0{QSHJ*y$LL!)lM&NE5m~S3}nl^gH+>jGf zF;woLLL?9mF7UpwXFJo4m(`g0uw(}LL5v`1SAQ8Eq5%a`$czrSt15yB8tBik*dYJw zi5=V01pPwLW8)ccE=*O&obuA@_q7k+1jIlaK?@D;b0J7<&EJO`yvtg|ur6WXM#hGe zBOYU5(D1__2_l1-w?V@U3k7(f#wWy=PJ{W7Z6dMmf77kR!u%)KM!_cxhJQE3_&no z??+%!#Q%D3#uFq2x&FV74!(-u+K3$mob~dvkB<*9u8Z^Y<<-S0e8k5r9o9&fMeXEU zL+M!hR^Ce4pjKukDTx6$cnKH=sCPLJTmleRIkr|-DJAlrbuvXH;9SdU5@dE$Q!0Hd z1x8W7G%~@5MyGyi-F6rup9`=c6VIpbueSyhDXTEj(Jhi#FVDKbK{%hPcG?2C@ECdB zfbsXx4^(VonMD4Wg0djcmNBH4ejjnv2n#>Q^nswk=}NA=+;FU)Y_herYk%a{rF+?C zaN%?C#ExQFZ$!Z%>)x?6bKNnh3?7tGrSzNRLh8U-3cd#e|FgX5Z7F^yv*wbdJA$j% z#)Lkg$`D5>G#nL{ub}Y(72Kwt7f7+JGaw$Zva>2m7rjf+7l8I zX_BeRk1z9~AlqGRRG0~)`F`=YfM3k&_KV!;erlFarabb1>oTMx>lREdk9Oy^6 zy}khYrxvc~J1ykg(2$TcTsgbRViPQ)uPgQ!pn_L_zKCTx3&NeVGiR=#`|0*TiipJ) zNZuKvoAv2a-MziKK&&51m&PUI{qo92{rF`W29}Q&!9p&01WLyXq)PziBbEhjj+NEb z$J?}yi$U`~zDKsxM4{nGj3vO=pjnnesW;B99z47y>F)Nd!J6<8lo=&q$If(!mYF$? z8fP1=QPA}>c?pn9_6A6FK$%C%&jBF!_%M(Z6P9^U7&oD~28}W*l04IslQ9BQ09wlA<|t?O=|gW? zFNp{V>yL+|!4q-VQHdWrxnCc&Z?O2rwuMs2+oiw;Rzaz>`EnXJGDUDi3mm5kEA?b$ z?0jCW`pB6N*gY2NIxzBLtixsd3vmihQ4Rn<49F^JN%50u@d1|pQ9KV0R{#@p*_jd} z%aDm_x|{*!>rwUp*4|qOW!1Lvq7R@VqM#@sA|;`;Al-t3v~+_?w{(XhBGN4&CEZ9j z7TwK5igZgioNM{*J>NcOX3w0N^T(MvbNu6F^yP_luRE^mS4%Jb_VV0ZIuw;Vws0yl zM13^lvKSHJ1f|6b##wjpIt}nyPuBMJDVFW!*+D%(e++F(F`ghWlLOiPVjI2RFPdl) zrt3PsjTIMt6UEN*RBPz_W8*8>*AhkTpn-m%RT2j^X@3#0Ze6-S;@}H8|NUK5R5a5Z z(jWXKPsaQcaGU%ulM)hS`7`0EatL3%?x680xqy-!K@mok%7 z8KOjG4Gd^#hvCdAj%m7NLUld2o#jK|hYv4U9k6zwX^>;xU!P#Kr-Opo5+>I*_55o+ z18*FId;ZgC#4m$270f0niX3TM(A8$0qf3l^ks|C8Hg-wz2quTG<^+$q7!W4X(m$s( z75J@UaW62IP>QI1wb1k~IJy^j0rl;O>2|X3cBk1|Ly^+8Iio#AVUgi{Ewx3N-CHEk zCo>N`geD{XK_1-4@k664<4}qYNriB>P7PO`bbi7uxn>*= zZzxli0pR1^oq3pa`FaiYfCG4e`ms8{^t~;XJ&dtK22+`r$PVsVAj&+*2Jn6>*x-99j}Gs#b_ zh3)3AniFVwm(xw~_Ox_5W-*Z9@#cM$nLHfXMp5d#y4s_>F*^%X)Pt1wOK|YOE;=B7 zX>njcd0D{Y$iAQ+S_@I8^5@=(oaRJzx2&2|LU4Po`2IknT@L`HSmxR)Sy`YFZGKiUi+hGAW7ZVkFm+g{CnPa&)l$!7Or^0^y zdO}Pb<230aVQJuvK&5LvDOin4DRcmqcvGYa?7YqM~YI`XMXkf_rw0N%dA!Y91YDxODNT9wdDKGZ$?^r_3pe;5fKr- zs&3xLB4hEk-A?Oh+XUMp)AmNMF9(+1Y$n*t8-s03J1YI>isa3)rl?R%I6y%F($l`Z z&|M;3vxCJ^lfq7`;OG<(7#N6wb%}?cAKq5O(%pNq)HK3puiR;sW5`r>N60>xSOZNss*r02Ryj#+QNd z%$|q;>} zcP_bzkjApk+7xaCR5cE4)G_VZ;ygE`vufei!@iMSCVx-B`EPO$%?BvPovQ)CLzJFC zW{L{B5xf);@s`u$%zwOpW_9$|#|Mr!dGpl$Vv<{LVGpI?ca*6RC*G&|0Z02%`Frp% zUV%=iTUW6uJ*-FK<~oZ6C&(vW?Dro)IW?m3dVXiAe>I!!?x+I*C240no9I)%3-%91 zDD0pS9jNv=c3d|oX=o6(j3b4*2Zj0+hK9Nq{&))_MJ4?CFWK;aqxOT66!rf&2JqX7 zGK0$F$KNN?(y~HBUsEK7O2=3w2+N;e+up45SW$(I&eKz9cY4Z*@QUxfnX}U|r&lL$ z?-h<;uA3#Z?L8ivTS@VJd_y@lF_9&v{cs;H1`ny+A0=~Wt3SU^LGHQz`#WQVd`|U4+5Ke;iTx)PcE(A>dTB?}*7Ytg|NLraR68OQxJ^o=s;q8O zB_Qj6s4G6CGS!oWr>9lhXm`?vD@No}U;Zt#d)&NEsD7`{qEqyut7McIkz3H|$;po( zaxMG=?XCK#aL16zuEHCrZ}q?fp*{sGT6hS}?BynnwvUb^or%L_KY27o_tQCj9+w7D)bk7d% zJY74Sa=FW6^N8yV^z5@0K&!?ecUtDL>DDAPtd1Ld&gZ9$IyJ7so4*D%%L@2;4oDit zlLd52=2bMsh6_r$ru00+Xlaz^-q|ZCe6{XbPeus zm13gR>G!At3fFV%u;F3~fw}@;B4Xn0F)UQkBc#b`Qh0F7^y}BjcrKkg94$3EHMtJU z&oPHW8tIry$v0p~%FP!zO+vfhUn?F_*Vngy{IZvDszttjdy#v=w%LDDR^sUS<)Hxh zb?*4U>LII>C?UIA;gh33k|D`&e~5aHL8*zvv?Uim=rYPoKU|yiF#Y>GH(%4GrgG8m zcMdBz*-ilGCTp5QQU%h}x3t1=+CBfSJ{rsI#8x8>sx2UX+gZdx zJ~w9D5fx*s7rUMB7ota#A%W%XfrRX3MgIBo$Ap3+gkFioY4P{kva1s7L7Y?XFB3Jj z!o4-Y<`De-zgaKN*cy)lJH~bV3LC0^D;{q~f9$HpZCo7M&?D!WGXI&FyR<}d;*CP( z@WcG|mUwYL(b{UtGuya3cX#D@*;l8Xxjr~pZDnxVb%{3V8mTS&qrh#Ku{Qg-m7oCw>aNf|WdsEF#yN*QRF{BPap=ggbVj0DubsbIPSs=9~4%vFD{$wj!^~AGG>CGO`Lje(VZMHx3Tv{IMu=Zowh(L9*+! zqg=m2&FxAn%W(?_t=oTH9D@LXM4MxbZ$AFeU&q!&MeYnoGsnkqP3Y#0Gt8ky4D5E{Tp0F6q8x-5J%+k^RKhJMT_E$g+hd*Anj-vE5Xx_2kF=hK2O-v6UVx zLz~Hz=a;UP6Ehfe6z?&F&xlooR{wi)ajZBf=C)FvM|;B631;Wn zRc1iDd#sgE&fakID18P720qofVU1Qy>85>PYZ$GtaC#l5_NNtJ_dLhEUvJWcXDof` zj{e6oYx=>xuClukf)x|hje@=?a&`FJ7RvS6*56cO$6X;<%bQi+oGm~X?b&F{q1=AAKO z6geXFI7k|+usWUF*f{@zd*l6y*?uuP`*%gi;8MSHv{ZM_j-%e6N#}KaYi15{b+?!- zTQX5sPqNp%XE~mXSR#`Wz_FiKy*_yTe7Jz>aGP{H*F!q1^`*jQ^YI|r#0DAb3W_=1 z&93{L>yt+Dp{v-tGCLb9(p~$C{E|Ac%22FtKFDo!8n3i9>>jUr!@|;ZVm3u!$bN`i ziSg*J>DOm5iOj^d&JHH*=Oh)pw3>dTa8ClHYd#!dVeL@}rA!T>LYr?u5XR@|Xrh~h zbK6a%H}LS9E}Kskcd|_}#VojTe>RhW3o-jzDVfS!V9m+luCLU!Jalg1v7)}c9II|ZJWJLR7q!}H!AFMl(Mggp z{pLyPBW6653ki71f3h$$uUQzRfta|;a`qbi*xk=YRRU9tGj%>qeKn2Y^Fdy=twHxw zoCJp0RwG)^=;`T?*5I{7$ZOZT2D?j)E)p`Ebrd?X3SE?TKha*OUq5og4obJ{zt}yL zq+2W-$@XKkBGQa>Q>x(`mW1451wU$518yLG%?K7Yre;~2uh_6}++6`Ff!^Hj2Kw4u z(Ynk;oUZ1M{_`t=H-^^J6=QlNam)Yvy0Y1^iu0qjilDhRlI`3HX_5LL`V0+&nT@ZI z7uRn%mzh`p-n_IOcsRJ;Sf=9~5yGuo=cv=Li$Q69PXx<3O9uqTw;vhRL_{F-|4FP& z10nrH!_W(T)q1U^p^nt(u^sJ$pYLeL#0AIn&0il-9OdJqTyDY@i$TPn$606G+>bGA zF|xbFJi-&F`Abd7?iDare=>sR`4-x8dB`{J3R%f{d#=Dld*XfR?_Y3`@Xu{D6u7(S zdzQK3*fQ|c{~`6zWMaECS3}sYau>Iq`=s%c-kL7VbI#*~eLzi4A?(D)|cu6c58k!Z!9p}(r!ZiZkHvs-Xfv>pI`L<%PZ5taw~Mo zK^)v*vw*qy#n7#utK)W93Cjkzmij^`g;aT~gYXJ-y4n8UT?~bz1`O2fW8}sZZe!zJ zC^s`iZt65$f1gCsqU(Or1`N*N`1acSN79wY{{}vt<%SZG)zJAh6eT_~u+EL|{Ei0- zERok~k=-?}7GN|TK|T*z&}P*|2KubX_0vzWo&2=+*!99fwfMmPOs<=Wg*l{>XM()$ zaqwXS^LZQ_g>h!7b$OPI&mpX=eZ^{>5q`1Z^o)$<^Q)-PVc5w8(x9dbK0jJ{A5I$t zcbb+`Z@M>%l;o$4qg1m|BH>Cm*SuN{aZLHNHQors)5)IdVWPIc6%|^e-QaopM4Xsa zcfiS&N4v`QsqPt|*}K=yL(|gIe(LJReD@rz)o4!$b4tKA?X=Nc#kfj$^lQU3b@h(o!;FDD*(ZO zl&n?;=x`I*fg=)f?Q=(OU~T5l^gPkp$==9k91te6yyNA+s!?VZwTchAZmqC;PXYfZgg1j;SA!Ol z;Rdwx%t-1%mT-Pngw53H^v z;lgui19|B0$z5i{Ku09$%JMwf+lCj46Sx2}4wKMTs^WN^%Gckk9RM5wEHN?zAZ=hn z`>Q1#nbEE3*&rka#AIkA)!WrL z1$?A@Am)L>AMnUUTlGB_^)4?(wZglN+({OA}Zy_L;5Vt|0+I-Y7<92mr<@kbM-IK4WjH(BO zNVVh0dUhD#d1zp#&(kHhG|(~QukCvCrSrsDefS>CZHrJc5yyO3EUo&IrN#O(pB{d% z_E{A!9(#7`b+({;n7Ybmuw6aS6Pe1$^I<&9BIlW$eZEETaOm!8dCr{pipD9Ju z7kI||(xG%N$!mX^HA}Np^GEyTg>}-U2wfaC-mdap zSB9~`R)i05TaGFRa}6>9Ec`$vC(>11qq0^a?dPzbSH9*P*+hX3e|A{R->LI6HQkIn zA9_I%Y1in%2sN4wiP#FD^0w+D&ehK^L=q_m6-hn7$kWCqzSZNn68 zn*|1k#<-=q!I9k!ge~E=ldz+3^1k?nT4E{`m6a+|-PE~gf+JG=7ha=OCN3`CW&j%T z!P%*ZxHwnapYI+1m3I3Cp9*^_n2uVmsE zd;UBWU0fq|Il9Bl!&c4l884-Buj=Y#ofbi6E$-}BKqgzmr3RJS$sb^Q0*WR8t|U>> zx3#P^*c8H^+MSCV8>}_yOtP|4-q9`-6BC0b4*|=GnNidz<~z@?c_%iBTRZ|14Kw3r z*#8akmDjnoFiUr0B$p`PXz}X~!Vb))v-cO&RGhOPW=9ti-Q;QxZ9%zMyd83zO;omx z9dzjFnQ@U=IITyvBYAX!f`W!`lH9laU0HbNBdOJuT)lN`9P}?gkZGgN zI6?;HmLu)t3+6bq-2&TzagmAgg1eDJW@(tES2(0Vh(x>9)YN3gSuCTbrbZZMC<}3e zDK%MQuf78_@}m67Jsb?!uPUsfE}CIzHkz+!HPgH_(Th8`G=A$dEPPsu?6($<9o*R+ z($IGV9v&p7>y$iAzr*qnPOupEwV(XK>eezr`!prW`I;&OC0rYzfH3e$eWYpU(0Rj$$Res@Nm``1|(D5pqp5a(r8eL%AKJ(%#se)w}o-!Dg zdBMa*USsOu_Vw#8q|V!(5+^qYlBAAFcSh6l@bXF)aBBT1U}6rKwpaE}5NVi8UJZwa(17exwkBq&`!CH3fGA`7a+)?Ds1=Me z7S_I+V1Ns{AIncg_1yxe2kdb}4n-N5rH+jsU_;RaXu9mXSR`2QwAo9CB!mWj{`^T> zc$Gq^7L>UJN{L=)F4-W}{KT{tz*^tq`VpA-PkmVSutnKNw*t5IY$xn`s$BY1rX9c2 z(O{u9vO7}I`MGorR&&Q1cQk9n2(3#`y1423)8!)7S-NQEUslfny$?xb-@kvSYbU0Q z?>~h^07YuFvICF%L6J<`S+q_4xyz)eo;jQS;`0bv46TjQ2xq7- zr~b6(;-b@(hgvX|gIh-Rsam=jP7jW+x%ItkS#6Su2vW9t<0cu%G5sp~j)sZ_z2f z+yk3JNvo}fFM6vgQP|5<#}Bw;Y3##W99o$6z+~oYH5@H7nyTpS3T?GeFOON8hvTIA z@Cnl4S3~mP$_v?xCM$ZpVVR8TMK27on7xpl%%hYarb%*GrrYY*X60f9isdQy?Vxgw zufP&V49dO(8q@_epVCnbhQ$xybbvj>;k;q`M{jEUl-Cr}B;1?ZP(RtUvi39&Y|)5G z>=Yay&xm$s9YOwI;vx~7weMNZ)M{u$%~9$~OK=+A<-tMO(KQS{SdpQ>vJ1d{`-cNKERh7|s@a0R<3G4LwdI7U+%Q=cJH~zBzt5-A$ z?SW=Ep!9%DxEu#BR4h{D!EA>9Z7Q0sHEe$8SxPTOf-7YRlMKNTX-_L8s z&Zi-W@|P{gHj%ufp}m{3e(Gdr2PDJ5yekAHsCWx`9F>;hkqeBtMgw0LclGCpq`Uve zk9U9pfg_YgW@=&MR+C=k_08Skm;d_e{o?u$<_$5}cc^WL(zQx_S%;G}i)GVbAE|jF z*iX$tV_SG;{hAewPZ7%d;1)B@Op1asq;)-o<)W%_a#{P^sKv!aaLu=oE2fGEL3YPz zwd7lC6B846_Y?RV!y^!1wp0(sU07sL$x$EvEbOIgL;__U`g-zMnfb!vV%D+lA{FGHdg6p_E&>BgeJa>DyY*@D>(GU)}jE?tT z`GHOab}X&oF1ZP}9%dyevL%l*i{$oy9aZ_U%xE!D7H{?!!@%gz4&IE6}5yCfqXtc!i7T_28t@AqMX%Sp`cRa?rj@0 zhT5F&r#Cs)rK_P8;ca;Y$+XYOfwnML+30}-JdPQQwM`$*tI`juW7#D=3{!*rE5s~$ ze^#e1_bex{kbch@PWUih^imA%HmKvhtVK^tOPdZHiDLF8g0RFbueZAsm|(|bPscXV zc*$E`B!s6nEX#Hs`?dT96G|rqif+){Z9U_*o0oBLN`7*|^7a);d{`|HbgE15Y^w;h zNCe)ddW5jP4lR5l5>KB>I!A@ragvadLNThObo7A)xt+9p?Gk0PSPn;0LJjaO`ZMKY zBymLO+D%Xl_oU-MMwZyRwN@3UGna-hbRhD|2@+U+cLJVGq@+SH(t(;D3XVea(5Y2UDaHRP^7oZ_E2Ao4fj_G7O#SZ67VDin;!gkg+2=z~8VrVn5DN_{$yN8Q>MwoTlFU`c_p}gUX`cOave8HLGDZ zT)^zMWQeN)cIdf*LnCyApWM!=#tte!m;yvCUyNdQWAc@=o~Il) z2ogL0M*4}T7fr|xLp>J~oJx1{RzZ6fRHh9o1vs4BGV9aiEdz`4uu9S44+jWapo@~wyhMFcD4II z1~bd-Nx3XOb*SX2v+*j2dJXa#!F?5fcKM-3(|D9`JX5RIk2kv|=Yqs%CwAM%8xka2 zfIn5XeqZU85}dqieYMSrQ83=En)9 zQ%k5qhpOWCVe~850Vtbeb23-g;BbjakezsR|NOn5Uv37h;%tnTLEZcK1`>mpBEnG2 zQN&gzWT zS_j#Go_R2ZFpIR3j~2aJfwo4KZFM0|$B_V=sb;?4MaXYT0MBWuX+MZi zlErbWytoT>C%Da3UwnUyV5j$DJ@r$(WzhZk|ix0sjgw3%|eX`tHz{7V-e!77XdU}gKs1YLqCPp> z&q+<$&LI#jU~gdx@*L~^ul(8Av|J#bS;Y=LFW zs8*XCwZ#HV^QQ;`0z6i&d992%K7>3Q0I9_fAX2>`xA9Ebl7)0L1k%jyu`|f zq-(uub7xyg$A`E8XJO}&H}=(kFHDHsSvcDD>=w2TECQtR@QQ5S^FreYm!_TZW{=jI zpar3!mPf>io-AWjwJRnKxG@%IyN^*Vb1-Uuh1K-+Q=3BYnN&M&zg*M0Cxd5>8*@}? z6au%)0i)Wk#Fukfp7qcQ!IA!<<+j%6`3e#|r&oV}2VbN2xumS5Kf1=r?s$)?%KBN& zb&1TwE|uc}Qpd{S+%7qwSdOXsW_FC%|7jeexEu1_ntg#zsG$Acfv!-s;rTyv&R!Z$1dK0Ja#j#y=$u2?b65MT%0P~u^ zm#{p1QqK@QhqVUE`8muO5mru4PMEHL!>GA=`4J*T`h!pZNuPZE*sDnC00)G8`P0Wz zD*wKIQTIk z!@FQ|kIi5H@i>y-6}%kC2^UBC)ULiztM(Wfqc}dY5)Kay&6)~E{rv{t;+qa2*G)-+ zHE#1V=ft{~%T`7lUmk89sukATH3e2aG9IX=ov0?O+8FpvufpJ<7r>ISgz{Dsv2y<2QLE4&mH~QW6KFyR)>qGrw4Nj z8yhtW$JSg$Z}y>a8E zp8K>~x|BLz;zv9?`<+5{o8z98-j^>Sj#`^6EH3Vj+tTutbDy>Ps}K75iVVgc=y#kt zNhzZp$KtMTY86k1^s~hrRKc4P!kZdMOG&lO^=IOcdO65vePE*acjLQ6_ve{kkG%bmZNVZUI?febD|`a( zQ-LIJAVf^9eAQ7xLLaZZ>=_Zhjcs`P#rord3p@QUUk=aASjuKqL`0F8HtxY;XS1c3XS-7`*XyMvot#u~g$gZ0B?0g*cw+Ha~Z&Osgd-G-H#(S&0 zf943!mNMff-TQ|dX-BRHY-p^HQ$vd}v%0ERVKov;*F`*vqF)0WTeWQe8{@H;G(mqRA?8zK;Ll*01bG#FZM?Q7~?= z^{l3<37zI;>`fLGSPRMYo2`}xS~sH9hGJMZl(KqM?`OWo_Iur~LXy)m^H8!wPOjef zN~`OyPkYz_=~4MIrTv&>_vvPy7n6zmFFsC|YM) z3hPG&UACq#;sOC=ZvVPyC#1od?Qp90p6FX^QPDq3CB%+o0yl5F*sAn=b!k{X_v{T0 z6ZZ8IqTY$frV`6|@6IKtOqag3Q0A3)^~6r;S1Z|Z_h$hzx0$9+kocVvVEmp2V%6I# z(O?L4gnBiuoa;Y)1B|=0d0g1{4r~th|19;}eEkZI%qey^$CwnR4U|3Oa|fTcW4?6j z^0*xR@bCY+B|1aRiV4rzq zeV5-^K@S&I1lkkt&k@l^>-)j`(bpsf4Zd0X8TM6QHLI}wao_XI91pfYqtBc-kDZzV zNoP|#k2_yX`M7nh@EaO!{Y5P`n0>eM@IlvlqR-^{3VVk!QF(~YV8P%{veyB7kG^Il z!#17o)s@Y7Wx109VKyq>nQ;<}UBSJVxpNJy6U1IiT~!P1#MT8mrD|rPT&L+9aSE$w z^HVL?_L67M>pd8s4okr|bOYQ#q^p%fWaEmU<-~njT8GWRXW9Fo$zSSNuZeazw6-Z| z$uBL7VTDM0Xs=$=spD)#Q`A;4-8)DVc ze4j@@kjgZ#i5#CsM474%WA2;Xc9Fhp!aroL^X0=}0Ttw+&yaG|tAy{ZfUV zsYk`V{|>-0#?iL|e&ggvWTtX5CmmB>mh%{zZ+FvP#$uc7+HbAKyt2+J9}hGpXgn?CSaw5xBAh)=Jlf)nb;9xhrQKo z#v@JK>~npIz=ExAEMkLcr(>O;u+o;ZJCX6a1RphfSI4`{_rF63|L%-9Vkj4ofx>|O z;D1@4+SDw2@44tWP?SpWdnnYmOaGf6vy#AGL+qX{a2mVUoEBDAatqt&K`2lFCc1jA zeG;};?<@x2z3bSpUIC<~RJSVpFvMvbX*>+~ps!p4XHSdy<(`%Z;3ciuV?oQ9Z(SJx zJFsfj^T&Ge2mpLYKr6LqZkGfwP^&7^E!Thl12(;|p~^}g53xN+b<;Hx22E-`2Fknk zJt|0`bqIAjq-A|0c>`HyB97OS!|pF(?tq#dNeK!MzqbnzI;8K8R9Ghsxc~{o+Hg_~ zs^A@knCtKS{3v*BpA`RwhW_OAv@EgsuC=vwtRN-jZ+P^BriZ(GZHF7+l+cX9ma5GM z!9Y6Zb@%nXqcg#s7v$FnNy>Tgk&wVi z7yuG88~{ZXw}0XbLpy2`VAhxB@6giqmPJJV)58?#c#Ge;A8utoZN=^VQZzdQjuh$i z9biAhu|eH0BQ5=C>hMNQ>GV%O;s?(~7y@q5-#Z7(O@L<4XK1=z4@X}C#YFo%N7cm? zfVT%)<#_}AJUq|#qLRk})`}Qhu>_kQy&HyRu*&QvSaXovq+RVV0MJ4Pfnq|s7(kPQ zcK%@0heUxk1HxqZJEUu=Kulwp#H^l%-63@jUF-hV4FJBBJ@2YrxI!aD)n1c+R0HS*b;+<=Z$!q@Tf)aW(-8 zl>5v>wJB{jOReY7qw|C6rNXPM`1mona#|wkhf!YXUf^VA(Wy2l|6Irn*uzp^I&D{6 zKUQhSJt`G~G-bpmtylt&2eORSD8lOj0GmDamy?Ek$(+@*vIiPB2UEc0vGoKjPvMVm z5)gosJX^3A(4*O}7y7|&*ggyXYg$I>f~aO7@S`%gEJw5APYh@guK_2Ju!;duD2O@0 z+j?S~M#Tw}F$QnzS*Y{>ZmPi~8J z=n$+3MP}`bCK~7PL27yG3JJI&At5ny&z?*cz4VFmfB?ikB<^@i{VZb(Tr%Ht=*_qM zRy>J^^f*thg;VZk_w*~3Pwt~BSy#cgP7=A5 zBJieXBXC9>$0Za0L;~HxvMIB>@<@kCfXjPOeLd2V~}-g zcqbVo*rj;Dg+J7gE6kD9_x1!MN?9vtGCuVcuJ=3>k3F3UJo{qkQn@tCFSbaMVxw%70B!>lna*8cr+f`PRM;Rmk&`2PJR z=wIUV2MYD{R26F&=Jg&344_a~&Lp2aR$lBy{PEyzbcMwjNc?u87MP1=rIgGCh&Dy88NFYH=#RM!!mkd`; zRfY*En!+n6qNVqQk^T|Cjp|pt9#CZv(B>*$moE}Hn3m8+X{qmj|gmV za`iDtmssf(8~B6X0GPI1Gr^z%&PP&C?J5JnQJkKqkK+!5DBc!w;$8Yqu@@1G-{>s3 zBGR|B6jDp?%=LoRZ6=7jV+q4``iHNk67Wudc!t04{>+;?s0`0LQqERf-h?c1gC=l% zJsUWN2_7C10R|`8rA>Gan%6+x^^!T)ONZPLVP{Zj@}%aoK1^7;c8?$s;s7CA<8fR* ziaBb?Oppvd%)Cy76)qlk%pSaq#?0`#Cz6{d{OZ@`m7&A!g?Qjk!BrKMIqPum-aUf^ z9c}Hr!l3$0uZ=r!)!|M~Ag@d3z{kh-+vWc<$yXVG?Fy7Ja9bczVCa5VBP?Rr`fVYz zNU~syX=wO0t6u$hr02B?;H4k{?-TqLR-P%R=(RSRa}3Q_b$1^=lPb4T(Vxu*!iW$J ztJDFTxAZo*)#p+6Vm!p1AIokCbDP#Huh_8Nb^;ScxlalEbTOw09P}q)6694tD#Q(3 zG_iB7-Ik$X{0(;;WIdU8TDvk(9^fN|IC7A~Y{Md8KLnGby0NTeAAsL#`W}~YJ}lef zMcLDSYO9v~G1TO4euIG;zZj4ldZfmjJ4ou`@78UHh7eS$g*~aP@V2qvoJ*%JI2t z-6C$@pAi(N>=g_dJfE1cAK~^8ID((bA;QpL z4v2K1Mj+YJ#e{(;B@e& z!`i6Ike<;-Z>nGI1?KQOSoRhc@T^fz+7^JrpHF4CeToT>r#zX~ApFJ8Iz z+`J$NOVE6X$z?g5R=Q36xjtT7EdbDuGDDK#(ajQ;tqAcJ2hu=OP84e72CSy=ceRjJ z&IF~FtoYNXtp@h?r9+Rud>Y8od&Z{axg$pqKdVFd7L*f4wdpdVDKRl>;P<650A<_} z=qCE&XwKUD@;25YE=mPG5zKGWgtRI6XgE*N^&W0Z840&l;q2V`j!n^0auqa55DQ?` zj^Q1>+ZjF8@ChEUX1@q|xitDKHY*0S$mEaJ(gcH_u}Av^oD8%kQm3I*Vz7!jqS*uo zqNqgqU@2>^MX8+$Ie0Az@7`rHcdO9z+uYlWR*h}6s3KG(FVv|Ss{RT33k$*L z{!GT8f+)UWU*ds_Z+An-VYbxE>N!zeG` z4O13+H@F;o>hy70ZSBvlW}MDda&U+Y%6DR+kOzCt-o@#mi-0&A4Ku}4XE&&87&93` zJ~%Mj9wUEDvX{rp%WEp#Pa@#_8o1Qmve;N5k7|KJA0M%d00yn{Rq$=6!-JFkZvT(Q zN0a->KI9M497vr437Nn0XgB1-;`zyfaL^r2B()ZnSBf<#JH3&|Alu40VwR$}^jB%6 zA`x-e(Tz5yaWC&l*cn>4N)ixdD>lKD0~vHV$p}{cuRBIEOk{!5K6)RxE)2;Ay%OMk zhWi{49PDUsKX5u*6AN7e(%}5-D-j7>hMO*qM-m1+FQONXu7XX;49V64omA;rqr)Tk zB3J)sUkOzngDhfU1ZXr;Lbwqoi9Xx3CjV3UQ##dc_MtJ;tQ)--60Npw@ekz=B33&cpm2Fk*ZWQ6Mgpz#Nap}!-s^i+sZ)= z1B%XcdP+)>sZjX1AS8jSurlKNO6X$|(iTKZNlq5Nt#D1kWRYvSkp0O=3@B zVEXsV2_TZ(;}Sf`()wXMB;VeXh4M!^;=kj{A!05(2OZYah^HW&f_Cb&%}`Bv2lJAK zk;3-JSB)Pb%3E@ZH5HBu$!;X!n6zUamzbrcuPur(Q%>&Y|GYd2Vx-*4D-(j_BN)3`}?~A9QR~ov|H?<`tQWB$KaEP)%c%5 zM@F6;d4hWjMEC!Xo?F5JveF+ue6ANCa1E3rJU!$nfex5c;P$)r|Nr>^#1Nd}t89J3 V6PVl&hP#TAd@A>(K=j4i{{xR`)sO%H literal 0 HcmV?d00001 From 8a8989c309275a051e1f008d039e2c7ed3a56cd8 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Thu, 24 Nov 2022 22:01:55 +0100 Subject: [PATCH 43/80] minor fix of available metrics in results function --- R/AllGenerics.R | 4 ++-- man/results.Rd | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 4bfb89fe..e84bab35 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -925,7 +925,7 @@ setMethod("results", "FraserDataSet", function(object, sampleIDs=samples(object), padjCutoff=0.05, zScoreCutoff=NA, deltaPsiCutoff=0.1, rhoCutoff=1, aggregate=FALSE, collapse=FALSE, - minCount=5, psiType=currentType(object), + minCount=5, psiType=psiTypes_avail, geneColumn="hgnc_symbol", additionalColumns=NULL, BPPARAM=bpparam()){ FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff, @@ -936,7 +936,7 @@ setMethod("results", "FraserDataSet", function(object, additionalColumns=additionalColumns, BPPARAM=BPPARAM) }) -aberrant.FRASER <- function(object, type=currentType(object), +aberrant.FRASER <- function(object, type=psiTypes_avail, padjCutoff=0.05, deltaPsiCutoff=0.1, zScoreCutoff=NA, minCount=5, rhoCutoff=1, by=c("none", "sample", "feature"), diff --git a/man/results.Rd b/man/results.Rd index c7dec171..9475839f 100644 --- a/man/results.Rd +++ b/man/results.Rd @@ -15,7 +15,7 @@ aggregate = FALSE, collapse = FALSE, minCount = 5, - psiType = currentType(object), + psiType = psiTypes_avail, geneColumn = "hgnc_symbol", additionalColumns = NULL, BPPARAM = bpparam() @@ -23,7 +23,7 @@ \S4method{aberrant}{FraserDataSet}( object, - type = currentType(object), + type = psiTypes_avail, padjCutoff = 0.05, deltaPsiCutoff = 0.1, zScoreCutoff = NA, From af0834ae753440d7a97836213b57d61493f6afb0 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Fri, 25 Nov 2022 18:01:10 +0100 Subject: [PATCH 44/80] improved selection of splice metrics to run --- NAMESPACE | 3 +- R/AllGenerics.R | 19 +++++++-- R/Fraser-pipeline.R | 2 +- R/calculatePSIValue.R | 2 +- R/filterExpression.R | 57 ++++++++++++++++++++++----- R/find_encoding_dimensions.R | 4 +- R/fitCorrectionMethods.R | 3 +- R/getNSetterFuns.R | 21 ++++++++++ R/helper-functions.R | 10 ++--- R/makeSimulatedDataset.R | 2 +- R/plotMethods.R | 16 ++++---- R/variables.R | 14 ++----- man/FRASER.Rd | 2 +- man/calculatePSIValues.Rd | 2 +- man/filtering.Rd | 3 ++ man/fit.Rd | 2 +- man/getter_setter_functions.Rd | 13 ++++++ man/injectOutliers.Rd | 2 +- man/optimHyperParams.Rd | 2 +- man/plotFunctions.Rd | 14 +++---- man/psiTypes.Rd | 17 +------- man/results.Rd | 4 +- tests/testthat/test_fraser_pipeline.R | 2 +- tests/testthat/test_stats.R | 24 +++++++++++ 24 files changed, 167 insertions(+), 73 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 139fcd83..436647d3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,6 +6,7 @@ export("condition<-") export("currentType<-") export("dontWriteHDF5<-") export("featureExclusionMask<-") +export("fitMetrics<-") export("name<-") export("nonSplicedReads<-") export("pairedEnd<-") @@ -47,6 +48,7 @@ export(filterExpression) export(filterExpressionAndVariability) export(filterVariability) export(fit) +export(fitMetrics) export(flagBlacklistRegions) export(getNonSplitReadCountsForAllSamples) export(getSplitReadCountsForAllSamples) @@ -78,7 +80,6 @@ export(plotVolcano) export(predictedMeans) export(pseudocount) export(psiTypes) -export(psiTypes_avail) export(results) export(rho) export(samples) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index e84bab35..8a4bac5e 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -724,7 +724,19 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, !("annotatedJunction" %in% additionalColumns)){ additionalColumns <- c(additionalColumns, "annotatedJunction") } - + + # only extract results for requested psiTypes if pvals exist for them + stopifnot(all(psiType %in% psiTypes)) + pvalsAvailable <- checkPadjAvailableForFilters(object, type=psiType, + filters=list(rho=rhoCutoff), + aggregate=aggregate) + psiType <- psiType[pvalsAvailable] + if(all(isFALSE(pvalsAvailable))){ + stop("For the splice metric(s), pvalues are not yet computed. \n", + "Please compute them first by running the ", + "calculatePadjValues function.") + } + resultsls <- bplapply(psiType, BPPARAM=BPPARAM, function(type){ message(date(), ": Collecting results for: ", type) currentType(object) <- type @@ -925,7 +937,7 @@ setMethod("results", "FraserDataSet", function(object, sampleIDs=samples(object), padjCutoff=0.05, zScoreCutoff=NA, deltaPsiCutoff=0.1, rhoCutoff=1, aggregate=FALSE, collapse=FALSE, - minCount=5, psiType=psiTypes_avail, + minCount=5, psiType=psiTypes, geneColumn="hgnc_symbol", additionalColumns=NULL, BPPARAM=bpparam()){ FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff, @@ -936,7 +948,7 @@ setMethod("results", "FraserDataSet", function(object, additionalColumns=additionalColumns, BPPARAM=BPPARAM) }) -aberrant.FRASER <- function(object, type=psiTypes_avail, +aberrant.FRASER <- function(object, type=fitMetrics(object), padjCutoff=0.05, deltaPsiCutoff=0.1, zScoreCutoff=NA, minCount=5, rhoCutoff=1, by=c("none", "sample", "feature"), @@ -948,6 +960,7 @@ aberrant.FRASER <- function(object, type=psiTypes_avail, checkNaAndRange(rhoCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE) checkNaAndRange(minCount, min=0, max=Inf, scalar=TRUE, na.ok=TRUE) by <- match.arg(by) + type <- match.arg(type) dots <- list(...) if("n" %in% names(dots)){ diff --git a/R/Fraser-pipeline.R b/R/Fraser-pipeline.R index d6a106bf..8c1856b7 100644 --- a/R/Fraser-pipeline.R +++ b/R/Fraser-pipeline.R @@ -85,7 +85,7 @@ NULL #' the beta-binomial fit, the computation of Z scores and p values as well as #' the computation of delta-PSI values. #' @export -FRASER <- function(fds, q, types=psiTypes, +FRASER <- function(fds, q, types=fitMetrics(fds), implementation=c("PCA", "PCA-BB-Decoder", "AE-weighted", "AE", "BB"), iterations=15, BPPARAM=bpparam(), correction, ...){ diff --git a/R/calculatePSIValue.R b/R/calculatePSIValue.R index c0e9d425..f9f3f991 100644 --- a/R/calculatePSIValue.R +++ b/R/calculatePSIValue.R @@ -25,7 +25,7 @@ #' #' ### usually one would run this function for all psi types by using: #' # fds <- calculatePSIValues(fds) -calculatePSIValues <- function(fds, types=psiTypes_avail, overwriteCts=FALSE, +calculatePSIValues <- function(fds, types=psiTypes, overwriteCts=FALSE, BPPARAM=bpparam()){ # check input stopifnot(is(fds, "FraserDataSet")) diff --git a/R/filterExpression.R b/R/filterExpression.R index 23729ee9..9a47b51a 100644 --- a/R/filterExpression.R +++ b/R/filterExpression.R @@ -45,18 +45,22 @@ filterExpressionAndVariability <- function(object, minExpressionInOneSample=20, quantile=0.75, quantileMinExpression=10, minDeltaPsi=0.0, filter=TRUE, delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), + filterOnJaccard=TRUE, BPPARAM=bpparam()){ + # filter introns with low read support and corresponding splice sites object <- filterExpression(object, minExpressionInOneSample=minExpressionInOneSample, quantile=quantile, quantileMinExpression=quantileMinExpression, filter=filter, delayed=delayed, + filterOnJaccard=filterOnJaccard, BPPARAM=BPPARAM) # filter introns that are not variable across samples object <- filterVariability(object, minDeltaPsi=minDeltaPsi, filter=filter, - delayed=delayed, BPPARAM=BPPARAM) + delayed=delayed, filterOnJaccard=filterOnJaccard, + BPPARAM=BPPARAM) # return fds message(date(), ": Filtering done!") @@ -64,6 +68,34 @@ filterExpressionAndVariability <- function(object, minExpressionInOneSample=20, } +#' @noRd +filterExpression.FRASER2 <- function(object, minExpressionInOneSample=20, + quantile=0.75, quantileMinExpression=10, filter=TRUE, + delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), + filterOnJaccard=TRUE, BPPARAM=bpparam()){ + if(isTRUE(filterOnJaccard)){ + return(filterExpression_jaccard(object, + minExpressionInOneSample=minExpressionInOneSample, + quantile=quantile, + quantileMinExpression=quantileMinExpression, + filter=filter, delayed=delayed, + BPPARAM=BPPARAM)) + } else{ + return(filterExpression.FRASER(object, + minExpressionInOneSample=minExpressionInOneSample, + quantile=quantile, + quantileMinExpression=quantileMinExpression, + filter=filter, delayed=delayed, + BPPARAM=BPPARAM)) + } +} + +#' @describeIn filtering This function filters out introns and corresponding +#' splice sites that have low read support in all samples. +#' @export +setMethod("filterExpression", signature="FraserDataSet", + filterExpression.FRASER2) + #' This function filters out introns and corresponding #' splice sites which are expressed at very low levels across samples. #' @noRd @@ -125,11 +157,24 @@ filterExpression_jaccard <- function(object, minExpressionInOneSample=20, return(object) } +#' @noRd +filterVariability.FRASER2 <- function(object, minDeltaPsi=0, filter=TRUE, + delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), + filterOnJaccard=TRUE, BPPARAM=bpparam()){ + if(isTRUE(filterOnJaccard)){ + object <- filterVariability_jaccard(object, minDeltaPsi=minDeltaPsi, + filter=filter, delayed=delayed, BPPARAM=BPPARAM) + } else{ + object <- filterVariability.FRASER(object, minDeltaPsi=minDeltaPsi, + filter=filter, delayed=delayed, BPPARAM=BPPARAM) + } +} + #' @describeIn filtering This function filters out introns and corresponding #' splice sites that have low read support in all samples. #' @export -setMethod("filterExpression", signature="FraserDataSet", - filterExpression_jaccard) +setMethod("filterVariability", signature="FraserDataSet", + filterVariability.FRASER2) #' This function filters out introns and corresponding @@ -192,12 +237,6 @@ filterVariability_jaccard <- function(object, minDeltaPsi=0, filter=TRUE, return(object) } -#' @describeIn filtering This function filters out introns and corresponding -#' splice sites that have low read support in all samples. -#' @export -setMethod("filterVariability", signature="FraserDataSet", - filterVariability_jaccard) - #' Applies previously calculated filters for expression filters #' @noRd applyExpressionFilters_jaccard <- function(fds, minExpressionInOneSample, diff --git a/R/find_encoding_dimensions.R b/R/find_encoding_dimensions.R index f3747eab..ad4873ec 100644 --- a/R/find_encoding_dimensions.R +++ b/R/find_encoding_dimensions.R @@ -123,7 +123,7 @@ findEncodingDim <- function(i, fds, type, params, implementation, #' hyperParams(fds, type="jaccard") #' #' @export -optimHyperParams <- function(fds, type=currentType(fds), implementation="PCA", +optimHyperParams <- function(fds, type=psiTypes, implementation="PCA", q_param=getEncDimRange(fds), noise_param=0, minDeltaPsi=0.1, iterations=5, setSubset=50000, injectFreq=1e-2, @@ -191,7 +191,7 @@ optimHyperParams <- function(fds, type=currentType(fds), implementation="PCA", # remove unneeded blocks to save memory a2rm <- paste(sep="_", c("originalCounts", "originalOtherCounts"), - rep(psiTypes_avail, 2)) + rep(psiTypes, 2)) for(a in a2rm){ assay(fds_copy, a) <- NULL } diff --git a/R/fitCorrectionMethods.R b/R/fitCorrectionMethods.R index 5b87d1ce..1b726403 100644 --- a/R/fitCorrectionMethods.R +++ b/R/fitCorrectionMethods.R @@ -40,7 +40,7 @@ fit.FraserDataSet <- function(object, implementation=c("PCA", "PCA-BB-Decoder", "AE", "AE-weighted", "PCA-BB-full", "fullAE", "PCA-regression", "PCA-reg-full", "PCA-BB-Decoder-no-weights", "BB"), - q, type=currentType(object), rhoRange=c(-30, 30), + q, type=psiTypes, rhoRange=c(-30, 30), weighted=FALSE, noiseAlpha=1, convergence=1e-5, iterations=15, initialize=TRUE, control=list(), BPPARAM=bpparam(), nSubset=15000, @@ -51,6 +51,7 @@ fit.FraserDataSet <- function(object, implementation=c("PCA", "PCA-BB-Decoder", paste(names(list(...)), collapse=", ")) } method <- match.arg(implementation) + type <- match.arg(type) verbose <- verbose(object) > 0 diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R index cd35cbf5..c0bb9d65 100644 --- a/R/getNSetterFuns.R +++ b/R/getNSetterFuns.R @@ -411,6 +411,27 @@ currentType <- function(fds){ return(fds) } +#' @describeIn getter_setter_functions Returns the splice metrics that will be +#' fitted (defaults to jaccard, used within several methods in the +#' FRASER package). +#' @export +fitMetrics <- function(fds){ + metrics <- metadata(fds)[['fit_metrics']] + if(is.null(metrics)){ + metrics <- "jaccard" + } + return(metrics) +} + +#' @describeIn getter_setter_functions Sets the splice metrics that will be +#' fitted (used within several methods in the FRASER package). +#' @export +`fitMetrics<-` <- function(fds, value){ + stopifnot(is.character(whichPSIType(value))) + metadata(fds)[['fit_metrics']] <- whichPSIType(value) + return(fds) +} + #' @describeIn getter_setter_functions Sets and returns the pseudo count used #' within the FRASER fitting procedure. #' @export diff --git a/R/helper-functions.R b/R/helper-functions.R index ed73f639..54593b28 100644 --- a/R/helper-functions.R +++ b/R/helper-functions.R @@ -22,7 +22,7 @@ checkCountData <- function(fds, stop=TRUE){ if(isFALSE(stop)) return(invisible(FALSE)) stop("No counts detected! Please provide counts first.") } - if(!all(paste0("rawOtherCounts_", psiTypes_avail) %in% assayNames(fds))){ + if(!all(paste0("rawOtherCounts_", psiTypes) %in% assayNames(fds))){ if(isFALSE(stop)) return(invisible(FALSE)) stop("Please compute first the total expression at each junction.") } @@ -71,10 +71,10 @@ checkReadType <- function(fds, type){ # check if type is null or missing if(missing(type) | is.null(type)){ - # if(verbose(fds) > 0){ - # warning("Read type was not specified!", - # "We will assume the default: 'j'") - # } + if(verbose(fds) > 3){ + warning("Read type was not specified!", + "We will assume the default: 'j'") + } return("j") } type <- unique(type) diff --git a/R/makeSimulatedDataset.R b/R/makeSimulatedDataset.R index f451d5cf..7a1fb6f9 100644 --- a/R/makeSimulatedDataset.R +++ b/R/makeSimulatedDataset.R @@ -438,7 +438,7 @@ makeSimulatedFraserDataSet_Multinomial <- function(m=200, j=1000, q=10, #' fds <- calculatePSIValues(fds) #' fds <- injectOutliers(fds, minDpsi=0.2, freq=1E-3) #' @export -injectOutliers <- function(fds, type=currentType(fds), +injectOutliers <- function(fds, type=psiTypes, freq=1E-3, minDpsi=0.2, minCoverage=2, deltaDistr="uniformDistr", verbose=FALSE, method=c('samplePSI', 'meanPSI', 'simulatedPSI'), diff --git a/R/plotMethods.R b/R/plotMethods.R index 2c38562f..8b57feda 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -299,7 +299,7 @@ NULL plotVolcano.FRASER <- function(object, sampleID, - type=psiTypes_avail, basePlot=TRUE, + type=fitMetrics(object), basePlot=TRUE, aggregate=FALSE, main=NULL, label=NULL, deltaPsiCutoff=0.1, padjCutoff=0.1, ...){ @@ -400,7 +400,7 @@ setMethod("plotVolcano", signature="FraserDataSet", plotVolcano.FRASER) plotAberrantPerSample.FRASER <- function(object, main, - type=psiTypes_avail, + type=fitMetrics(object), padjCutoff=0.1, zScoreCutoff=NA, deltaPsiCutoff=0.1, aggregate=TRUE, BPPARAM=bpparam(), ...){ @@ -463,7 +463,7 @@ setMethod("plotAberrantPerSample", signature="FraserDataSet", #' #' @rdname plotFunctions #' @export -plotExpression <- function(fds, type=psiTypes_avail, +plotExpression <- function(fds, type=fitMetrics(fds), idx=NULL, result=NULL, colGroup=NULL, basePlot=TRUE, main=NULL, label="aberrant", ...){ if(!is.null(result)){ @@ -555,7 +555,7 @@ plotExpression <- function(fds, type=psiTypes_avail, #' #' @rdname plotFunctions #' @export -plotExpectedVsObservedPsi <- function(fds, type=psiTypes_avail, +plotExpectedVsObservedPsi <- function(fds, type=fitMetrics(fds), idx=NULL, result=NULL, colGroup=NULL, main=NULL, basePlot=TRUE, label="aberrant", ...){ type <- match.arg(type) @@ -819,7 +819,7 @@ plotQQ.FRASER <- function(object, type=NULL, idx=NULL, result=NULL, setMethod("plotQQ", signature="FraserDataSet", plotQQ.FRASER) -plotEncDimSearch.FRASER <- function(object, type=psiTypes_avail, +plotEncDimSearch.FRASER <- function(object, type=psiTypes, plotType=c("auc", "loss")){ type <- match.arg(type) plotType <- match.arg(plotType) @@ -987,7 +987,7 @@ plotFilterVariability <- function(fds, bins=200, legend.position=c(0.8, 0.8), plotCountCorHeatmap.FRASER <- function(object, - type=psiTypes_avail, logit=FALSE, + type=psiTypes, logit=FALSE, topN=50000, topJ=5000, minMedian=1, minCount=10, main=NULL, normalized=FALSE, show_rownames=FALSE, show_colnames=FALSE, minDeltaPsi=0.1, annotation_col=NA, @@ -1428,7 +1428,7 @@ plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE, } plotManhattan.FRASER <- function(object, sampleID, - type=psiTypes_avail, + type=fitMetrics(object), main=paste0("sampleID = ", sampleID), color_chr=c("black", "darkgrey"), ...){ @@ -1534,7 +1534,7 @@ ggplotLabelPsi <- function(type, asCharacter=FALSE){ if(isFALSE(asCharacter)){ vapply(type, FUN=function(x) switch (x, - jaccard = c(bquote(jaccard~intron~index)), + jaccard = c(bquote(Intron~Jaccard~Index)), psi5 = c(bquote(psi[5])), psi3 = c(bquote(psi[3])), theta = c(bquote(theta))), diff --git a/R/variables.R b/R/variables.R index 214c283d..d097201c 100644 --- a/R/variables.R +++ b/R/variables.R @@ -2,18 +2,10 @@ #' Available splice metrics #' #' @examples -#' # to show available splice metrics: -#' psiTypes_avail -#' -#' # to show splice metrics selected to be fitted: +#' # to show all available splice metrics: #' psiTypes #' #' @rdname psiTypes #' @export -psiTypes_avail <- c("jaccard", "psi5", "psi3", "theta") -names(psiTypes_avail) <- c("Intron Jaccard Index", "psi5", "psi3", "theta") - -#' @describeIn psiTypes Splice metrics that are run by default -#' @export -psiTypes <- c("jaccard") -names(psiTypes) <- c("Intron Jaccard Index") +psiTypes <- c("jaccard", "psi5", "psi3", "theta") +names(psiTypes) <- c("Intron Jaccard Index", "psi5", "psi3", "theta") diff --git a/man/FRASER.Rd b/man/FRASER.Rd index a9a0ca97..6acf7638 100644 --- a/man/FRASER.Rd +++ b/man/FRASER.Rd @@ -11,7 +11,7 @@ FRASER( fds, q, - types = psiTypes, + types = fitMetrics(fds), implementation = c("PCA", "PCA-BB-Decoder", "AE-weighted", "AE", "BB"), iterations = 15, BPPARAM = bpparam(), diff --git a/man/calculatePSIValues.Rd b/man/calculatePSIValues.Rd index 9086c048..9e0c14fa 100644 --- a/man/calculatePSIValues.Rd +++ b/man/calculatePSIValues.Rd @@ -6,7 +6,7 @@ \usage{ calculatePSIValues( fds, - types = psiTypes_avail, + types = psiTypes, overwriteCts = FALSE, BPPARAM = bpparam() ) diff --git a/man/filtering.Rd b/man/filtering.Rd index 5d604036..ee21f45e 100644 --- a/man/filtering.Rd +++ b/man/filtering.Rd @@ -18,6 +18,7 @@ filterExpressionAndVariability( minDeltaPsi = 0, filter = TRUE, delayed = ifelse(ncol(object) <= 300, FALSE, TRUE), + filterOnJaccard = TRUE, BPPARAM = bpparam() ) @@ -28,6 +29,7 @@ filterExpressionAndVariability( quantileMinExpression = 10, filter = TRUE, delayed = ifelse(ncol(object) <= 300, FALSE, TRUE), + filterOnJaccard = TRUE, BPPARAM = bpparam() ) @@ -36,6 +38,7 @@ filterExpressionAndVariability( minDeltaPsi = 0, filter = TRUE, delayed = ifelse(ncol(object) <= 300, FALSE, TRUE), + filterOnJaccard = TRUE, BPPARAM = bpparam() ) } diff --git a/man/fit.Rd b/man/fit.Rd index 659baa3b..dcbee2dc 100644 --- a/man/fit.Rd +++ b/man/fit.Rd @@ -10,7 +10,7 @@ implementation = c("PCA", "PCA-BB-Decoder", "AE", "AE-weighted", "PCA-BB-full", "fullAE", "PCA-regression", "PCA-reg-full", "PCA-BB-Decoder-no-weights", "BB"), q, - type = currentType(object), + type = psiTypes, rhoRange = c(-30, 30), weighted = FALSE, noiseAlpha = 1, diff --git a/man/getter_setter_functions.Rd b/man/getter_setter_functions.Rd index 3b9c108a..0ce81990 100644 --- a/man/getter_setter_functions.Rd +++ b/man/getter_setter_functions.Rd @@ -19,6 +19,8 @@ \alias{deltaPsiValue} \alias{currentType} \alias{currentType<-} +\alias{fitMetrics} +\alias{fitMetrics<-} \alias{pseudocount} \alias{hyperParams} \alias{dontWriteHDF5} @@ -61,6 +63,10 @@ currentType(fds) currentType(fds) <- value +fitMetrics(fds) + +fitMetrics(fds) <- value + pseudocount(value = NULL) hyperParams(fds, type = currentType(fds), all = FALSE) @@ -140,6 +146,13 @@ within several methods in the FRASER package (defaults to jaccard). \item \code{currentType<-}: Sets the psi type that is to be used within several methods in the FRASER package. +\item \code{fitMetrics}: Returns the splice metrics that will be +fitted (defaults to jaccard, used within several methods in the +FRASER package). + +\item \code{fitMetrics<-}: Sets the splice metrics that will be +fitted (used within several methods in the FRASER package). + \item \code{pseudocount}: Sets and returns the pseudo count used within the FRASER fitting procedure. diff --git a/man/injectOutliers.Rd b/man/injectOutliers.Rd index 64728eae..5132c6e4 100644 --- a/man/injectOutliers.Rd +++ b/man/injectOutliers.Rd @@ -6,7 +6,7 @@ \usage{ injectOutliers( fds, - type = currentType(fds), + type = psiTypes, freq = 0.001, minDpsi = 0.2, minCoverage = 2, diff --git a/man/optimHyperParams.Rd b/man/optimHyperParams.Rd index 96ca78ba..e1333b4a 100644 --- a/man/optimHyperParams.Rd +++ b/man/optimHyperParams.Rd @@ -6,7 +6,7 @@ \usage{ optimHyperParams( fds, - type = currentType(fds), + type = psiTypes, implementation = "PCA", q_param = getEncDimRange(fds), noise_param = 0, diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd index 686bf15f..dfd317a2 100644 --- a/man/plotFunctions.Rd +++ b/man/plotFunctions.Rd @@ -27,7 +27,7 @@ plotManhattan(object, ...) \S4method{plotVolcano}{FraserDataSet}( object, sampleID, - type = psiTypes_avail, + type = fitMetrics(object), basePlot = TRUE, aggregate = FALSE, main = NULL, @@ -40,7 +40,7 @@ plotManhattan(object, ...) \S4method{plotAberrantPerSample}{FraserDataSet}( object, main, - type = psiTypes_avail, + type = fitMetrics(object), padjCutoff = 0.1, zScoreCutoff = NA, deltaPsiCutoff = 0.1, @@ -51,7 +51,7 @@ plotManhattan(object, ...) plotExpression( fds, - type = psiTypes_avail, + type = fitMetrics(fds), idx = NULL, result = NULL, colGroup = NULL, @@ -63,7 +63,7 @@ plotExpression( plotExpectedVsObservedPsi( fds, - type = psiTypes_avail, + type = fitMetrics(fds), idx = NULL, result = NULL, colGroup = NULL, @@ -89,7 +89,7 @@ plotExpectedVsObservedPsi( ... ) -\S4method{plotEncDimSearch}{FraserDataSet}(object, type = psiTypes_avail, plotType = c("auc", "loss")) +\S4method{plotEncDimSearch}{FraserDataSet}(object, type = psiTypes, plotType = c("auc", "loss")) plotFilterExpression( fds, @@ -107,7 +107,7 @@ plotFilterVariability( \S4method{plotCountCorHeatmap}{FraserDataSet}( object, - type = psiTypes_avail, + type = psiTypes, logit = FALSE, topN = 50000, topJ = 5000, @@ -170,7 +170,7 @@ plotBamCoverageFromResultTable( \S4method{plotManhattan}{FraserDataSet}( object, sampleID, - type = psiTypes_avail, + type = fitMetrics(object), main = paste0("sampleID = ", sampleID), color_chr = c("black", "darkgrey"), ... diff --git a/man/psiTypes.Rd b/man/psiTypes.Rd index dfff5dce..9a4b9ca2 100644 --- a/man/psiTypes.Rd +++ b/man/psiTypes.Rd @@ -1,33 +1,20 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/variables.R \docType{data} -\name{psiTypes_avail} -\alias{psiTypes_avail} +\name{psiTypes} \alias{psiTypes} \title{Available splice metrics} \format{ An object of class \code{character} of length 4. - -An object of class \code{character} of length 1. } \usage{ -psiTypes_avail - psiTypes } \description{ Available splice metrics } -\section{Functions}{ -\itemize{ -\item \code{psiTypes}: Splice metrics that are run by default -}} - \examples{ - # to show available splice metrics: - psiTypes_avail - - # to show splice metrics selected to be fitted: + # to show all available splice metrics: psiTypes } diff --git a/man/results.Rd b/man/results.Rd index 9475839f..500bcbb6 100644 --- a/man/results.Rd +++ b/man/results.Rd @@ -15,7 +15,7 @@ aggregate = FALSE, collapse = FALSE, minCount = 5, - psiType = psiTypes_avail, + psiType = psiTypes, geneColumn = "hgnc_symbol", additionalColumns = NULL, BPPARAM = bpparam() @@ -23,7 +23,7 @@ \S4method{aberrant}{FraserDataSet}( object, - type = psiTypes_avail, + type = fitMetrics(object), padjCutoff = 0.05, deltaPsiCutoff = 0.1, zScoreCutoff = NA, diff --git a/tests/testthat/test_fraser_pipeline.R b/tests/testthat/test_fraser_pipeline.R index 761330ec..55b0e54d 100644 --- a/tests/testthat/test_fraser_pipeline.R +++ b/tests/testthat/test_fraser_pipeline.R @@ -6,7 +6,7 @@ test_that("FRASER function", { anames <- c(psiTypes, paste0(c("delta", "predictedMeans", "pvaluesBetaBinomial", "padjBetaBinomial", "zScores"), "_", - rep(psiTypes, 5))) + rep(fitMetrics(fds), 5))) expect_equal(anames %in% assayNames(fds), !logical(length(anames))) }) diff --git a/tests/testthat/test_stats.R b/tests/testthat/test_stats.R index e333b220..5e389ec8 100644 --- a/tests/testthat/test_stats.R +++ b/tests/testthat/test_stats.R @@ -80,3 +80,27 @@ test_that("Gene p value calculation with NAs", { expect_true(all(is.na(padjVals(fds, type="jaccard", level="gene", filters=list(rho=0.1))))) }) + +test_that("FDR on subset of genes", { + fds <- getFraser() + mcols(fds, type="j")$hgnc_symbol <- + rep(c("geneA", "geneB", "geneC", "geneD", "geneE"), + times=c(3, 7, 5, 4, 8)) + + # define gene subset per sample + genes_per_sample <- list( + "sample1" = c("geneE", "geneC", "geneA"), + "sample2" = c("geneB"), + "sample3" = c("geneA", "geneB", "geneC", "geneD") + ) + expected_output_nrows <- (8 + 5 + 3) + (7) + (3+7+5+4) + + subsetName <- "FDR_subset_test" + fds <- calculatePadjValuesOnSubset(fds, genesToTest=genes_per_sample, + subsetName=subsetName, type="jaccard") + subset_dt <- metadata(fds)[[subsetName]] + expect_true(is(subset_dt, "data.table")) + expect_true(all(c("FDR_subset", "FDR_subset_gene") %in% colnames(subset_dt))) + expect_equal(subset_dt[, .N], expected_output_nrows) + +}) From c1215e1fff676512a81640d8ef23fcffccba0f8f Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Mon, 28 Nov 2022 21:59:23 +0100 Subject: [PATCH 45/80] fix vignette --- DESCRIPTION | 2 +- R/plotMethods.R | 2 +- man/FRASER.Rd | 12 +++++----- man/countRNA.Rd | 16 +++++++------- man/filtering.Rd | 8 +++---- man/getter_setter_functions.Rd | 40 +++++++++++++++++----------------- man/spliceTypeAnnotations.Rd | 8 +++---- vignettes/FRASER.Rnw | 27 ++++++++++++----------- 8 files changed, 58 insertions(+), 57 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index fb58e4ec..2dc759a1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -29,7 +29,7 @@ biocViews: License: MIT + file LICENSE URL: https://github.com/gagneurlab/FRASER BugRepots: https://github.com/gagneurlab/FRASER/issues -RoxygenNote: 7.1.2 +RoxygenNote: 7.2.2 Encoding: UTF-8 VignetteBuilder: knitr Depends: diff --git a/R/plotMethods.R b/R/plotMethods.R index 8b57feda..a9f96bee 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -663,7 +663,7 @@ plotQQ.FRASER <- function(object, type=NULL, idx=NULL, result=NULL, if(isTRUE(global)){ if(is.null(type)){ - type <- psiTypes + type <- fitMetrics(object) } dt <- rbindlist(bplapply(type, getPlottingDT, fds=object, axis="col", idx=TRUE, aggregate=aggregate, Ncpus=Ncpus, ...)) diff --git a/man/FRASER.Rd b/man/FRASER.Rd index 6acf7638..0ee1035e 100644 --- a/man/FRASER.Rd +++ b/man/FRASER.Rd @@ -131,33 +131,33 @@ raw counts ("BB"). } \section{Functions}{ \itemize{ -\item \code{FRASER}: This function runs the default FRASER pipeline combining +\item \code{FRASER()}: This function runs the default FRASER pipeline combining the beta-binomial fit, the computation of Z scores and p values as well as the computation of delta-PSI values. -\item \code{calculateZscore}: This function calculates z-scores based on the +\item \code{calculateZscore()}: This function calculates z-scores based on the observed and expected logit psi. -\item \code{calculatePvalues}: This function calculates two-sided p-values based on +\item \code{calculatePvalues()}: This function calculates two-sided p-values based on the beta-binomial distribution (or binomial or normal if desired). The returned p values are not yet adjusted with Holm's method per donor or acceptor site, respectively. -\item \code{calculatePadjValues}: This function adjusts the previously calculated +\item \code{calculatePadjValues()}: This function adjusts the previously calculated p-values per sample for multiple testing. First, the previoulsy calculated junction-level p values are adjusted with Holm's method per donor or acceptor site, respectively. Then, if gene symbols have been annotated to junctions (and not otherwise requested), gene-level p values are computed. -\item \code{calculatePadjValuesOnSubset}: This function does FDR correction only for all junctions +\item \code{calculatePadjValuesOnSubset()}: This function does FDR correction only for all junctions in a certain subset of genes which can differ per sample. Requires gene symbols to have been annotated to junctions. As with the full FDR correction across all junctions, first the previously calculated junction-level p values are adjusted with Holm's method per donor or acceptor site, respectively. Then, gene-level p values are computed. -}} +}} \examples{ # set default parallel backend register(SerialParam()) diff --git a/man/countRNA.Rd b/man/countRNA.Rd index 7940b91b..440231fe 100644 --- a/man/countRNA.Rd +++ b/man/countRNA.Rd @@ -245,32 +245,32 @@ a sample are set to zero. } \section{Functions}{ \itemize{ -\item \code{countRNAData}: This method extracts and counts the split reads and +\item \code{countRNAData()}: This method extracts and counts the split reads and non spliced reads from RNA bam files. -\item \code{getSplitReadCountsForAllSamples}: This method creates a GRanges +\item \code{getSplitReadCountsForAllSamples()}: This method creates a GRanges object containing the split read counts from all specified samples. -\item \code{getNonSplitReadCountsForAllSamples}: This method creates a GRanges +\item \code{getNonSplitReadCountsForAllSamples()}: This method creates a GRanges object containing the non split read counts at the exon-intron boundaries inferred from the GRanges object containing the positions of all the introns in this dataset. -\item \code{addCountsToFraserDataSet}: This method adds the split read and +\item \code{addCountsToFraserDataSet()}: This method adds the split read and non split read counts to a existing FraserDataSet containing the settings. -\item \code{countSplitReads}: This method counts all split reads in a +\item \code{countSplitReads()}: This method counts all split reads in a bam file for a single sample. -\item \code{mergeCounts}: This method merges counts for multiple +\item \code{mergeCounts()}: This method merges counts for multiple samples into one SummarizedExperiment object. -\item \code{countNonSplicedReads}: This method counts non spliced reads based +\item \code{countNonSplicedReads()}: This method counts non spliced reads based on the given target (acceptor/donor) regions for a single sample. -}} +}} \examples{ # On Windows SNOW is the default for the parallele backend, which can be # very slow for many but small tasks. Therefore, we will use diff --git a/man/filtering.Rd b/man/filtering.Rd index ee21f45e..9953fa0c 100644 --- a/man/filtering.Rd +++ b/man/filtering.Rd @@ -80,16 +80,16 @@ reliably detected and to remove introns with no variablity between samples. } \section{Functions}{ \itemize{ -\item \code{filterExpressionAndVariability}: This functions filters out both introns with low +\item \code{filterExpressionAndVariability()}: This functions filters out both introns with low read support and introns that are not variable across samples. -\item \code{filterExpression,FraserDataSet-method}: This function filters out introns and corresponding +\item \code{filterExpression(FraserDataSet)}: This function filters out introns and corresponding splice sites that have low read support in all samples. -\item \code{filterVariability,FraserDataSet-method}: This function filters out introns and corresponding +\item \code{filterVariability(FraserDataSet)}: This function filters out introns and corresponding splice sites that have low read support in all samples. -}} +}} \examples{ fds <- createTestFraserDataSet() fds <- filterExpressionAndVariability(fds, minDeltaPsi=0.1, filter=FALSE) diff --git a/man/getter_setter_functions.Rd b/man/getter_setter_functions.Rd index 0ce81990..c57387d7 100644 --- a/man/getter_setter_functions.Rd +++ b/man/getter_setter_functions.Rd @@ -115,70 +115,70 @@ the values within the FRASER model. } \section{Functions}{ \itemize{ -\item \code{featureExclusionMask}: Retrieves a logical vector indicating +\item \code{featureExclusionMask()}: Retrieves a logical vector indicating for each junction whether it is included or excluded during the fitting procedure. -\item \code{featureExclusionMask<-}: To remove certain junctions from +\item \code{featureExclusionMask(fds, type = currentType(fds)) <- value}: To remove certain junctions from being used in the train step of the encoding dimension we can set the \code{featureExclusion} vector to \code{FALSE}. This can be helpfull if we have local linkage between features which we do not want to model by the autoencoder. -\item \code{rho}: Returns the fitted rho values for the +\item \code{rho()}: Returns the fitted rho values for the beta-binomial distribution -\item \code{zScores}: This returns the calculated z-scores. +\item \code{zScores()}: This returns the calculated z-scores. -\item \code{pVals}: This returns the calculated p-values. +\item \code{pVals()}: This returns the calculated p-values. -\item \code{padjVals}: This returns the adjusted p-values. +\item \code{padjVals()}: This returns the adjusted p-values. -\item \code{predictedMeans}: This returns the fitted mu (i.e. psi) +\item \code{predictedMeans()}: This returns the fitted mu (i.e. psi) values. -\item \code{deltaPsiValue}: Returns the difference between the +\item \code{deltaPsiValue()}: Returns the difference between the observed and the fitted psi values. -\item \code{currentType}: Returns the psi type that is used +\item \code{currentType()}: Returns the psi type that is used within several methods in the FRASER package (defaults to jaccard). -\item \code{currentType<-}: Sets the psi type that is to be used +\item \code{currentType(fds) <- value}: Sets the psi type that is to be used within several methods in the FRASER package. -\item \code{fitMetrics}: Returns the splice metrics that will be +\item \code{fitMetrics()}: Returns the splice metrics that will be fitted (defaults to jaccard, used within several methods in the FRASER package). -\item \code{fitMetrics<-}: Sets the splice metrics that will be +\item \code{fitMetrics(fds) <- value}: Sets the splice metrics that will be fitted (used within several methods in the FRASER package). -\item \code{pseudocount}: Sets and returns the pseudo count used +\item \code{pseudocount()}: Sets and returns the pseudo count used within the FRASER fitting procedure. -\item \code{hyperParams}: This returns the results of the +\item \code{hyperParams()}: This returns the results of the hyperparameter optimization NULL if the hyperparameter opimization was not run yet. -\item \code{bestQ}: This returns the optimal size of the +\item \code{bestQ()}: This returns the optimal size of the latent space according to the hyperparameter optimization or a simple estimate of about a tenth of the number of samples if the hyperparameter opimization was not run yet. -\item \code{dontWriteHDF5}: Gets the current value of whether the +\item \code{dontWriteHDF5()}: Gets the current value of whether the assays should be stored as hdf5 files. -\item \code{dontWriteHDF5<-}: Sets whether the assays should be stored +\item \code{dontWriteHDF5(fds) <- value}: Sets whether the assays should be stored as hdf5 files. -\item \code{verbose}: Dependent on the level of verbosity +\item \code{verbose()}: Dependent on the level of verbosity the algorithm reports more or less to the user. 0 means being quiet and 10 means everything. -\item \code{verbose<-}: Sets the verbosity level to a value +\item \code{verbose(fds) <- value}: Sets the verbosity level to a value between 0 and 10. 0 means being quiet and 10 means reporting everything. -}} +}} \examples{ fds <- createTestFraserDataSet() diff --git a/man/spliceTypeAnnotations.Rd b/man/spliceTypeAnnotations.Rd index b28e3c33..4556b7b6 100644 --- a/man/spliceTypeAnnotations.Rd +++ b/man/spliceTypeAnnotations.Rd @@ -93,20 +93,20 @@ These functions work on the result table and add additional } \section{Functions}{ \itemize{ -\item \code{annotateIntronReferenceOverlap}: This method calculates basic annotations +\item \code{annotateIntronReferenceOverlap()}: This method calculates basic annotations based on overlap with the reference annotation (start, end, none, both) for the full fds. The overlap type is added as a new column \code{annotatedJunction} in \code{mcols(fds)}. -\item \code{annotateSpliceEventType}: This method annotates the splice event +\item \code{annotateSpliceEventType()}: This method annotates the splice event type to junctions in the given results table. -\item \code{flagBlacklistRegions}: This method flags all introns and +\item \code{flagBlacklistRegions()}: This method flags all introns and splice sites in the given results table for which at least one splice site (donor or acceptor) is located in a blacklist region. Blacklist regions of the genome are determined from the provided BED file. -}} +}} \examples{ # get data, fit and compute p-values and z-scores fds <- createTestFraserDataSet() diff --git a/vignettes/FRASER.Rnw b/vignettes/FRASER.Rnw index 4a28fbbe..500c8ed2 100644 --- a/vignettes/FRASER.Rnw +++ b/vignettes/FRASER.Rnw @@ -1,4 +1,4 @@ -%\VignetteIndexEntry{FRASER: Find RAre Splicing Evens in RNA-seq Data} +%\VignetteIndexEntry{FRASER: Find RAre Splicing Events in RNA-seq Data} %\VignettePackage{FRASER} %\VignetteEngine{knitr::knitr} %\VignetteEncoding{UTF-8} @@ -36,7 +36,7 @@ opts_chunk$set( \newcommand{\fraser}{\Biocpkg{FRASER}} \newcommand{\fds}{\Rclass{FraserDataSet}} -\title{FRASER: Find RAre Splicing Events in RNA-seq} +\title{FRASER: Find RAre Splicing Events in RNA-seq Data} \author{ Christian Mertes$^{1}$, Ines Scheller$^{1}$, Julien Gagneur$^{1}$ \\ @@ -176,8 +176,8 @@ intron-exon boundary of acceptor A. While we calculate $\theta$ for the 5' and between $\theta_5$ and $\theta_3$ and hence call it jointly $\theta$ in the following. -From \fraser{}2 on, only one metric - the Intron Jaccard Index (Figure -\ref{IntronJaccardIndex_sketch}.) - is used by default. The Intron Jaccard +From \fraser{}2 on, only a single metric - the Intron Jaccard Index (Figure +\ref{IntronJaccardIndex_sketch}) - is used by default. The Intron Jaccard Index is more robust and allows to focus more on functionally relevant aberrant splicing events. It allows to detect all types of aberrant splicing previously detected using the three metrics ($\psi_5$, $\psi_3$, $\theta$) @@ -185,11 +185,10 @@ within a single metric. \incfig{IntronJaccardIndex_sketch}{1\textwidth}{Overview over the Intron Jaccard Index, the splice metric used in \fraser{}2.}{ -The Intron Jaccard Index considers both split and nonsplit reads and is -defined as the jaccard index of the set of donor reads (reads sharing a donor -site with the intron of interest and nonsplit reads at that donor site) and -acceptor reads (reads sharing an acceptor site with the intron of interest and -nonsplit reads at that acceptor site). } +The Intron Jaccard Index considers both split and nonsplit reads within a +single metric and allows to detect all different types of aberrant splicing +previously captured with either of the metrics $\psi_5$, $\psi_3$, $\theta$. +} The Intron Jaccard Index considers both split and nonsplit reads and is defined as the jaccard index of the set of donor reads (reads sharing a donor @@ -198,7 +197,8 @@ acceptor reads (reads sharing an acceptor site with the intron of interest and nonsplit reads at that acceptor site): \begin{equation} - J(D,A) = \frac{n(D,A)}{\sum_{A'} n(D,A') + \sum_{D'} n(D',A) + n(D) + n(A) - n(D,A)}\label{eq:jaccard} + J(D,A) = \frac{n(D,A)}{\sum_{A'} n(D,A') + \sum_{D'} n(D',A) + n(D) + n(A) - n(D,A)} + \label{eq:jaccard} \end{equation} @@ -590,7 +590,7 @@ bam files are available in the \fds{} object). <>= # to show result visualization functions for this tuturial, zScore cutoff used -res <- results(fds, zScoreCutoff=2, padjCutoff=NA, deltaPsiCutoff=0.1) +res <- results(fds, zScoreCutoff=2, padjCutoff=NA, deltaPsiCutoff=NA) res # for the gene level pvalues, gene symbols need to be annotated the fds object @@ -598,7 +598,8 @@ res # as we previously called FRASER() before annotating genes, we run it again here fds <- calculatePadjValues(fds, type="jaccard", geneLevel=TRUE) # generate gene-level results table (if gene symbols have been annotated) -res_gene <- results(fds, aggregate=TRUE, padjCutoff=NA, deltaPsiCutoff=0.1) +res_gene <- results(fds, aggregate=TRUE, padjCutoff=NA, deltaPsiCutoff=NA, + zScoreCutoff=2) res_gene @ @@ -614,7 +615,7 @@ plotVolcano(fds, type="jaccard", "sample10") Which are the splicing events in detail? <>= -sampleRes <- res[res$sampleID == "sample10"] +sampleRes <- res[res$sampleID == "sample8"] sampleRes @ From 74b3d2a65541226676a514fee4007bf70d917cf1 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Tue, 29 Nov 2022 11:15:09 +0100 Subject: [PATCH 46/80] fix missing udate to jaccard in vignette --- R/plotMethods.R | 6 +++++- vignettes/FRASER.Rnw | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/R/plotMethods.R b/R/plotMethods.R index a9f96bee..2a05446a 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -1432,6 +1432,10 @@ plotManhattan.FRASER <- function(object, sampleID, main=paste0("sampleID = ", sampleID), color_chr=c("black", "darkgrey"), ...){ + # load necessary packages + require(ggbio) + require(biovizBase) + # check arguments stopifnot(is(object, "FraserDataSet")) stopifnot(sampleID %in% samples(object)) @@ -1679,7 +1683,7 @@ plotGrandLinear.adapted <- function (obj, ..., facets, space.skip = 0.01, else { highlight.name <- values(highlight.gr)[, highlight.name] } - p <- p + geom_point(data = mold(p@data[queryHits(idx)]), + p <- p + geom_point(data = biovizBase::mold(p@data[queryHits(idx)]), do.call(aes, list(x = substitute(midpoint), y = args.aes$y)), color = highlight.col) if (!is.null(highlight.name)) { diff --git a/vignettes/FRASER.Rnw b/vignettes/FRASER.Rnw index 500c8ed2..d48f9cf7 100644 --- a/vignettes/FRASER.Rnw +++ b/vignettes/FRASER.Rnw @@ -240,8 +240,8 @@ orgDb <- org.Hs.eg.db fds <- annotateRangesWithTxDb(fds, txdb=txdb, orgDb=orgDb) # fit the splicing model for each metric -# with a specific latentsapce dimension -fds <- FRASER(fds, q=c(psi5=2, psi3=3, theta=3)) +# with a specific latentspace dimension +fds <- FRASER(fds, q=c(jaccard=2)) # alternatively, we also provide a way to use biomart for the annotation: # fds <- annotateRanges(fds) From b052a38622a7c486547ef1c581807975d33a3b9e Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Tue, 29 Nov 2022 11:32:25 +0100 Subject: [PATCH 47/80] small fix in vignette --- vignettes/FRASER.Rnw | 1 - 1 file changed, 1 deletion(-) diff --git a/vignettes/FRASER.Rnw b/vignettes/FRASER.Rnw index d48f9cf7..9482a775 100644 --- a/vignettes/FRASER.Rnw +++ b/vignettes/FRASER.Rnw @@ -46,7 +46,6 @@ opts_chunk$set( \begin{document} -\SweaveOpts{concordance=TRUE} <>= opts_chunk$set(concordance=TRUE) From 01c3a4e15a2ea5cff12ecec211ce63bfb0e4616f Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Tue, 29 Nov 2022 13:41:23 +0100 Subject: [PATCH 48/80] fix biocCheck warnings --- R/AllGenerics.R | 4 ++-- R/Fraser-pipeline.R | 4 ++-- R/filterExpression.R | 3 +++ R/plotMethods.R | 27 ++++++++++++++++----------- man/FRASER.Rd | 8 ++++---- man/fds-methods.Rd | 20 +++----------------- man/filtering.Rd | 4 ++++ 7 files changed, 34 insertions(+), 36 deletions(-) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 8a4bac5e..ca46ba7f 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -12,8 +12,8 @@ asFDS <- function(x){ #' #' @title Getter/Setter methods for the FraserDataSet #' -#' The following methods are getter and setter methods to extract or set -#' certain values of a FraserDataSet object. +#' @description The following methods are getter and setter methods to extract +#' or set certain values of a FraserDataSet object. #' #' \code{samples} sets or gets the sample IDs; \code{condition} ; #' \code{} diff --git a/R/Fraser-pipeline.R b/R/Fraser-pipeline.R index 8c1856b7..7b9d0e71 100644 --- a/R/Fraser-pipeline.R +++ b/R/Fraser-pipeline.R @@ -85,7 +85,7 @@ NULL #' the beta-binomial fit, the computation of Z scores and p values as well as #' the computation of delta-PSI values. #' @export -FRASER <- function(fds, q, types=fitMetrics(fds), +FRASER <- function(fds, q, type=fitMetrics(fds), implementation=c("PCA", "PCA-BB-Decoder", "AE-weighted", "AE", "BB"), iterations=15, BPPARAM=bpparam(), correction, ...){ @@ -104,7 +104,7 @@ FRASER <- function(fds, q, types=fitMetrics(fds), } # fit each splicing type separately - for(i in types){ + for(i in type){ # get type specific q if(missing(q)){ diff --git a/R/filterExpression.R b/R/filterExpression.R index 9a47b51a..6c7eab8c 100644 --- a/R/filterExpression.R +++ b/R/filterExpression.R @@ -17,6 +17,9 @@ #' passed all filters is returned. If FALSE, no subsetting is done and the #' information of whether an intron passed the filters is only stored in the #' mcols. +#' @param filterOnJaccard If TRUE, the Intron Jaccard Metric is used to define +#' express introns during fitlering. Otherwise, the psi5, psi3 and theta +#' metrics are used (default: TRUE). #' @param delayed If FALSE, count matrices will be loaded into memory, #' otherwise the function works on the delayedMatrix representations. The #' default value depends on the number of samples in the fds-object. diff --git a/R/plotMethods.R b/R/plotMethods.R index 2a05446a..1685637a 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -1432,9 +1432,13 @@ plotManhattan.FRASER <- function(object, sampleID, main=paste0("sampleID = ", sampleID), color_chr=c("black", "darkgrey"), ...){ - # load necessary packages - require(ggbio) - require(biovizBase) + # check necessary packages + if (!requireNamespace('ggbio')){ + stop("For this function, the ggbio package is required.") + } + if (!requireNamespace('biovizBase')){ + stop("For this function, the biovizBase package is required.") + } # check arguments stopifnot(is(object, "FraserDataSet")) @@ -1460,7 +1464,7 @@ plotManhattan.FRASER <- function(object, sampleID, mcols(gr_sample)[,"delta"] <- deltaPsiValue(object, type=type)[,sampleID] # only one point per donor/acceptor site (relevant only for psi5 and psi3) - index <- FRASER:::getSiteIndex(object, type=type) + index <- getSiteIndex(object, type=type) nonDup <- !duplicated(index) gr_sample <- gr_sample[nonDup,] @@ -1657,9 +1661,9 @@ plotGrandLinear.adapted <- function (obj, ..., facets, space.skip = 0.01, } if (!missing(facets)) { args$facets <- facets - args.facets <- subsetArgsByFormals(args, facet_grid, + args.facets <- biovizBase::subsetArgsByFormals(args, facet_grid, facet_wrap) - facet <- .buildFacetsFromArgs(obj, args.facets) + facet <- ggbio:::.buildFacetsFromArgs(obj, args.facets) p <- p + facet } p <- p + theme(panel.grid.minor = element_blank()) @@ -1689,13 +1693,14 @@ plotGrandLinear.adapted <- function (obj, ..., facets, space.skip = 0.01, if (!is.null(highlight.name)) { seqlevels(.h.pos, pruning.mode = "coarse") <- seqlevels(obj) suppressWarnings(seqinfo(.h.pos) <- seqinfo(obj)) - .trans <- transformToGenome(.h.pos, space.skip = space.skip) + .trans <- biovizBase::transformToGenome(.h.pos, space.skip = space.skip) values(.trans)$mean <- (start(.trans) + end(.trans))/2 values(.trans)$names <- highlight.name - p <- p + geom_text(data = mold(.trans), size = highlight.label.size, - vjust = 0, color = highlight.label.col, do.call(aes, - list(x = substitute(mean), y = as.name("val"), - label = as.name("names")))) + p <- p + geom_text(data = biovizBase::mold(.trans), + size = highlight.label.size, + vjust = 0, color = highlight.label.col, do.call(aes, + list(x = substitute(mean), y = as.name("val"), + label = as.name("names")))) } } } diff --git a/man/FRASER.Rd b/man/FRASER.Rd index 0ee1035e..f5961c5b 100644 --- a/man/FRASER.Rd +++ b/man/FRASER.Rd @@ -11,7 +11,7 @@ FRASER( fds, q, - types = fitMetrics(fds), + type = fitMetrics(fds), implementation = c("PCA", "PCA-BB-Decoder", "AE-weighted", "AE", "BB"), iterations = 15, BPPARAM = bpparam(), @@ -58,6 +58,9 @@ Should be fitted using \code{\link{optimHyperParams}} if unknown. If a named vector is provided it is used for the different splicing types.} +\item{type}{The type of PSI (jaccard, psi5, psi3 or theta for theta/splicing +efficiency)} + \item{implementation}{The method that should be used to correct for confounders.} @@ -70,9 +73,6 @@ not yet converged after these number of iterations, the fit stops anyway.} \item{...}{Additional parameters passed on to the internal fit function} -\item{type}{The type of PSI (jaccard, psi5, psi3 or theta for theta/splicing -efficiency)} - \item{logit}{Indicates if z scores are computed on the logit scale (default) or in the natural (psi) scale.} diff --git a/man/fds-methods.Rd b/man/fds-methods.Rd index 3e11e4e4..d6caf3fd 100644 --- a/man/fds-methods.Rd +++ b/man/fds-methods.Rd @@ -41,17 +41,7 @@ \alias{FRASER.mcols.get} \alias{FRASER.rowRanges.get} \alias{mapSeqlevels} -\title{Getter/Setter methods for the FraserDataSet - -The following methods are getter and setter methods to extract or set -certain values of a FraserDataSet object. - -\code{samples} sets or gets the sample IDs; \code{condition} ; -\code{} -\code{nonSplicedReads} return a RangedSummarizedExperiment object -containing the counts for the non spliced reads overlapping splice -sites in the fds. -\code{}} +\title{Getter/Setter methods for the FraserDataSet} \usage{ samples(object) @@ -151,10 +141,8 @@ passed to GenomeInfoDb::mapSeqlevels().} Getter method return the respective current value. } \description{ -Getter/Setter methods for the FraserDataSet - -The following methods are getter and setter methods to extract or set -certain values of a FraserDataSet object. +The following methods are getter and setter methods to extract +or set certain values of a FraserDataSet object. \code{samples} sets or gets the sample IDs; \code{condition} ; \code{} @@ -162,8 +150,6 @@ certain values of a FraserDataSet object. containing the counts for the non spliced reads overlapping splice sites in the fds. \code{} - -Mapping of chromosome names } \examples{ fds <- createTestFraserDataSet() diff --git a/man/filtering.Rd b/man/filtering.Rd index 9953fa0c..226df199 100644 --- a/man/filtering.Rd +++ b/man/filtering.Rd @@ -68,6 +68,10 @@ mcols.} otherwise the function works on the delayedMatrix representations. The default value depends on the number of samples in the fds-object.} +\item{filterOnJaccard}{If TRUE, the Intron Jaccard Metric is used to define +express introns during fitlering. Otherwise, the psi5, psi3 and theta +metrics are used (default: TRUE).} + \item{BPPARAM}{the BiocParallel parameters for the parallelization} } \value{ From caa194cc2470642231c8bf092f0095d0e4710664 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Tue, 29 Nov 2022 15:08:24 +0100 Subject: [PATCH 49/80] remove installs from vignette for biocCheck --- vignettes/FRASER.Rnw | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vignettes/FRASER.Rnw b/vignettes/FRASER.Rnw index 9482a775..51783ed1 100644 --- a/vignettes/FRASER.Rnw +++ b/vignettes/FRASER.Rnw @@ -521,10 +521,6 @@ $\psi$ values or both. <>= # annotate introns with the HGNC symbols of the corresponding gene -if (!require("TxDb.Hsapiens.UCSC.hg19.knownGene", quietly = TRUE)) - BiocManager::install("TxDb.Hsapiens.UCSC.hg19.knownGene") -if (!require("org.Hs.eg.db", quietly = TRUE)) - BiocManager::install("org.Hs.eg.db") library(TxDb.Hsapiens.UCSC.hg19.knownGene) library(org.Hs.eg.db) From fc6d61e282ec16f233c75e60c7c327d640aeabbe Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Tue, 29 Nov 2022 17:04:47 +0100 Subject: [PATCH 50/80] only run example if needed package is available --- R/plotMethods.R | 4 +++- man/plotFunctions.Rd | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/R/plotMethods.R b/R/plotMethods.R index 1685637a..95354df1 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -260,7 +260,9 @@ #' plotExpectedVsObservedPsi(fds, res=res[1]) #' #' # create manhattan plot of pvalues by genomic position -#' plotManhattan(fds, type="jaccard", sampleID="sample10") +#' if(require(ggbio)){ +#' plotManhattan(fds, type="jaccard", sampleID="sample10") +#' } #' #' # plot splice graph and coverage from bam files in a given region #' if(require(SGSeq)){ diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd index dfd317a2..927d3d11 100644 --- a/man/plotFunctions.Rd +++ b/man/plotFunctions.Rd @@ -505,7 +505,9 @@ plotQQ(fds, result=res[1]) plotExpectedVsObservedPsi(fds, res=res[1]) # create manhattan plot of pvalues by genomic position -plotManhattan(fds, type="jaccard", sampleID="sample10") +if(require(ggbio)){ + plotManhattan(fds, type="jaccard", sampleID="sample10") +} # plot splice graph and coverage from bam files in a given region if(require(SGSeq)){ From 03e678fd45a6680e4a907395c0b37dcf74402b3a Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Mon, 5 Dec 2022 17:34:05 +0100 Subject: [PATCH 51/80] R CMD CHECK set to ignore warnings; removed zscoreCutoff and added option to specify filters to annotate only certain types of genes --- .github/workflows/check-bioc.yml | 2 +- R/AllGenerics.R | 42 ++++++++------------------ R/Fraser-pipeline.R | 4 +-- R/annotationOfRanges.R | 21 ++++++++++--- R/example_functions.R | 2 +- R/getNSetterFuns.R | 6 ++-- R/plotMethods.R | 8 ++--- R/resultAnnotations.R | 6 ++-- man/annotateRanges.Rd | 8 ++++- man/getter_setter_functions.Rd | 3 ++ man/plotFunctions.Rd | 5 ++- man/results.Rd | 18 ++++------- man/spliceTypeAnnotations.Rd | 6 ++-- tests/testthat/test_fraser_pipeline.R | 3 +- tests/testthat/test_plotJunctionDist.R | 2 +- tests/testthat/test_stats.R | 26 ++++++++-------- vignettes/FRASER.Rnw | 41 +++++-------------------- 17 files changed, 88 insertions(+), 115 deletions(-) diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml index 946e96ef..e32789e2 100644 --- a/.github/workflows/check-bioc.yml +++ b/.github/workflows/check-bioc.yml @@ -249,7 +249,7 @@ jobs: rcmdcheck::rcmdcheck( args = c("--no-manual", "--no-vignettes", "--timings"), build_args = c("--no-manual", "--keep-empty-dirs", "--no-resave-data"), - error_on = "warning", + error_on = "error", check_dir = "check" ) shell: Rscript {0} diff --git a/R/AllGenerics.R b/R/AllGenerics.R index ca46ba7f..55fe3f97 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -620,7 +620,7 @@ mapSeqlevels <- function(fds, style="UCSC", ...){ #' #' retrieve a single sample result object #' @noRd -resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores, +resultsSingleSample <- function(sampleID, gr, pvals, padjs, psivals, rawCts, rawTotalCts, rawNonsplitCts, rawNsProportion, nsProportion_99quantile, deltaPsiVals, psiType, rowMeansK, rowMeansN, @@ -665,7 +665,6 @@ resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores, mcols(ans)$type <- Rle(psiType) mcols(ans)$pValue <- signif(pvals[goodCut,sampleID], 5) mcols(ans)$padjust <- signif(padjs[goodCut,sampleID], 5) - mcols(ans)$zScore <- Rle(round(zscores[goodCut,sampleID], 2)) mcols(ans)$psiValue <- Rle(round(psivals[goodCut,sampleID], 2)) mcols(ans)$deltaPsi <- round(deltaPsiVals[goodCut,sampleID], 2) mcols(ans)$counts <- Rle(rawCts[goodCut, sampleID]) @@ -711,7 +710,7 @@ resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores, return(ans[order(mcols(ans)$pValue, -abs(mcols(ans)$deltaPsi))]) } -FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, +FRASER.results <- function(object, sampleIDs, fdrCutoff, dPsiCutoff, minCount, rhoCutoff, psiType, maxCols=20, aggregate=FALSE, collapse=FALSE, geneColumn="hgnc_symbol", BPPARAM=bpparam(), @@ -763,7 +762,6 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, filters=list(rho=rhoCutoff))) padjs <- as.matrix(padjVals(tmp_x, filters=list(rho=rhoCutoff))) - zscores <- as.matrix(zScores(tmp_x)) psivals <- as.matrix(assay(tmp_x, type)) muPsi <- as.matrix(predictedMeans(tmp_x)) psivals_pc <- (rawCts + pseudocount()) / @@ -772,7 +770,6 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, rho <- rho(tmp_x, type) aberrant <- aberrant.FRASER(tmp_x, type=type, padjCutoff=fdrCutoff, - zScoreCutoff=zscoreCutoff, deltaPsiCutoff=dPsiCutoff, minCount=minCount, rhoCutoff=rhoCutoff, @@ -783,7 +780,6 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, padjsGene <- as.matrix(padjVals(tmp_x, level="gene")) aberrantGene <- aberrant.FRASER(tmp_x, type=type, padjCutoff=fdrCutoff, - zScoreCutoff=zscoreCutoff, deltaPsiCutoff=dPsiCutoff, minCount=minCount, rhoCutoff=rhoCutoff, @@ -798,7 +794,6 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, if(length(sc) == 1){ colnames(pvals) <- sc colnames(padjs) <- sc - colnames(zscores) <- sc colnames(deltaPsiVals) <- sc } @@ -817,7 +812,7 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, # create result table sampleRes <- lapply(sc, resultsSingleSample, gr=gr, pvals=pvals, - padjs=padjs, zscores=zscores, psiType=type, + padjs=padjs, psiType=type, psivals=psivals, deltaPsiVals=deltaPsiVals, rawCts=rawCts, rawTotalCts=rawTotalCts, rawNonsplitCts=rawNonsplitCts, @@ -866,7 +861,6 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, #' @param sampleIDs A vector of sample IDs for which results should be #' retrieved #' @param padjCutoff The FDR cutoff to be applied or NA if not requested. -#' @param zScoreCutoff The z-score cutoff to be applied or NA if not requested. #' @param deltaPsiCutoff The cutoff on delta psi or NA if not requested. #' @param minCount The minimum count value of the total coverage of an intron #' to be considered as significant. @@ -891,7 +885,7 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, #' types to return only one row per feature (gene) and sample. #' @param geneColumn The column name of the column that has the gene annotation #' that will be used for gene-level pvalue computation. -#' @param ... Further arguments can be passed to the method. If "n", "zscores", +#' @param ... Further arguments can be passed to the method. If "n", #' "padjVals", "dPsi" or "rhoVals" are given, the values of those #' arguments are used to define the aberrant events. #' @@ -908,18 +902,17 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, #' #' # extract results: for this example dataset, no cutoffs are used to #' # get at least one result and show the output -#' res <- results(fds, padjCutoff=NA, zScoreCutoff=NA, deltaPsiCutoff=NA) +#' res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA) #' res #' #' # aggregate the results by genes (gene symbols need to be annotated first #' # using annotateRanges() function) -#' results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=0.1, -#' aggregate=TRUE) +#' results(fds, padjCutoff=NA, deltaPsiCutoff=0.1, aggregate=TRUE) #' #' # aggregate the results by genes and collapse over all psi types to obtain #' # only one row per gene in the results table -#' results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=0.1, -#' aggregate=TRUE, collapse=TRUE) +#' results(fds, padjCutoff=NA, deltaPsiCutoff=0.1, aggregate=TRUE, +#' collapse=TRUE) #' #' # get aberrant events per sample: on the example data, nothing is aberrant #' # based on the adjusted p-value @@ -927,21 +920,20 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, #' #' # get aberrant events per gene (first annotate gene symbols) #' fds <- annotateRangesWithTxDb(fds) -#' aberrant(fds, type="jaccard", by="feature", zScoreCutoff=2, padjCutoff=NA, -#' aggregate=TRUE) +#' aberrant(fds, type="jaccard", by="feature", padjCutoff=NA, aggregate=TRUE) #' #' # find aberrant junctions/splice sites #' aberrant(fds, type="jaccard") #' @export setMethod("results", "FraserDataSet", function(object, sampleIDs=samples(object), padjCutoff=0.05, - zScoreCutoff=NA, deltaPsiCutoff=0.1, + deltaPsiCutoff=0.1, rhoCutoff=1, aggregate=FALSE, collapse=FALSE, minCount=5, psiType=psiTypes, geneColumn="hgnc_symbol", additionalColumns=NULL, BPPARAM=bpparam()){ FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff, - zscoreCutoff=zScoreCutoff, dPsiCutoff=deltaPsiCutoff, + dPsiCutoff=deltaPsiCutoff, rhoCutoff=rhoCutoff, minCount=minCount, psiType=match.arg(psiType, several.ok=TRUE), aggregate=aggregate, collapse=collapse, geneColumn=geneColumn, @@ -950,12 +942,11 @@ setMethod("results", "FraserDataSet", function(object, aberrant.FRASER <- function(object, type=fitMetrics(object), padjCutoff=0.05, deltaPsiCutoff=0.1, - zScoreCutoff=NA, minCount=5, rhoCutoff=1, + minCount=5, rhoCutoff=1, by=c("none", "sample", "feature"), aggregate=FALSE, geneColumn="hgnc_symbol", ...){ checkNaAndRange(padjCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE) - checkNaAndRange(zScoreCutoff, min=0, max=Inf, scalar=TRUE, na.ok=TRUE) checkNaAndRange(deltaPsiCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE) checkNaAndRange(rhoCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE) checkNaAndRange(minCount, min=0, max=Inf, scalar=TRUE, na.ok=TRUE) @@ -968,11 +959,6 @@ aberrant.FRASER <- function(object, type=fitMetrics(object), } else { n <- N(object, type=type) } - if("zscores" %in% names(dots)){ - zscores <- dots[['zscores']] - } else { - zscores <- zScores(object, type=type) - } if("padjVals" %in% names(dots)){ padj <- dots[['padjVals']] } else { @@ -1019,10 +1005,6 @@ aberrant.FRASER <- function(object, type=fitMetrics(object), if(!is.na(minCount)){ aberrantEvents <- aberrantEvents & as.matrix(n >= minCount) } - if(!is.na(zScoreCutoff)){ - aberrantEvents <- aberrantEvents & - as.matrix(abs(zscores) >= zScoreCutoff) - } if(!is.na(deltaPsiCutoff)){ aberrantEvents <- aberrantEvents & as.matrix(abs(dpsi) >= deltaPsiCutoff) diff --git a/R/Fraser-pipeline.R b/R/Fraser-pipeline.R index 7b9d0e71..20485299 100644 --- a/R/Fraser-pipeline.R +++ b/R/Fraser-pipeline.R @@ -129,8 +129,8 @@ FRASER <- function(fds, q, type=fitMetrics(fds), message(date(), ": Adjust p values for: '", i, "'.") fds <- calculatePadjValues(fds, type=i) - message(date(), ": Compute Z scores for: '", i, "'.") - fds <- calculateZscore(fds, type=i) + # message(date(), ": Compute Z scores for: '", i, "'.") + # fds <- calculateZscore(fds, type=i) } # return final analysis diff --git a/R/annotationOfRanges.R b/R/annotationOfRanges.R index 13939780..8796cd0b 100644 --- a/R/annotationOfRanges.R +++ b/R/annotationOfRanges.R @@ -17,6 +17,10 @@ #' \code{TxDb.Hsapiens.UCSC.hg19.knownGene}. #' @param orgDb An \code{orgDb} object or a data table to map the feature names. #' If this is NULL, then \code{org.Hs.eg.db} is used as the default. +#' @param filter A named list specifying the filters which should be applied to +#' subset to e.g. only protein-coding genes for annotation. +#' \code{names(filter)} needs to be column names in the given +#' orgDb object (default: no filtering). #' @param keytype The keytype or column name of gene IDs in the \code{TxDb} #' object (see #' \code{\link[AnnotationDbi:AnnotationDb-class]{keytypes}} @@ -104,7 +108,7 @@ annotateRanges <- function(fds, feature="hgnc_symbol", featureName=feature, #' @export annotateRangesWithTxDb <- function(fds, feature="SYMBOL", featureName="hgnc_symbol", keytype="ENTREZID", - txdb=NULL, orgDb=NULL){ + txdb=NULL, orgDb=NULL, filter=list()){ gene_id <- NULL # check input @@ -136,15 +140,24 @@ annotateRangesWithTxDb <- function(fds, feature="SYMBOL", if(is.data.table(orgDb)){ tmp <- merge(x=as.data.table(anno)[,.(gene_id)], y=orgDb, by.y=keytype, by.x="gene_id", all.x=TRUE, sort=FALSE)[, - .(gene_id, feature=get(feature))] - setnames(tmp, "feature", feature) + c("gene_id", feature, names(filter)), with=FALSE] } else { tmp <- as.data.table(select(orgDb, keys=mcols(anno)[,"gene_id"], - columns=feature, keytype=keytype)) + columns=c(feature, names(filter)), keytype=keytype)) } + # filter genes as specified by user (e.g. only protein_coding) + tmp[, include:=TRUE] + if(!is.null(filter) & length(filter) > 0 & !is.null(names(filter))){ + for(n in names(filter)){ + stopifnot(n %in% colnames(tmp)) + tmp[!(get(n) %in% filter[[n]]), include:=FALSE] + } + } + # add the new feature to the annotation tmp[, uniqueID := .GRP, by=keytype] + tmp <- tmp[include == TRUE,] anno <- anno[tmp[,uniqueID]] mcols(anno)[[featureName]] <- tmp[,get(feature)] diff --git a/R/example_functions.R b/R/example_functions.R index cec001f6..c4b992be 100644 --- a/R/example_functions.R +++ b/R/example_functions.R @@ -61,7 +61,7 @@ createTestFraserDataSet <- function(workingDir="FRASER_output", rerun=FALSE){ if(all(file.exists(hdf5Files))){ if(isFALSE(rerun)){ fds <- loadFraserDataSet(workingDir, name="Data_Analysis") - if(all(paste0(c("zScores", "padjBetaBinomial", "predictedMeans"), + if(all(paste0(c("padjBetaBinomial", "predictedMeans"), "_", rep(psiTypes, 3)) %in% assayNames(fds))){ message(date(), ": Use existing cache data.") return(fds) diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R index c0bb9d65..ed19427f 100644 --- a/R/getNSetterFuns.R +++ b/R/getNSetterFuns.R @@ -43,6 +43,9 @@ #' # get statistics #' pVals(fds) #' padjVals(fds) +#' +#' # zscore not calculated by default +#' fds <- calculateZscore(fds, type="jaccard") #' zScores(fds) #' #' # set and get pseudocount @@ -701,7 +704,6 @@ getPlottingDT <- function(fds, axis=c("row", "col"), type=currentType(fds), pval = c(pVals(fds, type=type, level=pvalLevel)[idxrow, idxcol]), padj = c(padjVals(fds, type=type)[idxrow, idxcol]), - zscore = c(zScores(fds, type=type)[idxrow, idxcol]), obsPsi = c(k/n), predPsi = c(predictedMeans(fds, type)[idxrow, idxcol]), rho = rep(rho(fds, type=type)[idxrow], @@ -711,7 +713,7 @@ getPlottingDT <- function(fds, axis=c("row", "col"), type=currentType(fds), # add aberrant information to it aberrantVec <- aberrant(fds, ..., padjVals=dt[,.(padj)], - dPsi=dt[,.(deltaPsi)], zscores=dt[,.(zscore)], n=dt[,.(n)], + dPsi=dt[,.(deltaPsi)], n=dt[,.(n)], rhoVals=dt[,.(rho)], aggregate=FALSE) dt[,aberrant:=aberrantVec] diff --git a/R/plotMethods.R b/R/plotMethods.R index 95354df1..e53ce761 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -31,7 +31,7 @@ #' @param idx A junction site ID or gene ID or one of both, which #' should be plotted. Can also be a vector. Integers are treated #' as indices. -#' @param padjCutoff,zScoreCutoff,deltaPsiCutoff Significance, Z-score or delta +#' @param padjCutoff,deltaPsiCutoff Significance or delta #' psi cutoff to mark outliers #' @param global Flag to plot a global Q-Q plot, default FALSE #' @param normalized If TRUE, the normalized psi values are used, the default, @@ -282,7 +282,7 @@ #' require(org.Hs.eg.db) #' orgDb <- org.Hs.eg.db #' -#' res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA, zScoreCutoff=NA) +#' res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA) #' res_dt <- as.data.table(res) #' res_dt <- res_dt[sampleID == "sample2",] #' @@ -403,7 +403,7 @@ setMethod("plotVolcano", signature="FraserDataSet", plotVolcano.FRASER) plotAberrantPerSample.FRASER <- function(object, main, type=fitMetrics(object), - padjCutoff=0.1, zScoreCutoff=NA, deltaPsiCutoff=0.1, + padjCutoff=0.1, deltaPsiCutoff=0.1, aggregate=TRUE, BPPARAM=bpparam(), ...){ type <- match.arg(type, several.ok=TRUE) @@ -417,7 +417,7 @@ plotAberrantPerSample.FRASER <- function(object, main, # extract outliers outliers <- bplapply(type, aberrant, object=object, by="sample", - padjCutoff=padjCutoff, zScoreCutoff=zScoreCutoff, + padjCutoff=padjCutoff, deltaPsiCutoff=deltaPsiCutoff, aggregate=aggregate, ..., BPPARAM=BPPARAM) dt2p <- rbindlist(lapply(seq_along(outliers), function(idx){ diff --git a/R/resultAnnotations.R b/R/resultAnnotations.R index 2dbd9a5b..bfb57620 100644 --- a/R/resultAnnotations.R +++ b/R/resultAnnotations.R @@ -69,9 +69,9 @@ #' # run this function before creating the results table #' fds <- annotateIntronReferenceOverlap(fds, txdb) #' -#' # extract results: for this small example dataset, only a z score cutoff -#' # of 1 is used to get at least one result. -#' res <- results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=NA) +#' # extract results: for this small example dataset, no cutoffs used +#' # to get some results +#' res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA) #' #' # annotate the type of splice event and UTR overlap #' res <- annotateSpliceEventType(result=res, txdb=txdb, fds=fds) diff --git a/man/annotateRanges.Rd b/man/annotateRanges.Rd index 4d1639f9..e74728b2 100644 --- a/man/annotateRanges.Rd +++ b/man/annotateRanges.Rd @@ -20,7 +20,8 @@ annotateRangesWithTxDb( featureName = "hgnc_symbol", keytype = "ENTREZID", txdb = NULL, - orgDb = NULL + orgDb = NULL, + filter = list() ) } \arguments{ @@ -52,6 +53,11 @@ one is used, currently this is \item{orgDb}{An \code{orgDb} object or a data table to map the feature names. If this is NULL, then \code{org.Hs.eg.db} is used as the default.} + +\item{filter}{A named list specifying the filters which should be applied to +subset to e.g. only protein-coding genes for annotation. +\code{names(filter)} needs to be column names in the given +orgDb object (default: no filtering).} } \value{ FraserDataSet diff --git a/man/getter_setter_functions.Rd b/man/getter_setter_functions.Rd index c57387d7..19828cc5 100644 --- a/man/getter_setter_functions.Rd +++ b/man/getter_setter_functions.Rd @@ -198,6 +198,9 @@ rho(fds) # get statistics pVals(fds) padjVals(fds) + +# zscore not calculated by default +fds <- calculateZscore(fds, type="jaccard") zScores(fds) # set and get pseudocount diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd index 927d3d11..79b1db7c 100644 --- a/man/plotFunctions.Rd +++ b/man/plotFunctions.Rd @@ -42,7 +42,6 @@ plotManhattan(object, ...) main, type = fitMetrics(object), padjCutoff = 0.1, - zScoreCutoff = NA, deltaPsiCutoff = 0.1, aggregate = TRUE, BPPARAM = bpparam(), @@ -202,7 +201,7 @@ samples. Labelling can be turned off by setting \code{label=NULL}. The user can also provide a custom list of gene symbols or sampleIDs.} -\item{padjCutoff, zScoreCutoff, deltaPsiCutoff}{Significance, Z-score or delta +\item{padjCutoff, deltaPsiCutoff}{Significance or delta psi cutoff to mark outliers} \item{BPPARAM}{BiocParallel parameter to use.} @@ -527,7 +526,7 @@ if(require(SGSeq)){ require(org.Hs.eg.db) orgDb <- org.Hs.eg.db - res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA, zScoreCutoff=NA) + res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA) res_dt <- as.data.table(res) res_dt <- res_dt[sampleID == "sample2",] diff --git a/man/results.Rd b/man/results.Rd index 500bcbb6..7c536129 100644 --- a/man/results.Rd +++ b/man/results.Rd @@ -9,7 +9,6 @@ object, sampleIDs = samples(object), padjCutoff = 0.05, - zScoreCutoff = NA, deltaPsiCutoff = 0.1, rhoCutoff = 1, aggregate = FALSE, @@ -26,7 +25,6 @@ type = fitMetrics(object), padjCutoff = 0.05, deltaPsiCutoff = 0.1, - zScoreCutoff = NA, minCount = 5, rhoCutoff = 1, by = c("none", "sample", "feature"), @@ -43,8 +41,6 @@ retrieved} \item{padjCutoff}{The FDR cutoff to be applied or NA if not requested.} -\item{zScoreCutoff}{The z-score cutoff to be applied or NA if not requested.} - \item{deltaPsiCutoff}{The cutoff on delta psi or NA if not requested.} \item{rhoCutoff}{The cutoff value on the fitted rho value @@ -80,7 +76,7 @@ are included.} \code{sample} or \code{feature} is specified the sum by sample or feature is returned} -\item{...}{Further arguments can be passed to the method. If "n", "zscores", +\item{...}{Further arguments can be passed to the method. If "n", "padjVals", "dPsi" or "rhoVals" are given, the values of those arguments are used to define the aberrant events.} } @@ -102,18 +98,17 @@ fds <- createTestFraserDataSet() # extract results: for this example dataset, no cutoffs are used to # get at least one result and show the output -res <- results(fds, padjCutoff=NA, zScoreCutoff=NA, deltaPsiCutoff=NA) +res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA) res # aggregate the results by genes (gene symbols need to be annotated first # using annotateRanges() function) -results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=0.1, - aggregate=TRUE) +results(fds, padjCutoff=NA, deltaPsiCutoff=0.1, aggregate=TRUE) # aggregate the results by genes and collapse over all psi types to obtain # only one row per gene in the results table -results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=0.1, - aggregate=TRUE, collapse=TRUE) +results(fds, padjCutoff=NA, deltaPsiCutoff=0.1, aggregate=TRUE, + collapse=TRUE) # get aberrant events per sample: on the example data, nothing is aberrant # based on the adjusted p-value @@ -121,8 +116,7 @@ aberrant(fds, type="jaccard", by="sample") # get aberrant events per gene (first annotate gene symbols) fds <- annotateRangesWithTxDb(fds) -aberrant(fds, type="jaccard", by="feature", zScoreCutoff=2, padjCutoff=NA, - aggregate=TRUE) +aberrant(fds, type="jaccard", by="feature", padjCutoff=NA, aggregate=TRUE) # find aberrant junctions/splice sites aberrant(fds, type="jaccard") diff --git a/man/spliceTypeAnnotations.Rd b/man/spliceTypeAnnotations.Rd index 4556b7b6..0b14439c 100644 --- a/man/spliceTypeAnnotations.Rd +++ b/man/spliceTypeAnnotations.Rd @@ -119,9 +119,9 @@ regions of the genome are determined from the provided BED file. # run this function before creating the results table fds <- annotateIntronReferenceOverlap(fds, txdb) - # extract results: for this small example dataset, only a z score cutoff - # of 1 is used to get at least one result. - res <- results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=NA) + # extract results: for this small example dataset, no cutoffs used + # to get some results + res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA) # annotate the type of splice event and UTR overlap res <- annotateSpliceEventType(result=res, txdb=txdb, fds=fds) diff --git a/tests/testthat/test_fraser_pipeline.R b/tests/testthat/test_fraser_pipeline.R index 55b0e54d..45c9b2ff 100644 --- a/tests/testthat/test_fraser_pipeline.R +++ b/tests/testthat/test_fraser_pipeline.R @@ -4,8 +4,7 @@ test_that("FRASER function", { fds <- createTestFraserDataSet() expect_is(fds, "FraserDataSet") anames <- c(psiTypes, paste0(c("delta", "predictedMeans", - "pvaluesBetaBinomial", "padjBetaBinomial", - "zScores"), "_", + "pvaluesBetaBinomial", "padjBetaBinomial"), "_", rep(fitMetrics(fds), 5))) expect_equal(anames %in% assayNames(fds), !logical(length(anames))) }) diff --git a/tests/testthat/test_plotJunctionDist.R b/tests/testthat/test_plotJunctionDist.R index 1394f13c..43022359 100644 --- a/tests/testthat/test_plotJunctionDist.R +++ b/tests/testthat/test_plotJunctionDist.R @@ -3,7 +3,7 @@ context("Test distribution plots for given results/junction") test_that("Main junction distribution plot", { # get results fds <- getFraser() - res <- results(fds, padjCutoff=1, zScoreCutoff=NA, deltaPsiCutoff=NA) + res <- results(fds, padjCutoff=1, deltaPsiCutoff=NA) # plot distributions expect_silent(plotExpression(fds, result=res[1])) diff --git a/tests/testthat/test_stats.R b/tests/testthat/test_stats.R index 5e389ec8..dcbe5bea 100644 --- a/tests/testthat/test_stats.R +++ b/tests/testthat/test_stats.R @@ -17,19 +17,19 @@ test_that("PSI value calculation", { expect_true(all(N(fds, "psi5")[ is.na(psiVal)] == 0)) }) -test_that("Zscore calculation", { - fds <- getFraser(clean = TRUE) - - # prepare zScore input for logit scale - psiVal <- (K(fds, "jaccard") + pseudocount())/(N(fds, "jaccard") + 2*pseudocount()) - mu <- predictedMeans(fds, "jaccard") - residual <- qlogis(psiVal) - qlogis(mu) - - # compute zscore - zscores <- (residual - rowMeans(residual)) / rowSds(residual) - - expect_equal(zscores, zScores(fds, "jaccard")) -}) +# test_that("Zscore calculation", { +# fds <- getFraser(clean = TRUE) +# +# # prepare zScore input for logit scale +# psiVal <- (K(fds, "jaccard") + pseudocount())/(N(fds, "jaccard") + 2*pseudocount()) +# mu <- predictedMeans(fds, "jaccard") +# residual <- qlogis(psiVal) - qlogis(mu) +# +# # compute zscore +# zscores <- (residual - rowMeans(residual)) / rowSds(residual) +# +# expect_equal(zscores, zScores(fds, "jaccard")) +# }) test_that("Gene p value calculation with NAs", { fds <- getFraser() diff --git a/vignettes/FRASER.Rnw b/vignettes/FRASER.Rnw index 51783ed1..ef49a129 100644 --- a/vignettes/FRASER.Rnw +++ b/vignettes/FRASER.Rnw @@ -247,8 +247,8 @@ fds <- FRASER(fds, q=c(jaccard=2)) # get results: we recommend to use an FDR cutoff 0.05, but due to the small # dataset size we extract all events and their associated values -# eg: res <- results(fds, zScoreCutoff=NA, padjCutoff=0.05, deltaPsiCutoff=0.3) -res <- results(fds, zScoreCutoff=NA, padjCutoff=NA, deltaPsiCutoff=NA) +# eg: res <- results(fds, padjCutoff=0.05, deltaPsiCutoff=0.1) +res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA) res # result visualization @@ -412,9 +412,9 @@ be processed with the same protocol and origin from the same tissue. \label{sec:filtering} Before we can filter the data, we have to compute the main splicing metric: -the $\psi$-value (Percent Spliced In). +the $\psi$-value (Percent Spliced In) and the Intron Jaccard Index. -<>= +<>= fds <- calculatePSIValues(fds) fds @ @@ -547,7 +547,7 @@ the following additional information: \item type: the metric for which the aberrant event was detected (either jaccard for Intron Jaccard Index or psi5 for $\psi_5$, psi3 for $\psi_3$ or theta for $\theta$) - \item pValue, padjust, zScore: the p-value, adjusted p-value and z-score of + \item pValue, padjust: the p-value and adjusted p-value (FDR) of this event (at intron or splice site level depending on metric) \item pValueGene, padjustGene: only present in the gene-level results table, gives the p-value and FDR adjusted p-value at gene-level @@ -584,8 +584,8 @@ sashimi plot for an outlier in the results table directly in R (if paths to bam files are available in the \fds{} object). <>= -# to show result visualization functions for this tuturial, zScore cutoff used -res <- results(fds, zScoreCutoff=2, padjCutoff=NA, deltaPsiCutoff=NA) +# to show result visualization functions for this tuturial, no cutoff used +res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA) res # for the gene level pvalues, gene symbols need to be annotated the fds object @@ -593,8 +593,7 @@ res # as we previously called FRASER() before annotating genes, we run it again here fds <- calculatePadjValues(fds, type="jaccard", geneLevel=TRUE) # generate gene-level results table (if gene symbols have been annotated) -res_gene <- results(fds, aggregate=TRUE, padjCutoff=NA, deltaPsiCutoff=NA, - zScoreCutoff=2) +res_gene <- results(fds, aggregate=TRUE, padjCutoff=NA, deltaPsiCutoff=NA) res_gene @ @@ -733,30 +732,6 @@ fds <- calculatePadjValues(fds, type="jaccard", method="BY") head(padjVals(fds,type="jaccard")) @ -\subsection{Z-score calculation} -\label{sec:Z-score-calculation} - -To calculate z-scores on the logit transformed $\Delta\psi$ values and to store -them in the \fds{} object, the function \Rfunction{calculateZScores} can be -called. The Z-scores can be used for visualization, filtering, and ranking of -samples. The Z-scores are calculated as follows: - -\begin{equation} - z_{ij} = \frac{\delta_{ij} - \bar{\delta_j}}{sd(\delta_j)} -\end{equation} -\begin{equation*} - \delta_{ij} = logit{(\frac{k_{ij} + 1}{n_{ij} + 2})} - logit{(\mu_{ij})}, -\end{equation*} - -where $\delta_{ij}$ is the difference on the logit scale between the measured -counts and the counts after correction for confounders and $\bar{\delta_j}$ is -the mean of intron $j$. - -<>= -fds <- calculateZscore(fds, type="jaccard") -head(zScores(fds, type="jaccard")) -@ - \subsection{Result visualization} \label{sec:result-vis} From 39d03b29f4b79719f8fdfe213773e326921c2691 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Wed, 7 Dec 2022 13:10:55 +0100 Subject: [PATCH 52/80] fixed gene-level pvalue retrieval to work with and without rhoCutoff --- R/AllGenerics.R | 11 +++++++++-- R/getNSetterFuns.R | 28 ++++++++++++++++------------ R/pvalsNzscore.R | 22 +++++++++++++--------- man/FRASER.Rd | 4 ++-- man/getter_setter_functions.Rd | 4 ++-- man/results.Rd | 4 ++-- 6 files changed, 44 insertions(+), 29 deletions(-) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 55fe3f97..886f625d 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -726,6 +726,9 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, # only extract results for requested psiTypes if pvals exist for them stopifnot(all(psiType %in% psiTypes)) + if(is.na(rhoCutoff)){ + rhoCutoff <- 1 + } pvalsAvailable <- checkPadjAvailableForFilters(object, type=psiType, filters=list(rho=rhoCutoff), aggregate=aggregate) @@ -928,7 +931,7 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, setMethod("results", "FraserDataSet", function(object, sampleIDs=samples(object), padjCutoff=0.05, deltaPsiCutoff=0.1, - rhoCutoff=1, aggregate=FALSE, collapse=FALSE, + rhoCutoff=NA, aggregate=FALSE, collapse=FALSE, minCount=5, psiType=psiTypes, geneColumn="hgnc_symbol", additionalColumns=NULL, BPPARAM=bpparam()){ @@ -942,7 +945,7 @@ setMethod("results", "FraserDataSet", function(object, aberrant.FRASER <- function(object, type=fitMetrics(object), padjCutoff=0.05, deltaPsiCutoff=0.1, - minCount=5, rhoCutoff=1, + minCount=5, rhoCutoff=NA, by=c("none", "sample", "feature"), aggregate=FALSE, geneColumn="hgnc_symbol", ...){ @@ -953,6 +956,10 @@ aberrant.FRASER <- function(object, type=fitMetrics(object), by <- match.arg(by) type <- match.arg(type) + if(is.na(rhoCutoff)){ + rhoCutoff <- 1 + } + dots <- list(...) if("n" %in% names(dots)){ n <- dots[['n']] diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R index ed19427f..250ad922 100644 --- a/R/getNSetterFuns.R +++ b/R/getNSetterFuns.R @@ -254,7 +254,7 @@ zScores <- function(fds, type=currentType(fds), byGroup=FALSE, ...){ #' @describeIn getter_setter_functions This returns the calculated p-values. #' @export pVals <- function(fds, type=currentType(fds), level="site", - filters=list(rho=1), dist="BetaBinomial", ...){ + filters=list(), dist="BetaBinomial", ...){ level <- match.arg(level, choices=c("site", "junction", "gene")) dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) aname <- paste0("pvalues", dist) @@ -269,10 +269,14 @@ pVals <- function(fds, type=currentType(fds), level="site", } else{ aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname) # add information on used filters + if(is.null(names(filters))){ + filters <- list(rho=1) + } for(n in sort(names(filters))){ aname_new <- paste0(aname, "_", n, filters[[n]]) if(n == "rho" && filters[[n]] == 1){ - if(any(grepl(aname_new, assayNames(fds)))){ + if(any(grepl(aname_new, assayNames(fds))) || + any(grepl(aname_new, names(metadata(fds))))){ aname <- aname_new } }else{ @@ -292,7 +296,7 @@ pVals <- function(fds, type=currentType(fds), level="site", } `pVals<-` <- function(fds, type=currentType(fds), level="site", - filters=list(rho=1), + filters=list(), dist="BetaBinomial", ..., value){ level <- match.arg(level, choices=c("site", "junction", "gene")) dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) @@ -306,9 +310,7 @@ pVals <- function(fds, type=currentType(fds), level="site", } # add information on used filters for(n in sort(names(filters))){ - if(!(n == "rho" && filters[[n]] == 1)){ - aname <- paste0(aname, "_", n, filters[[n]]) - } + aname <- paste0(aname, "_", n, filters[[n]]) } if(level == "gene"){ @@ -325,7 +327,7 @@ pVals <- function(fds, type=currentType(fds), level="site", #' @describeIn getter_setter_functions This returns the adjusted p-values. #' @export padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"), - level="site", filters=list(rho=1), ...){ + level="site", filters=list(), ...){ level <- match.arg(level, choices=c("site", "gene")) dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) aname <- paste0("padj", dist) @@ -333,8 +335,12 @@ padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"), # add information on used filters for(n in sort(names(filters))){ aname_new <- paste0(aname, "_", n, filters[[n]]) + if(is.null(names(filters))){ + filters <- list(rho=1) + } if(n == "rho" && filters[[n]] == 1){ - if(any(grepl(aname_new, assayNames(fds)))){ + if(any(grepl(aname_new, assayNames(fds))) || + any(grepl(aname_new, names(metadata(fds))))){ aname <- aname_new } }else{ @@ -352,16 +358,14 @@ padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"), } `padjVals<-` <- function(fds, type=currentType(fds), level="site", - dist="BetaBinomial", filters=list(rho=1), ..., value){ + dist="BetaBinomial", filters=list(), ..., value){ level <- match.arg(level, choices=c("site", "gene")) dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) aname <- paste0("padj", dist) aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname) # add information on used filters for(n in sort(names(filters))){ - if(!(n == "rho" && filters[[n]] == 1)){ - aname <- paste0(aname, "_", n, filters[[n]]) - } + aname <- paste0(aname, "_", n, filters[[n]]) } if(level == "gene"){ if(is.null(rownames(value))){ diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R index f1dbcb45..61a8a2da 100644 --- a/R/pvalsNzscore.R +++ b/R/pvalsNzscore.R @@ -222,7 +222,7 @@ singlePvalueBinomial <- function(idx, k, n, mu){ #' multiple testing correction. #' @param rhoCutoff The cutoff value on the fitted rho value #' (overdispersion parameter of the betabinomial) above which junctions are -#' masked with NA during p value adjustment. +#' masked with NA during p value adjustment (default: NA, no masking). #' @param geneLevel Logical value indiciating whether gene-level p values #' should be calculated. Defaults to TRUE. #' @param geneColumn The column name of the column that has the gene annotation @@ -230,7 +230,7 @@ singlePvalueBinomial <- function(idx, k, n, mu){ #' #' @export calculatePadjValues <- function(fds, type=currentType(fds), method="BY", - rhoCutoff=1, geneLevel=TRUE, + rhoCutoff=NA, geneLevel=TRUE, geneColumn="hgnc_symbol", BPPARAM=bpparam()){ currentType(fds) <- type index <- getSiteIndex(fds, type=type) @@ -247,11 +247,15 @@ calculatePadjValues <- function(fds, type=currentType(fds), method="BY", # splice site-level pval correction message(date(), ": adjusting junction-level pvalues ...") - # fwer_pvals <- getFWERpvals_byIdx(pvals, index, rho, method="holm", - # rhoCutoff=rhoCutoff, BPPARAM=BPPARAM) fwer_pvals <- getFWERpvals_bySample(pvals, index, rho, method="holm", - rhoCutoff=rhoCutoff, BPPARAM=BPPARAM) - pVals(fds, dist=i, level="site", filters=list(rho=rhoCutoff), + rhoCutoff=ifelse(is.na(rhoCutoff), 1, rhoCutoff), + BPPARAM=BPPARAM) + if(!is.na(rhoCutoff)){ + filters <- list(rho=rhoCutoff) + } else{ + filters <- list() + } + pVals(fds, dist=i, level="site", filters=filters, withDimnames=FALSE) <- fwer_pvals # junction-level FDR correction @@ -259,7 +263,7 @@ calculatePadjValues <- function(fds, type=currentType(fds), method="BY", padj <- apply(fwer_pvals[idx,], 2, p.adjust, method=method) padjDT <- data.table(cbind(i=unique(index), padj), key="i")[J(index)] padjDT[,i:=NULL] - padjVals(fds, dist=i, level="site", filters=list(rho=rhoCutoff), + padjVals(fds, dist=i, level="site", filters=filters, withDimnames=FALSE) <- as.matrix(padjDT) # gene-level pval correction and FDR @@ -270,9 +274,9 @@ calculatePadjValues <- function(fds, type=currentType(fds), method="BY", method="holm", FDRmethod=method, geneColumn=geneColumn, BPPARAM=BPPARAM) - pVals(fds, dist=i, level="gene", filters=list(rho=rhoCutoff), + pVals(fds, dist=i, level="gene", filters=filters, withDimnames=FALSE) <- gene_pvals[["pvals"]] - padjVals(fds, dist=i, level="gene", filters=list(rho=rhoCutoff), + padjVals(fds, dist=i, level="gene", filters=filters, withDimnames=FALSE) <- gene_pvals[["padj"]] } else if(isTRUE(geneLevel)){ warning("Gene-level pvalues could not be calculated as column ", diff --git a/man/FRASER.Rd b/man/FRASER.Rd index f5961c5b..1b2e5f72 100644 --- a/man/FRASER.Rd +++ b/man/FRASER.Rd @@ -34,7 +34,7 @@ calculatePadjValues( fds, type = currentType(fds), method = "BY", - rhoCutoff = 1, + rhoCutoff = NA, geneLevel = TRUE, geneColumn = "hgnc_symbol", BPPARAM = bpparam() @@ -87,7 +87,7 @@ multiple testing correction.} \item{rhoCutoff}{The cutoff value on the fitted rho value (overdispersion parameter of the betabinomial) above which junctions are -masked with NA during p value adjustment.} +masked with NA during p value adjustment (default: NA, no masking).} \item{geneLevel}{Logical value indiciating whether gene-level p values should be calculated. Defaults to TRUE.} diff --git a/man/getter_setter_functions.Rd b/man/getter_setter_functions.Rd index 19828cc5..3f25c8b6 100644 --- a/man/getter_setter_functions.Rd +++ b/man/getter_setter_functions.Rd @@ -41,7 +41,7 @@ pVals( fds, type = currentType(fds), level = "site", - filters = list(rho = 1), + filters = list(), dist = "BetaBinomial", ... ) @@ -51,7 +51,7 @@ padjVals( type = currentType(fds), dist = c("BetaBinomial"), level = "site", - filters = list(rho = 1), + filters = list(), ... ) diff --git a/man/results.Rd b/man/results.Rd index 7c536129..4da3bf1b 100644 --- a/man/results.Rd +++ b/man/results.Rd @@ -10,7 +10,7 @@ sampleIDs = samples(object), padjCutoff = 0.05, deltaPsiCutoff = 0.1, - rhoCutoff = 1, + rhoCutoff = NA, aggregate = FALSE, collapse = FALSE, minCount = 5, @@ -26,7 +26,7 @@ padjCutoff = 0.05, deltaPsiCutoff = 0.1, minCount = 5, - rhoCutoff = 1, + rhoCutoff = NA, by = c("none", "sample", "feature"), aggregate = FALSE, geneColumn = "hgnc_symbol", From e9a64861b5b6284d9ea9b37bd520790642b7fa5e Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Wed, 7 Dec 2022 15:00:52 +0100 Subject: [PATCH 53/80] further fix to correctly extract gene pvalues --- R/AllGenerics.R | 6 ++++-- R/getNSetterFuns.R | 6 +++--- R/helper-functions.R | 3 ++- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 886f625d..74433c6a 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -779,8 +779,10 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, aggregate=FALSE, geneColumn=geneColumn) if(isTRUE(aggregate)){ - pvalsGene <- as.matrix(pVals(tmp_x, level="gene")) - padjsGene <- as.matrix(padjVals(tmp_x, level="gene")) + pvalsGene <- as.matrix(pVals(tmp_x, level="gene", + filters=list(rho=rhoCutoff))) + padjsGene <- as.matrix(padjVals(tmp_x, level="gene", + filters=list(rho=rhoCutoff))) aberrantGene <- aberrant.FRASER(tmp_x, type=type, padjCutoff=fdrCutoff, deltaPsiCutoff=dPsiCutoff, diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R index 250ad922..0a03576f 100644 --- a/R/getNSetterFuns.R +++ b/R/getNSetterFuns.R @@ -333,11 +333,11 @@ padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"), aname <- paste0("padj", dist) aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname) # add information on used filters + if(is.null(names(filters))){ + filters <- list(rho=1) + } for(n in sort(names(filters))){ aname_new <- paste0(aname, "_", n, filters[[n]]) - if(is.null(names(filters))){ - filters <- list(rho=1) - } if(n == "rho" && filters[[n]] == 1){ if(any(grepl(aname_new, assayNames(fds))) || any(grepl(aname_new, names(metadata(fds))))){ diff --git a/R/helper-functions.R b/R/helper-functions.R index 54593b28..ae0dba66 100644 --- a/R/helper-functions.R +++ b/R/helper-functions.R @@ -564,7 +564,8 @@ checkPadjAvailableForFilters <- function(fds, type=currentType(fds), for(n in sort(names(filters))){ aname_new <- paste0(aname, "_", n, filters[[n]]) if(n == "rho" && filters[[n]] == 1){ - if(any(grepl(aname_new, assayNames(fds)))){ + if(any(grepl(aname_new, assayNames(fds))) || + any(grepl(aname_new, names(metadata(fds))))){ aname <- aname_new } }else{ From 02fbb8412fcf6adbbbd3e5821ba27b79e942a33b Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Fri, 9 Dec 2022 14:48:52 +0100 Subject: [PATCH 54/80] minor improvements to plots --- R/plotMethods.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/R/plotMethods.R b/R/plotMethods.R index e53ce761..e21471b3 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -431,7 +431,7 @@ plotAberrantPerSample.FRASER <- function(object, main, geom_line() + geom_hline(aes(yintercept=median, color=type, lty="Median")) + theme_bw() + - theme_cowplot() + + theme_cowplot() + background_grid(major="xy", minor="xy") + ggtitle(main) + xlab("Sample rank") + ylab("Number of outliers") + @@ -440,7 +440,7 @@ plotAberrantPerSample.FRASER <- function(object, main, scale_linetype_manual(name="", values=2, labels="Median") if(!all(dt2p[,value] == 0)){ - g <- g + scale_y_log10() + g <- g + scale_y_log10() + annotation_logticks(sides="l") } g @@ -846,6 +846,8 @@ plotEncDimSearch.FRASER <- function(object, type=psiTypes, geom_smooth(method="loess", formula=y~x) + geom_vline(data=data[isOptimalQ == TRUE,], mapping=aes(xintercept=q, col=nsubset, linetype=noise)) + + geom_text(data=data[isOptimalQ == TRUE,], + aes(y=0.0, q+1, label=q)) + ggtitle(as.expression(bquote(bold(paste( "Q estimation for ", .(ggplotLabelPsi(type)[[1]])))))) + xlab("Estimated q") + From 7e26a3654176d427bf1f543f7b29c53d146a2857 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Fri, 9 Dec 2022 18:00:07 +0100 Subject: [PATCH 55/80] added support to get results table only for subset of genes --- NAMESPACE | 1 + R/AllGenerics.R | 257 +++++++++++++++++++++++++++++++----- R/FRASER-package.R | 2 +- R/Fraser-pipeline.R | 6 + R/pvalsNzscore.R | 27 +++- man/FRASER.Rd | 8 +- man/results.Rd | 36 +++++ tests/testthat/test_stats.R | 4 +- 8 files changed, 299 insertions(+), 42 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 436647d3..f1ca889f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -262,6 +262,7 @@ importFrom(VGAM,rbetabinom) importFrom(VGAM,vglm) importFrom(biomaRt,getBM) importFrom(biomaRt,useEnsembl) +importFrom(cowplot,background_grid) importFrom(cowplot,theme_cowplot) importFrom(extraDistr,dbbinom) importFrom(extraDistr,pbbinom) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 74433c6a..f37cb6dd 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -714,6 +714,7 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, dPsiCutoff, minCount, rhoCutoff, psiType, maxCols=20, aggregate=FALSE, collapse=FALSE, geneColumn="hgnc_symbol", BPPARAM=bpparam(), + geneSubset=NULL, subsetName, fullSubset=FALSE, additionalColumns=NULL){ stopifnot(is(object, "FraserDataSet")) @@ -744,6 +745,16 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, currentType(object) <- type gr <- rowRanges(object, type=type) + # calculate FDR on subset first if requested + if(!is.null(geneSubset)){ + object <- calculatePadjValuesOnSubset(fds=object, type=type, + genesToTest=geneSubset, + subsetName=subsetName, + geneColumn=geneColumn) + fdr_subset <- metadata(object)[[paste0("FDR_", subsetName)]] + object <- object[, fdr_subset[, unique(sampleID)]] + } + # first get row means rowMeansK <- rowMeans(K(object, type=type)) rowMeansN <- rowMeans(N(object, type=type)) @@ -771,31 +782,100 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, (rawTotalCts + 2*pseudocount()) deltaPsiVals <- deltaPsiValue(tmp_x, type) rho <- rho(tmp_x, type) - aberrant <- aberrant.FRASER(tmp_x, type=type, + if(is.null(geneSubset)){ + aberrant <- aberrant.FRASER(tmp_x, type=type, padjCutoff=fdrCutoff, deltaPsiCutoff=dPsiCutoff, minCount=minCount, rhoCutoff=rhoCutoff, aggregate=FALSE, geneColumn=geneColumn) + } else{ + aberrant <- aberrant.FRASER(tmp_x, type=type, + padjCutoff=fdrCutoff, + deltaPsiCutoff=dPsiCutoff, + minCount=minCount, + rhoCutoff=rhoCutoff, + aggregate=FALSE, + geneColumn=geneColumn, + FDR_subset=fdr_subset[sampleID + %in% samples(tmp_x)], + fullSubset=fullSubset) + } if(isTRUE(aggregate)){ pvalsGene <- as.matrix(pVals(tmp_x, level="gene", filters=list(rho=rhoCutoff))) padjsGene <- as.matrix(padjVals(tmp_x, level="gene", filters=list(rho=rhoCutoff))) - aberrantGene <- aberrant.FRASER(tmp_x, type=type, - padjCutoff=fdrCutoff, - deltaPsiCutoff=dPsiCutoff, - minCount=minCount, - rhoCutoff=rhoCutoff, - aggregate=TRUE, - geneColumn=geneColumn) + if(is.null(geneSubset)){ + aberrantGene <- aberrant.FRASER(tmp_x, type=type, + padjCutoff=fdrCutoff, + deltaPsiCutoff=dPsiCutoff, + minCount=minCount, + rhoCutoff=rhoCutoff, + aggregate=TRUE, + geneColumn=geneColumn) + } else{ + aberrantGene <- aberrant.FRASER(tmp_x, type=type, + padjCutoff=fdrCutoff, + deltaPsiCutoff=dPsiCutoff, + minCount=minCount, + rhoCutoff=rhoCutoff, + aggregate=TRUE, + geneColumn=geneColumn, + FDR_subset=fdr_subset[sampleID + %in% samples(tmp_x)], + fullSubset=fullSubset) + + # get row,col idx of genes/samples in subset + subset_gene_padj <- as.matrix( + fdr_subset[sampleID %in% samples(tmp_x), + .(sapply(gene, + function(g) which(rownames(pvalsGene) == g)), + sapply(sampleID, + function(s) which(colnames(tmp_x) ==s)), + pval_gene, + FDR_subset_gene)] + ) + # replace pvalsGene and padjsGene with values on subset + geneNames <- rownames(pvalsGene) + pvalsGene <- matrix(NA, + nrow=nrow(pvalsGene), ncol=ncol(tmp_x)) + rownames(pvalsGene) <- geneNames + colnames(pvalsGene) <- colnames(tmp_x) + padjsGene <- matrix(NA, + nrow=nrow(padjsGene), ncol=ncol(tmp_x)) + rownames(padjsGene) <- geneNames + colnames(padjsGene) <- colnames(tmp_x) + if(nrow(subset_gene_padj) > 0){ + pvalsGene[subset_gene_padj[,1:2]] <- + subset_gene_padj[,3] + padjsGene[subset_gene_padj[,1:2]] <- + subset_gene_padj[,4] + } + } + } else{ pvalsGene <- NULL padjsGene <- NULL aberrantGene <- NULL } + if(!is.null(geneSubset)){ + # get row,col idx of introns/samples in subset + subset_padj <- as.matrix( + fdr_subset[sampleID %in% samples(tmp_x), .(jidx, + sapply(sampleID, + function(s) which(colnames(tmp_x) ==s)), + FDR_subset)] + ) + # replace padj values with values on subset + padj <- matrix(NA, nrow=nrow(tmp_x), ncol=ncol(tmp_x)) + if(nrow(subset_padj) > 0){ + padj[subset_padj[,1:2]] <- subset_padj[,3] + } + } + if(length(sc) == 1){ colnames(pvals) <- sc colnames(padjs) <- sc @@ -843,6 +923,15 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, # sort it if existing if(length(ans) > 0){ ans <- ans[order(ans$pValue)] + if(is.null(geneSubset)){ + mcols(ans)[["FDR_set"]] <- "transcriptome-wide" + } else{ + if(is.null(subsetName)){ + mcols(ans)[["FDR_set"]] <- "subset" + } else{ + mcols(ans)[["FDR_set"]] <- subsetName + } + } } # collapse into one row per gene if requested @@ -890,6 +979,21 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, #' types to return only one row per feature (gene) and sample. #' @param geneColumn The column name of the column that has the gene annotation #' that will be used for gene-level pvalue computation. +#' @param subsets A named list of named lists specifying any number of gene +#' subsets (can differ per sample). For each subset, FDR correction +#' will be limited to introns in genes in the subset, and aberrant +#' events passing the FDR cutoff will be reported for each subset +#' separately. See the examples for how to use this argument. +#' @param fullSubset Only applies when \code{geneSubset} is not NULL. Specifies +#' whether all introns in given subset of genes should be +#' considered as aberrant, or only those passing the given cutoffs. +#' Defaults to FALSE (introns have to pass the cutoffs in addtion +#' to being in the gene subset to be considered aberrant). +#' @param geneSubset A named list giving a subset of genes per sample to which +#' FDR correction should be restricted. The names of the list must +#' correspond to the sampleIDs in the fds object. +#' @param subsetName The name under which the resulting FDR corrected pvalues +#' on the subset only will be displayed in the result table. #' @param ... Further arguments can be passed to the method. If "n", #' "padjVals", "dPsi" or "rhoVals" are given, the values of those #' arguments are used to define the aberrant events. @@ -929,6 +1033,17 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, #' #' # find aberrant junctions/splice sites #' aberrant(fds, type="jaccard") +#' +#' # retrieve results limiting FDR correction to only a subset of genes +#' # first, we need to create a list of genes per sample that will be tested +#' geneList <- list('sample1'=c("TIMMDC1"), 'sample2'=c("MCOLN1")) +#' results(fds, subsets=list('random_subset'=geneList), fullSubset=TRUE) +#' +#' # results for several subsets can be retrieved at the same time: +#' geneList2 <- list('sample1'=c("MCOLN1", "TIMMDC1"), 'sample2'=c("MCOLN1")) +#' results(fds, +#' subsets=list('random_subset'=geneList, 'another_subset'=geneList2), +#' fullSubset=TRUE) #' @export setMethod("results", "FraserDataSet", function(object, sampleIDs=samples(object), padjCutoff=0.05, @@ -936,20 +1051,62 @@ setMethod("results", "FraserDataSet", function(object, rhoCutoff=NA, aggregate=FALSE, collapse=FALSE, minCount=5, psiType=psiTypes, geneColumn="hgnc_symbol", + subsets=NULL, fullSubset=FALSE, + returnTranscriptomewideResults=TRUE, additionalColumns=NULL, BPPARAM=bpparam()){ - FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff, - dPsiCutoff=deltaPsiCutoff, - rhoCutoff=rhoCutoff, minCount=minCount, - psiType=match.arg(psiType, several.ok=TRUE), - aggregate=aggregate, collapse=collapse, geneColumn=geneColumn, - additionalColumns=additionalColumns, BPPARAM=BPPARAM) + psiType <- match.arg(psiType, several.ok=TRUE) + if(is.null(subsets)){ + returnTranscriptomewideResults <- TRUE + } + if(isTRUE(returnTranscriptomewideResults)){ + res <- FRASER.results(object=object, sampleIDs=sampleIDs, + fdrCutoff=padjCutoff, dPsiCutoff=deltaPsiCutoff, + rhoCutoff=rhoCutoff, minCount=minCount, + psiType=psiType, + aggregate=aggregate, collapse=collapse, geneColumn=geneColumn, + geneSubset=NULL, subsetName=NULL, fullSubset=FALSE, + additionalColumns=additionalColumns, BPPARAM=BPPARAM) + } + + # add results for FDR_subsets if requested + if(!is.null(subsets)){ + stopifnot(is.list(subsets)) + stopifnot(!is.null(names(subsets))) + resls_subsets <- lapply(names(subsets), function(setName){ + geneList_sub <- subsets[[setName]] + res_sub <- FRASER.results(object=object, sampleIDs=sampleIDs, + fdrCutoff=padjCutoff, dPsiCutoff=deltaPsiCutoff, + rhoCutoff=rhoCutoff, minCount=minCount, + psiType=psiType, + aggregate=aggregate, collapse=collapse, geneColumn=geneColumn, + geneSubset=geneList_sub, subsetName=setName, + fullSubset=fullSubset, + additionalColumns=additionalColumns, BPPARAM=BPPARAM) + }) + if(isTRUE(returnTranscriptomewideResults)){ + res <- unlist(GRangesList(unlist(list(res, resls_subsets)))) + } else{ + res <- unlist(GRangesList(unlist(resls_subsets))) + } + + # sort it if existing + if(length(res) > 0){ + res <- res[order(res$pValue)] + if(isTRUE(aggregate)){ + res <- res[!is.na(res$pValueGene)] + } + } + } + return(res) }) aberrant.FRASER <- function(object, type=fitMetrics(object), padjCutoff=0.05, deltaPsiCutoff=0.1, minCount=5, rhoCutoff=NA, by=c("none", "sample", "feature"), - aggregate=FALSE, geneColumn="hgnc_symbol", ...){ + aggregate=FALSE, geneColumn="hgnc_symbol", + geneSubset=NULL, subsetName=NULL, + fullSubset=FALSE, ...){ checkNaAndRange(padjCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE) checkNaAndRange(deltaPsiCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE) @@ -1004,24 +1161,61 @@ aberrant.FRASER <- function(object, type=fitMetrics(object), } - if(is.na(padjCutoff)){ + if(is.na(padjCutoff) || (isTRUE(fullSubset) && + (!is.null(geneSubset) || ("FDR_subset" %in% names(dots))))){ padjCutoff <- 1 } - aberrantEvents <- as.matrix(padj) <= padjCutoff + if(is.null(geneSubset) && !("FDR_subset" %in% names(dots))){ + aberrantEvents <- as.matrix(padj) <= padjCutoff + } else{ + if("FDR_subset" %in% names(dots)){ + fdr_subset <- dots[["FDR_subset"]] + } else{ + if(is.null(subsetName)){ + subsetName <- "subset" + } + object <- calculatePadjValuesOnSubset(fds=object, type=type, + genesToTest=geneSubset, + subsetName=subsetName, + geneColumn=geneColumn) + fdr_subset <- metadata(object)[[paste0("FDR_", subsetName)]] + } + + + # define aberrant status based on whether intron/sample tuples are + # part of the given subset + aberrantEvents <- matrix(FALSE, nrow=nrow(padj), ncol=ncol(padj)) + colnames(aberrantEvents) <- colnames(padj) + FDR_col <- ifelse(isTRUE(aggregate), "FDR_subset_gene", "FDR_subset") + subset_idx <- lapply(fdr_subset[, unique(sampleID)], function(sid){ + col <- which(colnames(object) == sid) + rows <- fdr_subset[sampleID == sid & get(FDR_col) <= padjCutoff, + sort(unique(jidx))] + sub_idx <- matrix(c(rows, rep(col, length(rows))), + nrow=length(rows)) + return(sub_idx) + }) + subset_idx <- do.call(rbind, + subset_idx[which(sapply(subset_idx, nrow) > 0)]) + aberrantEvents[subset_idx] <- TRUE + } # check each cutoff if in use (not NA) - if(!is.na(minCount)){ - aberrantEvents <- aberrantEvents & as.matrix(n >= minCount) - } - if(!is.na(deltaPsiCutoff)){ - aberrantEvents <- aberrantEvents & - as.matrix(abs(dpsi) >= deltaPsiCutoff) - } - if(!is.na(rhoCutoff)){ - aberrantEvents <- aberrantEvents & as.matrix(rho <= rhoCutoff) + if((is.null(geneSubset) && !("FDR_subset" %in% names(dots))) || + isFALSE(fullSubset)){ + if(!is.na(minCount)){ + aberrantEvents <- aberrantEvents & as.matrix(n >= minCount) + } + if(!is.na(deltaPsiCutoff)){ + aberrantEvents <- aberrantEvents & + as.matrix(abs(dpsi) >= deltaPsiCutoff) + } + if(!is.na(rhoCutoff)){ + aberrantEvents <- aberrantEvents & as.matrix(rho <= rhoCutoff) + } + aberrantEvents[is.na(aberrantEvents)] <- FALSE } - aberrantEvents[is.na(aberrantEvents)] <- FALSE if(isTRUE(aggregate)){ if(is.null(rownames(padj_gene))){ @@ -1040,9 +1234,11 @@ aberrant.FRASER <- function(object, type=fitMetrics(object), aberrantEvents <- as.matrix(ab_dt[,-1]) rownames(aberrantEvents) <- ab_dt[,geneID] - aberrantEvents <- aberrantEvents & as.matrix( - padj_gene[rownames(aberrantEvents),colnames(aberrantEvents)] - ) <= padjCutoff + if(is.null(geneSubset) && !("FDR_subset" %in% names(dots))){ + aberrantEvents <- aberrantEvents & as.matrix( + padj_gene[rownames(aberrantEvents),colnames(aberrantEvents)] + ) <= padjCutoff + } } return(switch(match.arg(by), @@ -1056,4 +1252,3 @@ aberrant.FRASER <- function(object, type=fitMetrics(object), #' @export setMethod("aberrant", "FraserDataSet", aberrant.FRASER) - diff --git a/R/FRASER-package.R b/R/FRASER-package.R index 0426bb26..85569e89 100644 --- a/R/FRASER-package.R +++ b/R/FRASER-package.R @@ -49,7 +49,7 @@ #' @importFrom plotly plot_ly subplot layout add_trace ggplotly #' @importFrom pheatmap pheatmap #' @importFrom RColorBrewer brewer.pal -#' @importFrom cowplot theme_cowplot +#' @importFrom cowplot theme_cowplot background_grid #' @importFrom ggrepel geom_text_repel #' ### Data handling diff --git a/R/Fraser-pipeline.R b/R/Fraser-pipeline.R index 20485299..10f22c3a 100644 --- a/R/Fraser-pipeline.R +++ b/R/Fraser-pipeline.R @@ -72,6 +72,12 @@ #' fds <- calculateZscore(fds, type="jaccard") #' head(zScores(fds, type="jaccard")) #' +#' # To calculate the FDR only on a subset of genes of interest (per sample): +#' geneList <- list("sample1"=c("TIMMDC1"), "sample2"=c("MCOLN1")) +#' fds <- calculatePadjValuesOnSubset(fds, genesToTest=geneList, +#' subsetName="setOfInterest", type="jaccard") +#' metadata(fds)[["FDR_setOfInterest"]] +#' #' @seealso \code{\link[FRASER]{fit}} #' #' @author Christian Mertes \email{mertes@@in.tum.de} diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R index 61a8a2da..7ceb87f9 100644 --- a/R/pvalsNzscore.R +++ b/R/pvalsNzscore.R @@ -399,7 +399,7 @@ genePvalsByGeneID <- function(dt, samples, geneIDs, method, BPPARAM){ #' #' @export calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds), - subsetName="FDR_subset", method="BY", + subsetName="subset", method="BY", geneColumn="hgnc_symbol", BPPARAM=bpparam()){ # check input @@ -410,10 +410,6 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds), if(!all(names(genesToTest) %in% samples(fds))){ stop("names(genesToTest) need to be sampleIDs in the given fds object.") } - if(!all(samples(fds) %in% names(genesToTest))){ - stop("All sampleIDs of the given fds object need to be in ", - "names(geneToTest).") - } # check if genes have been annotated if(!geneColumn %in% colnames(mcols(fds, type=type))){ @@ -430,12 +426,29 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds), # message(date(), ": FDR subset calculation for sample = ", sample_id) # get idx of junctions corresponding to genes with var jidx <- unlist(lapply(genes_to_test_sample, function(gene){ - idx <- which(grepl(gene, mcols(fds, type="j")[, geneColumn])) + idx <- which(grepl(gene, mcols(fds, type=type)[, geneColumn])) names(idx) <- rep(gene, length(idx)) + if(length(idx) == 0 && verbose(fds) > 0){ + warning("No introns found in fds object for gene: ", gene, + " and sample: ", sample_id, ". Skipping this gene.") + } return(idx) })) jidx <- sort(jidx[!duplicated(jidx)]) + # check that jidx is not empty vector + if(length(jidx) == 0){ + warning("No introns found in the fds object for the given gene ", + "subset for sample: ", sample_id) + return(data.table(gene=character(0), + sampleID=character(0), + pval=numeric(0), + FDR_subset=numeric(0), + jidx=integer(0), + pval_gene=numeric(0), + FDR_subset_gene=numeric(0))) + } + # retrieve pvalues of junctions to test p <- pVals(fds, type=type, level="junction")[jidx, sample_id] @@ -458,7 +471,7 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds), message(date(), ": finished FDR calculation on subset of genes.") # add FDR subset info to fds object and return - metadata(fds)[[subsetName]] <- FDR_subset + metadata(fds)[[paste("FDR", subsetName, sep="_")]] <- FDR_subset return(fds) } diff --git a/man/FRASER.Rd b/man/FRASER.Rd index 1b2e5f72..13210477 100644 --- a/man/FRASER.Rd +++ b/man/FRASER.Rd @@ -44,7 +44,7 @@ calculatePadjValuesOnSubset( fds, genesToTest, type = currentType(fds), - subsetName = "FDR_subset", + subsetName = "subset", method = "BY", geneColumn = "hgnc_symbol", BPPARAM = bpparam() @@ -190,6 +190,12 @@ head(padjVals(fds, type="jaccard")) fds <- calculateZscore(fds, type="jaccard") head(zScores(fds, type="jaccard")) +# To calculate the FDR only on a subset of genes of interest (per sample): +geneList <- list("sample1"=c("TIMMDC1"), "sample2"=c("MCOLN1")) +fds <- calculatePadjValuesOnSubset(fds, genesToTest=geneList, + subsetName="setOfInterest", type="jaccard") +metadata(fds)[["FDR_setOfInterest"]] + } \seealso{ \code{\link[FRASER]{fit}} diff --git a/man/results.Rd b/man/results.Rd index 4da3bf1b..03b67b36 100644 --- a/man/results.Rd +++ b/man/results.Rd @@ -16,6 +16,9 @@ minCount = 5, psiType = psiTypes, geneColumn = "hgnc_symbol", + subsets = NULL, + fullSubset = FALSE, + returnTranscriptomewideResults = TRUE, additionalColumns = NULL, BPPARAM = bpparam() ) @@ -30,6 +33,9 @@ by = c("none", "sample", "feature"), aggregate = FALSE, geneColumn = "hgnc_symbol", + geneSubset = NULL, + subsetName = NULL, + fullSubset = FALSE, ... ) } @@ -63,6 +69,18 @@ result} \item{geneColumn}{The column name of the column that has the gene annotation that will be used for gene-level pvalue computation.} +\item{subsets}{A named list of named lists specifying any number of gene +subsets (can differ per sample). For each subset, FDR correction +will be limited to introns in genes in the subset, and aberrant +events passing the FDR cutoff will be reported for each subset +separately. See the examples for how to use this argument.} + +\item{fullSubset}{Only applies when \code{geneSubset} is not NULL. Specifies +whether all introns in given subset of genes should be +considered as aberrant, or only those passing the given cutoffs. +Defaults to FALSE (introns have to pass the cutoffs in addtion +to being in the gene subset to be considered aberrant).} + \item{additionalColumns}{Character vector containing the names of additional columns from mcols(fds) that should appear in the result table (e.g. ensembl_gene_id). Default is \code{NULL}, so no additional columns @@ -76,6 +94,13 @@ are included.} \code{sample} or \code{feature} is specified the sum by sample or feature is returned} +\item{geneSubset}{A named list giving a subset of genes per sample to which +FDR correction should be restricted. The names of the list must +correspond to the sampleIDs in the fds object.} + +\item{subsetName}{The name under which the resulting FDR corrected pvalues +on the subset only will be displayed in the result table.} + \item{...}{Further arguments can be passed to the method. If "n", "padjVals", "dPsi" or "rhoVals" are given, the values of those arguments are used to define the aberrant events.} @@ -120,4 +145,15 @@ aberrant(fds, type="jaccard", by="feature", padjCutoff=NA, aggregate=TRUE) # find aberrant junctions/splice sites aberrant(fds, type="jaccard") + +# retrieve results limiting FDR correction to only a subset of genes +# first, we need to create a list of genes per sample that will be tested +geneList <- list('sample1'=c("TIMMDC1"), 'sample2'=c("MCOLN1")) +results(fds, subsets=list('random_subset'=geneList), fullSubset=TRUE) + +# results for several subsets can be retrieved at the same time: +geneList2 <- list('sample1'=c("MCOLN1", "TIMMDC1"), 'sample2'=c("MCOLN1")) +results(fds, + subsets=list('random_subset'=geneList, 'another_subset'=geneList2), + fullSubset=TRUE) } diff --git a/tests/testthat/test_stats.R b/tests/testthat/test_stats.R index dcbe5bea..b6e3aa07 100644 --- a/tests/testthat/test_stats.R +++ b/tests/testthat/test_stats.R @@ -95,10 +95,10 @@ test_that("FDR on subset of genes", { ) expected_output_nrows <- (8 + 5 + 3) + (7) + (3+7+5+4) - subsetName <- "FDR_subset_test" + subsetName <- "subset_test" fds <- calculatePadjValuesOnSubset(fds, genesToTest=genes_per_sample, subsetName=subsetName, type="jaccard") - subset_dt <- metadata(fds)[[subsetName]] + subset_dt <- metadata(fds)[[paste0("FDR_", subsetName)]] expect_true(is(subset_dt, "data.table")) expect_true(all(c("FDR_subset", "FDR_subset_gene") %in% colnames(subset_dt))) expect_equal(subset_dt[, .N], expected_output_nrows) From 9d8826c1e053a50cd7ff83e3a5dbed190792fe7d Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Mon, 12 Dec 2022 11:43:22 +0100 Subject: [PATCH 56/80] minor fix for theta FDR on subset --- R/AllGenerics.R | 8 +++++--- R/pvalsNzscore.R | 5 +++-- tests/testthat/test_stats.R | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index f37cb6dd..38e71a44 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -751,7 +751,8 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, genesToTest=geneSubset, subsetName=subsetName, geneColumn=geneColumn) - fdr_subset <- metadata(object)[[paste0("FDR_", subsetName)]] + fdr_subset <- metadata(object)[[paste("FDR", subsetName, type, + sep="_")]] object <- object[, fdr_subset[, unique(sampleID)]] } @@ -870,7 +871,7 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, FDR_subset)] ) # replace padj values with values on subset - padj <- matrix(NA, nrow=nrow(tmp_x), ncol=ncol(tmp_x)) + padj <- matrix(NA, nrow=nrow(pvals), ncol=ncol(pvals)) if(nrow(subset_padj) > 0){ padj[subset_padj[,1:2]] <- subset_padj[,3] } @@ -1179,7 +1180,8 @@ aberrant.FRASER <- function(object, type=fitMetrics(object), genesToTest=geneSubset, subsetName=subsetName, geneColumn=geneColumn) - fdr_subset <- metadata(object)[[paste0("FDR_", subsetName)]] + fdr_subset <- metadata(object)[[paste("FDR", subsetName, type, + sep="_")]] } diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R index 7ceb87f9..979d97e8 100644 --- a/R/pvalsNzscore.R +++ b/R/pvalsNzscore.R @@ -442,6 +442,7 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds), "subset for sample: ", sample_id) return(data.table(gene=character(0), sampleID=character(0), + type=character(0), pval=numeric(0), FDR_subset=numeric(0), jidx=integer(0), @@ -456,7 +457,7 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds), pa <- p.adjust(p, method=method) # gene level pvals - dt <- data.table(sampleID=sample_id, pval=p, FDR_subset=pa, + dt <- data.table(sampleID=sample_id, type=type, pval=p, FDR_subset=pa, gene=names(jidx), jidx=jidx) dt[, pval_gene:=min(p.adjust(pval, method="holm")), by="gene"] @@ -471,7 +472,7 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds), message(date(), ": finished FDR calculation on subset of genes.") # add FDR subset info to fds object and return - metadata(fds)[[paste("FDR", subsetName, sep="_")]] <- FDR_subset + metadata(fds)[[paste("FDR", subsetName, type, sep="_")]] <- FDR_subset return(fds) } diff --git a/tests/testthat/test_stats.R b/tests/testthat/test_stats.R index b6e3aa07..79f737d1 100644 --- a/tests/testthat/test_stats.R +++ b/tests/testthat/test_stats.R @@ -98,7 +98,7 @@ test_that("FDR on subset of genes", { subsetName <- "subset_test" fds <- calculatePadjValuesOnSubset(fds, genesToTest=genes_per_sample, subsetName=subsetName, type="jaccard") - subset_dt <- metadata(fds)[[paste0("FDR_", subsetName)]] + subset_dt <- metadata(fds)[[paste("FDR", subsetName, "jaccard", sep="_")]] expect_true(is(subset_dt, "data.table")) expect_true(all(c("FDR_subset", "FDR_subset_gene") %in% colnames(subset_dt))) expect_equal(subset_dt[, .N], expected_output_nrows) From f987da9144e5f0a0c0686f425649194ebc90e92e Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Mon, 12 Dec 2022 15:42:07 +0100 Subject: [PATCH 57/80] minor fix to get correct rowMeans in result table when using FDR on subset --- R/AllGenerics.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 38e71a44..5c4cff54 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -745,6 +745,10 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, currentType(object) <- type gr <- rowRanges(object, type=type) + # first get row means + rowMeansK <- rowMeans(K(object, type=type)) + rowMeansN <- rowMeans(N(object, type=type)) + # calculate FDR on subset first if requested if(!is.null(geneSubset)){ object <- calculatePadjValuesOnSubset(fds=object, type=type, @@ -756,10 +760,6 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, object <- object[, fdr_subset[, unique(sampleID)]] } - # first get row means - rowMeansK <- rowMeans(K(object, type=type)) - rowMeansN <- rowMeans(N(object, type=type)) - # then iterate by chunk chunkCols <- getMaxChunks2Read(fds=object, assayName=type, max=maxCols) sampleChunks <- getSamplesByChunk(fds=object, sampleIDs=sampleIDs, From a5ef78107ed2424eec773dd2a325ec35af376e58 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Thu, 15 Dec 2022 14:15:35 +0100 Subject: [PATCH 58/80] renamed annotated column to potentialImpact --- DESCRIPTION | 2 +- NAMESPACE | 2 +- R/AllGenerics.R | 4 +- R/resultAnnotations.R | 116 +++++++++--------- man/FRASER.Rd | 10 +- man/countRNA.Rd | 16 +-- man/fds-methods.Rd | 20 +-- man/filtering.Rd | 8 +- man/getter_setter_functions.Rd | 36 +++--- ...tions.Rd => potentialImpactAnnotations.Rd} | 44 +++---- 10 files changed, 123 insertions(+), 135 deletions(-) rename man/{spliceTypeAnnotations.Rd => potentialImpactAnnotations.Rd} (76%) diff --git a/DESCRIPTION b/DESCRIPTION index 7c8e24ab..2819f170 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -29,7 +29,7 @@ biocViews: License: MIT + file LICENSE URL: https://github.com/gagneurlab/FRASER BugRepots: https://github.com/gagneurlab/FRASER/issues -RoxygenNote: 7.1.2 +RoxygenNote: 7.2.2 Encoding: UTF-8 VignetteBuilder: knitr Depends: diff --git a/NAMESPACE b/NAMESPACE index 861b7c2c..b30b1fb5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,9 +23,9 @@ export(N) export(aberrant) export(addCountsToFraserDataSet) export(annotateIntronReferenceOverlap) +export(annotatePotentialImpact) export(annotateRanges) export(annotateRangesWithTxDb) -export(annotateSpliceEventType) export(bamFile) export(bestQ) export(calculatePSIValues) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 6bebe86a..7a07b24b 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -12,8 +12,8 @@ asFDS <- function(x){ #' #' @title Getter/Setter methods for the FraserDataSet #' -#' The following methods are getter and setter methods to extract or set -#' certain values of a FraserDataSet object. +#' @description The following methods are getter and setter methods to extract +#' or set certain values of a FraserDataSet object. #' #' \code{samples} sets or gets the sample IDs; \code{condition} ; #' \code{} diff --git a/R/resultAnnotations.R b/R/resultAnnotations.R index 52f1c70b..d64ed4ee 100644 --- a/R/resultAnnotations.R +++ b/R/resultAnnotations.R @@ -2,23 +2,23 @@ #' @title Additional result annotations #' #' @description These functions work on the result table and add additional -#' annotations to the reported introns: the type of splice event (e.g. -#' exon skipping, exon truncation, ...), expected occurence of frameshift, -#' overlap with UTR regions as well as a flag for introns that are -#' located in blacklist regions of the genome. +#' annotations to the reported introns: the type of potential impact on +#' splicing (e.g. exon skipping, exon truncation, ...), potential occurence +#' of frameshift, overlap with UTR regions as well as a flag for introns +#' that are located in blacklist regions of the genome. #' #' \code{\link{annotateIntronReferenceOverlap}} adds basic annotations to the #' fds for each intron based on the overlap of the intron's location with #' the reference annotation. Has to be run before the result table is #' created so that the new column can be included in it (see examples). #' -#' \code{\link{annotateSpliceEventType}} annotates each intron in the results -#' table with the type of splice event and expected occurence of frameshift -#' (likely, unlikely, inconclusive). Can also calculate overlap with -#' annotated UTR regions. Splice types can be one of: +#' \code{\link{annotatePotentialImpact}} annotates each intron in the results +#' table with the type of potential impact on splicing and potential +#' occurence of frameshift (likely, unlikely, inconclusive). Can also +#' calculate overlap with annotated UTR regions. Potential impact can be: #' annotatedIntron_increasedUsage, annotatedIntron_reducedUsage, #' exonTruncation, exonElongation, exonTruncation&Elongation, -#' singleExonSkipping, exonSkipping, splicingBeyondGene, +#' exonSkipping, splicingBeyondGene, #' multigenicSplicing, downstreamOfNearestGene, upstreamOfNearestGene, #' complex (everything else). #' Splice sites (theta metric) annotations indicate how the splice site is @@ -37,8 +37,8 @@ #' @param result A result table as generated by FRASER, including the column #' \code{annotatedJunction} as generated by the function #' \code{annotateIntronReferenceOverlap}. -#' @param addSpliceType Logical, indicating if the type of the splice event -#' should be added to the results table. Defaults to \code{TRUE}. +#' @param addPotentialImpact Logical, indicating if the type of the potential +#' impact should be added to the results table. Defaults to \code{TRUE}. #' @param addUTRoverlap Logical, indicating if the overlap with UTR regions #' should checked and added to the results table. Defaults to \code{TRUE}. #' @param minoverlap Integer value defining the number of base pairs around the @@ -54,8 +54,8 @@ #' \code{bpparam()}. #' @return An annotated FraserDataSet or results table, respectively #' -#' @name spliceTypeAnnotations -#' @rdname spliceTypeAnnotations +#' @name potentialImpactAnnotations +#' @rdname potentialImpactAnnotations #' #' @examples #' # get data, fit and compute p-values and z-scores @@ -73,8 +73,8 @@ #' # of 1 is used to get at least one result. #' res <- results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=NA) #' -#' # annotate the type of splice event and UTR overlap -#' res <- annotateSpliceEventType(result=res, txdb=txdb, fds=fds) +#' # annotate the type of potential impact on splicing and UTR overlap +#' res <- annotatePotentialImpact(result=res, txdb=txdb, fds=fds) #' #' # annotate overlap with blacklist regions #' res <- flagBlacklistRegions(result=res, assemblyVersion="hg19") @@ -84,7 +84,7 @@ #' NULL -#' @describeIn spliceTypeAnnotations This method calculates basic annotations +#' @describeIn potentialImpactAnnotations This method calculates basic annotations #' based on overlap with the reference annotation (start, end, none, both) #' for the full fds. The overlap type is added as a new column #' \code{annotatedJunction} in \code{mcols(fds)}. @@ -147,10 +147,10 @@ annotateIntronReferenceOverlap <- function(fds, txdb, BPPARAM=bpparam()){ return(fds) } -#' @describeIn spliceTypeAnnotations This method annotates the splice event +#' @describeIn potentialImpactAnnotations This method annotates the splice event #' type to junctions in the given results table. #' @export -annotateSpliceEventType <- function(result, txdb, fds, addSpliceType=TRUE, +annotatePotentialImpact <- function(result, txdb, fds, addPotentialImpact=TRUE, addUTRoverlap=TRUE, minoverlap=5, BPPARAM=bpparam()){ @@ -170,8 +170,10 @@ annotateSpliceEventType <- function(result, txdb, fds, addSpliceType=TRUE, } # Calculate splice types and frameshift - if(isTRUE(addSpliceType)){ - annoResult <- addSpliceTypeLabels(annoResult, fds, txdb) + if(isTRUE(addPotentialImpact)){ + annoResult <- addPotentialImpactLabels(annoResult, fds, txdb) + annoResult[potentialImpact == "singleExonSkipping", + potentialImpact := "exonSkipping"] } # Add UTR labels @@ -187,7 +189,7 @@ annotateSpliceEventType <- function(result, txdb, fds, addSpliceType=TRUE, return(annoResult) } -#' @describeIn spliceTypeAnnotations This method flags all introns and +#' @describeIn potentialImpactAnnotations This method flags all introns and #' splice sites in the given results table for which at least one splice #' site (donor or acceptor) is located in a blacklist region. Blacklist #' regions of the genome are determined from the provided BED file. @@ -300,7 +302,7 @@ addUTRLabels <- function(junctions_dt, txdb, minoverlap=5){ #' adds type of splicing to each intron in the results table #' @noRd -addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ +addPotentialImpactLabels <- function(junctions_dt, fds, txdb){ message("preparing ...") psi_positions <- which(junctions_dt$type != "theta") colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" @@ -326,13 +328,13 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ keep.extra.columns = TRUE) # prepare the results column - junctions_dt[, spliceType := "complex"] - junctions_dt[, causesFrameshift := "inconclusive"] + junctions_dt[, potentialImpact := "complex"] + junctions_dt[, potentialFrameshift := "inconclusive"] junctions_dt[annotatedJunction == "both" & deltaPsi >= 0, - spliceType := "annotatedIntron_increasedUsage"] + potentialImpact := "annotatedIntron_increasedUsage"] junctions_dt[annotatedJunction == "both" & deltaPsi < 0, - spliceType := "annotatedIntron_reducedUsage"] - junctions_dt[annotatedJunction == "both", causesFrameshift := "unlikely"] + potentialImpact := "annotatedIntron_reducedUsage"] + junctions_dt[annotatedJunction == "both", potentialFrameshift := "unlikely"] # TODO check for intron retention @@ -354,9 +356,9 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ intron_ranges, exons)) }) junctions_dt[psi_positions[starts], - causesFrameshift:=start_results[2,]] + potentialFrameshift:=start_results[2,]] junctions_dt[psi_positions[starts], - spliceType := start_results[1,]] + potentialImpact := start_results[1,]] # end junctions end_results <- sapply(ends, function(i){ @@ -371,8 +373,8 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ intron_ranges, exons)) }) - junctions_dt[psi_positions[ends], causesFrameshift:=end_results[2,]] - junctions_dt[psi_positions[ends], spliceType := end_results[1,]] + junctions_dt[psi_positions[ends], potentialFrameshift:=end_results[2,]] + junctions_dt[psi_positions[ends], potentialImpact := end_results[1,]] # none junctions pt1 none_results <- sapply(nones, function(i){ @@ -423,10 +425,10 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ return(c(combined,frs)) }) - junctions_dt[psi_positions[nones], causesFrameshift:=none_results[2,]] - junctions_dt[psi_positions[nones], spliceType := none_results[1,]] + junctions_dt[psi_positions[nones], potentialFrameshift:=none_results[2,]] + junctions_dt[psi_positions[nones], potentialImpact := none_results[1,]] - noLaps <-which(junctions_dt[psi_positions]$spliceType=="noOverlap") + noLaps <-which(junctions_dt[psi_positions]$potentialImpact=="noOverlap") refseq.genes<- genes(txdb) # none junctions pt2 @@ -454,9 +456,9 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ return(c("complex","inconclusive")) }) junctions_dt[psi_positions[noLaps], - causesFrameshift:=noLaps_results[2,]] + potentialFrameshift:=noLaps_results[2,]] junctions_dt[psi_positions[noLaps], - spliceType := noLaps_results[1,]] + potentialImpact := noLaps_results[1,]] # theta annotations thetas <- which(junctions_dt$type == "theta") @@ -464,12 +466,12 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ keep.extra.columns = TRUE) # specify default type for theta results as NA - junctions_dt[thetas, spliceType := NA] - junctions_dt[thetas, causesFrameshift := NA] + junctions_dt[thetas, potentialImpact := NA] + junctions_dt[thetas, potentialFrameshift := NA] # label all as intronic first if they have any intron overlap intronic <- unique(from(findOverlaps(junctions_gr, introns_tmp))) - junctions_dt[thetas[intronic], spliceType := "intronicRegion"] + junctions_dt[thetas[intronic], potentialImpact := "intronicRegion"] # for exonic, check if theta is fully contained in an exon # if one end is in an intron and the other in an exon it is a splice site @@ -482,10 +484,10 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ if(length(a) == length(w)) return("exonicRegion") return("annotatedSpliceSite") }) - junctions_dt[thetas[exonic], spliceType := exonic_results] + junctions_dt[thetas[exonic], potentialImpact := exonic_results] # check cases that don't overlap with an exon/intron - nones <- which(is.na(junctions_dt[thetas,]$spliceType)) + nones <- which(is.na(junctions_dt[thetas,]$potentialImpact)) none_results <- sapply(nones, function(i){ if(length(findOverlaps(junctions_gr[i], refseq.genes)) > 0) return(NA) #return("intergenic") @@ -500,13 +502,13 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){ return("upstreamOfNearestGene")) } }) - junctions_dt[thetas[nones], spliceType := none_results] + junctions_dt[thetas[nones], potentialImpact := none_results] # add distance to closest neighbour gene for intergenic results # (both psi and theta) message("adding distances to nearest gene ...") - up <- which(junctions_dt$spliceType == "upstreamOfNearestGene") - down <- which(junctions_dt$spliceType == "downstreamOfNearestGene") + up <- which(junctions_dt$potentialImpact == "upstreamOfNearestGene") + down <- which(junctions_dt$potentialImpact == "downstreamOfNearestGene") # create full grange object containing psi and theta junctions_gr <- makeGRangesFromDataFrame(junctions_dt, @@ -829,7 +831,7 @@ checkExonSkipping <- function(junctions_dt, txdb){ refseq.genes<- genes(txdb) - exonSkip <- which(junctions_dt[psi_positions]$spliceType %in% + exonSkip <- which(junctions_dt[psi_positions]$potentialImpact %in% c("exonSkipping", "singleExonSkipping")) message("start checking exonSkipping") @@ -878,16 +880,16 @@ checkExonSkipping <- function(junctions_dt, txdb){ # checking exonSkipping done if(length(exonSkip) > 0){ junctions_dt[psi_positions[exonSkip], - spliceType2 := newSkip_results] - junctions_dt[spliceType2 == "splicingBeyondGene", - spliceType := "splicingBeyondGene"] - junctions_dt[spliceType2 == "splicingBeyondGene", - causesFrameshift := "inconclusive"] - junctions_dt[spliceType2 == "multigenicSplicing", - spliceType := "multigenicSplicing"] - junctions_dt[spliceType2 == "multigenicSplicing", - causesFrameshift := "inconclusive"] - junctions_dt[, spliceType2 := NULL] + potentialImpact2 := newSkip_results] + junctions_dt[potentialImpact2 == "splicingBeyondGene", + potentialImpact := "splicingBeyondGene"] + junctions_dt[potentialImpact2 == "splicingBeyondGene", + potentialFrameshift := "inconclusive"] + junctions_dt[potentialImpact2 == "multigenicSplicing", + potentialImpact := "multigenicSplicing"] + junctions_dt[potentialImpact2 == "multigenicSplicing", + potentialFrameshift := "inconclusive"] + junctions_dt[, potentialImpact2 := NULL] } colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" @@ -906,7 +908,7 @@ checkInconclusive <- function(junctions_dt, txdb){ refseq.genes<- genes(txdb) inconclusive <- which(junctions_dt[psi_positions - ]$spliceType == "complex") + ]$potentialImpact == "complex") inconclusive_results <- sapply(inconclusive, function(i){ start = start(junctions_gr[i]) @@ -954,7 +956,7 @@ checkInconclusive <- function(junctions_dt, txdb){ if(length(inconclusive) > 0){ junctions_dt[psi_positions[inconclusive], - spliceType := inconclusive_results] + potentialImpact := inconclusive_results] } return(junctions_dt) diff --git a/man/FRASER.Rd b/man/FRASER.Rd index 22f3fff5..8ae94611 100644 --- a/man/FRASER.Rd +++ b/man/FRASER.Rd @@ -94,23 +94,23 @@ raw counts ("BB"). } \section{Functions}{ \itemize{ -\item \code{FRASER}: This function runs the default FRASER pipeline combining +\item \code{FRASER()}: This function runs the default FRASER pipeline combining the beta-binomial fit, the computation of Z scores and p values as well as the computation of delta-PSI values. -\item \code{calculateZscore}: This function calculates z-scores based on the +\item \code{calculateZscore()}: This function calculates z-scores based on the observed and expected logit psi. -\item \code{calculatePvalues}: This function calculates two-sided p-values based on +\item \code{calculatePvalues()}: This function calculates two-sided p-values based on the beta-binomial distribution (or binomial or normal if desired). The returned p values are already adjusted with Holm's method per donor or acceptor site, respectively. -\item \code{calculatePadjValues}: This function adjusts the previously calculated +\item \code{calculatePadjValues()}: This function adjusts the previously calculated p-values per sample for multiple testing. -}} +}} \examples{ # set default parallel backend register(SerialParam()) diff --git a/man/countRNA.Rd b/man/countRNA.Rd index 7940b91b..440231fe 100644 --- a/man/countRNA.Rd +++ b/man/countRNA.Rd @@ -245,32 +245,32 @@ a sample are set to zero. } \section{Functions}{ \itemize{ -\item \code{countRNAData}: This method extracts and counts the split reads and +\item \code{countRNAData()}: This method extracts and counts the split reads and non spliced reads from RNA bam files. -\item \code{getSplitReadCountsForAllSamples}: This method creates a GRanges +\item \code{getSplitReadCountsForAllSamples()}: This method creates a GRanges object containing the split read counts from all specified samples. -\item \code{getNonSplitReadCountsForAllSamples}: This method creates a GRanges +\item \code{getNonSplitReadCountsForAllSamples()}: This method creates a GRanges object containing the non split read counts at the exon-intron boundaries inferred from the GRanges object containing the positions of all the introns in this dataset. -\item \code{addCountsToFraserDataSet}: This method adds the split read and +\item \code{addCountsToFraserDataSet()}: This method adds the split read and non split read counts to a existing FraserDataSet containing the settings. -\item \code{countSplitReads}: This method counts all split reads in a +\item \code{countSplitReads()}: This method counts all split reads in a bam file for a single sample. -\item \code{mergeCounts}: This method merges counts for multiple +\item \code{mergeCounts()}: This method merges counts for multiple samples into one SummarizedExperiment object. -\item \code{countNonSplicedReads}: This method counts non spliced reads based +\item \code{countNonSplicedReads()}: This method counts non spliced reads based on the given target (acceptor/donor) regions for a single sample. -}} +}} \examples{ # On Windows SNOW is the default for the parallele backend, which can be # very slow for many but small tasks. Therefore, we will use diff --git a/man/fds-methods.Rd b/man/fds-methods.Rd index 3e11e4e4..d6caf3fd 100644 --- a/man/fds-methods.Rd +++ b/man/fds-methods.Rd @@ -41,17 +41,7 @@ \alias{FRASER.mcols.get} \alias{FRASER.rowRanges.get} \alias{mapSeqlevels} -\title{Getter/Setter methods for the FraserDataSet - -The following methods are getter and setter methods to extract or set -certain values of a FraserDataSet object. - -\code{samples} sets or gets the sample IDs; \code{condition} ; -\code{} -\code{nonSplicedReads} return a RangedSummarizedExperiment object -containing the counts for the non spliced reads overlapping splice -sites in the fds. -\code{}} +\title{Getter/Setter methods for the FraserDataSet} \usage{ samples(object) @@ -151,10 +141,8 @@ passed to GenomeInfoDb::mapSeqlevels().} Getter method return the respective current value. } \description{ -Getter/Setter methods for the FraserDataSet - -The following methods are getter and setter methods to extract or set -certain values of a FraserDataSet object. +The following methods are getter and setter methods to extract +or set certain values of a FraserDataSet object. \code{samples} sets or gets the sample IDs; \code{condition} ; \code{} @@ -162,8 +150,6 @@ certain values of a FraserDataSet object. containing the counts for the non spliced reads overlapping splice sites in the fds. \code{} - -Mapping of chromosome names } \examples{ fds <- createTestFraserDataSet() diff --git a/man/filtering.Rd b/man/filtering.Rd index b40ad786..361dde7e 100644 --- a/man/filtering.Rd +++ b/man/filtering.Rd @@ -72,16 +72,16 @@ reliably detected and to remove introns with no variablity between samples. } \section{Functions}{ \itemize{ -\item \code{filterExpressionAndVariability}: This functions filters out both introns with low +\item \code{filterExpressionAndVariability()}: This functions filters out both introns with low read support and introns that are not variable across samples. -\item \code{filterExpression,FraserDataSet-method}: This function filters out introns and corresponding +\item \code{filterExpression(FraserDataSet)}: This function filters out introns and corresponding splice sites that have low read support in all samples. -\item \code{filterVariability}: This function filters out introns and corresponding +\item \code{filterVariability()}: This function filters out introns and corresponding splice sites which do not show variablity across samples. -}} +}} \examples{ fds <- createTestFraserDataSet() fds <- filterExpressionAndVariability(fds, minDeltaPsi=0.1, filter=FALSE) diff --git a/man/getter_setter_functions.Rd b/man/getter_setter_functions.Rd index bf56e32c..4dad7b0d 100644 --- a/man/getter_setter_functions.Rd +++ b/man/getter_setter_functions.Rd @@ -91,63 +91,63 @@ the values within the FRASER model. } \section{Functions}{ \itemize{ -\item \code{featureExclusionMask}: Retrieves a logical vector indicating +\item \code{featureExclusionMask()}: Retrieves a logical vector indicating for each junction whether it is included or excluded during the fitting procedure. -\item \code{featureExclusionMask<-}: To remove certain junctions from +\item \code{featureExclusionMask(fds, type = currentType(fds)) <- value}: To remove certain junctions from being used in the train step of the encoding dimension we can set the \code{featureExclusion} vector to \code{FALSE}. This can be helpfull if we have local linkage between features which we do not want to model by the autoencoder. -\item \code{rho}: Returns the fitted rho values for the +\item \code{rho()}: Returns the fitted rho values for the beta-binomial distribution -\item \code{zScores}: This returns the calculated z-scores. +\item \code{zScores()}: This returns the calculated z-scores. -\item \code{pVals}: This returns the calculated p-values. +\item \code{pVals()}: This returns the calculated p-values. -\item \code{padjVals}: This returns the adjusted p-values. +\item \code{padjVals()}: This returns the adjusted p-values. -\item \code{predictedMeans}: This returns the fitted mu (i.e. psi) +\item \code{predictedMeans()}: This returns the fitted mu (i.e. psi) values. -\item \code{deltaPsiValue}: Returns the difference between the +\item \code{deltaPsiValue()}: Returns the difference between the observed and the fitted psi values. -\item \code{currentType}: Returns the psi type that is used +\item \code{currentType()}: Returns the psi type that is used within several methods in the FRASER package. -\item \code{currentType<-}: Sets the psi type that is to be used +\item \code{currentType(fds) <- value}: Sets the psi type that is to be used within several methods in the FRASER package. -\item \code{pseudocount}: Sets and returns the pseudo count used +\item \code{pseudocount()}: Sets and returns the pseudo count used within the FRASER fitting procedure. -\item \code{hyperParams}: This returns the results of the +\item \code{hyperParams()}: This returns the results of the hyperparameter optimization NULL if the hyperparameter opimization was not run yet. -\item \code{bestQ}: This returns the optimal size of the +\item \code{bestQ()}: This returns the optimal size of the latent space according to the hyperparameter optimization or a simple estimate of about a tenth of the number of samples if the hyperparameter opimization was not run yet. -\item \code{dontWriteHDF5}: Gets the current value of whether the +\item \code{dontWriteHDF5()}: Gets the current value of whether the assays should be stored as hdf5 files. -\item \code{dontWriteHDF5<-}: Sets whether the assays should be stored +\item \code{dontWriteHDF5(fds) <- value}: Sets whether the assays should be stored as hdf5 files. -\item \code{verbose}: Dependend on the level of verbosity +\item \code{verbose()}: Dependend on the level of verbosity the algorithm reports more or less to the user. 0 means being quiet and 10 means everything. -\item \code{verbose<-}: Sets the verbosity level to a value +\item \code{verbose(fds) <- value}: Sets the verbosity level to a value between 0 and 10. 0 means being quiet and 10 means reporting everything. -}} +}} \examples{ fds <- createTestFraserDataSet() diff --git a/man/spliceTypeAnnotations.Rd b/man/potentialImpactAnnotations.Rd similarity index 76% rename from man/spliceTypeAnnotations.Rd rename to man/potentialImpactAnnotations.Rd index b28e3c33..6ffec6a7 100644 --- a/man/spliceTypeAnnotations.Rd +++ b/man/potentialImpactAnnotations.Rd @@ -1,19 +1,19 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/resultAnnotations.R -\name{spliceTypeAnnotations} -\alias{spliceTypeAnnotations} +\name{potentialImpactAnnotations} +\alias{potentialImpactAnnotations} \alias{annotateIntronReferenceOverlap} -\alias{annotateSpliceEventType} +\alias{annotatePotentialImpact} \alias{flagBlacklistRegions} \title{Additional result annotations} \usage{ annotateIntronReferenceOverlap(fds, txdb, BPPARAM = bpparam()) -annotateSpliceEventType( +annotatePotentialImpact( result, txdb, fds, - addSpliceType = TRUE, + addPotentialImpact = TRUE, addUTRoverlap = TRUE, minoverlap = 5, BPPARAM = bpparam() @@ -38,8 +38,8 @@ flagBlacklistRegions( \code{annotatedJunction} as generated by the function \code{annotateIntronReferenceOverlap}.} -\item{addSpliceType}{Logical, indicating if the type of the splice event -should be added to the results table. Defaults to \code{TRUE}.} +\item{addPotentialImpact}{Logical, indicating if the type of the potential +impact should be added to the results table. Defaults to \code{TRUE}.} \item{addUTRoverlap}{Logical, indicating if the overlap with UTR regions should checked and added to the results table. Defaults to \code{TRUE}.} @@ -61,23 +61,23 @@ An annotated FraserDataSet or results table, respectively } \description{ These functions work on the result table and add additional - annotations to the reported introns: the type of splice event (e.g. - exon skipping, exon truncation, ...), expected occurence of frameshift, - overlap with UTR regions as well as a flag for introns that are - located in blacklist regions of the genome. + annotations to the reported introns: the type of potential impact on + splicing (e.g. exon skipping, exon truncation, ...), potential occurence + of frameshift, overlap with UTR regions as well as a flag for introns + that are located in blacklist regions of the genome. \code{\link{annotateIntronReferenceOverlap}} adds basic annotations to the fds for each intron based on the overlap of the intron's location with the reference annotation. Has to be run before the result table is created so that the new column can be included in it (see examples). -\code{\link{annotateSpliceEventType}} annotates each intron in the results - table with the type of splice event and expected occurence of frameshift - (likely, unlikely, inconclusive). Can also calculate overlap with - annotated UTR regions. Splice types can be one of: +\code{\link{annotatePotentialImpact}} annotates each intron in the results + table with the type of potential impact on splicing and potential + occurence of frameshift (likely, unlikely, inconclusive). Can also + calculate overlap with annotated UTR regions. Potential impact can be: annotatedIntron_increasedUsage, annotatedIntron_reducedUsage, exonTruncation, exonElongation, exonTruncation&Elongation, - singleExonSkipping, exonSkipping, splicingBeyondGene, + exonSkipping, splicingBeyondGene, multigenicSplicing, downstreamOfNearestGene, upstreamOfNearestGene, complex (everything else). Splice sites (theta metric) annotations indicate how the splice site is @@ -93,20 +93,20 @@ These functions work on the result table and add additional } \section{Functions}{ \itemize{ -\item \code{annotateIntronReferenceOverlap}: This method calculates basic annotations +\item \code{annotateIntronReferenceOverlap()}: This method calculates basic annotations based on overlap with the reference annotation (start, end, none, both) for the full fds. The overlap type is added as a new column \code{annotatedJunction} in \code{mcols(fds)}. -\item \code{annotateSpliceEventType}: This method annotates the splice event +\item \code{annotatePotentialImpact()}: This method annotates the splice event type to junctions in the given results table. -\item \code{flagBlacklistRegions}: This method flags all introns and +\item \code{flagBlacklistRegions()}: This method flags all introns and splice sites in the given results table for which at least one splice site (donor or acceptor) is located in a blacklist region. Blacklist regions of the genome are determined from the provided BED file. -}} +}} \examples{ # get data, fit and compute p-values and z-scores fds <- createTestFraserDataSet() @@ -123,8 +123,8 @@ regions of the genome are determined from the provided BED file. # of 1 is used to get at least one result. res <- results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=NA) - # annotate the type of splice event and UTR overlap - res <- annotateSpliceEventType(result=res, txdb=txdb, fds=fds) + # annotate the type of potential impact on splicing and UTR overlap + res <- annotatePotentialImpact(result=res, txdb=txdb, fds=fds) # annotate overlap with blacklist regions res <- flagBlacklistRegions(result=res, assemblyVersion="hg19") From c1fb2b318a434d9f640168ddd2e81742ad5cf54e Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Thu, 15 Dec 2022 15:28:45 +0100 Subject: [PATCH 59/80] update potential frameshift colname --- R/FRASER-package.R | 6 +++--- R/resultAnnotations.R | 24 ++++++++++++------------ 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/R/FRASER-package.R b/R/FRASER-package.R index 85569e89..a175a0bd 100644 --- a/R/FRASER-package.R +++ b/R/FRASER-package.R @@ -134,10 +134,10 @@ globalVariables(c(".", "J", ".N", ".asDataFrame", "End", "first_feature", "psiType", "psiValue", "seqlength", "seqlevel", "Step", "traceNr", "uniqueID", "V1", "value", "zscore", "maxDTheta", "par", "genes_donor", "genes_acceptor", "gene_pval", "gene_padj", "dt_idx", - "blacklist", "spliceType", "causesFrameshift", "annotatedJunction", + "blacklist", "potentialImpact", "causesFrameshift", "annotatedJunction", "distNearestGene", "UTR_overlap", "meanCount", "medianCount", - "spliceType2", "nonsplitProportion", "nonsplitCounts", - "nonsplitProportion_99quantile", "startID", "endID", "j_idx", + "potentialImpact2", "nonsplitProportion", "nonsplitCounts", + "nonsplitProportion_99quantile", "startID", "endID", "j_idx", "jidx", "start_idx", "end_idx", "pval_gene", "FDR_subset_gene", "gene_id", "pvalue"), package="FRASER") diff --git a/R/resultAnnotations.R b/R/resultAnnotations.R index bacc8b1c..7ab9b61d 100644 --- a/R/resultAnnotations.R +++ b/R/resultAnnotations.R @@ -329,23 +329,23 @@ addPotentialImpactLabels <- function(junctions_dt, fds, txdb){ # prepare the results column junctions_dt[, potentialImpact := "complex"] - junctions_dt[, potentialFrameshift := "inconclusive"] + junctions_dt[, causesFrameshift := "inconclusive"] junctions_dt[annotatedJunction == "both" & deltaPsi >= 0, potentialImpact := "annotatedIntron_increasedUsage"] junctions_dt[annotatedJunction == "both" & deltaPsi < 0, potentialImpact := "annotatedIntron_reducedUsage"] - junctions_dt[annotatedJunction == "both", potentialFrameshift := "unlikely"] + junctions_dt[annotatedJunction == "both", causesFrameshift := "unlikely"] if(all(c("nonsplitProportion", "nonsplitProportion_99quantile") %in% colnames(junctions_dt))){ - junctions_dt[spliceType == "annotatedIntron_reducedUsage" & + junctions_dt[potentialImpact == "annotatedIntron_reducedUsage" & type == "jaccard" & nonsplitProportion >= nonsplitProportion_99quantile + 0.05 & nonsplitCounts >= 10, - spliceType := "(partial)intronRetention"] + potentialImpact := "(partial)intronRetention"] # TODO check frameshift for intron retention - junctions_dt[spliceType == "(partial)intronRetention", + junctions_dt[potentialImpact == "(partial)intronRetention", causesFrameshift := "inconclusive"] } @@ -367,7 +367,7 @@ addPotentialImpactLabels <- function(junctions_dt, fds, txdb){ intron_ranges, exons)) }) junctions_dt[psi_positions[starts], - potentialFrameshift:=start_results[2,]] + causesFrameshift:=start_results[2,]] junctions_dt[psi_positions[starts], potentialImpact := start_results[1,]] @@ -384,7 +384,7 @@ addPotentialImpactLabels <- function(junctions_dt, fds, txdb){ intron_ranges, exons)) }) - junctions_dt[psi_positions[ends], potentialFrameshift:=end_results[2,]] + junctions_dt[psi_positions[ends], causesFrameshift:=end_results[2,]] junctions_dt[psi_positions[ends], potentialImpact := end_results[1,]] # none junctions pt1 @@ -436,7 +436,7 @@ addPotentialImpactLabels <- function(junctions_dt, fds, txdb){ return(c(combined,frs)) }) - junctions_dt[psi_positions[nones], potentialFrameshift:=none_results[2,]] + junctions_dt[psi_positions[nones], causesFrameshift:=none_results[2,]] junctions_dt[psi_positions[nones], potentialImpact := none_results[1,]] noLaps <-which(junctions_dt[psi_positions]$potentialImpact=="noOverlap") @@ -467,7 +467,7 @@ addPotentialImpactLabels <- function(junctions_dt, fds, txdb){ return(c("complex","inconclusive")) }) junctions_dt[psi_positions[noLaps], - potentialFrameshift:=noLaps_results[2,]] + causesFrameshift:=noLaps_results[2,]] junctions_dt[psi_positions[noLaps], potentialImpact := noLaps_results[1,]] @@ -478,7 +478,7 @@ addPotentialImpactLabels <- function(junctions_dt, fds, txdb){ # specify default type for theta results as NA junctions_dt[thetas, potentialImpact := NA] - junctions_dt[thetas, potentialFrameshift := NA] + junctions_dt[thetas, causesFrameshift := NA] # label all as intronic first if they have any intron overlap intronic <- unique(from(findOverlaps(junctions_gr, introns_tmp))) @@ -895,11 +895,11 @@ checkExonSkipping <- function(junctions_dt, txdb){ junctions_dt[potentialImpact2 == "splicingBeyondGene", potentialImpact := "splicingBeyondGene"] junctions_dt[potentialImpact2 == "splicingBeyondGene", - potentialFrameshift := "inconclusive"] + causesFrameshift := "inconclusive"] junctions_dt[potentialImpact2 == "multigenicSplicing", potentialImpact := "multigenicSplicing"] junctions_dt[potentialImpact2 == "multigenicSplicing", - potentialFrameshift := "inconclusive"] + causesFrameshift := "inconclusive"] junctions_dt[, potentialImpact2 := NULL] } From 5625f6116c357b4ab22a4ee965c99912e9e09dba Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Mon, 19 Dec 2022 16:49:58 +0100 Subject: [PATCH 60/80] fix to ignore cross-chromosome read pairs during counting --- R/countRNAseqData.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/countRNAseqData.R b/R/countRNAseqData.R index ee9f908b..9ef6ef5f 100644 --- a/R/countRNAseqData.R +++ b/R/countRNAseqData.R @@ -569,6 +569,10 @@ countSplitReadsPerChromosome <- function(chromosome, bamFile, bamFile, param=param, strandMode=strandMode) } + # remove read pairs with NA seqnames + # (occurs if reads of a pair align to different chromosomes) + galignment <- galignment[!is.na(seqnames(galignment))] + # remove the strand information if unstranded data if(isFALSE(as.logical(strandMode))){ strand(galignment) <- "*" From 8de38a220558da9570ab96c821513ba5c86aff5f Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Mon, 19 Dec 2022 17:05:49 +0100 Subject: [PATCH 61/80] use correct pseudocount in plot heatmap function --- R/plotMethods.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/plotMethods.R b/R/plotMethods.R index e21471b3..555385f8 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -1034,7 +1034,7 @@ plotCountCorHeatmap.FRASER <- function(object, object <- object[,ids2plot] } - xmat <- (skmat + 1)/(snmat + 2) + xmat <- (skmat + 1*pseudocount())/(snmat + 2*pseudocount()) if(isTRUE(logit)){ xmat <- qlogisWithCap(xmat) } From cf115564057081bb353add734147492f1b245d3a Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Tue, 20 Dec 2022 09:27:08 +0100 Subject: [PATCH 62/80] fix typo --- R/AllGenerics.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 5c4cff54..db93ff9c 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -870,10 +870,10 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, function(s) which(colnames(tmp_x) ==s)), FDR_subset)] ) - # replace padj values with values on subset - padj <- matrix(NA, nrow=nrow(pvals), ncol=ncol(pvals)) + # replace padjs values with values on subset + padjs <- matrix(NA, nrow=nrow(pvals), ncol=ncol(pvals)) if(nrow(subset_padj) > 0){ - padj[subset_padj[,1:2]] <- subset_padj[,3] + padjs[subset_padj[,1:2]] <- subset_padj[,3] } } From f57f4c8dd189d47e205cc9afe38df47ff1be458b Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Tue, 20 Dec 2022 10:12:37 +0100 Subject: [PATCH 63/80] add colnames to subsetted padjust for later retrieval --- R/AllGenerics.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index db93ff9c..7a632521 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -875,6 +875,7 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, if(nrow(subset_padj) > 0){ padjs[subset_padj[,1:2]] <- subset_padj[,3] } + colnames(padjs) <- colnames(pvals) } if(length(sc) == 1){ From f17ffe825e0eece2418d428e70c24d5bd31a377d Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Thu, 19 Jan 2023 15:54:19 +0100 Subject: [PATCH 64/80] minor bug fixes --- R/AllGenerics.R | 19 ++++++++++++------- R/plotMethods.R | 3 ++- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 7a632521..fdbf7045 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -831,10 +831,10 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, # get row,col idx of genes/samples in subset subset_gene_padj <- as.matrix( fdr_subset[sampleID %in% samples(tmp_x), - .(sapply(gene, - function(g) which(rownames(pvalsGene) == g)), - sapply(sampleID, - function(s) which(colnames(tmp_x) ==s)), + .(as.numeric(sapply(gene, + function(g) which(rownames(pvalsGene) == g)) ), + as.numeric(sapply(sampleID, + function(s) which(colnames(tmp_x) ==s)) ), pval_gene, FDR_subset_gene)] ) @@ -850,9 +850,9 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, colnames(padjsGene) <- colnames(tmp_x) if(nrow(subset_gene_padj) > 0){ pvalsGene[subset_gene_padj[,1:2]] <- - subset_gene_padj[,3] + unlist(subset_gene_padj[,3]) padjsGene[subset_gene_padj[,1:2]] <- - subset_gene_padj[,4] + unlist(subset_gene_padj[,4]) } } @@ -1188,7 +1188,12 @@ aberrant.FRASER <- function(object, type=fitMetrics(object), # define aberrant status based on whether intron/sample tuples are # part of the given subset - aberrantEvents <- matrix(FALSE, nrow=nrow(padj), ncol=ncol(padj)) + if(is.null(dim(padj))){ + # only one sample in fds present -> no dimensions set + aberrantEvents <- matrix(FALSE, nrow=length(padj), ncol=1) + } else{ + aberrantEvents <- matrix(FALSE, nrow=nrow(padj), ncol=ncol(padj)) + } colnames(aberrantEvents) <- colnames(padj) FDR_col <- ifelse(isTRUE(aggregate), "FDR_subset_gene", "FDR_subset") subset_idx <- lapply(fdr_subset[, unique(sampleID)], function(sid){ diff --git a/R/plotMethods.R b/R/plotMethods.R index 555385f8..faace9a4 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -440,7 +440,8 @@ plotAberrantPerSample.FRASER <- function(object, main, scale_linetype_manual(name="", values=2, labels="Median") if(!all(dt2p[,value] == 0)){ - g <- g + scale_y_log10() + annotation_logticks(sides="l") + g <- g + scale_y_log10(limits=c(1, max(unlist(outliers)))) + + annotation_logticks(sides="l") } g From ade6ebfd29e060217a7bb42e5e754f5c0302bc57 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Thu, 19 Jan 2023 16:37:55 +0100 Subject: [PATCH 65/80] fix small bug in extracting junction idx of genes in subsets --- R/pvalsNzscore.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R index 979d97e8..8a9e7e82 100644 --- a/R/pvalsNzscore.R +++ b/R/pvalsNzscore.R @@ -426,7 +426,8 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds), # message(date(), ": FDR subset calculation for sample = ", sample_id) # get idx of junctions corresponding to genes with var jidx <- unlist(lapply(genes_to_test_sample, function(gene){ - idx <- which(grepl(gene, mcols(fds, type=type)[, geneColumn])) + idx <- which(grepl(paste0("(^|;)", gene, "(;|$)"), + mcols(fds, type=type)[, geneColumn])) names(idx) <- rep(gene, length(idx)) if(length(idx) == 0 && verbose(fds) > 0){ warning("No introns found in fds object for gene: ", gene, From 37c757aebafbdf957abea734e70ebde1c9865efa Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Wed, 15 Feb 2023 15:44:32 +0100 Subject: [PATCH 66/80] move nonsplitProportion calculation before subsetting in results function --- R/AllGenerics.R | 50 ++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index fdbf7045..b99ddd8e 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -749,6 +749,18 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, rowMeansK <- rowMeans(K(object, type=type)) rowMeansN <- rowMeans(N(object, type=type)) + # get proportion of nonsplitCounts among all counts (N) for each intron + if(type == "jaccard"){ + rawNonsplitCts <- as.matrix(assay(object, "rawCountsJnonsplit")) + rawNsProportion <- rawNonsplitCts / as.matrix(N(object)) + nsProportion_99quantile <- + rowQuantiles(rawNsProportion, probs=0.99) + } else{ + rawNonsplitCts <- NULL + rawNsProportion <- NULL + nsProportion_99quantile <- NULL + } + # calculate FDR on subset first if requested if(!is.null(geneSubset)){ object <- calculatePadjValuesOnSubset(fds=object, type=type, @@ -884,33 +896,21 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, colnames(deltaPsiVals) <- sc } - if(type == "jaccard"){ - rawNonsplitCts <- as.matrix(assay(tmp_x, "rawCountsJnonsplit")) - rawNsProportion <- rawNonsplitCts / rawTotalCts - nsProportion_99quantile <- - rowQuantiles(rawNsProportion, probs=0.99) - } else{ - rawNonsplitCts <- NULL - rawNsProportion <- NULL - nsProportion_99quantile <- NULL - } - - # create result table sampleRes <- lapply(sc, - resultsSingleSample, gr=gr, pvals=pvals, - padjs=padjs, psiType=type, - psivals=psivals, deltaPsiVals=deltaPsiVals, - rawCts=rawCts, rawTotalCts=rawTotalCts, - rawNonsplitCts=rawNonsplitCts, - rawNsProportion=rawNsProportion, - nsProportion_99quantile=nsProportion_99quantile, - rowMeansK=rowMeansK, rowMeansN=rowMeansN, - aberrant=aberrant, aggregate=aggregate, - rho=rho, geneColumn=geneColumn, - pvalsGene=pvalsGene, padjsGene=padjsGene, - aberrantGene=aberrantGene, - additionalColumns=additionalColumns) + resultsSingleSample, gr=gr, pvals=pvals, + padjs=padjs, psiType=type, + psivals=psivals, deltaPsiVals=deltaPsiVals, + rawCts=rawCts, rawTotalCts=rawTotalCts, + rawNonsplitCts=rawNonsplitCts[,sc], + rawNsProportion=rawNsProportion[,sc], + nsProportion_99quantile=nsProportion_99quantile, + rowMeansK=rowMeansK, rowMeansN=rowMeansN, + aberrant=aberrant, aggregate=aggregate, + rho=rho, geneColumn=geneColumn, + pvalsGene=pvalsGene, padjsGene=padjsGene, + aberrantGene=aberrantGene, + additionalColumns=additionalColumns) # return combined result return(unlist(GRangesList(sampleRes))) From c1e13d3d63731e5c326543ea8049f8f17e838641 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Wed, 15 Feb 2023 17:37:32 +0100 Subject: [PATCH 67/80] small bugfix --- R/AllGenerics.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index b99ddd8e..c27960f1 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -902,8 +902,8 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, padjs=padjs, psiType=type, psivals=psivals, deltaPsiVals=deltaPsiVals, rawCts=rawCts, rawTotalCts=rawTotalCts, - rawNonsplitCts=rawNonsplitCts[,sc], - rawNsProportion=rawNsProportion[,sc], + rawNonsplitCts=rawNonsplitCts[,sc,drop=FALSE], + rawNsProportion=rawNsProportion[,sc,drop=FALSE], nsProportion_99quantile=nsProportion_99quantile, rowMeansK=rowMeansK, rowMeansN=rowMeansN, aberrant=aberrant, aggregate=aggregate, From 68f703912b2207b056c9ce8986864b51e34849ed Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Thu, 16 Feb 2023 10:08:07 +0100 Subject: [PATCH 68/80] small bugfix --- R/AllGenerics.R | 2 +- R/pvalsNzscore.R | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index c27960f1..6b72e958 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -1194,7 +1194,7 @@ aberrant.FRASER <- function(object, type=fitMetrics(object), } else{ aberrantEvents <- matrix(FALSE, nrow=nrow(padj), ncol=ncol(padj)) } - colnames(aberrantEvents) <- colnames(padj) + colnames(aberrantEvents) <- colnames(object) FDR_col <- ifelse(isTRUE(aggregate), "FDR_subset_gene", "FDR_subset") subset_idx <- lapply(fdr_subset[, unique(sampleID)], function(sid){ col <- which(colnames(object) == sid) diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R index 8a9e7e82..ccad870f 100644 --- a/R/pvalsNzscore.R +++ b/R/pvalsNzscore.R @@ -452,7 +452,11 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds), } # retrieve pvalues of junctions to test - p <- pVals(fds, type=type, level="junction")[jidx, sample_id] + p <- as.matrix(pVals(fds, type=type, level="junction")) + if(ncol(p) == 1){ + colnames(p) <- colnames(fds) + } + p <- p[jidx, sample_id] # FDR correction pa <- p.adjust(p, method=method) From 63e0654c6217a215321b1107f61cc41b6a8c406e Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Thu, 16 Feb 2023 17:35:44 +0100 Subject: [PATCH 69/80] fix gene pval in FDR on subset for psi3/5 --- R/pvalsNzscore.R | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R index ccad870f..94c9c151 100644 --- a/R/pvalsNzscore.R +++ b/R/pvalsNzscore.R @@ -418,6 +418,9 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds), "annotateRanges or annotateRangesWithTxDb function.")) } + # site index (for psi3/5) + site_idx <- getSiteIndex(fds, type=type) + # compute FDR on the given subsets of genes message(date(), ": starting FDR calculation on subset of genes...") FDR_subset <- rbindlist(bpmapply(names(genesToTest), genesToTest, @@ -451,20 +454,22 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds), FDR_subset_gene=numeric(0))) } - # retrieve pvalues of junctions to test - p <- as.matrix(pVals(fds, type=type, level="junction")) + # retrieve pvalues of junctions + p <- as.matrix(pVals(fds, type=type)) if(ncol(p) == 1){ colnames(p) <- colnames(fds) } p <- p[jidx, sample_id] # FDR correction - pa <- p.adjust(p, method=method) + pa <- p.adjust(p[!duplicated(site_idx[jidx])], method=method) # gene level pvals - dt <- data.table(sampleID=sample_id, type=type, pval=p, FDR_subset=pa, - gene=names(jidx), jidx=jidx) - dt[, pval_gene:=min(p.adjust(pval, method="holm")), by="gene"] + dt <- data.table(sampleID=sample_id, type=type, pval=p, + gene=names(jidx), jidx=jidx, site_idx=site_idx[jidx]) + dt <- merge(dt, data.table(site_idx=site_idx[jidx][!duplicated(site_idx[jidx])], FDR_subset=pa), by="site_idx") + dt[!duplicated(dt$site_idx), pval_gene:=min(p.adjust(pval, method="holm")), by="gene"] + dt[, pval_gene := .SD[!is.na(pval_gene), unique(pval_gene)], by="gene"] # gene level FDR dt2 <- dt[, unique(pval_gene), by="gene"] From 17811132069569a308d5e68fb3912f1a668e7d6b Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Mon, 13 Mar 2023 15:13:29 +0100 Subject: [PATCH 70/80] enable subsetting on manhattan plot & added plotSpliceMetricRank --- NAMESPACE | 1 + R/plotMethods.R | 177 ++++++++++++++++++++++++++++++++++++++++--- man/plotFunctions.Rd | 34 ++++++++- 3 files changed, 197 insertions(+), 15 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index bf731b72..64a7caf6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -76,6 +76,7 @@ export(plotFilterExpression) export(plotFilterVariability) export(plotManhattan) export(plotQQ) +export(plotSpliceMetricRank) export(plotVolcano) export(predictedMeans) export(pseudocount) diff --git a/R/plotMethods.R b/R/plotMethods.R index faace9a4..7609939c 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -18,6 +18,7 @@ #' \item plotBamCoverage() #' \item plotBamCoverageFromResultTable() #' \item plotManhattan() +#' \item plotSpliceMetricRank() #' } #' #' For a detailed description of each plot function please see the details. @@ -162,7 +163,11 @@ #' (b,l,t,r). #' @param cex For controlling the size of text and numbers in #' \code{plotBamCoverage}. -#' @param color_chr Interchanging colors by chromosome for \code{plotManhattan}. +#' @param chr Vector of chromosome names to show in \code{plotManhattan}. The +#' default is to show all chromosomes. +#' @param value Indicates which assay is shown in the manhattan plot. Defaults +#' to 'pvalue'. Other options are 'deltaPsi' and 'zScore'. +#' @param chrColor Interchanging colors by chromosome for \code{plotManhattan}. #' #### Additional ... parameter #' @param ... Additional parameters passed to plot() or plot_ly() if not stated @@ -190,6 +195,9 @@ #' #' \code{plotExpectedVsObservedPsi}: A scatter plot of the observed psi #' against the predicted psi for a given site. +#' +#' \code{plotSpliceMetricRank}: This function plots for a given intron the +#' observed values of the selected splice metrix against the sample rank. #' #' \code{plotCountCorHeatmap}: The correlation heatmap of the count data either #' of the full data set (i.e. sample-sample correlations) or of the top x most @@ -549,6 +557,116 @@ plotExpression <- function(fds, type=fitMetrics(fds), plotBasePlot(g, basePlot) } +#' +#' Junction splice metric plot +#' +#' Plots the observed values of the splice metric across samples for a +#' junction of interest. +#' +#' @rdname plotFunctions +#' @export +plotSpliceMetricRank <- function(fds, type=fitMetrics(fds), + idx=NULL, result=NULL, colGroup=NULL, + basePlot=TRUE, main=NULL, label="aberrant", ...){ + if(!is.null(result)){ + type <- as.character(result$type) + idx <- getIndexFromResultTable(fds, result) + } else { + type <- match.arg(type) + } + + dt <- getPlottingDT(fds, axis="row", type=type, idx=idx, ...) + dt[,featureID:=limitGeneNamesList(featureID, maxLength=3)] + + # rank on observed value of splice metric of interest + dt[, rank := rank(obsPsi, ties.method="random", na.last=FALSE)] + + if(!is.null(colGroup)){ + if(all(colGroup %in% samples(fds))){ + colGroup <- samples(fds) %in% colGroup + } + dt[colGroup,aberrant:=TRUE] + } + dt[,aberrant:=factor(aberrant, levels=c("TRUE", "FALSE"))] + + gr <- granges(rowRanges(fds,type=type)[idx,]) + genomic_pos_label <- paste0(seqnames(gr), ":", start(gr), "-", end(gr), + ":", strand(gr)) + + if(is.null(main)){ + if(isTRUE(basePlot)){ + # main <- as.expression(bquote(bold(paste( + # .(ggplotLabelPsi(type)[[1]]), " rank plot: ", + # .(genomic_pos_label), + # " (", bolditalic(.(as.character(dt[,unique(featureID)]))), + # ")")))) + main <- as.expression(bquote(bold(paste( + .(genomic_pos_label), + " (", bolditalic(.(as.character(dt[,unique(featureID)]))), + ")")))) + } else{ + # main <- paste0(ggplotLabelPsi(type, asCharacter=TRUE)[[1]], + # " rank plot: ", dt[,unique(featureID)], + # " (site ", dt[,unique(idx)], ")") + main <- paste0(genomic_pos_label, + " (", dt[,unique(featureID)], ")") + } + } + + if(isTRUE(basePlot)){ + ylab <- bquote("Observed " ~ .(ggplotLabelPsi(type)[[1]])) + } else{ + ylab <- paste("Observed", ggplotLabelPsi(type, asCharacter=TRUE)[[1]]) + } + + g <- ggplot(dt, aes(x=rank, y=obsPsi, color=aberrant, label=sampleID, + text=paste0( + "Sample: ", sampleID, "
", + "Counts (K): ", k, "
", + "Total counts (N): ", n, "
", + "p value: ", signif(pval, 5), "
", + "padjust: ", signif(padj, 5), "
", + "Observed Psi: ", round(obsPsi, 2), "
", + "Predicted mu: ", round(predPsi, 2), "
"))) + + geom_point(alpha=ifelse(as.character(dt$aberrant) == "TRUE", 1, 0.7)) + + theme_bw() + + theme(legend.position="none", title=) + + xlab("Sample rank") + + ylab(ylab) + + ggtitle(main, subtitle=paste0("fds row index: ", dt[,unique(idx)])) + + ylim(0,1) + + + if(isTRUE(basePlot) && !is.null(label)){ + if(isScalarCharacter(label) && label == "aberrant"){ + if(nrow(dt[aberrant == TRUE,]) > 0){ + g <- g + geom_text_repel(data=dt[aberrant == TRUE,], + aes(col=aberrant), + fontface='bold', hjust=-.2, vjust=-.2) + } + } + else{ + if(nrow(dt[sampleID %in% label]) > 0){ + g <- g + geom_text_repel(data=subset(dt, sampleID %in% label), + aes(col=aberrant), + fontface='bold', hjust=-.2, vjust=-.2) + } + if(any(!(label %in% dt[,sampleID]))){ + warning("Did not find sample(s) ", + paste(label[!(label %in% dt[,sampleID])], + collapse=", "), " to label.") + } + } + } + + if(is.null(colGroup)){ + g <- g + scale_colour_manual( + values=c("FALSE"="gray70", "TRUE"="firebrick")) + } + + plotBasePlot(g, basePlot) +} + #' #' Expected over Overserved plot @@ -1432,10 +1550,10 @@ plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE, return(invisible(fds)) } -plotManhattan.FRASER <- function(object, sampleID, - type=fitMetrics(object), - main=paste0("sampleID = ", sampleID), - color_chr=c("black", "darkgrey"), +plotManhattan.FRASER <- function(object, sampleID, value="pvalue", + type=fitMetrics(object), chr=NULL, + main=paste0("sample: ", sampleID), + chrColor=c("black", "darkgrey"), ...){ # check necessary packages if (!requireNamespace('ggbio')){ @@ -1454,7 +1572,7 @@ plotManhattan.FRASER <- function(object, sampleID, if("padjCutoff" %in% names(additional_args)){ padjCutoff <- additional_args$padjCutoff } - deltaPsiCutoff <- 0.3 + deltaPsiCutoff <- ifelse(type == "jaccard", 0.1, 0.3) if("deltaPsiCutoff" %in% names(additional_args)){ deltaPsiCutoff <- additional_args$deltaPsiCutoff } @@ -1468,6 +1586,20 @@ plotManhattan.FRASER <- function(object, sampleID, padjVals(object, type=type, level="site")[,sampleID]) mcols(gr_sample)[,"delta"] <- deltaPsiValue(object, type=type)[,sampleID] + # Add values to granges + if(value %in% c('pvalue', 'pValue', 'pv')){ + gr_sample$value <- mcols(gr_sample)[, "pvalue"] + ylabel <- expression(paste(-log[10], "(P-value)")) + } + if(value %in% c('zscore', 'zScore')){ + gr_sample$value <- zScores(object, type=type)[, sampleID] + ylabel <- value + } + if(value %in% c('delta', 'deltaPsi', 'deltaJaccard')){ + gr_sample$value <- mcols(gr_sample)[, "delta"] + ylabel <- bquote(Delta ~ .(ggplotLabelPsi(type)[[1]])) + } + # only one point per donor/acceptor site (relevant only for psi5 and psi3) index <- getSiteIndex(object, type=type) nonDup <- !duplicated(index) @@ -1477,6 +1609,24 @@ plotManhattan.FRASER <- function(object, sampleID, gr_sample <- sortSeqlevels(gr_sample) gr_sample <- sort(gr_sample) + # subset to chromosomes in chrSubset if requested + if(!is.null(chr)){ + # check input + if(!all(chr %in% unique(seqnames(gr_sample)))){ + stop("Not all chromosomes selected for subsetting are present ", + "in the GRanges object.") + } + + # subset + gr_sample <- gr_sample[as.character(seqnames(gr_sample)) %in% chr] + + # add chr to plot title if only one chr given + if(length(chr) == 1){ + main <- paste0(main, "; ", + paste(chr, collapse=", ", sep="")) + } + } + # find outlier indices if(!type %in% c("psi3", "psi5")){ outlier_idx <- which(gr_sample$padjust >= -log10(padjCutoff) & @@ -1487,11 +1637,12 @@ plotManhattan.FRASER <- function(object, sampleID, message("highlighting ", length(gr_sample[outlier_idx,]), " outliers ...") # plot manhattan plot - plotGrandLinear.adapted(gr_sample, aes(y=pvalue), - color=color_chr, + plotGrandLinear.adapted(gr_sample, aes(y=value), + color=chrColor, highlight.gr=gr_sample[outlier_idx,], - highlight.overlap="equal") + - labs(title=main) + highlight.overlap="equal", + use.genome.coords=is.null(chr)) + + labs(title=main, x="", y=ylabel) } @@ -1614,7 +1765,7 @@ plotGrandLinear.adapted <- function (obj, ..., facets, space.skip = 0.01, highlight.label.size = 5, highlight.label.offset = 0.05, highlight.label.col = "black", highlight.overlap = c("any", "start", "end", "within", "equal"), - spaceline = FALSE){ + spaceline = FALSE, use.genome.coords=TRUE){ if (is.null(geom)) geom <- "point" args <- list(...) @@ -1644,7 +1795,9 @@ plotGrandLinear.adapted <- function (obj, ..., facets, space.skip = 0.01, } if (!"y" %in% names(args.aes)) stop("need to provide y") - args.non$coord <- "genome" + if(isTRUE(use.genome.coords)){ + args.non$coord <- "genome" + } args.non$space.skip <- space.skip args.non$geom <- geom args.non$object <- obj diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd index 79b1db7c..db8025a0 100644 --- a/man/plotFunctions.Rd +++ b/man/plotFunctions.Rd @@ -15,6 +15,7 @@ \alias{plotBamCoverageFromResultTable} \alias{plotVolcano,FraserDataSet-method} \alias{plotAberrantPerSample,FraserDataSet-method} +\alias{plotSpliceMetricRank} \alias{plotQQ,FraserDataSet-method} \alias{plotEncDimSearch,FraserDataSet-method} \alias{plotFilterVariability} @@ -60,6 +61,18 @@ plotExpression( ... ) +plotSpliceMetricRank( + fds, + type = fitMetrics(fds), + idx = NULL, + result = NULL, + colGroup = NULL, + basePlot = TRUE, + main = NULL, + label = "aberrant", + ... +) + plotExpectedVsObservedPsi( fds, type = fitMetrics(fds), @@ -169,9 +182,11 @@ plotBamCoverageFromResultTable( \S4method{plotManhattan}{FraserDataSet}( object, sampleID, + value = "pvalue", type = fitMetrics(object), - main = paste0("sampleID = ", sampleID), - color_chr = c("black", "darkgrey"), + chr = NULL, + main = paste0("sample: ", sampleID), + chrColor = c("black", "darkgrey"), ... ) } @@ -371,7 +386,13 @@ junction should be extended to the left in outlier junction should be extended to the right in \code{plotBamCoverageFromResultTable}.} -\item{color_chr}{Interchanging colors by chromosome for \code{plotManhattan}.} +\item{value}{Indicates which assay is shown in the manhattan plot. Defaults +to 'pvalue'. Other options are 'deltaPsi' and 'zScore'.} + +\item{chr}{Vector of chromosome names to show in \code{plotManhattan}. The +default is to show all chromosomes.} + +\item{chrColor}{Interchanging colors by chromosome for \code{plotManhattan}.} } \value{ If base R graphics are used nothing is returned else the plotly or @@ -390,6 +411,9 @@ Plot the number of aberrant events per samples Plots the observed split reads of the junction of interest over all reads coming from the given donor/acceptor. +Plots the observed values of the splice metric across samples for a +junction of interest. + Plots the expected psi value over the observed psi value of the given junction. @@ -416,6 +440,7 @@ This is the list of all plotting function provided by FRASER: \item plotBamCoverage() \item plotBamCoverageFromResultTable() \item plotManhattan() + \item plotSpliceMetricRank() } For a detailed description of each plot function please see the details. @@ -444,6 +469,9 @@ log10 space. \code{plotExpectedVsObservedPsi}: A scatter plot of the observed psi against the predicted psi for a given site. +\code{plotSpliceMetricRank}: This function plots for a given intron the +observed values of the selected splice metrix against the sample rank. + \code{plotCountCorHeatmap}: The correlation heatmap of the count data either of the full data set (i.e. sample-sample correlations) or of the top x most variable junctions (i.e. junction-sample correlations). By default the values From 3e8b39fbf65d4203af9405ce51c4f64bd0dc3f62 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Wed, 15 Mar 2023 15:57:09 +0100 Subject: [PATCH 71/80] added new plot functions to vignette --- vignettes/FRASER.Rnw | 62 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 57 insertions(+), 5 deletions(-) diff --git a/vignettes/FRASER.Rnw b/vignettes/FRASER.Rnw index ef49a129..f911b8ec 100644 --- a/vignettes/FRASER.Rnw +++ b/vignettes/FRASER.Rnw @@ -556,17 +556,19 @@ the following additional information: sample in which it is detected as aberrant \item deltaPsi: the $\Delta\psi$-value of the event in this sample, which is the difference between the actual observed $\psi$ and the expected $\psi$ + \item counts, totalCounts: the count (k) and total count (n) of the splice + junction or site for the sample where it is detected as aberrant \item meanCounts: the mean count (k) of reads mapping to this splice junction or site over all samples \item meanTotalCounts: the mean total count (n) of reads mapping to the same donor or acceptor site as this junction or site over all samples - \item counts, totalCounts: the count (k) and total count (n) of the splice - junction or site for the sample where it is detected as aberrant \item nonsplitCounts, nonsplitProportion: only present for the Intron Jaccard Index. States the sum of nonsplit counts overlapping either the donor or acceptor site of the outlier intron for the sample where it is detected as aberrant; and their proportion out of the total counts (N). A high nonsplitProportion indicates possible (partial) intron retention. + \item FDR_set The set of genes on which FDR correction is applied. If not + otherwise specified, FDR correction is transcriptome-wide. \end{itemize} Please refer to section \ref{sec:Introduction} for more information about the Intron Jaccard Index metric (or the previous metrics $\psi_5$, $\psi_3$ and @@ -609,15 +611,16 @@ plotVolcano(fds, type="jaccard", "sample10") Which are the splicing events in detail? <>= -sampleRes <- res[res$sampleID == "sample8"] +sampleRes <- res[res$sampleID == "sample10"] sampleRes @ To have a closer look at the junction level, use the following functions: <>= -plotExpression(fds, type="jaccard", result=sampleRes[1]) -plotExpectedVsObservedPsi(fds, result=sampleRes[1]) +plotExpression(fds, type="jaccard", result=sampleRes[9]) +plotSpliceMetricRank(fds, type="jaccard", result=sampleRes[9]) +plotExpectedVsObservedPsi(fds, result=sampleRes[9]) @ \subsection{Saving and loading a \fds{}} @@ -737,6 +740,7 @@ head(padjVals(fds,type="jaccard")) In addition to the plotting methods \Rfunction{plotVolcano}, \Rfunction{plotExpression}, \Rfunction{plotExpectedVsObservedPsi}, +\Rfunction{plotSpliceMetricRank}, \Rfunction{plotFilterExpression} and \Rfunction{plotEncDimSearch} used above, the \fraser{} package provides two additional functions to visualize the results: @@ -755,6 +759,54 @@ plotQQ(fds, result=res[1]) plotQQ(fds, aggregate=TRUE, global=TRUE) @ +The \Rfunction{plotManhattan} function can be used to visualize the p-values +along with the genomic coordinates of the introns: +<>= +plotManhattan(fds, sampleID="sample10") +plotManhattan(fds, sampleID="sample10", chr="chr19") +@ + +Finally, when one has access to the bam files from which the split and unsplit +counts of FRASER were created, the \Rfunction{plotBamCoverage} and +\Rfunction{plotBamCoverageFromResultTable} functions use the \Rpackage{SGSeq} +package to allow visualizing the read coverage in the bam file a certain intron +from the results table or within a given genomic region as a sashimi plot: +<>= +### plot coverage from bam file for a certain genomic region +fds <- createTestFraserSettings() +vizRange <- GRanges(seqnames="chr19", + IRanges(start=7587496, end=7598895), + strand="+") +plotBamCoverage(fds, gr=vizRange, sampleID="sample3", + control_samples="sample2", min_junction_count=5, + curvature_splicegraph=1, curvature_coverage=1, + mar=c(1, 7, 0.1, 3)) + +### plot coverage from bam file for a row in the result table +fds <- createTestFraserDataSet() + +# load gene annotation +require(TxDb.Hsapiens.UCSC.hg19.knownGene) +txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene +require(org.Hs.eg.db) +orgDb <- org.Hs.eg.db + +# get results table +res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA) +res_dt <- as.data.table(res) +res_dt <- res_dt[sampleID == "sample2",] + +# plot full range of gene highlighting the outlier intron +plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=TRUE, + txdb=txdb, orgDb=orgDb, control_samples="sample3") + +# plot only certain range around the outlier intron +plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE, + control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb, + curvature_coverage=0.5, right_extension=5000, left_extension=5000, + splicegraph_labels="id") +@ + \bibliography{bibliography} \section{Session Info} From bcc4fd3a80913aaab59371f779984993e0d6e7fb Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Wed, 15 Mar 2023 16:09:47 +0100 Subject: [PATCH 72/80] fix vignette --- vignettes/FRASER.Rnw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/FRASER.Rnw b/vignettes/FRASER.Rnw index f911b8ec..99df108d 100644 --- a/vignettes/FRASER.Rnw +++ b/vignettes/FRASER.Rnw @@ -567,7 +567,7 @@ the following additional information: donor or acceptor site of the outlier intron for the sample where it is detected as aberrant; and their proportion out of the total counts (N). A high nonsplitProportion indicates possible (partial) intron retention. - \item FDR_set The set of genes on which FDR correction is applied. If not + \item FDR\_set The set of genes on which FDR correction is applied. If not otherwise specified, FDR correction is transcriptome-wide. \end{itemize} Please refer to section \ref{sec:Introduction} for more information about the From 5e6893ff08f5f70c7919b19a70e0afd869b46069 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Wed, 29 Mar 2023 17:21:05 +0200 Subject: [PATCH 73/80] simplify result retrieval for FDR on subset --- NAMESPACE | 1 + R/AllGenerics.R | 262 ++++++++---------------------- R/Fraser-pipeline.R | 18 +- R/getNSetterFuns.R | 37 ++++- R/helper-functions.R | 5 +- R/pvalsNzscore.R | 175 ++++++++++++++------ man/FRASER.Rd | 40 +++-- man/countRNA.Rd | 16 +- man/filtering.Rd | 8 +- man/getter_setter_functions.Rd | 47 +++--- man/potentialImpactAnnotations.Rd | 8 +- man/results.Rd | 44 ++--- tests/testthat/test_stats.R | 14 +- vignettes/FRASER.Rnw | 23 ++- 14 files changed, 358 insertions(+), 340 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 64a7caf6..907f1408 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -27,6 +27,7 @@ export(annotateIntronReferenceOverlap) export(annotatePotentialImpact) export(annotateRanges) export(annotateRangesWithTxDb) +export(availableFDRsubsets) export(bamFile) export(bestQ) export(calculatePSIValues) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 6b72e958..7e1bf696 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -714,8 +714,7 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, dPsiCutoff, minCount, rhoCutoff, psiType, maxCols=20, aggregate=FALSE, collapse=FALSE, geneColumn="hgnc_symbol", BPPARAM=bpparam(), - geneSubset=NULL, subsetName, fullSubset=FALSE, - additionalColumns=NULL){ + subsetName=NULL, all=all, additionalColumns=NULL){ stopifnot(is(object, "FraserDataSet")) stopifnot(all(sampleIDs %in% samples(object))) @@ -732,7 +731,8 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, } pvalsAvailable <- checkPadjAvailableForFilters(object, type=psiType, filters=list(rho=rhoCutoff), - aggregate=aggregate) + aggregate=aggregate, + subsetName=subsetName) psiType <- psiType[pvalsAvailable] if(all(isFALSE(pvalsAvailable))){ stop("For the splice metric(s), pvalues are not yet computed. \n", @@ -741,7 +741,9 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, } resultsls <- bplapply(psiType, BPPARAM=BPPARAM, function(type){ - message(date(), ": Collecting results for: ", type) + message(date(), ": Collecting results for: ", type, + ifelse(is.null(subsetName), " (transcriptome-wide)", + paste0(" (", subsetName, ")"))) currentType(object) <- type gr <- rowRanges(object, type=type) @@ -761,17 +763,6 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, nsProportion_99quantile <- NULL } - # calculate FDR on subset first if requested - if(!is.null(geneSubset)){ - object <- calculatePadjValuesOnSubset(fds=object, type=type, - genesToTest=geneSubset, - subsetName=subsetName, - geneColumn=geneColumn) - fdr_subset <- metadata(object)[[paste("FDR", subsetName, type, - sep="_")]] - object <- object[, fdr_subset[, unique(sampleID)]] - } - # then iterate by chunk chunkCols <- getMaxChunks2Read(fds=object, assayName=type, max=maxCols) sampleChunks <- getSamplesByChunk(fds=object, sampleIDs=sampleIDs, @@ -788,6 +779,7 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, pvals <- as.matrix(pVals(tmp_x, filters=list(rho=rhoCutoff))) padjs <- as.matrix(padjVals(tmp_x, + subsetName=subsetName, filters=list(rho=rhoCutoff))) psivals <- as.matrix(assay(tmp_x, type)) muPsi <- as.matrix(predictedMeans(tmp_x)) @@ -795,101 +787,36 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, (rawTotalCts + 2*pseudocount()) deltaPsiVals <- deltaPsiValue(tmp_x, type) rho <- rho(tmp_x, type) - if(is.null(geneSubset)){ - aberrant <- aberrant.FRASER(tmp_x, type=type, - padjCutoff=fdrCutoff, - deltaPsiCutoff=dPsiCutoff, - minCount=minCount, - rhoCutoff=rhoCutoff, - aggregate=FALSE, - geneColumn=geneColumn) - } else{ - aberrant <- aberrant.FRASER(tmp_x, type=type, + aberrant <- aberrant.FRASER(tmp_x, type=type, + padjCutoff=fdrCutoff, + deltaPsiCutoff=dPsiCutoff, + minCount=minCount, + rhoCutoff=rhoCutoff, + aggregate=FALSE, + all=all, + geneColumn=geneColumn, + subsetName=subsetName) + if(isTRUE(aggregate)){ + pvalsGene <- as.matrix(pVals(tmp_x, level="gene", + filters=list(rho=rhoCutoff))) + padjsGene <- as.matrix(padjVals(tmp_x, level="gene", + subsetName=subsetName, + filters=list(rho=rhoCutoff))) + aberrantGene <- aberrant.FRASER(tmp_x, type=type, padjCutoff=fdrCutoff, deltaPsiCutoff=dPsiCutoff, minCount=minCount, rhoCutoff=rhoCutoff, - aggregate=FALSE, + aggregate=TRUE, + all=all, geneColumn=geneColumn, - FDR_subset=fdr_subset[sampleID - %in% samples(tmp_x)], - fullSubset=fullSubset) - } - if(isTRUE(aggregate)){ - pvalsGene <- as.matrix(pVals(tmp_x, level="gene", - filters=list(rho=rhoCutoff))) - padjsGene <- as.matrix(padjVals(tmp_x, level="gene", - filters=list(rho=rhoCutoff))) - if(is.null(geneSubset)){ - aberrantGene <- aberrant.FRASER(tmp_x, type=type, - padjCutoff=fdrCutoff, - deltaPsiCutoff=dPsiCutoff, - minCount=minCount, - rhoCutoff=rhoCutoff, - aggregate=TRUE, - geneColumn=geneColumn) - } else{ - aberrantGene <- aberrant.FRASER(tmp_x, type=type, - padjCutoff=fdrCutoff, - deltaPsiCutoff=dPsiCutoff, - minCount=minCount, - rhoCutoff=rhoCutoff, - aggregate=TRUE, - geneColumn=geneColumn, - FDR_subset=fdr_subset[sampleID - %in% samples(tmp_x)], - fullSubset=fullSubset) - - # get row,col idx of genes/samples in subset - subset_gene_padj <- as.matrix( - fdr_subset[sampleID %in% samples(tmp_x), - .(as.numeric(sapply(gene, - function(g) which(rownames(pvalsGene) == g)) ), - as.numeric(sapply(sampleID, - function(s) which(colnames(tmp_x) ==s)) ), - pval_gene, - FDR_subset_gene)] - ) - # replace pvalsGene and padjsGene with values on subset - geneNames <- rownames(pvalsGene) - pvalsGene <- matrix(NA, - nrow=nrow(pvalsGene), ncol=ncol(tmp_x)) - rownames(pvalsGene) <- geneNames - colnames(pvalsGene) <- colnames(tmp_x) - padjsGene <- matrix(NA, - nrow=nrow(padjsGene), ncol=ncol(tmp_x)) - rownames(padjsGene) <- geneNames - colnames(padjsGene) <- colnames(tmp_x) - if(nrow(subset_gene_padj) > 0){ - pvalsGene[subset_gene_padj[,1:2]] <- - unlist(subset_gene_padj[,3]) - padjsGene[subset_gene_padj[,1:2]] <- - unlist(subset_gene_padj[,4]) - } - } - + subsetName=subsetName) } else{ pvalsGene <- NULL padjsGene <- NULL aberrantGene <- NULL } - if(!is.null(geneSubset)){ - # get row,col idx of introns/samples in subset - subset_padj <- as.matrix( - fdr_subset[sampleID %in% samples(tmp_x), .(jidx, - sapply(sampleID, - function(s) which(colnames(tmp_x) ==s)), - FDR_subset)] - ) - # replace padjs values with values on subset - padjs <- matrix(NA, nrow=nrow(pvals), ncol=ncol(pvals)) - if(nrow(subset_padj) > 0){ - padjs[subset_padj[,1:2]] <- subset_padj[,3] - } - colnames(padjs) <- colnames(pvals) - } - if(length(sc) == 1){ colnames(pvals) <- sc colnames(padjs) <- sc @@ -925,14 +852,10 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, # sort it if existing if(length(ans) > 0){ ans <- ans[order(ans$pValue)] - if(is.null(geneSubset)){ + if(is.null(subsetName)){ mcols(ans)[["FDR_set"]] <- "transcriptome-wide" } else{ - if(is.null(subsetName)){ - mcols(ans)[["FDR_set"]] <- "subset" - } else{ - mcols(ans)[["FDR_set"]] <- subsetName - } + mcols(ans)[["FDR_set"]] <- subsetName } } @@ -981,21 +904,14 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, #' types to return only one row per feature (gene) and sample. #' @param geneColumn The column name of the column that has the gene annotation #' that will be used for gene-level pvalue computation. -#' @param subsets A named list of named lists specifying any number of gene -#' subsets (can differ per sample). For each subset, FDR correction -#' will be limited to introns in genes in the subset, and aberrant -#' events passing the FDR cutoff will be reported for each subset -#' separately. See the examples for how to use this argument. -#' @param fullSubset Only applies when \code{geneSubset} is not NULL. Specifies -#' whether all introns in given subset of genes should be -#' considered as aberrant, or only those passing the given cutoffs. -#' Defaults to FALSE (introns have to pass the cutoffs in addtion -#' to being in the gene subset to be considered aberrant). -#' @param geneSubset A named list giving a subset of genes per sample to which -#' FDR correction should be restricted. The names of the list must -#' correspond to the sampleIDs in the fds object. -#' @param subsetName The name under which the resulting FDR corrected pvalues -#' on the subset only will be displayed in the result table. +#' @param all By default FALSE, only significant introns (or genes) are listed +#' in the results. If TRUE, results are assembled for all +#' samples and introns/genes regardless of significance. +#' @param returnTranscriptomewideResults If FDR corrected pvalues for subsets +#' of genes of interest have been calculated, this parameter +#' indicates whether additionally the transcriptome-wide results +#' should be returned as well (default), or whether only results +#' for those subsets should be retrieved. #' @param ... Further arguments can be passed to the method. If "n", #' "padjVals", "dPsi" or "rhoVals" are given, the values of those #' arguments are used to define the aberrant events. @@ -1012,8 +928,8 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, #' fds <- createTestFraserDataSet() #' #' # extract results: for this example dataset, no cutoffs are used to -#' # get at least one result and show the output -#' res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA) +#' # show the output of the results function +#' res <- results(fds, all=TRUE) #' res #' #' # aggregate the results by genes (gene symbols need to be annotated first @@ -1039,52 +955,49 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, #' # retrieve results limiting FDR correction to only a subset of genes #' # first, we need to create a list of genes per sample that will be tested #' geneList <- list('sample1'=c("TIMMDC1"), 'sample2'=c("MCOLN1")) -#' results(fds, subsets=list('random_subset'=geneList), fullSubset=TRUE) +#' fds <- calculatePadjValues(fds, type="jaccard", +#' subsets=list("exampleSubset"=geneList)) +#' results(fds, all=TRUE, returnTranscriptomewideResults=FALSE) #' -#' # results for several subsets can be retrieved at the same time: -#' geneList2 <- list('sample1'=c("MCOLN1", "TIMMDC1"), 'sample2'=c("MCOLN1")) -#' results(fds, -#' subsets=list('random_subset'=geneList, 'another_subset'=geneList2), -#' fullSubset=TRUE) #' @export setMethod("results", "FraserDataSet", function(object, sampleIDs=samples(object), padjCutoff=0.05, deltaPsiCutoff=0.1, rhoCutoff=NA, aggregate=FALSE, collapse=FALSE, minCount=5, psiType=psiTypes, - geneColumn="hgnc_symbol", - subsets=NULL, fullSubset=FALSE, + geneColumn="hgnc_symbol", all=FALSE, returnTranscriptomewideResults=TRUE, additionalColumns=NULL, BPPARAM=bpparam()){ psiType <- match.arg(psiType, several.ok=TRUE) - if(is.null(subsets)){ + FDRsets <- availableFDRsubsets(object) + + if(isFALSE(returnTranscriptomewideResults) && is.null(FDRsets)){ + warning("Retrieving transcriptome-wide results as no other ", + "FDR subsets are available in the fds object.") returnTranscriptomewideResults <- TRUE - } + } if(isTRUE(returnTranscriptomewideResults)){ res <- FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff, dPsiCutoff=deltaPsiCutoff, rhoCutoff=rhoCutoff, minCount=minCount, - psiType=psiType, + psiType=psiType, all=all, aggregate=aggregate, collapse=collapse, geneColumn=geneColumn, - geneSubset=NULL, subsetName=NULL, fullSubset=FALSE, - additionalColumns=additionalColumns, BPPARAM=BPPARAM) + subsetName=NULL, additionalColumns=additionalColumns, + BPPARAM=BPPARAM) } # add results for FDR_subsets if requested - if(!is.null(subsets)){ - stopifnot(is.list(subsets)) - stopifnot(!is.null(names(subsets))) - resls_subsets <- lapply(names(subsets), function(setName){ - geneList_sub <- subsets[[setName]] + if(!is.null(FDRsets)){ + resls_subsets <- lapply(FDRsets, function(setName){ res_sub <- FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff, dPsiCutoff=deltaPsiCutoff, rhoCutoff=rhoCutoff, minCount=minCount, - psiType=psiType, + psiType=psiType, all=all, aggregate=aggregate, collapse=collapse, geneColumn=geneColumn, - geneSubset=geneList_sub, subsetName=setName, - fullSubset=fullSubset, - additionalColumns=additionalColumns, BPPARAM=BPPARAM) + subsetName=setName, additionalColumns=additionalColumns, + BPPARAM=BPPARAM) }) + if(isTRUE(returnTranscriptomewideResults)){ res <- unlist(GRangesList(unlist(list(res, resls_subsets)))) } else{ @@ -1107,8 +1020,7 @@ aberrant.FRASER <- function(object, type=fitMetrics(object), minCount=5, rhoCutoff=NA, by=c("none", "sample", "feature"), aggregate=FALSE, geneColumn="hgnc_symbol", - geneSubset=NULL, subsetName=NULL, - fullSubset=FALSE, ...){ + subsetName=NULL, all=FALSE, ...){ checkNaAndRange(padjCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE) checkNaAndRange(deltaPsiCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE) @@ -1133,13 +1045,14 @@ aberrant.FRASER <- function(object, type=fitMetrics(object), # check if padj values are available for the given filters pvalsAvailable <- checkPadjAvailableForFilters(object, type=type, filters=list(rho=rhoCutoff), - aggregate=aggregate) + aggregate=aggregate, + subsetName=subsetName) if(isFALSE(pvalsAvailable)){ stop("For the given filters, pvalues are not yet computed. \n", "Please compute them first by running the ", "calculatePadjValues function with the requested filters.") } - padj <- padjVals(object, type=type, level="site", + padj <- padjVals(object, type=type, level="site", subsetName=subsetName, filters=list(rho=rhoCutoff)) } if("dPsi" %in% names(dots)){ @@ -1158,60 +1071,23 @@ aberrant.FRASER <- function(object, type=fitMetrics(object), padj_gene <- dots[['padjGeneVals']] } else{ padj_gene <- padjVals(object, type=type, level="gene", + subsetName=subsetName, filters=list(rho=rhoCutoff)) } } - if(is.na(padjCutoff) || (isTRUE(fullSubset) && - (!is.null(geneSubset) || ("FDR_subset" %in% names(dots))))){ + if(is.na(padjCutoff)){ padjCutoff <- 1 } - if(is.null(geneSubset) && !("FDR_subset" %in% names(dots))){ - aberrantEvents <- as.matrix(padj) <= padjCutoff + if(isTRUE(all)){ + aberrantEvents <- matrix(TRUE, nrow=nrow(object), ncol=ncol(object)) + colnames(aberrantEvents) <- colnames(fds) } else{ - if("FDR_subset" %in% names(dots)){ - fdr_subset <- dots[["FDR_subset"]] - } else{ - if(is.null(subsetName)){ - subsetName <- "subset" - } - object <- calculatePadjValuesOnSubset(fds=object, type=type, - genesToTest=geneSubset, - subsetName=subsetName, - geneColumn=geneColumn) - fdr_subset <- metadata(object)[[paste("FDR", subsetName, type, - sep="_")]] - } - + aberrantEvents <- as.matrix(padj) <= padjCutoff - # define aberrant status based on whether intron/sample tuples are - # part of the given subset - if(is.null(dim(padj))){ - # only one sample in fds present -> no dimensions set - aberrantEvents <- matrix(FALSE, nrow=length(padj), ncol=1) - } else{ - aberrantEvents <- matrix(FALSE, nrow=nrow(padj), ncol=ncol(padj)) - } - colnames(aberrantEvents) <- colnames(object) - FDR_col <- ifelse(isTRUE(aggregate), "FDR_subset_gene", "FDR_subset") - subset_idx <- lapply(fdr_subset[, unique(sampleID)], function(sid){ - col <- which(colnames(object) == sid) - rows <- fdr_subset[sampleID == sid & get(FDR_col) <= padjCutoff, - sort(unique(jidx))] - sub_idx <- matrix(c(rows, rep(col, length(rows))), - nrow=length(rows)) - return(sub_idx) - }) - subset_idx <- do.call(rbind, - subset_idx[which(sapply(subset_idx, nrow) > 0)]) - aberrantEvents[subset_idx] <- TRUE - } - - # check each cutoff if in use (not NA) - if((is.null(geneSubset) && !("FDR_subset" %in% names(dots))) || - isFALSE(fullSubset)){ + # check each cutoff if in use (not NA) if(!is.na(minCount)){ aberrantEvents <- aberrantEvents & as.matrix(n >= minCount) } @@ -1242,7 +1118,7 @@ aberrant.FRASER <- function(object, type=fitMetrics(object), aberrantEvents <- as.matrix(ab_dt[,-1]) rownames(aberrantEvents) <- ab_dt[,geneID] - if(is.null(geneSubset) && !("FDR_subset" %in% names(dots))){ + if(isFALSE(all)){ aberrantEvents <- aberrantEvents & as.matrix( padj_gene[rownames(aberrantEvents),colnames(aberrantEvents)] ) <= padjCutoff diff --git a/R/Fraser-pipeline.R b/R/Fraser-pipeline.R index 10f22c3a..92d0b492 100644 --- a/R/Fraser-pipeline.R +++ b/R/Fraser-pipeline.R @@ -72,11 +72,21 @@ #' fds <- calculateZscore(fds, type="jaccard") #' head(zScores(fds, type="jaccard")) #' -#' # To calculate the FDR only on a subset of genes of interest (per sample): -#' geneList <- list("sample1"=c("TIMMDC1"), "sample2"=c("MCOLN1")) -#' fds <- calculatePadjValuesOnSubset(fds, genesToTest=geneList, +#' # example of restricting FDR correction to subsets of genes of interest +#' genesOfInterest <- list("sample1"=c("TIMMDC1"), "sample2"=c("MCOLN1")) +#' fds <- calculatePadjValues(fds, type="jaccard", +#' subsets=list("exampleSubset"=genesOfInterest)) +#' padjVals(fds, type="jaccard", subsetName="exampleSubset") +#' padjVals(fds, type="jaccard", level="gene", subsetName="exampleSubset") +#' fds <- calculatePadjValues(fds, type="jaccard", +#' subsets=list("anotherExampleSubset"=c("TIMMDC1"))) +#' padjVals(fds, type="jaccard", subsetName="anotherExampleSubset") +#' +#' # only adding FDR corrected pvalues on a subset without calculating +#' # transcriptome-wide FDR again: +#' fds <- calculatePadjValuesOnSubset(fds, genesToTest=genesOfInterest, #' subsetName="setOfInterest", type="jaccard") -#' metadata(fds)[["FDR_setOfInterest"]] +#' padjVals(fds, type="jaccard", subsetName="setOfInterest") #' #' @seealso \code{\link[FRASER]{fit}} #' diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R index 0a03576f..00fb0055 100644 --- a/R/getNSetterFuns.R +++ b/R/getNSetterFuns.R @@ -327,11 +327,14 @@ pVals <- function(fds, type=currentType(fds), level="site", #' @describeIn getter_setter_functions This returns the adjusted p-values. #' @export padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"), - level="site", filters=list(), ...){ + level="site", subsetName=NULL, filters=list(), ...){ level <- match.arg(level, choices=c("site", "gene")) dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) aname <- paste0("padj", dist) aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname) + if(!is.null(subsetName)){ + aname <- paste0(aname, "_", subsetName) + } # add information on used filters if(is.null(names(filters))){ filters <- list(rho=1) @@ -358,11 +361,15 @@ padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"), } `padjVals<-` <- function(fds, type=currentType(fds), level="site", - dist="BetaBinomial", filters=list(), ..., value){ + dist="BetaBinomial", subsetName=NULL, filters=list(), ..., + value){ level <- match.arg(level, choices=c("site", "gene")) dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) aname <- paste0("padj", dist) aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname) + if(!is.null(subsetName)){ + aname <- paste0(aname, "_", subsetName) + } # add information on used filters for(n in sort(names(filters))){ aname <- paste0(aname, "_", n, filters[[n]]) @@ -378,6 +385,32 @@ padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"), return(fds) } +#' @describeIn getter_setter_functions This returns the names of FDR subsets +#' for which adjusted p values have been calculated. +#' @export +availableFDRsubsets <- function(fds){ + ans <- metadata(fds)[["FDRsubsets"]] + return(ans) +} + +`availableFDRsubsets<-` <- function(fds, value){ + metadata(fds)[["FDRsubsets"]] <- value + return(fds) +} + +`addToAvailableFDRsubsets<-` <- function(fds, value){ + if(!isScalarCharacter(value)){ + stop("The assigned value needs to be a scalar character.") + } + ans <- metadata(fds)[["FDRsubsets"]] + if(is.null(ans)){ + metadata(fds)[["FDRsubsets"]] <- value + } else{ + metadata(fds)[["FDRsubsets"]] <- unique(c(ans, value)) + } + return(fds) +} + #' @describeIn getter_setter_functions This returns the fitted mu (i.e. psi) #' values. #' @export diff --git a/R/helper-functions.R b/R/helper-functions.R index ae0dba66..03d8de3b 100644 --- a/R/helper-functions.R +++ b/R/helper-functions.R @@ -556,10 +556,13 @@ getStrandString <- function(fds){ #' Check if adjusted pvalues have been computed for a given set of filters. #' @noRd checkPadjAvailableForFilters <- function(fds, type=currentType(fds), - filters=list(), dist="BetaBinomial", aggregate=FALSE){ + filters=list(), dist="BetaBinomial", aggregate=FALSE, + subsetName=NULL){ dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) aname <- paste0("padj", dist) aname <- ifelse(isTRUE(aggregate), paste0(aname, "_gene"), aname) + aname <- ifelse(!is.null(subsetName), paste0(aname, "_", subsetName), aname) + # add information on used filters for(n in sort(names(filters))){ aname_new <- paste0(aname, "_", n, filters[[n]]) diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R index 94c9c151..2e403173 100644 --- a/R/pvalsNzscore.R +++ b/R/pvalsNzscore.R @@ -227,11 +227,18 @@ singlePvalueBinomial <- function(idx, k, n, mu){ #' should be calculated. Defaults to TRUE. #' @param geneColumn The column name of the column that has the gene annotation #' that will be used for gene-level pvalue computation. +#' @param subsets A named list of named lists specifying any number of gene +#' subsets (can differ per sample). For each subset, FDR correction +#' will be limited to genes in the subset, and the FDR corrected +#' pvalues stored as an assay in the fds object in addition to the +#' transcriptome-wide FDR corrected pvalues. See the examples for +#' how to use this argument. #' #' @export calculatePadjValues <- function(fds, type=currentType(fds), method="BY", rhoCutoff=NA, geneLevel=TRUE, - geneColumn="hgnc_symbol", BPPARAM=bpparam()){ + geneColumn="hgnc_symbol", subsets=NULL, + BPPARAM=bpparam()){ currentType(fds) <- type index <- getSiteIndex(fds, type=type) idx <- !duplicated(index) @@ -284,6 +291,21 @@ calculatePadjValues <- function(fds, type=currentType(fds), method="BY", "Please annotate gene symbols \nfirst using the ", "annotateRanges function.") } + + # calculate FDR for each provided subset and assign to fds + if(!is.null(subsets)){ + stopifnot(is.list(subsets)) + stopifnot(!is.null(names(subsets))) + for(setName in names(subsets)){ + geneListSubset <- subsets[[setName]] + fds <- calculatePadjValuesOnSubset(fds=fds, + genesToTest=geneListSubset, + subsetName=setName, + type=type, method=method, + geneColumn=geneColumn, + BPPARAM=BPPARAM) + } + } } return(fds) @@ -360,6 +382,13 @@ getSiteIndex <- function(fds, type=currentType(fds)){ getGeneIDs <- function(fds, type=currentType(fds), unique=TRUE, geneColumn="hgnc_symbol"){ + if(!geneColumn %in% colnames(mcols(fds, type=type))){ + stop("Did not find column '", geneColumn, "' in mcols(fds, type='", + type, "'). Please assign introns\nto genes first using the ", + "annotateRanges(fds, ...) or annotateRangesWithTxDb(fds, ...) ", + "function.") + } + geneIDs <- mcols(fds, type=type)[[geneColumn]] if(isTRUE(unique)){ geneIDs <- unique(unlist(lapply(geneIDs, FUN=function(g){ @@ -398,91 +427,129 @@ genePvalsByGeneID <- function(dt, samples, geneIDs, method, BPPARAM){ #' will be stored in metadata(fds). #' #' @export -calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds), - subsetName="subset", method="BY", +calculatePadjValuesOnSubset <- function(fds, genesToTest, subsetName, + type=currentType(fds), method='BY', geneColumn="hgnc_symbol", BPPARAM=bpparam()){ # check input - currentType(fds) <- type stopifnot(!is.null(genesToTest)) - stopifnot(is.list(genesToTest)) + stopifnot(is.list(genesToTest) || is.vector(genesToTest)) + + # replicate subset genes for each sample if given as vector + if(!is.list(genesToTest)){ + genesToTest <- rep(list(genesToTest), ncol(fds)) + names(genesToTest) <- colnames(fds) + } + + # check that names are present and correspond to samples in ods stopifnot(!is.null(names(genesToTest))) - if(!all(names(genesToTest) %in% samples(fds))){ + if(!all(names(genesToTest) %in% colnames(fds))){ stop("names(genesToTest) need to be sampleIDs in the given fds object.") } - # check if genes have been annotated - if(!geneColumn %in% colnames(mcols(fds, type=type))){ - stop(paste0("'", geneColumn, "' is not found in mcols(fds). ", - "Please annotate gene symbols \nfirst using the ", - "annotateRanges or annotateRangesWithTxDb function.")) - } + # get genes from fds object + fds_genes <- getGeneIDs(fds, unique=TRUE, type=type, geneColumn=geneColumn) + ngenes <- length(fds_genes) # site index (for psi3/5) site_idx <- getSiteIndex(fds, type=type) - + # compute FDR on the given subsets of genes message(date(), ": starting FDR calculation on subset of genes...") - FDR_subset <- rbindlist(bpmapply(names(genesToTest), genesToTest, - FUN=function(sample_id, genes_to_test_sample){ + + # compute FDR on the given subsets of genes + fdrSubset <- bplapply(colnames(fds), FUN=function(sampleId){ - # message(date(), ": FDR subset calculation for sample = ", sample_id) - # get idx of junctions corresponding to genes with var - jidx <- unlist(lapply(genes_to_test_sample, function(gene){ - idx <- which(grepl(paste0("(^|;)", gene, "(;|$)"), - mcols(fds, type=type)[, geneColumn])) - names(idx) <- rep(gene, length(idx)) - if(length(idx) == 0 && verbose(fds) > 0){ - warning("No introns found in fds object for gene: ", gene, - " and sample: ", sample_id, ". Skipping this gene.") - } - return(idx) - })) - jidx <- sort(jidx[!duplicated(jidx)]) + # get genes to test for this sample + genesToTestSample <- genesToTest[[sampleId]] + padj <- rep(NA, nrow(fds)) + padj_gene <- rep(NA, ngenes) + + # if no genes present in the subset for this sample, return NAs + if(is.null(genesToTestSample)){ + return(list(padj=padj, padj_gene=padj_gene)) + } - # check that jidx is not empty vector - if(length(jidx) == 0){ - warning("No introns found in the fds object for the given gene ", - "subset for sample: ", sample_id) - return(data.table(gene=character(0), - sampleID=character(0), - type=character(0), - pval=numeric(0), - FDR_subset=numeric(0), - jidx=integer(0), - pval_gene=numeric(0), - FDR_subset_gene=numeric(0))) + # get idx of junctions corresponding to genes to test + if(is.character(genesToTestSample)){ + rowIdx <- sort(which(fds_genes %in% genesToTestSample)) + rowIdx <- unlist(lapply(genesToTestSample, function(gene){ + idx <- which(grepl(paste0("(^|;)", gene, "(;|$)"), + mcols(fds, type=type)[, geneColumn])) + names(idx) <- rep(gene, length(idx)) + if(length(idx) == 0 && verbose(fds) > 0){ + warning("No introns found in fds object for gene: ", gene, + " and sample: ", sampleId, ". Skipping this gene.") + } + return(idx) + })) + rowIdx <- sort(rowIdx[!duplicated(rowIdx)]) + } else{ + stop("Genes in the list to test must be a character vector ", + "of geneIDs.") } - # retrieve pvalues of junctions + # check that rowIdx is not empty vector + if(length(rowIdx) == 0){ + warning("No genes from the given subset found in the fds ", + "object for sample: ", sampleId) + return(list(padj=padj, padj_gene=padj_gene)) + } + + + + # retrieve pvalues of introns to test p <- as.matrix(pVals(fds, type=type)) if(ncol(p) == 1){ colnames(p) <- colnames(fds) } - p <- p[jidx, sample_id] + p <- p[rowIdx, sampleId] - # FDR correction - pa <- p.adjust(p[!duplicated(site_idx[jidx])], method=method) + # FDR correction on subset + non_dup_site_idx <- !duplicated(site_idx[rowIdx]) + padjSub <- p.adjust(p[non_dup_site_idx], method=method) + + # set intron FDR on subset (filled with NA for all other genes) + padj[rowIdx] <- padjSub # gene level pvals - dt <- data.table(sampleID=sample_id, type=type, pval=p, - gene=names(jidx), jidx=jidx, site_idx=site_idx[jidx]) - dt <- merge(dt, data.table(site_idx=site_idx[jidx][!duplicated(site_idx[jidx])], FDR_subset=pa), by="site_idx") - dt[!duplicated(dt$site_idx), pval_gene:=min(p.adjust(pval, method="holm")), by="gene"] + dt <- data.table(sampleID=sampleId, type=type, pval=p, + gene=names(rowIdx), jidx=rowIdx, site_idx=site_idx[rowIdx]) + dt <- merge(dt, + data.table(site_idx=site_idx[rowIdx][non_dup_site_idx], + FDR_subset=padjSub), + by="site_idx") + dt[!duplicated(dt$site_idx), + pval_gene:=min(p.adjust(pval, method="holm")), by="gene"] dt[, pval_gene := .SD[!is.na(pval_gene), unique(pval_gene)], by="gene"] # gene level FDR dt2 <- dt[, unique(pval_gene), by="gene"] dt2[, FDR_subset_gene := p.adjust(V1, method=method)] - dt <- merge(dt, dt2[, .(gene, FDR_subset_gene)], by="gene", all.x=TRUE) + dt2[, gene_rowIdx := sapply(gene, function(g) which(fds_genes == g))] + + # set intron FDR on subset (filled with NA for all other genes) + padj_gene[dt2[,gene_rowIdx]] <- dt2[, FDR_subset_gene] # return new FDR - return(dt) - }, SIMPLIFY=FALSE, BPPARAM=BPPARAM)) - message(date(), ": finished FDR calculation on subset of genes.") + return(list(padj=padj, padj_gene=padj_gene)) + + }, BPPARAM=BPPARAM) + padjSub <- vapply(fdrSubset, '[[', double(nrow(fds)), 'padj') + padjSub_gene <- vapply(fdrSubset, '[[', double(ngenes), 'padj_gene') - # add FDR subset info to fds object and return - metadata(fds)[[paste("FDR", subsetName, type, sep="_")]] <- FDR_subset + rownames(padjSub) <- rownames(fds) + colnames(padjSub) <- colnames(fds) + rownames(padjSub_gene) <- fds_genes + colnames(padjSub_gene) <- colnames(fds) + + # add FDR subset info to ods object and return + padjVals(fds, type=type, level="site", subsetName=subsetName) <- padjSub + padjVals(fds, type=type, level="gene", subsetName=subsetName) <- padjSub_gene + addToAvailableFDRsubsets(fds) <- subsetName + + message(date(), ": finished FDR calculation on subset of genes.") + validObject(fds) return(fds) } diff --git a/man/FRASER.Rd b/man/FRASER.Rd index 13210477..c4848052 100644 --- a/man/FRASER.Rd +++ b/man/FRASER.Rd @@ -37,14 +37,15 @@ calculatePadjValues( rhoCutoff = NA, geneLevel = TRUE, geneColumn = "hgnc_symbol", + subsets = NULL, BPPARAM = bpparam() ) calculatePadjValuesOnSubset( fds, genesToTest, + subsetName, type = currentType(fds), - subsetName = "subset", method = "BY", geneColumn = "hgnc_symbol", BPPARAM = bpparam() @@ -95,6 +96,13 @@ should be calculated. Defaults to TRUE.} \item{geneColumn}{The column name of the column that has the gene annotation that will be used for gene-level pvalue computation.} +\item{subsets}{A named list of named lists specifying any number of gene +subsets (can differ per sample). For each subset, FDR correction +will be limited to genes in the subset, and the FDR corrected +pvalues stored as an assay in the fds object in addition to the +transcriptome-wide FDR corrected pvalues. See the examples for +how to use this argument.} + \item{genesToTest}{A named list with the subset of genes to test per sample. The names must correspond to the sampleIDs in the given fds object.} @@ -131,33 +139,33 @@ raw counts ("BB"). } \section{Functions}{ \itemize{ -\item \code{FRASER()}: This function runs the default FRASER pipeline combining +\item \code{FRASER}: This function runs the default FRASER pipeline combining the beta-binomial fit, the computation of Z scores and p values as well as the computation of delta-PSI values. -\item \code{calculateZscore()}: This function calculates z-scores based on the +\item \code{calculateZscore}: This function calculates z-scores based on the observed and expected logit psi. -\item \code{calculatePvalues()}: This function calculates two-sided p-values based on +\item \code{calculatePvalues}: This function calculates two-sided p-values based on the beta-binomial distribution (or binomial or normal if desired). The returned p values are not yet adjusted with Holm's method per donor or acceptor site, respectively. -\item \code{calculatePadjValues()}: This function adjusts the previously calculated +\item \code{calculatePadjValues}: This function adjusts the previously calculated p-values per sample for multiple testing. First, the previoulsy calculated junction-level p values are adjusted with Holm's method per donor or acceptor site, respectively. Then, if gene symbols have been annotated to junctions (and not otherwise requested), gene-level p values are computed. -\item \code{calculatePadjValuesOnSubset()}: This function does FDR correction only for all junctions +\item \code{calculatePadjValuesOnSubset}: This function does FDR correction only for all junctions in a certain subset of genes which can differ per sample. Requires gene symbols to have been annotated to junctions. As with the full FDR correction across all junctions, first the previously calculated junction-level p values are adjusted with Holm's method per donor or acceptor site, respectively. Then, gene-level p values are computed. - }} + \examples{ # set default parallel backend register(SerialParam()) @@ -190,11 +198,21 @@ head(padjVals(fds, type="jaccard")) fds <- calculateZscore(fds, type="jaccard") head(zScores(fds, type="jaccard")) -# To calculate the FDR only on a subset of genes of interest (per sample): -geneList <- list("sample1"=c("TIMMDC1"), "sample2"=c("MCOLN1")) -fds <- calculatePadjValuesOnSubset(fds, genesToTest=geneList, +# example of restricting FDR correction to subsets of genes of interest +genesOfInterest <- list("sample1"=c("TIMMDC1"), "sample2"=c("MCOLN1")) +fds <- calculatePadjValues(fds, type="jaccard", + subsets=list("exampleSubset"=genesOfInterest)) +padjVals(fds, type="jaccard", subsetName="exampleSubset") +padjVals(fds, type="jaccard", level="gene", subsetName="exampleSubset") +fds <- calculatePadjValues(fds, type="jaccard", + subsets=list("anotherExampleSubset"=c("TIMMDC1"))) +padjVals(fds, type="jaccard", subsetName="anotherExampleSubset") + +# only adding FDR corrected pvalues on a subset without calculating +# transcriptome-wide FDR again: +fds <- calculatePadjValuesOnSubset(fds, genesToTest=genesOfInterest, subsetName="setOfInterest", type="jaccard") -metadata(fds)[["FDR_setOfInterest"]] +padjVals(fds, type="jaccard", subsetName="setOfInterest") } \seealso{ diff --git a/man/countRNA.Rd b/man/countRNA.Rd index 440231fe..7940b91b 100644 --- a/man/countRNA.Rd +++ b/man/countRNA.Rd @@ -245,32 +245,32 @@ a sample are set to zero. } \section{Functions}{ \itemize{ -\item \code{countRNAData()}: This method extracts and counts the split reads and +\item \code{countRNAData}: This method extracts and counts the split reads and non spliced reads from RNA bam files. -\item \code{getSplitReadCountsForAllSamples()}: This method creates a GRanges +\item \code{getSplitReadCountsForAllSamples}: This method creates a GRanges object containing the split read counts from all specified samples. -\item \code{getNonSplitReadCountsForAllSamples()}: This method creates a GRanges +\item \code{getNonSplitReadCountsForAllSamples}: This method creates a GRanges object containing the non split read counts at the exon-intron boundaries inferred from the GRanges object containing the positions of all the introns in this dataset. -\item \code{addCountsToFraserDataSet()}: This method adds the split read and +\item \code{addCountsToFraserDataSet}: This method adds the split read and non split read counts to a existing FraserDataSet containing the settings. -\item \code{countSplitReads()}: This method counts all split reads in a +\item \code{countSplitReads}: This method counts all split reads in a bam file for a single sample. -\item \code{mergeCounts()}: This method merges counts for multiple +\item \code{mergeCounts}: This method merges counts for multiple samples into one SummarizedExperiment object. -\item \code{countNonSplicedReads()}: This method counts non spliced reads based +\item \code{countNonSplicedReads}: This method counts non spliced reads based on the given target (acceptor/donor) regions for a single sample. - }} + \examples{ # On Windows SNOW is the default for the parallele backend, which can be # very slow for many but small tasks. Therefore, we will use diff --git a/man/filtering.Rd b/man/filtering.Rd index 226df199..b99936dc 100644 --- a/man/filtering.Rd +++ b/man/filtering.Rd @@ -84,16 +84,16 @@ reliably detected and to remove introns with no variablity between samples. } \section{Functions}{ \itemize{ -\item \code{filterExpressionAndVariability()}: This functions filters out both introns with low +\item \code{filterExpressionAndVariability}: This functions filters out both introns with low read support and introns that are not variable across samples. -\item \code{filterExpression(FraserDataSet)}: This function filters out introns and corresponding +\item \code{filterExpression,FraserDataSet-method}: This function filters out introns and corresponding splice sites that have low read support in all samples. -\item \code{filterVariability(FraserDataSet)}: This function filters out introns and corresponding +\item \code{filterVariability,FraserDataSet-method}: This function filters out introns and corresponding splice sites that have low read support in all samples. - }} + \examples{ fds <- createTestFraserDataSet() fds <- filterExpressionAndVariability(fds, minDeltaPsi=0.1, filter=FALSE) diff --git a/man/getter_setter_functions.Rd b/man/getter_setter_functions.Rd index 3f25c8b6..9b10f832 100644 --- a/man/getter_setter_functions.Rd +++ b/man/getter_setter_functions.Rd @@ -15,6 +15,7 @@ \alias{zScores} \alias{pVals} \alias{padjVals} +\alias{availableFDRsubsets} \alias{predictedMeans} \alias{deltaPsiValue} \alias{currentType} @@ -51,10 +52,13 @@ padjVals( type = currentType(fds), dist = c("BetaBinomial"), level = "site", + subsetName = NULL, filters = list(), ... ) +availableFDRsubsets(fds) + predictedMeans(fds, type = currentType(fds)) deltaPsiValue(fds, type = currentType(fds)) @@ -115,70 +119,73 @@ the values within the FRASER model. } \section{Functions}{ \itemize{ -\item \code{featureExclusionMask()}: Retrieves a logical vector indicating +\item \code{featureExclusionMask}: Retrieves a logical vector indicating for each junction whether it is included or excluded during the fitting procedure. -\item \code{featureExclusionMask(fds, type = currentType(fds)) <- value}: To remove certain junctions from +\item \code{featureExclusionMask<-}: To remove certain junctions from being used in the train step of the encoding dimension we can set the \code{featureExclusion} vector to \code{FALSE}. This can be helpfull if we have local linkage between features which we do not want to model by the autoencoder. -\item \code{rho()}: Returns the fitted rho values for the +\item \code{rho}: Returns the fitted rho values for the beta-binomial distribution -\item \code{zScores()}: This returns the calculated z-scores. +\item \code{zScores}: This returns the calculated z-scores. + +\item \code{pVals}: This returns the calculated p-values. -\item \code{pVals()}: This returns the calculated p-values. +\item \code{padjVals}: This returns the adjusted p-values. -\item \code{padjVals()}: This returns the adjusted p-values. +\item \code{availableFDRsubsets}: This returns the names of FDR subsets +for which adjusted p values have been calculated. -\item \code{predictedMeans()}: This returns the fitted mu (i.e. psi) +\item \code{predictedMeans}: This returns the fitted mu (i.e. psi) values. -\item \code{deltaPsiValue()}: Returns the difference between the +\item \code{deltaPsiValue}: Returns the difference between the observed and the fitted psi values. -\item \code{currentType()}: Returns the psi type that is used +\item \code{currentType}: Returns the psi type that is used within several methods in the FRASER package (defaults to jaccard). -\item \code{currentType(fds) <- value}: Sets the psi type that is to be used +\item \code{currentType<-}: Sets the psi type that is to be used within several methods in the FRASER package. -\item \code{fitMetrics()}: Returns the splice metrics that will be +\item \code{fitMetrics}: Returns the splice metrics that will be fitted (defaults to jaccard, used within several methods in the FRASER package). -\item \code{fitMetrics(fds) <- value}: Sets the splice metrics that will be +\item \code{fitMetrics<-}: Sets the splice metrics that will be fitted (used within several methods in the FRASER package). -\item \code{pseudocount()}: Sets and returns the pseudo count used +\item \code{pseudocount}: Sets and returns the pseudo count used within the FRASER fitting procedure. -\item \code{hyperParams()}: This returns the results of the +\item \code{hyperParams}: This returns the results of the hyperparameter optimization NULL if the hyperparameter opimization was not run yet. -\item \code{bestQ()}: This returns the optimal size of the +\item \code{bestQ}: This returns the optimal size of the latent space according to the hyperparameter optimization or a simple estimate of about a tenth of the number of samples if the hyperparameter opimization was not run yet. -\item \code{dontWriteHDF5()}: Gets the current value of whether the +\item \code{dontWriteHDF5}: Gets the current value of whether the assays should be stored as hdf5 files. -\item \code{dontWriteHDF5(fds) <- value}: Sets whether the assays should be stored +\item \code{dontWriteHDF5<-}: Sets whether the assays should be stored as hdf5 files. -\item \code{verbose()}: Dependent on the level of verbosity +\item \code{verbose}: Dependent on the level of verbosity the algorithm reports more or less to the user. 0 means being quiet and 10 means everything. -\item \code{verbose(fds) <- value}: Sets the verbosity level to a value +\item \code{verbose<-}: Sets the verbosity level to a value between 0 and 10. 0 means being quiet and 10 means reporting everything. - }} + \examples{ fds <- createTestFraserDataSet() diff --git a/man/potentialImpactAnnotations.Rd b/man/potentialImpactAnnotations.Rd index a06e304a..282a6d09 100644 --- a/man/potentialImpactAnnotations.Rd +++ b/man/potentialImpactAnnotations.Rd @@ -93,20 +93,20 @@ These functions work on the result table and add additional } \section{Functions}{ \itemize{ -\item \code{annotateIntronReferenceOverlap()}: This method calculates basic annotations +\item \code{annotateIntronReferenceOverlap}: This method calculates basic annotations based on overlap with the reference annotation (start, end, none, both) for the full fds. The overlap type is added as a new column \code{annotatedJunction} in \code{mcols(fds)}. -\item \code{annotatePotentialImpact()}: This method annotates the splice event +\item \code{annotatePotentialImpact}: This method annotates the splice event type to junctions in the given results table. -\item \code{flagBlacklistRegions()}: This method flags all introns and +\item \code{flagBlacklistRegions}: This method flags all introns and splice sites in the given results table for which at least one splice site (donor or acceptor) is located in a blacklist region. Blacklist regions of the genome are determined from the provided BED file. - }} + \examples{ # get data, fit and compute p-values and z-scores fds <- createTestFraserDataSet() diff --git a/man/results.Rd b/man/results.Rd index 03b67b36..ceeb9fc8 100644 --- a/man/results.Rd +++ b/man/results.Rd @@ -16,8 +16,7 @@ minCount = 5, psiType = psiTypes, geneColumn = "hgnc_symbol", - subsets = NULL, - fullSubset = FALSE, + all = FALSE, returnTranscriptomewideResults = TRUE, additionalColumns = NULL, BPPARAM = bpparam() @@ -33,9 +32,8 @@ by = c("none", "sample", "feature"), aggregate = FALSE, geneColumn = "hgnc_symbol", - geneSubset = NULL, subsetName = NULL, - fullSubset = FALSE, + all = FALSE, ... ) } @@ -69,17 +67,15 @@ result} \item{geneColumn}{The column name of the column that has the gene annotation that will be used for gene-level pvalue computation.} -\item{subsets}{A named list of named lists specifying any number of gene -subsets (can differ per sample). For each subset, FDR correction -will be limited to introns in genes in the subset, and aberrant -events passing the FDR cutoff will be reported for each subset -separately. See the examples for how to use this argument.} +\item{all}{By default FALSE, only significant introns (or genes) are listed +in the results. If TRUE, results are assembled for all +samples and introns/genes regardless of significance.} -\item{fullSubset}{Only applies when \code{geneSubset} is not NULL. Specifies -whether all introns in given subset of genes should be -considered as aberrant, or only those passing the given cutoffs. -Defaults to FALSE (introns have to pass the cutoffs in addtion -to being in the gene subset to be considered aberrant).} +\item{returnTranscriptomewideResults}{If FDR corrected pvalues for subsets +of genes of interest have been calculated, this parameter +indicates whether additionally the transcriptome-wide results +should be returned as well (default), or whether only results +for those subsets should be retrieved.} \item{additionalColumns}{Character vector containing the names of additional columns from mcols(fds) that should appear in the result table @@ -94,13 +90,6 @@ are included.} \code{sample} or \code{feature} is specified the sum by sample or feature is returned} -\item{geneSubset}{A named list giving a subset of genes per sample to which -FDR correction should be restricted. The names of the list must -correspond to the sampleIDs in the fds object.} - -\item{subsetName}{The name under which the resulting FDR corrected pvalues -on the subset only will be displayed in the result table.} - \item{...}{Further arguments can be passed to the method. If "n", "padjVals", "dPsi" or "rhoVals" are given, the values of those arguments are used to define the aberrant events.} @@ -122,8 +111,8 @@ aberrant splicing events based on the given cutoffs. fds <- createTestFraserDataSet() # extract results: for this example dataset, no cutoffs are used to -# get at least one result and show the output -res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA) +# show the output of the results function +res <- results(fds, all=TRUE) res # aggregate the results by genes (gene symbols need to be annotated first @@ -149,11 +138,8 @@ aberrant(fds, type="jaccard") # retrieve results limiting FDR correction to only a subset of genes # first, we need to create a list of genes per sample that will be tested geneList <- list('sample1'=c("TIMMDC1"), 'sample2'=c("MCOLN1")) -results(fds, subsets=list('random_subset'=geneList), fullSubset=TRUE) +fds <- calculatePadjValues(fds, type="jaccard", + subsets=list("exampleSubset"=geneList)) +results(fds, all=TRUE, returnTranscriptomewideResults=FALSE) -# results for several subsets can be retrieved at the same time: -geneList2 <- list('sample1'=c("MCOLN1", "TIMMDC1"), 'sample2'=c("MCOLN1")) -results(fds, - subsets=list('random_subset'=geneList, 'another_subset'=geneList2), - fullSubset=TRUE) } diff --git a/tests/testthat/test_stats.R b/tests/testthat/test_stats.R index 79f737d1..b1eedb8c 100644 --- a/tests/testthat/test_stats.R +++ b/tests/testthat/test_stats.R @@ -93,14 +93,18 @@ test_that("FDR on subset of genes", { "sample2" = c("geneB"), "sample3" = c("geneA", "geneB", "geneC", "geneD") ) - expected_output_nrows <- (8 + 5 + 3) + (7) + (3+7+5+4) subsetName <- "subset_test" fds <- calculatePadjValuesOnSubset(fds, genesToTest=genes_per_sample, subsetName=subsetName, type="jaccard") - subset_dt <- metadata(fds)[[paste("FDR", subsetName, "jaccard", sep="_")]] - expect_true(is(subset_dt, "data.table")) - expect_true(all(c("FDR_subset", "FDR_subset_gene") %in% colnames(subset_dt))) - expect_equal(subset_dt[, .N], expected_output_nrows) + subset_padj <- padjVals(fds, type="jaccard", subsetName=subsetName) + expect_true(is(subset_padj, "matrix")) + expect_true(nrow(subset_padj) == 27) + expect_true(ncol(subset_padj) == 3) + subset_padj_gene <- padjVals(fds, type="jaccard", level="gene", + subsetName=subsetName) + expect_true(is(subset_padj_gene, "matrix")) + expect_true(nrow(subset_padj_gene) == 5) + expect_true(ncol(subset_padj_gene) == 3) }) diff --git a/vignettes/FRASER.Rnw b/vignettes/FRASER.Rnw index 99df108d..3d54ac21 100644 --- a/vignettes/FRASER.Rnw +++ b/vignettes/FRASER.Rnw @@ -175,7 +175,7 @@ intron-exon boundary of acceptor A. While we calculate $\theta$ for the 5' and between $\theta_5$ and $\theta_3$ and hence call it jointly $\theta$ in the following. -From \fraser{}2 on, only a single metric - the Intron Jaccard Index (Figure +From \fraser{} 2.0 on, only a single metric - the Intron Jaccard Index (Figure \ref{IntronJaccardIndex_sketch}) - is used by default. The Intron Jaccard Index is more robust and allows to focus more on functionally relevant aberrant splicing events. It allows to detect all types of aberrant splicing @@ -248,7 +248,7 @@ fds <- FRASER(fds, q=c(jaccard=2)) # get results: we recommend to use an FDR cutoff 0.05, but due to the small # dataset size we extract all events and their associated values # eg: res <- results(fds, padjCutoff=0.05, deltaPsiCutoff=0.1) -res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA) +res <- results(fds, all=TRUE) res # result visualization @@ -586,8 +586,8 @@ sashimi plot for an outlier in the results table directly in R (if paths to bam files are available in the \fds{} object). <>= -# to show result visualization functions for this tuturial, no cutoff used -res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA) +# to show result visualization functions for this tutorial, no cutoff used +res <- results(fds, all=TRUE) res # for the gene level pvalues, gene symbols need to be annotated the fds object @@ -595,7 +595,7 @@ res # as we previously called FRASER() before annotating genes, we run it again here fds <- calculatePadjValues(fds, type="jaccard", geneLevel=TRUE) # generate gene-level results table (if gene symbols have been annotated) -res_gene <- results(fds, aggregate=TRUE, padjCutoff=NA, deltaPsiCutoff=NA) +res_gene <- results(fds, aggregate=TRUE, all=TRUE) res_gene @ @@ -735,6 +735,19 @@ fds <- calculatePadjValues(fds, type="jaccard", method="BY") head(padjVals(fds,type="jaccard")) @ +With FRASER 2.0 we introduce the option to limit FDR correction to a subset of +genes based on prior knowledge, e.g. genes that contain a rare variant per +sample. To use this option, provide a list of genes per sample during FDR +computation: + +<>= +genesOfInterest <- list("sample1"=c("XAB2", "PNPLA6", "STXBP2", "ARHGEF18"), + "sample2"=c("ARHGEF18", "TRAPPC5")) +fds <- calculatePadjValues(fds, type="jaccard", + subsets=list("exampleSubset"=genesOfInterest)) +head(padjVals(fds, type="jaccard", subsetName="exampleSubset")) +@ + \subsection{Result visualization} \label{sec:result-vis} From 4e3e32305c53f1344570cee91982b9736487723e Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Wed, 29 Mar 2023 18:04:01 +0200 Subject: [PATCH 74/80] small fix --- R/pvalsNzscore.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R index 2e403173..a4bf89be 100644 --- a/R/pvalsNzscore.R +++ b/R/pvalsNzscore.R @@ -544,8 +544,10 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, subsetName, colnames(padjSub_gene) <- colnames(fds) # add FDR subset info to ods object and return - padjVals(fds, type=type, level="site", subsetName=subsetName) <- padjSub - padjVals(fds, type=type, level="gene", subsetName=subsetName) <- padjSub_gene + padjVals(fds, type=type, level="site", subsetName=subsetName, + withDimnames=FALSE) <- padjSub + padjVals(fds, type=type, level="gene", subsetName=subsetName, + withDimnames=FALSE) <- padjSub_gene addToAvailableFDRsubsets(fds) <- subsetName message(date(), ": finished FDR calculation on subset of genes.") From 89983e5e10d3bdf87dde9f81e42dca88af05e0aa Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Wed, 29 Mar 2023 18:47:12 +0200 Subject: [PATCH 75/80] fix fdr on subset for theta --- R/pvalsNzscore.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R index a4bf89be..be256009 100644 --- a/R/pvalsNzscore.R +++ b/R/pvalsNzscore.R @@ -462,7 +462,7 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, subsetName, # get genes to test for this sample genesToTestSample <- genesToTest[[sampleId]] - padj <- rep(NA, nrow(fds)) + padj <- rep(NA, nrow(mcols(fds, type=type))) padj_gene <- rep(NA, ngenes) # if no genes present in the subset for this sample, return NAs @@ -535,10 +535,10 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, subsetName, return(list(padj=padj, padj_gene=padj_gene)) }, BPPARAM=BPPARAM) - padjSub <- vapply(fdrSubset, '[[', double(nrow(fds)), 'padj') + padjSub <- vapply(fdrSubset, '[[', + double(nrow(mcols(fds, type=type))), 'padj') padjSub_gene <- vapply(fdrSubset, '[[', double(ngenes), 'padj_gene') - rownames(padjSub) <- rownames(fds) colnames(padjSub) <- colnames(fds) rownames(padjSub_gene) <- fds_genes colnames(padjSub_gene) <- colnames(fds) From 54822b92805862b675d60a8721273773f9b0d223 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Wed, 29 Mar 2023 21:04:55 +0200 Subject: [PATCH 76/80] fix vignette error --- vignettes/FRASER.Rnw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/FRASER.Rnw b/vignettes/FRASER.Rnw index 3d54ac21..70e80f21 100644 --- a/vignettes/FRASER.Rnw +++ b/vignettes/FRASER.Rnw @@ -740,7 +740,7 @@ genes based on prior knowledge, e.g. genes that contain a rare variant per sample. To use this option, provide a list of genes per sample during FDR computation: -<>= +<>= genesOfInterest <- list("sample1"=c("XAB2", "PNPLA6", "STXBP2", "ARHGEF18"), "sample2"=c("ARHGEF18", "TRAPPC5")) fds <- calculatePadjValues(fds, type="jaccard", From 10deeab7240e2aed22d29f7d7e6cc434e417504d Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Thu, 30 Mar 2023 16:23:54 +0200 Subject: [PATCH 77/80] add FRASER 2.0 to readme --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 8e9443aa..5c15f95f 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,13 @@ Please cite our method paper if you use it in a publication: > Mertes, C., Scheller, I.F., YĆ©pez, V.A. *et al.* Detection of aberrant splicing events in RNA-seq data using FRASER. *Nat Commun* **12**, 529 (2021). https://doi.org/10.1038/s41467-020-20573-7 +## What's new + +FRASER 2.0, an improved version of FRASER that uses the Intron Jaccard Index as +its splice metric instead of FRASER's previous three metrics, is now available +and used by default (version 1.99.0 and above). The manuscript describing these +changes in more detail will be available soon. + ## Installation `FRASER` is an R/Bioconductor software package requiring a running From adf499cc2bd3e7c9e0a346e352a10be7f8e336b7 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Thu, 30 Mar 2023 17:12:09 +0200 Subject: [PATCH 78/80] have new delta and fdr cutoffs consistently --- R/AllGenerics.R | 4 ++-- R/plotMethods.R | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 7e1bf696..05eb80d1 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -961,7 +961,7 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, #' #' @export setMethod("results", "FraserDataSet", function(object, - sampleIDs=samples(object), padjCutoff=0.05, + sampleIDs=samples(object), padjCutoff=0.1, deltaPsiCutoff=0.1, rhoCutoff=NA, aggregate=FALSE, collapse=FALSE, minCount=5, psiType=psiTypes, @@ -1016,7 +1016,7 @@ setMethod("results", "FraserDataSet", function(object, }) aberrant.FRASER <- function(object, type=fitMetrics(object), - padjCutoff=0.05, deltaPsiCutoff=0.1, + padjCutoff=0.1, deltaPsiCutoff=0.1, minCount=5, rhoCutoff=NA, by=c("none", "sample", "feature"), aggregate=FALSE, geneColumn="hgnc_symbol", diff --git a/R/plotMethods.R b/R/plotMethods.R index 7609939c..6f7a9a22 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -1568,7 +1568,7 @@ plotManhattan.FRASER <- function(object, sampleID, value="pvalue", stopifnot(sampleID %in% samples(object)) type <- match.arg(type) additional_args <- list(...) - padjCutoff <- 0.05 + padjCutoff <- 0.1 if("padjCutoff" %in% names(additional_args)){ padjCutoff <- additional_args$padjCutoff } From 6ab5bd7e797e3a31eb84ac45d3f8e5f564828c46 Mon Sep 17 00:00:00 2001 From: Ines Scheller Date: Thu, 30 Mar 2023 17:12:40 +0200 Subject: [PATCH 79/80] readme update --- README.md | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5c15f95f..21de3e75 100644 --- a/README.md +++ b/README.md @@ -16,9 +16,16 @@ Please cite our method paper if you use it in a publication: ## What's new FRASER 2.0, an improved version of FRASER that uses the Intron Jaccard Index as -its splice metric instead of FRASER's previous three metrics, is now available -and used by default (version 1.99.0 and above). The manuscript describing these -changes in more detail will be available soon. +its splice metric instead of FRASER's previous three metrics along with some other +parameter optimizations of pseudocount, filtering settings and default delta cutoff, +is now available and used by default (version 1.99.0 and above). +To change the splice metric, set `fitMetrics(fds)` to one or more of the metrics +specified in `FRASER::psiTypes`. For FRASER 2.0 and the Intron Jaccard Index, the +new default delta cutoff is 0.1 instead of the previous value of 0.3. When using +the 3 previous metrics, the delta cutoff should therefore be manually set to 0.3 +during results extraction, e.g. `results(fds, deltaPsiCutoff=0.3,...)`. + +The manuscript describing these changes in more detail will be available soon. ## Installation From 8251d53290ef7dcf7dfdc5027c86396dd6015c9e Mon Sep 17 00:00:00 2001 From: Vicente Yepez <30469316+vyepez88@users.noreply.github.com> Date: Thu, 30 Mar 2023 17:29:45 +0200 Subject: [PATCH 80/80] Update README.md --- README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 21de3e75..ef17f68a 100644 --- a/README.md +++ b/README.md @@ -15,14 +15,15 @@ Please cite our method paper if you use it in a publication: ## What's new -FRASER 2.0, an improved version of FRASER that uses the Intron Jaccard Index as -its splice metric instead of FRASER's previous three metrics along with some other -parameter optimizations of pseudocount, filtering settings and default delta cutoff, -is now available and used by default (version 1.99.0 and above). +FRASER 2.0, an improved version of FRASER, is now available and used by default (version 1.99.0 and above). +FRASER 2.0 uses the Intron Jaccard Index as its splice metric instead of FRASER's +previous three metrics along with some other parameter optimizations of pseudocount, +filtering settings and default delta cutoff. + To change the splice metric, set `fitMetrics(fds)` to one or more of the metrics specified in `FRASER::psiTypes`. For FRASER 2.0 and the Intron Jaccard Index, the new default delta cutoff is 0.1 instead of the previous value of 0.3. When using -the 3 previous metrics, the delta cutoff should therefore be manually set to 0.3 +the 3 previous metrics, the delta cutoff should be set manually to 0.3 during results extraction, e.g. `results(fds, deltaPsiCutoff=0.3,...)`. The manuscript describing these changes in more detail will be available soon.