From 8107537e6793c284a85fe7e6f2ef2e6d2fbf78eb Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Tue, 12 Oct 2021 09:32:10 +0200
Subject: [PATCH 01/80] include filters in pvalue calculation & cleanup

---
 R/AllGenerics.R       | 352 ++++++++++++++++++++++--------------------
 R/countRNAseqData.R   |  23 +--
 R/example_functions.R |   6 +-
 R/getNSetterFuns.R    |  91 ++++++++---
 R/helper-functions.R  |  43 ++++++
 R/pvalsNzscore.R      | 151 +++++++++++-------
 src/RcppExports.cpp   |   5 +
 7 files changed, 412 insertions(+), 259 deletions(-)

diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index 42986760..8298e03f 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -578,42 +578,70 @@ setAs("DataFrame", "matrix", function(from){
     as.matrix(as(from, "data.table")) })
 
 #'
-#' retrieve a single sample result object
-#' @noRd
-resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores, psivals,
-                rawCts, rawTotalCts, deltaPsiVals, muPsi, psiType, fdrCut,
-                zscoreCut, dPsiCut, rowMeansK, rowMeansN, minCount,
-                additionalColumns){
-
-    zscore  <- zscores[,sampleID]
-    dpsi    <- deltaPsiVals[,sampleID]
-    pval    <- pvals[,sampleID]
-    padj    <- padjs[,sampleID]
-
-    goodCut <- !logical(length(zscore))
-    if(!is.na(zscoreCut)){
-        goodCut <- goodCut & na2default(abs(zscore) >= zscoreCut, TRUE)
-    }
-    if(!is.na(dPsiCut)){
-        goodCut <- goodCut & na2default(abs(dpsi) >= dPsiCut, TRUE)
-    }
-    if(!is.na(fdrCut)){
-        goodCut <- goodCut & na2false(padj <= fdrCut)
+#' Mapping of chromosome names
+#'
+#' @param fds FraserDataSet
+#' @param style The style of the chromosome names.
+#' @param ... Further parameters. For mapSeqLevels: further parameters 
+#'     passed to GenomeInfoDb::mapSeqlevels().
+#' 
+#' @rdname fds-methods
+#' @export
+mapSeqlevels <- function(fds, style="UCSC", ...){
+    
+    mappings <- na.omit(GenomeInfoDb::mapSeqlevels(seqlevels(fds), style, ...))
+    # fix missing names() when fds has only a single chromosome
+    if(is.null(names(mappings))){ 
+        names(mappings) <- seqlevels(fds)
     }
-    if(!is.na(minCount)){
-        goodCut <- goodCut & rawTotalCts[,sampleID] >= minCount
+    
+    if(length(mappings) != length(seqlevels(fds))){
+        message(date(), ": Drop non standard chromosomes for compatibility.")
+        fds <- keepStandardChromosomes(fds)
+        nonSplicedReads(fds) <- keepStandardChromosomes(nonSplicedReads(fds))
+        validObject(fds)
     }
+    fds <- fds[as.vector(seqnames(fds)) %in% names(mappings)]
+    
+    seqlevels(fds) <- as.vector(mappings)
+    seqlevels(nonSplicedReads(fds)) <- as.vector(mappings)
+    
+    return(fds)
+}
 
+#'
+#' retrieve a single sample result object
+#' @noRd
+resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores, 
+                                psivals, rawCts, rawTotalCts, deltaPsiVals, 
+                                psiType, rowMeansK, rowMeansN, aberrant, 
+                                aggregate, rho, pvalsGene=NULL, padjsGene=NULL, 
+                                additionalColumns, filters=list()){
+    
+    # if gene level results, find the most aberrant junction per gene first
+    if(isTRUE(aggregate)){
+        goodGenes <- rownames(aberrant)[aberrant[,sampleID]]
+        geneJunctions <- findJunctionsForAberrantGenes(gr=gr, genes=goodGenes, 
+                            pvals=pvals[,sampleID], 
+                            dpsi=deltaPsiVals[,sampleID],
+                            minCount=rawTotalCts[,sampleID], rho=rho,
+                            filters=filters)    
+        goodCut <- rep(FALSE, nrow(pvals))
+        goodCut[geneJunctions] <- TRUE
+    } else{
+        goodCut <- aberrant[,sampleID]
+    }
+    
     ans <- granges(gr[goodCut])
-
+    
     if(!any(goodCut)){
         return(ans)
     }
-
+    
     if(!"hgnc_symbol" %in% colnames(mcols(gr))){
         mcols(gr)$hgnc_symbol <- NA_character_
     }
-
+    
     # extract data
     mcols(ans)$sampleID        <- Rle(sampleID)
     if("hgnc_symbol" %in% colnames(mcols(gr))){
@@ -623,57 +651,57 @@ resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores, psivals,
         mcols(ans)$addHgncSymbols <- Rle(mcols(gr[goodCut])$other_hgnc_symbol)
     }
     mcols(ans)$type            <- Rle(psiType)
-    mcols(ans)$pValue          <- signif(pval[goodCut], 5)
-    mcols(ans)$padjust         <- signif(padj[goodCut], 5)
-    mcols(ans)$zScore          <- Rle(round(zscore[goodCut], 2))
+    mcols(ans)$pValue          <- signif(pvals[goodCut,sampleID], 5)
+    mcols(ans)$padjust         <- signif(padjs[goodCut,sampleID], 5)
+    mcols(ans)$zScore          <- Rle(round(zscores[goodCut,sampleID], 2))
     mcols(ans)$psiValue        <- Rle(round(psivals[goodCut,sampleID], 2))
-    mcols(ans)$deltaPsi        <- Rle(round(dpsi[goodCut], 2))
+    mcols(ans)$deltaPsi        <- Rle(round(deltaPsiVals[goodCut,sampleID], 2))
     mcols(ans)$meanCounts      <- Rle(round(rowMeansK[goodCut], 2))
     mcols(ans)$meanTotalCounts <- Rle(round(rowMeansN[goodCut], 2))
     mcols(ans)$counts          <- Rle(rawCts[goodCut, sampleID])
     mcols(ans)$totalCounts     <- Rle(rawTotalCts[goodCut, sampleID])
     
+    if(isTRUE(aggregate)){
+        mcols(ans)$pValueGene  <- signif(pvalsGene[goodCut,sampleID], 5)
+        mcols(ans)$padjustGene <- signif(padjsGene[goodCut,sampleID], 5)
+    }
+    
     if(!is.null(additionalColumns)){
         for(column in additionalColumns){
             mcols(ans)[,column] <- Rle(mcols(gr[goodCut])[,column])
         }
     }
-
+    
     return(ans[order(mcols(ans)$pValue)])
 }
 
 FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, 
-                    dPsiCutoff, psiType, BPPARAM=bpparam(), maxCols=20, 
-                    minCount, additionalColumns=NULL){
-
-    # check input
-    checkNaAndRange(fdrCutoff,    min=0, max=1,   scalar=TRUE, na.ok=TRUE)
-    checkNaAndRange(dPsiCutoff,   min=0, max=1,   scalar=TRUE, na.ok=TRUE)
-    checkNaAndRange(zscoreCutoff, min=0, max=100, scalar=TRUE, na.ok=TRUE)
-    checkNaAndRange(minCount,     min=0, max=Inf, scalar=TRUE, na.ok=TRUE)
-
+                            dPsiCutoff, minCount, rhoCutoff, psiType, 
+                            maxCols=20, aggregate=FALSE,
+                            BPPARAM=bpparam(), additionalColumns=NULL){
+    
     stopifnot(is(object, "FraserDataSet"))
     stopifnot(all(sampleIDs %in% samples(object)))
-
+    
     resultsls <- bplapply(psiType, BPPARAM=BPPARAM, function(type){
         message(date(), ": Collecting results for: ", type)
         currentType(object) <- type
         gr <- rowRanges(object, type=type)
-
+        
         # first get row means
         rowMeansK <- rowMeans(K(object, type=type))
         rowMeansN <- rowMeans(N(object, type=type))
-
+        
         # then iterate by chunk
         chunkCols <- getMaxChunks2Read(fds=object, assayName=type, max=maxCols)
         sampleChunks <- getSamplesByChunk(fds=object, sampleIDs=sampleIDs,
-                chunkSize=chunkCols)
-
+                                            chunkSize=chunkCols)
+        
         ans <- lapply(seq_along(sampleChunks), function(idx){
             message(date(), ": Process chunk: ", idx, " for: ", type)
             sc <- sampleChunks[[idx]]
             tmp_x <- object[,sc]
-
+            
             # extract values
             rawCts       <- as.matrix(K(tmp_x))
             rawTotalCts  <- as.matrix(N(tmp_x))
@@ -683,9 +711,24 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
             psivals      <- as.matrix(assay(tmp_x, type))
             muPsi        <- as.matrix(predictedMeans(tmp_x))
             psivals_pc   <- (rawCts + pseudocount()) /
-                                (rawTotalCts + 2*pseudocount())
+                (rawTotalCts + 2*pseudocount())
             deltaPsiVals <- psivals_pc - muPsi
-
+            rho          <- rho(tmp_x, type)
+            aberrant     <- aberrant.FRASER(tmp_x, type=type, 
+                                                padjCutoff=fdrCutoff, 
+                                                zScoreCutoff=zscoreCutoff, 
+                                                deltaPsiCutoff=dPsiCutoff, 
+                                                minCount=minCount, 
+                                                rhoCutoff=rhoCutoff,
+                                                aggregate=aggregate)
+            if(isTRUE(aggregate)){
+                pvalsGene    <- as.matrix(pVals(tmp_x, level="gene"))
+                padjsGene    <- as.matrix(padjVals(tmp_x, level="gene"))
+            } else{
+                pvalsGene    <- NULL
+                padjsGene    <- NULL
+            }
+            
             if(length(sc) == 1){
                 colnames(pvals) <- sc
                 colnames(padjs) <- sc
@@ -695,33 +738,39 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
             
             # create result table
             sampleRes <- lapply(sc,
-                    resultsSingleSample, gr=gr, pvals=pvals, padjs=padjs,
-                    zscores=zscores, psiType=type, psivals=psivals,
-                    deltaPsiVals=deltaPsiVals, muPsi=muPsi, rawCts=rawCts,
-                    rawTotalCts=rawTotalCts, fdrCut=fdrCutoff,
-                    zscoreCut=zscoreCutoff, dPsiCut=dPsiCutoff,
-                    rowMeansK=rowMeansK, rowMeansN=rowMeansN, 
-                    minCount=minCount, additionalColumns=additionalColumns)
-
+                                resultsSingleSample, gr=gr, pvals=pvals, 
+                                padjs=padjs, zscores=zscores, psiType=type, 
+                                psivals=psivals, deltaPsiVals=deltaPsiVals, 
+                                rawCts=rawCts, rawTotalCts=rawTotalCts, 
+                                rowMeansK=rowMeansK, rowMeansN=rowMeansN, 
+                                aberrant=aberrant, aggregate=aggregate,
+                                rho=rho, 
+                                pvalsGene=pvalsGene, padjsGene=padjsGene,
+                                additionalColumns=additionalColumns,
+                                filters=list(dpsi=dPsiCutoff, 
+                                            minCount=minCount, 
+                                            rho=rhoCutoff))
+            
             # return combined result
             return(unlist(GRangesList(sampleRes)))
         })
-
+        
         unlist(GRangesList(ans))
     })
-
+    
     # merge results
     ans <- unlist(GRangesList(resultsls))
-
+    
     # sort it if existing
     if(length(ans) > 0){
         ans <- ans[order(ans$pValue)]
     }
-
+    
     # return only the results
     return(ans)
 }
 
+
 #'
 #' Extracting results and aberrant splicing events
 #'
@@ -794,94 +843,56 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
 setMethod("results", "FraserDataSet", function(object, 
                     sampleIDs=samples(object), padjCutoff=0.05,
                     zScoreCutoff=NA, deltaPsiCutoff=0.3,
+                    rhoCutoff=0.1, aggregate=FALSE,
                     minCount=5, psiType=c("psi3", "psi5", "theta"),
                     additionalColumns=NULL, BPPARAM=bpparam(), ...){
     FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff,
-            zscoreCutoff=zScoreCutoff, dPsiCutoff=deltaPsiCutoff,
-            minCount=minCount, psiType=match.arg(psiType, several.ok=TRUE),
+            zscoreCutoff=zScoreCutoff, dPsiCutoff=deltaPsiCutoff, 
+            rhoCutoff=rhoCutoff, minCount=minCount, 
+            psiType=match.arg(psiType, several.ok=TRUE),
+            aggregate=aggregate,
             additionalColumns=additionalColumns, BPPARAM=BPPARAM)
 })
 
 #' @rdname results
 #' @export
-resultsByGenes <- function(res, geneColumn="hgncSymbol", method="BY"){
-    # sort by pvalue
-    res <- res[order(res$pValue)]
+resultsByGenes <- function(res, geneColumn="hgncSymbol"){
+    # sort by gene pvalue
+    res <- res[order(res$pValueGene)]
 
     # extract subset
     if(is(res, "GRanges")){
-        ans <- as.data.table(mcols(res)[,c(geneColumn, "pValue", "sampleID")])
-        colnames(ans) <- c("features", "pval", "sampleID")
+        ans <- as.data.table(mcols(res)[,c(geneColumn, "sampleID")])
+        colnames(ans) <- c("features", "sampleID")
     } else {
         ans <- featureNames <- res[,.(
-                features=get(geneColumn), pval=pValue, sampleID=sampleID)]
+                features=get(geneColumn), sampleID=sampleID)]
     }
 
-    # remove NAs
+    # remove NAs and 
+    # keep only one row per gene with lowest pvalue over all psiTypes
     naIdx <- ans[,is.na(features)]
     ansNoNA <- ans[!is.na(features)]
-
-    # compute pvalues by gene
-    ansNoNA[,pByFeature:=min(p.adjust(pval, method="holm")),
-            by="sampleID,features"]
-
-    # subset to lowest pvalue by gene
     dupIdx <- duplicated(ansNoNA[,.(features,sampleID)])
-    ansGenes <- ansNoNA[!dupIdx]
-
-    # compute FDR
-    ansGenes[,fdrByFeature:=p.adjust(pByFeature, method=method),
-            by="sampleID"]
 
     # get final result table
     finalAns <- res[!naIdx][!dupIdx]
-    finalAns$pValueGene  <- ansGenes$pByFeature
-    finalAns$padjustGene <- ansGenes$fdrByFeature
     finalAns
 }
 
-#'
-#' Mapping of chromosome names
-#'
-#' @param fds FraserDataSet
-#' @param style The style of the chromosome names.
-#' @param ... Further parameters. For mapSeqLevels: further parameters 
-#'     passed to GenomeInfoDb::mapSeqlevels().
-#' 
-#' @rdname fds-methods
-#' @export
-mapSeqlevels <- function(fds, style="UCSC", ...){
-
-    mappings <- na.omit(GenomeInfoDb::mapSeqlevels(seqlevels(fds), style, ...))
-    # fix missing names() when fds has only a single chromosome
-    if(is.null(names(mappings))){ 
-        names(mappings) <- seqlevels(fds)
-    }
-
-    if(length(mappings) != length(seqlevels(fds))){
-        message(date(), ": Drop non standard chromosomes for compatibility.")
-        fds <- keepStandardChromosomes(fds)
-        nonSplicedReads(fds) <- keepStandardChromosomes(nonSplicedReads(fds))
-        validObject(fds)
-    }
-    fds <- fds[as.vector(seqnames(fds)) %in% names(mappings)]
-
-    seqlevels(fds) <- as.vector(mappings)
-    seqlevels(nonSplicedReads(fds)) <- as.vector(mappings)
-
-    return(fds)
-}
-
-
-aberrant.FRASER <- function(object, type=currentType(object), padjCutoff=0.05,
-                    deltaPsiCutoff=0.3, zScoreCutoff=NA, minCount=5,
-                    by=c("none", "sample", "feature"), aggregate=FALSE, ...){
-
-    checkNaAndRange(zScoreCutoff,   min=0, max=Inf, na.ok=TRUE)
-    checkNaAndRange(padjCutoff,     min=0, max=1,   na.ok=TRUE)
-    checkNaAndRange(deltaPsiCutoff, min=0, max=1,   na.ok=TRUE)
+aberrant.FRASER <- function(object, type=currentType(object), 
+                                padjCutoff=0.05, deltaPsiCutoff=0.3, 
+                                zScoreCutoff=NA, minCount=5, rhoCutoff=0.1,
+                                by=c("none", "sample", "feature"), 
+                                aggregate=FALSE, ...){
+    
+    checkNaAndRange(padjCutoff,     min=0, max=1,   scalar=TRUE,   na.ok=TRUE)
+    checkNaAndRange(zScoreCutoff,   min=0, max=Inf, scalar=TRUE,   na.ok=TRUE)
+    checkNaAndRange(deltaPsiCutoff, min=0, max=1,   scalar=TRUE,   na.ok=TRUE)
+    checkNaAndRange(rhoCutoff,      min=0, max=1,   scalar=TRUE,   na.ok=TRUE)
+    checkNaAndRange(minCount,       min=0, max=Inf, scalar=TRUE,   na.ok=TRUE)
     by <- match.arg(by)
-
+    
     dots <- list(...)
     if("n" %in% names(dots)){
         n <- dots[['n']]
@@ -896,60 +907,73 @@ aberrant.FRASER <- function(object, type=currentType(object), padjCutoff=0.05,
     if("padjVals" %in% names(dots)){
         padj <- dots[['padjVals']]
     } else {
-        padj <- padjVals(object, type=type)
+        # check if padj values are available for the given filters
+        pvalsAvailable <- checkPadjAvailableForFilters(object, type=type,
+                                                filters=list(rho=rhoCutoff), 
+                                                aggregate=aggregate)
+        if(isFALSE(pvalsAvailable)){
+            stop("For the given filters, pvalues are not yet computed. \n", 
+                    "Please compute them first by running the ",
+                    "calculatePadjValues function with the requested filters.")
+        }
+        pvalLevel <- ifelse(isTRUE(aggregate), "gene", "site")
+        padj <- padjVals(object, type=type, level=pvalLevel, 
+                        filters=list(rho=rhoCutoff))
     }
     if("dPsi" %in% names(dots)){
         dpsi <- dots[['dPsi']]
     } else {
         dpsi <- deltaPsiValue(object, type=type)
     } 
+    if("rhoVals" %in% names(dots)){
+        rho <- dots[['rhoVals']]
+    } else {
+        rho <- matrix(rho(object, type=type), 
+                        nrow=nrow(dpsi), ncol=ncol(dpsi))
+    } 
     
-    
-    # create cutoff matrix
-    goodCutoff <- matrix(TRUE, nrow=nrow(zscores), ncol=ncol(zscores),
-            dimnames=dimnames(zscores))
-    if("hgnc_symbol" %in% colnames(mcols(object, type=type)) &
-                nrow(mcols(object, type=type)) == nrow(goodCutoff)){
-        rownames(goodCutoff) <- mcols(object, type=type)[,"hgnc_symbol"]
-    } else if(isTRUE(aggregate)){
-        stop("Please provide hgnc symbols to compute gene p values!")
-    }
-    
-    # check each cutoff if in use (not NA)
-    if(!is.na(minCount)){
-        goodCutoff <- goodCutoff & as.matrix(n >= minCount)
-    }
-    if(!is.na(zScoreCutoff)){
-        goodCutoff <- goodCutoff & as.matrix(abs(zscores) > zScoreCutoff)
+    if(is.na(padjCutoff)){
+        padjCutoff <- 1
     }
-    if(!is.na(deltaPsiCutoff)){
-        goodCutoff <- goodCutoff & as.matrix(abs(dpsi) > deltaPsiCutoff)
-    }
-    if(!is.na(padjCutoff)){
-        goodCutoff <- goodCutoff & as.matrix(padj < padjCutoff)
-    }
-    goodCutoff[is.na(goodCutoff)] <- FALSE
     
-    # check if we should go for aggregation
-    # TODO to speed it up we only use any hit within a feature
-    # but should do a holm's + BY correction per gene and genome wide
-    if(isTRUE(aggregate)){
-        goodCutoff <- as.matrix(data.table(goodCutoff, keep.rownames=TRUE)[,
-                as.data.table(t(colAnys(as.matrix(.SD)))), by=rn][,-1])
-        rownames(goodCutoff) <- unique(mcols(object, type=type)[,"hgnc_symbol"])
-        colnames(goodCutoff) <- colnames(zscores)
+    if(isFALSE(aggregate)){
+        aberrantEvents <- padj <= padjCutoff
+        
+        # check each cutoff if in use (not NA)
+        if(!is.na(minCount)){
+            aberrantEvents <- aberrantEvents & as.matrix(n >= minCount)
+        }
+        if(!is.na(zScoreCutoff)){
+            aberrantEvents <- aberrantEvents & 
+                                as.matrix(abs(zscores) > zScoreCutoff)
+        }
+        if(!is.na(deltaPsiCutoff)){
+            aberrantEvents <- aberrantEvents & 
+                                as.matrix(abs(dpsi) > deltaPsiCutoff)
+        }
+        if(!is.na(rhoCutoff)){
+            aberrantEvents <- aberrantEvents & as.matrix(rho < rhoCutoff)
+        }
+        aberrantEvents[is.na(aberrantEvents)] <- FALSE
+        
+    } else{
+        dt <- data.table(geneID=mcols(fds, type=type)$hgnc_symbol, padj)
+        dt <- dt[!duplicated(dt, by="geneID") & !is.na(geneID)]
+        padj <- as.matrix(dt[, -1])
+        rownames(padj) <- dt[,geneID]
+        
+        aberrantEvents <- padj <= padjCutoff
     }
     
-    # return results
-    if(by == "feature"){
-        return(rowSums(goodCutoff))
-    }
-    if(by == "sample"){
-        return(colSums(goodCutoff))
-    }
-    return(goodCutoff)
+    return(switch(match.arg(by),
+                    none = aberrantEvents,
+                    sample = colSums(aberrantEvents, na.rm=TRUE),
+                    feature = rowSums(aberrantEvents, na.rm=TRUE)
+    ))
 }
 
 #' @rdname results
 #' @export
 setMethod("aberrant", "FraserDataSet", aberrant.FRASER)
+
+
diff --git a/R/countRNAseqData.R b/R/countRNAseqData.R
index 3578fdef..2f278832 100644
--- a/R/countRNAseqData.R
+++ b/R/countRNAseqData.R
@@ -965,15 +965,20 @@ readJunctionMap <- function(junctionMap){
 #' @noRd
 extractSpliceSiteCoordinates <- function(junctions, fds){
     
-    if(strandSpecific(fds) >= 1L){
-        spliceSiteCoords <- unlist(GRangesList(
-            extractSpliceSiteCoordsPerStrand(junctions, "+"),
-            extractSpliceSiteCoordsPerStrand(junctions, "-")
-        ))
-    } else {
-        strand(junctions) <- "*"
-        spliceSiteCoords <- extractSpliceSiteCoordsPerStrand(junctions, "*")
-    }
+    # if(strandSpecific(fds) >= 1L){
+    #     spliceSiteCoords <- unlist(GRangesList(
+    #         extractSpliceSiteCoordsPerStrand(junctions, "+"),
+    #         extractSpliceSiteCoordsPerStrand(junctions, "-")
+    #     ))
+    # } else {
+    #     strand(junctions) <- "*"
+    #     spliceSiteCoords <- extractSpliceSiteCoordsPerStrand(junctions, "*")
+    # }
+    
+    spliceSiteCoords <- unlist(GRangesList(
+        lapply(unique(strand(junctions)), extractSpliceSiteCoordsPerStrand, 
+               junctions=junctions)
+    ))
     
     return(unique(sort(spliceSiteCoords)))
 }
diff --git a/R/example_functions.R b/R/example_functions.R
index 16640805..90bf817e 100644
--- a/R/example_functions.R
+++ b/R/example_functions.R
@@ -81,12 +81,12 @@ createTestFraserDataSet <- function(workingDir=tempdir(), rerun=FALSE){
     fds <- filterExpressionAndVariability(fds, minExpressionInOneSample=5, 
             minDeltaPsi=0, quantileMinExpression=0)
     
-    # run FRASER pipeline
-    fds <- FRASER(fds, q=c(psi5=2, psi3=2, theta=2), iterations=2)
-    
     # annotate it
     suppressMessages({ fds <- annotateRangesWithTxDb(fds) })
     
+    # run FRASER pipeline
+    fds <- FRASER(fds, q=c(psi5=2, psi3=2, theta=2), iterations=2)
+    
     # save data for later 
     fds <- saveFraserDataSet(fds)
     
diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R
index 94e8ca5e..26f72951 100644
--- a/R/getNSetterFuns.R
+++ b/R/getNSetterFuns.R
@@ -248,8 +248,9 @@ zScores <- function(fds, type=currentType(fds), byGroup=FALSE, ...){
 #' @describeIn getter_setter_functions This returns the calculated p-values.
 #' @export
 pVals <- function(fds, type=currentType(fds), level="site", 
+                    filters=list(rho=0.1),
                     dist="BetaBinomial", ...){
-    level <- match.arg(level, choices=c("site", "junction"))
+    level <- match.arg(level, choices=c("site", "junction", "gene"))
     dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal"))
     aname <- paste0("pvalues", dist)
     if(level == "junction"){
@@ -260,17 +261,39 @@ pVals <- function(fds, type=currentType(fds), level="site",
             warning("Did not find junction-level p values. ",
                     "Using site-level p values instead.")
         }
+    } else{ 
+        aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname)
+        # add information on used filters
+        for(n in sort(names(filters))){
+            aname <- paste0(aname, "_", n, filters[[n]])
+        }
+        if(level == "gene"){
+            if(!paste(aname, type, sep="_") %in% assayNames(fds)){
+                stop("Did not find gene-level p values. ",
+                        "Please compute them first.")
+            }
+        }
     }
+    
     getAssayMatrix(fds, aname, type=type, ...)
 }
 
 `pVals<-` <- function(fds, type=currentType(fds), level="site",
+                    filters=list(rho=0.1),
                     dist="BetaBinomial", ..., value){
-    level <- match.arg(level, choices=c("site", "junction"))
+    level <- match.arg(level, choices=c("site", "junction", "gene"))
     dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal"))
     aname <- paste0("pvalues", dist)
     if(level == "junction"){
         aname <- paste0(aname, "_junction")
+        setAssayMatrix(fds, name=aname, type=type, ...) <- value
+        return(fds)
+    } else if(level == "gene"){
+        aname <- paste0(aname, "_gene")
+    }
+    # add information on used filters
+    for(n in sort(names(filters))){
+        aname <- paste0(aname, "_", n, filters[[n]])
     }
     setAssayMatrix(fds, name=aname, type=type, ...) <- value
     return(fds)
@@ -278,15 +301,36 @@ pVals <- function(fds, type=currentType(fds), level="site",
 
 #' @describeIn getter_setter_functions This returns the adjusted p-values.
 #' @export
-padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"), ...){
+padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"), 
+                    level="site", filters=list(rho=0.1), ...){
+    level <- match.arg(level, choices=c("site", "gene"))
     dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal"))
-    return(getAssayMatrix(fds, paste0("padj", dist), type=type, ...))
+    aname <- paste0("padj", dist)
+    aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname)
+    # add information on used filters
+    for(n in sort(names(filters))){
+        aname <- paste0(aname, "_", n, filters[[n]])
+    }
+    if(level == "gene"){
+        if(!paste(aname, type, sep="_") %in% assayNames(fds)){
+            stop("Did not find gene-level p values. ",
+                "Please compute them first.")
+        }
+    }
+    return(getAssayMatrix(fds, aname, type=type, ...))
 }
 
-`padjVals<-` <- function(fds, type=currentType(fds),
-                    dist="BetaBinomial", ..., value){
+`padjVals<-` <- function(fds, type=currentType(fds), level="site",
+                    dist="BetaBinomial", filters=list(rho=0.1), ..., value){
+    level <- match.arg(level, choices=c("site", "gene"))
     dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal"))
-    setAssayMatrix(fds, name=paste0("padj", dist), type=type, ...) <- value
+    aname <- paste0("padj", dist)
+    aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname)
+    # add information on used filters
+    for(n in sort(names(filters))){
+        aname <- paste0(aname, "_", n, filters[[n]])
+    }
+    setAssayMatrix(fds, name=aname, type=type, ...) <- value
     return(fds)
 }
 
@@ -589,32 +633,29 @@ getPlottingDT <- function(fds, axis=c("row", "col"), type=NULL, result=NULL,
             padj      = c(padjVals(fds, type=type)[idxrow, idxcol]),
             zscore    = c(zScores(fds, type=type)[idxrow, idxcol]),
             obsPsi    = c((k + pseudocount())/(n + 2*pseudocount())),
-            predPsi   = c(predictedMeans(fds, type)[idxrow, idxcol]))
+            predPsi   = c(predictedMeans(fds, type)[idxrow, idxcol]),
+            rho       = rep(rho(fds, type=type)[idxrow], 
+                            ifelse(isTRUE(idxcol), ncol(fds), length(idxcol))) 
+    )
     dt[, deltaPsi:=obsPsi - predPsi]
 
     # add aberrant information to it
     aberrantVec <- aberrant(fds, ..., padjVals=dt[,.(padj)],
-        dPsi=dt[,.(deltaPsi)], zscores=dt[,.(zscore)], n=dt[,.(n)])
+        dPsi=dt[,.(deltaPsi)], zscores=dt[,.(zscore)], n=dt[,.(n)], 
+        rhoVals=dt[,.(rho)])
     dt[,aberrant:=aberrantVec]
 
     # if requested return gene p values (correct for multiple testing again)
     if(isTRUE(aggregate)){
+        dt[, pval:=c(pVals(fds, type=type, 
+                            level="gene")[idxrow, idxcol])]
+        dt[, padj:=c(padjVals(fds, type=type, 
+                            level="gene")[idxrow, idxcol]),]
         dt <- dt[!is.na(featureID)]
-
-        # correct by gene and take the smallest p value
-        dt <- rbindlist(bplapply(unique(dt[,sampleID]), 
-            BPPARAM=getBPParam(Ncpus, length(unique(dt[,sampleID]))),
-            FUN=function(x){
-                    dttmp <- dt[sampleID == x]
-                    dttmp[, pval:=p.adjust(pval, method="holm"),
-                            by="sampleID,featureID,type"]
-                    dttmp <- dttmp[order(sampleID, featureID, type, -aberrant,
-                                            pval, -abs(deltaPsi))][
-                            !duplicated(data.table(sampleID, featureID, type))]
-                    dttmp <- dttmp[, padj:=p.adjust(pval, method="BY"),
-                            by="sampleID,type"]
-                    dttmp
-            }))
+        
+        dt <- dt[order(sampleID, featureID, type, -aberrant,
+                        padj, -abs(deltaPsi))][
+                !duplicated(data.table(sampleID, featureID, type))]
     }
 
     # return object
@@ -622,7 +663,7 @@ getPlottingDT <- function(fds, axis=c("row", "col"), type=NULL, result=NULL,
 }
 
 
-#' @describeIn getter_setter_functions Dependend on the level of verbosity 
+#' @describeIn getter_setter_functions Dependent on the level of verbosity 
 #' the algorithm reports more or less to the user. 0 means being quiet 
 #' and 10 means everything.
 #' @export
diff --git a/R/helper-functions.R b/R/helper-functions.R
index 88a7cbae..17a843a3 100644
--- a/R/helper-functions.R
+++ b/R/helper-functions.R
@@ -548,3 +548,46 @@ getStrandString <- function(fds){
     strand <- switch(strandSpecific(fds)+1L, "no", "yes", "reverse")
     return(strand)
 }
+
+#'
+#' Check if adjusted pvalues have been computed for a given set of filters.
+#' @noRd
+checkPadjAvailableForFilters <- function(fds, type, filters=list(),
+                                        dist="BetaBinomial", aggregate=FALSE){
+    dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal"))
+    aname <- paste0("padj", dist)
+    aname <- ifelse(isTRUE(aggregate), paste0(aname, "_gene"), aname)
+    # add information on used filters
+    for(n in sort(names(filters))){
+        aname <- paste0(aname, "_", n, filters[[n]])
+    }
+    pvalsAvailable <- paste(aname, type, sep="_") %in% assayNames(fds)
+    return(pvalsAvailable)
+}
+
+#'
+#' Find most aberrant junction for each gene
+#' @noRd
+findJunctionsForAberrantGenes <- function(gr, genes, pvals, dpsi, minCount, 
+                                            rho, filters=list()){
+    geneJunctions <- mcols(gr)$hgnc_symbol %in% genes
+    dt <- data.table(idx=which(geneJunctions == TRUE), 
+                        geneID=mcols(gr)$hgnc_symbol[geneJunctions],
+                        pval=pvals[geneJunctions], 
+                        dpsi=abs(dpsi[geneJunctions]),
+                        minCount=minCount[geneJunctions],
+                        rho=rho[geneJunctions] )
+    
+    # mask junctions that don't pass filters (minCount, dPsi, rho)
+    for(n in names(filters)){
+        if(n == "rho"){
+            dt[rho > filters[["rho"]], pval:=NA]
+        } else{
+            dt[get(n) < filters[[n]], pval:=NA]
+        }
+    }
+    
+    # sort per gene by lowest pvalue / highest deltaPsi and return index
+    dt <- dt[order(geneID, pval, -dpsi)]
+    return(dt[!duplicated(dt, by="geneID"),idx])
+}
diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R
index deac8b59..99b5754b 100644
--- a/R/pvalsNzscore.R
+++ b/R/pvalsNzscore.R
@@ -109,11 +109,6 @@ calculatePvalues <- function(fds, type=currentType(fds),
         pvals <- 2 * pmin(pval, 1 - pval + dval, 0.5)
         pVals(fds, dist="BetaBinomial", level="junction", 
                 withDimnames=FALSE) <- pvals
-        fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues,
-                pvals=pvals, index, BPPARAM=BPPARAM)
-        fwer_pvals <- do.call(cbind, fwer_pval)
-        pVals(fds, dist="BetaBinomial", level="site", 
-                withDimnames=FALSE) <- fwer_pvals
     }
     
     if("binomial" %in% distributions){
@@ -125,11 +120,6 @@ calculatePvalues <- function(fds, type=currentType(fds),
         pvals <- 2 * pmin(pval, 1 - pval + dval, 0.5)
         pVals(fds, dist="Binomial", level="junction", 
                 withDimnames=FALSE) <- pvals
-        fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues,
-                pvals=pvals, index, BPPARAM=BPPARAM)
-        fwer_pvals <- do.call(cbind, fwer_pval)
-        pVals(fds, dist="Binomial", level="site", 
-                withDimnames=FALSE) <- fwer_pvals
     }
     
     if("normal" %in% distributions){
@@ -142,19 +132,17 @@ calculatePvalues <- function(fds, type=currentType(fds),
         pvals <- 2 * pmin(pval, 1 - pval, 0.5)
         pVals(fds, dist="Normal", level="junction", 
                 withDimnames=FALSE) <- pvals
-        fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues,
-                pvals=pvals, index, BPPARAM=BPPARAM)
-        fwer_pvals <- do.call(cbind, fwer_pval)
-        pVals(fds, dist="Normal", level="site", 
-                withDimnames=FALSE) <- fwer_pvals
     }
     
     fds
 }
 
-adjust_FWER_PValues <- function(i, pvals=pvals, index=index){
-    dt <- data.table(p=pvals[,i], idx=index)
-    dt2 <- dt[,.(pa=min(p.adjust(p, method="holm"), na.rm=TRUE)),by=idx]
+adjust_FWER_PValues <- function(i, pvals, index, rho, rhoCutoff, 
+                                method="holm"){
+    dt <- data.table(p=pvals[,i], idx=index, rho=rho)
+    dt[rho > rhoCutoff, p:=NA]
+    dt2 <- dt[,.(pa=min(p.adjust(p, method=method), na.rm=TRUE)),by=idx]
+    dt2[is.infinite(pa), pa:=NA]
     setkey(dt2, "idx")[J(index)][,pa]
 }
 
@@ -193,27 +181,100 @@ singlePvalueBinomial <- function(idx, k, n, mu){
 #' @param method The p.adjust method that should be used. 
 #' 
 #' @export
-calculatePadjValues <- function(fds, type=currentType(fds), method="BY"){
+calculatePadjValues <- function(fds, type=currentType(fds), method="BY",
+                                rhoCutoff=0.1, 
+                                BPPARAM=bpparam()){
     currentType(fds) <- type
     index <- getSiteIndex(fds, type=type)
     idx   <- !duplicated(index)
     
     for(i in c("BetaBinomial", "Binomial", "Normal")){
         # only do it if it exists
-        if(!paste0("pvalues", i, "_", type) %in% assayNames(fds)){
+        if(!paste0("pvalues", i, "_junction_", type) %in% assayNames(fds)){
             next
         }
         
-        pvals <- pVals(fds, dist=i)
-        padj <- apply(pvals[idx,], 2, p.adjust, method=method)
+        pvals <- pVals(fds, dist=i, level="junction")
+        rho <- rho(fds, type=type)
+        
+        # splice site-level pval correction
+        message(date(), ":  adjusting junction-level pvalues ...")
+        fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues,
+                                pvals=pvals, index, BPPARAM=BPPARAM,
+                                method="holm", rho=rho, rhoCutoff=rhoCutoff)
+        fwer_pvals <- do.call(cbind, fwer_pval)
+        pVals(fds, dist=i, level="site", filters=list(rho=rhoCutoff),
+                withDimnames=FALSE) <- fwer_pvals
+        
+        # junction-level FDR correction
+        padj <- apply(fwer_pvals[idx,], 2, p.adjust, method=method)
         padjDT <- data.table(cbind(i=unique(index), padj), key="i")[J(index)]
         padjDT[,i:=NULL]
-        padjVals(fds, dist=i, withDimnames=FALSE) <- as.matrix(padjDT)
+        padjVals(fds, dist=i, level="site", filters=list(rho=rhoCutoff),
+                withDimnames=FALSE) <- as.matrix(padjDT)
+        
+        # gene-level pval correction and FDR
+        if("hgnc_symbol" %in% colnames(mcols(fds, type=type))){
+            message(date(), ":  calculating gene-level pvalues ...")
+            gene_pvals <- getPvalsPerGene(fds=fds, type=type, pvals=fwer_pvals,
+                                            method="holm", FDRmethod=method, 
+                                            BPPARAM=BPPARAM)
+            pVals(fds, dist=i, level="gene", filters=list(rho=rhoCutoff),
+                    withDimnames=FALSE) <- gene_pvals[["pvals"]]
+            padjVals(fds, dist=i, level="gene", filters=list(rho=rhoCutoff),
+                    withDimnames=FALSE) <- gene_pvals[["padj"]]
+        }
     }
     
     return(fds)
 }
 
+getPvalsPerGene <- function(fds, type, 
+                    pvals=pVals(fds, type=type, level="site"),
+                    sampleID=NULL, method="holm", FDRmethod="BY", 
+                    BPPARAM=bpparam()){
+    # extract data and take only the first index of per site
+    samples <- samples(fds)
+    if(is.null(colnames(pvals))){
+        colnames(pvals) <- samples
+    }
+    dt <- data.table(
+            idx=getSiteIndex(fds, type=type),
+            geneID=getGeneIDs(fds, type=type, unique=FALSE),
+            as.data.table(pvals))
+    dt <- dt[!is.na(geneID)]
+    geneIDs <- dt[,unique(geneID)]
+    setkey(dt, geneID)
+    
+    # extract samples
+    if(!is.null(sampleID)){
+        samples <- sampleID
+    }
+    
+    # aggregate pvalues to gene level per sample
+    pvalsPerGene <- matrix(unlist(bplapply(samples, BPPARAM=BPPARAM,
+        function(i){
+                dttmp <- dt[,min(p.adjust(.SD[!duplicated(idx),get(i)], 
+                                        method=method), na.rm=TRUE),
+                            by=geneID]
+                setkey(dttmp, geneID)
+                dttmp[J(geneIDs), V1]
+        })), ncol=length(samples))
+    
+    colnames(pvalsPerGene) <- samples
+    rownames(pvalsPerGene) <- geneIDs
+    
+    # compute FDR
+    gene_padj <- apply(pvalsPerGene, 2, p.adjust, method=FDRmethod)
+    
+    # blow up back to original assay size
+    pvalsPerGene <- mapGeneToJunctionAssay(fds, type, pvalsPerGene)
+    padjPerGene <- mapGeneToJunctionAssay(fds, type, gene_padj)
+    
+    return(list(pvals=pvalsPerGene, padj=padjPerGene))
+
+}
+
 getSiteIndex <- function(fds, type){
     if(type == "theta"){
         return(mcols(fds, type=type)[['spliceSiteID']])
@@ -242,39 +303,13 @@ getGeneIDs <- function(fds, type, unique=TRUE){
     geneIDs
 }
 
-getPvalsPerGene <- function(fds, type, pvals=pVals(fds, type=type),
-                    sampleID=NULL, method="holm", BPPARAM=bpparam()){
-    # extract data and take only the first index of per site
-    dt <- data.table(
-            idx=getSiteIndex(fds, type=type),
-            geneID=getGeneIDs(fds, type=type, unique=FALSE),
-            as.data.table(pvals))
-    dt <- dt[!duplicated(idx) & !is.na(geneID)]
-    setkey(dt, geneID)
-    
-    samples <- samples(fds)
-    if(!is.null(sampleID)){
-        samples <- sampleID
-    }
-    
-    pvalsPerGene <- matrix(unlist(bplapply(samples, BPPARAM=BPPARAM,
-        function(i){
-                dttmp <- dt[,min(p.adjust(get(i), method=method)),by=geneID]
-                setkey(dttmp, geneID)
-                dttmp[J(getGeneIDs(fds, type=type)), V1]
-        })), ncol=length(samples))
-    
-    colnames(pvalsPerGene) <- samples
-    rownames(pvalsPerGene) <- getGeneIDs(fds, type=type)
-    
-    return(pvalsPerGene)
-
-}
-
-getPadjPerGene <- function(pvals, method="BY"){
-
-    padjPerGene <- apply(pvals, 2, p.adjust, method=method)
-
-    return(padjPerGene)
-
+mapGeneToJunctionAssay <- function(fds, type, pvalMat){
+    # blow up back to original assay size
+    pvalDT <- data.table(geneID=rownames(pvalMat),
+                        as.data.table(pvalMat) )
+    setkey(pvalDT, "geneID")
+    pvalDT <- pvalDT[J(getGeneIDs(fds, type=type, unique=FALSE))]
+    fullMat <- as.matrix(pvalDT[, -1])
+    rownames(fullMat) <- pvalDT[, geneID]
+    return(fullMat)
 }
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 1a7c62e3..461280c0 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -6,6 +6,11 @@
 
 using namespace Rcpp;
 
+#ifdef RCPP_USE_GLOBAL_ROSTREAM
+Rcpp::Rostream<true>&  Rcpp::Rcout = Rcpp::Rcpp_cout_get();
+Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
+#endif
+
 // setPseudoCount
 double setPseudoCount(double pseudoCount);
 RcppExport SEXP _FRASER_setPseudoCount(SEXP pseudoCountSEXP) {

From 420e0d5c860b3c1e32b93af253d1b3b7d4512985 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Tue, 12 Oct 2021 18:32:01 +0200
Subject: [PATCH 02/80] update documentation & tests

---
 DESCRIPTION                           |  2 +-
 NAMESPACE                             |  1 -
 R/AllGenerics.R                       | 61 ++++++++++++---------------
 R/countRNAseqData.R                   | 12 +-----
 R/find_encoding_dimensions.R          |  4 +-
 R/getNSetterFuns.R                    |  5 ++-
 R/helper-functions.R                  | 26 +++++++++++-
 R/pvalsNzscore.R                      | 30 +++++++++----
 man/FRASER.Rd                         | 26 ++++++++++--
 man/getter_setter_functions.Rd        | 28 +++++++++---
 man/results.Rd                        | 36 ++++++++++------
 tests/testthat/test_fraser_pipeline.R |  3 +-
 tests/testthat/test_stats.R           | 50 ++++++++++++++++++++++
 13 files changed, 204 insertions(+), 80 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index a8ebe376..c7704274 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -28,7 +28,7 @@ biocViews:
 License: MIT + file LICENSE
 URL: https://github.com/gagneurlab/FRASER
 BugRepots: https://github.com/gagneurlab/FRASER/issues
-RoxygenNote: 7.1.1
+RoxygenNote: 7.1.2
 Encoding: UTF-8
 VignetteBuilder: knitr
 Depends:
diff --git a/NAMESPACE b/NAMESPACE
index 35e2d66a..0625f5fc 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -72,7 +72,6 @@ export(predictedMeans)
 export(pseudocount)
 export(psiTypes)
 export(results)
-export(resultsByGenes)
 export(rho)
 export(samples)
 export(saveFraserDataSet)
diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index 8298e03f..d1dba092 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -677,11 +677,12 @@ resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores,
 
 FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, 
                             dPsiCutoff, minCount, rhoCutoff, psiType, 
-                            maxCols=20, aggregate=FALSE,
+                            maxCols=20, aggregate=FALSE, collapse=TRUE,
                             BPPARAM=bpparam(), additionalColumns=NULL){
     
     stopifnot(is(object, "FraserDataSet"))
     stopifnot(all(sampleIDs %in% samples(object)))
+    message(date(), " collapse = ", collapse)
     
     resultsls <- bplapply(psiType, BPPARAM=BPPARAM, function(type){
         message(date(), ": Collecting results for: ", type)
@@ -766,6 +767,19 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
         ans <- ans[order(ans$pValue)]
     }
     
+    # collapse into one row per gene if requested
+    if(isTRUE(aggregate) && isTRUE(collapse) && length(ans) > 0){
+        
+        ans <- ans[order(ans$pValueGene, ans$pValue)]
+        naIdx <- is.na(ans$hgncSymbol)
+        ansNoNA <- ans[!is.na(ans$hgncSymbol),]
+        
+        # get final result table
+        dupIdx <- duplicated(data.table(as.vector(ansNoNA$hgncSymbol), 
+                                        as.vector(ansNoNA$sampleID)))
+        ans <- ans[!naIdx,][!dupIdx,]
+    }
+    
     # return only the results
     return(ans)
 }
@@ -787,6 +801,9 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
 #' @param minCount The minimum count value of the total coverage of an intron 
 #' to be considered as significant.
 #' result
+#' @param rhoCutoff The cutoff value on the fitted rho value 
+#' (overdispersion parameter of the betabinomial) above which 
+#' junctions are filtered 
 #' @param psiType The psi types for which the results should be retrieved.
 #' @param additionalColumns Character vector containing the names of additional 
 #' columns from mcols(fds) that should appear in the result table 
@@ -802,8 +819,11 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
 #' @param by By default \code{none} which means no grouping. But if 
 #'              \code{sample} or \code{feature} is specified the sum by 
 #'              sample or feature is returned
-#' @param aggregate If TRUE the returned object is based on the grouped 
-#'              features
+#' @param aggregate If TRUE the returned object is aggregated to the feature 
+#'              level (i.e. gene level).
+#' @param collapse Only takes effect if \code{aggregate=TRUE}. 
+#'              If TRUE (default), collapses results across the different psi 
+#'              types to return only one row per feature (gene) and sample.
 #' @param ... Further arguments can be passed to the method. If "zscores", 
 #'              "padjVals" or "dPsi" is given, the values of those arguments
 #'              are used to define the aberrant events.
@@ -826,7 +846,8 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
 #' 
 #' # aggregate the results by genes (gene symbols need to be annotated first 
 #' # using annotateRanges() function)
-#' resultsByGenes(res)
+#' results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05, 
+#'         aggregate=TRUE)
 #'
 #' # get aberrant events per sample: on the example data, nothing is aberrant
 #' # based on the adjusted p-value
@@ -843,43 +864,17 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
 setMethod("results", "FraserDataSet", function(object, 
                     sampleIDs=samples(object), padjCutoff=0.05,
                     zScoreCutoff=NA, deltaPsiCutoff=0.3,
-                    rhoCutoff=0.1, aggregate=FALSE,
+                    rhoCutoff=0.1, aggregate=FALSE, collapse=TRUE,
                     minCount=5, psiType=c("psi3", "psi5", "theta"),
                     additionalColumns=NULL, BPPARAM=bpparam(), ...){
     FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff,
             zscoreCutoff=zScoreCutoff, dPsiCutoff=deltaPsiCutoff, 
             rhoCutoff=rhoCutoff, minCount=minCount, 
             psiType=match.arg(psiType, several.ok=TRUE),
-            aggregate=aggregate,
+            aggregate=aggregate, collapse=collapse,
             additionalColumns=additionalColumns, BPPARAM=BPPARAM)
 })
 
-#' @rdname results
-#' @export
-resultsByGenes <- function(res, geneColumn="hgncSymbol"){
-    # sort by gene pvalue
-    res <- res[order(res$pValueGene)]
-
-    # extract subset
-    if(is(res, "GRanges")){
-        ans <- as.data.table(mcols(res)[,c(geneColumn, "sampleID")])
-        colnames(ans) <- c("features", "sampleID")
-    } else {
-        ans <- featureNames <- res[,.(
-                features=get(geneColumn), sampleID=sampleID)]
-    }
-
-    # remove NAs and 
-    # keep only one row per gene with lowest pvalue over all psiTypes
-    naIdx <- ans[,is.na(features)]
-    ansNoNA <- ans[!is.na(features)]
-    dupIdx <- duplicated(ansNoNA[,.(features,sampleID)])
-
-    # get final result table
-    finalAns <- res[!naIdx][!dupIdx]
-    finalAns
-}
-
 aberrant.FRASER <- function(object, type=currentType(object), 
                                 padjCutoff=0.05, deltaPsiCutoff=0.3, 
                                 zScoreCutoff=NA, minCount=5, rhoCutoff=0.1,
@@ -957,7 +952,7 @@ aberrant.FRASER <- function(object, type=currentType(object),
         aberrantEvents[is.na(aberrantEvents)] <- FALSE
         
     } else{
-        dt <- data.table(geneID=mcols(fds, type=type)$hgnc_symbol, padj)
+        dt <- data.table(geneID=mcols(object, type=type)$hgnc_symbol, padj)
         dt <- dt[!duplicated(dt, by="geneID") & !is.na(geneID)]
         padj <- as.matrix(dt[, -1])
         rownames(padj) <- dt[,geneID]
diff --git a/R/countRNAseqData.R b/R/countRNAseqData.R
index 2f278832..54bee9e3 100644
--- a/R/countRNAseqData.R
+++ b/R/countRNAseqData.R
@@ -965,19 +965,9 @@ readJunctionMap <- function(junctionMap){
 #' @noRd
 extractSpliceSiteCoordinates <- function(junctions, fds){
     
-    # if(strandSpecific(fds) >= 1L){
-    #     spliceSiteCoords <- unlist(GRangesList(
-    #         extractSpliceSiteCoordsPerStrand(junctions, "+"),
-    #         extractSpliceSiteCoordsPerStrand(junctions, "-")
-    #     ))
-    # } else {
-    #     strand(junctions) <- "*"
-    #     spliceSiteCoords <- extractSpliceSiteCoordsPerStrand(junctions, "*")
-    # }
-    
     spliceSiteCoords <- unlist(GRangesList(
         lapply(unique(strand(junctions)), extractSpliceSiteCoordsPerStrand, 
-               junctions=junctions)
+                junctions=junctions)
     ))
     
     return(unique(sort(spliceSiteCoords)))
diff --git a/R/find_encoding_dimensions.R b/R/find_encoding_dimensions.R
index 212440cd..6653fcaa 100644
--- a/R/find_encoding_dimensions.R
+++ b/R/find_encoding_dimensions.R
@@ -27,6 +27,8 @@ predict_outliers <- function(fds, type, implementation, BPPARAM){
 
     fds <- calculatePvalues(fds, type=type, implementation=implementation,
             BPPARAM=BPPARAM)
+    fds <- calculatePadjValues(fds, type=type, geneLevel=FALSE, 
+            BPPARAM=BPPARAM)
 
     return(fds)
 }
@@ -50,7 +52,7 @@ eval_prot <- function(fds, type){
     }, FUN.VALUE=logical(length(unique(index))) ) ) + 0
 
     if(any(is.na(scores))){
-        warning(sum(is.na(scores)), " P-values where NAs.")
+        # warning(sum(is.na(scores)), " P-values where NAs.")
         scores[is.na(scores)] <- min(scores, na.rm=TRUE)-1
     }
     pr <- pr.curve(scores, weights.class0=labels)
diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R
index 26f72951..22644e80 100644
--- a/R/getNSetterFuns.R
+++ b/R/getNSetterFuns.R
@@ -10,11 +10,14 @@
 #' @param level Indicates if the retrieved p values should be adjusted on the 
 #'          donor/acceptor site-level (default) or if unadjusted junction-level 
 #'          p values should be returned.
+#' @param filters A named list giving the filters that were applied for masking 
+#'          during p value correction. Used for storing and retrieving the 
+#'          correct set of requested p values.
 #' @param value The new value to be assigned.
 #' @param all Logical value indicating whether \code{hyperParams(fds)} should 
 #'          return the results of all evaluated parameter combinations or only 
 #'          for the optimal parameter combination.
-#' @param ... Internally used parameteres. 
+#' @param ... Internally used parameters. 
 #' @return A (delayed) matrix or vector dependent on the type of data retrieved.
 #' 
 #' @name getter_setter_functions
diff --git a/R/helper-functions.R b/R/helper-functions.R
index 17a843a3..ca7db0a5 100644
--- a/R/helper-functions.R
+++ b/R/helper-functions.R
@@ -567,6 +567,18 @@ checkPadjAvailableForFilters <- function(fds, type, filters=list(),
 
 #'
 #' Find most aberrant junction for each gene
+#' 
+#' @param gr GRanges object with information about junctions.
+#' @param genes Significant genes for which the corresponding junction should 
+#'     be extracted. 
+#' @param pvals Vector of pvalues (for one sample).
+#' @param dpsi Vector of delta psi values (for one sample).
+#' @param minCount Vector of total counts (N) to which minCount filter will be 
+#'     applied.
+#' @param rho Vector of rho values (for all junctions).
+#' @param filters The filters which will be used for masked junctions during 
+#'     extraction (possible options: dpsi, minCount, rho).
+#' 
 #' @noRd
 findJunctionsForAberrantGenes <- function(gr, genes, pvals, dpsi, minCount, 
                                             rho, filters=list()){
@@ -581,13 +593,25 @@ findJunctionsForAberrantGenes <- function(gr, genes, pvals, dpsi, minCount,
     # mask junctions that don't pass filters (minCount, dPsi, rho)
     for(n in names(filters)){
         if(n == "rho"){
+            if(is.na(filters[["rho"]])){
+                filters[["rho"]] <- 1
+            }
             dt[rho > filters[["rho"]], pval:=NA]
         } else{
+            if(is.na(filters[[n]])){
+                filters[[n]] <- 0
+            }
             dt[get(n) < filters[[n]], pval:=NA]
         }
     }
     
     # sort per gene by lowest pvalue / highest deltaPsi and return index
     dt <- dt[order(geneID, pval, -dpsi)]
-    return(dt[!duplicated(dt, by="geneID"),idx])
+    dt <- dt[!duplicated(dt, by="geneID"),]
+    
+    # remove gene-level significant result if no junction in that gene passed
+    # the filters
+    dt <- dt[!is.na(pval),]
+    
+    return(dt[,idx])
 }
diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R
index 99b5754b..16eae792 100644
--- a/R/pvalsNzscore.R
+++ b/R/pvalsNzscore.R
@@ -33,7 +33,7 @@ calculateZscore <- function(fds, type=currentType(fds), logit=TRUE){
 
 #' @describeIn FRASER This function calculates two-sided p-values based on 
 #' the beta-binomial distribution (or binomial or normal if desired). The 
-#' returned p values are already adjusted with Holm's method per donor or 
+#' returned p values are not yet adjusted with Holm's method per donor or 
 #' acceptor site, respectively. 
 #' 
 #' @param distributions The distribution based on which the p-values are 
@@ -141,7 +141,8 @@ adjust_FWER_PValues <- function(i, pvals, index, rho, rhoCutoff,
                                 method="holm"){
     dt <- data.table(p=pvals[,i], idx=index, rho=rho)
     dt[rho > rhoCutoff, p:=NA]
-    dt2 <- dt[,.(pa=min(p.adjust(p, method=method), na.rm=TRUE)),by=idx]
+    suppressWarnings(dt2 <- dt[,.(pa=min(p.adjust(p, method=method), 
+                                        na.rm=TRUE)),by=idx])
     dt2[is.infinite(pa), pa:=NA]
     setkey(dt2, "idx")[J(index)][,pa]
 }
@@ -176,13 +177,22 @@ singlePvalueBinomial <- function(idx, k, n, mu){
 }
 
 #' @describeIn FRASER This function adjusts the previously calculated 
-#' p-values per sample for multiple testing.
+#' p-values per sample for multiple testing. First, the previoulsy calculated 
+#' junction-level p values are adjusted with Holm's method per donor or 
+#' acceptor site, respectively. Then, if gene symbols have been annotated to 
+#' junctions (and not otherwise requested), gene-level p values are computed.
 #' 
-#' @param method The p.adjust method that should be used. 
+#' @param method The p.adjust method that should be used for genome-wide 
+#' multiple testing correction.
+#' @param rhoCutoff The cutoff value on the fitted rho value 
+#'     (overdispersion parameter of the betabinomial) above which junctions are 
+#'     masked with NA during p value adjustment. 
+#' @param geneLevel Logical value indiciating whether gene-level p values 
+#'     should be calculated. Defaults to TRUE.
 #' 
 #' @export
 calculatePadjValues <- function(fds, type=currentType(fds), method="BY",
-                                rhoCutoff=0.1, 
+                                rhoCutoff=0.1, geneLevel=TRUE,
                                 BPPARAM=bpparam()){
     currentType(fds) <- type
     index <- getSiteIndex(fds, type=type)
@@ -214,7 +224,8 @@ calculatePadjValues <- function(fds, type=currentType(fds), method="BY",
                 withDimnames=FALSE) <- as.matrix(padjDT)
         
         # gene-level pval correction and FDR
-        if("hgnc_symbol" %in% colnames(mcols(fds, type=type))){
+        if("hgnc_symbol" %in% colnames(mcols(fds, type=type)) && 
+                isTRUE(geneLevel)){
             message(date(), ":  calculating gene-level pvalues ...")
             gene_pvals <- getPvalsPerGene(fds=fds, type=type, pvals=fwer_pvals,
                                             method="holm", FDRmethod=method, 
@@ -254,11 +265,14 @@ getPvalsPerGene <- function(fds, type,
     # aggregate pvalues to gene level per sample
     pvalsPerGene <- matrix(unlist(bplapply(samples, BPPARAM=BPPARAM,
         function(i){
+            suppressWarnings(
                 dttmp <- dt[,min(p.adjust(.SD[!duplicated(idx),get(i)], 
                                         method=method), na.rm=TRUE),
                             by=geneID]
-                setkey(dttmp, geneID)
-                dttmp[J(geneIDs), V1]
+            )
+            dttmp[is.infinite(V1), V1:=NA]
+            setkey(dttmp, geneID)
+            dttmp[J(geneIDs), V1]
         })), ncol=length(samples))
     
     colnames(pvalsPerGene) <- samples
diff --git a/man/FRASER.Rd b/man/FRASER.Rd
index 22f3fff5..58f9fb10 100644
--- a/man/FRASER.Rd
+++ b/man/FRASER.Rd
@@ -28,7 +28,14 @@ calculatePvalues(
   capN = 5 * 1e+05
 )
 
-calculatePadjValues(fds, type = currentType(fds), method = "BY")
+calculatePadjValues(
+  fds,
+  type = currentType(fds),
+  method = "BY",
+  rhoCutoff = 0.1,
+  geneLevel = TRUE,
+  BPPARAM = bpparam()
+)
 }
 \arguments{
 \item{fds}{A \code{\link{FraserDataSet}} object}
@@ -62,7 +69,15 @@ calculated. Possible are beta-binomial, binomial and normal.}
 \item{capN}{Counts are capped at this value to speed up the p-value 
 calculation}
 
-\item{method}{The p.adjust method that should be used.}
+\item{method}{The p.adjust method that should be used for genome-wide 
+multiple testing correction.}
+
+\item{rhoCutoff}{The cutoff value on the fitted rho value 
+(overdispersion parameter of the betabinomial) above which junctions are 
+masked with NA during p value adjustment.}
+
+\item{geneLevel}{Logical value indiciating whether gene-level p values 
+should be calculated. Defaults to TRUE.}
 }
 \value{
 FraserDataSet
@@ -104,11 +119,14 @@ psi.
 
 \item \code{calculatePvalues}: This function calculates two-sided p-values based on 
 the beta-binomial distribution (or binomial or normal if desired). The 
-returned p values are already adjusted with Holm's method per donor or 
+returned p values are not yet adjusted with Holm's method per donor or 
 acceptor site, respectively.
 
 \item \code{calculatePadjValues}: This function adjusts the previously calculated 
-p-values per sample for multiple testing.
+p-values per sample for multiple testing. First, the previoulsy calculated 
+junction-level p values are adjusted with Holm's method per donor or 
+acceptor site, respectively. Then, if gene symbols have been annotated to 
+junctions (and not otherwise requested), gene-level p values are computed.
 }}
 
 \examples{
diff --git a/man/getter_setter_functions.Rd b/man/getter_setter_functions.Rd
index bf56e32c..79306a75 100644
--- a/man/getter_setter_functions.Rd
+++ b/man/getter_setter_functions.Rd
@@ -35,9 +35,23 @@ rho(fds, type = currentType(fds))
 
 zScores(fds, type = currentType(fds), byGroup = FALSE, ...)
 
-pVals(fds, type = currentType(fds), level = "site", dist = "BetaBinomial", ...)
-
-padjVals(fds, type = currentType(fds), dist = c("BetaBinomial"), ...)
+pVals(
+  fds,
+  type = currentType(fds),
+  level = "site",
+  filters = list(rho = 0.1),
+  dist = "BetaBinomial",
+  ...
+)
+
+padjVals(
+  fds,
+  type = currentType(fds),
+  dist = c("BetaBinomial"),
+  level = "site",
+  filters = list(rho = 0.1),
+  ...
+)
 
 predictedMeans(fds, type = currentType(fds))
 
@@ -70,12 +84,16 @@ verbose(fds) <- value
 
 \item{byGroup}{If TRUE, aggregation by donor/acceptor site will be done.}
 
-\item{...}{Internally used parameteres.}
+\item{...}{Internally used parameters.}
 
 \item{level}{Indicates if the retrieved p values should be adjusted on the 
 donor/acceptor site-level (default) or if unadjusted junction-level 
 p values should be returned.}
 
+\item{filters}{A named list giving the filters that were applied for masking 
+during p value correction. Used for storing and retrieving the 
+correct set of requested p values.}
+
 \item{dist}{Distribution for which the p-values should be extracted.}
 
 \item{all}{Logical value indicating whether \code{hyperParams(fds)} should 
@@ -140,7 +158,7 @@ assays should be stored as hdf5 files.
 \item \code{dontWriteHDF5<-}: Sets whether the assays should be stored 
 as hdf5 files.
 
-\item \code{verbose}: Dependend on the level of verbosity 
+\item \code{verbose}: Dependent on the level of verbosity 
 the algorithm reports more or less to the user. 0 means being quiet 
 and 10 means everything.
 
diff --git a/man/results.Rd b/man/results.Rd
index 57682616..330aef7d 100644
--- a/man/results.Rd
+++ b/man/results.Rd
@@ -2,7 +2,6 @@
 % Please edit documentation in R/AllGenerics.R
 \name{results,FraserDataSet-method}
 \alias{results,FraserDataSet-method}
-\alias{resultsByGenes}
 \alias{aberrant,FraserDataSet-method}
 \title{Extracting results and aberrant splicing events}
 \usage{
@@ -12,6 +11,9 @@
   padjCutoff = 0.05,
   zScoreCutoff = NA,
   deltaPsiCutoff = 0.3,
+  rhoCutoff = 0.1,
+  aggregate = FALSE,
+  collapse = TRUE,
   minCount = 5,
   psiType = c("psi3", "psi5", "theta"),
   additionalColumns = NULL,
@@ -19,8 +21,6 @@
   ...
 )
 
-resultsByGenes(res, geneColumn = "hgncSymbol", method = "BY")
-
 \S4method{aberrant}{FraserDataSet}(
   object,
   type = currentType(object),
@@ -28,6 +28,7 @@ resultsByGenes(res, geneColumn = "hgncSymbol", method = "BY")
   deltaPsiCutoff = 0.3,
   zScoreCutoff = NA,
   minCount = 5,
+  rhoCutoff = 0.1,
   by = c("none", "sample", "feature"),
   aggregate = FALSE,
   ...
@@ -45,6 +46,17 @@ retrieved}
 
 \item{deltaPsiCutoff}{The cutoff on delta psi or NA if not requested.}
 
+\item{rhoCutoff}{The cutoff value on the fitted rho value 
+(overdispersion parameter of the betabinomial) above which 
+junctions are filtered}
+
+\item{aggregate}{If TRUE the returned object is aggregated to the feature 
+level (i.e. gene level).}
+
+\item{collapse}{Only takes effect if \code{aggregate=TRUE}. 
+If TRUE (default), collapses results across the different psi 
+types to return only one row per feature (gene) and sample.}
+
 \item{minCount}{The minimum count value of the total coverage of an intron 
 to be considered as significant.
 result}
@@ -62,6 +74,12 @@ are included.}
 "padjVals" or "dPsi" is given, the values of those arguments
 are used to define the aberrant events.}
 
+\item{type}{Splicing type (psi5, psi3 or theta)}
+
+\item{by}{By default \code{none} which means no grouping. But if 
+\code{sample} or \code{feature} is specified the sum by 
+sample or feature is returned}
+
 \item{res}{Result as created with \code{results()}}
 
 \item{geneColumn}{The name of the column in \code{mcols(res)} that contains 
@@ -69,15 +87,6 @@ the gene symbols.}
 
 \item{method}{The p.adjust method that is being used to adjust p values per
 sample.}
-
-\item{type}{Splicing type (psi5, psi3 or theta)}
-
-\item{by}{By default \code{none} which means no grouping. But if 
-\code{sample} or \code{feature} is specified the sum by 
-sample or feature is returned}
-
-\item{aggregate}{If TRUE the returned object is based on the grouped 
-features}
 }
 \value{
 For \code{results}: GRanges object containing significant results.
@@ -102,7 +111,8 @@ res
 
 # aggregate the results by genes (gene symbols need to be annotated first 
 # using annotateRanges() function)
-resultsByGenes(res)
+results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05, 
+        aggregate=TRUE)
 
 # get aberrant events per sample: on the example data, nothing is aberrant
 # based on the adjusted p-value
diff --git a/tests/testthat/test_fraser_pipeline.R b/tests/testthat/test_fraser_pipeline.R
index 069a217d..2bb6e2cc 100644
--- a/tests/testthat/test_fraser_pipeline.R
+++ b/tests/testthat/test_fraser_pipeline.R
@@ -4,7 +4,8 @@ test_that("FRASER function", {
     fds <- createTestFraserDataSet()
     expect_is(fds, "FraserDataSet")
     anames <- c(psiTypes, paste0(c("delta", "predictedMeans", 
-            "pvaluesBetaBinomial", "padjBetaBinomial", "zScores"), "_", 
+            "pvaluesBetaBinomial_rho0.1", "padjBetaBinomial_rho0.1", 
+            "zScores"), "_", 
             rep(psiTypes, 5)))
     expect_equal(anames %in% assayNames(fds), !logical(length(anames)))
 })
diff --git a/tests/testthat/test_stats.R b/tests/testthat/test_stats.R
index 8e76cc19..496ffd64 100644
--- a/tests/testthat/test_stats.R
+++ b/tests/testthat/test_stats.R
@@ -30,3 +30,53 @@ test_that("Zscore calculation", {
     
     expect_equal(zscores, zScores(fds, "psi5"))
 })
+
+test_that("Gene p value calculation with NAs", {
+    fds <- getFraser()
+    fds <- fds[15:24,]
+    mcols(fds, type="j")$hgnc_symbol <- rep(c("geneA", "geneB", "geneC"), 
+                                            times=c(3, 4, 3))
+    mcols(fds, type="ss")$hgnc_symbol <- rep(c("geneA", "geneB", "geneC"), 
+                                            times=c(4, 6, 4))
+    
+    # simulate junction with bad rho fit
+    rho_5 <- rho(fds, type="psi5")
+    rho_5[c(1, 4:7)] <- 0.5
+    rho(fds, type="psi5") <- rho_5
+    
+    rho_3 <- rho(fds, type="psi3")
+    rho_3 <- rep(0.5, length(rho_3))
+    rho(fds, type="psi3") <- rho_3
+  
+    # calc p values
+    fds <- calculatePadjValues(fds, type="psi5", rhoCutoff=0.1)
+    fds <- calculatePadjValues(fds, type="psi3", rhoCutoff=0.1)
+    
+    # check psi5 pvals are partly NAs
+    expect_equal(pVals(fds, type="psi5", level="site", 
+                        filters=list(rho=0.1))[4:7,1],
+                 as.double(rep(NA, 4)))
+    expect_equal(pVals(fds, type="psi5", level="gene", 
+                        filters=list(rho=0.1))[4:7,2],
+                 as.double(rep(NA, 4)))
+    expect_equal(padjVals(fds, type="psi5", level="site", 
+                        filters=list(rho=0.1))[4:7,1],
+                 as.double(rep(NA, 4)))
+    expect_equal(padjVals(fds, type="psi5", level="gene", 
+                        filters=list(rho=0.1))[4:7,2],
+                 as.double(rep(NA, 4)))
+    
+    # check psi3 pvals are all NAs
+    expect_equal(pVals(fds, type="psi3", level="site", 
+                        filters=list(rho=0.1))[,1],
+                 as.double(rep(NA, nrow(fds))))
+    expect_equal(pVals(fds, type="psi3", level="gene", 
+                        filters=list(rho=0.1))[,2],
+                 as.double(rep(NA, nrow(fds))))
+    expect_equal(padjVals(fds, type="psi3", level="site", 
+                        filters=list(rho=0.1))[,1],
+                 as.double(rep(NA, nrow(fds))))
+    expect_equal(padjVals(fds, type="psi3", level="gene", 
+                        filters=list(rho=0.1))[,2],
+                 as.double(rep(NA, nrow(fds))))
+})

From f2b992ad1510dbd9f335d4bd1585e78dc29c026c Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Wed, 13 Oct 2021 16:57:22 +0200
Subject: [PATCH 03/80] fix plotting examples

---
 R/plotMethods.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/plotMethods.R b/R/plotMethods.R
index 0afa2fd0..43a66403 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -213,7 +213,7 @@ plotVolcano.FRASER <- function(object, sampleID,
     }
     
     if(!is.na(padjCutoff)){
-        if(dt[,any(padj < padjCutoff)]){
+        if(dt[padj < padjCutoff, .N] > 0){
             padj_line <- min(dt[padj < padjCutoff, -log10(pval)])
         }
         if(!"padj_line" %in% ls() || padj_line > 10 || is.na(padj_line)){

From ad726273f39eeb1600e97ec1ff0878e94e62a433 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Tue, 2 Nov 2021 15:57:49 +0100
Subject: [PATCH 04/80] calculation of gene pvals for all annotated genes

---
 R/AllGenerics.R             | 202 +++++++++++++++++++++---------------
 R/FRASER-package.R          |   3 +-
 R/FraserDataSet-class.R     |  15 +--
 R/annotationOfRanges.R      | 120 +++++++++++----------
 R/getNSetterFuns.R          |  74 ++++++++++---
 R/helper-functions.R        | 117 ++++++++++++++-------
 R/makeSimulatedDataset.R    |   3 +-
 R/plotMethods.R             |   4 +
 R/pvalsNzscore.R            | 131 +++++++++++++++--------
 man/FRASER.Rd               |   4 +
 man/results.Rd              |  39 +++----
 tests/testthat/test_stats.R |  48 ++++-----
 12 files changed, 482 insertions(+), 278 deletions(-)

diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index d1dba092..ba1ce932 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -616,16 +616,19 @@ resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores,
                                 psivals, rawCts, rawTotalCts, deltaPsiVals, 
                                 psiType, rowMeansK, rowMeansN, aberrant, 
                                 aggregate, rho, pvalsGene=NULL, padjsGene=NULL, 
-                                additionalColumns, filters=list()){
-    
+                                aberrantGene, additionalColumns,
+                                geneColumn="hgnc_symbol"){
+    mcols(gr)$idx <- seq_along(gr)
     # if gene level results, find the most aberrant junction per gene first
     if(isTRUE(aggregate)){
-        goodGenes <- rownames(aberrant)[aberrant[,sampleID]]
-        geneJunctions <- findJunctionsForAberrantGenes(gr=gr, genes=goodGenes, 
+        goodGenes <- rownames(aberrantGene)[aberrantGene[,sampleID] & 
+                                        !is.na(aberrantGene[,sampleID])]
+        geneJunctions <- findJunctionsForAberrantGenes(gr=gr, 
+                            aberrantGenes=goodGenes, 
                             pvals=pvals[,sampleID], 
                             dpsi=deltaPsiVals[,sampleID],
-                            minCount=rawTotalCts[,sampleID], rho=rho,
-                            filters=filters)    
+                            geneColumn=geneColumn,
+                            aberrantJunctions=aberrant[,sampleID])    
         goodCut <- rep(FALSE, nrow(pvals))
         goodCut[geneJunctions] <- TRUE
     } else{
@@ -637,33 +640,43 @@ resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores,
     if(!any(goodCut)){
         return(ans)
     }
+    mcols(ans)$idx <- mcols(gr)$idx[goodCut]
     
-    if(!"hgnc_symbol" %in% colnames(mcols(gr))){
-        mcols(gr)$hgnc_symbol <- NA_character_
+    if(!geneColumn %in% colnames(mcols(gr))){
+        mcols(gr)[,geneColumn] <- NA_character_
     }
     
     # extract data
     mcols(ans)$sampleID        <- Rle(sampleID)
     if("hgnc_symbol" %in% colnames(mcols(gr))){
-        mcols(ans)$hgncSymbol <- Rle(mcols(gr[goodCut])$hgnc_symbol)
-    }
-    if("other_hgnc_symbol" %in% colnames(mcols(gr))){
-        mcols(ans)$addHgncSymbols <- Rle(mcols(gr[goodCut])$other_hgnc_symbol)
+        mcols(ans)$hgncSymbol <- Rle(mcols(gr[goodCut])[,geneColumn])
     }
+    
     mcols(ans)$type            <- Rle(psiType)
     mcols(ans)$pValue          <- signif(pvals[goodCut,sampleID], 5)
     mcols(ans)$padjust         <- signif(padjs[goodCut,sampleID], 5)
     mcols(ans)$zScore          <- Rle(round(zscores[goodCut,sampleID], 2))
     mcols(ans)$psiValue        <- Rle(round(psivals[goodCut,sampleID], 2))
-    mcols(ans)$deltaPsi        <- Rle(round(deltaPsiVals[goodCut,sampleID], 2))
-    mcols(ans)$meanCounts      <- Rle(round(rowMeansK[goodCut], 2))
-    mcols(ans)$meanTotalCounts <- Rle(round(rowMeansN[goodCut], 2))
+    mcols(ans)$deltaPsi        <- round(deltaPsiVals[goodCut,sampleID], 2)
     mcols(ans)$counts          <- Rle(rawCts[goodCut, sampleID])
     mcols(ans)$totalCounts     <- Rle(rawTotalCts[goodCut, sampleID])
+    mcols(ans)$meanCounts      <- Rle(round(rowMeansK[goodCut], 2))
+    mcols(ans)$meanTotalCounts <- Rle(round(rowMeansN[goodCut], 2))
     
     if(isTRUE(aggregate)){
-        mcols(ans)$pValueGene  <- signif(pvalsGene[goodCut,sampleID], 5)
-        mcols(ans)$padjustGene <- signif(padjsGene[goodCut,sampleID], 5)
+        # report junction more than once if it is significant for several genes
+        nrGenesPerJunction <- table(geneJunctions)
+        ans <- rep(ans, nrGenesPerJunction[as.character(mcols(ans)$idx)])
+        mcols(ans)$hgncSymbol <- 
+            as.data.table(ans)[, names(geneJunctions)[geneJunctions == idx], 
+                                by = eval(colnames(mcols(ans)))][,V1]
+        
+        # add gene level pvalue    
+        mcols(ans)$pValueGene  <- 
+            signif(pvalsGene[mcols(ans)$hgncSymbol,sampleID], 5)
+        mcols(ans)$padjustGene <- 
+            signif(padjsGene[mcols(ans)$hgncSymbol,sampleID], 5)
+        mcols(ans)$hgncSymbol <- Rle(mcols(ans)$hgncSymbol)
     }
     
     if(!is.null(additionalColumns)){
@@ -672,17 +685,21 @@ resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores,
         }
     }
     
-    return(ans[order(mcols(ans)$pValue)])
+    # remove helper column
+    mcols(ans)$idx <- NULL
+    
+    
+    return(ans[order(mcols(ans)$pValue, -abs(mcols(ans)$deltaPsi))])
 }
 
 FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, 
                             dPsiCutoff, minCount, rhoCutoff, psiType, 
-                            maxCols=20, aggregate=FALSE, collapse=TRUE,
-                            BPPARAM=bpparam(), additionalColumns=NULL){
+                            maxCols=20, aggregate=FALSE, collapse=FALSE,
+                            geneColumn="hgnc_symbol", BPPARAM=bpparam(), 
+                            additionalColumns=NULL){
     
     stopifnot(is(object, "FraserDataSet"))
     stopifnot(all(sampleIDs %in% samples(object)))
-    message(date(), " collapse = ", collapse)
     
     resultsls <- bplapply(psiType, BPPARAM=BPPARAM, function(type){
         message(date(), ": Collecting results for: ", type)
@@ -713,7 +730,7 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
             muPsi        <- as.matrix(predictedMeans(tmp_x))
             psivals_pc   <- (rawCts + pseudocount()) /
                 (rawTotalCts + 2*pseudocount())
-            deltaPsiVals <- psivals_pc - muPsi
+            deltaPsiVals <- deltaPsiValue(tmp_x, type)
             rho          <- rho(tmp_x, type)
             aberrant     <- aberrant.FRASER(tmp_x, type=type, 
                                                 padjCutoff=fdrCutoff, 
@@ -721,13 +738,23 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
                                                 deltaPsiCutoff=dPsiCutoff, 
                                                 minCount=minCount, 
                                                 rhoCutoff=rhoCutoff,
-                                                aggregate=aggregate)
+                                                aggregate=FALSE,
+                                                geneColumn=geneColumn)
             if(isTRUE(aggregate)){
                 pvalsGene    <- as.matrix(pVals(tmp_x, level="gene"))
                 padjsGene    <- as.matrix(padjVals(tmp_x, level="gene"))
+                aberrantGene <- aberrant.FRASER(tmp_x, type=type, 
+                                                padjCutoff=fdrCutoff, 
+                                                zScoreCutoff=zscoreCutoff, 
+                                                deltaPsiCutoff=dPsiCutoff, 
+                                                minCount=minCount, 
+                                                rhoCutoff=rhoCutoff,
+                                                aggregate=TRUE,
+                                                geneColumn=geneColumn)
             } else{
                 pvalsGene    <- NULL
                 padjsGene    <- NULL
+                aberrantGene <- NULL
             }
             
             if(length(sc) == 1){
@@ -745,12 +772,10 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
                                 rawCts=rawCts, rawTotalCts=rawTotalCts, 
                                 rowMeansK=rowMeansK, rowMeansN=rowMeansN, 
                                 aberrant=aberrant, aggregate=aggregate,
-                                rho=rho, 
+                                rho=rho, geneColumn=geneColumn,
                                 pvalsGene=pvalsGene, padjsGene=padjsGene,
-                                additionalColumns=additionalColumns,
-                                filters=list(dpsi=dPsiCutoff, 
-                                            minCount=minCount, 
-                                            rho=rhoCutoff))
+                                aberrantGene=aberrantGene, 
+                                additionalColumns=additionalColumns)
             
             # return combined result
             return(unlist(GRangesList(sampleRes)))
@@ -768,16 +793,8 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
     }
     
     # collapse into one row per gene if requested
-    if(isTRUE(aggregate) && isTRUE(collapse) && length(ans) > 0){
-        
-        ans <- ans[order(ans$pValueGene, ans$pValue)]
-        naIdx <- is.na(ans$hgncSymbol)
-        ansNoNA <- ans[!is.na(ans$hgncSymbol),]
-        
-        # get final result table
-        dupIdx <- duplicated(data.table(as.vector(ansNoNA$hgncSymbol), 
-                                        as.vector(ansNoNA$sampleID)))
-        ans <- ans[!naIdx,][!dupIdx,]
+    if(isTRUE(aggregate) && isTRUE(collapse)){
+        ans <- collapseResTablePerGene(ans)
     }
     
     # return only the results
@@ -810,11 +827,6 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
 #' (e.g. ensembl_gene_id). Default is \code{NULL}, so no additional columns 
 #' are included. 
 #' @param BPPARAM The BiocParallel parameter.
-#' @param res Result as created with \code{results()}
-#' @param geneColumn The name of the column in \code{mcols(res)} that contains 
-#'     the gene symbols.   
-#' @param method The p.adjust method that is being used to adjust p values per
-#'     sample.
 #' @param type Splicing type (psi5, psi3 or theta)
 #' @param by By default \code{none} which means no grouping. But if 
 #'              \code{sample} or \code{feature} is specified the sum by 
@@ -822,11 +834,13 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
 #' @param aggregate If TRUE the returned object is aggregated to the feature 
 #'              level (i.e. gene level).
 #' @param collapse Only takes effect if \code{aggregate=TRUE}. 
-#'              If TRUE (default), collapses results across the different psi 
+#'              If TRUE, collapses results across the different psi 
 #'              types to return only one row per feature (gene) and sample.
-#' @param ... Further arguments can be passed to the method. If "zscores", 
-#'              "padjVals" or "dPsi" is given, the values of those arguments
-#'              are used to define the aberrant events.
+#' @param geneColumn The column name of the column that has the gene annotation 
+#'              that will be used for gene-level pvalue computation.
+#' @param ... Further arguments can be passed to the method. If "n", "zscores", 
+#'              "padjVals", "dPsi" or "rhoVals" are given, the values of those 
+#'              arguments are used to define the aberrant events.
 #'
 #' @return For \code{results}: GRanges object containing significant results.
 #'     For \code{aberrant}: Either a of logical values of size 
@@ -839,16 +853,21 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
 #' # get data, fit and compute p-values and z-scores
 #' fds <- createTestFraserDataSet()
 #' 
-#' # extract results: for this example dataset, z score cutoff of 2 is used to
-#' # get at least one result and show the output
-#' res <- results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05)
+#' # extract results: for this example dataset, z score cutoff of 1 and delta 
+#' # psi cutoff of 0.2 is used to get at least one result and show the output
+#' res <- results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=0.2)
 #' res
 #' 
 #' # aggregate the results by genes (gene symbols need to be annotated first 
 #' # using annotateRanges() function)
-#' results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05, 
+#' results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=0.1, 
 #'         aggregate=TRUE)
 #'
+#' # aggregate the results by genes and collapse over all psi types to obtain 
+#' # only one row per gene in the results table 
+#' results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=0.1, 
+#'         aggregate=TRUE, collapse=TRUE)
+#'
 #' # get aberrant events per sample: on the example data, nothing is aberrant
 #' # based on the adjusted p-value
 #' aberrant(fds, type="psi5", by="sample")
@@ -864,14 +883,15 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
 setMethod("results", "FraserDataSet", function(object, 
                     sampleIDs=samples(object), padjCutoff=0.05,
                     zScoreCutoff=NA, deltaPsiCutoff=0.3,
-                    rhoCutoff=0.1, aggregate=FALSE, collapse=TRUE,
+                    rhoCutoff=0.1, aggregate=FALSE, collapse=FALSE,
                     minCount=5, psiType=c("psi3", "psi5", "theta"),
-                    additionalColumns=NULL, BPPARAM=bpparam(), ...){
+                    geneColumn="hgnc_symbol",
+                    additionalColumns=NULL, BPPARAM=bpparam()){
     FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff,
             zscoreCutoff=zScoreCutoff, dPsiCutoff=deltaPsiCutoff, 
             rhoCutoff=rhoCutoff, minCount=minCount, 
             psiType=match.arg(psiType, several.ok=TRUE),
-            aggregate=aggregate, collapse=collapse,
+            aggregate=aggregate, collapse=collapse, geneColumn=geneColumn,
             additionalColumns=additionalColumns, BPPARAM=BPPARAM)
 })
 
@@ -879,7 +899,7 @@ aberrant.FRASER <- function(object, type=currentType(object),
                                 padjCutoff=0.05, deltaPsiCutoff=0.3, 
                                 zScoreCutoff=NA, minCount=5, rhoCutoff=0.1,
                                 by=c("none", "sample", "feature"), 
-                                aggregate=FALSE, ...){
+                                aggregate=FALSE, geneColumn="hgnc_symbol", ...){
     
     checkNaAndRange(padjCutoff,     min=0, max=1,   scalar=TRUE,   na.ok=TRUE)
     checkNaAndRange(zScoreCutoff,   min=0, max=Inf, scalar=TRUE,   na.ok=TRUE)
@@ -911,8 +931,7 @@ aberrant.FRASER <- function(object, type=currentType(object),
                     "Please compute them first by running the ",
                     "calculatePadjValues function with the requested filters.")
         }
-        pvalLevel <- ifelse(isTRUE(aggregate), "gene", "site")
-        padj <- padjVals(object, type=type, level=pvalLevel, 
+        padj <- padjVals(object, type=type, level="site", 
                         filters=list(rho=rhoCutoff))
     }
     if("dPsi" %in% names(dots)){
@@ -926,38 +945,59 @@ aberrant.FRASER <- function(object, type=currentType(object),
         rho <- matrix(rho(object, type=type), 
                         nrow=nrow(dpsi), ncol=ncol(dpsi))
     } 
+    if(isTRUE(aggregate)){
+        if("padjGeneVals" %in% names(dots)){
+            padj_gene <- dots[['padjGeneVals']]
+        } else{
+            padj_gene <- padjVals(object, type=type, level="gene", 
+                                filters=list(rho=rhoCutoff))    
+        }
+        
+    }
     
     if(is.na(padjCutoff)){
         padjCutoff <- 1
     }
     
-    if(isFALSE(aggregate)){
-        aberrantEvents <- padj <= padjCutoff
+    aberrantEvents <- as.matrix(padj) <= padjCutoff
+    
+    # check each cutoff if in use (not NA)
+    if(!is.na(minCount)){
+        aberrantEvents <- aberrantEvents & as.matrix(n >= minCount)
+    }
+    if(!is.na(zScoreCutoff)){
+        aberrantEvents <- aberrantEvents & 
+            as.matrix(abs(zscores) > zScoreCutoff)
+    }
+    if(!is.na(deltaPsiCutoff)){
+        aberrantEvents <- aberrantEvents & 
+            as.matrix(abs(dpsi) > deltaPsiCutoff)
+    }
+    if(!is.na(rhoCutoff)){
+        aberrantEvents <- aberrantEvents & as.matrix(rho < rhoCutoff)
+    }
+    aberrantEvents[is.na(aberrantEvents)] <- FALSE
         
-        # check each cutoff if in use (not NA)
-        if(!is.na(minCount)){
-            aberrantEvents <- aberrantEvents & as.matrix(n >= minCount)
-        }
-        if(!is.na(zScoreCutoff)){
-            aberrantEvents <- aberrantEvents & 
-                                as.matrix(abs(zscores) > zScoreCutoff)
-        }
-        if(!is.na(deltaPsiCutoff)){
-            aberrantEvents <- aberrantEvents & 
-                                as.matrix(abs(dpsi) > deltaPsiCutoff)
-        }
-        if(!is.na(rhoCutoff)){
-            aberrantEvents <- aberrantEvents & as.matrix(rho < rhoCutoff)
+    if(isTRUE(aggregate)){
+        if(is.null(rownames(padj_gene))){
+            stop("Missing rownames for gene-level padj values.")
         }
-        aberrantEvents[is.na(aberrantEvents)] <- FALSE
-        
-    } else{
-        dt <- data.table(geneID=mcols(object, type=type)$hgnc_symbol, padj)
-        dt <- dt[!duplicated(dt, by="geneID") & !is.na(geneID)]
-        padj <- as.matrix(dt[, -1])
-        rownames(padj) <- dt[,geneID]
+        # reduce aberrant matrix to one row per gene 
+        # (TRUE if any junction is aberrant for each sample)
+        ab_dt <- data.table(geneID=getGeneIDs(object, type=type, unique=FALSE,
+                                                geneColumn=geneColumn), 
+                            aberrantEvents)
+        ab_dt[, dt_idx:=seq_len(.N)]
+        dt_tmp <- ab_dt[!is.na(geneID), splitGenes(geneID), by="dt_idx"]
+        ab_dt <- ab_dt[dt_tmp$dt_idx]
+        ab_dt[,`:=`(geneID=dt_tmp$V1, dt_idx=NULL)]
+        ab_dt <- ab_dt[,lapply(.SD, any), by="geneID"]
+        aberrantEvents <- as.matrix(ab_dt[,-1])
+        rownames(aberrantEvents) <- ab_dt[,geneID]
         
-        aberrantEvents <- padj <= padjCutoff
+        aberrantEvents <- aberrantEvents & as.matrix(
+                padj_gene[rownames(aberrantEvents),colnames(aberrantEvents)]
+            ) <= padjCutoff
     }
     
     return(switch(match.arg(by),
diff --git a/R/FRASER-package.R b/R/FRASER-package.R
index 3cecc264..35bf0016 100644
--- a/R/FRASER-package.R
+++ b/R/FRASER-package.R
@@ -128,5 +128,6 @@ globalVariables(c(".", "J", ".N", ".asDataFrame", "End", "first_feature",
         "model", "mu", "n", ",nsubset", "o3", "o5", "obsPsi", "os", "pa",
         "padj", "passed", "pByFeature", "pointNr", "predPsi", "psi3", "psi5",
         "psiType", "psiValue", "seqlength", "seqlevel", "Step", "traceNr",
-        "uniqueID", "V1", "value", "zscore", "maxDTheta"),
+        "uniqueID", "V1", "value", "zscore", "maxDTheta", "genes_donor",
+        "genes_acceptor", "gene_pval", "gene_padj", "dt_idx"),
         package="FRASER")
diff --git a/R/FraserDataSet-class.R b/R/FraserDataSet-class.R
index 8857751b..1ddea660 100644
--- a/R/FraserDataSet-class.R
+++ b/R/FraserDataSet-class.R
@@ -22,7 +22,8 @@ setClass("FraserDataSet",
         name            = "Data Analysis",
         bamParam        = ScanBamParam(mapqFilter=0),
         strandSpecific  = 0L,
-        workingDir      = file.path(tempdir(), "FRASER"),
+        workingDir      = getwd(),
+        # file.path(tempdir(), "FRASER"),
         nonSplicedReads = SummarizedExperiment(rowRanges=GRanges())
     )
 )
@@ -192,13 +193,13 @@ showFraserDataSet <- function(object) {
     cat("\n")
     
     cat("-------------------- BAM parameters --------------------\n")
-    if(identical(scanBamParam(FraserDataSet()), scanBamParam(object))){
-        cat(paste0("Default used with: ",
-                "bamMapqFilter=", bamMapqFilter(scanBamParam(object))
-        ))
-    } else {
+    # if(identical(scanBamParam(FraserDataSet()), scanBamParam(object))){
+    #     cat(paste0("Default used with: ",
+    #             "bamMapqFilter=", bamMapqFilter(scanBamParam(object))
+    #     ))
+    # } else {
         show(scanBamParam(object))
-    }
+    # }
     cat("\n\n")
 }
 
diff --git a/R/annotationOfRanges.R b/R/annotationOfRanges.R
index 62f6c377..da036f5b 100644
--- a/R/annotationOfRanges.R
+++ b/R/annotationOfRanges.R
@@ -59,9 +59,6 @@ annotateRanges <- function(fds, feature="hgnc_symbol", featureName=feature,
     if(length(fds) == 0) return(fds)
     
     # useEnsembl only understands GRCh=37 or GRCh=NULL (uses 38 then)
-    if(is.null(GRCh)){
-        GRCh <- 38
-    }
     if(GRCh == 38){
         GRCh <- NULL
     }
@@ -89,17 +86,16 @@ annotateRanges <- function(fds, feature="hgnc_symbol", featureName=feature,
     annotation <- getFeatureAsGRange(ensembl, feature, featureName,
             biotype, useUSCS)
 
-    # annotate split reads
-    for(i in c("psi3", "theta")){
-        gr <- rowRanges(fds, type=i)
-        if(any(strand(gr) == "*")){
-            strand(annotation) <- "*"
-        }
-        annos <- getAnnotationFeature(data=gr, featureName, annotation)
-        mcols(fds, type=i)[[featureName]] <- annos[["feature"]]
-        mcols(fds, type=i)[[paste0("other_", featureName)]] <- 
-            annos[["other_features"]]
+    # annotate splice sites first
+    gr <- rowRanges(fds, type="theta")
+    if(any(strand(gr) == "*")){
+        strand(annotation) <- "*"
     }
+    annos <- getAnnotationFeature(data=gr, featureName, annotation)
+    mcols(fds, type="theta")[[featureName]] <- annos
+    
+    # annotate junctions with genes at donor and acceptor sites
+    fds <- annotateFeatureFromSpliceSite(fds, featureName)
 
     return(fds)
 }
@@ -132,41 +128,40 @@ annotateRangesWithTxDb <- function(fds, feature="SYMBOL",
         }
     }
     
-    for(i in c("psi3", "theta")){
-        # get GRanges object with the split reads which should be annotated
-        gr <- rowRanges(fds, type=i)
-        
-        # get the annotation to compare to
-        anno <- genes(txdb)
-        if(is.data.table(orgDb)){
-            tmp <- merge(x=as.data.table(anno)[,.(gene_id)], y=orgDb, 
-                    by.y=keytype, by.x="gene_id", all.x=TRUE, sort=FALSE)[,
-                            .(gene_id, feature=get(feature))]
-            setnames(tmp, "feature", feature)
-        } else {
-            tmp  <- as.data.table(select(orgDb, keys=mcols(anno)[,"gene_id"], 
-                    columns=feature, keytype=keytype))
-        }
-        
-        # add the new feature to the annotation
-        tmp[, uniqueID := .GRP, by=keytype]
-        anno <- anno[tmp[,uniqueID]]
-        mcols(anno)[[featureName]] <- tmp[,get(feature)]
+    # get GRanges object with the splice sites which should be annotated
+    gr <- rowRanges(fds, type="theta")
+    
+    # get the annotation to compare to
+    anno <- genes(txdb)
+    if(is.data.table(orgDb)){
+        tmp <- merge(x=as.data.table(anno)[,.(gene_id)], y=orgDb, 
+                by.y=keytype, by.x="gene_id", all.x=TRUE, sort=FALSE)[,
+                        .(gene_id, feature=get(feature))]
+        setnames(tmp, "feature", feature)
+    } else {
+        tmp  <- as.data.table(select(orgDb, keys=mcols(anno)[,"gene_id"], 
+                columns=feature, keytype=keytype))
+    }
         
-        # clean up of NA and "" ids
-        anno <- anno[!is.na(mcols(anno)[,featureName]),]
-        anno <- anno[mcols(anno)[,featureName] != "",]
-        if(any(strand(gr) == "*")){
-            strand(anno) <- "*"
-        }
+    # add the new feature to the annotation
+    tmp[, uniqueID := .GRP, by=keytype]
+    anno <- anno[tmp[,uniqueID]]
+    mcols(anno)[[featureName]] <- tmp[,get(feature)]
         
-        # retrieve the feature of interest for the split reads
-        annos <- getAnnotationFeature(data=gr, featureName, anno)
-        mcols(fds, type=i)[[featureName]] <- annos[["feature"]]
-        mcols(fds, type=i)[[paste0("other_", featureName)]] <- 
-            annos[["other_features"]]
+    # clean up of NA and "" ids
+    anno <- anno[!is.na(mcols(anno)[,featureName]),]
+    anno <- anno[mcols(anno)[,featureName] != "",]
+    if(any(strand(gr) == "*")){
+        strand(anno) <- "*"
     }
     
+    # retrieve the feature of interest for the splice sites
+    annos <- getAnnotationFeature(data=gr, featureName, anno)
+    mcols(fds, type="theta")[[featureName]] <- annos
+    
+    # transfer annoated features for splice sites to junctions
+    fds <- annotateFeatureFromSpliceSite(fds, featureName)
+    
     return(fds)   
 }
 
@@ -228,14 +223,12 @@ getAnnotationFeature <- function(data, feature, annotation){
     }
     
     # extract only the feature and group them with a ";"
-    featureDT <- featureDT[,
-            list(first_feature=unique(feature)[1], 
-                other_features=paste(unique(feature)[-1], collapse = ";")),
-            by="from"
-    ]
+    featureDT <- featureDT[,feature:=paste(unique(feature), collapse = ";"),
+                            by="from"]
+    featureDT <- featureDT[!duplicated(featureDT),]
+    featureDT[feature == "NA", feature:=NA]
 
-    return(list(feature=featureDT[order(from),first_feature],
-                other_features=featureDT[order(from),other_features]))
+    return(featureDT[order(from),feature])
 }
 
 
@@ -314,4 +307,27 @@ findAnnotatedJunction <- function(fds, annotation, annotateNames=TRUE,
     fds
 }
 
-
+#' annotate junctions with genes at donor and acceptor sites
+#' @noRd
+annotateFeatureFromSpliceSite <- function(fds, featureName){
+    ssdt <- data.table(spliceSiteID=mcols(fds, type="theta")$spliceSiteID,
+                        genes=mcols(fds, type="theta")[[featureName]] 
+    )
+    junction_dt <- data.table(startID=mcols(fds, type="psi3")$startID,
+                                endID=mcols(fds, type="psi3")$endID
+    )
+    junction_dt <- merge(junction_dt, ssdt, all.x=TRUE, 
+                        by.x="startID", by.y="spliceSiteID", sort=FALSE)
+    setnames(junction_dt, "genes", "genes_donor")
+    junction_dt <- merge(junction_dt, ssdt, all.x=TRUE, 
+                        by.x="endID", by.y="spliceSiteID", sort=FALSE)
+    setnames(junction_dt, "genes", "genes_acceptor")
+    
+    junction_dt[,genes:=paste(uniqueIgnoreNA(
+                    c(splitGenes(genes_donor), splitGenes(genes_acceptor))), 
+                    collapse=";"),
+                by="startID,endID"]
+    junction_dt[genes == "NA", genes:=NA]
+    mcols(fds, type="j")[[featureName]] <- junction_dt[,genes]
+    return(fds)
+}
diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R
index 22644e80..8e91710f 100644
--- a/R/getNSetterFuns.R
+++ b/R/getNSetterFuns.R
@@ -271,10 +271,11 @@ pVals <- function(fds, type=currentType(fds), level="site",
             aname <- paste0(aname, "_", n, filters[[n]])
         }
         if(level == "gene"){
-            if(!paste(aname, type, sep="_") %in% assayNames(fds)){
+            if(!paste(aname, type, sep="_") %in% names(metadata(fds))){
                 stop("Did not find gene-level p values. ",
                         "Please compute them first.")
             }
+            return(metadata(fds)[[paste(aname, type, sep="_")]])
         }
     }
     
@@ -298,7 +299,15 @@ pVals <- function(fds, type=currentType(fds), level="site",
     for(n in sort(names(filters))){
         aname <- paste0(aname, "_", n, filters[[n]])
     }
-    setAssayMatrix(fds, name=aname, type=type, ...) <- value
+    
+    if(level == "gene"){
+        if(is.null(rownames(value))){
+            stop("Missing rownames when storing gene-level pvalues.")
+        }
+        metadata(fds)[[paste(aname, type, sep="_")]] <- value    
+    } else{
+        setAssayMatrix(fds, name=aname, type=type, ...) <- value
+    }
     return(fds)
 }
 
@@ -315,10 +324,11 @@ padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"),
         aname <- paste0(aname, "_", n, filters[[n]])
     }
     if(level == "gene"){
-        if(!paste(aname, type, sep="_") %in% assayNames(fds)){
-            stop("Did not find gene-level p values. ",
+        if(!paste(aname, type, sep="_") %in% names(metadata(fds))){
+            stop("Did not find gene-level padj values. ",
                 "Please compute them first.")
         }
+        return(metadata(fds)[[paste(aname, type, sep="_")]])
     }
     return(getAssayMatrix(fds, aname, type=type, ...))
 }
@@ -333,7 +343,14 @@ padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"),
     for(n in sort(names(filters))){
         aname <- paste0(aname, "_", n, filters[[n]])
     }
-    setAssayMatrix(fds, name=aname, type=type, ...) <- value
+    if(level == "gene"){
+        if(is.null(rownames(value))){
+            stop("Missing rownames when storing gene-level pvalues.")
+        }
+        metadata(fds)[[paste(aname, type, sep="_")]] <- value
+    } else{
+        setAssayMatrix(fds, name=aname, type=type, ...) <- value
+    }
     return(fds)
 }
 
@@ -589,7 +606,8 @@ getIndexFromResultTable <- function(fds, resultTable, padj.method="holm"){
 }
 
 getPlottingDT <- function(fds, axis=c("row", "col"), type=NULL, result=NULL,
-                    idx=NULL, aggregate=FALSE, pvalLevel="site", Ncpus=3, ...){
+                    idx=NULL, aggregate=FALSE, pvalLevel="site", Ncpus=3, 
+                    geneColumn="hgnc_symbol", ...){
     if(!is.null(result)){
         type <- as.character(result$type)
         idx  <- getIndexFromResultTable(fds, result)
@@ -611,8 +629,8 @@ getPlottingDT <- function(fds, axis=c("row", "col"), type=NULL, result=NULL,
     
     spliceID <- getSiteIndex(fds, type=type)[idxrow]
     feature_names <- rownames(mcols(fds, type=type))[idxrow]
-    if("hgnc_symbol" %in% colnames(mcols(fds, type=type))){
-        feature_names <- mcols(fds, type=type)[idxrow,"hgnc_symbol"]
+    if(geneColumn %in% colnames(mcols(fds, type=type))){
+        feature_names <- mcols(fds, type=type)[idxrow, geneColumn]
     }
     if(is.null(feature_names)){
         feature_names <- as.character(seq_row(mcols(fds, type=type)))[idxrow]
@@ -638,29 +656,53 @@ getPlottingDT <- function(fds, axis=c("row", "col"), type=NULL, result=NULL,
             obsPsi    = c((k + pseudocount())/(n + 2*pseudocount())),
             predPsi   = c(predictedMeans(fds, type)[idxrow, idxcol]),
             rho       = rep(rho(fds, type=type)[idxrow], 
-                            ifelse(isTRUE(idxcol), ncol(fds), length(idxcol))) 
+                            ifelse(isTRUE(idxcol), ncol(fds), sum(idxcol))) 
     )
     dt[, deltaPsi:=obsPsi - predPsi]
 
     # add aberrant information to it
     aberrantVec <- aberrant(fds, ..., padjVals=dt[,.(padj)],
         dPsi=dt[,.(deltaPsi)], zscores=dt[,.(zscore)], n=dt[,.(n)], 
-        rhoVals=dt[,.(rho)])
+        rhoVals=dt[,.(rho)], aggregate=FALSE)
     dt[,aberrant:=aberrantVec]
 
-    # if requested return gene p values (correct for multiple testing again)
+    # if requested return gene p values
     if(isTRUE(aggregate)){
-        dt[, pval:=c(pVals(fds, type=type, 
-                            level="gene")[idxrow, idxcol])]
-        dt[, padj:=c(padjVals(fds, type=type, 
-                            level="gene")[idxrow, idxcol]),]
         dt <- dt[!is.na(featureID)]
+        # split featureID into several rows if more than one
+        dt[, dt_idx:=seq_len(.N)]
+        dt_tmp <- dt[, splitGenes(featureID), by="dt_idx"]
+        dt <- dt[dt_tmp$dt_idx,]
+        dt[,`:=`(featureID=dt_tmp$V1, dt_idx=NULL)]
+        
+        # get gene-level pvalue matrices
+        pvalsGene <- lapply(c("pval", "padj"), function(x){
+            if(x == "pval"){
+                pvalsGene <- pVals(fds, type=type, 
+                                    level="gene")[,idxcol,drop=FALSE]
+            } else {
+                pvalsGene <- padjVals(fds, type=type, 
+                                    level="gene")[,idxcol,drop=FALSE]
+            }
+            pvalsGene <- data.table(featureID=rownames(pvalsGene), pvalsGene)
+            pvalsGene <- melt(pvalsGene, value.name=paste0("gene_", x), 
+                                id.vars="featureID", variable.name="sampleID")
+            return(pvalsGene)
+        })
+        pvalsGene <- merge(pvalsGene[[1]], pvalsGene[[2]], 
+                            by=c("featureID", "sampleID"))
         
+        # merge with gene pval matrix
+        dt <- merge(dt, pvalsGene, by=c("featureID", "sampleID"))
+        dt[,`:=`(pval=gene_pval, padj=gene_padj, 
+                gene_pval=NULL, gene_padj=NULL)]
+        
+        # sort
         dt <- dt[order(sampleID, featureID, type, -aberrant,
                         padj, -abs(deltaPsi))][
                 !duplicated(data.table(sampleID, featureID, type))]
     }
-
+    
     # return object
     dt
 }
diff --git a/R/helper-functions.R b/R/helper-functions.R
index ca7db0a5..573ba2c7 100644
--- a/R/helper-functions.R
+++ b/R/helper-functions.R
@@ -561,57 +561,102 @@ checkPadjAvailableForFilters <- function(fds, type, filters=list(),
     for(n in sort(names(filters))){
         aname <- paste0(aname, "_", n, filters[[n]])
     }
-    pvalsAvailable <- paste(aname, type, sep="_") %in% assayNames(fds)
+    aname <- paste(aname, type, sep="_")
+    if(isTRUE(aggregate)){
+        pvalsAvailable <- aname %in% names(metadata(fds))
+    } else{
+        pvalsAvailable <- aname %in% assayNames(fds)
+    }
     return(pvalsAvailable)
 }
 
 #'
-#' Find most aberrant junction for each gene
+#' Find most aberrant junction for each aberrant gene
 #' 
 #' @param gr GRanges object with information about junctions.
-#' @param genes Significant genes for which the corresponding junction should 
-#'     be extracted. 
+#' @param aberrantGenes Significant genes for which the corresponding junction 
+#'     should be extracted. 
 #' @param pvals Vector of pvalues (for one sample).
 #' @param dpsi Vector of delta psi values (for one sample).
-#' @param minCount Vector of total counts (N) to which minCount filter will be 
-#'     applied.
-#' @param rho Vector of rho values (for all junctions).
-#' @param filters The filters which will be used for masked junctions during 
-#'     extraction (possible options: dpsi, minCount, rho).
-#' 
+#' @param aberrantJunctions Vector indicating which junctions are considered 
+#'     aberrant. 
+#' @param geneColumn Name of the column in mcols(fds) that has gene annotation.
 #' @noRd
-findJunctionsForAberrantGenes <- function(gr, genes, pvals, dpsi, minCount, 
-                                            rho, filters=list()){
-    geneJunctions <- mcols(gr)$hgnc_symbol %in% genes
-    dt <- data.table(idx=which(geneJunctions == TRUE), 
-                        geneID=mcols(gr)$hgnc_symbol[geneJunctions],
-                        pval=pvals[geneJunctions], 
-                        dpsi=abs(dpsi[geneJunctions]),
-                        minCount=minCount[geneJunctions],
-                        rho=rho[geneJunctions] )
-    
-    # mask junctions that don't pass filters (minCount, dPsi, rho)
-    for(n in names(filters)){
-        if(n == "rho"){
-            if(is.na(filters[["rho"]])){
-                filters[["rho"]] <- 1
-            }
-            dt[rho > filters[["rho"]], pval:=NA]
-        } else{
-            if(is.na(filters[[n]])){
-                filters[[n]] <- 0
-            }
-            dt[get(n) < filters[[n]], pval:=NA]
-        }
-    }
+findJunctionsForAberrantGenes <- function(gr, aberrantGenes, pvals, dpsi, 
+                                aberrantJunctions, geneColumn="hgnc_symbol"){
+    dt <- data.table(idx=mcols(gr)$idx, 
+                        geneID=mcols(gr)[,geneColumn],
+                        pval=pvals, 
+                        dpsi=abs(dpsi),
+                        aberrant=aberrantJunctions)
+    dt[, dt_idx:=seq_len(.N)]
+    dt_tmp <- dt[, splitGenes(geneID), by="dt_idx"]
+    dt <- dt[dt_tmp$dt_idx,]
+    dt[,`:=`(geneID=dt_tmp$V1, dt_idx=NULL)]
+    dt <- dt[geneID %in% aberrantGenes,]
+    dt <- dt[!is.na(aberrant) & aberrant == TRUE,]
     
     # sort per gene by lowest pvalue / highest deltaPsi and return index
-    dt <- dt[order(geneID, pval, -dpsi)]
+    dt <- dt[order(geneID, -aberrant, pval, -dpsi)]
     dt <- dt[!duplicated(dt, by="geneID"),]
     
     # remove gene-level significant result if no junction in that gene passed
     # the filters
     dt <- dt[!is.na(pval),]
     
-    return(dt[,idx])
+    junctionsToReport <- dt[,idx]
+    names(junctionsToReport) <- dt[,geneID]
+    junctionsToReport <- sort(junctionsToReport)
+    return(junctionsToReport)
 }
+
+collapseResTablePerGene <- function(res, geneColumn="hgncSymbol"){
+    if(length(res) == 0){
+        return(res)
+    }
+    if(!is.data.table(res)){
+        res <- as.data.table(res)
+    }
+    
+    if(any(!c("pValue", "pValueGene", geneColumn) %in% colnames(res))){
+        stop("For collapsing per gene, the results table needs to contain ",
+            "the columns pValue, pValueGene and ", geneColumn, ".")
+    }
+    
+    res <- res[order(res$pValueGene, res$pValue)]
+    naIdx <- is.na(res[, get(geneColumn)])
+    ansNoNA <- res[!is.na(res[, get(geneColumn)]),]
+    
+    # get final result table
+    dupIdx <- duplicated(data.table(as.vector(ansNoNA[, get(geneColumn)]), 
+                                    as.vector(ansNoNA$sampleID)))
+    ans <- res[!naIdx,][!dupIdx,]
+    return(ans)
+}
+
+#' ignores NA in unique if other values than NA are present
+#' @noRd
+uniqueIgnoreNA <- function(x){
+    uniq <- unique(x)
+    if(length(uniq) > 1) uniq <- uniq[!is.na(uniq)]
+    return(uniq)
+}
+
+#' split string of gene names into vector
+#' @noRd
+splitGenes <- function(x, sep=";"){
+    return(unlist(strsplit(as.character(x), sep, fixed=TRUE)))
+}
+
+#' cap string of gene names to show max 3 gene names
+#' @noRd
+limitGeneNamesList <- function(gene_names, maxLength=3){
+    gene_names <- as.character(gene_names)
+    numFeatures <- unlist(lapply(gene_names, function(x) length(splitGenes(x))))
+    gene_names[numFeatures > maxLength] <- 
+        unlist(lapply(gene_names[numFeatures > maxLength], function(x){
+            paste(c(splitGenes(x)[seq_len(maxLength)], "..."), 
+                    collapse=";") 
+        } ))
+    return(gene_names)
+}
\ No newline at end of file
diff --git a/R/makeSimulatedDataset.R b/R/makeSimulatedDataset.R
index 6e29adf8..95331d19 100644
--- a/R/makeSimulatedDataset.R
+++ b/R/makeSimulatedDataset.R
@@ -500,7 +500,8 @@ injectOutliers <- function(fds, type=c("psi5", "psi3", "theta"),
     dt[,groupSize:=.N, by=groupID]
 
     # Get groups where outlier can be injected
-    available_groups <- dt[groupSize > ifelse(type == "theta", 0, 1), unique(groupID)]
+    available_groups <- dt[groupSize > ifelse(type == "theta", 0, 1), 
+                            unique(groupID)]
     
     # e.g. for psi3/5: no donor/acceptor 
     # groups with at least 2 junctions (e.g in simulationBB)
diff --git a/R/plotMethods.R b/R/plotMethods.R
index 43a66403..ccf99417 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -352,6 +352,8 @@ plotExpression <- function(fds, type=c("psi5", "psi3", "theta"),
     }
 
     dt <- getPlottingDT(fds, axis="row", type=type, idx=site, ...)
+    dt[,featureID:=limitGeneNamesList(featureID, maxLength=3)]
+    
     if(!is.null(colGroup)){
         if(all(colGroup %in% samples(fds))){
             colGroup <- samples(fds) %in% colGroup
@@ -441,6 +443,7 @@ plotExpectedVsObservedPsi <- function(fds, type=c("psi5", "psi3", "theta"),
             idx=idx, ...)
     type <- as.character(unique(dt$type))
     idx  <- unique(dt$idx)
+    dt[,featureID:=limitGeneNamesList(featureID, maxLength=3)]
     
     if(is.null(main)){
         if(isTRUE(basePlot)){
@@ -562,6 +565,7 @@ plotQQ.FRASER <- function(object, type=NULL, idx=NULL, result=NULL,
         } else {
             type <- as.character(dt[,unique(type)])
             featureID <- as.character(dt[,unique(featureID)])
+            featureID <- limitGeneNamesList(featureID, maxLength=3)
             if(isTRUE(basePlot)){
                 main <- as.expression(bquote(bold(paste(
                         .(ggplotLabelPsi(type)[[1]]),
diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R
index 16eae792..58d60a8b 100644
--- a/R/pvalsNzscore.R
+++ b/R/pvalsNzscore.R
@@ -147,6 +147,42 @@ adjust_FWER_PValues <- function(i, pvals, index, rho, rhoCutoff,
     setkey(dt2, "idx")[J(index)][,pa]
 }
 
+adjust_FWER_PValues_per_idx <- function(i, pvals, index, rho, rhoCutoff, 
+                                method="holm"){
+    pvals[rho > rhoCutoff,] <- NA
+    dttmp <- data.table(idx=index, rho=rho, 
+                        apply(pvals, 2, as.numeric))[idx == i,]
+    suppressWarnings(
+        pa <- apply(as.matrix(dttmp[,-c("idx", "rho")]), 2,
+                        function(x) min(p.adjust(x, method=method), 
+                                        na.rm = TRUE) )
+    )
+    pa[is.infinite(pa)] <- NA
+    return(pa)
+}
+
+getFWERpvals_bySample <- function(pvals, index, rho, method="holm", 
+                                rhoCutoff=0.1, BPPARAM=bpparam()){
+    fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues,
+                    pvals=pvals, index, BPPARAM=BPPARAM,
+                    method=method, rho=rho, rhoCutoff=rhoCutoff)
+    fwer_pvals <- do.call(cbind, fwer_pval)
+    return(fwer_pvals)
+}
+
+getFWERpvals_byIdx <- function(pvals, index, rho, method="holm", 
+                                rhoCutoff=0.1, BPPARAM=bpparam()){
+    unique_idx <- unique(index)
+    fwer_pval <- bplapply(unique_idx, adjust_FWER_PValues_per_idx,
+                    pvals=pvals, index, BPPARAM=BPPARAM,
+                    method=method, rho=rho, rhoCutoff=rhoCutoff)
+    fwer_pvals <- do.call(rbind, fwer_pval)
+    fwer_pvals <- as.matrix(
+        setkey(data.table(idx=unique_idx, fwer_pvals), 
+                "idx")[J(index)][,-c("idx")])
+    return(fwer_pvals)
+}
+
 singlePvalueBetaBinomial <- function(idx, k, n, mu, rho){
 
     ki <- k[idx,]
@@ -189,11 +225,13 @@ singlePvalueBinomial <- function(idx, k, n, mu){
 #'     masked with NA during p value adjustment. 
 #' @param geneLevel Logical value indiciating whether gene-level p values 
 #'     should be calculated. Defaults to TRUE.
+#' @param geneColumn The column name of the column that has the gene annotation 
+#'              that will be used for gene-level pvalue computation.
 #' 
 #' @export
 calculatePadjValues <- function(fds, type=currentType(fds), method="BY",
-                                rhoCutoff=0.1, geneLevel=TRUE,
-                                BPPARAM=bpparam()){
+                                rhoCutoff=0.1, geneLevel=TRUE, 
+                                geneColumn="hgnc_symbol", BPPARAM=bpparam()){
     currentType(fds) <- type
     index <- getSiteIndex(fds, type=type)
     idx   <- !duplicated(index)
@@ -209,14 +247,15 @@ calculatePadjValues <- function(fds, type=currentType(fds), method="BY",
         
         # splice site-level pval correction
         message(date(), ":  adjusting junction-level pvalues ...")
-        fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues,
-                                pvals=pvals, index, BPPARAM=BPPARAM,
-                                method="holm", rho=rho, rhoCutoff=rhoCutoff)
-        fwer_pvals <- do.call(cbind, fwer_pval)
+        # fwer_pvals <- getFWERpvals_byIdx(pvals, index, rho, method="holm", 
+        #                                 rhoCutoff=rhoCutoff, BPPARAM=BPPARAM)
+        fwer_pvals <- getFWERpvals_bySample(pvals, index, rho, method="holm", 
+                                        rhoCutoff=rhoCutoff, BPPARAM=BPPARAM)
         pVals(fds, dist=i, level="site", filters=list(rho=rhoCutoff),
                 withDimnames=FALSE) <- fwer_pvals
         
         # junction-level FDR correction
+        message(date(), ":  genome-wide FDR for junction-level pvalues ...")
         padj <- apply(fwer_pvals[idx,], 2, p.adjust, method=method)
         padjDT <- data.table(cbind(i=unique(index), padj), key="i")[J(index)]
         padjDT[,i:=NULL]
@@ -224,16 +263,22 @@ calculatePadjValues <- function(fds, type=currentType(fds), method="BY",
                 withDimnames=FALSE) <- as.matrix(padjDT)
         
         # gene-level pval correction and FDR
-        if("hgnc_symbol" %in% colnames(mcols(fds, type=type)) && 
+        if(geneColumn %in% colnames(mcols(fds, type=type)) && 
                 isTRUE(geneLevel)){
             message(date(), ":  calculating gene-level pvalues ...")
             gene_pvals <- getPvalsPerGene(fds=fds, type=type, pvals=fwer_pvals,
                                             method="holm", FDRmethod=method, 
+                                            geneColumn=geneColumn,
                                             BPPARAM=BPPARAM)
             pVals(fds, dist=i, level="gene", filters=list(rho=rhoCutoff),
                     withDimnames=FALSE) <- gene_pvals[["pvals"]]
             padjVals(fds, dist=i, level="gene", filters=list(rho=rhoCutoff),
                     withDimnames=FALSE) <- gene_pvals[["padj"]]
+        } else{
+            warning("Gene-level pvalues could not be calculated as column ",
+                    geneColumn, "\nwas not found for the given fds object. ", 
+                    "Please annotate gene symbols \nfirst using the ", 
+                    "annotateRanges function.")
         }
     }
     
@@ -243,18 +288,27 @@ calculatePadjValues <- function(fds, type=currentType(fds), method="BY",
 getPvalsPerGene <- function(fds, type, 
                     pvals=pVals(fds, type=type, level="site"),
                     sampleID=NULL, method="holm", FDRmethod="BY", 
-                    BPPARAM=bpparam()){
+                    geneColumn="hgnc_symbol", BPPARAM=bpparam()){
     # extract data and take only the first index of per site
+    message(date(), ":   starting gene-level pval computation for type ", type)
     samples <- samples(fds)
     if(is.null(colnames(pvals))){
         colnames(pvals) <- samples
     }
     dt <- data.table(
             idx=getSiteIndex(fds, type=type),
-            geneID=getGeneIDs(fds, type=type, unique=FALSE),
+            geneID=getGeneIDs(fds, type=type, unique=FALSE, 
+                                geneColumn=geneColumn),
             as.data.table(pvals))
     dt <- dt[!is.na(geneID)]
-    geneIDs <- dt[,unique(geneID)]
+    geneIDs <- getGeneIDs(fds, type=type, unique=TRUE, 
+                            geneColumn=geneColumn)
+    
+    # separate geneIDs into rows
+    dt[, dt_idx:=seq_len(.N)]
+    dt_tmp <- dt[, splitGenes(geneID), by="dt_idx"]
+    dt <- dt[dt_tmp$dt_idx,]
+    dt[,`:=`(geneID=dt_tmp$V1, dt_idx=NULL)]
     setkey(dt, geneID)
     
     # extract samples
@@ -263,28 +317,16 @@ getPvalsPerGene <- function(fds, type,
     }
     
     # aggregate pvalues to gene level per sample
-    pvalsPerGene <- matrix(unlist(bplapply(samples, BPPARAM=BPPARAM,
-        function(i){
-            suppressWarnings(
-                dttmp <- dt[,min(p.adjust(.SD[!duplicated(idx),get(i)], 
-                                        method=method), na.rm=TRUE),
-                            by=geneID]
-            )
-            dttmp[is.infinite(V1), V1:=NA]
-            setkey(dttmp, geneID)
-            dttmp[J(geneIDs), V1]
-        })), ncol=length(samples))
-    
-    colnames(pvalsPerGene) <- samples
-    rownames(pvalsPerGene) <- geneIDs
+    message(date(), ":   gene-level pval computation per gene (n=", 
+            length(geneIDs), ")")
+    pvalsPerGene <- genePvalsByGeneID(dt, samples=samples, geneIDs=geneIDs, 
+                                        method=method, BPPARAM=BPPARAM)
     
     # compute FDR
-    gene_padj <- apply(pvalsPerGene, 2, p.adjust, method=FDRmethod)
-    
-    # blow up back to original assay size
-    pvalsPerGene <- mapGeneToJunctionAssay(fds, type, pvalsPerGene)
-    padjPerGene <- mapGeneToJunctionAssay(fds, type, gene_padj)
+    message(date(), ":   genome-wide FDR for gene-level pvals for type ", type)
+    padjPerGene <- apply(pvalsPerGene, 2, p.adjust, method=FDRmethod)
     
+    message(date(), ":   finished gene-level pval computation for type ", type)
     return(list(pvals=pvalsPerGene, padj=padjPerGene))
 
 }
@@ -308,22 +350,29 @@ getSiteIndex <- function(fds, type){
     ans[selectionMat]
 }
 
-getGeneIDs <- function(fds, type, unique=TRUE){
-    geneIDs <- mcols(fds, type=type)$hgnc_symbol
+getGeneIDs <- function(fds, type, unique=TRUE, geneColumn="hgnc_symbol"){
+    geneIDs <- mcols(fds, type=type)[[geneColumn]]
     if(isTRUE(unique)){
-        geneIDs <- unique(geneIDs)
+        geneIDs <- unique(unlist(lapply(geneIDs, FUN=function(g){
+            unlist(strsplit(g, ";"))}) ))
         geneIDs <- geneIDs[!is.na(geneIDs)]
     }
     geneIDs
 }
 
-mapGeneToJunctionAssay <- function(fds, type, pvalMat){
-    # blow up back to original assay size
-    pvalDT <- data.table(geneID=rownames(pvalMat),
-                        as.data.table(pvalMat) )
-    setkey(pvalDT, "geneID")
-    pvalDT <- pvalDT[J(getGeneIDs(fds, type=type, unique=FALSE))]
-    fullMat <- as.matrix(pvalDT[, -1])
-    rownames(fullMat) <- pvalDT[, geneID]
-    return(fullMat)
+genePvalsByGeneID <- function(dt, samples, geneIDs, method, BPPARAM){
+    pvalsPerGene <- bplapply(geneIDs, BPPARAM=BPPARAM,
+        FUN=function(g) {
+            dttmp <- dt[geneID == g][!duplicated(idx)]
+            suppressWarnings(
+                pval_g <- apply(as.matrix(dttmp[,-c("idx", "geneID")]), 2,
+                    function(x) min(p.adjust(x, method=method), na.rm = TRUE) )
+            )
+            pval_g[is.infinite(pval_g)] <- NA
+            pval_g
+        })
+    pvalsPerGene <- do.call(rbind, pvalsPerGene)
+    rownames(pvalsPerGene) <- geneIDs
+    return(pvalsPerGene)
 }
+
diff --git a/man/FRASER.Rd b/man/FRASER.Rd
index 58f9fb10..0931ec25 100644
--- a/man/FRASER.Rd
+++ b/man/FRASER.Rd
@@ -34,6 +34,7 @@ calculatePadjValues(
   method = "BY",
   rhoCutoff = 0.1,
   geneLevel = TRUE,
+  geneColumn = "hgnc_symbol",
   BPPARAM = bpparam()
 )
 }
@@ -78,6 +79,9 @@ masked with NA during p value adjustment.}
 
 \item{geneLevel}{Logical value indiciating whether gene-level p values 
 should be calculated. Defaults to TRUE.}
+
+\item{geneColumn}{The column name of the column that has the gene annotation 
+that will be used for gene-level pvalue computation.}
 }
 \value{
 FraserDataSet
diff --git a/man/results.Rd b/man/results.Rd
index 330aef7d..7f78ffe0 100644
--- a/man/results.Rd
+++ b/man/results.Rd
@@ -13,12 +13,12 @@
   deltaPsiCutoff = 0.3,
   rhoCutoff = 0.1,
   aggregate = FALSE,
-  collapse = TRUE,
+  collapse = FALSE,
   minCount = 5,
   psiType = c("psi3", "psi5", "theta"),
+  geneColumn = "hgnc_symbol",
   additionalColumns = NULL,
-  BPPARAM = bpparam(),
-  ...
+  BPPARAM = bpparam()
 )
 
 \S4method{aberrant}{FraserDataSet}(
@@ -31,6 +31,7 @@
   rhoCutoff = 0.1,
   by = c("none", "sample", "feature"),
   aggregate = FALSE,
+  geneColumn = "hgnc_symbol",
   ...
 )
 }
@@ -54,7 +55,7 @@ junctions are filtered}
 level (i.e. gene level).}
 
 \item{collapse}{Only takes effect if \code{aggregate=TRUE}. 
-If TRUE (default), collapses results across the different psi 
+If TRUE, collapses results across the different psi 
 types to return only one row per feature (gene) and sample.}
 
 \item{minCount}{The minimum count value of the total coverage of an intron 
@@ -63,6 +64,9 @@ result}
 
 \item{psiType}{The psi types for which the results should be retrieved.}
 
+\item{geneColumn}{The column name of the column that has the gene annotation 
+that will be used for gene-level pvalue computation.}
+
 \item{additionalColumns}{Character vector containing the names of additional 
 columns from mcols(fds) that should appear in the result table 
 (e.g. ensembl_gene_id). Default is \code{NULL}, so no additional columns 
@@ -70,23 +74,15 @@ are included.}
 
 \item{BPPARAM}{The BiocParallel parameter.}
 
-\item{...}{Further arguments can be passed to the method. If "zscores", 
-"padjVals" or "dPsi" is given, the values of those arguments
-are used to define the aberrant events.}
-
 \item{type}{Splicing type (psi5, psi3 or theta)}
 
 \item{by}{By default \code{none} which means no grouping. But if 
 \code{sample} or \code{feature} is specified the sum by 
 sample or feature is returned}
 
-\item{res}{Result as created with \code{results()}}
-
-\item{geneColumn}{The name of the column in \code{mcols(res)} that contains 
-the gene symbols.}
-
-\item{method}{The p.adjust method that is being used to adjust p values per
-sample.}
+\item{...}{Further arguments can be passed to the method. If "n", "zscores", 
+"padjVals", "dPsi" or "rhoVals" are given, the values of those 
+arguments are used to define the aberrant events.}
 }
 \value{
 For \code{results}: GRanges object containing significant results.
@@ -104,16 +100,21 @@ aberrant splicing events based on the given cutoffs.
 # get data, fit and compute p-values and z-scores
 fds <- createTestFraserDataSet()
 
-# extract results: for this example dataset, z score cutoff of 2 is used to
-# get at least one result and show the output
-res <- results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05)
+# extract results: for this example dataset, z score cutoff of 1 and delta 
+# psi cutoff of 0.2 is used to get at least one result and show the output
+res <- results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=0.2)
 res
 
 # aggregate the results by genes (gene symbols need to be annotated first 
 # using annotateRanges() function)
-results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05, 
+results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=0.1, 
         aggregate=TRUE)
 
+# aggregate the results by genes and collapse over all psi types to obtain 
+# only one row per gene in the results table 
+results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=0.1, 
+        aggregate=TRUE, collapse=TRUE)
+
 # get aberrant events per sample: on the example data, nothing is aberrant
 # based on the adjusted p-value
 aberrant(fds, type="psi5", by="sample")
diff --git a/tests/testthat/test_stats.R b/tests/testthat/test_stats.R
index 496ffd64..cdc098df 100644
--- a/tests/testthat/test_stats.R
+++ b/tests/testthat/test_stats.R
@@ -52,31 +52,31 @@ test_that("Gene p value calculation with NAs", {
     fds <- calculatePadjValues(fds, type="psi5", rhoCutoff=0.1)
     fds <- calculatePadjValues(fds, type="psi3", rhoCutoff=0.1)
     
+    # check dimension of junction-, site- and gene-level pval matrices
+    expect_equal(nrow(pVals(fds, type="psi5", level="junction", 
+                            filters=list(rho=0.1))), nrow(fds))
+    expect_equal(nrow(pVals(fds, type="psi5", level="site", 
+                            filters=list(rho=0.1))), nrow(fds))
+    expect_equal(nrow(pVals(fds, type="psi5", level="gene", 
+                            filters=list(rho=0.1))), 3)
+    
     # check psi5 pvals are partly NAs
-    expect_equal(pVals(fds, type="psi5", level="site", 
-                        filters=list(rho=0.1))[4:7,1],
-                 as.double(rep(NA, 4)))
-    expect_equal(pVals(fds, type="psi5", level="gene", 
-                        filters=list(rho=0.1))[4:7,2],
-                 as.double(rep(NA, 4)))
-    expect_equal(padjVals(fds, type="psi5", level="site", 
-                        filters=list(rho=0.1))[4:7,1],
-                 as.double(rep(NA, 4)))
-    expect_equal(padjVals(fds, type="psi5", level="gene", 
-                        filters=list(rho=0.1))[4:7,2],
-                 as.double(rep(NA, 4)))
+    expect_true(all(is.na(pVals(fds, type="psi5", level="site", 
+                                filters=list(rho=0.1))[4:7,])))
+    expect_true(all(is.na(pVals(fds, type="psi5", level="gene", 
+                                filters=list(rho=0.1))["geneB",])))
+    expect_true(all(is.na(padjVals(fds, type="psi5", level="site", 
+                                filters=list(rho=0.1))[4:7,])))
+    expect_true(all(is.na(padjVals(fds, type="psi5", level="gene", 
+                                filters=list(rho=0.1))["geneB",])))
     
     # check psi3 pvals are all NAs
-    expect_equal(pVals(fds, type="psi3", level="site", 
-                        filters=list(rho=0.1))[,1],
-                 as.double(rep(NA, nrow(fds))))
-    expect_equal(pVals(fds, type="psi3", level="gene", 
-                        filters=list(rho=0.1))[,2],
-                 as.double(rep(NA, nrow(fds))))
-    expect_equal(padjVals(fds, type="psi3", level="site", 
-                        filters=list(rho=0.1))[,1],
-                 as.double(rep(NA, nrow(fds))))
-    expect_equal(padjVals(fds, type="psi3", level="gene", 
-                        filters=list(rho=0.1))[,2],
-                 as.double(rep(NA, nrow(fds))))
+    expect_true(all(is.na(pVals(fds, type="psi3", level="site", 
+                                filters=list(rho=0.1)))))
+    expect_true(all(is.na(pVals(fds, type="psi3", level="gene", 
+                                filters=list(rho=0.1)))))
+    expect_true(all(is.na(padjVals(fds, type="psi3", level="site", 
+                                filters=list(rho=0.1)))))
+    expect_true(all(is.na(padjVals(fds, type="psi3", level="gene", 
+                                filters=list(rho=0.1)))))
 })

From 1b4a0bd224715e3f5c017c34ccaeed186d3b0eb6 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Mon, 17 Jan 2022 10:30:15 +0100
Subject: [PATCH 05/80] first impleion of intron jaccard index

---
 R/AllGenerics.R       |   5 +-
 R/calculatePSIValue.R | 157 ++++++++++++++++++++++++
 R/filterExpression.R  | 272 ++++++++++++++++++++++++++++++++++++++++++
 R/helper-functions.R  |   7 +-
 R/plotMethods.R       |   8 +-
 R/variables.R         |   3 +-
 6 files changed, 442 insertions(+), 10 deletions(-)

diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index ba1ce932..9349f8b2 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -536,8 +536,9 @@ setMethod("counts", "FraserDataSet", function(object, type=NULL,
     # extract psi value from type
     type <- whichPSIType(type)
     if(length(type) == 0 | length(type) > 1){
-        stop(paste0("Please provide a correct psi type: psi5, psi3, or ",
-                    "theta. Not the given one: '", type, "'."))
+        stop(paste0("Please provide a correct psi type: psi5, psi3, ",
+                    "theta or intron_jaccard. Not the given one: '", 
+                    type, "'."))
     }
     aname <- paste0("rawOtherCounts_", type)
     if(!aname %in% assayNames(object)){
diff --git a/R/calculatePSIValue.R b/R/calculatePSIValue.R
index 90894279..be0b8dc2 100644
--- a/R/calculatePSIValue.R
+++ b/R/calculatePSIValue.R
@@ -36,6 +36,9 @@ calculatePSIValues <- function(fds, types=psiTypes, overwriteCts=FALSE,
                                 overwriteCts=overwriteCts, BPPARAM=BPPARAM)
     }
     
+    # calculate intron jaccard index
+    fds <- calculateIntronJaccardIndex(fds)
+    
     # calculate the delta psi value
     for(psiType in types){
         assayName <- paste0("delta_", psiType)
@@ -321,3 +324,157 @@ getOtherCountsCacheFolder <- function(fds){
     # return it
     return(cachedir)
 }
+
+#'
+#' calculates the intron jaccard index values for all junctions
+#' @inheritParams countRNA
+#' 
+calculateIntronJaccardIndex <- function(fds){
+    # retrieve junction and splice site annotation with count information
+    junction_dt <- data.table(
+        as.data.table(rowRanges(fds, type="j"))[,.(seqnames, start, end, 
+                                                   strand, startID, endID)], 
+        as.matrix(K(fds, type="psi5")))
+    ss_dt <- data.table(
+        as.data.table(rowRanges(fds, type="ss"))[,.(seqnames, start, end, 
+                                                    strand, spliceSiteID)], 
+        as.matrix(N(fds, type="theta")))
+    
+    # melt to have one row per sample - junction combination
+    junction_dt <- melt(junction_dt, variable.name="sampleID", value.name="k", 
+                        id.vars=c("seqnames", "start", "end", "strand", 
+                                  "startID", "endID"))
+    ss_dt <- melt(ss_dt, variable.name="sampleID", value.name="n", 
+                  id.vars=c("seqnames", "start", "end", "strand", 
+                            "spliceSiteID"))
+    
+    # merge junction information with splice site annotation (theta)
+    junction_dt <- merge(junction_dt, ss_dt[,.(spliceSiteID, sampleID, n)], 
+                         all.x=TRUE, by.x=c("startID", "sampleID"), 
+                         by.y=c("spliceSiteID", "sampleID"), sort=FALSE)
+    setnames(junction_dt, "n", "n_donor")
+    junction_dt <- merge(junction_dt, ss_dt[,.(spliceSiteID, sampleID, n)], 
+                         all.x=TRUE, by.x=c("endID", "sampleID"), 
+                         by.y=c("spliceSiteID", "sampleID"), sort=FALSE)
+    setnames(junction_dt, "n", "n_acceptor")
+    rm(ss_dt)
+    gc()
+    
+    # TODO deal with missing endIDs (why do we have them? lost in filtering?)
+    # for now: replace n_donor with N from psi5 (and same for n_acceptor and psi3)
+    junction_dt_nacceptor <- data.table(
+        as.data.table(rowRanges(fds, type="j"))[,.(startID, endID)], 
+        as.matrix(N(fds, type="psi3")))
+    junction_dt_nacceptor <- melt(junction_dt_nacceptor, 
+                        variable.name="sampleID", value.name="n_psi3", 
+                        id.vars=c("startID", "endID"))
+    junction_dt[, n_psi3:=junction_dt_nacceptor[,n_psi3]]
+    rm(junction_dt_nacceptor)
+    gc()
+    junction_dt_ndonor <- data.table(
+        as.data.table(rowRanges(fds, type="j"))[,.(startID, endID)], 
+        as.matrix(N(fds, type="psi5")))
+    junction_dt_ndonor <- melt(junction_dt_ndonor, 
+                               variable.name="sampleID", value.name="n_psi5", 
+                               id.vars=c("startID", "endID"))
+    junction_dt[, n_psi5:=junction_dt_ndonor[,n_psi5]]
+    rm(junction_dt_ndonor)
+    gc()
+    
+    # replace n (with non-split counts) by n_psi3/n_psi5 if NA
+    junction_dt[is.na(n_acceptor), n_acceptor:=n_psi3]
+    junction_dt[is.na(n_donor), n_donor:=n_psi5]
+    junction_dt[n_acceptor < n_psi3, n_acceptor:=n_psi3]
+    junction_dt[n_donor < n_psi5, n_donor:=n_psi5]
+    
+    # calculate intron_jaccard
+    junction_dt[, denominator:=(n_donor + n_acceptor - k)]
+    junction_dt[, intron_jaccard:= k / denominator]
+    junction_dt[is.nan(intron_jaccard), intron_jaccard:=1] # n_donor = n_acceptor = 0
+    
+    # convert to matrix to store it as assay in the fds
+    intron_jaccard <- matrix(junction_dt[,intron_jaccard], nrow=nrow(fds), ncol=ncol(fds), byrow=FALSE)
+    rownames(intron_jaccard) <- rownames(fds)
+    colnames(intron_jaccard) <- colnames(fds)
+    assay(fds, type="j", "intron_jaccard", withDimnames=FALSE) <- intron_jaccard
+    
+    # store denominator
+    jaccard_denom <- matrix(junction_dt[,denominator], nrow=nrow(fds), ncol=ncol(fds), byrow=FALSE)
+    rownames(jaccard_denom) <- rownames(fds)
+    colnames(jaccard_denom) <- colnames(fds)
+    assay(fds, type="j", "rawOtherCounts_intron_jaccard", 
+            withDimnames=FALSE) <- jaccard_denom - matrix(junction_dt[,k], 
+                                                            nrow=nrow(fds), 
+                                                            ncol=ncol(fds), 
+                                                            byrow=FALSE)
+    return(fds)
+}
+
+calculatePhi <- function(fds){
+    # retrieve junction and splice site annotation with count information
+    junction_dt <- data.table(
+        as.data.table(rowRanges(fds, type="j"))[,.(seqnames, start, end, 
+                                                   strand, startID, endID)], 
+        as.matrix(K(fds, type="psi5")))
+    ss_dt <- data.table(
+        as.data.table(rowRanges(fds, type="ss"))[,.(seqnames, start, end, 
+                                                    strand, spliceSiteID)], 
+        as.matrix(N(fds, type="theta")))
+    
+    # melt to have one row per sample - junction combination
+    junction_dt <- melt(junction_dt, variable.name="sampleID", value.name="k", 
+                        id.vars=c("seqnames", "start", "end", "strand", 
+                                  "startID", "endID"))
+    ss_dt <- melt(ss_dt, variable.name="sampleID", value.name="n", 
+                  id.vars=c("seqnames", "start", "end", "strand", 
+                            "spliceSiteID"))
+    
+    # merge junction information with splice site annotation (theta)
+    junction_dt <- merge(junction_dt, ss_dt[,.(spliceSiteID, sampleID, n)], 
+                         all.x=TRUE, by.x=c("startID", "sampleID"), 
+                         by.y=c("spliceSiteID", "sampleID"), sort=FALSE)
+    setnames(junction_dt, "n", "n_donor")
+    junction_dt <- merge(junction_dt, ss_dt[,.(spliceSiteID, sampleID, n)], 
+                         all.x=TRUE, by.x=c("endID", "sampleID"), 
+                         by.y=c("spliceSiteID", "sampleID"), sort=FALSE)
+    setnames(junction_dt, "n", "n_acceptor")
+    rm(ss_dt)
+    gc()
+    
+    # deal with missing endIDs
+    junction_dt_nacceptor <- data.table(
+        as.data.table(rowRanges(fds, type="j"))[,.(startID, endID)], 
+        as.matrix(N(fds, type="psi3")))
+    junction_dt_nacceptor <- melt(junction_dt_nacceptor, 
+                                  variable.name="sampleID", value.name="n_psi3", 
+                                  id.vars=c("startID", "endID"))
+    junction_dt[, n_psi3:=junction_dt_nacceptor[,n_psi3]]
+    rm(junction_dt_nacceptor)
+    gc()
+    junction_dt_ndonor <- data.table(
+        as.data.table(rowRanges(fds, type="j"))[,.(startID, endID)], 
+        as.matrix(N(fds, type="psi5")))
+    junction_dt_ndonor <- melt(junction_dt_ndonor, 
+                               variable.name="sampleID", value.name="n_psi5", 
+                               id.vars=c("startID", "endID"))
+    junction_dt[, n_psi5:=junction_dt_ndonor[,n_psi5]]
+    rm(junction_dt_ndonor)
+    gc()
+    
+    # replace n (with non-split counts) by n_psi3/n_psi5 if NA
+    junction_dt[is.na(n_acceptor), n_acceptor:=n_psi3]
+    junction_dt[is.na(n_donor), n_donor:=n_psi5]
+    junction_dt[n_acceptor < n_psi3, n_acceptor:=n_psi3]
+    junction_dt[n_donor < n_psi5, n_donor:=n_psi5]
+    
+    # calculate phi
+    junction_dt[, phi:= k / ((n_donor + n_acceptor)/2)]
+    junction_dt[is.nan(phi), phi:=0] # n_donor = n_acceptor = 0
+    
+    # convert to matrix to store it as assay in the fds
+    phi <- matrix(junction_dt[,phi], nrow=nrow(fds), ncol=ncol(fds), byrow=FALSE)
+    rownames(phi) <- rownames(fds)
+    colnames(phi) <- colnames(fds)
+    assay(fds, "phi", type="psi5", withDimnames=FALSE) <- phi
+    return(fds)
+}
diff --git a/R/filterExpression.R b/R/filterExpression.R
index 3dbab8e2..7d8d2806 100644
--- a/R/filterExpression.R
+++ b/R/filterExpression.R
@@ -353,3 +353,275 @@ applyVariabilityFilters <- function(fds, minDeltaPsi){
     return(fds)
         
 }
+
+#' @describeIn filtering This functions filters out both introns with low 
+#' read support and introns that are not variable across samples. 
+#' @export
+filterExpressionAndVariability_jaccard <- function(object, 
+                        minExpressionInOneSample=20, 
+                        quantile=0.95, quantileMinExpression=1, minDelta=0.05,
+                        filter=TRUE, 
+                        delayed=ifelse(ncol(object) <= 300, FALSE, TRUE),
+                        BPPARAM=bpparam()){
+    # filter introns with low read support and corresponding splice sites
+    object <- filterExpression_jaccard(object, 
+                        minExpressionInOneSample=minExpressionInOneSample, 
+                        quantile=quantile, 
+                        quantileMinExpression=quantileMinExpression, 
+                        filter=filter, delayed=delayed,
+                        BPPARAM=BPPARAM)
+    
+    # filter introns that are not variable across samples
+    object <- filterVariability_jaccard(object, minDelta=minDelta, 
+                        filter=filter, 
+                        delayed=delayed, BPPARAM=BPPARAM)
+    
+    # return fds
+    message(date(), ": Filtering done!")
+    return(object)
+    
+}
+
+filterExpression_jaccard <- function(object, minExpressionInOneSample=20,
+                            quantile=0.95, quantileMinExpression=1, filter=TRUE, 
+                            delayed=ifelse(ncol(object) <= 300, FALSE, TRUE),
+                            BPPARAM=bpparam()){
+    
+    stopifnot(is(object, "FraserDataSet"))
+    
+    message(date(), ": Filtering out introns with low read support ...")
+    
+    # extract counts
+    cts  <- K(object, type="j")
+    ctsN <- N(object, type="intron_jaccard")
+    
+    if(isFALSE(delayed)){
+        cts <- as.matrix(cts)
+        ctsN <- as.matrix(ctsN)
+    }
+    
+    # cutoff functions
+    f1 <- function(cts, ...){
+        rowMaxs(cts) }
+    f2 <- function(cts, ctsN, quantile, ...){
+        rowQuantiles(ctsN, probs=quantile, drop=FALSE)[,1] }
+    
+    funs <- c(maxCount=f1, quantileValueN=f2)
+    
+    # run it in parallel
+    cutoffs <- bplapply(funs, function(f, ...) f(...), BPPARAM=BPPARAM,
+                        cts=cts, ctsN=ctsN, quantile=quantile)
+    
+    # add annotation to object
+    for(n in names(cutoffs)){
+        mcols(object, type="j")[n] <- cutoffs[[n]]
+    }
+    
+    mcols(object, type="j")[['passedExpression']] <-
+        cutoffs$maxCount >= minExpressionInOneSample &
+        cutoffs$quantileValueN >= quantileMinExpression  
+    if("passedVariability" %in% colnames(mcols(object, type="j"))){
+        mcols(object, type="j")[['passed']] <-  
+            mcols(object, type="j")[['passedExpression']] & 
+            mcols(object, type="j")[['passedVariability']]
+    } else{
+        mcols(object, type="j")[['passed']] <-  
+            mcols(object, type="j")[['passedExpression']]
+    }
+    
+    # filter if requested
+    if(isTRUE(filter)){
+        object <- applyExpressionFilters_jaccard(object, 
+                                                minExpressionInOneSample, 
+                                                quantileMinExpression)
+    }
+    
+    validObject(object)
+    return(object)
+}
+
+#' @describeIn filtering This function filters out introns and corresponding 
+#' splice sites which do not show variablity across samples.
+#' @export
+filterVariability_jaccard <- function(object, minDelta=0, filter=TRUE, 
+                              delayed=ifelse(ncol(object) <= 300, FALSE, TRUE),
+                              BPPARAM=bpparam()){
+    
+    message(date(), ": Filtering out non-variable introns ...")
+    
+    # extract counts
+    cts    <- K(object, type="j")
+    ctsN  <- N(object, type="intron_jaccard")
+    
+    if(isFALSE(delayed)){
+        cts <- as.matrix(cts)
+        ctsN <- as.matrix(ctsN5)
+    }
+    
+    # cutoff functions
+    f1 <- function(cts, ctsN3, ...) {
+        intron_jaccard <- cts/ctsN
+        rowMaxs(abs(intron_jaccard - rowMeans2(intron_jaccard, na.rm=TRUE)), 
+                na.rm=TRUE) }
+    
+    funs <- c(maxDJaccard=f1)
+    
+    # run it in parallel
+    cutoffs <- bplapply(funs, function(f, ...) f(...), BPPARAM=BPPARAM,
+                        cts=cts, ctsN=ctsN)
+    
+    # add annotation to object
+    for(n in names(cutoffs)){
+            mcols(object, type="j")[n] <- cutoffs[[n]]
+    }
+    
+    # add annotation of theta on splice sites of introns to mcols
+    intron_dt <- as.data.table(rowRanges(object, type="j"))
+    
+    # check which introns pass the filter
+    mcols(object, type="j")[['passedVariability']] <- pmax(na.rm=TRUE,
+                                                           cutoffs$maxDJaccard, 
+                                                           0) >= minDelta
+    if("passedExpression" %in% colnames(mcols(object, type="j"))){
+        mcols(object, type="j")[['passed']] <-  
+            mcols(object, type="j")[['passedExpression']] & 
+            mcols(object, type="j")[['passedVariability']]
+    } else{
+        mcols(object, type="j")[['passed']] <-  
+            mcols(object, type="j")[['passedVariability']]
+    }
+    
+    # filter if requested
+    if(isTRUE(filter)){
+        object <- applyVariabilityFilters(object, minDelta)
+    }
+    
+    validObject(object)
+    return(object)
+}
+
+#' Applies previously calculated filters for expression filters
+#' @noRd
+applyExpressionFilters_jaccard <- function(fds, minExpressionInOneSample, 
+                                   quantileMinExpression){
+    
+    maxCount       <- mcols(fds, type="j")[['maxCount']]
+    quantileValueN <- mcols(fds, type="j")[['quantileValueN']]
+    
+    # report rare junctions that passed minExpression filter but not 
+    # quantileFilter as SE obj
+    junctionsToReport <- maxCount >= minExpressionInOneSample & 
+        !(quantileValueN >= quantileMinExpression) 
+    outputDir <- file.path(workingDir(fds), "savedObjects", nameNoSpace(fds))
+    
+    if(any(junctionsToReport)){
+        # get SE object of junctions to report
+        rareJunctions <- asSE(fds[junctionsToReport, by="j"])
+        for(aname in assayNames(rareJunctions)){
+            if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5", 
+                              "rawOtherCounts_psi3", "psi5", "psi3", 
+                              "delta_psi5", "delta_psi3", "intron_jaccard",
+                              "rawOtherCounts_intron_jaccard"))){
+                assay(rareJunctions, aname) <- NULL
+            }
+        }
+        rareJunctions <- saveHDF5SummarizedExperiment(rareJunctions, 
+                            dir=file.path(tempdir(), "tmp_rJ"), 
+                            replace=TRUE)
+        
+        # check if folder already exists from previous filtering
+        rareJctsDir <- file.path(outputDir, "rareJunctions")
+        if(dir.exists(rareJctsDir)){
+            warning("Filtering has already been applied previously. Introns ", 
+                    "that were already filtered out but should be kept now ",
+                    "cannot be restored.")
+            rJ_stored <- loadHDF5SummarizedExperiment(dir=rareJctsDir)
+            toReport <- mcols(rJ_stored)$maxCount >= minExpressionInOneSample & 
+                !(mcols(rJ_stored)$quantileValueN >= quantileMinExpression) 
+            
+            rJ_tmp <- rbind(rJ_stored[toReport,], rareJunctions)
+            
+            for(aname in assayNames(rJ_tmp)){
+                assay(rJ_tmp, aname) <- 
+                    rbind(as.matrix(assay(rareJunctions, aname)), 
+                          as.matrix(assay(rJ_stored[toReport,], aname)) )
+            }
+            rareJunctions <- rJ_tmp
+            rm(rJ_tmp)
+        } 
+        
+        rareJunctions <- saveHDF5SummarizedExperiment(rareJunctions, 
+                            dir=rareJctsDir, replace=TRUE)
+    }
+    
+    # apply filter
+    numFilt <- sum(mcols(fds, type="j")[['passedExpression']])
+    message(paste0("Keeping ", numFilt, " junctions out of ", length(fds),
+                   ". This is ", signif(numFilt/length(fds)*100, 3),
+                   "% of the junctions"))
+    fds <- fds[mcols(fds, type="j")[['passedExpression']], by="psi5"]
+    
+    return(fds)
+    
+}
+
+
+#' Applies previously calculated variablilty filters
+#' @noRd
+applyVariabilityFilters <- function(fds, minDelta){
+    
+    #
+    passedVariability <-  mcols(fds, type="j")[['passedVariability']]
+    
+    # store information of non-variable junctions
+    filtered <- !passedVariability
+
+    outputDir <- file.path(workingDir(fds), "savedObjects", nameNoSpace(fds))
+    if(any(filtered)){
+        # get SE object of junctions to report
+        nonVariableJunctions <- asSE(fds[filtered, by="j"])
+        for(aname in assayNames(nonVariableJunctions)){
+            if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5", 
+                              "rawOtherCounts_psi3", "psi5", "psi3", 
+                              "delta_psi5", "delta_psi3", "intron_jaccard",
+                              "rawOtherCounts_intron_jaccard"))){
+                assay(nonVariableJunctions, aname) <- NULL
+            }
+        }
+        nonVariableJunctions <- saveHDF5SummarizedExperiment(replace=TRUE,
+                                    nonVariableJunctions,
+                                    dir=file.path(tempdir(), "tmp_nvJ"))
+        
+        # check if folder already exists from previous filtering
+        nonVarJctsDir <- file.path(outputDir, "nonVariableJunctions")
+        if(dir.exists(nonVarJctsDir)){
+            warning("Filtering has already been applied previously. Introns ", 
+                    "that were already filtered out but should be kept now ",
+                    "cannot be restored.")
+            nV_stored <- loadHDF5SummarizedExperiment(dir=nonVarJctsDir) 
+            toReport <- mcols(nV_stored)$maxDJaccard < minDelta
+            
+            nVJunctions <- rbind(nonVariableJunctions, nV_stored[toReport,])
+            for(aname in assayNames(nVJunctions)){
+                assay(nVJunctions, aname) <- 
+                    rbind(as.matrix(assay(nonVariableJunctions, aname)), 
+                          as.matrix(assay(nV_stored[toReport,], aname)) )
+            }
+            nonVariableJunctions <- nVJunctions
+            rm(nVJunctions)
+        } 
+        
+        nonVariableJunctions <- saveHDF5SummarizedExperiment(dir=nonVarJctsDir,
+                                    x=nonVariableJunctions, replace=TRUE)
+        
+    }
+    
+    # apply filtering
+    numFilt <- sum(passedVariability)
+    message(paste0("Keeping ", numFilt, " junctions out of ", length(fds),
+                   ". This is ", signif(numFilt/length(fds)*100, 3),
+                   "% of the junctions"))
+    fds <- fds[mcols(fds, type="j")[['passedVariability']], by="psi5"]
+    return(fds)
+    
+}
diff --git a/R/helper-functions.R b/R/helper-functions.R
index 573ba2c7..69466b74 100644
--- a/R/helper-functions.R
+++ b/R/helper-functions.R
@@ -79,7 +79,7 @@ checkReadType <- function(fds, type){
     }
     type <- unique(type)
     stopifnot(isScalarCharacter(type))
-    correctTypes <- c(psi3="j", psi5="j", theta="ss")
+    correctTypes <- c(psi3="j", psi5="j", theta="ss", intron_jaccard="j")
 
     # check if it is already the correct type
     if(type %in% correctTypes) return(type)
@@ -109,7 +109,7 @@ checkReadType <- function(fds, type){
 #'
 #' @noRd
 whichPSIType <- function(type){
-    unlist(regmatches(type, gregexpr("psi(3|5)|theta", type, perl=TRUE)))
+    unlist(regmatches(type, gregexpr("psi(3|5)|theta|intron_jaccard", type, perl=TRUE)))
 }
 
 #'
@@ -122,7 +122,8 @@ whichReadType <- function(fds, name){
     # check writing
     if(name == "ss" | endsWith(name, "theta"))
         return("ss")
-    if(name == "j"  | endsWith(name, "psi5") | endsWith(name, "psi3"))
+    if(name == "j"  | endsWith(name, "psi5") | endsWith(name, "psi3") | 
+            endsWith(name, "intron_jaccard"))
         return("j")
 
     # check assay names
diff --git a/R/plotMethods.R b/R/plotMethods.R
index ccf99417..13e22bc3 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -178,7 +178,7 @@ NULL
 
 
 plotVolcano.FRASER <- function(object, sampleID, 
-                    type=c("psi3", "psi5", "theta"), basePlot=TRUE, 
+                    type=c("psi3", "psi5", "theta", "intron_jaccard"), basePlot=TRUE, 
                     aggregate=FALSE, main=NULL, label=NULL,
                     deltaPsiCutoff=0.3, padjCutoff=0.1, ...){
     
@@ -278,7 +278,7 @@ setMethod("plotVolcano", signature="FraserDataSet", plotVolcano.FRASER)
 
 
 plotAberrantPerSample.FRASER <- function(object, main, 
-                    type=c("psi3", "psi5", "theta"),
+                    type=c("psi3", "psi5", "theta", "intron_jaccard"),
                     padjCutoff=0.1, zScoreCutoff=NA, deltaPsiCutoff=0.3,
                     aggregate=TRUE, BPPARAM=bpparam(), ...){
 
@@ -341,7 +341,7 @@ setMethod("plotAberrantPerSample", signature="FraserDataSet",
 #'
 #' @rdname plotFunctions
 #' @export
-plotExpression <- function(fds, type=c("psi5", "psi3", "theta"),
+plotExpression <- function(fds, type=c("psi5", "psi3", "theta", "intron_jaccard"),
                     site=NULL, result=NULL, colGroup=NULL, 
                     basePlot=TRUE, main=NULL, label="aberrant", ...){
     if(!is.null(result)){
@@ -433,7 +433,7 @@ plotExpression <- function(fds, type=c("psi5", "psi3", "theta"),
 #'
 #' @rdname plotFunctions
 #' @export
-plotExpectedVsObservedPsi <- function(fds, type=c("psi5", "psi3", "theta"),
+plotExpectedVsObservedPsi <- function(fds, type=c("psi5", "psi3", "theta", "intron_jaccard"),
                     idx=NULL, result=NULL, colGroup=NULL, main=NULL,
                     basePlot=TRUE, label="aberrant", ...){
     type <- match.arg(type)
diff --git a/R/variables.R b/R/variables.R
index ccba65f5..74bb6d91 100644
--- a/R/variables.R
+++ b/R/variables.R
@@ -6,6 +6,7 @@
 #'   psiTypes
 #' 
 #' @export
-psiTypes <- c("psi5", "psi3", "theta")
+psiTypes <- c("psi5", "psi3", "theta", "intron_jaccard")
+# psiTypes <- c("psi5", "psi3", "theta")
 names(psiTypes) <- psiTypes
 

From fdece76ac075fecae4a8919edd5de4e650685ae4 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Tue, 18 Jan 2022 13:29:30 +0100
Subject: [PATCH 06/80] included splice type annotation

---
 R/resultAnnotations.R                         | 818 ++++++++++++++++++
 .../hg19-blacklist.v2.bed.gz                  | Bin 0 -> 7731 bytes
 .../hg38-blacklist.v2.bed.gz                  | Bin 0 -> 5867 bytes
 3 files changed, 818 insertions(+)
 create mode 100644 R/resultAnnotations.R
 create mode 100644 inst/extdata/blacklist_regions/hg19-blacklist.v2.bed.gz
 create mode 100644 inst/extdata/blacklist_regions/hg38-blacklist.v2.bed.gz

diff --git a/R/resultAnnotations.R b/R/resultAnnotations.R
new file mode 100644
index 00000000..ff9268c9
--- /dev/null
+++ b/R/resultAnnotations.R
@@ -0,0 +1,818 @@
+#' 
+#' @title Result annotations
+#' 
+#' @description These functions work on the result table and add additional 
+#'     annotations to the reported junctions: the type of splice event (e.g. 
+#'     exon skipping, intron retention, ...), as well as a flag for junctions 
+#'     that are located in blacklist regions of the genome.
+#' 
+#' \code{\link{annotateSpliceEventType}} annotates each junction in the results 
+#'     table with the type of splice event: exon skipping, 
+#' 
+#' \code{\link{flagBlacklistRegions}} flags junction in the results table on 
+#'     whether or not they are located in a blacklist region of the genome. 
+#'
+#' @param result A result table as generated by FRASER.
+#' @param fds The FraserDataSet, which was used to generate the results table
+#' @param txdb A txdb object providing the reference annotation.
+#' @params blacklist_regions BED file that contains the blacklist regions. 
+#'     If \code{NULL} (default), the BED files that are packaged with FRASER are used.
+#' @param assemblyVersion Indicates the genome assembly of the junction 
+#'     coordinates. Only used if blacklist_regions is NULL.
+#' @param BPPARAM For controlling parallelization behavior. 
+#'     Defaults to \code{bpparam()}.
+#' @return annotated results table
+#' 
+#' @name spliceTypeAnnotations
+#' @rdname spliceTypeAnnotations
+#' 
+#' @examples
+#'   # get data, fit and compute p-values and z-scores
+#'   fds <- createTestFraserDataSet()
+#' 
+#'   # extract results: for this example dataset, z score cutoff of 2 is used to
+#'   # get at least one result
+#'   res <- results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05)
+#'   res_dt <- as.data.table(res)
+#'   
+#'   # annotate type of splice event
+#'   requireNamespace("TxDb.Hsapiens.UCSC.hg19.knownGene")
+#'   txdb <- 
+#'     TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene 
+#'   res_dt <- annotateSpliceEventType(fds, res_dt, txdb)
+#'   
+#'   # annotate blacklist regions
+#'   res_dt <- flagBlacklistRegions(res_dt, assemblyVersion="hg19")
+#'   
+#'   # show annoated results table
+#'   res_dt
+#'   
+NULL
+
+#' @describeIn spliceTypeAnnotations This method annotates the splice event 
+#'     type to junctions in the given results table.
+#' @export
+annotateSpliceEventType <- function(result, fds, txdb, BPPARAM=bpparam()){
+    
+    # Create basic annotation of overlap with reference
+    fds <- annotateJunctionReferenceOverlap(fds, txdb, BPPARAM)
+    
+    # Calculate splice types and frameshift
+    result <- aberrantSpliceType(result, fds, txdb)
+    
+    # Add the subtypes for exonSkipping and inconclusive
+    result <- checkExonSkipping(result, txdb)
+    result <- checkInconclusive(result, txdb)
+    
+    # Add UTR labels
+    result <- addUTRLabels(result, txdb)
+    
+    return(result)
+}
+
+#' @describeIn spliceTypeAnnotations This method flags blacklist regions in 
+#'     the given results table. 
+#' @export
+flagBlacklistRegions <- function(result, blacklist_regions=NULL,
+                                assemblyVersion=c('hg19', 'hg38')){
+    
+    assemblyVersion <- match.arg(assemblyVersion)
+    if(is.null(blacklist_regions)){
+        blacklist_regions <- 
+            system.file("extdata", "blacklist_regions", 
+                        paste0(assemblyVersion, "-blacklist.v2.bed.gz"), 
+                        package = "FRASER")
+    }
+    if(!file.exists(blacklist_regions)){
+        stop("BED file with blacklist regions does not exist: ", 
+            blacklist_regions)
+    }
+    print("Importing blacklist regions ...")
+    blacklist_gr <- import(blacklist_regions, format = "BED")
+    result <- addBlacklistLabels(result, blacklist_gr, "splicing")  
+    return(result)
+}
+
+############# helper functions ##############################
+
+#' blacklist annotation for aberrant splicing events  
+#' @noRd
+addBlacklistLabels <- function(junctions_dt, blacklist_gr){
+    # add the blacklist information
+    print("Set up aberrant splicing granges")
+    psi_positions <- which(junctions_dt$type != "theta")
+    colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
+    junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions])
+    
+    # set to the same seqlevelsstyle
+    seqlevelsStyle(blacklist_gr) <- seqlevelsStyle(junctions_gr)
+    
+    ## create overlap with blacklist and annotate extra column
+    print("find blacklist overlap")
+    black_hits <- unique(from(findOverlaps(junctions_gr, blacklist_gr)))
+    junctions_dt[, blacklist := FALSE]
+    
+    junctions_dt[psi_positions[black_hits], blacklist := TRUE]
+    colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"
+    
+    print("blacklist labels done")
+    return(junctions_dt)
+}
+
+#' UTR annotation to results table
+#' @noRd
+addUTRLabels <- function(junctions_dt, txdb){
+    psi_positions <- which(junctions_dt$type != "theta")
+    colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
+    junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions])
+    seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr)
+    ### UTR labels based on txdb file
+    ### add 5' 3' UTR labels
+    print("find UTR overlap")
+    threes <- unique(from(findOverlaps(junctions_gr, 
+                            threeUTRsByTranscript(txdb, use.names = TRUE))))
+    fives <- unique(from(findOverlaps(junctions_gr, 
+                            fiveUTRsByTranscript(txdb, use.names = TRUE))))
+    junctions_dt[psi_positions, UTR := "no"]
+    junctions_dt[psi_positions[threes], UTR := "3"]
+    junctions_dt[psi_positions[fives], UTR := "5"]
+    colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"
+    print("UTR labels done")
+    return(junctions_dt)
+}
+
+#' Basic annotations of overlap with reference (start, end, none, both) for 
+#' full fds.
+#' @noRd
+annotateJunctionReferenceOverlap <- function(fds, txdb, BPPARAM){
+    print("loading introns")
+    #seqlevelsStyle(fds) <- seqlevelsStyle(txdb)[1]
+    introns <- unique(unlist(intronsByTranscript(txdb)))
+    # reduce the introns to only the actually expressed introns
+    fds_known <- fds[unique(to(findOverlaps(introns, 
+                            rowRanges(fds, type = "j"), type = "equal"))),]
+    anno_introns <- as.data.table(rowRanges(fds_known, 
+                            type="j"))[,.(seqnames, start, end, strand)]
+    
+    # calculate extra columns with mean/median intron expression count
+    # add the new columns
+    print("adding median count to introns")
+    sampleCounts <- K(fds_known, type = "j")
+    anno_introns[, meanCount := rowMeans(sampleCounts)]
+    anno_introns[, medianCount := rowMedians(as.matrix(sampleCounts))]
+    # order by medianCount (highest first)
+    setorderv(anno_introns, "medianCount", order=-1) 
+    anno_introns_ranges <- makeGRangesFromDataFrame(anno_introns, 
+                                                    keep.extra.columns = TRUE)
+    
+    # get all fds junctions
+    fds_junctions <- rowRanges(fds, type = "j")
+    
+    # Do the annotation just for the intron with highest median expression
+    print("start calculating annotations")    
+    overlaps <- findOverlaps(fds_junctions, anno_introns_ranges, select="first")
+    annotations <- bplapply(1:length(fds_junctions), function(i){
+        # only select first intron as already ordered by medianCount beforehand
+        overlap <- overlaps[i]
+        if(is.na(overlap)) return("none") #no overlap with any intron
+        
+        hit_equal <- from(findOverlaps(fds_junctions[i], 
+                                        anno_introns_ranges[overlap], 
+                                        type="equal"))
+        if(length(hit_equal) > 0) return("both")
+        
+        hit_start <- from(findOverlaps(fds_junctions[i], 
+                                        anno_introns_ranges[overlap], 
+                                        type="start"))
+        if(length(hit_start) > 0) return("start")
+        hit_end   <- from(findOverlaps(fds_junctions[i], 
+                                        anno_introns_ranges[overlap], 
+                                        type="end"))
+        if(length(hit_end) > 0) return("end")
+        
+        # overlaps but no start/end match
+        return("none") 
+    }, BPPARAM=BPPARAM) 
+    annotations <- unlist(annotations)
+    
+    rowRanges(fds)$annotatedJunction <- annotations
+    mcols(fds, type="ss")$annotatedJunction <- "none"
+    print("annotations done")
+    return(fds)
+}
+
+#' 
+#' @noRd
+aberrantSpliceType <- function(junctions_dt, fds, txdb){
+    print("preparing ...")
+    psi_positions <- which(junctions_dt$type != "theta")
+    colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
+    junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions], 
+                                                keep.extra.columns = T)
+    seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr)
+    
+    introns_tmp <- unique(unlist(intronsByTranscript(txdb)))
+    exons <- exons(txdb)
+    
+    # seqlevelsStyle(fds) <- seqlevelsStyle(txdb)[1]
+    fds_known <- fds[unique(to(findOverlaps(introns_tmp, 
+                                            rowRanges(fds, type = "j"), 
+                                            type = "equal"))),]
+    grIntrons <- rowRanges(fds_known, type="j")
+    introns <- as.data.table(grIntrons)
+    introns <- introns[,.(seqnames, start, end, strand)]
+    
+    sampleCounts <- K(fds_known, type = "j")
+    introns[, "meanCount" := rowMeans(sampleCounts)]
+    introns[, "medianCount" := rowMedians(as.matrix(sampleCounts))]
+    intron_ranges <- makeGRangesFromDataFrame(introns, 
+                                                keep.extra.columns = TRUE)
+    
+    # prepare the results column
+    junctions_dt[, aberrantSpliceType := "NA"]
+    junctions_dt[, causesFrameshift := "NA"]
+    junctions_dt[which(junctions_dt$annotatedJunction=="both"), 
+                    aberrantSpliceType := "sameIntron"]
+    junctions_dt[which(junctions_dt$annotatedJunction=="both"), 
+                    causesFrameshift := "unlikely"]
+    
+    starts <- which(junctions_dt[psi_positions]$annotatedJunction=="start")
+    ends <- which(junctions_dt[psi_positions]$annotatedJunction=="end")
+    nones <- which(junctions_dt[psi_positions]$annotatedJunction=="none")
+    
+    print("calculating aberrant splice types")
+    print("start junctions")
+    start_results <- sapply(starts, function(i){
+        # find the most freq intron that overlaps again
+        overlap <- to(findOverlaps(junctions_gr[i], intron_ranges, 
+                                    type = "start"))
+        expre <- sapply(overlap, function(j){
+            elementMetadata(intron_ranges[j])$medianCount
+        })
+        maxExpr <- which.max(expre)
+        return(compareEnds(junctions_gr, i, overlap[maxExpr], F, 
+                            intron_ranges, exons))
+    })
+    junctions_dt[psi_positions[starts], 
+                    causesFrameshift:=start_results[2,]]
+    junctions_dt[psi_positions[starts], 
+                    aberrantSpliceType := start_results[1,]]
+    
+    print("end junctions")
+    end_results <- sapply(ends, function(i){
+        # find the most freq intron that overlaps again
+        overlap <- to(findOverlaps(junctions_gr[i], intron_ranges, 
+                                    type = "end"))
+        expre <- sapply(overlap, function(j){
+            elementMetadata(intron_ranges[j])$medianCount
+        })
+        maxExpr <- which.max(expre)
+        return(compareStarts(junctions_gr, i, overlap[maxExpr], F, 
+                                intron_ranges, exons))
+        
+    })
+    junctions_dt[psi_positions[ends], causesFrameshift:=end_results[2,]]
+    junctions_dt[psi_positions[ends], aberrantSpliceType := end_results[1,]]
+    
+    print("none junctions pt1")
+    none_results <- sapply(nones, function(i){
+        # find most freq intron
+        # check start and end
+        
+        # find the most freq intron that overlaps again
+        overlap <- to(findOverlaps(junctions_gr[i], intron_ranges))
+        if(length(overlap) == 0) return(c("noOverlap", "inconclusive"))
+        expre <- sapply(overlap, function(j){
+            elementMetadata(intron_ranges[j])$medianCount
+        })
+        maxExpr <- which.max(expre)
+        
+        # returns type of exon splicing, frameshift T/F, amount of shift
+        st = compareStarts(junctions_gr, i, overlap[maxExpr], T, 
+                            intron_ranges, exons)
+        en = compareEnds(junctions_gr, i, overlap[maxExpr], T, 
+                            intron_ranges, exons)
+        
+        # merge, start and end results
+        # merge exon elongation/truncation
+        # if both likely/unlikely fine
+        # if one is likely -> return likely
+        # if one is notYet -> return notYet
+        if((st[1] == "singleExonSkipping" & !(en[1] %in% 
+                                c("singleExonSkipping", "exonSkipping"))) ||
+           (en[1] == "singleExonSkipping" & !(st[1] %in% 
+                                c("singleExonSkipping", "exonSkipping")))){
+            ## only one is single exonSkipping, the other is trunc/elong
+            if((as.integer(st[3])+as.integer(en[3])) %% 3 != 0){
+                frs = "likely"
+            }else{ frs = "unlikely"}
+            return(c("singleExonSkipping", frs))
+        } 
+        
+        if(st[1] %in% c("exonSkipping", "singleExonSkipping") || en[1] %in% 
+                c("exonSkipping", "singleExonSkipping")){
+            return(c("exonSkipping", "inconclusive")) 
+        }
+        
+        if((as.integer(st[3])+as.integer(en[3]))%%3 != 0){
+            frs = "likely"
+        }else{ frs = "unlikely"}
+        if( st[1] != en[1]){
+            combined = "trunc, elong"
+        }else{combined = st[1]}
+        return(c(combined,frs))
+        
+    })
+    junctions_dt[psi_positions[nones], causesFrameshift:=none_results[2,]]
+    junctions_dt[psi_positions[nones], aberrantSpliceType := none_results[1,]]
+    
+    noLaps <-which(junctions_dt[psi_positions]$aberrantSpliceType=="noOverlap")
+    refseq.genes<- genes(txdb)
+    
+    print("none junctions pt2")
+    noLaps_results <- sapply(noLaps, function(i){
+        overlap <- to(findOverlaps(junctions_gr[i], exons))
+        # no overlap with an intron or an exon
+        if(length(overlap) == 0){
+            return(checkIntergenic(junctions_gr, i, refseq.genes)) 
+        }
+        
+        # for the exons, check if splice site is contained in the exon
+        for(j in overlap){
+            exon_start = start(exons[j])
+            exon_end = end(exons[j])
+            if(exon_start <= start(junctions_gr[i]) & 
+                    exon_end >= end(junctions_gr[i])){
+                if((end(junctions_gr[i]) - 
+                        start(junctions_gr[i]) + 1) %% 3 != 0){
+                    frs = "likely"
+                }else{ frs = "unlikely"}
+                return(c("exonTruncation", frs))
+            }
+        }
+        
+        return(c("inconclusive","inconclusive"))
+    })
+    junctions_dt[psi_positions[noLaps], 
+                    causesFrameshift:=noLaps_results[2,]]
+    junctions_dt[psi_positions[noLaps], 
+                    aberrantSpliceType := noLaps_results[1,]]
+    
+    
+    # add distance to closest neighbour gene for intergenic results
+    print("adding distances to nearest gene")
+    up <- which(junctions_dt[psi_positions]$aberrantSpliceType == "upstream")
+    down <- which(junctions_dt[psi_positions
+                                ]$aberrantSpliceType == "downstream")
+    print("Calculate distances")
+    
+    if(length(up) > 0){
+        distanceNearestGene_up <- sapply(up, function(i){
+            min(distance(junctions_gr[i], refseq.genes), na.rm = T)})
+        if(length(distanceNearestGene_up > 0)){
+            junctions_dt[psi_positions[up], 
+                            distNearestGene := distanceNearestGene_up]
+        } else{
+            junctions_dt[psi_positions[up], distNearestGene := NA]
+            print("No distances found for upstream")
+        }
+    }else{print("No upstream targets")}
+    
+    if(length(down) > 0){
+        distanceNearestGene_down <- sapply(down, function(i){
+            min(distance(junctions_gr[i], refseq.genes), na.rm = T)})
+        if(length(distanceNearestGene_down > 0)){
+            junctions_dt[psi_positions[down], 
+                            distNearestGene := distanceNearestGene_down]
+        }else{
+            junctions_dt[psi_positions[down], distNearestGene := NA]
+            print("No distances found for downstream")
+        }
+    }else{print("No downstream targets")}
+    
+    colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"
+    print("done calculating aberrant splice types")
+    return(junctions_dt)
+}
+
+#' 
+#' @noRd
+compareStarts <- function(junctions_gr, i, max_lap, shift_needed, 
+                            intron_ranges, exons){
+    intron_start = start(intron_ranges[max_lap])
+    ss_start = start(junctions_gr[i])
+    
+    # found the most freq intron with same end again
+    # check if intron starts before splice site -> exon elongation -> FRS 
+    if(intron_start < ss_start){
+        if(((ss_start - intron_start) %% 3) != 0){
+            frs = "likely"
+        }else{ frs = "unlikely"}
+        
+        ifelse(shift_needed, 
+                return(c("exonElongation", frs, 
+                            (ss_start - intron_start))), 
+                return(c("exonElongation", frs)))
+    }
+    
+    # check if splice site ends in following exon -> exon truncation -> FRS
+    if(intron_start > ss_start){
+        
+        # create dummy exon find all exons starting from that intron end
+        dummy_exon <- GRanges(
+            seqnames = toString(seqnames(intron_ranges[max_lap])),
+            ranges = IRanges(intron_start-2, end = intron_start -1),
+            strand = toString(strand(intron_ranges[max_lap]))
+        )
+        exonChoices <- to(findOverlaps(dummy_exon, exons, type = "end"))
+        for(j in exonChoices){
+            exon_start = start(exons[j])
+            if(exon_start < ss_start){
+                if((end(exons[j]) - ss_start + 1)%%3 != 0){
+                    frs = "likely"
+                }else{frs = "unlikely"}
+                ifelse(shift_needed, 
+                        return(c("exonTruncation", frs, 
+                                    (-1)*(end(exons[j]) - ss_start + 1))), 
+                        return(c("exonTruncation", frs)))
+            }
+        }
+        
+        # check for single exon skipping
+        if(length(exonChoices) == 1){
+            
+            # check if there is no other exon within the first intron: 
+            # splice site end until exon end
+            dummyFirstItr <- GRanges(
+                seqnames = toString(seqnames(intron_ranges[max_lap])),
+                ranges = IRanges(end(exons[exonChoices[1]]) + 1, 
+                                end(junctions_gr[i])),
+                strand = toString(strand(intron_ranges[max_lap]))
+            )
+            
+            if(length(findOverlaps(exons, dummyFirstItr, 
+                                    type = "within")) > 0){
+                # another exon is contained within the most freq used intron
+                ifelse(shift_needed, 
+                        return(c("exonSkipping", "inconclusive", 0)), 
+                        return(c("exonSkipping", "inconclusive")))
+            }
+            
+            
+            secondItr <- GRanges(
+                seqnames = toString(intron_ranges[max_lap]@seqnames@values),
+                strand = toString(intron_ranges[max_lap]@strand@values),
+                ranges = IRanges(ss_start, start(exons[exonChoices[1]]) - 1) 
+                # end of exon + 1, end of aberrant junction
+            )
+            secItrChoices <- to(findOverlaps(secondItr, intron_ranges, 
+                                            type = "end"))
+            # only look at most used one
+            expre <- sapply(secItrChoices, function(j){
+                elementMetadata(intron_ranges[j])$medianCount
+            })
+            maxExpr <- which.max(expre)
+            
+            if(length(secItrChoices) == 0){
+                ifelse(shift_needed, 
+                        return(c("exonSkipping", "inconclusive", 0)), 
+                        return(c("exonSkipping", "inconclusive")))
+            }
+            
+            if(ss_start >= start(intron_ranges[secItrChoices[maxExpr]])){
+                # check if there is no other exon in that range
+                if(length(findOverlaps(exons, 
+                                        intron_ranges[secItrChoices[maxExpr]], 
+                                        type = "within")) == 0){
+                    # clear exon skipping, only exon is skipped 
+                    # calculate frameshift, skipped exon plus possible exon 
+                    #    elongation 
+                    
+                    shift = (-1)*(end(exons[exonChoices[1]]) - 
+                                     start(exons[exonChoices[1]]) + 1) + 
+                        ss_start - start(intron_ranges[secItrChoices[maxExpr]])
+                    
+                    frs = ifelse(shift %% 3 == 0,"unlikely","likely")
+                    ifelse(shift_needed, 
+                            return(c("singleExonSkipping", "inconclusive", 
+                                        shift)), 
+                            return(c("singleExonSkipping", frs)))
+                }
+            }
+        } # single exon skipping end
+        
+    }
+    
+    # splice site longer than one intron + exon -> not defined for now
+    ifelse(shift_needed, 
+            return(c("exonSkipping", "inconclusive", 0)), 
+            return(c("exonSkipping", "inconclusive")))
+}
+
+#' 
+#' @noRd
+compareEnds <- function(junctions_gr, i, max_lap, shift_needed, 
+                            intron_ranges, exons){
+    intron_end = end(intron_ranges[max_lap])
+    ss_end = end(junctions_gr[i])
+    
+    # found the most freq intron with same start again
+    # check if intron ends after splice site -> exon elongation -> FRS -> done
+    if(intron_end > ss_end){
+        if(((intron_end - ss_end) %% 3) != 0){
+            frs = "likely"
+        }else{ frs = "unlikely"}
+        
+        ifelse(shift_needed, 
+                return(c("exonElongation", frs, (intron_end - ss_end))), 
+                return(c("exonElongation", frs)))
+    }
+    
+    # check if splice site ends in following exon -> exon truncation -> FRS
+    if(intron_end < ss_end){
+        
+        # create dummy exon find all exons starting from that intron end
+        dummy_exon <- GRanges(
+            seqnames = toString(intron_ranges[max_lap]@seqnames@values),
+            ranges = IRanges(intron_end + 1, end = intron_end + 2),
+            strand = toString(intron_ranges[max_lap]@strand@values)
+        )
+        exonChoices <- to(findOverlaps(dummy_exon, exons, type = "start"))
+        for(j in exonChoices){
+            exon_end = end(exons[j])
+            if(exon_end > ss_end){
+                if((ss_end - start(exons[j]) + 1)%%3 != 0){
+                    frs = "likely"
+                }else{frs = "unlikely"}
+                ifelse(shift_needed, 
+                        return(c("exonTruncation",frs, 
+                                    (-1)*(ss_end - start(exons[j]) + 1))), 
+                        return(c("exonTruncation",frs)))
+            }
+        }
+        
+        # check for single exon skipping
+        if(length(exonChoices) == 1){
+            
+            # check if there is no other exon within the first intron: 
+            #    splice site end until exon end
+            dummyFirstItr <- GRanges(
+                seqnames = toString(seqnames(intron_ranges[max_lap])),
+                ranges = IRanges(start(junctions_gr[i]), 
+                                    start(exons[exonChoices[1]]) - 1),
+                strand = toString(strand(intron_ranges[max_lap]))
+            )
+            
+            if(length(findOverlaps(exons, dummyFirstItr, 
+                                    type = "within")) > 0){
+                # another exon is contained within the most freq used intron
+                ifelse(shift_needed, 
+                        return(c("exonSkipping", "inconclusive", 0)), 
+                        return(c("exonSkipping", "inconclusive")))
+            }
+            
+            
+            secondItr <- GRanges(
+                seqnames = toString(intron_ranges[max_lap]@seqnames@values),
+                strand = toString(intron_ranges[max_lap]@strand@values),
+                ranges = IRanges(end(exons[exonChoices[1]]) + 1, ss_end) 
+                # end of exon + 1, end of aberrant junction
+            )
+            secItrChoices <- to(findOverlaps(secondItr, intron_ranges, 
+                                                type = "start"))
+            # only look at most used one
+            expre <- sapply(secItrChoices, function(j){
+                elementMetadata(intron_ranges[j])$medianCount
+            })
+            maxExpr <- which.max(expre)
+            
+            if(length(secItrChoices) == 0){
+                ifelse(shift_needed, 
+                        return(c("exonSkipping", "inconclusive", 0)), 
+                        return(c("exonSkipping", "inconclusive")))
+            }
+            
+            if(ss_end <= end(intron_ranges[secItrChoices[maxExpr]])){
+                # check if there is no other exon in that range
+                if(length(findOverlaps(exons, 
+                                        intron_ranges[secItrChoices[maxExpr]], 
+                                        type = "within")) == 0){
+                    # clear exon skipping, only exon is skipped 
+                    # calculate frameshift, skipped exon plus possible exon 
+                    #    elongation at end
+                    shift = (-1)*(end(exons[exonChoices[1]]) - 
+                                        start(exons[exonChoices[1]]) + 1) + 
+                        end(intron_ranges[secItrChoices[maxExpr]]) - ss_end
+                    frs = ifelse(shift%%3 == 0,"unlikely","likely")
+                    ifelse(shift_needed, 
+                            return(c("singleExonSkipping", "inconclusive", 
+                                        shift)), 
+                            return(c("singleExonSkipping", frs)))
+                }
+            }
+        } # single exon skipping end
+        
+        
+    }
+    
+    # splice site longer than one intron + exon -> not defined for now
+    ifelse(shift_needed, 
+            return(c("exonSkipping", "inconclusive", 0)), 
+            return(c("exonSkipping", "inconclusive")))
+}
+
+#' 
+#' @noRd
+checkIntergenic <- function(junctions_gr, i, refseq.genes){
+    # check if start > 1000
+    # start - 1000, end + 1000
+    start = start(junctions_gr[i]) 
+    # ifelse(start > 1000, start = start - 1000, start = 1)
+    # if(start > 1000){
+    #     start = start - 1000
+    # }else{start = 1}
+    
+    end = end(junctions_gr[i])  #+ 1000
+    if(start + 2 < end){
+        start = start + 1
+        end = end - 1
+    }
+    
+    test_junction <- GRanges(
+        seqnames = seqnames(junctions_gr[i]),
+        ranges = IRanges(start, end),
+        strand = strand(junctions_gr[i])
+    )
+    
+    # overlap with introns and exon
+    # IGNORE STRANDS? -> decided its not necessary
+    
+    # check if distance to nearest is > 1000 -> intergenic
+    # otherwise up/downstream
+    dist = min(distance(test_junction, refseq.genes), na.rm = T)
+    if(dist > 0){
+        # if(dist > 1000){
+        #     print("intergenic")
+        #     return(c("intergenic", "unlikely"))
+        # }else{
+        
+        # find nearest and compare starts
+        if(start(refseq.genes[nearest(junctions_gr[i], 
+                                        refseq.genes)]) > start){
+            ifelse(strand(junctions_gr[i]) == "+", 
+                    return(c("upstream", "unlikely")), 
+                    return(c("downstream", "unlikely")))
+        }else{
+            ifelse(strand(junctions_gr[i]) == "+", 
+                    return(c("downstream", "unlikely")), 
+                    return(c("upstream", "unlikely")))
+        }
+    }
+    return(c("inconclusive", "inconclusive"))
+    
+    # if both lists == 0 return intergenic else inconclusive
+    # return(c("inconclusive", "inconclusive"))
+}
+
+#' 
+#' @noRd
+checkExonSkipping <- function(junctions_dt, txdb){
+    psi_positions <- which(junctions_dt$type != "theta")
+    colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
+    junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions], 
+                                                keep.extra.columns = T)
+    seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr)
+    
+    refseq.genes<- genes(txdb)
+    
+    exonSkip <- which(junctions_dt[psi_positions]$aberrantSpliceType %in% 
+                            c("exonSkipping", "singleExonSkipping"))
+    
+    print("start checking exonSkipping")
+    newSkip_results <- sapply(exonSkip, function(i){
+        start = start(junctions_gr[i]) 
+        end = end(junctions_gr[i])
+        
+        # reduce the junction width so adjacent genes have a distance of 1
+        if(start + 2 < end){
+            start = start + 1
+            end = end - 1
+        }
+        
+        test_start <- GRanges(
+            seqnames = seqnames(junctions_gr[i]),
+            strand = strand(junctions_gr[i]),
+            ranges = IRanges(start, start + 1)
+        )
+        
+        test_end <- GRanges(
+            seqnames = seqnames(junctions_gr[i]),
+            strand = strand(junctions_gr[i]),
+            ranges = IRanges(end - 1, end)
+        )
+        
+        # check for which genes distance to start is 0
+        start_genes <- which(distance(test_start, refseq.genes) == 0)
+        # start is not in a gene
+        if(length(start_genes) == 0) return("beyondGene")
+        
+        # start is in a gene -> is end in same gene
+        for(to in start_genes){
+            # end is in same gene
+            if(distance(test_end, refseq.genes[to]) == 0){
+                return("exonSkipping")
+            }
+        }
+        
+        end_genes <- which(distance(test_end, refseq.genes) == 0)
+        # end is not in a gene
+        if(length(end_genes) == 0) return("beyondGene")
+        # end is in a different gene
+        return("multigenic")
+    })
+    
+    print("checking exonSkipping done")
+    if(length(exonSkip) > 0){
+        junctions_dt[psi_positions[exonSkip], 
+                        aberrantSpliceType2 := newSkip_results]
+        junctions_dt[which(junctions_dt$aberrantSpliceType2 == "beyondGene"), 
+                        aberrantSpliceType := "beyondGene"]
+        junctions_dt[which(junctions_dt$aberrantSpliceType2 == "beyondGene"), 
+                        causesFrameshift := "inconclusive"]
+        junctions_dt[which(junctions_dt$aberrantSpliceType2 == "multigenic"), 
+                        aberrantSpliceType := "multigenic"]
+        junctions_dt[which(junctions_dt$aberrantSpliceType2 == "multigenic"), 
+                        causesFrameshift := "inconclusive"]
+        junctions_dt[, aberrantSpliceType2 := NULL]
+    }
+    
+    colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
+    return(junctions_dt)
+}
+
+#' 
+#' @noRd
+checkInconclusive <- function(junctions_dt, txdb){
+    psi_positions <- which(junctions_dt$type != "theta")
+    colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
+    junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions], 
+                                                keep.extra.columns = T)
+    seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr)
+    
+    refseq.genes<- genes(txdb)
+    
+    inconclusive <- which(junctions_dt[psi_positions
+                                        ]$aberrantSpliceType == "inconclusive")
+    print("start checking inconclusive")
+    
+    inconclusive_results <- sapply(inconclusive, function(i){
+        start = start(junctions_gr[i]) 
+        end = end(junctions_gr[i])
+        
+        # reduce the junction width so adjacent genes have a distance of 1
+        if(start + 2 < end){
+            start = start + 1
+            end = end - 1
+        }
+        
+        test_start <- GRanges(
+            seqnames = seqnames(junctions_gr[i]),
+            strand = strand(junctions_gr[i]),
+            ranges = IRanges(start, start + 1)
+        )
+        
+        test_end <- GRanges(
+            seqnames = seqnames(junctions_gr[i]),
+            strand = strand(junctions_gr[i]),
+            ranges = IRanges(end - 1, end)
+        )
+        
+        # check for which genes distance to start is 0
+        start_genes <- which(distance(test_start, refseq.genes) == 0)
+        # start is not in a gene
+        if(length(start_genes) == 0) return("beyondGene")
+        
+        # start is in a gene -> is end in same gene
+        for(to in start_genes){
+            # end is in same gene
+            if(distance(test_end, refseq.genes[to]) == 0){
+                return("inconclusive")
+            } 
+        }
+        
+        end_genes <- which(distance(test_end, refseq.genes) == 0)
+        # end is not in a gene
+        if(length(end_genes) == 0) return("beyondGene")
+        # end is in a different gene
+        return("multigenic")
+    })
+    
+    colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"
+    print("done checking inconclusive")
+    if(length(inconclusive) > 0){
+        junctions_dt[psi_positions[inconclusive], 
+                        aberrantSpliceType := inconclusive_results]
+    } 
+    
+    return(junctions_dt)
+}
diff --git a/inst/extdata/blacklist_regions/hg19-blacklist.v2.bed.gz b/inst/extdata/blacklist_regions/hg19-blacklist.v2.bed.gz
new file mode 100644
index 0000000000000000000000000000000000000000..5d87eb135971a4b5e1010045ab5c5bbc4736e1ab
GIT binary patch
literal 7731
zcmV-39?an%iwFo<=jvMk188S4IW1yrVPk7-X>)Wgb}}wvWn=(_eMzouNpjryY8ADF
z=$jpIKuxGg*059%iU|-TK?o4Edz!h$n?1LS9DD<w<+&$15AV<4|MkCW`Si2fYNdQe
z8|PMlfBN$u|N58z_2)nS{cr#Fmw)*`|M>Gi{{4UaJ*cD2Hf~T)z0FbYnX8N<(59o-
zK1biH1e$&9oLoO3-PIjqp26RpXPIBEpS!@q>#fBh|MP$RzrXy?zy0Tb{_TJN`EP&z
z-~azOwbaSC!M|=|NId5R@eccPwT{Xs)mA;RuRTb_zw4$ixK1zpUHseZwAC`(nem2G
z)MUi9t<8lcxOM`^!DBdu*fnb$yCenwae?tq>%&tQHUX&>uVE)0jG?8j?cCz2bH%Qi
z<MgpC72{eQ4c?+1`>w045>Ew~)h_4W6FUuzjg4OIWSv=etPntKQnQEqT`^ezXV}%k
zOmYrl2b5}0B+0Gei9v3S^X145lHkNK0kX5(czk2q<czT|_xtSCZ0PoBUFR47F0)HO
zTrFO%t#Pw`k6&Fij>lMS*w<U7Z>|Z18=1-D_Qpn!)vnvlN4@9NSQP+2F|F(pXNO>#
zeL?Dld<p7p$3C?~^u|ng9;GkTxnq+NtbFj0;48+|H^yXNwC{W39_gpeSofTIdM`Jq
zeb1BsI>diWQ^$gKv&~7xZKFrdzU_7!b<em@2u?osIT~bL2%1)w9ne~5#g78u2Da48
z!H|lBUhN`n9PZ5nu}dq%o(zbj(m9a;)xYb-^wb-I=9FWfdUvg{=m3x+cJo6djgawy
zoCpUQ%7Gpn==vwO*>w{Gjb=70uDP<?4JOdYH&$)f^l>E;&1xqMx4~ImIt_&5@j{?u
zuatN)&>lvyBDr^ttO!j%X`Kgah>ZIr8(j~*s$Tcg9SjJGP!8^ZnD*2)o?Kg$gr2(h
z5%oCbfYe=M`xA7t13DyM!KEJ}u<Q`5zmL0N=am+`*4#3Y+Gmqg?Yr3*h1^|BzmAL9
zNET3QS#k9MXd@bG+!dsr=9FMku~S!2?{vZi)P^$>F92|jYC`I0aft$Srxkz-VI3Q<
zbtuO@pMM_v`1NsnGm-_=`2-5Vm!(=`#i9f3W>$aS9nuoFWOqji)B|F?=-IvdCB1uv
zq$0TXMSE#7F#yKS#1wh`#)<3DJDkEp>Xs$}aGeHC>XBgsU?mBtFkGlg5RBr-RA&q6
zNZ=22$IplD*=bJNNd;raXb7AJ4Jx)=C=&qHsdGUccZhoY_Jgn;(v$!~u}GE*g0IJm
z0t|bgA-?Ci(x}##`G@Hg7aG9s{zdVuIGg~>S(&6_|A$J4|KZD|E~w<OR4_UG{s-f<
zM?KDj0(Ea#8lfxhUhM4yz3*zT;E^qtDE;8NmsYpN^kX_uy<$?KfjPaZe+JH7$b5or
zpE}BOeE^)2*&O6y&x?0^m}UvDhT==zlPAjYM(A^+j_Rc|7+v;G9edj?GV1I_RLrYg
z%z-%!TMM8%k3djYTo0U8f0K{awKfFf%L|M#dr-$XUtRzjjNWxzugl=hYex>UyJ~)!
z;Llg<gjaCugmV-x2hg2{OzM_}GH7-Bu@9IALB$;1T+zb}jqn)(%rB3GP98^|z}XU=
z_VbNHM+4f+oU!{mNf_JC<tR4h4W|)-8_<c_q#p|Aq#w|k(%T1>b1QAe*>b!Aoce`&
zXWZdfsV@Qt_QMH>0KoM^6uVM79>DZhBQ(AF+-sqCSnBR!!vHM5`V+e(9tL2YF`(8W
z^?_h#?#ke;E010Maz3HF_mCxY-T6Z@A((m#>YDz-#i^7N3>wOIt_f0`lnCfBwc5=;
z>0^pvVZ}y%@F(oX(99<Ee6PS#=ITYE?i}OAAa%))0I*wro+h=IaBPmMgOxvWhL%?X
zEHj{a6%QadX5CjmaUl`x{6?I+%EAVi-ol!D)wt{kQg;w}yJ~anFz)Rm_YHNGz+Yc;
z-}YYg_MWTlcrl3vfOY&XK6}Pg6F650sF-zaj@xA1{r>9<MnkCgJn7Z(poXsC%~kVd
zTr?MM@J{oB4vZDEy{Hl76NI&sbqG+sJX)=eFy#okdCNwxV(|L$IxU^Oa>2DxuN850
z=7bmm7+<JR1VT-KSPC;adr^!c)Px1YqU2k6L7fk=L2z^ZW34026yS@6x?n=Yh9EEp
z5`UI=guulNKs{ATEWtISrb8+od9Zb@TwMn;P>-D3`KC#|+Tj4zImi2F!_tpC7+|%0
zA8pvta*qP|1G(!~{D;5=;z)%GFoSwmx4Uz<9SSADDc*;JRaDdy;1;`1xb<4>CV=iW
zZ+arbjss}LzboXW-gTVn{S%lfIq?XVpRD-;Yy>BZ(*|$GK_l>jY0o1Y3P3TEoVuDk
zrh7qS^^>}$;w%7PRQkQn2pLP@Pbsc0xo0+A>A4VPl<B^3(@lfB<iXYxk5`@jX`Is7
zN1LbA9FhZrE9MAbet9HLar`=fm!MD~kAfJt7Vj5<jc~&qMbNCC&RZca1^`LFlzMP;
zhRz2SxOvF~3ttX7fL?OBcc9wDj1rhd!tv>`l>p?y&VRa9#qtSeuKAtzEG`1C^<uSa
zC^i6Xn$cjpQ%ecpw-WB68c!nFxiSe2T52hQi^h>Udu+BtW7e-E3?04bq&&}m9Xh!N
zjtBb_j9W>rZQe>Q!U*cBaupJ|u`igW@s?{!<Yv4ncQ!%y1K&-44SKk|oH?h@I1dEf
z8c+x7ZX8Gg6IMA;phIL5tX#J_m)B76_E`Dy;55n#CusTeF0SFY005eC5<*nSM@rzG
z1{WtG)C2-=wcTD@tTMsP1@@&qt#E4HUcTLdODY}-@a5OBbwUXMaFPg{cgF=xU<}4P
zr{eeoezRTZ<gwWRns?(u>)DUmuXkiKtw=8lmD_J&=D|e3?ec_tdFqnd0U!k^8U9)~
zB->ptn<IZD)b8+PUHXdrZrW#uM8Hk-#YqdsiD@VB-l~hkwv=tx@`N_<E)4M)J9g*x
z!QKKX92Qw1zf!gECW<IJkXGXNH>jmSr%@&vkV`uVa+~ER4{3ghPJ^Od63EXjC{!s$
zp37_3wcwZzJsFC&zi>d=GFAfl-2jufPQ~Yme5{bEdu1tI0?3$cl@W;h(*4$#NI58X
zR=opI&xLEn$Z=BylP(3zghH<AU2}Y!g!R%Xh?dK=4DwlV7>K?c^;-5&@FI{mHkiY?
zUKS_py_-vhS{Sv|egHat591AYC}%|1STU8}QO!eTikDwCaQYG%$lIrtoUM0&=G(uK
zXN`F!YG0%TER=a4`K8oa@(uMR&@fVh`LRdIKCR9#8@T@XUx~W+B9XG<9*tDj^6kl2
zK~JLnB{`)+N?ff-f0?O{3`_lGGxg?@iGqx$Y59PBc!k+f%I7Bfq8)>>*V~*xUh4+s
z#^wn$O(cv__apgJNB!cvAmu7osE!utp_bf2dkJ*BQv#GRmD^Rv{9?zEqK!thy~iJv
zm&RKeK=sZ6#yZA%uW@WP&iF)jQE`r;@qy^Mv4z{NHpg##paj2Ugyv_9)UK2v0yiR3
zZx07Kz)T~ZgSiSN)DnQ6CZB=FWU)xC0C05$pfVH~SpXOInjO&Ol^t$!fMsWW?NBr5
z=bSc!d3@;654n4IwVEw8Xrw*k21==mvV#hFAJ*xix6#hPfd+YH*q!T=zCk@rK3M=8
zWgnl*+~Uw60i69iLU9{Ex)V;JIj|!Xkn^KmY^!q)DH!8v0`)kho8(<3gow7&Bxf9}
zx||X{KVi5;+7@PQnqxJ|P-b&$d)(|w2e0F{iG;bnls=U|66@5IsE)VDtT*t^>~YyO
zNLZHyIGHpiUvr0=;+!;~LZpY*&Wx#41q6>77FQ>~y7ieK#xG;f5x5ZM{H)t%Je9yD
z*US$#&<TYZz<Rn#y-mJJc|x)=K{M2k0$dj&1Qp8qlJ|ICr+b3BucAN%PUD&%2Giom
z9cTj1{{reb&;J6I-3jSQ@6_`S&QqOCn_)1?)U5P<%}n(T=8*4{c2jpnFN=Gm#mGBE
z$~LnFYJb@doq;$r1)#!Kh}(H!!}=Gvr79c%p%%BnucO)@ol#stW|lU+5ONzhf6-=X
zS5ZEBPxH=qB)>!!&X!s7JFA{6mlK9O-1WUbjc(;gWv%Nt<Cj!SOo))`^AJFV*nAnk
zrMcV+QX_+t>A8Klb=gdIU8*F3$n%rlLuZm;ntleb%XZu^1Ww#-7Q1YvNRCree=B&a
zDX_$)bT`aq_FaStKjE-5n|(+2UHmA3)ww~11XoI3-&QpSb^c`EF&1fd-fUG#P_rGU
z6p)HrD{}wl+BK63o9R3fU<B#ig>!~=QS28W%`4d-XgF2zpR=V8{?0ozRYSzl49$4w
z6D1qwNSx1Ak;1iV#9`ih?6B6XO$y@C5y0t9-Ek)>e`aW%#i_D~G9~J>0GiD{1(oT?
zVg6Aw^Hgk<INUVzT`ZJTsQe+-&YZ4F;jlJCf~a1m1lU<BRS`LW>Qa(O?MF5bx3;J5
zSB*<M$r<=urYpB)d-05%iX-(jq%wv(-=Mat3zyfqyUb2pk*f2>{JQgfn4zx9XZj2L
zys|K;iQ0!#dJmJSo8A?`86M}uYyjO^0N<#qY!!;H>Wm~%2aB~x1#rPhQW@9?YD7jo
zb;OQlmg|>yoqa#K+4FSm!xUH59dbsU)lKT$67XwvcWZ)5<+Mr>DE?rD4ILR`rQ0CO
zVWwe~=?n1XsVosw`e3)o`k*q!SQ&)<`W;A(KhDz%ud1ao!923m=;*UYV4be2{>Y)Q
z@49ibDkEd8JM9eYRbOp3K$^u!UXweYz}Y^e!roWv$SKYtc~~ke7bBBAoYRohrhpZJ
zQ}IacJ2VRb>m(p`?U!W&OWn>t^G2F%nr%zdFOz$QGIwlg>HQpNcj3*r)nz%~59Fja
zDs%Om)=KIuD*l7*<t+INYXS!}Tv@YErz2d}$V6dH0H?NHR<kG`$5^L-fx#U65I@x4
z&8`ZnWEsFp%^C1;jMdQt%9sn>T{dL6UHRb4W+Q#LlrCM5c`Xs}AMR-2`u?e*oNfny
zXcEUAzOJ)~p%*Vn+HOmoBDGlQ!rh4@7=fqBzJ)w?={Fx<7Fy$DGD|*?n9>^nrn@$v
z+ZCo;hn824?Q%lKRoYRdvf->9DUS-d;)=VjEeaTW=#r1ZASVvUI9=f!rZ%~Je(I0)
z%%^KU`#tB<_ZOacB4;0BxuC*^ssl8vFOo~!>lXKfKiq9oT!!i!0d$>YqES%ytDqfT
zH3Q>poin22-vK*y#~KGp-HiD2eo>r2+buAZy{iazAUk0&_?vVyqG`15Myr4KkTuMw
znTfY8c=v=2*jDIpDGy5@C%Qp^nsmVyj1h;r-PkvV3e8MepJTnVmQ!i*hG1r`c`kz*
z)5nj>8gOP?mWbCN3oteEZiibweIh%sj28+nQJ<PU5n7(wv%vOWMqH7K#ZmP>H()NK
zTXbnzOzCl)N(WYxpuAZ*Z16Au3DlX|^5mU9)kWseS8jir@szNJaYjwWdTS9(8?rEA
z73LZXsj%5}+8B;BTZ7GKP7AGu;3gU)c%3aPLSvxQ(yUISr%CZSf-F50JSY>Y)u6Y>
z1f+x76!Q#!kok4UqN|WXS1tZ;KmCF6p}r~*&SuU(wI97<Wv!26viQ2Z;_G1&Dcc8I
zNRC{BtiY_{9A3rd0GukzHYrQ>RRnHjKFkSuDVMmD-Ebh;<@CY-aE;Or=AH8Xo_mim
zPTJ>|rrHnuAI(_uO<8@RXD|~kog(%-I4@JoWva6~3+8aVu4bp2$X$yV{@5>uKZZ$X
zh;AqQd8j(04%uGkpRaO4nJm$qIHCr!6c!^=)b;m=uF!@;31E(i+a#QxG3GlM!zfPR
zGSNu>_tEP$3K@%@LS<bmtKDK<ywqAIW#r>lL8*#Fm&uM}#`KSN-i;30Kiok|=b^Hn
z!~scb@&K_+&m<~!UL;ouCcD(0tTlUkK%j?OH22Ijo#^YKQvfh#zIL7I0jduK=tkXS
zjrD9>F80~8QU_L<4jpO&Q{lo=vFEe~XvR$yWjiYM18}!@RJiO&9pmg6+mp;IwG!YO
z$;N_=tAu+3bB(gb0b}Qg2wJ*2huLtE7+<b)J9dgSwJ^KvFXx7H=^cqITwI|hip-?y
zJ74nkoml}|<RAC2PzkbZyvRMCTiL(Em18pZwyMPsf$?=jN-eJT5NJR>D-Ddi#iB&~
zKqo(k_(eS@P=0XOGHWV=Kx90k!R>Nz5;c>LLoOtpIt2oCqkL0nU`MJ2;rvN<$hq9U
zk}wd3#`K_w!5yyrs>KA-B=i-ssNZJ*-r<b#9$KwhqY#}ePn^U#t}^*(1R8HIxLkf&
z07O}1&mKCkO0@!+n<uziib)`IqmetCl!LIaGgGBU;JIt~M44BVExB-*l-~o?v%1_M
zb}0sxs)|1-yfT_Pjj>~clTEB;*VO^E$Fx(0?I&?V)cwxBFr@cuVYXd{_)lLlO~SSw
zi{SP)HMafAj}3P#BZYI&Od2&42VJYF!Em5bNQdi2H$L|%FSomp;P{NvZk!4zTf>xD
z54OeFU7ZxxF`11cN`m8=tYkI;*1_(ER|T&CW)4Sq;FYoiVB{V%`<gO22~0F{P|3Y&
z$wO4l%{}O;tIEQvtfn;{A8?u4rDIQIJnVznDnoBs>G8?xctB(pjOrqYj2<tp$|><r
z>n6V{xI?Kh{9lgoatC)SS&e8Lcbd3Y?mtegQEQpjcg%sGv;)H&oMwYg8XD{MNHlp;
zlO6*35_pCjFIriK*UT89lYv|*`A;CEqqy?vE!+4E;}|Qd^9W|vl>#)fl<@TY!s!El
zqBr5i$af=|OIkKTc3H*@$~Vw(CX7}-KaqD6K8ZL9ZWdctr)q#}TzZqaWpYS##=PCY
zag8W*4Z!(5$-G_NjOKNM6-+i7mFuxMM*^7`$xkz-ENwby^7^uwkUb`Ntr<*$EK~v7
z;om~2E|=ssKbtx7VPsGMy0K7sQfq`!is^s?=6!x+0#gJuS)Hw0oJ||;2|MHVr9$D?
zb|GV4TKfFa=;4$rCt%WRAv>e_oyo{m2ws9L4QNJDo@!gMigshL{6o!5iCppAWINj-
z4Ts2CmlS}o$BsNL%w4l59fwEu1Ch&<CKngJbR8g<l}PS>u3DmM@^;8Q{q@l&G64??
z#kQ0dNOZG^1BtyT`!<mAaZZ{tRp~lJ?yCtij2)%26S<OCiqSG(_9&6Lt7kaipW;ho
zcVgW?UEhe7AKRYcn3X}}qPZdViwb`OWnJ1xgiMk4zgf8MZ=KzxLr0IU%tj~il$VT5
zWCCyCvR{Fc-RPz;qgo&N27mw6g$rpZU&^R*>k|cy(Pu2VhZj@0)f&La?!{fJjW&AL
zoO&@kHx3N5bQ2%j`mjYy__<E%AH{p06S@#&V5U;8&w5|6Sa~C+#WqdxgrCyz-{*k)
zf|Ib*j`Hhs-$y#Ien#4_@KX;}d%_G(Co8nL*fjw8uT~!HI0jrxx7L_^RpbEZeE+^k
zMG?;cStN{WSzqty_Ie)tPJ_DB6oYC#^Y??cmWo<wdzq*onMW;O8mlnU2X=aH)QPz4
zOgRA!M>6!lD40&S8#|c)G}WCZfdfoz+Hml*PRVWqT)C?L?rcSGpLv>G5)seKWfIQL
zDD~wNmSOBoGGD^_7&c*jF{!#M^>41!!9>~%!w}@V8v3XDNfBff925dwiaJ5om}AAq
zT@-LTTip+Tr`_kR2T-r$--SxKYD7nE!MJ-@w0*3~B%eKN>$muIg&lj9z(hgG>`Im5
zH->VFHphuTTYMlU>l*^vf=L=FnCq&oU`A892|*U%TFgn3twcYQNuMF}<=+$JTZLT)
zMPpCYT+twNE6oR>XBI)w^evj@`*Ty;Lo1Fup|z8xf2?TES9L3}pL0%o_X5B%(L^v|
zyHEAyYw~^m{=oU}5Q408!s6F6)B<vs)@!<3c75JsRv>2C8Khl(+=alTb1fFLDKJ2g
zUk$q*wTmGz+CP{)+u`+sG#u$2$B4DkMEPNdbQ`ewMI>AR^n8-{em%elvgSZ55>C~!
zYp=8$C}~@j223_Jv@>*aqXStgF*Y`(0%*)P&1v^ls}k&VLf9EM(%K2E{9$EE99UlE
zS-0cG%0H#F@IziR9qgf99{Li*K-MyxT9y140Pl+Vopy`*JHvoUn<cS|ZmU|0z!>{r
zs{Ijj9h<A91arI^F#?lzyO_?PY@8d7Qqq@XqH8FLs~gFD*$^ZGk4(c$XXftnS}8F{
zFSjeGn^vju1X)QFX<hly1V-m9<+XYwa^H?3x0WWO9jIsBG=j)br-)14KN4bF&zb{j
z#-0l9l^Z0`OsBH4SfRY;InaE)0?$5A8q&BUb77J=f6_@K>mt|n@ykk32$V(K!gd(*
z_4v7C=T=)~mpZw!fp*qbcd=ziQ*BS6EFvAmBX=#)HtSQzmOng@8BBC=(N6<q_1r-m
zFC<$Ta?3}JKkxaTIJZ69mC1v0HV_%F54pSQi;qMjzf1X1sruS;?9sCBJS4IkyM_d0
zVq%M#(OZ?6fkwWGVT!3ECDAqW#OA{a$P<~@Z2Y`nxboO()70n$mmjARx2BCWR(ya@
zmm$%}DltKXa-;lvqGg`g+xzw<k&&4A^XsS{C@Z&L`f63!Z-I=N-eJ~u{dA8UJu^dW
z%B+q|L~h``hxM924T}>eLDM%`0<Byp!=|m$p@^DaNao+a;>_7^t|rsF-d?gUkhw6%
zd>@#Kio>j(F^Q{PdG%NmlB@i%HQSa*AUbA-4lD@G5KvZ+LPrW)n!Kd4eRCy<{!w*g
z4rE%e2e;3kB7we4y?Joy0T<VPxh=RyMTUWLms^dNE-`!}tKRdHkX#3<@%gsop^KMf
ztkU}VE?7e3)e|!IwFEUpeT*lNIc{tpE>GiHXP8@P?N={X`Kj3KV~9cAiV_jc-1z37
zkJg7pdu9_am{Gcgh;mDWP1%%09w@h%8Qy$X>bm92Hi8s*_0o$~yZNtE(<m?LaV#@%
zHkQ<!GvO=is1?N5)By-W*gA~Cc-;z#vX6=d@s$384X0;cM-AewI%{+WWnyc8!Vi?b
z+U8|b-=)*@Aw-J;mD~_yNwWJhRkB7m%Wy#;Frhc;Llyfe2ZnjK3dOdojv@1?kcEzJ
z<LKM*nSrh^7Yvsm#s&#jisIsTklhp=ewQX<paZYro{1TM-_`_LuJLtnSARL*Mcr2o
zm+p;cEl{diZd9p>YcqGFf6A>da9OWQpaplpBvl_apko3l<hK5VFj3}{CK0Oj%cGdP
zXMH6$lb?&e+{&W1Zql>J-LKN0Xqd1mxu@$Bet}FhI=D+E1VFFo`*&`%!8j&a`_TOr
z!#TyZ$~vFu-uT*4bW;?{#uZJSH7Y+@;(ojKSF=lCvcwKQucb2d(Q7psU1V;>@@e>)
zX=KIzm2nu=;%-FIu!oaGU}6S`u5;<61SULt=<JrGkihuIqp-iduM%z7S^~`TzQYU9
zOj79KS7a1^Ah66uMJ}AJly`3I;oy7qHLgec`9R1jJB@0mIlzZq^b{gGEdO`&Ns(7x
zYd9dgp7F)1pBB8w<zt>`Ia}&a^VwqjR^0i_^VA{P-IXA)XA8uf0@*~y@j~v{>g7k|
zzBY2{?2ePf5+}!F{7{X$=DX-KCd2U|kr|CPY4Mi@*~*UL$8C&gWLf)B<qW^sXDa`q
z0RDX~h)R`XvhPMnTe@?NG`Z|4Q9_Ykx4QcKOM;ZXO~+4u_U;Flf<{DOD(@~E9O(n-
zZ0czC$mORn<nbb~zkar|hCfk$=a}qN)vESVY0b_P8@ayzs3R)Mf4>#~V5>#<+1Won
zUEk@LX`tf|X&*%ynfWIBgDnYE!7_{adi)=e%p85-@U(o1|A6}XKG{0hC_O(2e|*^X
zG`t~mzJ71N4g~kNzL*TT-i;h2bEutYgxcKv@oJt5wG}6WFg>u()!8r-9^3?7Rf`5N
z#jji%$eq`PyzXI&D9B}NL%FlhtK&(zbS%!-(0aP28Gl69^-tGnwwSeCww9Ff5-s2J
zJ;#rvNSlk#{n$z9lvzrc8`m`pCYR~QN^GN8aGRhq-%6z{6O(LrARV^)h%3+whyTv~
tD?3|0@7TDe9n)X=iu#@%(~gXM|0_R(t$JX7$THj6{|&+E3w>gU000{4D_sBp

literal 0
HcmV?d00001

diff --git a/inst/extdata/blacklist_regions/hg38-blacklist.v2.bed.gz b/inst/extdata/blacklist_regions/hg38-blacklist.v2.bed.gz
new file mode 100644
index 0000000000000000000000000000000000000000..a4ec858104cbe907e9056e873cec278cee11066a
GIT binary patch
literal 5867
zcmV<H78L0piwFp6=jvMk188S6I4xpqVPk7-X>)Wgb}}wvWn=(_TuZKHxp5pHucDX0
zKa)GqK%Q8W+rzeCv<eU;K@ebg_lk(*^QcZd2<Qsng?M?$Wd6xN|K*>xJmnd4m-77a
z@BjPDfByE5fBg2}KmPjTU;p<Xf7V}njHta3%(EuIAAbD)mtX$$<M+S)_Sawj>wkX#
z@%O*|KXpf2qMo&nrJhqX>ZrRmOC2>Z9;vOC-7U4XeTJyLH9ObZ))|YV5zRhc7ohgO
z+KeK!&mQl?R;*<`W4?@4pIP-Kvz5062PX!9VuoAn`kXZ?{(cq~{B5XOs&*Ft9<y*H
zo?1ja(O!q^Yc{R}{+-JYH_JL2Z`N8Jwc+=)4FG*@N1X+N$lsl#M=y@LY6)$_s*p0f
zW4rV%20`=GKCwQec6^qd>)sJsBZ6%g>vWs}&K00-JL;ialL0XNXVn6cj@*3wK=J$4
zs-{qc`2TX`UU9gm&O=>Mt&~<Q+15f3#WK@qk5<bO`mUkV?e?iXIQoOqhkxCtfH<r6
z6a4aaXxlSS%Ts%Le>-a!tIVlE2-=%EA#bl3;$gGRsRZ>viHdrn54&`I>^ez>46kEO
zT!TJ^KHE~gefd#W#jsV!Xl!@lKwV&Pl?bNYMQ9@om+Ge0IqkSSzxzmlfULBE+{WnI
z1ft<r7nD=xN%9%h&$*W#Ye<euKf|st&IQ(>=Ls1*>>7hIw{$pB9HRh8Zmsnk8Hb?T
zV*tvSvnF{!cYAW@dZd+WIr|rKf9fi!)at71J8ElbW&@Vq%oo&XHLVc9GArDl$)UK<
z0NWY*PLoEhnZ|82w+bNF(PyqN&>H>a7!MlyfVncB%OK<EP}T#>VwPz<4N|4w1vpM%
zKwYcJngjG<C(!FYT^1W4%!-2AMo63pmN}B3jtNCf>X@t9x%PR+=86OGoI5IwYLN!(
z(>Vg;!O3otYl3Yyl+;}=6`N%jYs|6rSpeZ=3e*)T_5{vN8}L<HpXs#FC|D|VZL2BN
z)$i7(UgjDBv|~rbX3}LX0)AuhQ8IA??x27QC$)0|0MpMEf1EA#ERnPDB*Ps~)dIAe
zodhUr%UKp$M!S$aT4<(3r+hQ=Y&9LCLSycaISl@andBM$J2G`<l2`Rd9d281*O6ZE
zj?Jl<)JX6roHd<f%$zA*3$tw#?g@5l$%|U)p}$jmZPIfvP1D6kpw2z@83giEex_c*
zgM*(wT&!?rZa$dW()@aXUt<m<sSsOfu7Y6J>=UTie+f7?0RMATIGb`b3A&S5P*=wp
zCl$c?Uh{z;*L4N~V%x34k}pa2#>X`)02@b9D*$kgIf|g}m6m98!|YrUx;5!;fbBmk
z8p8M{DkfqDnsKnSEL2PzmWuO_59+B2pa|ksu<-Xd?JFogE;wEAyJ58zAe^~$4u3Z2
zf*~wRz6~O+=A0Z-WwA(Iz1v?0W2W&O%Q`ut9<4j#fbArJ)aaR{f!MZhCUAyS0E4*p
z79fEKjg}`62gQYEq_%Xy#v08<6pM@@Zx~>?IBq>sWeTC;Ma7%bwy7e!3&uBh{9J#@
z(Z*1k1fGSZL2K7k^~?FQn?Z%NB0_-n7Gi=0tCjhj!*gc8qwYw565;4Di?<(bh_p9=
z%de5ThjvdssL>vv!X7Uj9cU5fqhT?jG_@;(F^HW2;~A|e<Z{&ajh;AJ+DHQTO#zv(
zL2$?eC<;H#SHyF+G~n6J9}L*DoHy*6i)Hb%69!*vjj+W<vVj9~a00n1Z~&Z61$9U2
zNrKn#CLTz_QAAA8%}xi54!-2b`Yh|Vf{KaCmJ)>h{2hqDG%7&6WlO<h9$7g#rb_HG
zZLHEx>S})Rg9nXgA_~G0v+oD;({{Ct4MQW73vFN~2qF}oGLbrDh8UO|M~q0d_e#WZ
z>SDs^?BS`qr)~tXIpuO-7kcPy1DtM0?F=T<+Qf2h)S+p}qCtmm)QuD$CAm^h<7_*P
zaiFPt#xw(-p9?EN#jndG0Zf;NIS;=1Wd5O4n}`K9X3FoxiFd|_>CfZL{Bt1bva<y7
zmSSYW7h!jXMch*Nn{Zdw)o%^nYZ!9|#w<5BYs}>u5hVk%u5kzAU*{Dwi3DFjks0%h
z&Vc|UBvK`j^Wh(6iEFxVM9zhHSbM12d?9%~KV955;$#k<XquR_v9jjMV2#GH!w-@<
zICXg{e)nWD7{dQ9osH4W$-EQ1iD9;>$qb}=r|_GYxKTV~WhVBOa%x6ZXLpkoY!p#w
z?VA!<eoQdVNPm51D#uV^do!VXmdQbb3ZJ7&&;Y_XaiqeJ$$a^;DWt~CHbOPc6%uVs
zA*pqojO=S{69<S2r$W?O=P&}-m~G{E^K|rZE6|jW&8$qEVO*rfl0{E<6JT^tt+mP`
zz`#1c6I2}Do$CtV#HB&Y;*`i6DGUlcb!U}{)xt1~1tDqjmGG}g*?>2%{9Q%)#$(JN
zOULxJ;US#K6NfO>>=4t{S#BjTX&vV7cg<2C?4d~#aeGd=+jEN3T@Q}Ml6Vj}sRsea
zUeO<bU2!@X@{S2S6P~2BF@f=0C`GMBN)h~8_w88rsk}MBHpZYm%;{$0VIgjFdl@{J
zN`r35-Ag(Pv|Y0R4-n@iIYC-WD;3*1OJK&%a3df>Cg1=Q%aY%6Aja-n5L}MyT|j-i
zF%N@F1v>~t-*PRaNyD*PJ=d`V&PdM}2gq7XMOit?ji}?B<5N$h7j#;&q5%ec?<_Kw
z;!@w##-+8B5h14X);K~UoS>eAGTL~_pOD!4SUF-FuVJlfq4104DAm5Qi2JS+5CE_2
zS}D8Qv(8##$;um07$;ENe!mK6*|S*(#Zz-2YIeI;Z98ozkW2Z>bRj~~xJ1j1+Zua8
zr1BcivKtrM5P3X_jx)7j9M~keZbYUZ23!WF(hfw`M5*Mm2&@bN&@okMa`F38(nPM&
zMqK<o=K$qzqPEU7S$+SW&h{=H5w&?4-Z&X?QAfMqLkM--j{rP#)=VZ<wotvV4Vm-E
z4AYl4J_{?<HARTvydQlsp`21gm#YwVHi4aFLhl(jh|HzBpn%cOohCb!iXbq~Wv7(Q
zwDJJPUFwWKVq}&MVXPhVoFg}h!1x1@4kr9$<>5CNzlCyCZp|5BzMnQSnPC?b;lBO#
z1I?(q4dNOT`#?Gi7E&PqbJlJ*p-9u((q*?2KkJwhOMqo^IP75PO;Sf-QepjB6IMY@
zH~=P%Or|R6MZoy9WUg#vBrvUkU_vV8t`Zo-2xgQl!4lZB39ld{4uG^J@66jsLts)e
z{n>M6rfbjbH3JA;^8}bC^$R9!fLwHdX(c4GABOqDIQG0lx2LfH*mlfQYNi(si(8&K
z6l}rcYRBpjc<NQ227Y{4U3cL5rlSR5C4`k^#pM9a<VyRk*Jnsm*D(<QxoU6{myU+j
zwgrCrZFB{)astM=aW9d{J&+4UR<mUQ$Rr=wImCM6OmD9=D_4~UScgi;_-5qd@aH-Z
znfkW4*K;R2rrA>5=-Grv6n4!zV%wdKWa=dHTr5;}hxWOR+fVL9C?<1GWZD+VWiM6w
zolwHwP9q-9_)^GPtJ-7k+ua@WOwUC~k^U@^Ny5?ppwy<T(%UeJfB7mi`x1n#g@I0J
zJNxv2!uoM+{=WV<&@nzWxiPC|DP%HIYX3V!H4h;#JG5B!q<dAW^)V{!*1n10InyOH
zzF2FKb68M2_S;~Qva?Fa#0y;=GFfdF^0I^F%^pFbFol0|p?Gy7iCmV3+z~g#1nMRR
zAQ!G^CVzmsDXEjYSC#{Wyk82NmktWiH}V_3WcFIfpX2B!jg^p>onUr4#|5&!Pd(6m
zvQtFE#K!W@c{s%#-StqC%eFwxfmt|{C>2+ASaG4L{e0(6t4?&h>_9E$$w81%TKGAv
zmbnz7S^c-IIP#T*yk0+SR?jtKo7p$qo{l{byY4)fjhC0Z$~WA3Q#ywsoBuZa{p7dd
zFd6@ey}ZtzjAv%c)SSi(83<lm>W#5X24y^0LrEw8o$pdQ;B+pX=B<lB?>vOLc_*a1
zO3GlEbCmFA4kFz5;y|d2h}V@|BQ{s2X9F&QT|=Cqbc`u)cAixSSi4Ne)ATcN&0zzx
z#htlJQ$-l8jf|F;1Bi^pAQuKGH9JterR0VZP(8j<QEF*=Bhoq6uM?SuN^v8#645aJ
zY;yNbJp#cpb@L&{EuJ0(zb1FbZqi|FHJcu#hUT)YHB>J~<oZ6Z$xPz9-lg%I=YuKh
z?~Ru#9B<q#-Z=Jmd(Q?q{A+xBe8tJuTf__gI}j~glet_mRbR6A{gh*x0@yIy)3G2h
ztqzmDhiRz#Xk)nbLA2O(=aRt0^knXlEfD~IOgKU^)hw49?`Eous`M2Mq`N$VbuT;n
zFxa_n@@-&Z3uO)fT<wB;3Fkwd^EK}zyfSx<ys|)Ga@Xv?U75aOTfWgNlg&(dH;lmU
z`;Hiq%-HtVqpTs9i<Kbbw}Z*k%Ao*U^<X}1HMn;Y;orYov#HvSLc_yL#O~Ya&X4qX
zfVR<q<gU>!7s&1t%R9M~M9bt6Cu8$oG7`{rGCK#jTzL^i=o{`indcQU)c|lCn{qOu
z2QsZO)Qn%bbD?9ujmIz749I2hC#~hmRH2YPF)Z%pEkGetIVG3oGNVZ5WbAzDE2p;%
z<nGf_v(}OqD~R0X4QXK}#8M5{8g{;$MeZ9tsXFYqd*uyfqA=0FxbSgu4!n6~*I~zu
z%xn;utj(m8QMbCcfnX5c*I~h6WUbxR-PUv4P$S!r-dOB`&Ai<-R$#F&I9+Lkj=jB~
zm<5|V@}mh%229QM`?e6?n@q3FXqj~H3P>Fj+DaizXWRU{^e<j98wwQx!rb2?6S9^}
zxYzGGbJ4Bd*_~YPD+sU5fv8tZ)*##}L4@+|2?E4nSAN0Wk`-9+J&eSYIkWkdV7_!W
zkhXG?0NW&)miiG(Ca-|1ak{{*^Iqc?GD(}o9>bkQsmk6OlQ3CX51dy;m0^N(auM&R
zY2iv((Wf#?B}O#tW`PFp`zDU+FzxZ;_S^oobGnv0Qt1kr1Uv6S3=@X2*MDRQ(Fo{f
z=|Qn)*)QU;Vde7PZ)I}Y3<b|9FMtvlx!k-wmHi|D+t^w#_Xw>sz%elcnJY~uKxXHD
z`Y})%nU7oLxTioipP<NYa{yO3BeRy4=Vzxo{?ni4MQSAsSTHso5t(w@$;5DOeIP6m
zSuq^$ahI(&%$fVh_}zFz$vyYt3$Tgl$Za?)S-XMJFqU95!vdcoqkSM#H6@qoID1dn
z@AVRxlQ_@GoBf!myO6orb<l6I0B(FY`itD)?SFBDxq8o?XM82!&v)E7nIxZa+=UBr
z&rNR<4O85R2TCa4HaXR}HoQfV#qucIq_5(Mn(2A`#|S{M1eO;)Sll)NZahlbDuJat
znU;C_&H(qgq!_QxM0(X@sk!2v$*Xv2w6}eo0L)WYcKn~}LZWyUgejk?fzN6JvS11#
zJN{YwGL1giAIRLw0C%`x6wEosy$gEpv%T+SjC2zb3Y(O2D8y8J1y8n_vN4!}1I(cS
zT$&~xUfWT%iNFN7V4jip_Xv!~pi5mWa7gV;d&3Fb0Y@oMGLbQ2;>Hb@h>YGRw<8|U
zaVgk-TyEx7XZ8}~d3rtBxNCMFtiI+-2V74`87Aa7Z;Z~{>HErzP){1%>g<DfXP(HG
zNBH#H$nzl~d%^}A4lT2)2bj#&jaxwG%BxsxiaRfNJq}+H%Ocrm_5~#}<J^bGtVe&O
zyLvvUa~}hHV^Y|&_oXZ7GEB&W<Y@6d4EO82G&$4WV~gAuu@tK>Vmi+#a4?su=Is0F
z?vW8*gezI@CopOmOnkg-0f8}x=Lx=uBQ*kGZqR@^*#|4tRJJkQWY)YwPGD61dDL3k
zGN1u6vHM{Lr)Iw_QJ9>|Q6vAYz5;c-Q()|xSGa&$dvPE|j9a!RS}%>;B!-4jPegZp
z4xdMD|L-FA;kCYO@iXq)hxd8q(7R&y2R=$(_3Xy|{V<o2sY_#fKYRt`WNXhfUdKlp
zn+ZeLgauy9M-r;;>pEUGUe&hD!!6`RMxN~$HSu9POh%H!71KX#!Opu;_<6U+Ect1E
z;X;Sc)9$qygc!yn0B>eir0r=GBjn_K^=&Tu9ngjOg2o7QG(T{ZvNs?9+VS9m$(nB4
zZ2ft737fn#T;786vg5uSg9W!cf;cKWLdtPX>>u6>s99v>rKOiho3zNMY6y(QpfG$%
z^{6hEB(vp1lmzym!|1Nt6qFu?VU9Eud<e%RjKHK-)tKkL*7+)Cm5*>x&HzkcLV?^n
zpGKJ5J%|J*jgM+?yx8vM*~?xE_L(3zp$V9fkDb@VN_@kdH6Im$(`EWCz~t5DypKJa
z3G%3n?>le+?$$b(FuRct3a)Nk-Vc4Boo-iyLUk(--#Hr?;hZ-+=48*9UO_Fqd+3>2
zq?yG&vH>Q(SMy{WweeHP9QlwCfa@#+v)y+yC)A-IvyFR&Q59Xsq&KH8>O1V&obVuJ
z;|NSOA4~+88N5PF-xUZBM9n1vu)2#C3q{yWF<E;8Z@0<UTW}Q0i?0nDBD+ki!SgZU
z7?~?wBLG)p05h;X@}Q%2Q#1NJDln+l3t)eJ^V;M}<}iIZWS-TNd6wxh0~2eVedg?1
za{IX1+Mud1A8Yik5*%px+TC_*f!QUGMZi?|$c!Wa9Ba%GF=R~1inx{b%^T>K={o1h
zJX6jsfV-miVadGQDti=rE;gTQApsyvL;|O>%s#TpKD%9|Xt!ktATON$RN$lUnVAyi
zu$_3sDq7Bb!Ou=|_3U&bu*)9mk{9#16#9pkQnPD{z+7wwlgC>rzf)X$0hv(68WX@&
zDL#DZqAR>3u-|@oRam!8`@NO<?#acvu3Wf8HkeTEk_v!o54#>T$jorbE@#Vy>yfcX
zY0DHU$>kHg_qnpMts1F?UgAxqk1*|c<o1{~=i9@TM#zQr_-e-3%SShSiSEAaz_s=B
z<*Mc!fJ_xkzA`+;dmFyE7mmf{>k7%O^_%%pm3-Z@Z`N_FY$&}~M}SPCg|2vsdYeZ|
zczLYsFIl%KWw?d~a(m5Z^MuUhDRL3H*Svr{>;tsq&fW5zZWS2(xJ(Wumk)({?v6cf
z{2Z=;uJEPY>Tu<|OFklsvcF=TvaPm3@YGh%ryjm@{m3{sKBtAV-^K^ivK_gpS;=z8
zjC+j-$m+b>SPZ!D?3`{cbdd{>{(c*a&h`woU9U{?$toLsuvnX|NkUdjRs;Yy<R$W1
zh<5HXQ$*9;^-pfx^~E9fYt|VC<Qo`ex+f6HWk*5g!6^5BT;|H|Q{FwyMPhN+y$;f~
z&Xx<Wd3;@?Z$4AScOtWG7pR&tExE8PmkTlTdjYw*rB@=aQ#;(rOjZ#2M1Y=POfHaV
za3lAOD-S@!XF>YcyKO=~i#4DWTGm#8Y+#t?`qp=<YYJ34M7_t#hm5eHf-)xgC@Di@
zZ|6tv`RJRF-(x;)pNV9kyv+FXd=RvLq_H6Kie$hO&O3=j=H8Q1i8bSQA)m(Nv*PLO
z0PSy5loW=Wq#VPh6)6X|AVy?r1mrRfyk-09a{nLQ)|>U-+w1P#403h%O<Z=#`Z;_(
zuSPV!nFAl^)uG@!G~aPYKD5NUgUu_xN&hBW5}+f^T~cvR-Y(2GsdqvD^%?!XXw-*^
zG=G&_dfz!I!(^X+C1Kw--R_<eBXeZSB=hseW%yOk<?l>xwATj?{|x|yOPCUA001Y{
BczOT;

literal 0
HcmV?d00001


From 022965b30939989e8805d12fe3b4e4981145be98 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Fri, 21 Jan 2022 17:29:25 +0100
Subject: [PATCH 07/80] integration of spliceType and blacklist annotation to
 results table

---
 DESCRIPTION                  |   8 +-
 NAMESPACE                    |   7 +
 R/FRASER-package.R           |   5 +-
 R/resultAnnotations.R        | 401 ++++++++++++++++++++++-------------
 man/spliceTypeAnnotations.Rd | 127 +++++++++++
 src/RcppExports.cpp          |   5 +
 6 files changed, 398 insertions(+), 155 deletions(-)
 create mode 100644 man/spliceTypeAnnotations.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index a8ebe376..f5c8403f 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -28,7 +28,7 @@ biocViews:
 License: MIT + file LICENSE
 URL: https://github.com/gagneurlab/FRASER
 BugRepots: https://github.com/gagneurlab/FRASER/issues
-RoxygenNote: 7.1.1
+RoxygenNote: 7.1.2
 Encoding: UTF-8
 VignetteBuilder: knitr
 Depends:
@@ -82,9 +82,10 @@ Suggests:
     covr,
     TxDb.Hsapiens.UCSC.hg19.knownGene,
     org.Hs.eg.db,
+    rtracklayer
 LinkingTo:
-    Rcpp,
-    RcppArmadillo
+    RcppArmadillo,
+    Rcpp
 Collate:
     variables.R
     getNSetterFuns.R
@@ -114,3 +115,4 @@ Collate:
     fitCorrectionMethods.R
     plotMethods.R
     zzz.R
+    resultAnnotations.R
diff --git a/NAMESPACE b/NAMESPACE
index 35e2d66a..cd7b6b50 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -22,8 +22,10 @@ export(K)
 export(N)
 export(aberrant)
 export(addCountsToFraserDataSet)
+export(annotateIntronReferenceOverlap)
 export(annotateRanges)
 export(annotateRangesWithTxDb)
+export(annotateSpliceEventType)
 export(bamFile)
 export(bestQ)
 export(calculatePSIValues)
@@ -44,6 +46,7 @@ export(filterExpression)
 export(filterExpressionAndVariability)
 export(filterVariability)
 export(fit)
+export(flagBlacklistRegions)
 export(getNonSplitReadCountsForAllSamples)
 export(getSplitReadCountsForAllSamples)
 export(hyperParams)
@@ -163,9 +166,12 @@ importFrom(GenomicAlignments,junctions)
 importFrom(GenomicAlignments,readGAlignmentPairs)
 importFrom(GenomicAlignments,readGAlignments)
 importFrom(GenomicAlignments,summarizeJunctions)
+importFrom(GenomicFeatures,exons)
+importFrom(GenomicFeatures,fiveUTRsByTranscript)
 importFrom(GenomicFeatures,genes)
 importFrom(GenomicFeatures,intronsByTranscript)
 importFrom(GenomicFeatures,makeTxDbFromGFF)
+importFrom(GenomicFeatures,threeUTRsByTranscript)
 importFrom(GenomicRanges,GRanges)
 importFrom(GenomicRanges,GRangesList)
 importFrom(GenomicRanges,findOverlaps)
@@ -301,6 +307,7 @@ importFrom(rhdf5,H5Pget_chunk)
 importFrom(rhdf5,H5Pget_layout)
 importFrom(rhdf5,h5delete)
 importFrom(rhdf5,h5ls)
+importFrom(rtracklayer,import)
 importFrom(stats,cor)
 importFrom(stats,cutree)
 importFrom(stats,dbinom)
diff --git a/R/FRASER-package.R b/R/FRASER-package.R
index 3cecc264..843f45d0 100644
--- a/R/FRASER-package.R
+++ b/R/FRASER-package.R
@@ -22,7 +22,8 @@
 #'
 ### GRange/Experiment/bamFile packages
 #' @importFrom BiocGenerics updateObject counts counts<- strand strand<- which
-#' @importFrom GenomicFeatures makeTxDbFromGFF intronsByTranscript genes
+#' @importFrom GenomicFeatures makeTxDbFromGFF intronsByTranscript genes exons
+#'          fiveUTRsByTranscript threeUTRsByTranscript
 #' @importFrom GenomicAlignments junctions readGAlignments summarizeJunctions
 #'          readGAlignmentPairs
 #' @importFrom SummarizedExperiment assay assay<- assays assays<- assayNames
@@ -40,7 +41,7 @@
 #'
 #' @importFrom biomaRt useEnsembl getBM
 #' @importFrom AnnotationDbi select
-#'
+#' @importFrom rtracklayer import
 #'
 ### Plotting
 #'
diff --git a/R/resultAnnotations.R b/R/resultAnnotations.R
index ff9268c9..971069e9 100644
--- a/R/resultAnnotations.R
+++ b/R/resultAnnotations.R
@@ -1,27 +1,52 @@
 #' 
-#' @title Result annotations
+#' @title Additional result annotations
 #' 
 #' @description These functions work on the result table and add additional 
-#'     annotations to the reported junctions: the type of splice event (e.g. 
-#'     exon skipping, intron retention, ...), as well as a flag for junctions 
-#'     that are located in blacklist regions of the genome.
+#'     annotations to the reported introns: the type of splice event (e.g. 
+#'     exon skipping, exon truncation, ...), expected occurence of frameshift, 
+#'     overlap with UTR regions as well as a flag for introns that are 
+#'     located in blacklist regions of the genome.
+#'     
+#' \code{\link{annotateIntronReferenceOverlap}} adds basic annotations to the 
+#'     fds for each intron based on the overlap of the intron's location with 
+#'     the reference annotation. Has to be run before the result table is 
+#'     created so that the new column can be included in it (see examples).
 #' 
-#' \code{\link{annotateSpliceEventType}} annotates each junction in the results 
-#'     table with the type of splice event: exon skipping, 
+#' \code{\link{annotateSpliceEventType}} annotates each intron in the results 
+#'     table with the type of splice event and expected occurence of frameshift 
+#'     (likely, unlikely, inconclusive). Can also calculate overlap with 
+#'     annotated UTR regions. Splice types can be one of: 
+#'     annotatedIntron_increasedUsage, annotatedIntron_reducedUsage, 
+#'     exonTruncation, exonElongation, exonTruncation&Elongation, 
+#'     singleExonSkipping, exonSkipping, splicingBeyondGene, 
+#'     multigenicSplicing, downstreamOfNearestGene, upstreamOfNearestGene, 
+#'     complex (everything else).     
+#'     Splice sites (theta metric) annotations indicate how the splice site is 
+#'     located with respect to the reference annotation. The annotated types 
+#'     are: annotatedSpliceSite, exonicRegion, intronicRegion.
 #' 
-#' \code{\link{flagBlacklistRegions}} flags junction in the results table on 
-#'     whether or not they are located in a blacklist region of the genome. 
+#' \code{\link{flagBlacklistRegions}} flags introns in the results table on 
+#'     whether or not they are located in a blacklist region of the genome. By
+#'     default, the blacklist regions as reported in \cite{...} are used.
 #'
-#' @param result A result table as generated by FRASER.
-#' @param fds The FraserDataSet, which was used to generate the results table
+#' @param fds A FraserDataSet
 #' @param txdb A txdb object providing the reference annotation.
-#' @params blacklist_regions BED file that contains the blacklist regions. 
-#'     If \code{NULL} (default), the BED files that are packaged with FRASER are used.
-#' @param assemblyVersion Indicates the genome assembly of the junction 
-#'     coordinates. Only used if blacklist_regions is NULL.
-#' @param BPPARAM For controlling parallelization behavior. 
-#'     Defaults to \code{bpparam()}.
-#' @return annotated results table
+#' @param result A result table as generated by FRASER, including the column 
+#'     \code{annotatedJunction} as generated by the function
+#'     \code{annotateIntronReferenceOverlap}.
+#' @param addSpliceType Logical, indicating if the type of the splice event 
+#'     should be added to the results table. Defaults to \code{TRUE}.
+#' @param addUTRoverlap Logical, indicating if the overlap with UTR regions
+#'     should checked and added to the results table. Defaults to \code{TRUE}.
+#' @param blacklist_regions A BED file that contains the blacklist regions. 
+#'     If \code{NULL} (default), the BED files that are packaged with FRASER 
+#'     are used. 
+#' @param assemblyVersion Indicates the genome assembly version of the intron 
+#'     coordinates. Only used if blacklist_regions is NULL. For other versions, 
+#'     please provide the BED file containing the blacklist regions directly.
+#' @param BPPARAM For controlling parallelization behavior. Defaults to 
+#'     \code{bpparam()}.
+#' @return An annotated FraserDataSet or results table, respectively
 #' 
 #' @name spliceTypeAnnotations
 #' @rdname spliceTypeAnnotations
@@ -29,43 +54,125 @@
 #' @examples
 #'   # get data, fit and compute p-values and z-scores
 #'   fds <- createTestFraserDataSet()
-#' 
-#'   # extract results: for this example dataset, z score cutoff of 2 is used to
-#'   # get at least one result
-#'   res <- results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05)
-#'   res_dt <- as.data.table(res)
 #'   
-#'   # annotate type of splice event
+#'   # load reference annotation
 #'   requireNamespace("TxDb.Hsapiens.UCSC.hg19.knownGene")
 #'   txdb <- 
 #'     TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene 
-#'   res_dt <- annotateSpliceEventType(fds, res_dt, txdb)
+#'     
+#'   # add basic annotations for overlap with the reference annotation
+#'   # run this function before creating the results table
+#'   fds <- annotateIntronReferenceOverlap(fds, txdb) 
+#' 
+#'   # extract results: for this small example dataset, only a z score cutoff 
+#'   # of 1 is used to get at least one result. 
+#'   # Make sure to include the additional column in the results table
+#'   res <- results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=NA, 
+#'                   additionalColumns = 'annotatedJunction')
+#'   res_dt <- as.data.table(res)
+#'   
+#'   # annotate the type of splice event and UTR overlap
+#'   res_dt <- annotateSpliceEventType(result=res_dt, txdb=txdb)
 #'   
-#'   # annotate blacklist regions
-#'   res_dt <- flagBlacklistRegions(res_dt, assemblyVersion="hg19")
+#'   # annotate overlap with blacklist regions
+#'   res_dt <- flagBlacklistRegions(result=res_dt, assemblyVersion="hg19")
 #'   
-#'   # show annoated results table
+#'   # show results table containing additional annotations
 #'   res_dt
 #'   
 NULL
 
+#' @describeIn spliceTypeAnnotations This method calculates basic annotations 
+#'     based on overlap with the reference annotation (start, end, none, both) 
+#'     for the full fds. The overlap type is added as a new column 
+#'     \code{annotatedJunction} in \code{mcols(fds)}. 
+#' @export
+annotateIntronReferenceOverlap <- function(fds, txdb, BPPARAM=bpparam()){
+    print("loading introns")
+    #seqlevelsStyle(fds) <- seqlevelsStyle(txdb)[1]
+    introns <- unique(unlist(intronsByTranscript(txdb)))
+    # reduce the introns to only the actually expressed introns
+    fds_known <- fds[unique(to(findOverlaps(introns, 
+                                rowRanges(fds, type = "j"), type = "equal"))),]
+    anno_introns <- as.data.table(rowRanges(fds_known, 
+                                type="j"))[,.(seqnames, start, end, strand)]
+    
+    # calculate extra columns with mean/median intron expression count
+    # add the new columns
+    print("adding median count to introns")
+    sampleCounts <- K(fds_known, type = "j")
+    anno_introns[, meanCount := rowMeans(sampleCounts)]
+    anno_introns[, medianCount := rowMedians(as.matrix(sampleCounts))]
+    # order by medianCount (highest first)
+    setorderv(anno_introns, "medianCount", order=-1) 
+    anno_introns_ranges <- makeGRangesFromDataFrame(anno_introns, 
+                                                    keep.extra.columns = TRUE)
+    
+    # get all fds junctions
+    fds_junctions <- rowRanges(fds, type = "j")
+    
+    # Do the annotation just for the intron with highest median expression
+    print("start calculating annotations")    
+    overlaps <- findOverlaps(fds_junctions, anno_introns_ranges, select="first")
+    annotations <- bplapply(1:length(fds_junctions), function(i){
+        # only select first intron as already ordered by medianCount beforehand
+        overlap <- overlaps[i]
+        if(is.na(overlap)) return("none") #no overlap with any intron
+        
+        hit_equal <- from(findOverlaps(fds_junctions[i], 
+                                        anno_introns_ranges[overlap], 
+                                        type="equal"))
+        if(length(hit_equal) > 0) return("both")
+        
+        hit_start <- from(findOverlaps(fds_junctions[i], 
+                                        anno_introns_ranges[overlap], 
+                                        type="start"))
+        if(length(hit_start) > 0) return("start")
+        hit_end   <- from(findOverlaps(fds_junctions[i], 
+                                        anno_introns_ranges[overlap], 
+                                        type="end"))
+        if(length(hit_end) > 0) return("end")
+        
+        # overlaps but no start/end match
+        return("none") 
+    }, BPPARAM=BPPARAM) 
+    annotations <- unlist(annotations)
+    
+    rowRanges(fds)$annotatedJunction <- annotations
+    mcols(fds, type="ss")$annotatedJunction <- "not computed"
+    print("annotations done")
+    return(fds)
+}
+
 #' @describeIn spliceTypeAnnotations This method annotates the splice event 
 #'     type to junctions in the given results table.
 #' @export
-annotateSpliceEventType <- function(result, fds, txdb, BPPARAM=bpparam()){
+annotateSpliceEventType <- function(result, txdb, addSpliceType=TRUE, 
+                                    addUTRoverlap=TRUE, BPPARAM=bpparam()){
     
     # Create basic annotation of overlap with reference
-    fds <- annotateJunctionReferenceOverlap(fds, txdb, BPPARAM)
+    if(!("annotatedJunction" %in% colnames(result))){
+        stop("Column 'annotatedJunction' not found in the results table!\n",
+            "Please run fds <- annotateIntronReferenceOverlap(fds, txdb) ",
+            "first and add it \nto the results table with ",
+            "results(..., additionalColumns = 'annotatedJunction')\n", 
+            "(see examples) before calling this function.")
+    }
     
-    # Calculate splice types and frameshift
-    result <- aberrantSpliceType(result, fds, txdb)
+    # convert to data.table if not already
+    if(!is.data.table(result)){
+        result <- as.data.table(result)
+    }
     
-    # Add the subtypes for exonSkipping and inconclusive
-    result <- checkExonSkipping(result, txdb)
-    result <- checkInconclusive(result, txdb)
+    # Calculate splice types and frameshift
+    if(isTRUE(addSpliceType)){
+        result <- addSpliceTypeLabels(result, fds, txdb)
+    }
     
     # Add UTR labels
-    result <- addUTRLabels(result, txdb)
+    if(isTRUE(addUTRoverlap)){
+        result <- addUTRLabels(result, txdb)
+    }
     
     return(result)
 }
@@ -76,6 +183,11 @@ annotateSpliceEventType <- function(result, fds, txdb, BPPARAM=bpparam()){
 flagBlacklistRegions <- function(result, blacklist_regions=NULL,
                                 assemblyVersion=c('hg19', 'hg38')){
     
+    # convert to data.table if not already
+    if(!is.data.table(result)){
+        result <- as.data.table(result)
+    }
+    
     assemblyVersion <- match.arg(assemblyVersion)
     if(is.null(blacklist_regions)){
         blacklist_regions <- 
@@ -89,7 +201,7 @@ flagBlacklistRegions <- function(result, blacklist_regions=NULL,
     }
     print("Importing blacklist regions ...")
     blacklist_gr <- import(blacklist_regions, format = "BED")
-    result <- addBlacklistLabels(result, blacklist_gr, "splicing")  
+    result <- addBlacklistLabels(result, blacklist_gr)  
     return(result)
 }
 
@@ -100,9 +212,8 @@ flagBlacklistRegions <- function(result, blacklist_regions=NULL,
 addBlacklistLabels <- function(junctions_dt, blacklist_gr){
     # add the blacklist information
     print("Set up aberrant splicing granges")
-    psi_positions <- which(junctions_dt$type != "theta")
     colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
-    junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions])
+    junctions_gr <- makeGRangesFromDataFrame(junctions_dt)
     
     # set to the same seqlevelsstyle
     seqlevelsStyle(blacklist_gr) <- seqlevelsStyle(junctions_gr)
@@ -112,19 +223,18 @@ addBlacklistLabels <- function(junctions_dt, blacklist_gr){
     black_hits <- unique(from(findOverlaps(junctions_gr, blacklist_gr)))
     junctions_dt[, blacklist := FALSE]
     
-    junctions_dt[psi_positions[black_hits], blacklist := TRUE]
+    junctions_dt[black_hits, blacklist := TRUE]
     colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"
     
     print("blacklist labels done")
     return(junctions_dt)
 }
 
-#' UTR annotation to results table
+#' adds UTR overlap annotation to results table
 #' @noRd
 addUTRLabels <- function(junctions_dt, txdb){
-    psi_positions <- which(junctions_dt$type != "theta")
     colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
-    junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions])
+    junctions_gr <- makeGRangesFromDataFrame(junctions_dt)
     seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr)
     ### UTR labels based on txdb file
     ### add 5' 3' UTR labels
@@ -133,77 +243,19 @@ addUTRLabels <- function(junctions_dt, txdb){
                             threeUTRsByTranscript(txdb, use.names = TRUE))))
     fives <- unique(from(findOverlaps(junctions_gr, 
                             fiveUTRsByTranscript(txdb, use.names = TRUE))))
-    junctions_dt[psi_positions, UTR := "no"]
-    junctions_dt[psi_positions[threes], UTR := "3"]
-    junctions_dt[psi_positions[fives], UTR := "5"]
+    junctions_dt[, UTR_overlap := "no"]
+    junctions_dt[threes, UTR_overlap := "3'-UTR"]
+    junctions_dt[fives, UTR_overlap := "5'-UTR"]
     colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"
     print("UTR labels done")
     return(junctions_dt)
 }
 
-#' Basic annotations of overlap with reference (start, end, none, both) for 
-#' full fds.
-#' @noRd
-annotateJunctionReferenceOverlap <- function(fds, txdb, BPPARAM){
-    print("loading introns")
-    #seqlevelsStyle(fds) <- seqlevelsStyle(txdb)[1]
-    introns <- unique(unlist(intronsByTranscript(txdb)))
-    # reduce the introns to only the actually expressed introns
-    fds_known <- fds[unique(to(findOverlaps(introns, 
-                            rowRanges(fds, type = "j"), type = "equal"))),]
-    anno_introns <- as.data.table(rowRanges(fds_known, 
-                            type="j"))[,.(seqnames, start, end, strand)]
-    
-    # calculate extra columns with mean/median intron expression count
-    # add the new columns
-    print("adding median count to introns")
-    sampleCounts <- K(fds_known, type = "j")
-    anno_introns[, meanCount := rowMeans(sampleCounts)]
-    anno_introns[, medianCount := rowMedians(as.matrix(sampleCounts))]
-    # order by medianCount (highest first)
-    setorderv(anno_introns, "medianCount", order=-1) 
-    anno_introns_ranges <- makeGRangesFromDataFrame(anno_introns, 
-                                                    keep.extra.columns = TRUE)
-    
-    # get all fds junctions
-    fds_junctions <- rowRanges(fds, type = "j")
-    
-    # Do the annotation just for the intron with highest median expression
-    print("start calculating annotations")    
-    overlaps <- findOverlaps(fds_junctions, anno_introns_ranges, select="first")
-    annotations <- bplapply(1:length(fds_junctions), function(i){
-        # only select first intron as already ordered by medianCount beforehand
-        overlap <- overlaps[i]
-        if(is.na(overlap)) return("none") #no overlap with any intron
-        
-        hit_equal <- from(findOverlaps(fds_junctions[i], 
-                                        anno_introns_ranges[overlap], 
-                                        type="equal"))
-        if(length(hit_equal) > 0) return("both")
-        
-        hit_start <- from(findOverlaps(fds_junctions[i], 
-                                        anno_introns_ranges[overlap], 
-                                        type="start"))
-        if(length(hit_start) > 0) return("start")
-        hit_end   <- from(findOverlaps(fds_junctions[i], 
-                                        anno_introns_ranges[overlap], 
-                                        type="end"))
-        if(length(hit_end) > 0) return("end")
-        
-        # overlaps but no start/end match
-        return("none") 
-    }, BPPARAM=BPPARAM) 
-    annotations <- unlist(annotations)
-    
-    rowRanges(fds)$annotatedJunction <- annotations
-    mcols(fds, type="ss")$annotatedJunction <- "none"
-    print("annotations done")
-    return(fds)
-}
 
-#' 
+
+#' adds type of splicing to each intron in the results table
 #' @noRd
-aberrantSpliceType <- function(junctions_dt, fds, txdb){
+addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
     print("preparing ...")
     psi_positions <- which(junctions_dt$type != "theta")
     colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
@@ -229,12 +281,15 @@ aberrantSpliceType <- function(junctions_dt, fds, txdb){
                                                 keep.extra.columns = TRUE)
     
     # prepare the results column
-    junctions_dt[, aberrantSpliceType := "NA"]
-    junctions_dt[, causesFrameshift := "NA"]
-    junctions_dt[which(junctions_dt$annotatedJunction=="both"), 
-                    aberrantSpliceType := "sameIntron"]
-    junctions_dt[which(junctions_dt$annotatedJunction=="both"), 
-                    causesFrameshift := "unlikely"]
+    junctions_dt[, spliceType := "complex"]
+    junctions_dt[, causesFrameshift := "inconclusive"]
+    junctions_dt[annotatedJunction == "both" & deltaPsi >= 0, 
+                    spliceType := "annotatedIntron_increasedUsage"]
+    junctions_dt[annotatedJunction == "both" & deltaPsi < 0, 
+                 spliceType := "annotatedIntron_reducedUsage"]
+    junctions_dt[annotatedJunction == "both", causesFrameshift := "unlikely"]
+    
+    # TODO check for intron retention
     
     starts <- which(junctions_dt[psi_positions]$annotatedJunction=="start")
     ends <- which(junctions_dt[psi_positions]$annotatedJunction=="end")
@@ -256,7 +311,7 @@ aberrantSpliceType <- function(junctions_dt, fds, txdb){
     junctions_dt[psi_positions[starts], 
                     causesFrameshift:=start_results[2,]]
     junctions_dt[psi_positions[starts], 
-                    aberrantSpliceType := start_results[1,]]
+                    spliceType := start_results[1,]]
     
     print("end junctions")
     end_results <- sapply(ends, function(i){
@@ -272,7 +327,7 @@ aberrantSpliceType <- function(junctions_dt, fds, txdb){
         
     })
     junctions_dt[psi_positions[ends], causesFrameshift:=end_results[2,]]
-    junctions_dt[psi_positions[ends], aberrantSpliceType := end_results[1,]]
+    junctions_dt[psi_positions[ends], spliceType := end_results[1,]]
     
     print("none junctions pt1")
     none_results <- sapply(nones, function(i){
@@ -318,15 +373,15 @@ aberrantSpliceType <- function(junctions_dt, fds, txdb){
             frs = "likely"
         }else{ frs = "unlikely"}
         if( st[1] != en[1]){
-            combined = "trunc, elong"
+            combined = "exonTruncation&Elongation"
         }else{combined = st[1]}
         return(c(combined,frs))
         
     })
     junctions_dt[psi_positions[nones], causesFrameshift:=none_results[2,]]
-    junctions_dt[psi_positions[nones], aberrantSpliceType := none_results[1,]]
+    junctions_dt[psi_positions[nones], spliceType := none_results[1,]]
     
-    noLaps <-which(junctions_dt[psi_positions]$aberrantSpliceType=="noOverlap")
+    noLaps <-which(junctions_dt[psi_positions]$spliceType=="noOverlap")
     refseq.genes<- genes(txdb)
     
     print("none junctions pt2")
@@ -351,21 +406,65 @@ aberrantSpliceType <- function(junctions_dt, fds, txdb){
             }
         }
         
-        return(c("inconclusive","inconclusive"))
+        return(c("complex","inconclusive"))
     })
     junctions_dt[psi_positions[noLaps], 
                     causesFrameshift:=noLaps_results[2,]]
     junctions_dt[psi_positions[noLaps], 
-                    aberrantSpliceType := noLaps_results[1,]]
+                    spliceType := noLaps_results[1,]]
+    
+    # theta annotations
+    print("Annotate theta junctions")
+    thetas <- which(junctions_dt$type == "theta")
+    junctions_gr <- makeGRangesFromDataFrame(junctions_dt[thetas,], 
+                                            keep.extra.columns = TRUE)
+    
+    # specify default type for theta results as NA
+    junctions_dt[thetas, spliceType := NA]
+    junctions_dt[thetas, causesFrameshift := NA]
+    
+    # label all as intronic first if they have any intron overlap
+    intronic <- unique(from(findOverlaps(junctions_gr, introns_tmp)))
+    junctions_dt[thetas[intronic], spliceType := "intronicRegion"]
+    
+    # for exonic, check if theta is fully contained in an exon
+    # if one end is in an intron and the other in an exon it is a splice site
+    exonic <- unique(from(findOverlaps(junctions_gr, exons)))
+    within <- findOverlaps(junctions_gr, exons, type = "within")
+    all <- findOverlaps(junctions_gr, exons)
+    exonic_results <- sapply(exonic, function(i){
+        w <- unique(to(within)[which(from(within) == i)])
+        a <- unique(to(all)[which(from(all) == i)])
+        if(length(a) == length(w)) return("exonicRegion")
+        return("annotatedSpliceSite")
+    })
+    junctions_dt[thetas[exonic], spliceType := exonic_results]
     
+    # check cases that don't overlap with an exon/intron
+    print("nones")
+    nones <- which(is.na(junctions_dt[thetas,]$spliceType))
+    none_results <- sapply(nones, function(i){
+        if(length(findOverlaps(junctions_gr[i], refseq.genes)) > 0) return(NA)
+        #return("intergenic")
+        if(start(refseq.genes[nearest(junctions_gr[i], refseq.genes)]) > start(junctions_gr[i])){
+            ifelse(strand(junctions_gr[i]) == "+", return("upstreamOfNearestGene"), return("downstreamOfNearestGene"))
+        }else{
+            ifelse(strand(junctions_gr[i]) == "+", return("downstreamOfNearestGene"), return("upstreamOfNearestGene"))
+        }
+    })
+    junctions_dt[thetas[nones], spliceType := none_results]
+    print("thetas done")
     
-    # add distance to closest neighbour gene for intergenic results
+    # add distance to closest neighbour gene for intergenic results 
+    # (both psi and theta)
     print("adding distances to nearest gene")
-    up <- which(junctions_dt[psi_positions]$aberrantSpliceType == "upstream")
-    down <- which(junctions_dt[psi_positions
-                                ]$aberrantSpliceType == "downstream")
-    print("Calculate distances")
+    up <- which(junctions_dt$spliceType == "upstreamOfNearestGene")
+    down <- which(junctions_dt$spliceType == "downstreamOfNearestGene")
     
+    # create full grange object containing psi and theta
+    junctions_gr <- makeGRangesFromDataFrame(junctions_dt, keep.extra.columns = T)
+    
+    print("Calculate distances")
     if(length(up) > 0){
         distanceNearestGene_up <- sapply(up, function(i){
             min(distance(junctions_gr[i], refseq.genes), na.rm = T)})
@@ -392,6 +491,11 @@ aberrantSpliceType <- function(junctions_dt, fds, txdb){
     
     colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"
     print("done calculating aberrant splice types")
+    
+    # Add the subtypes for exonSkipping and inconclusive
+    junctions_dt <- checkExonSkipping(junctions_dt, txdb)
+    junctions_dt <- checkInconclusive(junctions_dt, txdb)
+    
     return(junctions_dt)
 }
 
@@ -660,18 +764,15 @@ checkIntergenic <- function(junctions_gr, i, refseq.genes){
         if(start(refseq.genes[nearest(junctions_gr[i], 
                                         refseq.genes)]) > start){
             ifelse(strand(junctions_gr[i]) == "+", 
-                    return(c("upstream", "unlikely")), 
-                    return(c("downstream", "unlikely")))
+                    return(c("upstreamOfNearestGene", "unlikely")), 
+                    return(c("downstreamOfNearestGene", "unlikely")))
         }else{
             ifelse(strand(junctions_gr[i]) == "+", 
-                    return(c("downstream", "unlikely")), 
-                    return(c("upstream", "unlikely")))
+                    return(c("downstreamOfNearestGene", "unlikely")), 
+                    return(c("upstreamOfNearestGene", "unlikely")))
         }
     }
-    return(c("inconclusive", "inconclusive"))
-    
-    # if both lists == 0 return intergenic else inconclusive
-    # return(c("inconclusive", "inconclusive"))
+    return(c("complex", "inconclusive"))
 }
 
 #' 
@@ -685,7 +786,7 @@ checkExonSkipping <- function(junctions_dt, txdb){
     
     refseq.genes<- genes(txdb)
     
-    exonSkip <- which(junctions_dt[psi_positions]$aberrantSpliceType %in% 
+    exonSkip <- which(junctions_dt[psi_positions]$spliceType %in% 
                             c("exonSkipping", "singleExonSkipping"))
     
     print("start checking exonSkipping")
@@ -714,7 +815,7 @@ checkExonSkipping <- function(junctions_dt, txdb){
         # check for which genes distance to start is 0
         start_genes <- which(distance(test_start, refseq.genes) == 0)
         # start is not in a gene
-        if(length(start_genes) == 0) return("beyondGene")
+        if(length(start_genes) == 0) return("splicingBeyondGene")
         
         # start is in a gene -> is end in same gene
         for(to in start_genes){
@@ -726,24 +827,24 @@ checkExonSkipping <- function(junctions_dt, txdb){
         
         end_genes <- which(distance(test_end, refseq.genes) == 0)
         # end is not in a gene
-        if(length(end_genes) == 0) return("beyondGene")
+        if(length(end_genes) == 0) return("splicingBeyondGene")
         # end is in a different gene
-        return("multigenic")
+        return("multigenicSplicing")
     })
     
     print("checking exonSkipping done")
     if(length(exonSkip) > 0){
         junctions_dt[psi_positions[exonSkip], 
-                        aberrantSpliceType2 := newSkip_results]
-        junctions_dt[which(junctions_dt$aberrantSpliceType2 == "beyondGene"), 
-                        aberrantSpliceType := "beyondGene"]
-        junctions_dt[which(junctions_dt$aberrantSpliceType2 == "beyondGene"), 
+                        spliceType2 := newSkip_results]
+        junctions_dt[spliceType2 == "splicingBeyondGene", 
+                        spliceType := "splicingBeyondGene"]
+        junctions_dt[spliceType2 == "splicingBeyondGene", 
                         causesFrameshift := "inconclusive"]
-        junctions_dt[which(junctions_dt$aberrantSpliceType2 == "multigenic"), 
-                        aberrantSpliceType := "multigenic"]
-        junctions_dt[which(junctions_dt$aberrantSpliceType2 == "multigenic"), 
+        junctions_dt[spliceType2 == "multigenicSplicing", 
+                        spliceType := "multigenicSplicing"]
+        junctions_dt[spliceType2 == "multigenicSplicing", 
                         causesFrameshift := "inconclusive"]
-        junctions_dt[, aberrantSpliceType2 := NULL]
+        junctions_dt[, spliceType2 := NULL]
     }
     
     colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
@@ -762,7 +863,7 @@ checkInconclusive <- function(junctions_dt, txdb){
     refseq.genes<- genes(txdb)
     
     inconclusive <- which(junctions_dt[psi_positions
-                                        ]$aberrantSpliceType == "inconclusive")
+                                        ]$spliceType == "complex")
     print("start checking inconclusive")
     
     inconclusive_results <- sapply(inconclusive, function(i){
@@ -790,28 +891,28 @@ checkInconclusive <- function(junctions_dt, txdb){
         # check for which genes distance to start is 0
         start_genes <- which(distance(test_start, refseq.genes) == 0)
         # start is not in a gene
-        if(length(start_genes) == 0) return("beyondGene")
+        if(length(start_genes) == 0) return("splicingBeyondGene")
         
         # start is in a gene -> is end in same gene
         for(to in start_genes){
             # end is in same gene
             if(distance(test_end, refseq.genes[to]) == 0){
-                return("inconclusive")
+                return("complex")
             } 
         }
         
         end_genes <- which(distance(test_end, refseq.genes) == 0)
         # end is not in a gene
-        if(length(end_genes) == 0) return("beyondGene")
+        if(length(end_genes) == 0) return("splicingBeyondGene")
         # end is in a different gene
-        return("multigenic")
+        return("multigenicSplicing")
     })
     
     colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"
     print("done checking inconclusive")
     if(length(inconclusive) > 0){
         junctions_dt[psi_positions[inconclusive], 
-                        aberrantSpliceType := inconclusive_results]
+                        spliceType := inconclusive_results]
     } 
     
     return(junctions_dt)
diff --git a/man/spliceTypeAnnotations.Rd b/man/spliceTypeAnnotations.Rd
new file mode 100644
index 00000000..17f8332b
--- /dev/null
+++ b/man/spliceTypeAnnotations.Rd
@@ -0,0 +1,127 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/resultAnnotations.R
+\name{spliceTypeAnnotations}
+\alias{spliceTypeAnnotations}
+\alias{annotateIntronReferenceOverlap}
+\alias{annotateSpliceEventType}
+\alias{flagBlacklistRegions}
+\title{Additional result annotations}
+\usage{
+annotateIntronReferenceOverlap(fds, txdb, BPPARAM = bpparam())
+
+annotateSpliceEventType(
+  result,
+  txdb,
+  addSpliceType = TRUE,
+  addUTRoverlap = TRUE,
+  BPPARAM = bpparam()
+)
+
+flagBlacklistRegions(
+  result,
+  blacklist_regions = NULL,
+  assemblyVersion = c("hg19", "hg38")
+)
+}
+\arguments{
+\item{fds}{A FraserDataSet}
+
+\item{txdb}{A txdb object providing the reference annotation.}
+
+\item{BPPARAM}{For controlling parallelization behavior. Defaults to 
+\code{bpparam()}.}
+
+\item{result}{A result table as generated by FRASER, including the column 
+\code{annotatedJunction} as generated by the function
+\code{annotateIntronReferenceOverlap}.}
+
+\item{addSpliceType}{Logical, indicating if the type of the splice event 
+should be added to the results table. Defaults to \code{TRUE}.}
+
+\item{addUTRoverlap}{Logical, indicating if the overlap with UTR regions
+should checked and added to the results table. Defaults to \code{TRUE}.}
+
+\item{blacklist_regions}{A BED file that contains the blacklist regions. 
+If \code{NULL} (default), the BED files that are packaged with FRASER 
+are used.}
+
+\item{assemblyVersion}{Indicates the genome assembly version of the intron 
+coordinates. Only used if blacklist_regions is NULL. For other versions, 
+please provide the BED file containing the blacklist regions directly.}
+}
+\value{
+An annotated FraserDataSet or results table, respectively
+}
+\description{
+These functions work on the result table and add additional 
+    annotations to the reported introns: the type of splice event (e.g. 
+    exon skipping, exon truncation, ...), expected occurence of frameshift, 
+    overlap with UTR regions as well as a flag for introns that are 
+    located in blacklist regions of the genome.
+    
+\code{\link{annotateIntronReferenceOverlap}} adds basic annotations to the 
+    fds for each intron based on the overlap of the intron's location with 
+    the reference annotation. Has to be run before the result table is 
+    created so that the new column can be included in it (see examples).
+
+\code{\link{annotateSpliceEventType}} annotates each intron in the results 
+    table with the type of splice event and expected occurence of frameshift 
+    (likely, unlikely, inconclusive). Can also calculate overlap with 
+    annotated UTR regions. Splice types can be one of: 
+    annotatedIntron_increasedUsage, annotatedIntron_reducedUsage, 
+    exonTruncation, exonElongation, exonTruncation&Elongation, 
+    singleExonSkipping, exonSkipping, splicingBeyondGene, 
+    multigenicSplicing, downstreamOfNearestGene, upstreamOfNearestGene, 
+    complex (everything else).     
+    Splice sites (theta metric) annotations indicate how the splice site is 
+    located with respect to the reference annotation. The annotated types 
+    are: annotatedSpliceSite, exonicRegion, intronicRegion.
+
+\code{\link{flagBlacklistRegions}} flags introns in the results table on 
+    whether or not they are located in a blacklist region of the genome. By
+    default, the blacklist regions as reported in \cite{...} are used.
+}
+\section{Functions}{
+\itemize{
+\item \code{annotateIntronReferenceOverlap}: This method calculates basic annotations 
+based on overlap with the reference annotation (start, end, none, both) 
+for the full fds. The overlap type is added as a new column 
+\code{annotatedJunction} in \code{mcols(fds)}.
+
+\item \code{annotateSpliceEventType}: This method annotates the splice event 
+type to junctions in the given results table.
+
+\item \code{flagBlacklistRegions}: This method flags blacklist regions in 
+the given results table.
+}}
+
+\examples{
+  # get data, fit and compute p-values and z-scores
+  fds <- createTestFraserDataSet()
+  
+  # load reference annotation
+  requireNamespace("TxDb.Hsapiens.UCSC.hg19.knownGene")
+  txdb <- 
+    TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene 
+    
+  # add basic annotations for overlap with the reference annotation
+  # run this function before creating the results table
+  fds <- annotateIntronReferenceOverlap(fds, txdb) 
+
+  # extract results: for this small example dataset, only a z score cutoff 
+  # of 1 is used to get at least one result. 
+  # Make sure to include the additional column in the results table
+  res <- results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=NA, 
+                  additionalColumns = 'annotatedJunction')
+  res_dt <- as.data.table(res)
+  
+  # annotate the type of splice event and UTR overlap
+  res_dt <- annotateSpliceEventType(result=res_dt, txdb=txdb)
+  
+  # annotate overlap with blacklist regions
+  res_dt <- flagBlacklistRegions(result=res_dt, assemblyVersion="hg19")
+  
+  # show results table containing additional annotations
+  res_dt
+  
+}
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 1a7c62e3..461280c0 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -6,6 +6,11 @@
 
 using namespace Rcpp;
 
+#ifdef RCPP_USE_GLOBAL_ROSTREAM
+Rcpp::Rostream<true>&  Rcpp::Rcout = Rcpp::Rcpp_cout_get();
+Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
+#endif
+
 // setPseudoCount
 double setPseudoCount(double pseudoCount);
 RcppExport SEXP _FRASER_setPseudoCount(SEXP pseudoCountSEXP) {

From 49fd0687d151bf3a265523046f32af92d3c35f54 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Tue, 25 Jan 2022 10:46:58 +0100
Subject: [PATCH 08/80] cleanup for bioccheck

---
 NAMESPACE                    |  4 +++-
 R/FRASER-package.R           | 10 ++++----
 R/resultAnnotations.R        | 46 ++++++++++++++++++++----------------
 man/spliceTypeAnnotations.Rd |  3 ++-
 4 files changed, 37 insertions(+), 26 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index cd7b6b50..96deb2b5 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -184,7 +184,9 @@ importFrom(HDF5Array,path)
 importFrom(HDF5Array,saveHDF5SummarizedExperiment)
 importFrom(HDF5Array,writeHDF5Array)
 importFrom(IRanges,IRanges)
+importFrom(IRanges,distance)
 importFrom(IRanges,from)
+importFrom(IRanges,nearest)
 importFrom(IRanges,ranges)
 importFrom(IRanges,subsetByOverlaps)
 importFrom(IRanges,to)
@@ -213,6 +215,7 @@ importFrom(S4Vectors,"metadata<-")
 importFrom(S4Vectors,DataFrame)
 importFrom(S4Vectors,Rle)
 importFrom(S4Vectors,SimpleList)
+importFrom(S4Vectors,elementMetadata)
 importFrom(S4Vectors,end)
 importFrom(S4Vectors,mcols)
 importFrom(S4Vectors,metadata)
@@ -307,7 +310,6 @@ importFrom(rhdf5,H5Pget_chunk)
 importFrom(rhdf5,H5Pget_layout)
 importFrom(rhdf5,h5delete)
 importFrom(rhdf5,h5ls)
-importFrom(rtracklayer,import)
 importFrom(stats,cor)
 importFrom(stats,cutree)
 importFrom(stats,dbinom)
diff --git a/R/FRASER-package.R b/R/FRASER-package.R
index 843f45d0..f01fbc3f 100644
--- a/R/FRASER-package.R
+++ b/R/FRASER-package.R
@@ -31,7 +31,7 @@
 #'          rbind Assays
 #' @importFrom GenomicRanges findOverlaps granges GRanges GRangesList
 #'          makeGRangesFromDataFrame invertStrand
-#' @importFrom IRanges subsetByOverlaps from to IRanges ranges
+#' @importFrom IRanges subsetByOverlaps from to IRanges ranges nearest distance
 #' @importFrom Rsamtools ScanBamParam scanBamHeader bamMapqFilter
 #'          bamWhich bamWhich<- BamFile idxstatsBam
 #' @importFrom Rsubread featureCounts
@@ -41,7 +41,6 @@
 #'
 #' @importFrom biomaRt useEnsembl getBM
 #' @importFrom AnnotationDbi select
-#' @importFrom rtracklayer import
 #'
 ### Plotting
 #'
@@ -82,7 +81,7 @@
 ### To be added into the functions above
 #'
 #' @importFrom S4Vectors DataFrame metadata Rle SimpleList mcols mcols<-
-#'          start end metadata metadata<- subjectHits queryHits
+#'          start end metadata metadata<- subjectHits queryHits elementMetadata
 #' @importFrom grDevices colorRampPalette
 #' @importFrom GenomeInfoDb keepStandardChromosomes seqlevels<- seqlevels
 #'          seqlengths seqlengths<- seqlevelsStyle<- seqlevelsStyle seqnames 
@@ -129,5 +128,8 @@ globalVariables(c(".", "J", ".N", ".asDataFrame", "End", "first_feature",
         "model", "mu", "n", ",nsubset", "o3", "o5", "obsPsi", "os", "pa",
         "padj", "passed", "pByFeature", "pointNr", "predPsi", "psi3", "psi5",
         "psiType", "psiValue", "seqlength", "seqlevel", "Step", "traceNr",
-        "uniqueID", "V1", "value", "zscore", "maxDTheta"),
+        "uniqueID", "V1", "value", "zscore", "maxDTheta", 
+        "blacklist", "spliceType", "causesFrameshift", "annotatedJunction", 
+        "distNearestGene", "UTR_overlap", "meanCount", "medianCount", 
+        "spliceType2"),
         package="FRASER")
diff --git a/R/resultAnnotations.R b/R/resultAnnotations.R
index 971069e9..87295444 100644
--- a/R/resultAnnotations.R
+++ b/R/resultAnnotations.R
@@ -72,7 +72,7 @@
 #'   res_dt <- as.data.table(res)
 #'   
 #'   # annotate the type of splice event and UTR overlap
-#'   res_dt <- annotateSpliceEventType(result=res_dt, txdb=txdb)
+#'   res_dt <- annotateSpliceEventType(result=res_dt, txdb=txdb, fds=fds)
 #'   
 #'   # annotate overlap with blacklist regions
 #'   res_dt <- flagBlacklistRegions(result=res_dt, assemblyVersion="hg19")
@@ -114,7 +114,7 @@ annotateIntronReferenceOverlap <- function(fds, txdb, BPPARAM=bpparam()){
     # Do the annotation just for the intron with highest median expression
     print("start calculating annotations")    
     overlaps <- findOverlaps(fds_junctions, anno_introns_ranges, select="first")
-    annotations <- bplapply(1:length(fds_junctions), function(i){
+    annotations <- bplapply(seq_len(length(fds_junctions)), function(i){
         # only select first intron as already ordered by medianCount beforehand
         overlap <- overlaps[i]
         if(is.na(overlap)) return("none") #no overlap with any intron
@@ -147,7 +147,7 @@ annotateIntronReferenceOverlap <- function(fds, txdb, BPPARAM=bpparam()){
 #' @describeIn spliceTypeAnnotations This method annotates the splice event 
 #'     type to junctions in the given results table.
 #' @export
-annotateSpliceEventType <- function(result, txdb, addSpliceType=TRUE, 
+annotateSpliceEventType <- function(result, txdb, fds, addSpliceType=TRUE, 
                                     addUTRoverlap=TRUE, BPPARAM=bpparam()){
     
     # Create basic annotation of overlap with reference
@@ -200,7 +200,7 @@ flagBlacklistRegions <- function(result, blacklist_regions=NULL,
             blacklist_regions)
     }
     print("Importing blacklist regions ...")
-    blacklist_gr <- import(blacklist_regions, format = "BED")
+    blacklist_gr <- rtracklayer::import(blacklist_regions, format = "BED")
     result <- addBlacklistLabels(result, blacklist_gr)  
     return(result)
 }
@@ -260,7 +260,7 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
     psi_positions <- which(junctions_dt$type != "theta")
     colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
     junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions], 
-                                                keep.extra.columns = T)
+                                                keep.extra.columns = TRUE)
     seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr)
     
     introns_tmp <- unique(unlist(intronsByTranscript(txdb)))
@@ -286,7 +286,7 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
     junctions_dt[annotatedJunction == "both" & deltaPsi >= 0, 
                     spliceType := "annotatedIntron_increasedUsage"]
     junctions_dt[annotatedJunction == "both" & deltaPsi < 0, 
-                 spliceType := "annotatedIntron_reducedUsage"]
+                    spliceType := "annotatedIntron_reducedUsage"]
     junctions_dt[annotatedJunction == "both", causesFrameshift := "unlikely"]
     
     # TODO check for intron retention
@@ -342,10 +342,10 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
         })
         maxExpr <- which.max(expre)
         
-        # returns type of exon splicing, frameshift T/F, amount of shift
-        st = compareStarts(junctions_gr, i, overlap[maxExpr], T, 
+        # returns type of exon splicing, frameshift TRUE/FALSE, amount of shift
+        st = compareStarts(junctions_gr, i, overlap[maxExpr], TRUE, 
                             intron_ranges, exons)
-        en = compareEnds(junctions_gr, i, overlap[maxExpr], T, 
+        en = compareEnds(junctions_gr, i, overlap[maxExpr], TRUE, 
                             intron_ranges, exons)
         
         # merge, start and end results
@@ -355,7 +355,7 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
         # if one is notYet -> return notYet
         if((st[1] == "singleExonSkipping" & !(en[1] %in% 
                                 c("singleExonSkipping", "exonSkipping"))) ||
-           (en[1] == "singleExonSkipping" & !(st[1] %in% 
+            (en[1] == "singleExonSkipping" & !(st[1] %in% 
                                 c("singleExonSkipping", "exonSkipping")))){
             ## only one is single exonSkipping, the other is trunc/elong
             if((as.integer(st[3])+as.integer(en[3])) %% 3 != 0){
@@ -446,10 +446,15 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
     none_results <- sapply(nones, function(i){
         if(length(findOverlaps(junctions_gr[i], refseq.genes)) > 0) return(NA)
         #return("intergenic")
-        if(start(refseq.genes[nearest(junctions_gr[i], refseq.genes)]) > start(junctions_gr[i])){
-            ifelse(strand(junctions_gr[i]) == "+", return("upstreamOfNearestGene"), return("downstreamOfNearestGene"))
+        if(start(refseq.genes[nearest(junctions_gr[i], 
+                                    refseq.genes)]) > start(junctions_gr[i])){
+            ifelse(strand(junctions_gr[i]) == "+", 
+                    return("upstreamOfNearestGene"), 
+                    return("downstreamOfNearestGene"))
         }else{
-            ifelse(strand(junctions_gr[i]) == "+", return("downstreamOfNearestGene"), return("upstreamOfNearestGene"))
+            ifelse(strand(junctions_gr[i]) == "+", 
+                    return("downstreamOfNearestGene"), 
+                    return("upstreamOfNearestGene"))
         }
     })
     junctions_dt[thetas[nones], spliceType := none_results]
@@ -462,12 +467,13 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
     down <- which(junctions_dt$spliceType == "downstreamOfNearestGene")
     
     # create full grange object containing psi and theta
-    junctions_gr <- makeGRangesFromDataFrame(junctions_dt, keep.extra.columns = T)
+    junctions_gr <- makeGRangesFromDataFrame(junctions_dt, 
+                                                keep.extra.columns = TRUE)
     
     print("Calculate distances")
     if(length(up) > 0){
         distanceNearestGene_up <- sapply(up, function(i){
-            min(distance(junctions_gr[i], refseq.genes), na.rm = T)})
+            min(distance(junctions_gr[i], refseq.genes), na.rm = TRUE)})
         if(length(distanceNearestGene_up > 0)){
             junctions_dt[psi_positions[up], 
                             distNearestGene := distanceNearestGene_up]
@@ -479,7 +485,7 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
     
     if(length(down) > 0){
         distanceNearestGene_down <- sapply(down, function(i){
-            min(distance(junctions_gr[i], refseq.genes), na.rm = T)})
+            min(distance(junctions_gr[i], refseq.genes), na.rm = TRUE)})
         if(length(distanceNearestGene_down > 0)){
             junctions_dt[psi_positions[down], 
                             distNearestGene := distanceNearestGene_down]
@@ -593,7 +599,7 @@ compareStarts <- function(junctions_gr, i, max_lap, shift_needed,
                     #    elongation 
                     
                     shift = (-1)*(end(exons[exonChoices[1]]) - 
-                                     start(exons[exonChoices[1]]) + 1) + 
+                                        start(exons[exonChoices[1]]) + 1) + 
                         ss_start - start(intron_ranges[secItrChoices[maxExpr]])
                     
                     frs = ifelse(shift %% 3 == 0,"unlikely","likely")
@@ -753,7 +759,7 @@ checkIntergenic <- function(junctions_gr, i, refseq.genes){
     
     # check if distance to nearest is > 1000 -> intergenic
     # otherwise up/downstream
-    dist = min(distance(test_junction, refseq.genes), na.rm = T)
+    dist = min(distance(test_junction, refseq.genes), na.rm = TRUE)
     if(dist > 0){
         # if(dist > 1000){
         #     print("intergenic")
@@ -781,7 +787,7 @@ checkExonSkipping <- function(junctions_dt, txdb){
     psi_positions <- which(junctions_dt$type != "theta")
     colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
     junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions], 
-                                                keep.extra.columns = T)
+                                                keep.extra.columns = TRUE)
     seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr)
     
     refseq.genes<- genes(txdb)
@@ -857,7 +863,7 @@ checkInconclusive <- function(junctions_dt, txdb){
     psi_positions <- which(junctions_dt$type != "theta")
     colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
     junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions], 
-                                                keep.extra.columns = T)
+                                                keep.extra.columns = TRUE)
     seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr)
     
     refseq.genes<- genes(txdb)
diff --git a/man/spliceTypeAnnotations.Rd b/man/spliceTypeAnnotations.Rd
index 17f8332b..17fea92c 100644
--- a/man/spliceTypeAnnotations.Rd
+++ b/man/spliceTypeAnnotations.Rd
@@ -12,6 +12,7 @@ annotateIntronReferenceOverlap(fds, txdb, BPPARAM = bpparam())
 annotateSpliceEventType(
   result,
   txdb,
+  fds,
   addSpliceType = TRUE,
   addUTRoverlap = TRUE,
   BPPARAM = bpparam()
@@ -116,7 +117,7 @@ the given results table.
   res_dt <- as.data.table(res)
   
   # annotate the type of splice event and UTR overlap
-  res_dt <- annotateSpliceEventType(result=res_dt, txdb=txdb)
+  res_dt <- annotateSpliceEventType(result=res_dt, txdb=txdb, fds=fds)
   
   # annotate overlap with blacklist regions
   res_dt <- flagBlacklistRegions(result=res_dt, assemblyVersion="hg19")

From e893f2cd2ced9071df2a6ef24741d4535b05c891 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Tue, 25 Jan 2022 23:08:29 +0100
Subject: [PATCH 09/80] cleanup

---
 DESCRIPTION                  |   1 +
 R/AllGenerics.R              |   5 +
 R/resultAnnotations.R        | 191 ++++++++++++++++++++---------------
 man/spliceTypeAnnotations.Rd |  37 ++++---
 4 files changed, 140 insertions(+), 94 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index f5c8403f..8273914a 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -7,6 +7,7 @@ Authors@R: c(
     person("Christian", "Mertes", role=c("aut", "cre"), 
             email="mertes@in.tum.de"),
     person("Ines", "Scheller",  role=c("aut"), email="scheller@in.tum.de"),
+    person("Karoline", "Lutz", role=c("aut"))
     person("Vicente", "Yepez", role=c("ctb"), email="yepez@in.tum.de"),
     person("Julien", "Gagneur", role=c("aut"), email="gagneur@in.tum.de"))
 Description: Detection of rare aberrant splicing events in transcriptome 
diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index 42986760..68ed8f38 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -654,6 +654,11 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
 
     stopifnot(is(object, "FraserDataSet"))
     stopifnot(all(sampleIDs %in% samples(object)))
+    
+    if("annotatedJunction" %in% colnames(mcols(fds, type="j")) && 
+            !("annotatedJunction" %in% additionalColumns)){
+        additionalColumns <- c(additionalColumns, "annotatedJunction")
+    }
 
     resultsls <- bplapply(psiType, BPPARAM=BPPARAM, function(type){
         message(date(), ": Collecting results for: ", type)
diff --git a/R/resultAnnotations.R b/R/resultAnnotations.R
index 87295444..6ed35e65 100644
--- a/R/resultAnnotations.R
+++ b/R/resultAnnotations.R
@@ -27,7 +27,10 @@
 #' 
 #' \code{\link{flagBlacklistRegions}} flags introns in the results table on 
 #'     whether or not they are located in a blacklist region of the genome. By
-#'     default, the blacklist regions as reported in \cite{...} are used.
+#'     default, the blacklist regions as reported in 
+#'     \cite{Amemiya, Kundaje & Boyle (2019)} and downloaded from 
+#'     \href{https://www.encodeproject.org/annotations/ENCSR636HFF/}{here} 
+#'     are used.
 #'
 #' @param fds A FraserDataSet
 #' @param txdb A txdb object providing the reference annotation.
@@ -38,9 +41,12 @@
 #'     should be added to the results table. Defaults to \code{TRUE}.
 #' @param addUTRoverlap Logical, indicating if the overlap with UTR regions
 #'     should checked and added to the results table. Defaults to \code{TRUE}.
+#' @param minoverlap Integer value defining the number of base pairs around the
+#'     splice site that need to overlap with UTR or blacklist region, 
+#'     respectivly, to be considered matching. Defaults to 5 bp. 
 #' @param blacklist_regions A BED file that contains the blacklist regions. 
 #'     If \code{NULL} (default), the BED files that are packaged with FRASER 
-#'     are used. 
+#'     are used (see Details for more information). 
 #' @param assemblyVersion Indicates the genome assembly version of the intron 
 #'     coordinates. Only used if blacklist_regions is NULL. For other versions, 
 #'     please provide the BED file containing the blacklist regions directly.
@@ -56,9 +62,8 @@
 #'   fds <- createTestFraserDataSet()
 #'   
 #'   # load reference annotation
-#'   requireNamespace("TxDb.Hsapiens.UCSC.hg19.knownGene")
-#'   txdb <- 
-#'     TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene 
+#'   library(TxDb.Hsapiens.UCSC.hg19.knownGene)
+#'   txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene 
 #'     
 #'   # add basic annotations for overlap with the reference annotation
 #'   # run this function before creating the results table
@@ -66,19 +71,16 @@
 #' 
 #'   # extract results: for this small example dataset, only a z score cutoff 
 #'   # of 1 is used to get at least one result. 
-#'   # Make sure to include the additional column in the results table
-#'   res <- results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=NA, 
-#'                   additionalColumns = 'annotatedJunction')
-#'   res_dt <- as.data.table(res)
+#'   res <- results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=NA)
 #'   
 #'   # annotate the type of splice event and UTR overlap
-#'   res_dt <- annotateSpliceEventType(result=res_dt, txdb=txdb, fds=fds)
+#'   res <- annotateSpliceEventType(result=res, txdb=txdb, fds=fds)
 #'   
 #'   # annotate overlap with blacklist regions
-#'   res_dt <- flagBlacklistRegions(result=res_dt, assemblyVersion="hg19")
+#'   res <- flagBlacklistRegions(result=res, assemblyVersion="hg19")
 #'   
 #'   # show results table containing additional annotations
-#'   res_dt
+#'   res
 #'   
 NULL
 
@@ -88,7 +90,7 @@ NULL
 #'     \code{annotatedJunction} in \code{mcols(fds)}. 
 #' @export
 annotateIntronReferenceOverlap <- function(fds, txdb, BPPARAM=bpparam()){
-    print("loading introns")
+    message("loading introns ...")
     #seqlevelsStyle(fds) <- seqlevelsStyle(txdb)[1]
     introns <- unique(unlist(intronsByTranscript(txdb)))
     # reduce the introns to only the actually expressed introns
@@ -99,10 +101,9 @@ annotateIntronReferenceOverlap <- function(fds, txdb, BPPARAM=bpparam()){
     
     # calculate extra columns with mean/median intron expression count
     # add the new columns
-    print("adding median count to introns")
-    sampleCounts <- K(fds_known, type = "j")
+    sampleCounts <- as.matrix(K(fds_known, type = "j"))
     anno_introns[, meanCount := rowMeans(sampleCounts)]
-    anno_introns[, medianCount := rowMedians(as.matrix(sampleCounts))]
+    anno_introns[, medianCount := rowMedians(sampleCounts)]
     # order by medianCount (highest first)
     setorderv(anno_introns, "medianCount", order=-1) 
     anno_introns_ranges <- makeGRangesFromDataFrame(anno_introns, 
@@ -112,9 +113,10 @@ annotateIntronReferenceOverlap <- function(fds, txdb, BPPARAM=bpparam()){
     fds_junctions <- rowRanges(fds, type = "j")
     
     # Do the annotation just for the intron with highest median expression
-    print("start calculating annotations")    
+    message("start calculating basic annotations ...")    
     overlaps <- findOverlaps(fds_junctions, anno_introns_ranges, select="first")
-    annotations <- bplapply(seq_len(length(fds_junctions)), function(i){
+    annotations <- bplapply(seq_len(length(fds_junctions)), 
+            function(i, overlaps, fds_junctions, anno_introns_ranges){
         # only select first intron as already ordered by medianCount beforehand
         overlap <- overlaps[i]
         if(is.na(overlap)) return("none") #no overlap with any intron
@@ -135,12 +137,13 @@ annotateIntronReferenceOverlap <- function(fds, txdb, BPPARAM=bpparam()){
         
         # overlaps but no start/end match
         return("none") 
-    }, BPPARAM=BPPARAM) 
+    }, overlaps=overlaps, fds_junctions=fds_junctions, 
+        anno_introns_ranges=anno_introns_ranges, BPPARAM=BPPARAM) 
     annotations <- unlist(annotations)
     
     rowRanges(fds)$annotatedJunction <- annotations
     mcols(fds, type="ss")$annotatedJunction <- "not computed"
-    print("annotations done")
+    message("basic annotations done")
     return(fds)
 }
 
@@ -148,44 +151,56 @@ annotateIntronReferenceOverlap <- function(fds, txdb, BPPARAM=bpparam()){
 #'     type to junctions in the given results table.
 #' @export
 annotateSpliceEventType <- function(result, txdb, fds, addSpliceType=TRUE, 
-                                    addUTRoverlap=TRUE, BPPARAM=bpparam()){
-    
-    # Create basic annotation of overlap with reference
-    if(!("annotatedJunction" %in% colnames(result))){
-        stop("Column 'annotatedJunction' not found in the results table!\n",
-            "Please run fds <- annotateIntronReferenceOverlap(fds, txdb) ",
-            "first and add it \nto the results table with ",
-            "results(..., additionalColumns = 'annotatedJunction')\n", 
-            "(see examples) before calling this function.")
-    }
+                                    addUTRoverlap=TRUE, minoverlap=5, 
+                                    BPPARAM=bpparam()){
     
     # convert to data.table if not already
     if(!is.data.table(result)){
-        result <- as.data.table(result)
+        annoResult <- as.data.table(result)
+    } else{
+        annoResult <- result
+    }
+    
+    # Create basic annotation of overlap with reference
+    if(!("annotatedJunction" %in% colnames(annoResult))){
+        stop("Column 'annotatedJunction' not found in the results table!\n",
+            "Please run 'fds <- annotateIntronReferenceOverlap(fds, txdb)' ",
+            "first and then extract \nthe results table using the ",
+            "'results(fds, ...)' function before calling this function.")
     }
     
     # Calculate splice types and frameshift
     if(isTRUE(addSpliceType)){
-        result <- addSpliceTypeLabels(result, fds, txdb)
+        annoResult <- addSpliceTypeLabels(annoResult, fds, txdb)
     }
     
     # Add UTR labels
     if(isTRUE(addUTRoverlap)){
-        result <- addUTRLabels(result, txdb)
+        annoResult <- addUTRLabels(annoResult, txdb)
     }
     
-    return(result)
+    if(is(result, "GenomicRanges")){
+        annoResult <- makeGRangesFromDataFrame(annoResult, 
+                                                keep.extra.columns=TRUE)
+    } 
+    
+    return(annoResult)
 }
 
-#' @describeIn spliceTypeAnnotations This method flags blacklist regions in 
-#'     the given results table. 
+#' @describeIn spliceTypeAnnotations This method flags all introns and 
+#'     splice sites in the given results table for which at least one splice 
+#'     site (donor or acceptor) is located in a blacklist region. Blacklist 
+#'     regions of the genome are determined from the provided BED file.  
 #' @export
 flagBlacklistRegions <- function(result, blacklist_regions=NULL,
-                                assemblyVersion=c('hg19', 'hg38')){
+                                assemblyVersion=c('hg19', 'hg38'),
+                                minoverlap=5){
     
     # convert to data.table if not already
     if(!is.data.table(result)){
-        result <- as.data.table(result)
+        annoResult <- as.data.table(result)
+    } else{
+        annoResult <- result
     }
     
     assemblyVersion <- match.arg(assemblyVersion)
@@ -199,55 +214,82 @@ flagBlacklistRegions <- function(result, blacklist_regions=NULL,
         stop("BED file with blacklist regions does not exist: ", 
             blacklist_regions)
     }
-    print("Importing blacklist regions ...")
+    message("Importing blacklist regions ...")
     blacklist_gr <- rtracklayer::import(blacklist_regions, format = "BED")
-    result <- addBlacklistLabels(result, blacklist_gr)  
-    return(result)
+    annoResult <- addBlacklistLabels(annoResult, blacklist_gr)  
+    
+    if(is(result, "GenomicRanges")){
+        annoResult <- makeGRangesFromDataFrame(annoResult, 
+                                                keep.extra.columns=TRUE)
+    }
+    
+    return(annoResult)
 }
 
 ############# helper functions ##############################
 
 #' blacklist annotation for aberrant splicing events  
 #' @noRd
-addBlacklistLabels <- function(junctions_dt, blacklist_gr){
+addBlacklistLabels <- function(junctions_dt, blacklist_gr, minoverlap=5){
     # add the blacklist information
-    print("Set up aberrant splicing granges")
     colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
     junctions_gr <- makeGRangesFromDataFrame(junctions_dt)
     
+    # get gr with start/end positions of each intron
+    gr_start_ss <- junctions_gr
+    end(gr_start_ss) <- start(gr_start_ss) + minoverlap - 1
+    start(gr_start_ss) <- start(gr_start_ss) - minoverlap
+    gr_end_ss <- junctions_gr
+    start(gr_end_ss) <- end(gr_end_ss) - minoverlap + 1
+    end(gr_end_ss) <- end(gr_end_ss) + minoverlap
+    
     # set to the same seqlevelsstyle
     seqlevelsStyle(blacklist_gr) <- seqlevelsStyle(junctions_gr)
     
     ## create overlap with blacklist and annotate extra column
-    print("find blacklist overlap")
-    black_hits <- unique(from(findOverlaps(junctions_gr, blacklist_gr)))
+    message("finding blacklist overlap ...")
+    black_hits_start_ss <- unique(from(findOverlaps(gr_start_ss, blacklist_gr)))
+    black_hits_end_ss <- unique(from(findOverlaps(gr_end_ss, blacklist_gr)))
     junctions_dt[, blacklist := FALSE]
     
-    junctions_dt[black_hits, blacklist := TRUE]
+    junctions_dt[black_hits_start_ss | black_hits_end_ss, blacklist := TRUE]
     colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"
     
-    print("blacklist labels done")
+    message("blacklist labels done")
     return(junctions_dt)
 }
 
 #' adds UTR overlap annotation to results table
 #' @noRd
-addUTRLabels <- function(junctions_dt, txdb){
+addUTRLabels <- function(junctions_dt, txdb, minoverlap=5){
     colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
     junctions_gr <- makeGRangesFromDataFrame(junctions_dt)
     seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr)
+    
+    # get gr with start/end positions of each intron
+    gr_start_ss <- junctions_gr
+    end(gr_start_ss) <- start(gr_start_ss) + minoverlap - 1
+    start(gr_start_ss) <- start(gr_start_ss) - minoverlap
+    gr_end_ss <- junctions_gr
+    start(gr_end_ss) <- end(gr_end_ss) - minoverlap + 1
+    end(gr_end_ss) <- end(gr_end_ss) + minoverlap
+    
     ### UTR labels based on txdb file
     ### add 5' 3' UTR labels
-    print("find UTR overlap")
-    threes <- unique(from(findOverlaps(junctions_gr, 
+    message("finding UTR overlap ...")
+    threes_start <- unique(from(findOverlaps(gr_start_ss, 
+                            threeUTRsByTranscript(txdb, use.names = TRUE))))
+    threes_end <- unique(from(findOverlaps(gr_end_ss, 
                             threeUTRsByTranscript(txdb, use.names = TRUE))))
-    fives <- unique(from(findOverlaps(junctions_gr, 
+    fives_start <- unique(from(findOverlaps(gr_start_ss, 
+                            fiveUTRsByTranscript(txdb, use.names = TRUE))))
+    fives_end <- unique(from(findOverlaps(gr_end_ss, 
                             fiveUTRsByTranscript(txdb, use.names = TRUE))))
     junctions_dt[, UTR_overlap := "no"]
-    junctions_dt[threes, UTR_overlap := "3'-UTR"]
-    junctions_dt[fives, UTR_overlap := "5'-UTR"]
+    junctions_dt[threes_start | threes_end, UTR_overlap := "3'-UTR"]
+    junctions_dt[fives_start | fives_end, UTR_overlap := "5'-UTR"]
     colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"
-    print("UTR labels done")
+    message("UTR labels done")
     return(junctions_dt)
 }
 
@@ -256,7 +298,7 @@ addUTRLabels <- function(junctions_dt, txdb){
 #' adds type of splicing to each intron in the results table
 #' @noRd
 addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
-    print("preparing ...")
+    message("preparing ...")
     psi_positions <- which(junctions_dt$type != "theta")
     colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
     junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions], 
@@ -295,8 +337,8 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
     ends <- which(junctions_dt[psi_positions]$annotatedJunction=="end")
     nones <- which(junctions_dt[psi_positions]$annotatedJunction=="none")
     
-    print("calculating aberrant splice types")
-    print("start junctions")
+    message("calculating splice types ...")
+    # start junctions
     start_results <- sapply(starts, function(i){
         # find the most freq intron that overlaps again
         overlap <- to(findOverlaps(junctions_gr[i], intron_ranges, 
@@ -313,7 +355,7 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
     junctions_dt[psi_positions[starts], 
                     spliceType := start_results[1,]]
     
-    print("end junctions")
+    # end junctions
     end_results <- sapply(ends, function(i){
         # find the most freq intron that overlaps again
         overlap <- to(findOverlaps(junctions_gr[i], intron_ranges, 
@@ -329,7 +371,7 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
     junctions_dt[psi_positions[ends], causesFrameshift:=end_results[2,]]
     junctions_dt[psi_positions[ends], spliceType := end_results[1,]]
     
-    print("none junctions pt1")
+    # none junctions pt1
     none_results <- sapply(nones, function(i){
         # find most freq intron
         # check start and end
@@ -384,7 +426,7 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
     noLaps <-which(junctions_dt[psi_positions]$spliceType=="noOverlap")
     refseq.genes<- genes(txdb)
     
-    print("none junctions pt2")
+    # none junctions pt2
     noLaps_results <- sapply(noLaps, function(i){
         overlap <- to(findOverlaps(junctions_gr[i], exons))
         # no overlap with an intron or an exon
@@ -414,7 +456,6 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
                     spliceType := noLaps_results[1,]]
     
     # theta annotations
-    print("Annotate theta junctions")
     thetas <- which(junctions_dt$type == "theta")
     junctions_gr <- makeGRangesFromDataFrame(junctions_dt[thetas,], 
                                             keep.extra.columns = TRUE)
@@ -441,7 +482,6 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
     junctions_dt[thetas[exonic], spliceType := exonic_results]
     
     # check cases that don't overlap with an exon/intron
-    print("nones")
     nones <- which(is.na(junctions_dt[thetas,]$spliceType))
     none_results <- sapply(nones, function(i){
         if(length(findOverlaps(junctions_gr[i], refseq.genes)) > 0) return(NA)
@@ -458,11 +498,10 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
         }
     })
     junctions_dt[thetas[nones], spliceType := none_results]
-    print("thetas done")
     
     # add distance to closest neighbour gene for intergenic results 
     # (both psi and theta)
-    print("adding distances to nearest gene")
+    message("adding distances to nearest gene ...")
     up <- which(junctions_dt$spliceType == "upstreamOfNearestGene")
     down <- which(junctions_dt$spliceType == "downstreamOfNearestGene")
     
@@ -470,7 +509,7 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
     junctions_gr <- makeGRangesFromDataFrame(junctions_dt, 
                                                 keep.extra.columns = TRUE)
     
-    print("Calculate distances")
+    # Calculate distances
     if(length(up) > 0){
         distanceNearestGene_up <- sapply(up, function(i){
             min(distance(junctions_gr[i], refseq.genes), na.rm = TRUE)})
@@ -479,9 +518,9 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
                             distNearestGene := distanceNearestGene_up]
         } else{
             junctions_dt[psi_positions[up], distNearestGene := NA]
-            print("No distances found for upstream")
+            message("No distances found for upstream")
         }
-    }else{print("No upstream targets")}
+    }else{message("No upstream targets")}
     
     if(length(down) > 0){
         distanceNearestGene_down <- sapply(down, function(i){
@@ -491,12 +530,12 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
                             distNearestGene := distanceNearestGene_down]
         }else{
             junctions_dt[psi_positions[down], distNearestGene := NA]
-            print("No distances found for downstream")
+            message("No distances found for downstream")
         }
-    }else{print("No downstream targets")}
+    }else{message("No downstream targets")}
     
     colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"
-    print("done calculating aberrant splice types")
+    message("done calculating splice types")
     
     # Add the subtypes for exonSkipping and inconclusive
     junctions_dt <- checkExonSkipping(junctions_dt, txdb)
@@ -761,11 +800,6 @@ checkIntergenic <- function(junctions_gr, i, refseq.genes){
     # otherwise up/downstream
     dist = min(distance(test_junction, refseq.genes), na.rm = TRUE)
     if(dist > 0){
-        # if(dist > 1000){
-        #     print("intergenic")
-        #     return(c("intergenic", "unlikely"))
-        # }else{
-        
         # find nearest and compare starts
         if(start(refseq.genes[nearest(junctions_gr[i], 
                                         refseq.genes)]) > start){
@@ -795,7 +829,7 @@ checkExonSkipping <- function(junctions_dt, txdb){
     exonSkip <- which(junctions_dt[psi_positions]$spliceType %in% 
                             c("exonSkipping", "singleExonSkipping"))
     
-    print("start checking exonSkipping")
+    message("start checking exonSkipping")
     newSkip_results <- sapply(exonSkip, function(i){
         start = start(junctions_gr[i]) 
         end = end(junctions_gr[i])
@@ -838,7 +872,7 @@ checkExonSkipping <- function(junctions_dt, txdb){
         return("multigenicSplicing")
     })
     
-    print("checking exonSkipping done")
+    # checking exonSkipping done
     if(length(exonSkip) > 0){
         junctions_dt[psi_positions[exonSkip], 
                         spliceType2 := newSkip_results]
@@ -870,7 +904,6 @@ checkInconclusive <- function(junctions_dt, txdb){
     
     inconclusive <- which(junctions_dt[psi_positions
                                         ]$spliceType == "complex")
-    print("start checking inconclusive")
     
     inconclusive_results <- sapply(inconclusive, function(i){
         start = start(junctions_gr[i]) 
@@ -915,7 +948,7 @@ checkInconclusive <- function(junctions_dt, txdb){
     })
     
     colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"
-    print("done checking inconclusive")
+
     if(length(inconclusive) > 0){
         junctions_dt[psi_positions[inconclusive], 
                         spliceType := inconclusive_results]
diff --git a/man/spliceTypeAnnotations.Rd b/man/spliceTypeAnnotations.Rd
index 17fea92c..b28e3c33 100644
--- a/man/spliceTypeAnnotations.Rd
+++ b/man/spliceTypeAnnotations.Rd
@@ -15,13 +15,15 @@ annotateSpliceEventType(
   fds,
   addSpliceType = TRUE,
   addUTRoverlap = TRUE,
+  minoverlap = 5,
   BPPARAM = bpparam()
 )
 
 flagBlacklistRegions(
   result,
   blacklist_regions = NULL,
-  assemblyVersion = c("hg19", "hg38")
+  assemblyVersion = c("hg19", "hg38"),
+  minoverlap = 5
 )
 }
 \arguments{
@@ -42,9 +44,13 @@ should be added to the results table. Defaults to \code{TRUE}.}
 \item{addUTRoverlap}{Logical, indicating if the overlap with UTR regions
 should checked and added to the results table. Defaults to \code{TRUE}.}
 
+\item{minoverlap}{Integer value defining the number of base pairs around the
+splice site that need to overlap with UTR or blacklist region, 
+respectivly, to be considered matching. Defaults to 5 bp.}
+
 \item{blacklist_regions}{A BED file that contains the blacklist regions. 
 If \code{NULL} (default), the BED files that are packaged with FRASER 
-are used.}
+are used (see Details for more information).}
 
 \item{assemblyVersion}{Indicates the genome assembly version of the intron 
 coordinates. Only used if blacklist_regions is NULL. For other versions, 
@@ -80,7 +86,10 @@ These functions work on the result table and add additional
 
 \code{\link{flagBlacklistRegions}} flags introns in the results table on 
     whether or not they are located in a blacklist region of the genome. By
-    default, the blacklist regions as reported in \cite{...} are used.
+    default, the blacklist regions as reported in 
+    \cite{Amemiya, Kundaje & Boyle (2019)} and downloaded from 
+    \href{https://www.encodeproject.org/annotations/ENCSR636HFF/}{here} 
+    are used.
 }
 \section{Functions}{
 \itemize{
@@ -92,8 +101,10 @@ for the full fds. The overlap type is added as a new column
 \item \code{annotateSpliceEventType}: This method annotates the splice event 
 type to junctions in the given results table.
 
-\item \code{flagBlacklistRegions}: This method flags blacklist regions in 
-the given results table.
+\item \code{flagBlacklistRegions}: This method flags all introns and 
+splice sites in the given results table for which at least one splice 
+site (donor or acceptor) is located in a blacklist region. Blacklist 
+regions of the genome are determined from the provided BED file.
 }}
 
 \examples{
@@ -101,9 +112,8 @@ the given results table.
   fds <- createTestFraserDataSet()
   
   # load reference annotation
-  requireNamespace("TxDb.Hsapiens.UCSC.hg19.knownGene")
-  txdb <- 
-    TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene 
+  library(TxDb.Hsapiens.UCSC.hg19.knownGene)
+  txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene 
     
   # add basic annotations for overlap with the reference annotation
   # run this function before creating the results table
@@ -111,18 +121,15 @@ the given results table.
 
   # extract results: for this small example dataset, only a z score cutoff 
   # of 1 is used to get at least one result. 
-  # Make sure to include the additional column in the results table
-  res <- results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=NA, 
-                  additionalColumns = 'annotatedJunction')
-  res_dt <- as.data.table(res)
+  res <- results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=NA)
   
   # annotate the type of splice event and UTR overlap
-  res_dt <- annotateSpliceEventType(result=res_dt, txdb=txdb, fds=fds)
+  res <- annotateSpliceEventType(result=res, txdb=txdb, fds=fds)
   
   # annotate overlap with blacklist regions
-  res_dt <- flagBlacklistRegions(result=res_dt, assemblyVersion="hg19")
+  res <- flagBlacklistRegions(result=res, assemblyVersion="hg19")
   
   # show results table containing additional annotations
-  res_dt
+  res
   
 }

From 735ed8c55bce40d069549a7fcce4a624f1103c49 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Wed, 26 Jan 2022 00:09:37 +0100
Subject: [PATCH 10/80] fix bioccheck

---
 DESCRIPTION           | 2 +-
 NAMESPACE             | 4 ++++
 R/FRASER-package.R    | 2 +-
 R/resultAnnotations.R | 4 ++--
 4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 8273914a..c008fecb 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -7,7 +7,7 @@ Authors@R: c(
     person("Christian", "Mertes", role=c("aut", "cre"), 
             email="mertes@in.tum.de"),
     person("Ines", "Scheller",  role=c("aut"), email="scheller@in.tum.de"),
-    person("Karoline", "Lutz", role=c("aut"))
+    person("Karoline", "Lutz", role=c("aut")),
     person("Vicente", "Yepez", role=c("ctb"), email="yepez@in.tum.de"),
     person("Julien", "Gagneur", role=c("aut"), email="gagneur@in.tum.de"))
 Description: Detection of rare aberrant splicing events in transcriptome 
diff --git a/NAMESPACE b/NAMESPACE
index 96deb2b5..861b7c2c 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -172,12 +172,16 @@ importFrom(GenomicFeatures,genes)
 importFrom(GenomicFeatures,intronsByTranscript)
 importFrom(GenomicFeatures,makeTxDbFromGFF)
 importFrom(GenomicFeatures,threeUTRsByTranscript)
+importFrom(GenomicRanges,"end<-")
+importFrom(GenomicRanges,"start<-")
 importFrom(GenomicRanges,GRanges)
 importFrom(GenomicRanges,GRangesList)
+importFrom(GenomicRanges,end)
 importFrom(GenomicRanges,findOverlaps)
 importFrom(GenomicRanges,granges)
 importFrom(GenomicRanges,invertStrand)
 importFrom(GenomicRanges,makeGRangesFromDataFrame)
+importFrom(GenomicRanges,start)
 importFrom(HDF5Array,HDF5Array)
 importFrom(HDF5Array,loadHDF5SummarizedExperiment)
 importFrom(HDF5Array,path)
diff --git a/R/FRASER-package.R b/R/FRASER-package.R
index f01fbc3f..bb2e57a5 100644
--- a/R/FRASER-package.R
+++ b/R/FRASER-package.R
@@ -30,7 +30,7 @@
 #'          colData colData<- rowData rowRanges rowRanges<- SummarizedExperiment
 #'          rbind Assays
 #' @importFrom GenomicRanges findOverlaps granges GRanges GRangesList
-#'          makeGRangesFromDataFrame invertStrand
+#'          makeGRangesFromDataFrame invertStrand start end start<- end<-
 #' @importFrom IRanges subsetByOverlaps from to IRanges ranges nearest distance
 #' @importFrom Rsamtools ScanBamParam scanBamHeader bamMapqFilter
 #'          bamWhich bamWhich<- BamFile idxstatsBam
diff --git a/R/resultAnnotations.R b/R/resultAnnotations.R
index 6ed35e65..5afb81e3 100644
--- a/R/resultAnnotations.R
+++ b/R/resultAnnotations.R
@@ -347,7 +347,7 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
             elementMetadata(intron_ranges[j])$medianCount
         })
         maxExpr <- which.max(expre)
-        return(compareEnds(junctions_gr, i, overlap[maxExpr], F, 
+        return(compareEnds(junctions_gr, i, overlap[maxExpr], FALSE, 
                             intron_ranges, exons))
     })
     junctions_dt[psi_positions[starts], 
@@ -364,7 +364,7 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
             elementMetadata(intron_ranges[j])$medianCount
         })
         maxExpr <- which.max(expre)
-        return(compareStarts(junctions_gr, i, overlap[maxExpr], F, 
+        return(compareStarts(junctions_gr, i, overlap[maxExpr], FALSE, 
                                 intron_ranges, exons))
         
     })

From 4a5b72ade085a8c88ef05a58351f0c9aa435dc9f Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Wed, 26 Jan 2022 01:01:28 +0100
Subject: [PATCH 11/80] bugfix

---
 R/resultAnnotations.R | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/R/resultAnnotations.R b/R/resultAnnotations.R
index 5afb81e3..52f1c70b 100644
--- a/R/resultAnnotations.R
+++ b/R/resultAnnotations.R
@@ -252,7 +252,8 @@ addBlacklistLabels <- function(junctions_dt, blacklist_gr, minoverlap=5){
     black_hits_end_ss <- unique(from(findOverlaps(gr_end_ss, blacklist_gr)))
     junctions_dt[, blacklist := FALSE]
     
-    junctions_dt[black_hits_start_ss | black_hits_end_ss, blacklist := TRUE]
+    junctions_dt[black_hits_start_ss, blacklist := TRUE]
+    junctions_dt[black_hits_end_ss, blacklist := TRUE]
     colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"
     
     message("blacklist labels done")
@@ -286,8 +287,10 @@ addUTRLabels <- function(junctions_dt, txdb, minoverlap=5){
     fives_end <- unique(from(findOverlaps(gr_end_ss, 
                             fiveUTRsByTranscript(txdb, use.names = TRUE))))
     junctions_dt[, UTR_overlap := "no"]
-    junctions_dt[threes_start | threes_end, UTR_overlap := "3'-UTR"]
-    junctions_dt[fives_start | fives_end, UTR_overlap := "5'-UTR"]
+    junctions_dt[threes_start, UTR_overlap := "3'-UTR"]
+    junctions_dt[threes_end, UTR_overlap := "3'-UTR"]
+    junctions_dt[fives_start, UTR_overlap := "5'-UTR"]
+    junctions_dt[fives_end, UTR_overlap := "5'-UTR"]
     colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"
     message("UTR labels done")
     return(junctions_dt)

From 8da822ae6685f21e1c1d3355a619e97852f021ce Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Wed, 26 Jan 2022 01:37:27 +0100
Subject: [PATCH 12/80] minor bugfix

---
 R/AllGenerics.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index 68ed8f38..e0a00fd6 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -655,7 +655,7 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
     stopifnot(is(object, "FraserDataSet"))
     stopifnot(all(sampleIDs %in% samples(object)))
     
-    if("annotatedJunction" %in% colnames(mcols(fds, type="j")) && 
+    if("annotatedJunction" %in% colnames(mcols(object, type="j")) && 
             !("annotatedJunction" %in% additionalColumns)){
         additionalColumns <- c(additionalColumns, "annotatedJunction")
     }

From 7346d333c91f6ff58dcbd44b1865e5f61a66910d Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Wed, 26 Jan 2022 16:01:42 +0100
Subject: [PATCH 13/80] cleaner jaccard index computation

---
 R/AllGenerics.R       |   3 +-
 R/calculatePSIValue.R | 239 ++++++++++++++++--------------------------
 2 files changed, 93 insertions(+), 149 deletions(-)

diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index 9349f8b2..17553bc4 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -885,7 +885,8 @@ setMethod("results", "FraserDataSet", function(object,
                     sampleIDs=samples(object), padjCutoff=0.05,
                     zScoreCutoff=NA, deltaPsiCutoff=0.3,
                     rhoCutoff=0.1, aggregate=FALSE, collapse=FALSE,
-                    minCount=5, psiType=c("psi3", "psi5", "theta"),
+                    minCount=5, psiType=c("psi3", "psi5", "theta", 
+                                            "intron_jaccard"),
                     geneColumn="hgnc_symbol",
                     additionalColumns=NULL, BPPARAM=bpparam()){
     FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff,
diff --git a/R/calculatePSIValue.R b/R/calculatePSIValue.R
index be0b8dc2..6cf2f17b 100644
--- a/R/calculatePSIValue.R
+++ b/R/calculatePSIValue.R
@@ -37,7 +37,8 @@ calculatePSIValues <- function(fds, types=psiTypes, overwriteCts=FALSE,
     }
     
     # calculate intron jaccard index
-    fds <- calculateIntronJaccardIndex(fds)
+    fds <- calculateIntronNonsplitSum(fds, overwriteCts=overwriteCts)
+    fds <- calculateJaccardIntronIndex(fds, overwriteCts=overwriteCts)
     
     # calculate the delta psi value
     for(psiType in types){
@@ -326,155 +327,97 @@ getOtherCountsCacheFolder <- function(fds){
 }
 
 #'
-#' calculates the intron jaccard index values for all junctions
-#' @inheritParams countRNA
-#' 
-calculateIntronJaccardIndex <- function(fds){
-    # retrieve junction and splice site annotation with count information
-    junction_dt <- data.table(
-        as.data.table(rowRanges(fds, type="j"))[,.(seqnames, start, end, 
-                                                   strand, startID, endID)], 
-        as.matrix(K(fds, type="psi5")))
-    ss_dt <- data.table(
-        as.data.table(rowRanges(fds, type="ss"))[,.(seqnames, start, end, 
-                                                    strand, spliceSiteID)], 
-        as.matrix(N(fds, type="theta")))
-    
-    # melt to have one row per sample - junction combination
-    junction_dt <- melt(junction_dt, variable.name="sampleID", value.name="k", 
-                        id.vars=c("seqnames", "start", "end", "strand", 
-                                  "startID", "endID"))
-    ss_dt <- melt(ss_dt, variable.name="sampleID", value.name="n", 
-                  id.vars=c("seqnames", "start", "end", "strand", 
-                            "spliceSiteID"))
-    
-    # merge junction information with splice site annotation (theta)
-    junction_dt <- merge(junction_dt, ss_dt[,.(spliceSiteID, sampleID, n)], 
-                         all.x=TRUE, by.x=c("startID", "sampleID"), 
-                         by.y=c("spliceSiteID", "sampleID"), sort=FALSE)
-    setnames(junction_dt, "n", "n_donor")
-    junction_dt <- merge(junction_dt, ss_dt[,.(spliceSiteID, sampleID, n)], 
-                         all.x=TRUE, by.x=c("endID", "sampleID"), 
-                         by.y=c("spliceSiteID", "sampleID"), sort=FALSE)
-    setnames(junction_dt, "n", "n_acceptor")
-    rm(ss_dt)
-    gc()
-    
-    # TODO deal with missing endIDs (why do we have them? lost in filtering?)
-    # for now: replace n_donor with N from psi5 (and same for n_acceptor and psi3)
-    junction_dt_nacceptor <- data.table(
-        as.data.table(rowRanges(fds, type="j"))[,.(startID, endID)], 
-        as.matrix(N(fds, type="psi3")))
-    junction_dt_nacceptor <- melt(junction_dt_nacceptor, 
-                        variable.name="sampleID", value.name="n_psi3", 
-                        id.vars=c("startID", "endID"))
-    junction_dt[, n_psi3:=junction_dt_nacceptor[,n_psi3]]
-    rm(junction_dt_nacceptor)
-    gc()
-    junction_dt_ndonor <- data.table(
-        as.data.table(rowRanges(fds, type="j"))[,.(startID, endID)], 
-        as.matrix(N(fds, type="psi5")))
-    junction_dt_ndonor <- melt(junction_dt_ndonor, 
-                               variable.name="sampleID", value.name="n_psi5", 
-                               id.vars=c("startID", "endID"))
-    junction_dt[, n_psi5:=junction_dt_ndonor[,n_psi5]]
-    rm(junction_dt_ndonor)
-    gc()
-    
-    # replace n (with non-split counts) by n_psi3/n_psi5 if NA
-    junction_dt[is.na(n_acceptor), n_acceptor:=n_psi3]
-    junction_dt[is.na(n_donor), n_donor:=n_psi5]
-    junction_dt[n_acceptor < n_psi3, n_acceptor:=n_psi3]
-    junction_dt[n_donor < n_psi5, n_donor:=n_psi5]
-    
-    # calculate intron_jaccard
-    junction_dt[, denominator:=(n_donor + n_acceptor - k)]
-    junction_dt[, intron_jaccard:= k / denominator]
-    junction_dt[is.nan(intron_jaccard), intron_jaccard:=1] # n_donor = n_acceptor = 0
-    
-    # convert to matrix to store it as assay in the fds
-    intron_jaccard <- matrix(junction_dt[,intron_jaccard], nrow=nrow(fds), ncol=ncol(fds), byrow=FALSE)
-    rownames(intron_jaccard) <- rownames(fds)
-    colnames(intron_jaccard) <- colnames(fds)
-    assay(fds, type="j", "intron_jaccard", withDimnames=FALSE) <- intron_jaccard
-    
-    # store denominator
-    jaccard_denom <- matrix(junction_dt[,denominator], nrow=nrow(fds), ncol=ncol(fds), byrow=FALSE)
-    rownames(jaccard_denom) <- rownames(fds)
-    colnames(jaccard_denom) <- colnames(fds)
-    assay(fds, type="j", "rawOtherCounts_intron_jaccard", 
-            withDimnames=FALSE) <- jaccard_denom - matrix(junction_dt[,k], 
-                                                            nrow=nrow(fds), 
-                                                            ncol=ncol(fds), 
-                                                            byrow=FALSE)
+#' calculates the jaccard intron value for the given junctions
+#'
+#' @noRd
+calculateJaccardIntronIndex <- function(fds, overwriteCts){
+    stopifnot(is(fds, "FraserDataSet"))
+    
+    message(date(), ": Calculate the Jaccard Intron values ...")
+    
+    # check if we have computed N_psi3, N_psi5 and K_nonsplit already
+    if(!all(c(paste0("rawOtherCounts_psi", c(5, 3)), "rawCountsJnonsplit") %in% 
+            assayNames(fds))){
+        stop("Please calculate N_psi3, N_psi5 and K_nonsplit first before ", 
+                "calling this function.")
+    }
+    
+    # calculate intron jaccard value
+    jaccard_denom <- N(fds, "psi3") + N(fds, "psi5") + 
+                        assay(fds, "rawCountsJnonsplit") - K(fds, type="j")
+    jaccardValues <- K(fds, type="j") / jaccard_denom 
+    otherCounts_jaccard <- jaccard_denom - K(fds, type="j")
+    
+    # TODO also calculate it with nonsplit counts in the nominator 
+    
+    # assign it to our object
+    assay(fds, type="j", "intron_jaccard", withDimnames=FALSE) <- jaccardValues
+    
+    if(isTRUE(overwriteCts) || 
+            !("rawOtherCounts_intron_jaccard" %in% assayNames(fds))){
+        assay(fds, type="j", "rawOtherCounts_intron_jaccard", 
+                withDimnames=FALSE) <- otherCounts_jaccard
+    }
+    
     return(fds)
 }
 
-calculatePhi <- function(fds){
-    # retrieve junction and splice site annotation with count information
-    junction_dt <- data.table(
-        as.data.table(rowRanges(fds, type="j"))[,.(seqnames, start, end, 
-                                                   strand, startID, endID)], 
-        as.matrix(K(fds, type="psi5")))
-    ss_dt <- data.table(
-        as.data.table(rowRanges(fds, type="ss"))[,.(seqnames, start, end, 
-                                                    strand, spliceSiteID)], 
-        as.matrix(N(fds, type="theta")))
-    
-    # melt to have one row per sample - junction combination
-    junction_dt <- melt(junction_dt, variable.name="sampleID", value.name="k", 
-                        id.vars=c("seqnames", "start", "end", "strand", 
-                                  "startID", "endID"))
-    ss_dt <- melt(ss_dt, variable.name="sampleID", value.name="n", 
-                  id.vars=c("seqnames", "start", "end", "strand", 
-                            "spliceSiteID"))
-    
-    # merge junction information with splice site annotation (theta)
-    junction_dt <- merge(junction_dt, ss_dt[,.(spliceSiteID, sampleID, n)], 
-                         all.x=TRUE, by.x=c("startID", "sampleID"), 
-                         by.y=c("spliceSiteID", "sampleID"), sort=FALSE)
-    setnames(junction_dt, "n", "n_donor")
-    junction_dt <- merge(junction_dt, ss_dt[,.(spliceSiteID, sampleID, n)], 
-                         all.x=TRUE, by.x=c("endID", "sampleID"), 
-                         by.y=c("spliceSiteID", "sampleID"), sort=FALSE)
-    setnames(junction_dt, "n", "n_acceptor")
-    rm(ss_dt)
-    gc()
-    
-    # deal with missing endIDs
-    junction_dt_nacceptor <- data.table(
-        as.data.table(rowRanges(fds, type="j"))[,.(startID, endID)], 
-        as.matrix(N(fds, type="psi3")))
-    junction_dt_nacceptor <- melt(junction_dt_nacceptor, 
-                                  variable.name="sampleID", value.name="n_psi3", 
-                                  id.vars=c("startID", "endID"))
-    junction_dt[, n_psi3:=junction_dt_nacceptor[,n_psi3]]
-    rm(junction_dt_nacceptor)
-    gc()
-    junction_dt_ndonor <- data.table(
-        as.data.table(rowRanges(fds, type="j"))[,.(startID, endID)], 
-        as.matrix(N(fds, type="psi5")))
-    junction_dt_ndonor <- melt(junction_dt_ndonor, 
-                               variable.name="sampleID", value.name="n_psi5", 
-                               id.vars=c("startID", "endID"))
-    junction_dt[, n_psi5:=junction_dt_ndonor[,n_psi5]]
-    rm(junction_dt_ndonor)
-    gc()
-    
-    # replace n (with non-split counts) by n_psi3/n_psi5 if NA
-    junction_dt[is.na(n_acceptor), n_acceptor:=n_psi3]
-    junction_dt[is.na(n_donor), n_donor:=n_psi5]
-    junction_dt[n_acceptor < n_psi3, n_acceptor:=n_psi3]
-    junction_dt[n_donor < n_psi5, n_donor:=n_psi5]
-    
-    # calculate phi
-    junction_dt[, phi:= k / ((n_donor + n_acceptor)/2)]
-    junction_dt[is.nan(phi), phi:=0] # n_donor = n_acceptor = 0
-    
-    # convert to matrix to store it as assay in the fds
-    phi <- matrix(junction_dt[,phi], nrow=nrow(fds), ncol=ncol(fds), byrow=FALSE)
-    rownames(phi) <- rownames(fds)
-    colnames(phi) <- colnames(fds)
-    assay(fds, "phi", type="psi5", withDimnames=FALSE) <- phi
+#' Calculates the sum of nonsplit reads overlapping either the donor or acceptor 
+#' splice site and stores it as a new assay (one value for each junction and 
+#' sample).
+#' 
+#' @noRd
+calculateIntronNonsplitSum <- function(fds, overwriteCts){
+    stopifnot(is(fds, "FraserDataSet"))
+    
+    message(date(), ": Calculate the total nonsplict counts for each intron ", 
+                "...")
+    
+    
+    # get splice site nonsplit counts
+    nsr_ss <- K(fds, "theta")
+    
+    # retrieve junction and splice site annotation
+    junction_dt <- as.data.table(rowRanges(fds, type="j"))[,
+                                                    .(seqnames, start, end, 
+                                                    strand, startID, endID)]
+    ss_map <- data.table(spliceSiteID=rowRanges(fds, type="ss")$spliceSiteID, 
+                            nsr_idx=seq_len(nrow(nonSplicedReads(fds))))
+    
+    junction_dt <- merge(junction_dt, ss_map, 
+                            by.x="startID", by.y="spliceSiteID",
+                            all.x=TRUE)
+    setnames(junction_dt, "nsr_idx", "start_idx")
+    junction_dt <- merge(junction_dt, ss_map, 
+                            by.x="endID", by.y="spliceSiteID", 
+                            all.x=TRUE)
+    setnames(junction_dt, "nsr_idx", "end_idx")
+    
+    # for each junction, find the two rows in K_theta corresponding to its 
+    # donor and acceptor splice site
+    donor_sites <-  junction_dt$start_idx
+    acc_sites <- junction_dt$end_idx
+    nsr_donor <- nsr_ss[donor_sites,]
+    nsr_acc <- nsr_ss[acc_sites,]
+    
+    # set nsr counts to 0 for junctions for which no mapping by spliceSiteID 
+    # could be found
+    nsr_donor[is.na(nsr_donor)] <- 0
+    nsr_acc[is.na(nsr_acc)] <- 0
+    
+    # sum them
+    nsr_j <- nsr_donor + nsr_acc
+    
+    if(nrow(nsr_j) != nrow(fds)){
+        warning("Unequal number of junctions in fds and junctions with ",
+                "computed nonsplit count sum!")
+    }
+    
+    # assign it to our object
+    if(isTRUE(overwriteCts) || 
+            !("rawCountsJnonsplit" %in% assayNames(fds))){
+        assay(fds, type="j", "rawCountsJnonsplit", withDimnames=FALSE) <- nsr_j
+    }
+    
     return(fds)
 }

From f1a9d099effd799ca9bd3a70efa58af3d40c057c Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Tue, 29 Mar 2022 16:14:57 +0200
Subject: [PATCH 14/80] adaptions to using jaccard metric

---
 R/AllGenerics.R              | 10 ++++++----
 R/calculatePSIValue.R        | 22 +++++++++++++---------
 R/filterExpression.R         | 19 +++++++++++--------
 R/find_encoding_dimensions.R |  2 +-
 R/helper-functions.R         |  6 +++---
 R/makeSimulatedDataset.R     |  8 ++++++--
 R/plotMethods.R              | 13 ++++++++-----
 R/pvalsNzscore.R             |  4 ++++
 R/variables.R                |  2 +-
 9 files changed, 53 insertions(+), 33 deletions(-)

diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index 17553bc4..73ebb087 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -537,7 +537,7 @@ setMethod("counts", "FraserDataSet", function(object, type=NULL,
     type <- whichPSIType(type)
     if(length(type) == 0 | length(type) > 1){
         stop(paste0("Please provide a correct psi type: psi5, psi3, ",
-                    "theta or intron_jaccard. Not the given one: '", 
+                    "theta or jaccard. Not the given one: '", 
                     type, "'."))
     }
     aname <- paste0("rawOtherCounts_", type)
@@ -724,8 +724,10 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
             # extract values
             rawCts       <- as.matrix(K(tmp_x))
             rawTotalCts  <- as.matrix(N(tmp_x))
-            pvals        <- as.matrix(pVals(tmp_x))
-            padjs        <- as.matrix(padjVals(tmp_x))
+            pvals        <- as.matrix(pVals(tmp_x, 
+                                            filters=list(rho=rhoCutoff)))
+            padjs        <- as.matrix(padjVals(tmp_x, 
+                                            filters=list(rho=rhoCutoff)))
             zscores      <- as.matrix(zScores(tmp_x))
             psivals      <- as.matrix(assay(tmp_x, type))
             muPsi        <- as.matrix(predictedMeans(tmp_x))
@@ -886,7 +888,7 @@ setMethod("results", "FraserDataSet", function(object,
                     zScoreCutoff=NA, deltaPsiCutoff=0.3,
                     rhoCutoff=0.1, aggregate=FALSE, collapse=FALSE,
                     minCount=5, psiType=c("psi3", "psi5", "theta", 
-                                            "intron_jaccard"),
+                                            "jaccard"),
                     geneColumn="hgnc_symbol",
                     additionalColumns=NULL, BPPARAM=bpparam()){
     FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff,
diff --git a/R/calculatePSIValue.R b/R/calculatePSIValue.R
index 6cf2f17b..5fb07fad 100644
--- a/R/calculatePSIValue.R
+++ b/R/calculatePSIValue.R
@@ -351,11 +351,11 @@ calculateJaccardIntronIndex <- function(fds, overwriteCts){
     # TODO also calculate it with nonsplit counts in the nominator 
     
     # assign it to our object
-    assay(fds, type="j", "intron_jaccard", withDimnames=FALSE) <- jaccardValues
+    assay(fds, type="j", "jaccard", withDimnames=FALSE) <- jaccardValues
     
     if(isTRUE(overwriteCts) || 
-            !("rawOtherCounts_intron_jaccard" %in% assayNames(fds))){
-        assay(fds, type="j", "rawOtherCounts_intron_jaccard", 
+            !("rawOtherCounts_jaccard" %in% assayNames(fds))){
+        assay(fds, type="j", "rawOtherCounts_jaccard", 
                 withDimnames=FALSE) <- otherCounts_jaccard
     }
     
@@ -381,6 +381,7 @@ calculateIntronNonsplitSum <- function(fds, overwriteCts){
     junction_dt <- as.data.table(rowRanges(fds, type="j"))[,
                                                     .(seqnames, start, end, 
                                                     strand, startID, endID)]
+    junction_dt[, j_idx:=seq_len(.N)]
     ss_map <- data.table(spliceSiteID=rowRanges(fds, type="ss")$spliceSiteID, 
                             nsr_idx=seq_len(nrow(nonSplicedReads(fds))))
     
@@ -395,15 +396,18 @@ calculateIntronNonsplitSum <- function(fds, overwriteCts){
     
     # for each junction, find the two rows in K_theta corresponding to its 
     # donor and acceptor splice site
-    donor_sites <-  junction_dt$start_idx
-    acc_sites <- junction_dt$end_idx
-    nsr_donor <- nsr_ss[donor_sites,]
-    nsr_acc <- nsr_ss[acc_sites,]
+    donor_sites <-  junction_dt[!is.na(start_idx),]
+    acc_sites <- junction_dt[!is.na(end_idx),]
     
     # set nsr counts to 0 for junctions for which no mapping by spliceSiteID 
     # could be found
-    nsr_donor[is.na(nsr_donor)] <- 0
-    nsr_acc[is.na(nsr_acc)] <- 0
+    nsr_donor <- matrix(0, nrow=nrow(fds), ncol=ncol(fds))
+    nsr_acc <- matrix(0, nrow=nrow(fds), ncol=ncol(fds))
+    
+    nsr_donor[donor_sites[,j_idx],] <- 
+        as.matrix(nsr_ss[donor_sites[,start_idx],])
+    nsr_acc[acc_sites[,j_idx],] <- 
+        as.matrix(nsr_ss[acc_sites[,end_idx],])
     
     # sum them
     nsr_j <- nsr_donor + nsr_acc
diff --git a/R/filterExpression.R b/R/filterExpression.R
index 7d8d2806..1dee404b 100644
--- a/R/filterExpression.R
+++ b/R/filterExpression.R
@@ -382,6 +382,9 @@ filterExpressionAndVariability_jaccard <- function(object,
     
 }
 
+#' @describeIn filtering This function filters out introns and corresponding 
+#' splice sites which are expressed at very low levels across samples.
+#' @export
 filterExpression_jaccard <- function(object, minExpressionInOneSample=20,
                             quantile=0.95, quantileMinExpression=1, filter=TRUE, 
                             delayed=ifelse(ncol(object) <= 300, FALSE, TRUE),
@@ -393,7 +396,7 @@ filterExpression_jaccard <- function(object, minExpressionInOneSample=20,
     
     # extract counts
     cts  <- K(object, type="j")
-    ctsN <- N(object, type="intron_jaccard")
+    ctsN <- N(object, type="jaccard")
     
     if(isFALSE(delayed)){
         cts <- as.matrix(cts)
@@ -451,17 +454,17 @@ filterVariability_jaccard <- function(object, minDelta=0, filter=TRUE,
     
     # extract counts
     cts    <- K(object, type="j")
-    ctsN  <- N(object, type="intron_jaccard")
+    ctsN  <- N(object, type="jaccard")
     
     if(isFALSE(delayed)){
         cts <- as.matrix(cts)
-        ctsN <- as.matrix(ctsN5)
+        ctsN <- as.matrix(ctsN)
     }
     
     # cutoff functions
-    f1 <- function(cts, ctsN3, ...) {
-        intron_jaccard <- cts/ctsN
-        rowMaxs(abs(intron_jaccard - rowMeans2(intron_jaccard, na.rm=TRUE)), 
+    f1 <- function(cts, ctsN, ...) {
+        jaccard <- cts/ctsN
+        rowMaxs(abs(jaccard - rowMeans2(jaccard, na.rm=TRUE)), 
                 na.rm=TRUE) }
     
     funs <- c(maxDJaccard=f1)
@@ -520,7 +523,7 @@ applyExpressionFilters_jaccard <- function(fds, minExpressionInOneSample,
         for(aname in assayNames(rareJunctions)){
             if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5", 
                               "rawOtherCounts_psi3", "psi5", "psi3", 
-                              "delta_psi5", "delta_psi3", "intron_jaccard",
+                              "delta_psi5", "delta_psi3", "jaccard",
                               "rawOtherCounts_intron_jaccard"))){
                 assay(rareJunctions, aname) <- NULL
             }
@@ -583,7 +586,7 @@ applyVariabilityFilters <- function(fds, minDelta){
         for(aname in assayNames(nonVariableJunctions)){
             if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5", 
                               "rawOtherCounts_psi3", "psi5", "psi3", 
-                              "delta_psi5", "delta_psi3", "intron_jaccard",
+                              "delta_psi5", "delta_psi3", "jaccard",
                               "rawOtherCounts_intron_jaccard"))){
                 assay(nonVariableJunctions, aname) <- NULL
             }
diff --git a/R/find_encoding_dimensions.R b/R/find_encoding_dimensions.R
index 6653fcaa..af5be75a 100644
--- a/R/find_encoding_dimensions.R
+++ b/R/find_encoding_dimensions.R
@@ -125,7 +125,7 @@ findEncodingDim <- function(i, fds, type, params, implementation,
 optimHyperParams <- function(fds, type, implementation="PCA",
                     q_param=seq(2, min(40, ncol(fds)), by=3),
                     noise_param=0, minDeltaPsi=0.1,
-                    iterations=5, setSubset=15000, injectFreq=1e-2,
+                    iterations=5, setSubset=50000, injectFreq=1e-2,
                     BPPARAM=bpparam(), internalThreads=1, plot=TRUE, 
                     delayed=ifelse(ncol(fds) <= 300, FALSE, TRUE), ...){
     if(isFALSE(needsHyperOpt(implementation))){
diff --git a/R/helper-functions.R b/R/helper-functions.R
index 69466b74..007390f9 100644
--- a/R/helper-functions.R
+++ b/R/helper-functions.R
@@ -79,7 +79,7 @@ checkReadType <- function(fds, type){
     }
     type <- unique(type)
     stopifnot(isScalarCharacter(type))
-    correctTypes <- c(psi3="j", psi5="j", theta="ss", intron_jaccard="j")
+    correctTypes <- c(psi3="j", psi5="j", theta="ss", jaccard="j")
 
     # check if it is already the correct type
     if(type %in% correctTypes) return(type)
@@ -109,7 +109,7 @@ checkReadType <- function(fds, type){
 #'
 #' @noRd
 whichPSIType <- function(type){
-    unlist(regmatches(type, gregexpr("psi(3|5)|theta|intron_jaccard", type, perl=TRUE)))
+    unlist(regmatches(type, gregexpr("psi(3|5)|theta|jaccard", type, perl=TRUE)))
 }
 
 #'
@@ -123,7 +123,7 @@ whichReadType <- function(fds, name){
     if(name == "ss" | endsWith(name, "theta"))
         return("ss")
     if(name == "j"  | endsWith(name, "psi5") | endsWith(name, "psi3") | 
-            endsWith(name, "intron_jaccard"))
+            endsWith(name, "jaccard"))
         return("j")
 
     # check assay names
diff --git a/R/makeSimulatedDataset.R b/R/makeSimulatedDataset.R
index 95331d19..5b6f7531 100644
--- a/R/makeSimulatedDataset.R
+++ b/R/makeSimulatedDataset.R
@@ -437,7 +437,7 @@ makeSimulatedFraserDataSet_Multinomial <- function(m=200, j=1000, q=10,
 #' fds <- makeSimulatedFraserDataSet()
 #' fds <- injectOutliers(fds, minDpsi=0.2, freq=1E-3)
 #' @export
-injectOutliers <- function(fds, type=c("psi5", "psi3", "theta"),
+injectOutliers <- function(fds, type=c("psi5", "psi3", "theta", "jaccard"),
                     freq=1E-3, minDpsi=0.2, minCoverage=2,
                     deltaDistr="uniformDistr", verbose=FALSE,
                     method=c('samplePSI', 'meanPSI', 'simulatedPSI'),
@@ -472,6 +472,9 @@ injectOutliers <- function(fds, type=c("psi5", "psi3", "theta"),
         setAssayMatrix(fds, type="psi3", "originalOtherCounts",
                 withDimnames=FALSE) <- 
                         counts(fds, type="psi3", side="other")
+        setAssayMatrix(fds, type="jaccard", "originalOtherCounts",
+                       withDimnames=FALSE) <- 
+                        counts(fds, type="jaccard", side="other")
     }
 
     # get infos from the fds
@@ -500,7 +503,8 @@ injectOutliers <- function(fds, type=c("psi5", "psi3", "theta"),
     dt[,groupSize:=.N, by=groupID]
 
     # Get groups where outlier can be injected
-    available_groups <- dt[groupSize > ifelse(type == "theta", 0, 1), 
+    available_groups <- dt[groupSize > ifelse(type == "theta" | 
+                                                type == "jaccard", 0, 1), 
                             unique(groupID)]
     
     # e.g. for psi3/5: no donor/acceptor 
diff --git a/R/plotMethods.R b/R/plotMethods.R
index 13e22bc3..e18abe55 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -178,7 +178,7 @@ NULL
 
 
 plotVolcano.FRASER <- function(object, sampleID, 
-                    type=c("psi3", "psi5", "theta", "intron_jaccard"), basePlot=TRUE, 
+                    type=c("psi3", "psi5", "theta", "jaccard"), basePlot=TRUE, 
                     aggregate=FALSE, main=NULL, label=NULL,
                     deltaPsiCutoff=0.3, padjCutoff=0.1, ...){
     
@@ -278,7 +278,7 @@ setMethod("plotVolcano", signature="FraserDataSet", plotVolcano.FRASER)
 
 
 plotAberrantPerSample.FRASER <- function(object, main, 
-                    type=c("psi3", "psi5", "theta", "intron_jaccard"),
+                    type=c("psi3", "psi5", "theta", "jaccard"),
                     padjCutoff=0.1, zScoreCutoff=NA, deltaPsiCutoff=0.3,
                     aggregate=TRUE, BPPARAM=bpparam(), ...){
 
@@ -341,7 +341,7 @@ setMethod("plotAberrantPerSample", signature="FraserDataSet",
 #'
 #' @rdname plotFunctions
 #' @export
-plotExpression <- function(fds, type=c("psi5", "psi3", "theta", "intron_jaccard"),
+plotExpression <- function(fds, type=c("psi5", "psi3", "theta", "jaccard"),
                     site=NULL, result=NULL, colGroup=NULL, 
                     basePlot=TRUE, main=NULL, label="aberrant", ...){
     if(!is.null(result)){
@@ -433,7 +433,7 @@ plotExpression <- function(fds, type=c("psi5", "psi3", "theta", "intron_jaccard"
 #'
 #' @rdname plotFunctions
 #' @export
-plotExpectedVsObservedPsi <- function(fds, type=c("psi5", "psi3", "theta", "intron_jaccard"),
+plotExpectedVsObservedPsi <- function(fds, type=c("psi5", "psi3", "theta", "jaccard"),
                     idx=NULL, result=NULL, colGroup=NULL, main=NULL,
                     basePlot=TRUE, label="aberrant", ...){
     type <- match.arg(type)
@@ -696,7 +696,8 @@ plotQQ.FRASER <- function(object, type=NULL, idx=NULL, result=NULL,
 setMethod("plotQQ", signature="FraserDataSet", plotQQ.FRASER)
 
 
-plotEncDimSearch.FRASER <- function(object, type=c("psi3", "psi5", "theta"), 
+plotEncDimSearch.FRASER <- function(object, 
+                    type=c("psi3", "psi5", "theta", "jaccard"), 
                     plotType=c("auc", "loss")){
     type <- match.arg(type)
     plotType <- match.arg(plotType)
@@ -1085,6 +1086,7 @@ ggplotLabelPsi <- function(type, asCharacter=FALSE){
     if(isFALSE(asCharacter)){
         vapply(type, FUN=function(x)
             switch (x,
+                    jaccard = c(bquote(jaccard~intron~index)),
                     psi5 = c(bquote(psi[5])),
                     psi3 = c(bquote(psi[3])),
                     theta = c(bquote(theta))),
@@ -1092,6 +1094,7 @@ ggplotLabelPsi <- function(type, asCharacter=FALSE){
     } else{
         vapply(type, FUN=function(x)
             switch (x,
+                    jaccard = "jaccard-intron-index",
                     psi5 = "psi[5]",
                     psi3 = "psi[3]",
                     theta = "theta"),
diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R
index 58d60a8b..b04f0494 100644
--- a/R/pvalsNzscore.R
+++ b/R/pvalsNzscore.R
@@ -336,6 +336,10 @@ getSiteIndex <- function(fds, type){
         return(mcols(fds, type=type)[['spliceSiteID']])
     }
     
+    if(type == "jaccard"){
+        return(seq_len(nrow(fds)))
+    }
+    
     startId <- mcols(fds, type=type)[,"startID"]
     endId   <- mcols(fds, type=type)[,"endID"]
     strand  <- strand(rowRanges(fds, type=type))
diff --git a/R/variables.R b/R/variables.R
index 74bb6d91..a159d2ee 100644
--- a/R/variables.R
+++ b/R/variables.R
@@ -6,7 +6,7 @@
 #'   psiTypes
 #' 
 #' @export
-psiTypes <- c("psi5", "psi3", "theta", "intron_jaccard")
+psiTypes <- c("psi5", "psi3", "theta", "jaccard")
 # psiTypes <- c("psi5", "psi3", "theta")
 names(psiTypes) <- psiTypes
 

From 14f0dbb28d2ce4d4a2071551d017c708feb33d16 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Thu, 31 Mar 2022 20:40:17 +0200
Subject: [PATCH 15/80] updated c++ code

---
 R/RcppExports.R                   |  4 +++
 R/updateRho.R                     | 33 ++++++++++++++++++++-----
 src/RcppExports.cpp               | 17 ++++++++++++-
 src/loss_n_gradient_functions.cpp | 41 ++++++++++++++++++++++++++++++-
 4 files changed, 87 insertions(+), 8 deletions(-)

diff --git a/R/RcppExports.R b/R/RcppExports.R
index 659f3b4c..42ec8fbc 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -41,6 +41,10 @@ truncNLL_rho <- function(rho, yi, ki, ni) {
     .Call('_FRASER_truncNLL_rho', PACKAGE = 'FRASER', rho, yi, ki, ni)
 }
 
+truncNLL_rho_penalized <- function(logit_rho, yi, ki, ni, lambda) {
+    .Call('_FRASER_truncNLL_rho_penalized', PACKAGE = 'FRASER', logit_rho, yi, ki, ni, lambda)
+}
+
 fullNLL <- function(y, rho, k, n, D, lambda, byRows = FALSE) {
     .Call('_FRASER_fullNLL', PACKAGE = 'FRASER', y, rho, k, n, D, lambda, byRows)
 }
diff --git a/R/updateRho.R b/R/updateRho.R
index f5a0d429..2348c059 100644
--- a/R/updateRho.R
+++ b/R/updateRho.R
@@ -8,11 +8,14 @@ updateRho <- function(fds, type, rhoRange, BPPARAM, verbose){
     n <- N(fds)
     y <- predictY(fds, noiseAlpha=currentNoiseAlpha(fds))
     
-    fitparameters <- bplapply(seq_len(nrow(k)), estRho, nll=truncNLL_rho,
-            k=k, n=n, y=y, rhoRange=rhoRange, BPPARAM=BPPARAM)
+    # fitparameters <- bplapply(seq_len(nrow(k)), estRho, nll=truncNLL_rho,
+    #         k=k, n=n, y=y, rhoRange=rhoRange, BPPARAM=BPPARAM)
+    fitparameters <- bplapply(seq_len(nrow(k)), estRho, nll=trunc_negLogLikelihoodRho_penalized,
+                              k=k, n=n, y=y, rhoRange=rhoRange, lambda=0,
+                              BPPARAM=BPPARAM)
     
-    rho(fds) <- vapply(fitparameters, "[[", 
-            double(1), "minimum")
+    rho(fds) <- plogis(vapply(fitparameters, "[[", 
+            double(1), "minimum"))
     
     if(isTRUE(verbose)){
         stxt <- capture.output(summary(rho(fds)))
@@ -23,12 +26,15 @@ updateRho <- function(fds, type, rhoRange, BPPARAM, verbose){
     return(fds)
 }
 
-estRho <- function(idx, k, n, y, rhoRange, nll, control=list()){
+estRho <- function(idx, k, n, y, rhoRange, nll, control=list(), lambda=0){
     ki <- k[idx,]
     ni <- n[idx,]
     yi <- y[idx,]
     
-    est <- optimize(f=nll, interval=rhoRange, yi=yi, ki=ki, ni=ni, 
+    # est <- optimize(f=nll, interval=rhoRange, yi=yi, ki=ki, ni=ni, 
+    #                 maximum=FALSE, tol=0.0000001)
+    # est
+    est <- optimize(f=nll, interval=c(-40, 40), mui=plogis(yi), ki=ki, ni=ni, lambda=lambda,
                     maximum=FALSE, tol=0.0000001)
     est
 }
@@ -63,6 +69,21 @@ trunc_negLogLikelihoodRho <- function(rho, ki, ni, mui){
     mean(alpha + beta - alphaK - betaNK )
 }
 
+trunc_negLogLikelihoodRho_penalized <- function(logit_rho, ki, ni, mui, lambda){
+    #-mean(dbetabinom(ki + 0.5, ni + 1, mu, rho, log=TRUE))
+    
+    rho <- plogis(logit_rho)
+    r  <- (1-rho)/rho
+    eps <- 0.5
+    alpha  <- lgamma(mui*r)
+    alphaK <- lgamma(mui*r + ki + eps)
+    beta   <- lgamma((mui-1)*(-r))
+    betaNK <- lgamma((mui-1)*(-r) + (ni - ki + eps))
+    
+    #mean negative log likelihood with pseudocounts
+    mean(alpha + beta - alphaK - betaNK ) + lambda * sqrt(logit_rho)
+}
+
 
 methodOfMomentsRho <- function(k, n, rhoRange=c(1e-5, 1 - 1e-5)){
     # taken from wiki: 
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 461280c0..0d444179 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -1,7 +1,6 @@
 // Generated by using Rcpp::compileAttributes() -> do not edit by hand
 // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
-#include <RcppArmadillo.h>
 #include <Rcpp.h>
 
 using namespace Rcpp;
@@ -152,6 +151,21 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
+// truncNLL_rho_penalized
+double truncNLL_rho_penalized(double logit_rho, arma::vec yi, arma::vec ki, arma::vec ni, double lambda);
+RcppExport SEXP _FRASER_truncNLL_rho_penalized(SEXP logit_rhoSEXP, SEXP yiSEXP, SEXP kiSEXP, SEXP niSEXP, SEXP lambdaSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< double >::type logit_rho(logit_rhoSEXP);
+    Rcpp::traits::input_parameter< arma::vec >::type yi(yiSEXP);
+    Rcpp::traits::input_parameter< arma::vec >::type ki(kiSEXP);
+    Rcpp::traits::input_parameter< arma::vec >::type ni(niSEXP);
+    Rcpp::traits::input_parameter< double >::type lambda(lambdaSEXP);
+    rcpp_result_gen = Rcpp::wrap(truncNLL_rho_penalized(logit_rho, yi, ki, ni, lambda));
+    return rcpp_result_gen;
+END_RCPP
+}
 // fullNLL
 arma::vec fullNLL(arma::mat y, arma::mat rho, arma::mat k, arma::mat n, arma::mat D, double lambda, bool byRows);
 RcppExport SEXP _FRASER_fullNLL(SEXP ySEXP, SEXP rhoSEXP, SEXP kSEXP, SEXP nSEXP, SEXP DSEXP, SEXP lambdaSEXP, SEXP byRowsSEXP) {
@@ -215,6 +229,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"_FRASER_truncNLL_e", (DL_FUNC) &_FRASER_truncNLL_e, 7},
     {"_FRASER_truncGrad_e", (DL_FUNC) &_FRASER_truncGrad_e, 7},
     {"_FRASER_truncNLL_rho", (DL_FUNC) &_FRASER_truncNLL_rho, 4},
+    {"_FRASER_truncNLL_rho_penalized", (DL_FUNC) &_FRASER_truncNLL_rho_penalized, 5},
     {"_FRASER_fullNLL", (DL_FUNC) &_FRASER_fullNLL, 7},
     {"_FRASER_truncWeightedNLL_db", (DL_FUNC) &_FRASER_truncWeightedNLL_db, 7},
     {"_FRASER_truncWeightedGrad_db", (DL_FUNC) &_FRASER_truncWeightedGrad_db, 7},
diff --git a/src/loss_n_gradient_functions.cpp b/src/loss_n_gradient_functions.cpp
index 07eadcca..6cde1bf1 100644
--- a/src/loss_n_gradient_functions.cpp
+++ b/src/loss_n_gradient_functions.cpp
@@ -6,7 +6,7 @@
 using namespace Rcpp;
 
 const double MAX_EXP_VALUE = 700;
-double PSEUDO_COUNT = 1;
+double PSEUDO_COUNT = 0;
 
 // [[Rcpp::export(.setPseudoCount)]]
 double setPseudoCount(double pseudoCount){
@@ -142,6 +142,12 @@ double truncNLL_db(arma::vec par, arma::mat H, arma::vec k, arma::vec n, double
   infPosB = arma::find_nonfinite(beta);
   // beta.elem( infPosB ) = abs.elem( infPosB );
   beta.elem( infPosB ) = estLgammaBeta(y, infPosB, rhob);
+  
+  arma::uvec infPosA2, infPosB2;
+  infPosA2 = arma::find_nonfinite(alphaK);
+  alpha.elem( infPosA2 ) = estLgammaAlpha(y, infPosA2, rhoa);
+  infPosB2 = arma::find_nonfinite(betaNK);
+  beta.elem( infPosB2 ) = estLgammaBeta(y, infPosB2, rhob);
 
   nll = arma::accu(alpha + beta - alphaK - betaNK)/k.n_elem;
 
@@ -366,6 +372,39 @@ double truncNLL_rho(double rho, arma::vec yi, arma::vec ki, arma::vec ni){
   return arma::as_scalar(nll);
 }
 
+// [[Rcpp::export()]]
+double truncNLL_rho_penalized(double logit_rho, arma::vec yi, arma::vec ki, arma::vec ni, double lambda){
+    arma::vec mui, u, alpha, alphaK, beta, betaNK, alphaBeta, nll;
+    double rho, rhoa, rhob;
+    
+    rho = exp(logit_rho)/(1 + exp(logit_rho));
+    rhoa = (1 - rho)/rho;
+    rhob = (rho - 1)/rho;
+    mui = predictMuCpp(yi);
+    u    = (mui-1) * rhob;
+    
+    alpha  = arma::lgamma(mui * rhoa);
+    alphaK = arma::lgamma(mui * rhoa + ki + PSEUDO_COUNT);
+    beta   = arma::lgamma(u);
+    betaNK = arma::lgamma(u + ni - ki + PSEUDO_COUNT);
+    alphaBeta = arma::lgamma(rhoa + ni + (2*PSEUDO_COUNT)) - lgamma(rhoa);
+    
+    // arma::vec abs;
+    arma::uvec infPosA, infPosB;
+    // abs = arma::abs(yi);
+    infPosA = arma::find_nonfinite(alpha);
+    // alpha.elem( infPosA ) = abs.elem( infPosA );
+    alpha.elem( infPosA ) = estLgammaAlpha(yi, infPosA, rhoa);
+    infPosB = arma::find_nonfinite(beta);
+    // beta.elem( infPosB ) = abs.elem( infPosB );
+    beta.elem( infPosB ) = estLgammaBeta(yi, infPosB, rhob);
+    
+    nll = arma::accu(alpha + beta - alphaK - betaNK + alphaBeta)/ki.n_elem;
+    nll = nll + lambda * (logit_rho*logit_rho);
+    
+    return arma::as_scalar(nll);
+}
+
 // [[Rcpp::export()]]
 arma::vec fullNLL(arma::mat y, arma::mat rho, arma::mat k, arma::mat n, arma::mat D, double lambda, bool byRows=false){
   arma::mat rhoa, rhob;

From 1fd5839882aced027868fdeae2333288118eff78 Mon Sep 17 00:00:00 2001
From: Ines Scheller <ines.scheller@in.tum.de>
Date: Thu, 31 Mar 2022 21:00:43 +0200
Subject: [PATCH 16/80] compiled and updated documentation

---
 NAMESPACE               |  3 +++
 man/filtering.Rd        | 41 +++++++++++++++++++++++++++++++++++++++++
 man/injectOutliers.Rd   |  2 +-
 man/optimHyperParams.Rd |  2 +-
 man/plotFunctions.Rd    | 10 +++++-----
 man/psiTypes.Rd         |  2 +-
 man/results.Rd          |  2 +-
 src/RcppExports.cpp     |  1 +
 8 files changed, 54 insertions(+), 9 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 0625f5fc..ec7d82f2 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -42,7 +42,10 @@ export(dontWriteHDF5)
 export(featureExclusionMask)
 export(filterExpression)
 export(filterExpressionAndVariability)
+export(filterExpressionAndVariability_jaccard)
+export(filterExpression_jaccard)
 export(filterVariability)
+export(filterVariability_jaccard)
 export(fit)
 export(getNonSplitReadCountsForAllSamples)
 export(getSplitReadCountsForAllSamples)
diff --git a/man/filtering.Rd b/man/filtering.Rd
index 3c2bace9..f81b1267 100644
--- a/man/filtering.Rd
+++ b/man/filtering.Rd
@@ -5,6 +5,9 @@
 \alias{filterExpressionAndVariability}
 \alias{filterExpression,FraserDataSet-method}
 \alias{filterVariability}
+\alias{filterExpressionAndVariability_jaccard}
+\alias{filterExpression_jaccard}
+\alias{filterVariability_jaccard}
 \title{Filtering FraserDataSets}
 \usage{
 filterExpressionAndVariability(
@@ -35,6 +38,35 @@ filterVariability(
   delayed = ifelse(ncol(object) <= 300, FALSE, TRUE),
   BPPARAM = bpparam()
 )
+
+filterExpressionAndVariability_jaccard(
+  object,
+  minExpressionInOneSample = 20,
+  quantile = 0.95,
+  quantileMinExpression = 1,
+  minDelta = 0.05,
+  filter = TRUE,
+  delayed = ifelse(ncol(object) <= 300, FALSE, TRUE),
+  BPPARAM = bpparam()
+)
+
+filterExpression_jaccard(
+  object,
+  minExpressionInOneSample = 20,
+  quantile = 0.95,
+  quantileMinExpression = 1,
+  filter = TRUE,
+  delayed = ifelse(ncol(object) <= 300, FALSE, TRUE),
+  BPPARAM = bpparam()
+)
+
+filterVariability_jaccard(
+  object,
+  minDelta = 0,
+  filter = TRUE,
+  delayed = ifelse(ncol(object) <= 300, FALSE, TRUE),
+  BPPARAM = bpparam()
+)
 }
 \arguments{
 \item{object}{A \code{\link{FraserDataSet}} object}
@@ -80,6 +112,15 @@ splice sites that have low read support in all samples.
 
 \item \code{filterVariability}: This function filters out introns and corresponding 
 splice sites which do not show variablity across samples.
+
+\item \code{filterExpressionAndVariability_jaccard}: This functions filters out both introns with low 
+read support and introns that are not variable across samples.
+
+\item \code{filterExpression_jaccard}: This function filters out introns and corresponding 
+splice sites which are expressed at very low levels across samples.
+
+\item \code{filterVariability_jaccard}: This function filters out introns and corresponding 
+splice sites which do not show variablity across samples.
 }}
 
 \examples{
diff --git a/man/injectOutliers.Rd b/man/injectOutliers.Rd
index 1f85338e..b32ed74a 100644
--- a/man/injectOutliers.Rd
+++ b/man/injectOutliers.Rd
@@ -6,7 +6,7 @@
 \usage{
 injectOutliers(
   fds,
-  type = c("psi5", "psi3", "theta"),
+  type = c("psi5", "psi3", "theta", "jaccard"),
   freq = 0.001,
   minDpsi = 0.2,
   minCoverage = 2,
diff --git a/man/optimHyperParams.Rd b/man/optimHyperParams.Rd
index 3894d9d7..f3d10622 100644
--- a/man/optimHyperParams.Rd
+++ b/man/optimHyperParams.Rd
@@ -12,7 +12,7 @@ optimHyperParams(
   noise_param = 0,
   minDeltaPsi = 0.1,
   iterations = 5,
-  setSubset = 15000,
+  setSubset = 50000,
   injectFreq = 0.01,
   BPPARAM = bpparam(),
   internalThreads = 1,
diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd
index cbeddee6..f6b36c41 100644
--- a/man/plotFunctions.Rd
+++ b/man/plotFunctions.Rd
@@ -21,7 +21,7 @@
 \S4method{plotVolcano}{FraserDataSet}(
   object,
   sampleID,
-  type = c("psi3", "psi5", "theta"),
+  type = c("psi3", "psi5", "theta", "jaccard"),
   basePlot = TRUE,
   aggregate = FALSE,
   main = NULL,
@@ -34,7 +34,7 @@
 \S4method{plotAberrantPerSample}{FraserDataSet}(
   object,
   main,
-  type = c("psi3", "psi5", "theta"),
+  type = c("psi3", "psi5", "theta", "jaccard"),
   padjCutoff = 0.1,
   zScoreCutoff = NA,
   deltaPsiCutoff = 0.3,
@@ -45,7 +45,7 @@
 
 plotExpression(
   fds,
-  type = c("psi5", "psi3", "theta"),
+  type = c("psi5", "psi3", "theta", "jaccard"),
   site = NULL,
   result = NULL,
   colGroup = NULL,
@@ -57,7 +57,7 @@ plotExpression(
 
 plotExpectedVsObservedPsi(
   fds,
-  type = c("psi5", "psi3", "theta"),
+  type = c("psi5", "psi3", "theta", "jaccard"),
   idx = NULL,
   result = NULL,
   colGroup = NULL,
@@ -85,7 +85,7 @@ plotExpectedVsObservedPsi(
 
 \S4method{plotEncDimSearch}{FraserDataSet}(
   object,
-  type = c("psi3", "psi5", "theta"),
+  type = c("psi3", "psi5", "theta", "jaccard"),
   plotType = c("auc", "loss")
 )
 
diff --git a/man/psiTypes.Rd b/man/psiTypes.Rd
index f07be433..43ce2ea6 100644
--- a/man/psiTypes.Rd
+++ b/man/psiTypes.Rd
@@ -5,7 +5,7 @@
 \alias{psiTypes}
 \title{Available psi types}
 \format{
-An object of class \code{character} of length 3.
+An object of class \code{character} of length 4.
 }
 \usage{
 psiTypes
diff --git a/man/results.Rd b/man/results.Rd
index 7f78ffe0..392fff9f 100644
--- a/man/results.Rd
+++ b/man/results.Rd
@@ -15,7 +15,7 @@
   aggregate = FALSE,
   collapse = FALSE,
   minCount = 5,
-  psiType = c("psi3", "psi5", "theta"),
+  psiType = c("psi3", "psi5", "theta", "jaccard"),
   geneColumn = "hgnc_symbol",
   additionalColumns = NULL,
   BPPARAM = bpparam()
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 0d444179..b03ef489 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -1,6 +1,7 @@
 // Generated by using Rcpp::compileAttributes() -> do not edit by hand
 // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
+#include <RcppArmadillo.h>
 #include <Rcpp.h>
 
 using namespace Rcpp;

From 3c4574f27568843be7d7659f92c479f1f2716ba3 Mon Sep 17 00:00:00 2001
From: Ines Scheller <ines.scheller@in.tum.de>
Date: Fri, 1 Apr 2022 12:38:46 +0200
Subject: [PATCH 17/80] small bug fix

---
 R/updateRho.R                     |  9 ++++----
 src/loss_n_gradient_functions.cpp | 35 +++++++++++++++++++++++++++++--
 2 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/R/updateRho.R b/R/updateRho.R
index 2348c059..d90d7aed 100644
--- a/R/updateRho.R
+++ b/R/updateRho.R
@@ -70,18 +70,17 @@ trunc_negLogLikelihoodRho <- function(rho, ki, ni, mui){
 }
 
 trunc_negLogLikelihoodRho_penalized <- function(logit_rho, ki, ni, mui, lambda){
-    #-mean(dbetabinom(ki + 0.5, ni + 1, mu, rho, log=TRUE))
+    #-mean(dbetabinom(ki, ni, mui, rho, log=TRUE))
     
     rho <- plogis(logit_rho)
     r  <- (1-rho)/rho
-    eps <- 0.5
     alpha  <- lgamma(mui*r)
-    alphaK <- lgamma(mui*r + ki + eps)
+    alphaK <- lgamma(mui*r + ki)
     beta   <- lgamma((mui-1)*(-r))
-    betaNK <- lgamma((mui-1)*(-r) + (ni - ki + eps))
+    betaNK <- lgamma((mui-1)*(-r) + (ni - ki))
     
     #mean negative log likelihood with pseudocounts
-    mean(alpha + beta - alphaK - betaNK ) + lambda * sqrt(logit_rho)
+    mean(alpha + beta - alphaK - betaNK ) + lambda * (logit_rho*logit_rho)
 }
 
 
diff --git a/src/loss_n_gradient_functions.cpp b/src/loss_n_gradient_functions.cpp
index 6cde1bf1..0078a270 100644
--- a/src/loss_n_gradient_functions.cpp
+++ b/src/loss_n_gradient_functions.cpp
@@ -145,9 +145,9 @@ double truncNLL_db(arma::vec par, arma::mat H, arma::vec k, arma::vec n, double
   
   arma::uvec infPosA2, infPosB2;
   infPosA2 = arma::find_nonfinite(alphaK);
-  alpha.elem( infPosA2 ) = estLgammaAlpha(y, infPosA2, rhoa);
+  alphaK.elem( infPosA2 ) = estLgammaAlpha(y, infPosA2, rhoa);
   infPosB2 = arma::find_nonfinite(betaNK);
-  beta.elem( infPosB2 ) = estLgammaBeta(y, infPosB2, rhob);
+  betaNK.elem( infPosB2 ) = estLgammaBeta(y, infPosB2, rhob);
 
   nll = arma::accu(alpha + beta - alphaK - betaNK)/k.n_elem;
 
@@ -272,6 +272,12 @@ double truncNLL_e(arma::vec par, arma::mat x, arma::mat D, arma::vec b,
     beta.elem( infPosB ) = abs.elem( infPosB );
     // beta.elem( infPosB ) = estLgammaBeta(y, infPosB, rhob);
 
+    arma::uvec infPosA2, infPosB2;
+    infPosA2 = arma::find_nonfinite(alphaK);
+    alphaK.elem( infPosA2 ) = abs.elem( infPosA2 );
+    infPosB2 = arma::find_nonfinite(betaNK);
+    betaNK.elem( infPosB2 ) = abs.elem( infPosB2 );
+
     nll = arma::accu(alpha + beta - alphaK - betaNK)/k.n_elem;
 
     return arma::as_scalar(nll);
@@ -367,6 +373,13 @@ double truncNLL_rho(double rho, arma::vec yi, arma::vec ki, arma::vec ni){
   // beta.elem( infPosB ) = abs.elem( infPosB );
   beta.elem( infPosB ) = estLgammaBeta(yi, infPosB, rhob);
 
+  arma::uvec infPosA2, infPosB2;
+  infPosA2 = arma::find_nonfinite(alphaK);
+  alphaK.elem( infPosA2 ) = estLgammaAlpha(yi, infPosA2, rhoa);
+  infPosB2 = arma::find_nonfinite(betaNK);
+  betaNK.elem( infPosB2 ) = estLgammaBeta(yi, infPosB2, rhob);
+
+
   nll = arma::accu(alpha + beta - alphaK - betaNK + alphaBeta)/ki.n_elem;
 
   return arma::as_scalar(nll);
@@ -399,6 +412,13 @@ double truncNLL_rho_penalized(double logit_rho, arma::vec yi, arma::vec ki, arma
     // beta.elem( infPosB ) = abs.elem( infPosB );
     beta.elem( infPosB ) = estLgammaBeta(yi, infPosB, rhob);
     
+    arma::uvec infPosA2, infPosB2;
+    infPosA2 = arma::find_nonfinite(alphaK);
+    alphaK.elem( infPosA2 ) = estLgammaAlpha(yi, infPosA2, rhoa);
+    infPosB2 = arma::find_nonfinite(betaNK);
+    betaNK.elem( infPosB2 ) = estLgammaBeta(yi, infPosB2, rhob);
+
+
     nll = arma::accu(alpha + beta - alphaK - betaNK + alphaBeta)/ki.n_elem;
     nll = nll + lambda * (logit_rho*logit_rho);
     
@@ -434,6 +454,11 @@ arma::vec fullNLL(arma::mat y, arma::mat rho, arma::mat k, arma::mat n, arma::ma
   infPosB = arma::find_nonfinite(beta);
   beta.elem( infPosB ) = abs.elem( infPosB );
   // beta.elem( infPosB ) = estLgammaBeta(y, infPosB, rhob);
+  arma::uvec infPosA2, infPosB2;
+  infPosA2 = arma::find_nonfinite(alphaK);
+  alphaK.elem( infPosA2 ) = abs.elem( infPosA2 );
+  infPosB2 = arma::find_nonfinite(betaNK);
+  betaNK.elem( infPosB2 ) = abs.elem( infPosB2 );
 
   if(byRows){
     nll = rowMeans(alpha + beta - alphaK - betaNK + nonTruncTerms);
@@ -510,6 +535,12 @@ double truncWeightedNLL_db(arma::vec par, arma::mat H, arma::vec k, arma::vec n,
   // beta.elem( infPosB ) = abs.elem( infPosB );
   beta.elem( infPosB ) = estLgammaBeta(y, infPosB, rhob);
 
+  arma::uvec infPosA2, infPosB2;
+  infPosA2 = arma::find_nonfinite(alphaK);
+  alphaK.elem( infPosA2 ) = estLgammaAlpha(y, infPosA2, rhoa);
+  infPosB2 = arma::find_nonfinite(betaNK);
+  betaNK.elem( infPosB2 ) = estLgammaBeta(y, infPosB2, rhob);
+
   nll = arma::accu((alpha + beta - alphaK - betaNK)%w)/k.n_elem;
 
   nll = nll + (lambda/k.n_elem) * arma::accu(d % d);

From b671b34089907ff3799e2e00d967579f051dd1ec Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Thu, 28 Apr 2022 10:51:40 +0200
Subject: [PATCH 18/80] updated rho fit

---
 DESCRIPTION              |  2 +-
 R/autoencoder.R          |  2 +-
 R/fitCorrectionMethods.R |  2 +-
 R/getNSetterFuns.R       |  2 +-
 R/plotMethods.R          |  8 ++++----
 R/pvalsNzscore.R         |  2 +-
 R/updateRho.R            | 19 ++++++++++++++-----
 7 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index c7704274..f7a853c7 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: FRASER
 Type: Package
 Title: Find RAre Splicing Events in RNA-Seq Data
-Version: 1.2.2
+Version: 1.99.0
 Date: 2021-01-27
 Authors@R: c(
     person("Christian", "Mertes", role=c("aut", "cre"), 
diff --git a/R/autoencoder.R b/R/autoencoder.R
index bf1f4fd1..f59b4661 100644
--- a/R/autoencoder.R
+++ b/R/autoencoder.R
@@ -3,7 +3,7 @@
 #'
 #' @noRd
 fitAutoencoder <- function(fds, q, type="psi3", noiseAlpha=1, minDeltaPsi=0.1,
-                    rhoRange=c(1e-5, 1-1e-5), lambda=0, convergence=1e-5,
+                    rhoRange=c(-30, 30), lambda=0, convergence=1e-5,
                     iterations=15, initialize=TRUE, control=list(),
                     BPPARAM=bpparam(), verbose=FALSE, nrDecoderBatches=5,
                     weighted=FALSE, nSubset=15000, multiRho=FALSE,
diff --git a/R/fitCorrectionMethods.R b/R/fitCorrectionMethods.R
index f9b67599..38f5d634 100644
--- a/R/fitCorrectionMethods.R
+++ b/R/fitCorrectionMethods.R
@@ -40,7 +40,7 @@ fit.FraserDataSet <- function(object, implementation=c("PCA", "PCA-BB-Decoder",
                             "AE", "AE-weighted", "PCA-BB-full", "fullAE", 
                             "PCA-regression", "PCA-reg-full", 
                             "PCA-BB-Decoder-no-weights", "BB"),
-                    q, type="psi3", rhoRange=c(1e-8, 1-1e-8), 
+                    q, type="psi3", rhoRange=c(-30, 30), 
                     weighted=FALSE, noiseAlpha=1, convergence=1e-5, 
                     iterations=15, initialize=TRUE, control=list(), 
                     BPPARAM=bpparam(), nSubset=15000, 
diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R
index 8e91710f..63c8d604 100644
--- a/R/getNSetterFuns.R
+++ b/R/getNSetterFuns.R
@@ -653,7 +653,7 @@ getPlottingDT <- function(fds, axis=c("row", "col"), type=NULL, result=NULL,
                                 level=pvalLevel)[idxrow, idxcol]),
             padj      = c(padjVals(fds, type=type)[idxrow, idxcol]),
             zscore    = c(zScores(fds, type=type)[idxrow, idxcol]),
-            obsPsi    = c((k + pseudocount())/(n + 2*pseudocount())),
+            obsPsi    = c(k/n),
             predPsi   = c(predictedMeans(fds, type)[idxrow, idxcol]),
             rho       = rep(rho(fds, type=type)[idxrow], 
                             ifelse(isTRUE(idxcol), ncol(fds), sum(idxcol))) 
diff --git a/R/plotMethods.R b/R/plotMethods.R
index e18abe55..5d567691 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -25,7 +25,7 @@
 #' @param type The psi type: either psi5, psi3 or theta (for SE).
 #' @param sampleID A sample ID which should be plotted. Can also be a vector.
 #'             Integers are treated as indices.
-#' @param idx,site A junction site ID or gene ID or one of both, which
+#' @param idx A junction site ID or gene ID or one of both, which
 #'             should be plotted. Can also be a vector. Integers are treated
 #'             as indices.
 #' @param padjCutoff,zScoreCutoff,deltaPsiCutoff Significance, Z-score or delta
@@ -342,16 +342,16 @@ setMethod("plotAberrantPerSample", signature="FraserDataSet",
 #' @rdname plotFunctions
 #' @export
 plotExpression <- function(fds, type=c("psi5", "psi3", "theta", "jaccard"),
-                    site=NULL, result=NULL, colGroup=NULL, 
+                    idx=NULL, result=NULL, colGroup=NULL, 
                     basePlot=TRUE, main=NULL, label="aberrant", ...){
     if(!is.null(result)){
         type <- as.character(result$type)
-        site <- getIndexFromResultTable(fds, result)
+        idx <- getIndexFromResultTable(fds, result)
     } else {
         type <- match.arg(type)
     }
 
-    dt <- getPlottingDT(fds, axis="row", type=type, idx=site, ...)
+    dt <- getPlottingDT(fds, axis="row", type=type, idx=idx, ...)
     dt[,featureID:=limitGeneNamesList(featureID, maxLength=3)]
     
     if(!is.null(colGroup)){
diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R
index b04f0494..749d192c 100644
--- a/R/pvalsNzscore.R
+++ b/R/pvalsNzscore.R
@@ -196,7 +196,7 @@ singlePvalueBetaBinomial <- function(idx, k, n, mu, rho){
     pvals <- pmin(1, pbbinom(ki, ni, alphai, betai))
 
     if(any(is.na(pvals))){
-        message(date(), " : ", idx)
+        message(date(), ": obtained NA pvalues for junction ", idx)
     }
 
     return (pvals)
diff --git a/R/updateRho.R b/R/updateRho.R
index d90d7aed..628b21fd 100644
--- a/R/updateRho.R
+++ b/R/updateRho.R
@@ -10,9 +10,10 @@ updateRho <- function(fds, type, rhoRange, BPPARAM, verbose){
     
     # fitparameters <- bplapply(seq_len(nrow(k)), estRho, nll=truncNLL_rho,
     #         k=k, n=n, y=y, rhoRange=rhoRange, BPPARAM=BPPARAM)
-    fitparameters <- bplapply(seq_len(nrow(k)), estRho, nll=trunc_negLogLikelihoodRho_penalized,
-                              k=k, n=n, y=y, rhoRange=rhoRange, lambda=0,
-                              BPPARAM=BPPARAM)
+    fitparameters <- bplapply(seq_len(nrow(k)), estRho, 
+                                nll=fullNLLRho_penalized,
+                                k=k, n=n, y=y, rhoRange=rhoRange, lambda=1e-4,
+                                BPPARAM=BPPARAM)
     
     rho(fds) <- plogis(vapply(fitparameters, "[[", 
             double(1), "minimum"))
@@ -26,7 +27,7 @@ updateRho <- function(fds, type, rhoRange, BPPARAM, verbose){
     return(fds)
 }
 
-estRho <- function(idx, k, n, y, rhoRange, nll, control=list(), lambda=0){
+estRho <- function(idx, k, n, y, rhoRange, nll, control=list(), lambda=1e-4){
     ki <- k[idx,]
     ni <- n[idx,]
     yi <- y[idx,]
@@ -34,11 +35,19 @@ estRho <- function(idx, k, n, y, rhoRange, nll, control=list(), lambda=0){
     # est <- optimize(f=nll, interval=rhoRange, yi=yi, ki=ki, ni=ni, 
     #                 maximum=FALSE, tol=0.0000001)
     # est
-    est <- optimize(f=nll, interval=c(-40, 40), mui=plogis(yi), ki=ki, ni=ni, lambda=lambda,
+    est <- optimize(f=nll, interval=rhoRange, 
+                    mui=plogis(yi), ki=ki, ni=ni, lambda=lambda,
                     maximum=FALSE, tol=0.0000001)
     est
 }
 
+fullNLLRho_penalized <- function(logit_rho, ki, ni, mui, lambda=1e-4){
+    rho <- plogis(logit_rho)
+    nll <- -mean(dbetabinom(ki, ni, mui, rho, log=TRUE))
+    nll <- nll + lambda * (logit_rho^2)
+    return(nll)
+}
+
 negLogLikelihoodRho <- function(rho, ki, ni, mui){
     #-mean(dbetabinom(ki + 0.5, ni + 1, mu, rho, log=TRUE))
     

From 783a88c24332ca86f3cadf7baef6b97c9d066f8a Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Mon, 4 Jul 2022 13:05:10 +0200
Subject: [PATCH 19/80] minor changes

---
 R/getNSetterFuns.R | 16 +++++++++++-----
 R/plotMethods.R    |  1 +
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R
index 63c8d604..d5914185 100644
--- a/R/getNSetterFuns.R
+++ b/R/getNSetterFuns.R
@@ -406,7 +406,7 @@ pseudocount <- function(value=NULL){
     # set pseudo count if provided
     stopifnot(isScalarNumeric(value))
     stopifnot(value >= 0)
-    value <- as.integer(value)
+    # value <- as.integer(value)
     options('FRASER.pseudoCount'=value)
     devNULL <- .setPseudoCount(value)
     stopifnot(value == devNULL)
@@ -557,8 +557,9 @@ calcFraserWeights <- function(fds, psiType){
     n <- as.matrix(N(fds, psiType))
     mu <- t(predictMu(fds, psiType))
     rho <- rho(fds, psiType)
-    dataPsi <- plogis(t(
-            x(fds, type=psiType, all=TRUE, center=FALSE, noiseAlpha=NULL)))
+    # dataPsi <- plogis(t(
+    #         x(fds, type=psiType, all=TRUE, center=FALSE, noiseAlpha=NULL)))
+    dataPsi <- k / n
 
     # pearson residuals for BB
     # on counts of success k
@@ -567,13 +568,18 @@ calcFraserWeights <- function(fds, psiType){
     #       (1+((n+2*pseudocount())-1)*rho))
     # on probability of success mu
     r <- (dataPsi - mu) / sqrt(
-            mu * (1-mu) * (1+((n+2*pseudocount())-1)*rho) /
-            (n+2*pseudocount()))
+            # mu * (1-mu) * (1+((n+2*pseudocount())-1)*rho) /
+            # (n+2*pseudocount()))
+            mu * (1-mu) * (1+(n-1)*rho) / n
+        )
 
     # weights according to Huber function (as in edgeR)
     c <- 1.345; # constant, as suggested in edgeR paper
     w <- ifelse(abs(r) > c, c/abs(r) , 1)
 
+    # set weights to 0 if NA (i.e. N=0)
+    w[is.na(w)] <- 0
+
     return(w)
 }
 
diff --git a/R/plotMethods.R b/R/plotMethods.R
index 5d567691..6b68fc26 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -487,6 +487,7 @@ plotExpectedVsObservedPsi <- function(fds, type=c("psi5", "psi3", "theta", "jacc
         geom_point(alpha=ifelse(dt$aberrant, 1, 0.5),
                 color=c("gray70", "firebrick")[dt$aberrant + 1]) +
         geom_abline(intercept = 0, slope=1) +
+        xlim(c(0,1)) + ylim(c(0,1)) +
         theme_bw() +
         theme(legend.position="none") +
         xlab(xlab) +

From 75acec3f144f2ac76360f140f3081751ce71149c Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Tue, 12 Jul 2022 11:47:30 +0200
Subject: [PATCH 20/80] fix bug in spliceSiteID mapping and intron retention
 annotation

---
 NAMESPACE               |  1 +
 R/AllGenerics.R         | 44 ++++++++++++++++++++++++++++++++---------
 R/FRASER-package.R      |  2 +-
 R/FraserDataSet-class.R |  2 +-
 R/countRNAseqData.R     | 22 +++++++++++++--------
 R/getNSetterFuns.R      |  2 +-
 R/resultAnnotations.R   | 12 ++++++++++-
 man/fit.Rd              |  2 +-
 man/plotFunctions.Rd    | 10 +++++-----
 9 files changed, 70 insertions(+), 27 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index ab6011bb..075aa04a 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -286,6 +286,7 @@ importFrom(ggplot2,theme_bw)
 importFrom(ggplot2,xlab)
 importFrom(ggplot2,xlim)
 importFrom(ggplot2,ylab)
+importFrom(ggplot2,ylim)
 importFrom(ggrepel,geom_text_repel)
 importFrom(grDevices,colorRampPalette)
 importFrom(matrixStats,colAnys)
diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index 7e8055a2..334db5a5 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -614,9 +614,11 @@ mapSeqlevels <- function(fds, style="UCSC", ...){
 #' retrieve a single sample result object
 #' @noRd
 resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores, 
-                                psivals, rawCts, rawTotalCts, deltaPsiVals, 
-                                psiType, rowMeansK, rowMeansN, aberrant, 
-                                aggregate, rho, pvalsGene=NULL, padjsGene=NULL, 
+                                psivals, rawCts, rawTotalCts, rawNonsplitCts, 
+                                rawNsProportion, nsProportion_99quantile,
+                                deltaPsiVals, psiType, rowMeansK, rowMeansN, 
+                                aberrant, aggregate, rho, 
+                                pvalsGene=NULL, padjsGene=NULL, 
                                 aberrantGene, additionalColumns,
                                 geneColumn="hgnc_symbol"){
     mcols(gr)$idx <- seq_along(gr)
@@ -664,6 +666,21 @@ resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores,
     mcols(ans)$meanCounts      <- Rle(round(rowMeansK[goodCut], 2))
     mcols(ans)$meanTotalCounts <- Rle(round(rowMeansN[goodCut], 2))
     
+    if(psiType == "jaccard"){
+        mcols(ans)$nonsplitCounts <- 
+            Rle(round(rawNonsplitCts[goodCut, sampleID], 2))
+        mcols(ans)$nonsplitProportion <- 
+            Rle(round(rawNsProportion[goodCut, sampleID], 2))
+        mcols(ans)$nonsplitProportion_99quantile  <- 
+            Rle(round(nsProportion_99quantile[goodCut], 2))
+    }
+    
+    if(!is.null(additionalColumns)){
+        for(column in additionalColumns){
+            mcols(ans)[,column] <- Rle(mcols(gr[goodCut])[,column])
+        }
+    }
+    
     if(isTRUE(aggregate)){
         # report junction more than once if it is significant for several genes
         nrGenesPerJunction <- table(geneJunctions)
@@ -680,12 +697,6 @@ resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores,
         mcols(ans)$hgncSymbol <- Rle(mcols(ans)$hgncSymbol)
     }
     
-    if(!is.null(additionalColumns)){
-        for(column in additionalColumns){
-            mcols(ans)[,column] <- Rle(mcols(gr[goodCut])[,column])
-        }
-    }
-    
     # remove helper column
     mcols(ans)$idx <- NULL
     
@@ -772,12 +783,27 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
                 colnames(deltaPsiVals) <- sc
             }
             
+            if(type == "jaccard"){
+                rawNonsplitCts  <- as.matrix(assay(tmp_x, "rawCountsJnonsplit"))
+                rawNsProportion <- rawNonsplitCts / rawTotalCts
+                nsProportion_99quantile <- 
+                    rowQuantiles(rawNsProportion, probs=0.99)
+            } else{
+                rawNonsplitCts  <- NULL
+                rawNsProportion <- NULL
+                nsProportion_99quantile <- NULL
+            }
+            
+            
             # create result table
             sampleRes <- lapply(sc,
                                 resultsSingleSample, gr=gr, pvals=pvals, 
                                 padjs=padjs, zscores=zscores, psiType=type, 
                                 psivals=psivals, deltaPsiVals=deltaPsiVals, 
                                 rawCts=rawCts, rawTotalCts=rawTotalCts, 
+                                rawNonsplitCts=rawNonsplitCts, 
+                                rawNsProportion=rawNsProportion,
+                                nsProportion_99quantile=nsProportion_99quantile,
                                 rowMeansK=rowMeansK, rowMeansN=rowMeansN, 
                                 aberrant=aberrant, aggregate=aggregate,
                                 rho=rho, geneColumn=geneColumn,
diff --git a/R/FRASER-package.R b/R/FRASER-package.R
index e70f894e..c6511822 100644
--- a/R/FRASER-package.R
+++ b/R/FRASER-package.R
@@ -100,7 +100,7 @@
 #'          scale_y_log10 scale_color_gradientn labs theme_bw theme
 #'          scale_color_brewer scale_color_discrete scale_linetype_manual 
 #'          annotate geom_histogram scale_fill_manual xlim scale_colour_manual
-#'          element_blank annotation_logticks
+#'          element_blank annotation_logticks ylim
 #'
 #' @importFrom tibble as_tibble
 #'
diff --git a/R/FraserDataSet-class.R b/R/FraserDataSet-class.R
index a0e9d7f5..a92e27e2 100644
--- a/R/FraserDataSet-class.R
+++ b/R/FraserDataSet-class.R
@@ -103,7 +103,7 @@ validateWorkingDir <- function(object) {
     }
     if(!dir.exists(object@workingDir)){
         message(date(), ": The given working directory '", object@workingDir,
-                "' does not exists. We will create it."
+                "' does not exist. We will create it."
         )
         dir.create(object@workingDir, recursive = TRUE)
     }
diff --git a/R/countRNAseqData.R b/R/countRNAseqData.R
index eafb79d6..1ed57401 100644
--- a/R/countRNAseqData.R
+++ b/R/countRNAseqData.R
@@ -363,7 +363,7 @@ getNonSplitReadCountsForAllSamples <- function(fds, splitCountRanges,
             " splice junctions are found.")
     
     # extract donor and acceptor sites
-    spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges, fds)
+    spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges)
     message(date(), ": In total ", length(spliceSiteCoords),
             " splice sites (acceptor/donor) will be counted ...")
     
@@ -860,7 +860,7 @@ countNonSplicedReads <- function(sampleID, splitCountRanges, fds,
         }
         
         # extract donor and acceptor sites
-        spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges, fds)
+        spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges)
     }
     
     
@@ -982,7 +982,7 @@ readJunctionMap <- function(junctionMap){
 
 #' extracts the splice site coordinates from a junctions GRange object (
 #' @noRd
-extractSpliceSiteCoordinates <- function(junctions, fds){
+extractSpliceSiteCoordinates <- function(junctions){
     
     spliceSiteCoords <- unlist(GRangesList(
         lapply(unique(strand(junctions)), extractSpliceSiteCoordsPerStrand, 
@@ -1052,15 +1052,21 @@ annotateSpliceSite <- function(gr){
     dt <- GRanges2SAF(gr)
     
     # extract donor/acceptor annotation
-    startSideDT <- dt[,.(End=Start, type="start"),by="Chr,Start,Strand"]
-    endSideDT   <- dt[,.(Start=End, type="end"  ),by="Chr,End,Strand"]
+    startSiteDT <- dt[,.(End=Start, type="start"),by="Chr,Start,Strand"]
+    endSiteDT   <- dt[,.(Start=End, type="end"  ),by="Chr,End,Strand"]
+    startSiteDT[,Start:=Start-1]
+    endSiteDT[,End:=End+1]
     
     # annotate and enumerate donor/acceptor
-    annotadedDT <- rbind(startSideDT, endSideDT)
-    annotadedDT[,id:=seq_len(nrow(annotadedDT))]
+    annotatedDT <- rbind(startSiteDT, endSiteDT)
+    annotatedDT[,id:=.GRP, by="Chr,Start,End,Strand"]
+    
+    # set back start / end positions for merging with junction ranges
+    annotatedDT[type == "start", Start:=End]
+    annotatedDT[type == "end", End:=Start]
     
     # convert back to granges
-    annogr <- makeGRangesFromDataFrame(annotadedDT, keep.extra.columns=TRUE)
+    annogr <- makeGRangesFromDataFrame(annotatedDT, keep.extra.columns=TRUE)
     
     ids <- lapply(c("start", "end"), function(type){
         # reduce annogr to only the specific type to prevent overlap
diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R
index d5914185..608b0171 100644
--- a/R/getNSetterFuns.R
+++ b/R/getNSetterFuns.R
@@ -406,7 +406,7 @@ pseudocount <- function(value=NULL){
     # set pseudo count if provided
     stopifnot(isScalarNumeric(value))
     stopifnot(value >= 0)
-    # value <- as.integer(value)
+    value <- as.numeric(value)
     options('FRASER.pseudoCount'=value)
     devNULL <- .setPseudoCount(value)
     stopifnot(value == devNULL)
diff --git a/R/resultAnnotations.R b/R/resultAnnotations.R
index 52f1c70b..5cafc8ea 100644
--- a/R/resultAnnotations.R
+++ b/R/resultAnnotations.R
@@ -334,7 +334,17 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
                     spliceType := "annotatedIntron_reducedUsage"]
     junctions_dt[annotatedJunction == "both", causesFrameshift := "unlikely"]
     
-    # TODO check for intron retention
+    if(all(c("nonsplitProportion", "nonsplitProportion_99quantile") %in% 
+            colnames(junctions_dt))){
+        junctions_dt[spliceType == "annotatedIntron_reducedUsage" & 
+                         type == "jaccard" &
+                         nonsplitProportion >= nonsplitProportion_99quantile,
+                     spliceType := "intron_retention"]
+        
+        # TODO check frameshift for intron retention
+        junctions_dt[spliceType == "intron_retention", 
+                        causesFrameshift := "inconclusive"]
+    }
     
     starts <- which(junctions_dt[psi_positions]$annotatedJunction=="start")
     ends <- which(junctions_dt[psi_positions]$annotatedJunction=="end")
diff --git a/man/fit.Rd b/man/fit.Rd
index eb68296c..35f7966e 100644
--- a/man/fit.Rd
+++ b/man/fit.Rd
@@ -11,7 +11,7 @@
     "fullAE", "PCA-regression", "PCA-reg-full", "PCA-BB-Decoder-no-weights", "BB"),
   q,
   type = "psi3",
-  rhoRange = c(1e-08, 1 - 1e-08),
+  rhoRange = c(-30, 30),
   weighted = FALSE,
   noiseAlpha = 1,
   convergence = 1e-05,
diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd
index f6b36c41..4cfeb0e6 100644
--- a/man/plotFunctions.Rd
+++ b/man/plotFunctions.Rd
@@ -46,7 +46,7 @@
 plotExpression(
   fds,
   type = c("psi5", "psi3", "theta", "jaccard"),
-  site = NULL,
+  idx = NULL,
   result = NULL,
   colGroup = NULL,
   basePlot = TRUE,
@@ -158,14 +158,14 @@ otherwise in the details for each plot function}
 
 \item{BPPARAM}{BiocParallel parameter to use.}
 
+\item{idx}{A junction site ID or gene ID or one of both, which
+should be plotted. Can also be a vector. Integers are treated
+as indices.}
+
 \item{result}{The result table to be used by the method.}
 
 \item{colGroup}{Group of samples that should be colored.}
 
-\item{idx, site}{A junction site ID or gene ID or one of both, which
-should be plotted. Can also be a vector. Integers are treated
-as indices.}
-
 \item{global}{Flag to plot a global Q-Q plot, default FALSE}
 
 \item{conf.alpha}{If set, a confidence interval is plotted, defaults to 0.05}

From 62d90c16249cd5155f3f0fc79f6b83ca0e0487fa Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Tue, 12 Jul 2022 11:55:59 +0200
Subject: [PATCH 21/80] fix missing spliceSiteID bug

---
 R/countRNAseqData.R | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/R/countRNAseqData.R b/R/countRNAseqData.R
index f68c4bdc..d2b1a8ae 100644
--- a/R/countRNAseqData.R
+++ b/R/countRNAseqData.R
@@ -976,15 +976,10 @@ readJunctionMap <- function(junctionMap){
 #' @noRd
 extractSpliceSiteCoordinates <- function(junctions, fds){
     
-    if(strandSpecific(fds) >= 1L){
-        spliceSiteCoords <- unlist(GRangesList(
-            extractSpliceSiteCoordsPerStrand(junctions, "+"),
-            extractSpliceSiteCoordsPerStrand(junctions, "-")
-        ))
-    } else {
-        strand(junctions) <- "*"
-        spliceSiteCoords <- extractSpliceSiteCoordsPerStrand(junctions, "*")
-    }
+    spliceSiteCoords <- unlist(GRangesList(
+        lapply(unique(strand(junctions)), extractSpliceSiteCoordsPerStrand, 
+                junctions=junctions)
+    ))
     
     return(unique(sort(spliceSiteCoords)))
 }
@@ -1049,12 +1044,18 @@ annotateSpliceSite <- function(gr){
     dt <- GRanges2SAF(gr)
     
     # extract donor/acceptor annotation
-    startSideDT <- dt[,.(End=Start, type="start"),by="Chr,Start,Strand"]
-    endSideDT   <- dt[,.(Start=End, type="end"  ),by="Chr,End,Strand"]
+    startSiteDT <- dt[,.(End=Start, type="start"),by="Chr,Start,Strand"]
+    endSiteDT   <- dt[,.(Start=End, type="end"  ),by="Chr,End,Strand"]
+    startSiteDT[,Start:=Start-1]
+    endSiteDT[,End:=End+1]
     
     # annotate and enumerate donor/acceptor
-    annotadedDT <- rbind(startSideDT, endSideDT)
-    annotadedDT[,id:=seq_len(nrow(annotadedDT))]
+    annotatedDT <- rbind(startSiteDT, endSiteDT)
+    annotatedDT[,id:=.GRP, by="Chr,Start,End,Strand"]
+    
+    # set back start / end positions for merging with junction ranges
+    annotatedDT[type == "start", Start:=End]
+    annotatedDT[type == "end", End:=Start]
     
     # convert back to granges
     annogr <- makeGRangesFromDataFrame(annotadedDT, keep.extra.columns=TRUE)

From d930ca1af06a0291a3849e694c507a9d4dab5277 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Tue, 12 Jul 2022 14:22:28 +0200
Subject: [PATCH 22/80] fix missing spliceSiteID bug part2

---
 R/countRNAseqData.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/R/countRNAseqData.R b/R/countRNAseqData.R
index d2b1a8ae..b5b442df 100644
--- a/R/countRNAseqData.R
+++ b/R/countRNAseqData.R
@@ -361,7 +361,7 @@ getNonSplitReadCountsForAllSamples <- function(fds, splitCountRanges,
             " splice junctions are found.")
     
     # extract donor and acceptor sites
-    spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges, fds)
+    spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges)
     message(date(), ": In total ", length(spliceSiteCoords),
             " splice sites (acceptor/donor) will be counted ...")
     
@@ -852,7 +852,7 @@ countNonSplicedReads <- function(sampleID, splitCountRanges, fds,
         }
         
         # extract donor and acceptor sites
-        spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges, fds)
+        spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges)
     }
     
     
@@ -974,7 +974,7 @@ readJunctionMap <- function(junctionMap){
 
 #' extracts the splice site coordinates from a junctions GRange object (
 #' @noRd
-extractSpliceSiteCoordinates <- function(junctions, fds){
+extractSpliceSiteCoordinates <- function(junctions){
     
     spliceSiteCoords <- unlist(GRangesList(
         lapply(unique(strand(junctions)), extractSpliceSiteCoordsPerStrand, 
@@ -1058,7 +1058,7 @@ annotateSpliceSite <- function(gr){
     annotatedDT[type == "end", End:=Start]
     
     # convert back to granges
-    annogr <- makeGRangesFromDataFrame(annotadedDT, keep.extra.columns=TRUE)
+    annogr <- makeGRangesFromDataFrame(annotatedDT, keep.extra.columns=TRUE)
     
     ids <- lapply(c("start", "end"), function(type){
         # reduce annogr to only the specific type to prevent overlap

From cc4868d8aa64ec84b73f4a476620920dc635888a Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Tue, 12 Jul 2022 16:28:33 +0200
Subject: [PATCH 23/80] add functionality to plot coverage from bam files

---
 DESCRIPTION          |   1 +
 NAMESPACE            |   7 +
 R/FRASER-package.R   |  10 +-
 R/plotMethods.R      | 349 ++++++++++++++++++++++++++++++++++++++++++-
 man/plotFunctions.Rd | 155 +++++++++++++++++++
 5 files changed, 513 insertions(+), 9 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 854339e6..fc5ca357 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -82,6 +82,7 @@ Suggests:
     covr,
     TxDb.Hsapiens.UCSC.hg19.knownGene,
     org.Hs.eg.db,
+    SGSeq
 LinkingTo:
     Rcpp,
     RcppArmadillo
diff --git a/NAMESPACE b/NAMESPACE
index 35e2d66a..78006b7d 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -60,6 +60,8 @@ export(pVals)
 export(padjVals)
 export(pairedEnd)
 export(plotAberrantPerSample)
+export(plotBamCoverage)
+export(plotBamCoverageFromResultTable)
 export(plotCountCorHeatmap)
 export(plotEncDimSearch)
 export(plotExpectedVsObservedPsi)
@@ -166,6 +168,7 @@ importFrom(GenomicAlignments,summarizeJunctions)
 importFrom(GenomicFeatures,genes)
 importFrom(GenomicFeatures,intronsByTranscript)
 importFrom(GenomicFeatures,makeTxDbFromGFF)
+importFrom(GenomicFeatures,seqlevels0)
 importFrom(GenomicRanges,GRanges)
 importFrom(GenomicRanges,GRangesList)
 importFrom(GenomicRanges,findOverlaps)
@@ -177,6 +180,9 @@ importFrom(HDF5Array,loadHDF5SummarizedExperiment)
 importFrom(HDF5Array,path)
 importFrom(HDF5Array,saveHDF5SummarizedExperiment)
 importFrom(HDF5Array,writeHDF5Array)
+importFrom(IRanges,"%over%")
+importFrom(IRanges,"end<-")
+importFrom(IRanges,"start<-")
 importFrom(IRanges,IRanges)
 importFrom(IRanges,from)
 importFrom(IRanges,ranges)
@@ -328,5 +334,6 @@ importFrom(tibble,as_tibble)
 importFrom(tools,file_path_as_absolute)
 importFrom(utils,capture.output)
 importFrom(utils,packageVersion)
+importFrom(utils,tail)
 importMethodsFrom(OUTRIDER,results)
 useDynLib(FRASER)
diff --git a/R/FRASER-package.R b/R/FRASER-package.R
index 87e5ad96..560de2f9 100644
--- a/R/FRASER-package.R
+++ b/R/FRASER-package.R
@@ -22,7 +22,8 @@
 #'
 ### GRange/Experiment/bamFile packages
 #' @importFrom BiocGenerics updateObject counts counts<- strand strand<- which
-#' @importFrom GenomicFeatures makeTxDbFromGFF intronsByTranscript genes
+#' @importFrom GenomicFeatures makeTxDbFromGFF intronsByTranscript genes 
+#'          seqlevels0
 #' @importFrom GenomicAlignments junctions readGAlignments summarizeJunctions
 #'          readGAlignmentPairs
 #' @importFrom SummarizedExperiment assay assay<- assays assays<- assayNames
@@ -30,7 +31,8 @@
 #'          rbind Assays
 #' @importFrom GenomicRanges findOverlaps granges GRanges GRangesList
 #'          makeGRangesFromDataFrame invertStrand
-#' @importFrom IRanges subsetByOverlaps from to IRanges ranges
+#' @importFrom IRanges subsetByOverlaps from to IRanges ranges %over% 
+#'          start<- end<-
 #' @importFrom Rsamtools ScanBamParam scanBamHeader bamMapqFilter
 #'          bamWhich bamWhich<- BamFile idxstatsBam
 #' @importFrom Rsubread featureCounts
@@ -74,7 +76,7 @@
 #' @importFrom R.utils renameFile withTimeout
 #' @importFrom tools file_path_as_absolute
 #' @importFrom methods as callNextMethod is new show slot slot<- validObject
-#' @importFrom utils capture.output packageVersion
+#' @importFrom utils capture.output packageVersion tail
 #'
 #'
 #'
@@ -128,5 +130,5 @@ globalVariables(c(".", "J", ".N", ".asDataFrame", "End", "first_feature",
         "model", "mu", "n", ",nsubset", "o3", "o5", "obsPsi", "os", "pa",
         "padj", "passed", "pByFeature", "pointNr", "predPsi", "psi3", "psi5",
         "psiType", "psiValue", "seqlength", "seqlevel", "Step", "traceNr",
-        "uniqueID", "V1", "value", "zscore", "maxDTheta"),
+        "uniqueID", "V1", "value", "zscore", "maxDTheta", "par"),
         package="FRASER")
diff --git a/R/plotMethods.R b/R/plotMethods.R
index 6d824392..4c0ef708 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -15,6 +15,8 @@
 #'   \item plotFilterExpression()
 #'   \item plotFilterVariability()
 #'   \item plotEncDimSearch()
+#'   \item plotBamCoverage()
+#'   \item plotBamCoverageFromResultTable()
 #' }
 #'
 #' For a detailed description of each plot function please see the details.
@@ -52,11 +54,59 @@
 #'             sample-sample correlation heatmap or \code{"junctionSample"}
 #'             for a junction-sample correlation heatmap.
 #' @param onlyVariableIntrons Logical value indicating whether to show only 
-#'              introns that also pass the variability filter. Defaults to 
-#'              FALSE.
+#'             introns that also pass the variability filter. Defaults to 
+#'             FALSE.
 #' @param onlyExpressedIntrons Logical value indicating whether to show only 
-#'              introns that also pass the expression filter. Defaults to 
-#'              FALSE.
+#'             introns that also pass the expression filter. Defaults to 
+#'             FALSE.
+#' @param control_samples The sampleIDs of the samples used as control in  
+#'             \code{plotBamCoverage}.
+#' @param min_junction_count The minimal junction count across samples required 
+#'             for a junction to appear in the splicegraph and coverage tracks 
+#'             of \code{plotBamCoverage}.
+#' @param txdb A TxDb object giving the gene/transcript annotation to use.
+#' @param orgDb A OrgDb object giving the mapping of gene ids and symbols.
+#' @param show_full_gene Should the full genomic range of the gene be shown in 
+#'             \code{plotBamCoverageFromResultTable} (default: FALSE)? 
+#'             If FALSE, only a certain region (see parameters left_extension 
+#'             and right_extension) around the outlier junction is shown. 
+#' @param left_extension Indicating how far the plotted range around the outlier 
+#'             junction should be extended to the left in 
+#'             \code{plotBamCoverageFromResultTable}.
+#' @param right_extension Indicating how far the plotted range around the 
+#'             outlier junction should be extended to the right in 
+#'             \code{plotBamCoverageFromResultTable}.
+#' @param res_gene_col The column name in the given results table that 
+#'             contains the gene annotation.
+#' @param res_gene_type The type of gene annotation in \code{res_gene_col} 
+#'             (e.g. SYMBOL or ENTREZID etc.). This information is needed for 
+#'             mapping between the results table and the provided annotation 
+#'             in the txdb object. 
+#' @param txdb_geneid_type The type of gene_id present in \code{genes(txdb)} 
+#'             (e.g. ENTREZID). This information is needed for 
+#'             mapping between the results table and the provided annotation 
+#'             in the txdb object. 
+#' @param highlight_range A \code{GenomicRanges} or \code{GenomicRangesList} 
+#'             object of ranges to be highlighted in the splicegraph of 
+#'             \code{plotBamCoverage}.
+#' @param highlight_range_color The color of highlighted ranges in 
+#'             the splicegraph of \code{plotBamCoverage}.
+#' @param toscale In \code{plotBamCoverage}, indicates which part of the 
+#'             plotted region should be drawn to scale. Possible values are 
+#'             'exon' (exonic regions are drawn to scale), 
+#'             'gene' (both exonic and intronic regions are drawn to scale) or 
+#'             'none' (exonic and intronic regions have constant length) 
+#'             (see SGSeq package).
+#' @param splicegraph_labels Indicated the format of exon/splice junction 
+#'             labels in the splicegraph of \code{plotBamCoverage}. 
+#'             Possible values are 'genomic_range' (gives the start position 
+#'             of the first exon and the end position of the last exon that 
+#'             are shown),  'id' (format E1,... J1,...), 'name' (format 
+#'             type:chromosome:start-end:strand for each feature), 
+#'             'none' for no labels (see SGSeq package).             
+#' @param splicegraph_position The position of the splicegraph relative to the 
+#'             coverage tracks in \code{plotBamCoverage}. Possible values 
+#'             are 'top' (default) and 'bottom'.
 #'              
 #### Graphical parameters
 #' @param main Title for the plot, if missing a default title will be used.
@@ -89,6 +139,26 @@
 #' @param bins Set the number of bins to be used in the histogram.
 #' @param legend.position Set legend position (x and y coordinate), defaults to
 #'             the top right corner.
+#' @param color_annotated The color for exons and junctions present in 
+#'             the given annotation (in the splicegraph of 
+#'             \code{plotBamCoverage}).
+#' @param color_novel The color for novel exons and junctions not present in 
+#'             the given annotation (in the splicegraph of 
+#'             \code{plotBamCoverage}).
+#' @param color_sample_interest The color in \code{plotBamCoverage} for the 
+#'             sample of interest.
+#' @param color_control_samples The color in \code{plotBamCoverage} for the 
+#'             samples used as controls.
+#' @param curvature_splicegraph The curvature of the junction arcs in the 
+#'             splicegraph in \code{plotBamCoverage}. Decrease this value 
+#'             for flatter arcs and increase it for steeper arcs.
+#' @param curvature_coverage The curvature of the junction arcs in the 
+#'             coverage tracks of \code{plotBamCoverage}. Decrease this 
+#'             value for flatter arcs and increase it for steeper arcs.
+#' @param mar The margin of the plot area for \code{plotBamCoverage} 
+#'             (b,l,t,r).
+#' @param cex For controlling the size of text and numbers in 
+#'             \code{plotBamCoverage}.
 #'
 #### Additional ... parameter
 #' @param ... Additional parameters passed to plot() or plot_ly() if not stated
@@ -173,7 +243,30 @@
 #' plotQQ(fds, result=res[1])
 #' plotExpectedVsObservedPsi(fds, type="psi5", res=res[1])
 #' 
-#'
+#' # plot splice graph and coverage from bam files in a given region
+#' fds <- createTestFraserSettings()
+#' gr <- GRanges(seqnames="chr19", 
+#'     IRanges(start=7587496, end=7598895), 
+#'     strand="+")
+#' plotBamCoverage(fds, gr=gr, sampleID="sample3", 
+#'     control_samples="sample2", min_junction_count=5,
+#'     curvature_splicegraph=1, curvature_coverage=1, 
+#'     mar=c(1, 7, 0.1, 3))
+#' 
+#' # plot coverage from bam file for a row in the result table
+#' \donttest{    
+#'     fds <- createTestFraserDataSet()
+#'     require(TxDb.Hsapiens.UCSC.hg19.knownGene)
+#'     txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
+#'     require(org.Hs.eg.db)
+#'     orgDb <- org.Hs.eg.db
+#'     plotBamCoverageFromResultTable(fds, result=res[1,], show_full_gene=TRUE,
+#'         txdb=txdb, orgDb=orgDb, control_samples="sample3")
+#'     plotBamCoverageFromResultTable(fds, result=res[1,], show_full_gene=FALSE, 
+#'         control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb,
+#'         curvature_coverage=0.5, right_extension=5000, left_extension=5000)
+#' }
+#' 
 NULL
 
 
@@ -1056,6 +1149,213 @@ plotCountCorHeatmap.FRASER <- function(object,
 setMethod("plotCountCorHeatmap", signature="FraserDataSet", 
         plotCountCorHeatmap.FRASER)
 
+#'
+#' Plot coverage from bam files for given genomic range and sample ids
+#'
+#' @rdname plotFunctions
+#' @export
+plotBamCoverage <- function(fds, gr, sampleID,
+        control_samples=sample(
+            samples(fds[, which(samples(fds) != sampleID)]), 
+            min(3, ncol(fds)-length(sampleID))), 
+        txdb=NULL, min_junction_count=20, 
+        highlight_range=NULL, highlight_range_color="firebrick", 
+        color_annotated="gray", color_novel="goldenrod3", 
+        color_sample_interest="firebrick", color_control_samples="dodgerblue4",
+        toscale=c("exon", "gene", "none"), mar=c(2, 10, 0.1, 5),
+        curvature_splicegraph=1, curvature_coverage=1, cex=1,
+        splicegraph_labels=c("genomic_range", "id", "name", "none"),
+        splicegraph_position=c("top", "bottom")){
+    
+    require(SGSeq)
+    if(missing(fds)){
+        stop("Missing input: fds (FraserDataSet object)")
+    } else{
+        stopifnot(is(fds, "FraserDataSet"))
+    }
+    if(missing(gr)){
+        stop("Missing input gr (genomic range to plot).")
+    } else{
+        stopifnot(is(gr, "GenomicRanges"))
+    }
+    if(missing(sampleID)){
+        stop("Missing input: sample_of_interest")
+    }
+    toscale <- match.arg(toscale)
+    splicegraph_labels <- match.arg(splicegraph_labels)
+    splicegraph_position <- match.arg(splicegraph_position)
+    
+    # extract bam info for sample ids to plot
+    all_sids <- c(sampleID, control_samples)
+    si_out <- getSGSeqSI(fds, all_sids)
+    sgseq_si <- si_out[[1]]
+    fds <- si_out[[2]]
+    
+    # collapse input ranges if several 
+    gr <- range(gr)
+    gr <- keepSeqlevels(gr, as.character(seqnames(gr)))
+    if(all(strand(gr) == "*")){
+        # seems to throw an error with * strand so guessing + strand instead
+        strand(gr) <- "+"
+    }
+    
+    # convert highlight_range to GRangesList if not
+    if(!is.null(highlight_range) && !is(highlight_range, "GRangesList")){
+        stopifnot(is(highlight_range, "GRanges"))
+        highlight_range <- GRangesList(highlight_range)
+    }
+    
+    # extract splice graph
+    sgfc_pred <- analyzeFeatures(sgseq_si, which = gr, 
+                                min_junction_count=min_junction_count, psi=0)
+    
+    # overlap detected junctions with annotation
+    if(!is.null(txdb)){
+        # subset to chr of interest
+        seqlevels(txdb) <- as.character(seqnames(gr))
+        
+        # extract transcript features with SGSeq package
+        txf <- convertToTxFeatures(txdb)
+        txf <- txf[txf %over% gr]
+        
+        # restore seqlevels of txdb object
+        seqlevels(txdb) <- seqlevels0(txdb)
+        
+        # annotate splice junctions with annotation features
+        sgfc_pred <- SGSeq::annotate(sgfc_pred, txf)
+    } else{
+        # when no annotation is given, show everything in the same color
+        color_novel <- color_annotated
+    }
+    
+    # get genomic positions for first and last exon in given range
+    if(splicegraph_labels == "genomic_range"){
+        # tell plotSpliceGraph function to use custom labels
+        splicegraph_labels <- "label"
+        # create custom labels (only for first and last exon for readability)
+        mcols(sgfc_pred)$label <- ""
+        exons <- which(type(sgfc_pred) == "E" & rowRanges(sgfc_pred) %over% gr)
+        exons <- unique(c(exons[1], tail(exons, n=1)))
+        if(length(exons) == 1){
+            mcols(sgfc_pred)$label[exons] <- 
+                paste(seqnames(sgfc_pred), 
+                        paste(start(sgfc_pred), end(sgfc_pred), sep="-"),
+                        strand(sgfc_pred), sep=":")[exons]
+        }
+        if(length(exons) == 2){
+            mcols(sgfc_pred)$label[exons[1]] <- 
+                paste(seqnames(sgfc_pred), 
+                        start(sgfc_pred), 
+                        strand(sgfc_pred), sep=":")[exons[1]]
+            mcols(sgfc_pred)$label[exons[2]] <- 
+                paste(seqnames(sgfc_pred), 
+                        end(sgfc_pred), 
+                        strand(sgfc_pred), sep=":")[exons[2]]
+        }
+    }
+    
+    # plot splice graph and coverage of junctions from bam
+    nr_sa2p <- length(all_sids)
+    par(mfrow = c(nr_sa2p+1, 1), mar=mar, cex=cex) 
+    if(splicegraph_position == "top"){
+        plotSpliceGraph(rowRanges(sgfc_pred), 
+                which=gr, 
+                toscale=toscale, 
+                color=color_annotated,
+                color_novel=color_novel,
+                ypos=c(0.25, 0.1),
+                ranges=highlight_range,
+                ranges_color=highlight_range_color, 
+                ranges_ypos=c(0.01, 0.02),
+                curvature=curvature_splicegraph,
+                label=splicegraph_labels)
+    }
+    for (j in seq_along(sampleID)) {
+        plotCoverage(
+                sgfc_pred[, which(colnames(sgfc_pred) == sampleID[j])], 
+                which = gr,
+                toscale = toscale, 
+                label=sampleID[j],
+                color=color_sample_interest,
+                curvature=curvature_coverage)
+    }
+    for (j in seq_along(control_samples)) {
+        plotCoverage(
+                sgfc_pred[, which(colnames(sgfc_pred) == control_samples[j])],
+                which = gr,
+                toscale = toscale, 
+                label=control_samples[j],
+                color=color_control_samples,
+                curvature=curvature_coverage)
+    }
+    if(splicegraph_position == "bottom"){
+        plotSpliceGraph(rowRanges(sgfc_pred), 
+                which=gr, 
+                toscale=toscale, 
+                color_novel=color_novel,
+                ypos=c(0.25, 0.1),
+                ranges=highlight_range,
+                ranges_color=highlight_range_color, 
+                ranges_ypos=c(0.01, 0.02),
+                curvature=curvature_splicegraph,
+                label=splicegraph_labels)
+    }
+    
+    return(invisible(fds))
+}
+
+#'
+#' Plot coverage from bam files for given row of results table
+#'
+#' @rdname plotFunctions
+#' @export
+plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE, 
+                txdb=NULL, orgDb=NULL, res_gene_col="hgncSymbol",
+                res_gene_type="SYMBOL", txdb_geneid_type="ENTREZID",
+                left_extension=1000, right_extension=1000, ...){
+    stopifnot(is(fds, "FraserDataSet"))
+    
+    if(is(result, "GenomicRanges")){
+        result <- as.data.table(result)
+    }
+    
+    stopifnot(is.data.table(result))
+    stopifnot(result[,.N] == 1)
+    
+    sid <- result[,sampleID]
+    jidx <- getIndexFromResultTable(fds, result)
+    outlier_range <- rowRanges(fds, type=result[,type])[jidx,]
+    
+    # showing either full range of the gene in which the outlier occured
+    if(show_full_gene == TRUE){
+        if(missing(txdb)){
+            stop("Missing input: txdb (for extracting gene range)")
+        }
+        if(missing(orgDb)){
+            stop("Missing input: orgDb (for mapping of IDs to txdb)")
+        }
+        txdb_geneid <- select(orgDb, 
+                        keys=as.character(result[,res_gene_col, with=FALSE]), 
+                        columns=txdb_geneid_type, 
+                        keytype=res_gene_type)[1]
+        gr <- genes(txdb, filter=list("gene_id"=txdb_geneid))
+    } else{
+        # or just showing a certain region around the outlier junction
+        gr <- outlier_range
+        start(gr) <- start(gr) - left_extension
+        end(gr) <- end(gr) + right_extension
+    }
+    
+    # create the coverage plot for the given outlier
+    fds <- plotBamCoverage(fds, 
+                            gr=gr, 
+                            sampleID=sid, 
+                            txdb=txdb,
+                            highlight_range=outlier_range, 
+                            ...)
+    return(invisible(fds))
+}
+
 #'
 #' helper function to get the annotation as data frame from the col data object
 #'
@@ -1112,3 +1412,42 @@ ggplotLabelPsi <- function(type, asCharacter=FALSE){
             FUN.VALUE=character(1))
     }
 }
+
+#'
+#' Extract info from bam files needed for SGSeq functions to work
+#' 
+#' @noRd
+getSGSeqSI <- function(fds, sample_ids){
+    
+    require(SGSeq)
+    # check if bam info is already stored in fds for given samples
+    if("SGSeq_sampleinfo" %in% names(metadata(fds))){
+        si <- metadata(fds)[["SGSeq_sampleinfo"]]
+        si <- si[si$sample_name %in% sample_ids,]
+        if(nrow(si) != length(sample_ids)){
+            # add bam info for missing sample_ids
+            missing_ids <- sample_ids[!sample_ids %in% si$sample_name]
+            message("Extracting SGSeq sample info from BAM files for samples ",
+                    paste(missing_ids, collapse=", "), " ...")
+            df_missing <- data.frame(
+                sample_name=samples(fds)[samples(fds) %in% missing_ids],
+                file_bam=bamFile(fds)[samples(fds) %in% missing_ids])
+            si_new <- getBamInfo(df_missing, yieldSize=1e6)
+            si_new$lib_size <- 50e6 # dummy value to speed up this part
+            si <- rbind(si, si_new)
+            metadata(fds)[["SGSeq_sampleinfo"]] <- 
+                rbind(metadata(fds)[["SGSeq_sampleinfo"]], si_new)
+        }
+        return(list(si, fds))
+    } else{
+        message("Extracting SGSeq sample info from BAM files for samples ",
+                paste(sample_ids, collapse=", "), " ...")
+        df <- data.frame(
+            sample_name=samples(fds)[samples(fds) %in% sample_ids],
+            file_bam=bamFile(fds)[samples(fds) %in% sample_ids])
+        si <- getBamInfo(df, yieldSize=1e6)  
+        si$lib_size <- 50e6 # dummy value to speed up this part
+        metadata(fds)[["SGSeq_sampleinfo"]] <- si
+        return(list(si, fds))
+    }
+}
diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd
index cbeddee6..2fc50b2b 100644
--- a/man/plotFunctions.Rd
+++ b/man/plotFunctions.Rd
@@ -16,6 +16,8 @@
 \alias{plotEncDimSearch,FraserDataSet-method}
 \alias{plotFilterVariability}
 \alias{plotCountCorHeatmap,FraserDataSet-method}
+\alias{plotBamCoverage}
+\alias{plotBamCoverageFromResultTable}
 \title{Visualization functions for FRASER}
 \usage{
 \S4method{plotVolcano}{FraserDataSet}(
@@ -126,6 +128,43 @@ plotFilterVariability(
   plotCov = TRUE,
   ...
 )
+
+plotBamCoverage(
+  fds,
+  gr,
+  sampleID,
+  control_samples = sample(samples(fds[, which(samples(fds) != sampleID)]), min(3,
+    ncol(fds) - length(sampleID))),
+  txdb = NULL,
+  min_junction_count = 20,
+  highlight_range = NULL,
+  highlight_range_color = "firebrick",
+  color_annotated = "gray",
+  color_novel = "goldenrod3",
+  color_sample_interest = "firebrick",
+  color_control_samples = "dodgerblue4",
+  toscale = c("exon", "gene", "none"),
+  mar = c(2, 10, 0.1, 5),
+  curvature_splicegraph = 1,
+  curvature_coverage = 1,
+  cex = 1,
+  splicegraph_labels = c("genomic_range", "id", "name", "none"),
+  splicegraph_position = c("top", "bottom")
+)
+
+plotBamCoverageFromResultTable(
+  fds,
+  result,
+  show_full_gene = FALSE,
+  txdb = NULL,
+  orgDb = NULL,
+  res_gene_col = "hgncSymbol",
+  res_gene_type = "SYMBOL",
+  txdb_geneid_type = "ENTREZID",
+  left_extension = 1000,
+  right_extension = 1000,
+  ...
+)
 }
 \arguments{
 \item{object, fds}{An \code{\link{FraserDataSet}} object.}
@@ -228,6 +267,97 @@ annotation of the heatmap.}
 
 \item{plotMeanPsi, plotCov}{If \code{TRUE}, then the heatmap is annotated with
 the mean psi values or the junction coverage.}
+
+\item{control_samples}{The sampleIDs of the samples used as control in
+\code{plotBamCoverage}.}
+
+\item{txdb}{A TxDb object giving the gene/transcript annotation to use.}
+
+\item{min_junction_count}{The minimal junction count across samples required 
+for a junction to appear in the splicegraph and coverage tracks 
+of \code{plotBamCoverage}.}
+
+\item{highlight_range}{A \code{GenomicRanges} or \code{GenomicRangesList} 
+object of ranges to be highlighted in the splicegraph of 
+\code{plotBamCoverage}.}
+
+\item{highlight_range_color}{The color of highlighted ranges in 
+the splicegraph of \code{plotBamCoverage}.}
+
+\item{color_annotated}{The color for exons and junctions present in 
+the given annotation (in the splicegraph of 
+\code{plotBamCoverage}).}
+
+\item{color_novel}{The color for novel exons and junctions not present in 
+the given annotation (in the splicegraph of 
+\code{plotBamCoverage}).}
+
+\item{color_sample_interest}{The color in \code{plotBamCoverage} for the 
+sample of interest.}
+
+\item{color_control_samples}{The color in \code{plotBamCoverage} for the 
+samples used as controls.}
+
+\item{toscale}{In \code{plotBamCoverage}, indicates which part of the 
+plotted region should be drawn to scale. Possible values are 
+'exon' (exonic regions are drawn to scale), 
+'gene' (both exonic and intronic regions are drawn to scale) or 
+'none' (exonic and intronic regions have constant length) 
+(see SGSeq package).}
+
+\item{mar}{The margin of the plot area for \code{plotBamCoverage} 
+(b,l,t,r).}
+
+\item{curvature_splicegraph}{The curvature of the junction arcs in the 
+splicegraph in \code{plotBamCoverage}. Decrease this value 
+for flatter arcs and increase it for steeper arcs.}
+
+\item{curvature_coverage}{The curvature of the junction arcs in the 
+coverage tracks of \code{plotBamCoverage}. Decrease this 
+value for flatter arcs and increase it for steeper arcs.}
+
+\item{cex}{For controlling the size of text and numbers in 
+\code{plotBamCoverage}.}
+
+\item{splicegraph_labels}{Indicated the format of exon/splice junction 
+labels in the splicegraph of \code{plotBamCoverage}. 
+Possible values are 'genomic_range' (gives the start position 
+of the first exon and the end position of the last exon that 
+are shown),  'id' (format E1,... J1,...), 'name' (format 
+type:chromosome:start-end:strand for each feature), 
+'none' for no labels (see SGSeq package).}
+
+\item{splicegraph_position}{The position of the splicegraph relative to the 
+coverage tracks in \code{plotBamCoverage}. Possible values 
+are 'top' (default) and 'bottom'.}
+
+\item{show_full_gene}{Should the full genomic range of the gene be shown in 
+\code{plotBamCoverageFromResultTable} (default: FALSE)? 
+If FALSE, only a certain region (see parameters left_extension 
+and right_extension) around the outlier junction is shown.}
+
+\item{orgDb}{A OrgDb object giving the mapping of gene ids and symbols.}
+
+\item{res_gene_col}{The column name in the given results table that 
+contains the gene annotation.}
+
+\item{res_gene_type}{The type of gene annotation in \code{res_gene_col} 
+(e.g. SYMBOL or ENTREZID etc.). This information is needed for 
+mapping between the results table and the provided annotation 
+in the txdb object.}
+
+\item{txdb_geneid_type}{The type of gene_id present in \code{genes(txdb)} 
+(e.g. ENTREZID). This information is needed for 
+mapping between the results table and the provided annotation 
+in the txdb object.}
+
+\item{left_extension}{Indicating how far the plotted range around the outlier 
+junction should be extended to the left in 
+\code{plotBamCoverageFromResultTable}.}
+
+\item{right_extension}{Indicating how far the plotted range around the 
+outlier junction should be extended to the right in 
+\code{plotBamCoverageFromResultTable}.}
 }
 \value{
 If base R graphics are used nothing is returned else the plotly or
@@ -269,6 +399,8 @@ This is the list of all plotting function provided by FRASER:
   \item plotFilterExpression()
   \item plotFilterVariability()
   \item plotEncDimSearch()
+  \item plotBamCoverage()
+  \item plotBamCoverageFromResultTable()
 }
 
 For a detailed description of each plot function please see the details.
@@ -345,5 +477,28 @@ plotExpression(fds, result=res[1])
 plotQQ(fds, result=res[1])
 plotExpectedVsObservedPsi(fds, type="psi5", res=res[1])
 
+# plot splice graph and coverage from bam files in a given region
+fds <- createTestFraserSettings()
+gr <- GRanges(seqnames="chr19", 
+    IRanges(start=7587496, end=7598895), 
+    strand="+")
+plotBamCoverage(fds, gr=gr, sampleID="sample3", 
+    control_samples="sample2", min_junction_count=5,
+    curvature_splicegraph=1, curvature_coverage=1, 
+    mar=c(1, 7, 0.1, 3))
+
+# plot coverage from bam file for a row in the result table
+\donttest{    
+    fds <- createTestFraserDataSet()
+    require(TxDb.Hsapiens.UCSC.hg19.knownGene)
+    txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
+    require(org.Hs.eg.db)
+    orgDb <- org.Hs.eg.db
+    plotBamCoverageFromResultTable(fds, result=res[1,], show_full_gene=TRUE,
+        txdb=txdb, orgDb=orgDb, control_samples="sample3")
+    plotBamCoverageFromResultTable(fds, result=res[1,], show_full_gene=FALSE, 
+        control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb,
+        curvature_coverage=0.5, right_extension=5000, left_extension=5000)
+}
 
 }

From 3959c71f72b85be510ed83c7a5d26053227ca077 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Tue, 12 Jul 2022 19:54:35 +0200
Subject: [PATCH 24/80] fix examples

---
 R/AllGenerics.R      |  4 ++--
 R/plotMethods.R      | 40 ++++++++++++++++++++------------
 man/plotFunctions.Rd | 55 +++-----------------------------------------
 man/results.Rd       |  4 ++--
 4 files changed, 32 insertions(+), 71 deletions(-)

diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index cfb73078..a285f411 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -770,9 +770,9 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
 #' # get data, fit and compute p-values and z-scores
 #' fds <- createTestFraserDataSet()
 #' 
-#' # extract results: for this example dataset, z score cutoff of 2 is used to
+#' # extract results: for this example dataset, no cutoffs are used to
 #' # get at least one result and show the output
-#' res <- results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05)
+#' res <- results(fds, padjCutoff=NA, zScoreCutoff=NA, deltaPsiCutoff=NA)
 #' res
 #' 
 #' # aggregate the results by genes (gene symbols need to be annotated first 
diff --git a/R/plotMethods.R b/R/plotMethods.R
index 4c0ef708..f363ee55 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -59,6 +59,8 @@
 #' @param onlyExpressedIntrons Logical value indicating whether to show only 
 #'             introns that also pass the expression filter. Defaults to 
 #'             FALSE.
+#' @param gr A GRanges object indicating the genomic range that should be shown 
+#'             in \code{plotBamCoverage}.
 #' @param control_samples The sampleIDs of the samples used as control in  
 #'             \code{plotBamCoverage}.
 #' @param min_junction_count The minimal junction count across samples required 
@@ -254,17 +256,24 @@
 #'     mar=c(1, 7, 0.1, 3))
 #' 
 #' # plot coverage from bam file for a row in the result table
-#' \donttest{    
-#'     fds <- createTestFraserDataSet()
-#'     require(TxDb.Hsapiens.UCSC.hg19.knownGene)
-#'     txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
-#'     require(org.Hs.eg.db)
-#'     orgDb <- org.Hs.eg.db
-#'     plotBamCoverageFromResultTable(fds, result=res[1,], show_full_gene=TRUE,
-#'         txdb=txdb, orgDb=orgDb, control_samples="sample3")
-#'     plotBamCoverageFromResultTable(fds, result=res[1,], show_full_gene=FALSE, 
-#'         control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb,
-#'         curvature_coverage=0.5, right_extension=5000, left_extension=5000)
+#' fds <- createTestFraserDataSet()
+#' require(TxDb.Hsapiens.UCSC.hg19.knownGene)
+#' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
+#' require(org.Hs.eg.db)
+#' orgDb <- org.Hs.eg.db
+#'     
+#' res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA, zScoreCutoff=NA)
+#' res_dt <- as.data.table(res)
+#' res_dt <- res_dt[sampleID == "sample2",]
+#'     
+#' # plot full range of gene containing outlier junction
+#' plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=TRUE,
+#'     txdb=txdb, orgDb=orgDb, control_samples="sample3")
+#'     
+#' # plot only certain range around outlier junction
+#' plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE, 
+#'     control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb,
+#'     curvature_coverage=0.5, right_extension=5000, left_extension=5000)
 #' }
 #' 
 NULL
@@ -1335,10 +1344,11 @@ plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE,
             stop("Missing input: orgDb (for mapping of IDs to txdb)")
         }
         txdb_geneid <- select(orgDb, 
-                        keys=as.character(result[,res_gene_col, with=FALSE]), 
-                        columns=txdb_geneid_type, 
-                        keytype=res_gene_type)[1]
-        gr <- genes(txdb, filter=list("gene_id"=txdb_geneid))
+                    keys=as.character(result[,res_gene_col, with=FALSE]), 
+                    columns=txdb_geneid_type, 
+                    keytype=res_gene_type)[1,]
+        gr <- genes(txdb, 
+                    filter=list("gene_id"=txdb_geneid[,txdb_geneid_type]))
     } else{
         # or just showing a certain region around the outlier junction
         gr <- outlier_range
diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd
index 2fc50b2b..7248f24f 100644
--- a/man/plotFunctions.Rd
+++ b/man/plotFunctions.Rd
@@ -268,6 +268,9 @@ annotation of the heatmap.}
 \item{plotMeanPsi, plotCov}{If \code{TRUE}, then the heatmap is annotated with
 the mean psi values or the junction coverage.}
 
+\item{gr}{A GRanges object indicating the genomic range that should be shown 
+in \code{plotBamCoverage}.}
+
 \item{control_samples}{The sampleIDs of the samples used as control in
 \code{plotBamCoverage}.}
 
@@ -450,55 +453,3 @@ It plots the encoding dimension against the achieved loss (area under the
 precision-recall curve). From this plot the optimum should be choosen for
 the \code{q} in fitting process.
 }
-\examples{
-# create full FRASER object 
-fds <- makeSimulatedFraserDataSet(m=40, j=200)
-fds <- calculatePSIValues(fds)
-fds <- filterExpressionAndVariability(fds, filter=FALSE)
-# this step should be done for all splicing metrics and more dimensions
-fds <- optimHyperParams(fds, "psi5", q_param=c(2,5,10,25))
-fds <- FRASER(fds)
-
-# QC plotting
-plotFilterExpression(fds)
-plotFilterVariability(fds)
-plotCountCorHeatmap(fds, "theta")
-plotCountCorHeatmap(fds, "theta", normalized=TRUE)
-plotEncDimSearch(fds, type="psi5")
-
-# extract results 
-plotAberrantPerSample(fds, aggregate=FALSE)
-plotVolcano(fds, "sample1", "psi5")
-
-# dive into gene/sample level results
-res <- results(fds)
-res
-plotExpression(fds, result=res[1])
-plotQQ(fds, result=res[1])
-plotExpectedVsObservedPsi(fds, type="psi5", res=res[1])
-
-# plot splice graph and coverage from bam files in a given region
-fds <- createTestFraserSettings()
-gr <- GRanges(seqnames="chr19", 
-    IRanges(start=7587496, end=7598895), 
-    strand="+")
-plotBamCoverage(fds, gr=gr, sampleID="sample3", 
-    control_samples="sample2", min_junction_count=5,
-    curvature_splicegraph=1, curvature_coverage=1, 
-    mar=c(1, 7, 0.1, 3))
-
-# plot coverage from bam file for a row in the result table
-\donttest{    
-    fds <- createTestFraserDataSet()
-    require(TxDb.Hsapiens.UCSC.hg19.knownGene)
-    txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
-    require(org.Hs.eg.db)
-    orgDb <- org.Hs.eg.db
-    plotBamCoverageFromResultTable(fds, result=res[1,], show_full_gene=TRUE,
-        txdb=txdb, orgDb=orgDb, control_samples="sample3")
-    plotBamCoverageFromResultTable(fds, result=res[1,], show_full_gene=FALSE, 
-        control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb,
-        curvature_coverage=0.5, right_extension=5000, left_extension=5000)
-}
-
-}
diff --git a/man/results.Rd b/man/results.Rd
index 57682616..7061f285 100644
--- a/man/results.Rd
+++ b/man/results.Rd
@@ -95,9 +95,9 @@ aberrant splicing events based on the given cutoffs.
 # get data, fit and compute p-values and z-scores
 fds <- createTestFraserDataSet()
 
-# extract results: for this example dataset, z score cutoff of 2 is used to
+# extract results: for this example dataset, no cutoffs are used to
 # get at least one result and show the output
-res <- results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05)
+res <- results(fds, padjCutoff=NA, zScoreCutoff=NA, deltaPsiCutoff=NA)
 res
 
 # aggregate the results by genes (gene symbols need to be annotated first 

From 7c208125a27d23882636b0c0ff72200d2c668f7d Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Wed, 13 Jul 2022 13:14:48 +0200
Subject: [PATCH 25/80] minor improvements

---
 R/plotMethods.R      | 37 ++++++++++++++++--------
 man/plotFunctions.Rd | 68 ++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 88 insertions(+), 17 deletions(-)

diff --git a/R/plotMethods.R b/R/plotMethods.R
index f363ee55..316c43dc 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -80,10 +80,10 @@
 #'             \code{plotBamCoverageFromResultTable}.
 #' @param res_gene_col The column name in the given results table that 
 #'             contains the gene annotation.
-#' @param res_gene_type The type of gene annotation in \code{res_gene_col} 
-#'             (e.g. SYMBOL or ENTREZID etc.). This information is needed for 
-#'             mapping between the results table and the provided annotation 
-#'             in the txdb object. 
+#' @param res_geneid_type The type of gene annotation in the results table in 
+#'             \code{res_gene_col} (e.g. SYMBOL or ENTREZID etc.). This 
+#'             information is needed for mapping between the results table and 
+#'             the provided annotation in the txdb object. 
 #' @param txdb_geneid_type The type of gene_id present in \code{genes(txdb)} 
 #'             (e.g. ENTREZID). This information is needed for 
 #'             mapping between the results table and the provided annotation 
@@ -274,7 +274,6 @@
 #' plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE, 
 #'     control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb,
 #'     curvature_coverage=0.5, right_extension=5000, left_extension=5000)
-#' }
 #' 
 NULL
 
@@ -1320,7 +1319,7 @@ plotBamCoverage <- function(fds, gr, sampleID,
 #' @export
 plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE, 
                 txdb=NULL, orgDb=NULL, res_gene_col="hgncSymbol",
-                res_gene_type="SYMBOL", txdb_geneid_type="ENTREZID",
+                res_geneid_type="SYMBOL", txdb_geneid_type="ENTREZID",
                 left_extension=1000, right_extension=1000, ...){
     stopifnot(is(fds, "FraserDataSet"))
     
@@ -1343,12 +1342,26 @@ plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE,
         if(missing(orgDb)){
             stop("Missing input: orgDb (for mapping of IDs to txdb)")
         }
-        txdb_geneid <- select(orgDb, 
-                    keys=as.character(result[,res_gene_col, with=FALSE]), 
-                    columns=txdb_geneid_type, 
-                    keytype=res_gene_type)[1,]
-        gr <- genes(txdb, 
-                    filter=list("gene_id"=txdb_geneid[,txdb_geneid_type]))
+        result_gene <- result[,get(res_gene_col)]
+        if(is.data.table(orgDb)){
+            tmp <- merge(x=as.data.table(genes(txdb))[,.(gene_id)], y=orgDb, 
+                         by.y=txdb_geneid_type, by.x="gene_id", all.x=TRUE, 
+                         sort=FALSE)[,.(gene_id, feature=get(res_geneid_type))]
+            setnames(tmp, "feature", res_geneid_type)
+            txdb_geneid <- tmp[get(res_geneid_type) %in% result_gene, gene_id]
+        } else {
+            tmp <- as.data.table(
+                select(orgDb, 
+                       keys=result_gene, 
+                       columns=txdb_geneid_type, 
+                       keytype=res_geneid_type)
+            )
+            txdb_geneid <- tmp[, get(txdb_geneid_type)]
+        }
+        gr <- genes(txdb, filter=list("gene_id"=txdb_geneid))
+        if(length(gr) == 0){
+            stop("Could not extract genomic coordinates for input gene.")
+        }
     } else{
         # or just showing a certain region around the outlier junction
         gr <- outlier_range
diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd
index 7248f24f..14958dae 100644
--- a/man/plotFunctions.Rd
+++ b/man/plotFunctions.Rd
@@ -159,7 +159,7 @@ plotBamCoverageFromResultTable(
   txdb = NULL,
   orgDb = NULL,
   res_gene_col = "hgncSymbol",
-  res_gene_type = "SYMBOL",
+  res_geneid_type = "SYMBOL",
   txdb_geneid_type = "ENTREZID",
   left_extension = 1000,
   right_extension = 1000,
@@ -344,10 +344,10 @@ and right_extension) around the outlier junction is shown.}
 \item{res_gene_col}{The column name in the given results table that 
 contains the gene annotation.}
 
-\item{res_gene_type}{The type of gene annotation in \code{res_gene_col} 
-(e.g. SYMBOL or ENTREZID etc.). This information is needed for 
-mapping between the results table and the provided annotation 
-in the txdb object.}
+\item{res_geneid_type}{The type of gene annotation in the results table in 
+\code{res_gene_col} (e.g. SYMBOL or ENTREZID etc.). This 
+information is needed for mapping between the results table and 
+the provided annotation in the txdb object.}
 
 \item{txdb_geneid_type}{The type of gene_id present in \code{genes(txdb)} 
 (e.g. ENTREZID). This information is needed for 
@@ -453,3 +453,61 @@ It plots the encoding dimension against the achieved loss (area under the
 precision-recall curve). From this plot the optimum should be choosen for
 the \code{q} in fitting process.
 }
+\examples{
+# create full FRASER object 
+fds <- makeSimulatedFraserDataSet(m=40, j=200)
+fds <- calculatePSIValues(fds)
+fds <- filterExpressionAndVariability(fds, filter=FALSE)
+# this step should be done for all splicing metrics and more dimensions
+fds <- optimHyperParams(fds, "psi5", q_param=c(2,5,10,25))
+fds <- FRASER(fds)
+
+# QC plotting
+plotFilterExpression(fds)
+plotFilterVariability(fds)
+plotCountCorHeatmap(fds, "theta")
+plotCountCorHeatmap(fds, "theta", normalized=TRUE)
+plotEncDimSearch(fds, type="psi5")
+
+# extract results 
+plotAberrantPerSample(fds, aggregate=FALSE)
+plotVolcano(fds, "sample1", "psi5")
+
+# dive into gene/sample level results
+res <- results(fds)
+res
+plotExpression(fds, result=res[1])
+plotQQ(fds, result=res[1])
+plotExpectedVsObservedPsi(fds, type="psi5", res=res[1])
+
+# plot splice graph and coverage from bam files in a given region
+fds <- createTestFraserSettings()
+gr <- GRanges(seqnames="chr19", 
+    IRanges(start=7587496, end=7598895), 
+    strand="+")
+plotBamCoverage(fds, gr=gr, sampleID="sample3", 
+    control_samples="sample2", min_junction_count=5,
+    curvature_splicegraph=1, curvature_coverage=1, 
+    mar=c(1, 7, 0.1, 3))
+
+# plot coverage from bam file for a row in the result table
+fds <- createTestFraserDataSet()
+require(TxDb.Hsapiens.UCSC.hg19.knownGene)
+txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
+require(org.Hs.eg.db)
+orgDb <- org.Hs.eg.db
+    
+res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA, zScoreCutoff=NA)
+res_dt <- as.data.table(res)
+res_dt <- res_dt[sampleID == "sample2",]
+    
+# plot full range of gene containing outlier junction
+plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=TRUE,
+    txdb=txdb, orgDb=orgDb, control_samples="sample3")
+    
+# plot only certain range around outlier junction
+plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE, 
+    control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb,
+    curvature_coverage=0.5, right_extension=5000, left_extension=5000)
+
+}

From 8910fc13e62791f9f57156b1e1f687a1e34a7f03 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Wed, 13 Jul 2022 14:14:47 +0200
Subject: [PATCH 26/80] minor fix

---
 R/plotMethods.R | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/R/plotMethods.R b/R/plotMethods.R
index ef0551cb..b920b863 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -1192,6 +1192,7 @@ plotBamCoverage <- function(fds, gr, sampleID,
         stop("Missing input gr (genomic range to plot).")
     } else{
         stopifnot(is(gr, "GenomicRanges"))
+        stopifnot(length(gr) > 0)
     }
     if(missing(sampleID)){
         stop("Missing input: sample_of_interest")
@@ -1207,12 +1208,25 @@ plotBamCoverage <- function(fds, gr, sampleID,
     fds <- si_out[[2]]
     
     # collapse input ranges if several 
-    gr <- range(gr)
-    gr <- keepSeqlevels(gr, as.character(seqnames(gr)))
-    if(all(strand(gr) == "*")){
-        # seems to throw an error with * strand so guessing + strand instead
-        strand(gr) <- "+"
+    if(any(strand(gr) == "*")){
+        # seems to throw an error with * strand so guessing strand instead
+        if(all(strand(gr) == "*")){
+            guessStrand <- "+"
+        } else{
+            guessStrand <- strand(gr[strand(gr) != "*"])[1]
+        }
+        strand(gr)[strand(gr) == "*"] <- guessStrand
+        warning("Input genomic ranges contained unstranded ranges.\n", 
+                "This function needs strand information, guessing strand to ", 
+                "be ", guessStrand, ".")
     }
+    if(!all(strand(gr) == strand(gr[1,]))){
+        warning("Input genomic ranges contained ranges on different strands,\n", 
+                "only showing coverage for the ", strand(gr[1,]), " strand.")
+        strand(gr) <- rep(strand(gr[1,]), length(gr))
+    }
+    gr <- range(gr)
+    gr <- keepSeqlevels(gr, unique(as.character(seqnames(gr))))
     
     # convert highlight_range to GRangesList if not
     if(!is.null(highlight_range) && !is(highlight_range, "GRangesList")){
@@ -1227,7 +1241,7 @@ plotBamCoverage <- function(fds, gr, sampleID,
     # overlap detected junctions with annotation
     if(!is.null(txdb)){
         # subset to chr of interest
-        seqlevels(txdb) <- as.character(seqnames(gr))
+        seqlevels(txdb) <- unique(as.character(seqnames(gr)))
         
         # extract transcript features with SGSeq package
         txf <- convertToTxFeatures(txdb)
@@ -1350,6 +1364,7 @@ plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE,
             stop("Missing input: orgDb (for mapping of IDs to txdb)")
         }
         result_gene <- result[,get(res_gene_col)]
+        result_gene <- strsplit(result_gene, ";", fixed=TRUE)[[1]]
         if(is.data.table(orgDb)){
             tmp <- merge(x=as.data.table(genes(txdb))[,.(gene_id)], y=orgDb, 
                          by.y=txdb_geneid_type, by.x="gene_id", all.x=TRUE, 
@@ -1376,6 +1391,9 @@ plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE,
         end(gr) <- end(gr) + right_extension
     }
     
+    # if several genes overlap, only show those on same strand as outlier
+    gr <- gr[strand(gr) == strand(outlier_range),]
+    
     # create the coverage plot for the given outlier
     fds <- plotBamCoverage(fds, 
                             gr=gr, 

From 22f579b92c16784027a87113f60f7fad0fe0b2dc Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Wed, 13 Jul 2022 14:25:03 +0200
Subject: [PATCH 27/80] deal with unstranded input ranges

---
 R/plotMethods.R | 45 ++++++++++++++++++++++++++++++---------------
 1 file changed, 30 insertions(+), 15 deletions(-)

diff --git a/R/plotMethods.R b/R/plotMethods.R
index 316c43dc..c74a8c5c 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -1175,7 +1175,6 @@ plotBamCoverage <- function(fds, gr, sampleID,
         splicegraph_labels=c("genomic_range", "id", "name", "none"),
         splicegraph_position=c("top", "bottom")){
     
-    require(SGSeq)
     if(missing(fds)){
         stop("Missing input: fds (FraserDataSet object)")
     } else{
@@ -1200,12 +1199,25 @@ plotBamCoverage <- function(fds, gr, sampleID,
     fds <- si_out[[2]]
     
     # collapse input ranges if several 
+    if(any(strand(gr) == "*")){
+        # seems to throw an error with * strand so guessing strand instead
+        if(all(strand(gr) == "*")){
+            guessStrand <- "+"
+        } else{
+            guessStrand <- strand(gr[strand(gr) != "*"])[1]
+        }
+        strand(gr)[strand(gr) == "*"] <- guessStrand
+        warning("Input genomic ranges contained unstranded ranges.\n", 
+                "This function needs strand information, guessing strand to ", 
+                "be ", guessStrand, ".")
+    }
+    if(!all(strand(gr) == strand(gr[1,]))){
+        warning("Input genomic ranges contained ranges on different strands,\n", 
+                "only showing coverage for the ", strand(gr[1,]), " strand.")
+        strand(gr) <- rep(strand(gr[1,]), length(gr))
+    }
     gr <- range(gr)
     gr <- keepSeqlevels(gr, as.character(seqnames(gr)))
-    if(all(strand(gr) == "*")){
-        # seems to throw an error with * strand so guessing + strand instead
-        strand(gr) <- "+"
-    }
     
     # convert highlight_range to GRangesList if not
     if(!is.null(highlight_range) && !is(highlight_range, "GRangesList")){
@@ -1214,7 +1226,7 @@ plotBamCoverage <- function(fds, gr, sampleID,
     }
     
     # extract splice graph
-    sgfc_pred <- analyzeFeatures(sgseq_si, which = gr, 
+    sgfc_pred <- SGSeq::analyzeFeatures(sgseq_si, which = gr, 
                                 min_junction_count=min_junction_count, psi=0)
     
     # overlap detected junctions with annotation
@@ -1223,7 +1235,7 @@ plotBamCoverage <- function(fds, gr, sampleID,
         seqlevels(txdb) <- as.character(seqnames(gr))
         
         # extract transcript features with SGSeq package
-        txf <- convertToTxFeatures(txdb)
+        txf <- SGSeq::convertToTxFeatures(txdb)
         txf <- txf[txf %over% gr]
         
         # restore seqlevels of txdb object
@@ -1242,7 +1254,8 @@ plotBamCoverage <- function(fds, gr, sampleID,
         splicegraph_labels <- "label"
         # create custom labels (only for first and last exon for readability)
         mcols(sgfc_pred)$label <- ""
-        exons <- which(type(sgfc_pred) == "E" & rowRanges(sgfc_pred) %over% gr)
+        exons <- which(SGSeq::type(sgfc_pred) == "E" & 
+                        rowRanges(sgfc_pred) %over% gr)
         exons <- unique(c(exons[1], tail(exons, n=1)))
         if(length(exons) == 1){
             mcols(sgfc_pred)$label[exons] <- 
@@ -1266,7 +1279,7 @@ plotBamCoverage <- function(fds, gr, sampleID,
     nr_sa2p <- length(all_sids)
     par(mfrow = c(nr_sa2p+1, 1), mar=mar, cex=cex) 
     if(splicegraph_position == "top"){
-        plotSpliceGraph(rowRanges(sgfc_pred), 
+        SGSeq::plotSpliceGraph(rowRanges(sgfc_pred), 
                 which=gr, 
                 toscale=toscale, 
                 color=color_annotated,
@@ -1279,7 +1292,7 @@ plotBamCoverage <- function(fds, gr, sampleID,
                 label=splicegraph_labels)
     }
     for (j in seq_along(sampleID)) {
-        plotCoverage(
+        SGSeq::plotCoverage(
                 sgfc_pred[, which(colnames(sgfc_pred) == sampleID[j])], 
                 which = gr,
                 toscale = toscale, 
@@ -1288,7 +1301,7 @@ plotBamCoverage <- function(fds, gr, sampleID,
                 curvature=curvature_coverage)
     }
     for (j in seq_along(control_samples)) {
-        plotCoverage(
+        SGSeq::plotCoverage(
                 sgfc_pred[, which(colnames(sgfc_pred) == control_samples[j])],
                 which = gr,
                 toscale = toscale, 
@@ -1297,7 +1310,7 @@ plotBamCoverage <- function(fds, gr, sampleID,
                 curvature=curvature_coverage)
     }
     if(splicegraph_position == "bottom"){
-        plotSpliceGraph(rowRanges(sgfc_pred), 
+        SGSeq::plotSpliceGraph(rowRanges(sgfc_pred), 
                 which=gr, 
                 toscale=toscale, 
                 color_novel=color_novel,
@@ -1369,6 +1382,9 @@ plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE,
         end(gr) <- end(gr) + right_extension
     }
     
+    # if several genes overlap, only show those on same strand as outlier
+    gr <- gr[strand(gr) == strand(outlier_range),]
+    
     # create the coverage plot for the given outlier
     fds <- plotBamCoverage(fds, 
                             gr=gr, 
@@ -1442,7 +1458,6 @@ ggplotLabelPsi <- function(type, asCharacter=FALSE){
 #' @noRd
 getSGSeqSI <- function(fds, sample_ids){
     
-    require(SGSeq)
     # check if bam info is already stored in fds for given samples
     if("SGSeq_sampleinfo" %in% names(metadata(fds))){
         si <- metadata(fds)[["SGSeq_sampleinfo"]]
@@ -1455,7 +1470,7 @@ getSGSeqSI <- function(fds, sample_ids){
             df_missing <- data.frame(
                 sample_name=samples(fds)[samples(fds) %in% missing_ids],
                 file_bam=bamFile(fds)[samples(fds) %in% missing_ids])
-            si_new <- getBamInfo(df_missing, yieldSize=1e6)
+            si_new <- SGSeq::getBamInfo(df_missing, yieldSize=1e6)
             si_new$lib_size <- 50e6 # dummy value to speed up this part
             si <- rbind(si, si_new)
             metadata(fds)[["SGSeq_sampleinfo"]] <- 
@@ -1468,7 +1483,7 @@ getSGSeqSI <- function(fds, sample_ids){
         df <- data.frame(
             sample_name=samples(fds)[samples(fds) %in% sample_ids],
             file_bam=bamFile(fds)[samples(fds) %in% sample_ids])
-        si <- getBamInfo(df, yieldSize=1e6)  
+        si <- gSGSeq::getBamInfo(df, yieldSize=1e6)  
         si$lib_size <- 50e6 # dummy value to speed up this part
         metadata(fds)[["SGSeq_sampleinfo"]] <- si
         return(list(si, fds))

From 51bfdfb5521b0d7125d84c9fde89b6f72a57392d Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Wed, 13 Jul 2022 15:14:43 +0200
Subject: [PATCH 28/80] fix typo

---
 R/plotMethods.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/plotMethods.R b/R/plotMethods.R
index c74a8c5c..4deeb120 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -1483,7 +1483,7 @@ getSGSeqSI <- function(fds, sample_ids){
         df <- data.frame(
             sample_name=samples(fds)[samples(fds) %in% sample_ids],
             file_bam=bamFile(fds)[samples(fds) %in% sample_ids])
-        si <- gSGSeq::getBamInfo(df, yieldSize=1e6)  
+        si <- SGSeq::getBamInfo(df, yieldSize=1e6)  
         si$lib_size <- 50e6 # dummy value to speed up this part
         metadata(fds)[["SGSeq_sampleinfo"]] <- si
         return(list(si, fds))

From f6bdef66fa027bb1fb22f2c377b574481c29efda Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Wed, 13 Jul 2022 16:28:30 +0200
Subject: [PATCH 29/80] slightly improved intron_retention detection

---
 R/resultAnnotations.R | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/R/resultAnnotations.R b/R/resultAnnotations.R
index 5cafc8ea..2dbd9a5b 100644
--- a/R/resultAnnotations.R
+++ b/R/resultAnnotations.R
@@ -337,12 +337,13 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
     if(all(c("nonsplitProportion", "nonsplitProportion_99quantile") %in% 
             colnames(junctions_dt))){
         junctions_dt[spliceType == "annotatedIntron_reducedUsage" & 
-                         type == "jaccard" &
-                         nonsplitProportion >= nonsplitProportion_99quantile,
-                     spliceType := "intron_retention"]
+                    type == "jaccard" &
+                    nonsplitProportion >= nonsplitProportion_99quantile + 0.05 &
+                    nonsplitCounts >= 10,
+                spliceType := "(partial)intronRetention"]
         
         # TODO check frameshift for intron retention
-        junctions_dt[spliceType == "intron_retention", 
+        junctions_dt[spliceType == "(partial)intronRetention", 
                         causesFrameshift := "inconclusive"]
     }
     

From 1aa4052d64a0ae0b1c5908181e1512ab4c20891a Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Wed, 13 Jul 2022 16:40:02 +0200
Subject: [PATCH 30/80] require sgseq package to be available for running
 examples

---
 R/plotMethods.R      | 60 +++++++++++++++++++++++---------------------
 man/plotFunctions.Rd | 54 ++++++++++++++++++++-------------------
 2 files changed, 60 insertions(+), 54 deletions(-)

diff --git a/R/plotMethods.R b/R/plotMethods.R
index 4deeb120..eef9ce4e 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -246,34 +246,36 @@
 #' plotExpectedVsObservedPsi(fds, type="psi5", res=res[1])
 #' 
 #' # plot splice graph and coverage from bam files in a given region
-#' fds <- createTestFraserSettings()
-#' gr <- GRanges(seqnames="chr19", 
-#'     IRanges(start=7587496, end=7598895), 
-#'     strand="+")
-#' plotBamCoverage(fds, gr=gr, sampleID="sample3", 
-#'     control_samples="sample2", min_junction_count=5,
-#'     curvature_splicegraph=1, curvature_coverage=1, 
-#'     mar=c(1, 7, 0.1, 3))
+#' if(require(SGSeq)){
+#'     fds <- createTestFraserSettings()
+#'     gr <- GRanges(seqnames="chr19", 
+#'         IRanges(start=7587496, end=7598895), 
+#'         strand="+")
+#'     plotBamCoverage(fds, gr=gr, sampleID="sample3", 
+#'         control_samples="sample2", min_junction_count=5,
+#'         curvature_splicegraph=1, curvature_coverage=1, 
+#'         mar=c(1, 7, 0.1, 3))
 #' 
-#' # plot coverage from bam file for a row in the result table
-#' fds <- createTestFraserDataSet()
-#' require(TxDb.Hsapiens.UCSC.hg19.knownGene)
-#' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
-#' require(org.Hs.eg.db)
-#' orgDb <- org.Hs.eg.db
+#'     # plot coverage from bam file for a row in the result table
+#'     fds <- createTestFraserDataSet()
+#'     require(TxDb.Hsapiens.UCSC.hg19.knownGene)
+#'     txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
+#'     require(org.Hs.eg.db)
+#'     orgDb <- org.Hs.eg.db
+#'  
+#'     res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA, zScoreCutoff=NA)
+#'     res_dt <- as.data.table(res)
+#'     res_dt <- res_dt[sampleID == "sample2",]
 #'     
-#' res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA, zScoreCutoff=NA)
-#' res_dt <- as.data.table(res)
-#' res_dt <- res_dt[sampleID == "sample2",]
+#'     # plot full range of gene containing outlier junction
+#'     plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=TRUE,
+#'         txdb=txdb, orgDb=orgDb, control_samples="sample3")
 #'     
-#' # plot full range of gene containing outlier junction
-#' plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=TRUE,
-#'     txdb=txdb, orgDb=orgDb, control_samples="sample3")
-#'     
-#' # plot only certain range around outlier junction
-#' plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE, 
-#'     control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb,
-#'     curvature_coverage=0.5, right_extension=5000, left_extension=5000)
+#'     # plot only certain range around outlier junction
+#'     plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE, 
+#'         control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb,
+#'         curvature_coverage=0.5, right_extension=5000, left_extension=5000)
+#' }
 #' 
 NULL
 
@@ -1217,7 +1219,7 @@ plotBamCoverage <- function(fds, gr, sampleID,
         strand(gr) <- rep(strand(gr[1,]), length(gr))
     }
     gr <- range(gr)
-    gr <- keepSeqlevels(gr, as.character(seqnames(gr)))
+    gr <- keepSeqlevels(gr, unique(as.character(seqnames(gr))))
     
     # convert highlight_range to GRangesList if not
     if(!is.null(highlight_range) && !is(highlight_range, "GRangesList")){
@@ -1232,7 +1234,7 @@ plotBamCoverage <- function(fds, gr, sampleID,
     # overlap detected junctions with annotation
     if(!is.null(txdb)){
         # subset to chr of interest
-        seqlevels(txdb) <- as.character(seqnames(gr))
+        seqlevels(txdb) <- unique(as.character(seqnames(gr)))
         
         # extract transcript features with SGSeq package
         txf <- SGSeq::convertToTxFeatures(txdb)
@@ -1383,7 +1385,9 @@ plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE,
     }
     
     # if several genes overlap, only show those on same strand as outlier
-    gr <- gr[strand(gr) == strand(outlier_range),]
+    if(as.character(strand(outlier_range)) != "*"){
+        gr <- gr[strand(gr) == strand(outlier_range),]
+    }
     
     # create the coverage plot for the given outlier
     fds <- plotBamCoverage(fds, 
diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd
index 14958dae..3ee980cf 100644
--- a/man/plotFunctions.Rd
+++ b/man/plotFunctions.Rd
@@ -481,33 +481,35 @@ plotQQ(fds, result=res[1])
 plotExpectedVsObservedPsi(fds, type="psi5", res=res[1])
 
 # plot splice graph and coverage from bam files in a given region
-fds <- createTestFraserSettings()
-gr <- GRanges(seqnames="chr19", 
-    IRanges(start=7587496, end=7598895), 
-    strand="+")
-plotBamCoverage(fds, gr=gr, sampleID="sample3", 
-    control_samples="sample2", min_junction_count=5,
-    curvature_splicegraph=1, curvature_coverage=1, 
-    mar=c(1, 7, 0.1, 3))
-
-# plot coverage from bam file for a row in the result table
-fds <- createTestFraserDataSet()
-require(TxDb.Hsapiens.UCSC.hg19.knownGene)
-txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
-require(org.Hs.eg.db)
-orgDb <- org.Hs.eg.db
+if(require(SGSeq)){
+    fds <- createTestFraserSettings()
+    gr <- GRanges(seqnames="chr19", 
+        IRanges(start=7587496, end=7598895), 
+        strand="+")
+    plotBamCoverage(fds, gr=gr, sampleID="sample3", 
+        control_samples="sample2", min_junction_count=5,
+        curvature_splicegraph=1, curvature_coverage=1, 
+        mar=c(1, 7, 0.1, 3))
+
+    # plot coverage from bam file for a row in the result table
+    fds <- createTestFraserDataSet()
+    require(TxDb.Hsapiens.UCSC.hg19.knownGene)
+    txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
+    require(org.Hs.eg.db)
+    orgDb <- org.Hs.eg.db
+ 
+    res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA, zScoreCutoff=NA)
+    res_dt <- as.data.table(res)
+    res_dt <- res_dt[sampleID == "sample2",]
     
-res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA, zScoreCutoff=NA)
-res_dt <- as.data.table(res)
-res_dt <- res_dt[sampleID == "sample2",]
+    # plot full range of gene containing outlier junction
+    plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=TRUE,
+        txdb=txdb, orgDb=orgDb, control_samples="sample3")
     
-# plot full range of gene containing outlier junction
-plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=TRUE,
-    txdb=txdb, orgDb=orgDb, control_samples="sample3")
-    
-# plot only certain range around outlier junction
-plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE, 
-    control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb,
-    curvature_coverage=0.5, right_extension=5000, left_extension=5000)
+    # plot only certain range around outlier junction
+    plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE, 
+        control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb,
+        curvature_coverage=0.5, right_extension=5000, left_extension=5000)
+}
 
 }

From 8a83664ac5f3503221c82a31a1392fdd6526c3ad Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Wed, 13 Jul 2022 19:13:56 +0200
Subject: [PATCH 31/80] fix example

---
 R/plotMethods.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/R/plotMethods.R b/R/plotMethods.R
index eef9ce4e..ad241e25 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -274,7 +274,8 @@
 #'     # plot only certain range around outlier junction
 #'     plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE, 
 #'         control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb,
-#'         curvature_coverage=0.5, right_extension=5000, left_extension=5000)
+#'         curvature_coverage=0.5, right_extension=5000, left_extension=5000,
+#'         splicegraph_labels="id")
 #' }
 #' 
 NULL

From 78efd11c36f3ba71625d785a003dd7c25c7b2e07 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Thu, 14 Jul 2022 14:06:21 +0200
Subject: [PATCH 32/80] add updated man/plotFunctions.Rd

---
 man/plotFunctions.Rd | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd
index 3ee980cf..f4cc5c47 100644
--- a/man/plotFunctions.Rd
+++ b/man/plotFunctions.Rd
@@ -509,7 +509,8 @@ if(require(SGSeq)){
     # plot only certain range around outlier junction
     plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE, 
         control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb,
-        curvature_coverage=0.5, right_extension=5000, left_extension=5000)
+        curvature_coverage=0.5, right_extension=5000, left_extension=5000,
+        splicegraph_labels="id")
 }
 
 }

From d775fe4740bf015a309ce997e6914d24085b4189 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Tue, 19 Jul 2022 15:42:51 +0200
Subject: [PATCH 33/80] allow for small variants in nonsplit anchor region

---
 R/countRNAseqData.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/R/countRNAseqData.R b/R/countRNAseqData.R
index b5b442df..ee9f908b 100644
--- a/R/countRNAseqData.R
+++ b/R/countRNAseqData.R
@@ -895,7 +895,8 @@ countNonSplicedReads <- function(sampleID, splitCountRanges, fds,
     
     # extract the counts with Rsubread
     tmp_ssc <- checkSeqLevelStyle(spliceSiteCoords, fds, sampleID, TRUE)
-    anno <- GRanges2SAF(tmp_ssc, minAnchor=minAnchor)
+    # use minAnchor+1 here to allow for small variants in the anchor region
+    anno <- GRanges2SAF(tmp_ssc, minAnchor=(minAnchor+1))
     rsubreadCounts <- featureCounts(files=bamFile, annot.ext=anno,
             minOverlap=minAnchor*2, 
             allowMultiOverlap=TRUE,

From 580234b05ec346b98bf6179f9719ebfd535c366b Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Tue, 19 Jul 2022 17:48:41 +0200
Subject: [PATCH 34/80] adapt test case to compare to correct result when
 allowing short mismatches in nonsplit anchor

---
 tests/testthat/test_counting.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/testthat/test_counting.R b/tests/testthat/test_counting.R
index 4fcd0d61..1bdbfc37 100644
--- a/tests/testthat/test_counting.R
+++ b/tests/testthat/test_counting.R
@@ -51,7 +51,7 @@ test_that("test minAnchor", {
         "sample3", features, fds, minAnchor=25, recount=TRUE)) })
     
     expect_equivalent(c(7, 8, 0, 0, 7), ctnNS5[,1])
-    expect_equivalent(c(5, 8, 0, 0, 6), ctnNS25[,1])
+    expect_equivalent(c(5, 8, 0, 0, 7), ctnNS25[,1])
 })
 
 test_that("Test psi values", {

From df77b899a73f65bf9fe43d43971d7871b3292794 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Thu, 21 Jul 2022 15:20:18 +0200
Subject: [PATCH 35/80] fix minor issue

---
 R/plotMethods.R | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/R/plotMethods.R b/R/plotMethods.R
index ad241e25..f78a41c0 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -1176,7 +1176,7 @@ plotBamCoverage <- function(fds, gr, sampleID,
         toscale=c("exon", "gene", "none"), mar=c(2, 10, 0.1, 5),
         curvature_splicegraph=1, curvature_coverage=1, cex=1,
         splicegraph_labels=c("genomic_range", "id", "name", "none"),
-        splicegraph_position=c("top", "bottom")){
+        splicegraph_position=c("top", "bottom"), ...){
     
     if(missing(fds)){
         stop("Missing input: fds (FraserDataSet object)")
@@ -1230,7 +1230,8 @@ plotBamCoverage <- function(fds, gr, sampleID,
     
     # extract splice graph
     sgfc_pred <- SGSeq::analyzeFeatures(sgseq_si, which = gr, 
-                                min_junction_count=min_junction_count, psi=0)
+                                min_junction_count=min_junction_count, psi=0, 
+                                ...)
     
     # overlap detected junctions with annotation
     if(!is.null(txdb)){
@@ -1386,7 +1387,8 @@ plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE,
     }
     
     # if several genes overlap, only show those on same strand as outlier
-    if(as.character(strand(outlier_range)) != "*"){
+    if(as.character(strand(outlier_range)) != "*" & 
+        length(gr[strand(gr) == strand(outlier_range),]) > 0){
         gr <- gr[strand(gr) == strand(outlier_range),]
     }
     

From 211da6094221923a2379cdaedb112ce3165aec94 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Wed, 17 Aug 2022 11:01:57 +0200
Subject: [PATCH 36/80] small fix to plot cor heatmap for jaccard

---
 R/plotMethods.R      | 4 ++--
 man/plotFunctions.Rd | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/R/plotMethods.R b/R/plotMethods.R
index cb392593..5021edf0 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -971,8 +971,8 @@ plotFilterVariability <- function(fds, bins=200, legend.position=c(0.8, 0.8),
 
 
 plotCountCorHeatmap.FRASER <- function(object,
-                    type=c("psi5", "psi3", "theta"), logit=FALSE, topN=50000, 
-                    topJ=5000, minMedian=1, minCount=10, 
+                    type=c("psi5", "psi3", "theta", "jaccard"), logit=FALSE, 
+                    topN=50000, topJ=5000, minMedian=1, minCount=10, 
                     main=NULL, normalized=FALSE, show_rownames=FALSE,
                     show_colnames=FALSE, minDeltaPsi=0.1, annotation_col=NA,
                     annotation_row=NA, border_color=NA, nClust=5,
diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd
index a2862aed..57b42b60 100644
--- a/man/plotFunctions.Rd
+++ b/man/plotFunctions.Rd
@@ -107,7 +107,7 @@ plotFilterVariability(
 
 \S4method{plotCountCorHeatmap}{FraserDataSet}(
   object,
-  type = c("psi5", "psi3", "theta"),
+  type = c("psi5", "psi3", "theta", "jaccard"),
   logit = FALSE,
   topN = 50000,
   topJ = 5000,
@@ -149,7 +149,8 @@ plotBamCoverage(
   curvature_coverage = 1,
   cex = 1,
   splicegraph_labels = c("genomic_range", "id", "name", "none"),
-  splicegraph_position = c("top", "bottom")
+  splicegraph_position = c("top", "bottom"),
+  ...
 )
 
 plotBamCoverageFromResultTable(

From 90ae558c5a70f51a15554fa9d7f05c223a56d50a Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Wed, 23 Nov 2022 14:08:51 +0100
Subject: [PATCH 37/80] add manhattan plot function and update documentation

---
 DESCRIPTION                 |   4 +-
 NAMESPACE                   |   5 +
 R/AllGenerics-definitions.R |   6 ++
 R/FRASER-package.R          |   2 +
 R/plotMethods.R             | 207 ++++++++++++++++++++++++++++++++++++
 man/plotFunctions.Rd        |  38 ++++++-
 6 files changed, 256 insertions(+), 6 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 1961b62e..a38de791 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -84,7 +84,9 @@ Suggests:
     TxDb.Hsapiens.UCSC.hg19.knownGene,
     org.Hs.eg.db,
     rtracklayer,
-    SGSeq
+    SGSeq,
+    ggbio,
+    biovizBase
 LinkingTo:
     RcppArmadillo,
     Rcpp
diff --git a/NAMESPACE b/NAMESPACE
index 0e6eb40a..4d813d74 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -74,6 +74,7 @@ export(plotExpectedVsObservedPsi)
 export(plotExpression)
 export(plotFilterExpression)
 export(plotFilterVariability)
+export(plotManhattan)
 export(plotQQ)
 export(plotVolcano)
 export(predictedMeans)
@@ -112,6 +113,7 @@ exportMethods(pairedEnd)
 exportMethods(plotAberrantPerSample)
 exportMethods(plotCountCorHeatmap)
 exportMethods(plotEncDimSearch)
+exportMethods(plotManhattan)
 exportMethods(plotQQ)
 exportMethods(plotVolcano)
 exportMethods(results)
@@ -255,11 +257,14 @@ importFrom(VGAM,rbetabinom)
 importFrom(VGAM,vglm)
 importFrom(biomaRt,getBM)
 importFrom(biomaRt,useEnsembl)
+importFrom(biovizBase,parseArgsForAes)
+importFrom(biovizBase,parseArgsForNonAes)
 importFrom(cowplot,theme_cowplot)
 importFrom(extraDistr,dbbinom)
 importFrom(extraDistr,pbbinom)
 importFrom(extraDistr,rdirmnom)
 importFrom(generics,fit)
+importFrom(ggbio,autoplot)
 importFrom(ggplot2,aes)
 importFrom(ggplot2,annotate)
 importFrom(ggplot2,annotation_logticks)
diff --git a/R/AllGenerics-definitions.R b/R/AllGenerics-definitions.R
index ced4142a..0a364ac2 100644
--- a/R/AllGenerics-definitions.R
+++ b/R/AllGenerics-definitions.R
@@ -146,3 +146,9 @@ setGeneric("nonSplicedReads",
 #' @export
 setGeneric("nonSplicedReads<-", signature = "object", 
         function(object, value) standardGeneric("nonSplicedReads<-"))
+
+#' @rdname plotFunctions
+#' @export
+setGeneric("plotManhattan", function(object, ...) 
+    standardGeneric("plotManhattan"))
+
diff --git a/R/FRASER-package.R b/R/FRASER-package.R
index 642dc225..22cee146 100644
--- a/R/FRASER-package.R
+++ b/R/FRASER-package.R
@@ -50,6 +50,8 @@
 #' @importFrom RColorBrewer brewer.pal
 #' @importFrom cowplot theme_cowplot
 #' @importFrom ggrepel geom_text_repel
+#' @importFrom biovizBase parseArgsForAes parseArgsForNonAes 
+#' @importFrom ggbio autoplot
 #'
 #'
 ### Data handling
diff --git a/R/plotMethods.R b/R/plotMethods.R
index 5021edf0..1b4ae279 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -17,6 +17,7 @@
 #'   \item plotEncDimSearch()
 #'   \item plotBamCoverage()
 #'   \item plotBamCoverageFromResultTable()
+#'   \item plotManhattan()
 #' }
 #'
 #' For a detailed description of each plot function please see the details.
@@ -209,6 +210,17 @@
 #' It plots the encoding dimension against the achieved loss (area under the
 #' precision-recall curve). From this plot the optimum should be choosen for
 #' the \code{q} in fitting process.
+#' 
+#' \code{plotManhattan}: A Manhattan plot showing the junction pvalues by 
+#' genomic position. Useful to identify if outliers cluster by genomic position.
+#' 
+#' \code{plotBamCoverage}: A sashimi plot showing the read coverage from 
+#' the underlying bam files for a given genomic range and sampleIDs. 
+#' 
+#' \code{plotBamCoverageFromResultTable}: A sashimi plot showing the read 
+#' coverage from the underlying bam files for a row in the results table. Can 
+#' either show the full range of the gene with the outlier junction or only a 
+#' certain region around the outlier. 
 #'
 #' @return If base R graphics are used nothing is returned else the plotly or
 #'             the gplot object is returned.
@@ -245,6 +257,9 @@
 #' plotQQ(fds, result=res[1])
 #' plotExpectedVsObservedPsi(fds, type="psi5", res=res[1])
 #' 
+#' # create manhattan plot of pvalues by genomic position
+#' plotManhattan(fds, type="jaccard", sampleID="sample10")
+#' 
 #' # plot splice graph and coverage from bam files in a given region
 #' if(require(SGSeq)){
 #'     fds <- createTestFraserSettings()
@@ -1411,6 +1426,69 @@ plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE,
     return(invisible(fds))
 }
 
+plotManhattan.FRASER <- function(object, sampleID, 
+                                type=c("psi5", "psi3", "theta", "jaccard"), 
+                                main=paste0("sampleID = ", sampleID), 
+                                color=c("black", "darkgrey"),
+                                ...){
+    # check arguments
+    stopifnot(sampleID %in% samples(object))
+    type <- match.arg(type)
+    additional_args <- list(...)
+    padjCutoff <- 0.05
+    if("padjCutoff" %in% names(additional_args)){
+        padjCutoff <- additional_args$padjCutoff
+    }
+    deltaPsiCutoff <- 0.3
+    if("deltaPsiCutoff" %in% names(additional_args)){
+        deltaPsiCutoff <- additional_args$deltaPsiCutoff
+    }
+    
+    # extract neccessary informations
+    gr_sample <- rowRanges(object, type=type)
+    seqlevelsStyle(gr_sample) <- seqlevelsStyle(object)
+    mcols(gr_sample)[,"pvalue"] <- -log10(
+        pVals(object, type=type, level="junction")[,sampleID])
+    mcols(gr_sample)[,"padjust"] <- -log10(
+        padjVals(object, type=type, level="site")[,sampleID])
+    mcols(gr_sample)[,"delta"] <- deltaPsiValue(object, type=type)[,sampleID]
+    
+    # only one point per donor/acceptor site (relevant only for psi5 and psi3)
+    index <- FRASER:::getSiteIndex(object, type=type)
+    nonDup <- !duplicated(index)
+    gr_sample <- gr_sample[nonDup,]
+    
+    # Sort granges for plot
+    gr_sample <- sortSeqlevels(gr_sample)
+    gr_sample <- sort(gr_sample)
+    
+    # find outlier indices
+    if(!type %in% c("psi3", "psi5")){
+        outlier_idx <- which(gr_sample$padjust >= -log10(padjCutoff) &
+                                abs(gr_sample$delta) >= deltaPsiCutoff)
+    } else{
+        outlier_idx <- which(gr_sample$padjust >= -log10(padjCutoff))
+    }
+    message("highlighting ", length(gr_sample[outlier_idx,]), " outliers ...")
+    
+    # plot manhattan plot
+    plotGrandLinear.adapted(gr_sample, aes(y=pvalue), 
+                            color=color, 
+                            highlight.gr=gr_sample[outlier_idx,],
+                            highlight.overlap="equal") + 
+        labs(title=main)
+    
+}
+
+#'
+#' Plot manhattan plot of junction pvalues
+#'
+#' @rdname plotFunctions
+#' @export
+setMethod("plotManhattan", signature="FraserDataSet", 
+          plotManhattan.FRASER)
+
+
 #'
 #' helper function to get the annotation as data frame from the col data object
 #'
@@ -1507,3 +1585,132 @@ getSGSeqSI <- function(fds, sample_ids){
         return(list(si, fds))
     }
 }
+
+#'
+#' Adapted function from ggbio package to create manhattan plot. 
+#' Adapted to allow highlighting only ranges that exactly match. Uses functions 
+#' from package biovizBase.
+#'
+#' @noRd
+plotGrandLinear.adapted <- function (obj, ..., facets, space.skip = 0.01, 
+        geom = NULL, cutoff = NULL, cutoff.color = "red", cutoff.size = 1, 
+        legend = FALSE, xlim, ylim, xlab, ylab, main, highlight.gr = NULL, 
+        highlight.name = NULL, highlight.col = "red", highlight.label = TRUE, 
+        highlight.label.size = 5, highlight.label.offset = 0.05, 
+        highlight.label.col = "black", 
+        highlight.overlap = c("any", "start", "end", "within", "equal"),
+        spaceline = FALSE){
+    if (is.null(geom)) 
+        geom <- "point"
+    args <- list(...)
+    args.aes <- parseArgsForAes(args)
+    args.non <- parseArgsForNonAes(args)
+    two.color <- c("#0080FF", "#4CC4FF")
+    .is.seq <- FALSE
+    if (!"colour" %in% names(args.aes)) {
+        if (!any(c("color", "colour") %in% names(args.non))) {
+            .color <- two.color
+            args.aes$color <- as.name("seqnames")
+            .is.seq <- TRUE
+        }
+        else {
+            if (length(args.non$color) > 1) {
+                .color <- args.non$color
+                args.aes$color <- as.name("seqnames")
+                .is.seq <- TRUE
+                args.non <- args.non[!names(args.non) %in% c("colour", 
+                                                             "color")]
+            }
+        }
+    }
+    else {
+        if (quo_name(args.aes$colour) == "seqnames") 
+            args.aes$colour <- as.name("seqnames")
+    }
+    if (!"y" %in% names(args.aes)) 
+        stop("need to provide y")
+    args.non$coord <- "genome"
+    args.non$space.skip <- space.skip
+    args.non$geom <- geom
+    args.non$object <- obj
+    aes.res <- do.call(aes, args.aes)
+    p <- do.call(autoplot, c(list(aes.res), args.non))
+    if (!legend) 
+        p <- p + theme(legend.position = "none")
+    if (!missing(ylab)) 
+        p <- p + ylab(ylab)
+    if (!is.null(cutoff)) 
+        p <- p + geom_hline(yintercept = cutoff, color = cutoff.color, 
+                            size = cutoff.size)
+    chrs <- names(seqlengths(obj))
+    if (.is.seq) {
+        N <- length(chrs)
+        cols <- rep(.color, round(N/length(.color)) + 1)[1:N]
+        names(cols) <- chrs
+        p <- p + scale_color_manual(values = cols)
+    }
+    if (!missing(facets)) {
+        args$facets <- facets
+        args.facets <- subsetArgsByFormals(args, facet_grid, 
+                                           facet_wrap)
+        facet <- .buildFacetsFromArgs(obj, args.facets)
+        p <- p + facet
+    }
+    p <- p + theme(panel.grid.minor = element_blank())
+    if (!is.null(highlight.gr)) {
+        highlight.overlap <- match.arg(highlight.overlap)
+        idx <- findOverlaps(obj, highlight.gr, type=highlight.overlap)
+        .h.pos <- lapply(split(queryHits(idx), subjectHits(idx)), 
+                         function(id) {
+                             gr <- GRanges(as.character(seqnames(p@data))[id][1], 
+                                           IRanges(start = min(start(p@data[id])), end = max(end(p@data[id]))))
+                             val <- max(as.numeric(values(p@data[id])[, quo_name(args.aes$y)]))
+                             val <- val * (1 + highlight.label.offset)
+                             values(gr)$val <- val
+                             gr
+                         })
+        .h.pos <- suppressWarnings(do.call("c", unname(.h.pos)))
+        if (length(.h.pos)) {
+            if (is.null(highlight.name)) {
+                highlight.name <- names(highlight.gr)
+            }
+            else {
+                highlight.name <- values(highlight.gr)[, highlight.name]
+            }
+            p <- p + geom_point(data = mold(p@data[queryHits(idx)]), 
+                                do.call(aes, list(x = substitute(midpoint), y = args.aes$y)), 
+                                color = highlight.col)
+            if (!is.null(highlight.name)) {
+                seqlevels(.h.pos, pruning.mode = "coarse") <- seqlevels(obj)
+                suppressWarnings(seqinfo(.h.pos) <- seqinfo(obj))
+                .trans <- transformToGenome(.h.pos, space.skip = space.skip)
+                values(.trans)$mean <- (start(.trans) + end(.trans))/2
+                values(.trans)$names <- highlight.name
+                p <- p + geom_text(data = mold(.trans), size = highlight.label.size, 
+                                   vjust = 0, color = highlight.label.col, do.call(aes, 
+                                                                                   list(x = substitute(mean), y = as.name("val"), 
+                                                                                        label = as.name("names"))))
+            }
+        }
+    }
+    if (spaceline) {
+        vline.df <- p@ggplot$data
+        vline.df <- do.call(rbind, by(vline.df, vline.df$seqnames, 
+                                      function(dd) {
+                                          data.frame(start = min(dd$start), end = max(dd$end))
+                                      }))
+        gap <- (vline.df$start[-1] + vline.df$end[-nrow(vline.df)])/2
+        p <- p + geom_vline(xintercept = gap, alpha = 0.5, color = "gray70") + 
+            theme(panel.grid = element_blank())
+    }
+    if (!missing(main)) 
+        p <- p + labs(title = main)
+    if (!missing(xlim)) 
+        p <- p + xlim(xlim)
+    if (!missing(ylim)) 
+        p <- p + ylim(ylim)
+    if (missing(xlab)) 
+        xlab <- ""
+    p <- p + ggplot2::xlab(xlab)
+    p
+}
diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd
index 57b42b60..9f591ebb 100644
--- a/man/plotFunctions.Rd
+++ b/man/plotFunctions.Rd
@@ -1,6 +1,7 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/plotMethods.R
-\name{plotFunctions}
+% Please edit documentation in R/AllGenerics-definitions.R, R/plotMethods.R
+\name{plotManhattan}
+\alias{plotManhattan}
 \alias{plotFunctions}
 \alias{plotAberrantPerSample}
 \alias{plotVolcano}
@@ -18,8 +19,11 @@
 \alias{plotCountCorHeatmap,FraserDataSet-method}
 \alias{plotBamCoverage}
 \alias{plotBamCoverageFromResultTable}
+\alias{plotManhattan,FraserDataSet-method}
 \title{Visualization functions for FRASER}
 \usage{
+plotManhattan(object, ...)
+
 \S4method{plotVolcano}{FraserDataSet}(
   object,
   sampleID,
@@ -166,10 +170,22 @@ plotBamCoverageFromResultTable(
   right_extension = 1000,
   ...
 )
+
+\S4method{plotManhattan}{FraserDataSet}(
+  object,
+  sampleID,
+  type = c("psi5", "psi3", "theta", "jaccard"),
+  main = paste0("sampleID = ", sampleID),
+  color = c("black", "darkgrey"),
+  ...
+)
 }
 \arguments{
 \item{object, fds}{An \code{\link{FraserDataSet}} object.}
 
+\item{...}{Additional parameters passed to plot() or plot_ly() if not stated
+otherwise in the details for each plot function}
+
 \item{sampleID}{A sample ID which should be plotted. Can also be a vector.
 Integers are treated as indices.}
 
@@ -193,9 +209,6 @@ list of gene symbols or sampleIDs.}
 \item{padjCutoff, zScoreCutoff, deltaPsiCutoff}{Significance, Z-score or delta
 psi cutoff to mark outliers}
 
-\item{...}{Additional parameters passed to plot() or plot_ly() if not stated
-otherwise in the details for each plot function}
-
 \item{BPPARAM}{BiocParallel parameter to use.}
 
 \item{idx}{A junction site ID or gene ID or one of both, which
@@ -405,6 +418,7 @@ This is the list of all plotting function provided by FRASER:
   \item plotEncDimSearch()
   \item plotBamCoverage()
   \item plotBamCoverageFromResultTable()
+  \item plotManhattan()
 }
 
 For a detailed description of each plot function please see the details.
@@ -453,6 +467,17 @@ introns and for the filtered (i.e. non-variable) introns.
 It plots the encoding dimension against the achieved loss (area under the
 precision-recall curve). From this plot the optimum should be choosen for
 the \code{q} in fitting process.
+
+\code{plotManhattan}: A Manhattan plot showing the junction pvalues by 
+genomic position. Useful to identify if outliers cluster by genomic position.
+
+\code{plotBamCoverage}: A sashimi plot showing the read coverage from 
+the underlying bam files for a given genomic range and sampleIDs. 
+
+\code{plotBamCoverageFromResultTable}: A sashimi plot showing the read 
+coverage from the underlying bam files for a row in the results table. Can 
+either show the full range of the gene with the outlier junction or only a 
+certain region around the outlier.
 }
 \examples{
 # create full FRASER object 
@@ -481,6 +506,9 @@ plotExpression(fds, result=res[1])
 plotQQ(fds, result=res[1])
 plotExpectedVsObservedPsi(fds, type="psi5", res=res[1])
 
+# create manhattan plot of pvalues by genomic position
+plotManhattan(fds, type="jaccard", sampleID="sample10")
+
 # plot splice graph and coverage from bam files in a given region
 if(require(SGSeq)){
     fds <- createTestFraserSettings()

From 0a6addc6991957fdc764be207ae7939fa9188aba Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Thu, 24 Nov 2022 13:21:24 +0100
Subject: [PATCH 38/80] code cleanup & adaption of documentation, examples and
 tests for FRASER2

---
 NAMESPACE                              |  18 +-
 NEWS                                   |  21 ++
 R/AllGenerics-definitions.R            |   5 +
 R/AllGenerics.R                        |  32 ++-
 R/FRASER-package.R                     |  18 +-
 R/Fraser-pipeline.R                    |  16 +-
 R/annotationOfRanges.R                 |   6 +-
 R/autoencoder.R                        |   3 +-
 R/calculatePSIValue.R                  |  18 +-
 R/example_functions.R                  |   2 +-
 R/filterExpression.R                   | 358 ++++++++++++-------------
 R/find_encoding_dimensions.R           |  29 +-
 R/fitCorrectionMethods.R               |   2 +-
 R/getNSetterFuns.R                     |  78 ++++--
 R/helper-functions.R                   |  32 ++-
 R/makeSimulatedDataset.R               |   3 +-
 R/plotMethods.R                        |  47 ++--
 R/pvalsNzscore.R                       |  96 ++++++-
 R/variables.R                          |  20 +-
 R/zzz.R                                |   2 +-
 man/FRASER.Rd                          |  42 ++-
 man/annotateRanges.Rd                  |   6 +-
 man/calculatePSIValues.Rd              |   6 +-
 man/counts.Rd                          |  17 +-
 man/filtering.Rd                       |  70 ++---
 man/fit.Rd                             |   4 +-
 man/getter_setter_functions.Rd         |  14 +-
 man/injectOutliers.Rd                  |   3 +-
 man/optimHyperParams.Rd                |  13 +-
 man/plotFunctions.Rd                   |  40 +--
 man/psiTypes.Rd                        |   8 +-
 man/psiTypes_avail.Rd                  |  21 ++
 man/results.Rd                         |  16 +-
 tests/testthat/test_fraser_pipeline.R  |   2 +-
 tests/testthat/test_hyperParams.R      |   1 +
 tests/testthat/test_plotJunctionDist.R |   8 +-
 tests/testthat/test_stats.R            |  62 ++---
 37 files changed, 659 insertions(+), 480 deletions(-)
 create mode 100644 man/psiTypes_avail.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 4d813d74..139fcd83 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -30,6 +30,7 @@ export(bamFile)
 export(bestQ)
 export(calculatePSIValues)
 export(calculatePadjValues)
+export(calculatePadjValuesOnSubset)
 export(calculatePvalues)
 export(calculateZscore)
 export(condition)
@@ -44,10 +45,7 @@ export(dontWriteHDF5)
 export(featureExclusionMask)
 export(filterExpression)
 export(filterExpressionAndVariability)
-export(filterExpressionAndVariability_jaccard)
-export(filterExpression_jaccard)
 export(filterVariability)
-export(filterVariability_jaccard)
 export(fit)
 export(flagBlacklistRegions)
 export(getNonSplitReadCountsForAllSamples)
@@ -80,6 +78,7 @@ export(plotVolcano)
 export(predictedMeans)
 export(pseudocount)
 export(psiTypes)
+export(psiTypes_avail)
 export(results)
 export(rho)
 export(samples)
@@ -106,6 +105,7 @@ exportMethods(assays)
 exportMethods(bamFile)
 exportMethods(condition)
 exportMethods(filterExpression)
+exportMethods(filterVariability)
 exportMethods(length)
 exportMethods(name)
 exportMethods(nonSplicedReads)
@@ -167,6 +167,7 @@ importFrom(GenomeInfoDb,seqlengths)
 importFrom(GenomeInfoDb,seqlevels)
 importFrom(GenomeInfoDb,seqlevelsStyle)
 importFrom(GenomeInfoDb,seqnames)
+importFrom(GenomeInfoDb,sortSeqlevels)
 importFrom(GenomeInfoDb,standardChromosomes)
 importFrom(GenomicAlignments,junctions)
 importFrom(GenomicAlignments,readGAlignmentPairs)
@@ -180,6 +181,7 @@ importFrom(GenomicFeatures,makeTxDbFromGFF)
 importFrom(GenomicFeatures,seqlevels0)
 importFrom(GenomicFeatures,threeUTRsByTranscript)
 importFrom(GenomicRanges,"end<-")
+importFrom(GenomicRanges,"seqinfo<-")
 importFrom(GenomicRanges,"start<-")
 importFrom(GenomicRanges,GRanges)
 importFrom(GenomicRanges,GRangesList)
@@ -224,6 +226,7 @@ importFrom(Rsamtools,scanBamHeader)
 importFrom(Rsubread,featureCounts)
 importFrom(S4Vectors,"mcols<-")
 importFrom(S4Vectors,"metadata<-")
+importFrom(S4Vectors,"values<-")
 importFrom(S4Vectors,DataFrame)
 importFrom(S4Vectors,Rle)
 importFrom(S4Vectors,SimpleList)
@@ -234,6 +237,7 @@ importFrom(S4Vectors,metadata)
 importFrom(S4Vectors,queryHits)
 importFrom(S4Vectors,start)
 importFrom(S4Vectors,subjectHits)
+importFrom(S4Vectors,values)
 importFrom(SummarizedExperiment,"assay<-")
 importFrom(SummarizedExperiment,"assays<-")
 importFrom(SummarizedExperiment,"colData<-")
@@ -257,18 +261,17 @@ importFrom(VGAM,rbetabinom)
 importFrom(VGAM,vglm)
 importFrom(biomaRt,getBM)
 importFrom(biomaRt,useEnsembl)
-importFrom(biovizBase,parseArgsForAes)
-importFrom(biovizBase,parseArgsForNonAes)
 importFrom(cowplot,theme_cowplot)
 importFrom(extraDistr,dbbinom)
 importFrom(extraDistr,pbbinom)
 importFrom(extraDistr,rdirmnom)
 importFrom(generics,fit)
-importFrom(ggbio,autoplot)
 importFrom(ggplot2,aes)
 importFrom(ggplot2,annotate)
 importFrom(ggplot2,annotation_logticks)
 importFrom(ggplot2,element_blank)
+importFrom(ggplot2,facet_grid)
+importFrom(ggplot2,facet_wrap)
 importFrom(ggplot2,geom_abline)
 importFrom(ggplot2,geom_histogram)
 importFrom(ggplot2,geom_hline)
@@ -277,10 +280,12 @@ importFrom(ggplot2,geom_point)
 importFrom(ggplot2,geom_ribbon)
 importFrom(ggplot2,geom_segment)
 importFrom(ggplot2,geom_smooth)
+importFrom(ggplot2,geom_text)
 importFrom(ggplot2,geom_vline)
 importFrom(ggplot2,ggplot)
 importFrom(ggplot2,ggtitle)
 importFrom(ggplot2,labs)
+importFrom(ggplot2,quo_name)
 importFrom(ggplot2,scale_color_brewer)
 importFrom(ggplot2,scale_color_discrete)
 importFrom(ggplot2,scale_color_gradientn)
@@ -349,6 +354,7 @@ importFrom(stats,rnbinom)
 importFrom(stats,rnorm)
 importFrom(stats,runif)
 importFrom(stats,sd)
+importFrom(tibble,"%>%")
 importFrom(tibble,as_tibble)
 importFrom(tools,file_path_as_absolute)
 importFrom(utils,capture.output)
diff --git a/NEWS b/NEWS
index 15ca2c35..6728c4a5 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,24 @@
+CHANGES IN VERSION 2.0.0
+-------------------------
+    o Major update to FRASER2:
+        o Introduction of new & more robust splice metric Intron Jaccard Index
+        o Only Intron Jaccard Index metric used by default    
+        o Improved gene level pvalue calculation and internal storage
+        o Introduction of option to limit FDR correction to user-defined 
+          subsets of genes per sample (e.g. OMIM genes with rare variant)
+        o Updated internal pseudocount parameter and default delta Jaccard 
+          cutoff    
+        o Junction filtering adapted to usage of Intron Jaccard Index metric
+            o Require min expression of N >= 10 in 25% of the samples
+        o Results table:
+            o Functionality to flag outliers in blacklist regions of the genome
+            o Functionality to annotate the predicted type of aberrantSplicing 
+              (e.g. exon skipping, intron retention etc.)
+        o Several updates in the plotting functions    
+            o introduction of manhattan plot functionality
+            o possibility to create sashimi plots to visualize read coverage in 
+              the bam files for outliers
+
 CHANGES IN VERSION 1.8.1
 -------------------------
     o Bugfix in merging splicing counts (#41)
diff --git a/R/AllGenerics-definitions.R b/R/AllGenerics-definitions.R
index 0a364ac2..0015b143 100644
--- a/R/AllGenerics-definitions.R
+++ b/R/AllGenerics-definitions.R
@@ -152,3 +152,8 @@ setGeneric("nonSplicedReads<-", signature = "object",
 setGeneric("plotManhattan", function(object, ...) 
     standardGeneric("plotManhattan"))
 
+#' @rdname filtering
+#' @export
+setGeneric("filterVariability", function(object, ...)
+    standardGeneric("filterVariability"))
+
diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index 1161e26a..4bfb89fe 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -66,6 +66,9 @@ NULL
 #' @rdname fds-methods
 #' @export
 setMethod("samples", "FraserDataSet", function(object) {
+    if(!is.null(colnames(object))){
+        return(colnames(object))
+    }
     return(as.character(colData(object)[,"sampleID"]))
 })
 
@@ -74,6 +77,7 @@ setMethod("samples", "FraserDataSet", function(object) {
 setReplaceMethod("samples", "FraserDataSet", function(object, value) {
     colData(object)[,"sampleID"] <- as.character(value)
     rownames(colData(object)) <- colData(object)[,"sampleID"]
+    colnames(object) <- as.character(value)
     validObject(object)
     return(object)
 })
@@ -515,12 +519,14 @@ setReplaceMethod("rowRanges", "FraserDataSet", FRASER.rowRanges.replace)
 #' @examples 
 #'  fds <- createTestFraserDataSet()
 #'  
-#'  counts(fds, type="psi5", side="ofInterest")
-#'  counts(fds, type="psi5", side="other")
+#'  counts(fds, side="ofInterest")
+#'  counts(fds, type="jaccard", side="other")
+#'  head(K(fds))
+#'  head(K(fds, type="psi5"))
 #'  head(K(fds, type="psi3"))
 #'  head(N(fds, type="theta"))
 #'  
-setMethod("counts", "FraserDataSet", function(object, type=NULL,
+setMethod("counts", "FraserDataSet", function(object, type=currentType(object),
             side=c("ofInterest", "otherSide")){
     side <- match.arg(side)
     if(side=="ofInterest"){
@@ -552,7 +558,8 @@ setMethod("counts", "FraserDataSet", function(object, type=NULL,
 #' setter for count data
 #' 
 #' @rdname counts
-setReplaceMethod("counts", "FraserDataSet", function(object, type=NULL,
+setReplaceMethod("counts", "FraserDataSet", function(object, 
+                    type=currentType(object),
                     side=c("ofInterest", "otherSide"), ..., value){
     side <- match.arg(side)
 
@@ -904,22 +911,21 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
 #'
 #' # get aberrant events per sample: on the example data, nothing is aberrant
 #' # based on the adjusted p-value
-#' aberrant(fds, type="psi5", by="sample")
+#' aberrant(fds, type="jaccard", by="sample")
 #' 
 #' # get aberrant events per gene (first annotate gene symbols)
 #' fds <- annotateRangesWithTxDb(fds)
-#' aberrant(fds, type="psi5", by="feature", zScoreCutoff=2, padjCutoff=NA,
+#' aberrant(fds, type="jaccard", by="feature", zScoreCutoff=2, padjCutoff=NA,
 #'         aggregate=TRUE)
 #'         
 #' # find aberrant junctions/splice sites
-#' aberrant(fds, type="psi5")
+#' aberrant(fds, type="jaccard")
 #' @export
 setMethod("results", "FraserDataSet", function(object, 
                     sampleIDs=samples(object), padjCutoff=0.05,
-                    zScoreCutoff=NA, deltaPsiCutoff=0.3,
-                    rhoCutoff=0.1, aggregate=FALSE, collapse=FALSE,
-                    minCount=5, psiType=c("psi3", "psi5", "theta", 
-                                            "jaccard"),
+                    zScoreCutoff=NA, deltaPsiCutoff=0.1,
+                    rhoCutoff=1, aggregate=FALSE, collapse=FALSE,
+                    minCount=5, psiType=currentType(object),
                     geneColumn="hgnc_symbol",
                     additionalColumns=NULL, BPPARAM=bpparam()){
     FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff,
@@ -931,8 +937,8 @@ setMethod("results", "FraserDataSet", function(object,
 })
 
 aberrant.FRASER <- function(object, type=currentType(object), 
-                                padjCutoff=0.05, deltaPsiCutoff=0.3, 
-                                zScoreCutoff=NA, minCount=5, rhoCutoff=0.1,
+                                padjCutoff=0.05, deltaPsiCutoff=0.1, 
+                                zScoreCutoff=NA, minCount=5, rhoCutoff=1,
                                 by=c("none", "sample", "feature"), 
                                 aggregate=FALSE, geneColumn="hgnc_symbol", ...){
     
diff --git a/R/FRASER-package.R b/R/FRASER-package.R
index 22cee146..0426bb26 100644
--- a/R/FRASER-package.R
+++ b/R/FRASER-package.R
@@ -31,6 +31,7 @@
 #'          rbind Assays
 #' @importFrom GenomicRanges findOverlaps granges GRanges GRangesList
 #'          makeGRangesFromDataFrame invertStrand start end start<- end<-
+#'          seqinfo<-
 #' @importFrom IRanges subsetByOverlaps from to IRanges ranges nearest distance
 #'          %over%
 #' @importFrom Rsamtools ScanBamParam scanBamHeader bamMapqFilter
@@ -50,9 +51,6 @@
 #' @importFrom RColorBrewer brewer.pal
 #' @importFrom cowplot theme_cowplot
 #' @importFrom ggrepel geom_text_repel
-#' @importFrom biovizBase parseArgsForAes parseArgsForNonAes 
-#' @importFrom ggbio autoplot
-#'
 #'
 ### Data handling
 #'
@@ -85,10 +83,12 @@
 #'
 #' @importFrom S4Vectors DataFrame metadata Rle SimpleList mcols mcols<-
 #'          start end metadata metadata<- subjectHits queryHits elementMetadata
+#'          values values<-
 #' @importFrom grDevices colorRampPalette
 #' @importFrom GenomeInfoDb keepStandardChromosomes seqlevels<- seqlevels
 #'          seqlengths seqlengths<- seqlevelsStyle<- seqlevelsStyle seqnames 
-#'          seqinfo standardChromosomes dropSeqlevels keepSeqlevels
+#'          seqinfo standardChromosomes dropSeqlevels keepSeqlevels 
+#'          sortSeqlevels
 #' @importFrom DelayedArray rowMaxs rowMeans path<- cbind plogis qlogis 
 #'          DelayedArray
 #' @importFrom DelayedMatrixStats colSds rowMedians rowSds colMeans2 rowMeans2
@@ -103,9 +103,10 @@
 #'          scale_y_log10 scale_color_gradientn labs theme_bw theme
 #'          scale_color_brewer scale_color_discrete scale_linetype_manual 
 #'          annotate geom_histogram scale_fill_manual xlim scale_colour_manual
-#'          element_blank annotation_logticks ylim
+#'          element_blank annotation_logticks ylim quo_name facet_grid 
+#'          facet_wrap geom_text
 #'
-#' @importFrom tibble as_tibble
+#' @importFrom tibble as_tibble %>%
 #'
 #' @useDynLib FRASER
 #'
@@ -135,5 +136,8 @@ globalVariables(c(".", "J", ".N", ".asDataFrame", "End", "first_feature",
         "genes_acceptor", "gene_pval", "gene_padj", "dt_idx", 
         "blacklist", "spliceType", "causesFrameshift", "annotatedJunction", 
         "distNearestGene", "UTR_overlap", "meanCount", "medianCount", 
-        "spliceType2"),
+        "spliceType2", "nonsplitProportion", "nonsplitCounts", 
+        "nonsplitProportion_99quantile", "startID", "endID", "j_idx", 
+        "start_idx", "end_idx", "pval_gene", "FDR_subset_gene", "gene_id", 
+        "pvalue"),
         package="FRASER")
diff --git a/R/Fraser-pipeline.R b/R/Fraser-pipeline.R
index c3b9b967..b7960dad 100644
--- a/R/Fraser-pipeline.R
+++ b/R/Fraser-pipeline.R
@@ -31,7 +31,7 @@
 #'         splicing types.
 #' @param implementation The method that should be used to correct for 
 #' confounders. 
-#' @param type The type of PSI (psi5, psi3 or theta for theta/splicing 
+#' @param type The type of PSI (jaccard, psi5, psi3 or theta for theta/splicing 
 #' efficiency)
 #' @param iterations The maximal number of iterations. When the autoencoder has 
 #' not yet converged after these number of iterations, the fit stops anyway.
@@ -61,16 +61,16 @@
 #' # The functions run inside the FRASER function can also be directly 
 #' # run themselves. 
 #' # To directly run the fit function:
-#' fds <- fit(fds, implementation="PCA", q=2, type="psi5")
+#' fds <- fit(fds, implementation="PCA", q=2, type="jaccard")
 #' 
 #' # To directly run the nomial and adjusted p value and z score 
 #' # calculation, the following functions can be used:
-#' fds <- calculatePvalues(fds, type="psi5")
-#' head(pVals(fds, type="psi5"))
-#' fds <- calculatePadjValues(fds, type="psi5", method="BY")
-#' head(padjVals(fds, type="psi5"))
-#' fds <- calculateZscore(fds, type="psi5")
-#' head(zScores(fds, type="psi5")) 
+#' fds <- calculatePvalues(fds, type="jaccard")
+#' head(pVals(fds, type="jaccard"))
+#' fds <- calculatePadjValues(fds, type="jaccard", method="BY")
+#' head(padjVals(fds, type="jaccard"))
+#' fds <- calculateZscore(fds, type="jaccard")
+#' head(zScores(fds, type="jaccard")) 
 #' 
 #' @seealso \code{\link[FRASER]{fit}}
 #' 
diff --git a/R/annotationOfRanges.R b/R/annotationOfRanges.R
index da036f5b..13939780 100644
--- a/R/annotationOfRanges.R
+++ b/R/annotationOfRanges.R
@@ -32,13 +32,13 @@
 #' # either using biomart with GRCh38
 #' try({
 #'   fds <- annotateRanges(fds, GRCh=38)
-#'   rowRanges(fds, type="psi5")[,c("hgnc_symbol")]
+#'   rowRanges(fds, type="j")[,c("hgnc_symbol")]
 #' })
 #' 
 #' # either using biomart with GRCh37
 #' try({
 #'   fds <- annotateRanges(fds, featureName="hgnc_symbol_37", GRCh=37)
-#'   rowRanges(fds, type="psi5")[,c("hgnc_symbol_37")]
+#'   rowRanges(fds, type="j")[,c("hgnc_symbol_37")]
 #' })
 #'  
 #' # or with a provided TxDb object
@@ -47,7 +47,7 @@
 #' require(org.Hs.eg.db)
 #' orgDb <- org.Hs.eg.db
 #' fds <- annotateRangesWithTxDb(fds, txdb=txdb, orgDb=orgDb)
-#' rowRanges(fds, type="psi5")[,"hgnc_symbol"]
+#' rowRanges(fds, type="j")[,"hgnc_symbol"]
 #' 
 #' @rdname annotateRanges
 #' @export
diff --git a/R/autoencoder.R b/R/autoencoder.R
index f640fc7e..d9d0e7a2 100644
--- a/R/autoencoder.R
+++ b/R/autoencoder.R
@@ -2,7 +2,8 @@
 #' Main autoencoder fit function
 #'
 #' @noRd
-fitAutoencoder <- function(fds, q, type="psi3", noiseAlpha=1, minDeltaPsi=0.1,
+fitAutoencoder <- function(fds, q, type=currentType(fds), noiseAlpha=1, 
+                    minDeltaPsi=0.1,
                     rhoRange=c(-30, 30), lambda=0, convergence=1e-5,
                     iterations=15, initialize=TRUE, control=list(),
                     BPPARAM=bpparam(), verbose=FALSE, nrDecoderBatches=5,
diff --git a/R/calculatePSIValue.R b/R/calculatePSIValue.R
index fafc4f33..c0e9d425 100644
--- a/R/calculatePSIValue.R
+++ b/R/calculatePSIValue.R
@@ -14,18 +14,18 @@
 #'
 #' @inheritParams countRNA
 #' @param types A vector with the psi types which should be calculated. Default 
-#' is all of psi5, psi3 and theta.
+#' is all of jaccard, psi5, psi3 and theta.
 #' @param overwriteCts FALSE or TRUE (the default) the total counts (aka N) will
 #'              be recalculated based on the existing junction counts (aka K)
 #' @return FraserDataSet
 #' @export
 #' @examples
 #'   fds <- createTestFraserDataSet()
-#'   fds <- calculatePSIValues(fds, types="psi5")
+#'   fds <- calculatePSIValues(fds, types="jaccard")
 #'   
 #'   ### usually one would run this function for all psi types by using:
 #'   # fds <- calculatePSIValues(fds)
-calculatePSIValues <- function(fds, types=psiTypes, overwriteCts=FALSE, 
+calculatePSIValues <- function(fds, types=psiTypes_avail, overwriteCts=FALSE, 
                                 BPPARAM=bpparam()){
     # check input
     stopifnot(is(fds, "FraserDataSet"))
@@ -187,7 +187,7 @@ calculateSitePSIValue <- function(fds, overwriteCts, BPPARAM){
     # check input
     stopifnot(is(fds, "FraserDataSet"))
     
-    message(date(), ": Calculate the PSI site values ...")
+    message(date(), ": Calculate the theta values ...")
     
     psiName <- "theta"
     psiROCName <- "rawOtherCounts_theta"
@@ -346,8 +346,6 @@ calculateJaccardIntronIndex <- function(fds, overwriteCts){
     jaccardValues <- K(fds, type="j") / jaccard_denom 
     otherCounts_jaccard <- jaccard_denom - K(fds, type="j")
     
-    # TODO also calculate it with nonsplit counts in the nominator 
-    
     # assign it to our object
     assay(fds, type="j", "jaccard", withDimnames=FALSE) <- jaccardValues
     
@@ -360,15 +358,15 @@ calculateJaccardIntronIndex <- function(fds, overwriteCts){
     return(fds)
 }
 
-#' Calculates the sum of nonsplit reads overlapping either the donor or acceptor 
-#' splice site and stores it as a new assay (one value for each junction and 
-#' sample).
+#' Calculates the sum of nonsplit reads overlapping either the donor or 
+#' acceptor splice site and stores it as a new assay (one value for each 
+#' junction and sample).
 #' 
 #' @noRd
 calculateIntronNonsplitSum <- function(fds, overwriteCts){
     stopifnot(is(fds, "FraserDataSet"))
     
-    message(date(), ": Calculate the total nonsplict counts for each intron ", 
+    message(date(), ": Calculate the total nonsplit counts for each intron ", 
                 "...")
     
     
diff --git a/R/example_functions.R b/R/example_functions.R
index 9d8bbf21..cec001f6 100644
--- a/R/example_functions.R
+++ b/R/example_functions.R
@@ -84,7 +84,7 @@ createTestFraserDataSet <- function(workingDir="FRASER_output", rerun=FALSE){
     suppressMessages({ fds <- annotateRangesWithTxDb(fds) })
     
     # run FRASER pipeline
-    fds <- FRASER(fds, q=c(psi5=2, psi3=2, theta=2), iterations=2)
+    fds <- FRASER(fds, q=c(jaccard=2, psi5=2, psi3=2, theta=2), iterations=2)
     
     # save data for later 
     fds <- saveFraserDataSet(fds)
diff --git a/R/filterExpression.R b/R/filterExpression.R
index a62ae794..23729ee9 100644
--- a/R/filterExpression.R
+++ b/R/filterExpression.R
@@ -27,8 +27,8 @@
 #' @examples
 #' fds <- createTestFraserDataSet()
 #' fds <- filterExpressionAndVariability(fds, minDeltaPsi=0.1, filter=FALSE)
-#' mcols(fds, type="psi5")[, c(
-#'         "maxCount", "passedExpression", "maxDPsi3", "passedVariability")]
+#' mcols(fds, type="jaccard")[, c(
+#'         "maxCount", "passedExpression", "maxDJaccard", "passedVariability")]
 #' 
 #' plotFilterExpression(fds)
 #' plotFilterVariability(fds)     
@@ -42,7 +42,7 @@ NULL
 #' read support and introns that are not variable across samples. 
 #' @export
 filterExpressionAndVariability <- function(object, minExpressionInOneSample=20, 
-                    quantile=0.95, quantileMinExpression=10, minDeltaPsi=0.05,
+                    quantile=0.75, quantileMinExpression=10, minDeltaPsi=0.0,
                     filter=TRUE, 
                     delayed=ifelse(ncol(object) <= 300, FALSE, TRUE),
                     BPPARAM=bpparam()){
@@ -64,49 +64,47 @@ filterExpressionAndVariability <- function(object, minExpressionInOneSample=20,
     
 }
 
-filterExpression.FRASER <- function(object, minExpressionInOneSample=20,
-                    quantile=0.95, quantileMinExpression=10, filter=TRUE, 
+#' This function filters out introns and corresponding 
+#' splice sites which are expressed at very low levels across samples.
+#' @noRd
+filterExpression_jaccard <- function(object, minExpressionInOneSample=20,
+                    quantile=0.75, quantileMinExpression=10, filter=TRUE, 
                     delayed=ifelse(ncol(object) <= 300, FALSE, TRUE),
                     BPPARAM=bpparam()){
-
+    
     stopifnot(is(object, "FraserDataSet"))
     
     message(date(), ": Filtering out introns with low read support ...")
     
     # extract counts
     cts  <- K(object, type="j")
-    ctsN5 <- N(object, type="psi5")
-    ctsN3 <- N(object, type="psi3")
+    ctsN <- N(object, type="jaccard")
     
     if(isFALSE(delayed)){
         cts <- as.matrix(cts)
-        ctsN5 <- as.matrix(ctsN5)
-        ctsN3 <- as.matrix(ctsN3)
+        ctsN <- as.matrix(ctsN)
     }
-
+    
     # cutoff functions
     f1 <- function(cts, ...){
-            rowMaxs(cts) }
-    f2 <- function(cts, ctsN5, quantile, ...){
-            rowQuantiles(ctsN5, probs=quantile, drop=FALSE)[,1] }
-    f3 <- function(cts, ctsN3, quantile, ...) {
-            rowQuantiles(ctsN3, probs=quantile, drop=FALSE)[,1] }
-
-    funs <- c(maxCount=f1, quantileValue5=f2, quantileValue3=f3)
-
+        rowMaxs(cts) }
+    f2 <- function(cts, ctsN, quantile, ...){
+        rowQuantiles(ctsN, probs=quantile, drop=FALSE)[,1] }
+    
+    funs <- c(maxCount=f1, quantileValueN=f2)
+    
     # run it in parallel
     cutoffs <- bplapply(funs, function(f, ...) f(...), BPPARAM=BPPARAM,
-            cts=cts, ctsN3=ctsN3, ctsN5=ctsN5, quantile=quantile)
-
+                        cts=cts, ctsN=ctsN, quantile=quantile)
+    
     # add annotation to object
     for(n in names(cutoffs)){
         mcols(object, type="j")[n] <- cutoffs[[n]]
     }
     
     mcols(object, type="j")[['passedExpression']] <-
-            cutoffs$maxCount >= minExpressionInOneSample &
-            (cutoffs$quantileValue5 >= quantileMinExpression &
-                cutoffs$quantileValue3 >= quantileMinExpression) 
+        cutoffs$maxCount >= minExpressionInOneSample &
+        cutoffs$quantileValueN >= quantileMinExpression  
     if("passedVariability" %in% colnames(mcols(object, type="j"))){
         mcols(object, type="j")[['passed']] <-  
             mcols(object, type="j")[['passedExpression']] & 
@@ -118,10 +116,11 @@ filterExpression.FRASER <- function(object, minExpressionInOneSample=20,
     
     # filter if requested
     if(isTRUE(filter)){
-        object <- applyExpressionFilters(object, minExpressionInOneSample, 
-                quantileMinExpression)
+        object <- applyExpressionFilters_jaccard(object, 
+                                                minExpressionInOneSample, 
+                                                quantileMinExpression)
     }
-
+    
     validObject(object)
     return(object)
 }
@@ -130,78 +129,51 @@ filterExpression.FRASER <- function(object, minExpressionInOneSample=20,
 #' splice sites that have low read support in all samples.
 #' @export
 setMethod("filterExpression", signature="FraserDataSet",
-        filterExpression.FRASER)
+        filterExpression_jaccard)
 
-#' @describeIn filtering This function filters out introns and corresponding 
+
+#' This function filters out introns and corresponding 
 #' splice sites which do not show variablity across samples.
-#' @export
-filterVariability <- function(object, minDeltaPsi=0.05, filter=TRUE, 
-                    delayed=ifelse(ncol(object) <= 300, FALSE, TRUE),
-                    BPPARAM=bpparam()){
+#' @noRd
+filterVariability_jaccard <- function(object, minDeltaPsi=0, filter=TRUE, 
+                delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), 
+                BPPARAM=bpparam()){
     
     message(date(), ": Filtering out non-variable introns ...")
     
     # extract counts
     cts    <- K(object, type="j")
-    ctsSE  <- K(object, type="ss")
-    ctsN5  <- N(object, type="psi5")
-    ctsN3  <- N(object, type="psi3")
-    ctsNSE <- N(object, type="theta")
+    ctsN  <- N(object, type="jaccard")
     
     if(isFALSE(delayed)){
         cts <- as.matrix(cts)
-        ctsN5 <- as.matrix(ctsN5)
-        ctsN3 <- as.matrix(ctsN3)
-        ctsSE <- as.matrix(ctsSE)
-        ctsNSE <- as.matrix(ctsNSE)
+        ctsN <- as.matrix(ctsN)
     }
     
     # cutoff functions
-    f1 <- function(cts, ctsN3, ...) {
-        psi <- cts/ctsN3
-        rowMaxs(abs(psi - rowMeans2(psi, na.rm=TRUE)), na.rm=TRUE) }
-    f2 <- function(cts, ctsN5, ...) {
-        psi <- cts/ctsN5
-        rowMaxs(abs(psi - rowMeans2(psi, na.rm=TRUE)), na.rm=TRUE) }
-    f3 <- function(ctsSE, ctsNSE, ...) {
-        theta <- ctsSE/ctsNSE
-        dTheta <- rowMaxs(abs(theta - rowMeans2(theta, na.rm=TRUE)), 
-                            na.rm=TRUE) }
-        
+    f1 <- function(cts, ctsN, ...) {
+        jaccard <- cts/ctsN
+        rowMaxs(abs(jaccard - rowMeans2(jaccard, na.rm=TRUE)), 
+                na.rm=TRUE) }
     
-    funs <- c(maxDPsi3=f1, maxDPsi5=f2, maxDTheta=f3)
+    funs <- c(maxDJaccard=f1)
     
     # run it in parallel
     cutoffs <- bplapply(funs, function(f, ...) f(...), BPPARAM=BPPARAM,
-                        cts=cts, ctsN3=ctsN3, ctsN5=ctsN5, 
-                        ctsSE=ctsSE, ctsNSE=ctsNSE)
+                        cts=cts, ctsN=ctsN)
     
     # add annotation to object
     for(n in names(cutoffs)){
-        if(n == "maxDTheta"){
-            mcols(object, type="ss")[n] <- cutoffs[[n]]
-        } else{ 
-            mcols(object, type="j")[n] <- cutoffs[[n]]
-        }
+        mcols(object, type="j")[n] <- cutoffs[[n]]
     }
     
     # add annotation of theta on splice sites of introns to mcols
     intron_dt <- as.data.table(rowRanges(object, type="j"))
-    ss_dt <- as.data.table(rowRanges(object, type="ss"))
-    mcols(object, type="j")["maxDThetaDonor"] <- 
-        merge(intron_dt, ss_dt, by.x="startID", by.y="spliceSiteID", 
-                all.x=TRUE, sort=FALSE)[,maxDTheta]
-    mcols(object, type="j")["maxDThetaAcceptor"] <- 
-        merge(intron_dt, ss_dt, by.x="endID", by.y="spliceSiteID", 
-                all.x=TRUE, sort=FALSE)[,maxDTheta]
-
+    
     # check which introns pass the filter
     mcols(object, type="j")[['passedVariability']] <- pmax(na.rm=TRUE,
-            cutoffs$maxDPsi3, 
-            cutoffs$maxDPsi5, 
-            mcols(object, type="j")$maxDThetaDonor, 
-            mcols(object, type="j")$maxDThetaAcceptor,
-            0) >= minDeltaPsi
+                                                        cutoffs$maxDJaccard, 
+                                                        0) >= minDeltaPsi
     if("passedExpression" %in% colnames(mcols(object, type="j"))){
         mcols(object, type="j")[['passed']] <-  
             mcols(object, type="j")[['passedExpression']] & 
@@ -213,27 +185,31 @@ filterVariability <- function(object, minDeltaPsi=0.05, filter=TRUE,
     
     # filter if requested
     if(isTRUE(filter)){
-        object <- applyVariabilityFilters(object, minDeltaPsi)
+        object <- applyVariabilityFilters_jaccard(object, minDeltaPsi)
     }
     
     validObject(object)
     return(object)
 }
 
+#' @describeIn filtering This function filters out introns and corresponding 
+#' splice sites that have low read support in all samples.
+#' @export
+setMethod("filterVariability", signature="FraserDataSet",
+            filterVariability_jaccard)
+
 #' Applies previously calculated filters for expression filters
 #' @noRd
-applyExpressionFilters <- function(fds, minExpressionInOneSample, 
-                                    quantileMinExpression){
+applyExpressionFilters_jaccard <- function(fds, minExpressionInOneSample, 
+                                            quantileMinExpression){
     
     maxCount       <- mcols(fds, type="j")[['maxCount']]
-    quantileValue5 <- mcols(fds, type="j")[['quantileValue5']]
-    quantileValue3 <- mcols(fds, type="j")[['quantileValue3']]
+    quantileValueN <- mcols(fds, type="j")[['quantileValueN']]
     
     # report rare junctions that passed minExpression filter but not 
     # quantileFilter as SE obj
     junctionsToReport <- maxCount >= minExpressionInOneSample & 
-                        !(quantileValue5 >= quantileMinExpression &
-                                quantileValue3 >= quantileMinExpression) 
+        !(quantileValueN >= quantileMinExpression) 
     outputDir <- file.path(workingDir(fds), "savedObjects", nameNoSpace(fds))
     
     if(any(junctionsToReport)){
@@ -242,13 +218,14 @@ applyExpressionFilters <- function(fds, minExpressionInOneSample,
         for(aname in assayNames(rareJunctions)){
             if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5", 
                                 "rawOtherCounts_psi3", "psi5", "psi3", 
-                                "delta_psi5", "delta_psi3"))){
+                                "delta_psi5", "delta_psi3", "jaccard",
+                                "rawOtherCounts_intron_jaccard"))){
                 assay(rareJunctions, aname) <- NULL
             }
         }
         rareJunctions <- saveHDF5SummarizedExperiment(rareJunctions, 
-                                            dir=file.path(tempdir(), "tmp_rJ"), 
-                                            replace=TRUE)
+                                        dir=file.path(tempdir(), "tmp_rJ"), 
+                                        replace=TRUE)
         
         # check if folder already exists from previous filtering
         rareJctsDir <- file.path(outputDir, "rareJunctions")
@@ -258,8 +235,7 @@ applyExpressionFilters <- function(fds, minExpressionInOneSample,
                     "cannot be restored.")
             rJ_stored <- loadHDF5SummarizedExperiment(dir=rareJctsDir)
             toReport <- mcols(rJ_stored)$maxCount >= minExpressionInOneSample & 
-                !(mcols(rJ_stored)$quantileValue5 >= quantileMinExpression &
-                    mcols(rJ_stored)$quantileValue3 >= quantileMinExpression) 
+                !(mcols(rJ_stored)$quantileValueN >= quantileMinExpression) 
             
             rJ_tmp <- rbind(rJ_stored[toReport,], rareJunctions)
             
@@ -273,7 +249,7 @@ applyExpressionFilters <- function(fds, minExpressionInOneSample,
         } 
         
         rareJunctions <- saveHDF5SummarizedExperiment(rareJunctions, 
-                                                dir=rareJctsDir, replace=TRUE)
+                                            dir=rareJctsDir, replace=TRUE)
     }
     
     # apply filter
@@ -284,24 +260,19 @@ applyExpressionFilters <- function(fds, minExpressionInOneSample,
     fds <- fds[mcols(fds, type="j")[['passedExpression']], by="psi5"]
     
     return(fds)
-    
 }
 
+
 #' Applies previously calculated variablilty filters
 #' @noRd
-applyVariabilityFilters <- function(fds, minDeltaPsi){
+applyVariabilityFilters_jaccard <- function(fds, minDeltaPsi){
     
     #
     passedVariability <-  mcols(fds, type="j")[['passedVariability']]
-    # maxDPsi3 <- mcols(fds, type="j")[['maxDPsi3']]
-    # maxDPsi5 <- mcols(fds, type="j")[['maxDPsi5']]
-    # maxDThetaDonor    <- mcols(fds, type="j")[['maxDThetaDonor']]
-    # maxDThetaAcceptor <- mcols(fds, type="j")[['maxDThetaAcceptor']]
     
     # store information of non-variable junctions
     filtered <- !passedVariability
-    # filtered <- (pmax(maxDPsi3, maxDPsi5, maxDThetaDonor, maxDThetaAcceptor) 
-    #                 < minDeltaPsi)
+    
     outputDir <- file.path(workingDir(fds), "savedObjects", nameNoSpace(fds))
     if(any(filtered)){
         # get SE object of junctions to report
@@ -309,7 +280,8 @@ applyVariabilityFilters <- function(fds, minDeltaPsi){
         for(aname in assayNames(nonVariableJunctions)){
             if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5", 
                                 "rawOtherCounts_psi3", "psi5", "psi3", 
-                                "delta_psi5", "delta_psi3"))){
+                                "delta_psi5", "delta_psi3", "jaccard",
+                                "rawOtherCounts_intron_jaccard"))){
                 assay(nonVariableJunctions, aname) <- NULL
             }
         }
@@ -324,16 +296,13 @@ applyVariabilityFilters <- function(fds, minDeltaPsi){
                     "that were already filtered out but should be kept now ",
                     "cannot be restored.")
             nV_stored <- loadHDF5SummarizedExperiment(dir=nonVarJctsDir) 
-            toReport <- mcols(nV_stored)$maxDPsi5 < minDeltaPsi &
-                        mcols(nV_stored)$maxDPsi3 < minDeltaPsi &
-                        mcols(nV_stored)$maxDThetaDonor < minDeltaPsi &
-                        mcols(nV_stored)$maxDThetaAcceptor < minDeltaPsi
+            toReport <- mcols(nV_stored)$maxDJaccard < minDeltaPsi
             
             nVJunctions <- rbind(nonVariableJunctions, nV_stored[toReport,])
             for(aname in assayNames(nVJunctions)){
                 assay(nVJunctions, aname) <- 
-                rbind(as.matrix(assay(nonVariableJunctions, aname)), 
-                        as.matrix(assay(nV_stored[toReport,], aname)) )
+                    rbind(as.matrix(assay(nonVariableJunctions, aname)), 
+                            as.matrix(assay(nV_stored[toReport,], aname)) )
             }
             nonVariableJunctions <- nVJunctions
             rm(nVJunctions)
@@ -351,44 +320,15 @@ applyVariabilityFilters <- function(fds, minDeltaPsi){
                     "% of the junctions"))
     fds <- fds[mcols(fds, type="j")[['passedVariability']], by="psi5"]
     return(fds)
-        
 }
 
-#' @describeIn filtering This functions filters out both introns with low 
-#' read support and introns that are not variable across samples. 
-#' @export
-filterExpressionAndVariability_jaccard <- function(object, 
-                        minExpressionInOneSample=20, 
-                        quantile=0.95, quantileMinExpression=1, minDelta=0.05,
-                        filter=TRUE, 
+
+#' Old FRASER1 filtering functions
+#' @noRd
+filterExpression.FRASER <- function(object, minExpressionInOneSample=20,
+                        quantile=0.95, quantileMinExpression=10, filter=TRUE, 
                         delayed=ifelse(ncol(object) <= 300, FALSE, TRUE),
                         BPPARAM=bpparam()){
-    # filter introns with low read support and corresponding splice sites
-    object <- filterExpression_jaccard(object, 
-                        minExpressionInOneSample=minExpressionInOneSample, 
-                        quantile=quantile, 
-                        quantileMinExpression=quantileMinExpression, 
-                        filter=filter, delayed=delayed,
-                        BPPARAM=BPPARAM)
-    
-    # filter introns that are not variable across samples
-    object <- filterVariability_jaccard(object, minDelta=minDelta, 
-                        filter=filter, 
-                        delayed=delayed, BPPARAM=BPPARAM)
-    
-    # return fds
-    message(date(), ": Filtering done!")
-    return(object)
-    
-}
-
-#' @describeIn filtering This function filters out introns and corresponding 
-#' splice sites which are expressed at very low levels across samples.
-#' @export
-filterExpression_jaccard <- function(object, minExpressionInOneSample=20,
-                            quantile=0.95, quantileMinExpression=1, filter=TRUE, 
-                            delayed=ifelse(ncol(object) <= 300, FALSE, TRUE),
-                            BPPARAM=bpparam()){
     
     stopifnot(is(object, "FraserDataSet"))
     
@@ -396,24 +336,28 @@ filterExpression_jaccard <- function(object, minExpressionInOneSample=20,
     
     # extract counts
     cts  <- K(object, type="j")
-    ctsN <- N(object, type="jaccard")
+    ctsN5 <- N(object, type="psi5")
+    ctsN3 <- N(object, type="psi3")
     
     if(isFALSE(delayed)){
         cts <- as.matrix(cts)
-        ctsN <- as.matrix(ctsN)
+        ctsN5 <- as.matrix(ctsN5)
+        ctsN3 <- as.matrix(ctsN3)
     }
     
     # cutoff functions
     f1 <- function(cts, ...){
         rowMaxs(cts) }
-    f2 <- function(cts, ctsN, quantile, ...){
-        rowQuantiles(ctsN, probs=quantile, drop=FALSE)[,1] }
+    f2 <- function(cts, ctsN5, quantile, ...){
+        rowQuantiles(ctsN5, probs=quantile, drop=FALSE)[,1] }
+    f3 <- function(cts, ctsN3, quantile, ...) {
+        rowQuantiles(ctsN3, probs=quantile, drop=FALSE)[,1] }
     
-    funs <- c(maxCount=f1, quantileValueN=f2)
+    funs <- c(maxCount=f1, quantileValue5=f2, quantileValue3=f3)
     
     # run it in parallel
     cutoffs <- bplapply(funs, function(f, ...) f(...), BPPARAM=BPPARAM,
-                        cts=cts, ctsN=ctsN, quantile=quantile)
+                        cts=cts, ctsN3=ctsN3, ctsN5=ctsN5, quantile=quantile)
     
     # add annotation to object
     for(n in names(cutoffs)){
@@ -422,7 +366,8 @@ filterExpression_jaccard <- function(object, minExpressionInOneSample=20,
     
     mcols(object, type="j")[['passedExpression']] <-
         cutoffs$maxCount >= minExpressionInOneSample &
-        cutoffs$quantileValueN >= quantileMinExpression  
+        (cutoffs$quantileValue5 >= quantileMinExpression &
+            cutoffs$quantileValue3 >= quantileMinExpression) 
     if("passedVariability" %in% colnames(mcols(object, type="j"))){
         mcols(object, type="j")[['passed']] <-  
             mcols(object, type="j")[['passedExpression']] & 
@@ -434,57 +379,83 @@ filterExpression_jaccard <- function(object, minExpressionInOneSample=20,
     
     # filter if requested
     if(isTRUE(filter)){
-        object <- applyExpressionFilters_jaccard(object, 
-                                                minExpressionInOneSample, 
-                                                quantileMinExpression)
+        object <- applyExpressionFilters(object, minExpressionInOneSample, 
+                                            quantileMinExpression)
     }
     
     validObject(object)
     return(object)
 }
 
-#' @describeIn filtering This function filters out introns and corresponding 
-#' splice sites which do not show variablity across samples.
-#' @export
-filterVariability_jaccard <- function(object, minDelta=0, filter=TRUE, 
-                              delayed=ifelse(ncol(object) <= 300, FALSE, TRUE),
-                              BPPARAM=bpparam()){
+#' Old FRASER1 filtering functions
+#' @noRd
+filterVariability.FRASER <- function(object, minDeltaPsi=0.05, filter=TRUE, 
+                        delayed=ifelse(ncol(object) <= 300, FALSE, TRUE),
+                        BPPARAM=bpparam()){
     
     message(date(), ": Filtering out non-variable introns ...")
     
     # extract counts
     cts    <- K(object, type="j")
-    ctsN  <- N(object, type="jaccard")
+    ctsSE  <- K(object, type="ss")
+    ctsN5  <- N(object, type="psi5")
+    ctsN3  <- N(object, type="psi3")
+    ctsNSE <- N(object, type="theta")
     
     if(isFALSE(delayed)){
         cts <- as.matrix(cts)
-        ctsN <- as.matrix(ctsN)
+        ctsN5 <- as.matrix(ctsN5)
+        ctsN3 <- as.matrix(ctsN3)
+        ctsSE <- as.matrix(ctsSE)
+        ctsNSE <- as.matrix(ctsNSE)
     }
     
     # cutoff functions
-    f1 <- function(cts, ctsN, ...) {
-        jaccard <- cts/ctsN
-        rowMaxs(abs(jaccard - rowMeans2(jaccard, na.rm=TRUE)), 
-                na.rm=TRUE) }
+    f1 <- function(cts, ctsN3, ...) {
+        psi <- cts/ctsN3
+        rowMaxs(abs(psi - rowMeans2(psi, na.rm=TRUE)), na.rm=TRUE) }
+    f2 <- function(cts, ctsN5, ...) {
+        psi <- cts/ctsN5
+        rowMaxs(abs(psi - rowMeans2(psi, na.rm=TRUE)), na.rm=TRUE) }
+    f3 <- function(ctsSE, ctsNSE, ...) {
+        theta <- ctsSE/ctsNSE
+        dTheta <- rowMaxs(abs(theta - rowMeans2(theta, na.rm=TRUE)), 
+                            na.rm=TRUE) }
     
-    funs <- c(maxDJaccard=f1)
+    
+    funs <- c(maxDPsi3=f1, maxDPsi5=f2, maxDTheta=f3)
     
     # run it in parallel
     cutoffs <- bplapply(funs, function(f, ...) f(...), BPPARAM=BPPARAM,
-                        cts=cts, ctsN=ctsN)
+                        cts=cts, ctsN3=ctsN3, ctsN5=ctsN5, 
+                        ctsSE=ctsSE, ctsNSE=ctsNSE)
     
     # add annotation to object
     for(n in names(cutoffs)){
+        if(n == "maxDTheta"){
+            mcols(object, type="ss")[n] <- cutoffs[[n]]
+        } else{ 
             mcols(object, type="j")[n] <- cutoffs[[n]]
+        }
     }
     
     # add annotation of theta on splice sites of introns to mcols
     intron_dt <- as.data.table(rowRanges(object, type="j"))
+    ss_dt <- as.data.table(rowRanges(object, type="ss"))
+    mcols(object, type="j")["maxDThetaDonor"] <- 
+        merge(intron_dt, ss_dt, by.x="startID", by.y="spliceSiteID", 
+                all.x=TRUE, sort=FALSE)[,maxDTheta]
+    mcols(object, type="j")["maxDThetaAcceptor"] <- 
+        merge(intron_dt, ss_dt, by.x="endID", by.y="spliceSiteID", 
+                all.x=TRUE, sort=FALSE)[,maxDTheta]
     
     # check which introns pass the filter
     mcols(object, type="j")[['passedVariability']] <- pmax(na.rm=TRUE,
-                                                           cutoffs$maxDJaccard, 
-                                                           0) >= minDelta
+                                    cutoffs$maxDPsi3, 
+                                    cutoffs$maxDPsi5, 
+                                    mcols(object, type="j")$maxDThetaDonor, 
+                                    mcols(object, type="j")$maxDThetaAcceptor,
+                                0) >= minDeltaPsi
     if("passedExpression" %in% colnames(mcols(object, type="j"))){
         mcols(object, type="j")[['passed']] <-  
             mcols(object, type="j")[['passedExpression']] & 
@@ -496,7 +467,7 @@ filterVariability_jaccard <- function(object, minDelta=0, filter=TRUE,
     
     # filter if requested
     if(isTRUE(filter)){
-        object <- applyVariabilityFilters(object, minDelta)
+        object <- applyVariabilityFilters(object, minDeltaPsi)
     }
     
     validObject(object)
@@ -505,16 +476,18 @@ filterVariability_jaccard <- function(object, minDelta=0, filter=TRUE,
 
 #' Applies previously calculated filters for expression filters
 #' @noRd
-applyExpressionFilters_jaccard <- function(fds, minExpressionInOneSample, 
-                                   quantileMinExpression){
+applyExpressionFilters <- function(fds, minExpressionInOneSample, 
+                                    quantileMinExpression){
     
     maxCount       <- mcols(fds, type="j")[['maxCount']]
-    quantileValueN <- mcols(fds, type="j")[['quantileValueN']]
+    quantileValue5 <- mcols(fds, type="j")[['quantileValue5']]
+    quantileValue3 <- mcols(fds, type="j")[['quantileValue3']]
     
     # report rare junctions that passed minExpression filter but not 
     # quantileFilter as SE obj
     junctionsToReport <- maxCount >= minExpressionInOneSample & 
-        !(quantileValueN >= quantileMinExpression) 
+        !(quantileValue5 >= quantileMinExpression &
+            quantileValue3 >= quantileMinExpression) 
     outputDir <- file.path(workingDir(fds), "savedObjects", nameNoSpace(fds))
     
     if(any(junctionsToReport)){
@@ -522,15 +495,14 @@ applyExpressionFilters_jaccard <- function(fds, minExpressionInOneSample,
         rareJunctions <- asSE(fds[junctionsToReport, by="j"])
         for(aname in assayNames(rareJunctions)){
             if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5", 
-                              "rawOtherCounts_psi3", "psi5", "psi3", 
-                              "delta_psi5", "delta_psi3", "jaccard",
-                              "rawOtherCounts_intron_jaccard"))){
+                                "rawOtherCounts_psi3", "psi5", "psi3", 
+                                "delta_psi5", "delta_psi3"))){
                 assay(rareJunctions, aname) <- NULL
             }
         }
         rareJunctions <- saveHDF5SummarizedExperiment(rareJunctions, 
-                            dir=file.path(tempdir(), "tmp_rJ"), 
-                            replace=TRUE)
+                                        dir=file.path(tempdir(), "tmp_rJ"), 
+                                        replace=TRUE)
         
         # check if folder already exists from previous filtering
         rareJctsDir <- file.path(outputDir, "rareJunctions")
@@ -540,60 +512,63 @@ applyExpressionFilters_jaccard <- function(fds, minExpressionInOneSample,
                     "cannot be restored.")
             rJ_stored <- loadHDF5SummarizedExperiment(dir=rareJctsDir)
             toReport <- mcols(rJ_stored)$maxCount >= minExpressionInOneSample & 
-                !(mcols(rJ_stored)$quantileValueN >= quantileMinExpression) 
+                !(mcols(rJ_stored)$quantileValue5 >= quantileMinExpression &
+                    mcols(rJ_stored)$quantileValue3 >= quantileMinExpression) 
             
             rJ_tmp <- rbind(rJ_stored[toReport,], rareJunctions)
             
             for(aname in assayNames(rJ_tmp)){
                 assay(rJ_tmp, aname) <- 
                     rbind(as.matrix(assay(rareJunctions, aname)), 
-                          as.matrix(assay(rJ_stored[toReport,], aname)) )
+                            as.matrix(assay(rJ_stored[toReport,], aname)) )
             }
             rareJunctions <- rJ_tmp
             rm(rJ_tmp)
         } 
         
         rareJunctions <- saveHDF5SummarizedExperiment(rareJunctions, 
-                            dir=rareJctsDir, replace=TRUE)
+                                                dir=rareJctsDir, replace=TRUE)
     }
     
     # apply filter
     numFilt <- sum(mcols(fds, type="j")[['passedExpression']])
     message(paste0("Keeping ", numFilt, " junctions out of ", length(fds),
-                   ". This is ", signif(numFilt/length(fds)*100, 3),
-                   "% of the junctions"))
+                    ". This is ", signif(numFilt/length(fds)*100, 3),
+                    "% of the junctions"))
     fds <- fds[mcols(fds, type="j")[['passedExpression']], by="psi5"]
     
     return(fds)
-    
 }
 
-
 #' Applies previously calculated variablilty filters
 #' @noRd
-applyVariabilityFilters <- function(fds, minDelta){
+applyVariabilityFilters <- function(fds, minDeltaPsi){
     
     #
     passedVariability <-  mcols(fds, type="j")[['passedVariability']]
+    # maxDPsi3 <- mcols(fds, type="j")[['maxDPsi3']]
+    # maxDPsi5 <- mcols(fds, type="j")[['maxDPsi5']]
+    # maxDThetaDonor    <- mcols(fds, type="j")[['maxDThetaDonor']]
+    # maxDThetaAcceptor <- mcols(fds, type="j")[['maxDThetaAcceptor']]
     
     # store information of non-variable junctions
     filtered <- !passedVariability
-
+    # filtered <- (pmax(maxDPsi3, maxDPsi5, maxDThetaDonor, maxDThetaAcceptor) 
+    #                 < minDeltaPsi)
     outputDir <- file.path(workingDir(fds), "savedObjects", nameNoSpace(fds))
     if(any(filtered)){
         # get SE object of junctions to report
         nonVariableJunctions <- asSE(fds[filtered, by="j"])
         for(aname in assayNames(nonVariableJunctions)){
             if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5", 
-                              "rawOtherCounts_psi3", "psi5", "psi3", 
-                              "delta_psi5", "delta_psi3", "jaccard",
-                              "rawOtherCounts_intron_jaccard"))){
+                            "rawOtherCounts_psi3", "psi5", "psi3", 
+                            "delta_psi5", "delta_psi3"))){
                 assay(nonVariableJunctions, aname) <- NULL
             }
         }
         nonVariableJunctions <- saveHDF5SummarizedExperiment(replace=TRUE,
-                                    nonVariableJunctions,
-                                    dir=file.path(tempdir(), "tmp_nvJ"))
+                                        nonVariableJunctions,
+                                        dir=file.path(tempdir(), "tmp_nvJ"))
         
         # check if folder already exists from previous filtering
         nonVarJctsDir <- file.path(outputDir, "nonVariableJunctions")
@@ -602,28 +577,31 @@ applyVariabilityFilters <- function(fds, minDelta){
                     "that were already filtered out but should be kept now ",
                     "cannot be restored.")
             nV_stored <- loadHDF5SummarizedExperiment(dir=nonVarJctsDir) 
-            toReport <- mcols(nV_stored)$maxDJaccard < minDelta
+            toReport <- mcols(nV_stored)$maxDPsi5 < minDeltaPsi &
+                mcols(nV_stored)$maxDPsi3 < minDeltaPsi &
+                mcols(nV_stored)$maxDThetaDonor < minDeltaPsi &
+                mcols(nV_stored)$maxDThetaAcceptor < minDeltaPsi
             
             nVJunctions <- rbind(nonVariableJunctions, nV_stored[toReport,])
             for(aname in assayNames(nVJunctions)){
                 assay(nVJunctions, aname) <- 
                     rbind(as.matrix(assay(nonVariableJunctions, aname)), 
-                          as.matrix(assay(nV_stored[toReport,], aname)) )
+                            as.matrix(assay(nV_stored[toReport,], aname)) )
             }
             nonVariableJunctions <- nVJunctions
             rm(nVJunctions)
         } 
         
         nonVariableJunctions <- saveHDF5SummarizedExperiment(dir=nonVarJctsDir,
-                                    x=nonVariableJunctions, replace=TRUE)
+                                        x=nonVariableJunctions, replace=TRUE)
         
     }
     
     # apply filtering
     numFilt <- sum(passedVariability)
     message(paste0("Keeping ", numFilt, " junctions out of ", length(fds),
-                   ". This is ", signif(numFilt/length(fds)*100, 3),
-                   "% of the junctions"))
+                    ". This is ", signif(numFilt/length(fds)*100, 3),
+                    "% of the junctions"))
     fds <- fds[mcols(fds, type="j")[['passedVariability']], by="psi5"]
     return(fds)
     
diff --git a/R/find_encoding_dimensions.R b/R/find_encoding_dimensions.R
index af5be75a..f3747eab 100644
--- a/R/find_encoding_dimensions.R
+++ b/R/find_encoding_dimensions.R
@@ -113,17 +113,18 @@ findEncodingDim <- function(i, fds, type, params, implementation,
 #' @examples
 #'   # generate data
 #'   fds <- makeSimulatedFraserDataSet(m=15, j=20)
+#'   fds <- calculatePSIValues(fds)
 #'   
 #'   # run hyperparameter optimization
-#'   fds <- optimHyperParams(fds, type="psi5", q_param=c(2, 5))
+#'   fds <- optimHyperParams(fds, type="jaccard", q_param=c(2, 5))
 #'   
 #'   # get estimated optimal dimension of the latent space
-#'   bestQ(fds, type="psi5")
-#'   hyperParams(fds, type="psi5")
+#'   bestQ(fds, type="jaccard")
+#'   hyperParams(fds, type="jaccard")
 #'   
 #' @export
-optimHyperParams <- function(fds, type, implementation="PCA",
-                    q_param=seq(2, min(40, ncol(fds)), by=3),
+optimHyperParams <- function(fds, type=currentType(fds), implementation="PCA",
+                    q_param=getEncDimRange(fds),
                     noise_param=0, minDeltaPsi=0.1,
                     iterations=5, setSubset=50000, injectFreq=1e-2,
                     BPPARAM=bpparam(), internalThreads=1, plot=TRUE, 
@@ -190,7 +191,7 @@ optimHyperParams <- function(fds, type, implementation="PCA",
 
         # remove unneeded blocks to save memory
         a2rm <- paste(sep="_", c("originalCounts", "originalOtherCounts"),
-                rep(psiTypes, 2))
+                rep(psiTypes_avail, 2))
         for(a in a2rm){
             assay(fds_copy, a) <- NULL
         }
@@ -229,3 +230,19 @@ optimHyperParams <- function(fds, type, implementation="PCA",
     return(fds)
 }
 
+#' Get default range of latent space dimensions to test during hyper param opt
+#' @noRd
+getEncDimRange <- function(fds, mp=3){
+    # Get range for latent space dimension
+    a <- 2 
+    b <- min(ncol(fds), nrow(fds)) / mp   # N/mp
+    
+    maxSteps <- 12
+    if(mp < 6){
+        maxSteps <- 15
+    }
+    
+    Nsteps <- min(maxSteps, b)
+    pars_q <- round(exp(seq(log(a),log(b),length.out = Nsteps))) %>% unique
+    return(pars_q)
+}
diff --git a/R/fitCorrectionMethods.R b/R/fitCorrectionMethods.R
index 38f5d634..5b87d1ce 100644
--- a/R/fitCorrectionMethods.R
+++ b/R/fitCorrectionMethods.R
@@ -40,7 +40,7 @@ fit.FraserDataSet <- function(object, implementation=c("PCA", "PCA-BB-Decoder",
                             "AE", "AE-weighted", "PCA-BB-full", "fullAE", 
                             "PCA-regression", "PCA-reg-full", 
                             "PCA-BB-Decoder-no-weights", "BB"),
-                    q, type="psi3", rhoRange=c(-30, 30), 
+                    q, type=currentType(object), rhoRange=c(-30, 30), 
                     weighted=FALSE, noiseAlpha=1, convergence=1e-5, 
                     iterations=15, initialize=TRUE, control=list(), 
                     BPPARAM=bpparam(), nSubset=15000, 
diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R
index 608b0171..cd35cbf5 100644
--- a/R/getNSetterFuns.R
+++ b/R/getNSetterFuns.R
@@ -32,7 +32,7 @@
 #' dontWriteHDF5 <- TRUE
 #' 
 #' # get/set the splice metric for which results should be retrieved 
-#' currentType(fds) <- "psi5"
+#' currentType(fds) <- "jaccard"
 #' currentType(fds)
 #' 
 #' # get fitted parameters
@@ -50,9 +50,9 @@
 #' pseudocount()
 #' 
 #' # retrieve or set a mask to exclude certain junctions in the fitting step
-#' featureExclusionMask(fds, type="theta") <- sample(
-#'         c(FALSE, TRUE), nrow(mcols(fds, type="theta")), replace=TRUE)
-#' featureExclusionMask(fds, type="theta")
+#' featureExclusionMask(fds, type="jaccard") <- sample(
+#'         c(FALSE, TRUE), nrow(mcols(fds, type="jaccard")), replace=TRUE)
+#' featureExclusionMask(fds, type="jaccard")
 #' 
 #' # controlling the verbosity level of the output of some algorithms
 #' verbose(fds) <- 2
@@ -201,7 +201,7 @@ predictY <- function(fds, type=currentType(fds), noiseAlpha=NULL){
 }
 
 
-`setAssayMatrix<-` <- function(fds, name, type, ..., value){
+`setAssayMatrix<-` <- function(fds, name, type=currentType(fds), ..., value){
     if(!is.matrix(value)){
         value <- matrix(value, ncol=ncol(fds), nrow=nrow(mcols(fds, type=type)))
     }
@@ -220,7 +220,7 @@ predictY <- function(fds, type=currentType(fds), noiseAlpha=NULL){
     fds
 }
 
-getAssayMatrix <- function(fds, name, type, byGroup=FALSE){
+getAssayMatrix <- function(fds, name, type=currentType(fds), byGroup=FALSE){
     if(missing(name)){
         name <- type
     } else {
@@ -251,8 +251,7 @@ zScores <- function(fds, type=currentType(fds), byGroup=FALSE, ...){
 #' @describeIn getter_setter_functions This returns the calculated p-values.
 #' @export
 pVals <- function(fds, type=currentType(fds), level="site", 
-                    filters=list(rho=0.1),
-                    dist="BetaBinomial", ...){
+                    filters=list(rho=1), dist="BetaBinomial", ...){
     level <- match.arg(level, choices=c("site", "junction", "gene"))
     dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal"))
     aname <- paste0("pvalues", dist)
@@ -268,7 +267,14 @@ pVals <- function(fds, type=currentType(fds), level="site",
         aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname)
         # add information on used filters
         for(n in sort(names(filters))){
-            aname <- paste0(aname, "_", n, filters[[n]])
+            aname_new <- paste0(aname, "_", n, filters[[n]])
+            if(n == "rho" && filters[[n]] == 1){
+                if(any(grepl(aname_new, assayNames(fds)))){
+                    aname <- aname_new
+                }
+            }else{
+                aname <- aname_new
+            }
         }
         if(level == "gene"){
             if(!paste(aname, type, sep="_") %in% names(metadata(fds))){
@@ -283,7 +289,7 @@ pVals <- function(fds, type=currentType(fds), level="site",
 }
 
 `pVals<-` <- function(fds, type=currentType(fds), level="site",
-                    filters=list(rho=0.1),
+                    filters=list(rho=1),
                     dist="BetaBinomial", ..., value){
     level <- match.arg(level, choices=c("site", "junction", "gene"))
     dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal"))
@@ -297,7 +303,9 @@ pVals <- function(fds, type=currentType(fds), level="site",
     }
     # add information on used filters
     for(n in sort(names(filters))){
-        aname <- paste0(aname, "_", n, filters[[n]])
+        if(!(n == "rho" && filters[[n]] == 1)){
+            aname <- paste0(aname, "_", n, filters[[n]])
+        }
     }
     
     if(level == "gene"){
@@ -314,14 +322,21 @@ pVals <- function(fds, type=currentType(fds), level="site",
 #' @describeIn getter_setter_functions This returns the adjusted p-values.
 #' @export
 padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"), 
-                    level="site", filters=list(rho=0.1), ...){
+                    level="site", filters=list(rho=1), ...){
     level <- match.arg(level, choices=c("site", "gene"))
     dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal"))
     aname <- paste0("padj", dist)
     aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname)
     # add information on used filters
     for(n in sort(names(filters))){
-        aname <- paste0(aname, "_", n, filters[[n]])
+        aname_new <- paste0(aname, "_", n, filters[[n]])
+        if(n == "rho" && filters[[n]] == 1){
+            if(any(grepl(aname_new, assayNames(fds)))){
+                aname <- aname_new
+            }
+        }else{
+            aname <- aname_new
+        }
     }
     if(level == "gene"){
         if(!paste(aname, type, sep="_") %in% names(metadata(fds))){
@@ -334,14 +349,16 @@ padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"),
 }
 
 `padjVals<-` <- function(fds, type=currentType(fds), level="site",
-                    dist="BetaBinomial", filters=list(rho=0.1), ..., value){
+                    dist="BetaBinomial", filters=list(rho=1), ..., value){
     level <- match.arg(level, choices=c("site", "gene"))
     dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal"))
     aname <- paste0("padj", dist)
     aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname)
     # add information on used filters
     for(n in sort(names(filters))){
-        aname <- paste0(aname, "_", n, filters[[n]])
+        if(!(n == "rho" && filters[[n]] == 1)){
+            aname <- paste0(aname, "_", n, filters[[n]])
+        }
     }
     if(level == "gene"){
         if(is.null(rownames(value))){
@@ -375,10 +392,14 @@ deltaPsiValue <- function(fds, type=currentType(fds)){
 
 
 #' @describeIn getter_setter_functions Returns the psi type that is used 
-#' within several methods in the FRASER package.
+#' within several methods in the FRASER package (defaults to jaccard).
 #' @export
 currentType <- function(fds){
-    return(metadata(fds)[['currentType']])
+    curType <- metadata(fds)[['currentType']]
+    if(is.null(curType)){
+        curType <- "jaccard"
+    }
+    return(curType)
 }
 
 #' @describeIn getter_setter_functions Sets the psi type that is to be used 
@@ -497,7 +518,7 @@ dontWriteHDF5 <- function(fds){
     return(fds)
 }
 
-getTrueOutliers <- function(fds, type, byGroup=FALSE, ...){
+getTrueOutliers <- function(fds, type=currentType(fds), byGroup=FALSE, ...){
     ans <- getAssayMatrix(fds, "trueOutliers", type)
     if(isTRUE(byGroup)){
         ans <- getAbsMaxByGroup(fds, type, ans, ...)
@@ -507,7 +528,7 @@ getTrueOutliers <- function(fds, type, byGroup=FALSE, ...){
     pmin(pmax(ans, -1), 1)
 }
 
-getTrueDeltaPsi <- function(fds, type, byGroup=FALSE, ...){
+getTrueDeltaPsi <- function(fds, type=currentType(fds), byGroup=FALSE, ...){
     ans <- getAssayMatrix(fds, "trueDeltaPSI", type)
     if(isTRUE(byGroup)){
         ans <- getAbsMaxByGroup(fds, type, ans, ...)
@@ -515,7 +536,8 @@ getTrueDeltaPsi <- function(fds, type, byGroup=FALSE, ...){
     ans
 }
 
-getAbsMaxByGroup <- function(fds, type, mat, index=NULL, BPPARAM=bpparam()){
+getAbsMaxByGroup <- function(fds, type=currentType(fds), mat, index=NULL, 
+                                BPPARAM=bpparam()){
     if(is.null(index)){
         index <- getSiteIndex(fds, type)
     }
@@ -534,13 +556,13 @@ getAbsMaxByGroup <- function(fds, type, mat, index=NULL, BPPARAM=bpparam()){
     return(values)
 }
 
-getByGroup <- function(fds, type, value){
+getByGroup <- function(fds, type=currentType(fds), value){
     index <- getSiteIndex(fds, type)
     idx   <- !duplicated(index)
     return(value[idx,])
 }
 
-getDeltaPsi <- function(fds, type, byGroup=FALSE, ...){
+getDeltaPsi <- function(fds, type=currentType(fds), byGroup=FALSE, ...){
     mu <- predictedMeans(fds, type)
     dataPsi <- (K(fds, type) + pseudocount())/(N(fds, type) + 2*pseudocount())
     deltaPSI <- dataPsi - mu
@@ -552,7 +574,7 @@ getDeltaPsi <- function(fds, type, byGroup=FALSE, ...){
 
 
 # calculate FRASER weights
-calcFraserWeights <- function(fds, psiType){
+calcFraserWeights <- function(fds, psiType=currentType(fds)){
     k <- as.matrix(K(fds, psiType))
     n <- as.matrix(N(fds, psiType))
     mu <- t(predictMu(fds, psiType))
@@ -584,7 +606,7 @@ calcFraserWeights <- function(fds, psiType){
 }
 
 # get FRASER weights
-weights <- function(fds, type){
+weights <- function(fds, type=currentType(fds)){
     return(getAssayMatrix(fds, "weights", type))
 }
 
@@ -594,7 +616,7 @@ weights <- function(fds, type){
     return(fds)
 }
 
-getIndexFromResultTable <- function(fds, resultTable, padj.method="holm"){
+getIndexFromResultTable <- function(fds, resultTable){
     type <- as.character(resultTable$type)
     target <- makeGRangesFromDataFrame(resultTable)
     if(type == "theta"){
@@ -611,9 +633,9 @@ getIndexFromResultTable <- function(fds, resultTable, padj.method="holm"){
     ov
 }
 
-getPlottingDT <- function(fds, axis=c("row", "col"), type=NULL, result=NULL,
-                    idx=NULL, aggregate=FALSE, pvalLevel="site", Ncpus=3, 
-                    geneColumn="hgnc_symbol", ...){
+getPlottingDT <- function(fds, axis=c("row", "col"), type=currentType(fds), 
+                    result=NULL, idx=NULL, aggregate=FALSE, pvalLevel="site", 
+                    Ncpus=3, geneColumn="hgnc_symbol", ...){
     if(!is.null(result)){
         type <- as.character(result$type)
         idx  <- getIndexFromResultTable(fds, result)
diff --git a/R/helper-functions.R b/R/helper-functions.R
index 21b117b2..ed73f639 100644
--- a/R/helper-functions.R
+++ b/R/helper-functions.R
@@ -22,7 +22,7 @@ checkCountData <- function(fds, stop=TRUE){
         if(isFALSE(stop)) return(invisible(FALSE))
         stop("No counts detected! Please provide counts first.")
     }
-    if(!all(paste0("rawOtherCounts_", psiTypes) %in% assayNames(fds))){
+    if(!all(paste0("rawOtherCounts_", psiTypes_avail) %in% assayNames(fds))){
         if(isFALSE(stop)) return(invisible(FALSE))
         stop("Please compute first the total expression at each junction.")
     }
@@ -71,10 +71,10 @@ checkReadType <- function(fds, type){
 
     # check if type is null or missing
     if(missing(type) | is.null(type)){
-        if(verbose(fds) > 0){
-            warning("Read type was not specified!",
-                    "We will assume the default: 'j'")
-        }
+        # if(verbose(fds) > 0){
+        #     warning("Read type was not specified!",
+        #             "We will assume the default: 'j'")
+        # }
         return("j")
     }
     type <- unique(type)
@@ -356,18 +356,18 @@ assayExists <- function(fds, assayName){
     return(aexists)
 }
 
-getAssayAsVector <- function(fds, prefix, psiType, sampleID){
+getAssayAsVector <- function(fds, prefix, psiType=currentType(fds), sampleID){
     as.vector(assay(fds, paste0(prefix, psiType))[,sampleID])
 }
 
 
-variableJunctions <- function(fds, type, minDeltaPsi=0.1){
+variableJunctions <- function(fds, type=currentType(fds), minDeltaPsi=0.1){
     psi <- K(fds, type=type)/N(fds, type=type)
     j2keep <- rowMaxs(abs(psi - rowMeans(psi, na.rm=TRUE)), na.rm=TRUE)
     j2keep >= minDeltaPsi
 }
 
-subsetKMostVariableJunctions <- function(fds, type, n){
+subsetKMostVariableJunctions <- function(fds, type=currentType(fds), n){
     curX <- x(fds, type=type, all=TRUE, center=FALSE, noiseAlpha=NULL)
     xsd <- colSds(curX)
     nMostVarJuncs <- which(xsd >= sort(xsd, TRUE)[min(length(xsd), n*2)])
@@ -376,7 +376,8 @@ subsetKMostVariableJunctions <- function(fds, type, n){
     ans
 }
 
-getSubsetVector <- function(fds, type, minDeltaPsi=0.1, nSubset=15000){
+getSubsetVector <- function(fds, type=currentType(fds), minDeltaPsi=0.1, 
+                            nSubset=15000){
     # get any variable intron
     ans <- variableJunctions(fds, type, minDeltaPsi=minDeltaPsi)
 
@@ -554,14 +555,21 @@ getStrandString <- function(fds){
 #'
 #' Check if adjusted pvalues have been computed for a given set of filters.
 #' @noRd
-checkPadjAvailableForFilters <- function(fds, type, filters=list(),
-                                        dist="BetaBinomial", aggregate=FALSE){
+checkPadjAvailableForFilters <- function(fds, type=currentType(fds), 
+                    filters=list(), dist="BetaBinomial", aggregate=FALSE){
     dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal"))
     aname <- paste0("padj", dist)
     aname <- ifelse(isTRUE(aggregate), paste0(aname, "_gene"), aname)
     # add information on used filters
     for(n in sort(names(filters))){
-        aname <- paste0(aname, "_", n, filters[[n]])
+        aname_new <- paste0(aname, "_", n, filters[[n]])
+        if(n == "rho" && filters[[n]] == 1){
+            if(any(grepl(aname_new, assayNames(fds)))){
+                aname <- aname_new
+            }
+        }else{
+            aname <- aname_new
+        }
     }
     aname <- paste(aname, type, sep="_")
     if(isTRUE(aggregate)){
diff --git a/R/makeSimulatedDataset.R b/R/makeSimulatedDataset.R
index 5b6f7531..f451d5cf 100644
--- a/R/makeSimulatedDataset.R
+++ b/R/makeSimulatedDataset.R
@@ -435,9 +435,10 @@ makeSimulatedFraserDataSet_Multinomial <- function(m=200, j=1000, q=10,
 #' @examples 
 #' # A generic dataset
 #' fds <- makeSimulatedFraserDataSet()
+#' fds <- calculatePSIValues(fds)
 #' fds <- injectOutliers(fds, minDpsi=0.2, freq=1E-3)
 #' @export
-injectOutliers <- function(fds, type=c("psi5", "psi3", "theta", "jaccard"),
+injectOutliers <- function(fds, type=currentType(fds),
                     freq=1E-3, minDpsi=0.2, minCoverage=2,
                     deltaDistr="uniformDistr", verbose=FALSE,
                     method=c('samplePSI', 'meanPSI', 'simulatedPSI'),
diff --git a/R/plotMethods.R b/R/plotMethods.R
index 1b4ae279..3fd7ea50 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -162,6 +162,7 @@
 #'             (b,l,t,r).
 #' @param cex For controlling the size of text and numbers in 
 #'             \code{plotBamCoverage}.
+#' @param color_chr Interchanging colors by chromosome for \code{plotManhattan}.
 #'
 #### Additional ... parameter
 #' @param ... Additional parameters passed to plot() or plot_ly() if not stated
@@ -229,33 +230,34 @@
 #' @rdname plotFunctions
 #' @aliases plotFunctions plotAberrantPerSample plotVolcano plotQQ 
 #'             plotExpression plotCountCorHeatmap plotFilterExpression 
-#'             plotExpectedVsObservedPsi plotEncDimSearch
+#'             plotExpectedVsObservedPsi plotEncDimSearch plotManhattan
+#'             plotBamCoverage plotBamCoverageFromResultTable
 #' @examples
 #' # create full FRASER object 
 #' fds <- makeSimulatedFraserDataSet(m=40, j=200)
 #' fds <- calculatePSIValues(fds)
 #' fds <- filterExpressionAndVariability(fds, filter=FALSE)
-#' # this step should be done for all splicing metrics and more dimensions
-#' fds <- optimHyperParams(fds, "psi5", q_param=c(2,5,10,25))
+#' # this step should be done for more dimensions in practice
+#' fds <- optimHyperParams(fds, "jaccard", q_param=c(2,5,10,25))
 #' fds <- FRASER(fds)
 #' 
 #' # QC plotting
 #' plotFilterExpression(fds)
 #' plotFilterVariability(fds)
-#' plotCountCorHeatmap(fds, "theta")
-#' plotCountCorHeatmap(fds, "theta", normalized=TRUE)
-#' plotEncDimSearch(fds, type="psi5")
+#' plotCountCorHeatmap(fds, "jaccard")
+#' plotCountCorHeatmap(fds, "jaccard", normalized=TRUE)
+#' plotEncDimSearch(fds, type="jaccard")
 #' 
 #' # extract results 
 #' plotAberrantPerSample(fds, aggregate=FALSE)
-#' plotVolcano(fds, "sample1", "psi5")
+#' plotVolcano(fds, "sample1", "jaccard")
 #' 
 #' # dive into gene/sample level results
 #' res <- results(fds)
 #' res
 #' plotExpression(fds, result=res[1])
 #' plotQQ(fds, result=res[1])
-#' plotExpectedVsObservedPsi(fds, type="psi5", res=res[1])
+#' plotExpectedVsObservedPsi(fds, res=res[1])
 #' 
 #' # create manhattan plot of pvalues by genomic position
 #' plotManhattan(fds, type="jaccard", sampleID="sample10")
@@ -297,9 +299,9 @@ NULL
 
 
 plotVolcano.FRASER <- function(object, sampleID, 
-                    type=c("psi3", "psi5", "theta", "jaccard"), basePlot=TRUE, 
+                    type=currentType(object), basePlot=TRUE, 
                     aggregate=FALSE, main=NULL, label=NULL,
-                    deltaPsiCutoff=0.3, padjCutoff=0.1, ...){
+                    deltaPsiCutoff=0.1, padjCutoff=0.1, ...){
     
     type <- match.arg(type)
 
@@ -398,8 +400,8 @@ setMethod("plotVolcano", signature="FraserDataSet", plotVolcano.FRASER)
 
 
 plotAberrantPerSample.FRASER <- function(object, main, 
-                    type=c("psi3", "psi5", "theta", "jaccard"),
-                    padjCutoff=0.1, zScoreCutoff=NA, deltaPsiCutoff=0.3,
+                    type=psiTypes,
+                    padjCutoff=0.1, zScoreCutoff=NA, deltaPsiCutoff=0.1,
                     aggregate=TRUE, BPPARAM=bpparam(), ...){
 
     type <- match.arg(type, several.ok=TRUE)
@@ -461,7 +463,7 @@ setMethod("plotAberrantPerSample", signature="FraserDataSet",
 #'
 #' @rdname plotFunctions
 #' @export
-plotExpression <- function(fds, type=c("psi5", "psi3", "theta", "jaccard"),
+plotExpression <- function(fds, type=currentType(fds),
                     idx=NULL, result=NULL, colGroup=NULL, 
                     basePlot=TRUE, main=NULL, label="aberrant", ...){
     if(!is.null(result)){
@@ -553,7 +555,7 @@ plotExpression <- function(fds, type=c("psi5", "psi3", "theta", "jaccard"),
 #'
 #' @rdname plotFunctions
 #' @export
-plotExpectedVsObservedPsi <- function(fds, type=c("psi5", "psi3", "theta", "jaccard"),
+plotExpectedVsObservedPsi <- function(fds, type=currentType(fds),
                     idx=NULL, result=NULL, colGroup=NULL, main=NULL,
                     basePlot=TRUE, label="aberrant", ...){
     type <- match.arg(type)
@@ -818,7 +820,7 @@ setMethod("plotQQ", signature="FraserDataSet", plotQQ.FRASER)
 
 
 plotEncDimSearch.FRASER <- function(object, 
-                    type=c("psi3", "psi5", "theta", "jaccard"), 
+                    type=currentType(object), 
                     plotType=c("auc", "loss")){
     type <- match.arg(type)
     plotType <- match.arg(plotType)
@@ -986,7 +988,7 @@ plotFilterVariability <- function(fds, bins=200, legend.position=c(0.8, 0.8),
 
 
 plotCountCorHeatmap.FRASER <- function(object,
-                    type=c("psi5", "psi3", "theta", "jaccard"), logit=FALSE, 
+                    type=currentType(object), logit=FALSE, 
                     topN=50000, topJ=5000, minMedian=1, minCount=10, 
                     main=NULL, normalized=FALSE, show_rownames=FALSE,
                     show_colnames=FALSE, minDeltaPsi=0.1, annotation_col=NA,
@@ -1427,11 +1429,12 @@ plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE,
 }
 
 plotManhattan.FRASER <- function(object, sampleID, 
-                                type=c("psi5", "psi3", "theta", "jaccard"), 
+                                type=currentType(object), 
                                 main=paste0("sampleID = ", sampleID), 
-                                color=c("black", "darkgrey"),
+                                color_chr=c("black", "darkgrey"),
                                 ...){
     # check arguments
+    stopifnot(is(object, "FraserDataSet"))
     stopifnot(sampleID %in% samples(object))
     type <- match.arg(type)
     additional_args <- list(...)
@@ -1473,7 +1476,7 @@ plotManhattan.FRASER <- function(object, sampleID,
     
     # plot manhattan plot
     plotGrandLinear.adapted(gr_sample, aes(y=pvalue), 
-                            color=color, 
+                            color=color_chr, 
                             highlight.gr=gr_sample[outlier_idx,],
                             highlight.overlap="equal") + 
         labs(title=main)
@@ -1603,8 +1606,8 @@ plotGrandLinear.adapted <- function (obj, ..., facets, space.skip = 0.01,
     if (is.null(geom)) 
         geom <- "point"
     args <- list(...)
-    args.aes <- parseArgsForAes(args)
-    args.non <- parseArgsForNonAes(args)
+    args.aes <- biovizBase::parseArgsForAes(args)
+    args.non <- biovizBase::parseArgsForNonAes(args)
     two.color <- c("#0080FF", "#4CC4FF")
     .is.seq <- FALSE
     if (!"colour" %in% names(args.aes)) {
@@ -1634,7 +1637,7 @@ plotGrandLinear.adapted <- function (obj, ..., facets, space.skip = 0.01,
     args.non$geom <- geom
     args.non$object <- obj
     aes.res <- do.call(aes, args.aes)
-    p <- do.call(autoplot, c(list(aes.res), args.non))
+    p <- do.call(ggbio::autoplot, c(list(aes.res), args.non))
     if (!legend) 
         p <- p + theme(legend.position = "none")
     if (!missing(ylab)) 
diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R
index 749d192c..f1dbcb45 100644
--- a/R/pvalsNzscore.R
+++ b/R/pvalsNzscore.R
@@ -162,7 +162,7 @@ adjust_FWER_PValues_per_idx <- function(i, pvals, index, rho, rhoCutoff,
 }
 
 getFWERpvals_bySample <- function(pvals, index, rho, method="holm", 
-                                rhoCutoff=0.1, BPPARAM=bpparam()){
+                                rhoCutoff, BPPARAM=bpparam()){
     fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues,
                     pvals=pvals, index, BPPARAM=BPPARAM,
                     method=method, rho=rho, rhoCutoff=rhoCutoff)
@@ -171,7 +171,7 @@ getFWERpvals_bySample <- function(pvals, index, rho, method="holm",
 }
 
 getFWERpvals_byIdx <- function(pvals, index, rho, method="holm", 
-                                rhoCutoff=0.1, BPPARAM=bpparam()){
+                                rhoCutoff, BPPARAM=bpparam()){
     unique_idx <- unique(index)
     fwer_pval <- bplapply(unique_idx, adjust_FWER_PValues_per_idx,
                     pvals=pvals, index, BPPARAM=BPPARAM,
@@ -230,7 +230,7 @@ singlePvalueBinomial <- function(idx, k, n, mu){
 #' 
 #' @export
 calculatePadjValues <- function(fds, type=currentType(fds), method="BY",
-                                rhoCutoff=0.1, geneLevel=TRUE, 
+                                rhoCutoff=1, geneLevel=TRUE, 
                                 geneColumn="hgnc_symbol", BPPARAM=bpparam()){
     currentType(fds) <- type
     index <- getSiteIndex(fds, type=type)
@@ -263,8 +263,8 @@ calculatePadjValues <- function(fds, type=currentType(fds), method="BY",
                 withDimnames=FALSE) <- as.matrix(padjDT)
         
         # gene-level pval correction and FDR
-        if(geneColumn %in% colnames(mcols(fds, type=type)) && 
-                isTRUE(geneLevel)){
+        if(isTRUE(geneLevel) && 
+                geneColumn %in% colnames(mcols(fds, type=type))){
             message(date(), ":  calculating gene-level pvalues ...")
             gene_pvals <- getPvalsPerGene(fds=fds, type=type, pvals=fwer_pvals,
                                             method="holm", FDRmethod=method, 
@@ -274,7 +274,7 @@ calculatePadjValues <- function(fds, type=currentType(fds), method="BY",
                     withDimnames=FALSE) <- gene_pvals[["pvals"]]
             padjVals(fds, dist=i, level="gene", filters=list(rho=rhoCutoff),
                     withDimnames=FALSE) <- gene_pvals[["padj"]]
-        } else{
+        } else if(isTRUE(geneLevel)){
             warning("Gene-level pvalues could not be calculated as column ",
                     geneColumn, "\nwas not found for the given fds object. ", 
                     "Please annotate gene symbols \nfirst using the ", 
@@ -285,7 +285,7 @@ calculatePadjValues <- function(fds, type=currentType(fds), method="BY",
     return(fds)
 }
 
-getPvalsPerGene <- function(fds, type, 
+getPvalsPerGene <- function(fds, type=currentType(fds), 
                     pvals=pVals(fds, type=type, level="site"),
                     sampleID=NULL, method="holm", FDRmethod="BY", 
                     geneColumn="hgnc_symbol", BPPARAM=bpparam()){
@@ -331,7 +331,7 @@ getPvalsPerGene <- function(fds, type,
 
 }
 
-getSiteIndex <- function(fds, type){
+getSiteIndex <- function(fds, type=currentType(fds)){
     if(type == "theta"){
         return(mcols(fds, type=type)[['spliceSiteID']])
     }
@@ -354,7 +354,8 @@ getSiteIndex <- function(fds, type){
     ans[selectionMat]
 }
 
-getGeneIDs <- function(fds, type, unique=TRUE, geneColumn="hgnc_symbol"){
+getGeneIDs <- function(fds, type=currentType(fds), unique=TRUE, 
+                        geneColumn="hgnc_symbol"){
     geneIDs <- mcols(fds, type=type)[[geneColumn]]
     if(isTRUE(unique)){
         geneIDs <- unique(unlist(lapply(geneIDs, FUN=function(g){
@@ -380,3 +381,80 @@ genePvalsByGeneID <- function(dt, samples, geneIDs, method, BPPARAM){
     return(pvalsPerGene)
 }
 
+#' @describeIn FRASER This function does FDR correction only for all junctions 
+#' in a certain subset of genes which can differ per sample. Requires gene 
+#' symbols to have been annotated to junctions. As with the full FDR 
+#' correction across all junctions, first the previously calculated 
+#' junction-level p values are adjusted with Holm's method per donor or 
+#' acceptor site, respectively. Then, gene-level p values are computed.
+#' 
+#' @param genesToTest A named list with the subset of genes to test per sample.
+#'     The names must correspond to the sampleIDs in the given fds object.
+#' @param subsetName The name under which the resulting FDR corrected pvalues 
+#'     will be stored in metadata(fds).
+#' 
+#' @export
+calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds), 
+                                subsetName="FDR_subset", method="BY", 
+                                geneColumn="hgnc_symbol", BPPARAM=bpparam()){
+    
+    # check input
+    currentType(fds) <- type
+    stopifnot(!is.null(genesToTest))
+    stopifnot(is.list(genesToTest))
+    stopifnot(!is.null(names(genesToTest)))
+    if(!all(names(genesToTest) %in% samples(fds))){
+        stop("names(genesToTest) need to be sampleIDs in the given fds object.")
+    }
+    if(!all(samples(fds) %in% names(genesToTest))){
+        stop("All sampleIDs of the given fds object need to be in ",
+            "names(geneToTest).")
+    }
+    
+    # check if genes have been annotated
+    if(!geneColumn %in% colnames(mcols(fds, type=type))){    
+        stop(paste0("'", geneColumn, "' is not found in mcols(fds). ", 
+            "Please annotate gene symbols \nfirst using the ", 
+            "annotateRanges or annotateRangesWithTxDb function."))
+    }
+    
+    # compute FDR on the given subsets of genes
+    message(date(), ": starting FDR calculation on subset of genes...")
+    FDR_subset <- rbindlist(bpmapply(names(genesToTest), genesToTest, 
+            FUN=function(sample_id, genes_to_test_sample){
+        
+        # message(date(), ": FDR subset calculation for sample = ", sample_id)
+        # get idx of junctions corresponding to genes with var
+        jidx <- unlist(lapply(genes_to_test_sample, function(gene){
+            idx <- which(grepl(gene, mcols(fds, type="j")[, geneColumn]))
+            names(idx) <- rep(gene, length(idx))
+            return(idx)
+        }))
+        jidx <- sort(jidx[!duplicated(jidx)])
+        
+        # retrieve pvalues of junctions to test
+        p <- pVals(fds, type=type, level="junction")[jidx, sample_id]
+        
+        # FDR correction
+        pa <- p.adjust(p, method=method)
+        
+        # gene level pvals
+        dt <- data.table(sampleID=sample_id, pval=p, FDR_subset=pa, 
+                        gene=names(jidx), jidx=jidx)
+        dt[, pval_gene:=min(p.adjust(pval, method="holm")), by="gene"]
+        
+        # gene level FDR
+        dt2 <- dt[, unique(pval_gene), by="gene"]
+        dt2[, FDR_subset_gene := p.adjust(V1, method=method)]
+        dt <- merge(dt, dt2[, .(gene, FDR_subset_gene)], by="gene", all.x=TRUE)
+        
+        # return new FDR
+        return(dt)
+    }, SIMPLIFY=FALSE, BPPARAM=BPPARAM))
+    message(date(), ": finished FDR calculation on subset of genes.")
+    
+    # add FDR subset info to fds object and return
+    metadata(fds)[[subsetName]] <- FDR_subset
+    return(fds)
+}
+
diff --git a/R/variables.R b/R/variables.R
index a159d2ee..61f821e1 100644
--- a/R/variables.R
+++ b/R/variables.R
@@ -1,12 +1,22 @@
 #'
-#' Available psi types
+#' Available splice metrics
 #'
 #' @examples 
-#'   # to show available psi types:
+#'   # to show available splice metrics:
+#'   psiTypes_avail
+#' 
+#' @export
+psiTypes_avail <- c("psi5", "psi3", "theta", "jaccard")
+names(psiTypes_avail) <- c("psi5", "psi3", "theta", "Intron Jaccard Index")
+
+#'
+#' Splice metrics that are run by default
+#'
+#' @examples 
+#'   # to show splice metrics selected to be fitted:
 #'   psiTypes
 #' 
 #' @export
-psiTypes <- c("psi5", "psi3", "theta", "jaccard")
-# psiTypes <- c("psi5", "psi3", "theta")
-names(psiTypes) <- psiTypes
+psiTypes <- c("jaccard")
+names(psiTypes) <- c("Intron Jaccard Index")
 
diff --git a/R/zzz.R b/R/zzz.R
index 8f087a08..7f0b67d5 100644
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -8,7 +8,7 @@
     op.fraser <- list(
         `FRASER-hdf5-chunk-nrow` = 30000,
         `FRASER-hdf5-chunk-ncol` = 20,
-        `FRASER.pseudoCount` = 1,
+        `FRASER.pseudoCount` = 0.1,
         `FRASER.minSamplesForDelayed` = 1000,
         `FRASER.maxSamplesNoHDF5` = 20,
         `FRASER.maxJunctionsNoHDF5` = 1000)
diff --git a/man/FRASER.Rd b/man/FRASER.Rd
index 0931ec25..170a6271 100644
--- a/man/FRASER.Rd
+++ b/man/FRASER.Rd
@@ -5,6 +5,7 @@
 \alias{calculateZscore}
 \alias{calculatePvalues}
 \alias{calculatePadjValues}
+\alias{calculatePadjValuesOnSubset}
 \title{FRASER: Find RAre Splicing Events in RNA-seq data}
 \usage{
 FRASER(
@@ -32,11 +33,21 @@ calculatePadjValues(
   fds,
   type = currentType(fds),
   method = "BY",
-  rhoCutoff = 0.1,
+  rhoCutoff = 1,
   geneLevel = TRUE,
   geneColumn = "hgnc_symbol",
   BPPARAM = bpparam()
 )
+
+calculatePadjValuesOnSubset(
+  fds,
+  genesToTest,
+  type = currentType(fds),
+  subsetName = "FDR_subset",
+  method = "BY",
+  geneColumn = "hgnc_symbol",
+  BPPARAM = bpparam()
+)
 }
 \arguments{
 \item{fds}{A \code{\link{FraserDataSet}} object}
@@ -58,7 +69,7 @@ not yet converged after these number of iterations, the fit stops anyway.}
 
 \item{...}{Additional parameters passed on to the internal fit function}
 
-\item{type}{The type of PSI (psi5, psi3 or theta for theta/splicing 
+\item{type}{The type of PSI (jaccard, psi5, psi3 or theta for theta/splicing 
 efficiency)}
 
 \item{logit}{Indicates if z scores are computed on the logit scale (default) 
@@ -82,6 +93,12 @@ should be calculated. Defaults to TRUE.}
 
 \item{geneColumn}{The column name of the column that has the gene annotation 
 that will be used for gene-level pvalue computation.}
+
+\item{genesToTest}{A named list with the subset of genes to test per sample.
+The names must correspond to the sampleIDs in the given fds object.}
+
+\item{subsetName}{The name under which the resulting FDR corrected pvalues 
+will be stored in metadata(fds).}
 }
 \value{
 FraserDataSet
@@ -131,6 +148,13 @@ p-values per sample for multiple testing. First, the previoulsy calculated
 junction-level p values are adjusted with Holm's method per donor or 
 acceptor site, respectively. Then, if gene symbols have been annotated to 
 junctions (and not otherwise requested), gene-level p values are computed.
+
+\item \code{calculatePadjValuesOnSubset}: This function does FDR correction only for all junctions 
+in a certain subset of genes which can differ per sample. Requires gene 
+symbols to have been annotated to junctions. As with the full FDR 
+correction across all junctions, first the previously calculated 
+junction-level p values are adjusted with Holm's method per donor or 
+acceptor site, respectively. Then, gene-level p values are computed.
 }}
 
 \examples{
@@ -154,16 +178,16 @@ fds <- FRASER(fds, q=2, implementation="PCA")
 # The functions run inside the FRASER function can also be directly 
 # run themselves. 
 # To directly run the fit function:
-fds <- fit(fds, implementation="PCA", q=2, type="psi5")
+fds <- fit(fds, implementation="PCA", q=2, type="jaccard")
 
 # To directly run the nomial and adjusted p value and z score 
 # calculation, the following functions can be used:
-fds <- calculatePvalues(fds, type="psi5")
-head(pVals(fds, type="psi5"))
-fds <- calculatePadjValues(fds, type="psi5", method="BY")
-head(padjVals(fds, type="psi5"))
-fds <- calculateZscore(fds, type="psi5")
-head(zScores(fds, type="psi5")) 
+fds <- calculatePvalues(fds, type="jaccard")
+head(pVals(fds, type="jaccard"))
+fds <- calculatePadjValues(fds, type="jaccard", method="BY")
+head(padjVals(fds, type="jaccard"))
+fds <- calculateZscore(fds, type="jaccard")
+head(zScores(fds, type="jaccard")) 
 
 }
 \seealso{
diff --git a/man/annotateRanges.Rd b/man/annotateRanges.Rd
index c82a3211..4d1639f9 100644
--- a/man/annotateRanges.Rd
+++ b/man/annotateRanges.Rd
@@ -67,13 +67,13 @@ fds <- createTestFraserDataSet()
 # either using biomart with GRCh38
 try({
   fds <- annotateRanges(fds, GRCh=38)
-  rowRanges(fds, type="psi5")[,c("hgnc_symbol")]
+  rowRanges(fds, type="j")[,c("hgnc_symbol")]
 })
 
 # either using biomart with GRCh37
 try({
   fds <- annotateRanges(fds, featureName="hgnc_symbol_37", GRCh=37)
-  rowRanges(fds, type="psi5")[,c("hgnc_symbol_37")]
+  rowRanges(fds, type="j")[,c("hgnc_symbol_37")]
 })
  
 # or with a provided TxDb object
@@ -82,6 +82,6 @@ txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
 require(org.Hs.eg.db)
 orgDb <- org.Hs.eg.db
 fds <- annotateRangesWithTxDb(fds, txdb=txdb, orgDb=orgDb)
-rowRanges(fds, type="psi5")[,"hgnc_symbol"]
+rowRanges(fds, type="j")[,"hgnc_symbol"]
 
 }
diff --git a/man/calculatePSIValues.Rd b/man/calculatePSIValues.Rd
index 0128221d..9086c048 100644
--- a/man/calculatePSIValues.Rd
+++ b/man/calculatePSIValues.Rd
@@ -6,7 +6,7 @@
 \usage{
 calculatePSIValues(
   fds,
-  types = psiTypes,
+  types = psiTypes_avail,
   overwriteCts = FALSE,
   BPPARAM = bpparam()
 )
@@ -15,7 +15,7 @@ calculatePSIValues(
 \item{fds}{A \code{\link{FraserDataSet}} object}
 
 \item{types}{A vector with the psi types which should be calculated. Default 
-is all of psi5, psi3 and theta.}
+is all of jaccard, psi5, psi3 and theta.}
 
 \item{overwriteCts}{FALSE or TRUE (the default) the total counts (aka N) will
 be recalculated based on the existing junction counts (aka K)}
@@ -31,7 +31,7 @@ based on the FraserDataSet object
 }
 \examples{
   fds <- createTestFraserDataSet()
-  fds <- calculatePSIValues(fds, types="psi5")
+  fds <- calculatePSIValues(fds, types="jaccard")
   
   ### usually one would run this function for all psi types by using:
   # fds <- calculatePSIValues(fds)
diff --git a/man/counts.Rd b/man/counts.Rd
index 1ddac4d5..79beb47f 100644
--- a/man/counts.Rd
+++ b/man/counts.Rd
@@ -11,9 +11,14 @@ K(fds, type = currentType(fds))
 
 N(fds, type = currentType(fds))
 
-\S4method{counts}{FraserDataSet}(object, type = NULL, side = c("ofInterest", "otherSide"))
-
-\S4method{counts}{FraserDataSet,ANY}(object, type = NULL, side = c("ofInterest", "otherSide"), ...) <- value
+\S4method{counts}{FraserDataSet}(object, type = currentType(object), side = c("ofInterest", "otherSide"))
+
+\S4method{counts}{FraserDataSet,ANY}(
+  object,
+  type = currentType(object),
+  side = c("ofInterest", "otherSide"),
+  ...
+) <- value
 }
 \arguments{
 \item{fds, object}{FraserDataSet}
@@ -39,8 +44,10 @@ setter for count data
 \examples{
  fds <- createTestFraserDataSet()
  
- counts(fds, type="psi5", side="ofInterest")
- counts(fds, type="psi5", side="other")
+ counts(fds, side="ofInterest")
+ counts(fds, type="jaccard", side="other")
+ head(K(fds))
+ head(K(fds, type="psi5"))
  head(K(fds, type="psi3"))
  head(N(fds, type="theta"))
  
diff --git a/man/filtering.Rd b/man/filtering.Rd
index e075761a..5d604036 100644
--- a/man/filtering.Rd
+++ b/man/filtering.Rd
@@ -1,21 +1,21 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/filterExpression.R
-\name{filtering}
+% Please edit documentation in R/AllGenerics-definitions.R, R/filterExpression.R
+\name{filterVariability}
+\alias{filterVariability}
 \alias{filtering}
 \alias{filterExpressionAndVariability}
 \alias{filterExpression,FraserDataSet-method}
-\alias{filterVariability}
-\alias{filterExpressionAndVariability_jaccard}
-\alias{filterExpression_jaccard}
-\alias{filterVariability_jaccard}
+\alias{filterVariability,FraserDataSet-method}
 \title{Filtering FraserDataSets}
 \usage{
+filterVariability(object, ...)
+
 filterExpressionAndVariability(
   object,
   minExpressionInOneSample = 20,
-  quantile = 0.95,
+  quantile = 0.75,
   quantileMinExpression = 10,
-  minDeltaPsi = 0.05,
+  minDeltaPsi = 0,
   filter = TRUE,
   delayed = ifelse(ncol(object) <= 300, FALSE, TRUE),
   BPPARAM = bpparam()
@@ -24,45 +24,16 @@ filterExpressionAndVariability(
 \S4method{filterExpression}{FraserDataSet}(
   object,
   minExpressionInOneSample = 20,
-  quantile = 0.95,
+  quantile = 0.75,
   quantileMinExpression = 10,
   filter = TRUE,
   delayed = ifelse(ncol(object) <= 300, FALSE, TRUE),
   BPPARAM = bpparam()
 )
 
-filterVariability(
+\S4method{filterVariability}{FraserDataSet}(
   object,
-  minDeltaPsi = 0.05,
-  filter = TRUE,
-  delayed = ifelse(ncol(object) <= 300, FALSE, TRUE),
-  BPPARAM = bpparam()
-)
-
-filterExpressionAndVariability_jaccard(
-  object,
-  minExpressionInOneSample = 20,
-  quantile = 0.95,
-  quantileMinExpression = 1,
-  minDelta = 0.05,
-  filter = TRUE,
-  delayed = ifelse(ncol(object) <= 300, FALSE, TRUE),
-  BPPARAM = bpparam()
-)
-
-filterExpression_jaccard(
-  object,
-  minExpressionInOneSample = 20,
-  quantile = 0.95,
-  quantileMinExpression = 1,
-  filter = TRUE,
-  delayed = ifelse(ncol(object) <= 300, FALSE, TRUE),
-  BPPARAM = bpparam()
-)
-
-filterVariability_jaccard(
-  object,
-  minDelta = 0,
+  minDeltaPsi = 0,
   filter = TRUE,
   delayed = ifelse(ncol(object) <= 300, FALSE, TRUE),
   BPPARAM = bpparam()
@@ -71,6 +42,8 @@ filterVariability_jaccard(
 \arguments{
 \item{object}{A \code{\link{FraserDataSet}} object}
 
+\item{...}{Further parameters passed on to Rsubread::featureCounts.}
+
 \item{minExpressionInOneSample}{The minimal read count in at least one 
 sample that is required for an intron to pass the filter.}
 
@@ -110,24 +83,15 @@ read support and introns that are not variable across samples.
 \item \code{filterExpression,FraserDataSet-method}: This function filters out introns and corresponding 
 splice sites that have low read support in all samples.
 
-\item \code{filterVariability}: This function filters out introns and corresponding 
-splice sites which do not show variablity across samples.
-
-\item \code{filterExpressionAndVariability_jaccard}: This functions filters out both introns with low 
-read support and introns that are not variable across samples.
-
-\item \code{filterExpression_jaccard}: This function filters out introns and corresponding 
-splice sites which are expressed at very low levels across samples.
-
-\item \code{filterVariability_jaccard}: This function filters out introns and corresponding 
-splice sites which do not show variablity across samples.
+\item \code{filterVariability,FraserDataSet-method}: This function filters out introns and corresponding 
+splice sites that have low read support in all samples.
 }}
 
 \examples{
 fds <- createTestFraserDataSet()
 fds <- filterExpressionAndVariability(fds, minDeltaPsi=0.1, filter=FALSE)
-mcols(fds, type="psi5")[, c(
-        "maxCount", "passedExpression", "maxDPsi3", "passedVariability")]
+mcols(fds, type="jaccard")[, c(
+        "maxCount", "passedExpression", "maxDJaccard", "passedVariability")]
 
 plotFilterExpression(fds)
 plotFilterVariability(fds)     
diff --git a/man/fit.Rd b/man/fit.Rd
index 35f7966e..659baa3b 100644
--- a/man/fit.Rd
+++ b/man/fit.Rd
@@ -10,7 +10,7 @@
   implementation = c("PCA", "PCA-BB-Decoder", "AE", "AE-weighted", "PCA-BB-full",
     "fullAE", "PCA-regression", "PCA-reg-full", "PCA-BB-Decoder-no-weights", "BB"),
   q,
-  type = "psi3",
+  type = currentType(object),
   rhoRange = c(-30, 30),
   weighted = FALSE,
   noiseAlpha = 1,
@@ -35,7 +35,7 @@ Should be fitted using \code{\link{optimHyperParams}} if unknown.
 If a named vector is provided it is used for the different
 splicing types.}
 
-\item{type}{The type of PSI (psi5, psi3 or theta for theta/splicing 
+\item{type}{The type of PSI (jaccard, psi5, psi3 or theta for theta/splicing 
 efficiency)}
 
 \item{rhoRange}{Defines the range of values that rho parameter from the 
diff --git a/man/getter_setter_functions.Rd b/man/getter_setter_functions.Rd
index 79306a75..3b9c108a 100644
--- a/man/getter_setter_functions.Rd
+++ b/man/getter_setter_functions.Rd
@@ -39,7 +39,7 @@ pVals(
   fds,
   type = currentType(fds),
   level = "site",
-  filters = list(rho = 0.1),
+  filters = list(rho = 1),
   dist = "BetaBinomial",
   ...
 )
@@ -49,7 +49,7 @@ padjVals(
   type = currentType(fds),
   dist = c("BetaBinomial"),
   level = "site",
-  filters = list(rho = 0.1),
+  filters = list(rho = 1),
   ...
 )
 
@@ -135,7 +135,7 @@ values.
 observed and the fitted psi values.
 
 \item \code{currentType}: Returns the psi type that is used 
-within several methods in the FRASER package.
+within several methods in the FRASER package (defaults to jaccard).
 
 \item \code{currentType<-}: Sets the psi type that is to be used 
 within several methods in the FRASER package.
@@ -174,7 +174,7 @@ dontWriteHDF5(fds)
 dontWriteHDF5 <- TRUE
 
 # get/set the splice metric for which results should be retrieved 
-currentType(fds) <- "psi5"
+currentType(fds) <- "jaccard"
 currentType(fds)
 
 # get fitted parameters
@@ -192,9 +192,9 @@ pseudocount(4L)
 pseudocount()
 
 # retrieve or set a mask to exclude certain junctions in the fitting step
-featureExclusionMask(fds, type="theta") <- sample(
-        c(FALSE, TRUE), nrow(mcols(fds, type="theta")), replace=TRUE)
-featureExclusionMask(fds, type="theta")
+featureExclusionMask(fds, type="jaccard") <- sample(
+        c(FALSE, TRUE), nrow(mcols(fds, type="jaccard")), replace=TRUE)
+featureExclusionMask(fds, type="jaccard")
 
 # controlling the verbosity level of the output of some algorithms
 verbose(fds) <- 2
diff --git a/man/injectOutliers.Rd b/man/injectOutliers.Rd
index b32ed74a..64728eae 100644
--- a/man/injectOutliers.Rd
+++ b/man/injectOutliers.Rd
@@ -6,7 +6,7 @@
 \usage{
 injectOutliers(
   fds,
-  type = c("psi5", "psi3", "theta", "jaccard"),
+  type = currentType(fds),
   freq = 0.001,
   minDpsi = 0.2,
   minCoverage = 2,
@@ -50,5 +50,6 @@ Inject artificial outliers in an existing fds
 \examples{
 # A generic dataset
 fds <- makeSimulatedFraserDataSet()
+fds <- calculatePSIValues(fds)
 fds <- injectOutliers(fds, minDpsi=0.2, freq=1E-3)
 }
diff --git a/man/optimHyperParams.Rd b/man/optimHyperParams.Rd
index f3d10622..96ca78ba 100644
--- a/man/optimHyperParams.Rd
+++ b/man/optimHyperParams.Rd
@@ -6,9 +6,9 @@
 \usage{
 optimHyperParams(
   fds,
-  type,
+  type = currentType(fds),
   implementation = "PCA",
-  q_param = seq(2, min(40, ncol(fds)), by = 3),
+  q_param = getEncDimRange(fds),
   noise_param = 0,
   minDeltaPsi = 0.1,
   iterations = 5,
@@ -24,7 +24,7 @@ optimHyperParams(
 \arguments{
 \item{fds}{A \code{\link{FraserDataSet}} object}
 
-\item{type}{The type of PSI (psi5, psi3 or theta for theta/splicing 
+\item{type}{The type of PSI (jaccard, psi5, psi3 or theta for theta/splicing 
 efficiency)}
 
 \item{implementation}{The method that should be used to correct for 
@@ -71,13 +71,14 @@ ratios while maximizing the precision-recall curve.
 \examples{
   # generate data
   fds <- makeSimulatedFraserDataSet(m=15, j=20)
+  fds <- calculatePSIValues(fds)
   
   # run hyperparameter optimization
-  fds <- optimHyperParams(fds, type="psi5", q_param=c(2, 5))
+  fds <- optimHyperParams(fds, type="jaccard", q_param=c(2, 5))
   
   # get estimated optimal dimension of the latent space
-  bestQ(fds, type="psi5")
-  hyperParams(fds, type="psi5")
+  bestQ(fds, type="jaccard")
+  hyperParams(fds, type="jaccard")
   
 }
 \seealso{
diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd
index 9f591ebb..1acf58c6 100644
--- a/man/plotFunctions.Rd
+++ b/man/plotFunctions.Rd
@@ -11,14 +11,14 @@
 \alias{plotFilterExpression}
 \alias{plotExpectedVsObservedPsi}
 \alias{plotEncDimSearch}
+\alias{plotBamCoverage}
+\alias{plotBamCoverageFromResultTable}
 \alias{plotVolcano,FraserDataSet-method}
 \alias{plotAberrantPerSample,FraserDataSet-method}
 \alias{plotQQ,FraserDataSet-method}
 \alias{plotEncDimSearch,FraserDataSet-method}
 \alias{plotFilterVariability}
 \alias{plotCountCorHeatmap,FraserDataSet-method}
-\alias{plotBamCoverage}
-\alias{plotBamCoverageFromResultTable}
 \alias{plotManhattan,FraserDataSet-method}
 \title{Visualization functions for FRASER}
 \usage{
@@ -27,12 +27,12 @@ plotManhattan(object, ...)
 \S4method{plotVolcano}{FraserDataSet}(
   object,
   sampleID,
-  type = c("psi3", "psi5", "theta", "jaccard"),
+  type = currentType(object),
   basePlot = TRUE,
   aggregate = FALSE,
   main = NULL,
   label = NULL,
-  deltaPsiCutoff = 0.3,
+  deltaPsiCutoff = 0.1,
   padjCutoff = 0.1,
   ...
 )
@@ -40,10 +40,10 @@ plotManhattan(object, ...)
 \S4method{plotAberrantPerSample}{FraserDataSet}(
   object,
   main,
-  type = c("psi3", "psi5", "theta", "jaccard"),
+  type = psiTypes,
   padjCutoff = 0.1,
   zScoreCutoff = NA,
-  deltaPsiCutoff = 0.3,
+  deltaPsiCutoff = 0.1,
   aggregate = TRUE,
   BPPARAM = bpparam(),
   ...
@@ -51,7 +51,7 @@ plotManhattan(object, ...)
 
 plotExpression(
   fds,
-  type = c("psi5", "psi3", "theta", "jaccard"),
+  type = currentType(fds),
   idx = NULL,
   result = NULL,
   colGroup = NULL,
@@ -63,7 +63,7 @@ plotExpression(
 
 plotExpectedVsObservedPsi(
   fds,
-  type = c("psi5", "psi3", "theta", "jaccard"),
+  type = currentType(fds),
   idx = NULL,
   result = NULL,
   colGroup = NULL,
@@ -91,7 +91,7 @@ plotExpectedVsObservedPsi(
 
 \S4method{plotEncDimSearch}{FraserDataSet}(
   object,
-  type = c("psi3", "psi5", "theta", "jaccard"),
+  type = currentType(object),
   plotType = c("auc", "loss")
 )
 
@@ -111,7 +111,7 @@ plotFilterVariability(
 
 \S4method{plotCountCorHeatmap}{FraserDataSet}(
   object,
-  type = c("psi5", "psi3", "theta", "jaccard"),
+  type = currentType(object),
   logit = FALSE,
   topN = 50000,
   topJ = 5000,
@@ -174,9 +174,9 @@ plotBamCoverageFromResultTable(
 \S4method{plotManhattan}{FraserDataSet}(
   object,
   sampleID,
-  type = c("psi5", "psi3", "theta", "jaccard"),
+  type = currentType(object),
   main = paste0("sampleID = ", sampleID),
-  color = c("black", "darkgrey"),
+  color_chr = c("black", "darkgrey"),
   ...
 )
 }
@@ -375,6 +375,8 @@ junction should be extended to the left in
 \item{right_extension}{Indicating how far the plotted range around the 
 outlier junction should be extended to the right in 
 \code{plotBamCoverageFromResultTable}.}
+
+\item{color_chr}{Interchanging colors by chromosome for \code{plotManhattan}.}
 }
 \value{
 If base R graphics are used nothing is returned else the plotly or
@@ -484,27 +486,27 @@ certain region around the outlier.
 fds <- makeSimulatedFraserDataSet(m=40, j=200)
 fds <- calculatePSIValues(fds)
 fds <- filterExpressionAndVariability(fds, filter=FALSE)
-# this step should be done for all splicing metrics and more dimensions
-fds <- optimHyperParams(fds, "psi5", q_param=c(2,5,10,25))
+# this step should be done for more dimensions in practice
+fds <- optimHyperParams(fds, "jaccard", q_param=c(2,5,10,25))
 fds <- FRASER(fds)
 
 # QC plotting
 plotFilterExpression(fds)
 plotFilterVariability(fds)
-plotCountCorHeatmap(fds, "theta")
-plotCountCorHeatmap(fds, "theta", normalized=TRUE)
-plotEncDimSearch(fds, type="psi5")
+plotCountCorHeatmap(fds, "jaccard")
+plotCountCorHeatmap(fds, "jaccard", normalized=TRUE)
+plotEncDimSearch(fds, type="jaccard")
 
 # extract results 
 plotAberrantPerSample(fds, aggregate=FALSE)
-plotVolcano(fds, "sample1", "psi5")
+plotVolcano(fds, "sample1", "jaccard")
 
 # dive into gene/sample level results
 res <- results(fds)
 res
 plotExpression(fds, result=res[1])
 plotQQ(fds, result=res[1])
-plotExpectedVsObservedPsi(fds, type="psi5", res=res[1])
+plotExpectedVsObservedPsi(fds, res=res[1])
 
 # create manhattan plot of pvalues by genomic position
 plotManhattan(fds, type="jaccard", sampleID="sample10")
diff --git a/man/psiTypes.Rd b/man/psiTypes.Rd
index 43ce2ea6..773152c5 100644
--- a/man/psiTypes.Rd
+++ b/man/psiTypes.Rd
@@ -3,18 +3,18 @@
 \docType{data}
 \name{psiTypes}
 \alias{psiTypes}
-\title{Available psi types}
+\title{Splice metrics that are run by default}
 \format{
-An object of class \code{character} of length 4.
+An object of class \code{character} of length 1.
 }
 \usage{
 psiTypes
 }
 \description{
-Available psi types
+Splice metrics that are run by default
 }
 \examples{
-  # to show available psi types:
+  # to show splice metrics selected to be fitted:
   psiTypes
 
 }
diff --git a/man/psiTypes_avail.Rd b/man/psiTypes_avail.Rd
new file mode 100644
index 00000000..6f8455a9
--- /dev/null
+++ b/man/psiTypes_avail.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/variables.R
+\docType{data}
+\name{psiTypes_avail}
+\alias{psiTypes_avail}
+\title{Available splice metrics}
+\format{
+An object of class \code{character} of length 4.
+}
+\usage{
+psiTypes_avail
+}
+\description{
+Available splice metrics
+}
+\examples{
+  # to show available splice metrics:
+  psiTypes_avail
+
+}
+\keyword{datasets}
diff --git a/man/results.Rd b/man/results.Rd
index ef76b666..c7dec171 100644
--- a/man/results.Rd
+++ b/man/results.Rd
@@ -10,12 +10,12 @@
   sampleIDs = samples(object),
   padjCutoff = 0.05,
   zScoreCutoff = NA,
-  deltaPsiCutoff = 0.3,
-  rhoCutoff = 0.1,
+  deltaPsiCutoff = 0.1,
+  rhoCutoff = 1,
   aggregate = FALSE,
   collapse = FALSE,
   minCount = 5,
-  psiType = c("psi3", "psi5", "theta", "jaccard"),
+  psiType = currentType(object),
   geneColumn = "hgnc_symbol",
   additionalColumns = NULL,
   BPPARAM = bpparam()
@@ -25,10 +25,10 @@
   object,
   type = currentType(object),
   padjCutoff = 0.05,
-  deltaPsiCutoff = 0.3,
+  deltaPsiCutoff = 0.1,
   zScoreCutoff = NA,
   minCount = 5,
-  rhoCutoff = 0.1,
+  rhoCutoff = 1,
   by = c("none", "sample", "feature"),
   aggregate = FALSE,
   geneColumn = "hgnc_symbol",
@@ -117,13 +117,13 @@ results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=0.1,
 
 # get aberrant events per sample: on the example data, nothing is aberrant
 # based on the adjusted p-value
-aberrant(fds, type="psi5", by="sample")
+aberrant(fds, type="jaccard", by="sample")
 
 # get aberrant events per gene (first annotate gene symbols)
 fds <- annotateRangesWithTxDb(fds)
-aberrant(fds, type="psi5", by="feature", zScoreCutoff=2, padjCutoff=NA,
+aberrant(fds, type="jaccard", by="feature", zScoreCutoff=2, padjCutoff=NA,
         aggregate=TRUE)
         
 # find aberrant junctions/splice sites
-aberrant(fds, type="psi5")
+aberrant(fds, type="jaccard")
 }
diff --git a/tests/testthat/test_fraser_pipeline.R b/tests/testthat/test_fraser_pipeline.R
index 2bb6e2cc..761330ec 100644
--- a/tests/testthat/test_fraser_pipeline.R
+++ b/tests/testthat/test_fraser_pipeline.R
@@ -4,7 +4,7 @@ test_that("FRASER function", {
     fds <- createTestFraserDataSet()
     expect_is(fds, "FraserDataSet")
     anames <- c(psiTypes, paste0(c("delta", "predictedMeans", 
-            "pvaluesBetaBinomial_rho0.1", "padjBetaBinomial_rho0.1", 
+            "pvaluesBetaBinomial", "padjBetaBinomial", 
             "zScores"), "_", 
             rep(psiTypes, 5)))
     expect_equal(anames %in% assayNames(fds), !logical(length(anames)))
diff --git a/tests/testthat/test_hyperParams.R b/tests/testthat/test_hyperParams.R
index 26331409..fd8a961b 100644
--- a/tests/testthat/test_hyperParams.R
+++ b/tests/testthat/test_hyperParams.R
@@ -2,6 +2,7 @@ context("Test hyper param optimization")
 
 test_that("Test hyper param testing", {
     fds <- makeSimulatedFraserDataSet(m=15, j=20, dist="BB")
+    fds <- calculatePSIValues(fds)
     
     # test BB no hyper params and accessors
     fds <- optimHyperParams(fds, type="psi3", implementation="BB")
diff --git a/tests/testthat/test_plotJunctionDist.R b/tests/testthat/test_plotJunctionDist.R
index 85a4b9d3..1394f13c 100644
--- a/tests/testthat/test_plotJunctionDist.R
+++ b/tests/testthat/test_plotJunctionDist.R
@@ -7,11 +7,11 @@ test_that("Main junction distribution plot", {
 
     # plot distributions
     expect_silent(plotExpression(fds, result=res[1]))
-    expect_silent(plotVolcano(fds, "sample1", "psi5"))
+    expect_silent(plotVolcano(fds, "sample1", "jaccard"))
     expect_silent(plotExpectedVsObservedPsi(fds, result=res[2]))
     
-    expect_is(plotCountCorHeatmap(fds, "psi5", norm=FALSE), "pheatmap")
-    expect_is(plotCountCorHeatmap(fds, "psi5", norm=TRUE), "pheatmap")
-    expect_is(plotCountCorHeatmap(fds, "psi5", norm=TRUE, topN=10), "pheatmap")
+    expect_is(plotCountCorHeatmap(fds, "jaccard", norm=FALSE), "pheatmap")
+    expect_is(plotCountCorHeatmap(fds, "jaccard", norm=TRUE), "pheatmap")
+    expect_is(plotCountCorHeatmap(fds, "jaccard", norm=TRUE, topN=10), "pheatmap")
 })
 
diff --git a/tests/testthat/test_stats.R b/tests/testthat/test_stats.R
index cdc098df..e333b220 100644
--- a/tests/testthat/test_stats.R
+++ b/tests/testthat/test_stats.R
@@ -21,14 +21,14 @@ test_that("Zscore calculation", {
     fds <- getFraser(clean = TRUE)
     
     # prepare zScore input for logit scale
-    psiVal <- (K(fds, "psi5") + pseudocount())/(N(fds, "psi5") + 2*pseudocount())
-    mu <- predictedMeans(fds, "psi5")
+    psiVal <- (K(fds, "jaccard") + pseudocount())/(N(fds, "jaccard") + 2*pseudocount())
+    mu <- predictedMeans(fds, "jaccard")
     residual <- qlogis(psiVal) - qlogis(mu)
     
     # compute zscore
     zscores <- (residual - rowMeans(residual)) / rowSds(residual)
     
-    expect_equal(zscores, zScores(fds, "psi5"))
+    expect_equal(zscores, zScores(fds, "jaccard"))
 })
 
 test_that("Gene p value calculation with NAs", {
@@ -39,44 +39,44 @@ test_that("Gene p value calculation with NAs", {
     mcols(fds, type="ss")$hgnc_symbol <- rep(c("geneA", "geneB", "geneC"), 
                                             times=c(4, 6, 4))
     
-    # simulate junction with bad rho fit
-    rho_5 <- rho(fds, type="psi5")
-    rho_5[c(1, 4:7)] <- 0.5
-    rho(fds, type="psi5") <- rho_5
-    
-    rho_3 <- rho(fds, type="psi3")
-    rho_3 <- rep(0.5, length(rho_3))
-    rho(fds, type="psi3") <- rho_3
+    # simulate junction with bad rho fit to create partly NAs
+    rho <- rho(fds, type="jaccard")
+    rho[c(1, 4:7)] <- 0.5
+    rho(fds, type="jaccard") <- rho
   
     # calc p values
-    fds <- calculatePadjValues(fds, type="psi5", rhoCutoff=0.1)
-    fds <- calculatePadjValues(fds, type="psi3", rhoCutoff=0.1)
+    fds <- calculatePadjValues(fds, type="jaccard", rhoCutoff=0.1)
     
     # check dimension of junction-, site- and gene-level pval matrices
-    expect_equal(nrow(pVals(fds, type="psi5", level="junction", 
-                            filters=list(rho=0.1))), nrow(fds))
-    expect_equal(nrow(pVals(fds, type="psi5", level="site", 
+    expect_equal(nrow(pVals(fds, type="jaccard", level="junction")), nrow(fds))
+    expect_equal(nrow(pVals(fds, type="jaccard", level="site", 
                             filters=list(rho=0.1))), nrow(fds))
-    expect_equal(nrow(pVals(fds, type="psi5", level="gene", 
+    expect_equal(nrow(pVals(fds, type="jaccard", level="gene", 
                             filters=list(rho=0.1))), 3)
     
-    # check psi5 pvals are partly NAs
-    expect_true(all(is.na(pVals(fds, type="psi5", level="site", 
-                                filters=list(rho=0.1))[4:7,])))
-    expect_true(all(is.na(pVals(fds, type="psi5", level="gene", 
-                                filters=list(rho=0.1))["geneB",])))
-    expect_true(all(is.na(padjVals(fds, type="psi5", level="site", 
-                                filters=list(rho=0.1))[4:7,])))
-    expect_true(all(is.na(padjVals(fds, type="psi5", level="gene", 
-                                filters=list(rho=0.1))["geneB",])))
+    # check jaccard pvals are partly NAs
+    expect_true(all(is.na(pVals(fds, type="jaccard", level="site", 
+                            filters=list(rho=0.1))[4:7,])))
+    expect_true(all(is.na(pVals(fds, type="jaccard", level="gene", 
+                            filters=list(rho=0.1))["geneB",])))
+    expect_true(all(is.na(padjVals(fds, type="jaccard", level="site", 
+                            filters=list(rho=0.1))[4:7,])))
+    expect_true(all(is.na(padjVals(fds, type="jaccard", level="gene", 
+                            filters=list(rho=0.1))["geneB",])))
+    
+    # simulate junction with bad rho fit to create partly NAs
+    rho <- rho(fds, type="jaccard")
+    rho <- rep(0.5, length(rho))
+    rho(fds, type="jaccard") <- rho
+    fds <- calculatePadjValues(fds, type="jaccard", rhoCutoff=0.1)
     
-    # check psi3 pvals are all NAs
-    expect_true(all(is.na(pVals(fds, type="psi3", level="site", 
+    # check jaccard pvals are all NAs
+    expect_true(all(is.na(pVals(fds, type="jaccard", level="site", 
                                 filters=list(rho=0.1)))))
-    expect_true(all(is.na(pVals(fds, type="psi3", level="gene", 
+    expect_true(all(is.na(pVals(fds, type="jaccard", level="gene", 
                                 filters=list(rho=0.1)))))
-    expect_true(all(is.na(padjVals(fds, type="psi3", level="site", 
+    expect_true(all(is.na(padjVals(fds, type="jaccard", level="site", 
                                 filters=list(rho=0.1)))))
-    expect_true(all(is.na(padjVals(fds, type="psi3", level="gene", 
+    expect_true(all(is.na(padjVals(fds, type="jaccard", level="gene", 
                                 filters=list(rho=0.1)))))
 })

From 7d272883781edcf8aec33a179b04e508e440caea Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Thu, 24 Nov 2022 13:51:34 +0100
Subject: [PATCH 39/80] fix install of dependencies for github actions

---
 .github/helperScripts/setupEnv.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/helperScripts/setupEnv.R b/.github/helperScripts/setupEnv.R
index 9702e9b5..a141aac6 100644
--- a/.github/helperScripts/setupEnv.R
+++ b/.github/helperScripts/setupEnv.R
@@ -24,7 +24,7 @@ BiocManager::install("BiocVersion", version=BIOC_VERSION,ask = FALSE)
 # add testthat to pre installation dependencies due to: https://github.com/r-lib/pkgload/issues/89
 for(p in c("getopt", "testthat", "devtools", "covr", "roxygen2", "BiocCheck",
             "R.utils", "rtracklayer")){
-    installIfReq(p=p, Ncpus=NCPUS)
+    installIfReq(p=p, Ncpus=NCPUS, update=TRUE)
 }
 
 # because of https://github.com/r-windows/rtools-installer/issues/3
@@ -35,10 +35,10 @@ if("windows" == .Platform$OS.type){
 }
 
 print_log("Update Packages")
-BiocManager::install(ask=FALSE, Ncpus=NCPUS, version=BIOC_VERSION)
+BiocManager::install(ask=FALSE, Ncpus=NCPUS, version=BIOC_VERSION, update=TRUE)
 
 print_log("Install dev package")
-devtools::install(".", dependencies=TRUE)
+devtools::install(".", dependencies=TRUE, update=TRUE)
 
 # to get FRASER session info
 try({ library(FRASER) })

From 86f6691cd04d464fcda68ed39a75f6e02875910b Mon Sep 17 00:00:00 2001
From: Christian Mertes <mertes@in.tum.de>
Date: Thu, 24 Nov 2022 14:44:44 +0100
Subject: [PATCH 40/80] pkgload problem is fixed ==> parallel install

---
 .github/helperScripts/setupEnv.R | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/.github/helperScripts/setupEnv.R b/.github/helperScripts/setupEnv.R
index a141aac6..979169ca 100644
--- a/.github/helperScripts/setupEnv.R
+++ b/.github/helperScripts/setupEnv.R
@@ -22,10 +22,11 @@ BiocManager::install("BiocVersion", version=BIOC_VERSION,ask = FALSE)
 
 # install needed packages
 # add testthat to pre installation dependencies due to: https://github.com/r-lib/pkgload/issues/89
-for(p in c("getopt", "testthat", "devtools", "covr", "roxygen2", "BiocCheck",
-            "R.utils", "rtracklayer")){
-    installIfReq(p=p, Ncpus=NCPUS, update=TRUE)
-}
+# for(p in c("getopt", "testthat", "devtools", "covr", "roxygen2", "BiocCheck",
+#            "R.utils", "rtracklayer")){
+#    installIfReq(p=p, Ncpus=NCPUS, update=TRUE)
+# }
+BiocManager::install(c("getopt", "testthat", "devtools", "covr", "roxygen2", "BiocCheck", "R.utils", "rtracklayer"), Ncpus=NCPUS)
 
 # because of https://github.com/r-windows/rtools-installer/issues/3
 if("windows" == .Platform$OS.type){

From 7d9abd45850f5143e29594aeff50a321cd0acb34 Mon Sep 17 00:00:00 2001
From: Christian Mertes <mertes@in.tum.de>
Date: Thu, 24 Nov 2022 14:49:15 +0100
Subject: [PATCH 41/80] Add update=FALSE to install

---
 .github/helperScripts/setupEnv.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/helperScripts/setupEnv.R b/.github/helperScripts/setupEnv.R
index 979169ca..b58d864d 100644
--- a/.github/helperScripts/setupEnv.R
+++ b/.github/helperScripts/setupEnv.R
@@ -26,7 +26,7 @@ BiocManager::install("BiocVersion", version=BIOC_VERSION,ask = FALSE)
 #            "R.utils", "rtracklayer")){
 #    installIfReq(p=p, Ncpus=NCPUS, update=TRUE)
 # }
-BiocManager::install(c("getopt", "testthat", "devtools", "covr", "roxygen2", "BiocCheck", "R.utils", "rtracklayer"), Ncpus=NCPUS)
+BiocManager::install(c("getopt", "testthat", "devtools", "covr", "roxygen2", "BiocCheck", "R.utils", "rtracklayer"), Ncpus=NCPUS, update=FALSE)
 
 # because of https://github.com/r-windows/rtools-installer/issues/3
 if("windows" == .Platform$OS.type){

From e60ae0bf09cdb947050979d47f4b2310982a58e9 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Thu, 24 Nov 2022 21:09:48 +0100
Subject: [PATCH 42/80] update vignette to FRASER2 and set all available
 metrics in plot functions

---
 R/Fraser-pipeline.R                     |   7 +-
 R/plotMethods.R                         |  17 ++--
 R/variables.R                           |  15 ++--
 man/FRASER.Rd                           |   1 +
 man/plotFunctions.Rd                    |  18 ++--
 man/psiTypes.Rd                         |  19 ++++-
 man/psiTypes_avail.Rd                   |  21 -----
 vignettes/FRASER.Rnw                    | 105 ++++++++++++++++++------
 vignettes/IntronJaccardIndex_sketch.png | Bin 0 -> 35651 bytes
 9 files changed, 121 insertions(+), 82 deletions(-)
 delete mode 100644 man/psiTypes_avail.Rd
 create mode 100644 vignettes/IntronJaccardIndex_sketch.png

diff --git a/R/Fraser-pipeline.R b/R/Fraser-pipeline.R
index b7960dad..d6a106bf 100644
--- a/R/Fraser-pipeline.R
+++ b/R/Fraser-pipeline.R
@@ -85,8 +85,9 @@ NULL
 #' the beta-binomial fit, the computation of Z scores and p values as well as 
 #' the computation of delta-PSI values.
 #' @export
-FRASER <- function(fds, q, implementation=c("PCA", "PCA-BB-Decoder", 
-                                        "AE-weighted", "AE", "BB"), 
+FRASER <- function(fds, q, types=psiTypes, 
+                   implementation=c("PCA", "PCA-BB-Decoder", "AE-weighted", 
+                                    "AE", "BB"), 
                     iterations=15, BPPARAM=bpparam(), correction, ...){
     # Check input
     implementation <- match.arg(implementation)
@@ -103,7 +104,7 @@ FRASER <- function(fds, q, implementation=c("PCA", "PCA-BB-Decoder",
     }
 
     # fit each splicing type separately
-    for(i in psiTypes){
+    for(i in types){
 
         # get type specific q
         if(missing(q)){
diff --git a/R/plotMethods.R b/R/plotMethods.R
index 3fd7ea50..2c38562f 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -299,7 +299,7 @@ NULL
 
 
 plotVolcano.FRASER <- function(object, sampleID, 
-                    type=currentType(object), basePlot=TRUE, 
+                    type=psiTypes_avail, basePlot=TRUE, 
                     aggregate=FALSE, main=NULL, label=NULL,
                     deltaPsiCutoff=0.1, padjCutoff=0.1, ...){
     
@@ -400,7 +400,7 @@ setMethod("plotVolcano", signature="FraserDataSet", plotVolcano.FRASER)
 
 
 plotAberrantPerSample.FRASER <- function(object, main, 
-                    type=psiTypes,
+                    type=psiTypes_avail,
                     padjCutoff=0.1, zScoreCutoff=NA, deltaPsiCutoff=0.1,
                     aggregate=TRUE, BPPARAM=bpparam(), ...){
 
@@ -463,7 +463,7 @@ setMethod("plotAberrantPerSample", signature="FraserDataSet",
 #'
 #' @rdname plotFunctions
 #' @export
-plotExpression <- function(fds, type=currentType(fds),
+plotExpression <- function(fds, type=psiTypes_avail,
                     idx=NULL, result=NULL, colGroup=NULL, 
                     basePlot=TRUE, main=NULL, label="aberrant", ...){
     if(!is.null(result)){
@@ -555,7 +555,7 @@ plotExpression <- function(fds, type=currentType(fds),
 #'
 #' @rdname plotFunctions
 #' @export
-plotExpectedVsObservedPsi <- function(fds, type=currentType(fds),
+plotExpectedVsObservedPsi <- function(fds, type=psiTypes_avail,
                     idx=NULL, result=NULL, colGroup=NULL, main=NULL,
                     basePlot=TRUE, label="aberrant", ...){
     type <- match.arg(type)
@@ -819,8 +819,7 @@ plotQQ.FRASER <- function(object, type=NULL, idx=NULL, result=NULL,
 setMethod("plotQQ", signature="FraserDataSet", plotQQ.FRASER)
 
 
-plotEncDimSearch.FRASER <- function(object, 
-                    type=currentType(object), 
+plotEncDimSearch.FRASER <- function(object, type=psiTypes_avail, 
                     plotType=c("auc", "loss")){
     type <- match.arg(type)
     plotType <- match.arg(plotType)
@@ -988,7 +987,7 @@ plotFilterVariability <- function(fds, bins=200, legend.position=c(0.8, 0.8),
 
 
 plotCountCorHeatmap.FRASER <- function(object,
-                    type=currentType(object), logit=FALSE, 
+                    type=psiTypes_avail, logit=FALSE, 
                     topN=50000, topJ=5000, minMedian=1, minCount=10, 
                     main=NULL, normalized=FALSE, show_rownames=FALSE,
                     show_colnames=FALSE, minDeltaPsi=0.1, annotation_col=NA,
@@ -1429,7 +1428,7 @@ plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE,
 }
 
 plotManhattan.FRASER <- function(object, sampleID, 
-                                type=currentType(object), 
+                                type=psiTypes_avail, 
                                 main=paste0("sampleID = ", sampleID), 
                                 color_chr=c("black", "darkgrey"),
                                 ...){
@@ -1543,7 +1542,7 @@ ggplotLabelPsi <- function(type, asCharacter=FALSE){
     } else{
         vapply(type, FUN=function(x)
             switch (x,
-                    jaccard = "jaccard-intron-index",
+                    jaccard = "Intron-Jaccard-Index",
                     psi5 = "psi[5]",
                     psi3 = "psi[3]",
                     theta = "theta"),
diff --git a/R/variables.R b/R/variables.R
index 61f821e1..214c283d 100644
--- a/R/variables.R
+++ b/R/variables.R
@@ -5,18 +5,15 @@
 #'   # to show available splice metrics:
 #'   psiTypes_avail
 #' 
-#' @export
-psiTypes_avail <- c("psi5", "psi3", "theta", "jaccard")
-names(psiTypes_avail) <- c("psi5", "psi3", "theta", "Intron Jaccard Index")
-
-#'
-#' Splice metrics that are run by default
-#'
-#' @examples 
 #'   # to show splice metrics selected to be fitted:
 #'   psiTypes
 #' 
+#' @rdname psiTypes
+#' @export
+psiTypes_avail <- c("jaccard", "psi5", "psi3", "theta")
+names(psiTypes_avail) <- c("Intron Jaccard Index", "psi5", "psi3", "theta")
+
+#' @describeIn psiTypes Splice metrics that are run by default
 #' @export
 psiTypes <- c("jaccard")
 names(psiTypes) <- c("Intron Jaccard Index")
-
diff --git a/man/FRASER.Rd b/man/FRASER.Rd
index 170a6271..a9a0ca97 100644
--- a/man/FRASER.Rd
+++ b/man/FRASER.Rd
@@ -11,6 +11,7 @@
 FRASER(
   fds,
   q,
+  types = psiTypes,
   implementation = c("PCA", "PCA-BB-Decoder", "AE-weighted", "AE", "BB"),
   iterations = 15,
   BPPARAM = bpparam(),
diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd
index 1acf58c6..686bf15f 100644
--- a/man/plotFunctions.Rd
+++ b/man/plotFunctions.Rd
@@ -27,7 +27,7 @@ plotManhattan(object, ...)
 \S4method{plotVolcano}{FraserDataSet}(
   object,
   sampleID,
-  type = currentType(object),
+  type = psiTypes_avail,
   basePlot = TRUE,
   aggregate = FALSE,
   main = NULL,
@@ -40,7 +40,7 @@ plotManhattan(object, ...)
 \S4method{plotAberrantPerSample}{FraserDataSet}(
   object,
   main,
-  type = psiTypes,
+  type = psiTypes_avail,
   padjCutoff = 0.1,
   zScoreCutoff = NA,
   deltaPsiCutoff = 0.1,
@@ -51,7 +51,7 @@ plotManhattan(object, ...)
 
 plotExpression(
   fds,
-  type = currentType(fds),
+  type = psiTypes_avail,
   idx = NULL,
   result = NULL,
   colGroup = NULL,
@@ -63,7 +63,7 @@ plotExpression(
 
 plotExpectedVsObservedPsi(
   fds,
-  type = currentType(fds),
+  type = psiTypes_avail,
   idx = NULL,
   result = NULL,
   colGroup = NULL,
@@ -89,11 +89,7 @@ plotExpectedVsObservedPsi(
   ...
 )
 
-\S4method{plotEncDimSearch}{FraserDataSet}(
-  object,
-  type = currentType(object),
-  plotType = c("auc", "loss")
-)
+\S4method{plotEncDimSearch}{FraserDataSet}(object, type = psiTypes_avail, plotType = c("auc", "loss"))
 
 plotFilterExpression(
   fds,
@@ -111,7 +107,7 @@ plotFilterVariability(
 
 \S4method{plotCountCorHeatmap}{FraserDataSet}(
   object,
-  type = currentType(object),
+  type = psiTypes_avail,
   logit = FALSE,
   topN = 50000,
   topJ = 5000,
@@ -174,7 +170,7 @@ plotBamCoverageFromResultTable(
 \S4method{plotManhattan}{FraserDataSet}(
   object,
   sampleID,
-  type = currentType(object),
+  type = psiTypes_avail,
   main = paste0("sampleID = ", sampleID),
   color_chr = c("black", "darkgrey"),
   ...
diff --git a/man/psiTypes.Rd b/man/psiTypes.Rd
index 773152c5..dfff5dce 100644
--- a/man/psiTypes.Rd
+++ b/man/psiTypes.Rd
@@ -1,19 +1,32 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/variables.R
 \docType{data}
-\name{psiTypes}
+\name{psiTypes_avail}
+\alias{psiTypes_avail}
 \alias{psiTypes}
-\title{Splice metrics that are run by default}
+\title{Available splice metrics}
 \format{
+An object of class \code{character} of length 4.
+
 An object of class \code{character} of length 1.
 }
 \usage{
+psiTypes_avail
+
 psiTypes
 }
 \description{
-Splice metrics that are run by default
+Available splice metrics
 }
+\section{Functions}{
+\itemize{
+\item \code{psiTypes}: Splice metrics that are run by default
+}}
+
 \examples{
+  # to show available splice metrics:
+  psiTypes_avail
+
   # to show splice metrics selected to be fitted:
   psiTypes
 
diff --git a/man/psiTypes_avail.Rd b/man/psiTypes_avail.Rd
deleted file mode 100644
index 6f8455a9..00000000
--- a/man/psiTypes_avail.Rd
+++ /dev/null
@@ -1,21 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/variables.R
-\docType{data}
-\name{psiTypes_avail}
-\alias{psiTypes_avail}
-\title{Available splice metrics}
-\format{
-An object of class \code{character} of length 4.
-}
-\usage{
-psiTypes_avail
-}
-\description{
-Available splice metrics
-}
-\examples{
-  # to show available splice metrics:
-  psiTypes_avail
-
-}
-\keyword{datasets}
diff --git a/vignettes/FRASER.Rnw b/vignettes/FRASER.Rnw
index 11a7a599..4a28fbbe 100644
--- a/vignettes/FRASER.Rnw
+++ b/vignettes/FRASER.Rnw
@@ -46,6 +46,7 @@ opts_chunk$set(
 
 
 \begin{document}
+\SweaveOpts{concordance=TRUE}
 
 <<include=FALSE>>=
 opts_chunk$set(concordance=TRUE)
@@ -175,6 +176,32 @@ intron-exon boundary of acceptor A. While we calculate $\theta$ for the 5' and
 between $\theta_5$ and $\theta_3$ and hence call it jointly $\theta$ in the 
 following.
 
+From \fraser{}2 on, only one metric - the Intron Jaccard Index (Figure 
+\ref{IntronJaccardIndex_sketch}.) - is used by default. The Intron Jaccard 
+Index is more robust and allows to focus more on functionally relevant 
+aberrant splicing events. It allows to detect all types of aberrant splicing 
+previously detected using the three metrics ($\psi_5$, $\psi_3$, $\theta$)
+within a single metric.
+
+\incfig{IntronJaccardIndex_sketch}{1\textwidth}{Overview over the Intron 
+Jaccard Index, the splice metric used in \fraser{}2.}{
+The Intron Jaccard Index considers both split and nonsplit reads and is 
+defined as the jaccard index of the set of donor reads (reads sharing a donor 
+site with the intron of interest and nonsplit reads at that donor site) and 
+acceptor reads (reads sharing an acceptor site with the intron of interest and
+nonsplit reads at that acceptor site). }
+
+The Intron Jaccard Index considers both split and nonsplit reads and is 
+defined as the jaccard index of the set of donor reads (reads sharing a donor 
+site with the intron of interest and nonsplit reads at that donor site) and 
+acceptor reads (reads sharing an acceptor site with the intron of interest and
+nonsplit reads at that acceptor site): 
+
+\begin{equation}
+    J(D,A) = \frac{n(D,A)}{\sum_{A'} n(D,A') + \sum_{D'} n(D',A) + n(D) + n(A) - n(D,A)}\label{eq:jaccard}
+\end{equation}
+
+
 \section{Quick guide to \fraser{}}
 
 Here we quickly show how to do an analysis with \fraser{}, starting from a
@@ -204,10 +231,6 @@ fds <- calculatePSIValues(fds)
 fds <- filterExpressionAndVariability(fds, minExpressionInOneSample=20,
        minDeltaPsi=0.0, filter=TRUE)
 
-# fit the splicing model for each metric 
-# with a specific latentsapce dimension
-fds <- FRASER(fds, q=c(psi5=2, psi3=3, theta=3))
-
 # we provide two ways to anntoate introns with the corresponding gene symbols:
 # the first way uses TxDb-objects provided by the user as shown here
 library(TxDb.Hsapiens.UCSC.hg19.knownGene)
@@ -216,6 +239,10 @@ txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
 orgDb <- org.Hs.eg.db
 fds <- annotateRangesWithTxDb(fds, txdb=txdb, orgDb=orgDb)
 
+# fit the splicing model for each metric 
+# with a specific latentsapce dimension
+fds <- FRASER(fds, q=c(psi5=2, psi3=3, theta=3))
+
 # alternatively, we also provide a way to use biomart for the annotation:
 # fds <- annotateRanges(fds)
 
@@ -226,7 +253,7 @@ res <- results(fds, zScoreCutoff=NA, padjCutoff=NA, deltaPsiCutoff=NA)
 res
 
 # result visualization
-plotVolcano(fds, sampleID="sample1", type="psi5", aggregate=TRUE)
+plotVolcano(fds, sampleID="sample1", type="jaccard", aggregate=TRUE)
 
 @
 
@@ -439,7 +466,7 @@ transformed $\psi$ values to compute the correlation.
 
 <<sample_covariation, echo=TRUE>>=
 # Heatmap of the sample correlation
-plotCountCorHeatmap(fds, type="psi5", logit=TRUE, normalized=FALSE)
+plotCountCorHeatmap(fds, type="jaccard", logit=TRUE, normalized=FALSE)
 @
 
 It is also possible to visualize the correlation structure of the logit
@@ -447,7 +474,7 @@ transformed $\psi$ values of the $topJ$ most variable introns for all samples:
 
 <<intron_sample_correlation, echo=TRUE, eval=FALSE>>=
 # Heatmap of the intron/sample expression
-plotCountCorHeatmap(fds, type="psi5", logit=TRUE, normalized=FALSE,
+plotCountCorHeatmap(fds, type="jaccard", logit=TRUE, normalized=FALSE,
     plotType="junctionSample", topJ=100, minDeltaPsi = 0.01)
 @
 
@@ -470,14 +497,14 @@ p-values and z-scores for all $\psi$ types. For more details see section
 
 <<model fitting, echo=TRUE>>=
 # This is computational heavy on real size datasets and can take awhile
-fds <- FRASER(fds, q=c(psi5=3, psi3=5, theta=2))
+fds <- FRASER(fds, q=c(jaccard=3))
 @
 
 To check whether the correction worked, we can have a look at the correlation
 heatmap using the normalized $\psi$ values from the fit.
 
 <<covariation_after_fitting, echo=TRUE>>=
-plotCountCorHeatmap(fds, type="psi5", normalized=TRUE, logit=TRUE)
+plotCountCorHeatmap(fds, type="jaccard", normalized=TRUE, logit=TRUE)
 @
 
 \subsubsection{Calling splicing outliers}
@@ -495,6 +522,10 @@ $\psi$ values or both.
 
 <<result table, echo=TRUE>>=
 # annotate introns with the HGNC symbols of the corresponding gene
+if (!require("TxDb.Hsapiens.UCSC.hg19.knownGene", quietly = TRUE))
+    BiocManager::install("TxDb.Hsapiens.UCSC.hg19.knownGene")
+if (!require("org.Hs.eg.db", quietly = TRUE))
+    BiocManager::install("org.Hs.eg.db")
 library(TxDb.Hsapiens.UCSC.hg19.knownGene)
 library(org.Hs.eg.db)
 
@@ -519,9 +550,12 @@ the following additional information:
     \item hgncSymbol: the gene symbol of the gene that contains the splice
     junction or site if available
     \item type: the metric for which the aberrant event was detected (either
-    psi5 for $\psi_5$, psi3 for $\psi_3$ or theta for $\theta$)
+    jaccard for Intron Jaccard Index or psi5 for $\psi_5$, psi3 for $\psi_3$ or 
+    theta for $\theta$)
     \item pValue, padjust, zScore: the p-value, adjusted p-value and z-score of
-    this event
+    this event (at intron or splice site level depending on metric)
+    \item pValueGene, padjustGene: only present in the gene-level results table,
+    gives the p-value and FDR adjusted p-value at gene-level
     \item psiValue: the value of $\psi_5$, $\psi_3$ or $\theta$ metric
     (depending on the type column) of this junction or splice site for the
     sample in which it is detected as aberrant
@@ -533,20 +567,39 @@ the following additional information:
     same donor or acceptor site as this junction or site over all samples
     \item counts, totalCounts: the count (k) and total count (n) of the splice
     junction or site for the sample where it is detected as aberrant
+    \item nonsplitCounts, nonsplitProportion: only present for the Intron 
+    Jaccard Index. States the sum of nonsplit counts overlapping either the 
+    donor or acceptor site of the outlier intron for the sample where it is 
+    detected as aberrant; and their proportion out of the total counts (N). 
+    A high nonsplitProportion indicates possible (partial) intron retention.
 \end{itemize}
 Please refer to section \ref{sec:Introduction} for more information about the
-metrics $\psi_5$, $\psi_3$ and $\theta$ and their definition. In general, an
+Intron Jaccard Index metric (or the previous metrics $\psi_5$, $\psi_3$ and 
+$\theta$) and their definition. In general, an
 aberrant $\psi_5$ value might indicate aberrant acceptor site usage of the 
 junction where the event is detected; an aberrant $\psi_3$ value might indicate 
 aberrant donor site usage of the junction where the event is detected; and an 
 aberrant $\theta$ value might indicate partial or full intron retention, or 
-exon truncation or elongation. We recommend using a genome browser to 
-investigate interesting detected events in more detail.
+exon truncation or elongation. As the Intron Jaccard Index combines the 
+previously described metrics, an aberrant Intron Jaccard value can indicate any
+of the above described cases. We recommend using a genome browser to 
+investigate interesting detected events in more detail. \fraser{}2 also 
+provides the function \Rfunction{plotBamCoverageFromResultTable} to create a 
+sashimi plot for an outlier in the results table directly in R (if paths to 
+bam files are available in the \fds{} object).
 
 <<result_table, echo=TRUE>>=
 # to show result visualization functions for this tuturial, zScore cutoff used
 res <- results(fds, zScoreCutoff=2, padjCutoff=NA, deltaPsiCutoff=0.1)
 res
+
+# for the gene level pvalues, gene symbols need to be annotated the fds object
+# before calling the calculatePadjValues function (part of FRASER() function)
+# as we previously called FRASER() before annotating genes, we run it again here
+fds <- calculatePadjValues(fds, type="jaccard", geneLevel=TRUE)
+# generate gene-level results table (if gene symbols have been annotated)
+res_gene <- results(fds, aggregate=TRUE, padjCutoff=NA, deltaPsiCutoff=0.1)
+res_gene
 @
 
 \subsection{Finding splicing candidates in patients}
@@ -555,7 +608,7 @@ Let's hava a look at sample 10 and check if we got some splicing
 candidates for this sample.
 
 <<finding_candidates, echo=TRUE>>=
-plotVolcano(fds, type="psi5", "sample10")
+plotVolcano(fds, type="jaccard", "sample10")
 @
 
 Which are the splicing events in detail?
@@ -568,7 +621,7 @@ sampleRes
 To have a closer look at the junction level, use the following functions:
 
 <<plot_expression, echo=TRUE, eval=FALSE>>=
-plotExpression(fds, type="psi5", result=sampleRes[1])
+plotExpression(fds, type="jaccard", result=sampleRes[1])
 plotExpectedVsObservedPsi(fds, result=sampleRes[1])
 @
 
@@ -619,7 +672,7 @@ confounders in the data. Currently the following methods are implemented:
 # Using an alternative way to correct splicing ratios
 # here: only 2 iteration to speed the calculation up
 # for the vignette, the default is 15 iterations
-fds <- fit(fds, q=3, type="psi5", implementation="PCA-BB-Decoder", 
+fds <- fit(fds, q=3, type="jaccard", implementation="PCA-BB-Decoder", 
             iterations=2)
 @
 
@@ -638,17 +691,17 @@ for a subset of the dataset:
 <<findBestQ, echo=TRUE>>=
 set.seed(42)
 # hyperparameter opimization
-fds <- optimHyperParams(fds, type="psi5", plot=FALSE)
+fds <- optimHyperParams(fds, type="jaccard", plot=FALSE)
 
 # retrieve the estimated optimal dimension of the latent space
-bestQ(fds, type="psi5")
+bestQ(fds, type="jaccard")
 @
 
 The results from this hyper parameter optimization can be visualized with the
 function \Rfunction{plotEncDimSearch}.
 
 <<figure_findBestQ, echo=TRUE>>=
-plotEncDimSearch(fds, type="psi5")
+plotEncDimSearch(fds, type="jaccard")
 @
 
 \subsection{P-value calculation}
@@ -669,8 +722,8 @@ computed as the product of the fitted correction values from the autoencoder and
 the fitted mean adjustements.
 
 <<p-value calculation, echo=TRUE>>=
-fds <- calculatePvalues(fds, type="psi5")
-head(pVals(fds, type="psi5"))
+fds <- calculatePvalues(fds, type="jaccard")
+head(pVals(fds, type="jaccard"))
 @
 
 Afterwards, adjusted p-values can be calculated. Multiple testing correction is
@@ -680,8 +733,8 @@ methods supported by \Rfunction{p.adjust} can be used via the \Robject{method}
 argument.
 
 <<p-adj calculation, echo=TRUE>>=
-fds <- calculatePadjValues(fds, type="psi5", method="BY")
-head(padjVals(fds,type="psi5"))
+fds <- calculatePadjValues(fds, type="jaccard", method="BY")
+head(padjVals(fds,type="jaccard"))
 @
 
 \subsection{Z-score calculation}
@@ -704,8 +757,8 @@ counts and the counts after correction for confounders and $\bar{\delta_j}$ is
 the mean of intron $j$.
 
 <<z-score calculation, echo=TRUE>>=
-fds <- calculateZscore(fds, type="psi5")
-head(zScores(fds, type="psi5"))
+fds <- calculateZscore(fds, type="jaccard")
+head(zScores(fds, type="jaccard"))
 @
 
 \subsection{Result visualization}
diff --git a/vignettes/IntronJaccardIndex_sketch.png b/vignettes/IntronJaccardIndex_sketch.png
new file mode 100644
index 0000000000000000000000000000000000000000..79cca6a809755fb30d0b103f6385d126dba3f472
GIT binary patch
literal 35651
zcmd?RWk8hQ+b%kYw17$}Y0)WN0us{QUDBnb<RA!0C<+Xn(k0y~-Q6{GcQ<>DzyFSR
z@BRMY59gduhY!v;FzZ=sJ$GE!b>Hg=RFD(L#w5msKp@zX5~4~F2y!C?a*q`K9{8kO
zP=Nsg@rFo>zE%08w>^vIfiHF^xX&&OV|pe*aB|n-n~a~15k!M~k8UUeNASL=kWkKx
zuQW(Me+`NHdHZ<ZlPZw(zK@=rCgRqS{&483?C#9A!>wmUc-P*9>&&&`G~I#Cp~fNZ
zbkf9m{4F-PA_M|kZB+mJ#@i5m;P0DvLf)u|H!Sz=u_Io+A3aP)Lc9s_hW!1G!TtN@
zi2p-gJ`+MlygorfB1XJ6qW}MFh|zFX=7LvD-wRM6S*(an$?{jRzu4iL+jnr5(<&6w
zRA}TvhNvJRHd2MlRZq__f%k%kd*Ja<QL0w<vOD<npWjg&mX=lq;!%jMWhT{E7>n13
z;=(Z?$(D$nPu5bP8?La$wx0O$%Ry*os+hS!1s1XISt_--bU_D!7$fe2kkb#B6FR;F
zkG1@YAY-HRRo1TB+GNw-7w%_$HTHgX$dD}3zZ0YiBV_8~b&DwQ)VTddf}gB8@||3u
zmWCl@4+*kQgSdYN`Vp?K#j3gYDBMF>OB+fYu1B&dB_%zmA&_Roe1zOEeSP<KW;(}-
zMG_MeS=`GTa_QsP5*&6(;mD9i2;wq79*fDz4P9T{GBMq9SmOKpANIbZ`~Lkq86Rhs
zCYzWo6C4#{06}bvMB5EAnvZt*h)mswkCO3Rp-t@Y=2t;IGWD2{MwY*0dH?F@$oh;t
z*$Vwx=T3x*a`2{yuD_f9YFV<L5F&(yu)UVw^Q*F)nmkz+!|n~&ztpnN$yd`={GE1X
z;(|Bff_Ilr^#qV@a>NC{CQ1=ZX3Ae5Zya8oT~J{ubgo#$aVlz<QP`_2#)w8)3#
zO(ufy0SuO7jkn#3%ge8zr<y_uys@<79G0(oq7`Gqnq2NdYQ7-+VNqDKs<A(`pzNno
z48v_R?v1Vv&AWElm?kb{Z(U);FpoNUT$qky%<hqJgk`25*pw^dKYhw(sUoRE3Wq?P
zF%Sz7O*!73jUG&$EL|E{OQe5*u>g$YD>7oj6VXyk@;AGDr`9{(ZYN4(`O2`!!-STA
z7ha#vc0YZ2?oIahD%xOwP0vDxXZ9BB&ZHjWU630QaG79|obHV4Gw+|R#UUkM{_}@Q
zgoOU3b`(QJ#(f*Z9&we~LBYyn`B>H?5_qX(WX5o412>ib-hIgEqraEeQ2fO0r&zD{
z0zW3kg@_p^OI6yTf=_+Cg~uW7N)wVChA@i8oCq;757;s66;+#T%;MfQ>0~A@xXGh9
zh%qz*U$C(mj^z!QC1yuHjRp2Vh>CVp22t2S*seFn`|U}*<MXBMY%iX*{-fSeb#O_2
z6v*QM#0E#Ie%exd53k?$Cs0I2ols4V$jn3=E1(!GG5Z~W2Cfn2_xI0!b@a+=6?(rx
z<Es=4zG}E>JImTV{H=%WwTR)o|6vV!muierj|P!=_;*H97E)3MzNoi-eSr?EG{q$s
zrdP=fG(Yk+MXuiBr6$BLQpqs{hOPlyNc%fS6qB11^|wM0eJkkl)2D$+9*cXMIWZ`D
zC8mi{M8NLTkP){rPgGMuA?!!v_*8gUW#z=dk_fM(0&rFv$TPCPzC}KHca6to=%bas
zNYd4zmXDdG%jepPX`)9CH<gdLxbPr=NMIr+sG=&VqjS8!u&q?pBY3wQR|dR&DMQB4
z6ZlGm+57!B+Zv~Vc$)3s56U3V)DUB@9AQ*!m88Ma=x8M`F1NgSooEQrIL&{Qn*>Xx
z%wK{v?lI{SE96F6kLFxkjla<I;<=o4sS-F1IbFzi-Z4jece)V{f(ZNpWYHJwsKWP~
z#HZihUC6>8$0hW)zkRr@SNZUAyJS1fe!KExIxVdSZW-^|@kVlnEQcp48J=m60W2s!
zKEQpFL)F6KGE@+i!>aA{{qeNj^k{xDQW`qgAfW`ug0w=7Z0(~O@wRZhdo&_0v98@m
zghP5fQjZ@`U?+;)9VKBVK34L$0#U_E<?a@R%kI#Iw^%;D@JJ=Ur(0UDiZxm9O|K2F
zj?QAHEPb7O6tL7qL}qekY>>F<rRbCo!TP}WYAM!8IT(J;AI9pqH0UMmCCfnPN@IvD
zd3$|?&&8!`K?~n6Z2IxAe$%kkWmlmC$8v0YPF`@MA5ugQV)2(yfx`ktUn9rGT2~LY
ze;jdwHeJdg_Ck;XRarE{WbV6tqD_iI4kwwc#cjQy)B0;>_GiRg@}Ithou7xbh}NGf
z1~oOmygD()d(y<~(pUH4^PZvE`qvk>P4`{hc!A^po%Cr3AHQAW;s?S6G{(?M*P37N
zJ?MWY_cyU@UTpNAy5kyO)VZHI?<`t5p{rxQ3w`pZLRD^cq4_x(Z{ZV9vx&lASv0_=
zy}5ynde6pGyi62Z>npJ(tM*8;W0L=vZMpOl-!`$d4rCCQZFI|-iT=T&nYFd-W&2)^
z4<_oEZQm*GIwZ;b&388zRtYs4>P^W%d?UW#vPIZeVuj1ZQNLLdZz~tXd-OTt3X`sM
zJDnIvTXzwLT1PJl&M(A{w>$eV;pF_4=Y3?qMaGJZo4#?AyQUjZb`B3MX3A&oC^@f9
zZ)^0d58|F4c_VR`LJAo{R4?SS_oL$&Zc$FFZhDNlXR0!d#!6pY%&uO-!>4RiGg|Fb
z6)G+n%@Hid+cjVc#rfKqS(Hn!Dk-_y_U)}~Y@sJP2%rZ$(;zDqVuJCqR)32vKlLgv
z-3XG3c(dwPo`8QIpw8j(OV2_Ivxcz<`i)pJnDD9Fq4VzmafRWG4Z+*2u<i>^td7g8
zy<dea^<#bJtFg@beTh0E`UziR*mkb#DIuJnfnANNmFAh}O((ody*_o16<ji1p{Md7
z>0MQlpLysYI8KZ)6#EPAN}U6}6Gv4uF?_CX@-(QYCz(q2T36?>X>S<3rk!g#W@vf&
zd}G+cZssX(1jxqSAi;QBejVB=o(=46v$Pc)cUGEDJc{hn#_zSbDTSG3q9vX5ug0az
z{EW-JQgTS2FSm&>a-z?dWJ`a4YJIvM+31h0%w??rQ<=VCgQ?&@N)e<dgXBSgljRUp
z36~wIXa&%|#kY+6esK5R?7EJ$JCQ_AhWTkZom6Fvqu0+@1@5`$lY>n#A4x32eVZ|{
zrP?>2SmN%Jg0wI;nCgLBqyF$=a(PP{C1l|5E}*5a)hAinPRMlg<)$`0&SS6fUhFEz
z%VrcnH)ngz@?6&_Q!LmHUVqa@a?4>A>CRL#zuKF1d*ersQW8qB<~||Z7sHm)|EB$&
z)81Org(;ufV<K_=u6-i?9v)~S*pj2Sh%H%RJ4@Sj5Rt!{G~g6_uQZq)r|FUZl{Ppi
ztl`|}kg#<ix$RpO$%Ckt_V(`-du!`i7G;YtA6Qz9Qn8bjMif(*&%OHI?<;O@b8d%{
z&gLK82cGp#0((Qik}NgMAXqx)_0>tLNfP(Ld=d2f_XFk3E;jP|KBaU_*G*bW>@3x-
z-J$d#YZB5Q-WKbX5=Sht@4ihCaXHUaE>39F<3UugL3ox>4b*YE_MhlKCqL;_31a96
zwhN3PAR+C+)(~a?blhGFtjJ7$`}~0N;K0MTaz(H5$3;7ZNB+%EctcXD`J<~f6sY>R
zhrT}W-oF@7;<4Boksyu*d)J=Sx(HYBLs%Wr?jZVI#S2z}{Ajz9=jp81twhLM<MPwB
zRk5?e8md}dOwJ+O!)QN$M!b5}BAZA$S4N+>fo`A&lEJgre~M6=beY{4(QC*xZ`lM|
z=9Lus9N2s;;`6CdtMv;<@{eT{@X7V)!ZDb!Z0ADaxQlrCFXXdpE3DK%9IaZP-t){F
zs&TmxI2BiA$XMd?wnSk3S1%cJkC$u?WZzl+a%j(f+mI-&{?*3qOJbf!YiVy0!)~Zk
zjR*w@bgl&Prek$VES}Sx^_sExRaGugjK(EI_cq61`C5Kh*gR)vd;R<#(FG{pQV6JJ
zA<@ZT^G5NgBW!i=s$e26LfL=?KWO05Ue9Sbhi~T0UBA;-?{vm`e7UPbhqC;JAVlm)
zbUj@f*>W`Jw>oBIOKWQgd2farbzdBCw%5Fk2K^SHpElBU<mKz1Lg@a^-FVS|9RvQZ
zJwxgcp1J{<uj8no6=9xE=%~<q2SOdN9fr3Vc5tQ<D_h9fQs`URIDtxO-}?pld9}6l
z>c3DQ(8$WIl59t>k{GmqpC@FTIBQFIE`li{d@cu3cn-|#N66!<32JNpA+P?BMWtZ<
zKFa=%uSvWx*&g(bf#+8(7_(|p;YdzfO3MG_wN1{?Hwvq3-SwArg<hT@rNt*KQOPq&
zO3VY)frPk=?>e7S1{BP!SuOlj$#Lg%LFKT`lN95yqJkdy6=!|jOI6F<oA~Lp{Y2l2
z3)9ypOz=u3hI+1BKL4H9p5Ee2)$Q-!$JpVu_LLx}qasp7B|bj=EDmJ_DaF9G=bd@X
zD>JJnl+SntS=y|lDrxN<e6`Ddu{d6R;|j8aug1Nl-z*z9Cqk((|F{>W{{hOSL`o9J
zVvPEo2117m%%Md%U5R49$KcE})nsaQVmah-z4D2$IM(dy7j}}ZhTTFL+YB@sQwqUE
z*Y$_=CG;qJ*LzT0mm-&o@dEMF-Ck2jK*PQ;0>mMg(p5sSyb}<%pjb)x^K5DekrJ7L
zE1xgc@Wm;9%sym0d*y5_a5*wqY@eYW?O&r**a%PBGd$bfgF|y|v<XvF;hq)Tys>+R
zaDW!jJ$I|Np|=Yp5JW_5nYTA*3hqlVox5zN7cogxE(-fd$$oYbYCG6&$0IPGIc(uk
zVkl;42%^EJAcS&j;`OPJfDWz3YZJwv*1fR+cTc1d)lb&XKIIJY#1cE*GFx}ugE%9&
zhjr-VqXC$rZNPiKR%hPl92E+$-K|LZf<B$)P8VN!P_5D&9flDyBH+ZKEXZEMo`w?W
zl%|pHG$PYSAabe4hCSvokpj>XZUA3zWY`Iy@F9ola7*grKl<ZxuTUL<?TkJ)acyl_
zyxpNfo6G-j=(3swowJ;sl6OBV0#HTi{ms>TLaEPQa&o|GGw#k)rI^ND@w{n)<8|tL
z{E-{Fs;LCLC52}WrO+LWXJDiEyh^<G0&NK?Jdv^CacuW^ByGLJ+UfkC?GOXe{zMcn
zFAv<H@c<mv5;M#x(QFyQqd9l6pFcf(b?IbfWutk98{U1Z#K1e}{D%dN;W5&aR|r%O
z(lp>G`>-B;@EYOn^6>#lkcJnH!#2L&I6x~~rE?`Hp0u7K8hJ&mmuH-q<MCP_4BpPz
z$pNg52_U%wltO95?64;<f>zwaVjK44H>!GD8o!Av1xX*^{(=2c%63VVq95qPk=LwX
zvhBhL=xDQ;9%U(0ZUqimeI_q2AN{%+-s(qIlx8#e3tB{#RVE+OURU}D2LWG9Itg-g
z&4!X<iA4viHlEy7@huLQJQ5ArKZ<dja$@!(T_l0^TVmSIU7tz)xx3DsF~j!J#P|O)
z<8|Y-@JnVlWnJj19X`pECmj{fW_Hf$ZHWyGfWfLgo88$Vu%aOZGmv>X?hi!OV&eu5
z%@qD*v_K!s?$VLb915Fmwhc{M2l<MJF^C=|U(7-L{f-TiNTL_<T8h(3ydd+8G~ec^
z`jMyS1vz9CALJMfHNztN^PH^k`nx&Wd*rm^%`cAy?@|=PT=$>VF{oy#?KMoRT-CyM
zeNXR3x2tNTqZXy!S`Ez%+jK|94`*JW)@0H#$;}pi?(NH@H0_CA<Z}U$>MtSl{u+^y
z*7BW%y60)f(c$A*rW=py;k)mO{<a78aFEk^?d}!67E>FWHR--*Ic}=1N9g}${|zLE
z2t+xSI3+*Jt%DJ)&x>`;EwLO{dY&h$^=hOlm||0rjGwWI74BnXDO0a(6{u2P&nC0o
z6Ct{e5PEtU>?x^(r;${g{LHe^JFnq(kz`wSW47jCI794Yub&H|!iTu^<kX%e)&1Cm
zJ+Grlge7C`2m?MYwej%s9+tgGreng{8~c{<2_aa#V)_k8_u*5xd^i0oqBE99zNLE&
zP?nntD3rfL!Q1u102k2G<Q(BeEch*JkS^`{5Os=?05angB!6t~6L*^D6$lTsfuW&f
zf^+?Yisfz`aEunwbcHlK`0c#mxSZTa<qWV%`w2q%a`2+1+t0`zjNTClE7}+;kzU*V
zYRCJ|!DTHo19gWtoQVitwU(EcXEpx2wXH3ul`NT{swzD@+a-z#019cg;8eBzE;RF`
z{aQS=TA7lxv{y7V9nH-z`T67RN(6(px6|YWPh6ZvJw`W2!}B$LNw7A<0Lc|NeX<k3
zzdP%|thXG8l!g!rWVrPCC@L<lZoAK3nXJ*Dn`(!5uQ;sr<!hE?_>S!_G{arj$bQ~m
zA6kAt8%0;ef74Fk8_PKw-|graLMB-MN<gnd1mzHuK-bKQs=sHBI7-`mvcN|;MWgo9
z#{^!d2_{OLsnROPjmPl*TYPbG@rJA4NvGNLpX^hZV8R~(a03~_Xjs>_;lk8V;UOGD
z2q955I%dbDD|5Dqb_ZRce$*&_k3m_hSz;<CDymcCyt5Le?>w@2r5W{tkMCn2=^Zi0
zVSzqWAzexAc=Ke>dMb{K!cBzo+L1SZ<s8yA&<}Ook}_w4TMN*u<t6J@n-9L2N-ruZ
zs&U!-t>56)9mxp$M3|rkvPR9(vkwt|ezaEOsGQav88zlOkCqmJi4Xn7tICFkKy7n!
z+^hN-6Em5m_;MO`9jF3?Grv$w(4j<8r4(KdH-|ZGFg;Hv^NiaaFNvwO)Yw00-ZxpN
zO1%+qkf&oRllbL%wNrtHOP;4K(oT(^ZNeI3ODR;fe6~l;sI#tLLUn_lW;55rg)D&v
z99AW0+iN3_V#&w2yMg%&x^C^KpFXo=`PxqOaOj<<Hp=?1{rxN?<Oz$_j(SflyV)Td
z)y@^w2A9t|DOxTXW7c`5Qf>t(hCd}E69VI4u$7Gs9`k``Q~d|9#1BR;DmfkMdXl$`
zSSGi=)I}na9v(nnyL7R*|KlO>%1>XKjMAr{YtrvL`25Jq_NdpW%w<mx)482oAhX7L
zl64EsT&m3_is7I@QH1U7=|TH&g`NC#X`np;ykF*?F{a+-tJ3C(<OrD~LC5N}wD-;a
z*mZ6vmLpk@;*fhWKc26BLbiXyKcT4nAy@6`?r!8&wREDA=83TcBp1MTi0``H4TauQ
zkYJ4}vf_$gQwhkX4JVwC33x>N%>D_WPGi!mow!7bo$)%76{ODm@{~<!yueHy({y{b
z+l4i0QHiqEync*y&TWfvD)(4FXs|t$qQPZvE;Gq12%=lWZ@2d~@r`B(&feUcy)J@!
zw5-u%yp)*MFPTH$q5#(jUD%4Q7V(so)|RthYl)J>MP{0sn`tK)b*oh>!ored?>T;y
zk?E>{D+!#RNxg+a>%-Um{QZZR5?Bm<Y*!|VjI+aLDIY#i%BFdS;0>qP582!tC5>Ld
z+v{h^*}v|@f7PoyWYS-aLlQ*=Uf9Lu0b8A<>F-4sn2&ir2zA~aOcf8Ol6?#`3~7CR
zI5CH%hM*A9Os!;KjGr%^9<4d+CFXCQA3Cj)Wa9<;nR)y+Q|u{JEG&i2J5wr8A3aJL
ze$<tTke{>&6e@lL<>SDnv&X*}r#FGL!VcGVF8(GB5XVv$_K3av&xk>wRH*-PE?gnf
ziPB0~731r49|eWOcINz`o#O6zSl+BZQMW*^4u_0agI8!RUV!ZM`lv}a9vIYiBJ#fw
zf<*~G0j7zHgx0$$-`s($?Y)F7_SG1Un!dE*1%<25&*HUg7v)kQxBwrrcXm$Xb=sUN
zv&xiDlB+2NHwHhsKS{9v3>8^wZ?3KZDyWa8)phMLxjFjzWNSDy1gOSMAx}MUAs<08
z!F!W|ipq4V)S_ST?%Ep(`E+O6Wqlx}Go1P;p(pLt>({GwXA>7CEv`pjE^DMCXTVMS
znUFtRp;Wv_<dzgVx#8?I7v0R}Pr>c=?8_*$;`&9M4*$iI(f4MA0@oRb=_au6PBqO`
z%QDlXaVDF}c1lW0rhV~}ud6{^#G{hgKRJ<=kVw{bJKr~~r+<Q6kv1R}kPz#hUQ<og
zm(X2-^jBuhD)_Q`YbW-e;La1Cm@OrZP(-uTLw`BU8;7QQoh4Y9#l{>*)EZy;jQ8+~
z&}wSbU!O;;W%F6?ynoOiUa>)$d$&I8E0|wHOc2&qkj}qLv|E<=q{DPo=Rwy(+#ly(
z;vrwXu55a?22@G$UiNp)TOqXoTGIHtHG@$8oHI5dTBn{m(9U0~<K*~(=Pr-)B2Te=
z(idF8W%nn6Lo8VB4&oGZ)vJdqY=dXR-BszRPO0lMNU=^Cjrj`vZuPPrTCd3%2BT^Y
z%p1}h7I`@p0bB86vG_B}Ve#jlm>MA*xobI|FSE^<`R#oWvbR;_J{z^prLFE+kyj78
z8O{4QyeD*X*5$}l@^ji09i&c^jVoMBGJDMnZx7g0BZo<m;e8BEP#+WHIWL&~yzn;r
zP|8EBm<=nG0$mf_3_pwdUP!uB-HHaL=SIU6G|CQ{gjQDeTM}n4eDdOt<3+x9?qTSY
zQPsK8+*0T|HPhAk$9n9>YQ$8&3uj-R=(qtsxn0yxVHYgl-L#B7Y(mb9Kb9Vxlxj>_
zfZx;4SikV}ZmZ1-tDPd|^>t%U#a%~d&gM&G7^bQ%Cp6eLCwpGFxg+hDQgglmp5NQI
ziUsvuIL*RI?l>+Xt6=3sLn+O^&>hzm7|=e?$suDt5Nrb%8qSkwBB>46i5wb?H?i7Z
z@Fyq#$=SiT@xxFG*YtV6hL_{a0`h^H(UaE~0qwZk$K737BYxg(mlm_CsZ8wme|GS?
zv8Ne}UuJZj?9WecC+4KR0thU*odRs0Tg%s99TN3;-YWn(OuQ`kF|)+Q(m!MlN)O?L
zOe)K(#Zgs>iLnif-@g|vwPT&-=Rzp8FncLN`AaXba50B1I&sI0F);m<DjyVu5fKl$
zv;uVYxNgmbL*(^OBX)f(6MP<?PI|TbBl)>8+`EfP9sx;cC<YcFLO@{2O0!ohM5p+>
zX%|O+o69hIK=em8GJoR7IbD9x(C`k)4GpYx>E^0Rn`qJQi1Dr&P;j6R4X^0_*3p{Y
zb!VA0DPXOtV#I5kP4x~qw<@iv_DwE3C8c3o$oiv~=cs>0tgJpMf*c#_p-YAb^bv<g
z3f8|_XlROE>`vv|6p_&)ujJcW#SS4!X?21#8muc%q;p{nIsHRlu&^yu;EW9&Tv0q*
zNjKZa23-R-)F``WEj!Jk)E0kyK&7Db^2)CMPan)(6x{$(n;adpXxQF$;BZK=xR^zD
z?Z5|OGl&^LWn(Xiw;HcM8q)@fv{qzv$w5hQi8w!Zw=|ZCxzx~euj{Ag(@JzIw{w96
z^2?P;^huS^{XP3-&N(mj<iylPd^ms}y1>coh_C)>3*jhfl~uFw3fw=$446p7$;6n_
zG#+M7_`z@A2x7b;QW%8UC?NIjaFCK-C&X*`<K*clz2F$9rp`5Aord<forPj|GsO##
zM&i>Kb*{9Z1$-bPzFmL~I)|;g()VBM7ut>Zt_Aq^D*DS^Ed&6D#10Sd>pK|E>>j4$
zB`uZ*f)UWqhL|w-ra0ENHS3=?$^a+HfdPqynr|uY`{2xM70A>HnM2E!sxjF#1ue4C
z^ZKc%HA(|_zE920bf!;FOm5a-Ur;(Qv0m};lw~Oh*lj0W5)se!(xey{W@e(Kn@Jx!
zs2T&6vPE9N5FY6$>SDJhS09j|OviS*T{W8V?L&DzfY*D-3MPn+$3$5NIF|!ZdTW;#
zunW_yQb2!%lRQuTOST40b<1&Yw@V~rBRV64%-%}KRS^*@6^`+*QTjDhuzwX*R2fMy
zY^Hgpq_I5h{oN}#Sh?YbGwY+V^DzmZv-L+ub~r{dCSrwgCAX7<#W@1e%>pa&Z*rCR
zxi^z_ipz%2L;ZIN?2nob#Jy%e?f9G3ipq1U%bW>|Xw3+`X+MJ;bk?{LuzAuacXSQS
zWY;;{^#1g(;e$U=Kml_fQqu_%Ofpp<#d83a(e(oBsWZ4>mJ%H<<5j}1<te|l+lw1D
znU**1=ycn{&3lph$Na8wcEbY59&_K%wLgA3|Nbd^`A{jQ^0!`DRajcua=&0^l6gJS
z-zlfjQI<HKj#Z9F`qS?`o6wYNo6WPrTfL|@|ILeOJ+VK>+<-3Y%F|jk$sKhkq#`lZ
zXDkT{DjZNaba)USrG7^=@H+$<0r~av8CsU8>4n9KE3-LOQ0}pNALGMl#_wp|?S&!f
zF@zN7Xl?T$$+Z0Pvy+@2y{*-Z6Rq#Wr_Wm7U_Vf#QwgS~xB}V7!8F0$d70p{&Hd=8
zV?w3p2VnxxoF~tY@;PmG?@meJ4@`#2tub6LH+p>0E>OL@0Cf8iEfTfol9=vlM9PTA
z)zb33C3;gzu$M!l^wX%VeV*(1{G+9Fas?``8jUmIW{f?ujn|99H3wcm`dh2<$zO4G
z^=mkPaW(h-)5Xf)$eOGse!6vLKy<SH(;d)=2Y-voVUH3uxXoSf2lpWp2Dk^T9lS{%
z0zCWJ>Mo=pby!xMx_5Z?JK2)EH~Fq`vqkgW`qgd~?wFV{XTW-v%F`Mbv<mA^RwGce
z^+2o1RHE74nWEhOHSLQ=1j>jBkOx3Ea*v3MtAXBmxojj44n-R)1;sV&-6i4rLeDxL
zYQO(E#`NZHGuP|;_ha>=cXe)=RPueDJ<Ec)PpEXvC7xcM6s@)Ab9e9Q^EG<Cy3l=z
zsGctq&;#-tnXi>j_y;!|-6giTkOi<>WD@p)7IT1MHA^(=u+L!VXOAO*4zxY^85s>#
z3j15kJFn*~)D03L<6+WPQPocCA7(o(L~5MvPWA)~t>HdJVR*$AJN(B}jc>1S%W#!0
z%4sDf6bQ|k!XGx{W^=p5^1pluQubXe$K+4{sj(}mcy|nL^Z|p=b($nENk)xAZ^2U{
zN@D77W4be3ubrYsm}bw0<*yp9`gC7hsrAfIo;Yz41$s?aP9FzT-X;K*J+sH1g}HC`
z-Idg}JH~y)P%~u>r0Iv!Qjb$%EzeFQ9VZqIllT;VB~6t#m+y>yGszvCvaB(<ci+5F
zeCU#rGH_7J=*Q(NUh9LxelnUtd7Ej&Y3HrB;!$&(GF=wX!7%aC;wCJt#hmiBgy&<j
z@=p_MI1%fa<vWEoHBQEpmZjxWOwmAx+x}VVB`IlC8ulbjAP`5Jf7<~~>R5T&d7^x_
zy2zSCOWTG>22}D+CA1m<EYy;Gpp+yUz({^tg-m&wZNq0hZ9A(`PMn`#jztAvWhXi|
z_R0P}4K3{gNeFemzEaqe{8y;i^1qr1aj#}RKfW0$Co8w9uw~HxHa0Zf+uwh7bQJvU
z+sbBhQ`36C)yLO~R#u>s<zW;dqZT*+cEjBbvlTIq{i`||NlDjpvW&uGX4<2Fv#7}F
z-%8m!rRIwHT4fUyV}Pn$&Qv)FZz0$OD=n?$8reSRhk5n_|NDknrFbZLxRb}}q*>lc
zI|bBwqHy8n>WsoGoR;BCy6f@3S?o%AjmL%k<q@ydSYC4SGhEzI+u7>TJWaOB?TwY`
z3Ol=zEJZD?sb)K$pl3q9U*iQlW4)R}$=`HwnXjy@l-bV4;-p@nE@|#YMNmro=Ub0u
zQ<0pU42CYKJB?MqBO~iQEFJiKTT6$JnS#v!_#XS$4gmi*#Cc!8{%JRG2}lO?YKN<S
z>=of9rOESuv-84;ozH9ljUtJ{vKAKE;o-v_KVMFEQ%Xlhd!~+pmW;0*(XWkxj<mYA
zCa&pvxZIhmULYqcyTTXa>*pn@`SsO1`oOoF;p>I1%>pjTqJm{M0XEb5B;9JMWmC>v
zwlnMV^MzL9O!Y`~K#db!jxkI8=;V~%yx5n(2WZA;uUZlgOUIolj(YtVuz@b&3>t-v
zybWnRO}ba~u(lrPgV!KB5L8MV!h>Q&8+KKfnR@L=roD=0D9eVe%L-;>XQhwFf9OXc
z4QI%1jpfIclpOsM3s#{cfIoowL$Hk$Ra8_Oyga)Ge*XNKx7riKR%SKsb=XOry|GY_
zTWJLykM5b>Y#^0YZVk1NRhmq{hq0smI(%llF6h;bjd3n>B)&`R$r-HEJv&UKx1(bn
z5X2+_PaR#|B<Sshw>P97O@$cjQm_uwxF=9%l+e%uz%Rrrl)Tvn7N6(q(tU_(swZFz
z^qS4>(2a$2mfsjvNn2)4TIyHDRY%c5QDZT%u&jpCK>d>u6BF~C6VVCuu*|sqyQ25)
z?bY6ef6mA8+c5izOJ?!me`6w6wk|7Ou;aCpmOwlYWEIfZ@m0*x@i_5~`+A{9#o-xH
zJd-|@+;JAIGH2(6|L4JgaLU%&di(NZOGH@Mk=|u{;w@AUmy|nM8v_RVJ&YEXmP7=5
zD7?;7Y**a(>ejZmNhv4_v^AanYfHlh{&si7>vn9O^8gojJpW4<NDM^^*($R&E(MCI
zZNGJEq@<;>FfrBgG~No%v6=NP*Qmw(S4KADjP~Hk{Nm!_QNLjK3UI|splA&xjlVW-
z+rLR3dYJrHG%PI4?Rb45*%**q{EQ1PRnsbcrO0RhdE8{Im_0r|R@ZYCJL})7{J8W1
z#ZVQv@A@Zq_r1xI059K`mX>`RQ=|WCn7Y}m@q*!`anK|%SMM2Thy3!nknfw~{~~hz
ze-e(}U;X<KtTv5;2bTCIY>b&!GDPjM?+EpvVQ_%wu27(t<uuN8xi(j1{yVGO9UqK`
z$$!X;^p&s(89qeaBhEaBrdLZ_Mq$1xvTN<pjnzM+3tO-FRwLV>F*Ied%Q>Y!G$rkf
z^FLDXw+@O*x>jAfhZn#rB%!mp683OTqjMuklgvqHeMuAMNghOEtE#HHI@?<dy%7s}
zCvChc=pQX%8->}@VAm>Bjkv^O#kbxDjmJ|KoHOc=|GB{bD?N^lddqH~K<5wZp77Q(
z7qY@ELQ`hfSj2630KNLtH*i*)<m>B7q($BE)O!=G3X=BcG4$LbkeEg2{HXs0M;DFy
z5-Zv3p)>xga;|ZoRE0tj91j$TH_gHWcA?Rjl{Iuh4#+CTz`?;m?r~*JO=eo!zHV;K
zz<|Q#Hi%Fj$dG7~Ul8x2_p9eoq?(g~Hjo;VM99v@#)d*7(5>=xs+7`|4(zh(fqW!N
zaSehHD55pa^gKpk@k2JLp<61%Yh)lj#|$ro+)SD!VQ^$;XJ^VKY|d1%jVk-!R#-U&
zKx|B;(uby}`KYM;?I3)~6ex>S@}@_8^bKjYQ!NEV$(KWS*{;hzl@F5pm5^dPdwQ&=
z%ky({t&EIbjH_s9XlQA5sczCSF-3N-c-X|mgQAfXT3u9uTeMU<9Z2_d4h{}g#Jb0k
z+uPRl<qxo7p7Hj`>^8vy1-97Hk31LDKt=|Ea_T~2$XU+xnrDM1{`0%Z#4gq)=({g4
zh<D@X0+zrOh-|sd^gCJYa_dKy<v{fxQI;>_A=j;Tya0~wmtoQ!S$Eh)52EqjSU$<t
zKm+s+>|u(Cm)C8R9~KFh?RIhk#bFzX-F;M4%=}|Lt!9w@$;im!IBkd@K77Xhx^x4G
zd#<jogT(Ri@d9_(4rQFPLmhEkc70<!?>ipOKJCHuGuoLdlNY>UCy@tgYuh&xpsc+~
zY5qpSh5F!$kZkFoVY9!@T<zprY9Zy%0x#X|fck9_Tq?X!XTZcD0?qP!Wz+w`89UkN
z{3#L(Rxp}5X6gt#2!E&c2aB!8_WSdH0&ppAcPij+C!1o-wByBUQi9mT?B*j`gUg-a
z(3_KS6f8n+zQ1|WuWx8*sNVBRX+Hr3#y~u3nh3#*<#4C1v6G`CW4p{mP#8$z2m!_1
zF&T@vQLLGn8FPn`fG{pUT^AP@m=uS{`3J2v0B$@^w<m$sRC!*V%Eqz>e*0!(7sF*o
zUT4x4@)XFrcUOB276!-m%N;-SU;<>=-u^`(6}=SyUVD4F#bGsufA2V7yFy1tC+iR(
zDHV~<xL6RbpNC|+1qTK3+KM^8;)LFMM5g>qOYiv7uw|+lf{hlG$DqZa&J-A&+G@Yz
z)7aRQ{Hz;h0bO--)A}9nbZidVk|OENw&xOx%tBYXK)YwU&75_W{b3iZxEQZ(CDMWM
z14&I6L)&dK(${!oeZw%!dyclYzkN?dSz&uKRU)6y^DE{aGto29(z-7A5p9m<M#aQr
zkD;1x)_UEzE$rk`adSs^;b&*p+RpN(z>JvcD!`rGrbIE{oL^dc={R2qd1KOrpW<)N
zIGU$dr<ElsCH2ec`%^amE-^8&ldW+I*7sirq(BZCEVWQpOJuirEVjJA|KT*u=eC=<
zf#JS+dDYji=wn??yp&F-#Z##Yxg=&bCS!S;$14hSu<LW7_wU>8x{EaljWa(eC|z(>
zY9z}@Zo%hGKC@D`LVZ(Fx?&5p31JM18HIWZr?{b!G$g$HEb5a3pTn!VLf_|7urE-f
zC%YAX^6u;F%b<$<SYbUWyaMcC-Fc$Sib$1Qz(YfqFlwpGYcQ)UoGGhK*?rfr<?Eve
zwWGtsFKVo|Gu-0h?Z?NCWt>6?tEtW>jWm}};H4aqN#KQ9PE4cNHQbyUhrXfeDI0hb
zo)8@^k(3V5=Jv``sNh_IzF>-~Zseh+x4?&a2n-E}R6?79hGt;L6gSTUHFm%y*w!(T
z$04Jpre@cJ<Y9?UVEkv)AEhI*amUQ*`8|)hT#Ii*<yVNVlizX}A3$VjgJNmIlwAM#
zpy&utMKSAVbnVq&$>|a%JH2G>PK{#H{bgTaJ3CcwL(l$N-}8jP$)JRrM(AvFG<YjH
z7-jv(kB2G#vieNwT3WGM?n<tAw>LoBciTvhie<NGirpN^P8~zFqEikp<Xqg@*-;=&
z-lzjQvw>Z<W(gJL1{t68x_&LX=}#Dp!(~@zA&>1>N!@E6D9M$}>_Cj50}MN%ie0^`
zv2g^;0mfk#+#Tm;_`5hIn`o7eB=jpDNrsbF9`c~DwN2T{;|E(CP;=@jelEg<-0
zGsrNTPPhZ?o5h4RP)%)B1)0r|B}oaqIP7N5_HAfr&>~6JcU>X)?aO2<=cna9YdBl&
zbQTeCSfrFq%KwR_ZUw=1Op1HsxUMcQL7<O7S#N3*wv6TexY$6VkXxYNu)Dvn%WVs<
zmz_>O*2z_4wA>soNYb5sGHvN~eIPDs=5A`w@U;marzFxTQ(NPgwfUxBRhK^we~&7s
z<NW*Ihc8#OwiNJ^-Ab&zGayrS?k=F17^z9Ss8E4NEw_F;S?xrxdV4aizbTHgu9*6c
zhmX(UER5uh8+*Rs%a<ZHO5Pdg(sogu?|JLkiSGRT{08Uc6Zs`?)_UVIr!R-3nMZ%?
z#=*49EJsttac9RsflU{E0QlU=At-0tJ2+^Pnti0`SqGfuYo9Yt5hR&a=2r4`={z<1
zb~p<1F(6-=GBLs@;neb`93WFNF){g~Vn6=9PRGEoJ(8^qqH%Glcs~K!I^65(TbI7q
zMfw0S0fCrixqOnqb~*-9tPkI4PXeEMmZY4V?LhLMyDR8jXqVv4F<#ja)K9Ie{`e71
zdr4G8J}p$g&J9+V*5A7N1w;X-O%==0I$H7?jKJM*7s;FJ9m}9-3+C9g3cbZnxL^Z5
z86)ig!+Ic7$D*$3P+nR4zZu3~8?mn%=5*|4Rldt_8+c)TukkncJ^_^>Qnk%=IdYc*
zBt2OYh9Gjb&rI2vvh2ry>jI_-J!mCLh>xF{oz=8zMbb)9Km=TT$xPX}a!@b=L5Rz4
zF23pa4{=%a0Xmo^sL-hJfC3=E&UgFGxTTMYFE)lVvpINqr$$H1Kz58s=}&x91B!%G
zI)D~s3)a|3ynBc0x#^Eh0)Td;|0T$c?3N=E=mT%xz6F8Z8uSJNk|2+KyBN&8PU(5y
za0_C<P;z(Cj*oK`$zfRWzpfM+7x)PocTDcJL{+)KeUvQn9=v2)TFgOB7N^aThTBVN
zXg%-<cl}QbC}=zBpCZzj^=fI@Z6*po&t+Ya{!z}9Q~b6sfp2+vebhf)Y9V50H&c0l
zM&Xg{9t8)p&_B~Dt_M~yJOyfYj`Y&f(zH)Y%xV;LcE<}Sy0;!sZh$rLnkN+vH9(<3
zgVqvFGa<{%@26~UE>98?5@>^y_*}kg)z#H$Zmo1j(GHhP_T#q@F=!|p9N{1}K0FP8
ze+h6>ubo$%gCzHFixXS_Ie1rQm<ZZQ4J=q4lH5(DBYv7CE53Vb;!e}eaD*j3fhm!I
ztd9hFCdK{yK19v<5WB>Q=qAo=iAb&tQO{N~iIn)(U<OH+H~r2wz#4D&yB>(9LODH$
zL`;~<+zMc%I@a&~DMo+6py>LYAKSuX5gk!<$mKD#PQ~=CQ8@+BLlrzbLIkPFOfuL8
z{B5PpdW%!E$2Ci=*H4|t7}kg@Q4%@2{cXele!hYD+}VAMA+GI<K9yH=iJ?4zeGo`z
z7sqYd&mW(1ItD20tRk`fo*@3=v5Mq)TV5Mbs8X(@_l&dLdi^cq7z2r0bnu`FGMY4u
zmc(s7n4^0|Kw<Ugip~-=rlaTV>V8ZA=K=_%5M}sQN7eHW1wUlKEEg`pdaV?O4vN|j
zQDP+T475&Kb^b1JWm6?`a-A!TV)e7c7sOr}V3qU2eZ${Ck_+h0K94orlqh0((MlrD
zEvcYUt=T!;<g>CfPsW;v9EIg;a{hhWtMgvuD2`s4$xg1M=wH9!&YYYm{(R93CK(%L
z-CKNnz1kIFqRNfTn1`A38m;~wM94?x){4j^gO=0+bvMKiY}-TQUi-+;e8w6j)-)qp
z;GY?7uwBpE!Nt%Jdqy=rVr(CCfz$oA=kKmzuwE39-Hsr63xq?{pJ?spX2c>2=d_P@
z^f^S#8uBzb9SH}(m(4`-HOJyHIN9H%vaTHV(U53!vgbF&n)oWShA@(xkNbB;V9Vwf
znrcGb-kEfP2%y+K@FI^Q2kq}PTY7HQv6Z=d{`Do1lrb>m!W?2!Jm7BdPL!BVvR4_s
zQA7LrVV)iwJ9z*dg+Lltfhi%#P00VV6JA2KJ+~A1c!c~GC=dwHEFsVISRDi_r@x4)
zofg`H@e#H27A!NwA)+q`>(yy_7;Dne3(g!p0r%`{u+05o&TAG2aHtF+=|VncFFswb
z><mvueUNo2yfp|?8c}>BDkQ57d^uU`=fZtXaJB^TRTX+w{?9hQQ4mAtyauL&_(2>`
z^ZpdL;K<rX35G;NTwI#U2cs7pBY7qd&YdtJY>avL0lSIjjknc`n13!Z(m$t!Bv4Hq
z!O6ELU<AZS|KWh!Gri~%uv;$>r+*eH@&4#?zpu1>2!W6TF$l8g#|BDo&xFC^5kH8t
zbl#R?52*7^xb47^ihMx!gvwsv{VBB&?1={}8T-4EKTpAo3)%mdc_Y63AFle}M?<{+
z&y#sQ6v^V>Nhd+9l8xg$z%IulA|iV9=+Ai21khJ3D5$8ZkFcN7stY^J;&k?uc^n&)
z-bCfcj20fAhFg^K7hiWzI;zd>#k`enT?wz(1fOgZgmrfvOIl=Cf^!JokwM)*CB-l{
zDD!0e#3M_N2uf9S)N&^7!18<AdXRXjsM-bxizLITK;BWMgL#xNo?l*$Wh<{0Ao{yR
ztuGE&aBy%iFfcrjB`{59KnHzKnS%E!SF#BWxBbeoH4nY=rOXIM>PjT-pnnUIK=C$S
zVtZFaSb_fyNo+S`(f|hBzf0Vc)$ie5O?$)VIR)40z*4eIa?)2%=nD84QvG0{nN_gZ
z+WbwmiOAAx70zQ<r*8k#xD<O7eWDcW^)0I`oj+%x3S(w-$YO?%aO!}?cl02+wFVJ@
zjIUi`n||$b4GPhdgyv7!p8!=rfJx(l?$dONVETqaNUvJ!n0|~s%37K%6A7=Kjqt@+
z7b<%74~f%Rhpox+_We-xdKt;rnL;VrrfVHe-3*Ru^*zG*wlN9f<Ba+^y(c*RxgHni
z-}02XN*ucxRyQM|H4z^h*-{TzY^aq$BjYe*$(vN|+ttwJ6R5!Xu9mxka>ky_joo+-
zbJH0M#2Y<^O^6=W1zL#CcBZ+QnF|17wyQfKgp1H~bAJNacXxMJQBhGx+0t^80($*6
zA3)nhIRhdBf&_louvi4HgPqKWQZB8m5D*cmYiLN&#_)ocELrnkmv;3R{!Qyq`d*T=
zUoHWc0|ZRL%nb|X=jJE8u(0qGkZv+E+5GzYdg1VA3=BQ}{RN=<HUGyKXpd$eKfccG
z@9!6M-hQG@4FfWEZ1rHkRI1bMwW}N7_;Yslh*<N4(z9da(uxW$_W}S&`{IO-bjA6o
z5sJXWB)Y^bww*7eSR$*8CFbTTJb#2P<m?~Om(XaQ9QICMtRxoXOHB^&EagOCC*=N?
z0(TvXYu}^l%onkJy7yW0!ldNBId3g{Hy1y|+bNn&D2d0x-v|!)b2dL<B4%c0T?N%-
zpbbP1VBk#v1;~I~&;eyOfFoNaM=Y3dG)vK!4k%~xcb<UGWeHl<0gXLZEkCOmK+(MB
z+nX!D(dz1IxmfmWz)eHnoN55=G^-i(uCTIkn3b#j*4a?-SL?eRb|+ys7vea6cWDP2
zru@oQmX{TJuP=_W!!QBh>FMhu<ZKB6EU)dfk8iJV)a&l%OvQDvB``xajyK}lk1n8_
zJf0JtO!(HISLe2^3&a3CRtl3XFbPG<9Xx94%t%IU91_l+6b7MTh}+q&EHls@dQh*Y
ztO^u62-R>L(Mv%Ce7bbh&>z^VFm_|>#CM61kF2MBRWm5?Imk1{)dvHq;-MKoVk0wn
zoIWlzp#-@B31N6gXd~vu1|a(pQYMOug8=Ym<JP571*NED5EE+PlJiSg90JuKgLOrs
ztLE;aA1W*?yl&#=#=9bIS?6`btIZe;!=J>$WRH45#{I#&m=D{(%wCzBCRmwXY-oMm
z+)eUsXEc{+RH3dK%sexi)uanx$FuFp=DGCXd+bi1R%6V-L9G6r>5AdC^v|O?s=1&*
z;<7uFf+Hy@88Hv~D0_eF0GqP|6gL2eOe0r^$xK;iy}6iRNbgsazQ+Ly<wjEzph85&
zKYH&SuO&g{vSUv(wcN{{06-f#?&X4`FCjT;`rj&a$4XYy4gQ&UN_HcF?Lu!C=ATlW
zg7HL^R-NT(^Kx-YCwN<e@x+u?Klsy!!Y)(&=ycNX!}+9qI&LmI6&{|{v>7OUmXzl2
ztnJaLj2rxCuaUjeVi<+Y-oIDi2E3TtJzFUoaLMUfR|mu2!sg~Qlnp@AQTYU#4=@XC
zZEaODJZ6fF#r+ol?Afx~&GBP)sfgI6aG?U(NEXOL2=@Ql=)SuoQZ)e8O)8ke2T{V*
z!G}8D<h+hF-;}AYT%5^Z*``3u*(`?08b-A-JI*V3X9%qICE!_7$;OI4(s5}Mv$Ul1
zv&M8#$(Z4H-ktzA9W(#SsSKd5qf?SEL2~fCSc!7drVYkIiscmFxCSPk7l>N!dbA3v
zEt{uAnORv`OKPI+Y>wN1SW8F4O5-${@$x>sc6^lb+}}r0wF|?a+Tsaa6v??`w?CD!
zO1ejD<NF7AU5ORs0FSh>q<E|!`cjyQguf19+^hRgFoW1(=v5}wYBAVil054ve4HB|
zk~XHV^AH0OQ>=k){KKCw#lOTd*Pr`X;*fGT>g~={h3o0neo|b2mb@=3TVmR~NK)!{
zx-I!}a>)?Lau#EGPq&1SVjq|6YMZB)3P);&3X&!RhnT6b)1U48QoIfHg$zK5m)gE9
z@va*o#U|{SPF2{!0X4S$;&?rftxTP8mc60zf}j6-ui;Lw$Vd$EJm6OOOnkqSJ=3R;
zfYt)!Cy>0PRzOj3;FiaEJNgTi-9X~_w-U}$pzS!bn>c4AxU<<jDsScd#@h9Q_UwvX
zN036>@KGB;F`i&yx;4&DG`M{IQowXYnC)h(PX?w-U%Y%tb}XFc9Y)ij#7Ip|?VK2W
z@Az<~+X-&JJHxYys9&8lBY_rqUS8h%vpM(bgwd%^>XCqZU^z`a+xA7uP1PJgiu3$r
zj*I>%35VqJctZr)^aPNz%!)#YtRj#GUX*#0S)*r`BBYHJ5wz1eP*WQ^$HT*Ox(W_|
z6aSer2BVW?ZCWT^=G>IC+b0PfQuzGnzBv>M?Q#|gz%g+NdZ?h3)<{f9_|CKbYL{<w
zX=y3->K*5wmjVKbv4jj7$?&wXzFmaxI>vG1CnKo1wgYIY(|S%xS2p)_9kd8OAZC}S
zU~@g{<)V#NA~i#D12<I%O&32Icyzw@Fc7!8+N(LLudlCxz{l&nEx|J}J{~oDoC<<6
zgAm&N`vZi4+gZ&5`KefpONb3*A|RNOvlu`StCy|7a9k5&)+`n+t<sz~y`I74inl{n
z%#rebhmHyfWR>>@#VF2n-kZRxtJ56^8ore7K1^lyksWaBGrevuX<=ROz7Z?Ft*`)v
ztO#XVZDj)|d;7({Dy69m(Y`MHWcQQJ^y$k^>Li5^W_E^1v3?f9MX+*JmE-wUP0Jk>
z>l|0Wp+`-WxkIoAkkN3`h-c%rI#=wcsQH|?6|B%iw(yx;U&X}401?f4&H*$a1N-uS
z@bpD#*a7J7LYnIBwdsxJsLr>mJwPoRB5|VMiRsvDQnal-UvIBWp-&CbDo>D5-MZQ0
z>7ofv$%EFacZz9XK)g$%p9_^hw08Ym@DG7}J5XrY?BjE<D`I2~%oSfQulQ4)^NtoB
zXvs`^vG>OpO;Z_cFGyJvu4BPO%t{sDBP=fU+l2s%tW_YmnKDo6a!<_icL4pzNuI4_
zzTWl7^wb^zisQr7Pyr(%h9PM7wgM--xfCMXoZ+Ha_hBr)_bx+4d<B$k6)r*rZw+Ke
z%(f?rREEo})YU{Nz6jJjJgBR!ouD@2+e~no14>xN2XV_O3=jz&r^{_7r61mTCg(n8
zb|ey+T={7tQZbSRGCU(k2;<!XOmWwSJx8hD=sc2`DvG``-M`aBL|s)a-FkhqKPQ4B
z5NGE&^o*{m5N*T(OHLZu`#c87Vekg{pZy1`PuWZrwm@TP-k5-D4LB<hVGDQL$_I9!
zbhA~^(6^iTwyF@EN>mX!5%GVJtzG~QZ1+xhZgl_+m#kDdC;*#eEM1yt>{}C)ReB`?
zNP`;4F;D{S+3M2Lm|BiO+qn|NX~2j&Q1E(R1Z^ju(;t_jZ<P=$a1w0JI^%I0s5T6u
zC%NzP%!WFz^^x&5CMzo9-_HBuYL#100^y5TkW4)yGBT%(WM{U9%fx&N8Avn4o<D#=
zVj%IdvKCtmXB<0Q62CTT!IS5IOR)x|`5A~%Y1bQMn<gXs92WeL93?pxk7;a?6?<fT
z9m|hZ6@yt)7eNc!meD%D=MkHRJ*$*T_s_0s-j$Q+O!mqId~E{C5E!TDVR9%bPqF1F
zok)Me_pe`*)fOZvLAeLGB!G#BcQ^1mkcZ{Z>YY?>fk1<3@y!R#1}TZc;aYs^pFkyH
zM3fAn8Xdo<(fLmPhL?uM97y~lhV9?UW_%$)S1Eem83r6UJ;U<$QT!)2H*nB1fR|aP
z@`sjRBEG9dDHxK_@)*$!9;f{0{QSHJ*y$LL!)lM<?YYRAWYEWtAG@QNGpTsvxgFOZ
zdMXh^Y`!+V*-}taDhA3F@Li?K25=;S%t9e|{{hhy0np@j-si*U)q^gq=^+y&4rDJd
zgFD-UAV4?qhuAnU2!Uq{Abtq{4?e3B03oBG=$DNK&t(9V>&NE5m~S3}nl^gH+>jGf
zF;woLLL?9mF7UpwXFJo4m(`g0uw(}LL5v`1SAQ8Eq5%a`$czrSt15yB8tBik*dYJw
zi5=V01pPwLW8)ccE=*O&obuA@_q7k+1jIlaK?@D;b0J7<&EJO`yvtg|ur6WXM#hGe
zBOYU5(D1__2_l1-w?V@U3k7(f#wWy=PJ{W7Z6dMmf77kR!u%)<m@MJ|9)!FR!vf;n
zg_w*t8Ve;c&^iCLppgHt^eaCea1!pHN$cAp=<2_Q{Y%FGUJh)>KM!_cxhJQE3_&no
z??+%!#Q%D3#uFq2x&FV74!(-u+K3$mob~dvkB<*9u8Z^Y<<-S0e8k5r9o9&fMeXEU
zL+M!hR^Ce4pjKukDTx6$cnKH=sCPLJTmleRIkr|-DJAlrbuvXH;9SdU5@dE$Q!0Hd
z1x8W7G%~@5MyGyi-F6rup9`=c6VIpbueSyhDXTEj(Jhi#FVDKbK{%hPcG?2C@ECdB
zfbsXx4^(VonMD4Wg0djcmNBH4ejjnv2n#>Q^nswk=}NA=+;FU)Y_herYk%a{rF+?C
zaN%?C#ExQFZ$!Z%>)x?6bKNnh3?7tGrSzNRLh8U-3cd#e|FgX5Z7F^yv*wbdJA$j%
z#)Lkg$`D5>G#nL{ub}Y(72Kwt7f7+JGaw$Zva<gA^$Rd9MDqc2aB#5Z7Dx)9{Gb}-
z22ROT0|Ot8=X2QwGzC;tfdgLb*6a-p4{P@S`0)dPM8Lb&Ky6V<M&>>2m7rjf+7l8I
zX_BeRk1z9~AlqG<edoEoy=8x*HH|BH{VYvpiY7k?l;A)q^w+&_%L`l<*ZHko<mBY+
zUN9-A<8{TUU3Gm=r=>RRG0~)`F`=YfM3k&_KV!;erlFarabb1>oTMx>lREdk9Oy^6
zy}khYrxvc~J1ykg(2$TcTsgbRViPQ)uPgQ!pn_L_zKCTx3&NeVGiR=#`|0*TiipJ)
zNZuKvoAv2a-MziKK&&51m&PUI{qo92{rF`W29}Q&!9p&01WLyXq)PziBbEhjj+NEb
z$J?}yi$U`~zDKsxM4{nGj3vO=pjnnesW;B99z47y>F)Nd!J6<8lo=&q$If(!mYF$?
z8fP1=QPA}>c?pn9_6A6FK$%C%&jBF!_%M(<aR7U=Izo!l=IbRpc$UtXT8XKw)&c;+
z*&9GC0o*H<SWA$AN-n;m;bMVj01)D|P(-1Y^A@B81w;j9#BpnkQ46^)PrZPgKtl~k
z8-S3PfsQ6D=GsOr3+$%lJun>Z6P9^U7&oD~28}W*l04IslQ9BQ09wlA<|t?O=|gW?
zFNp{V>yL+|!4q-VQHdWrxnCc&Z?O2rwuMs2+oiw;Rzaz>`EnXJGDUDi3mm5kEA?b$
z?0jCW`pB6N*gY2NIxzBLtixsd3vmihQ4Rn<49F^JN%50u@d1|pQ9KV0R{#@p*_jd}
z%aDm_x|{*!>rwUp*4|qOW!1Lvq7R@VqM#@sA|;`;Al-t3v~+_?w{(XhBGN4&CEZ9j
z7TwK5igZgioNM{*J>NcOX3w0N^T(MvbNu6F^yP_luRE^mS4%Jb_VV0ZIuw;Vws0yl
zM13^lvKSHJ1f|6b##wjpIt}nyPuBMJDVFW!*+D%(e++F(F`ghWlLOiPVjI2RFPdl)
zrt3PsjTIMt6UEN*RBPz_W8*8>*AhkTpn-m%RT2j^X@3#0Ze6-S;@}H8|NUK5R5a5Z
z(jWXKPsaQcaGU%ulM)hS`7`0<M0&m$ASW`1piTc;aeR2Fu=B4SqA#bv3)n_;{)gZi
zA{Olyb`BrAK7v=8muHCjjqG1sksxf(ke%*HDtWp3_A(xd%iudoHbuiE2<YY;(RBm(
zJv;0I0@{mHRdx$d$}di4Q1kMl8yn|flTeAdxcJ!T5>EatL3%?x680xqy-!K@mok%7
z8KOjG4Gd^#hvCdAj%m7NLUld2o#jK|hYv4U9k6zwX^>;xU!P#Kr-Opo5+>I*_55o+
z18*FId;ZgC#4m$270f0niX3TM(A8$0qf3l^ks|C8Hg-wz2quTG<^+$q7!W4X(m$s(
z75J@UaW62IP>QI1wb1k~IJy^j0rl;O>2|X3cBk1|Ly^+8Iio#AVUgi{Ewx3N-CHEk
zCo>N`geD{XK_1-4@k664<4}qY<i7;GBWv4lwCY|puwD{u2~j~;dgSEfsx{o9B`!S!
z?c1Irw@5df&AMH%A4LU&{K%I<OiXNW6g5$4qyJK+RFf;>NriB>P7PO`bbi7uxn>*=
zZzxli<Qj~A5hdGPy2H@TWSlIk_4;1PM~mIEpTu}5i<Y+-t}s(X&2@Ek8!?Vxon7b4
zOix!>0pR1^oq3pa`FaiYfCG4e`ms8{^t~;XJ&dtK22<l;!`~mc038|{rebnDRi9Ah
za(nXW+85H~<{;9A<OGHZxsbxbLf|rCg0SW>+`r$PVsVAj&+*2Jn6>*x-99j}Gs#b_
zh3)3AniFVwm(xw~_Ox_5W-*Z9@#cM$nLHfXMp5d#y4s_>F*^%X)Pt1wOK|YOE;=B7
zX>njcd0D{Y$iAQ+S_@I8^5@=(oaRJzx2&2|LU4Po`2IknT@L<E^XasyX=}Sf83d=w
z5EO_AZLNFPku=$c7>`HSmxR)Sy`YFZGKiUi+hGAW7ZVkFm+g{CnPa&)l$!7Or^0^y
zdO}Pb<230aVQJuvK&5LvDOin4DRcm<J$4Jb=Vsc3T3cM?G3}#KMJUm1$x4^YVn4|R
zUF}s>qcvGYa?7YqM~YI`XMXkf_rw0N%dA!Y91YDxODNT9wdDKGZ$?^r_3pe;5fKr-
zs&3xLB4hEk-A?Oh+XUMp)AmNMF9(+1Y$n*t8-s03J1YI>isa3)rl?R%I6y%F($l`Z
z&|M;3vxCJ^lfq7`;OG<(7#N6wb%}?cAKq5O(%pNq)HK3puiR;s<t}?{plwcWZmy}*
zbQX^YvQnT%g9?sg3%WYMtGG=2Ae+Olj2wU=A>W5`r>N60>xSOZNss*r02Ryj#+QNd
z%$|q;>}<hyZu6kVoZNg3G&UEl9KF63)*pS`1qE)e2CYchtF1#hv34gOES<6~8#}M3
z?_?nqE7}rs2l(E#<fj@+I=bGf=H2;xyu7_;;78DC%ey7xOL&JgIZBGQCPWuFT@%Y&
z34rvmj%#aJ4xu`&ZpeBb&Jnx9nb3(8?(Lb0t{(~coYt9SmV5Af%+~15uAp~ZOtW@2
zLCX`vtko3vAhLwii37ONAtm`0(5~hUwzmiD-7W)|Wpio}Lz#6h@l8S7+wBCEcZHW>
zcP_bzkjApk+7xaCR5cE4)G_VZ;ygE`vufei!@iMSCVx-B`EPO$%?BvPovQ)CLzJFC
zW{L{B5xf);@s`u$%zwOpW_9$|#|Mr!dGpl$Vv<{LVGpI?ca*6RC*G&|0Z02%`Frp%
zUV%=iTUW6uJ*-FK<~oZ6C&(vW?Dro)IW?m3dVXiAe>I!!?x+I*C240no9I)%3-%91
zDD0pS9jNv=c3d|oX=o6(j3b4*2Zj0+hK9Nq{&))_MJ4?CFWK;aqxOT66!rf&2JqX7
zGK0$F$KNN?(y~HBUsEK7O2=3w2+N;e+up45SW$(I&eKz9cY4Z*@QUxfnX}U|r&lL$
z?-h<;uA3#Z?L8ivTS@VJd_y@lF_9&v{cs;H1`ny+A0=~Wt3SU^LGHQz`#WQ<s?KMg
z%|G$hx@-LlniW>Vd`|U4+5Ke;iTx)PcE(A>dTB?}*7Ytg|NLraR68OQxJ^o=s;q8O
zB_Qj6s4G6CGS!oWr>9lhXm`?vD@No}U;Zt#d)&NEsD7`{qEqyut7McIkz3H|$;po(
zaxMG=?XCK#aL16zuEHCrZ}q?fp*{sGT6<bq4V0Nrt_<tl;as-eUmxaktR2lmNAWon
z@bK$7+3z^Kt~~rJ%Bk!%h(5GU;2Y@oJKA4$F`a#8&0W#LsJ3NmcWC+P<J%1#jnO3I
z(-`*+PK(WST|0Na(<J#Z8@0Hg>hS}?BynnwvUb^or%L_KY27o_tQCj9+w7D)bk7d%
zJY74Sa=FW6^N8yV^z5@0K&!?ecUtDL>DDAPtd1Ld&gZ9$IyJ7so4*D%%L@2;4oDit
zlLd52=2bMsh6_r$ru00+Xlaz^-q|ZCe6{Xb<E$yTs<nFf=jw*zOaT|kQoesn>Peus
zm13gR>G!At3fFV%u;F3~fw}@;B4Xn0F)UQkBc#b`Qh0F7^y}BjcrKkg94$3EHMtJU
z&oPHW8tIry$v0p~%FP!zO+vfhUn?F_*Vngy{IZvDszttjdy#v=w%LDDR^sUS<)Hxh
zb?*4U>LII>C?UIA;gh33k|D`&e~5aHL8*zvv?Uim=rYPoKU|yiF#Y>GH(%4GrgG8m
zcMdBz*<E+Dfy(imsNvt=>-ilGCTp5QQU%h}x3t1=+CBfSJ{rsI#8x8>sx2UX+gZdx
zJ~w9D5fx*s7rUMB7ota#A%W%XfrRX3MgIBo$Ap3+gkFioY4P{kva1s7L7Y?XFB3Jj
z!o4-Y<`De-zgaKN*cy)lJH~bV3LC0^D;{q~f9$HpZCo7M&?D!WGXI&FyR<}d;*CP(
z@WcG|mUwYL(b{UtGuya3cX#D@*;l8Xxjr~pZDnxVb%{3V8mTS&qr<T5!E{@f*H8X1
zRRknt>h#Ku{Qg-m7oCw>aNf|WdsEF#yN*QRF{BP<c_Me2LA3WFEqkB&nR}t*n)~O^
zJC6zoqgPm15-rDc69xI{*%h5EF8V&bFDT~6Qr&ZO%6Q&TckCVfW%-z=UUHFA^qI+9
z%8wte2r*<`4k5Un_B3)b-RA-2D~!k-$}11ZZoPXK-lITIPxA(~WWRKeR)kXIGAB;j
z>ap=ggbVj0DubsbIPSs=<Xv5LPDaM}OIEFojMvY^hWW-czqFN2O)f?-lbchAhg)eB
zlA;8@!5n$5*4o;d>9~4%vFD{$wj!^~AGG>CGO`Lje(VZMHx3Tv{IMu=Zowh(L9*+!
zqg=m2&FxAn%W(?_t=oTH9D@LXM4MxbZ$AFeU&q!&MeYnoGsnkqP3Y#0G<Y=HIZ8#&
z*#>t8ky4D5E{Tp0F6q8x-5J%+k^RKhJMT_E$g+hd*Anj-vE5Xx_2kF=hK2O-v6UVx
zLz~Hz=a;UP6Ehfe6z?&F&xl<iK0Dn#55CQId`SM7cI?URlDxW)c4I9$Iu-6mGxL@0
zYCBptQRE=)L-o|y&51i7WMs&ST_NJgP(O}-(R>ooR{wi)ajZBf=C)FvM|;B631;Wn
zRc1iDd#sgE&fakID18P720qofVU1Qy>85>PYZ$GtaC#l5_NNtJ_dLhEUvJWcXDof`
zj{e6oYx=>xuClukf)x|hje@=?a&`FJ7RvS6*56cO<wUGj9ff<$FX{XY8{4ksaaPI)
zG<r<bRdbxCJNlq$Cr$(Dhw9UnYg%HCt#rS#wGAaHT=1?111;)&<7lqE#;6;QL5quf
zoYO?dz=P|S`%cO7+A)V#&jy(_y~jjMx!du=vQ>$6X;<%bQi+oGm~X?b&F{q1=AAKO
z6geXFI7k|+usWUF*f{@zd*l6y*?uuP`*%gi;8MSHv{ZM_j-%e6N#}KaYi15{b+?!-
zTQX5sPqNp%XE~mXSR#`Wz_FiKy*_yTe7Jz>aGP{H*F!q1^`*jQ^YI|r#0DAb3W_=1
z&93{L>yt+Dp{v-tGCLb9(p~$C{E|Ac%22FtKFDo!8n3i9>>jUr!@|;ZVm3u!$bN`i
ziSg*J>DOm5iOj^d&JHH*=Oh)pw3>dTa8ClHYd#!dVeL@}rA!T>LYr?u5XR@|Xrh~h
zbK6a%H}LS9E}Kskcd|_}#VojTe>RhW3o-jzDVfS!V9m<???SGSRe0_Bc$t~ycm`mJ
z=!(P6v0YYKNfvP*d!gQD<%>+luCLU!Jalg1v7)}c9II|ZJWJLR7q!}H!AFMl(Mggp
z{pLyPBW6653ki71f3h$$uUQzRfta|;a`qbi*xk=YRRU9tGj%>qeKn2Y^Fdy=twHxw
zoCJp0RwG)^=;`T?*5I{7$ZOZT2D?j)E)p`Ebrd?X3SE?TKha*OUq5og4obJ{zt}yL
zq+2W-$@XKkBGQa>Q>x(`mW1451wU$518yLG%?K7Yre;~2uh_6}++6`Ff!^Hj2Kw4u
z(Ynk;oUZ1M{_`t=H-^^J6=QlNam)Yvy0Y1^iu0qjilDhRlI`3HX_5LL`V0+&nT@ZI
z7uRn%mzh`p-n_IOcsRJ;Sf=9~5yGuo=cv=Li$Q69PXx<3O9uqTw;vhRL_{F-|4FP&
z10nrH!_W(T)q1U^p^nt(u^sJ$pYLeL#0AIn&0il-9OdJqTyDY@i$TPn$606G+>bGA
zF|xbFJi-&F`Abd7?iDare=>sR`4-x8dB`{J3R%f{d#=Dld*XfR?_Y3`@Xu{D6u7(S
zdzQK3*fQ|c{~`6zWMaECS3}sYau>Iq`=s%c-kL7VbI#*~eLzi4A?<OC5=@<b!vJ;d
zot|vZkMEC7D7xcD-`>(D)|cu6c58k!Z!9p}(r!ZiZkHvs-Xfv>pI`L<%PZ5taw~Mo
zK^)v*vw*qy#n7#utK)W93Cjkzmij^`g;aT~gYXJ-y4n8UT?~bz1`O2fW8}sZZe!zJ
zC^s`iZt65$f1gCsqU(Or1`N*N`1acSN79wY{{}vt<%SZG)zJAh6eT_~u+EL|{Ei0-
zERok~k=-?}7GN|TK|T*z&}P*|2KubX_0vzWo&2=+*!99fwfMmPOs<=Wg*l{>XM()$
zaqwXS^LZQ_g>h!7b$OPI&mpX=eZ^{>5q`1Z^o)$<^Q)-PVc5w8(x9dbK0jJ{A5I$t
zcbb+`Z@M>%l;o$4qg1m|BH>Cm*SuN{aZLHNHQors)5)IdVWPIc6%|^e-QaopM4Xsa
zcfiS&N4v`QsqPt|*}K=yL(|gIe(LJRe<b2WUVDz&_+HJU`F3!L5+CPwmZDb%%$GW3
zDr*X#OUJM*ofDuP0S!abt{dY#laYzEd~n{qNQxSPZEltY0Fs9!m6eq{8&l^mrTaj_
z8aaFhVH-o;!?ZLschxH&X1#cfjK?PP!D8kh|83-(FU$udn*K%RWgSmNkaLy%3mKWf
zGuALmU?ReF{s2Za$UV{x+>D@rz)o4!$b4tKA?X=Nc#kfj$^lQU3b@h(o!;FDD*(ZO
zl&n?;=x`I*fg=)f?Q=(OU~T5l^gPkp$==9k91t<E?NoI&z5blgmwx!gn6&E=a^VZ*
zVXS+T?ngVOr+W9?pn08JUe?Ul<U}(9J@Li5+z-YHBn$1|^XsXIsYS5PFC)X6g<?Km
zUcN7%$g~TFfu2EOIDC<J4zIzo?}>e6yyNA+s!?VZwTchAZmqC;PXYfZgg1j;SA!Ol
z<L`BtLsjh)n5E7u`4#qWwpP(4&=gq$cyheEvaq(65FY-8^H76^h6eCX3BIehAHD#4
zon~$^0LMTHi)upX4)jZ0D|Fy^gkI<H_*i-ccqfzEO-33TgWcsphy;RhJUo~Q+25Ed
zT%P;>;Rdw<OZHV<+;*Ob!-LHk=n*}f0w9=$g6H+OYEE!uCaUSS`%!q_CL~P9?E!=;
zkxgG5tda(kz`#tL`xb?J;<0GnQm-ufH=Kk_XD(%2|Aw<F)>x%t-1%mT-Pngw53H^v
z;lgui19|B0$z5i{Ku09$%JMwf+lCj46Sx2}4wKMTs^WN^%Gckk9RM5wEHN?zAZ=hn
z`>Q1#nbEE3*&rka#AIkA)!W<q8Yvetb?bT!Z~)&?kuBctj^~zlRN&_3Mq76$@MkWk
zrlvyo$C^N^UZ`Ew=Q<Duh%olWi(~@M2)hL+a#XEX8QoJNmVjz-oq%z|(9s1e$JOQQ
zmgDnwZ9qDkVmm<R44^8IP!;x}`}(h67Zvyj6j~51{RN_I8FL4g4KcGOjWu5^9ECO@
zD{JLp^zOKk{AQds09E&TTGTX--SA`F32;OKgp8_b&%(GToo9TBbwe`%Nup(8kt>rL
z1$?A@Am)L>AMnUUTlGB_^)4?(<lQ_xkua43ONwGp30IZGH#IfQQK182ZUn2IkPGED
zV9hG=3SnLH0-L|g3+s%EKS745veXqDTU!RDj6kOC?99aQBe27mmN01)(|Dbo<dn%J
z@P)rIIE2PlIiKLwKyH>wZglN+({OA}Zy_L;5Vt|0+I-Y7<92mr<@kbM-IK4WjH(BO
zNVVh0dUhD#d1zp#&(kHhG|(~QukCvCrSrsDefS>CZHrJc5yyO3EUo&IrN#O(pB{d%
z_E{A!9(#7`b+(<vJ>{;n7Ybmuw6aS6Pe1$^I<&9BIlW$eZEETaOm!8dCr{pipD9Ju
z7kI||(xG%N$!mX^HA}Np^GEyTg>}-U2wfaC-md<HohOQpw@JzRuKz9&TB?2lfTD))
zOZR?xYDfm^YpVQWj@BtNQ-DQs@<gg^=LTxTody`@yOUUV$sk*rY!YU9cBI(UpT>ap
zSB9~`R)i05TaGFRa}6>9Ec`$vC(>11qq0^a?dPzbSH9*P*+hX3e|A{R->LI6HQkIn
zA9_I%Y1in%2sN4wiP#FD^0w+D&ehK<xKQiC$lj-MVG*n-rbobN&O(Y(pu2%;p{~xc
z==j)}me4W4$hmZbOUrOG07Mj{gW0N<B6{)gHb9Tjtq2gJnO<^qTTW7vgy(hgK*wiZ
zrOpDO2H08LL5?)B#P<}l)ILkAVjU1`1X{g8MMLvM^>^L=q_m6-hn7$kWCqzSZNn68
zn*|1k#<-=q!I9k!ge~E=ldz+3^1k?nT4E{`m6a+|-PE~gf+JG=7ha=OCN3`CW&j%T
z!P%*ZxHwnapYI+1m3I3Cp9*^_n<oONdHB=C9(6a5+s??oW3l{s$Fo!}SnJ>2uVmsE
zd;UBWU0fq|Il9Bl!&c4l884-Buj=Y#ofbi6E$-}BKqgzmr3RJS$sb^Q0*WR8t|U>>
zx3#P^*c8H^+MSCV8>}_yOtP|4-q9`-6BC0b4*|=GnNidz<~z@?c_%iBTRZ|14Kw3r
z*#8akmDjnoFiUr0B$p`PXz}X~!Vb))v-cO&RGhOPW=9ti-Q;QxZ9%zMyd83zO;omx
z9dzjFnQ@U=IITyvBYAX!f`W!`lH9laU0HbNBdOJuT)lN`9P<VCwzd=#<~muXb!{8s
zE`&n3L}-n{jbxA%8W$V9aybD+yj7oqf?@@a;KN*miboVv?_;5={D%4HS>}?gkZGgN
zI6?;HmLu)t3+6bq-2&TzagmAgg1eDJW@(tES2(0Vh(x>9)YN3gSuCTbrbZZMC<}3e
zDK%MQuf78_@}m67Jsb?!uPUsfE}CIzHkz+!HPgH_(Th8`G=A$dEPPsu?6($<9o*R+
z($IGV9v&p7>y$iAzr*qnPOupEwV(XK>eezr`!prW`I;&OC0rYzfH3e<Z}6;(i!?Y{
zOby|J^sDm)ws>$eWYpU(0Rj$$Res@Nm``1|(D5pqp5a(r8eL%AKJ(%#se)w}o-!Dg
zdBMa*USsOu_Vw#8q|V!(5+^qYlBAAFcSh6l@bXF)aBBT1U}6rKwpaE}<gxjp`Uv3S
z?B4n;g;_z`4Psu;Q0^{}9@GN0zA^-mdK=R+k7~L9JJU7$GIX(q%o1Ge3ZgVJkNT;J
zW*VjOt^wSqN}3466&Sm|0bn;>5NVi8UJZwa(17exwkBq&`!CH3fGA`7a+)?Ds1=Me
z7S_I+V1Ns{AIncg_1yxe2kdb}4n-N5rH+jsU_;RaXu9mXSR`2QwAo9CB!mWj{`^T>
zc$Gq^7L>UJN{L=)F4-W}{KT{tz*^tq`VpA-PkmVSutnKNw*t5IY$xn`s$BY1rX9c2
z(O{u9vO7}I`MGorR&&Q1cQk9n2(3#`y1423)8!)7S-NQEUslfny$?xb-@kvSYbU0Q
z?>~h^07YuFvICF%L6J<`S+q_4xyz)eo;jQS;`0bv46TjQ<uOw9Tu1Z)BYMJ>2xq7-
zr~b6(;-b@(hgvX|gIh-Rsam=jP7j<crB;Iq7^8eFPN4WPGZ4%lE`rTPF`HIL5)z3w
zbnD#@-{IVxwPb82=*z)Ffzky?)zb7>W+x%ItkS#6Su2vW9t<0cu%G5sp~j)sZ_z2f
z+yk3JNvo}fFM6vgQP|5<#}Bw;Y3##W99o$6z+~oYH5@H7nyTpS3T?GeFOON8hvTIA
z@Cnl4S3~mP$_v?xCM$ZpVVR8TMK27on7xpl%%hYarb%*GrrYY*X60f9isdQy?Vxgw
zufP&V49dO(8q@_epVCnbhQ$xybbvj>;k;q`M{jEUl-Cr}B;1?ZP(RtUvi39&Y|)5G
z>=Yay&xm$s9YOwI;vx~7weMNZ)M{u$%~9$~OK=+A<-tMO(KQS{SdpQ>vJ1<eAJ_hp
zIMvmv+5(kuXEe7a@R<3=lG4%<Gcuf4Kd^{!TZ}vprpo7!@1H7PHmJI=2nW&%pSlvp
z=+WpR!1Yo=o^wnC2{phSZ>d{`-cNKERh7|s@a0R<3G4LwdI7U+%Q=cJH~zBzt5-A$
z?SW=Ep!9%DxEu#BR4h{D!EA>9Z7Q0sHEe$8SxP<eP?nv}n>TOf-7YRlMKNTX-_L8s
z&Zi-W@|P{gHj%ufp}m{3e(Gdr2PDJ5yekAHsCWx`9F>;hkqeBtMgw0LclGCpq`Uve
zk9U9pfg_YgW@=&MR+C=k_08Skm;d_e{o?u$<_$5}cc^WL(zQx_S%;G}i)GVbAE|jF
z*iX$tV_SG;{hAewPZ7%d;1)B@Op1asq;)-o<)W%_a#{P^sKv!aaLu=oE2fGEL3YPz
zwd7lC6B846_Y?RV!y^!1wp0(sU07sL$x$EvEbOIgL;__U`g-zMnfb!vV%D+lA{<K5
zLR<Fp4TQ;amJc|cNbTsjxXRVPzAAq2>FGHdg6p_E&>BgeJa>DyY*@D>(GU)}jE?tT
z`GHOab}X&oF1ZP}9%dyevL%l*i{$oy9aZ_U%xE!D7H{?!!@<yQ2%{PV$l+0Cq#+&A
zQ=O2a=nRDZKw9e_D!-sMvgOO*v)U}B%;0JYX`NEDgsZ@QLtQJxEdT7FKs7&Z*YYKB
z4xH)Oi%@xSf)~r2Jrf&uM%{?33OKJD*49bdyx|F6Z(jY(ssA?t<vInDIui6p%5c2W
zyPGQ8x$wWIy#E+sk(sOZy;>%gz4&IE6}5yCfqXtc!i7T_28t@AqMX%Sp`cRa?rj@0
zhT5F&r#Cs)rK_P8;ca;Y$+XYOfwnML+30}-JdPQQwM`$*tI`juW7#D=3{!*rE5s~$
ze^#e1_bex{kbch@PWUih^imA%HmKvhtVK^tOPdZHiDLF8g0RFbueZAsm|(|bPscXV
zc*$E`B!s6nEX#Hs`?dT96G|rqif+){Z9U_*o0oBLN`7*|^7a);d{`|HbgE15Y^w;h
zNCe)ddW5jP4lR5l5>KB>I!A@ragvadLNThObo7A)xt+9p?Gk0PSPn;0LJjaO`ZMKY
zBymLO+D%Xl_oU-MMwZyRwN@3UGna-hbRhD|2@+U<Sfs6l4QPj0&z&k{u0$`WmvL+w
zG%pd~PPx~gmC%(Wq023Q+I9bHw~O@8DBA1COVFbFQqYU9)R(G69Hx`9BE4!%G1-qP
zin8xVDEj!>+cLJVGq@+SH(t(;D3XVea(5Y2UDaHRP^7oZ_E2Ao4fj_G7O<WDu*$o&
zUg>#SZ67VDin;!gkg+2=z~8VrVn5DN_{$yN8Q>MwoTlFU`c_p}gUX`cOave8HLGDZ
zT)^zMWQeN)cIdf*LnCyAp<fIdT;m1Nd)%0%d1<x%($P%WEb3Bna;h__?eXmvRZ*(s
z?eRJZd(l5%INUXtYS3|pb+#o3OIpck-)ge9>WM!=#tte!m;yvCUyNdQWAc@=o~Il)
z2ogL0M*4}T7fr|xLp>J~oJx1{RzZ6fRHh9o1vs4BGV9aiEdz`4uu9S44+jWap<S@G
zwGF2|f%8SvIARS$Gf$(W360iRo}<<qk|6k-{Mbjism^BR{T&X?p>o@~wyhMFcD4II
z1~bd-Nx3XOb*SX2v+*j2dJXa#!F?5fcKM-3(|D9`JX5RIk2kv|=Yqs%CwAM%8xka2
zfIn5XeqZU85}dqieYMSrQ83=En)9<pvEjc4TcDTOebP3b)jQgUEdqk~JbZkstDbu>
zQ%k5qhpOWCVe~850Vtbeb23-g;BbjakezsR|NOn5Uv37h;%tnTLEZcK1`>mpBEnG2
zQN<gr<c!T0eGJpbz=h@vT2$He7^KwWjaJ!ZRwt@YdOZDjE6f<|x<!s!)37*<+WB=7
zl%uhXT93ApVKW<jHfUSK%t;V1XGx1FN}GV2Og93VN{&xVOiZ5t3B{L7-&C<iO=4cV
zA58nP(_`OeugS}|5Ofg1q%=nsOBhG&v{ZLzJT2WFuglNFPL@|z_@TnV$H!l>&gzWT
zS_j#Go_R2ZFpIR3j~2aJfwo4KZF<N0+FJhUkk38AlFEkz7Q8|CJp%80(=JH`CPIIh
znVGpuJZ040`Upx&)9|d6I1n2!;(o2KSHdHpX>M0|$B_V=sb;?4MaXYT0MBWuX+MZi
zlErbWytoT>C%Da3UwnUyV5j$DJ@r$<DTJr;r})ui9R<=QzTH)w572Z!;B?gbJSJW{
zV%|Tk-+Pqr#mdIUR*e6d&p~SH1y#)y5w4|hqzPm1b4ODL++;vhE1|470jL1P?n8Ac
zib_g`RL~*D3c8hBiR1Ky%clFRRzo)rYO2)TjazZd8;}v}g-$Pviu6%mqFbS~ezUlE
zl~v=9gMmSsLr}15T{U+qdTSC5-yQG^D1dX{V}B(gBBGPsYEr<B*-vJALW<VZu64F)
z4W}(W3vZkYqRt;$_%v6?l*{L`Wq=kMnpsA*U?MQ2Y_Y;lG&8CBz(aBMni%Hb!_3Is
z!CW+^zA6Bq>(WzhZk|ix0sjgw3%|eX`tHz{7V-<NiC#4EVD|PS{|Ef84`f?nVZWcR
z|H^YuCL^k`Z9n_bkJ~XpfP04wPh`ZnI|cmba2&K8lS+DU>e!77XdU}gKs1YLqCPp>
z&q+<$&LI#jU~gdx@*L~^ul(8Av|<nl*19@5d`w%+xy;MPlK5<|S?DYAqpMY))Bw3N
z_z2MX{0Ozg+iJ(4hqJJGJ;ZKq3=!(x4zdnRL09jXtZOt(cPMTgz_wQ>J#bS;Y=LFW
zs8*XCwZ#HV^QQ;`0z6i&d992<fR^hH7FhX;rAj6{iAn5A7@!vz<%~Ul;*(@My^#qt
zj<vwZ{dxEK1}sdJ%Pk=C)+?P?N1}D>%K7>3Q0I9_fAX2>`xA9Ebl7)0L1k%jyu`|f
zq-(uub7xyg$A`E8XJO}&H}=(kFHDHsSvcDD>=w2TECQtR@QQ5S^FreYm!_TZW{=jI
zpar3!mPf>io-AWjwJRnKxG@%IyN^*Vb1-Uuh1K-+Q=3BYnN&M&zg*M0Cxd5>8*@}?
z6au%)0i)Wk#Fukfp7qcQ!IA!<<+j%6`3e#|r&oV}2VbN2xumS5Kf1=r?s$)?%KBN&
zb&1TwE|uc}Qp<!yH;#Sx)bIAg^9{gbAOn`*lgW%DDI?WqZHQ_)Mn3-Bik9xI$|m(Z
zw{X7v2JaUpVp|#4)m>d{S+%7qwSdOXsW_FC%|7jeexEu1_ntg#zsG$Ac<sslmBJ|Y
zN)m`xK67tiP(f%N+CGEP=g9|)_qBG+_!>fv!-s;rTyv&R!Z$1dK0J<w=1a?SzGkDv
z^fdaSH}*vsJoabO(lwy$58i0l@AeSRCs_Yolu^`fJjh>a#j#y=$u2?b65MT%0P~u^
zm#{p1QqK@QhqVUE`8muO5mru4PMEHL!>GA=`4J*T`h!pZNuPZE*sDnC00)G8`P0Wz
zD<?d4?Ykc&5adr~p#Gu|<0PG;k|GN^KM|LEWkYT*TjOZ%lQ<bIZEei=S5GY`_V(-l
zvTjZ~0^Ows^HjH@p?mZj7LNZo&*k432uIYBo|dM~=P6)&;@r?o5yD}B{>*wKIQTIk
z!@FQ|kIi5H@i>y-6}%kC2^UBC)ULiztM(Wfqc}dY5)Kay&6)~E{rv{t;+qa2*G)-+
zHE#1V=ft{~%T`7lUmk89sukATH3e2aG9IX=ov0?O+8Fpvuf<q8W*}3_tNsPDr$=1|
zGV=L@ry%)YW)I678~PGdad>pJ<7r>ISgz{Dsv2y<2QLE4&mH~QW6KFyR)>qGrw4Nj
z8yhtW$JS<NshhjK7D!Z5YeIA`mDpL4DwLI+xA0QA;U`TOJVxQ;!)hAW>g$Z}y>a8E
zp8K>~x|BLz;zv9?`<+5{o8z98-j^>Sj#`^6EH3Vj+tTutbDy>Ps}K75iVVgc=y#kt
zNhzZp$KtMTY86k1^s~hrRKc4P!kZdMOG&lO^=IO<Xb=qK3}#4B1hK!H@G5`8!*fCK
z$i%zm_)TK~D`;#TL}j1Vmg#%#JPFC-J9SUcI$fa_t0}#fi=Hr^{oK<dm}xaB(^Gg2
zC9nw7I8-P6RfN@~K#kjeBAftqyU|{~U(Uq?gCEAe{Ybd&WH$z33Frj=o7R&9g%U=v
zXR~ToFO%A?2E?YY?(H1Lyt2>cdO65vePE*acjLQ6_ve{kkG%bmZNVZUI?febD|`a(
zQ-LIJAVf^9eAQ7xLLaZZ>=_Zhjcs`P#rord3p@QUUk=aASjuKqL`0<OvsjMoEwofk
zTTvWFS>F8HtxY;XS1c3XS-7`*XyMvot#u~g$gZ0B?0g*cw+Ha~Z&Osgd-G-H#(S&0
zf943!mNMff-TQ|dX-BRHY-p^HQ$vd}v%0ERVKov;<yTjy580}ob05zREx=t#U?*g?
z(iLH+y0*sa+1_5~E>*F`*vqF)0WTeWQe8{@H;G(mqRA?8zK;Ll*01bG#FZM?Q7~?=
z^{l3<37zI;>`fLGSPRMYo2`}xS~sH9hGJMZl(KqM?`OWo_Iur~LXy)m^H8!wPOjef
zN<O1f(|(j%T0;lzwz&0_Cq3U0#a$d0PR_N_-FUw<z6$!2UnhF_?re`rcRzg7tm9HO
zsgry4)h@n=Kdf5z*Llp)bXUHZj^;(dA=lFYL9Wshf!`6?V+#dmKUsGU-U#deDbz06
zwdX+zWFb3L_cQPowlxN@VdINiPq>~`OyPkYz_=~4MIrTv&>_vvPy7n6zmFFsC|YM)
z3hPG&UACq#;sOC=ZvVPyC#1od?Qp90p6FX^QPDq3CB%+o0yl5F*sAn=b!k{X_v{T0
z6ZZ8IqTY$frV`6|@6IKtOqag3Q0A3)^~6r;S1Z|Z_h$hzx0$9+kocVvVEmp2V%6I#
z(O?L4gnBiuoa;Y)1B|=0d0g1{4r~th|19;}eEkZI%qey^$CwnR4U|3Oa|fTcW4?6j
z^0*xR@bCY+B|<mDqF3({@#`vT|3Atoi~kpXc;9uH4H;py!8sZ|rj)KJ>1aRiV4rzq
zeV5-^K@S&I1lkkt&k@l^>-)j`(bpsf4Zd0X8TM6QHLI}wao_XI91pfYqtBc-kDZzV
zNoP|#k2_yX`M7nh@EaO!{Y5P`n0>eM@IlvlqR-^{3VVk!QF(~YV8P%{veyB7kG^Il
z!#17o)s@Y7Wx109VKyq>nQ;<}UBSJVxpNJy6U1IiT~!P1#MT8mrD|rPT&L+9aSE$w
z^HVL?_L67M>pd8s4okr|bOYQ#q^p%fWaEmU<-~njT8GWRXW9Fo$zSSNuZeazw6-Z|
z$uBL7VTDM0Xs=$=spD)#Q`A;4-8)DVc<S7Pqr-?C;fRKV_dY*{W?DvC(BW&JU*ox(
z{?&Ab3BwdKeg{kD%uUyJHZ@#C^x~0u9zfn8cZZ0Q&BG*ybIPM$au^BK*at=z#;X&>
ze4<KGYxmxH8^^<^@iWd?^m(n4p7}KAeY44Zs-sc+<1bxpuY88c!kQ<A*Fu{!Lz=am
zkR3n}NPid}VUhE%)Z%fTI<;z3FW6k1zfH(JlEyrn&ZYFQtgf1}IxZUAT2YIb-OSdv
z*}5fd^i<>j@@kjgZ#i5#CsM474%WA2;Xc9Fhp!aroL^X0=}0Ttw+&yaG|tAy{ZfUV
zsYk`V{|>-0#?iL|e&ggvWTtX5CmmB>mh%{zZ+FvP#$uc7+HbAKyt2+J9<u9Bhzs&E
z^FTHbXp~VcylDKMH!H#F`vb8D+2<L0_$%hj0FvorTdl^}jF_r4UEAtCf}8(45!tKl
ziYbyagRh2Xd|J#sx)jiGBU%1#9S!bhVcwk<imyY4ql?Ung{Mc9|NUC=7_Qd8V$YcC
zG6OFU44R*bd+V|@N+qrwOiNe0a<VkPrT8&G^WR_ZM={5$@Ke;@KMCBI=f6~2aWaFK
z+m+CDvQvWILwOvleq?%WaK2mZT4HFh*j;meq!3SNW`cNh5PT11E0=*Ws$PHW9%1Ed
zM5cvhe=vWOA$w(+(19@}$x!{@+4%4Gf>}hGpXgn?CSaw5xBAh)=Jlf)nb;9xhrQKo
z#v@JK>~npIz=ExAEMkLcr(>O;u+o;ZJCX6a1RphfSI4`{_rF63|L%-9Vkj4ofx>|O
z;D1@4+SDw2@44tWP?SpWdnnYmOaGf6vy#AGL+qX{a2mVUoEBDAatqt&K`2lFCc1jA
zeG;};?<@x2z3bSpUIC<~RJSVpFvMvbX*>+~ps!p4XHSdy<(`%Z;3ciuV?oQ9Z(SJx
zJFsfj^T&Ge2mpLYKr6LqZkGfwP^&7^E!Thl12(;|p~^}g53xN+b<;Hx22E-`2Fknk
zJt|0`bqIAjq-A|0c>`HyB97OS!|pF(?tq#dNeK!MzqbnzI;8K8R9Ghsxc~{o+Hg_~
zs^A@knCtKS{3v*BpA`RwhW_OAv@EgsuC=vwtRN-jZ+P^BriZ(GZHF7+l+cX9ma5GM
z!9Y6Zb@%nXqcg#s7v$FnNy>Tgk&w<K9g&&2(3^UsH~t|gNayP--_lDLFGBgH$=&J-
zOrt9EkSp$o-@kuDj1$;(#N%M&E<F--+j|z2C-dsn7g-EVu4nf4_O`agfV?sx77nnN
zlPo-J7mto90*n@I)r4Z!QZFy*U?gMFu1p@3rqFN(ic8h*^vEF_%zG88;DRE%@t=$T
z%xh-=cHJt_fpeuF10ANE`%W!?3;4XKxcf=&#o%B-*pZy8*RFAD6`<g5G{}ITx2>Vi
z7yuG88~{ZXw}0XbLpy2`VAhxB@6giqmPJJV)58?#c#Ge;A8utoZN=^VQZzdQjuh$i
z9biAhu|eH0BQ5=C>hMNQ>GV%O;s?(~7y@q5-#Z7(O@L<4XK1=z4@X}C#YFo%N7cm?
zfVT%)<#_}AJUq|#qLRk})`}Qhu>_kQy&HyRu*&QvSaXovq+RVV0MJ4Pfnq|s7(kPQ
zcK%@0heUxk1HxqZJEUu=Kulwp#H^l%-63@jUF-hV4<uI1GTwfDT!Ux}lKgU8nVu09
zx)XyyqG#W;r}brPl||^xXF>FJBBJ@2YrxI!aD)n1c+R0HS*b;+<=Z$!q@Tf)aW(-8
zl>5v>wJB{jOReY7qw|C6rNXPM`1mona#|wkhf!YXUf^VA(Wy2l|6Irn*uzp^I&D{6
zKUQhSJt`G~G-bpmtylt&2eORSD8lOj0GmDamy?Ek$(+@*vIiPB2UEc0vGoKjPvMVm
z5)gosJX^3A(4*O}7y7|&*ggyXYg$I>f~aO7@S`%gEJw5APYh@guK_2Ju!;duD2O@0
z+j?S~M#Tw}F$Qnz<LpBb)eNW<=mJ);P5i5h8|9$x{q*So&^76xy!>S*Y{>ZmPi~8J
z=n$+3MP}`bCK~7PL27yG3JJI&At5ny&z?*cz4VFmfB?ikB<^@i{VZb(Tr%Ht=*_qM
zRy>J^^f*thg;VZk_w*~3Pwt~BS<fRZLdEL4&~W4Y-n#@@FrHqz=v^BK>y#cgP7=A5
zBJieXBXC9>$0Za0L;~Hxv<oJ!ZOC+-70nO1Agl2MGy$DU7}X$8&|HCOl#UpIhs=s!
zkR?Nk6-=J8OfD_C%C;{VatRC}VA7%q@w-a$rA+F`eXVR5j7h8^;Bl=2G~DEVR3o5n
zKwZ*t5=7Ge2I5QA909GE%plsrOcB#QTM^0jOQ!=Xkw)d~Y0$NppBhpvN50;}vQL+a
z$}6M_6AK2IR6!83<eI_imM!<05dujvh?4Ki#?gf2H$M$CjuFtQu+p(q#C!7*Cc9QW
z7*UZurhI<y-i_x;SL_7+M~`+ANO0DVn370%ul7)o7>MIB>@<@kCfXjPOeLd2V~}-g
zcqbVo*rj;Dg+J7gE6kD9_x1!MN?9vtGCuVcuJ=3>k3<c_<Du+ay+IH1e#jM~qO)7)
zd0d;!lN*6T0yzbQiS_{n72DTc-UYf0a%0y54k01kIGC!L)-4F50=!i8CnQW+`W&u9
zd{Elk3fA|KY{J(>F3UJo{qkQn@tCFSbaMVxw%70B!>lna*8cr+f`PRM;Rmk&`2PJR
z=wIUV2MYD{R26F&=Jg&344_a~&Lp2aR$lBy{PEyzbcMwjNc?u87MP1=rIgGCh&<U<
zduRYo5*aVSCq@Rs1A01PBr7Y87%0cb-r|XorrzcCU*(Il>Dy88NFYH=#RM!!mkd`;
zRfY*En!+n6qNVqQk^T|Cjp|pt9#CZv(B>*<zbby9UHKZA286=YISgHzuqC*czLZ;7
z-^ZO64{VK+ykI(!7J$0IOUAQbGLc-?To}w%lg|C+lrN6>$moE}Hn3m8+X{qmj|gmV
za`iDtmssf(8~B6X0GPI1Gr^z%&PP&C?J5JnQJkKqkK+!5DBc!w;$8Yqu@@1G-{>s3
zBGR|B6jDp?%=LoRZ6=7jV+q4``iHNk67Wudc!t04{>+;?s0`0LQqERf-h?c1gC=l%
zJsUWN2_7C10R|`8rA>Gan%6+x^^!T)ONZPLVP{Zj@}%aoK1^7;c8?$s;s7CA<8fR*
ziaBb?Oppvd%)Cy76)qlk%pSaq#?0`#Cz6{d{OZ@`m7&A!g?Qjk!BrKMIqPum-aUf^
z9c}Hr!l3$0uZ=r!)<clftmL_cx15v|gLY+6e0;pBzMT9ZY|IbGZYM)X4}&bDTET!B
zWPHO$lS8hSqbxb}>!|M~Ag@d3z{kh-+vWc<$yXVG?Fy7Ja9bczVCa5VBP?Rr`fVYz
zNU~syX=wO0t6u$hr02B?;H4k{?-TqLR-P%R=(RSRa}3Q_b$1^=lPb4T(Vxu*!iW$J
ztJDFTxAZo*)#p+6Vm!p1AIokCbDP#Huh_8Nb^;ScxlalEbTOw09P}q)6694tD#Q(3
zG_iB7-Ik$X{0(;;WIdU8TDvk(9^fN|IC7A~Y{Md8KLnGby0NTeAAsL#`W}~YJ}lef
zMcLDSYO9v~G1TO<y|>4euIG<X!Y;LN{8~=J&X{|=hv-}+HMs)J`0VBb4K$VnAWBvx
z6}RULxMdXSS!HEGx2+pw>;zZj4ldZfmjJ4ou`@78UHh7eS$g*~aP@V2qvoJ*%JI2t
z-6C$@pAi(N>=g_dJfE1c<k~-$S)dg;!Pwx0+?eu9CYHC$)+cJ>AK~^8ID((bA;QpL
z4v2K1Mj+<md5zgn28tPSL74Li<e|)A#&3(Hw}UYs1QJ46QWlsJuwU+hJy$9(vI{b<
zz||bEe4T_uBS)ED@+X8cK}2C#7ev4K`}-TUM-1hv$(zx*mYej7;^6>YJ#e{(;B@e&
z!`i6Ike<;-Z>nGI1?KQOSoRhc@T^fz+7^JrpHF<tOkAhH>4CeToT>r#zX~ApFJ8Iz
z+`J$NOVE6X$z?g5R=Q36xjtT7EdbDuGDDK#(ajQ;tqAcJ2hu=OP84e72CSy=ceRjJ
z&IF~FtoYNXtp@h?r9+Rud>Y8od&Z{axg$pqKdVFd7L*f4wdpdVDKRl>;P<650A<_}
z=qCE&XwKUD@;25YE=mPG5zKGWgtRI6XgE*N^&W0Z840&l;q2V`j!n^0auqa55DQ?`
zj^Q1>+ZjF8@ChEUX1@q|xitDKHY*0S$mEaJ(gcH_u}Av^oD8%kQm3I*Vz7!jqS*uo
zqNqgqU@2>^<bgD>MX8+$Ie0Az@7`rHcdO9z+uYlWR*h}6s3KG(FVv|Ss{RT33k$*L
z{!GT8f+)UWU<uVFd@<@0^^hs2-{0QcpcZc_ksgAy#sUK?74RinEM+I8acg<{zlQh<
zs2j`4iSbb=H^h&FLUH}a#s0tjA<_lRdaicW7F4zaX-P>*ds_Z+An-VYbxE>N!zeG`
z4O13+H@F;o>hy70ZSBvlW}MDda&U+Y%6DR+kOzCt-o@#mi-0&A4Ku}4XE&&87&93`
zJ~%Mj9wUEDvX{rp%WEp#Pa@#_8o1Qmve;N5k7|KJA0M%d00yn{Rq$=6!-JFkZvT(Q
zN0a->KI9M497vr437Nn0XgB1-;`zyfaL^r2B()ZnSBf<#JH3&|Alu40VwR$}^jB%6
zA`x-e(Tz5yaWC&l*cn>4N)ixdD>lKD0~vHV$p}{cuRBIEOk{!5K6)RxE)2;Ay%OMk
zhWi{49PDUsKX5u*6AN7e(%}5-D-j7>hMO*qM-m1+FQONXu7XX;49V64omA;rqr)Tk
zB3J)sUkOzngDhfU<u<TS!WacrS)$=&t&3S-d%HAU0|e2m2*DfoAOLVztEV^K)vJ*i
zP}wb!vbV*s0>1ZXr;Lbwqoi9Xx3CjV3UQ##dc_MtJ;tQ)<q7bI27+b(r#H|oH3{R8
z{q*qhRA^Xk?h3pF-T*>--60Npw@ekz=B33&cpm2Fk*ZWQ6Mgpz#Nap}!-s^i+sZ)=
z1B%XcdP+)>sZjX1AS8jSu<Wxmi&91hB!ul;Qw8@nwR~@}q4Y|Ujp<Z<UCux?-~=Lw
z%YgTOJjM|)-*BGiKTi>rlKNO6X$<O<j1KWYkQ`*eO;%QpEYMB$ys04S2#rg@(&0i<
zH<0?UYU61B=eOt*#47!T{`O5|SB!xmxa_DV>|(iTKZNlq5Nt#D1kWRYvSkp0O=3@B
zVEXsV2_TZ(;}Sf`()wXMB;VeXh4M!^;=kj{A!05(2OZYah^HW&f_Cb&%}`Bv2lJAK
zk;3-JSB)Pb%3E@ZH5HBu$!;X!n6zUamzbrcuPur(Q%>&Y|GYd2Vx-*4D-<Sz5(NpI
zffq#C)Vl@v`@`Bzgwg?C0QUUdP7>(j_BN)3`}?~A9QR~ov|H?<`tQWB$KaEP)%c%5
zM@F6;d4hWjMEC!Xo?F5JveF+ue6ANCa1E3rJU!$nfex5c;P$)r|Nr>^#1Nd}t89J3
V6PVl&hP#TAd@A>(K=j4i{{xR`)sO%H

literal 0
HcmV?d00001


From 8a8989c309275a051e1f008d039e2c7ed3a56cd8 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Thu, 24 Nov 2022 22:01:55 +0100
Subject: [PATCH 43/80] minor fix of available metrics in results function

---
 R/AllGenerics.R | 4 ++--
 man/results.Rd  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index 4bfb89fe..e84bab35 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -925,7 +925,7 @@ setMethod("results", "FraserDataSet", function(object,
                     sampleIDs=samples(object), padjCutoff=0.05,
                     zScoreCutoff=NA, deltaPsiCutoff=0.1,
                     rhoCutoff=1, aggregate=FALSE, collapse=FALSE,
-                    minCount=5, psiType=currentType(object),
+                    minCount=5, psiType=psiTypes_avail,
                     geneColumn="hgnc_symbol",
                     additionalColumns=NULL, BPPARAM=bpparam()){
     FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff,
@@ -936,7 +936,7 @@ setMethod("results", "FraserDataSet", function(object,
             additionalColumns=additionalColumns, BPPARAM=BPPARAM)
 })
 
-aberrant.FRASER <- function(object, type=currentType(object), 
+aberrant.FRASER <- function(object, type=psiTypes_avail, 
                                 padjCutoff=0.05, deltaPsiCutoff=0.1, 
                                 zScoreCutoff=NA, minCount=5, rhoCutoff=1,
                                 by=c("none", "sample", "feature"), 
diff --git a/man/results.Rd b/man/results.Rd
index c7dec171..9475839f 100644
--- a/man/results.Rd
+++ b/man/results.Rd
@@ -15,7 +15,7 @@
   aggregate = FALSE,
   collapse = FALSE,
   minCount = 5,
-  psiType = currentType(object),
+  psiType = psiTypes_avail,
   geneColumn = "hgnc_symbol",
   additionalColumns = NULL,
   BPPARAM = bpparam()
@@ -23,7 +23,7 @@
 
 \S4method{aberrant}{FraserDataSet}(
   object,
-  type = currentType(object),
+  type = psiTypes_avail,
   padjCutoff = 0.05,
   deltaPsiCutoff = 0.1,
   zScoreCutoff = NA,

From af0834ae753440d7a97836213b57d61493f6afb0 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Fri, 25 Nov 2022 18:01:10 +0100
Subject: [PATCH 44/80] improved selection of splice metrics to run

---
 NAMESPACE                             |  3 +-
 R/AllGenerics.R                       | 19 +++++++--
 R/Fraser-pipeline.R                   |  2 +-
 R/calculatePSIValue.R                 |  2 +-
 R/filterExpression.R                  | 57 ++++++++++++++++++++++-----
 R/find_encoding_dimensions.R          |  4 +-
 R/fitCorrectionMethods.R              |  3 +-
 R/getNSetterFuns.R                    | 21 ++++++++++
 R/helper-functions.R                  | 10 ++---
 R/makeSimulatedDataset.R              |  2 +-
 R/plotMethods.R                       | 16 ++++----
 R/variables.R                         | 14 ++-----
 man/FRASER.Rd                         |  2 +-
 man/calculatePSIValues.Rd             |  2 +-
 man/filtering.Rd                      |  3 ++
 man/fit.Rd                            |  2 +-
 man/getter_setter_functions.Rd        | 13 ++++++
 man/injectOutliers.Rd                 |  2 +-
 man/optimHyperParams.Rd               |  2 +-
 man/plotFunctions.Rd                  | 14 +++----
 man/psiTypes.Rd                       | 17 +-------
 man/results.Rd                        |  4 +-
 tests/testthat/test_fraser_pipeline.R |  2 +-
 tests/testthat/test_stats.R           | 24 +++++++++++
 24 files changed, 167 insertions(+), 73 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 139fcd83..436647d3 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -6,6 +6,7 @@ export("condition<-")
 export("currentType<-")
 export("dontWriteHDF5<-")
 export("featureExclusionMask<-")
+export("fitMetrics<-")
 export("name<-")
 export("nonSplicedReads<-")
 export("pairedEnd<-")
@@ -47,6 +48,7 @@ export(filterExpression)
 export(filterExpressionAndVariability)
 export(filterVariability)
 export(fit)
+export(fitMetrics)
 export(flagBlacklistRegions)
 export(getNonSplitReadCountsForAllSamples)
 export(getSplitReadCountsForAllSamples)
@@ -78,7 +80,6 @@ export(plotVolcano)
 export(predictedMeans)
 export(pseudocount)
 export(psiTypes)
-export(psiTypes_avail)
 export(results)
 export(rho)
 export(samples)
diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index e84bab35..8a4bac5e 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -724,7 +724,19 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
             !("annotatedJunction" %in% additionalColumns)){
         additionalColumns <- c(additionalColumns, "annotatedJunction")
     }
-
+    
+    # only extract results for requested psiTypes if pvals exist for them
+    stopifnot(all(psiType %in% psiTypes))
+    pvalsAvailable <- checkPadjAvailableForFilters(object, type=psiType,
+                                                   filters=list(rho=rhoCutoff), 
+                                                   aggregate=aggregate)
+    psiType <- psiType[pvalsAvailable]
+    if(all(isFALSE(pvalsAvailable))){
+        stop("For the splice metric(s), pvalues are not yet computed. \n", 
+             "Please compute them first by running the ",
+             "calculatePadjValues function.")
+    }
+    
     resultsls <- bplapply(psiType, BPPARAM=BPPARAM, function(type){
         message(date(), ": Collecting results for: ", type)
         currentType(object) <- type
@@ -925,7 +937,7 @@ setMethod("results", "FraserDataSet", function(object,
                     sampleIDs=samples(object), padjCutoff=0.05,
                     zScoreCutoff=NA, deltaPsiCutoff=0.1,
                     rhoCutoff=1, aggregate=FALSE, collapse=FALSE,
-                    minCount=5, psiType=psiTypes_avail,
+                    minCount=5, psiType=psiTypes,
                     geneColumn="hgnc_symbol",
                     additionalColumns=NULL, BPPARAM=bpparam()){
     FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff,
@@ -936,7 +948,7 @@ setMethod("results", "FraserDataSet", function(object,
             additionalColumns=additionalColumns, BPPARAM=BPPARAM)
 })
 
-aberrant.FRASER <- function(object, type=psiTypes_avail, 
+aberrant.FRASER <- function(object, type=fitMetrics(object), 
                                 padjCutoff=0.05, deltaPsiCutoff=0.1, 
                                 zScoreCutoff=NA, minCount=5, rhoCutoff=1,
                                 by=c("none", "sample", "feature"), 
@@ -948,6 +960,7 @@ aberrant.FRASER <- function(object, type=psiTypes_avail,
     checkNaAndRange(rhoCutoff,      min=0, max=1,   scalar=TRUE,   na.ok=TRUE)
     checkNaAndRange(minCount,       min=0, max=Inf, scalar=TRUE,   na.ok=TRUE)
     by <- match.arg(by)
+    type <- match.arg(type)
     
     dots <- list(...)
     if("n" %in% names(dots)){
diff --git a/R/Fraser-pipeline.R b/R/Fraser-pipeline.R
index d6a106bf..8c1856b7 100644
--- a/R/Fraser-pipeline.R
+++ b/R/Fraser-pipeline.R
@@ -85,7 +85,7 @@ NULL
 #' the beta-binomial fit, the computation of Z scores and p values as well as 
 #' the computation of delta-PSI values.
 #' @export
-FRASER <- function(fds, q, types=psiTypes, 
+FRASER <- function(fds, q, types=fitMetrics(fds), 
                    implementation=c("PCA", "PCA-BB-Decoder", "AE-weighted", 
                                     "AE", "BB"), 
                     iterations=15, BPPARAM=bpparam(), correction, ...){
diff --git a/R/calculatePSIValue.R b/R/calculatePSIValue.R
index c0e9d425..f9f3f991 100644
--- a/R/calculatePSIValue.R
+++ b/R/calculatePSIValue.R
@@ -25,7 +25,7 @@
 #'   
 #'   ### usually one would run this function for all psi types by using:
 #'   # fds <- calculatePSIValues(fds)
-calculatePSIValues <- function(fds, types=psiTypes_avail, overwriteCts=FALSE, 
+calculatePSIValues <- function(fds, types=psiTypes, overwriteCts=FALSE, 
                                 BPPARAM=bpparam()){
     # check input
     stopifnot(is(fds, "FraserDataSet"))
diff --git a/R/filterExpression.R b/R/filterExpression.R
index 23729ee9..9a47b51a 100644
--- a/R/filterExpression.R
+++ b/R/filterExpression.R
@@ -45,18 +45,22 @@ filterExpressionAndVariability <- function(object, minExpressionInOneSample=20,
                     quantile=0.75, quantileMinExpression=10, minDeltaPsi=0.0,
                     filter=TRUE, 
                     delayed=ifelse(ncol(object) <= 300, FALSE, TRUE),
+                    filterOnJaccard=TRUE,
                     BPPARAM=bpparam()){
+    
     # filter introns with low read support and corresponding splice sites
     object <- filterExpression(object, 
                     minExpressionInOneSample=minExpressionInOneSample, 
                     quantile=quantile, 
                     quantileMinExpression=quantileMinExpression, 
                     filter=filter, delayed=delayed,
+                    filterOnJaccard=filterOnJaccard,
                     BPPARAM=BPPARAM)
     
     # filter introns that are not variable across samples
     object <- filterVariability(object, minDeltaPsi=minDeltaPsi, filter=filter, 
-                    delayed=delayed, BPPARAM=BPPARAM)
+                    delayed=delayed, filterOnJaccard=filterOnJaccard,
+                    BPPARAM=BPPARAM)
     
     # return fds
     message(date(), ": Filtering done!")
@@ -64,6 +68,34 @@ filterExpressionAndVariability <- function(object, minExpressionInOneSample=20,
     
 }
 
+#' @noRd
+filterExpression.FRASER2 <- function(object, minExpressionInOneSample=20,
+                    quantile=0.75, quantileMinExpression=10, filter=TRUE, 
+                    delayed=ifelse(ncol(object) <= 300, FALSE, TRUE),
+                    filterOnJaccard=TRUE, BPPARAM=bpparam()){
+    if(isTRUE(filterOnJaccard)){
+        return(filterExpression_jaccard(object, 
+                         minExpressionInOneSample=minExpressionInOneSample, 
+                         quantile=quantile, 
+                         quantileMinExpression=quantileMinExpression, 
+                         filter=filter, delayed=delayed,
+                         BPPARAM=BPPARAM))
+    } else{
+        return(filterExpression.FRASER(object, 
+                         minExpressionInOneSample=minExpressionInOneSample, 
+                         quantile=quantile, 
+                         quantileMinExpression=quantileMinExpression, 
+                         filter=filter, delayed=delayed,
+                         BPPARAM=BPPARAM))
+    }
+}
+
+#' @describeIn filtering This function filters out introns and corresponding 
+#' splice sites that have low read support in all samples.
+#' @export
+setMethod("filterExpression", signature="FraserDataSet",
+          filterExpression.FRASER2)
+
 #' This function filters out introns and corresponding 
 #' splice sites which are expressed at very low levels across samples.
 #' @noRd
@@ -125,11 +157,24 @@ filterExpression_jaccard <- function(object, minExpressionInOneSample=20,
     return(object)
 }
 
+#' @noRd
+filterVariability.FRASER2 <- function(object, minDeltaPsi=0, filter=TRUE, 
+                        delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), 
+                        filterOnJaccard=TRUE, BPPARAM=bpparam()){
+    if(isTRUE(filterOnJaccard)){
+        object <- filterVariability_jaccard(object, minDeltaPsi=minDeltaPsi, 
+                            filter=filter, delayed=delayed, BPPARAM=BPPARAM)
+    } else{
+        object <- filterVariability.FRASER(object, minDeltaPsi=minDeltaPsi, 
+                            filter=filter, delayed=delayed, BPPARAM=BPPARAM)
+    }
+}
+
 #' @describeIn filtering This function filters out introns and corresponding 
 #' splice sites that have low read support in all samples.
 #' @export
-setMethod("filterExpression", signature="FraserDataSet",
-        filterExpression_jaccard)
+setMethod("filterVariability", signature="FraserDataSet",
+          filterVariability.FRASER2)
 
 
 #' This function filters out introns and corresponding 
@@ -192,12 +237,6 @@ filterVariability_jaccard <- function(object, minDeltaPsi=0, filter=TRUE,
     return(object)
 }
 
-#' @describeIn filtering This function filters out introns and corresponding 
-#' splice sites that have low read support in all samples.
-#' @export
-setMethod("filterVariability", signature="FraserDataSet",
-            filterVariability_jaccard)
-
 #' Applies previously calculated filters for expression filters
 #' @noRd
 applyExpressionFilters_jaccard <- function(fds, minExpressionInOneSample, 
diff --git a/R/find_encoding_dimensions.R b/R/find_encoding_dimensions.R
index f3747eab..ad4873ec 100644
--- a/R/find_encoding_dimensions.R
+++ b/R/find_encoding_dimensions.R
@@ -123,7 +123,7 @@ findEncodingDim <- function(i, fds, type, params, implementation,
 #'   hyperParams(fds, type="jaccard")
 #'   
 #' @export
-optimHyperParams <- function(fds, type=currentType(fds), implementation="PCA",
+optimHyperParams <- function(fds, type=psiTypes, implementation="PCA",
                     q_param=getEncDimRange(fds),
                     noise_param=0, minDeltaPsi=0.1,
                     iterations=5, setSubset=50000, injectFreq=1e-2,
@@ -191,7 +191,7 @@ optimHyperParams <- function(fds, type=currentType(fds), implementation="PCA",
 
         # remove unneeded blocks to save memory
         a2rm <- paste(sep="_", c("originalCounts", "originalOtherCounts"),
-                rep(psiTypes_avail, 2))
+                rep(psiTypes, 2))
         for(a in a2rm){
             assay(fds_copy, a) <- NULL
         }
diff --git a/R/fitCorrectionMethods.R b/R/fitCorrectionMethods.R
index 5b87d1ce..1b726403 100644
--- a/R/fitCorrectionMethods.R
+++ b/R/fitCorrectionMethods.R
@@ -40,7 +40,7 @@ fit.FraserDataSet <- function(object, implementation=c("PCA", "PCA-BB-Decoder",
                             "AE", "AE-weighted", "PCA-BB-full", "fullAE", 
                             "PCA-regression", "PCA-reg-full", 
                             "PCA-BB-Decoder-no-weights", "BB"),
-                    q, type=currentType(object), rhoRange=c(-30, 30), 
+                    q, type=psiTypes, rhoRange=c(-30, 30), 
                     weighted=FALSE, noiseAlpha=1, convergence=1e-5, 
                     iterations=15, initialize=TRUE, control=list(), 
                     BPPARAM=bpparam(), nSubset=15000, 
@@ -51,6 +51,7 @@ fit.FraserDataSet <- function(object, implementation=c("PCA", "PCA-BB-Decoder",
                 paste(names(list(...)), collapse=", "))
     }
     method <- match.arg(implementation)
+    type <- match.arg(type)
     
     verbose <- verbose(object) > 0
     
diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R
index cd35cbf5..c0bb9d65 100644
--- a/R/getNSetterFuns.R
+++ b/R/getNSetterFuns.R
@@ -411,6 +411,27 @@ currentType <- function(fds){
     return(fds)
 }
 
+#' @describeIn getter_setter_functions Returns the splice metrics that will be
+#' fitted (defaults to jaccard, used within several methods in the 
+#' FRASER package).
+#' @export
+fitMetrics <- function(fds){
+    metrics <- metadata(fds)[['fit_metrics']]
+    if(is.null(metrics)){
+        metrics <- "jaccard"
+    }
+    return(metrics)
+}
+
+#' @describeIn getter_setter_functions Sets the splice metrics that will be 
+#' fitted (used within several methods in the FRASER package).
+#' @export 
+`fitMetrics<-` <- function(fds, value){
+    stopifnot(is.character(whichPSIType(value)))
+    metadata(fds)[['fit_metrics']] <- whichPSIType(value)
+    return(fds)
+}
+
 #' @describeIn getter_setter_functions Sets and returns the pseudo count used 
 #' within the FRASER fitting procedure.
 #' @export
diff --git a/R/helper-functions.R b/R/helper-functions.R
index ed73f639..54593b28 100644
--- a/R/helper-functions.R
+++ b/R/helper-functions.R
@@ -22,7 +22,7 @@ checkCountData <- function(fds, stop=TRUE){
         if(isFALSE(stop)) return(invisible(FALSE))
         stop("No counts detected! Please provide counts first.")
     }
-    if(!all(paste0("rawOtherCounts_", psiTypes_avail) %in% assayNames(fds))){
+    if(!all(paste0("rawOtherCounts_", psiTypes) %in% assayNames(fds))){
         if(isFALSE(stop)) return(invisible(FALSE))
         stop("Please compute first the total expression at each junction.")
     }
@@ -71,10 +71,10 @@ checkReadType <- function(fds, type){
 
     # check if type is null or missing
     if(missing(type) | is.null(type)){
-        # if(verbose(fds) > 0){
-        #     warning("Read type was not specified!",
-        #             "We will assume the default: 'j'")
-        # }
+        if(verbose(fds) > 3){
+            warning("Read type was not specified!",
+                    "We will assume the default: 'j'")
+        }
         return("j")
     }
     type <- unique(type)
diff --git a/R/makeSimulatedDataset.R b/R/makeSimulatedDataset.R
index f451d5cf..7a1fb6f9 100644
--- a/R/makeSimulatedDataset.R
+++ b/R/makeSimulatedDataset.R
@@ -438,7 +438,7 @@ makeSimulatedFraserDataSet_Multinomial <- function(m=200, j=1000, q=10,
 #' fds <- calculatePSIValues(fds)
 #' fds <- injectOutliers(fds, minDpsi=0.2, freq=1E-3)
 #' @export
-injectOutliers <- function(fds, type=currentType(fds),
+injectOutliers <- function(fds, type=psiTypes,
                     freq=1E-3, minDpsi=0.2, minCoverage=2,
                     deltaDistr="uniformDistr", verbose=FALSE,
                     method=c('samplePSI', 'meanPSI', 'simulatedPSI'),
diff --git a/R/plotMethods.R b/R/plotMethods.R
index 2c38562f..8b57feda 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -299,7 +299,7 @@ NULL
 
 
 plotVolcano.FRASER <- function(object, sampleID, 
-                    type=psiTypes_avail, basePlot=TRUE, 
+                    type=fitMetrics(object), basePlot=TRUE, 
                     aggregate=FALSE, main=NULL, label=NULL,
                     deltaPsiCutoff=0.1, padjCutoff=0.1, ...){
     
@@ -400,7 +400,7 @@ setMethod("plotVolcano", signature="FraserDataSet", plotVolcano.FRASER)
 
 
 plotAberrantPerSample.FRASER <- function(object, main, 
-                    type=psiTypes_avail,
+                    type=fitMetrics(object),
                     padjCutoff=0.1, zScoreCutoff=NA, deltaPsiCutoff=0.1,
                     aggregate=TRUE, BPPARAM=bpparam(), ...){
 
@@ -463,7 +463,7 @@ setMethod("plotAberrantPerSample", signature="FraserDataSet",
 #'
 #' @rdname plotFunctions
 #' @export
-plotExpression <- function(fds, type=psiTypes_avail,
+plotExpression <- function(fds, type=fitMetrics(fds),
                     idx=NULL, result=NULL, colGroup=NULL, 
                     basePlot=TRUE, main=NULL, label="aberrant", ...){
     if(!is.null(result)){
@@ -555,7 +555,7 @@ plotExpression <- function(fds, type=psiTypes_avail,
 #'
 #' @rdname plotFunctions
 #' @export
-plotExpectedVsObservedPsi <- function(fds, type=psiTypes_avail,
+plotExpectedVsObservedPsi <- function(fds, type=fitMetrics(fds),
                     idx=NULL, result=NULL, colGroup=NULL, main=NULL,
                     basePlot=TRUE, label="aberrant", ...){
     type <- match.arg(type)
@@ -819,7 +819,7 @@ plotQQ.FRASER <- function(object, type=NULL, idx=NULL, result=NULL,
 setMethod("plotQQ", signature="FraserDataSet", plotQQ.FRASER)
 
 
-plotEncDimSearch.FRASER <- function(object, type=psiTypes_avail, 
+plotEncDimSearch.FRASER <- function(object, type=psiTypes, 
                     plotType=c("auc", "loss")){
     type <- match.arg(type)
     plotType <- match.arg(plotType)
@@ -987,7 +987,7 @@ plotFilterVariability <- function(fds, bins=200, legend.position=c(0.8, 0.8),
 
 
 plotCountCorHeatmap.FRASER <- function(object,
-                    type=psiTypes_avail, logit=FALSE, 
+                    type=psiTypes, logit=FALSE, 
                     topN=50000, topJ=5000, minMedian=1, minCount=10, 
                     main=NULL, normalized=FALSE, show_rownames=FALSE,
                     show_colnames=FALSE, minDeltaPsi=0.1, annotation_col=NA,
@@ -1428,7 +1428,7 @@ plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE,
 }
 
 plotManhattan.FRASER <- function(object, sampleID, 
-                                type=psiTypes_avail, 
+                                type=fitMetrics(object), 
                                 main=paste0("sampleID = ", sampleID), 
                                 color_chr=c("black", "darkgrey"),
                                 ...){
@@ -1534,7 +1534,7 @@ ggplotLabelPsi <- function(type, asCharacter=FALSE){
     if(isFALSE(asCharacter)){
         vapply(type, FUN=function(x)
             switch (x,
-                    jaccard = c(bquote(jaccard~intron~index)),
+                    jaccard = c(bquote(Intron~Jaccard~Index)),
                     psi5 = c(bquote(psi[5])),
                     psi3 = c(bquote(psi[3])),
                     theta = c(bquote(theta))),
diff --git a/R/variables.R b/R/variables.R
index 214c283d..d097201c 100644
--- a/R/variables.R
+++ b/R/variables.R
@@ -2,18 +2,10 @@
 #' Available splice metrics
 #'
 #' @examples 
-#'   # to show available splice metrics:
-#'   psiTypes_avail
-#' 
-#'   # to show splice metrics selected to be fitted:
+#'   # to show all available splice metrics:
 #'   psiTypes
 #' 
 #' @rdname psiTypes
 #' @export
-psiTypes_avail <- c("jaccard", "psi5", "psi3", "theta")
-names(psiTypes_avail) <- c("Intron Jaccard Index", "psi5", "psi3", "theta")
-
-#' @describeIn psiTypes Splice metrics that are run by default
-#' @export
-psiTypes <- c("jaccard")
-names(psiTypes) <- c("Intron Jaccard Index")
+psiTypes <- c("jaccard", "psi5", "psi3", "theta")
+names(psiTypes) <- c("Intron Jaccard Index", "psi5", "psi3", "theta")
diff --git a/man/FRASER.Rd b/man/FRASER.Rd
index a9a0ca97..6acf7638 100644
--- a/man/FRASER.Rd
+++ b/man/FRASER.Rd
@@ -11,7 +11,7 @@
 FRASER(
   fds,
   q,
-  types = psiTypes,
+  types = fitMetrics(fds),
   implementation = c("PCA", "PCA-BB-Decoder", "AE-weighted", "AE", "BB"),
   iterations = 15,
   BPPARAM = bpparam(),
diff --git a/man/calculatePSIValues.Rd b/man/calculatePSIValues.Rd
index 9086c048..9e0c14fa 100644
--- a/man/calculatePSIValues.Rd
+++ b/man/calculatePSIValues.Rd
@@ -6,7 +6,7 @@
 \usage{
 calculatePSIValues(
   fds,
-  types = psiTypes_avail,
+  types = psiTypes,
   overwriteCts = FALSE,
   BPPARAM = bpparam()
 )
diff --git a/man/filtering.Rd b/man/filtering.Rd
index 5d604036..ee21f45e 100644
--- a/man/filtering.Rd
+++ b/man/filtering.Rd
@@ -18,6 +18,7 @@ filterExpressionAndVariability(
   minDeltaPsi = 0,
   filter = TRUE,
   delayed = ifelse(ncol(object) <= 300, FALSE, TRUE),
+  filterOnJaccard = TRUE,
   BPPARAM = bpparam()
 )
 
@@ -28,6 +29,7 @@ filterExpressionAndVariability(
   quantileMinExpression = 10,
   filter = TRUE,
   delayed = ifelse(ncol(object) <= 300, FALSE, TRUE),
+  filterOnJaccard = TRUE,
   BPPARAM = bpparam()
 )
 
@@ -36,6 +38,7 @@ filterExpressionAndVariability(
   minDeltaPsi = 0,
   filter = TRUE,
   delayed = ifelse(ncol(object) <= 300, FALSE, TRUE),
+  filterOnJaccard = TRUE,
   BPPARAM = bpparam()
 )
 }
diff --git a/man/fit.Rd b/man/fit.Rd
index 659baa3b..dcbee2dc 100644
--- a/man/fit.Rd
+++ b/man/fit.Rd
@@ -10,7 +10,7 @@
   implementation = c("PCA", "PCA-BB-Decoder", "AE", "AE-weighted", "PCA-BB-full",
     "fullAE", "PCA-regression", "PCA-reg-full", "PCA-BB-Decoder-no-weights", "BB"),
   q,
-  type = currentType(object),
+  type = psiTypes,
   rhoRange = c(-30, 30),
   weighted = FALSE,
   noiseAlpha = 1,
diff --git a/man/getter_setter_functions.Rd b/man/getter_setter_functions.Rd
index 3b9c108a..0ce81990 100644
--- a/man/getter_setter_functions.Rd
+++ b/man/getter_setter_functions.Rd
@@ -19,6 +19,8 @@
 \alias{deltaPsiValue}
 \alias{currentType}
 \alias{currentType<-}
+\alias{fitMetrics}
+\alias{fitMetrics<-}
 \alias{pseudocount}
 \alias{hyperParams}
 \alias{dontWriteHDF5}
@@ -61,6 +63,10 @@ currentType(fds)
 
 currentType(fds) <- value
 
+fitMetrics(fds)
+
+fitMetrics(fds) <- value
+
 pseudocount(value = NULL)
 
 hyperParams(fds, type = currentType(fds), all = FALSE)
@@ -140,6 +146,13 @@ within several methods in the FRASER package (defaults to jaccard).
 \item \code{currentType<-}: Sets the psi type that is to be used 
 within several methods in the FRASER package.
 
+\item \code{fitMetrics}: Returns the splice metrics that will be
+fitted (defaults to jaccard, used within several methods in the 
+FRASER package).
+
+\item \code{fitMetrics<-}: Sets the splice metrics that will be 
+fitted (used within several methods in the FRASER package).
+
 \item \code{pseudocount}: Sets and returns the pseudo count used 
 within the FRASER fitting procedure.
 
diff --git a/man/injectOutliers.Rd b/man/injectOutliers.Rd
index 64728eae..5132c6e4 100644
--- a/man/injectOutliers.Rd
+++ b/man/injectOutliers.Rd
@@ -6,7 +6,7 @@
 \usage{
 injectOutliers(
   fds,
-  type = currentType(fds),
+  type = psiTypes,
   freq = 0.001,
   minDpsi = 0.2,
   minCoverage = 2,
diff --git a/man/optimHyperParams.Rd b/man/optimHyperParams.Rd
index 96ca78ba..e1333b4a 100644
--- a/man/optimHyperParams.Rd
+++ b/man/optimHyperParams.Rd
@@ -6,7 +6,7 @@
 \usage{
 optimHyperParams(
   fds,
-  type = currentType(fds),
+  type = psiTypes,
   implementation = "PCA",
   q_param = getEncDimRange(fds),
   noise_param = 0,
diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd
index 686bf15f..dfd317a2 100644
--- a/man/plotFunctions.Rd
+++ b/man/plotFunctions.Rd
@@ -27,7 +27,7 @@ plotManhattan(object, ...)
 \S4method{plotVolcano}{FraserDataSet}(
   object,
   sampleID,
-  type = psiTypes_avail,
+  type = fitMetrics(object),
   basePlot = TRUE,
   aggregate = FALSE,
   main = NULL,
@@ -40,7 +40,7 @@ plotManhattan(object, ...)
 \S4method{plotAberrantPerSample}{FraserDataSet}(
   object,
   main,
-  type = psiTypes_avail,
+  type = fitMetrics(object),
   padjCutoff = 0.1,
   zScoreCutoff = NA,
   deltaPsiCutoff = 0.1,
@@ -51,7 +51,7 @@ plotManhattan(object, ...)
 
 plotExpression(
   fds,
-  type = psiTypes_avail,
+  type = fitMetrics(fds),
   idx = NULL,
   result = NULL,
   colGroup = NULL,
@@ -63,7 +63,7 @@ plotExpression(
 
 plotExpectedVsObservedPsi(
   fds,
-  type = psiTypes_avail,
+  type = fitMetrics(fds),
   idx = NULL,
   result = NULL,
   colGroup = NULL,
@@ -89,7 +89,7 @@ plotExpectedVsObservedPsi(
   ...
 )
 
-\S4method{plotEncDimSearch}{FraserDataSet}(object, type = psiTypes_avail, plotType = c("auc", "loss"))
+\S4method{plotEncDimSearch}{FraserDataSet}(object, type = psiTypes, plotType = c("auc", "loss"))
 
 plotFilterExpression(
   fds,
@@ -107,7 +107,7 @@ plotFilterVariability(
 
 \S4method{plotCountCorHeatmap}{FraserDataSet}(
   object,
-  type = psiTypes_avail,
+  type = psiTypes,
   logit = FALSE,
   topN = 50000,
   topJ = 5000,
@@ -170,7 +170,7 @@ plotBamCoverageFromResultTable(
 \S4method{plotManhattan}{FraserDataSet}(
   object,
   sampleID,
-  type = psiTypes_avail,
+  type = fitMetrics(object),
   main = paste0("sampleID = ", sampleID),
   color_chr = c("black", "darkgrey"),
   ...
diff --git a/man/psiTypes.Rd b/man/psiTypes.Rd
index dfff5dce..9a4b9ca2 100644
--- a/man/psiTypes.Rd
+++ b/man/psiTypes.Rd
@@ -1,33 +1,20 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/variables.R
 \docType{data}
-\name{psiTypes_avail}
-\alias{psiTypes_avail}
+\name{psiTypes}
 \alias{psiTypes}
 \title{Available splice metrics}
 \format{
 An object of class \code{character} of length 4.
-
-An object of class \code{character} of length 1.
 }
 \usage{
-psiTypes_avail
-
 psiTypes
 }
 \description{
 Available splice metrics
 }
-\section{Functions}{
-\itemize{
-\item \code{psiTypes}: Splice metrics that are run by default
-}}
-
 \examples{
-  # to show available splice metrics:
-  psiTypes_avail
-
-  # to show splice metrics selected to be fitted:
+  # to show all available splice metrics:
   psiTypes
 
 }
diff --git a/man/results.Rd b/man/results.Rd
index 9475839f..500bcbb6 100644
--- a/man/results.Rd
+++ b/man/results.Rd
@@ -15,7 +15,7 @@
   aggregate = FALSE,
   collapse = FALSE,
   minCount = 5,
-  psiType = psiTypes_avail,
+  psiType = psiTypes,
   geneColumn = "hgnc_symbol",
   additionalColumns = NULL,
   BPPARAM = bpparam()
@@ -23,7 +23,7 @@
 
 \S4method{aberrant}{FraserDataSet}(
   object,
-  type = psiTypes_avail,
+  type = fitMetrics(object),
   padjCutoff = 0.05,
   deltaPsiCutoff = 0.1,
   zScoreCutoff = NA,
diff --git a/tests/testthat/test_fraser_pipeline.R b/tests/testthat/test_fraser_pipeline.R
index 761330ec..55b0e54d 100644
--- a/tests/testthat/test_fraser_pipeline.R
+++ b/tests/testthat/test_fraser_pipeline.R
@@ -6,7 +6,7 @@ test_that("FRASER function", {
     anames <- c(psiTypes, paste0(c("delta", "predictedMeans", 
             "pvaluesBetaBinomial", "padjBetaBinomial", 
             "zScores"), "_", 
-            rep(psiTypes, 5)))
+            rep(fitMetrics(fds), 5)))
     expect_equal(anames %in% assayNames(fds), !logical(length(anames)))
 })
 
diff --git a/tests/testthat/test_stats.R b/tests/testthat/test_stats.R
index e333b220..5e389ec8 100644
--- a/tests/testthat/test_stats.R
+++ b/tests/testthat/test_stats.R
@@ -80,3 +80,27 @@ test_that("Gene p value calculation with NAs", {
     expect_true(all(is.na(padjVals(fds, type="jaccard", level="gene", 
                                 filters=list(rho=0.1)))))
 })
+
+test_that("FDR on subset of genes", {
+    fds <- getFraser()
+    mcols(fds, type="j")$hgnc_symbol <- 
+        rep(c("geneA", "geneB", "geneC", "geneD", "geneE"), 
+            times=c(3, 7, 5, 4, 8))
+    
+    # define gene subset per sample
+    genes_per_sample <- list(
+        "sample1" = c("geneE", "geneC", "geneA"),
+        "sample2" = c("geneB"),
+        "sample3" = c("geneA", "geneB", "geneC", "geneD")
+    )
+    expected_output_nrows <- (8 + 5 + 3) + (7) + (3+7+5+4)
+    
+    subsetName <- "FDR_subset_test"
+    fds <- calculatePadjValuesOnSubset(fds, genesToTest=genes_per_sample, 
+                                       subsetName=subsetName, type="jaccard")
+    subset_dt <- metadata(fds)[[subsetName]]
+    expect_true(is(subset_dt, "data.table"))
+    expect_true(all(c("FDR_subset", "FDR_subset_gene") %in% colnames(subset_dt)))
+    expect_equal(subset_dt[, .N], expected_output_nrows)
+      
+})

From c1215e1fff676512a81640d8ef23fcffccba0f8f Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Mon, 28 Nov 2022 21:59:23 +0100
Subject: [PATCH 45/80] fix vignette

---
 DESCRIPTION                    |  2 +-
 R/plotMethods.R                |  2 +-
 man/FRASER.Rd                  | 12 +++++-----
 man/countRNA.Rd                | 16 +++++++-------
 man/filtering.Rd               |  8 +++----
 man/getter_setter_functions.Rd | 40 +++++++++++++++++-----------------
 man/spliceTypeAnnotations.Rd   |  8 +++----
 vignettes/FRASER.Rnw           | 27 ++++++++++++-----------
 8 files changed, 58 insertions(+), 57 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index fb58e4ec..2dc759a1 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -29,7 +29,7 @@ biocViews:
 License: MIT + file LICENSE
 URL: https://github.com/gagneurlab/FRASER
 BugRepots: https://github.com/gagneurlab/FRASER/issues
-RoxygenNote: 7.1.2
+RoxygenNote: 7.2.2
 Encoding: UTF-8
 VignetteBuilder: knitr
 Depends:
diff --git a/R/plotMethods.R b/R/plotMethods.R
index 8b57feda..a9f96bee 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -663,7 +663,7 @@ plotQQ.FRASER <- function(object, type=NULL, idx=NULL, result=NULL,
 
     if(isTRUE(global)){
         if(is.null(type)){
-            type <- psiTypes
+            type <- fitMetrics(object)
         }
         dt <- rbindlist(bplapply(type, getPlottingDT, fds=object, axis="col",
                 idx=TRUE, aggregate=aggregate, Ncpus=Ncpus, ...))
diff --git a/man/FRASER.Rd b/man/FRASER.Rd
index 6acf7638..0ee1035e 100644
--- a/man/FRASER.Rd
+++ b/man/FRASER.Rd
@@ -131,33 +131,33 @@ raw counts ("BB").
 }
 \section{Functions}{
 \itemize{
-\item \code{FRASER}: This function runs the default FRASER pipeline combining 
+\item \code{FRASER()}: This function runs the default FRASER pipeline combining 
 the beta-binomial fit, the computation of Z scores and p values as well as 
 the computation of delta-PSI values.
 
-\item \code{calculateZscore}: This function calculates z-scores based on the 
+\item \code{calculateZscore()}: This function calculates z-scores based on the 
 observed and expected logit 
 psi.
 
-\item \code{calculatePvalues}: This function calculates two-sided p-values based on 
+\item \code{calculatePvalues()}: This function calculates two-sided p-values based on 
 the beta-binomial distribution (or binomial or normal if desired). The 
 returned p values are not yet adjusted with Holm's method per donor or 
 acceptor site, respectively.
 
-\item \code{calculatePadjValues}: This function adjusts the previously calculated 
+\item \code{calculatePadjValues()}: This function adjusts the previously calculated 
 p-values per sample for multiple testing. First, the previoulsy calculated 
 junction-level p values are adjusted with Holm's method per donor or 
 acceptor site, respectively. Then, if gene symbols have been annotated to 
 junctions (and not otherwise requested), gene-level p values are computed.
 
-\item \code{calculatePadjValuesOnSubset}: This function does FDR correction only for all junctions 
+\item \code{calculatePadjValuesOnSubset()}: This function does FDR correction only for all junctions 
 in a certain subset of genes which can differ per sample. Requires gene 
 symbols to have been annotated to junctions. As with the full FDR 
 correction across all junctions, first the previously calculated 
 junction-level p values are adjusted with Holm's method per donor or 
 acceptor site, respectively. Then, gene-level p values are computed.
-}}
 
+}}
 \examples{
 # set default parallel backend
 register(SerialParam())
diff --git a/man/countRNA.Rd b/man/countRNA.Rd
index 7940b91b..440231fe 100644
--- a/man/countRNA.Rd
+++ b/man/countRNA.Rd
@@ -245,32 +245,32 @@ a sample are set to zero.
 }
 \section{Functions}{
 \itemize{
-\item \code{countRNAData}: This method extracts and counts the split reads and
+\item \code{countRNAData()}: This method extracts and counts the split reads and
 non spliced reads from RNA bam files.
 
-\item \code{getSplitReadCountsForAllSamples}: This method creates a GRanges 
+\item \code{getSplitReadCountsForAllSamples()}: This method creates a GRanges 
 object containing the split read counts from all 
 specified samples.
 
-\item \code{getNonSplitReadCountsForAllSamples}: This method creates a GRanges 
+\item \code{getNonSplitReadCountsForAllSamples()}: This method creates a GRanges 
 object containing the non split read counts at the 
 exon-intron boundaries inferred from the GRanges object 
 containing the positions of all the introns in this dataset.
 
-\item \code{addCountsToFraserDataSet}: This method adds the split read and 
+\item \code{addCountsToFraserDataSet()}: This method adds the split read and 
 non split read counts to a existing FraserDataSet 
 containing the settings.
 
-\item \code{countSplitReads}: This method counts all split reads in a 
+\item \code{countSplitReads()}: This method counts all split reads in a 
 bam file for a single sample.
 
-\item \code{mergeCounts}: This method merges counts for multiple 
+\item \code{mergeCounts()}: This method merges counts for multiple 
 samples into one SummarizedExperiment object.
 
-\item \code{countNonSplicedReads}: This method counts non spliced reads based 
+\item \code{countNonSplicedReads()}: This method counts non spliced reads based 
 on the given target (acceptor/donor) regions for a single sample.
-}}
 
+}}
 \examples{
    # On Windows SNOW is the default for the parallele backend, which can be 
    # very slow for many but small tasks. Therefore, we will use 
diff --git a/man/filtering.Rd b/man/filtering.Rd
index ee21f45e..9953fa0c 100644
--- a/man/filtering.Rd
+++ b/man/filtering.Rd
@@ -80,16 +80,16 @@ reliably detected and to remove introns with no variablity between samples.
 }
 \section{Functions}{
 \itemize{
-\item \code{filterExpressionAndVariability}: This functions filters out both introns with low 
+\item \code{filterExpressionAndVariability()}: This functions filters out both introns with low 
 read support and introns that are not variable across samples.
 
-\item \code{filterExpression,FraserDataSet-method}: This function filters out introns and corresponding 
+\item \code{filterExpression(FraserDataSet)}: This function filters out introns and corresponding 
 splice sites that have low read support in all samples.
 
-\item \code{filterVariability,FraserDataSet-method}: This function filters out introns and corresponding 
+\item \code{filterVariability(FraserDataSet)}: This function filters out introns and corresponding 
 splice sites that have low read support in all samples.
-}}
 
+}}
 \examples{
 fds <- createTestFraserDataSet()
 fds <- filterExpressionAndVariability(fds, minDeltaPsi=0.1, filter=FALSE)
diff --git a/man/getter_setter_functions.Rd b/man/getter_setter_functions.Rd
index 0ce81990..c57387d7 100644
--- a/man/getter_setter_functions.Rd
+++ b/man/getter_setter_functions.Rd
@@ -115,70 +115,70 @@ the values within the FRASER model.
 }
 \section{Functions}{
 \itemize{
-\item \code{featureExclusionMask}: Retrieves a logical vector indicating
+\item \code{featureExclusionMask()}: Retrieves a logical vector indicating
 for each junction whether it is included or excluded during the fitting
 procedure.
 
-\item \code{featureExclusionMask<-}: To remove certain junctions from 
+\item \code{featureExclusionMask(fds, type = currentType(fds)) <- value}: To remove certain junctions from 
 being used in the train step of the encoding dimension we can set the 
 \code{featureExclusion} vector to \code{FALSE}. This can be helpfull 
 if we have local linkage between features which we do not want to 
 model by the autoencoder.
 
-\item \code{rho}: Returns the fitted rho values for the 
+\item \code{rho()}: Returns the fitted rho values for the 
 beta-binomial distribution
 
-\item \code{zScores}: This returns the calculated z-scores.
+\item \code{zScores()}: This returns the calculated z-scores.
 
-\item \code{pVals}: This returns the calculated p-values.
+\item \code{pVals()}: This returns the calculated p-values.
 
-\item \code{padjVals}: This returns the adjusted p-values.
+\item \code{padjVals()}: This returns the adjusted p-values.
 
-\item \code{predictedMeans}: This returns the fitted mu (i.e. psi) 
+\item \code{predictedMeans()}: This returns the fitted mu (i.e. psi) 
 values.
 
-\item \code{deltaPsiValue}: Returns the difference between the 
+\item \code{deltaPsiValue()}: Returns the difference between the 
 observed and the fitted psi values.
 
-\item \code{currentType}: Returns the psi type that is used 
+\item \code{currentType()}: Returns the psi type that is used 
 within several methods in the FRASER package (defaults to jaccard).
 
-\item \code{currentType<-}: Sets the psi type that is to be used 
+\item \code{currentType(fds) <- value}: Sets the psi type that is to be used 
 within several methods in the FRASER package.
 
-\item \code{fitMetrics}: Returns the splice metrics that will be
+\item \code{fitMetrics()}: Returns the splice metrics that will be
 fitted (defaults to jaccard, used within several methods in the 
 FRASER package).
 
-\item \code{fitMetrics<-}: Sets the splice metrics that will be 
+\item \code{fitMetrics(fds) <- value}: Sets the splice metrics that will be 
 fitted (used within several methods in the FRASER package).
 
-\item \code{pseudocount}: Sets and returns the pseudo count used 
+\item \code{pseudocount()}: Sets and returns the pseudo count used 
 within the FRASER fitting procedure.
 
-\item \code{hyperParams}: This returns the results of the 
+\item \code{hyperParams()}: This returns the results of the 
 hyperparameter optimization NULL if the hyperparameter 
 opimization was not run yet.
 
-\item \code{bestQ}: This returns the optimal size of the 
+\item \code{bestQ()}: This returns the optimal size of the 
 latent space according to the hyperparameter optimization or a simple 
 estimate of about a tenth of the number of samples if the hyperparameter 
 opimization was not run yet.
 
-\item \code{dontWriteHDF5}: Gets the current value of whether the 
+\item \code{dontWriteHDF5()}: Gets the current value of whether the 
 assays should be stored as hdf5 files.
 
-\item \code{dontWriteHDF5<-}: Sets whether the assays should be stored 
+\item \code{dontWriteHDF5(fds) <- value}: Sets whether the assays should be stored 
 as hdf5 files.
 
-\item \code{verbose}: Dependent on the level of verbosity 
+\item \code{verbose()}: Dependent on the level of verbosity 
 the algorithm reports more or less to the user. 0 means being quiet 
 and 10 means everything.
 
-\item \code{verbose<-}: Sets the verbosity level to a value 
+\item \code{verbose(fds) <- value}: Sets the verbosity level to a value 
 between 0 and 10. 0 means being quiet and 10 means reporting everything.
-}}
 
+}}
 \examples{
 fds <- createTestFraserDataSet()
 
diff --git a/man/spliceTypeAnnotations.Rd b/man/spliceTypeAnnotations.Rd
index b28e3c33..4556b7b6 100644
--- a/man/spliceTypeAnnotations.Rd
+++ b/man/spliceTypeAnnotations.Rd
@@ -93,20 +93,20 @@ These functions work on the result table and add additional
 }
 \section{Functions}{
 \itemize{
-\item \code{annotateIntronReferenceOverlap}: This method calculates basic annotations 
+\item \code{annotateIntronReferenceOverlap()}: This method calculates basic annotations 
 based on overlap with the reference annotation (start, end, none, both) 
 for the full fds. The overlap type is added as a new column 
 \code{annotatedJunction} in \code{mcols(fds)}.
 
-\item \code{annotateSpliceEventType}: This method annotates the splice event 
+\item \code{annotateSpliceEventType()}: This method annotates the splice event 
 type to junctions in the given results table.
 
-\item \code{flagBlacklistRegions}: This method flags all introns and 
+\item \code{flagBlacklistRegions()}: This method flags all introns and 
 splice sites in the given results table for which at least one splice 
 site (donor or acceptor) is located in a blacklist region. Blacklist 
 regions of the genome are determined from the provided BED file.
-}}
 
+}}
 \examples{
   # get data, fit and compute p-values and z-scores
   fds <- createTestFraserDataSet()
diff --git a/vignettes/FRASER.Rnw b/vignettes/FRASER.Rnw
index 4a28fbbe..500c8ed2 100644
--- a/vignettes/FRASER.Rnw
+++ b/vignettes/FRASER.Rnw
@@ -1,4 +1,4 @@
-%\VignetteIndexEntry{FRASER: Find RAre Splicing Evens in RNA-seq Data}
+%\VignetteIndexEntry{FRASER: Find RAre Splicing Events in RNA-seq Data}
 %\VignettePackage{FRASER}
 %\VignetteEngine{knitr::knitr}
 %\VignetteEncoding{UTF-8}
@@ -36,7 +36,7 @@ opts_chunk$set(
 \newcommand{\fraser}{\Biocpkg{FRASER}}
 \newcommand{\fds}{\Rclass{FraserDataSet}}
 
-\title{FRASER: Find RAre Splicing Events in RNA-seq}
+\title{FRASER: Find RAre Splicing Events in RNA-seq Data}
 
 \author{
     Christian Mertes$^{1}$, Ines Scheller$^{1}$, Julien Gagneur$^{1}$ \\
@@ -176,8 +176,8 @@ intron-exon boundary of acceptor A. While we calculate $\theta$ for the 5' and
 between $\theta_5$ and $\theta_3$ and hence call it jointly $\theta$ in the 
 following.
 
-From \fraser{}2 on, only one metric - the Intron Jaccard Index (Figure 
-\ref{IntronJaccardIndex_sketch}.) - is used by default. The Intron Jaccard 
+From \fraser{}2 on, only a single metric - the Intron Jaccard Index (Figure 
+\ref{IntronJaccardIndex_sketch}) - is used by default. The Intron Jaccard 
 Index is more robust and allows to focus more on functionally relevant 
 aberrant splicing events. It allows to detect all types of aberrant splicing 
 previously detected using the three metrics ($\psi_5$, $\psi_3$, $\theta$)
@@ -185,11 +185,10 @@ within a single metric.
 
 \incfig{IntronJaccardIndex_sketch}{1\textwidth}{Overview over the Intron 
 Jaccard Index, the splice metric used in \fraser{}2.}{
-The Intron Jaccard Index considers both split and nonsplit reads and is 
-defined as the jaccard index of the set of donor reads (reads sharing a donor 
-site with the intron of interest and nonsplit reads at that donor site) and 
-acceptor reads (reads sharing an acceptor site with the intron of interest and
-nonsplit reads at that acceptor site). }
+The Intron Jaccard Index considers both split and nonsplit reads within a 
+single metric and allows to detect all different types of aberrant splicing 
+previously captured with either of the metrics $\psi_5$, $\psi_3$, $\theta$. 
+}
 
 The Intron Jaccard Index considers both split and nonsplit reads and is 
 defined as the jaccard index of the set of donor reads (reads sharing a donor 
@@ -198,7 +197,8 @@ acceptor reads (reads sharing an acceptor site with the intron of interest and
 nonsplit reads at that acceptor site): 
 
 \begin{equation}
-    J(D,A) = \frac{n(D,A)}{\sum_{A'} n(D,A') + \sum_{D'} n(D',A) + n(D) + n(A) - n(D,A)}\label{eq:jaccard}
+    J(D,A) = \frac{n(D,A)}{\sum_{A'} n(D,A') + \sum_{D'} n(D',A) + n(D) + n(A) - n(D,A)}
+    \label{eq:jaccard}
 \end{equation}
 
 
@@ -590,7 +590,7 @@ bam files are available in the \fds{} object).
 
 <<result_table, echo=TRUE>>=
 # to show result visualization functions for this tuturial, zScore cutoff used
-res <- results(fds, zScoreCutoff=2, padjCutoff=NA, deltaPsiCutoff=0.1)
+res <- results(fds, zScoreCutoff=2, padjCutoff=NA, deltaPsiCutoff=NA)
 res
 
 # for the gene level pvalues, gene symbols need to be annotated the fds object
@@ -598,7 +598,8 @@ res
 # as we previously called FRASER() before annotating genes, we run it again here
 fds <- calculatePadjValues(fds, type="jaccard", geneLevel=TRUE)
 # generate gene-level results table (if gene symbols have been annotated)
-res_gene <- results(fds, aggregate=TRUE, padjCutoff=NA, deltaPsiCutoff=0.1)
+res_gene <- results(fds, aggregate=TRUE, padjCutoff=NA, deltaPsiCutoff=NA, 
+                    zScoreCutoff=2)
 res_gene
 @
 
@@ -614,7 +615,7 @@ plotVolcano(fds, type="jaccard", "sample10")
 Which are the splicing events in detail?
 
 <<sample result, echo=TRUE>>=
-sampleRes <- res[res$sampleID == "sample10"]
+sampleRes <- res[res$sampleID == "sample8"]
 sampleRes
 @
 

From 74b3d2a65541226676a514fee4007bf70d917cf1 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Tue, 29 Nov 2022 11:15:09 +0100
Subject: [PATCH 46/80] fix missing udate to jaccard in vignette

---
 R/plotMethods.R      | 6 +++++-
 vignettes/FRASER.Rnw | 4 ++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/R/plotMethods.R b/R/plotMethods.R
index a9f96bee..2a05446a 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -1432,6 +1432,10 @@ plotManhattan.FRASER <- function(object, sampleID,
                                 main=paste0("sampleID = ", sampleID), 
                                 color_chr=c("black", "darkgrey"),
                                 ...){
+    # load necessary packages
+    require(ggbio)
+    require(biovizBase)
+    
     # check arguments
     stopifnot(is(object, "FraserDataSet"))
     stopifnot(sampleID %in% samples(object))
@@ -1679,7 +1683,7 @@ plotGrandLinear.adapted <- function (obj, ..., facets, space.skip = 0.01,
             else {
                 highlight.name <- values(highlight.gr)[, highlight.name]
             }
-            p <- p + geom_point(data = mold(p@data[queryHits(idx)]), 
+            p <- p + geom_point(data = biovizBase::mold(p@data[queryHits(idx)]), 
                                 do.call(aes, list(x = substitute(midpoint), y = args.aes$y)), 
                                 color = highlight.col)
             if (!is.null(highlight.name)) {
diff --git a/vignettes/FRASER.Rnw b/vignettes/FRASER.Rnw
index 500c8ed2..d48f9cf7 100644
--- a/vignettes/FRASER.Rnw
+++ b/vignettes/FRASER.Rnw
@@ -240,8 +240,8 @@ orgDb <- org.Hs.eg.db
 fds <- annotateRangesWithTxDb(fds, txdb=txdb, orgDb=orgDb)
 
 # fit the splicing model for each metric 
-# with a specific latentsapce dimension
-fds <- FRASER(fds, q=c(psi5=2, psi3=3, theta=3))
+# with a specific latentspace dimension
+fds <- FRASER(fds, q=c(jaccard=2))
 
 # alternatively, we also provide a way to use biomart for the annotation:
 # fds <- annotateRanges(fds)

From b052a38622a7c486547ef1c581807975d33a3b9e Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Tue, 29 Nov 2022 11:32:25 +0100
Subject: [PATCH 47/80] small fix in vignette

---
 vignettes/FRASER.Rnw | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vignettes/FRASER.Rnw b/vignettes/FRASER.Rnw
index d48f9cf7..9482a775 100644
--- a/vignettes/FRASER.Rnw
+++ b/vignettes/FRASER.Rnw
@@ -46,7 +46,6 @@ opts_chunk$set(
 
 
 \begin{document}
-\SweaveOpts{concordance=TRUE}
 
 <<include=FALSE>>=
 opts_chunk$set(concordance=TRUE)

From 01c3a4e15a2ea5cff12ecec211ce63bfb0e4616f Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Tue, 29 Nov 2022 13:41:23 +0100
Subject: [PATCH 48/80] fix biocCheck warnings

---
 R/AllGenerics.R      |  4 ++--
 R/Fraser-pipeline.R  |  4 ++--
 R/filterExpression.R |  3 +++
 R/plotMethods.R      | 27 ++++++++++++++++-----------
 man/FRASER.Rd        |  8 ++++----
 man/fds-methods.Rd   | 20 +++-----------------
 man/filtering.Rd     |  4 ++++
 7 files changed, 34 insertions(+), 36 deletions(-)

diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index 8a4bac5e..ca46ba7f 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -12,8 +12,8 @@ asFDS <- function(x){
 #'
 #' @title Getter/Setter methods for the FraserDataSet
 #'
-#' The following methods are getter and setter methods to extract or set
-#' certain values of a FraserDataSet object. 
+#' @description The following methods are getter and setter methods to extract 
+#' or set certain values of a FraserDataSet object. 
 #' 
 #' \code{samples} sets or gets the sample IDs; \code{condition} ;
 #' \code{}
diff --git a/R/Fraser-pipeline.R b/R/Fraser-pipeline.R
index 8c1856b7..7b9d0e71 100644
--- a/R/Fraser-pipeline.R
+++ b/R/Fraser-pipeline.R
@@ -85,7 +85,7 @@ NULL
 #' the beta-binomial fit, the computation of Z scores and p values as well as 
 #' the computation of delta-PSI values.
 #' @export
-FRASER <- function(fds, q, types=fitMetrics(fds), 
+FRASER <- function(fds, q, type=fitMetrics(fds), 
                    implementation=c("PCA", "PCA-BB-Decoder", "AE-weighted", 
                                     "AE", "BB"), 
                     iterations=15, BPPARAM=bpparam(), correction, ...){
@@ -104,7 +104,7 @@ FRASER <- function(fds, q, types=fitMetrics(fds),
     }
 
     # fit each splicing type separately
-    for(i in types){
+    for(i in type){
 
         # get type specific q
         if(missing(q)){
diff --git a/R/filterExpression.R b/R/filterExpression.R
index 9a47b51a..6c7eab8c 100644
--- a/R/filterExpression.R
+++ b/R/filterExpression.R
@@ -17,6 +17,9 @@
 #' passed all filters is returned. If FALSE, no subsetting is done and the 
 #' information of whether an intron passed the filters is only stored in the 
 #' mcols.
+#' @param filterOnJaccard If TRUE, the Intron Jaccard Metric is used to define 
+#' express introns during fitlering. Otherwise, the psi5, psi3 and theta 
+#' metrics are used (default: TRUE).
 #' @param delayed If FALSE, count matrices will be loaded into memory, 
 #' otherwise the function works on the delayedMatrix representations. The 
 #' default value depends on the number of samples in the fds-object. 
diff --git a/R/plotMethods.R b/R/plotMethods.R
index 2a05446a..1685637a 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -1432,9 +1432,13 @@ plotManhattan.FRASER <- function(object, sampleID,
                                 main=paste0("sampleID = ", sampleID), 
                                 color_chr=c("black", "darkgrey"),
                                 ...){
-    # load necessary packages
-    require(ggbio)
-    require(biovizBase)
+    # check necessary packages
+    if (!requireNamespace('ggbio')){
+        stop("For this function, the ggbio package is required.")
+    }
+    if (!requireNamespace('biovizBase')){
+        stop("For this function, the biovizBase package is required.")
+    }
     
     # check arguments
     stopifnot(is(object, "FraserDataSet"))
@@ -1460,7 +1464,7 @@ plotManhattan.FRASER <- function(object, sampleID,
     mcols(gr_sample)[,"delta"] <- deltaPsiValue(object, type=type)[,sampleID]
     
     # only one point per donor/acceptor site (relevant only for psi5 and psi3)
-    index <- FRASER:::getSiteIndex(object, type=type)
+    index <- getSiteIndex(object, type=type)
     nonDup <- !duplicated(index)
     gr_sample <- gr_sample[nonDup,]
     
@@ -1657,9 +1661,9 @@ plotGrandLinear.adapted <- function (obj, ..., facets, space.skip = 0.01,
     }
     if (!missing(facets)) {
         args$facets <- facets
-        args.facets <- subsetArgsByFormals(args, facet_grid, 
+        args.facets <- biovizBase::subsetArgsByFormals(args, facet_grid, 
                                            facet_wrap)
-        facet <- .buildFacetsFromArgs(obj, args.facets)
+        facet <- ggbio:::.buildFacetsFromArgs(obj, args.facets)
         p <- p + facet
     }
     p <- p + theme(panel.grid.minor = element_blank())
@@ -1689,13 +1693,14 @@ plotGrandLinear.adapted <- function (obj, ..., facets, space.skip = 0.01,
             if (!is.null(highlight.name)) {
                 seqlevels(.h.pos, pruning.mode = "coarse") <- seqlevels(obj)
                 suppressWarnings(seqinfo(.h.pos) <- seqinfo(obj))
-                .trans <- transformToGenome(.h.pos, space.skip = space.skip)
+                .trans <- biovizBase::transformToGenome(.h.pos, space.skip = space.skip)
                 values(.trans)$mean <- (start(.trans) + end(.trans))/2
                 values(.trans)$names <- highlight.name
-                p <- p + geom_text(data = mold(.trans), size = highlight.label.size, 
-                                   vjust = 0, color = highlight.label.col, do.call(aes, 
-                                                                                   list(x = substitute(mean), y = as.name("val"), 
-                                                                                        label = as.name("names"))))
+                p <- p + geom_text(data = biovizBase::mold(.trans), 
+                        size = highlight.label.size, 
+                        vjust = 0, color = highlight.label.col, do.call(aes, 
+                                list(x = substitute(mean), y = as.name("val"), 
+                                label = as.name("names"))))
             }
         }
     }
diff --git a/man/FRASER.Rd b/man/FRASER.Rd
index 0ee1035e..f5961c5b 100644
--- a/man/FRASER.Rd
+++ b/man/FRASER.Rd
@@ -11,7 +11,7 @@
 FRASER(
   fds,
   q,
-  types = fitMetrics(fds),
+  type = fitMetrics(fds),
   implementation = c("PCA", "PCA-BB-Decoder", "AE-weighted", "AE", "BB"),
   iterations = 15,
   BPPARAM = bpparam(),
@@ -58,6 +58,9 @@ Should be fitted using \code{\link{optimHyperParams}} if unknown.
 If a named vector is provided it is used for the different
 splicing types.}
 
+\item{type}{The type of PSI (jaccard, psi5, psi3 or theta for theta/splicing 
+efficiency)}
+
 \item{implementation}{The method that should be used to correct for 
 confounders.}
 
@@ -70,9 +73,6 @@ not yet converged after these number of iterations, the fit stops anyway.}
 
 \item{...}{Additional parameters passed on to the internal fit function}
 
-\item{type}{The type of PSI (jaccard, psi5, psi3 or theta for theta/splicing 
-efficiency)}
-
 \item{logit}{Indicates if z scores are computed on the logit scale (default) 
 or in the natural (psi) scale.}
 
diff --git a/man/fds-methods.Rd b/man/fds-methods.Rd
index 3e11e4e4..d6caf3fd 100644
--- a/man/fds-methods.Rd
+++ b/man/fds-methods.Rd
@@ -41,17 +41,7 @@
 \alias{FRASER.mcols.get}
 \alias{FRASER.rowRanges.get}
 \alias{mapSeqlevels}
-\title{Getter/Setter methods for the FraserDataSet
-
-The following methods are getter and setter methods to extract or set
-certain values of a FraserDataSet object. 
-
-\code{samples} sets or gets the sample IDs; \code{condition} ;
-\code{}
-\code{nonSplicedReads} return a RangedSummarizedExperiment object 
-containing the counts for the non spliced reads overlapping splice 
-sites in the fds.
-\code{}}
+\title{Getter/Setter methods for the FraserDataSet}
 \usage{
 samples(object)
 
@@ -151,10 +141,8 @@ passed to GenomeInfoDb::mapSeqlevels().}
 Getter method return the respective current value.
 }
 \description{
-Getter/Setter methods for the FraserDataSet
-
-The following methods are getter and setter methods to extract or set
-certain values of a FraserDataSet object. 
+The following methods are getter and setter methods to extract 
+or set certain values of a FraserDataSet object. 
 
 \code{samples} sets or gets the sample IDs; \code{condition} ;
 \code{}
@@ -162,8 +150,6 @@ certain values of a FraserDataSet object.
 containing the counts for the non spliced reads overlapping splice 
 sites in the fds.
 \code{}
-
-Mapping of chromosome names
 }
 \examples{
 fds <- createTestFraserDataSet()
diff --git a/man/filtering.Rd b/man/filtering.Rd
index 9953fa0c..226df199 100644
--- a/man/filtering.Rd
+++ b/man/filtering.Rd
@@ -68,6 +68,10 @@ mcols.}
 otherwise the function works on the delayedMatrix representations. The 
 default value depends on the number of samples in the fds-object.}
 
+\item{filterOnJaccard}{If TRUE, the Intron Jaccard Metric is used to define 
+express introns during fitlering. Otherwise, the psi5, psi3 and theta 
+metrics are used (default: TRUE).}
+
 \item{BPPARAM}{the BiocParallel parameters for the parallelization}
 }
 \value{

From caa194cc2470642231c8bf092f0095d0e4710664 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Tue, 29 Nov 2022 15:08:24 +0100
Subject: [PATCH 49/80] remove installs from vignette for biocCheck

---
 vignettes/FRASER.Rnw | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/vignettes/FRASER.Rnw b/vignettes/FRASER.Rnw
index 9482a775..51783ed1 100644
--- a/vignettes/FRASER.Rnw
+++ b/vignettes/FRASER.Rnw
@@ -521,10 +521,6 @@ $\psi$ values or both.
 
 <<result table, echo=TRUE>>=
 # annotate introns with the HGNC symbols of the corresponding gene
-if (!require("TxDb.Hsapiens.UCSC.hg19.knownGene", quietly = TRUE))
-    BiocManager::install("TxDb.Hsapiens.UCSC.hg19.knownGene")
-if (!require("org.Hs.eg.db", quietly = TRUE))
-    BiocManager::install("org.Hs.eg.db")
 library(TxDb.Hsapiens.UCSC.hg19.knownGene)
 library(org.Hs.eg.db)
 

From fc6d61e282ec16f233c75e60c7c327d640aeabbe Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Tue, 29 Nov 2022 17:04:47 +0100
Subject: [PATCH 50/80] only run example if needed package is available

---
 R/plotMethods.R      | 4 +++-
 man/plotFunctions.Rd | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/R/plotMethods.R b/R/plotMethods.R
index 1685637a..95354df1 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -260,7 +260,9 @@
 #' plotExpectedVsObservedPsi(fds, res=res[1])
 #' 
 #' # create manhattan plot of pvalues by genomic position
-#' plotManhattan(fds, type="jaccard", sampleID="sample10")
+#' if(require(ggbio)){
+#'     plotManhattan(fds, type="jaccard", sampleID="sample10")
+#' }
 #' 
 #' # plot splice graph and coverage from bam files in a given region
 #' if(require(SGSeq)){
diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd
index dfd317a2..927d3d11 100644
--- a/man/plotFunctions.Rd
+++ b/man/plotFunctions.Rd
@@ -505,7 +505,9 @@ plotQQ(fds, result=res[1])
 plotExpectedVsObservedPsi(fds, res=res[1])
 
 # create manhattan plot of pvalues by genomic position
-plotManhattan(fds, type="jaccard", sampleID="sample10")
+if(require(ggbio)){
+    plotManhattan(fds, type="jaccard", sampleID="sample10")
+}
 
 # plot splice graph and coverage from bam files in a given region
 if(require(SGSeq)){

From 03e678fd45a6680e4a907395c0b37dcf74402b3a Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Mon, 5 Dec 2022 17:34:05 +0100
Subject: [PATCH 51/80] R CMD CHECK set to ignore warnings; removed
 zscoreCutoff and added option to specify filters to annotate only certain
 types of genes

---
 .github/workflows/check-bioc.yml       |  2 +-
 R/AllGenerics.R                        | 42 ++++++++------------------
 R/Fraser-pipeline.R                    |  4 +--
 R/annotationOfRanges.R                 | 21 ++++++++++---
 R/example_functions.R                  |  2 +-
 R/getNSetterFuns.R                     |  6 ++--
 R/plotMethods.R                        |  8 ++---
 R/resultAnnotations.R                  |  6 ++--
 man/annotateRanges.Rd                  |  8 ++++-
 man/getter_setter_functions.Rd         |  3 ++
 man/plotFunctions.Rd                   |  5 ++-
 man/results.Rd                         | 18 ++++-------
 man/spliceTypeAnnotations.Rd           |  6 ++--
 tests/testthat/test_fraser_pipeline.R  |  3 +-
 tests/testthat/test_plotJunctionDist.R |  2 +-
 tests/testthat/test_stats.R            | 26 ++++++++--------
 vignettes/FRASER.Rnw                   | 41 +++++--------------------
 17 files changed, 88 insertions(+), 115 deletions(-)

diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml
index 946e96ef..e32789e2 100644
--- a/.github/workflows/check-bioc.yml
+++ b/.github/workflows/check-bioc.yml
@@ -249,7 +249,7 @@ jobs:
           rcmdcheck::rcmdcheck(
               args = c("--no-manual", "--no-vignettes", "--timings"),
               build_args = c("--no-manual", "--keep-empty-dirs", "--no-resave-data"),
-              error_on = "warning",
+              error_on = "error",
               check_dir = "check"
           )
         shell: Rscript {0}
diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index ca46ba7f..55fe3f97 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -620,7 +620,7 @@ mapSeqlevels <- function(fds, style="UCSC", ...){
 #'
 #' retrieve a single sample result object
 #' @noRd
-resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores, 
+resultsSingleSample <- function(sampleID, gr, pvals, padjs, 
                                 psivals, rawCts, rawTotalCts, rawNonsplitCts, 
                                 rawNsProportion, nsProportion_99quantile,
                                 deltaPsiVals, psiType, rowMeansK, rowMeansN, 
@@ -665,7 +665,6 @@ resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores,
     mcols(ans)$type            <- Rle(psiType)
     mcols(ans)$pValue          <- signif(pvals[goodCut,sampleID], 5)
     mcols(ans)$padjust         <- signif(padjs[goodCut,sampleID], 5)
-    mcols(ans)$zScore          <- Rle(round(zscores[goodCut,sampleID], 2))
     mcols(ans)$psiValue        <- Rle(round(psivals[goodCut,sampleID], 2))
     mcols(ans)$deltaPsi        <- round(deltaPsiVals[goodCut,sampleID], 2)
     mcols(ans)$counts          <- Rle(rawCts[goodCut, sampleID])
@@ -711,7 +710,7 @@ resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores,
     return(ans[order(mcols(ans)$pValue, -abs(mcols(ans)$deltaPsi))])
 }
 
-FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, 
+FRASER.results <- function(object, sampleIDs, fdrCutoff, 
                             dPsiCutoff, minCount, rhoCutoff, psiType, 
                             maxCols=20, aggregate=FALSE, collapse=FALSE,
                             geneColumn="hgnc_symbol", BPPARAM=bpparam(), 
@@ -763,7 +762,6 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
                                             filters=list(rho=rhoCutoff)))
             padjs        <- as.matrix(padjVals(tmp_x, 
                                             filters=list(rho=rhoCutoff)))
-            zscores      <- as.matrix(zScores(tmp_x))
             psivals      <- as.matrix(assay(tmp_x, type))
             muPsi        <- as.matrix(predictedMeans(tmp_x))
             psivals_pc   <- (rawCts + pseudocount()) /
@@ -772,7 +770,6 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
             rho          <- rho(tmp_x, type)
             aberrant     <- aberrant.FRASER(tmp_x, type=type, 
                                                 padjCutoff=fdrCutoff, 
-                                                zScoreCutoff=zscoreCutoff, 
                                                 deltaPsiCutoff=dPsiCutoff, 
                                                 minCount=minCount, 
                                                 rhoCutoff=rhoCutoff,
@@ -783,7 +780,6 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
                 padjsGene    <- as.matrix(padjVals(tmp_x, level="gene"))
                 aberrantGene <- aberrant.FRASER(tmp_x, type=type, 
                                                 padjCutoff=fdrCutoff, 
-                                                zScoreCutoff=zscoreCutoff, 
                                                 deltaPsiCutoff=dPsiCutoff, 
                                                 minCount=minCount, 
                                                 rhoCutoff=rhoCutoff,
@@ -798,7 +794,6 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
             if(length(sc) == 1){
                 colnames(pvals) <- sc
                 colnames(padjs) <- sc
-                colnames(zscores) <- sc
                 colnames(deltaPsiVals) <- sc
             }
             
@@ -817,7 +812,7 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
             # create result table
             sampleRes <- lapply(sc,
                                 resultsSingleSample, gr=gr, pvals=pvals, 
-                                padjs=padjs, zscores=zscores, psiType=type, 
+                                padjs=padjs, psiType=type, 
                                 psivals=psivals, deltaPsiVals=deltaPsiVals, 
                                 rawCts=rawCts, rawTotalCts=rawTotalCts, 
                                 rawNonsplitCts=rawNonsplitCts, 
@@ -866,7 +861,6 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
 #' @param sampleIDs A vector of sample IDs for which results should be 
 #' retrieved
 #' @param padjCutoff The FDR cutoff to be applied or NA if not requested.
-#' @param zScoreCutoff The z-score cutoff to be applied or NA if not requested.
 #' @param deltaPsiCutoff The cutoff on delta psi or NA if not requested.
 #' @param minCount The minimum count value of the total coverage of an intron 
 #' to be considered as significant.
@@ -891,7 +885,7 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
 #'              types to return only one row per feature (gene) and sample.
 #' @param geneColumn The column name of the column that has the gene annotation 
 #'              that will be used for gene-level pvalue computation.
-#' @param ... Further arguments can be passed to the method. If "n", "zscores", 
+#' @param ... Further arguments can be passed to the method. If "n",  
 #'              "padjVals", "dPsi" or "rhoVals" are given, the values of those 
 #'              arguments are used to define the aberrant events.
 #'
@@ -908,18 +902,17 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
 #' 
 #' # extract results: for this example dataset, no cutoffs are used to
 #' # get at least one result and show the output
-#' res <- results(fds, padjCutoff=NA, zScoreCutoff=NA, deltaPsiCutoff=NA)
+#' res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA)
 #' res
 #' 
 #' # aggregate the results by genes (gene symbols need to be annotated first 
 #' # using annotateRanges() function)
-#' results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=0.1, 
-#'         aggregate=TRUE)
+#' results(fds, padjCutoff=NA, deltaPsiCutoff=0.1, aggregate=TRUE)
 #'
 #' # aggregate the results by genes and collapse over all psi types to obtain 
 #' # only one row per gene in the results table 
-#' results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=0.1, 
-#'         aggregate=TRUE, collapse=TRUE)
+#' results(fds, padjCutoff=NA, deltaPsiCutoff=0.1, aggregate=TRUE, 
+#'         collapse=TRUE)
 #'
 #' # get aberrant events per sample: on the example data, nothing is aberrant
 #' # based on the adjusted p-value
@@ -927,21 +920,20 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
 #' 
 #' # get aberrant events per gene (first annotate gene symbols)
 #' fds <- annotateRangesWithTxDb(fds)
-#' aberrant(fds, type="jaccard", by="feature", zScoreCutoff=2, padjCutoff=NA,
-#'         aggregate=TRUE)
+#' aberrant(fds, type="jaccard", by="feature", padjCutoff=NA, aggregate=TRUE)
 #'         
 #' # find aberrant junctions/splice sites
 #' aberrant(fds, type="jaccard")
 #' @export
 setMethod("results", "FraserDataSet", function(object, 
                     sampleIDs=samples(object), padjCutoff=0.05,
-                    zScoreCutoff=NA, deltaPsiCutoff=0.1,
+                    deltaPsiCutoff=0.1,
                     rhoCutoff=1, aggregate=FALSE, collapse=FALSE,
                     minCount=5, psiType=psiTypes,
                     geneColumn="hgnc_symbol",
                     additionalColumns=NULL, BPPARAM=bpparam()){
     FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff,
-            zscoreCutoff=zScoreCutoff, dPsiCutoff=deltaPsiCutoff, 
+            dPsiCutoff=deltaPsiCutoff, 
             rhoCutoff=rhoCutoff, minCount=minCount, 
             psiType=match.arg(psiType, several.ok=TRUE),
             aggregate=aggregate, collapse=collapse, geneColumn=geneColumn,
@@ -950,12 +942,11 @@ setMethod("results", "FraserDataSet", function(object,
 
 aberrant.FRASER <- function(object, type=fitMetrics(object), 
                                 padjCutoff=0.05, deltaPsiCutoff=0.1, 
-                                zScoreCutoff=NA, minCount=5, rhoCutoff=1,
+                                minCount=5, rhoCutoff=1,
                                 by=c("none", "sample", "feature"), 
                                 aggregate=FALSE, geneColumn="hgnc_symbol", ...){
     
     checkNaAndRange(padjCutoff,     min=0, max=1,   scalar=TRUE,   na.ok=TRUE)
-    checkNaAndRange(zScoreCutoff,   min=0, max=Inf, scalar=TRUE,   na.ok=TRUE)
     checkNaAndRange(deltaPsiCutoff, min=0, max=1,   scalar=TRUE,   na.ok=TRUE)
     checkNaAndRange(rhoCutoff,      min=0, max=1,   scalar=TRUE,   na.ok=TRUE)
     checkNaAndRange(minCount,       min=0, max=Inf, scalar=TRUE,   na.ok=TRUE)
@@ -968,11 +959,6 @@ aberrant.FRASER <- function(object, type=fitMetrics(object),
     } else {
         n <- N(object, type=type)
     }
-    if("zscores" %in% names(dots)){
-        zscores <- dots[['zscores']]
-    } else {
-        zscores <- zScores(object, type=type)
-    }
     if("padjVals" %in% names(dots)){
         padj <- dots[['padjVals']]
     } else {
@@ -1019,10 +1005,6 @@ aberrant.FRASER <- function(object, type=fitMetrics(object),
     if(!is.na(minCount)){
         aberrantEvents <- aberrantEvents & as.matrix(n >= minCount)
     }
-    if(!is.na(zScoreCutoff)){
-        aberrantEvents <- aberrantEvents & 
-            as.matrix(abs(zscores) >= zScoreCutoff)
-    }
     if(!is.na(deltaPsiCutoff)){
         aberrantEvents <- aberrantEvents & 
             as.matrix(abs(dpsi) >= deltaPsiCutoff)
diff --git a/R/Fraser-pipeline.R b/R/Fraser-pipeline.R
index 7b9d0e71..20485299 100644
--- a/R/Fraser-pipeline.R
+++ b/R/Fraser-pipeline.R
@@ -129,8 +129,8 @@ FRASER <- function(fds, q, type=fitMetrics(fds),
         message(date(), ": Adjust p values for: '", i, "'.")
         fds <- calculatePadjValues(fds, type=i)
 
-        message(date(), ": Compute Z scores for: '", i, "'.")
-        fds <- calculateZscore(fds, type=i)
+        # message(date(), ": Compute Z scores for: '", i, "'.")
+        # fds <- calculateZscore(fds, type=i)
     }
 
     # return final analysis
diff --git a/R/annotationOfRanges.R b/R/annotationOfRanges.R
index 13939780..8796cd0b 100644
--- a/R/annotationOfRanges.R
+++ b/R/annotationOfRanges.R
@@ -17,6 +17,10 @@
 #'             \code{TxDb.Hsapiens.UCSC.hg19.knownGene}.
 #' @param orgDb An \code{orgDb} object or a data table to map the feature names.
 #'             If this is NULL, then \code{org.Hs.eg.db} is used as the default.
+#' @param filter A named list specifying the filters which should be applied to 
+#'             subset to e.g. only protein-coding genes for annotation. 
+#'             \code{names(filter)} needs to be column names in the given 
+#'             orgDb object (default: no filtering).
 #' @param keytype The keytype or column name of gene IDs in the \code{TxDb}
 #'             object (see 
 #'             \code{\link[AnnotationDbi:AnnotationDb-class]{keytypes}}
@@ -104,7 +108,7 @@ annotateRanges <- function(fds, feature="hgnc_symbol", featureName=feature,
 #' @export
 annotateRangesWithTxDb <- function(fds, feature="SYMBOL", 
                     featureName="hgnc_symbol", keytype="ENTREZID",
-                    txdb=NULL, orgDb=NULL){
+                    txdb=NULL, orgDb=NULL, filter=list()){
     gene_id <- NULL
     
     # check input
@@ -136,15 +140,24 @@ annotateRangesWithTxDb <- function(fds, feature="SYMBOL",
     if(is.data.table(orgDb)){
         tmp <- merge(x=as.data.table(anno)[,.(gene_id)], y=orgDb, 
                 by.y=keytype, by.x="gene_id", all.x=TRUE, sort=FALSE)[,
-                        .(gene_id, feature=get(feature))]
-        setnames(tmp, "feature", feature)
+                            c("gene_id", feature, names(filter)), with=FALSE]
     } else {
         tmp  <- as.data.table(select(orgDb, keys=mcols(anno)[,"gene_id"], 
-                columns=feature, keytype=keytype))
+                columns=c(feature, names(filter)), keytype=keytype))
     }
         
+    # filter genes as specified by user (e.g. only protein_coding)
+    tmp[, include:=TRUE]
+    if(!is.null(filter) & length(filter) > 0 & !is.null(names(filter))){
+        for(n in names(filter)){
+            stopifnot(n %in% colnames(tmp))
+            tmp[!(get(n) %in% filter[[n]]), include:=FALSE]
+        }
+    }
+    
     # add the new feature to the annotation
     tmp[, uniqueID := .GRP, by=keytype]
+    tmp <- tmp[include == TRUE,]
     anno <- anno[tmp[,uniqueID]]
     mcols(anno)[[featureName]] <- tmp[,get(feature)]
         
diff --git a/R/example_functions.R b/R/example_functions.R
index cec001f6..c4b992be 100644
--- a/R/example_functions.R
+++ b/R/example_functions.R
@@ -61,7 +61,7 @@ createTestFraserDataSet <- function(workingDir="FRASER_output", rerun=FALSE){
     if(all(file.exists(hdf5Files))){
         if(isFALSE(rerun)){
             fds <- loadFraserDataSet(workingDir, name="Data_Analysis")
-            if(all(paste0(c("zScores", "padjBetaBinomial", "predictedMeans"),
+            if(all(paste0(c("padjBetaBinomial", "predictedMeans"),
                         "_", rep(psiTypes, 3)) %in% assayNames(fds))){
                 message(date(), ": Use existing cache data.")
                 return(fds)
diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R
index c0bb9d65..ed19427f 100644
--- a/R/getNSetterFuns.R
+++ b/R/getNSetterFuns.R
@@ -43,6 +43,9 @@
 #' # get statistics
 #' pVals(fds)
 #' padjVals(fds)
+#' 
+#' # zscore not calculated by default
+#' fds <- calculateZscore(fds, type="jaccard")
 #' zScores(fds)
 #' 
 #' # set and get pseudocount
@@ -701,7 +704,6 @@ getPlottingDT <- function(fds, axis=c("row", "col"), type=currentType(fds),
             pval      = c(pVals(fds, type=type, 
                                 level=pvalLevel)[idxrow, idxcol]),
             padj      = c(padjVals(fds, type=type)[idxrow, idxcol]),
-            zscore    = c(zScores(fds, type=type)[idxrow, idxcol]),
             obsPsi    = c(k/n),
             predPsi   = c(predictedMeans(fds, type)[idxrow, idxcol]),
             rho       = rep(rho(fds, type=type)[idxrow], 
@@ -711,7 +713,7 @@ getPlottingDT <- function(fds, axis=c("row", "col"), type=currentType(fds),
 
     # add aberrant information to it
     aberrantVec <- aberrant(fds, ..., padjVals=dt[,.(padj)],
-        dPsi=dt[,.(deltaPsi)], zscores=dt[,.(zscore)], n=dt[,.(n)], 
+        dPsi=dt[,.(deltaPsi)], n=dt[,.(n)], 
         rhoVals=dt[,.(rho)], aggregate=FALSE)
     dt[,aberrant:=aberrantVec]
 
diff --git a/R/plotMethods.R b/R/plotMethods.R
index 95354df1..e53ce761 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -31,7 +31,7 @@
 #' @param idx A junction site ID or gene ID or one of both, which
 #'             should be plotted. Can also be a vector. Integers are treated
 #'             as indices.
-#' @param padjCutoff,zScoreCutoff,deltaPsiCutoff Significance, Z-score or delta
+#' @param padjCutoff,deltaPsiCutoff Significance or delta
 #'             psi cutoff to mark outliers
 #' @param global Flag to plot a global Q-Q plot, default FALSE
 #' @param normalized If TRUE, the normalized psi values are used, the default,
@@ -282,7 +282,7 @@
 #'     require(org.Hs.eg.db)
 #'     orgDb <- org.Hs.eg.db
 #'  
-#'     res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA, zScoreCutoff=NA)
+#'     res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA)
 #'     res_dt <- as.data.table(res)
 #'     res_dt <- res_dt[sampleID == "sample2",]
 #'     
@@ -403,7 +403,7 @@ setMethod("plotVolcano", signature="FraserDataSet", plotVolcano.FRASER)
 
 plotAberrantPerSample.FRASER <- function(object, main, 
                     type=fitMetrics(object),
-                    padjCutoff=0.1, zScoreCutoff=NA, deltaPsiCutoff=0.1,
+                    padjCutoff=0.1, deltaPsiCutoff=0.1,
                     aggregate=TRUE, BPPARAM=bpparam(), ...){
 
     type <- match.arg(type, several.ok=TRUE)
@@ -417,7 +417,7 @@ plotAberrantPerSample.FRASER <- function(object, main,
 
     # extract outliers
     outliers <- bplapply(type, aberrant, object=object, by="sample",
-            padjCutoff=padjCutoff, zScoreCutoff=zScoreCutoff,
+            padjCutoff=padjCutoff, 
             deltaPsiCutoff=deltaPsiCutoff, aggregate=aggregate, ..., 
             BPPARAM=BPPARAM)
     dt2p <- rbindlist(lapply(seq_along(outliers), function(idx){
diff --git a/R/resultAnnotations.R b/R/resultAnnotations.R
index 2dbd9a5b..bfb57620 100644
--- a/R/resultAnnotations.R
+++ b/R/resultAnnotations.R
@@ -69,9 +69,9 @@
 #'   # run this function before creating the results table
 #'   fds <- annotateIntronReferenceOverlap(fds, txdb) 
 #' 
-#'   # extract results: for this small example dataset, only a z score cutoff 
-#'   # of 1 is used to get at least one result. 
-#'   res <- results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=NA)
+#'   # extract results: for this small example dataset, no cutoffs used 
+#'   # to get some results 
+#'   res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA)
 #'   
 #'   # annotate the type of splice event and UTR overlap
 #'   res <- annotateSpliceEventType(result=res, txdb=txdb, fds=fds)
diff --git a/man/annotateRanges.Rd b/man/annotateRanges.Rd
index 4d1639f9..e74728b2 100644
--- a/man/annotateRanges.Rd
+++ b/man/annotateRanges.Rd
@@ -20,7 +20,8 @@ annotateRangesWithTxDb(
   featureName = "hgnc_symbol",
   keytype = "ENTREZID",
   txdb = NULL,
-  orgDb = NULL
+  orgDb = NULL,
+  filter = list()
 )
 }
 \arguments{
@@ -52,6 +53,11 @@ one is used, currently this is
 
 \item{orgDb}{An \code{orgDb} object or a data table to map the feature names.
 If this is NULL, then \code{org.Hs.eg.db} is used as the default.}
+
+\item{filter}{A named list specifying the filters which should be applied to 
+subset to e.g. only protein-coding genes for annotation. 
+\code{names(filter)} needs to be column names in the given 
+orgDb object (default: no filtering).}
 }
 \value{
 FraserDataSet
diff --git a/man/getter_setter_functions.Rd b/man/getter_setter_functions.Rd
index c57387d7..19828cc5 100644
--- a/man/getter_setter_functions.Rd
+++ b/man/getter_setter_functions.Rd
@@ -198,6 +198,9 @@ rho(fds)
 # get statistics
 pVals(fds)
 padjVals(fds)
+
+# zscore not calculated by default
+fds <- calculateZscore(fds, type="jaccard")
 zScores(fds)
 
 # set and get pseudocount
diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd
index 927d3d11..79b1db7c 100644
--- a/man/plotFunctions.Rd
+++ b/man/plotFunctions.Rd
@@ -42,7 +42,6 @@ plotManhattan(object, ...)
   main,
   type = fitMetrics(object),
   padjCutoff = 0.1,
-  zScoreCutoff = NA,
   deltaPsiCutoff = 0.1,
   aggregate = TRUE,
   BPPARAM = bpparam(),
@@ -202,7 +201,7 @@ samples. Labelling can be turned off by setting
 \code{label=NULL}. The user can also provide a custom 
 list of gene symbols or sampleIDs.}
 
-\item{padjCutoff, zScoreCutoff, deltaPsiCutoff}{Significance, Z-score or delta
+\item{padjCutoff, deltaPsiCutoff}{Significance or delta
 psi cutoff to mark outliers}
 
 \item{BPPARAM}{BiocParallel parameter to use.}
@@ -527,7 +526,7 @@ if(require(SGSeq)){
     require(org.Hs.eg.db)
     orgDb <- org.Hs.eg.db
  
-    res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA, zScoreCutoff=NA)
+    res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA)
     res_dt <- as.data.table(res)
     res_dt <- res_dt[sampleID == "sample2",]
     
diff --git a/man/results.Rd b/man/results.Rd
index 500bcbb6..7c536129 100644
--- a/man/results.Rd
+++ b/man/results.Rd
@@ -9,7 +9,6 @@
   object,
   sampleIDs = samples(object),
   padjCutoff = 0.05,
-  zScoreCutoff = NA,
   deltaPsiCutoff = 0.1,
   rhoCutoff = 1,
   aggregate = FALSE,
@@ -26,7 +25,6 @@
   type = fitMetrics(object),
   padjCutoff = 0.05,
   deltaPsiCutoff = 0.1,
-  zScoreCutoff = NA,
   minCount = 5,
   rhoCutoff = 1,
   by = c("none", "sample", "feature"),
@@ -43,8 +41,6 @@ retrieved}
 
 \item{padjCutoff}{The FDR cutoff to be applied or NA if not requested.}
 
-\item{zScoreCutoff}{The z-score cutoff to be applied or NA if not requested.}
-
 \item{deltaPsiCutoff}{The cutoff on delta psi or NA if not requested.}
 
 \item{rhoCutoff}{The cutoff value on the fitted rho value 
@@ -80,7 +76,7 @@ are included.}
 \code{sample} or \code{feature} is specified the sum by 
 sample or feature is returned}
 
-\item{...}{Further arguments can be passed to the method. If "n", "zscores", 
+\item{...}{Further arguments can be passed to the method. If "n",
 "padjVals", "dPsi" or "rhoVals" are given, the values of those 
 arguments are used to define the aberrant events.}
 }
@@ -102,18 +98,17 @@ fds <- createTestFraserDataSet()
 
 # extract results: for this example dataset, no cutoffs are used to
 # get at least one result and show the output
-res <- results(fds, padjCutoff=NA, zScoreCutoff=NA, deltaPsiCutoff=NA)
+res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA)
 res
 
 # aggregate the results by genes (gene symbols need to be annotated first 
 # using annotateRanges() function)
-results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=0.1, 
-        aggregate=TRUE)
+results(fds, padjCutoff=NA, deltaPsiCutoff=0.1, aggregate=TRUE)
 
 # aggregate the results by genes and collapse over all psi types to obtain 
 # only one row per gene in the results table 
-results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=0.1, 
-        aggregate=TRUE, collapse=TRUE)
+results(fds, padjCutoff=NA, deltaPsiCutoff=0.1, aggregate=TRUE, 
+        collapse=TRUE)
 
 # get aberrant events per sample: on the example data, nothing is aberrant
 # based on the adjusted p-value
@@ -121,8 +116,7 @@ aberrant(fds, type="jaccard", by="sample")
 
 # get aberrant events per gene (first annotate gene symbols)
 fds <- annotateRangesWithTxDb(fds)
-aberrant(fds, type="jaccard", by="feature", zScoreCutoff=2, padjCutoff=NA,
-        aggregate=TRUE)
+aberrant(fds, type="jaccard", by="feature", padjCutoff=NA, aggregate=TRUE)
         
 # find aberrant junctions/splice sites
 aberrant(fds, type="jaccard")
diff --git a/man/spliceTypeAnnotations.Rd b/man/spliceTypeAnnotations.Rd
index 4556b7b6..0b14439c 100644
--- a/man/spliceTypeAnnotations.Rd
+++ b/man/spliceTypeAnnotations.Rd
@@ -119,9 +119,9 @@ regions of the genome are determined from the provided BED file.
   # run this function before creating the results table
   fds <- annotateIntronReferenceOverlap(fds, txdb) 
 
-  # extract results: for this small example dataset, only a z score cutoff 
-  # of 1 is used to get at least one result. 
-  res <- results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=NA)
+  # extract results: for this small example dataset, no cutoffs used 
+  # to get some results 
+  res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA)
   
   # annotate the type of splice event and UTR overlap
   res <- annotateSpliceEventType(result=res, txdb=txdb, fds=fds)
diff --git a/tests/testthat/test_fraser_pipeline.R b/tests/testthat/test_fraser_pipeline.R
index 55b0e54d..45c9b2ff 100644
--- a/tests/testthat/test_fraser_pipeline.R
+++ b/tests/testthat/test_fraser_pipeline.R
@@ -4,8 +4,7 @@ test_that("FRASER function", {
     fds <- createTestFraserDataSet()
     expect_is(fds, "FraserDataSet")
     anames <- c(psiTypes, paste0(c("delta", "predictedMeans", 
-            "pvaluesBetaBinomial", "padjBetaBinomial", 
-            "zScores"), "_", 
+            "pvaluesBetaBinomial", "padjBetaBinomial"), "_", 
             rep(fitMetrics(fds), 5)))
     expect_equal(anames %in% assayNames(fds), !logical(length(anames)))
 })
diff --git a/tests/testthat/test_plotJunctionDist.R b/tests/testthat/test_plotJunctionDist.R
index 1394f13c..43022359 100644
--- a/tests/testthat/test_plotJunctionDist.R
+++ b/tests/testthat/test_plotJunctionDist.R
@@ -3,7 +3,7 @@ context("Test distribution plots for given results/junction")
 test_that("Main junction distribution plot", {
     # get results
     fds <- getFraser()
-    res <- results(fds, padjCutoff=1, zScoreCutoff=NA, deltaPsiCutoff=NA)
+    res <- results(fds, padjCutoff=1, deltaPsiCutoff=NA)
 
     # plot distributions
     expect_silent(plotExpression(fds, result=res[1]))
diff --git a/tests/testthat/test_stats.R b/tests/testthat/test_stats.R
index 5e389ec8..dcbe5bea 100644
--- a/tests/testthat/test_stats.R
+++ b/tests/testthat/test_stats.R
@@ -17,19 +17,19 @@ test_that("PSI value calculation", {
     expect_true(all(N(fds, "psi5")[ is.na(psiVal)] == 0))
 })
 
-test_that("Zscore calculation", {
-    fds <- getFraser(clean = TRUE)
-    
-    # prepare zScore input for logit scale
-    psiVal <- (K(fds, "jaccard") + pseudocount())/(N(fds, "jaccard") + 2*pseudocount())
-    mu <- predictedMeans(fds, "jaccard")
-    residual <- qlogis(psiVal) - qlogis(mu)
-    
-    # compute zscore
-    zscores <- (residual - rowMeans(residual)) / rowSds(residual)
-    
-    expect_equal(zscores, zScores(fds, "jaccard"))
-})
+# test_that("Zscore calculation", {
+#     fds <- getFraser(clean = TRUE)
+#     
+#     # prepare zScore input for logit scale
+#     psiVal <- (K(fds, "jaccard") + pseudocount())/(N(fds, "jaccard") + 2*pseudocount())
+#     mu <- predictedMeans(fds, "jaccard")
+#     residual <- qlogis(psiVal) - qlogis(mu)
+#     
+#     # compute zscore
+#     zscores <- (residual - rowMeans(residual)) / rowSds(residual)
+#     
+#     expect_equal(zscores, zScores(fds, "jaccard"))
+# })
 
 test_that("Gene p value calculation with NAs", {
     fds <- getFraser()
diff --git a/vignettes/FRASER.Rnw b/vignettes/FRASER.Rnw
index 51783ed1..ef49a129 100644
--- a/vignettes/FRASER.Rnw
+++ b/vignettes/FRASER.Rnw
@@ -247,8 +247,8 @@ fds <- FRASER(fds, q=c(jaccard=2))
 
 # get results: we recommend to use an FDR cutoff 0.05, but due to the small
 # dataset size we extract all events and their associated values
-# eg: res <- results(fds, zScoreCutoff=NA, padjCutoff=0.05, deltaPsiCutoff=0.3)
-res <- results(fds, zScoreCutoff=NA, padjCutoff=NA, deltaPsiCutoff=NA)
+# eg: res <- results(fds, padjCutoff=0.05, deltaPsiCutoff=0.1)
+res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA)
 res
 
 # result visualization
@@ -412,9 +412,9 @@ be processed with the same protocol and origin from the same tissue.
 \label{sec:filtering}
 
 Before we can filter the data, we have to compute the main splicing metric:
-the $\psi$-value (Percent Spliced In).
+the $\psi$-value (Percent Spliced In) and the Intron Jaccard Index.
 
-<<calculate psi/zscore values, echo=TRUE>>=
+<<calculate psi/jaccard values, echo=TRUE>>=
 fds <- calculatePSIValues(fds)
 fds
 @
@@ -547,7 +547,7 @@ the following additional information:
     \item type: the metric for which the aberrant event was detected (either
     jaccard for Intron Jaccard Index or psi5 for $\psi_5$, psi3 for $\psi_3$ or 
     theta for $\theta$)
-    \item pValue, padjust, zScore: the p-value, adjusted p-value and z-score of
+    \item pValue, padjust: the p-value and adjusted p-value (FDR) of
     this event (at intron or splice site level depending on metric)
     \item pValueGene, padjustGene: only present in the gene-level results table,
     gives the p-value and FDR adjusted p-value at gene-level
@@ -584,8 +584,8 @@ sashimi plot for an outlier in the results table directly in R (if paths to
 bam files are available in the \fds{} object).
 
 <<result_table, echo=TRUE>>=
-# to show result visualization functions for this tuturial, zScore cutoff used
-res <- results(fds, zScoreCutoff=2, padjCutoff=NA, deltaPsiCutoff=NA)
+# to show result visualization functions for this tuturial, no cutoff used
+res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA)
 res
 
 # for the gene level pvalues, gene symbols need to be annotated the fds object
@@ -593,8 +593,7 @@ res
 # as we previously called FRASER() before annotating genes, we run it again here
 fds <- calculatePadjValues(fds, type="jaccard", geneLevel=TRUE)
 # generate gene-level results table (if gene symbols have been annotated)
-res_gene <- results(fds, aggregate=TRUE, padjCutoff=NA, deltaPsiCutoff=NA, 
-                    zScoreCutoff=2)
+res_gene <- results(fds, aggregate=TRUE, padjCutoff=NA, deltaPsiCutoff=NA)
 res_gene
 @
 
@@ -733,30 +732,6 @@ fds <- calculatePadjValues(fds, type="jaccard", method="BY")
 head(padjVals(fds,type="jaccard"))
 @
 
-\subsection{Z-score calculation}
-\label{sec:Z-score-calculation}
-
-To calculate z-scores on the logit transformed $\Delta\psi$ values and to store
-them in the \fds{} object, the function \Rfunction{calculateZScores} can be
-called. The Z-scores can be used for visualization, filtering, and ranking of
-samples. The Z-scores are calculated as follows:
-
-\begin{equation}
-    z_{ij} = \frac{\delta_{ij} - \bar{\delta_j}}{sd(\delta_j)}
-\end{equation}
-\begin{equation*}
-    \delta_{ij} = logit{(\frac{k_{ij} + 1}{n_{ij} + 2})} - logit{(\mu_{ij})},
-\end{equation*}
-
-where $\delta_{ij}$ is the difference on the logit scale between the measured
-counts and the counts after correction for confounders and $\bar{\delta_j}$ is
-the mean of intron $j$.
-
-<<z-score calculation, echo=TRUE>>=
-fds <- calculateZscore(fds, type="jaccard")
-head(zScores(fds, type="jaccard"))
-@
-
 \subsection{Result visualization}
 \label{sec:result-vis}
 

From 39d03b29f4b79719f8fdfe213773e326921c2691 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Wed, 7 Dec 2022 13:10:55 +0100
Subject: [PATCH 52/80] fixed gene-level pvalue retrieval to work with and
 without rhoCutoff

---
 R/AllGenerics.R                | 11 +++++++++--
 R/getNSetterFuns.R             | 28 ++++++++++++++++------------
 R/pvalsNzscore.R               | 22 +++++++++++++---------
 man/FRASER.Rd                  |  4 ++--
 man/getter_setter_functions.Rd |  4 ++--
 man/results.Rd                 |  4 ++--
 6 files changed, 44 insertions(+), 29 deletions(-)

diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index 55fe3f97..886f625d 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -726,6 +726,9 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
     
     # only extract results for requested psiTypes if pvals exist for them
     stopifnot(all(psiType %in% psiTypes))
+    if(is.na(rhoCutoff)){
+        rhoCutoff <- 1
+    }
     pvalsAvailable <- checkPadjAvailableForFilters(object, type=psiType,
                                                    filters=list(rho=rhoCutoff), 
                                                    aggregate=aggregate)
@@ -928,7 +931,7 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
 setMethod("results", "FraserDataSet", function(object, 
                     sampleIDs=samples(object), padjCutoff=0.05,
                     deltaPsiCutoff=0.1,
-                    rhoCutoff=1, aggregate=FALSE, collapse=FALSE,
+                    rhoCutoff=NA, aggregate=FALSE, collapse=FALSE,
                     minCount=5, psiType=psiTypes,
                     geneColumn="hgnc_symbol",
                     additionalColumns=NULL, BPPARAM=bpparam()){
@@ -942,7 +945,7 @@ setMethod("results", "FraserDataSet", function(object,
 
 aberrant.FRASER <- function(object, type=fitMetrics(object), 
                                 padjCutoff=0.05, deltaPsiCutoff=0.1, 
-                                minCount=5, rhoCutoff=1,
+                                minCount=5, rhoCutoff=NA,
                                 by=c("none", "sample", "feature"), 
                                 aggregate=FALSE, geneColumn="hgnc_symbol", ...){
     
@@ -953,6 +956,10 @@ aberrant.FRASER <- function(object, type=fitMetrics(object),
     by <- match.arg(by)
     type <- match.arg(type)
     
+    if(is.na(rhoCutoff)){
+        rhoCutoff <- 1
+    }
+    
     dots <- list(...)
     if("n" %in% names(dots)){
         n <- dots[['n']]
diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R
index ed19427f..250ad922 100644
--- a/R/getNSetterFuns.R
+++ b/R/getNSetterFuns.R
@@ -254,7 +254,7 @@ zScores <- function(fds, type=currentType(fds), byGroup=FALSE, ...){
 #' @describeIn getter_setter_functions This returns the calculated p-values.
 #' @export
 pVals <- function(fds, type=currentType(fds), level="site", 
-                    filters=list(rho=1), dist="BetaBinomial", ...){
+                    filters=list(), dist="BetaBinomial", ...){
     level <- match.arg(level, choices=c("site", "junction", "gene"))
     dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal"))
     aname <- paste0("pvalues", dist)
@@ -269,10 +269,14 @@ pVals <- function(fds, type=currentType(fds), level="site",
     } else{ 
         aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname)
         # add information on used filters
+        if(is.null(names(filters))){
+            filters <- list(rho=1)
+        }
         for(n in sort(names(filters))){
             aname_new <- paste0(aname, "_", n, filters[[n]])
             if(n == "rho" && filters[[n]] == 1){
-                if(any(grepl(aname_new, assayNames(fds)))){
+                if(any(grepl(aname_new, assayNames(fds))) ||
+                        any(grepl(aname_new, names(metadata(fds))))){
                     aname <- aname_new
                 }
             }else{
@@ -292,7 +296,7 @@ pVals <- function(fds, type=currentType(fds), level="site",
 }
 
 `pVals<-` <- function(fds, type=currentType(fds), level="site",
-                    filters=list(rho=1),
+                    filters=list(),
                     dist="BetaBinomial", ..., value){
     level <- match.arg(level, choices=c("site", "junction", "gene"))
     dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal"))
@@ -306,9 +310,7 @@ pVals <- function(fds, type=currentType(fds), level="site",
     }
     # add information on used filters
     for(n in sort(names(filters))){
-        if(!(n == "rho" && filters[[n]] == 1)){
-            aname <- paste0(aname, "_", n, filters[[n]])
-        }
+        aname <- paste0(aname, "_", n, filters[[n]])
     }
     
     if(level == "gene"){
@@ -325,7 +327,7 @@ pVals <- function(fds, type=currentType(fds), level="site",
 #' @describeIn getter_setter_functions This returns the adjusted p-values.
 #' @export
 padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"), 
-                    level="site", filters=list(rho=1), ...){
+                    level="site", filters=list(), ...){
     level <- match.arg(level, choices=c("site", "gene"))
     dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal"))
     aname <- paste0("padj", dist)
@@ -333,8 +335,12 @@ padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"),
     # add information on used filters
     for(n in sort(names(filters))){
         aname_new <- paste0(aname, "_", n, filters[[n]])
+        if(is.null(names(filters))){
+            filters <- list(rho=1)
+        }
         if(n == "rho" && filters[[n]] == 1){
-            if(any(grepl(aname_new, assayNames(fds)))){
+            if(any(grepl(aname_new, assayNames(fds))) ||
+                    any(grepl(aname_new, names(metadata(fds))))){
                 aname <- aname_new
             }
         }else{
@@ -352,16 +358,14 @@ padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"),
 }
 
 `padjVals<-` <- function(fds, type=currentType(fds), level="site",
-                    dist="BetaBinomial", filters=list(rho=1), ..., value){
+                    dist="BetaBinomial", filters=list(), ..., value){
     level <- match.arg(level, choices=c("site", "gene"))
     dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal"))
     aname <- paste0("padj", dist)
     aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname)
     # add information on used filters
     for(n in sort(names(filters))){
-        if(!(n == "rho" && filters[[n]] == 1)){
-            aname <- paste0(aname, "_", n, filters[[n]])
-        }
+        aname <- paste0(aname, "_", n, filters[[n]])
     }
     if(level == "gene"){
         if(is.null(rownames(value))){
diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R
index f1dbcb45..61a8a2da 100644
--- a/R/pvalsNzscore.R
+++ b/R/pvalsNzscore.R
@@ -222,7 +222,7 @@ singlePvalueBinomial <- function(idx, k, n, mu){
 #' multiple testing correction.
 #' @param rhoCutoff The cutoff value on the fitted rho value 
 #'     (overdispersion parameter of the betabinomial) above which junctions are 
-#'     masked with NA during p value adjustment. 
+#'     masked with NA during p value adjustment (default: NA, no masking). 
 #' @param geneLevel Logical value indiciating whether gene-level p values 
 #'     should be calculated. Defaults to TRUE.
 #' @param geneColumn The column name of the column that has the gene annotation 
@@ -230,7 +230,7 @@ singlePvalueBinomial <- function(idx, k, n, mu){
 #' 
 #' @export
 calculatePadjValues <- function(fds, type=currentType(fds), method="BY",
-                                rhoCutoff=1, geneLevel=TRUE, 
+                                rhoCutoff=NA, geneLevel=TRUE, 
                                 geneColumn="hgnc_symbol", BPPARAM=bpparam()){
     currentType(fds) <- type
     index <- getSiteIndex(fds, type=type)
@@ -247,11 +247,15 @@ calculatePadjValues <- function(fds, type=currentType(fds), method="BY",
         
         # splice site-level pval correction
         message(date(), ":  adjusting junction-level pvalues ...")
-        # fwer_pvals <- getFWERpvals_byIdx(pvals, index, rho, method="holm", 
-        #                                 rhoCutoff=rhoCutoff, BPPARAM=BPPARAM)
         fwer_pvals <- getFWERpvals_bySample(pvals, index, rho, method="holm", 
-                                        rhoCutoff=rhoCutoff, BPPARAM=BPPARAM)
-        pVals(fds, dist=i, level="site", filters=list(rho=rhoCutoff),
+                            rhoCutoff=ifelse(is.na(rhoCutoff), 1, rhoCutoff), 
+                            BPPARAM=BPPARAM)
+        if(!is.na(rhoCutoff)){
+            filters <- list(rho=rhoCutoff)
+        } else{
+            filters <- list()
+        }
+        pVals(fds, dist=i, level="site", filters=filters,
                 withDimnames=FALSE) <- fwer_pvals
         
         # junction-level FDR correction
@@ -259,7 +263,7 @@ calculatePadjValues <- function(fds, type=currentType(fds), method="BY",
         padj <- apply(fwer_pvals[idx,], 2, p.adjust, method=method)
         padjDT <- data.table(cbind(i=unique(index), padj), key="i")[J(index)]
         padjDT[,i:=NULL]
-        padjVals(fds, dist=i, level="site", filters=list(rho=rhoCutoff),
+        padjVals(fds, dist=i, level="site", filters=filters,
                 withDimnames=FALSE) <- as.matrix(padjDT)
         
         # gene-level pval correction and FDR
@@ -270,9 +274,9 @@ calculatePadjValues <- function(fds, type=currentType(fds), method="BY",
                                             method="holm", FDRmethod=method, 
                                             geneColumn=geneColumn,
                                             BPPARAM=BPPARAM)
-            pVals(fds, dist=i, level="gene", filters=list(rho=rhoCutoff),
+            pVals(fds, dist=i, level="gene", filters=filters,
                     withDimnames=FALSE) <- gene_pvals[["pvals"]]
-            padjVals(fds, dist=i, level="gene", filters=list(rho=rhoCutoff),
+            padjVals(fds, dist=i, level="gene", filters=filters,
                     withDimnames=FALSE) <- gene_pvals[["padj"]]
         } else if(isTRUE(geneLevel)){
             warning("Gene-level pvalues could not be calculated as column ",
diff --git a/man/FRASER.Rd b/man/FRASER.Rd
index f5961c5b..1b2e5f72 100644
--- a/man/FRASER.Rd
+++ b/man/FRASER.Rd
@@ -34,7 +34,7 @@ calculatePadjValues(
   fds,
   type = currentType(fds),
   method = "BY",
-  rhoCutoff = 1,
+  rhoCutoff = NA,
   geneLevel = TRUE,
   geneColumn = "hgnc_symbol",
   BPPARAM = bpparam()
@@ -87,7 +87,7 @@ multiple testing correction.}
 
 \item{rhoCutoff}{The cutoff value on the fitted rho value 
 (overdispersion parameter of the betabinomial) above which junctions are 
-masked with NA during p value adjustment.}
+masked with NA during p value adjustment (default: NA, no masking).}
 
 \item{geneLevel}{Logical value indiciating whether gene-level p values 
 should be calculated. Defaults to TRUE.}
diff --git a/man/getter_setter_functions.Rd b/man/getter_setter_functions.Rd
index 19828cc5..3f25c8b6 100644
--- a/man/getter_setter_functions.Rd
+++ b/man/getter_setter_functions.Rd
@@ -41,7 +41,7 @@ pVals(
   fds,
   type = currentType(fds),
   level = "site",
-  filters = list(rho = 1),
+  filters = list(),
   dist = "BetaBinomial",
   ...
 )
@@ -51,7 +51,7 @@ padjVals(
   type = currentType(fds),
   dist = c("BetaBinomial"),
   level = "site",
-  filters = list(rho = 1),
+  filters = list(),
   ...
 )
 
diff --git a/man/results.Rd b/man/results.Rd
index 7c536129..4da3bf1b 100644
--- a/man/results.Rd
+++ b/man/results.Rd
@@ -10,7 +10,7 @@
   sampleIDs = samples(object),
   padjCutoff = 0.05,
   deltaPsiCutoff = 0.1,
-  rhoCutoff = 1,
+  rhoCutoff = NA,
   aggregate = FALSE,
   collapse = FALSE,
   minCount = 5,
@@ -26,7 +26,7 @@
   padjCutoff = 0.05,
   deltaPsiCutoff = 0.1,
   minCount = 5,
-  rhoCutoff = 1,
+  rhoCutoff = NA,
   by = c("none", "sample", "feature"),
   aggregate = FALSE,
   geneColumn = "hgnc_symbol",

From e9a64861b5b6284d9ea9b37bd520790642b7fa5e Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Wed, 7 Dec 2022 15:00:52 +0100
Subject: [PATCH 53/80] further fix to correctly extract gene pvalues

---
 R/AllGenerics.R      | 6 ++++--
 R/getNSetterFuns.R   | 6 +++---
 R/helper-functions.R | 3 ++-
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index 886f625d..74433c6a 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -779,8 +779,10 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
                                                 aggregate=FALSE,
                                                 geneColumn=geneColumn)
             if(isTRUE(aggregate)){
-                pvalsGene    <- as.matrix(pVals(tmp_x, level="gene"))
-                padjsGene    <- as.matrix(padjVals(tmp_x, level="gene"))
+                pvalsGene    <- as.matrix(pVals(tmp_x, level="gene", 
+                                                filters=list(rho=rhoCutoff)))
+                padjsGene    <- as.matrix(padjVals(tmp_x, level="gene", 
+                                                filters=list(rho=rhoCutoff)))
                 aberrantGene <- aberrant.FRASER(tmp_x, type=type, 
                                                 padjCutoff=fdrCutoff, 
                                                 deltaPsiCutoff=dPsiCutoff, 
diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R
index 250ad922..0a03576f 100644
--- a/R/getNSetterFuns.R
+++ b/R/getNSetterFuns.R
@@ -333,11 +333,11 @@ padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"),
     aname <- paste0("padj", dist)
     aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname)
     # add information on used filters
+    if(is.null(names(filters))){
+        filters <- list(rho=1)
+    }
     for(n in sort(names(filters))){
         aname_new <- paste0(aname, "_", n, filters[[n]])
-        if(is.null(names(filters))){
-            filters <- list(rho=1)
-        }
         if(n == "rho" && filters[[n]] == 1){
             if(any(grepl(aname_new, assayNames(fds))) ||
                     any(grepl(aname_new, names(metadata(fds))))){
diff --git a/R/helper-functions.R b/R/helper-functions.R
index 54593b28..ae0dba66 100644
--- a/R/helper-functions.R
+++ b/R/helper-functions.R
@@ -564,7 +564,8 @@ checkPadjAvailableForFilters <- function(fds, type=currentType(fds),
     for(n in sort(names(filters))){
         aname_new <- paste0(aname, "_", n, filters[[n]])
         if(n == "rho" && filters[[n]] == 1){
-            if(any(grepl(aname_new, assayNames(fds)))){
+            if(any(grepl(aname_new, assayNames(fds))) ||
+                    any(grepl(aname_new, names(metadata(fds))))){
                 aname <- aname_new
             }
         }else{

From 02fbb8412fcf6adbbbd3e5821ba27b79e942a33b Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Fri, 9 Dec 2022 14:48:52 +0100
Subject: [PATCH 54/80] minor improvements to plots

---
 R/plotMethods.R | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/R/plotMethods.R b/R/plotMethods.R
index e53ce761..e21471b3 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -431,7 +431,7 @@ plotAberrantPerSample.FRASER <- function(object, main,
         geom_line() +
         geom_hline(aes(yintercept=median, color=type, lty="Median")) +
         theme_bw() +
-        theme_cowplot() +
+        theme_cowplot() + background_grid(major="xy", minor="xy") +
         ggtitle(main) +
         xlab("Sample rank") +
         ylab("Number of outliers") +
@@ -440,7 +440,7 @@ plotAberrantPerSample.FRASER <- function(object, main,
         scale_linetype_manual(name="", values=2, labels="Median")
 
     if(!all(dt2p[,value] == 0)){
-        g <- g + scale_y_log10()
+        g <- g + scale_y_log10() + annotation_logticks(sides="l")
     }
     
     g
@@ -846,6 +846,8 @@ plotEncDimSearch.FRASER <- function(object, type=psiTypes,
             geom_smooth(method="loess", formula=y~x) +
             geom_vline(data=data[isOptimalQ == TRUE,], 
                     mapping=aes(xintercept=q, col=nsubset, linetype=noise)) +
+            geom_text(data=data[isOptimalQ == TRUE,], 
+                        aes(y=0.0, q+1, label=q)) +
             ggtitle(as.expression(bquote(bold(paste(
                 "Q estimation for ", .(ggplotLabelPsi(type)[[1]])))))) +
             xlab("Estimated q") +

From 7e26a3654176d427bf1f543f7b29c53d146a2857 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Fri, 9 Dec 2022 18:00:07 +0100
Subject: [PATCH 55/80] added support to get results table only for subset of
 genes

---
 NAMESPACE                   |   1 +
 R/AllGenerics.R             | 257 +++++++++++++++++++++++++++++++-----
 R/FRASER-package.R          |   2 +-
 R/Fraser-pipeline.R         |   6 +
 R/pvalsNzscore.R            |  27 +++-
 man/FRASER.Rd               |   8 +-
 man/results.Rd              |  36 +++++
 tests/testthat/test_stats.R |   4 +-
 8 files changed, 299 insertions(+), 42 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 436647d3..f1ca889f 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -262,6 +262,7 @@ importFrom(VGAM,rbetabinom)
 importFrom(VGAM,vglm)
 importFrom(biomaRt,getBM)
 importFrom(biomaRt,useEnsembl)
+importFrom(cowplot,background_grid)
 importFrom(cowplot,theme_cowplot)
 importFrom(extraDistr,dbbinom)
 importFrom(extraDistr,pbbinom)
diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index 74433c6a..f37cb6dd 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -714,6 +714,7 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
                             dPsiCutoff, minCount, rhoCutoff, psiType, 
                             maxCols=20, aggregate=FALSE, collapse=FALSE,
                             geneColumn="hgnc_symbol", BPPARAM=bpparam(), 
+                            geneSubset=NULL, subsetName, fullSubset=FALSE, 
                             additionalColumns=NULL){
     
     stopifnot(is(object, "FraserDataSet"))
@@ -744,6 +745,16 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
         currentType(object) <- type
         gr <- rowRanges(object, type=type)
         
+        # calculate FDR on subset first if requested
+        if(!is.null(geneSubset)){
+            object <- calculatePadjValuesOnSubset(fds=object, type=type, 
+                                                  genesToTest=geneSubset, 
+                                                  subsetName=subsetName, 
+                                                  geneColumn=geneColumn)
+            fdr_subset <- metadata(object)[[paste0("FDR_", subsetName)]]
+            object <- object[, fdr_subset[, unique(sampleID)]]
+        } 
+        
         # first get row means
         rowMeansK <- rowMeans(K(object, type=type))
         rowMeansN <- rowMeans(N(object, type=type))
@@ -771,31 +782,100 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
                 (rawTotalCts + 2*pseudocount())
             deltaPsiVals <- deltaPsiValue(tmp_x, type)
             rho          <- rho(tmp_x, type)
-            aberrant     <- aberrant.FRASER(tmp_x, type=type, 
+            if(is.null(geneSubset)){
+                aberrant     <- aberrant.FRASER(tmp_x, type=type, 
                                                 padjCutoff=fdrCutoff, 
                                                 deltaPsiCutoff=dPsiCutoff, 
                                                 minCount=minCount, 
                                                 rhoCutoff=rhoCutoff,
                                                 aggregate=FALSE,
                                                 geneColumn=geneColumn)
+            } else{
+                aberrant     <- aberrant.FRASER(tmp_x, type=type, 
+                                                padjCutoff=fdrCutoff, 
+                                                deltaPsiCutoff=dPsiCutoff, 
+                                                minCount=minCount, 
+                                                rhoCutoff=rhoCutoff,
+                                                aggregate=FALSE,
+                                                geneColumn=geneColumn,
+                                                FDR_subset=fdr_subset[sampleID 
+                                                        %in% samples(tmp_x)],
+                                                fullSubset=fullSubset)
+            }
             if(isTRUE(aggregate)){
                 pvalsGene    <- as.matrix(pVals(tmp_x, level="gene", 
                                                 filters=list(rho=rhoCutoff)))
                 padjsGene    <- as.matrix(padjVals(tmp_x, level="gene", 
                                                 filters=list(rho=rhoCutoff)))
-                aberrantGene <- aberrant.FRASER(tmp_x, type=type, 
-                                                padjCutoff=fdrCutoff, 
-                                                deltaPsiCutoff=dPsiCutoff, 
-                                                minCount=minCount, 
-                                                rhoCutoff=rhoCutoff,
-                                                aggregate=TRUE,
-                                                geneColumn=geneColumn)
+                if(is.null(geneSubset)){
+                    aberrantGene <- aberrant.FRASER(tmp_x, type=type, 
+                                                    padjCutoff=fdrCutoff, 
+                                                    deltaPsiCutoff=dPsiCutoff, 
+                                                    minCount=minCount, 
+                                                    rhoCutoff=rhoCutoff,
+                                                    aggregate=TRUE,
+                                                    geneColumn=geneColumn)
+                } else{
+                    aberrantGene <- aberrant.FRASER(tmp_x, type=type, 
+                                                    padjCutoff=fdrCutoff, 
+                                                    deltaPsiCutoff=dPsiCutoff, 
+                                                    minCount=minCount, 
+                                                    rhoCutoff=rhoCutoff,
+                                                    aggregate=TRUE,
+                                                    geneColumn=geneColumn,
+                                                    FDR_subset=fdr_subset[sampleID 
+                                                            %in% samples(tmp_x)],
+                                                    fullSubset=fullSubset)
+                
+                    # get row,col idx of genes/samples in subset
+                    subset_gene_padj <- as.matrix(
+                        fdr_subset[sampleID %in% samples(tmp_x), 
+                            .(sapply(gene, 
+                                function(g) which(rownames(pvalsGene) == g)),
+                            sapply(sampleID, 
+                                function(s) which(colnames(tmp_x) ==s)),
+                            pval_gene,
+                            FDR_subset_gene)]
+                    )
+                    # replace pvalsGene and padjsGene with values on subset
+                    geneNames <- rownames(pvalsGene)
+                    pvalsGene <- matrix(NA, 
+                                        nrow=nrow(pvalsGene), ncol=ncol(tmp_x))
+                    rownames(pvalsGene) <- geneNames 
+                    colnames(pvalsGene) <- colnames(tmp_x)
+                    padjsGene <- matrix(NA, 
+                                        nrow=nrow(padjsGene), ncol=ncol(tmp_x))
+                    rownames(padjsGene) <- geneNames
+                    colnames(padjsGene) <- colnames(tmp_x)
+                    if(nrow(subset_gene_padj) > 0){
+                        pvalsGene[subset_gene_padj[,1:2]] <- 
+                            subset_gene_padj[,3]
+                        padjsGene[subset_gene_padj[,1:2]] <- 
+                            subset_gene_padj[,4]
+                    }
+                }
+                
             } else{
                 pvalsGene    <- NULL
                 padjsGene    <- NULL
                 aberrantGene <- NULL
             }
             
+            if(!is.null(geneSubset)){
+                # get row,col idx of introns/samples in subset
+                subset_padj <- as.matrix(
+                    fdr_subset[sampleID %in% samples(tmp_x), .(jidx, 
+                                sapply(sampleID, 
+                                    function(s) which(colnames(tmp_x) ==s)),
+                                FDR_subset)]
+                )
+                # replace padj values with values on subset
+                padj <- matrix(NA, nrow=nrow(tmp_x), ncol=ncol(tmp_x))
+                if(nrow(subset_padj) > 0){
+                    padj[subset_padj[,1:2]] <- subset_padj[,3]
+                }
+            }
+            
             if(length(sc) == 1){
                 colnames(pvals) <- sc
                 colnames(padjs) <- sc
@@ -843,6 +923,15 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
     # sort it if existing
     if(length(ans) > 0){
         ans <- ans[order(ans$pValue)]
+        if(is.null(geneSubset)){
+            mcols(ans)[["FDR_set"]] <- "transcriptome-wide"
+        } else{
+            if(is.null(subsetName)){
+                mcols(ans)[["FDR_set"]] <- "subset"
+            } else{
+                mcols(ans)[["FDR_set"]] <- subsetName
+            }
+        }
     }
     
     # collapse into one row per gene if requested
@@ -890,6 +979,21 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
 #'              types to return only one row per feature (gene) and sample.
 #' @param geneColumn The column name of the column that has the gene annotation 
 #'              that will be used for gene-level pvalue computation.
+#' @param subsets A named list of named lists specifying any number of gene 
+#'              subsets (can differ per sample). For each subset, FDR correction
+#'              will be limited to introns in genes in the subset, and aberrant 
+#'              events passing the FDR cutoff will be reported for each subset 
+#'              separately. See the examples for how to use this argument. 
+#' @param fullSubset Only applies when \code{geneSubset} is not NULL. Specifies 
+#'              whether all introns in given subset of genes should be 
+#'              considered as aberrant, or only those passing the given cutoffs. 
+#'              Defaults to FALSE (introns have to pass the cutoffs in addtion 
+#'              to being in the gene subset to be considered aberrant). 
+#' @param geneSubset A named list giving a subset of genes per sample to which  
+#'              FDR correction should be restricted. The names of the list must 
+#'              correspond to the sampleIDs in the fds object.
+#' @param subsetName The name under which the resulting FDR corrected pvalues 
+#'              on the subset only will be displayed in the result table.
 #' @param ... Further arguments can be passed to the method. If "n",  
 #'              "padjVals", "dPsi" or "rhoVals" are given, the values of those 
 #'              arguments are used to define the aberrant events.
@@ -929,6 +1033,17 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
 #'         
 #' # find aberrant junctions/splice sites
 #' aberrant(fds, type="jaccard")
+#' 
+#' # retrieve results limiting FDR correction to only a subset of genes
+#' # first, we need to create a list of genes per sample that will be tested
+#' geneList <- list('sample1'=c("TIMMDC1"), 'sample2'=c("MCOLN1"))
+#' results(fds, subsets=list('random_subset'=geneList), fullSubset=TRUE)
+#' 
+#' # results for several subsets can be retrieved at the same time:
+#' geneList2 <- list('sample1'=c("MCOLN1", "TIMMDC1"), 'sample2'=c("MCOLN1"))
+#' results(fds, 
+#'         subsets=list('random_subset'=geneList, 'another_subset'=geneList2), 
+#'         fullSubset=TRUE)
 #' @export
 setMethod("results", "FraserDataSet", function(object, 
                     sampleIDs=samples(object), padjCutoff=0.05,
@@ -936,20 +1051,62 @@ setMethod("results", "FraserDataSet", function(object,
                     rhoCutoff=NA, aggregate=FALSE, collapse=FALSE,
                     minCount=5, psiType=psiTypes,
                     geneColumn="hgnc_symbol",
+                    subsets=NULL, fullSubset=FALSE, 
+                    returnTranscriptomewideResults=TRUE,
                     additionalColumns=NULL, BPPARAM=bpparam()){
-    FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff,
-            dPsiCutoff=deltaPsiCutoff, 
-            rhoCutoff=rhoCutoff, minCount=minCount, 
-            psiType=match.arg(psiType, several.ok=TRUE),
-            aggregate=aggregate, collapse=collapse, geneColumn=geneColumn,
-            additionalColumns=additionalColumns, BPPARAM=BPPARAM)
+    psiType <- match.arg(psiType, several.ok=TRUE)
+    if(is.null(subsets)){
+        returnTranscriptomewideResults <- TRUE
+    }
+    if(isTRUE(returnTranscriptomewideResults)){
+        res <- FRASER.results(object=object, sampleIDs=sampleIDs, 
+                fdrCutoff=padjCutoff, dPsiCutoff=deltaPsiCutoff, 
+                rhoCutoff=rhoCutoff, minCount=minCount, 
+                psiType=psiType,
+                aggregate=aggregate, collapse=collapse, geneColumn=geneColumn,
+                geneSubset=NULL, subsetName=NULL, fullSubset=FALSE,
+                additionalColumns=additionalColumns, BPPARAM=BPPARAM)
+    }
+    
+    # add results for FDR_subsets if requested
+    if(!is.null(subsets)){
+        stopifnot(is.list(subsets))
+        stopifnot(!is.null(names(subsets)))
+        resls_subsets <- lapply(names(subsets), function(setName){
+            geneList_sub <- subsets[[setName]]
+            res_sub <- FRASER.results(object=object, sampleIDs=sampleIDs, 
+                fdrCutoff=padjCutoff, dPsiCutoff=deltaPsiCutoff, 
+                rhoCutoff=rhoCutoff, minCount=minCount, 
+                psiType=psiType,
+                aggregate=aggregate, collapse=collapse, geneColumn=geneColumn,
+                geneSubset=geneList_sub, subsetName=setName, 
+                fullSubset=fullSubset, 
+                additionalColumns=additionalColumns, BPPARAM=BPPARAM)
+        })
+        if(isTRUE(returnTranscriptomewideResults)){
+            res <- unlist(GRangesList(unlist(list(res, resls_subsets))))
+        } else{
+            res <- unlist(GRangesList(unlist(resls_subsets)))
+        }
+        
+        # sort it if existing
+        if(length(res) > 0){
+            res <- res[order(res$pValue)]
+            if(isTRUE(aggregate)){
+                res <- res[!is.na(res$pValueGene)]
+            }
+        }
+    }
+    return(res)
 })
 
 aberrant.FRASER <- function(object, type=fitMetrics(object), 
                                 padjCutoff=0.05, deltaPsiCutoff=0.1, 
                                 minCount=5, rhoCutoff=NA,
                                 by=c("none", "sample", "feature"), 
-                                aggregate=FALSE, geneColumn="hgnc_symbol", ...){
+                                aggregate=FALSE, geneColumn="hgnc_symbol", 
+                                geneSubset=NULL, subsetName=NULL, 
+                                fullSubset=FALSE, ...){
     
     checkNaAndRange(padjCutoff,     min=0, max=1,   scalar=TRUE,   na.ok=TRUE)
     checkNaAndRange(deltaPsiCutoff, min=0, max=1,   scalar=TRUE,   na.ok=TRUE)
@@ -1004,24 +1161,61 @@ aberrant.FRASER <- function(object, type=fitMetrics(object),
         
     }
     
-    if(is.na(padjCutoff)){
+    if(is.na(padjCutoff) || (isTRUE(fullSubset) && 
+            (!is.null(geneSubset) || ("FDR_subset" %in% names(dots))))){
         padjCutoff <- 1
     }
     
-    aberrantEvents <- as.matrix(padj) <= padjCutoff
+    if(is.null(geneSubset) && !("FDR_subset" %in% names(dots))){
+        aberrantEvents <- as.matrix(padj) <= padjCutoff
+    } else{
+        if("FDR_subset" %in% names(dots)){
+            fdr_subset <- dots[["FDR_subset"]]
+        } else{
+            if(is.null(subsetName)){
+                subsetName <- "subset"
+            }
+            object <- calculatePadjValuesOnSubset(fds=object, type=type, 
+                                                  genesToTest=geneSubset, 
+                                                  subsetName=subsetName, 
+                                                  geneColumn=geneColumn)
+            fdr_subset <- metadata(object)[[paste0("FDR_", subsetName)]]
+        }
+        
+        
+        # define aberrant status based on whether intron/sample tuples are 
+        # part of the given subset
+        aberrantEvents <- matrix(FALSE, nrow=nrow(padj), ncol=ncol(padj))
+        colnames(aberrantEvents) <- colnames(padj)
+        FDR_col <- ifelse(isTRUE(aggregate), "FDR_subset_gene", "FDR_subset")
+        subset_idx <- lapply(fdr_subset[, unique(sampleID)], function(sid){
+            col <- which(colnames(object) == sid)
+            rows <- fdr_subset[sampleID == sid & get(FDR_col) <= padjCutoff, 
+                               sort(unique(jidx))]
+            sub_idx <- matrix(c(rows, rep(col, length(rows))), 
+                              nrow=length(rows))
+            return(sub_idx)
+        })
+        subset_idx <- do.call(rbind, 
+                            subset_idx[which(sapply(subset_idx, nrow) > 0)])
+        aberrantEvents[subset_idx] <- TRUE
+    }
     
     # check each cutoff if in use (not NA)
-    if(!is.na(minCount)){
-        aberrantEvents <- aberrantEvents & as.matrix(n >= minCount)
-    }
-    if(!is.na(deltaPsiCutoff)){
-        aberrantEvents <- aberrantEvents & 
-            as.matrix(abs(dpsi) >= deltaPsiCutoff)
-    }
-    if(!is.na(rhoCutoff)){
-        aberrantEvents <- aberrantEvents & as.matrix(rho <= rhoCutoff)
+    if((is.null(geneSubset) && !("FDR_subset" %in% names(dots))) || 
+            isFALSE(fullSubset)){
+        if(!is.na(minCount)){
+            aberrantEvents <- aberrantEvents & as.matrix(n >= minCount)
+        }
+        if(!is.na(deltaPsiCutoff)){
+            aberrantEvents <- aberrantEvents & 
+                as.matrix(abs(dpsi) >= deltaPsiCutoff)
+        }
+        if(!is.na(rhoCutoff)){
+            aberrantEvents <- aberrantEvents & as.matrix(rho <= rhoCutoff)
+        }
+        aberrantEvents[is.na(aberrantEvents)] <- FALSE
     }
-    aberrantEvents[is.na(aberrantEvents)] <- FALSE
         
     if(isTRUE(aggregate)){
         if(is.null(rownames(padj_gene))){
@@ -1040,9 +1234,11 @@ aberrant.FRASER <- function(object, type=fitMetrics(object),
         aberrantEvents <- as.matrix(ab_dt[,-1])
         rownames(aberrantEvents) <- ab_dt[,geneID]
         
-        aberrantEvents <- aberrantEvents & as.matrix(
-                padj_gene[rownames(aberrantEvents),colnames(aberrantEvents)]
-            ) <= padjCutoff
+        if(is.null(geneSubset) && !("FDR_subset" %in% names(dots))){
+            aberrantEvents <- aberrantEvents & as.matrix(
+                    padj_gene[rownames(aberrantEvents),colnames(aberrantEvents)]
+                ) <= padjCutoff
+        }
     }
     
     return(switch(match.arg(by),
@@ -1056,4 +1252,3 @@ aberrant.FRASER <- function(object, type=fitMetrics(object),
 #' @export
 setMethod("aberrant", "FraserDataSet", aberrant.FRASER)
 
-
diff --git a/R/FRASER-package.R b/R/FRASER-package.R
index 0426bb26..85569e89 100644
--- a/R/FRASER-package.R
+++ b/R/FRASER-package.R
@@ -49,7 +49,7 @@
 #' @importFrom plotly plot_ly subplot layout add_trace ggplotly
 #' @importFrom pheatmap pheatmap
 #' @importFrom RColorBrewer brewer.pal
-#' @importFrom cowplot theme_cowplot
+#' @importFrom cowplot theme_cowplot background_grid
 #' @importFrom ggrepel geom_text_repel
 #'
 ### Data handling
diff --git a/R/Fraser-pipeline.R b/R/Fraser-pipeline.R
index 20485299..10f22c3a 100644
--- a/R/Fraser-pipeline.R
+++ b/R/Fraser-pipeline.R
@@ -72,6 +72,12 @@
 #' fds <- calculateZscore(fds, type="jaccard")
 #' head(zScores(fds, type="jaccard")) 
 #' 
+#' # To calculate the FDR only on a subset of genes of interest (per sample):
+#' geneList <- list("sample1"=c("TIMMDC1"), "sample2"=c("MCOLN1"))
+#' fds <- calculatePadjValuesOnSubset(fds, genesToTest=geneList, 
+#'          subsetName="setOfInterest", type="jaccard")
+#' metadata(fds)[["FDR_setOfInterest"]]
+#' 
 #' @seealso \code{\link[FRASER]{fit}}
 #' 
 #' @author Christian Mertes \email{mertes@@in.tum.de}
diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R
index 61a8a2da..7ceb87f9 100644
--- a/R/pvalsNzscore.R
+++ b/R/pvalsNzscore.R
@@ -399,7 +399,7 @@ genePvalsByGeneID <- function(dt, samples, geneIDs, method, BPPARAM){
 #' 
 #' @export
 calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds), 
-                                subsetName="FDR_subset", method="BY", 
+                                subsetName="subset", method="BY", 
                                 geneColumn="hgnc_symbol", BPPARAM=bpparam()){
     
     # check input
@@ -410,10 +410,6 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds),
     if(!all(names(genesToTest) %in% samples(fds))){
         stop("names(genesToTest) need to be sampleIDs in the given fds object.")
     }
-    if(!all(samples(fds) %in% names(genesToTest))){
-        stop("All sampleIDs of the given fds object need to be in ",
-            "names(geneToTest).")
-    }
     
     # check if genes have been annotated
     if(!geneColumn %in% colnames(mcols(fds, type=type))){    
@@ -430,12 +426,29 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds),
         # message(date(), ": FDR subset calculation for sample = ", sample_id)
         # get idx of junctions corresponding to genes with var
         jidx <- unlist(lapply(genes_to_test_sample, function(gene){
-            idx <- which(grepl(gene, mcols(fds, type="j")[, geneColumn]))
+            idx <- which(grepl(gene, mcols(fds, type=type)[, geneColumn]))
             names(idx) <- rep(gene, length(idx))
+            if(length(idx) == 0 && verbose(fds) > 0){
+                warning("No introns found in fds object for gene: ", gene, 
+                        " and sample: ", sample_id, ". Skipping this gene.")
+            }
             return(idx)
         }))
         jidx <- sort(jidx[!duplicated(jidx)])
         
+        # check that jidx is not empty vector
+        if(length(jidx) == 0){
+            warning("No introns found in the fds object for the given gene ", 
+                    "subset for sample: ", sample_id)
+            return(data.table(gene=character(0), 
+                              sampleID=character(0), 
+                              pval=numeric(0),
+                              FDR_subset=numeric(0), 
+                              jidx=integer(0),
+                              pval_gene=numeric(0),
+                              FDR_subset_gene=numeric(0)))
+        }
+        
         # retrieve pvalues of junctions to test
         p <- pVals(fds, type=type, level="junction")[jidx, sample_id]
         
@@ -458,7 +471,7 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds),
     message(date(), ": finished FDR calculation on subset of genes.")
     
     # add FDR subset info to fds object and return
-    metadata(fds)[[subsetName]] <- FDR_subset
+    metadata(fds)[[paste("FDR", subsetName, sep="_")]] <- FDR_subset
     return(fds)
 }
 
diff --git a/man/FRASER.Rd b/man/FRASER.Rd
index 1b2e5f72..13210477 100644
--- a/man/FRASER.Rd
+++ b/man/FRASER.Rd
@@ -44,7 +44,7 @@ calculatePadjValuesOnSubset(
   fds,
   genesToTest,
   type = currentType(fds),
-  subsetName = "FDR_subset",
+  subsetName = "subset",
   method = "BY",
   geneColumn = "hgnc_symbol",
   BPPARAM = bpparam()
@@ -190,6 +190,12 @@ head(padjVals(fds, type="jaccard"))
 fds <- calculateZscore(fds, type="jaccard")
 head(zScores(fds, type="jaccard")) 
 
+# To calculate the FDR only on a subset of genes of interest (per sample):
+geneList <- list("sample1"=c("TIMMDC1"), "sample2"=c("MCOLN1"))
+fds <- calculatePadjValuesOnSubset(fds, genesToTest=geneList, 
+         subsetName="setOfInterest", type="jaccard")
+metadata(fds)[["FDR_setOfInterest"]]
+
 }
 \seealso{
 \code{\link[FRASER]{fit}}
diff --git a/man/results.Rd b/man/results.Rd
index 4da3bf1b..03b67b36 100644
--- a/man/results.Rd
+++ b/man/results.Rd
@@ -16,6 +16,9 @@
   minCount = 5,
   psiType = psiTypes,
   geneColumn = "hgnc_symbol",
+  subsets = NULL,
+  fullSubset = FALSE,
+  returnTranscriptomewideResults = TRUE,
   additionalColumns = NULL,
   BPPARAM = bpparam()
 )
@@ -30,6 +33,9 @@
   by = c("none", "sample", "feature"),
   aggregate = FALSE,
   geneColumn = "hgnc_symbol",
+  geneSubset = NULL,
+  subsetName = NULL,
+  fullSubset = FALSE,
   ...
 )
 }
@@ -63,6 +69,18 @@ result}
 \item{geneColumn}{The column name of the column that has the gene annotation 
 that will be used for gene-level pvalue computation.}
 
+\item{subsets}{A named list of named lists specifying any number of gene 
+subsets (can differ per sample). For each subset, FDR correction
+will be limited to introns in genes in the subset, and aberrant 
+events passing the FDR cutoff will be reported for each subset 
+separately. See the examples for how to use this argument.}
+
+\item{fullSubset}{Only applies when \code{geneSubset} is not NULL. Specifies 
+whether all introns in given subset of genes should be 
+considered as aberrant, or only those passing the given cutoffs. 
+Defaults to FALSE (introns have to pass the cutoffs in addtion 
+to being in the gene subset to be considered aberrant).}
+
 \item{additionalColumns}{Character vector containing the names of additional 
 columns from mcols(fds) that should appear in the result table 
 (e.g. ensembl_gene_id). Default is \code{NULL}, so no additional columns 
@@ -76,6 +94,13 @@ are included.}
 \code{sample} or \code{feature} is specified the sum by 
 sample or feature is returned}
 
+\item{geneSubset}{A named list giving a subset of genes per sample to which
+FDR correction should be restricted. The names of the list must 
+correspond to the sampleIDs in the fds object.}
+
+\item{subsetName}{The name under which the resulting FDR corrected pvalues 
+on the subset only will be displayed in the result table.}
+
 \item{...}{Further arguments can be passed to the method. If "n",
 "padjVals", "dPsi" or "rhoVals" are given, the values of those 
 arguments are used to define the aberrant events.}
@@ -120,4 +145,15 @@ aberrant(fds, type="jaccard", by="feature", padjCutoff=NA, aggregate=TRUE)
         
 # find aberrant junctions/splice sites
 aberrant(fds, type="jaccard")
+
+# retrieve results limiting FDR correction to only a subset of genes
+# first, we need to create a list of genes per sample that will be tested
+geneList <- list('sample1'=c("TIMMDC1"), 'sample2'=c("MCOLN1"))
+results(fds, subsets=list('random_subset'=geneList), fullSubset=TRUE)
+
+# results for several subsets can be retrieved at the same time:
+geneList2 <- list('sample1'=c("MCOLN1", "TIMMDC1"), 'sample2'=c("MCOLN1"))
+results(fds, 
+        subsets=list('random_subset'=geneList, 'another_subset'=geneList2), 
+        fullSubset=TRUE)
 }
diff --git a/tests/testthat/test_stats.R b/tests/testthat/test_stats.R
index dcbe5bea..b6e3aa07 100644
--- a/tests/testthat/test_stats.R
+++ b/tests/testthat/test_stats.R
@@ -95,10 +95,10 @@ test_that("FDR on subset of genes", {
     )
     expected_output_nrows <- (8 + 5 + 3) + (7) + (3+7+5+4)
     
-    subsetName <- "FDR_subset_test"
+    subsetName <- "subset_test"
     fds <- calculatePadjValuesOnSubset(fds, genesToTest=genes_per_sample, 
                                        subsetName=subsetName, type="jaccard")
-    subset_dt <- metadata(fds)[[subsetName]]
+    subset_dt <- metadata(fds)[[paste0("FDR_", subsetName)]]
     expect_true(is(subset_dt, "data.table"))
     expect_true(all(c("FDR_subset", "FDR_subset_gene") %in% colnames(subset_dt)))
     expect_equal(subset_dt[, .N], expected_output_nrows)

From 9d8826c1e053a50cd7ff83e3a5dbed190792fe7d Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Mon, 12 Dec 2022 11:43:22 +0100
Subject: [PATCH 56/80] minor fix for theta FDR on subset

---
 R/AllGenerics.R             | 8 +++++---
 R/pvalsNzscore.R            | 5 +++--
 tests/testthat/test_stats.R | 2 +-
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index f37cb6dd..38e71a44 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -751,7 +751,8 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
                                                   genesToTest=geneSubset, 
                                                   subsetName=subsetName, 
                                                   geneColumn=geneColumn)
-            fdr_subset <- metadata(object)[[paste0("FDR_", subsetName)]]
+            fdr_subset <- metadata(object)[[paste("FDR", subsetName, type, 
+                                                    sep="_")]]
             object <- object[, fdr_subset[, unique(sampleID)]]
         } 
         
@@ -870,7 +871,7 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
                                 FDR_subset)]
                 )
                 # replace padj values with values on subset
-                padj <- matrix(NA, nrow=nrow(tmp_x), ncol=ncol(tmp_x))
+                padj <- matrix(NA, nrow=nrow(pvals), ncol=ncol(pvals))
                 if(nrow(subset_padj) > 0){
                     padj[subset_padj[,1:2]] <- subset_padj[,3]
                 }
@@ -1179,7 +1180,8 @@ aberrant.FRASER <- function(object, type=fitMetrics(object),
                                                   genesToTest=geneSubset, 
                                                   subsetName=subsetName, 
                                                   geneColumn=geneColumn)
-            fdr_subset <- metadata(object)[[paste0("FDR_", subsetName)]]
+            fdr_subset <- metadata(object)[[paste("FDR", subsetName, type, 
+                                                    sep="_")]]
         }
         
         
diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R
index 7ceb87f9..979d97e8 100644
--- a/R/pvalsNzscore.R
+++ b/R/pvalsNzscore.R
@@ -442,6 +442,7 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds),
                     "subset for sample: ", sample_id)
             return(data.table(gene=character(0), 
                               sampleID=character(0), 
+                              type=character(0),
                               pval=numeric(0),
                               FDR_subset=numeric(0), 
                               jidx=integer(0),
@@ -456,7 +457,7 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds),
         pa <- p.adjust(p, method=method)
         
         # gene level pvals
-        dt <- data.table(sampleID=sample_id, pval=p, FDR_subset=pa, 
+        dt <- data.table(sampleID=sample_id, type=type, pval=p, FDR_subset=pa, 
                         gene=names(jidx), jidx=jidx)
         dt[, pval_gene:=min(p.adjust(pval, method="holm")), by="gene"]
         
@@ -471,7 +472,7 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds),
     message(date(), ": finished FDR calculation on subset of genes.")
     
     # add FDR subset info to fds object and return
-    metadata(fds)[[paste("FDR", subsetName, sep="_")]] <- FDR_subset
+    metadata(fds)[[paste("FDR", subsetName, type, sep="_")]] <- FDR_subset
     return(fds)
 }
 
diff --git a/tests/testthat/test_stats.R b/tests/testthat/test_stats.R
index b6e3aa07..79f737d1 100644
--- a/tests/testthat/test_stats.R
+++ b/tests/testthat/test_stats.R
@@ -98,7 +98,7 @@ test_that("FDR on subset of genes", {
     subsetName <- "subset_test"
     fds <- calculatePadjValuesOnSubset(fds, genesToTest=genes_per_sample, 
                                        subsetName=subsetName, type="jaccard")
-    subset_dt <- metadata(fds)[[paste0("FDR_", subsetName)]]
+    subset_dt <- metadata(fds)[[paste("FDR", subsetName, "jaccard", sep="_")]]
     expect_true(is(subset_dt, "data.table"))
     expect_true(all(c("FDR_subset", "FDR_subset_gene") %in% colnames(subset_dt)))
     expect_equal(subset_dt[, .N], expected_output_nrows)

From f987da9144e5f0a0c0686f425649194ebc90e92e Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Mon, 12 Dec 2022 15:42:07 +0100
Subject: [PATCH 57/80] minor fix to get correct rowMeans in result table when
 using FDR on subset

---
 R/AllGenerics.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index 38e71a44..5c4cff54 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -745,6 +745,10 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
         currentType(object) <- type
         gr <- rowRanges(object, type=type)
         
+        # first get row means
+        rowMeansK <- rowMeans(K(object, type=type))
+        rowMeansN <- rowMeans(N(object, type=type))
+        
         # calculate FDR on subset first if requested
         if(!is.null(geneSubset)){
             object <- calculatePadjValuesOnSubset(fds=object, type=type, 
@@ -756,10 +760,6 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
             object <- object[, fdr_subset[, unique(sampleID)]]
         } 
         
-        # first get row means
-        rowMeansK <- rowMeans(K(object, type=type))
-        rowMeansN <- rowMeans(N(object, type=type))
-        
         # then iterate by chunk
         chunkCols <- getMaxChunks2Read(fds=object, assayName=type, max=maxCols)
         sampleChunks <- getSamplesByChunk(fds=object, sampleIDs=sampleIDs,

From a5ef78107ed2424eec773dd2a325ec35af376e58 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Thu, 15 Dec 2022 14:15:35 +0100
Subject: [PATCH 58/80] renamed annotated column to potentialImpact

---
 DESCRIPTION                                   |   2 +-
 NAMESPACE                                     |   2 +-
 R/AllGenerics.R                               |   4 +-
 R/resultAnnotations.R                         | 116 +++++++++---------
 man/FRASER.Rd                                 |  10 +-
 man/countRNA.Rd                               |  16 +--
 man/fds-methods.Rd                            |  20 +--
 man/filtering.Rd                              |   8 +-
 man/getter_setter_functions.Rd                |  36 +++---
 ...tions.Rd => potentialImpactAnnotations.Rd} |  44 +++----
 10 files changed, 123 insertions(+), 135 deletions(-)
 rename man/{spliceTypeAnnotations.Rd => potentialImpactAnnotations.Rd} (76%)

diff --git a/DESCRIPTION b/DESCRIPTION
index 7c8e24ab..2819f170 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -29,7 +29,7 @@ biocViews:
 License: MIT + file LICENSE
 URL: https://github.com/gagneurlab/FRASER
 BugRepots: https://github.com/gagneurlab/FRASER/issues
-RoxygenNote: 7.1.2
+RoxygenNote: 7.2.2
 Encoding: UTF-8
 VignetteBuilder: knitr
 Depends:
diff --git a/NAMESPACE b/NAMESPACE
index 861b7c2c..b30b1fb5 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -23,9 +23,9 @@ export(N)
 export(aberrant)
 export(addCountsToFraserDataSet)
 export(annotateIntronReferenceOverlap)
+export(annotatePotentialImpact)
 export(annotateRanges)
 export(annotateRangesWithTxDb)
-export(annotateSpliceEventType)
 export(bamFile)
 export(bestQ)
 export(calculatePSIValues)
diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index 6bebe86a..7a07b24b 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -12,8 +12,8 @@ asFDS <- function(x){
 #'
 #' @title Getter/Setter methods for the FraserDataSet
 #'
-#' The following methods are getter and setter methods to extract or set
-#' certain values of a FraserDataSet object. 
+#' @description The following methods are getter and setter methods to extract 
+#' or set certain values of a FraserDataSet object. 
 #' 
 #' \code{samples} sets or gets the sample IDs; \code{condition} ;
 #' \code{}
diff --git a/R/resultAnnotations.R b/R/resultAnnotations.R
index 52f1c70b..d64ed4ee 100644
--- a/R/resultAnnotations.R
+++ b/R/resultAnnotations.R
@@ -2,23 +2,23 @@
 #' @title Additional result annotations
 #' 
 #' @description These functions work on the result table and add additional 
-#'     annotations to the reported introns: the type of splice event (e.g. 
-#'     exon skipping, exon truncation, ...), expected occurence of frameshift, 
-#'     overlap with UTR regions as well as a flag for introns that are 
-#'     located in blacklist regions of the genome.
+#'     annotations to the reported introns: the type of potential impact on 
+#'     splicing (e.g. exon skipping, exon truncation, ...), potential occurence 
+#'     of frameshift, overlap with UTR regions as well as a flag for introns 
+#'     that are located in blacklist regions of the genome.
 #'     
 #' \code{\link{annotateIntronReferenceOverlap}} adds basic annotations to the 
 #'     fds for each intron based on the overlap of the intron's location with 
 #'     the reference annotation. Has to be run before the result table is 
 #'     created so that the new column can be included in it (see examples).
 #' 
-#' \code{\link{annotateSpliceEventType}} annotates each intron in the results 
-#'     table with the type of splice event and expected occurence of frameshift 
-#'     (likely, unlikely, inconclusive). Can also calculate overlap with 
-#'     annotated UTR regions. Splice types can be one of: 
+#' \code{\link{annotatePotentialImpact}} annotates each intron in the results 
+#'     table with the type of potential impact on splicing and potential 
+#'     occurence of frameshift (likely, unlikely, inconclusive). Can also 
+#'     calculate overlap with annotated UTR regions. Potential impact can be: 
 #'     annotatedIntron_increasedUsage, annotatedIntron_reducedUsage, 
 #'     exonTruncation, exonElongation, exonTruncation&Elongation, 
-#'     singleExonSkipping, exonSkipping, splicingBeyondGene, 
+#'     exonSkipping, splicingBeyondGene, 
 #'     multigenicSplicing, downstreamOfNearestGene, upstreamOfNearestGene, 
 #'     complex (everything else).     
 #'     Splice sites (theta metric) annotations indicate how the splice site is 
@@ -37,8 +37,8 @@
 #' @param result A result table as generated by FRASER, including the column 
 #'     \code{annotatedJunction} as generated by the function
 #'     \code{annotateIntronReferenceOverlap}.
-#' @param addSpliceType Logical, indicating if the type of the splice event 
-#'     should be added to the results table. Defaults to \code{TRUE}.
+#' @param addPotentialImpact Logical, indicating if the type of the potential 
+#'     impact should be added to the results table. Defaults to \code{TRUE}.
 #' @param addUTRoverlap Logical, indicating if the overlap with UTR regions
 #'     should checked and added to the results table. Defaults to \code{TRUE}.
 #' @param minoverlap Integer value defining the number of base pairs around the
@@ -54,8 +54,8 @@
 #'     \code{bpparam()}.
 #' @return An annotated FraserDataSet or results table, respectively
 #' 
-#' @name spliceTypeAnnotations
-#' @rdname spliceTypeAnnotations
+#' @name potentialImpactAnnotations
+#' @rdname potentialImpactAnnotations
 #' 
 #' @examples
 #'   # get data, fit and compute p-values and z-scores
@@ -73,8 +73,8 @@
 #'   # of 1 is used to get at least one result. 
 #'   res <- results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=NA)
 #'   
-#'   # annotate the type of splice event and UTR overlap
-#'   res <- annotateSpliceEventType(result=res, txdb=txdb, fds=fds)
+#'   # annotate the type of potential impact on splicing and UTR overlap
+#'   res <- annotatePotentialImpact(result=res, txdb=txdb, fds=fds)
 #'   
 #'   # annotate overlap with blacklist regions
 #'   res <- flagBlacklistRegions(result=res, assemblyVersion="hg19")
@@ -84,7 +84,7 @@
 #'   
 NULL
 
-#' @describeIn spliceTypeAnnotations This method calculates basic annotations 
+#' @describeIn potentialImpactAnnotations This method calculates basic annotations 
 #'     based on overlap with the reference annotation (start, end, none, both) 
 #'     for the full fds. The overlap type is added as a new column 
 #'     \code{annotatedJunction} in \code{mcols(fds)}. 
@@ -147,10 +147,10 @@ annotateIntronReferenceOverlap <- function(fds, txdb, BPPARAM=bpparam()){
     return(fds)
 }
 
-#' @describeIn spliceTypeAnnotations This method annotates the splice event 
+#' @describeIn potentialImpactAnnotations This method annotates the splice event 
 #'     type to junctions in the given results table.
 #' @export
-annotateSpliceEventType <- function(result, txdb, fds, addSpliceType=TRUE, 
+annotatePotentialImpact <- function(result, txdb, fds, addPotentialImpact=TRUE, 
                                     addUTRoverlap=TRUE, minoverlap=5, 
                                     BPPARAM=bpparam()){
     
@@ -170,8 +170,10 @@ annotateSpliceEventType <- function(result, txdb, fds, addSpliceType=TRUE,
     }
     
     # Calculate splice types and frameshift
-    if(isTRUE(addSpliceType)){
-        annoResult <- addSpliceTypeLabels(annoResult, fds, txdb)
+    if(isTRUE(addPotentialImpact)){
+        annoResult <- addPotentialImpactLabels(annoResult, fds, txdb)
+        annoResult[potentialImpact == "singleExonSkipping", 
+                    potentialImpact := "exonSkipping"]
     }
     
     # Add UTR labels
@@ -187,7 +189,7 @@ annotateSpliceEventType <- function(result, txdb, fds, addSpliceType=TRUE,
     return(annoResult)
 }
 
-#' @describeIn spliceTypeAnnotations This method flags all introns and 
+#' @describeIn potentialImpactAnnotations This method flags all introns and 
 #'     splice sites in the given results table for which at least one splice 
 #'     site (donor or acceptor) is located in a blacklist region. Blacklist 
 #'     regions of the genome are determined from the provided BED file.  
@@ -300,7 +302,7 @@ addUTRLabels <- function(junctions_dt, txdb, minoverlap=5){
 
 #' adds type of splicing to each intron in the results table
 #' @noRd
-addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
+addPotentialImpactLabels <- function(junctions_dt, fds, txdb){
     message("preparing ...")
     psi_positions <- which(junctions_dt$type != "theta")
     colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
@@ -326,13 +328,13 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
                                                 keep.extra.columns = TRUE)
     
     # prepare the results column
-    junctions_dt[, spliceType := "complex"]
-    junctions_dt[, causesFrameshift := "inconclusive"]
+    junctions_dt[, potentialImpact := "complex"]
+    junctions_dt[, potentialFrameshift := "inconclusive"]
     junctions_dt[annotatedJunction == "both" & deltaPsi >= 0, 
-                    spliceType := "annotatedIntron_increasedUsage"]
+                    potentialImpact := "annotatedIntron_increasedUsage"]
     junctions_dt[annotatedJunction == "both" & deltaPsi < 0, 
-                    spliceType := "annotatedIntron_reducedUsage"]
-    junctions_dt[annotatedJunction == "both", causesFrameshift := "unlikely"]
+                    potentialImpact := "annotatedIntron_reducedUsage"]
+    junctions_dt[annotatedJunction == "both", potentialFrameshift := "unlikely"]
     
     # TODO check for intron retention
     
@@ -354,9 +356,9 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
                             intron_ranges, exons))
     })
     junctions_dt[psi_positions[starts], 
-                    causesFrameshift:=start_results[2,]]
+                    potentialFrameshift:=start_results[2,]]
     junctions_dt[psi_positions[starts], 
-                    spliceType := start_results[1,]]
+                    potentialImpact := start_results[1,]]
     
     # end junctions
     end_results <- sapply(ends, function(i){
@@ -371,8 +373,8 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
                                 intron_ranges, exons))
         
     })
-    junctions_dt[psi_positions[ends], causesFrameshift:=end_results[2,]]
-    junctions_dt[psi_positions[ends], spliceType := end_results[1,]]
+    junctions_dt[psi_positions[ends], potentialFrameshift:=end_results[2,]]
+    junctions_dt[psi_positions[ends], potentialImpact := end_results[1,]]
     
     # none junctions pt1
     none_results <- sapply(nones, function(i){
@@ -423,10 +425,10 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
         return(c(combined,frs))
         
     })
-    junctions_dt[psi_positions[nones], causesFrameshift:=none_results[2,]]
-    junctions_dt[psi_positions[nones], spliceType := none_results[1,]]
+    junctions_dt[psi_positions[nones], potentialFrameshift:=none_results[2,]]
+    junctions_dt[psi_positions[nones], potentialImpact := none_results[1,]]
     
-    noLaps <-which(junctions_dt[psi_positions]$spliceType=="noOverlap")
+    noLaps <-which(junctions_dt[psi_positions]$potentialImpact=="noOverlap")
     refseq.genes<- genes(txdb)
     
     # none junctions pt2
@@ -454,9 +456,9 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
         return(c("complex","inconclusive"))
     })
     junctions_dt[psi_positions[noLaps], 
-                    causesFrameshift:=noLaps_results[2,]]
+                    potentialFrameshift:=noLaps_results[2,]]
     junctions_dt[psi_positions[noLaps], 
-                    spliceType := noLaps_results[1,]]
+                    potentialImpact := noLaps_results[1,]]
     
     # theta annotations
     thetas <- which(junctions_dt$type == "theta")
@@ -464,12 +466,12 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
                                             keep.extra.columns = TRUE)
     
     # specify default type for theta results as NA
-    junctions_dt[thetas, spliceType := NA]
-    junctions_dt[thetas, causesFrameshift := NA]
+    junctions_dt[thetas, potentialImpact := NA]
+    junctions_dt[thetas, potentialFrameshift := NA]
     
     # label all as intronic first if they have any intron overlap
     intronic <- unique(from(findOverlaps(junctions_gr, introns_tmp)))
-    junctions_dt[thetas[intronic], spliceType := "intronicRegion"]
+    junctions_dt[thetas[intronic], potentialImpact := "intronicRegion"]
     
     # for exonic, check if theta is fully contained in an exon
     # if one end is in an intron and the other in an exon it is a splice site
@@ -482,10 +484,10 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
         if(length(a) == length(w)) return("exonicRegion")
         return("annotatedSpliceSite")
     })
-    junctions_dt[thetas[exonic], spliceType := exonic_results]
+    junctions_dt[thetas[exonic], potentialImpact := exonic_results]
     
     # check cases that don't overlap with an exon/intron
-    nones <- which(is.na(junctions_dt[thetas,]$spliceType))
+    nones <- which(is.na(junctions_dt[thetas,]$potentialImpact))
     none_results <- sapply(nones, function(i){
         if(length(findOverlaps(junctions_gr[i], refseq.genes)) > 0) return(NA)
         #return("intergenic")
@@ -500,13 +502,13 @@ addSpliceTypeLabels <- function(junctions_dt, fds, txdb){
                     return("upstreamOfNearestGene"))
         }
     })
-    junctions_dt[thetas[nones], spliceType := none_results]
+    junctions_dt[thetas[nones], potentialImpact := none_results]
     
     # add distance to closest neighbour gene for intergenic results 
     # (both psi and theta)
     message("adding distances to nearest gene ...")
-    up <- which(junctions_dt$spliceType == "upstreamOfNearestGene")
-    down <- which(junctions_dt$spliceType == "downstreamOfNearestGene")
+    up <- which(junctions_dt$potentialImpact == "upstreamOfNearestGene")
+    down <- which(junctions_dt$potentialImpact == "downstreamOfNearestGene")
     
     # create full grange object containing psi and theta
     junctions_gr <- makeGRangesFromDataFrame(junctions_dt, 
@@ -829,7 +831,7 @@ checkExonSkipping <- function(junctions_dt, txdb){
     
     refseq.genes<- genes(txdb)
     
-    exonSkip <- which(junctions_dt[psi_positions]$spliceType %in% 
+    exonSkip <- which(junctions_dt[psi_positions]$potentialImpact %in% 
                             c("exonSkipping", "singleExonSkipping"))
     
     message("start checking exonSkipping")
@@ -878,16 +880,16 @@ checkExonSkipping <- function(junctions_dt, txdb){
     # checking exonSkipping done
     if(length(exonSkip) > 0){
         junctions_dt[psi_positions[exonSkip], 
-                        spliceType2 := newSkip_results]
-        junctions_dt[spliceType2 == "splicingBeyondGene", 
-                        spliceType := "splicingBeyondGene"]
-        junctions_dt[spliceType2 == "splicingBeyondGene", 
-                        causesFrameshift := "inconclusive"]
-        junctions_dt[spliceType2 == "multigenicSplicing", 
-                        spliceType := "multigenicSplicing"]
-        junctions_dt[spliceType2 == "multigenicSplicing", 
-                        causesFrameshift := "inconclusive"]
-        junctions_dt[, spliceType2 := NULL]
+                        potentialImpact2 := newSkip_results]
+        junctions_dt[potentialImpact2 == "splicingBeyondGene", 
+                        potentialImpact := "splicingBeyondGene"]
+        junctions_dt[potentialImpact2 == "splicingBeyondGene", 
+                        potentialFrameshift := "inconclusive"]
+        junctions_dt[potentialImpact2 == "multigenicSplicing", 
+                        potentialImpact := "multigenicSplicing"]
+        junctions_dt[potentialImpact2 == "multigenicSplicing", 
+                        potentialFrameshift := "inconclusive"]
+        junctions_dt[, potentialImpact2 := NULL]
     }
     
     colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
@@ -906,7 +908,7 @@ checkInconclusive <- function(junctions_dt, txdb){
     refseq.genes<- genes(txdb)
     
     inconclusive <- which(junctions_dt[psi_positions
-                                        ]$spliceType == "complex")
+                                        ]$potentialImpact == "complex")
     
     inconclusive_results <- sapply(inconclusive, function(i){
         start = start(junctions_gr[i]) 
@@ -954,7 +956,7 @@ checkInconclusive <- function(junctions_dt, txdb){
 
     if(length(inconclusive) > 0){
         junctions_dt[psi_positions[inconclusive], 
-                        spliceType := inconclusive_results]
+                        potentialImpact := inconclusive_results]
     } 
     
     return(junctions_dt)
diff --git a/man/FRASER.Rd b/man/FRASER.Rd
index 22f3fff5..8ae94611 100644
--- a/man/FRASER.Rd
+++ b/man/FRASER.Rd
@@ -94,23 +94,23 @@ raw counts ("BB").
 }
 \section{Functions}{
 \itemize{
-\item \code{FRASER}: This function runs the default FRASER pipeline combining 
+\item \code{FRASER()}: This function runs the default FRASER pipeline combining 
 the beta-binomial fit, the computation of Z scores and p values as well as 
 the computation of delta-PSI values.
 
-\item \code{calculateZscore}: This function calculates z-scores based on the 
+\item \code{calculateZscore()}: This function calculates z-scores based on the 
 observed and expected logit 
 psi.
 
-\item \code{calculatePvalues}: This function calculates two-sided p-values based on 
+\item \code{calculatePvalues()}: This function calculates two-sided p-values based on 
 the beta-binomial distribution (or binomial or normal if desired). The 
 returned p values are already adjusted with Holm's method per donor or 
 acceptor site, respectively.
 
-\item \code{calculatePadjValues}: This function adjusts the previously calculated 
+\item \code{calculatePadjValues()}: This function adjusts the previously calculated 
 p-values per sample for multiple testing.
-}}
 
+}}
 \examples{
 # set default parallel backend
 register(SerialParam())
diff --git a/man/countRNA.Rd b/man/countRNA.Rd
index 7940b91b..440231fe 100644
--- a/man/countRNA.Rd
+++ b/man/countRNA.Rd
@@ -245,32 +245,32 @@ a sample are set to zero.
 }
 \section{Functions}{
 \itemize{
-\item \code{countRNAData}: This method extracts and counts the split reads and
+\item \code{countRNAData()}: This method extracts and counts the split reads and
 non spliced reads from RNA bam files.
 
-\item \code{getSplitReadCountsForAllSamples}: This method creates a GRanges 
+\item \code{getSplitReadCountsForAllSamples()}: This method creates a GRanges 
 object containing the split read counts from all 
 specified samples.
 
-\item \code{getNonSplitReadCountsForAllSamples}: This method creates a GRanges 
+\item \code{getNonSplitReadCountsForAllSamples()}: This method creates a GRanges 
 object containing the non split read counts at the 
 exon-intron boundaries inferred from the GRanges object 
 containing the positions of all the introns in this dataset.
 
-\item \code{addCountsToFraserDataSet}: This method adds the split read and 
+\item \code{addCountsToFraserDataSet()}: This method adds the split read and 
 non split read counts to a existing FraserDataSet 
 containing the settings.
 
-\item \code{countSplitReads}: This method counts all split reads in a 
+\item \code{countSplitReads()}: This method counts all split reads in a 
 bam file for a single sample.
 
-\item \code{mergeCounts}: This method merges counts for multiple 
+\item \code{mergeCounts()}: This method merges counts for multiple 
 samples into one SummarizedExperiment object.
 
-\item \code{countNonSplicedReads}: This method counts non spliced reads based 
+\item \code{countNonSplicedReads()}: This method counts non spliced reads based 
 on the given target (acceptor/donor) regions for a single sample.
-}}
 
+}}
 \examples{
    # On Windows SNOW is the default for the parallele backend, which can be 
    # very slow for many but small tasks. Therefore, we will use 
diff --git a/man/fds-methods.Rd b/man/fds-methods.Rd
index 3e11e4e4..d6caf3fd 100644
--- a/man/fds-methods.Rd
+++ b/man/fds-methods.Rd
@@ -41,17 +41,7 @@
 \alias{FRASER.mcols.get}
 \alias{FRASER.rowRanges.get}
 \alias{mapSeqlevels}
-\title{Getter/Setter methods for the FraserDataSet
-
-The following methods are getter and setter methods to extract or set
-certain values of a FraserDataSet object. 
-
-\code{samples} sets or gets the sample IDs; \code{condition} ;
-\code{}
-\code{nonSplicedReads} return a RangedSummarizedExperiment object 
-containing the counts for the non spliced reads overlapping splice 
-sites in the fds.
-\code{}}
+\title{Getter/Setter methods for the FraserDataSet}
 \usage{
 samples(object)
 
@@ -151,10 +141,8 @@ passed to GenomeInfoDb::mapSeqlevels().}
 Getter method return the respective current value.
 }
 \description{
-Getter/Setter methods for the FraserDataSet
-
-The following methods are getter and setter methods to extract or set
-certain values of a FraserDataSet object. 
+The following methods are getter and setter methods to extract 
+or set certain values of a FraserDataSet object. 
 
 \code{samples} sets or gets the sample IDs; \code{condition} ;
 \code{}
@@ -162,8 +150,6 @@ certain values of a FraserDataSet object.
 containing the counts for the non spliced reads overlapping splice 
 sites in the fds.
 \code{}
-
-Mapping of chromosome names
 }
 \examples{
 fds <- createTestFraserDataSet()
diff --git a/man/filtering.Rd b/man/filtering.Rd
index b40ad786..361dde7e 100644
--- a/man/filtering.Rd
+++ b/man/filtering.Rd
@@ -72,16 +72,16 @@ reliably detected and to remove introns with no variablity between samples.
 }
 \section{Functions}{
 \itemize{
-\item \code{filterExpressionAndVariability}: This functions filters out both introns with low 
+\item \code{filterExpressionAndVariability()}: This functions filters out both introns with low 
 read support and introns that are not variable across samples.
 
-\item \code{filterExpression,FraserDataSet-method}: This function filters out introns and corresponding 
+\item \code{filterExpression(FraserDataSet)}: This function filters out introns and corresponding 
 splice sites that have low read support in all samples.
 
-\item \code{filterVariability}: This function filters out introns and corresponding 
+\item \code{filterVariability()}: This function filters out introns and corresponding 
 splice sites which do not show variablity across samples.
-}}
 
+}}
 \examples{
 fds <- createTestFraserDataSet()
 fds <- filterExpressionAndVariability(fds, minDeltaPsi=0.1, filter=FALSE)
diff --git a/man/getter_setter_functions.Rd b/man/getter_setter_functions.Rd
index bf56e32c..4dad7b0d 100644
--- a/man/getter_setter_functions.Rd
+++ b/man/getter_setter_functions.Rd
@@ -91,63 +91,63 @@ the values within the FRASER model.
 }
 \section{Functions}{
 \itemize{
-\item \code{featureExclusionMask}: Retrieves a logical vector indicating
+\item \code{featureExclusionMask()}: Retrieves a logical vector indicating
 for each junction whether it is included or excluded during the fitting
 procedure.
 
-\item \code{featureExclusionMask<-}: To remove certain junctions from 
+\item \code{featureExclusionMask(fds, type = currentType(fds)) <- value}: To remove certain junctions from 
 being used in the train step of the encoding dimension we can set the 
 \code{featureExclusion} vector to \code{FALSE}. This can be helpfull 
 if we have local linkage between features which we do not want to 
 model by the autoencoder.
 
-\item \code{rho}: Returns the fitted rho values for the 
+\item \code{rho()}: Returns the fitted rho values for the 
 beta-binomial distribution
 
-\item \code{zScores}: This returns the calculated z-scores.
+\item \code{zScores()}: This returns the calculated z-scores.
 
-\item \code{pVals}: This returns the calculated p-values.
+\item \code{pVals()}: This returns the calculated p-values.
 
-\item \code{padjVals}: This returns the adjusted p-values.
+\item \code{padjVals()}: This returns the adjusted p-values.
 
-\item \code{predictedMeans}: This returns the fitted mu (i.e. psi) 
+\item \code{predictedMeans()}: This returns the fitted mu (i.e. psi) 
 values.
 
-\item \code{deltaPsiValue}: Returns the difference between the 
+\item \code{deltaPsiValue()}: Returns the difference between the 
 observed and the fitted psi values.
 
-\item \code{currentType}: Returns the psi type that is used 
+\item \code{currentType()}: Returns the psi type that is used 
 within several methods in the FRASER package.
 
-\item \code{currentType<-}: Sets the psi type that is to be used 
+\item \code{currentType(fds) <- value}: Sets the psi type that is to be used 
 within several methods in the FRASER package.
 
-\item \code{pseudocount}: Sets and returns the pseudo count used 
+\item \code{pseudocount()}: Sets and returns the pseudo count used 
 within the FRASER fitting procedure.
 
-\item \code{hyperParams}: This returns the results of the 
+\item \code{hyperParams()}: This returns the results of the 
 hyperparameter optimization NULL if the hyperparameter 
 opimization was not run yet.
 
-\item \code{bestQ}: This returns the optimal size of the 
+\item \code{bestQ()}: This returns the optimal size of the 
 latent space according to the hyperparameter optimization or a simple 
 estimate of about a tenth of the number of samples if the hyperparameter 
 opimization was not run yet.
 
-\item \code{dontWriteHDF5}: Gets the current value of whether the 
+\item \code{dontWriteHDF5()}: Gets the current value of whether the 
 assays should be stored as hdf5 files.
 
-\item \code{dontWriteHDF5<-}: Sets whether the assays should be stored 
+\item \code{dontWriteHDF5(fds) <- value}: Sets whether the assays should be stored 
 as hdf5 files.
 
-\item \code{verbose}: Dependend on the level of verbosity 
+\item \code{verbose()}: Dependend on the level of verbosity 
 the algorithm reports more or less to the user. 0 means being quiet 
 and 10 means everything.
 
-\item \code{verbose<-}: Sets the verbosity level to a value 
+\item \code{verbose(fds) <- value}: Sets the verbosity level to a value 
 between 0 and 10. 0 means being quiet and 10 means reporting everything.
-}}
 
+}}
 \examples{
 fds <- createTestFraserDataSet()
 
diff --git a/man/spliceTypeAnnotations.Rd b/man/potentialImpactAnnotations.Rd
similarity index 76%
rename from man/spliceTypeAnnotations.Rd
rename to man/potentialImpactAnnotations.Rd
index b28e3c33..6ffec6a7 100644
--- a/man/spliceTypeAnnotations.Rd
+++ b/man/potentialImpactAnnotations.Rd
@@ -1,19 +1,19 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/resultAnnotations.R
-\name{spliceTypeAnnotations}
-\alias{spliceTypeAnnotations}
+\name{potentialImpactAnnotations}
+\alias{potentialImpactAnnotations}
 \alias{annotateIntronReferenceOverlap}
-\alias{annotateSpliceEventType}
+\alias{annotatePotentialImpact}
 \alias{flagBlacklistRegions}
 \title{Additional result annotations}
 \usage{
 annotateIntronReferenceOverlap(fds, txdb, BPPARAM = bpparam())
 
-annotateSpliceEventType(
+annotatePotentialImpact(
   result,
   txdb,
   fds,
-  addSpliceType = TRUE,
+  addPotentialImpact = TRUE,
   addUTRoverlap = TRUE,
   minoverlap = 5,
   BPPARAM = bpparam()
@@ -38,8 +38,8 @@ flagBlacklistRegions(
 \code{annotatedJunction} as generated by the function
 \code{annotateIntronReferenceOverlap}.}
 
-\item{addSpliceType}{Logical, indicating if the type of the splice event 
-should be added to the results table. Defaults to \code{TRUE}.}
+\item{addPotentialImpact}{Logical, indicating if the type of the potential 
+impact should be added to the results table. Defaults to \code{TRUE}.}
 
 \item{addUTRoverlap}{Logical, indicating if the overlap with UTR regions
 should checked and added to the results table. Defaults to \code{TRUE}.}
@@ -61,23 +61,23 @@ An annotated FraserDataSet or results table, respectively
 }
 \description{
 These functions work on the result table and add additional 
-    annotations to the reported introns: the type of splice event (e.g. 
-    exon skipping, exon truncation, ...), expected occurence of frameshift, 
-    overlap with UTR regions as well as a flag for introns that are 
-    located in blacklist regions of the genome.
+    annotations to the reported introns: the type of potential impact on 
+    splicing (e.g. exon skipping, exon truncation, ...), potential occurence 
+    of frameshift, overlap with UTR regions as well as a flag for introns 
+    that are located in blacklist regions of the genome.
     
 \code{\link{annotateIntronReferenceOverlap}} adds basic annotations to the 
     fds for each intron based on the overlap of the intron's location with 
     the reference annotation. Has to be run before the result table is 
     created so that the new column can be included in it (see examples).
 
-\code{\link{annotateSpliceEventType}} annotates each intron in the results 
-    table with the type of splice event and expected occurence of frameshift 
-    (likely, unlikely, inconclusive). Can also calculate overlap with 
-    annotated UTR regions. Splice types can be one of: 
+\code{\link{annotatePotentialImpact}} annotates each intron in the results 
+    table with the type of potential impact on splicing and potential 
+    occurence of frameshift (likely, unlikely, inconclusive). Can also 
+    calculate overlap with annotated UTR regions. Potential impact can be: 
     annotatedIntron_increasedUsage, annotatedIntron_reducedUsage, 
     exonTruncation, exonElongation, exonTruncation&Elongation, 
-    singleExonSkipping, exonSkipping, splicingBeyondGene, 
+    exonSkipping, splicingBeyondGene, 
     multigenicSplicing, downstreamOfNearestGene, upstreamOfNearestGene, 
     complex (everything else).     
     Splice sites (theta metric) annotations indicate how the splice site is 
@@ -93,20 +93,20 @@ These functions work on the result table and add additional
 }
 \section{Functions}{
 \itemize{
-\item \code{annotateIntronReferenceOverlap}: This method calculates basic annotations 
+\item \code{annotateIntronReferenceOverlap()}: This method calculates basic annotations 
 based on overlap with the reference annotation (start, end, none, both) 
 for the full fds. The overlap type is added as a new column 
 \code{annotatedJunction} in \code{mcols(fds)}.
 
-\item \code{annotateSpliceEventType}: This method annotates the splice event 
+\item \code{annotatePotentialImpact()}: This method annotates the splice event 
 type to junctions in the given results table.
 
-\item \code{flagBlacklistRegions}: This method flags all introns and 
+\item \code{flagBlacklistRegions()}: This method flags all introns and 
 splice sites in the given results table for which at least one splice 
 site (donor or acceptor) is located in a blacklist region. Blacklist 
 regions of the genome are determined from the provided BED file.
-}}
 
+}}
 \examples{
   # get data, fit and compute p-values and z-scores
   fds <- createTestFraserDataSet()
@@ -123,8 +123,8 @@ regions of the genome are determined from the provided BED file.
   # of 1 is used to get at least one result. 
   res <- results(fds, padjCutoff=NA, zScoreCutoff=1, deltaPsiCutoff=NA)
   
-  # annotate the type of splice event and UTR overlap
-  res <- annotateSpliceEventType(result=res, txdb=txdb, fds=fds)
+  # annotate the type of potential impact on splicing and UTR overlap
+  res <- annotatePotentialImpact(result=res, txdb=txdb, fds=fds)
   
   # annotate overlap with blacklist regions
   res <- flagBlacklistRegions(result=res, assemblyVersion="hg19")

From c1fb2b318a434d9f640168ddd2e81742ad5cf54e Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Thu, 15 Dec 2022 15:28:45 +0100
Subject: [PATCH 59/80] update potential frameshift colname

---
 R/FRASER-package.R    |  6 +++---
 R/resultAnnotations.R | 24 ++++++++++++------------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/R/FRASER-package.R b/R/FRASER-package.R
index 85569e89..a175a0bd 100644
--- a/R/FRASER-package.R
+++ b/R/FRASER-package.R
@@ -134,10 +134,10 @@ globalVariables(c(".", "J", ".N", ".asDataFrame", "End", "first_feature",
         "psiType", "psiValue", "seqlength", "seqlevel", "Step", "traceNr",
         "uniqueID", "V1", "value", "zscore", "maxDTheta", "par", "genes_donor",
         "genes_acceptor", "gene_pval", "gene_padj", "dt_idx", 
-        "blacklist", "spliceType", "causesFrameshift", "annotatedJunction", 
+        "blacklist", "potentialImpact", "causesFrameshift", "annotatedJunction", 
         "distNearestGene", "UTR_overlap", "meanCount", "medianCount", 
-        "spliceType2", "nonsplitProportion", "nonsplitCounts", 
-        "nonsplitProportion_99quantile", "startID", "endID", "j_idx", 
+        "potentialImpact2", "nonsplitProportion", "nonsplitCounts", 
+        "nonsplitProportion_99quantile", "startID", "endID", "j_idx", "jidx", 
         "start_idx", "end_idx", "pval_gene", "FDR_subset_gene", "gene_id", 
         "pvalue"),
         package="FRASER")
diff --git a/R/resultAnnotations.R b/R/resultAnnotations.R
index bacc8b1c..7ab9b61d 100644
--- a/R/resultAnnotations.R
+++ b/R/resultAnnotations.R
@@ -329,23 +329,23 @@ addPotentialImpactLabels <- function(junctions_dt, fds, txdb){
     
     # prepare the results column
     junctions_dt[, potentialImpact := "complex"]
-    junctions_dt[, potentialFrameshift := "inconclusive"]
+    junctions_dt[, causesFrameshift := "inconclusive"]
     junctions_dt[annotatedJunction == "both" & deltaPsi >= 0, 
                     potentialImpact := "annotatedIntron_increasedUsage"]
     junctions_dt[annotatedJunction == "both" & deltaPsi < 0, 
                     potentialImpact := "annotatedIntron_reducedUsage"]
-    junctions_dt[annotatedJunction == "both", potentialFrameshift := "unlikely"]
+    junctions_dt[annotatedJunction == "both", causesFrameshift := "unlikely"]
     
     if(all(c("nonsplitProportion", "nonsplitProportion_99quantile") %in% 
             colnames(junctions_dt))){
-        junctions_dt[spliceType == "annotatedIntron_reducedUsage" & 
+        junctions_dt[potentialImpact == "annotatedIntron_reducedUsage" & 
                     type == "jaccard" &
                     nonsplitProportion >= nonsplitProportion_99quantile + 0.05 &
                     nonsplitCounts >= 10,
-                spliceType := "(partial)intronRetention"]
+                potentialImpact := "(partial)intronRetention"]
         
         # TODO check frameshift for intron retention
-        junctions_dt[spliceType == "(partial)intronRetention", 
+        junctions_dt[potentialImpact == "(partial)intronRetention", 
                         causesFrameshift := "inconclusive"]
     }
     
@@ -367,7 +367,7 @@ addPotentialImpactLabels <- function(junctions_dt, fds, txdb){
                             intron_ranges, exons))
     })
     junctions_dt[psi_positions[starts], 
-                    potentialFrameshift:=start_results[2,]]
+                    causesFrameshift:=start_results[2,]]
     junctions_dt[psi_positions[starts], 
                     potentialImpact := start_results[1,]]
     
@@ -384,7 +384,7 @@ addPotentialImpactLabels <- function(junctions_dt, fds, txdb){
                                 intron_ranges, exons))
         
     })
-    junctions_dt[psi_positions[ends], potentialFrameshift:=end_results[2,]]
+    junctions_dt[psi_positions[ends], causesFrameshift:=end_results[2,]]
     junctions_dt[psi_positions[ends], potentialImpact := end_results[1,]]
     
     # none junctions pt1
@@ -436,7 +436,7 @@ addPotentialImpactLabels <- function(junctions_dt, fds, txdb){
         return(c(combined,frs))
         
     })
-    junctions_dt[psi_positions[nones], potentialFrameshift:=none_results[2,]]
+    junctions_dt[psi_positions[nones], causesFrameshift:=none_results[2,]]
     junctions_dt[psi_positions[nones], potentialImpact := none_results[1,]]
     
     noLaps <-which(junctions_dt[psi_positions]$potentialImpact=="noOverlap")
@@ -467,7 +467,7 @@ addPotentialImpactLabels <- function(junctions_dt, fds, txdb){
         return(c("complex","inconclusive"))
     })
     junctions_dt[psi_positions[noLaps], 
-                    potentialFrameshift:=noLaps_results[2,]]
+                    causesFrameshift:=noLaps_results[2,]]
     junctions_dt[psi_positions[noLaps], 
                     potentialImpact := noLaps_results[1,]]
     
@@ -478,7 +478,7 @@ addPotentialImpactLabels <- function(junctions_dt, fds, txdb){
     
     # specify default type for theta results as NA
     junctions_dt[thetas, potentialImpact := NA]
-    junctions_dt[thetas, potentialFrameshift := NA]
+    junctions_dt[thetas, causesFrameshift := NA]
     
     # label all as intronic first if they have any intron overlap
     intronic <- unique(from(findOverlaps(junctions_gr, introns_tmp)))
@@ -895,11 +895,11 @@ checkExonSkipping <- function(junctions_dt, txdb){
         junctions_dt[potentialImpact2 == "splicingBeyondGene", 
                         potentialImpact := "splicingBeyondGene"]
         junctions_dt[potentialImpact2 == "splicingBeyondGene", 
-                        potentialFrameshift := "inconclusive"]
+                        causesFrameshift := "inconclusive"]
         junctions_dt[potentialImpact2 == "multigenicSplicing", 
                         potentialImpact := "multigenicSplicing"]
         junctions_dt[potentialImpact2 == "multigenicSplicing", 
-                        potentialFrameshift := "inconclusive"]
+                        causesFrameshift := "inconclusive"]
         junctions_dt[, potentialImpact2 := NULL]
     }
     

From 5625f6116c357b4ab22a4ee965c99912e9e09dba Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Mon, 19 Dec 2022 16:49:58 +0100
Subject: [PATCH 60/80] fix to ignore cross-chromosome read pairs during
 counting

---
 R/countRNAseqData.R | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/R/countRNAseqData.R b/R/countRNAseqData.R
index ee9f908b..9ef6ef5f 100644
--- a/R/countRNAseqData.R
+++ b/R/countRNAseqData.R
@@ -569,6 +569,10 @@ countSplitReadsPerChromosome <- function(chromosome, bamFile,
                 bamFile, param=param, strandMode=strandMode)
     }
     
+    # remove read pairs with NA seqnames 
+    # (occurs if reads of a pair align to different chromosomes)
+    galignment <- galignment[!is.na(seqnames(galignment))]
+    
     # remove the strand information if unstranded data
     if(isFALSE(as.logical(strandMode))){
         strand(galignment) <- "*"

From 8de38a220558da9570ab96c821513ba5c86aff5f Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Mon, 19 Dec 2022 17:05:49 +0100
Subject: [PATCH 61/80] use correct pseudocount in plot heatmap function

---
 R/plotMethods.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/plotMethods.R b/R/plotMethods.R
index e21471b3..555385f8 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -1034,7 +1034,7 @@ plotCountCorHeatmap.FRASER <- function(object,
         object <- object[,ids2plot]
     }
 
-    xmat <- (skmat + 1)/(snmat + 2)
+    xmat <- (skmat + 1*pseudocount())/(snmat + 2*pseudocount())
     if(isTRUE(logit)){
         xmat <- qlogisWithCap(xmat)
     }

From cf115564057081bb353add734147492f1b245d3a Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Tue, 20 Dec 2022 09:27:08 +0100
Subject: [PATCH 62/80] fix typo

---
 R/AllGenerics.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index 5c4cff54..db93ff9c 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -870,10 +870,10 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
                                     function(s) which(colnames(tmp_x) ==s)),
                                 FDR_subset)]
                 )
-                # replace padj values with values on subset
-                padj <- matrix(NA, nrow=nrow(pvals), ncol=ncol(pvals))
+                # replace padjs values with values on subset
+                padjs <- matrix(NA, nrow=nrow(pvals), ncol=ncol(pvals))
                 if(nrow(subset_padj) > 0){
-                    padj[subset_padj[,1:2]] <- subset_padj[,3]
+                    padjs[subset_padj[,1:2]] <- subset_padj[,3]
                 }
             }
             

From f57f4c8dd189d47e205cc9afe38df47ff1be458b Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Tue, 20 Dec 2022 10:12:37 +0100
Subject: [PATCH 63/80] add colnames to subsetted padjust for later retrieval

---
 R/AllGenerics.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index db93ff9c..7a632521 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -875,6 +875,7 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
                 if(nrow(subset_padj) > 0){
                     padjs[subset_padj[,1:2]] <- subset_padj[,3]
                 }
+                colnames(padjs) <- colnames(pvals)
             }
             
             if(length(sc) == 1){

From f17ffe825e0eece2418d428e70c24d5bd31a377d Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Thu, 19 Jan 2023 15:54:19 +0100
Subject: [PATCH 64/80] minor bug fixes

---
 R/AllGenerics.R | 19 ++++++++++++-------
 R/plotMethods.R |  3 ++-
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index 7a632521..fdbf7045 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -831,10 +831,10 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
                     # get row,col idx of genes/samples in subset
                     subset_gene_padj <- as.matrix(
                         fdr_subset[sampleID %in% samples(tmp_x), 
-                            .(sapply(gene, 
-                                function(g) which(rownames(pvalsGene) == g)),
-                            sapply(sampleID, 
-                                function(s) which(colnames(tmp_x) ==s)),
+                            .(as.numeric(sapply(gene, 
+                                function(g) which(rownames(pvalsGene) == g)) ),
+                            as.numeric(sapply(sampleID, 
+                                function(s) which(colnames(tmp_x) ==s)) ),
                             pval_gene,
                             FDR_subset_gene)]
                     )
@@ -850,9 +850,9 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
                     colnames(padjsGene) <- colnames(tmp_x)
                     if(nrow(subset_gene_padj) > 0){
                         pvalsGene[subset_gene_padj[,1:2]] <- 
-                            subset_gene_padj[,3]
+                            unlist(subset_gene_padj[,3])
                         padjsGene[subset_gene_padj[,1:2]] <- 
-                            subset_gene_padj[,4]
+                            unlist(subset_gene_padj[,4])
                     }
                 }
                 
@@ -1188,7 +1188,12 @@ aberrant.FRASER <- function(object, type=fitMetrics(object),
         
         # define aberrant status based on whether intron/sample tuples are 
         # part of the given subset
-        aberrantEvents <- matrix(FALSE, nrow=nrow(padj), ncol=ncol(padj))
+        if(is.null(dim(padj))){
+            # only one sample in fds present -> no dimensions set
+            aberrantEvents <- matrix(FALSE, nrow=length(padj), ncol=1)
+        } else{
+            aberrantEvents <- matrix(FALSE, nrow=nrow(padj), ncol=ncol(padj))
+        }
         colnames(aberrantEvents) <- colnames(padj)
         FDR_col <- ifelse(isTRUE(aggregate), "FDR_subset_gene", "FDR_subset")
         subset_idx <- lapply(fdr_subset[, unique(sampleID)], function(sid){
diff --git a/R/plotMethods.R b/R/plotMethods.R
index 555385f8..faace9a4 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -440,7 +440,8 @@ plotAberrantPerSample.FRASER <- function(object, main,
         scale_linetype_manual(name="", values=2, labels="Median")
 
     if(!all(dt2p[,value] == 0)){
-        g <- g + scale_y_log10() + annotation_logticks(sides="l")
+        g <- g + scale_y_log10(limits=c(1, max(unlist(outliers)))) + 
+            annotation_logticks(sides="l")
     }
     
     g

From ade6ebfd29e060217a7bb42e5e754f5c0302bc57 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Thu, 19 Jan 2023 16:37:55 +0100
Subject: [PATCH 65/80] fix small bug in extracting junction idx of genes in
 subsets

---
 R/pvalsNzscore.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R
index 979d97e8..8a9e7e82 100644
--- a/R/pvalsNzscore.R
+++ b/R/pvalsNzscore.R
@@ -426,7 +426,8 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds),
         # message(date(), ": FDR subset calculation for sample = ", sample_id)
         # get idx of junctions corresponding to genes with var
         jidx <- unlist(lapply(genes_to_test_sample, function(gene){
-            idx <- which(grepl(gene, mcols(fds, type=type)[, geneColumn]))
+            idx <- which(grepl(paste0("(^|;)", gene, "(;|$)"), 
+                                mcols(fds, type=type)[, geneColumn]))
             names(idx) <- rep(gene, length(idx))
             if(length(idx) == 0 && verbose(fds) > 0){
                 warning("No introns found in fds object for gene: ", gene, 

From 37c757aebafbdf957abea734e70ebde1c9865efa Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Wed, 15 Feb 2023 15:44:32 +0100
Subject: [PATCH 66/80] move nonsplitProportion calculation before subsetting
 in results function

---
 R/AllGenerics.R | 50 ++++++++++++++++++++++++-------------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index fdbf7045..b99ddd8e 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -749,6 +749,18 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
         rowMeansK <- rowMeans(K(object, type=type))
         rowMeansN <- rowMeans(N(object, type=type))
         
+        # get proportion of nonsplitCounts among all counts (N) for each intron
+        if(type == "jaccard"){
+            rawNonsplitCts  <- as.matrix(assay(object, "rawCountsJnonsplit"))
+            rawNsProportion <- rawNonsplitCts / as.matrix(N(object))
+            nsProportion_99quantile <- 
+                rowQuantiles(rawNsProportion, probs=0.99)
+        } else{
+            rawNonsplitCts  <- NULL
+            rawNsProportion <- NULL
+            nsProportion_99quantile <- NULL
+        }
+        
         # calculate FDR on subset first if requested
         if(!is.null(geneSubset)){
             object <- calculatePadjValuesOnSubset(fds=object, type=type, 
@@ -884,33 +896,21 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
                 colnames(deltaPsiVals) <- sc
             }
             
-            if(type == "jaccard"){
-                rawNonsplitCts  <- as.matrix(assay(tmp_x, "rawCountsJnonsplit"))
-                rawNsProportion <- rawNonsplitCts / rawTotalCts
-                nsProportion_99quantile <- 
-                    rowQuantiles(rawNsProportion, probs=0.99)
-            } else{
-                rawNonsplitCts  <- NULL
-                rawNsProportion <- NULL
-                nsProportion_99quantile <- NULL
-            }
-            
-            
             # create result table
             sampleRes <- lapply(sc,
-                                resultsSingleSample, gr=gr, pvals=pvals, 
-                                padjs=padjs, psiType=type, 
-                                psivals=psivals, deltaPsiVals=deltaPsiVals, 
-                                rawCts=rawCts, rawTotalCts=rawTotalCts, 
-                                rawNonsplitCts=rawNonsplitCts, 
-                                rawNsProportion=rawNsProportion,
-                                nsProportion_99quantile=nsProportion_99quantile,
-                                rowMeansK=rowMeansK, rowMeansN=rowMeansN, 
-                                aberrant=aberrant, aggregate=aggregate,
-                                rho=rho, geneColumn=geneColumn,
-                                pvalsGene=pvalsGene, padjsGene=padjsGene,
-                                aberrantGene=aberrantGene, 
-                                additionalColumns=additionalColumns)
+                        resultsSingleSample, gr=gr, pvals=pvals, 
+                        padjs=padjs, psiType=type, 
+                        psivals=psivals, deltaPsiVals=deltaPsiVals, 
+                        rawCts=rawCts, rawTotalCts=rawTotalCts, 
+                        rawNonsplitCts=rawNonsplitCts[,sc], 
+                        rawNsProportion=rawNsProportion[,sc],
+                        nsProportion_99quantile=nsProportion_99quantile,
+                        rowMeansK=rowMeansK, rowMeansN=rowMeansN, 
+                        aberrant=aberrant, aggregate=aggregate,
+                        rho=rho, geneColumn=geneColumn,
+                        pvalsGene=pvalsGene, padjsGene=padjsGene,
+                        aberrantGene=aberrantGene, 
+                        additionalColumns=additionalColumns)
             
             # return combined result
             return(unlist(GRangesList(sampleRes)))

From c1e13d3d63731e5c326543ea8049f8f17e838641 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Wed, 15 Feb 2023 17:37:32 +0100
Subject: [PATCH 67/80] small bugfix

---
 R/AllGenerics.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index b99ddd8e..c27960f1 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -902,8 +902,8 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
                         padjs=padjs, psiType=type, 
                         psivals=psivals, deltaPsiVals=deltaPsiVals, 
                         rawCts=rawCts, rawTotalCts=rawTotalCts, 
-                        rawNonsplitCts=rawNonsplitCts[,sc], 
-                        rawNsProportion=rawNsProportion[,sc],
+                        rawNonsplitCts=rawNonsplitCts[,sc,drop=FALSE], 
+                        rawNsProportion=rawNsProportion[,sc,drop=FALSE],
                         nsProportion_99quantile=nsProportion_99quantile,
                         rowMeansK=rowMeansK, rowMeansN=rowMeansN, 
                         aberrant=aberrant, aggregate=aggregate,

From 68f703912b2207b056c9ce8986864b51e34849ed Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Thu, 16 Feb 2023 10:08:07 +0100
Subject: [PATCH 68/80] small bugfix

---
 R/AllGenerics.R  | 2 +-
 R/pvalsNzscore.R | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index c27960f1..6b72e958 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -1194,7 +1194,7 @@ aberrant.FRASER <- function(object, type=fitMetrics(object),
         } else{
             aberrantEvents <- matrix(FALSE, nrow=nrow(padj), ncol=ncol(padj))
         }
-        colnames(aberrantEvents) <- colnames(padj)
+        colnames(aberrantEvents) <- colnames(object)
         FDR_col <- ifelse(isTRUE(aggregate), "FDR_subset_gene", "FDR_subset")
         subset_idx <- lapply(fdr_subset[, unique(sampleID)], function(sid){
             col <- which(colnames(object) == sid)
diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R
index 8a9e7e82..ccad870f 100644
--- a/R/pvalsNzscore.R
+++ b/R/pvalsNzscore.R
@@ -452,7 +452,11 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds),
         }
         
         # retrieve pvalues of junctions to test
-        p <- pVals(fds, type=type, level="junction")[jidx, sample_id]
+        p <- as.matrix(pVals(fds, type=type, level="junction"))
+        if(ncol(p) == 1){
+            colnames(p) <- colnames(fds)    
+        }
+        p <- p[jidx, sample_id]
         
         # FDR correction
         pa <- p.adjust(p, method=method)

From 63e0654c6217a215321b1107f61cc41b6a8c406e Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Thu, 16 Feb 2023 17:35:44 +0100
Subject: [PATCH 69/80] fix gene pval in FDR on subset for psi3/5

---
 R/pvalsNzscore.R | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R
index ccad870f..94c9c151 100644
--- a/R/pvalsNzscore.R
+++ b/R/pvalsNzscore.R
@@ -418,6 +418,9 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds),
             "annotateRanges or annotateRangesWithTxDb function."))
     }
     
+    # site index (for psi3/5)
+    site_idx <- getSiteIndex(fds, type=type)
+        
     # compute FDR on the given subsets of genes
     message(date(), ": starting FDR calculation on subset of genes...")
     FDR_subset <- rbindlist(bpmapply(names(genesToTest), genesToTest, 
@@ -451,20 +454,22 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds),
                               FDR_subset_gene=numeric(0)))
         }
         
-        # retrieve pvalues of junctions to test
-        p <- as.matrix(pVals(fds, type=type, level="junction"))
+        # retrieve pvalues of junctions
+        p <- as.matrix(pVals(fds, type=type))
         if(ncol(p) == 1){
             colnames(p) <- colnames(fds)    
         }
         p <- p[jidx, sample_id]
         
         # FDR correction
-        pa <- p.adjust(p, method=method)
+        pa <- p.adjust(p[!duplicated(site_idx[jidx])], method=method)
         
         # gene level pvals
-        dt <- data.table(sampleID=sample_id, type=type, pval=p, FDR_subset=pa, 
-                        gene=names(jidx), jidx=jidx)
-        dt[, pval_gene:=min(p.adjust(pval, method="holm")), by="gene"]
+        dt <- data.table(sampleID=sample_id, type=type, pval=p, 
+                         gene=names(jidx), jidx=jidx, site_idx=site_idx[jidx])
+        dt <- merge(dt, data.table(site_idx=site_idx[jidx][!duplicated(site_idx[jidx])], FDR_subset=pa), by="site_idx")
+        dt[!duplicated(dt$site_idx), pval_gene:=min(p.adjust(pval, method="holm")), by="gene"]
+        dt[, pval_gene := .SD[!is.na(pval_gene), unique(pval_gene)], by="gene"]
         
         # gene level FDR
         dt2 <- dt[, unique(pval_gene), by="gene"]

From 17811132069569a308d5e68fb3912f1a668e7d6b Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Mon, 13 Mar 2023 15:13:29 +0100
Subject: [PATCH 70/80] enable subsetting on manhattan plot & added
 plotSpliceMetricRank

---
 NAMESPACE            |   1 +
 R/plotMethods.R      | 177 ++++++++++++++++++++++++++++++++++++++++---
 man/plotFunctions.Rd |  34 ++++++++-
 3 files changed, 197 insertions(+), 15 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index bf731b72..64a7caf6 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -76,6 +76,7 @@ export(plotFilterExpression)
 export(plotFilterVariability)
 export(plotManhattan)
 export(plotQQ)
+export(plotSpliceMetricRank)
 export(plotVolcano)
 export(predictedMeans)
 export(pseudocount)
diff --git a/R/plotMethods.R b/R/plotMethods.R
index faace9a4..7609939c 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -18,6 +18,7 @@
 #'   \item plotBamCoverage()
 #'   \item plotBamCoverageFromResultTable()
 #'   \item plotManhattan()
+#'   \item plotSpliceMetricRank()
 #' }
 #'
 #' For a detailed description of each plot function please see the details.
@@ -162,7 +163,11 @@
 #'             (b,l,t,r).
 #' @param cex For controlling the size of text and numbers in 
 #'             \code{plotBamCoverage}.
-#' @param color_chr Interchanging colors by chromosome for \code{plotManhattan}.
+#' @param chr Vector of chromosome names to show in \code{plotManhattan}. The 
+#'             default is to show all chromosomes.
+#' @param value Indicates which assay is shown in the manhattan plot. Defaults 
+#'             to 'pvalue'. Other options are 'deltaPsi' and 'zScore'.
+#' @param chrColor Interchanging colors by chromosome for \code{plotManhattan}.
 #'
 #### Additional ... parameter
 #' @param ... Additional parameters passed to plot() or plot_ly() if not stated
@@ -190,6 +195,9 @@
 #'
 #' \code{plotExpectedVsObservedPsi}: A scatter plot of the observed psi
 #' against the predicted psi for a given site.
+#' 
+#' \code{plotSpliceMetricRank}: This function plots for a given intron the
+#' observed values of the selected splice metrix against the sample rank.
 #'
 #' \code{plotCountCorHeatmap}: The correlation heatmap of the count data either
 #' of the full data set (i.e. sample-sample correlations) or of the top x most
@@ -549,6 +557,116 @@ plotExpression <- function(fds, type=fitMetrics(fds),
     plotBasePlot(g, basePlot)
 }
 
+#'
+#' Junction splice metric plot
+#'
+#' Plots the observed values of the splice metric across samples for a 
+#' junction of interest.
+#'
+#' @rdname plotFunctions
+#' @export
+plotSpliceMetricRank <- function(fds, type=fitMetrics(fds),
+                           idx=NULL, result=NULL, colGroup=NULL, 
+                           basePlot=TRUE, main=NULL, label="aberrant", ...){
+    if(!is.null(result)){
+        type <- as.character(result$type)
+        idx <- getIndexFromResultTable(fds, result)
+    } else {
+        type <- match.arg(type)
+    }
+    
+    dt <- getPlottingDT(fds, axis="row", type=type, idx=idx, ...)
+    dt[,featureID:=limitGeneNamesList(featureID, maxLength=3)]
+    
+    # rank on observed value of splice metric of interest
+    dt[, rank := rank(obsPsi, ties.method="random", na.last=FALSE)]
+    
+    if(!is.null(colGroup)){
+        if(all(colGroup %in% samples(fds))){
+            colGroup <- samples(fds) %in% colGroup
+        }
+        dt[colGroup,aberrant:=TRUE]
+    }
+    dt[,aberrant:=factor(aberrant, levels=c("TRUE", "FALSE"))]
+    
+    gr <- granges(rowRanges(fds,type=type)[idx,])
+    genomic_pos_label <- paste0(seqnames(gr), ":", start(gr), "-", end(gr), 
+                                ":", strand(gr))
+    
+    if(is.null(main)){
+        if(isTRUE(basePlot)){
+            # main <- as.expression(bquote(bold(paste(
+            #     .(ggplotLabelPsi(type)[[1]]), " rank plot: ",
+            #     .(genomic_pos_label),
+            #     " (", bolditalic(.(as.character(dt[,unique(featureID)]))), 
+            #     ")"))))
+            main <- as.expression(bquote(bold(paste(
+                    .(genomic_pos_label),
+                    " (", bolditalic(.(as.character(dt[,unique(featureID)]))), 
+                    ")"))))
+        } else{
+            # main <- paste0(ggplotLabelPsi(type, asCharacter=TRUE)[[1]], 
+            #                " rank plot: ", dt[,unique(featureID)], 
+            #                " (site ", dt[,unique(idx)], ")")
+            main <- paste0(genomic_pos_label,
+                " (", dt[,unique(featureID)], ")")
+        }
+    }
+    
+    if(isTRUE(basePlot)){
+        ylab <- bquote("Observed " ~ .(ggplotLabelPsi(type)[[1]]))
+    } else{
+        ylab <- paste("Observed", ggplotLabelPsi(type, asCharacter=TRUE)[[1]])
+    }
+    
+    g <- ggplot(dt, aes(x=rank, y=obsPsi, color=aberrant, label=sampleID, 
+                        text=paste0(
+                            "Sample: ", sampleID, "<br>",
+                            "Counts (K): ", k, "<br>",
+                            "Total counts (N): ", n, "<br>",
+                            "p value: ", signif(pval, 5), "<br>",
+                            "padjust: ", signif(padj, 5), "<br>",
+                            "Observed Psi: ", round(obsPsi, 2), "<br>",
+                            "Predicted mu: ", round(predPsi, 2), "<br>"))) +
+        geom_point(alpha=ifelse(as.character(dt$aberrant) == "TRUE", 1, 0.7)) +
+        theme_bw() +
+        theme(legend.position="none", title=) +
+        xlab("Sample rank") +
+        ylab(ylab) +
+        ggtitle(main, subtitle=paste0("fds row index: ", dt[,unique(idx)])) +
+        ylim(0,1)
+    
+    
+    if(isTRUE(basePlot) && !is.null(label)){
+        if(isScalarCharacter(label) && label == "aberrant"){
+            if(nrow(dt[aberrant == TRUE,]) > 0){
+                g <- g + geom_text_repel(data=dt[aberrant == TRUE,], 
+                                         aes(col=aberrant),
+                                         fontface='bold', hjust=-.2, vjust=-.2)
+            }
+        }
+        else{
+            if(nrow(dt[sampleID %in% label]) > 0){
+                g <- g + geom_text_repel(data=subset(dt, sampleID %in% label), 
+                                         aes(col=aberrant),
+                                         fontface='bold', hjust=-.2, vjust=-.2)
+            }
+            if(any(!(label %in% dt[,sampleID]))){
+                warning("Did not find sample(s) ", 
+                        paste(label[!(label %in% dt[,sampleID])], 
+                              collapse=", "), " to label.")
+            }
+        }
+    }
+    
+    if(is.null(colGroup)){
+        g <- g + scale_colour_manual(
+            values=c("FALSE"="gray70", "TRUE"="firebrick"))
+    }
+    
+    plotBasePlot(g, basePlot)
+}
+
 
 #'
 #' Expected over Overserved plot
@@ -1432,10 +1550,10 @@ plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE,
     return(invisible(fds))
 }
 
-plotManhattan.FRASER <- function(object, sampleID, 
-                                type=fitMetrics(object), 
-                                main=paste0("sampleID = ", sampleID), 
-                                color_chr=c("black", "darkgrey"),
+plotManhattan.FRASER <- function(object, sampleID, value="pvalue", 
+                                type=fitMetrics(object), chr=NULL, 
+                                main=paste0("sample: ", sampleID), 
+                                chrColor=c("black", "darkgrey"),
                                 ...){
     # check necessary packages
     if (!requireNamespace('ggbio')){
@@ -1454,7 +1572,7 @@ plotManhattan.FRASER <- function(object, sampleID,
     if("padjCutoff" %in% names(additional_args)){
         padjCutoff <- additional_args$padjCutoff
     }
-    deltaPsiCutoff <- 0.3
+    deltaPsiCutoff <- ifelse(type == "jaccard", 0.1, 0.3)
     if("deltaPsiCutoff" %in% names(additional_args)){
         deltaPsiCutoff <- additional_args$deltaPsiCutoff
     }
@@ -1468,6 +1586,20 @@ plotManhattan.FRASER <- function(object, sampleID,
         padjVals(object, type=type, level="site")[,sampleID])
     mcols(gr_sample)[,"delta"] <- deltaPsiValue(object, type=type)[,sampleID]
     
+    # Add values to granges
+    if(value %in% c('pvalue', 'pValue', 'pv')){
+        gr_sample$value <- mcols(gr_sample)[, "pvalue"]
+        ylabel <- expression(paste(-log[10], "(P-value)"))
+    }
+    if(value %in% c('zscore', 'zScore')){
+        gr_sample$value <- zScores(object, type=type)[, sampleID]
+        ylabel <- value
+    }
+    if(value %in% c('delta', 'deltaPsi', 'deltaJaccard')){
+        gr_sample$value <- mcols(gr_sample)[, "delta"]
+        ylabel <- bquote(Delta ~ .(ggplotLabelPsi(type)[[1]]))
+    }
+    
     # only one point per donor/acceptor site (relevant only for psi5 and psi3)
     index <- getSiteIndex(object, type=type)
     nonDup <- !duplicated(index)
@@ -1477,6 +1609,24 @@ plotManhattan.FRASER <- function(object, sampleID,
     gr_sample <- sortSeqlevels(gr_sample)
     gr_sample <- sort(gr_sample)
     
+    # subset to chromosomes in chrSubset if requested
+    if(!is.null(chr)){
+        # check input
+        if(!all(chr %in% unique(seqnames(gr_sample)))){
+            stop("Not all chromosomes selected for subsetting are present ",
+                 "in the GRanges object.")
+        }
+        
+        # subset
+        gr_sample <- gr_sample[as.character(seqnames(gr_sample)) %in% chr]
+        
+        # add chr to plot title if only one chr given
+        if(length(chr) == 1){
+            main <- paste0(main, "; ", 
+                           paste(chr, collapse=", ", sep=""))
+        }
+    }
+    
     # find outlier indices
     if(!type %in% c("psi3", "psi5")){
         outlier_idx <- which(gr_sample$padjust >= -log10(padjCutoff) &
@@ -1487,11 +1637,12 @@ plotManhattan.FRASER <- function(object, sampleID,
     message("highlighting ", length(gr_sample[outlier_idx,]), " outliers ...")
     
     # plot manhattan plot
-    plotGrandLinear.adapted(gr_sample, aes(y=pvalue), 
-                            color=color_chr, 
+    plotGrandLinear.adapted(gr_sample, aes(y=value), 
+                            color=chrColor, 
                             highlight.gr=gr_sample[outlier_idx,],
-                            highlight.overlap="equal") + 
-        labs(title=main)
+                            highlight.overlap="equal",
+                            use.genome.coords=is.null(chr)) + 
+        labs(title=main, x="", y=ylabel)
     
 }
 
@@ -1614,7 +1765,7 @@ plotGrandLinear.adapted <- function (obj, ..., facets, space.skip = 0.01,
         highlight.label.size = 5, highlight.label.offset = 0.05, 
         highlight.label.col = "black", 
         highlight.overlap = c("any", "start", "end", "within", "equal"),
-        spaceline = FALSE){
+        spaceline = FALSE, use.genome.coords=TRUE){
     if (is.null(geom)) 
         geom <- "point"
     args <- list(...)
@@ -1644,7 +1795,9 @@ plotGrandLinear.adapted <- function (obj, ..., facets, space.skip = 0.01,
     }
     if (!"y" %in% names(args.aes)) 
         stop("need to provide y")
-    args.non$coord <- "genome"
+    if(isTRUE(use.genome.coords)){
+        args.non$coord <- "genome"
+    }
     args.non$space.skip <- space.skip
     args.non$geom <- geom
     args.non$object <- obj
diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd
index 79b1db7c..db8025a0 100644
--- a/man/plotFunctions.Rd
+++ b/man/plotFunctions.Rd
@@ -15,6 +15,7 @@
 \alias{plotBamCoverageFromResultTable}
 \alias{plotVolcano,FraserDataSet-method}
 \alias{plotAberrantPerSample,FraserDataSet-method}
+\alias{plotSpliceMetricRank}
 \alias{plotQQ,FraserDataSet-method}
 \alias{plotEncDimSearch,FraserDataSet-method}
 \alias{plotFilterVariability}
@@ -60,6 +61,18 @@ plotExpression(
   ...
 )
 
+plotSpliceMetricRank(
+  fds,
+  type = fitMetrics(fds),
+  idx = NULL,
+  result = NULL,
+  colGroup = NULL,
+  basePlot = TRUE,
+  main = NULL,
+  label = "aberrant",
+  ...
+)
+
 plotExpectedVsObservedPsi(
   fds,
   type = fitMetrics(fds),
@@ -169,9 +182,11 @@ plotBamCoverageFromResultTable(
 \S4method{plotManhattan}{FraserDataSet}(
   object,
   sampleID,
+  value = "pvalue",
   type = fitMetrics(object),
-  main = paste0("sampleID = ", sampleID),
-  color_chr = c("black", "darkgrey"),
+  chr = NULL,
+  main = paste0("sample: ", sampleID),
+  chrColor = c("black", "darkgrey"),
   ...
 )
 }
@@ -371,7 +386,13 @@ junction should be extended to the left in
 outlier junction should be extended to the right in 
 \code{plotBamCoverageFromResultTable}.}
 
-\item{color_chr}{Interchanging colors by chromosome for \code{plotManhattan}.}
+\item{value}{Indicates which assay is shown in the manhattan plot. Defaults 
+to 'pvalue'. Other options are 'deltaPsi' and 'zScore'.}
+
+\item{chr}{Vector of chromosome names to show in \code{plotManhattan}. The 
+default is to show all chromosomes.}
+
+\item{chrColor}{Interchanging colors by chromosome for \code{plotManhattan}.}
 }
 \value{
 If base R graphics are used nothing is returned else the plotly or
@@ -390,6 +411,9 @@ Plot the number of aberrant events per samples
 Plots the observed split reads of the junction of interest over all reads
 coming from the given donor/acceptor.
 
+Plots the observed values of the splice metric across samples for a 
+junction of interest.
+
 Plots the expected psi value over the observed psi value of the given
 junction.
 
@@ -416,6 +440,7 @@ This is the list of all plotting function provided by FRASER:
   \item plotBamCoverage()
   \item plotBamCoverageFromResultTable()
   \item plotManhattan()
+  \item plotSpliceMetricRank()
 }
 
 For a detailed description of each plot function please see the details.
@@ -444,6 +469,9 @@ log10 space.
 \code{plotExpectedVsObservedPsi}: A scatter plot of the observed psi
 against the predicted psi for a given site.
 
+\code{plotSpliceMetricRank}: This function plots for a given intron the
+observed values of the selected splice metrix against the sample rank.
+
 \code{plotCountCorHeatmap}: The correlation heatmap of the count data either
 of the full data set (i.e. sample-sample correlations) or of the top x most
 variable junctions (i.e. junction-sample correlations). By default the values

From 3e8b39fbf65d4203af9405ce51c4f64bd0dc3f62 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Wed, 15 Mar 2023 15:57:09 +0100
Subject: [PATCH 71/80] added new plot functions to vignette

---
 vignettes/FRASER.Rnw | 62 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 57 insertions(+), 5 deletions(-)

diff --git a/vignettes/FRASER.Rnw b/vignettes/FRASER.Rnw
index ef49a129..f911b8ec 100644
--- a/vignettes/FRASER.Rnw
+++ b/vignettes/FRASER.Rnw
@@ -556,17 +556,19 @@ the following additional information:
     sample in which it is detected as aberrant
     \item deltaPsi: the $\Delta\psi$-value of the event in this sample, which
     is the difference between the actual observed $\psi$ and the expected $\psi$
+    \item counts, totalCounts: the count (k) and total count (n) of the splice
+    junction or site for the sample where it is detected as aberrant
     \item meanCounts: the mean count (k) of reads mapping to this splice
     junction or site over all samples
     \item meanTotalCounts: the mean total count (n) of reads mapping to the
     same donor or acceptor site as this junction or site over all samples
-    \item counts, totalCounts: the count (k) and total count (n) of the splice
-    junction or site for the sample where it is detected as aberrant
     \item nonsplitCounts, nonsplitProportion: only present for the Intron 
     Jaccard Index. States the sum of nonsplit counts overlapping either the 
     donor or acceptor site of the outlier intron for the sample where it is 
     detected as aberrant; and their proportion out of the total counts (N). 
     A high nonsplitProportion indicates possible (partial) intron retention.
+    \item FDR_set The set of genes on which FDR correction is applied. If not 
+    otherwise specified, FDR correction is transcriptome-wide.
 \end{itemize}
 Please refer to section \ref{sec:Introduction} for more information about the
 Intron Jaccard Index metric (or the previous metrics $\psi_5$, $\psi_3$ and 
@@ -609,15 +611,16 @@ plotVolcano(fds, type="jaccard", "sample10")
 Which are the splicing events in detail?
 
 <<sample result, echo=TRUE>>=
-sampleRes <- res[res$sampleID == "sample8"]
+sampleRes <- res[res$sampleID == "sample10"]
 sampleRes
 @
 
 To have a closer look at the junction level, use the following functions:
 
 <<plot_expression, echo=TRUE, eval=FALSE>>=
-plotExpression(fds, type="jaccard", result=sampleRes[1])
-plotExpectedVsObservedPsi(fds, result=sampleRes[1])
+plotExpression(fds, type="jaccard", result=sampleRes[9])
+plotSpliceMetricRank(fds, type="jaccard", result=sampleRes[9])
+plotExpectedVsObservedPsi(fds, result=sampleRes[9])
 @
 
 \subsection{Saving and loading a \fds{}}
@@ -737,6 +740,7 @@ head(padjVals(fds,type="jaccard"))
 
 In addition to the plotting methods \Rfunction{plotVolcano},
 \Rfunction{plotExpression}, \Rfunction{plotExpectedVsObservedPsi},
+\Rfunction{plotSpliceMetricRank}, 
 \Rfunction{plotFilterExpression} and \Rfunction{plotEncDimSearch} used above,
 the \fraser{} package provides two additional functions to visualize the
 results:
@@ -755,6 +759,54 @@ plotQQ(fds, result=res[1])
 plotQQ(fds, aggregate=TRUE, global=TRUE)
 @
 
+The \Rfunction{plotManhattan} function can be used to visualize the p-values 
+along with the genomic coordinates of the introns:
+<<manhattan_plot, echo=TRUE>>=
+plotManhattan(fds, sampleID="sample10")
+plotManhattan(fds, sampleID="sample10", chr="chr19")
+@
+
+Finally, when one has access to the bam files from which the split and unsplit 
+counts of FRASER were created, the \Rfunction{plotBamCoverage} and 
+\Rfunction{plotBamCoverageFromResultTable} functions use the \Rpackage{SGSeq}
+package to allow visualizing the read coverage in the bam file a certain intron 
+from the results table or within a given genomic region as a sashimi plot:
+<<bamCoverage_plot, echo=TRUE>>=
+### plot coverage from bam file for a certain genomic region
+fds <- createTestFraserSettings()
+vizRange <- GRanges(seqnames="chr19",
+                IRanges(start=7587496, end=7598895),
+                strand="+")
+plotBamCoverage(fds, gr=vizRange, sampleID="sample3",
+                control_samples="sample2", min_junction_count=5,
+                curvature_splicegraph=1, curvature_coverage=1,
+                mar=c(1, 7, 0.1, 3))
+
+### plot coverage from bam file for a row in the result table
+fds <- createTestFraserDataSet()
+
+# load gene annotation
+require(TxDb.Hsapiens.UCSC.hg19.knownGene)
+txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
+require(org.Hs.eg.db)
+orgDb <- org.Hs.eg.db
+ 
+# get results table 
+res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA)
+res_dt <- as.data.table(res)
+res_dt <- res_dt[sampleID == "sample2",]
+
+# plot full range of gene highlighting the outlier intron
+plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=TRUE,
+    txdb=txdb, orgDb=orgDb, control_samples="sample3")
+     
+# plot only certain range around the outlier intron
+plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE, 
+     control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb,
+     curvature_coverage=0.5, right_extension=5000, left_extension=5000,
+     splicegraph_labels="id")
+@
+
 \bibliography{bibliography}
 
 \section{Session Info}

From bcc4fd3a80913aaab59371f779984993e0d6e7fb Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Wed, 15 Mar 2023 16:09:47 +0100
Subject: [PATCH 72/80] fix vignette

---
 vignettes/FRASER.Rnw | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vignettes/FRASER.Rnw b/vignettes/FRASER.Rnw
index f911b8ec..99df108d 100644
--- a/vignettes/FRASER.Rnw
+++ b/vignettes/FRASER.Rnw
@@ -567,7 +567,7 @@ the following additional information:
     donor or acceptor site of the outlier intron for the sample where it is 
     detected as aberrant; and their proportion out of the total counts (N). 
     A high nonsplitProportion indicates possible (partial) intron retention.
-    \item FDR_set The set of genes on which FDR correction is applied. If not 
+    \item FDR\_set The set of genes on which FDR correction is applied. If not 
     otherwise specified, FDR correction is transcriptome-wide.
 \end{itemize}
 Please refer to section \ref{sec:Introduction} for more information about the

From 5e6893ff08f5f70c7919b19a70e0afd869b46069 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Wed, 29 Mar 2023 17:21:05 +0200
Subject: [PATCH 73/80] simplify result retrieval for FDR on subset

---
 NAMESPACE                         |   1 +
 R/AllGenerics.R                   | 262 ++++++++----------------------
 R/Fraser-pipeline.R               |  18 +-
 R/getNSetterFuns.R                |  37 ++++-
 R/helper-functions.R              |   5 +-
 R/pvalsNzscore.R                  | 175 ++++++++++++++------
 man/FRASER.Rd                     |  40 +++--
 man/countRNA.Rd                   |  16 +-
 man/filtering.Rd                  |   8 +-
 man/getter_setter_functions.Rd    |  47 +++---
 man/potentialImpactAnnotations.Rd |   8 +-
 man/results.Rd                    |  44 ++---
 tests/testthat/test_stats.R       |  14 +-
 vignettes/FRASER.Rnw              |  23 ++-
 14 files changed, 358 insertions(+), 340 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 64a7caf6..907f1408 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -27,6 +27,7 @@ export(annotateIntronReferenceOverlap)
 export(annotatePotentialImpact)
 export(annotateRanges)
 export(annotateRangesWithTxDb)
+export(availableFDRsubsets)
 export(bamFile)
 export(bestQ)
 export(calculatePSIValues)
diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index 6b72e958..7e1bf696 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -714,8 +714,7 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
                             dPsiCutoff, minCount, rhoCutoff, psiType, 
                             maxCols=20, aggregate=FALSE, collapse=FALSE,
                             geneColumn="hgnc_symbol", BPPARAM=bpparam(), 
-                            geneSubset=NULL, subsetName, fullSubset=FALSE, 
-                            additionalColumns=NULL){
+                            subsetName=NULL, all=all, additionalColumns=NULL){
     
     stopifnot(is(object, "FraserDataSet"))
     stopifnot(all(sampleIDs %in% samples(object)))
@@ -732,7 +731,8 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
     }
     pvalsAvailable <- checkPadjAvailableForFilters(object, type=psiType,
                                                    filters=list(rho=rhoCutoff), 
-                                                   aggregate=aggregate)
+                                                   aggregate=aggregate,
+                                                   subsetName=subsetName)
     psiType <- psiType[pvalsAvailable]
     if(all(isFALSE(pvalsAvailable))){
         stop("For the splice metric(s), pvalues are not yet computed. \n", 
@@ -741,7 +741,9 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
     }
     
     resultsls <- bplapply(psiType, BPPARAM=BPPARAM, function(type){
-        message(date(), ": Collecting results for: ", type)
+        message(date(), ": Collecting results for: ", type,
+                ifelse(is.null(subsetName), " (transcriptome-wide)",
+                       paste0(" (", subsetName, ")")))
         currentType(object) <- type
         gr <- rowRanges(object, type=type)
         
@@ -761,17 +763,6 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
             nsProportion_99quantile <- NULL
         }
         
-        # calculate FDR on subset first if requested
-        if(!is.null(geneSubset)){
-            object <- calculatePadjValuesOnSubset(fds=object, type=type, 
-                                                  genesToTest=geneSubset, 
-                                                  subsetName=subsetName, 
-                                                  geneColumn=geneColumn)
-            fdr_subset <- metadata(object)[[paste("FDR", subsetName, type, 
-                                                    sep="_")]]
-            object <- object[, fdr_subset[, unique(sampleID)]]
-        } 
-        
         # then iterate by chunk
         chunkCols <- getMaxChunks2Read(fds=object, assayName=type, max=maxCols)
         sampleChunks <- getSamplesByChunk(fds=object, sampleIDs=sampleIDs,
@@ -788,6 +779,7 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
             pvals        <- as.matrix(pVals(tmp_x, 
                                             filters=list(rho=rhoCutoff)))
             padjs        <- as.matrix(padjVals(tmp_x, 
+                                            subsetName=subsetName,
                                             filters=list(rho=rhoCutoff)))
             psivals      <- as.matrix(assay(tmp_x, type))
             muPsi        <- as.matrix(predictedMeans(tmp_x))
@@ -795,101 +787,36 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
                 (rawTotalCts + 2*pseudocount())
             deltaPsiVals <- deltaPsiValue(tmp_x, type)
             rho          <- rho(tmp_x, type)
-            if(is.null(geneSubset)){
-                aberrant     <- aberrant.FRASER(tmp_x, type=type, 
-                                                padjCutoff=fdrCutoff, 
-                                                deltaPsiCutoff=dPsiCutoff, 
-                                                minCount=minCount, 
-                                                rhoCutoff=rhoCutoff,
-                                                aggregate=FALSE,
-                                                geneColumn=geneColumn)
-            } else{
-                aberrant     <- aberrant.FRASER(tmp_x, type=type, 
+            aberrant     <- aberrant.FRASER(tmp_x, type=type, 
+                                            padjCutoff=fdrCutoff, 
+                                            deltaPsiCutoff=dPsiCutoff, 
+                                            minCount=minCount, 
+                                            rhoCutoff=rhoCutoff,
+                                            aggregate=FALSE,
+                                            all=all,
+                                            geneColumn=geneColumn,
+                                            subsetName=subsetName)
+            if(isTRUE(aggregate)){
+                pvalsGene    <- as.matrix(pVals(tmp_x, level="gene", 
+                                                filters=list(rho=rhoCutoff)))
+                padjsGene    <- as.matrix(padjVals(tmp_x, level="gene",
+                                                   subsetName=subsetName, 
+                                                filters=list(rho=rhoCutoff)))
+                aberrantGene <- aberrant.FRASER(tmp_x, type=type, 
                                                 padjCutoff=fdrCutoff, 
                                                 deltaPsiCutoff=dPsiCutoff, 
                                                 minCount=minCount, 
                                                 rhoCutoff=rhoCutoff,
-                                                aggregate=FALSE,
+                                                aggregate=TRUE,
+                                                all=all,
                                                 geneColumn=geneColumn,
-                                                FDR_subset=fdr_subset[sampleID 
-                                                        %in% samples(tmp_x)],
-                                                fullSubset=fullSubset)
-            }
-            if(isTRUE(aggregate)){
-                pvalsGene    <- as.matrix(pVals(tmp_x, level="gene", 
-                                                filters=list(rho=rhoCutoff)))
-                padjsGene    <- as.matrix(padjVals(tmp_x, level="gene", 
-                                                filters=list(rho=rhoCutoff)))
-                if(is.null(geneSubset)){
-                    aberrantGene <- aberrant.FRASER(tmp_x, type=type, 
-                                                    padjCutoff=fdrCutoff, 
-                                                    deltaPsiCutoff=dPsiCutoff, 
-                                                    minCount=minCount, 
-                                                    rhoCutoff=rhoCutoff,
-                                                    aggregate=TRUE,
-                                                    geneColumn=geneColumn)
-                } else{
-                    aberrantGene <- aberrant.FRASER(tmp_x, type=type, 
-                                                    padjCutoff=fdrCutoff, 
-                                                    deltaPsiCutoff=dPsiCutoff, 
-                                                    minCount=minCount, 
-                                                    rhoCutoff=rhoCutoff,
-                                                    aggregate=TRUE,
-                                                    geneColumn=geneColumn,
-                                                    FDR_subset=fdr_subset[sampleID 
-                                                            %in% samples(tmp_x)],
-                                                    fullSubset=fullSubset)
-                
-                    # get row,col idx of genes/samples in subset
-                    subset_gene_padj <- as.matrix(
-                        fdr_subset[sampleID %in% samples(tmp_x), 
-                            .(as.numeric(sapply(gene, 
-                                function(g) which(rownames(pvalsGene) == g)) ),
-                            as.numeric(sapply(sampleID, 
-                                function(s) which(colnames(tmp_x) ==s)) ),
-                            pval_gene,
-                            FDR_subset_gene)]
-                    )
-                    # replace pvalsGene and padjsGene with values on subset
-                    geneNames <- rownames(pvalsGene)
-                    pvalsGene <- matrix(NA, 
-                                        nrow=nrow(pvalsGene), ncol=ncol(tmp_x))
-                    rownames(pvalsGene) <- geneNames 
-                    colnames(pvalsGene) <- colnames(tmp_x)
-                    padjsGene <- matrix(NA, 
-                                        nrow=nrow(padjsGene), ncol=ncol(tmp_x))
-                    rownames(padjsGene) <- geneNames
-                    colnames(padjsGene) <- colnames(tmp_x)
-                    if(nrow(subset_gene_padj) > 0){
-                        pvalsGene[subset_gene_padj[,1:2]] <- 
-                            unlist(subset_gene_padj[,3])
-                        padjsGene[subset_gene_padj[,1:2]] <- 
-                            unlist(subset_gene_padj[,4])
-                    }
-                }
-                
+                                                subsetName=subsetName)
             } else{
                 pvalsGene    <- NULL
                 padjsGene    <- NULL
                 aberrantGene <- NULL
             }
             
-            if(!is.null(geneSubset)){
-                # get row,col idx of introns/samples in subset
-                subset_padj <- as.matrix(
-                    fdr_subset[sampleID %in% samples(tmp_x), .(jidx, 
-                                sapply(sampleID, 
-                                    function(s) which(colnames(tmp_x) ==s)),
-                                FDR_subset)]
-                )
-                # replace padjs values with values on subset
-                padjs <- matrix(NA, nrow=nrow(pvals), ncol=ncol(pvals))
-                if(nrow(subset_padj) > 0){
-                    padjs[subset_padj[,1:2]] <- subset_padj[,3]
-                }
-                colnames(padjs) <- colnames(pvals)
-            }
-            
             if(length(sc) == 1){
                 colnames(pvals) <- sc
                 colnames(padjs) <- sc
@@ -925,14 +852,10 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
     # sort it if existing
     if(length(ans) > 0){
         ans <- ans[order(ans$pValue)]
-        if(is.null(geneSubset)){
+        if(is.null(subsetName)){
             mcols(ans)[["FDR_set"]] <- "transcriptome-wide"
         } else{
-            if(is.null(subsetName)){
-                mcols(ans)[["FDR_set"]] <- "subset"
-            } else{
-                mcols(ans)[["FDR_set"]] <- subsetName
-            }
+            mcols(ans)[["FDR_set"]] <- subsetName
         }
     }
     
@@ -981,21 +904,14 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
 #'              types to return only one row per feature (gene) and sample.
 #' @param geneColumn The column name of the column that has the gene annotation 
 #'              that will be used for gene-level pvalue computation.
-#' @param subsets A named list of named lists specifying any number of gene 
-#'              subsets (can differ per sample). For each subset, FDR correction
-#'              will be limited to introns in genes in the subset, and aberrant 
-#'              events passing the FDR cutoff will be reported for each subset 
-#'              separately. See the examples for how to use this argument. 
-#' @param fullSubset Only applies when \code{geneSubset} is not NULL. Specifies 
-#'              whether all introns in given subset of genes should be 
-#'              considered as aberrant, or only those passing the given cutoffs. 
-#'              Defaults to FALSE (introns have to pass the cutoffs in addtion 
-#'              to being in the gene subset to be considered aberrant). 
-#' @param geneSubset A named list giving a subset of genes per sample to which  
-#'              FDR correction should be restricted. The names of the list must 
-#'              correspond to the sampleIDs in the fds object.
-#' @param subsetName The name under which the resulting FDR corrected pvalues 
-#'              on the subset only will be displayed in the result table.
+#' @param all By default FALSE, only significant introns (or genes) are listed 
+#'             in the results. If TRUE, results are assembled for all 
+#'             samples and introns/genes regardless of significance. 
+#' @param returnTranscriptomewideResults If FDR corrected pvalues for subsets 
+#'              of genes of interest have been calculated, this parameter 
+#'              indicates whether additionally the transcriptome-wide results 
+#'              should be returned as well (default), or whether only results 
+#'              for those subsets should be retrieved.
 #' @param ... Further arguments can be passed to the method. If "n",  
 #'              "padjVals", "dPsi" or "rhoVals" are given, the values of those 
 #'              arguments are used to define the aberrant events.
@@ -1012,8 +928,8 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
 #' fds <- createTestFraserDataSet()
 #' 
 #' # extract results: for this example dataset, no cutoffs are used to
-#' # get at least one result and show the output
-#' res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA)
+#' # show the output of the results function
+#' res <- results(fds, all=TRUE)
 #' res
 #' 
 #' # aggregate the results by genes (gene symbols need to be annotated first 
@@ -1039,52 +955,49 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
 #' # retrieve results limiting FDR correction to only a subset of genes
 #' # first, we need to create a list of genes per sample that will be tested
 #' geneList <- list('sample1'=c("TIMMDC1"), 'sample2'=c("MCOLN1"))
-#' results(fds, subsets=list('random_subset'=geneList), fullSubset=TRUE)
+#' fds <- calculatePadjValues(fds, type="jaccard", 
+#'                  subsets=list("exampleSubset"=geneList))
+#' results(fds, all=TRUE, returnTranscriptomewideResults=FALSE)
 #' 
-#' # results for several subsets can be retrieved at the same time:
-#' geneList2 <- list('sample1'=c("MCOLN1", "TIMMDC1"), 'sample2'=c("MCOLN1"))
-#' results(fds, 
-#'         subsets=list('random_subset'=geneList, 'another_subset'=geneList2), 
-#'         fullSubset=TRUE)
 #' @export
 setMethod("results", "FraserDataSet", function(object, 
                     sampleIDs=samples(object), padjCutoff=0.05,
                     deltaPsiCutoff=0.1,
                     rhoCutoff=NA, aggregate=FALSE, collapse=FALSE,
                     minCount=5, psiType=psiTypes,
-                    geneColumn="hgnc_symbol",
-                    subsets=NULL, fullSubset=FALSE, 
+                    geneColumn="hgnc_symbol", all=FALSE,
                     returnTranscriptomewideResults=TRUE,
                     additionalColumns=NULL, BPPARAM=bpparam()){
     psiType <- match.arg(psiType, several.ok=TRUE)
-    if(is.null(subsets)){
+    FDRsets <- availableFDRsubsets(object)
+    
+    if(isFALSE(returnTranscriptomewideResults) && is.null(FDRsets)){
+        warning("Retrieving transcriptome-wide results as no other ",
+                "FDR subsets are available in the fds object.")
         returnTranscriptomewideResults <- TRUE
-    }
+    } 
     if(isTRUE(returnTranscriptomewideResults)){
         res <- FRASER.results(object=object, sampleIDs=sampleIDs, 
                 fdrCutoff=padjCutoff, dPsiCutoff=deltaPsiCutoff, 
                 rhoCutoff=rhoCutoff, minCount=minCount, 
-                psiType=psiType,
+                psiType=psiType, all=all,
                 aggregate=aggregate, collapse=collapse, geneColumn=geneColumn,
-                geneSubset=NULL, subsetName=NULL, fullSubset=FALSE,
-                additionalColumns=additionalColumns, BPPARAM=BPPARAM)
+                subsetName=NULL, additionalColumns=additionalColumns, 
+                BPPARAM=BPPARAM)
     }
     
     # add results for FDR_subsets if requested
-    if(!is.null(subsets)){
-        stopifnot(is.list(subsets))
-        stopifnot(!is.null(names(subsets)))
-        resls_subsets <- lapply(names(subsets), function(setName){
-            geneList_sub <- subsets[[setName]]
+    if(!is.null(FDRsets)){
+        resls_subsets <- lapply(FDRsets, function(setName){
             res_sub <- FRASER.results(object=object, sampleIDs=sampleIDs, 
                 fdrCutoff=padjCutoff, dPsiCutoff=deltaPsiCutoff, 
                 rhoCutoff=rhoCutoff, minCount=minCount, 
-                psiType=psiType,
+                psiType=psiType, all=all,
                 aggregate=aggregate, collapse=collapse, geneColumn=geneColumn,
-                geneSubset=geneList_sub, subsetName=setName, 
-                fullSubset=fullSubset, 
-                additionalColumns=additionalColumns, BPPARAM=BPPARAM)
+                subsetName=setName, additionalColumns=additionalColumns, 
+                BPPARAM=BPPARAM)
         })
+        
         if(isTRUE(returnTranscriptomewideResults)){
             res <- unlist(GRangesList(unlist(list(res, resls_subsets))))
         } else{
@@ -1107,8 +1020,7 @@ aberrant.FRASER <- function(object, type=fitMetrics(object),
                                 minCount=5, rhoCutoff=NA,
                                 by=c("none", "sample", "feature"), 
                                 aggregate=FALSE, geneColumn="hgnc_symbol", 
-                                geneSubset=NULL, subsetName=NULL, 
-                                fullSubset=FALSE, ...){
+                                subsetName=NULL, all=FALSE, ...){
     
     checkNaAndRange(padjCutoff,     min=0, max=1,   scalar=TRUE,   na.ok=TRUE)
     checkNaAndRange(deltaPsiCutoff, min=0, max=1,   scalar=TRUE,   na.ok=TRUE)
@@ -1133,13 +1045,14 @@ aberrant.FRASER <- function(object, type=fitMetrics(object),
         # check if padj values are available for the given filters
         pvalsAvailable <- checkPadjAvailableForFilters(object, type=type,
                                                 filters=list(rho=rhoCutoff), 
-                                                aggregate=aggregate)
+                                                aggregate=aggregate,
+                                                subsetName=subsetName)
         if(isFALSE(pvalsAvailable)){
             stop("For the given filters, pvalues are not yet computed. \n", 
                     "Please compute them first by running the ",
                     "calculatePadjValues function with the requested filters.")
         }
-        padj <- padjVals(object, type=type, level="site", 
+        padj <- padjVals(object, type=type, level="site", subsetName=subsetName,
                         filters=list(rho=rhoCutoff))
     }
     if("dPsi" %in% names(dots)){
@@ -1158,60 +1071,23 @@ aberrant.FRASER <- function(object, type=fitMetrics(object),
             padj_gene <- dots[['padjGeneVals']]
         } else{
             padj_gene <- padjVals(object, type=type, level="gene", 
+                                subsetName=subsetName,
                                 filters=list(rho=rhoCutoff))    
         }
         
     }
     
-    if(is.na(padjCutoff) || (isTRUE(fullSubset) && 
-            (!is.null(geneSubset) || ("FDR_subset" %in% names(dots))))){
+    if(is.na(padjCutoff)){
         padjCutoff <- 1
     }
     
-    if(is.null(geneSubset) && !("FDR_subset" %in% names(dots))){
-        aberrantEvents <- as.matrix(padj) <= padjCutoff
+    if(isTRUE(all)){
+        aberrantEvents <- matrix(TRUE, nrow=nrow(object), ncol=ncol(object))
+        colnames(aberrantEvents) <- colnames(fds)
     } else{
-        if("FDR_subset" %in% names(dots)){
-            fdr_subset <- dots[["FDR_subset"]]
-        } else{
-            if(is.null(subsetName)){
-                subsetName <- "subset"
-            }
-            object <- calculatePadjValuesOnSubset(fds=object, type=type, 
-                                                  genesToTest=geneSubset, 
-                                                  subsetName=subsetName, 
-                                                  geneColumn=geneColumn)
-            fdr_subset <- metadata(object)[[paste("FDR", subsetName, type, 
-                                                    sep="_")]]
-        }
-        
+        aberrantEvents <- as.matrix(padj) <= padjCutoff
         
-        # define aberrant status based on whether intron/sample tuples are 
-        # part of the given subset
-        if(is.null(dim(padj))){
-            # only one sample in fds present -> no dimensions set
-            aberrantEvents <- matrix(FALSE, nrow=length(padj), ncol=1)
-        } else{
-            aberrantEvents <- matrix(FALSE, nrow=nrow(padj), ncol=ncol(padj))
-        }
-        colnames(aberrantEvents) <- colnames(object)
-        FDR_col <- ifelse(isTRUE(aggregate), "FDR_subset_gene", "FDR_subset")
-        subset_idx <- lapply(fdr_subset[, unique(sampleID)], function(sid){
-            col <- which(colnames(object) == sid)
-            rows <- fdr_subset[sampleID == sid & get(FDR_col) <= padjCutoff, 
-                               sort(unique(jidx))]
-            sub_idx <- matrix(c(rows, rep(col, length(rows))), 
-                              nrow=length(rows))
-            return(sub_idx)
-        })
-        subset_idx <- do.call(rbind, 
-                            subset_idx[which(sapply(subset_idx, nrow) > 0)])
-        aberrantEvents[subset_idx] <- TRUE
-    }
-    
-    # check each cutoff if in use (not NA)
-    if((is.null(geneSubset) && !("FDR_subset" %in% names(dots))) || 
-            isFALSE(fullSubset)){
+        # check each cutoff if in use (not NA)
         if(!is.na(minCount)){
             aberrantEvents <- aberrantEvents & as.matrix(n >= minCount)
         }
@@ -1242,7 +1118,7 @@ aberrant.FRASER <- function(object, type=fitMetrics(object),
         aberrantEvents <- as.matrix(ab_dt[,-1])
         rownames(aberrantEvents) <- ab_dt[,geneID]
         
-        if(is.null(geneSubset) && !("FDR_subset" %in% names(dots))){
+        if(isFALSE(all)){
             aberrantEvents <- aberrantEvents & as.matrix(
                     padj_gene[rownames(aberrantEvents),colnames(aberrantEvents)]
                 ) <= padjCutoff
diff --git a/R/Fraser-pipeline.R b/R/Fraser-pipeline.R
index 10f22c3a..92d0b492 100644
--- a/R/Fraser-pipeline.R
+++ b/R/Fraser-pipeline.R
@@ -72,11 +72,21 @@
 #' fds <- calculateZscore(fds, type="jaccard")
 #' head(zScores(fds, type="jaccard")) 
 #' 
-#' # To calculate the FDR only on a subset of genes of interest (per sample):
-#' geneList <- list("sample1"=c("TIMMDC1"), "sample2"=c("MCOLN1"))
-#' fds <- calculatePadjValuesOnSubset(fds, genesToTest=geneList, 
+#' # example of restricting FDR correction to subsets of genes of interest
+#' genesOfInterest <- list("sample1"=c("TIMMDC1"), "sample2"=c("MCOLN1"))
+#' fds <- calculatePadjValues(fds, type="jaccard", 
+#'                  subsets=list("exampleSubset"=genesOfInterest))
+#' padjVals(fds, type="jaccard", subsetName="exampleSubset")
+#' padjVals(fds, type="jaccard", level="gene", subsetName="exampleSubset")
+#' fds <- calculatePadjValues(fds, type="jaccard", 
+#'                  subsets=list("anotherExampleSubset"=c("TIMMDC1")))
+#' padjVals(fds, type="jaccard", subsetName="anotherExampleSubset")
+#' 
+#' # only adding FDR corrected pvalues on a subset without calculating 
+#' # transcriptome-wide FDR again:
+#' fds <- calculatePadjValuesOnSubset(fds, genesToTest=genesOfInterest, 
 #'          subsetName="setOfInterest", type="jaccard")
-#' metadata(fds)[["FDR_setOfInterest"]]
+#' padjVals(fds, type="jaccard", subsetName="setOfInterest")
 #' 
 #' @seealso \code{\link[FRASER]{fit}}
 #' 
diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R
index 0a03576f..00fb0055 100644
--- a/R/getNSetterFuns.R
+++ b/R/getNSetterFuns.R
@@ -327,11 +327,14 @@ pVals <- function(fds, type=currentType(fds), level="site",
 #' @describeIn getter_setter_functions This returns the adjusted p-values.
 #' @export
 padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"), 
-                    level="site", filters=list(), ...){
+                    level="site", subsetName=NULL, filters=list(), ...){
     level <- match.arg(level, choices=c("site", "gene"))
     dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal"))
     aname <- paste0("padj", dist)
     aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname)
+    if(!is.null(subsetName)){
+        aname <- paste0(aname, "_", subsetName)
+    }
     # add information on used filters
     if(is.null(names(filters))){
         filters <- list(rho=1)
@@ -358,11 +361,15 @@ padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"),
 }
 
 `padjVals<-` <- function(fds, type=currentType(fds), level="site",
-                    dist="BetaBinomial", filters=list(), ..., value){
+                    dist="BetaBinomial", subsetName=NULL, filters=list(), ..., 
+                    value){
     level <- match.arg(level, choices=c("site", "gene"))
     dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal"))
     aname <- paste0("padj", dist)
     aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname)
+    if(!is.null(subsetName)){
+        aname <- paste0(aname, "_", subsetName)
+    }
     # add information on used filters
     for(n in sort(names(filters))){
         aname <- paste0(aname, "_", n, filters[[n]])
@@ -378,6 +385,32 @@ padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"),
     return(fds)
 }
 
+#' @describeIn getter_setter_functions This returns the names of FDR subsets 
+#'     for which adjusted p values have been calculated.
+#' @export
+availableFDRsubsets <- function(fds){
+    ans <- metadata(fds)[["FDRsubsets"]]
+    return(ans)
+}
+
+`availableFDRsubsets<-` <- function(fds, value){
+    metadata(fds)[["FDRsubsets"]] <- value
+    return(fds)
+}
+
+`addToAvailableFDRsubsets<-` <- function(fds, value){
+    if(!isScalarCharacter(value)){
+        stop("The assigned value needs to be a scalar character.")
+    }
+    ans <- metadata(fds)[["FDRsubsets"]]
+    if(is.null(ans)){
+        metadata(fds)[["FDRsubsets"]] <- value
+    } else{
+        metadata(fds)[["FDRsubsets"]] <- unique(c(ans, value))
+    }
+    return(fds)
+}
+
 #' @describeIn getter_setter_functions This returns the fitted mu (i.e. psi) 
 #' values.
 #' @export
diff --git a/R/helper-functions.R b/R/helper-functions.R
index ae0dba66..03d8de3b 100644
--- a/R/helper-functions.R
+++ b/R/helper-functions.R
@@ -556,10 +556,13 @@ getStrandString <- function(fds){
 #' Check if adjusted pvalues have been computed for a given set of filters.
 #' @noRd
 checkPadjAvailableForFilters <- function(fds, type=currentType(fds), 
-                    filters=list(), dist="BetaBinomial", aggregate=FALSE){
+                    filters=list(), dist="BetaBinomial", aggregate=FALSE,
+                    subsetName=NULL){
     dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal"))
     aname <- paste0("padj", dist)
     aname <- ifelse(isTRUE(aggregate), paste0(aname, "_gene"), aname)
+    aname <- ifelse(!is.null(subsetName), paste0(aname, "_", subsetName), aname)
+
     # add information on used filters
     for(n in sort(names(filters))){
         aname_new <- paste0(aname, "_", n, filters[[n]])
diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R
index 94c9c151..2e403173 100644
--- a/R/pvalsNzscore.R
+++ b/R/pvalsNzscore.R
@@ -227,11 +227,18 @@ singlePvalueBinomial <- function(idx, k, n, mu){
 #'     should be calculated. Defaults to TRUE.
 #' @param geneColumn The column name of the column that has the gene annotation 
 #'              that will be used for gene-level pvalue computation.
+#' @param subsets A named list of named lists specifying any number of gene 
+#'             subsets (can differ per sample). For each subset, FDR correction
+#'             will be limited to genes in the subset, and the FDR corrected 
+#'             pvalues stored as an assay in the fds object in addition to the 
+#'             transcriptome-wide FDR corrected pvalues. See the examples for 
+#'             how to use this argument.
 #' 
 #' @export
 calculatePadjValues <- function(fds, type=currentType(fds), method="BY",
                                 rhoCutoff=NA, geneLevel=TRUE, 
-                                geneColumn="hgnc_symbol", BPPARAM=bpparam()){
+                                geneColumn="hgnc_symbol", subsets=NULL, 
+                                BPPARAM=bpparam()){
     currentType(fds) <- type
     index <- getSiteIndex(fds, type=type)
     idx   <- !duplicated(index)
@@ -284,6 +291,21 @@ calculatePadjValues <- function(fds, type=currentType(fds), method="BY",
                     "Please annotate gene symbols \nfirst using the ", 
                     "annotateRanges function.")
         }
+        
+        # calculate FDR for each provided subset and assign to fds
+        if(!is.null(subsets)){
+            stopifnot(is.list(subsets))
+            stopifnot(!is.null(names(subsets)))
+            for(setName in names(subsets)){
+                geneListSubset <- subsets[[setName]]
+                fds <- calculatePadjValuesOnSubset(fds=fds, 
+                                                   genesToTest=geneListSubset,
+                                                   subsetName=setName,
+                                                   type=type, method=method, 
+                                                   geneColumn=geneColumn, 
+                                                   BPPARAM=BPPARAM)
+            }
+        }
     }
     
     return(fds)
@@ -360,6 +382,13 @@ getSiteIndex <- function(fds, type=currentType(fds)){
 
 getGeneIDs <- function(fds, type=currentType(fds), unique=TRUE, 
                         geneColumn="hgnc_symbol"){
+    if(!geneColumn %in% colnames(mcols(fds, type=type))){
+        stop("Did not find column '", geneColumn, "' in mcols(fds, type='", 
+             type, "'). Please assign introns\nto genes first using the ",
+             "annotateRanges(fds, ...) or annotateRangesWithTxDb(fds, ...) ",
+             "function.")
+    }
+    
     geneIDs <- mcols(fds, type=type)[[geneColumn]]
     if(isTRUE(unique)){
         geneIDs <- unique(unlist(lapply(geneIDs, FUN=function(g){
@@ -398,91 +427,129 @@ genePvalsByGeneID <- function(dt, samples, geneIDs, method, BPPARAM){
 #'     will be stored in metadata(fds).
 #' 
 #' @export
-calculatePadjValuesOnSubset <- function(fds, genesToTest, type=currentType(fds), 
-                                subsetName="subset", method="BY", 
+calculatePadjValuesOnSubset <- function(fds, genesToTest, subsetName, 
+                                type=currentType(fds), method='BY', 
                                 geneColumn="hgnc_symbol", BPPARAM=bpparam()){
     
     # check input
-    currentType(fds) <- type
     stopifnot(!is.null(genesToTest))
-    stopifnot(is.list(genesToTest))
+    stopifnot(is.list(genesToTest) || is.vector(genesToTest))
+    
+    # replicate subset genes for each sample if given as vector
+    if(!is.list(genesToTest)){
+        genesToTest <- rep(list(genesToTest), ncol(fds))
+        names(genesToTest) <- colnames(fds)
+    }
+    
+    # check that names are present and correspond to samples in ods
     stopifnot(!is.null(names(genesToTest)))
-    if(!all(names(genesToTest) %in% samples(fds))){
+    if(!all(names(genesToTest) %in% colnames(fds))){
         stop("names(genesToTest) need to be sampleIDs in the given fds object.")
     }
     
-    # check if genes have been annotated
-    if(!geneColumn %in% colnames(mcols(fds, type=type))){    
-        stop(paste0("'", geneColumn, "' is not found in mcols(fds). ", 
-            "Please annotate gene symbols \nfirst using the ", 
-            "annotateRanges or annotateRangesWithTxDb function."))
-    }
+    # get genes from fds object
+    fds_genes <- getGeneIDs(fds, unique=TRUE, type=type, geneColumn=geneColumn)
+    ngenes <- length(fds_genes)
     
     # site index (for psi3/5)
     site_idx <- getSiteIndex(fds, type=type)
-        
+    
     # compute FDR on the given subsets of genes
     message(date(), ": starting FDR calculation on subset of genes...")
-    FDR_subset <- rbindlist(bpmapply(names(genesToTest), genesToTest, 
-            FUN=function(sample_id, genes_to_test_sample){
+    
+    # compute FDR on the given subsets of genes
+    fdrSubset <- bplapply(colnames(fds), FUN=function(sampleId){
         
-        # message(date(), ": FDR subset calculation for sample = ", sample_id)
-        # get idx of junctions corresponding to genes with var
-        jidx <- unlist(lapply(genes_to_test_sample, function(gene){
-            idx <- which(grepl(paste0("(^|;)", gene, "(;|$)"), 
-                                mcols(fds, type=type)[, geneColumn]))
-            names(idx) <- rep(gene, length(idx))
-            if(length(idx) == 0 && verbose(fds) > 0){
-                warning("No introns found in fds object for gene: ", gene, 
-                        " and sample: ", sample_id, ". Skipping this gene.")
-            }
-            return(idx)
-        }))
-        jidx <- sort(jidx[!duplicated(jidx)])
+        # get genes to test for this sample
+        genesToTestSample <- genesToTest[[sampleId]]
+        padj <- rep(NA, nrow(fds))
+        padj_gene <- rep(NA, ngenes)
+        
+        # if no genes present in the subset for this sample, return NAs
+        if(is.null(genesToTestSample)){
+            return(list(padj=padj, padj_gene=padj_gene))
+        }
         
-        # check that jidx is not empty vector
-        if(length(jidx) == 0){
-            warning("No introns found in the fds object for the given gene ", 
-                    "subset for sample: ", sample_id)
-            return(data.table(gene=character(0), 
-                              sampleID=character(0), 
-                              type=character(0),
-                              pval=numeric(0),
-                              FDR_subset=numeric(0), 
-                              jidx=integer(0),
-                              pval_gene=numeric(0),
-                              FDR_subset_gene=numeric(0)))
+        # get idx of junctions corresponding to genes to test
+        if(is.character(genesToTestSample)){
+            rowIdx <- sort(which(fds_genes %in% genesToTestSample))
+            rowIdx <- unlist(lapply(genesToTestSample, function(gene){
+                idx <- which(grepl(paste0("(^|;)", gene, "(;|$)"), 
+                                   mcols(fds, type=type)[, geneColumn]))
+                names(idx) <- rep(gene, length(idx))
+                if(length(idx) == 0 && verbose(fds) > 0){
+                    warning("No introns found in fds object for gene: ", gene, 
+                        " and sample: ", sampleId, ". Skipping this gene.")
+                }
+                return(idx)
+            }))
+            rowIdx <- sort(rowIdx[!duplicated(rowIdx)])
+        } else{
+            stop("Genes in the list to test must be a character vector ",
+                 "of geneIDs.")
         }
         
-        # retrieve pvalues of junctions
+        # check that rowIdx is not empty vector
+        if(length(rowIdx) == 0){
+            warning("No genes from the given subset found in the fds ",
+                    "object for sample: ", sampleId)
+            return(list(padj=padj, padj_gene=padj_gene))
+        }
+        
+       
+        
+        # retrieve pvalues of introns to test
         p <- as.matrix(pVals(fds, type=type))
         if(ncol(p) == 1){
             colnames(p) <- colnames(fds)    
         }
-        p <- p[jidx, sample_id]
+        p <- p[rowIdx, sampleId]
         
-        # FDR correction
-        pa <- p.adjust(p[!duplicated(site_idx[jidx])], method=method)
+        # FDR correction on subset
+        non_dup_site_idx <- !duplicated(site_idx[rowIdx])
+        padjSub <- p.adjust(p[non_dup_site_idx], method=method)
+        
+        # set intron FDR on subset (filled with NA for all other genes)
+        padj[rowIdx] <- padjSub
         
         # gene level pvals
-        dt <- data.table(sampleID=sample_id, type=type, pval=p, 
-                         gene=names(jidx), jidx=jidx, site_idx=site_idx[jidx])
-        dt <- merge(dt, data.table(site_idx=site_idx[jidx][!duplicated(site_idx[jidx])], FDR_subset=pa), by="site_idx")
-        dt[!duplicated(dt$site_idx), pval_gene:=min(p.adjust(pval, method="holm")), by="gene"]
+        dt <- data.table(sampleID=sampleId, type=type, pval=p, 
+                    gene=names(rowIdx), jidx=rowIdx, site_idx=site_idx[rowIdx])
+        dt <- merge(dt, 
+                    data.table(site_idx=site_idx[rowIdx][non_dup_site_idx], 
+                                FDR_subset=padjSub), 
+                    by="site_idx")
+        dt[!duplicated(dt$site_idx), 
+            pval_gene:=min(p.adjust(pval, method="holm")), by="gene"]
         dt[, pval_gene := .SD[!is.na(pval_gene), unique(pval_gene)], by="gene"]
         
         # gene level FDR
         dt2 <- dt[, unique(pval_gene), by="gene"]
         dt2[, FDR_subset_gene := p.adjust(V1, method=method)]
-        dt <- merge(dt, dt2[, .(gene, FDR_subset_gene)], by="gene", all.x=TRUE)
+        dt2[, gene_rowIdx := sapply(gene, function(g) which(fds_genes == g))]
+        
+        # set intron FDR on subset (filled with NA for all other genes)
+        padj_gene[dt2[,gene_rowIdx]] <- dt2[, FDR_subset_gene]
         
         # return new FDR
-        return(dt)
-    }, SIMPLIFY=FALSE, BPPARAM=BPPARAM))
-    message(date(), ": finished FDR calculation on subset of genes.")
+        return(list(padj=padj, padj_gene=padj_gene))
+        
+    }, BPPARAM=BPPARAM)
+    padjSub <- vapply(fdrSubset, '[[', double(nrow(fds)), 'padj')
+    padjSub_gene <- vapply(fdrSubset, '[[', double(ngenes), 'padj_gene')
     
-    # add FDR subset info to fds object and return
-    metadata(fds)[[paste("FDR", subsetName, type, sep="_")]] <- FDR_subset
+    rownames(padjSub) <- rownames(fds)
+    colnames(padjSub) <- colnames(fds)
+    rownames(padjSub_gene) <- fds_genes
+    colnames(padjSub_gene) <- colnames(fds)
+    
+    # add FDR subset info to ods object and return
+    padjVals(fds, type=type, level="site", subsetName=subsetName) <- padjSub
+    padjVals(fds, type=type, level="gene", subsetName=subsetName) <- padjSub_gene
+    addToAvailableFDRsubsets(fds) <- subsetName
+    
+    message(date(), ": finished FDR calculation on subset of genes.")
+    validObject(fds)
     return(fds)
 }
 
diff --git a/man/FRASER.Rd b/man/FRASER.Rd
index 13210477..c4848052 100644
--- a/man/FRASER.Rd
+++ b/man/FRASER.Rd
@@ -37,14 +37,15 @@ calculatePadjValues(
   rhoCutoff = NA,
   geneLevel = TRUE,
   geneColumn = "hgnc_symbol",
+  subsets = NULL,
   BPPARAM = bpparam()
 )
 
 calculatePadjValuesOnSubset(
   fds,
   genesToTest,
+  subsetName,
   type = currentType(fds),
-  subsetName = "subset",
   method = "BY",
   geneColumn = "hgnc_symbol",
   BPPARAM = bpparam()
@@ -95,6 +96,13 @@ should be calculated. Defaults to TRUE.}
 \item{geneColumn}{The column name of the column that has the gene annotation 
 that will be used for gene-level pvalue computation.}
 
+\item{subsets}{A named list of named lists specifying any number of gene 
+subsets (can differ per sample). For each subset, FDR correction
+will be limited to genes in the subset, and the FDR corrected 
+pvalues stored as an assay in the fds object in addition to the 
+transcriptome-wide FDR corrected pvalues. See the examples for 
+how to use this argument.}
+
 \item{genesToTest}{A named list with the subset of genes to test per sample.
 The names must correspond to the sampleIDs in the given fds object.}
 
@@ -131,33 +139,33 @@ raw counts ("BB").
 }
 \section{Functions}{
 \itemize{
-\item \code{FRASER()}: This function runs the default FRASER pipeline combining 
+\item \code{FRASER}: This function runs the default FRASER pipeline combining 
 the beta-binomial fit, the computation of Z scores and p values as well as 
 the computation of delta-PSI values.
 
-\item \code{calculateZscore()}: This function calculates z-scores based on the 
+\item \code{calculateZscore}: This function calculates z-scores based on the 
 observed and expected logit 
 psi.
 
-\item \code{calculatePvalues()}: This function calculates two-sided p-values based on 
+\item \code{calculatePvalues}: This function calculates two-sided p-values based on 
 the beta-binomial distribution (or binomial or normal if desired). The 
 returned p values are not yet adjusted with Holm's method per donor or 
 acceptor site, respectively.
 
-\item \code{calculatePadjValues()}: This function adjusts the previously calculated 
+\item \code{calculatePadjValues}: This function adjusts the previously calculated 
 p-values per sample for multiple testing. First, the previoulsy calculated 
 junction-level p values are adjusted with Holm's method per donor or 
 acceptor site, respectively. Then, if gene symbols have been annotated to 
 junctions (and not otherwise requested), gene-level p values are computed.
 
-\item \code{calculatePadjValuesOnSubset()}: This function does FDR correction only for all junctions 
+\item \code{calculatePadjValuesOnSubset}: This function does FDR correction only for all junctions 
 in a certain subset of genes which can differ per sample. Requires gene 
 symbols to have been annotated to junctions. As with the full FDR 
 correction across all junctions, first the previously calculated 
 junction-level p values are adjusted with Holm's method per donor or 
 acceptor site, respectively. Then, gene-level p values are computed.
-
 }}
+
 \examples{
 # set default parallel backend
 register(SerialParam())
@@ -190,11 +198,21 @@ head(padjVals(fds, type="jaccard"))
 fds <- calculateZscore(fds, type="jaccard")
 head(zScores(fds, type="jaccard")) 
 
-# To calculate the FDR only on a subset of genes of interest (per sample):
-geneList <- list("sample1"=c("TIMMDC1"), "sample2"=c("MCOLN1"))
-fds <- calculatePadjValuesOnSubset(fds, genesToTest=geneList, 
+# example of restricting FDR correction to subsets of genes of interest
+genesOfInterest <- list("sample1"=c("TIMMDC1"), "sample2"=c("MCOLN1"))
+fds <- calculatePadjValues(fds, type="jaccard", 
+                 subsets=list("exampleSubset"=genesOfInterest))
+padjVals(fds, type="jaccard", subsetName="exampleSubset")
+padjVals(fds, type="jaccard", level="gene", subsetName="exampleSubset")
+fds <- calculatePadjValues(fds, type="jaccard", 
+                 subsets=list("anotherExampleSubset"=c("TIMMDC1")))
+padjVals(fds, type="jaccard", subsetName="anotherExampleSubset")
+
+# only adding FDR corrected pvalues on a subset without calculating 
+# transcriptome-wide FDR again:
+fds <- calculatePadjValuesOnSubset(fds, genesToTest=genesOfInterest, 
          subsetName="setOfInterest", type="jaccard")
-metadata(fds)[["FDR_setOfInterest"]]
+padjVals(fds, type="jaccard", subsetName="setOfInterest")
 
 }
 \seealso{
diff --git a/man/countRNA.Rd b/man/countRNA.Rd
index 440231fe..7940b91b 100644
--- a/man/countRNA.Rd
+++ b/man/countRNA.Rd
@@ -245,32 +245,32 @@ a sample are set to zero.
 }
 \section{Functions}{
 \itemize{
-\item \code{countRNAData()}: This method extracts and counts the split reads and
+\item \code{countRNAData}: This method extracts and counts the split reads and
 non spliced reads from RNA bam files.
 
-\item \code{getSplitReadCountsForAllSamples()}: This method creates a GRanges 
+\item \code{getSplitReadCountsForAllSamples}: This method creates a GRanges 
 object containing the split read counts from all 
 specified samples.
 
-\item \code{getNonSplitReadCountsForAllSamples()}: This method creates a GRanges 
+\item \code{getNonSplitReadCountsForAllSamples}: This method creates a GRanges 
 object containing the non split read counts at the 
 exon-intron boundaries inferred from the GRanges object 
 containing the positions of all the introns in this dataset.
 
-\item \code{addCountsToFraserDataSet()}: This method adds the split read and 
+\item \code{addCountsToFraserDataSet}: This method adds the split read and 
 non split read counts to a existing FraserDataSet 
 containing the settings.
 
-\item \code{countSplitReads()}: This method counts all split reads in a 
+\item \code{countSplitReads}: This method counts all split reads in a 
 bam file for a single sample.
 
-\item \code{mergeCounts()}: This method merges counts for multiple 
+\item \code{mergeCounts}: This method merges counts for multiple 
 samples into one SummarizedExperiment object.
 
-\item \code{countNonSplicedReads()}: This method counts non spliced reads based 
+\item \code{countNonSplicedReads}: This method counts non spliced reads based 
 on the given target (acceptor/donor) regions for a single sample.
-
 }}
+
 \examples{
    # On Windows SNOW is the default for the parallele backend, which can be 
    # very slow for many but small tasks. Therefore, we will use 
diff --git a/man/filtering.Rd b/man/filtering.Rd
index 226df199..b99936dc 100644
--- a/man/filtering.Rd
+++ b/man/filtering.Rd
@@ -84,16 +84,16 @@ reliably detected and to remove introns with no variablity between samples.
 }
 \section{Functions}{
 \itemize{
-\item \code{filterExpressionAndVariability()}: This functions filters out both introns with low 
+\item \code{filterExpressionAndVariability}: This functions filters out both introns with low 
 read support and introns that are not variable across samples.
 
-\item \code{filterExpression(FraserDataSet)}: This function filters out introns and corresponding 
+\item \code{filterExpression,FraserDataSet-method}: This function filters out introns and corresponding 
 splice sites that have low read support in all samples.
 
-\item \code{filterVariability(FraserDataSet)}: This function filters out introns and corresponding 
+\item \code{filterVariability,FraserDataSet-method}: This function filters out introns and corresponding 
 splice sites that have low read support in all samples.
-
 }}
+
 \examples{
 fds <- createTestFraserDataSet()
 fds <- filterExpressionAndVariability(fds, minDeltaPsi=0.1, filter=FALSE)
diff --git a/man/getter_setter_functions.Rd b/man/getter_setter_functions.Rd
index 3f25c8b6..9b10f832 100644
--- a/man/getter_setter_functions.Rd
+++ b/man/getter_setter_functions.Rd
@@ -15,6 +15,7 @@
 \alias{zScores}
 \alias{pVals}
 \alias{padjVals}
+\alias{availableFDRsubsets}
 \alias{predictedMeans}
 \alias{deltaPsiValue}
 \alias{currentType}
@@ -51,10 +52,13 @@ padjVals(
   type = currentType(fds),
   dist = c("BetaBinomial"),
   level = "site",
+  subsetName = NULL,
   filters = list(),
   ...
 )
 
+availableFDRsubsets(fds)
+
 predictedMeans(fds, type = currentType(fds))
 
 deltaPsiValue(fds, type = currentType(fds))
@@ -115,70 +119,73 @@ the values within the FRASER model.
 }
 \section{Functions}{
 \itemize{
-\item \code{featureExclusionMask()}: Retrieves a logical vector indicating
+\item \code{featureExclusionMask}: Retrieves a logical vector indicating
 for each junction whether it is included or excluded during the fitting
 procedure.
 
-\item \code{featureExclusionMask(fds, type = currentType(fds)) <- value}: To remove certain junctions from 
+\item \code{featureExclusionMask<-}: To remove certain junctions from 
 being used in the train step of the encoding dimension we can set the 
 \code{featureExclusion} vector to \code{FALSE}. This can be helpfull 
 if we have local linkage between features which we do not want to 
 model by the autoencoder.
 
-\item \code{rho()}: Returns the fitted rho values for the 
+\item \code{rho}: Returns the fitted rho values for the 
 beta-binomial distribution
 
-\item \code{zScores()}: This returns the calculated z-scores.
+\item \code{zScores}: This returns the calculated z-scores.
+
+\item \code{pVals}: This returns the calculated p-values.
 
-\item \code{pVals()}: This returns the calculated p-values.
+\item \code{padjVals}: This returns the adjusted p-values.
 
-\item \code{padjVals()}: This returns the adjusted p-values.
+\item \code{availableFDRsubsets}: This returns the names of FDR subsets 
+for which adjusted p values have been calculated.
 
-\item \code{predictedMeans()}: This returns the fitted mu (i.e. psi) 
+\item \code{predictedMeans}: This returns the fitted mu (i.e. psi) 
 values.
 
-\item \code{deltaPsiValue()}: Returns the difference between the 
+\item \code{deltaPsiValue}: Returns the difference between the 
 observed and the fitted psi values.
 
-\item \code{currentType()}: Returns the psi type that is used 
+\item \code{currentType}: Returns the psi type that is used 
 within several methods in the FRASER package (defaults to jaccard).
 
-\item \code{currentType(fds) <- value}: Sets the psi type that is to be used 
+\item \code{currentType<-}: Sets the psi type that is to be used 
 within several methods in the FRASER package.
 
-\item \code{fitMetrics()}: Returns the splice metrics that will be
+\item \code{fitMetrics}: Returns the splice metrics that will be
 fitted (defaults to jaccard, used within several methods in the 
 FRASER package).
 
-\item \code{fitMetrics(fds) <- value}: Sets the splice metrics that will be 
+\item \code{fitMetrics<-}: Sets the splice metrics that will be 
 fitted (used within several methods in the FRASER package).
 
-\item \code{pseudocount()}: Sets and returns the pseudo count used 
+\item \code{pseudocount}: Sets and returns the pseudo count used 
 within the FRASER fitting procedure.
 
-\item \code{hyperParams()}: This returns the results of the 
+\item \code{hyperParams}: This returns the results of the 
 hyperparameter optimization NULL if the hyperparameter 
 opimization was not run yet.
 
-\item \code{bestQ()}: This returns the optimal size of the 
+\item \code{bestQ}: This returns the optimal size of the 
 latent space according to the hyperparameter optimization or a simple 
 estimate of about a tenth of the number of samples if the hyperparameter 
 opimization was not run yet.
 
-\item \code{dontWriteHDF5()}: Gets the current value of whether the 
+\item \code{dontWriteHDF5}: Gets the current value of whether the 
 assays should be stored as hdf5 files.
 
-\item \code{dontWriteHDF5(fds) <- value}: Sets whether the assays should be stored 
+\item \code{dontWriteHDF5<-}: Sets whether the assays should be stored 
 as hdf5 files.
 
-\item \code{verbose()}: Dependent on the level of verbosity 
+\item \code{verbose}: Dependent on the level of verbosity 
 the algorithm reports more or less to the user. 0 means being quiet 
 and 10 means everything.
 
-\item \code{verbose(fds) <- value}: Sets the verbosity level to a value 
+\item \code{verbose<-}: Sets the verbosity level to a value 
 between 0 and 10. 0 means being quiet and 10 means reporting everything.
-
 }}
+
 \examples{
 fds <- createTestFraserDataSet()
 
diff --git a/man/potentialImpactAnnotations.Rd b/man/potentialImpactAnnotations.Rd
index a06e304a..282a6d09 100644
--- a/man/potentialImpactAnnotations.Rd
+++ b/man/potentialImpactAnnotations.Rd
@@ -93,20 +93,20 @@ These functions work on the result table and add additional
 }
 \section{Functions}{
 \itemize{
-\item \code{annotateIntronReferenceOverlap()}: This method calculates basic annotations 
+\item \code{annotateIntronReferenceOverlap}: This method calculates basic annotations 
 based on overlap with the reference annotation (start, end, none, both) 
 for the full fds. The overlap type is added as a new column 
 \code{annotatedJunction} in \code{mcols(fds)}.
 
-\item \code{annotatePotentialImpact()}: This method annotates the splice event 
+\item \code{annotatePotentialImpact}: This method annotates the splice event 
 type to junctions in the given results table.
 
-\item \code{flagBlacklistRegions()}: This method flags all introns and 
+\item \code{flagBlacklistRegions}: This method flags all introns and 
 splice sites in the given results table for which at least one splice 
 site (donor or acceptor) is located in a blacklist region. Blacklist 
 regions of the genome are determined from the provided BED file.
-
 }}
+
 \examples{
   # get data, fit and compute p-values and z-scores
   fds <- createTestFraserDataSet()
diff --git a/man/results.Rd b/man/results.Rd
index 03b67b36..ceeb9fc8 100644
--- a/man/results.Rd
+++ b/man/results.Rd
@@ -16,8 +16,7 @@
   minCount = 5,
   psiType = psiTypes,
   geneColumn = "hgnc_symbol",
-  subsets = NULL,
-  fullSubset = FALSE,
+  all = FALSE,
   returnTranscriptomewideResults = TRUE,
   additionalColumns = NULL,
   BPPARAM = bpparam()
@@ -33,9 +32,8 @@
   by = c("none", "sample", "feature"),
   aggregate = FALSE,
   geneColumn = "hgnc_symbol",
-  geneSubset = NULL,
   subsetName = NULL,
-  fullSubset = FALSE,
+  all = FALSE,
   ...
 )
 }
@@ -69,17 +67,15 @@ result}
 \item{geneColumn}{The column name of the column that has the gene annotation 
 that will be used for gene-level pvalue computation.}
 
-\item{subsets}{A named list of named lists specifying any number of gene 
-subsets (can differ per sample). For each subset, FDR correction
-will be limited to introns in genes in the subset, and aberrant 
-events passing the FDR cutoff will be reported for each subset 
-separately. See the examples for how to use this argument.}
+\item{all}{By default FALSE, only significant introns (or genes) are listed 
+in the results. If TRUE, results are assembled for all 
+samples and introns/genes regardless of significance.}
 
-\item{fullSubset}{Only applies when \code{geneSubset} is not NULL. Specifies 
-whether all introns in given subset of genes should be 
-considered as aberrant, or only those passing the given cutoffs. 
-Defaults to FALSE (introns have to pass the cutoffs in addtion 
-to being in the gene subset to be considered aberrant).}
+\item{returnTranscriptomewideResults}{If FDR corrected pvalues for subsets 
+of genes of interest have been calculated, this parameter 
+indicates whether additionally the transcriptome-wide results 
+should be returned as well (default), or whether only results 
+for those subsets should be retrieved.}
 
 \item{additionalColumns}{Character vector containing the names of additional 
 columns from mcols(fds) that should appear in the result table 
@@ -94,13 +90,6 @@ are included.}
 \code{sample} or \code{feature} is specified the sum by 
 sample or feature is returned}
 
-\item{geneSubset}{A named list giving a subset of genes per sample to which
-FDR correction should be restricted. The names of the list must 
-correspond to the sampleIDs in the fds object.}
-
-\item{subsetName}{The name under which the resulting FDR corrected pvalues 
-on the subset only will be displayed in the result table.}
-
 \item{...}{Further arguments can be passed to the method. If "n",
 "padjVals", "dPsi" or "rhoVals" are given, the values of those 
 arguments are used to define the aberrant events.}
@@ -122,8 +111,8 @@ aberrant splicing events based on the given cutoffs.
 fds <- createTestFraserDataSet()
 
 # extract results: for this example dataset, no cutoffs are used to
-# get at least one result and show the output
-res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA)
+# show the output of the results function
+res <- results(fds, all=TRUE)
 res
 
 # aggregate the results by genes (gene symbols need to be annotated first 
@@ -149,11 +138,8 @@ aberrant(fds, type="jaccard")
 # retrieve results limiting FDR correction to only a subset of genes
 # first, we need to create a list of genes per sample that will be tested
 geneList <- list('sample1'=c("TIMMDC1"), 'sample2'=c("MCOLN1"))
-results(fds, subsets=list('random_subset'=geneList), fullSubset=TRUE)
+fds <- calculatePadjValues(fds, type="jaccard", 
+                 subsets=list("exampleSubset"=geneList))
+results(fds, all=TRUE, returnTranscriptomewideResults=FALSE)
 
-# results for several subsets can be retrieved at the same time:
-geneList2 <- list('sample1'=c("MCOLN1", "TIMMDC1"), 'sample2'=c("MCOLN1"))
-results(fds, 
-        subsets=list('random_subset'=geneList, 'another_subset'=geneList2), 
-        fullSubset=TRUE)
 }
diff --git a/tests/testthat/test_stats.R b/tests/testthat/test_stats.R
index 79f737d1..b1eedb8c 100644
--- a/tests/testthat/test_stats.R
+++ b/tests/testthat/test_stats.R
@@ -93,14 +93,18 @@ test_that("FDR on subset of genes", {
         "sample2" = c("geneB"),
         "sample3" = c("geneA", "geneB", "geneC", "geneD")
     )
-    expected_output_nrows <- (8 + 5 + 3) + (7) + (3+7+5+4)
     
     subsetName <- "subset_test"
     fds <- calculatePadjValuesOnSubset(fds, genesToTest=genes_per_sample, 
                                        subsetName=subsetName, type="jaccard")
-    subset_dt <- metadata(fds)[[paste("FDR", subsetName, "jaccard", sep="_")]]
-    expect_true(is(subset_dt, "data.table"))
-    expect_true(all(c("FDR_subset", "FDR_subset_gene") %in% colnames(subset_dt)))
-    expect_equal(subset_dt[, .N], expected_output_nrows)
+    subset_padj <- padjVals(fds, type="jaccard", subsetName=subsetName)
+    expect_true(is(subset_padj, "matrix"))
+    expect_true(nrow(subset_padj) == 27)
+    expect_true(ncol(subset_padj) == 3)
+    subset_padj_gene <- padjVals(fds, type="jaccard", level="gene", 
+                                    subsetName=subsetName)
+    expect_true(is(subset_padj_gene, "matrix"))
+    expect_true(nrow(subset_padj_gene) == 5)
+    expect_true(ncol(subset_padj_gene) == 3)
       
 })
diff --git a/vignettes/FRASER.Rnw b/vignettes/FRASER.Rnw
index 99df108d..3d54ac21 100644
--- a/vignettes/FRASER.Rnw
+++ b/vignettes/FRASER.Rnw
@@ -175,7 +175,7 @@ intron-exon boundary of acceptor A. While we calculate $\theta$ for the 5' and
 between $\theta_5$ and $\theta_3$ and hence call it jointly $\theta$ in the 
 following.
 
-From \fraser{}2 on, only a single metric - the Intron Jaccard Index (Figure 
+From \fraser{} 2.0 on, only a single metric - the Intron Jaccard Index (Figure 
 \ref{IntronJaccardIndex_sketch}) - is used by default. The Intron Jaccard 
 Index is more robust and allows to focus more on functionally relevant 
 aberrant splicing events. It allows to detect all types of aberrant splicing 
@@ -248,7 +248,7 @@ fds <- FRASER(fds, q=c(jaccard=2))
 # get results: we recommend to use an FDR cutoff 0.05, but due to the small
 # dataset size we extract all events and their associated values
 # eg: res <- results(fds, padjCutoff=0.05, deltaPsiCutoff=0.1)
-res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA)
+res <- results(fds, all=TRUE)
 res
 
 # result visualization
@@ -586,8 +586,8 @@ sashimi plot for an outlier in the results table directly in R (if paths to
 bam files are available in the \fds{} object).
 
 <<result_table, echo=TRUE>>=
-# to show result visualization functions for this tuturial, no cutoff used
-res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA)
+# to show result visualization functions for this tutorial, no cutoff used
+res <- results(fds, all=TRUE)
 res
 
 # for the gene level pvalues, gene symbols need to be annotated the fds object
@@ -595,7 +595,7 @@ res
 # as we previously called FRASER() before annotating genes, we run it again here
 fds <- calculatePadjValues(fds, type="jaccard", geneLevel=TRUE)
 # generate gene-level results table (if gene symbols have been annotated)
-res_gene <- results(fds, aggregate=TRUE, padjCutoff=NA, deltaPsiCutoff=NA)
+res_gene <- results(fds, aggregate=TRUE, all=TRUE)
 res_gene
 @
 
@@ -735,6 +735,19 @@ fds <- calculatePadjValues(fds, type="jaccard", method="BY")
 head(padjVals(fds,type="jaccard"))
 @
 
+With FRASER 2.0 we introduce the option to limit FDR correction to a subset of 
+genes based on prior knowledge, e.g. genes that contain a rare variant per 
+sample. To use this option, provide a list of genes per sample during FDR 
+computation:
+
+<<p-adj calculation, echo=TRUE>>=
+genesOfInterest <- list("sample1"=c("XAB2", "PNPLA6", "STXBP2", "ARHGEF18"), 
+                        "sample2"=c("ARHGEF18", "TRAPPC5"))
+fds <- calculatePadjValues(fds, type="jaccard", 
+              subsets=list("exampleSubset"=genesOfInterest))
+head(padjVals(fds, type="jaccard", subsetName="exampleSubset"))
+@
+
 \subsection{Result visualization}
 \label{sec:result-vis}
 

From 4e3e32305c53f1344570cee91982b9736487723e Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Wed, 29 Mar 2023 18:04:01 +0200
Subject: [PATCH 74/80] small fix

---
 R/pvalsNzscore.R | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R
index 2e403173..a4bf89be 100644
--- a/R/pvalsNzscore.R
+++ b/R/pvalsNzscore.R
@@ -544,8 +544,10 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, subsetName,
     colnames(padjSub_gene) <- colnames(fds)
     
     # add FDR subset info to ods object and return
-    padjVals(fds, type=type, level="site", subsetName=subsetName) <- padjSub
-    padjVals(fds, type=type, level="gene", subsetName=subsetName) <- padjSub_gene
+    padjVals(fds, type=type, level="site", subsetName=subsetName,
+             withDimnames=FALSE) <- padjSub
+    padjVals(fds, type=type, level="gene", subsetName=subsetName,
+             withDimnames=FALSE) <- padjSub_gene
     addToAvailableFDRsubsets(fds) <- subsetName
     
     message(date(), ": finished FDR calculation on subset of genes.")

From 89983e5e10d3bdf87dde9f81e42dca88af05e0aa Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Wed, 29 Mar 2023 18:47:12 +0200
Subject: [PATCH 75/80] fix fdr on subset for theta

---
 R/pvalsNzscore.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R
index a4bf89be..be256009 100644
--- a/R/pvalsNzscore.R
+++ b/R/pvalsNzscore.R
@@ -462,7 +462,7 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, subsetName,
         
         # get genes to test for this sample
         genesToTestSample <- genesToTest[[sampleId]]
-        padj <- rep(NA, nrow(fds))
+        padj <- rep(NA, nrow(mcols(fds, type=type)))
         padj_gene <- rep(NA, ngenes)
         
         # if no genes present in the subset for this sample, return NAs
@@ -535,10 +535,10 @@ calculatePadjValuesOnSubset <- function(fds, genesToTest, subsetName,
         return(list(padj=padj, padj_gene=padj_gene))
         
     }, BPPARAM=BPPARAM)
-    padjSub <- vapply(fdrSubset, '[[', double(nrow(fds)), 'padj')
+    padjSub <- vapply(fdrSubset, '[[', 
+                        double(nrow(mcols(fds, type=type))), 'padj')
     padjSub_gene <- vapply(fdrSubset, '[[', double(ngenes), 'padj_gene')
     
-    rownames(padjSub) <- rownames(fds)
     colnames(padjSub) <- colnames(fds)
     rownames(padjSub_gene) <- fds_genes
     colnames(padjSub_gene) <- colnames(fds)

From 54822b92805862b675d60a8721273773f9b0d223 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Wed, 29 Mar 2023 21:04:55 +0200
Subject: [PATCH 76/80] fix vignette error

---
 vignettes/FRASER.Rnw | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vignettes/FRASER.Rnw b/vignettes/FRASER.Rnw
index 3d54ac21..70e80f21 100644
--- a/vignettes/FRASER.Rnw
+++ b/vignettes/FRASER.Rnw
@@ -740,7 +740,7 @@ genes based on prior knowledge, e.g. genes that contain a rare variant per
 sample. To use this option, provide a list of genes per sample during FDR 
 computation:
 
-<<p-adj calculation, echo=TRUE>>=
+<<p-adj calculation on subset, echo=TRUE>>=
 genesOfInterest <- list("sample1"=c("XAB2", "PNPLA6", "STXBP2", "ARHGEF18"), 
                         "sample2"=c("ARHGEF18", "TRAPPC5"))
 fds <- calculatePadjValues(fds, type="jaccard", 

From 10deeab7240e2aed22d29f7d7e6cc434e417504d Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Thu, 30 Mar 2023 16:23:54 +0200
Subject: [PATCH 77/80] add FRASER 2.0 to readme

---
 README.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/README.md b/README.md
index 8e9443aa..5c15f95f 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,13 @@ Please cite our method paper if you use it in a publication:
 
 > Mertes, C., Scheller, I.F., Yépez, V.A. *et al.* Detection of aberrant splicing events in RNA-seq data using FRASER. *Nat Commun* **12**, 529 (2021). https://doi.org/10.1038/s41467-020-20573-7
 
+## What's new
+
+FRASER 2.0, an improved version of FRASER that uses the Intron Jaccard Index as 
+its splice metric instead of FRASER's previous three metrics, is now available 
+and used by default (version 1.99.0 and above). The manuscript describing these 
+changes in more detail will be available soon. 
+
 ## Installation
 
 `FRASER` is an R/Bioconductor software package requiring a running 

From adf499cc2bd3e7c9e0a346e352a10be7f8e336b7 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Thu, 30 Mar 2023 17:12:09 +0200
Subject: [PATCH 78/80] have new delta and fdr cutoffs consistently

---
 R/AllGenerics.R | 4 ++--
 R/plotMethods.R | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index 7e1bf696..05eb80d1 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -961,7 +961,7 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff,
 #' 
 #' @export
 setMethod("results", "FraserDataSet", function(object, 
-                    sampleIDs=samples(object), padjCutoff=0.05,
+                    sampleIDs=samples(object), padjCutoff=0.1,
                     deltaPsiCutoff=0.1,
                     rhoCutoff=NA, aggregate=FALSE, collapse=FALSE,
                     minCount=5, psiType=psiTypes,
@@ -1016,7 +1016,7 @@ setMethod("results", "FraserDataSet", function(object,
 })
 
 aberrant.FRASER <- function(object, type=fitMetrics(object), 
-                                padjCutoff=0.05, deltaPsiCutoff=0.1, 
+                                padjCutoff=0.1, deltaPsiCutoff=0.1, 
                                 minCount=5, rhoCutoff=NA,
                                 by=c("none", "sample", "feature"), 
                                 aggregate=FALSE, geneColumn="hgnc_symbol", 
diff --git a/R/plotMethods.R b/R/plotMethods.R
index 7609939c..6f7a9a22 100644
--- a/R/plotMethods.R
+++ b/R/plotMethods.R
@@ -1568,7 +1568,7 @@ plotManhattan.FRASER <- function(object, sampleID, value="pvalue",
     stopifnot(sampleID %in% samples(object))
     type <- match.arg(type)
     additional_args <- list(...)
-    padjCutoff <- 0.05
+    padjCutoff <- 0.1
     if("padjCutoff" %in% names(additional_args)){
         padjCutoff <- additional_args$padjCutoff
     }

From 6ab5bd7e797e3a31eb84ac45d3f8e5f564828c46 Mon Sep 17 00:00:00 2001
From: Ines Scheller <scheller@in.tum.de>
Date: Thu, 30 Mar 2023 17:12:40 +0200
Subject: [PATCH 79/80] readme update

---
 README.md | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 5c15f95f..21de3e75 100644
--- a/README.md
+++ b/README.md
@@ -16,9 +16,16 @@ Please cite our method paper if you use it in a publication:
 ## What's new
 
 FRASER 2.0, an improved version of FRASER that uses the Intron Jaccard Index as 
-its splice metric instead of FRASER's previous three metrics, is now available 
-and used by default (version 1.99.0 and above). The manuscript describing these 
-changes in more detail will be available soon. 
+its splice metric instead of FRASER's previous three metrics along with some other 
+parameter optimizations of pseudocount, filtering settings and default delta cutoff, 
+is now available and used by default (version 1.99.0 and above). 
+To change the splice metric, set `fitMetrics(fds)` to one or more of the metrics 
+specified in `FRASER::psiTypes`. For FRASER 2.0 and the Intron Jaccard Index, the 
+new default delta cutoff is 0.1 instead of the previous value of 0.3. When using 
+the 3 previous metrics, the delta cutoff should therefore be manually set to 0.3 
+during results extraction, e.g. `results(fds, deltaPsiCutoff=0.3,...)`.
+
+The manuscript describing these changes in more detail will be available soon. 
 
 ## Installation
 

From 8251d53290ef7dcf7dfdc5027c86396dd6015c9e Mon Sep 17 00:00:00 2001
From: Vicente Yepez <30469316+vyepez88@users.noreply.github.com>
Date: Thu, 30 Mar 2023 17:29:45 +0200
Subject: [PATCH 80/80] Update README.md

---
 README.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 21de3e75..ef17f68a 100644
--- a/README.md
+++ b/README.md
@@ -15,14 +15,15 @@ Please cite our method paper if you use it in a publication:
 
 ## What's new
 
-FRASER 2.0, an improved version of FRASER that uses the Intron Jaccard Index as 
-its splice metric instead of FRASER's previous three metrics along with some other 
-parameter optimizations of pseudocount, filtering settings and default delta cutoff, 
-is now available and used by default (version 1.99.0 and above). 
+FRASER 2.0, an improved version of FRASER, is now available and used by default (version 1.99.0 and above).
+FRASER 2.0 uses the Intron Jaccard Index as its splice metric instead of FRASER's 
+previous three metrics along with some other parameter optimizations of pseudocount, 
+filtering settings and default delta cutoff. 
+ 
 To change the splice metric, set `fitMetrics(fds)` to one or more of the metrics 
 specified in `FRASER::psiTypes`. For FRASER 2.0 and the Intron Jaccard Index, the 
 new default delta cutoff is 0.1 instead of the previous value of 0.3. When using 
-the 3 previous metrics, the delta cutoff should therefore be manually set to 0.3 
+the 3 previous metrics, the delta cutoff should be set manually to 0.3 
 during results extraction, e.g. `results(fds, deltaPsiCutoff=0.3,...)`.
 
 The manuscript describing these changes in more detail will be available soon.