Switch to theta (#17)

* global change of psiSite -> theta * fix typos and documentation * adapt and speedup examples * fixing #16
gagneurlab · Oct 21, 2020 · 07a82e2 · 07a82e2
1 parent 82f1586
commit 07a82e2
Show file tree

Hide file tree

Showing 34 changed files with 278 additions and 232 deletions.
diff --git a/.gitignore b/.gitignore
@@ -7,6 +7,8 @@
 
 # ignore tmp files
 tmp
+inst/igv
+inst/tmp*
 
 # ignore vignette artefacts
 FraseR.pdf

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -88,7 +88,6 @@ Collate:
     AllGenerics-definitions.R
     AllGenerics.R
     Fraser-pipeline.R
-    accessor-methods.R
     annotationOfRanges.R
     beta-binomial-testing.R
     calculatePSIValue.R

diff --git a/R/AllGenerics.R b/R/AllGenerics.R
@@ -25,7 +25,7 @@ asFDS <- function(x){
 #' @param object A FraserDataSet object.
 #' @param value The new value that should replace the current one.
 #' @param x A FraserDataSet object.
-#' @param type The psi type (psi3, psi5 or psiSite)
+#' @param type The psi type (psi3, psi5 or theta)
 #' @return Getter method return the respective current value.
 #' @examples
 #' fds <- createTestFraserDataSet()
@@ -47,9 +47,9 @@ asFDS <- function(x){
 #' scanBamParam(fds) <- ScanBamParam(mapqFilter=30)
 #' nonSplicedReads(fds)
 #' rowRanges(fds)
-#' rowRanges(fds, type="psiSite")
+#' rowRanges(fds, type="theta")
 #' mcols(fds, type="psi5")
-#' mcols(fds, type="psiSite")
+#' mcols(fds, type="theta")
 #' seqlevels(fds)
 #' seqlevels(mapSeqlevels(fds, style="UCSC"))
 #' seqlevels(mapSeqlevels(fds, style="Ensembl"))
@@ -518,7 +518,7 @@ setReplaceMethod("rowRanges", "FraserDataSet", FRASER.rowRanges.replace)
 #'  counts(fds, type="psi5", side="ofInterest")
 #'  counts(fds, type="psi5", side="other")
 #'  head(K(fds, type="psi3"))
-#'  head(N(fds, type="psi3"))
+#'  head(N(fds, type="theta"))
 #'  
 setMethod("counts", "FraserDataSet", function(object, type=NULL,
             side=c("ofInterest", "otherSide")){
@@ -534,15 +534,15 @@ setMethod("counts", "FraserDataSet", function(object, type=NULL,
     }
 
     # extract psi value from type
-    type <- unlist(regmatches(type, gregexpr("psi(3|5|Site)", type, perl=TRUE)))
-    if(length(type) == 0){
-        stop(paste0("Please provide a correct psi type: psi5, psi3, and ",
-                    "psiSite. Not the given one: '", type, "'."))
+    type <- whichPSIType(type)
+    if(length(type) == 0 | length(type) > 1){
+        stop(paste0("Please provide a correct psi type: psi5, psi3, or ",
+                    "theta. Not the given one: '", type, "'."))
     }
     aname <- paste0("rawOtherCounts_", type)
     if(!aname %in% assayNames(object)){
         stop(paste0("Missing rawOtherCounts for type '", type, "'.",
-                "Please calculate PSIValues first. And then try again."))
+                " Please calculate PSIValues first. And then try again."))
     }
     return(assays(object)[[aname]])
 })
@@ -559,8 +559,7 @@ setReplaceMethod("counts", "FraserDataSet", function(object, type=NULL,
         type <- checkReadType(object, type)
         aname <- paste0("rawCounts", toupper(type))
     } else {
-        type <- unlist(
-                regmatches(type, gregexpr("psi(3|5|Site)", type, perl=TRUE)))
+        type <- whichPSIType(type)
         aname <- paste0("rawOtherCounts_", type)
     }
     assays(object, ...)[[aname]] <- value
@@ -745,7 +744,7 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
 #'     the gene symbols.   
 #' @param method The p.adjust method that is being used to adjust p values per
 #'     sample.
-#' @param type Splicing type (psi5, psi3 or psiSite)
+#' @param type Splicing type (psi5, psi3 or theta)
 #' @param by By default \code{none} which means no grouping. But if 
 #'              \code{sample} or \code{feature} is specified the sum by 
 #'              sample or feature is returned
@@ -790,7 +789,7 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff,
 setMethod("results", "FraserDataSet", function(object, 
                     sampleIDs=samples(object), padjCutoff=0.05,
                     zScoreCutoff=NA, deltaPsiCutoff=0.3,
-                    minCount=5, psiType=c("psi3", "psi5", "psiSite"),
+                    minCount=5, psiType=c("psi3", "psi5", "theta"),
                     additionalColumns=NULL, BPPARAM=bpparam(), ...){
     FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff,
             zscoreCutoff=zScoreCutoff, dPsiCutoff=deltaPsiCutoff,

diff --git a/R/Fraser-pipeline.R b/R/Fraser-pipeline.R
@@ -31,7 +31,7 @@
 #'         splicing types.
 #' @param implementation The method that should be used to correct for 
 #' confounders. 
-#' @param type The type of PSI (psi5, psi3 or psiSite for theta/splicing 
+#' @param type The type of PSI (psi5, psi3 or theta for theta/splicing 
 #' efficiency)
 #' @param iterations The maximal number of iterations. When the autoencoder has 
 #' not yet converged after these number of iterations, the fit stops anyway.
@@ -47,8 +47,7 @@
 #' # preprocessing
 #' fds <- createTestFraserDataSet()
 #' 
-#' # when running FRASER on a real dataset, one should run the following 
-#' # two commands first (not run here to make the example run faster):
+#' # filtering not expressed introns
 #' fds <- calculatePSIValues(fds)
 #' fds <- filterExpressionAndVariability(fds)
 #'

diff --git a/R/FraserDataSet-class.R b/R/FraserDataSet-class.R
@@ -264,6 +264,16 @@ FraserDataSet <- function(colData=NULL, junctions=NULL, spliceSites=NULL, ...) {
             stop("Please provdie splice site counts if you provide ",
                     "junction counts.")
         }
+        if(is.character(junctions)){
+            if(!file.exists(junctions))
+                stop("Junction file '", junctions, "' does not exists")
+            junctions <- fread(junctions)
+        }
+        if(is.character(spliceSites)){
+            if(!file.exists(spliceSites))
+                stop("SpliceSite file '", spliceSites, "' does not exists")
+            spliceSites <- fread(spliceSites)
+        }
         if(is.data.frame(junctions)){
             junctions <- makeGRangesFromDataFrame(junctions,
                     keep.extra.columns=TRUE)
@@ -272,6 +282,12 @@ FraserDataSet <- function(colData=NULL, junctions=NULL, spliceSites=NULL, ...) {
             spliceSites <- makeGRangesFromDataFrame(spliceSites,
                     keep.extra.columns=TRUE)
         }
+        if(!is(junctions, "GRanges")){
+            stop("Provided junction object is not a GRanges object.")
+        }
+        if(!is(spliceSites, "GRanges")){
+            stop("Provided spliceSite object is not a GRanges object.")
+        }
         nsr <- SummarizedExperiment(
                 rowRanges=spliceSites[,c("spliceSiteID", "type")],
                 assays=SimpleList(rawCountsSS=as.data.frame(

diff --git a/R/accessor-methods.R b/R/accessor-methods.R
diff --git a/R/annotationOfRanges.R b/R/annotationOfRanges.R
@@ -82,7 +82,7 @@ annotateRanges <- function(fds, feature="hgnc_symbol", featureName=feature,
             biotype, useUSCS)
 
     # annotate split reads
-    for(i in c("psi3", "psiSite")){
+    for(i in c("psi3", "theta")){
         gr <- rowRanges(fds, type=i)
         if(any(strand(gr) == "*")){
             strand(annotation) <- "*"
@@ -124,7 +124,7 @@ annotateRangesWithTxDb <- function(fds, feature="SYMBOL",
         }
     }
 
-    for(i in c("psi3", "psiSite")){
+    for(i in c("psi3", "theta")){
         # get GRanges object with the split reads which should be annotated
         gr <- rowRanges(fds, type=i)
 

diff --git a/R/beta-binomial-testing.R b/R/beta-binomial-testing.R
@@ -26,7 +26,7 @@ pvalueByBetaBinomialPerType <- function(fds, aname, psiType, pvalFun,
     rawOtherCounts <- counts(fds, type=psiType, side="otherSide")
 
     # swap rawCounts with others for intron retention
-    if(psiType == "psiSite"){
+    if(psiType == "theta"){
         tmp <- rawCounts
         rawCounts <- rawOtherCounts
         rawOtherCounts <- tmp

diff --git a/R/calculatePSIValue.R b/R/calculatePSIValue.R
@@ -14,7 +14,7 @@
 #'
 #' @inheritParams countRNA
 #' @param types A vector with the psi types which should be calculated. Default 
-#' is all of psi5, psi3 and psiSite.
+#' is all of psi5, psi3 and theta.
 #' @param overwriteCts FALSE or TRUE (the default) the total counts (aka N) will
 #'              be recalculated based on the existing junction counts (aka K)
 #' @return FraserDataSet
@@ -185,12 +185,12 @@ calculateSitePSIValue <- function(fds, overwriteCts, BPPARAM){
 
     message(date(), ": Calculate the PSI site values ...")
 
-    psiName <- "psiSite"
-    psiROCName <- "rawOtherCounts_psiSite"
+    psiName <- "theta"
+    psiROCName <- "rawOtherCounts_theta"
     if(!psiROCName %in% assayNames(fds)){
         overwriteCts <- TRUE
     }
-    psiH5datasetName <- "oSite_psiSite"
+    psiH5datasetName <- "oSite_theta"
 
     # prepare data table for calculating the psi value
     countData <- data.table(
@@ -205,7 +205,7 @@ calculateSitePSIValue <- function(fds, overwriteCts, BPPARAM){
         )
     )
 
-    psiSiteValues <- bplapply(samples(fds), countData=countData, fds=fds,
+    thetaValues <- bplapply(samples(fds), countData=countData, fds=fds,
         BPPARAM=BPPARAM, FUN=function(sample, countData, fds){
             if(verbose(fds) > 3){
                 message("sample: ", sample)
@@ -214,18 +214,18 @@ calculateSitePSIValue <- function(fds, overwriteCts, BPPARAM){
             # get sample
             sample <- as.character(sample)
 
-            # get counts and psiSite values from cache file if it exists
+            # get counts and theta values from cache file if it exists
             cacheFile <- getOtherCountsCacheFile(sample, fds)
             ans <- checkPsiCacheFile(cFile=cacheFile, dName=psiH5datasetName, 
-                    overwrite=overwriteCts, ptype="psiSite", fds=fds)
+                    overwrite=overwriteCts, ptype="theta", fds=fds)
             if(!is.null(ans)){
                 return(ans)
             }
 
             # add sample specific counts to the data.table
             sdata <- data.table(k=c(
                     rep(K(fds, type="psi3")[,sample], 2),
-                    K(fds, type="psiSite")[,sample]))
+                    K(fds, type="theta")[,sample]))
             sdata <- cbind(countData, sdata)
             sdata[,os:=sum(k)-k, by="spliceSiteID"]
 
@@ -257,15 +257,15 @@ calculateSitePSIValue <- function(fds, overwriteCts, BPPARAM){
             HDF5Array(filepath=cacheFile, name=psiH5datasetName)
         }
     )
-    names(psiSiteValues) <- samples(fds)
+    names(thetaValues) <- samples(fds)
 
     # merge it and assign it to our object
     assay(fds, type="ss", psiName, withDimnames=FALSE) <- do.call(cbind, 
-            mapply('[', psiSiteValues, TRUE, 2, drop=FALSE, 
+            mapply('[', thetaValues, TRUE, 2, drop=FALSE, 
                     SIMPLIFY=FALSE))
     if(isTRUE(overwriteCts)){
         assay(fds, type="ss", psiROCName, withDimnames=FALSE) <- do.call(cbind,
-                mapply('[', psiSiteValues, TRUE, 1, drop=FALSE, 
+                mapply('[', thetaValues, TRUE, 1, drop=FALSE, 
                         SIMPLIFY=FALSE))
     }
 

diff --git a/R/countRNAseqData.R b/R/countRNAseqData.R
@@ -37,13 +37,14 @@
 #' @param recount if TRUE the cache is ignored and the bam file is recounted.
 #' @param genome NULL (default) or a character vector specifying the names of 
 #' the reference genomes that were used to align the reads for each sample. The 
-#' names have to be in a way accepted by the \link{getBSgenome} function. 
-#' Available genomes can be listed using the \link{available.genomes} function 
-#' from the BSgenome package. If genome is of length 1, the same reference 
-#' genome will be used for all samples. 
-#' If \code{genome} is supplied and \code{strandSpecific(fds) == 0L} 
+#' names have to be in a way accepted by the \code{\link[BSgenome]{getBSgenome}}
+#' function. Available genomes can be listed using the 
+#' \code{\link[BSgenome]{available.genomes}} function from the BSgenome package.
+#' If genome is of length 1, the same reference genome will be used for all
+#' samples. If \code{genome} is supplied and \code{strandSpecific(fds) == 0L} 
 #' (unstranded), then the strand information will be estimated by checking the 
-#' dinucleotides found at the intron boundaries (see ?\link{summarizeJunctions} 
+#' dinucleotides found at the intron boundaries 
+#' (see \code{\link[GenomicAlignments]{summarizeJunctions}}
 #' in GenomicAlignments package for details). This can e.g. help to avoid 
 #' ambiguities when adding gene names from a gene annotation to the introns 
 #' in a later step. 

diff --git a/R/filterExpression.R b/R/filterExpression.R
@@ -146,7 +146,7 @@ filterVariability <- function(object, minDeltaPsi=0, filter=TRUE,
     ctsSE  <- K(object, type="ss")
     ctsN5  <- N(object, type="psi5")
     ctsN3  <- N(object, type="psi3")
-    ctsNSE <- N(object, type="psiSite")
+    ctsNSE <- N(object, type="theta")
 
     if(isFALSE(delayed)){
         cts <- as.matrix(cts)

diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R
@@ -4,7 +4,7 @@
 #' the values within the FRASER model.
 #' 
 #' @param fds An FraserDataSet object.
-#' @param type The type of psi (psi5, psi3 or psiSite)
+#' @param type The type of psi (psi5, psi3 or theta)
 #' @param byGroup If TRUE, aggregation by donor/acceptor site will be done.
 #' @param dist Distribution for which the p-values should be extracted.
 #' @param level Indicates if the retrieved p values should be adjusted on the 
@@ -47,9 +47,9 @@
 #' pseudocount()
 #' 
 #' # retrieve or set a mask to exclude certain junctions in the fitting step
-#' featureExclusionMask(fds, type="psiSite") <- sample(
-#'         c(FALSE, TRUE), nrow(mcols(fds, type="psiSite")), replace=TRUE)
-#' featureExclusionMask(fds, type="psiSite")
+#' featureExclusionMask(fds, type="theta") <- sample(
+#'         c(FALSE, TRUE), nrow(mcols(fds, type="theta")), replace=TRUE)
+#' featureExclusionMask(fds, type="theta")
 #' 
 #' # controlling the verbosity level of the output of some algorithms
 #' verbose(fds) <- 2
@@ -527,7 +527,7 @@ weights <- function(fds, type){
 getIndexFromResultTable <- function(fds, resultTable, padj.method="holm"){
     type <- as.character(resultTable$type)
     target <- makeGRangesFromDataFrame(resultTable)
-    if(type == "psiSite"){
+    if(type == "theta"){
         gr <- granges(asSE(nonSplicedReads(fds)))
     } else {
         gr <- granges(asSE(fds))

diff --git a/R/helper-functions.R b/R/helper-functions.R
@@ -79,7 +79,7 @@ checkReadType <- function(fds, type){
     }
     type <- unique(type)
     stopifnot(isScalarCharacter(type))
-    correctTypes <- c(psi3="j", psi5="j", psiSite="ss")
+    correctTypes <- c(psi3="j", psi5="j", theta="ss")
 
     # check if it is already the correct type
     if(type %in% correctTypes) return(type)
@@ -109,7 +109,7 @@ checkReadType <- function(fds, type){
 #'
 #' @noRd
 whichPSIType <- function(type){
-    unlist(regmatches(type, gregexpr("psi(3|5|Site)", type, perl=TRUE)))
+    unlist(regmatches(type, gregexpr("psi(3|5)|theta", type, perl=TRUE)))
 }
 
 #'
@@ -120,7 +120,7 @@ whichReadType <- function(fds, name){
     stopifnot(isScalarCharacter(name))
 
     # check writing
-    if(name == "ss" | endsWith(name, "psiSite"))
+    if(name == "ss" | endsWith(name, "theta"))
         return("ss")
     if(name == "j"  | endsWith(name, "psi5") | endsWith(name, "psi3"))
         return("j")