Merge pull request #269 from AlexsLemonade/jashapiro/reduce-merge-memory

Update to Bioc3.18 and reduce merge memory
AlexsLemonade · Apr 18, 2024 · 951b569 · 951b569
2 parents 20c9029 + ff19180
commit 951b569
Show file tree

Hide file tree

Showing 23 changed files with 900 additions and 812 deletions.
diff --git a/.Rprofile b/.Rprofile
@@ -1,5 +1,6 @@
-options(BioC_mirror = "https://packagemanager.posit.co/bioconductor")
-options(BIOCONDUCTOR_CONFIG_FILE = "https://packagemanager.posit.co/bioconductor/config.yaml")
+# Configure BiocManager to use Posit Public Package Manager:
+options(BioC_mirror = "https://p3m.dev/bioconductor")
+options(BIOCONDUCTOR_CONFIG_FILE = "https://p3m.dev/bioconductor/config.yaml")
 
 # activate renv
 source("renv/activate.R")
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -25,7 +25,7 @@ Description: Tools for processing single cell data associated with the
 License: BSD_3_clause + file LICENSE
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 Depends:
     R (>= 4.3.0)
 Imports:

diff --git a/R/build_sce.R b/R/build_sce.R
@@ -5,15 +5,16 @@
 #'   with intron counts marked by "-U" and ambiguous counts "-A".
 #' @param include_unspliced Whether or not to include the unspliced reads in the counts matrix.
 #'   If TRUE, the main "counts" assay will contain unspliced reads and spliced reads and an additional "spliced"
-#'   assay will contain spliced reads only. If TRUE, requires that data has been aligned to a reference contianing
+#'   assay will contain spliced reads only. If TRUE, requires that data has been aligned to a reference containing
 #'   spliced and unspliced reads.
 #'   Default is TRUE.
 #' @param round_counts Logical indicating if the count matrix should be rounded to integers on import.
 #'   Default is TRUE.
 #'
-#' @return SingleCellExperiment object containing either just a counts assay with spliced cDNA only if
-#'   `include_unspliced` is FALSE. If `include_unspliced` is TRUE, the counts assay will contain both spliced and unspliced
-#'    counts and the spliced assay will contain the counts for just the spliced cDNA.
+#' @return SingleCellExperiment object. If `include_unspliced` is TRUE (default), the counts assay will contain
+#'   both spliced and unspliced counts and the spliced assay will contain the counts for just the spliced cDNA.
+#'   If `include_unspliced` is FALSE, the counts assay will contain spliced cDNA counts only.
+#'
 #'
 #' @examples
 #' \dontrun{
@@ -41,13 +42,13 @@ build_sce <- function(counts,
   # define if counts matrix has any unspliced reads
   has_unspliced <- any(grep("-[IU]$", rownames(counts)))
 
-  if (include_unspliced & !has_unspliced) {
+  if (include_unspliced && !has_unspliced) {
     stop("No counts corresponding to intronic reads detected.
           If `include_unspliced` is TRUE a reference with spliced and unspliced reads must be used.")
   }
 
   # if has unspliced data get counts for both unspliced and spliced
-  if (include_unspliced & has_unspliced) {
+  if (include_unspliced && has_unspliced) {
     total <- collapse_intron_counts(counts, which_counts = c("total"))
     spliced <- collapse_intron_counts(counts, which_counts = c("spliced"))
 
@@ -88,7 +89,7 @@ build_sce <- function(counts,
       counts = total,
       spliced = spliced
     )
-  } else if (!include_unspliced & has_unspliced) {
+  } else if (!include_unspliced && has_unspliced) {
     # still aligned to introns, but want to collapse and just return spliced
     spliced <- collapse_intron_counts(counts, which_counts = c("spliced"))
     assay_list <- list(counts = spliced)

diff --git a/R/import_quant_data.R b/R/import_quant_data.R
@@ -1,18 +1,19 @@
 #' Import Gene Expression Quantification Data for Single-Cell RNA-Seq
 #'
-#' Imports the gene x cell matrix output from either Alevin, Alevin-fry, Cellranger, or Kallisto and returns a SingleCellExperiment.
+#' Imports the gene x cell matrix output from either Alevin, Alevin-fry, Cell Ranger, or Kallisto
+#'   and returns a SingleCellExperiment.
 #'
 #' @param quant_dir Path to directory where output files are located.
 #' @param tool Type of tool used to create files (alevin, alevin-fry, cellranger, or kallisto).
 #' @param include_unspliced Whether or not to include the unspliced reads in the counts matrix.
 #'   If TRUE, the main "counts" assay will contain unspliced reads and spliced reads and an additional "spliced"
-#'   assay will contain spliced reads only. If TRUE, requires that data has been aligned to a reference contianing
+#'   assay will contain spliced reads only. If TRUE, requires that data has been aligned to a reference containing
 #'   spliced and unspliced reads.
 #'   Default is TRUE.
 #' @param usa_mode Logical indicating if Alevin-fry was used, if the USA mode was invoked.
 #'   Default is FALSE.
 #' @param filter Logical indicating whether or not to filter the counts matrix.
-#'   Filtering is performed using DropletUtils::emptyDrops and cannot be performed with Cellranger.
+#'   Filtering is performed using DropletUtils::emptyDrops and cannot be performed with Cell Ranger.
 #' @param fdr_cutoff FDR cutoff to use for DropletUtils::emptyDrops.
 #'   Default is 0.01.
 #' @param tech_version Technology or kit used to process library (i.e. 10Xv3, 10Xv3.1).
@@ -64,40 +65,22 @@ import_quant_data <- function(quant_dir,
                               ...) {
   which_counts <- match.arg(which_counts)
 
-  if (!(tool %in% c("cellranger", "alevin", "alevin-fry", "kallisto"))) {
-    stop("Tool must be either cellranger, alevin, alevin-fry, or kallisto.")
-  }
-
-  # checks for intron_mode and usa_mode
-  if (!is.logical(include_unspliced)) {
-    stop("include_unspliced must be set as TRUE or FALSE")
-  }
-  if (!is.logical(usa_mode)) {
-    stop("usa_mode must be set as TRUE or FALSE")
-  }
-  if (!is.logical(filter)) {
-    stop("filter must be set as TRUE or FALSE")
-  }
+  stopifnot(
+    "Tool must be one of cellranger, alevin, alevin-fry, or kallisto." =
+      tool %in% c("cellranger", "alevin", "alevin-fry", "kallisto"),
+    "include_unspliced must be set as TRUE or FALSE" = is.logical(include_unspliced),
+    "usa_mode must be set as TRUE or FALSE" = is.logical(usa_mode),
+    "filter must be set as TRUE or FALSE" = is.logical(filter),
+    "USA mode only compatible with alevin-fry." = !(usa_mode && tool %in% c("cellranger", "alevin", "kallisto")),
+    "Include unspliced not compatible with cellranger." = !(include_unspliced && tool %in% c("cellranger")),
+    "Cannot perform emptyDrops filtering on cellranger output." = !(filter && tool == "cellranger")
+  )
 
-  # check that usa_mode and intron_mode are used with the proper tools
-  if (usa_mode & tool %in% c("cellranger", "alevin", "kallisto")) {
-    stop("USA mode only compatible with alevin-fry.")
-  }
-  if (include_unspliced & tool %in% c("cellranger")) {
-    stop("Include unspliced not compatible with cellranger.")
-  }
-
-  # check that filter is not used with cellranger
-  if (filter & tool == "cellranger") {
-    stop("Cannot perform emptyDrops filtering on cellranger output.")
-  }
   if (filter) {
-    if (!(is.numeric(fdr_cutoff))) {
-      stop("fdr_cutoff is not a number.")
-    }
-    if (fdr_cutoff < 0 | fdr_cutoff > 1) {
-      stop("fdr_cutoff must be a number between 0 - 1.")
-    }
+    stopifnot(
+      "fdr_cutoff must be a number." = is.numeric(fdr_cutoff),
+      "fdr_cutoff must be a number between 0 - 1." = fdr_cutoff >= 0 && fdr_cutoff <= 1
+    )
   }
 
   if (tool %in% c("alevin-fry", "alevin")) {

diff --git a/R/merge_sce_list.R b/R/merge_sce_list.R
@@ -160,8 +160,13 @@ merge_sce_list <- function(
       purrr::walk(
         \(altexp_name) {
           sce_list |>
-            purrr::keep(\(sce) altexp_name %in% altExpNames(sce)) |>
-            purrr::map(altExp, altexp_name) |>
+            purrr::map(\(sce) {
+              if (altexp_name %in% altExpNames(sce)) {
+                altExp(sce, altexp_name)
+              }
+            }) |>
+            # remove nulls
+            purrr::keep(\(sce) is(sce, "SingleCellExperiment")) |>
             check_metadata()
         }
       )
@@ -244,24 +249,9 @@ merge_sce_list <- function(
 
   if (include_altexp) {
     for (altexp_name in names(altexp_attributes)) {
-      has_altexp_name <- sce_list |>
-        purrr::map_lgl(\(sce) (altexp_name %in% altExpNames(sce)))
-
-      # For any SCEs without this altExp, create the library_id and sample_id metadata
-      additional_metadata <- sce_list |>
-        purrr::discard(has_altexp_name) |>
-        purrr::map(extract_metadata_for_altexp)
-
       # Update metadata in altExps that were originally present
       altexp_metadata_list <- sce_list |>
-        purrr::keep(has_altexp_name) |>
-        purrr::map(altExp, altexp_name) |>
-        purrr::map(metadata) |>
-        # Tack on the metadata we created for libraries without altExps
-        c(additional_metadata)
-
-      # Ensure correct order
-      altexp_metadata_list <- altexp_metadata_list[names(sce_list)]
+        purrr::map(get_altexp_metadata, altexp_name = altexp_name)
 
       metadata(altExp(merged_sce, altexp_name)) <- prepare_merged_metadata(altexp_metadata_list)
     }
@@ -572,14 +562,27 @@ check_metadata <- function(sce_list, expected_fields = c("library_id", "sample_i
   }
 }
 
-#' Helper function to extract main SCE metadata for inclusion in an altExp
+
+
+#' Helper function to get altExp metadata from an SCE that may not have the altExp.
+#' Returns partial main experiment metadata if the altExp is not present in the SCE.
 #'
 #' @param sce SCE object to extract metadata from
+#' @param altexp_name Name of the altExp to extract metadata for
 #'
 #' @return List with fields `library_id` and `sample_id`
-extract_metadata_for_altexp <- function(sce) {
-  list(
-    library_id = metadata(sce)$library_id,
-    sample_id = metadata(sce)$sample_id
-  )
+get_altexp_metadata <- function(sce, altexp_name) {
+  if (altexp_name %in% altExpNames(sce)) {
+    return(
+      metadata(altExp(sce, altexp_name))
+    )
+  } else {
+    # if the altExp is not present, return partial main SCE metadata
+    return(
+      list(
+        library_id = metadata(sce)$library_id,
+        sample_id = metadata(sce)$sample_id
+      )
+    )
+  }
 }
diff --git a/R/read_alevin.R b/R/read_alevin.R
@@ -7,7 +7,7 @@
 #'   Default is FALSE.
 #' @param include_unspliced Whether or not to include the unspliced reads in the counts matrix.
 #'   If TRUE, the main "counts" assay will contain unspliced reads and spliced reads and an additional "spliced"
-#'   assay will contain spliced reads only. If TRUE, requires that data has been aligned to a reference contianing
+#'   assay will contain spliced reads only. If TRUE, requires that data has been aligned to a reference containing
 #'   spliced and unspliced reads.
 #'   Default is TRUE.
 #' @param feature_data Logical indicating if the data being read in contains feature data.

diff --git a/R/read_cellranger.R b/R/read_cellranger.R
@@ -1,4 +1,4 @@
-#' Read in counts data processed with Cellranger
+#' Read in counts data processed with Cell Ranger
 #'
 #' @param quant_dir Path to directory where output files are located.
 #'

diff --git a/R/read_kallisto.R b/R/read_kallisto.R
@@ -3,7 +3,7 @@
 #' @param quant_dir Path to directory where output files are located.
 #' @param include_unspliced Whether or not to include the unspliced reads in the counts matrix.
 #'   If TRUE, the main "counts" assay will contain unspliced reads and spliced reads and an additional "spliced"
-#'   assay will contain spliced reads only. If TRUE, requires that data has been aligned to a reference contianing
+#'   assay will contain spliced reads only. If TRUE, requires that data has been aligned to a reference containing
 #'   spliced and unspliced reads.
 #'   Default is TRUE.
 #' @param round_counts Logical indicating if the count matrix should be rounded to integers on import.

diff --git a/R/scpcaTools-package.R b/R/scpcaTools-package.R
@@ -1,13 +1,12 @@
 #' scpcaTools: Useful tools for analysis of single-cell RNA seq counts data
 #'
 #' The scpcaTools package contains a set of tools for working with single-cell and single-nuclei RNA-seq counts data.
-#' Mainly, this package can work with data that has been produced using Alevin, Alevin-Fry, Cellranger, or Kallisto.
+#' Mainly, this package can work with data that has been produced using Alevin, Alevin-Fry, Cell Ranger, or Kallisto.
 #'
 #'
-#' @docType package
-#' @name scpcaTools-package
+#' @aliases scpcaTools-package
 #'
 #' @import methods
 #' @import SingleCellExperiment
 #' @import stringr
-NULL
+"_PACKAGE"
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -1,4 +1,4 @@
-FROM rocker/r-ver:4.3.2
+FROM rocker/r-ver:4.3.3
 LABEL maintainer="[email protected]"
 LABEL org.opencontainers.image.source https://github.com/AlexsLemonade/scpcaTools
 
@@ -13,10 +13,9 @@ RUN Rscript -e "install.packages(c('remotes', 'renv'))"
 
 WORKDIR /usr/local/renv
 COPY renv.lock renv.lock
-# restore with PPM repo set for binary installs
-RUN Rscript -e "renv::consent(provided = TRUE); \
-      renv::restore(repos = 'https://packagemanager.posit.co/cran/latest')" && \
-      rm -rf ~/.local/share/renv && \
+# restore renv and remove cache files
+RUN Rscript -e "renv::restore()" && \
+      rm -rf ~/.cache/R/renv && \
       rm -rf /tmp/downloaded_packages && \
       rm -rf /tmp/Rtmp*
 
@@ -27,7 +26,7 @@ RUN Rscript -e "proc <- basilisk::basiliskStart(env = zellkonverter::zellkonvert
 
 #### Python packages
 COPY requirements.txt requirements.txt
-RUN pip install -r requirements.txt && pip cache purge
+RUN pip install --no-cache-dir -r requirements.txt
 
 
 ##########################