diff --git a/.Rbuildignore b/.Rbuildignore index b238b50..ae19577 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -6,3 +6,4 @@ ^\.lintr$ ^\.pre-commit-config.yaml$ ^data-raw$ +^LICENSE\.md$ diff --git a/.Renviron b/.Renviron new file mode 100644 index 0000000..90d1f09 --- /dev/null +++ b/.Renviron @@ -0,0 +1 @@ +RENV_EXT_ENABLED = FALSE diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e3079b8..5d0ac27 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,7 +16,7 @@ repos: exclude: '\.Rd' - repo: https://github.com/crate-ci/typos - rev: v1.28.1 + rev: v1.28.2 hooks: - id: typos exclude: '\.nb\.html' diff --git a/DESCRIPTION b/DESCRIPTION index b2908ff..0c9cc6e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,8 +21,12 @@ LazyData: true Suggests: testthat (>= 3.0.0), scater, + scran, Seurat, - splatter + splatter, + scuttle, + Matrix, + SeuratObject Config/testthat/edition: 3 RoxygenNote: 7.3.2 Imports: @@ -34,6 +38,7 @@ Imports: purrr, S4Vectors, SingleCellExperiment, + SummarizedExperiment, tibble, tidyr biocViews: diff --git a/LICENSE b/LICENSE index ad17738..ab97fda 100644 --- a/LICENSE +++ b/LICENSE @@ -1,28 +1,3 @@ -BSD 3-Clause License - -Copyright (c) 2024, Alex's Lemonade Stand Foundation - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +YEAR: 2024 +COPYRIGHT HOLDER: Alex's Lemonade Stand Foundation +ORGANIZATION: Alex's Lemonade Stand Foundation diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..252ca04 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,28 @@ +# BSD 3-Clause License + +Copyright (c) 2024, Alex's Lemonade Stand Foundation + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/NAMESPACE b/NAMESPACE index 601be7e..6d9e440 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,9 +6,12 @@ export(calculate_silhouette) export(calculate_stability) export(ensembl_to_symbol) export(extract_pc_matrix) +export(sce_to_seurat) export(sce_to_symbols) +export(sum_duplicate_genes) export(sweep_clusters) import(SingleCellExperiment) +import(SummarizedExperiment) import(methods) importFrom(S4Vectors,`metadata<-`) importFrom(S4Vectors,metadata) diff --git a/R/convert-gene-ids.R b/R/convert-gene-ids.R index 5645a9c..dd10d32 100644 --- a/R/convert-gene-ids.R +++ b/R/convert-gene-ids.R @@ -13,18 +13,21 @@ #' #' #' @param ensembl_ids A character vector of Ensembl gene ids to translate to -#' gene symbols. -#' @param reference The reference gene list to use for translation. One of `scpca`, -#' `10x2020`, `10x2024`. The `scpca` reference is the default. -#' @param sce A SingleCellExperiment object to use as a reference for gene symbols. -#' If provided, the `reference` argument will be ignored. The `sce` object must -#' include columns with the names `gene_ids` (containing Ensembl ids) and -#' `gene_symbol` (containing the symbols) to use for conversion. -#' @param unique Whether to use unique gene symbols, as would be done if -#' data had been read in with gene symbols by Seurat. Default is FALSE. -#' @param leave_na Whether to leave NA values in the output vector. -#' If FALSE, any missing values will be replaced with the input ensembl_id value. -#' Default is FALSE. +#' gene symbols. +#' @param reference The reference gene list to use for translation. One of +#' `scpca`, `10x2020`, `10x2024`. The `scpca` reference is the default. +#' @param sce A SingleCellExperiment object to use as a reference for gene +#' symbols. If provided, the `reference` argument will be ignored. The `sce` +#' object must include columns with the names `gene_ids` (containing Ensembl +#' ids) and `gene_symbol` (containing the symbols) to use for conversion. +#' @param unique Whether to use unique gene symbols, as would be done if data +#' had been read in with gene symbols by Seurat. Default is FALSE. +#' @param leave_na Whether to leave NA values in the output vector. If FALSE, +#' any missing values will be replaced with the input ensembl_id value. +#' Default is FALSE. +#' @param seurat_compatible Whether to return a vector that is compatible with +#' Seurat, translating and underscores to dashes. +#' Default is FALSE. #' #' @return A vector of gene symbols corresponding to the input Ensembl ids. #' @export @@ -51,7 +54,8 @@ ensembl_to_symbol <- function( reference = c("scpca", "10x2020", "10x2024"), sce = NULL, unique = FALSE, - leave_na = FALSE) { + leave_na = FALSE, + seurat_compatible = FALSE) { reference <- match.arg(reference) stopifnot( "`ensembl_ids` must be a character vector." = is.character(ensembl_ids), @@ -85,10 +89,17 @@ ensembl_to_symbol <- function( ) } if (!leave_na && any(missing_symbols)) { - warning("Not all input ids have corresponding gene symbols, using input ids for missing values.") + message("Not all input ids have corresponding gene symbols, using input ids for missing values.") gene_symbols[missing_symbols] <- ensembl_ids[missing_symbols] } + if (seurat_compatible) { + if (any(grepl("_", gene_symbols))) { + warning("Replacing underscores ('_') with dashes ('-') in gene symbols for Seurat compatibility.") + } + gene_symbols <- gsub("_", "-", gene_symbols) + } + return(gene_symbols) } @@ -112,15 +123,19 @@ ensembl_to_symbol <- function( #' (and not disabled by the `convert_hvg` and `convert_pca` arguments). #' #' -#' @param sce A SingleCellExperiment object containing gene ids and gene symbols. -#' @param reference The reference gene list for conversion. One of `sce`, `scpca`, -#' `10x2020`, or `10x2024`. If `sce` (the default) the internal row data is used. -#' @param unique Whether to use unique gene symbols, as would be done if -#' data had been read in with gene symbols by Seurat. Default is FALSE. -#' @param convert_hvg Logical indicating whether to convert highly variable genes to gene symbols. -#' Default is TRUE. -#' @param convert_pca Logical indicating whether to convert PCA rotation matrix to gene symbols. -#' Default is TRUE. +#' @param sce A SingleCellExperiment object containing gene ids and gene +#' symbols. +#' @param reference The reference gene list for conversion. One of `sce`, +#' `scpca`, `10x2020`, or `10x2024`. If `sce` (the default) the internal row +#' data is used. +#' @param unique Whether to use unique gene symbols, as would be done if data +#' had been read in with gene symbols by Seurat. Default is FALSE. +#' @param convert_hvg Logical indicating whether to convert highly variable +#' genes to gene symbols. Default is TRUE. +#' @param convert_pca Logical indicating whether to convert PCA rotation matrix +#' to gene symbols. Default is TRUE. +#' @param seurat_compatible Logical indicating whether to make gene symbols +#' Seurat-compatible by replacing underscores with dashes. Default is FALSE. #' #' @return A SingleCellExperiment object with row names set as gene symbols. #' @export @@ -145,7 +160,8 @@ sce_to_symbols <- function( reference = c("sce", "scpca", "10x2020", "10x2024"), unique = FALSE, convert_hvg = TRUE, - convert_pca = TRUE) { + convert_pca = TRUE, + seurat_compatible = FALSE) { reference <- match.arg(reference) stopifnot( "`sce` must be a SingleCellExperiment object." = is(sce, "SingleCellExperiment"), @@ -164,9 +180,19 @@ sce_to_symbols <- function( } if (reference == "sce") { - gene_symbols <- ensembl_to_symbol(ensembl_ids, sce = sce, unique = unique) + gene_symbols <- ensembl_to_symbol( + ensembl_ids, + sce = sce, + unique = unique, + seurat_compatible = seurat_compatible + ) } else { - gene_symbols <- ensembl_to_symbol(ensembl_ids, reference = reference, unique = unique) + gene_symbols <- ensembl_to_symbol( + ensembl_ids, + reference = reference, + unique = unique, + seurat_compatible = seurat_compatible + ) } row_ids <- gene_symbols # set Ensembl ids as original ids for later translations diff --git a/R/make-seurat.R b/R/make-seurat.R new file mode 100644 index 0000000..424ee6f --- /dev/null +++ b/R/make-seurat.R @@ -0,0 +1,169 @@ +#' Convert an SCE object to Seurat +#' +#' Converts an ScPCA SingleCellExperiment (SCE) object to Seurat format. This is +#' primarily a wrapper around Seurat::as.Seurat() with some additional steps to +#' include ScPCA metadata and options for converting the feature index from +#' Ensembl gene ids to gene symbols. +#' +#' If present, reduced dimensions from SCE objects will be included, renamed to +#' match Seurat default naming. +#' +#' +#' @param sce The input SCE object +#' @param use_symbols Logical indicating whether the Seurat object uses gene +#' symbols for indexing. Default is TRUE. +#' @param reference If `use_symbols` is TRUE, the reference to use for gene +#' symbols. One of `sce`, `scpca`, `10x2020`, `10x2024`, where `sce` uses the +#' symbols stored in the `gene_symbol` column of the SCE object's row data, +#' and the others use standardized translation tables. See `ensembl_to_symbol()` +#' for more information. Default is `sce`. +#' @param dedup_method Method to handle duplicated gene symbols. If `unique`, +#' the gene symbols will be made unique following standard Seurat procedures. +#' If `sum`, the expression values for each gene symbols will be summed. +#' @param seurat_assay_version Whether to create Seurat `v3` or `v5` assay +#' objects. Default is `v3`. +#' +#' @return a Seurat object +#' @export +#' +#' @examples +#' \dontrun{ +#' # convert an ScPCA SCE to Seurat renaming with gene symbols +#' seurat_obj <- sce_to_seurat(sce) +#' +#' # convert SCE to Seurat keeping the current names (usually Ensembl) +#' seurat_obj <- sce_to_seurat(sce, use_symbols = FALSE) +#' +#' # convert SCE to Seurat, merging duplicated gene names +#' seurat_obj <- sce_to_seurat(sce, dedup_method = "sum") +#' +#' # convert SCE to Seurat with v5 assay objects +#' seurat_obj <- sce_to_seurat(sce, seurat_assay_version = "v5") +#' } +#' +sce_to_seurat <- function( + sce, + use_symbols = TRUE, + reference = c("sce", "scpca", "10x2020", "10x2024"), + dedup_method = c("unique", "sum"), + seurat_assay_version = c("v3", "v5")) { + reference <- match.arg(reference) + dedup_method <- match.arg(dedup_method) + seurat_assay_version <- match.arg(seurat_assay_version) + + stopifnot( + "Package `Seurat` must be installed for SCE to Seurat conversion." = + requireNamespace("Seurat", quietly = TRUE), + "Package `SeuratObject` must be installed for SCE to Seurat conversion." = + requireNamespace("SeuratObject", quietly = TRUE), + "`sce` must be a SingleCellExperiment object." = is(sce, "SingleCellExperiment"), + "`sce` must contain both `gene_ids` and `gene_symbol` columns in the row data if it is being used as a reference." = + reference != "sce" || all(c("gene_ids", "gene_symbol") %in% names(rowData(sce))), + "`sce` should only include a single library. Merged SingleCellExperiment objects are not (yet) supported." = + length(metadata(sce)$library_id) <= 1 + ) + + if (use_symbols) { + sce <- suppressMessages(sce_to_symbols( + sce, + reference = reference, + unique = (dedup_method == "unique"), + seurat_compatible = TRUE + )) + } + + if (dedup_method == "sum") { + sce <- sum_duplicate_genes(sce) + } + + mainExpName(sce) <- "RNA" + + # convert reducedDims to Seurat standard naming + reducedDimNames(sce) <- tolower(reducedDimNames(sce)) + reducedDims(sce) <- reducedDims(sce) |> + purrr::map(\(mat) { + # insert underscore between letters and numbers in axis labels + rd_names <- gsub("([A-Za-z]+)([0-9]+)", "\\1_\\2", colnames(mat)) |> + tolower() + colnames(mat) <- rd_names + if (!is.null(attr(mat, "rotation"))) { + colnames(attr(mat, "rotation")) <- rd_names + } + return(mat) + }) + + # pull out any altExps to handle manually + alt_exps <- altExps(sce) + altExps(sce) <- NULL + + # Let Seurat do initial conversion to capture most things + if ("logcounts" %in% assayNames(sce)) { + data_name <- "logcounts" + } else { + data_name <- NULL + } + sobj <- Seurat::as.Seurat(sce, data = data_name) + + # update identity values, using cluster if present + sce_meta <- metadata(sce) + sobj$orig.ident <- sce_meta$library_id + if (is.null(sce$cluster)) { + Seurat::Idents(sobj) <- sce_meta$library_id + } else { + Seurat::Idents(sobj) <- sce$cluster + } + + # add variable features if present + if (!is.null(sce_meta$highly_variable_genes)) { + sobj[["RNA"]]@var.features <- sce_meta$highly_variable_genes + sce_meta$highly_variable_genes <- NULL # remove from metadata to avoid redundancy + } + + # convert and set functions depending on assay version requested + if (seurat_assay_version == "v5") { + create_seurat_assay <- SeuratObject::CreateAssay5Object + suppressWarnings(sobj[["RNA"]] <- as(sobj[["RNA"]], "Assay5")) + } else { + create_seurat_assay <- SeuratObject::CreateAssayObject + suppressWarnings(sobj[["RNA"]] <- as(sobj[["RNA"]], "Assay")) + } + + # add spliced data if present + if ("spliced" %in% assayNames(sce)) { + sobj[["spliced"]] <- create_seurat_assay(counts = assay(sce, "spliced")) + } + + # add altExps as needed. + for (alt_exp_name in names(alt_exps)) { + alt_exp <- alt_exps[[alt_exp_name]] + stopifnot( + "All altExps must contain a `counts` assay." = "counts" %in% assayNames(alt_exp) + ) + + # check name compatibility for Seurat + alt_exp_rownames <- rownames(alt_exp) + if (any(grepl("_", alt_exp_rownames))) { + warning( + "Replacing underscores ('_') with dashes ('-') in feature names from ", + alt_exp_name, + " for Seurat compatibility." + ) + rownames(alt_exp) <- gsub("_", "-", alt_exp_rownames) + } + + sobj[[alt_exp_name]] <- create_seurat_assay(counts = counts(alt_exp)) + if ("logcounts" %in% assayNames(alt_exp)) { + sobj[[alt_exp_name]]$data <- logcounts(alt_exp) + } + } + + # add sample-level metadata after removing non-vector objects + sobj@misc <- sce_meta |> + purrr::discard(is.object) |> + purrr::discard(is.list) + + # scale data because many Seurat functions expect this + sobj <- Seurat::ScaleData(sobj, assay = "RNA", verbose = FALSE) + + return(sobj) +} diff --git a/R/sum-duplicate-genes.R b/R/sum-duplicate-genes.R new file mode 100644 index 0000000..a4f512f --- /dev/null +++ b/R/sum-duplicate-genes.R @@ -0,0 +1,129 @@ +#' Sum counts for genes with duplicate names in a SingleCellExperiment object. +#' +#' Genes with the same name are merged by summing their raw expression counts. +#' When multiple Ensembl gene IDs are associated with the same gene symbol, +#' identifier conversion can result in duplicate gene names. This function +#' resolves such duplicates by summing the expression values for each duplicate +#' gene name, which may be justified if the different Ensembl gene IDs share +#' substantial sequence identity, which could make separate quantification of +#' the two genes less reliable. +#' +#' If requested, the log-normalized expression values are recalculated, +#' otherwise that matrix is left blank. +#' +#' PCA and UMAP are also recalculated if requested (which requires recalculating +#' the normalized expression). If not recalculating, the original reduced +#' dimensions are retained as they are likely to remain similar in most cases. +#' +#' +#' @param sce a SingleCellExperiment object with duplicated row names +#' @param normalize a logical indicating whether to normalize the expression +#' values. Default is TRUE. +#' @param recalculate_reduced_dims a logical indicating whether to recalculate +#' PCA and UMAP. If FALSE, the input reduced dimensions are copied over. If +#' TRUE, the highly variable genes are also recalculated with the new values +#' stored in metadata. Default is FALSE. +#' +#' @return a SingleCellExperiment object +#' @export +#' +#' @import SingleCellExperiment +#' @import SummarizedExperiment +#' +#' @examples +#' \dontrun{ +#' # sum across duplicated gene names +#' summed_sce <- sum_duplicate_genes(sce) +#' +#' # sum across duplicated gene names and recalculate PCA and UMAP +#' summed_sce <- sum_duplicate_genes(sce, recalculate_reduced_dims = TRUE) +#' } +#' +sum_duplicate_genes <- function(sce, normalize = TRUE, recalculate_reduced_dims = FALSE) { + stopifnot( + "sce must be a SingleCellExperiment object" = is(sce, "SingleCellExperiment"), + "normalize must be a logical" = is.logical(normalize), + "recalculate_reduced_dims must be a logical" = is.logical(recalculate_reduced_dims), + "normalize can not be FALSE if recalculate_reduced_dims is TRUE" = normalize || !recalculate_reduced_dims + ) + if (normalize) { + stopifnot( + "Package `scran` must be installed if `normalize = TRUE` is set." = + requireNamespace("scran", quietly = TRUE), + "Package `scuttle` must be installed if `normalize = TRUE` is set." = + requireNamespace("scuttle", quietly = TRUE) + ) + } + stopifnot( + "Package `scater` must be installed if `recalculate_reduced_dims = TRUE` is set." = + requireNamespace("scater", quietly = TRUE) + ) + + if (!any(duplicated(rownames(sce)))) { + message("No duplicated gene names found in the SingleCellExperiment object, returning original object") + return(sce) + } + + # calculate the reduced matrices + counts <- rowsum(counts(sce), rownames(sce)) |> as("sparseMatrix") + if ("spliced" %in% assayNames(sce)) { + spliced <- rowsum(assay(sce, "spliced"), rownames(sce)) |> as("sparseMatrix") + assays <- list(counts = counts, spliced = spliced) + } else { + assays <- list(counts = counts) + } + + + if (recalculate_reduced_dims) { + reduced_dims <- list() + } else { + reduced_dims <- reducedDims(sce) + } + + + # Build the new SingleCellExperiment object + summed_sce <- SingleCellExperiment( + assays = assays, + colData = colData(sce), + metadata = metadata(sce), + # if we are not recalculating reduced dimensions, copy over previous (likely similar) + reducedDims = reduced_dims, + altExps = altExps(sce) + ) + + # Add normalized values if requested + if (normalize) { + try({ + # try to cluster similar cells + # clustering may fail if < 100 cells in dataset + suppressWarnings({ + qclust <- scran::quickCluster(summed_sce) + summed_sce <- scran::computeSumFactors(summed_sce, clusters = qclust) + }) + }) + summed_sce <- scuttle::logNormCounts(summed_sce) + } + + # recalculate PCA and UMAP if requested, using the same dimensions as before (if available) + if (recalculate_reduced_dims) { + if ("PCA" %in% reducedDimNames(sce)) { + pca_dim <- ncol(reducedDim(sce, "PCA")) + } else { + pca_dim <- min(50, ncol(sce) - 1) # always reduce dimensions at least 1! + } + + if (is.null(metadata(sce)$highly_variable_genes)) { + n_hvg <- 2000 # same default as ScPCA + } else { + n_hvg <- length(metadata(sce)$highly_variable_genes) + } + hv_genes <- scran::modelGeneVar(summed_sce) |> + scran::getTopHVGs(n = n_hvg) + metadata(summed_sce)$highly_variable_genes <- hv_genes # store the new HVGs + + summed_sce <- scater::runPCA(summed_sce, subset_row = hv_genes, ncomponents = pca_dim) |> + scater::runUMAP() + } + + return(summed_sce) +} diff --git a/data-raw/README.md b/data-raw/README.md index 608c806..0432140 100644 --- a/data-raw/README.md +++ b/data-raw/README.md @@ -1,7 +1,12 @@ This directory contains scripts to download and preprocess data used within the rOpenScPCA package. There are also notebooks that explore some of the prepared datasets for exploration. -## Building Gene References +## Test data + +- The `download_test_data.R` script downloads the test data used in this package from the OpenScPCA test data bucket. + + +## Building gene references - The `build_gene_references.R` script creates a table of Ensembl id to gene symbol references. The initial table of gene ids and gene symbols is extracted from an example ScPCA-formatted SCE object (`rOpenScPCA/tests/testthat/data/scpca_sce.rds`). diff --git a/data-raw/build_gene_reference.R b/data-raw/build_gene_reference.R index 9860b7f..90af14f 100644 --- a/data-raw/build_gene_reference.R +++ b/data-raw/build_gene_reference.R @@ -11,9 +11,10 @@ suppressPackageStartupMessages({ ## Download the reference data ------------------------------------------------- +setwd(here::here()) # Read in the test data ScPCA SCE object and extract the row data: -genes_scpca <- readRDS(here::here("tests", "testthat", "data", "scpca_sce.rds")) |> +genes_scpca <- readRDS(testthat::test_path("data", "scpca_sce.rds")) |> rowData() |> as.data.frame() |> # Use Ensembl ID if gene symbol is missing, then make unique diff --git a/data-raw/download_test_data.R b/data-raw/download_test_data.R new file mode 100644 index 0000000..76f9288 --- /dev/null +++ b/data-raw/download_test_data.R @@ -0,0 +1,14 @@ +#!/usr/env Rscript + +# Downloads a test data file from OpenScPCA and places it in the test directory + +setwd(here::here()) +url_base <- "https://openscpca-test-data-release-public-access.s3.us-east-2.amazonaws.com/test/SCPCP000001/SCPCS000001/" # nolint + +test_processed_path <- testthat::test_path("data", "scpca_sce.rds") +processed_url <- paste0(url_base, "SCPCL000001_processed.rds") +download.file(processed_url, test_processed_path, mode = "wb") + +test_filtered_path <- testthat::test_path("data", "filtered_sce.rds") +filtered_url <- paste0(url_base, "SCPCL000001_filtered.rds") +download.file(filtered_url, test_filtered_path, mode = "wb") diff --git a/man/ensembl_to_symbol.Rd b/man/ensembl_to_symbol.Rd index bb380cc..c7f1f9e 100644 --- a/man/ensembl_to_symbol.Rd +++ b/man/ensembl_to_symbol.Rd @@ -9,26 +9,31 @@ ensembl_to_symbol( reference = c("scpca", "10x2020", "10x2024"), sce = NULL, unique = FALSE, - leave_na = FALSE + leave_na = FALSE, + seurat_compatible = FALSE ) } \arguments{ \item{ensembl_ids}{A character vector of Ensembl gene ids to translate to gene symbols.} -\item{reference}{The reference gene list to use for translation. One of `scpca`, -`10x2020`, `10x2024`. The `scpca` reference is the default.} +\item{reference}{The reference gene list to use for translation. One of +`scpca`, `10x2020`, `10x2024`. The `scpca` reference is the default.} -\item{sce}{A SingleCellExperiment object to use as a reference for gene symbols. -If provided, the `reference` argument will be ignored. The `sce` object must -include columns with the names `gene_ids` (containing Ensembl ids) and -`gene_symbol` (containing the symbols) to use for conversion.} +\item{sce}{A SingleCellExperiment object to use as a reference for gene +symbols. If provided, the `reference` argument will be ignored. The `sce` +object must include columns with the names `gene_ids` (containing Ensembl +ids) and `gene_symbol` (containing the symbols) to use for conversion.} -\item{unique}{Whether to use unique gene symbols, as would be done if -data had been read in with gene symbols by Seurat. Default is FALSE.} +\item{unique}{Whether to use unique gene symbols, as would be done if data +had been read in with gene symbols by Seurat. Default is FALSE.} -\item{leave_na}{Whether to leave NA values in the output vector. -If FALSE, any missing values will be replaced with the input ensembl_id value. +\item{leave_na}{Whether to leave NA values in the output vector. If FALSE, +any missing values will be replaced with the input ensembl_id value. +Default is FALSE.} + +\item{seurat_compatible}{Whether to return a vector that is compatible with +Seurat, translating and underscores to dashes. Default is FALSE.} } \value{ diff --git a/man/sce_to_seurat.Rd b/man/sce_to_seurat.Rd new file mode 100644 index 0000000..49543ff --- /dev/null +++ b/man/sce_to_seurat.Rd @@ -0,0 +1,62 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/make-seurat.R +\name{sce_to_seurat} +\alias{sce_to_seurat} +\title{Convert an SCE object to Seurat} +\usage{ +sce_to_seurat( + sce, + use_symbols = TRUE, + reference = c("sce", "scpca", "10x2020", "10x2024"), + dedup_method = c("unique", "sum"), + seurat_assay_version = c("v3", "v5") +) +} +\arguments{ +\item{sce}{The input SCE object} + +\item{use_symbols}{Logical indicating whether the Seurat object uses gene +symbols for indexing. Default is TRUE.} + +\item{reference}{If `use_symbols` is TRUE, the reference to use for gene +symbols. One of `sce`, `scpca`, `10x2020`, `10x2024`, where `sce` uses the +symbols stored in the `gene_symbol` column of the SCE object's row data, +and the others use standardized translation tables. See `ensembl_to_symbol()` +for more information. Default is `sce`.} + +\item{dedup_method}{Method to handle duplicated gene symbols. If `unique`, +the gene symbols will be made unique following standard Seurat procedures. +If `sum`, the expression values for each gene symbols will be summed.} + +\item{seurat_assay_version}{Whether to create Seurat `v3` or `v5` assay +objects. Default is `v3`.} +} +\value{ +a Seurat object +} +\description{ +Converts an ScPCA SingleCellExperiment (SCE) object to Seurat format. This is +primarily a wrapper around Seurat::as.Seurat() with some additional steps to +include ScPCA metadata and options for converting the feature index from +Ensembl gene ids to gene symbols. +} +\details{ +If present, reduced dimensions from SCE objects will be included, renamed to +match Seurat default naming. +} +\examples{ +\dontrun{ +# convert an ScPCA SCE to Seurat renaming with gene symbols +seurat_obj <- sce_to_seurat(sce) + +# convert SCE to Seurat keeping the current names (usually Ensembl) +seurat_obj <- sce_to_seurat(sce, use_symbols = FALSE) + +# convert SCE to Seurat, merging duplicated gene names +seurat_obj <- sce_to_seurat(sce, dedup_method = "sum") + +# convert SCE to Seurat with v5 assay objects +seurat_obj <- sce_to_seurat(sce, seurat_assay_version = "v5") +} + +} diff --git a/man/sce_to_symbols.Rd b/man/sce_to_symbols.Rd index 5af59e8..504c747 100644 --- a/man/sce_to_symbols.Rd +++ b/man/sce_to_symbols.Rd @@ -9,23 +9,29 @@ sce_to_symbols( reference = c("sce", "scpca", "10x2020", "10x2024"), unique = FALSE, convert_hvg = TRUE, - convert_pca = TRUE + convert_pca = TRUE, + seurat_compatible = FALSE ) } \arguments{ -\item{sce}{A SingleCellExperiment object containing gene ids and gene symbols.} +\item{sce}{A SingleCellExperiment object containing gene ids and gene +symbols.} -\item{reference}{The reference gene list for conversion. One of `sce`, `scpca`, -`10x2020`, or `10x2024`. If `sce` (the default) the internal row data is used.} +\item{reference}{The reference gene list for conversion. One of `sce`, +`scpca`, `10x2020`, or `10x2024`. If `sce` (the default) the internal row +data is used.} -\item{unique}{Whether to use unique gene symbols, as would be done if -data had been read in with gene symbols by Seurat. Default is FALSE.} +\item{unique}{Whether to use unique gene symbols, as would be done if data +had been read in with gene symbols by Seurat. Default is FALSE.} -\item{convert_hvg}{Logical indicating whether to convert highly variable genes to gene symbols. -Default is TRUE.} +\item{convert_hvg}{Logical indicating whether to convert highly variable +genes to gene symbols. Default is TRUE.} -\item{convert_pca}{Logical indicating whether to convert PCA rotation matrix to gene symbols. -Default is TRUE.} +\item{convert_pca}{Logical indicating whether to convert PCA rotation matrix +to gene symbols. Default is TRUE.} + +\item{seurat_compatible}{Logical indicating whether to make gene symbols +Seurat-compatible by replacing underscores with dashes. Default is FALSE.} } \value{ A SingleCellExperiment object with row names set as gene symbols. diff --git a/man/sum_duplicate_genes.Rd b/man/sum_duplicate_genes.Rd new file mode 100644 index 0000000..8421d11 --- /dev/null +++ b/man/sum_duplicate_genes.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sum-duplicate-genes.R +\name{sum_duplicate_genes} +\alias{sum_duplicate_genes} +\title{Sum counts for genes with duplicate names in a SingleCellExperiment object.} +\usage{ +sum_duplicate_genes(sce, normalize = TRUE, recalculate_reduced_dims = FALSE) +} +\arguments{ +\item{sce}{a SingleCellExperiment object with duplicated row names} + +\item{normalize}{a logical indicating whether to normalize the expression +values. Default is TRUE.} + +\item{recalculate_reduced_dims}{a logical indicating whether to recalculate +PCA and UMAP. If FALSE, the input reduced dimensions are copied over. If +TRUE, the highly variable genes are also recalculated with the new values +stored in metadata. Default is FALSE.} +} +\value{ +a SingleCellExperiment object +} +\description{ +Genes with the same name are merged by summing their raw expression counts. +When multiple Ensembl gene IDs are associated with the same gene symbol, +identifier conversion can result in duplicate gene names. This function +resolves such duplicates by summing the expression values for each duplicate +gene name, which may be justified if the different Ensembl gene IDs share +substantial sequence identity, which could make separate quantification of +the two genes less reliable. +} +\details{ +If requested, the log-normalized expression values are recalculated, +otherwise that matrix is left blank. + +PCA and UMAP are also recalculated if requested (which requires recalculating +the normalized expression). If not recalculating, the original reduced +dimensions are retained as they are likely to remain similar in most cases. +} +\examples{ +\dontrun{ +# sum across duplicated gene names +summed_sce <- sum_duplicate_genes(sce) + +# sum across duplicated gene names and recalculate PCA and UMAP +summed_sce <- sum_duplicate_genes(sce, recalculate_reduced_dims = TRUE) +} + +} diff --git a/renv.lock b/renv.lock index 787a0fa..6b52b1c 100644 --- a/renv.lock +++ b/renv.lock @@ -66,20 +66,6 @@ ], "Hash": "ef32d07aafdd12f24c5827374ae3590d" }, - "BiocIO": { - "Package": "BiocIO", - "Version": "1.14.0", - "Source": "Bioconductor", - "Repository": "Bioconductor 3.19", - "Requirements": [ - "BiocGenerics", - "R", - "S4Vectors", - "methods", - "tools" - ], - "Hash": "f97a7ef01d364cf20d1946d43a3d526f" - }, "BiocManager": { "Package": "BiocManager", "Version": "1.30.25", @@ -156,26 +142,6 @@ ], "Hash": "b892e27fc9659a4c8f8787d34c37b8b2" }, - "Biostrings": { - "Package": "Biostrings", - "Version": "2.72.1", - "Source": "Bioconductor", - "Repository": "Bioconductor 3.19", - "Requirements": [ - "BiocGenerics", - "GenomeInfoDb", - "IRanges", - "R", - "S4Vectors", - "XVector", - "crayon", - "grDevices", - "methods", - "stats", - "utils" - ], - "Hash": "886ff0ed958d6f839ed2e0d01f6853b3" - }, "Cairo": { "Package": "Cairo", "Version": "1.6-2", @@ -294,28 +260,6 @@ ], "Hash": "c3c792a7b7f2677be56e8632c5b7543d" }, - "GenomicAlignments": { - "Package": "GenomicAlignments", - "Version": "1.40.0", - "Source": "Bioconductor", - "Repository": "Bioconductor 3.19", - "Requirements": [ - "BiocGenerics", - "BiocParallel", - "Biostrings", - "GenomeInfoDb", - "GenomicRanges", - "IRanges", - "R", - "Rsamtools", - "S4Vectors", - "SummarizedExperiment", - "methods", - "stats", - "utils" - ], - "Hash": "e539709764587c581b31e446dc84d7b8" - }, "GenomicRanges": { "Package": "GenomicRanges", "Version": "1.56.1", @@ -494,18 +438,6 @@ ], "Hash": "45f0398006e83a5b10b72a90663d8d8c" }, - "RCurl": { - "Package": "RCurl", - "Version": "1.98-1.16", - "Source": "Repository", - "Repository": "RSPM", - "Requirements": [ - "R", - "bitops", - "methods" - ], - "Hash": "ddbdf53d15b47be4407ede6914f56fbb" - }, "ROCR": { "Package": "ROCR", "Version": "1.0-11", @@ -637,41 +569,6 @@ ], "Hash": "c92ba8b9a2c5c9ff600a1062a3b7b727" }, - "Rhtslib": { - "Package": "Rhtslib", - "Version": "3.0.0", - "Source": "Bioconductor", - "Repository": "Bioconductor 3.19", - "Requirements": [ - "tools", - "zlibbioc" - ], - "Hash": "5d6514cd44a0106581e3310f3972a82e" - }, - "Rsamtools": { - "Package": "Rsamtools", - "Version": "2.20.0", - "Source": "Bioconductor", - "Repository": "Bioconductor 3.19", - "Requirements": [ - "BiocGenerics", - "BiocParallel", - "Biostrings", - "GenomeInfoDb", - "GenomicRanges", - "IRanges", - "R", - "Rhtslib", - "S4Vectors", - "XVector", - "bitops", - "methods", - "stats", - "utils", - "zlibbioc" - ], - "Hash": "9762f24dcbdbd1626173c516bb64792c" - }, "Rtsne": { "Package": "Rtsne", "Version": "0.17", @@ -899,18 +796,6 @@ ], "Hash": "83d45b690bffd09d1980c224ef329f5b" }, - "XML": { - "Package": "XML", - "Version": "3.99-0.17", - "Source": "Repository", - "Repository": "RSPM", - "Requirements": [ - "R", - "methods", - "utils" - ], - "Hash": "bc2a8a1139d8d4bd9c46086708945124" - }, "XVector": { "Package": "XVector", "Version": "0.44.0", @@ -998,30 +883,6 @@ ], "Hash": "0f4e9d8caa6feaa7e409ae6c30f2ca66" }, - "bit": { - "Package": "bit", - "Version": "4.5.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R" - ], - "Hash": "5dc7b2677d65d0e874fc4aaf0e879987" - }, - "bit64": { - "Package": "bit64", - "Version": "4.5.2", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "bit", - "methods", - "stats", - "utils" - ], - "Hash": "e84984bf5f12a18628d9a02322128dfd" - }, "bitops": { "Package": "bitops", "Version": "1.0-8", @@ -1396,14 +1257,13 @@ }, "evaluate": { "Package": "evaluate", - "Version": "0.24.0", + "Version": "1.0.1", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ - "R", - "methods" + "R" ], - "Hash": "a1066cbc05caee9a4bf6d90f194ff4da" + "Hash": "3fd29944b231036ad67c3edb32e02201" }, "fansi": { "Package": "fansi", @@ -1810,20 +1670,6 @@ ], "Hash": "d65ba49117ca223614f71b60d85b8ab7" }, - "hms": { - "Package": "hms", - "Version": "1.1.3", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "lifecycle", - "methods", - "pkgconfig", - "rlang", - "vctrs" - ], - "Hash": "b59377caa7ed00fa41808342002138f9" - }, "htmltools": { "Package": "htmltools", "Version": "0.5.8.1", @@ -2197,6 +2043,16 @@ ], "Hash": "e2817ccf4a065c5d9d7f2cfbe7c1d78c" }, + "metapod": { + "Package": "metapod", + "Version": "1.12.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "Rcpp" + ], + "Hash": "026552a86c3aa0d92d4d8b12d80010cc" + }, "mgcv": { "Package": "mgcv", "Version": "1.9-1", @@ -2474,16 +2330,6 @@ "Repository": "CRAN", "Hash": "a555924add98c99d2f411e37e7d25e9f" }, - "prettyunits": { - "Package": "prettyunits", - "Version": "1.2.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R" - ], - "Hash": "6b01fc98b1e86c4f705ce9dcfd2f57c7" - }, "processx": { "Package": "processx", "Version": "3.8.4", @@ -2497,20 +2343,6 @@ ], "Hash": "0c90a7d71988856bad2a2a45dd871bb9" }, - "progress": { - "Package": "progress", - "Version": "1.2.3", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "R6", - "crayon", - "hms", - "prettyunits" - ], - "Hash": "f4625e061cb2865f111b47ff163a5ca6" - }, "progressr": { "Package": "progressr", "Version": "0.14.0", @@ -2586,48 +2418,12 @@ ], "Hash": "5e3c5dc0b071b21fa128676560dbe94d" }, - "readr": { - "Package": "readr", - "Version": "2.1.5", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "R6", - "cli", - "clipr", - "cpp11", - "crayon", - "hms", - "lifecycle", - "methods", - "rlang", - "tibble", - "tzdb", - "utils", - "vroom" - ], - "Hash": "9de96463d2117f6ac49980577939dfb3" - }, - "rematch2": { - "Package": "rematch2", - "Version": "2.1.2", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "tibble" - ], - "Hash": "76c9e04c712a05848ae7a23d2f170a40" - }, "renv": { "Package": "renv", "Version": "1.0.11", - "Source": "Repository", - "Repository": "RSPM", - "Requirements": [ - "utils" - ], - "Hash": "47623f66b4e80b3b0587bc5d7b309888" + "OS_type": null, + "Repository": "CRAN", + "Source": "Repository" }, "reshape2": { "Package": "reshape2", @@ -2642,22 +2438,6 @@ ], "Hash": "bb5996d0bd962d214a11140d77589917" }, - "restfulr": { - "Package": "restfulr", - "Version": "0.0.15", - "Source": "Repository", - "Repository": "RSPM", - "Requirements": [ - "R", - "RCurl", - "S4Vectors", - "XML", - "methods", - "rjson", - "yaml" - ], - "Hash": "44651c1e68eda9d462610aca9f15a815" - }, "reticulate": { "Package": "reticulate", "Version": "1.39.0", @@ -2703,16 +2483,6 @@ ], "Hash": "99e15369f8fb17dc188377234de13fc6" }, - "rjson": { - "Package": "rjson", - "Version": "0.2.23", - "Source": "Repository", - "Repository": "RSPM", - "Requirements": [ - "R" - ], - "Hash": "7a04e9eff95857dbf557b4e5f0b3d1a8" - }, "rlang": { "Package": "rlang", "Version": "1.1.4", @@ -2775,33 +2545,6 @@ ], "Hash": "b462187d887abc519894874486dbd6fd" }, - "rtracklayer": { - "Package": "rtracklayer", - "Version": "1.64.0", - "Source": "Bioconductor", - "Repository": "Bioconductor 3.19", - "Requirements": [ - "BiocGenerics", - "BiocIO", - "Biostrings", - "GenomeInfoDb", - "GenomicAlignments", - "GenomicRanges", - "IRanges", - "R", - "Rsamtools", - "S4Vectors", - "XML", - "XVector", - "curl", - "httr", - "methods", - "restfulr", - "tools", - "zlibbioc" - ], - "Hash": "3d6f004fce582bd7d68e2e18d44abbc1" - }, "sass": { "Package": "sass", "Version": "0.4.9", @@ -2885,6 +2628,38 @@ ], "Hash": "d316e4abb854dd1677f7bd3ad08bc4e8" }, + "scran": { + "Package": "scran", + "Version": "1.32.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "BH", + "BiocGenerics", + "BiocParallel", + "BiocSingular", + "DelayedArray", + "DelayedMatrixStats", + "Matrix", + "Rcpp", + "S4Vectors", + "SingleCellExperiment", + "SummarizedExperiment", + "beachmat", + "bluster", + "dqrng", + "edgeR", + "igraph", + "limma", + "metapod", + "methods", + "scuttle", + "statmod", + "stats", + "utils" + ], + "Hash": "5b11173a6b49f06dda0db31e80caa561" + }, "sctransform": { "Package": "sctransform", "Version": "0.4.1", @@ -3275,7 +3050,7 @@ }, "testthat": { "Package": "testthat", - "Version": "3.2.1.1", + "Version": "3.2.2", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -3300,7 +3075,7 @@ "waldo", "withr" ], - "Hash": "3f6e7e5e2220856ff865e4834766bf2b" + "Hash": "6773967afbe2f74c87021e72c1bb05c0" }, "textshaping": { "Package": "textshaping", @@ -3383,17 +3158,6 @@ ], "Hash": "cfbad971a71f0e27cec22e544a08bc3b" }, - "tzdb": { - "Package": "tzdb", - "Version": "0.4.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cpp11" - ], - "Hash": "f561504ec2897f4d46f0c7657e488ae1" - }, "usethis": { "Package": "usethis", "Version": "3.0.0", @@ -3502,35 +3266,9 @@ ], "Hash": "c826c7c4241b6fc89ff55aaea3fa7491" }, - "vroom": { - "Package": "vroom", - "Version": "1.6.5", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "bit64", - "cli", - "cpp11", - "crayon", - "glue", - "hms", - "lifecycle", - "methods", - "progress", - "rlang", - "stats", - "tibble", - "tidyselect", - "tzdb", - "vctrs", - "withr" - ], - "Hash": "390f9315bc0025be03012054103d227c" - }, "waldo": { "Package": "waldo", - "Version": "0.5.3", + "Version": "0.6.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -3539,11 +3277,9 @@ "diffobj", "glue", "methods", - "rematch2", - "rlang", - "tibble" + "rlang" ], - "Hash": "16aa934a49658677d8041df9017329b9" + "Hash": "52f574062a7b66e56926988c3fbdb3b7" }, "whisker": { "Package": "whisker", @@ -3554,7 +3290,7 @@ }, "withr": { "Package": "withr", - "Version": "3.0.1", + "Version": "3.0.2", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -3562,7 +3298,7 @@ "grDevices", "graphics" ], - "Hash": "07909200e8bbe90426fbfeb73e1e27aa" + "Hash": "cc2d62c76458d425210d1eb1478b30b4" }, "xfun": { "Package": "xfun", diff --git a/tests/testthat/data/filtered_sce.rds b/tests/testthat/data/filtered_sce.rds new file mode 100644 index 0000000..6d2413a Binary files /dev/null and b/tests/testthat/data/filtered_sce.rds differ diff --git a/tests/testthat/data/scpca_sce.rds b/tests/testthat/data/scpca_sce.rds index de4562d..8fcb17e 100644 Binary files a/tests/testthat/data/scpca_sce.rds and b/tests/testthat/data/scpca_sce.rds differ diff --git a/tests/testthat/test-convert-gene-ids.R b/tests/testthat/test-convert-gene-ids.R index 23ec3c2..6be0083 100644 --- a/tests/testthat/test-convert-gene-ids.R +++ b/tests/testthat/test-convert-gene-ids.R @@ -8,7 +8,7 @@ test_that("basic gene symbol conversion works", { test_that("gene symbol conversion works with unexpected ids", { ensembl_ids <- c("ENSG00000141510", "ENSG00000134323", "foobar") - expect_warning(gene_symbols <- ensembl_to_symbol(ensembl_ids)) + expect_message(gene_symbols <- ensembl_to_symbol(ensembl_ids)) expect_equal(gene_symbols, c("TP53", "MYCN", "foobar")) expect_no_warning(gene_symbols_na <- ensembl_to_symbol(ensembl_ids, leave_na = TRUE)) @@ -45,13 +45,25 @@ test_that("gene symbol conversion works using an SCE reference", { sce <- readRDS(test_path("data", "scpca_sce.rds")) ensembl_ids <- c("ENSG00000015479", "ENSG00000269226") - gene_symbols <- ensembl_to_symbol(ensembl_ids, sce = sce, unique = FALSE) + expect_message(gene_symbols <- ensembl_to_symbol(ensembl_ids, sce = sce, unique = FALSE)) expect_equal(gene_symbols, c("MATR3", "TMSB15B")) - gene_symbols <- ensembl_to_symbol(ensembl_ids, sce = sce, unique = TRUE) + expect_message(gene_symbols <- ensembl_to_symbol(ensembl_ids, sce = sce, unique = TRUE)) expect_equal(gene_symbols, c("MATR3.1", "TMSB15B.1")) }) +test_that("gene symbol conversion in seurat compatibility mode works", { + ensembl_ids <- c("ENSG00000141510", "ENSG00000134323") + gene_symbols <- ensembl_to_symbol(ensembl_ids, unique = FALSE, seurat_compatible = TRUE) + expect_equal(gene_symbols, c("TP53", "MYCN")) + + ensembl_ids <- c("ENSG00000285609", "ENSG00000252254", "ENSG00000283274") + expect_warning( # this includes name changes for compatibility, so a warning is expected + gene_symbols <- ensembl_to_symbol(ensembl_ids, unique = FALSE, seurat_compatible = TRUE) + ) + expect_equal(gene_symbols, c("5S-rRNA", "Y-RNA", "5-8S-rRNA")) +}) + test_that("conversion of a full sce object works as expected", { @@ -60,7 +72,7 @@ test_that("conversion of a full sce object works as expected", { names(gene_symbols) <- rowData(sce)$gene_ids gene_symbols[is.na(gene_symbols)] <- names(gene_symbols)[is.na(gene_symbols)] - expect_warning(converted_sce <- sce_to_symbols(sce)) + expect_message(converted_sce <- sce_to_symbols(sce)) |> expect_message() # two messages expected here expect_equal(rownames(converted_sce), unname(gene_symbols)) # check that hvg and PCA were converted too. diff --git a/tests/testthat/test-make-seurat.R b/tests/testthat/test-make-seurat.R new file mode 100644 index 0000000..df03a77 --- /dev/null +++ b/tests/testthat/test-make-seurat.R @@ -0,0 +1,239 @@ +test_that("SCE to Seurat with no id conversion works", { + sce <- readRDS(test_path("data", "scpca_sce.rds")) + seurat_obj <- sce_to_seurat(sce, use_symbols = FALSE) + expect_s4_class(seurat_obj, "Seurat") + + expect_equal(dim(seurat_obj), dim(sce)) + expect_setequal(rownames(seurat_obj), rownames(sce)) + expect_setequal(colnames(seurat_obj), colnames(sce)) + + expect_equal(names(seurat_obj@assays), c("RNA", "spliced")) + expect_equal(Seurat::DefaultAssay(seurat_obj), "RNA") + + # assay types + expect_s4_class(seurat_obj[["RNA"]], "Assay") + expect_s4_class(seurat_obj[["spliced"]], "Assay") + + # assay contents + expect_contains( + slotNames(seurat_obj[["RNA"]]), + c("counts", "data", "scale.data", "var.features", "meta.features") + ) + + expect_equal(seurat_obj[["RNA"]]$counts, counts(sce)) + expect_equal(seurat_obj[["RNA"]]$data, logcounts(sce)) + + expect_equal( + Seurat::VariableFeatures(seurat_obj[["RNA"]]), + metadata(sce)$highly_variable_genes, + ignore_attr = TRUE + ) + + expect_equal(names(seurat_obj@reductions), c("pca", "umap")) + expect_equal( + dim(seurat_obj[["pca"]]), + dim(reducedDim(sce, "PCA")) + ) + expect_equal( + dim(seurat_obj[["umap"]]), + dim(reducedDim(sce, "UMAP")) + ) +}) + + +test_that("SCE to Seurat with id conversion works as expected", { + sce <- readRDS(test_path("data", "scpca_sce.rds")) + expect_warning( + seurat_obj <- sce_to_seurat(sce, use_symbols = TRUE, reference = "sce") + ) + expect_s4_class(seurat_obj, "Seurat") + + new_genes <- suppressMessages( + ensembl_to_symbol(rownames(sce), unique = TRUE, sce = sce) + ) + new_genes <- gsub("_", "-", new_genes) + + hv_genes <- suppressMessages( + ensembl_to_symbol(metadata(sce)$highly_variable_genes, unique = TRUE, sce = sce) + ) + hv_genes <- gsub("_", "-", hv_genes) + + expect_equal(dim(seurat_obj), dim(sce)) + expect_setequal(colnames(seurat_obj), colnames(sce)) + expect_setequal(rownames(seurat_obj), new_genes) + + expect_equal(names(seurat_obj@assays), c("RNA", "spliced")) + expect_equal(Seurat::DefaultAssay(seurat_obj), "RNA") + + # assay types + expect_s4_class(seurat_obj[["RNA"]], "Assay") + expect_s4_class(seurat_obj[["spliced"]], "Assay") + + # assay contents + expect_contains( + slotNames(seurat_obj[["RNA"]]), + c("counts", "data", "scale.data", "var.features", "meta.features") + ) + + expect_equal( + Seurat::VariableFeatures(seurat_obj[["RNA"]]), + hv_genes, + ignore_attr = TRUE + ) + expect_equal(names(seurat_obj@reductions), c("pca", "umap")) + expect_equal(names(seurat_obj@reductions), c("pca", "umap")) + expect_equal( + dim(seurat_obj[["pca"]]), + dim(reducedDim(sce, "PCA")) + ) + expect_equal( + dim(seurat_obj[["umap"]]), + dim(reducedDim(sce, "UMAP")) + ) +}) + +test_that("SCE to Seurat with id conversion and 10x reference works as expected", { + sce <- readRDS(test_path("data", "scpca_sce.rds")) + seurat_obj <- sce_to_seurat(sce, use_symbols = TRUE, reference = "10x2020") + expect_s4_class(seurat_obj, "Seurat") + + new_genes <- suppressMessages( + ensembl_to_symbol(rownames(sce), unique = TRUE, reference = "10x2020") + ) + new_genes <- gsub("_", "-", new_genes) + + hv_genes <- suppressMessages( + ensembl_to_symbol(metadata(sce)$highly_variable_genes, unique = TRUE, reference = "10x2020") + ) + hv_genes <- gsub("_", "-", hv_genes) + + expect_equal(dim(seurat_obj), dim(sce)) + expect_setequal(colnames(seurat_obj), colnames(sce)) + expect_setequal(rownames(seurat_obj), new_genes) + + expect_equal(names(seurat_obj@assays), c("RNA", "spliced")) + expect_equal(Seurat::DefaultAssay(seurat_obj), "RNA") + + + # assay types + expect_s4_class(seurat_obj[["RNA"]], "Assay") + expect_s4_class(seurat_obj[["spliced"]], "Assay") + + # assay contents + expect_contains( + slotNames(seurat_obj[["RNA"]]), + c("counts", "data", "scale.data", "var.features", "meta.features") + ) + + expect_equal( + Seurat::VariableFeatures(seurat_obj[["RNA"]]), + hv_genes, + ignore_attr = TRUE + ) + expect_equal(names(seurat_obj@reductions), c("pca", "umap")) + expect_equal(names(seurat_obj@reductions), c("pca", "umap")) + expect_equal( + dim(seurat_obj[["pca"]]), + dim(reducedDim(sce, "PCA")) + ) + expect_equal( + dim(seurat_obj[["umap"]]), + dim(reducedDim(sce, "UMAP")) + ) +}) + +test_that("conversion works with altExps", { + sce <- readRDS(test_path("data", "scpca_sce.rds")) + altsce1 <- SingleCellExperiment(assays = list(counts = counts(sce)[1:10, ], logcounts = logcounts(sce)[1:10, ])) + altsce2 <- SingleCellExperiment(assays = list(counts = counts(sce)[1:3, ])) + rownames(altsce2) <- c("F_1", "F_2", "F_3") # check feature names with underscores + altExps(sce) <- list( + alt1 = altsce1, + alt2 = altsce2 + ) + + expect_warning(seurat_obj <- sce_to_seurat(sce, use_symbols = FALSE)) + expect_s4_class(seurat_obj, "Seurat") + + expect_equal(dim(seurat_obj), dim(sce)) + expect_setequal(rownames(seurat_obj), rownames(sce)) + expect_setequal(colnames(seurat_obj), colnames(sce)) + + expect_setequal(names(seurat_obj@assays), c("RNA", "spliced", "alt1", "alt2")) + expect_equal(Seurat::DefaultAssay(seurat_obj), "RNA") + + # assay types + expect_s4_class(seurat_obj[["RNA"]], "Assay") + expect_s4_class(seurat_obj[["spliced"]], "Assay") + expect_s4_class(seurat_obj[["alt1"]], "Assay") + expect_s4_class(seurat_obj[["alt2"]], "Assay") +}) + +test_that("Seurat v5 conversion works", { + sce <- readRDS(test_path("data", "scpca_sce.rds")) + seurat_obj <- sce_to_seurat(sce, use_symbols = FALSE, seurat_assay_version = "v5") + expect_s4_class(seurat_obj, "Seurat") + + expect_equal(dim(seurat_obj), dim(sce)) + expect_setequal(rownames(seurat_obj), rownames(sce)) + expect_setequal(colnames(seurat_obj), colnames(sce)) + + expect_equal(names(seurat_obj@assays), c("RNA", "spliced")) + expect_equal(Seurat::DefaultAssay(seurat_obj), "RNA") + + # assay types + expect_s4_class(seurat_obj[["RNA"]], "Assay5") + expect_s4_class(seurat_obj[["spliced"]], "Assay5") + + # expected layers present + expect_contains( + names(seurat_obj[["RNA"]]@layers), + c("counts", "data", "scale.data") + ) + + expect_equal(seurat_obj[["RNA"]]$counts, counts(sce)) + expect_equal(seurat_obj[["RNA"]]$data, logcounts(sce)) + + expect_equal( + Seurat::VariableFeatures(seurat_obj[["RNA"]]), + metadata(sce)$highly_variable_genes, + ignore_attr = TRUE + ) + + expect_equal(names(seurat_obj@reductions), c("pca", "umap")) + expect_equal( + dim(seurat_obj[["pca"]]), + dim(reducedDim(sce, "PCA")) + ) + expect_equal( + dim(seurat_obj[["umap"]]), + dim(reducedDim(sce, "UMAP")) + ) +}) + +test_that("Conversion works for non-processed samples", { + sce <- readRDS(test_path("data", "filtered_sce.rds")) + seurat_obj <- sce_to_seurat(sce, use_symbols = FALSE) + expect_s4_class(seurat_obj, "Seurat") + + expect_equal(dim(seurat_obj), dim(sce)) + expect_setequal(rownames(seurat_obj), rownames(sce)) + expect_setequal(colnames(seurat_obj), colnames(sce)) + + expect_equal(names(seurat_obj@assays), c("RNA", "spliced")) + expect_equal(Seurat::DefaultAssay(seurat_obj), "RNA") + + # assay types + expect_s4_class(seurat_obj[["RNA"]], "Assay") + expect_s4_class(seurat_obj[["spliced"]], "Assay") + + # assay contents + expect_contains( + slotNames(seurat_obj[["RNA"]]), + c("counts", "data", "scale.data", "var.features", "meta.features") + ) + + expect_equal(seurat_obj[["RNA"]]$counts, counts(sce)) + + expect_null(names(seurat_obj@reductions)) +}) diff --git a/tests/testthat/test-sum-duplicate-genes.R b/tests/testthat/test-sum-duplicate-genes.R new file mode 100644 index 0000000..127c098 --- /dev/null +++ b/tests/testthat/test-sum-duplicate-genes.R @@ -0,0 +1,39 @@ +test_that("merging works as expected", { + sce <- readRDS(test_path("data", "scpca_sce.rds")) + double_sce <- rbind(sce, sce) + deduped_sce <- sum_duplicate_genes(double_sce) + + expect_equal(dim(deduped_sce), dim(sce)) + + expect_setequal(rownames(deduped_sce), rownames(sce)) + + + expect_equal( + counts(deduped_sce)[rownames(sce), ], # need to make the row order identical + 2 * counts(sce) |> Matrix::drop0() # original matrices may have explicit 0s + ) + expect_equal( + assay(deduped_sce, "spliced")[rownames(sce), ], + 2 * assay(sce, "spliced") |> Matrix::drop0() + ) +}) + +test_that("merging works as expected with unprocessed SCE", { + sce <- readRDS(test_path("data", "filtered_sce.rds")) + double_sce <- rbind(sce, sce) + deduped_sce <- sum_duplicate_genes(double_sce) + + expect_equal(dim(deduped_sce), dim(sce)) + + expect_setequal(rownames(deduped_sce), rownames(sce)) + + + expect_equal( + counts(deduped_sce)[rownames(sce), ], # need to make the row order identical + 2 * counts(sce) |> Matrix::drop0() # original matrices may have explicit 0s + ) + expect_equal( + assay(deduped_sce, "spliced")[rownames(sce), ], + 2 * assay(sce, "spliced") |> Matrix::drop0() + ) +})