AlexsLemonade · jashapiro · Dec 13, 2024 · Dec 5, 2024 · Dec 6, 2024 · Dec 6, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -21,8 +21,12 @@ LazyData: true
 Suggests:
     testthat (>= 3.0.0),
     scater,
+    scran,
     Seurat,
-    splatter
+    splatter,
+    scuttle,
+    Matrix,
+    SeuratObject
 Config/testthat/edition: 3
 RoxygenNote: 7.3.2
 Imports:
@@ -34,6 +38,7 @@ Imports:
     purrr,
     S4Vectors,
     SingleCellExperiment,
+    SummarizedExperiment,
     tibble,
     tidyr
 biocViews:

diff --git a/NAMESPACE b/NAMESPACE
@@ -6,9 +6,12 @@ export(calculate_silhouette)
 export(calculate_stability)
 export(ensembl_to_symbol)
 export(extract_pc_matrix)
+export(sce_to_seurat)
 export(sce_to_symbols)
+export(sum_duplicate_genes)
 export(sweep_clusters)
 import(SingleCellExperiment)
+import(SummarizedExperiment)
 import(methods)
 importFrom(S4Vectors,`metadata<-`)
 importFrom(S4Vectors,metadata)

diff --git a/R/convert-gene-ids.R b/R/convert-gene-ids.R
@@ -13,18 +13,21 @@
 #'
 #'
 #' @param ensembl_ids A character vector of Ensembl gene ids to translate to
-#'  gene symbols.
-#' @param reference The reference gene list to use for translation. One of `scpca`,
-#'   `10x2020`, `10x2024`. The `scpca` reference is the default.
-#' @param sce A SingleCellExperiment object to use as a reference for gene symbols.
-#'  If provided, the `reference` argument will be ignored. The `sce` object must
-#'  include columns with the names `gene_ids` (containing Ensembl ids) and
-#'  `gene_symbol` (containing the symbols) to use for conversion.
-#' @param unique Whether to use unique gene symbols, as would be done if
-#'  data had been read in with gene symbols by Seurat. Default is FALSE.
-#' @param leave_na Whether to leave NA values in the output vector.
-#' If FALSE, any missing values will be replaced with the input ensembl_id value.
-#' Default is FALSE.
+#'   gene symbols.
+#' @param reference The reference gene list to use for translation. One of
+#'   `scpca`, `10x2020`, `10x2024`. The `scpca` reference is the default.
+#' @param sce A SingleCellExperiment object to use as a reference for gene
+#'   symbols. If provided, the `reference` argument will be ignored. The `sce`
+#'   object must include columns with the names `gene_ids` (containing Ensembl
+#'   ids) and `gene_symbol` (containing the symbols) to use for conversion.
+#' @param unique Whether to use unique gene symbols, as would be done if data
+#'   had been read in with gene symbols by Seurat. Default is FALSE.
+#' @param leave_na Whether to leave NA values in the output vector. If FALSE,
+#'   any missing values will be replaced with the input ensembl_id value.
+#'   Default is FALSE.
+#' @param seurat_compatible Whether to return a vector that is compatible with
+#'   Seurat, translating and underscores to dashes.
+#'   Default is FALSE.
 #'
 #' @return A vector of gene symbols corresponding to the input Ensembl ids.
 #' @export
@@ -51,7 +54,8 @@ ensembl_to_symbol <- function(
     reference = c("scpca", "10x2020", "10x2024"),
     sce = NULL,
     unique = FALSE,
-    leave_na = FALSE) {
+    leave_na = FALSE,
+    seurat_compatible = FALSE) {
   reference <- match.arg(reference)
   stopifnot(
     "`ensembl_ids` must be a character vector." = is.character(ensembl_ids),
@@ -85,10 +89,17 @@ ensembl_to_symbol <- function(
     )
   }
   if (!leave_na && any(missing_symbols)) {
-    warning("Not all input ids have corresponding gene symbols, using input ids for missing values.")
+    message("Not all input ids have corresponding gene symbols, using input ids for missing values.")
     gene_symbols[missing_symbols] <- ensembl_ids[missing_symbols]
   }
 
+  if (seurat_compatible) {
+    if (any(grepl("_", gene_symbols))) {
+      warning("Replacing underscores ('_') with dashes ('-') in gene symbols for Seurat compatibility.")
+    }
+    gene_symbols <- gsub("_", "-", gene_symbols)
+  }
+
   return(gene_symbols)
 }
 
@@ -112,15 +123,19 @@ ensembl_to_symbol <- function(
 #' (and not disabled by the `convert_hvg` and `convert_pca` arguments).
 #'
 #'
-#' @param sce A SingleCellExperiment object containing gene ids and gene symbols.
-#' @param reference The reference gene list for conversion. One of `sce`, `scpca`,
-#' `10x2020`, or `10x2024`. If `sce` (the default) the internal row data is used.
-#' @param unique Whether to use unique gene symbols, as would be done if
-#' data had been read in with gene symbols by Seurat. Default is FALSE.
-#' @param convert_hvg Logical indicating whether to convert highly variable genes to gene symbols.
-#'  Default is TRUE.
-#' @param convert_pca Logical indicating whether to convert PCA rotation matrix to gene symbols.
-#'  Default is TRUE.
+#' @param sce A SingleCellExperiment object containing gene ids and gene
+#'   symbols.
+#' @param reference The reference gene list for conversion. One of `sce`,
+#'   `scpca`, `10x2020`, or `10x2024`. If `sce` (the default) the internal row
+#'   data is used.
+#' @param unique Whether to use unique gene symbols, as would be done if data
+#'   had been read in with gene symbols by Seurat. Default is FALSE.
+#' @param convert_hvg Logical indicating whether to convert highly variable
+#'   genes to gene symbols. Default is TRUE.
+#' @param convert_pca Logical indicating whether to convert PCA rotation matrix
+#'   to gene symbols. Default is TRUE.
+#' @param seurat_compatible Logical indicating whether to make gene symbols
+#'   Seurat-compatible by replacing underscores with dashes. Default is FALSE.
 #'
 #' @return A SingleCellExperiment object with row names set as gene symbols.
 #' @export
@@ -145,7 +160,8 @@ sce_to_symbols <- function(
     reference = c("sce", "scpca", "10x2020", "10x2024"),
     unique = FALSE,
     convert_hvg = TRUE,
-    convert_pca = TRUE) {
+    convert_pca = TRUE,
+    seurat_compatible = FALSE) {
   reference <- match.arg(reference)
   stopifnot(
     "`sce` must be a SingleCellExperiment object." = is(sce, "SingleCellExperiment"),
@@ -164,9 +180,19 @@ sce_to_symbols <- function(
   }
 
   if (reference == "sce") {
-    gene_symbols <- ensembl_to_symbol(ensembl_ids, sce = sce, unique = unique)
+    gene_symbols <- ensembl_to_symbol(
+      ensembl_ids,
+      sce = sce,
+      unique = unique,
+      seurat_compatible = seurat_compatible
+    )
   } else {
-    gene_symbols <- ensembl_to_symbol(ensembl_ids, reference = reference, unique = unique)
+    gene_symbols <- ensembl_to_symbol(
+      ensembl_ids,
+      reference = reference,
+      unique = unique,
+      seurat_compatible = seurat_compatible
+    )
   }
   row_ids <- gene_symbols
   # set Ensembl ids as original ids for later translations

diff --git a/R/make-seurat.R b/R/make-seurat.R
@@ -0,0 +1,135 @@
+#' Convert an SCE object to Seurat
+#'
+#' Converts A SingleCellExperiment (SCE) object to Seurat format. This is
+#' primarily a wrapper around Seurat::as.Seurat() with some additional steps to
+#' include ScPCA metadata and options for converting the feature index from
+#' Ensembl gene ids to gene symbols.
+#'
+#' If present, reduced dimensions from SCE objects will be included, renamed to
+#' match Seurat default naming.
+#'
+#'
+#' @param sce The input SCE object
+#' @param use_symbols Logical indicating whether the Seurat object uses gene
+#'   symbols for indexing. Default is TRUE.
+#' @param reference If `use_symbols` is TRUE, the reference to use for gene
+#'   symbols. One of `sce`, `scpca`, `10x2020`, `10x2024`, where `sce` uses the
+#'   symbols stored in the `gene_symbol` column of the SCE object's row data,
+#'   and the others use standardized translation tables. See `ensembl_to_symbol()`
+#'   for more information. Default is `sce`.
+#' @param dedup_method Method to handle duplicated gene symbols. If `unique`,
+#'   the gene symbols will be made unique following standard Seurat procedures.
+#'   If `sum`, the expression values for each gene symbols will be summed.
+#' @param seurat_assay_version Whether to create Seurat `v3` or `v5` assay
+#'   objects. Default is `v3`.
+#'
+#' @return a Seurat object
+#' @export
+#'
+sce_to_seurat <- function(
+    sce,
+    use_symbols = TRUE,
+    reference = c("sce", "scpca", "10x2020", "10x2024"),
+    dedup_method = c("unique", "sum"),
+    seurat_assay_version = c("v3", "v5")) {
+  reference <- match.arg(reference)
+  dedup_method <- match.arg(dedup_method)
+  seurat_assay_version <- match.arg(seurat_assay_version)
+
+  stopifnot(
+    "Package `Seurat` must be installed for SCE to Seurat conversion." =
+      requireNamespace("Seurat", quietly = TRUE),
+    "Package `SeuratObject` must be installed for SCE to Seurat conversion." =
+      requireNamespace("SeuratObject", quietly = TRUE),
+    "`sce` must be a SingleCellExperiment object." = is(sce, "SingleCellExperiment"),
+    "`sce` must contain both `gene_ids` and `gene_symbol` columns in the row data if it is being used as a reference." =
+      reference != "sce" || all(c("gene_ids", "gene_symbol") %in% names(rowData(sce)))
+  )
+
+  if (use_symbols) {
+    sce <- suppressMessages(sce_to_symbols(
+      sce,
+      reference = reference,
+      unique = (dedup_method == "unique"),
+      seurat_compatible = TRUE
+    ))
+  }
+
+  if (dedup_method == "sum") {
+    sce <- sum_duplicate_genes(sce)
+  }
+
+  mainExpName(sce) <- "RNA"
+
+  # convert reducedDims to Seurat standard naming
+  reducedDimNames(sce) <- tolower(reducedDimNames(sce))
+  reducedDims(sce) <- reducedDims(sce) |>
+    purrr::map(\(mat) {
+      # insert underscore between letters and numbers in axis labels
+      rd_names <- gsub("([A-Za-z]+)([0-9]+)", "\\1_\\2", colnames(mat)) |>
+        tolower()
+      colnames(mat) <- rd_names
+      if (!is.null(attr(mat, "rotation"))) {
+        colnames(attr(mat, "rotation")) <- rd_names
+      }
+      return(mat)
+    })
+
+  # pull out any altExps to handle manually
+  alt_exps <- altExps(sce)
+  altExps(sce) <- NULL
+
+  # Let Seurat do initial conversion to capture most things
+  sobj <- Seurat::as.Seurat(sce)
+
+  # update identity values, using cluster if present
+  sce_meta <- metadata(sce)
+  sobj$orig.ident <- sce_meta$library_id
+  if (is.null(sce$cluster)) {
+    Seurat::Idents(sobj) <- sce_meta$library_id
+  } else {
+    Seurat::Idents(sobj) <- sce$cluster
+  }
+
+  # add variable features if present
+  if (!is.null(sce_meta$highly_variable_genes)) {
+    sobj[["RNA"]]@var.features <- sce_meta$highly_variable_genes
+    sce_meta$highly_variable_genes <- NULL # remove from metadata to avoid redundancy
+  }
+
+  # convert and set functions depending on assay version requested
+  if (seurat_assay_version == "v5") {
+    create_seurat_assay <- SeuratObject::CreateAssay5Object
+    suppressWarnings(sobj[["RNA"]] <- as(sobj[["RNA"]], "Assay5"))
+  } else {
+    create_seurat_assay <- SeuratObject::CreateAssayObject
+    suppressWarnings(sobj[["RNA"]] <- as(sobj[["RNA"]], "Assay"))
+  }
+
+  # add spliced data if present
+  if ("spliced" %in% assayNames(sce)) {
+    sobj[["spliced"]] <- create_seurat_assay(counts = assay(sce, "spliced"))
+  }
+
+  # add altExps as needed.
+  for (alt_exp_name in names(alt_exps)) {
+    alt_exp <- alt_exps[[alt_exp_name]]
+    stopifnot(
+      "All altExps must contain a `counts` assay." = "counts" %in% assayNames(alt_exp)
+    )
+    sobj[[alt_exp_name]] <- create_seurat_assay(counts = counts(alt_exp))
+    if ("logcounts" %in% assayNames(alt_exp)) {
+      sobj[[alt_exp_name]]$data <- logcounts(alt_exp)
+    }
+  }
+
+  # add sample-level metadata after removing non-vector objects
+  sobj@misc <- sce_meta |>
+    purrr::discard(is.object) |>
+    purrr::discard(is.list)
+
+  # scale data because many Seurat functions expect this
+  sobj <- Seurat::ScaleData(sobj, assay = "RNA", verbose = FALSE)
+
+  return(sobj)
+}
diff --git a/R/sum-duplicate-genes.R b/R/sum-duplicate-genes.R
@@ -0,0 +1,102 @@
+#' Sum counts for genes with duplicate names in a SingleCellExperiment object.
+#'
+#' Genes with the same name are merged by summing their raw expression counts.
+#' If requested, the log-normalized expression values are recalculated, otherwise
+#' this is left blank.
+#'
+#'
+#' @param sce a SingleCellExperiment object with duplicated row names
+#' @param normalize a logical indicating whether to normalize the expression
+#'   values. Default is TRUE
+#' @param recalculate_reduced_dims a logical indicating whether to recalculate
+#'   PCA and UMAP. If FALSE, the input reduced dimensions are copied over. If
+#'   TRUE, the highly variable genes are also recalculated with the new values
+#'   stored in metadata. Default is FALSE
-#' @param sce a SingleCellExperiment object with duplicated row names
-#' @param normalize a logical indicating whether to normalize the expression
-#'   values. Default is TRUE
-#' @param recalculate_reduced_dims a logical indicating whether to recalculate
-#'   PCA and UMAP. If FALSE, the input reduced dimensions are copied over. If
-#'   TRUE, the highly variable genes are also recalculated with the new values
-#'   stored in metadata. Default is FALSE
+#' @param sce a SingleCellExperiment object with duplicated row names.
+#' @param normalize a logical indicating whether to normalize the expression
+#'   values. Default is TRUE.
+#' @param recalculate_reduced_dims a logical indicating whether to recalculate
+#'   PCA and UMAP. If FALSE, the input reduced dimensions are copied over. If
+#'   TRUE, the highly variable genes are also recalculated with the new values
+#'   stored in metadata. Default is FALSE.
-#' @param sce a SingleCellExperiment object with duplicated row names
-#' @param normalize a logical indicating whether to normalize the expression
-#'   values. Default is TRUE
-#' @param recalculate_reduced_dims a logical indicating whether to recalculate
-#'   PCA and UMAP. If FALSE, the input reduced dimensions are copied over. If
-#'   TRUE, the highly variable genes are also recalculated with the new values
-#'   stored in metadata. Default is FALSE
+#' @param sce a SingleCellExperiment object with duplicated row names.
+#' @param normalize a logical indicating whether to normalize the expression
+#'   values. Default is TRUE.
+#' @param recalculate_reduced_dims a logical indicating whether to recalculate
+#'   PCA and UMAP. If FALSE, the input reduced dimensions are copied over. If
+#'   TRUE, the highly variable genes are also recalculated with the new values
+#'   stored in metadata. Default is FALSE.
+#'
+#' @return a SingleCellExperiment object
+#' @export
+#'
+#' @import SingleCellExperiment
+#' @import SummarizedExperiment
+#'
+sum_duplicate_genes <- function(sce, normalize = TRUE, recalculate_reduced_dims = FALSE) {
+  stopifnot(
+    "sce must be a SingleCellExperiment object" = is(sce, "SingleCellExperiment"),
+    "normalize must be a logical" = is.logical(normalize),
+    "recalculate_reduced_dims must be a logical" = is.logical(recalculate_reduced_dims),
+    "normalize can not be FALSE if recalculate_reduced_dims is TRUE" = normalize || !recalculate_reduced_dims
+  )
+  if (normalize) {
+    stopifnot(
+      "Package `scran` must be installed if `normalize = TRUE` is set." =
+        requireNamespace("scran", quietly = TRUE),
+      "Package `scuttle` must be installed if `normalize = TRUE` is set." =
+        requireNamespace("scuttle", quietly = TRUE)
+    )
+  }
+  stopifnot(
+    "Package `scater` must be installed if `recalculate_reduced_dims = TRUE` is set." =
+      requireNamespace("scater", quietly = TRUE)
+  )
+
+  if (!any(duplicated(rownames(sce)))) {
+    message("No duplicated gene names found in the SingleCellExperiment object, returning original object")
+    return(sce)
+  }
+
+  # calculate the reduced matrices
+  counts <- rowsum(counts(sce), rownames(sce)) |> as("sparseMatrix")
+  if ("spliced" %in% assayNames(sce)) {
+    spliced <- rowsum(assay(sce, "spliced"), rownames(sce)) |> as("sparseMatrix")
+    assays <- list(counts = counts, spliced = spliced)
+  } else {
+    assays <- list(counts = counts)
+  }
+
+
+  if (recalculate_reduced_dims) {
+    reduced_dims <- list()
+  } else {
+    reduced_dims <- reducedDims(sce)
+  }
+
+
+  # Build the new SingleCellExperiment object
+  summed_sce <- SingleCellExperiment(
+    assays = assays,
+    colData = colData(sce),
+    metadata = metadata(sce),
+    # if we are not recalculating reduced dimensions, copy over previous (likely similar)
+    reducedDims = reduced_dims,
+    altExps = altExps(sce)
+  )
+
+  # Add normalized values if requested
+  if (normalize) {
+    try({
+      # try to cluster similar cells
+      # clustering may fail if < 100 cells in dataset
+      qclust <- suppressWarnings(scran::quickCluster(summed_sce))
+      summed_sce <- scran::computeSumFactors(summed_sce, clusters = qclust)
+    })
+    summed_sce <- scuttle::logNormCounts(summed_sce)
+  }
+
+  # recalculate PCA if requested, using the same dimensions as before
+  if (recalculate_reduced_dims) {
+    if ("PCA" %in% reducedDimNames(sce)) {
+      pca_dim <- ncol(reducedDim(sce, "PCA"))
+    } else {
+      pca_dim <- min(50, ncol(sce) - 1) # always reduce dimensions at least 1!
+    }
+
+    hv_genes <- scran::modelGeneVar(summed_sce) |>
+      scran::getTopHVGs(n = length(metadata(sce)$highly_variable_genes))
+    metadata(summed_sce)$highly_variable_genes <- hv_genes # store the new HVGs
+
+    summed_sce <- scater::runPCA(summed_sce, subset_row = hv_genes, ncomponents = pca_dim) |>
+      scater::runUMAP()
+  }
+
+  return(summed_sce)
+}