Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add functions for Seurat conversion #15

Merged
merged 25 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,12 @@ LazyData: true
Suggests:
testthat (>= 3.0.0),
scater,
scran,
Seurat,
splatter
splatter,
scuttle,
Matrix,
SeuratObject
Config/testthat/edition: 3
RoxygenNote: 7.3.2
Imports:
Expand All @@ -34,6 +38,7 @@ Imports:
purrr,
S4Vectors,
SingleCellExperiment,
SummarizedExperiment,
tibble,
tidyr
biocViews:
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,12 @@ export(calculate_silhouette)
export(calculate_stability)
export(ensembl_to_symbol)
export(extract_pc_matrix)
export(sce_to_seurat)
export(sce_to_symbols)
export(sum_duplicate_genes)
export(sweep_clusters)
import(SingleCellExperiment)
import(SummarizedExperiment)
import(methods)
importFrom(S4Vectors,`metadata<-`)
importFrom(S4Vectors,metadata)
Expand Down
78 changes: 52 additions & 26 deletions R/convert-gene-ids.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,21 @@
#'
#'
#' @param ensembl_ids A character vector of Ensembl gene ids to translate to
#' gene symbols.
#' @param reference The reference gene list to use for translation. One of `scpca`,
#' `10x2020`, `10x2024`. The `scpca` reference is the default.
#' @param sce A SingleCellExperiment object to use as a reference for gene symbols.
#' If provided, the `reference` argument will be ignored. The `sce` object must
#' include columns with the names `gene_ids` (containing Ensembl ids) and
#' `gene_symbol` (containing the symbols) to use for conversion.
#' @param unique Whether to use unique gene symbols, as would be done if
#' data had been read in with gene symbols by Seurat. Default is FALSE.
#' @param leave_na Whether to leave NA values in the output vector.
#' If FALSE, any missing values will be replaced with the input ensembl_id value.
#' Default is FALSE.
#' gene symbols.
#' @param reference The reference gene list to use for translation. One of
#' `scpca`, `10x2020`, `10x2024`. The `scpca` reference is the default.
#' @param sce A SingleCellExperiment object to use as a reference for gene
#' symbols. If provided, the `reference` argument will be ignored. The `sce`
#' object must include columns with the names `gene_ids` (containing Ensembl
#' ids) and `gene_symbol` (containing the symbols) to use for conversion.
#' @param unique Whether to use unique gene symbols, as would be done if data
#' had been read in with gene symbols by Seurat. Default is FALSE.
#' @param leave_na Whether to leave NA values in the output vector. If FALSE,
#' any missing values will be replaced with the input ensembl_id value.
#' Default is FALSE.
#' @param seurat_compatible Whether to return a vector that is compatible with
#' Seurat, translating and underscores to dashes.
#' Default is FALSE.
#'
#' @return A vector of gene symbols corresponding to the input Ensembl ids.
#' @export
Expand All @@ -51,7 +54,8 @@ ensembl_to_symbol <- function(
reference = c("scpca", "10x2020", "10x2024"),
sce = NULL,
unique = FALSE,
leave_na = FALSE) {
leave_na = FALSE,
seurat_compatible = FALSE) {
reference <- match.arg(reference)
stopifnot(
"`ensembl_ids` must be a character vector." = is.character(ensembl_ids),
Expand Down Expand Up @@ -85,10 +89,17 @@ ensembl_to_symbol <- function(
)
}
if (!leave_na && any(missing_symbols)) {
warning("Not all input ids have corresponding gene symbols, using input ids for missing values.")
message("Not all input ids have corresponding gene symbols, using input ids for missing values.")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

gene_symbols[missing_symbols] <- ensembl_ids[missing_symbols]
}

if (seurat_compatible) {
if (any(grepl("_", gene_symbols))) {
warning("Replacing underscores ('_') with dashes ('-') in gene symbols for Seurat compatibility.")
}
gene_symbols <- gsub("_", "-", gene_symbols)
}

return(gene_symbols)
}

Expand All @@ -112,15 +123,19 @@ ensembl_to_symbol <- function(
#' (and not disabled by the `convert_hvg` and `convert_pca` arguments).
#'
#'
#' @param sce A SingleCellExperiment object containing gene ids and gene symbols.
#' @param reference The reference gene list for conversion. One of `sce`, `scpca`,
#' `10x2020`, or `10x2024`. If `sce` (the default) the internal row data is used.
#' @param unique Whether to use unique gene symbols, as would be done if
#' data had been read in with gene symbols by Seurat. Default is FALSE.
#' @param convert_hvg Logical indicating whether to convert highly variable genes to gene symbols.
#' Default is TRUE.
#' @param convert_pca Logical indicating whether to convert PCA rotation matrix to gene symbols.
#' Default is TRUE.
#' @param sce A SingleCellExperiment object containing gene ids and gene
#' symbols.
#' @param reference The reference gene list for conversion. One of `sce`,
#' `scpca`, `10x2020`, or `10x2024`. If `sce` (the default) the internal row
#' data is used.
#' @param unique Whether to use unique gene symbols, as would be done if data
#' had been read in with gene symbols by Seurat. Default is FALSE.
#' @param convert_hvg Logical indicating whether to convert highly variable
#' genes to gene symbols. Default is TRUE.
#' @param convert_pca Logical indicating whether to convert PCA rotation matrix
#' to gene symbols. Default is TRUE.
#' @param seurat_compatible Logical indicating whether to make gene symbols
#' Seurat-compatible by replacing underscores with dashes. Default is FALSE.
#'
#' @return A SingleCellExperiment object with row names set as gene symbols.
#' @export
Expand All @@ -145,7 +160,8 @@ sce_to_symbols <- function(
reference = c("sce", "scpca", "10x2020", "10x2024"),
unique = FALSE,
convert_hvg = TRUE,
convert_pca = TRUE) {
convert_pca = TRUE,
seurat_compatible = FALSE) {
reference <- match.arg(reference)
stopifnot(
"`sce` must be a SingleCellExperiment object." = is(sce, "SingleCellExperiment"),
Expand All @@ -164,9 +180,19 @@ sce_to_symbols <- function(
}

if (reference == "sce") {
gene_symbols <- ensembl_to_symbol(ensembl_ids, sce = sce, unique = unique)
gene_symbols <- ensembl_to_symbol(
ensembl_ids,
sce = sce,
unique = unique,
seurat_compatible = seurat_compatible
)
} else {
gene_symbols <- ensembl_to_symbol(ensembl_ids, reference = reference, unique = unique)
gene_symbols <- ensembl_to_symbol(
ensembl_ids,
reference = reference,
unique = unique,
seurat_compatible = seurat_compatible
)
}
row_ids <- gene_symbols
# set Ensembl ids as original ids for later translations
Expand Down
135 changes: 135 additions & 0 deletions R/make-seurat.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
#' Convert an SCE object to Seurat
#'
#' Converts A SingleCellExperiment (SCE) object to Seurat format. This is
jashapiro marked this conversation as resolved.
Show resolved Hide resolved
jashapiro marked this conversation as resolved.
Show resolved Hide resolved
#' primarily a wrapper around Seurat::as.Seurat() with some additional steps to
#' include ScPCA metadata and options for converting the feature index from
#' Ensembl gene ids to gene symbols.
#'
#' If present, reduced dimensions from SCE objects will be included, renamed to
#' match Seurat default naming.
#'
#'
#' @param sce The input SCE object
#' @param use_symbols Logical indicating whether the Seurat object uses gene
#' symbols for indexing. Default is TRUE.
#' @param reference If `use_symbols` is TRUE, the reference to use for gene
#' symbols. One of `sce`, `scpca`, `10x2020`, `10x2024`, where `sce` uses the
#' symbols stored in the `gene_symbol` column of the SCE object's row data,
#' and the others use standardized translation tables. See `ensembl_to_symbol()`
#' for more information. Default is `sce`.
#' @param dedup_method Method to handle duplicated gene symbols. If `unique`,
#' the gene symbols will be made unique following standard Seurat procedures.
Comment on lines +20 to +21
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a Seurat version to specify here, or do they all (to our knowledge) follow the same approach?

#' If `sum`, the expression values for each gene symbols will be summed.
#' @param seurat_assay_version Whether to create Seurat `v3` or `v5` assay
#' objects. Default is `v3`.
jashapiro marked this conversation as resolved.
Show resolved Hide resolved
#'
#' @return a Seurat object
#' @export
#'
sce_to_seurat <- function(
sce,
use_symbols = TRUE,
reference = c("sce", "scpca", "10x2020", "10x2024"),
dedup_method = c("unique", "sum"),
seurat_assay_version = c("v3", "v5")) {
reference <- match.arg(reference)
dedup_method <- match.arg(dedup_method)
seurat_assay_version <- match.arg(seurat_assay_version)

stopifnot(
"Package `Seurat` must be installed for SCE to Seurat conversion." =
requireNamespace("Seurat", quietly = TRUE),
"Package `SeuratObject` must be installed for SCE to Seurat conversion." =
requireNamespace("SeuratObject", quietly = TRUE),
jashapiro marked this conversation as resolved.
Show resolved Hide resolved
"`sce` must be a SingleCellExperiment object." = is(sce, "SingleCellExperiment"),
"`sce` must contain both `gene_ids` and `gene_symbol` columns in the row data if it is being used as a reference." =
reference != "sce" || all(c("gene_ids", "gene_symbol") %in% names(rowData(sce)))
)

if (use_symbols) {
sce <- suppressMessages(sce_to_symbols(
sce,
reference = reference,
unique = (dedup_method == "unique"),
seurat_compatible = TRUE
))
}

if (dedup_method == "sum") {
sce <- sum_duplicate_genes(sce)
}

mainExpName(sce) <- "RNA"

# convert reducedDims to Seurat standard naming
reducedDimNames(sce) <- tolower(reducedDimNames(sce))
reducedDims(sce) <- reducedDims(sce) |>
purrr::map(\(mat) {
# insert underscore between letters and numbers in axis labels
rd_names <- gsub("([A-Za-z]+)([0-9]+)", "\\1_\\2", colnames(mat)) |>
tolower()
colnames(mat) <- rd_names
if (!is.null(attr(mat, "rotation"))) {
colnames(attr(mat, "rotation")) <- rd_names
}
return(mat)
})

# pull out any altExps to handle manually
alt_exps <- altExps(sce)
altExps(sce) <- NULL

# Let Seurat do initial conversion to capture most things
sobj <- Seurat::as.Seurat(sce)

# update identity values, using cluster if present
sce_meta <- metadata(sce)
sobj$orig.ident <- sce_meta$library_id
if (is.null(sce$cluster)) {
Seurat::Idents(sobj) <- sce_meta$library_id
} else {
Seurat::Idents(sobj) <- sce$cluster
}

# add variable features if present
if (!is.null(sce_meta$highly_variable_genes)) {
sobj[["RNA"]]@var.features <- sce_meta$highly_variable_genes
sce_meta$highly_variable_genes <- NULL # remove from metadata to avoid redundancy
}

# convert and set functions depending on assay version requested
if (seurat_assay_version == "v5") {
create_seurat_assay <- SeuratObject::CreateAssay5Object
suppressWarnings(sobj[["RNA"]] <- as(sobj[["RNA"]], "Assay5"))
} else {
create_seurat_assay <- SeuratObject::CreateAssayObject
suppressWarnings(sobj[["RNA"]] <- as(sobj[["RNA"]], "Assay"))
}

# add spliced data if present
if ("spliced" %in% assayNames(sce)) {
sobj[["spliced"]] <- create_seurat_assay(counts = assay(sce, "spliced"))
}

# add altExps as needed.
for (alt_exp_name in names(alt_exps)) {
alt_exp <- alt_exps[[alt_exp_name]]
stopifnot(
"All altExps must contain a `counts` assay." = "counts" %in% assayNames(alt_exp)
)
sobj[[alt_exp_name]] <- create_seurat_assay(counts = counts(alt_exp))
if ("logcounts" %in% assayNames(alt_exp)) {
sobj[[alt_exp_name]]$data <- logcounts(alt_exp)
Comment on lines +154 to +156
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Worth noting that these lines may both give the Seurat warning

Warning: Feature names cannot have underscores ('_'), replacing with dashes ('-')

I got this when running a multiplexed library through, so this was coming from the MULTI_X names. We might want to wrangle the altexps a bit more first if we want to avoid this Seurat warning. I think would be good since it's not very transparent to users that this is where the warning comes from, and we might add our own warning (message?) instead to be clearer what's changing.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added conversion for altExps with a warning.

}
}

# add sample-level metadata after removing non-vector objects
sobj@misc <- sce_meta |>
purrr::discard(is.object) |>
purrr::discard(is.list)

# scale data because many Seurat functions expect this
sobj <- Seurat::ScaleData(sobj, assay = "RNA", verbose = FALSE)

return(sobj)
}
102 changes: 102 additions & 0 deletions R/sum-duplicate-genes.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#' Sum counts for genes with duplicate names in a SingleCellExperiment object.
#'
#' Genes with the same name are merged by summing their raw expression counts.
#' If requested, the log-normalized expression values are recalculated, otherwise
#' this is left blank.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd add a smidge more here about why this function is helpful, since it's exported (which I agree with)

#'
#'
#' @param sce a SingleCellExperiment object with duplicated row names
#' @param normalize a logical indicating whether to normalize the expression
#' values. Default is TRUE
#' @param recalculate_reduced_dims a logical indicating whether to recalculate
#' PCA and UMAP. If FALSE, the input reduced dimensions are copied over. If
#' TRUE, the highly variable genes are also recalculated with the new values
#' stored in metadata. Default is FALSE
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For consistency with other docs...

Suggested change
#' @param sce a SingleCellExperiment object with duplicated row names
#' @param normalize a logical indicating whether to normalize the expression
#' values. Default is TRUE
#' @param recalculate_reduced_dims a logical indicating whether to recalculate
#' PCA and UMAP. If FALSE, the input reduced dimensions are copied over. If
#' TRUE, the highly variable genes are also recalculated with the new values
#' stored in metadata. Default is FALSE
#' @param sce a SingleCellExperiment object with duplicated row names.
#' @param normalize a logical indicating whether to normalize the expression
#' values. Default is TRUE.
#' @param recalculate_reduced_dims a logical indicating whether to recalculate
#' PCA and UMAP. If FALSE, the input reduced dimensions are copied over. If
#' TRUE, the highly variable genes are also recalculated with the new values
#' stored in metadata. Default is FALSE.

#'
#' @return a SingleCellExperiment object
#' @export
#'
#' @import SingleCellExperiment
#' @import SummarizedExperiment
#'
sum_duplicate_genes <- function(sce, normalize = TRUE, recalculate_reduced_dims = FALSE) {
stopifnot(
"sce must be a SingleCellExperiment object" = is(sce, "SingleCellExperiment"),
"normalize must be a logical" = is.logical(normalize),
"recalculate_reduced_dims must be a logical" = is.logical(recalculate_reduced_dims),
"normalize can not be FALSE if recalculate_reduced_dims is TRUE" = normalize || !recalculate_reduced_dims
)
if (normalize) {
stopifnot(
"Package `scran` must be installed if `normalize = TRUE` is set." =
requireNamespace("scran", quietly = TRUE),
"Package `scuttle` must be installed if `normalize = TRUE` is set." =
requireNamespace("scuttle", quietly = TRUE)
jashapiro marked this conversation as resolved.
Show resolved Hide resolved
)
}
stopifnot(
"Package `scater` must be installed if `recalculate_reduced_dims = TRUE` is set." =
requireNamespace("scater", quietly = TRUE)
)

if (!any(duplicated(rownames(sce)))) {
message("No duplicated gene names found in the SingleCellExperiment object, returning original object")
return(sce)
}

# calculate the reduced matrices
counts <- rowsum(counts(sce), rownames(sce)) |> as("sparseMatrix")
if ("spliced" %in% assayNames(sce)) {
spliced <- rowsum(assay(sce, "spliced"), rownames(sce)) |> as("sparseMatrix")
assays <- list(counts = counts, spliced = spliced)
} else {
assays <- list(counts = counts)
}


if (recalculate_reduced_dims) {
reduced_dims <- list()
} else {
reduced_dims <- reducedDims(sce)
}


# Build the new SingleCellExperiment object
summed_sce <- SingleCellExperiment(
assays = assays,
colData = colData(sce),
metadata = metadata(sce),
# if we are not recalculating reduced dimensions, copy over previous (likely similar)
reducedDims = reduced_dims,
altExps = altExps(sce)
)

# Add normalized values if requested
if (normalize) {
try({
# try to cluster similar cells
# clustering may fail if < 100 cells in dataset
qclust <- suppressWarnings(scran::quickCluster(summed_sce))
summed_sce <- scran::computeSumFactors(summed_sce, clusters = qclust)
jashapiro marked this conversation as resolved.
Show resolved Hide resolved
})
summed_sce <- scuttle::logNormCounts(summed_sce)
}

# recalculate PCA if requested, using the same dimensions as before
jashapiro marked this conversation as resolved.
Show resolved Hide resolved
if (recalculate_reduced_dims) {
if ("PCA" %in% reducedDimNames(sce)) {
pca_dim <- ncol(reducedDim(sce, "PCA"))
} else {
pca_dim <- min(50, ncol(sce) - 1) # always reduce dimensions at least 1!
}

hv_genes <- scran::modelGeneVar(summed_sce) |>
scran::getTopHVGs(n = length(metadata(sce)$highly_variable_genes))
jashapiro marked this conversation as resolved.
Show resolved Hide resolved
metadata(summed_sce)$highly_variable_genes <- hv_genes # store the new HVGs

summed_sce <- scater::runPCA(summed_sce, subset_row = hv_genes, ncomponents = pca_dim) |>
scater::runUMAP()
}

return(summed_sce)
}
Loading