Skip to content

Commit

Permalink
Merge pull request #21 from AlexsLemonade/jashapiro/largematrix
Browse files Browse the repository at this point in the history
Handle large matrices and change default Seurat format
  • Loading branch information
jashapiro authored Dec 18, 2024
2 parents b4aeb0c + b5458b4 commit 2deaa20
Show file tree
Hide file tree
Showing 6 changed files with 149 additions and 57 deletions.
18 changes: 9 additions & 9 deletions R/make-seurat.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
#' @param dedup_method Method to handle duplicated gene symbols. If `unique`,
#' the gene symbols will be made unique following standard Seurat procedures.
#' If `sum`, the expression values for each gene symbols will be summed.
#' @param seurat_assay_version Whether to create Seurat `v3` or `v5` assay
#' objects. Default is `v3`.
#' @param seurat_assay_version Whether to create Seurat `v5` or `v3` assay
#' objects. Default is `v5`.
#'
#' @return a Seurat object
#' @export
Expand All @@ -37,16 +37,16 @@
#' # convert SCE to Seurat, merging duplicated gene names
#' seurat_obj <- sce_to_seurat(sce, dedup_method = "sum")
#'
#' # convert SCE to Seurat with v5 assay objects
#' seurat_obj <- sce_to_seurat(sce, seurat_assay_version = "v5")
#' # convert SCE to Seurat with v3 assay objects
#' seurat_obj <- sce_to_seurat(sce, seurat_assay_version = "v3")
#' }
#'
sce_to_seurat <- function(
sce,
use_symbols = TRUE,
reference = c("sce", "scpca", "10x2020", "10x2024"),
dedup_method = c("unique", "sum"),
seurat_assay_version = c("v3", "v5")) {
seurat_assay_version = c("v5", "v3")) {
reference <- match.arg(reference)
dedup_method <- match.arg(dedup_method)
seurat_assay_version <- match.arg(seurat_assay_version)
Expand Down Expand Up @@ -120,12 +120,12 @@ sce_to_seurat <- function(
}

# convert and set functions depending on assay version requested
if (seurat_assay_version == "v5") {
create_seurat_assay <- SeuratObject::CreateAssay5Object
suppressWarnings(sobj[["RNA"]] <- as(sobj[["RNA"]], "Assay5"))
} else {
if (seurat_assay_version == "v3") {
create_seurat_assay <- SeuratObject::CreateAssayObject
suppressWarnings(sobj[["RNA"]] <- as(sobj[["RNA"]], "Assay"))
} else {
create_seurat_assay <- SeuratObject::CreateAssay5Object
suppressWarnings(sobj[["RNA"]] <- as(sobj[["RNA"]], "Assay5"))
}

# add spliced data if present
Expand Down
39 changes: 31 additions & 8 deletions R/sum-duplicate-genes.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@
#' PCA and UMAP. If FALSE, the input reduced dimensions are copied over. If
#' TRUE, the highly variable genes are also recalculated with the new values
#' stored in metadata. Default is FALSE.
#' @param cell_set_size Because very large matrices can cause trouble, we break
#' the summing into submatrices with this many cells. Default is 5000, which
#' should be fine for most datasets.
#'
#' @return a SingleCellExperiment object
#' @export
Expand All @@ -44,12 +47,16 @@
#' summed_sce <- sum_duplicate_genes(sce, recalculate_reduced_dims = TRUE)
#' }
#'
sum_duplicate_genes <- function(sce, normalize = TRUE, recalculate_reduced_dims = FALSE) {
sum_duplicate_genes <- function(sce,
normalize = TRUE,
recalculate_reduced_dims = FALSE,
cell_set_size = 5000) {
stopifnot(
"sce must be a SingleCellExperiment object" = is(sce, "SingleCellExperiment"),
"normalize must be a logical" = is.logical(normalize),
"recalculate_reduced_dims must be a logical" = is.logical(recalculate_reduced_dims),
"normalize can not be FALSE if recalculate_reduced_dims is TRUE" = normalize || !recalculate_reduced_dims
"normalize can not be FALSE if recalculate_reduced_dims is TRUE" = normalize || !recalculate_reduced_dims,
"cell_set_size should be an integer (much) greater than 1" = cell_set_size > 1 && cell_set_size %% 1 == 0
)
if (normalize) {
stopifnot(
Expand All @@ -67,14 +74,30 @@ sum_duplicate_genes <- function(sce, normalize = TRUE, recalculate_reduced_dims
return(sce)
}

# calculate the reduced matrices
# calculate the reduced matrices----------------------------------------------
unique_rows <- unique(rownames(sce)) # new row names
counts <- rowsum(counts(sce), rownames(sce))[unique_rows, ] |> # keep order, mostly
as("sparseMatrix")

# break up counts and assay matrices to avoid large non-sparse matrices during calculation
cell_sets <- seq(0, ncol(sce) - 1) %/% cell_set_size
# make sure the last set is not length one
cell_sets[ncol(sce)] <- cell_sets[ncol(sce) - 1]

counts <- unique(cell_sets) |>
purrr::map(\(x) {
rowsum(counts(sce)[, cell_sets == x], rownames(sce)) |>
as("sparseMatrix") |>
base::`[`(unique_rows, ) # reorder rows (faster with sparse)
}) |>
purrr::reduce(cbind)

if ("spliced" %in% assayNames(sce)) {
spliced_names <- rownames(assay(sce, "spliced"))
spliced <- rowsum(assay(sce, "spliced"), spliced_names)[unique(spliced_names), ] |>
as("sparseMatrix")
spliced <- unique(cell_sets) |>
purrr::map(\(x) {
rowsum(assay(sce, "spliced")[, cell_sets == x], rownames(sce)) |>
as("sparseMatrix") |>
base::`[`(unique_rows, )
}) |>
purrr::reduce(cbind)
assays <- list(counts = counts, spliced = spliced)
} else {
assays <- list(counts = counts)
Expand Down
10 changes: 5 additions & 5 deletions man/sce_to_seurat.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 10 additions & 1 deletion man/sum_duplicate_genes.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

63 changes: 32 additions & 31 deletions tests/testthat/test-make-seurat.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,14 @@ test_that("SCE to Seurat with no id conversion works", {
expect_equal(Seurat::DefaultAssay(seurat_obj), "RNA")

# assay types
expect_s4_class(seurat_obj[["RNA"]], "Assay")
expect_s4_class(seurat_obj[["spliced"]], "Assay")
expect_s4_class(seurat_obj[["RNA"]], "Assay5")
expect_s4_class(seurat_obj[["spliced"]], "Assay5")

# assay contents

# expected layers present
expect_contains(
slotNames(seurat_obj[["RNA"]]),
c("counts", "data", "scale.data", "var.features", "meta.features")
names(seurat_obj[["RNA"]]@layers),
c("counts", "data", "scale.data")
)

expect_equal(seurat_obj[["RNA"]]$counts, counts(sce))
Expand Down Expand Up @@ -66,13 +67,13 @@ test_that("SCE to Seurat with id conversion works as expected", {
expect_equal(Seurat::DefaultAssay(seurat_obj), "RNA")

# assay types
expect_s4_class(seurat_obj[["RNA"]], "Assay")
expect_s4_class(seurat_obj[["spliced"]], "Assay")
expect_s4_class(seurat_obj[["RNA"]], "Assay5")
expect_s4_class(seurat_obj[["spliced"]], "Assay5")

# assay contents
# expected layers present
expect_contains(
slotNames(seurat_obj[["RNA"]]),
c("counts", "data", "scale.data", "var.features", "meta.features")
names(seurat_obj[["RNA"]]@layers),
c("counts", "data", "scale.data")
)

expect_equal(
Expand Down Expand Up @@ -116,13 +117,13 @@ test_that("SCE to Seurat with id conversion and 10x reference works as expected"


# assay types
expect_s4_class(seurat_obj[["RNA"]], "Assay")
expect_s4_class(seurat_obj[["spliced"]], "Assay")
expect_s4_class(seurat_obj[["RNA"]], "Assay5")
expect_s4_class(seurat_obj[["spliced"]], "Assay5")

# assay contents
# expected layers present
expect_contains(
slotNames(seurat_obj[["RNA"]]),
c("counts", "data", "scale.data", "var.features", "meta.features")
names(seurat_obj[["RNA"]]@layers),
c("counts", "data", "scale.data")
)

expect_equal(
Expand Down Expand Up @@ -163,15 +164,15 @@ test_that("conversion works with altExps", {
expect_equal(Seurat::DefaultAssay(seurat_obj), "RNA")

# assay types
expect_s4_class(seurat_obj[["RNA"]], "Assay")
expect_s4_class(seurat_obj[["spliced"]], "Assay")
expect_s4_class(seurat_obj[["alt1"]], "Assay")
expect_s4_class(seurat_obj[["alt2"]], "Assay")
expect_s4_class(seurat_obj[["RNA"]], "Assay5")
expect_s4_class(seurat_obj[["spliced"]], "Assay5")
expect_s4_class(seurat_obj[["alt1"]], "Assay5")
expect_s4_class(seurat_obj[["alt2"]], "Assay5")
})

test_that("Seurat v5 conversion works", {
test_that("Seurat v3 conversion works", {
sce <- readRDS(test_path("data", "scpca_sce.rds"))
seurat_obj <- sce_to_seurat(sce, use_symbols = FALSE, seurat_assay_version = "v5")
seurat_obj <- sce_to_seurat(sce, use_symbols = FALSE, seurat_assay_version = "v3")
expect_s4_class(seurat_obj, "Seurat")

expect_equal(dim(seurat_obj), dim(sce))
Expand All @@ -182,13 +183,13 @@ test_that("Seurat v5 conversion works", {
expect_equal(Seurat::DefaultAssay(seurat_obj), "RNA")

# assay types
expect_s4_class(seurat_obj[["RNA"]], "Assay5")
expect_s4_class(seurat_obj[["spliced"]], "Assay5")
expect_s4_class(seurat_obj[["RNA"]], "Assay")
expect_s4_class(seurat_obj[["spliced"]], "Assay")

# expected layers present
# assay contents
expect_contains(
names(seurat_obj[["RNA"]]@layers),
c("counts", "data", "scale.data")
slotNames(seurat_obj[["RNA"]]),
c("counts", "data", "scale.data", "var.features")
)

expect_equal(seurat_obj[["RNA"]]$counts, counts(sce))
Expand Down Expand Up @@ -224,13 +225,13 @@ test_that("Conversion works for non-processed samples", {
expect_equal(Seurat::DefaultAssay(seurat_obj), "RNA")

# assay types
expect_s4_class(seurat_obj[["RNA"]], "Assay")
expect_s4_class(seurat_obj[["spliced"]], "Assay")
expect_s4_class(seurat_obj[["RNA"]], "Assay5")
expect_s4_class(seurat_obj[["spliced"]], "Assay5")

# assay contents
# expected layers present
expect_contains(
slotNames(seurat_obj[["RNA"]]),
c("counts", "data", "scale.data", "var.features", "meta.features")
names(seurat_obj[["RNA"]]@layers),
c("counts", "data", "scale.data")
)

expect_equal(seurat_obj[["RNA"]]$counts, counts(sce))
Expand Down
65 changes: 62 additions & 3 deletions tests/testthat/test-sum-duplicate-genes.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ test_that("merging works as expected", {
expect_equal(dim(deduped_sce), dim(sce))

expect_equal(rownames(deduped_sce), rownames(sce))
expect_equal(colnames(deduped_sce), colnames(sce))

expect_contains(
colnames(rowData(deduped_sce)),
Expand All @@ -16,11 +17,11 @@ test_that("merging works as expected", {


expect_equal(
counts(deduped_sce)[rownames(sce), ], # need to make the row order identical
counts(deduped_sce),
2 * counts(sce) |> Matrix::drop0() # original matrices may have explicit 0s
)
expect_equal(
assay(deduped_sce, "spliced")[rownames(sce), ],
assay(deduped_sce, "spliced"),
2 * assay(sce, "spliced") |> Matrix::drop0()
)
})
Expand All @@ -33,6 +34,36 @@ test_that("merging works as expected with unprocessed SCE", {
expect_equal(dim(deduped_sce), dim(sce))

expect_equal(rownames(deduped_sce), rownames(sce))
expect_equal(colnames(deduped_sce), colnames(sce))

expect_contains(
colnames(rowData(deduped_sce)),
c("gene_ids", "gene_symbol", "mean", "detected")
)
expect_equal(rowData(deduped_sce)$gene_ids, rowData(sce)$gene_ids)
expect_equal(rowData(deduped_sce)$gene_symbol, rowData(sce)$gene_symbol)


expect_equal(
counts(deduped_sce), # need to make the row order identical
2 * counts(sce) |> Matrix::drop0() # original matrices may have explicit 0s
)
expect_equal(
assay(deduped_sce, "spliced"),
2 * assay(sce, "spliced") |> Matrix::drop0()
)
})


test_that("merging works with split and combined cell sets", {
sce <- readRDS(test_path("data", "filtered_sce.rds"))
double_sce <- rbind(sce, sce)
deduped_sce <- sum_duplicate_genes(double_sce, cell_set_size = 20)

expect_equal(dim(deduped_sce), dim(sce))

expect_equal(rownames(deduped_sce), rownames(sce))
expect_equal(colnames(deduped_sce), colnames(sce))

expect_contains(
colnames(rowData(deduped_sce)),
Expand All @@ -43,11 +74,39 @@ test_that("merging works as expected with unprocessed SCE", {


expect_equal(
counts(deduped_sce)[rownames(sce), ], # need to make the row order identical
counts(deduped_sce),
2 * counts(sce) |> Matrix::drop0() # original matrices may have explicit 0s
)
expect_equal(
assay(deduped_sce, "spliced")[rownames(sce), ],
2 * assay(sce, "spliced") |> Matrix::drop0()
)
})

test_that("test merging for split when cell set size leaves a set of size 1", {
sce <- readRDS(test_path("data", "filtered_sce.rds"))
double_sce <- rbind(sce, sce)
deduped_sce <- sum_duplicate_genes(double_sce, cell_set_size = ncol(sce) - 1)

expect_equal(dim(deduped_sce), dim(sce))

expect_equal(rownames(deduped_sce), rownames(sce))
expect_equal(colnames(deduped_sce), colnames(sce))

expect_contains(
colnames(rowData(deduped_sce)),
c("gene_ids", "gene_symbol", "mean", "detected")
)
expect_equal(rowData(deduped_sce)$gene_ids, rowData(sce)$gene_ids)
expect_equal(rowData(deduped_sce)$gene_symbol, rowData(sce)$gene_symbol)


expect_equal(
counts(deduped_sce),
2 * counts(sce) |> Matrix::drop0() # original matrices may have explicit 0s
)
expect_equal(
assay(deduped_sce, "spliced"),
2 * assay(sce, "spliced") |> Matrix::drop0()
)
})

0 comments on commit 2deaa20

Please sign in to comment.