From 236ad71120a53edcdc38cf487c0e9e0b1907e918 Mon Sep 17 00:00:00 2001 From: Stephanie Spielman Date: Fri, 8 Nov 2024 14:45:00 -0500 Subject: [PATCH 1/5] Update stability function to take a df, and ensure correct order --- R/evaluate-clusters.R | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/R/evaluate-clusters.R b/R/evaluate-clusters.R index 6ea5265..b308466 100644 --- a/R/evaluate-clusters.R +++ b/R/evaluate-clusters.R @@ -133,8 +133,10 @@ calculate_purity <- function( #' either a SingleCellExperiment object, a Seurat object, or a matrix where columns #' are PCs and rows are cells. If a matrix is provided, it must have row names of cell #' ids (e.g., barcodes). -#' @param clusters A vector of cluster ids, typically a numeric factor variable, obtained -#' by previously clustering the PCs. +#' @param cluster_df A data frame that contains at least the columns `cell_id` and +#' `cluster`. The `cell_id` values should match either the PC matrix row names, +#' or the SingleCellExperiment/Seurat object cell ids. Typically this will be output from +#' the `rOpenScPCA::calculate_clusters()` function. #' @param replicates Number of bootstrap replicates to perform. Default is 20. #' @param seed Random seed #' @param pc_name Optionally, the name of the PC matrix in the object. Not used if a @@ -159,7 +161,7 @@ calculate_purity <- function( #' # and setting a seed for reproducibility #' cluster_df <- calculate_clusters(sce_object, seed = 11) #' # Second, calculate cluster stability using default parameters -#' stability_df <- calculate_stability(sce_object, cluster_df$clusters, seed = 11) +#' stability_df <- calculate_stability(sce_object, cluster_df, seed = 11) #' #' #' # First, cluster PCs from a SingleCellExperiment object using default parameters @@ -168,7 +170,7 @@ calculate_purity <- function( #' # Second, calculate cluster stability using default parameters and 50 replicates #' stability_df <- calculate_stability( #' sce_object, -#' cluster_df$clusters, +#' cluster_df, #' replicates = 50, #' seed = 11 #' ) @@ -186,7 +188,7 @@ calculate_purity <- function( #' # for the initial clustering #' stability_df <- calculate_stability( #' sce_object, -#' cluster_df$clusters, +#' cluster_df, #' algorithm = "leiden", #' resolution = 0.1, #' seed = 11 @@ -194,7 +196,7 @@ calculate_purity <- function( #' } calculate_stability <- function( x, - clusters, + cluster_df, replicates = 20, seed = NULL, pc_name = NULL, @@ -206,10 +208,14 @@ calculate_stability <- function( # ensure we have a matrix pca_matrix <- prepare_pc_matrix(x, pc_name = pc_name) - # check clusters and matrix compatibility + # Extract vector of clusters, ensuring same order as pca_matrix + rownames(cluster_df) <- cluster_df$cell_id + clusters <- cluster_df[rownames(pca_matrix),]$cluster + + # check that we have the right number of clusters after ensuring correct order stopifnot( - "The number of rows in the matrix must equal the length of the clusters vector." = - nrow(pca_matrix) == length(clusters) + "Could not extract clusters from cluster_df due to mismatch with PCA matrix." = + !any(is.na(clusters)) ) # calculate ARI for each cluster result bootstrap replicate From 54810d1a04ad5fb234c7c3c2ae12810dd0705294 Mon Sep 17 00:00:00 2001 From: Stephanie Spielman Date: Fri, 8 Nov 2024 14:45:14 -0500 Subject: [PATCH 2/5] redocument --- man/calculate_stability.Rd | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/man/calculate_stability.Rd b/man/calculate_stability.Rd index 24ddde9..8733278 100644 --- a/man/calculate_stability.Rd +++ b/man/calculate_stability.Rd @@ -6,7 +6,7 @@ \usage{ calculate_stability( x, - clusters, + cluster_df, replicates = 20, seed = NULL, pc_name = NULL, @@ -19,8 +19,10 @@ either a SingleCellExperiment object, a Seurat object, or a matrix where columns are PCs and rows are cells. If a matrix is provided, it must have row names of cell ids (e.g., barcodes).} -\item{clusters}{A vector of cluster ids, typically a numeric factor variable, obtained -by previously clustering the PCs.} +\item{cluster_df}{A data frame that contains at least the columns `cell_id` and +`cluster`. The `cell_id` values should match either the PC matrix row names, +or the SingleCellExperiment/Seurat object cell ids. Typically this will be output from +the `rOpenScPCA::calculate_clusters()` function.} \item{replicates}{Number of bootstrap replicates to perform. Default is 20.} @@ -64,7 +66,7 @@ to "jaccard" to align with common practice in scRNA-seq analysis. # and setting a seed for reproducibility cluster_df <- calculate_clusters(sce_object, seed = 11) # Second, calculate cluster stability using default parameters -stability_df <- calculate_stability(sce_object, cluster_df$clusters, seed = 11) +stability_df <- calculate_stability(sce_object, cluster_df, seed = 11) # First, cluster PCs from a SingleCellExperiment object using default parameters @@ -73,7 +75,7 @@ cluster_df <- calculate_clusters(sce_object, seed = 11) # Second, calculate cluster stability using default parameters and 50 replicates stability_df <- calculate_stability( sce_object, - cluster_df$clusters, + cluster_df, replicates = 50, seed = 11 ) @@ -91,7 +93,7 @@ cluster_df <- calculate_clusters( # for the initial clustering stability_df <- calculate_stability( sce_object, - cluster_df$clusters, + cluster_df, algorithm = "leiden", resolution = 0.1, seed = 11 From 22400d235b43827005936c09cf170aeac21e00ef Mon Sep 17 00:00:00 2001 From: Stephanie Spielman Date: Fri, 8 Nov 2024 14:45:29 -0500 Subject: [PATCH 3/5] update tests --- tests/testthat/test-evaluate-clusters.R | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/tests/testthat/test-evaluate-clusters.R b/tests/testthat/test-evaluate-clusters.R index 83da7ae..cd31251 100644 --- a/tests/testthat/test-evaluate-clusters.R +++ b/tests/testthat/test-evaluate-clusters.R @@ -45,7 +45,7 @@ test_that("calculate_stability works as expected with defaults", { # note that we suppress warnings since this calculation done on fake # test data gives expected warnings about ties during the ARI calculation. suppressWarnings({ - df <- calculate_stability(test_mat, cluster_df$cluster) + df <- calculate_stability(test_mat, cluster_df) }) expected_names <- colnames(cluster_df)[!(colnames(cluster_df) %in% c("cell_id", "cluster"))] @@ -62,7 +62,7 @@ test_that("calculate_stability works as expected with different replicates", { # note that we suppress warnings since this calculation done on fake # test data gives expected warnings about ties during the ARI calculation. suppressWarnings({ - df <- calculate_stability(test_mat, cluster_df$cluster, replicates = 2) + df <- calculate_stability(test_mat, cluster_df, replicates = 2) }) expect_equal(nrow(df), 2) }) @@ -77,7 +77,7 @@ test_that("calculate_stability works as expected with object and pc_name", { suppressWarnings({ df <- calculate_stability( sce, - cluster_df$cluster, + cluster_df, replicates = 2, pc_name = "my_pca" ) @@ -90,11 +90,6 @@ test_that("calculate_stability works as expected with object and pc_name", { test_that("calculate_stability errors as expected", { expect_error({ # mismatched cluster vector length - calculate_stability(test_mat, cluster_df$cluster[1:5]) - }) - - expect_error({ - # cluster_df not a vector - calculate_stability(test_mat, cluster_df) + calculate_stability(test_mat, cluster_df[1:5,]) }) }) From 6cbff3d865827552faf7ede0b3dae6a89ebfcb53 Mon Sep 17 00:00:00 2001 From: Stephanie Spielman Date: Fri, 8 Nov 2024 15:50:07 -0500 Subject: [PATCH 4/5] Update checks and add test --- R/evaluate-clusters.R | 14 ++++++++------ tests/testthat/test-evaluate-clusters.R | 10 +++++++++- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/R/evaluate-clusters.R b/R/evaluate-clusters.R index b308466..4797832 100644 --- a/R/evaluate-clusters.R +++ b/R/evaluate-clusters.R @@ -208,16 +208,18 @@ calculate_stability <- function( # ensure we have a matrix pca_matrix <- prepare_pc_matrix(x, pc_name = pc_name) + # ensure pca matrix and cluster df compatibility + stopifnot( + "The cluster dataframe must have the same number of rows as the PCA matrix." = + nrow(pca_matrix) == nrow(cluster_df), + "Cell ids in the cluster dataframe must match the PCA matrix rownames." = + length(setdiff(rownames(pca_matrix), cluster_df$cell_id)) == 0 + ) + # Extract vector of clusters, ensuring same order as pca_matrix rownames(cluster_df) <- cluster_df$cell_id clusters <- cluster_df[rownames(pca_matrix),]$cluster - # check that we have the right number of clusters after ensuring correct order - stopifnot( - "Could not extract clusters from cluster_df due to mismatch with PCA matrix." = - !any(is.na(clusters)) - ) - # calculate ARI for each cluster result bootstrap replicate all_ari_df <- 1:replicates |> purrr::map( diff --git a/tests/testthat/test-evaluate-clusters.R b/tests/testthat/test-evaluate-clusters.R index cd31251..ea71203 100644 --- a/tests/testthat/test-evaluate-clusters.R +++ b/tests/testthat/test-evaluate-clusters.R @@ -88,8 +88,16 @@ test_that("calculate_stability works as expected with object and pc_name", { test_that("calculate_stability errors as expected", { + + # cluster_df too short expect_error({ - # mismatched cluster vector length calculate_stability(test_mat, cluster_df[1:5,]) }) + + # cluster_df too long + cluster_df_extra <- cluster_df |> + tibble::add_row(cell_id = "extra_barcode") + expect_error({ + calculate_stability(test_mat, cluster_df_extra) + }) }) From ea017de5773a011e6801fb0d0d1b82e894d3cf0e Mon Sep 17 00:00:00 2001 From: Stephanie Spielman Date: Fri, 8 Nov 2024 15:50:17 -0500 Subject: [PATCH 5/5] add .github folder to .Rbuildignore --- .Rbuildignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.Rbuildignore b/.Rbuildignore index 91114bf..3912071 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,2 +1,3 @@ ^.*\.Rproj$ ^\.Rproj\.user$ +^\.github$