From 236ad71120a53edcdc38cf487c0e9e0b1907e918 Mon Sep 17 00:00:00 2001
From: Stephanie Spielman <stephanie.spielman@gmail.com>
Date: Fri, 8 Nov 2024 14:45:00 -0500
Subject: [PATCH 1/5] Update stability function to take a df, and ensure
 correct order

---
 R/evaluate-clusters.R | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/R/evaluate-clusters.R b/R/evaluate-clusters.R
index 6ea5265..b308466 100644
--- a/R/evaluate-clusters.R
+++ b/R/evaluate-clusters.R
@@ -133,8 +133,10 @@ calculate_purity <- function(
 #'   either a SingleCellExperiment object, a Seurat object, or a matrix where columns
 #'   are PCs and rows are cells. If a matrix is provided, it must have row names of cell
 #'   ids (e.g., barcodes).
-#' @param clusters A vector of cluster ids, typically a numeric factor variable, obtained
-#'   by previously clustering the PCs.
+#' @param cluster_df A data frame that contains at least the columns `cell_id` and
+#'  `cluster`. The `cell_id` values should match either the PC matrix row names,
+#'  or the SingleCellExperiment/Seurat object cell ids. Typically this will be output from
+#'  the `rOpenScPCA::calculate_clusters()` function.
 #' @param replicates Number of bootstrap replicates to perform. Default is 20.
 #' @param seed Random seed
 #' @param pc_name Optionally, the name of the PC matrix in the object. Not used if a
@@ -159,7 +161,7 @@ calculate_purity <- function(
 #' # and setting a seed for reproducibility
 #' cluster_df <- calculate_clusters(sce_object, seed = 11)
 #' # Second, calculate cluster stability using default parameters
-#' stability_df <- calculate_stability(sce_object, cluster_df$clusters, seed = 11)
+#' stability_df <- calculate_stability(sce_object, cluster_df, seed = 11)
 #'
 #'
 #' # First, cluster PCs from a SingleCellExperiment object using default parameters
@@ -168,7 +170,7 @@ calculate_purity <- function(
 #' # Second, calculate cluster stability using default parameters and 50 replicates
 #' stability_df <- calculate_stability(
 #'   sce_object,
-#'   cluster_df$clusters,
+#'   cluster_df,
 #'   replicates = 50,
 #'   seed = 11
 #' )
@@ -186,7 +188,7 @@ calculate_purity <- function(
 #' # for the initial clustering
 #' stability_df <- calculate_stability(
 #'   sce_object,
-#'   cluster_df$clusters,
+#'   cluster_df,
 #'   algorithm = "leiden",
 #'   resolution = 0.1,
 #'   seed = 11
@@ -194,7 +196,7 @@ calculate_purity <- function(
 #' }
 calculate_stability <- function(
     x,
-    clusters,
+    cluster_df,
     replicates = 20,
     seed = NULL,
     pc_name = NULL,
@@ -206,10 +208,14 @@ calculate_stability <- function(
   # ensure we have a matrix
   pca_matrix <- prepare_pc_matrix(x, pc_name = pc_name)
 
-  # check clusters and matrix compatibility
+  # Extract vector of clusters, ensuring same order as pca_matrix
+  rownames(cluster_df) <- cluster_df$cell_id
+  clusters <- cluster_df[rownames(pca_matrix),]$cluster
+
+  # check that we have the right number of clusters after ensuring correct order
   stopifnot(
-    "The number of rows in the matrix must equal the length of the clusters vector." =
-      nrow(pca_matrix) == length(clusters)
+    "Could not extract clusters from cluster_df due to mismatch with PCA matrix." =
+      !any(is.na(clusters))
   )
 
   # calculate ARI for each cluster result bootstrap replicate

From 54810d1a04ad5fb234c7c3c2ae12810dd0705294 Mon Sep 17 00:00:00 2001
From: Stephanie Spielman <stephanie.spielman@gmail.com>
Date: Fri, 8 Nov 2024 14:45:14 -0500
Subject: [PATCH 2/5] Redocument calculate_stability() for cluster_df argument

---
 man/calculate_stability.Rd | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/man/calculate_stability.Rd b/man/calculate_stability.Rd
index 24ddde9..8733278 100644
--- a/man/calculate_stability.Rd
+++ b/man/calculate_stability.Rd
@@ -6,7 +6,7 @@
 \usage{
 calculate_stability(
   x,
-  clusters,
+  cluster_df,
   replicates = 20,
   seed = NULL,
   pc_name = NULL,
@@ -19,8 +19,10 @@ either a SingleCellExperiment object, a Seurat object, or a matrix where columns
 are PCs and rows are cells. If a matrix is provided, it must have row names of cell
 ids (e.g., barcodes).}
 
-\item{clusters}{A vector of cluster ids, typically a numeric factor variable, obtained
-by previously clustering the PCs.}
+\item{cluster_df}{A data frame that contains at least the columns `cell_id` and
+`cluster`. The `cell_id` values should match either the PC matrix row names,
+or the SingleCellExperiment/Seurat object cell ids. Typically this will be output from
+the `rOpenScPCA::calculate_clusters()` function.}
 
 \item{replicates}{Number of bootstrap replicates to perform. Default is 20.}
 
@@ -64,7 +66,7 @@ to "jaccard" to align with common practice in scRNA-seq analysis.
 # and setting a seed for reproducibility
 cluster_df <- calculate_clusters(sce_object, seed = 11)
 # Second, calculate cluster stability using default parameters
-stability_df <- calculate_stability(sce_object, cluster_df$clusters, seed = 11)
+stability_df <- calculate_stability(sce_object, cluster_df, seed = 11)
 
 
 # First, cluster PCs from a SingleCellExperiment object using default parameters
@@ -73,7 +75,7 @@ cluster_df <- calculate_clusters(sce_object, seed = 11)
 # Second, calculate cluster stability using default parameters and 50 replicates
 stability_df <- calculate_stability(
   sce_object,
-  cluster_df$clusters,
+  cluster_df,
   replicates = 50,
   seed = 11
 )
@@ -91,7 +93,7 @@ cluster_df <- calculate_clusters(
 # for the initial clustering
 stability_df <- calculate_stability(
   sce_object,
-  cluster_df$clusters,
+  cluster_df,
   algorithm = "leiden",
   resolution = 0.1,
   seed = 11

From 22400d235b43827005936c09cf170aeac21e00ef Mon Sep 17 00:00:00 2001
From: Stephanie Spielman <stephanie.spielman@gmail.com>
Date: Fri, 8 Nov 2024 14:45:29 -0500
Subject: [PATCH 3/5] Update calculate_stability() tests to pass cluster_df

---
 tests/testthat/test-evaluate-clusters.R | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/tests/testthat/test-evaluate-clusters.R b/tests/testthat/test-evaluate-clusters.R
index 83da7ae..cd31251 100644
--- a/tests/testthat/test-evaluate-clusters.R
+++ b/tests/testthat/test-evaluate-clusters.R
@@ -45,7 +45,7 @@ test_that("calculate_stability works as expected with defaults", {
   # note that we suppress warnings since this calculation done on fake
   # test data gives expected warnings about ties during the ARI calculation.
   suppressWarnings({
-    df <- calculate_stability(test_mat, cluster_df$cluster)
+    df <- calculate_stability(test_mat, cluster_df)
   })
 
   expected_names <- colnames(cluster_df)[!(colnames(cluster_df) %in% c("cell_id", "cluster"))]
@@ -62,7 +62,7 @@ test_that("calculate_stability works as expected with different replicates", {
   # note that we suppress warnings since this calculation done on fake
   # test data gives expected warnings about ties during the ARI calculation.
   suppressWarnings({
-    df <- calculate_stability(test_mat, cluster_df$cluster, replicates = 2)
+    df <- calculate_stability(test_mat, cluster_df, replicates = 2)
   })
   expect_equal(nrow(df), 2)
 })
@@ -77,7 +77,7 @@ test_that("calculate_stability works as expected with object and pc_name", {
   suppressWarnings({
     df <- calculate_stability(
       sce,
-      cluster_df$cluster,
+      cluster_df,
       replicates = 2,
       pc_name = "my_pca"
     )
@@ -90,11 +90,6 @@ test_that("calculate_stability works as expected with object and pc_name", {
 test_that("calculate_stability errors as expected", {
   expect_error({
     # mismatched cluster vector length
-    calculate_stability(test_mat, cluster_df$cluster[1:5])
-  })
-
-  expect_error({
-    # cluster_df not a vector
-    calculate_stability(test_mat, cluster_df)
+    calculate_stability(test_mat, cluster_df[1:5,])
   })
 })

From 6cbff3d865827552faf7ede0b3dae6a89ebfcb53 Mon Sep 17 00:00:00 2001
From: Stephanie Spielman <stephanie.spielman@gmail.com>
Date: Fri, 8 Nov 2024 15:50:07 -0500
Subject: [PATCH 4/5] Update checks and add test

---
 R/evaluate-clusters.R                   | 14 ++++++++------
 tests/testthat/test-evaluate-clusters.R | 10 +++++++++-
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/R/evaluate-clusters.R b/R/evaluate-clusters.R
index b308466..4797832 100644
--- a/R/evaluate-clusters.R
+++ b/R/evaluate-clusters.R
@@ -208,16 +208,18 @@ calculate_stability <- function(
   # ensure we have a matrix
   pca_matrix <- prepare_pc_matrix(x, pc_name = pc_name)
 
+  # ensure pca matrix and cluster df compatibility
+  stopifnot(
+    "The cluster dataframe must have the same number of rows as the PCA matrix." =
+      nrow(pca_matrix) == nrow(cluster_df),
+    "Cell ids in the cluster dataframe must match the PCA matrix rownames." =
+      length(setdiff(rownames(pca_matrix), cluster_df$cell_id)) == 0
+  )
+
   # Extract vector of clusters, ensuring same order as pca_matrix
   rownames(cluster_df) <- cluster_df$cell_id
   clusters <- cluster_df[rownames(pca_matrix),]$cluster
 
-  # check that we have the right number of clusters after ensuring correct order
-  stopifnot(
-    "Could not extract clusters from cluster_df due to mismatch with PCA matrix." =
-      !any(is.na(clusters))
-  )
-
   # calculate ARI for each cluster result bootstrap replicate
   all_ari_df <- 1:replicates |>
     purrr::map(
diff --git a/tests/testthat/test-evaluate-clusters.R b/tests/testthat/test-evaluate-clusters.R
index cd31251..ea71203 100644
--- a/tests/testthat/test-evaluate-clusters.R
+++ b/tests/testthat/test-evaluate-clusters.R
@@ -88,8 +88,16 @@ test_that("calculate_stability works as expected with object and pc_name", {
 
 
 test_that("calculate_stability errors as expected", {
+
+  # cluster_df too short
   expect_error({
-    # mismatched cluster vector length
     calculate_stability(test_mat, cluster_df[1:5,])
   })
+
+  # cluster_df too long
+  cluster_df_extra <- cluster_df |>
+    tibble::add_row(cell_id = "extra_barcode")
+  expect_error({
+    calculate_stability(test_mat, cluster_df_extra)
+  })
 })

From ea017de5773a011e6801fb0d0d1b82e894d3cf0e Mon Sep 17 00:00:00 2001
From: Stephanie Spielman <stephanie.spielman@gmail.com>
Date: Fri, 8 Nov 2024 15:50:17 -0500
Subject: [PATCH 5/5] add .github folder to .Rbuildignore

---
 .Rbuildignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.Rbuildignore b/.Rbuildignore
index 91114bf..3912071 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -1,2 +1,3 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
+^\.github$