Merge pull request #6 from AlexsLemonade/sjspielman/5-flexiblize-colnames

Allow users to customize cluster_df column names
AlexsLemonade · Nov 12, 2024 · fae22a6 · fae22a6
2 parents f71a819 + 6fc28e3
commit fae22a6
Show file tree

Hide file tree

Showing 18 changed files with 4,653 additions and 104 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -1,3 +1,5 @@
+^renv$
+^renv\.lock$
 ^.*\.Rproj$
 ^\.Rproj\.user$
 ^\.github$
diff --git a/.Rprofile b/.Rprofile
@@ -0,0 +1 @@
+source("renv/activate.R")
diff --git a/.github/workflows/R-CMD-CHECK.yml b/.github/workflows/R-CMD-CHECK-release.yml
@@ -1,10 +1,12 @@
+# This GHA checks rOpenScPCA using the current versions of its dependencies from CRAN and Bioconductor
+
 on:
-  pull_request:
+  push:
     branches:
       - main
       - feature/*
 
-name: R-CMD-CHECK
+name: R-CMD-CHECK-release
 
 jobs:
   R-CMD-check-renv:
@@ -18,7 +20,6 @@ jobs:
       - name: Set up R
         uses: r-lib/actions/setup-r@v2
         with:
-          r-version: 4.4.0
           use-public-rspm: true
 
       - name: Set up dependencies

diff --git a/.github/workflows/R-CMD-CHECK-renv.yml b/.github/workflows/R-CMD-CHECK-renv.yml
@@ -0,0 +1,43 @@
+# This GHA checks rOpenScPCA using the repository's renv environment which records
+#  dependencies from Bioconductor 3.19, the version we expect rOpenScPCA to be most
+#  commonly used with
+
+on:
+  pull_request:
+    branches:
+      - main
+      - feature/*
+
+name: R-CMD-CHECK-renv
+
+jobs:
+  R-CMD-check-renv:
+    runs-on: ubuntu-22.04
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v4
+
+      - name: Set up R
+        uses: r-lib/actions/setup-r@v2
+        with:
+          r-version: "renv"
+          use-public-rspm: true
+
+      - name: Install additional system dependencies
+        run: |
+          sudo apt-get install -y libcurl4-openssl-dev libglpk40
+
+      - name: Set up renv and install packages
+        uses: r-lib/actions/setup-renv@v2
+
+      - name: Install additional dependencies for testing
+        run: |
+          install.packages("rcmdcheck")
+        shell: Rscript {0}
+
+      - name: Check package
+        uses: r-lib/actions/check-r-package@v2
+        with:
+          args: 'c("--no-manual")'
diff --git a/NAMESPACE b/NAMESPACE
@@ -8,3 +8,4 @@ export(extract_pc_matrix)
 export(sweep_clusters)
 import(SingleCellExperiment)
 import(methods)
+importFrom(stats,setNames)
diff --git a/R/calculate-clusters.R b/R/calculate-clusters.R
@@ -14,13 +14,13 @@
 #'   have row names of cell ids (e.g., barcodes).
 #' @param algorithm Clustering algorithm to use. Must be one of "louvain" (default), "walktrap", or "leiden".
 #' @param weighting Weighting scheme to use. Must be one of "jaccard" (default), "rank", or "number"
-#' @param nn Number of nearest neighbors. Default is 10.
+#' @param nn Number of nearest neighbors. The default is 10.
 #' @param resolution Resolution parameter used by louvain and leiden clustering only. Default is 1.
 #' @param objective_function Leiden-specific parameter for whether to use the Constant Potts Model ("CPM"; default) or "modularity"
 #' @param cluster_args List of additional arguments to pass to the chosen clustering function.
 #'   Only single values for each argument are supported (no vectors or lists).
 #'   See igraph documentation for details on each clustering function: https://igraph.org/r/html/latest
-#' @param threads Number of threads to use. Default is 1.
+#' @param threads Number of threads to use. The default is 1.
 #' @param seed Random seed to set for clustering.
 #' @param pc_name Name of principal components slot in provided object. This argument is only used if a SingleCellExperiment
 #'   or Seurat object is provided. If not provided, the SingleCellExperiment object name will default to "PCA" and the

diff --git a/R/evaluate-clusters.R b/R/evaluate-clusters.R
@@ -7,18 +7,26 @@
 #' @param x Either a matrix of principal components (PCs), or a SingleCellExperiment
 #'  or Seurat object containing PCs. If a matrix is provided, rows should be cells
 #'  and columns should be PCs, and row names should be cell ids (e.g., barcodes).
-#' @param cluster_df A data frame that contains at least the columns `cell_id` and
-#'  `cluster`. The `cell_id` values should match either the PC matrix row names,
-#'  or the SingleCellExperiment/Seurat object cell ids. Typically this will be output from
-#'  the `rOpenScPCA::calculate_clusters()` function.
+#' @param cluster_df A data frame that contains at least two columns: one representing
+#'   unique cell ids, and one containing cluster assignments. By default, these columns
+#'   should be named `cell_id` and `cluster` respectively, though this can be customized.
+#'   The cell id column's values should match either the PC matrix row names, or the
+#'   SingleCellExperiment/Seurat object cell ids. Typically this data frame will be
+#'   output from the `rOpenScPCA::calculate_clusters()` function.
+#' @param cluster_col The name of the column in `cluster_df` which contains cluster
+#'   assignments. The default is "cluster".
+#' @param cell_id_col The name of the column in `cluster_df` which contains unique cell
+#'   ids. The default is "cell_id".
 #' @param pc_name Optionally, the name of the PC matrix in the object. Not used if a
-#'   matrix is provided. If the name is not provided, the name "PCA" is assumed for
+#'   matrix is provided. If the name is not provided, the name "PCA" is used for
 #'   SingleCellExperiment objects, and "pca" for Seurat objects.
 #'
-#' @return Expanded `cluster_df` data frame with these additional columns:
-#' - `silhouette_width`, the cell's silhouette width
-#' - `other`, the closest cluster other than the one to which the given cell was assigned
-#' For more information, see documentation for `bluster::approxSilhouette()`
+#' @return Expanded `cluster_df` data frame with additional columns `silhouette_width`,
+#'   the cell's silhouette width, and `silhouette_other`, the closest cluster other
+#'   than the one to which the given cell was assigned. For more information,
+#'   see documentation for `bluster::approxSilhouette()`.
+#'
+#' @importFrom stats setNames
 #'
 #' @export
 #' @examples
@@ -29,25 +37,33 @@
 calculate_silhouette <- function(
     x,
     cluster_df,
+    cluster_col = "cluster",
+    cell_id_col = "cell_id",
     pc_name = NULL) {
   x <- prepare_pc_matrix(x, pc_name)
 
-  expected_df_names <- c("cell_id", "cluster")
+  expected_df_names <- c(cell_id_col, cluster_col)
   stopifnot(
-    "Expected columns 'cell_id' and 'cluster' in the cluster_df." =
+    "The cell id column name must be length of 1." = length(cell_id_col) == 1,
+    "The cluster column name must be length of 1." = length(cluster_col) == 1,
+    "Expected columns not present in cluster_df." =
       all(expected_df_names %in% colnames(cluster_df))
   )
 
   silhouette_df <- x |>
-    bluster::approxSilhouette(cluster_df$cluster) |>
+    bluster::approxSilhouette(cluster_df[[cluster_col]]) |>
     as.data.frame() |>
+    # note this gets renamed later as needed
     tibble::rownames_to_column("cell_id") |>
-    dplyr::rename("silhouette_width" = "width")
+    dplyr::rename(
+      "silhouette_width" = "width",
+      "silhouette_other" = "other"
+    )
 
-  # join with cluster_df in this direction, so that columns in
-  # cluster_df come first
+  # join with cluster_df in this direction, so that columns in cluster_df come first
+  # ensure provided cluster_df column names are used as well
   silhouette_df <- cluster_df |>
-    dplyr::inner_join(silhouette_df, by = c("cell_id", "cluster"))
+    dplyr::inner_join(silhouette_df, by = setNames(c("cell_id", "cluster"), c(cell_id_col, cluster_col)))
 
   return(silhouette_df)
 }
@@ -63,19 +79,25 @@ calculate_silhouette <- function(
 #' @param x Either a matrix of principal components (PCs), or a SingleCellExperiment
 #'  or Seurat object containing PCs. If a matrix is provided, rows should be cells
 #'  and columns should be PCs, and row names should be cell ids (e.g., barcodes).
-#' @param cluster_df A data frame that contains at least the columns `cell_id` and
-#'  `cluster`. The `cell_id` values should match either the PC matrix row names,
-#'  or the SingleCellExperiment/Seurat object cell ids. Typically this will be output from
-#'  the `rOpenScPCA::calculate_clusters()` function.
+#' @param cluster_df A data frame that contains at least two columns: one representing
+#'   unique cell ids, and one containing cluster assignments. By default, these columns
+#'   should be named `cell_id` and `cluster` respectively, though this can be customized.
+#'   The cell id column's values should match either the PC matrix row names, or the
+#'   SingleCellExperiment/Seurat object cell ids. Typically this data frame will be
+#'   output from the `rOpenScPCA::calculate_clusters()` function.
+#' @param cluster_col The name of the column in `cluster_df` which contains cluster
+#'   assignments. The default is "cluster".
+#' @param cell_id_col The name of the column in `cluster_df` which contains unique cell
+#'   ids. The default is "cell_id".
 #' @param pc_name Optionally, the name of the PC matrix in the object. Not used if a
-#'   matrix is provided. If the name is not provided, the name "PCA" is assumed for
+#'   matrix is provided. If the name is not provided, the name "PCA" is used for
 #'   SingleCellExperiment objects, and "pca" for Seurat objects.
 #' @param ... Additional arguments to pass to `bluster::neighborPurity()`
 #'
-#' @return Expanded `cluster_df` data frame with these additional columns:
-#' - `purity`, the cell's neighborhood purity
-#' - `maximum`, the cluster with the highest proportion of observations neighboring the given cell.
-#' For more information, see documentation for `bluster::neighborPurity()`
+#' @return Expanded `cluster_df` data frame with the additional columns `purity`,
+#'   the cell's neighborhood purity, and `maximum_neighbor`, the cluster with the
+#'   highest proportion of observations neighboring the given cell. For more
+#'   information, see documentation for `bluster::neighborPurity()`.
 #'
 #' @export
 #' @examples
@@ -86,25 +108,32 @@ calculate_silhouette <- function(
 calculate_purity <- function(
     x,
     cluster_df,
+    cluster_col = "cluster",
+    cell_id_col = "cell_id",
     pc_name = NULL,
     ...) {
   x <- prepare_pc_matrix(x, pc_name)
 
-  expected_df_names <- c("cell_id", "cluster")
+  expected_df_names <- c(cell_id_col, cluster_col)
   stopifnot(
-    "Expected columns 'cell_id' and 'cluster' in cluster_df." =
+    "The cell id column name must be length of 1." = length(cell_id_col) == 1,
+    "The cluster column name must be length of 1." = length(cluster_col) == 1,
+    "Expected columns not present in cluster_df." =
       all(expected_df_names %in% colnames(cluster_df))
   )
 
   purity_df <- x |>
-    bluster::neighborPurity(cluster_df$cluster) |>
+    bluster::neighborPurity(cluster_df[[cluster_col]], ...) |>
     as.data.frame() |>
-    tibble::rownames_to_column("cell_id")
+    tibble::rownames_to_column(cell_id_col) |>
+    dplyr::rename(
+      "maximum_neighbor" = "maximum"
+    )
 
   # join with cluster_df in this direction, so that columns in
   # cluster_df come first
   purity_df <- cluster_df |>
-    dplyr::inner_join(purity_df, by = c("cell_id"))
+    dplyr::inner_join(purity_df, by = cell_id_col)
 
   return(purity_df)
 }
@@ -133,15 +162,23 @@ calculate_purity <- function(
 #'   either a SingleCellExperiment object, a Seurat object, or a matrix where columns
 #'   are PCs and rows are cells. If a matrix is provided, it must have row names of cell
 #'   ids (e.g., barcodes).
-#' @param cluster_df A data frame that contains at least the columns `cell_id` and
-#'  `cluster`. The `cell_id` values should match either the PC matrix row names,
-#'  or the SingleCellExperiment/Seurat object cell ids. Typically this will be output from
-#'  the `rOpenScPCA::calculate_clusters()` function.
-#' @param replicates Number of bootstrap replicates to perform. Default is 20.
+#' @param cluster_df A data frame that contains at least two columns: one representing
+#'   unique cell ids, and one containing cluster assignments. By default, these columns
+#'   should be named `cell_id` and `cluster` respectively, though this can be customized.
+#'   The cell id column's values should match either the PC matrix row names, or the
+#'   SingleCellExperiment/Seurat object cell ids. Typically this data frame will be
+#'   output from the `rOpenScPCA::calculate_clusters()` function.
+#' @param cluster_col The name of the column in `cluster_df` which contains cluster
+#'   assignments. The default is "cluster".
+#' @param cell_id_col The name of the column in `cluster_df` which contains unique cell
+#'   ids. The default is "cell_id".
+#' @param replicates Number of bootstrap replicates to perform. The default is 20.
 #' @param seed Random seed
 #' @param pc_name Optionally, the name of the PC matrix in the object. Not used if a
-#'   matrix is provided. If the name is not provided, the name "PCA" is assumed for
+#'   matrix is provided. If the name is not provided, the name "PCA" is used for
 #'   SingleCellExperiment objects, and "pca" for Seurat objects.
+#' @param warnings Whether warnings raised while calculating bootstrap clusters
+#'   (typically warnings about distance ties) should be printed. When FALSE, all
+#'   warnings from `calculate_clusters()` are muffled. The default is FALSE.
 #' @param ... Additional arguments to pass to `calculate_clusters()` which calculates
 #'   bootstrapped clusters. Usually, these will be the same arguments used to generate
 #'   the original clusters.
@@ -197,9 +234,12 @@ calculate_purity <- function(
 calculate_stability <- function(
     x,
     cluster_df,
+    cluster_col = "cluster",
+    cell_id_col = "cell_id",
     replicates = 20,
     seed = NULL,
     pc_name = NULL,
+    warnings = FALSE,
     ...) {
   if (!is.null(seed)) {
     set.seed(seed)
@@ -213,12 +253,21 @@ calculate_stability <- function(
     "The cluster dataframe must have the same number of rows as the PCA matrix." =
       nrow(pca_matrix) == nrow(cluster_df),
     "Cell ids in the cluster dataframe must match the PCA matrix rownames." =
-      length(setdiff(rownames(pca_matrix), cluster_df$cell_id)) == 0
+      length(setdiff(rownames(pca_matrix), cluster_df[[cell_id_col]])) == 0
+  )
+
+  # Check columns
+  expected_df_names <- c(cell_id_col, cluster_col)
+  stopifnot(
+    "The cell id column name must be length of 1." = length(cell_id_col) == 1,
+    "The cluster column name must be length of 1." = length(cluster_col) == 1,
+    "Expected columns not present in cluster_df." =
+      all(expected_df_names %in% colnames(cluster_df))
   )
 
   # Extract vector of clusters, ensuring same order as pca_matrix
-  rownames(cluster_df) <- cluster_df$cell_id
-  clusters <- cluster_df[rownames(pca_matrix),]$cluster
+  rownames(cluster_df) <- cluster_df[[cell_id_col]]
+  clusters <- cluster_df[rownames(pca_matrix), cluster_col]
 
   # calculate ARI for each cluster result bootstrap replicate
   all_ari_df <- 1:replicates |>
@@ -228,14 +277,18 @@ calculate_stability <- function(
         resampled_pca <- pca_matrix[sample_cells, ]
         original_clusters <- clusters[sample_cells]
 
-        resampled_df <- calculate_clusters(resampled_pca, ...)
+        resampled_df <- withCallingHandlers(
+          calculate_clusters(resampled_pca, ...),
+          warning = \(w) {if(!warnings) tryInvokeRestart("muffleWarning")}
+        )
 
         ari <- pdfCluster::adj.rand.index(resampled_df$cluster, original_clusters)
 
         # return df with ari and clustering parameters
         ari_df <- resampled_df |>
           dplyr::slice(1) |>
-          dplyr::select(!c("cell_id", "cluster")) |>
+          # these column names come directly out of calculate_clusters; they are not customized
+          dplyr::select(!dplyr::all_of(c("cell_id", "cluster"))) |>
           dplyr::mutate(
             # define this variable here to ensure it's numeric
             replicate = i,

diff --git a/R/sweep-clusters.R b/R/sweep-clusters.R
@@ -24,14 +24,14 @@
 #' Provide a vector of unique values to vary this parameter. Options include "jaccard" (default),
 #'   "rank", or "number"
 #' @param nn Number of nearest neighbors to consider when sweeping parameters.
-#'  Provide a vector of unique values to vary this parameter. Default is 10.
+#'  Provide a vector of unique values to vary this parameter. The default is 10.
 #' @param resolution Resolution parameter used by louvain and leiden clustering only.
-#'   Provide a vector of unique values to vary this parameter. Default is 1.
+#'   Provide a vector of unique values to vary this parameter. The default is 1.
 #' @param objective_function Leiden-specific parameter for whether to use the
 #'   Constant Potts Model ("CPM"; default) or "modularity". Provide a vector of unique values
 #'   to vary this parameter.
 #' @param seed Random seed to set for clustering.
-#' @param threads Number of threads to use. Default is 1.
+#' @param threads Number of threads to use. The default is 1.
 #' @param pc_name Name of principal components slot in provided object. This argument is only used
 #'   if a SingleCellExperiment or Seurat object is provided. If not provided, the SingleCellExperiment
 #'   object name will default to "PCA" and the Seurat object name will default to "pca".

diff --git a/man/calculate_clusters.Rd b/man/calculate_clusters.Rd