Merge pull request #1 from AlexsLemonade/sjspielman/initial-package-code

Port rOpenScPCA package from OpenScPCA-analysis
AlexsLemonade · Nov 6, 2024 · fc78444 · fc78444
2 parents fb1976c + f009074
commit fc78444
Show file tree

Hide file tree

Showing 21 changed files with 1,576 additions and 1 deletion.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -0,0 +1,2 @@
+^.*\.Rproj$
+^\.Rproj\.user$
diff --git a/.github/workflows/R-CMD-CHECK.yml b/.github/workflows/R-CMD-CHECK.yml
@@ -0,0 +1,33 @@
+on:
+  pull_request:
+    branches:
+      - main
+      - feature/*
+
+name: R-CMD-CHECK
+
+jobs:
+  R-CMD-check-renv:
+    runs-on: ubuntu-22.04
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v4
+
+      - name: Set up R
+        uses: r-lib/actions/setup-r@v2
+        with:
+          r-version: 4.4.0
+          use-public-rspm: true
+
+      - name: Set up dependencies
+        uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: any::rcmdcheck
+          needs: check
+
+      - name: Check package
+        uses: r-lib/actions/check-r-package@v2
+        with:
+          args: 'c("--no-manual")'
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,42 @@
+Package: rOpenScPCA
+Type: Package
+Title: Utility Functions for OpenScPCA Project Analysis Modules
+Version: 0.1.0
+Authors@R: c(
+    person(c("Stephanie", "J."), "Spielman",
+           email = "[email protected]",
+           comment = list(ORCID = "0000-0002-9090-4788"),
+           role = c("aut", "cre")),
+    person(c("Joshua", "A."), "Shapiro",
+           email = "[email protected]",
+           comment = list(ORCID = "0000-0002-6224-0347"),
+           role = c("aut"))
+    )
+Maintainer: Stephanie J. Spielman <[email protected]>
+Description: This package contains utility functions that support single-cell RNA-seq
+    analysis in R-based OpenScPCA analysis module code.
+License: BSD_3_clause + file LICENSE
+Encoding: UTF-8
+LazyData: true
+Suggests:
+    testthat (>= 3.0.0),
+    scater,
+    Seurat,
+    splatter
+Config/testthat/edition: 3
+RoxygenNote: 7.3.2
+Imports:
+    BiocParallel,
+    bluster (>= 1.14),
+    dplyr,
+    methods,
+    pdfCluster,
+    purrr,
+    SingleCellExperiment,
+    tibble,
+    tidyr
+biocViews:
+    GeneExpression,
+    Transcriptomics,
+    SingleCell,
+    Clustering
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,10 @@
+# Generated by roxygen2: do not edit by hand
+
+export(calculate_clusters)
+export(calculate_purity)
+export(calculate_silhouette)
+export(calculate_stability)
+export(extract_pc_matrix)
+export(sweep_clusters)
+import(SingleCellExperiment)
+import(methods)
diff --git a/R/calculate-clusters.R b/R/calculate-clusters.R
@@ -0,0 +1,249 @@
+#' Calculate graph-based clusters from a provided matrix
+#'
+#' This function is provided to simplify application of bluster package clustering functions on OpenScPCA data.
+#' In particular, this function runs bluster::clusterRows() with the bluster::NNGraphParam() function on a
+#' principal components matrix, provided either directly or via single-cell object.
+#' Note that defaults for some arguments may differ from the bluster::NNGraphParam() defaults.
+#' Specifically, the clustering algorithm defaults to "louvain" and the weighting scheme to "jaccard"
+#' to align with common practice in scRNA-seq analysis.
+#'
+#' @import methods
+#'
+#' @param x An object containing PCs that clustering can be performed in. This can be either a SingleCellExperiment
+#'   object, a Seurat object, or a matrix where columns are PCs and rows are cells. If a matrix is provided, it must
+#'   have row names of cell ids (e.g., barcodes).
+#' @param algorithm Clustering algorithm to use. Must be one of "louvain" (default), "walktrap", or "leiden".
+#' @param weighting Weighting scheme to use. Must be one of "jaccard" (default), "rank", or "number"
+#' @param nn Number of nearest neighbors. Default is 10.
+#' @param resolution Resolution parameter used by louvain and leiden clustering only. Default is 1.
+#' @param objective_function Leiden-specific parameter for whether to use the Constant Potts Model ("CPM"; default) or "modularity"
+#' @param cluster_args List of additional arguments to pass to the chosen clustering function.
+#'   Only single values for each argument are supported (no vectors or lists).
+#'   See igraph documentation for details on each clustering function: https://igraph.org/r/html/latest
+#' @param threads Number of threads to use. Default is 1.
+#' @param seed Random seed to set for clustering.
+#' @param pc_name Name of principal components slot in provided object. This argument is only used if a SingleCellExperiment
+#'   or Seurat object is provided. If not provided, the SingleCellExperiment object name will default to "PCA" and the
+#'   Seurat object name will default to "pca".
+#'
+#' @return A data frame of cluster results with columns `cell_id` and `cluster`. Additional columns represent algorithm parameters
+#'   and include at least: `algorithm`, `weighting`, and `nn`. Louvain and leiden clustering will also include `resolution`, and
+#'   leiden clustering will further include `objective_function`.
+#'
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' # cluster PCs from a SingleCellExperiment object using default parameters and
+#' # a random seed for reproducibility
+#' cluster_df <- calculate_clusters(sce_object, seed = 11)
+#'
+#' # cluster PCs from a SingleCellExperiment object using default parameters and 4 threads
+#' cluster_df <- calculate_clusters(sce_object, threads = 4, seed = 11)
+#'
+#' # cluster PCs from a Seurat object using default parameters
+#' cluster_df <- calculate_clusters(seurat_object, seed = 11)
+#'
+#' # cluster directly from a matrix using default parameters
+#' cluster_df <- calculate_clusters(pca_matrix, seed = 11)
+#'
+#' # cluster directly from a matrix using the leiden algorithm with a resolution of 0.1
+#' cluster_df <- calculate_clusters(
+#'   pca_matrix,
+#'   algorithm = "leiden",
+#'   resolution = 0.1,
+#'   seed = 11
+#' )
+#'
+#' # cluster directly from a matrix using the leiden algorithm with 3 iterations
+#' cluster_df <- calculate_clusters(
+#'   pca_matrix,
+#'   algorithm = "leiden",
+#'   cluster_args = list(n_iterations = 3),
+#'   seed = 11
+#' )
+#' }
+calculate_clusters <- function(
+    x,
+    algorithm = c("louvain", "walktrap", "leiden"),
+    weighting = c("jaccard", "rank", "number"),
+    nn = 10,
+    resolution = 1, # louvain or leiden
+    objective_function = c("CPM", "modularity"), # leiden only
+    cluster_args = list(),
+    threads = 1,
+    seed = NULL,
+    pc_name = NULL) {
+  if (!is.null(seed)) {
+    set.seed(seed)
+  }
+
+  # check and prepare matrix
+  pca_matrix <- prepare_pc_matrix(x, pc_name = pc_name)
+
+  # Check input arguments
+  stopifnot(
+    "`resolution` must be numeric" = is.numeric(resolution),
+    "`nn` must be numeric" = is.numeric(nn),
+    "`threads` must be numeric" = is.numeric(threads)
+  )
+
+  algorithm <- match.arg(algorithm)
+  weighting <- match.arg(weighting)
+  objective_function <- match.arg(objective_function)
+
+  if (length(cluster_args)) {
+    stopifnot(
+      "`cluster_args` must be a named list." = is.list(cluster_args) && !("" %in% allNames(cluster_args)),
+      "`cluster_args` elements must all have only a single value" = all(sapply(cluster_args, length) == 1)
+    )
+  }
+
+  # Update cluster_args list with parameters that users can directly provide
+  # note that clusterRows throws an error if this list has a param not used by the chosen algorithm
+  if (algorithm %in% c("louvain", "leiden")) {
+    cluster_args$resolution <- resolution
+  }
+  if (algorithm == "leiden") {
+    cluster_args$objective_function <- objective_function
+  }
+
+  if (threads > 1) {
+    bp_param <- BiocParallel::MulticoreParam(threads)
+  } else {
+    bp_param <- BiocParallel::SerialParam()
+  }
+
+
+  # Perform clustering
+  clusters <- bluster::clusterRows(
+    pca_matrix,
+    bluster::NNGraphParam(
+      k = nn,
+      type = weighting,
+      cluster.fun = algorithm,
+      cluster.args = cluster_args,
+      BPPARAM = bp_param
+    )
+  )
+
+  # Transform results into a table and return
+  cluster_df <- data.frame(
+    cell_id = rownames(pca_matrix),
+    cluster = clusters,
+    algorithm = algorithm,
+    weighting = weighting,
+    nn = nn
+  )
+
+  # Add in cluster_args if it has parameters to include
+  if (length(cluster_args) != 0) {
+    cluster_df <- cluster_df |>
+      dplyr::bind_cols(
+        data.frame(cluster_args)
+      )
+  }
+
+  return(cluster_df)
+}
+
+
+
+#' Extract a principal components (PC) matrix from either a SingleCellExperiment
+#' or a Seurat object.
+#'
+#' This function first determines if the provided object is a SingleCellExperiment or
+#' Seurat object, and then extract the PC matrix. If no name for the PC matrix is provided,
+#' this function will assume the name of "PCA" for SingleCellExperiment objects, and
+#' "pca" for Seurat objects.
+#'
+#' @import SingleCellExperiment
+#' @import methods
+#'
+#' @param sc_object Either a SingleCellExperiment or Seurat object
+#' @param pc_name Optionally, the name of the PC matrix in the object. If this is
+#' not provided, the name "PCA" is assumed for SingleCellExperiment objects, and
+#' "pca" for Seurat objects.
+#'
+#' @return PC matrix with row names
+#'
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' # extract PC matrix from SCE object, assuming default name "PCA"
+#' pca_matrix <- extract_pc_matrix(sce_object)
+#'
+#' # extract PC matrix from SCE object with non-default name "PCA_MAT"
+#' pca_matrix <- extract_pc_matrix(sce_object, pc_name = "PCA_MAT")
+#'
+#' # extract PC matrix from Seurat object, assuming default name "pca"
+#' pca_matrix <- extract_pc_matrix(seurat_object)
+#' }
+extract_pc_matrix <- function(sc_object, pc_name = NULL) {
+  # default PC names for each type of object to use if
+  #  pc_name is NULL
+  default_sce <- "PCA"
+  default_seurat <- "pca"
+
+  if (is(sc_object, "SingleCellExperiment")) {
+    pc_name <- ifelse(is.null(pc_name), default_sce, pc_name)
+    stopifnot(
+      "Could not find a PC matrix in the SingleCellExperiment object." =
+        pc_name %in% reducedDimNames(sc_object)
+    )
+
+    pca_matrix <- reducedDim(sc_object, pc_name)
+  } else if (is(sc_object, "Seurat")) {
+    pc_name <- ifelse(is.null(pc_name), default_seurat, pc_name)
+    stopifnot(
+      "Seurat package must be installed to process a Seurat object" =
+        requireNamespace("Seurat", quietly = TRUE),
+      "Could not find a PC matrix in the Seurat object." =
+        pc_name %in% names(sc_object@reductions)
+    )
+
+    pca_matrix <- Seurat::Embeddings(
+      sc_object,
+      reduction = pc_name
+    )
+  } else {
+    stop("You must provide a SingleCellExperiment or Seurat object.")
+  }
+
+  # Ensure row names are present
+  stopifnot(
+    "The extracted PCA matrix does not have row names." = is.character(rownames(pca_matrix))
+  )
+
+  return(pca_matrix)
+}
+
+
+
+
+
+
+#' Helper function to check and/or extract a matrix of PCs from a given object
+#'
+#' @param x Either a matrix of principal components (PCs), or a SingleCellExperiment
+#'  or Seurat object containing PCs. If a matrix is provided, rows should be cells
+#'  and columns should be PCs, and row names should be cell ids (e.g., barcodes).
+#' @param pc_name Optionally, the name of the PC matrix in the object. Not used for
+#'   matrices. If this is not provided, the name "PCA" is assumed for
+#'   SingleCellExperiment objects, and "pca" for Seurat objects.
+#'
+#' @return A matrix of PCs with row names representing cell ids
+prepare_pc_matrix <- function(x, pc_name = NULL) {
+  if (any(class(x) %in% c("matrix", "Matrix"))) {
+    stopifnot(
+      "The matrix must have row names representing cell ids, e.g. barcodes." = is.character(rownames(x))
+    )
+  } else if (is(x, "SingleCellExperiment") || is(x, "Seurat")) {
+    x <- extract_pc_matrix(x, pc_name = pc_name)
+  } else {
+    stop("The first argument should be one of: a SingleCellExperiment object, a Seurat object, or a matrix with row names.")
+  }
+
+  return(x)
+}