Skip to content

Commit

Permalink
Merge pull request #1 from AlexsLemonade/sjspielman/initial-package-code
Browse files Browse the repository at this point in the history
Port rOpenScPCA package from OpenScPCA-analysis
  • Loading branch information
sjspielman authored Nov 6, 2024
2 parents fb1976c + f009074 commit fc78444
Show file tree
Hide file tree
Showing 21 changed files with 1,576 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
^.*\.Rproj$
^\.Rproj\.user$
33 changes: 33 additions & 0 deletions .github/workflows/R-CMD-CHECK.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
on:
pull_request:
branches:
- main
- feature/*

name: R-CMD-CHECK

jobs:
R-CMD-check-renv:
runs-on: ubuntu-22.04
env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
steps:
- name: Checkout repo
uses: actions/checkout@v4

- name: Set up R
uses: r-lib/actions/setup-r@v2
with:
r-version: 4.4.0
use-public-rspm: true

- name: Set up dependencies
uses: r-lib/actions/setup-r-dependencies@v2
with:
extra-packages: any::rcmdcheck
needs: check

- name: Check package
uses: r-lib/actions/check-r-package@v2
with:
args: 'c("--no-manual")'
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.Rproj.user
.Rhistory
.RData
.Ruserdata
42 changes: 42 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
Package: rOpenScPCA
Type: Package
Title: Utility Functions for OpenScPCA Project Analysis Modules
Version: 0.1.0
Authors@R: c(
person(c("Stephanie", "J."), "Spielman",
email = "[email protected]",
comment = list(ORCID = "0000-0002-9090-4788"),
role = c("aut", "cre")),
person(c("Joshua", "A."), "Shapiro",
email = "[email protected]",
comment = list(ORCID = "0000-0002-6224-0347"),
role = c("aut"))
)
Maintainer: Stephanie J. Spielman <[email protected]>
Description: This package contains utility functions that support single-cell RNA-seq
analysis in R-based OpenScPCA analysis module code.
License: BSD_3_clause + file LICENSE
Encoding: UTF-8
LazyData: true
Suggests:
testthat (>= 3.0.0),
scater,
Seurat,
splatter
Config/testthat/edition: 3
RoxygenNote: 7.3.2
Imports:
BiocParallel,
bluster (>= 1.14),
dplyr,
methods,
pdfCluster,
purrr,
SingleCellExperiment,
tibble,
tidyr
biocViews:
GeneExpression,
Transcriptomics,
SingleCell,
Clustering
10 changes: 10 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Generated by roxygen2: do not edit by hand

export(calculate_clusters)
export(calculate_purity)
export(calculate_silhouette)
export(calculate_stability)
export(extract_pc_matrix)
export(sweep_clusters)
import(SingleCellExperiment)
import(methods)
249 changes: 249 additions & 0 deletions R/calculate-clusters.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,249 @@
#' Calculate graph-based clusters from a provided matrix
#'
#' This function is provided to simplify application of bluster package clustering functions on OpenScPCA data.
#' In particular, this function runs bluster::clusterRows() with the bluster::NNGraphParam() function on a
#' principal components matrix, provided either directly or via single-cell object.
#' Note that defaults for some arguments may differ from the bluster::NNGraphParam() defaults.
#' Specifically, the clustering algorithm defaults to "louvain" and the weighting scheme to "jaccard"
#' to align with common practice in scRNA-seq analysis.
#'
#' @import methods
#'
#' @param x An object containing PCs that clustering can be performed in. This can be either a SingleCellExperiment
#' object, a Seurat object, or a matrix where columns are PCs and rows are cells. If a matrix is provided, it must
#' have row names of cell ids (e.g., barcodes).
#' @param algorithm Clustering algorithm to use. Must be one of "louvain" (default), "walktrap", or "leiden".
#' @param weighting Weighting scheme to use. Must be one of "jaccard" (default), "rank", or "number"
#' @param nn Number of nearest neighbors. Default is 10.
#' @param resolution Resolution parameter used by louvain and leiden clustering only. Default is 1.
#' @param objective_function Leiden-specific parameter for whether to use the Constant Potts Model ("CPM"; default) or "modularity"
#' @param cluster_args List of additional arguments to pass to the chosen clustering function.
#' Only single values for each argument are supported (no vectors or lists).
#' See igraph documentation for details on each clustering function: https://igraph.org/r/html/latest
#' @param threads Number of threads to use. Default is 1.
#' @param seed Random seed to set for clustering.
#' @param pc_name Name of principal components slot in provided object. This argument is only used if a SingleCellExperiment
#' or Seurat object is provided. If not provided, the SingleCellExperiment object name will default to "PCA" and the
#' Seurat object name will default to "pca".
#'
#' @return A data frame of cluster results with columns `cell_id` and `cluster`. Additional columns represent algorithm parameters
#' and include at least: `algorithm`, `weighting`, and `nn`. Louvain and leiden clustering will also include `resolution`, and
#' leiden clustering will further include `objective_function`.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # cluster PCs from a SingleCellExperiment object using default parameters and
#' # a random seed for reproducibility
#' cluster_df <- calculate_clusters(sce_object, seed = 11)
#'
#' # cluster PCs from a SingleCellExperiment object using default parameters and 4 threads
#' cluster_df <- calculate_clusters(sce_object, threads = 4, seed = 11)
#'
#' # cluster PCs from a Seurat object using default parameters
#' cluster_df <- calculate_clusters(seurat_object, seed = 11)
#'
#' # cluster directly from a matrix using default parameters
#' cluster_df <- calculate_clusters(pca_matrix, seed = 11)
#'
#' # cluster directly from a matrix using the leiden algorithm with a resolution of 0.1
#' cluster_df <- calculate_clusters(
#' pca_matrix,
#' algorithm = "leiden",
#' resolution = 0.1,
#' seed = 11
#' )
#'
#' # cluster directly from a matrix using the leiden algorithm with 3 iterations
#' cluster_df <- calculate_clusters(
#' pca_matrix,
#' algorithm = "leiden",
#' cluster_args = list(n_iterations = 3),
#' seed = 11
#' )
#' }
calculate_clusters <- function(
x,
algorithm = c("louvain", "walktrap", "leiden"),
weighting = c("jaccard", "rank", "number"),
nn = 10,
resolution = 1, # louvain or leiden
objective_function = c("CPM", "modularity"), # leiden only
cluster_args = list(),
threads = 1,
seed = NULL,
pc_name = NULL) {
if (!is.null(seed)) {
set.seed(seed)
}

# check and prepare matrix
pca_matrix <- prepare_pc_matrix(x, pc_name = pc_name)

# Check input arguments
stopifnot(
"`resolution` must be numeric" = is.numeric(resolution),
"`nn` must be numeric" = is.numeric(nn),
"`threads` must be numeric" = is.numeric(threads)
)

algorithm <- match.arg(algorithm)
weighting <- match.arg(weighting)
objective_function <- match.arg(objective_function)

if (length(cluster_args)) {
stopifnot(
"`cluster_args` must be a named list." = is.list(cluster_args) && !("" %in% allNames(cluster_args)),
"`cluster_args` elements must all have only a single value" = all(sapply(cluster_args, length) == 1)
)
}

# Update cluster_args list with parameters that users can directly provide
# note that clusterRows throws an error if this list has a param not used by the chosen algorithm
if (algorithm %in% c("louvain", "leiden")) {
cluster_args$resolution <- resolution
}
if (algorithm == "leiden") {
cluster_args$objective_function <- objective_function
}

if (threads > 1) {
bp_param <- BiocParallel::MulticoreParam(threads)
} else {
bp_param <- BiocParallel::SerialParam()
}


# Perform clustering
clusters <- bluster::clusterRows(
pca_matrix,
bluster::NNGraphParam(
k = nn,
type = weighting,
cluster.fun = algorithm,
cluster.args = cluster_args,
BPPARAM = bp_param
)
)

# Transform results into a table and return
cluster_df <- data.frame(
cell_id = rownames(pca_matrix),
cluster = clusters,
algorithm = algorithm,
weighting = weighting,
nn = nn
)

# Add in cluster_args if it has parameters to include
if (length(cluster_args) != 0) {
cluster_df <- cluster_df |>
dplyr::bind_cols(
data.frame(cluster_args)
)
}

return(cluster_df)
}



#' Extract a principal components (PC) matrix from either a SingleCellExperiment
#' or a Seurat object.
#'
#' This function first determines if the provided object is a SingleCellExperiment or
#' Seurat object, and then extract the PC matrix. If no name for the PC matrix is provided,
#' this function will assume the name of "PCA" for SingleCellExperiment objects, and
#' "pca" for Seurat objects.
#'
#' @import SingleCellExperiment
#' @import methods
#'
#' @param sc_object Either a SingleCellExperiment or Seurat object
#' @param pc_name Optionally, the name of the PC matrix in the object. If this is
#' not provided, the name "PCA" is assumed for SingleCellExperiment objects, and
#' "pca" for Seurat objects.
#'
#' @return PC matrix with row names
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # extract PC matrix from SCE object, assuming default name "PCA"
#' pca_matrix <- extract_pc_matrix(sce_object)
#'
#' # extract PC matrix from SCE object with non-default name "PCA_MAT"
#' pca_matrix <- extract_pc_matrix(sce_object, pc_name = "PCA_MAT")
#'
#' # extract PC matrix from Seurat object, assuming default name "pca"
#' pca_matrix <- extract_pc_matrix(seurat_object)
#' }
extract_pc_matrix <- function(sc_object, pc_name = NULL) {
# default PC names for each type of object to use if
# pc_name is NULL
default_sce <- "PCA"
default_seurat <- "pca"

if (is(sc_object, "SingleCellExperiment")) {
pc_name <- ifelse(is.null(pc_name), default_sce, pc_name)
stopifnot(
"Could not find a PC matrix in the SingleCellExperiment object." =
pc_name %in% reducedDimNames(sc_object)
)

pca_matrix <- reducedDim(sc_object, pc_name)
} else if (is(sc_object, "Seurat")) {
pc_name <- ifelse(is.null(pc_name), default_seurat, pc_name)
stopifnot(
"Seurat package must be installed to process a Seurat object" =
requireNamespace("Seurat", quietly = TRUE),
"Could not find a PC matrix in the Seurat object." =
pc_name %in% names(sc_object@reductions)
)

pca_matrix <- Seurat::Embeddings(
sc_object,
reduction = pc_name
)
} else {
stop("You must provide a SingleCellExperiment or Seurat object.")
}

# Ensure row names are present
stopifnot(
"The extracted PCA matrix does not have row names." = is.character(rownames(pca_matrix))
)

return(pca_matrix)
}






#' Helper function to check and/or extract a matrix of PCs from a given object
#'
#' @param x Either a matrix of principal components (PCs), or a SingleCellExperiment
#' or Seurat object containing PCs. If a matrix is provided, rows should be cells
#' and columns should be PCs, and row names should be cell ids (e.g., barcodes).
#' @param pc_name Optionally, the name of the PC matrix in the object. Not used for
#' matrices. If this is not provided, the name "PCA" is assumed for
#' SingleCellExperiment objects, and "pca" for Seurat objects.
#'
#' @return A matrix of PCs with row names representing cell ids
prepare_pc_matrix <- function(x, pc_name = NULL) {
if (any(class(x) %in% c("matrix", "Matrix"))) {
stopifnot(
"The matrix must have row names representing cell ids, e.g. barcodes." = is.character(rownames(x))
)
} else if (is(x, "SingleCellExperiment") || is(x, "Seurat")) {
x <- extract_pc_matrix(x, pc_name = pc_name)
} else {
stop("The first argument should be one of: a SingleCellExperiment object, a Seurat object, or a matrix with row names.")
}

return(x)
}
Loading

0 comments on commit fc78444

Please sign in to comment.