AlexsLemonade · jashapiro · Dec 3, 2024 · Nov 25, 2024 · Nov 25, 2024 · Nov 25, 2024
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -5,3 +5,4 @@
 ^\.github$
 ^\.lintr$
 ^\.pre-commit-config.yaml$
+^data-raw$
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -16,9 +16,10 @@ repos:
         exclude: '\.Rd'
 
   - repo: https://github.com/crate-ci/typos
-    rev: typos-dict-v0.11.34
+    rev: v1.28.1
     hooks:
       - id: typos
+        exclude: '\.nb\.html'
 
   - repo: https://github.com/gitleaks/gitleaks
     rev: v8.21.2
@@ -39,4 +40,4 @@ repos:
       - id: no-browser-statement
       - id: no-debug-statement
       - id: deps-in-desc
-        exclude: 'docker/.*|renv/.*'
+        exclude: 'docker/.*|renv/.*|data-raw/.*'
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -41,3 +41,5 @@ biocViews:
     Transcriptomics,
     SingleCell,
     Clustering
+Depends:
+    R (>= 4.0)
diff --git a/R/convert-gene-ids.R b/R/convert-gene-ids.R
@@ -1,54 +1,91 @@
-#' Convert Ensembl gene ids to gene symbols based on an ScPCA SingleCellExperiment object
+#' Convert Ensembl gene ids to gene symbols based on reference gene lists
 #'
 #' The SingleCellExperiment objects produced as part of ScPCA are indexed by
 #' Ensembl gene ids, as those are more stable than gene symbols. However,
-#' for many applications gene symbols are useful. This function provides a
-#' simple and consistent conversion of Ensembl gene ids to gene symbols based on
-#' the `gene_symbol` column that is present in the row data of ScPCA
-#' SingleCellExperiment objects.
+#' for many applications gene symbols are useful. This function provides
+#' simple conversion of Ensembl gene ids to gene symbols based on either the
+#' ScPCA reference gene list or a 10x reference gene list as used by Cell Ranger.
+#' Alternatively, a SingleCellExperiment object with gene ids and gene symbols
+#' stored in the row data (as those provided by ScPCA) can be used as the reference.
 #'
-#' For this function, the SingleCellExperiment object must contain a `gene_ids`
-#' column containing Ensembl gene ids and a `gene_symbol` column containing gene
-#' symbols. If any gene ids are not found or if the gene symbol is not defined,
-#' the input gene id is returned, unless the `leave_na` is set to `TRUE`.
+#' The gene symbols can either be made unique (as would be done if read in by Seurat)
+#' or left as is.
 #'
 #'
 #' @param ensembl_ids A character vector of Ensembl gene ids to translate to
 #'  gene symbols.
-#' @param sce A SingleCellExperiment object containing gene ids and gene symbols
-#'  to use for translation.
-#' @param leave_na logical indicating whether to leave NA values in the output.
-#'  Default is `FALSE`
+#' @param reference The reference gene list to use for translation. One of `scpca`,
+#'   `10x2020`, `10x2024`. The `scpca` reference is the default.
+#' @param sce A SingleCellExperiment object to use as a reference for gene symbols.
+#'  If provided, the `reference` argument will be ignored. The `sce` object must
+#'  include columns with the names `gene_ids` (containing Ensembl ids) and
+#'  `gene_symbol` (containing the symbols) to use for conversion.
+#' @param unique Whether to use unique gene symbols, as would be done if
+#'  data had been read in with gene symbols by Seurat. Default is FALSE.
+#' @param leave_na Whether to leave NA values in the output vector.
+#' If FALSE, any missing values will be replaced with the input ensembl_id value.
+#' Default is FALSE.
 #'
 #' @return A vector of gene symbols corresponding to the input Ensembl ids.
 #' @export
 #'
 #' @import SingleCellExperiment
 #'
 #' @examples
-#' \dontrun{
 #' # convert a set of Ensembl ids to gene symbols
-#' # using a SingleCellExperiment reference
 #' ensembl_ids <- c("ENSG00000141510", "ENSG00000134323")
-#' gene_symbols <- ensembl_to_symbol(ensembl_ids, sce)
+#' gene_symbols <- ensembl_to_symbol(ensembl_ids)
 #' gene_symbols
 #' ### [1] "TP53" "MYCN"
+#'
+#' # convert a set of Ensembl ids to gene symbols using the 10x2020 reference
+#' gene_symbols_10x2020 <- ensembl_to_symbol(ensembl_ids, reference = "10x2020")
+#'
+#' \dontrun{
+#' # convert a set of Ensembl ids to gene symbols using an SCE for reference
+#' gene_symbols_sce <- ensembl_to_symbol(ensembl_ids, sce = sce)
 #' }
-ensembl_to_symbol <- function(ensembl_ids, sce, leave_na = FALSE) {
+#'
+ensembl_to_symbol <- function(
+    ensembl_ids,
+    reference = c("scpca", "10x2020", "10x2024"),
+    sce = NULL,
+    unique = FALSE,
+    leave_na = FALSE) {
+  reference <- match.arg(reference)
   stopifnot(
-    "`sce` must be a SingleCellExperiment object ." = is(sce, "SingleCellExperiment"),
     "`ensembl_ids` must be a character vector." = is.character(ensembl_ids),
+    "`sce` must be a SingleCellExperiment object or NULL." = is.null(sce) || inherits(sce, "SingleCellExperiment"),
     "`sce` must contain both a `gene_ids` and `gene_symbol` column in the row data." =
-      all(c("gene_ids", "gene_symbol") %in% names(rowData(sce))),
-    "`leave_na` must be TRUE or FALSE." = is.logical(leave_na)
+      is.null(sce) || all(c("gene_ids", "gene_symbol") %in% names(rowData(sce))),
+    "`unique` must be TRUE or FALSE." = is.logical(unique) && !is.na(unique),
+    "`leave_na` must be TRUE or FALSE." = is.logical(leave_na) && !is.na(leave_na)
   )
 
-  id_match <- match(ensembl_ids, rowData(sce)$gene_ids)
-  gene_symbols <- rowData(sce)[id_match, "gene_symbol"]
+  if (is.null(sce)) {
+    # build the symbol column name
+    symbol_column <- paste0("gene_symbol_", reference, ifelse(unique, "_unique", ""))
+    # get the gene symbols
+    id_match <- match(ensembl_ids, rOpenScPCA::scpca_gene_reference$gene_ids)
+    gene_symbols <- rOpenScPCA::scpca_gene_reference[id_match, symbol_column]
+  } else {
+    message("Using the provided SingleCellExperiment object for gene symbol conversion.")
+    all_symbols <- rowData(sce)$gene_symbol
+    if (unique) {
+      all_symbols[!is.na(all_symbols)] <- make.unique(all_symbols[!is.na(all_symbols)])
+    }
+    gene_symbols <- all_symbols[match(ensembl_ids, rowData(sce)$gene_ids)]
+  }
 
   missing_symbols <- is.na(gene_symbols)
+  if (all(missing_symbols)) {
+    warning(
+      "None of the input ids have corresponding gene symbols.",
+      " You may want to check the reference and input ids."
+    )
+  }
   if (!leave_na && any(missing_symbols)) {
-    warning("Not all `ensembl_ids` values have corresponding gene symbols, using input ids for missing values.")
+    warning("Not all input ids have corresponding gene symbols, using input ids for missing values.")
     gene_symbols[missing_symbols] <- ensembl_ids[missing_symbols]
   }
 
@@ -62,17 +99,28 @@ ensembl_to_symbol <- function(ensembl_ids, sce, leave_na = FALSE) {
 #' for many applications gene symbols are useful. This function converts the
 #' row names (indexes) of a SingleCellExperiment object to gene symbols based on the
 #' `gene_symbol` column that is present in the row data of ScPCA SingleCellExperiment objects.
+#' It is also possible to use an alternative reference, such as the default ScPCA
+#' reference gene sets or the reference gene sets provided by 10x Genomics for
+#' use with Cell Ranger. Values for the 10x-provided 2020 and 2024 references
+#' are available.
+#'
+#' By default, duplicate gene symbols are left as is, but can be made unique
+#' (as would be done by Seurat) by setting the `unique` argument to TRUE.
 #'
 #' Internal data structures such as the list of highly variable genes and the
 #' rotation matrix for the PCA are also updated to use gene symbols, if present
 #' (and not disabled by the `convert_hvg` and `convert_pca` arguments).
 #'
-#' Note that using this function will result in non-unique row ids as no
-#' de-duplication is currently performed.
 #'
 #' @param sce A SingleCellExperiment object containing gene ids and gene symbols.
+#' @param reference The reference gene list for conversion. One of `sce`, `scpca`,
+#' `10x2020`, or `10x2024`. If `sce` (the default) the internal row data is used.
+#' @param unique Whether to use unique gene symbols, as would be done if
+#' data had been read in with gene symbols by Seurat. Default is FALSE.
 #' @param convert_hvg Logical indicating whether to convert highly variable genes to gene symbols.
+#'  Default is TRUE.
 #' @param convert_pca Logical indicating whether to convert PCA rotation matrix to gene symbols.
+#'  Default is TRUE.
 #'
 #' @return A SingleCellExperiment object with row names set as gene symbols.
 #' @export
@@ -84,22 +132,45 @@ ensembl_to_symbol <- function(ensembl_ids, sce, leave_na = FALSE) {
 #' \dontrun{
 #' # convert a SingleCellExperiment object to use gene symbols
 #' symbol_sce <- sce_to_symbols(sce)
+#'
+#' # convert a SingleCellExperiment object, making the gene symbols unique
+#' symbol_sce <- sce_to_symbols(sce, unique = TRUE)
+#'
+#' # convert a SingleCellExperiment object to use gene symbols with the 10x2020 reference
+#' symbol_sce <- sce_to_symbols(sce, reference = "10x2020")
 #' }
-sce_to_symbols <- function(sce, convert_hvg = TRUE, convert_pca = TRUE) {
+#'
+sce_to_symbols <- function(
+    sce,
+    reference = c("sce", "scpca", "10x2020", "10x2024"),
+    unique = FALSE,
+    convert_hvg = TRUE,
+    convert_pca = TRUE) {
+  reference <- match.arg(reference)
   stopifnot(
     "`sce` must be a SingleCellExperiment object." = is(sce, "SingleCellExperiment"),
-    "`sce` must contain both a `gene_ids` and `gene_symbol` column in the row data." =
-      all(c("gene_ids", "gene_symbol") %in% names(rowData(sce)))
+    "`sce` must contain both `gene_ids` and `gene_symbol` columns in the row data if it is being used as a reference." =
+      reference != "sce" || all(c("gene_ids", "gene_symbol") %in% names(rowData(sce)))
   )
-  row_ids <- rowData(sce)$gene_symbol
-  # set Ensembl ids as original ids for later translations
-  names(row_ids) <- rowData(sce)$gene_ids
 
-  missing_ids <- is.na(row_ids)
-  if (any(missing_ids)) {
-    warning("Not all rows have gene symbols, using Ensembl ids for missing values.")
-    row_ids[missing_ids] <- names(row_ids)[missing_ids]
+  # get ensembl ids, either from gene_ids column if present or from the row names as a fallback
+  if ("gene_ids" %in% names(rowData(sce))) {
+    ensembl_ids <- rowData(sce)$gene_ids
+  } else {
+    ensembl_ids <- rownames(sce)
   }
+  if (!all(grepl("^ENS(...)?G\\d+$", ensembl_ids))) {
+    stop("gene_ids and/or row names are not all Ensembl ids, cannot convert to gene symbols.")
+  }
+
+  if (reference == "sce") {
+    gene_symbols <- ensembl_to_symbol(ensembl_ids, sce = sce, unique = unique)
+  } else {
+    gene_symbols <- ensembl_to_symbol(ensembl_ids, reference = reference, unique = unique)
+  }
+  row_ids <- gene_symbols
+  # set Ensembl ids as original ids for later translations
+  names(row_ids) <- ensembl_ids
 
   rownames(sce) <- row_ids
 

diff --git a/R/data.R b/R/data.R
@@ -0,0 +1,23 @@
+# nolint start
+
+#' Conversion table for Ensembl gene ids and gene symbols
+#'
+#'
+#' This table includes the mapping for gene ids to gene symbols from different
+#' reference genome gene annotation lists.
+#' Included are the original gene symbols and the modified gene symbols that
+#' are created when running the `make.unique()` function, as is done when
+#' importing data using Seurat.
+#'
+#' @format
+#' A data frame with 7 columns:
+#' \describe{
+#'   \item{gene_ids}{Ensembl gene ids}
+#'   \item{gene_symbol_scpca}{The gene symbol used in the ScPCA reference}
+#'   \item{gene_symbol_scpca_unique}{The gene symbol from the ScPCA reference, after `make.unique()`}
+#'   \item{gene_symbol_10x2020}{The gene symbol used in the 2020 10x human genome reference}
+#'   \item{gene_symbol_10x2020_unique}{The gene symbol from the 2020 10x human genome reference, after `make.unique()`}
+#'   \item{gene_symbol_10x2024}{The gene symbol used in the 2024 10x human genome reference}
+#'   \item{gene_symbol_10x2024_unique}{The gene symbol from the 2024 10x human genome reference, after `make.unique()`}
+#' }
+"scpca_gene_reference"
diff --git a/data-raw/README.md b/data-raw/README.md
@@ -0,0 +1,11 @@
+This directory contains scripts to download and preprocess data used within the rOpenScPCA package.
+There are also notebooks that explore some of the prepared datasets for exploration.
+
+## Building Gene References
+
+- The `build_gene_references.R` script creates a table of Ensembl id to gene symbol references.
+The initial table of gene ids and gene symbols is extracted from an example ScPCA-formatted SCE object (`rOpenScPCA/tests/testthat/data/scpca_sce.rds`).
+This is combined with the reference information extracted from example 10x Genomics datasets.
+The full table is saved in `data/scpca_gene_reference.rda` (overwriting any previous file).
+
+- The `explore_gene_references.Rmd` notebook explores the resulting gene references table a bit to see where some of the conversions differ among different references.
diff --git a/data-raw/build_gene_reference.R b/data-raw/build_gene_reference.R
@@ -0,0 +1,76 @@
+#!/usr/env Rscript
+
+# This script downloads and stores reference gene lists from 10x Genomics
+# datasets, creating a final table of Ensembl ids and corresponding symbols
+# including the symbols that would be created on read by Seurat by application
+# of the make.unique() function.
+suppressPackageStartupMessages({
+  library(SingleCellExperiment)
+  library(dplyr)
+})
+
+## Download the reference data -------------------------------------------------
+
+
+# Read in the test data ScPCA SCE object and extract the row data:
+genes_scpca <- readRDS(here::here("tests", "testthat", "data", "scpca_sce.rds")) |>
+  rowData() |>
+  as.data.frame() |>
+  # Use Ensembl ID if gene symbol is missing, then make unique
+  mutate(
+    gene_symbol_scpca = ifelse(is.na(gene_symbol), gene_ids, gene_symbol),
+    gene_symbol_scpca_unique = make.unique(gene_symbol_scpca)
+  ) |>
+  select(gene_ids, gene_symbol_scpca, gene_symbol_scpca_unique)
+
+
+# Download and read in a 2020 10x reference dataset and extract the gene symbols.
+# Note that the 2020 Cell Ranger reference does not use Ensembl gene IDs for
+# missing symbols, but the 2024 reference does.
+url_10x2020 <- "https://cf.10xgenomics.com/samples/cell-exp/7.0.1/SC3pv3_GEX_Human_PBMC/SC3pv3_GEX_Human_PBMC_filtered_feature_bc_matrix.h5" # nolint
+temp_10x2020 <- tempfile(fileext = ".h5")
+download.file(url_10x2020, temp_10x2020, mode = "wb")
+on.exit(unlink(temp_10x2020), add = TRUE) # delete when done
+
+genes_10x2020 <- DropletUtils::read10xCounts(temp_10x2020) |>
+  rowData() |>
+  as.data.frame() |>
+  filter(Type == "Gene Expression") |>
+  rename(
+    gene_ids = ID,
+    gene_symbol_10x2020 = Symbol
+  ) |>
+  # add unique column
+  mutate(gene_symbol_10x2020_unique = make.unique(gene_symbol_10x2020)) |>
+  select(gene_ids, gene_symbol_10x2020, gene_symbol_10x2020_unique)
+
+# Download and read in a 2024 10x reference dataset and extract the gene symbols.
+url_10x2024 <- "https://cf.10xgenomics.com/samples/cell-exp/9.0.0/5k_Human_Donor2_PBMC_3p_gem-x_5k_Human_Donor2_PBMC_3p_gem-x/5k_Human_Donor2_PBMC_3p_gem-x_5k_Human_Donor2_PBMC_3p_gem-x_count_sample_filtered_feature_bc_matrix.h5" # nolint
+temp_10x2024 <- tempfile(fileext = ".h5")
+download.file(url_10x2024, temp_10x2024, mode = "wb")
+on.exit(unlink(temp_10x2024), add = TRUE) # delete when done
+
+
+genes_10x2024 <- DropletUtils::read10xCounts(temp_10x2024) |>
+  rowData() |>
+  as.data.frame() |>
+  filter(Type == "Gene Expression") |>
+  rename(
+    gene_ids = ID,
+    gene_symbol_10x2024 = Symbol
+  ) |>
+  mutate(gene_symbol_10x2024_unique = make.unique(gene_symbol_10x2024)) |>
+  select(gene_ids, gene_symbol_10x2024, gene_symbol_10x2024_unique)
+
+# Join the gene lists ----------------------------------------------------------
+scpca_gene_reference <- genes_scpca |>
+  full_join(genes_10x2020, by = "gene_ids") |>
+  full_join(genes_10x2024, by = "gene_ids")
+
+## Add the table to package data -----------------------------------------------
+usethis::use_data(
+  scpca_gene_reference,
+  version = 3,
+  overwrite = TRUE,
+  compress = "xz"
+)