AlexsLemonade · jashapiro · Dec 3, 2024 · Nov 25, 2024 · Nov 25, 2024 · Nov 25, 2024
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -5,3 +5,4 @@
 ^\.github$
 ^\.lintr$
 ^\.pre-commit-config.yaml$
+^data-raw$
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -16,9 +16,10 @@ repos:
         exclude: '\.Rd'
 
   - repo: https://github.com/crate-ci/typos
-    rev: typos-dict-v0.11.34
+    rev: v1.28.1
     hooks:
       - id: typos
+        exclude: '\.nb\.html'
 
   - repo: https://github.com/gitleaks/gitleaks
     rev: v8.21.2
@@ -39,4 +40,4 @@ repos:
       - id: no-browser-statement
       - id: no-debug-statement
       - id: deps-in-desc
-        exclude: 'docker/.*|renv/.*'
+        exclude: 'docker/.*|renv/.*|data-raw/.*'
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -41,3 +41,5 @@ biocViews:
     Transcriptomics,
     SingleCell,
     Clustering
+Depends:
+    R (>= 4.0)
diff --git a/R/convert-gene-ids.R b/R/convert-gene-ids.R
@@ -1,54 +1,72 @@
-#' Convert Ensembl gene ids to gene symbols based on an ScPCA SingleCellExperiment object
+#' Convert Ensembl gene ids to gene symbols based on reference gene lists
 #'
 #' The SingleCellExperiment objects produced as part of ScPCA are indexed by
 #' Ensembl gene ids, as those are more stable than gene symbols. However,
-#' for many applications gene symbols are useful. This function provides a
-#' simple and consistent conversion of Ensembl gene ids to gene symbols based on
-#' the `gene_symbol` column that is present in the row data of ScPCA
-#' SingleCellExperiment objects.
+#' for many applications gene symbols are useful. This function provides
+#' simple conversion of Ensembl gene ids to gene symbols based on either the
+#' ScPCA reference gene list or a 10x reference gene list as used by Cell Ranger.
 #'
-#' For this function, the SingleCellExperiment object must contain a `gene_ids`
-#' column containing Ensembl gene ids and a `gene_symbol` column containing gene
-#' symbols. If any gene ids are not found or if the gene symbol is not defined,
-#' the input gene id is returned, unless the `leave_na` is set to `TRUE`.
+#' The gene symbols can either be made unique (as would be done if read in by Seurat)
+#' or left as is.
 #'
 #'
 #' @param ensembl_ids A character vector of Ensembl gene ids to translate to
 #'  gene symbols.
-#' @param sce A SingleCellExperiment object containing gene ids and gene symbols
-#'  to use for translation.
-#' @param leave_na logical indicating whether to leave NA values in the output.
-#'  Default is `FALSE`
+#' @param reference The reference gene list to use for translation. One of `scpca`,
+#'   `10x2020`, `10x2024`. The `scpca` reference is the default.
+#' @param sce A SingleCellExperiment object to use as a reference for gene symbols.
+#'  If provided, the `reference` argument will be ignored. The `sce` object must
+#'  include columns with the names `gene_ids` and `gene_symbol` to use for conversion.
+#' @param unique Whether to use unique gene symbols, as would be done if
+#'  data had been read in with gene symbols by Seurat. Default is FALSE.
+#' @param leave_na Whether to leave NA values in the output vector.
+#' If FALSE, any missing values will be replaced with the input ensembl_id value.
+#' Default is FALSE.
 #'
 #' @return A vector of gene symbols corresponding to the input Ensembl ids.
 #' @export
 #'
 #' @import SingleCellExperiment
 #'
 #' @examples
-#' \dontrun{
 #' # convert a set of Ensembl ids to gene symbols
-#' # using a SingleCellExperiment reference
 #' ensembl_ids <- c("ENSG00000141510", "ENSG00000134323")
-#' gene_symbols <- ensembl_to_symbol(ensembl_ids, sce)
+#' gene_symbols <- ensembl_to_symbol(ensembl_ids)
 #' gene_symbols
 #' ### [1] "TP53" "MYCN"
-#' }
-ensembl_to_symbol <- function(ensembl_ids, sce, leave_na = FALSE) {
+#'
+ensembl_to_symbol <- function(
+    ensembl_ids,
+    reference = c("scpca", "10x2020", "10x2024"),
+    sce = NULL,
+    unique = FALSE,
+    leave_na = FALSE) {
+  reference <- match.arg(reference)
   stopifnot(
-    "`sce` must be a SingleCellExperiment object ." = is(sce, "SingleCellExperiment"),
     "`ensembl_ids` must be a character vector." = is.character(ensembl_ids),
+    "`sce` must be a SingleCellExperiment object or NULL." = is.null(sce) || inherits(sce, "SingleCellExperiment"),
     "`sce` must contain both a `gene_ids` and `gene_symbol` column in the row data." =
-      all(c("gene_ids", "gene_symbol") %in% names(rowData(sce))),
-    "`leave_na` must be TRUE or FALSE." = is.logical(leave_na)
+      is.null(sce) || all(c("gene_ids", "gene_symbol") %in% names(rowData(sce))),
+    "`unique` must be TRUE or FALSE." = is.logical(unique) && !is.na(unique),
+    "`leave_na` must be TRUE or FALSE." = is.logical(leave_na) && !is.na(leave_na)
   )
 
-  id_match <- match(ensembl_ids, rowData(sce)$gene_ids)
-  gene_symbols <- rowData(sce)[id_match, "gene_symbol"]
+  if (is.null(sce)) {
+    # build the symbol column name
+    symbol_column <- paste0("gene_symbol_", reference, ifelse(unique, "_unique", ""))
+    # get the gene symbols
+    gene_symbols <- scpca_gene_reference[match(ensembl_ids, scpca_gene_reference$gene_ids), symbol_column]
+  } else {
+    all_symbols <- rowData(sce)$gene_symbol
+    if (unique) {
+      all_symbols[!is.na(all_symbols)] <- make.unique(all_symbols[!is.na(all_symbols)])
+    }
+    gene_symbols <- all_symbols[match(ensembl_ids, rowData(sce)$gene_ids)]
+  }
 
   missing_symbols <- is.na(gene_symbols)
   if (!leave_na && any(missing_symbols)) {
-    warning("Not all `ensembl_ids` values have corresponding gene symbols, using input ids for missing values.")
+    warning("Not all input ids have corresponding gene symbols, using input ids for missing values.")
     gene_symbols[missing_symbols] <- ensembl_ids[missing_symbols]
   }
 
@@ -62,6 +80,9 @@ ensembl_to_symbol <- function(ensembl_ids, sce, leave_na = FALSE) {
 #' for many applications gene symbols are useful. This function converts the
 #' row names (indexes) of a SingleCellExperiment object to gene symbols based on the
 #' `gene_symbol` column that is present in the row data of ScPCA SingleCellExperiment objects.
+#' It is also possible to use an alternative reference, such as the reference gene sets
+#' provided by 10x Genomics and used for Cell Ranger. Values for the 10x-provided 2020 and
+#' 2024 references are available.
 #'
 #' Internal data structures such as the list of highly variable genes and the
 #' rotation matrix for the PCA are also updated to use gene symbols, if present
@@ -71,6 +92,10 @@ ensembl_to_symbol <- function(ensembl_ids, sce, leave_na = FALSE) {
 #' de-duplication is currently performed.
 #'
 #' @param sce A SingleCellExperiment object containing gene ids and gene symbols.
+#' @param reference The reference gene list for conversion. One of `sce`, `scpca`,
+#' `10x2020`, or `10x2024`. If `sce` (the default) the internal row data is used.
+#' @param unique Whether to use unique gene symbols, as would be done if
+#' data had been read in with gene symbols by Seurat. Default is FALSE.
 #' @param convert_hvg Logical indicating whether to convert highly variable genes to gene symbols.
 #' @param convert_pca Logical indicating whether to convert PCA rotation matrix to gene symbols.
 #'
@@ -85,22 +110,38 @@ ensembl_to_symbol <- function(ensembl_ids, sce, leave_na = FALSE) {
 #' # convert a SingleCellExperiment object to use gene symbols
 #' symbol_sce <- sce_to_symbols(sce)
 #' }
-sce_to_symbols <- function(sce, convert_hvg = TRUE, convert_pca = TRUE) {
+sce_to_symbols <- function(
+    sce,
+    reference = c("sce", "scpca", "10x2020", "10x2024"),
+    unique = FALSE,
+    convert_hvg = TRUE,
+    convert_pca = TRUE) {
+  reference <- match.arg(reference)
   stopifnot(
     "`sce` must be a SingleCellExperiment object." = is(sce, "SingleCellExperiment"),
-    "`sce` must contain both a `gene_ids` and `gene_symbol` column in the row data." =
-      all(c("gene_ids", "gene_symbol") %in% names(rowData(sce)))
+    "`sce` must contain both `gene_ids` and `gene_symbol` columns in the row data if it is being used as a reference." =
+      reference != "sce" || all(c("gene_ids", "gene_symbol") %in% names(rowData(sce)))
   )
-  row_ids <- rowData(sce)$gene_symbol
-  # set Ensembl ids as original ids for later translations
-  names(row_ids) <- rowData(sce)$gene_ids
 
-  missing_ids <- is.na(row_ids)
-  if (any(missing_ids)) {
-    warning("Not all rows have gene symbols, using Ensembl ids for missing values.")
-    row_ids[missing_ids] <- names(row_ids)[missing_ids]
+  # get ensembl ids, either from gene_ids column if present or from the row names as a fallback
+  if ("gene_ids" %in% names(rowData(sce))) {
+    ensembl_ids <- rowData(sce)$gene_ids
+  } else {
+    ensembl_ids <- rownames(sce)
+  }
+  if (!all(startsWith(ensembl_ids, "ENSG"))) {
+    stop("gene_ids and/or row names are not all Ensembl ids, cannot convert to gene symbols.")
   }
 
+  if (reference == "sce") {
+    gene_symbols <- ensembl_to_symbol(ensembl_ids, sce = sce, unique = unique)
+  } else {
+    gene_symbols <- ensembl_to_symbol(ensembl_ids, reference = reference, unique = unique)
+  }
+  row_ids <- gene_symbols
+  # set Ensembl ids as original ids for later translations
+  names(row_ids) <- ensembl_ids
+
   rownames(sce) <- row_ids
 
   if (convert_hvg && "highly_variable_genes" %in% names(metadata(sce))) {

diff --git a/R/sysdata.rda b/R/sysdata.rda
diff --git a/data-raw/build_gene_reference.Rmd b/data-raw/build_gene_reference.Rmd
@@ -0,0 +1,179 @@
+---
+title: "Gene ID Conversion data setup"
+output: html_notebook
+---
+
+
+This notebook is to set up the reference data for gene ID conversion and ultimately deduplication.
+
+```{r}
+suppressPackageStartupMessages({
+  library(SingleCellExperiment)
+  library(dplyr)
+})
+```
+
+
+Read in the test data ScPCA SCE object and extract the row data:
+
+```{r}
+genes_scpca <- readRDS(here::here("tests", "testthat", "data", "scpca_sce.rds")) |>
+  rowData() |>
+  as.data.frame() |>
+  # Use Ensembl ID if gene symbol is missing, then make unique
+  mutate(
+    gene_symbol_scpca = ifelse(is.na(gene_symbol), gene_ids, gene_symbol),
+    gene_symbol_scpca_unique = make.unique(gene_symbol_scpca)
+  ) |>
+  select(gene_ids, gene_symbol_scpca, gene_symbol_scpca_unique)
+```
+
+
+Download and read in a 2020 10x reference dataset and extract the gene symbols.
+Note that the 2020 Cell Ranger reference does not use Ensembl gene IDs for missing symbols, but the 2024 reference does.
+
+```{r}
+# Download data
+url_10x2020 <- "https://cf.10xgenomics.com/samples/cell-exp/7.0.1/SC3pv3_GEX_Human_PBMC/SC3pv3_GEX_Human_PBMC_filtered_feature_bc_matrix.h5" # nolint
+temp_10x2020 <- tempfile(fileext = ".h5")
+download.file(url_10x2020, temp_10x2020, mode = "wb")
+on.exit(unlink(temp_10x2020), add = TRUE) # delete when done
+```
+
+```{r}
+# Read in the data
+genes_10x2020 <- DropletUtils::read10xCounts(temp_10x2020) |>
+  rowData() |>
+  as.data.frame() |>
+  filter(Type == "Gene Expression") |>
+  rename(
+    gene_ids = ID,
+    gene_symbol_10x2020 = Symbol
+  ) |>
+  # Use Ensembl ID if gene symbol is missing, then make unique
+  mutate(gene_symbol_10x2020_unique = make.unique(gene_symbol_10x2020)) |>
+  select(gene_ids, gene_symbol_10x2020, gene_symbol_10x2020_unique)
+```
+Download and read in a 2024 10x reference dataset and extract the gene symbols:
+
+```{r}
+# Download data
+url_10x2024 <- "https://cf.10xgenomics.com/samples/cell-exp/9.0.0/5k_Human_Donor2_PBMC_3p_gem-x_5k_Human_Donor2_PBMC_3p_gem-x/5k_Human_Donor2_PBMC_3p_gem-x_5k_Human_Donor2_PBMC_3p_gem-x_count_sample_filtered_feature_bc_matrix.h5" # nolint
+temp_10x2024 <- tempfile(fileext = ".h5")
+download.file(url_10x2024, temp_10x2024, mode = "wb")
+on.exit(unlink(temp_10x2024), add = TRUE) # delete when done
+```
+
+
+```{r}
+genes_10x2024 <- DropletUtils::read10xCounts(temp_10x2024) |>
+  rowData() |>
+  as.data.frame() |>
+  filter(Type == "Gene Expression") |>
+  rename(
+    gene_ids = ID,
+    gene_symbol_10x2024 = Symbol
+  ) |>
+  # Use Ensembl ID if gene symbol is missing, then make unique
+  mutate(gene_symbol_10x2024_unique = make.unique(gene_symbol_10x2024)) |>
+  select(gene_ids, gene_symbol_10x2024, gene_symbol_10x2024_unique)
+```
+
+## Merge the gene symbol tables
+
+```{r}
+scpca_gene_reference <- genes_scpca |>
+  full_join(genes_10x2020, by = "gene_ids") |>
+  full_join(genes_10x2024, by = "gene_ids")
+```
+
+
+## Look at some stats for the tables
+
+We expect that many of the Ensembl IDs in the ScPCA data will not have corresponding values in the 10x references, but some of the 10x references have Ensembl IDs not present in the ScPCA data.
-We expect that many of the Ensembl IDs in the ScPCA data will not have corresponding values in the 10x references, but some of the 10x references have Ensembl IDs not present in the ScPCA data.
+We expect that many of the Ensembl IDs in the ScPCA data will not have corresponding values in the 10x references, but some of the 10x references will have Ensembl IDs not present in the ScPCA data.
-We expect that many of the Ensembl IDs in the ScPCA data will not have corresponding values in the 10x references, but some of the 10x references have Ensembl IDs not present in the ScPCA data.
+We expect that many of the Ensembl IDs in the ScPCA data will not have corresponding values in the 10x references, but some of the 10x references will have Ensembl IDs not present in the ScPCA data.
+Let's look at the number of missing gene symbols in the ScPCA data:
+
+```{r}
+scpca_missing <- scpca_gene_reference |>
+  filter(is.na(gene_symbol_scpca))
+```
+
+How many of the gene symbols in the 10x 2020 reference are in the ScPCA data, but perhaps with a different Ensembl ID?
+
+```{r}
+symbols_10x2020 <- scpca_missing$gene_symbol_10x2020[!is.na(scpca_missing$gene_symbol_10x2020)]
+sum(symbols_10x2020 %in% genes_scpca$gene_symbol)
+```
+
+What are they?
+
+```{r}
+# Match the genes that are in the 10x 2020 reference to the ScPCA table on gene symbol
+# Only for those where the gene id is missing in the ScPCA table
+genes_10x2020 |>
+  filter(
+    gene_symbol_10x2020 %in% genes_scpca$gene_symbol,
+    !gene_ids %in% genes_scpca$gene_ids
+  ) |>
+  left_join(genes_scpca, by = c("gene_symbol_10x2020" = "gene_symbol_scpca"))
+```
+
+Looks like these are all cases where the gene symbol has been updated.
+In only one case is this a gene where something was not unique, which is `LINC01505`.
+This gene seems to have simply been removed in later revisions, so I think we can safely not worry about it.
+
+```{r}
+scpca_gene_reference |>
+  filter(gene_symbol_10x2020 == "LINC01505")
+```
+
+In general, simply translating to the list gene symbols should work as expected.
+
+### BAC removal
+
+In Ensembl version 104, the version we are using for ScPCA, BAC-based gene IDs were removed and replaced with simply the Ensembl gene ID.
+Many of the gene symbols that are present in the 10x references but missing in the ScPCA data are these BAC-based gene IDs.
+We will probably want to translate these to the 10x symbol when requested.
+
+```{r}
+scpca_gene_reference |>
+  filter(
+    !is.na(gene_symbol_10x2020),
+    !gene_symbol_10x2020 %in% genes_scpca$gene_symbol_scpca
+  )
+```
+
+### Disagreements between symbols
+
+One other question is how often the `unique` gene symbols disagree for the same Ensemble ID.
+Here we will exclude the cases where the gene symbol is an Ensembl ID in the ScPCA data, as these are expected to be different.
+
+
+```{r}
+scpca_gene_reference |>
+  filter(
+    gene_symbol_scpca != gene_ids,
+    gene_symbol_scpca_unique != gene_symbol_10x2020_unique
+  )
+```
+
+More than I expected, but it looks like most of these are cases where the gene symbol has been updated.
+What we are really interested in is the cases where the process of making the gene symbol unique had different effects:
-What we are really interested in is the cases where the process of making the gene symbol unique had different effects:
+What we are really interested in are the cases where the process of making the gene symbol unique had different effects:
-What we are really interested in is the cases where the process of making the gene symbol unique had different effects:
+What we are really interested in are the cases where the process of making the gene symbol unique had different effects:
+
+```{r}
+scpca_gene_reference |>
+  filter(
+    gene_symbol_scpca == gene_symbol_10x2020,
+    gene_symbol_scpca_unique != gene_symbol_10x2020_unique
+  )
+```
+
+This definitely happens more than I would have hoped. Which is to say that the best way to handle this is probably simply to translate using the the table directly when performing comparisons to existing data for the most accurate results.
+
+We will need to have clear instructions about when to use which kind of translation.
+
+## Add the table to package data
+
+```{r}
+usethis::use_data(scpca_gene_reference, internal = TRUE, version = 3, overwrite = TRUE)
+```