diff --git a/.Rbuildignore b/.Rbuildignore
index 2d51c4a..b238b50 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -5,3 +5,4 @@
^\.github$
^\.lintr$
^\.pre-commit-config.yaml$
+^data-raw$
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 32537c7..e3079b8 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -16,9 +16,10 @@ repos:
exclude: '\.Rd'
- repo: https://github.com/crate-ci/typos
- rev: typos-dict-v0.11.34
+ rev: v1.28.1
hooks:
- id: typos
+ exclude: '\.nb\.html'
- repo: https://github.com/gitleaks/gitleaks
rev: v8.21.2
@@ -39,4 +40,4 @@ repos:
- id: no-browser-statement
- id: no-debug-statement
- id: deps-in-desc
- exclude: 'docker/.*|renv/.*'
+ exclude: 'docker/.*|renv/.*|data-raw/.*'
diff --git a/DESCRIPTION b/DESCRIPTION
index b43bc86..b2908ff 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -41,3 +41,5 @@ biocViews:
Transcriptomics,
SingleCell,
Clustering
+Depends:
+ R (>= 4.0)
diff --git a/R/convert-gene-ids.R b/R/convert-gene-ids.R
index 28e795e..5645a9c 100644
--- a/R/convert-gene-ids.R
+++ b/R/convert-gene-ids.R
@@ -1,24 +1,30 @@
-#' Convert Ensembl gene ids to gene symbols based on an ScPCA SingleCellExperiment object
+#' Convert Ensembl gene ids to gene symbols based on reference gene lists
#'
#' The SingleCellExperiment objects produced as part of ScPCA are indexed by
#' Ensembl gene ids, as those are more stable than gene symbols. However,
-#' for many applications gene symbols are useful. This function provides a
-#' simple and consistent conversion of Ensembl gene ids to gene symbols based on
-#' the `gene_symbol` column that is present in the row data of ScPCA
-#' SingleCellExperiment objects.
+#' for many applications gene symbols are useful. This function provides
+#' simple conversion of Ensembl gene ids to gene symbols based on either the
+#' ScPCA reference gene list or a 10x reference gene list as used by Cell Ranger.
+#' Alternatively, a SingleCellExperiment object with gene ids and gene symbols
+#' stored in the row data (as those provided by ScPCA) can be used as the reference.
#'
-#' For this function, the SingleCellExperiment object must contain a `gene_ids`
-#' column containing Ensembl gene ids and a `gene_symbol` column containing gene
-#' symbols. If any gene ids are not found or if the gene symbol is not defined,
-#' the input gene id is returned, unless the `leave_na` is set to `TRUE`.
+#' The gene symbols can either be made unique (as would be done if read in by Seurat)
+#' or left as is.
#'
#'
#' @param ensembl_ids A character vector of Ensembl gene ids to translate to
#' gene symbols.
-#' @param sce A SingleCellExperiment object containing gene ids and gene symbols
-#' to use for translation.
-#' @param leave_na logical indicating whether to leave NA values in the output.
-#' Default is `FALSE`
+#' @param reference The reference gene list to use for translation. One of `scpca`,
+#' `10x2020`, `10x2024`. The `scpca` reference is the default.
+#' @param sce A SingleCellExperiment object to use as a reference for gene symbols.
+#' If provided, the `reference` argument will be ignored. The `sce` object must
+#' include columns with the names `gene_ids` (containing Ensembl ids) and
+#' `gene_symbol` (containing the symbols) to use for conversion.
+#' @param unique Whether to use unique gene symbols, as would be done if
+#' data had been read in with gene symbols by Seurat. Default is FALSE.
+#' @param leave_na Whether to leave NA values in the output vector.
+#' If FALSE, any missing values will be replaced with the input ensembl_id value.
+#' Default is FALSE.
#'
#' @return A vector of gene symbols corresponding to the input Ensembl ids.
#' @export
@@ -26,29 +32,60 @@
#' @import SingleCellExperiment
#'
#' @examples
-#' \dontrun{
#' # convert a set of Ensembl ids to gene symbols
-#' # using a SingleCellExperiment reference
#' ensembl_ids <- c("ENSG00000141510", "ENSG00000134323")
-#' gene_symbols <- ensembl_to_symbol(ensembl_ids, sce)
+#' gene_symbols <- ensembl_to_symbol(ensembl_ids)
#' gene_symbols
#' ### [1] "TP53" "MYCN"
+#'
+#' # convert a set of Ensembl ids to gene symbols using the 10x2020 reference
+#' gene_symbols_10x2020 <- ensembl_to_symbol(ensembl_ids, reference = "10x2020")
+#'
+#' \dontrun{
+#' # convert a set of Ensembl ids to gene symbols using an SCE for reference
+#' gene_symbols_sce <- ensembl_to_symbol(ensembl_ids, sce = sce)
#' }
-ensembl_to_symbol <- function(ensembl_ids, sce, leave_na = FALSE) {
+#'
+ensembl_to_symbol <- function(
+ ensembl_ids,
+ reference = c("scpca", "10x2020", "10x2024"),
+ sce = NULL,
+ unique = FALSE,
+ leave_na = FALSE) {
+ reference <- match.arg(reference)
stopifnot(
- "`sce` must be a SingleCellExperiment object ." = is(sce, "SingleCellExperiment"),
"`ensembl_ids` must be a character vector." = is.character(ensembl_ids),
+ "`sce` must be a SingleCellExperiment object or NULL." = is.null(sce) || inherits(sce, "SingleCellExperiment"),
"`sce` must contain both a `gene_ids` and `gene_symbol` column in the row data." =
- all(c("gene_ids", "gene_symbol") %in% names(rowData(sce))),
- "`leave_na` must be TRUE or FALSE." = is.logical(leave_na)
+ is.null(sce) || all(c("gene_ids", "gene_symbol") %in% names(rowData(sce))),
+ "`unique` must be TRUE or FALSE." = is.logical(unique) && !is.na(unique),
+ "`leave_na` must be TRUE or FALSE." = is.logical(leave_na) && !is.na(leave_na)
)
- id_match <- match(ensembl_ids, rowData(sce)$gene_ids)
- gene_symbols <- rowData(sce)[id_match, "gene_symbol"]
+ if (is.null(sce)) {
+ # build the symbol column name
+ symbol_column <- paste0("gene_symbol_", reference, ifelse(unique, "_unique", ""))
+ # get the gene symbols
+ id_match <- match(ensembl_ids, rOpenScPCA::scpca_gene_reference$gene_ids)
+ gene_symbols <- rOpenScPCA::scpca_gene_reference[id_match, symbol_column]
+ } else {
+ message("Using the provided SingleCellExperiment object for gene symbol conversion.")
+ all_symbols <- rowData(sce)$gene_symbol
+ if (unique) {
+ all_symbols[!is.na(all_symbols)] <- make.unique(all_symbols[!is.na(all_symbols)])
+ }
+ gene_symbols <- all_symbols[match(ensembl_ids, rowData(sce)$gene_ids)]
+ }
missing_symbols <- is.na(gene_symbols)
+ if (all(missing_symbols)) {
+ warning(
+ "None of the input ids have corresponding gene symbols.",
+ " You may want to check the reference and input ids."
+ )
+ }
if (!leave_na && any(missing_symbols)) {
- warning("Not all `ensembl_ids` values have corresponding gene symbols, using input ids for missing values.")
+ warning("Not all input ids have corresponding gene symbols, using input ids for missing values.")
gene_symbols[missing_symbols] <- ensembl_ids[missing_symbols]
}
@@ -62,17 +99,28 @@ ensembl_to_symbol <- function(ensembl_ids, sce, leave_na = FALSE) {
#' for many applications gene symbols are useful. This function converts the
#' row names (indexes) of a SingleCellExperiment object to gene symbols based on the
#' `gene_symbol` column that is present in the row data of ScPCA SingleCellExperiment objects.
+#' It is also possible to use an alternative reference, such as the default ScPCA
+#' reference gene sets or the reference gene sets provided by 10x Genomics for
+#' use with Cell Ranger. Values for the 10x-provided 2020 and 2024 references
+#' are available.
+#'
+#' By default, duplicate gene symbols are left as is, but can be made unique
+#' (as would be done by Seurat) by setting the `unique` argument to TRUE.
#'
#' Internal data structures such as the list of highly variable genes and the
#' rotation matrix for the PCA are also updated to use gene symbols, if present
#' (and not disabled by the `convert_hvg` and `convert_pca` arguments).
#'
-#' Note that using this function will result in non-unique row ids as no
-#' de-duplication is currently performed.
#'
#' @param sce A SingleCellExperiment object containing gene ids and gene symbols.
+#' @param reference The reference gene list for conversion. One of `sce`, `scpca`,
+#' `10x2020`, or `10x2024`. If `sce` (the default) the internal row data is used.
+#' @param unique Whether to use unique gene symbols, as would be done if
+#' data had been read in with gene symbols by Seurat. Default is FALSE.
#' @param convert_hvg Logical indicating whether to convert highly variable genes to gene symbols.
+#' Default is TRUE.
#' @param convert_pca Logical indicating whether to convert PCA rotation matrix to gene symbols.
+#' Default is TRUE.
#'
#' @return A SingleCellExperiment object with row names set as gene symbols.
#' @export
@@ -84,22 +132,45 @@ ensembl_to_symbol <- function(ensembl_ids, sce, leave_na = FALSE) {
#' \dontrun{
#' # convert a SingleCellExperiment object to use gene symbols
#' symbol_sce <- sce_to_symbols(sce)
+#'
+#' # convert a SingleCellExperiment object, making the gene symbols unique
+#' symbol_sce <- sce_to_symbols(sce, unique = TRUE)
+#'
+#' # convert a SingleCellExperiment object to use gene symbols with the 10x2020 reference
+#' symbol_sce <- sce_to_symbols(sce, reference = "10x2020")
#' }
-sce_to_symbols <- function(sce, convert_hvg = TRUE, convert_pca = TRUE) {
+#'
+sce_to_symbols <- function(
+ sce,
+ reference = c("sce", "scpca", "10x2020", "10x2024"),
+ unique = FALSE,
+ convert_hvg = TRUE,
+ convert_pca = TRUE) {
+ reference <- match.arg(reference)
stopifnot(
"`sce` must be a SingleCellExperiment object." = is(sce, "SingleCellExperiment"),
- "`sce` must contain both a `gene_ids` and `gene_symbol` column in the row data." =
- all(c("gene_ids", "gene_symbol") %in% names(rowData(sce)))
+ "`sce` must contain both `gene_ids` and `gene_symbol` columns in the row data if it is being used as a reference." =
+ reference != "sce" || all(c("gene_ids", "gene_symbol") %in% names(rowData(sce)))
)
- row_ids <- rowData(sce)$gene_symbol
- # set Ensembl ids as original ids for later translations
- names(row_ids) <- rowData(sce)$gene_ids
- missing_ids <- is.na(row_ids)
- if (any(missing_ids)) {
- warning("Not all rows have gene symbols, using Ensembl ids for missing values.")
- row_ids[missing_ids] <- names(row_ids)[missing_ids]
+ # get ensembl ids, either from gene_ids column if present or from the row names as a fallback
+ if ("gene_ids" %in% names(rowData(sce))) {
+ ensembl_ids <- rowData(sce)$gene_ids
+ } else {
+ ensembl_ids <- rownames(sce)
}
+ if (!all(grepl("^ENS(...)?G\\d+$", ensembl_ids))) {
+ stop("gene_ids and/or row names are not all Ensembl ids, cannot convert to gene symbols.")
+ }
+
+ if (reference == "sce") {
+ gene_symbols <- ensembl_to_symbol(ensembl_ids, sce = sce, unique = unique)
+ } else {
+ gene_symbols <- ensembl_to_symbol(ensembl_ids, reference = reference, unique = unique)
+ }
+ row_ids <- gene_symbols
+ # set Ensembl ids as original ids for later translations
+ names(row_ids) <- ensembl_ids
rownames(sce) <- row_ids
diff --git a/R/data.R b/R/data.R
new file mode 100644
index 0000000..b1f2690
--- /dev/null
+++ b/R/data.R
@@ -0,0 +1,23 @@
+# nolint start
+
+#' Conversion table for Ensembl gene ids and gene symbols
+#'
+#'
+#' This table includes the mapping for gene ids to gene symbols from different
+#' reference genome gene annotation lists.
+#' Included are the original gene symbols and the modified gene symbols that
+#' are created when running the `make.unique()` function, as is done when
+#' importing data using Seurat.
+#'
+#' @format
+#' A data frame with 7 columns:
+#' \describe{
+#' \item{gene_ids}{Ensembl gene ids}
+#' \item{gene_symbol_scpca}{The gene symbol used in the ScPCA reference}
+#' \item{gene_symbol_scpca_unique}{The gene symbol from the ScPCA reference, after `make.unique()`}
+#' \item{gene_symbol_10x2020}{The gene symbol used in the 2020 10x human genome reference}
+#' \item{gene_symbol_10x2020_unique}{The gene symbol from the 2020 10x human genome reference, after `make.unique()`}
+#' \item{gene_symbol_10x2024}{The gene symbol used in the 2024 10x human genome reference}
+#' \item{gene_symbol_10x2024_unique}{The gene symbol from the 2024 10x human genome reference, after `make.unique()`}
+#' }
+"scpca_gene_reference"
diff --git a/data-raw/README.md b/data-raw/README.md
new file mode 100644
index 0000000..608c806
--- /dev/null
+++ b/data-raw/README.md
@@ -0,0 +1,11 @@
+This directory contains scripts to download and preprocess data used within the rOpenScPCA package.
+There are also notebooks that explore some of the prepared datasets.
+
+## Building Gene References
+
+- The `build_gene_reference.R` script creates a table of Ensembl id to gene symbol references.
+The initial table of gene ids and gene symbols is extracted from an example ScPCA-formatted SCE object (`rOpenScPCA/tests/testthat/data/scpca_sce.rds`).
+This is combined with the reference information extracted from example 10x Genomics datasets.
+The full table is saved in `data/scpca_gene_reference.rda` (overwriting any previous file).
+
+- The `explore_gene_reference.Rmd` notebook explores the resulting gene references table a bit to see where some of the conversions differ among different references.
diff --git a/data-raw/build_gene_reference.R b/data-raw/build_gene_reference.R
new file mode 100644
index 0000000..9860b7f
--- /dev/null
+++ b/data-raw/build_gene_reference.R
@@ -0,0 +1,76 @@
+#!/usr/bin/env Rscript
+
+# This script downloads and stores reference gene lists from 10x Genomics
+# datasets, creating a final table of Ensembl ids and corresponding symbols
+# including the symbols that would be created on read by Seurat by application
+# of the make.unique() function.
+suppressPackageStartupMessages({
+  library(SingleCellExperiment)
+  library(dplyr)
+})
+
+## Download the reference data -------------------------------------------------
+
+
+# Read in the test data ScPCA SCE object and extract the row data:
+genes_scpca <- readRDS(here::here("tests", "testthat", "data", "scpca_sce.rds")) |>
+  rowData() |>
+  as.data.frame() |>
+  # Use Ensembl ID if gene symbol is missing, then make unique
+  mutate(
+    gene_symbol_scpca = ifelse(is.na(gene_symbol), gene_ids, gene_symbol),
+    gene_symbol_scpca_unique = make.unique(gene_symbol_scpca)
+  ) |>
+  select(gene_ids, gene_symbol_scpca, gene_symbol_scpca_unique)
+
+
+# Download and read in a 2020 10x reference dataset and extract the gene symbols.
+# Note that the 2020 Cell Ranger reference does not use Ensembl gene IDs for
+# missing symbols, but the 2024 reference does.
+url_10x2020 <- "https://cf.10xgenomics.com/samples/cell-exp/7.0.1/SC3pv3_GEX_Human_PBMC/SC3pv3_GEX_Human_PBMC_filtered_feature_bc_matrix.h5" # nolint
+temp_10x2020 <- tempfile(fileext = ".h5")
+download.file(url_10x2020, temp_10x2020, mode = "wb")
+# (temp file is removed explicitly once read; on.exit() needs a function scope)
+
+genes_10x2020 <- DropletUtils::read10xCounts(temp_10x2020) |>
+  rowData() |>
+  as.data.frame() |>
+  filter(Type == "Gene Expression") |>
+  rename(
+    gene_ids = ID,
+    gene_symbol_10x2020 = Symbol
+  ) |>
+  # add unique column
+  mutate(gene_symbol_10x2020_unique = make.unique(gene_symbol_10x2020)) |>
+  select(gene_ids, gene_symbol_10x2020, gene_symbol_10x2020_unique)
+unlink(temp_10x2020) # row data is now an in-memory data frame; delete download
+
+# Download and read in a 2024 10x reference dataset and extract the gene symbols.
+url_10x2024 <- "https://cf.10xgenomics.com/samples/cell-exp/9.0.0/5k_Human_Donor2_PBMC_3p_gem-x_5k_Human_Donor2_PBMC_3p_gem-x/5k_Human_Donor2_PBMC_3p_gem-x_5k_Human_Donor2_PBMC_3p_gem-x_count_sample_filtered_feature_bc_matrix.h5" # nolint
+temp_10x2024 <- tempfile(fileext = ".h5")
+download.file(url_10x2024, temp_10x2024, mode = "wb")
+
+genes_10x2024 <- DropletUtils::read10xCounts(temp_10x2024) |>
+  rowData() |>
+  as.data.frame() |>
+  filter(Type == "Gene Expression") |>
+  rename(
+    gene_ids = ID,
+    gene_symbol_10x2024 = Symbol
+  ) |>
+  mutate(gene_symbol_10x2024_unique = make.unique(gene_symbol_10x2024)) |>
+  select(gene_ids, gene_symbol_10x2024, gene_symbol_10x2024_unique)
+unlink(temp_10x2024) # row data is now an in-memory data frame; delete download
+
+# Join the gene lists ----------------------------------------------------------
+scpca_gene_reference <- genes_scpca |>
+  full_join(genes_10x2020, by = "gene_ids") |>
+  full_join(genes_10x2024, by = "gene_ids")
+
+## Add the table to package data -----------------------------------------------
+usethis::use_data(
+  scpca_gene_reference,
+  version = 3,
+  overwrite = TRUE,
+  compress = "xz"
+)
diff --git a/data-raw/explore_gene_reference.Rmd b/data-raw/explore_gene_reference.Rmd
new file mode 100644
index 0000000..4d87dd9
--- /dev/null
+++ b/data-raw/explore_gene_reference.Rmd
@@ -0,0 +1,131 @@
+---
+title: "Gene ID conversion exploration"
+date: "`r Sys.Date()`"
+output: html_notebook
+---
+
+This notebook explores the table of gene conversions a bit, to get a bit of a sense of where different references might vary.
+
+The table includes the following reference data:
+
+- ScPCA reference, based on Ensembl 104
+- 10x 2020 reference, based on Ensembl 98
+- 10x 2024 reference, based on Ensembl 110
+
+We will focus on the ScPCA and 10x 2020 references for now, as few data sets include the 2024 reference at this point.
+One thing to note is that the Ensembl 98 reference used BAC-based gene names for otherwise unnamed genes, but as of Ensembl 104 (the base for the ScPCA reference), those were replaced with Ensembl ids.
+
+```{r setup}
+suppressPackageStartupMessages({
+ library(SingleCellExperiment)
+ library(dplyr)
+})
+```
+
+
+# Load the R data
+```{r}
+load(here::here("data", "scpca_gene_reference.rda"))
+```
+
+## Look at some stats for the tables
+
+We expect that many of the Ensembl IDs in the ScPCA data will not have corresponding values in the 10x references, but some of the 10x references have Ensembl IDs not present in the ScPCA data.
+Let's look at the number of missing gene symbols in the ScPCA data:
+
+```{r}
+scpca_missing <- scpca_gene_reference |>
+ filter(is.na(gene_symbol_scpca))
+```
+
+How many of the gene symbols in the 10x 2020 reference are in the ScPCA data, but perhaps with a different Ensembl ID?
+
+```{r}
+symbols_10x2020 <- scpca_missing$gene_symbol_10x2020[!is.na(scpca_missing$gene_symbol_10x2020)]
+sum(symbols_10x2020 %in% scpca_gene_reference$gene_symbol_scpca)
+```
+
+What are they?
+
+```{r}
+# Match the genes that are in the 10x 2020 reference to the ScPCA table on gene symbol
+# Only for those where the gene id is missing in the ScPCA table
+scpca_gene_reference |>
+ filter(
+ !is.na(gene_symbol_10x2020),
+ is.na(gene_symbol_scpca),
+ gene_symbol_10x2020 %in% scpca_gene_reference$gene_symbol_scpca
+ ) |>
+ select(gene_ids, gene_symbol_10x2020, gene_symbol_10x2020_unique) |>
+ left_join(
+ scpca_gene_reference |> select(gene_ids, gene_symbol_scpca, gene_symbol_scpca_unique),
+ by = c("gene_symbol_10x2020" = "gene_symbol_scpca"),
+ suffix = c("_10x2020", "_scpca")
+ )
+```
+
+Looks like these are all cases where the gene id has been updated.
+In only one case is this a gene where something was not unique, which is `LINC01505`.
+This gene (`ENSG00000234229`) seems to have simply been removed in later revisions, so I think we can safely not worry about it.
+
+```{r}
+scpca_gene_reference |>
+ filter(gene_symbol_10x2020 == "LINC01505")
+```
+
+In general, simply translating to the listed gene symbols should work as expected.
+
+### BAC removal
+
+In Ensembl version 104, the version we are using for ScPCA, BAC-based gene IDs were removed and replaced with simply the Ensembl gene ID.
+Many of the gene symbols that are present in the 10x references but missing in the ScPCA data are these BAC-based gene IDs.
+We will probably want to translate these to the symbol used in the 10x reference when requested.
+
+```{r}
+scpca_gene_reference |>
+ filter(
+ !is.na(gene_symbol_10x2020),
+ !gene_symbol_10x2020 %in% scpca_gene_reference$gene_symbol_scpca
+ )
+```
+
+### Disagreements between symbols
+
+One other question is how often the `unique` gene symbols disagree for the same Ensembl ID.
+Here we will exclude the cases where the gene symbol is an Ensembl ID in the ScPCA data, as these are expected to be different.
+
+```{r}
+scpca_gene_reference |>
+ filter(
+ gene_symbol_scpca != gene_ids,
+ gene_symbol_scpca_unique != gene_symbol_10x2020_unique
+ )
+```
+
+More than I expected, but it looks like most of these are cases where the gene symbol has been updated.
+We can verify this by looking at how often the base symbol is the same.
+
+```{r}
+scpca_gene_reference |>
+ filter(
+ gene_symbol_scpca != gene_ids,
+ gene_symbol_scpca_unique != gene_symbol_10x2020_unique,
+ ) |>
+ mutate(same_symbol = gene_symbol_scpca == gene_symbol_10x2020) |>
+ count(same_symbol)
+```
+
+What we are really interested in are the cases where the process of making the gene symbol unique had different effects:
+
+```{r}
+scpca_gene_reference |>
+ filter(
+ gene_symbol_scpca == gene_symbol_10x2020,
+ gene_symbol_scpca_unique != gene_symbol_10x2020_unique
+ )
+```
+
+This definitely happens more than I would have hoped, though my hope was probably unreasonable, as 14 cases is not bad.
+Which is to say that the best way to handle this is probably simply to translate using the table directly when performing comparisons to existing data for the most accurate results.
+
+We will need to have clear instructions about when to use which kind of translation.
diff --git a/data-raw/explore_gene_reference.nb.html b/data-raw/explore_gene_reference.nb.html
new file mode 100644
index 0000000..bba06a4
--- /dev/null
+++ b/data-raw/explore_gene_reference.nb.html
@@ -0,0 +1,2039 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Gene ID conversion exploration
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
This notebook explores the table of gene conversions a bit, to get a
+bit of a sense of where different references might vary.
+
The table includes the following reference data:
+
+- ScPCA reference, based on Ensembl 104
+- 10x 2020 reference, based on Ensembl 98
+- 10x 2024 reference, based on Ensembl 110
+
+
We will focus on the ScPCA and 10x 2020 references for now, as few
+data sets include the 2024 reference at this point. One thing to note is
+that Ensembl 98 reference used BAC-based gene names for otherwise
+unnamed genes, but as of Ensmbl 104 (the base for the ScPCA reference),
+those were replaced with Ensembl ids.
+
+
+
+
suppressPackageStartupMessages({
+ library(SingleCellExperiment)
+ library(dplyr)
+})
+
+
+
Warning: replacing previous import ‘S4Arrays::makeNindexFromArrayViewport’ by ‘DelayedArray::makeNindexFromArrayViewport’ when loading ‘SummarizedExperiment’
+
+
+
+
+
Load the R data
+
+
+
+
load(here::here("data", "scpca_gene_reference.rda"))
+
+
+
+
+
Look at some stats for the tables
+
We expect that many of the Ensembl IDs in the ScPCA data will not
+have corresponding values in the 10x references, but some of the 10x
+references have Ensembl IDs not present in the ScPCA data. Let’s look at
+the number of missing gene symbols in the ScPCA data:
+
+
+
+
scpca_missing <- scpca_gene_reference |>
+ filter(is.na(gene_symbol_scpca))
+
+
+
+
How many of the gene symbols in the 10x 2020 reference are in the
+ScPCA data, but perhaps with a different Ensembl ID?
+
+
+
+
symbols_10x2020 <- scpca_missing$gene_symbol_10x2020[!is.na(scpca_missing$gene_symbol_10x2020)]
+sum(symbols_10x2020 %in% scpca_gene_reference$gene_symbol_scpca)
+
+
+
[1] 10
+
+
+
+
What are they?
+
+
+
+
# Match the genes that are in the 10x 2020 reference to the ScPCA table on gene symbol
+# Only for those where the gene id is missing in the ScPCA table
+scpca_gene_reference |>
+ filter(
+ !is.na(gene_symbol_10x2020),
+ is.na(gene_symbol_scpca),
+ gene_symbol_10x2020 %in% scpca_gene_reference$gene_symbol_scpca
+ ) |>
+ select(gene_ids, gene_symbol_10x2020, gene_symbol_10x2020_unique) |>
+ left_join(
+ scpca_gene_reference |> select(gene_ids, gene_symbol_scpca, gene_symbol_scpca_unique),
+ by = c("gene_symbol_10x2020" = "gene_symbol_scpca"),
+ suffix = c("_10x2020", "_scpca")
+ )
+
+
+
+
+
+
+
+
+
Looks like these are all cases where the gene id has been updated. In
+only one case is this a gene where something was not unique, which is
+LINC01505
. This gene (ENSG00000234229
) seems
+to have simply been removed in later revisions, so I think we can safely
+not worry about it.
+
+
+
+
scpca_gene_reference |>
+ filter(gene_symbol_10x2020 == "LINC01505")
+
+
+
+
+
+
+
+
+
In general, simply translating to the list gene symbols should work
+as expected.
+
+
BAC removal
+
In Ensembl version 104, the version we are using for ScPCA, BAC-based
+gene IDs were removed and replaced with simply the Ensembl gene ID. Many
+of the gene symbols that are present in the 10x references but missing
+in the ScPCA data are these BAC-based gene IDs. We will probably want to
+translate these to the symbol used in the 10x reference when
+requested.
+
+
+
+
scpca_gene_reference |>
+ filter(
+ !is.na(gene_symbol_10x2020),
+ !gene_symbol_10x2020 %in% scpca_gene_reference$gene_symbol_scpca
+ )
+
+
+
+
+
+
+
+
+
+
+
Disagreements between symbols
+
One other question is how often the unique
gene symbols
+disagree for the same Ensembl ID. Here we will exclude the cases where
+the gene symbol is an Ensembl ID in the ScPCA data, as these are
+expected to be different.
+
+
+
+
scpca_gene_reference |>
+ filter(
+ gene_symbol_scpca != gene_ids,
+ gene_symbol_scpca_unique != gene_symbol_10x2020_unique
+ )
+
+
+
+
+
+
+
+
+
More than I expected, but it looks like most of these are cases where
+the gene symbol has been updated. We can verify this by looking at how
+often the base symbol is the same.
+
+
+
+
scpca_gene_reference |>
+ filter(
+ gene_symbol_scpca != gene_ids,
+ gene_symbol_scpca_unique != gene_symbol_10x2020_unique,
+ ) |>
+ mutate(same_symbol = gene_symbol_scpca == gene_symbol_10x2020) |>
+ count(same_symbol)
+
+
+
+
+
+
+
+
+
What we are really interested in are the cases where the process of
+making the gene symbol unique had different effects:
+
+
+
+
scpca_gene_reference |>
+ filter(
+ gene_symbol_scpca == gene_symbol_10x2020,
+ gene_symbol_scpca_unique != gene_symbol_10x2020_unique
+ )
+
+
+
+
+
+
+
+
+
This definitely happens more than I would have hoped, though my hope
+was probably unreasonable, as 14 cases is not bad. Which is to say that
+the best way to handle this is probably simply to translate using the
+the table directly when performing comparisons to existing data for the
+most accurate results.
+
We will need to have clear instructions about when to use which kind
+of translation.
+
+
+
+
+
+
LS0tCnRpdGxlOiAiR2VuZSBJRCBjb252ZXJzaW9uIGV4cGxvcmF0aW9uIgpkYXRlOiAiYHIgU3lzLkRhdGUoKWAiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KClRoaXMgbm90ZWJvb2sgZXhwbG9yZXMgdGhlIHRhYmxlIG9mIGdlbmUgY29udmVyc2lvbnMgYSBiaXQsIHRvIGdldCBhIGJpdCBvZiBhIHNlbnNlIG9mIHdoZXJlIGRpZmZlcmVudCByZWZlcmVuY2VzIG1pZ2h0IHZhcnkuCgpUaGUgdGFibGUgaW5jbHVkZXMgdGhlIGZvbGxvd2luZyByZWZlcmVuY2UgZGF0YToKCi0gU2NQQ0EgcmVmZXJlbmNlLCBiYXNlZCBvbiBFbnNlbWJsIDEwNAotIDEweCAyMDIwIHJlZmVyZW5jZSwgYmFzZWQgb24gRW5zZW1ibCA5OAotIDEweCAyMDI0IHJlZmVyZW5jZSwgYmFzZWQgb24gRW5zZW1ibCAxMTAKCldlIHdpbGwgZm9jdXMgb24gdGhlIFNjUENBIGFuZCAxMHggMjAyMCByZWZlcmVuY2VzIGZvciBub3csIGFzIGZldyBkYXRhIHNldHMgaW5jbHVkZSB0aGUgMjAyNCByZWZlcmVuY2UgYXQgdGhpcyBwb2ludC4KT25lIHRoaW5nIHRvIG5vdGUgaXMgdGhhdCBFbnNlbWJsIDk4IHJlZmVyZW5jZSB1c2VkIEJBQy1iYXNlZCBnZW5lIG5hbWVzIGZvciBvdGhlcndpc2UgdW5uYW1lZCBnZW5lcywgYnV0IGFzIG9mIEVuc21ibCAxMDQgKHRoZSBiYXNlIGZvciB0aGUgU2NQQ0EgcmVmZXJlbmNlKSwgdGhvc2Ugd2VyZSByZXBsYWNlZCB3aXRoIEVuc2VtYmwgaWRzLgoKYGBge3Igc2V0dXB9CnN1cHByZXNzUGFja2FnZVN0YXJ0dXBNZXNzYWdlcyh7CiAgbGlicmFyeShTaW5nbGVDZWxsRXhwZXJpbWVudCkKICBsaWJyYXJ5KGRwbHlyKQp9KQpgYGAKCgojIExvYWQgdGhlIFIgZGF0YQpgYGB7cn0KbG9hZChoZXJlOjpoZXJlKCJkYXRhIiwgInNjcGNhX2dlbmVfcmVmZXJlbmNlLnJkYSIpKQpgYGAKCiMjIExvb2sgYXQgc29tZSBzdGF0cyBmb3IgdGhlIHRhYmxlcwoKV2UgZXhwZWN0IHRoYXQgbWFueSBvZiB0aGUgRW5zZW1ibCBJRHMgaW4gdGhlIFNjUENBIGRhdGEgd2lsbCBub3QgaGF2ZSBjb3JyZXNwb25kaW5nIHZhbHVlcyBpbiB0aGUgMTB4IHJlZmVyZW5jZXMsIGJ1dCBzb21lIG9mIHRoZSAxMHggcmVmZXJlbmNlcyBoYXZlIEVuc2VtYmwgSURzIG5vdCBwcmVzZW50IGluIHRoZSBTY1BDQSBkYXRhLgpMZXQncyBsb29rIGF0IHRoZSBudW1iZXIgb2YgbWlzc2luZyBnZW5lIHN5bWJvbHMgaW4gdGhlIFNjUENBIGRhdGE6CgpgYGB7cn0Kc2NwY2FfbWlzc2luZyA8LSBzY3BjYV9nZW5lX3JlZmVyZW5jZSB8PgogIGZpbHRlcihpcy5uYShnZW5lX3N5bWJvbF9zY3BjYSkpCmBgYAoKSG93IG1hbnkgb2YgdGhlIGdlbmUgc3ltYm9scyBpbiB0aGUgMTB4IDIwMjAgcmVmZXJlbmNlIGFyZSBpbiB0aGUgU2NQQ0EgZGF0YSwgYnV0IHBlcmhhcHMgd2l0aCBhIGRpZmZlcmVudCBFbnNlbWJsIElEPwoKYGBge3J9CnN5bWJvbHNfMTB4MjAyMCA8LSBzY3BjYV9taXNzaW5nJGdlbmVfc3ltYm9sXzEweDIwMjBbIWlzLm5hKHNjcGNhX21pc3NpbmckZ2VuZV9zeW1i
b2xfMTB4MjAyMCldCnN1bShzeW1ib2xzXzEweDIwMjAgJWluJSBzY3BjYV9nZW5lX3JlZmVyZW5jZSRnZW5lX3N5bWJvbF9zY3BjYSkKYGBgCgpXaGF0IGFyZSB0aGV5PwoKYGBge3J9CiMgTWF0Y2ggdGhlIGdlbmVzIHRoYXQgYXJlIGluIHRoZSAxMHggMjAyMCByZWZlcmVuY2UgdG8gdGhlIFNjUENBIHRhYmxlIG9uIGdlbmUgc3ltYm9sCiMgT25seSBmb3IgdGhvc2Ugd2hlcmUgdGhlIGdlbmUgaWQgaXMgbWlzc2luZyBpbiB0aGUgU2NQQ0EgdGFibGUKc2NwY2FfZ2VuZV9yZWZlcmVuY2UgfD4KICBmaWx0ZXIoCiAgICAhaXMubmEoZ2VuZV9zeW1ib2xfMTB4MjAyMCksCiAgICBpcy5uYShnZW5lX3N5bWJvbF9zY3BjYSksCiAgICBnZW5lX3N5bWJvbF8xMHgyMDIwICVpbiUgc2NwY2FfZ2VuZV9yZWZlcmVuY2UkZ2VuZV9zeW1ib2xfc2NwY2EKICApIHw+CiAgc2VsZWN0KGdlbmVfaWRzLCBnZW5lX3N5bWJvbF8xMHgyMDIwLCBnZW5lX3N5bWJvbF8xMHgyMDIwX3VuaXF1ZSkgfD4KICBsZWZ0X2pvaW4oCiAgICBzY3BjYV9nZW5lX3JlZmVyZW5jZSB8PiBzZWxlY3QoZ2VuZV9pZHMsIGdlbmVfc3ltYm9sX3NjcGNhLCBnZW5lX3N5bWJvbF9zY3BjYV91bmlxdWUpLAogICAgYnkgPSBjKCJnZW5lX3N5bWJvbF8xMHgyMDIwIiA9ICJnZW5lX3N5bWJvbF9zY3BjYSIpLAogICAgc3VmZml4ID0gYygiXzEweDIwMjAiLCAiX3NjcGNhIikKICApCmBgYAoKTG9va3MgbGlrZSB0aGVzZSBhcmUgYWxsIGNhc2VzIHdoZXJlIHRoZSBnZW5lIGlkIGhhcyBiZWVuIHVwZGF0ZWQuCkluIG9ubHkgb25lIGNhc2UgaXMgdGhpcyBhIGdlbmUgd2hlcmUgc29tZXRoaW5nIHdhcyBub3QgdW5pcXVlLCB3aGljaCBpcyBgTElOQzAxNTA1YC4KVGhpcyBnZW5lIChgRU5TRzAwMDAwMjM0MjI5YCkgc2VlbXMgdG8gaGF2ZSBzaW1wbHkgYmVlbiByZW1vdmVkIGluIGxhdGVyIHJldmlzaW9ucywgc28gSSB0aGluayB3ZSBjYW4gc2FmZWx5IG5vdCB3b3JyeSBhYm91dCBpdC4KCmBgYHtyfQpzY3BjYV9nZW5lX3JlZmVyZW5jZSB8PgogIGZpbHRlcihnZW5lX3N5bWJvbF8xMHgyMDIwID09ICJMSU5DMDE1MDUiKQpgYGAKCkluIGdlbmVyYWwsIHNpbXBseSB0cmFuc2xhdGluZyB0byB0aGUgbGlzdCBnZW5lIHN5bWJvbHMgc2hvdWxkIHdvcmsgYXMgZXhwZWN0ZWQuCgojIyMgQkFDIHJlbW92YWwKCkluIEVuc2VtYmwgdmVyc2lvbiAxMDQsIHRoZSB2ZXJzaW9uIHdlIGFyZSB1c2luZyBmb3IgU2NQQ0EsIEJBQy1iYXNlZCBnZW5lIElEcyB3ZXJlIHJlbW92ZWQgYW5kIHJlcGxhY2VkIHdpdGggc2ltcGx5IHRoZSBFbnNlbWJsIGdlbmUgSUQuCk1hbnkgb2YgdGhlIGdlbmUgc3ltYm9scyB0aGF0IGFyZSBwcmVzZW50IGluIHRoZSAxMHggcmVmZXJlbmNlcyBidXQgbWlzc2luZyBpbiB0aGUgU2NQQ0EgZGF0YSBhcmUgdGhlc2UgQkFDLWJhc2VkIGdlbmUgSURzLgpXZSB3aWxsIHByb2JhYmx5IHdhbnQgdG8gdHJhbnNsYXRlIHRoZXNlIHRvIHRoZSBzeW1ib2wgdXNlZCBpbiB0aGUg
MTB4IHJlZmVyZW5jZSB3aGVuIHJlcXVlc3RlZC4KCmBgYHtyfQpzY3BjYV9nZW5lX3JlZmVyZW5jZSB8PgogIGZpbHRlcigKICAgICFpcy5uYShnZW5lX3N5bWJvbF8xMHgyMDIwKSwKICAgICFnZW5lX3N5bWJvbF8xMHgyMDIwICVpbiUgc2NwY2FfZ2VuZV9yZWZlcmVuY2UkZ2VuZV9zeW1ib2xfc2NwY2EKICApCmBgYAoKIyMjIERpc2FncmVlbWVudHMgYmV0d2VlbiBzeW1ib2xzCgpPbmUgb3RoZXIgcXVlc3Rpb24gaXMgaG93IG9mdGVuIHRoZSBgdW5pcXVlYCBnZW5lIHN5bWJvbHMgZGlzYWdyZWUgZm9yIHRoZSBzYW1lIEVuc2VtYmwgSUQuCkhlcmUgd2Ugd2lsbCBleGNsdWRlIHRoZSBjYXNlcyB3aGVyZSB0aGUgZ2VuZSBzeW1ib2wgaXMgYW4gRW5zZW1ibCBJRCBpbiB0aGUgU2NQQ0EgZGF0YSwgYXMgdGhlc2UgYXJlIGV4cGVjdGVkIHRvIGJlIGRpZmZlcmVudC4KCmBgYHtyfQpzY3BjYV9nZW5lX3JlZmVyZW5jZSB8PgogIGZpbHRlcigKICAgIGdlbmVfc3ltYm9sX3NjcGNhICE9IGdlbmVfaWRzLAogICAgZ2VuZV9zeW1ib2xfc2NwY2FfdW5pcXVlICE9IGdlbmVfc3ltYm9sXzEweDIwMjBfdW5pcXVlCiAgKQpgYGAKCk1vcmUgdGhhbiBJIGV4cGVjdGVkLCBidXQgaXQgbG9va3MgbGlrZSBtb3N0IG9mIHRoZXNlIGFyZSBjYXNlcyB3aGVyZSB0aGUgZ2VuZSBzeW1ib2wgaGFzIGJlZW4gdXBkYXRlZC4KV2UgY2FuIHZlcmlmeSB0aGlzIGJ5IGxvb2tpbmcgYXQgaG93IG9mdGVuIHRoZSBiYXNlIHN5bWJvbCBpcyB0aGUgc2FtZS4KCmBgYHtyfQpzY3BjYV9nZW5lX3JlZmVyZW5jZSB8PgogIGZpbHRlcigKICAgIGdlbmVfc3ltYm9sX3NjcGNhICE9IGdlbmVfaWRzLAogICAgZ2VuZV9zeW1ib2xfc2NwY2FfdW5pcXVlICE9IGdlbmVfc3ltYm9sXzEweDIwMjBfdW5pcXVlLAogICkgfD4KICBtdXRhdGUoc2FtZV9zeW1ib2wgPSBnZW5lX3N5bWJvbF9zY3BjYSA9PSBnZW5lX3N5bWJvbF8xMHgyMDIwKSB8PgogIGNvdW50KHNhbWVfc3ltYm9sKQpgYGAKCldoYXQgd2UgYXJlIHJlYWxseSBpbnRlcmVzdGVkIGluIGFyZSB0aGUgY2FzZXMgd2hlcmUgdGhlIHByb2Nlc3Mgb2YgbWFraW5nIHRoZSBnZW5lIHN5bWJvbCB1bmlxdWUgaGFkIGRpZmZlcmVudCBlZmZlY3RzOgoKYGBge3J9CnNjcGNhX2dlbmVfcmVmZXJlbmNlIHw+CiAgZmlsdGVyKAogICAgZ2VuZV9zeW1ib2xfc2NwY2EgPT0gZ2VuZV9zeW1ib2xfMTB4MjAyMCwKICAgIGdlbmVfc3ltYm9sX3NjcGNhX3VuaXF1ZSAhPSBnZW5lX3N5bWJvbF8xMHgyMDIwX3VuaXF1ZQogICkKYGBgCgpUaGlzIGRlZmluaXRlbHkgaGFwcGVucyBtb3JlIHRoYW4gSSB3b3VsZCBoYXZlIGhvcGVkLCB0aG91Z2ggbXkgaG9wZSB3YXMgcHJvYmFibHkgdW5yZWFzb25hYmxlLCBhcyAxNCBjYXNlcyBpcyBub3QgYmFkLgpXaGljaCBpcyB0byBzYXkgdGhhdCB0aGUgYmVzdCB3YXkgdG8gaGFuZGxlIHRoaXMgaXMgcHJvYmFibHkgc2ltcGx5IHRvIHRyYW5zbGF0ZSB1c2luZyB0aGUgdGhlIHRhYmxlIGRpcmVj
dGx5IHdoZW4gcGVyZm9ybWluZyBjb21wYXJpc29ucyB0byBleGlzdGluZyBkYXRhIGZvciB0aGUgbW9zdCBhY2N1cmF0ZSByZXN1bHRzLgoKV2Ugd2lsbCBuZWVkIHRvIGhhdmUgY2xlYXIgaW5zdHJ1Y3Rpb25zIGFib3V0IHdoZW4gdG8gdXNlIHdoaWNoIGtpbmQgb2YgdHJhbnNsYXRpb24uCg==
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/data/scpca_gene_reference.rda b/data/scpca_gene_reference.rda
new file mode 100644
index 0000000..12e1eeb
Binary files /dev/null and b/data/scpca_gene_reference.rda differ
diff --git a/man/ensembl_to_symbol.Rd b/man/ensembl_to_symbol.Rd
index 33c74a8..bb380cc 100644
--- a/man/ensembl_to_symbol.Rd
+++ b/man/ensembl_to_symbol.Rd
@@ -2,19 +2,34 @@
% Please edit documentation in R/convert-gene-ids.R
\name{ensembl_to_symbol}
\alias{ensembl_to_symbol}
-\title{Convert Ensembl gene ids to gene symbols based on an ScPCA SingleCellExperiment object}
+\title{Convert Ensembl gene ids to gene symbols based on reference gene lists}
\usage{
-ensembl_to_symbol(ensembl_ids, sce, leave_na = FALSE)
+ensembl_to_symbol(
+ ensembl_ids,
+ reference = c("scpca", "10x2020", "10x2024"),
+ sce = NULL,
+ unique = FALSE,
+ leave_na = FALSE
+)
}
\arguments{
\item{ensembl_ids}{A character vector of Ensembl gene ids to translate to
gene symbols.}
-\item{sce}{A SingleCellExperiment object containing gene ids and gene symbols
-to use for translation.}
+\item{reference}{The reference gene list to use for translation. One of `scpca`,
+`10x2020`, `10x2024`. The `scpca` reference is the default.}
-\item{leave_na}{logical indicating whether to leave NA values in the output.
-Default is `FALSE`}
+\item{sce}{A SingleCellExperiment object to use as a reference for gene symbols.
+If provided, the `reference` argument will be ignored. The `sce` object must
+include columns with the names `gene_ids` (containing Ensembl ids) and
+`gene_symbol` (containing the symbols) to use for conversion.}
+
+\item{unique}{Whether to use unique gene symbols, as would be done if
+data had been read in with gene symbols by Seurat. Default is FALSE.}
+
+\item{leave_na}{Whether to leave NA values in the output vector.
+If FALSE, any missing values will be replaced with the input ensembl_id value.
+Default is FALSE.}
}
\value{
A vector of gene symbols corresponding to the input Ensembl ids.
@@ -22,24 +37,29 @@ A vector of gene symbols corresponding to the input Ensembl ids.
\description{
The SingleCellExperiment objects produced as part of ScPCA are indexed by
Ensembl gene ids, as those are more stable than gene symbols. However,
-for many applications gene symbols are useful. This function provides a
-simple and consistent conversion of Ensembl gene ids to gene symbols based on
-the `gene_symbol` column that is present in the row data of ScPCA
-SingleCellExperiment objects.
+for many applications gene symbols are useful. This function provides
+simple conversion of Ensembl gene ids to gene symbols based on either the
+ScPCA reference gene list or a 10x reference gene list as used by Cell Ranger.
+Alternatively, a SingleCellExperiment object with gene ids and gene symbols
+stored in the row data (as those provided by ScPCA) can be used as the reference.
}
\details{
-For this function, the SingleCellExperiment object must contain a `gene_ids`
-column containing Ensembl gene ids and a `gene_symbol` column containing gene
-symbols. If any gene ids are not found or if the gene symbol is not defined,
-the input gene id is returned, unless the `leave_na` is set to `TRUE`.
+The gene symbols can either be made unique (as would be done if read in by Seurat)
+or left as is.
}
\examples{
-\dontrun{
# convert a set of Ensembl ids to gene symbols
-# using a SingleCellExperiment reference
ensembl_ids <- c("ENSG00000141510", "ENSG00000134323")
-gene_symbols <- ensembl_to_symbol(ensembl_ids, sce)
+gene_symbols <- ensembl_to_symbol(ensembl_ids)
gene_symbols
### [1] "TP53" "MYCN"
+
+# convert a set of Ensembl ids to gene symbols using the 10x2020 reference
+gene_symbols_10x2020 <- ensembl_to_symbol(ensembl_ids, reference = "10x2020")
+
+\dontrun{
+# convert a set of Ensembl ids to gene symbols using an SCE for reference
+gene_symbols_sce <- ensembl_to_symbol(ensembl_ids, sce = sce)
}
+
}
diff --git a/man/sce_to_symbols.Rd b/man/sce_to_symbols.Rd
index 17b729c..5af59e8 100644
--- a/man/sce_to_symbols.Rd
+++ b/man/sce_to_symbols.Rd
@@ -4,14 +4,28 @@
\alias{sce_to_symbols}
\title{Set the row names of an ScPCA SingleCellExperiment object to gene symbols}
\usage{
-sce_to_symbols(sce, convert_hvg = TRUE, convert_pca = TRUE)
+sce_to_symbols(
+ sce,
+ reference = c("sce", "scpca", "10x2020", "10x2024"),
+ unique = FALSE,
+ convert_hvg = TRUE,
+ convert_pca = TRUE
+)
}
\arguments{
\item{sce}{A SingleCellExperiment object containing gene ids and gene symbols.}
-\item{convert_hvg}{Logical indicating whether to convert highly variable genes to gene symbols.}
+\item{reference}{The reference gene list for conversion. One of `sce`, `scpca`,
+`10x2020`, or `10x2024`. If `sce` (the default) the internal row data is used.}
-\item{convert_pca}{Logical indicating whether to convert PCA rotation matrix to gene symbols.}
+\item{unique}{Whether to use unique gene symbols, as would be done if
+data had been read in with gene symbols by Seurat. Default is FALSE.}
+
+\item{convert_hvg}{Logical indicating whether to convert highly variable genes to gene symbols.
+Default is TRUE.}
+
+\item{convert_pca}{Logical indicating whether to convert PCA rotation matrix to gene symbols.
+Default is TRUE.}
}
\value{
A SingleCellExperiment object with row names set as gene symbols.
@@ -22,18 +36,29 @@ Ensembl gene ids, as those are more stable than gene symbols. However,
for many applications gene symbols are useful. This function converts the
row names (indexes) of a SingleCellExperiment object to gene symbols based on the
`gene_symbol` column that is present in the row data of ScPCA SingleCellExperiment objects.
+It is also possible to use an alternative reference, such as the default ScPCA
+reference gene sets or the reference gene sets provided by 10x Genomics for
+use with Cell Ranger. Values for the 10x-provided 2020 and 2024 references
+are available.
}
\details{
+By default, duplicate gene symbols are left as is, but can be made unique
+(as would be done by Seurat) by setting the `unique` argument to TRUE.
+
Internal data structures such as the list of highly variable genes and the
rotation matrix for the PCA are also updated to use gene symbols, if present
(and not disabled by the `convert_hvg` and `convert_pca` arguments).
-
-Note that using this function will result in non-unique row ids as no
-de-duplication is currently performed.
}
\examples{
\dontrun{
# convert a SingleCellExperiment object to use gene symbols
symbol_sce <- sce_to_symbols(sce)
+
+# convert a SingleCellExperiment object, making the gene symbols unique
+symbol_sce <- sce_to_symbols(sce, unique = TRUE)
+
+# convert a SingleCellExperiment object to use gene symbols with the 10x2020 reference
+symbol_sce <- sce_to_symbols(sce, reference = "10x2020")
}
+
}
diff --git a/man/scpca_gene_reference.Rd b/man/scpca_gene_reference.Rd
new file mode 100644
index 0000000..3aab9a5
--- /dev/null
+++ b/man/scpca_gene_reference.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{scpca_gene_reference}
+\alias{scpca_gene_reference}
+\title{Conversion table for Ensembl gene ids and gene symbols}
+\format{
+A data frame with 7 columns:
+\describe{
+ \item{gene_ids}{Ensembl gene ids}
+ \item{gene_symbol_scpca}{The gene symbol used in the ScPCA reference}
+ \item{gene_symbol_scpca_unique}{The gene symbol from the ScPCA reference, after `make.unique()`}
+ \item{gene_symbol_10x2020}{The gene symbol used in the 2020 10x human genome reference}
+ \item{gene_symbol_10x2020_unique}{The gene symbol from the 2020 10x human genome reference, after `make.unique()`}
+ \item{gene_symbol_10x2024}{The gene symbol used in the 2024 10x human genome reference}
+ \item{gene_symbol_10x2024_unique}{The gene symbol from the 2024 10x human genome reference, after `make.unique()`}
+}
+}
+\usage{
+scpca_gene_reference
+}
+\description{
+This table includes the mapping for gene ids to gene symbols from different
+reference genome gene annotation lists.
+Included are the original gene symbols and the modified gene symbols that
+are created when running the `make.unique()` function, as is done when
+importing data using Seurat.
+}
+\keyword{datasets}
diff --git a/renv.lock b/renv.lock
index c321d11..787a0fa 100644
--- a/renv.lock
+++ b/renv.lock
@@ -66,6 +66,20 @@
],
"Hash": "ef32d07aafdd12f24c5827374ae3590d"
},
+ "BiocIO": {
+ "Package": "BiocIO",
+ "Version": "1.14.0",
+ "Source": "Bioconductor",
+ "Repository": "Bioconductor 3.19",
+ "Requirements": [
+ "BiocGenerics",
+ "R",
+ "S4Vectors",
+ "methods",
+ "tools"
+ ],
+ "Hash": "f97a7ef01d364cf20d1946d43a3d526f"
+ },
"BiocManager": {
"Package": "BiocManager",
"Version": "1.30.25",
@@ -142,6 +156,26 @@
],
"Hash": "b892e27fc9659a4c8f8787d34c37b8b2"
},
+ "Biostrings": {
+ "Package": "Biostrings",
+ "Version": "2.72.1",
+ "Source": "Bioconductor",
+ "Repository": "Bioconductor 3.19",
+ "Requirements": [
+ "BiocGenerics",
+ "GenomeInfoDb",
+ "IRanges",
+ "R",
+ "S4Vectors",
+ "XVector",
+ "crayon",
+ "grDevices",
+ "methods",
+ "stats",
+ "utils"
+ ],
+ "Hash": "886ff0ed958d6f839ed2e0d01f6853b3"
+ },
"Cairo": {
"Package": "Cairo",
"Version": "1.6-2",
@@ -190,6 +224,38 @@
],
"Hash": "5d9536664ccddb0eaa68a90afe4ee76e"
},
+ "DropletUtils": {
+ "Package": "DropletUtils",
+ "Version": "1.24.0",
+ "Source": "Bioconductor",
+ "Repository": "Bioconductor 3.19",
+ "Requirements": [
+ "BH",
+ "BiocGenerics",
+ "BiocParallel",
+ "DelayedArray",
+ "DelayedMatrixStats",
+ "GenomicRanges",
+ "HDF5Array",
+ "IRanges",
+ "Matrix",
+ "R.utils",
+ "Rcpp",
+ "Rhdf5lib",
+ "S4Vectors",
+ "SingleCellExperiment",
+ "SummarizedExperiment",
+ "beachmat",
+ "dqrng",
+ "edgeR",
+ "methods",
+ "rhdf5",
+ "scuttle",
+ "stats",
+ "utils"
+ ],
+ "Hash": "77f762ad74d48a0ef578fc81deded039"
+ },
"FNN": {
"Package": "FNN",
"Version": "1.1.4",
@@ -228,6 +294,28 @@
],
"Hash": "c3c792a7b7f2677be56e8632c5b7543d"
},
+ "GenomicAlignments": {
+ "Package": "GenomicAlignments",
+ "Version": "1.40.0",
+ "Source": "Bioconductor",
+ "Repository": "Bioconductor 3.19",
+ "Requirements": [
+ "BiocGenerics",
+ "BiocParallel",
+ "Biostrings",
+ "GenomeInfoDb",
+ "GenomicRanges",
+ "IRanges",
+ "R",
+ "Rsamtools",
+ "S4Vectors",
+ "SummarizedExperiment",
+ "methods",
+ "stats",
+ "utils"
+ ],
+ "Hash": "e539709764587c581b31e446dc84d7b8"
+ },
"GenomicRanges": {
"Package": "GenomicRanges",
"Version": "1.56.1",
@@ -247,6 +335,29 @@
],
"Hash": "a3c822ef3c124828e25e7a9611beeb50"
},
+ "HDF5Array": {
+ "Package": "HDF5Array",
+ "Version": "1.32.1",
+ "Source": "Bioconductor",
+ "Repository": "Bioconductor 3.19",
+ "Requirements": [
+ "BiocGenerics",
+ "DelayedArray",
+ "IRanges",
+ "Matrix",
+ "R",
+ "Rhdf5lib",
+ "S4Arrays",
+ "S4Vectors",
+ "methods",
+ "rhdf5",
+ "rhdf5filters",
+ "stats",
+ "tools",
+ "utils"
+ ],
+ "Hash": "420012f82591a2a20156ef65d4aa210a"
+ },
"IRanges": {
"Package": "IRanges",
"Version": "2.38.1",
@@ -317,6 +428,45 @@
],
"Hash": "152dbbcde6a9a7c7f3beef79b68cd76a"
},
+ "R.methodsS3": {
+ "Package": "R.methodsS3",
+ "Version": "1.8.2",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Requirements": [
+ "R",
+ "utils"
+ ],
+ "Hash": "278c286fd6e9e75d0c2e8f731ea445c8"
+ },
+ "R.oo": {
+ "Package": "R.oo",
+ "Version": "1.27.0",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Requirements": [
+ "R",
+ "R.methodsS3",
+ "methods",
+ "utils"
+ ],
+ "Hash": "6ac79ff194202248cf946fe3a5d6d498"
+ },
+ "R.utils": {
+ "Package": "R.utils",
+ "Version": "2.12.3",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Requirements": [
+ "R",
+ "R.methodsS3",
+ "R.oo",
+ "methods",
+ "tools",
+ "utils"
+ ],
+ "Hash": "3dc2829b790254bfba21e60965787651"
+ },
"R6": {
"Package": "R6",
"Version": "2.5.1",
@@ -344,6 +494,18 @@
],
"Hash": "45f0398006e83a5b10b72a90663d8d8c"
},
+ "RCurl": {
+ "Package": "RCurl",
+ "Version": "1.98-1.16",
+ "Source": "Repository",
+ "Repository": "RSPM",
+ "Requirements": [
+ "R",
+ "bitops",
+ "methods"
+ ],
+ "Hash": "ddbdf53d15b47be4407ede6914f56fbb"
+ },
"ROCR": {
"Package": "ROCR",
"Version": "1.0-11",
@@ -465,6 +627,51 @@
],
"Hash": "c232938949fcd8126034419cc529333a"
},
+ "Rhdf5lib": {
+ "Package": "Rhdf5lib",
+ "Version": "1.26.0",
+ "Source": "Bioconductor",
+ "Repository": "Bioconductor 3.19",
+ "Requirements": [
+ "R"
+ ],
+ "Hash": "c92ba8b9a2c5c9ff600a1062a3b7b727"
+ },
+ "Rhtslib": {
+ "Package": "Rhtslib",
+ "Version": "3.0.0",
+ "Source": "Bioconductor",
+ "Repository": "Bioconductor 3.19",
+ "Requirements": [
+ "tools",
+ "zlibbioc"
+ ],
+ "Hash": "5d6514cd44a0106581e3310f3972a82e"
+ },
+ "Rsamtools": {
+ "Package": "Rsamtools",
+ "Version": "2.20.0",
+ "Source": "Bioconductor",
+ "Repository": "Bioconductor 3.19",
+ "Requirements": [
+ "BiocGenerics",
+ "BiocParallel",
+ "Biostrings",
+ "GenomeInfoDb",
+ "GenomicRanges",
+ "IRanges",
+ "R",
+ "Rhtslib",
+ "S4Vectors",
+ "XVector",
+ "bitops",
+ "methods",
+ "stats",
+ "utils",
+ "zlibbioc"
+ ],
+ "Hash": "9762f24dcbdbd1626173c516bb64792c"
+ },
"Rtsne": {
"Package": "Rtsne",
"Version": "0.17",
@@ -692,6 +899,18 @@
],
"Hash": "83d45b690bffd09d1980c224ef329f5b"
},
+ "XML": {
+ "Package": "XML",
+ "Version": "3.99-0.17",
+ "Source": "Repository",
+ "Repository": "RSPM",
+ "Requirements": [
+ "R",
+ "methods",
+ "utils"
+ ],
+ "Hash": "bc2a8a1139d8d4bd9c46086708945124"
+ },
"XVector": {
"Package": "XVector",
"Version": "0.44.0",
@@ -779,6 +998,30 @@
],
"Hash": "0f4e9d8caa6feaa7e409ae6c30f2ca66"
},
+ "bit": {
+ "Package": "bit",
+ "Version": "4.5.0",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Requirements": [
+ "R"
+ ],
+ "Hash": "5dc7b2677d65d0e874fc4aaf0e879987"
+ },
+ "bit64": {
+ "Package": "bit64",
+ "Version": "4.5.2",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Requirements": [
+ "R",
+ "bit",
+ "methods",
+ "stats",
+ "utils"
+ ],
+ "Hash": "e84984bf5f12a18628d9a02322128dfd"
+ },
"bitops": {
"Package": "bitops",
"Version": "1.0-8",
@@ -895,6 +1138,16 @@
],
"Hash": "b21916dd77a27642b447374a5d30ecf3"
},
+ "clipr": {
+ "Package": "clipr",
+ "Version": "0.8.0",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Requirements": [
+ "utils"
+ ],
+ "Hash": "3f038e5ac7f41d4ac41ce658c85e3042"
+ },
"cluster": {
"Package": "cluster",
"Version": "2.1.6",
@@ -979,6 +1232,20 @@
],
"Hash": "859d96e65ef198fd43e82b9628d593ef"
},
+ "credentials": {
+ "Package": "credentials",
+ "Version": "2.0.2",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Requirements": [
+ "askpass",
+ "curl",
+ "jsonlite",
+ "openssl",
+ "sys"
+ ],
+ "Hash": "09fd631e607a236f8cc7f9604db32cb8"
+ },
"crosstalk": {
"Package": "crosstalk",
"Version": "1.2.1",
@@ -1304,6 +1571,21 @@
],
"Hash": "b052bd270aeddeca332c20feecfb039d"
},
+ "gert": {
+ "Package": "gert",
+ "Version": "2.1.4",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Requirements": [
+ "askpass",
+ "credentials",
+ "openssl",
+ "rstudioapi",
+ "sys",
+ "zip"
+ ],
+ "Hash": "ae855ad6d7be20dd7b05d43d25700398"
+ },
"ggbeeswarm": {
"Package": "ggbeeswarm",
"Version": "0.7.2",
@@ -1390,6 +1672,34 @@
],
"Hash": "66488692cb8621bc78df1b9b819497a6"
},
+ "gh": {
+ "Package": "gh",
+ "Version": "1.4.1",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Requirements": [
+ "R",
+ "cli",
+ "gitcreds",
+ "glue",
+ "httr2",
+ "ini",
+ "jsonlite",
+ "lifecycle",
+ "rlang"
+ ],
+ "Hash": "fbbbc48eba7a6626a08bb365e44b563b"
+ },
+ "gitcreds": {
+ "Package": "gitcreds",
+ "Version": "0.1.2",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Requirements": [
+ "R"
+ ],
+ "Hash": "ab08ac61f3e1be454ae21911eb8bc2fe"
+ },
"globals": {
"Package": "globals",
"Version": "0.16.3",
@@ -1500,6 +1810,20 @@
],
"Hash": "d65ba49117ca223614f71b60d85b8ab7"
},
+ "hms": {
+ "Package": "hms",
+ "Version": "1.1.3",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Requirements": [
+ "lifecycle",
+ "methods",
+ "pkgconfig",
+ "rlang",
+ "vctrs"
+ ],
+ "Hash": "b59377caa7ed00fa41808342002138f9"
+ },
"htmltools": {
"Package": "htmltools",
"Version": "0.5.8.1",
@@ -1561,6 +1885,27 @@
],
"Hash": "ac107251d9d9fd72f0ca8049988f1d7f"
},
+ "httr2": {
+ "Package": "httr2",
+ "Version": "1.0.6",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Requirements": [
+ "R",
+ "R6",
+ "cli",
+ "curl",
+ "glue",
+ "lifecycle",
+ "magrittr",
+ "openssl",
+ "rappdirs",
+ "rlang",
+ "vctrs",
+ "withr"
+ ],
+ "Hash": "3ef5d07ec78803475a94367d71b40c41"
+ },
"ica": {
"Package": "ica",
"Version": "1.0-3",
@@ -1591,6 +1936,13 @@
],
"Hash": "c3b7d801d722e26e4cd888e042bf9af5"
},
+ "ini": {
+ "Package": "ini",
+ "Version": "0.3.1",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Hash": "6154ec2223172bce8162d4153cda21f7"
+ },
"irlba": {
"Package": "irlba",
"Version": "2.3.5.1",
@@ -2122,6 +2474,16 @@
"Repository": "CRAN",
"Hash": "a555924add98c99d2f411e37e7d25e9f"
},
+ "prettyunits": {
+ "Package": "prettyunits",
+ "Version": "1.2.0",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Requirements": [
+ "R"
+ ],
+ "Hash": "6b01fc98b1e86c4f705ce9dcfd2f57c7"
+ },
"processx": {
"Package": "processx",
"Version": "3.8.4",
@@ -2135,6 +2497,20 @@
],
"Hash": "0c90a7d71988856bad2a2a45dd871bb9"
},
+ "progress": {
+ "Package": "progress",
+ "Version": "1.2.3",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Requirements": [
+ "R",
+ "R6",
+ "crayon",
+ "hms",
+ "prettyunits"
+ ],
+ "Hash": "f4625e061cb2865f111b47ff163a5ca6"
+ },
"progressr": {
"Package": "progressr",
"Version": "0.14.0",
@@ -2210,6 +2586,29 @@
],
"Hash": "5e3c5dc0b071b21fa128676560dbe94d"
},
+ "readr": {
+ "Package": "readr",
+ "Version": "2.1.5",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Requirements": [
+ "R",
+ "R6",
+ "cli",
+ "clipr",
+ "cpp11",
+ "crayon",
+ "hms",
+ "lifecycle",
+ "methods",
+ "rlang",
+ "tibble",
+ "tzdb",
+ "utils",
+ "vroom"
+ ],
+ "Hash": "9de96463d2117f6ac49980577939dfb3"
+ },
"rematch2": {
"Package": "rematch2",
"Version": "2.1.2",
@@ -2222,13 +2621,13 @@
},
"renv": {
"Package": "renv",
- "Version": "1.0.7",
+ "Version": "1.0.11",
"Source": "Repository",
"Repository": "RSPM",
"Requirements": [
"utils"
],
- "Hash": "397b7b2a265bc5a7a06852524dabae20"
+ "Hash": "47623f66b4e80b3b0587bc5d7b309888"
},
"reshape2": {
"Package": "reshape2",
@@ -2243,6 +2642,22 @@
],
"Hash": "bb5996d0bd962d214a11140d77589917"
},
+ "restfulr": {
+ "Package": "restfulr",
+ "Version": "0.0.15",
+ "Source": "Repository",
+ "Repository": "RSPM",
+ "Requirements": [
+ "R",
+ "RCurl",
+ "S4Vectors",
+ "XML",
+ "methods",
+ "rjson",
+ "yaml"
+ ],
+ "Hash": "44651c1e68eda9d462610aca9f15a815"
+ },
"reticulate": {
"Package": "reticulate",
"Version": "1.39.0",
@@ -2265,6 +2680,39 @@
],
"Hash": "e1a5d04397edc1580c5e0ed1dbdccf76"
},
+ "rhdf5": {
+ "Package": "rhdf5",
+ "Version": "2.48.0",
+ "Source": "Bioconductor",
+ "Repository": "Bioconductor 3.19",
+ "Requirements": [
+ "R",
+ "Rhdf5lib",
+ "methods",
+ "rhdf5filters"
+ ],
+ "Hash": "74d8c5aeb96d090ce8efc9ffd16afa2b"
+ },
+ "rhdf5filters": {
+ "Package": "rhdf5filters",
+ "Version": "1.16.0",
+ "Source": "Bioconductor",
+ "Repository": "Bioconductor 3.19",
+ "Requirements": [
+ "Rhdf5lib"
+ ],
+ "Hash": "99e15369f8fb17dc188377234de13fc6"
+ },
+ "rjson": {
+ "Package": "rjson",
+ "Version": "0.2.23",
+ "Source": "Repository",
+ "Repository": "RSPM",
+ "Requirements": [
+ "R"
+ ],
+ "Hash": "7a04e9eff95857dbf557b4e5f0b3d1a8"
+ },
"rlang": {
"Package": "rlang",
"Version": "1.1.4",
@@ -2309,6 +2757,13 @@
],
"Hash": "4c8415e0ec1e29f3f4f6fc108bef0144"
},
+ "rstudioapi": {
+ "Package": "rstudioapi",
+ "Version": "0.17.1",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Hash": "5f90cd73946d706cfe26024294236113"
+ },
"rsvd": {
"Package": "rsvd",
"Version": "1.0.5",
@@ -2320,6 +2775,33 @@
],
"Hash": "b462187d887abc519894874486dbd6fd"
},
+ "rtracklayer": {
+ "Package": "rtracklayer",
+ "Version": "1.64.0",
+ "Source": "Bioconductor",
+ "Repository": "Bioconductor 3.19",
+ "Requirements": [
+ "BiocGenerics",
+ "BiocIO",
+ "Biostrings",
+ "GenomeInfoDb",
+ "GenomicAlignments",
+ "GenomicRanges",
+ "IRanges",
+ "R",
+ "Rsamtools",
+ "S4Vectors",
+ "XML",
+ "XVector",
+ "curl",
+ "httr",
+ "methods",
+ "restfulr",
+ "tools",
+ "zlibbioc"
+ ],
+ "Hash": "3d6f004fce582bd7d68e2e18d44abbc1"
+ },
"sass": {
"Package": "sass",
"Version": "0.4.9",
@@ -2901,6 +3383,48 @@
],
"Hash": "cfbad971a71f0e27cec22e544a08bc3b"
},
+ "tzdb": {
+ "Package": "tzdb",
+ "Version": "0.4.0",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Requirements": [
+ "R",
+ "cpp11"
+ ],
+ "Hash": "f561504ec2897f4d46f0c7657e488ae1"
+ },
+ "usethis": {
+ "Package": "usethis",
+ "Version": "3.0.0",
+ "Source": "Repository",
+ "Repository": "RSPM",
+ "Requirements": [
+ "R",
+ "cli",
+ "clipr",
+ "crayon",
+ "curl",
+ "desc",
+ "fs",
+ "gert",
+ "gh",
+ "glue",
+ "jsonlite",
+ "lifecycle",
+ "purrr",
+ "rappdirs",
+ "rlang",
+ "rprojroot",
+ "rstudioapi",
+ "stats",
+ "utils",
+ "whisker",
+ "withr",
+ "yaml"
+ ],
+ "Hash": "b2fbf93c2127bedd2cbe9b799530d5d2"
+ },
"utf8": {
"Package": "utf8",
"Version": "1.2.4",
@@ -2978,6 +3502,32 @@
],
"Hash": "c826c7c4241b6fc89ff55aaea3fa7491"
},
+ "vroom": {
+ "Package": "vroom",
+ "Version": "1.6.5",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Requirements": [
+ "R",
+ "bit64",
+ "cli",
+ "cpp11",
+ "crayon",
+ "glue",
+ "hms",
+ "lifecycle",
+ "methods",
+ "progress",
+ "rlang",
+ "stats",
+ "tibble",
+ "tidyselect",
+ "tzdb",
+ "vctrs",
+ "withr"
+ ],
+ "Hash": "390f9315bc0025be03012054103d227c"
+ },
"waldo": {
"Package": "waldo",
"Version": "0.5.3",
@@ -2995,6 +3545,13 @@
],
"Hash": "16aa934a49658677d8041df9017329b9"
},
+ "whisker": {
+ "Package": "whisker",
+ "Version": "0.4.1",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Hash": "c6abfa47a46d281a7d5159d0a8891e88"
+ },
"withr": {
"Package": "withr",
"Version": "3.0.1",
@@ -3039,6 +3596,13 @@
"Repository": "RSPM",
"Hash": "51dab85c6c98e50a18d7551e9d49f76c"
},
+ "zip": {
+ "Package": "zip",
+ "Version": "2.3.1",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Hash": "fcc4bd8e6da2d2011eb64a5e5cc685ab"
+ },
"zlibbioc": {
"Package": "zlibbioc",
"Version": "1.50.0",
diff --git a/renv/activate.R b/renv/activate.R
index d13f993..0eb5108 100644
--- a/renv/activate.R
+++ b/renv/activate.R
@@ -2,7 +2,7 @@
local({
# the requested version of renv
- version <- "1.0.7"
+ version <- "1.0.11"
attr(version, "sha") <- NULL
# the project directory
@@ -98,6 +98,66 @@ local({
unloadNamespace("renv")
# load bootstrap tools
+ ansify <- function(text) {
+ if (renv_ansify_enabled())
+ renv_ansify_enhanced(text)
+ else
+ renv_ansify_default(text)
+ }
+
+ renv_ansify_enabled <- function() {
+
+ override <- Sys.getenv("RENV_ANSIFY_ENABLED", unset = NA)
+ if (!is.na(override))
+ return(as.logical(override))
+
+ pane <- Sys.getenv("RSTUDIO_CHILD_PROCESS_PANE", unset = NA)
+ if (identical(pane, "build"))
+ return(FALSE)
+
+ testthat <- Sys.getenv("TESTTHAT", unset = "false")
+ if (tolower(testthat) %in% "true")
+ return(FALSE)
+
+ iderun <- Sys.getenv("R_CLI_HAS_HYPERLINK_IDE_RUN", unset = "false")
+ if (tolower(iderun) %in% "false")
+ return(FALSE)
+
+ TRUE
+
+ }
+
+ renv_ansify_default <- function(text) {
+ text
+ }
+
+ renv_ansify_enhanced <- function(text) {
+
+ # R help links
+ pattern <- "`\\?(renv::(?:[^`])+)`"
+ replacement <- "`\033]8;;ide:help:\\1\a?\\1\033]8;;\a`"
+ text <- gsub(pattern, replacement, text, perl = TRUE)
+
+ # runnable code
+ pattern <- "`(renv::(?:[^`])+)`"
+ replacement <- "`\033]8;;ide:run:\\1\a\\1\033]8;;\a`"
+ text <- gsub(pattern, replacement, text, perl = TRUE)
+
+ # return ansified text
+ text
+
+ }
+
+ renv_ansify_init <- function() {
+
+ envir <- renv_envir_self()
+ if (renv_ansify_enabled())
+ assign("ansify", renv_ansify_enhanced, envir = envir)
+ else
+ assign("ansify", renv_ansify_default, envir = envir)
+
+ }
+
`%||%` <- function(x, y) {
if (is.null(x)) y else x
}
@@ -142,7 +202,10 @@ local({
# compute common indent
indent <- regexpr("[^[:space:]]", lines)
common <- min(setdiff(indent, -1L)) - leave
- paste(substring(lines, common), collapse = "\n")
+ text <- paste(substring(lines, common), collapse = "\n")
+
+ # substitute in ANSI links for executable renv code
+ ansify(text)
}
@@ -305,8 +368,11 @@ local({
quiet = TRUE
)
- if ("headers" %in% names(formals(utils::download.file)))
- args$headers <- renv_bootstrap_download_custom_headers(url)
+ if ("headers" %in% names(formals(utils::download.file))) {
+ headers <- renv_bootstrap_download_custom_headers(url)
+ if (length(headers) && is.character(headers))
+ args$headers <- headers
+ }
do.call(utils::download.file, args)
@@ -385,10 +451,21 @@ local({
for (type in types) {
for (repos in renv_bootstrap_repos()) {
+ # build arguments for utils::available.packages() call
+ args <- list(type = type, repos = repos)
+
+ # add custom headers if available -- note that
+ # utils::available.packages() will pass this to download.file()
+ if ("headers" %in% names(formals(utils::download.file))) {
+ headers <- renv_bootstrap_download_custom_headers(repos)
+ if (length(headers) && is.character(headers))
+ args$headers <- headers
+ }
+
# retrieve package database
db <- tryCatch(
as.data.frame(
- utils::available.packages(type = type, repos = repos),
+ do.call(utils::available.packages, args),
stringsAsFactors = FALSE
),
error = identity
@@ -470,6 +547,14 @@ local({
}
+ renv_bootstrap_github_token <- function() {
+   # value of the first GitHub token variable that is set; "" (never NULL)
+   # when none is, so callers can pass the result to nzchar() inside if ()
+   envvars <- c("GITHUB_TOKEN", "GITHUB_PAT", "GH_TOKEN")
+   envvals <- Sys.getenv(envvars, unset = NA)
+   c(envvals[!is.na(envvals)], "")[[1L]]
+ }
+
renv_bootstrap_download_github <- function(version) {
enabled <- Sys.getenv("RENV_BOOTSTRAP_FROM_GITHUB", unset = "TRUE")
@@ -477,16 +562,16 @@ local({
return(FALSE)
# prepare download options
- pat <- Sys.getenv("GITHUB_PAT")
- if (nzchar(Sys.which("curl")) && nzchar(pat)) {
+ token <- renv_bootstrap_github_token()
+ if (nzchar(Sys.which("curl")) && nzchar(token)) {
fmt <- "--location --fail --header \"Authorization: token %s\""
- extra <- sprintf(fmt, pat)
+ extra <- sprintf(fmt, token)
saved <- options("download.file.method", "download.file.extra")
options(download.file.method = "curl", download.file.extra = extra)
on.exit(do.call(base::options, saved), add = TRUE)
- } else if (nzchar(Sys.which("wget")) && nzchar(pat)) {
+ } else if (nzchar(Sys.which("wget")) && nzchar(token)) {
fmt <- "--header=\"Authorization: token %s\""
- extra <- sprintf(fmt, pat)
+ extra <- sprintf(fmt, token)
saved <- options("download.file.method", "download.file.extra")
options(download.file.method = "wget", download.file.extra = extra)
on.exit(do.call(base::options, saved), add = TRUE)
diff --git a/renv/settings.json b/renv/settings.json
index 9dceb94..d79b996 100644
--- a/renv/settings.json
+++ b/renv/settings.json
@@ -9,7 +9,7 @@
],
"ppm.enabled": null,
"ppm.ignored.urls": [],
- "r.version": null,
+ "r.version": "4.4.0",
"snapshot.type": "implicit",
"use.cache": true,
"vcs.ignore.cellar": true,
diff --git a/tests/testthat/test-convert-gene-ids.R b/tests/testthat/test-convert-gene-ids.R
index 491c0a3..23ec3c2 100644
--- a/tests/testthat/test-convert-gene-ids.R
+++ b/tests/testthat/test-convert-gene-ids.R
@@ -1,30 +1,86 @@
-# read in SCE for testing
-sce <- readRDS(test_path("data", "scpca_sce.rds"))
-
-test_that("basic ensembl_id conversion works", {
+test_that("basic gene symbol conversion works", {
ensembl_ids <- c("ENSG00000141510", "ENSG00000134323")
- gene_symbols <- ensembl_to_symbol(ensembl_ids, sce)
+ gene_symbols <- ensembl_to_symbol(ensembl_ids)
expect_equal(gene_symbols, c("TP53", "MYCN"))
})
-test_that("ensembl_id conversion works with unexpected ids", {
+test_that("gene symbol conversion works with unexpected ids", {
ensembl_ids <- c("ENSG00000141510", "ENSG00000134323", "foobar")
- expect_warning(gene_symbols <- ensembl_to_symbol(ensembl_ids, sce))
+ expect_warning(gene_symbols <- ensembl_to_symbol(ensembl_ids))
expect_equal(gene_symbols, c("TP53", "MYCN", "foobar"))
- expect_no_warning(gene_symbols_na <- ensembl_to_symbol(ensembl_ids, sce, leave_na = TRUE))
+ expect_no_warning(gene_symbols_na <- ensembl_to_symbol(ensembl_ids, leave_na = TRUE))
expect_equal(gene_symbols_na, c("TP53", "MYCN", NA))
})
+test_that("gene symbol conversion works for 10x references", {
+ ensembl_ids <- c("ENSG00000141510", "ENSG00000134323")
+
+ gene_symbols <- ensembl_to_symbol(ensembl_ids, reference = "10x2020")
+ expect_equal(gene_symbols, c("TP53", "MYCN"))
+
+ gene_symbols <- ensembl_to_symbol(ensembl_ids, reference = "10x2024")
+ expect_equal(gene_symbols, c("TP53", "MYCN"))
+})
+
+test_that("gene symbol conversion works for 'unique' gene symbols", {
+ ensembl_ids <- c("ENSG00000015479", "ENSG00000269226")
+
+ gene_symbols <- ensembl_to_symbol(ensembl_ids, reference = "scpca", unique = FALSE)
+ expect_equal(gene_symbols, c("MATR3", "TMSB15B"))
+
+ gene_symbols <- ensembl_to_symbol(ensembl_ids, reference = "scpca", unique = TRUE)
+ expect_equal(gene_symbols, c("MATR3.1", "TMSB15B.1"))
+
+ gene_symbols <- ensembl_to_symbol(ensembl_ids, reference = "10x2020", unique = FALSE)
+ expect_equal(gene_symbols, c("MATR3", "TMSB15B"))
+
+ gene_symbols <- ensembl_to_symbol(ensembl_ids, reference = "10x2020", unique = TRUE)
+ expect_equal(gene_symbols, c("MATR3.1", "TMSB15B.1"))
+})
+
+test_that("gene symbol conversion works using an SCE reference", {
+ sce <- readRDS(test_path("data", "scpca_sce.rds"))
+ ensembl_ids <- c("ENSG00000015479", "ENSG00000269226")
+
+ gene_symbols <- ensembl_to_symbol(ensembl_ids, sce = sce, unique = FALSE)
+ expect_equal(gene_symbols, c("MATR3", "TMSB15B"))
+
+ gene_symbols <- ensembl_to_symbol(ensembl_ids, sce = sce, unique = TRUE)
+ expect_equal(gene_symbols, c("MATR3.1", "TMSB15B.1"))
+})
+
+
+
test_that("conversion of a full sce object works as expected", {
+ sce <- readRDS(test_path("data", "scpca_sce.rds"))
+ gene_symbols <- rowData(sce)$gene_symbol
+ names(gene_symbols) <- rowData(sce)$gene_ids
+ gene_symbols[is.na(gene_symbols)] <- names(gene_symbols)[is.na(gene_symbols)]
+
expect_warning(converted_sce <- sce_to_symbols(sce))
+ expect_equal(rownames(converted_sce), unname(gene_symbols))
+
+ # check that hvg and PCA were converted too.
+ expected_hvg <- gene_symbols[metadata(sce)$highly_variable_genes]
+ expect_equal(metadata(converted_sce)$highly_variable_genes, expected_hvg)
+
+ rotation_ids <- rownames(attr(reducedDim(converted_sce, "PCA"), "rotation"))
+ expected_rotation_ids <- gene_symbols[rownames(attr(reducedDim(sce, "PCA"), "rotation"))]
+ expect_equal(rotation_ids, expected_rotation_ids)
+})
+
+test_that("conversion of an sce object using a reference works as expected", {
+ sce <- readRDS(test_path("data", "scpca_sce.rds"))
gene_symbols <- rowData(sce)$gene_symbol
names(gene_symbols) <- rowData(sce)$gene_ids
gene_symbols[is.na(gene_symbols)] <- names(gene_symbols)[is.na(gene_symbols)]
+ # testing with the ScPCA reference, which should be the same as internal table
+ converted_sce <- sce_to_symbols(sce, reference = "scpca")
expect_equal(rownames(converted_sce), unname(gene_symbols))
# check that hvg and PCA were converted too.