From d071fafc7ccc58d93a85e68ecdb952603a865846 Mon Sep 17 00:00:00 2001 From: Sehrish Kanwal Date: Thu, 15 Feb 2024 15:47:21 +1100 Subject: [PATCH] Use AnnotationHub to access ensembldb (#102) * use annotationhub to access ensembldb * explicitly specify package * specify package for select call * remove ensembldb from libraries * refactor function * download db in a separate function * return object * script for downloading ensembl database * remove db download function and pass as an argument * save complete gene and tx ids to a file * save tx and gene id object as external data * "save tx and gene id object as external data" This reverts commit 9aa143a3d6467cdf4511ce453cd4423e36c9311e. revert data commit * Revert "save complete gene and tx ids to a file" This reverts commit 2cfa9e79c1869a7ba33fe3164c413860d89d880a. * add ah database download script * transcript and gene id object for ensdb 105 * remove namespace for get_ensembl_db * use saved rds file to get info from ensembl db * update data prep script to include additional columns * updated rds data * get gene length from saved rds object * update gitignore * remove ensdb pkg library * get ensembl annotation file from RNAsum.data package * remove css file * update tx_gene_id_105 input * remove duplicated and redundant libraries * ensembl data now exists in RNAsum.data * clean-up code --------- Co-authored-by: Peter Diakumis --- .gitignore | 6 +++++ inst/rmd/rnasum.Rmd | 53 +++++++++++++++++++++++---------------------- 2 files changed, 33 insertions(+), 26 deletions(-) diff --git a/.gitignore b/.gitignore index 6393088f..057ae76e 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,12 @@ vignettes/*.pdf /nogit +# tmp runtime files +*.png +*.html +*.txt +*.pdf + # Big data files # ###################### #data/Avner.counts.matrix.txt diff --git a/inst/rmd/rnasum.Rmd b/inst/rmd/rnasum.Rmd index 317cda84..35df26bb 100755 --- a/inst/rmd/rnasum.Rmd +++ b/inst/rmd/rnasum.Rmd @@ -63,11 +63,11 @@ knitr::knit_hooks$set(timeit = local({ knitr::opts_chunk$set(timeit = TRUE, echo = FALSE) ``` -```{r load_pkgs, message=FALSE} -# source(here::here("inst/rmd/params/pd.R")) # for use with 'Run All Chunks Above' -# source(here::here("inst/rmd/params/sk.R")) # for use with 'Run All Chunks Above' -ensembl_version <- 86L -ensdb_pkg <- paste0("EnsDb.Hsapiens.v", ensembl_version) +```{r echo=TRUE, message=FALSE} +#source(here::here("inst/rmd/params/pd.R")) # for use with 'Run All Chunks Above' +#source(here::here("inst/rmd/params/sk.R")) # for use with 'Run All Chunks Above' + +# start with more exotic pkgs, then get to core ones { library(conflicted) # checks for pkg function conflicts library(AnnotationDbi, include.only = c("keys")) @@ -76,7 +76,6 @@ ensdb_pkg <- paste0("EnsDb.Hsapiens.v", ensembl_version) library(dplyr, include.only = c("mutate", "select", "filter", "if_else")) library(EDASeq, include.only = c("plotRLE")) library(edgeR, include.only = c("DGEList")) - library(ensdb_pkg, character.only = TRUE, include.only = ensdb_pkg) library(ensembldb, include.only = c("lengthOf")) library(fs, include.only = c("dir_create")) library(glue, include.only = c("glue")) @@ -111,14 +110,12 @@ sample_name <- paste0( sample_report_title <- sample_name results_dir <- file.path(params$report_dir, glue::glue("{sample_name}.results")) fs::dir_create(results_dir) -# Annotate transcripts with gene IDs -# These could probably go into refdata -##### Get genes annotation and genomic locations -##### Get keytypes for gene SYMBOL -edb <- eval(parse(text = ensdb_pkg)) -keys <- AnnotationDbi::keys(edb, keytype = "GENEID") -tx2ensembl <- edb |> - ensembldb::select(keys = keys, columns = c("TXID", "GENEID"), keytype = "GENEID") |> +#### Annotate transcripts with gene IDs +#### Get genes annotation and genomic locations using AnnotationHub +#### tx_gene_id_105.rds was generated using the ah_edb.R script under data-raw +tx_gene_id_105 <- system.file("extdata/ensembl/tx_gene_id_105.rds", package = "RNAsum.data") +tx2ensembl <- tx_gene_id_105 |> + dplyr::select("TXID", "GENEID") |> dplyr::rename(tx_name = "TXID", gene_id = "GENEID") # reference data @@ -335,9 +332,12 @@ genes2keep <- unlist(ref_genes.list[["summary"]]) |> unique() ##### Get gene symbols for the genes of interest. These genes will not be filtered out due to low/insufficient expression ##### Get genes genomic coordinates -gene_info <- edb |> - ensembldb::select(keys = keys, columns = c("GENEID", "GENENAME"), keytype = "GENEID") |> +gene_info <- tx_gene_id_105 |> + dplyr::select("GENEID", "GENENAME") |> dplyr::rename("ENSEMBL" = "GENEID", "SYMBOL" = "GENENAME") +#gene_info <- edb |> +# ensembldb::select(keys = keys, columns = c("GENEID", "GENENAME"), keytype = "GENEID") |> +# dplyr::rename("ENSEMBL" = "GENEID", "SYMBOL" = "GENENAME") ##### Limit genes annotation to the gene of interest, then ##### remove rows with duplicated ENSEMBL IDs @@ -467,7 +467,10 @@ for (group in targets_mod.list) { ##### TPM transformation with filtering } else if (params$filter && params$transform == "TPM") { ##### Get genes lengths - gene.length <- ensembldb::lengthOf(edb, filter = AnnotationFilter::GeneIdFilter(rownames(dat1))) + #gene.length <- ensembldb::lengthOf(edb, filter = AnnotationFilter::GeneIdFilter(rownames(dat1))) + gene.length <- setNames(as.integer(tx_gene_id_105$GENELENGTH), tx_gene_id_105$GENEID) + # Select distinct gene ids + gene.length <- gene.length[!duplicated(names(gene.length))] ##### Check for which genes the lenght info is not available and remove them from the data genes.no_length <- rownames(dat1)[!rownames(dat1) %in% names(gene.length)] @@ -513,7 +516,10 @@ for (group in targets_mod.list) { ##### TPM transformation without filtering } else if (!params$filter && params$transform == "TPM") { ##### Get genes lengths - gene.length <- ensembldb::lengthOf(edb, filter = AnnotationFilter::GeneIdFilter(rownames(dat1))) + #gene.length <- ensembldb::lengthOf(edb, filter = AnnotationFilter::GeneIdFilter(rownames(dat1))) + gene.length <- setNames(as.integer(tx_gene_id_105$GENELENGTH), tx_gene_id_105$GENEID) + # Select distinct gene ids + gene.length <- gene.length[!duplicated(names(gene.length))] ##### Check for which genes the lenght info is not available and remove them from the data genes.no_length <- rownames(dat1)[!rownames(dat1) %in% names(gene.length)] @@ -1060,14 +1066,9 @@ for (dataset in names(ref_dataset.list)) { ##### Get genes genomic coordiantes ##### Limit genes annotation to those genes for which sample expression measurements are available - gene_info <- edb |> - ensembldb::select( - keys = keys, - columns = c( - "GENEID", "GENEBIOTYPE", "GENENAME", "SEQNAME", - "GENESEQSTART", "GENESEQEND" - ), keytype = "GENEID" - ) |> + gene_info <- tx_gene_id_105 |> + dplyr::select("GENEID", "GENEBIOTYPE", "GENENAME", "SEQNAME", + "GENESEQSTART", "GENESEQEND") |> dplyr::rename("ENSEMBL" = "GENEID", "SYMBOL" = "GENENAME") |> dplyr::filter(.data$ENSEMBL %in% data.df$ENSEMBL)