From d071fafc7ccc58d93a85e68ecdb952603a865846 Mon Sep 17 00:00:00 2001
From: Sehrish Kanwal <kanwals@unimelb.edu.au>
Date: Thu, 15 Feb 2024 15:47:21 +1100
Subject: [PATCH] Use AnnotationHub to access ensembldb (#102)

* use annotationhub to access ensembldb

* explicitly specify package

* specify package for select call

* remove ensembldb from libraries

* refactor function

* download db in a separate function

* return object

* script for downloading ensembl database

* remove db download function and pass as an argument

* save complete gene and tx ids to a file

* save tx and gene id object as external data

* "save tx and gene id object as external data"

This reverts commit 9aa143a3d6467cdf4511ce453cd4423e36c9311e.

revert data commit

* Revert "save complete gene and tx ids to a file"

This reverts commit 2cfa9e79c1869a7ba33fe3164c413860d89d880a.

* add ah database download script

* transcript and gene id object for ensdb 105

* remove namespace for get_ensembl_db

* use saved rds file to get info from ensembl db

* update data prep script to include additional columns

* updated rds data

* get gene length from saved rds object

* update gitignore

* remove ensdb pkg library

* get ensembl annotation file from RNAsum.data package

* remove css file

* update tx_gene_id_105 input

* remove duplicated and redundant libraries

* ensembl data now exists in RNAsum.data

* clean-up code

---------

Co-authored-by: Peter Diakumis <peterdiakumis@gmail.com>
---
 .gitignore          |  6 +++++
 inst/rmd/rnasum.Rmd | 53 +++++++++++++++++++++++----------------------
 2 files changed, 33 insertions(+), 26 deletions(-)

diff --git a/.gitignore b/.gitignore
index 6393088f..057ae76e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,6 +32,12 @@ vignettes/*.pdf
 
 /nogit
 
+# tmp runtime files
+*.png
+*.html
+*.txt
+*.pdf
+
 # Big data files #
 ######################
 #data/Avner.counts.matrix.txt
diff --git a/inst/rmd/rnasum.Rmd b/inst/rmd/rnasum.Rmd
index 317cda84..35df26bb 100755
--- a/inst/rmd/rnasum.Rmd
+++ b/inst/rmd/rnasum.Rmd
@@ -63,11 +63,11 @@ knitr::knit_hooks$set(timeit = local({
 knitr::opts_chunk$set(timeit = TRUE, echo = FALSE)
 ```
 
-```{r load_pkgs, message=FALSE}
-# source(here::here("inst/rmd/params/pd.R")) # for use with 'Run All Chunks Above'
-# source(here::here("inst/rmd/params/sk.R")) # for use with 'Run All Chunks Above'
-ensembl_version <- 86L
-ensdb_pkg <- paste0("EnsDb.Hsapiens.v", ensembl_version)
+```{r echo=TRUE, message=FALSE}
+#source(here::here("inst/rmd/params/pd.R")) # for use with 'Run All Chunks Above'
+#source(here::here("inst/rmd/params/sk.R")) # for use with 'Run All Chunks Above'
+
+# start with more exotic pkgs, then get to core ones
 {
   library(conflicted) # checks for pkg function conflicts
   library(AnnotationDbi, include.only = c("keys"))
@@ -76,7 +76,6 @@ ensdb_pkg <- paste0("EnsDb.Hsapiens.v", ensembl_version)
   library(dplyr, include.only = c("mutate", "select", "filter", "if_else"))
   library(EDASeq, include.only = c("plotRLE"))
   library(edgeR, include.only = c("DGEList"))
-  library(ensdb_pkg, character.only = TRUE, include.only = ensdb_pkg)
   library(ensembldb, include.only = c("lengthOf"))
   library(fs, include.only = c("dir_create"))
   library(glue, include.only = c("glue"))
@@ -111,14 +110,12 @@ sample_name <- paste0(
 sample_report_title <- sample_name
 results_dir <- file.path(params$report_dir, glue::glue("{sample_name}.results"))
 fs::dir_create(results_dir)
-# Annotate transcripts with gene IDs
-# These could probably go into refdata
-##### Get genes annotation and genomic locations
-##### Get keytypes for gene SYMBOL
-edb <- eval(parse(text = ensdb_pkg))
-keys <- AnnotationDbi::keys(edb, keytype = "GENEID")
-tx2ensembl <- edb |>
-  ensembldb::select(keys = keys, columns = c("TXID", "GENEID"), keytype = "GENEID") |>
+#### Annotate transcripts with gene IDs
+#### Get genes annotation and genomic locations using AnnotationHub
+#### tx_gene_id_105.rds was generated using the ah_edb.R script under data-raw
+tx_gene_id_105 <- system.file("extdata/ensembl/tx_gene_id_105.rds", package = "RNAsum.data")
+tx2ensembl <- tx_gene_id_105 |>
+  dplyr::select("TXID", "GENEID") |>
   dplyr::rename(tx_name = "TXID", gene_id = "GENEID")
 
 # reference data
@@ -335,9 +332,12 @@ genes2keep <- unlist(ref_genes.list[["summary"]]) |> unique()
 ##### Get gene symbols for the genes of interest. These genes will not be filtered out due to low/insufficient expression
 
 ##### Get genes genomic coordinates
-gene_info <- edb |>
-  ensembldb::select(keys = keys, columns = c("GENEID", "GENENAME"), keytype = "GENEID") |>
+gene_info <- tx_gene_id_105 |>
+  dplyr::select("GENEID", "GENENAME") |>
   dplyr::rename("ENSEMBL" = "GENEID", "SYMBOL" = "GENENAME")
+#gene_info <- edb |>
+#  ensembldb::select(keys = keys, columns = c("GENEID", "GENENAME"), keytype = "GENEID") |>
+#  dplyr::rename("ENSEMBL" = "GENEID", "SYMBOL" = "GENENAME")
 
 ##### Limit genes annotation to the gene of interest, then
 ##### remove rows with duplicated ENSEMBL IDs
@@ -467,7 +467,10 @@ for (group in targets_mod.list) {
     ##### TPM transformation with filtering
   } else if (params$filter && params$transform == "TPM") {
     ##### Get genes lengths
-    gene.length <- ensembldb::lengthOf(edb, filter = AnnotationFilter::GeneIdFilter(rownames(dat1)))
+    #gene.length <- ensembldb::lengthOf(edb, filter = AnnotationFilter::GeneIdFilter(rownames(dat1)))
+    gene.length <- setNames(as.integer(tx_gene_id_105$GENELENGTH), tx_gene_id_105$GENEID)
+    # Select distinct gene ids
+    gene.length <- gene.length[!duplicated(names(gene.length))]
 
     ##### Check for which genes the lenght info is not available and remove them from the data
     genes.no_length <- rownames(dat1)[!rownames(dat1) %in% names(gene.length)]
@@ -513,7 +516,10 @@ for (group in targets_mod.list) {
     ##### TPM transformation without filtering
   } else if (!params$filter && params$transform == "TPM") {
     ##### Get genes lengths
-    gene.length <- ensembldb::lengthOf(edb, filter = AnnotationFilter::GeneIdFilter(rownames(dat1)))
+    #gene.length <- ensembldb::lengthOf(edb, filter = AnnotationFilter::GeneIdFilter(rownames(dat1)))
+    gene.length <- setNames(as.integer(tx_gene_id_105$GENELENGTH), tx_gene_id_105$GENEID)
+    # Select distinct gene ids
+    gene.length <- gene.length[!duplicated(names(gene.length))]
 
     ##### Check for which genes the lenght info is not available and remove them from the data
     genes.no_length <- rownames(dat1)[!rownames(dat1) %in% names(gene.length)]
@@ -1060,14 +1066,9 @@ for (dataset in names(ref_dataset.list)) {
 
   ##### Get genes genomic coordiantes
   ##### Limit genes annotation to those genes for which sample expression measurements are available
-  gene_info <- edb |>
-    ensembldb::select(
-      keys = keys,
-      columns = c(
-        "GENEID", "GENEBIOTYPE", "GENENAME", "SEQNAME",
-        "GENESEQSTART", "GENESEQEND"
-      ), keytype = "GENEID"
-    ) |>
+  gene_info <- tx_gene_id_105 |>
+    dplyr::select("GENEID", "GENEBIOTYPE", "GENENAME", "SEQNAME",
+        "GENESEQSTART", "GENESEQEND") |>
     dplyr::rename("ENSEMBL" = "GENEID", "SYMBOL" = "GENENAME") |>
     dplyr::filter(.data$ENSEMBL %in% data.df$ENSEMBL)