Skip to content

Commit

Permalink
Feat: Implement HGNC Data Update Functionality
Browse files Browse the repository at this point in the history
#### New Features and Enhancements:
- **API Endpoint**:
  - Added new endpoint `/api/admin/update_hgnc_data`.
  - Restricted access to Administrator users.
  - Secure transaction implementation for data update.

- **Functionality Enhancements**:
  - Developed `update_process_hgnc_data` in `hgnc-functions.R`.
  - Refined `gene_coordinates_from_symbol` and `gene_coordinates_from_ensembl` in `ensembl-functions.R`.
  - Prepared for upcoming HTTPS requirements for Ensembl connections.

- **UI Integration**:
  - Included HGNC data update section in `ManageAnnotations.vue`.
  - Implemented loading indicators and user notifications.

- **Error Handling and Clean-up**:
  - Improved error management in the API endpoint.
  - Ensured proper database disconnection with `on.exit(dbDisconnect(sysndd_db), add = TRUE)`.

- **Miscellaneous Improvements**:
  - Suppressed warnings and resolved parsing issues.
  - Integrated Biomart library into Docker and Plumber setup.

#### Closing Remarks:
These updates, addressing the "Feature request: Functionality to initiate gene table update/check #11," enhance the system's reliability and data accuracy by ensuring up-to-date HGNC data integration. Closes #11
  • Loading branch information
berntpopp committed Mar 29, 2024
1 parent 01cb1c3 commit 1ec4201
Show file tree
Hide file tree
Showing 5 changed files with 399 additions and 10 deletions.
1 change: 1 addition & 0 deletions api/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,4 @@ RUN Rscript -e 'require(devtools); install_version("tictoc", version = "1.2", re
RUN Rscript -e 'require(devtools); install_version("fs", version = "1.6.2", repos = "http://cran.us.r-project.org")'
RUN Rscript -e 'require(devtools); install_version("FactoMineR", version = "2.8", repos = "http://cran.us.r-project.org")'
RUN Rscript -e 'BiocManager::install(c("STRINGdb"))'
RUN Rscript -e 'BiocManager::install(c("biomaRt"))'
150 changes: 150 additions & 0 deletions api/functions/ensembl-functions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
#### This file holds analyses functions for the ensembl database name using biomart

#' Retrieve gene coordinates in BED format from gene symbols
#'
#' This function retrieves the gene coordinates in BED format for the given gene
#' symbols. The coordinates are obtained from the specified reference genome.
#'
#' @param gene_symbols A vector or tibble containing the gene symbols.
#' @param reference The reference genome to use (default: "hg19").
#'
#' @return A tibble with the gene symbols and their corresponding coordinates in BED format.
#'
#' @examples
#' gene_symbols <- c("ARID1B ", "GRIN2B", "NAA10")
#' gene_coordinates_from_symbol(gene_symbols, reference = "hg19")
#'
#' @export
gene_coordinates_from_symbol <- function(gene_symbols, reference = "hg19") {
gene_symbol_list <- as_tibble(gene_symbols) %>%
dplyr::select(hgnc_symbol = value)

# define mart
mart_hg19 <- useMart("ensembl", host = "grch37.ensembl.org")
mart_hg19 <- useDataset("hsapiens_gene_ensembl", mart_hg19)

mart_hg38 <- useMart("ensembl", host = "ensembl.org")
mart_hg38 <- useDataset("hsapiens_gene_ensembl", mart_hg38)

if (reference == "hg19") {
mart <- useMart("ensembl", host = "grch37.ensembl.org")
mart <- useDataset("hsapiens_gene_ensembl", mart_hg19)
} else {
mart <- useMart("ensembl", host = "ensembl.org")
mart <- useDataset("hsapiens_gene_ensembl", mart_hg38)
}

attributes <- c("hgnc_symbol", "chromosome_name", "start_position", "end_position")
filters <- c("hgnc_symbol")

values <- list(hgnc_symbol = gene_symbol_list$hgnc_symbol)

gene_coordinates_hg19 <- getBM(attributes=attributes, filters=filters, values=values, mart=mart) %>%
group_by(hgnc_symbol) %>%
summarise(hgnc_symbol = max(hgnc_symbol), chromosome_name = max(chromosome_name), start_position = max(start_position), end_position = max(end_position)) %>%
mutate(bed_format = paste0("chr", chromosome_name, ":", start_position, "-", end_position)) %>%
dplyr::select(hgnc_symbol, bed_format)

gene_symbol_list_return <- gene_symbol_list %>%
left_join(gene_coordinates_hg19, by = ("hgnc_symbol"))

return(gene_symbol_list_return)
}


#' Retrieve gene coordinates in BED format from Ensembl IDs
#'
#' This function retrieves the gene coordinates in BED format for the given Ensembl
#' gene IDs. The coordinates are obtained from the specified reference genome.
#'
#' @param ensembl_id A vector or tibble containing the Ensembl gene IDs.
#' @param reference The reference genome to use (default: "hg19").
#'
#' @return A tibble with the Ensembl gene IDs and their corresponding coordinates in BED format.
#'
#' @examples
#' ensembl_id <- c("ENSG00000123456", "ENSG00000123457", "ENSG00000123458")
#' gene_coordinates_from_ensembl(ensembl_id, reference = "hg19")
#'
#' @export
gene_coordinates_from_ensembl <- function(ensembl_id, reference = "hg19") {
ensembl_id_list <- as_tibble(ensembl_id) %>%
dplyr::select(ensembl_gene_id = value)

# define mart
mart_hg19 <- useMart("ensembl", host = "grch37.ensembl.org")
mart_hg19 <- useDataset("hsapiens_gene_ensembl", mart_hg19)

mart_hg38 <- useMart("ensembl", host = "ensembl.org")
mart_hg38 <- useDataset("hsapiens_gene_ensembl", mart_hg38)

if (reference == "hg19") {
mart <- useMart("ensembl", host = "grch37.ensembl.org")
mart <- useDataset("hsapiens_gene_ensembl", mart_hg19)
} else {
mart <- useMart("ensembl", host = "ensembl.org")
mart <- useDataset("hsapiens_gene_ensembl", mart_hg38)
}

attributes <- c("ensembl_gene_id", "chromosome_name", "start_position", "end_position")
filters <- c("ensembl_gene_id")

values <- list(ensembl_gene_id = ensembl_id_list$ensembl_gene_id)

gene_coordinates_hg19 <- getBM(attributes=attributes, filters=filters, values=values, mart=mart) %>%
group_by(ensembl_gene_id) %>%
summarise(ensembl_gene_id = max(ensembl_gene_id), chromosome_name = max(chromosome_name), start_position = max(start_position), end_position = max(end_position)) %>%
mutate(bed_format = paste0("chr", chromosome_name, ":", start_position, "-", end_position)) %>%
dplyr::select(ensembl_gene_id, bed_format)

ensembl_id_list_return <- ensembl_id_list %>%
left_join(gene_coordinates_hg19, by = ("ensembl_gene_id"))

return(ensembl_id_list_return)
}


#' Retrieve Ensembl gene ID versions from Ensembl gene IDs
#'
#' This function retrieves the Ensembl gene ID versions for the given Ensembl
#' gene IDs. The ID versions are obtained from the specified reference genome.
#'
#' @param ensembl_id A vector or tibble containing the Ensembl gene IDs.
#' @param reference The reference genome to use (default: "hg19").
#'
#' @return A tibble with the Ensembl gene IDs and their corresponding Ensembl
#' gene ID versions.
#'
#' @examples
#' ensembl_id <- c("ENSG00000203782", "ENSG00000008710")
#' gene_id_version_from_ensembl(ensembl_id, reference = "hg19")
#'
#' @export
gene_id_version_from_ensembl <- function(ensembl_id, reference = "hg19") {
ensembl_id_list <- enframe(ensembl_id,
name = NULL,
value = "ensembl_gene_id")

# Define mart
if (reference == "hg19") {
mart <- useMart("ensembl",
dataset = "hsapiens_gene_ensembl", host = "grch37.ensembl.org")
} else {
mart <- useMart("ensembl",
dataset = "hsapiens_gene_ensembl", host = "ensembl.org")
}

# Define the attributes and filters
attributes <- c("ensembl_gene_id", "ensembl_gene_id_version")
filters <- "ensembl_gene_id"

# Retrieve the data
gene_id_version <- getBM(attributes = attributes, filters = filters,
values = ensembl_id_list$ensembl_gene_id, mart = mart)

# Join the data back to the input list to ensure all input IDs are in the output
ensembl_id_list_return <- ensembl_id_list %>%
left_join(gene_id_version, by = "ensembl_gene_id")

return(ensembl_id_list_return)
}
108 changes: 107 additions & 1 deletion api/functions/hgnc-functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -197,4 +197,110 @@ symbol_from_hgnc_id_grouped <- function(input_tibble, request_max = 150) {
ungroup()

return(input_tibble_request$response)
}
}


#' Update and Process HGNC Data
#'
#' This function checks for the latest HGNC file and downloads it if necessary,
#' then processes the gene information, updates STRINGdb identifiers, computes gene
#' coordinates, and returns a tibble with the updated data. If the data is updated
#' successfully, it saves it as a CSV file.
#'
#' @param hgnc_link The URL to download the latest HGNC file.
#' @param output_path String, the path where the output CSV file will be stored.
#' @param max_file_age Integer, the number of days to consider the file recent enough not to require re-downloading.
#' @return A tibble containing the updated non_alt_loci_set data.
#'
#' @examples
#' \dontrun{
#' updated_hgnc_data <- update_process_hgnc_data()
#' }
#'
#' @export
update_process_hgnc_data <- function(hgnc_link = "http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/non_alt_loci_set.txt",
output_path = "data/",
max_file_age = 1) {
# TODO: replace with function
current_date <- strftime(as.POSIXlt(Sys.time(), "UTC", "%Y-%m-%dT%H:%M:%S"), "%Y-%m-%d")

# define the file base name
hgnc_file_basename <- "non_alt_loci_set"

if (check_file_age(hgnc_file_basename, output_path, max_file_age)) {
hgnc_file <- get_newest_file(hgnc_file_basename, output_path)
} else {
hgnc_file <- paste0(output_path,
hgnc_file_basename,
".",
current_date,
".txt")

download.file(hgnc_link, hgnc_file, mode = "wb", quiet = TRUE)
}

# Load the downloaded HGNC file
non_alt_loci_set <- suppressWarnings(read_delim(hgnc_file, "\t", col_names = TRUE, show_col_types = FALSE) %>%
mutate(update_date = current_date))

# get symbols for string db mapping
non_alt_loci_set_table <- non_alt_loci_set %>%
dplyr::select(symbol) %>%
unique()

# convert to data frame
non_alt_loci_set_df <- non_alt_loci_set_table %>%
as.data.frame()

# Load STRINGdb database
string_db <- STRINGdb$new(version = "11.5", species = 9606, score_threshold = 200, input_directory = output_path)

# Map the gene symbols to STRING identifiers
non_alt_loci_set_mapped <- string_db$map(non_alt_loci_set_df, "symbol")

# Convert the mapped data to a tibble
non_alt_loci_set_mapped_tibble <- as_tibble(non_alt_loci_set_mapped) %>%
filter(!is.na(STRING_id)) %>%
group_by(symbol) %>%
summarise(STRING_id = str_c(STRING_id, collapse=";")) %>%
ungroup %>%
unique()

## join with String identifiers
non_alt_loci_set_string <- non_alt_loci_set %>%
left_join(non_alt_loci_set_mapped_tibble, by="symbol")

# Compute gene coordinates from symbol and Ensembl ID
non_alt_loci_set_coordinates <- non_alt_loci_set_string %>%
mutate(hg19_coordinates_from_ensembl =
gene_coordinates_from_ensembl(ensembl_gene_id)) %>%
mutate(hg19_coordinates_from_symbol =
gene_coordinates_from_symbol(symbol)) %>%
mutate(hg38_coordinates_from_ensembl =
gene_coordinates_from_ensembl(ensembl_gene_id, reference = "hg38")) %>%
mutate(hg38_coordinates_from_symbol =
gene_coordinates_from_symbol(symbol, reference = "hg38")) %>%
mutate(bed_hg19 =
case_when(
!is.na(hg19_coordinates_from_ensembl$bed_format) ~
hg19_coordinates_from_ensembl$bed_format,
is.na(hg19_coordinates_from_ensembl$bed_format) ~
hg19_coordinates_from_symbol$bed_format,
)
) %>%
mutate(bed_hg38 =
case_when(
!is.na(hg38_coordinates_from_ensembl$bed_format) ~
hg38_coordinates_from_ensembl$bed_format,
is.na(hg38_coordinates_from_ensembl$bed_format) ~
hg38_coordinates_from_symbol$bed_format,
)
) %>%
dplyr::select(-hg19_coordinates_from_ensembl,
-hg19_coordinates_from_symbol,
-hg38_coordinates_from_ensembl,
-hg38_coordinates_from_symbol)

# Return the tibble
return(non_alt_loci_set_coordinates)
}
Loading

0 comments on commit 1ec4201

Please sign in to comment.