diff --git a/analyses/AnnotateMergedTable.R b/analyses/C_AnnotateMergedTable/C_AnnotateMergedTable.R
similarity index 72%
rename from analyses/AnnotateMergedTable.R
rename to analyses/C_AnnotateMergedTable/C_AnnotateMergedTable.R
index bc3ea9a..887f5f2 100644
--- a/analyses/AnnotateMergedTable.R
+++ b/analyses/C_AnnotateMergedTable/C_AnnotateMergedTable.R
@@ -1,12 +1,12 @@
 ############################################
 ## load libraries
-library(tidyverse) ## needed for general table operations
-library(jsonlite) ##needed for HGNC requests
-library(rvest) ## needed for scraping
-library(readr) ## needed to read files
-library(tools) ## needed for checksums
-library("R.utils") ## gzip downloaded and result files
-library(config) ## needed to read config file
+library(tidyverse)   ## needed for general table operations
+library(jsonlite)    ## needed for HGNC requests
+library(rvest)       ## needed for scraping
+library(readr)       ## needed to read files
+library(tools)       ## needed for checksums
+library("R.utils")   ## gzip downloaded and result files
+library(config)      ## needed to read config file
 library(ontologyIndex) ## needed to read ontology files
 ############################################
@@ -15,7 +15,7 @@ library(ontologyIndex) ## needed to read ontology files
 ## define relative script path
 project_topic <- "nephrology"
 project_name <- "kidney-genetics"
-script_path <- "/analyses/"
+script_path <- "/analyses/C_AnnotateMergedTable/"
 ## read configs
 config_vars_proj <- config::get(file = Sys.getenv("CONFIG_FILE"),
@@ -26,29 +26,41 @@ setwd(paste0(config_vars_proj$projectsdir, project_name, script_path))
 ## set global options
 options(scipen = 999)
+
+# compute date only once or somehow in config
+current_date <- strftime(as.POSIXlt(Sys.time(), "UTC", "%Y-%m-%dT%H:%M:%S"), "%Y-%m-%d")
 ############################################
 ############################################
 # load global functions
 # hgnc functions
-source("functions/hgnc-functions.R", local = TRUE)
-source("functions/hpo-functions.R", local = TRUE)
-source("functions/file-functions.R", local = TRUE)
-source("functions/gtex-functions.R", local = TRUE)
-source("functions/descartes-functions.R", local = TRUE)
-source("functions/stringdb-functions.R", local = TRUE)
+source("../functions/hgnc-functions.R", local = TRUE)
+source("../functions/hpo-functions.R", local = TRUE)
+source("../functions/file-functions.R", local = TRUE)
+source("../functions/gtex-functions.R", local = TRUE)
+source("../functions/descartes-functions.R", local = TRUE)
+source("../functions/stringdb-functions.R", local = TRUE)
+source("../functions/mpo-mousemine-functions.R", local = TRUE)
 ############################################
 ############################################
-## load the merged table for filtering and annotation
-
+## load the merged table and the HGNC table for filtering and annotation
 # define merged analysis path
-merged_path <- "A_MergeAnalysesSources/results/"
+merged_path <- "../A_MergeAnalysesSources/results/"
+annotation_path <- "../B_AnnotationHGNC/results/"
 # find newest file CSV files in merged folders and filter
-merged_csv_table <- read_csv(get_newest_file("A_MergeAnalysesSources", merged_path))
+# mutate hgnc_id column to integer
+# TODO: unify hgnc_id column names in all tables
+merge_analyses_sources <- read_csv(get_newest_file("A_MergeAnalysesSources", merged_path))
+non_alt_loci_set_coordinates <- read_csv(get_newest_file("non_alt_loci_set_coordinates", annotation_path)) %>%
+  mutate(hgnc_id = as.integer(str_remove(hgnc_id, "HGNC:")))
+
+# filter for genes with at least 3 evidence sources
+merge_analyses_sources_high_evidence <- merge_analyses_sources %>%
+  filter(evidence_count > 2)
 ############################################
@@ -56,18 +68,14 @@ merged_csv_table <- read_csv(get_newest_file("A_MergeAnalysesSources", merged_pa
 ## download all required database sources from HPO and MPO
 # we load and use the results of previous walks through the ontology tree if not older then 1 month
-current_date <- strftime(as.POSIXlt(Sys.time(),
-  "UTC", "%Y-%m-%dT%H:%M:%S"),
-  "%Y-%m-%d")
-
 # HPO obo file download
-if (check_file_age("hpo_obo", "shared/data/downloads/", 1)) {
-  hpo_obo_filename <- get_newest_file("hpo_obo", "shared/data/downloads/")
+if (check_file_age("hpo_obo", "../shared/data/downloads/", 1)) {
+  hpo_obo_filename <- get_newest_file("hpo_obo", "../shared/data/downloads/")
 } else {
   # HPO links to hpo_obo file needs to be set in config
   hpo_obo_url <- config_vars_proj$hpo_obo_url
-  hpo_obo_filename <- paste0("shared/data/downloads/hpo_obo.",
+  hpo_obo_filename <- paste0("../shared/data/downloads/hpo_obo.",
     current_date,
     ".obo")
@@ -82,13 +90,13 @@ hpo <- get_ontology(
 )
 # MPO obo file download
-if (check_file_age("mpo_obo", "shared/data/downloads/", 1)) {
-  mpo_obo_filename <- get_newest_file("mpo_obo", "shared/data/downloads/")
+if (check_file_age("mpo_obo", "../shared/data/downloads/", 1)) {
+  mpo_obo_filename <- get_newest_file("mpo_obo", "../shared/data/downloads/")
 } else {
   # MPO links to mpo_obo file needs to be set in config
   mpo_obo_url <- config_vars_proj$mpo_obo_url
-  mpo_obo_filename <- paste0("shared/data/downloads/mpo_obo.",
+  mpo_obo_filename <- paste0("../shared/data/downloads/mpo_obo.",
     current_date,
     ".obo")
@@ -107,48 +115,46 @@ mpo <- get_ontology(
 ############################################
 ## get relevant HPO terms for kidney disease classification
 # get the current date
-# TODO: compute date only once or somehow in config
-current_date <- strftime(as.POSIXlt(Sys.time(), "UTC", "%Y-%m-%dT%H:%M:%S"), "%Y-%m-%d")
 # 1) get all children of term upper urinary tract (HP:0010935) for classification into kidney disease groups
 # walk through the ontology tree and add all unique terms descending from
 # Abnormality of the upper urinary tract (HP:0010935)
 # we load and use the results of previous walks through the ontology tree if not older then 1 month
-if (check_file_age("hpo_list_kidney", "shared/", 1)) {
-  hpo_list_kidney <- read_csv(get_newest_file("hpo_list_kidney", "shared"))
+if (check_file_age("hpo_list_kidney", "../shared/", 1)) {
+  hpo_list_kidney <- read_csv(get_newest_file("hpo_list_kidney", "../shared"))
 } else {
   hpo_list_kidney <- hpo_all_children_from_term("HP:0010935")
   write_csv(hpo_list_kidney,
-    file = paste0("shared/hpo_list_kidney.",
+    file = paste0("../shared/hpo_list_kidney.",
       current_date,
       ".csv"),
     na = "NULL")
-  gzip(paste0("shared/hpo_list_kidney.", current_date, ".csv"),
+  gzip(paste0("../shared/hpo_list_kidney.", current_date, ".csv"),
     overwrite = TRUE)
 }
 # 2) get all children of term Adult onset (HP:0003581) for classification into adult vs antenatal_or_congenital vs neonatal_or_pediatric onset
 # we load and use the results of previous walks through the ontology tree if not older then 1 month
-if (check_file_age("hpo_list_adult", "shared/", 1) && check_file_age("hpo_list_antenatal_or_congenital", "shared/", 1) && check_file_age("hpo_list_neonatal_or_pediatric", "shared/", 1)) {
-  hpo_list_adult <- read_csv(get_newest_file("hpo_list_adult", "shared"))
-  hpo_list_antenatal_or_congenital <- read_csv(get_newest_file("hpo_list_antenatal_or_congenital", "shared"))
-  hpo_list_neonatal_or_pediatric <- read_csv(get_newest_file("hpo_list_neonatal_or_pediatric", "shared"))
+if (check_file_age("hpo_list_adult", "../shared/", 1) && check_file_age("hpo_list_antenatal_or_congenital", "../shared/", 1) && check_file_age("hpo_list_neonatal_or_pediatric", "../shared/", 1)) {
+  hpo_list_adult <- read_csv(get_newest_file("hpo_list_adult", "../shared"))
+  hpo_list_antenatal_or_congenital <- read_csv(get_newest_file("hpo_list_antenatal_or_congenital", "../shared"))
+  hpo_list_neonatal_or_pediatric <- read_csv(get_newest_file("hpo_list_neonatal_or_pediatric", "../shared"))
 } else {
   # walk through the ontology tree and add all unique terms descending from
   # Adult onset (HP:0003581)
   hpo_list_adult <- hpo_all_children_from_term("HP:0003581")
   write_csv(hpo_list_adult,
-    file = paste0("shared/hpo_list_adult.",
+    file = paste0("../shared/hpo_list_adult.",
      current_date,
      ".csv"),
    na = "NULL")
-  gzip(paste0("shared/hpo_list_adult.", current_date, ".csv"),
+  gzip(paste0("../shared/hpo_list_adult.", current_date, ".csv"),
    overwrite = TRUE)
   # walk through the ontology tree and add all unique terms descending from
@@ -157,12 +163,12 @@ if (check_file_age("hpo_list_adult", "shared/", 1) && check_file_age("hpo_list_a
     hpo_all_children_from_term("HP:0003577"))
   write_csv(hpo_list_antenatal_or_congenital,
-    file = paste0("shared/hpo_list_antenatal_or_congenital.",
+    file = paste0("../shared/hpo_list_antenatal_or_congenital.",
      current_date,
      ".csv"),
    na = "NULL")
-  gzip(paste0("shared/hpo_list_antenatal_or_congenital.", current_date, ".csv"),
+  gzip(paste0("../shared/hpo_list_antenatal_or_congenital.", current_date, ".csv"),
    overwrite = TRUE)
   # walk through the ontology tree and add all unique terms descending from
@@ -171,26 +177,26 @@ if (check_file_age("hpo_list_adult", "shared/", 1) && check_file_age("hpo_list_a
     hpo_all_children_from_term("HP:0410280"))
   write_csv(hpo_list_neonatal_or_pediatric,
-    file = paste0("shared/hpo_list_neonatal_or_pediatric.",
+    file = paste0("../shared/hpo_list_neonatal_or_pediatric.",
      current_date,
      ".csv"),
    na = "NULL")
-  gzip(paste0("shared/hpo_list_neonatal_or_pediatric.", current_date, ".csv"),
+  gzip(paste0("../shared/hpo_list_neonatal_or_pediatric.", current_date, ".csv"),
    overwrite = TRUE)
 }
 # 3) syndromic vs non-syndromic (categories in OMIM: GROWTH, SKELETAL, NEUROLOGIC, HEAD & NECK; exclude: CARDIOVASCULAR, ABDOMEN, GENITOURINARY)
 # we load and use the results of previous walks through the ontology tree if not older then 1 month
-if (check_file_age("hpo_list_syndromic", "shared/", 1)) {
+if (check_file_age("hpo_list_syndromic", "../shared/", 1)) {
   # load the list of syndromic terms
-  hpo_list_syndromic <- read_csv(get_newest_file("hpo_list_syndromic", "shared"))
+  hpo_list_syndromic <- read_csv(get_newest_file("hpo_list_syndromic", "../shared"))
   # also load the sublists if not older then 1 month
-  all_hpo_children_list_growth_term <- read_csv(get_newest_file("hpo_list_growth_term", "shared"))
-  all_hpo_children_list_skeletal_term <- read_csv(get_newest_file("hpo_list_skeletal_term", "shared"))
-  all_hpo_children_list_neurologic_term <- read_csv(get_newest_file("hpo_list_neurologic_term", "shared"))
-  all_hpo_children_list_head_and_neck_term <- read_csv(get_newest_file("hpo_list_head_and_neck_term", "shared"))
+  all_hpo_children_list_growth_term <- read_csv(get_newest_file("hpo_list_growth_term", "../shared"))
+  all_hpo_children_list_skeletal_term <- read_csv(get_newest_file("hpo_list_skeletal_term", "../shared"))
+  all_hpo_children_list_neurologic_term <- read_csv(get_newest_file("hpo_list_neurologic_term", "../shared"))
+  all_hpo_children_list_head_and_neck_term <- read_csv(get_newest_file("hpo_list_head_and_neck_term", "../shared"))
 } else {
   # walk through the ontology tree and add all unique terms descending from
   # Growth abnormality (HP:0001507)
@@ -231,39 +237,39 @@ if (check_file_age("hpo_list_syndromic", "shared/", 1)) {
     select(term, base_term, base_term_id)
   write_csv(all_hpo_children_list_growth_term,
-    file = paste0("shared/hpo_list_growth_term.",
+    file = paste0("../shared/hpo_list_growth_term.",
      current_date,
      ".csv"),
    na = "NULL")
   write_csv(all_hpo_children_list_skeletal_term,
-    file = paste0("shared/hpo_list_skeletal_term.",
+    file = paste0("../shared/hpo_list_skeletal_term.",
      current_date,
      ".csv"),
    na = "NULL")
   write_csv(all_hpo_children_list_neurologic_term,
-    file = paste0("shared/hpo_list_neurologic_term.",
+    file = paste0("../shared/hpo_list_neurologic_term.",
      current_date,
      ".csv"),
    na = "NULL")
   write_csv(all_hpo_children_list_head_and_neck_term,
-    file = paste0("shared/hpo_list_head_and_neck_term.",
+    file = paste0("../shared/hpo_list_head_and_neck_term.",
      current_date,
      ".csv"),
    na = "NULL")
-  gzip(paste0("shared/hpo_list_growth_term.", current_date, ".csv"),
+  gzip(paste0("../shared/hpo_list_growth_term.", current_date, ".csv"),
    overwrite = TRUE)
-  gzip(paste0("shared/hpo_list_skeletal_term.", current_date, ".csv"),
+  gzip(paste0("../shared/hpo_list_skeletal_term.", current_date, ".csv"),
    overwrite = TRUE)
-  gzip(paste0("shared/hpo_list_neurologic_term.", current_date, ".csv"),
+  gzip(paste0("../shared/hpo_list_neurologic_term.", current_date, ".csv"),
    overwrite = TRUE)
-  gzip(paste0("shared/hpo_list_head_and_neck_term.", current_date, ".csv"),
+  gzip(paste0("../shared/hpo_list_head_and_neck_term.", current_date, ".csv"),
    overwrite = TRUE)
   # merge all lists and transform the list into a tibble
@@ -279,12 +285,38 @@ if (check_file_age("hpo_list_syndromic", "shared/", 1)) {
     query_date = current_date)
   write_csv(hpo_list_syndromic,
-    file = paste0("shared/hpo_list_syndromic.",
+    file = paste0("../shared/hpo_list_syndromic.",
      current_date,
      ".csv"),
    na = "NULL")
-  gzip(paste0("shared/hpo_list_syndromic.", current_date, ".csv"),
+  gzip(paste0("../shared/hpo_list_syndromic.", current_date, ".csv"),
+    overwrite = TRUE)
+}
+############################################
+
+
+############################################
+## get relevant MPO terms for kidney disease classification in mice
+# get the current date
+
+# get all children of term renal/urinary system phenotype (MP:0005367) for classification into kidney disease groups
+# walk through the ontology tree and add all unique terms descending from
+# renal/urinary system phenotype (MP:0005367)
+# we load and use the results of previous walks through the ontology tree if not older than 1 month
+
+if (check_file_age("mpo_list_kidney", "../shared/", 1)) {
+  mpo_list_kidney <- read_csv(get_newest_file("mpo_list_kidney", "../shared"))
+} else {
+  mpo_list_kidney <- mpo_all_children_from_term("MP:0005367")
+
+  write_csv(mpo_list_kidney,
+    file = paste0("../shared/mpo_list_kidney.",
+      current_date,
+      ".csv"),
+    na = "NULL")
+
+  gzip(paste0("../shared/mpo_list_kidney.", current_date, ".csv"),
    overwrite = TRUE)
 }
 ############################################
@@ -295,17 +327,14 @@ if (check_file_age("hpo_list_syndromic", "shared/", 1)) {
 # we load and use the results of previous walks through the ontology tree if not older then 1 month
 # disease ontology annotations from HPO
-if (check_file_age("phenotype", "shared/data/downloads/", 1)) {
-  phenotype_hpoa_filename <- get_newest_file("phenotype", "shared/data/downloads/")
+if (check_file_age("phenotype", "../shared/data/downloads/", 1)) {
+  phenotype_hpoa_filename <- get_newest_file("phenotype", "../shared/data/downloads/")
 } else {
-  # TODO: compute date only once or somehow in config
-  file_date <- strftime(as.POSIXlt(Sys.time(), "UTC", "%Y-%m-%dT%H:%M:%S"), "%Y-%m-%d")
-
   # disease ontology annotations from HPO
   phenotype_hpoa_url <- config_vars_proj$phenotype_hpoa
-  phenotype_hpoa_filename <- paste0("shared/data/downloads/phenotype.",
-    file_date,
+  phenotype_hpoa_filename <- paste0("../shared/data/downloads/phenotype.",
+    current_date,
     ".hpoa")
   download.file(phenotype_hpoa_url, phenotype_hpoa_filename, mode = "wb")
@@ -319,18 +348,15 @@ if (check_file_age("phenotype", "shared/data/downloads/", 1)) {
 # OMIM links to genemap2 file needs to be set in config and applied for at
 # https://www.omim.org/downloads
-if (check_file_age("omim_genemap2", "shared/data/downloads/", 1)) {
-  omim_genemap2_filename <- get_newest_file("omim_genemap2", "shared/data/downloads/")
+if (check_file_age("omim_genemap2", "../shared/data/downloads/", 1)) {
+  omim_genemap2_filename <- get_newest_file("omim_genemap2", "../shared/data/downloads/")
 } else {
-  # TODO: compute date only once or somehow in config
-  file_date <- strftime(as.POSIXlt(Sys.time(), "UTC", "%Y-%m-%dT%H:%M:%S"), "%Y-%m-%d")
-
   # OMIM links to genemap2 file needs to be set in config and applied for at
   # https://www.omim.org/downloads
   omim_genemap2_url <- config_vars_proj$omim_genemap2_url
-  omim_genemap2_filename <- paste0("shared/data/downloads/omim_genemap2.",
-    file_date,
+  omim_genemap2_filename <- paste0("../shared/data/downloads/omim_genemap2.",
+    current_date,
     ".txt")
   download.file(omim_genemap2_url, omim_genemap2_filename, mode = "wb")
@@ -668,7 +694,7 @@ all_kidney_groups_hpo <- all_kidney_groups %>%
 hpo_gene_list_kidney_all_kidney_groups <- hpo_gene_list_kidney %>%
   left_join(all_kidney_groups_hpo, by = c("hpo_id"), relationship = "many-to-many") %>%
   filter(!is.na(kidney_disease_group_short)) %>%
-  select(approved_symbol, database_id , kidney_disease_group_short, hpo_id_group_frac) %>%
+  select(approved_symbol, database_id, kidney_disease_group_short, hpo_id_group_frac) %>%
   unique() %>%
   group_by(approved_symbol, database_id, kidney_disease_group_short) %>%
   summarise(approved_symbol = unique(approved_symbol),
@@ -690,17 +716,16 @@ hpo_gene_list_kidney_all_kidney_groups_summarized_for_join <- hpo_gene_list_kidn
   summarise(approved_symbol = unique(approved_symbol),
     groups_p = paste0(database_id, " (", groups_p, ")", collapse = "; "),
     .groups = "keep") %>%
-  ungroup()
+  ungroup() %>%
+  select(approved_symbol, clinical_groups_p = groups_p)
 # TODO: check if it could be helpful to do this also per gene
-# TODO: workflow: if the respective gene/entity from our kidney list is in the ClinGen curated list apply this group, if not apply the group with the highest score after reviewing the groups manually
 ############################################
 ############################################
-# annotate pediatric vs adult onset (OMIM: HPO terms Adult onset HP:0003581, Pediatric onset HP:0410280, and children of terms)
+# annotate pediatric vs adult onset (OMIM: e.g. HPO terms Adult onset HP:0003581, Pediatric onset HP:0410280, and children of terms)
 # TODO: describe the logic of the analysis in comments step by step
-# TODO: make prenatal and congenital a separate group
 # filter OMIM data for all the disease database ids with HPO terms for adult onset
 phenotype_hpoa_filter_adult <- phenotype_hpoa %>%
@@ -758,8 +783,7 @@ onset_hpo <- bind_rows(hpo_gene_list_adult, hpo_gene_list_antenatal_or_congenita
 # now compute the sum of the relative frequency of the HPO terms in the two lists per gene and disease
 # this will be the hpo id group p-value
-# TODO: check if it could be helpful to do this also per gene
-hpo_gene_list_onset_groups <- bind_rows(hpo_gene_list_adult, hpo_gene_list_non_adult) %>%
+hpo_gene_list_onset_groups <- bind_rows(hpo_gene_list_adult, hpo_gene_list_antenatal_or_congenital, hpo_gene_list_neonatal_or_pediatric) %>%
   select(-onset_group) %>%
   left_join(onset_hpo, by = c("hpo_id"), relationship = "many-to-many") %>%
   select(approved_symbol, database_id, onset_group, hpo_id_group_frac) %>%
@@ -784,7 +808,10 @@ hpo_gene_list_onset_groups_summarized_for_join <- hpo_gene_list_onset_groups %>%
   summarise(approved_symbol = unique(approved_symbol),
     groups_p = paste0(database_id, " (", groups_p, ")", collapse = "; "),
     .groups = "keep") %>%
-  ungroup()
+  ungroup() %>%
+  select(approved_symbol, onset_groups_p = groups_p)
+
+# TODO: check if it could be helpful to do this also per gene
 ############################################
@@ -880,60 +907,151 @@ hpo_gene_list_symptome_groups_summarized_for_join <- hpo_gene_list_symptome_grou
   summarise(approved_symbol = unique(approved_symbol),
     groups_p = paste0(database_id, " (", groups_p, ")", collapse = "; "),
     .groups = "keep") %>%
-  ungroup()
+  ungroup() %>%
+  select(approved_symbol, syndromic_groups_p = groups_p)
 # TODO: check if we need to do this analysis per entity instead per gene and then aggregate/ summarize per gene
 ############################################
 ############################################
-# TODO: annotate MGI mouse phenotypes kidney
-# use https://www.mousemine.org/mousemine/begin.do
-# https://www.mousemine.org/mousemine/
+# annotate MGI mouse phenotypes kidney
+# use MPO terms and the InterMineR package through the mpo-mousemine-functions.R functions
+# use the mpo_list_kidney as phenotype list
+# select only the columns we want to keep for the final table join
+# we load and use previously computed results if not older than 1 month
-# use R package: https://www.bioconductor.org/packages/release/bioc/vignettes/InterMineR/inst/doc/InterMineR.html
-# TODO: need to find a table to show the genes names
-# TODO: need to find a table to map the mouse phenotypes to HPO terms
-# TODO: alternatively use the equivalent MPO kidney parent term
-# TODO: differentiate between gene and inheritance mode (biallelic vs monoallelic)
-# TODO: check Ninas solution for this
+if (check_file_age("merge_analyses_sources_high_evidence_mgi", "results", 1)) {
+  merge_analyses_sources_high_evidence_mgi <- read_csv(get_newest_file("merge_analyses_sources_high_evidence_mgi", "results"))
+} else {
+  merge_analyses_sources_high_evidence_mgi <- merge_analyses_sources_high_evidence %>%
+    select(approved_symbol, hgnc_id) %>%
+    rowwise() %>%
+    mutate(mgi_phenotype = compare_gene_phenotypes(approved_symbol, mpo_list_kidney)) %>%
+    select(hgnc_id, mgi_phenotype)
+
+  write_csv(merge_analyses_sources_high_evidence_mgi,
+    file = paste0("results/merge_analyses_sources_high_evidence_mgi.",
+      current_date,
".csv"), + na = "NULL") + + gzip(paste0("results/merge_analyses_sources_high_evidence_mgi.", current_date, ".csv"), + overwrite = TRUE) +} ############################################ ############################################ -# TODO: annotate StringDB interactions with strong evidence kidney genes +# annotate StringDB interactions with strong evidence kidney genes # use download tables: https://string-db.org/cgi/download?sessionId=bVJC28yKCRz5&species_text=Homo+sapiens # eg physical: https://stringdb-downloads.org/download/protein.physical.links.v12.0/9606.protein.physical.links.v12.0.txt.gz # sum the interactions for each gene with the high evidence list, then percentile normalize, additionally show a string with interacting genes and scores -############################################ +# select only the columns we want to keep for the final table join +# we load and use the results of previous walks through the ontology tree if not older then 1 month +if (check_file_age("merge_analyses_sources_high_evidence_string", "results", 1)) { + merge_analyses_sources_high_evidence_string <- read_csv(get_newest_file("merge_analyses_sources_high_evidence_string", "results")) +} else { + merge_analyses_sources_high_evidence_string <- merge_analyses_sources_high_evidence %>% + select(approved_symbol, hgnc_id) %>% + left_join(non_alt_loci_set_coordinates %>% select(hgnc_id, STRING_id), by = c("hgnc_id")) %>% + mutate(stringdb = compute_protein_interactions(STRING_id), directory = "../shared/downloads/") %>% + unnest(cols = stringdb, names_sep = "_") %>% + select(hgnc_id, stringdb_interaction_sum_score, stringdb_interaction_normalized_score, stringdb_interaction_string) + + write_csv(merge_analyses_sources_high_evidence_string, + file = paste0("results/merge_analyses_sources_high_evidence_string.", + current_date, + ".csv"), + na = "NULL") + gzip(paste0("results/merge_analyses_sources_high_evidence_string.", current_date, ".csv"), + overwrite = TRUE) +} ############################################ -# TODO: annotate kidney expression -# see GitHub issue for cutoffs -# TODO: use gtex functions to compute TPM GTEx kidney expression -# we need the gencode_ids from the HGNC table for this +############################################ +# annotate kidney expression +# we load and use the results of previous walks through the ontology tree if not older then 1 month -# TODO: maybe add expression in embryonic kidney from somewhere (e.g. 
-# TODO: annotate kidney expression
-# see GitHub issue for cutoffs
-# TODO: use gtex functions to compute TPM GTEx kidney expression
-# we need the gencode_ids from the HGNC table for this
+############################################
+# annotate kidney expression
+# we load and use previously computed results if not older than 1 month
-# TODO: maybe add expression in embryonic kidney from somewhere (e.g. https://descartes.brotmanbaty.org/)
-# descartes: https://atlas.fredhutch.org/data/bbi/descartes/human_gtex/tables/tissue_tpm/kidney.csv (this one for TPM)
-# descartes: https://atlas.fredhutch.org/data/bbi/descartes/human_gtex/tables/tissue_percentage/kidney.csv
-# descartes: https://atlas.fredhutch.org/data/bbi/descartes/human_gtex/counts/gene2ens.csv (for identifier mapping)
-# TODO: use descartes functions to compute TPM GTEx kidney expression
-# TODO: annotate descartes kidney expression
+if (check_file_age("merge_analyses_sources_high_evidence_expression", "results", 1)) {
+  merge_analyses_sources_high_evidence_expression <- read_csv(get_newest_file("merge_analyses_sources_high_evidence_expression", "results"))
+} else {
+  merge_analyses_sources_high_evidence_gencode <- merge_analyses_sources_high_evidence %>%
+    dplyr::select(approved_symbol, hgnc_id) %>%
+    left_join(non_alt_loci_set_coordinates %>% dplyr::select(hgnc_id, gencode_id, ensembl_gene_id_version_hg19, ensembl_gene_id_version_hg38), by = c("hgnc_id"))
+
+  # use gtex functions to compute TPM GTEx kidney expression
+  # we need the gencode_ids from the HGNC table for this
+  # TODO: try to salvage the missing gtex annotations using the ensembl gene ids
+  gtex_results_kidney <- merge_analyses_sources_high_evidence_gencode %>%
+    filter(!is.na(gencode_id)) %>%
+    rowwise() %>%
+    mutate(gencode = get_multiple_median_tissue_expression(gencode_id, wide_format = TRUE, max_ids_per_request = 5)) %>%
+    unnest(cols = gencode, names_sep = "_") %>%
+    select(hgnc_id, gencode_kidney_medulla, gencode_kidney_cortex)
+
+  # add expression in embryonic kidney from descartes (https://descartes.brotmanbaty.org/)
+  descartes_results_kidney <- merge_analyses_sources_high_evidence_gencode %>%
+    mutate(descartes = get_descartes_tpm_expression(approved_symbol, "kidney")) %>%
+    unnest(cols = descartes, names_sep = "_") %>%
+    select(hgnc_id, descartes_kidney_tpm = descartes_tpm)
+
+  # join the results back to the main merge_analyses_sources_high_evidence_gencode table
+  # select the columns we want to keep for the final table join
+  merge_analyses_sources_high_evidence_expression <- merge_analyses_sources_high_evidence_gencode %>%
+    left_join(gtex_results_kidney, by = c("hgnc_id")) %>%
+    left_join(descartes_results_kidney, by = c("hgnc_id")) %>%
+    select(hgnc_id, gencode_kidney_medulla, gencode_kidney_cortex, descartes_kidney_tpm)
+
+  write_csv(merge_analyses_sources_high_evidence_expression,
+    file = paste0("results/merge_analyses_sources_high_evidence_expression.",
+      current_date,
+      ".csv"),
+    na = "NULL")
+  gzip(paste0("results/merge_analyses_sources_high_evidence_expression.", current_date, ".csv"),
+    overwrite = TRUE)
+}
 ############################################
 ############################################
-# TODO: add a column in the final table for publication (screening = 1 point, first clinical description = 2 points, clinical replication = 3 points)
+# generate final annotated table of high evidence genes for manual review
+
+# select the columns from the hgnc annotation table we want to keep for the final table join
+non_alt_loci_set_coordinates_annotation <- non_alt_loci_set_coordinates %>%
+  select(hgnc_id, pLI, oe_lof, lof_z, mis_z, omim_summary, gencc_summary, clingen_summary, clinvar)
+
+# 1. join the results back to the main merge_analyses_sources_high_evidence table
+# 2. join with clinical categories tables (hpo_gene_list_kidney_all_kidney_groups_summarized_for_join, hpo_gene_list_onset_groups_summarized_for_join, hpo_gene_list_symptome_groups_summarized_for_join)
+# 3. join with the functional annotation tables (merge_analyses_sources_high_evidence_mgi, merge_analyses_sources_high_evidence_string, merge_analyses_sources_high_evidence_expression)
+# 4. add an empty column in the final table for publication (screening = 1 point, first clinical description = 2 points, clinical replication = 3 points) and add an empty column in the final table for publications used
+# 5. add a unique id column for each entry in the final table (format "cur_XXX", with leading zeros if necessary)
+merge_analyses_sources_high_evidence_annotated <- merge_analyses_sources_high_evidence %>%
+  select(approved_symbol, hgnc_id, evidence_count) %>%
+  left_join(non_alt_loci_set_coordinates_annotation, by = c("hgnc_id")) %>%
+  left_join(hpo_gene_list_kidney_all_kidney_groups_summarized_for_join, by = c("approved_symbol")) %>%
+  left_join(hpo_gene_list_onset_groups_summarized_for_join, by = c("approved_symbol")) %>%
+  left_join(hpo_gene_list_symptome_groups_summarized_for_join, by = c("approved_symbol")) %>%
+  left_join(merge_analyses_sources_high_evidence_mgi, by = c("hgnc_id")) %>%
+  left_join(merge_analyses_sources_high_evidence_string, by = c("hgnc_id")) %>%
+  left_join(merge_analyses_sources_high_evidence_expression, by = c("hgnc_id")) %>%
+  mutate(publication_score = NA) %>%
+  mutate(publications_used = NA) %>%
+  mutate(cur_id = paste0("cur_", formatC(1:nrow(.), width = 3, flag = "0")))
+
+
+# TODO: workflow: if the respective gene/entity from our kidney list is in the ClinGen or GenCC curated list apply this group, if not apply the group with the highest score after reviewing the groups manually
+# TODO: define the scoring logic for the final table with cutoffs for the different categories
+# TODO: see GitHub issue for cutoffs in expression
+# TODO: scoring logic for publications (screening = 1 point, first clinical description = 2 points, clinical replication = 3 points)
 # TODO: scoring logic: if only screening publication then the category cant be more then Limited, if there is a clinical description then the category can be Moderate, if there is a clinical replication then the category can be Definitive
 # TODO: scoring logic: we further use the category "no known relation" for genes that are not trustworthy associated with kidney disease
 # TODO: maybe annotate with publications (from OMIM) and GeneReviews
-
-#TODO: annotate phenotypes as phenopackets for each entity?
-
+# TODO: annotate phenotypes as phenopackets for each entity?
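# Illustrative sketch, not part of the patch above: a minimal version of the
# publication-based scoring rule described in the TODOs, assuming
# `publication_score` has been filled in during manual review (screening = 1,
# first clinical description = 2, clinical replication = 3). The mapping and
# category names are placeholders, not the project's final scoring logic.
library(dplyr)

assign_evidence_category <- function(publication_score) {
  dplyr::case_when(
    is.na(publication_score) | publication_score == 0 ~ "no known relation",
    publication_score == 1 ~ "Limited",     # screening publication only
    publication_score == 2 ~ "Moderate",    # first clinical description
    publication_score >= 3 ~ "Definitive",  # clinical replication
    TRUE ~ "no known relation"
  )
}

# possible use after manual review:
# merge_analyses_sources_high_evidence_annotated %>%
#   mutate(evidence_category = assign_evidence_category(publication_score))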
 ############################################
@@ -943,12 +1061,12 @@ creation_date <- strftime(as.POSIXlt(Sys.time(),
   "UTC", "%Y-%m-%dT%H:%M:%S"),
   "%Y-%m-%d")
-write_csv(results_genes_wider,
-  file = paste0("merged/KidneyGenetics_AnnotateMergedTable.",
+write_csv(merge_analyses_sources_high_evidence_annotated,
+  file = paste0("results/C_high_evidence_annotated_csv_table.",
     creation_date,
     ".csv"),
   na = "NULL")
-gzip(paste0("results/KidneyGenetics_AnnotateMergedTable.", creation_date, ".csv"),
+gzip(paste0("results/C_high_evidence_annotated_csv_table.", creation_date, ".csv"),
   overwrite = TRUE)
 ############################################
\ No newline at end of file
diff --git a/analyses/C_AnnotateMergedTable/results/C_high_evidence_annotated_csv_table.2023-10-11.csv.gz b/analyses/C_AnnotateMergedTable/results/C_high_evidence_annotated_csv_table.2023-10-11.csv.gz
new file mode 100644
index 0000000..72e6f14
Binary files /dev/null and b/analyses/C_AnnotateMergedTable/results/C_high_evidence_annotated_csv_table.2023-10-11.csv.gz differ
diff --git a/analyses/C_AnnotateMergedTable/results/merge_analyses_sources_high_evidence_expression.2023-10-11.csv.gz b/analyses/C_AnnotateMergedTable/results/merge_analyses_sources_high_evidence_expression.2023-10-11.csv.gz
new file mode 100644
index 0000000..355fc19
Binary files /dev/null and b/analyses/C_AnnotateMergedTable/results/merge_analyses_sources_high_evidence_expression.2023-10-11.csv.gz differ
diff --git a/analyses/C_AnnotateMergedTable/results/merge_analyses_sources_high_evidence_mgi.2023-10-11.csv.gz b/analyses/C_AnnotateMergedTable/results/merge_analyses_sources_high_evidence_mgi.2023-10-11.csv.gz
new file mode 100644
index 0000000..8f6f3e0
Binary files /dev/null and b/analyses/C_AnnotateMergedTable/results/merge_analyses_sources_high_evidence_mgi.2023-10-11.csv.gz differ
diff --git a/analyses/C_AnnotateMergedTable/results/merge_analyses_sources_high_evidence_string.2023-10-11.csv.gz b/analyses/C_AnnotateMergedTable/results/merge_analyses_sources_high_evidence_string.2023-10-11.csv.gz
new file mode 100644
index 0000000..c83c38f
Binary files /dev/null and b/analyses/C_AnnotateMergedTable/results/merge_analyses_sources_high_evidence_string.2023-10-11.csv.gz differ
diff --git a/analyses/C_AnnotateMergedTable/stringdb_protein_links.2023-10-11.txt.gz b/analyses/C_AnnotateMergedTable/stringdb_protein_links.2023-10-11.txt.gz
new file mode 100644
index 0000000..36f097a
Binary files /dev/null and b/analyses/C_AnnotateMergedTable/stringdb_protein_links.2023-10-11.txt.gz differ
diff --git a/analyses/functions/gtex-functions.R b/analyses/functions/gtex-functions.R
index a7c27aa..6254630 100644
--- a/analyses/functions/gtex-functions.R
+++ b/analyses/functions/gtex-functions.R
@@ -79,6 +79,8 @@ get_median_tissue_expression <- function(gencode_ids, tissue_site_detail_ids = N
 }
+# TODO: fix the gtex function get_multiple_median_tissue_expression to work properly with multiple gencode ids instead of using rowwise
+# TODO: fix the gtex functions to implement retry logic if the API call results in an error or empty result
 #' Fetch Median Tissue Expression Levels from GTEx API for Multiple Genes
 #'
 #' This function takes a character vector of GENCODE gene identifiers as input,
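# Illustrative sketch, not part of the patch above: one possible shape for the
# retry logic requested in the gtex-functions TODO. The wrapper re-runs an API
# call when it errors or returns an empty result; `with_retry` and its
# arguments are assumptions, while get_multiple_median_tissue_expression is the
# project function named in the TODO.
with_retry <- function(fun, ..., max_tries = 3, wait_seconds = 5) {
  for (try_number in seq_len(max_tries)) {
    result <- tryCatch(fun(...), error = function(e) NULL)
    # treat NULL or zero-row results as a failed attempt
    if (!is.null(result) && (!is.data.frame(result) || nrow(result) > 0)) {
      return(result)
    }
    if (try_number < max_tries) Sys.sleep(wait_seconds)
  }
  warning("API call failed after ", max_tries, " attempts; returning NULL")
  NULL
}

# possible use inside the expression annotation step:
# gencode <- with_retry(get_multiple_median_tissue_expression,
#                       gencode_id, wide_format = TRUE, max_ids_per_request = 5)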