diff --git a/analyses/AnnotateMergedTable.R b/analyses/C_AnnotateMergedTable/C_AnnotateMergedTable.R
similarity index 72%
rename from analyses/AnnotateMergedTable.R
rename to analyses/C_AnnotateMergedTable/C_AnnotateMergedTable.R
index bc3ea9a..887f5f2 100644
--- a/analyses/AnnotateMergedTable.R
+++ b/analyses/C_AnnotateMergedTable/C_AnnotateMergedTable.R
@@ -1,12 +1,12 @@
 ############################################
 ## load libraries
-library(tidyverse) ## needed for general table operations
-library(jsonlite) ##needed for HGNC requests
-library(rvest) ## needed for scraping
-library(readr) ## needed to read files
-library(tools) ## needed for checksums
-library("R.utils") ## gzip downloaded and result files
-library(config) ## needed to read config file
+library(tidyverse)   ## needed for general table operations
+library(jsonlite)    ## needed for HGNC requests
+library(rvest)       ## needed for scraping
+library(readr)       ## needed to read files
+library(tools)       ## needed for checksums
+library("R.utils")   ## gzip downloaded and result files
+library(config)      ## needed to read config file
 library(ontologyIndex) ## needed to read ontology files
 ############################################
@@ -15,7 +15,7 @@ library(ontologyIndex) ## needed to read ontology files
 ## define relative script path
 project_topic <- "nephrology"
 project_name <- "kidney-genetics"
-script_path <- "/analyses/"
+script_path <- "/analyses/C_AnnotateMergedTable/"
 ## read configs
 config_vars_proj <- config::get(file = Sys.getenv("CONFIG_FILE"),
@@ -26,29 +26,41 @@ setwd(paste0(config_vars_proj$projectsdir, project_name, script_path))
 ## set global options
 options(scipen = 999)
+
+# compute date only once or somehow in config
+current_date <- strftime(as.POSIXlt(Sys.time(), "UTC", "%Y-%m-%dT%H:%M:%S"), "%Y-%m-%d")
 ############################################
 ############################################
 # load global functions
 # hgnc functions
-source("functions/hgnc-functions.R", local = TRUE)
-source("functions/hpo-functions.R", local = TRUE)
-source("functions/file-functions.R", local = TRUE)
-source("functions/gtex-functions.R", local = TRUE)
-source("functions/descartes-functions.R", local = TRUE)
-source("functions/stringdb-functions.R", local = TRUE)
+source("../functions/hgnc-functions.R", local = TRUE)
+source("../functions/hpo-functions.R", local = TRUE)
+source("../functions/file-functions.R", local = TRUE)
+source("../functions/gtex-functions.R", local = TRUE)
+source("../functions/descartes-functions.R", local = TRUE)
+source("../functions/stringdb-functions.R", local = TRUE)
+source("../functions/mpo-mousemine-functions.R", local = TRUE)
 ############################################
 ############################################
-## load the merged table for filtering and annotation
-
+## load the merged table and the HGNC table for filtering and annotation
 # define merged analysis path
-merged_path <- "A_MergeAnalysesSources/results/"
+merged_path <- "../A_MergeAnalysesSources/results/"
+annotation_path <- "../B_AnnotationHGNC/results/"
 # find newest file CSV files in merged folders and filter
-merged_csv_table <- read_csv(get_newest_file("A_MergeAnalysesSources", merged_path))
+# mutate hgnc_id column to integer
+# TODO: unify hgnc_id column names in all tables
+merge_analyses_sources <- read_csv(get_newest_file("A_MergeAnalysesSources", merged_path))
+non_alt_loci_set_coordinates <- read_csv(get_newest_file("non_alt_loci_set_coordinates", annotation_path)) %>%
+  mutate(hgnc_id = as.integer(str_remove(hgnc_id, "HGNC:")))
+
+# filter for genes with at least 3 evidence sources
+merge_analyses_sources_high_evidence <- merge_analyses_sources %>%
+  filter(evidence_count > 2)
 ############################################
@@ -56,18 +68,14 @@ merged_csv_table <- read_csv(get_newest_file("A_MergeAnalysesSources", merged_pa
 ## download all required database sources from HPO and MPO
 # we load and use the results of previous walks through the ontology tree if not older then 1 month
-current_date <- strftime(as.POSIXlt(Sys.time(),
-  "UTC", "%Y-%m-%dT%H:%M:%S"),
-  "%Y-%m-%d")
-
 # HPO obo file download
-if (check_file_age("hpo_obo", "shared/data/downloads/", 1)) {
-  hpo_obo_filename <- get_newest_file("hpo_obo", "shared/data/downloads/")
+if (check_file_age("hpo_obo", "../shared/data/downloads/", 1)) {
+  hpo_obo_filename <- get_newest_file("hpo_obo", "../shared/data/downloads/")
 } else {
   # HPO links to hpo_obo file needs to be set in config
   hpo_obo_url <- config_vars_proj$hpo_obo_url
-  hpo_obo_filename <- paste0("shared/data/downloads/hpo_obo.",
+  hpo_obo_filename <- paste0("../shared/data/downloads/hpo_obo.",
     current_date,
     ".obo")
@@ -82,13 +90,13 @@ hpo <- get_ontology(
 )
 # MPO obo file download
-if (check_file_age("mpo_obo", "shared/data/downloads/", 1)) {
-  mpo_obo_filename <- get_newest_file("mpo_obo", "shared/data/downloads/")
+if (check_file_age("mpo_obo", "../shared/data/downloads/", 1)) {
+  mpo_obo_filename <- get_newest_file("mpo_obo", "../shared/data/downloads/")
 } else {
   # MPO links to mpo_obo file needs to be set in config
   mpo_obo_url <- config_vars_proj$mpo_obo_url
-  mpo_obo_filename <- paste0("shared/data/downloads/mpo_obo.",
+  mpo_obo_filename <- paste0("../shared/data/downloads/mpo_obo.",
     current_date,
     ".obo")
@@ -107,48 +115,46 @@ mpo <- get_ontology(
 ############################################
 ## get relevant HPO terms for kidney disease classification
 # get the current date
-# TODO: compute date only once or somehow in config
-current_date <- strftime(as.POSIXlt(Sys.time(), "UTC", "%Y-%m-%dT%H:%M:%S"), "%Y-%m-%d")
 # 1) get all children of term upper urinary tract (HP:0010935) for classification into kidney disease groups
 # walk through the ontology tree and add all unique terms descending from
 # Abnormality of the upper urinary tract (HP:0010935)
 # we load and use the results of previous walks through the ontology tree if not older then 1 month
-if (check_file_age("hpo_list_kidney", "shared/", 1)) {
-  hpo_list_kidney <- read_csv(get_newest_file("hpo_list_kidney", "shared"))
+if (check_file_age("hpo_list_kidney", "../shared/", 1)) {
+  hpo_list_kidney <- read_csv(get_newest_file("hpo_list_kidney", "../shared"))
 } else {
   hpo_list_kidney <- hpo_all_children_from_term("HP:0010935")
   write_csv(hpo_list_kidney,
-    file = paste0("shared/hpo_list_kidney.",
+    file = paste0("../shared/hpo_list_kidney.",
       current_date,
       ".csv"),
     na = "NULL")
-  gzip(paste0("shared/hpo_list_kidney.", current_date, ".csv"),
+  gzip(paste0("../shared/hpo_list_kidney.", current_date, ".csv"),
     overwrite = TRUE)
 }
 # 2) get all children of term Adult onset (HP:0003581) for classification into adult vs antenatal_or_congenital vs neonatal_or_pediatric onset
 # we load and use the results of previous walks through the ontology tree if not older then 1 month
-if (check_file_age("hpo_list_adult", "shared/", 1) && check_file_age("hpo_list_antenatal_or_congenital", "shared/", 1) && check_file_age("hpo_list_neonatal_or_pediatric", "shared/", 1)) {
-  hpo_list_adult <- read_csv(get_newest_file("hpo_list_adult", "shared"))
-  hpo_list_antenatal_or_congenital <- read_csv(get_newest_file("hpo_list_antenatal_or_congenital", "shared"))
-  hpo_list_neonatal_or_pediatric <- read_csv(get_newest_file("hpo_list_neonatal_or_pediatric", "shared"))
+if (check_file_age("hpo_list_adult", "../shared/", 1) && check_file_age("hpo_list_antenatal_or_congenital", "../shared/", 1) && check_file_age("hpo_list_neonatal_or_pediatric", "../shared/", 1)) {
+  hpo_list_adult <- read_csv(get_newest_file("hpo_list_adult", "../shared"))
+  hpo_list_antenatal_or_congenital <- read_csv(get_newest_file("hpo_list_antenatal_or_congenital", "../shared"))
+  hpo_list_neonatal_or_pediatric <- read_csv(get_newest_file("hpo_list_neonatal_or_pediatric", "../shared"))
 } else {
   # walk through the ontology tree and add all unique terms descending from
   # Adult onset (HP:0003581)
   hpo_list_adult <- hpo_all_children_from_term("HP:0003581")
   write_csv(hpo_list_adult,
-    file = paste0("shared/hpo_list_adult.",
+    file = paste0("../shared/hpo_list_adult.",
      current_date,
      ".csv"),
    na = "NULL")
-  gzip(paste0("shared/hpo_list_adult.", current_date, ".csv"),
+  gzip(paste0("../shared/hpo_list_adult.", current_date, ".csv"),
    overwrite = TRUE)
   # walk through the ontology tree and add all unique terms descending from
@@ -157,12 +163,12 @@ if (check_file_age("hpo_list_adult", "shared/", 1) && check_file_age("hpo_list_a
     hpo_all_children_from_term("HP:0003577"))
   write_csv(hpo_list_antenatal_or_congenital,
-    file = paste0("shared/hpo_list_antenatal_or_congenital.",
+    file = paste0("../shared/hpo_list_antenatal_or_congenital.",
      current_date,
      ".csv"),
    na = "NULL")
-  gzip(paste0("shared/hpo_list_antenatal_or_congenital.", current_date, ".csv"),
+  gzip(paste0("../shared/hpo_list_antenatal_or_congenital.", current_date, ".csv"),
    overwrite = TRUE)
   # walk through the ontology tree and add all unique terms descending from
@@ -171,26 +177,26 @@ if (check_file_age("hpo_list_adult", "shared/", 1) && check_file_age("hpo_list_a
     hpo_all_children_from_term("HP:0410280"))
   write_csv(hpo_list_neonatal_or_pediatric,
-    file = paste0("shared/hpo_list_neonatal_or_pediatric.",
+    file = paste0("../shared/hpo_list_neonatal_or_pediatric.",
      current_date,
      ".csv"),
    na = "NULL")
-  gzip(paste0("shared/hpo_list_neonatal_or_pediatric.", current_date, ".csv"),
+  gzip(paste0("../shared/hpo_list_neonatal_or_pediatric.", current_date, ".csv"),
    overwrite = TRUE)
 }
 # 3) syndromic vs non-syndromic (categories in OMIM: GROWTH, SKELETAL, NEUROLOGIC, HEAD & NECK; exclude: CARDIOVASCULAR, ABDOMEN, GENITOURINARY)
 # we load and use the results of previous walks through the ontology tree if not older then 1 month
-if (check_file_age("hpo_list_syndromic", "shared/", 1)) {
+if (check_file_age("hpo_list_syndromic", "../shared/", 1)) {
   # load the list of syndromic terms
-  hpo_list_syndromic <- read_csv(get_newest_file("hpo_list_syndromic", "shared"))
+  hpo_list_syndromic <- read_csv(get_newest_file("hpo_list_syndromic", "../shared"))
   # also load the sublists if not older then 1 month
-  all_hpo_children_list_growth_term <- read_csv(get_newest_file("hpo_list_growth_term", "shared"))
-  all_hpo_children_list_skeletal_term <- read_csv(get_newest_file("hpo_list_skeletal_term", "shared"))
-  all_hpo_children_list_neurologic_term <- read_csv(get_newest_file("hpo_list_neurologic_term", "shared"))
-  all_hpo_children_list_head_and_neck_term <- read_csv(get_newest_file("hpo_list_head_and_neck_term", "shared"))
+  all_hpo_children_list_growth_term <- read_csv(get_newest_file("hpo_list_growth_term", "../shared"))
+  all_hpo_children_list_skeletal_term <- read_csv(get_newest_file("hpo_list_skeletal_term", "../shared"))
+  all_hpo_children_list_neurologic_term <- read_csv(get_newest_file("hpo_list_neurologic_term", "../shared"))
+  all_hpo_children_list_head_and_neck_term <- read_csv(get_newest_file("hpo_list_head_and_neck_term", "../shared"))
 } else {
   # walk through the ontology tree and add all unique terms descending from
   # Growth abnormality (HP:0001507)
@@ -231,39 +237,39 @@ if (check_file_age("hpo_list_syndromic", "shared/", 1)) {
     select(term, base_term, base_term_id)
   write_csv(all_hpo_children_list_growth_term,
-    file = paste0("shared/hpo_list_growth_term.",
+    file = paste0("../shared/hpo_list_growth_term.",
      current_date,
      ".csv"),
    na = "NULL")
   write_csv(all_hpo_children_list_skeletal_term,
-    file = paste0("shared/hpo_list_skeletal_term.",
+    file = paste0("../shared/hpo_list_skeletal_term.",
      current_date,
      ".csv"),
    na = "NULL")
   write_csv(all_hpo_children_list_neurologic_term,
-    file = paste0("shared/hpo_list_neurologic_term.",
+    file = paste0("../shared/hpo_list_neurologic_term.",
      current_date,
      ".csv"),
    na = "NULL")
   write_csv(all_hpo_children_list_head_and_neck_term,
-    file = paste0("shared/hpo_list_head_and_neck_term.",
+    file = paste0("../shared/hpo_list_head_and_neck_term.",
      current_date,
      ".csv"),
    na = "NULL")
-  gzip(paste0("shared/hpo_list_growth_term.", current_date, ".csv"),
+  gzip(paste0("../shared/hpo_list_growth_term.", current_date, ".csv"),
    overwrite = TRUE)
-  gzip(paste0("shared/hpo_list_skeletal_term.", current_date, ".csv"),
+  gzip(paste0("../shared/hpo_list_skeletal_term.", current_date, ".csv"),
    overwrite = TRUE)
-  gzip(paste0("shared/hpo_list_neurologic_term.", current_date, ".csv"),
+  gzip(paste0("../shared/hpo_list_neurologic_term.", current_date, ".csv"),
    overwrite = TRUE)
-  gzip(paste0("shared/hpo_list_head_and_neck_term.", current_date, ".csv"),
+  gzip(paste0("../shared/hpo_list_head_and_neck_term.", current_date, ".csv"),
    overwrite = TRUE)
   # merge all lists and transform the list into a tibble
@@ -279,12 +285,38 @@ if (check_file_age("hpo_list_syndromic", "shared/", 1)) {
     query_date = current_date)
   write_csv(hpo_list_syndromic,
-    file = paste0("shared/hpo_list_syndromic.",
+    file = paste0("../shared/hpo_list_syndromic.",
      current_date,
      ".csv"),
    na = "NULL")
-  gzip(paste0("shared/hpo_list_syndromic.", current_date, ".csv"),
+  gzip(paste0("../shared/hpo_list_syndromic.", current_date, ".csv"),
+    overwrite = TRUE)
+}
+############################################
+
+
+############################################
+## get relevant MPO terms for kidney disease classification in mice
+# get the current date
+
+# get all children of term renal/urinary system phenotype (MP:0005367) for classification into kidney disease groups
+# walk through the ontology tree and add all unique terms descending from
+# renal/urinary system phenotype (MP:0005367)
+# we load and use the results of previous walks through the ontology tree if not older than 1 month
+
+if (check_file_age("mpo_list_kidney", "../shared/", 1)) {
+  mpo_list_kidney <- read_csv(get_newest_file("mpo_list_kidney", "../shared"))
+} else {
+  mpo_list_kidney <- mpo_all_children_from_term("MP:0005367")
+
+  write_csv(mpo_list_kidney,
+    file = paste0("../shared/mpo_list_kidney.",
+      current_date,
+      ".csv"),
+    na = "NULL")
+
+  gzip(paste0("../shared/mpo_list_kidney.", current_date, ".csv"),
    overwrite = TRUE)
 }
 ############################################
@@ -295,17 +327,14 @@ if (check_file_age("hpo_list_syndromic", "shared/", 1)) {
 # we load and use the results of previous walks through the ontology tree if not older then 1 month
 # disease ontology annotations from HPO
-if (check_file_age("phenotype", "shared/data/downloads/", 1)) {
-  phenotype_hpoa_filename <- get_newest_file("phenotype", "shared/data/downloads/")
+if (check_file_age("phenotype", "../shared/data/downloads/", 1)) {
+  phenotype_hpoa_filename <- get_newest_file("phenotype", "../shared/data/downloads/")
 } else {
-  # TODO: compute date only once or somehow in config
-  file_date <- strftime(as.POSIXlt(Sys.time(), "UTC", "%Y-%m-%dT%H:%M:%S"), "%Y-%m-%d")
-
   # disease ontology annotations from HPO
   phenotype_hpoa_url <- config_vars_proj$phenotype_hpoa
-  phenotype_hpoa_filename <- paste0("shared/data/downloads/phenotype.",
-    file_date,
+  phenotype_hpoa_filename <- paste0("../shared/data/downloads/phenotype.",
+    current_date,
     ".hpoa")
   download.file(phenotype_hpoa_url, phenotype_hpoa_filename, mode = "wb")
@@ -319,18 +348,15 @@ if (check_file_age("phenotype", "shared/data/downloads/", 1)) {
 # OMIM links to genemap2 file needs to be set in config and applied for at
 # https://www.omim.org/downloads
-if (check_file_age("omim_genemap2", "shared/data/downloads/", 1)) {
-  omim_genemap2_filename <- get_newest_file("omim_genemap2", "shared/data/downloads/")
+if (check_file_age("omim_genemap2", "../shared/data/downloads/", 1)) {
+  omim_genemap2_filename <- get_newest_file("omim_genemap2", "../shared/data/downloads/")
 } else {
-  # TODO: compute date only once or somehow in config
-  file_date <- strftime(as.POSIXlt(Sys.time(), "UTC", "%Y-%m-%dT%H:%M:%S"), "%Y-%m-%d")
-
   # OMIM links to genemap2 file needs to be set in config and applied for at
   # https://www.omim.org/downloads
   omim_genemap2_url <- config_vars_proj$omim_genemap2_url
-  omim_genemap2_filename <- paste0("shared/data/downloads/omim_genemap2.",
-    file_date,
+  omim_genemap2_filename <- paste0("../shared/data/downloads/omim_genemap2.",
+    current_date,
     ".txt")
   download.file(omim_genemap2_url, omim_genemap2_filename, mode = "wb")
@@ -668,7 +694,7 @@ all_kidney_groups_hpo <- all_kidney_groups %>%
 hpo_gene_list_kidney_all_kidney_groups <- hpo_gene_list_kidney %>%
   left_join(all_kidney_groups_hpo, by = c("hpo_id"), relationship = "many-to-many") %>%
   filter(!is.na(kidney_disease_group_short)) %>%
-  select(approved_symbol, database_id , kidney_disease_group_short, hpo_id_group_frac) %>%
+  select(approved_symbol, database_id, kidney_disease_group_short, hpo_id_group_frac) %>%
   unique() %>%
   group_by(approved_symbol, database_id, kidney_disease_group_short) %>%
   summarise(approved_symbol = unique(approved_symbol),
@@ -690,17 +716,16 @@ hpo_gene_list_kidney_all_kidney_groups_summarized_for_join <- hpo_gene_list_kidn
   summarise(approved_symbol = unique(approved_symbol),
     groups_p = paste0(database_id, " (", groups_p, ")", collapse = "; "),
     .groups = "keep") %>%
-  ungroup()
+  ungroup() %>%
+  select(approved_symbol, clinical_groups_p = groups_p)
 # TODO: check if it could be helpful to do this also per gene
-# TODO: workflow: if the respective gene/entity from our kidney list is in the ClinGen curated list apply this group, if not apply the group with the highest score after reviewing the groups manually
 ############################################
 ############################################
-# annotate pediatric vs adult onset (OMIM: HPO terms Adult onset HP:0003581, Pediatric onset HP:0410280, and children of terms)
+# annotate pediatric vs adult onset (OMIM: e.g. HPO terms Adult onset HP:0003581, Pediatric onset HP:0410280, and children of terms)
 # TODO: describe the logic of the analysis in comments step by step
-# TODO: make prenatal and congenital a separate group
 # filter OMIM data for all the disease database ids with HPO terms for adult onset
 phenotype_hpoa_filter_adult <- phenotype_hpoa %>%
@@ -758,8 +783,7 @@ onset_hpo <- bind_rows(hpo_gene_list_adult, hpo_gene_list_antenatal_or_congenita
 # now compute the sum of the relative frequency of the HPO terms in the two lists per gene and disease
 # this will be the hpo id group p-value
-# TODO: check if it could be helpful to do this also per gene
-hpo_gene_list_onset_groups <- bind_rows(hpo_gene_list_adult, hpo_gene_list_non_adult) %>%
+hpo_gene_list_onset_groups <- bind_rows(hpo_gene_list_adult, hpo_gene_list_antenatal_or_congenital, hpo_gene_list_neonatal_or_pediatric) %>%
   select(-onset_group) %>%
   left_join(onset_hpo, by = c("hpo_id"), relationship = "many-to-many") %>%
   select(approved_symbol, database_id, onset_group, hpo_id_group_frac) %>%
@@ -784,7 +808,10 @@ hpo_gene_list_onset_groups_summarized_for_join <- hpo_gene_list_onset_groups %>%
   summarise(approved_symbol = unique(approved_symbol),
     groups_p = paste0(database_id, " (", groups_p, ")", collapse = "; "),
     .groups = "keep") %>%
-  ungroup()
+  ungroup() %>%
+  select(approved_symbol, onset_groups_p = groups_p)
+
+# TODO: check if it could be helpful to do this also per gene
 ############################################
@@ -880,60 +907,151 @@ hpo_gene_list_symptome_groups_summarized_for_join <- hpo_gene_list_symptome_grou
   summarise(approved_symbol = unique(approved_symbol),
     groups_p = paste0(database_id, " (", groups_p, ")", collapse = "; "),
     .groups = "keep") %>%
-  ungroup()
+  ungroup() %>%
+  select(approved_symbol, syndromic_groups_p = groups_p)
 # TODO: check if we need to do this analysis per entity instead per gene and then aggregate/ summarize per gene
 ############################################
 ############################################
-# TODO: annotate MGI mouse phenotypes kidney
-# use https://www.mousemine.org/mousemine/begin.do
-# https://www.mousemine.org/mousemine/
+# annotate MGI mouse phenotypes kidney
+# use MPO terms and the InterMineR package through the mpo-mousemine-functions.R functions
+# use the mpo_list_kidney as phenotype list
+# select only the columns we want to keep for the final table join
+# we load and use previously computed results if not older than 1 month
-# use R package: https://www.bioconductor.org/packages/release/bioc/vignettes/InterMineR/inst/doc/InterMineR.html
-# TODO: need to find a table to show the genes names
-# TODO: need to find a table to map the mouse phenotypes to HPO terms
-# TODO: alternatively use the equivalent MPO kidney parent term
-# TODO: differentiate between gene and inheritance mode (biallelic vs monoallelic)
-# TODO: check Ninas solution for this
+if (check_file_age("merge_analyses_sources_high_evidence_mgi", "results", 1)) {
+  merge_analyses_sources_high_evidence_mgi <- read_csv(get_newest_file("merge_analyses_sources_high_evidence_mgi", "results"))
+} else {
+  merge_analyses_sources_high_evidence_mgi <- merge_analyses_sources_high_evidence %>%
+    select(approved_symbol, hgnc_id) %>%
+    rowwise() %>%
+    mutate(mgi_phenotype = compare_gene_phenotypes(approved_symbol, mpo_list_kidney)) %>%
+    select(hgnc_id, mgi_phenotype)
+
+  write_csv(merge_analyses_sources_high_evidence_mgi,
+    file = paste0("results/merge_analyses_sources_high_evidence_mgi.",
+      current_date,
".csv"), + na = "NULL") + + gzip(paste0("results/merge_analyses_sources_high_evidence_mgi.", current_date, ".csv"), + overwrite = TRUE) +} ############################################ ############################################ -# TODO: annotate StringDB interactions with strong evidence kidney genes +# annotate StringDB interactions with strong evidence kidney genes # use download tables: https://string-db.org/cgi/download?sessionId=bVJC28yKCRz5&species_text=Homo+sapiens # eg physical: https://stringdb-downloads.org/download/protein.physical.links.v12.0/9606.protein.physical.links.v12.0.txt.gz # sum the interactions for each gene with the high evidence list, then percentile normalize, additionally show a string with interacting genes and scores -############################################ +# select only the columns we want to keep for the final table join +# we load and use the results of previous walks through the ontology tree if not older then 1 month +if (check_file_age("merge_analyses_sources_high_evidence_string", "results", 1)) { + merge_analyses_sources_high_evidence_string <- read_csv(get_newest_file("merge_analyses_sources_high_evidence_string", "results")) +} else { + merge_analyses_sources_high_evidence_string <- merge_analyses_sources_high_evidence %>% + select(approved_symbol, hgnc_id) %>% + left_join(non_alt_loci_set_coordinates %>% select(hgnc_id, STRING_id), by = c("hgnc_id")) %>% + mutate(stringdb = compute_protein_interactions(STRING_id), directory = "../shared/downloads/") %>% + unnest(cols = stringdb, names_sep = "_") %>% + select(hgnc_id, stringdb_interaction_sum_score, stringdb_interaction_normalized_score, stringdb_interaction_string) + + write_csv(merge_analyses_sources_high_evidence_string, + file = paste0("results/merge_analyses_sources_high_evidence_string.", + current_date, + ".csv"), + na = "NULL") + gzip(paste0("results/merge_analyses_sources_high_evidence_string.", current_date, ".csv"), + overwrite = TRUE) +} ############################################ -# TODO: annotate kidney expression -# see GitHub issue for cutoffs -# TODO: use gtex functions to compute TPM GTEx kidney expression -# we need the gencode_ids from the HGNC table for this +############################################ +# annotate kidney expression +# we load and use the results of previous walks through the ontology tree if not older then 1 month -# TODO: maybe add expression in embryonic kidney from somewhere (e.g. 
-# TODO: annotate kidney expression
-# see GitHub issue for cutoffs
-# TODO: use gtex functions to compute TPM GTEx kidney expression
-# we need the gencode_ids from the HGNC table for this
+############################################
+# annotate kidney expression
+# we load and use previously computed results if not older than 1 month
-# TODO: maybe add expression in embryonic kidney from somewhere (e.g. https://descartes.brotmanbaty.org/)
-# descartes: https://atlas.fredhutch.org/data/bbi/descartes/human_gtex/tables/tissue_tpm/kidney.csv (this one for TPM)
-# descartes: https://atlas.fredhutch.org/data/bbi/descartes/human_gtex/tables/tissue_percentage/kidney.csv
-# descartes: https://atlas.fredhutch.org/data/bbi/descartes/human_gtex/counts/gene2ens.csv (for identifier mapping)
-# TODO: use descartes functions to compute TPM GTEx kidney expression
-# TODO: annotate descartes kidney expression
+if (check_file_age("merge_analyses_sources_high_evidence_expression", "results", 1)) {
+  merge_analyses_sources_high_evidence_expression <- read_csv(get_newest_file("merge_analyses_sources_high_evidence_expression", "results"))
+} else {
+  merge_analyses_sources_high_evidence_gencode <- merge_analyses_sources_high_evidence %>%
+    dplyr::select(approved_symbol, hgnc_id) %>%
+    left_join(non_alt_loci_set_coordinates %>% dplyr::select(hgnc_id, gencode_id, ensembl_gene_id_version_hg19, ensembl_gene_id_version_hg38), by = c("hgnc_id"))
+
+  # use gtex functions to compute TPM GTEx kidney expression
+  # we need the gencode_ids from the HGNC table for this
+  # TODO: try to salvage the missing gtex annotations using the ensembl gene ids
+  gtex_results_kidney <- merge_analyses_sources_high_evidence_gencode %>%
+    filter(!is.na(gencode_id)) %>%
+    rowwise() %>%
+    mutate(gencode = get_multiple_median_tissue_expression(gencode_id, wide_format = TRUE, max_ids_per_request = 5)) %>%
+    unnest(cols = gencode, names_sep = "_") %>%
+    select(hgnc_id, gencode_kidney_medulla, gencode_kidney_cortex)
+
+  # add expression in embryonic kidney from descartes (https://descartes.brotmanbaty.org/)
+  descartes_results_kidney <- merge_analyses_sources_high_evidence_gencode %>%
+    mutate(descartes = get_descartes_tpm_expression(approved_symbol, "kidney")) %>%
+    unnest(cols = descartes, names_sep = "_") %>%
+    select(hgnc_id, descartes_kidney_tpm = descartes_tpm)
+
+  # join the results back to the main merge_analyses_sources_high_evidence_gencode table
+  # select the columns we want to keep for the final table join
+  merge_analyses_sources_high_evidence_expression <- merge_analyses_sources_high_evidence_gencode %>%
+    left_join(gtex_results_kidney, by = c("hgnc_id")) %>%
+    left_join(descartes_results_kidney, by = c("hgnc_id")) %>%
+    select(hgnc_id, gencode_kidney_medulla, gencode_kidney_cortex, descartes_kidney_tpm)
+
+  write_csv(merge_analyses_sources_high_evidence_expression,
+    file = paste0("results/merge_analyses_sources_high_evidence_expression.",
+      current_date,
+      ".csv"),
+    na = "NULL")
+  gzip(paste0("results/merge_analyses_sources_high_evidence_expression.", current_date, ".csv"),
+    overwrite = TRUE)
+}
 ############################################
 ############################################
-# TODO: add a column in the final table for publication (screening = 1 point, first clinical description = 2 points, clinical replication = 3 points)
+# generate final annotated table of high evidence genes for manual review
+
+# select the columns from the hgnc annotation table we want to keep for the final table join
+non_alt_loci_set_coordinates_annotation <- non_alt_loci_set_coordinates %>%
+  select(hgnc_id, pLI, oe_lof, lof_z, mis_z, omim_summary, gencc_summary, clingen_summary, clinvar)
+
+# 1. join the results back to the main merge_analyses_sources_high_evidence table
+# 2. join with clinical categories tables (hpo_gene_list_kidney_all_kidney_groups_summarized_for_join, hpo_gene_list_onset_groups_summarized_for_join, hpo_gene_list_symptome_groups_summarized_for_join)
+# 3. join with the functional annotation tables (merge_analyses_sources_high_evidence_mgi, merge_analyses_sources_high_evidence_string, merge_analyses_sources_high_evidence_expression)
+# 4. add an empty column in the final table for publication (screening = 1 point, first clinical description = 2 points, clinical replication = 3 points) and add an empty column in the final table for publications used
+# 5. add a unique id column for each entry in the final table (format "cur_XXX", with leading zeros if necessary)
+merge_analyses_sources_high_evidence_annotated <- merge_analyses_sources_high_evidence %>%
+  select(approved_symbol, hgnc_id, evidence_count) %>%
+  left_join(non_alt_loci_set_coordinates_annotation, by = c("hgnc_id")) %>%
+  left_join(hpo_gene_list_kidney_all_kidney_groups_summarized_for_join, by = c("approved_symbol")) %>%
+  left_join(hpo_gene_list_onset_groups_summarized_for_join, by = c("approved_symbol")) %>%
+  left_join(hpo_gene_list_symptome_groups_summarized_for_join, by = c("approved_symbol")) %>%
+  left_join(merge_analyses_sources_high_evidence_mgi, by = c("hgnc_id")) %>%
+  left_join(merge_analyses_sources_high_evidence_string, by = c("hgnc_id")) %>%
+  left_join(merge_analyses_sources_high_evidence_expression, by = c("hgnc_id")) %>%
+  mutate(publication_score = NA) %>%
+  mutate(publications_used = NA) %>%
+  mutate(cur_id = paste0("cur_", formatC(1:nrow(.), width = 3, flag = "0")))
+
+
+# TODO: workflow: if the respective gene/entity from our kidney list is in the ClinGen or GenCC curated list apply this group, if not apply the group with the highest score after reviewing the groups manually
+# TODO: define the scoring logic for the final table with cutoffs for the different categories
+# TODO: see GitHub issue for cutoffs in expression
+# TODO: scoring logic for publications (screening = 1 point, first clinical description = 2 points, clinical replication = 3 points)
 # TODO: scoring logic: if only screening publication then the category cant be more then Limited, if there is a clinical description then the category can be Moderate, if there is a clinical replication then the category can be Definitive
 # TODO: scoring logic: we further use the category "no known relation" for genes that are not trustworthy associated with kidney disease
 # TODO: maybe annotate with publications (from OMIM) and GeneReviews
-
-#TODO: annotate phenotypes as phenopackets for each entity?
-
+# TODO: annotate phenotypes as phenopackets for each entity?
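# Illustrative sketch, not part of the patch above: a minimal version of the
# publication-based scoring rule described in the TODOs, assuming
# `publication_score` has been filled in during manual review (screening = 1,
# first clinical description = 2, clinical replication = 3). The mapping and
# category names are placeholders, not the project's final scoring logic.
library(dplyr)

assign_evidence_category <- function(publication_score) {
  dplyr::case_when(
    is.na(publication_score) | publication_score == 0 ~ "no known relation",
    publication_score == 1 ~ "Limited",     # screening publication only
    publication_score == 2 ~ "Moderate",    # first clinical description
    publication_score >= 3 ~ "Definitive",  # clinical replication
    TRUE ~ "no known relation"
  )
}

# possible use after manual review:
# merge_analyses_sources_high_evidence_annotated %>%
#   mutate(evidence_category = assign_evidence_category(publication_score))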
 ############################################
@@ -943,12 +1061,12 @@ creation_date <- strftime(as.POSIXlt(Sys.time(),
   "UTC", "%Y-%m-%dT%H:%M:%S"),
   "%Y-%m-%d")
-write_csv(results_genes_wider,
-  file = paste0("merged/KidneyGenetics_AnnotateMergedTable.",
+write_csv(merge_analyses_sources_high_evidence_annotated,
+  file = paste0("results/C_high_evidence_annotated_csv_table.",
     creation_date,
     ".csv"),
   na = "NULL")
-gzip(paste0("results/KidneyGenetics_AnnotateMergedTable.", creation_date, ".csv"),
+gzip(paste0("results/C_high_evidence_annotated_csv_table.", creation_date, ".csv"),
   overwrite = TRUE)
 ############################################
\ No newline at end of file
diff --git a/analyses/C_AnnotateMergedTable/results/C_high_evidence_annotated_csv_table.2023-10-11.csv.gz b/analyses/C_AnnotateMergedTable/results/C_high_evidence_annotated_csv_table.2023-10-11.csv.gz
new file mode 100644
index 0000000..72e6f14
Binary files /dev/null and b/analyses/C_AnnotateMergedTable/results/C_high_evidence_annotated_csv_table.2023-10-11.csv.gz differ
diff --git a/analyses/C_AnnotateMergedTable/results/merge_analyses_sources_high_evidence_expression.2023-10-11.csv.gz b/analyses/C_AnnotateMergedTable/results/merge_analyses_sources_high_evidence_expression.2023-10-11.csv.gz
new file mode 100644
index 0000000..355fc19
Binary files /dev/null and b/analyses/C_AnnotateMergedTable/results/merge_analyses_sources_high_evidence_expression.2023-10-11.csv.gz differ
diff --git a/analyses/C_AnnotateMergedTable/results/merge_analyses_sources_high_evidence_mgi.2023-10-11.csv.gz b/analyses/C_AnnotateMergedTable/results/merge_analyses_sources_high_evidence_mgi.2023-10-11.csv.gz
new file mode 100644
index 0000000..8f6f3e0
Binary files /dev/null and b/analyses/C_AnnotateMergedTable/results/merge_analyses_sources_high_evidence_mgi.2023-10-11.csv.gz differ
diff --git a/analyses/C_AnnotateMergedTable/results/merge_analyses_sources_high_evidence_string.2023-10-11.csv.gz b/analyses/C_AnnotateMergedTable/results/merge_analyses_sources_high_evidence_string.2023-10-11.csv.gz
new file mode 100644
index 0000000..c83c38f
Binary files /dev/null and b/analyses/C_AnnotateMergedTable/results/merge_analyses_sources_high_evidence_string.2023-10-11.csv.gz differ
diff --git a/analyses/C_AnnotateMergedTable/stringdb_protein_links.2023-10-11.txt.gz b/analyses/C_AnnotateMergedTable/stringdb_protein_links.2023-10-11.txt.gz
new file mode 100644
index 0000000..36f097a
Binary files /dev/null and b/analyses/C_AnnotateMergedTable/stringdb_protein_links.2023-10-11.txt.gz differ
diff --git a/analyses/functions/gtex-functions.R b/analyses/functions/gtex-functions.R
index a7c27aa..6254630 100644
--- a/analyses/functions/gtex-functions.R
+++ b/analyses/functions/gtex-functions.R
@@ -79,6 +79,8 @@ get_median_tissue_expression <- function(gencode_ids, tissue_site_detail_ids = N
 }
+# TODO: fix the gtex function get_multiple_median_tissue_expression to work properly with multiple gencode ids instead of using rowwise
+# TODO: fix the gtex functions to implement retry logic if the API call results in an error or empty result
 #' Fetch Median Tissue Expression Levels from GTEx API for Multiple Genes
 #'
 #' This function takes a character vector of GENCODE gene identifiers as input,
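# Illustrative sketch, not part of the patch above: one possible shape for the
# retry logic requested in the gtex-functions TODO. The wrapper re-runs an API
# call when it errors or returns an empty result; `with_retry` and its
# arguments are assumptions, while get_multiple_median_tissue_expression is the
# project function named in the TODO.
with_retry <- function(fun, ..., max_tries = 3, wait_seconds = 5) {
  for (try_number in seq_len(max_tries)) {
    result <- tryCatch(fun(...), error = function(e) NULL)
    # treat NULL or zero-row results as a failed attempt
    if (!is.null(result) && (!is.data.frame(result) || nrow(result) > 0)) {
      return(result)
    }
    if (try_number < max_tries) Sys.sleep(wait_seconds)
  }
  warning("API call failed after ", max_tries, " attempts; returning NULL")
  NULL
}

# possible use inside the expression annotation step:
# gencode <- with_retry(get_multiple_median_tissue_expression,
#                       gencode_id, wide_format = TRUE, max_ids_per_request = 5)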