Generate_master.Rmd

---
title: "Construction of a comprehensive *E.coli* gene set"
author: "Claire Rioualen"
date: '`r Sys.Date()`'
output:
 html_document:
   fig_caption: yes
   highlight: zenburn
   self_contained: yes
   theme: cerulean
   toc: yes
   toc_depth: 5
   toc_float: yes
   number_sections: true
   code_folding: hide


---
<style type="text/css">

body{ /* Normal  */
      font-size: 14px;
  }
</style>

```{r init, include=FALSE}
## Date stamp for this run; reused further down to name the dated output directory.
date <- Sys.Date()
## Run from the package checkout so that relative paths resolve from there.
setwd("/Users/rioualen/Desktop/Git/EcoliGenes/")
## Sourced for its side effects (presumably defines the database settings
## used by the query chunks -- TODO confirm).
source("env.r")

```


```{r setup, include=FALSE}
## Knit relative to the package checkout.
knitr::opts_knit$set(root.dir = "/Users/rioualen/Desktop/Git/EcoliGenes/")
## Defaults applied to every later chunk. Note eval = FALSE: chunks are not
## executed when knitting unless a chunk overrides this option.
knitr::opts_chunk$set(
  echo = FALSE,
  include = TRUE,
  warning = FALSE,
  message = FALSE,
  cache = TRUE,
  eval = FALSE,
  results = "asis",
  fig.width = 12,
  fig.height = 6,
  fig.align = "center"
)
```

```{r libs-n-functions}
library(dplyr)
library(DT)
library(ggplot2)
library(gridExtra)
library(igraph)
library(plotly)
library(patchwork)
library(RMariaDB)
library(sqldf)
library(stringr)
library(tidyr)
library(UpSetR)
library(viridis)

## Dated output directory for this run. It is created only when missing, so
## re-running the notebook on the same day does not raise an
## "already exists" warning.
feature_set_dir <- paste0("Feature_set_", date)
if (!dir.exists(feature_set_dir)) {
  dir.create(feature_set_dir)
}

## Collapse a vector into a single string made of its distinct, non-missing
## values, in order of first appearance.
##   x    vector of values; NA entries are ignored
##   sep  delimiter placed between the values (default ",")
## Returns a length-one character string ("" when nothing is left).
concat_uniq <- function(x, sep = ","){
  present <- x[!is.na(x)]
  distinct_values <- unique(present)
  paste0(distinct_values, collapse = sep)
}

## Merge several values, each possibly already a `sep`-delimited list, into one
## sorted, de-duplicated, `sep`-delimited string.
##   ...  vectors of values; a value may itself hold several tokens joined by
##        `sep`; NA entries are ignored
##   sep  delimiter used both to split the inputs and to join the result
##        (default ",")
## Returns a length-one character string ("" when no non-empty token remains).
concat_uniq2 <- function(..., sep = ","){
    values <- c(...)
    values <- as.character(values[!is.na(values)])
    ## Split on the literal separator actually requested. Splitting on a
    ## hard-coded "," would ignore any other `sep`.
    tokens <- as.character(unlist(strsplit(values, split = sep, fixed = TRUE), use.names = FALSE))
    ## Empty tokens come from "" inputs or doubled separators; dropping them
    ## keeps the result free of leading or repeated delimiters.
    tokens <- tokens[nzchar(tokens)]
    paste(unique(sort(tokens)), collapse = sep)
}

```

```{r hide-pwd}
local({
  ## Keep the source hook that is currently installed so the wrapper below
  ## can hand the filtered code back to it.
  default_source_hook <- knitr::knit_hooks$get('source')
  ## Drop every source element that ends with the "# SECRET!" marker before
  ## it is rendered.
  strip_secret_lines <- function(x, options) {
    is_secret <- grepl('# SECRET!$', x)
    default_source_hook(x[!is_secret], options)
  }
  knitr::knit_hooks$set(source = strip_secret_lines)
})
```

# Master gene table 

This document reports the details behind the generation of an exhaustive *E. coli* gene set: a table generated by merging all gene-related information from several databases, indexed by a *consensus bnumber*, and containing exhaustive synonyms for genes and their products.

This table is then queried by the functions of the R library `EcoliGenes` to extract consistent and exhaustive information about *E. coli* K12 genes and TFs.

## RegulonDB genes table

Basically a join on gene, product, synonyms and bnumbers tables queried from RegulonDB. A column is added to state whether the gene is considered TF-coding or not.

*Note: the symbol of the pseudogene "insI2" is replaced by its RegulonDB ID, because another gene shares the same symbol but has a distinct bnumber.*

```{r query-regulon-genes}
## Open the RegulonDB connection. The regulondb_* settings are not defined in
## this file (presumably they come from env.r -- TODO confirm).
regulondb_access <- dbConnect(
  RMariaDB::MariaDB(),
  host = regulondb_host,
  port = regulondb_port,
  dbname = regulondb_dbname,
  username = regulondb_username,
  password = regulondb_password
)

## One row per RegulonDB gene (grouped by gene_id and gene_name): coordinates,
## strand and type, plus comma-joined (group_concat) bnumbers, products and
## gene/product synonyms. Every join is a left join starting from GENE, so
## genes without a bnumber, product or synonym are still returned.
query_1 <- "
select 
  G.gene_id as RegulonDB_gene_id,
  G.gene_name as RegulonDB_symbol,
  group_concat(distinct GBNUM.bnumber) as RegulonDB_bnumber,
	  min(G.gene_posleft) as RegulonDB_start,
	  min(G.gene_posright) as RegulonDB_stop,
	  group_concat(distinct G.gene_strand) as RegulonDB_strand,
	  group_concat(distinct G.gene_type) as RegulonDB_type,
	  group_concat(distinct P.product_id) as RegulonDB_product_id,
	  group_concat(distinct P.product_name) as RegulonDB_product_name,
	  group_concat(distinct P.product_type) as RegulonDB_product_type,
	  group_concat(distinct OBSYN.object_synonym_name) as gene_synonyms,
	  group_concat(distinct OBSYN2.object_synonym_name) as product_synonyms
		from 
			(select gene_id,gene_name,gene_posleft,gene_posright,gene_strand,gene_type from regulondb.GENE) as G
		left join (select ecocyc_id,bnumber,gene_id from regulondb.GENE_BNUMBER_TMP) as GBNUM
        on G.gene_id = GBNUM.gene_id
        left join (select * from regulondb.GENE_PRODUCT_LINK) as GPLN
        on G.gene_id = GPLN.gene_id
        left join (select product_id,product_name,product_type from regulondb.PRODUCT) as P
        on GPLN.product_id = P.product_id
        left join regulondb.OBJECT_SYNONYM as OBSYN
        on G.gene_id = OBSYN.object_id  
         left join regulondb.OBJECT_SYNONYM as OBSYN2
        on P.product_id = OBSYN2.object_id
	    group by G.gene_id,G.gene_name
"
genes_and_products <- dbGetQuery(regulondb_access, query_1)

## Product IDs that RegulonDB links to a transcription factor.
query_2 <- "select product_id from PRODUCT_TF_LINK"

tfs_products <- unique(dbGetQuery(regulondb_access, query_2)$product_id)

genes_and_products <- genes_and_products %>%
  dplyr::mutate(
    ## Recode strand to +/-; anything else (including missing) becomes NA.
    RegulonDB_strand = dplyr::case_when(
      RegulonDB_strand == "reverse" ~ "-",
      RegulonDB_strand == "forward" ~ "+",
      TRUE ~ NA_character_
    ),
    ## RegulonDB_product_id is a comma-joined list (group_concat), so a gene
    ## with several products must be tested ID by ID: comparing the whole
    ## "id1,id2" string against tfs_products would never match.
    RegulonDB_TF = as.numeric(vapply(
      strsplit(as.character(RegulonDB_product_id), ",", fixed = TRUE),
      function(ids) any(ids %in% tfs_products),
      logical(1)
    )),
    ## Changed manually: two genes share the same RegulonDB_symbol and we
    ## don't want them to be merged, so the b4708 one gets its RegulonDB ID.
    RegulonDB_symbol = replace(RegulonDB_symbol, RegulonDB_bnumber %in% "b4708", "ECK125240991")
  )

dbDisconnect(regulondb_access)
```

## Zika genesView

Zika's genesView is queried. An additional column with parsed bnumbers is added. 

```{r query-zika}
## Open the Zika connection. The zika_* settings are not defined in this file
## (presumably they come from env.r -- TODO confirm).
zikadb_access <- dbConnect(RMariaDB::MariaDB(),
                       username = zika_username,
                       password = zika_password,
                       dbname = zika_dbname,
                       host = zika_host,
                       port = zika_port)

## Fetch the whole genesView table.
query_2 <- "SELECT * FROM ecoli_project_final.genesView;"

zika_genes <- dbGetQuery(zikadb_access, query_2)

## Close the Zika handle opened above (not the RegulonDB one).
dbDisconnect(zikadb_access)

zika_genes_parsed <- zika_genes %>%
  ## Prefix every genesView column so it stays distinguishable after merging.
  dplyr::rename(Zika_gene_id = gene_id,
                Zika_bnumber = bnumber,
                Zika_symbol = symbol,
                Zika_product = product,
                Zika_start = start,
                Zika_stop = stop,
                Zika_strand = strand,
                Zika_type = type,
                Zika_essentiality = essentiality,
                Zika_TF = TF
                ) %>%
  ## For identifiers that start with "b" and contain "_", keep only the part
  ## before the first underscore; every other identifier is kept unchanged.
  dplyr::mutate(Zika_parsed_bnum = ifelse(
    grepl("_", Zika_bnumber) & grepl("^b", Zika_bnumber),
    sub("_.*$", "", Zika_bnumber),
    Zika_bnumber
  ))
```

## Merge

* Genes are first retrieved from RegulonDB and Zika separately
* Tables are joined using bnumbers (or parsed bnumbers, for sRNA in Zika)
* Genes that are not merged during this first step are then joined based on their symbol
* Consistency of coordinates and strand is checked
* In case some information differs between RegulonDB and Zika, the priority is given to RegulonDB
* A third join is made based on coordinates identity

* 6 new columns are added:

  * **Consensus_bnumber**: if different, RegulonDB's is kept; if absent, RegulonDB internal ID is kept, else, Zika internal "bnumber" is kept ; synonyms are updated.
  * **Consensus_symbol**: if different, RegulonDB's is kept; if absent, RegulonDB bnumber/ID is kept; synonyms are updated.
  * **Consensus_start**: if different, RegulonDB's is kept; if absent in RegulonDB, Zika's is used. 
  * **Consensus_stop**: if different, RegulonDB's is kept; if absent in RegulonDB, Zika's is used. 
  * **Consensus_strand**: if different, RegulonDB's is kept; if absent in RegulonDB, Zika's is used. 
  * **Consensus_TF**: if 0, gene doesn't code for a TF; if 1, gene considered TF-coding in either database; if 2, gene considered TF-coding in both databases. 

```{r merge-genes}
## Manually added synonyms
## raiZ/b4805, C0293/b4806
## istR-1/istR
Consensus_symbol <- c("gatR", "insO", "istR", "yagP", "ydfJ", "ydiU", "yjhB", "ydgV", "yhjC", "yejM", "yhcH",  "yeeX", "ygeR", "yedR", "yraP", "yddW", "yhcB", "yafK", "ytfL", "yebS", "yjhC")
add_synonyms <- c("gatR_2", "insN-2", "istR-1", "b4694", "b4600", "selO", "nanX", "mdtU", "rcdB", "lapC", "nanQ", "tmaR", "actS", "drpB", "dolP", "digH", "zapG", "dpaA", "paeA", "letA", "nanY")
manual_synonyms <- data.frame(Consensus_symbol, add_synonyms)

## Join RegulonDB and Zika genes on bnumber, then symbol, then coordinates
regulon_zika_genes <- genes_and_products %>%
  ## Empty strings count as missing. Only character columns are touched:
  ## since dplyr 1.1.0 na_if() casts `y` to the type of `x`, so applying
  ## na_if(., "") to numeric columns raises an error.
  dplyr::mutate(dplyr::across(where(is.character), ~ dplyr::na_if(.x, ""))) %>%
  ## Join RegulonDB and Zika on bnumbers (RegulonDB gene ID when no bnumber)
  dplyr::mutate(Consensus_bnumber = ifelse(!is.na(RegulonDB_bnumber), RegulonDB_bnumber, RegulonDB_gene_id)) %>%
  dplyr::full_join(zika_genes_parsed, by = c("Consensus_bnumber" = "Zika_parsed_bnum")) %>%
  ## Symbol priority: RegulonDB, then Zika, then the consensus bnumber
  dplyr::mutate(Consensus_symbol = ifelse(!is.na(RegulonDB_symbol), RegulonDB_symbol,
                                          ifelse(!is.na(Zika_symbol), Zika_symbol, Consensus_bnumber))) %>%
  ## Join on symbol. The order of the expressions below matters: across()
  ## also sees columns summarised before it, and later expressions read the
  ## already-collapsed values.
  dplyr::group_by(Consensus_symbol) %>%
  dplyr::summarise(Consensus_bnumber = first(Consensus_bnumber),
                   across(where(is.character), concat_uniq),
                   RegulonDB_start = first(RegulonDB_start),
                   RegulonDB_stop = first(RegulonDB_stop),
                   ## NOTE(review): mean() yields a fractional flag when rows
                   ## sharing a symbol disagree -- confirm this is intended.
                   RegulonDB_TF = mean(RegulonDB_TF),
                   Zika_gene_id = concat_uniq(Zika_gene_id),
                   Zika_start = first(Zika_start),
                   Zika_stop = first(Zika_stop),
                   Zika_TF = mean(Zika_TF),
                   gene_synonyms = concat_uniq(gene_synonyms),
                   product_synonyms = concat_uniq2(product_synonyms, RegulonDB_product_id)) %>%
  dplyr::mutate(dplyr::across(where(is.character), ~ dplyr::na_if(.x, ""))) %>%
  dplyr::rowwise() %>%
  ## Consensus values: RegulonDB wins whenever it has a value, Zika fills gaps
  dplyr::mutate(Consensus_TF = sum(Zika_TF, RegulonDB_TF, na.rm = TRUE),
                Consensus_start = ifelse(is.na(RegulonDB_start), Zika_start, RegulonDB_start),
                Consensus_stop = ifelse(is.na(RegulonDB_stop), Zika_stop, RegulonDB_stop),
                Consensus_strand = ifelse(is.na(RegulonDB_strand), Zika_strand, RegulonDB_strand)) %>%
  ## Join on coordinates and strand
  dplyr::mutate(coords = paste0(Consensus_start, "_", Consensus_stop, "_", Consensus_strand)) %>%
  dplyr::group_by(coords) %>%
  dplyr::summarise(Consensus_bnumber = first(Consensus_bnumber),
                   Consensus_symbol = first(Consensus_symbol),
                   across(where(is.factor), concat_uniq),
                   across(where(is.character), concat_uniq),
                   across(where(is.numeric), min)) %>%
  dplyr::mutate(dplyr::across(where(is.character), ~ dplyr::na_if(.x, ""))) %>%
  ## Add synonyms
  dplyr::left_join(manual_synonyms, by = "Consensus_symbol") %>%
  ## Row-wise because concat_uniq2() collapses all its inputs into one string
  dplyr::rowwise() %>%
  dplyr::mutate(gene_synonyms = concat_uniq2(gene_synonyms, RegulonDB_bnumber, RegulonDB_gene_id, RegulonDB_symbol, Zika_symbol, Zika_bnumber, add_synonyms)) %>%
  ## Order columns and remove missing values
  dplyr::filter(!is.na(Consensus_bnumber) & !is.na(Consensus_symbol) & !is.na(Consensus_start) & !is.na(Consensus_stop) & !is.na(Consensus_strand) & !is.na(Consensus_TF)) %>%
  dplyr::select(Consensus_bnumber, Consensus_symbol, Consensus_start, Consensus_stop, Consensus_strand, Consensus_TF,
                gene_synonyms, product_synonyms, everything(), -coords, -add_synonyms) %>%
  dplyr::arrange(Consensus_start)


# write.table(regulon_zika_genes, file = "TEMP_MASTER_GENE_FILE.tsv", col.names = T,  row.names = F, quote = F, sep = "\t")

## Check if there are genes without consensus info
# regulon_zika_genes %>% filter(is.na(Consensus_bnumber)|is.na(Consensus_symbol)|is.na(Consensus_start)|is.na(Consensus_stop))

```

```{r write-gene-set-bed}

# gene_set_bed <- regulon_zika_genes %>%
#   mutate(chromosome = "Chromosome") %>%
# 	select(chromosome, Consensus_start, Consensus_stop, Consensus_symbol, Consensus_bnumber, Consensus_strand) %>%
# 	arrange(Consensus_start) %>%
# 	mutate_all(na_if,"") %>%
# 	tidyr::drop_na()
# 
# write.table(gene_set_bed, file = "TEMP_MASTER_GENE_BED.tsv", col.names = F,  row.names = F, quote = F, sep = "\t")

```

<!-- *Note: a few product synonyms are added manually: FNR, CRP, HNS, GlpR, GatR.* -->
<!-- * In order to deal with heterodimeric TFs, a few rows are added that have pairs of bnumbers and symbols -->

```{r add-tf-synonyms}
## Complete protein synonyms

Consensus_symbol <- c("fnr", "crp", "hns", "glpR", "gatR")
add_synonyms <- c("FNR", "CRP", "HNS", "GlpR", "GatR")
manual_synonyms <- data.frame(Consensus_symbol, add_synonyms)

regulon_zika_genes_proteins_synonyms <- regulon_zika_genes %>%
  dplyr::left_join(manual_synonyms, by = "Consensus_symbol") %>%
  ## concat_uniq2() collapses everything it receives into one string, so it
  ## has to run row by row. Made explicit here instead of relying on the
  ## row-wise state of the incoming table.
  dplyr::rowwise() %>%
  dplyr::mutate(product_synonyms = concat_uniq2(product_synonyms, add_synonyms)) %>%
  dplyr::select(-add_synonyms)

## Add RefSeq protein_ids from RSAT cds.tab file

refseq_ids <- read.delim("~/Desktop/Genomes/Escherichia_coli_GCF_000005845.2_ASM584v2/genome/cds.tab", header = TRUE,  sep = "\t")

refseq_ids_update <- refseq_ids %>%
  dplyr::mutate(bnumber = EcoliGenes::get_gene_bnumber(id)) %>%
  dplyr::rename(refseq_id = protein_id) %>%
  dplyr::select(bnumber, refseq_id) %>%
  ## Turn literal "<NA>" strings into real missing values. Done after the
  ## select and on character columns only: since dplyr 1.1.0 na_if() casts
  ## `y` to the type of `x` and errors on numeric columns.
  dplyr::mutate(dplyr::across(where(is.character), ~ dplyr::na_if(.x, "<NA>"))) %>%
  dplyr::distinct()

refseq_added <- regulon_zika_genes_proteins_synonyms %>%
  dplyr::left_join(refseq_ids_update, by = c("Consensus_bnumber" = "bnumber")) %>%
  # mutate(product_synonyms = ifelse(is.na(product_synonyms), Consensus_bnumber, product_synonyms)) %>%
  dplyr::rowwise() %>%
  dplyr::mutate(product_synonyms = concat_uniq2(product_synonyms, refseq_id)) %>%
  dplyr::select(-refseq_id)


## Manually add rows for TFs that form dimers and are associated with 2 bnumbers
bnum <- c("b1712,b0912", "b3357,b0959", "b1892,b1891", "b0226,b0225", "b2783,b2782", "b1951,b2217", "b2217,b4366", "b3512,b2217", "b1564,b1563", "b2017,b4539", "b1507,b1508", "b3083,b3082", "b4000,b0440")
## "relB,relE": the second symbol was truncated to "rel"; it now matches the
## "RelB-RelE" synonym and the two-symbol pattern of every other row.
sym <- c("ihfA,ihfB", "crp,sxy", "flhD,flhC", "dinJ,yafQ", "mazE,mazF", "rcsA,rcsB", "rcsB,bglJ", "gadE,rcsB", "relB,relE", "yefM,yoeB", "hipA,hipB", "higB,higA", "hupA,hupB")
syno <-  c("IHF", "CRP-Sxy", "FlhDC", "DinJ-YafQ", "MazE-MazF", "RcsAB", "RcsB-BglJ", "GadE-RcsB", "RelB-RelE", "YefM-YoeB", "HipAB", "HigBA", "HU,hu,HupAB")
## The three vectors are parallel; fail early if they ever get out of step.
stopifnot(length(sym) == length(bnum), length(syno) == length(bnum))
## Placeholder for every column that has no value for a dimer row.
fill <- rep(NA, length(bnum))

dimers_df <- data.frame(
  Consensus_bnumber = bnum,
  Consensus_symbol = sym,
  Consensus_start = fill,
  Consensus_stop = fill,
  Consensus_strand = fill,
  Consensus_TF = fill,
  gene_synonyms = fill,
  product_synonyms = syno,
  RegulonDB_gene_id = fill,
  RegulonDB_symbol = fill,
  RegulonDB_bnumber = fill,
  RegulonDB_start = fill,
  RegulonDB_stop = fill,
  RegulonDB_strand = fill,
  RegulonDB_type = fill,
  RegulonDB_product_id = fill,
  RegulonDB_product_name = fill,
  RegulonDB_product_type = fill,
  RegulonDB_TF = rep(1, length(bnum)),
  Zika_gene_id = fill,
  Zika_symbol = fill,
  Zika_bnumber = fill,
  Zika_essentiality = fill,
  Zika_product = fill,
  Zika_start = fill,
  Zika_stop = fill,
  Zika_strand = fill,
  Zika_type = fill,
  Zika_TF = fill
)

## Append the dimer rows to the gene table.
dimers_merged <- rbind.data.frame(refseq_added, dimers_df)


```


```{r add-biocyc}
# biocyc_genes <- read.delim("All_genes_of_E._coli_K-12_substr._MG1655.txt", header = T,  sep = "\t")
# 
# biocyc_update <- biocyc_genes %>%
# 	dplyr::mutate_all(na_if,"") %>%
# 	dplyr::mutate(coords = paste0(Left.End.Position, "_", Right.End.Position)) %>%
# 	dplyr::select(Gene.Name, Accession.1, coords ) %>%
# 	dplyr::distinct() 
# 
# all_synonyms <- unique((master_table %>% dplyr::select(gene_synonyms) %>% tidyr::separate_rows(gene_synonyms, sep = ",") %>% dplyr::arrange(gene_synonyms))$gene_synonyms)
# 
# all_synonyms <- unique((regulon_zika_genes_synonyms %>% dplyr::select(gene_synonyms) %>% tidyr::separate_rows(gene_synonyms, sep = ",") %>% dplyr::arrange(gene_synonyms))$gene_synonyms)
# 
# biocyc_update %>%   dplyr::filter(!Accession.1 %in% all_synonyms)
# biocyc_update %>%   dplyr::filter(!Gene.Name %in% all_synonyms)
# 
# 
# master_temp <- master_table %>% 
# 	dplyr::mutate(coords = paste0(Consensus_start, "_", Consensus_stop)) %>%
#   dplyr::select(Consensus_bnumber, RegulonDB_gene_id, RegulonDB_symbol, RegulonDB_bnumber, gene_synonyms,  coords)
#                                               
# temp <- biocyc_update %>%   dplyr::filter(!Accession.1 %in% all_synonyms | !Gene.Name %in% all_synonyms) %>%
#   dplyr::mutate(regulondb_bnumber1 = EcoliGenes::get_gene_bnumber(Accession.1),
#                 regulondb_bnumber2 = EcoliGenes::get_gene_bnumber(Gene.Name),
#                 regulondb_symbol1 = EcoliGenes::get_gene_symbol(Accession.1),
#                 regulondb_symbol2 = EcoliGenes::get_gene_symbol(Gene.Name)) %>%
#   dplyr::rowwise() %>%
#   tidyr::unite(Consensus_bnumber, regulondb_bnumber1, regulondb_bnumber2, na.rm = T) %>%
#   tidyr::unite(Consensus_symbol, regulondb_symbol1, regulondb_symbol2, na.rm = T) %>%
#   dplyr::left_join(master_temp %>%  dplyr::select(-coords), by = "Consensus_bnumber") %>%
#   dplyr::filter(!coords == "NA_NA") %>%
#   dplyr::left_join(master_temp %>%  dplyr::select(-Consensus_bnumber), by = "coords")
# 
# write.table(temp, file = paste0("temp_genes_biocyc_", date, ".tsv"), sep="\t", col.names = T, row.names = F)

```

```{r write-master}
## The master table is the merged gene set plus the manually added dimer rows.
master_table <- dimers_merged
  # dplyr::select(-contains("Zika")) %>%
  # dplyr::arrange(Consensus_start)

## Tab-separated export with a header row and no row names.
write.table(
  master_table,
  file = "inst/extdata/master_gene_file.tsv",
  sep = "\t",
  row.names = FALSE,
  col.names = TRUE
)
```