metabolomics.Rmd

---
title: "James P. Johnson - Metabolomics LIMMA Analysis of Bowel Movement Frequency (BMF) v3-7-23"
output: html_notebook
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 

Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Ctrl+Shift+Enter*. 

```{r}
# Load some useful libraries
library(limma)
library(edgeR)
library(sns)
library(data.table)
library(tidyr)
library(tidyverse)
library(tibble)
library(sjmisc)
library(ggplot2)
library(ggpubr)
library(dplyr)
library(stringr)
library(ggbeeswarm)
```


```{r}
# Import data ----
# full:    table used to build the model; columns 10-12 are renamed below and
#          columns 13 onward are relabelled from `countdf`.
# meta:    annotation table (later chunks read its CHEMICAL_ID,
#          BIOCHEMICAL_NAME and SUPER_PATHWAY columns).
# countdf: abundance table; its first column supplies the row identifiers.
full <- read.csv(file = 'metabolomics.csv')
meta <- read.csv(file = '../Metabolomics_Data/metabolomics_fullmetadata.csv')
countdf <- read.csv(file = 'metabolomics_count.csv')

# Clean up tables ----
# Make index/column/label/ordering consistent between tables/matrices.

# read.csv() and data.frame() prefix syntactically invalid column names (for
# example names starting with a digit) with "X". Remove only that leading "X";
# an unanchored pattern would also delete any "X" inside a genuine name.
strip_x_prefix <- function(x) str_remove(x, "^X")

names(countdf) <- strip_x_prefix(names(countdf))
names(meta) <- strip_x_prefix(names(meta))
names(full) <- strip_x_prefix(names(full))

# Factorize sex
full$sex <- factor(full$sex)

# Factorize BMF: keep codes 1-4 (any other value becomes NA), drop codes that
# never occur, and make category 3 the reference level. The reference is given
# by label ("3") rather than by position so that a missing category cannot
# silently shift the baseline to a different code.
full$bowel <- droplevels(factor(full$bowel, levels = c(1, 2, 3, 4)))
full$bowel <- relevel(full$bowel, ref = "3")

# Short names for the covariates stored in columns 10-12.
names(full)[10:12] <- c("CRP", "LDL", "A1C")

# Label columns 13+ of `full` with the values in the first column of `countdf`.
# This must happen before the first four rows of `countdf` are dropped below.
colnames(full)[13:ncol(full)] <- countdf[[1]]

# Use the first column of `countdf` as row names (CHEMICAL_ID) after removing
# its first four rows.
colnames(countdf)[1] <- 'CHEMICAL_ID'
countdf <- countdf[-c(1:4), ]
countdf <- data.frame(countdf, row.names = "CHEMICAL_ID")

# Drop the first column of `meta`.
meta[1] <- NULL

# data.frame() re-applies name checking, so strip the "X" prefix again, then
# coerce every remaining column to numeric in one pass.
names(countdf) <- strip_x_prefix(colnames(countdf))
countdf[] <- lapply(countdf, as.numeric)
countdf
full
meta[, 1] <- as.character(meta[, 1])
meta
```


```{r}
# Per-metabolite linear models with limma (lmFit + eBayes).
# This code adapted from Christian Diener, PhD.
# Predictor of interest: bowel (BMF category).
# Covariates: sex, age, BMI, eGFR, LDL, CRP, PC1-PC3 and A1C.
design <- model.matrix(
  ~bowel + sex + age + BMI_CALC + eGFR + LDL + CRP + PC1 + PC2 + PC3 + A1C,
  data = full
)

# Wrap the abundance table in a DGEList so edgeR's cpm() can be applied.
# calcNormFactors() is intentionally not run (that step is only for
# CORNCOB/microbiome data).
dge <- DGEList(counts = countdf)

# Log-transformed counts-per-million values used as the model response.
logCPM <- cpm(dge, log = TRUE)

# Fit the model for every metabolite, then stabilize the variances.
fit <- eBayes(lmFit(logCPM, design))
```


```{r}
# Index the annotation table by its CHEMICAL_ID column, then display it.
row.names(meta) <- meta$CHEMICAL_ID
meta
```

```{r}
# Results tables for each BMF coefficient (relative to High Normal BMF) and
# for the eGFR covariate.

# Build the complete results table for one model coefficient, sorted by
# p-value, with the first column (ID) holding the biochemical name.
#   coef: column number (or name) of the coefficient in `design`.
# `number = Inf` returns every metabolite and `sort.by` is spelled out in full
# instead of relying on partial argument matching.
top_table_named <- function(coef) {
  tab <- topTable(fit, coef = coef, genelist = rownames(countdf),
                  sort.by = "P", number = Inf)
  # topTable() stores `genelist` in the first column; replace each chemical ID
  # with its biochemical name (NA where `meta` has no matching CHEMICAL_ID).
  tab[[1]] <- meta$BIOCHEMICAL_NAME[match(tab[[1]], meta$CHEMICAL_ID)]
  tab
}

# Keep the rows whose adjusted p-value is below `cutoff`, smallest first.
# which() drops rows with a missing adjusted p-value.
significant_rows <- function(tab, cutoff = 0.05) {
  hits <- tab[which(tab$adj.P.Val < cutoff), , drop = FALSE]
  hits[order(hits$adj.P.Val), , drop = FALSE]
}

re_const <- top_table_named(2)     # Constipation vs High Normal
re_low <- top_table_named(3)       # Low Normal vs High Normal
re_diarrhea <- top_table_named(4)  # Diarrhea vs High Normal
# NOTE(review): assumes column 8 of `design` is eGFR (true when sex has two
# levels) -- confirm with colnames(design).
re_egfr <- top_table_named(8)

# Significant results only, ordered by adjusted p-value.
p_const <- significant_rows(re_const)
p_low <- significant_rows(re_low)
p_diarrhea <- significant_rows(re_diarrhea)
p_egfr <- significant_rows(re_egfr)

#Show dfs of significant hits
sig_const <- p_const
sig_low <- p_low
sig_diarrhea <- p_diarrhea
sig_egfr <- p_egfr
sig_const
sig_low
sig_diarrhea
sig_egfr
```


```{r}
#Pre-process for plotting:
# Treat empty annotation strings as missing values.
meta[meta==""]<- NA

# Columns kept in the per-coefficient tables used for plotting.
plot_cols <- c('logFC','B','adj.P.Val','BIOCHEMICAL_NAME','P.Value','SUPER_PATHWAY')

# Full results for one model coefficient joined to the metabolite annotation.
#   coef: column number (or name) of the coefficient in `design`.
# Every table goes through the same steps, so each one is keyed on
# CHEMICAL_ID before the join (the diarrhea table previously skipped the
# rename and was joined on whatever column names happened to overlap).
annotated_results <- function(coef) {
  tab <- topTable(fit, coef = coef, genelist = rownames(countdf),
                  sort.by = "P", number = Inf)
  names(tab)[1] <- 'CHEMICAL_ID' # topTable() names the genelist column "ID"
  dplyr::inner_join(meta, tab, by = "CHEMICAL_ID")
}

# Order a joined table by adjusted p-value and keep only the plotting columns.
ranked_for_plot <- function(tab) {
  tab[order(tab$adj.P.Val), plot_cols]
}

#Get results table for Constipation coefficient relative to High Normal BMF:
re_const <- annotated_results(2)
p_aconst <- re_const[re_const$adj.P.Val < 0.05,] # significant adj P value results
p_const <- ranked_for_plot(re_const)

#Get results table for Low Normal coefficient relative to High Normal BMF:
re_low <- annotated_results(3)
p_alow <- re_low[re_low$adj.P.Val < 0.05,]
p_low <- ranked_for_plot(re_low)

#Get results table for the eGFR covariate:
# NOTE(review): assumes column 8 of `design` is eGFR (true when sex has two
# levels) -- confirm with colnames(design).
re_egfr <- annotated_results(8)
p_aegfr <- re_egfr[re_egfr$adj.P.Val < 0.05,]
p_egfr <- ranked_for_plot(re_egfr)

#Get results table for Diarrhea coefficient relative to High Normal BMF:
re_diarrhea <- annotated_results(4)
p_adiarrhea <- re_diarrhea[re_diarrhea$adj.P.Val < 0.05,]
p_diarrhea <- ranked_for_plot(re_diarrhea)

# `p` held the most recently ordered table in the original flow; keep it
# defined for any later code that reads it.
p <- re_diarrhea[order(re_diarrhea$adj.P.Val),]

#Show dfs of significant hits
sig_const <- p_const[which(p_const$adj.P.Val < 0.05),]
sig_low <- p_low[which(p_low$adj.P.Val < 0.05),]
sig_diarrhea <- p_diarrhea[which(p_diarrhea$adj.P.Val < 0.05),]
sig_egfr <- p_egfr[which(p_egfr$adj.P.Val < 0.05),]
sig_const
sig_low
sig_diarrhea
sig_egfr
```

```{r}
#Plotting preparation:
# Every pairwise combination of the four BMF categories.
combinations <- list(c("Constipation","Low\nNormal"),
                     c("Constipation","High\nNormal"),
                     c("Constipation","Diarrhea"),
                     c("Low\nNormal","High\nNormal"),
                     c("Low\nNormal","Diarrhea"),
                     c("High\nNormal","Diarrhea"))

# The three comparisons against the High Normal reference that are drawn on
# the plots.
comparisons <- list(c("Constipation","High\nNormal"),
                    c("Low\nNormal","High\nNormal"),
                    c("Diarrhea","High\nNormal"))

#Create new df for plotting
# ID-to-name lookup for the metabolites present in the results table.
# `re_const` already carries the annotation columns, so they are selected
# directly. The previous join passed `by = intersect(..., names(<vector>))`;
# names() of an unnamed vector is NULL, so that call only worked as an
# implicit natural join on every shared column.
biochemical_names <- re_const[, c('CHEMICAL_ID','BIOCHEMICAL_NAME')]

# Keep columns 1-12 of `full` plus columns 17 onward (columns 13-16 are
# dropped) and label the retained measurement columns with biochemical names.
# NOTE(review): assumes meta$BIOCHEMICAL_NAME has one entry per retained
# column, in the same order -- confirm.
biochemistry <- full[,c(1:12,17:ncol(full))]
names(biochemistry)[13:ncol(biochemistry)] <- meta$BIOCHEMICAL_NAME
biochemistry

# Tag the Low Normal hits with their BMF category. rep() keeps the assignment
# valid when there are no hits (a length-one value cannot be assigned to a
# zero-row data frame).
sig_low$bowel <- factor(rep('Low\nNormal', nrow(sig_low)), levels = 'Low\nNormal')

# Names of every significant metabolite across the three BMF contrasts.
titles <- c(sig_const$BIOCHEMICAL_NAME,sig_low$BIOCHEMICAL_NAME,sig_diarrhea$BIOCHEMICAL_NAME)

#Plotting preparation:
#Annotation function:
# Map a single p-value to a significance label:
#   x < 0.001 -> "***", x < 0.01 -> "**", x < 0.05 -> "*", otherwise NA.
sig <- function(x) {
  if (x < 0.001) {
    return("***")
  }
  if (x < 0.01) {
    return("**")
  }
  if (x < 0.05) {
    return("*")
  }
  NA
}


# Working copy of the Low Normal hits.
p <- sig_low
# Orders by the logical `adj.P.Val < 0.05` (FALSE before TRUE), not by the
# p-value itself.
p <- p[order(p$adj.P.Val < 0.05),]
# Orders by BMF category; 'Low\nNormal' is the only level listed here.
p <- p[order(factor(p$bowel, levels = c('Low\nNormal'))),]
# Keep the first row for each biochemical name.
p_nodup <- p[!duplicated(p$BIOCHEMICAL_NAME),]

# Relabel columns 13+ of `biochemistry` from the ID-to-name lookup table.
colnames(biochemistry)[13:length(colnames(biochemistry))] <- biochemical_names$BIOCHEMICAL_NAME
#biochemistry <- dplyr::inner_join(full[,c(1:17)],biochemistry, by = intersect(names(full[,c(1:17)])[14],names(biochemistry)[10]))
# Turn the numeric BMF codes 1-4 into labelled categories.
biochemistry$bowel <- factor(biochemistry$bowel, levels=c(1,2,3,4), labels = c("Constipation","Low\nNormal","High\nNormal","Diarrhea"))

#annotated-only dataframe:
# The first two assignments are immediately replaced by the filtered versions
# on the following two lines.
annotated <- p
annotated_nodup <- p_nodup
annotated <- p[!grepl("X - ", p$BIOCHEMICAL_NAME),] #remove unannotated hits
annotated_nodup <- p_nodup[!grepl("X - ", p_nodup$BIOCHEMICAL_NAME),]
# Drop rows without a BMF category, then order by adjusted p-value.
annotated <- annotated[!is.na(annotated$bowel),]
annotated_nodup <- annotated_nodup[!is.na(annotated_nodup$bowel),]
annotated <- annotated[order(annotated$adj.P.Val),]
annotated_nodup <- annotated_nodup[order(annotated_nodup$adj.P.Val),]
# Split the de-duplicated hits by BMF category label and stack them again.
const_anno <- annotated_nodup[order(annotated_nodup$bowel),][which(annotated_nodup[order(annotated_nodup$bowel),]$bowel == 'Constipation'),]
low_anno <- annotated_nodup[order(annotated_nodup$bowel),][which(annotated_nodup[order(annotated_nodup$bowel),]$bowel == 'Low\nNormal'),]
diarrhea_anno <- annotated_nodup[order(annotated_nodup$bowel),][which(annotated_nodup[order(annotated_nodup$bowel),]$bowel == 'Diarrhea'),]
anno_list <- rbind(const_anno,low_anno,diarrhea_anno)

myplots_main <- list()  # new empty list
# Replace exact zeros with NA in every column of `biochemistry`.
biochemistry[biochemistry==0] <- NA
#biochemistry[,c(4:ncol(biochemistry))] <- log10(biochemistry[,c(4:ncol(biochemistry))])
# Plotting table: BMF category, participant ID and one column per
# de-duplicated hit.
biochemistry_test <- biochemistry[,c('bowel','public_client_id',p_nodup$BIOCHEMICAL_NAME)]

biochemistry_testanno <- biochemistry_test
#biochemistry_testanno <- biochemistry_testanno %>% select(-c(3:12))
# Keep only the columns whose names appear in `titles`, then put the first two
# columns of the table back in front of them.
biochemistry_selection <- biochemistry_testanno[,colnames(biochemistry_testanno) %in% titles]
biochemistry_testanno <- cbind(biochemistry_testanno[,c(1:2)],biochemistry_selection)

#biochemistry_testanno <- biochemistry_testanno[,c('public_client_id','bowel',names(biochemistry)[(names(biochemistry) %in% titles)])]
#biochemistry_testanno$bowel <- factor(biochemistry_testanno$bowel, levels=c(1,2,3,4), labels = c("Constipation","Low\nNormal","High\nNormal","Diarrhea"))
#biochemistry_testanno$bowel <- factor(biochemistry_testanno$bowel, levels=c("Constipation","Low\nNormal","High\nNormal","Diarrhea"), labels = c("Constipation","Low\nNormal","High\nNormal","Diarrhea"))

# One table per BMF contrast (ID columns plus that contrast's significant
# metabolites). From here on `biochemistry_testanno` is a list of three data
# frames in the order constipation, low normal, diarrhea; `bmfs` holds the
# matching hit tables in the same order.
biochemistry_testanno_const <- biochemistry_testanno[,c('public_client_id','bowel',sig_const$BIOCHEMICAL_NAME)]
biochemistry_testanno_low <- biochemistry_testanno[,c('public_client_id','bowel',sig_low$BIOCHEMICAL_NAME)]
biochemistry_testanno_diarrhea <- biochemistry_testanno[,c('public_client_id','bowel',sig_diarrhea$BIOCHEMICAL_NAME)]
biochemistry_testanno <- list(biochemistry_testanno_const,biochemistry_testanno_low,biochemistry_testanno_diarrhea)
bmfs <- list(sig_const,sig_low,sig_diarrhea)

#Main Text Figures
```

```{r}
# Start the figure index at the first BMF contrast.
fig_index <- 1
```

```{r}
#Complex plotting function for annotated-only dataframe:
# Stand-in "test" function passed by name to geom_signif(test = 'test_anno').
# It does not compute a test from its inputs: it returns an adjusted p-value
# looked up from the hit tables in `bmfs`.
#
# Arguments:
#   x, y : overwritten with NULL on entry and never used.
#   z    : list of comparisons; the first element of z[[counter]] selects the
#          branch to run.
#   j    : overwritten on entry with the biochemical name at position `row`
#          of the stacked `bmfs` tables.
#   row  : position in the stacked `bmfs` tables.
#
# Reads the globals `counter`, `total_rows`, `comparisons`, `bmfs` and
# `biochemistry_testanno`, and updates `counter` with `<<-` so that successive
# calls cycle through comparison 1, 2, 3 and back to 1.
#
# Returns a list with a single element `p.value`.
test_anno = function(x,y,z,j,row) {
 x = NULL
 y = NULL
 j <- rbind(bmfs[[1]],bmfs[[2]],bmfs[[3]])$BIOCHEMICAL_NAME[row]
 # Counter out of range: reset it and report a non-significant p-value.
 if (counter > 3) {
   counter <<- 1
   #row_ind_iterator <<- row_ind_iterator + 1
   results <- list(p.value = 1)
   names(results) <- 'p.value'
   return(results)
 } else if (counter <= 3) {
    print("Item is:")
    print(j)
    print("Initial/unincremented count is:")
    print(counter)
    print("z[[counter]][1]: ")
    print(z[[counter]][1])
    if (row <= total_rows) {        
      # Branch 1: current comparison starts with the first category in
      # comparisons[[1]].
      if (z[[counter]][1]== comparisons[[1]][1]) {
          # TRUE when the current contrast's table has metabolite columns
          # (more than the two ID columns) and `j` is one of them.
          j_has_match = ifelse(dim(biochemistry_testanno[[counter]])[2]>2,any(names(biochemistry_testanno[[counter]][3:ncol(biochemistry_testanno[[counter]])]) %in% j),FALSE)
          print("Does y_name match a BMF df that has hits?")
          print(j_has_match)
          print("Count is:")
          print(counter)
          print("Row_ind_iterator is:")
          print(row)
          # Indexing with a single logical keeps all columns (TRUE) or none
          # (FALSE), so this yields the adjusted p-value at `row` when `j`
          # matched and 1 otherwise.
          pval <- ifelse(dim(biochemistry_testanno[[counter]][j_has_match])[2] !=0,c(bmfs[[1]]$adj.P.Val,bmfs[[2]]$adj.P.Val,bmfs[[3]]$adj.P.Val)[row],1)
          results <- list(p.value = pval)
          names(results) <- 'p.value'
          print(results)
          counter <<- 2
          #counter <<- counter + 1   
        # Branch 2: same lookup for the second comparison; advances to 3.
        } else if (z[[counter]][1]==comparisons[[2]][1]) {
          j_has_match = ifelse(dim(biochemistry_testanno[[counter]])[2]>2,any(names(biochemistry_testanno[[counter]][3:ncol(biochemistry_testanno[[counter]])]) %in% j),FALSE)
          print("Does y_name match a BMF df that has hits?")
          print(j_has_match)
          print("Count is:")
          print(counter)
          print("Row_ind_iterator is:")
          print(row)
          pval <- ifelse(dim(biochemistry_testanno[[counter]][j_has_match])[2] !=0,c(bmfs[[1]]$adj.P.Val,bmfs[[2]]$adj.P.Val,bmfs[[3]]$adj.P.Val)[row],1)
          results <- list(p.value = pval)
          names(results) <- 'p.value'
          print(results)
          counter <<- 3
          #counter <<- counter + 1
        # Branch 3: same lookup for the third comparison; wraps back to 1.
        } else if (z[[counter]][1]==comparisons[[3]][1]) {
          print("Last comparison drawn for this item")
          j_has_match = ifelse(dim(biochemistry_testanno[[counter]])[2]>2,any(names(biochemistry_testanno[[counter]][3:ncol(biochemistry_testanno[[counter]])]) %in% j),FALSE)
          print("Does y_name match a BMF df that has hits?")
          print(j_has_match)
          print("Row_ind_iterator is now: ")
          #row_ind_iterator <<- row_ind_iterator + 1
          print(row)
          print("Count is:")
          print(counter)
          pval <- ifelse(dim(biochemistry_testanno[[counter]][j_has_match])[2] !=0,c(bmfs[[1]]$adj.P.Val,bmfs[[2]]$adj.P.Val,bmfs[[3]]$adj.P.Val)[row],1)
          results <- list(p.value = pval)
          names(results) <- 'p.value'
          print(results)
          counter <<- 1
          #counter <<- counter + 1
          #row_ind_iterator <<- row_ind_iterator + 1
        }
        # NOTE(review): if none of the three branches matches, `results` is
        # never assigned and the return() below fails -- confirm that `z` is
        # always the global `comparisons` list.
    } else {
      # `row` is beyond `total_rows`: reset and report a non-significant
      # p-value.
      counter <<- 1
      results <- list(p.value = 1)
      names(results) <- 'p.value'
      print("Finished pairwise comparing")
      print(results)
    }
 }
 return(results)
}

#Begin accruing plots:
figures <- list()
# State shared with test_anno(): `counter` tracks which of the three
# comparisons is being annotated, and test_anno() returns p = 1 for any row
# index above `total_rows`.
counter <- 1
total_rows <- 21
for (fig_index in seq_along(bmfs)) {
  has_match <- any(names(biochemistry_testanno[[fig_index]]) %in% bmfs[[fig_index]]$BIOCHEMICAL_NAME)
  # Only build plots when this contrast's table has metabolite columns in
  # addition to its two ID columns.
  if (dim(biochemistry_testanno[[fig_index]][has_match])[2] > 2) {
    # Stack the hit tables once per contrast instead of once per plot.
    all_hits <- rbind(bmfs[[1]], bmfs[[2]], bmfs[[3]])
    n_hits <- sum(nrow(bmfs[[1]]), nrow(bmfs[[2]]), nrow(bmfs[[3]]))
    for (row_ind_iterator in seq_len(n_hits)) {
      y_name <- all_hits$BIOCHEMICAL_NAME[row_ind_iterator]
      plotlim_loweranno <- ifelse(!is_empty(biochemistry_testanno[[fig_index]][y_name]),
                                  min(biochemistry_testanno[[fig_index]][y_name][[y_name]]),
                                  FALSE)
      plotlim_upperanno <- ifelse(!is_empty(biochemistry_testanno[[fig_index]][y_name]),
                                  max(biochemistry_testanno[[fig_index]][y_name][[y_name]]),
                                  FALSE)
      plotlim_baranno <- ifelse(!is_empty(biochemistry_testanno[[fig_index]][y_name]),
                                mean(biochemistry_testanno[[fig_index]][y_name][[y_name]]),
                                10)
      myplots_main[[row_ind_iterator]] <- local({
        # Copy the loop variables into this local scope. aes() is evaluated
        # when the plot is drawn, not when it is created; without these copies
        # every stored plot would look up the loop's final `y_name`.
        y_col <- y_name
        row_idx <- row_ind_iterator
        biochemistry_testanno_trunc <- biochemistry[, c('bowel', 'public_client_id', y_col)]
        ggplot(data = biochemistry_testanno_trunc,
               aes(x = bowel, y = .data[[y_col]], group = bowel)) +
          geom_jitter(aes(color = bowel), size = 1, cex = 2) +
          geom_boxplot(color = "black", linewidth = 0.32, alpha = 0.75,
                       outlier.shape = NA, varwidth = FALSE) +
          ggtitle(label = str_wrap(y_col, width = 6)) +
          # test_anno() supplies the limma adjusted p-value for each bracket
          # and `sig` converts it to a star label.
          geom_signif(comparisons = comparisons,
                      map_signif_level = sig,
                      test = 'test_anno',
                      test.args = list(z = comparisons, j = y_col, row = row_idx),
                      step_increase = 0.1, size = 1,
                      y_position = plotlim_baranno,
                      textsize = 7, color = "red",
                      tip_length = c(0, 0)) +
          # NOTE(review): a plot keeps one coordinate system, so the
          # coord_trans() added at the end replaces this coord_cartesian()
          # (and its ylim/clip) -- confirm which one is intended.
          coord_cartesian(ylim = c(plotlim_loweranno, plotlim_upperanno), clip = "off") +
          # Only panels 1, 4 and 7 carry the y-axis title.
          labs(color = "BMF Category",
               y = if (row_idx %in% c(1, 4, 7)) "Log-Transformed\n Metabolite Level" else "") +
          guides(color = guide_legend(override.aes = list(size = 8), title.position = 'left', nrow = 1, ncol = 4),
                 x = guide_axis(n.dodge = 2)) +
          theme(plot.margin = unit(c(0, 0, 0, 0), "cm"),
                plot.title = element_text(size = 7),
                axis.text.x = element_text(size = 6),
                axis.text.y = element_text(size = 5),
                axis.title.y = element_text(size = 6),
                axis.title.x = element_blank()) +
          scale_fill_manual(limits = c("Constipation","Low\nNormal","High\nNormal","Diarrhea"),
                            labels = c("Constipation","Low\nNormal","High\nNormal","Diarrhea"),
                            values = ggthemes::canva_palettes[['Primary colors with a vibrant twist']][c(2,3,4,1)],
                            drop = FALSE) +
          scale_color_manual(values = ggthemes::canva_palettes[['Primary colors with a vibrant twist']][c(2,3,4,1)],
                             guide = guide_axis(n.dodge = 2)) +
          coord_trans(y = "log10")
      })
    }
  }
}
```


```{r}
# Reset the comparison counter read by test_anno() before the plots are drawn.
counter <<- 1
#Arrange the plots:
# Number of panels: total column count of the three per-contrast tables minus
# two columns for each entry of `bmfs`.
n_panels <- sum(
  dim(biochemistry_testanno[[1]])[2],
  dim(biochemistry_testanno[[2]])[2],
  dim(biochemistry_testanno[[3]])[2]
) - 2 * length(bmfs)
panel_idx <- 1:n_panels

figure_main <- ggarrange(
  plotlist = myplots_main[panel_idx],
  labels = LETTERS[panel_idx],
  legend = "top",
  align = "hv",
  widths = rep(10, 9),
  heights = rep(200, 9),
  common.legend = TRUE,
  nrow = 3,
  ncol = 3
)
figure_main


#final_main <- ggarrange(plotlist = plots_final[1:20], labels = LETTERS[1:20], legend = "top", align = "hv", common.legend = TRUE, nrow = 4, ncol = 5)

# Reset the counter again, then write the arranged figure to disk.
counter <<- 1
ggsave("BMFvsMetabolites.png", plot = figure_main,
       device = NULL, path = NULL,
       scale = 1.6, width = NA, height = NA,
       units = c("in", "cm", "mm", "px"),
       dpi = 300, limitsize = TRUE, bg = NULL)


```
```{r}
# Display the plotting table (bowel, public_client_id and one column per
# selected metabolite) that the heatmap chunk below starts from.
biochemistry_test
```


```{r}
library(pheatmap)
library(compositions)
library(viridis)
library(formattable)
library(scales)
library(gplots)

# Z-score the metabolomics
z <- biochemistry_test

# Put the two ID columns first and reorder the metabolite columns by the
# adjusted p-values in `p_nodup`.
z <- cbind(z$bowel, z$public_client_id, (z[, 3:ncol(z)][, order(p_nodup$adj.P.Val)]))
colnames(z)[1:2] <- c('BMF', 'public_client_id')
z_matrix <- as.matrix(z[, 3:ncol(z)])

# Standardize each column: subtract its mean and divide by its standard
# deviation. No na.rm is used, so a column containing NA yields NA throughout.
col_means <- colMeans(z_matrix)
col_sds <- apply(z_matrix, 2, sd)
z_matrix <- sweep(sweep(z_matrix, 2, col_means, "-"), 2, col_sds, "/")

z
z <- cbind(z[, c(1:2)], as.data.frame(z_matrix))

# One data frame per BMF category.
df_subsets <- split(z, z$BMF)
df_subsets

# Create the ggplot object
# NOTE(review): this assigns a plot object to the name `ggplot`; the value
# extracted from it is only passed to pheatmap() as `color_palettes_list` --
# confirm that pheatmap() actually uses that argument.
ggplot <- ggplot(data = df_subsets[[1]])
ggplotcolor_palette <- get("scales", ggplot)
ggplot_color_palette_vector <- unlist(ggplotcolor_palette)

# Empty 4 x (number of metabolites) matrix: one row per BMF category, one
# column per metabolite.
bmf_labels <- c("Constipation", "Low\nNormal", "High\nNormal", "Diarrhea")
metabolite_cols <- colnames(df_subsets[[1]][3:ncol(df_subsets[[1]])])
new_df <- matrix(nrow = 4, ncol = ncol(df_subsets[[1]]) - 2,
                 dimnames = list(bmf_labels, metabolite_cols))

# Fill each row with the per-metabolite mean z-score of that BMF subset.
for (i in seq_along(df_subsets)) {
  subset_i <- df_subsets[[i]]
  new_df[i, ] <- colMeans(subset_i[, 3:ncol(subset_i)])
}

# Convert the new_df object to a data frame
new_df_df <- as.data.frame(new_df)

# Create the heatmap (metabolites as rows, BMF categories as columns, no
# clustering, a gap after every row and column).
hm <- pheatmap(
  t(new_df_df),
  color_palettes_list = ggplot_color_palette_vector,
  cellheight = 15,
  cellwidth = 100,
  display_numbers = FALSE,
  gaps_row = seq_len(ncol(new_df_df)),
  gaps_col = 1:4,
  border_color = "white",
  angle_col = "0",
  cluster_rows = FALSE,
  fontsize_row = 7,
  fontsize_col = 10,
  cluster_cols = FALSE
)
ggsave("BMFvsMetabolitesHM.png", plot = hm,
       device = NULL, path = NULL,
       scale = 1.6, width = NA, height = NA,
       units = c("in", "cm", "mm", "px"),
       dpi = 300, limitsize = TRUE, bg = NULL)
```


Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Ctrl+Alt+I*.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Ctrl+Shift+K* to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike *Knit*, *Preview* does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.