preprocessing.Rmd

---
title: "Pre-processing"
output:
  html_document:
    df_print: paged
---



```{r}
#load necessary libraries
library(rms)
library(limma)
library(edgeR)
library(tidyverse)
library(dplyr)
library(magrittr)
library(stringr)
#load QC script 
source("bulkRNA_QC.R")
```
Now that we've loaded the necessary libraries, we want to load the raw count matrices and phenotype data for each of our cohorts from the Accelerating Medicines Partnership - Alzheimer's Disease (AMP-AD) consortium. There are three bulk-tissue RNAseq datasets, which together sample from 6 different brain regions. The cohorts are the Religious Orders Study and Rush Memory and Aging Project cohort (referred to as ROSMAP from here on), which has dorsolateral prefrontal cortex samples; the Mayo Clinic cohort (hereafter referred to as MAYO), which consists of temporal cortex samples; and the Mount Sinai Brain Bank cohort (referred to as MSBB), which has expression data sampled from Brodmann areas 10, 22, 36 and 44 (written BM10, BM22, BM36, BM44 hereafter).
```{r}
#function to create YDGE objects for each cohort in the study
#
# Reads the raw count matrix and the covariate table(s) of one AMP-AD cohort,
# keeps only AD cases and controls, and saves the resulting raw DGEList(s):
#   * ROSMAP / MAYO: one object, ./rawYDGE/raw_<cohort>ydge.rds
#   * MSBB: one object per Brodmann area, ./rawYDGE/raw_<BMxx>ydge.rds, plus
#     the sample-to-individual ID mapping in ./rawCohortData/allMSBBIDs.rds
#
# cohort: a single string, one of "ROSMAP", "MAYO" or "MSBB"; anything else
#         raises an error.
# Returns NULL invisibly; the function is called for the files it writes.
createYDGEAMPAD <- function(cohort){
  count_files <- c(
    ROSMAP = "./rawCohortData/ROSMAP_all_counts_matrix.txt.gz",
    MAYO = "./rawCohortData/Mayo_TCX_all_counts_matrix.txt.gz",
    MSBB = "./rawCohortData/MSSM_all_counts_matrix.txt.gz"
  )
  # fail early with a clear message rather than a later
  # "object 'filename' not found"
  if (!is.character(cohort) || length(cohort) != 1 || is.na(cohort) ||
      !(cohort %in% names(count_files))) {
    stop("cohort must be one of: ",
         paste(names(count_files), collapse = ", "), call. = FALSE)
  }

  # The file is read without a header, so every column arrives as text.
  # stringsAsFactors = FALSE keeps the numeric conversion below correct on
  # R < 4.0 too (as.numeric() on a factor would return level codes).
  all_counts <- read.table(gzfile(count_files[[cohort]]), sep = "\t",
                           stringsAsFactors = FALSE)

  # Row 1 carries the column names. Rows 1-5 are dropped as non-gene rows
  # (presumably the header plus four summary rows -- confirm against the file).
  colnames(all_counts) <- as.character(unlist(all_counts[1, ]))
  all_counts <- all_counts[-c(1:5), ]
  # strip the trailing ".<digits>" version suffix from the feature IDs
  rownames(all_counts) <- gsub('\\.[0-9]*$', '', all_counts$feature)
  counts <- all_counts[, -1]
  counts[] <- lapply(counts, as.numeric)

  if (cohort == "ROSMAP") {
    # read in phenotype/covariate data
    pdata <- read.table("./cohortMetadata/ageCensoredCovariates.tsv",
                        sep = "\t", header = TRUE, check.names = FALSE,
                        stringsAsFactors = FALSE)
    rownames(pdata) <- pdata$SampleID
    # Keep only AD cases and controls
    subjects_to_include <-
      pdata$SampleID[which(pdata$Diagnosis %in% c("AD", "CONTROL"))]
    subject_sets <- setNames(list(subjects_to_include),
                             paste0(cohort, "ydge"))
  } else if (cohort == "MAYO") {
    # read in phenotype/covariate data
    pdata <- read.table("./cohortMetadata/MAYO_CBE_TCX_Covariates.tsv",
                        sep = "\t", header = TRUE, check.names = FALSE,
                        stringsAsFactors = FALSE)
    rownames(pdata) <- pdata$SampleID
    # Keep only AD cases and controls
    subjects_to_include <-
      pdata$SampleID[which(pdata$Tissue.Diagnosis %in%
                             c("TCX.AD", "TCX.CONTROL"))]
    subject_sets <- setNames(list(subjects_to_include),
                             paste0(cohort, "ydge"))
  } else {
    # MSBB: read in phenotype/covariate data
    pdata <- read.table(
      "./cohortMetadata/MSBB_RNAseq_covariates_November2018Update.csv",
      sep = ",", header = TRUE, check.names = FALSE, stringsAsFactors = FALSE)

    # "BM_22_", "BM_36_", "BM_10_", "BM_44" are 4 brodmann areas of msbb
    phenodata <- read.table("./cohortMetadata/msbb_individual_metadata.csv",
                            sep = ",", header = TRUE, check.names = FALSE,
                            stringsAsFactors = FALSE)
    phenodata <- dplyr::rename(phenodata, individualIdentifier = individualID)
    pdata <- merge(pdata, phenodata, by = "individualIdentifier")

    #remove 272 samples as specified in syn8484987
    # The IDs to drop are the column names of the exclusion file.
    samples <- read.table("./cohortMetadata/removesinai.txt", sep = ",",
                          header = TRUE, check.names = FALSE,
                          stringsAsFactors = FALSE)
    excluded_ids <- colnames(samples)
    if (length(excluded_ids) > 0) {
      # rows with a missing sampleIdentifier are dropped as well, exactly as
      # the previous subset(sampleIdentifier != id) loop did
      keep <- !is.na(pdata$sampleIdentifier) &
        !(pdata$sampleIdentifier %in% excluded_ids)
      pdata <- pdata[keep, , drop = FALSE]
    }

    #create diagnosis variable as specified in syn8484987
    pdata$Diagnosis <- 'OTHER'
    #HAD TO MOD FROM OG CODE: NP.1 is now CERAD, bbscore is now Braak
    pdata$Diagnosis[pdata$CDR <= 0.5 & pdata$Braak <= 3 &
                      pdata$CERAD <= 1] <- 'CONTROL'
    pdata$Diagnosis[pdata$CDR >= 1 & pdata$Braak >= 4 &
                      pdata$CERAD >= 2] <- 'AD'
    pdata$Tissue.Diagnosis <- paste(pdata$Tissue, pdata$Diagnosis, sep = '.')

    pdata <- dplyr::filter(pdata, Diagnosis != "OTHER")
    rownames(pdata) <- make.names(pdata$sampleIdentifier, unique = TRUE)

    #taking overlap of subjects described in each brodman area and found in all_countsmatrix, some
    #samples described in msbb_individual_metadata.csv not found in all_countsmatrix
    regions <- c("BM10", "BM22", "BM36", "BM44")
    ids_by_region <- lapply(regions, function(region) {
      region_ids <-
        unique(pdata$sampleIdentifier[which(pdata$BrodmannArea %in% region)])
      region_ids <- intersect(colnames(counts), region_ids)
      print(paste0(region, "subjects_to_include"))
      region_ids
    })

    # one row per retained sample across all four regions, joined to the
    # individual each sample belongs to
    allMSBBsubs <- data.frame(
      "sampleIdentifier" = unlist(ids_by_region, use.names = FALSE))
    idsAcross <- data.frame(
      "sampleIdentifier" = pdata$sampleIdentifier,
      "individualIdentifier" = pdata$individualIdentifier)
    MSBBIDsAcross <- inner_join(allMSBBsubs, idsAcross,
                                by = "sampleIdentifier")
    saveRDS(MSBBIDsAcross, "./rawCohortData/allMSBBIDs.rds")

    subject_sets <- setNames(ids_by_region, paste0(regions, "ydge"))
  }

  # one DGEList per subject set: the whole cohort for ROSMAP/MAYO, one per
  # Brodmann area for MSBB
  for (ydge_name in names(subject_sets)) {
    sample_ids <- subject_sets[[ydge_name]]
    ydge <- DGEList(counts = counts[, sample_ids, drop = FALSE],
                    lib.size = colSums(counts[, sample_ids, drop = FALSE]),
                    samples = pdata[sample_ids, ],
                    group = pdata[sample_ids, "Source"])
    print(ydge_name)
    saveRDS(ydge, file = paste0("./rawYDGE/raw_", ydge_name, ".rds"))
  }

  invisible(NULL)
}

# The raw data come as three cohorts (ROSMAP/MAYO/MSBB); build and save the
# raw DGEList object(s) for each of them in turn.
cohorts <- c("ROSMAP", "MAYO", "MSBB")
invisible(lapply(cohorts, createYDGEAMPAD))
```
```{r}
# Read back the raw DGEList saved for every cohort. MSBB is now represented by
# its four brain regions, each stored under its region label, and every object
# is bound in the workspace as <label>ydge.
cohorts <- c("ROSMAP", "MAYO", "MSBBM10", "MSBBM22", "MSBBM36", "MSBBM44")
# "MSBBM10" -> "BM10" etc.; ROSMAP and MAYO are left as they are
for(cohort in ifelse(str_detect(cohorts, "MSBB"),
                     gsub('MSB', '', cohorts),
                     cohorts)){
  print(cohort)
  cohort_ydge_name <- paste0(cohort, "ydge")
  filename <- paste0("./rawYDGE/raw_", cohort_ydge_name, ".rds")
  raw_ydge <- readRDS(filename)
  assign(cohort_ydge_name, raw_ydge)
}
```
At the moment each cohort has the following number of subjects: \

ROSMAP: `r (length(ROSMAPydge$samples$lib.size))` \

MAYO: `r (length(MAYOydge$samples$lib.size))` \

MSBB BM10: `r (length(BM10ydge$samples$lib.size))` \

MSBB BM22: `r (length(BM22ydge$samples$lib.size))` \

MSBB BM36: `r (length(BM36ydge$samples$lib.size))` \

MSBB BM44: `r (length(BM44ydge$samples$lib.size))` \


```{r}
#we want to run each cohort through the QC pipeline
#
# Reads ./rawYDGE/raw_<name>ydge.rds, passes it to bulk_qc() (expected to come
# from the sourced bulkRNA_QC.R) and writes to ./QCpipelineResults/:
#   <name>QCresults.txt  - text summary of the values returned by bulk_qc()
#   QCd_ydge_<name>.rds  - y$YDGE after calcNormFactors()
#   v_<name>.rds         - voom() transform of that object
#
# cohort: "ROSMAP", "MAYO", or an MSBB region label such as "MSBBM10"
#         (reduced to "BM10" so that it matches the file names).
# Returns NULL invisibly; the function is called for the files it writes.
QCpipelineAMPAD <- function(cohort){
  if(str_detect(cohort, "MSBB")){
    cohort <- gsub('MSB', '', cohort) 
  }
  print(cohort)

  ### read in raw ydge for each dataset
  raw_ydge <- readRDS(paste0("./rawYDGE/raw_", cohort, "ydge", ".rds"))

  ### run through unified QC pipeline, running PCA separately 
  y <- bulk_qc(ydge = raw_ydge,
               pc.bygroup = TRUE,
               IQR_sub_threshold = 3,
               IQR_obs_threshold = 3,
               variance.prune = FALSE,
               high.prune = FALSE)

  # The first nine rows of y$returnparams become one ": "-separated line each,
  # followed by the pruning/subject summaries and the timestamp.
  params_t <- t(y$returnparams)
  param_lines <- vapply(
    1:9,
    function(i) paste(params_t[, i], collapse = ": "),
    character(1)
  )
  report <- c(param_lines,
              paste(y$returnprune),
              paste(y$returnprunehigh),
              paste(y$returnprunevar),
              paste(y$returnsub),
              paste(y$timestamp))
  # Given a path, writeLines() opens and closes the file itself, so no
  # connection is left dangling if building or writing the report fails.
  writeLines(report, con = paste0("./QCpipelineResults/", cohort, "QCresults.txt"))

  y_name <- paste0("QCd_ydge_", cohort)
  y2 <- calcNormFactors(y$YDGE)
  print(y_name)
  saveRDS(y2, file = paste0("./QCpipelineResults/", y_name, ".rds"))

  v_name <- paste0("v_", cohort)
  v <- voom(y2)
  print(v_name)
  saveRDS(v, file = paste0("./QCpipelineResults/", v_name, ".rds"))

  invisible(NULL)
}

# Push every cohort (MSBB split by brain region) through the QC pipeline.
cohorts <- c("ROSMAP", "MAYO", "MSBBM10", "MSBBM22", "MSBBM36", "MSBBM44")
invisible(lapply(cohorts, QCpipelineAMPAD))
```
```{r}
#load in new cohorts QC-ed data
# Binds QCd_ydge_<label> and v_<label> in the workspace for every cohort.
cohorts <- c("ROSMAP", "MAYO", "MSBBM10", "MSBBM22", "MSBBM36", "MSBBM44")
for(cohort in cohorts){
  # "MSBBM10" -> "BM10": the saved files carry only the region label
  if(str_detect(cohort, "MSBB")){
    cohort <- gsub('MSB', '', cohort) 
  }
  print(cohort)

  y_name <- paste0("QCd_ydge_", cohort)
  assign(y_name, readRDS(paste0("./QCpipelineResults/", y_name, ".rds")))

  # Read straight into the target name: keeping the object in a workspace
  # variable called `voom` would mask limma::voom().
  v_name <- paste0("v_", cohort)
  assign(v_name, readRDS(paste0("./QCpipelineResults/", v_name, ".rds")))
  print(v_name)
}
```
This now means we have:

ROSMAP: `r (length(v_ROSMAP$targets$lib.size))` \

(So we lost `r (length(ROSMAPydge$samples$lib.size)) - (length(v_ROSMAP$targets$lib.size))` subjects in QC)

MAYO: `r (length(v_MAYO$targets$lib.size))` \

(So we lost `r (length(MAYOydge$samples$lib.size)) - (length(v_MAYO$targets$lib.size))` subjects in QC)

MSBB BM10: `r (length(v_BM10$targets$lib.size))` \

(So we lost `r (length(BM10ydge$samples$lib.size)) - (length(v_BM10$targets$lib.size))` subjects in QC)

MSBB BM22: `r (length(v_BM22$targets$lib.size))` \

(So we lost `r (length(BM22ydge$samples$lib.size)) - (length(v_BM22$targets$lib.size))` subjects in QC)

MSBB BM36: `r (length(v_BM36$targets$lib.size))` \

(So we lost `r (length(BM36ydge$samples$lib.size)) - (length(v_BM36$targets$lib.size))` subjects in QC)

MSBB BM44: `r (length(v_BM44$targets$lib.size))` \

(So we lost `r (length(BM44ydge$samples$lib.size)) - (length(v_BM44$targets$lib.size))` subjects in QC)

```{r}
#we want to remove batch effect on each of the cohorts
#
# For one cohort this function:
#   1. reads the voom object ./QCpipelineResults/v_<name>.rds, recodes its
#      covariates to the classes needed for modelling and OVERWRITES that
#      file with the recoded object;
#   2. saves the removeBatchEffect() matrix to
#      ./finalCountMatrices/<name>_matrix.rds;
#   3. fits the cohort-specific linear model (robust lmFit + eBayes) and saves
#      lmod_<name>.rds, <name>_eb.rds and the LOAD top table <name>_tt.rds
#      (sorted by P.Value, with `se` and `gene` columns added) to
#      ./cohortQCMods/.
#
# cohort: "ROSMAP", "MAYO", or an MSBB region label such as "MSBBM10"
#         (reduced to "BM10" so that it matches the file names).
# Returns NULL invisibly; the function is called for the files it writes.
removeBatchAMPAD <- function(cohort){
  if(str_detect(cohort, "MSBB")){
    cohort <- gsub('MSB', '', cohort) 
  }
  print(cohort)

  is_msbb_region <- str_detect(cohort, "BM")
  if (!(cohort %in% c("ROSMAP", "MAYO") || is_msbb_region)) {
    stop("unrecognised cohort '", cohort,
         "': expected ROSMAP, MAYO or an MSBB region label", call. = FALSE)
  }

  voom_filename <- paste0("./QCpipelineResults/", "v_", cohort, ".rds")
  ### read in QCd data
  # NOTE: the local must stay named `voom`, because the design formula below
  # is written in terms of voom$targets$<var> and those strings end up in the
  # coefficient names of the saved fits.
  voom <- readRDS(voom_filename)
  matrix_name <- paste0(cohort, "_matrix")
  lmod_name <- paste0("lmod_", cohort)

  #set variables to correct class within each voom object
  if (cohort == "ROSMAP") {
    voom$targets <- within(voom$targets, {
      LOAD <- factor(ifelse(Diagnosis == "AD", 1, 0))
      sex <- as.factor(msex)
    })
    #variables to covary for
    all.covs <- c('lib.size', 'PCT_PF_READS_ALIGNED', 'PCT_CODING_BASES',
                  'PCT_INTERGENIC_BASES', 'PCT_INTRONIC_BASES',
                  'PCT_RIBOSOMAL_BASES', 'norm.factors', 'RINcontinuous',
                  'pmi')
    designvars <- c("msex", "age", "Batch", "PCT_PF_READS_ALIGNED",
                    "PCT_CODING_BASES", "PCT_INTERGENIC_BASES",
                    "PCT_INTRONIC_BASES", "PCT_RIBOSOMAL_BASES", "pmi",
                    "LOAD", "RINcontinuous")
    batch <- c('Batch')
    Acovs <- voom$targets[, all.covs]
  } else if (cohort == "MAYO") {
    voom$targets <- within(voom$targets, {
      msex <- as.factor(ifelse(Sex == "MALE", 1, 0))
      FLOWCELL <- as.factor(FLOWCELL)
      Source <- as.factor(Source)
      AgeAtDeath <- as.numeric(AgeAtDeath)
      LOAD <- factor(ifelse(Tissue.Diagnosis == "TCX.AD", 1, 0))
      # missing PMI values are replaced by the mean
      PMI <- as.numeric(impute(voom$targets$PMI, fun = mean))
    })
    #variables to covary for
    all.covs <- c('lib.size', 'PCT_PF_READS_ALIGNED', 'PCT_CODING_BASES',
                  'PCT_INTERGENIC_BASES', 'PCT_INTRONIC_BASES',
                  'PCT_RIBOSOMAL_BASES', 'norm.factors', 'RIN', 'PMI')
    designvars <- c("msex", "FLOWCELL", "AgeAtDeath", "PCT_PF_READS_ALIGNED",
                    "PCT_CODING_BASES", "PCT_INTERGENIC_BASES",
                    "PCT_INTRONIC_BASES", "PCT_RIBOSOMAL_BASES", "PMI",
                    "LOAD", "RIN")
    batch <- c('Source')
    # flowcell enters the batch correction as indicator columns
    Acovs <- cbind(voom$targets[, all.covs],
                   model.matrix( ~ voom$targets$FLOWCELL - 1))
  } else {
    # MSBB regions: drop the "+" from ageDeath so it can be made numeric
    voom$targets <- voom$targets %>%
      mutate(ageDeath = gsub('\\+', '', ageDeath))
    voom$targets <- within(voom$targets, {
      msex <- as.factor(ifelse(sex == "male", 1, 0))
      batch <- as.factor(batch)
      TotalReads <- as.numeric(TotalReads)
      Mapped <- as.numeric(Mapped)
      pmi <- as.numeric(pmi)
      AgeAtDeath <- as.numeric(ageDeath)
      LOAD <- factor(ifelse(Tissue.Diagnosis == ".AD", 1, 0))
    })
    #variables to covary for
    all.covs <- c("lib.size", "norm.factors", "RIN", "rRNA.rate",
                  "TotalReads", "Mapped", "pmi")
    designvars <- c("lib.size", "norm.factors", "msex", "batch",
                    "AgeAtDeath", "RIN", "rRNA.rate", "TotalReads", "Mapped",
                    "pmi", "LOAD")
    batch <- c('batch')
    Acovs <- cbind(voom$targets[, all.covs],
                   model.matrix( ~ voom$targets$batch - 1))
  }

  # persist the recoded covariates (this overwrites the input file)
  saveRDS(voom, voom_filename)

  # Design matrix ~ voom$targets$<v1> + voom$targets$<v2> + ... ; the formula
  # is built from text so the coefficient names keep the voom$targets$ prefix.
  design_formula <- as.formula(
    paste("~", paste0("voom$targets$", designvars, collapse = " + "))
  )
  design <- model.matrix(design_formula)

  #remove batch effect, without sex and age at death & no design matrix
  cohort_matrix <- removeBatchEffect(voom,
                                     batch = voom$targets[, batch],
                                     covariates = Acovs)
  saveRDS(cohort_matrix,
          file = paste0('./finalCountMatrices/', matrix_name, ".rds"))

  #save lmFit run 
  lmod <- lmFit(voom, design = design, method = "robust", maxit = 10000)
  saveRDS(lmod, file = paste0('./cohortQCMods/', lmod_name, ".rds"))

  eb <- eBayes(lmod, robust = TRUE)
  saveRDS(eb, file = paste0('./cohortQCMods/', paste0(cohort, "_eb"), ".rds"))

  coefnum <- grep("LOAD", colnames(eb))
  if (length(coefnum) != 1) {
    stop("expected exactly one LOAD coefficient in the fit, found ",
         length(coefnum), call. = FALSE)
  }
  # number = Inf returns every gene in its original order. A fixed cap would
  # truncate the table and make the row-wise `se` assignment below fail as
  # soon as more genes than the cap survive QC.
  tt <- topTable(eb, coef = coefnum, number = Inf, sort.by = "none")

  # calculate the standard error of LogFC
  tt$se <- eb$stdev.unscaled[, coefnum] * sqrt(eb$s2.post)

  # format the output to be most informative
  tt$gene <- rownames(tt)
  tt <- tt[order(tt$P.Value, decreasing = FALSE), ]

  # save toptable (tt) of all summary stats
  saveRDS(tt, file = paste0('./cohortQCMods/', paste0(cohort, "_tt"), ".rds"))

  invisible(NULL)
}

# Run the batch-removal / model-fitting step for every cohort; this writes the
# final count matrices plus the lmod and ebayes objects to disk.
cohorts <- c("ROSMAP", "MAYO", "MSBBM10", "MSBBM22", "MSBBM36", "MSBBM44")
invisible(lapply(cohorts, removeBatchAMPAD))

```


```{r}
#load in the batch-corrected matrices and model fits for each cohort
# Binds <label>_matrix and lmod_<label> in the workspace for every cohort.
cohorts <- c("ROSMAP", "MAYO", "MSBBM10", "MSBBM22", "MSBBM36", "MSBBM44")
for(cohort in cohorts){
  # "MSBBM10" -> "BM10": the saved files carry only the region label
  if(str_detect(cohort, "MSBB")){
    cohort <- gsub('MSB', '', cohort) 
  }
  print(cohort)

  # Read straight into the target name: keeping the object in a workspace
  # variable called `matrix` would mask base::matrix().
  matrix_name <- paste0(cohort, "_matrix")
  assign(matrix_name,
         readRDS(paste0('./finalCountMatrices/', matrix_name, ".rds")))

  lmod_name <- paste0("lmod_", cohort)
  assign(lmod_name, readRDS(paste0('./cohortQCMods/', lmod_name, ".rds")))
}
```