forked from meconsens/rctp
-
Notifications
You must be signed in to change notification settings - Fork 1
/
preprocessing.Rmd
407 lines (349 loc) · 15.6 KB
/
preprocessing.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
---
title: "Pre-processing"
output:
html_document:
df_print: paged
---
```{r}
#load necessary libraries
library(rms)
library(limma)
library(edgeR)
library(tidyverse)
library(dplyr)
library(magrittr)
library(stringr)
#load QC script
source("bulkRNA_QC.R")
```
Now that we've loaded in the necessary libraries we want to load in the raw count matrices and phenotype data for each of our cohorts from the Accelerating Medicines Partnership - Alzheimer's Disease (AMP-AD) consortium. There are three bulk-tissue RNAseq datasets, which sample from 6 different brain regions. The cohorts are the Religious Orders Study and Rush Memory and Aging Project cohort (delineated ROSMAP from here on) which has dorsolateral prefrontal cortex samples, the Mayo Clinic cohort, hereafter referred to as MAYO, which consists of temporal cortex samples and the Mount Sinai Brain Bank cohort (referred to as MSBB), which has expression data sampled from Brodmann area 10, 22, 36 and 44 (written BM10, BM22, BM36, BM44 hereafter).
```{r}
#function to create YDGE objects for each cohort in the study
createYDGEAMPAD <- function(cohort){
if (cohort == "ROSMAP"){
filename <- "./rawCohortData/ROSMAP_all_counts_matrix.txt.gz"
}
if(cohort =="MAYO"){
filename <- "./rawCohortData/Mayo_TCX_all_counts_matrix.txt.gz"
}
if(cohort == "MSBB"){
filename <- "./rawCohortData/MSSM_all_counts_matrix.txt.gz"
}
all_counts <- read.table(gzfile(filename),sep="\t")
colnames(all_counts) <- as.character(unlist(all_counts[1,]))
all_counts <- all_counts[-c(1:5),]
rownames(all_counts) <- gsub('\\.[0-9]*$', '', all_counts$feature)
counts <- all_counts[,-1]
counts[] <- lapply(counts, as.numeric)
if(cohort == "ROSMAP"){
# read in phenotype/covariate data
pdata <- read.table("./cohortMetadata/ageCensoredCovariates.tsv", sep="\t",header=T,check.names = F,stringsAsFactors = F)
rownames(pdata) <- pdata$SampleID
subjects_to_include <- pdata$SampleID[which(pdata$Diagnosis %in% c("AD","CONTROL"))]
}
if(cohort == "MAYO"){
# read in phenotype/covariate data
pdata <- read.table("./cohortMetadata/MAYO_CBE_TCX_Covariates.tsv", sep="\t",header=T,check.names = F,stringsAsFactors = F)
# Keep only AD cases and controls
rownames(pdata) <- pdata$SampleID
subjects_to_include <- pdata$SampleID[which(pdata$Tissue.Diagnosis %in% c("TCX.AD","TCX.CONTROL"))]
}
if(cohort == "MSBB"){
# read in phenotype/covariate data
pdata <- read.table("./cohortMetadata/MSBB_RNAseq_covariates_November2018Update.csv", sep=",",header=T,check.names = F,stringsAsFactors = F)
# "BM_22_", "BM_36_", "BM_10_", "BM_44" are 4 brodmann areas of msbb
phenodata <- read.table("./cohortMetadata/msbb_individual_metadata.csv", sep=",",header=T,check.names = F,stringsAsFactors = F)
phenodata <- phenodata %>%
dplyr::rename(
individualIdentifier = individualID,
)
pdata <- merge(pdata, phenodata, by="individualIdentifier")
#remove 272 samples as specified in syn8484987
samples<- read.table("./cohortMetadata/removesinai.txt", sep=",",header=T,check.names = F,stringsAsFactors = F)
for (sample in colnames(samples)){
pdata <- subset(pdata,
sampleIdentifier != sample)
}
#create diagnosis variable as specified in syn8484987
pdata$Diagnosis = 'OTHER'
#HAD TO MOD FROM OG CODE: NP.1 is now CERAD, bbscore is now Braak
pdata$Diagnosis[pdata$CDR <= 0.5 & pdata$Braak <= 3 & pdata$CERAD <= 1] = 'CONTROL'
pdata$Diagnosis[pdata$CDR >= 1 & pdata$Braak >= 4 & pdata$CERAD >= 2] = 'AD'
pdata$Tissue.Diagnosis = paste(pdata$Tissue, pdata$Diagnosis, sep = '.')
pdata <- pdata %>% filter(Diagnosis != "OTHER")
rownames(pdata) <- make.names(pdata$sampleIdentifier, unique= TRUE)
MSBBs <- c("MSBBBM10", "MSBBBM22", "MSBBBM36", "MSBBBM44")
#taking overlap of subjects described in each brodman area and found in all_countsmatrix, some
#samples described in msbb_individual_metadata.csv not found in all_countsmatrix
for(MSBB in MSBBs){
region <- gsub('MSBB', '', MSBB)
subjects_to_include <- unique(pdata$sampleIdentifier[which(pdata$BrodmannArea %in% c(region))])
subjects_to_include <- intersect(colnames(counts), subjects_to_include)
region_name <- paste0(region, "subjects_to_include")
print(region_name)
assign(region_name, subjects_to_include)
if(region == "BM10"){
subs <- data.frame("sampleIdentifier" = subjects_to_include)
}
else{
curr_subs <- data.frame("sampleIdentifier" = subjects_to_include)
subs <- rbind(subs, curr_subs)
}
}
idsAcross <- data.frame("sampleIdentifier" = pdata$sampleIdentifier, "individualIdentifier" = pdata$individualIdentifier)
allMSBBsubs <- subs
MSBBIDsAcross <- inner_join(allMSBBsubs, idsAcross)
saveRDS(MSBBIDsAcross, "./rawCohortData/allMSBBIDs.rds")
}
if(cohort == "ROSMAP" | cohort=="MAYO"){
cohort_ydge_name <- paste0(cohort, "ydge")
ydge <- DGEList(counts=counts[,subjects_to_include],
lib.size=colSums(counts[,subjects_to_include]),
samples=pdata[subjects_to_include,],
group=pdata[subjects_to_include,"Source"])
print(cohort_ydge_name)
assign(cohort_ydge_name, ydge)
saveRDS(ydge, file=paste0("./rawYDGE/raw_", cohort_ydge_name, ".rds"))
}
else{
for(MSBB in MSBBs){
region <- gsub('MSBB', '', MSBB)
region_name <- paste0(region, "subjects_to_include")
ydge <- DGEList(counts=counts[,get(region_name)],
lib.size=colSums(counts[,get(region_name)]),
samples=pdata[get(region_name),],
group=pdata[get(region_name),"Source"])
region_ydge_name <- paste0(region, "ydge")
print(region_ydge_name)
assign(region_ydge_name, ydge)
saveRDS(ydge, file=paste0("./rawYDGE/raw_", region_ydge_name, ".rds"))
}
}
}
#cohorts are split into ROSMAP/MAYO/MSBB
cohorts <- c("ROSMAP", "MAYO", "MSBB")
for(cohort in cohorts){
createYDGEAMPAD(cohort)
}
```
```{r}
#load in new cohorts raw ydge created by splitting MSBB into brain regions
cohorts <- c("ROSMAP", "MAYO", "MSBBM10", "MSBBM22", "MSBBM36", "MSBBM44")
for(cohort in cohorts){
if(str_detect(cohort, "MSBB")){
cohort <- gsub('MSB', '', cohort)
}
print(cohort)
cohort_ydge_name <- paste0(cohort, "ydge")
filename <- paste0("./rawYDGE/raw_", cohort_ydge_name, ".rds")
raw_ydge <- readRDS(filename)
assign(cohort_ydge_name, raw_ydge)
}
```
At the moment each cohort has the following number of subjects: \
ROSMAP: `r (length(ROSMAPydge$samples$lib.size))` \
MAYO: `r (length(MAYOydge$samples$lib.size))` \
MSBB BM10: `r (length(BM10ydge$samples$lib.size))` \
MSBB BM22: `r (length(BM22ydge$samples$lib.size))` \
MSBB BM36: `r (length(BM36ydge$samples$lib.size))` \
MSBB BM44: `r (length(BM44ydge$samples$lib.size))` \
```{r}
#we want to run each cohort through the QC pipeline
QCpipelineAMPAD <- function(cohort){
if(str_detect(cohort, "MSBB")){
cohort <- gsub('MSB', '', cohort)
}
print(cohort)
cohort_ydge_name <- paste0(cohort, "ydge")
filename <- paste0("./rawYDGE/raw_", cohort_ydge_name, ".rds")
### read in raw ydge for each dataset
raw_ydge <- readRDS(filename)
### run through unified QC pipeline, running PCA separately
PCBYGROUP <- T
SUBTHRES <- 3
IQROBSTHRES <- 3
y <- bulk_qc(ydge=raw_ydge,
pc.bygroup = PCBYGROUP,
IQR_sub_threshold = SUBTHRES,
IQR_obs_threshold = IQROBSTHRES,
variance.prune = F,
high.prune = F)
fileConn<-file(paste0("./QCpipelineResults/",cohort,"QCresults.txt"))
writeLines(
c(paste(t(y$returnparams)[,1],collapse=": "),
paste(t(y$returnparams)[,2],collapse=": "),
paste(t(y$returnparams)[,3],collapse=": "),
paste(t(y$returnparams)[,4],collapse=": "),
paste(t(y$returnparams)[,5],collapse=": "),
paste(t(y$returnparams)[,6],collapse=": "),
paste(t(y$returnparams)[,7],collapse=": "),
paste(t(y$returnparams)[,8],collapse=": "),
paste(t(y$returnparams)[,9],collapse=": "),
paste(y$returnprune),
paste(y$returnprunehigh),
paste(y$returnprunevar),
paste(y$returnsub),
paste(y$timestamp)),
fileConn)
close(fileConn)
y_name <- paste0("QCd_ydge_", cohort)
y2 <- calcNormFactors(y$YDGE)
assign(y_name, y2)
print(y_name)
saveRDS(y2,file=paste0("./QCpipelineResults/",y_name, ".rds"))
v_name <- paste0("v_", cohort)
v <- voom(y2)
assign(v_name, v)
print(v_name)
saveRDS(v,file=paste0("./QCpipelineResults/",v_name, ".rds"))
}
#run QC on each cohort
cohorts <- c("ROSMAP", "MAYO", "MSBBM10", "MSBBM22", "MSBBM36", "MSBBM44")
for(cohort in cohorts){
QCpipelineAMPAD(cohort)
}
```
```{r}
#load in new cohorts QC-ed data
cohorts <- c("ROSMAP", "MAYO", "MSBBM10", "MSBBM22", "MSBBM36", "MSBBM44")
for(cohort in cohorts){
if(str_detect(cohort, "MSBB")){
cohort <- gsub('MSB', '', cohort)
}
print(cohort)
y_name <- paste0("QCd_ydge_", cohort)
filename <- paste0("./QCpipelineResults/",y_name, ".rds")
y <- readRDS(filename)
assign(y_name, y)
v_name <- paste0("v_", cohort)
filename <- paste0("./QCpipelineResults/",v_name, ".rds")
voom <- readRDS(filename)
assign(v_name, voom)
print(v_name)
}
```
This now means we have:
ROSMAP: `r (length(v_ROSMAP$targets$lib.size))` \
(So we lost `r (length(ROSMAPydge$samples$lib.size)) - (length(v_ROSMAP$targets$lib.size))` subjects in QC)
MAYO: `r (length(v_MAYO$targets$lib.size))` \
(So we lost `r (length(MAYOydge$samples$lib.size)) - (length(v_MAYO$targets$lib.size))` subjects in QC)
MSBB BM10: `r (length(v_BM10$targets$lib.size))` \
(So we lost `r (length(BM10ydge$samples$lib.size)) - (length(v_BM10$targets$lib.size))` subjects in QC)
MSBB BM22: `r (length(v_BM22$targets$lib.size))` \
(So we lost `r (length(BM22ydge$samples$lib.size)) - (length(v_BM22$targets$lib.size))` subjects in QC)
MSBB BM36: `r (length(v_BM36$targets$lib.size))` \
(So we lost `r (length(BM36ydge$samples$lib.size)) - (length(v_BM36$targets$lib.size))` subjects in QC)
MSBB BM44: `r (length(v_BM44$targets$lib.size))` \
(So we lost `r (length(BM44ydge$samples$lib.size)) - (length(v_BM44$targets$lib.size))` subjectsin QC)
```{r}
#we want to remove batch effort on each of the cohorts
removeBatchAMPAD <- function(cohort){
if(str_detect(cohort, "MSBB")){
cohort <- gsub('MSB', '', cohort)
}
print(cohort)
cohort_QC_ydge_name <- paste0("QCd_ydge_", cohort)
QC_filename <- paste0("./QCpipelineResults/",cohort_QC_ydge_name, ".rds")
voom_filename <- paste0("./QCpipelineResults/", "v_", cohort, ".rds")
### read in QCd data
QC_ydge <- readRDS(QC_filename)
voom <- readRDS(voom_filename)
matrix_name <- paste0(cohort, "_matrix")
lmod_name <- paste0("lmod_" ,cohort)
#set variables to correct class within each voom object
if (cohort == "ROSMAP"){
voom$targets <- within(voom$targets,{
LOAD <- factor(ifelse(Diagnosis=="AD",1,0))
sex <- as.factor(msex)
})
#variables to covary for
all.covs <- c('lib.size', 'PCT_PF_READS_ALIGNED', 'PCT_CODING_BASES', 'PCT_INTERGENIC_BASES', 'PCT_INTRONIC_BASES', 'PCT_RIBOSOMAL_BASES', 'norm.factors', 'RINcontinuous', 'pmi')
designvars <- c("msex","age","Batch", "PCT_PF_READS_ALIGNED", "PCT_CODING_BASES","PCT_INTERGENIC_BASES", "PCT_INTRONIC_BASES", "PCT_RIBOSOMAL_BASES", "pmi","LOAD","RINcontinuous")
batch <- c('Batch')
Acovs <- voom$targets[,all.covs]
}
if (cohort == "MAYO"){
voom$targets <- within(voom$targets,{
msex <- as.factor(ifelse(Sex=="MALE",1,0))
FLOWCELL <- as.factor(FLOWCELL)
Source <- as.factor(Source)
AgeAtDeath <- as.numeric(AgeAtDeath)
LOAD <- factor(ifelse(Tissue.Diagnosis=="TCX.AD",1,0))
PMI <- as.numeric(impute(voom$targets$PMI,fun = mean))
})
#variables to covary for
all.covs =c('lib.size', 'PCT_PF_READS_ALIGNED', 'PCT_CODING_BASES', 'PCT_INTERGENIC_BASES', 'PCT_INTRONIC_BASES', 'PCT_RIBOSOMAL_BASES', 'norm.factors', 'RIN','PMI')
designvars <- c("msex","FLOWCELL", "AgeAtDeath","PCT_PF_READS_ALIGNED", "PCT_CODING_BASES","PCT_INTERGENIC_BASES", "PCT_INTRONIC_BASES", "PCT_RIBOSOMAL_BASES", "PMI","LOAD","RIN")
batch <- c('Source')
Acovs <- cbind(voom$targets[,all.covs],model.matrix( ~ voom$targets$FLOWCELL-1))
}
if(str_detect(cohort, "BM")){
voom$targets <- voom$targets %>% mutate(ageDeath = gsub('\\+','',ageDeath))
voom$targets <- within(voom$targets,{
msex <- as.factor(ifelse(sex=="male",1,0))
batch <- as.factor(batch)
TotalReads <- as.numeric(TotalReads)
Mapped <- as.numeric(Mapped)
pmi <- as.numeric(pmi)
AgeAtDeath <- as.numeric(ageDeath)
LOAD <- factor(ifelse(Tissue.Diagnosis==".AD",1,0))
})
#variables to covary for
all.covs =c("lib.size", "norm.factors","RIN", "rRNA.rate","TotalReads", "Mapped", "pmi")
designvars <-c("lib.size", "norm.factors", "msex","batch", "AgeAtDeath","RIN", "rRNA.rate","TotalReads", "Mapped", "pmi","LOAD")
batch <- c('batch')
Acovs <- cbind(voom$targets[,all.covs],model.matrix( ~ voom$targets$batch - 1))
}
saveRDS(voom, voom_filename)
designtext <- paste("model.matrix( ~ ",paste("voom$targets$",designvars,sep="",collapse=" + ")," )",sep="")
design <- eval(parse(text=designtext))
#remove batch effect, without sex and age at death & no design matrix
cohort_matrix <- removeBatchEffect(voom,
batch = voom$targets[, batch],
covariates= Acovs)
assign(matrix_name, cohort_matrix)
saveRDS(cohort_matrix, file=paste0('./finalCountMatrices/', matrix_name, ".rds"))
#save lmFit run
lmod <- lmFit(voom,design= design,method="robust",maxit=10000)
assign(lmod_name, lmod)
saveRDS(lmod, file=paste0('./cohortQCMods/', lmod_name, ".rds"))
eb <- eBayes(lmod,robust = T)
eb_name <- paste0(cohort, "_eb")
assign(eb_name, eb)
saveRDS(eb,file=paste0('./cohortQCMods/', eb_name, ".rds"))
coefnum <- grep("LOAD",colnames(eb))
tt <- topTable(eb,coef=coefnum,20000,sort.by = "none")
# calculate the standard error of LogFC
tt$se <- eb$stdev.unscaled[,coefnum]*sqrt(eb$s2.post)
# format the output to be most informative
tt$gene <- rownames(tt)
tt <- tt[order(tt$P.Value,decreasing=F),]
tt_name <- paste0(cohort, "_tt")
assign(tt_name, tt)
# save toptable (tt) of all summary stats
saveRDS(tt,file=paste0('./cohortQCMods/', tt_name, ".rds"))
}
#this will give us the final count matrices, lmod and ebayes objects for each cohort
cohorts <- c("ROSMAP", "MAYO", "MSBBM10", "MSBBM22", "MSBBM36", "MSBBM44")
for(cohort in cohorts){
removeBatchAMPAD(cohort)
}
```
```{r}
#load in new cohorts QC-ed data
cohorts <- c("ROSMAP", "MAYO", "MSBBM10", "MSBBM22", "MSBBM36", "MSBBM44")
for(cohort in cohorts){
if(str_detect(cohort, "MSBB")){
cohort <- gsub('MSB', '', cohort)
}
print(cohort)
matrix_name <- paste0(cohort, "_matrix")
lmod_name <- paste0("lmod_" ,cohort)
filename <- paste0('./finalCountMatrices/', matrix_name, ".rds")
matrix <- readRDS(filename)
assign(matrix_name, matrix)
filename <- paste0('./cohortQCMods/', lmod_name, ".rds")
lmod <- readRDS(filename)
assign(lmod_name, lmod)
}
```