forked from sjackman/stat540-project
-
Notifications
You must be signed in to change notification settings - Fork 0
/
lme_apl.R
142 lines (121 loc) · 5.13 KB
/
lme_apl.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# Fit a linear mixed model
setwd('~/UBC Stats/STAT540/Group Project/')
load('Data/CPGI2Probe_MList.Rdata')
source('coord_to_gene.R')
library(latticeExtra)
library(lme4)
library(plyr)
library(RColorBrewer)
library(reshape2)
library(VennDiagram)
library(nlme)
# Bind the datasets into one data frame.
cpgi.probes.M.dat <- with(cpgi.probes.M,
cbind(ALL[-ncol(ALL)], APL[-ncol(APL)], CTRL))
rm(cpgi.probes.M)
################################################################################
#### APL vs. Control
################################################################################
#Rename rows (APL = PLP + PLR)
substr(names(cpgi.probes.M.dat)[grepl('PLP|PLR', names(cpgi.probes.M.dat))], 1, 3) <-
'APL'
# Take a subset of the data.
data <- t(subset(cpgi.probes.M.dat,
select = grepl('HBC|APL', names(cpgi.probes.M.dat))))
cpgi <- cpgi.probes.M.dat[, 'cpgi', drop=FALSE]
rm(cpgi.probes.M.dat)
# Specify the design matrix.
design <- data.frame(
Group=relevel(ref='HBC', factor(substr(rownames(data), 1, 3))),
row.names=rownames(data))
# Reshape the data.
tall <- melt(cbind(design, data), id.vars=colnames(design),
variable.name = 'probe', value.name = 'M')
tall$cgi <- cpgi[tall$probe,'cpgi']
# Fit a linear model.
lm.t <- ddply(tall, .(cgi), .progress='text', .fun=function(x)
coef(summary(lm(M ~ Group, x)))['GroupALL', c('t value', 'Pr(>|t|)')])
colnames(lm.t) <- c('cgi', 't', 'p')
lm.t$q <- p.adjust(lm.t$p, 'BH')
# Write the results to a file.
write.table(lm.t, 'Data/lm.tab')
# Write the gene set to a file.
lm.geneset <- unique(coordToGene(as.character(
subset(lm.t, subset=q<1e-5, select=cgi, drop=TRUE))))
writeLines(lm.geneset, 'Data/lm-geneset.txt')
lme_REML_topTable <- function(tall.data, coefName, size.each = 1500){
ids <- as.character(unique(cpgi[,1]))
ids_list <- split(ids, ceiling(1:length(ids)/size.each))
res.list <- vector('list', length(ids_list))
for(i in 1:length(ids_list)){
print(i)
tall.sub <- subset(tall.data, cgi %in% ids_list[[i]])
tall.sub <- droplevels(tall.sub)
lme.t_reml <- ddply(tall.sub, .(cgi), .progress='text', .fun=function(x)
coef(summary(lmer(M ~ Group + (1|probe), x)))[coefName, c('Estimate','t value')])
colnames(lme.t_reml) <- c('cgi', 'Estim', 't')
res.list[[i]] <- lme.t_reml
}
return(do.call(rbind, res.list))
}
# Fit a linear mixed-effects model.
lme.t_reml <- ddply(tall, .(cgi), .progress='text', .fun=function(x)
coef(summary(lmer(M ~ Group + (1|probe), x)))['GroupALL', 't value'])
colnames(lme.t_reml) <- c('cgi', 't')
# Write the results to a file.
write.table(lme.t_reml, 'Data/lme.tab')
# Another linear mixed-effects model, with 'nlme' package in order to apply ML
# and return p-values
### I found that with very large data sets, ddply gets very slow. After a quick test, I found that it looked to get exponentially slower with the size of the data set. So I will attempt to do this by chopping up the data frame into pieces (10, for ~2500 genes each) and running the same part on each
### Use this function: it will do the above. You need to specify coefName as it would appear in the result summary, e.g. "GroupALL" or "GroupAPL"
lme_ML_topTable <- function(tall.data, coefName){
ids <- as.character(unique(cpgi[,1]))
ids_list <- split(ids, ceiling(1:length(ids)/2500))
res.list <- vector('list', length(ids_list))
for(i in 1:length(ids_list)){
print(i)
tall.sub <- subset(tall.data, cgi %in% ids_list[[i]])
tall.sub <- droplevels(tall.sub)
lme.t_ml <- ddply(tall.sub, .(cgi), .progress='text', .fun = function(x){
res <- lme(M ~ Group, data = x, random = ~1 | probe, method = 'ML', na.action = 'na.omit')
summary(res)$tTable[coefName, c('Value', 't-value', 'p-value')]
})
res.list[[i]] <- lme.t_ml
}
return(do.call(rbind, res.list))
}
# Fit the LME
lme.t_ml_apl <- lme_ML_topTable(tall, 'GroupAPL')
# Write to table
write.table(lme.t_ml_apl, 'Data/lme_ml_apl.tab')
# Write the gene set to a file.
lme.geneset <- unique(coordToGene(as.character(
subset(lme.t, subset = abs(t) > 10, select = cgi, drop = TRUE))))
writeLines(lme.geneset, 'Data/lme-geneset.txt')
# Plot a Venn diagram of the overlap of the fixed and mixed models.
plot.new()
grid.draw(venn.diagram(list(
LinearModel = lm.geneset,
LinearMixedEffectsModel = lme.geneset),
filename=NULL, fill=c('red', 'blue')))
# Plot the denisty of the q-values of the linear model.
densityplot(lm.t$q,
main='Density of q-values of the linear model',
xlab='q-value')
# Plot the denisty of the t-statistic of the linear mixed-effects model.
densityplot(lme.t$t,
main='Density of the t-statistic of the linear mixed-effects model',
xlab='t-statistic')
# Plot the M-values of a CpG island.
stripplot(M ~ Group | probe, tall, group = Group,
auto.key = TRUE, type = c('p', 'a'),
subset = cgi == rownames(lme.t)[1])
# A similar plot.
mycol <- brewer.pal(7, 'Set1')[c(1,5,2,7)]
mycol <- rgb(t(col2rgb(mycol)), alpha = 180, maxColorValue=255)
my.par <- list(superpose.symbol = list(col = mycol, pch = 16))
stripplot(M ~ Group | probe, tall, groups = Group,
par.settings = my.par,
auto.key = TRUE, jitter = TRUE,
layout = c(length(unique(tall$probe)), 1),
subset = cgi == rownames(lme.t)[1])