-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvariancePartition.R
191 lines (130 loc) · 5.17 KB
/
variancePartition.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
## ----knitr, echo=FALSE, results='hide'----------------------------------------
library("knitr")
opts_chunk$set(tidy=FALSE,dev="pdf",fig.show="hide",
fig.width=4,fig.height=4.5,
message=FALSE, warning=FALSE)
## ----options, results="hide", echo=FALSE--------------------------------------
options(digits=3, width=80, prompt=" ", continue=" ")
## ----simResult, cache=TRUE, results='hide', fig.height=4, fig.width=4---------
# load library
library('variancePartition')
library("lme4")
# load simulated data:
# geneExpr: matrix of gene expression values
# info: information/metadata about each sample
geneExpr1 <- read.csv("IMMVAR_Refine2.csv",row.names = 1)
ncol(geneExpr1)
info1 <- read.csv("info_immVar1.csv")
nrow(info1)
info1$Sex <- as.factor(info1$Sex)
info1$Batch <- as.factor(info1$Batch)
info1$cellType <- as.factor(info1$cellType)
info1$Individual <- as.factor(info1$Individual)
nrow(info1) == ncol(geneExpr1)
## Analyzing the simulated format and variable in the package (variancePartition)
info_sim <- read.csv("info_simulated.csv")
nrow(info_sim)
imm_sim <- read.csv("geneExpr_simulated.csv")
ncol(imm_sim)
#data(varPartData)
# Specify variables to consider
# Age is continuous so model it as a fixed effect
# Individual and Tissue are both categorical,
# so model them as random effects
# Note the syntax used to specify random effects
form1 <- ~ Age + (1|Individual) + (1|Sex) + (1|cellType) + (1|Batch)
# Fit model and extract results
# 1) fit linear mixed model on gene expression
# If categorical variables are specified,
# a linear mixed model is used
# If all variables are modeled as fixed effects,
# a linear model is used
# each entry in results is a regression model fit on a single gene
# 2) extract variance fractions from each model fit
# for each gene, returns fraction of variation attributable
# to each variable
# Interpretation: the variance explained by each variables
# after correcting for all other variables
# Note that geneExpr can either be a matrix,
# and EList output by voom() in the limma package,
# or an ExpressionSet
varPart1 <- fitExtractVarPartModel( geneExpr1, form1, info1 )
# sort variables (i.e. columns) by median fraction
# of variance explained
vp1 <- sortCols( varPart1 )
# Figure 1a
# Bar plot of variance fractions for the first 10 genes
plotPercentBars( vp1[1:10,] )
#
# Figure 1b
# violin plot of contribution of each variable to total variance
plotVarPart( vp1 )
## ----accessResults, cache=TRUE, warning=FALSE---------------------------------
# Access first entries
head(varPart1)
# Access first entries for Individual
head(varPart1$Individual)
# sort genes based on variance explained by Individual
head(varPart1[order(varPart1$Individual, decreasing=TRUE),])
## ----plotStratify, cache=TRUE, warning=FALSE, fig.height=4, fig.width=4-------
# get gene with the highest variation across Tissues
# create data.frame with expression of gene i and Sex
# type for each sample
i <- which.max( varPart1$Sex )
i
GE1 <- data.frame(t(geneExpr1[i,]), Sex = info1$Sex)
summary(GE1)
colnames(GE1)[1] <- "Expression"
ppp = rownames(geneExpr1)[i]
ppp
# plot expression stratified by Sex
plotStratify(Expression ~ Sex,GE1,ppp)
### For cellType
i <- which.max( varPart1$cellType )
i
GE3 <- data.frame(t(geneExpr1[i,]), cellType = info1$cellType)
colnames(GE3)[1] <- "Expression"
summary(GE3)
ppp1 = rownames(geneExpr1)[i]
ppp1
# plot expression stratified by cellType
plotStratify(Expression ~ cellType,GE3,ppp1)
#
# get gene with the highest variation across Individuals
# create data.frame with expression of gene i and Tissue
# type for each sample
i <- which.max( varPart1$Individual )
GE2 <- data.frame(t(geneExpr1[i,]),
Individual = info1$Individual)
colnames(GE2)[1] <- "Expression"
# Figure 2b
# plot expression stratified by Tissue
label <- paste("Individual:", format(varPart1$Individual[i]*100,
digits=3), "%")
main <- rownames(geneExpr1)[i]
main
plotStratify( Expression ~ Individual, GE2, colorBy=NULL,
text=label, main=main)
## ----cache=TRUE---------------------------------------------------------------
library('lme4')
geneExpr1[1,]
# fit regression model for the first gene
form_test <- geneExpr1[1,] ~ Age + (1|Individual) + (1|Sex)
fit <- lmer(form_test, info1, REML=FALSE )
# extract variance statistics
calcVarPart(fit)
## ----canCorPairs, cache=TRUE, results='hide', fig.width=5, fig.height=5-------
form <- ~ Individual + Sex + cellType + Age
# Compute Canonical Correlation Analysis (CCA)
# between all pairs of variables
# returns absolute correlation value
C = canCorPairs( form, info1)
# Plot correlation matrix
plotCorrMatrix( C )
## ----vpInteraction, echo=TRUE, cache=TRUE, results='hide', fig.width=4, fig.height=4----
form <- ~ (1|Individual) + Age + (1|Sex) + (1|Batch) + (1|cellType) +
(1|Batch:Sex)
# fit model
vpInteraction <- fitExtractVarPartModel( geneExpr1, form, info1 )
plotVarPart( sortCols( vpInteraction ) )
########################################################################