-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRDA.R
executable file
·368 lines (305 loc) · 18.1 KB
/
RDA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
library(dplyr)
library(magrittr)
library(data.table)
library(WGCNA)
library(vegan)
library(ggplot2)
library(robust)
library(qvalue)
args = commandArgs(trailingOnly=TRUE)
#################################
LFMM_imp = args[1]
PREDICTORS = read.table(args[2], sep = '\t', header = T)
COVARIATES = read.table(args[3], sep = '\t', header = T)
SNPS = fread(args[4], sep = ' ', header = T)
OUTSUFFIX = args[5]
#################################
#### Function to conduct a RDA based genome scan from Capblancq et al. (2018)
rdadapt <- function(rda,K)
{
zscores<-rda$CCA$v[,1:as.numeric(K)]
resscale <- apply(zscores, 2, scale)
resmaha <- covRob(resscale, distance = TRUE, na.action= na.omit, estim="pairwiseGK")$dist
lambda <- median(resmaha)/qchisq(0.5,df=K)
reschi2test <- pchisq(resmaha/lambda,K,lower.tail=FALSE)
qval <- qvalue(reschi2test)
q.values_rdadapt<-qval$qvalues
return(data.frame(p.values=reschi2test, q.values=q.values_rdadapt))
}
FUN_rda <- function(LFMM_imp, # X
predictors, # Y
covariates, # Z -> NULL if no correction
snps,
model.name = "RDA.model",
K = ncol(predictors), # number of RDA takes into analysis (too many may lead to overfiting)
pval = 0.05,# before bonferonni correction
sd_num = 3, # only for situation with 1 predictor
permutation_num = 999,
only_basic = F,
check_significance = F){ # long execution
# Load imput lfmm
message('READ LFMM: START')
lfmm_imp <-
fread(LFMM_imp)
message('READ LFMM: FINISH')
# Run model
message('RDA: START')
if(is.null(covariates)){ # No correction for pop structure
message('RDA: WITH NO CORRECTION')
formul = paste0(
'lfmm_imp ~ ', # response
paste(colnames(predictors), collapse = ' + ')) # predictors
message(formul[[1]])
rda.p <- rda(formula(formul), data = predictors)
} else { # With correction for covariates
message('RDA: WITH CORRECTION')
formul = formula(paste0(
'lfmm_imp ~', # response
paste(colnames(predictors), collapse = ' + '), # predictors
' + Condition(', paste0(colnames(covariates), collapse = ' + '), ')')) # Covariates
rda.p <- rda(formul, data = cbind(predictors, covariates))
}
# stopifnot(!is.null(rda.p))
message(paste0('RDA MODEL IS NULL: ', is.null(rda.p)))
message('RDA: FINISH')
# lfmm_imp ~ predictors + Condition(covariates))
message('RDA OUTLIERS DETECTION: START')
# Extract data
data <- scores(rda.p, type="n", scaling=3) %>% lapply(as.data.frame)
# Outlier detection
if(ncol(predictors) > 1){
message(paste0('RDA: ', ncol(predictors), ' predictors and ', K, 'RDAs'))
rdadapt.res <- rdadapt(rda.p, K)
gLOADINGS_HIST = NULL #####################
} else { # If we have only 1 predictors we can't use function rdadapt, and estimate outliers and p.values in another way
message('RDA: 1 predictor')
load.rda <- rda.p$CCA$v[,1]
gLOADINGS_HIST <- ##################
as.ggplot(function() hist(load.rda, main="Loadings on RDA1") )
# outliers_sd <- loadings_outliers(load.rda, sd_num) # 38
# Calculate z-scores
z_scores <- scale(load.rda, center = mean(load.rda), scale = sd(load.rda))
# Calculate two-tailed p-values - min between left and right tailed multiplied by 2
p_values_two_tailed <- 2 * pmin(pnorm(z_scores, lower.tail = TRUE),
pnorm(z_scores, lower.tail = FALSE))
rdadapt.res <- data.frame(p.values = p_values_two_tailed[,1])
}
## Use Bonferroni for correction
## P-values threshold after Bonferroni correction
rdadapt.thres <- pval / length(rdadapt.res$p.values)
# Use FDR
# rdadapt.thres <- rdadapt.res$p.values[rdadapt.res$q.values < pval] %>% max
## Identifying the loci that are below the p-value threshold
outliers <- data.frame(Loci = colnames(lfmm_imp)[which(rdadapt.res$p.values < rdadapt.thres)],
p.value = rdadapt.res$p.values[which(rdadapt.res$p.values < rdadapt.thres)])
# save for 3d biplot
locus_scores <- scores(rda.p, choices=c(1:K), display="species", scaling="none") # extract again but without scaling?
TAB_loci <- data.frame(Loci = row.names(locus_scores), locus_scores)
TAB_loci <- ##############
TAB_loci %>%
dplyr::mutate(p.value = rdadapt.res$p.values,
q.value = qvalue(rdadapt.res$p.values)$qvalue) %>%
dplyr::mutate(type = ifelse(p.value < rdadapt.thres, 'Outlier', 'Neutral') %>% as.factor) %>%
cbind(snps, .) #%T>%
# write.table(paste0(OUTTABLES, 'RDA_SNPs_shpere_', model.name, '.tsv'), quote = F, row.names = F, col.names = T, sep = '\t')
if(only_basic){
return(TAB_loci)
}
message(paste0('RDA OUTLIERS DETECTION: FINISH: ', ncol(outliers)))
message('RDA MODEL ASSESMENT: START')
# Assessment
## screeplot
message('RDA MODEL ASSESMENT: SCREEPLOT')
# gSCREE <- as.ggplot(function() screeplot(rda.p, main="Eigenvalues of constrained axes")) #######################
message('RDA MODEL ASSESMENT: CUMSUM')
## Cumsum - to Decide how many RDAs to use
## Note that choosing too many axes may result in overfitting
# rda.eigv <- a$y # save this it's the eigenvalues
# prop_var <- round(rda.eigv / sum(rda.eigv), 3)
#
# gCUMSUM <- as.ggplot(function() { #############################
# plot(cumsum(prop_var), xlab="Number of RDA axes", # plot cumsum of var explained of each RDAs
# ylab="Cumulative percent of variation explained", ylim=c(0,1))
# })
gSCREE <- NULL #TODO
gCUMSUM <- NULL
# VIF
message('RDA MODEL ASSESMENT: VIF')
vif <- vif.cca(rda.p) #TODO returт it somewhere
message('RDA MODEL ASSESMENT: REVERSE PREDICT')
# Predict predictors values de novo :)
pred.names <- rda.p$CCA$biplot %>% row.names # extract names of predictors in the same order as in the model (probably it's always the same as in 'predictors' input argument, but it's reinsurance)
# biplot.RDAs.number = summary(rda.p$model)$biplot %>% ncol
reverse.predict <-
mclapply(1:K, function(n){
traitPred <- rda_trait_pred(rda.p, n, K) # second argument is the row in rda.pc$CCA$biplot
# Empirical study could compare an empirically measured trait value
# to the RDA-predicted trait value to test how well landscape genomic data
# can predict functional traits
cor.value = cor(predictors[[n]], traitPred)
gCORPLOT = as.ggplot(function() {
plot(predictors[[n]], traitPred, xlab = pred.names[n],
ylab="RDA trait prediction", main = paste0('Correlation: ', round(cor.value, 3)))
abline(0,1)
})
return(gCORPLOT)
}, mc.cores = CPU) %>% setNames(pred.names)
message('RDA MODEL ASSESMENT: FINISH')
if(ncol(predictors) > 1 & length(outliers$Loci) > 1){
message('RDA LOCI COEFFICIENT ESTIMATION: START')
# Coefficients
## It may be interesting for some studies to understand how each locus is shaped by the environment - in other words, what are the slopes associated with the environmental variables in the multiple regression model for each locus?
## Unfortunately there is not a way to output these slopes in the R package vegan, but we can reproduce the first step of the RDA to get the regression coefficients: (vegan source code at https://github.com/cran/vegan/blob/master/R/simpleRDA2.R)
# Decomposition
Q = qr(predictors, tol = 1e-6)
# Get the matrix of regression coefficients
Qr.coeff <- qr.coef(Q, as.matrix(lfmm_imp))
# This matrix has each SNP in a column and the regression coefficients for that SNP corresponds to each environmental variable. This is the step that is not performed in the `vegan` package - the package skips directly to predicting the fitted values, on which the ordination is performed.
# We can visualize the regression coefficients with a heatmap for candidate loci
# Extract coefficients and arrange
rda.coeff <- #################
Qr.coeff[, outliers$Loci] %>%
t %>%
as.data.table %>%
dplyr::mutate(SUM = rowSums(abs(.)),
snp = outliers$Loci) %>%
dplyr::arrange(desc(SUM)) %>%
dplyr::select(snp, everything())
## Based on this range choose color breaks for heatmap
coef_range <-
Qr.coeff[, outliers$Loci] %>%
as.numeric %>%
summary
brks <- seq(coef_range[1], coef_range[6], by=0.05) #set the color scale
message('RDA LOCI COEFFICIENT ESTIMATION: FINISH')
gCOEFHEATMAP <- ###############
as.ggplot(function() {
heatmap.2(t(Qr.coeff[, outliers$Loci]),
scale="none",
col = cm.colors(length(brks)-1),
breaks=brks,
dendrogram = "both",
Rowv=TRUE,
trace="none",
key.title = "Coefficient in multiple\nregression model",
ylab="SNPs",
cexCol=1)
})
} else {
gCOEFHEATMAP = NULL
rda.coeff = NULL
}
message('RDA ANALYSING RESULTS: FINISH')
# Significance
# We can **assess both the full model and each constrained axis using F-statistics** (Legendre et al, 2010).
# The output looks a lot like an ANOVA table and the function is even called anova, but in fact it is a permutation test, in which the rows of the unconstrained matrix are randomized repeatedly across some number of permutations. If the observed relationship is stronger than the randomly permuted relationships (e.g. at alpha = 0.05), then the relationship is significant.
# The null hypothesis is that no linear relationship exists between the SNP data and the environmental predictors
if(check_significance){
message('RDA MODEL SIGNIFICANCE: START')
# Full model significance
signif.full <- anova.cca(rda.p,
parallel=CPU,
permutations = permutation_num)
## RDA axis significance
## We can **check each constrained axis for significance**.
##For this test, **each constrained axis is tested using all previous constrained axes as conditions**.
signif.axis <- anova.cca(rda.p,
by="axis",
parallel=CPU,
cutoff = 0.01,
permutations = permutation_num)
## Variable significance
signif.vars <- anova.cca(rda.p,
by="term",
parallel=CPU,
permutations = permutation_num)
message('RDA MODEL SIGNIFICANCE: FINISH')
significance <-
list(full = signif.full,
axis = signif.axis,
vars = signif.vars)
} else {significance = NULL}
# Basic plots
TAB_samples <- cbind(data$sites, predictors) #############
## Samples
if(ncol(predictors) > 1){
gSAMPLES <- #############
lapply(colnames(predictors), function(pred.name){
ggplot(TAB_samples,
aes_string(x = 'RDA1', y = 'RDA2', color = pred.name)) +
geom_point() +
scale_color_viridis_c(option = "plasma") +
plot_theme
}) %>% setNames(colnames(predictors))
} else { gSAMPLES <- NULL }
## SNPs
if(ncol(predictors) > 1 & length(outliers$Loci) > 1){
message('RDA CORRELATION OUTLIER LOCI WITH PREDICTORS: START')
### Assign each SNP to predictor
cor.list <-
mclapply(outliers$Loci, function(snp) {
lapply(colnames(predictors), function(env){
WGCNA::cor(lfmm_imp[[snp]],
predictors[[env]],
method = 'spearman')
}) %>%
unlist
}, mc.cores = CPU)
cor.df <-
sapply(1:length(cor.list[[1]]), function(i) {
sapply(1:length(cor.list), function(j) { cor.list[[j]][i] } )
}) %>%
as.data.frame %>%
setNames(colnames(predictors)) %>%
dplyr::mutate(Cor.Main = sapply(cor.list, function(vec) colnames(predictors)[which.max(abs(vec))])) %>%
cbind(outliers) %>%
dplyr::select(-p.value)
# Manhattan
gMANHAT <- #################
as.ggplot(function() {
TAB_loci %>%
dplyr::mutate(CHROM = gsub('chr|H', '', CHROM) %>% as.numeric) %>%
qqman::manhattan(chr = 'CHROM', bp = 'POS', p = 'p.value', snp = 'Loci',
highlight = outliers$Loci ,
suggestiveline = F, genomewideline = -log10(rdadapt.thres))
})
#TODO Add to TAB_loci
# { if(ncol(predictors) > 1) left_join(., cor.df, by = 'Loci') else . } %>%
TAB_var <- ##############
as.data.frame(scores(rda.p, choices=c(1:3), display="bp")) %T>%
write.table(paste0(OUTTABLES, 'RDA_SNPs_sphere_Vectors_', model.name, '.tsv'), sep = '\t')# pull the biplot scores
message('RDA CORRELATION OUTLIER LOCI WITH PREDICTORS: FINISH')
} else { cor.df = NULL; TAB_loci = NULL; TAB_var = NULL; gMANHAT = NULL}
# gSNPs <-
# ggplot(data$species,
# aes(x = RDA1, y = 'RDA2', color = pred.name)) +
# geom_point() +
# scale_color_viridis_c(option = "plasma") +
# plot_theme
# }) %>% setNames(colnames(predictors))
return(list(model = rda.p,
coefficients = list(heatmap = gCOEFHEATMAP,
value = rda.coeff),
characteristics = list(screeplot = gSCREE,
hist = gLOADINGS_HIST,
cumsum = gCUMSUM,
vif = vif,
reverse.predict = reverse.predict,
significance = significance
),
plots = list(samples = gSAMPLES,
manhat = gMANHAT),
values = list(loci = TAB_loci,
outliers = outliers,
samples = TAB_samples,
bp = TAB_var,
pval = pval,
pval.adj = rdadapt.thres
)
)
)
}
########################################################################## RUN #################################################################################
FUN_rda(LFMM_imp, PREDICTORS, COVARIATES, SNPS, OUTSUFFIX,
only_basic = T) %>%
fwrite(paste0(OUTSUFFIX, '.tsv'), sep = '\t', col.names = T, row.names = F, quote = F)