-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path11.2_BohlinEstimateGA.R
94 lines (57 loc) · 3.18 KB
/
11.2_BohlinEstimateGA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
###########################################################################################################################
# Author: Rebekka Koeck
# Lab: Cellular Genomic Medicine, Clinical Genetics, Maastricht University (Medical Centre +)
# script purpose: application of the Bohlin GA estimator
# input: beta values generated by 11preprocessingBohlin.R
# epigenetic gestational age was only calculated for samples without pregnancy complications
# output: epigenetic gestational age estimates per sample added to the annotation sheet (.csv)
# method replicated from: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-1063-4
###########################################################################################################################
# load packages
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(predictGA))
suppressPackageStartupMessages(library(tibble))
suppressPackageStartupMessages(library(sva))
suppressPackageStartupMessages(library(ggplot2))
# set the data directories
base.dir <- "output/"
annotation.file <- file.path(base.dir, "annotation.csv")
betas.file <- file.path(base.dir, "betasBMIQ.csv")
# load the annotation file and add the additional necessary columns
annotation <- fread(annotation.file) %>% as.data.frame() %>%
mutate(Sample_ID = as.character(Sample_ID)) %>%
dplyr::select(-V1)
annotation$samplePlate <- if_else(annotation$Sample_Plate == "WG5839007-BCD", 1, 2)
annotation$complication <- if_else(annotation$pregnancy_complic == "no", "no", "yes")
# load the betas file - this has been BMIQ (within sample) normalised using RnBeads (as suggested by the Bohlin paper)
# only select the samples that are included in the annotation file
betas <- fread(betas.file) %>% select(-Chromosome, -Start, -End, - Strand) %>%
column_to_rownames("ID") %>% select(all_of(annotation$Sample_ID))
# Because of the observed plate effects Bohlin recommend to carry out between sample normalisation as well
# method: ComBat
# package: sva
betas <- ComBat(betas, batch = annotation$samplePlate)
sites <- extractSites(type = "se")
sites.na <- sites[!sites %in% rownames(betas)]
# complete all missing sites with 0s - effectively removes them from the model
for(site in sites.na){
betas[site ,] <- 0
}
mypred <- predictGA(betas)
# predictGA gives you a GA estimated in days, create also a variable in weeks
mypred$predictedGA <- mypred$GA / 7
# merge the GA predictions with the annotation file
annotation <- mypred %>% rownames_to_column("Sample_ID") %>%
full_join(annotation, ., by = "Sample_ID")
# plot the predicted vs. actual gestational age
annotation %>% ggplot(aes(x = gestational_age, y = predictedGA)) +
geom_point(aes(colour = MEDIUM)) +
theme_classic() +
geom_smooth(method = "lm", colour = "black", se = FALSE, fullrange = T) +
scale_x_continuous(limits = c(36, 43)) +
scale_y_continuous(limits = c(36,43)) +
geom_abline(slope = 1, intercept = 0, colour = "grey")
cor(annotation$gestational_age, annotation$predictedGA, method = "pearson")
# save the output
write.csv(x = annotation, file = file.path(base.dir, "estimatedGABohlin.csv"))