forked from rkoeck/saliva_methylome
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path06_RegionStatistics.R
133 lines (80 loc) · 4.76 KB
/
06_RegionStatistics.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
###########################################################################################################################
# Author: Rebekka Koeck
# Lab: Cellular Genomic Medicine, Clinical Genetics, Maastricht University (Medical Centre +)
# script purpose: statistics testing of CpG sites aggregated into regions (genes, promoters and CpG islands)
# input: preprocessed beta values generated using 01_preprocessingSWAN.R to calculate average methylation values per gene
# aggregated beta values per promoter and CpG island extracted frmo the output of 01_preprocessingSWAN.R
# conducted to compare G3 and K-SICM within the IVF cohort (including and excluding pregnancy complications)
# and once to compare all IVF to all naturally conceived individuals for each region type (genes, promoters, CpG islands)
# targeted analysis: for the targeted analysis, genes were restricted to those contained in the following documents:
# imprinted sites: manifestImprinted.csv (source: Ginjala V. Gene imprinting gateway. Genome Biology 2001.)
# output: (multiple testing corrected) statistics per region - gene/promoter/CGI (.csv)
###########################################################################################################################
# load packages
message("loading packages")
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(tibble))
suppressPackageStartupMessages(library(BiocParallel))
suppressPackageStartupMessages(library(variancePartition))
suppressPackageStartupMessages(library(stringr))
# set up the parallel environment
param = SnowParam(workers = 15, type = "SOCK", progressbar = TRUE)
register(param)
# set data directories
message("loading files")
annotation.file = "annotationExcluded.csv"
betas.file = "betasSWAN.csv"
manifest.file <- "manifest_EPIC_selected_combined.csv"
# load the data files
annotation <- read.csv(annotation.file) %>% select(-X)
betas <- fread(betas.file)
manifest <- fread(manifest.file) %>% select(ID, CHR, custom_Name, custom_Accession, custom_Group)
# select only the columns relating to the QC passing samples
betas <- betas %>% select(ID, annotation$Sample_ID) %>%
column_to_rownames("ID")
# adjust the sample plate variable in the annotation file
annotation$samplePlate <- if_else(annotation$Sample_Plate == "WG5839006-BCD", "a", "b")
# for consistency of naming rownames of annotation file should correspond to colnames of betas file
annotation <- annotation %>% column_to_rownames("Sample_ID")
# filter the manifest file to contain only probes passing QC
message("grouping by gene name")
manifest <- manifest %>% filter(ID %in% rownames(betas))
sites_per_gene <- manifest %>%
filter(nchar(custom_Name) > 0) %>%
group_by(custom_Name, CHR) %>%
summarise(sites = n())
chroms_per_gene <- sites_per_gene %>%
group_by(custom_Name) %>%
summarise(chrs = n())
#remove genes that aren't annotated to a unique chromosome
chroms_per_gene <- chroms_per_gene %>% filter(chrs == 1)
# remove genes that aren't represented by 3 or more individual CpG sites
sites_per_gene <- sites_per_gene %>% filter(custom_Name %in% chroms_per_gene$custom_Name) %>% filter(sites >=3)
#annotate the betas file to contain the gene names
betas.annotated <- betas %>%
rownames_to_column("ID") %>%
full_join(manifest, . , by = "ID")
# remove CpG sites that are not relevant for the genes analysis and then aggregate the methylation values by gene
betas.genes <- betas.annotated %>% filter(custom_Name %in% sites_per_gene$custom_Name)
betas.genes <- betas.genes %>%
select(-ID, -CHR, - custom_Accession, -custom_Group) %>%
group_by(custom_Name) %>%
summarise_all(., list(~mean(.,na.rm = T)))
# reformat the betas.genes object to be suitable for statistical modelling
betas.genes <- betas.genes %>% as.data.frame() %>% column_to_rownames("custom_Name")
################ FIT THE STATISTICAL MODEL ###################################
# create the complex model using the dream package (variancePartition)
# fixed effects = age, gender, culture medium, cell composition
# random effects = sample plate
# specify the model
model <- ~ Culturemedium + Age + Gender + Leukocytes + Epithelial.cells + (1 | samplePlate)
# calculate the weights (this is only possible on sites containing no NA values)
message("calculating weights")
weights <- voomWithDreamWeights(na.omit(betas.genes), model, annotation)
message("model fitting")
fit.complex <- dream(weights, model, annotation)
message("generating output")
complex.table <- topTable(fit.complex, coef = "Culturemediumvg3", number = Inf)
# save the resultant objects as .csv files
write.csv(x = complex.table, file = "dmrsGenesCustom.csv")