-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCGE_auton_part1.Rmd
332 lines (271 loc) · 12.8 KB
/
CGE_auton_part1.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
CGE Final Exam: Adam Auton
========================================================
```{r, echo=FALSE}
# Set up working envrionment
work_dir <- '~/My Box Files/Class/cge/final_exam_questions/data'
options(stringsAsFactors = FALSE)
```
## Plotting SNP density ##
SNP densities calculated using VCFtools. I used *region3* for this question. SNP density was calculated for 10kb regions for all populations, CEU, YRI, and CHB populations.
Density information was first loaded into R.
```{r, echo=FALSE}
# Get a list of density files
density_files <- list.files(work_dir, pattern = '\\.snpden',)
# Read in SNP density files
densities <- list()
for(file in density_files){
densities[[file]] <- read.table(file.path(work_dir, file)
, sep = '\t', header = TRUE,
row.names = NULL)
}
```
### Density Plot ###
Plotting density of rare SNPs (maf <= 0.001) for each population.
```{r fig.width=7, fig.height=6, echo=FALSE}
# Plot all SNPs
file <- 'auton_snpdens_10kb_rare_snp.snpden'
snp_pos <- densities[[file]][,2]
snp_den <- densities[[file]][,4]
plot(snp_pos, snp_den,
type = 'l', col = 'blue', xlab = 'Position', ylab = 'SNPs/kb',
main = 'SNP densities for all populations and all SNPs',
ylim = c(0, 15))
# Plot CEU SNPs
file <- 'auton_snpdens_10kb_rare_snp_CEU.snpden'
snp_pos <- densities[[file]][,2]
snp_den <- densities[[file]][,4]
lines(snp_pos, snp_den,
type = 'l', col = 'red')
# Plot YRI SNPs
file <- 'auton_snpdens_10kb_rare_snp_YRI.snpden'
snp_pos <- densities[[file]][,2]
snp_den <- densities[[file]][,4]
lines(snp_pos, snp_den,
type = 'l', col = 'green')
# Plot CHB SNPs
file <- 'auton_snpdens_10kb_rare_snp_CHB.snpden'
snp_pos <- densities[[file]][,2]
snp_den <- densities[[file]][,4]
lines(snp_pos, snp_den,
type = 'l', col = 'black')
# Add legend
legend('topleft', c('All Pop', 'CEU', 'YRI', 'CHB'),
col=c('blue', 'red', 'green', 'black'), lty = c(1,1,1,1), ncol=2, bty='n')
```
Average SNP densities for each population are shown in Table 1.
```{r, echo=FALSE}
# Calculate average SNP density
mean_density <- lapply(densities, colMeans)
mean_density <- mean_density[c('auton_snpdens_10kb_rare_snp.snpden',
'auton_snpdens_10kb_rare_snp_CEU.snpden',
'auton_snpdens_10kb_rare_snp_YRI.snpden',
'auton_snpdens_10kb_rare_snp_CHB.snpden')]
## Extract mean density
mean_density <- lapply(mean_density, function(x) x['SNPS.KB'])
mean_density <- t(as.data.frame(mean_density))
mean_density <- cbind(c('All_pops', 'CEU', 'YRI', 'CHB'),
mean_density)
```
```{r, results='asis', echo=FALSE}
cat("Table 1: Average Density per population\n\n")
cat("Population | Average Density", "--- | ---", sep="\n")
cat(apply(mean_density, 1, function(x) paste(x, collapse = ' | ')), sep = '\n')
```
### Interpretation of density plot ###
The average SNP density is different for each population. The YRI population contains the most variation among the three populations while the CEU and CHB populations have similar SNP densities. The mean densities are shown in *Table 1*.
This may be due to the reduction of genomic diversity as populations migrated out of Africa.
## Frequency spectrum ##
Frequency for each population was calculated using VCFtools.
```{r, echo=FALSE}
## List frequency files
freq_files = list.files(work_dir, pattern = '\\.frq')
## Read frequency files
freqs <- list()
for(file in freq_files){
freqs[[file]] <- read.table(file.path(work_dir, file)
, sep = '\t', header = FALSE,
skip = 1)
colnames(freqs[[file]]) <- c('CHROM','POS','CHROM',
'N_ALLELES', 'N_CHR','FREQ')
}
names(freqs) <- c('ALL', 'CEU','CHB','YRI')
## Remove SNPs with 0 minor allele frequency
freqs_filt <- lapply(freqs, subset,
FREQ > 0)
```
```{r, echo=FALSE, fig.width=10, fig.height=10}
## Break SNP frequencies into bins
bins <- seq(0, 1, 0.05)
## Plot frequency spectrum
par(mfrow = c(2,2))
all_freq <- hist(freqs$ALL[,'FREQ'],
breaks = bins,
main = 'All Population allele frequency',
xlab = 'Allele frequency',
ylab = 'Chromosome count',
ylim = c(0,23650))$counts
ceu_freq <- hist(freqs$CEU[,'FREQ'],
breaks = bins,
main = 'CEU allele frequency: All SNPs',
xlab = 'Allele frequency',
ylab = 'Chromosome count',
ylim = c(0,23650))$counts
chb_freq <- hist(freqs$CHB[,'FREQ'],
breaks = bins,
main = 'CHB allele frequency: All SNPs',
xlab = 'Allele frequency',
ylab = 'Chromosome count',
ylim = c(0,23650))$counts
yri_freq <- hist(freqs$YRI[,'FREQ'],
breaks = bins,
main = 'YRI allele frequency: All SNPs',
xlab = 'Allele frequency',
ylab = 'Chromosome count',
ylim = c(0,23650))$counts
```
Upon examination of the SNP frequencies for each population, there are many SNPs that are not polymorphic in some populations. The following plots show allele frequencies after removing SNPs that have an allele frequency of 0 in the given population.
```{r,echo=FALSE, fig.width=10, fig.height=10}
bins <- seq(0, 1, 0.05)
## Plot frequency spectrum
par(mfrow = c(2,2))
all_freq_filt <- hist(freqs_filt$ALL[,'FREQ'],
breaks = bins,
main = 'All Population allele frequency',
xlab = 'Allele frequency',
ylab = 'Chromosome count',
ylim = c(0,23650))$counts
ceu_freq_filt <- hist(freqs_filt$CEU[,'FREQ'],
breaks = bins,
main = 'CEU allele frequency: Filtered SNPs',
xlab = 'Allele frequency',
ylab = 'Chromosome count',
ylim = c(0,23650))$counts
chb_freq_filt <- hist(freqs_filt$CHB[,'FREQ'],
breaks = bins,
main = 'CHB allele frequency: Filtered SNPs',
xlab = 'Allele frequency',
ylab = 'Chromosome count',
ylim = c(0,23650))$counts
yri_freq_filt <- hist(freqs_filt$YRI[,'FREQ'],
breaks = bins,
main = 'YRI allele frequency: Filtered SNPs',
xlab = 'Allele frequency',
ylab = 'Chromosome count',
ylim = c(0,23650))$counts
```
### Interpretation of Frequency plots ###
The frequency plots show a large number of rare variants within the human genome. When looking at all populations together, there is a large spike in the number of subjects that cary a rare allele. The cause of this is partly due to the presence of variants that are only seen in a subset of populations. Each population has more rare variants than common and some variants are unique to individual populations. The presence of variants that are population private causes the enrichment of low allele frequency when the populations are combined.
## The result of combined low and high coverage ##
One pitfall of combining low and high sequencing coverage is that the low coverage areas are more suceptible to errors in DNA amplification and sequencing. In low coverage areas, sequencing errors may result in some reads being called with an alternative allele and there is a higher chance that the resulting SNP will be called heterzygous when it is in reality, homozygous.
Another consequence of low coverage is that there is reduced power to detect rare variants.
## SNPs common to multiple populations ##
```{r, echo=FALSE}
## Identify SNPs that are common between CEU and YRI populations
#common_snps
ceu_snps <- freqs_filt$CEU[,'POS']
yri_snps <- freqs_filt$YRI[,'POS']
common_list <- intersect(ceu_snps, yri_snps)
common_all <- freqs_filt$ALL[(freqs_filt$ALL$POS %in% common_list),]
common_ceu <- freqs_filt$CEU[(freqs_filt$CEU$POS %in% common_list),]
common_yri <- freqs_filt$YRI[(freqs_filt$YRI$POS %in% common_list),]
unique_ceu <- freqs_filt$CEU[(!freqs_filt$CEU$POS %in% common_list),]
unique_yri <- freqs_filt$YRI[(!freqs_filt$YRI$POS %in% common_list),]
```
```{r, echo=FALSE, fig.width=10, fig.height=10}
## Set up multi-plot window
par(mfrow = c(2,2))
## Plot allele frequency histogram
common_hist <- hist(common_all[,'FREQ'],
breaks = bins,
main = 'Allele frequency of common SNPs between CEU and YRI',
xlab = 'Allele frequency',
ylab = 'Chromosome count',
ylim = c(0,1000))$counts
ceu_rare <- hist(unique_ceu[,'FREQ'],
breaks = bins,
main = 'Allele frequency of CEU private SNPS',
xlab = 'Allele frequency',
ylab = 'Chromosome count')$counts
yri_rare <- hist(unique_yri[,'FREQ'],
breaks = bins,
main = 'Allele frequency of yri private SNPS',
xlab = 'Allele frequency',
ylab = 'Chromosome count')$counts
```
### Interpretation of Frequency Plots ###
SNPs that are common to the two populations have a larger proportion of SNPs with higher allele frequencies. The median allele frequency for CEU-YRI snps is: `r median(common_all$FREQ)` while the median allele frequencies for CEU and YRI are: `r median(unique_ceu$FREQ)` and `r median(unique_yri$FREQ)` respectively.
The SNPs that are unique to individual populations are overwhelmingly SNPs with allele frequencies of <0.1. It is possible that more common SNPs point to regions of shared genetic material between populations while rare SNPs differentiate the differen populations.
## Squared Correlation Coefficient Analysis ##
```{r, echo=FALSE}
# Function to plot color bar
color.bar <- function(lut, min, max=-min, nticks=11, ticks=seq(min, max, len=nticks), title='') {
scale = (length(lut)-1)/(max-min)
dev.new(width=1.75, height=5)
plot(c(0,10), c(min,max), type='n', bty='n', xaxt='n', xlab='', yaxt='n', ylab='', main=title)
axis(2, ticks, las=1)
for (i in 1:(length(lut)-1)) {
y = (i-1)/scale + min
rect(0,y,10,y+1/scale, col=lut[i], border=NA)
}
}
legend.col <- function(col, lev){
# From http://aurelienmadouasse.wordpress.com/2012/01/13/legend-for-a-continuous-color-scale-in-r/
opar <- par
n <- length(col)
bx <- par("usr")
box.cx <- c(bx[2] + (bx[2] - bx[1]) / 1000,
bx[2] + (bx[2] - bx[1]) / 1000 + (bx[2] - bx[1]) / 50)
box.cy <- c(bx[3], bx[3])
box.sy <- (bx[4] - bx[3]) / n
xx <- rep(box.cx, each = 2)
par(xpd = TRUE)
for(i in 1:n){
yy <- c(box.cy[1] + (box.sy * (i - 1)),
box.cy[1] + (box.sy * (i)),
box.cy[1] + (box.sy * (i)),
box.cy[1] + (box.sy * (i - 1)))
polygon(xx, yy, col = col[i], border = col[i])
}
par(new = TRUE)
plot(0, 0, type = "n",
ylim = c(min(lev), max(lev)),
yaxt = "n", ylab = "",
xaxt = "n", xlab = "",
frame.plot = FALSE)
axis(side = 4, las = 2, tick = FALSE, line = .25)
par <- opar
}
scc_file <- 'CEU_sq_corr_coef.geno.ld'
scc <- read.table(file.path(work_dir,scc_file), header = TRUE)
scc_names <- append(names(scc), 'DIST')
scc <- cbind(scc, abs(scc$POS1 - scc$POS2))
names(scc) <- scc_names
snp_names <- unique(scc$POS1)
pos1 <- unique(scc$POS1)
pos2 <- unique(scc$POS2)
scc_mat <- matrix(nrow = length(snp_names), ncol=length(snp_names))
rownames(scc_mat) <- pos1
colnames(scc_mat) <- pos2
## Fill in matrix
#pb <- txtProgressBar(min = 0, max = dim(scc)[1], style = 3)
for (i in 1:dim(scc)[1]){
# setTxtProgressBar(pb, i)
row <- as.character(scc$POS1[i])
col <- as.character(scc$POS2[i])
value <- unlist(scc$R.2[i])
scc_mat[row,col] <- value
}
#close(pb)
# Sort rows and columns by name
scc_mat <- scc_mat[,order(colnames(scc_mat))]
scc_mat <- scc_mat[order(rownames(scc_mat)),]
temp_label <- vector(mode='character', length = dim(scc_mat)[1])
heatmap(scc_mat, na.rm=FALSE, Rowv=NA, Colv=NA, labRow=temp_label,
labCol=temp_label,
scale='none', col=cm.colors(256),
main = 'Heatmap of pairwise correlation values',
keep.dendro=FALSE)
legend.col(col = cm.colors(256), lev = scc$R.2)
## Still not right....
```
The squared correlation coefficient relates to the linkage disequilibrium between two SNPs. A strong correlation, value close to 1, indicates two SNPs are seen together often, meaning they are inherited together. The blocks show regions of the chromosome that are inherited together. In this example, region3, there are three large blocks of SNPs that are inherited together. The further away two SNPs are, the more likely they will be separated during recombination leading to lower correlation between the two SNPs.