-
Notifications
You must be signed in to change notification settings - Fork 4
/
getRecomb.R
executable file
·162 lines (140 loc) · 4.59 KB
/
getRecomb.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#'''
#Copyright (c) 2015, David Edwards, Kat Holt
#All rights reserved. (see README.txt for more details)
#'''
getRecombHeatmap<-function(snps,ref,w=NULL,g=NULL,offset=0) {
# remove offset
snps[,1]<-as.numeric(snps[,1])-offset
# get window size
if (is.null(w)){
if (!is.null(g)){
w <- round(g/length(snps)*10,0) # default window size to ensure average of 10 SNPs per window
}
else(return("Can't run. Please specify window size or genome size"))
}
# get genome size
if (is.null(g)) { g <- max(as.numeric(snps[,1])) + w }
# generate density matrix
m<-vector()
i<-0 # strain count
for (strain in colnames(snps)[-1]) {
if (strain!=ref) {
i<-i+1
# snps for strain vs ref
s<-snps[,1][as.character(snps[,which(colnames(snps)==strain)]) != as.character(snps[,which(colnames(snps)==ref)])]
# get counts in windows of size w
h<-hist(s,breaks=seq(1,g,w),plot=F)
if (i==1) { m<-h$counts }
else { m<-rbind(m,h$counts) }
}
}
# return matrix
rownames(m) <- colnames(snps[,-which(colnames(snps)==ref)])[-1]
colnames(m) <- seq(1,g,w)[-1] - 1
m
}
getRecombBetweenStrains<-function(snps,strain1,strain2,w=NULL,g=NULL,offset=0,plotResult=T,multiplier=1) {
s<-snps[,1][as.character(snps[,which(colnames(snps)==strain1)]) != as.character(snps[,which(colnames(snps)==strain2)])]
result<-getRecomb(as.numeric(s)-offset,w=w,g=g,plotResult=plotResult,multiplier=multiplier,strain1,strain2)
return(list(p=result$p,blocks=result$blocks,meanCount=result$meanCount,s=result$s,r=result$r,w=result$w))
}
getRecomb<-function(snps,w=NULL,g=NULL,plotResult=T,multiplier=1,strain1,strain2){
if (is.null(w)){
if (!is.null(g)){
w <- round(g/length(snps)*10,0) # default window size to ensure average of 10 SNPs per window
}
else(return("Can't run. Please specify window size or genome size"))
}
# get counts in windows of size w
if (!is.null(g)){ h<-hist(snps,breaks=seq(1,g+w,w),plot=F) }
else { h<-hist(snps,breaks=seq(1,max(snps)+w,w),plot=F) }
# get p-values for each window
p<-cbind(h$mids,pbinom(h$counts,w,mean(h$counts)*multiplier/w,lower.tail=F),h$counts)
# Bonferroni-corrected significant p-values
p.sig<-data.frame(p[ p[,2]< 0.05/length(snps), ])
plot(h,ylim=c(0,max(h$counts*1.5)))
if (dim(p.sig)[2]==3) {
blocks<-getBlocks(p.sig,w)
try(for(i in 1:nrow(blocks)){lines(c(blocks[i,1],blocks[i,2]),rep(max(h$counts)*1.3,2),col=4,lwd=8,pch=15)})
colnames(p.sig)<-c("window_mid","pvalue","numSNPs")
}
else{
blocks<-NULL
if (dim(p.sig)[1]==3) {
lines(c(p.sig[1,1]-w,p.sig[1,1]+w-1),rep(max(h$counts)*1.2,2),col=4,lwd=8,pch=15)
p.sig<-t(matrix(t(p.sig)))
colnames(p.sig)<-c("window_mid","pvalue","numSNPs")
}
}
cleaning<-removeSNPsInBlocks(snps,blocks)
snps_to_remove<-cleaning$remove
clean_snps<-cleaning$keep
if (plotResult){ hist(clean_snps,breaks=seq(1,max(clean_snps)+w,w),add=T,fill=2,col=2,border=2,main=paste(strain1,strain2)) }
return(list(p=p.sig,blocks=blocks,meanCount=mean(h$counts),s=clean_snps,r=snps_to_remove,w=w))
}
getBlocks<-function(p.sig,w) {
window_start<-vector()
window_stop<-vector()
mean_pvalue<-vector()
total_counts<-vector()
# initialise with first window
prev_start<-p.sig[1,1] - w/2
prev_stop<-p.sig[1,1] + w/2 - 1
pvals<-p.sig[1,2]
counts<-p.sig[1,3]
block<-1
for (i in 2:nrow(p.sig)){
start<-p.sig[i,1] - w/2
stop<-p.sig[i,1] + w/2 - 1
p<-p.sig[i,2]
if (start == prev_stop + 1) {
# overlapping blocks
prev_stop <- stop # update to current stop
pvals <- c(pvals,p) # record p-value
counts <- counts + p.sig[i,3]
}
else {
# new block
# record old block info
window_start[block] <- prev_start
window_stop[block] <- prev_stop
mean_pvalue[block] <- mean(pvals)
total_counts[block] <- counts
# initialise new block
block <- block + 1
prev_start <- start
prev_stop <- stop
pvals <- p
counts <- p.sig[i,3]
}
}
# record last block info (!)
window_start[block] <- prev_start
window_stop[block] <- prev_stop
mean_pvalue[block] <- mean(pvals)
total_counts[block] <- counts
result<-cbind(window_start,window_stop,window_stop-window_start+1,total_counts,mean_pvalue)
colnames(result)<-c("start","stop","windowSize","counts","meanPvalue")
return(result)
}
removeSNPsInBlocks<-function(snps,blocks) {
snps_to_remove<-vector()
for (i in 1:nrow(blocks)){
start<-blocks[i,1]
stop<-blocks[i,2]
snps_to_remove <- c(snps_to_remove, snps[snps>start & snps<stop])
}
snps_to_keep <- snps[! (snps %in% snps_to_remove)]
return(list(keep=snps_to_keep,remove=snps_to_remove))
}
getDistAtSnpSet<-function(snps,strain,snpset) {
counts<-vector()
straincol <- which(colnames(snps)==strain)
for (i in 2:ncol(snps)) {
s<-snps[snps[,1] %in% snpset,c(i,straincol)]
snpcount<-c(1:length(s))[s[,1] == s[,2] && s[,2] != "-"]
counts<-c(counts,snpcount)
}
names(counts)<-colnames(snps)[2:ncol(snps)]
return(counts)
}