-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRcode.txt
121 lines (86 loc) · 3.64 KB
/
Rcode.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
### set of files
select.rdata
an R file that contains the axis found in the sample data set
samples.zip
zip compressed fas file of the sample data set
brazil.txt
an example aligned file to be classified
has been aligned to coincide to the sample.fas
# Applying the axes to your new data.
# RightA: the axes and average data.
# samples.fas, samples.txt: example for the alignment.
# Acnowledgements: those for the GISAID. The original sequences can be downloaded.
# PCsample.zip: scores from each continent and figures.
# sPCbaseSelect.xlsx: The PC for bases. Those made the axes.
# A recent version of the axes and the mean data.
# Those were estimated by using data downloaded on 1st Dec 2020. The RightA.rdata includes two files: Righta (axes) and meansa (average data).
##
# Usage:
#1. 1. Prepare aligned DNA sequences that fit the sample sequence. No addition/removal of bases is allowed. (sample.fas)
#2. Change the sequence format from fasta to tab-separated text. (sample.txt)
#3. Read the text data in the R.
HA <- read.table(file="xxxxxx.txt", sep="\t") # replace xxx to your data's name
HA<-as.matrix(HA)
(n_sample<-dim(HA)[1])
(n_seq<- nchar(HA[2,2]))
HA_lets <- array(0, dim=c(n_sample, 5*n_seq))
colnames(HA_lets) <- c(paste("A_", 1:n_seq, sep=""),paste("T_", 1:n_seq, sep=""),paste("G_", 1:n_seq, sep=""),paste("C_", 1:n_seq, sep=""),paste("s_", 1:n_seq, sep=""))
#4. prepareing the boolean
for (s in 1:n_sample){
se <- HA[s, 2]
se <- tolower(se) # making the boolean vector
for (le in 1:n_seq){ # one by one
base <- substr(se, le,le)
if(base =="a") {
HA_lets[s, le] <-1
} else {
if(base =="t") {
HA_lets[s, le+n_seq] <-1
} else {
if(base =="g") {
HA_lets[s, le+n_seq*2] <-1
} else {
if(base =="c") {
HA_lets[s, le+n_seq*3] <-1
} else {
if(base =="-") {
HA_lets[s, le+n_seq*4] <-1
}}}}}}}
rownames(HA_lets) <- HA[,1]
#5. The sequences may have problems at the first and end.
# Those should be removed.
for(le in c(1:563, 29360:n_seq)){
for(l in 1: n_sample){
HA_lets[l, le] <-0
HA_lets[l, le+n_seq*1] <-0
HA_lets[l, le+n_seq*2] <-0
HA_lets[l, le+n_seq*3] <-0
HA_lets[l, le+n_seq*4] <-1}
}
#6. set the center of rotation. centering.
diffs<-sweep(HA_lets, 2, meansa) # difference from the average data.
diffs <-diffs/2^0.5 # for the double count
#7. if there are "-"s caused by the aligments, the positions should be masked.
for (s in 1:n_sample){
Ns <- which(HA_lets[s,1:n_seq+n_seq*4]==1)
diffs[s, c(Ns, Ns+n_seq,Ns+n_seq*2, Ns+n_seq*3, Ns+n_seq*4) ] <- 0 }
# estimating that the positions are equals to the average
#8. the number of "-"s should be recorded to find out terrible sequence data
Ns <- apply(HA_lets[,563:29360+n_seq*4], 1, sum)
#9 estimation of sPC for samples (for the first 100 axes)
sPC_samplea <- diffs %*% Righta[, 1:100] / (n_seq)^0.5
rownames(sPC_samplea)<- HA[,1]
write.table(cbind(Ns, sPC_samplea), file="sPC_sample.txt", sep="\t")
#####@
## output
# to text files
write.table(rbind(sPC_sample[,1:40],sPC_sample_new[,1:40]) , file="sPC_sample.txt", sep="\t")
# to a png
png(width=1000, height=1000, pointsize = 36, file="PCsampleNew.png")
par(lwd=1, mex=0.5, mar=c(5,5,5,.2) )
plot(sPC_sample[,1], sPC_sample[,2], xlab="PC1", ylab="PC2")
text(sPC_sample[833,1], sPC_sample[833,2], labels="œ", col="green2", cex=0.6)
text(sPC_sample[1,1], sPC_sample[1,2], labels="œ", col="skyblue", cex=0.6)
text(sPC_sample_new[ ,1], sPC_sample_new[,2], labels="œ", col="#FF3300", cex=0.7)
dev.off()
T. Konishi 24May2020