-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathDirectReadFasta.txt
54 lines (36 loc) · 1.27 KB
/
DirectReadFasta.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
An alternative way of reading a FASTA file directly into R, using the "Biostrings" library.
# Reading a FASTA file into R
library(Biostrings) # provides readDNAStringSet()

# This approach is probably less memory-efficient than reading a plain text
# table, so the session's memory cap has to be raised for it to work.
# memory.limit() only ever had an effect on Windows and has been a no-op stub
# (it returns Inf with a warning) since R 4.2.0, so call it only where it can
# still change the limit.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
  memory.limit(size = 2^19)
}

# Collect garbage once and reset the "max used" statistics before loading data.
gc(reset = TRUE)
# Read the sequences from the FASTA file into a DNAStringSet.
fasta <- readDNAStringSet("samples.fas", format = "fasta")
n_sample <- length(fasta)
if (n_sample == 0L) {
  stop("no sequences found in 'samples.fas'", call. = FALSE)
}

# The number of positions to encode is the length of the FIRST record; the
# encoding only lines up across samples when every record has that length.
seq_widths <- width(fasta)
n_seq <- seq_widths[1]
if (any(seq_widths != n_seq)) {
  warning(
    "sequences in 'samples.fas' differ in length; ",
    "using the length of the first record (", n_seq, ")",
    call. = FALSE
  )
}

# Indicator ("boolean") matrix with one row per sample and five column blocks
# of n_seq columns each, in the order A, T, G, C, s. Everything starts at 0.
bool <- matrix(0, nrow = n_sample, ncol = 5 * n_seq)
colnames(bool) <- paste0(
  rep(c("A_", "T_", "G_", "C_", "s_"), each = n_seq),
  seq_len(n_seq)
)
rownames(bool) <- names(fasta)
## storage
# Fill the indicator matrix `bool`, one row per sample. For position p of a
# sequence, the column that is set to 1 is p + k * n_seq, where k is the
# zero-based index of the symbol in `block_symbols` (a, t, g, c, "-").
# Any other symbol leaves all five indicators of that position at 0.
block_symbols <- c("a", "t", "g", "c", "-")

for (s in seq_len(n_sample)) {
  # Lower-case text of the s-th sequence, split into single characters in one
  # call instead of one substr() per position.
  se <- tolower(as.character(fasta[[s]]))
  bases <- strsplit(se, "", fixed = TRUE)[[1]]

  # Only the first n_seq positions have columns; a shorter sequence simply
  # contributes fewer positions.
  bases <- bases[seq_len(min(length(bases), n_seq))]

  # match() gives the block index (1..5) per position, NA for other symbols.
  hit <- match(bases, block_symbols)
  pos <- which(!is.na(hit))
  bool[s, pos + (hit[pos] - 1L) * n_seq] <- 1
}