forked from ZnL-BCM/DNAshapeR_reference
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_DNAshapes_5mer_cpg.R
72 lines (58 loc) · 2.58 KB
/
generate_DNAshapes_5mer_cpg.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
### generate_DNAhapes_7mer_cpg.R
### Zian Liu
### 6/29/2020
## This script generates the DNA shapes table in 7-mer format. The output can be used for DNA shape deviation analyses.
## This script specifically assumes middle "CG" are all CpG: Annotate them as "Mg" instead
# Library
library(tidyverse)
library(openxlsx)
library(DNAshapeR) # Note: install from the author's github for latest ver
# Build the dictionary and write it to a text file
a = c(rep('A', 256), rep('C', 256), rep('G', 256), rep('T', 256))
b = rep(c(rep('A', 64), rep('C', 64), rep('G', 64), rep('T', 64)), 4)
c = rep(c(rep('A', 16), rep('C', 16), rep('G', 16), rep('T', 16)), 16)
d = rep(c(rep('A', 4), rep('C', 4), rep('G', 4), rep('T', 4)), 64)
e = rep(c('A', 'C', 'G', 'T'), 256)
dict_fa_1 <- paste(a, b, c, d, e, sep = '')
# Create dict_fa_2 which replaces 3 and 4 to "Mg"
dict_fa_2_tmp <- strsplit(dict_fa_1, "")
dict_fa_2 <- character(length(dict_fa_1))
for (i in 1:length(dict_fa_2_tmp)){
if (dict_fa_2_tmp[[i]][3] == 'C' & dict_fa_2_tmp[[i]][4] == 'G'){
dict_fa_2_tmp[[i]][3] = 'M'
dict_fa_2_tmp[[i]][4] = 'g'
}
dict_fa_2[i] <- paste(dict_fa_2_tmp[[i]], collapse = "")
}
dict_fa_title <- paste(rep('>seq', 1024), 1:1024, sep = '_')
# Interweave the previous two vectors into one vector & write it into text file
write.table(as.vector(rbind(dict_fa_title, dict_fa_2)),
file = 'fasta_dict_5mer_cpg.fa',
row.names = FALSE, col.names = FALSE, quote = FALSE)
# The package doesn't have an 'All' option, define all shapes used
shape_type_list = c('HelT', 'Rise', 'Roll', 'Shift', 'Slide', 'Tilt', 'Buckle',
'Opening', 'ProT', 'Shear', 'Stagger', 'Stretch', 'MGW', 'EP')
# Reload the text file & run it through DNAshapeR
pred_2 <- list()
for (shape_type in shape_type_list){
if (shape_type %in% c('HelT', 'Roll', 'ProT', 'MGW')){
pred_shape <- getShape(
paste(getwd(), "fasta_dict_5mer_cpg.fa", sep = '/'), shapeType = shape_type,
methylate=TRUE
)
} else {
pred_shape <- getShape(
paste(getwd(), "fasta_dict_5mer.fa", sep = '/'), shapeType = shape_type
)
}
pred_2[[shape_type]] <- pred_shape[[1]]
}
# Note that there are no edge predictions, so remove the NAs
# Also, add rownames to the values
for (z in 1:length(pred_2)){
pred_2[[z]] <- as.data.frame(t(na.omit(t(pred_2[[z]]))))
rownames(pred_2[[z]]) <- dict_fa_1 # Note that we will use the regular 5-mer names
}
# Make an output
write.xlsx(pred_2, file = 'ref_5mers_structure_cpg.xlsx', rowNames = TRUE)
# Done.