-
Notifications
You must be signed in to change notification settings - Fork 3
/
extractSeq.bash
125 lines (115 loc) · 2.4 KB
/
extractSeq.bash
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
if [[ "$@" =~ "--debug" ]]; then
set -ex
else
set -e
fi
if [[ "$@" =~ "-h" ]]; then
echo "* Usage: bash extractSeq.bash contigname start[int] end[int] strand[+/-] output file.fasta"
exit
fi
contigname=$1
start=$2
end=$3
strand="$4"
output="$5.fasta"
myfile=$6
if [ "$contigname" == "" ]; then
echo "* no contigname was provided"
exit
fi
if [ "$start" == "" ];then
echo "* no start position was provided"
exit
fi
if [ "$end" == "" ];then
echo "* no end position was provided"
exit
fi
if [ "$strand" == "" ];then
echo "* strand option default: +"
strand="+"
fi
if [ "$output" == ".fasta" ];then
output="output.fasta"
fi
if [ "$myfile" == "" ];then
echo "* no $myfile found"
exit
fi
function getGene {
myfile=$(echo "$4" |awk -F"/" '{print $NF}')
awk -v contigname=">$1" -v start="$2" -v end="$3" -v myfile="$myfile" -v strand="$5" 'BEGIN{cband=0;cposition=1;container=0}
{
if($0 == contigname){
cband=1
print $0"_"myfile"_"strand"("start":"end")"
next
}
if(cband==1){
split($0, chars, "")
for(i=1;i<=length($0);i++){
if(cposition>=start && cposition<end){
if(container<70){
printf "%c",chars[i]
container++
}else{
printf "%c\n",chars[i]
container=0
}
}
if(cposition==end){
printf "%c\n",chars[i]
exit
}
cposition+=1
}
}
if($0~">" && cband==1){
printf "\n"
exit
}
}' $4
}
case $strand in
"+")
getGene $contigname $start $end $myfile > $output
echo "" >> $output
;;
"-")
getGene $contigname $start $end $myfile "c" | awk 'BEGIN{t["A"]="T";t["T"]="A";t["G"]="C";t["C"]="G";
t["a"]="t";t["t"]="a";t["g"]="c";t["c"]="g";t["N"]="N";t["n"]="n";t[""]
t["R"]="Y";t["r"]="y";t["Y"]="R";t["y"]="r";t["K"]="M";t["M"]="K";t["k"]="m";t["m"]="k";
t["S"]="S";t["s"]="s";t["W"]="W";t["w"]="w";t["B"]="V";t["V"]="B";t["b"]="v";t["v"]="b";
t["D"]="H";t["H"]="D";t["d"]="h";t["h"]="d";t["d"]="h"
;container=0;cposition=1;seq=""}
{
if(NR>1){
seq=seq""$0
}else{
print $0
}
}
END{
split(seq, chars, "")
for(i=length(seq);i>=1;i--){
if(container<70){
printf "%c",t[chars[i]]
container++
}else{
printf "%c\n",t[chars[i]]
container=0
}
if(cposition==end){
printf "%c\n",t[chars[i]]
exit
}
cposition+=1
}
}' > $output
echo "" >> $output
;;
*)
echo "invalid strand"
exit
;;
esac