-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathInSilico_PCR.sh
executable file
·260 lines (219 loc) · 7.2 KB
/
InSilico_PCR.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
#!/bin/bash
# script: InSilico_PCR.sh
# required:
# conda with -f environment imported
# requires: bbmap, bioawk R (computing percentiles)
#
# Stephane Plaisance VIB-NC September-18-2019 v1.0
# version 1.1; 2019-10-04
# better handling of input file
# better handling of parallel jobs and threads
# version 1.1.1; 2019-10-28
# lower specs to better match low-end servers
# version 1.1.2; 2019-10-29
# add qin=${qual} to extraction command
# add include primer (t/f) as option in header
# reformat to page width
# version 1.1.3; 2022-02-08
# rename logs folder to allow consecutive runs
# add TMPDIR for bbmap and parallel
#
# visit our Git: https://github.com/Nucleomics-VIB
#
version="1.1.3; 2022-02-08"
#
# required once: create a conda env to install the required apps
# conda env create -f environment.yaml
# adapt next line to point to the right conda.sh init script
# see conda activate script for details
source /etc/profile.d/conda.sh
# NB: a { command group; } is required here, not a ( subshell ):
# 'exit 1' inside a subshell only terminates the subshell, and the
# script would carry on without the required tools
conda activate InSilico_PCR || {
  echo "# the conda environment 'InSilico_PCR' was not found on this machine"
  echo "# please read the top part of the script!"
  exit 1
}
########## start user editable region ##############
# REM: the workflow can be restarted by deleting all log files after a given step
# speed-up
# thr: total CPU threads shared across all parallel jobs
# pigt: threads given to each pigz compression process
# mem: java heap size passed to each BBMap command (-Xmx)
thr=8
pigt=2
mem="1g"
# extract reads corresponding to the 16S PCR
# primer pairs below: uncomment exactly one forward/reverse set
# Long ONT / PacBio amplicon V1-V9
forwardp="AGAGTTTGATCMTGGCTCAG"
forwardl="27F"
reversep="CGGTWACCTTGTTACGACTT"
reversel="1492Rw"
# typical V3-V4 amplicon
#forwardp="GACTCCTACGGGAGGCWGCAG"
#forwardl="337F"
#reversep="GACTACHVGGGTATCTAATCC"
#reversel="805R"
# hybrid amplicon V4-V9 a bit longer
#forwardp="GTGYCAGCMGCCGCGGTAA"
#forwardl="515FB"
#reversep="CGGTWACCTTGTTACGACTT"
#reversel="1492Rw"
# 4.4kb +/-full-length rRNA amplicon
#forwardp="AGRGTTYGATYMTGGCTCAG"
#forwardl="ncec_16S_8F_v7"
#reversep="CGACATCGAGGTGCCAAAC"
#reversel="ncec_23S_2490R_v7"
# be stringent to avoid noisy reads (msa.sh cutoff, 0..1)
cut=0.8
# split in 500k read bins and zip (2,000,000 fastq lines = 500,000 reads)
lines=2000000
# quality phred scale of your data (for demo data it is 33)
qual=33
# expected amplicon size limits, set to 10 and 100000 by default
# adjust if you notice that the sequence extraction has unwanted tail(s)
readminlen=10
readmaxlen=100000
# whether to include the primer matches or to clip them (t/f)
primincl="t"
######## end of user editable region ############
# test arguments: exactly one argument is required,
# either -d (demo mode) or a path to a local fastq.gz file
if [[ -z "${1:-}" ]]; then
  echo -e "\nUsage: $0 -d (for demo)
or
$0 <path to a local fastq.gz file>\n"
  exit 1
fi
if [[ "$1" == "-d" ]]; then
  # demo mode: get mockcommunity data from the cloud
  # https://github.com/LomanLab/mockcommunity
  infile="Zymo-PromethION-EVEN-BB-SN.fq.gz"
  url="https://nanopore.s3.climb.ac.uk"
else
  # otherwise the argument must point to an existing local file
  if [[ -f "$1" ]]; then
    infile="$1"
  else
    echo "# input file not found!"
    exit 1
  fi
fi
# extract string for output names
# strip the .fq.gz (or .fastq.gz) extension; quote the expansion so
# paths containing spaces do not break basename
name=$(basename "${infile%.fq.gz}")
name=${name%.fastq.gz}
##################
# prepare folders
# working default to local folder
data="$(pwd)"
# split folder (per-input so several inputs can coexist)
split="split_data_${name}"
mkdir -p "${split}"
# run logs (named per-input to allow consecutive runs)
logs="run_logs_${name}"
mkdir -p "${logs}"
# tmp output folder for the per-chunk sam and fastq results
tmpout="bbmap_out_${name}_${forwardl}_${reversel}"
mkdir -p "${tmpout}"
# parallel folders: local tmp for bbmap and GNU parallel
WORKDIR=$PWD
mkdir -p "$PWD/tmp"
export TMPDIR=$PWD/tmp
# keep track of all: duplicate stdout+stderr of everything below into the run log
runlog="${logs}/runlog.txt"
exec &> >(tee -i "${runlog}")
# also echo commands to log (verbose!)
# set -x
##################################
# download data for -d (once only)
if [[ "$1" == "-d" ]]; then
  if [[ ! -f "${infile}" ]]; then
    echo "# downloading ${infile} (may take some time!)"
    # NB: original had a stray '}' in the marker name (done.gettingdata});
    # also only create the marker when the download succeeded, consistent
    # with the '&& touch' pattern used by the other steps
    wget "${url}/${infile}" && \
      touch "${logs}/done.gettingdata"
  else
    echo "# ${infile} was already downloaded from ${url}"
  fi
fi
###########################################
# split large file into chunks for parallel
# decompress the input and split it into ${lines}-line chunks named
# ${name}_000, ${name}_001, ...; each chunk is compressed on the fly by
# the --filter command. The done.splitting marker makes this step
# restartable (delete it to force a re-split).
if [[ ! -f ${logs}/done.splitting ]]; then
echo "# splitting the data in multiple smaller files and compressing (may take some time!)"
# NB: $FILE inside the single-quoted --filter string is expanded by
# 'split' itself (to the current chunk name), NOT by this shell; only
# ${pigt} is expanded here. Do not re-quote this expression.
zcat ${infile} | \
split -a 3 -d -l ${lines} --filter='pigz -p '${pigt}' \
> $FILE.fq.gz' - ${split}/${name}_ && \
touch ${logs}/done.splitting
else
echo "# splitting already done"
fi
#################################
# run in BBMap msa.sh in parallel
#################################
# compute job number & threads/job
# count the split chunks without parsing 'ls' output
jobs=$(find "${split}" -mindepth 1 -maxdepth 1 -type f | wc -l)
# guard against an empty split folder: the division below would
# otherwise crash with a divide-by-zero arithmetic error
if [[ "${jobs}" -eq 0 ]]; then
  echo "# no split files found in ${split}"
  exit 1
fi
# cap concurrent jobs at the global thread budget
if [[ "${jobs}" -gt "${thr}" ]]; then
  jobs=${thr}
fi
# threads handed to each individual job
jobt=$(( thr / jobs ))
######################################################
# find forward primer using a fraction of the threads
# run BBMap msa.sh on each chunk in parallel; {} is the GNU parallel
# placeholder for the chunk filename. rcomp=t also searches the reverse
# complement strand; cutoff=${cut} sets the match stringency.
# The done.* marker makes this step restartable.
if [[ ! -f ${logs}/done.searching.${name}_${forwardl} ]]; then
echo "# searching for forward primer sequence: ${forwardp} in all files"
find ${split} -type f -name "${name}_???.fq.gz" -printf '%P\n' |\
sort -n |\
parallel --workdir ${WORKDIR} --tmpdir ${TMPDIR} -j ${jobs} msa.sh -Xmx${mem} threads=${jobt} \
qin=${qual} \
in=${split}/{} \
out=${tmpout}/forward_{}.sam \
literal="${forwardp}" \
rcomp=t cutoff=${cut} && \
touch ${logs}/done.searching.${name}_${forwardl}
else
echo "# forward search already done"
fi
######################################################
# find reverse primer using a fraction of the threads
# same structure as the forward search above: one msa.sh job per chunk,
# {} expanded by GNU parallel to the chunk filename, results written as
# per-chunk sam files consumed later by cutprimers.sh.
if [[ ! -f ${logs}/done.searching.${name}_${reversel} ]]; then
echo "# searching for reverse primer sequence: ${reversep} in all files"
find ${split} -type f -name "${name}_???.fq.gz" -printf '%P\n' |\
sort -n |\
parallel --workdir ${WORKDIR} --tmpdir ${TMPDIR} -j ${jobs} msa.sh -Xmx${mem} threads=${jobt} \
qin=${qual} \
in=${split}/{} \
out=${tmpout}/reverse_{}.sam \
literal="${reversep}" \
rcomp=t \
cutoff=${cut} && \
touch ${logs}/done.searching.${name}_${reversel}
else
echo "# reverse search already done"
fi
##########################################
# extract regions with BBMap cutprimers.sh
# for each chunk, cut out the sequence between the forward and reverse
# primer matches recorded in the sam files produced above.
# include=${primincl} controls whether the primer matches themselves
# are kept (t) or clipped (f); fixjunk=t repairs malformed sam records.
if [[ ! -f ${logs}/done.cutprimer.${name}_${forwardl}_${reversel} ]]; then
echo "# extracting template sequences between primer matches"
find ${split} -type f -name "${name}_???.fq.gz" -printf '%P\n' |\
sort -n |\
parallel --workdir ${WORKDIR} --tmpdir ${TMPDIR} -j ${thr} cutprimers.sh -Xmx${mem} \
qin=${qual} \
in=${split}/{} \
out=${tmpout}/{}_16s.fq \
sam1=${tmpout}/forward_{}.sam \
sam2=${tmpout}/reverse_{}.sam \
include=${primincl} \
fixjunk=t && \
touch ${logs}/done.cutprimer.${name}_${forwardl}_${reversel}
else
# report the skip, consistent with every other restartable step
echo "# primer extraction already done"
fi
###########################################################
# merge results and keep only reads in amplicon size range
if [[ ! -f "${logs}/done.merging.${name}_${forwardl}_${reversel}" ]]; then
# clear existing results left over from a previous (interrupted) run
final="${name}_filtered_${forwardl}_${reversel}.fq.gz"
cat /dev/null > "${final}"
echo "# filtering results at min:${readminlen} and max:${readmaxlen} and merging to ${final}"
# concatenate all per-chunk extractions, keep only reads whose length
# falls within [readminlen, readmaxlen], rebuild valid fastq records,
# and bgzip-compress into the single final output file
find "${tmpout}" -type f -name "${name}_???.fq.gz_16s.fq" | \
sort -n |\
xargs cat |\
bioawk -c fastx -v min="${readminlen}" -v max="${readmaxlen}" \
'{if (length($seq)>=min && length($seq)<=max)
print "@"$name" "$comment"\n"$seq"\n+\n"$qual}' |\
bgzip >> "${final}" && \
touch "${logs}/done.merging.${name}_${forwardl}_${reversel}"
else
echo "# merging and filtering already done"
echo "# force redo by deleting ${logs}/done.merging.${name}_${forwardl}_${reversel}"
fi
# return to normal
conda deactivate