-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathpipeline_non-ref_tree_build_1.4-dockerized.sh
executable file
·290 lines (272 loc) · 12.1 KB
/
pipeline_non-ref_tree_build_1.4-dockerized.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
#!/bin/bash
#Authors: Logan Fink, Curtis Kapsak
#Usage: script to create a reference free phylogenetic tree from a set of fastq files
#Permission to copy and modify is granted without warranty of any kind
version="1.4"
#Print out the line after the current line in the script, and print the evaluation
#of how it will be executed
print_next_command() {
current_line=$(($1+1))
range=$(($1+1))
x=0
while [ $x == 0 ]; do
p=$(sed -n ${range}'p' /home/staphb/scripts/pipeline_non-ref_tree_build_$version-dockerized.sh)
if [[ $p == *\\ ]]; then
range=$(($range+1))
else
x+=1
fi
done
if [[ $range == $current_line ]]; then
#echo $(sed -n ${current_line}'p' /home/staphb/scripts/pipeline_non-ref_tree_build_$version-dockerized.sh)
line_data=$(sed -n ${current_line}'p' /home/staphb/scripts/pipeline_non-ref_tree_build_$version-dockerized.sh)
line_data=$(echo $line_data | sed "s/'//g")
#echo line_data
output_prefix=''
if [[ $line_data == *'>'* ]]; then
end=${line_data##*>}
output_prefix='Output file: '
line_data=${line_data%%>*}
fi
eval echo $line_data
eval echo $output_prefix$end
else
#echo $(sed -n ${current_line}','${range}'p' /home/staphb/scripts/pipeline_non-ref_tree_build_$version-dockerized.sh)
line_data=$(sed -n ${current_line}','${range}'p' /home/staphb/scripts/pipeline_non-ref_tree_build_$version-dockerized.sh)
line_data=$(echo $line_data | sed "s/'//g")
#echo $line_data
output_prefix=''
if [[ $line_data == *'>'* ]]; then
end=${line_data##*>}
output_prefix='Output file: '
line_data=${line_data%%>*}
fi
eval echo $line_data
eval echo $output_prefix$end
fi
}
#This function will check if the file exists before trying to remove it
remove_file() {
if [ -e $1 ];then
echo "File $1 has been removed"
rm -rf $1
fi
}
#This function will check to make sure the directory doesn't already exist before trying to create it
make_directory() {
if [ -e $1 ]; then
echo "Directory "$1" already exists"
else
mkdir $1
echo "Directory "$1" has been created"
fi
}
#Set number of threads, make variable global and accessible to docker containers
THREADS=$(nproc --all)
echo "Number of threads set to: $THREADS"
export THREADS
##### Check to see if docker is installed #####
if [ -z $(which docker) ]; then
echo 'Docker is not installed, Please see https://github.com/StaPH-B/scripts/blob/master/image-information.md#docker-ce for instructions on how to install Docker. Now exiting.'
exit
else
echo "$(docker --version) is installed."
fi
##### Function and check to see if Docker images are downloaded, if not, download them with docker pull #####
docker_image_check () {
if [ -z $(docker images -q $1) ]; then
docker pull $1
else
echo "Docker image $1 already exists locally."
fi
}
# gotta check em all! gotta check em all! Dock-er-mon!
echo 'Now checking to see if all necessary docker images are downloaded...'
docker_image_check staphb/sratoolkit:2.9.2
docker_image_check staphb/lyveset:1.1.4f
docker_image_check staphb/spades:3.12.0
docker_image_check staphb/quast:5.0.0
docker_image_check staphb/prokka:1.13
docker_image_check staphb/roary:3.12.0
##### Move all fastq files from fastq_files directory up one directory, remove fastq_files folder #####
if [[ -e ./fastq_files ]]; then
echo 'Moving fastq files from ./fastq_files to ./ (top-level DIR)'
mv ./fastq_files/* .
rm -rf ./fastq_files
fi
declare -a srr=() #PASTE IN ANY SRR NUMBERS INTO FILE named: SRR
while IFS= read -r line; do
srr+=("$line")
done < ./SRR
#find . -maxdepth 1 -name "*fastq*" |cut -d '-' -f 1|cut -d '_' -f 1 |cut -d '/' -f 2 >tmp1 #output all fastq file identifiers in cwd to file tmp1 (the delimiters here are '-' and '_')
find . -maxdepth 1 -name "*fastq*" |cut -d '_' -f 1 |cut -d '/' -f 2 >tmp1 #output all fastq file identifiers in cwd to file tmp1 (the delimiters here are '_')
declare -a tmp=()
tmp1='tmp1'
tmpfile=`cat $tmp1`
for line in $tmpfile; do
tmp=("${tmp[@]}" "$line");
done
id=($(printf "%s\n" "${tmp[@]}"|sort -u)) #Get all unique values and output them to array
id=("${id[@]}" "${srr[@]}") #Combine the automatically generated list with the SRR list
remove_file tmp
##### Fetch and fastq-dump all reads from NCBI identified by "SRR" #####
for i in ${srr[@]}; do
echo $i
if [[ -n "$(find *$i* 2>/dev/null)" ]]; then
echo "Files are here."
else
export i
echo 'prefetching '$i'...'
print_next_command $LINENO ${i}
docker run -e i --rm=True -u $(id -u):$(id -g) -v $PWD:/data staphb/sratoolkit:2.9.2 /bin/bash -c \
'prefetch -O /data '${i}
echo 'now running fasterq-dump in container'
print_next_command $LINENO ${i}
docker run -e THREADS -e i --rm=True -u $(id -u):$(id -g) -v $PWD:/data staphb/sratoolkit:2.9.2 /bin/bash -c \
'fasterq-dump --skip-technical --split-files -t /data/tmp-dir -e ${THREADS} -p '${i}'.sra'
mv ${i}.sra_1.fastq ${i}_1.fastq
mv ${i}.sra_2.fastq ${i}_2.fastq
pigz ${i}_1.fastq
pigz ${i}_2.fastq
rm ${i}.sra
fi
done
remove_file tmp-dir
##### These are the QC trimming scripts as input to trimClean #####
make_directory clean
echo "cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning cleaning"
for i in *R1_001.fastq.gz; do
b=`basename ${i} _R1_001.fastq.gz`;
if [[ -n "$(find . -maxdepth 2 -name ${b}.cleaned.fastq.gz)" ]]; then
continue
else
echo "LYVESET CONTAINER RUNNING SHUFFLEREADS.PL"
print_next_command $LINENO ${i}
docker run --rm=True -v $PWD:/data -u $(id -u):$(id -g) staphb/lyveset:1.1.4f \
run_assembly_shuffleReads.pl /data/${b}"_R1_001.fastq.gz" /data/${b}"_R2_001.fastq.gz" > clean/${b}.fastq;
echo ${b};
echo "LYVESET CONTAINER RUNNING TRIMCLEAN.PL"
print_next_command $LINENO ${i}
docker run --rm=True -v $PWD:/data -u $(id -u):$(id -g) staphb/lyveset:1.1.4f \
run_assembly_trimClean.pl -i /data/clean/${b}.fastq -o /data/clean/${b}.cleaned.fastq.gz --nosingletons --numcpus ${THREADS};
remove_file clean/${b}.fastq;
fi
done
for i in *_1.fastq.gz; do
b=`basename ${i} _1.fastq.gz`;
if find ./clean/${b}.cleaned.fastq.gz; then
continue
else
echo '(run_assembly_shuffleReads.pl)Interleaving reads for:'${c}' using lyveset docker container'
print_next_command $LINENO ${i}
docker run --rm=True -v $PWD:/data -u $(id -u):$(id -g) staphb/lyveset:1.1.4f \
run_assembly_shuffleReads.pl /data/${b}"_1.fastq.gz" /data/${b}"_2.fastq.gz" > clean/${b}.fastq;
echo '(run_assembly_trimClean.pl) Trimming/cleaning reads for:'${c}' using lyveset docker container'
print_next_command $LINENO ${i}
docker run --rm=True -v $PWD:/data -u $(id -u):$(id -g) staphb/lyveset:1.1.4f \
run_assembly_trimClean.pl -i /data/clean/${b}.fastq -o /data/clean/${b}.cleaned.fastq.gz --nosingletons --numcpus ${THREADS};
remove_file clean/${b}.fastq;
fi
done
remove_file ./clean/\**
##### Run SPAdes de novo genome assembler on all cleaned, trimmed, fastq files #####
make_directory ./spades_assembly_trim
for i in ${id[@]}; do
if [[ -n "$(find -path ./spades_assembly_trim/$i/contigs.fasta 2>/dev/null)" ]]; then #This will print out the size of the spades assembly if it already exists
size=$(du -hs ./spades_assembly_trim/$i/contigs.fasta | awk '{print $1}');
echo 'File exists and is '$size' big.'
else
# make sure variable i is available to global env, to pass into container
export i
echo 'constructing assemblies for '$i', could take some time...'
print_next_command $LINENO ${i}
docker run -e i -e THREADS --rm=True -v $PWD:/data -u $(id -u):$(id -g) staphb/spades:3.12.0 /bin/bash -c \
'spades.py --pe1-12 /data/clean/${i}*.cleaned.fastq.gz --careful -t ${THREADS} -o /data/spades_assembly_trim/${i}/'
rm -rf ./spades_assembly_trim/$i/corrected \
./spades_assembly_trim/$i/K21 \
./spades_assembly_trim/$i/K33 \
./spades_assembly_trim/$i/K55 \
./spades_assembly_trim/$i/K77 \
./spades_assembly_trim/$i/K99 \
./spades_assembly_trim/$i/K127 \
./spades_assembly_trim/$i/misc \
./spades_assembly_trim/$i/mismatch_corrector \
./spades_assembly_trim/$i/split_input \
./spades_assembly_trim/$i/tmp
fi
done
make_directory new_temp
make_directory new_temp/spades_assemblies
for i in ${id[@]}; do
if [[ -n "$(find -path ./spades_assembly_trim/$i/contigs.fasta 2>/dev/null)" ]]; then
continue
else
cp ./clean/${i}*.cleaned.fastq.gz new_temp/
spades_file=./clean/${i}*.cleaned.fastq.gz
echo $spades_file
export i
echo 'constructing assemblies for '$i', second try...'
print_next_command $LINENO ${i}
docker run -e i -e THREADS --rm=True -v $PWD:/data -u $(id -u):$(id -g) staphb/spades:3.12.0 /bin/bash -c \
'spades.py -o /data/new_temp/spades_assemblies/${i}/ -t ${THREADS} --pe1-12 /data/new_temp/${i}_*.cleaned.fastq.gz --careful'
rm -rf ./new_tmp/spades_assemblies/$i/corrected \
./new_tmp/spades_assemblies/$i/K21 \
./new_tmp/spades_assemblies/$i/K33 \
./new_tmp/spades_assemblies/$i/K55 \
./new_tmp/spades_assemblies/$i/K77 \
./new_tmp/spades_assemblies/$i/K99 \
./new_tmp/spades_assemblies/$i/K127 \
./new_tmp/spades_assemblies/$i/misc \
./new_tmp/spades_assemblies/$i/mismatch_corrector \
./new_tmp/spades_assemblies/$i/split_input \
./new_tmp/spades_assemblies/$i/tmp
fi
done
mv new_temp/spades_assemblies/* spades_assembly_trim/
remove_file new_temp
##### Run quast assembly statistics for verification that the assemblies worked #####
make_directory quast
for i in ${id[@]}; do
if [[ -e quast/${i}_output_file ]]; then
echo "Skipping ${i}. It's SPAdes assembly has already been QUASTed."
else
print_next_command $LINENO ${i}
docker run --rm=True -v $PWD:/data -u $(id -u):$(id -g) staphb/quast:5.0.0 \
quast.py /data/spades_assembly_trim/$i/contigs.fasta -t ${THREADS} -o /data/quast/$i
fi
done
##### Run prokka on all the isolates to get the core genomes and generate .gff files #####
make_directory ./prokka
for i in ${id[@]}; do
if [[ -n "$(find -path ./prokka/$i)" ]]; then
echo "Prokka has been run on this isolate."
else
export i
echo "Prokka will now be run on "$i
print_next_command $LINENO ${i}
docker run -e i -e THREADS --rm=True -u $(id -u):$(id -g) -v $PWD:/data staphb/prokka:1.13 /bin/bash -c \
'prokka /data/spades_assembly_trim/$i/contigs.fasta --outdir /data/prokka/$i --prefix $i --cpus ${THREADS}'
fi
done
remove_file -r ./gff_files
make_directory ./gff_files #make directory to hold the .gff files output by prokka
cp prokka/*/*.gff gff_files/ #copy over the .gff files from prokka
##### Run roary using the .gff file folder #####
rm -rf roary
print_next_command $LINENO ${i}
docker run -e i -e THREADS --rm=True -u $(id -u):$(id -g) -v $PWD:/data staphb/roary:3.12.0 /bin/bash -c \
'roary -p ${THREADS} -e -n -v -f /data/roary /data/gff_files/*.gff'
##### Run raxml on the roary alignment to generate a tree #####
print_next_command $LINENO ${i}
docker run -e i --rm=True -u $(id -u):$(id -g) -v $PWD:/data staphb/lyveset:1.1.4f /bin/bash -c \
'raxmlHPC -m GTRGAMMA -p 12345 -x 12345 -s /data/roary/core_gene_alignment.aln -# 100 -n phylo_output -f a'
rm -rf raxml/
make_directory raxml
mv RAxML* raxml/
##### Move all of the fastq.gz files into a folder #####
make_directory fastq_files
for i in ${id[@]}; do
if [[ -n "$(find *$i*fastq.gz)" ]]; then
mv *$i*fastq.gz fastq_files
fi
done