-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_ins_report.sh
executable file
·343 lines (271 loc) · 10.5 KB
/
create_ins_report.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
#!/bin/bash
#
# BIOINFORMATICS AND SYSTEMS BIOLOGY LABORATORY
# UNIVERSIDAD NACIONAL DE COLOMBIA
#
# A simple script for creating Colombia's INS genome reporting
#
# Relies on "jq" for nextclade's JSON outputparsing.
#
# INPUT:
# 1) Nextclade's output in .json format.
# 2) A one column file holding a list of variants.
# 3) Path to Mosdepth's "genome" (not "amplicon")dir, containing a single file for each sample.
# 4) Path to quast info file.
#
# OUTPUT:
# Outputs a tab separated file with the following content:
#
# S: Sample name
# C: Clade
# I: Insertions
# A: Aminoacid Substitutions
# N: Nucleotide substitutions
# V: Variants of interest
# D: Mosdepth mean depth
# F: Covered Genome Fraction
# K: Aminoacid deletion
# R: Range of K in the corresponding reference Nucleotide sequence.
#
# The actual output order is: S C V K R I N A D F
#
# By default the output file is named as the input "json" file + "ins_report.tsv"
# So if the inpout file is named: nextclade.json the output file is named:
# nextclade.json.ins_report.tsv
#
#
# How to run it:
# $> create_ins_report nextclade.json variantsOfConcern.lst path_to_mosdepth_genome_dir quast_genome_info_file
#
# IMPORTANT: Sorry not support for OSX yet in this script.
#
#
#--------------------------------------------------------------------
#
# Minimal input check.
#
#--------------------------------------------------------------------
function input_error ()
{
echo -e "
#
# ** ERROR **: ${1}
#
# Please check that you provided 4 arguments and/or paths are correct.
#
# This script requires the following input files:
#
# 1) A JSON file as obtained from NEXTCLADE.
# 2) A one column file holding a list of Variants of Interest to look for.
# 3) A path to a directory containing mosdepth coverage results.
# 4) Quast genome info file.
#
# Example:
#
# create_ins_report.sh [nextclade_output.json] [variants_file] [mosdepth-dir] [quast-genome-info_file]
#
"
exit 1
}
#If fq is not installed just quit.
if ! [ -x "$(command -v jq)" ]; then
echo 'Error: jq is not installed. Please install it and try again.' >&2
exit 1
fi
if [ -z "$4" ]; then
input_error "Missing argument"
fi
if [ ! -f $1 ];then
input_error "File ${1} not found"
fi
if [ ! -f $2 ];then
input_error "File ${2} not found"
fi
if [ ! -d $3 ] || [ -z "$3" ];then
input_error "Mosdepth directory ${3} not found"
fi
if [ ! -f $4 ];then
input_error "Quast info file: ${4} not found"
fi
# Since this script outputs to STDOUT header lines has to be echoed
# before enything into the loop. For new versions this behaviour can be changed
# by an associative array (ei. clade[C]) and sending output to a file.
#Please note that each field here is tabulated.
#--------------------------------------------------------------------
# Input files
#--------------------------------------------------------------------
#Create a temp dir to hold tmp files
tmpDir=$(mktemp -d -p ./)
jsonFile=${1}
vocFile=${2}
mosDir=${3}
quastFile=${4}
# Name of the main report file to output.
baseName=$(basename ${jsonFile})
reportMainFile=$(echo ${tmpDir}/${baseName}".ins_report.tsv")
# User feed back.
echo ""
echo ""
echo "Creating output file: ${reportMainFile}"
echo ""
# Write header in output file
echo "Codigo:Linaje:Mutacion de interes:Delecion:Delecion(coordenadas):Inserciones:Sustituciones:Sustituciones(AA):Profundidad:Cobertura:Laboratorio" | tr ':' '\t' > ${reportMainFile}
#--------------------------------------------------------------------
# Get the number of entries (A.K.A genomes analyzed, fields in array).
# Nextclade json files has 4 initial objects. Last one (results) is an
# array. This array length is variable and it depends on the number of
# genomes analyzed.
#--------------------------------------------------------------------
resultsLength=$(jq '.results[] | length' ${jsonFile} | wc -l)
echo "Found ${resultsLength} samples in input JSON file."
#--------------------------------------------------------------------
# Iterate through each object in array.
# Notice that 1st index in array is "0". So length = length-1
#--------------------------------------------------------------------
for (( i=0; i<$resultsLength; i++ ))
do
# Get sample name
# Sample name comes in the form: SAMPLE_01/ARTIC/medaka...
# Let's use only the "SAMPLE_01" part for further consistency.
# NOTE: This is true for the web version of the JSON file it is necessary to
# check what de differences are with command line JSON file.
sampleName=$(jq '.results['$i'].seqName' ${jsonFile} | sed -e 's/\"//g' | cut -d '/' -f 1)
echo "Analyzing sample ${i}: ${sampleName}"
echo ${sampleName} > ${tmpDir}/S
#Get clade
clade=$(jq '.results['$i'].clade' ${jsonFile} | sed -e 's/\"//g')
echo ${clade} > ${tmpDir}/C
#Get complete Aminoacid substitutions
aaSubst=$(jq '.results['$i'].aaSubstitutions[] | .gene + ":" + .refAA + (.codon+1|tostring)+ .queryAA' ${jsonFile} | sed -e 's/\"//g')
echo ${aaSubst} > ${tmpDir}/A
#Get Nucleotide substitutions
refNuc=$(jq '.results['$i'].substitutions[] | .refNuc + (.pos+1|tostring) + .queryNuc' ${jsonFile} | sed -e 's/\"//g')
echo ${refNuc} > ${tmpDir}/N
#Get insertions
insertions=$(jq '.results['$i'].insertions[] | (.pos+1|tostring) + ":" + .ins' ${jsonFile} | sed -e 's/\"//g')
#Sometimes there are no insertions at all
if [ -z "${insertions}" ];then
insertions=$(echo "NA")
fi
echo ${insertions} > ${tmpDir}/I
#------------------------------------------------------------------
# Let's see if any of the retrieved aminoacid substitutions is one
# of the VOI/VOC according to the data provided by the INS.
# For this we need a list of VOI/VOC. This list was created as a
# single column file and provided to this script as a parameter 2.
#------------------------------------------------------------------
# In some systems GREP_OPTIONS is set although it is deprecated
# no harm in this unset. Warnings are annoying!
unset GREP_OPTIONS
# File V is an special case. It needs to be reset before the loop. But
# into the loop is not overwritten. Each loop is a new case.
# Note that v and V are two different files that hold the same information
# the difference is that v is the file before stripping end of lines.
rm -f ${tmpDir}/v
rm -f ${tmpDir}/V
while read line
do
#Use -c to count number of matches. If == 1 means there was a match.
match=$(echo ${aaSubst} | grep -c $line)
if [ "$match" -eq 1 ];then
# Here V file shouldn't be overwritten. So create "v" file.
echo ${line} >> ${tmpDir}/v
elif [ "$match" -eq 0 ];then
#If there are no matches create the file with "NA"
echo "" >> ${tmpDir}/v
fi
done < ${vocFile}
#------------------ DELETIONS------------------
# Retrieve aaDeletions.
# Composed of two arrays. This is the general form:
# results[n]
# |_deletions[1]
# |_aaDeletions[n]
#
delLength=$(jq '.results['$i'].deletions | length' ${jsonFile})
rm -f ${tmpDir}/K
rm -f ${tmpDir}/R
#Run this ONLY if there are deletions
if [ $delLength -gt 0 ];then
for (( j=0; j<$delLength; j++ ))
do
#Get aaDeletions Length
aaDelLength=$(jq '.results['$i'].deletions['$j'].aaDeletions | length' ${jsonFile})
# Check if there are aminoacid deletions. If not put NA on the corresponding
# fields.
if [ $aaDelLength -eq 0 ];then
echo "NA" > ${tmpDir}/K
echo "NA" > ${tmpDir}/R
elif [ $aaDelLength -ne 0 ];then
for (( k=0; k<$aaDelLength; k++ ))
do
#Get the deletion itself. Something like: "ORF1a:S365-"
aaDel=$(jq '.results['$i'].deletions['$j'].aaDeletions['$k'] | .gene + ":" + .refAA + (.codon+1|tostring) + "-"' ${jsonFile} | sed -e 's/\"//g')
#Get start coordinate for deletion.
aaDelNucBegin=$(jq '.results['$i'].deletions['$j'].aaDeletions['$k'].contextNucRange.begin' ${jsonFile})
#Get end coordinate for deletion.
aaDelNucEnd=$(jq '.results['$i'].deletions['$j'].aaDeletions['$k'].contextNucRange.end' ${jsonFile})
#Get rid of line brakes for aaDel.
echo ${aaDel} | tr '\n' ' ' >> ${tmpDir}/K
aaDelNucBegin=$(echo ${aaDelNucBegin} | tr '\n' ' ' | tr -d '[:space:]')
aaDelNucEnd=$(echo ${aaDelNucEnd} | tr '\n' ' ' | tr -d '[:space:]')
echo ${aaDelNucBegin}"-"${aaDelNucEnd} | tr '\n' ' ' >> ${tmpDir}/R
done
fi
done
elif [ $delLength -eq 0 ];then
echo "NA" > ${tmpDir}/K
echo "NA" > ${tmpDir}/R
fi #Ends $delLength gt 0
#------------------------------------------------------------------
# MOSDEPTH & QUAST ROUTINES
# Retrieve mapped reads from mosdepth
# Retrieve coverage from quast
#------------------------------------------------------------------
# Mosdepth provides two directories. The one hereby referenced is the
# "genome" dir not the "amplicon" dir.
mosSummaryFile=$(echo ${mosDir}"/"${sampleName}".mosdepth.summary.txt")
meanDepth=$(tail -n 1 ${mosSummaryFile} | awk '{print $4}')
echo ${meanDepth} > ${tmpDir}/D
fraction=$(cat ${quastFile} | grep ${sampleName} | sed -e "s/=//g; s/|//g" | awk '{print $2}')
echo ${fraction} > ${tmpDir}/F
#------------------------------------------------------------------
# We need to gather all retrieved information and format it in a
# single row then save to a file.
#
# S: Sample name
# C: Clade
# I: Insertions
# A: Aminoacid Substitutions
# N: Nucleotide substitutions
# V: Variants of interest
# D: Mosdepth mean depth
# F: Covered Genome Fraction
# K: Aminoacid deletion
# R: Range of K in the corresponding reference Nucleotide sequence.
# L: Laboratory that processed samples
#------------------------------------------------------------------
# Laboratory field.
echo "UNAL-Bogota" > ${tmpDir}/L
cd ${tmpDir}
#Replace line breaks from v file.
cat v | tr '\n' ' ' > V
#Paste does all the magic. Easy to modify columns position.
# We have to give the relative path to the file because now we
# are into the tmp folder.
paste S C V K R I N A D F L >> ../${reportMainFile}
#We need to get back one dir up.
cd ..
# Uncomment when terminal-debugging
# echo "-----"
done
#mv report file un dir up. So it is visible to user.
mv ${reportMainFile} .
echo "Al samples analyzed."
echo "Cleaning up."
#Cleaning up.
rm -Rf ${tmpDir}
#Clean exit
set GREP_OPTIONS
echo "All done. Bye."
exit 0