-
Notifications
You must be signed in to change notification settings - Fork 1
/
realign.sh
executable file
·308 lines (288 loc) · 13.2 KB
/
realign.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
#!/bin/sh
#
# realign.sh
# Second module in the GGPS analysis pipeline
if [ $# != 5 ]
then
MSG="parameter mismatch. "
echo -e "jobid:${PBS_JOBID}\nprogram=$0 stopped at line=$LINENO.\nReason=$MSG" | ssh iforge "mailx -s '[Support #200] Mayo variant identification pipeline' "$redmine""
exit 1;
else
set -x
echo `date`
scriptfile=$0
runfile=$1
elog=$2
olog=$3
email=$4
qsubfile=$5
LOGS="jobid:${PBS_JOBID}\nqsubfile=$qsubfile\nerrorlog=$elog\noutputlog=$olog"
if [ ! -s $runfile ]
then
MSG="$runfile configuration file not found"
echo -e "program=$scriptfile stopped at line=$LINENO.\nReason=$MSG\n$LOGS" | ssh iforge "mailx -s '[Support #200] Mayo variant identification pipeline' "$redmine,$email""
exit 1;
fi
outputdir=$( cat $runfile | grep -w OUTPUTDIR | cut -d '=' -f2 )
pbsprj=$( cat $runfile | grep -w PBSPROJECTID | cut -d '=' -f2 )
threads=$( cat $runfile | grep -w PBSTHREADS | cut -d '=' -f2 )
refdir=$( cat $runfile | grep -w REFGENOMEDIR | cut -d '=' -f2 )
scriptdir=$( cat $runfile | grep -w SCRIPTDIR | cut -d '=' -f2 )
refgenome=$( cat $runfile | grep -w REFGENOME | cut -d '=' -f2 )
type=$( cat $runfile | grep -w TYPE | cut -d '=' -f2 | tr '[a-z]' '[A-Z]' )
igvdir=$( cat $runfile | grep -w IGVDIR | cut -d '=' -f2 )
extradir=$outputdir/extractreads
realrecalflag=$( cat $runfile | grep -w REALIGNORDER | cut -d '=' -f2 )
paired=$( cat $runfile | grep -w PAIRED | cut -d '=' -f2 )
inputdir=$( cat $runfile | grep -w SAMPLEDIR | cut -d '=' -f2 )
samplefileinfo=$( cat $runfile | grep -w SAMPLEFILENAMES | cut -d '=' -f2 )
rlen=$( cat $runfile | grep -w READLENGTH | cut -d '=' -f2 )
multisample=$( cat $runfile | grep -w MULTISAMPLE | cut -d '=' -f2 )
samples=$( cat $runfile | grep -w SAMPLENAMES | cut -d '=' -f2 | tr ":" "\n")
region=$( cat $runfile | grep -w CHRINDEX | cut -d '=' -f2 )
resortflag=$( cat $runfile | grep -w RESORTBAM | cut -d '=' -f2 )
indices=$( echo $region | sed 's/^/chr/' | sed 's/:/ chr/g' )
epilogue=$( cat $runfile | grep -w EPILOGUE | cut -d '=' -f2 )
output_logs=$outputdir/logs
if [ $type == "GENOME" -o $type == "WHOLE_GENOME" -o $type == "WHOLEGENOME" -o $type == "WGS" ]
then
pbscpu=$( cat $runfile | grep -w PBSCPUALIGNWGEN | cut -d '=' -f2 )
pbsqueue=$( cat $runfile | grep -w PBSQUEUEWGEN | cut -d '=' -f2 )
else
if [ $type == "EXOME" -o $type == "WHOLE_EXOME" -o $type == "WHOLEEXOME" -o $type == "WES" ]
then
pbscpu=$( cat $runfile | grep -w PBSCPUALIGNEXOME | cut -d '=' -f2 )
pbsqueue=$( cat $runfile | grep -w PBSQUEUEEXOME | cut -d '=' -f2 )
else
MSG="Invalid value for TYPE=$type in configuration file."
echo -e "program=$0 stopped at line=$LINENO.\nReason=$MSG\n$LOGS" | ssh iforge "mailx -s '[Support #200] Mayo variant identification pipeline' "$redmine,$email""
exit 1;
fi
fi
if [ -z $epilogue ]
then
MSG="Invalid value for EPILOGUE=$epilogue in configuration file"
echo -e "program=$scriptfile stopped at line=$LINENO.\nReason=$MSG\n$LOGS" | ssh iforge "mailx -s '[Support #200] Mayo variant identification pipeline' "$redmine,$email""
exit 1;
else
`chmod 750 $epilogue`
fi
if [ ! -d $scriptdir ]
then
MSG="$scriptdir script directory not found"
echo -e "program=$scriptfile stopped at line=$LINENO.\nReason=$MSG\n$LOGS" | ssh iforge "mailx -s '[Support #200] Mayo variant identification pipeline' "$redmine,$email""
exit 1;
fi
if [ ! -d $refdir ]
then
MSG="$refdir directory of reference genome was not found"
echo -e "program=$scriptfile stopped at line=$LINENO.\nReason=$MSG\n$LOGS" | ssh iforge "mailx -s '[Support #200] Mayo variant identification pipeline' "$redmine,$email""
exit 1;
fi
if [ ! -s $samplefileinfo ]
then
MSG="SAMPLEFILENAMES=$samplefileinfo file not found"
echo -e "program=$scriptfile stopped at line=$LINENO.\nReason=$MSG\n$LOGS" | ssh iforge "mailx -s '[Support #200] Mayo variant identification pipeline' "$redmine,$email""
exit 1;
fi
if [ ! -d $outputdir ]
then
mkdir -p $outputdir
fi
if [ ! -d $output_logs ]
then
mkdir $output_logs
fi
## preprocessing step; only when alignment was not performed inhouse
listdirs=":"
if [ $resortflag == "YES" ]
then
echo "alignment was NOT done inhouse. Checking input files"
numsamples=0
for name in $samples
do
countnames=$( cat $samplefileinfo | grep $name -c )
if [ $countnames -lt 1 ]
then
MSG="Mo samples found in SAMPLEFILENAMES=$samplefileinfo"
echo -e "program=$scriptfile stopped at line=$LINENO.\nReason=$MSG\n$LOGS" | ssh iforge "mailx -s '[Support #200] Mayo variant identification pipeline' "$redmine,$email""
exit 1;
fi
let numsamples+=1
done
if [ $numsamples -gt 1 -a $multisample == "YES" ]
then
echo "multiple samples were aligned"
else
if [ $numsamples -eq 1 -a $multisample == "NO" ]
then
echo "single sample was aligned."
else
MSG="mismatch between number of samples found=$numsamples and value of parameter MULTISAMPLE=$multisample in configuration file"
echo -e "program=$scriptfile stopped at line=$LINENO.\nReason=$MSG\n$LOGS" | ssh iforge "mailx -s '[Support #200] Mayo variant identification pipeline' "$redmine,$email""
exit 1;
fi
fi
counter=0
while read sampledetail
do
echo "processing next line in file ..."
len=`expr ${#sampledetail}`
if [ $len -eq 0 ]
then
echo "line is empty; nothing to do. $sampledetail"
else
echo "preprocessing for realignment $sampledetail"
echo "expected format--> BAM:sample_name=bamfilename.bam"
sampledirname=$( echo $sampledetail | grep ^BAM | cut -d ':' -f2 | cut -d '=' -f1 )
samplename=$( echo $sampledetail | grep ^BAM | cut -d ':' -f2 | cut -d '=' -f2 )
len1=`expr length ${sampledirname}`
if [ $len1 -lt 1 ]
then
MSG="$sampledirname parsing file failed. realignment failed."
echo -e "program=$scriptfile stopped at line=$LINENO.\nReason=$MSG\n$LOGS" | ssh iforge "mailx -s '[Support #200] Mayo variant identification pipeline' "$redmine,$email""
exit 1;
fi
cd $inputdir
bamfile=$( echo $samplename )
lenbam=`expr length $bamfile`
if [ $lenbam -lt 1 ]
then
MSG="$bamfile parsing file failed. realignment failed."
echo -e "program=$scriptfile stopped at line=$LINENO.\nReason=$MSG\n$LOGS" | ssh iforge "mailx -s '[Support #200] Mayo variant identification pipeline' "$redmine,$email""
exit 1;
fi
if [ ! -s $inputdir/$bamfile ]
then
MSG="$inputdir/$bamfile input bam file not found"
echo -e "program=$scriptfile stopped at line=$LINENO.\nReason=$MSG\n$LOGS" | ssh iforge "mailx -s '[Support #200] Mayo variant identification pipeline' "$redmine,$email""
exit;
fi
prefix=`basename $samplename .wrg.sorted.bam`
outputalign=$outputdir/align/$sampledirname
outputlogs=$output_logs/align/$sampledirname
if [ ! -d $outputalign ]
then
mkdir -p $outputalign
let counter+=1
tmpbamfile=tmp.${prefix}_$counter.sorted.bam
sortedplain=${prefix}_$counter.wrg.sorted.bam
sorted=${prefix}_$counter.wdups.sorted.bam
sortednodups=${prefix}_$counter.nodups.sorted.bam
cd $outputalign
cp $inputdir/$bamfile $tmpbamfile
if [ ! -d $outputlogs ]
then
mkdir -p $outputlogs
else
`rm -r $outputlogs/*`
fi
else
let counter+=1
tmpbamfile=tmp.${prefix}_$counter.sorted.bam
sortedplain=${prefix}_$counter.wrg.sorted.bam
sorted=${prefix}_$counter.wdups.sorted.bam
sortednodups=${prefix}_$counter.nodups.sorted.bam
cd $outputalign
cp $inputdir/$bamfile $tmpbamfile
fi
qsub1=$outputlogs/qsub.sortmayo.$counter
echo "#PBS -V" > $qsub1
echo "#PBS -A $pbsprj" >> $qsub1
echo "#PBS -N sortm_$counter" >> $qsub1
echo "#PBS -l epilogue=$epilogue" >> $qsub1
echo "#PBS -l walltime=$pbscpu" >> $qsub1
echo "#PBS -l nodes=1:ppn=16" >> $qsub1
echo "#PBS -o $outputlogs/log.sortmayo.$counter.ou" >> $qsub1
echo "#PBS -e $outputlogs/log.sortmayo.$counter.in" >> $qsub1
echo "#PBS -q $pbsqueue" >> $qsub1
echo "#PBS -m ae" >> $qsub1
echo "#PBS -M $email" >> $qsub1
echo "$scriptdir/sortbammayo.sh $outputalign $tmpbamfile $sortedplain $sorted $sortednodups $runfile $outputlogs/log.sortmayo.$counter.in $outputlogs/log.sortmayo.$counter.ou $email $outputlogs/qsub.sortmayo.$counter" >> $qsub1
`chmod a+r $qsub1`
sortid=`qsub $qsub1`
echo $sortid >> $outputlogs/SORTEDmayo
echo "extracting reads"
qsub2=$outputlogs/qsub.extractreadsbam.$counter
echo "#PBS -V" > $qsub2
echo "#PBS -A $pbsprj" >> $qsub2
echo "#PBS -N extrbam$counter" >> $qsub2
echo "#PBS -l epilogue=$epilogue" >> $qsub2
echo "#PBS -l walltime=$pbscpu" >> $qsub2
echo "#PBS -l nodes=1:ppn=16" >> $qsub2
echo "#PBS -o $outputlogs/log.extractreadsbam.$counter.ou" >> $qsub2
echo "#PBS -e $outputlogs/log.extractreadsbam.$counter.in" >> $qsub2
echo "#PBS -q $pbsqueue" >> $qsub2
echo "#PBS -m ae" >> $qsub2
echo "#PBS -M $email" >> $qsub2
echo "#PBS -W depend=afterok:$sortid" >> $qsub2
echo "$scriptdir/extract_reads_bam.sh $outputalign $sorted $runfile $outputlogs/log.extractreadsbam.$counter.in $outputlogs/log.extractreadsbam.$counter.ou $outputlogs/qsub.extractreadsbam.$counter $igvdir $extradir" >> $qsub2
`chmod a+r $qsub2`
`qsub $qsub2 >> $output_logs/EXTRACTREADSpbs`
fi
done < $samplefileinfo
cat $outputlogs/SORTEDmayo >> $output_logs/REALSORTEDpbs
mv $outputlogs/SORTEDmayo $output_logs/ALN_MAYO_jobids
alndirs=$( ls -1 $outputdir/align )
JOBSmayo=$( cat $output_logs/ALN_MAYO_jobids | sed "s/\.[a-z]*//g" | tr "\n" ":" | sed "s/::/:/g" )
else
echo "alignment was done inhouse. no need to resort"
echo "We need to wait until the alignment jobs enter the queue"
while [ ! -s $output_logs/ALN_NCSA_jobids ]
do
`sleep 60s`
done
alndirs=$( ls -1 $outputdir/align )
JOBSncsa=$( cat $output_logs/ALN_NCSA_jobids | sed "s/\.[a-z]*//g" | tr "\n" ":" | sed "s/::/:/g" )
fi
## preprocessing is done. Now we can realign and recalibrate bam files
echo "preprocessing of bam files is done. ready to realign"
realigndir=$outputdir/realign
realignlogdir=$outputdir/logs/realign
if [ ! -d $realigndir ]
then
mkdir $realigndir
else
echo "$realigndir already exists. resetting it"
`rm -r $realigndir/*`
fi
if [ ! -d $realignlogdir ]
then
mkdir -p $realignlogdir
else
echo "$realignlogdir already exists. resetting it"
`rm -r $realignlogdir/*`
fi
if [ $realrecalflag != "1" -a $realrecalflag != "0" ]
then
echo "realign-recalibration order flag is not set properly. Default value [1] will be assiged to it"
realrecalflag="1"
fi
qsub3=$realignlogdir/qsub.recalibration
echo "#PBS -V" > $qsub3
echo "#PBS -A $pbsprj" >> $qsub3
echo "#PBS -N real_recal" >> $qsub3
echo "#PBS -l epilogue=$epilogue" >> $qsub3
echo "#PBS -l walltime=$pbscpu" >> $qsub3
echo "#PBS -l nodes=1:ppn=16" >> $qsub3
echo "#PBS -o $realignlogdir/log.recalibration.ou" >> $qsub3
echo "#PBS -e $realignlogdir/log.recalibration.in" >> $qsub3
echo "#PBS -q $pbsqueue" >> $qsub3
echo "#PBS -m ae" >> $qsub3
echo "#PBS -M $email" >> $qsub3
if [ $resortflag == "YES" ]
then
echo "#PBS -W depend=afterok:$JOBSmayo" >> $qsub3
else
echo "#PBS -W depend=afterok:$JOBSncsa" >> $qsub3
fi
echo "$scriptdir/realign_extra.sh $realigndir $realignlogdir $outputdir/align $runfile $realrecalflag $realignlogdir/log.recalibration.in $realignlogdir/log.recalibration.ou $email $realignlogdir/qsub.recalibration" >> $qsub3
`chmod a+r $qsub3`
`qsub $qsub3 >> $output_logs/RECALLpbs`
#`sleep 5s`
echo "done realig/recalibrating all bam files."
echo `date`
`chmod -R 770 $outputdir/`
`chmod -R 770 $output_logs/`
fi