-
Notifications
You must be signed in to change notification settings - Fork 2
/
2_mapping_to_genome_and_dedup.bash
133 lines (97 loc) · 2.85 KB
/
2_mapping_to_genome_and_dedup.bash
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env bash
date
echo "Step 2 mapping reads to a genome, and deduplicate with umi-tools, and spliting bams by strand"
# note of paired-end reads
# TRUE -- paired-end reads
# FALSE -- single-end reads
pair=FALSE
# the filename of your R1.fq.gz
inrawR1="YourInputRawFileR1.fq.gz"
if [ "${pair}" == FALSE ]; then
echo "Single-end"
inrawR2=""
else
echo "Paired-end"
inrawR2=${inrawR1/R1/R2}
fi
printf "Your Read 1 fq.gz -- %s, Read 2 fq.gz -- %s\n" $inrawR1 $inrawR2
indir="Your/input/fq.gzfile/directory"
printf "Your directory containing the fq.gz after step1 -- %s\n" $indir
stardir="Your/output/bamfile/directory"
printf "Your directory containing the output bam -- %s\n" $outfdir
staridx="Your/STAR/index/directory"
printf "Your STAR index directory -- %s\n" $staridx
nc=16
printf "Your core number used for STAR -- %s\n" $nc
##
##
startt="$(date +%s)"
echo "3 mapping reads to a genome with STAR"
odir=${inrawR1/.*norRNA./}
odir=${odir/.fq*/}
odir=${odir/R1/}
odir=${stardir}/${odir}
if [ ! -d $odir ];then
echo "NO, mkdir" $odir
mkdir $odir
else
echo "YES"
fi
echo $inrawR1 $odir
if [ "$pair" == FALSE ]; then
inf=${$inrawR1}
else
inf="${$inrawR1} ${$inrawR2}"
fi
STAR --runMode alignReads --runThreadN ${nc} \
--readFilesCommand zcat \
--genomeDir $staridx \
--alignEndsType EndToEnd \
--genomeLoad NoSharedMemory \
--quantMode TranscriptomeSAM \
--alignMatesGapMax 15000 \
--readFilesIn ${inf} \
--outFileNamePrefix $odir/ \
--outFilterMultimapNmax 1 \
--outSAMattributes All \
--outSAMtype BAM SortedByCoordinate \
--outFilterType BySJout \
--outReadsUnmapped Fastx \
--outFilterScoreMin 10 \
--outFilterMatchNmin 24
samtools index -@ ${nc} ./$odir/Aligned.sortedByCoord.out.bam
endt="$(date +%s)"
printf "Mapping reads to a genome for %s done, elapsed time -- %.0f s\n\n" $inrawR1 "$((( $endt - $startt)))"
printf "\n"
##
startt2="$(date +%s)"
echo "4 deduplicating reads with umi-tools"
umi_tools dedup --method unique\
-I ./$odir/Aligned.sortedByCoord.out.bam \
--output-stats=./$odir/dedup \
-L ./$odir/umitools.log --temp-dir=./$odir \
-S ./$odir/dedup.bam
samtools index -@ ${nc} ./$odir/dedup.bam
endt="$(date +%s)"
printf "Deduplicating for %s done, elapsed time -- %.0f s\n\n" $inrawR1 "$((( $endt - $startt2)))"
##
startt3="$(date +%s)"
echo "5 splitting reads by strand"
ofwd=./$odir/dedup.fwd.bam
orev=${ofwd/.fwd.bam/rev.bam}
curinbam=./$odir/dedup.bam
echo ${curinbam}
echo $ofwd
samtools view -@ ${nc} -F 16 ${curinbam} -b -o $ofwd &&
samtools index -@ ${nc} $ofwd;
echo $orev
samtools view -@ ${nc} -f 16 ${curinbam} -b -o $orev &&
samtools index -@ ${nc} $orev
endt="$(date +%s)"
printf "Splitting reads for %s done, elapsed time -- %.0f s\n\n" $curinbam "$((( $endt - $startt2)))"
##
date
endt="$(date +%s)"
printf "Total elapsed time -- %.2f min \n\n" "$((($endt - $startt) / 60))"
echo "Finish!"
######