-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathalign_minimap2.sh
100 lines (89 loc) · 3.86 KB
/
align_minimap2.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/bin/bash
# Align and process ONT reads with minimap2 in splice mode
# Will align, sort and perfom initial steps of flair
# Make sure to create separate folder for each sample
set -eo pipefail
OUTDIR_DEFAULT="$(pwd -P)/aligned"
REFERENCE_DEFAULT='data/Homo_sapiens_assembly38_noALT_noHLA_noDecoy'
GTF_DEFAULT='data/gencode.v26.annotation.gtf'
flair_dir='software/flair'
function Usage() {
echo -e "\
Usage: $(basename $0) -f \e[3minput.fq\e[0m -T \e[3mref.fa\e[0m [-o \e[3m/path/to/outdir\e[0m] [--csi] \n\
Where: -f|--fastq is the path to the input or forward FASTQ file \n\
-T|--reference is the path to the reference FASTA and genome file without suffix \n\
defaults to ${REFERENCE_DEFAULT} \n\
-G|--gtf is the path to the transcriptome GTF file \n\
defaults to ${GTF_DEFAULT} \n\
[-N|--native] align to the genome true or false \n\
[-o|--outdir] is an optional path to an output directory \n\
defaults to \e[1m${OUTDIR_DEFAULT}\e[0m \n\
[--csi] will index the final BAM file with a CSI index \n\
defaults to a BAI index
" >&2
exit 1
}
function extension() {
local fname="$1"
local ext="$(echo ${fname} | rev | cut -f 1 -d '.' | rev)"
$(grep -E 'gz|bz|zip' <(echo "${ext}") > /dev/null 2> /dev/null) && ext="$(echo ${fname} | rev | cut -f -2 -d '.' | rev)"
echo ".${ext}"
}
[[ "$#" -lt 1 ]] && Usage
while [[ "$#" -ge 1 ]]; do
case "$1" in
-f|--fastq)
FASTQ="$2"
shift
;;
-G|--gtf)
GTF="$2"
shift
;;
-T|--reference)
REFERENCE="$2"
shift
;;
-o|--outdir)
OUTDIR="$2"
shift
;;
-n|--native)
NATIVE=${2:-true}
shift
;;
--csi)
INDEX='-c'
;;
*)
Usage
;;
esac
shift
done
[[ -z "${FASTQ}" || -z "${REFERENCE}" || -z "${GTF}" ]] && Usage
[[ -f "${FASTQ}" ]] || (echo "Cannot find input FASTQ ${FASTQ}" >&2; exit 1)
[[ -f "${REFERENCE}" ]] || (echo "Cannot find reference genome ${REFERENCE}" >&2; exit 1)
[[ -f "${GTF}" ]] || (echo "Cannot find reference genome ${GTF}" >&2; exit 1)
[[ -z "${INDEX}" ]] && INDEX='-b'
[[ -z "${OUTDIR}" ]] && OUTDIR="${OUTDIR_DEFAULT}"
(set -x; mkdir -p "${OUTDIR}")
EXTENSION="$(extension ${FASTQ})"
NAME="$(basename ${FASTQ} $(extension ${FASTQ}))_$(basename ${REFERENCE} $(extension ${REFERENCE}))"
$(command -v minimap2 > /dev/null 2> /dev/null) || (echo "Cannot find minimap2" >&2; exit 1)
if [ "${NATIVE}" = true ]
then
(set -x; minimap2 -t 8 -ax splice -uf -k14 "${REFERENCE}.fasta" "${FASTQ}" > "${OUTDIR}/genome/${NAME}.sam")
(set -x; samtools view -hSb "${OUTDIR}/genome/${NAME}.sam" > "${OUTDIR}/genome/${NAME}.all.bam")
(set -x; samtools view -q 10 -F 2304 -hb "${OUTDIR}/genome/${NAME}.all.bam" > "${OUTDIR}/genome/${NAME}.bam")
(set -x; samtools sort "${OUTDIR}/genome/${NAME}.bam" -o "${OUTDIR}/genome/${NAME}.sorted.bam")
(set -x; samtools index "${INDEX}" "${OUTDIR}/genome/${NAME}.sorted.bam")
(set -x; python "${flair_dir}/bam2Bed12.py" -i "${OUTDIR}/genome/${NAME}.sorted.bam" > "${OUTDIR}/genome/${NAME}.sorted.bed")
(set -x; cd "${OUTDIR}genome/")
(set -x; python flair.py correct -g "${REFERENCE}.fasta" -q "${NAME}.sorted.bed" -t 8 -w 20 -f "${GTF}" -c "${REFERENCE}.genome" -o "${NAME}_noSJ")
else
(set -x; minimap2 -t 8 -ax map-ont -uf -k14 "${REFERENCE}.fasta" "${FASTQ}" > "${OUTDIR}/transcriptome/${NAME}.sam")
(set -x; samtools view -hSb "${OUTDIR}/transcriptome/${NAME}.sam" > "${OUTDIR}/transcriptome/transcriptome/${NAME}.all.bam")
(set -x; samtools sort "${OUTDIR}/transcriptome/${NAME}.bam" -o "${OUTDIR}/transcriptome/${NAME}.sorted.bam")
(set -x; samtools index "${INDEX}" "${OUTDIR}/transcriptome/${NAME}.sorted.bam")
fi