-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerge2lanes.sh
executable file
·90 lines (74 loc) · 2.46 KB
/
merge2lanes.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/bin/bash
# Script: merge2lanes.sh
# Description: This script merges Aviti paired FASTQ files from 2 lanes
# Input reads are found in a subfolder specified by -i option
# Merged reads are written to a new subfolder 'merged_reads' by default
# Version: 1.6
# Date: 2025-01-27
# Author: SP@NC (+AI)
# Strict mode: exit on any error, on expansion of an unset variable, and when
# any stage of a pipeline fails (with plain `set -e` only the last stage counts,
# so e.g. a failing `find` feeding `sed`/`sort` would go unnoticed)
set -euo pipefail
# Default values (each can be overridden by the matching command line option)
input_dir=""
output_dir="merged_reads"
parallel_jobs=1
# Print the command line synopsis on stdout, then terminate with status 1
usage() {
  # One printf call, one line of help text per argument
  printf '%s\n' \
    "Usage: ${0} -i input_dir [-o output_dir] [-j parallel_jobs]" \
    " -i input_dir : Input directory (required)" \
    " -o output_dir : Output directory (default: merged_reads)" \
    " -j parallel_jobs: Number of parallel jobs (default: 1)"
  exit 1
}
# Parse command line options
while getopts "i:o:j:h" opt; do
  case "${opt}" in
    i) input_dir=${OPTARG} ;;
    o) output_dir=${OPTARG} ;;
    j) parallel_jobs=${OPTARG} ;;
    # -h, and the '?' that getopts returns for an unknown option or a missing
    # option argument, all end in the usage message (which exits)
    *) usage ;;
  esac
done
# Check if input directory is provided
if [[ -z "${input_dir}" ]]; then
  echo "Error: Input directory (-i) is required." >&2
  usage
fi
# Reject a wrong path right away with a clear message, before any output
# directory is created or any search is started
if [[ ! -d "${input_dir}" ]]; then
  echo "Error: Input directory '${input_dir}' does not exist or is not a directory." >&2
  exit 1
fi
# Create output directory if it doesn't exist
mkdir -p -- "${output_dir}"
# Find all unique sample numbers in natural order.
# `sed -n ... p` prints only names that carry an _S<number>_ tag: without it a
# *fastp_R1.fq.gz file lacking the tag would pass through unchanged and its
# whole path would be taken for a sample number. `[^/]*$` keeps the match
# inside the file name, so a tag in a directory name is not picked up.
samples=$(find "${input_dir}" -name "*fastp_R1.fq.gz" \
  | sed -nE 's/.*_S([0-9]+)_[^/]*$/\1/p' | sort -u -V)
#######################################
# Merge every lane file of one sample into a single R1 and a single R2 archive.
# Globals:
#   input_dir  (read) - directory searched recursively for lane files
#   output_dir (read) - existing directory that receives the merged files
# Arguments:
#   $1 - sample number, i.e. the digits of the _S<number>_ tag in the file name
# Outputs:
#   Progress messages on stdout, error messages on stderr. Writes
#   <prefix>_S<number>_merged_R1.fq.gz and <prefix>_S<number>_merged_R2.fq.gz
#   into output_dir, <prefix> being the part of the first R1 file name that
#   precedes the _S<number>_ tag.
# Returns:
#   0 on success; non-zero when the sample number is empty, when no R1 or R2
#   file is found, or when decompression/compression fails.
#######################################
process_sample() (
  # The body is a subshell: `pipefail` and the variables below stay local to
  # one call and never leak into the calling shell.
  set -o pipefail
  sample=${1:-}
  if [[ -z "${sample}" ]]; then
    echo "Error: process_sample needs a sample number" >&2
    exit 1
  fi
  echo "Processing sample _S${sample}_"
  prefix=""
  for read_end in R1 R2; do
    # Collect the lane files NUL-delimited and sorted, so that paths containing
    # whitespace are kept intact
    lane_files=()
    while IFS= read -r -d '' lane_file; do
      lane_files+=("${lane_file}")
    done < <(
      find "${input_dir}" -name "*_S${sample}_*fastp_${read_end}.fq.gz" -print0 \
        | sort -z
    )
    # With no file argument the decompressor would read stdin instead
    if (( ${#lane_files[@]} == 0 )); then
      echo "Error: no ${read_end} files found for sample _S${sample}_" >&2
      exit 1
    fi
    # Get the prefix from the first R1 file: drop the directory part, then
    # everything from the _S<number>_ tag onwards
    if [[ "${read_end}" == "R1" ]]; then
      first_name=${lane_files[0]##*/}
      prefix=${first_name%%_S"${sample}"_*}
    fi
    merged="${output_dir}/${prefix}_S${sample}_merged_${read_end}.fq.gz"
    # pipefail makes a decompression error fail the whole pipeline; a partial
    # output file is removed so it cannot be mistaken for a finished merge
    if ! gzip -dc -- "${lane_files[@]}" | gzip -c > "${merged}"; then
      rm -f -- "${merged}"
      echo "Error: merging ${read_end} files failed for sample _S${sample}_" >&2
      exit 1
    fi
  done
  echo "# sample _S${sample}_ lanes were merged"
)
# Export the function and necessary variables so they're available to parallel processes
export -f process_sample
export input_dir
export output_dir
# Stop when no sample was found: an empty list would still be piped to
# GNU Parallel as one empty line (it only skips empty input when given -r),
# and the success message below would be printed although nothing was merged
if [[ -z "${samples}" ]]; then
  echo "Error: no *fastp_R1.fq.gz files with an _S<number>_ tag found in '${input_dir}'" >&2
  exit 1
fi
# Give a clear message instead of a bare "command not found"
if ! command -v parallel > /dev/null; then
  echo "Error: GNU Parallel ('parallel') is required but was not found in PATH" >&2
  exit 1
fi
# Process samples in parallel using GNU Parallel, with the specified number of jobs.
# Its exit status is checked explicitly so a failure is always reported.
if ! printf '%s\n' "${samples}" | parallel -j "${parallel_jobs}" process_sample; then
  echo "Error: merging failed for at least one sample" >&2
  exit 1
fi
echo "All samples processed successfully"