Skip to content

Commit

Permalink
v1.0.1
Browse files Browse the repository at this point in the history
  • Loading branch information
Serka-M authored Jun 29, 2023
1 parent 0a8fa1d commit 6dfb454
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 42 deletions.
33 changes: 18 additions & 15 deletions src/ENAdumper
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,15 @@

# Pre-set default settings
set -eo pipefail
version=1.0
version=1.0.1
mode="OFF"
script=$(dirname -- "$BASH_SOURCE")
config=$script/config.yaml

# Default process/batch count: second space-separated field of the "splits" line in the config file.
# "$config" is quoted so that an installation path containing spaces is passed to grep as one file name;
# tr strips carriage returns in case the config file uses CRLF line endings.
splits=$(grep splits "$config" | cut -d" " -f2 | tr -d '\r')
fast5=$(grep fast5 $config | cut -d" " -f2 | tr -d '\r')
# Defaults read from the config file: each value is the second space-separated field of the
# line matching the key name, with carriage returns removed (CRLF-safe).
# "$config" is quoted so that an installation path containing spaces does not split into several grep arguments.
np_dir=$(grep np_dir "$config" | cut -d" " -f2 | tr -d '\r')
fastq=$(grep fastq "$config" | cut -d" " -f2 | tr -d '\r')
extension=$(grep extension "$config" | cut -d" " -f2 | tr -d '\r')
sample=$(grep sample "$config" | cut -d" " -f2 | tr -d '\r')

webin_user=$(grep webin_user $config | cut -d" " -f2 | tr -d '\r')
Expand All @@ -34,9 +35,10 @@ For issues or feedback please use https://github.com/Serka-M/ENAdumper/issues or
COMPRESSION INPUTS:
-fq --fastq_list Path to list with FastQ files (default:$fastq)
-np --fast5_dir Path to directory with Fast5 files (default:$fast5)
-np --nanopore_dir Path to directory with raw Nanopore files (default:$np_dir)
-n --sample Sample name (default:$sample)
-p --processes Number of processes and fast5 batch count (default:$splits)
-p --processes Number of processes and batch count (default:$splits)
-x --extension Filename extension for raw Nanopore data (default:$extension)
UPLOAD INPUTS:
-user --webin_user Webin username (default:$webin_user)
Expand All @@ -62,14 +64,15 @@ while test $# -gt 0; do
case "$1" in
-h | --help) printf -- "$usage_text"; exit 1;;
-v | --version) printf -- "ENAdumper, v$version\n"; exit 1;;
-np | --fast5_dir) shift; fast5=$1 && mode="fast5" && snakefile=$script/snake_np; shift;;
-np | --nanopore_dir) shift; np_dir=$1 && mode="Nanopore" && snakefile=$script/snake_np; shift;;
-fq | --fastq_list) shift; fastq=$1 && mode="fastq" && snakefile=$script/snake_fq; shift;;
-n | --sample) shift; sample=$1; shift;;
-s | --study) shift; study=$1; shift;;
-k | --key) shift; key=$1; shift;;
-user | --webin_user) shift; webin_user=$1; shift;;
-pass | --webin_pass) shift; webin_pass=$1; shift;;
-p | --processes) shift; splits=$1; shift;;
-x | --extension) shift; extension=$1; shift;;
-ins | --instrument_model) shift; instrument_model=$1; shift;;
-lib_n | --library_name) shift; library_name=$1; shift;;
-lib_src | --library_source) shift; library_source=$1; shift;;
Expand All @@ -83,15 +86,15 @@ while test $# -gt 0; do
done

# Check mode type
if [ $mode == "OFF" ]; then echo "Input files not provided. Exitting..." && exit 1; fi
if [ ! $fast5 == "fast5" ] && [ ! $fastq == "fastq" ]; then echo "Multiple input files provided. Exitting..." && exit 1; fi
# No input option was parsed: $mode still holds its initial "OFF" value.
if [ "$mode" == "OFF" ]; then echo "Input files not provided. Aborting..." && exit 1; fi
# Both inputs differ from their placeholder values ("Nanopore" / "fastq"), i.e. both were supplied.
# Operands are quoted: an unquoted path containing spaces makes `[` fail with "too many arguments",
# which the `if` treats as false and would silently skip this check.
if [ "$np_dir" != "Nanopore" ] && [ "$fastq" != "fastq" ]; then echo "Multiple input files provided. Aborting..." && exit 1; fi

# Checks for fast5 transfer mode
if [ $mode == "fast5" ]; then
# Turn relative path into absolute for fast5 input
if [[ ! "$fast5" = /* ]]; then fast5="$(pwd)/$fast5"; fi
# Check if fast5 input directory exists
if [ ! -d "$fast5" ]; then echo "ERROR: fast5 directory not found at $fast5. Aborting..." && exit 1; fi; fi
# Validation specific to the raw Nanopore data transfer mode
if [ $mode == "Nanopore" ]; then
    # Anchor a relative input path to the current working directory; absolute paths are kept as given
    case "$np_dir" in
        /*) ;;
        *) np_dir="$(pwd)/$np_dir" ;;
    esac
    # Stop early when the resolved input directory does not exist
    if [ ! -d "$np_dir" ]; then
        echo "ERROR: directory for raw Nanopore data not found at $np_dir. Aborting..." && exit 1
    fi
fi

# Checks for fastq transfer mode
if [ $mode == "fastq" ]; then
Expand Down Expand Up @@ -126,8 +129,8 @@ if ! command -v basename &> /dev/null; then echo "Dependency "basename" could no
# Create the per-run output directory if it does not exist yet.
# The path is quoted so working directories or sample names containing spaces yield one directory;
# -p makes the call a no-op when the directory is already present.
mkdir -p "$(pwd)/ENAdumper_${mode}_${sample}"

# Run snakemake workflow
echo "Starting Snakemake workflow for uploading $mode files..."
echo "Starting Snakemake workflow for uploading raw $mode files..."
# The workflow runs from inside the output directory; the path is quoted so that
# working directories or sample names containing spaces do not split into several cd arguments.
cd "$(pwd)/ENAdumper_${mode}_${sample}"
snakemake --cores $splits -s $snakefile --configfile $config --config sample=$sample study=$study fast5=$fast5 fastq=$fastq splits=$splits webin_pass=$webin_pass webin_user=$webin_user key=$key instrument_model=$instrument_model library_name=$library_name library_source=$library_source library_selection=$library_selection library_strategy=$library_strategy library_layout=$library_layout
# Launch the selected Snakemake workflow, overriding config-file values with the settings
# gathered above. Every expansion is quoted so that values containing spaces or shell
# metacharacters (paths, passwords, library names) reach Snakemake as single key=value arguments.
# NOTE(review): webin_pass is passed on the command line and may be visible in process listings.
snakemake --cores "$splits" -s "$snakefile" --configfile "$config" --config \
    sample="$sample" study="$study" np_dir="$np_dir" fastq="$fastq" extension="$extension" \
    splits="$splits" webin_pass="$webin_pass" webin_user="$webin_user" key="$key" \
    instrument_model="$instrument_model" library_name="$library_name" \
    library_source="$library_source" library_selection="$library_selection" \
    library_strategy="$library_strategy" library_layout="$library_layout"
# Return to the directory the script was started from and report success
cd ..
exit 0
20 changes: 10 additions & 10 deletions src/conda.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
name: ENAdumper
channels:
- conda-forge
- bioconda
- hcc
- defaults
dependencies:
- snakemake=7.19.1
- aspera-cli=3.9.6
- git=2.34.1
# Conda environment definition for ENAdumper
name: ENAdumper
# Package channels used to resolve the dependencies below
channels:
- conda-forge
- bioconda
- hcc
- defaults
# Each dependency carries an explicit version constraint
dependencies:
- snakemake=7.26.0
- aspera-cli=3.9.6
- git=2.34.1
3 changes: 2 additions & 1 deletion src/config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
splits: 3
fast5: fast5
np_dir: Nanopore
fastq: fastq
extension: pod5
sample: SampleName
study: StudyName
webin_user: Username
Expand Down
31 changes: 15 additions & 16 deletions src/snake_np
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# DESCRIPTION: Snakemake workflow code for uploading Nanopore raw data to ENA
# DESCRIPTION: Snakemake workflow code for uploading raw Nanopore data to ENA
# AUTHOR: Mantas Sereika ([email protected])
# LICENSE: GNU General Public License

splits=config["splits"]
fast5=config["fast5"]
np_dir=config["np_dir"]
extension=config["extension"]
sample=config["sample"]
study=config["study"]
webin_user=config["webin_user"]
Expand Down Expand Up @@ -33,44 +34,42 @@ rule document:
printf "FileType\tOxfordNanopore_native\tRead submission file type\n" > {output}
printf "sample\tstudy\tinstrument_model\tlibrary_name\tlibrary_source\tlibrary_selection\tlibrary_strategy\tfile_name\tfile_md5\n" >> {output}

for file in {sample}_fast5_[0-9][0-9].tar.gz; do
for file in {sample}_{extension}_[0-9][0-9].tar.gz; do
md5=$(cat $file.md5 | sed 's/ /,/' | cut -f1 -d",")
file_name=$(cat $file.md5 | sed 's/ /,/' | cut -f2 -d",")
printf "$sample_ena\t{study}\t{instrument_model}\t$library\t{library_source}\t{library_selection}\t{library_strategy}\t$file_name\t$md5\n" >> {output}
done
"""

# Rule "split": write the paths of the raw data files located by `find` into a text
# file, then cut that list into {splits} chunks without breaking lines (split -n l/N).
# Chunk names carry a numeric suffix that starts at 1.
# NOTE(review): this span is a rendered diff, so the pre-change (fast5) and
# post-change ({extension}) lines appear side by side.
rule split:
params:
find='-name "*.fast5"'
output:
expand("{name}_fast5.txt", name=sample)
expand("{name}_{extension}.txt", name=sample, extension=extension)
shell:
"""
find {fast5} {params.find} > {output}
split -n l/{splits} --numeric-suffixes=1 -d {output} {sample}_fast5_
find {np_dir} -name "*.{extension}" > {output}
split -n l/{splits} --numeric-suffixes=1 -d {output} {sample}_{extension}_
"""

# Rule "compress": for every two-digit chunk list, build a gzip-compressed tar archive
# of the files named in it (the --transform expression strips directory components from
# member names), then write an md5 checksum file next to each archive. xargs runs up to
# {splits} tar/md5sum jobs in parallel. Only the first archive (suffix _01) is declared
# as output; the final check exits with status 1 when that file is missing.
# NOTE(review): this span is a rendered diff, so the pre-change (fast5) and
# post-change ({extension}) lines appear side by side.
rule compress:
input:
expand("{name}_fast5.txt", name=sample)
expand("{name}_{extension}.txt", name=sample, extension=extension)
output:
expand("{name}_fast5_01.tar.gz", name=sample)
expand("{name}_{extension}_01.tar.gz", name=sample, extension=extension)
shell:
"""
find {sample}_fast5_[0-9][0-9] | xargs -i --max-procs={splits} bash -c "tar -cvzf {{}}.tar.gz -T {{}} --transform 's/.*\///g' --show-transformed"
find {sample}_fast5_[0-9][0-9].tar.gz | xargs -i --max-procs={splits} bash -c "md5sum {{}} > {{}}.md5"
if [ ! -f {output} ]; then printf "Error with fast5 compression" && exit 1; fi
find {sample}_{extension}_[0-9][0-9] | xargs -i --max-procs={splits} bash -c "tar -cvzf {{}}.tar.gz -T {{}} --transform 's/.*\///g' --show-transformed"
find {sample}_{extension}_[0-9][0-9].tar.gz | xargs -i --max-procs={splits} bash -c "md5sum {{}} > {{}}.md5"
if [ ! -f {output} ]; then printf "Error with {extension} compression" && exit 1; fi
"""

# Rule "upload": export the Webin password as ASPERA_SCP_PASS, send every two-digit
# .tar.gz archive of the sample to webin.ebi.ac.uk with ascp (up to {splits} transfers
# in parallel via xargs), then write a completion timestamp to upload.txt.
# NOTE(review): {webin_pass} is inserted into the shell command unquoted; a password
# containing shell metacharacters would likely break the export — consider {webin_pass:q}.
# NOTE(review): this span is a rendered diff, so the pre-change (fast5) and
# post-change ({extension}) lines appear side by side.
rule upload:
input:
expand("{name}_fast5_01.tar.gz", name=sample)
expand("{name}_{extension}_01.tar.gz", name=sample, extension=extension)
output:
'upload.txt'
shell:
"""
export ASPERA_SCP_PASS={webin_pass}
find {sample}_fast5_[0-9][0-9].tar.gz | xargs -i --max-procs={splits} bash -c "ascp -QT -l300M -L- {{}} {webin_user}@webin.ebi.ac.uk:."
printf "Fast5 upload to ENA completed at $(date "+%Y-%m-%d %H:%M:%S")\n" > {output}
find {sample}_{extension}_[0-9][0-9].tar.gz | xargs -i --max-procs={splits} bash -c "ascp -QT -l300M -L- {{}} {webin_user}@webin.ebi.ac.uk:."
printf "{extension} upload to ENA completed at $(date "+%Y-%m-%d %H:%M:%S")\n" > {output}
"""

0 comments on commit 6dfb454

Please sign in to comment.