diff --git a/jgi_assembly.wdl b/jgi_assembly.wdl index 7cc3091..7720971 100755 --- a/jgi_assembly.wdl +++ b/jgi_assembly.wdl @@ -1,70 +1,69 @@ workflow jgi_metaASM { - Array[File] input_file - String outdir - String? threads + String? outdir String? memory + String? threads + String input_file + String proj + String resource + String informed_by String rename_contig_prefix="scaffold" - #String bbtools_container="microbiomedata/bbtools:38.94" - String bbtools_container="microbiomedata/bbtools@sha256:b433db110ef6cdcac4d236afabff95bfe153228063f5d9234306e78657ddbe36" - #String spades_container="microbiomedata/spades:3.15.0" - String spades_container="microbiomedata/spades@sha256:1d94ec809bcb52cd4560de0993d14f24c38e1f88facc7cbb2aba66723c20fd13" + Float uniquekmer=1000 + String? git_url="https://github.com/microbiomedata/mg_annotation/releases/tag/0.1" + String? url_root="https://data.microbiomedata.org/data/" + String bbtools_container="microbiomedata/bbtools:38.96" + String spades_container="microbiomedata/spades:3.15.0" Boolean paired = true + + call stage { + input: + container=bbtools_container, + input_file=input_file + } + + call bbcms { - input: input_files=input_file, container=bbtools_container, memory = memory, paired = paired + input: input_files=stage.assembly_input, container=bbtools_container, memory=memory, paired = paired } call assy { - input: infile1=bbcms.out1, infile2=bbcms.out2, container=spades_container, paired = paired + input: infile1=bbcms.out1, infile2=bbcms.out2, container=spades_container, threads=threads, paired = paired } call create_agp { - input: scaffolds_in=assy.out, container=bbtools_container, memory = memory, rename_contig_prefix = rename_contig_prefix + input: scaffolds_in=assy.out, container=bbtools_container, rename_contig_prefix = rename_contig_prefix, memory=memory } call read_mapping_pairs { - input: reads=input_file, ref=create_agp.outcontigs, container=bbtools_container, memory = memory, threads = threads, paired = paired + input: reads=input_file, ref=create_agp.outcontigs, container=bbtools_container, memory=memory, threads=threads, paired = paired } - if (defined(outdir)) { - call make_output { - input: outdir=outdir, - contigs=create_agp.outcontigs, - scaffolds=create_agp.outscaffolds, - agp=create_agp.outagp, - bam=read_mapping_pairs.outbamfile, - samgz=read_mapping_pairs.outsamfile, - covstats=read_mapping_pairs.outcovfile, - asmstats=create_agp.outstats, - container=bbtools_container - } + + call finish_asm { + input: + proj=proj, + start=stage.start, + git_url=git_url, + url_root=url_root, + container="microbiomedata/workflowmeta:1.1.0", + informed_by=informed_by, + resource=resource, + input_file=input_file, + fasta=create_agp.outcontigs, + scaffold=create_agp.outscaffolds, + agp=create_agp.outagp, + bam=read_mapping_pairs.outbamfile, + samgz=read_mapping_pairs.outsamfile, + covstats=read_mapping_pairs.outcovfile, + asmstats=create_agp.outstats } + output { - File contig = create_agp.outcontigs - File scaffold = create_agp.outscaffolds - File agp=create_agp.outagp - File bam=read_mapping_pairs.outbamfile - File samgz=read_mapping_pairs.outsamfile - File covstats=read_mapping_pairs.outcovfile - File asmstats=create_agp.outstats - File? final_contig = make_output.outcontigs - File? final_scaffold = make_output.outscaffolds - File? final_agp = make_output.outagp - File? final_covstat = make_output.outcovstats - File? final_samgz = make_output.outsamgz - File? final_bam = make_output.outbam - File? final_asmstat = make_output.outasmstats + File contig=finish_asm.outcontigs + File scaffold=finish_asm.outscaffolds + File agp=finish_asm.outagp + File bam=finish_asm.outbam + File samgz=finish_asm.outsamgz + File covstats=finish_asm.outcovstats + File asmstats=finish_asm.outasmstats + File objects=finish_asm.objects } - parameter_meta{ - input_file: "illumina paired-end interleaved fastq files" - outdir: "the final output directory path" - rename_contig_prefix: "contig prefix for fasta header, default: scaffold" - final_contig: "assembled contigs fasta file" - final_scaffold: "assembled scaffold fasta file" - final_agp: "assembled AGP file" - final_covstat: "contig coverage stats file" - final_samgz: "reads aligned to contigs sam file with gz compressed" - final_bam: "reads aligned to contigs bam file" - final_asmstat: "assembled scaffold/contigs statistical numbers" - # memory: "optional for jvm memory for bbtools, ex: 32G" - # threads: "optional for jvm/spades threads for bbtools ex: 16" - } - + meta { author: "Chienchi Lo, B10, LANL" email: "chienchi@lanl.gov" @@ -73,139 +72,171 @@ workflow jgi_metaASM { } -task bbcms { - Array[File] input_files - String container - String? memory - Boolean paired = true +task stage { + String container + String input_file + String? memory = "4G" + String target = "staged.fastq.gz" + String output1 = "input.left.fastq.gz" + String output2 = "input.right.fastq.gz" - String filename_outfile="input.corr.fastq.gz" - String filename_outfile1="input.corr.left.fastq.gz" - String filename_outfile2="input.corr.right.fastq.gz" - String filename_readlen="readlen.txt" - String filename_outlog="stdout.log" - String filename_errlog="stderr.log" - String filename_kmerfile="unique31mer.txt" - String filename_counts="counts.metadata.json" + command <<< + set -e + if [ $( echo ${input_file}|egrep -c "https*:") -gt 0 ] ; then + wget ${input_file} -O ${target} + else + ln ${input_file} ${target} || cp ${input_file} ${target} + fi - runtime { - docker: container - time: "12:00:00" - memory: "115G" - cpu: 16 - } + reformat.sh -Xmx${default="10G" memory} in=${target} out1=${output1} out2=${output2} + # Capture the start time + date --iso-8601=seconds > start.txt - command { - set -eo pipefail - if file --mime -b ${input_files[0]} | grep gzip; then - cat ${sep=" " input_files} > infile.fastq.gz - export bbcms_input="infile.fastq.gz" - fi + >>> - if file --mime -b ${input_files[0]} | grep plain; then - cat ${sep=" " input_files} > infile.fastq - export bbcms_input="infile.fastq" - fi + output{ + Array[File] assembly_input = [output1, output2] + String start = read_string("start.txt") + } + runtime { + cpu: 2 + maxRetries: 1 + docker: container + } +} - bbcms.sh -Xmx${default="105G" memory} metadatafile=${filename_counts} mincount=2 highcountfraction=0.6 in=$bbcms_input out=${filename_outfile} > >(tee -a ${filename_outlog}) 2> >(tee -a ${filename_errlog} >&2) && grep Unique ${filename_errlog} | rev | cut -f 1 | rev > ${filename_kmerfile} +task finish_asm { + String input_file + File fasta + File scaffold + File? agp + File bam + File? samgz + File? covstats + File asmstats + String container + String git_url + String informed_by + String proj + String prefix=sub(proj, ":", "_") + String orig_prefix="scaffold" + String sed="s/${orig_prefix}_/${proj}_/g" + String resource + String url_root + String start + + command<<< - if ${paired}; then - reformat.sh -Xmx${default="105G" memory} in=${filename_outfile} out1=${filename_outfile1} out2=${filename_outfile2} - fi - readlength.sh -Xmx${default="105G" memory} in=${filename_outfile} out=${filename_readlen} - rm $bbcms_input - } + set -e + end=`date --iso-8601=seconds` + ln ${fasta} ${prefix}_contigs.fna + ln ${scaffold} ${prefix}_scaffolds.fna + ln ${covstats} ${prefix}_covstats.txt + ln ${agp} ${prefix}_assembly.agp - output { - File out = filename_outfile - File out1 = if paired then filename_outfile1 else filename_outfile - File out2 = if paired then filename_outfile2 else filename_outfile - File outreadlen = filename_readlen - File stdout = filename_outlog - File stderr = filename_errlog - File outcounts = filename_counts - File outkmer = filename_kmerfile - } -} + ## Bam file + samtools view -h ${bam} | sed ${sed} | \ + samtools view -hb -o ${prefix}_pairedMapped_sorted.bam + ## Sam.gz file + samtools view -h ${samgz} | sed ${sed} | \ + gzip -c - > ${prefix}_pairedMapped.sam.gz -task assy { - File infile1 - File infile2 - String container - String? threads - String outprefix="spades3" - String filename_outfile="${outprefix}/scaffolds.fasta" - String filename_spadeslog ="${outprefix}/spades.log" - String system_cpu="$(grep \"model name\" /proc/cpuinfo | wc -l)" - String spades_cpu=select_first([threads,system_cpu]) - Boolean paired = true + # Remove an extra field from the stats + cat ${asmstats} |jq 'del(.filename)' > stats.json - runtime { - docker: container - time: "12:00:00" - memory: "115G" - cpu: 16 - } - command{ - set -eo pipefail - if ${paired}; then - spades.py -m 2000 -o ${outprefix} --only-assembler -k 33,55,77,99,127 --meta -t ${spades_cpu} -1 ${infile1} -2 ${infile2} - else - spades.py -m 2000 -o ${outprefix} --only-assembler -k 33,55,77,99,127 -t ${spades_cpu} -s ${infile1} - fi - } - output { - File out = filename_outfile - File outlog = filename_spadeslog - } -} + /scripts/generate_object_json.py \ + --type "nmdc:MetagenomeAssembly" \ + --set metagenome_assembly_set \ + --part ${proj} \ + -p "name=Metagenome Assembly Activity for ${proj}" \ + was_informed_by=${informed_by} \ + started_at_time=${start} \ + ended_at_time=$end \ + execution_resource=${resource} \ + git_url=${git_url} \ + --url ${url_root}${proj}/assembly/ \ + --extra stats.json \ + --inputs ${input_file} \ + --outputs \ + ${prefix}_contigs.fna "Final assembly contigs fasta" "Assembly Contigs"\ + ${prefix}_scaffolds.fna "Final assembly scaffolds fasta" "Assembly Scaffolds"\ + ${prefix}_covstats.txt "Assembled contigs coverage information" "Assembly Coverage Stats"\ + ${prefix}_assembly.agp "An AGP format file that describes the assembly" "Assembly AGP"\ + ${prefix}_pairedMapped_sorted.bam "Sorted bam file of reads mapping back to the final assembly" "Assembly Coverage BAM" -task create_agp { - File scaffolds_in - String? memory - String container - String rename_contig_prefix - String prefix="assembly" - String filename_contigs="${prefix}_contigs.fna" - String filename_scaffolds="${prefix}_scaffolds.fna" - String filename_agp="${prefix}.agp" - String filename_legend="${prefix}_scaffolds.legend" + >>> + output { + File outcontigs = "${prefix}_contigs.fna" + File outscaffolds = "${prefix}_scaffolds.fna" + File outagp = "${prefix}_assembly.agp" + File outbam = "${prefix}_pairedMapped_sorted.bam" + File outsamgz = "${prefix}_pairedMapped.sam.gz" + File outcovstats = "${prefix}_covstats.txt" + File outasmstats = "stats.json" + File objects = "objects.json" + } runtime { - docker: container - time: "12:00:00" - memory: "115G" - cpu: 16 + docker: container + memory: "1 GiB" + cpu: 1 } +} - command{ - fungalrelease.sh -Xmx${default="105G" memory} in=${scaffolds_in} out=${filename_scaffolds} outc=${filename_contigs} agp=${filename_agp} legend=${filename_legend} mincontig=200 minscaf=200 sortscaffolds=t sortcontigs=t overwrite=t - if [ "${rename_contig_prefix}" != "scaffold" ]; then - sed -i 's/scaffold/${rename_contig_prefix}_scf/g' ${filename_contigs} ${filename_scaffolds} ${filename_agp} ${filename_legend} - fi - bbstats.sh format=8 in=${filename_scaffolds} out=stats.json - sed -i 's/l_gt50k/l_gt50K/g' stats.json - } - output{ - File outcontigs = filename_contigs - File outscaffolds = filename_scaffolds - File outagp = filename_agp - File outstats = "stats.json" - File outlegend = filename_legend - } +task make_output{ + String outdir + File contigs + File scaffolds + File agp + File bam + File samgz + File covstats + File asmstats + String contigs_name=basename(contigs) + String scaffolds_name=basename(contigs) + String agp_name=basename(contigs) + String bam_name=basename(contigs) + String samgz_name=basename(contigs) + String covstats_name=basename(contigs) + String asmstats_name=basename(contigs) + String container + + command{ + if [ ! -z ${outdir} ]; then + mkdir -p ${outdir} + cp ${contigs} ${scaffolds} ${agp} ${bam} \ + ${samgz} ${covstats} ${asmstats} ${outdir} + chmod 764 -R ${outdir} + fi + } + runtime { + docker: container + memory: "1 GiB" + cpu: 1 + } + output{ + File? outcontigs = "${outdir}/${contigs_name}" + File? outscaffolds = "${outdir}/${scaffolds_name}" + File? outagp = "${outdir}/${agp_name}" + File? outbam = "${outdir}/${bam_name}" + File? outsamgz = "${outdir}/${samgz_name}" + File? outcovstats = "${outdir}/${covstats_name}" + File? outasmstats = "${outdir}/${asmstats_name}" + } } task read_mapping_pairs{ Array[File] reads File ref String container - String? threads String? memory + String? threads Boolean paired = true String bbmap_interleaved_flag = if paired then 'interleaved=true' else 'interleaved=false' + String filename_resources="resources.log" String filename_unsorted="pairedMapped.bam" String filename_outsam="pairedMapped.sam.gz" String filename_sorted="pairedMapped_sorted.bam" @@ -214,15 +245,18 @@ task read_mapping_pairs{ String filename_cov="covstats.txt" String system_cpu="$(grep \"model name\" /proc/cpuinfo | wc -l)" String jvm_threads=select_first([threads,system_cpu]) - runtime { - docker: container - time: "12:00:00" - memory: "115G" - cpu: 16 - } - + docker: container + memory: "120 GiB" + cpu: 16 + maxRetries: 1 + } command{ + echo $(curl --fail --max-time 10 --silent http://169.254.169.254/latest/meta-data/public-hostname) + touch ${filename_resources}; + curl --fail --max-time 10 --silent https://bitbucket.org/berkeleylab/jgi-meta/get/master.tar.gz | tar --wildcards -zxvf - "*/bin/resources.bash" && ./*/bin/resources.bash > ${filename_resources} & + sleep 30 + export TIME="time result\ncmd:%C\nreal %es\nuser %Us \nsys %Ss \nmemory:%MKB \ncpu %P" set -eo pipefail if [[ ${reads[0]} == *.gz ]] ; then cat ${sep=" " reads} > infile.fastq.gz @@ -244,50 +278,139 @@ task read_mapping_pairs{ File outbamfileidx = filename_sorted_idx File outcovfile = filename_cov File outsamfile = filename_outsam + File outresources = filename_resources } } -task make_output{ - String outdir - File contigs - File scaffolds - File agp - File bam - File samgz - File covstats - File asmstats - String contigs_name=basename(contigs) - String scaffolds_name=basename(contigs) - String agp_name=basename(contigs) - String bam_name=basename(contigs) - String samgz_name=basename(contigs) - String covstats_name=basename(contigs) - String asmstats_name=basename(contigs) - String container - - command{ - if [ -n ${outdir} ]; then - mkdir -p ${outdir} - cp ${contigs} ${scaffolds} ${agp} ${bam} \ - ${samgz} ${covstats} ${asmstats} ${outdir} - chmod 764 -R ${outdir} +task create_agp { + File scaffolds_in + String? memory + String container + String rename_contig_prefix + String filename_resources="resources.log" + String prefix="assembly" + String filename_contigs="${prefix}_contigs.fna" + String filename_scaffolds="${prefix}_scaffolds.fna" + String filename_agp="${prefix}.agp" + String filename_legend="${prefix}_scaffolds.legend" + runtime { + docker: container + memory: "120 GiB" + cpu: 16 + } + command{ + echo $(curl --fail --max-time 10 --silent http://169.254.169.254/latest/meta-data/public-hostname) + touch ${filename_resources}; + curl --fail --max-time 10 --silent https://bitbucket.org/berkeleylab/jgi-meta/get/master.tar.gz | tar --wildcards -zxvf - "*/bin/resources.bash" && ./*/bin/resources.bash > ${filename_resources} & + sleep 30 + export TIME="time result\ncmd:%C\nreal %es\nuser %Us \nsys %Ss \nmemory:%MKB \ncpu %P" + fungalrelease.sh -Xmx${default="105G" memory} in=${scaffolds_in} out=${filename_scaffolds} outc=${filename_contigs} agp=${filename_agp} legend=${filename_legend} mincontig=200 minscaf=200 sortscaffolds=t sortcontigs=t overwrite=t + if [ "${rename_contig_prefix}" != "scaffold" ]; then + sed -i 's/scaffold/${rename_contig_prefix}_scf/g' ${filename_contigs} ${filename_scaffolds} ${filename_agp} ${filename_legend} fi + bbstats.sh format=8 in=${filename_scaffolds} out=stats.json + sed -i 's/l_gt50k/l_gt50K/g' stats.json + } + output{ + File outcontigs = filename_contigs + File outscaffolds = filename_scaffolds + File outagp = filename_agp + File outstats = "stats.json" + File outlegend = filename_legend + File outresources = filename_resources } +} - runtime { - docker: container - time: "12:00:00" - memory: "115G" - cpu: 16 - } +task assy { + File infile1 + File infile2 + String container + String? threads + String filename_resources="resources.log" + String outprefix="spades3" + String filename_outfile="${outprefix}/scaffolds.fasta" + String filename_spadeslog ="${outprefix}/spades.log" + String system_cpu="$(grep \"model name\" /proc/cpuinfo | wc -l)" + String spades_cpu=select_first([threads,system_cpu]) + Boolean paired = true + runtime { + docker: container + memory: "120 GiB" + cpu: 16 + } + command{ + echo $(curl --fail --max-time 10 --silent http://169.254.169.254/latest/meta-data/public-hostname) + touch ${filename_resources}; + curl --fail --max-time 10 --silent https://bitbucket.org/berkeleylab/jgi-meta/get/master.tar.gz | tar --wildcards -zxvf - "*/bin/resources.bash" && ./*/bin/resources.bash > ${filename_resources} & + sleep 30 + export TIME="time result\ncmd:%C\nreal %es\nuser %Us \nsys %Ss \nmemory:%MKB \ncpu %P" + set -eo pipefail + if ${paired}; then + spades.py -m 2000 -o ${outprefix} --only-assembler -k 33,55,77,99,127 --meta -t ${spades_cpu} -1 ${infile1} -2 ${infile2} + else + spades.py -m 2000 -o ${outprefix} --only-assembler -k 33,55,77,99,127 -t ${spades_cpu} -s ${infile1} + fi + } + output { + File out = filename_outfile + File outlog = filename_spadeslog + File outresources = filename_resources + } +} - output{ - File? outcontigs = "${outdir}/${contigs_name}" - File? outscaffolds = "${outdir}/${scaffolds_name}" - File? outagp = "${outdir}/${agp_name}" - File? outbam = "${outdir}/${bam_name}" - File? outsamgz = "${outdir}/${samgz_name}" - File? outcovstats = "${outdir}/${covstats_name}" - File? outasmstats = "${outdir}/${asmstats_name}" - } +task bbcms { + Array[File] input_files + String container + String? memory + Boolean paired = true + + String filename_resources="resources.log" + String filename_outfile="input.corr.fastq.gz" + String filename_outfile1="input.corr.left.fastq.gz" + String filename_outfile2="input.corr.right.fastq.gz" + String filename_readlen="readlen.txt" + String filename_outlog="stdout.log" + String filename_errlog="stderr.log" + String filename_kmerfile="unique31mer.txt" + String filename_counts="counts.metadata.json" + runtime { + docker: container + memory: "120 GiB" + cpu: 16 + } + + command { + echo $(curl --fail --max-time 10 --silent http://169.254.169.254/latest/meta-data/public-hostname) + touch ${filename_resources}; + curl --fail --max-time 10 --silent https://bitbucket.org/berkeleylab/jgi-meta/get/master.tar.gz | tar --wildcards -zxvf - "*/bin/resources.bash" && ./*/bin/resources.bash > ${filename_resources} & + sleep 30 + export TIME="time result\ncmd:%C\nreal %es\nuser %Us \nsys %Ss \nmemory:%MKB \ncpu %P" + set -eo pipefail + if file --mime -b ${input_files[0]} | grep gzip; then + cat ${sep=" " input_files} > infile.fastq.gz + export bbcms_input="infile.fastq.gz" + fi + if file --mime -b ${input_files[0]} | grep plain; then + cat ${sep=" " input_files} > infile.fastq + export bbcms_input="infile.fastq" + fi + bbcms.sh -Xmx${default="105G" memory} metadatafile=${filename_counts} mincount=2 highcountfraction=0.6 in=$bbcms_input out=${filename_outfile} > >(tee -a ${filename_outlog}) 2> >(tee -a ${filename_errlog} >&2) && grep Unique ${filename_errlog} | rev | cut -f 1 | rev > ${filename_kmerfile} + if ${paired}; then + reformat.sh -Xmx${default="105G" memory} in=${filename_outfile} out1=${filename_outfile1} out2=${filename_outfile2} + fi + readlength.sh -Xmx${default="105G" memory} in=${filename_outfile} out=${filename_readlen} + rm $bbcms_input + } + output { + File out = filename_outfile + File out1 = if paired then filename_outfile1 else filename_outfile + File out2 = if paired then filename_outfile2 else filename_outfile + File outreadlen = filename_readlen + File stdout = filename_outlog + File stderr = filename_errlog + File outcounts = filename_counts + File outkmer = filename_kmerfile + File outresources = filename_resources + } } +