diff --git a/README.md b/README.md index 8c605a4..f575a0d 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ Description of the files: ## The Docker image and Dockerfile can be found here -[microbiomedata/bbtools:38.94](https://hub.docker.com/r/microbiomedata/bbtools) +[microbiomedata/bbtools:38.96](https://hub.docker.com/r/microbiomedata/bbtools) [microbiomedata/spades:3.15.0](https://hub.docker.com/r/microbiomedata/spades) @@ -24,26 +24,23 @@ Description of the files: 2. contig prefix for fasta header -3. output path +3. project name -4. input_interleaved (boolean) +4. resource where run the workflow -5. forwards reads fastq file (required value when input_interleaved is false, otherwise use [] ) +5. informed_by -6. reverse reads fastq file (required value when input_interleaved is false, otherwise use [] ) +6. memory (optional) ex: "jgi_metaASM.memory": "105G" -7. memory (optional) ex: "jgi_metaASM.memory": "105G" - -8. threads (optional) ex: "jgi_metaASM.threads": "16" +7. threads (optional) ex: "jgi_metaASM.threads": "16" ``` { - "jgi_metaASM.input_file":["/global/cfs/projectdirs/m3408/ficus/11809.7.220839.TCCTGAG-ACTGCAT.fastq.gz"], + "jgi_metaASM.input_file":"/global/cfs/projectdirs/m3408/ficus/11809.7.220839.TCCTGAG-ACTGCAT.fastq.gz", "jgi_metaASM.rename_contig_prefix":"503125_160870", - "jgi_metaASM.outdir":"/global/cfs/projectdirs/m3408/aim2/metagenome/assembly/ficus/503125_160870", - "jgi_metaASM.input_interleaved":true, - "jgi_metaASM.input_fq1":[], - "jgi_metaASM.input_fq2":[], + "jgi_metaASM.proj":"nmdc:503125_160870", + "jgi_metaASM.resource": "NERSC -- perlmutter", + "jgi_metaASM.informed_by": "nmdc:xxxxxx", "jgi_metaASM.memory": "105G", "jgi_metaASM.threads": "16" } diff --git a/input.json b/input.json index 8ea0673..f291ba3 100755 --- a/input.json +++ b/input.json @@ -1,8 +1,9 @@ { "jgi_metaASM.input_file":["/global/cfs/projectdirs/m3408/ficus/11809.7.220839.TCCTGAG-ACTGCAT.fastq.gz"], "jgi_metaASM.rename_contig_prefix":"503125_160870", - "jgi_metaASM.input_interleaved":true, - "jgi_metaASM.input_fq1":[], - "jgi_metaASM.input_fq2":[], - "jgi_metaASM.outdir":"/global/cfs/projectdirs/m3408/aim2/metagenome/assembly/ficus/503125_160870" + "jgi_metaASM.proj":"nmdc:503125_160870", + "jgi_metaASM.resource": "NERSC -- perlmutter", + "jgi_metaASM.informed_by": "nmdc:xxxxxx", + "jgi_metaASM.memory": "105G", + "jgi_metaASM.threads": "16" } diff --git a/jgi_assembly.wdl b/jgi_assembly.wdl index 377ec20..9225ded 100755 --- a/jgi_assembly.wdl +++ b/jgi_assembly.wdl @@ -34,13 +34,16 @@ workflow jgi_metaASM { input: reads=stage.assembly_input, ref=create_agp.outcontigs, container=bbtools_container, memory=memory, threads=threads, paired = paired } + call make_info_file { + input: bbcms_info= bbcms.outcounts, assy_info = assy.outlog, container=bbtools_container, proj=proj + } call finish_asm { input: proj=proj, start=stage.start, git_url=git_url, url_root=url_root, - container="microbiomedata/workflowmeta:1.1.0", + container="microbiomedata/workflowmeta:1.1.1", informed_by=informed_by, resource=resource, input_file=input_file, @@ -62,6 +65,7 @@ workflow jgi_metaASM { File covstats=finish_asm.outcovstats File asmstats=finish_asm.outasmstats File objects=finish_asm.objects + File asminfo=make_info_file.asminfo } meta { @@ -105,8 +109,38 @@ task stage { } } +task make_info_file { + File assy_info + File bbcms_info + String proj + String prefix=sub(proj, ":", "_") + String container + + command<<< + bbtools_version=`grep BBToolsVer ${bbcms_info}| awk '{print $2}' | sed -e 's/"//g' -e 's/,//' ` + spades_version=`grep 'SPAdes version' ${assy_info} | awk '{print $3}'` + echo -e "The workflow takes paired-end reads runs error correction by bbcms.sh (BBTools(1) version $bbtools_version)." > ${prefix}_metaAsm.info + echo -e "The clean reads are assembled by metaSpades(2) version $spades_version with parameters, --only-assembler -k 33,55,77,99,127 --meta" >> ${prefix}_metaAsm.info + echo -e "After assembly, Contigs and Scaffolds are consumed by the *create_agp* task to rename the FASTA header and generate an AGP format (https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Specification/) file which describes the assembly" >> ${prefix}_metaAsm.info + echo -e "In the end, the reads are mapped back to contigs by bbmap (BBTools(1) version $bbtools_version) for coverage information." >> ${prefix}_metaAsm.info + + echo -e "\n(1) B. Bushnell: BBTools software package, http://bbtools.jgi.doe.gov/" >> ${prefix}_metaAsm.info + echo -e "(2) Nurk S, Meleshko D, Korobeynikov A, Pevzner PA. metaSPAdes: a new versatile metagenomic assembler. Genome Res. 2017 May;27(5):824-834." >> ${prefix}_metaAsm.info + >>> + + output { + File asminfo = "${prefix}_metaAsm.info" + } + runtime { + memory: "1 GiB" + cpu: 1 + maxRetries: 1 + docker: container + } +} + task finish_asm { - String input_file + Array[File] input_file File fasta File scaffold File? agp @@ -152,34 +186,35 @@ task finish_asm { /scripts/generate_object_json.py \ - --type "nmdc:MetagenomeAssembly" \ - --set metagenome_assembly_set \ - --part ${proj} \ + --type "nmdc:MetagenomeAssembly" \ + --set metagenome_assembly_set \ + --part ${proj} \ -p "name=Metagenome Assembly Activity for ${proj}" \ was_informed_by=${informed_by} \ started_at_time=${start} \ ended_at_time=$end \ execution_resource=${resource} \ git_url=${git_url} \ + version="v1.0.3-beta" \ --url ${url_root}${proj}/assembly/ \ --extra stats.json \ - --inputs ${input_file} \ + --inputs ${input_file[0]} ${input_file[1]} \ --outputs \ - ${prefix}_contigs.fna "Final assembly contigs fasta" "Assembly Contigs"\ - ${prefix}_scaffolds.fna "Final assembly scaffolds fasta" "Assembly Scaffolds"\ - ${prefix}_covstats.txt "Assembled contigs coverage information" "Assembly Coverage Stats"\ - ${prefix}_assembly.agp "An AGP format file that describes the assembly" "Assembly AGP"\ - ${prefix}_pairedMapped_sorted.bam "Sorted bam file of reads mapping back to the final assembly" "Assembly Coverage BAM" + ${prefix}_contigs.fna "Final assembly contigs fasta" "Assembly Contigs" "Assembly contigs for ${proj}" \ + ${prefix}_scaffolds.fna "Final assembly scaffolds fasta" "Assembly Scaffolds" "Assembly scaffolds for ${proj}" \ + ${prefix}_covstats.txt "Assembled contigs coverage information" "Assembly Coverage Stats" "Coverage Stats for ${proj}" \ + ${prefix}_assembly.agp "An AGP format file that describes the assembly" "Assembly AGP" "AGP for ${proj}" \ + ${prefix}_pairedMapped_sorted.bam "Sorted bam file of reads mapping back to the final assembly" "Assembly Coverage BAM" "Sorted Bam for ${proj}" >>> output { - File outcontigs = "${prefix}_contigs.fna" - File outscaffolds = "${prefix}_scaffolds.fna" + File outcontigs = "${prefix}_contigs.fna" + File outscaffolds = "${prefix}_scaffolds.fna" File outagp = "${prefix}_assembly.agp" - File outbam = "${prefix}_pairedMapped_sorted.bam" - File outsamgz = "${prefix}_pairedMapped.sam.gz" - File outcovstats = "${prefix}_covstats.txt" - File outasmstats = "stats.json" + File outbam = "${prefix}_pairedMapped_sorted.bam" + File outsamgz = "${prefix}_pairedMapped.sam.gz" + File outcovstats = "${prefix}_covstats.txt" + File outasmstats = "stats.json" File objects = "objects.json" } @@ -208,28 +243,28 @@ task make_output{ String asmstats_name=basename(contigs) String container - command{ - if [ ! -z ${outdir} ]; then - mkdir -p ${outdir} - cp ${contigs} ${scaffolds} ${agp} ${bam} \ - ${samgz} ${covstats} ${asmstats} ${outdir} - chmod 764 -R ${outdir} - fi - } - runtime { + command{ + if [ ! -z ${outdir} ]; then + mkdir -p ${outdir} + cp ${contigs} ${scaffolds} ${agp} ${bam} \ + ${samgz} ${covstats} ${asmstats} ${outdir} + chmod 764 -R ${outdir} + fi + } + runtime { docker: container - memory: "1 GiB" - cpu: 1 - } - output{ - File? outcontigs = "${outdir}/${contigs_name}" - File? outscaffolds = "${outdir}/${scaffolds_name}" - File? outagp = "${outdir}/${agp_name}" - File? outbam = "${outdir}/${bam_name}" - File? outsamgz = "${outdir}/${samgz_name}" - File? outcovstats = "${outdir}/${covstats_name}" - File? outasmstats = "${outdir}/${asmstats_name}" - } + memory: "1 GiB" + cpu: 1 + } + output{ + File? outcontigs = "${outdir}/${contigs_name}" + File? outscaffolds = "${outdir}/${scaffolds_name}" + File? outagp = "${outdir}/${agp_name}" + File? outbam = "${outdir}/${bam_name}" + File? outsamgz = "${outdir}/${samgz_name}" + File? outcovstats = "${outdir}/${covstats_name}" + File? outasmstats = "${outdir}/${asmstats_name}" + } } task read_mapping_pairs{ @@ -252,8 +287,8 @@ task read_mapping_pairs{ runtime { docker: container memory: "120 GiB" - cpu: 16 - maxRetries: 1 + cpu: 16 + maxRetries: 1 } command{ set -eo pipefail @@ -269,7 +304,7 @@ task read_mapping_pairs{ samtools sort -m100M -@ ${jvm_threads} ${filename_unsorted} -o ${filename_sorted} samtools index ${filename_sorted} reformat.sh -Xmx${default="105G" memory} in=${filename_unsorted} out=${filename_outsam} overwrite=true - ln ${filename_cov} mapping_stats.txt + ln ${filename_cov} mapping_stats.txt rm $mapping_input } output{ @@ -293,7 +328,7 @@ task create_agp { runtime { docker: container memory: "120 GiB" - cpu: 16 + cpu: 16 } command{ fungalrelease.sh -Xmx${default="105G" memory} in=${scaffolds_in} out=${filename_scaffolds} outc=${filename_contigs} agp=${filename_agp} legend=${filename_legend} mincontig=200 minscaf=200 sortscaffolds=t sortcontigs=t overwrite=t @@ -304,10 +339,10 @@ task create_agp { sed -i 's/l_gt50k/l_gt50K/g' stats.json } output{ - File outcontigs = filename_contigs - File outscaffolds = filename_scaffolds - File outagp = filename_agp - File outstats = "stats.json" + File outcontigs = filename_contigs + File outscaffolds = filename_scaffolds + File outagp = filename_agp + File outstats = "stats.json" File outlegend = filename_legend } } @@ -326,7 +361,7 @@ task assy { runtime { docker: container memory: "120 GiB" - cpu: 16 + cpu: 16 } command{ set -eo pipefail @@ -359,7 +394,7 @@ task bbcms { runtime { docker: container memory: "120 GiB" - cpu: 16 + cpu: 16 } command { @@ -368,7 +403,7 @@ task bbcms { cat ${sep=" " input_files} > infile.fastq.gz export bbcms_input="infile.fastq.gz" fi - if file --mime -b ${input_files[0]} | grep plain; then + if file --mime -b ${input_files[0]} | grep plain; then cat ${sep=" " input_files} > infile.fastq export bbcms_input="infile.fastq" fi @@ -378,6 +413,7 @@ task bbcms { fi readlength.sh -Xmx${default="105G" memory} in=${filename_outfile} out=${filename_readlen} rm $bbcms_input + } output { File out = filename_outfile @@ -388,6 +424,7 @@ task bbcms { File stderr = filename_errlog File outcounts = filename_counts File outkmer = filename_kmerfile + } }