Skip to content

Commit

Permalink
Merge pull request #11 from microbiomedata/make_info
Browse files Browse the repository at this point in the history
Werkflow to master for ASM
  • Loading branch information
scanon authored Jun 10, 2023
2 parents 846ebf0 + 2729674 commit d1fb88e
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 66 deletions.
23 changes: 10 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Description of the files:

## The Docker image and Dockerfile can be found here

[microbiomedata/bbtools:38.94](https://hub.docker.com/r/microbiomedata/bbtools)
[microbiomedata/bbtools:38.96](https://hub.docker.com/r/microbiomedata/bbtools)

[microbiomedata/spades:3.15.0](https://hub.docker.com/r/microbiomedata/spades)

Expand All @@ -24,26 +24,23 @@ Description of the files:

2. contig prefix for fasta header

3. output path
3. project name

4. input_interleaved (boolean)
4. resource where run the workflow

5. forwards reads fastq file (required value when input_interleaved is false, otherwise use [] )
5. informed_by

6. reverse reads fastq file (required value when input_interleaved is false, otherwise use [] )
6. memory (optional) ex: "jgi_metaASM.memory": "105G"

7. memory (optional) ex: "jgi_metaASM.memory": "105G"

8. threads (optional) ex: "jgi_metaASM.threads": "16"
7. threads (optional) ex: "jgi_metaASM.threads": "16"

```
{
"jgi_metaASM.input_file":["/global/cfs/projectdirs/m3408/ficus/11809.7.220839.TCCTGAG-ACTGCAT.fastq.gz"],
"jgi_metaASM.input_file":"/global/cfs/projectdirs/m3408/ficus/11809.7.220839.TCCTGAG-ACTGCAT.fastq.gz",
"jgi_metaASM.rename_contig_prefix":"503125_160870",
"jgi_metaASM.outdir":"/global/cfs/projectdirs/m3408/aim2/metagenome/assembly/ficus/503125_160870",
"jgi_metaASM.input_interleaved":true,
"jgi_metaASM.input_fq1":[],
"jgi_metaASM.input_fq2":[],
"jgi_metaASM.proj":"nmdc:503125_160870",
"jgi_metaASM.resource": "NERSC -- perlmutter",
"jgi_metaASM.informed_by": "nmdc:xxxxxx",
"jgi_metaASM.memory": "105G",
"jgi_metaASM.threads": "16"
}
Expand Down
9 changes: 5 additions & 4 deletions input.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
{
"jgi_metaASM.input_file":["/global/cfs/projectdirs/m3408/ficus/11809.7.220839.TCCTGAG-ACTGCAT.fastq.gz"],
"jgi_metaASM.rename_contig_prefix":"503125_160870",
"jgi_metaASM.input_interleaved":true,
"jgi_metaASM.input_fq1":[],
"jgi_metaASM.input_fq2":[],
"jgi_metaASM.outdir":"/global/cfs/projectdirs/m3408/aim2/metagenome/assembly/ficus/503125_160870"
"jgi_metaASM.proj":"nmdc:503125_160870",
"jgi_metaASM.resource": "NERSC -- perlmutter",
"jgi_metaASM.informed_by": "nmdc:xxxxxx",
"jgi_metaASM.memory": "105G",
"jgi_metaASM.threads": "16"
}
135 changes: 86 additions & 49 deletions jgi_assembly.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,16 @@ workflow jgi_metaASM {
input: reads=stage.assembly_input, ref=create_agp.outcontigs, container=bbtools_container, memory=memory, threads=threads, paired = paired
}

call make_info_file {
input: bbcms_info= bbcms.outcounts, assy_info = assy.outlog, container=bbtools_container, proj=proj
}
call finish_asm {
input:
proj=proj,
start=stage.start,
git_url=git_url,
url_root=url_root,
container="microbiomedata/workflowmeta:1.1.0",
container="microbiomedata/workflowmeta:1.1.1",
informed_by=informed_by,
resource=resource,
input_file=input_file,
Expand All @@ -62,6 +65,7 @@ workflow jgi_metaASM {
File covstats=finish_asm.outcovstats
File asmstats=finish_asm.outasmstats
File objects=finish_asm.objects
File asminfo=make_info_file.asminfo
}

meta {
Expand Down Expand Up @@ -105,8 +109,38 @@ task stage {
}
}

task make_info_file {
File assy_info
File bbcms_info
String proj
String prefix=sub(proj, ":", "_")
String container

command<<<
bbtools_version=`grep BBToolsVer ${bbcms_info}| awk '{print $2}' | sed -e 's/"//g' -e 's/,//' `
spades_version=`grep 'SPAdes version' ${assy_info} | awk '{print $3}'`
echo -e "The workflow takes paired-end reads runs error correction by bbcms.sh (BBTools(1) version $bbtools_version)." > ${prefix}_metaAsm.info
echo -e "The clean reads are assembled by metaSpades(2) version $spades_version with parameters, --only-assembler -k 33,55,77,99,127 --meta" >> ${prefix}_metaAsm.info
echo -e "After assembly, Contigs and Scaffolds are consumed by the *create_agp* task to rename the FASTA header and generate an AGP format (https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Specification/) file which describes the assembly" >> ${prefix}_metaAsm.info
echo -e "In the end, the reads are mapped back to contigs by bbmap (BBTools(1) version $bbtools_version) for coverage information." >> ${prefix}_metaAsm.info
echo -e "\n(1) B. Bushnell: BBTools software package, http://bbtools.jgi.doe.gov/" >> ${prefix}_metaAsm.info
echo -e "(2) Nurk S, Meleshko D, Korobeynikov A, Pevzner PA. metaSPAdes: a new versatile metagenomic assembler. Genome Res. 2017 May;27(5):824-834." >> ${prefix}_metaAsm.info
>>>

output {
File asminfo = "${prefix}_metaAsm.info"
}
runtime {
memory: "1 GiB"
cpu: 1
maxRetries: 1
docker: container
}
}

task finish_asm {
String input_file
Array[File] input_file
File fasta
File scaffold
File? agp
Expand Down Expand Up @@ -152,34 +186,35 @@ task finish_asm {
/scripts/generate_object_json.py \
--type "nmdc:MetagenomeAssembly" \
--set metagenome_assembly_set \
--part ${proj} \
--type "nmdc:MetagenomeAssembly" \
--set metagenome_assembly_set \
--part ${proj} \
-p "name=Metagenome Assembly Activity for ${proj}" \
was_informed_by=${informed_by} \
started_at_time=${start} \
ended_at_time=$end \
execution_resource=${resource} \
git_url=${git_url} \
version="v1.0.3-beta" \
--url ${url_root}${proj}/assembly/ \
--extra stats.json \
--inputs ${input_file} \
--inputs ${input_file[0]} ${input_file[1]} \
--outputs \
${prefix}_contigs.fna "Final assembly contigs fasta" "Assembly Contigs"\
${prefix}_scaffolds.fna "Final assembly scaffolds fasta" "Assembly Scaffolds"\
${prefix}_covstats.txt "Assembled contigs coverage information" "Assembly Coverage Stats"\
${prefix}_assembly.agp "An AGP format file that describes the assembly" "Assembly AGP"\
${prefix}_pairedMapped_sorted.bam "Sorted bam file of reads mapping back to the final assembly" "Assembly Coverage BAM"
${prefix}_contigs.fna "Final assembly contigs fasta" "Assembly Contigs" "Assembly contigs for ${proj}" \
${prefix}_scaffolds.fna "Final assembly scaffolds fasta" "Assembly Scaffolds" "Assembly scaffolds for ${proj}" \
${prefix}_covstats.txt "Assembled contigs coverage information" "Assembly Coverage Stats" "Coverage Stats for ${proj}" \
${prefix}_assembly.agp "An AGP format file that describes the assembly" "Assembly AGP" "AGP for ${proj}" \
${prefix}_pairedMapped_sorted.bam "Sorted bam file of reads mapping back to the final assembly" "Assembly Coverage BAM" "Sorted Bam for ${proj}"
>>>
output {
File outcontigs = "${prefix}_contigs.fna"
File outscaffolds = "${prefix}_scaffolds.fna"
File outcontigs = "${prefix}_contigs.fna"
File outscaffolds = "${prefix}_scaffolds.fna"
File outagp = "${prefix}_assembly.agp"
File outbam = "${prefix}_pairedMapped_sorted.bam"
File outsamgz = "${prefix}_pairedMapped.sam.gz"
File outcovstats = "${prefix}_covstats.txt"
File outasmstats = "stats.json"
File outbam = "${prefix}_pairedMapped_sorted.bam"
File outsamgz = "${prefix}_pairedMapped.sam.gz"
File outcovstats = "${prefix}_covstats.txt"
File outasmstats = "stats.json"
File objects = "objects.json"
}

Expand Down Expand Up @@ -208,28 +243,28 @@ task make_output{
String asmstats_name=basename(contigs)
String container

command{
if [ ! -z ${outdir} ]; then
mkdir -p ${outdir}
cp ${contigs} ${scaffolds} ${agp} ${bam} \
${samgz} ${covstats} ${asmstats} ${outdir}
chmod 764 -R ${outdir}
fi
}
runtime {
command{
if [ ! -z ${outdir} ]; then
mkdir -p ${outdir}
cp ${contigs} ${scaffolds} ${agp} ${bam} \
${samgz} ${covstats} ${asmstats} ${outdir}
chmod 764 -R ${outdir}
fi
}
runtime {
docker: container
memory: "1 GiB"
cpu: 1
}
output{
File? outcontigs = "${outdir}/${contigs_name}"
File? outscaffolds = "${outdir}/${scaffolds_name}"
File? outagp = "${outdir}/${agp_name}"
File? outbam = "${outdir}/${bam_name}"
File? outsamgz = "${outdir}/${samgz_name}"
File? outcovstats = "${outdir}/${covstats_name}"
File? outasmstats = "${outdir}/${asmstats_name}"
}
memory: "1 GiB"
cpu: 1
}
output{
File? outcontigs = "${outdir}/${contigs_name}"
File? outscaffolds = "${outdir}/${scaffolds_name}"
File? outagp = "${outdir}/${agp_name}"
File? outbam = "${outdir}/${bam_name}"
File? outsamgz = "${outdir}/${samgz_name}"
File? outcovstats = "${outdir}/${covstats_name}"
File? outasmstats = "${outdir}/${asmstats_name}"
}
}
task read_mapping_pairs{
Expand All @@ -252,8 +287,8 @@ task read_mapping_pairs{
runtime {
docker: container
memory: "120 GiB"
cpu: 16
maxRetries: 1
cpu: 16
maxRetries: 1
}
command{
set -eo pipefail
Expand All @@ -269,7 +304,7 @@ task read_mapping_pairs{
samtools sort -m100M -@ ${jvm_threads} ${filename_unsorted} -o ${filename_sorted}
samtools index ${filename_sorted}
reformat.sh -Xmx${default="105G" memory} in=${filename_unsorted} out=${filename_outsam} overwrite=true
ln ${filename_cov} mapping_stats.txt
ln ${filename_cov} mapping_stats.txt
rm $mapping_input
}
output{
Expand All @@ -293,7 +328,7 @@ task create_agp {
runtime {
docker: container
memory: "120 GiB"
cpu: 16
cpu: 16
}
command{
fungalrelease.sh -Xmx${default="105G" memory} in=${scaffolds_in} out=${filename_scaffolds} outc=${filename_contigs} agp=${filename_agp} legend=${filename_legend} mincontig=200 minscaf=200 sortscaffolds=t sortcontigs=t overwrite=t
Expand All @@ -304,10 +339,10 @@ task create_agp {
sed -i 's/l_gt50k/l_gt50K/g' stats.json
}
output{
File outcontigs = filename_contigs
File outscaffolds = filename_scaffolds
File outagp = filename_agp
File outstats = "stats.json"
File outcontigs = filename_contigs
File outscaffolds = filename_scaffolds
File outagp = filename_agp
File outstats = "stats.json"
File outlegend = filename_legend
}
}
Expand All @@ -326,7 +361,7 @@ task assy {
runtime {
docker: container
memory: "120 GiB"
cpu: 16
cpu: 16
}
command{
set -eo pipefail
Expand Down Expand Up @@ -359,7 +394,7 @@ task bbcms {
runtime {
docker: container
memory: "120 GiB"
cpu: 16
cpu: 16
}

command {
Expand All @@ -368,7 +403,7 @@ task bbcms {
cat ${sep=" " input_files} > infile.fastq.gz
export bbcms_input="infile.fastq.gz"
fi
if file --mime -b ${input_files[0]} | grep plain; then
if file --mime -b ${input_files[0]} | grep plain; then
cat ${sep=" " input_files} > infile.fastq
export bbcms_input="infile.fastq"
fi
Expand All @@ -378,6 +413,7 @@ task bbcms {
fi
readlength.sh -Xmx${default="105G" memory} in=${filename_outfile} out=${filename_readlen}
rm $bbcms_input

}
output {
File out = filename_outfile
Expand All @@ -388,6 +424,7 @@ task bbcms {
File stderr = filename_errlog
File outcounts = filename_counts
File outkmer = filename_kmerfile

}
}

0 comments on commit d1fb88e

Please sign in to comment.