diff --git a/tasks/phylogenetic_inference/task_lyveset.wdl b/tasks/phylogenetic_inference/task_lyveset.wdl
index 5df23c1a4..0b30d8768 100644
--- a/tasks/phylogenetic_inference/task_lyveset.wdl
+++ b/tasks/phylogenetic_inference/task_lyveset.wdl
@@ -9,7 +9,7 @@ task lyveset {
     String docker_image = "us-docker.pkg.dev/general-theiagen/staphb/lyveset:1.1.4f"
     Int memory = 64
     Int cpu = 16
-    Int disk_size = 100
+    Int disk_size = 250
     # Lyve-SET Parameters
     ##COMMON OPTIONS
     ##--allowedFlanking 0 allowed flanking distance in bp.
@@ -60,7 +60,7 @@ task lyveset {
     Boolean fast = false
     Boolean downsample = false
     Boolean sample_sites = false
-    String? read_cleaner = 'CGP'
+    String read_cleaner = "CGP"
     String? mapper
     String? snpcaller
   }
@@ -73,7 +73,7 @@ task lyveset {
     read2_array=(~{sep=' ' read2})
     read2_array_len=$(echo "${#read2[@]}")
 
-    if [ "$read1_array_len" -ne "$read2_index_array_len" ]; then
+    if [ "$read1_array_len" -ne "$read2_array_len" ]; then
       echo "read1 array (length: $read1_array_len) and read2 index array (length: $read2_array_len) are of unequal length." >&2
       exit 1
     fi
@@ -81,33 +81,73 @@ task lyveset {
     # create lyvset project
     set_manage.pl --create ~{dataset_name}
 
-    #shuffle paired end reads
-    for index in ${!read1_array[@]}; do
-      cp ${read1_array[$index]} . && cp ${read2_array[$index]} . # move reads to cwd
+    ### PREFACE ###
+    # This FASTQ file re-naming strategy is necessary due to filename parsing in shuffleSplitReads.pl here: https://github.com/lskatz/lyve-SET/blob/v1.1.4f/scripts/shuffleSplitReads.pl#L34
+    # Curtis' interpretation of perl code:
+    # It first checks for a pattern like '_R1_' or '_R2_', and if found, sets the $readNumber variable to 1 or 2 respectively.
+    # If that pattern is not found, it checks for a pattern like '_1.f' or '_2.f', again setting the $readNumber accordingly.
+    # If neither pattern is matched, it raises an error indicating that the read number could not be parsed from the filename.
+    ### END PREFACE ###
+
+    mkdir -v input-fastqs
+
+    # Firstly, rename read1 and read2 so that underscores are replaced with dashes except any underscores surrounding R1 or R2
+    # Also, place files within input-fastqs/ directory
+    echo "DEBUG: FASTQ file renaming. Replacing underscores with dashes, except underscores surrounding R1 or R2"
+    for FASTQ in "${!read1_array[@]}"; do
+      FASTQ_BASENAME=$(basename "${read1_array[$FASTQ]}")
+      # sed line replaces underscores with dashes, except surrounding R1 or R2
+      mv -v ${read1_array[$FASTQ]} input-fastqs/$(echo "${FASTQ_BASENAME}" | sed -E 's/([^R])_+/\1-/g; s/-+(R1|R2)/_\1/g; s/(R1|R2)-+/\1_/g')
+    done
+    # do the same for read2
+    for FASTQ in "${!read2_array[@]}"; do
+      FASTQ_BASENAME=$(basename "${read2_array[$FASTQ]}")
+      # sed line replaces underscores with dashes, except surrounding R1 or R2
+      mv -v ${read2_array[$FASTQ]} input-fastqs/$(echo "${FASTQ_BASENAME}" | sed -E 's/([^R])_+/\1-/g; s/-+(R1|R2)/_\1/g; s/(R1|R2)-+/\1_/g')
     done
 
-    shuffleSplitReads.pl --numcpus ~{cpu} -o ./interleaved *.fastq.gz
+    ### renaming FASTQs ending with _R1.fastq.gz or _R2.fastq.gz (i.e. those downloaded w/ SRA_Fetch or Basespace_Fetch wfs) ###
+    # read1
+    for FASTQ in input-fastqs/*; do
+      FASTQ_BASENAME=$(basename ${FASTQ})
+      # if the R1 FASTQ filenames end in "_R1.fastq.gz" rename the files to match lyveset naming convention
+      if [[ ${FASTQ} =~ _R1.fastq.gz$ ]]; then
+        echo "DEBUG: renaming ${FASTQ_BASENAME} to ${FASTQ_BASENAME//_R1.fastq.gz/_1.fastq.gz}"
+        mv -v "${FASTQ}" "input-fastqs/${FASTQ_BASENAME//_R1.fastq.gz/_1.fastq.gz}"
+      # if the R2 FASTQ filenames end in "_R2.fastq.gz" rename the files to match lyveset naming convention
+      elif [[ ${FASTQ} =~ _R2.fastq.gz$ ]]; then
+        echo "DEBUG: renaming ${FASTQ_BASENAME} to ${FASTQ_BASENAME//_R2.fastq.gz/_2.fastq.gz}"
+        mv -v "${FASTQ}" "input-fastqs/${FASTQ_BASENAME//_R2.fastq.gz/_2.fastq.gz}"
+      else
+        echo "DEBUG: did not detect any FASTQ files ending in _R1.fastq.gz or _R2.fastq.gz"
+      fi
+    done
+
+    echo "DEBUG: here's the final FASTQ filenames, prior to shuffling:"
+    ls -lh input-fastqs/
+    echo
 
-    # then moved into your project dir
-    mv ./interleaved/*.fastq.gz ~{dataset_name}/reads/
+    echo "DEBUG: merging R1 and R2 FASTQ files into interleaved FASTQ files with shuffleSplitReads.pl now..."
+    shuffleSplitReads.pl --numcpus ~{cpu} -o "./~{dataset_name}/reads" input-fastqs/*.fastq.gz
 
-    # cleanup
-    rmdir interleaved
-    mkdir ~{dataset_name}/ref/
-    cp ~{reference_genome} ~{dataset_name}/ref/reference.fasta
+    # make directory for reference genome and copy reference genome into it. Also rename to reference.fasta
+    mkdir -v ~{dataset_name}/ref/
+    cp -v ~{reference_genome} ~{dataset_name}/ref/reference.fasta
+
+    # launch lyveSET workflow now that everything is set up
     launch_set.pl --numcpus ~{cpu} \
       --allowedFlanking ~{allowedFlanking} \
       --min_alt_frac ~{min_alt_frac} \
      --min_coverage ~{min_coverage} \
       ~{'--presets ' + presets} \
-      ~{true='--mask_phages' false='' mask_phages} \
-      ~{true='--mask_cliffs' false='' mask_cliffs} \
+      ~{true='--mask-phages' false='' mask_phages} \
+      ~{true='--mask-cliffs' false='' mask_cliffs} \
       ~{true='--nomatrix' false='' nomatrix} \
       ~{true='--nomsa' false='' nomsa} \
       ~{true='--notrees' false='' notrees} \
       ~{true='--fast' false='' fast} \
       ~{true='--downsample' false='' downsample} \
-      ~{true='--sample_sites' false='' sample_sites} \
+      ~{true='--sample-sites' false='' sample_sites} \
       ~{'--read_cleaner ' + read_cleaner} \
       ~{'--mapper ' + mapper} \
       ~{'--snpcaller ' + snpcaller} \
@@ -126,11 +166,11 @@ task lyveset {
     File? lyveset_pooled_snps_vcf = "~{dataset_name}/msa/out.pooled.snps.vcf.gz"
     File? lyveset_filtered_matrix = "~{dataset_name}/msa/out.filteredMatrix.tsv"
     File? lyveset_alignment_fasta = "~{dataset_name}/msa/out.aln.fas"
-    File? lyveset_reference_fasta = "~{dataset_name}/reference/reference.fasta"
-    File? lyveset_masked_regions = "~{dataset_name}/reference/maskedRegions.bed"
+    File? lyveset_reference_fasta = "~{dataset_name}/ref/reference.fasta"
+    File? lyveset_masked_regions = "~{dataset_name}/ref/maskedRegions.bed"
     Array[File]? lyveset_msa_outputs = glob("~{dataset_name}/msa/out*")
     Array[File]? lyveset_log_outputs = glob("~{dataset_name}/log/*")
-    Array[File]? lyveset_reference_outputs = glob("~{dataset_name}/reference/*")
+    Array[File]? lyveset_reference_outputs = glob("~{dataset_name}/ref/*")
     Array[File]? lyveset_bam_outputs = glob("~{dataset_name}/bam/*.bam*")
     Array[File]? lyveset_vcf_outputs = glob("~{dataset_name}/vcf/*.vcf*")
     File lyveset_log = stdout()
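Reviewer note: the two renaming passes added in this patch are easiest to sanity-check on concrete filenames. The snippet below is a minimal, standalone bash sketch that applies the same sed expression and the same bash suffix substitution used in the task; the sample filenames are hypothetical and only illustrate the transformation into the patterns shuffleSplitReads.pl can parse (_R1_/_R2_, or the _1.f/_2.f fallback).

  # hypothetical example filenames; not taken from the workflow
  for NAME in sample_01_R1_001.fastq.gz sample_A_R1.fastq.gz sample_A_R2.fastq.gz; do
    # pass 1: underscores -> dashes, except those surrounding R1/R2 (same sed as the task)
    RENAMED=$(echo "${NAME}" | sed -E 's/([^R])_+/\1-/g; s/-+(R1|R2)/_\1/g; s/(R1|R2)-+/\1_/g')
    # pass 2: _R1.fastq.gz / _R2.fastq.gz -> _1.fastq.gz / _2.fastq.gz (same substitution as the task)
    RENAMED=${RENAMED//_R1.fastq.gz/_1.fastq.gz}
    RENAMED=${RENAMED//_R2.fastq.gz/_2.fastq.gz}
    echo "${NAME} -> ${RENAMED}"
  done

Expected output:

  sample_01_R1_001.fastq.gz -> sample-01_R1_001.fastq.gz
  sample_A_R1.fastq.gz -> sample-A_1.fastq.gz
  sample_A_R2.fastq.gz -> sample-A_2.fastq.gz

The first result still matches the _R1_ pattern, and the other two match the _1.f/_2.f fallback, so all three parse to a read number without shuffleSplitReads.pl having to interpret the dashed portions of the sample name.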