lyveset fastq file parsing bugfix and other improvements (#156)

* correct 3 optional input params for lyveset * lyveSET task: added ability to handle FASTQ files ending with _R1.fastq.gz. Other small improvements. tested fine in miniwdl, need to test in Terra * more concise debug statement for renaming files * corrected paths for 2 optional outputs * overhaul of lyveset FASTQ file ingestion; increased default disk size to 250GB; rename FASTQ files to replace underscores w dashes except for those surrounding R1/R2; rename FASTQs downloaded via SRA_Fetch and basespace_fetch wfs
theiagen · Aug 30, 2023 · f002dbd · f002dbd
1 parent 19d1de4
commit f002dbd
Showing 1 changed file with 59 additions and 19 deletions.
diff --git a/tasks/phylogenetic_inference/task_lyveset.wdl b/tasks/phylogenetic_inference/task_lyveset.wdl
@@ -9,7 +9,7 @@ task lyveset {
     String docker_image = "us-docker.pkg.dev/general-theiagen/staphb/lyveset:1.1.4f"
     Int memory = 64
     Int cpu = 16
-    Int disk_size = 100
+    Int disk_size = 250
     # Lyve-SET Parameters
     ##COMMON OPTIONS
     ##--allowedFlanking  0              allowed flanking distance in bp.
@@ -60,7 +60,7 @@ task lyveset {
     Boolean fast = false
     Boolean downsample = false
     Boolean sample_sites = false
-    String? read_cleaner = 'CGP'
+    String read_cleaner = "CGP"
     String? mapper
     String? snpcaller
   }
@@ -73,41 +73,81 @@ task lyveset {
     read2_array=(~{sep=' ' read2})
     read2_array_len=$(echo "${#read2[@]}")
 
-    if [ "$read1_array_len" -ne "$read2_index_array_len" ]; then
+    if [ "$read1_array_len" -ne "$read2_array_len" ]; then
       echo "read1 array (length: $read1_array_len) and read2 index array (length: $read2_array_len) are of unequal length." >&2
       exit 1
     fi
 
     # create lyvset project
     set_manage.pl --create ~{dataset_name}
 
-    #shuffle paired end reads
-    for index in ${!read1_array[@]}; do
-      cp ${read1_array[$index]} . && cp ${read2_array[$index]} . # move reads to cwd
+    ### PREFACE ###
+    # This FASTQ file re-naming strategy is necessary due to filename parsing in shuffleSplitReads.pl here: https://github.com/lskatz/lyve-SET/blob/v1.1.4f/scripts/shuffleSplitReads.pl#L34
+    # Curtis' interpretation of perl code:
+    # It first checks for a pattern like '_R1_' or '_R2_', and if found, sets the $readNumber variable to 1 or 2 respectively.
+    # If that pattern is not found, it checks for a pattern like '_1.f' or '_2.f', again setting the $readNumber accordingly.
+    # If neither pattern is matched, it raises an error indicating that the read number could not be parsed from the filename.
+    ### END PREFACE ###
+
+    mkdir -v input-fastqs
+
+    # Firstly, rename read1 and read2 so that underscores are replaced with dashes except any underscores surrounding R1 or R2
+    # Also, place files within input-fastqs/ directory
+    echo "DEBUG: FASTQ file renaming. Replacing underscores with dashes, except underscores surrounding R1 or R2"
+    for FASTQ in "${!read1_array[@]}"; do 
+      FASTQ_BASENAME=$(basename "${read1_array[$FASTQ]}")
+      # sed line replaces underscores with dashes, except surrounding R1 or R2
+      mv -v ${read1_array[$FASTQ]} input-fastqs/$(echo "${FASTQ_BASENAME}" | sed -E 's/([^R])_+/\1-/g; s/-+(R1|R2)/_\1/g; s/(R1|R2)-+/\1_/g')
+    done
+    # do the same for read2
+    for FASTQ in "${!read2_array[@]}"; do 
+      FASTQ_BASENAME=$(basename "${read2_array[$FASTQ]}")
+      # sed line replaces underscores with dashes, except surrounding R1 or R2
+      mv -v ${read2_array[$FASTQ]} input-fastqs/$(echo "${FASTQ_BASENAME}" | sed -E 's/([^R])_+/\1-/g; s/-+(R1|R2)/_\1/g; s/(R1|R2)-+/\1_/g')
     done
 
-    shuffleSplitReads.pl --numcpus ~{cpu} -o ./interleaved *.fastq.gz 
+    ### renaming FASTQs ending with _R1.fastq.gz or _R2.fastq.gz (i.e. those downloaded w/ SRA_Fetch or Basespace_Fetch wfs) ###
+    # read1
+    for FASTQ in input-fastqs/*; do
+      FASTQ_BASENAME=$(basename ${FASTQ})
+      # if the R1 FASTQ filenames end in "_R1.fastq.gz"  rename the files to match lyveset naming convention
+      if [[ ${FASTQ} =~ _R1.fastq.gz$ ]]; then
+        echo "DEBUG: renaming ${FASTQ_BASENAME} to ${FASTQ_BASENAME//_R1.fastq.gz/_1.fastq.gz}"
+        mv -v "${FASTQ}" "input-fastqs/${FASTQ_BASENAME//_R1.fastq.gz/_1.fastq.gz}"
+      # if the R2 FASTQ filenames end in "_R2.fastq.gz"  rename the files to match lyveset naming convention
+      elif [[ ${FASTQ} =~ _R2.fastq.gz$ ]]; then
+        echo "DEBUG: renaming ${FASTQ_BASENAME} to ${FASTQ_BASENAME//_R2.fastq.gz/_2.fastq.gz}"
+        mv -v "${FASTQ}" "input-fastqs/${FASTQ_BASENAME//_R2.fastq.gz/_2.fastq.gz}"
+      else
+        echo "DEBUG: did not detect any FASTQ files ending in _R1.fastq.gz or _R2.fastq.gz"
+      fi
+    done
+
+    echo "DEBUG: here's the final FASTQ filenames, prior to shuffling:"
+    ls -lh input-fastqs/
+    echo
 
-    # then moved into your project dir
-    mv ./interleaved/*.fastq.gz ~{dataset_name}/reads/
+    echo "DEBUG: merging R1 and R2 FASTQ files into interleaved FASTQ files with shuffleSplitReads.pl now..."
+    shuffleSplitReads.pl --numcpus ~{cpu} -o "./~{dataset_name}/reads" input-fastqs/*.fastq.gz
 
-    # cleanup
-    rmdir interleaved
-    mkdir ~{dataset_name}/ref/
-    cp ~{reference_genome} ~{dataset_name}/ref/reference.fasta
+    # make directory for reference genome and copy reference genome into it. Also rename to reference.fasta
+    mkdir -v ~{dataset_name}/ref/
+    cp -v ~{reference_genome} ~{dataset_name}/ref/reference.fasta
+
+    # launch lyveSET workflow now that everything is set up
     launch_set.pl --numcpus ~{cpu} \
     --allowedFlanking ~{allowedFlanking} \
     --min_alt_frac ~{min_alt_frac} \
     --min_coverage ~{min_coverage} \
     ~{'--presets ' + presets} \
-    ~{true='--mask_phages' false='' mask_phages} \
-    ~{true='--mask_cliffs' false='' mask_cliffs} \
+    ~{true='--mask-phages' false='' mask_phages} \
+    ~{true='--mask-cliffs' false='' mask_cliffs} \
     ~{true='--nomatrix' false='' nomatrix} \
     ~{true='--nomsa' false='' nomsa} \
     ~{true='--notrees' false='' notrees} \
     ~{true='--fast' false='' fast} \
     ~{true='--downsample' false='' downsample} \
-    ~{true='--sample_sites' false='' sample_sites} \
+    ~{true='--sample-sites' false='' sample_sites} \
     ~{'--read_cleaner ' + read_cleaner} \
     ~{'--mapper ' + mapper} \
     ~{'--snpcaller ' + snpcaller} \
@@ -126,11 +166,11 @@ task lyveset {
     File? lyveset_pooled_snps_vcf = "~{dataset_name}/msa/out.pooled.snps.vcf.gz"
     File? lyveset_filtered_matrix = "~{dataset_name}/msa/out.filteredMatrix.tsv"
     File? lyveset_alignment_fasta = "~{dataset_name}/msa/out.aln.fas"
-    File? lyveset_reference_fasta = "~{dataset_name}/reference/reference.fasta"
-    File? lyveset_masked_regions = "~{dataset_name}/reference/maskedRegions.bed"
+    File? lyveset_reference_fasta = "~{dataset_name}/ref/reference.fasta"
+    File? lyveset_masked_regions = "~{dataset_name}/ref/maskedRegions.bed"
     Array[File]? lyveset_msa_outputs = glob("~{dataset_name}/msa/out*")
     Array[File]? lyveset_log_outputs = glob("~{dataset_name}/log/*")
-    Array[File]? lyveset_reference_outputs = glob("~{dataset_name}/reference/*")
+    Array[File]? lyveset_reference_outputs = glob("~{dataset_name}/ref/*")
     Array[File]? lyveset_bam_outputs = glob("~{dataset_name}/bam/*.bam*")
     Array[File]? lyveset_vcf_outputs = glob("~{dataset_name}/vcf/*.vcf*")
     File lyveset_log = stdout()