update lyveSET FASTQ parsing (#177)

* add sample names to lyveset inputs * i think i fixed it --------- Co-authored-by: Michelle Scribner <[email protected]>
theiagen · Aug 30, 2023 · 87f1695 · 87f1695
1 parent f002dbd
commit 87f1695
Show file tree

Hide file tree

Showing 2 changed files with 22 additions and 29 deletions.
diff --git a/tasks/phylogenetic_inference/task_lyveset.wdl b/tasks/phylogenetic_inference/task_lyveset.wdl
@@ -4,6 +4,7 @@ task lyveset {
   input {
     Array[File] read1
     Array[File] read2
+    Array[String] samplename
     File reference_genome
     String dataset_name
     String docker_image = "us-docker.pkg.dev/general-theiagen/staphb/lyveset:1.1.4f"
@@ -67,17 +68,24 @@ task lyveset {
   command <<<
     date | tee DATE
 
-    # set bash arrays based on inputs to ensure read arrays are of equal length
+    # set bash arrays based on inputs to ensure read and sample arrays are of equal length
     # Copy the WDL input arrays into bash arrays and record how many elements each holds.
     # The lengths must be read from the *_array variables: read1, read2 and samplename
     # exist only as WDL inputs, never as bash variables, so "${#read1[@]}" is always 0
     # and the equality checks below could never fail.
     read1_array=(~{sep=' ' read1})
     read1_array_len="${#read1_array[@]}"
     read2_array=(~{sep=' ' read2})
     read2_array_len="${#read2_array[@]}"
+    samplename_array=(~{sep=' ' samplename})
+    samplename_array_len="${#samplename_array[@]}"
 
     # every read1 file needs a matching read2 file
     if [ "$read1_array_len" -ne "$read2_array_len" ]; then
       echo "read1 array (length: $read1_array_len) and read2 index array (length: $read2_array_len) are of unequal length." >&2
       exit 1
     fi
 
+    # every read pair needs exactly one sample name (arrays are matched by index later)
+    if [ "$read1_array_len" -ne "$samplename_array_len" ]; then
+      echo "read1 array (length: $read1_array_len) and samplename index array (length: $samplename_array_len) are of unequal length." >&2
+      exit 1
+    fi
+
     # create lyveset project
     set_manage.pl --create ~{dataset_name}
 
@@ -91,36 +99,19 @@ task lyveset {
 
     mkdir -v input-fastqs
 
-    # Firstly, rename read1 and read2 so that underscores are replaced with dashes except any underscores surrounding R1 or R2
+    # Firstly, rename samplename so that underscores are replaced with dashes 
+    # Then, rename read files with samplenames followed by "_[1,2].fastq.gz"
     # Also, place files within input-fastqs/ directory
-    echo "DEBUG: FASTQ file renaming. Replacing underscores with dashes, except underscores surrounding R1 or R2"
-    for FASTQ in "${!read1_array[@]}"; do 
-      FASTQ_BASENAME=$(basename "${read1_array[$FASTQ]}")
-      # sed line replaces underscores with dashes, except surrounding R1 or R2
-      mv -v ${read1_array[$FASTQ]} input-fastqs/$(echo "${FASTQ_BASENAME}" | sed -E 's/([^R])_+/\1-/g; s/-+(R1|R2)/_\1/g; s/(R1|R2)-+/\1_/g')
-    done
-    # do the same for read2
-    for FASTQ in "${!read2_array[@]}"; do 
-      FASTQ_BASENAME=$(basename "${read2_array[$FASTQ]}")
-      # sed line replaces underscores with dashes, except surrounding R1 or R2
-      mv -v ${read2_array[$FASTQ]} input-fastqs/$(echo "${FASTQ_BASENAME}" | sed -E 's/([^R])_+/\1-/g; s/-+(R1|R2)/_\1/g; s/(R1|R2)-+/\1_/g')
-    done
 
-    ### renaming FASTQs ending with _R1.fastq.gz or _R2.fastq.gz (i.e. those downloaded w/ SRA_Fetch or Basespace_Fetch wfs) ###
-    # read1
-    for FASTQ in input-fastqs/*; do
-      FASTQ_BASENAME=$(basename ${FASTQ})
-      # if the R1 FASTQ filenames end in "_R1.fastq.gz"  rename the files to match lyveset naming convention
-      if [[ ${FASTQ} =~ _R1.fastq.gz$ ]]; then
-        echo "DEBUG: renaming ${FASTQ_BASENAME} to ${FASTQ_BASENAME//_R1.fastq.gz/_1.fastq.gz}"
-        mv -v "${FASTQ}" "input-fastqs/${FASTQ_BASENAME//_R1.fastq.gz/_1.fastq.gz}"
-      # if the R2 FASTQ filenames end in "_R2.fastq.gz"  rename the files to match lyveset naming convention
-      elif [[ ${FASTQ} =~ _R2.fastq.gz$ ]]; then
-        echo "DEBUG: renaming ${FASTQ_BASENAME} to ${FASTQ_BASENAME//_R2.fastq.gz/_2.fastq.gz}"
-        mv -v "${FASTQ}" "input-fastqs/${FASTQ_BASENAME//_R2.fastq.gz/_2.fastq.gz}"
-      else
-        echo "DEBUG: did not detect any FASTQ files ending in _R1.fastq.gz or _R2.fastq.gz"
-      fi
+    echo "DEBUG: FASTQ file renaming. Renaming FASTQs to match lyveset naming convention..."
+    # For every entry in the samplename array, move the read1 and read2 files found at the
+    # same index into input-fastqs/ as <samplename>_1.fastq.gz and <samplename>_2.fastq.gz.
+    # NOTE: despite its name, SAMPLENAME holds the array index, not the sample name itself.
+    for SAMPLENAME in "${!samplename_array[@]}"; do
+      # replace every underscore in the sample name with a dash
+      SAMPLENAME_NO_UNDERSCORES="${samplename_array[$SAMPLENAME]//_/-}"
+      echo "DEBUG: SAMPLENAME_NO_UNDERSCORES= ${SAMPLENAME_NO_UNDERSCORES}"
+      # two sample names that differ only by "_" vs "-" collapse to the same filename;
+      # fail loudly instead of letting mv silently overwrite the earlier sample's reads
+      if [ -e "input-fastqs/${SAMPLENAME_NO_UNDERSCORES}_1.fastq.gz" ] || [ -e "input-fastqs/${SAMPLENAME_NO_UNDERSCORES}_2.fastq.gz" ]; then
+        echo "samplename ${samplename_array[$SAMPLENAME]} maps to ${SAMPLENAME_NO_UNDERSCORES}, which is already used by another sample." >&2
+        exit 1
+      fi
+      mv -v "${read1_array[$SAMPLENAME]}" "input-fastqs/${SAMPLENAME_NO_UNDERSCORES}_1.fastq.gz"
+      echo
+      mv -v "${read2_array[$SAMPLENAME]}" "input-fastqs/${SAMPLENAME_NO_UNDERSCORES}_2.fastq.gz"
+    done
 
     echo "DEBUG: here's the final FASTQ filenames, prior to shuffling:"

diff --git a/workflows/phylogenetics/wf_lyveset.wdl b/workflows/phylogenetics/wf_lyveset.wdl
@@ -7,13 +7,15 @@ workflow lyveset_workflow {
   input {
     Array[File] read1
     Array[File] read2
+    Array[String] samplename
     String dataset_name
     File reference_genome
   }
   call lyveset.lyveset {
     input:
       read1 = read1,
       read2 = read2,
+      samplename = samplename,
       dataset_name = dataset_name,
       reference_genome = reference_genome
   }