Skip to content

Commit

Permalink
update lyveSET FASTQ parsing (#177)
Browse files Browse the repository at this point in the history
* add sample names to lyveset inputs

* i think i fixed it

---------

Co-authored-by: Michelle Scribner <[email protected]>
  • Loading branch information
kapsakcj and michellescribner authored Aug 30, 2023
1 parent f002dbd commit 87f1695
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 29 deletions.
49 changes: 20 additions & 29 deletions tasks/phylogenetic_inference/task_lyveset.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ task lyveset {
input {
Array[File] read1
Array[File] read2
Array[String] samplename
File reference_genome
String dataset_name
String docker_image = "us-docker.pkg.dev/general-theiagen/staphb/lyveset:1.1.4f"
Expand Down Expand Up @@ -67,17 +68,24 @@ task lyveset {
command <<<
date | tee DATE

# set bash arrays based on inputs to ensure read arrays are of equal length
# set bash arrays based on inputs to ensure read and sample arrays are of equal length
# Build bash arrays from the task inputs and record how many elements each holds.
# The length has to be read from the *_array variables: the bare names (read1, read2,
# samplename) are never set as bash variables, so "${#read1[@]}" and friends always
# evaluate to 0 and the mismatch checks below could never trigger.
read1_array=(~{sep=' ' read1})
read1_array_len=${#read1_array[@]}
read2_array=(~{sep=' ' read2})
read2_array_len=${#read2_array[@]}
samplename_array=(~{sep=' ' samplename})
samplename_array_len=${#samplename_array[@]}

# every sample needs both a read1 and a read2 file
if [[ "$read1_array_len" -ne "$read2_array_len" ]]; then
  echo "read1 array (length: $read1_array_len) and read2 index array (length: $read2_array_len) are of unequal length." >&2
  exit 1
fi

# reads are matched to sample names by array index, so the counts must agree
if [[ "$read1_array_len" -ne "$samplename_array_len" ]]; then
  echo "read1 array (length: $read1_array_len) and samplename index array (length: $samplename_array_len) are of unequal length." >&2
  exit 1
fi

# create lyveset project
set_manage.pl --create ~{dataset_name}

Expand All @@ -91,36 +99,19 @@ task lyveset {

mkdir -v input-fastqs

# Firstly, rename read1 and read2 so that underscores are replaced with dashes except any underscores surrounding R1 or R2
# Firstly, rename samplename so that underscores are replaced with dashes
# Then, rename read files with samplenames followed by "_[1,2].fastq.gz"
# Also, place files within input-fastqs/ directory
echo "DEBUG: FASTQ file renaming. Replacing underscores with dashes, except underscores surrounding R1 or R2"
for FASTQ in "${!read1_array[@]}"; do
FASTQ_BASENAME=$(basename "${read1_array[$FASTQ]}")
# sed line replaces underscores with dashes, except surrounding R1 or R2
mv -v ${read1_array[$FASTQ]} input-fastqs/$(echo "${FASTQ_BASENAME}" | sed -E 's/([^R])_+/\1-/g; s/-+(R1|R2)/_\1/g; s/(R1|R2)-+/\1_/g')
done
# do the same for read2
for FASTQ in "${!read2_array[@]}"; do
FASTQ_BASENAME=$(basename "${read2_array[$FASTQ]}")
# sed line replaces underscores with dashes, except surrounding R1 or R2
mv -v ${read2_array[$FASTQ]} input-fastqs/$(echo "${FASTQ_BASENAME}" | sed -E 's/([^R])_+/\1-/g; s/-+(R1|R2)/_\1/g; s/(R1|R2)-+/\1_/g')
done

### renaming FASTQs ending with _R1.fastq.gz or _R2.fastq.gz (i.e. those downloaded w/ SRA_Fetch or Basespace_Fetch wfs) ###
# read1
for FASTQ in input-fastqs/*; do
FASTQ_BASENAME=$(basename ${FASTQ})
# if the R1 FASTQ filenames end in "_R1.fastq.gz" rename the files to match lyveset naming convention
if [[ ${FASTQ} =~ _R1.fastq.gz$ ]]; then
echo "DEBUG: renaming ${FASTQ_BASENAME} to ${FASTQ_BASENAME//_R1.fastq.gz/_1.fastq.gz}"
mv -v "${FASTQ}" "input-fastqs/${FASTQ_BASENAME//_R1.fastq.gz/_1.fastq.gz}"
# if the R2 FASTQ filenames end in "_R2.fastq.gz" rename the files to match lyveset naming convention
elif [[ ${FASTQ} =~ _R2.fastq.gz$ ]]; then
echo "DEBUG: renaming ${FASTQ_BASENAME} to ${FASTQ_BASENAME//_R2.fastq.gz/_2.fastq.gz}"
mv -v "${FASTQ}" "input-fastqs/${FASTQ_BASENAME//_R2.fastq.gz/_2.fastq.gz}"
else
echo "DEBUG: did not detect any FASTQ files ending in _R1.fastq.gz or _R2.fastq.gz"
fi
echo "DEBUG: FASTQ file renaming. Renaming FASTQs to match lyveset naming convention..."
# For every entry in samplename_array, move the read1 and read2 files found at the same
# array index into input-fastqs/, renamed to <samplename>_1.fastq.gz and <samplename>_2.fastq.gz.
# NOTE: SAMPLENAME holds the array index, not the sample name itself.
for SAMPLENAME in "${!samplename_array[@]}"; do
  # replace every underscore in the sample name with a dash (plain parameter expansion, no sed needed)
  SAMPLENAME_NO_UNDERSCORES="${samplename_array[$SAMPLENAME]//_/-}"
  echo "DEBUG: SAMPLENAME_NO_UNDERSCORES= ${SAMPLENAME_NO_UNDERSCORES}"
  # operands are quoted so paths are not glob-expanded or re-split; "--" guards against a leading dash
  mv -v -- "${read1_array[$SAMPLENAME]}" "input-fastqs/${SAMPLENAME_NO_UNDERSCORES}_1.fastq.gz"
  echo
  mv -v -- "${read2_array[$SAMPLENAME]}" "input-fastqs/${SAMPLENAME_NO_UNDERSCORES}_2.fastq.gz"
done

echo "DEBUG: here's the final FASTQ filenames, prior to shuffling:"
Expand Down
2 changes: 2 additions & 0 deletions workflows/phylogenetics/wf_lyveset.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@ workflow lyveset_workflow {
input {
Array[File] read1
Array[File] read2
Array[String] samplename
String dataset_name
File reference_genome
}
call lyveset.lyveset {
input:
read1 = read1,
read2 = read2,
samplename = samplename,
dataset_name = dataset_name,
reference_genome = reference_genome
}
Expand Down

0 comments on commit 87f1695

Please sign in to comment.