Skip to content

Commit

Permalink
lyveset fastq file parsing bugfix and other improvements (#156)
Browse files Browse the repository at this point in the history
* correct 3 optional input params for lyveset

* lyveSET task: added ability to handle FASTQ files ending with _R1.fastq.gz. Other small improvements. tested fine in miniwdl, need to test in Terra

* more concise debug statement for renaming files

* corrected paths for 2 optional outputs

* overhaul of lyveset FASTQ file ingestion; increased default disk size to 250GB; rename FASTQ files to replace underscores w dashes except for those surrounding R1/R2; rename FASTQs downloaded via SRA_Fetch and basespace_fetch wfs
  • Loading branch information
kapsakcj authored Aug 30, 2023
1 parent 19d1de4 commit f002dbd
Showing 1 changed file with 59 additions and 19 deletions.
78 changes: 59 additions & 19 deletions tasks/phylogenetic_inference/task_lyveset.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ task lyveset {
String docker_image = "us-docker.pkg.dev/general-theiagen/staphb/lyveset:1.1.4f"
Int memory = 64
Int cpu = 16
Int disk_size = 100
Int disk_size = 250
# Lyve-SET Parameters
##COMMON OPTIONS
##--allowedFlanking 0 allowed flanking distance in bp.
Expand Down Expand Up @@ -60,7 +60,7 @@ task lyveset {
Boolean fast = false
Boolean downsample = false
Boolean sample_sites = false
String? read_cleaner = 'CGP'
String read_cleaner = "CGP"
String? mapper
String? snpcaller
}
Expand All @@ -73,41 +73,81 @@ task lyveset {
read2_array=(~{sep=' ' read2})
read2_array_len=$(echo "${#read2[@]}")

if [ "$read1_array_len" -ne "$read2_index_array_len" ]; then
if [ "$read1_array_len" -ne "$read2_array_len" ]; then
echo "read1 array (length: $read1_array_len) and read2 index array (length: $read2_array_len) are of unequal length." >&2
exit 1
fi

# create lyvset project
set_manage.pl --create ~{dataset_name}

#shuffle paired end reads
for index in ${!read1_array[@]}; do
cp ${read1_array[$index]} . && cp ${read2_array[$index]} . # move reads to cwd
### PREFACE ###
# This FASTQ file re-naming strategy is necessary due to filename parsing in shuffleSplitReads.pl here: https://github.com/lskatz/lyve-SET/blob/v1.1.4f/scripts/shuffleSplitReads.pl#L34
# Curtis' interpretation of perl code:
# It first checks for a pattern like '_R1_' or '_R2_', and if found, sets the $readNumber variable to 1 or 2 respectively.
# If that pattern is not found, it checks for a pattern like '_1.f' or '_2.f', again setting the $readNumber accordingly.
# If neither pattern is matched, it raises an error indicating that the read number could not be parsed from the filename.
### END PREFACE ###

mkdir -v input-fastqs

# Firstly, rename read1 and read2 so that underscores are replaced with dashes except any underscores surrounding R1 or R2
# Also, place files within input-fastqs/ directory
echo "DEBUG: FASTQ file renaming. Replacing underscores with dashes, except underscores surrounding R1 or R2"
for FASTQ in "${!read1_array[@]}"; do
FASTQ_BASENAME=$(basename "${read1_array[$FASTQ]}")
# sed line replaces underscores with dashes, except surrounding R1 or R2
mv -v ${read1_array[$FASTQ]} input-fastqs/$(echo "${FASTQ_BASENAME}" | sed -E 's/([^R])_+/\1-/g; s/-+(R1|R2)/_\1/g; s/(R1|R2)-+/\1_/g')
done
# do the same for read2
for FASTQ in "${!read2_array[@]}"; do
FASTQ_BASENAME=$(basename "${read2_array[$FASTQ]}")
# sed line replaces underscores with dashes, except surrounding R1 or R2
mv -v ${read2_array[$FASTQ]} input-fastqs/$(echo "${FASTQ_BASENAME}" | sed -E 's/([^R])_+/\1-/g; s/-+(R1|R2)/_\1/g; s/(R1|R2)-+/\1_/g')
done

shuffleSplitReads.pl --numcpus ~{cpu} -o ./interleaved *.fastq.gz
### renaming FASTQs ending with _R1.fastq.gz or _R2.fastq.gz (i.e. those downloaded w/ SRA_Fetch or Basespace_Fetch wfs) ###
# read1
for FASTQ in input-fastqs/*; do
FASTQ_BASENAME=$(basename ${FASTQ})
# if the R1 FASTQ filenames end in "_R1.fastq.gz" rename the files to match lyveset naming convention
if [[ ${FASTQ} =~ _R1.fastq.gz$ ]]; then
echo "DEBUG: renaming ${FASTQ_BASENAME} to ${FASTQ_BASENAME//_R1.fastq.gz/_1.fastq.gz}"
mv -v "${FASTQ}" "input-fastqs/${FASTQ_BASENAME//_R1.fastq.gz/_1.fastq.gz}"
# if the R2 FASTQ filenames end in "_R2.fastq.gz" rename the files to match lyveset naming convention
elif [[ ${FASTQ} =~ _R2.fastq.gz$ ]]; then
echo "DEBUG: renaming ${FASTQ_BASENAME} to ${FASTQ_BASENAME//_R2.fastq.gz/_2.fastq.gz}"
mv -v "${FASTQ}" "input-fastqs/${FASTQ_BASENAME//_R2.fastq.gz/_2.fastq.gz}"
else
echo "DEBUG: did not detect any FASTQ files ending in _R1.fastq.gz or _R2.fastq.gz"
fi
done

echo "DEBUG: here's the final FASTQ filenames, prior to shuffling:"
ls -lh input-fastqs/
echo

# then moved into your project dir
mv ./interleaved/*.fastq.gz ~{dataset_name}/reads/
echo "DEBUG: merging R1 and R2 FASTQ files into interleaved FASTQ files with shuffleSplitReads.pl now..."
shuffleSplitReads.pl --numcpus ~{cpu} -o "./~{dataset_name}/reads" input-fastqs/*.fastq.gz

# cleanup
rmdir interleaved
mkdir ~{dataset_name}/ref/
cp ~{reference_genome} ~{dataset_name}/ref/reference.fasta
# make directory for reference genome and copy reference genome into it. Also rename to reference.fasta
mkdir -v ~{dataset_name}/ref/
cp -v ~{reference_genome} ~{dataset_name}/ref/reference.fasta

# launch lyveSET workflow now that everything is set up
launch_set.pl --numcpus ~{cpu} \
--allowedFlanking ~{allowedFlanking} \
--min_alt_frac ~{min_alt_frac} \
--min_coverage ~{min_coverage} \
~{'--presets ' + presets} \
~{true='--mask_phages' false='' mask_phages} \
~{true='--mask_cliffs' false='' mask_cliffs} \
~{true='--mask-phages' false='' mask_phages} \
~{true='--mask-cliffs' false='' mask_cliffs} \
~{true='--nomatrix' false='' nomatrix} \
~{true='--nomsa' false='' nomsa} \
~{true='--notrees' false='' notrees} \
~{true='--fast' false='' fast} \
~{true='--downsample' false='' downsample} \
~{true='--sample_sites' false='' sample_sites} \
~{true='--sample-sites' false='' sample_sites} \
~{'--read_cleaner ' + read_cleaner} \
~{'--mapper ' + mapper} \
~{'--snpcaller ' + snpcaller} \
Expand All @@ -126,11 +166,11 @@ task lyveset {
File? lyveset_pooled_snps_vcf = "~{dataset_name}/msa/out.pooled.snps.vcf.gz"
File? lyveset_filtered_matrix = "~{dataset_name}/msa/out.filteredMatrix.tsv"
File? lyveset_alignment_fasta = "~{dataset_name}/msa/out.aln.fas"
File? lyveset_reference_fasta = "~{dataset_name}/reference/reference.fasta"
File? lyveset_masked_regions = "~{dataset_name}/reference/maskedRegions.bed"
File? lyveset_reference_fasta = "~{dataset_name}/ref/reference.fasta"
File? lyveset_masked_regions = "~{dataset_name}/ref/maskedRegions.bed"
Array[File]? lyveset_msa_outputs = glob("~{dataset_name}/msa/out*")
Array[File]? lyveset_log_outputs = glob("~{dataset_name}/log/*")
Array[File]? lyveset_reference_outputs = glob("~{dataset_name}/reference/*")
Array[File]? lyveset_reference_outputs = glob("~{dataset_name}/ref/*")
Array[File]? lyveset_bam_outputs = glob("~{dataset_name}/bam/*.bam*")
Array[File]? lyveset_vcf_outputs = glob("~{dataset_name}/vcf/*.vcf*")
File lyveset_log = stdout()
Expand Down

0 comments on commit f002dbd

Please sign in to comment.