feat(HiPhase): Phase TRGT VCF with HiPhase #107

Closed
wants to merge 16 commits into from
8 changes: 4 additions & 4 deletions README.md
@@ -205,8 +205,8 @@ These files will be output for each sample defined in the cohort.
| Array[File] | sample_hiphase_blocks | Phase block list written by [HiPhase](https://github.com/PacificBiosciences/HiPhase/blob/main/docs/user_guide.md#phase-block-file---blocks-file) | |
| Array[File] | sample_hiphase_haplotags | Per-read haplotag information, written by [HiPhase](https://github.com/PacificBiosciences/HiPhase/blob/main/docs/user_guide.md#haplotag-file---haplotag-file) | |
| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | merged_haplotagged_bam | Aligned (by [pbmm2](https://github.com/PacificBiosciences/pbmm2)), haplotagged (by [HiPhase](https://github.com/PacificBiosciences/HiPhase/blob/main/docs/user_guide.md#haplotagged-bam-files)) reads (with index) | |
| Array[File] | haplotagged_bam_mosdepth_summary | [mosdepth](https://github.com/brentp/mosdepth) summary of median depths per chromosome | |
| Array[File] | haplotagged_bam_mosdepth_region_bed | [mosdepth](https://github.com/brentp/mosdepth) BED of median coverage depth per 500 bp window | |
| Array[File] | mosdepth_summary | [mosdepth](https://github.com/brentp/mosdepth) summary of median depths per chromosome | |
| Array[File] | mosdepth_region_bed | mosdepth BED of median coverage depth per 500 bp window | |
| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | trgt_repeat_vcf | Tandem repeat genotypes from [TRGT](https://github.com/PacificBiosciences/trgt/blob/main/docs/vcf_files.md) (with index) | |
| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | trgt_spanning_reads | Fragments of HiFi reads spanning loci genotyped by TRGT (with index) | |
| Array[File] | trgt_dropouts | Regions with insufficient coverage to genotype by TRGT | |
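Several outputs above are typed `Array[IndexData]`. Based on how the struct is constructed elsewhere in this PR (a `data` file paired with its `data_index`), the definition in wdl-common is, roughly:

```wdl
# Sketch of the IndexData struct from wdl-common; field names are taken
# from how the struct is built in sample_analysis.wdl in this PR.
struct IndexData {
	File data        # e.g. a BAM or bgzipped VCF
	File data_index  # e.g. the matching .bai or .tbi
}
```

Keeping a file and its index in one struct lets tasks accept both with a single input.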
@@ -259,7 +259,7 @@ The Docker image used by a particular step of the workflow can be identified by
| deepvariant | User-defined; default is version [1.6.0](https://github.com/google/deepvariant/releases/tag/v1.6.0) | [DeepVariant GitHub](https://github.com/google/deepvariant) |
| glnexus | <ul><li>[glnexus v1.4.3](https://github.com/dnanexus-rnd/GLnexus/releases/tag/v1.4.3)</li></ul> | [GLnexus GitHub](https://github.com/dnanexus-rnd/GLnexus) |
| hificnv | <ul><li>[HiFiCNV v0.1.7](https://github.com/PacificBiosciences/HiFiCNV/releases/tag/v0.1.7)</li><li>[bcftools 1.16](https://github.com/samtools/bcftools/releases/tag/1.16)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/0b0fbe939648087e9fdea4497ae08dc76538ebf0/docker/hificnv) |
| hiphase | <ul><li>[HiPhase 1.0.0](https://github.com/PacificBiosciences/HiPhase/releases/tag/v1.0.0)</li><li>[samtools 1.18](https://github.com/samtools/samtools/releases/tag/1.18)</li><li>[bcftools 1.18](https://github.com/samtools/bcftools/releases/tag/1.18)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/d26db6204409dfeff56e169cdba0cc14bc272f15/docker/hiphase) |
| hiphase | <ul><li>[HiPhase 1.1.0](https://github.com/PacificBiosciences/HiPhase/releases/tag/v1.1.0)</li><li>[samtools 1.18](https://github.com/samtools/samtools/releases/tag/1.18)</li><li>[bcftools 1.18](https://github.com/samtools/bcftools/releases/tag/1.18)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/02f2bae9d10504c587990995fba3aa7335f910f8/docker/hiphase) |
| htslib | <ul><li>[htslib 1.14](https://github.com/samtools/htslib/releases/tag/1.14)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/htslib) |
| mosdepth | <ul><li>[mosdepth 0.2.9](https://github.com/brentp/mosdepth/releases/tag/v0.2.9)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/mosdepth) |
| paraphase | <ul><li>[minimap2 2.26](https://github.com/lh3/minimap2/releases/tag/v2.26)</li><li>[samtools 1.18](https://github.com/samtools/samtools/releases/tag/1.18)</li><li>[paraphase 3.0.0](https://github.com/PacificBiosciences/paraphase)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/4f510e5f434cc138577853f56558b90e059fd770/docker/paraphase) |
@@ -270,7 +270,7 @@ The Docker image used by a particular step of the workflow can be identified by
| samtools | <ul><li>[samtools 1.14](https://github.com/samtools/samtools/releases/tag/1.14)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/samtools) |
| slivar | <ul><li>[slivar 0.2.2](https://github.com/brentp/slivar/releases/tag/v0.2.2)</li><li>[bcftools 1.14](https://github.com/samtools/bcftools/releases/tag/1.14)</li><li>[vcfpy 0.13.3](https://github.com/bihealth/vcfpy/releases/tag/v0.13.3)</li><li>[pysam 0.19.1](https://github.com/pysam-developers/pysam/releases/tag/v0.19.1)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/slivar) |
| svpack | <ul><li>[svpack 36180ae6](https://github.com/PacificBiosciences/svpack/tree/a82598ebc4013bf32e70295b83b380ada6302c4a)</li><li>[htslib 1.18](https://github.com/samtools/htslib/releases/tag/1.18)</li><li>[pysam 0.21.0](https://github.com/pysam-developers/pysam/releases/tag/v0.21.0)</li> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/8edbc516abc0ff43ac279b48018003923721b054/docker/svpack) |
| trgt | <ul><li>[trgt 0.8.0](https://github.com/PacificBiosciences/trgt/releases/tag/v0.7.0)</li><li>[samtools 1.18](https://github.com/samtools/samtools/releases/tag/1.18)</li><li>[bcftools 1.18](https://github.com/samtools/bcftools/releases/tag/1.18)</li><li>[pysam 0.21.0](https://github.com/pysam-developers/pysam/releases/tag/v0.21.0)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3bbc033f8f942b10a304b4fa907957a789c73ef7/docker/trgt) |
| trgt | <ul><li>[trgt 0.8.0](https://github.com/PacificBiosciences/trgt/releases/tag/v0.8.0)</li><li>[samtools 1.18](https://github.com/samtools/samtools/releases/tag/1.18)</li><li>[bcftools 1.18](https://github.com/samtools/bcftools/releases/tag/1.18)</li><li>[pysam 0.21.0](https://github.com/pysam-developers/pysam/releases/tag/v0.21.0)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3bbc033f8f942b10a304b4fa907957a789c73ef7/docker/trgt) |
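Each row in the table above corresponds to a WDL `runtime` block that pins a container image for one task. A minimal, hypothetical sketch (the image URI below is an assumption for illustration, not taken from this diff — the real workflow resolves images built from the wdl-dockerfiles commits listed in the table):

```wdl
task run_hiphase_example {
	command <<<
		hiphase --version
	>>>
	runtime {
		# Hypothetical image URI; in the actual workflow the image is
		# pinned to a specific wdl-dockerfiles build of HiPhase 1.1.0.
		docker: "quay.io/pacbio/hiphase:1.1.0"
	}
}
```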

---

6 changes: 3 additions & 3 deletions wdl-ci.config.json
@@ -348,7 +348,7 @@
"tasks": {
"pbsv_discover": {
"key": "pbsv_discover",
"digest": "lbv7nwockw3wcbkfvapzoc2wv7fcodnw",
"digest": "winvogvhhjzlbxhsknadxpvwfj6f37wm",
"tests": [
{
"inputs": {
@@ -654,7 +654,7 @@
},
"deepvariant_postprocess_variants": {
"key": "deepvariant_postprocess_variants",
"digest": "ey7zqpajaeesvsg372rehhjmkpqld2qx",
"digest": "nxh4wiylfi7ngy6cxe65ovrzrwdoutja",
"tests": [
{
"inputs": {
@@ -881,7 +881,7 @@
"tasks": {
"run_hiphase": {
"key": "run_hiphase",
"digest": "6k2rtel3k6747xhcfnjufqfzgcnb7g5v",
"digest": "6qxqz7p56asf3z6f6cqwebdsd57fs4lh",
"tests": [
{
"inputs": {
4 changes: 2 additions & 2 deletions workflows/main.wdl
@@ -115,8 +115,8 @@ workflow humanwgs {
Array[File] sample_hiphase_blocks = sample_analysis.hiphase_blocks
Array[File] sample_hiphase_haplotags = sample_analysis.hiphase_haplotags
Array[IndexData] merged_haplotagged_bam = sample_analysis.merged_haplotagged_bam
Array[File] haplotagged_bam_mosdepth_summary = sample_analysis.haplotagged_bam_mosdepth_summary
Array[File] haplotagged_bam_mosdepth_region_bed = sample_analysis.haplotagged_bam_mosdepth_region_bed
Array[File] mosdepth_summary = sample_analysis.mosdepth_summary
Array[File] mosdepth_region_bed = sample_analysis.mosdepth_region_bed

# per sample trgt outputs
Array[IndexData] trgt_spanning_reads = sample_analysis.trgt_spanning_reads
166 changes: 88 additions & 78 deletions workflows/sample_analysis/sample_analysis.wdl
@@ -49,27 +49,8 @@ workflow sample_analysis {
}
}

call DeepVariant.deepvariant {
input:
sample_id = sample.sample_id,
aligned_bams = aligned_bam,
reference_fasta = reference.fasta,
reference_name = reference.name,
deepvariant_version = deepvariant_version,
custom_deepvariant_model_tar = custom_deepvariant_model_tar,
default_runtime_attributes = default_runtime_attributes
}

call bcftools {
input:
vcf = deepvariant.vcf.data,
stats_params = "--apply-filters PASS --samples ~{sample.sample_id}",
reference = reference.fasta.data,
runtime_attributes = default_runtime_attributes
}

scatter (shard_index in range(length(pbsv_splits))) {
Array[String] region_set = pbsv_splits[shard_index]
Array[String] region_set = pbsv_splits[shard_index]

call PbsvCall.pbsv_call {
input:
@@ -84,6 +65,7 @@
}
}

# concatenate pbsv vcfs
call ConcatVcf.concat_vcf {
input:
vcfs = pbsv_call.pbsv_vcf,
@@ -92,73 +74,97 @@
runtime_attributes = default_runtime_attributes
}

IndexData zipped_pbsv_vcf = {
"data": concat_vcf.concatenated_vcf,
"data_index": concat_vcf.concatenated_vcf_index
}

call HiPhase.hiphase {
# vcfs order: small variants, SVs
input:
id = sample.sample_id,
refname = reference.name,
sample_ids = [sample.sample_id],
vcfs = [deepvariant.vcf, zipped_pbsv_vcf],
bams = aligned_bam,
haplotag = true,
reference_fasta = reference.fasta,
default_runtime_attributes = default_runtime_attributes
}

# merge haplotagged bams if there are multiple
if (length(hiphase.haplotagged_bams) > 1) {
scatter (bam_object in hiphase.haplotagged_bams) {
# merge aligned bams if there are multiple
if (length(aligned_bam) > 1) {
scatter (bam_object in aligned_bam) {
File bam_to_merge = bam_object.data
}
call merge_bams {
input:
bams = bam_to_merge,
output_bam_name = "~{sample.sample_id}.~{reference.name}.haplotagged.bam",
output_bam_name = "~{sample.sample_id}.~{reference.name}.bam",
runtime_attributes = default_runtime_attributes
}
}

# select the merged bam if it exists, otherwise select the first (only) haplotagged bam
File haplotagged_bam = select_first([merge_bams.merged_bam, hiphase.haplotagged_bams[0].data])
File haplotagged_bam_index = select_first([merge_bams.merged_bam_index, hiphase.haplotagged_bams[0].data_index])
# select the merged bam if it exists, otherwise select the first (only) aligned bam
File aligned_bam_data = select_first([merge_bams.merged_bam, aligned_bam[0].data])
File aligned_bam_index = select_first([merge_bams.merged_bam_index, aligned_bam[0].data_index])
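The merge-then-select pattern above is a standard WDL idiom for conditional task outputs: because `merge_bams` runs inside an `if`, its output has optional type `File?`, and `select_first` returns the first defined value — the merged BAM when it exists, otherwise the single input BAM. Condensed from the diff:

```wdl
# Condensed sketch of the conditional-merge idiom used above.
# merge_bams only runs when there is more than one input BAM, so its
# output is a File? that select_first falls back from.
if (length(aligned_bam) > 1) {
	call merge_bams {
		input:
			bams = bam_to_merge  # File array gathered from aligned_bam
	}
}
File aligned_bam_data  = select_first([merge_bams.merged_bam, aligned_bam[0].data])
File aligned_bam_index = select_first([merge_bams.merged_bam_index, aligned_bam[0].data_index])
```

With a single input BAM the `if` block is skipped entirely and no merge task is scheduled.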

call Mosdepth.mosdepth {
input:
aligned_bam = haplotagged_bam,
aligned_bam_index = haplotagged_bam_index,
aligned_bam = aligned_bam_data,
aligned_bam_index = aligned_bam_index,
runtime_attributes = default_runtime_attributes
}

call DeepVariant.deepvariant {
input:
sample_id = sample.sample_id,
aligned_bams = aligned_bam,
reference_fasta = reference.fasta,
reference_name = reference.name,
deepvariant_version = deepvariant_version,
custom_deepvariant_model_tar = custom_deepvariant_model_tar,
default_runtime_attributes = default_runtime_attributes
}

call bcftools {
input:
vcf = deepvariant.vcf.data,
stats_params = "--apply-filters PASS --samples ~{sample.sample_id}",
reference = reference.fasta.data,
runtime_attributes = default_runtime_attributes
}

call trgt {
input:
sample_id = sample.sample_id,
sex = sample.sex,
bam = haplotagged_bam,
bam_index = haplotagged_bam_index,
bam = aligned_bam_data,
bam_index = aligned_bam_index,
reference = reference.fasta.data,
reference_index = reference.fasta.data_index,
tandem_repeat_bed = reference.trgt_tandem_repeat_bed,
output_prefix = "~{sample.sample_id}.~{reference.name}",
runtime_attributes = default_runtime_attributes
}

call HiPhase.hiphase {
# vcfs order: small variants, SVs, TRGT
input:
id = sample.sample_id,
refname = reference.name,
sample_ids = [sample.sample_id],
vcfs = [
deepvariant.vcf,
{"data": concat_vcf.concatenated_vcf, "data_index": concat_vcf.concatenated_vcf_index},
{"data": trgt.repeat_vcf, "data_index": trgt.repeat_vcf_index}
],
bams = [{"data": aligned_bam_data, "data_index": aligned_bam_index}],
haplotag = true,
reference_fasta = reference.fasta,
default_runtime_attributes = default_runtime_attributes
}
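HiPhase returns phased VCFs in the same order they were passed in, so downstream code can pick each one out positionally — which is exactly what the outputs section of this file does:

```wdl
# Input order was [small variants, SVs, TRGT] (see the vcfs array above),
# so phased_vcfs indexes map directly to variant type:
IndexData phased_small_variant_vcf = hiphase.phased_vcfs[0]  # DeepVariant
IndexData phased_sv_vcf            = hiphase.phased_vcfs[1]  # pbsv
IndexData trgt_repeat_vcf          = hiphase.phased_vcfs[2]  # TRGT
```

Reordering the `vcfs` input without updating these indexes would silently mislabel the outputs, which is why the call carries an order comment.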

IndexData haplotagged_bam = {
"data": hiphase.haplotagged_bams[0].data,
"data_index": hiphase.haplotagged_bams[0].data_index
}

call coverage_dropouts {
input:
bam = haplotagged_bam,
bam_index = haplotagged_bam_index,
bam = haplotagged_bam.data,
bam_index = haplotagged_bam.data_index,
tandem_repeat_bed = reference.trgt_tandem_repeat_bed,
output_prefix = "~{sample.sample_id}.~{reference.name}",
runtime_attributes = default_runtime_attributes
}

call cpg_pileup {
input:
bam = haplotagged_bam,
bam_index = haplotagged_bam_index,
bam = haplotagged_bam.data,
bam_index = haplotagged_bam.data_index,
output_prefix = "~{sample.sample_id}.~{reference.name}",
reference = reference.fasta.data,
reference_index = reference.fasta.data_index,
@@ -168,8 +174,8 @@
call paraphase {
input:
sample_id = sample.sample_id,
bam = haplotagged_bam,
bam_index = haplotagged_bam_index,
bam = haplotagged_bam.data,
bam_index = haplotagged_bam.data_index,
reference = reference.fasta.data,
reference_index = reference.fasta.data_index,
out_directory = "~{sample.sample_id}.paraphase",
@@ -180,8 +186,8 @@
input:
sample_id = sample.sample_id,
sex = sample.sex,
bam = haplotagged_bam,
bam_index = haplotagged_bam_index,
bam = haplotagged_bam.data,
bam_index = haplotagged_bam.data_index,
phased_vcf = hiphase.phased_vcfs[0].data,
phased_vcf_index = hiphase.phased_vcfs[0].data_index,
reference = reference.fasta.data,
@@ -195,33 +201,36 @@
}

output {
# per movie stats, alignments, and svsigs
# per movie stats, alignments
Array[File] bam_stats = pbmm2_align.bam_stats
Array[File] read_length_summary = pbmm2_align.read_length_summary
Array[File] read_quality_summary = pbmm2_align.read_quality_summary
Array[IndexData] aligned_bams = aligned_bam

# phased_vcfs output order from HiPhase: small variants, SVs, TRGT

# per sample structural variant signatures and calls
IndexData phased_sv_vcf = hiphase.phased_vcfs[1]
Array[File] svsigs = pbsv_discover.svsig

# per sample small variant calls
IndexData phased_small_variant_vcf = hiphase.phased_vcfs[0]
IndexData small_variant_gvcf = deepvariant.gvcf
File small_variant_vcf_stats = bcftools.stats
File small_variant_roh_out = bcftools.roh_out
File small_variant_roh_bed = bcftools.roh_bed

# per sample final phased variant calls and haplotagged alignments
# phased_vcfs order: small variants, SVs
IndexData phased_small_variant_vcf = hiphase.phased_vcfs[0]
IndexData phased_sv_vcf = hiphase.phased_vcfs[1]
# per sample phasing stats and haplotagged alignments
File hiphase_stats = hiphase.hiphase_stats
File hiphase_blocks = hiphase.hiphase_blocks
File hiphase_haplotags = select_first([hiphase.hiphase_haplotags])
IndexData merged_haplotagged_bam = {"data": haplotagged_bam, "data_index": haplotagged_bam_index}
File haplotagged_bam_mosdepth_summary = mosdepth.summary
File haplotagged_bam_mosdepth_region_bed = mosdepth.region_bed
IndexData merged_haplotagged_bam = haplotagged_bam
File mosdepth_summary = mosdepth.summary
File mosdepth_region_bed = mosdepth.region_bed

# per sample trgt outputs
IndexData trgt_repeat_vcf = hiphase.phased_vcfs[2]
IndexData trgt_spanning_reads = {"data": trgt.spanning_reads, "data_index": trgt.spanning_reads_index}
IndexData trgt_repeat_vcf = {"data": trgt.repeat_vcf, "data_index": trgt.repeat_vcf_index}
File trgt_dropouts = coverage_dropouts.trgt_dropouts

# per sample cpg outputs
@@ -446,12 +455,13 @@ task trgt {
File reference_index
File tandem_repeat_bed

String output_prefix

RuntimeAttributes runtime_attributes
}

Boolean sex_defined = defined(sex)
String karyotype = if select_first([sex, "FEMALE"]) == "MALE" then "XY" else "XX"
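The two declarations above make `sex` effectively optional with an `XX` default: `select_first([sex, "FEMALE"])` yields `"FEMALE"` whenever `sex` is undefined, so only an explicit `"MALE"` produces the `"XY"` karyotype passed to TRGT. Spelled out:

```wdl
# Defaulting behavior of the karyotype expression above:
#   sex = "MALE"   -> select_first returns "MALE"   -> karyotype = "XY"
#   sex = "FEMALE" -> select_first returns "FEMALE" -> karyotype = "XX"
#   sex undefined  -> select_first returns "FEMALE" -> karyotype = "XX"
String karyotype = if select_first([sex, "FEMALE"]) == "MALE" then "XY" else "XX"
```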
String bam_basename = basename(bam, ".bam")
Int threads = 4
Int disk_size = ceil((size(bam, "GB") + size(reference, "GB")) * 2 + 20)

@@ -468,37 +478,37 @@
--genome ~{reference} \
--repeats ~{tandem_repeat_bed} \
--reads ~{bam} \
--output-prefix ~{bam_basename}.trgt
--output-prefix ~{output_prefix}.trgt

bcftools --version

bcftools sort \
--output-type z \
--output ~{bam_basename}.trgt.sorted.vcf.gz \
~{bam_basename}.trgt.vcf.gz
--output ~{output_prefix}.trgt.sorted.vcf.gz \
~{output_prefix}.trgt.vcf.gz

bcftools index \
--threads ~{threads - 1} \
--tbi \
~{bam_basename}.trgt.sorted.vcf.gz
~{output_prefix}.trgt.sorted.vcf.gz

samtools --version

samtools sort \
-@ ~{threads - 1} \
-o ~{bam_basename}.trgt.spanning.sorted.bam \
~{bam_basename}.trgt.spanning.bam
-o ~{output_prefix}.trgt.spanning.sorted.bam \
~{output_prefix}.trgt.spanning.bam

samtools index \
-@ ~{threads - 1} \
~{bam_basename}.trgt.spanning.sorted.bam
~{output_prefix}.trgt.spanning.sorted.bam
>>>

output {
File spanning_reads = "~{bam_basename}.trgt.spanning.sorted.bam"
File spanning_reads_index = "~{bam_basename}.trgt.spanning.sorted.bam.bai"
File repeat_vcf = "~{bam_basename}.trgt.sorted.vcf.gz"
File repeat_vcf_index = "~{bam_basename}.trgt.sorted.vcf.gz.tbi"
File spanning_reads = "~{output_prefix}.trgt.spanning.sorted.bam"
File spanning_reads_index = "~{output_prefix}.trgt.spanning.sorted.bam.bai"
File repeat_vcf = "~{output_prefix}.trgt.sorted.vcf.gz"
File repeat_vcf_index = "~{output_prefix}.trgt.sorted.vcf.gz.tbi"
}

runtime {