DataBiosphere · mmorgantaylor · Oct 8, 2024 · Sep 25, 2024 · Sep 25, 2024 · Oct 2, 2024
diff --git a/.dockstore.yml b/.dockstore.yml
@@ -49,3 +49,10 @@ workflows:
     authors:
       - name: Terra Scientific Services
         email: [email protected]
+
+  - subclass: WDL
+    name: UpdateVcfDictionaryHeader
+    primaryDescriptorPath: /pipelines/imputation/scientificValidation/UpdateVcfDictionaryHeader.wdl
+    authors:
+      - name: Terra Scientific Services
+        email: [email protected]
diff --git a/pipelines/imputation/scientificValidation/README.md b/pipelines/imputation/scientificValidation/README.md
@@ -23,13 +23,39 @@ not interact with headers or annotations mostly because
 the only really "required" header is the dictionary
 and that gets transferred across and the imputation
 tool only look at GT and no info/format fields so
-we can just leave them be and have it not affect
+we can just leave them be.
+
+This wdl can optionally additionally extract a subset of samples if the
+optional input `samples_to_subselect` is provided.
 
 #### Inputs
 * input_vcf - input file to be subset
 * input_vcf_index 
 * bed_file - bed file containing intervals to subset by
+* (optional) samples_to_subselect - text file containing 
+newline-separated list of samples to subset the vcf by
 
 #### Outputs
 * subset_vcf - subsetted vcf
 * subset_vcf_index 
+
+## UpdateVcfDictionaryHeader
+### Purpose
+This wdl can be used to update the dictionary header lines
+(i.e. `##contigs=` lines) in a vcf file, based on a supplied
+dictionary file. This is useful when the contigs in the vcf
+file do not match the contigs in the reference panel used
+for imputation. The optional input `disable_sequence_dictionary_validation`,
+by default set to true, checks the contigs in the input vcf file and
+throws an error if they are not valid. If set to false, the contigs
+in the input vcf file are not checked.
+
+#### Inputs
+* input_vcf - input vcf file
+* input_vcf_index
+* dictionary_file - dictionary file to use for updating the vcf header;
+we usually use `gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict`.
+
+#### Outputs
+* output_vcf - reheadered vcf file
+* output_vcf_index
diff --git a/pipelines/imputation/scientificValidation/SubsetVcfByBedFile.wdl b/pipelines/imputation/scientificValidation/SubsetVcfByBedFile.wdl
@@ -5,13 +5,15 @@ workflow SubsetVcfByBedFile {
     input {
         File input_vcf
         File input_vcf_index
+        File? samples_to_subselect
         File bed_file
     }
 
     call BcftoolsSubsetVcf {
         input:
             input_vcf = input_vcf,
             input_vcf_index = input_vcf_index,
+            samples_to_subselect = samples_to_subselect,
             bed_file = bed_file
     }
 
@@ -25,6 +27,7 @@ task BcftoolsSubsetVcf {
     input {
         File input_vcf
         File input_vcf_index
+        File? samples_to_subselect
         File bed_file
 
         Int disk_size_gb = ceil(3 * size(input_vcf, "GiB")) + 20
@@ -41,6 +44,7 @@ task BcftoolsSubsetVcf {
         -R ~{bed_file} \
         -O z \
         -o ~{basename}.subset.vcf.gz \
+        ~{" --samples-file " + samples_to_subselect} \
         ~{input_vcf}
 
         bcftools index -t ~{basename}.subset.vcf.gz

diff --git a/pipelines/imputation/scientificValidation/UpdateVcfHeaderContigs.wdl b/pipelines/imputation/scientificValidation/UpdateVcfHeaderContigs.wdl
@@ -0,0 +1,69 @@
+version 1.0
+
+# This script is under review. It is not actively tested or maintained at this time.
+workflow UpdateVcfDictionaryHeader {
+    input {
+        File input_vcf
+        File input_vcf_index
+        File ref_dict
+    }
+
+    call UpdateVcfDictionaryHeader {
+        input:
+            vcf = input_vcf,
+            vcf_index = input_vcf_index,
+            ref_dict = ref_dict,
+            basename = basename(input_vcf, '.vcf.gz')
+    }
+
+    output {
+        File output_vcf = UpdateVcfDictionaryHeader.output_vcf
+        File output_vcf_index = UpdateVcfDictionaryHeader.output_vcf_index
+    }
+}
+
+task UpdateVcfDictionaryHeader {
+    input {
+        File vcf
+        File vcf_index
+        File ref_dict
+        String basename
+        Boolean disable_sequence_dictionary_validation = true
+
+        Int disk_size_gb = ceil(4*(size(vcf, "GiB") + size(vcf_index, "GiB"))) + 20
+        String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0"
+        Int cpu = 1
+        Int memory_mb = 6000
+        Int preemptible = 0
+    }
+    Int command_mem = memory_mb - 1500
+    Int max_heap = memory_mb - 1000
+
+    String disable_sequence_dict_validation_flag = if disable_sequence_dictionary_validation then "--disable-sequence-dictionary-validation" else ""
+
+    command <<<
+        set -e -o pipefail
+
+        ln -sf ~{vcf} input.vcf.gz
+        ln -sf ~{vcf_index} input.vcf.gz.tbi
+
+        ## update the header of the merged vcf
+        gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \
+        UpdateVCFSequenceDictionary \
+        --source-dictionary ~{ref_dict} \
+        --output ~{basename}.vcf.gz \
+        --replace -V input.vcf.gz \
+        ~{disable_sequence_dict_validation_flag}
+    >>>
+    runtime {
+        docker: gatk_docker
+        disks: "local-disk ${disk_size_gb} HDD"
+        memory: "${memory_mb} MiB"
+        cpu: cpu
+        preemptible: preemptible
+    }
+    output {
+        File output_vcf = "~{basename}.vcf.gz"
+        File output_vcf_index = "~{basename}.vcf.gz.tbi"
+    }
+}