Merge branch 'main' into jro-snippy_tree-dev

theiagen · Dec 31, 2022 · 6781a8a · 6781a8a
2 parents c9abd14 + 19d40c6
commit 6781a8a
Show file tree

Hide file tree

Showing 8 changed files with 585 additions and 3 deletions.
diff --git a/.dockstore.yml b/.dockstore.yml
@@ -2,7 +2,7 @@ version: 1.2
 workflows:
  - name: TheiaEuk_PE
    subclass: WDL
-   primaryDescriptorPath: /workflows/wf_theiaeuk.wdl
+   primaryDescriptorPath: /workflows/wf_theiaeuk_pe.wdl
    testParameterFiles:
     - empty.json
  - name: Cauris_CladeTyper
@@ -18,5 +18,10 @@ workflows:
  - name: Snippy_Tree
    subclass: WDL
    primaryDescriptorPath: /workflows/wf_snippy_tree.wdl
+   testParameterFiles:
+    - empty.json
+ - name: Nullarbor
+   subclass: WDL
+   primaryDescriptorPath: /workflows/wf_nullarbor.wdl
    testParameterFiles:
     - empty.json
diff --git a/tasks/gene_typing/task_snippy_variants.wdl b/tasks/gene_typing/task_snippy_variants.wdl
@@ -90,5 +90,6 @@ task snippy_variants {
       cpu: "~{cpus}"
       disks: "local-disk 100 SSD"
       preemptible: 0
+      maxRetries: 3
   }
 }
diff --git a/tasks/quality_control/task_screen.wdl b/tasks/quality_control/task_screen.wdl
@@ -77,7 +77,7 @@ task check_reads {
         # First Pass; assuming average depth
         mash sketch -o test -k 31 -m 3 -r ~{read1} ~{read2} > mash-output.txt 2>&1
         grep "Estimated genome size:" mash-output.txt | \
-          awk '{if($4){printf("%d", $4)}} END {if (!NR) print "0"}' > genome_size_output
+          awk '{if($4){printf("%5.0f\n", $4)}} END {if (!NR) print "0"}' > genome_size_output
         grep "Estimated coverage:" mash-output.txt | \
           awk '{if($3){printf("%d", $3)}} END {if (!NR) print "0"}' > coverage_output
         rm -rf test.msh
@@ -95,7 +95,7 @@ task check_reads {
           fi
           mash sketch -o test -k 31 ${M} -r ~{read1} ~{read2} > mash-output.txt 2>&1
           grep "Estimated genome size:" mash-output.txt | \
-            awk '{if($4){printf("%d", $4)}} END {if (!NR) print "0"}' > genome_size_output
+            awk '{if($4){printf("%5.0f\n", $4)}} END {if (!NR) print "0"}' > genome_size_output
           grep "Estimated coverage:" mash-output.txt | \
             awk '{if($3){printf("%d", $3)}} END {if (!NR) print "0"}' > coverage_output
           rm -rf test.msh

diff --git a/tasks/task_nullarbor.wdl b/tasks/task_nullarbor.wdl
@@ -0,0 +1,104 @@
+version 1.0
+
+task nullarbor_tsv {
+  #Inputs
+  input { 
+    String run_name = "run1"
+    File ref_genome
+    Array[File] read1
+    Array[File] read2
+    Array[String] samplename
+    String tree_builder = "iqtree_fast"
+    String assembler = "skesa"
+    String taxoner = "kraken2"
+    String docker = "quay.io/biocontainers/nullarbor:2.0.20191013--hdfd78af_3"
+    Int memory = 128
+    Int cpu = 16
+    File kraken1_db = "gs://theiagen-public-files/terra/theiaprok-files/minikraken_20171019_8GB_kraken1.tgz"
+    File kraken2_db = "gs://theiagen-public-files/terra/theiaprok-files/k2_standard_8gb_20210517.tar.gz"
+    File centrifuge_db = "gs://theiagen-public-files/terra/theiaprok-files/p_compressed+h+v.tar.gz"
+  }
+  command <<<
+    # capture date and version
+    # Print and save date
+    date | tee DATE
+    # Print and save version
+    nullarbor.pl --version | tee VERSION 
+    # untar taxoner dbs
+    mkdir k1_db
+    mkdir /cromwell_root/k2_db
+    mkdir cent_db
+    tar -C k1_db -xzvf ~{kraken1_db}
+    tar -C /cromwell_root/k2_db/ -xzvf ~{kraken2_db}
+    tar -C cent_db -xzvf ~{centrifuge_db}
+    mv cent_db/p_compressed+h+v.1.cf cent_db.1.cf
+    mv cent_db/p_compressed+h+v.2.cf cent_db.2.cf
+    mv cent_db/p_compressed+h+v.3.cf cent_db.3.cf
+    mv k1_db/minikraken_20171019_8GB/* k1_db/
+    # assign dbs for taxoners
+    export KRAKEN_DEFAULT_DB=k1_db
+    export KRAKEN2_DEFAULT_DB=/cromwell_root/k2_db/
+    export KRAKEN2_DB_PATH=/cromwell_root/k2_db/
+    export CENTRIFUGE_DEFAULT_DB=cent_db
+    echo four
+    read1_array=(~{sep=' ' read1})
+    read1_array_len=$(echo "${#read1_array[@]}")
+    read2_array=(~{sep=' ' read2})
+    read2_array_len=$(echo "${#read2_array[@]}")
+    samplename_array=(~{sep=' ' samplename})
+    samplename_array_len=$(echo "${#samplename_array[@]}")
+
+    # Ensure read, and samplename arrays are of equal length
+    if [ "$read1_array_len" -ne "$samplename_array_len" ]; then
+      echo "Read1 array array (length: $read1_array_len) and samplename array (length: $samplename_array_len) are of unequal length." >&2
+      exit 1
+    fi
+
+    if [ "$read2_array_len" -ne "$samplename_array_len" ]; then
+      echo "Read2 array (length: $read2_array_len) and samplename array (length: $samplename_array_len) are of unequal length." >&2
+      exit 1
+    fi
+
+    # create file of filenames for nullarbor input
+    touch nullarbor_input.tsv
+    for index in ${!read1_array[@]}; do
+      read1=${read1_array[$index]}
+      read2=${read2_array[$index]}
+      samplename=${samplename_array[$index]}
+
+      echo -e "${samplename}\t${read1}\t${read2}" >> nullarbor_input.tsv
+    done
+    # Run Nullarbor on the input assembly with the --all flag
+    nullarbor.pl \
+      --name ~{run_name} \
+      --ref ~{ref_genome} \
+      --input nullarbor_input.tsv \
+      --outdir nullarbor_outdir \
+      --assembler ~{assembler} \
+      --treebuilder ~{tree_builder} \
+      --taxoner ~{taxoner} \
+      --verbose
+    #Run makefile
+    make preview -C nullarbor_outdir/
+    nice make all -j 2 -l 4 -C nullarbor_outdir/ 2>&1 | tee -a nullarbor_outdir/nullarbor.log
+
+    echo "nullarbor complete,tarballing results now"
+
+    tar -cf - nullarbor_outdir/ | gzip -n --best > ~{run_name}.tar.gz
+
+  >>>
+   output {
+    String nullarbor_version = read_string("VERSION")
+    String nullarbor_docker = "~{docker}"
+    String analysis_date = read_string("DATE")
+    File nullarbor_report = "/cromwell_root/nullarbor_outdir/report/index.html"
+    File nullarbor_output_dir = "~{run_name}.tar.gz"
+  }
+  runtime {
+      docker: "~{docker}"
+      memory: "~{memory} GB"
+      cpu: cpu
+      disks: "local-disk 500 SSD"
+      preemptible: 0
+  }
+}
diff --git a/tasks/taxon_id/task_gambit.wdl b/tasks/taxon_id/task_gambit.wdl
@@ -181,3 +181,167 @@ task gambit {
     preemptible:  0
   }
 }
+
+task gambit_euk {
+  input {
+    File assembly
+    String samplename
+    String docker = "quay.io/staphb/gambit:0.5.0"
+    File gambit_db_genomes = "gs://theiagen-public-files/terra/candida_auris_refs/221006-theiagen-fungal-v0.1.db"
+    File gambit_db_signatures = "gs://theiagen-public-files/terra/candida_auris_refs/221006-theiagen-fungal-v0.1.h5" 
+  }
+  # If "File" type is used Cromwell attempts to localize it, which fails because it doesn't exist yet.
+  String report_path = "~{samplename}_gambit.json"
+  String closest_genomes_path = "~{samplename}_gambit_closest.csv"
+  command <<<
+    # capture date and version
+    date | tee DATE
+    gambit --version | tee GAMBIT_VERSION
+    
+    # set gambit reference dir; will assume that gambit genomes and signatures will be provided by user in tandem or not at all
+    if [[ ! -z "~{gambit_db_genomes}" ]]; then 
+      echo "User gabmit db identified; ~{gambit_db_genomes} will be utilized for alignment"
+      gambit_db_version="$(basename -- '~{gambit_db_genomes}'); $(basename -- '~{gambit_db_signatures}')"
+      gambit_db_dir="${PWD}/gambit_database"
+      mkdir ${gambit_db_dir}
+      cp ~{gambit_db_genomes} ${gambit_db_dir}
+      cp ~{gambit_db_signatures} ${gambit_db_dir}
+    else
+     gambit_db_dir="/gambit-db" 
+     gambit_db_version="unmodified from gambit container: ~{docker}"
+    fi
+    
+    echo ${gambit_db_version} | tee GAMBIT_DB_VERSION
+    
+    gambit -d ${gambit_db_dir} query -f json -o ~{report_path} ~{assembly} 
+    
+    python3 <<EOF
+    import json
+    import csv
+
+    def fmt_dist(d): return format(d, '.4f')
+
+    with open("~{report_path}") as f:
+      data = json.load(f)
+
+    (item,) = data['items']
+    predicted = item['predicted_taxon']
+    next_taxon = item['next_taxon']
+    closest = item['closest_genomes'][0]
+
+    with open('CLOSEST_DISTANCE', 'w') as f:
+      f.write(fmt_dist(closest['distance']))
+
+    # Predicted taxon
+    with open('PREDICTED_TAXON', 'w') as f:
+      if predicted is None:
+        f.write('NA')
+      elif predicted['name'] is None:
+        f.write('NA')
+      else:
+        f.write(predicted['name'])
+    with open('PREDICTED_TAXON_RANK', 'w') as f:
+      if predicted is None:
+        f.write('NA')
+      elif predicted['rank'] is None:
+        f.write('NA')
+      else:
+        f.write(predicted['rank'])
+    with open('PREDICTED_TAXON_THRESHOLD', 'w') as f:
+      if predicted is None:
+        f.write(fmt_dist(0))
+      elif predicted['distance_threshold'] is None:
+        f.write(fmt_dist(0))
+      else:
+        f.write(fmt_dist(predicted['distance_threshold']))
+
+    # Next taxon
+    with open('NEXT_TAXON', 'w') as f:
+      if next_taxon is None:
+        f.write('NA')
+      elif next_taxon['name'] is None:
+        f.write('NA')
+      else:
+        f.write(next_taxon['name'])
+    with open('NEXT_TAXON_RANK', 'w') as f:
+      if next_taxon is None:
+        f.write('NA')
+      elif next_taxon['rank'] is None:
+        f.write('NA')
+      else:
+        f.write(next_taxon['rank'])
+    with open('NEXT_TAXON_THRESHOLD', 'w') as f:
+      if next_taxon is None:
+        f.write(fmt_dist(0))
+      elif next_taxon['distance_threshold'] is None:
+        f.write(fmt_dist(0))
+      else:
+        f.write(fmt_dist(next_taxon['distance_threshold']))
+      
+    # Table of closest genomes
+    with open('~{closest_genomes_path}', 'w', newline='') as f:
+      writer = csv.writer(f)
+
+      # Header
+      writer.writerow([
+        'distance',
+        'genome.description',
+        'genome.taxon.name',
+        'genome.taxon.rank',
+        'genome.taxon.threshold',
+        'matched.name',
+        'matched.rank',
+        'matched.distance_threshold',
+      ])
+
+      for match in item['closest_genomes']:
+        genome = match['genome']
+        genome_taxon = genome['taxonomy'][0]
+        match_taxon = match['matched_taxon']
+
+        writer.writerow([
+          fmt_dist(match['distance']),
+          genome['description'],
+          genome_taxon['name'],
+          genome_taxon['rank'],
+          fmt_dist(genome_taxon['distance_threshold']),
+          '' if match_taxon is None else match_taxon['name'],
+          '' if match_taxon is None else match_taxon['rank'],
+          fmt_dist(0 if match_taxon is None else match_taxon['distance_threshold']),
+        ])
+    EOF
+    # set merlin tags
+    predicted_taxon=$(cat PREDICTED_TAXON)
+    if [[ ${predicted_taxon} == *"Candida auris"* ]] ; then 
+      merlin_tag="Candida auris"
+    elif [[ ${predicted_taxon} == *"Candida albicans"* ]] ; then 
+      merlin_tag="Candida albicans"
+    elif [[ ${predicted_taxon} == *"Aspergillus fumigatus"* ]] ; then 
+      merlin_tag="Aspergillus fumigatus"
+    elif [[ ${predicted_taxon} == *"Cryptococcus neoformans"* ]] ; then
+      merlin_tag="Cryptococcus neoformans"
+    else 
+      merlin_tag="None"
+    fi
+    echo ${merlin_tag} | tee MERLIN_TAG
+  >>>
+  output {
+    File gambit_report_file = report_path
+    File gambit_closest_genomes_file = closest_genomes_path
+    String gambit_predicted_taxon = read_string("PREDICTED_TAXON")
+    String gambit_predicted_taxon_rank = read_string("PREDICTED_TAXON_RANK") 
+    String gambit_next_taxon = read_string("NEXT_TAXON")
+    String gambit_next_taxon_rank = read_string("NEXT_TAXON_RANK")
+    String gambit_version = read_string("GAMBIT_VERSION")
+    String gambit_db_version = read_string("GAMBIT_DB_VERSION")
+    String merlin_tag = read_string("MERLIN_TAG")
+    String gambit_docker = docker
+  }
+  runtime {
+    docker:  "~{docker}"
+    memory:  "16 GB"
+    cpu:   8
+    disks: "local-disk 100 SSD"
+    preemptible:  0
+  }
+}