From 1b333ab144f3b2ff267c0304448c0075fdb7729e Mon Sep 17 00:00:00 2001
From: John SJ Anderson <janders4@fredhutch.org>
Date: Thu, 29 Aug 2024 11:46:14 -0700
Subject: [PATCH] Add prM-E build [#15]

---
 phylogenetic/Snakefile                        |   4 +-
 ...config.json => auspice_config_genome.json} |   6 +-
 .../defaults/auspice_config_prM-E.json        |  64 +++++++++++
 phylogenetic/defaults/config.yaml             |  26 +++--
 ...strains.txt => dropped_strains_genome.txt} |   0
 .../defaults/dropped_strains_prM-E.txt        |   0
 .../{genemap.gff => genemap_genome.gff}       |   0
 phylogenetic/defaults/genemap_prM-E.gff       |   5 +
 .../defaults/include_strains_prM-E.txt        |   0
 ...reference.fasta => reference_genome.fasta} |   0
 .../{reference.gb => reference_genome.gb}     |   0
 phylogenetic/defaults/reference_prM-E.fasta   |  13 +++
 phylogenetic/rules/annotate_phylogeny.smk     |   4 +-
 phylogenetic/rules/construct_phylogeny.smk    |   5 +-
 phylogenetic/rules/export.smk                 |   4 +-
 phylogenetic/rules/prepare_sequences.smk      | 102 ++++++++++++++----
 16 files changed, 193 insertions(+), 40 deletions(-)
 rename phylogenetic/defaults/{auspice_config.json => auspice_config_genome.json} (94%)
 create mode 100644 phylogenetic/defaults/auspice_config_prM-E.json
 rename phylogenetic/defaults/{dropped_strains.txt => dropped_strains_genome.txt} (100%)
 create mode 100644 phylogenetic/defaults/dropped_strains_prM-E.txt
 rename phylogenetic/defaults/{genemap.gff => genemap_genome.gff} (100%)
 create mode 100644 phylogenetic/defaults/genemap_prM-E.gff
 create mode 100644 phylogenetic/defaults/include_strains_prM-E.txt
 rename phylogenetic/defaults/{reference.fasta => reference_genome.fasta} (100%)
 rename phylogenetic/defaults/{reference.gb => reference_genome.gb} (100%)
 create mode 100644 phylogenetic/defaults/reference_prM-E.fasta

diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
index 0ada5e7..02f6f73 100644
--- a/phylogenetic/Snakefile
+++ b/phylogenetic/Snakefile
@@ -1,11 +1,11 @@
 configfile: "defaults/config.yaml"
 
 # we will likely add gene-specific builds in the future
-gene = ["genome"]
+gene = ["genome", "prM-E"]
 
 rule all:
     input:
-        auspice_json="auspice/yellow-fever-virus_genome.json",
+        auspice_json=expand("auspice/yellow-fever-virus_{gene}.json", gene=gene),
 
 
 include: "rules/prepare_sequences.smk"
diff --git a/phylogenetic/defaults/auspice_config.json b/phylogenetic/defaults/auspice_config_genome.json
similarity index 94%
rename from phylogenetic/defaults/auspice_config.json
rename to phylogenetic/defaults/auspice_config_genome.json
index 8460657..67f88f2 100644
--- a/phylogenetic/defaults/auspice_config.json
+++ b/phylogenetic/defaults/auspice_config_genome.json
@@ -1,5 +1,5 @@
 {
-  "title": "Real-time tracking of yellow fever virus full genome virus evolution",
+  "title": "Real-time tracking of yellow fever virus full genome evolution",
   "maintainers": [
     {"name": "John SJ Anderson", "url": "https://bedford.io/team/john-sj-anderson/"},
     {"name": "the Nextstrain team", "url": "https://nextstrain.org/team"}
@@ -23,9 +23,9 @@
       "type": "categorical"
     },
     {
-      "key": "num_date",
+      "key": "date",
       "title": "Date",
-      "type": "continuous"
+      "type": "temporal"
     },
     {
       "key": "region",
diff --git a/phylogenetic/defaults/auspice_config_prM-E.json b/phylogenetic/defaults/auspice_config_prM-E.json
new file mode 100644
index 0000000..06791f3
--- /dev/null
+++ b/phylogenetic/defaults/auspice_config_prM-E.json
@@ -0,0 +1,64 @@
+{
+  "title": "Real-time tracking of yellow fever virus prM-E region evolution",
+  "maintainers": [
+    {"name": "John SJ Anderson", "url": "https://bedford.io/team/john-sj-anderson/"},
+    {"name": "the Nextstrain team", "url": "https://nextstrain.org/team"}
+  ],
+  "data_provenance": [
+    {
+      "name": "GenBank",
+      "url": "https://www.ncbi.nlm.nih.gov/genbank/"
+    }
+  ],
+  "build_url": "https://github.com/nextstrain/yellow-fever",
+  "colorings": [
+    {
+      "key": "gt",
+      "title": "Genotype",
+      "type": "categorical"
+    },
+    {
+      "key": "clade",
+      "title": "Genotype (via Nextclade tree)",
+      "type": "categorical"
+    },
+    {
+      "key": "date",
+      "title": "Date",
+      "type": "temporal"
+    },
+    {
+      "key": "region",
+      "title": "Region",
+      "type": "categorical"
+    },
+    {
+      "key": "country",
+      "title": "Country",
+      "type": "categorical"
+    },
+    {
+      "key": "host",
+      "title": "Host",
+      "type": "categorical"
+    }
+  ],
+  "geo_resolutions": [
+    "country",
+    "region"
+  ],
+  "display_defaults": {
+    "map_triplicate": true,
+    "color_by": "region"
+  },
+  "filters": [
+    "clade",
+    "region",
+    "country",
+    "author",
+    "host"
+  ],
+  "metadata_columns": [
+    "author"
+  ]
+}
diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml
index 47cb057..ac36297 100644
--- a/phylogenetic/defaults/config.yaml
+++ b/phylogenetic/defaults/config.yaml
@@ -1,18 +1,28 @@
 strain_id_field: "accession"
 files:
-    auspice_config: "defaults/auspice_config.json"
     colors: "defaults/colors.tsv"
     description: "defaults/description.md"
-    exclude: "defaults/dropped_strains.txt"
-    genemap: "defaults/genemap.gff"
-    include_genome: "defaults/include_strains_genome.txt"
-    reference_gb: "defaults/reference.gb"
-    reference_fasta: "defaults/reference.fasta"
+    genome:
+        auspice_config: "defaults/auspice_config_genome.json"
+        exclude: "defaults/dropped_strains_genome.txt"
+        genemap: "defaults/genemap_genome.gff"
+        include: "defaults/include_strains_genome.txt"
+        reference: "defaults/reference_genome.fasta"
+    prM-E:
+        auspice_config: "defaults/auspice_config_prM-E.json"
+        exclude: "defaults/dropped_strains_prM-E.txt"
+        genemap: "defaults/genemap_prM-E.gff"
+        include: "defaults/include_strains_prM-E.txt"
+        reference: "defaults/reference_prM-E.fasta"
 filter:
     group_by: "country year"
     min_date: "1900-01-01"
-    min_length: 9000
-    sequences_per_group: 20
+    genome:
+        min_length: 9000
+        sequences_per_group: 20
+    prM-E:
+        min_length: 600
+        sequences_per_group: 200
 refine:
     coalescent: "opt"
     date_inference: "marginal"
diff --git a/phylogenetic/defaults/dropped_strains.txt b/phylogenetic/defaults/dropped_strains_genome.txt
similarity index 100%
rename from phylogenetic/defaults/dropped_strains.txt
rename to phylogenetic/defaults/dropped_strains_genome.txt
diff --git a/phylogenetic/defaults/dropped_strains_prM-E.txt b/phylogenetic/defaults/dropped_strains_prM-E.txt
new file mode 100644
index 0000000..e69de29
diff --git a/phylogenetic/defaults/genemap.gff b/phylogenetic/defaults/genemap_genome.gff
similarity index 100%
rename from phylogenetic/defaults/genemap.gff
rename to phylogenetic/defaults/genemap_genome.gff
diff --git a/phylogenetic/defaults/genemap_prM-E.gff b/phylogenetic/defaults/genemap_prM-E.gff
new file mode 100644
index 0000000..60c279d
--- /dev/null
+++ b/phylogenetic/defaults/genemap_prM-E.gff
@@ -0,0 +1,5 @@
+##sequence-region prM-E 1 672
+NC_002031.1	feature	source	1	672	.	+	.	gene=nuc
+NC_002031.1	feature	gene	1	333	.	+	.	gene_name=prM
+NC_002031.1	feature	gene	109	333	.	+	.	gene_name=M
+NC_002031.1	feature	gene	334	672	.	+	.	gene_name=E
diff --git a/phylogenetic/defaults/include_strains_prM-E.txt b/phylogenetic/defaults/include_strains_prM-E.txt
new file mode 100644
index 0000000..e69de29
diff --git a/phylogenetic/defaults/reference.fasta b/phylogenetic/defaults/reference_genome.fasta
similarity index 100%
rename from phylogenetic/defaults/reference.fasta
rename to phylogenetic/defaults/reference_genome.fasta
diff --git a/phylogenetic/defaults/reference.gb b/phylogenetic/defaults/reference_genome.gb
similarity index 100%
rename from phylogenetic/defaults/reference.gb
rename to phylogenetic/defaults/reference_genome.gb
diff --git a/phylogenetic/defaults/reference_prM-E.fasta b/phylogenetic/defaults/reference_prM-E.fasta
new file mode 100644
index 0000000..1b51ee3
--- /dev/null
+++ b/phylogenetic/defaults/reference_prM-E.fasta
@@ -0,0 +1,13 @@
+> prM-E region (genome 641-1312, 672 nt)
+CCAAGAGAGGAGCCAGATGACATTGATTGCTGGTGCTATGGGGTGGAAAACGTTAGAGTC
+GCATATGGTAAGTGTGACTCAGCAGGCAGGTCTAGGAGGTCAAGAAGGGCCATTGACTTG
+CCTACGCATGAAAACCATGGTTTGAAGACCCGGCAAGAAAAATGGATGACTGGAAGAATG
+GGTGAAAGGCAACTCCAAAAGATTGAGAGATGGCTCGTGAGGAACCCCTTTTTTGCAGTG
+ACAGCTCTGACCATTGCCTACCTTGTGGGAAGCAACATGACGCAACGAGTCGTGATTGCC
+CTACTGGTCTTGGCTGTTGGTCCGGCCTACTCAGCTCACTGCATTGGAATTACTGACAGG
+GATTTCATTGAGGGGGTGCATGGAGGAACTTGGGTTTCAGCTACCCTGGAGCAAGACAAG
+TGTGTCACTGTTATGGCCCCTGACAAGCCTTCATTGGACATCTCACTAGAGACAGTAGCC
+ATTGATGGACCTGCTGAGGCGAGGAAAGTGTGTTACAATGCAGTTCTCACTCATGTGAAG
+ATTAATGACAAGTGCCCCAGCACTGGAGAGGCCCACCTAGCTGAAGAGAACGAAGGGGAC
+AATGCGTGCAAGCGCACTTATTCTGATAGAGGCTGGGGCAATGGCTGTGGCCTATTTGGG
+AAAGGGAGCATT
diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk
index b278e5b..7a52612 100644
--- a/phylogenetic/rules/annotate_phylogeny.smk
+++ b/phylogenetic/rules/annotate_phylogeny.smk
@@ -7,7 +7,7 @@ rule ancestral:
     """Reconstructing ancestral sequences and mutations"""
     input:
         tree = "results/{gene}/tree.nwk",
-        alignment = "results/{gene}/aligned.fasta"
+        alignment = "results/{gene}/aligned_and_filtered.fasta"
     output:
         node_data = "results/{gene}/nt_muts.json"
     params:
@@ -31,7 +31,7 @@ rule translate:
     input:
         tree = "results/{gene}/tree.nwk",
         node_data = "results/{gene}/nt_muts.json",
-        genemap = "defaults/genemap.gff"
+        genemap = "defaults/genemap_{gene}.gff"
     output:
         node_data = "results/{gene}/aa_muts.json"
     log:
diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk
index 858c902..6006d51 100644
--- a/phylogenetic/rules/construct_phylogeny.smk
+++ b/phylogenetic/rules/construct_phylogeny.smk
@@ -5,7 +5,7 @@ This part of the workflow constructs the phylogenetic tree.
 rule tree:
     """Building tree"""
     input:
-        alignment = "results/{gene}/aligned.fasta"
+        alignment = "results/{gene}/aligned_and_filtered.fasta"
     output:
         tree = "results/{gene}/tree_raw.nwk"
     log:
@@ -29,7 +29,7 @@ rule refine:
     """
     input:
         tree = "results/{gene}/tree_raw.nwk",
-        alignment = "results/{gene}/aligned.fasta",
+        alignment = "results/{gene}/aligned_and_filtered.fasta",
         metadata = "../ingest/results/metadata.tsv"
     output:
         tree = "results/{gene}/tree.nwk",
@@ -54,7 +54,6 @@ rule refine:
             --output-tree {output.tree:q} \
             --output-node-data {output.node_data:q} \
             --coalescent {params.coalescent:q} \
-            --timetree \
             --date-confidence \
             --date-inference {params.date_inference:q} \
             --clock-filter-iqd {params.clock_filter_iqd:q} \
diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk
index d36e476..9514f23 100644
--- a/phylogenetic/rules/export.smk
+++ b/phylogenetic/rules/export.smk
@@ -13,8 +13,8 @@ rule export:
         aa_muts = "results/{gene}/aa_muts.json",
         traits = "results/{gene}/traits.json",
         colors = config["files"]["colors"],
-        auspice_config = lambda wildcard: "defaults/auspice_config.json" if wildcard.gene in ["genome"] else "defaults/auspice_config_N450.json",
-        description=config["files"]["description"]
+        auspice_config = lambda w: config["files"][w.gene]["auspice_config"],
+        description=config["files"]["description"],
     output:
         auspice_json = "auspice/yellow-fever-virus_{gene}.json"
     params:
diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk
index b4c296b..3d765a5 100644
--- a/phylogenetic/rules/prepare_sequences.smk
+++ b/phylogenetic/rules/prepare_sequences.smk
@@ -3,33 +3,27 @@ This part of the workflow prepares sequences for constructing the
 phylogenetic tree.
 """
 
-rule filter:
-    message: """
-    Filtering to
-      - {params.sequences_per_group} sequence(s) per {params.group_by!s}
-      - excluding strains in {input.exclude}
-      - minimum genome length of {params.min_length}
-    """
+rule filter_genome:
     input:
-        exclude = config["files"]["exclude"],
-        include = config["files"]["include_genome"],
+        exclude = config["files"]["genome"]["exclude"],
+        include = config["files"]["genome"]["include"],
         # TODO once this repo is fully automated and uploading data to
         # S3, this step should download data from there instead of
         # depending on the ingest build
         metadata = "../ingest/results/metadata.tsv",
         sequences = "../ingest/results/sequences.fasta"
     output:
-        sequences = "results/{gene}/filtered.fasta"
+        sequences = "results/genome/filtered.fasta"
     params:
         group_by = config["filter"]["group_by"],
         min_date = config["filter"]["min_date"],
-        min_length = config["filter"]["min_length"],
-        sequences_per_group = config["filter"]["sequences_per_group"],
+        min_length = config["filter"]["genome"]["min_length"],
+        sequences_per_group = config["filter"]["genome"]["sequences_per_group"],
         strain_id = config["strain_id_field"]
     log:
-        "logs/{gene}/filter.txt",
+        "logs/genome/filter_genome.txt",
     benchmark:
-        "benchmarks/{gene}/filter.txt"
+        "benchmarks/genome/filter_genome.txt"
     shell:
         r"""
         augur filter \
@@ -46,17 +40,16 @@ rule filter:
           2> {log:q}
         """
 
-rule align:
+rule align_genome:
     input:
         sequences="results/genome/filtered.fasta",
-        reference=config["files"]["reference_gb"],
-        genemap=config["files"]["genemap"],
+        reference=config["files"]["genome"]["reference"],
     output:
-        alignment="results/{gene}/aligned.fasta",
+        alignment="results/genome/aligned_and_filtered.fasta",
     log:
-        "logs/{gene}/align.txt",
+        "logs/genome/align_genome.txt",
     benchmark:
-        "benchmarks/{gene}/align.txt"
+        "benchmarks/genome/align_genome.txt"
     shell:
         r"""
         augur align \
@@ -67,3 +60,72 @@ rule align:
             --remove-reference \
           2> {log:q}
         """
+
+
+rule align_and_extract_prME:
+    input:
+        reference=config["files"]["prM-E"]["reference"],
+        # TODO once this repo is fully automated and uploading data to
+        # S3, this step should download data from there instead of
+        # depending on the ingest build
+        sequences = "../ingest/results/sequences.fasta",
+    output:
+        alignment = "results/prM-E/aligned.fasta"
+    params:
+        group_by = config["filter"]["group_by"],
+        min_date = config["filter"]["min_date"],
+        min_length = config["filter"]["prM-E"]["min_length"],
+        sequences_per_group = config["filter"]["prM-E"]["sequences_per_group"],
+        strain_id = config["strain_id_field"]
+    log:
+        "logs/genome/filter_and_extract_prM-E.txt",
+    benchmark:
+        "benchmarks/genome/filter_and_extract_prM-E.txt"
+    shell:
+        r"""
+        augur align \
+            --sequences {input.sequences} \
+            --reference-sequence {input.reference} \
+            --output {output.alignment} \
+            --fill-gaps \
+            --remove-reference \
+          2> {log:q}
+        """
+
+
+rule filter_prME:
+    input:
+        exclude = config["files"]["prM-E"]["exclude"],
+        include = config["files"]["prM-E"]["include"],
+        # TODO once this repo is fully automated and uploading data to
+        # S3, this step should download data from there instead of
+        # depending on the ingest build
+        metadata = "../ingest/results/metadata.tsv",
+        sequences = "results/prM-E/aligned.fasta"
+    output:
+        sequences = "results/prM-E/aligned_and_filtered.fasta"
+    params:
+        group_by = config["filter"]["group_by"],
+        min_date = config["filter"]["min_date"],
+        min_length = config["filter"]["prM-E"]["min_length"],
+        sequences_per_group = config["filter"]["prM-E"]["sequences_per_group"],
+        strain_id = config["strain_id_field"]
+    log:
+        "logs/genome/filter_prM-E.txt",
+    benchmark:
+        "benchmarks/genome/filter_prM-E.txt"
+    shell:
+        r"""
+        augur filter \
+            --sequences {input.sequences:q} \
+            --metadata {input.metadata:q} \
+            --metadata-id-columns {params.strain_id:q} \
+            --exclude {input.exclude:q} \
+            --include {input.include:q} \
+            --output {output.sequences:q} \
+            --group-by {params.group_by} \
+            --sequences-per-group {params.sequences_per_group:q} \
+            --min-date {params.min_date:q} \
+            --min-length {params.min_length:q} \
+          2> {log:q}
+        """