From 1b333ab144f3b2ff267c0304448c0075fdb7729e Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Thu, 29 Aug 2024 11:46:14 -0700 Subject: [PATCH] Add prM-E build [#15] --- phylogenetic/Snakefile | 4 +- ...config.json => auspice_config_genome.json} | 6 +- .../defaults/auspice_config_prM-E.json | 64 +++++++++++ phylogenetic/defaults/config.yaml | 26 +++-- ...strains.txt => dropped_strains_genome.txt} | 0 .../defaults/dropped_strains_prM-E.txt | 0 .../{genemap.gff => genemap_genome.gff} | 0 phylogenetic/defaults/genemap_prM-E.gff | 5 + .../defaults/include_strains_prM-E.txt | 0 ...reference.fasta => reference_genome.fasta} | 0 .../{reference.gb => reference_genome.gb} | 0 phylogenetic/defaults/reference_prM-E.fasta | 13 +++ phylogenetic/rules/annotate_phylogeny.smk | 4 +- phylogenetic/rules/construct_phylogeny.smk | 5 +- phylogenetic/rules/export.smk | 4 +- phylogenetic/rules/prepare_sequences.smk | 102 ++++++++++++++---- 16 files changed, 193 insertions(+), 40 deletions(-) rename phylogenetic/defaults/{auspice_config.json => auspice_config_genome.json} (94%) create mode 100644 phylogenetic/defaults/auspice_config_prM-E.json rename phylogenetic/defaults/{dropped_strains.txt => dropped_strains_genome.txt} (100%) create mode 100644 phylogenetic/defaults/dropped_strains_prM-E.txt rename phylogenetic/defaults/{genemap.gff => genemap_genome.gff} (100%) create mode 100644 phylogenetic/defaults/genemap_prM-E.gff create mode 100644 phylogenetic/defaults/include_strains_prM-E.txt rename phylogenetic/defaults/{reference.fasta => reference_genome.fasta} (100%) rename phylogenetic/defaults/{reference.gb => reference_genome.gb} (100%) create mode 100644 phylogenetic/defaults/reference_prM-E.fasta diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index 0ada5e7..02f6f73 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -1,11 +1,11 @@ configfile: "defaults/config.yaml" # we will likely add gene-specific builds in the future -gene = ["genome"] +gene = ["genome", "prM-E"] rule all: input: - auspice_json="auspice/yellow-fever-virus_genome.json", + auspice_json=expand("auspice/yellow-fever-virus_{gene}.json", gene=gene), include: "rules/prepare_sequences.smk" diff --git a/phylogenetic/defaults/auspice_config.json b/phylogenetic/defaults/auspice_config_genome.json similarity index 94% rename from phylogenetic/defaults/auspice_config.json rename to phylogenetic/defaults/auspice_config_genome.json index 8460657..67f88f2 100644 --- a/phylogenetic/defaults/auspice_config.json +++ b/phylogenetic/defaults/auspice_config_genome.json @@ -1,5 +1,5 @@ { - "title": "Real-time tracking of yellow fever virus full genome virus evolution", + "title": "Real-time tracking of yellow fever virus full genome evolution", "maintainers": [ {"name": "John SJ Anderson", "url": "https://bedford.io/team/john-sj-anderson/"}, {"name": "the Nextstrain team", "url": "https://nextstrain.org/team"} @@ -23,9 +23,9 @@ "type": "categorical" }, { - "key": "num_date", + "key": "date", "title": "Date", - "type": "continuous" + "type": "temporal" }, { "key": "region", diff --git a/phylogenetic/defaults/auspice_config_prM-E.json b/phylogenetic/defaults/auspice_config_prM-E.json new file mode 100644 index 0000000..06791f3 --- /dev/null +++ b/phylogenetic/defaults/auspice_config_prM-E.json @@ -0,0 +1,64 @@ +{ + "title": "Real-time tracking of yellow fever virus prM-E region evolution", + "maintainers": [ + {"name": "John SJ Anderson", "url": "https://bedford.io/team/john-sj-anderson/"}, + {"name": "the Nextstrain team", "url": "https://nextstrain.org/team"} + ], + "data_provenance": [ + { + "name": "GenBank", + "url": "https://www.ncbi.nlm.nih.gov/genbank/" + } + ], + "build_url": "https://github.com/nextstrain/yellow-fever", + "colorings": [ + { + "key": "gt", + "title": "Genotype", + "type": "categorical" + }, + { + "key": "clade", + "title": "Genotype (via Nextclade tree)", + "type": "categorical" + }, + { + "key": "date", + "title": "Date", + "type": "temporal" + }, + { + "key": "region", + "title": "Region", + "type": "categorical" + }, + { + "key": "country", + "title": "Country", + "type": "categorical" + }, + { + "key": "host", + "title": "Host", + "type": "categorical" + } + ], + "geo_resolutions": [ + "country", + "region" + ], + "display_defaults": { + "map_triplicate": true, + "color_by": "region" + }, + "filters": [ + "clade", + "region", + "country", + "author", + "host" + ], + "metadata_columns": [ + "author" + ] +} diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml index 47cb057..ac36297 100644 --- a/phylogenetic/defaults/config.yaml +++ b/phylogenetic/defaults/config.yaml @@ -1,18 +1,28 @@ strain_id_field: "accession" files: - auspice_config: "defaults/auspice_config.json" colors: "defaults/colors.tsv" description: "defaults/description.md" - exclude: "defaults/dropped_strains.txt" - genemap: "defaults/genemap.gff" - include_genome: "defaults/include_strains_genome.txt" - reference_gb: "defaults/reference.gb" - reference_fasta: "defaults/reference.fasta" + genome: + auspice_config: "defaults/auspice_config_genome.json" + exclude: "defaults/dropped_strains_genome.txt" + genemap: "defaults/genemap_genome.gff" + include: "defaults/include_strains_genome.txt" + reference: "defaults/reference_genome.fasta" + prM-E: + auspice_config: "defaults/auspice_config_prM-E.json" + exclude: "defaults/dropped_strains_prM-E.txt" + genemap: "defaults/genemap_prM-E.gff" + include: "defaults/include_strains_prM-E.txt" + reference: "defaults/reference_prM-E.fasta" filter: group_by: "country year" min_date: "1900-01-01" - min_length: 9000 - sequences_per_group: 20 + genome: + min_length: 9000 + sequences_per_group: 20 + prM-E: + min_length: 600 + sequences_per_group: 200 refine: coalescent: "opt" date_inference: "marginal" diff --git a/phylogenetic/defaults/dropped_strains.txt b/phylogenetic/defaults/dropped_strains_genome.txt similarity index 100% rename from phylogenetic/defaults/dropped_strains.txt rename to phylogenetic/defaults/dropped_strains_genome.txt diff --git a/phylogenetic/defaults/dropped_strains_prM-E.txt b/phylogenetic/defaults/dropped_strains_prM-E.txt new file mode 100644 index 0000000..e69de29 diff --git a/phylogenetic/defaults/genemap.gff b/phylogenetic/defaults/genemap_genome.gff similarity index 100% rename from phylogenetic/defaults/genemap.gff rename to phylogenetic/defaults/genemap_genome.gff diff --git a/phylogenetic/defaults/genemap_prM-E.gff b/phylogenetic/defaults/genemap_prM-E.gff new file mode 100644 index 0000000..60c279d --- /dev/null +++ b/phylogenetic/defaults/genemap_prM-E.gff @@ -0,0 +1,5 @@ +##sequence-region prM-E 1 672 +NC_002031.1 feature source 1 672 . + . gene=nuc +NC_002031.1 feature gene 1 333 . + . gene_name=prM +NC_002031.1 feature gene 109 333 . + . gene_name=M +NC_002031.1 feature gene 334 672 . + . gene_name=E diff --git a/phylogenetic/defaults/include_strains_prM-E.txt b/phylogenetic/defaults/include_strains_prM-E.txt new file mode 100644 index 0000000..e69de29 diff --git a/phylogenetic/defaults/reference.fasta b/phylogenetic/defaults/reference_genome.fasta similarity index 100% rename from phylogenetic/defaults/reference.fasta rename to phylogenetic/defaults/reference_genome.fasta diff --git a/phylogenetic/defaults/reference.gb b/phylogenetic/defaults/reference_genome.gb similarity index 100% rename from phylogenetic/defaults/reference.gb rename to phylogenetic/defaults/reference_genome.gb diff --git a/phylogenetic/defaults/reference_prM-E.fasta b/phylogenetic/defaults/reference_prM-E.fasta new file mode 100644 index 0000000..1b51ee3 --- /dev/null +++ b/phylogenetic/defaults/reference_prM-E.fasta @@ -0,0 +1,13 @@ +> prM-E region (genome 641-1312, 672 nt) +CCAAGAGAGGAGCCAGATGACATTGATTGCTGGTGCTATGGGGTGGAAAACGTTAGAGTC +GCATATGGTAAGTGTGACTCAGCAGGCAGGTCTAGGAGGTCAAGAAGGGCCATTGACTTG +CCTACGCATGAAAACCATGGTTTGAAGACCCGGCAAGAAAAATGGATGACTGGAAGAATG +GGTGAAAGGCAACTCCAAAAGATTGAGAGATGGCTCGTGAGGAACCCCTTTTTTGCAGTG +ACAGCTCTGACCATTGCCTACCTTGTGGGAAGCAACATGACGCAACGAGTCGTGATTGCC +CTACTGGTCTTGGCTGTTGGTCCGGCCTACTCAGCTCACTGCATTGGAATTACTGACAGG +GATTTCATTGAGGGGGTGCATGGAGGAACTTGGGTTTCAGCTACCCTGGAGCAAGACAAG +TGTGTCACTGTTATGGCCCCTGACAAGCCTTCATTGGACATCTCACTAGAGACAGTAGCC +ATTGATGGACCTGCTGAGGCGAGGAAAGTGTGTTACAATGCAGTTCTCACTCATGTGAAG +ATTAATGACAAGTGCCCCAGCACTGGAGAGGCCCACCTAGCTGAAGAGAACGAAGGGGAC +AATGCGTGCAAGCGCACTTATTCTGATAGAGGCTGGGGCAATGGCTGTGGCCTATTTGGG +AAAGGGAGCATT diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk index b278e5b..7a52612 100644 --- a/phylogenetic/rules/annotate_phylogeny.smk +++ b/phylogenetic/rules/annotate_phylogeny.smk @@ -7,7 +7,7 @@ rule ancestral: """Reconstructing ancestral sequences and mutations""" input: tree = "results/{gene}/tree.nwk", - alignment = "results/{gene}/aligned.fasta" + alignment = "results/{gene}/aligned_and_filtered.fasta" output: node_data = "results/{gene}/nt_muts.json" params: @@ -31,7 +31,7 @@ rule translate: input: tree = "results/{gene}/tree.nwk", node_data = "results/{gene}/nt_muts.json", - genemap = "defaults/genemap.gff" + genemap = "defaults/genemap_{gene}.gff" output: node_data = "results/{gene}/aa_muts.json" log: diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk index 858c902..6006d51 100644 --- a/phylogenetic/rules/construct_phylogeny.smk +++ b/phylogenetic/rules/construct_phylogeny.smk @@ -5,7 +5,7 @@ This part of the workflow constructs the phylogenetic tree. rule tree: """Building tree""" input: - alignment = "results/{gene}/aligned.fasta" + alignment = "results/{gene}/aligned_and_filtered.fasta" output: tree = "results/{gene}/tree_raw.nwk" log: @@ -29,7 +29,7 @@ rule refine: """ input: tree = "results/{gene}/tree_raw.nwk", - alignment = "results/{gene}/aligned.fasta", + alignment = "results/{gene}/aligned_and_filtered.fasta", metadata = "../ingest/results/metadata.tsv" output: tree = "results/{gene}/tree.nwk", @@ -54,7 +54,6 @@ rule refine: --output-tree {output.tree:q} \ --output-node-data {output.node_data:q} \ --coalescent {params.coalescent:q} \ - --timetree \ --date-confidence \ --date-inference {params.date_inference:q} \ --clock-filter-iqd {params.clock_filter_iqd:q} \ diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk index d36e476..9514f23 100644 --- a/phylogenetic/rules/export.smk +++ b/phylogenetic/rules/export.smk @@ -13,8 +13,8 @@ rule export: aa_muts = "results/{gene}/aa_muts.json", traits = "results/{gene}/traits.json", colors = config["files"]["colors"], - auspice_config = lambda wildcard: "defaults/auspice_config.json" if wildcard.gene in ["genome"] else "defaults/auspice_config_N450.json", - description=config["files"]["description"] + auspice_config = lambda w: config["files"][w.gene]["auspice_config"], + description=config["files"]["description"], output: auspice_json = "auspice/yellow-fever-virus_{gene}.json" params: diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk index b4c296b..3d765a5 100644 --- a/phylogenetic/rules/prepare_sequences.smk +++ b/phylogenetic/rules/prepare_sequences.smk @@ -3,33 +3,27 @@ This part of the workflow prepares sequences for constructing the phylogenetic tree. """ -rule filter: - message: """ - Filtering to - - {params.sequences_per_group} sequence(s) per {params.group_by!s} - - excluding strains in {input.exclude} - - minimum genome length of {params.min_length} - """ +rule filter_genome: input: - exclude = config["files"]["exclude"], - include = config["files"]["include_genome"], + exclude = config["files"]["genome"]["exclude"], + include = config["files"]["genome"]["include"], # TODO once this repo is fully automated and uploading data to # S3, this step should download data from there instead of # depending on the ingest build metadata = "../ingest/results/metadata.tsv", sequences = "../ingest/results/sequences.fasta" output: - sequences = "results/{gene}/filtered.fasta" + sequences = "results/genome/filtered.fasta" params: group_by = config["filter"]["group_by"], min_date = config["filter"]["min_date"], - min_length = config["filter"]["min_length"], - sequences_per_group = config["filter"]["sequences_per_group"], + min_length = config["filter"]["genome"]["min_length"], + sequences_per_group = config["filter"]["genome"]["sequences_per_group"], strain_id = config["strain_id_field"] log: - "logs/{gene}/filter.txt", + "logs/genome/filter_genome.txt", benchmark: - "benchmarks/{gene}/filter.txt" + "benchmarks/genome/filter_genome.txt" shell: r""" augur filter \ @@ -46,17 +40,16 @@ rule filter: 2> {log:q} """ -rule align: +rule align_genome: input: sequences="results/genome/filtered.fasta", - reference=config["files"]["reference_gb"], - genemap=config["files"]["genemap"], + reference=config["files"]["genome"]["reference"], output: - alignment="results/{gene}/aligned.fasta", + alignment="results/genome/aligned_and_filtered.fasta", log: - "logs/{gene}/align.txt", + "logs/genome/align_genome.txt", benchmark: - "benchmarks/{gene}/align.txt" + "benchmarks/genome/align_genome.txt" shell: r""" augur align \ @@ -67,3 +60,72 @@ rule align: --remove-reference \ 2> {log:q} """ + + +rule align_and_extract_prME: + input: + reference=config["files"]["prM-E"]["reference"], + # TODO once this repo is fully automated and uploading data to + # S3, this step should download data from there instead of + # depending on the ingest build + sequences = "../ingest/results/sequences.fasta", + output: + alignment = "results/prM-E/aligned.fasta" + params: + group_by = config["filter"]["group_by"], + min_date = config["filter"]["min_date"], + min_length = config["filter"]["prM-E"]["min_length"], + sequences_per_group = config["filter"]["prM-E"]["sequences_per_group"], + strain_id = config["strain_id_field"] + log: + "logs/genome/filter_and_extract_prM-E.txt", + benchmark: + "benchmarks/genome/filter_and_extract_prM-E.txt" + shell: + r""" + augur align \ + --sequences {input.sequences} \ + --reference-sequence {input.reference} \ + --output {output.alignment} \ + --fill-gaps \ + --remove-reference \ + 2> {log:q} + """ + + +rule filter_prME: + input: + exclude = config["files"]["prM-E"]["exclude"], + include = config["files"]["prM-E"]["include"], + # TODO once this repo is fully automated and uploading data to + # S3, this step should download data from there instead of + # depending on the ingest build + metadata = "../ingest/results/metadata.tsv", + sequences = "results/prM-E/aligned.fasta" + output: + sequences = "results/prM-E/aligned_and_filtered.fasta" + params: + group_by = config["filter"]["group_by"], + min_date = config["filter"]["min_date"], + min_length = config["filter"]["prM-E"]["min_length"], + sequences_per_group = config["filter"]["prM-E"]["sequences_per_group"], + strain_id = config["strain_id_field"] + log: + "logs/genome/filter_prM-E.txt", + benchmark: + "benchmarks/genome/filter_prM-E.txt" + shell: + r""" + augur filter \ + --sequences {input.sequences:q} \ + --metadata {input.metadata:q} \ + --metadata-id-columns {params.strain_id:q} \ + --exclude {input.exclude:q} \ + --include {input.include:q} \ + --output {output.sequences:q} \ + --group-by {params.group_by} \ + --sequences-per-group {params.sequences_per_group:q} \ + --min-date {params.min_date:q} \ + --min-length {params.min_length:q} \ + 2> {log:q} + """