Skip to content

Commit

Permalink
[h5n1-cattle-outbreak] inferred vs known metadata
Browse files Browse the repository at this point in the history
The limited metadata available for this outbreak means we infer division
for many tips, so being able to show known vs inferred metadata can be
clarifying for many users.

As we add more customisations like this the snakemake pipeline becomes
more and more complex. Allowing auspice-config overlays / merging would
solve half of the complexity introduced here. If we find ourselves
commonly duplicating metadata columns we can add a config-parameterised
rule for this.

Note that the colours are generated within Auspice and so differ between
the two representations of division (inferred vs metadata). They also
differ between genome & segment builds. A nice improvement would be
making these consistent over all h5n1-cattle-outbreak datasets.

Context: <https://bedfordlab.slack.com/archives/CD84ELG0N/p1730227555767939>
  • Loading branch information
jameshadfield committed Oct 30, 2024
1 parent 0b5dcae commit 6583482
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 5 deletions.
24 changes: 19 additions & 5 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,14 @@ def metadata_by_wildcards(wildcards):
# H5 builds have extra clade-level metadata added to the metadata TSV.
# We may move this to a node-data JSON which would simplify the snakemake logic
# a bit -- see <https://github.com/nextstrain/avian-flu/issues/25>
if wildcards.subtype in ("h5n1", "h5nx", "h5n1-cattle-outbreak"):
if wildcards.subtype in ("h5n1", "h5nx"):
return "results/{subtype}/metadata-with-clade.tsv"
# cattle-flu.smk will make its own modifications as needed
elif wildcards.subtype=="h5n1-cattle-outbreak":
if wildcards.segment=="genome":
return "results/{subtype}/{segment}/default/metadata-with-clade-and-non-inferred-values.tsv"
else:
return "results/{subtype}/metadata-with-clade.tsv"
else:
return "results/{subtype}/metadata.tsv",

Expand Down Expand Up @@ -530,14 +536,22 @@ rule auspice_config:
run:
import json
with open(input.auspice_config) as fh:
config = json.load(fh)
auspice_config = json.load(fh)
if wildcards.subtype == "h5n1-cattle-outbreak":
if wildcards.segment == "genome":
config['display_defaults']['distance_measure'] = "num_date"
auspice_config['display_defaults']['distance_measure'] = "num_date"
division_idx = next((i for i,c in enumerate(auspice_config['colorings']) if c['key']=='division'), None)
assert division_idx!=None, "Auspice config did not have a division coloring!"
auspice_config['colorings'].insert(division_idx+1, {
"key": "division_metadata",
"title": auspice_config['colorings'][division_idx]["title"] + " (metadata)",
"type": "categorical",
})
auspice_config['colorings'][division_idx]["title"] += " (inferred)"
else:
config['display_defaults']['distance_measure'] = "div"
auspice_config['display_defaults']['distance_measure'] = "div"
with open(output.auspice_config, 'w') as fh:
json.dump(config, fh, indent=2)
json.dump(auspice_config, fh, indent=2)


rule export:
Expand Down
34 changes: 34 additions & 0 deletions rules/cattle-flu.smk
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,37 @@ rule prune_tree:
--output-tree {output.tree} \
--output-metadata {output.node_data}
"""

def assert_expected_config(w):
    """
    Check that config['traits']['genome_columns'] still has the shape which
    rule `add_metadata_columns_to_show_non_inferred_values` relies on: exactly
    one entry, keyed 'FALLBACK', whose value is "division".

    Intended to be used as a rule `params` function, so `w` is the wildcards
    object handed over by Snakemake; it is not inspected here.

    Returns None when the config is as expected.
    Raises Exception (with a message naming the dependent rule) otherwise.
    """
    message = "Rule add_metadata_columns_to_show_non_inferred_values expected a certain format for config['traits'] that has since changed"
    try:
        # TODO: once we refactor things we should use `get_config()` here
        # see <https://github.com/nextstrain/avian-flu/pull/100#discussion_r1823047047>
        # but currently this snakefile doesn't have access to that function.
        genome_columns = config['traits']['genome_columns']
        as_expected = len(genome_columns) == 1 and genome_columns['FALLBACK'] == "division"
    except (LookupError, TypeError) as err:
        # Missing keys or an unexpected container type both mean the config
        # layout has changed; surface the same descriptive error for each.
        raise Exception(message) from err
    # Explicit check rather than `assert`, which is stripped under `python -O`
    # and would let an unexpected config pass silently.
    if not as_expected:
        raise Exception(message)

rule add_metadata_columns_to_show_non_inferred_values:
    """
    Genome builds run `augur traits` for "division" (we assert this below, via the
    `assert_traits` param) so we want to add a metadata column `division_metadata`
    which is a duplicate of `division`. This lets the known (metadata) values be
    shown alongside the inferred ones.
    NOTE: long-term we should be consulting `traits_params()` to work out the columns to duplicate, but
    that function's not visible to this .smk file so would require deeper refactoring.
    """
    input:
        metadata = "results/{subtype}/metadata-with-clade.tsv",
    output:
        metadata = "results/{subtype}/{segment}/{time}/metadata-with-clade-and-non-inferred-values.tsv",
    # Only the cattle-outbreak genome build at the default time range gets the extra column.
    wildcard_constraints:
        subtype="h5n1-cattle-outbreak",
        segment="genome",
        time="default",
    params:
        old_column = "division",
        new_column = "division_metadata",
        # Evaluated for its side effect only: raises if config['traits'] no longer
        # matches the hard-coded column names above.
        assert_traits = assert_expected_config,
    shell:
        # csvtk reads the input file directly; no need to pipe it through `cat`.
        """
        csvtk mutate -t -f {params.old_column} -n {params.new_column} {input.metadata} > {output.metadata}
        """

0 comments on commit 6583482

Please sign in to comment.