From a0acd2dfde2a9b114a3136095388843b4aac4254 Mon Sep 17 00:00:00 2001
From: james hadfield
Date: Wed, 20 Nov 2024 20:21:24 +1300
Subject: [PATCH 01/11] WIP Change config syntax

Changes the syntax for a rule parameter to either be a scalar or a dictionary where the keys represent the wildcards of the associated build, allowing '*' to represent a catch-all for a wildcard.

This allows significant simplifications throughout the workflow and config YAMLs, largely because this format lets us express parameters for any wildcard combination without the need to maintain different structures for genome-specific configs, or different structures for parameters which vary per segment vs those which don't.

The subsequent commits will make more involved changes to the snakemake pipeline facilitated by this change; this commit represents the minimal set of changes to keep functionality.
---
 Snakefile                        | 151 ++++++++++++++++-------
 gisaid/config.yaml               | 197 +++++++++++++------------------
 h5n1-cattle-outbreak/config.yaml | 116 +++++++++---------
 rules/cattle-flu.smk             |  12 +-
 4 files changed, 249 insertions(+), 227 deletions(-)

diff --git a/Snakefile b/Snakefile
index be81d46..1361f32 100755
--- a/Snakefile
+++ b/Snakefile
@@ -40,10 +40,9 @@ def resolve_config_path(original_path, wildcards=None):
 
     if re.search(r'\{.+\}', path):
         if wildcards:
-            print(f"The call to `resolve_config_path({original_path!r}, wildcards)` still includes wildcard-like placeholders afrer resolving: {path!r}.", file=sys.stderr)
+            raise InvalidConfigError(f"The call to `resolve_config_path({original_path!r}, wildcards)` still includes wildcard-like placeholders after resolving: {path!r}.")
         else:
-            print(f"The call to `resolve_config_path({original_path!r})` includes unresolved wildcards - please include the wildcards as the second argument to `resolve_config_path`.", file=sys.stderr)
-            exit(2)
+            raise InvalidConfigError(f"The call to `resolve_config_path({original_path!r})` includes unresolved wildcards - please include the wildcards as the second argument to `resolve_config_path`.")
 
     if os.path.exists(path): # isfile?
         return path
@@ -70,6 +69,63 @@ def resolve_config_path(original_path, wildcards=None):
             f"\t2. {workflow.basedir} (where the entry snakefile is)\n"
             f"\t3. {CURRENT_BASEDIR} (where the main avian-flu snakefile is)\n")
 
+def resolve_config_value(rule_parts, wildcards, sep="/"):
+    """
+    Resolve a config value defined by the *rule_parts of the config,
+    e.g. rule_parts = ['filter', 'min_length'] then we expect a scalar or
+    a dictionary to be present at config['filter']['min_length'].
+
+    If a scalar then that value is returned, i.e. it's always the same no matter
+    what the wildcards are.
+
+    If a dictionary we search it for the relevant value by finding the closest matching
+    key once wildcards have been considered. For instance in a three-tiered wildcard pipeline
+    such as this with subtype, segment and time we interpret these as ordered in specificity.
+    For instance, if only one wildcard value is specified in the config (the others are '*')
+    then matching subtype is more specific than segment. Given example
+    wildcard values of {subtype=h5nx, segment=pb2, time=2y} then we have a search order of:
+    - 'h5nx/pb2/2y' ─ all 3 wildcard values specified
+    - 'h5nx/pb2/*'  ┐
+    - 'h5nx/*/2y'   ├ 2/3 wildcard values specified
+    - '*/pb2/2y'    ┘
+    - 'h5nx/*/*'    ┐
+    - '*/pb2/*'     ├ 1/3 wildcard values specified
+    - '*/*/2y'      ┘
+    - '*/*/*'       ─ default / fall-back
+    and the first key present in the config is used.
+ """ + try: + config_lookup = config + for i,rule_key in enumerate(rule_parts): # or use functools.reduce etc + config_lookup = config_lookup[rule_key] + except KeyError: + raise InvalidConfigError('Config missing entire entry for config'+''.join(['["'+rule_parts[j]+'""]' for j in range(0,i+1)])) + + if any([isinstance(config_lookup, t) for t in [float, int, bool, str]]): + return config_lookup + + if not isinstance(config_lookup, dict): + raise InvalidConfigError(f"ERROR: config under {'.'.join(rule_parts)} must be a scalar value or a dictionary") + + wild_keys = ['subtype', 'segment', 'time'] # workflow specific + search_keys = [ # workflow independent, as long as there are 3 or fewer wildcard categories + sep.join([wildcards[k] for k in wild_keys]), + *([sep.join(['*' if i==k else wildcards[key] for k,key in enumerate(wild_keys)]) + for i in range(len(wild_keys)-1, -1, -1)] if len(wild_keys)>=2 else []), + *([sep.join(['*' if i!=k else wildcards[key] for k,key in enumerate(wild_keys)]) + for i in range(0, len(wild_keys))] if len(wild_keys)==3 else []), + sep.join(['*']*len(wild_keys)) + ] + + for key in search_keys: + if key in config_lookup: + return config_lookup[key] + msg = 'Config structure incorrect or incomplete for config'+''.join(['["'+rule_parts[j]+'"]' for j in range(0,i+1)]) + msg += f'\n\tThe dictionary is missing a matching key for the current target of {search_keys[0]!r}, or a fuzzy match (i.e. using "*" placeholders)' + msg += '\n\tP.S. If you want to use a single value across all builds then set a scalar value (number, string, boolean)' + raise InvalidConfigError(msg) + + # The config option `same_strains_per_segment=True'` (e.g. supplied to snakemake via --config command line argument) # will change the behaviour of the workflow to use the same strains for each segment. This is achieved via these steps: @@ -149,6 +205,7 @@ files = rules.files.params def subtypes_by_subtype_wildcard(wildcards): + # TODO - shift this to config db = { 'h5nx': ['h5n1', 'h5n2', 'h5n3', 'h5n4', 'h5n5', 'h5n6', 'h5n7', 'h5n8', 'h5n9'], 'h5n1': ['h5n1'], @@ -239,44 +296,46 @@ def metadata_by_wildcards(wildcards): else: return "results/{subtype}/metadata.tsv", - -def get_config(rule_name, rule_key, wildcards, segment=None, fallback="FALLBACK"): - assert rule_name in config, f"Config missing top-level {rule_name} key" - assert rule_key in config[rule_name], f"Config missing entry for {rule_name}.{rule_key}" - try: - return config[rule_name][rule_key][wildcards.subtype][wildcards.time] - except KeyError: - assert fallback in config[rule_name][rule_key], f"config.{rule_name!r}.{rule_key!r} either needs " \ - f"an entry for {wildcards.subtype!r}.{wildcards.time!r} added or a (default) {fallback!r} key." - return config[rule_name][rule_key][fallback] - def refine_clock_rates(w): - info = get_config('refine', 'clock_rates', w) if w.segment == 'genome': + # TODO XXX - can this be part of cattle-flu.smk? # calculate the genome rate via a weighted average of the segment rates - assert 'genome' not in info, ("This snakemake pipeline is currently set up to calculate the genome clock rate " - "based on the segment rates, however you have provided a genome rate in the config.") + + # Sanity check: check that the config hasn't (mistakenly?) 
set a genome rate
         try:
-            segment_lengths= get_config('refine', 'segment_lengths', w)
-        except AssertionError as e:
-            # py11 brings e.add_note() which is nicer
-            e.args = (*e.args, "NOTE: For segment=genome we require the segment_lengths to be defined as we use them to calculate the clock rate")
-            raise
-        mean = sum([info[seg][0]*length for seg,length in segment_lengths.items()])/sum(segment_lengths.values())
+            info = resolve_config_value(['refine', 'clock_rates'], w)
+        except InvalidConfigError as e:
+            pass # this is behaviour we expect - i.e. no clock rates should be provided for genome
+        else:
+            raise InvalidConfigError("config['refine']['clock_rates'] should not contain a key which can be used for 'genome' builds as this is calculated by the workflow")
+
+        segment_lengths = [
+            resolve_config_value(['refine', 'segment_lengths'], wildcards)
+            for wildcards in
+            [{"subtype": w.subtype, "time": w.time, "segment": segment} for segment in SEGMENTS]
+        ]
+        clock_rates = [
+            resolve_config_value(['refine', 'clock_rates'], wildcards)
+            for wildcards in
+            [{"subtype": w.subtype, "time": w.time, "segment": segment} for segment in SEGMENTS]
+        ]
+        for rates in clock_rates:
+            assert isinstance(rates, list) and len(rates)==2, f"The clock rates for each segment must be a list of (rate, std-dev), not {rates!r}"
+
+        mean = sum([length*rate[0] for length,rate in zip(segment_lengths, clock_rates)]) / sum(segment_lengths)
         stdev = mean/2
         return f"--clock-rate {mean} --clock-std-dev {stdev}"
 
-    assert w.segment in info, 'config.refine.clock_rates: data must be provided for each segment. Use "" for inference.'
-    if info[w.segment] == "":
+    info = resolve_config_value(['refine', 'clock_rates'], w)
+    if info == "":
         return ""
-    assert isinstance(info[w.segment], list), "The clock rates for {w.subtype!r} {w.time!r} {w.segment!r} must be a list of (rate, std-dev)"
-    assert len(info[w.segment])==2, "The clock rates for {w.subtype!r} {w.time!r} {w.segment!r} must be a list of (rate, std-dev)"
-    return f"--clock-rate {info[w.segment][0]} --clock-std-dev {info[w.segment][1]}"
+    assert isinstance(info, list) and len(info)==2, f"The clock rates for {w.subtype!r}/{w.time!r}/{w.segment!r} must be a list of (rate, std-dev), not {info!r}"
+    return f"--clock-rate {info[0]} --clock-std-dev {info[1]}"
 
 def refine_clock_filter(w):
-    filter = get_config('refine', 'clock_filter_iqd', w)
+    filter = resolve_config_value(['refine', 'clock_filter_iqd'], w)
     return f"--clock-filter-iqd {filter}" if filter else ""
 
 
@@ -319,7 +378,7 @@ def _filter_params(wildcards, input, output, threads, resources):
             raise Exception("A strains input should only be present for SAME_STRAINS + HA!")
         return f"--exclude-all --include {input.strains} {input.include}"
 
-    exclude_where = get_config('filter', 'exclude_where', wildcards)
+    exclude_where = resolve_config_value(['filter', 'exclude_where'], wildcards)
     # If SAME_STRAINS (and due to the above conditional we have the HA segment at this point)
     # then we want to restrict to strains present in all 8 segments. Note that force-included
     # strains may not have all segments, but that's preferable to filtering them out.
@@ -329,14 +388,14 @@ def _filter_params(wildcards, input, output, threads, resources): cmd = "" - group_by_value = get_config('filter', 'group_by', wildcards) + group_by_value = resolve_config_value(['filter', 'group_by'], wildcards) cmd += f" --group-by {group_by_value}" if group_by_value else "" cmd += f" --subsample-max-sequences {config['target_sequences_per_tree']}" - cmd += f" --min-date {get_config('filter', 'min_date', wildcards)}" + cmd += f" --min-date {resolve_config_value(['filter', 'min_date'], wildcards)}" cmd += f" --include {input.include}" cmd += f" --exclude-where {exclude_where}" - cmd += f" --min-length {get_config('filter', 'min_length', wildcards)[wildcards.segment]}" + cmd += f" --min-length {resolve_config_value(['filter', 'min_length'], wildcards)}" cmd += f" --non-nucleotide" return cmd @@ -417,9 +476,10 @@ rule tree: def refine_root(wildcards): - root = get_config('refine', 'genome_root', wildcards) \ + # TODO XXX - can we simplify this? + root = resolve_config_value(['refine', 'genome_root'], wildcards) \ if wildcards.segment=='genome' \ - else get_config('refine', 'root', wildcards) + else resolve_config_value(['refine', 'root'], wildcards) return f"--root {root}" if root else "" rule refine: @@ -472,9 +532,9 @@ def refined_tree(w): return "results/{subtype}/{segment}/{time}/tree.nwk", def ancestral_root_seq(wildcards): - root_seq = get_config('ancestral', 'genome_root_seq', wildcards) \ + root_seq = resolve_config_value(['ancestral', 'genome_root_seq'], wildcards) \ if wildcards.segment=='genome' \ - else get_config('ancestral', 'root_seq', wildcards) + else resolve_config_value(['ancestral', 'root_seq'], wildcards) if not root_seq: return "" return f"--root-sequence {resolve_config_path(root_seq, wildcards)}" @@ -518,15 +578,15 @@ rule translate: """ def traits_params(wildcards): - columns = get_config('traits', 'genome_columns', wildcards) \ + columns = resolve_config_value(['traits', 'genome_columns'], wildcards) \ if wildcards.segment=='genome' \ - else get_config('traits', 'columns', wildcards) + else resolve_config_value(['traits', 'columns'], wildcards) - bias = get_config('traits', 'genome_sampling_bias_correction', wildcards) \ + bias = resolve_config_value(['traits', 'genome_sampling_bias_correction'], wildcards) \ if wildcards.segment=='genome' \ - else get_config('traits', 'sampling_bias_correction', wildcards) + else resolve_config_value(['traits', 'sampling_bias_correction'], wildcards) - confidence = get_config('traits', 'confidence', wildcards) + confidence = resolve_config_value(['traits', 'confidence'], wildcards) args = f"--columns {columns}" if bias: @@ -592,9 +652,9 @@ def export_node_data_files(wildcards): def additional_export_config(wildcards): args = "" - title_overrides = get_config('export', 'genome_title', wildcards) \ + title_overrides = resolve_config_value(['export', 'genome_title'], wildcards) \ if wildcards.segment=='genome' \ - else get_config('export', 'title', wildcards) + else resolve_config_value(['export', 'title'], wildcards) if title_overrides: args += ("--title '" + title_overrides.format(segment=wildcards.segment.upper(), subtype=wildcards.subtype.upper(), time=wildcards.time) + @@ -721,6 +781,7 @@ rule clean: shell: "rm -rfv {params}" - +# Add in additional rules at/towards the end of the Snakefile so that +# they have access to functions and values declared above for rule_file in config.get('custom_rules', []): include: rule_file diff --git a/gisaid/config.yaml b/gisaid/config.yaml index 284d36e..bf2acc9 100644 --- 
a/gisaid/config.yaml +++ b/gisaid/config.yaml @@ -1,25 +1,33 @@ -#### Parameters which define which builds to produce via this config ### +#### Which builds to produce via this config? ### +# The GISAID builds are a combination of three parts: subtype (e.g. h5n1), +# time (e.g. all-time) and segment (e.g. ha) builds: - h5nx: - - all-time - - 2y - h5n1: - - all-time - - 2y - h7n9: - - all-time - h9n2: - - all-time - -segments: - - pb2 - - pb1 - - pa - - ha - - np - - na - - mp - - ns + - subtype: + - h5nx + - h5n1 + segment: &segments + - pb2 + - pb1 + - pa + - ha + - np + - na + - mp + - ns + time: + - 2y + - all-time + - subtype: + - h7n9 + - h9n2 + segment: *segments + time: + - all-time + +# The default target for this build is the Auspice JSON, however config overlays may wish to change this +# (e.g. to target "intermediate" files instead) +target_patterns: + - "auspice/avian-flu_{subtype}_{segment}_{time}.json" #### Parameters which define the input source #### s3_src: @@ -47,119 +55,82 @@ description: config/description_gisaid.md #### Rule-specific parameters #### -filter: - # rule parameters are typically defined for each build - e.g. - # min_length.h5nx.2y = ... The "FALLBACK" key (e.g. min_length.FALLBACK) - # is used as a fallback value if nothing is specifically defined - # for the subtype/time combination. - # Some parameters have an extra hierarchy of segment, some don't +# The formatting here represents the three-tiered nature of the avian-flu build which +# comprises subtype / segment / time-resolution +# You can use '*' as a catch-all for any of these tiers. +# There's one exception: If a config value is constant for any and all builds then you +# can just use a scalar value (number, string, boolean) +filter: min_length: - FALLBACK: - pb2: 2100 - pb1: 2100 - pa: 2000 - ha: 1600 - np: 1400 - na: 1270 - mp: 900 - ns: 800 + "*/pb2/*": 2100 + "*/pb1/*": 2100 + "*/pa/*": 2000 + "*/ha/*": 1600 + "*/np/*": 1400 + "*/na/*": 1270 + "*/mp/*": 900 + "*/ns/*": 800 min_date: - h5nx: - all-time: 1996 - 2y: 2Y - h5n1: - all-time: 1996 - 2y: 2Y - h7n9: - all-time: 2013 - h9n2: - all-time: 1966 + "*/*/2y": 2Y + "h5nx/*/all-time": 1996 + "h5n1/*/all-time": 1996 + "h7n9/*/all-time": 2013 + "h9n2/*/all-time": 1966 # TODO - go back and check we didn't make a typo from 1996 to 1966 group_by: - h5nx: - all-time: subtype country year - 2y: subtype region month host - h5n1: - all-time: region country year - 2y: subtype region month host - h7n9: - all-time: division year - h9n2: - all-time: country year + "*/*/2y": subtype region month host + "h5nx/*/all-time": subtype country year + "h5n1/*/all-time": region country year + "h7n9/*/all-time": division year + "h9n2/*/all-time": country year exclude_where: - FALLBACK: host=laboratoryderived host=ferret host=unknown host=other host=host country=? region=? gisaid_clade=3C.2 + host=laboratoryderived host=ferret host=unknown host=other host=host country=? region=? gisaid_clade=3C.2 refine: coalescent: const date_inference: marginal - clock_filter_iqd: - FALLBACK: 4 + clock_filter_iqd: 4 - root: - FALLBACK: false - - __clock_std_dev: &clock_std_dev 0.00211 # YAML anchor so we can reference this value + root: false clock_rates: - FALLBACK: # anything not specified by a subtype/time combination - pb2: '' # falls back to FALLBACK, and the empty string means no - pb1: '' # supplied clock rate, i.e. 
infer the clock - pa: '' - ha: '' - np: '' - na: '' - mp: '' - ns: '' - h5nx: - 2y: - pb2: [0.00287, *clock_std_dev] - pb1: [0.00267, *clock_std_dev] - pa: [0.00238, *clock_std_dev] - ha: [0.0048, *clock_std_dev] - np: [0.0022, *clock_std_dev] - na: [0.0028, *clock_std_dev] - mp: [0.0017, *clock_std_dev] - ns: [0.0017, *clock_std_dev] - h5n1: - 2y: - pb2: [0.00287, *clock_std_dev] - pb1: [0.00264, *clock_std_dev] - pa: [0.00248, *clock_std_dev] - ha: [0.00455, *clock_std_dev] - np: [0.00252, *clock_std_dev] - na: [0.00349, *clock_std_dev] - mp: [0.00191, *clock_std_dev] - ns: [0.00249, *clock_std_dev] - + # NOTE: an empty value means no supplied clock rate, i.e. infer the clock + # (This is the fallback value - */*/* - to be used if we don't provide a more specific value below) + "*/*/*": '' + # otherwise supply a list of [rate, std_dev] + "h5nx/pb2/2y": [0.00287, &clock_std_dev 0.00211] + "h5nx/pb1/2y": [0.00267, *clock_std_dev] + "h5nx/pa/2y": [0.00238, *clock_std_dev] + "h5nx/ha/2y": [0.0048, *clock_std_dev] + "h5nx/np/2y": [0.0022, *clock_std_dev] + "h5nx/na/2y": [0.0028, *clock_std_dev] + "h5nx/mp/2y": [0.0017, *clock_std_dev] + "h5nx/ns/2y": [0.0017, *clock_std_dev] + "h5n1/pb2/2y": [0.00287, *clock_std_dev] + "h5n1/pb1/2y": [0.00264, *clock_std_dev] + "h5n1/pa/2y": [0.00248, *clock_std_dev] + "h5n1/ha/2y": [0.00455, *clock_std_dev] + "h5n1/np/2y": [0.00252, *clock_std_dev] + "h5n1/na/2y": [0.00349, *clock_std_dev] + "h5n1/mp/2y": [0.00191, *clock_std_dev] + "h5n1/ns/2y": [0.00249, *clock_std_dev] ancestral: inference: joint - root_seq: - FALLBACK: false + root_seq: false traits: columns: - h5nx: - all-time: region - 2y: region - h5n1: - all-time: region country - 2y: region country - h7n9: - all-time: country division - h9n2: - all-time: region country - - sampling_bias_correction: - FALLBACK: false - - confidence: - FALLBACK: true + "h5nx/*/*": region + "h5n1/*/*": region country + "h7n9/*/*": country division + "h9n2/*/*": region country + sampling_bias_correction: false + confidence: false export: - title: - FALLBACK: false # use the title in the auspice JSON + title: false # use the title in the auspice JSON # TODO - if we parameterise this here we can use the subtype wildcard to customise the title \ No newline at end of file diff --git a/h5n1-cattle-outbreak/config.yaml b/h5n1-cattle-outbreak/config.yaml index 363c09d..a7fbbac 100644 --- a/h5n1-cattle-outbreak/config.yaml +++ b/h5n1-cattle-outbreak/config.yaml @@ -3,6 +3,7 @@ custom_rules: - "rules/cattle-flu.smk" +# TODO XXX - explain how/why time=default, even I forget... 
#### Parameters which define which builds to produce via this config ###
 builds:
@@ -52,88 +53,77 @@ description: config/{subtype}/description_{subtype}.md
 #### Rule-specific parameters ####
 filter:
   min_length:
-    FALLBACK:
-      pb2: 2100
-      pb1: 2100
-      pa: 2000
-      ha: 1600
-      np: 1400
-      na: 1270
-      mp: 900
-      ns: 800
-
-  min_date:
-    FALLBACK: 2024
-
-  group_by:
-    FALLBACK: false # no grouping during filter
+    "*/pb2/*": 2100 # Note: could use "h5n1-cattle-outbreak/pb2/default: 2100" if desired
+    "*/pb1/*": 2100
+    "*/pa/*": 2000
+    "*/ha/*": 1600
+    "*/np/*": 1400
+    "*/na/*": 1270
+    "*/mp/*": 900
+    "*/ns/*": 800
+
+  min_date: 2024
+
+  group_by: false # no grouping during filter
 
   exclude_where:
-    FALLBACK: host=laboratoryderived host=ferret host=unknown host=other host=host gisaid_clade=3C.2
+    host=laboratoryderived host=ferret host=unknown host=other host=host gisaid_clade=3C.2
 
 refine:
   coalescent: const
   date_inference: marginal
 
-  clock_filter_iqd:
-    FALLBACK: false
+  clock_filter_iqd: false
 
-  root:
-    FALLBACK: false
+  root: false
 
   # For the genome only we use the closest outgroup as the root
   # P.S. Make sure this strain is force included via augur filter --include
   # (This isn't needed for the segment builds as we include a large enough time span to root via the clock)
-  genome_root:
-    FALLBACK: A/skunk/NewMexico/24-006483-001/2024
+  genome_root: A/skunk/NewMexico/24-006483-001/2024
 
   segment_lengths:
-    FALLBACK:
-      {'pb2': 2341, 'pb1': 2341, 'pa': 2233, 'ha': 1760, 'np': 1565, 'na': 1458, 'mp': 1027, 'ns': 865}
-
-  __clock_std_dev: &clock_std_dev 0.00211 # YAML anchor so we can reference this value below
+    "*/pb2/*": 2341
+    "*/pb1/*": 2341
+    "*/pa/*": 2233
+    "*/ha/*": 1760
+    "*/np/*": 1565
+    "*/na/*": 1458
+    "*/mp/*": 1027
+    "*/ns/*": 865
 
   clock_rates:
-    FALLBACK:
-      # The rates for the 8 segments are taken from the GISAID H5N1/2y config
-      pb2: [0.00287, *clock_std_dev]
-      pb1: [0.00264, *clock_std_dev]
-      pa: [0.00248, *clock_std_dev]
-      ha: [0.00455, *clock_std_dev]
-      np: [0.00252, *clock_std_dev]
-      na: [0.00349, *clock_std_dev]
-      mp: [0.00191, *clock_std_dev]
-      ns: [0.00249, *clock_std_dev]
-      # the genome clock rate is calculated by a function in the snakemake pipeline
-      # using the segment rates weighted by their lengths
+    # The rates for the 8 segments are taken from the GISAID H5N1/2y config
+    "*/pb2/*": [0.00287, &clock_std_dev 0.00211]
+    "*/pb1/*": [0.00264, *clock_std_dev]
+    "*/pa/*": [0.00248, *clock_std_dev]
+    "*/ha/*": [0.00455, *clock_std_dev]
+    "*/np/*": [0.00252, *clock_std_dev]
+    "*/na/*": [0.00349, *clock_std_dev]
+    "*/mp/*": [0.00191, *clock_std_dev]
+    "*/ns/*": [0.00249, *clock_std_dev]
+    # NOTE:
+    # the genome clock rate is calculated by a function in the snakemake pipeline
+    # using the segment rates weighted by their lengths
 
 ancestral:
   inference: joint
-  root_seq:
-    FALLBACK: false
+  root_seq: false
-  genome_root_seq:
-    FALLBACK: config/h5n1-cattle-outbreak/h5_cattle_genome_root.gb
+  genome_root_seq: config/h5n1-cattle-outbreak/h5_cattle_genome_root.gb
 
 traits:
-  # genome build has different parameters...
- genome_columns: - FALLBACK: division - genome_sampling_bias_correction: - FALLBACK: 5 - - # segment builds: - columns: - FALLBACK: region country # same as GISAID H5N1 builds - sampling_bias_correction: - FALLBACK: false - - # all builds - confidence: - FALLBACK: true - -export: - genome_title: - FALLBACK: Full genome analysis of the ongoing influenza A/H5N1 cattle outbreak in North America - title: - FALLBACK: Ongoing influenza A/H5N1 cattle outbreak in North America ({segment} segment) + # genome build has different parameters... - # TODO XXX + genome_columns: division + genome_sampling_bias_correction: 5 + + # segment builds - # TODO XXX + columns: region country # same as GISAID H5N1 builds + sampling_bias_correction: false + + # all builds - # TODO XXX + confidence: true + +export: # TODO XXX + genome_title: Full genome analysis of the ongoing influenza A/H5N1 cattle outbreak in North America + title: Ongoing influenza A/H5N1 cattle outbreak in North America ({segment} segment) diff --git a/rules/cattle-flu.smk b/rules/cattle-flu.smk index 2bc29d6..5abf112 100644 --- a/rules/cattle-flu.smk +++ b/rules/cattle-flu.smk @@ -105,7 +105,7 @@ def assert_expected_config(w): # TODO: once we refactor things we should use `get_config()` here # see # but currently this snakefile doesn't have access to that function. - assert len(config['traits']['genome_columns'])==1 and config['traits']['genome_columns']['FALLBACK']=="division" + assert config['traits']['genome_columns']=='division' except Exception as err: raise Exception("Rule add_metadata_columns_to_show_non_inferred_values expected a certain format for config['traits'] that has since changed") from err @@ -160,12 +160,12 @@ rule prune_tree: rule colors_genome: # TODO: add these input files / params to the config YAML. The config YAML must also # define the concept of whether this rule should run so this isn't trivial and is - # thus left as a to-do + # thus left as a to-do. 
Once they are in the YAML we should switch to `resolve_config_path` input: metadata = "results/{subtype}/genome/{time}/metadata.tsv", # Always use the genome metadata, even for segment builds - ordering = "config/h5n1-cattle-outbreak/color_ordering.tsv", - schemes = "config/h5n1-cattle-outbreak/color_schemes.tsv", - colors = files.colors, + ordering = lambda w: resolve_config_path("config/h5n1-cattle-outbreak/color_ordering.tsv", w), + schemes = lambda w: resolve_config_path("config/h5n1-cattle-outbreak/color_schemes.tsv", w), + colors = lambda w: resolve_config_path(files.colors, w) output: colors = "results/{subtype}/{segment}/{time}/colors.tsv", params: @@ -177,7 +177,7 @@ rule colors_genome: shell: r""" cp {input.colors} {output.colors} && \ - python3 {params.script} \ \ + python3 {params.script} \ --metadata {input.metadata} \ --ordering {input.ordering} \ --color-schemes {input.schemes} \ From 3b582b6a27f9f69dc99965fb1e7a332ef4388294 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Thu, 21 Nov 2024 11:20:09 +1300 Subject: [PATCH 02/11] Improve config.export.title structure Removes the need for any special-casing for genome builds as the new config pattern is flexible enough to handle them --- Snakefile | 14 ++++++-------- h5n1-cattle-outbreak/config.yaml | 7 ++++--- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/Snakefile b/Snakefile index 1361f32..1ca497e 100755 --- a/Snakefile +++ b/Snakefile @@ -652,14 +652,12 @@ def export_node_data_files(wildcards): def additional_export_config(wildcards): args = "" - title_overrides = resolve_config_value(['export', 'genome_title'], wildcards) \ - if wildcards.segment=='genome' \ - else resolve_config_value(['export', 'title'], wildcards) - if title_overrides: - args += ("--title '" + - title_overrides.format(segment=wildcards.segment.upper(), subtype=wildcards.subtype.upper(), time=wildcards.time) + - "'") - + title = resolve_config_value(['export', 'title'], wildcards) + if title: + # Config defined title strings may include wildcards to be expanded + title = title.format(segment=wildcards.segment.upper(), subtype=wildcards.subtype.upper(), time=wildcards.time) + args += f"--title {title!r}" + return args rule auspice_config: diff --git a/h5n1-cattle-outbreak/config.yaml b/h5n1-cattle-outbreak/config.yaml index a7fbbac..4b6816d 100644 --- a/h5n1-cattle-outbreak/config.yaml +++ b/h5n1-cattle-outbreak/config.yaml @@ -124,6 +124,7 @@ traits: # all builds - # TODO XXX confidence: true -export: # TODO XXX - genome_title: Full genome analysis of the ongoing influenza A/H5N1 cattle outbreak in North America - title: Ongoing influenza A/H5N1 cattle outbreak in North America ({segment} segment) +export: + title: + "*/genome/*": Full genome analysis of the ongoing influenza A/H5N1 cattle outbreak in North America + "*/*/*": Ongoing influenza A/H5N1 cattle outbreak in North America ({segment} segment) From 5aba493a106e462c29cb1a046ac5d27c2f80c09d Mon Sep 17 00:00:00 2001 From: james hadfield Date: Thu, 21 Nov 2024 12:28:05 +1300 Subject: [PATCH 03/11] Improve config.traits structure The new syntax allows this to be simpler and avoid the genome-specific top-level keys. This allows a number of downstream simplifications to the snakefiles, as well as increased robustness. 
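For example (mirroring the h5n1-cattle-outbreak config change below), a single `columns` entry can now cover both the genome build and the segment builds, with the most specific match winning:

```yaml
traits:
  columns:
    "*/genome/*": division
    "*/*/*": region country
```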
--- Snakefile | 18 ++++-------------- h5n1-cattle-outbreak/config.yaml | 13 ++++++------- rules/cattle-flu.smk | 22 ++++++++-------------- 3 files changed, 18 insertions(+), 35 deletions(-) diff --git a/Snakefile b/Snakefile index 1ca497e..d7b8c43 100755 --- a/Snakefile +++ b/Snakefile @@ -578,20 +578,10 @@ rule translate: """ def traits_params(wildcards): - columns = resolve_config_value(['traits', 'genome_columns'], wildcards) \ - if wildcards.segment=='genome' \ - else resolve_config_value(['traits', 'columns'], wildcards) - - bias = resolve_config_value(['traits', 'genome_sampling_bias_correction'], wildcards) \ - if wildcards.segment=='genome' \ - else resolve_config_value(['traits', 'sampling_bias_correction'], wildcards) - - confidence = resolve_config_value(['traits', 'confidence'], wildcards) - - args = f"--columns {columns}" - if bias: + args = f"--columns {resolve_config_value(['traits', 'columns'], wildcards)}" + if bias:=resolve_config_value(['traits', 'sampling_bias_correction'], wildcards): args += f" --sampling-bias-correction {bias}" - if confidence: + if confidence:=resolve_config_value(['traits', 'confidence'], wildcards): args += f" --confidence" return args @@ -605,7 +595,7 @@ rule traits: params: info = traits_params, shell: - """ + r""" augur traits \ --tree {input.tree} \ --metadata {input.metadata} \ diff --git a/h5n1-cattle-outbreak/config.yaml b/h5n1-cattle-outbreak/config.yaml index 4b6816d..027d9b5 100644 --- a/h5n1-cattle-outbreak/config.yaml +++ b/h5n1-cattle-outbreak/config.yaml @@ -113,15 +113,14 @@ ancestral: genome_root_seq: config/h5n1-cattle-outbreak/h5_cattle_genome_root.gb traits: - # genome build has different parameters... - # TODO XXX - genome_columns: division - genome_sampling_bias_correction: 5 + columns: + "*/genome/*": division + "*/*/*": region country # segment builds are the same as GISAID H5N1 builds - # segment builds - # TODO XXX - columns: region country # same as GISAID H5N1 builds - sampling_bias_correction: false + sampling_bias_correction: + "*/genome/*": 5 + "*/*/*": false - # all builds - # TODO XXX confidence: true export: diff --git a/rules/cattle-flu.smk b/rules/cattle-flu.smk index 5abf112..0a128fc 100644 --- a/rules/cattle-flu.smk +++ b/rules/cattle-flu.smk @@ -99,15 +99,11 @@ rule genome_metadata: augur filter --metadata {input.metadata} --sequences {input.sequences} --output-metadata {output.metadata} """ - -def assert_expected_config(w): - try: - # TODO: once we refactor things we should use `get_config()` here - # see - # but currently this snakefile doesn't have access to that function. - assert config['traits']['genome_columns']=='division' - except Exception as err: - raise Exception("Rule add_metadata_columns_to_show_non_inferred_values expected a certain format for config['traits'] that has since changed") from err +def metadata_columns_to_duplicate(wildcards): + column = resolve_config_value(['traits', 'columns'], wildcards) + assert isinstance(column, str) and ' ' not in column, \ + "cattle-flu.smk::metadata_columns_to_duplicate: Genome workflow only expects there to be a single column to run `augur traits` on." 
+ return [column, column+"_metadata"] rule add_metadata_columns_to_show_non_inferred_values: """ @@ -126,12 +122,10 @@ rule add_metadata_columns_to_show_non_inferred_values: segment="genome", time="default", params: - old_column = "division", - new_column = "division_metadata", - assert_traits = assert_expected_config, + columns = lambda w: metadata_columns_to_duplicate(w), shell: """ - cat {input.metadata} | csvtk mutate -t -f {params.old_column} -n {params.new_column} > {output.metadata} + cat {input.metadata} | csvtk mutate -t -f {params.columns[0]} -n {params.columns[1]} > {output.metadata} """ ruleorder: add_metadata_columns_to_show_non_inferred_values > filter @@ -169,7 +163,7 @@ rule colors_genome: output: colors = "results/{subtype}/{segment}/{time}/colors.tsv", params: - duplications = "division=division_metadata", + duplications = lambda w: "=".join(metadata_columns_to_duplicate({**w, 'segment': 'genome'})), script = os.path.join(workflow.current_basedir, "../scripts/assign-colors.py") wildcard_constraints: subtype="h5n1-cattle-outbreak", From 695bb5f5b61531e76fe8e2af05ea67bfac7ac83c Mon Sep 17 00:00:00 2001 From: james hadfield Date: Thu, 21 Nov 2024 12:42:04 +1300 Subject: [PATCH 04/11] Improve config.refine structure The new syntax allows this to be simpler and avoid the genome-specific "root_genome" key, as well as simplifying the snakemake pipeline --- Snakefile | 9 +-------- h5n1-cattle-outbreak/config.yaml | 12 ++++++------ 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/Snakefile b/Snakefile index d7b8c43..6a0a602 100755 --- a/Snakefile +++ b/Snakefile @@ -475,13 +475,6 @@ rule tree: """ -def refine_root(wildcards): - # TODO XXX - can we simplify this? - root = resolve_config_value(['refine', 'genome_root'], wildcards) \ - if wildcards.segment=='genome' \ - else resolve_config_value(['refine', 'root'], wildcards) - return f"--root {root}" if root else "" - rule refine: message: """ @@ -502,7 +495,7 @@ rule refine: date_inference = config['refine']['date_inference'], clock_rates = refine_clock_rates, clock_filter = refine_clock_filter, - root = refine_root, + root = lambda w: f"--root {resolve_config_value(['refine', 'root'], w)}" if resolve_config_value(['refine', 'root'], w) else '' shell: """ augur refine \ diff --git a/h5n1-cattle-outbreak/config.yaml b/h5n1-cattle-outbreak/config.yaml index 027d9b5..186e10f 100644 --- a/h5n1-cattle-outbreak/config.yaml +++ b/h5n1-cattle-outbreak/config.yaml @@ -76,12 +76,12 @@ refine: clock_filter_iqd: false - root: false - - # For the genome only we use the closest outgroup as the root - # P.S. Make sure this strain is force included via augur filter --include - # (This isn't needed for the segment builds as we include a large enough time span to root via the clock) - genome_root: A/skunk/NewMexico/24-006483-001/2024 + root: + # For the genome only we use the closest outgroup as the root + # P.S. Make sure this strain is force included via augur filter --include + # (This isn't needed for the segment builds as we include a large enough time span to root via the clock) + "*/genome/*": A/skunk/NewMexico/24-006483-001/2024 + "*/*/*": false segment_lengths: "*/pb2/*": 2341 From a4be4691f3fca2c1d6d15c5406e1afdd6fe426b0 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Mon, 2 Dec 2024 13:39:49 +1300 Subject: [PATCH 05/11] Improve config.ancestral structure The new syntax allows this to be simpler and avoid the special-case genome-specific keys in config['ancestral']. 
This also removes all the associated genome-specific code in the Snakefile. --- Snakefile | 8 +++----- h5n1-cattle-outbreak/config.yaml | 5 +++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/Snakefile b/Snakefile index 6a0a602..71d4774 100755 --- a/Snakefile +++ b/Snakefile @@ -525,12 +525,10 @@ def refined_tree(w): return "results/{subtype}/{segment}/{time}/tree.nwk", def ancestral_root_seq(wildcards): - root_seq = resolve_config_value(['ancestral', 'genome_root_seq'], wildcards) \ - if wildcards.segment=='genome' \ - else resolve_config_value(['ancestral', 'root_seq'], wildcards) - if not root_seq: + config_value = resolve_config_value(['ancestral', 'root_seq'], wildcards) + if not config_value: # falsey values skip the --root-sequence argument return "" - return f"--root-sequence {resolve_config_path(root_seq, wildcards)}" + return f"--root-sequence {resolve_config_path(config_value, wildcards)}" rule ancestral: message: "Reconstructing ancestral sequences and mutations" diff --git a/h5n1-cattle-outbreak/config.yaml b/h5n1-cattle-outbreak/config.yaml index 186e10f..e4739f1 100644 --- a/h5n1-cattle-outbreak/config.yaml +++ b/h5n1-cattle-outbreak/config.yaml @@ -109,8 +109,9 @@ refine: ancestral: inference: joint - root_seq: false - genome_root_seq: config/h5n1-cattle-outbreak/h5_cattle_genome_root.gb + root_seq: + "*/genome/*": config/h5n1-cattle-outbreak/h5_cattle_genome_root.gb + "*/*/*": false traits: columns: From 35dd531e23d4aa935a09bd496085a7cf1b7fe03b Mon Sep 17 00:00:00 2001 From: james hadfield Date: Mon, 2 Dec 2024 14:01:56 +1300 Subject: [PATCH 06/11] Improve configs for genome reference Removing hardcoded cattle-outbreak specific logic from the snakefiles and the configs. --- Snakefile | 2 +- .../{h5_cattle_genome_root.gb => reference_genome.gb} | 0 config/h5n1-cattle-outbreak/reference_ha.gb | 1 + config/h5n1-cattle-outbreak/reference_mp.gb | 1 + config/h5n1-cattle-outbreak/reference_na.gb | 1 + config/h5n1-cattle-outbreak/reference_np.gb | 1 + config/h5n1-cattle-outbreak/reference_ns.gb | 1 + config/h5n1-cattle-outbreak/reference_pa.gb | 1 + config/h5n1-cattle-outbreak/reference_pb1.gb | 1 + config/h5n1-cattle-outbreak/reference_pb2.gb | 1 + h5n1-cattle-outbreak/config.yaml | 5 ++--- rules/cattle-flu.smk | 7 +------ 12 files changed, 12 insertions(+), 10 deletions(-) rename config/h5n1-cattle-outbreak/{h5_cattle_genome_root.gb => reference_genome.gb} (100%) create mode 120000 config/h5n1-cattle-outbreak/reference_ha.gb create mode 120000 config/h5n1-cattle-outbreak/reference_mp.gb create mode 120000 config/h5n1-cattle-outbreak/reference_na.gb create mode 120000 config/h5n1-cattle-outbreak/reference_np.gb create mode 120000 config/h5n1-cattle-outbreak/reference_ns.gb create mode 120000 config/h5n1-cattle-outbreak/reference_pa.gb create mode 120000 config/h5n1-cattle-outbreak/reference_pb1.gb create mode 120000 config/h5n1-cattle-outbreak/reference_pb2.gb diff --git a/Snakefile b/Snakefile index 71d4774..c43f5a1 100755 --- a/Snakefile +++ b/Snakefile @@ -556,7 +556,7 @@ rule translate: input: tree = refined_tree, node_data = rules.ancestral.output.node_data, - reference = lambda w: resolve_config_path(config['genome_reference'] if w.segment=='genome' else files.reference, w) + reference = lambda w: resolve_config_path(files.reference, w) output: node_data = "results/{subtype}/{segment}/{time}/aa-muts.json" shell: diff --git a/config/h5n1-cattle-outbreak/h5_cattle_genome_root.gb b/config/h5n1-cattle-outbreak/reference_genome.gb similarity index 100% rename 
from config/h5n1-cattle-outbreak/h5_cattle_genome_root.gb rename to config/h5n1-cattle-outbreak/reference_genome.gb diff --git a/config/h5n1-cattle-outbreak/reference_ha.gb b/config/h5n1-cattle-outbreak/reference_ha.gb new file mode 120000 index 0000000..2e3d9cc --- /dev/null +++ b/config/h5n1-cattle-outbreak/reference_ha.gb @@ -0,0 +1 @@ +../h5n1/reference_h5n1_ha.gb \ No newline at end of file diff --git a/config/h5n1-cattle-outbreak/reference_mp.gb b/config/h5n1-cattle-outbreak/reference_mp.gb new file mode 120000 index 0000000..a3c59ab --- /dev/null +++ b/config/h5n1-cattle-outbreak/reference_mp.gb @@ -0,0 +1 @@ +../h5n1/reference_h5n1_mp.gb \ No newline at end of file diff --git a/config/h5n1-cattle-outbreak/reference_na.gb b/config/h5n1-cattle-outbreak/reference_na.gb new file mode 120000 index 0000000..32e3088 --- /dev/null +++ b/config/h5n1-cattle-outbreak/reference_na.gb @@ -0,0 +1 @@ +../h5n1/reference_h5n1_na.gb \ No newline at end of file diff --git a/config/h5n1-cattle-outbreak/reference_np.gb b/config/h5n1-cattle-outbreak/reference_np.gb new file mode 120000 index 0000000..2023f58 --- /dev/null +++ b/config/h5n1-cattle-outbreak/reference_np.gb @@ -0,0 +1 @@ +../h5n1/reference_h5n1_np.gb \ No newline at end of file diff --git a/config/h5n1-cattle-outbreak/reference_ns.gb b/config/h5n1-cattle-outbreak/reference_ns.gb new file mode 120000 index 0000000..65fb6bc --- /dev/null +++ b/config/h5n1-cattle-outbreak/reference_ns.gb @@ -0,0 +1 @@ +../h5n1/reference_h5n1_ns.gb \ No newline at end of file diff --git a/config/h5n1-cattle-outbreak/reference_pa.gb b/config/h5n1-cattle-outbreak/reference_pa.gb new file mode 120000 index 0000000..f8ecb49 --- /dev/null +++ b/config/h5n1-cattle-outbreak/reference_pa.gb @@ -0,0 +1 @@ +../h5n1/reference_h5n1_pa.gb \ No newline at end of file diff --git a/config/h5n1-cattle-outbreak/reference_pb1.gb b/config/h5n1-cattle-outbreak/reference_pb1.gb new file mode 120000 index 0000000..f573ef0 --- /dev/null +++ b/config/h5n1-cattle-outbreak/reference_pb1.gb @@ -0,0 +1 @@ +../h5n1/reference_h5n1_pb1.gb \ No newline at end of file diff --git a/config/h5n1-cattle-outbreak/reference_pb2.gb b/config/h5n1-cattle-outbreak/reference_pb2.gb new file mode 120000 index 0000000..fbdd304 --- /dev/null +++ b/config/h5n1-cattle-outbreak/reference_pb2.gb @@ -0,0 +1 @@ +../h5n1/reference_h5n1_pb2.gb \ No newline at end of file diff --git a/h5n1-cattle-outbreak/config.yaml b/h5n1-cattle-outbreak/config.yaml index e4739f1..11ac09d 100644 --- a/h5n1-cattle-outbreak/config.yaml +++ b/h5n1-cattle-outbreak/config.yaml @@ -38,8 +38,7 @@ target_sequences_per_tree: 10_000 #### Config files #### -reference: config/h5n1/reference_h5n1_{segment}.gb # use H5N1 references -genome_reference: config/{subtype}/h5_cattle_genome_root.gb +reference: config/h5n1-cattle-outbreak/reference_{segment}.gb auspice_config: config/{subtype}/auspice_config_{subtype}.json colors: config/h5n1/colors_h5n1.tsv # use H5N1 colors lat_longs: config/h5n1/lat_longs_h5n1.tsv # use H5N1 lat-longs @@ -110,7 +109,7 @@ refine: ancestral: inference: joint root_seq: - "*/genome/*": config/h5n1-cattle-outbreak/h5_cattle_genome_root.gb + "*/genome/*": config/h5n1-cattle-outbreak/reference_genome.gb "*/*/*": false traits: diff --git a/rules/cattle-flu.smk b/rules/cattle-flu.smk index 0a128fc..d6d5ec5 100644 --- a/rules/cattle-flu.smk +++ b/rules/cattle-flu.smk @@ -38,12 +38,7 @@ rule filter_segments_for_genome: rule align_segments_for_genome: input: sequences = 
"results/{subtype}/{segment}/{time}/filtered_{genome_seg}.fasta", - # Use the H5N1 reference sequences for alignment - reference = lambda w: [ - resolve_config_path(expanded, w) - for expanded in - expand(config['reference'], subtype='h5n1', segment=w.genome_seg) - ] + reference = lambda w: resolve_config_path(files.reference, {'subtype': 'h5n1-cattle-outbreak', 'segment': w.genome_seg, 'time': 'default'}) output: alignment = "results/{subtype}/{segment}/{time}/aligned_{genome_seg}.fasta" wildcard_constraints: From fbfc193b3b0457f3dde2f102d6099a7a36438805 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Fri, 22 Nov 2024 10:00:31 +1300 Subject: [PATCH 07/11] Refactor config.builds The new structure is both more flexible and more amenable to config overlays. Specifying the segments per subtype (or set of subtypes) allows a pipeline to produce different sets of segments (e.g. ha/na for some subtypes, all 8 for others). Encoding as a list makes it much simpler to override via additional configs. For instance, here's a GISAID overlay which just does what you expect: ```yaml builds: - subtype: h5n1 segment: - ha - na time: 2y ``` --- Snakefile | 45 ++++++++++++++++++++++++++------ h5n1-cattle-outbreak/config.yaml | 29 ++++++++++---------- 2 files changed, 51 insertions(+), 23 deletions(-) diff --git a/Snakefile b/Snakefile index c43f5a1..013b2b7 100755 --- a/Snakefile +++ b/Snakefile @@ -69,6 +69,9 @@ def resolve_config_path(original_path, wildcards=None): f"\t2. {workflow.basedir} (where the entry snakefile is)\n" f"\t3. {CURRENT_BASEDIR} (where the main avian-flu snakefile is)\n") +def is_scalar(x): + return any([isinstance(x, t) for t in [float, int, bool, str]]) + def resolve_config_value(rule_parts, wildcards, sep="/"): """ Resolve a config value defined by the *rule_parts of the config, @@ -101,7 +104,7 @@ def resolve_config_value(rule_parts, wildcards, sep="/"): except KeyError: raise InvalidConfigError('Config missing entire entry for config'+''.join(['["'+rule_parts[j]+'""]' for j in range(0,i+1)])) - if any([isinstance(config_lookup, t) for t in [float, int, bool, str]]): + if is_scalar(config_lookup): return config_lookup if not isinstance(config_lookup, dict): @@ -156,19 +159,45 @@ def sanity_check_config(): sanity_check_config() +def as_list(x): + return x if isinstance(x, list) else [x] + def collect_builds(): """ iteratively create workflow targets from config.builds for the `all` rule you can over-ride this by specifying targets (filenames) on the command line """ targets = [] - for subtype,times in config.get('builds', {}).items(): - for segment in config.get('segments', []): - if len(times): - for time in times: - targets.append(f"auspice/avian-flu_{subtype}_{segment}_{time}.json") - else: - targets.append(f"auspice/avian-flu_{subtype}_{segment}.json") + + if 'builds' not in config: + raise InvalidConfigError('config["builds"] is not defined!') + if not isinstance(config['builds'], list): + raise InvalidConfigError('config["builds"] must be a list') + + if 'segments' in config: + raise InvalidConfigError('config["segments"] is no longer used. 
Please remove it and encode the target segments within config["builds"]') + + for i,subconfig in enumerate(config['builds']): + required_keys = ['subtype', 'segment'] + optional_keys = ['time'] + if not isinstance(subconfig, dict): + raise InvalidConfigError(f'config["builds"][{i}] must be a dictionary!') + if not all([k in subconfig for k in required_keys]): + raise InvalidConfigError(f'config["builds"][{i}] must have {", ".join(required_keys)} keys') + if not all([isinstance(v, list) or is_scalar(v) for v in subconfig.values()]): + raise InvalidConfigError(f'config["builds"][{i}] values must all be scalars or lists') + + for subtype in as_list(subconfig['subtype']): + for segment in as_list(subconfig['segment']): + # Some builds (GISAID) have a time component, some (cattle-outbreak) don't + if 'time' in subconfig: + for time in as_list(subconfig['time']): + target = "auspice/avian-flu_{subtype}_{segment}_{time}.json".format(subtype=subtype, segment=segment, time=time) + else: + target = "auspice/avian-flu_{subtype}_{segment}.json".format(subtype=subtype, segment=segment) + if target not in targets: + targets.append(target) + return targets rule all: diff --git a/h5n1-cattle-outbreak/config.yaml b/h5n1-cattle-outbreak/config.yaml index 11ac09d..bd43502 100644 --- a/h5n1-cattle-outbreak/config.yaml +++ b/h5n1-cattle-outbreak/config.yaml @@ -3,22 +3,21 @@ custom_rules: - "rules/cattle-flu.smk" -# TODO XXX - explain how/why time=default, even I forget... - -#### Parameters which define which builds to produce via this config ### +#### Which builds to produce via this config? ### +# These builds are a combination of two parts: subtype (h5n1-cattle-outbreak) +# and segment (e.g. ha) builds: - h5n1-cattle-outbreak: '' - -segments: - - genome - - pb2 - - pb1 - - pa - - ha - - np - - na - - mp - - ns + - subtype: h5n1-cattle-outbreak + segment: + - genome + - pb2 + - pb1 + - pa + - ha + - np + - na + - mp + - ns #### Parameters which define the input source #### From dae855d3eb7678f54626a076526eb2e1842f4089 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Fri, 22 Nov 2024 10:24:11 +1300 Subject: [PATCH 08/11] Add config.target_patterns A common use case is to want intermediate files (e.g. filtered metadata) and this is a pain when we have many build targets as you have to list these out on the command line manually. The other approach is to modify the snakefile targets within the Snakefile, which is my interpretation of the recent (commented out) additions in 18958650bef5a7ffa93ccc88f56867f45f5ccdc9: ``` rule all: input: auspice_json = collect_builds() #sequences = expand("results/{subtype}/{segment}/sequences.fasta", segment=SEGMENTS, subtype=SUBTYPES), #metadata = expand("results/{subtype}/metadata.tsv", segment=SEGMENTS, subtype=SUBTYPES) ``` Allowing config overlays to set these is really nice. 
The above snakefile changes can now be replaced with the following config (even more important when we start thinking about analysis directories separate to the pathogen repo): ```yaml extends: gisaid.yaml target_patterns: - "results/{subtype}/{segment}/sequences.fasta" - "results/{subtype}/metadata.tsv" ``` --- Snakefile | 26 +++++++++++++------------- h5n1-cattle-outbreak/config.yaml | 4 ++++ 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/Snakefile b/Snakefile index 013b2b7..8502f1e 100755 --- a/Snakefile +++ b/Snakefile @@ -6,7 +6,6 @@ wildcard_constraints: # defined before extra rules `include`d as they reference this constant SEGMENTS = ["pb2", "pb1", "pa", "ha","np", "na", "mp", "ns"] -#SUBTYPES = ["h5n1", "h5nx", "h7n9", "h9n2"] CURRENT_BASEDIR = workflow.current_basedir # TODO XXX store this value here - can't access within functions because workflow.included_stack is empty @@ -162,13 +161,17 @@ sanity_check_config() def as_list(x): return x if isinstance(x, list) else [x] -def collect_builds(): +def expand_target_patterns(): """ iteratively create workflow targets from config.builds for the `all` rule you can over-ride this by specifying targets (filenames) on the command line """ targets = [] + if not isinstance(config.get('target_patterns', None), list) and not isinstance(config.get('target_patterns', None), str): + raise InvalidConfigError('config["target_patterns"] must be defined (either as a list or a single string)') + target_patterns = as_list(config['target_patterns']) + if 'builds' not in config: raise InvalidConfigError('config["builds"] is not defined!') if not isinstance(config['builds'], list): @@ -190,21 +193,18 @@ def collect_builds(): for subtype in as_list(subconfig['subtype']): for segment in as_list(subconfig['segment']): # Some builds (GISAID) have a time component, some (cattle-outbreak) don't - if 'time' in subconfig: - for time in as_list(subconfig['time']): - target = "auspice/avian-flu_{subtype}_{segment}_{time}.json".format(subtype=subtype, segment=segment, time=time) - else: - target = "auspice/avian-flu_{subtype}_{segment}.json".format(subtype=subtype, segment=segment) - if target not in targets: - targets.append(target) + for time in (as_list(subconfig['time']) if 'time' in subconfig else [None]): + for target_pattern in target_patterns: + if time is None and '{time}' in target_pattern: + raise InvalidConfigError(f'target pattern {target_pattern!r} specifies time, but config["builds"][{i}] doesn\'t!') + target = target_pattern.format(subtype=subtype, segment=segment, time=time) + if target not in targets: + targets.append(target) return targets rule all: - input: - auspice_json = collect_builds() - #sequences = expand("results/{subtype}/{segment}/sequences.fasta", segment=SEGMENTS, subtype=SUBTYPES), - #metadata = expand("results/{subtype}/metadata.tsv", segment=SEGMENTS, subtype=SUBTYPES) + input: expand_target_patterns() # This must be after the `all` rule above since it depends on its inputs diff --git a/h5n1-cattle-outbreak/config.yaml b/h5n1-cattle-outbreak/config.yaml index bd43502..398313a 100644 --- a/h5n1-cattle-outbreak/config.yaml +++ b/h5n1-cattle-outbreak/config.yaml @@ -19,6 +19,10 @@ builds: - mp - ns +# The default target for this build is the Auspice JSON, however config overlays may wish to change this +# (e.g. 
to target "intermediate" files instead) +target_patterns: + - "auspice/avian-flu_{subtype}_{segment}.json" #### Parameters which define the input source #### s3_src: From 2a19000348670a569a665873864cf204ed3e0092 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Fri, 22 Nov 2024 13:35:53 +1300 Subject: [PATCH 09/11] Shift subtype lookups to config Shifting hardcoded parameters into configs is generally simpler to reason with and opens the door (ever so slightly) to adding novel (wildcard) subtypes to the pipeline via config-only modifications. --- Snakefile | 17 ++--------------- gisaid/config.yaml | 7 +++++++ h5n1-cattle-outbreak/config.yaml | 4 ++++ 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/Snakefile b/Snakefile index 8502f1e..3113644 100755 --- a/Snakefile +++ b/Snakefile @@ -233,19 +233,6 @@ rule files: files = rules.files.params -def subtypes_by_subtype_wildcard(wildcards): - # TODO - shift this to config - db = { - 'h5nx': ['h5n1', 'h5n2', 'h5n3', 'h5n4', 'h5n5', 'h5n6', 'h5n7', 'h5n8', 'h5n9'], - 'h5n1': ['h5n1'], - 'h7n9': ['h7n9'], - 'h9n2': ['h9n2'], - } - db['h5n1-cattle-outbreak'] = [*db['h5nx']] - assert wildcards.subtype in db, (f"Subtype {wildcards.subtype!r} is not defined in the snakemake function " - "`subtypes_by_subtype_wildcard` -- is there a typo in the subtype you are targetting?") - return(db[wildcards.subtype]) - rule download_sequences: output: sequences = f"data/{S3_SRC.get('name', None)}/sequences_{{segment}}.fasta", @@ -291,7 +278,7 @@ rule filter_sequences_by_subtype: output: sequences = "results/{subtype}/{segment}/sequences.fasta", params: - subtypes=subtypes_by_subtype_wildcard, + subtypes=lambda w: config['subtype_lookup'][w.subtype], shell: """ augur filter \ @@ -307,7 +294,7 @@ rule filter_metadata_by_subtype: output: metadata = "results/{subtype}/metadata.tsv", params: - subtypes=subtypes_by_subtype_wildcard, + subtypes=lambda w: config['subtype_lookup'][w.subtype], shell: """ augur filter \ diff --git a/gisaid/config.yaml b/gisaid/config.yaml index bf2acc9..0ab15b3 100644 --- a/gisaid/config.yaml +++ b/gisaid/config.yaml @@ -37,6 +37,13 @@ s3_src: local_ingest: false # P.S. To use local ingest files, comment out s3_src and change to local_ingest: fauna +# For subtypes defined as build wildcards (e.g. "h5n1", "h5nx"), list out the subtype values +# that we'll use to filter the starting metadata's 'subtype' column +subtype_lookup: + h5nx: ['h5n1', 'h5n2', 'h5n3', 'h5n4', 'h5n5', 'h5n6', 'h5n7', 'h5n8', 'h5n9'] + h5n1: ['h5n1'] + h7n9: ['h7n9'] + h9n2: ['h9n2'] #### Parameters which control large overarching aspects of the build target_sequences_per_tree: 3000 diff --git a/h5n1-cattle-outbreak/config.yaml b/h5n1-cattle-outbreak/config.yaml index 398313a..06185eb 100644 --- a/h5n1-cattle-outbreak/config.yaml +++ b/h5n1-cattle-outbreak/config.yaml @@ -32,6 +32,10 @@ s3_src: local_ingest: false # P.S. To use local ingest files, comment out s3_src and change to local_ingest: joined-ncbi (e.g.) +# For subtypes defined as build wildcards (i.e. 
"h5n1-cattle-outbreak"), list out the subtype values +# that we'll use to filter the starting metadata's 'subtype' column +subtype_lookup: + h5n1-cattle-outbreak: ['h5n1', 'h5n2', 'h5n3', 'h5n4', 'h5n5', 'h5n6', 'h5n7', 'h5n8', 'h5n9'] #### Parameters which control large overarching aspects of the build # Set a high target_sequences_per_tree to capture all circulating strains, as they will be pruned down From 2aaf006b507ca70f8ff95964d2747e269ca877c1 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Fri, 22 Nov 2024 14:49:50 +1300 Subject: [PATCH 10/11] Update max-sequences filtering parameter It makes more sense to specify this as a filtering parameter. We could continue using a value which can't be changed according to wildcards (e.g. `target_sequences_per_tree: 3000`) however by using the "*/*/*: 3000" syntax we make it clearer that it's possible to make this specific to certain builds. The new syntax makes this trivial to implement using a --- Snakefile | 2 +- gisaid/config.yaml | 4 +++- h5n1-cattle-outbreak/config.yaml | 11 +++++------ 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/Snakefile b/Snakefile index 3113644..3f7accc 100755 --- a/Snakefile +++ b/Snakefile @@ -407,7 +407,7 @@ def _filter_params(wildcards, input, output, threads, resources): group_by_value = resolve_config_value(['filter', 'group_by'], wildcards) cmd += f" --group-by {group_by_value}" if group_by_value else "" - cmd += f" --subsample-max-sequences {config['target_sequences_per_tree']}" + cmd += f" --subsample-max-sequences {resolve_config_value(['filter', 'target_sequences_per_tree'], wildcards)}" cmd += f" --min-date {resolve_config_value(['filter', 'min_date'], wildcards)}" cmd += f" --include {input.include}" cmd += f" --exclude-where {exclude_where}" diff --git a/gisaid/config.yaml b/gisaid/config.yaml index 0ab15b3..779d2df 100644 --- a/gisaid/config.yaml +++ b/gisaid/config.yaml @@ -46,7 +46,6 @@ subtype_lookup: h9n2: ['h9n2'] #### Parameters which control large overarching aspects of the build -target_sequences_per_tree: 3000 same_strains_per_segment: false @@ -68,6 +67,9 @@ description: config/description_gisaid.md # There's one exception: If a config value is constant for any and all builds then you # can just use a scalar value (number, string, boolean) filter: + target_sequences_per_tree: + "*/*/*": 3000 + min_length: "*/pb2/*": 2100 "*/pb1/*": 2100 diff --git a/h5n1-cattle-outbreak/config.yaml b/h5n1-cattle-outbreak/config.yaml index 06185eb..d1d97f0 100644 --- a/h5n1-cattle-outbreak/config.yaml +++ b/h5n1-cattle-outbreak/config.yaml @@ -37,12 +37,6 @@ local_ingest: false subtype_lookup: h5n1-cattle-outbreak: ['h5n1', 'h5n2', 'h5n3', 'h5n4', 'h5n5', 'h5n6', 'h5n7', 'h5n8', 'h5n9'] -#### Parameters which control large overarching aspects of the build -# Set a high target_sequences_per_tree to capture all circulating strains, as they will be pruned down -# as part of the workflow -target_sequences_per_tree: 10_000 - - #### Config files #### reference: config/h5n1-cattle-outbreak/reference_{segment}.gb @@ -58,6 +52,11 @@ description: config/{subtype}/description_{subtype}.md #### Rule-specific parameters #### filter: + # Set a high target_sequences_per_tree to capture all circulating strains, as they will be pruned down + # as part of the workflow + target_sequences_per_tree: + "*/*/*": 3000 + min_length: "*/pb2/*": 2100 # Note: could use "h5n1-cattle-outbreak/pb2/default: 2100" if desired "*/pb1/*": 2100 From fb999036fa3307d2cf7e38114bfbac973d583e46 Mon Sep 17 00:00:00 2001 From: james 
Date: Fri, 22 Nov 2024 15:36:46 +1300
Subject: [PATCH 11/11] WIP add docs to README

WIP as
- I removed the quickstart section. Do we want to maintain this?
- I need to check the "other build customisations" section and ensure they still work with the current setup
- this is a prototype, and so the invocation syntax is all subject to change!
---
 README.md | 136 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 126 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 4ec2f54..be7b1d2 100755
--- a/README.md
+++ b/README.md
@@ -6,10 +6,20 @@ Please see [nextstrain.org/docs](https://nextstrain.org/docs) for details about
 
 The Snakemake pipeline is parameterised by two config files, one for the A/H5N1, A/H5NX, A/H7N9, and A/H9N2 builds and one for the 2024 A/H5N1 cattle-flu outbreak.
 
+### Table of Contents
 
-## Segment-level GISAID builds
+* [Running segment-level GISAID builds](#running-segment-level-gisaid-builds)
+* [Running H5N1 Cattle Outbreak (2024) builds](#running-h5n1-cattle-outbreak-2024-builds)
+* [Creating a custom build via config overlays](#creating-a-custom-build-via-config-overlays)
+* [Running builds in a separate working directory](#running-builds-in-a-separate-working-directory)
+* [Cleavage Site Annotations](#cleavage-site-annotations)
+* [H5 Clade Labeling](#h5-clade-labeling)
+* [Other build customisations](#other-build-customisations)
 
-The `config/gisaid.yaml` config builds 32 Auspice datasets (8 segments x 4 subtypes (A/H5N1, A/H5NX, A/H7N9, A/H9N2)) using GISAID data and can be run via
+
+## Running segment-level GISAID builds
+
+The `config/gisaid.yaml` config builds 48 Auspice datasets (8 segments x 4 subtypes (A/H5N1, A/H5NX, A/H7N9, A/H9N2) x 1-2 time resolutions (all-time, 2y)) using GISAID data and can be run via
 
 ```bash
 snakemake --cores 1 -pf --configfile config/gisaid.yaml
@@ -34,7 +44,7 @@ to nextstrain.org by running:
 
 nextstrain build . --configfile config/gisaid.yaml -f deploy_all
 ```
-## H5N1 Cattle Outbreak (2024)
+## Running H5N1 Cattle Outbreak (2024) builds
 
 We produce per-segment and whole-genome (concatenated segments) builds for the ongoing H5N1 cattle-flu outbreak.
 These use NCBI data including consensus genomes and SRA data assembled via the Andersen lab's [avian-influenza repo](https://github.com/andersen-lab/avian-influenza).
@@ -64,15 +74,119 @@ This should allow any reassortments to be highlighted and will also include outb
 
 > Note that generating any segment-level build here will necessarily build the genome tree, as it's needed to identify the clade of interest in each segment.
 
-## Creating a custom build
-The easiest way to generate your own, custom avian-flu build is to use the quickstart-build as a starting template. Simply clone the quickstart-build, run with the example data, and edit the Snakefile to customize. This build includes example data and a simplified, heavily annotated Snakefile that goes over the structure of Snakefiles and annotates rules and inputs/outputs that can be modified. This build, with it's own readme, is available [here](https://github.com/nextstrain/avian-flu/tree/master/quickstart-build).
+## Creating a custom build via config overlays
+
+The two config files introduced above are designed to be extended (customised) by overlaying your own YAML configurations.
+The aim is to allow easy customisation of the workflow outputs (e.g. which subtypes / segments / time-frames to run) and the stopping points (Auspice JSONs or particular intermediate files) via such overlays.
+Additionally, modification of parameters (e.g. clock rates, minimum lengths, subsampling criteria, DTA metadata) is possible through overlays without needing to modify the underlying base config or Snakemake pipeline itself.
+
+Config overlays essentially allow you to maintain one or more modifications of the workflow for your own purposes.
+For instance you may want a way to easily run only H5N1 HA+NA 2y builds, and a config overlay can achieve this.
+Using an overlay keeps this change separate from the config used for Nextstrain automation and also avoids `git` conflicts emerging over time.
+When combined with running in a separate working directory (explained below) this becomes even more powerful.
+
+As an example we'll create a custom config `config_local.yaml` which you can then add to the build command described above, e.g. `nextstrain build . --configfile config/gisaid.yaml config_local.yaml`.
+We'll use the GISAID config as the example we're extending, however the concepts are similar for the H5N1 cattle outbreak config.
+
+
+### Restrict which builds we want to produce
+
+By default we produce 48 Auspice JSONs (4 subtypes * 8 segments * 1-2 time resolutions). We can restrict these by redefining the builds in our config overlay. For instance the following will produce only 2 datasets, `h5n1/ha/2y` and `h5n1/na/2y`:
+```yaml
+builds:
+  - subtype: h5n1
+    segment:
+      - ha
+      - na
+    time: 2y
+```
+Here `builds` is an array of sub-configs, each of which defines a combination of subtype, segment and time parameters. Each subtype, segment and time can be a single string (as subtype and time are, above) or an array (as segment is, above).
+
+### Only produce certain intermediate files, not Auspice datasets
+
+By default the "target" for each build is the Auspice JSON (`auspice/avian-flu_h5n1_ha_2y.json` etc.), however we can change this if we just want certain intermediate files. Adding the following to the config overlay will stop once we've filtered to the metadata & sequences we would use for each tree:
+
+```yaml
+target_patterns:
+  - "results/{subtype}/{segment}/{time}/metadata.tsv"
+  - "results/{subtype}/{segment}/{time}/sequences.fasta"
+```
+
+(This works in combination with the custom `builds` definition, above.)
+
+### Change the target number of sequences per build
+
+In `config/gisaid.yaml` this parameter is defined as
+
+```yaml
+filter:
+  target_sequences_per_tree:
+    "*/*/*": 3000
+```
+
+Here the `"*/*/*"` syntax is a set of slash-separated matchers for subtype, segment and time-resolution, and the `*` character means "match everything".
+So here we're saying "for every subtype, for every segment, for every time-resolution target 3000 sequences".
+
+If we want to change our h5n1 builds to instead have 5000 sequences (whilst keeping the rest at 3000) we could add the following to our config overlay:
+```yaml
+filter:
+  target_sequences_per_tree:
+    "h5n1/*/*": 5000
+```
+And since `"h5n1/*/*"` is more specific than `"*/*/*"` it'll take precedence when `subtype="h5n1"`.
+(Internally, Snakemake merges these configs together resulting in `'target_sequences_per_tree': {'h5n1/*/*': 5000, '*/*/*': 3000}`. For each combination of subtype/segment/time values within the workflow we consult these dictionaries and pick the most specific match.)
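+
+As a rough illustration of that "most specific match" idea, the lookup can be sketched in a few lines of plain Python. This is only a sketch of the matching behaviour (the function and variable names are invented for illustration, and it glosses over how the workflow breaks ties between equally specific keys), not the code the pipeline actually runs:
+
+```python
+def most_specific_value(lookup, subtype, segment, time):
+    """Return the value whose 'subtype/segment/time' key best matches the build."""
+    wanted = [subtype, segment, time]
+
+    def matches(key):
+        parts = key.split("/")
+        # a key matches if every field is either '*' or equal to the build's value
+        return len(parts) == 3 and all(p in ("*", w) for p, w in zip(parts, wanted))
+
+    candidates = [key for key in lookup if matches(key)]
+    if not candidates:
+        raise KeyError(f"no key matches {'/'.join(wanted)!r}")
+    # fewer '*' placeholders means a more specific key
+    return lookup[min(candidates, key=lambda key: key.count("*"))]
+
+merged = {"h5n1/*/*": 5000, "*/*/*": 3000}
+print(most_specific_value(merged, "h5n1", "ha", "2y"))   # -> 5000
+print(most_specific_value(merged, "h5nx", "pb2", "2y"))  # -> 3000
+```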
-## Features unique to avian flu builds
-#### cleavage site annotations
+This syntax is concise but powerful; for instance, we can parameterise the builds like so:
+```yaml
+filter:
+  target_sequences_per_tree:
+    "*/*/all-time": 5000 # target 5k sequences for the all-time builds
+    "h5n1/*/all-time": 10000 # but for h5n1 all-time builds target 10k sequences (this is more specific as it only includes one '*' character)
+    "*/*/*": 1000 # target 1k sequences for any other builds (i.e. the 2y builds)
+```
+
+
+### Change other parameters
+
+The pattern introduced in the preceding section generally applies for all parameters in the workflow.
+By reading through the base config YAML (e.g. `config/gisaid.yaml`) you should be able to learn the config structure and then modify it within your overlay config as needed.
+
+If the parameter is not exposed via the config YAML and you find yourself modifying the underlying Snakefile, consider exposing it via the config so that the customisation becomes available to config overlays.
+
+
+## Running builds in a separate working directory
+
+> Note: This section doesn't yet work using AWS / docker runtimes but support is forthcoming.
+
+We can run the workflow from an analysis directory separate from this repo. Imagine the following directory structure:
+
+```console
+├── avian-flu
+│   ├── README.md
+│   ├── Snakefile
+│   └── ... (lots more files here)
+└── analysis_dir
+    └── config.yaml
+```
+
+Where `config.yaml` is your config overlay, introduced in the previous section as `config_local.yaml`.
+
+> TODO XXX following uses a different invocation - this is all in flux
+
+From within the `analysis_dir` we can run `snakemake --snakefile ../avian-flu/gisaid/Snakefile --cores 1` and the workflow will automatically use your `config.yaml` config overlay.
+This keeps the workflow outputs separate and isolated from the workflow itself.
+Depending on how you run builds this can be very liberating; for instance if you are switching between versions of the workflow you can maintain different analysis directories for each version, or if you are running the workflow at different times you can separate the analyses for easy before/after comparisons.
+
+> You don't have to name it `config.yaml`, but if you use a different filename you'll have to specify it via `--configfile `.
+
+
+## Cleavage Site Annotations
+
 Influenza virus HA is translated as a single peptide (HA0) that is cleaved to form the mature, functional form (HA1 and HA2). In all human strains and many avian strains, the cleavage site is composed of a single, basic amino acid residue. However, some avian influenza subtypes, particularly H5s, have acquired additional basic residues immediately preceding the HA cleavage site. In some cases, this results in addition of a furin cleavage motif, allowing HA to be cleaved by furin, which is ubiquitously expressed, and allows for viral replication across a range of tissues. The addition of this "polybasic cleavage site" is one of the prime determinants of avian influenza virulence. In these builds, we have annotated whether strains contain a furin cleavage motif, defined here as the sequence `R-X-K/R-R` immediately preceding the start of HA2, where `X` can be any amino acid. We have also added a color by for the cleavage site sequence, which we define here as the 4 bases preceding HA2.
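+
+For readers who prefer code to prose, that motif definition can be expressed as a short, self-contained sketch (the function name, arguments and example sequence below are invented for illustration and are not the annotation code used by the pipeline):
+
+```python
+import re
+
+def has_furin_motif(ha_protein, ha2_start):
+    """True if the 4 residues immediately preceding HA2 match R-X-(K/R)-R."""
+    cleavage_site = ha_protein[ha2_start - 4:ha2_start]
+    return bool(re.fullmatch(r"R.[KR]R", cleavage_site))
+
+# Toy sequence loosely modelled on a polybasic H5 cleavage site (...RRKKR | GLF...)
+toy = "PQRERRRKKRGLFGAIAGFIE"
+print(has_furin_motif(toy, toy.index("GLF")))  # -> True
+```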
-#### clade labeling +## H5 Clade Labeling H5 viruses are classified into clades, which are currently viewable as a color by on [nextstrain.org](https://nextstrain.org/avian-flu/h5n1/ha?c=h5_label_clade). Because clade annotations are not available in all public databases, we annotate sequences with their most likely clade using a tool developed by Samuel S. Shepard at CDC called [LABEL](https://wonder.cdc.gov/amd/flu/label/). The assigned clade for each H5N1 or H5Nx sequence are available as public tsvs [here](https://github.com/nextstrain/avian-flu/tree/master/clade-labeling). To update the `clades.tsv` file with clade annotations for new sequences, run: @@ -87,7 +201,9 @@ To string these together and update the `clades.tsv` file for new sequences and `snakemake -s Snakefile.clades -p --cores 1 && snakemake -p --cores 1` -#### Using the same strains for all segments +## Other build customisations + +### Using the same strains for all segments By default we subsample data for each segment independently. Alternatively, you can ask the pipeline to use the same strains for each segment. @@ -106,7 +222,7 @@ If you are using `nextstrain build` then add that to the end of the command (i.e Note that you may need to remove any existing data in `results/` in order for snakemake to correctly regenerate the intermediate files. -#### Using locally ingested data (instead of downloading from S3) +### Using locally ingested data (instead of downloading from S3) Run the pipeline with `--config 'local_ingest=True'` to use the locally available files produced by the ingest pipeline (see `./ingest/README.md` for details on how to run). Specifically, the files needed are `ingest/results/metadata.tsv` and `ingest/results/sequences_{SEGMENT}.fasta`.