From 684664a882381a63a90c8864302cdbb5f3c3587c Mon Sep 17 00:00:00 2001 From: james hadfield Date: Fri, 22 Nov 2024 14:49:50 +1300 Subject: [PATCH] Update max-sequences filtering parameter It makes more sense to specify this as a filtering parameter. We could continue using a value which can't be changed according to wildcards (e.g. `target_sequences_per_tree: 3000`); however, by using the "*/*/*: 3000" syntax we make it clearer that it's possible to make this specific to certain builds. The new syntax makes this trivial to implement using a call to `resolve_config_value`. --- Snakefile | 2 +- config/gisaid.yaml | 4 +++- config/h5n1-cattle-outbreak.yaml | 11 +++++------ 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/Snakefile b/Snakefile index 90f467b..ca23abe 100755 --- a/Snakefile +++ b/Snakefile @@ -413,7 +413,7 @@ def _filter_params(wildcards, input, output, threads, resources): group_by_value = resolve_config_value(['filter', 'group_by'], wildcards) cmd += f" --group-by {group_by_value}" if group_by_value else "" - cmd += f" --subsample-max-sequences {config['target_sequences_per_tree']}" + cmd += f" --subsample-max-sequences {resolve_config_value(['filter', 'target_sequences_per_tree'], wildcards)}" cmd += f" --min-date {resolve_config_value(['filter', 'min_date'], wildcards)}" cmd += f" --include {input.include}" cmd += f" --exclude-where {exclude_where}" diff --git a/config/gisaid.yaml b/config/gisaid.yaml index 0ab15b3..779d2df 100644 --- a/config/gisaid.yaml +++ b/config/gisaid.yaml @@ -46,7 +46,6 @@ subtype_lookup: h9n2: ['h9n2'] #### Parameters which control large overarching aspects of the build -target_sequences_per_tree: 3000 same_strains_per_segment: false @@ -68,6 +67,9 @@ description: config/description_gisaid.md # There's one exception: If a config value is constant for any and all builds then you # can just use a scalar value (number, string, boolean) filter: + target_sequences_per_tree: + "*/*/*": 3000 + min_length: "*/pb2/*": 2100 "*/pb1/*": 2100 diff --git 
a/config/h5n1-cattle-outbreak.yaml b/config/h5n1-cattle-outbreak.yaml index a90474e..9981f21 100644 --- a/config/h5n1-cattle-outbreak.yaml +++ b/config/h5n1-cattle-outbreak.yaml @@ -37,12 +37,6 @@ local_ingest: false subtype_lookup: h5n1-cattle-outbreak: ['h5n1', 'h5n2', 'h5n3', 'h5n4', 'h5n5', 'h5n6', 'h5n7', 'h5n8', 'h5n9'] -#### Parameters which control large overarching aspects of the build -# Set a high target_sequences_per_tree to capture all circulating strains, as they will be pruned down -# as part of the workflow -target_sequences_per_tree: 10_000 - - #### Config files #### reference: config/h5n1/reference_h5n1_{segment}.gb # use H5N1 references genome_reference: config/{subtype}/h5_cattle_genome_root.gb @@ -58,6 +52,11 @@ description: config/{subtype}/description_{subtype}.md #### Rule-specific parameters #### filter: + # Set a high target_sequences_per_tree to capture all circulating strains, as they will be pruned down + # as part of the workflow + target_sequences_per_tree: + "*/*/*": 10_000 + min_length: "*/pb2/*": 2100 # Note: could use "h5n1-cattle-outbreak/pb2/default: 2100" if desired "*/pb1/*": 2100