WIP: prototype separate workflows as entrypoints

See <https://bedfordlab.slack.com/archives/C01LCTT7JNN/p1732568407123369> for context.

The workflows can be run in a number of different ways:

- From the 'avian-flu' repo:
  - `snakemake -s gisaid/Snakefile ...`
  - `cd gisaid && snakemake ...`
  - `snakemake --configfile gisaid/config.yaml`
- From a separate analysis directory, where ${AVIAN_FLU} is the path to the (locally checked out) avian-flu repo:
  - without any config overlays: `snakemake -s ${AVIAN_FLU}/gisaid/Snakefile`
  - with a `config.yaml` overlay: (same as above)
  - with a `foo.yaml` overlay: `snakemake -s ${AVIAN_FLU}/gisaid/Snakefile --configfile foo.yaml`
nextstrain · Nov 28, 2024 · 90b994c · 90b994c
1 parent 96a0ded
commit 90b994c
Show file tree

Hide file tree

Showing 6 changed files with 62 additions and 58 deletions.
diff --git a/Snakefile b/Snakefile
@@ -8,59 +8,33 @@ wildcard_constraints:
 SEGMENTS = ["pb2", "pb1", "pa", "ha","np", "na", "mp", "ns"]
 #SUBTYPES = ["h5n1", "h5nx", "h7n9", "h9n2"]
 
-# ----------------------------------------------------------------------------
-# Allow this to work from a separate workdir by using a config in that workdir
-# which extends one of our base configs
-# ----------------------------------------------------------------------------
-if os.path.exists("config.yaml"):
-    configfile: "config.yaml"
-    # See commentary below
-    # print("This doesn't work as expected! See commentary in Snakefile", file=sys.stderr)
-    # exit(2)
-
-if config.get('extends', False):
-    extend_path = os.path.join(workflow.basedir, "config", config['extends'])
-    if not os.path.isfile(extend_path):
-        sys.exit(f"Your config tried to extend {config['extends']!r} but this doesn't exist. It must be relative to {os.path.join(workflow.basedir, 'config')}")
-    configfile: extend_path
-
-# NOTE:
-# In the situation where we're running outside of the repo, and we have a custom config YAML
-# such as `foo.yaml`:
-#        extends: h5n1-cattle-outbreak.yaml
-#        segments: ['pb2']
-# If we run with `--configfile foo.yaml` then the merging behaviour is strange (to me!)
-# We've clearly parsed the --configfile, as we have config['extends']="h5n1-cattle-outbreak.yaml",
-# and we do merge in all the config values of `h5n1-cattle-outbreak.yaml` (via the above code)
-# so I expected we'd therefore have config['segments']=['genome', 'pb2', 'pb1', ...]
-# as defined in 'h5n1-cattle-outbreak.yaml', however we end up with only config['segments']=['pb2'].
-# So it seems like the `--configfile` definitions are being re-applied a second time?!?
-#
-# This _is not the case_ when we use the `os.path.exists("config.yaml")` approach,
-# which is why it's not going to work without the following additional update_config
-# step (or something else?)
+CURRENT_BASEDIR = workflow.current_basedir # TODO XXX store this value here - can't access within functions because workflow.included_stack is empty
 
+# Load the base config.yaml relative to the entry snakefile (i.e. not this snakefile)
+if os.path.exists(os.path.join(workflow.basedir, 'config.yaml')):
+    configfile: os.path.join(workflow.basedir, 'config.yaml')
+
+# load a config.yaml file if it exists in the current working directory
 if os.path.exists("config.yaml"):
-    # Following <https://github.com/snakemake/snakemake/blob/76d53290a003891c5ee41f81e8eb4821c406255d/snakemake/common/configfile.py#L7-L33>
-    import yte
-    with open("config.yaml", encoding='utf-8') as f:
-        overwrite_config = yte.process_yaml(f, require_use_yte=True)
-    snakemake.utils.update_config(config, overwrite_config)
+    configfile: "config.yaml"
 
 from pprint import pp; pp(config, stream=sys.stderr) # TODO XXX remove
 
+class InvalidConfigError(Exception):
+    pass
 
 def resolve_config_path(original_path, wildcards=None):
     """
-    Resolve a relative *path* given in a configuration value.
-    Resolves *path* as relative to the workflow's ``config/`` directory (i.e.
-    ``os.path.join(workflow.basedir, "config", path)``) if it doesn't exist
-    in the workflow's analysis directory (i.e. the current working
-    directory, or workdir, usually given by ``--directory`` (``-d``)).
-    This behaviour allows a default configuration value to point to a default
-    auxiliary file while also letting the file used be overridden either by
-    setting an alternate file path in the configuration or by creating a file
-    with the conventional name in the workflow's analysis directory.
+    Resolve a relative *path* given in a configuration value. Before resolving
+    any '{x}' substrings are replaced by their corresponding wildcards (if the
+    `wildcards` argument is provided).
+    
+    Search order (first match returned):
+    1. Relative to the analysis directory
+    2. Relative to the directory the entry snakefile was in. Typically this
+       is not the Snakefile you are looking at now but (e.g.) the one in
+       avian-flu/gisaid
+    3. Relative to where this Snakefile is (i.e. `avian-flu/`)
     """
     path = original_path.format(**wildcards) if wildcards else original_path
 
@@ -71,17 +45,30 @@ def resolve_config_path(original_path, wildcards=None):
             print(f"The call to `resolve_config_path({original_path!r})` includes unresolved wildcards - please include the wildcards as the second argument to `resolve_config_path`.", file=sys.stderr)
         exit(2)
 
-    if not os.path.exists(path):
-        # Check if the path exists relative to the basedir. This catches things like "config/…"
-        # as well as "clade-labeling/h5n1-clades.tsv"
-        basepath = os.path.join(workflow.basedir, path)
+    if os.path.exists(path): # isfile?
+        return path
+
+    # Check if the path exists relative to the subdir where the entry snakefile is
+    # (e.g. avian-flu/gisaid). If you want to use further subdirectories (e.g. avian-flu/gisaid/config/x.tsv)
+    # you're expected to supply the 'config/x.tsv' as the value in the config YAML
+    # NOTE: this means analysis directory overrides have to use that same 'config/x.tsv' structure, but
+    # given the different directories avian-flu uses that's acceptable. In other words, if we standardised
+    # avian-flu then we could add subdirectories to the search order here
+    basepath = os.path.join(workflow.basedir, path)
+    if os.path.exists(basepath):
+        return basepath
+
+    # Check if the path exists relative to where _this_ snakefile is, i.e. relative to `avian-flu/`.
+    if workflow.basedir != CURRENT_BASEDIR:
+        basepath = os.path.join(CURRENT_BASEDIR, path)
         if os.path.exists(basepath):
             return basepath
 
-        print(f"Unable to resolve the path {path!r} either within the working directory or within {workflow.basedir!r}", file=sys.stderr)
-        exit(2)
-
-    return path
+    raise InvalidConfigError(f"Unable to resolve the config-provided path {original_path!r}, expanded to {path!r} after filling in wildcards. "
+        f"The following directories were searched:\n"
+        f"\t1. {os.path.abspath(os.curdir)} (current working directory)\n"
+        f"\t2. {workflow.basedir} (where the entry snakefile is)\n"
+        f"\t3. {CURRENT_BASEDIR} (where the main avian-flu snakefile is)\n")
 
 
 # The config option `same_strains_per_segment=True'` (e.g. supplied to snakemake via --config command line argument)
@@ -95,6 +82,14 @@ S3_SRC = config.get('s3_src', {})
 LOCAL_INGEST = config.get('local_ingest', None)
 
 def sanity_check_config():
+    if not len(config.keys()):
+        print("-"*80 + "\nNo config loaded!", file=sys.stderr)
+        print("Avian-flu is indented to be run from the snakefile inside a subdir " 
+            "(e.g. gisaid/Snakefile) which will pick up the default configfile for that workflow. " 
+            "Alternatively you can pass in the config via `--configfile`", file=sys.stderr)
+        print("-"*80, file=sys.stderr)
+        raise InvalidConfigError("No config")
+
     assert LOCAL_INGEST or S3_SRC, "The config must define either 's3_src' or 'local_ingest'"
     # NOTE: we could relax the following exclusivity of S3_SRC and LOCAL_INGEST
     # if we want to use `--config local_ingest=gisaid` overrides.
@@ -293,7 +288,7 @@ rule add_h5_clade:
     output:
         metadata= "results/{subtype}/metadata-with-clade.tsv"
     params:
-        script = os.path.join(workflow.basedir, "clade-labeling/add-clades.py")
+        script = os.path.join(workflow.current_basedir, "clade-labeling/add-clades.py")
     shell:
         r"""
         python {params.script} \
@@ -570,7 +565,7 @@ rule cleavage_site:
         cleavage_site_annotations = "results/{subtype}/ha/{time}/cleavage-site.json",
         cleavage_site_sequences = "results/{subtype}/ha/{time}/cleavage-site-sequences.json"
     params:
-        script = os.path.join(workflow.basedir, "scripts/annotate-ha-cleavage-site.py")
+        script = os.path.join(workflow.current_basedir, "scripts/annotate-ha-cleavage-site.py")
     shell:
         """
         python {params.script} \

diff --git a/gisaid/Snakefile b/gisaid/Snakefile
@@ -0,0 +1,4 @@
+include: "../Snakefile"
+
+rule _all:
+    input: rules.all.input
diff --git a/config/gisaid.yaml → gisaid/config.yaml b/config/gisaid.yaml → gisaid/config.yaml
diff --git a/h5n1-cattle-outbreak/Snakefile b/h5n1-cattle-outbreak/Snakefile
@@ -0,0 +1,4 @@
+include: "../Snakefile"
+
+rule _all:
+    input: rules.all.input
diff --git a/config/h5n1-cattle-outbreak.yaml → h5n1-cattle-outbreak/config.yaml b/config/h5n1-cattle-outbreak.yaml → h5n1-cattle-outbreak/config.yaml
@@ -36,6 +36,7 @@ target_sequences_per_tree: 10_000
 
 
 #### Config files ####
+
 reference: config/h5n1/reference_h5n1_{segment}.gb  # use H5N1 references
 genome_reference: config/{subtype}/h5_cattle_genome_root.gb
 auspice_config: config/{subtype}/auspice_config_{subtype}.json

diff --git a/rules/cattle-flu.smk b/rules/cattle-flu.smk
@@ -76,7 +76,7 @@ rule join_segments:
         segment = 'genome',
         time = 'default',
     params:
-        script = os.path.join(workflow.basedir, "scripts/join-segments.py")
+        script = os.path.join(workflow.current_basedir, "../scripts/join-segments.py")
     shell:
         """
         python {params.script} \
@@ -147,7 +147,7 @@ rule prune_tree:
         subtype="h5n1-cattle-outbreak",
         time="default",
     params:
-        script = os.path.join(workflow.basedir, "scripts/restrict-via-common-ancestor.py")
+        script = os.path.join(workflow.current_basedir, "../scripts/restrict-via-common-ancestor.py")
     shell:
         r"""
         python3 {params.script} \
@@ -170,7 +170,7 @@ rule colors_genome:
         colors = "results/{subtype}/{segment}/{time}/colors.tsv",
     params:
         duplications = "division=division_metadata",
-        script = os.path.join(workflow.basedir, "scripts/assign-colors.py")
+        script = os.path.join(workflow.current_basedir, "../scripts/assign-colors.py")
     wildcard_constraints:
         subtype="h5n1-cattle-outbreak",
         time="default",