From c3cd0a3b829c14bbf53db2edabd733de980c8ce2 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Tue, 28 Jan 2025 10:31:15 +0100 Subject: [PATCH] feat: use nightly nextclade tree - [x] switch Nextclade dataset to directory format (which allows replacing dataset files) - [x] replace reference tree in the dataset with the nightly tree from https://nextstrain.org/staging/nextclade/sars-cov-2 This allows us to bypass laggy Nextclade dataset updates and always use the latest data, which may or may not be what we want. This is intended as a workaround until the dataset updates are sorted out. Potential problems: - nightly trees are not systematically reviewed and can contain bugs - do any other parts of the dataset need to be updated along with the tree? (such as pathogen.json) - do any other repos need to be updated to use the nightly tree? (e.g. ncov-ingest) i.e. is there an assumption that the exact same dataset is used in 2 or more places? --- workflow/snakemake_rules/main_workflow.smk | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index 8834897bf..9d4727d38 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -455,14 +455,20 @@ rule prepare_nextclade: Downloading reference files for nextclade (used for alignment and qc). 
""" output: - nextclade_dataset = "data/sars-cov-2-nextclade-defaults.zip", + nextclade_dataset = "data/sars-cov-2-nextclade-defaults", params: name = config["nextclade_dataset"], conda: config["conda_environment"] shell: r""" nextclade --version - nextclade dataset get --name {params.name} --output-zip {output.nextclade_dataset} + nextclade dataset get --name {params.name} --output-dir {output.nextclade_dataset} + + # override tree.json with nightly tree + curl -fsSL \ + -o {output.nextclade_dataset}/tree.json \ + -H "Accept: application/vnd.nextstrain.dataset.main+json;q=1, application/json;q=0.9, text/plain;q=0.8, */*;q=0.1" \ + "https://nextstrain.org/staging/nextclade/sars-cov-2" """ rule build_align: @@ -473,7 +479,7 @@ rule build_align: """ input: sequences = rules.combine_samples.output.sequences, - nextclade_dataset = "data/sars-cov-2-nextclade-defaults.zip", + nextclade_dataset = "data/sars-cov-2-nextclade-defaults", output: alignment = "results/{build_name}/aligned.fasta", nextclade_qc = 'results/{build_name}/nextclade_qc.tsv',