From c316794689ab18046e4fed68d6f34be103b78c6a Mon Sep 17 00:00:00 2001 From: Becky Sweger Date: Tue, 6 Aug 2024 13:25:47 -0400 Subject: [PATCH] Streamline Nextclade interactions This changeset replaces a Nextstrain API call (to get the COV-SARS-2 reference sequence and a tree) with a call to the datasets function of the Nextclade CLI. The resulting download is a .zip file that can be used as input to the Nextclade CLI's run function when assigning sequences to clades. --- Dockerfile | 2 +- src/virus_clade_utils/assign_clades.py | 36 +++----------- src/virus_clade_utils/util/reference.py | 66 ++++++++++++------------- tests/unit/util/test_reference.py | 59 +++------------------- 4 files changed, 47 insertions(+), 116 deletions(-) diff --git a/Dockerfile b/Dockerfile index 356f5f3..849ba7b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,7 +7,7 @@ # Pulling the nextclade image to grab the CLI binary is kind of overkill, but it's # an easy way to control the version we're using. # TODO: fix up the reference tree/root sequence so we can use a newer version -FROM nextstrain/nextclade:3.3.1 +FROM nextstrain/nextclade:3.8.2 FROM python:3.12-slim-bookworm COPY --from=0 /usr/bin/nextclade /opt/src/virus_clade_utils/bin/ diff --git a/src/virus_clade_utils/assign_clades.py b/src/virus_clade_utils/assign_clades.py index 098678a..c5ffb00 100644 --- a/src/virus_clade_utils/assign_clades.py +++ b/src/virus_clade_utils/assign_clades.py @@ -1,5 +1,4 @@ import datetime -import json import os import subprocess @@ -8,7 +7,7 @@ import structlog from virus_clade_utils.util.config import Config -from virus_clade_utils.util.reference import get_reference_data +from virus_clade_utils.util.reference import get_nextclade_dataset from virus_clade_utils.util.sequence import ( get_covid_genome_data, parse_sequence_assignments, @@ -67,25 +66,7 @@ def get_sequence_metadata(config: Config): logger.info("extracted sequence metadata", metadata_file=config.ncbi_sequence_metadata_file) -def save_reference_info(config: Config): - """Download a reference tree and save it to a file.""" - - reference = get_reference_data(config.nextclade_base_url, config.reference_tree_date) - - with open(config.reference_tree_file, "w") as f: - json.dump(reference, f) - - with open(config.root_sequence_file, "w") as f: - f.write(reference["root_sequence"]) - - logger.info( - "Reference data saved", - tree_path=str(config.reference_tree_file), - root_sequence_path=str(config.root_sequence_file), - ) - - -def assign_clades(config: Config): +def assign_clades(config: Config, nextclade_dataset_path: str): """Assign downloaded genbank sequences to a clade.""" logger.info("Assigning sequences to clades using reference tree") @@ -94,13 +75,11 @@ def assign_clades(config: Config): [ "nextclade", "run", - "--input-tree", - f"{config.reference_tree_file}", - "--input-ref", - f"{config.root_sequence_file}", + f"{config.ncbi_sequence_file}", + "--input-dataset", + nextclade_dataset_path, "--output-csv", f"{config.assignment_no_metadata_file}", - f"{config.ncbi_sequence_file}", ] ) @@ -172,11 +151,10 @@ def main(sequence_released_since_date: datetime.date, reference_tree_date: datet logger.info("Starting pipeline", reference_tree_date=reference_tree_date, run_time=config.run_time) os.makedirs(config.data_path, exist_ok=True) + nextclade_dataset_path = get_nextclade_dataset(config.reference_tree_date, config.data_path) get_sequences(config) get_sequence_metadata(config) - save_reference_info(config) - assign_clades(config) - + assign_clades(config, nextclade_dataset_path) merged_data = merge_metadata(config) merged_data.write_csv(config.assignment_file) diff --git a/src/virus_clade_utils/util/reference.py b/src/virus_clade_utils/util/reference.py index b915e31..48bc027 100644 --- a/src/virus_clade_utils/util/reference.py +++ b/src/virus_clade_utils/util/reference.py @@ -1,46 +1,42 @@ """Functions for retrieving and parsing SARS-CoV-2 phylogenic tree data.""" -import requests +import subprocess +from pathlib import Path + import structlog -from virus_clade_utils.util.session import check_response, get_session logger = structlog.get_logger() -def get_reference_data(base_url: str, as_of_date: str) -> dict: - """Return a reference tree as of a given date in YYYY-MM-DD format.""" - headers = { - "Accept": "application/vnd.nextstrain.dataset.main+json", - } - session = get_session() - session.headers.update(headers) - - response = requests.get(f"{base_url}@{as_of_date}", headers=headers) - check_response(response) - reference_data = response.json() +def get_nextclade_dataset(as_of_date: str, data_path_root: str) -> str: + """ + Return the Nextclade dataset relevant to a specified as_of_date. The dataset is + in .zip format and contains two components required for assignming virus + genome sequences to clades: a tree and the reference sequence of the virus. + """ + + # Until Nextstrain provides this information, we're hard-coding a + # a specific version of the nextclade dataset here. + as_of_date = "not yet implemented" + DATASET_VERSION = "2024-07-17--12-57-03Z" + DATASET_PATH = Path(f"{data_path_root}/nextclade_dataset_{DATASET_VERSION}.zip") + + subprocess.run( + [ + "nextclade", + "dataset", + "get", + "--name", + "sars-cov-2", + "--tag", + DATASET_VERSION, + "--output-zip", + str(DATASET_PATH), + ] + ) logger.info( - "Reference data retrieved", - tree_updated=reference_data["meta"].get("updated"), + "Nextclade reference dataset retrieved", as_of_date=as_of_date, version=DATASET_VERSION, output_zip=DATASET_PATH ) - reference = { - "tree": reference_data["tree"], - "meta": reference_data["meta"], - } - - try: - # response schema: https://raw.githubusercontent.com/nextstrain/augur/HEAD/augur/data/schema-export-v2.json - # root sequence schema: https://raw.githubusercontent.com/nextstrain/augur/HEAD/augur/data/schema-export-root-sequence.json - # this code adds a fasta-compliant header to the root sequence returned by the API - fasta_root_header = ( - ">NC_045512.2 Severe acute respiratory syndrome" " coronavirus 2 isolate Wuhan-Hu-1, complete genome" - ) - root_sequence = reference_data["root_sequence"]["nuc"] - reference["root_sequence"] = f"{fasta_root_header}\n{root_sequence}" - except KeyError: - # Older versions of the dataset don't include a root_sequence. - logger.error("Aborting pipeline: no root sequence found in reference data.", as_of_date=as_of_date) - raise SystemExit(f"\nAborting pipeline: no root sequence found for date {as_of_date}") - - return reference + return DATASET_PATH diff --git a/tests/unit/util/test_reference.py b/tests/unit/util/test_reference.py index a36dd18..3fe76b0 100644 --- a/tests/unit/util/test_reference.py +++ b/tests/unit/util/test_reference.py @@ -1,56 +1,13 @@ from unittest import mock -import pytest -from requests import Response -from virus_clade_utils.util.reference import get_reference_data +from virus_clade_utils.util.reference import get_nextclade_dataset -@pytest.fixture -def get_nextclade_response(): - def _get_nextclade_response(): - return { - "tree": "cladesandstuff", - "meta": {"updated": "2021-09-01"}, - "root_sequence": {"nuc": "fastasequence"}, - } +@mock.patch("subprocess.run") +def test_get_nextclade_dataset(tmp_path): + dataset_path = get_nextclade_dataset("2021-09-01", tmp_path) - return _get_nextclade_response - - -@pytest.fixture -def get_nextclade_response_no_root(): - def _get_nextclade_response_no_root(): - return { - "tree": "cladesandstuff", - "meta": {"updated": "2021-09-01"}, - } - - return _get_nextclade_response_no_root - - -@mock.patch("requests.get") -def test_get_reference_data(mock_get, get_nextclade_response): - mock_response = Response() - mock_response.status_code = 200 - mock_response.json = get_nextclade_response - mock_get.return_value = mock_response - - reference = get_reference_data("www.fakenextclade.com", "2021-09-01") - - assert reference["tree"] == "cladesandstuff" - assert reference["meta"]["updated"] == "2021-09-01" - assert "fastasequence" in reference["root_sequence"] - - -@mock.patch("requests.get") -def test_missing_root_sequence(mock_get, get_nextclade_response_no_root, capsys): - mock_response = Response() - mock_response.status_code = 200 - mock_response.json = get_nextclade_response_no_root - mock_get.return_value = mock_response - - with pytest.raises(SystemExit): - get_reference_data("www.fakenextclade.com", "2021-09-01") - out, err = capsys.readouterr() - assert "no root sequence" in out - assert "2021-09-01" in out + # the dataset_path being returned should contain the correct nextclade + # datasetset version, as determined by the as_of_date being passed + # (returned version is temporarily hard-coded until Nextstrain provides the info we need) + assert "2024-07-17--12-57-03Z" in dataset_path[0]