From c316794689ab18046e4fed68d6f34be103b78c6a Mon Sep 17 00:00:00 2001
From: Becky Sweger <rsweger@umass.edu>
Date: Tue, 6 Aug 2024 13:25:47 -0400
Subject: [PATCH] Streamline Nextclade interactions

This changeset replaces a Nextstrain API call (to get the COV-SARS-2
reference sequence and a tree) with a call to the datasets function
of the Nextclade CLI. The resulting download is a .zip file that
can be used as input to the Nextclade CLI's run function when
assigning sequences to clades.
---
 Dockerfile                              |  2 +-
 src/virus_clade_utils/assign_clades.py  | 36 +++-----------
 src/virus_clade_utils/util/reference.py | 66 ++++++++++++-------------
 tests/unit/util/test_reference.py       | 59 +++-------------------
 4 files changed, 47 insertions(+), 116 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 356f5f3..849ba7b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,7 +7,7 @@
 # Pulling the nextclade image to grab the CLI binary is kind of overkill, but it's
 # an easy way to control the version we're using.
 # TODO: fix up the reference tree/root sequence so we can use a newer version
-FROM nextstrain/nextclade:3.3.1
+FROM nextstrain/nextclade:3.8.2
 
 FROM python:3.12-slim-bookworm
 COPY --from=0 /usr/bin/nextclade /opt/src/virus_clade_utils/bin/
diff --git a/src/virus_clade_utils/assign_clades.py b/src/virus_clade_utils/assign_clades.py
index 098678a..c5ffb00 100644
--- a/src/virus_clade_utils/assign_clades.py
+++ b/src/virus_clade_utils/assign_clades.py
@@ -1,5 +1,4 @@
 import datetime
-import json
 import os
 import subprocess
 
@@ -8,7 +7,7 @@
 import structlog
 
 from virus_clade_utils.util.config import Config
-from virus_clade_utils.util.reference import get_reference_data
+from virus_clade_utils.util.reference import get_nextclade_dataset
 from virus_clade_utils.util.sequence import (
     get_covid_genome_data,
     parse_sequence_assignments,
@@ -67,25 +66,7 @@ def get_sequence_metadata(config: Config):
     logger.info("extracted sequence metadata", metadata_file=config.ncbi_sequence_metadata_file)
 
 
-def save_reference_info(config: Config):
-    """Download a reference tree and save it to a file."""
-
-    reference = get_reference_data(config.nextclade_base_url, config.reference_tree_date)
-
-    with open(config.reference_tree_file, "w") as f:
-        json.dump(reference, f)
-
-    with open(config.root_sequence_file, "w") as f:
-        f.write(reference["root_sequence"])
-
-    logger.info(
-        "Reference data saved",
-        tree_path=str(config.reference_tree_file),
-        root_sequence_path=str(config.root_sequence_file),
-    )
-
-
-def assign_clades(config: Config):
+def assign_clades(config: Config, nextclade_dataset_path: str):
     """Assign downloaded genbank sequences to a clade."""
 
     logger.info("Assigning sequences to clades using reference tree")
@@ -94,13 +75,11 @@ def assign_clades(config: Config):
         [
             "nextclade",
             "run",
-            "--input-tree",
-            f"{config.reference_tree_file}",
-            "--input-ref",
-            f"{config.root_sequence_file}",
+            f"{config.ncbi_sequence_file}",
+            "--input-dataset",
+            nextclade_dataset_path,
             "--output-csv",
             f"{config.assignment_no_metadata_file}",
-            f"{config.ncbi_sequence_file}",
         ]
     )
 
@@ -172,11 +151,10 @@ def main(sequence_released_since_date: datetime.date, reference_tree_date: datet
     logger.info("Starting pipeline", reference_tree_date=reference_tree_date, run_time=config.run_time)
 
     os.makedirs(config.data_path, exist_ok=True)
+    nextclade_dataset_path = get_nextclade_dataset(config.reference_tree_date, config.data_path)
     get_sequences(config)
     get_sequence_metadata(config)
-    save_reference_info(config)
-    assign_clades(config)
-
+    assign_clades(config, nextclade_dataset_path)
     merged_data = merge_metadata(config)
     merged_data.write_csv(config.assignment_file)
 
diff --git a/src/virus_clade_utils/util/reference.py b/src/virus_clade_utils/util/reference.py
index b915e31..48bc027 100644
--- a/src/virus_clade_utils/util/reference.py
+++ b/src/virus_clade_utils/util/reference.py
@@ -1,46 +1,42 @@
 """Functions for retrieving and parsing SARS-CoV-2 phylogenic tree data."""
 
-import requests
+import subprocess
+from pathlib import Path
+
 import structlog
-from virus_clade_utils.util.session import check_response, get_session
 
 logger = structlog.get_logger()
 
 
-def get_reference_data(base_url: str, as_of_date: str) -> dict:
-    """Return a reference tree as of a given date in YYYY-MM-DD format."""
-    headers = {
-        "Accept": "application/vnd.nextstrain.dataset.main+json",
-    }
-    session = get_session()
-    session.headers.update(headers)
-
-    response = requests.get(f"{base_url}@{as_of_date}", headers=headers)
-    check_response(response)
-    reference_data = response.json()
+def get_nextclade_dataset(as_of_date: str, data_path_root: str) -> str:
+    """
+    Return the Nextclade dataset relevant to a specified as_of_date. The dataset is
+    in .zip format and contains two components required for assignming virus
+    genome sequences to clades: a tree and the reference sequence of the virus.
+    """
+
+    # Until Nextstrain provides this information, we're hard-coding a
+    # a specific version of the nextclade dataset here.
+    as_of_date = "not yet implemented"
+    DATASET_VERSION = "2024-07-17--12-57-03Z"
+    DATASET_PATH = Path(f"{data_path_root}/nextclade_dataset_{DATASET_VERSION}.zip")
+
+    subprocess.run(
+        [
+            "nextclade",
+            "dataset",
+            "get",
+            "--name",
+            "sars-cov-2",
+            "--tag",
+            DATASET_VERSION,
+            "--output-zip",
+            str(DATASET_PATH),
+        ]
+    )
 
     logger.info(
-        "Reference data retrieved",
-        tree_updated=reference_data["meta"].get("updated"),
+        "Nextclade reference dataset retrieved", as_of_date=as_of_date, version=DATASET_VERSION, output_zip=DATASET_PATH
     )
 
-    reference = {
-        "tree": reference_data["tree"],
-        "meta": reference_data["meta"],
-    }
-
-    try:
-        # response schema: https://raw.githubusercontent.com/nextstrain/augur/HEAD/augur/data/schema-export-v2.json
-        # root sequence schema: https://raw.githubusercontent.com/nextstrain/augur/HEAD/augur/data/schema-export-root-sequence.json
-        # this code adds a fasta-compliant header to the root sequence returned by the API
-        fasta_root_header = (
-            ">NC_045512.2 Severe acute respiratory syndrome" " coronavirus 2 isolate Wuhan-Hu-1, complete genome"
-        )
-        root_sequence = reference_data["root_sequence"]["nuc"]
-        reference["root_sequence"] = f"{fasta_root_header}\n{root_sequence}"
-    except KeyError:
-        # Older versions of the dataset don't include a root_sequence.
-        logger.error("Aborting pipeline: no root sequence found in reference data.", as_of_date=as_of_date)
-        raise SystemExit(f"\nAborting pipeline: no root sequence found for date {as_of_date}")
-
-    return reference
+    return DATASET_PATH
diff --git a/tests/unit/util/test_reference.py b/tests/unit/util/test_reference.py
index a36dd18..3fe76b0 100644
--- a/tests/unit/util/test_reference.py
+++ b/tests/unit/util/test_reference.py
@@ -1,56 +1,13 @@
 from unittest import mock
 
-import pytest
-from requests import Response
-from virus_clade_utils.util.reference import get_reference_data
+from virus_clade_utils.util.reference import get_nextclade_dataset
 
 
-@pytest.fixture
-def get_nextclade_response():
-    def _get_nextclade_response():
-        return {
-            "tree": "cladesandstuff",
-            "meta": {"updated": "2021-09-01"},
-            "root_sequence": {"nuc": "fastasequence"},
-        }
+@mock.patch("subprocess.run")
+def test_get_nextclade_dataset(tmp_path):
+    dataset_path = get_nextclade_dataset("2021-09-01", tmp_path)
 
-    return _get_nextclade_response
-
-
-@pytest.fixture
-def get_nextclade_response_no_root():
-    def _get_nextclade_response_no_root():
-        return {
-            "tree": "cladesandstuff",
-            "meta": {"updated": "2021-09-01"},
-        }
-
-    return _get_nextclade_response_no_root
-
-
-@mock.patch("requests.get")
-def test_get_reference_data(mock_get, get_nextclade_response):
-    mock_response = Response()
-    mock_response.status_code = 200
-    mock_response.json = get_nextclade_response
-    mock_get.return_value = mock_response
-
-    reference = get_reference_data("www.fakenextclade.com", "2021-09-01")
-
-    assert reference["tree"] == "cladesandstuff"
-    assert reference["meta"]["updated"] == "2021-09-01"
-    assert "fastasequence" in reference["root_sequence"]
-
-
-@mock.patch("requests.get")
-def test_missing_root_sequence(mock_get, get_nextclade_response_no_root, capsys):
-    mock_response = Response()
-    mock_response.status_code = 200
-    mock_response.json = get_nextclade_response_no_root
-    mock_get.return_value = mock_response
-
-    with pytest.raises(SystemExit):
-        get_reference_data("www.fakenextclade.com", "2021-09-01")
-    out, err = capsys.readouterr()
-    assert "no root sequence" in out
-    assert "2021-09-01" in out
+    # the dataset_path being returned should contain the correct nextclade
+    # datasetset version, as determined by the as_of_date being passed
+    # (returned version is temporarily hard-coded until Nextstrain provides the info we need)
+    assert "2024-07-17--12-57-03Z" in dataset_path[0]