Skip to content

Commit

Permalink
Streamline Nextclade interactions
Browse files Browse the repository at this point in the history
This changeset replaces a Nextstrain API call (to get the COV-SARS-2
reference sequence and a tree) with a call to the datasets function
of the Nextclade CLI. The resulting download is a .zip file that
can be used as input to the Nextclade CLI's run function when
assigning sequences to clades.
  • Loading branch information
bsweger committed Aug 6, 2024
1 parent d484eb7 commit c316794
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 116 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
# Pulling the nextclade image to grab the CLI binary is kind of overkill, but it's
# an easy way to control the version we're using.
# TODO: fix up the reference tree/root sequence so we can use a newer version
FROM nextstrain/nextclade:3.3.1
FROM nextstrain/nextclade:3.8.2

FROM python:3.12-slim-bookworm
COPY --from=0 /usr/bin/nextclade /opt/src/virus_clade_utils/bin/
Expand Down
36 changes: 7 additions & 29 deletions src/virus_clade_utils/assign_clades.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import datetime
import json
import os
import subprocess

Expand All @@ -8,7 +7,7 @@
import structlog

from virus_clade_utils.util.config import Config
from virus_clade_utils.util.reference import get_reference_data
from virus_clade_utils.util.reference import get_nextclade_dataset
from virus_clade_utils.util.sequence import (
get_covid_genome_data,
parse_sequence_assignments,
Expand Down Expand Up @@ -67,25 +66,7 @@ def get_sequence_metadata(config: Config):
logger.info("extracted sequence metadata", metadata_file=config.ncbi_sequence_metadata_file)


def save_reference_info(config: Config):
"""Download a reference tree and save it to a file."""

reference = get_reference_data(config.nextclade_base_url, config.reference_tree_date)

with open(config.reference_tree_file, "w") as f:
json.dump(reference, f)

with open(config.root_sequence_file, "w") as f:
f.write(reference["root_sequence"])

logger.info(
"Reference data saved",
tree_path=str(config.reference_tree_file),
root_sequence_path=str(config.root_sequence_file),
)


def assign_clades(config: Config):
def assign_clades(config: Config, nextclade_dataset_path: str):
"""Assign downloaded genbank sequences to a clade."""

logger.info("Assigning sequences to clades using reference tree")
Expand All @@ -94,13 +75,11 @@ def assign_clades(config: Config):
[
"nextclade",
"run",
"--input-tree",
f"{config.reference_tree_file}",
"--input-ref",
f"{config.root_sequence_file}",
f"{config.ncbi_sequence_file}",
"--input-dataset",
nextclade_dataset_path,
"--output-csv",
f"{config.assignment_no_metadata_file}",
f"{config.ncbi_sequence_file}",
]
)

Expand Down Expand Up @@ -172,11 +151,10 @@ def main(sequence_released_since_date: datetime.date, reference_tree_date: datet
logger.info("Starting pipeline", reference_tree_date=reference_tree_date, run_time=config.run_time)

os.makedirs(config.data_path, exist_ok=True)
nextclade_dataset_path = get_nextclade_dataset(config.reference_tree_date, config.data_path)
get_sequences(config)
get_sequence_metadata(config)
save_reference_info(config)
assign_clades(config)

assign_clades(config, nextclade_dataset_path)
merged_data = merge_metadata(config)
merged_data.write_csv(config.assignment_file)

Expand Down
66 changes: 31 additions & 35 deletions src/virus_clade_utils/util/reference.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,42 @@
"""Functions for retrieving and parsing SARS-CoV-2 phylogenic tree data."""

import requests
import subprocess
from pathlib import Path

import structlog
from virus_clade_utils.util.session import check_response, get_session

logger = structlog.get_logger()


def get_reference_data(base_url: str, as_of_date: str) -> dict:
"""Return a reference tree as of a given date in YYYY-MM-DD format."""
headers = {
"Accept": "application/vnd.nextstrain.dataset.main+json",
}
session = get_session()
session.headers.update(headers)

response = requests.get(f"{base_url}@{as_of_date}", headers=headers)
check_response(response)
reference_data = response.json()
def get_nextclade_dataset(as_of_date: str, data_path_root: str) -> str:
"""
Return the Nextclade dataset relevant to a specified as_of_date. The dataset is
in .zip format and contains two components required for assignming virus
genome sequences to clades: a tree and the reference sequence of the virus.
"""

# Until Nextstrain provides this information, we're hard-coding a
# a specific version of the nextclade dataset here.
as_of_date = "not yet implemented"
DATASET_VERSION = "2024-07-17--12-57-03Z"
DATASET_PATH = Path(f"{data_path_root}/nextclade_dataset_{DATASET_VERSION}.zip")

subprocess.run(
[
"nextclade",
"dataset",
"get",
"--name",
"sars-cov-2",
"--tag",
DATASET_VERSION,
"--output-zip",
str(DATASET_PATH),
]
)

logger.info(
"Reference data retrieved",
tree_updated=reference_data["meta"].get("updated"),
"Nextclade reference dataset retrieved", as_of_date=as_of_date, version=DATASET_VERSION, output_zip=DATASET_PATH
)

reference = {
"tree": reference_data["tree"],
"meta": reference_data["meta"],
}

try:
# response schema: https://raw.githubusercontent.com/nextstrain/augur/HEAD/augur/data/schema-export-v2.json
# root sequence schema: https://raw.githubusercontent.com/nextstrain/augur/HEAD/augur/data/schema-export-root-sequence.json
# this code adds a fasta-compliant header to the root sequence returned by the API
fasta_root_header = (
">NC_045512.2 Severe acute respiratory syndrome" " coronavirus 2 isolate Wuhan-Hu-1, complete genome"
)
root_sequence = reference_data["root_sequence"]["nuc"]
reference["root_sequence"] = f"{fasta_root_header}\n{root_sequence}"
except KeyError:
# Older versions of the dataset don't include a root_sequence.
logger.error("Aborting pipeline: no root sequence found in reference data.", as_of_date=as_of_date)
raise SystemExit(f"\nAborting pipeline: no root sequence found for date {as_of_date}")

return reference
return DATASET_PATH
59 changes: 8 additions & 51 deletions tests/unit/util/test_reference.py
Original file line number Diff line number Diff line change
@@ -1,56 +1,13 @@
from unittest import mock

import pytest
from requests import Response
from virus_clade_utils.util.reference import get_reference_data
from virus_clade_utils.util.reference import get_nextclade_dataset


@pytest.fixture
def get_nextclade_response():
def _get_nextclade_response():
return {
"tree": "cladesandstuff",
"meta": {"updated": "2021-09-01"},
"root_sequence": {"nuc": "fastasequence"},
}
@mock.patch("subprocess.run")
def test_get_nextclade_dataset(tmp_path):
dataset_path = get_nextclade_dataset("2021-09-01", tmp_path)

return _get_nextclade_response


@pytest.fixture
def get_nextclade_response_no_root():
def _get_nextclade_response_no_root():
return {
"tree": "cladesandstuff",
"meta": {"updated": "2021-09-01"},
}

return _get_nextclade_response_no_root


@mock.patch("requests.get")
def test_get_reference_data(mock_get, get_nextclade_response):
mock_response = Response()
mock_response.status_code = 200
mock_response.json = get_nextclade_response
mock_get.return_value = mock_response

reference = get_reference_data("www.fakenextclade.com", "2021-09-01")

assert reference["tree"] == "cladesandstuff"
assert reference["meta"]["updated"] == "2021-09-01"
assert "fastasequence" in reference["root_sequence"]


@mock.patch("requests.get")
def test_missing_root_sequence(mock_get, get_nextclade_response_no_root, capsys):
mock_response = Response()
mock_response.status_code = 200
mock_response.json = get_nextclade_response_no_root
mock_get.return_value = mock_response

with pytest.raises(SystemExit):
get_reference_data("www.fakenextclade.com", "2021-09-01")
out, err = capsys.readouterr()
assert "no root sequence" in out
assert "2021-09-01" in out
# the dataset_path being returned should contain the correct nextclade
# datasetset version, as determined by the as_of_date being passed
# (returned version is temporarily hard-coded until Nextstrain provides the info we need)
assert "2024-07-17--12-57-03Z" in dataset_path[0]

0 comments on commit c316794

Please sign in to comment.