-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This changeset replaces a Nextstrain API call (to get the SARS-CoV-2 reference sequence and a tree) with a call to the `dataset get` command of the Nextclade CLI. The resulting download is a .zip file that can be used as input to the Nextclade CLI's `run` command when assigning sequences to clades.
- Loading branch information
Showing
4 changed files
with
47 additions
and
116 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,46 +1,42 @@ | ||
"""Functions for retrieving and parsing SARS-CoV-2 phylogenetic tree data.""" | ||
|
||
import requests | ||
import subprocess | ||
from pathlib import Path | ||
|
||
import structlog | ||
from virus_clade_utils.util.session import check_response, get_session | ||
|
||
logger = structlog.get_logger() | ||
|
||
|
||
def get_reference_data(base_url: str, as_of_date: str) -> dict: | ||
"""Return a reference tree as of a given date in YYYY-MM-DD format.""" | ||
headers = { | ||
"Accept": "application/vnd.nextstrain.dataset.main+json", | ||
} | ||
session = get_session() | ||
session.headers.update(headers) | ||
|
||
response = requests.get(f"{base_url}@{as_of_date}", headers=headers) | ||
check_response(response) | ||
reference_data = response.json() | ||
def get_nextclade_dataset(as_of_date: str, data_path_root: str) -> str: | ||
""" | ||
Return the Nextclade dataset relevant to a specified as_of_date. The dataset is | ||
in .zip format and contains two components required for assigning virus | ||
genome sequences to clades: a tree and the reference sequence of the virus. | ||
""" | ||
|
||
# Until Nextstrain provides this information, we're hard-coding | ||
# a specific version of the nextclade dataset here. | ||
as_of_date = "not yet implemented" | ||
DATASET_VERSION = "2024-07-17--12-57-03Z" | ||
DATASET_PATH = Path(f"{data_path_root}/nextclade_dataset_{DATASET_VERSION}.zip") | ||
|
||
subprocess.run( | ||
[ | ||
"nextclade", | ||
"dataset", | ||
"get", | ||
"--name", | ||
"sars-cov-2", | ||
"--tag", | ||
DATASET_VERSION, | ||
"--output-zip", | ||
str(DATASET_PATH), | ||
] | ||
) | ||
|
||
logger.info( | ||
"Reference data retrieved", | ||
tree_updated=reference_data["meta"].get("updated"), | ||
"Nextclade reference dataset retrieved", as_of_date=as_of_date, version=DATASET_VERSION, output_zip=DATASET_PATH | ||
) | ||
|
||
reference = { | ||
"tree": reference_data["tree"], | ||
"meta": reference_data["meta"], | ||
} | ||
|
||
try: | ||
# response schema: https://raw.githubusercontent.com/nextstrain/augur/HEAD/augur/data/schema-export-v2.json | ||
# root sequence schema: https://raw.githubusercontent.com/nextstrain/augur/HEAD/augur/data/schema-export-root-sequence.json | ||
# this code adds a fasta-compliant header to the root sequence returned by the API | ||
fasta_root_header = ( | ||
">NC_045512.2 Severe acute respiratory syndrome" " coronavirus 2 isolate Wuhan-Hu-1, complete genome" | ||
) | ||
root_sequence = reference_data["root_sequence"]["nuc"] | ||
reference["root_sequence"] = f"{fasta_root_header}\n{root_sequence}" | ||
except KeyError: | ||
# Older versions of the dataset don't include a root_sequence. | ||
logger.error("Aborting pipeline: no root sequence found in reference data.", as_of_date=as_of_date) | ||
raise SystemExit(f"\nAborting pipeline: no root sequence found for date {as_of_date}") | ||
|
||
return reference | ||
return DATASET_PATH |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,56 +1,13 @@ | ||
from unittest import mock | ||
|
||
import pytest | ||
from requests import Response | ||
from virus_clade_utils.util.reference import get_reference_data | ||
from virus_clade_utils.util.reference import get_nextclade_dataset | ||
|
||
|
||
@pytest.fixture | ||
def get_nextclade_response(): | ||
def _get_nextclade_response(): | ||
return { | ||
"tree": "cladesandstuff", | ||
"meta": {"updated": "2021-09-01"}, | ||
"root_sequence": {"nuc": "fastasequence"}, | ||
} | ||
@mock.patch("subprocess.run") | ||
def test_get_nextclade_dataset(tmp_path): | ||
dataset_path = get_nextclade_dataset("2021-09-01", tmp_path) | ||
|
||
return _get_nextclade_response | ||
|
||
|
||
@pytest.fixture | ||
def get_nextclade_response_no_root(): | ||
def _get_nextclade_response_no_root(): | ||
return { | ||
"tree": "cladesandstuff", | ||
"meta": {"updated": "2021-09-01"}, | ||
} | ||
|
||
return _get_nextclade_response_no_root | ||
|
||
|
||
@mock.patch("requests.get") | ||
def test_get_reference_data(mock_get, get_nextclade_response): | ||
mock_response = Response() | ||
mock_response.status_code = 200 | ||
mock_response.json = get_nextclade_response | ||
mock_get.return_value = mock_response | ||
|
||
reference = get_reference_data("www.fakenextclade.com", "2021-09-01") | ||
|
||
assert reference["tree"] == "cladesandstuff" | ||
assert reference["meta"]["updated"] == "2021-09-01" | ||
assert "fastasequence" in reference["root_sequence"] | ||
|
||
|
||
@mock.patch("requests.get") | ||
def test_missing_root_sequence(mock_get, get_nextclade_response_no_root, capsys): | ||
mock_response = Response() | ||
mock_response.status_code = 200 | ||
mock_response.json = get_nextclade_response_no_root | ||
mock_get.return_value = mock_response | ||
|
||
with pytest.raises(SystemExit): | ||
get_reference_data("www.fakenextclade.com", "2021-09-01") | ||
out, err = capsys.readouterr() | ||
assert "no root sequence" in out | ||
assert "2021-09-01" in out | ||
# the dataset_path being returned should contain the correct nextclade | ||
# dataset version, as determined by the as_of_date being passed | ||
# (returned version is temporarily hard-coded until Nextstrain provides the info we need) | ||
assert "2024-07-17--12-57-03Z" in dataset_path[0] |