Skip to content

Commit

Permalink
Add ncov_metadata property to Tree class
Browse files Browse the repository at this point in the history
Since it's possible to mix and match sequence_as_of and
tree_as_of dates in cladetime, sequences and reference
trees may have different ncov_metadata attributes
(for example, the dataset version or the Nextclade CLI version).
Add an ncov_metadata property to Tree that reflects
metadata for the tree_as_of date (as opposed to
CladeTime's ncov_metadata property, which reflects
sequence_as_of).

We'll use this new property to make sure we're using
the correct nextclade dataset when assigning
clades.
  • Loading branch information
bsweger committed Nov 6, 2024
1 parent a593c16 commit f07d85c
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 21 deletions.
55 changes: 34 additions & 21 deletions src/cladetime/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import structlog

from cladetime import CladeTime
from cladetime import CladeTime, sequence
from cladetime.exceptions import NextcladeNotAvailableError, TreeNotAvailableError
from cladetime.util.reference import _docker_installed, _get_nextclade_dataset, _get_s3_object_url
from cladetime.util.sequence import _get_ncov_metadata
Expand All @@ -35,9 +35,19 @@ def __init__(self, clade_time: CladeTime):
"""Tree constructor."""
self._clade_time = clade_time
self.as_of = self._clade_time.tree_as_of
self._nextclade_data_url = self._clade_time._config.nextclade_data_url
self._nextclade_data_url_version = self._clade_time._config.nextclade_data_url_version
self._tree_name = self._clade_time._config.nextclade_input_tree_name
self._config = self._clade_time._config
self._nextclade_data_url = self._config.nextclade_data_url
self._nextclade_data_url_version = self._config.nextclade_data_url_version
self._tree_name = self._config.nextclade_input_tree_name

# Nextstrain began publishing ncov pipeline metadata starting on 2024-08-01
if self.as_of >= self._config.nextstrain_min_ncov_metadata_date:
self.url_ncov_metadata = _get_s3_object_url(
self._config.nextstrain_ncov_bucket, self._config.nextstrain_ncov_metadata_key, self.as_of
)[1]
else:
self.url_ncov_metadata = None
self._ncov_metadata = self.ncov_metadata
self._url = self.url

def __repr__(self):
Expand All @@ -47,6 +57,19 @@ def __repr__(self):
def __str__(self):
    """Return a short, human-readable summary that includes the as_of date."""
    # as_of is formatted as an ISO-style calendar date (YYYY-MM-DD)
    as_of_date = self.as_of.strftime("%Y-%m-%d")
    return f"Represents Nexclade reference tree data as of {as_of_date}"

@property
def ncov_metadata(self) -> dict:
    """
    dict : Metadata from the Nextstrain pipeline run that corresponds
    to as_of.

    An empty dict is returned when url_ncov_metadata is not set. A
    non-empty result is stored on the instance as _ncov_metadata and
    reused, so reading this property several times in a row does not
    retrieve the same metadata again.
    """
    # Reuse metadata that was already retrieved for this instance.
    cached = getattr(self, "_ncov_metadata", None)
    if cached:
        return cached

    # Without a metadata URL there is nothing to retrieve.
    if not self.url_ncov_metadata:
        return {}

    metadata = sequence._get_ncov_metadata(self.url_ncov_metadata)
    if metadata:
        # Only non-empty results are kept, so an empty response is
        # looked up again on the next read instead of being remembered.
        self._ncov_metadata = metadata
    return metadata

@property
def url(self) -> str:
"""
Expand Down Expand Up @@ -100,15 +123,15 @@ def _get_tree_url(self):

# we can only reliably retrieve a past reference tree if we
# have access to the ncov metadata for that date
min_tree_as_of = self._clade_time._config.nextstrain_min_ncov_metadata_date
min_tree_as_of = self._config.nextstrain_min_ncov_metadata_date
if min_tree_as_of > self.as_of:
logger.error("Reference tree not available", tree_as_of=self.as_of)
raise TreeNotAvailableError(
f"Reference tree not available for {self.as_of} (earliest available tree date is {min_tree_as_of})"
)

# get the ncov metadata as of the CladeTime's tree_as_of date
url_ncov_metadata = self._get_url_ncov_metadata()
url_ncov_metadata = self.url_ncov_metadata

if url_ncov_metadata is None:
logger.error("Reference tree not available", tree_as_of=self.clade_time.tree_as_of)
Expand All @@ -125,14 +148,6 @@ def _get_tree_url(self):
)
return tree_url

def _get_url_ncov_metadata(self):
    """Return the URL of the ncov metadata file for this tree's as_of date."""
    config = self._clade_time._config
    s3_object = _get_s3_object_url(
        config.nextstrain_ncov_bucket,
        config.nextstrain_ncov_metadata_key,
        self.as_of,
    )
    # the URL is the second item of the helper's return value
    return s3_object[1]

def _get_reference_tree(self) -> dict:
"""Return a reference tree used for SARS-CoV-2 clade assignments
Expand All @@ -147,18 +162,16 @@ def _get_reference_tree(self) -> dict:
A Python dictionary that represents the reference tree.
"""
# get the ncov metadata as of the CladeTime's tree_as_of date
url_ncov_metadata = self._get_url_ncov_metadata()
if url_ncov_metadata is None:
if self.url_ncov_metadata is None:
logger.error("Reference tree not available", tree_as_of=self.as_of)
raise TreeNotAvailableError(f"Reference tree not available for {self.as_of}")

ncov_metadata = _get_ncov_metadata(url_ncov_metadata)
nextclade_version_num = ncov_metadata.get("nextclade_version_num", "")
nextclade_dataset_name = ncov_metadata.get("nextclade_dataset_name", "")
nextclade_dataset_version = ncov_metadata.get("nextclade_dataset_version", "")
nextclade_version_num = self.ncov_metadata.get("nextclade_version_num", "")
nextclade_dataset_name = self.ncov_metadata.get("nextclade_dataset_name", "")
nextclade_dataset_version = self.ncov_metadata.get("nextclade_dataset_version", "")
if not all([nextclade_version_num, nextclade_dataset_name, nextclade_dataset_version]):
logger.error("Incomplete ncov metadata", tree_as_of=self._clade_time.tree_as_of)
raise TreeNotAvailableError(f"Incomplete ncov metadata {ncov_metadata}")
raise TreeNotAvailableError(f"Incomplete ncov metadata {self.ncov_metadata}")

with tempfile.TemporaryDirectory() as tmpdir:
nextclade_dataset = _get_nextclade_dataset(
Expand Down
13 changes: 13 additions & 0 deletions tests/integration/test_tree.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from datetime import datetime
from urllib.parse import urlparse

import pytest
Expand All @@ -24,6 +25,18 @@ def test__get_tree_url_bad_date():
Tree(CladeTime(tree_as_of="2024-07-13"))


def test_tree_ncov_metadata():
    """Check that Tree and CladeTime report ncov metadata for their own as_of dates."""
    frozen_now = "2024-11-05 16:21:34"
    with freeze_time(frozen_now):
        # The sequence date (frozen "now") and the tree date are deliberately
        # different, so the two ncov_metadata properties are expected to
        # describe different Nextstrain pipeline runs.
        clade_time = CladeTime(tree_as_of="2024-08-02", sequence_as_of=datetime.now())
        reference_tree = Tree(clade_time)

        # metadata reported by the tree (tree_as_of)
        assert reference_tree.ncov_metadata.get("nextclade_version_num") == "3.8.2"
        assert reference_tree.ncov_metadata.get("nextclade_dataset_version") == "2024-07-17--12-57-03Z"

        # metadata reported by the CladeTime object (sequence_as_of)
        assert clade_time.ncov_metadata.get("nextclade_version_num") == "3.9.1"
        assert clade_time.ncov_metadata.get("nextclade_dataset_version") == "2024-10-17--16-48-48Z"


@pytest.mark.skipif(not docker_enabled, reason="Docker is not installed")
def test__get_reference_tree():
with freeze_time("2024-08-13 16:21:34"):
Expand Down

0 comments on commit f07d85c

Please sign in to comment.