Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add assign_clade method to CladeTime class #57

Merged
merged 13 commits into from
Nov 13, 2024
29 changes: 15 additions & 14 deletions src/cladetime/util/reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def _get_s3_object_url(bucket_name: str, object_key: str, date: datetime) -> Tup


def _run_nextclade_cli(
nextclade_cli_version: str, nextclade_command: list[str], output_file: Path, input_files: list[Path] | None = None
nextclade_cli_version: str, nextclade_command: list[str], output_path: Path, input_files: list[Path] | None = None
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Chipping away at #52. The docker run command only needs to know about the output_path (for volume mounting). The actual file name for Nextclade CLI output is specified in nextclade_command.

) -> Path:
"""Invoke Nextclade CLI commands via Docker."""

Expand All @@ -83,7 +83,6 @@ def _run_nextclade_cli(
"Unable to create client for Nextstrain CLI. Is Docker installed and running?"
) from err

output_path = output_file.parent
volumes = {str(output_path): {"bind": "/data/", "mode": "rw"}}

# if the nextclade command requires input files, add those to the volumes
Expand All @@ -92,9 +91,10 @@ def _run_nextclade_cli(
for file in input_files:
volumes[str(file)] = {"bind": f"/data/{file.name}", "mode": "rw"}

image = f"nextstrain/nextclade:{nextclade_cli_version}"
try:
client.containers.run(
image=f"nextstrain/nextclade:{nextclade_cli_version}",
image=image,
command=nextclade_command,
volumes=volumes,
remove=True,
Expand All @@ -104,14 +104,13 @@ def _run_nextclade_cli(
msg = "Error running Nextclade CLI via Docker"
logger.error(
msg,
cli_version=nextclade_cli_version,
image=image,
command=nextclade_command,
volumes=volumes,
error=err,
)
raise NextcladeNotAvailableError(msg) from err

return output_file


def _get_nextclade_dataset(
nextclade_cli_version: str, dataset_name: str, dataset_version: str, output_path: Path
Expand Down Expand Up @@ -159,13 +158,13 @@ def _get_nextclade_dataset(
f"/data/{zip_filename}",
]

_run_nextclade_cli(nextclade_cli_version, command, output_file)
_run_nextclade_cli(nextclade_cli_version, command, output_path)

return output_file


def _get_clade_assignments(
nextclade_cli_version: str, sequence_file: Path, nextclade_dataset: Path, output_path: Path
nextclade_cli_version: str, sequence_file: Path, nextclade_dataset: Path, output_file: Path
) -> Path:
"""Assign clades to sequences using the Nextclade CLI.

Expand All @@ -186,8 +185,8 @@ def _get_clade_assignments(
that contains the reference tree and root sequence to use
for clade assignment. Use :func:`get_nextclade_dataset` to
get a dataset that corresponds to a specific point in time.
output_path : pathlib.Path
Where to save the clade assignment file
output_file : pathlib.Path
The full filename to use for saving the clade assignment output.

Returns
-------
Expand All @@ -202,9 +201,11 @@ def _get_clade_assignments(
If there is an error creating a Docker client or running Nextclade
CLI commands using the Docker image.
"""
assignment_filename = "nextclade_assignment.csv"
output_file = output_path / assignment_filename
output_path.parent.mkdir(parents=True, exist_ok=True)
if not output_file.suffix:
raise ValueError("output_file should be a full path to the output file, including filename")
output_path = output_file.parent
output_path.mkdir(parents=True, exist_ok=True)
assignment_filename = output_file.name

# all files in the input_files list will be mounted to
# the docker image's "/data/" directory when running
Expand All @@ -222,6 +223,6 @@ def _get_clade_assignments(
f"/data/{sequence_file.name}",
]

_run_nextclade_cli(nextclade_cli_version, command, output_file, input_files=input_files)
_run_nextclade_cli(nextclade_cli_version, command, output_path, input_files=input_files)

return output_file
8 changes: 4 additions & 4 deletions tests/integration/test_nextclade_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ def test_get_clade_assignments(test_file_path, tmp_path):
sequence_file = test_file_path / "test_sequences.fasta"
nextclade_dataset = test_file_path / "test_nextclade_dataset.zip"
# _get_clade_assignments should create the output directory if it doesn't exist
output_path = tmp_path / "clade_assignments"
output_file = tmp_path / "clade_assignments" / "nextclade_assignments.csv"

assignment_file = _get_clade_assignments("latest", sequence_file, nextclade_dataset, output_path)
assignment_file = _get_clade_assignments("latest", sequence_file, nextclade_dataset, output_file)
assignment_df = pl.read_csv(assignment_file, separator=";").select(
["seqName", "clade", "clade_nextstrain", "Nextclade_pango"]
)
Expand All @@ -49,9 +49,9 @@ def test_get_clade_assignments_no_matches(test_file_path, tmp_path):
sequence_file = test_file_path / "test_sequences_fake.fasta"
nextclade_dataset = test_file_path / "test_nextclade_dataset.zip"
# _get_clade_assignments should create the output directory if it doesn't exist
output_path = tmp_path / "clade_assignments"
output_file = tmp_path / "clade_assignments" / "nextclade_assignments.csv"

assignment_file = _get_clade_assignments("latest", sequence_file, nextclade_dataset, output_path)
assignment_file = _get_clade_assignments("latest", sequence_file, nextclade_dataset, output_file)
assignment_df = pl.read_csv(assignment_file, separator=";").select(
["seqName", "clade", "clade_nextstrain", "Nextclade_pango"]
)
Expand Down