Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bsweger/more state permutations/37 #45

Merged
merged 8 commits into from
Oct 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@
pygments_style = "friendly"

# Show typehints as content of the function or method
autodoc_typehints = "description"
autodoc_typehints = "signature"
autodoc_member_order = "bysource"

# Open Graph metadata
Expand All @@ -135,6 +135,8 @@
nitpick_ignore = [
("py:class", "datetime"),
("py:class", "polars.LazyFrame"),
("py:class", "polars.dataframe.frame.DataFrame"),
("py:class", "polars.DataFrame"),
("py:class", "polars.lazyframe.frame.LazyFrame"),
]

Expand Down
3 changes: 3 additions & 0 deletions docs/reference/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ API Reference
.. toctree::

cladetime
sequence
types

6 changes: 6 additions & 0 deletions docs/reference/sequence.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=========
sequence
=========

.. autofunction:: cladetime.sequence.filter_sequence_metadata

7 changes: 7 additions & 0 deletions docs/reference/types.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
=====
types
=====


.. autoclass:: cladetime.types.StateFormat
:members:
4 changes: 1 addition & 3 deletions docs/user-guide.rst
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
===============
User Guide
===============

===========


Finding Nextstrain SARS-CoV-2 sequences and sequence metadata
Expand Down
4 changes: 4 additions & 0 deletions src/cladetime/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import sys

import structlog
Expand All @@ -6,6 +7,9 @@

__all__ = ["CladeTime"]

# tells us package to consider DC a state
os.environ["DC_STATEHOOD"] = "1"


def setup_logging():
shared_processors = [
Expand Down
10 changes: 0 additions & 10 deletions src/cladetime/_typing.py

This file was deleted.

4 changes: 2 additions & 2 deletions src/cladetime/assign_clades.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
import structlog

from cladetime import CladeTime
from cladetime.sequence import _download_from_url, filter_sequence_metadata
from cladetime.util.config import Config
from cladetime.util.sequence import _download_from_url, filter_covid_genome_metadata
from cladetime.util.session import _get_session
from cladetime.util.timing import time_function

Expand Down Expand Up @@ -60,7 +60,7 @@ def get_sequence_metadata(metadata: pl.DataFrame, sequence_collection_date: date
]

# clean and filter metadata (same process used to generate the weekly clade list)
filtered_metadata = filter_covid_genome_metadata(metadata, cols)
filtered_metadata = filter_sequence_metadata(metadata, cols)

# add filters based on user input
filtered_metadata = filtered_metadata.filter(pl.col("date") >= sequence_collection_date)
Expand Down
2 changes: 1 addition & 1 deletion src/cladetime/cladetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
import structlog

from cladetime.exceptions import CladeTimeFutureDateWarning, CladeTimeInvalidDateError, CladeTimeInvalidURLError
from cladetime.sequence import _get_ncov_metadata, get_covid_genome_metadata
from cladetime.util.config import Config
from cladetime.util.reference import _get_s3_object_url
from cladetime.util.sequence import _get_ncov_metadata, get_covid_genome_metadata

logger = structlog.get_logger()

Expand Down
263 changes: 263 additions & 0 deletions src/cladetime/sequence.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
"""Functions for retrieving and parsing SARS-CoV-2 virus genome data."""

import lzma
import os
from pathlib import Path
from urllib.parse import urlparse

import polars as pl
import structlog
import us
from requests import Session

from cladetime.types import StateFormat
from cladetime.util.session import _get_session
from cladetime.util.timing import time_function

logger = structlog.get_logger()


@time_function
def _download_from_url(session: Session, url: str, data_path: Path) -> Path:
"""Download a file from the specified URL and save it to data_path."""

parsed_url = urlparse(url)
url_filename = os.path.basename(parsed_url.path)
filename = data_path / url_filename

with session.get(url, stream=True) as result:
result.raise_for_status()
with open(filename, "wb") as f:
for chunk in result.iter_content(chunk_size=None):
f.write(chunk)

return filename


def get_covid_genome_metadata(
metadata_path: Path | None = None, metadata_url: str | None = None, num_rows: int | None = None
) -> pl.LazyFrame:
"""
Read GenBank genome metadata into a Polars LazyFrame.

Parameters
----------
metadata_path : Path | None
Path to location of a NextStrain GenBank genome metadata file.
Cannot be used with metadata_url.
metadata_url: str | None
URL to a NextStrain GenBank genome metadata file.
Cannot be used with metadata_path.
num_rows : int | None, default = None
The number of genome metadata rows to request.
When not supplied, request all rows.
"""

path_flag = metadata_path is not None
url_flag = metadata_url is not None

assert path_flag + url_flag == 1, "Specify metadata_path or metadata_url, but not both."

if metadata_url:
metadata = pl.scan_csv(metadata_url, separator="\t", n_rows=num_rows)
return metadata

if metadata_path:
if (compression_type := metadata_path.suffix) in [".tsv", ".zst"]:
metadata = pl.scan_csv(metadata_path, separator="\t", n_rows=num_rows)
elif compression_type == ".xz":
metadata = pl.read_csv(
lzma.open(metadata_path), separator="\t", n_rows=num_rows, infer_schema_length=100000
).lazy()

return metadata


def _get_ncov_metadata(
url_ncov_metadata: str,
session: Session | None = None,
) -> dict:
"""Return metadata emitted by the Nextstrain ncov pipeline."""
if not session:
session = _get_session(retry=False)

response = session.get(url_ncov_metadata)
if not response.ok:
logger.warn(
"Failed to retrieve ncov metadata",
status_code=response.status_code,
response_text=response.text,
request=response.request.url,
request_body=response.request.body,
)
return {}

metadata = response.json()
if metadata.get("nextclade_dataset_name", "").lower() == "sars-cov-2":
metadata["nextclade_dataset_name_full"] = "nextstrain/sars-cov-2/wuhan-hu-1/orfs"

return metadata


def filter_sequence_metadata(
metadata: pl.DataFrame | pl.LazyFrame, cols: list | None = None, state_format: StateFormat = StateFormat.ABBR
) -> pl.DataFrame | pl.LazyFrame:
"""Apply standard filters to Nextstrain's SARS-CoV-2 sequence metadata.

A helper function to apply commonly-used filters to a Polars DataFrame
or LazyFrame that represents Nextstrain's SARS-CoV-2 sequence metadata.
It filters on human sequences from the United States (including Puerto Rico
and Washington, DC).

This function also performs small transformations to the metadata,
such as casting the collection date to a date type, renaming columns,
and returning alternate state formats if requested.

Parameters
----------
metadata : :class:`polars.DataFrame` or :class:`polars.LazyFrame`
A Polars DataFrame or LazyFrame that represents SARS-CoV-2
sequence metadata produced by Nextstrain as an intermediate file in
their daily workflow. This parameter is often the
:attr:`cladetime.CladeTime.url_sequence_metadata` attribute
of a :class:`cladetime.CladeTime` object
cols : list
Optional. A list of columns to include in the filtered metadata.
The default columns included in the filtered metadata are:
clade_nextstrain, country, date, division, genbank_accession,
genbank_accession_rev, host
state_format : :class:`cladetime.types.StateFormat`
Optional. The state name format returned in the filtered metadata's
location column. Defaults to `StateFormat.ABBR`

Returns
-------
:class:`polars.DataFrame` or :class:`polars.LazyFrame`
A Polars object that represents the filtered SARS-CoV-2 sequence
metadata. The type of returned object will match the type of the
function's metadata parameter.

Raises
------
ValueError
If the state_format parameter is not a valid
:class:`cladetime.types.StateFormat`.

Notes
-----
This function will filter out metadata rows with invalid state names or
date strings that cannot be cast to a Polars date format.

Example:
--------
>>> from cladetime import CladeTime
>>> from cladetime.sequence import filter_covid_genome_metadata

Apply common filters to the sequence metadata of a CladeTime object:

>>> ct = CladeTime(seq_as_of="2024-10-15")
>>> ct = CladeTime(sequence_as_of="2024-10-15")
>>> filtered_metadata = filter_covid_genome_metadata(ct.sequence_metadata)
>>> filtered_metadata.collect().head(5)
shape: (5, 7)
┌───────┬─────────┬────────────┬────────────┬────────────┬──────────────┬──────┬
│ clade ┆ country ┆ date ┆ genbank_ ┆ genbank_ac ┆ host ┆ loca │
│ ┆ ┆ ┆ accession ┆ cession_rev┆ ┆ tion │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ date ┆ str ┆ str ┆ str ┆ str │
│ ┆ ┆ ┆ ┆ ┆ ┆ │
╞═══════╪═════════╪════════════╪════════════╪════════════╪══════════════╪══════╡
│ 22A ┆ USA ┆ 2022-07-07 ┆ PP223234 ┆ PP223234.1 ┆ Homo sapiens ┆ AL │
│ 22B ┆ USA ┆ 2022-07-02 ┆ PP223435 ┆ PP223435.1 ┆ Homo sapiens ┆ AZ │
│ 22B ┆ USA ┆ 2022-07-19 ┆ PP223235 ┆ PP223235.1 ┆ Homo sapiens ┆ AZ │
│ 22B ┆ USA ┆ 2022-07-15 ┆ PP223236 ┆ PP223236.1 ┆ Homo sapiens ┆ AZ │
│ 22B ┆ USA ┆ 2022-07-20 ┆ PP223237 ┆ PP223237.1 ┆ Homo sapiens ┆ AZ │
└───────┴─────────┴────────────┴────────────┴────────────┴─────────────────────┴
"""
if state_format not in StateFormat:
raise ValueError(f"Invalid state_format. Must be one of: {list(StateFormat.__members__.items())}")

# Default columns to include in the filtered metadata
if cols is None:
cols = [
"clade_nextstrain",
"country",
"date",
"division",
"genbank_accession",
"genbank_accession_rev",
"host",
]

# There are some other odd divisions in the data, but these are 50 states, DC and PR
states = [state.name for state in us.states.STATES]
states.extend(["Washington DC", "District of Columbia", "Puerto Rico"])

# Filter dataset and do some general tidying
filtered_metadata = (
metadata.select(cols)
.filter(
pl.col("country") == "USA",
pl.col("division").is_in(states),
pl.col("host") == "Homo sapiens",
)
.rename({"clade_nextstrain": "clade"})
.cast({"date": pl.Date}, strict=False)
# date filtering at the end ensures we filter out null
# values created by the above .cast operation
.filter(
pl.col("date").is_not_null(),
)
)

# Create state mappings based on state_format parameter, including a DC alias, since
# Nextrain's metadata uses a different name than the us package
if state_format == StateFormat.FIPS:
state_dict = {state.name: state.fips for state in us.states.STATES_AND_TERRITORIES}
state_dict["Washington DC"] = us.states.DC.fips
elif state_format == StateFormat.ABBR:
state_dict = {state.name: state.abbr for state in us.states.STATES_AND_TERRITORIES}
state_dict["Washington DC"] = us.states.DC.abbr
else:
state_dict = {state.name: state.name for state in us.states.STATES_AND_TERRITORIES}
state_dict["Washington DC"] = "Washington DC"

filtered_metadata = filtered_metadata.with_columns(pl.col("division").replace(state_dict).alias("location")).drop(
"division"
)

return filtered_metadata


def get_clade_counts(filtered_metadata: pl.LazyFrame) -> pl.LazyFrame:
"""Return a count of clades by location and date."""

cols = [
"clade",
"country",
"date",
"location",
"host",
]

counts = filtered_metadata.select(cols).group_by("location", "date", "clade").agg(pl.len().alias("count"))

return counts


def parse_sequence_assignments(df_assignments: pl.DataFrame) -> pl.DataFrame:
"""Parse out the sequence number from the seqName column returned by the clade assignment tool."""

# polars apparently can't split out the sequence number from that big name column
# without resorting an apply, so here we're dropping into pandas to do that
# (might be a premature optimization, since this manoever requires both pandas and pyarrow)
seq = pl.from_pandas(df_assignments.to_pandas()["seqName"].str.split(" ").str[0].rename("seq"))

# we're expecting one row per sequence
if seq.n_unique() != df_assignments.shape[0]:
raise ValueError("Clade assignment data contains duplicate sequence. Stopping assignment process.")

# add the parsed sequence number as a new column
df_assignments = df_assignments.insert_column(1, seq) # type: ignore

return df_assignments
14 changes: 14 additions & 0 deletions src/cladetime/types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""Type aliases for this package."""

from enum import StrEnum


class StateFormat(StrEnum):
"""Options for formatting state names in sequence metadata"""

ABBR = "abbr"
"""Format states as two-letter abbreviations"""
NAME = "name"
"""Format states as full names"""
FIPS = "fips"
"""Format states as FIPS codes"""
Loading