diff --git a/docs/conf.py b/docs/conf.py index e969f1f..45da7dd 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -117,7 +117,7 @@ pygments_style = "friendly" # Show typehints as content of the function or method -autodoc_typehints = "description" +autodoc_typehints = "signature" autodoc_member_order = "bysource" # Open Graph metadata @@ -135,6 +135,8 @@ nitpick_ignore = [ ("py:class", "datetime"), ("py:class", "polars.LazyFrame"), + ("py:class", "polars.dataframe.frame.DataFrame"), + ("py:class", "polars.DataFrame"), ("py:class", "polars.lazyframe.frame.LazyFrame"), ] diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 7660209..2ddb1d7 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -4,3 +4,6 @@ API Reference .. toctree:: cladetime + sequence + types + diff --git a/docs/reference/sequence.rst b/docs/reference/sequence.rst new file mode 100644 index 0000000..49f4800 --- /dev/null +++ b/docs/reference/sequence.rst @@ -0,0 +1,6 @@ +========= +sequence +========= + +.. autofunction:: cladetime.sequence.filter_sequence_metadata + diff --git a/docs/reference/types.rst b/docs/reference/types.rst new file mode 100644 index 0000000..a577dd1 --- /dev/null +++ b/docs/reference/types.rst @@ -0,0 +1,7 @@ +===== +types +===== + + +.. 
autoclass:: cladetime.types.StateFormat + :members: diff --git a/docs/user-guide.rst b/docs/user-guide.rst index fb1f6aa..e02f56e 100644 --- a/docs/user-guide.rst +++ b/docs/user-guide.rst @@ -1,7 +1,5 @@ -=============== User Guide -=============== - +=========== Finding Nextstrain SARS-CoV-2 sequences and sequence metadata diff --git a/src/cladetime/__init__.py b/src/cladetime/__init__.py index 0822bea..d4bb66a 100644 --- a/src/cladetime/__init__.py +++ b/src/cladetime/__init__.py @@ -1,3 +1,4 @@ +import os import sys import structlog @@ -6,6 +7,9 @@ __all__ = ["CladeTime"] +# tells us package to consider DC a state +os.environ["DC_STATEHOOD"] = "1" + def setup_logging(): shared_processors = [ diff --git a/src/cladetime/_typing.py b/src/cladetime/_typing.py deleted file mode 100644 index 05ebc43..0000000 --- a/src/cladetime/_typing.py +++ /dev/null @@ -1,10 +0,0 @@ -"""Type aliases for this package.""" - -from pathlib import Path -from typing import TypeAlias, Union - -from cloudpathlib import AnyPath, CloudPath - -# Data types -# Pathlike: TypeAlias = Path | AnyPath | CloudPath -Pathlike: TypeAlias = Union["Path", "AnyPath", "CloudPath"] diff --git a/src/cladetime/assign_clades.py b/src/cladetime/assign_clades.py index 0f26d40..1b48ab6 100644 --- a/src/cladetime/assign_clades.py +++ b/src/cladetime/assign_clades.py @@ -11,8 +11,8 @@ import structlog from cladetime import CladeTime +from cladetime.sequence import _download_from_url, filter_sequence_metadata from cladetime.util.config import Config -from cladetime.util.sequence import _download_from_url, filter_covid_genome_metadata from cladetime.util.session import _get_session from cladetime.util.timing import time_function @@ -60,7 +60,7 @@ def get_sequence_metadata(metadata: pl.DataFrame, sequence_collection_date: date ] # clean and filter metadata (same process used to generate the weekly clade list) - filtered_metadata = filter_covid_genome_metadata(metadata, cols) + filtered_metadata = 
filter_sequence_metadata(metadata, cols) # add filters based on user input filtered_metadata = filtered_metadata.filter(pl.col("date") >= sequence_collection_date) diff --git a/src/cladetime/cladetime.py b/src/cladetime/cladetime.py index d54bb87..ee6ded3 100644 --- a/src/cladetime/cladetime.py +++ b/src/cladetime/cladetime.py @@ -7,9 +7,9 @@ import structlog from cladetime.exceptions import CladeTimeFutureDateWarning, CladeTimeInvalidDateError, CladeTimeInvalidURLError +from cladetime.sequence import _get_ncov_metadata, get_covid_genome_metadata from cladetime.util.config import Config from cladetime.util.reference import _get_s3_object_url -from cladetime.util.sequence import _get_ncov_metadata, get_covid_genome_metadata logger = structlog.get_logger() diff --git a/src/cladetime/sequence.py b/src/cladetime/sequence.py new file mode 100644 index 0000000..c06ffbc --- /dev/null +++ b/src/cladetime/sequence.py @@ -0,0 +1,263 @@ +"""Functions for retrieving and parsing SARS-CoV-2 virus genome data.""" + +import lzma +import os +from pathlib import Path +from urllib.parse import urlparse + +import polars as pl +import structlog +import us +from requests import Session + +from cladetime.types import StateFormat +from cladetime.util.session import _get_session +from cladetime.util.timing import time_function + +logger = structlog.get_logger() + + +@time_function +def _download_from_url(session: Session, url: str, data_path: Path) -> Path: + """Download a file from the specified URL and save it to data_path.""" + + parsed_url = urlparse(url) + url_filename = os.path.basename(parsed_url.path) + filename = data_path / url_filename + + with session.get(url, stream=True) as result: + result.raise_for_status() + with open(filename, "wb") as f: + for chunk in result.iter_content(chunk_size=None): + f.write(chunk) + + return filename + + +def get_covid_genome_metadata( + metadata_path: Path | None = None, metadata_url: str | None = None, num_rows: int | None = None +) -> 
pl.LazyFrame: + """ + Read GenBank genome metadata into a Polars LazyFrame. + + Parameters + ---------- + metadata_path : Path | None + Path to location of a NextStrain GenBank genome metadata file. + Cannot be used with metadata_url. + metadata_url: str | None + URL to a NextStrain GenBank genome metadata file. + Cannot be used with metadata_path. + num_rows : int | None, default = None + The number of genome metadata rows to request. + When not supplied, request all rows. + """ + + path_flag = metadata_path is not None + url_flag = metadata_url is not None + + assert path_flag + url_flag == 1, "Specify metadata_path or metadata_url, but not both." + + if metadata_url: + metadata = pl.scan_csv(metadata_url, separator="\t", n_rows=num_rows) + return metadata + + if metadata_path: + if (compression_type := metadata_path.suffix) in [".tsv", ".zst"]: + metadata = pl.scan_csv(metadata_path, separator="\t", n_rows=num_rows) + elif compression_type == ".xz": + metadata = pl.read_csv( + lzma.open(metadata_path), separator="\t", n_rows=num_rows, infer_schema_length=100000 + ).lazy() + + return metadata + + +def _get_ncov_metadata( + url_ncov_metadata: str, + session: Session | None = None, +) -> dict: + """Return metadata emitted by the Nextstrain ncov pipeline.""" + if not session: + session = _get_session(retry=False) + + response = session.get(url_ncov_metadata) + if not response.ok: + logger.warn( + "Failed to retrieve ncov metadata", + status_code=response.status_code, + response_text=response.text, + request=response.request.url, + request_body=response.request.body, + ) + return {} + + metadata = response.json() + if metadata.get("nextclade_dataset_name", "").lower() == "sars-cov-2": + metadata["nextclade_dataset_name_full"] = "nextstrain/sars-cov-2/wuhan-hu-1/orfs" + + return metadata + + +def filter_sequence_metadata( + metadata: pl.DataFrame | pl.LazyFrame, cols: list | None = None, state_format: StateFormat = StateFormat.ABBR +) -> pl.DataFrame | pl.LazyFrame: + 
"""Apply standard filters to Nextstrain's SARS-CoV-2 sequence metadata. + + A helper function to apply commonly-used filters to a Polars DataFrame + or LazyFrame that represents Nextstrain's SARS-CoV-2 sequence metadata. + It filters on human sequences from the United States (including Puerto Rico + and Washington, DC). + + This function also performs small transformations to the metadata, + such as casting the collection date to a date type, renaming columns, + and returning alternate state formats if requested. + + Parameters + ---------- + metadata : :class:`polars.DataFrame` or :class:`polars.LazyFrame` + A Polars DataFrame or LazyFrame that represents SARS-CoV-2 + sequence metadata produced by Nextstrain as an intermediate file in + their daily workflow. This parameter is often the + :attr:`cladetime.CladeTime.url_sequence_metadata` attribute + of a :class:`cladetime.CladeTime` object + cols : list + Optional. A list of columns to include in the filtered metadata. + The default columns included in the filtered metadata are: + clade_nextstrain, country, date, division, genbank_accession, + genbank_accession_rev, host + state_format : :class:`cladetime.types.StateFormat` + Optional. The state name format returned in the filtered metadata's + location column. Defaults to `StateFormat.ABBR` + + Returns + ------- + :class:`polars.DataFrame` or :class:`polars.LazyFrame` + A Polars object that represents the filtered SARS-CoV-2 sequence + metadata. The type of returned object will match the type of the + function's metadata parameter. + + Raises + ------ + ValueError + If the state_format parameter is not a valid + :class:`cladetime.types.StateFormat`. + + Notes + ----- + This function will filter out metadata rows with invalid state names or + date strings that cannot be cast to a Polars date format. 
+ + Examples + -------- + >>> from cladetime import CladeTime + >>> from cladetime.sequence import filter_sequence_metadata + + Apply common filters to the sequence metadata of a CladeTime object: + + >>> ct = CladeTime(sequence_as_of="2024-10-15") + >>> filtered_metadata = filter_sequence_metadata(ct.sequence_metadata) + >>> filtered_metadata.collect().head(5) + shape: (5, 7) + ┌───────┬─────────┬────────────┬────────────┬────────────┬──────────────┬──────┬ + │ clade ┆ country ┆ date ┆ genbank_ ┆ genbank_ac ┆ host ┆ loca │ + │ ┆ ┆ ┆ accession ┆ cession_rev┆ ┆ tion │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ date ┆ str ┆ str ┆ str ┆ str │ + │ ┆ ┆ ┆ ┆ ┆ ┆ │ + ╞═══════╪═════════╪════════════╪════════════╪════════════╪══════════════╪══════╡ + │ 22A ┆ USA ┆ 2022-07-07 ┆ PP223234 ┆ PP223234.1 ┆ Homo sapiens ┆ AL │ + │ 22B ┆ USA ┆ 2022-07-02 ┆ PP223435 ┆ PP223435.1 ┆ Homo sapiens ┆ AZ │ + │ 22B ┆ USA ┆ 2022-07-19 ┆ PP223235 ┆ PP223235.1 ┆ Homo sapiens ┆ AZ │ + │ 22B ┆ USA ┆ 2022-07-15 ┆ PP223236 ┆ PP223236.1 ┆ Homo sapiens ┆ AZ │ + │ 22B ┆ USA ┆ 2022-07-20 ┆ PP223237 ┆ PP223237.1 ┆ Homo sapiens ┆ AZ │ + └───────┴─────────┴────────────┴────────────┴────────────┴─────────────────────┴ + """ + if state_format not in StateFormat: + raise ValueError(f"Invalid state_format. 
Must be one of: {list(StateFormat.__members__.items())}") + + # Default columns to include in the filtered metadata + if cols is None: + cols = [ + "clade_nextstrain", + "country", + "date", + "division", + "genbank_accession", + "genbank_accession_rev", + "host", + ] + + # There are some other odd divisions in the data, but these are 50 states, DC and PR + states = [state.name for state in us.states.STATES] + states.extend(["Washington DC", "District of Columbia", "Puerto Rico"]) + + # Filter dataset and do some general tidying + filtered_metadata = ( + metadata.select(cols) + .filter( + pl.col("country") == "USA", + pl.col("division").is_in(states), + pl.col("host") == "Homo sapiens", + ) + .rename({"clade_nextstrain": "clade"}) + .cast({"date": pl.Date}, strict=False) + # date filtering at the end ensures we filter out null + # values created by the above .cast operation + .filter( + pl.col("date").is_not_null(), + ) + ) + + # Create state mappings based on state_format parameter, including a DC alias, since + # Nextrain's metadata uses a different name than the us package + if state_format == StateFormat.FIPS: + state_dict = {state.name: state.fips for state in us.states.STATES_AND_TERRITORIES} + state_dict["Washington DC"] = us.states.DC.fips + elif state_format == StateFormat.ABBR: + state_dict = {state.name: state.abbr for state in us.states.STATES_AND_TERRITORIES} + state_dict["Washington DC"] = us.states.DC.abbr + else: + state_dict = {state.name: state.name for state in us.states.STATES_AND_TERRITORIES} + state_dict["Washington DC"] = "Washington DC" + + filtered_metadata = filtered_metadata.with_columns(pl.col("division").replace(state_dict).alias("location")).drop( + "division" + ) + + return filtered_metadata + + +def get_clade_counts(filtered_metadata: pl.LazyFrame) -> pl.LazyFrame: + """Return a count of clades by location and date.""" + + cols = [ + "clade", + "country", + "date", + "location", + "host", + ] + + counts = 
filtered_metadata.select(cols).group_by("location", "date", "clade").agg(pl.len().alias("count")) + + return counts + + +def parse_sequence_assignments(df_assignments: pl.DataFrame) -> pl.DataFrame: + """Parse out the sequence number from the seqName column returned by the clade assignment tool.""" + + # polars apparently can't split out the sequence number from that big name column + # without resorting an apply, so here we're dropping into pandas to do that + # (might be a premature optimization, since this manoever requires both pandas and pyarrow) + seq = pl.from_pandas(df_assignments.to_pandas()["seqName"].str.split(" ").str[0].rename("seq")) + + # we're expecting one row per sequence + if seq.n_unique() != df_assignments.shape[0]: + raise ValueError("Clade assignment data contains duplicate sequence. Stopping assignment process.") + + # add the parsed sequence number as a new column + df_assignments = df_assignments.insert_column(1, seq) # type: ignore + + return df_assignments diff --git a/src/cladetime/types.py b/src/cladetime/types.py new file mode 100644 index 0000000..b6272d5 --- /dev/null +++ b/src/cladetime/types.py @@ -0,0 +1,14 @@ +"""Type aliases for this package.""" + +from enum import StrEnum + + +class StateFormat(StrEnum): + """Options for formatting state names in sequence metadata""" + + ABBR = "abbr" + """Format states as two-letter abbreviations""" + NAME = "name" + """Format states as full names""" + FIPS = "fips" + """Format states as FIPS codes""" diff --git a/src/cladetime/util/sequence.py b/src/cladetime/util/sequence.py index 85c45fa..71b55e3 100644 --- a/src/cladetime/util/sequence.py +++ b/src/cladetime/util/sequence.py @@ -1,171 +1,6 @@ -"""Functions for retrieving and parsing SARS-CoV-2 virus genome data.""" +"""cladetime.util.sequence moved to cladetime.sequence.""" -import lzma -import os -from pathlib import Path -from urllib.parse import urlparse - -import polars as pl -import structlog -import us -from requests import Session 
- -from cladetime.util.session import _get_session -from cladetime.util.timing import time_function - -logger = structlog.get_logger() - - -@time_function -def _download_from_url(session: Session, url: str, data_path: Path) -> Path: - """Download a file from the specified URL and save it to data_path.""" - - parsed_url = urlparse(url) - url_filename = os.path.basename(parsed_url.path) - filename = data_path / url_filename - - with session.get(url, stream=True) as result: - result.raise_for_status() - with open(filename, "wb") as f: - for chunk in result.iter_content(chunk_size=None): - f.write(chunk) - - return filename - - -def get_covid_genome_metadata( - metadata_path: Path | None = None, metadata_url: str | None = None, num_rows: int | None = None -) -> pl.LazyFrame: - """ - Read GenBank genome metadata into a Polars LazyFrame. - - Parameters - ---------- - metadata_path : Path | None - Path to location of a NextStrain GenBank genome metadata file. - Cannot be used with metadata_url. - metadata_url: str | None - URL to a NextStrain GenBank genome metadata file. - Cannot be used with metadata_path. - num_rows : int | None, default = None - The number of genome metadata rows to request. - When not supplied, request all rows. - """ - - path_flag = metadata_path is not None - url_flag = metadata_url is not None - - assert path_flag + url_flag == 1, "Specify metadata_path or metadata_url, but not both." 
- - if metadata_url: - metadata = pl.scan_csv(metadata_url, separator="\t", n_rows=num_rows) - return metadata - - if metadata_path: - if (compression_type := metadata_path.suffix) in [".tsv", ".zst"]: - metadata = pl.scan_csv(metadata_path, separator="\t", n_rows=num_rows) - elif compression_type == ".xz": - metadata = pl.read_csv( - lzma.open(metadata_path), separator="\t", n_rows=num_rows, infer_schema_length=100000 - ).lazy() - - return metadata - - -def _get_ncov_metadata( - url_ncov_metadata: str, - session: Session | None = None, -) -> dict: - """Return metadata emitted by the Nextstrain ncov pipeline.""" - if not session: - session = _get_session(retry=False) - - response = session.get(url_ncov_metadata) - if not response.ok: - logger.warn( - "Failed to retrieve ncov metadata", - status_code=response.status_code, - response_text=response.text, - request=response.request.url, - request_body=response.request.body, - ) - return {} - - metadata = response.json() - if metadata.get("nextclade_dataset_name", "").lower() == "sars-cov-2": - metadata["nextclade_dataset_name_full"] = "nextstrain/sars-cov-2/wuhan-hu-1/orfs" - - return metadata - - -def filter_covid_genome_metadata(metadata: pl.LazyFrame, cols: list = []) -> pl.LazyFrame: - """Apply a standard set of filters to the GenBank genome metadata.""" - - # Default columns to include in the filtered metadata - if len(cols) == 0: - cols = [ - "clade_nextstrain", - "country", - "date", - "division", - "genbank_accession", - "genbank_accession_rev", - "host", - ] - - # There are some other odd divisions in the data, but these are 50 states, DC and PR - states = [state.name for state in us.states.STATES] - states.extend(["Washington DC", "Puerto Rico"]) - - # Filter dataset and do some general tidying - filtered_metadata = ( - metadata.select(cols) - .filter( - pl.col("country") == "USA", - pl.col("division").is_in(states), - pl.col("host") == "Homo sapiens", - ) - .rename({"clade_nextstrain": "clade", "division": 
"location"}) - .cast({"date": pl.Date}, strict=False) - # date filtering at the end ensures we filter out null - # values created by the above .cast operation - .filter( - pl.col("date").is_not_null(), - ) - ) - - return filtered_metadata - - -def get_clade_counts(filtered_metadata: pl.LazyFrame) -> pl.LazyFrame: - """Return a count of clades by location and date.""" - - cols = [ - "clade", - "country", - "date", - "location", - "host", - ] - - counts = filtered_metadata.select(cols).group_by("location", "date", "clade").agg(pl.len().alias("count")) - - return counts - - -def parse_sequence_assignments(df_assignments: pl.DataFrame) -> pl.DataFrame: - """Parse out the sequence number from the seqName column returned by the clade assignment tool.""" - - # polars apparently can't split out the sequence number from that big name column - # without resorting an apply, so here we're dropping into pandas to do that - # (might be a premature optimization, since this manoever requires both pandas and pyarrow) - seq = pl.from_pandas(df_assignments.to_pandas()["seqName"].str.split(" ").str[0].rename("seq")) - - # we're expecting one row per sequence - if seq.n_unique() != df_assignments.shape[0]: - raise ValueError("Clade assignment data contains duplicate sequence. 
Stopping assignment process.") - - # add the parsed sequence number as a new column - df_assignments = df_assignments.insert_column(1, seq) # type: ignore - - return df_assignments +# For temporary backwards compatibility +from cladetime.sequence import _get_ncov_metadata as _get_ncov_metadata # noqa: F401 +from cladetime.sequence import filter_sequence_metadata as filter_covid_genome_metadata # noqa: F401 +from cladetime.sequence import get_clade_counts as get_clade_counts diff --git a/tests/unit/util/test_sequence.py b/tests/unit/util/test_sequence.py index 646250a..515d0df 100644 --- a/tests/unit/util/test_sequence.py +++ b/tests/unit/util/test_sequence.py @@ -1,13 +1,12 @@ -from collections import Counter from pathlib import Path import polars as pl import pytest -from cladetime.util.sequence import ( - filter_covid_genome_metadata, +from cladetime.sequence import ( + filter_sequence_metadata, get_covid_genome_metadata, - parse_sequence_assignments, ) +from cladetime.types import StateFormat @pytest.fixture @@ -78,7 +77,7 @@ def test_filter_covid_genome_metadata(): "Homo sapiens", ], "country": ["USA", "Argentina", "USA", "USA", "USA", "USA", "USA"], - "division": ["Alaska", "Maine", "Guam", "Puerto Rico", "Utah", "Pennsylvania", "Pennsylvania"], + "division": ["Alaska", "Maine", "Guam", "Puerto Rico", "Utah", "Washington DC", "Pennsylvania"], "clade_nextstrain": ["AAA", "BBB", "CCC", "DDD", "EEE", "FFF", "FFF"], "location": ["Vulcan", "Reisa", "Bajor", "Deep Space 9", "Earth", "Cardassia", "Cardassia"], "genbank_accession": ["A1", "A2", "B1", "B2", "C1", "C2", "C2"], @@ -87,9 +86,13 @@ def test_filter_covid_genome_metadata(): } lf_metadata = pl.LazyFrame(test_genome_metadata) - lf_filtered = filter_covid_genome_metadata(lf_metadata) + lf_filtered = filter_sequence_metadata(lf_metadata).collect() - assert len(lf_filtered.collect()) == 2 + assert len(lf_filtered) == 2 + + locations = lf_filtered["location"].to_list() + locations.sort() + assert locations == 
["AK", "DC"] actual_schema = lf_filtered.collect_schema() expected_schema = pl.Schema( @@ -97,27 +100,58 @@ def test_filter_covid_genome_metadata(): "clade": pl.String, "country": pl.String, "date": pl.Date, - "location": pl.String, "genbank_accession": pl.String, "genbank_accession_rev": pl.String, "host": pl.String, + "location": pl.String, } ) assert actual_schema == expected_schema -def test_parse_sequence_assignments(df_assignments): - result = parse_sequence_assignments(df_assignments) +def test_filter_covid_genome_metadata_state_name(): + num_test_rows = 4 + test_genome_metadata = { + "date": ["2022-01-01"] * num_test_rows, + "host": ["Homo sapiens"] * num_test_rows, + "country": ["USA"] * num_test_rows, + "clade_nextstrain": ["AAA"] * num_test_rows, + "location": ["Earth"] * num_test_rows, + "genbank_accession": ["A1"] * num_test_rows, + "genbank_accession_rev": ["A1.1"] * num_test_rows, + "division": ["Alaska", "Puerto Rico", "Washington DC", "Fake State"], + } + + lf_metadata = pl.LazyFrame(test_genome_metadata) + lf_filtered = filter_sequence_metadata(lf_metadata, state_format=StateFormat.NAME) + lf_filtered = lf_filtered.collect() + + # Un-mapped states are dropped from dataset + assert len(lf_filtered) == 3 + + locations = set(lf_filtered["location"].to_list()) + assert locations == {"Alaska", "Puerto Rico", "Washington DC"} - # resulting dataframe should have an additional column called "seq" - assert Counter(result.columns) == Counter(["seqName", "clade", "seq"]) - # check resulting sequence numbers - assert Counter(result["seq"].to_list()) == Counter(["PP782799.1", "ABCDEFG", "12345678"]) +def test_filter_covid_genome_metadata_state_fips(): + num_test_rows = 4 + test_genome_metadata = { + "date": ["2022-01-01"] * num_test_rows, + "host": ["Homo sapiens"] * num_test_rows, + "country": ["USA"] * num_test_rows, + "clade_nextstrain": ["AAA"] * num_test_rows, + "location": ["Earth"] * num_test_rows, + "genbank_accession": ["A1"] * num_test_rows, + 
"genbank_accession_rev": ["A1.1"] * num_test_rows, + "division": ["Massachusetts", "Puerto Rico", "Washington DC", "Fake State"], + } + lf_metadata = pl.LazyFrame(test_genome_metadata) + lf_filtered = filter_sequence_metadata(lf_metadata, state_format=StateFormat.FIPS) + lf_filtered = lf_filtered.collect() -def test_parse_sequence_duplicates(df_assignments): - df_duplicates = pl.concat([df_assignments, df_assignments]) + # Un-mapped states are dropped from dataset + assert len(lf_filtered) == 3 - with pytest.raises(ValueError): - parse_sequence_assignments(df_duplicates) + locations = set(lf_filtered["location"].to_list()) + assert locations == {"11", "25", "72"}