Skip to content

Commit

Permalink
Remove null collection dates from SARS-CoV-2 sequence metadata
Browse files Browse the repository at this point in the history
    Closes #36

    This corrects a regression introduced in #4c0728a and adds
    a test case for non-empty/bad-format dates in the metadata file.
  • Loading branch information
bsweger committed Oct 24, 2024
1 parent 5847385 commit 8833418
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 11 deletions.
12 changes: 10 additions & 2 deletions src/cladetime/util/sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,11 @@ def _get_ncov_metadata(
)
return {}

return response.json()
metadata = response.json()
if metadata.get("nextclade_dataset_name", "").lower() == "sars-cov-2":
metadata["nextclade_dataset_name_full"] = "nextstrain/sars-cov-2/wuhan-hu-1/orfs"

return metadata


def filter_covid_genome_metadata(metadata: pl.LazyFrame, cols: list = []) -> pl.LazyFrame:
Expand Down Expand Up @@ -175,11 +179,15 @@ def filter_covid_genome_metadata(metadata: pl.LazyFrame, cols: list = []) -> pl.
.filter(
pl.col("country") == "USA",
pl.col("division").is_in(states),
pl.col("date").is_not_null(),
pl.col("host") == "Homo sapiens",
)
.rename({"clade_nextstrain": "clade", "division": "location"})
.cast({"date": pl.Date}, strict=False)
# date filtering at the end ensures we filter out null
# values created by the above .cast operation
.filter(
pl.col("date").is_not_null(),
)
)

return filtered_metadata
Expand Down
26 changes: 17 additions & 9 deletions tests/unit/util/test_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,15 +94,23 @@ def test_download_covid_genome_metadata_no_history(s3_setup, tmp_path, mock_sess

def test_filter_covid_genome_metadata():
test_genome_metadata = {
"date": ["2022-01-01", "2022-01-02", "2022-01-03", "2023-12-25", None, "2023-12-27"],
"host": ["Homo sapiens", "Homo sapiens", "Homo sapiens", "Narwhals", "Homo sapiens", "Homo sapiens"],
"country": ["USA", "Argentina", "USA", "USA", "USA", "USA"],
"division": ["Alaska", "Maine", "Guam", "Puerto Rico", "Utah", "Pennsylvania"],
"clade_nextstrain": ["AAA", "BBB", "CCC", "DDD", "EEE", "FFF"],
"location": ["Vulcan", "Reisa", "Bajor", "Deep Space 9", "Earth", "Cardassia"],
"genbank_accession": ["A1", "A2", "B1", "B2", "C1", "C2"],
"genbank_accession_rev": ["A1.1", "A2.4", "B1.1", "B2.5", "C1.1", "C2.1"],
"unwanted_column": [1, 2, 3, 4, 5, 6],
"date": ["2022-01-01", "2022-01-02", "2022-01-03", "2023-12-25", None, "2023-12-27", "2023-05"],
"host": [
"Homo sapiens",
"Homo sapiens",
"Homo sapiens",
"Narwhals",
"Homo sapiens",
"Homo sapiens",
"Homo sapiens",
],
"country": ["USA", "Argentina", "USA", "USA", "USA", "USA", "USA"],
"division": ["Alaska", "Maine", "Guam", "Puerto Rico", "Utah", "Pennsylvania", "Pennsylvania"],
"clade_nextstrain": ["AAA", "BBB", "CCC", "DDD", "EEE", "FFF", "FFF"],
"location": ["Vulcan", "Reisa", "Bajor", "Deep Space 9", "Earth", "Cardassia", "Cardassia"],
"genbank_accession": ["A1", "A2", "B1", "B2", "C1", "C2", "C2"],
"genbank_accession_rev": ["A1.1", "A2.4", "B1.1", "B2.5", "C1.1", "C2.1", "C2.1"],
"unwanted_column": [1, 2, 3, 4, 5, 6, 7],
}

lf_metadata = pl.LazyFrame(test_genome_metadata)
Expand Down

0 comments on commit 8833418

Please sign in to comment.