Skip to content

Commit

Permalink
Use "strain" as the id for filtering sequences
Browse files Browse the repository at this point in the history
Still in the NCBI mindset, earlier versions of sequence.filter
used accession numbers to compare .fasta records to a set
of sequence "ids". However, for the processed Nextstrain
sequences, we need to use the "strain" column
  • Loading branch information
bsweger committed Nov 6, 2024
1 parent f07d85c commit d516677
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 71 deletions.
3 changes: 1 addition & 2 deletions src/cladetime/assign_clades.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,7 @@ def get_sequence_metadata(metadata: pl.DataFrame, sequence_collection_date: date
"country",
"date",
"division",
"genbank_accession",
"genbank_accession_rev",
"strain",
"host",
]

Expand Down
59 changes: 31 additions & 28 deletions src/cladetime/sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def _download_from_url(session: Session, url: str, data_path: Path) -> Path:

parsed_url = urlparse(url)
url_filename = os.path.basename(parsed_url.path)
data_path.mkdir(parents=True, exist_ok=True)
filename = data_path / url_filename

with session.get(url, stream=True) as result:
Expand Down Expand Up @@ -131,8 +132,7 @@ def filter_metadata(
cols : list
Optional. A list of columns to include in the filtered metadata.
The default columns included in the filtered metadata are:
clade_nextstrain, country, date, division, genbank_accession,
genbank_accession_rev, host
clade_nextstrain, country, date, division, strain, host
state_format : :class:`cladetime.types.StateFormat`
Optional. The state name format returned in the filtered metadata's
location column. Defaults to `StateFormat.ABBR`
Expand Down Expand Up @@ -167,19 +167,19 @@ def filter_metadata(
>>> filtered_metadata = filter_covid_genome_metadata(ct.sequence_metadata)
>>> filtered_metadata.collect().head(5)
shape: (5, 7)
┌───────┬─────────┬────────────┬────────────────────────┬──────────────┬──────┬
│ clade ┆ country ┆ date ┆ genbank_ ┆ genbank_ac ┆ host ┆ loca │
│ ┆ ┆ ┆ accession ┆ cession_rev┆ ┆ tion │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ date ┆ str ┆ str ┆ str ┆ str │
│ ┆ ┆ ┆ ┆ ┆ │
╞═══════╪═════════╪════════════╪════════════════════════╪══════════════╪══════╡
│ 22A ┆ USA ┆ 2022-07-07 ┆ PP223234 ┆ PP223234.1 ┆ Homo sapiens ┆ AL │
│ 22B ┆ USA ┆ 2022-07-02 ┆ PP223435 ┆ PP223435.1 ┆ Homo sapiens ┆ AZ │
│ 22B ┆ USA ┆ 2022-07-19 ┆ PP223235 ┆ PP223235.1 ┆ Homo sapiens ┆ AZ │
│ 22B ┆ USA ┆ 2022-07-15 ┆ PP223236 ┆ PP223236.1 ┆ Homo sapiens ┆ AZ │
│ 22B ┆ USA ┆ 2022-07-20 ┆ PP223237 ┆ PP223237.1 ┆ Homo sapiens ┆ AZ │
└───────┴─────────┴────────────┴────────────────────────┴─────────────────────┴
┌───────┬─────────┬────────────┬────────────────────────────┬──────────────┬──────┬
│ clade ┆ country ┆ date ┆ strain ┆ host ┆ loca │
│ ┆ ┆ ┆ ┆ ┆ tion │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ date ┆ str ┆ str ┆ str │
│ ┆ ┆ ┆ ┆ ┆ │
╞═══════╪═════════╪════════════╪════════════════════════════╪══════════════╪══════╡
│ 22A ┆ USA ┆ 2022-07-07 ┆ Alabama/SEARCH-202312/2022 ┆ Homo sapiens ┆ AL │
│ 22B ┆ USA ┆ 2022-07-02 ┆ Arizona/SEARCH-201153/2022 ┆ Homo sapiens ┆ AZ │
│ 22B ┆ USA ┆ 2022-07-19 ┆ Arizona/SEARCH-203528/2022 ┆ Homo sapiens ┆ AZ │
│ 22B ┆ USA ┆ 2022-07-15 ┆ Arizona/SEARCH-203621/2022 ┆ Homo sapiens ┆ AZ │
│ 22B ┆ USA ┆ 2022-07-20 ┆ Arizona/SEARCH-203625/2022 ┆ Homo sapiens ┆ AZ │
└───────┴─────────┴────────────┴────────────────────────────┴─────────────────────┴
"""
if state_format not in StateFormat:
raise ValueError(f"Invalid state_format. Must be one of: {list(StateFormat.__members__.items())}")
Expand All @@ -191,8 +191,7 @@ def filter_metadata(
"country",
"date",
"division",
"genbank_accession",
"genbank_accession_rev",
"strain",
"host",
]

Expand Down Expand Up @@ -256,7 +255,8 @@ def get_metadata_ids(sequence_metadata: pl.DataFrame | pl.LazyFrame) -> set:
"""Return sequence IDs for a specified set of Nextstrain sequence metadata.
For a given input of GenBank-based SARS-Cov-2 sequence metadata (as
published by Nextstrain), return a set of GenBank accession numbers.
published by Nextstrain), return a set of strains. This function is
mostly used to filter a sequence file.
Parameters
----------
Expand All @@ -265,21 +265,23 @@ def get_metadata_ids(sequence_metadata: pl.DataFrame | pl.LazyFrame) -> set:
Returns
-------
set
A set of GenBank accession numbers
A set of
:external:doc:`strains<reference/metadata-fields.html#column-1-strain>`
Raises
------
ValueError
If the sequence metadata does not contain a genbank_accession column
If the sequence metadata does not contain a strain column
"""
logger.info("Collecting sequence IDs from metadata")
metadata_columns = sequence_metadata.collect_schema().names()
if "genbank_accession" not in metadata_columns:
logger.error("Missing column from sequence_metadata", column="genbank_accession")
raise ValueError("Sequence metadata does not contain a genbank_accession column.")
sequences = sequence_metadata.select("genbank_accession").unique()
if "strain" not in metadata_columns:
logger.error("Missing column from sequence_metadata", column="strain")
raise ValueError("Sequence metadata does not contain a strain column.")
sequences = sequence_metadata.select("strain").unique()
if isinstance(sequence_metadata, pl.LazyFrame):
sequences = sequences.collect() # type: ignore
seq_set = set(sequences["genbank_accession"].to_list()) # type: ignore
seq_set = set(sequences["strain"].to_list()) # type: ignore

return seq_set

Expand All @@ -302,17 +304,18 @@ def parse_sequence_assignments(df_assignments: pl.DataFrame) -> pl.DataFrame:
return df_assignments


@time_function
def filter(sequence_ids: set, url_sequence: str, output_path: Path) -> Path:
"""Filter a fasta file against a specific set of sequences.
Download a sequence file (in FASTA format) from Nexstrain, filter
it against a set of specific sequence ids (GenBank accession numbers),
and write the filtered sequences to a new file.
it against a set of specific strains, and write the filtered
sequences to a new file.
Parameters
----------
sequence_ids : set
GenBank accession numbers used to filter the sequence file
Strains used to filter the sequence file
url_sequence : str
The URL to a file of SARS-CoV-2 GenBank sequences published by Nexstrain.
The file is should be in .fasta format using the lzma compression
Expand Down
60 changes: 30 additions & 30 deletions tests/data/test_metadata.tsv
Original file line number Diff line number Diff line change
@@ -1,30 +1,30 @@
genbank_accession genbank_accession_rev unwanted_column date host country division clade_nextstrain location another unwanted column
abc abc.1 i ❤️ wombats 2024-09-01 Homo sapiens USA Massachusetts AA.ZZ Vulcan hummus a tune
abc abc.1 i ❤️ wombats 2024-09-01 Homo sapiens USA Massachusetts AA.ZZ Vulcan hummus a tune
def def.1 i ❤️ wombats 2024-09-01 Homo sapiens USA Massachusetts AA.ZZ Earth hummus a tune
ghi ghi.4 i ❤️ wombats 2024-09-01 Homo sapiens USA Utah BB Cardassia hummus a tune
jkl jkl.1 i ❤️ wombats 2024-09-01 Homo sapiens USA Utah CC Bajor hummus a tune
mno mno.1 i ❤️ wombats 2024-09-01 Homo sapiens Canada Alberta DD Vulcan hummus a tune
mno mno.1 i ❤️ wombats 2024-09-01 marmots USA Massachusetts DD Vulcan hummus a tune
mno mno.1 i ❤️ wombats 2024-09-01 Homo sapiens USA Puerto Rico DD Reisa hummus a tune
abc abc.1 i ❤️ wombats 2024-09-08 Homo sapiens USA Massachusetts EE Vulcan hummus a tune
abc abc.1 i ❤️ wombats 2024-09-08 Homo sapiens USA Massachusetts EE Vulcan hummus a tune
def def.1 i ❤️ wombats 2024-09-08 Homo sapiens USA Massachusetts DD Earth hummus a tune
ghi ghi.4 i ❤️ wombats 2024-09-08 Homo sapiens USA Utah AA Cardassia hummus a tune
jkl jkl.1 i ❤️ wombats 2024-09-08 Homo sapiens USA Utah AA.ZZ Bajor hummus a tune
abc abc.1 i ❤️ wombats 2024-09-15 Homo sapiens USA Massachusetts AA Vulcan hummus a tune
abc abc.1 i ❤️ wombats 2024-09-15 Homo sapiens USA Massachusetts AA Vulcan hummus a tune
def def.1 i ❤️ wombats 2024-09-15 Homo sapiens USA Massachusetts AA Earth hummus a tune
ghi ghi.4 i ❤️ wombats 2024-09-15 Homo sapiens USA Utah BB Cardassia hummus a tune
jkl jkl.1 i ❤️ wombats 2024-09-15 Homo sapiens USA Utah CC Bajor hummus a tune
mno mno.1 i ❤️ wombats 2024-09-15 Homo sapiens Canada Mississippi DD Earth hummus a tune
mno mno.1 i ❤️ wombats 2024-09-15 marmots USA Massachusetts DD Cardassia hummus a tune
mno mno.1 i ❤️ wombats 2024-09-15 Homo sapiens USA Puerto Rico DD Bajor hummus a tune
abcd abcd.1 i ❤️ wombats 2024-09-22 Homo sapiens USA Massachusetts FF Vulcan hummus a tune
abc abc.1 i ❤️ wombats 2024-09-22 Homo sapiens USA Massachusetts AA Vulcan hummus a tune
def def.1 i ❤️ wombats 2024-09-22 Homo sapiens USA Massachusetts AA Earth hummus a tune
ghi ghi.4 i ❤️ wombats 2024-09-22 Homo sapiens USA Utah BB Cardassia hummus a tune
jkl jkl.1 i ❤️ wombats 2024-09-22 Homo sapiens USA Utah CC Bajor hummus a tune
mno mno.1 i ❤️ wombats 2024-09-22 Homo sapiens Canada Mississippi FF Earth hummus a tune
mno mno.1 i ❤️ wombats 2024-09-22 marmots USA Massachusetts FF Cardassia hummus a tune
mno mno.1 i ❤️ wombats 2024-09-22 Homo sapiens USA Guam FF Bajor hummus a tune
strain unwanted_column date host country division clade_nextstrain location another unwanted column
Abc/SEARCH-123/2022 i ❤️ wombats 2024-09-01 Homo sapiens USA Massachusetts AA.ZZ Vulcan hummus a tune
Abc/VULCAN-123/2022 i ❤️ wombats 2024-09-01 Homo sapiens USA Massachusetts AA.ZZ Vulcan hummus a tune
Def/VULCAN-XXX/3024 i ❤️ wombats 2024-09-01 Homo sapiens USA Massachusetts AA.ZZ Earth hummus a tune
Cardassia/SEARCH-123/2000 i ❤️ wombats 2024-09-01 Homo sapiens USA Utah BB Cardassia hummus a tune
Bajor/STRAIN-789/2450 i ❤️ wombats 2024-09-01 Homo sapiens USA Utah CC Bajor hummus a tune
Canada/STRAIN-WWW/2022 i ❤️ wombats 2024-09-01 Homo sapiens Canada Alberta DD Vulcan hummus a tune
Massachusetts/SEARCH-123/2022 i ❤️ wombats 2024-09-01 marmots USA Massachusetts DD Vulcan hummus a tune
PR/STRAIN-QQQ/2022 i ❤️ wombats 2024-09-01 Homo sapiens USA Puerto Rico DD Reisa hummus a tune
Massachusetts/SEARCH-123/2024 i ❤️ wombats 2024-09-08 Homo sapiens USA Massachusetts EE Vulcan hummus a tune
Massachusetts/SEARCH-123/2025 i ❤️ wombats 2024-09-08 Homo sapiens USA Massachusetts EE Vulcan hummus a tune
Massachusetts/SEARCH-123/2026 i ❤️ wombats 2024-09-08 Homo sapiens USA Massachusetts DD Earth hummus a tune
Cardassia/STRAIN-EEE/3001 i ❤️ wombats 2024-09-08 Homo sapiens USA Utah AA Cardassia hummus a tune
Utah/STRAIN-123/2022 i ❤️ wombats 2024-09-08 Homo sapiens USA Utah AA.ZZ Bajor hummus a tune
Massachusetts/SEARCH-123/2027 i ❤️ wombats 2024-09-15 Homo sapiens USA Massachusetts AA Vulcan hummus a tune
Massachusetts/SEARCH-123/2028 i ❤️ wombats 2024-09-15 Homo sapiens USA Massachusetts AA Vulcan hummus a tune
Massachusetts/SEARCH-123/2029 i ❤️ wombats 2024-09-15 Homo sapiens USA Massachusetts AA Earth hummus a tune
Utah/STRAIN-456/2022 i ❤️ wombats 2024-09-15 Homo sapiens USA Utah BB Cardassia hummus a tune
Utah/STRAIN-456/2023 i ❤️ wombats 2024-09-15 Homo sapiens USA Utah CC Bajor hummus a tune
Canada!/SEARCH-123/2022 i ❤️ wombats 2024-09-15 Homo sapiens Canada Mississippi DD Earth hummus a tune
Massachusetts/SEARCH-456/2025 i ❤️ wombats 2024-09-15 marmots USA Massachusetts DD Cardassia hummus a tune
PR/SEARCH-123/2030 i ❤️ wombats 2024-09-15 Homo sapiens USA Puerto Rico DD Bajor hummus a tune
Massachusetts/SEARCH-456/2026 i ❤️ wombats 2024-09-22 Homo sapiens USA Massachusetts FF Vulcan hummus a tune
Massachusetts/SEARCH-456/2027 i ❤️ wombats 2024-09-22 Homo sapiens USA Massachusetts AA Vulcan hummus a tune
Massachusetts/SEARCH-456/2028 i ❤️ wombats 2024-09-22 Homo sapiens USA Massachusetts AA Earth hummus a tune
Utah/STRAIN-456/2031 i ❤️ wombats 2024-09-22 Homo sapiens USA Utah BB Cardassia hummus a tune
Utah/STRAIN-456/2032 i ❤️ wombats 2024-09-22 Homo sapiens USA Utah CC Bajor hummus a tune
CanadaAgain/STRAIN-JJJ/XXXX i ❤️ wombats 2024-09-22 Homo sapiens Canada Mississippi FF Earth hummus a tune
Massachusetts/SEARCH-456/2029 i ❤️ wombats 2024-09-22 marmots USA Massachusetts FF Cardassia hummus a tune
Guam/SEARCH-123/2022 i ❤️ wombats 2024-09-22 Homo sapiens USA Guam FF Bajor hummus a tune
17 changes: 6 additions & 11 deletions tests/unit/test_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,7 @@ def test_get_metadata(test_file_path, metadata_file):
"country",
"division",
"clade_nextstrain",
"genbank_accession",
"genbank_accession_rev",
"strain",
}
assert expected_cols.issubset(metadata_cols)

Expand Down Expand Up @@ -81,8 +80,7 @@ def test_filter_metadata():
"division": ["Alaska", "Maine", "Guam", "Puerto Rico", "Utah", "Washington DC", "Pennsylvania"],
"clade_nextstrain": ["AAA", "BBB", "CCC", "DDD", "EEE", "FFF", "FFF"],
"location": ["Vulcan", "Reisa", "Bajor", "Deep Space 9", "Earth", "Cardassia", "Cardassia"],
"genbank_accession": ["A1", "A2", "B1", "B2", "C1", "C2", "C2"],
"genbank_accession_rev": ["A1.1", "A2.4", "B1.1", "B2.5", "C1.1", "C2.1", "C2.1"],
"strain": ["A1", "A2", "B1", "B2", "C1", "C2", "C2"],
"unwanted_column": [1, 2, 3, 4, 5, 6, 7],
}

Expand All @@ -101,8 +99,7 @@ def test_filter_metadata():
"clade": pl.String,
"country": pl.String,
"date": pl.Date,
"genbank_accession": pl.String,
"genbank_accession_rev": pl.String,
"strain": pl.String,
"host": pl.String,
"location": pl.String,
}
Expand All @@ -118,8 +115,7 @@ def test_filter_metadata_state_name():
"country": ["USA"] * num_test_rows,
"clade_nextstrain": ["AAA"] * num_test_rows,
"location": ["Earth"] * num_test_rows,
"genbank_accession": ["A1"] * num_test_rows,
"genbank_accession_rev": ["A1.1"] * num_test_rows,
"strain": ["A1"] * num_test_rows,
"division": ["Alaska", "Puerto Rico", "Washington DC", "Fake State"],
}

Expand All @@ -142,8 +138,7 @@ def test_filter_metadata_state_fips():
"country": ["USA"] * num_test_rows,
"clade_nextstrain": ["AAA"] * num_test_rows,
"location": ["Earth"] * num_test_rows,
"genbank_accession": ["A1"] * num_test_rows,
"genbank_accession_rev": ["A1.1"] * num_test_rows,
"strain": ["A1"] * num_test_rows,
"division": ["Massachusetts", "Puerto Rico", "Washington DC", "Fake State"],
}

Expand All @@ -160,7 +155,7 @@ def test_filter_metadata_state_fips():

def test_get_metadata_ids():
metadata = {
"genbank_accession": ["A1", "A2", "A2", "A4"],
"strain": ["A1", "A2", "A2", "A4"],
"country": ["USA", "Canada", "Mexico", "Brazil"],
"location": ["Earth", "Earth", "Earth", "Earth"],
}
Expand Down

0 comments on commit d516677

Please sign in to comment.