Skip to content

Commit 509401a

Browse files
ENH: add 'filter-contigs' action (#92)
* ENH: add 'filter-contigs' action * Update q2_assembly/_action_params.py Co-authored-by: VinzentRisch <[email protected]> * Add tests * fixup! Add tests * Update q2_assembly/filter/filter.py Co-authored-by: VinzentRisch <[email protected]> --------- Co-authored-by: VinzentRisch <[email protected]>
1 parent fda616a commit 509401a

File tree

10 files changed

+308
-1
lines changed

10 files changed

+308
-1
lines changed

q2_assembly/__init__.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
# ----------------------------------------------------------------------------
88

99
from .bowtie2 import indexing, mapping
10+
from .filter import filter
1011
from .helpers import helpers
1112
from .iss import iss
1213
from .megahit import megahit
@@ -18,4 +19,13 @@
1819
except ModuleNotFoundError:
1920
__version__ = '0.0.0+notfound'
2021

21-
__all__ = ["indexing", "mapping", "iss", "megahit", "quast", "spades", "helpers"]
22+
__all__ = [
23+
"indexing",
24+
"mapping",
25+
"iss",
26+
"megahit",
27+
"quast",
28+
"spades",
29+
"helpers",
30+
"filter",
31+
]

q2_assembly/_action_params.py

+28
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
# ----------------------------------------------------------------------------
88

99
from qiime2.core.type import Bool, Choices, Float, Int, List, Range, Str
10+
from qiime2.plugin import Metadata
1011

1112
megahit_params = {
1213
"presets": Str % Choices(["meta", "meta-sensitive", "meta-large", "disabled"]),
@@ -455,3 +456,30 @@
455456
" samples."
456457
}
457458
# fmt: on
459+
460+
filter_contigs_params = {
461+
"metadata": Metadata,
462+
"where": Str,
463+
"exclude_ids": Bool,
464+
"remove_empty": Bool,
465+
"length_threshold": Int % Range(0, None),
466+
}
467+
# fmt: off
468+
filter_contigs_param_descriptions = {
469+
"metadata": "Sample metadata indicating which sample ids to filter. "
470+
"The optional `where` parameter may be used to filter ids "
471+
"based on specified conditions in the metadata. The "
472+
"optional `exclude_ids` parameter may be used to exclude "
473+
"the ids specified in the metadata from the filter.",
474+
"where": "Optional SQLite WHERE clause specifying sample metadata "
475+
"criteria that must be met to be included in the filtered "
476+
"data. If not provided, all samples in `metadata` that are "
477+
"also in the contig data will be retained.",
478+
"exclude_ids": "If True, the samples selected by "
479+
"the `metadata` and optional `where` parameter will be "
480+
"excluded from the filtered data.",
481+
"remove_empty": "If True, samples with no contigs will be removed from "
482+
"the filtered data.",
483+
"length_threshold": "Only keep contigs of the given length and longer."
484+
}
485+
# fmt: on

q2_assembly/filter/__init__.py

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# ----------------------------------------------------------------------------
2+
# Copyright (c) 2023, QIIME 2 development team.
3+
#
4+
# Distributed under the terms of the Modified BSD License.
5+
#
6+
# The full license is in the file LICENSE, distributed with this software.
7+
# ----------------------------------------------------------------------------
8+
9+
from .filter import filter_contigs
10+
11+
__all__ = [
12+
"filter_contigs",
13+
]

q2_assembly/filter/filter.py

+100
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
# ----------------------------------------------------------------------------
2+
# Copyright (c) 2023, QIIME 2 development team.
3+
#
4+
# Distributed under the terms of the Modified BSD License.
5+
#
6+
# The full license is in the file LICENSE, distributed with this software.
7+
# ----------------------------------------------------------------------------
8+
import os
9+
10+
import skbio
11+
from q2_types.per_sample_sequences import ContigSequencesDirFmt
12+
from qiime2 import Metadata
13+
from qiime2.util import duplicate
14+
15+
16+
def _find_empty_samples(samples: dict) -> set:
17+
empty_samples = set()
18+
for sample_id, sample_fp in samples.items():
19+
if os.path.getsize(sample_fp) == 0:
20+
empty_samples.add(sample_id)
21+
return empty_samples
22+
23+
24+
def _filter_by_length(
25+
contigs: ContigSequencesDirFmt, threshold: int
26+
) -> ContigSequencesDirFmt:
27+
results = ContigSequencesDirFmt()
28+
print(
29+
f"Filtering contigs by length - only contigs >= {threshold} bp long will "
30+
f"be retained."
31+
)
32+
for sample_id, sample_fp in contigs.sample_dict().items():
33+
out_fp = os.path.join(str(results), f"{sample_id}_contigs.fa")
34+
keep, remove = 0, 0
35+
with open(out_fp, "w") as f_out:
36+
for contig in skbio.io.read(sample_fp, format="fasta"):
37+
if len(contig) >= threshold:
38+
skbio.io.write(contig, format="fasta", into=f_out)
39+
keep += 1
40+
else:
41+
remove += 1
42+
print(
43+
f"Sample {sample_id}: {remove + keep} contigs\n {remove} contigs "
44+
f"removed\n {keep} contigs retained"
45+
)
46+
return results
47+
48+
49+
def filter_contigs(
50+
contigs: ContigSequencesDirFmt,
51+
metadata: Metadata = None,
52+
where: str = None,
53+
exclude_ids: bool = False,
54+
length_threshold: int = 0,
55+
remove_empty: bool = False,
56+
) -> ContigSequencesDirFmt:
57+
if not any([metadata, length_threshold, remove_empty]):
58+
raise ValueError(
59+
"At least one of the following parameters must be provided: "
60+
"metadata, length_threshold, remove_empty."
61+
)
62+
63+
if metadata and not where:
64+
raise ValueError(
65+
"A filter query must be provided through the 'where' parameter "
66+
"when filtering by metadata."
67+
)
68+
69+
if length_threshold > 0:
70+
contigs = _filter_by_length(contigs, length_threshold)
71+
72+
results = ContigSequencesDirFmt()
73+
samples = contigs.sample_dict()
74+
ids_to_keep = set(samples.keys())
75+
if remove_empty:
76+
ids_to_remove = _find_empty_samples(samples)
77+
ids_to_keep -= ids_to_remove
78+
if ids_to_remove:
79+
print(f"Removing empty samples: {', '.join(sorted(ids_to_remove))}")
80+
81+
if metadata:
82+
selected_ids = metadata.get_ids(where=where)
83+
if not selected_ids:
84+
print("The filter query returned no IDs to filter out.")
85+
86+
if exclude_ids:
87+
ids_to_keep -= set(selected_ids)
88+
else:
89+
ids_to_keep &= set(selected_ids)
90+
91+
if len(ids_to_keep) == 0:
92+
raise ValueError("No samples remain after filtering.")
93+
94+
try:
95+
for _id in ids_to_keep:
96+
duplicate(samples[_id], os.path.join(str(results), f"{_id}_contigs.fa"))
97+
except KeyError:
98+
raise ValueError(f"{_id!r} is not a sample present in the contig data.")
99+
100+
return results

q2_assembly/filter/tests/__init__.py

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# ----------------------------------------------------------------------------
2+
# Copyright (c) 2023, QIIME 2 development team.
3+
#
4+
# Distributed under the terms of the Modified BSD License.
5+
#
6+
# The full license is in the file LICENSE, distributed with this software.
7+
# ----------------------------------------------------------------------------
+108
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
# ----------------------------------------------------------------------------
2+
# Copyright (c) 2023, QIIME 2 development team.
3+
#
4+
# Distributed under the terms of the Modified BSD License.
5+
#
6+
# The full license is in the file LICENSE, distributed with this software.
7+
# ----------------------------------------------------------------------------
8+
import os
9+
10+
import pandas as pd
11+
import qiime2 as q2
12+
import skbio
13+
from q2_types.per_sample_sequences import ContigSequencesDirFmt
14+
from qiime2.plugin.testing import TestPluginBase
15+
16+
from q2_assembly.filter import filter_contigs
17+
18+
19+
class TestFilterContigs(TestPluginBase):
20+
package = "q2_assembly.tests"
21+
22+
def setUp(self):
23+
super().setUp()
24+
self.contigs = ContigSequencesDirFmt(
25+
self.get_data_path("contigs-with-empty"), 'r'
26+
)
27+
self.metadata_df = pd.DataFrame(
28+
data={'col1': ['yes', 'no', 'yes']},
29+
index=pd.Index(['sample1', 'sample2', 'sample3'], name='id')
30+
)
31+
self.metadata = q2.Metadata(self.metadata_df)
32+
33+
def test_filter_metadata(self):
34+
obs = filter_contigs(
35+
contigs=self.contigs, metadata=self.metadata, where="col1='yes'"
36+
)
37+
38+
self.assertDictEqual(
39+
obs.sample_dict(),
40+
{'sample1': os.path.join(obs.path, 'sample1_contigs.fa'),
41+
'sample3': os.path.join(obs.path, 'sample3_contigs.fa')}
42+
)
43+
44+
def test_filter_metadata_exclude_ids(self):
45+
obs = filter_contigs(
46+
contigs=self.contigs, metadata=self.metadata,
47+
where="col1='yes'", exclude_ids=True
48+
)
49+
50+
self.assertDictEqual(
51+
obs.sample_dict(),
52+
{'sample2': os.path.join(obs.path, 'sample2_contigs.fa')}
53+
)
54+
55+
def test_filter_metadata_no_query_no_metadata(self):
56+
with self.assertRaisesRegex(
57+
ValueError,
58+
'At least one of the following parameters must be provided'
59+
):
60+
filter_contigs(contigs=self.contigs)
61+
62+
def test_filter_metadata_no_query(self):
63+
with self.assertRaisesRegex(
64+
ValueError,
65+
'A filter query must be provided'
66+
):
67+
filter_contigs(contigs=self.contigs, metadata=self.metadata)
68+
69+
def test_filter_by_length(self):
70+
obs = filter_contigs(contigs=self.contigs, length_threshold=320)
71+
72+
self.assertEqual(len(obs.sample_dict()), 3)
73+
74+
exp_counts = (6, 1, 0)
75+
for (_id, fp), count in zip(obs.sample_dict().items(), exp_counts):
76+
with (open(fp) as f):
77+
self.assertEqual(
78+
len(list(skbio.io.read(f, format='fasta'))), count
79+
)
80+
81+
def test_filter_remove_empty(self):
82+
obs = filter_contigs(contigs=self.contigs, remove_empty=True)
83+
84+
self.assertDictEqual(
85+
obs.sample_dict(),
86+
{'sample1': os.path.join(obs.path, 'sample1_contigs.fa'),
87+
'sample2': os.path.join(obs.path, 'sample2_contigs.fa')}
88+
)
89+
90+
def test_filter_by_length_and_remove_empty(self):
91+
obs = filter_contigs(
92+
contigs=self.contigs, length_threshold=400, remove_empty=True
93+
)
94+
95+
self.assertEqual(len(obs.sample_dict()), 1)
96+
97+
with (open(os.path.join(obs.path, 'sample1_contigs.fa')) as f):
98+
self.assertEqual(
99+
len(list(skbio.io.read(f, format='fasta'))), 2
100+
)
101+
102+
def test_filter_everything(self):
103+
with self.assertRaisesRegex(
104+
ValueError, 'No samples remain after filtering'
105+
):
106+
filter_contigs(
107+
contigs=self.contigs, length_threshold=1000, remove_empty=True
108+
)

q2_assembly/plugin_setup.py

+13
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@
4141
quast_params,
4242
spades_param_descriptions,
4343
spades_params,
44+
filter_contigs_params,
45+
filter_contigs_param_descriptions,
4446
)
4547
from q2_assembly.quast.types import (
4648
QUASTResults,
@@ -438,6 +440,17 @@
438440
"GenomeData[DNASequence] to a GenomeData[DNASequence] artifact.",
439441
)
440442

443+
plugin.methods.register_function(
444+
function=q2_assembly.filter.filter_contigs,
445+
inputs={"contigs": SampleData[Contigs]},
446+
parameters=filter_contigs_params,
447+
outputs={"filtered_contigs": SampleData[Contigs]},
448+
input_descriptions={"contigs": "The contigs to filter."},
449+
parameter_descriptions=filter_contigs_param_descriptions,
450+
name="Filter contigs.",
451+
description="Filter contigs based on metadata.",
452+
)
453+
441454
plugin.register_semantic_types(QUASTResults)
442455
plugin.register_semantic_type_to_format(
443456
QUASTResults, artifact_format=QUASTResultsDirectoryFormat
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
>k141_9 flag=3 multi=1.0108 len=234
2+
AGTAGCCAGTAAGAAGACTACTACTAAAAAGCCAGTTGCTAAAAAAACTGCTAAGAAAAAGAACCCCATAGCAAAGAAGGCTGTCGCGAAAAAAGTAGCCAGTAAGAAGACTACTACTAAAAAGCCAGTTGCTAAAAAAACTGCTAAGAAAAAGAACCCCATAGCAAAGAAGGCTGTCGCGAAAAAAGTAGCCAGTAAGAAGACTACTACTAAAAAGCCAGTTGCTAAAAAAAC
3+
>k141_2 flag=1 multi=2.0000 len=350
4+
TTGCAAGCGTTTGGATAAGGTTACGCTGTGCACATCATTGACAAGTACTTGCTGTAACTTAGCTAAAAGAACATCAAATTCTTTGTCGTCTGTTTTTGTTTTTTCTTCGCTTTTATCTTCTGCTGTCCCAAGGTTTTCAAGATCAGCGCCCGCTCGCGTTACAGATTTAAAGTTTAAACCATCATAGCTTAGATTATGTTGAGCCAAAACTCATCAACGGTGTCTGTCATAAACAGAACATTAATACCTCGTGCTTTGAAGCCTTCTAATTGTGGCGCATTTTTTACGCTGGCAATAGAATCTCCAGAAATGTAATAGATATCTTTCTGATTCTCGGGCATTTTTTCTTT
5+
>k141_4 flag=1 multi=1.0000 len=309
6+
ACAATGGACGTATTTGACTACGTTGAGGATTCTATCACGCATCAAGAACCTGACTTCGAAACCTTTCTTTTAGTCCATGCGGTTCGCCGAGATTACTTCTTACAAAAGCAAGGAGGCGTGCCCTCAAAACAACGTATTGATTCGACAAAACTTCCCGCCAAACAAAACAAAGTGCGTGAAATTGAATGGTTGCCACGTCTTCTGCCAAAAGCACTAGCAAAGCTTCGTGGAGAGCTGCCTTCAGAAACAATGTATGGTTGCGGTGGCGATCGGCGATTTTTTAAAACCCACAACATTCACCCCGCGGAA
7+
>k141_6 flag=1 multi=1.0000 len=316
8+
ACAATATAGCAACGGTTATTGAGGATCGCAGGATTGGGACTACAGTGACCAATAATTCTTCCCTGCTCGGCTTCGTAACCGGTCTCCTCTTTAAGCTCACGAAGTCCAGCTACTAGCGGATCCTCTCCGGCATCCACAATCCCGCCGGGTAATTCCCAAGATAGTGCATCCATCCCCCAGCGAAATTGGCGTACCATCACCAGTTCACCTGCTACAGTTCGCGCCACGACAATTACCCAATCACGCGAATCTAAATAATAAAATTCACCCTCTTTACCATTATGTGGATGTCGGTAGCGCCGCGCGCGCAAATCCC
9+
>k141_0 flag=1 multi=1.0000 len=307
10+
ACAGTTGAAACGATTGCAATCAATATTGACGGAGTCGATGTTCAGGTTCCAAAGGGAATTAATGCGATCGAAGCAGCTAAATTGCATGGTAAGGAGGTCCCGCATTATTGTTATCATCCTAAGCTTTCGATCGCTGGCAACTGTCGTATGTGCTTAATTGAAAAGGGATTGCCTATGCGCGATCGGGCTACTGGAGATCCCGTTATGGATACCAATGGTGAGCAGAAGATCGGATGGTCGCCTAAGCCGACGATAGGATGTTCCACCAATGTATCTCCAGGAATGCATTTACGAACTAATTCTACTA
11+
>k141_7 flag=1 multi=1.0000 len=357
12+
AAGCGTATTATTCGCAGCTGCCGTAACAAGAAGCGCCAGACACTACTTTTTTCTGCCACTGTATCAGAAGATATTAAGCGACTTATCAAGGGCAGCCTTCGAGACCCTGTTGAGGTGGCTATTAGTATCAAAATATCGCCCGCGGAGACCGTGAAGCACGAAGTCTATCCGGTTGGGGCAATGCAAAAGTTTGATCTTTTGATCGCGCTCATCGATCAAATGGAAATTGACAGTATGATTATCTTTTGCCGCATGAAAGTCGGAGCCGACCGTATCACTCGATGGCTACAAGAGCGCGGGCTAAGCTGCGGAGCGATGCATTCGAATTTGCCGCAAAAAGCGCGCACCAAGGCATTG
13+
>k141_8 flag=1 multi=1.0000 len=446
14+
ACGGCCACGAATAACATCATTCAGGTCACGCTTTGGCATCAATCCCAATTGTACATCAACATTGGCAAGGCCTAAATCACCGTCAAAAAGAAGTACTTTTTTACCTGCTCTAGCAAGTGCATGGGAGAACGAAATAGAAAACCATGTTTTACCAACGCCGCCTTTACCGCTTGCGACAGCAATAACATTCTTACCGTAAGGGAATGTGCCACCCGTCTTTAGTGTATCCGCATCTTTTGTTGCTTGTTGATCGACCATATTACTCACCTTGCCTTGTTGTCTTCTTGACCTTGTATTCTGCCTCTTGCTTGGTTTGAGGCATGAAATAATTCGTTAAAATTTCAGCATTTAATTCTGTTAAACCATCGGCTACGGCCTCTGTATGAGAGCCTTGTGTGAAGGCCATACCGCCCTTTTCTGCCGCTGATAATAAACCACCTAAACGA
15+
>k141_5 flag=1 multi=1.0000 len=373
16+
AAGGATGGAGATTTTAAATGTGCTTGTTCTGATGCATGTTCTTCAGGTGCAAGGTTTTTGGAGACGTTAATGATAAATCAAGTATAGTGTCAGGCCTTAAAGAAGATGAAAGAATGTATCACTTGCTTGAAAGTGTTGGTACAAAACCCAACGTGATGTATCAAACTAAAGTTAGAAATACCGACGAAACCTTGAAATAGAAAAACGAACCGTAAGGCATCTCATTACGAAGCACCAATAAGAAAACCTCTAGTAACAGGTGATAAAAGTTATCATGACGTAACTATTGATGTCGCTGCTGTGGTTGAGGGAAAAGCCAATCGTCAATGGTGGATTGTTTTTGGAATTGCTTTGACTGCCTTTCTATGGGGAC
17+
>k141_3 flag=1 multi=1.0000 len=480
18+
AAATTTCTGATATACAGTGTTACGCACTCCGCGCCGATATAAGTCGTTCGATAGGATTTTCTCAGAGCTATATTGTTTCAGAAAGTGGTCGGGCAATTGTAGCGCCCCATTCTATATTGGTTACCAATGTGTGTGATCGAATATCTAAAAAATCGGACTCCAATATATTAATAAATAATAAAACAGGTAATCCTTTACTGCAAGAATTGGAATCAATTCTCATCAATGATCATTATTCAAGCCTTTTAGAGAAATAGTTATTAAAGTTTGGTCCCGGCTTCATCCCTAAACTAATTAAGTGACGACCCATAAGTAGTGGTTTAGGTGCATTCGATTGTATGTGCAAAGCTTCTGCCTTTTTTAGTAGCCATTCGCCTTCAGGGAATTCTATTGTATTTATTGGAGGGCGACCTAATTGATCGGCGCAAGAAACACGCACTAATCGATCGACTCGTTGTACACTTGCAGCAAGTCTACGAA
19+
>k141_1 flag=1 multi=1.0000 len=336
20+
ACTTTACCACCGGCCAAAACTTCTTGATTTTCTTCGTCGAAAGTAGCTTTAAGCCCTGTTCTCATCGATGCAGTAGTCATAATATTAGCAATAGAGTGGTTATATCCAGCCTCGACAGGTGCGTTAGTTTTTTCTCTAGAACGAACACATTCCAGCCAGTTGAGCATATGATTAGACGTCATGTTGTCTACTCCAGTATTTGCTCTAGTAACAACTTTGGCTTCACTACTTGATAAAGAAAATTCTTTTAAATTATTAGGTTCCATACCCATTATACTTGCTTGGCGCTTAGAAAGTCCTCCATTAGGTGTAACTTTATTTGTCTTCATATTGAGT
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
>k145_4 flag=3 multi=1.0108 len=234
2+
AGTAGCCAGTAAGAAGACTACTACTAAAAAGCCAGTTGCTAAAAAAACTGCTAAGAAAAAGAACCCCATAGCAAAGAAGGCTGTCGCGAAAAAAGTAGCCAGTAAGAAGACTACTACTAAAAAGCCAGTTGCTAAAAAAACTGCTAAGAAAAAGAACCCCATAGCAAAGAAGGCTGTCGCGAAAAAAGTAGCCAGTAAGAAGACTACTACTAAAAAGCCAGTTGCTAAAAAAAC
3+
>k145_6 flag=1 multi=2.0000 len=350
4+
TTGCAAGCGTTTGGATAAGGTTACGCTGTGCACATCATTGACAAGTACTTGCTGTAACTTAGCTAAAAGAACATCAAATTCTTTGTCGTCTGTTTTTGTTTTTTCTTCGCTTTTATCTTCTGCTGTCCCAAGGTTTTCAAGATCAGCGCCCGCTCGCGTTACAGATTTAAAGTTTAAACCATCATAGCTTAGATTATGTTGAGCCAAAACTCATCAACGGTGTCTGTCATAAACAGAACATTAATACCTCGTGCTTTGAAGCCTTCTAATTGTGGCGCATTTTTTACGCTGGCAATAGAATCTCCAGAAATGTAATAGATATCTTTCTGATTCTCGGGCATTTTTTCTTT
5+
>k145_7 flag=1 multi=1.0000 len=309
6+
ACAATGGACGTATTTGACTACGTTGAGGATTCTATCACGCATCAAGAACCTGACTTCGAAACCTTTCTTTTAGTCCATGCGGTTCGCCGAGATTACTTCTTACAAAAGCAAGGAGGCGTGCCCTCAAAACAACGTATTGATTCGACAAAACTTCCCGCCAAACAAAACAAAGTGCGTGAAATTGAATGGTTGCCACGTCTTCTGCCAAAAGCACTAGCAAAGCTTCGTGGAGAGCTGCCTTCAGAAACAATGTATGGTTGCGGTGGCGATCGGCGATTTTTTAAAACCCACAACATTCACCCCGCGGAA
7+
>k145_8 flag=1 multi=1.0000 len=316
8+
ACAATATAGCAACGGTTATTGAGGATCGCAGGATTGGGACTACAGTGACCAATAATTCTTCCCTGCTCGGCTTCGTAACCGGTCTCCTCTTTAAGCTCACGAAGTCCAGCTACTAGCGGATCCTCTCCGGCATCCACAATCCCGCCGGGTAATTCCCAAGATAGTGCATCCATCCCCCAGCGAAATTGGCGTACCATCACCAGTTCACCTGCTACAGTTCGCGCCACGACAATTACCCAATCACGCGAATCTAAATAATAAAATTCACCCTCTTTACCATTATGTGGATGTCGGTAGCGCCGCGCGCGCAAATCCC

q2_assembly/tests/data/contigs-with-empty/sample3_contigs.fa

Whitespace-only changes.

0 commit comments

Comments
 (0)