Skip to content

Commit 2dbb69f

Browse files
DorielaGrabockaebolyenmisialq
authored
ENH: add action to collate genomes to GenomeData[DNASequences] (#99)
* ENH: action to convert FeatureData to Genome data added * ENH: action to convert FeatureData to Genome data refined * ENH: action to convert FeatureData to Genome data tests added * ENH: set up test paths added * ENH: imports modified * REF: fix private imports for q2-types refactor (#95) * ENH: changes requested done * ENH: imports modified * ENH: description modified * Update q2_assembly/plugin_setup.py * ENH: tests modified * ENH: tests modified * ENH: checking for duplicate IDs code added * ENH: accounting for duplicates case * ENH: accounting for duplicates case, modifying tests to check file content * ENH: code refactoring * ENH: tests corrected * Update q2_assembly/helpers/helpers.py --------- Co-authored-by: Evan Bolyen <[email protected]> Co-authored-by: Michal Ziemski <[email protected]>
1 parent 789cc26 commit 2dbb69f

File tree

9 files changed

+229
-1
lines changed

9 files changed

+229
-1
lines changed

q2_assembly/helpers/helpers.py

+52
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,14 @@
99
import os
1010
import shutil
1111
import warnings
12+
from typing import Union
13+
from warnings import warn
1214

1315
import numpy as np
16+
import skbio.io
1417
from q2_types.bowtie2 import Bowtie2IndexDirFmt
18+
from q2_types.feature_data import DNAFASTAFormat, DNAIterator
19+
from q2_types.genome_data import GenomeSequencesDirectoryFormat
1520
from q2_types.per_sample_sequences import BAMDirFmt, ContigSequencesDirFmt
1621
from qiime2.util import duplicate
1722

@@ -109,3 +114,50 @@ def collate_alignments(alignments: BAMDirFmt) -> BAMDirFmt:
109114
duplicate(_alignment, os.path.join(collated_alignments.path, filename))
110115

111116
return collated_alignments
117+
118+
119+
def collate_genomes(
120+
genomes: Union[DNAFASTAFormat, GenomeSequencesDirectoryFormat],
121+
on_duplicates: str = "warn",
122+
) -> GenomeSequencesDirectoryFormat:
123+
genomes_dir = GenomeSequencesDirectoryFormat()
124+
error_on_duplicates = True if on_duplicates == "error" else False
125+
ids = set()
126+
duplicate_ids = set()
127+
msg = "Duplicate sequence files were found for the following IDs: {}."
128+
if isinstance(genomes[0], DNAFASTAFormat):
129+
for genome_file in genomes:
130+
for genome in genome_file.view(DNAIterator):
131+
fn = genome.metadata["id"]
132+
if fn not in ids:
133+
with open(os.path.join(genomes_dir.path, fn + ".fasta"), "w") as f:
134+
skbio.io.write(genome, format="fasta", into=f)
135+
ids.add(fn)
136+
else:
137+
duplicate_ids.add(fn)
138+
if error_on_duplicates:
139+
raise ValueError(msg.format(", ".join(duplicate_ids)))
140+
141+
else:
142+
for genome in genomes:
143+
for fp in genome.path.iterdir():
144+
fn = os.path.basename(fp)
145+
if fn not in ids:
146+
shutil.copyfile(
147+
fp,
148+
os.path.join(genomes_dir.path, fn),
149+
)
150+
ids.add(fn)
151+
else:
152+
duplicate_ids.add(fn)
153+
if error_on_duplicates:
154+
raise ValueError(msg.format(", ".join(duplicate_ids)))
155+
156+
if duplicate_ids:
157+
warn(
158+
msg.format(", ".join(sorted(duplicate_ids)))
159+
+ " The latest occurrence will overwrite all previous "
160+
"occurrences for each corresponding ID."
161+
)
162+
163+
return genomes_dir

q2_assembly/plugin_setup.py

+20
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,26 @@
418418
description="Not to be called directly. Used by map_reads.",
419419
)
420420

421+
plugin.methods.register_function(
422+
function=q2_assembly.helpers.collate_genomes,
423+
inputs={"genomes": List[FeatureData[Sequence]] | List[GenomeData[DNASequence]]},
424+
parameters={"on_duplicates": Str % Choices(["error", "warn"])},
425+
outputs={"collated_genomes": GenomeData[DNASequence]},
426+
input_descriptions={"genomes": "A list of genomes to be collated."},
427+
parameter_descriptions={
428+
"on_duplicates": "Preferred behaviour when duplicated genome IDs "
429+
'are encountered: "warn" displays a warning and '
430+
"continues with the combination of the genomes "
431+
'while "error" raises an error and aborts further '
432+
"execution."
433+
},
434+
output_descriptions={"collated_genomes": "The converted genomes."},
435+
name="Convert a list of FeatureData[Sequence] or a list of GenomeData[DNASequence] "
436+
"to GenomeData[DNASequence].",
437+
description="This method converts a list of FeatureData[Sequence] or a list of "
438+
"GenomeData[DNASequence] to a GenomeData[DNASequence] artifact.",
439+
)
440+
421441
plugin.register_semantic_types(QUASTResults)
422442
plugin.register_semantic_type_to_format(
423443
QUASTResults, artifact_format=QUASTResultsDirectoryFormat
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
>ref1 d_Bacteria_1
2+
ACGTACGT
3+
>ref2 d_Bacteria_2
4+
CGTCGTCC
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
>ref5 d_Bacteria_3
2+
ACGTACGT
3+
>ref6 d_Bacteria_4
4+
CGTCGTCC
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
>ref1
2+
ACGTTACGT
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
>ref2
2+
ACGGGTACT
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
>ref3
2+
ACGTTACGT

q2_assembly/tests/test_helpers.py

+140-1
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,19 @@
1313
import tempfile
1414
import unittest
1515
import uuid
16+
import warnings
1617
from unittest.mock import ANY, call, patch
1718

1819
import shortuuid
1920
import skbio
2021
from parameterized import parameterized
22+
from q2_types.feature_data import DNAFASTAFormat
23+
from q2_types.genome_data import GenomeSequencesDirectoryFormat
2124
from q2_types.per_sample_sequences import ContigSequencesDirFmt
2225
from qiime2.plugin.testing import TestPluginBase
26+
from qiime2.plugins import assembly
2327

24-
from q2_assembly.helpers.helpers import rename_contigs
28+
from q2_assembly.helpers.helpers import collate_genomes, rename_contigs
2529

2630

2731
class TestUtils(TestPluginBase):
@@ -57,6 +61,141 @@ def test_is_valid_shortuuid(self):
5761
self.assertTrue(self.is_valid_shortuuid(true_shortuuid))
5862
self.assertFalse(self.is_valid_shortuuid(false_shortuuid))
5963

64+
@parameterized.expand(["single", "multiple"])
65+
def test_collate_genomes_dnafastaformat(self, input):
66+
genomes1 = DNAFASTAFormat(
67+
self.get_data_path("dna-fasta-format/dna-sequences1.fasta"), "r"
68+
)
69+
genomes2 = DNAFASTAFormat(
70+
self.get_data_path("dna-fasta-format/dna-sequences2.fasta"), "r"
71+
)
72+
if input == "single":
73+
genomes = [genomes1]
74+
content = {
75+
"ref1": {"description": "d_Bacteria_1", "sequence": "ACGTACGT"},
76+
"ref2": {"description": "d_Bacteria_2", "sequence": "CGTCGTCC"},
77+
}
78+
exp_files = ["ref1.fasta", "ref2.fasta"]
79+
else:
80+
genomes = [genomes1, genomes2]
81+
content = {
82+
"ref1": {"description": "d_Bacteria_1", "sequence": "ACGTACGT"},
83+
"ref2": {"description": "d_Bacteria_2", "sequence": "CGTCGTCC"},
84+
"ref5": {"description": "d_Bacteria_3", "sequence": "ACGTACGT"},
85+
"ref6": {"description": "d_Bacteria_4", "sequence": "CGTCGTCC"},
86+
}
87+
exp_files = ["ref1.fasta", "ref2.fasta", "ref5.fasta", "ref6.fasta"]
88+
89+
collated_genomes = collate_genomes(genomes=genomes)
90+
actual_files = sorted(os.listdir(collated_genomes.path))
91+
self.assertEqual(actual_files, exp_files)
92+
93+
for fn in actual_files:
94+
fp = os.path.join(collated_genomes.path, fn)
95+
with open(fp, "r") as fasta_file:
96+
for seq in skbio.io.read(fasta_file, "fasta"):
97+
actual_id = seq.metadata["id"]
98+
actual_description = seq.metadata["description"]
99+
actual_sequence = str(seq)
100+
expected_id = fn.split(".")[0]
101+
expected_desc = content[expected_id]["description"]
102+
expected_sequence = content[expected_id]["sequence"]
103+
104+
self.assertEquals(actual_id, expected_id)
105+
self.assertEqual(actual_description, expected_desc)
106+
self.assertEqual(actual_sequence, expected_sequence)
107+
108+
def test_collate_genomes_genome_dir_multiple(self):
109+
genomes1 = GenomeSequencesDirectoryFormat(
110+
self.get_data_path("genomes-dir-format1"), "r"
111+
)
112+
genomes2 = GenomeSequencesDirectoryFormat(
113+
self.get_data_path("genomes-dir-format2"), "r"
114+
)
115+
genomes = [genomes1, genomes2]
116+
collated_genomes = collate_genomes(genomes=genomes)
117+
exp_files = ["ref1.fasta", "ref2.fasta", "ref3.fasta"]
118+
actual_files = sorted(os.listdir(collated_genomes.path))
119+
self.assertEqual(exp_files, actual_files)
120+
121+
def test_collate_genomes_mix(self):
122+
# should throw TypeError
123+
genomes1 = DNAFASTAFormat(
124+
self.get_data_path("dna-fasta-format/dna-sequences1.fasta"), "r"
125+
)
126+
genomes2 = GenomeSequencesDirectoryFormat(
127+
self.get_data_path("genomes-dir-format2"), "r"
128+
)
129+
genomes = [genomes2, genomes1]
130+
with self.assertRaises(TypeError):
131+
assembly.methods.collate_genomes(genomes=genomes)
132+
133+
@parameterized.expand(["GenomeData", "DNAFASTAFormat"])
134+
def test_collate_genomes_dnafastaformat_multiple_duplicates_warn(self, dir_fmt):
135+
duplicate_ids = (
136+
["ref1.fasta", "ref2.fasta"]
137+
if dir_fmt == "GenomeData"
138+
else ["ref1", "ref2"]
139+
)
140+
warn_msg = (
141+
"Duplicate sequence files were found for the following IDs: {}. "
142+
"The latest occurrence will overwrite all previous occurrences "
143+
"for each corresponding ID."
144+
).format(", ".join(duplicate_ids))
145+
if dir_fmt == "GenomeData":
146+
genomes1 = GenomeSequencesDirectoryFormat(
147+
self.get_data_path("genomes-dir-format1"), "r"
148+
)
149+
else:
150+
genomes1 = DNAFASTAFormat(
151+
self.get_data_path("dna-fasta-format/dna-sequences1.fasta"), "r"
152+
)
153+
with warnings.catch_warnings(record=True) as w:
154+
collated_genomes = collate_genomes(genomes=[genomes1, genomes1])
155+
exp_files = ["ref1.fasta", "ref2.fasta"]
156+
actual_files = sorted(os.listdir(collated_genomes.path))
157+
self.assertEqual(actual_files, exp_files)
158+
self.assertEqual(warn_msg, str(w[0].message))
159+
160+
if dir_fmt == "DNAFASTAFormat":
161+
content = {
162+
"ref1": {"description": "d_Bacteria_1", "sequence": "ACGTACGT"},
163+
"ref2": {"description": "d_Bacteria_2", "sequence": "CGTCGTCC"},
164+
}
165+
166+
for fn in actual_files:
167+
fp = os.path.join(collated_genomes.path, fn)
168+
with open(fp, "r") as fasta_file:
169+
for seq in skbio.io.read(fasta_file, "fasta"):
170+
actual_id = seq.metadata["id"]
171+
actual_description = seq.metadata["description"]
172+
actual_sequence = str(seq)
173+
expected_id = fn.split(".")[0]
174+
expected_desc = content[expected_id]["description"]
175+
expected_sequence = content[expected_id]["sequence"]
176+
177+
self.assertEquals(actual_id, expected_id)
178+
self.assertEqual(actual_description, expected_desc)
179+
self.assertEqual(actual_sequence, expected_sequence)
180+
181+
@parameterized.expand(["GenomeData", "DNAFASTAFormat"])
182+
def test_collate_genomes_duplicates_error(self, dir_fmt):
183+
duplicate_ids = ["ref3.fasta"] if dir_fmt == "GenomeData" else ["ref1"]
184+
error_msg = (
185+
"Duplicate sequence files were found for the "
186+
"following IDs: %s." % ", ".join(duplicate_ids)
187+
)
188+
if dir_fmt == "GenomeData":
189+
genomes1 = GenomeSequencesDirectoryFormat(
190+
self.get_data_path("genomes-dir-format2"), "r"
191+
)
192+
else:
193+
genomes1 = DNAFASTAFormat(
194+
self.get_data_path("dna-fasta-format/dna-sequences1.fasta"), "r"
195+
)
196+
with self.assertRaisesRegex(ValueError, error_msg):
197+
_ = collate_genomes(genomes=[genomes1, genomes1], on_duplicates="error")
198+
60199
@parameterized.expand(
61200
[
62201
("uuid4", UUID4_REGEX),

setup.py

+3
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@
4343
"data/zip_test_data/expected/*/*/*/*",
4444
"data/formatted-reads/single-end/*",
4545
"data/formatted-reads/paired-end/*",
46+
"data/dna-fasta-format/*",
47+
"data/genomes-dir-format1/*",
48+
"data/genomes-dir-format2/*",
4649
],
4750
"q2_assembly.bowtie2.tests": [
4851
"data/*",

0 commit comments

Comments
 (0)