Skip to content

Commit 789cc26

Browse files
ENH: QUAST outputs downloaded genome artifacts (#96)
* ENH: references modified to accept GenomeData[DNASequence] * ENH: description modified * ENH: tests on quast done * ENH: sorting of refs before processing added * ENH: move references test modified * ENH: changes of review made * ENH: sorting reference files list (required for tests) * ENH: coverage improved * ENH: changes requested done * ENH: genome files named after genome ids, test modified * ENH: error cases modified, tests modified * ENH: modified warnings and imports, modified tests * ENH: modified description in genomes_dir * ENH: tests separated * ENH: tests refactored --------- Co-authored-by: Michal Ziemski <[email protected]>
1 parent 8933fac commit 789cc26

File tree

3 files changed

+293
-71
lines changed

3 files changed

+293
-71
lines changed

q2_assembly/plugin_setup.py

+21-7
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,15 @@
1111
from q2_types.feature_data import FeatureData, Sequence
1212
from q2_types.feature_data_mag import MAG, Contig
1313
from q2_types.feature_table import FeatureTable, Frequency
14+
from q2_types.genome_data import DNASequence, GenomeData
1415
from q2_types.per_sample_sequences import (
16+
AlignmentMap,
1517
Contigs,
1618
MAGs,
1719
PairedEndSequencesWithQuality,
1820
SequencesWithQuality,
1921
SingleBowtie2Index,
2022
)
21-
from q2_types.per_sample_sequences import AlignmentMap
2223
from q2_types.sample_data import SampleData
2324
from qiime2.core.type import Bool, Choices, Properties, Str, TypeMap, Visualization
2425
from qiime2.plugin import Citations, Collection, Int, List, Plugin, Range
@@ -156,21 +157,27 @@
156157
inputs={
157158
"contigs": SampleData[Contigs],
158159
"reads": SampleData[SequencesWithQuality | PairedEndSequencesWithQuality],
159-
"references": List[FeatureData[Sequence]],
160+
"references": GenomeData[DNASequence],
160161
"mapped_reads": SampleData[AlignmentMap],
161162
},
162-
parameters=quast_params,
163+
parameters={**quast_params, "genomes_dir": Str},
163164
input_descriptions={
164165
"contigs": "Assembled contigs to be analyzed.",
165166
"reads": "Original single- or paired-end reads.",
166167
"references": "Reference genomes to align the assembled contigs against.",
167168
"mapped_reads": "Reads-to-contigs alignment maps (alternative to 'reads')."
168169
"directly.",
169170
},
170-
parameter_descriptions=quast_param_descriptions,
171+
parameter_descriptions={
172+
**quast_param_descriptions,
173+
"genomes_dir": "Path of the directory from which GenomeData[DNASequence] "
174+
"will be created.",
175+
},
171176
name="Visualize the quality of the assembled contigs after using metaQUAST.",
172177
description="This method visualizes the results of metaQUAST after "
173-
"assessing the quality of assembled metagenomes.",
178+
"assessing the quality of assembled metagenomes. WARNING: This action "
179+
"should not be used as a standalone-action. It is designed to be called "
180+
"by the evaluate-contigs action!",
174181
citations=[citations["Mikheenko2016"], citations["Mikheenko2018"]],
175182
)
176183

@@ -179,11 +186,15 @@
179186
inputs={
180187
"contigs": SampleData[Contigs],
181188
"reads": SampleData[SequencesWithQuality | PairedEndSequencesWithQuality],
182-
"references": List[FeatureData[Sequence]],
189+
"references": GenomeData[DNASequence],
183190
"mapped_reads": SampleData[AlignmentMap],
184191
},
185192
parameters=quast_params,
186-
outputs={"results_table": QUASTResults, "visualization": Visualization},
193+
outputs=[
194+
("results_table", QUASTResults),
195+
("visualization", Visualization),
196+
("reference_genomes", GenomeData[DNASequence]),
197+
],
187198
input_descriptions={
188199
"contigs": "Assembled contigs to be analyzed.",
189200
"reads": "Original single- or paired-end reads.",
@@ -195,6 +206,9 @@
195206
output_descriptions={
196207
"results_table": "QUAST result table.",
197208
"visualization": "Visualization of the QUAST results.",
209+
"reference_genomes": "Genome sequences downloaded by QUAST. NOTE: If the user "
210+
"provides the sequences as input, then this artifact "
211+
"will be the input artifact.",
198212
},
199213
name="Evaluate quality of the assembled contigs using metaQUAST.",
200214
description="This method uses metaQUAST to assess the quality of "

q2_assembly/quast/quast.py

+70-18
Original file line numberDiff line numberDiff line change
@@ -14,18 +14,22 @@
1414
import tempfile
1515
from distutils.dir_util import copy_tree
1616
from typing import List, Union
17+
from warnings import warn
1718
from zipfile import ZipFile
1819

1920
import pandas as pd
2021
import pkg_resources
2122
import q2templates
23+
import skbio
2224
from q2_types.feature_data import DNAFASTAFormat, DNAIterator
25+
from q2_types.genome_data import GenomeSequencesDirectoryFormat
2326
from q2_types.per_sample_sequences import (
2427
BAMDirFmt,
2528
ContigSequencesDirFmt,
2629
SingleLanePerSamplePairedEndFastqDirFmt,
2730
SingleLanePerSampleSingleEndFastqDirFmt,
2831
)
32+
from qiime2.core.exceptions import ValidationError
2933

3034
from q2_assembly.quast.utils import _parse_columns
3135

@@ -87,7 +91,7 @@ def _evaluate_contigs(
8791
contigs: ContigSequencesDirFmt,
8892
reads: dict,
8993
paired: bool,
90-
references: List[DNAFASTAFormat],
94+
references: GenomeSequencesDirectoryFormat,
9195
mapped_reads: BAMDirFmt,
9296
common_args: list,
9397
) -> List[str]:
@@ -153,17 +157,9 @@ def _evaluate_contigs(
153157
)
154158

155159
if references:
156-
all_refs_dir = os.path.join(results_dir, "references")
157-
os.makedirs(all_refs_dir, exist_ok=True)
158-
all_ref_fps = []
159-
# we need to split the references into separate files so that QUAST
160-
# can correctly display alignment details per reference (otherwise it
161-
# will show those as if all the provided sequences belonged to a single
162-
# reference
163-
for ref in references:
164-
all_ref_fps.extend(_split_reference(ref, all_refs_dir))
165-
for fp in all_ref_fps:
166-
cmd.extend(["-r", fp])
160+
files = sorted(os.listdir(references.path))
161+
for fp in files:
162+
cmd.extend(["-r", os.path.join(references.path, fp)])
167163

168164
try:
169165
run_command(cmd, verbose=True)
@@ -223,6 +219,16 @@ def _zip_additional_reports(path_to_dirs: list, output_filename: str) -> None:
223219
_zip_dir(zipf, directory)
224220

225221

222+
def _move_references(src, dest):
    """Write every sequence found in the FASTA files of ``src`` into ``dest``.

    Each file in ``src`` whose name matches ``*.fa*`` is parsed as FASTA with
    scikit-bio. Every sequence read is written to its own file in ``dest``,
    named ``<sequence id>.fasta``.

    The source files are left in place; nothing is deleted here. An output
    file is opened in write mode, so a later sequence with the same id
    replaces an earlier one.

    Parameters
    ----------
    src
        Path of the directory that holds the FASTA files to read.
    dest
        Path of the directory that receives the per-sequence FASTA files.
    """
    fasta_pattern = os.path.join(src, "*.fa*")
    for fasta_fp in glob.glob(fasta_pattern):
        for record in skbio.io.read(fasta_fp, format="fasta"):
            # The output file is named after the id stored in the record's
            # metadata, not after the file it was read from.
            out_fp = os.path.join(dest, record.metadata["id"] + ".fasta")
            with open(out_fp, "w") as out_fh:
                skbio.io.write(record, format="fasta", into=out_fh)
230+
231+
226232
def _visualize_quast(
227233
output_dir: str,
228234
contigs: ContigSequencesDirFmt,
@@ -240,14 +246,23 @@ def _visualize_quast(
240246
reads: Union[
241247
SingleLanePerSamplePairedEndFastqDirFmt, SingleLanePerSampleSingleEndFastqDirFmt
242248
] = None,
243-
references: DNAFASTAFormat = None,
249+
references: GenomeSequencesDirectoryFormat = None,
250+
genomes_dir: str = None,
244251
mapped_reads: BAMDirFmt = None,
245252
) -> None:
246253
kwargs = {
247254
k: v
248255
for k, v in locals().items()
249256
if k
250-
not in ["output_dir", "contigs", "reads", "references", "mapped_reads", "ctx"]
257+
not in [
258+
"output_dir",
259+
"contigs",
260+
"reads",
261+
"references",
262+
"mapped_reads",
263+
"ctx",
264+
"genomes_dir",
265+
]
251266
}
252267

253268
common_args = _process_common_input_params(
@@ -291,6 +306,13 @@ def _visualize_quast(
291306
# Copy results to output dir
292307
copy_tree(results_dir, os.path.join(output_dir, "quast_data"))
293308

309+
# Save the downloaded references
310+
if not references:
311+
downloaded_references = os.path.join(
312+
results_dir, "quast_downloaded_references"
313+
)
314+
_move_references(downloaded_references, genomes_dir)
315+
294316
# Zip summary, not_aligned and runs_per_reference dirs for download
295317
dirnames = ["not_aligned", "runs_per_reference", "summary"]
296318
zip_these_dirs = [
@@ -365,10 +387,17 @@ def evaluate_contigs(
365387
ambiguity_score=0.99,
366388
):
367389
kwargs = {k: v for k, v in locals().items() if k not in ["contigs", "ctx"]}
390+
# 1. generate the visualization
391+
_visualize_quast = ctx.get_action("assembly", "_visualize_quast")
392+
genomes = references
368393
with tempfile.TemporaryDirectory() as tmp:
369-
# 1. generate the visualization
370-
_visualize_quast = ctx.get_action("assembly", "_visualize_quast")
371-
(visualization,) = _visualize_quast(contigs, **kwargs)
394+
if references:
395+
(visualization,) = _visualize_quast(contigs, **kwargs)
396+
else:
397+
genomes_dir = GenomeSequencesDirectoryFormat()
398+
(visualization,) = _visualize_quast(
399+
contigs, **kwargs, genomes_dir=str(genomes_dir.path)
400+
)
372401

373402
# 2. after the visualization is generated we need to export the files
374403
# to get the results table out
@@ -382,4 +411,27 @@ def evaluate_contigs(
382411
# 3. read it as a pandas dataframe then we create the QUASTResults
383412
tabular_results = ctx.make_artifact("QUASTResults", report_df)
384413

385-
return tabular_results, visualization
414+
# 4. create the Genomes
415+
if not references:
416+
try:
417+
genomes = ctx.make_artifact("GenomeData[DNASequence]", genomes_dir)
418+
except ValidationError as e:
419+
if "Missing one or more" in str(e): # no downloaded genomes
420+
warn(
421+
"QUAST did not download any genomes. The returned "
422+
"GenomeData[DNASequence] artifact is empty. Please check "
423+
"the network connection or provide the reference genomes. The "
424+
f"original error was '{e}'"
425+
)
426+
else: # corrupt files
427+
warn(
428+
"There was a problem with the genome files downloaded by "
429+
"QUAST. The returned GenomeData[DNASequence] artifact will "
430+
f"be empty. The original error was '{e}'"
431+
)
432+
433+
genomes_dir = GenomeSequencesDirectoryFormat()
434+
open(os.path.join(genomes_dir.path, "empty.fasta"), "w").close()
435+
genomes = ctx.make_artifact("GenomeData[DNASequence]", genomes_dir)
436+
437+
return tabular_results, visualization, genomes

0 commit comments

Comments
 (0)