14
14
import tempfile
15
15
from distutils .dir_util import copy_tree
16
16
from typing import List , Union
17
+ from warnings import warn
17
18
from zipfile import ZipFile
18
19
19
20
import pandas as pd
20
21
import pkg_resources
21
22
import q2templates
23
+ import skbio
22
24
from q2_types .feature_data import DNAFASTAFormat , DNAIterator
25
+ from q2_types .genome_data import GenomeSequencesDirectoryFormat
23
26
from q2_types .per_sample_sequences import (
24
27
BAMDirFmt ,
25
28
ContigSequencesDirFmt ,
26
29
SingleLanePerSamplePairedEndFastqDirFmt ,
27
30
SingleLanePerSampleSingleEndFastqDirFmt ,
28
31
)
32
+ from qiime2 .core .exceptions import ValidationError
29
33
30
34
from q2_assembly .quast .utils import _parse_columns
31
35
@@ -87,7 +91,7 @@ def _evaluate_contigs(
87
91
contigs : ContigSequencesDirFmt ,
88
92
reads : dict ,
89
93
paired : bool ,
90
- references : List [ DNAFASTAFormat ] ,
94
+ references : GenomeSequencesDirectoryFormat ,
91
95
mapped_reads : BAMDirFmt ,
92
96
common_args : list ,
93
97
) -> List [str ]:
@@ -153,17 +157,9 @@ def _evaluate_contigs(
153
157
)
154
158
155
159
if references :
156
- all_refs_dir = os .path .join (results_dir , "references" )
157
- os .makedirs (all_refs_dir , exist_ok = True )
158
- all_ref_fps = []
159
- # we need to split the references into separate files so that QUAST
160
- # can correctly display alignment details per reference (otherwise it
161
- # will show those as if all the provided sequences belonged to a single
162
- # reference
163
- for ref in references :
164
- all_ref_fps .extend (_split_reference (ref , all_refs_dir ))
165
- for fp in all_ref_fps :
166
- cmd .extend (["-r" , fp ])
160
+ files = sorted (os .listdir (references .path ))
161
+ for fp in files :
162
+ cmd .extend (["-r" , os .path .join (references .path , fp )])
167
163
168
164
try :
169
165
run_command (cmd , verbose = True )
@@ -223,6 +219,16 @@ def _zip_additional_reports(path_to_dirs: list, output_filename: str) -> None:
223
219
_zip_dir (zipf , directory )
224
220
225
221
222
+ def _move_references (src , dest ):
223
+ for fp in glob .glob (os .path .join (src , "*.fa*" )):
224
+ seqs = skbio .io .read (fp , format = "fasta" )
225
+ for seq in seqs :
226
+ seq_id = seq .metadata ["id" ]
227
+ new_fp = os .path .join (dest , seq_id + ".fasta" )
228
+ with open (new_fp , "w" ) as f :
229
+ skbio .io .write (seq , format = "fasta" , into = f )
230
+
231
+
226
232
def _visualize_quast (
227
233
output_dir : str ,
228
234
contigs : ContigSequencesDirFmt ,
@@ -240,14 +246,23 @@ def _visualize_quast(
240
246
reads : Union [
241
247
SingleLanePerSamplePairedEndFastqDirFmt , SingleLanePerSampleSingleEndFastqDirFmt
242
248
] = None ,
243
- references : DNAFASTAFormat = None ,
249
+ references : GenomeSequencesDirectoryFormat = None ,
250
+ genomes_dir : str = None ,
244
251
mapped_reads : BAMDirFmt = None ,
245
252
) -> None :
246
253
kwargs = {
247
254
k : v
248
255
for k , v in locals ().items ()
249
256
if k
250
- not in ["output_dir" , "contigs" , "reads" , "references" , "mapped_reads" , "ctx" ]
257
+ not in [
258
+ "output_dir" ,
259
+ "contigs" ,
260
+ "reads" ,
261
+ "references" ,
262
+ "mapped_reads" ,
263
+ "ctx" ,
264
+ "genomes_dir" ,
265
+ ]
251
266
}
252
267
253
268
common_args = _process_common_input_params (
@@ -291,6 +306,13 @@ def _visualize_quast(
291
306
# Copy results to output dir
292
307
copy_tree (results_dir , os .path .join (output_dir , "quast_data" ))
293
308
309
+ # Save the downloaded references
310
+ if not references :
311
+ downloaded_references = os .path .join (
312
+ results_dir , "quast_downloaded_references"
313
+ )
314
+ _move_references (downloaded_references , genomes_dir )
315
+
294
316
# Zip summary, not_aligned and runs_per_reference dirs for download
295
317
dirnames = ["not_aligned" , "runs_per_reference" , "summary" ]
296
318
zip_these_dirs = [
@@ -365,10 +387,17 @@ def evaluate_contigs(
365
387
ambiguity_score = 0.99 ,
366
388
):
367
389
kwargs = {k : v for k , v in locals ().items () if k not in ["contigs" , "ctx" ]}
390
+ # 1. generate the visualization
391
+ _visualize_quast = ctx .get_action ("assembly" , "_visualize_quast" )
392
+ genomes = references
368
393
with tempfile .TemporaryDirectory () as tmp :
369
- # 1. generate the visualization
370
- _visualize_quast = ctx .get_action ("assembly" , "_visualize_quast" )
371
- (visualization ,) = _visualize_quast (contigs , ** kwargs )
394
+ if references :
395
+ (visualization ,) = _visualize_quast (contigs , ** kwargs )
396
+ else :
397
+ genomes_dir = GenomeSequencesDirectoryFormat ()
398
+ (visualization ,) = _visualize_quast (
399
+ contigs , ** kwargs , genomes_dir = str (genomes_dir .path )
400
+ )
372
401
373
402
# 2. after the visualization is generated we need to export the files
374
403
# to get the results table out
@@ -382,4 +411,27 @@ def evaluate_contigs(
382
411
# 3. read it as a pandas dataframe then we create the QUASTResults
383
412
tabular_results = ctx .make_artifact ("QUASTResults" , report_df )
384
413
385
- return tabular_results , visualization
414
+ # 4. create the Genomes
415
+ if not references :
416
+ try :
417
+ genomes = ctx .make_artifact ("GenomeData[DNASequence]" , genomes_dir )
418
+ except ValidationError as e :
419
+ if "Missing one or more" in str (e ): # no downloaded genomes
420
+ warn (
421
+ "QUAST did not download any genomes. The returned "
422
+ "GenomeData[DNASequence] artifact is empty. Please check "
423
+ "the network connection or provide the reference genomes. The "
424
+ f"original error was '{ e } '"
425
+ )
426
+ else : # corrupt files
427
+ warn (
428
+ "There was a problem with the genome files downloaded by "
429
+ "QUAST. The returned GenomeData[DNASequence] artifact will "
430
+ f"be empty. The original error was '{ e } '"
431
+ )
432
+
433
+ genomes_dir = GenomeSequencesDirectoryFormat ()
434
+ open (os .path .join (genomes_dir .path , "empty.fasta" ), "w" ).close ()
435
+ genomes = ctx .make_artifact ("GenomeData[DNASequence]" , genomes_dir )
436
+
437
+ return tabular_results , visualization , genomes
0 commit comments