From 3963d920533bdeafd84c1ac69fbd1182a82847a0 Mon Sep 17 00:00:00 2001 From: Kirill Bessonov Date: Thu, 7 Nov 2024 18:04:10 -0500 Subject: [PATCH] --longreads option added for FASTQ long reads inputs such as PacBio, ONT, etc. Gives better mapping results --- ectyper/commandLineOptions.py | 7 +++++++ ectyper/genomeFunctions.py | 8 +++++--- test/test_O_serotyping.py | 2 +- test/test_complex_inputs.py | 2 +- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/ectyper/commandLineOptions.py b/ectyper/commandLineOptions.py index 3ff2308..9dc3ab5 100644 --- a/ectyper/commandLineOptions.py +++ b/ectyper/commandLineOptions.py @@ -61,6 +61,13 @@ def checkdbversion(): nargs="+" ) + parser.add_argument( + "--longreads", + action="store_true", + default=False, + help="Enable for raw long reads FASTQ inputs (ONT, PacBio, other sequencing platforms). [default %(default)s]" + ) + parser.add_argument( "--maxdirdepth", help="Maximum number of directories to descend when searching an input directory of files [default %(default)s levels]. Only works on path inputs not containing '*' wildcard", diff --git a/ectyper/genomeFunctions.py b/ectyper/genomeFunctions.py index ebef144..c15c8b2 100644 --- a/ectyper/genomeFunctions.py +++ b/ectyper/genomeFunctions.py @@ -211,7 +211,7 @@ def create_bowtie_base(temp_dir, reference, cores): return bowtie_base -def assemble_reads(reads, bowtie_base, combined_fasta, temp_dir, cores=1): +def assemble_reads(reads, bowtie_base, combined_fasta, temp_dir, cores=1, longreads=False): """ Assembles fastq reads to the specified reference file. :param reads: The fastq file to assemble @@ -232,7 +232,6 @@ def assemble_reads(reads, bowtie_base, combined_fasta, temp_dir, cores=1): bowtie_run = [ 'bowtie2', '--threads',f'{cores}', - '--local', '--score-min L,1,-0.5', '--np 5', '--no-unal', @@ -240,6 +239,8 @@ def assemble_reads(reads, bowtie_base, combined_fasta, temp_dir, cores=1): '-U', reads, '-S', sam_reads ] + if longreads == True: #for nanopore reads do local alignment as long reads are longer than references + bowtie_run.append('--local') subprocess_util.run_subprocess(bowtie_run) @@ -377,7 +378,8 @@ def assemble_fastq(raw_files_dict, temp_dir, combined_fasta, bowtie_base, args): bowtie_base=bowtie_base, combined_fasta=combined_fasta, temp_dir=temp_dir, - cores=cores) + cores=cores, + longreads=args.longreads) all_fasta_files_dict = dict.fromkeys(raw_files_dict['fasta']) #add assembled genomes as new keys with Pool(processes=args.cores) as pool: diff --git a/test/test_O_serotyping.py b/test/test_O_serotyping.py index e5e2915..b36ea97 100644 --- a/test/test_O_serotyping.py +++ b/test/test_O_serotyping.py @@ -138,7 +138,7 @@ def test_Ecoli_O17H18(caplog): rows = outfp.readlines() secondrow=rows[1:][0] #check only second row assert "Escherichia coli" in secondrow.split('\t') - assert "O17/O77/O44/O106\tH18\tO17/O77/O44/O106:H18\tWARNING MIXED O-TYPE" in secondrow + assert "O17/O44/O77/O106\tH18\tO17/O44/O77/O106:H18\tWARNING MIXED O-TYPE" in secondrow def test_download_refseq_mash(caplog, tmpdir): caplog.set_level(logging.DEBUG) diff --git a/test/test_complex_inputs.py b/test/test_complex_inputs.py index c8bb97e..6fc4c9b 100644 --- a/test/test_complex_inputs.py +++ b/test/test_complex_inputs.py @@ -91,7 +91,7 @@ def test_multiple_inputs(caplog): output_tsv_lines = fp.readlines() with open(output_blastn_antigens) as fp: output_blastn_antigens_lines = fp.readlines() - assert any([True if 'O17/O77/O44/O106:H18' in line else False for line in output_tsv_lines]), "No matches of 'O17/O77/O44/O106:H18' serotype" + assert any([True if 'O17/O44/O77/O106:H18' in line else False for line in output_tsv_lines]), "No matches of 'O17/O44/O77/O106:H18' serotype" assert any([True if 'O28/O42:H25' in line else False for line in output_tsv_lines]), "No matches of 'O28/O42:H25' serotype" assert any([True if 'EscherichiaO17H18' in line else False for line in output_blastn_antigens_lines]), "No matches of 'EscherichiaO17H18' in BLAST output" assert any([True if 'EscherichiaO28H5' in line else False for line in output_blastn_antigens_lines]), "No matches of 'EscherichiaO28H5' in BLAST output"