diff --git a/Dockerfile b/Dockerfile index af327bbd0..9ba16134e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -85,10 +85,10 @@ RUN apt-get install -q -y zlib1g-dev libncurses5-dev libncursesw5-dev && \ ## Install dependencies for genetracks/drawsvg RUN apt-get install -q -y libcairo2-dev +RUN pip install --upgrade pip COPY . /opt/micall/ -RUN pip install --upgrade pip RUN pip install /opt/micall[basespace] RUN micall make_blast_db diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 31094ee76..723ff10ec 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -12,13 +12,13 @@ import logging from fractions import Fraction from operator import itemgetter -from aligntools import connect_cigar_hits, CigarHit +from aligntools import CigarHit, connect_nonoverlapping_cigar_hits, drop_overlapping_cigar_hits from micall.core.project_config import ProjectConfig from micall.core.plot_contigs import plot_stitcher_coverage from micall.utils.contig_stitcher_context import context, StitcherContext from micall.utils.contig_stitcher_contigs import GenotypedContig, AlignedContig -from micall.utils.alignment import Alignment, align_consensus +from micall.utils.alignment import align_consensus import micall.utils.contig_stitcher_events as events @@ -163,41 +163,41 @@ def align_to_reference(contig: GenotypedContig) -> Iterable[GenotypedContig]: yield contig return - def init_hit(x: Alignment) -> Tuple[CigarHit, Literal["forward", "reverse"]]: - cigar = x.to_cigar_hit() - return cigar, "forward" if x.strand == 1 else "reverse" - alignments, _algo = align_consensus(contig.ref_seq, contig.seq) - hits_array = [init_hit(x) for x in alignments] + hits = [x.to_cigar_hit() for x in alignments] + strands: List[Literal["forward", "reverse"]] = ["forward" if x.strand == 1 else "reverse" for x in alignments] - for i, (hit, strand) in enumerate(hits_array): + for i, (hit, strand) in enumerate(zip(hits, strands)): log(events.InitialHit(contig, i, hit, strand)) - if not hits_array: + if not hits: log(events.ZeroHits(contig)) yield contig return - if len(set(strand for hit, strand in hits_array)) > 1: + if len(set(strands)) > 1: log(events.StrandConflict(contig)) yield contig return - strand = hits_array[0][1] + strand = strands[0] if strand == "reverse": rc = str(Seq.Seq(contig.seq).reverse_complement()) original_contig = contig new_contig = replace(contig, seq=rc) contig = new_contig - hits_array = [(replace(hit, q_st=len(rc)-hit.q_ei-1, q_ei=len(rc)-hit.q_st-1), strand) - for hit, strand in hits_array] + hits = [replace(hit, q_st=len(rc)-hit.q_ei-1, q_ei=len(rc)-hit.q_st-1) for hit in hits] log(events.ReverseComplement(original_contig, new_contig)) - for i, (hit, strand) in enumerate(hits_array): + for i, (hit, strand) in enumerate(zip(hits, strands)): log(events.InitialHit(contig, i, hit, strand)) - connected = connect_cigar_hits([hit for hit, strand in hits_array]) if hits_array else [] - log(events.HitNumber(contig, hits_array, connected)) + def quality(x: CigarHit): + return x.ref_length + + filtered = list(drop_overlapping_cigar_hits(hits, quality)) + connected = list(connect_nonoverlapping_cigar_hits(filtered)) + log(events.HitNumber(contig, list(zip(hits, strands)), connected)) for i, single_hit in enumerate(connected): query = replace(contig, name=None) diff --git a/micall/tests/test_aln2counts_report.py b/micall/tests/test_aln2counts_report.py index a99604c85..1a869a261 100644 --- a/micall/tests/test_aln2counts_report.py +++ b/micall/tests/test_aln2counts_report.py @@ -2117,17 +2117,26 @@ def test_minimap_overlap(default_sequence_report, projects): # A,C,G,T expected_text = """\ -seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,A,C,G,T,N,del,ins,clip,v3_overlap,coverage -HIV1-B-FR-K03455-seed,INT,15,1,212,4441,9,0,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,2,213,4442,9,0,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,3,214,4443,0,0,9,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,4,215,4444,0,0,0,9,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,5,216,4445,0,0,0,9,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,6,217,4446,9,0,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,7,218,4447,0,0,0,9,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,8,219,4448,0,9,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,9,220,4449,0,9,0,0,0,0,0,0,0,9""" - +HIV1-B-FR-K03455-seed,INT,15,51,262,4491,0,0,9,0,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,INT,15,52,263,4492,0,0,0,9,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,INT,15,53,264,4493,0,0,0,9,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,INT,15,54,265,4494,9,0,0,0,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,INT,15,55,266,4495,0,0,0,9,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,INT,15,56,267,4496,0,0,0,9,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,INT,15,57,268,4497,0,9,0,0,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,INT,15,58,269,4498,0,9,0,0,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,INT,15,59,270,4499,9,0,0,0,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,INT,15,60,271,4500,0,0,9,0,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,RT,15,61,452,3001,9,0,0,0,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,RT,15,62,453,3002,0,0,9,0,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,RT,15,63,454,3003,0,0,9,0,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,RT,15,64,455,3004,0,0,9,0,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,RT,15,65,456,3005,9,0,0,0,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,RT,15,66,457,3006,0,0,0,9,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,RT,15,67,458,3007,0,0,9,0,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,RT,15,68,459,3008,0,0,9,0,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,RT,15,69,460,3009,9,0,0,0,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,RT,15,70,461,3010,9,0,0,0,0,0,0,0,0,9""" report_file = StringIO() default_sequence_report.write_nuc_header(report_file) default_sequence_report.read(aligned_reads) @@ -2135,11 +2144,11 @@ def test_minimap_overlap(default_sequence_report, projects): report = report_file.getvalue() report_lines = report.splitlines() - expected_size = 61 + expected_size = 121 if len(report_lines) != expected_size: assert (len(report_lines), report) == (expected_size, '') - key_lines = report_lines[0:10] + key_lines = report_lines[51:71] key_report = '\n'.join(key_lines) assert key_report == expected_text @@ -2147,26 +2156,24 @@ def test_minimap_overlap(default_sequence_report, projects): def test_minimap_overlap_at_start(default_sequence_report, projects): """ Actual overlaps cause blank query position. Check consensus offset. - In this example, the start of PR (first 6 nuleotides) appears twice, - so the shorter alignment is discarded. Make sure that the PR consensus - has the correct offset and doesn't crash. + In this example, the start of PR appears twice, so the consensus index + gets blanked. Make sure that the PR consensus has the correct offset and + doesn't crash. """ seed_name = 'HIV1-B-FR-K03455-seed' seed_seq = projects.getReference(seed_name) - first_part = seed_seq[2252:2400] - second_part = seed_seq[2000:2258] - _read_seq = first_part + second_part + read_seq = seed_seq[2252:2400] + seed_seq[2000:2258] # refname,qcut,rank,count,offset,seq aligned_reads = prepare_reads(f"""\ -HIV1-B-FR-K03455-seed,15,0,9,0,{second_part} +HIV1-B-FR-K03455-seed,15,0,9,0,{read_seq} """) expected_text = f"""\ seed,region,q-cutoff,consensus-percent-cutoff,seed-offset,region-offset,sequence -HIV1-B-FR-K03455-seed,,15,MAX,0,,{second_part} -HIV1-B-FR-K03455-seed,HIV1B-gag,15,MAX,0,1211,{second_part} -HIV1-B-FR-K03455-seed,PR,15,MAX,252,0,{second_part[-6:]} +HIV1-B-FR-K03455-seed,,15,MAX,0,,{read_seq} +HIV1-B-FR-K03455-seed,HIV1B-gag,15,MAX,0,1463,{read_seq} +HIV1-B-FR-K03455-seed,PR,15,MAX,0,0,{read_seq} """ report_file = StringIO() default_sequence_report.write_consensus_all_header(report_file) @@ -2824,7 +2831,7 @@ def test_write_sequence_coverage_counts_with_double_mapped_edges( hxb2_name = 'HIV1-B-FR-K03455-seed' ref = projects.getReference(hxb2_name) seq = ref[2858:2908] + ref[8187:8237] - expected_ref_positions = list(range(2859, 2908 + (8237-8187))) + expected_ref_positions = (list(range(2859, 2916)) + list(range(8196, 8238))) expected_query_positions = list(range(1, len(seq)+1)) report_file = StringIO() diff --git a/micall/tests/test_consensus_aligner.py b/micall/tests/test_consensus_aligner.py index 129d5bee6..a53e9ad98 100644 --- a/micall/tests/test_consensus_aligner.py +++ b/micall/tests/test_consensus_aligner.py @@ -195,19 +195,26 @@ def test_start_contig_overlapping_sections(projects): In this example, positions 1-60 of the read map to pos 4441-4500 of the reference. Positions 55-120 of the read map to pos 2995-3060 of the ref. - This way, positions 55-60 are in both alignments. We expect our aligner - to drop the alignment with lesser quality - (quality is calculated from `mapq`, `query_length`, and others) + Since positions 55-60 are in both alignments, remove them from the second + one. """ seed_name = 'HIV1-B-FR-K03455-seed' seed_seq = projects.getReference(seed_name) consensus = seed_seq[4440:4500] + seed_seq[3000:3060] reading_frames = create_reading_frames(consensus) int_ref = projects.getReference('INT') + rt_ref = projects.getReference('RT') aligner = ConsensusAligner(projects) aligner.start_contig(seed_name, reading_frames=reading_frames) + rt_aminos: typing.List[ReportAmino] = [] + rt_nucleotides: typing.List[ReportNucleotide] = [] + aligner.report_region(2550, + 4229, + rt_nucleotides, + rt_aminos, + amino_ref=rt_ref) int_aminos: typing.List[ReportAmino] = [] int_nucleotides: typing.List[ReportNucleotide] = [] aligner.report_region(4230, @@ -221,6 +228,11 @@ def test_start_contig_overlapping_sections(projects): list(range(3001, 3061)), 4230, 5096) + assert_consensus_nuc_indexes(rt_aminos, + list(range(4441, 4501)) + + list(range(3001, 3061)), + 2550, + 4229) # noinspection DuplicatedCode @@ -414,6 +426,28 @@ def test_start_contig_short_consensus(projects): assert aligner.algorithm == 'gotoh' +def test_start_contig_deletion_minimap2(projects): + seed_name = 'SARS-CoV-2-seed' + seed_seq = projects.getReference(seed_name) + consensus = seed_seq[2000:2030] + seed_seq[2031:2060] + expected_alignment = make_alignment(ctg='N/A', + ctg_len=len(seed_seq), + r_st=2000, + r_en=2060, + q_st=0, + q_en=59, + mapq=9, + cigar=[(30, CigarActions.MATCH), + (1, CigarActions.DELETE), + (29, CigarActions.MATCH)]) + aligner = ConsensusAligner(projects) + + aligner.start_contig(seed_name, consensus) + + assert_alignments(aligner, expected_alignment) + assert aligner.algorithm == 'minimap2' + + def test_start_contig_big_deletion_minimap2(projects): seed_name = 'HCV-1a' seed_seq = projects.getReference(seed_name) diff --git a/micall/utils/alignment.py b/micall/utils/alignment.py index cccf10c31..5da6bcba7 100644 --- a/micall/utils/alignment.py +++ b/micall/utils/alignment.py @@ -3,7 +3,7 @@ from operator import attrgetter from itertools import groupby -from aligntools import CigarActions, Cigar, CigarHit, connect_cigar_hits +from aligntools import CigarActions, Cigar, CigarHit, connect_nonoverlapping_cigar_hits from gotoh import align_it from mappy import Aligner import mappy @@ -114,7 +114,7 @@ def connect_alignments(alignments: Iterable[Alignment]) -> Iterator[Alignment]: for (strand, ctg, ctg_len), group_iter in stranded: group = list(group_iter) hits = list(map(Alignment.to_cigar_hit, group)) - connected_hits = connect_cigar_hits(hits) + connected_hits = connect_nonoverlapping_cigar_hits(hits) mapq = min(x.mapq for x in group) for hit in connected_hits: yield Alignment.from_cigar_hit(hit, diff --git a/pyproject.toml b/pyproject.toml index 6e3db0585..b2a817763 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ dependencies = [ "mappy==2.17", "drawsvg==2.3.0", "cairosvg==2.7.1", - "aligntools", + "aligntools==1.1.1", ] [project.optional-dependencies]