Skip to content

Commit

Permalink
Merge branch 'master' into connect-minimap-hits
Browse files Browse the repository at this point in the history
  • Loading branch information
Donaim committed Oct 25, 2024
2 parents 56e7afc + 94bf6d6 commit f49bdab
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 47 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,10 @@ RUN apt-get install -q -y zlib1g-dev libncurses5-dev libncursesw5-dev && \

## Install dependencies for genetracks/drawsvg
RUN apt-get install -q -y libcairo2-dev
RUN pip install --upgrade pip

COPY . /opt/micall/

RUN pip install --upgrade pip
RUN pip install /opt/micall[basespace]
RUN micall make_blast_db

Expand Down
32 changes: 16 additions & 16 deletions micall/core/contig_stitcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@
import logging
from fractions import Fraction
from operator import itemgetter
from aligntools import connect_cigar_hits, CigarHit
from aligntools import CigarHit, connect_nonoverlapping_cigar_hits, drop_overlapping_cigar_hits

from micall.core.project_config import ProjectConfig
from micall.core.plot_contigs import plot_stitcher_coverage
from micall.utils.contig_stitcher_context import context, StitcherContext
from micall.utils.contig_stitcher_contigs import GenotypedContig, AlignedContig
from micall.utils.alignment import Alignment, align_consensus
from micall.utils.alignment import align_consensus
import micall.utils.contig_stitcher_events as events


Expand Down Expand Up @@ -163,41 +163,41 @@ def align_to_reference(contig: GenotypedContig) -> Iterable[GenotypedContig]:
yield contig
return

def init_hit(x: Alignment) -> Tuple[CigarHit, Literal["forward", "reverse"]]:
cigar = x.to_cigar_hit()
return cigar, "forward" if x.strand == 1 else "reverse"

alignments, _algo = align_consensus(contig.ref_seq, contig.seq)
hits_array = [init_hit(x) for x in alignments]
hits = [x.to_cigar_hit() for x in alignments]
strands: List[Literal["forward", "reverse"]] = ["forward" if x.strand == 1 else "reverse" for x in alignments]

for i, (hit, strand) in enumerate(hits_array):
for i, (hit, strand) in enumerate(zip(hits, strands)):
log(events.InitialHit(contig, i, hit, strand))

if not hits_array:
if not hits:
log(events.ZeroHits(contig))
yield contig
return

if len(set(strand for hit, strand in hits_array)) > 1:
if len(set(strands)) > 1:
log(events.StrandConflict(contig))
yield contig
return

strand = hits_array[0][1]
strand = strands[0]
if strand == "reverse":
rc = str(Seq.Seq(contig.seq).reverse_complement())
original_contig = contig
new_contig = replace(contig, seq=rc)
contig = new_contig
hits_array = [(replace(hit, q_st=len(rc)-hit.q_ei-1, q_ei=len(rc)-hit.q_st-1), strand)
for hit, strand in hits_array]
hits = [replace(hit, q_st=len(rc)-hit.q_ei-1, q_ei=len(rc)-hit.q_st-1) for hit in hits]

log(events.ReverseComplement(original_contig, new_contig))
for i, (hit, strand) in enumerate(hits_array):
for i, (hit, strand) in enumerate(zip(hits, strands)):
log(events.InitialHit(contig, i, hit, strand))

connected = connect_cigar_hits([hit for hit, strand in hits_array]) if hits_array else []
log(events.HitNumber(contig, hits_array, connected))
def quality(x: CigarHit):
return x.ref_length

filtered = list(drop_overlapping_cigar_hits(hits, quality))
connected = list(connect_nonoverlapping_cigar_hits(filtered))
log(events.HitNumber(contig, list(zip(hits, strands)), connected))

for i, single_hit in enumerate(connected):
query = replace(contig, name=None)
Expand Down
55 changes: 31 additions & 24 deletions micall/tests/test_aln2counts_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -2117,56 +2117,63 @@ def test_minimap_overlap(default_sequence_report, projects):

# A,C,G,T
expected_text = """\
seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,A,C,G,T,N,del,ins,clip,v3_overlap,coverage
HIV1-B-FR-K03455-seed,INT,15,1,212,4441,9,0,0,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,2,213,4442,9,0,0,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,3,214,4443,0,0,9,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,4,215,4444,0,0,0,9,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,5,216,4445,0,0,0,9,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,6,217,4446,9,0,0,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,7,218,4447,0,0,0,9,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,8,219,4448,0,9,0,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,9,220,4449,0,9,0,0,0,0,0,0,0,9"""

HIV1-B-FR-K03455-seed,INT,15,51,262,4491,0,0,9,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,52,263,4492,0,0,0,9,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,53,264,4493,0,0,0,9,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,54,265,4494,9,0,0,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,55,266,4495,0,0,0,9,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,56,267,4496,0,0,0,9,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,57,268,4497,0,9,0,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,58,269,4498,0,9,0,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,59,270,4499,9,0,0,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,60,271,4500,0,0,9,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,RT,15,61,452,3001,9,0,0,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,RT,15,62,453,3002,0,0,9,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,RT,15,63,454,3003,0,0,9,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,RT,15,64,455,3004,0,0,9,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,RT,15,65,456,3005,9,0,0,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,RT,15,66,457,3006,0,0,0,9,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,RT,15,67,458,3007,0,0,9,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,RT,15,68,459,3008,0,0,9,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,RT,15,69,460,3009,9,0,0,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,RT,15,70,461,3010,9,0,0,0,0,0,0,0,0,9"""
report_file = StringIO()
default_sequence_report.write_nuc_header(report_file)
default_sequence_report.read(aligned_reads)
default_sequence_report.write_nuc_counts()

report = report_file.getvalue()
report_lines = report.splitlines()
expected_size = 61
expected_size = 121
if len(report_lines) != expected_size:
assert (len(report_lines), report) == (expected_size, '')

key_lines = report_lines[0:10]
key_lines = report_lines[51:71]
key_report = '\n'.join(key_lines)
assert key_report == expected_text


def test_minimap_overlap_at_start(default_sequence_report, projects):
""" Actual overlaps cause blank query position. Check consensus offset.
In this example, the start of PR (first 6 nuleotides) appears twice,
so the shorter alignment is discarded. Make sure that the PR consensus
has the correct offset and doesn't crash.
In this example, the start of PR appears twice, so the consensus index
gets blanked. Make sure that the PR consensus has the correct offset and
doesn't crash.
"""
seed_name = 'HIV1-B-FR-K03455-seed'
seed_seq = projects.getReference(seed_name)
first_part = seed_seq[2252:2400]
second_part = seed_seq[2000:2258]
_read_seq = first_part + second_part
read_seq = seed_seq[2252:2400] + seed_seq[2000:2258]

# refname,qcut,rank,count,offset,seq
aligned_reads = prepare_reads(f"""\
HIV1-B-FR-K03455-seed,15,0,9,0,{second_part}
HIV1-B-FR-K03455-seed,15,0,9,0,{read_seq}
""")

expected_text = f"""\
seed,region,q-cutoff,consensus-percent-cutoff,seed-offset,region-offset,sequence
HIV1-B-FR-K03455-seed,,15,MAX,0,,{second_part}
HIV1-B-FR-K03455-seed,HIV1B-gag,15,MAX,0,1211,{second_part}
HIV1-B-FR-K03455-seed,PR,15,MAX,252,0,{second_part[-6:]}
HIV1-B-FR-K03455-seed,,15,MAX,0,,{read_seq}
HIV1-B-FR-K03455-seed,HIV1B-gag,15,MAX,0,1463,{read_seq}
HIV1-B-FR-K03455-seed,PR,15,MAX,0,0,{read_seq}
"""
report_file = StringIO()
default_sequence_report.write_consensus_all_header(report_file)
Expand Down Expand Up @@ -2824,7 +2831,7 @@ def test_write_sequence_coverage_counts_with_double_mapped_edges(
hxb2_name = 'HIV1-B-FR-K03455-seed'
ref = projects.getReference(hxb2_name)
seq = ref[2858:2908] + ref[8187:8237]
expected_ref_positions = list(range(2859, 2908 + (8237-8187)))
expected_ref_positions = (list(range(2859, 2916)) + list(range(8196, 8238)))
expected_query_positions = list(range(1, len(seq)+1))

report_file = StringIO()
Expand Down
40 changes: 37 additions & 3 deletions micall/tests/test_consensus_aligner.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,19 +195,26 @@ def test_start_contig_overlapping_sections(projects):
In this example, positions 1-60 of the read map to pos 4441-4500 of the
reference. Positions 55-120 of the read map to pos 2995-3060 of the ref.
This way, positions 55-60 are in both alignments. We expect our aligner
to drop the alignment with lesser quality
(quality is calculated from `mapq`, `query_length`, and others)
Since positions 55-60 are in both alignments, remove them from the second
one.
"""
seed_name = 'HIV1-B-FR-K03455-seed'
seed_seq = projects.getReference(seed_name)
consensus = seed_seq[4440:4500] + seed_seq[3000:3060]
reading_frames = create_reading_frames(consensus)
int_ref = projects.getReference('INT')
rt_ref = projects.getReference('RT')
aligner = ConsensusAligner(projects)

aligner.start_contig(seed_name, reading_frames=reading_frames)

rt_aminos: typing.List[ReportAmino] = []
rt_nucleotides: typing.List[ReportNucleotide] = []
aligner.report_region(2550,
4229,
rt_nucleotides,
rt_aminos,
amino_ref=rt_ref)
int_aminos: typing.List[ReportAmino] = []
int_nucleotides: typing.List[ReportNucleotide] = []
aligner.report_region(4230,
Expand All @@ -221,6 +228,11 @@ def test_start_contig_overlapping_sections(projects):
list(range(3001, 3061)),
4230,
5096)
assert_consensus_nuc_indexes(rt_aminos,
list(range(4441, 4501)) +
list(range(3001, 3061)),
2550,
4229)


# noinspection DuplicatedCode
Expand Down Expand Up @@ -414,6 +426,28 @@ def test_start_contig_short_consensus(projects):
assert aligner.algorithm == 'gotoh'


def test_start_contig_deletion_minimap2(projects):
seed_name = 'SARS-CoV-2-seed'
seed_seq = projects.getReference(seed_name)
consensus = seed_seq[2000:2030] + seed_seq[2031:2060]
expected_alignment = make_alignment(ctg='N/A',
ctg_len=len(seed_seq),
r_st=2000,
r_en=2060,
q_st=0,
q_en=59,
mapq=9,
cigar=[(30, CigarActions.MATCH),
(1, CigarActions.DELETE),
(29, CigarActions.MATCH)])
aligner = ConsensusAligner(projects)

aligner.start_contig(seed_name, consensus)

assert_alignments(aligner, expected_alignment)
assert aligner.algorithm == 'minimap2'


def test_start_contig_big_deletion_minimap2(projects):
seed_name = 'HCV-1a'
seed_seq = projects.getReference(seed_name)
Expand Down
4 changes: 2 additions & 2 deletions micall/utils/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from operator import attrgetter
from itertools import groupby

from aligntools import CigarActions, Cigar, CigarHit, connect_cigar_hits
from aligntools import CigarActions, Cigar, CigarHit, connect_nonoverlapping_cigar_hits
from gotoh import align_it
from mappy import Aligner
import mappy
Expand Down Expand Up @@ -114,7 +114,7 @@ def connect_alignments(alignments: Iterable[Alignment]) -> Iterator[Alignment]:
for (strand, ctg, ctg_len), group_iter in stranded:
group = list(group_iter)
hits = list(map(Alignment.to_cigar_hit, group))
connected_hits = connect_cigar_hits(hits)
connected_hits = connect_nonoverlapping_cigar_hits(hits)
mapq = min(x.mapq for x in group)
for hit in connected_hits:
yield Alignment.from_cigar_hit(hit,
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ dependencies = [
"mappy==2.17",
"drawsvg==2.3.0",
"cairosvg==2.7.1",
"aligntools",
"aligntools==1.1.1",
]

[project.optional-dependencies]
Expand Down

0 comments on commit f49bdab

Please sign in to comment.