Skip to content

Commit

Permalink
Split alignments with big deletions in align_consensus
Browse files Browse the repository at this point in the history
This change tries to revert the behaviour of the aligner
to one that is closer to that of the original mappy-based one.
The original mappy aligner was reporting multiple alignments
if they were far enough apart from each other.
I am not sure how much of that original behaviour is intentional,
or was it just a limitation of mappy.
In case we don't want to split the alignments like that,
we simply need to revert this commit.
  • Loading branch information
Donaim committed Oct 24, 2024
1 parent 1da4480 commit 56e7afc
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 31 deletions.
12 changes: 7 additions & 5 deletions micall/tests/test_aln2counts_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -2283,7 +2283,7 @@ def test_contig_coverage_report_huge_gap(default_sequence_report):
""" A gap so big that Gotoh can't bridge it, but minimap2 can. """
ref = default_sequence_report.projects.getReference('HIV1-B-FR-K03455-seed')
seq = ref[100:150] + ref[1000:1050]
expected_positions = list(range(101, 1051))
expected_positions = list(range(101, 151)) + list(range(1001, 1051))
remap_conseq_csv = StringIO(f"""\
region,sequence
HIV1-B-FR-K03455-seed,{seq}
Expand Down Expand Up @@ -2548,7 +2548,7 @@ def test_write_sequence_coverage_counts_without_coverage(projects,
hxb2_name = 'HIV1-B-FR-K03455-seed'
ref = projects.getReference(hxb2_name)
seq = ref[100:150] + ref[1000:1050]
expected_positions = list(range(101, 1051))
expected_positions = list(range(101, 151)) + list(range(1001, 1051))

report_file = StringIO()
sequence_report.projects = projects
Expand Down Expand Up @@ -2782,13 +2782,14 @@ def test_write_sequence_coverage_counts_with_unaligned_middle(projects,
sequence_report):
""" The middle 100 bases are from a different reference.
They get reported with query positions, but with deletions at reference positions.
They get reported with query positions, but no reference positions.
"""
hxb2_name = 'HIV1-B-FR-K03455-seed'
ref = projects.getReference(hxb2_name)
hcv_ref = projects.getReference('HCV-1a')
seq = ref[:100] + hcv_ref[1000:1100] + ref[1000:1100]
expected_ref_positions = list(range(1, 1101))
expected_ref_positions = (list(range(1, 101)) +
list(range(1001, 1101)))
expected_query_positions = list(range(1, 301))

report_file = StringIO()
Expand Down Expand Up @@ -2852,7 +2853,8 @@ def test_write_sequence_coverage_minimap_hits(projects, sequence_report):
seq = ref[1000:1100] + ref[2000:2100]
expected_minimap_hits = """\
contig,ref_name,start,end,ref_start,ref_end
1-my-contig,HIV1-B-FR-K03455-seed,1,200,1001,2100
1-my-contig,HIV1-B-FR-K03455-seed,1,100,1001,1100
1-my-contig,HIV1-B-FR-K03455-seed,101,200,2001,2100
"""
report_file = StringIO()
sequence_report.projects = projects
Expand Down
36 changes: 10 additions & 26 deletions micall/tests/test_consensus_aligner.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,28 +414,6 @@ def test_start_contig_short_consensus(projects):
assert aligner.algorithm == 'gotoh'


def test_start_contig_deletion_minimap2(projects):
seed_name = 'SARS-CoV-2-seed'
seed_seq = projects.getReference(seed_name)
consensus = seed_seq[2000:2030] + seed_seq[2031:2060]
expected_alignment = make_alignment(ctg='N/A',
ctg_len=len(seed_seq),
r_st=2000,
r_en=2060,
q_st=0,
q_en=59,
mapq=9,
cigar=[(30, CigarActions.MATCH),
(1, CigarActions.DELETE),
(29, CigarActions.MATCH)])
aligner = ConsensusAligner(projects)

aligner.start_contig(seed_name, consensus)

assert_alignments(aligner, expected_alignment)
assert aligner.algorithm == 'minimap2'


def test_start_contig_big_deletion_minimap2(projects):
seed_name = 'HCV-1a'
seed_seq = projects.getReference(seed_name)
Expand All @@ -445,13 +423,19 @@ def test_start_contig_big_deletion_minimap2(projects):
expected_alignment = [make_alignment(ctg='N/A',
ctg_len=len(seed_seq),
r_st=290,
r_en=9269,
r_en=983,
q_st=0,
q_en=693,
mapq=60,
cigar=[(693, CigarActions.MATCH)]),
make_alignment(ctg='N/A',
ctg_len=len(seed_seq),
r_st=3000,
r_en=9269,
q_st=693,
q_en=6962,
mapq=60,
cigar=[(693, CigarActions.MATCH),
(2017, CigarActions.DELETE),
(6269, CigarActions.MATCH)])]
cigar=[(6269, CigarActions.MATCH)])]

aligner = ConsensusAligner(projects)

Expand Down
42 changes: 42 additions & 0 deletions micall/utils/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@
import mappy


#
# Alignments with deletions larger than MAX_GAP_SIZE
# will be split around those deletions into multiple
# separate alignments.
#
MAX_GAP_SIZE = 600 # TODO: make this smaller?


@dataclass(frozen=True)
class Alignment:
"""
Expand Down Expand Up @@ -114,6 +122,39 @@ def connect_alignments(alignments: Iterable[Alignment]) -> Iterator[Alignment]:
strand=strand, mapq=mapq)


def collect_big_gaps_cut_points(alignment: Alignment) -> Iterator[float]:
hit = alignment.to_cigar_hit()
for deletion in hit.deletions():
if deletion.ref_length > MAX_GAP_SIZE:
midpoint = deletion.r_st + deletion.ref_length / 2
yield int(midpoint) + hit.epsilon


def cut_hit_into_multiple_parts(hit: CigarHit, cut_points: Iterable[float]) -> Iterator[CigarHit]:
for cut_point in cut_points:
left, right = hit.cut_reference(cut_point)
left = left.rstrip_reference()
right = right.lstrip_reference()
yield left
hit = right
yield hit


def split_around_big_gaps(alignments: Iterable[Alignment]) -> Iterator[Alignment]:
for alignment in alignments:
cut_points = list(collect_big_gaps_cut_points(alignment))
if cut_points:
hit = alignment.to_cigar_hit()
for part in cut_hit_into_multiple_parts(hit, cut_points):
yield Alignment.from_cigar_hit(part,
ctg=alignment.ctg,
ctg_len=alignment.ctg_len,
strand=alignment.strand,
mapq=alignment.mapq)
else:
yield alignment


def align_consensus(coordinate_seq: str, consensus: str) -> Tuple[List[Alignment], str]:
aligner = Aligner(seq=coordinate_seq, preset='map-ont')
mappy_alignments: List[mappy.Alignment] = list(aligner.map(consensus))
Expand All @@ -139,5 +180,6 @@ def align_consensus(coordinate_seq: str, consensus: str) -> Tuple[List[Alignment
else:
alignments = []

alignments = list(split_around_big_gaps(alignments))
alignments.sort(key=attrgetter('q_st'))
return (alignments, algorithm)

0 comments on commit 56e7afc

Please sign in to comment.