From 466ae6b24a4af156ecb80fd2f51bb295897b7280 Mon Sep 17 00:00:00 2001 From: Adam English Date: Fri, 31 Jan 2025 15:21:11 -0500 Subject: [PATCH] collapse bug fix sub-chunking bug prevented comparison of variants which should have been compared but were separated. But also, refactored chain to be simplier and to no longer 'slide' --- .../collapse/input1_median_collapsed.vcf | 5 +- .../collapse/input1_median_removed.vcf | 3 +- .../collapse/inputissue196_removed.vcf | 2 +- truvari/collapse.py | 86 +++++++++---------- 4 files changed, 48 insertions(+), 48 deletions(-) diff --git a/repo_utils/answer_key/collapse/input1_median_collapsed.vcf b/repo_utils/answer_key/collapse/input1_median_collapsed.vcf index c232d119..11e559c5 100644 --- a/repo_utils/answer_key/collapse/input1_median_collapsed.vcf +++ b/repo_utils/answer_key/collapse/input1_median_collapsed.vcf @@ -889,7 +889,7 @@ chr20 419860 . AGTGACCCTGCACCTGGCT A 60 . QNAME=HG002-S9-H2-000001F;QSTART=37411 chr20 420228 . A C 60 . QNAME=HG002-S9-H2-000001F;QSTART=374466;QSTRAND=+ GT:PL:DP 0|1:2,3,6:24,29 chr20 420465 . A AG 60 . QNAME=HG002-S9-H2-000001F;QSTART=374704;QSTRAND=+ GT:PL:DP 0|1:6,10,1:32,35 chr20 420561 . A T 60 . QNAME=HG002-S9-H1-000001F;QSTART=399939;QSTRAND=+ GT:PL:DP 1|1:5,6,7:36,12 -chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCCATCCCCCGTCCGCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 61 . QNAME=HG002-S9-H2-000001F;QSTART=374905;QSTRAND=+;SVTYPE=INS;SVLEN=227;NumCollapsed=1;NumConsolidated=0;CollapseId=9.0;CollapseStart=420664;CollapseEnd=420665;CollapseSize=226 GT:PL 0/1:. +chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCCATCCCCCGTCCGCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 61 . QNAME=HG002-S9-H2-000001F;QSTART=374905;QSTRAND=+;SVTYPE=INS;SVLEN=227;NumCollapsed=1;NumConsolidated=0;CollapseId=10.0;CollapseStart=420664;CollapseEnd=420665;CollapseSize=226 GT:PL 0/1:. chr20 421409 . G A 60 . QNAME=HG002-S9-H1-000001F;QSTART=401013;QSTRAND=+ GT:PL:DP 1|1:2,3,7:14,41 chr20 421527 . T C 60 . QNAME=HG002-S9-H2-000001F;QSTART=375993;QSTRAND=+ GT:PL:DP 0|1:3,8,4:49,42 chr20 422066 . A G 60 . QNAME=HG002-S9-H1-000001F;QSTART=401670;QSTRAND=+ GT:PL:DP 1|1:3,7,8:9,39 @@ -1263,7 +1263,7 @@ chr20 639104 . A AT 60 . QNAME=HG002-S9-H1-000001F;QSTART=618555;QSTRAND=+ GT:PL chr20 640046 . C T 60 . QNAME=HG002-S9-H1-000001F;QSTART=619497;QSTRAND=+ GT:PL:DP 1|0:10,10,9:11,48 chr20 640049 . C T 60 . QNAME=HG002-S9-H1-000001F;QSTART=619500;QSTRAND=+ GT:PL:DP 1|1:2,3,4:13,44 chr20 641878 . C G 60 . QNAME=HG002-S9-H1-000001F;QSTART=621329;QSTRAND=+ GT:PL:DP 1|0:8,1,7:7,13 -chr20 641913 . G GGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGA 60 . QNAME=HG002-S9-H1-000001F;QSTART=621365;QSTRAND=+;SVTYPE=INS;SVLEN=66 GT:PL:DP 1/0:7,5,7:34,13 +chr20 641913 . G GGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGA 60 . QNAME=HG002-S9-H1-000001F;QSTART=621365;QSTRAND=+;SVTYPE=INS;SVLEN=66;NumCollapsed=1;NumConsolidated=0;CollapseId=15.0;CollapseStart=642120;CollapseEnd=642121;CollapseSize=66 GT:PL:DP 1/0:7,5,7:34,13 chr20 641944 . GGA G 60 . QNAME=HG002-S9-H1-000001F;QSTART=621462;QSTRAND=+ GT:PL:DP 1|0:6,6,6:17,7 chr20 642012 . GGT G 60 . QNAME=HG002-S9-H1-000001F;QSTART=621528;QSTRAND=+ GT:PL:DP 1|0:5,7,1:6,23 chr20 642037 . T TG 60 . QNAME=HG002-S9-H1-000001F;QSTART=621551;QSTRAND=+ GT:PL:DP 1|0:4,10,10:14,47 @@ -1283,7 +1283,6 @@ chr20 642284 . A G 60 . QNAME=HG002-S9-H1-000001F;QSTART=621795;QSTRAND=+ GT:PL: chr20 642300 . G GCCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGCCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGC 60 . QNAME=HG002-S9-H1-000001F;QSTART=621812;QSTRAND=+;SVTYPE=INS;SVLEN=408 GT:PL:DP 1/0:5,10,5:44,36 chr20 642300 . G GGC 60 . QNAME=HG002-S9-H2-000001F;QSTART=597225;QSTRAND=+ GT:PL:DP 0|1:4,10,8:25,7 chr20 642330 . G C 60 . QNAME=HG002-S9-H1-000001F;QSTART=622249;QSTRAND=+ GT:PL:DP 1|0:2,9,8:16,10 -chr20 642330 . G GGCCCAGCGGGGGTGGAGTTGCCTGTGGTGGGGGGCCCAGCGGGGGTGGAGTTGCCTGTGGGGGGGC 60 . QNAME=HG002-S9-H2-000001F;QSTART=597257;QSTRAND=+;SVTYPE=INS;SVLEN=66 GT:PL:DP 0/1:6,1,9:20,25 chr20 642362 . G GC 60 . QNAME=HG002-S9-H1-000001F;QSTART=622282;QSTRAND=+ GT:PL:DP 1|1:10,5,1:17,12 chr20 642391 . G GC 60 . QNAME=HG002-S9-H1-000001F;QSTART=622312;QSTRAND=+ GT:PL:DP 1|1:2,9,2:10,50 chr20 642420 . G GC 60 . QNAME=HG002-S9-H2-000001F;QSTART=597415;QSTRAND=+ GT:PL:DP 0|1:7,1,2:41,16 diff --git a/repo_utils/answer_key/collapse/input1_median_removed.vcf b/repo_utils/answer_key/collapse/input1_median_removed.vcf index 1e12e7cd..55817dee 100644 --- a/repo_utils/answer_key/collapse/input1_median_removed.vcf +++ b/repo_utils/answer_key/collapse/input1_median_removed.vcf @@ -48,4 +48,5 @@ ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA24385 -chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCATCCCCCGTCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 60 . QNAME=HG002-S9-H1-000001F;QSTART=400044;QSTRAND=+;SVTYPE=INS;SVLEN=226;PctSeqSimilarity=0.9956;PctSizeSimilarity=0.9956;PctRecOverlap=1;SizeDiff=1;StartDistance=0;EndDistance=0;TruScore=99;MatchId=9.0 GT:PL:DP 1/0:4,8,6:32,9 +chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCATCCCCCGTCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 60 . QNAME=HG002-S9-H1-000001F;QSTART=400044;QSTRAND=+;SVTYPE=INS;SVLEN=226;PctSeqSimilarity=0.9956;PctSizeSimilarity=0.9956;PctRecOverlap=1;SizeDiff=1;StartDistance=0;EndDistance=0;TruScore=99;MatchId=10.0 GT:PL:DP 1/0:4,8,6:32,9 +chr20 642330 . G GGCCCAGCGGGGGTGGAGTTGCCTGTGGTGGGGGGCCCAGCGGGGGTGGAGTTGCCTGTGGGGGGGC 60 . QNAME=HG002-S9-H2-000001F;QSTART=597257;QSTRAND=+;SVTYPE=INS;SVLEN=66;PctSeqSimilarity=0.9627;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-417;EndDistance=-417;TruScore=65;MatchId=15.0 GT:PL:DP 0/1:6,1,9:20,25 diff --git a/repo_utils/answer_key/collapse/inputissue196_removed.vcf b/repo_utils/answer_key/collapse/inputissue196_removed.vcf index d6e23a86..f3f54e1c 100644 --- a/repo_utils/answer_key/collapse/inputissue196_removed.vcf +++ b/repo_utils/answer_key/collapse/inputissue196_removed.vcf @@ -17,4 +17,4 @@ ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 chr21 14088212 chr21-14088113-DEL-223 T . . END=14088828;SVTYPE=DEL;SVLEN=-393;PctSeqSimilarity=0;PctSizeSimilarity=0.5674;PctRecOverlap=0.5624;SizeDiff=-170;StartDistance=-100;EndDistance=-270;TruScore=37;MatchId=1.0 GT 1|0 -chr21 14088312 chr21-14088113-DEL-223 A . . END=14089203;SVTYPE=DEL;SVLEN=-668;PctSeqSimilarity=0;PctSizeSimilarity=0.5883;PctRecOverlap=0.5796;SizeDiff=275;StartDistance=100;EndDistance=375;TruScore=38;MatchId=1.0 GT 1|0 +chr21 14088312 chr21-14088113-DEL-223 A . . END=14089203;SVTYPE=DEL;SVLEN=-668;PctSeqSimilarity=0;PctSizeSimilarity=0.5883;PctRecOverlap=0.5796;SizeDiff=-275;StartDistance=-100;EndDistance=-375;TruScore=38;MatchId=1.0 GT 1|0 diff --git a/truvari/collapse.py b/truvari/collapse.py index 4ddef9e5..2e45394d 100644 --- a/truvari/collapse.py +++ b/truvari/collapse.py @@ -95,24 +95,31 @@ def combine(self, other): self.genotype_mask |= other.genotype_mask -def chain_collapse(cur_collapse, all_collapse): - """ - Perform transitive matching of cur_collapse to all_collapse - Check the cur_collapse's entry to all other collapses' consolidated entries - """ - for m_collap in all_collapse: - for other in m_collap.matches: - mat = cur_collapse.entry.match(other.comp) - mat.matid = m_collap.match_id - if mat.state: - # The other's representative entry will report its - # similarity to the matched call that pulled it in - mat.base, mat.comp = mat.comp, mat.base - m_collap.matches.append(mat) - m_collap.combine(cur_collapse) - return True # you can just ignore it later - return False # You'll have to add it to your set of collapsed calls - +def find_new_matches(base, remaining_calls, dest_collapse, params): + """ + Pull variants from remaining calls that match to the base entry into the destination collapse + Updates everything in place + """ + # Sort based on size difference to current call + remaining_calls.sort(key=partial(relative_size_sorter, base)) + i = 0 + while i < len(remaining_calls): + candidate = remaining_calls[i] + mat = base.match(candidate) + mat.matid = dest_collapse.match_id + if params.hap and not hap_resolve(base, candidate): + mat.state = False + if mat.state and dest_collapse.gt_conflict(candidate, params.gt): + mat.state = False + if mat.state: + dest_collapse.matches.append(mat) + remaining_calls.pop(i) + else: + # move to next one + i += 1 + # short circuit + if mat.sizesim is not None and mat.sizesim < params.pctsize: + return def collapse_chunk(chunk, params): """ @@ -121,7 +128,6 @@ def collapse_chunk(chunk, params): chunk_dict, chunk_id = chunk remaining_calls = sorted(chunk_dict['base'], key=params.sorter) - remaining_calls.sort(key=params.sorter) call_id = -1 ret = [] # list of Collapses while remaining_calls: @@ -132,33 +138,27 @@ def collapse_chunk(chunk, params): m_collap.genotype_mask = m_collap.make_genotype_mask(m_collap.entry, params.gt) - # Sort based on size difference to current call - for candidate in sorted(remaining_calls, key=partial(relative_size_sorter, m_collap.entry)): - mat = m_collap.entry.match(candidate) - mat.matid = m_collap.match_id - if params.hap and not hap_resolve(m_collap.entry, candidate): - mat.state = False - if mat.state and m_collap.gt_conflict(candidate, params.gt): - mat.state = False - if mat.state: - m_collap.matches.append(mat) - elif mat.sizesim is not None and mat.sizesim < params.pctsize: - # The sort tells us that we're going through most->least - # similar size. So the next one will only be worse... - break - - # Does this collap need to go into a previous collap? - if not params.chain or not chain_collapse(m_collap, ret): - ret.append(m_collap) + find_new_matches(m_collap.entry, remaining_calls, m_collap, params) + # Chain will only operate once to prevent 'sliding' + # e.g. collapsing integers within 5 of 1, 4, 6, 8 + # without a single operation 1 + # with a single operation 1, 8 + # not chaining at all 1, 6 + if params.chain: + i = 0 + while remaining_calls and i < len(m_collap.matches): + find_new_matches(m_collap.matches[i].comp, remaining_calls, m_collap, params) + i += 1 # If hap, only allow the best match + # put the rest of the remaining calls back if params.hap and m_collap.matches: mats = sorted(m_collap.matches, reverse=True) m_collap.matches = [mats.pop(0)] - - # Remove everything that was used - to_rm = [_.comp for _ in m_collap.matches] - remaining_calls = [_ for _ in remaining_calls if _ not in to_rm] + remaining_calls.extend(mats) + ret.append(m_collap) + # Sort back to where they need to be to choose the next to evaluate + remaining_calls.sort(key=params.sorter) if params.no_consolidate: for val in ret: @@ -771,12 +771,12 @@ def concatenate(self, other): def merge_intervals(intervals): """ - Merge sorted list of tuples + Merge list of tuples """ if not intervals: return [] + intervals.sort(key=lambda x: (x[0], x[1])) merged = [] - current_start, current_end, current_data = intervals[0] for i in range(1, len(intervals)): next_start, next_end, next_data = intervals[i]