From f46456950ec9d0dda3770eef9ac46ba3b05c9b25 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 22 Jun 2024 10:41:38 +0200 Subject: [PATCH 1/8] inverse patching, everything but output is working --- src/common/wflign/src/wflign_patch.cpp | 92 ++++++++++++++++++-------- src/common/wflign/src/wflign_patch.hpp | 23 ++++--- 2 files changed, 77 insertions(+), 38 deletions(-) diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index 0754856f..19ab2d8b 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -201,44 +201,75 @@ void do_wfa_patch_alignment( wfa::WFAlignerGapAffine2Pieces& wf_aligner, const wflign_penalties_t& convex_penalties, alignment_t& aln, + alignment_t& rev_aln, const int64_t& chain_gap, const int& max_patching_score) { - const int max_score - = max_patching_score ? max_patching_score : - // gap_opening2/gap_extension1 is usually > gap_opening1/gap_extension2 - convex_penalties.gap_opening2 + - (convex_penalties.gap_extension1 * std::min( - (int)chain_gap, - (int)std::max(target_length, query_length) - )) + 64; - // + 64 because we are not really setting the MaxScore, but the MaxAlignmentSteps. - // AlignmentStep is usually > Score (they are the same with edit-distance) + const int max_score = max_patching_score ? max_patching_score : + convex_penalties.gap_opening2 + + (convex_penalties.gap_extension1 * std::min( + (int)chain_gap, + (int)std::max(target_length, query_length) + )) + 64; wf_aligner.setMaxAlignmentSteps(max_score); - const int status = wf_aligner.alignEnd2End(target + i,target_length,query + j,query_length); + const int status = wf_aligner.alignEnd2End(target + i, target_length, query + j, query_length); aln.ok = (status == WF_STATUS_ALG_COMPLETED); + + std::cerr << "score is " << wf_aligner.getAlignmentScore() << std::endl; + if (aln.ok) { - // No more necessary: correct X/M errors in the cigar - // hack_cigar(wf_aligner->cigar, query, target, query_length, target_length, j, i); +#ifdef VALIDATE_WFA_WFLIGN + if (!validate_cigar(wf_aligner.cigar, query + j, target + i, query_length, + target_length, 0, 0)) { + std::cerr << "cigar failure at alignment " << aln.j << " " << aln.i << std::endl; + unpack_display_cigar(wf_aligner.cigar, query + j, target + i, query_length, + target_length, 0, 0); + std::cerr << ">query" << std::endl + << std::string(query + j, query_length) << std::endl; + std::cerr << ">target" << std::endl + << std::string(target + i, target_length) << std::endl; + assert(false); + } +#endif + wflign_edit_cigar_copy(wf_aligner, &aln.edit_cigar); + } + // if the score is -int32 max, we can try to reverse complement the query + if (wf_aligner.getAlignmentScore() == -2147483648) { + // Try reverse complement alignment + std::string rev_comp_query = reverse_complement(std::string(query + j, query_length)); + const int rev_status = wf_aligner.alignEnd2End(target + i, target_length, rev_comp_query.c_str(), query_length); + rev_aln.ok = (wf_aligner.getAlignmentScore() > -2147483648 && rev_status == WF_STATUS_ALG_COMPLETED); + + if (rev_aln.ok) { + std::cerr << "reverse complement alignment worked!" << std::endl; #ifdef VALIDATE_WFA_WFLIGN - if (!validate_cigar(wf_aligner->cigar, query, target, query_length, - target_length, j, i)) { - std::cerr << "cigar failure at alignment " << aln.j << " " << aln.i - << std::endl; - unpack_display_cigar(wf_aligner->cigar, query, target, query_length, - target_length, aln.j, aln.i); - std::cerr << ">query" << std::endl - << std::string(query + j, query_length) << std::endl; - std::cerr << ">target" << std::endl - << std::string(target + i, target_length) << std::endl; - assert(false); -} + if (!validate_cigar(wf_aligner.cigar, rev_comp_query.c_str(), target + i, query_length, + target_length, 0, 0)) { + std::cerr << "cigar failure at reverse complement alignment " << j << " " << i << std::endl; + unpack_display_cigar(wf_aligner.cigar, rev_comp_query.c_str(), target + i, query_length, + target_length, 0, 0); + std::cerr << ">query (reverse complement)" << std::endl + << rev_comp_query << std::endl; + std::cerr << ">target" << std::endl + << std::string(target + i, target_length) << std::endl; + assert(false); + } #endif - wflign_edit_cigar_copy(wf_aligner,&aln.edit_cigar); + wflign_edit_cigar_copy(wf_aligner, &rev_aln.edit_cigar); + rev_aln.j = j; + rev_aln.i = i; + rev_aln.query_length = query_length; + rev_aln.target_length = target_length; + } } + + aln.j = j; + aln.i = i; + aln.query_length = query_length; + aln.target_length = target_length; } void write_merged_alignment( @@ -317,6 +348,7 @@ void write_merged_alignment( #endif // write trace into single cigar vector std::vector tracev; + std::vector rev_patch_alns; { // patch: walk the cigar, patching directly when we have simultaneous // gaps in query and ref and adding our results to the final trace as we @@ -387,6 +419,7 @@ void write_merged_alignment( &wflign_max_len_minor, &distance_close_big_enough_indels, &min_wf_length, &max_dist_threshold, &wf_aligner, + &rev_patch_alns, &convex_penalties, &chain_gap, &max_patching_score #ifdef WFA_PNG_TSV_TIMING @@ -897,11 +930,16 @@ void write_merged_alignment( size_region_to_repatch = 0; { alignment_t patch_aln; + alignment_t rev_patch_aln; // WFA is only global do_wfa_patch_alignment( query, query_pos, query_delta, target - target_pointer_shift, target_pos, target_delta, - wf_aligner, convex_penalties, patch_aln, chain_gap, max_patching_score); + wf_aligner, convex_penalties, patch_aln, rev_patch_aln, + chain_gap, max_patching_score); + if (rev_patch_aln.ok) { + rev_patch_alns.push_back(rev_patch_aln); + } if (patch_aln.ok) { // std::cerr << "got an ok patch aln" << // std::endl; diff --git a/src/common/wflign/src/wflign_patch.hpp b/src/common/wflign/src/wflign_patch.hpp index d2d06a8d..908de189 100644 --- a/src/common/wflign/src/wflign_patch.hpp +++ b/src/common/wflign/src/wflign_patch.hpp @@ -46,17 +46,18 @@ namespace wflign { wflign_extend_data_t* extend_data, alignment_t& aln); void do_wfa_patch_alignment( - const char* query, - const uint64_t& j, - const uint64_t& query_length, - const char* target, - const uint64_t& i, - const uint64_t& target_length, - wfa::WFAlignerGapAffine2Pieces& _wf_aligner, - const wflign_penalties_t& convex_penalties, - alignment_t& aln, - const int64_t& chain_gap, - const int& max_patching_score); + const char* query, + const uint64_t& j, + const uint64_t& query_length, + const char* target, + const uint64_t& i, + const uint64_t& target_length, + wfa::WFAlignerGapAffine2Pieces& wf_aligner, + const wflign_penalties_t& convex_penalties, + alignment_t& aln, + alignment_t& rev_aln, + const int64_t& chain_gap, + const int& max_patching_score); void write_merged_alignment( std::ostream &out, const std::vector &trace, From a9e280d81e372c60af2adea6a7d8261717ed8118 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Mon, 24 Jun 2024 19:33:30 +0200 Subject: [PATCH 2/8] rule-of-five the alignment_t object so we can save it in a vector without segfaults --- src/common/wflign/src/wflign_alignment.cpp | 82 +++++++++++++++++++--- src/common/wflign/src/wflign_alignment.hpp | 26 ++++--- 2 files changed, 89 insertions(+), 19 deletions(-) diff --git a/src/common/wflign/src/wflign_alignment.cpp b/src/common/wflign/src/wflign_alignment.cpp index 2cef142e..e90fa9da 100644 --- a/src/common/wflign/src/wflign_alignment.cpp +++ b/src/common/wflign/src/wflign_alignment.cpp @@ -11,18 +11,82 @@ /* * Wflign Alignment */ -alignment_t::alignment_t() { - j = 0; - i = 0; - query_length = 0; - target_length = 0; - ok = false; - keep = false; - edit_cigar = {NULL, 0, 0}; + +// Default constructor +alignment_t::alignment_t() + : j(0), i(0), query_length(0), target_length(0), ok(false), keep(false) { + edit_cigar = {nullptr, 0, 0}; } + +// Destructor alignment_t::~alignment_t() { - free(edit_cigar.cigar_ops); + free(edit_cigar.cigar_ops); + } + +// Copy constructor +alignment_t::alignment_t(const alignment_t& other) + : j(other.j), i(other.i), query_length(other.query_length), + target_length(other.target_length), ok(other.ok), keep(other.keep) { + if (other.edit_cigar.cigar_ops) { + edit_cigar.cigar_ops = (char*)malloc((other.edit_cigar.end_offset - other.edit_cigar.begin_offset) * sizeof(char)); + memcpy(edit_cigar.cigar_ops, other.edit_cigar.cigar_ops + other.edit_cigar.begin_offset, + (other.edit_cigar.end_offset - other.edit_cigar.begin_offset) * sizeof(char)); + } else { + edit_cigar.cigar_ops = nullptr; + } + edit_cigar.begin_offset = 0; + edit_cigar.end_offset = other.edit_cigar.end_offset - other.edit_cigar.begin_offset; +} + +// Move constructor +alignment_t::alignment_t(alignment_t&& other) noexcept + : j(other.j), i(other.i), query_length(other.query_length), + target_length(other.target_length), ok(other.ok), keep(other.keep), + edit_cigar(other.edit_cigar) { + other.edit_cigar = {nullptr, 0, 0}; +} + +// Copy assignment operator +alignment_t& alignment_t::operator=(const alignment_t& other) { + if (this != &other) { + j = other.j; + i = other.i; + query_length = other.query_length; + target_length = other.target_length; + ok = other.ok; + keep = other.keep; + + free(edit_cigar.cigar_ops); + if (other.edit_cigar.cigar_ops) { + edit_cigar.cigar_ops = (char*)malloc((other.edit_cigar.end_offset - other.edit_cigar.begin_offset) * sizeof(char)); + memcpy(edit_cigar.cigar_ops, other.edit_cigar.cigar_ops + other.edit_cigar.begin_offset, + (other.edit_cigar.end_offset - other.edit_cigar.begin_offset) * sizeof(char)); + } else { + edit_cigar.cigar_ops = nullptr; + } + edit_cigar.begin_offset = 0; + edit_cigar.end_offset = other.edit_cigar.end_offset - other.edit_cigar.begin_offset; + } + return *this; } + +// Move assignment operator +alignment_t& alignment_t::operator=(alignment_t&& other) noexcept { + if (this != &other) { + j = other.j; + i = other.i; + query_length = other.query_length; + target_length = other.target_length; + ok = other.ok; + keep = other.keep; + + free(edit_cigar.cigar_ops); + edit_cigar = other.edit_cigar; + other.edit_cigar = {nullptr, 0, 0}; + } + return *this; +} + //void alignment_t::display(void) { // std::cerr << j << " " << i << " " << query_length << " " // << target_length << " " << ok << std::endl; diff --git a/src/common/wflign/src/wflign_alignment.hpp b/src/common/wflign/src/wflign_alignment.hpp index 9bd0c52f..65ff60d4 100644 --- a/src/common/wflign/src/wflign_alignment.hpp +++ b/src/common/wflign/src/wflign_alignment.hpp @@ -35,21 +35,27 @@ class alignment_t { int i; uint16_t query_length; uint16_t target_length; - bool ok = false; - bool keep = false; + bool ok; + bool keep; wflign_cigar_t edit_cigar; - // Setup + // Default constructor alignment_t(); + // Destructor ~alignment_t(); - // Utils -// bool validate( -// const char* query, -// const char* target); + // Copy constructor + alignment_t(const alignment_t& other); + // Move constructor + alignment_t(alignment_t&& other) noexcept; + // Copy assignment operator + alignment_t& operator=(const alignment_t& other); + // Move assignment operator + alignment_t& operator=(alignment_t&& other) noexcept; + // Trim functions void trim_front(int query_trim); void trim_back(int query_trim); -// // Display -// void display(void); }; + + /* * Wflign Trace-Pos: Links a position in a traceback matrix to its edit */ @@ -133,4 +139,4 @@ void wflign_edit_cigar_copy( wfa::WFAligner& wf_aligner, wflign_cigar_t* const cigar_dst); -#endif /* WFLIGN_ALIGNMENT_HPP_ */ \ No newline at end of file +#endif /* WFLIGN_ALIGNMENT_HPP_ */ From 02d21daca0195f5260ea9ac07c07afcb9c97fe17 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Tue, 25 Jun 2024 09:30:14 +0200 Subject: [PATCH 3/8] write the inverted patches --- src/common/wflign/src/wflign_alignment.hpp | 4 +-- src/common/wflign/src/wflign_patch.cpp | 29 ++++++++++++++++++++-- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/src/common/wflign/src/wflign_alignment.hpp b/src/common/wflign/src/wflign_alignment.hpp index 65ff60d4..b00abe34 100644 --- a/src/common/wflign/src/wflign_alignment.hpp +++ b/src/common/wflign/src/wflign_alignment.hpp @@ -33,8 +33,8 @@ class alignment_t { public: int j; int i; - uint16_t query_length; - uint16_t target_length; + int query_length; + int target_length; bool ok; bool keep; wflign_cigar_t edit_cigar; diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index 19ab2d8b..c3f4811c 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -216,7 +216,7 @@ void do_wfa_patch_alignment( const int status = wf_aligner.alignEnd2End(target + i, target_length, query + j, query_length); aln.ok = (status == WF_STATUS_ALG_COMPLETED); - std::cerr << "score is " << wf_aligner.getAlignmentScore() << std::endl; + //std::cerr << "score is " << wf_aligner.getAlignmentScore() << std::endl; if (aln.ok) { #ifdef VALIDATE_WFA_WFLIGN @@ -243,7 +243,7 @@ void do_wfa_patch_alignment( rev_aln.ok = (wf_aligner.getAlignmentScore() > -2147483648 && rev_status == WF_STATUS_ALG_COMPLETED); if (rev_aln.ok) { - std::cerr << "reverse complement alignment worked!" << std::endl; + //std::cerr << "reverse complement alignment worked!" << std::endl; #ifdef VALIDATE_WFA_WFLIGN if (!validate_cigar(wf_aligner.cigar, rev_comp_query.c_str(), target + i, query_length, target_length, 0, 0)) { @@ -1824,6 +1824,31 @@ query_start : query_end) // always clean up free(cigarv); + + // write how many reverse complement alignments were found + //std::cerr << "got " << rev_patch_alns.size() << " rev patch alns" << std::endl; + for (auto& rev_patch_aln : rev_patch_alns) { + bool rev_query_is_rev = !query_is_rev; // Flip the orientation + write_alignment( + out, + rev_patch_aln, + query_name, + query_total_length, + query_offset, + query_length, + rev_query_is_rev, // Use the flipped orientation + target_name, + target_total_length, + target_offset, + target_length, + min_identity, + mashmap_estimated_identity, + false // Don't add an endline after each alignment + ); + // write tag indicating that this is a reverse complement alignment + out << "\t" << "rc:Z:true" << "\n"; + } + out << std::flush; } void write_alignment( From b663abefbad8ae94a5a35833156b68bcf28f9e57 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Wed, 26 Jun 2024 16:22:21 +0200 Subject: [PATCH 4/8] inverted patching works but big mess --- src/common/wflign/src/wflign_alignment.cpp | 45 +++++++++ src/common/wflign/src/wflign_alignment.hpp | 2 + src/common/wflign/src/wflign_patch.cpp | 102 +++++++++++++++------ 3 files changed, 121 insertions(+), 28 deletions(-) diff --git a/src/common/wflign/src/wflign_alignment.cpp b/src/common/wflign/src/wflign_alignment.cpp index e90fa9da..7e7c91b9 100644 --- a/src/common/wflign/src/wflign_alignment.cpp +++ b/src/common/wflign/src/wflign_alignment.cpp @@ -645,6 +645,51 @@ void wflign_edit_cigar_copy( cigar_dst->end_offset = cigar_length; memcpy(cigar_dst->cigar_ops,cigar_ops,cigar_length); } + +int calculate_alignment_score(const wflign_cigar_t& cigar, const wflign_penalties_t& penalties) { + int score = 0; + char prev_op = '\0'; + int gap_length = 0; + + auto process_gap = [&](char op, int length) { + switch (op) { + case 'M': + // Match is free (best case) + break; + case 'X': + score += length * penalties.mismatch; + break; + case 'I': + case 'D': + score += penalties.gap_opening1 + penalties.gap_extension1; + if (length > 1) { + score += std::min( + penalties.gap_extension1 * (length - 1), + penalties.gap_opening2 + penalties.gap_extension2 * (length - 1) + ); + } + break; + } + }; + + for (int i = cigar.begin_offset; i <= cigar.end_offset; ++i) { + char op = (i < cigar.end_offset) ? cigar.cigar_ops[i] : '\0'; // '\0' to process last gap + + if (op != prev_op || i == cigar.end_offset) { + if (gap_length > 0) { + process_gap(prev_op, gap_length); + } + gap_length = 1; + } else { + ++gap_length; + } + + prev_op = op; + } + + return score; +} + /* // No more necessary bool hack_cigar(wfa::cigar_t &cigar, const char *query, const char *target, diff --git a/src/common/wflign/src/wflign_alignment.hpp b/src/common/wflign/src/wflign_alignment.hpp index b00abe34..926f8895 100644 --- a/src/common/wflign/src/wflign_alignment.hpp +++ b/src/common/wflign/src/wflign_alignment.hpp @@ -139,4 +139,6 @@ void wflign_edit_cigar_copy( wfa::WFAligner& wf_aligner, wflign_cigar_t* const cigar_dst); +int calculate_alignment_score(const wflign_cigar_t& cigar, const wflign_penalties_t& penalties); + #endif /* WFLIGN_ALIGNMENT_HPP_ */ diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index c3f4811c..c9f9fee1 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -205,14 +205,32 @@ void do_wfa_patch_alignment( const int64_t& chain_gap, const int& max_patching_score) { - const int max_score = max_patching_score ? max_patching_score : + const int max_score = + max_patching_score ? max_patching_score : convex_penalties.gap_opening2 + (convex_penalties.gap_extension1 * std::min( (int)chain_gap, (int)std::max(target_length, query_length) - )) + 64; - + )) + 64; + + /* + std::cerr << "doing wfa patch alignment with parameters:" + << " query_length " << query_length + << " target_length " << target_length + << " chain_gap " << chain_gap + << " max_patching_score " << max_patching_score + << " max_score " << max_score + << std::endl + << "query " << std::string(query + j, query_length) + << " target " << std::string(target + i, target_length) + << std::endl; + */ + wf_aligner.setMaxAlignmentSteps(max_score); + + int fwd_score = std::numeric_limits::max(); + int rev_score = std::numeric_limits::max(); + const int status = wf_aligner.alignEnd2End(target + i, target_length, query + j, query_length); aln.ok = (status == WF_STATUS_ALG_COMPLETED); @@ -234,36 +252,64 @@ void do_wfa_patch_alignment( #endif wflign_edit_cigar_copy(wf_aligner, &aln.edit_cigar); + fwd_score = calculate_alignment_score(aln.edit_cigar, convex_penalties); + //std::cerr << "forward score is " << fwd_score << std::endl; } // if the score is -int32 max, we can try to reverse complement the query - if (wf_aligner.getAlignmentScore() == -2147483648) { - // Try reverse complement alignment - std::string rev_comp_query = reverse_complement(std::string(query + j, query_length)); - const int rev_status = wf_aligner.alignEnd2End(target + i, target_length, rev_comp_query.c_str(), query_length); - rev_aln.ok = (wf_aligner.getAlignmentScore() > -2147483648 && rev_status == WF_STATUS_ALG_COMPLETED); - - if (rev_aln.ok) { - //std::cerr << "reverse complement alignment worked!" << std::endl; + // auto fwd_score = wf_aligner.getAlignmentScore(); // this is broken + // compute with + //int calculate_alignment_score(const char* cigar, const wflign_penalties_t& penalties) { + //if (wf_aligner.getAlignmentScore() == -2147483648) { + // Try reverse complement alignment + std::string rev_comp_query = reverse_complement(std::string(query + j, query_length)); + const int rev_status = wf_aligner.alignEnd2End(target + i, target_length, rev_comp_query.c_str(), query_length); + + //auto rev_score = wf_aligner.getAlignmentScore(); + //rev_aln.ok = (rev_score > fwd_score && rev_status == WF_STATUS_ALG_COMPLETED); + rev_aln.ok = (rev_status == WF_STATUS_ALG_COMPLETED); + + if (rev_aln.ok) { + //std::cerr << "reverse complement alignment worked!" << std::endl; #ifdef VALIDATE_WFA_WFLIGN - if (!validate_cigar(wf_aligner.cigar, rev_comp_query.c_str(), target + i, query_length, - target_length, 0, 0)) { - std::cerr << "cigar failure at reverse complement alignment " << j << " " << i << std::endl; - unpack_display_cigar(wf_aligner.cigar, rev_comp_query.c_str(), target + i, query_length, - target_length, 0, 0); - std::cerr << ">query (reverse complement)" << std::endl - << rev_comp_query << std::endl; - std::cerr << ">target" << std::endl - << std::string(target + i, target_length) << std::endl; - assert(false); - } + if (!validate_cigar(wf_aligner.cigar, rev_comp_query.c_str(), target + i, query_length, + target_length, 0, 0)) { + std::cerr << "cigar failure at reverse complement alignment " << j << " " << i << std::endl; + unpack_display_cigar(wf_aligner.cigar, rev_comp_query.c_str(), target + i, query_length, + target_length, 0, 0); + std::cerr << ">query (reverse complement)" << std::endl + << rev_comp_query << std::endl; + std::cerr << ">target" << std::endl + << std::string(target + i, target_length) << std::endl; + assert(false); + } #endif - wflign_edit_cigar_copy(wf_aligner, &rev_aln.edit_cigar); - rev_aln.j = j; - rev_aln.i = i; - rev_aln.query_length = query_length; - rev_aln.target_length = target_length; - } + wflign_edit_cigar_copy(wf_aligner, &rev_aln.edit_cigar); + rev_score = calculate_alignment_score(rev_aln.edit_cigar, convex_penalties); + //std::cerr << "reverse score is " << rev_score << std::endl; + rev_aln.j = j; + rev_aln.i = i; + rev_aln.query_length = query_length; + rev_aln.target_length = target_length; + } + + if (rev_aln.ok && rev_score < fwd_score) { + rev_aln.ok = true; + aln.ok = false; + std::cerr << "got better score with reverse complement alignment" << std::endl + << " query_length " << query_length + << " target_length " << target_length + << " chain_gap " << chain_gap + << " max_patching_score " << max_patching_score + << " max_score " << max_score + << " fwd_score " << fwd_score + << " rev_score " << rev_score + << std::endl + << "query " << std::string(query + j, query_length) + << " target " << std::string(target + i, target_length) + << std::endl; + } else { + rev_aln.ok = false; } aln.j = j; From 42731041e13f10bc00aeacc4a1eaa328e81fce26 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Wed, 26 Jun 2024 18:57:42 +0200 Subject: [PATCH 5/8] now it prints the inverted patch in the right place --- src/common/wflign/src/wflign_patch.cpp | 23 ++++++++++++++--------- src/common/wflign/src/wflign_patch.hpp | 3 ++- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index c9f9fee1..7db7a87f 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -784,8 +784,9 @@ void write_merged_alignment( // how long was our last gap? // if it's long enough, patch it int32_t size_region_to_repatch = 0; + // unused! - do { + { got_alignment = false; if ((size_region_to_repatch > 0 || @@ -984,6 +985,7 @@ void write_merged_alignment( wf_aligner, convex_penalties, patch_aln, rev_patch_aln, chain_gap, max_patching_score); if (rev_patch_aln.ok) { + // we got a good reverse alignment rev_patch_alns.push_back(rev_patch_aln); } if (patch_aln.ok) { @@ -995,14 +997,13 @@ void write_merged_alignment( const int end_idx = patch_aln.edit_cigar.end_offset; for (int i = start_idx; i < end_idx; i++) { - // std::cerr << - // patch_aln.edit_cigar.cigar_ops[i]; patched.push_back(patch_aln.edit_cigar.cigar_ops[i]); } // std::cerr << "\n"; // Check if there are too many indels in the // patch + /* uint32_t size_indel = 0; for (int i = end_idx - 1; i >= start_idx; --i) { @@ -1026,12 +1027,14 @@ void write_merged_alignment( size_indel = 0; } } + */ // std::cerr << std::endl; // Not too big, to avoid repatching // structural variants boundaries //std::cerr << "size_region_to_repatch " << size_region_to_repatch << std::endl; //std::cerr << "end_idx - start_idx " << end_idx - start_idx << std::endl; + /* if (size_indel > 7 && size_indel <= 4096 && size_region_to_repatch < (end_idx - start_idx)) { @@ -1039,6 +1042,7 @@ void write_merged_alignment( } else { size_region_to_repatch = 0; } + */ } #ifdef WFA_PNG_TSV_TIMING if (emit_patching_tsv) { @@ -1070,7 +1074,7 @@ void write_merged_alignment( query_delta = 0; target_delta = 0; - } while (size_region_to_repatch > 0); + } } // Tail patching @@ -1889,8 +1893,8 @@ query_start : query_end) target_length, min_identity, mashmap_estimated_identity, - false // Don't add an endline after each alignment - ); + false, // Don't add an endline after each alignment + true); // This is a reverse complement alignment // write tag indicating that this is a reverse complement alignment out << "\t" << "rc:Z:true" << "\n"; } @@ -1911,7 +1915,8 @@ void write_alignment( const uint64_t& target_length, // unused const float& min_identity, const float& mashmap_estimated_identity, - const bool& with_endline) { + const bool& with_endline, + const bool& is_rev_patch) { if (aln.ok) { uint64_t matches = 0; @@ -1939,9 +1944,9 @@ void write_alignment( if (gap_compressed_identity >= min_identity) { uint64_t q_start; - if (query_is_rev) { + if (query_is_rev && !is_rev_patch) { q_start = - query_offset + (query_length - (aln.j + qAlignedLength)); + query_offset + (query_length - (aln.j + qAlignedLength)); } else { q_start = query_offset + aln.j; } diff --git a/src/common/wflign/src/wflign_patch.hpp b/src/common/wflign/src/wflign_patch.hpp index 908de189..3382290a 100644 --- a/src/common/wflign/src/wflign_patch.hpp +++ b/src/common/wflign/src/wflign_patch.hpp @@ -112,7 +112,8 @@ namespace wflign { const uint64_t& target_length, // unused const float& min_identity, const float& mashmap_estimated_identity, - const bool& with_endline = true); + const bool& with_endline = true, + const bool& is_rev_patch = false); double float2phred(const double& prob); void sort_indels(std::vector& v); From eb8099aba4be97ab080cc380b06225707592e2d7 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Thu, 27 Jun 2024 17:53:07 +0200 Subject: [PATCH 6/8] only record inverted alignments once --- src/common/wflign/src/wflign_patch.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index 7db7a87f..989e8525 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -472,11 +472,13 @@ void write_merged_alignment( ,&emit_patching_tsv, &out_patching_tsv #endif - ](std::vector &unpatched, - std::vector &patched, - const uint16_t &min_wfa_head_tail_patch_length, - const uint16_t &min_wfa_patch_length, - const uint16_t &max_dist_to_look_at) { + ](std::vector &unpatched, + std::vector &patched, + const uint16_t &min_wfa_head_tail_patch_length, + const uint16_t &min_wfa_patch_length, + const uint16_t &max_dist_to_look_at, + bool save_rev_patch_alns) { + auto q = unpatched.begin(); uint64_t query_pos = query_start; @@ -984,7 +986,7 @@ void write_merged_alignment( target - target_pointer_shift, target_pos, target_delta, wf_aligner, convex_penalties, patch_aln, rev_patch_aln, chain_gap, max_patching_score); - if (rev_patch_aln.ok) { + if (rev_patch_aln.ok && save_rev_patch_alns) { // we got a good reverse alignment rev_patch_alns.push_back(rev_patch_aln); } @@ -1414,7 +1416,7 @@ void write_merged_alignment( //std::cerr << "FIRST PATCH ROUND" << std::endl; // +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n"; - patching(erodev, pre_tracev, 4096, 32, 512); // In the 1st round, we patch more aggressively + patching(erodev, pre_tracev, 4096, 32, 512, false); // In the 1st round, we patch more aggressively and don't save inversions #ifdef VALIDATE_WFA_WFLIGN if (!validate_trace(pre_tracev, query, @@ -1444,7 +1446,7 @@ void write_merged_alignment( //std::cerr << "SECOND PATCH ROUND" << std::endl; // +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n"; - patching(pre_tracev, tracev, 256, 8, 128); // In the 2nd round, we patch less aggressively + patching(pre_tracev, tracev, 256, 8, 128, true); // In the 2nd round, we patch less aggressively but we save inversions } // normalize the indels From 5e8be98f20f7b8a3dbde3f2fd6b1a70a299a78ac Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Thu, 27 Jun 2024 17:57:03 +0200 Subject: [PATCH 7/8] be quiet inversion patcher --- src/common/wflign/src/wflign_patch.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index 989e8525..8433ad32 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -296,6 +296,7 @@ void do_wfa_patch_alignment( if (rev_aln.ok && rev_score < fwd_score) { rev_aln.ok = true; aln.ok = false; +#ifdef WFLIGN_DEBUG std::cerr << "got better score with reverse complement alignment" << std::endl << " query_length " << query_length << " target_length " << target_length @@ -308,6 +309,7 @@ void do_wfa_patch_alignment( << "query " << std::string(query + j, query_length) << " target " << std::string(target + i, target_length) << std::endl; +#endif } else { rev_aln.ok = false; } From c3ffa66f7dab15a3098f880ff6e4d6c5d4a0dfe9 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Thu, 27 Jun 2024 19:09:11 +0200 Subject: [PATCH 8/8] implement a min inverse patch length = 23 by hard coding --- src/common/wflign/src/wflign.cpp | 3 ++ src/common/wflign/src/wflign.hpp | 1 + src/common/wflign/src/wflign_patch.cpp | 74 +++++++++++++------------- src/common/wflign/src/wflign_patch.hpp | 4 +- 4 files changed, 45 insertions(+), 37 deletions(-) diff --git a/src/common/wflign/src/wflign.cpp b/src/common/wflign/src/wflign.cpp index 12901641..5b51004d 100644 --- a/src/common/wflign/src/wflign.cpp +++ b/src/common/wflign/src/wflign.cpp @@ -91,6 +91,7 @@ WFlign::WFlign( this->erode_k = erode_k; this->chain_gap = chain_gap; this->max_patching_score = max_patching_score; + this->min_inversion_length = 23; // Query this->query_name = nullptr; this->query = nullptr; @@ -539,6 +540,7 @@ void WFlign::wflign_affine_wavefront( erode_k, chain_gap, max_patching_score, + min_inversion_length, MIN_WF_LENGTH, wf_max_dist_threshold #ifdef WFA_PNG_TSV_TIMING @@ -1007,6 +1009,7 @@ void WFlign::wflign_affine_wavefront( erode_k, chain_gap, max_patching_score, + min_inversion_length, MIN_WF_LENGTH, wf_max_dist_threshold #ifdef WFA_PNG_TSV_TIMING diff --git a/src/common/wflign/src/wflign.hpp b/src/common/wflign/src/wflign.hpp index daec5770..337eaac0 100644 --- a/src/common/wflign/src/wflign.hpp +++ b/src/common/wflign/src/wflign.hpp @@ -64,6 +64,7 @@ namespace wflign { int erode_k; int64_t chain_gap; int max_patching_score; + uint64_t min_inversion_length; // Query const std::string* query_name; char* query; diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index 8433ad32..78994f68 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -203,7 +203,8 @@ void do_wfa_patch_alignment( alignment_t& aln, alignment_t& rev_aln, const int64_t& chain_gap, - const int& max_patching_score) { + const int& max_patching_score, + const uint64_t& min_inversion_length) { const int max_score = max_patching_score ? max_patching_score : @@ -255,42 +256,42 @@ void do_wfa_patch_alignment( fwd_score = calculate_alignment_score(aln.edit_cigar, convex_penalties); //std::cerr << "forward score is " << fwd_score << std::endl; } - // if the score is -int32 max, we can try to reverse complement the query - // auto fwd_score = wf_aligner.getAlignmentScore(); // this is broken - // compute with - //int calculate_alignment_score(const char* cigar, const wflign_penalties_t& penalties) { - //if (wf_aligner.getAlignmentScore() == -2147483648) { - // Try reverse complement alignment - std::string rev_comp_query = reverse_complement(std::string(query + j, query_length)); - const int rev_status = wf_aligner.alignEnd2End(target + i, target_length, rev_comp_query.c_str(), query_length); - - //auto rev_score = wf_aligner.getAlignmentScore(); - //rev_aln.ok = (rev_score > fwd_score && rev_status == WF_STATUS_ALG_COMPLETED); - rev_aln.ok = (rev_status == WF_STATUS_ALG_COMPLETED); - - if (rev_aln.ok) { - //std::cerr << "reverse complement alignment worked!" << std::endl; + + if (query_length >= min_inversion_length && target_length >= min_inversion_length) { + // Try reverse complement alignment + std::string rev_comp_query = reverse_complement(std::string(query + j, query_length)); + const int rev_status = wf_aligner.alignEnd2End(target + i, target_length, rev_comp_query.c_str(), query_length); + + //auto rev_score = wf_aligner.getAlignmentScore(); + //rev_aln.ok = (rev_score > fwd_score && rev_status == WF_STATUS_ALG_COMPLETED); + rev_aln.ok = (rev_status == WF_STATUS_ALG_COMPLETED); + + if (rev_aln.ok) { + //std::cerr << "reverse complement alignment worked!" << std::endl; #ifdef VALIDATE_WFA_WFLIGN - if (!validate_cigar(wf_aligner.cigar, rev_comp_query.c_str(), target + i, query_length, - target_length, 0, 0)) { - std::cerr << "cigar failure at reverse complement alignment " << j << " " << i << std::endl; - unpack_display_cigar(wf_aligner.cigar, rev_comp_query.c_str(), target + i, query_length, - target_length, 0, 0); - std::cerr << ">query (reverse complement)" << std::endl - << rev_comp_query << std::endl; - std::cerr << ">target" << std::endl - << std::string(target + i, target_length) << std::endl; - assert(false); - } + if (!validate_cigar(wf_aligner.cigar, rev_comp_query.c_str(), target + i, query_length, + target_length, 0, 0)) { + std::cerr << "cigar failure at reverse complement alignment " << j << " " << i << std::endl; + unpack_display_cigar(wf_aligner.cigar, rev_comp_query.c_str(), target + i, query_length, + target_length, 0, 0); + std::cerr << ">query (reverse complement)" << std::endl + << rev_comp_query << std::endl; + std::cerr << ">target" << std::endl + << std::string(target + i, target_length) << std::endl; + assert(false); + } #endif - wflign_edit_cigar_copy(wf_aligner, &rev_aln.edit_cigar); - rev_score = calculate_alignment_score(rev_aln.edit_cigar, convex_penalties); - //std::cerr << "reverse score is " << rev_score << std::endl; - rev_aln.j = j; - rev_aln.i = i; - rev_aln.query_length = query_length; - rev_aln.target_length = target_length; + wflign_edit_cigar_copy(wf_aligner, &rev_aln.edit_cigar); + rev_score = calculate_alignment_score(rev_aln.edit_cigar, convex_penalties); + //std::cerr << "reverse score is " << rev_score << std::endl; + rev_aln.j = j; + rev_aln.i = i; + rev_aln.query_length = query_length; + rev_aln.target_length = target_length; + } + } else { + rev_aln.ok = false; } if (rev_aln.ok && rev_score < fwd_score) { @@ -351,6 +352,7 @@ void write_merged_alignment( const int& erode_k, const int64_t& chain_gap, const int& max_patching_score, + const uint64_t& min_inversion_length, const int& min_wf_length, const int& max_dist_threshold, #ifdef WFA_PNG_TSV_TIMING @@ -469,7 +471,7 @@ void write_merged_alignment( &max_dist_threshold, &wf_aligner, &rev_patch_alns, &convex_penalties, - &chain_gap, &max_patching_score + &chain_gap, &max_patching_score, &min_inversion_length #ifdef WFA_PNG_TSV_TIMING ,&emit_patching_tsv, &out_patching_tsv @@ -987,7 +989,7 @@ void write_merged_alignment( query, query_pos, query_delta, target - target_pointer_shift, target_pos, target_delta, wf_aligner, convex_penalties, patch_aln, rev_patch_aln, - chain_gap, max_patching_score); + chain_gap, max_patching_score, min_inversion_length); if (rev_patch_aln.ok && save_rev_patch_alns) { // we got a good reverse alignment rev_patch_alns.push_back(rev_patch_aln); diff --git a/src/common/wflign/src/wflign_patch.hpp b/src/common/wflign/src/wflign_patch.hpp index 3382290a..3180b018 100644 --- a/src/common/wflign/src/wflign_patch.hpp +++ b/src/common/wflign/src/wflign_patch.hpp @@ -57,7 +57,8 @@ namespace wflign { alignment_t& aln, alignment_t& rev_aln, const int64_t& chain_gap, - const int& max_patching_score); + const int& max_patching_score, + const uint64_t& min_inversion_length); void write_merged_alignment( std::ostream &out, const std::vector &trace, @@ -89,6 +90,7 @@ namespace wflign { const int& erode_k, const int64_t& chain_gap, const int& max_patching_score, + const uint64_t& min_inversion_length, const int& min_wf_length, const int& max_dist_threshold, #ifdef WFA_PNG_TSV_TIMING