Skip to content

Commit

Permalink
Merge pull request #288 from waveygang/biwfa-it
Browse files Browse the repository at this point in the history
biWFA it
  • Loading branch information
ekg authored Oct 30, 2024
2 parents 63fd2ac + 2fbdef1 commit b160f53
Show file tree
Hide file tree
Showing 8 changed files with 308 additions and 165 deletions.
1 change: 1 addition & 0 deletions src/align/include/align_parameters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ struct Parameters {
uint16_t wflambda_segment_length; //segment length for wflambda

bool force_biwfa_alignment; //force biwfa alignment
bool force_wflign; //force alignment with WFlign instead of the default biWFA

int wfa_mismatch_score;
int wfa_gap_opening_score;
Expand Down
60 changes: 19 additions & 41 deletions src/align/include/computeAlignments.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -242,49 +242,19 @@ std::string processAlignment(seq_record_t* rec) {
skch::CommonFunc::reverseComplement(query_seq.data(), queryRegionStrand.data(), query_seq.size());
}

wflign::wavefront::WFlign wflign(
param.wflambda_segment_length,
param.min_identity,
param.force_biwfa_alignment,
param.wfa_mismatch_score,
param.wfa_gap_opening_score,
param.wfa_gap_extension_score,
param.wfa_patching_mismatch_score,
param.wfa_patching_gap_opening_score1,
param.wfa_patching_gap_extension_score1,
param.wfa_patching_gap_opening_score2,
param.wfa_patching_gap_extension_score2,
rec->currentRecord.mashmap_estimated_identity,
param.wflign_mismatch_score,
param.wflign_gap_opening_score,
param.wflign_gap_extension_score,
param.wflign_max_mash_dist,
param.wflign_min_wavefront_length,
param.wflign_max_distance_threshold,
param.wflign_max_len_major,
param.wflign_max_len_minor,
param.wflign_erode_k,
param.chain_gap,
param.wflign_min_inv_patch_len,
param.wflign_max_patching_score);
// Set up penalties for biWFA
wflign_penalties_t wfa_penalties;
wfa_penalties.match = 0;
wfa_penalties.mismatch = param.wfa_patching_mismatch_score;
wfa_penalties.gap_opening1 = param.wfa_patching_gap_opening_score1;
wfa_penalties.gap_extension1 = param.wfa_patching_gap_extension_score1;
wfa_penalties.gap_opening2 = param.wfa_patching_gap_opening_score2;
wfa_penalties.gap_extension2 = param.wfa_patching_gap_extension_score2;

std::stringstream output;
wflign.set_output(
&output,
#ifdef WFA_PNG_TSV_TIMING
!param.tsvOutputPrefix.empty(),
nullptr,
param.prefix_wavefront_plot_in_png,
param.wfplot_max_size,
!param.path_patching_info_in_tsv.empty(),
nullptr,
#endif
true, // merge alignments
param.emit_md_tag,
!param.sam_format,
param.no_seq_in_sam);

wflign.wflign_affine_wavefront(
// Do direct biWFA alignment
wflign::wavefront::do_biwfa_alignment(
rec->currentRecord.qId,
queryRegionStrand.data(),
rec->queryTotalLength,
Expand All @@ -295,7 +265,15 @@ std::string processAlignment(seq_record_t* rec) {
ref_seq_ptr,
rec->refTotalLength,
rec->currentRecord.rStartPos,
rec->currentRecord.rEndPos - rec->currentRecord.rStartPos);
rec->currentRecord.rEndPos - rec->currentRecord.rStartPos,
output,
wfa_penalties,
param.emit_md_tag,
!param.sam_format,
param.no_seq_in_sam,
param.min_identity,
param.wflign_max_len_minor,
rec->currentRecord.mashmap_estimated_identity);

return output.str();
}
Expand Down
103 changes: 103 additions & 0 deletions src/common/wflign/src/alignment_printer.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#ifndef ALIGNMENT_PRINTER_HPP
#define ALIGNMENT_PRINTER_HPP

#include <string>
#include <sstream>
#include "wflign.hpp"

namespace wflign {
namespace wavefront {

void write_tag_and_md_string(
std::ostream &out,
const char *cigar_ops,
const int cigar_start,
const int cigar_end,
const int target_start,
const char *target,
const int64_t target_offset,
const int64_t target_pointer_shift);

void write_alignment_sam(
std::ostream &out,
const alignment_t& patch_aln,
const std::string& query_name,
const uint64_t& query_total_length,
const uint64_t& query_offset,
const uint64_t& query_length,
const bool& query_is_rev,
const std::string& target_name,
const uint64_t& target_total_length,
const uint64_t& target_offset,
const uint64_t& target_length,
const float& min_identity,
const float& mashmap_estimated_identity,
const bool& no_seq_in_sam,
const bool& emit_md_tag,
const char* query,
const char* target,
const int64_t& target_pointer_shift);

bool write_alignment_paf(
std::ostream& out,
const alignment_t& aln,
const std::string& query_name,
const uint64_t& query_total_length,
const uint64_t& query_offset,
const uint64_t& query_length,
const bool& query_is_rev,
const std::string& target_name,
const uint64_t& target_total_length,
const uint64_t& target_offset,
const uint64_t& target_length,
const float& min_identity,
const float& mashmap_estimated_identity,
const bool& with_endline = true,
const bool& is_rev_patch = false);

void write_merged_alignment(
std::ostream &out,
const std::vector<alignment_t *> &trace,
wfa::WFAlignerGapAffine2Pieces& wf_aligner,
const wflign_penalties_t& convex_penalties,
const bool& emit_md_tag,
const bool& paf_format_else_sam,
const bool& no_seq_in_sam,
const char* query,
const std::string& query_name,
const uint64_t& query_total_length,
const uint64_t& query_offset,
const uint64_t& query_length,
const bool& query_is_rev,
const char* target,
const std::string& target_name,
const uint64_t& target_total_length,
const uint64_t& target_offset,
const uint64_t& target_length,
const float& min_identity,
#ifdef WFA_PNG_TSV_TIMING
const long& elapsed_time_wflambda_ms,
const uint64_t& num_alignments,
const uint64_t& num_alignments_performed,
#endif
const float& mashmap_estimated_identity,
const uint64_t& wflign_max_len_major,
const uint64_t& wflign_max_len_minor,
const int& erode_k,
const int64_t& chain_gap,
const int& max_patching_score,
const uint64_t& min_inversion_length,
const int& min_wf_length,
const int& max_dist_threshold,
#ifdef WFA_PNG_TSV_TIMING
const std::string* prefix_wavefront_plot_in_png,
const uint64_t& wfplot_max_size,
const bool& emit_patching_tsv,
std::ostream* out_patching_tsv,
#endif
const bool& with_endline = true);

} // namespace wavefront
} // namespace wflign

#endif
155 changes: 130 additions & 25 deletions src/common/wflign/src/wflign.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,104 @@ namespace wavefront {
/*
* Configuration
*/
#define MAX_LEN_FOR_STANDARD_WFA 1000
#define MIN_WF_LENGTH 256

void do_biwfa_alignment(
const std::string& query_name,
char* const query,
const uint64_t query_total_length,
const uint64_t query_offset,
const uint64_t query_length,
const bool query_is_rev,
const std::string& target_name,
char* const target,
const uint64_t target_total_length,
const uint64_t target_offset,
const uint64_t target_length,
std::ostream& out,
const wflign_penalties_t& penalties,
const bool emit_md_tag,
const bool paf_format_else_sam,
const bool no_seq_in_sam,
const float min_identity,
const uint64_t wflign_max_len_minor,
const float mashmap_estimated_identity) {

// Create WFA aligner with the provided penalties
wfa::WFAlignerGapAffine2Pieces wf_aligner(
0, // match
penalties.mismatch,
penalties.gap_opening1,
penalties.gap_extension1,
penalties.gap_opening2,
penalties.gap_extension2,
wfa::WFAligner::Alignment,
wfa::WFAligner::MemoryUltralow);

wf_aligner.setHeuristicNone();

// Perform the alignment
const int status = wf_aligner.alignEnd2End(target, (int)target_length, query, (int)query_length);

if (status == 0) { // WF_STATUS_SUCCESSFUL
// Create alignment record on stack
alignment_t aln;
aln.ok = true;
aln.j = 0;
aln.i = 0;
aln.query_length = query_length;
aln.target_length = target_length;
aln.is_rev = false;

// Copy alignment CIGAR
wflign_edit_cigar_copy(wf_aligner, &aln.edit_cigar);

// Write alignment
if (paf_format_else_sam) {
write_alignment_paf(
out,
aln,
query_name,
query_total_length,
query_offset,
query_length,
query_is_rev,
target_name,
target_total_length,
target_offset,
target_length,
min_identity,
mashmap_estimated_identity);
} else {
// Write SAM output directly
write_alignment_sam(
out,
aln,
query_name,
query_total_length,
query_offset,
query_length,
query_is_rev,
target_name,
target_total_length,
target_offset,
target_length,
min_identity,
mashmap_estimated_identity,
no_seq_in_sam,
emit_md_tag,
query,
target,
0); // No target pointer shift for biwfa
}
}
}

/*
* Configuration
*/
#define MAX_LEN_FOR_STANDARD_WFA 1000

/*
* Utils
*/
Expand All @@ -42,7 +137,7 @@ void clean_up_sketches(std::vector<std::vector<rkmh::hash_t>*> &sketches) {
WFlign::WFlign(
const uint16_t segment_length,
const float min_identity,
const bool force_biwfa_alignment,
const bool force_wflign_,
const int wfa_mismatch_score,
const int wfa_gap_opening_score,
const int wfa_gap_extension_score,
Expand All @@ -68,7 +163,7 @@ WFlign::WFlign(
this->segment_length = segment_length;
this->min_identity = min_identity;

this->force_biwfa_alignment = force_biwfa_alignment;
this->force_wflign = force_wflign_;

this->wfa_mismatch_score = wfa_mismatch_score;
this->wfa_gap_opening_score = wfa_gap_opening_score;
Expand Down Expand Up @@ -331,6 +426,35 @@ void WFlign::wflign_affine_wavefront(
return;
}

// Set penalties for wfa convex
wflign_penalties_t wfa_convex_penalties;
if (wfa_patching_mismatch_score > 0 && wfa_patching_gap_opening_score1 > 0 && wfa_patching_gap_extension_score1 > 0 && wfa_patching_gap_opening_score2 > 0 && wfa_patching_gap_extension_score2 > 0){
wfa_convex_penalties.match = 0;
wfa_convex_penalties.mismatch = wfa_patching_mismatch_score;
wfa_convex_penalties.gap_opening1 = wfa_patching_gap_opening_score1;
wfa_convex_penalties.gap_extension1 = wfa_patching_gap_extension_score1;
wfa_convex_penalties.gap_opening2 = wfa_patching_gap_opening_score2;
wfa_convex_penalties.gap_extension2 = wfa_patching_gap_extension_score2;
} else {
wfa_convex_penalties.match = 0;
wfa_convex_penalties.mismatch = 6;
wfa_convex_penalties.gap_opening1 = 6;
wfa_convex_penalties.gap_extension1 = 2;
wfa_convex_penalties.gap_opening2 = 26;
wfa_convex_penalties.gap_extension2 = 1;
}

// Use biWFA for smaller sequences or very high identity matches

if (!force_wflign && (query_length <= MAX_LEN_FOR_STANDARD_WFA || target_length <= MAX_LEN_FOR_STANDARD_WFA)) {
do_biwfa_alignment(
query_name, query, query_total_length, query_offset, query_length, query_is_rev,
target_name, target, target_total_length, target_offset, target_length,
*out, wfa_convex_penalties, emit_md_tag, paf_format_else_sam, no_seq_in_sam,
min_identity, wflign_max_len_minor, mashmap_estimated_identity);
return;
}

// Check if mashmap_estimated_identity == 1 to avoid division by zero, leading to a minhash_kmer_size of 8.
// Such low value was leading to confusion in HORs alignments in the human centromeres (high runtime and memory usage, and wrong alignments)
const int minhash_kmer_size = mashmap_estimated_identity == 1 ? 17 : std::max(8, std::min(17, (int)std::floor(1.0 / (1.0 - mashmap_estimated_identity))));
Expand Down Expand Up @@ -366,23 +490,6 @@ void WFlign::wflign_affine_wavefront(
*/
}

// Set penalties for wfa convex
wflign_penalties_t wfa_convex_penalties;
if (wfa_patching_mismatch_score > 0 && wfa_patching_gap_opening_score1 > 0 && wfa_patching_gap_extension_score1 > 0 && wfa_patching_gap_opening_score2 > 0 && wfa_patching_gap_extension_score2 > 0){
wfa_convex_penalties.match = 0;
wfa_convex_penalties.mismatch = wfa_patching_mismatch_score;
wfa_convex_penalties.gap_opening1 = wfa_patching_gap_opening_score1;
wfa_convex_penalties.gap_extension1 = wfa_patching_gap_extension_score1;
wfa_convex_penalties.gap_opening2 = wfa_patching_gap_opening_score2;
wfa_convex_penalties.gap_extension2 = wfa_patching_gap_extension_score2;
} else {
wfa_convex_penalties.match = 0;
wfa_convex_penalties.mismatch = 5;
wfa_convex_penalties.gap_opening1 = 8;
wfa_convex_penalties.gap_extension1 = 2;
wfa_convex_penalties.gap_opening2 = 49;
wfa_convex_penalties.gap_extension2 = 1;
}

// heuristic bound on the max mash dist, adaptive based on estimated
// identity the goal here is to sparsify the set of alignments in the
Expand Down Expand Up @@ -426,11 +533,8 @@ void WFlign::wflign_affine_wavefront(
#ifdef WFA_PNG_TSV_TIMING
const auto start_time = std::chrono::steady_clock::now();
#endif
if (force_biwfa_alignment ||
(query_length <= segment_length * 8 || target_length <= segment_length * 8) ||
(mashmap_estimated_identity >= 0.99
&& query_length <= MAX_LEN_FOR_STANDARD_WFA && target_length <= MAX_LEN_FOR_STANDARD_WFA)
) {

if (!force_wflign) {
wfa::WFAlignerGapAffine2Pieces* wf_aligner =
new wfa::WFAlignerGapAffine2Pieces(
0,
Expand Down Expand Up @@ -493,6 +597,7 @@ void WFlign::wflign_affine_wavefront(
std::chrono::steady_clock::now() - start_time).count();
#endif


// Free old aligner
delete wf_aligner;

Expand Down
Loading

0 comments on commit b160f53

Please sign in to comment.