Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat/better seed debug logging #1240

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 15 additions & 5 deletions packages_rs/nextclade/src/align/align.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use crate::alphabet::letter::Letter;
use crate::alphabet::nuc::Nuc;
use crate::make_error;
use eyre::{Report, WrapErr};
use log::{info, trace};
use log::{debug, info, trace};

fn align_pairwise<T: Letter<T>>(
qry_seq: &[T],
Expand Down Expand Up @@ -48,7 +48,7 @@ pub fn align_nuc(
if ref_len + qry_len < (10 * params.seed_length) {
// for very short sequences, use full square
let stripes = full_matrix(ref_len, qry_len);
trace!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Band construction: short sequences, using full matrix");
debug!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Band construction: short sequences, using full matrix");
return Ok(align_pairwise(qry_seq, ref_seq, gap_open_close, params, &stripes));
}

Expand Down Expand Up @@ -77,24 +77,34 @@ pub fn align_nuc(
params.max_band_area,
)?;

if log::max_level() == log::Level::Debug {
let band_area: usize = stripes.iter().map(Stripe::len).sum();
let mean_band_width = band_area / ref_len;
debug!("Nucleotide alignment band area={band_area}, mean band width={mean_band_width}");
}

let mut alignment = align_pairwise(&qry_seq, ref_seq, gap_open_close, params, &stripes);
alignment.is_reverse_complement = is_reverse_complement;

let alignment_score = alignment.alignment_score;

debug!("Attempt: {attempt}, Alignment score: {alignment_score}");

if alignment.hit_boundary {
terminal_bandwidth *= 2;
excess_bandwidth *= 2;
allowed_mismatches *= 2;
attempt += 1;
info!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Band boundary is hit on attempt {}. Retrying with relaxed parameters. Alignment score was: {}", attempt, alignment.alignment_score);
info!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Band boundary is hit on attempt {}. Retrying with relaxed parameters. Alignment score was: {alignment_score}", attempt);
} else {
if attempt > 0 {
info!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Succeeded without hitting band boundary on attempt {}. Alignment score was: {}", attempt+1, alignment.alignment_score);
info!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Succeeded without hitting band boundary on attempt {}. Alignment score was: {alignment_score}", attempt+1);
}
return Ok(alignment);
}

if attempt > params.max_alignment_attempts {
info!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Attempted to relax band parameters {attempt} times, but still hitting the band boundary. Alignment score was: {}", alignment.alignment_score);
info!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Attempted to relax band parameters {attempt} times, but still hitting the band boundary. Alignment score was: {alignment_score}");
return Ok(alignment);
}
}
Expand Down
3 changes: 2 additions & 1 deletion packages_rs/nextclade/src/align/score_matrix.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ pub fn score_matrix<T: Letter<T>>(
let mut scores = Band2d::<i32>::new(stripes);
let band_size = paths.data_len();

trace!("Score matrix: allocated alignment band of size={band_size}");
let mean_band_width = band_size / ref_len;
trace!("Score matrix: allocated alignment band of size={band_size}, mean band width={mean_band_width}");

let left_align = match params.gap_alignment_side {
GapAlignmentSide::Left => 1,
Expand Down
37 changes: 34 additions & 3 deletions packages_rs/nextclade/src/align/seed_match2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use eyre::Report;
use gcollections::ops::{Bounded, Intersection, IsEmpty, Union};
use interval::interval_set::{IntervalSet, ToIntervalSet};
use itertools::Itertools;
use log::{debug, trace};
use std::borrow::Cow;
use std::cmp::{max, min};
use std::collections::{BTreeMap, VecDeque};
Expand Down Expand Up @@ -481,14 +482,44 @@ pub fn get_seed_matches2(
let seed_matches = chain_seeds(&matches);
// write_matches_to_file(&seed_matches, "chained_matches.csv");

// Trace basic seed alignment stats
if log::max_level() >= log::Level::Debug {
let max_shift = seed_matches.iter().map(|sm| sm.offset.abs()).max().unwrap_or(0);
// Need to iterate in pairs
let max_unmatched_ref_stretch = seed_matches
.iter()
.tuple_windows()
.map(|(sm1, sm2)| sm2.ref_pos - (sm1.ref_pos + sm1.length))
.max()
.unwrap_or(0);
let max_unmatched_qry_stretch = seed_matches
.iter()
.tuple_windows()
.map(|(sm1, sm2)| sm2.qry_pos - (sm1.qry_pos + sm1.length))
.max()
.unwrap_or(0);
let first_match = seed_matches.first().unwrap();
let last_match = seed_matches.last().unwrap();
let first_match_ref_pos = first_match.ref_pos;
let first_match_qry_pos = first_match.qry_pos;
let last_match_ref_pos = ref_seq.len() - last_match.ref_pos - last_match.length;
let last_match_qry_pos = qry_seq.len() - last_match.qry_pos - last_match.length;

let n_matches = seed_matches.len();
debug!("Chained seed stats. max indel: {max_shift}, # matches: {n_matches}, first/last match distance from start/end [start: (ref: {first_match_ref_pos}, qry: {first_match_qry_pos}), end: (ref: {last_match_ref_pos}, qry: {last_match_qry_pos})], max unmatched stretch (ref: {max_unmatched_ref_stretch}, qry stretch: {max_unmatched_qry_stretch})");
Comment on lines +487 to +509
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please extract the body of the condition into a separate function (keeping the condition itself where it is), so that it does not distract from the main algo code.

Note that the conditional jump will always be executed in production. (And every log macro invocation also adds a similar conditional jump underneath, which might hurt performance in tight loops. This particular place seems to be executed rarely though.)

Is this thing required in release builds?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point to refactor into a function.

I know that this will be always executed in production - but it's only executed once per sequence, I don't think we need to think about perf at things so rarely executed.

Not required in release builds but it's nice if once can always get this info - we also include lots of trace statements in release builds. What's the argument against including it?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay. No particular argument other than it becomes unnecessary instructions for most executions. Let's hope branch predictor figures it out :)

}

let sum_of_seed_length: usize = seed_matches.iter().map(|sm| sm.length).sum();
if (sum_of_seed_length as f64 / qry_seq.len() as f64) < params.min_seed_cover {
let qry_coverage = sum_of_seed_length as f64 / qry_seq.len() as f64;
debug!("Seed alignment covers {:.2}% of query length", 100.0 * qry_coverage);
if qry_coverage < params.min_seed_cover {
let query_knowns = qry_seq.iter().filter(|n| n.is_acgt()).count();
let qry_coverage_knowns = sum_of_seed_length as f64 / query_knowns as f64;
if (sum_of_seed_length as f64 / query_knowns as f64) < params.min_seed_cover {
return make_error!(
"Unable to align: seed alignment covers {:.2}% of the query sequence, which is less than expected {:.2}% \
"Unable to align: seed alignment covers {:.2}% of query sequence ACGTs, which is less than required {:.2}% \
(configurable using 'min seed cover' CLI flag or dataset property). This is likely due to low quality of the \
provided sequence, or due to using incorrect reference sequence.",
provided sequence, or due to using an incorrect reference sequence.",
100.0 * (sum_of_seed_length as f64) / (query_knowns as f64),
100.0 * params.min_seed_cover
);
Expand Down