Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

limit block maximum divergence #143

Draft
wants to merge 9 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ The `docs/docs/reference.md` file is generated using script `generate-reference-
```bash
cargo build --bin=pangraph
cd docs
./generate-reference-docs "../target/debug/pangraph" "docs/docs/reference.md"
./generate-reference-docs "../target/debug/pangraph" "docs/reference.md"
```

> ⚠️ Do not edit the generated file manually! All manual changes will be overwritten by automation.
Expand Down
1 change: 1 addition & 0 deletions docs/docs/reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ Align genomes into a multiple sequence alignment graph

Default value: `10`
* `-K`, `--kmer-length <KMER_LENGTH>` — Sets kmer length for mmseqs2 aligner
* `-S`, `--strict-max-divergence` — Set strict maximal divergence. When toggled, proposed mergers with maximal divergence above 1/b are rejected
* `-c`, `--circular` — Toggle if input genomes are circular
* `-x`, `--max-self-map <MAX_SELF_MAP>` — Maximum number of alignment rounds to consider per pairwise graph merger

Expand Down
4 changes: 4 additions & 0 deletions packages/pangraph/src/align/alignment_args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,8 @@ pub struct AlignmentArgs {
#[clap(long, short = 'K')]
#[clap(value_hint = ValueHint::Other)]
pub kmer_length: Option<usize>,

/// Set strict maximal divergence. When toggled, proposed mergers with maximal divergence above 1/b are rejected.
#[clap(long, short = 'S')]
pub strict_max_divergence: bool,
}
2 changes: 1 addition & 1 deletion packages/pangraph/src/align/nextclade/align/params.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ impl Default for NextalignParams {
min_match_length: 40, // Experimentally determined, to keep off-target matches reasonably low
allowed_mismatches: 8, // Ns count as mismatches
window_size: 30,
max_alignment_attempts: 3,
max_alignment_attempts: 5,

// The following args are deprecated and are kept for backwards compatibility (to emit errors if they are set)
max_indel: None,
Expand Down
21 changes: 21 additions & 0 deletions packages/pangraph/src/pangraph/edits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ impl Sub {
alt: self.alt,
}
}

pub fn to_n(&self) -> bool {
self.alt.is_ambiguous()
}
}

#[must_use]
Expand Down Expand Up @@ -249,6 +253,23 @@ impl Edit {
seq_len == 0
}

// Return the average divergence from the consensus sequence,
// given as number of mutations (excluding `N`) per non-deleted base.
pub fn divergence(&self, cons_len: usize, exclude_positions: &[usize]) -> Option<f64> {
let deleted_len = self.dels.iter().map(|d| d.len).sum::<usize>();
let excluded_deleted = exclude_positions
.iter()
.filter(|&pos| self.dels.iter().any(|d| d.contains(*pos)))
.count();
let n_bases = cons_len - deleted_len - exclude_positions.len() + excluded_deleted;
let n_mutations = self
.subs
.iter()
.filter(|s| !s.to_n() && !exclude_positions.contains(&s.pos))
.count();
(n_bases > 0).then_some(n_mutations as f64 / n_bases as f64)
}

#[cfg(any(test, debug_assertions))]
pub fn sanity_check(&self, len: usize) -> Result<(), Report> {
let block_interval = Interval::new(0, len);
Expand Down
43 changes: 39 additions & 4 deletions packages/pangraph/src/pangraph/graph_merging.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,13 @@ pub fn self_merge(graph: Pangraph, args: &PangraphBuildArgs) -> Result<(Pangraph
// - calculate energy and keep only matches with E < 0
// - sort them by energy
// - discard incompatible matches (the ones that have overlapping regions)
let mut matches = filter_matches(&matches, &args.aln_args);

let block_div = args
.aln_args
.strict_max_divergence
.then(|| graph.blocks_max_divergence());

let mut matches = filter_matches(&matches, &args.aln_args, &block_div);
debug!("Matches after filtering: {}", matches.len());
trace!("{matches:#?}");

Expand Down Expand Up @@ -179,22 +185,51 @@ pub fn find_matches(
.wrap_err_with(|| format!("When trying to align sequences using {}", &args.alignment_kernel))
}

pub fn filter_matches(alns: &[Alignment], args: &AlignmentArgs) -> Vec<Alignment> {
pub fn filter_matches(
alns: &[Alignment],
args: &AlignmentArgs,
block_divergence: &Option<BTreeMap<BlockId, Option<f64>>>,
) -> Vec<Alignment> {
// - evaluates the energy of the alignments
// - keeps only matches with E < 0
// - (optionally) keeps only matches with max divergence < threshold
// - sorts them by energy
// - discards incompatible matches (the ones that have overlapping regions)

// TODO: energy is calculated for each alignment.
// Consider calculating it earlier and making it a property to simplify filtering and sorting.
let alns = alns
let mut alns = alns
.iter()
.map(|aln| (aln, alignment_energy2(aln, args)))
.filter(|(_, energy)| energy < &0.0)
.sorted_by_key(|(_, energy)| OrderedFloat(*energy))
.map(|(aln, _)| aln)
.collect_vec();

// if strict_max_divergence is set, and beta > 0, then
// filter alignment, only keep those with max divergence below the threshold
if args.strict_max_divergence && (args.beta > 0.0) {
let bd = block_divergence.as_ref().unwrap();
alns = alns
.iter()
.filter(|aln| {
let div_q = bd.get(&aln.qry.name).unwrap();
let div_r = bd.get(&aln.reff.name).unwrap();
let aln_div = aln.divergence.unwrap();

// if any of these options is not set, remove the alignment
match (div_q, div_r) {
(Some(div_q), Some(div_r)) => {
let total_div = div_q + div_r + aln_div;
total_div < 1. / args.beta
},
_ => false,
}
})
.copied()
.collect_vec();
}

// Iteratively accept alignments if they do not overlap with previously accepted ones
let mut accepted_alns = vec![];
let mut accepted_intervals = btreemap![];
Expand Down Expand Up @@ -364,7 +399,7 @@ mod tests {

let alns = [aln_0.clone(), aln_1.clone(), aln_2, aln_3];

assert_eq!(filter_matches(&alns, &args), vec![aln_1, aln_0]);
assert_eq!(filter_matches(&alns, &args, &None), vec![aln_1, aln_0]);

Ok(())
}
Expand Down
9 changes: 9 additions & 0 deletions packages/pangraph/src/pangraph/pangraph.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,15 @@ impl Pangraph {
}
}

// Returns a map of block ids to the maximum divergence in the block alignment.
pub fn blocks_max_divergence(&self) -> BTreeMap<BlockId, Option<f64>> {
self
.blocks
.iter()
.map(|(bid, block)| (*bid, block.max_divergence()))
.collect()
}

#[cfg(any(test, debug_assertions))]
pub fn sanity_check(&self) -> Result<(), Report> {
for (node_id, node) in &self.nodes {
Expand Down
30 changes: 30 additions & 0 deletions packages/pangraph/src/pangraph/pangraph_block.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use derive_more::{Display, From};
use eyre::{Report, WrapErr};
use getset::{CopyGetters, Getters};
use maplit::btreemap;
use ordered_float::OrderedFloat;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use serde_json::json;
Expand Down Expand Up @@ -195,6 +196,35 @@ impl PangraphBlock {
})
})
}

pub fn max_divergence(&self) -> Option<f64> {
// length of consensus sequence minus ambiguous nucleotides
let consensus_len = self.consensus_len();
let ambiguous_positions = self.consensus.ambiguous_positions();

// if any of the divergences cannot be calculated, return None
let divs = self
.alignments
.values()
.map(|edit| edit.divergence(consensus_len, &ambiguous_positions))
.collect::<Vec<_>>();

// if any of the divergences cannot be calculated, return None
if divs.iter().any(|div| div.is_none()) {
None
} else {
Some(
divs
.iter()
.map(|div| OrderedFloat(div.unwrap()))
.max()
.unwrap()
.into_inner(),
)
}

// return the maximum divergence
}
}

#[derive(Copy, Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
Expand Down
13 changes: 13 additions & 0 deletions packages/pangraph/src/representation/seq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,19 @@ impl Seq {
pub fn rotate_right(&mut self, mid: usize) {
self.data.rotate_right(mid);
}

pub fn n_ambiguous_bases(&self) -> usize {
self.data.iter().filter(|&c| c.is_ambiguous()).count()
}

pub fn ambiguous_positions(&self) -> Vec<usize> {
self
.data
.iter()
.enumerate()
.filter_map(|(i, c)| c.is_ambiguous().then_some(i))
.collect()
}
}

impl PartialEq<str> for Seq {
Expand Down
28 changes: 28 additions & 0 deletions packages/pangraph/src/representation/seq_char.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,34 @@ impl AsciiChar {
debug_assert!(s.len() == 1);
Self(s.as_bytes()[0])
}

pub fn is_ambiguous(&self) -> bool {
matches!(
self.0,
b'N'
| b'n'
| b'R'
| b'r'
| b'Y'
| b'y'
| b'S'
| b's'
| b'W'
| b'w'
| b'K'
| b'k'
| b'M'
| b'm'
| b'B'
| b'b'
| b'D'
| b'd'
| b'H'
| b'h'
| b'V'
| b'v'
)
}
}

impl core::fmt::Display for AsciiChar {
Expand Down
4 changes: 2 additions & 2 deletions packages/pangraph/src/tree/neighbor_joining.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
use crate::distance::mash::mash_distance::mash_distance;
use crate::distance::mash::minimizer::MinimizersParams;
use crate::pangraph::pangraph::Pangraph;
use crate::tree::balance::balance;
// use crate::tree::balance::balance;
use crate::tree::clade::Clade;
use crate::utils::lock::Lock;
use crate::utils::ndarray::broadcast;
Expand All @@ -28,7 +28,7 @@ pub fn build_tree_using_neighbor_joining(graphs: Vec<Pangraph>) -> Result<Lock<C
let tree = Lock::new(Clade::from_children(None, &nodes[0], &nodes[1]));

// Balance guide tree (to increase available parallelism during parallel traversal?)
let tree = balance(&tree);
// let tree = balance(&tree);

Ok(tree)
}
Expand Down