diff --git a/.github/workflows/test-ci.yml b/.github/workflows/test-ci.yml new file mode 100644 index 0000000..1e2f14e --- /dev/null +++ b/.github/workflows/test-ci.yml @@ -0,0 +1,14 @@ +name: Continuous integration +on: [push, pull_request] + +jobs: + test: + name: cargo test + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + with: + components: clippy + - run: cargo test --all-features --release + - run: cargo clippy -- -D warnings \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fa8d85a --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +Cargo.lock +target diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..6628dea --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "waffle_con" +version = "0.4.2" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +derive_builder = "0.13.0" +env_logger = "0.11.1" +itertools = "0.12.1" +log = "0.4.20" +priority-queue = "1.3.2" +rustc-hash = "1.1.0" +simple-error = "0.3.0" +rand = "0.8.5" + +[dev-dependencies] +criterion = "0.5.1" +csv = "1.3.0" +serde = "1.0.197" + +[[bench]] +name = "consensus_bench" +harness = false + +[profile.release] +lto = true \ No newline at end of file diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..c86d710 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,15 @@ +# Pacific Biosciences Software License Agreement +1. **Introduction and Acceptance.** This Software License Agreement (this “**Agreement**”) is a legal agreement between you (either an individual or an entity) and Pacific Biosciences of California, Inc. (“**PacBio**”) regarding the use of the PacBio software accompanying this Agreement, which includes documentation provided in “online” or electronic form (together, the “**Software**”). 
PACBIO PROVIDES THE SOFTWARE SOLELY ON THE TERMS AND CONDITIONS SET FORTH IN THIS AGREEMENT AND ON THE CONDITION THAT YOU ACCEPT AND COMPLY WITH THEM. BY DOWNLOADING, DISTRIBUTING, MODIFYING OR OTHERWISE USING THE SOFTWARE, YOU (A) ACCEPT THIS AGREEMENT AND AGREE THAT YOU ARE LEGALLY BOUND BY ITS TERMS; AND (B) REPRESENT AND WARRANT THAT: (I) YOU ARE OF LEGAL AGE TO ENTER INTO A BINDING AGREEMENT; AND (II) IF YOU REPRESENT A CORPORATION, GOVERNMENTAL ORGANIZATION OR OTHER LEGAL ENTITY, YOU HAVE THE RIGHT, POWER AND AUTHORITY TO ENTER INTO THIS AGREEMENT ON BEHALF OF SUCH ENTITY AND BIND SUCH ENTITY TO THESE TERMS. IF YOU DO NOT AGREE TO THE TERMS OF THIS AGREEMENT, PACBIO WILL NOT AND DOES NOT LICENSE THE SOFTWARE TO YOU AND YOU MUST NOT DOWNLOAD, INSTALL OR OTHERWISE USE THE SOFTWARE OR DOCUMENTATION. +2. **Grant of License.** Subject to your compliance with the restrictions set forth in this Agreement, PacBio hereby grants to you a non-exclusive, non-transferable license during the Term to install, copy, use, distribute in binary form only, and host the Software. If you received the Software from PacBio in source code format, you may also modify and/or compile the Software. +3. **License Restrictions.** You may not remove or destroy any copyright notices or other proprietary markings. You may only use the Software to process or analyze data generated on a PacBio instrument or otherwise provided to you by PacBio. Any use, modification, translation, or compilation of the Software not expressly authorized in Section 2 is prohibited. You may not use, modify, host, or distribute the Software so that any part of the Software becomes subject to any license that requires, as a condition of use, modification, hosting, or distribution, that (a) the Software, in whole or in part, be disclosed or distributed in source code form or (b) any third party have the right to modify the Software, in whole or in part. +4. 
**Ownership.** The license granted to you in Section 2 is not a transfer or sale of PacBio’s ownership rights in or to the Software. Except for the license granted in Section 2, PacBio retains all right, title and interest (including all intellectual property rights) in and to the Software. The Software is protected by applicable intellectual property laws, including United States copyright laws and international treaties. +5. **Third Party Materials.** The Software may include software, content, data or other materials, including related documentation and open source software, that are owned by one or more third parties and that are subject to separate licensee terms (“**Third-Party Licenses**”). A list of all materials, if any, can be found the documentation for the Software. You acknowledge and agree that such third party materials subject to Third-Party Licenses are not licensed to you pursuant to the provisions of this Agreement and that this Agreement shall not be construed to grant any such right and/or license. You shall have only such rights and/or licenses, if any, to use such third party materials as set forth in the applicable Third-Party Licenses. +6. **Feedback.** If you provide any feedback to PacBio concerning the functionality and performance of the Software, including identifying potential errors and improvements (“**Feedback**”), such Feedback shall be owned by PacBio. You hereby assign to PacBio all right, title, and interest in and to the Feedback, and PacBio is free to use the Feedback without any payment or restriction. +7. **Confidentiality.** You must hold in the strictest confidence the Software and any related materials or information including, but not limited to, any Feedback, technical data, research, product plans, or know-how provided by PacBio to you, directly or indirectly in writing, orally or by inspection of tangible objects (“**Confidential Information**”). 
You will not disclose any Confidential Information to third parties, including any of your employees who do not have a need to know such information, and you will take reasonable measures to protect the secrecy of, and to avoid disclosure and unauthorized use of, the Confidential Information. You will immediately notify the PacBio in the event of any unauthorized or suspected use or disclosure of the Confidential Information. To protect the Confidential Information contained in the Software, you may not reverse engineer, decompile, or disassemble the Software, except to the extent the foregoing restriction is expressly prohibited by applicable law. +8. **Termination.** This Agreement will terminate upon the earlier of: (a) your failure to comply with any term of this Agreement; or (b) return, destruction, or deletion of all copies of the Software in your possession. PacBio’s rights and your obligations will survive the termination of this Agreement. The “**Term**” means the period beginning on when this Agreement becomes effective until the termination of this Agreement. Upon termination of this Agreement for any reason, you will delete from all of your computer libraries or storage devices or otherwise destroy all copies of the Software and derivatives thereof. +9. **NO OTHER WARRANTIES.** THE SOFTWARE IS PROVIDED ON AN “AS IS” BASIS. YOU ASSUME ALL RESPONSIBILITIES FOR SELECTION OF THE SOFTWARE TO ACHIEVE YOUR INTENDED RESULTS, AND FOR THE INSTALLATION OF, USE OF, AND RESULTS OBTAINED FROM THE SOFTWARE. TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, PACBIO DISCLAIMS ALL WARRANTIES, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO IMPLIED WARRANTIES OF MERCHANTABILITY, QUALITY, ACCURACY, TITLE, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE WITH RESPECT TO THE SOFTWARE AND THE ACCOMPANYING WRITTEN MATERIALS. THERE IS NO WARRANTY AGAINST INTERFERENCE WITH THE ENJOYMENT OF THE SOFTWARE OR AGAINST INFRINGEMENT. 
THERE IS NO WARRANTY THAT THE SOFTWARE OR PACBIO’S EFFORTS WILL FULFILL ANY OF YOUR PARTICULAR PURPOSES OR NEEDS. +10. **LIMITATION OF LIABILITY.** UNDER NO CIRCUMSTANCES WILL PACBIO BE LIABLE FOR ANY CONSEQUENTIAL, SPECIAL, INDIRECT, INCIDENTAL OR PUNITIVE DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION, LOSS OF DATA OR OTHER SUCH PECUNIARY LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THE SOFTWARE, EVEN IF PACBIO HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL PACBIO’S AGGREGATE LIABILITY FOR DAMAGES ARISING OUT OF THIS AGREEMENT EXCEED $5. THE FOREGOING EXCLUSIONS AND LIMITATIONS OF LIABILITY AND DAMAGES WILL NOT APPLY TO CONSEQUENTIAL DAMAGES FOR PERSONAL INJURY. +11. **Indemnification.** You will indemnify, hold harmless, and defend PacBio (including all of its officers, employees, directors, subsidiaries, representatives, affiliates, and agents) and PacBio’s suppliers from and against any damages (including attorney’s fees and expenses), claims, and lawsuits that arise or result from your use of the Software. +12. **Trademarks.** Certain of the product and PacBio names used in this Agreement, the Software may constitute trademarks of PacBio or third parties. You are not authorized to use any such trademarks. +13. **Export Restrictions.** YOU UNDERSTAND AND AGREE THAT THE SOFTWARE IS SUBJECT TO UNITED STATES AND OTHER APPLICABLE EXPORT-RELATED LAWS AND REGULATIONS AND THAT YOU MAY NOT EXPORT, RE-EXPORT OR TRANSFER THE SOFTWARE OR ANY DIRECT PRODUCT OF THE SOFTWARE EXCEPT AS PERMITTED UNDER THOSE LAWS. WITHOUT LIMITING THE FOREGOING, EXPORT, RE-EXPORT, OR TRANSFER OF THE SOFTWARE TO CUBA, IRAN, NORTH KOREA, SYRIA, RUSSIA, BELARUS, AND THE REGIONS OF CRIMEA, LNR, AND DNR OF UKRAINE IS PROHIBITED. +14. **General.** This Agreement is governed by the laws of the State of California, without reference to its conflict of laws principles. 
This Agreement is the entire agreement between you and PacBio and supersedes any other communications with respect to the Software. If any provision of this Agreement is held invalid or unenforceable, the remainder of this Agreement will continue in full force and effect. diff --git a/README.md b/README.md index 0ba41ec..8ad567f 100644 --- a/README.md +++ b/README.md @@ -1 +1,55 @@ +[![Build status](https://github.com/PacificBiosciences/waffle_con/actions/workflows/test-ci.yml/badge.svg)](https://github.com/PacificBiosciences/waffle_con/actions) + + # waffle_con +This crate contains our implementation of the Dynamic WFA (DWFA) consensus algorithms, or `waffle_con`. +The algorithms contained within were built to support the consensus steps of [pb-StarPhase](https://github.com/PacificBiosciences/pb-StarPhase). +Any issues related to StarPhase should be reported on the StarPhase GitHub page. + +This crate contains functionality for: + +* ConsensusDWFA - One input string per sequence, one output sequence +* DualConsensusDWFA - One input string per sequence, 1 or 2 output sequences +* PriorityConsensusDWFA - Multiple input strings per sequence (priority), 1+ output sequences + +## Full documentation +`waffle_con` provides extensive in-line documentation. +A user-friendly HTML version can be generated via `cargo doc`. + +## Methods +At a high level, this project provides many consensus algorithms that slowly build upon each other from a baseline single-consensus method. +The single consensus method is based on the idea of cost-based exploration of an assembly (or consensus) space. +"Cost" in this context is basically the edit distance between the assembled sequence and a set of inputs. +We use a dynamic WFA algorithm to both nominate and score the assembled sequences. +These sequences are then explored using a Dijkstra-like approach (least-cost-first). 
+Thus the core loop is: + +* Pop a candidate from the priority queue - check if this candidate is finished and compare to current best results +* If not finished, each dynamic WFA nominates one or more extensions to the candidate +* Each candidate extension is added to the dynamic WFA for each sequence +* The total combined edit distance is used to score that candidate +* The candidate is placed into the min-cost priority queue + +The algorithm can be extended to the "dual" option by allowing it to split into two candidate sequences when sufficient evidence is present (e.g., sufficient minor allele frequency and/or number of sequences). +Then the best score is used for each input sequence when calculating the edit distance cost. +This can be further split into a multi-consensus by repeatedly running dual-consensus in a binary-tree like system (e.g., repeatedly split the sequences into groups until they do not want to split further). +Finally, priority consensus is just multi-consensus on a chain of sequence inputs instead of a single sequence input. + +## Limitations +`waffle_con` has been designed for the specific purpose of PGx consensus in StarPhase using long, accurate HiFi reads. +The underlying algorithms rely on a basic edit-distance wavefront algorithm, which scales with the total edit distance between two sequences. +Thus, high error or high divergence sequences are more likely to lead to longer run times. +Additionally, high error may cause the traversal algorithm to "get lost" in the search space due to weak consensus, which may ultimately lead to lower quality consensus sequences. +Additionally, best results are with full-length input sequences. +Partial sequences require estimating start/stop positions, which injects more opportunities for error. + +## Support information +The `waffle_con` crate is a pre-release software intended for research use **only** and not for use in diagnostic procedures. 
+While efforts were made to ensure that `waffle_con` lives up to the quality that PacBio strives for, we make no warranty regarding this software. + +As `waffle_con` is **not** covered by any service level agreement or the like, please do not contact a PacBio Field Applications Scientists or PacBio Customer Service for assistance with any `waffle_con` release. +Please report all issues through GitHub instead. +We make no warranty that any such issue will be addressed, to any extent or within any time frame. + +### DISCLAIMER +THIS WEBSITE AND CONTENT AND ALL SITE-RELATED SERVICES, INCLUDING ANY DATA, ARE PROVIDED "AS IS," WITH ALL FAULTS, WITH NO REPRESENTATIONS OR WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTIES OF MERCHANTABILITY, SATISFACTORY QUALITY, NON-INFRINGEMENT OR FITNESS FOR A PARTICULAR PURPOSE. YOU ASSUME TOTAL RESPONSIBILITY AND RISK FOR YOUR USE OF THIS SITE, ALL SITE-RELATED SERVICES, AND ANY THIRD PARTY WEBSITES OR APPLICATIONS. NO ORAL OR WRITTEN INFORMATION OR ADVICE SHALL CREATE A WARRANTY OF ANY KIND. ANY REFERENCES TO SPECIFIC PRODUCTS OR SERVICES ON THE WEBSITES DO NOT CONSTITUTE OR IMPLY A RECOMMENDATION OR ENDORSEMENT BY PACIFIC BIOSCIENCES. 
diff --git a/benches/consensus_bench.rs b/benches/consensus_bench.rs new file mode 100644 index 0000000..befaa7a --- /dev/null +++ b/benches/consensus_bench.rs @@ -0,0 +1,55 @@ + +use criterion::{black_box, criterion_group, criterion_main, Criterion}; + +use waffle_con::cdwfa_config::CdwfaConfigBuilder; +use waffle_con::consensus::ConsensusDWFA; +use waffle_con::example_gen::generate_test; + +pub fn bench_consensus(c: &mut Criterion) { + let alphabet_size = 4; + let seq_lens = [1000, 10000]; + let num_samples = [8, 30]; + let error_rates = [0.0, 0.01, 0.02]; + + let mut benchmark_group = c.benchmark_group("consensus-group"); + benchmark_group.sample_size(10); + + for &sl in seq_lens.iter() { + for &ns in num_samples.iter() { + // require 25% of reads to go forth + let config = CdwfaConfigBuilder::default() + .min_count((ns as u64) / 4) + .build().unwrap(); + for &er in error_rates.iter() { + let (_consensus, dataset) = generate_test(alphabet_size, sl, ns, er); + /* + // uncomment to print out the strings, mostly for initial testing + println!("{consensus:?}"); + for d in dataset.iter() { + println!("{d:?}"); + } + */ + let test_label = format!("consensus_{alphabet_size}x{sl}x{ns}_{er}"); + benchmark_group.bench_function(&test_label, |b| b.iter(|| { + black_box({ + let mut consensus_dwfa = ConsensusDWFA::with_config(config.clone()).unwrap(); + for s in dataset.iter() { + consensus_dwfa.add_sequence(s).unwrap(); + } + let resolved_consensus = consensus_dwfa.consensus().unwrap(); + + // this was an initial sanity check we did as a basic test + // assert_eq!(resolved_consensus[0].sequence(), &consensus); + + resolved_consensus + }); + })); + } + } + } + + benchmark_group.finish(); +} + +criterion_group!(benches, bench_consensus); +criterion_main!(benches); \ No newline at end of file diff --git a/src/cdwfa_config.rs b/src/cdwfa_config.rs new file mode 100644 index 0000000..85e118a --- /dev/null +++ b/src/cdwfa_config.rs @@ -0,0 +1,103 @@ + +/*! 
+Contains configuration information for the consensus DWFA algorithm. +Typical usage is to the use the builder to construct the config, e.g. +``` +use waffle_con::cdwfa_config::{CdwfaConfig, CdwfaConfigBuilder, ConsensusCost}; +let config: CdwfaConfig = CdwfaConfigBuilder::default() + .consensus_cost(ConsensusCost::L2Distance) + .wildcard(Some(b'N')) + .build() + .unwrap(); +``` +*/ + +/// Enumeration of difference scoring types for a consensus. +/// Initially just using L1 distance, which is the sum of edit distance across all inputs. +#[derive(Clone, Copy, Debug, Default, PartialEq)] +pub enum ConsensusCost { + /// Minimizes the total edit distance across all sequences + #[default] + L1Distance, + /// Minimizes the square of the edit distance across all sequences + L2Distance +} + +/** +Contains configuration information for the consensus DWFA algorithm. +Typical usage is to the use the builder to construct the config, e.g. +``` +use waffle_con::cdwfa_config::{CdwfaConfig, CdwfaConfigBuilder, ConsensusCost}; +let config: CdwfaConfig = CdwfaConfigBuilder::default() + .consensus_cost(ConsensusCost::L2Distance) + .wildcard(Some(b'N')) + .build() + .unwrap(); +``` +*/ +#[derive(derive_builder::Builder, Clone, Debug)] +#[builder(default)] +pub struct CdwfaConfig { + /// The consensus scoring cost + pub consensus_cost: ConsensusCost, + /// Maximum queue size, which controls how many active branches we allow during exploration + pub max_queue_size: usize, + /// Maximum capacity, controls how many nodes of each length we process + pub max_capacity_per_size: usize, + /// Maximum return size, which controls how many equal returns we track + pub max_return_size: usize, + /// Maximum explored nodes without a constraint, this prevents hyper-branching in truly ambiguous regions + pub max_nodes_wo_constraint: u64, + /// Minimum number of occurrences of a candidate extension to get used (largest is always used regardless) + pub min_count: u64, + /// Minimum fraction of sequences 
of a candidate extension to get used + pub min_af: f64, + /// For dual/multi-consensus, this will weight the nominated extension by the current edit distance, "accelerating" convergence + pub weighted_by_ed: bool, + /// Enables an optional wildcard character that will match anything + pub wildcard: Option, + /// Dual mode DWFA edit distance pruning threshold; if the DWFAs for the two options diverge more than this threshold, the worse one stops getting tracked + pub dual_max_ed_delta: usize, + // if true, then this will not penalize input sequences that are shorter than the final consensus + pub allow_early_termination: bool, + /// if true, this will automatically shift offsets downwards if nothing starts at "0" + pub auto_shift_offsets: bool, + /// The number of bases before the last_offset to search for an optimal start point + pub offset_window: usize, + /// The number of bases to use in the comparison for calculating best optimal start point + pub offset_compare_length: usize +} + +impl Default for CdwfaConfig { + fn default() -> Self { + Self { + // L1 v. 
L2 is an open question + consensus_cost: ConsensusCost::L1Distance, + // 20 is relatively small, but this seems to work out in practice for our low-error sequences + max_queue_size: 20, + // set to the same by default + max_capacity_per_size: 20, + // Realistically, anything more than 10 is not particularly useful + max_return_size: 10, + // lower values help constrain hyper-branching scenarios, we probably should not go below 10 though + max_nodes_wo_constraint: 1000, + // 3 seems reasonable + min_count: 3, + // by default, we will just let the raw count work + min_af: 0.0, + // by default, it's not clear we want to do this + weighted_by_ed: false, + // by default, we likely do not want a wildcard symbol + wildcard: None, + // if the options have diverged by 20, seems like one is a clear candidate + dual_max_ed_delta: 20, + // to keep with our existing tests, we will not allow this by default + allow_early_termination: false, + // someone might not want this, but most will + auto_shift_offsets: true, + // these were just what we started with + offset_window: 50, + offset_compare_length: 50 + } + } +} \ No newline at end of file diff --git a/src/consensus.rs b/src/consensus.rs new file mode 100644 index 0000000..0d400a8 --- /dev/null +++ b/src/consensus.rs @@ -0,0 +1,849 @@ + +/*! +This module provides access to the ConsensusDWFA, which generates the single best consensus for a set of sequences. 
+ +# Example usage +```rust +use waffle_con::consensus::ConsensusDWFA; + +let sequences = [ + b"ACGT".to_vec(), + b"ACCGT".to_vec(), // this should be the consensus + b"ACCCGT".to_vec() +]; + +// add all the sequences +let mut cdwfa: ConsensusDWFA = Default::default(); +for s in sequences.iter() { + cdwfa.add_sequence(s).unwrap(); +} + +// run consensus and check the results +let consensuses = cdwfa.consensus().unwrap(); +assert_eq!(consensuses.len(), 1); +assert_eq!(consensuses[0].sequence(), sequences[1]); +assert_eq!(consensuses[0].scores(), &[1, 0, 1]); +``` +*/ + +use log::{debug, trace}; +use priority_queue::PriorityQueue; +use rustc_hash::{FxHashMap as HashMap, FxHashSet as HashSet}; +use simple_error::bail; +use std::cmp::Reverse; + +use crate::cdwfa_config::{CdwfaConfig, ConsensusCost}; +use crate::dynamic_wfa::DWFALite; +use crate::pqueue_tracker::PQueueTracker; + +type NodePriority = (Reverse, usize); + +/// Contains a final consensus result +#[derive(Clone, Debug, Default, PartialEq)] +pub struct Consensus { + /// The generated consensus + sequence: Vec, + /// The consensus scoring model + consensus_cost: ConsensusCost, + /// Vector of the scores from the consensus to each sequence + scores: Vec +} + +impl Consensus { + /// Constructor + pub fn new(sequence: Vec, consensus_cost: ConsensusCost, scores: Vec) -> Consensus { + Consensus { + sequence, + consensus_cost, + scores + } + } + + // Getters + pub fn sequence(&self) -> &[u8] { + &self.sequence + } + + pub fn consensus_cost(&self) -> ConsensusCost { + self.consensus_cost + } + + pub fn scores(&self) -> &[usize] { + &self.scores + } +} + +/// Core utility that will generate a consensus sequence using the DWFA approach. +/// For now, it will assume that all sequences are full length representations +#[derive(Debug, Default)] +pub struct ConsensusDWFA<'a> { + /// Contains all the sequences that have been added to this consensus so far. 
+ sequences: Vec<&'a [u8]>, + /// Last offset into the consensus that the sequence is allowed to start, this may get adjusted at run-time. If None, then assume start. + offsets: Vec>, + /// The config for this consensus run + config: CdwfaConfig, + /// The alphabet we will use for consensus building + alphabet: HashSet +} + +impl<'a> ConsensusDWFA<'a> { + /// Creates a new instance of ConsensusDWFA and performs sanity checks. + /// # Arguments + /// * `config` - the type of consensus score we want to use + /// # Errors + /// * None so far + pub fn with_config(config: CdwfaConfig) -> Result, Box> { + Ok(ConsensusDWFA { + sequences: vec![], + offsets: vec![], + config, + alphabet: Default::default() + }) + } + + /// Adds a new sequence to the list without an offset. + /// # Arguments + /// * `sequence` - the new sequence to add + /// # Errors + /// * None so far + pub fn add_sequence(&mut self, sequence: &'a [u8]) -> Result<(), Box> { + self.add_sequence_offset(sequence, None) + } + + /// Adds a new sequence to the list + /// # Arguments + /// * `sequence` - the new sequence to add + /// * `last_offset` - an optional offset into the consensus that is the last allowed offset for the sequence to start + /// # Errors + /// * None so far + pub fn add_sequence_offset(&mut self, sequence: &'a [u8], last_offset: Option) -> Result<(), Box> { + // add ever character to our alphabet + self.alphabet.extend(sequence.iter().cloned()); + + // make sure we didn't add the wildcard by accident + // note: it's easier to just remove it once here than do something fancy with the above iteration + if let Some(wc) = self.config.wildcard { + self.alphabet.remove(&wc); + } + + // now add the sequence + self.sequences.push(sequence); + self.offsets.push(last_offset); + Ok(()) + } + + /// The core function that gets called after adding all the sequences we care about + /// # Errors + /// * if the DWFA throws errors + pub fn consensus(&self) -> Result, Box> { + // initialize everything + let 
mut maximum_error = usize::MAX; + let mut nodes_explored: usize = 0; + let mut nodes_ignored: usize = 0; + let mut peak_queue_size: usize = 0; + let mut farthest_consensus: usize = 0; + let mut last_constraint: u64 = 0; + + let offset_window = self.config.offset_window; // the window around the provided offset that we check + let offset_compare_length = self.config.offset_compare_length; // the number of bases we check to figure out the real start + + let offsets: Vec> = if self.config.auto_shift_offsets { + // first figure out if we need to address the offsets for this re-run of consensus + let mut min_offset = usize::MAX; + let mut start_sequence_found = false; + for offset in self.offsets.iter() { + match offset { + Some(o) => min_offset = min_offset.min(*o), + None => start_sequence_found = true + }; + } + + if !start_sequence_found { + debug!("No start sequence detected, shifting all offsets by {min_offset}"); + self.offsets.iter() + .map(|o| { + let value = o.unwrap(); + if value == min_offset { + // equal to minimum, flag as no offset + None + } else { + // past minimum, so down-shift it + Some(value - min_offset) + } + }) + .collect::>>() + } else { + self.offsets.clone() + } + } else { + self.offsets.clone() + }; + + debug!("Offsets: {:?}", offsets); + + // build up the list of sizes where we need to activate one or more sequences + let mut activate_points: HashMap> = Default::default(); + let mut max_activate = 0; + let mut initially_active: usize = 0; + for (seq_index, offset) in offsets.iter().enumerate() { + if let Some(last_offset) = offset { + // we have an approx, calculate when it can get activated and add it to our list + let activate_length = last_offset + offset_compare_length; + let entry = activate_points.entry(activate_length).or_default(); + entry.push(seq_index); + + if activate_length > max_activate { + max_activate = activate_length; + } + } else { + initially_active += 1; + } + } + + if initially_active == 0 { + bail!("Must have at least 
one initial offset of None to see the consensus."); + } + + let max_queue_size = self.config.max_queue_size; + let max_capacity_per_size = self.config.max_capacity_per_size; + let initial_size = self.sequences.iter().map(|s| s.len()).max().unwrap(); + let mut pqueue_tracker = PQueueTracker::with_capacity(initial_size, max_capacity_per_size); + + let initial_node = ConsensusNode::new_root_node(&offsets, self.config.wildcard, self.config.allow_early_termination)?; + let initial_priority = initial_node.priority(self.consensus_cost()); + + // start the priority queue, which defaults to bigger is better so we need a Reverse since want smaller costs + let mut pqueue: PriorityQueue = PriorityQueue::new(); + pqueue_tracker.insert(initial_node.consensus().len()); + pqueue.push(initial_node, initial_priority); + + let mut ret = vec![]; + + // the way this will work is that we will eventually find one or more answers and anything worse will get drained off until no possibilities remain + while !pqueue.is_empty() { + // this just tracks how large our actual queue gets + peak_queue_size = peak_queue_size.max(pqueue.len()); + + // first, check if we need to restrict our pqueue threshold + while (pqueue_tracker.len() > max_queue_size || last_constraint >= self.config.max_nodes_wo_constraint) && pqueue_tracker.threshold() < farthest_consensus { + pqueue_tracker.increment_threshold(); + last_constraint = 0; + } + + // get the top node and check if it's bad + let (top_node, top_priority) = pqueue.pop().unwrap(); + let top_len = top_node.consensus().len(); + pqueue_tracker.remove(top_len); + + trace!("Pop: {:?} => {:?}", top_priority, top_node.consensus); + + if top_priority.0.0 > maximum_error || top_priority.1 < pqueue_tracker.threshold() || pqueue_tracker.at_capacity(top_len) { + trace!("\tIgnored"); + nodes_ignored += 1; + continue; + } + + // mark this one as explored and updated farthest if necessary + farthest_consensus = farthest_consensus.max(top_len); + nodes_explored += 1; 
+ last_constraint += 1; + pqueue_tracker.process(top_len)?; + + // now check if this node has reached the end + if top_node.reached_end(&self.sequences, self.config.allow_early_termination) { + // this node *might* be done, we need to finalize it to be sure + // we also need to do it in a clone since it might not be finalized and we need to keep extending + let mut finalized_node = top_node.clone(); + finalized_node.finalize(&self.sequences)?; + + // get the new finalized score, it may have changed + let finalized_score = finalized_node.total_cost(self.config.consensus_cost); + + // check first if it's stricly BETTER than anything so far + if finalized_score < maximum_error { + // this score is better than anything we have seen so far, clear out previous results if we have any + maximum_error = finalized_score; + ret.clear(); + } + + // now check if it's as good as anything so far; the above check will not nullify this one + if finalized_score <= maximum_error && ret.len() < self.config.max_return_size { + // this consensus is at least as good as the best, so add it as a result + ret.push(Consensus::new( + finalized_node.consensus().to_vec(), + self.config.consensus_cost, + finalized_node.costs(self.config.consensus_cost) + )); + } + } + + // this fetches the list of options according to the WFA so far + // NOTE: this CAN include the wildcard, but only if the wildcard is the only character + let extension_candidates = top_node.get_extension_candidates(&self.sequences, self.config.wildcard); + let max_observed = extension_candidates.values().cloned().max_by(|a, b| a.total_cmp(b)) + // if no observation, then just use min count + .unwrap_or(self.config.min_count as f64); + + // the active threshold is the minimum of 1) the configured minimum count OR 2) the highest count we observed + let active_threshold = (self.config.min_count as f64).min(max_observed); + trace!("\tcandidates = {:?}", extension_candidates); + + // pull out the full list of viable candidates + let 
passing_candidates: Vec = extension_candidates.into_iter() + .filter_map(|(symbol, count)| { + if count >= active_threshold { + Some(symbol) + } else { + None + } + }).collect(); + + let mut new_nodes = vec![]; + if passing_candidates.is_empty() { + if top_len < max_activate { + bail!("Encountered coverage gap: consensus is length {} with no candidates, but sequences activate at {}", top_len, max_activate); + } else { + // no extensions remain, should just happen at the end + } + } else if passing_candidates.len() == 1 { + // we have only one extension, we can extend in place without cloning + let mut new_node = top_node; + new_node.push(&self.sequences, passing_candidates[0])?; + new_nodes.push(new_node); + } else { + // we have 2+ viable, we have to clone them + for symbol in passing_candidates.into_iter() { + let mut new_node = top_node.clone(); + new_node.push(&self.sequences, symbol)?; + new_nodes.push(new_node); + } + } + + // for each need node, do any activations and then add it to the queue + for mut new_node in new_nodes.into_iter() { + // check if we need to activate any strings + let opt_activate_list = activate_points.get(&new_node.consensus().len()); + if let Some(activate_list) = opt_activate_list { + assert!(!activate_list.is_empty()); + for &seq_index in activate_list.iter() { + new_node.activate_sequence(self.sequences[seq_index], seq_index, offset_window, offset_compare_length, self.config.wildcard, self.config.allow_early_termination)?; + } + } + + // get the new cost and put it in the queue + let new_priority = new_node.priority(self.consensus_cost()); + trace!("\tPush {:?} => {:?}", new_priority, new_node.consensus); + pqueue_tracker.insert(new_node.consensus().len()); + pqueue.push(new_node, new_priority); + } + } + + assert_eq!(pqueue_tracker.len(), 0); + + // sort these by the sequence so we always have a fixed order + ret.sort_by(|c1, c2| c1.sequence().cmp(c2.sequence())); + + debug!("nodes_explored: {nodes_explored}"); + 
debug!("nodes_ignored: {nodes_ignored}"); + debug!("peak_queue_size: {peak_queue_size}"); + Ok(ret) + } + + // getters + pub fn sequences(&self) -> &[&'a [u8]] { + &self.sequences + } + + pub fn consensus_cost(&self) -> ConsensusCost { + self.config.consensus_cost + } + + pub fn alphabet(&self) -> &HashSet { + &self.alphabet + } +} + +/// Wrapper for a node containing a partial consensus as well as the DWFA tracking for that node +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +struct ConsensusNode { + /// The consensus sequence so far + consensus: Vec, + /// The DWFAs that are tracked for each sequence; these are only None if they have not been activated yet due to an offset + dwfas: Vec> +} + +impl ConsensusNode { + /// Constructor for a new consensus search root node + /// # Arguments + /// * `offsets` - a set of offsets into the sequences where the approximate starts are + /// * `wildcard` - an optional wildcard symbol that will match anything + /// * `allow_early_termination` - if true, then it will allow the consensus to go beyond the provided baseline sequences without penalty + /// # Errors + /// * if DWFA construction fails + fn new_root_node(offsets: &[Option], wildcard: Option, allow_early_termination: bool) -> Result> { + let dwfas: Vec> = offsets.iter() + .map(|offset| match offset { + // we have an offset, so do not create a map + Some(_o) => None, + // we don't have an offset, so this is active from the start + None => Some(DWFALite::new(wildcard, allow_early_termination)) + }) + .collect(); + + let active_count = dwfas.iter().filter(|d| d.is_some()).count(); + if active_count == 0 { + bail!("Root consensus node has no initially active sequences."); + } + + Ok(ConsensusNode { + consensus: vec![], + dwfas + }) + } + + /// This will activate the DWFAs for a particular sequence. 
+ /// # Arguments + /// * `sequence` - the sequence that is getting activated + /// * `seq_index` - the sequence index, which will map to the DWFAs + /// * `offset_window` - the window we are searching for a best match + /// * `offset_compare_length` - the amount of bases we are comparing + /// * `wildcard` - optional wildcard for scoring, passed into DWFA + /// * `allow_early_termination` - enables sequences to end partway through the consensus, passed into DWFA + fn activate_sequence(&mut self, sequence: &[u8], seq_index: usize, offset_window: usize, offset_compare_length: usize, wildcard: Option, allow_early_termination: bool) -> Result<(), Box> { + // make sure everything is currently inactive + assert!(self.dwfas[seq_index].is_none()); + + // build up the search space + let con_len = self.consensus.len(); + + // figure out how far back we search + let start_delta = offset_window + offset_compare_length; // we search from current back to the offset_window + let start_position = con_len.saturating_sub(start_delta); + let end_position = con_len.saturating_sub(offset_compare_length); + + // figure out which offset has the best score; assume the middle of the offset window is the best + let mut best_offset = con_len.saturating_sub(offset_compare_length + offset_window / 2); + let mut min_ed = crate::sequence_alignment::wfa_ed_config(&self.consensus[best_offset..], &sequence[0..offset_compare_length], false, wildcard); + + // now check all the rest around this position + for p in start_position..end_position { + let ed = crate::sequence_alignment::wfa_ed_config(&self.consensus[p..], &sequence[0..offset_compare_length], false, wildcard); + if ed < min_ed { + min_ed = ed; + best_offset = p; + } + } + + // now set up the DWFA with the best offset + let mut new_dwfa = DWFALite::new(wildcard, allow_early_termination); + new_dwfa.set_offset(best_offset); + new_dwfa.update(sequence, &self.consensus)?; + self.dwfas[seq_index] = Some(new_dwfa); + + Ok(()) + } + + /// Adds a 
new symbol to the DWFAs + /// # Arguments + /// * `symbol` - the symbol to extend each DWFA with + /// # Errors + /// * if any of the extensions fail + fn push(&mut self, baseline_sequences: &[&[u8]], symbol: u8) -> Result<(), Box> { + self.consensus.push(symbol); + for (&baseline, opt_dwfa) in baseline_sequences.iter().zip(self.dwfas.iter_mut()) { + if let Some(dwfa) = opt_dwfa.as_mut() { + dwfa.update(baseline, &self.consensus)?; + } + } + Ok(()) + } + + /// Finalizes all of the contained DWFAs + /// # Errors + /// * if any of the individual finalizes fail + fn finalize(&mut self, baseline_sequences: &[&[u8]]) -> Result<(), Box> { + for (&baseline, opt_dwfa) in baseline_sequences.iter().zip(self.dwfas.iter_mut()) { + if let Some(dwfa) = opt_dwfa.as_mut() { + dwfa.finalize(baseline, &self.consensus)?; + } else { + bail!("Finalize called on DWFA that was never initialized."); + } + } + Ok(()) + } + + /// Returns a vec of all of the scores for this node + fn costs(&self, consensus_cost: ConsensusCost) -> Vec { + self.dwfas.iter() + .map(|opt_d| { + if let Some(d) = opt_d.as_ref() { + match consensus_cost { + ConsensusCost::L1Distance => d.edit_distance(), + ConsensusCost::L2Distance => d.edit_distance().pow(2) + } + } else { + // it's not initialized, here we will just report 0 + 0 + } + }) + .collect() + } + + /// Returns the total score for the node + fn total_cost(&self, consensus_cost: ConsensusCost) -> usize { + self.costs(consensus_cost).iter().sum() + } + + /// Returns the node priority. + /// Currently, this is based on 1) lowest cost and 2) consensus length. 
    /// # Arguments
    /// * `consensus_cost` - cost model to evaluate the cost
    fn priority(&self, consensus_cost: ConsensusCost) -> NodePriority {
        // Reverse makes a lower cost sort higher in the max-priority queue;
        // ties on cost are broken by preferring the longer consensus
        (
            Reverse(self.total_cost(consensus_cost)),
            self.consensus.len()
        )
    }

    /// Returns true if the dwfas are at the end of their respective baseline sequences:
    /// any one of them when `require_all` is false, or every one of them when it is true.
    /// DWFAs that were never activated count as not at their end.
    /// # Arguments
    /// * `baseline_sequences` - the sequences that are fixed that we want the consensus of
    /// * `require_all` - if true, requires all of the sequences to be at their end (this is useful for allow_early_termination)
    fn reached_end(&self, baseline_sequences: &[&[u8]], require_all: bool) -> bool {
        let mut end_iter = baseline_sequences.iter().zip(self.dwfas.iter())
            .map(|(&baseline, opt_dwfa)| {
                if let Some(dwfa) = opt_dwfa.as_ref() {
                    dwfa.reached_baseline_end(baseline)
                } else {
                    false
                }
            });

        // same iterator, different check
        if require_all {
            end_iter.all(|b| b)
        } else {
            end_iter.any(|b| b)
        }
    }

    /// Returns a hashmap of the extension candidates with their votes.
    /// The votes are fractional if a sequence has multiple equally likely options.
+ /// Note that if + /// # Arguments + /// * `baseline_sequences` - the sequences that are fixed that we want the consensus of + /// * `wildcard` - an optional wildcard character, will be removed from return set unless it is the only value in it + fn get_extension_candidates(&self, baseline_sequences: &[&[u8]], wildcard: Option) -> HashMap { + let mut candidates: HashMap = Default::default(); + for (baseline_seq, opt_dwfa) in baseline_sequences.iter().zip(self.dwfas.iter()) { + if let Some(dwfa) = opt_dwfa.as_ref() { + // get the candidates and the total observation weight + let cand = dwfa.get_extension_candidates(baseline_seq, &self.consensus); + let vote_split = cand.values().sum::() as f64; + + // iterate over each candidate and scale it by the occurrences count / total weight + for (&c, &occ) in cand.iter() { + let entry = candidates.entry(c).or_insert(0.0); + *entry += occ as f64 / vote_split; + } + } + } + + if let Some(wc) = wildcard { + if candidates.len() > 1 { + // we have 2+ candidates, we can safely remove the wildcard and have something left over + candidates.remove(&wc); + } + } + + candidates + } + + // getters + fn consensus(&self) -> &[u8] { + &self.consensus + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::cdwfa_config::CdwfaConfigBuilder; + + #[test] + fn test_single_sequence() { + let sequence = b"ACGTACGTACGT"; + let mut consensus_dwfa = ConsensusDWFA::default(); + consensus_dwfa.add_sequence(sequence).unwrap(); + + // make sure our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // now check that the consensus is the same as our sequence + let consensus = consensus_dwfa.consensus().unwrap(); + assert_eq!(consensus, vec![Consensus { + sequence: sequence.to_vec(), + consensus_cost: ConsensusCost::L1Distance, + scores: vec![0] + }]); + } + + #[test] + fn test_dual_sequence() { + let sequence = b"ACGTACGTACGT"; + let sequence2 = b"ACGTACCTACGT"; + let mut consensus_dwfa = 
ConsensusDWFA::default(); + consensus_dwfa.add_sequence(sequence).unwrap(); + consensus_dwfa.add_sequence(sequence2).unwrap(); + + // make sure our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // now check that the consensus is a tie between the two we put it, each 1 ED total away + let consensus = consensus_dwfa.consensus().unwrap(); + + // sequence2 is alphabetically before sequence 1, so it will come first + assert_eq!(consensus, vec![ + Consensus { + sequence: sequence2.to_vec(), + consensus_cost: ConsensusCost::L1Distance, + scores: vec![1, 0] + }, + Consensus { + sequence: sequence.to_vec(), + consensus_cost: ConsensusCost::L1Distance, + scores: vec![0, 1] + }, + ]); + } + + #[test] + fn test_trio_sequence() { + let sequence = b"ACGTACGTACGT"; + let sequence2 = b"ACGTACCTACGT"; + let mut consensus_dwfa = ConsensusDWFA::default(); + + // add the first sequence twice, it will be the consensus + consensus_dwfa.add_sequence(sequence).unwrap(); + consensus_dwfa.add_sequence(sequence).unwrap(); + consensus_dwfa.add_sequence(sequence2).unwrap(); + + // make sure our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // now check that the consensus found sequence + let consensus = consensus_dwfa.consensus().unwrap(); + + // sequence2 is alphabetically before sequence 1, so it will come first + assert_eq!(consensus, vec![ + Consensus { + sequence: sequence.to_vec(), + consensus_cost: ConsensusCost::L1Distance, + scores: vec![0, 0, 1] + } + ]); + } + + #[test] + fn test_complicated() { + let expected_consensus = b"ACGTACGTACGT"; + let sequences = [ + // b"ACGTACG-TACGT" + // b"AC-TACGGTACGT" + b"ACTACGGTACGT", + // b"ACGTAAG-TCCGT" + b"ACGTAAGTCCGT", + // b"AAGTACG-TACGT" + b"AAGTACGTACGT" + ]; + + // build a consensus over all inputs + let mut consensus_dwfa = ConsensusDWFA::default(); + for &sequence in sequences.iter() { + consensus_dwfa.add_sequence(sequence).unwrap(); + } + + // make sure 
our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // now check that the consensus found the right thing and only that + let consensus = consensus_dwfa.consensus().unwrap(); + assert_eq!(consensus.len(), 1); + assert_eq!(expected_consensus, consensus[0].sequence()); + } + + #[test] + fn test_wildcards() { + let expected_consensus = b"ACGTACGTACGT"; + let sequences = [ + // C insertion and tail missing + b"ACGTACCGT****".to_vec(), + // C>T and head+tail missing + b"**GTATGTAC**".to_vec(), + // just head missing + b"****ACGTACGT".to_vec() + ]; + + // build a consensus over all inputs + let mut consensus_dwfa = ConsensusDWFA::with_config( + CdwfaConfigBuilder::default() + .wildcard(Some(b'*')) + .build().unwrap() + ).unwrap(); + for sequence in sequences.iter() { + consensus_dwfa.add_sequence(sequence).unwrap(); + } + + // make sure our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // now check that the consensus found the right thing and only that + let consensus = consensus_dwfa.consensus().unwrap(); + assert_eq!(consensus.len(), 1); + assert_eq!(expected_consensus, consensus[0].sequence()); + assert_eq!(&[1, 1, 0], consensus[0].scores()); + } + + #[test] + fn test_all_wildcards() { + let _expected_consensus = b"ACGTACGTACGT"; + let actual_consensus = b"*CGTACG*ACG*"; + let sequences = [ + // 1 ins + b"*CGTAACG*ACG*".to_vec(), + // nothing other than * + b"*CGTACG*ACG*".to_vec(), + // C>T + b"*CGTACG*ATG*".to_vec() + ]; + + // build a consensus over all inputs + let mut consensus_dwfa = ConsensusDWFA::with_config( + CdwfaConfigBuilder::default() + .wildcard(Some(b'*')) + .build().unwrap() + ).unwrap(); + for sequence in sequences.iter() { + consensus_dwfa.add_sequence(sequence).unwrap(); + } + + // make sure our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // now check that the consensus found the right thing and only that + let consensus = 
consensus_dwfa.consensus().unwrap(); + assert_eq!(consensus.len(), 1); + assert_eq!(actual_consensus, consensus[0].sequence()); + assert_eq!(&[1, 0, 1], consensus[0].scores()); + } + + #[test] + fn test_allow_early_termination_costs() { + let expected_consensus = b"ACGT"; + + // first verify baseline is not generating the full thing due to error + let mut consensus_dwfa = ConsensusDWFA::with_config( + CdwfaConfigBuilder::default() + .wildcard(Some(b'*')) + .build().unwrap() + ).unwrap(); + for i in 1..(expected_consensus.len()+1) { + consensus_dwfa.add_sequence(&expected_consensus[0..i]).unwrap(); + } + // this first approach generated multiple possible ones that are in the middle + let consensus = consensus_dwfa.consensus().unwrap(); + assert_eq!(consensus, [ + Consensus { sequence: vec![65, 67], consensus_cost: ConsensusCost::L1Distance, scores: vec![1, 0, 1, 2] }, + Consensus { sequence: vec![65, 67, 71], consensus_cost: ConsensusCost::L1Distance, scores: vec![2, 1, 0, 1] } + ]); + + // second, verify that allowing early termination fixes it to the original result + let mut consensus_dwfa = ConsensusDWFA::with_config( + CdwfaConfigBuilder::default() + .wildcard(Some(b'*')) + .allow_early_termination(true) + .build().unwrap() + ).unwrap(); + for i in 1..(expected_consensus.len()+1) { + consensus_dwfa.add_sequence(&expected_consensus[0..i]).unwrap(); + } + // this first approach generated multiple possible ones that are in the middle + let consensus = consensus_dwfa.consensus().unwrap(); + assert_eq!(consensus, [ + Consensus { sequence: expected_consensus.to_vec(), consensus_cost: ConsensusCost::L1Distance, scores: vec![0; 4] }, + ]); + } + + #[test] + fn test_offset_windows() { + // this test is _only_ for verifying that our offsets are getting correctly adjust to match + let expected_consensus = b"ACGTACGTACGTACGT"; + let sequences = [ + // no offset + b"ACGTACGTACGTACGT".to_vec(), + // offset 4 + b"ACGTACGTACGT".to_vec(), + // offset 6 + b"GTACGTACGT".to_vec(), 
+ ]; + + // make one offset exactly match and another be right on our boundary (1-shift here) + let offsets = [None, Some(4), Some(7)]; + + // build a consensus over all inputs + let mut consensus_dwfa = ConsensusDWFA::with_config( + CdwfaConfigBuilder::default() + .offset_window(1) // both are exactly one off + .offset_compare_length(4) // semi-arbitrary, just needs to be short here given our seq-len + .build().unwrap() + ).unwrap(); + for (sequence, offset) in sequences.iter().zip(offsets.iter()) { + consensus_dwfa.add_sequence_offset(sequence, offset.clone()).unwrap(); + } + + // make sure our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // now check that the consensus found the right thing and only that + let consensus = consensus_dwfa.consensus().unwrap(); + assert_eq!(consensus.len(), 1); + assert_eq!(expected_consensus, consensus[0].sequence()); + assert_eq!(&[0, 0, 0], consensus[0].scores()); + } + + #[test] + fn test_offset_gap_err() { + // this test is _only_ for verifying that our offsets are getting correctly adjust to match + let sequences = [ + // no offset + b"ACGTACGTACGTACGT".to_vec(), + // massive offset + b"ACGTACGTACGTACGT".to_vec(), + ]; + + // one starts and one is way out there + let offsets = [None, Some(1000)]; + + // build a consensus over all inputs + let mut consensus_dwfa = ConsensusDWFA::with_config( + CdwfaConfigBuilder::default() + .offset_window(1) // both are exactly one off + .offset_compare_length(4) // semi-arbitrary, just needs to be short here given our seq-len + .build().unwrap() + ).unwrap(); + for (sequence, offset) in sequences.iter().zip(offsets.iter()) { + consensus_dwfa.add_sequence_offset(sequence, offset.clone()).unwrap(); + } + + // make sure our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // now check that the consensus found the right thing and only that + let consensus_err = consensus_dwfa.consensus(); + 
assert!(consensus_err.is_err()); + let error = consensus_err.err().unwrap(); + assert_eq!(error.to_string(), "Finalize called on DWFA that was never initialized."); + } +} \ No newline at end of file diff --git a/src/dual_consensus.rs b/src/dual_consensus.rs new file mode 100644 index 0000000..fd4bc50 --- /dev/null +++ b/src/dual_consensus.rs @@ -0,0 +1,2048 @@ + +/*! +This module provides access to the DualConsensusDWFA, which generates the two best consensuses for a set of sequences. +Note that it can also generate a single consensus if no splits are identified. + +# Example usage +```rust +use waffle_con::consensus::Consensus; +use waffle_con::dual_consensus::DualConsensusDWFA; +use waffle_con::cdwfa_config::ConsensusCost; + +let sequences = [ + b"TCCGT".to_vec(), + b"ACCGT".to_vec(), // consensus 1 + b"ACCGT".to_vec(), // consensus 1 + b"ACCAT".to_vec(), + b"CCGTAAT".to_vec(), + b"CGTAAAT".to_vec(), + b"CGTAAT".to_vec(), // consensus 2 + b"CGTAAT".to_vec(), // consensus 2 +]; + +// add all the sequences +let mut cdwfa: DualConsensusDWFA = Default::default(); +for s in sequences.iter() { + cdwfa.add_sequence(s).unwrap(); +} + +// run consensus and check the results +let consensuses = cdwfa.consensus().unwrap(); +assert_eq!(consensuses.len(), 1); +assert_eq!(consensuses[0].consensus1(), &Consensus::new(sequences[1].clone(), ConsensusCost::L1Distance, vec![1, 0, 0, 1])); +assert_eq!(consensuses[0].consensus2().unwrap(), &Consensus::new(sequences[6].clone(), ConsensusCost::L1Distance, vec![1, 1, 0, 0])); +assert_eq!(consensuses[0].is_consensus1(), &[true, true, true, true, false, false, false, false]); +``` +*/ + +use log::{debug, trace, warn}; +use priority_queue::PriorityQueue; +use rustc_hash::{FxHashMap as HashMap, FxHashSet as HashSet}; +use simple_error::{bail, SimpleError}; +use std::cmp::Reverse; + +use crate::cdwfa_config::{CdwfaConfig, ConsensusCost}; +use crate::consensus::Consensus; +use crate::dynamic_wfa::DWFALite; +use 
crate::pqueue_tracker::PQueueTracker; + +type NodePriority = (Reverse, usize); + +/// Contains a final multi-consensus result +#[derive(Debug)] +pub struct DualConsensus { + /// The first consensus + consensus1: Consensus, + /// The second consensus - this one can be optional when only one consensus is identified + consensus2: Option, + /// same length as input sequences; for each entry, if True, then the corresponding input sequence matches consensus #1; otherwise, it matches #2 + is_consensus1: Vec, + /// The scores when compared to consensus1, will be None if it stopped tracking + scores1: Vec>, + /// The scores when compared to consensus2, will be None if it stopped tracking + scores2: Vec> +} + +impl PartialEq for DualConsensus { + fn eq(&self, other: &Self) -> bool { + self.consensus1 == other.consensus1 && + self.consensus2 == other.consensus2 && + self.is_consensus1 == other.is_consensus1 + + // for now, we do not care about these in the impl + //&& self.scores1 == other.scores1 && self.scores2 == other.scores2 + } +} + +impl DualConsensus { + /// Generic constructor for outside usage. This does not force consensus order like `from_node(...)`. + pub fn new( + consensus1: Consensus, consensus2: Option, + is_consensus1: Vec, scores1: Vec>, scores2: Vec> + ) -> Result { + + if is_consensus1.len() != scores1.len() || is_consensus1.len() != scores2.len() { + bail!("is_consensus1, scores1, and scores2 must all be the same length"); + } + + Ok(Self { + consensus1, + consensus2, + is_consensus1, + scores1, + scores2, + }) + } + + /// Constructor from a DualConsensusNode. + /// Of note, this will re-order dual alleles alpha-numerically, allowing for predictable outputs. 
+ /// # Arguments + /// * `finalized_node` - the node that we are converting to a consensus + /// * `consensus_cost` - the cost model that gets propated + fn from_node(finalized_node: &DualConsensusNode, consensus_cost: ConsensusCost) -> DualConsensus { + // figure out the best consensus and score for each read; note that this assumes no ties + let (best_consensus_index, best_consensus_score) = finalized_node.costs(consensus_cost); + + // check if we need to swap the order + let swap_order = finalized_node.is_dual && (finalized_node.consensus2 < finalized_node.consensus1); + + // now reformat the above information such that we can build out the multi-consensus return values + let mut is_consensus1: Vec = Default::default(); + let mut consensus_scores: Vec> = vec![vec![]; 2]; + for (best_con_index, best_con_score) in best_consensus_index.into_iter() + .zip(best_consensus_score.into_iter()) { + // these MUST be equal length + // store that this sequence index matches the particular consensus index + assert!(best_con_index <= 1); + // toggle the consensus assignment if we swap_order + is_consensus1.push((best_con_index == 0) ^ swap_order); + // also store this score for this consensus index + consensus_scores[best_con_index].push(best_con_score); + } + + // now we can store the consensus sequences as well as the corresponding indices in the final output + let c1 = Consensus::new(finalized_node.consensus1.clone(), consensus_cost, consensus_scores[0].clone()); + let c2 = Consensus::new(finalized_node.consensus2.clone(), consensus_cost, consensus_scores[1].clone()); + + // reformat the actual consensus assignments based on swappage, and build result + let (consensus1, consensus2) = if swap_order { + assert!(finalized_node.is_dual); + (c2, Some(c1)) + } else { + (c1, if finalized_node.is_dual { Some(c2) } else { None }) + }; + + // now save the scores also + let (s1, s2) = finalized_node.full_cost(consensus_cost); + let (scores1, scores2) = if swap_order { + (s2, s1) + } 
else { + (s1, s2) + }; + + DualConsensus { + consensus1, + consensus2, + is_consensus1, + scores1, + scores2 + } + } + + // Returns true if this is a dual consensus result + pub fn is_dual(&self) -> bool { + self.consensus2.is_some() + } + + // Getters + pub fn consensus1(&self) -> &Consensus { + &self.consensus1 + } + + pub fn consensus2(&self) -> Option<&Consensus> { + self.consensus2.as_ref() + } + + pub fn is_consensus1(&self) -> &[bool] { + &self.is_consensus1 + } + + pub fn scores1(&self) -> &[Option] { + &self.scores1 + } + + pub fn scores2(&self) -> &[Option] { + &self.scores2 + } +} + +/// Core utility that will generate a consensus sequence using the DWFA approach. +/// For now, it will assume that all sequences are full length representations +#[derive(Debug, Default)] +pub struct DualConsensusDWFA<'a> { + /// Contains all the sequences that have been added to this consensus so far. + sequences: Vec<&'a [u8]>, + /// Approximate offsets into the consensus that sequence starts, this will get adjust at run-time. If None, then assume start. + offsets: Vec>, + /// The config for this consensus run + config: CdwfaConfig, + /// The alphabet we will use for consensus building + alphabet: HashSet +} + +impl<'a> DualConsensusDWFA<'a> { + /// Creates a new instance of ConsensusDWFA with the specified config. + /// # Arguments + /// * `config` - the type of consensus score we want to use + /// # Errors + /// * None so far + pub fn with_config(config: CdwfaConfig) -> Result, Box> { + Ok(DualConsensusDWFA { + config, + ..Default::default() + }) + } + + /// Adds a new sequence to the list with no offset. + /// # Arguments + /// * `sequence` - the new sequence to add + /// # Errors + /// * None so far + pub fn add_sequence(&mut self, sequence: &'a [u8]) -> Result<(), Box> { + self.add_sequence_offset(sequence, None) + } + + /// Adds a new sequence to the list with no offset. 
+ /// # Arguments + /// * `sequence` - the new sequence to add + /// * `last_offset` - an optional offset into the consensus that is the last allowed offset for the sequence to start + /// # Errors + /// * None so far + pub fn add_sequence_offset(&mut self, sequence: &'a [u8], last_offset: Option) -> Result<(), Box> { + // add ever character to our alphabet + self.alphabet.extend(sequence.iter().cloned()); + + // make sure we didn't add the wildcard by accident + // note: it's easier to just remove it once here than do something fancy with the above iteration + if let Some(wc) = self.config.wildcard { + self.alphabet.remove(&wc); + } + + // now add the sequence + self.sequences.push(sequence); + self.offsets.push(last_offset); + Ok(()) + } + + // The core function that gets called after adding all the sequences we care about + /// # Errors + /// * if the DWFA throws errors + pub fn consensus(&self) -> Result, Box> { + // initialize everything + let mut maximum_error = usize::MAX; + let mut nodes_explored: usize = 0; + let mut nodes_ignored: usize = 0; + let mut peak_queue_size: usize = 0; + let mut farthest_consensus_single: usize = 0; + let mut farthest_consensus_dual: usize = 0; + let mut single_last_constraint: u64 = 0; + let mut dual_last_constraint: u64 = 0; + + let offset_window = self.config.offset_window; // the window around the provided offset that we check + let offset_compare_length = self.config.offset_compare_length; // the number of bases we check to figure out the real start + + let offsets: Vec> = if self.config.auto_shift_offsets { + // first figure out if we need to address the offsets for this re-run of consensus + let mut min_offset = usize::MAX; + let mut start_sequence_found = false; + for offset in self.offsets.iter() { + match offset { + Some(o) => min_offset = min_offset.min(*o), + None => start_sequence_found = true + }; + } + + if !start_sequence_found { + debug!("No start sequence detected, shifting all offsets by {min_offset}"); + 
self.offsets.iter() + .map(|o| { + let value = o.unwrap(); + if value == min_offset { + // equal to minimum, flag as no offset + None + } else { + // past minimum, so down-shift it + Some(value - min_offset) + } + }) + .collect::>>() + } else { + self.offsets.clone() + } + } else { + self.offsets.clone() + }; + + debug!("Offsets: {:?}", offsets); + + // build up the list of sizes where we need to activate one or more sequences + let mut activate_points: HashMap> = Default::default(); + let mut initially_active: usize = 0; + for (seq_index, offset) in offsets.iter().enumerate() { + if let Some(last_offset) = offset { + // we have an approx, calculate when it can get activated and add it to our list + let activate_length = last_offset + offset_compare_length; + let entry = activate_points.entry(activate_length).or_default(); + entry.push(seq_index); + } else { + initially_active += 1; + } + } + + if initially_active == 0 { + bail!("Must have at least one initial offset of None to see the consensus."); + } + + // we now track singletons and dual nodes separately so we can guarantee that we return _something_ even if final dual nodes are imbalanced + let max_queue_size = self.config.max_queue_size; + let max_capacity_per_size = self.config.max_capacity_per_size; + let initial_size = self.sequences.iter().map(|s| s.len()).max().unwrap(); + let mut single_tracker = PQueueTracker::with_capacity(initial_size, max_capacity_per_size); + let mut dual_tracker = PQueueTracker::with_capacity(initial_size, max_capacity_per_size); + + let initial_node = DualConsensusNode::new_root_node(&offsets, self.config.wildcard, self.config.allow_early_termination)?; + let initial_priority = initial_node.priority(self.consensus_cost()); + + // start the priority queue, which defaults to bigger is better so we need a Reverse since want smaller costs + let mut pqueue: PriorityQueue = PriorityQueue::new(); + single_tracker.insert(initial_node.max_consensus_length()); + pqueue.push(initial_node, 
initial_priority); + + let mut ret: Vec = vec![]; + let mut last_indiv = 0; + + // we need to do min_count based on the number of *active* sequences when we have really high coverage + // dynamic min count which is the max of either the raw minimum OR the number of sequences * min_af + let full_min_count = self.config.min_count.max( + (self.config.min_af * self.sequences.len() as f64).ceil() as u64 + ); + let mut total_active_count = vec![ + initially_active + ]; + let mut active_min_count = vec![ + self.config.min_count.max( + (self.config.min_af * initially_active as f64).ceil() as u64 + ) + ]; + + // the way this will work is that we will eventually find one or more answers and anything worse will get drained off until no possibilities remain + let mut iteration = 0; + while !pqueue.is_empty() { + // this just tracks how large our actual queue gets + peak_queue_size = peak_queue_size.max(pqueue.len()); + + // first, check if we need to restrict our pqueue threshold + while (single_tracker.len() > max_queue_size || single_last_constraint >= self.config.max_nodes_wo_constraint) && single_tracker.threshold() < farthest_consensus_single { + single_tracker.increment_threshold(); + single_last_constraint = 0; + } + while (dual_tracker.len() > max_queue_size || dual_last_constraint >= self.config.max_nodes_wo_constraint) && dual_tracker.threshold() < farthest_consensus_dual { + dual_tracker.increment_threshold(); + dual_last_constraint = 0; + } + + // get the top node and check if it's bad + let (top_node, top_cost) = pqueue.pop().unwrap(); + let top_len = top_node.max_consensus_length(); + + let (threshold_cutoff, at_capacity) = if top_node.is_dual { + trace!( + "{} -> \"{}\" + \"{}\", {:?}", + top_cost.0.0, + top_node.consensus1.len(), + top_node.consensus2.len(), + top_node.costs(self.config.consensus_cost) + ); + dual_tracker.remove(top_len); + (dual_tracker.threshold(), dual_tracker.at_capacity(top_len)) + } else { + trace!("{} -> \"{}\"", top_cost.0.0, 
top_node.consensus1.len()); + single_tracker.remove(top_len); + (single_tracker.threshold(), single_tracker.at_capacity(top_len)) + }; + + // check if this node is worse than the best full solution OR + if top_cost.0.0 > maximum_error || + // if it's shorter than the pqueue threshold OR + top_len < threshold_cutoff || + // if we have reached process capacity OR + at_capacity || + // if it's a dual node that is no longer containing enough on each group + top_node.is_dual_imbalanced(active_min_count[top_len] as usize) { + nodes_ignored += 1; + trace!("\tignored {} || {} || {}", top_cost.0.0 > maximum_error, top_len < threshold_cutoff, top_node.is_dual_imbalanced(active_min_count[top_node.max_consensus_length()] as usize)); + continue; + } + + // mark this one as explored and updated farthest if necessary + if top_node.is_dual { + farthest_consensus_dual = farthest_consensus_dual.max(top_len); + dual_last_constraint += 1; + dual_tracker.process(top_len)?; + } else { + farthest_consensus_single = farthest_consensus_single.max(top_len); + single_last_constraint += 1; + single_tracker.process(top_len)?; + } + nodes_explored += 1; + + if !top_node.is_dual { + last_indiv = last_indiv.max(top_len); + } + + if iteration % 1000 == 0 { + trace!("i: {}, t: {}, s: {}, d: {}", iteration, pqueue.len(), single_tracker.len(), dual_tracker.len()); + trace!( + "Handling: cost={}, dual={}, h1_len={}, h2_len={}; s_t: {}, d_t: {}", + top_node.total_cost(self.config.consensus_cost), + top_node.is_dual, + top_node.consensus1.len(), + top_node.consensus2.len(), + single_tracker.threshold(), + dual_tracker.threshold() + ); + } + + let l = 50.min(top_node.consensus1.len()); + if l > 0 { + trace!("\t{}..{}", + std::str::from_utf8(&top_node.consensus1[..l]).unwrap(), + std::str::from_utf8(&top_node.consensus1[(top_node.consensus1.len()-l)..]).unwrap()); + } + if top_node.is_dual { + let l = 50.min(top_node.consensus2.len()); + if l > 0 { + trace!("\t{}..{}", + 
std::str::from_utf8(&top_node.consensus2[..l]).unwrap(), + std::str::from_utf8(&top_node.consensus2[(top_node.consensus2.len()-l)..]).unwrap()); + } + } + iteration += 1; + /* + if iteration == 40000 { + panic!("max"); + } + */ + + // now check if this node has reached the end + if top_node.reached_all_end(&self.sequences, self.config.allow_early_termination) { + // this node *might* be done, we need to finalize it to be sure + // we also need to do it in a clone since it might not be finalized and we need to keep extending + let mut finalized_node = top_node.clone(); + finalized_node.finalize(&self.sequences)?; + + // check if this is a dual node, but without enough support on each allele + let imbalanced = if finalized_node.is_dual { + let (best_consensus_index, _best_consensus_score) = finalized_node.costs(self.consensus_cost()); + let counts1 = best_consensus_index.iter() + .filter(|&&v| v == 0) + .count(); + let counts2 = best_consensus_index.len() - counts1; + + // if either does not have enough support, we will ignore it + (counts1 as u64) < full_min_count || (counts2 as u64) < full_min_count + } else { + // non-dual cannot be imbalanced + false + }; + + if !imbalanced { + // get the new finalized score, it may have changed + let finalized_score = finalized_node.total_cost(self.config.consensus_cost); + + // check first if it's strictly BETTER than anything so far + if finalized_score < maximum_error { + // this score is better than anything we have seen so far, clear out previous results if we have any + maximum_error = finalized_score; + trace!("\tMaximum error set to {maximum_error}"); + ret.clear(); + } + + // now check if it's as good as anything so far; the above check will not nullify this one + if finalized_score <= maximum_error && ret.len() < self.config.max_return_size { + // this consensus is at least as good as the best, so add it as a result + let dual_con_result = DualConsensus::from_node( + &finalized_node, + self.config.consensus_cost + ); + 
trace!("\tadding to ret");//: {dual_con_result:?}"); + trace!("\tcon1: {}", std::str::from_utf8(dual_con_result.consensus1().sequence())?); + trace!("\tcon2: {}", std::str::from_utf8(dual_con_result.consensus2().unwrap().sequence())?); + ret.push(dual_con_result); + } + } else { + // this node is not balanced correctly for us to return it as a solution + // note that it may still be a candidate for extension though + trace!("Finalized node is imbalanced, ignoring."); + } + } + + // everything past this point is about extending the top_node further + + // check if we need to update the active + if active_min_count.len() == top_len+1 { + // we need to copy and update + let current_active = total_active_count[top_len]; + let new_additions = match activate_points.get(&top_len) { + Some(v) => v.len(), + None => 0 + }; + let new_total = current_active + new_additions; + total_active_count.push(new_total); + // debug!("total_active_count[{}] = {}", total_active_count.len() - 1, new_total); + + let new_min_af = self.config.min_count.max( + (self.config.min_af * new_total as f64).ceil() as u64 + ); + active_min_count.push(new_min_af); + // debug!("active_min_count[{}] = {}", total_active_count.len() - 1, new_min_af); + } + + // this fetches the list of options according to the WFA so far + // NOTE: this CAN include the wildcard, but only if the wildcard is the only character + let weighted_by_ed = self.config.weighted_by_ed; + let extension_candidates1 = top_node.get_extension_candidates(&self.sequences, self.config.wildcard, true, weighted_by_ed); + // let min_count1 = active_min_count[top_len]; + let min_count1 = self.config.min_count.max( + (self.config.min_af * extension_candidates1.values().sum::()).ceil() as u64 + ); + let max_observed1 = extension_candidates1.values().cloned().max_by(|a, b| a.total_cmp(b)) + // if no observation, then just use min count + .unwrap_or(min_count1 as f64); + + // the active threshold is the minimum of 1) the configured minimum count OR 
2) the highest count we observed + let active_threshold1 = (min_count1 as f64).min(max_observed1); + + if top_node.is_dual { + // get the second candidate set also + let extension_candidates2 = top_node.get_extension_candidates(&self.sequences, self.config.wildcard, false, weighted_by_ed); + // let min_count2 = active_min_count[top_len]; + let min_count2 = self.config.min_count.max( + (self.config.min_af * extension_candidates2.values().sum::()).ceil() as u64 + ); + let max_observed2 = extension_candidates2.values().cloned().max_by(|a, b| a.total_cmp(b)) + // if no observation, then just use min count + .unwrap_or(min_count2 as f64); + + // the active threshold is the minimum of 1) the configured minimum count OR 2) the highest count we observed + let active_threshold2 = (min_count2 as f64).min(max_observed2); + + // one thing we have to handle here is the situation where alleles are different length + // this means one might want more extensions and the other is all done + // so lets check if either allele is ready to be done + let is_con1_finalized = top_node.reached_consensus_end(&self.sequences, true, self.config.allow_early_termination); + let is_con2_finalized = top_node.reached_consensus_end(&self.sequences, false, self.config.allow_early_termination); + + trace!("\tec1: {extension_candidates1:?}, {active_threshold1}"); + trace!("\tec2: {extension_candidates2:?}, {active_threshold2}"); + + // now create adjust extension lists that can have None + let opt_ec1: Vec> = { + let mut v = vec![]; + // if consensus 1 has at least one final candidate OR the candidate list is empty OR it's already locked + // THEN we need the no-extend option None + if is_con1_finalized || extension_candidates1.is_empty() || top_node.is_con1_locked() { + v.push(None); + } + + // do not add extensions if we are sequence locked + if !top_node.is_con1_locked() { + v.extend(extension_candidates1.iter() + .filter_map(|(&symbol, &count)| { + // if this character occurs fewer than the 
threshold times, remove it from the candidate set + if count < active_threshold1 { + None + } else { + // double Some is because of filter_map + Some(Some(symbol)) + } + }) + ); + } + v + }; + + let opt_ec2: Vec> = { + let mut v = vec![]; + // if consensus 2 has at least one final candidate OR the candidate list is empty OR it's already locked + // then we need the no-extend option None + if is_con2_finalized || extension_candidates2.is_empty() || top_node.is_con2_locked() { + v.push(None); + } + + // do not add extensions if we are sequence locked + if !top_node.is_con2_locked() { + v.extend(extension_candidates2.iter().filter_map(|(&symbol, &count)| { + // if this character occurs fewer than the threshold times, remove it from the candidate set + if count < active_threshold2 { + None + } else { + // double Some is because of filter_map + Some(Some(symbol)) + } + }) + ); + } + v + }; + + // make sure we always have something in each + assert!(!opt_ec1.is_empty() && !opt_ec2.is_empty()); + + for opt_can1 in opt_ec1.iter() { + for opt_can2 in opt_ec2.iter() { + if opt_can1.is_none() && opt_can2.is_none() { + // we have to extend at least one of them or else it's the exact same node + continue; + } + + trace!("\tExtending with: {opt_can1:?} + {opt_can2:?}"); + + // pair wise add each extension; above check enforces that at least one of these `if` statements are executed + let mut new_node = top_node.clone(); + if let Some(c1) = opt_can1 { + // we have a base to add + new_node.push(&self.sequences, *c1, true)?; + } else { + // we don't, lock the sequence in place so we do not get duplicate nodes + new_node.lock_sequence(true); + } + if let Some(c2) = opt_can2 { + // we have a base to add + new_node.push(&self.sequences, *c2, false)?; + } else { + // we don't, lock the sequence in place so we do not get duplicate nodes + new_node.lock_sequence(false); + } + + // check if we need to activate any strings + let opt_activate_list = 
activate_points.get(&new_node.max_consensus_length()); + if let Some(activate_list) = opt_activate_list { + assert!(!activate_list.is_empty()); + for &seq_index in activate_list.iter() { + new_node.activate_sequence(self.sequences[seq_index], seq_index, offset_window, offset_compare_length, self.config.wildcard, self.config.allow_early_termination)?; + } + } + + // check if we can prune anything after extensions + new_node.prune_dwfa(self.config.dual_max_ed_delta)?; + + // get the new cost and put it in the queue + let new_priority = new_node.priority(self.consensus_cost()); + assert!(new_node.is_dual); + dual_tracker.insert(new_node.max_consensus_length()); // top_node is already dual + assert!(pqueue.push(new_node.clone(), new_priority).is_none()); + } + } + } else { + // this is currently a non-dual node + // first, handle the option where it stays as non-dual + trace!("\tec1: {extension_candidates1:?}, {active_threshold1}"); + for (&symbol, &count) in extension_candidates1.iter() { + if count < active_threshold1 { + // this symbol was not observed enough times, skip it to reduce overhead + continue; + } + + // clone and add the symbol + let mut new_node = top_node.clone(); + new_node.push(&self.sequences, symbol, true)?; + + // check if we need to activate any strings + let opt_activate_list = activate_points.get(&new_node.max_consensus_length()); + if let Some(activate_list) = opt_activate_list { + assert!(!activate_list.is_empty()); + for &seq_index in activate_list.iter() { + new_node.activate_sequence(self.sequences[seq_index], seq_index, offset_window, offset_compare_length, self.config.wildcard, self.config.allow_early_termination)?; + } + } + + // get the new cost and put it in the queue + let new_priority = new_node.priority(self.consensus_cost()); + assert!(!new_node.is_dual); + single_tracker.insert(new_node.max_consensus_length()); + assert!(pqueue.push(new_node.clone(), new_priority).is_none()); + } + + // now handle dual-node generation + let mut 
num_passing = 0; + let sorted_candidates = { + let mut sc: Vec<_> = extension_candidates1.iter() + .filter_map(|(&c, &count)| { + // when it comes to dual splitting nodes, we care about hitting the minimum count + if self.config.wildcard.is_some() && c == self.config.wildcard.unwrap() { + // this is the wildcard, we do not count it for the purpose of dual splitting + None + } else { + if count >= min_count1 as f64 { + num_passing += 1; + } + Some((Reverse(count), c)) + } + }) + .collect(); + sc.sort_by(|a, b| a.partial_cmp(b).unwrap()); + sc + }; + + if num_passing > 1 { + // c1 in outer loop + for (i, &(_order, c1)) in sorted_candidates.iter().enumerate() { + // c2 in inner loop, must start after c1 + for &(_order2, c2) in sorted_candidates[(i+1)..].iter() { + // sanity checks to make sure we never try to make a dual node where one extension is the wildcard + assert!(self.config.wildcard.is_none() || c1 != self.config.wildcard.unwrap()); + assert!(self.config.wildcard.is_none() || c2 != self.config.wildcard.unwrap()); + + // convert the clone into a dual node with the candidate; c1 should be the "major" candidate due to the above sort + let mut new_node = top_node.clone(); + new_node.activate_dual(&self.sequences, c1, c2)?; + + // check if we need to activate any strings + let opt_activate_list = activate_points.get(&new_node.max_consensus_length()); + if let Some(activate_list) = opt_activate_list { + assert!(!activate_list.is_empty()); + for &seq_index in activate_list.iter() { + new_node.activate_sequence(self.sequences[seq_index], seq_index, offset_window, offset_compare_length, self.config.wildcard, self.config.allow_early_termination)?; + } + } + + // now prune + new_node.prune_dwfa(self.config.dual_max_ed_delta)?; + + // get the new cost and put it in the queue + let new_priority = new_node.priority(self.consensus_cost()); + assert!(new_node.is_dual); + dual_tracker.insert(new_node.max_consensus_length()); + assert!(pqueue.push(new_node.clone(), 
new_priority).is_none()); + } + } + } + } + + // TODO: remove this eventually, it's a summation check + assert_eq!(pqueue.len(), single_tracker.unfiltered_len() + dual_tracker.unfiltered_len()); + } + + assert_eq!(single_tracker.len(), 0); + assert_eq!(dual_tracker.len(), 0); + + if ret.len() > 1 { + // sort these by the sequence so we always have a fixed order + ret.sort_by(|c1, c2| { + let empty = vec![]; + + // first consensus key + let s1 = c1.consensus1.sequence(); + let s2 = match c1.consensus2.as_ref() { + Some(s) => s.sequence(), + None => &empty + }; + let k1 = (s1, s2); + + // second consensus key + let s1 = c2.consensus1.sequence(); + let s2 = match c2.consensus2.as_ref() { + Some(s) => s.sequence(), + None => &empty + }; + let k2 = (s1, s2); + k1.cmp(&k2) + }); + } + + // TODO: we can hit this if we remove all final solution because they are imbalanced + // if we find that happens in practice, then we need to somehow distinguish dual and singleton nodes to prevent the scenario where + // we remove all non-dual nodes, but then all the dual nodes fail the final check + // assert!(!ret.is_empty()); + if ret.is_empty() { + warn!("No consensus found that reached end, is there a gap between input sequences?"); + + // TODO: how do we want to handle this long-term? 
this returns an empty string consensus + let no_offsets = vec![None; self.sequences.len()]; // we need these to get costs of 0 + let root_node = DualConsensusNode::new_root_node(&no_offsets, self.config.wildcard, self.config.allow_early_termination)?; + ret.push(DualConsensus::from_node(&root_node, self.consensus_cost())); + } + + debug!("nodes_explored: {nodes_explored}"); + debug!("nodes_ignored: {nodes_ignored}"); + debug!("peak_queue_size: {peak_queue_size}"); + debug!("last_indiv: {last_indiv}"); + + Ok(ret) + } + + // getters + pub fn sequences(&self) -> &[&'a [u8]] { + &self.sequences + } + + pub fn consensus_cost(&self) -> ConsensusCost { + self.config.consensus_cost + } + + pub fn alphabet(&self) -> &HashSet { + &self.alphabet + } +} + +/// Wrapper for a node containing a partial consensus as well as the DWFA tracking for that node +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +struct DualConsensusNode { + /// if True, then this node is tracking two consensuses + is_dual: bool, + /// if True, we are not allowed to modify consensus1 anymore + is_con1_locked: bool, + /// if True, we are not allowed to modify consensus2 anymore + is_con2_locked: bool, + /// The primary consensus sequence + consensus1: Vec, + /// The secondary consensus sequence + consensus2: Vec, + /// The set of DWFAs for consensus1; these are options because we stop tracking once the scores diverge + dwfas1: Vec>, + /// The set of DWFAs for consensus2; these are options because we stop tracking once the scores diverge + dwfas2: Vec>, +} + +impl DualConsensusNode { + /// Constructor for a new consensus search root node. + /// Note that initial it is not a dual node, it will become that when divergence is detected. 
+ /// # Arguments + /// * `offsets` - a set of offsets into the sequences where the approximate starts are + /// * `wildcard` - an optional wildcard symbol that will match anything + /// * `allow_early_termination` - if true, then it will allow the consensus to go beyond the provided baseline sequences without penalty + /// # Errors + /// * if DWFA construction fails + fn new_root_node(offsets: &[Option], wildcard: Option, allow_early_termination: bool) -> Result> { + let dwfas: Vec> = offsets.iter() + .map(|offset| match offset { + // we have an offset, so do not create a map + Some(_o) => None, + // we don't have an offset, so this is active from the start + None => Some(DWFALite::new(wildcard, allow_early_termination)) + }) + .collect(); + + let active_count = dwfas.iter().filter(|d| d.is_some()).count(); + if active_count == 0 { + bail!("Root DualConsensus node has no initially active sequences."); + } + + // a root node will have one empty consensus, and one corresponding vec of empty DWFAs + Ok(DualConsensusNode { + is_dual: false, + is_con1_locked: false, + is_con2_locked: false, + consensus1: vec![], + consensus2: vec![], + dwfas1: dwfas, + dwfas2: vec![None; offsets.len()] + }) + } + + /// This will activate the DWFAs for a particular sequence. 
+ /// # Arguments + /// * `sequence` - the sequence that is getting activated + /// * `seq_index` - the sequence index, which will map to the DWFAs + /// * `offset_window` - the window we are searching for a best match + /// * `offset_compare_length` - the amount of bases we are comparing + /// * `wildcard` - optional wildcard for scoring, passed into DWFA + /// * `allow_early_termination` - enables sequences to end partway through the consensus, passed into DWFA + fn activate_sequence(&mut self, sequence: &[u8], seq_index: usize, offset_window: usize, offset_compare_length: usize, wildcard: Option, allow_early_termination: bool) -> Result<(), Box> { + // figure out whether we need to just do con1 or both + let activators = if self.is_dual { + vec![(&mut self.dwfas1, &self.consensus1), (&mut self.dwfas2, &self.consensus2)] + } else { + vec![(&mut self.dwfas1, &self.consensus1)] + }; + + for (dwfas, consensus) in activators.into_iter() { + // make sure everything is currently inactive + assert!(dwfas[seq_index].is_none()); + + let con_len = consensus.len(); + let start_delta = offset_window + offset_compare_length; + let start_position = con_len.saturating_sub(start_delta); + let end_position = con_len.saturating_sub(offset_compare_length); + + // figure out which offset has the best score; assume the middle of the offset window is the best + let mut best_offset = con_len.saturating_sub(offset_compare_length + offset_window / 2); + let mut min_ed = crate::sequence_alignment::wfa_ed_config(&consensus[best_offset..], &sequence[0..offset_compare_length], false, wildcard); + + for p in start_position..end_position { + let ed = crate::sequence_alignment::wfa_ed_config(&consensus[p..], &sequence[0..offset_compare_length], false, wildcard); + if ed < min_ed { + min_ed = ed; + best_offset = p; + } + } + + // now set up the DWFA with the best offset + let mut new_dwfa = DWFALite::new(wildcard, allow_early_termination); + new_dwfa.set_offset(best_offset); + 
new_dwfa.update(sequence, consensus)?; + dwfas[seq_index] = Some(new_dwfa); + } + + Ok(()) + } + + /// Adds a new symbol to the DWFAs + /// # Arguments + /// * `baseline_sequences` - the sequences that are getting compared against in the DWFAs + /// * `symbol` - the symbol to extend each DWFA with + /// * `is_consensus1` - if True, this will check consensus1 DWFAs, otherwise it checks those for consensus2 + /// # Errors + /// * if any of the extensions fail + fn push(&mut self, baseline_sequences: &[&[u8]], symbol: u8, is_consensus1: bool) -> Result<(), Box> { + if is_consensus1 && self.is_con1_locked { + bail!("Consensus 1 is locked, cannot modify"); + } else if !is_consensus1 && self.is_con2_locked { + bail!("Consensus 2 is locked, cannot modify"); + } + + // get the relevant DWFAs + let dwfa_iter = if is_consensus1 { + self.dwfas1.iter_mut() + } else { + self.dwfas2.iter_mut() + }; + + // get the relevant consensus + let consensus_seq = if is_consensus1 { + &mut self.consensus1 + } else { + &mut self.consensus2 + }; + + // first extend the consensus + consensus_seq.push(symbol); + + // now adjust the corresponding DWFA + for (&baseline, opt_dwfa) in baseline_sequences.iter().zip(dwfa_iter) { + if let Some(dwfa) = opt_dwfa { + dwfa.update(baseline, consensus_seq)?; + } + } + Ok(()) + } + + /// Converts this node into a dual node and extends with the new different symbols. 
+ /// # Arguments + /// * `baseline_sequences` - the sequences that are getting compared against in the DWFAs + /// * `symbol1` - the symbol to extend the first consensus with + /// * `symbol2` - the symbol to extend the second consensus with + /// # Errors + /// * if this is already a dual node + /// * if symbol1 == symbol2, they must be different or it's not a dual node + /// * if any of the extensions fail + fn activate_dual(&mut self, baseline_sequences: &[&[u8]], symbol1: u8, symbol2: u8) -> Result<(), Box> { + if self.is_dual { + bail!("Cannot activate dual on a dual node"); + } + self.is_dual = true; + + if symbol1 == symbol2 { + bail!("Cannot activate dual mode with the same extension symbols"); + } + + // clone everything in 1 into 2 + self.consensus2 = self.consensus1.clone(); + self.dwfas2 = self.dwfas1.clone(); + + // now extend both consensus with the new symbols + self.push(baseline_sequences, symbol1, true)?; + self.push(baseline_sequences, symbol2, false)?; + + Ok(()) + } + + /// Returns true if this is a dual node AND one of the two tracked has fewer counts than the minimum + /// # Arguments + /// * `min_count` - the minimum count required to have a dual allele result + fn is_dual_imbalanced(&self, min_count: usize) -> bool { + if self.is_dual { + let counts1 = self.dwfas1.iter() + .filter_map(|d| d.as_ref()) + .count(); + let counts2 = self.dwfas2.iter() + .filter_map(|d| d.as_ref()) + .count(); + + // return true if either is less than the minimum + counts1 < min_count || counts2 < min_count + + /* + let (c1, c2) = self.full_cost(ConsensusCost::L1Distance); + let mut counts1 = 0.0; + let mut counts2 = 0.0; + let equality_score = 0.5; + for (opt_v1, opt_v2) in c1.into_iter().zip(c2.into_iter()) { + if let (Some(v1), Some(v2)) = (opt_v1, opt_v2) { + // both are present, compare + match v1.cmp(&v2) { + std::cmp::Ordering::Less => counts1 += 1.0, + std::cmp::Ordering::Equal => { + counts1 += equality_score; + counts2 += equality_score; + }, + 
std::cmp::Ordering::Greater => counts2 += 1.0, + }; + } else if opt_v1.is_some() { + counts1 += 1.0; + } else if opt_v2.is_some() { + counts2 += 1.0; + } else { + // both are None, I think leave this empty for now + } + } + + // return true if either is less than the minimum + counts1 < (min_count as f64) || counts2 < (min_count as f64) + */ + } else { + false + } + } + + /// This will go through the pairs of DWFA and remove any that have diverge from equality. + /// This is a no-op if the node is not dual yet. + /// # Arguments + /// * `ed_delta` - the maximum difference between the edit distances before the higher one gets dropped from tracking + fn prune_dwfa(&mut self, ed_delta: usize) -> Result<(), Box> { + if self.is_dual { + for (opt_d1, opt_d2) in self.dwfas1.iter_mut().zip(self.dwfas2.iter_mut()) { + if let (Some(d1), Some(d2)) = (opt_d1.as_ref(), opt_d2.as_ref()) { + if d1.edit_distance() + ed_delta < d2.edit_distance() { + // d1 is much less than d2; drop d2 + *opt_d2 = None; + } else if d2.edit_distance() + ed_delta < d1.edit_distance() { + // d2 is much less than d1; drop d1 + *opt_d1 = None; + } + } + } + } + Ok(()) + } + + /// Adds an internal lock that prevents any further additions to the sequence + pub fn lock_sequence(&mut self, is_consensus1: bool) { + if is_consensus1 { + self.is_con1_locked = true; + } else { + self.is_con2_locked = true; + } + } + + /// Finalizes all of the contained DWFAs + /// # Errors + /// * if any of the individual finalizes fail + fn finalize(&mut self, baseline_sequences: &[&[u8]]) -> Result<(), Box> { + // we will always loop over the first + for (&baseline, (opt_dwfa1, opt_dwfa2)) in baseline_sequences.iter() + .zip(self.dwfas1.iter_mut().zip(self.dwfas2.iter_mut())) { + let mut seq_finalized = false; + if let Some(dwfa) = opt_dwfa1 { + dwfa.finalize(baseline, &self.consensus1)?; + seq_finalized = true; + } + + // if this is a dual node, loop over those also + if self.is_dual { + if let Some(dwfa) = opt_dwfa2 { + 
dwfa.finalize(baseline, &self.consensus2)?; + seq_finalized = true; + } + } + + if !seq_finalized { + bail!("Finalize called on DWFA that was never initialized."); + } + } + + // if we are finalizing, we are definitely locking + self.is_con1_locked = true; + self.is_con2_locked = true; + + Ok(()) + } + + /// Returns a tuple of scores for this node of form (indices, scores). + /// It will pick the minimum cost for any sequence where multiple possible consensuses are still being tracked. + /// The minimum cost consensus index will go into `indices` and the actual minimum score into `scores`. + fn costs(&self, consensus_cost: ConsensusCost) -> (Vec, Vec) { + // we need to collect the minimum cost across multiple DWFAs for a given sequence + let num_sequences = self.dwfas1.len(); + let mut best_consensus_index = vec![usize::MAX; num_sequences]; + let mut best_consensus_score = vec![usize::MAX; num_sequences]; + + // iterate over the two DWFA collections + let mut dwfa_iters = [self.dwfas1.iter(), self.dwfas2.iter()]; + for (con_index, dwfa_iter) in dwfa_iters.iter_mut().enumerate() { + for (seq_index, opt_d) in dwfa_iter.enumerate() { + if let Some(d) = opt_d { + let score = match consensus_cost { + ConsensusCost::L1Distance => d.edit_distance(), + ConsensusCost::L2Distance => d.edit_distance().pow(2) + }; + + // if this score is BETTER, then replace the previous + if score < best_consensus_score[seq_index] { + best_consensus_score[seq_index] = score; + best_consensus_index[seq_index] = con_index; + } + } else { + // we can ignore these, they do not have a DWFA for one reason or another + // the assertion below verifies that at least one per sequence is present though + } + } + } + + // make sure all of them match 0 or 1 + // assert!(best_consensus_index.iter().all(|&i| i == 0 || i == 1)); + + // the above no longer holds when a sequence has not been activated yet, replace all of the usize::MAX with 0 + for (bci, bcs) in 
best_consensus_index.iter().zip(best_consensus_score.iter_mut()) { + if *bci == usize::MAX && *bcs == usize::MAX { + *bcs = 0; + } + } + + (best_consensus_index, best_consensus_score) + } + + /// Returns the total score for the node + fn total_cost(&self, consensus_cost: ConsensusCost) -> usize { + let (_best_indices, best_costs) = self.costs(consensus_cost); + best_costs.iter().sum() + } + + /// Returns the full set of tracked costs for the two consensuses. + fn full_cost(&self, consensus_cost: ConsensusCost) -> (Vec>, Vec>) { + let s1 = self.dwfas1.iter() + .map(|opt_dwfa| { + opt_dwfa.as_ref().map(|d| match consensus_cost { + ConsensusCost::L1Distance => d.edit_distance(), + ConsensusCost::L2Distance => d.edit_distance().pow(2) + }) + }).collect(); + let s2 = self.dwfas2.iter() + .map(|opt_dwfa| { + opt_dwfa.as_ref().map(|d| match consensus_cost { + ConsensusCost::L1Distance => d.edit_distance(), + ConsensusCost::L2Distance => d.edit_distance().pow(2) + }) + }).collect(); + + (s1, s2) + } + + /// Returns the node priority. + /// Currently, this is based on 1) lowest cost and 2) consensus length. + /// # Arguments + /// * `consensus_cost` - cost model to evaluate the cost + fn priority(&self, consensus_cost: ConsensusCost) -> NodePriority { + ( + Reverse(self.total_cost(consensus_cost)), + self.max_consensus_length() + ) + } + + /// Returns true if each consensus has at least one DWFA at the end of their respective baseline sequences. 
+ /// # Arguments + /// * `baseline_sequences` - the sequences that are fixed that we want the consensus of + /// * `require_all` - if True, requires *all* of the sequences to be at their end in at least one of the consensuses (this is useful for allow_early_termination) + fn reached_all_end(&self, baseline_sequences: &[&[u8]], require_all: bool) -> bool { + // create an iterator that determines if we're at the end + let mut iter_map = baseline_sequences.iter() + .zip(self.dwfas1.iter().zip(self.dwfas2.iter())) + .map(|(&baseline, (opt_dwfa1, opt_dwfa2))| { + // assert!(opt_dwfa1.is_some() || opt_dwfa2.is_some()); + // these can be None if either A) this one has not started or B) it has started, but dropped off due to high ED + // in either case, it would default to NOT at end (i.e., false) + let p1 = opt_dwfa1.as_ref().map(|d| d.reached_baseline_end(baseline)); + let p2 = opt_dwfa2.as_ref().map(|d| d.reached_baseline_end(baseline)); + + // at least one of them needs to be at the end to pass; untracked does not count + p1.unwrap_or(false) || p2.unwrap_or(false) + }); + + // handle iterator appropriately + if require_all { + iter_map.all(|b| b) + } else { + iter_map.any(|b| b) + } + } + + /// For the given consensus, returns true if at least one *active* DWFA at the end of the baseline sequences. + /// If this is not a dual node, then it will always return false for consensus 2. + /// # Arguments + /// * `baseline_sequences` - the sequences that are fixed that we want the consensus of + /// * `is_consensus1` - if True, this will check consensus1 DWFAs, otherwise it checks those for consensus2 + /// * `require_all` - if True, this requires all active DWFAs to be at the end + fn reached_consensus_end(&self, baseline_sequences: &[&[u8]], is_consensus1: bool, require_all: bool) -> bool { + // TODO: do we need to refactor this to look at both consensuses at once? 
+ let dwfa_iter = if is_consensus1 { + self.dwfas1.iter() + } else { + if !self.is_dual { + // this is not a dual node, but the user is asking for consensus 2 + return false; + } + + self.dwfas2.iter() + }; + + // if we require all, then we do not want to penalize; so it should be True + // if not, then we don't want to count it; so it should be False + let value_for_inactive = require_all; + + // iterate over each DWFA and check if it's at the end + let mut iter_map = baseline_sequences.iter().zip(dwfa_iter) + .map(|(&baseline, opt_dwfa)| { + opt_dwfa.as_ref() + .map(|d| d.reached_baseline_end(baseline)) + .unwrap_or(value_for_inactive) + }); + + if require_all { + iter_map.all(|b| b) + } else { + iter_map.any(|b| b) + } + } + + /// Returns a hashmap of the extension candidates with their votes. + /// The votes are fractional if a sequence has multiple equally likely options. + /// # Arguments + /// * `baseline_sequences` - the sequences that are fixed that we want the consensus of + /// * `wildcard` - an optional wildcard character, will be removed from return set unless it is the only value in it + /// * `is_consensus1` - if True, this will check consensus1 DWFAs, otherwise it checks those for consensus2 + /// * `weighted_by_ed` - if True, then the weights are scaled based on the dual weights comparison; e.g. 
if ed1 = 2 and ed2 = 4, then weights for consensus 1 are ~1/3 and for consensus 2 are ~2/3 of the total + fn get_extension_candidates(&self, baseline_sequences: &[&[u8]], wildcard: Option, is_consensus1: bool, weighted_by_ed: bool) -> HashMap { + // get the relevant DWFAs + let dwfa_iter = if is_consensus1 { + self.dwfas1.iter() + } else { + self.dwfas2.iter() + }; + + // get the relevant consensus + let consensus_seq = if is_consensus1 { + &self.consensus1 + } else { + &self.consensus2 + }; + + let weights = if weighted_by_ed { + self.get_ed_weights(is_consensus1, weighted_by_ed) + } else { + // these weights will always factor in current scoring + vec![1.0; self.dwfas1.len()] + }; + + // now pull out all the candidates and count how many times they were voted on + let mut candidates: HashMap = Default::default(); + for ((&baseline_seq, opt_dwfa), &weight) in baseline_sequences.iter().zip(dwfa_iter).zip(weights.iter()) { + if weight > 0.0 { + if let Some(dwfa) = opt_dwfa { + // get the candidates and the total observation weight + let cand = dwfa.get_extension_candidates(baseline_seq, consensus_seq); + let vote_split = cand.values().sum::() as f64; + + // iterate over each candidate and scale it by the occurrences count / total weight + for (&c, &occ) in cand.iter() { + let entry = candidates.entry(c).or_insert(0.0); + *entry += weight * occ as f64 / vote_split; + } + } + } + } + + if let Some(wc) = wildcard { + if candidates.len() > 1 { + // we have 2+ candidates, we can safely remove the wildcard and have something left over + candidates.remove(&wc); + } + } + + candidates + } + + /// This returns a series of weights that are scaled based on the edit distances to con1 and con2. + /// For example, if ed1 = 2 and ed2 = 5, then a consensus1 scaled ed would be ~ 5 / (2+5). + /// Conversely, the weight for consensus2 would be ~ 2 / (2+5). + /// If `weight_by_ed` is False, this will instead just return 0.0, 0.5, or 1.0; giving full weight when one consensus is better. 
+ /// # Arguments + /// * `is_consensus` - If True, then return weights for consensus 1, otherwise consensus 2. + /// * `weight_by_ed` - If True, then weights are scaled by the relative edit distances. If False, then weights are either 0.0 or 1.0 typically, with 0.5 when equal. + fn get_ed_weights(&self, is_consensus1: bool, weight_by_ed: bool) -> Vec { + if self.is_dual { + let min_ed = 0.5; // prevent divide by zero while still enabling a scaling + // TODO: should we treat these as 1.0, 0.5, or 0.0? + let equality_score = 0.5; // using 0.0 means don't let something that isn't for sure mapped here have a vote + self.dwfas1.iter().zip(self.dwfas2.iter()) + .map(|(d1, d2)| { + let c1 = d1.as_ref().map(|d| (d.edit_distance() as f64).max(min_ed)); + let c2 = d2.as_ref().map(|d| (d.edit_distance() as f64).max(min_ed)); + + if let (Some(v1), Some(v2)) = (c1, c2) { + if weight_by_ed { + // numerator is the opposite entry; i.e for con1, if ed1 = 1 and ed2 = 9, we should get 90% of weight + let numer = if is_consensus1 { v2 } else { v1 }; + numer / (v1 + v2) + } else if v1 == v2 { + // equal weights + equality_score + } else if (is_consensus1 && v1 < v2) || (!is_consensus1 && v2 < v1) { + // the consensus we are looking at is less than the other, so give it full weight + 1.0 + } else { + // the consensus we are looking at is more than the other, so it gets no weight + 0.0 + } + } else if (c1.is_some() && is_consensus1) || (c2.is_some() && !is_consensus1) { + // the one we are checking is present, but the other is not + 1.0 + } else { + // everything else gets no weight + 0.0 + } + }).collect() + } else { + // this one isn't dual currently, just return all full weight + vec![1.0; self.dwfas1.len()] + } + } + + // getters + fn max_consensus_length(&self) -> usize { + self.consensus1.len().max(self.consensus2.len()) + } + + fn is_con1_locked(&self) -> bool { + self.is_con1_locked + } + + fn is_con2_locked(&self) -> bool { + self.is_con2_locked + } +} + +#[cfg(test)] +mod 
tests { + use super::*; + + use std::path::PathBuf; + + use crate::cdwfa_config::CdwfaConfigBuilder; + + // first some more targeted tests + #[test] + fn test_get_ed_weights() { + let vec_sequences = vec![ + b"ACGT".to_vec(), + b"CGTA".to_vec() + ]; + let sequences: Vec<&[u8]> = vec_sequences.iter().map(|v| v.as_slice()).collect(); + let offsets = vec![None; sequences.len()]; + + let mut node = DualConsensusNode::new_root_node(&offsets, None, true).unwrap(); + node.activate_dual(sequences.as_slice(), b'A', b'C').unwrap(); + + let weights1 = node.get_ed_weights(true, true); + assert_eq!(weights1, vec![1.0 / 1.5, 0.5 / 1.5]); + let weights2 = node.get_ed_weights(false, true); + assert_eq!(weights2, vec![0.5 / 1.5, 1.0 / 1.5]); + + let weights1 = node.get_ed_weights(true, false); + assert_eq!(weights1, vec![1.0, 0.0]); + let weights2 = node.get_ed_weights(false, false); + assert_eq!(weights2, vec![0.0, 1.0]); + } + + // below here are mostly end-to-end DualConsensusDWFA tests + + #[derive(Debug, serde::Deserialize)] + struct DualRecord { + consensus: usize, + edits: usize, + sequence: String + } + + /// Wrapper test function that loads a dual test from a csv file. + /// Expected columns are "consensus" (1 or 2), "edits" (u64), and "sequence" (String). + /// Returns a tuple of (sequences, DualConsensus). 
+ /// # Arguments + /// * `filename` - the file path to load + /// * `include_consensus` - if True, it will load the first consensus read into the sequences + /// * `cost_mode` - the cost mode getting tested + fn load_dual_csv_test(filename: &std::path::Path, include_consensus: bool, cost_mode: ConsensusCost) -> (Vec>, DualConsensus) { + let mut sequences = vec![]; + let mut is_consensus1 = vec![]; + let mut ed1 = vec![]; + let mut ed2 = vec![]; + + let mut con1: Option> = None; + let mut con2: Option> = None; + + let mut csv_reader = csv::ReaderBuilder::new() + .has_headers(true) + .from_path(filename) + .unwrap(); + for row in csv_reader.deserialize() { + let record: DualRecord = row.unwrap(); + + let is_con1 = record.consensus == 1; + let edits = match cost_mode { + ConsensusCost::L1Distance => record.edits, + ConsensusCost::L2Distance => record.edits.pow(2) + }; + let sequence = record.sequence.as_bytes().to_vec(); + + if is_con1 { + if con1.is_none() && edits == 0 { + // this is the first 0 ED for consensus 1 + con1 = Some(sequence.clone()); + if !include_consensus { + continue; + } + } + ed1.push(edits); + } else { + if con2.is_none() && edits == 0 { + // this is the first 0 ED for consensus 2 + con2 = Some(sequence.clone()); + if !include_consensus { + continue; + } + } + ed2.push(edits); + } + + is_consensus1.push(is_con1); + sequences.push(sequence); + } + + // make sure that either we do not have consensus 2 OR consensus 1 comes before consensus 2 + assert!(con2.is_none() || con1.as_ref().unwrap() < con2.as_ref().unwrap()); + + let consensus1 = Consensus::new(con1.unwrap(), cost_mode, ed1); + let consensus2 = con2.map(|c2| Consensus::new(c2, cost_mode, ed2)); + let consensus = DualConsensus { + consensus1, + consensus2, + is_consensus1, + scores1: vec![None; sequences.len()], + scores2: vec![None; sequences.len()] + }; + + (sequences, consensus) + } + + /// Entry point for most file-based tests. 
+ /// # Arguments + /// * `filename` - the test file to load, will be a csv + /// * `include_consensus` - if True, this will include the consensus sequence line as a read + /// * `opt_config` - optional alternate config, otherwise a default is created + fn run_test_file(filename: &str, include_consensus: bool, opt_config: Option) { + let config = opt_config.unwrap_or( + CdwfaConfigBuilder::default() + .wildcard(Some(b'*')) + .build().unwrap() + ); + + let (sequences, expected_consensus) = load_dual_csv_test(&PathBuf::from(filename), include_consensus, config.consensus_cost); + + // set queue size large since we do not want to worry about filtering for this test + let mut consensus_dwfa = DualConsensusDWFA::with_config(config).unwrap(); + for sequence in sequences.iter() { + consensus_dwfa.add_sequence(sequence).unwrap(); + } + + // make sure our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // now check that the consensus is the same as our sequence + let consensus = consensus_dwfa.consensus().unwrap(); + assert_eq!(consensus, vec![expected_consensus]); + } + + #[test] + fn test_single_sequence() { + let sequence = b"ACGTACGTACGT"; + let mut consensus_dwfa = DualConsensusDWFA::default(); + consensus_dwfa.add_sequence(sequence).unwrap(); + + // make sure our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // now check that the consensus is the same as our sequence + let consensus = consensus_dwfa.consensus().unwrap(); + assert_eq!(consensus, vec![DualConsensus { + consensus1: Consensus::new( + sequence.to_vec(), + ConsensusCost::L1Distance, + vec![0] + ), + consensus2: None, + is_consensus1: vec![true], + // these are not checked + scores1: vec![], + scores2: vec![] + }]); + } + + #[test] + fn test_trio_sequence() { + let sequence = b"ACGTACGTACGT"; + let sequence2 = b"ACGTACCTACGT"; + let mut consensus_dwfa = DualConsensusDWFA::default(); + + // add the first sequence twice, it will be 
the consensus + consensus_dwfa.add_sequence(sequence).unwrap(); + consensus_dwfa.add_sequence(sequence).unwrap(); + consensus_dwfa.add_sequence(sequence2).unwrap(); + + // make sure our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // now check that the consensus found sequence + let consensus = consensus_dwfa.consensus().unwrap(); + + // sequence2 is alphabetically before sequence 1, so it will come first + assert_eq!(consensus, vec![DualConsensus { + consensus1: Consensus::new( + sequence.to_vec(), + ConsensusCost::L1Distance, + vec![0, 0, 1] + ), + consensus2: None, + is_consensus1: vec![true, true, true], + // these are not checked + scores1: vec![], + scores2: vec![] + }]); + } + + #[test] + fn test_complicated() { + let expected_consensus = b"ACGTACGTACGT"; + let sequences = [ + // b"ACGTACG-TACGT" + // b"AC-TACGGTACGT" + b"ACTACGGTACGT", + // b"ACGTAAG-TCCGT" + b"ACGTAAGTCCGT", + // b"AAGTACG-TACGT" + b"AAGTACGTACGT" + ]; + + // build a consensus over all inputs + let mut consensus_dwfa = DualConsensusDWFA::default(); + for &sequence in sequences.iter() { + consensus_dwfa.add_sequence(sequence).unwrap(); + } + + // make sure our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // now check that the consensus found the right thing and only that + let consensus = consensus_dwfa.consensus().unwrap(); + // assert_eq!(consensus.len(), 1); + assert_eq!(consensus, vec![DualConsensus { + consensus1: Consensus::new(expected_consensus.to_vec(), ConsensusCost::L1Distance, vec![2, 2, 1]), + consensus2: None, + is_consensus1: vec![true; 3], + // these are not checked + scores1: vec![], + scores2: vec![] + }]); + } + + #[test] + fn test_wildcards() { + let expected_consensus = b"ACGTACGTACGT"; + let sequences = [ + // C insertion and tail missing + b"ACGTACCGT****".to_vec(), + // C>T and head+tail missing + b"**GTATGTAC**".to_vec(), + // just head missing + b"****ACGTACGT".to_vec() + ]; + + // 
build a consensus over all inputs + let mut consensus_dwfa = DualConsensusDWFA::with_config( + CdwfaConfigBuilder::default() + .wildcard(Some(b'*')) + .build().unwrap() + ).unwrap(); + for sequence in sequences.iter() { + consensus_dwfa.add_sequence(sequence).unwrap(); + } + + // make sure our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // now check that the consensus found the right thing and only that + let consensus = consensus_dwfa.consensus().unwrap(); + // assert_eq!(consensus.len(), 1); + assert_eq!(consensus, vec![DualConsensus { + consensus1: Consensus::new(expected_consensus.to_vec(), ConsensusCost::L1Distance, vec![1, 1, 0]), + consensus2: None, + is_consensus1: vec![true; 3], + // these are not checked + scores1: vec![], + scores2: vec![] + }]); + } + + #[test] + fn test_all_wildcards() { + let _expected_consensus = b"ACGTACGTACGT"; + let actual_consensus = b"*CGTACG*ACG*"; + let sequences = [ + // 1 ins + b"*CGTAACG*ACG*".to_vec(), + // nothing other than * + b"*CGTACG*ACG*".to_vec(), + // C>T + b"*CGTACG*ATG*".to_vec() + ]; + + // build a consensus over all inputs + let mut consensus_dwfa = DualConsensusDWFA::with_config( + CdwfaConfigBuilder::default() + .wildcard(Some(b'*')) + .build().unwrap() + ).unwrap(); + for sequence in sequences.iter() { + consensus_dwfa.add_sequence(sequence).unwrap(); + } + + // make sure our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // now check that the consensus found the right thing and only that + let consensus = consensus_dwfa.consensus().unwrap(); + // assert_eq!(consensus.len(), 1); + assert_eq!(consensus, vec![DualConsensus { + consensus1: Consensus::new(actual_consensus.to_vec(), ConsensusCost::L1Distance, vec![1, 0, 1]), + consensus2: None, + is_consensus1: vec![true; 3], + // these are not checked + scores1: vec![], + scores2: vec![] + }]); + } + + #[test] + fn test_dual_sequence() { + let sequence = b"ACGT"; + let alt_sequence 
= b"AGGT"; // single snp test + let mut consensus_dwfa = DualConsensusDWFA::with_config( + CdwfaConfigBuilder::default() + .min_count(1) + .build().unwrap() + ).unwrap(); + consensus_dwfa.add_sequence(sequence).unwrap(); + consensus_dwfa.add_sequence(alt_sequence).unwrap(); + + // make sure our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // now check that the consensus is the same as our sequence + let consensus = consensus_dwfa.consensus().unwrap(); + assert_eq!(consensus, vec![DualConsensus { + consensus1: Consensus::new(sequence.to_vec(), ConsensusCost::L1Distance, vec![0]), + consensus2: Some(Consensus::new(alt_sequence.to_vec(), ConsensusCost::L1Distance, vec![0])), + is_consensus1: vec![true, false], + // these are not checked + scores1: vec![], + scores2: vec![] + }]); + } + + #[test] + fn test_dual_unequal_001() { + let sequence = b"ACGT"; + let alt_sequence = b"AGGTA"; // single snp test + let mut consensus_dwfa = DualConsensusDWFA::with_config( + CdwfaConfigBuilder::default() + .min_count(1) + .build().unwrap() + ).unwrap(); + consensus_dwfa.add_sequence(sequence).unwrap(); + consensus_dwfa.add_sequence(alt_sequence).unwrap(); + + // make sure our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // now check that the consensus is the same as our sequence + let consensus = consensus_dwfa.consensus().unwrap(); + assert_eq!(consensus, vec![DualConsensus { + consensus1: Consensus::new(sequence.to_vec(), ConsensusCost::L1Distance, vec![0]), + consensus2: Some(Consensus::new(alt_sequence.to_vec(), ConsensusCost::L1Distance, vec![0])), + is_consensus1: vec![true, false], + // these are not checked + scores1: vec![], + scores2: vec![] + }]); + } + + #[test] + fn test_dual_unequal_002() { + let sequence = b"ACGTA"; + let alt_sequence = b"AGGT"; // single snp test + let mut consensus_dwfa = DualConsensusDWFA::with_config( + CdwfaConfigBuilder::default() + .min_count(1) + 
.build().unwrap() + ).unwrap(); + consensus_dwfa.add_sequence(sequence).unwrap(); + consensus_dwfa.add_sequence(alt_sequence).unwrap(); + + // make sure our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // now check that the consensus is the same as our sequence + let consensus = consensus_dwfa.consensus().unwrap(); + assert_eq!(consensus, vec![DualConsensus { + consensus1: Consensus::new(sequence.to_vec(), ConsensusCost::L1Distance, vec![0]), + consensus2: Some(Consensus::new(alt_sequence.to_vec(), ConsensusCost::L1Distance, vec![0])), + is_consensus1: vec![true, false], + // these are not checked + scores1: vec![], + scores2: vec![] + }]); + } + + #[test] + fn test_dual_noise_before_variation() { + // this test includes a noisy C insertion; it will cause a dual node split that get's corrected later when the true variation is reached + let con1 = b"ACGTACGTACGT"; + let con2 = b"ACGTACGTCCCT"; // two SNPs towards the end that will be consistent + let sequences = [ + b"ACGTACGTACGT".to_vec(), + b"ACCGTACGTACGT".to_vec(), // noisy C insert + b"ACGTACGTACGT".to_vec(), + b"ACGTACGTCCCT".to_vec(), + b"ACGTACGTCCCT".to_vec(), + b"ACCGTACGTCCCT".to_vec() // noisy C insert + ]; + + // set queue size large since we do not want to worry about filtering for this test + let mut consensus_dwfa = DualConsensusDWFA::with_config( + CdwfaConfigBuilder::default() + .min_count(1) + .max_queue_size(1000) + .build().unwrap() + ).unwrap(); + for sequence in sequences.iter() { + consensus_dwfa.add_sequence(sequence).unwrap(); + } + + // make sure our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // now check that the consensus is the same as our sequence + let consensus = consensus_dwfa.consensus().unwrap(); + assert_eq!(consensus, vec![DualConsensus { + consensus1: Consensus::new(con1.to_vec(), ConsensusCost::L1Distance, vec![0, 1, 0]), + consensus2: Some(Consensus::new(con2.to_vec(), 
ConsensusCost::L1Distance, vec![0, 0, 1])), + is_consensus1: vec![true, true, true, false, false, false], + // these are not checked + scores1: vec![], + scores2: vec![] + }]); + } + + #[test] + fn test_multi_extension() { + // test the scenario where 3+ extensions are identified + let con1 = b"ACGTACGTACGT"; + let con2 = b"ACGTACGTCCCT"; // two SNPs towards the end that will be consistent + let sequences = [ + // matches con 1 + b"ACGTACGTACGT".to_vec(), + b"ACGTACGTACGT".to_vec(), + b"ACGTACGTGCGT".to_vec(),// A read as G, should create multi-extension for exploration + // matches con2 + b"ACGTACGTCCCT".to_vec(), + b"ACGTACGTCCCT".to_vec(), + b"ACGTACGTGCCT".to_vec() // first C read as G, should create multi-extension for exploration + ]; + + // set queue size large since we do not want to worry about filtering for this test + let mut consensus_dwfa = DualConsensusDWFA::with_config( + CdwfaConfigBuilder::default() + .min_count(1) + .max_queue_size(1000) + .build().unwrap() + ).unwrap(); + for sequence in sequences.iter() { + consensus_dwfa.add_sequence(sequence).unwrap(); + } + + // make sure our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // now check that the consensus is the same as our sequence + let consensus = consensus_dwfa.consensus().unwrap(); + assert_eq!(consensus, vec![DualConsensus { + consensus1: Consensus::new(con1.to_vec(), ConsensusCost::L1Distance, vec![0, 0, 1]), + consensus2: Some(Consensus::new(con2.to_vec(), ConsensusCost::L1Distance, vec![0, 0, 1])), + is_consensus1: vec![true, true, true, false, false, false], + // these are not checked + scores1: vec![], + scores2: vec![] + }]); + } + + #[test] + fn test_equal_options() { + // test the scenario multiple possible dual consensuses are possible + let con1 = b"ACGTACGTACGT"; // 00 + let con2 = b"ACGTCCGTCCGT"; // 11 + let con3 = b"ACGTACGTCCGT"; // 01 + let con4 = b"ACGTCCGTACGT"; // 10 + let sequences = [ + con1.to_vec(), + con2.to_vec(), + 
con3.to_vec(), + con4.to_vec() + ]; + + // set queue size large since we do not want to worry about filtering for this test + let mut consensus_dwfa = DualConsensusDWFA::with_config( + CdwfaConfigBuilder::default() + .min_count(1) + .max_queue_size(1000) + .build().unwrap() + ).unwrap(); + for sequence in sequences.iter() { + consensus_dwfa.add_sequence(sequence).unwrap(); + } + + // make sure our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // this result actually has 6 equally possible dual consensuses, each of which is ed = 2 in total + let consensus = consensus_dwfa.consensus().unwrap(); + assert_eq!(consensus.len(), 6); + for dc in consensus.iter() { + // make sure it's dual + assert!(dc.consensus2.is_some()); + // make sure the edits are 2 + assert_eq!(2, dc.consensus1.scores().iter().sum::()+dc.consensus2.as_ref().unwrap().scores().iter().sum::()); + + // future: if we need to test the exact result, we're going to have to add it by hand later + } + } + + #[test] + fn test_tail_extension() { + // test the scenario we have a simple tail extension + // for now, this will NOT create a dual solution, but will only return one possible solution + // future: we may want it to somehow figure out that this is dual, but this seems like a minor edge case for now + let con1 = b"ACGT"; + let con2 = b"ACGTT"; // one consensus is 1 bp longer in the tail + let sequences = [ + con1.to_vec(), + con2.to_vec() + ]; + + // set queue size large since we do not want to worry about filtering for this test + let mut consensus_dwfa = DualConsensusDWFA::with_config( + CdwfaConfigBuilder::default() + .min_count(1) + .max_queue_size(1000) + .build().unwrap() + ).unwrap(); + for sequence in sequences.iter() { + consensus_dwfa.add_sequence(sequence).unwrap(); + } + + // make sure our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // now check that the consensus is the same as our sequence + let consensus = 
consensus_dwfa.consensus().unwrap(); + assert_eq!(consensus, vec![DualConsensus { + consensus1: Consensus::new(con1.to_vec(), ConsensusCost::L1Distance, vec![0, 1]), + consensus2: None, + is_consensus1: vec![true, true], + // these are not checked + scores1: vec![], + scores2: vec![] + }, + DualConsensus { + consensus1: Consensus::new(con2.to_vec(), ConsensusCost::L1Distance, vec![1, 0]), + consensus2: None, + is_consensus1: vec![true, true], + // these are not checked + scores1: vec![], + scores2: vec![] + }]); + } + + #[test] + fn test_csv_dual_001() { + run_test_file("./tests/dual_001.csv", true, None); + } + + #[test] + fn test_dual_max_ed_delta() { + // this is the same test as above BUT we have restricted the dual_max_ed_delta to too low, resulting in one read getting mis-assigned + // this is not normal, but we are testing that the functionality is correct + let cost_mode = ConsensusCost::L1Distance; + let (sequences, expected_consensus) = load_dual_csv_test(&PathBuf::from("./tests/dual_001.csv"), true, cost_mode); + + // third read gets swapped and has a worse ED than before + let expected_consensus = vec![ + DualConsensus { + consensus1: Consensus::new( + expected_consensus.consensus1.sequence().to_vec(), + ConsensusCost::L1Distance, + vec![0, 4, 4, 2] // delete the third entry here + ), + consensus2: Some(Consensus::new( + expected_consensus.consensus2.as_ref().unwrap().sequence().to_vec(), + ConsensusCost::L1Distance, + vec![3, 0, 0, 0, 0, 0] // shift it here, with a worse ED + )), + is_consensus1: vec![true, true, false, true, true, false, false, false, false, false], // mark third from true -> false + // these are not checked + scores1: vec![], + scores2: vec![] + } + ]; + + // set queue size large since we do not want to worry about filtering for this test + let mut consensus_dwfa = DualConsensusDWFA::with_config( + CdwfaConfigBuilder::default() + .wildcard(Some(b'*')) + .dual_max_ed_delta(0) // set this intentionally low to bundle a sequence 
incorrectly in our result + .build().unwrap() + ).unwrap(); + for sequence in sequences.iter() { + consensus_dwfa.add_sequence(sequence).unwrap(); + } + + // make sure our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // now check that the consensus is the same as our sequence + let consensus = consensus_dwfa.consensus().unwrap(); + assert_eq!(consensus, expected_consensus); + } + + #[test] + fn test_csv_length_gap_001() { + run_test_file("./tests/length_gap_001.csv", false, Some( + CdwfaConfigBuilder::default() + .wildcard(Some(b'*')) + .min_count(2) + .dual_max_ed_delta(5) + .max_queue_size(1000) + .consensus_cost(ConsensusCost::L2Distance) + .build().unwrap() + )); + } + + #[test] + fn test_csv_early_termination_001() { + run_test_file("./tests/dual_early_termination_001.csv", true, Some( + CdwfaConfigBuilder::default() + .wildcard(Some(b'*')) + .allow_early_termination(true) + .build().unwrap() + )); + } + + #[test] + fn test_offset_windows() { + // this test is _only_ for verifying that our offsets are getting correctly adjust to match + let expected_consensus = b"ACGTACGTACGTACGT"; + let sequences = [ + // no offset + b"ACGTACGTACGTACGT".to_vec(), + // offset 4 + b"ACGTACGTACGT".to_vec(), + // offset 6 + b"GTACGTACGT".to_vec(), + ]; + + // make one offset exactly match and another be right on our boundary (1-shift here) + let offsets = [None, Some(4), Some(7)]; + + // build a consensus over all inputs + let mut consensus_dwfa = DualConsensusDWFA::with_config( + CdwfaConfigBuilder::default() + .offset_window(1) // both are exactly one off + .offset_compare_length(4) // semi-arbitrary, just needs to be short here given our seq-len + .build().unwrap() + ).unwrap(); + for (sequence, offset) in sequences.iter().zip(offsets.iter()) { + consensus_dwfa.add_sequence_offset(sequence, offset.clone()).unwrap(); + } + + // make sure our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // now 
check that the consensus found the right thing and only that + let consensus = consensus_dwfa.consensus().unwrap(); + assert_eq!(consensus.len(), 1); + assert!(!consensus[0].is_dual()); + assert_eq!(expected_consensus, consensus[0].consensus1().sequence()); + assert_eq!(&[0, 0, 0], consensus[0].consensus1().scores()); + } + + #[test] + fn test_offset_gap_err() { + // this test is _only_ for verifying that our offsets are getting correctly adjust to match + let sequences = [ + // no offset + b"ACGTACGTACGTACGT".to_vec(), + // massive offset + b"ACGTACGTACGTACGT".to_vec(), + ]; + + // one starts and one is way out there + let offsets = [None, Some(1000)]; + + // build a consensus over all inputs + let mut consensus_dwfa = DualConsensusDWFA::with_config( + CdwfaConfigBuilder::default() + .offset_window(1) // both are exactly one off + .offset_compare_length(4) // semi-arbitrary, just needs to be short here given our seq-len + .build().unwrap() + ).unwrap(); + for (sequence, offset) in sequences.iter().zip(offsets.iter()) { + consensus_dwfa.add_sequence_offset(sequence, offset.clone()).unwrap(); + } + + // make sure our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // now check that the consensus found the right thing and only that + let consensus_err = consensus_dwfa.consensus(); + assert!(consensus_err.is_err()); + let error = consensus_err.err().unwrap(); + assert_eq!(error.to_string(), "Finalize called on DWFA that was never initialized."); + } +} \ No newline at end of file diff --git a/src/dynamic_wfa.rs b/src/dynamic_wfa.rs new file mode 100644 index 0000000..7110fd1 --- /dev/null +++ b/src/dynamic_wfa.rs @@ -0,0 +1,483 @@ + +use rustc_hash::FxHashMap as HashMap; +use simple_error::bail; + +/// The core dynamic WFA structure for a lite implementation. +/// It is structured such that all the sequences being built are maintained **outside** of this struct (hence the "lite"). 
Essentially, it keeps the wavefront information
+ is_finalized: bool, + /// Sets an optional wildcard symbol that will match anything + wildcard: Option, + /// Allows the algorithm to terminate when it reaches the end of the baseline_seq + allow_early_termination: bool, + // this is an offset into `other_seq` that the baseline starts + offset: usize +} + +impl Default for DWFALite { + fn default() -> Self { + DWFALite { + edit_distance: 0, + wavefront: vec![0], + is_finalized: false, + wildcard: None, + allow_early_termination: false, + offset: 0 + } + } +} + + +impl DWFALite { + /// Creates a new dynamic WFA instance with user-provided options + /// # Arguments + /// `wildcard` - a known wildcard symbol that will match anything + pub fn new(wildcard: Option, allow_early_termination: bool) -> DWFALite { + DWFALite { + wildcard, + allow_early_termination, + ..Default::default() + } + } + + /// Sets an offset into `other_seq` where the baseline starts. + /// This will effectively ignore the first `offset` characters of `other_seq`, as if they were not present. + /// # Arguments + /// * `offset` - the number of characters to ignore + pub fn set_offset(&mut self, offset: usize) { + self.offset = offset; + } + + /// This is the main function to extend the WFA with a new symbol. + /// This function will handle all of the extending and potential edit distance increases as well. + /// # Arguments + /// * `baseline_seq` - the baseline sequence, theoretically fixed + /// * `other_seq` - the other sequence, typically getting updates + /// # Errors + /// * If this function is called after `finalize()` has been called. 
+ pub fn update(&mut self, baseline_seq: &[u8], other_seq: &[u8]) -> Result> { + if self.is_finalized { + bail!("Cannot push more bases after finalizing a DWFA"); + } + + // maximally extend everything along the current diagonals + self.extend(baseline_seq, other_seq)?; + + // check how it looks + let mut maximum_distance = self.maximum_other_distance(); + while maximum_distance < other_seq.len() && !(self.allow_early_termination && self.reached_baseline_end(baseline_seq)){ + // increase the edit distance, re-extension happens automatically + self.increase_edit_distance(baseline_seq, other_seq)?; + + // recalculate this + maximum_distance = self.maximum_other_distance(); + } + + // final assertion just to make sure we don't break anything + assert!( + maximum_distance == other_seq.len() || + (self.allow_early_termination && self.maximum_baseline_distance() == baseline_seq.len()) + ); + Ok(self.edit_distance) + } + + /// This will take the current wavefront and try to extend each one along it's current diagonal. + /// In most cases, this will not do much work. + /// This should also be stable such that calling it after finalizing has no impact. 
+ /// # Arguments + /// * `baseline_seq` - the baseline sequence, theoretically fixed + /// * `other_seq` - the other sequence, typically getting updates + /// # Errors + /// * None so far + fn extend(&mut self, baseline_seq: &[u8], other_seq: &[u8]) -> Result<(), Box> { + // this is easier logic than trying to handle the option syntax below + let is_wildcard_disabled = self.wildcard.is_none(); + let wildcard = self.wildcard.unwrap_or_default(); + + for (i, d) in self.wavefront.iter_mut().enumerate() { + // `i` is the index in the wavefront + // `i // 2` is always the middle diagonal + // anything less than that has used more baseline symbols + // anything more than that has used more other symbols + // anything above or below needs to shift the comparison + + // for this particular wavefront, extend as far as possible + loop { + // example at edit distance 1: + // baseline offset would equal: [d+1, d, d-1] which corresponds to + // a skipped base in primary, equal usage, and a skipped base in other + + // if we truncate a wavefront (e.g., a bounded size); then `i` below will be too small for every wavefront from the start + // that gets truncated; this means we want the below formula to be `*d + self.edit_distance - (i + truncate_prefix)` + // where `truncate_prefix` is the number we have cut off the front (in total). 
+ // IMPORTANT: this formula will need to get updated everywhere that `baseline_offset` is calculated + + let baseline_offset = *d + self.edit_distance - i; + let other_offset = *d + self.offset; + if baseline_offset >= baseline_seq.len() || + other_offset >= other_seq.len() || + ( + // not equal + baseline_seq[baseline_offset] != other_seq[other_offset] && + // AND (wildcards are disable OR baseline != wildcard) + (is_wildcard_disabled || baseline_seq[baseline_offset] != wildcard) + ) { + // if we are past the end of either sequence OR + // the sequences are not equal at this position THEN + // we are done extending + break; + } + + // we are not done, so add one to this wavefront + *d += 1; + } + } + Ok(()) + } + + /// This will increase the edit distance for this DWFA and create a new larger wavefront. + /// This function will automatically call `extend()` to enforce the assumptions. + /// # Arguments + /// * `baseline_seq` - the baseline sequence, theoretically fixed + /// * `other_seq` - the other sequence, typically getting updates + /// # Errors + /// * If the DWFA is already finalized + fn increase_edit_distance(&mut self, baseline_seq: &[u8], other_seq: &[u8]) -> Result<(), Box> { + if self.is_finalized { + bail!("Cannot increase edit distance after finalizing a DWFA"); + } + // first, increase the distance we're at + self.edit_distance += 1; + + // create an empty new wavefront + let new_wf_len = self.wavefront.len() + 2; + let mut new_wavefront: Vec = vec![0; new_wf_len]; + + // now we need to populate the wavefront + for (i, &d) in self.wavefront.iter().enumerate() { + // deletion (skipping) of a base in `baseline_seq`; this does not change the distance into `other_seq` + new_wavefront[i] = new_wavefront[i].max(d); + + // mismatch progresses both sequences + new_wavefront[i+1] = new_wavefront[i+1].max(d + 1); + + // insertion of a base into `baseline_seq`; this progresses `other_seq` + new_wavefront[i+2] = new_wavefront[i+2].max(d + 1); + } + + // 
finally, save the new wavefront + self.wavefront = new_wavefront; + + // re-extend + self.extend(baseline_seq, other_seq)?; + Ok(()) + } + + /// This function signals that base insertion into `other_seq` is completed. + /// This will trigger the algorithm to make sure we have reached the end of both the `baseline_seq`, potentially increasing edit distance further. + /// Note: other_seq should already be at the end UNLESS we allow_early_termination + /// # Arguments + /// * `baseline_seq` - the baseline sequence, theoretically fixed + /// * `other_seq` - the other sequence, typically getting updates + /// # Errors + /// * If the edit distance cannot be increased further and it needs to be. + pub fn finalize(&mut self, baseline_seq: &[u8], other_seq: &[u8]) -> Result<(), Box> { + if self.is_finalized { + bail!("Cannot finalize a DWFA twice."); + } + while self.maximum_baseline_distance() < baseline_seq.len() { + // while we have not reached the end of the primary sequence + self.increase_edit_distance(baseline_seq, other_seq)?; + } + Ok(()) + } + + /// Helper function that will determine the farthest distance reached into the `baseline_seq` so far. + pub fn maximum_baseline_distance(&self) -> usize { + // baseline distance requires some compute + // the 0-index corresponds to deleting `edit_distance` bases in `baseline`, so it has the largest offset + // each additional iteration pushes the diagonal closer to inserting bases into `baseline`, so the shift gets progressively smaller + self.wavefront.iter().enumerate() + .map(|(i, &d)| d + self.edit_distance - i) + .max().unwrap() + } + + /// Helper function that will determine the farthest distance reached into the `other_seq` so far. + /// After a public function call, this should _always_ be the `other_seq` length. 
+ pub fn maximum_other_distance(&self) -> usize { + // other distance is directly tracked in our wavefront + self.offset + *self.wavefront.iter().max().unwrap() + } + + /// Returns true if the farther wavefront in the baseline is at the end + /// # Arguments + /// * `baseline_seq` - the baseline sequence, theoretically fixed + pub fn reached_baseline_end(&self, baseline_seq: &[u8]) -> bool { + self.maximum_baseline_distance() == baseline_seq.len() + } + + /// This will return the set of candidate extensions that do not require increasing the edit distance. + /// It also includes how many times that character was counted in the event of multiple possible extension points. + /// # Arguments + /// * `baseline_seq` - the baseline sequence, theoretically fixed + /// * `other_seq` - the other sequence, typically getting updates + pub fn get_extension_candidates(&self, baseline_seq: &[u8], other_seq: &[u8]) -> HashMap { + let mut ret: HashMap = Default::default(); + for (i, &d) in self.wavefront.iter().enumerate() { + let other_offset = d + self.offset; + if other_offset == other_seq.len() { + let offset = d + self.edit_distance - i; + if offset < baseline_seq.len() { + // ret.insert(baseline_seq[offset]); + let entry = ret.entry(baseline_seq[offset]).or_insert(0); + *entry += 1; + } + } + } + ret + } + + // Getters below + pub fn edit_distance(&self) -> usize { + self.edit_distance + } + + pub fn wavefront(&self) -> &[usize] { + &self.wavefront + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_new() { + let dwfa = DWFALite::default(); + assert_eq!(dwfa.edit_distance(), 0); + assert_eq!(dwfa.wavefront(), &[0]); + } + + #[test] + fn test_exact_match() { + let sequence = b"ACGTACGTACGT"; + let mut other_seq = vec![]; + let mut dwfa = DWFALite::default(); + for &c in sequence.iter() { + other_seq.push(c); + assert_eq!(dwfa.update(sequence, &other_seq).unwrap(), 0); + } + } + + #[test] + fn test_simple_mismatch() { + let sequence = b"ACGTACGTACGT"; + 
let alt_sequence = b"ACGTACCTACGT"; + let mut dwfa = DWFALite::default(); + for l in 0..alt_sequence.len() { + dwfa.update(sequence, &alt_sequence[..(l+1)]).unwrap(); + } + assert_eq!(dwfa.edit_distance(), 1); + } + + #[test] + fn test_simple_insertion() { + let sequence = b"ACGTACGTACGT"; + let alt_sequence = b"ACGTACIGTACGT"; + let mut dwfa = DWFALite::default(); + for l in 0..alt_sequence.len() { + dwfa.update(sequence, &alt_sequence[..(l+1)]).unwrap(); + } + assert_eq!(dwfa.edit_distance(), 1); + } + + #[test] + fn test_simple_deletion() { + let sequence = b"ACGTACGTACGT"; + let alt_sequence = b"ACGTACTACGT"; + let mut dwfa = DWFALite::default(); + for l in 0..alt_sequence.len() { + dwfa.update(sequence, &alt_sequence[..(l+1)]).unwrap(); + } + assert_eq!(dwfa.edit_distance(), 1); + } + + #[test] + fn test_complex_001() { + let sequence = b"ACGTACGTACGT"; + let alt_sequence = b"ACTACGCACGGGT"; + let mut dwfa = DWFALite::default(); + for l in 0..alt_sequence.len() { + dwfa.update(sequence, &alt_sequence[..(l+1)]).unwrap(); + } + assert_eq!(dwfa.edit_distance(), 4); + } + + #[test] + fn test_complex_002() { + //modified_seq has 2 separate deletions, 1 2bp insertion, and 1 mismatch + let sequence = b"AACGGATCAAGCTTACCAGTATTTACGT"; + let alt_sequence = b"AACGGACAAAAGCTTACCTGTATTACGT"; + + let mut dwfa = DWFALite::default(); + /* + for l in 0..alt_sequence.len() { + dwfa.update(sequence, &alt_sequence[..(l+1)]).unwrap(); + } + */ + // test a shortcut + dwfa.update(sequence, alt_sequence).unwrap(); + assert_eq!(dwfa.edit_distance(), 5); + } + + #[test] + fn test_big_insertion() { + // one big insertion in the middle + let sequence = b"AACGGATTTTACGT"; + let alt_sequence = b"AACGGATAAAAGCTTACCTGTTTTACGT"; + + let mut dwfa = DWFALite::default(); + for l in 0..alt_sequence.len() { + dwfa.update(sequence, &alt_sequence[..(l+1)]).unwrap(); + } + assert_eq!(dwfa.edit_distance(), alt_sequence.len() - sequence.len()); + } + + #[test] + fn test_big_deletion() { + // one big 
deletion in the middle + let sequence = b"ATTTTTTTTTTAAAAAAAAAA"; + let alt_sequence = b"AAAAAAAAAAA"; + + let mut dwfa = DWFALite::default(); + for l in 0..alt_sequence.len() { + dwfa.update(sequence, &alt_sequence[..(l+1)]).unwrap(); + } + assert_eq!(dwfa.edit_distance(), sequence.len() - alt_sequence.len()); + } + + #[test] + fn test_required_finalize() { + // the ALT is a lot smaller than the baseline, so we need to run a finalize to get the full ED + let sequence = b"ATTTTTTTTTTA"; + let alt_sequence = b"AA"; + + let mut dwfa = DWFALite::default(); + for l in 0..alt_sequence.len() { + dwfa.update(sequence, &alt_sequence[..(l+1)]).unwrap(); + } + + // here it has only compared AT to AA, so ED=1 + assert_eq!(dwfa.edit_distance(), 1); + + // after finalizing, it will do end-to-end comparison + dwfa.finalize(sequence, alt_sequence).unwrap(); + assert_eq!(dwfa.edit_distance(), sequence.len() - alt_sequence.len()); + } + + #[test] + fn test_cloning() { + let sequence = b"AAAAAAA"; + let alt_sequence = b"AAACAAA"; + + let mut dwfa = DWFALite::default(); + let mut dwfa2 = dwfa.clone(); + for l in 0..alt_sequence.len() { + dwfa.update(sequence, &sequence[..(l+1)]).unwrap(); + dwfa2.update(sequence, &alt_sequence[..(l+1)]).unwrap(); + + if sequence[l] == alt_sequence[l] { + // same sequence still + assert_eq!(dwfa, dwfa2); + } else { + // should have different sequences now + assert_ne!(dwfa, dwfa2); + + // re-clone the first one + dwfa2 = dwfa.clone(); + } + } + + // in the end, both should exactly match due to cloning + assert_eq!(dwfa.edit_distance(), 0); + assert_eq!(dwfa2.edit_distance(), 0); + } + + #[test] + fn test_wildcards_001() { + //modified_seq has several wildcards, but otherwise an exact match + let consensus= b"AACGGATCAAGCTTACCAGTATTTACGT"; + let baseline = b"*ACGGATCAA**TTACCA*TATTTACG*"; + + let mut dwfa = DWFALite::new(Some(b'*'), false); + dwfa.update(baseline, consensus).unwrap(); + assert_eq!(dwfa.edit_distance(), 0); + } + + #[test] + fn 
test_wildcards_002() { + //modified_seq has several wildcards, and we added 1 del, 1 SNV, and 1 ins + let consensus= b"AACGGATCAAGCTTACCAGTATTTACGT"; + let baseline = b"*ACGATCAA**TATACCA*TATCTACG*"; + + let mut dwfa = DWFALite::new(Some(b'*'), false); + dwfa.update(baseline, consensus).unwrap(); + assert_eq!(dwfa.edit_distance(), 3); + } + + #[test] + fn test_early_termination_001() { + let consensus = b"ACGTACGT"; + let baseline = b"ACGT"; + let mut dwfa = DWFALite::new(None, true); + dwfa.update(baseline, consensus).unwrap(); + assert_eq!(dwfa.edit_distance(), 0); + } + + #[test] + fn test_big_early_termination() { + let c1 = "AGCCCATTCTGGCCCCTTCCCCACATGCCAGGACAATGTAGTCCTTGTCACCAATCTGGGCAGTCAGAGTTGGGTCAGTGGGGGACACGGGATTATGGGCAAGGGTAACTGACATCTGCTCAGCCTCAACGTACCCGTCTCAAATGCGGCCAGGCGGTGGGGTAAGCAGGAATGAGGCAGGGGTGGGGTTGCCCTGAGGAGGATGATCCCAACGAGGGCGTGAGCAGGGGACCCGAGTTGGAACTACCACATTGCTTTATTGTACATTAGAGCCTCTGGCTAGGGAGCAGGCTGGGGACTAGGTACCCCATTCTAGCGGGGCACAGCACAAAGCTCATAGGGGGATGGGGTCACCAGAAAGCTGACGACACGAGAGTGGCTGGGCCGGGGCTGTCCGGCGGCCACGGAGAAGCTGAAGTGCTGCAGCAGGGAGGTGAAGAAGAGGAAGAGCTCCATGCGGGCCAGGGGCTCCCCGAGGCATGCACGGCGGCCTGTGGGGAGGGGAGGGGCGTCAGTGAGCCTGGCTCCTGGGTGATACCCCTGCAAGACTCCACGGAAGGGGACAGGGAGCCGGGCTCCCCACAGGCACCTGCTGAGAAAGGCAGGAAGGCCTCCGGCTTCACAAAGTGGCCCTGGGCATCCAGGAAGTGTTCGGGGTGGAAGCGGAAGGGCTTCTCCCAGACGGCCTCATCCTTCAGCACCGATGACAGGTTGGTGATGAGTGTCGTTCCCTGGGCAGGAGATGCAGGGTGAGAGTGGGGACTGGACTCTAGGATGCTGGGACCCCTGCCACCAAACACACGGGGGACACACACTGCCTGGCACACAGCTGGACTCTGTCAACTAGTCCTGCGCCCGAGAAGCTCCACAGTACCCTCTCCGACCCCACAGCAGGGCGCAGTCACACCTCTCAGAGGCACCCACACTGCCCCCTCTCCCTGCAGGCGCTGGGTCCTCCAACATTCTGGCAGGTCCTGGTTTGTCTCCCCACTAGACGGGGGCTCTGGATGGACAGGCCAGCCCTGCCTATACTCTGGACCCCCCACCCAAGTGGGGACAGTCAGTGTGGTGGCATTGAGGACTAGGTGGCCAGGGTTCCTAGAGTGGGCCCACCTGGCAGTAGCCATGCTGGGGCTATCACCAGGGGCTGGTGCTGAGCTGGGGTGAGGAGGGCGCCAGGCCTACCTTAGGGATGCGGAAGCCCTGTACTTCGATGTCACGGGATGTCATATGGGTCACACCCAGGGGGACGATGTCCCCAAAGCGCTGCACCTCATGAATCACGGCAGTGGTGTAGGGCATGTGAGCCTGGTCACCCATCTCTGGTCGCCGCACCTGCCCTATCACGTCGTCGATCTCCTG
TTGGACACGGACTGGACAGACATGCGTCCCCACAATGGGTCAGCACCCAGGGGACACTCTCCTTCCTCCTGTGTTGGAGGAAGTTAGGCTTACAGGAGCCTGGCCACGCCTGTGCTGGAAGCCCCGGGTGTCCCAGCTAAGCCCAGGGGCCCCCAGCTGTACCCTTCCTCCCTCAGTCCCTGCCTTGGGCCCCAGCTGGGCTCACGCTGCACATCCAGGTGTAGGATCATGAGCAGGAGGCCCCAGGCCAGCGTGGTCAAGGTGGTCACCATCCCGGCAAGGAACAGGTTACCCACCACTATGCGCAGGTTCTCATCATTGAAGCTGCTCTCAGGGCTCCCCTTGGCCTGAGCAGGGCCGAGAGGATACTCAGGGGATAGAACGGGGTAGCCCCCAAATGACCTCCAATTCTGCACCTGTCAGCCCAGATGCGGCTCGCCGGGTGATGCACTGGTCCAACCTTTTGCCCAGCCTCCCCTCATTCCTCCTGGGACGTTCAACCCACCACCCTTGCCCCCCACCGTGGCAGCCACTCTCACCTTCTCCTTCTTTGCCAGGAAGGCCTCAGTCAGGTCTCGGGGTGGCTGGGCTGGGTCCCAGGTCATCCTGTGCTCAGTTAGCAGCTCATCCAGCTGGGTCAGGAAAGCCTTTTGGAAGCGTAGGACCTTGCCAGCCAGCGCTGGGATGTGCGGGAGGACGGGGACAGCATTCAGCACCTACACCAGACAGAACCGGGTCTCAATCCTTCCTGTGCTCTGCGTTCATCTGGACCAGTCTCAGGCCCCAGCCATCTCCAGGAAGACCCAGGGCCTGCCTGTCCTTACCACTGACCTCACCAAGTCCCTCCCCAAGTGCCAGCCTCCACCCTCTCTCTCCTTGCCCAGAGGAGAAACCTAAAATCGAAATCTCCAACGTGGACGGGGGTACAGAGTCCTTGGCCTCTCCTGGTGCCCCCTGACCCGGGCACACCTCTCCCACGACCATGTCTGAGATGTCCCCTCCTCCTCCAGGCCCTTCTTACAGTGGGGTCTCCTGGAATGTCCTTTCCCAAACCCATCTACGCAAATCCTGCCCTTCGGAGGCCCCAGTCCAGCCCCGGCACCTCTCAGGAGCTCGCCCTGCAAAGACCCTTGCTCCGCACCTCGCGCAGGAAGCCCGACTCCTCCTTCGATCCCTCCCTGAGCTAGGTCCAGCAGCCTGAGGAAGCGAGGGTCGTCGTACTCGAAGCGGCGCCCGCAGGTGAGGGAGGCGATCACGTTGCTCACGGCTTTGTCCAAGAGACCGTTGGGGCGAAAGGGGCGTCCTGGGGGTGGGAGATGCGGGTAAGGGGTCGCCTTCTCCGTCCCCCGCCTTCCCAGTTCCCGCTGTGTGCCCTTCTGCCCATCACCCACCGGCTTGGTCGGCGAAGGCGGCACAAAGGCAGGCGGCCTCCTCGGTCACCCACTGCTCCAGCGACTTCTTGCCCAGGCCCAAGTTGCGCAAGGTGGACACGGAGAAGCGCCTCTGCTCGCGCCACGCGGGCCCATAGCGCGACAGGATCACCCCTGTGGGCGGGACGGACACGTGGGCGTTGCCATGAAGGCCTTGGCCCCACCCTCCGCCACCCACTCCAACCCTGGCGCTCCACAAGGTCTCCCGCAGTCCCTAGCCCGGTCCAGCTGGGCACAGGGCCCACTCTTTGCTCACCCACATTGCTCCCCTGCCTGGGGCGGGGTTTGGCCCCACCTCGTCTCTGCCCACCCTGACCACCTTTCCACTCAAGGAAGATCCCGCCCGTCCCGCCCACACTGAGCCCGCAGCATAGGCGCGGTCCCCGCCACCGCCACTTCGACGCATCAGCCTCGCCCACCGGGCTTCTGGCGGGTCTGGGCAGTAGCCCCGCCCCCTCCCAGCCCACAGACTCGCACCTCCCCCGTGCAGGTGGTTTCCTGGCCCACTGTCCTCAGCCCACTCGCTGGCCTTTATCTCTGTTTCACGTCCAGGACCCCACGCCCTGTCGG
CGCTGCTTGGGCTACGGTCACTGTCCACCCGGGGCCCACGGAAACGCGGTCTCTGTCCCCCACCGCCGCTTGCCTTGGGAACGCGGCCCGAAGCCCAGGACCTGGTAGATGGGCGCAGGCGGGCGGTCGGCCGTGTCCTCGCCGCGGGTCACCATCGCCTCGCGCACGGCCGCCAGCCCATTGAGCACGACCACCGGCGTCCAGGCCAGCTGCAGGCTGAACACGTCCCCGAAGCGGCGCCGCAACTGCAGAGGGAGGGTCAGGGCCTCTTGTCAAGCCAGGATCCCCCCAGACTACAGGTCCTAGTCCTATTTGAACCTTGGACGACCCCCGGGGCTACCAGGAGTGAGCAGGTGGAAGGAGGAGACCCAGCCTCCTGATCCTGGGGCGGGGGTGGGGGTCACACCTTCTGTGATGGAGGAACTCAGTTTGGATGCGTCACCCAGGTATGACCTTGCAAGAGTCACCAAAATTGCCGAGAGGCCCCAGTTAGCATCCCATTCCCAGATGATGGTCCATGCCGGTGAGCAGTGAGGCCCGAGGACCCACAGTGCAAAAGGTTTGAACCGGGTCACTGCACCCCCTTCATCCTCGATTTCGTGATTTAAACGGCACTCAGGACTAACTCATCTTCCATTCCCAAGGCCTTTCCTTCTGGTGTCAGCAGAAGGGACTTTGTACTCCATAACATATGTTGCCCAATGGGCTTGCATGCCCACTGCCAAGTCCAGCTCCACCTCCAGGCCCTTGCCCTACTCTTCCTTGGCCTTTGGAAAATCCAGTCCTTCATGCCATGTATAAATGTCCTTCCCCAGGACGTCCCCCAAACCTGCTTCCCCTTCTCAGCCTGGCTTCTGATCCAGCCTGTGGTTTAACCCACCACCCATGTTTGCTGGTGGTGGGGCATCCTCAGGACCTCTGCCGCCCTCCAGGACCTCCTCCCTCACCTGGTCGAAGCAGTATGGTGTGTTCTGGAAGTCCACATGCAGCAAGGTTGCCCAGCCCGGGCAGTGGCAGGGGACCTGGCGGGTAGCGTGCAGCCCAGCGTTGGTGCCGGTGCATCAGGTCCACCAGGAGCAGGAAGATGGCCACTATCATGGCCAGGGGCACCAGTGCTTCTAGCCCCATGGCTGCCTCACTACCAACTGGGCTCCTCTGGACACACCTGGCACCCCCACCCCACCAGGCACAGAGGACCAGGCAGGACACTCTCAGCACACCGAGCGCGTGACCCTTCCCTTATAAAGGGAGCTGATGATGGCCTTCGCCCTCTGCTGTGAGTGAACCTGCTGTGTTGACTGTGCTGCCAGTGGCAGAGTCAGGCCAGGGTGGGTATGGGCTGCTCCAGAGGTCCTTGCCGCTGCTTCCTGCTCCAGGCCCTTACCCAGGGTAGGGTGGTAGAAAGGCCTGGTCGGAGAAGTCACCCCCTCTCCCCACTCCAAGCTCCCCAAGCCCACACAGGCTTCTGGGATAACCAGGGTCTCAGTGGACCCGGCCATCCACCTCCCAGCTAGGCTCATACACCGTAATGTAGTCACAACCCCTCCTCCAGAACATGGCCTTGCCCTTTCCCTACCCCCACCTGCCCACTCCAGAGTGACCTTCAGCACCCTTATCTGTCACTGGCACTTACCTGGGGCCTTAGAGCTCCTGATGATGAGTGGCATCATGGGCCTGGTCCCTTCACTTCACCTTGCACTCTTGACATGCACAGACGCTATGCACACACCTGATGGTGCACAGATCTCTTGTCCACTCCCAGACACTTGTCCACTTGTTCACACTTGCAGGGACACGATTACACATGCAGAAAATCACCCACACAAAGACAATATTCACACATACACAGACTCACACTGACACTCAGGGCACACATTCTCTCTCACACACACCAGTCACACACACATACAGACCCGGCACCAAGTACCCCACTTCCCAGCCATGCCCAAGGTTTCCTGGATGGGACCTCTCCTGTCCAGAGGCTGCTCCCAGTGAGCCTC
AAAGCTGTCACGTGGATCCCAGCTCAGCCCACATTCTGGGCTCTGGCCGGGCCATGGCTTCCTGTTTGCAACAGGGCTGTTCCCAGAGCTCCCAGTTGGTAGCCTGAAGGCCCTTGCCCCAGCCTGTGACAGCATCCTCCAGGGCTGCCTGAGGGTCGTCATTCTCCACTGCTTCCTGGCCTCCATGTTTCTGATTAGAAATCTGGTGGAAACATTATGGAGGATCCTTTATTTAGGATATGTTGCTTTTTTATTTTTATTTTTTCTTTAGACAGGGTCTCACTCTGTTGCCCGGGCCGGAGTGCAGTGGCAGGATCACGGCTCACTGCAATCTCAACATCAAGTGGACCTCCTGCCTCCCAAGTAGCTGGGACTACAGGCACCACCGAGCCCAAATAATTTTTTTTTTGAGACGGAGTTTTGCTCTGTCGCCCAGGTGGGAGTGCAATGATGCGATCTCGGCTCACTGCAACCTCCACCTCCAGGGTTCAAGCGATTCTCCTGCCTCAGCCTCCCAAGTAGCTGGGATTACAGGTGCCCACCACCATGCCTGGCTGATTTTTTGTA"; + let seq_23 = "AGCCCATTCTGGCCCCTTCCCCACATGCCAGGACAATGTAGTCCTTGTCACCAATCTGGGCAGTCAGAGTTGGGTCAGTGGGGGACATGGGATTATGGGCAAGGGTAACTGACATCTGCTCAGCCTCAACGTACCCGTCTCAAATGCGGCCAGGCGGTGGGGTAAGCAGGAATGAGGCAGGGGTGGGGTTGCCCTGAGGAGGATGATCCCAACGAGGGCGTGAGCAGGGGACCCGAGTTGGAACTACCACATTGCTTTATTGTACATTAGAGCCTCTGGCTAGGGAGCAGGCTGGGGACTAGGTACCCCATTCTAGCGGGGCACAGCACAAAGCTCGTAGGGGGATGGGGTCACCAGAAAGCTGACGACACGAGAGTGGCTGGGCCGGGGCTGTCCGGCGGCCACGGAGAAGCTGAAGTGCTGCAGCAGGGAGGTGAAGAAGAGGAAGAGCTCCATGCGGGCCAGGGGCTCCCCGAGGCATGCACGGCGGCCTGTGGGGAGGGGAGGGGCGTCAGTGAGCCTGGCTCCTGGGTGATACCCCTGCAAGACTCCACGGAAGGGGACAGGGAGCCGGGCTCCCCACAGGCACCTGCTGAGAAAGGCAGGAAGGCCTCCGGCTTCACAAAGTGGCCCTGGGCATCCAGGAAGTGT"; + + // iterate, making sure everything is fine even as we go well beyond seq_23 + let mut dwfa = DWFALite::new(None, true); + for i in 0..c1.len() { + dwfa.update(seq_23.as_bytes(), c1[0..(i+1)].as_bytes()).unwrap(); + assert!(dwfa.edit_distance() <= 2); + } + assert_eq!(dwfa.edit_distance(), 2); + + // make sure finalize does not break it + dwfa.finalize(seq_23.as_bytes(), c1.as_bytes()).unwrap(); + assert_eq!(dwfa.edit_distance(), 2); + + } + + #[test] + fn test_offsets() { + // this sequence + let consensus = b"ACGTACGT"; + let baseline = b"GTACGT"; + let mut dwfa = DWFALite::new(None, true); + dwfa.set_offset(2); + dwfa.update(baseline, consensus).unwrap(); + assert_eq!(dwfa.edit_distance(), 0); + } +} diff --git a/src/example_gen.rs 
b/src/example_gen.rs new file mode 100644 index 0000000..e81edf4 --- /dev/null +++ b/src/example_gen.rs @@ -0,0 +1,64 @@ + +use rand::distributions::Uniform; +use rand::{Rng, SeedableRng}; + +/// Creates a test set we can verify is working +/// # Arguments +/// * `alphabet_size` - the length of the alphabet, e.g. for DNA it's 4 +/// * `seq_len` - the length of the sequences +/// * `num_samples` - the number of samples to generate from the consensus +/// * `error_rate` - overall error rate, assumes mismatch, insertion, and deletion are equally likely sub-components of this error rate +pub fn generate_test(alphabet_size: u8, seq_len: usize, num_samples: usize, error_rate: f64) -> (Vec, Vec>) { + assert!(alphabet_size > 1); + assert!((0.0..=1.0).contains(&error_rate)); + + let mut rng = rand::rngs::StdRng::seed_from_u64(0); + let base_distribution = Uniform::new(0, alphabet_size); + let basem1_distribution = Uniform::new(0, alphabet_size-1); + let error_distribution = Uniform::new(0.0, 1.0); + let error_type_distribution = Uniform::new(0, 3); + + let consensus: Vec = (0..seq_len) + .map(|_i| rng.sample(base_distribution)) + .collect(); + + let samples: Vec> = (0..num_samples) + .map(|_i| { + + let mut seq = vec![]; + let mut con_index = 0; + while con_index < consensus.len() { + let c = consensus[con_index]; + let is_error = rng.sample(error_distribution) < error_rate; + if is_error { + let error_type = rng.sample(error_type_distribution); + match error_type { + 0 => { + // substition + let sub_offset = rng.sample(basem1_distribution); + let alt_c = (c+sub_offset) % alphabet_size; + seq.push(alt_c); + con_index += 1; + }, + 1 => { + // deletion + con_index += 1; + }, + 2 => { + //insertion + let s = rng.sample(base_distribution); + seq.push(s); + }, + _ => panic!("no impl") + } + } else { + seq.push(c); + con_index += 1; + } + } + seq + }) + .collect(); + + (consensus, samples) +} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 
index 0000000..7b9938c --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,55 @@ +/*! +# waffle_con +This library provides access to consensus algorithms based on building a consensus with dynamic WFA exploration. + +Key benefits: +* Ability to generate different consensus types: single consensus, dual consensus (e.g., diplotype), or multi-consensus (e.g., unknown number of consensuses) +* Generally efficient for low noise inputs +* Does not rely on a sequence backbone + +Performance notes: +* The underlying algorithm scales with WFA, so high error rates will increase compute time and memory consumption +* Certain branching patterns are more expensive than others, we plan to improve this in the future + +# Example usage +```rust +use waffle_con::consensus::ConsensusDWFA; + +let sequences = [ + b"ACGT".to_vec(), + b"ACCGT".to_vec(), // this should be the consensus + b"ACCCGT".to_vec() +]; + +// add all the sequences +let mut cdwfa: ConsensusDWFA = Default::default(); +for s in sequences.iter() { + cdwfa.add_sequence(s).unwrap(); +} + +// run consensus and check the results +let consensuses = cdwfa.consensus().unwrap(); +assert_eq!(consensuses.len(), 1); +assert_eq!(consensuses[0].sequence(), sequences[1]); +assert_eq!(consensuses[0].scores(), &[1, 0, 1]); +``` +*/ + +/// Configuration for ConsensusDWFA +pub mod cdwfa_config; +/// Main functionality for the consensus component +pub mod consensus; +/// Functionality for a 2-way consensus +pub mod dual_consensus; +/// Main functionality for a dynamic WFA, which will just focus on a single string +pub mod dynamic_wfa; +/// Utility for generating examples +pub mod example_gen; +/// Main functionality for a multiple-consensus component +pub mod multi_consensus; +/// Utility for tracking a pqueue without force deleting items +pub mod pqueue_tracker; +/// Main functionality for a priority multiple-consensus component, allowing differentiation on multiple sequences in priority order +pub mod priority_consensus; +/// Basic pair-wise 
alignment utilities +pub mod sequence_alignment; diff --git a/src/multi_consensus.rs b/src/multi_consensus.rs new file mode 100644 index 0000000..542bb10 --- /dev/null +++ b/src/multi_consensus.rs @@ -0,0 +1,95 @@ + +/*! +Initially, this module provided access to a MultiConsensus approach. +This approach has been deprecated for now (see PriorityConsensus), but the output type `MultiConsensus` is still useful in many contexts. +*/ + +use crate::consensus::Consensus; + +/// Contains a final multi-consensus result +#[derive(Debug, PartialEq)] +pub struct MultiConsensus { + /// Results for the consensuses + consensuses: Vec, + /// For each input sequence, this is the index of the assigned consensus + sequence_indices: Vec +} + +impl MultiConsensus { + /// General Constructor for MultiConsensus. + /// Of note, this will re-order alleles alpha-numerically, allowing for predictable outputs. + /// This re-ordering will alter the sequence_indices to match the new order. + /// # Arguments + /// * `consensuses` - the actual consensus sequences with scores + /// * `sequence_indices` - a mapping point out which consensus the sequence ended up in + pub fn new(mut consensuses: Vec, sequence_indices: Vec) -> MultiConsensus { + // sort the consensuses + let mut ordered_indices = (0..consensuses.len()).collect::>(); + ordered_indices.sort_by(|i, j| { + let ci = consensuses[*i].sequence(); + let cj = consensuses[*j].sequence(); + ci.cmp(cj) + }); + + // now create the reverse lookup table + let mut reverse_lookup = vec![usize::MAX; consensuses.len()]; + for (new_index, &old_index) in ordered_indices.iter().enumerate() { + reverse_lookup[old_index] = new_index; + } + + // we can now just re-sort the consensuses in place, this is generally better than cloning and such + // TODO: is there some way to re-use the ordered indices above? 
not sure there is that is more efficient + consensuses.sort_by(|c1, c2| { + c1.sequence().cmp(c2.sequence()) + }); + + // re-map the sequence indices + let sequence_indices: Vec = sequence_indices.iter() + .map(|&bci| reverse_lookup[bci]) + .collect(); + + MultiConsensus { + consensuses, + sequence_indices + } + } + + // Getters + pub fn consensuses(&self) -> &[Consensus] { + &self.consensuses + } + + pub fn sequence_indices(&self) -> &[usize] { + &self.sequence_indices + } +} + +#[cfg(test)] +mod tests { + use crate::cdwfa_config::ConsensusCost; + + use super::*; + + #[test] + fn test_multiconsensus_sort() { + let consensuses = vec![ + Consensus::new(b"ACGT".to_vec(), ConsensusCost::L1Distance, vec![0]), + Consensus::new(b"TGCA".to_vec(), ConsensusCost::L1Distance, vec![0]), + Consensus::new(b"AAAA".to_vec(), ConsensusCost::L1Distance, vec![0]), + ]; + let sequence_indices = vec![ + 2, 0, 1 + ]; + let multicon = MultiConsensus::new(consensuses, sequence_indices); + + // these should now be sorted by sequence with the sequence_indices adjusted to match the new order + assert_eq!(multicon, MultiConsensus { + consensuses: vec![ + Consensus::new(b"AAAA".to_vec(), ConsensusCost::L1Distance, vec![0]), + Consensus::new(b"ACGT".to_vec(), ConsensusCost::L1Distance, vec![0]), + Consensus::new(b"TGCA".to_vec(), ConsensusCost::L1Distance, vec![0]), + ], + sequence_indices: vec![0, 1, 2] + }); + } +} \ No newline at end of file diff --git a/src/pqueue_tracker.rs b/src/pqueue_tracker.rs new file mode 100644 index 0000000..e5f7eff --- /dev/null +++ b/src/pqueue_tracker.rs @@ -0,0 +1,172 @@ + +use log::trace; +use simple_error::bail; + +/// Struct for tracking the length of results in our queue that are greater than some threshold. +/// This allows us to track how long a queue actually is when we know we will ignore events smaller than the threshold. +/// Currently, it does not contain the queue itself, which may be worth doing in the long term. 
+/// It also tracks the number of "processed" items separately, allowing a user to set a capacity at each size and report when that capacity is reached. +#[derive(Debug)] +pub struct PQueueTracker { + /// The count of each haplotype size in the queue + length_counts: Vec, + /// The total number of haplotypes in the queue with length >= threshold + total_count: usize, + /// The minimum threshold for a haplotype to count + threshold: usize, + /// The count of each haplotype size that was processed + processed_counts: Vec, + /// The maximum number of elements allowed at each index + capacity_per_size: usize +} + +impl PQueueTracker { + /// Creates a new tracker with a given initial size. + /// # Arguments + /// * `initial_size` - the initial size of the tracked elements + /// * `capacity_per_size` - the maximum capacity at each element + pub fn with_capacity(initial_size: usize, capacity_per_size: usize) -> PQueueTracker { + PQueueTracker { + length_counts: vec![0; initial_size], + total_count: 0, + threshold: 0, + processed_counts: vec![0; initial_size], + capacity_per_size + } + } + + /// Inserts a length to our tracker + /// # Arguments + /// * `value` - the length to track + pub fn insert(&mut self, value: usize) { + if value >= self.length_counts.len() { + self.length_counts.resize(value+1, 0); + } + self.length_counts[value] += 1; + if value >= self.threshold { + self.total_count += 1; + } + } + + /// Removes a length from the tracker + /// # Arguments + /// * `value` - the length to remove from tracking + pub fn remove(&mut self, value: usize) { + assert!(self.length_counts[value] > 0); + self.length_counts[value] -= 1; + if value >= self.threshold { + assert!(self.total_count > 0); + self.total_count -= 1; + } + } + + /// Wrapper for increasing by one + pub fn increment_threshold(&mut self) { + self.increase_threshold(self.threshold+1); + } + + /// Increased the threshold of what is included in our total count + /// # Arguments + /// * `new_threshold` - the 
new minimum threshold to track, must be >= current threshold + pub fn increase_threshold(&mut self, new_threshold: usize) { + assert!(new_threshold >= self.threshold); + trace!("increase_threshold => {}, size = {}", self.threshold, self.total_count); + for t in self.threshold..new_threshold { + self.total_count -= self.length_counts[t]; + } + self.threshold = new_threshold; + trace!("increase_threshold => {}, size = {}", self.threshold, self.total_count); + } + + /// Attempts to mark a new value as processed. + /// # Arguments + /// * `value` - the length to track + pub fn process(&mut self, value: usize) -> Result<(), Box> { + if value >= self.processed_counts.len() { + self.processed_counts.resize(value+1, 0); + } + + if self.processed_counts[value] >= self.capacity_per_size { + bail!("Capacity is full"); + } + + // increment the value + self.processed_counts[value] += 1; + Ok(()) + } + + /// Returns the number of processed elements at a particular level + /// # Arguments + /// * `value` - the length to track + pub fn processed(&self, value: usize) -> usize { + if value >= self.processed_counts.len() { + 0 + } else { + self.processed_counts[value] + } + } + + /// Returns true if we are at capacity for a particular occupancy + /// # Arguments + /// * `value` - the length to track + pub fn at_capacity(&self, value: usize) -> bool { + self.processed(value) >= self.capacity_per_size + } + + /// Returns the total number of haplotypes in the queue with length >= the internal threshold. 
+ pub fn len(&self) -> usize { + self.total_count + } + + /// Returns the total number of tracked elements, regardless of the threshold + pub fn unfiltered_len(&self) -> usize { + self.length_counts.iter().sum() + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + pub fn threshold(&self) -> usize { + self.threshold + } + + /// Returns the number of tracked elements at a particular level + /// # Arguments + /// * `value` - the length to track + pub fn occupancy(&self, value: usize) -> usize { + if value >= self.length_counts.len() { + 0 + } else { + self.length_counts[value] + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic_capacity() { + let mut capacity_tracker = PQueueTracker::with_capacity(0, 2); + + // currently empty + assert!(!capacity_tracker.at_capacity(1)); + assert_eq!(capacity_tracker.processed(1), 0); + + // add 1 of 2 + capacity_tracker.process(1).unwrap(); + assert!(!capacity_tracker.at_capacity(1)); + assert_eq!(capacity_tracker.processed(1), 1); + + // add 2 if 2 + capacity_tracker.process(1).unwrap(); + assert!(capacity_tracker.at_capacity(1)); + assert_eq!(capacity_tracker.processed(1), 2); + + // it's full now, this should fail without changing occupancy + assert!(capacity_tracker.process(1).is_err()); + assert_eq!(capacity_tracker.processed(1), 2); // should not have been modified when we failed to insert + } +} \ No newline at end of file diff --git a/src/priority_consensus.rs b/src/priority_consensus.rs new file mode 100644 index 0000000..3b2197e --- /dev/null +++ b/src/priority_consensus.rs @@ -0,0 +1,655 @@ +/*! +This module provides access to the PriorityConsensusDWFA, which iteratively uses a DualConsensusDWFA on multiple provided sequences. +In the classic paired example, it will first generate DualConsensus on the first sequences in the pair and then further split on the second sequence (if possible). +Additionally, the reported consensus would be a pair of consensus sequences. 
+ +# Example usage +```rust +use waffle_con::consensus::Consensus; +use waffle_con::priority_consensus::PriorityConsensusDWFA; +use waffle_con::cdwfa_config::ConsensusCost; + +let sequences_chains = [ + vec![b"TCCGT".to_vec(), b"TCCGT".to_vec()], // con 1 + vec![b"TCCGT".to_vec(), b"TCCGT".to_vec()], // con 1 + vec![b"TCCGT".to_vec(), b"TCCGT".to_vec()], // con 1 + vec![b"TCCGT".to_vec(), b"ACGGT".to_vec()], // con 2 + vec![b"TCCGT".to_vec(), b"ACGGT".to_vec()], // con 2 + vec![b"TCCGT".to_vec(), b"ACGGT".to_vec()], // con 2 + vec![b"ACGT".to_vec(), b"ACCCGGTT".to_vec()], // con 3 + vec![b"ACGT".to_vec(), b"ACCCGGTT".to_vec()], // con 3 + vec![b"ACGT".to_vec(), b"ACCCGGTT".to_vec()], // con 3 +]; + +// add all the sequences +let mut cdwfa: PriorityConsensusDWFA = Default::default(); +for s_chain in sequences_chains.iter() { + let ref_chain: Vec<&[u8]> = s_chain.iter().map(|c| c.as_slice()).collect(); + cdwfa.add_sequence_chain(ref_chain).unwrap(); +} + +// run consensus and check results +let consensuses = cdwfa.consensus().unwrap(); +assert_eq!(consensuses.consensuses(), &[ + // these are in alphabetically ordered by the chains; in the example these chains are pairs (i.e. 
length 2) + vec![ + Consensus::new(b"ACGT".to_vec(), ConsensusCost::L1Distance, vec![0; 3]), + Consensus::new(b"ACCCGGTT".to_vec(), ConsensusCost::L1Distance, vec![0; 3]) + ], + vec![ + Consensus::new(b"TCCGT".to_vec(), ConsensusCost::L1Distance, vec![0; 6]), // this is shared between consensus 1 and 2, so it has costs for both + Consensus::new(b"ACGGT".to_vec(), ConsensusCost::L1Distance, vec![0; 3]) + ], + vec![ + Consensus::new(b"TCCGT".to_vec(), ConsensusCost::L1Distance, vec![0; 6]), // this is shared between consensus 1 and 2, so it has costs for both + Consensus::new(b"TCCGT".to_vec(), ConsensusCost::L1Distance, vec![0; 3]) + ] +]); +assert_eq!(consensuses.sequence_indices(), &[ + 2, 2, 2, 1, 1, 1, 0, 0, 0 +]); +``` +*/ + +use itertools::Itertools; +use log::debug; +use rustc_hash::FxHashSet as HashSet; +use simple_error::bail; + +use crate::cdwfa_config::{CdwfaConfig, ConsensusCost}; +use crate::consensus::Consensus; +use crate::dual_consensus::DualConsensusDWFA; + +/// Contains a final multi-consensus result +#[derive(Debug, PartialEq)] +pub struct PriorityConsensus { + /// Results for the consensuses + consensuses: Vec>, + /// For each input sequence set, this is the index of the assigned consensus + sequence_indices: Vec +} + +impl PriorityConsensus { + pub fn new(consensuses: Vec>, sequence_indices: Vec) -> PriorityConsensus { + PriorityConsensus { + consensuses, + sequence_indices + } + } + + // Getters + pub fn consensuses(&self) -> &[Vec] { + &self.consensuses + } + + pub fn sequence_indices(&self) -> &[usize] { + &self.sequence_indices + } +} + +/// Core utility that will generate a consensus sequence using the DWFA approach. +/// This approach is a wrapper for MultiConsensusDWFA that allows for a consensus to get split based on multiple sequences in a priority order. +/// For example, this may be used to first split sequences by homo-polymer compressed sequences and then the full-length sequences. 
+/// All consensuses are returned as a "chain" of consensuses that a user can then make decisions with. +#[derive(Debug, Default)] +pub struct PriorityConsensusDWFA<'a> { + /// Contains all the sequences that have been added to this consensus so far. + sequences: Vec>, + /// Approximate offsets into the consensus that sequence starts, this will get adjust at run-time. If None, then assume start. + offsets: Vec>>, + /// Optional start seeds for running consensus + seed_groups: Vec>, + /// The config for this consensus run + config: CdwfaConfig, + /// The alphabet we will use for consensus building + alphabet: HashSet +} + +impl<'a> PriorityConsensusDWFA<'a> { + /// Creates a new instance of ConsensusDWFA and performs sanity checks. + /// # Arguments + /// * `config` - the type of consensus score we want to use + /// # Errors + /// * None so far + pub fn with_config(config: CdwfaConfig) -> Result, Box> { + Ok(PriorityConsensusDWFA { + sequences: vec![], + offsets: vec![], + seed_groups: vec![], + config, + alphabet: Default::default() + }) + } + + /// Wrapper function for adding an unseeded sequence chain to the list. 
+ /// # Arguments + /// * `sequences` - the new sequences to add + /// # Errors + /// * Same as `add_seeded_sequence(...)` + pub fn add_sequence_chain(&mut self, sequences: Vec<&'a [u8]>) -> Result<(), Box> { + let offsets = vec![None; sequences.len()]; + self.add_seeded_sequence_chain(sequences, offsets, None) + } + + /// Adds a new sequence chain to the list with an optional seed group + /// # Arguments + /// * `sequences` - the new sequences to add + /// * `offset` - the approximate offset of the sequence, use None if at the start + /// * `seed_group` - an optional starting seed grouping for this sequence + /// # Errors + /// * None so far + pub fn add_seeded_sequence_chain(&mut self, sequences: Vec<&'a [u8]>, offsets: Vec>, seed_group: Option) -> Result<(), Box> { + // check for the errors that can break assumptions later + if sequences.is_empty() { + bail!("Must provide a non-empty sequences Vec"); + } + + if !self.sequences.is_empty() && self.sequences[0].len() != sequences.len() { + bail!("Expected sequences Vec of length {}, but got one of length {}", self.sequences[0].len(), sequences.len()); + } + + // add ever character to our alphabet + for sequence in sequences.iter() { + self.alphabet.extend(sequence.iter().cloned()); + } + + // make sure we didn't add the wildcard by accident + // note: it's easier to just remove it once here than do something fancy with the above iteration + if let Some(wc) = self.config.wildcard { + self.alphabet.remove(&wc); + } + + // now add the sequences + self.sequences.push(sequences); + self.offsets.push(offsets); + self.seed_groups.push(seed_group); + Ok(()) + } + + /// The core function that gets called after adding all the sequence sets we care about + /// # Errors + /// * if the DWFA throws errors + pub fn consensus(&self) -> Result> { + // initialize the first set with all the sequences + let max_split_level = self.sequences[0].len(); + let mut to_split: Vec> = vec![]; + let mut split_levels: Vec = vec![]; + let mut 
consensus_chains: Vec> = vec![]; + + // figure out the list of initial seeds, typicall all are "None" + let initial_group_keys: HashSet> = self.seed_groups.iter().cloned().collect(); + for igk in initial_group_keys.iter() { + // build a vec of which ones have this particular seed and add it to the list + let initial_group_set: Vec = self.seed_groups.iter() + .map(|sg| sg == igk) + .collect(); + + // pushing this grouping, and indicate we are looking at the first set of sequences (0) + to_split.push(initial_group_set); + split_levels.push(0); + consensus_chains.push(vec![]); + } + + let mut consensuses = vec![]; + let mut assignments = vec![]; + while let Some(include_set) = to_split.pop() { + // also pop the current split level + let current_split_level = split_levels.pop().unwrap(); + let mut current_consensus_chain = consensus_chains.pop().unwrap(); + + // build a dual consensus DWFA for these sequences + let mut dc_dwfa = DualConsensusDWFA::with_config(self.config.clone())?; + debug!("Calling Dual at level {current_split_level} with: {include_set:?}"); + + for (&include, (seq, offset)) in include_set.iter().zip(self.sequences.iter().zip(self.offsets.iter())) { + if include { + dc_dwfa.add_sequence_offset(seq[current_split_level], offset[current_split_level])?; + } + } + + // now solve it + let dc_result = dc_dwfa.consensus()?; + if dc_result.len() > 1 { + debug!("Multiple dual consensuses detected, arbitrarily selecting first option."); + } + + let chosen_result = &dc_result[0]; + debug!("Parsing result with {} consensuses...", if chosen_result.is_dual() { 2 } else { 1 }); + + /* + // this will print the consensus sequences + println!(">c1\n{}", std::str::from_utf8(chosen_result.consensus1().sequence()).unwrap()); + if chosen_result.is_dual() { + println!(">c2\n{}", std::str::from_utf8(chosen_result.consensus2().unwrap().sequence()).unwrap()); + } + + let index1: usize = 257; // this one is going into UNKNOWN eventually, why are they splitting? 
+ let index2 = 301; // this one is resolving to D7 + // /* + if include_set[index1] && include_set[index2] { + debug!("BOTH INSIDE {index1} & {index2}"); + } + // */ + + // this was a debugging where we would score the full WFA ed of each sequence + let mut ic_index = 0; + for (i, seq_chain) in self.sequences.iter().enumerate() { + if include_set[i] { + let require_both_end = false; // set to true for end-to-end ED + let s1 = crate::sequence_alignment::wfa_ed_config(chosen_result.consensus1().sequence(), seq_chain[current_split_level], require_both_end, self.config.wildcard); + let s2 = if chosen_result.is_dual() { + crate::sequence_alignment::wfa_ed_config(chosen_result.consensus2().unwrap().sequence(), seq_chain[current_split_level], require_both_end, self.config.wildcard) + } else { + usize::MAX + }; + debug!("\tseq_{i} => {s1} | {s2} => is_ci = {}; {:?} | {:?}", chosen_result.is_consensus1()[ic_index], chosen_result.scores1()[ic_index], chosen_result.scores2()[ic_index]); + ic_index += 1; + /* + if i == index1 || i == index2 { + println!(">seq_{i}\n{}", std::str::from_utf8(seq_chain[current_split_level]).unwrap()); + } + // */ + } + } + // */ + + if chosen_result.is_dual() { + // consensus sequences actually don't matter at this point, we only care about how they were split + let is_c1 = chosen_result.is_consensus1(); + let mut is_c1_index: usize = 0; + + // these are the new clusters + let mut assign1 = vec![false; self.sequences.len()]; + let mut assign2 = vec![false; self.sequences.len()]; + + for (i, &included) in include_set.iter().enumerate() { + if included { + // this one was part of the include_set + if is_c1[is_c1_index] { + // this one should go to the first one + assign1[i] = true; + } else { + assign2[i] = true; + } + is_c1_index += 1; + } + } + + // make sure we have written the correct number of things + assert_eq!(is_c1.len(), is_c1_index); + + // now add both new sets for re-splitting + to_split.push(assign1); + 
split_levels.push(current_split_level); // level has not increased since we found a split + consensus_chains.push(current_consensus_chain.clone()); // consensus chain thus far is shared + + to_split.push(assign2); + split_levels.push(current_split_level); // level has not increased since we found a split + consensus_chains.push(current_consensus_chain); // consensus chain thus far is shared + } else { + let new_split_level = current_split_level + 1; + current_consensus_chain.push(chosen_result.consensus1().clone()); + + if new_split_level == max_split_level { + // we found no split at the final level, time to return the consensus + // consensuses.push(chosen_result.consensus1().clone()); + consensuses.push(current_consensus_chain); + assignments.push(include_set); + } else { + // we want to try to split at the next split level + to_split.push(include_set); + split_levels.push(new_split_level); + + // we found a consensus for this level, save it to the chain and add it back in to the looping + consensus_chains.push(current_consensus_chain); + } + } + } + + let (consensuses, sequence_indices) = if consensuses.len() > 1 { + // probably need to sort and correct ordering, etc. 
+ let mut indices = vec![usize::MAX; self.sequences.len()]; + let sorted_cons = consensuses.into_iter().zip(assignments) + .sorted_by(|a, b| { + let chain_a = a.0.iter().map(|c| c.sequence()).collect::>(); + let chain_b = b.0.iter().map(|c| c.sequence()).collect::>(); + chain_a.cmp(&chain_b) + }) + .enumerate() + .map(|(con_index, (consensus, assignments))| { + for (i, &assigned) in assignments.iter().enumerate() { + if assigned { + // make sure we have not assigned this one + assert_eq!(indices[i], usize::MAX); + indices[i] = con_index; + } + } + consensus + }) + .collect(); + + (sorted_cons, indices) + } else { + // single value, pass through + (consensuses, vec![0; self.sequences.len()]) + }; + + Ok(PriorityConsensus { + consensuses, + sequence_indices + }) + } + + // getters + pub fn sequences(&self) -> &[Vec<&'a [u8]>] { + &self.sequences + } + + pub fn consensus_cost(&self) -> ConsensusCost { + self.config.consensus_cost + } + + pub fn alphabet(&self) -> &HashSet { + &self.alphabet + } +} + + +#[cfg(test)] +mod tests { + use super::*; + + use std::path::PathBuf; + + use crate::cdwfa_config::CdwfaConfigBuilder; + + #[derive(Debug, serde::Deserialize)] + struct MultiRecord { + consensus: usize, + edits: usize, + sequence: String + } + + /// Wrapper test function that loads a multi test from a csv file. + /// Expected columns are "consensus" (integer >= 1), "edits" (u64), and "sequence" (String). + /// The "sequence" column is a ";" separated series of sequences. + /// The "edits" column is largely ignored except for marking the true consensus. The exact ED testing is in other tooling. + /// Returns a tuple of (sequence_chains, PriorityConsensus). 
+ /// # Arguments + /// * `filename` - the file path to load + /// * `include_consensus` - if True, it will load the first consensus read into the sequences + /// * `cost_mode` - the cost mode getting tested + fn load_priority_csv_test(filename: &std::path::Path, include_consensus: bool, cost_mode: ConsensusCost) -> (Vec>>, PriorityConsensus) { + // contains the consensus sequences + let mut consensuses: Vec>> = vec![]; + // contains the edit distances for mappings the sequences to the consensus + // let mut edit_distances: Vec> = vec![]; + + // contains the sequences to get into the consensus algorithm + let mut sequence_chains: Vec>> = vec![]; + // contains the index of the consensus they _should_ map to + let mut sequence_indices: Vec = vec![]; + + let mut csv_reader = csv::ReaderBuilder::new() + .has_headers(true) + .from_path(filename) + .unwrap(); + for row in csv_reader.deserialize() { + let record: MultiRecord = row.unwrap(); + assert!(record.consensus >= 1, "consensus column must be >= 1"); + let con_index = record.consensus - 1; + let edits = match cost_mode { + ConsensusCost::L1Distance => record.edits, + ConsensusCost::L2Distance => record.edits.pow(2) + }; + let sequence_chain: Vec> = record.sequence + .split(';') + .map(|s| s.as_bytes().to_vec()) + .collect(); + + if con_index >= consensuses.len() { + // an index is beyond our current length, extend + consensuses.resize(con_index+1, vec![]); + // edit_distances.resize(con_index+1, vec![]); + } + + // check if this is the consensus + if edits == 0 && consensuses[con_index].is_empty() { + // this is the consensus + consensuses[con_index] = sequence_chain.clone(); + // everything after here is for normal reads, so early exit if we don't include the read + if !include_consensus { + continue; + } + } + + // edit_distances[con_index].push(edits); + + sequence_chains.push(sequence_chain); + sequence_indices.push(con_index); + } + + // make sure all consensuses are non-empty + 
assert!(consensuses.iter().all(|v| !v.is_empty())); + // make sure all consensuses have at least 1 read, how else would we ever find it? + assert!(sequence_chains.iter().all(|edv| !edv.is_empty())); + + // figure out the consensus order which is just the sorted order + for c in consensuses.iter() { + println!("{c:?}"); + } + + for i in 0..consensuses.len() { + for j in (i+1)..consensuses.len() { + println!("{}.cmp({}) = {:?}", i, j, consensuses[i].cmp(&consensuses[j])); + } + } + + // future: I feel like this is over-complicated + // first figure out the correct index order + let argsort: Vec = (0..consensuses.len()) + .sorted_by_key(|&i| &consensuses[i]) + .collect(); + // now convert that into a lookup table we can use for ordering them + let arg_lookup: Vec = (0..consensuses.len()) + .sorted_by_key(|&i| argsort[i]) + .collect(); + + // reorder everything based on the argsort lookup + let consensuses: Vec>> = consensuses.into_iter().enumerate() + .sorted_by_key(|(i, _v)| arg_lookup[*i]) + .map(|(_i, v)| v) + .collect(); + /* + let edit_distances: Vec> = edit_distances.into_iter().enumerate() + .sorted_by_key(|(i, _v)| arg_lookup[*i]) + .map(|(_i, v)| v) + .collect(); + */ + let sequence_indices: Vec = sequence_indices.into_iter() + .map(|i| arg_lookup[i]) + .collect(); + + // finally make the multi-consensus result + let con_vec = consensuses.into_iter() + .map(|con_chain| { + con_chain.into_iter().map(|con| { + Consensus::new(con, cost_mode, vec![]) + }) + .collect() + }) + .collect(); + + let consensus = PriorityConsensus { + consensuses: con_vec, + sequence_indices + }; + + (sequence_chains, consensus) + } + + /// Entry point for most file-based tests. 
+ /// # Arguments + /// * `filename` - the test file to load, will be a csv + /// * `include_consensus` - if True, this will include the consensus sequence line as a read + /// * `opt_config` - optional alternate config, otherwise a default is created + fn run_test_file(filename: &str, include_consensus: bool, opt_config: Option) { + let config = opt_config.unwrap_or( + CdwfaConfigBuilder::default() + .wildcard(Some(b'*')) + .build().unwrap() + ); + + let (sequences, expected_consensus) = load_priority_csv_test(&PathBuf::from(filename), include_consensus, config.consensus_cost); + + // set queue size large since we do not want to worry about filtering for this test + let mut consensus_dwfa = PriorityConsensusDWFA::with_config(config).unwrap(); + for sequence_chain in sequences.iter() { + let vec_ref: Vec<&[u8]> = sequence_chain.iter() + .map(|s| s.as_slice()) + .collect(); + consensus_dwfa.add_sequence_chain(vec_ref).unwrap(); + } + + // make sure our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // now check that the consensus is the same as our sequence + let consensus = consensus_dwfa.consensus().unwrap(); + // assert_eq!(consensus, expected_consensus); // we don't load edit distances, so we can't directly compare this + assert_eq!(consensus.sequence_indices(), expected_consensus.sequence_indices()); // check consensus assignment + assert_eq!(consensus.consensuses().len(), expected_consensus.consensuses().len()); // check consensus length + for (cc, ecc) in consensus.consensuses().iter().zip(expected_consensus.consensuses().iter()) { + assert_eq!(cc.len(), ecc.len()); // check chain length + for (c, ec) in cc.iter().zip(ecc.iter()) { + assert_eq!(c.sequence(), ec.sequence()); // check equality of the sequences in the chain + } + } + } + + // these tests are just copied from the others to verify nothing breaks with single chains + #[test] + fn test_csv_dual_001() { + run_test_file("./tests/dual_001.csv", true, None); + } + 
+ #[test] + fn test_multi_exact_001() { + run_test_file("./tests/multi_exact_001.csv", true, None); + } + + #[test] + fn test_multi_exact_002() { + run_test_file("./tests/multi_exact_002.csv", true, None); + } + + #[test] + fn test_multi_err_001() { + run_test_file("./tests/multi_err_001.csv", false, None); + } + + #[test] + fn test_multi_err_002() { + run_test_file("./tests/multi_err_002.csv", false, None); + } + + + #[test] + fn test_multi_samesplit_001() { + // this one tests four sequences that all have a unique symbol [ACGT] at the same position, so we must split into all four at once + run_test_file("./tests/multi_samesplit_001.csv", true, None); + } + + #[test] + fn test_multi_postcon_001() { + // this one tests a scenario where the consensus will get split off correctly, but we need to re-run to get the best overall consensus sequence + run_test_file("./tests/multi_postcon_001.csv", true, Some( + CdwfaConfigBuilder::default() + .wildcard(Some(b'*')) + .min_count(2) // has to be lower here due to the errors we inject + .build().unwrap() + )); + } + + // now begins the multi-tests + #[test] + fn test_single_sequence() { // simple double chain with a single entry + let sequence = b"ACGTACGTACGT"; + let mut consensus_dwfa = PriorityConsensusDWFA::default(); + consensus_dwfa.add_sequence_chain(vec![sequence, sequence]).unwrap(); + + // make sure our alphabet has four symbols, ACGT + assert_eq!(consensus_dwfa.alphabet().len(), 4); + + // now check that the consensus is the same as our sequence + let consensus = consensus_dwfa.consensus().unwrap(); + assert_eq!(consensus, PriorityConsensus { + consensuses: vec![ + vec![Consensus::new( + sequence.to_vec(), + ConsensusCost::L1Distance, + vec![0] + ); 2] + ], + sequence_indices: vec![0] + }); + } + + #[test] + fn doc_test_example() { + let sequences_chains = [ + vec![b"TCCGT".to_vec(), b"TCCGT".to_vec()], // con 1 + vec![b"TCCGT".to_vec(), b"TCCGT".to_vec()], // con 1 + vec![b"TCCGT".to_vec(), b"TCCGT".to_vec()], // 
con 1 + vec![b"TCCGT".to_vec(), b"ACGGT".to_vec()], // con 2 + vec![b"TCCGT".to_vec(), b"ACGGT".to_vec()], // con 2 + vec![b"TCCGT".to_vec(), b"ACGGT".to_vec()], // con 2 + vec![b"ACGT".to_vec(), b"ACCCGGTT".to_vec()], // con 3 + vec![b"ACGT".to_vec(), b"ACCCGGTT".to_vec()], // con 3 + vec![b"ACGT".to_vec(), b"ACCCGGTT".to_vec()], // con 3 + ]; + + // add all the sequences + let mut cdwfa: PriorityConsensusDWFA = Default::default(); + for s_chain in sequences_chains.iter() { + let ref_chain: Vec<&[u8]> = s_chain.iter().map(|c| c.as_slice()).collect(); + cdwfa.add_sequence_chain(ref_chain).unwrap(); + } + + // run consensus and check results + let consensuses = cdwfa.consensus().unwrap(); + assert_eq!(consensuses.consensuses(), &[ + // these are in alphabetically order by the consensus content + vec![ + Consensus::new(b"ACGT".to_vec(), ConsensusCost::L1Distance, vec![0; 3]), + Consensus::new(b"ACCCGGTT".to_vec(), ConsensusCost::L1Distance, vec![0; 3]) + ], + vec![ + Consensus::new(b"TCCGT".to_vec(), ConsensusCost::L1Distance, vec![0; 6]), + Consensus::new(b"ACGGT".to_vec(), ConsensusCost::L1Distance, vec![0; 3]) + ], + vec![ + Consensus::new(b"TCCGT".to_vec(), ConsensusCost::L1Distance, vec![0; 6]), + Consensus::new(b"TCCGT".to_vec(), ConsensusCost::L1Distance, vec![0; 3]) + ] + ]); + assert_eq!(consensuses.sequence_indices(), &[ + 2, 2, 2, 1, 1, 1, 0, 0, 0 + ]); + } + + #[test] + fn test_priority_001() { + run_test_file("./tests/priority_001.csv", true, None); + } + + #[test] + fn test_priority_002() { + run_test_file("./tests/priority_002.csv", true, None); + } + + #[test] + fn test_priority_003() { + run_test_file("./tests/priority_003.csv", true, None); + } +} \ No newline at end of file diff --git a/src/sequence_alignment.rs b/src/sequence_alignment.rs new file mode 100644 index 0000000..5b79089 --- /dev/null +++ b/src/sequence_alignment.rs @@ -0,0 +1,87 @@ + +use std::cmp::max; + +/// Returns the full edit distance between two u8 Vecs by using a version of 
WFA. +/// # Arguments +/// * `v1` - the first Vec +/// * `v2` - the second Vec +/// # Examples +/// ```rust +/// use waffle_con::sequence_alignment::wfa_ed; +/// let v1: Vec = vec![0, 1, 2, 4, 5]; +/// let v2: Vec = vec![0, 1, 3, 4, 5]; +/// let v3: Vec = vec![1, 2, 3, 5]; +/// assert_eq!(wfa_ed(&v1, &v1), 0); +/// assert_eq!(wfa_ed(&v1, &v2), 1); +/// assert_eq!(wfa_ed(&v1, &v3), 2); +/// ``` +pub fn wfa_ed(v1: &[u8], v2: &[u8]) -> usize { + wfa_ed_config(v1, v2, true, Some(b'*')) +} + +/// Returns the edit distance between two u8 Vecs by using a version of WFA. +/// If `require_both_end` is true, it requires the full end-to-end edit distance. If false, then it only requires that `v2` is at the end. +/// # Arguments +/// * `v1` - the first Vec +/// * `v2` - the second Vec +/// * `require_both_end` - if true, it requires the full end-to-end edit distance; if false, then it only requires that `v2` is at the end +/// # Examples +/// ```rust +/// use waffle_con::sequence_alignment::wfa_ed_config; +/// let v1: Vec = vec![0, 1, 2, 4, 5]; +/// let v2: Vec = vec![0, 1, 2, 4]; +/// assert_eq!(wfa_ed_config(&v1, &v2, false, Some(b'*')), 0); +/// assert_eq!(wfa_ed_config(&v1, &v2, true, Some(b'*')), 1); +/// ``` +pub fn wfa_ed_config(v1: &[u8], v2: &[u8], require_both_end: bool, wildcard: Option) -> usize { + //we need the lengths to know where we are in the vecs + let l1 = v1.len(); + let l2 = v2.len(); + + //stores the next indices that should be compared + let mut curr_wf: Vec<(usize, usize)> = vec![(0, 0)]; + let mut next_wf: Vec<(usize, usize)> = vec![(0, 0); 3]; + let mut edits = 0; + + //main idea is to iterate until we're at the end of BOTH vecs, this is guaranteed because i and j monotonically increase + loop { + //during each iteration, we go over all wavefronts; at iteration e, there are 2*e+1 current wavefronts that will generate 2*(e+1)+1 wavefronts + //"e" in this context corresponds to the edit distance "edits" + for (wf_index, &wf) in 
curr_wf.iter().enumerate() { + let mut i = wf.0; + let mut j = wf.1; + + // as long as the symbols match, keep moving along the diagonal + while i < l1 && j < l2 && (v1[i] == v2[j] || wildcard.map_or(false, |w| v1[i] == w) || wildcard.map_or(false, |w| w == v2[j])) { + i += 1; + j += 1; + } + + if (i == l1 || !require_both_end) && j == l2 { + //we found the end, return the number of edits required to get here + return edits; + } + else if i == l1 { + //push the wavefront, but i cannot increase + next_wf[wf_index] = max(next_wf[wf_index], (i, j)); + next_wf[wf_index+1] = max(next_wf[wf_index+1], (i, j+1)); + next_wf[wf_index+2] = max(next_wf[wf_index+2], (i, j+1)); + } else if j == l2 { + //push the wavefront, but j cannot increase + next_wf[wf_index] = max(next_wf[wf_index], (i+1, j)); + next_wf[wf_index+1] = max(next_wf[wf_index+1], (i+1, j)); + next_wf[wf_index+2] = max(next_wf[wf_index+2], (i, j)); + } else { + //v1 and v2 do not match at i, j; add mismatch, insert, and del to the next wavefront + next_wf[wf_index] = max(next_wf[wf_index], (i+1, j)); //v2 has a deletion relative to v1 + next_wf[wf_index+1] = max(next_wf[wf_index+1], (i+1, j+1)); //v2 has a mismatch relative to v1 + next_wf[wf_index+2] = max(next_wf[wf_index+2], (i, j+1)); //v2 has an insertion relative to v1 + } + } + + //we finished this wave, increment the edit count and generate the buffer for the next wavefront + edits += 1; + curr_wf = next_wf; + next_wf = vec![(0, 0); 3+2*edits]; + } +} diff --git a/tests/dual_001.csv b/tests/dual_001.csv new file mode 100644 index 0000000..e90f303 --- /dev/null +++ b/tests/dual_001.csv @@ -0,0 +1,11 @@ +consensus,edits,sequence +1,0,ACACACAGTATGAAGTAGACACGTTGACGTACCCAGGTTAACACAAAGT +1,4,ACCACAGTATGAAGTAGACACGCTGACGTACCAGGTTAACACAAGT +1,2,*CACCCAGTATGAAGTAGACACGTTGACGTACCCAGGTTAACCCAAAG* +1,4,ACACACAGTATAAGTAGACACGTGACCCGTACCCAGGTTAACACAAAGT +1,2,ACACACATATGAAGTAGACACGTTGACGTACCCAGGTTAACACAAAGCT +2,0,ACACCCAGTATGAAGTAGACACGGTGACGTACCTAGGTTAACACAAAGT 
+2,0,ACACCCAGTATGAAGTAGACACGGTGACGTACCTAGGTTAACACAAAGT +2,0,ACACCCAGTATGAAGTAGACACGGTGACGTACCTAGGTTAACACAAAGT +2,0,ACACCCAGTATGAAGTAGACACGGTGACGTACCTAGGTTAACACAAAGT +2,0,ACACCCAGTATGAAGTAGACACGGTGACGTACCTAGGTTAACACAAAGT \ No newline at end of file diff --git a/tests/dual_early_termination_001.csv b/tests/dual_early_termination_001.csv new file mode 100644 index 0000000..b35e04f --- /dev/null +++ b/tests/dual_early_termination_001.csv @@ -0,0 +1,11 @@ +consensus,edits,sequence +1,0,ACACACAGTATGAAGTAGACACGTTGACGTACCCAGGTTAACACAAAGT +1,0,ACACACAGTATGAAGTAGACACGTTGACGTACCCA +1,0,ACACACAGTATGAAGTAGACACGTTGACGTACCCAGGTTAACAC +1,0,ACACACAGTATGAAGTAGACACG +1,2,ACACAGTATGAAGTAGACACGTTGACGTACCCAGGTTAACACAAAGT +2,0,ACACCCAGTATGAAGTAGACACGGTGACGTACCTAGGTTAACACAAAGT +2,1,ACACCCAGTATGAAGTAAGACAC +2,1,ACACGCAGTATGAAGTAGACACGGTGACGTACCTAGGTTA +2,1,AACCCAGTATGAAGTAGACACGGTGACGTACCTAGGTTAACACAAAGT +2,2,ACACCCAGTATGAAGTAGACACGGTGACGTACCTAGGTTCACAAAGT \ No newline at end of file diff --git a/tests/length_gap_001.csv b/tests/length_gap_001.csv new file mode 100644 index 0000000..97d8fb4 --- /dev/null +++ b/tests/length_gap_001.csv @@ -0,0 +1,11 @@ +consensus,edits,sequence +1,0,ACGTACGTACGTACGT +1,0,ACGTACGTACGTACGT +1,0,ACGTACGTACGTACGT +1,0,ACGTACGTACGTACGT +1,0,ACGTACGTACGTACGT +2,0,ACGTACGTTTTTTTTTTTTTTTTTTTTTTTTTTTTACGTACGT +2,2,ACGTACGTTTTTCTTTTTTTTTTTTTTTTATTTTTTACGTACGT +2,2,ACGTACGTTTTTTTTTTTAATTTTTTTTTTTTTTTACGTACGT +2,1,ACGTACGTTTTTTTTTTTTTTTTTTATTTTTTTTTACGTACGT +2,1,ACGTACGTTTTTTTGTTTTTTTTTTTTTTTTTTTTTACGTACGT \ No newline at end of file diff --git a/tests/multi_err_001.csv b/tests/multi_err_001.csv new file mode 100644 index 0000000..7e28458 --- /dev/null +++ b/tests/multi_err_001.csv @@ -0,0 +1,13 @@ +consensus,edits,sequence +1,0,ACACACAGTATGAAGTAGACACGTTGACGTACCCAGGTTAACACAAAGT +1,4,ACACACAAGTATGCAGTAGACACGTAGACGTACCAGGTTAACACAAAGT +1,2,CACACAGTATGAAGTAGACACGTTGACGTACCCAGGTTAACACAAAGTA +1,1,ACACACAGTATGAAGTAGACACGTTTGACGTACCCAGGTTAACACAAAGT 
+2,0,ACACACAGCATGAAGTAGACACTTTGACGTACACAGGTTAAACACAAAGT +2,2,CCACACAGCATGAAGTAGACACTTTGACGTACACAGGTTAAACACAAGT +2,4,ACACACAGCATGAAAGTAGACACTTTGACGTAACACAGGTTAAAACACAAAAGT +2,2,ACACACAGCAATGAAGTAGACACTTTTGACGTACACAGGTTAAACACAAAGT +3,0,ACAACAGCATGAATTAGACACTTTTGACGTACACAGGTTAAAACACAAAGT +3,1,ACAACAGCATGAATTAGACACTTTTGACGTACACAGGTTAAAACACAAGT +3,1,ACAACAGCATGAATTAGACACTTTTGACGTTACACAGGTTAAAACACAAAGT +3,0,ACAACAGCATGAATTAGACACTTTTGACGTACACAGGTTAAAACACAAAGT \ No newline at end of file diff --git a/tests/multi_err_002.csv b/tests/multi_err_002.csv new file mode 100644 index 0000000..9fb498b --- /dev/null +++ b/tests/multi_err_002.csv @@ -0,0 +1,25 @@ +consensus,edits,sequence +1,0,ACACACAGTATGAAGTAGACACGTTGACGTACCCAGGTTAACACAAAGTACACACAGTATGAAGTAGACACGTTGACGTACCCAGGTTAACACAAAGT +2,0,ACAAACAGTATGAAGAAGACACGTGACGTACCCCAGGTTAACACACAGTACACACACTATGAAGTAGACACGATGACGTACCCAGGTTAACATAAAGT +3,0,ACAAACAGTATGAGTAGACACGTTTGACGTACCCCAGGTAACACAAAGTACACCCAGTATGAAGTAGACACGTTGACGTACACAGGTAAACACAAAGT +4,0,ACACACAGTATGAATTAGACACGTTTGACGTACCAGGTTAACACAAAGAACACACACTATGAAGTAGACACGTTTGACGACCCAGGGTTAACACAAAGT +1,3,ACACACAGTATGAAGTAGAGTTGACGTACCCAGGTTAACACAAAGTACACACAGTATGAAGTAGACACGTTGACGTACCCAGGTTAACACAAAGT +1,3,ACACACAGTATGAAGTAGACACGTTGACGTACCCAGGTTAACACAAAGTACACACAGTATGAAGTAGACTTGACGTACCCAGGTTAACACAAAGT +1,3,ACACACAGTATGAAGTAGACACGTTGACGTACCCAGGTTAACACAAAGTACAAGTATGAAGTAGACACGTTGACGTACCCAGGTTAACACAAAGT +1,3,ACACACAGTATGAAGTAGACACGTTGAACCCAGGTTAACACAAAGTACACACAGTATGAAGTAGACACGTTGACGTACCCAGGTTAACACAAAGT +1,3,ACACACAGTATGAAGTAGACACGTTGACGTACCCAGAACACAAAGTACACACAGTATGAAGTAGACACGTTGACGTACCCAGGTTAACACAAAGT +2,3,ACAAACAGTATGAAAAAGAAGACACGTGACGTACCCCAGGTTAACACACAGTACACACACTATGAAGTAGACACGATGACGTACCCAGGTTAACATAAAGT +2,3,ACAAACAGTATGAAGAAGACACGTGACGTACCCCAGGTTAACACACAAAAGTACACACACTATGAAGTAGACACGATGACGTACCCAGGTTAACATAAAGT +2,3,ACAAACAGTATGAAGAAGACACGTGACGAAATACCCCAGGTTAACACACAGTACACACACTATGAAGTAGACACGATGACGTACCCAGGTTAACATAAAGT 
+2,3,ACAAACAGTATGAAGAAGACACGTGACGTACCCCAGGTTAACAAAACACAGTACACACACTATGAAGTAGACACGATGACGTACCCAGGTTAACATAAAGT +2,3,ACAAACAGTATGAAGAAGACACGTGACGTACCCCAGGTTAACACACAGTACACACACTATGAAGTAGACACGATGACGTAAAACCCAGGTTAACATAAAGT +3,3,ACAAACAGTATGAGTAGACACGTTTGACGTACCCCAGGTAACACCCCGTACACCCAGTATGAAGTAGACACGTTGACGTACACAGGTAAACACAAAGT +3,3,ACAAACAGTATGAGTAGACACGTTTGACGTACCCCAGGTAACACAAAGTACACCCAGTATGAAGTAGACACCCCGACGTACACAGGTAAACACAAAGT +3,3,ACAAACAGTATGAGTAGACACGTTTGACGTACCCCAGGTAACACAAAGTACACCCCCCATGAAGTAGACACGTTGACGTACACAGGTAAACACAAAGT +3,3,ACAAACAGTATGAGTAGACACGTTTGACGTACCCCACCCAACACAAAGTACACCCAGTATGAAGTAGACACGTTGACGTACACAGGTAAACACAAAGT +3,3,ACAAACAGTATGAGTAGACACGCCCGACGTACCCCAGGTAACACAAAGTACACCCAGTATGAAGTAGACACGTTGACGTACACAGGTAAACACAAAGT +4,1,ACACACAGTATGAATTAGACACGTTTGACGTACCAGGTTAACACAAAGAACACACACTATGAAGTAGACACGTTTGACGACCCAGGGTTAACACAAAG +4,1,CACACAGTATGAATTAGACACGTTTGACGTACCAGGTTAACACAAAGAACACACACTATGAAGTAGACACGTTTGACGACCCAGGGTTAACACAAAGT +4,3,ACACACAGTATGAATTAGACACGACGTTTGACGTACCAGGTTAACACAAAGAACACACACTATGAAGTAGACACGTTTGACGACCCAGGGTTAACACAAAGT +4,2,ACACAGTATGAATTAGACACGTTTGACGTACCAGGTTAACACAAAGAACACACACTATGAAGTAGACACGTTTGACGACCCAGGGTTAACACAAAGT +4,3,ACACACAGTATGAATTAGACACGTTTGACGTACCAGGTTAACACGAACACACACTATGAAGTAGACACGTTTGACGACCCAGGGTTAACACAAAGT \ No newline at end of file diff --git a/tests/multi_exact_001.csv b/tests/multi_exact_001.csv new file mode 100644 index 0000000..0a0d063 --- /dev/null +++ b/tests/multi_exact_001.csv @@ -0,0 +1,10 @@ +consensus,edits,sequence +1,0,ACACACAGTATGAAGTAGACACGTTGACGTACCCAGGTTAACACAAAGT +1,0,ACACACAGTATGAAGTAGACACGTTGACGTACCCAGGTTAACACAAAGT +1,0,ACACACAGTATGAAGTAGACACGTTGACGTACCCAGGTTAACACAAAGT +2,0,ACACACAGCATGAAGTAGACACTTTGACGTACACAGGTTAAACACAAAGT +2,0,ACACACAGCATGAAGTAGACACTTTGACGTACACAGGTTAAACACAAAGT +2,0,ACACACAGCATGAAGTAGACACTTTGACGTACACAGGTTAAACACAAAGT +3,0,ACAACAGCATGAATTAGACACTTTTGACGTACACAGGTTAAAACACAAAGT +3,0,ACAACAGCATGAATTAGACACTTTTGACGTACACAGGTTAAAACACAAAGT +3,0,ACAACAGCATGAATTAGACACTTTTGACGTACACAGGTTAAAACACAAAGT \ No newline 
at end of file diff --git a/tests/multi_exact_002.csv b/tests/multi_exact_002.csv new file mode 100644 index 0000000..8d8ee7b --- /dev/null +++ b/tests/multi_exact_002.csv @@ -0,0 +1,13 @@ +consensus,edits,sequence +1,0,TAAT +1,0,TAAT +1,0,TAAT +2,0,TAAAAAAAAAAAAAAAAAAAAAAAAC +2,0,TAAAAAAAAAAAAAAAAAAAAAAAAC +2,0,TAAAAAAAAAAAAAAAAAAAAAAAAC +3,0,TAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAG +3,0,TAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAG +3,0,TAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAG +4,0,TAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT +4,0,TAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT +4,0,TAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT \ No newline at end of file diff --git a/tests/multi_postcon_001.csv b/tests/multi_postcon_001.csv new file mode 100644 index 0000000..ed8476f --- /dev/null +++ b/tests/multi_postcon_001.csv @@ -0,0 +1,10 @@ +consensus,edits,sequence +1,0,ACGATA +1,0,ACGATA +1,1,AAGATA +2,0,AAGCAA +2,0,AAGCAA +2,0,AAGCAA +3,0,AAGGAA +3,0,AAGGAA +3,1,AAGGAT \ No newline at end of file diff --git a/tests/multi_samesplit_001.csv b/tests/multi_samesplit_001.csv new file mode 100644 index 0000000..627986c --- /dev/null +++ b/tests/multi_samesplit_001.csv @@ -0,0 +1,13 @@ +consensus,edits,sequence +1,0,AAAAA +1,0,AAAAA +1,0,AAAAA +2,0,AACAA +2,0,AACAA +2,0,AACAA +3,0,AAGAA +3,0,AAGAA +3,0,AAGAA +4,0,AATAA +4,0,AATAA +4,0,AATAA \ No newline at end of file diff --git a/tests/priority_001.csv b/tests/priority_001.csv new file mode 100644 index 0000000..fb98ea9 --- /dev/null +++ b/tests/priority_001.csv @@ -0,0 +1,10 @@ +consensus,edits,sequence +1,0,AAAA;CCCC +1,0,AAAA;CCCC +1,0,AAAA;CCCC +2,0,AAAA;TTTT +2,0,AAAA;TTTT +2,0,AAAA;TTTT +3,0,GGGG;CCCC +3,0,GGGG;CCCC +3,0,GGGG;CCCC \ No newline at end of file diff --git a/tests/priority_002.csv b/tests/priority_002.csv new file mode 100644 
index 0000000..93e5562 --- /dev/null +++ b/tests/priority_002.csv @@ -0,0 +1,10 @@ +consensus,edits,sequence +1,0,AAAA;CCCC +1,0,AAAA;CCCC +1,0,AAAA;CCCC +2,0,AAAA;TTTT +2,0,AAAA;TTTT +2,0,AAAA;TTTT +1,0,AAAA;CCCC +1,0,AAAA;CCCC +1,1,AGAA;CCCC \ No newline at end of file diff --git a/tests/priority_003.csv b/tests/priority_003.csv new file mode 100644 index 0000000..5135096 --- /dev/null +++ b/tests/priority_003.csv @@ -0,0 +1,13 @@ +consensus,edits,sequence +1,0,ACGT;AACCCGGGGGT +1,0,ACGT;AACCCGGGGGT +1,0,ACGT;AACCCGGGGGT +1,0,ACGT;AACCCGGGGGT +1,0,ACGT;AACCCGGGGGT +2,0,ACGT;ACCCCGGGGTT +2,0,ACGT;ACCCCGGGGTT +2,0,ACGT;ACCCCGGGGTT +3,0,ACGT;AAACCGGGGTT +3,0,ACGT;AAACCGGGGTT +3,0,ACGT;AAACCGGGGTT +3,0,ACGT;AAACCGGGGTT \ No newline at end of file