From d33ee119a7c63a75a2b77224ce07854966d0df1e Mon Sep 17 00:00:00 2001 From: Patrick Marks Date: Sat, 16 Oct 2021 09:39:43 -0700 Subject: [PATCH 1/3] switch to const generics for selecting K --- Cargo.toml | 2 +- src/kmer.rs | 240 ++++++++-------------------------------------------- src/test.rs | 8 +- 3 files changed, 39 insertions(+), 211 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5542b17e..08464814 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "debruijn" -version = "0.3.4" +version = "0.4.0" authors = ["Patrick Marks "] license = "MIT" edition = '2018' diff --git a/src/kmer.rs b/src/kmer.rs index e89bd6fc..37d5b605 100644 --- a/src/kmer.rs +++ b/src/kmer.rs @@ -39,7 +39,6 @@ use serde_derive::{Deserialize, Serialize}; use std; use std::fmt; use std::hash::Hash; -use std::marker::PhantomData; use crate::bits_to_base; use crate::Kmer; @@ -51,47 +50,47 @@ use crate::Mer; pub type Kmer64 = IntKmer; /// 48-base kmer, backed by a single u128 -pub type Kmer48 = VarIntKmer; +pub type Kmer48 = VarIntKmer; /// 40-base kmer, backed by a single u128 -pub type Kmer40 = VarIntKmer; +pub type Kmer40 = VarIntKmer; /// 32-base kmer, backed by a single u64 pub type Kmer32 = IntKmer; /// 30-base kmer, backed by a single u64 -pub type Kmer30 = VarIntKmer; +pub type Kmer30 = VarIntKmer; /// 24-base kmer, backed by a single u64 -pub type Kmer24 = VarIntKmer; +pub type Kmer24 = VarIntKmer; /// 20-base kmer, backed by a single u64 -pub type Kmer20 = VarIntKmer; +pub type Kmer20 = VarIntKmer; /// 16-base kmer, backed by a single u32 pub type Kmer16 = IntKmer; /// 15-base kmer, backed by a single u32 -pub type Kmer15 = VarIntKmer; +pub type Kmer15 = VarIntKmer; /// 14-base kmer, backed by a single u32 -pub type Kmer14 = VarIntKmer; +pub type Kmer14 = VarIntKmer; /// 12-base kmer, backed by a single u32 -pub type Kmer12 = VarIntKmer; +pub type Kmer12 = VarIntKmer; /// 10-base kmer, backed by a single u32 -pub type Kmer10 = VarIntKmer; +pub type Kmer10 = VarIntKmer; /// 8-base kmer, backed by a single u16 pub type Kmer8 = IntKmer; -pub type Kmer6 = VarIntKmer; -pub type Kmer5 = VarIntKmer; +pub type Kmer6 = VarIntKmer; +pub type Kmer5 = VarIntKmer; pub type Kmer4 = IntKmer; -pub type Kmer3 = VarIntKmer; -pub type Kmer2 = VarIntKmer; +pub type Kmer3 = VarIntKmer; +pub type Kmer2 = VarIntKmer; /// Trait for specialized integer operations used in DeBruijn Graph pub trait IntHelp: PrimInt + FromPrimitive { @@ -416,6 +415,7 @@ impl fmt::Debug for IntKmer { } } + /// Helper trait for declaring the K value of a Kmer. Will be removed when const generics are available pub trait KmerSize: Ord + Hash + Copy + fmt::Debug { #[allow(non_snake_case)] @@ -432,22 +432,20 @@ pub trait KmerSize: Ord + Hash + Copy + fmt::Debug { /// sorting the integer will give a lexicographic sorting of the corresponding string. /// kmers that don't fill `storage` are always aligned to the least signifcant bits #[derive(Copy, Clone, PartialEq, PartialOrd, Eq, Ord, Hash, Serialize, Deserialize)] -pub struct VarIntKmer { +pub struct VarIntKmer { pub storage: T, - pub phantom: PhantomData, } -impl Kmer for VarIntKmer { +impl Kmer for VarIntKmer { fn empty() -> Self { VarIntKmer { storage: T::zero(), - phantom: PhantomData, } } #[inline] fn k() -> usize { - Self::_k() + KS } fn to_u64(&self) -> u64 { @@ -457,7 +455,6 @@ impl Kmer for VarIntK fn from_u64(v: u64) -> Self { VarIntKmer { storage: Self::t_from_u64(v), - phantom: PhantomData, } } @@ -466,7 +463,6 @@ impl Kmer for VarIntK let new = self.storage >> 2; let mut kmer = VarIntKmer { storage: new, - phantom: PhantomData, }; kmer.set_mut(0, v); kmer @@ -476,7 +472,6 @@ impl Kmer for VarIntK let new = self.storage << 2 & !Self::top_mask(0); let mut kmer = VarIntKmer { storage: new, - phantom: PhantomData, }; kmer.set_mut(Self::k() - 1, v); kmer @@ -489,7 +484,7 @@ impl Kmer for VarIntK } } -impl VarIntKmer { +impl VarIntKmer { #[inline(always)] fn msk() -> T { T::one() << 1 | T::one() @@ -514,16 +509,10 @@ impl VarIntKmer usize { - KS::K() - } - // Bits used by this kmer #[inline(always)] fn _bits() -> usize { - Self::_k() * 2 + KS * 2 } #[inline(always)] @@ -557,10 +546,10 @@ impl VarIntKmer Mer for VarIntKmer { +impl Mer for VarIntKmer { #[inline(always)] fn len(&self) -> usize { - Self::_k() + KS } /// Get the letter at the given position. @@ -622,7 +611,6 @@ impl Mer for VarIntKm VarIntKmer { storage: new, - phantom: PhantomData, } } @@ -642,8 +630,8 @@ impl Mer for VarIntKm mask_lower.count_ones() } } - -impl fmt::Debug for VarIntKmer { + +impl fmt::Debug for VarIntKmer { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut s = String::new(); for pos in 0..Self::k() { @@ -654,167 +642,7 @@ impl fmt::Debug for V } } -/// Marker struct for generating K=48 Kmers -#[derive(Debug, Hash, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] -pub struct K48; - -impl KmerSize for K48 { - #[inline(always)] - fn K() -> usize { - 48 - } -} - -/// Marker trait for generating K=40 Kmers -#[derive(Debug, Hash, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] -pub struct K40; - -impl KmerSize for K40 { - #[inline(always)] - fn K() -> usize { - 40 - } -} - -/// Marker trait for generating K=31 Kmers -#[derive(Debug, Hash, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] -pub struct K31; - -impl KmerSize for K31 { - #[inline(always)] - fn K() -> usize { - 31 - } -} - -/// Marker trait for generating K=30 Kmers -#[derive(Debug, Hash, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] -pub struct K30; - -impl KmerSize for K30 { - #[inline(always)] - fn K() -> usize { - 30 - } -} - -/// Marker trait for generating K=24 Kmers -#[derive(Debug, Hash, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] -pub struct K24; - -impl KmerSize for K24 { - #[inline(always)] - fn K() -> usize { - 24 - } -} - -/// Marker trait for generating K=20 Kmers -#[derive(Debug, Hash, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] -pub struct K20; - -impl KmerSize for K20 { - #[inline(always)] - fn K() -> usize { - 20 - } -} - -/// Marker trait for generating K=14 Kmers -#[derive(Debug, Hash, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] -pub struct K15; - -impl KmerSize for K15 { - #[inline(always)] - fn K() -> usize { - 15 - } -} - -/// Marker trait for generating K=14 Kmers -#[derive(Debug, Hash, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] -pub struct K14; - -impl KmerSize for K14 { - #[inline(always)] - fn K() -> usize { - 14 - } -} - -/// Marker trait for generating K=12 Kmers -#[derive(Debug, Hash, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] -pub struct K12; - -impl KmerSize for K12 { - #[inline] - fn K() -> usize { - 12 - } -} - -/// Marker trait for generating K=12 Kmers -#[derive(Debug, Hash, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] -pub struct K10; - -impl KmerSize for K10 { - #[inline] - fn K() -> usize { - 10 - } -} - -/// Marker trait for generating K=6 Kmers -#[derive(Debug, Hash, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] -pub struct K6; -impl KmerSize for K6 { - #[inline(always)] - fn K() -> usize { - 6 - } -} - -/// Marker trait for generating K=6 Kmers -#[derive(Debug, Hash, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] -pub struct K5; - -impl KmerSize for K5 { - #[inline(always)] - fn K() -> usize { - 5 - } -} -/// Marker trait for generating K=6 Kmers -#[derive(Debug, Hash, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] -pub struct K4; - -impl KmerSize for K4 { - #[inline(always)] - fn K() -> usize { - 4 - } -} -/// Marker trait for generating K=6 Kmers -#[derive(Debug, Hash, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] -pub struct K3; - -impl KmerSize for K3 { - #[inline(always)] - fn K() -> usize { - 3 - } -} -/// Marker trait for generating K=6 Kmers -#[derive(Debug, Hash, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] -pub struct K2; - -impl KmerSize for K2 { - #[inline(always)] - fn K() -> usize { - 2 - } -} #[cfg(test)] mod tests { @@ -1012,7 +840,7 @@ mod tests { #[test] fn test_lmer_3_kmer_48() { for _ in 0..10000 { - check_vmer::, VarIntKmer>(); + check_vmer::, VarIntKmer>(); } } @@ -1033,14 +861,14 @@ mod tests { #[test] fn test_lmer_1_kmer_24() { for _ in 0..10000 { - check_vmer::, VarIntKmer>(); + check_vmer::, VarIntKmer>(); } } #[test] fn test_lmer_1_kmer_20() { for _ in 0..10000 { - check_vmer::, VarIntKmer>(); + check_vmer::, VarIntKmer>(); } } @@ -1061,7 +889,7 @@ mod tests { #[test] fn test_kmer_48() { for _ in 0..10000 { - check_kmer::>(); + check_kmer::>(); } } @@ -1075,21 +903,21 @@ mod tests { #[test] fn test_kmer_31() { for _ in 0..10000 { - check_kmer::>(); + check_kmer::>(); } } #[test] fn test_kmer_24() { for _ in 0..10000 { - check_kmer::>(); + check_kmer::>(); } } #[test] fn test_kmer_20() { for _ in 0..10000 { - check_kmer::>(); + check_kmer::>(); } } @@ -1103,28 +931,28 @@ mod tests { #[test] fn test_kmer_15() { for _ in 0..10000 { - check_kmer::>(); + check_kmer::>(); } } #[test] fn test_kmer_14() { for _ in 0..10000 { - check_kmer::>(); + check_kmer::>(); } } #[test] fn test_kmer_12() { for _ in 0..10000 { - check_kmer::>(); + check_kmer::>(); } } #[test] fn test_kmer_10() { for _ in 0..10000 { - check_kmer::>(); + check_kmer::>(); } } diff --git a/src/test.rs b/src/test.rs index 4a195653..fa78294a 100644 --- a/src/test.rs +++ b/src/test.rs @@ -150,7 +150,7 @@ mod tests { use crate::dna_string::DnaString; use crate::filter; use crate::kmer::Kmer6; - use crate::kmer::{IntKmer, VarIntKmer, K31}; + use crate::kmer::{IntKmer, VarIntKmer}; use crate::msp; use std::ops::Sub; @@ -179,7 +179,7 @@ mod tests { .map(crate::base_to_bits) .collect(); - reassemble_contigs::, DnaString>(vec![seq.clone(), seq], false); + reassemble_contigs::, DnaString>(vec![seq.clone(), seq], false); } #[test] @@ -192,7 +192,7 @@ mod tests { .map(crate::base_to_bits) .collect(); - reassemble_sharded::, DnaString>(vec![seq.clone(), seq], false); + reassemble_sharded::, DnaString>(vec![seq.clone(), seq], false); } #[test] @@ -221,7 +221,7 @@ mod tests { fn complex_path_compress_k31() { for _ in 0..100 { let contigs = random_contigs(); - simplify_from_kmers::>(contigs, false); + simplify_from_kmers::>(contigs, false); } } From 488b8ccf661ed7805dc55c8ba1a5f290ed6c191e Mon Sep 17 00:00:00 2001 From: Patrick Marks Date: Sat, 16 Oct 2021 09:42:43 -0700 Subject: [PATCH 2/3] fmt --- src/kmer.rs | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/src/kmer.rs b/src/kmer.rs index 37d5b605..3b6a53bf 100644 --- a/src/kmer.rs +++ b/src/kmer.rs @@ -415,7 +415,6 @@ impl fmt::Debug for IntKmer { } } - /// Helper trait for declaring the K value of a Kmer. Will be removed when const generics are available pub trait KmerSize: Ord + Hash + Copy + fmt::Debug { #[allow(non_snake_case)] @@ -438,9 +437,7 @@ pub struct VarIntKmer { impl Kmer for VarIntKmer { fn empty() -> Self { - VarIntKmer { - storage: T::zero(), - } + VarIntKmer { storage: T::zero() } } #[inline] @@ -461,18 +458,14 @@ impl Kmer for VarI /// Shift the base v into the left end of the kmer fn extend_left(&self, v: u8) -> Self { let new = self.storage >> 2; - let mut kmer = VarIntKmer { - storage: new, - }; + let mut kmer = VarIntKmer { storage: new }; kmer.set_mut(0, v); kmer } fn extend_right(&self, v: u8) -> Self { let new = self.storage << 2 & !Self::top_mask(0); - let mut kmer = VarIntKmer { - storage: new, - }; + let mut kmer = VarIntKmer { storage: new }; kmer.set_mut(Self::k() - 1, v); kmer } @@ -609,9 +602,7 @@ impl Mer for VarIn new = new >> up_shift; } - VarIntKmer { - storage: new, - } + VarIntKmer { storage: new } } fn at_count(&self) -> u32 { @@ -630,8 +621,10 @@ impl Mer for VarIn mask_lower.count_ones() } } - -impl fmt::Debug for VarIntKmer { + +impl fmt::Debug + for VarIntKmer +{ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut s = String::new(); for pos in 0..Self::k() { @@ -642,8 +635,6 @@ impl fmt::Debug fo } } - - #[cfg(test)] mod tests { use super::*; From f1d5a82418df3524a98e9128013f1c45fb7022cc Mon Sep 17 00:00:00 2001 From: Patrick Marks Date: Mon, 18 Oct 2021 15:50:26 -0700 Subject: [PATCH 3/3] fmt and add a few more tests --- src/array_kmer.rs | 480 ++++++++++++++++++++++++++++++++++++++++++++++ src/dna_string.rs | 6 +- src/kmer.rs | 95 +++++---- src/lib.rs | 1 + 4 files changed, 531 insertions(+), 51 deletions(-) create mode 100644 src/array_kmer.rs diff --git a/src/array_kmer.rs b/src/array_kmer.rs new file mode 100644 index 00000000..bac744a9 --- /dev/null +++ b/src/array_kmer.rs @@ -0,0 +1,480 @@ +//! Kmers represented by an array of integers. + +use num_traits::AsPrimitive; +use num_traits::FromPrimitive; +use num_traits::PrimInt; +use std; +use std::fmt; +use std::fmt::Debug; +use std::hash::Hash; + +use crate::bits_to_base; +use crate::kmer::IntHelp; +use crate::Kmer; +use crate::Mer; + +/// The [`ArrayKmer`] struct implements a [`Kmer`] backed by an +/// array of integers. The type of the integers, the number of integers in the array and value of `K` are +/// specified with generic and const-generic parameters. The type of the integer is specified as `T`, the `K` value is specified as `KS`, and the number of +/// elements of the backing array is `B`. +/// +/// This implementation of [`Kmer`] will generally be slower +/// than [`crate::kmer::IntKmer`] or [`crate::kmer::VarIntKmer`], but can be smaller by having less padding, and can go up to +/// K > 64, which is the current limit for [`crate::kmer::IntKmer`]. +/// +/// Implementation note: any padding required must always occupy the highest-order bits of the first element of the storage array. +/// The padding bits _must_ be maintained as 0 by all manipulations of the kmer. +/// The bits are ordered such that the natural ordering of the storage array corresponds to the lexicographic ordering of the kmers. +#[derive(Copy, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)] +pub struct ArrayKmer { + pub storage: [T; B], +} + +impl< + T: 'static + PrimInt + FromPrimitive + Hash + IntHelp + Debug, + const KS: usize, + const B: usize, + > Kmer for ArrayKmer +where + u64: AsPrimitive, +{ + fn empty() -> Self { + ArrayKmer { + storage: [T::zero(); B], + } + } + + #[inline] + fn k() -> usize { + KS + } + + fn to_u64(&self) -> u64 { + let mut r = 0u64; + for i in 0..B { + let v = self.storage[B - i - 1].to_u64().unwrap(); + r = r | v << (i * Self::slot_bits()); + } + + r + } + + fn from_u64(v: u64) -> Self { + let mut new = Self::empty(); + + // shift v to the upper bits of the u64, which is what copy_bases_u64 expects + let vshift = v << (64 - KS * 2); + + new.copy_bases_u64(vshift, KS, 0); + new + } + + /// Shift the base v into the left end of the kmer + fn extend_left(&self, v: u8) -> Self { + let mut new: [T; B] = [T::zero(); B]; + + new[0] = self.storage[0] >> 2; + for i in 1..B { + new[i] = self.storage[i] >> 2 | self.storage[i - 1] << (Self::slot_bits() - 2); + } + + let mut kmer = ArrayKmer { storage: new }; + + kmer.set_mut(0, v); + kmer + } + + fn extend_right(&self, v: u8) -> Self { + let mut new = [T::zero(); B]; + + for i in 0..B - 1 { + new[i] = self.storage[i] << 2 | self.storage[i + 1] >> (Self::slot_bits() - 2); + + if i == 0 { + let lower_mask = Self::mask_below((Self::slot_bases() - Self::padding_bases()) * 2); + new[i] = new[i] & lower_mask; + } + } + new[B - 1] = self.storage[B - 1] << 2; + + let mut kmer = ArrayKmer { storage: new }; + kmer.set_mut(Self::k() - 1, v); + kmer + } + + fn hamming_dist(&self, other: Self) -> u32 { + let mut total_diffs = 0; + for i in 0..B { + let bit_diffs = self.storage[i] ^ other.storage[i]; + let two_bit_diffs = (bit_diffs | bit_diffs >> 1) & IntHelp::lower_of_two(); + total_diffs += two_bit_diffs.count_ones(); + } + + total_diffs + } +} + +impl< + T: 'static + PrimInt + FromPrimitive + Hash + IntHelp + Debug, + const KS: usize, + const B: usize, + > ArrayKmer +where + u64: AsPrimitive, +{ + #[inline(always)] + fn msk() -> T { + T::one() << 1 | T::one() + } + + fn to_byte(v: T) -> u8 { + T::to_u8(&v).unwrap() + } + + fn t_from_byte(v: u8) -> T { + T::from_u8(v).unwrap() + } + + fn t_from_u64(v: u64) -> T { + // move upper bits of + let shift = std::cmp::max(64 - Self::slot_bits(), 0); + let b = v >> shift; + b.as_() + } + + #[inline(always)] + fn slot_bits() -> usize { + std::mem::size_of::() * 8 + } + + #[inline(always)] + fn slot_bases() -> usize { + Self::slot_bits() / 2 + } + + #[inline(always)] + fn padding_bases() -> usize { + B * Self::slot_bases() - KS + } + + /// return (slot, pos_in_slot, bitpos_in_slot) for base posistion `pos` + #[inline(always)] + fn addr(pos: usize) -> (usize, usize, usize) { + let padding = Self::padding_bases(); + let padded_pos = pos + padding; + + let slot = padded_pos / Self::slot_bases(); + let pos_in_slot = padded_pos % Self::slot_bases(); + + let top_base = Self::slot_bases() - 1; + let bitpos = (top_base - pos_in_slot) * 2; + + let r = (slot, pos_in_slot, bitpos); + r + } + + /// Copy upper-most `nbases` bases from `src` into self, starting a base position `dest` + fn copy_bases_u64(&mut self, mut src: u64, mut nbases: usize, mut dest: usize) { + let (mut slot, mut slot_base, mut bit_pos) = Self::addr(dest); + + while nbases > 0 { + let cp_bits = std::cmp::min(nbases * 2, bit_pos + 2); + + let top_mask = Self::mask_above(bit_pos + 1); + let bottom_mask = Self::mask_below(bit_pos + 2 - cp_bits); + + let old_val = self.storage[slot]; + let new_bits = Self::t_from_u64(src >> (slot_base * 2)); + + let new_val = (old_val & top_mask) + | (old_val & bottom_mask) + | (new_bits & !top_mask & !bottom_mask); + + self.storage[slot] = new_val; + + src = src << cp_bits; + nbases = nbases - cp_bits / 2; + dest = dest + cp_bits / 2; + slot = slot + 1; + slot_base = 0; + bit_pos = Self::slot_bits() - 2; + } + } + + /// Copy upper-most `nbases` bases from `src` into self, starting a base position `dest` + fn copy_bases(&mut self, mut src: T, mut nbases: usize, mut dest: usize) { + let (mut slot, mut slot_base, mut bit_pos) = Self::addr(dest); + + loop { + let cp_bits = std::cmp::min(nbases * 2, (Self::slot_bases() - slot_base) * 2); + + let top_mask = Self::mask_above(bit_pos + 1); + let bottom_mask = Self::mask_below(bit_pos + 2 - cp_bits); + + let old_val = self.storage[slot]; + + let new_bits = src >> (slot_base * 2); + + let new_val = (old_val & top_mask) + | (old_val & bottom_mask) + | (new_bits & !top_mask & !bottom_mask); + + self.storage[slot] = new_val; + + nbases = nbases - cp_bits / 2; + + if nbases == 0 { + break; + } + + src = src << cp_bits; + dest = dest + cp_bits / 2; + slot = slot + 1; + slot_base = 0; + bit_pos = Self::slot_bits() - 2; + } + } + + #[inline(always)] + fn mask_above(bit: usize) -> T { + if bit + 1 >= Self::slot_bits() { + return T::zero(); + } + + let one = T::one(); + let start_bit = bit + 1; + let mask_bits = Self::slot_bits() - start_bit; + let res = ((one << mask_bits) - one) << start_bit; + + for i in 0..Self::slot_bits() { + let res_bit = (res >> i) & T::one(); + if i > bit { + debug_assert_eq!(res_bit, T::one()); + } else { + debug_assert_eq!(res_bit, T::zero()); + } + } + + res + } + + #[inline(always)] + fn mask_below(bit: usize) -> T { + // special case to mask everything + if bit == Self::slot_bits() { + return !T::zero(); + } + + let one = T::one(); + let res = (one << bit) - one; + + for i in 0..Self::slot_bits() { + let res_bit = (res >> i) & T::one(); + if i < bit { + debug_assert_eq!(res_bit, T::one()); + } else { + debug_assert_eq!(res_bit, T::zero()); + } + } + + res + } +} + +impl< + T: 'static + PrimInt + FromPrimitive + Hash + IntHelp + Debug, + const KS: usize, + const B: usize, + > Mer for ArrayKmer +where + u64: AsPrimitive, +{ + #[inline(always)] + fn len(&self) -> usize { + KS + } + + /// Get the letter at the given position. + fn get(&self, pos: usize) -> u8 { + let (slot, _, bit) = Self::addr(pos); + Self::to_byte(self.storage[slot] >> bit & Self::msk()) + } + + fn set_mut(&mut self, pos: usize, v: u8) { + let (slot, _, bit) = Self::addr(pos); + let mask = !(Self::msk() << bit); + + self.storage[slot] = (self.storage[slot] & mask) | (Self::t_from_byte(v) << bit); + } + + /// Set a slice of bases in the kmer, using the packed representation in value. + /// Sets n_bases, starting at pos. Incoming bases must always be packed into the upper-most + /// bits of the value. + #[inline(always)] + fn set_slice_mut(&mut self, pos: usize, n_bases: usize, value: u64) { + debug_assert!(pos + n_bases <= Self::k()); + self.copy_bases_u64(value, n_bases, pos); + } + + /// Return the reverse complement of this kmer + fn rc(&self) -> Self { + let mut new = ArrayKmer::empty(); + let mut cur_pos = 0; + + while cur_pos < KS { + let (slot, _, bitpos) = Self::addr(cur_pos); + + let s = self.storage[slot]; + let s_rc = !s.reverse_by_twos(); + let nbases = (bitpos / 2) + 1; + + let dest = KS - cur_pos - nbases; + new.copy_bases(s_rc, nbases, dest); + cur_pos += nbases; + } + + new + } + + fn at_count(&self) -> u32 { + // A's and T's have upper_bit ^ lower_bit == 0 + // count how many of these are present + + let mut count = 0; + let slot0 = self.storage[0]; + + let slot0_bases = KS - Self::slot_bases() * (B - 1); + let mix_base_bits = !((slot0 >> 1) ^ slot0); + let at_bases = mix_base_bits & Self::mask_below(slot0_bases * 2) & IntHelp::lower_of_two(); + count += at_bases.count_ones(); + + for i in 1..B { + let mix_base_bits = !((self.storage[i] >> 1) ^ self.storage[i]); + let at_bases = mix_base_bits & IntHelp::lower_of_two(); + count += at_bases.count_ones(); + } + + count + } + + fn gc_count(&self) -> u32 { + (KS as u32) - self.at_count() + } +} + +impl< + T: 'static + PrimInt + FromPrimitive + Hash + IntHelp + Debug, + const KS: usize, + const B: usize, + > fmt::Debug for ArrayKmer +where + u64: AsPrimitive, +{ + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut s = String::new(); + for pos in 0..Self::k() { + s.push(bits_to_base(self.get(pos))) + } + + write!(f, "{}", s) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::kmer; + + #[test] + fn simple_arr_test() { + type K = ArrayKmer; + + let km = K::from_ascii(b"CCCCCCCCCCCCCCCCCCCCCCCCCC"); + println!("{:?}", km); + + let k1 = km.extend_left(0); + println!("{:?}", k1); + + let k1 = km.extend_right(0); + println!("{:?}", k1); + + let km = K::from_ascii(b"ACGTACGTACGTACGTACGTACGTAC"); + println!("{:?}", km); + + let k1 = km.extend_left(0); + println!("{:?}", k1); + + let k1 = km.extend_right(0); + println!("{:?}", k1); + + let km = K::from_ascii(b"CCCCCCCCCCCCCCCCCCCCCCCCCC"); + println!("{:?}", km); + + let mut k2 = km.clone(); + k2.copy_bases_u64(0xF << 60, 2, 0); + assert_eq!(k2, K::from_ascii(b"TTCCCCCCCCCCCCCCCCCCCCCCCC")); + + let mut k2 = km.clone(); + k2.copy_bases_u64(0xF << 60, 2, 1); + assert_eq!(k2, K::from_ascii(b"CTTCCCCCCCCCCCCCCCCCCCCCCC")); + + let mut k2 = km.clone(); + k2.copy_bases_u64(0xF << 60, 2, 2); + assert_eq!(k2, K::from_ascii(b"CCTTCCCCCCCCCCCCCCCCCCCCCC")); + + let mut k2 = km.clone(); + k2.copy_bases_u64(0xF << 60, 2, 3); + assert_eq!(k2, K::from_ascii(b"CCCTTCCCCCCCCCCCCCCCCCCCCC")); + + let mut k2 = km.clone(); + k2.copy_bases_u64(0xF << 60, 2, 7); + assert_eq!(k2, K::from_ascii(b"CCCCCCCTTCCCCCCCCCCCCCCCCC")); + + let mut k2 = km.clone(); + k2.copy_bases_u64(0xFF << 56, 4, 7); + assert_eq!(k2, K::from_ascii(b"CCCCCCCTTTTCCCCCCCCCCCCCCC")); + + let km = K::from_ascii(b"CCCCCCCCCCCCCCCCCCCCCCCCCC"); + assert_eq!(km.rc(), K::from_ascii(b"GGGGGGGGGGGGGGGGGGGGGGGGGG")); + } + + #[test] + fn test_arr_kmer15() { + kmer::tests::kmer_test_suite::>(); + } + + #[test] + fn test_arr_kmer26() { + kmer::tests::kmer_test_suite::>(); + } + + #[test] + fn test_arr_kmer23() { + kmer::tests::kmer_test_suite::>(); + } + + #[test] + fn test_arr_kmer48() { + kmer::tests::kmer_test_suite::>(); + } + + #[test] + fn test_arr_kmer47() { + kmer::tests::kmer_test_suite::>(); + } + + #[test] + fn test_arr_kmer65_u16() { + kmer::tests::kmer_test_suite::>(); + } + + #[test] + fn test_arr_kmer68_u16() { + kmer::tests::kmer_test_suite::>(); + } + + #[test] + fn test_arr_kmer65_u32() { + kmer::tests::kmer_test_suite::>(); + } +} diff --git a/src/dna_string.rs b/src/dna_string.rs index 3bc1f76c..7d2d7266 100644 --- a/src/dna_string.rs +++ b/src/dna_string.rs @@ -812,7 +812,7 @@ impl<'a> PackedDnaStringSet { } #[cfg(test)] -mod tests { +pub(crate) mod tests { use super::*; use crate::kmer::IntKmer; use crate::kmer::Kmer4; @@ -842,7 +842,7 @@ mod tests { } } - fn random_dna_string(len: usize) -> DnaString { + pub fn random_dna_string(len: usize) -> DnaString { let bytes = test::random_dna(len); DnaString::from_bytes(&bytes) } @@ -1086,7 +1086,7 @@ mod tests { assert_eq!(kmers, vec![]); } - fn kmer_test(kmers: &Vec, dna: &String, dna_string: &DnaString) { + pub fn kmer_test(kmers: &Vec, dna: &String, dna_string: &DnaString) { for i in 0..(dna.len() - K::k() + 1) { assert_eq!(kmers[i].to_string(), &dna[i..(i + K::k())]); } diff --git a/src/kmer.rs b/src/kmer.rs index 3b6a53bf..dae03ed3 100644 --- a/src/kmer.rs +++ b/src/kmer.rs @@ -636,8 +636,10 @@ impl fmt::Debug } #[cfg(test)] -mod tests { +pub(crate) mod tests { use super::*; + use crate::dna_string; + use crate::dna_string::DnaString; use crate::vmer::Lmer; use rand::{self, Rng, RngCore}; @@ -657,7 +659,7 @@ mod tests { } // Generate random kmers & test the methods for manipulating them - fn check_kmer() { + pub fn check_kmer() { #[allow(non_snake_case)] let K = T::k(); @@ -671,7 +673,7 @@ mod tests { // Reverse complementing let rc = km.rc(); let double_rc = rc.rc(); - assert!(km == double_rc); + assert_eq!(km, double_rc); for i in 0..K { assert!(km.get(i) == (3 - rc.get(K - 1 - i))) @@ -821,6 +823,33 @@ mod tests { (r.next_u64() % 4) as u8 } + pub fn kmer_test_suite() { + for _ in 0..10000 { + check_kmer::(); + } + + for len in [100, 101, 200, 201, 1000, 1001] { + let dna_string = dna_string::tests::random_dna_string(len); + + let kmers: Vec = dna_string.iter_kmers().collect(); + dna_string_test::(&kmers, &dna_string); + } + } + + pub fn dna_string_test(kmers: &Vec, dna_string: &DnaString) { + let string = dna_string.to_string(); + for i in 0..(string.len() - K::k() + 1) { + assert_eq!(kmers[i].to_string(), &string[i..(i + K::k())]); + } + + let last_kmer: K = dna_string.last_kmer(); + assert_eq!(last_kmer.to_string(), &string[(string.len() - K::k())..]); + + for (idx, &k) in kmers.iter().enumerate() { + assert_eq!(k, dna_string.get_kmer(idx)); + } + } + #[test] fn test_lmer_3_kmer_64() { for _ in 0..10000 { @@ -872,106 +901,76 @@ mod tests { #[test] fn test_kmer_64() { - for _ in 0..10000 { - check_kmer::>(); - } + kmer_test_suite::>(); } #[test] fn test_kmer_48() { - for _ in 0..10000 { - check_kmer::>(); - } + kmer_test_suite::>(); } #[test] fn test_kmer_32() { - for _ in 0..10000 { - check_kmer::>(); - } + kmer_test_suite::>(); } #[test] fn test_kmer_31() { - for _ in 0..10000 { - check_kmer::>(); - } + kmer_test_suite::>(); } #[test] fn test_kmer_24() { - for _ in 0..10000 { - check_kmer::>(); - } + kmer_test_suite::>(); } #[test] fn test_kmer_20() { - for _ in 0..10000 { - check_kmer::>(); - } + kmer_test_suite::>(); } #[test] fn test_kmer_16() { - for _ in 0..10000 { - check_kmer::>(); - } + kmer_test_suite::>(); } #[test] fn test_kmer_15() { - for _ in 0..10000 { - check_kmer::>(); - } + kmer_test_suite::>(); } #[test] fn test_kmer_14() { - for _ in 0..10000 { - check_kmer::>(); - } + kmer_test_suite::>(); } #[test] fn test_kmer_12() { - for _ in 0..10000 { - check_kmer::>(); - } + kmer_test_suite::>(); } #[test] fn test_kmer_10() { - for _ in 0..10000 { - check_kmer::>(); - } + kmer_test_suite::>(); } #[test] fn test_kmer_8() { - for _ in 0..10000 { - check_kmer::>(); - } + kmer_test_suite::>(); } #[test] fn test_kmer_6() { - for _ in 0..10000 { - check_kmer::(); - } + kmer_test_suite::(); } #[test] fn test_kmer_5() { - for _ in 0..10000 { - check_kmer::(); - } + kmer_test_suite::(); } #[test] fn test_kmer_4() { - for _ in 0..10000 { - check_kmer::(); - } + kmer_test_suite::(); } } diff --git a/src/lib.rs b/src/lib.rs index 1b3a1d9d..369398af 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -32,6 +32,7 @@ use serde_derive::{Deserialize, Serialize}; use std::fmt; use std::hash::Hash; +pub mod array_kmer; pub mod clean_graph; pub mod compression; pub mod dna_string;