diff --git a/lib/genesis/population/format/bed_reader.cpp b/lib/genesis/population/format/bed_reader.cpp index 96fe9910..74d9ce1a 100644 --- a/lib/genesis/population/format/bed_reader.cpp +++ b/lib/genesis/population/format/bed_reader.cpp @@ -75,7 +75,7 @@ GenomeLocusSet BedReader::read_as_genome_locus_set( auto const chr_names = result.chromosome_names(); for( auto const& chr_name : chr_names ) { auto& bv = result.chromosome_positions( chr_name ); - auto const last_bit_idx = bv.find_last_set(); + auto const last_bit_idx = find_last_set( bv ); if( last_bit_idx == Bitvector::npos ) { bv = Bitvector( 1 ); } else { @@ -113,7 +113,7 @@ GenomeLocusSet BedReader::read_as_genome_locus_set( // Use the seq dict to resize the bitvector to the desired length. auto& bv = result.chromosome_positions( chr_name ); - auto const last_bit_idx = bv.find_last_set(); + auto const last_bit_idx = find_last_set( bv ); if( last_bit_idx == Bitvector::npos ) { // Empty chr in bed. Should not really be able to happen, as that means there was not // an entry in the input to begin with, but let's catch it anyway. diff --git a/lib/genesis/population/function/genome_locus_set.cpp b/lib/genesis/population/function/genome_locus_set.cpp index 0f4a71d0..63790929 100644 --- a/lib/genesis/population/function/genome_locus_set.cpp +++ b/lib/genesis/population/function/genome_locus_set.cpp @@ -109,7 +109,7 @@ GenomeLocusSet read_mask_fasta( // If we invert, we do that here at the end. We could also switch in the set function // above, but it's easier to do this in bulk. We need to unset the first bit then. if( invert ) { - bv.negate(); + negate(bv); bv.unset(0); } diff --git a/lib/genesis/population/function/window_average.hpp b/lib/genesis/population/function/window_average.hpp index de0467d9..9d05f71f 100644 --- a/lib/genesis/population/function/window_average.hpp +++ b/lib/genesis/population/function/window_average.hpp @@ -40,6 +40,9 @@ #include "genesis/population/variant.hpp" #include "genesis/population/window/base_window.hpp" #include "genesis/population/window/window_view.hpp" +#include "genesis/utils/math/bitvector.hpp" +#include "genesis/utils/math/bitvector/functions.hpp" +#include "genesis/utils/math/bitvector/operators.hpp" #include #include @@ -217,7 +220,7 @@ inline size_t get_window_provided_loci_count( // Finally, we have checked everything. Our first and last position are both inclusive, // while the bitvector count uses past-the-end, so we need to add one here for the last. - return bv.count( first, last + 1 ); + return pop_count( bv, first, last + 1 ); }; // If the window is a WindowStream over a whole genome, we use all its chromosomes. diff --git a/lib/genesis/population/genome_locus_set.cpp b/lib/genesis/population/genome_locus_set.cpp index f0b45ab4..2e7368a4 100644 --- a/lib/genesis/population/genome_locus_set.cpp +++ b/lib/genesis/population/genome_locus_set.cpp @@ -30,6 +30,7 @@ #include "genesis/population/genome_locus_set.hpp" +#include "genesis/utils/math/bitvector/functions.hpp" #include "genesis/utils/math/bitvector/operators.hpp" #include @@ -90,7 +91,7 @@ void GenomeLocusSet::add( assert( bv.size() >= end + 1 ); // Now set all bits in between the two positions, inclusive. - bv.set( start, end + 1 ); + bv.set_range( start, end + 1 ); // for( size_t i = start; i <= end; ++i ) { // bv.set( i ); // } @@ -203,7 +204,7 @@ void GenomeLocusSet::set_intersect( GenomeLocusSet const& rhs ) // so remove it from the to-delete list. If all its bits are 0, we have eliminated // all positions from the filter, so we might as well delete the whole vector; in that // case, we simply keept it in the to-delete list and then it gets removed below. - if( lhs_bits.count() > 0 ) { + if( pop_count(lhs_bits) > 0 ) { chrs_to_delete.erase( chr.first ); } } diff --git a/lib/genesis/population/genome_locus_set.hpp b/lib/genesis/population/genome_locus_set.hpp index 186c9463..5de012e4 100644 --- a/lib/genesis/population/genome_locus_set.hpp +++ b/lib/genesis/population/genome_locus_set.hpp @@ -44,6 +44,7 @@ #include "genesis/population/genome_region_list.hpp" #include "genesis/sequence/sequence_dict.hpp" #include "genesis/utils/math/bitvector.hpp" +#include "genesis/utils/math/bitvector/functions.hpp" namespace genesis { namespace population { @@ -386,7 +387,7 @@ class GenomeLocusSet // We do not need to to an extra check for position 0 here. // If it is true, then so will be the result. - return bv.any_set(); + return any_set( bv ); } // ------------------------------------------------------------------------- @@ -422,7 +423,7 @@ class GenomeLocusSet // If the above is not the case, check the actual start_position. // If the start_position is outside of the bitvector, it is not covered, obviously. - return bitvector.find_next_set( start_position ); + return find_next_set( bitvector, start_position ); } /** diff --git a/lib/genesis/sequence/functions/functions.cpp b/lib/genesis/sequence/functions/functions.cpp index e496cdb2..6a605551 100644 --- a/lib/genesis/sequence/functions/functions.cpp +++ b/lib/genesis/sequence/functions/functions.cpp @@ -1,6 +1,6 @@ /* Genesis - A toolkit for working with phylogenetic data. - Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH + Copyright (C) 2014-2024 Lucas Czech This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -35,6 +35,9 @@ #include "genesis/sequence/printers/simple.hpp" #include "genesis/utils/core/logging.hpp" +#include "genesis/utils/math/bitvector.hpp" +#include "genesis/utils/math/bitvector/functions.hpp" +#include "genesis/utils/math/bitvector/operators.hpp" #include "genesis/utils/text/string.hpp" #include "genesis/utils/text/style.hpp" @@ -189,7 +192,7 @@ void remove_sites( Sequence& seq, utils::Bitvector sites ) ); } - auto const num_sites = sites.size() - sites.count(); + auto const num_sites = sites.size() - pop_count(sites); std::string result; result.reserve( num_sites ); diff --git a/lib/genesis/tree/bipartition/bipartition.hpp b/lib/genesis/tree/bipartition/bipartition.hpp index 257f76ca..65d3ed7a 100644 --- a/lib/genesis/tree/bipartition/bipartition.hpp +++ b/lib/genesis/tree/bipartition/bipartition.hpp @@ -3,7 +3,7 @@ /* Genesis - A toolkit for working with phylogenetic data. - Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH + Copyright (C) 2014-2024 Lucas Czech This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -34,6 +34,7 @@ #include "genesis/tree/tree.hpp" #include "genesis/tree/tree/subtree.hpp" #include "genesis/utils/math/bitvector.hpp" +#include "genesis/utils/math/bitvector/functions.hpp" #include @@ -96,7 +97,7 @@ class Bipartition void invert() { - leaf_nodes_.negate(); + negate(leaf_nodes_); link_ = &link_->outer(); } diff --git a/lib/genesis/tree/bipartition/functions.cpp b/lib/genesis/tree/bipartition/functions.cpp index 251b55ac..8824eaa9 100644 --- a/lib/genesis/tree/bipartition/functions.cpp +++ b/lib/genesis/tree/bipartition/functions.cpp @@ -1,6 +1,6 @@ /* Genesis - A toolkit for working with phylogenetic data. - Copyright (C) 2014-2020 Lucas Czech and HITS gGmbH + Copyright (C) 2014-2024 Lucas Czech This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -249,18 +249,18 @@ Bipartition find_smallest_subtree( auto const inverted = ~(bip.leaf_nodes()); if( utils::is_subset( comp, bip.leaf_nodes() )) { - if( min_count == 0 || bip.leaf_nodes().count() < min_count ) { + if( min_count == 0 || pop_count(bip.leaf_nodes()) < min_count ) { best_bip = bip; - min_count = best_bip.leaf_nodes().count(); + min_count = pop_count(best_bip.leaf_nodes()); } } if( utils::is_subset( comp, inverted ) ) { - if( min_count == 0 || inverted.count() < min_count ) { + if( min_count == 0 || pop_count(inverted) < min_count ) { best_bip = bip; best_bip.invert(); - assert( best_bip.leaf_nodes().count() == inverted.count() ); - min_count = best_bip.leaf_nodes().count(); + assert( pop_count( best_bip.leaf_nodes() ) == pop_count( inverted ) ); + min_count = pop_count( best_bip.leaf_nodes() ); } } } diff --git a/lib/genesis/tree/bipartition/rf.cpp b/lib/genesis/tree/bipartition/rf.cpp index ee5a40af..fb91c1ed 100644 --- a/lib/genesis/tree/bipartition/rf.cpp +++ b/lib/genesis/tree/bipartition/rf.cpp @@ -1,6 +1,6 @@ /* Genesis - A toolkit for working with phylogenetic data. - Copyright (C) 2014-2019 Lucas Czech + Copyright (C) 2014-2024 Lucas Czech This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -34,6 +34,8 @@ #include "genesis/tree/function/functions.hpp" #include "genesis/tree/iterator/postorder.hpp" #include "genesis/utils/core/algorithm.hpp" +#include "genesis/utils/math/bitvector/functions.hpp" +#include "genesis/utils/math/bitvector/operators.hpp" #include #include @@ -201,14 +203,14 @@ void rf_get_bitvectors_template( // Call the bitvector processor functor now, as we just finished constructing a split. // We normalize first to make sure that we always get comparable bitvectors in the end. - current.normalize(); + normalize(current); process_bitvector( current ); } } // We have traversed all node names now. If there is still an unset bit in the bitvector, // that means that we did not find all names that are in the tree. - if( name_check.count() != names.size() ) { + if( pop_count(name_check) != names.size() ) { throw std::runtime_error( "Cannot calculate RF distance with trees that have different node names. " "Some names are missing from one of the trees." diff --git a/lib/genesis/utils.hpp b/lib/genesis/utils.hpp index 40d8217d..1e463359 100644 --- a/lib/genesis/utils.hpp +++ b/lib/genesis/utils.hpp @@ -139,6 +139,7 @@ #include "genesis/utils/io/string_output_target.hpp" #include "genesis/utils/math/binomial.hpp" #include "genesis/utils/math/bit.hpp" +#include "genesis/utils/math/bitvector/functions.hpp" #include "genesis/utils/math/bitvector.hpp" #include "genesis/utils/math/bitvector/operators.hpp" #include "genesis/utils/math/common.hpp" diff --git a/lib/genesis/utils/math/bit.hpp b/lib/genesis/utils/math/bit.hpp index d6977841..2be7f74b 100644 --- a/lib/genesis/utils/math/bit.hpp +++ b/lib/genesis/utils/math/bit.hpp @@ -36,7 +36,9 @@ #include "genesis/utils/core/std.hpp" #include +#include #include +#include #include #if GENESIS_CPP_STD >= GENESIS_CPP_STD_20 @@ -53,6 +55,33 @@ namespace genesis { namespace utils { +// ================================================================================================ +// Print +// ================================================================================================ + +// See genesis/utils/text/string.hpp +// /** +// * @brief Print the bits in an int as a string. +// */ +// template +// std::string to_bit_string( T value, bool with_byte_spaces = true ) +// { +// static_assert( +// std::is_unsigned::value, "Template parameter must be an unsigned integer type." +// ); +// static_assert( CHAR_BIT == 8, "CHAR_BIT != 8" ); +// +// std::string res = ""; +// T const one = 1; +// for( size_t i = 0; i < sizeof(T) * 8; ++i ) { +// res += (( value & (one << i)) ? "1" : "0"); +// if( with_byte_spaces && (i+1) % 8 == 0 ) { +// res += " "; +// } +// } +// return res; +// } + // ================================================================================================ // Pop Count // ================================================================================================ diff --git a/lib/genesis/utils/math/bitvector.cpp b/lib/genesis/utils/math/bitvector.cpp index 7455a773..ea152141 100644 --- a/lib/genesis/utils/math/bitvector.cpp +++ b/lib/genesis/utils/math/bitvector.cpp @@ -16,9 +16,9 @@ along with this program. If not, see . Contact: - Lucas Czech - Department of Plant Biology, Carnegie Institution For Science - 260 Panama Street, Stanford, CA 94305, USA + Lucas Czech + University of Copenhagen, Globe Institute, Section for GeoGenetics + Oster Voldgade 5-7, 1350 Copenhagen K, Denmark */ /** @@ -37,7 +37,6 @@ #include #include #include -#include #include #include @@ -48,12 +47,10 @@ namespace utils { // Constants // ============================================================================= -const size_t Bitvector::npos; +const Bitvector::IntType Bitvector::ALL_0 = 0ul; +const Bitvector::IntType Bitvector::ALL_1 = (((1ul << 32) - 1) << 32) + ((1ul << 32) - 1); -const Bitvector::IntType Bitvector::all_0_ = 0ul; -const Bitvector::IntType Bitvector::all_1_ = (((1ul << 32) - 1) << 32) + ((1ul << 32) - 1); - -const std::array Bitvector::bit_mask_ = +const std::array Bitvector::BIT_MASKS = {{ 1ul << 0, 1ul << 1, 1ul << 2, 1ul << 3, 1ul << 4, 1ul << 5, 1ul << 6, 1ul << 7, 1ul << 8, 1ul << 9, 1ul << 10, 1ul << 11, 1ul << 12, 1ul << 13, 1ul << 14, 1ul << 15, @@ -65,24 +62,24 @@ const std::array Bitvector::bit_mask_ = 1ul << 56, 1ul << 57, 1ul << 58, 1ul << 59, 1ul << 60, 1ul << 61, 1ul << 62, 1ul << 63 }}; -const std::array Bitvector::ones_mask_ = +const std::array Bitvector::ONES_MASKS = {{ - Bitvector::all_0_, Bitvector::all_1_ >> 63, Bitvector::all_1_ >> 62, Bitvector::all_1_ >> 61, - Bitvector::all_1_ >> 60, Bitvector::all_1_ >> 59, Bitvector::all_1_ >> 58, Bitvector::all_1_ >> 57, - Bitvector::all_1_ >> 56, Bitvector::all_1_ >> 55, Bitvector::all_1_ >> 54, Bitvector::all_1_ >> 53, - Bitvector::all_1_ >> 52, Bitvector::all_1_ >> 51, Bitvector::all_1_ >> 50, Bitvector::all_1_ >> 49, - Bitvector::all_1_ >> 48, Bitvector::all_1_ >> 47, Bitvector::all_1_ >> 46, Bitvector::all_1_ >> 45, - Bitvector::all_1_ >> 44, Bitvector::all_1_ >> 43, Bitvector::all_1_ >> 42, Bitvector::all_1_ >> 41, - Bitvector::all_1_ >> 40, Bitvector::all_1_ >> 39, Bitvector::all_1_ >> 38, Bitvector::all_1_ >> 37, - Bitvector::all_1_ >> 36, Bitvector::all_1_ >> 35, Bitvector::all_1_ >> 34, Bitvector::all_1_ >> 33, - Bitvector::all_1_ >> 32, Bitvector::all_1_ >> 31, Bitvector::all_1_ >> 30, Bitvector::all_1_ >> 29, - Bitvector::all_1_ >> 28, Bitvector::all_1_ >> 27, Bitvector::all_1_ >> 26, Bitvector::all_1_ >> 25, - Bitvector::all_1_ >> 24, Bitvector::all_1_ >> 23, Bitvector::all_1_ >> 22, Bitvector::all_1_ >> 21, - Bitvector::all_1_ >> 20, Bitvector::all_1_ >> 19, Bitvector::all_1_ >> 18, Bitvector::all_1_ >> 17, - Bitvector::all_1_ >> 16, Bitvector::all_1_ >> 15, Bitvector::all_1_ >> 14, Bitvector::all_1_ >> 13, - Bitvector::all_1_ >> 12, Bitvector::all_1_ >> 11, Bitvector::all_1_ >> 10, Bitvector::all_1_ >> 9, - Bitvector::all_1_ >> 8, Bitvector::all_1_ >> 7, Bitvector::all_1_ >> 6, Bitvector::all_1_ >> 5, - Bitvector::all_1_ >> 4, Bitvector::all_1_ >> 3, Bitvector::all_1_ >> 2, Bitvector::all_1_ >> 1 + Bitvector::ALL_0, Bitvector::ALL_1 >> 63, Bitvector::ALL_1 >> 62, Bitvector::ALL_1 >> 61, + Bitvector::ALL_1 >> 60, Bitvector::ALL_1 >> 59, Bitvector::ALL_1 >> 58, Bitvector::ALL_1 >> 57, + Bitvector::ALL_1 >> 56, Bitvector::ALL_1 >> 55, Bitvector::ALL_1 >> 54, Bitvector::ALL_1 >> 53, + Bitvector::ALL_1 >> 52, Bitvector::ALL_1 >> 51, Bitvector::ALL_1 >> 50, Bitvector::ALL_1 >> 49, + Bitvector::ALL_1 >> 48, Bitvector::ALL_1 >> 47, Bitvector::ALL_1 >> 46, Bitvector::ALL_1 >> 45, + Bitvector::ALL_1 >> 44, Bitvector::ALL_1 >> 43, Bitvector::ALL_1 >> 42, Bitvector::ALL_1 >> 41, + Bitvector::ALL_1 >> 40, Bitvector::ALL_1 >> 39, Bitvector::ALL_1 >> 38, Bitvector::ALL_1 >> 37, + Bitvector::ALL_1 >> 36, Bitvector::ALL_1 >> 35, Bitvector::ALL_1 >> 34, Bitvector::ALL_1 >> 33, + Bitvector::ALL_1 >> 32, Bitvector::ALL_1 >> 31, Bitvector::ALL_1 >> 30, Bitvector::ALL_1 >> 29, + Bitvector::ALL_1 >> 28, Bitvector::ALL_1 >> 27, Bitvector::ALL_1 >> 26, Bitvector::ALL_1 >> 25, + Bitvector::ALL_1 >> 24, Bitvector::ALL_1 >> 23, Bitvector::ALL_1 >> 22, Bitvector::ALL_1 >> 21, + Bitvector::ALL_1 >> 20, Bitvector::ALL_1 >> 19, Bitvector::ALL_1 >> 18, Bitvector::ALL_1 >> 17, + Bitvector::ALL_1 >> 16, Bitvector::ALL_1 >> 15, Bitvector::ALL_1 >> 14, Bitvector::ALL_1 >> 13, + Bitvector::ALL_1 >> 12, Bitvector::ALL_1 >> 11, Bitvector::ALL_1 >> 10, Bitvector::ALL_1 >> 9, + Bitvector::ALL_1 >> 8, Bitvector::ALL_1 >> 7, Bitvector::ALL_1 >> 6, Bitvector::ALL_1 >> 5, + Bitvector::ALL_1 >> 4, Bitvector::ALL_1 >> 3, Bitvector::ALL_1 >> 2, Bitvector::ALL_1 >> 1 }}; // ============================================================================= @@ -94,8 +91,8 @@ Bitvector::Bitvector( size_t size, Bitvector const& other ) { // Static test, needs to be here, as the constant is private. static_assert( - Bitvector::all_1_ == static_cast(0) - 1, - "Bitvector::all_1_ is not all one" + Bitvector::ALL_1 == static_cast(0) - 1, + "Bitvector::ALL_1 is not all one" ); if( &other == this ) { @@ -111,7 +108,7 @@ Bitvector::Bitvector( size_t size, Bitvector const& other ) for( size_t i = 0; i < n; ++i ) { data_[i] = other.data_[i]; } - unset_padding_(); + unset_padding_bits(); } Bitvector::Bitvector( std::string const& values ) @@ -142,14 +139,14 @@ Bitvector::Bitvector( std::string const& values ) // auto const ds = (size_ / IntSize) + (size_ % IntSize == 0 ? 0 : 1); // assert( ds <= other.data_.size() ); // data_ = std::vector( other.data_.begin(), other.data_.begin() + ds ); -// unset_padding_(); +// unset_padding_bits(); // } // ============================================================================= // Bit Functions // ============================================================================= -void Bitvector::set( size_t first, size_t last, bool value ) +void Bitvector::set_range( size_t first, size_t last, bool value ) { // Boundary checks if( first >= size_ || last > size_ || first > last ) { @@ -177,13 +174,13 @@ void Bitvector::set( size_t first, size_t last, bool value ) auto const l_bit_idx = last % IntSize; assert( f_wrd_idx < data_.size() ); assert( l_wrd_idx < data_.size() ); - assert( f_bit_idx < ones_mask_.size() ); - assert( l_bit_idx < ones_mask_.size() ); + assert( f_bit_idx < ONES_MASKS.size() ); + assert( l_bit_idx < ONES_MASKS.size() ); // Get the two words at the boundary. We later check if they are the same, // so we do not repeat the code here, and treat the special case later. - auto const f_mask = ~ones_mask_[ f_bit_idx ]; - auto const l_mask = l_bit_idx == 0 ? all_1_ : ones_mask_[ l_bit_idx ]; + auto const f_mask = ~ONES_MASKS[ f_bit_idx ]; + auto const l_mask = l_bit_idx == 0 ? ALL_1 : ONES_MASKS[ l_bit_idx ]; // Now set the bits as needed for the range. if( f_wrd_idx == l_wrd_idx ) { @@ -196,19 +193,71 @@ void Bitvector::set( size_t first, size_t last, bool value ) if( value ) { data_[ f_wrd_idx ] |= f_mask; for( size_t i = f_wrd_idx + 1; i < l_wrd_idx; ++i ) { - data_[i] = all_1_; + data_[i] = ALL_1; } data_[ l_wrd_idx ] |= l_mask; } else { data_[ f_wrd_idx ] &= ~f_mask; for( size_t i = f_wrd_idx + 1; i < l_wrd_idx; ++i ) { - data_[i] = all_0_; + data_[i] = ALL_0; } data_[ l_wrd_idx ] &= ~l_mask; } } } +void Bitvector::set_all( bool value ) +{ + // set according to flag. + const auto v = value ? ALL_1 : ALL_0; + for( size_t i = 0; i < data_.size(); ++i ) { + data_[i] = v; + } + + // if we initialized with true, we need to unset the surplus bits at the end! + if( value ) { + unset_padding_bits(); + } +} + +void Bitvector::unset_padding_bits() +{ + // Only apply if there are actual padding bits. + if(( size_ % IntSize ) == 0 ) { + assert( size_ / IntSize == data_.size() ); + return; + } + + // In the other cases, unset the padding. + assert( size_ / IntSize + 1 == data_.size() ); + assert( size_ % IntSize < ONES_MASKS.size() ); + data_.back() &= ONES_MASKS[ size_ % IntSize ]; + + // other versions that might be helpful if i messed up with this little/big endian stuff... + // first one is slow but definitely works, second one is fast, but might have the same + // issue as the used version above (which currently works perfectly). + //~ for( size_t i = size_ % IntSize; i < IntSize; ++i ) { + //~ data_.back() &= ~BIT_MASKS[i]; + //~ } + //~ data_.back() &= BIT_MASKS[size_ % IntSize] - 1; +} + +void Bitvector::negate() +{ + // flip all bits. + for( size_t i = 0; i < data_.size(); ++i ) { + data_[i] = ~ data_[i]; + } + + // reset the surplus bits at the end of the vector. + unset_padding_bits(); +} + +void Bitvector::invert() +{ + return negate(); +} + // ============================================================================= // Operators // ============================================================================= @@ -304,347 +353,5 @@ bool Bitvector::operator != (const Bitvector &other) const return !(*this == other); } -// ============================================================================= -// Other Functions -// ============================================================================= - -size_t Bitvector::count() const -{ - // Use bit trickery to count quickly. - size_t res = 0; - for( auto x : data_ ) { - res += pop_count(x); - } - - // safe, but slow version... - //~ size_t tmp = 0; - //~ for( size_t i = 0; i < size_; ++i ) { - //~ if (get(i)) { - //~ ++tmp; - //~ } - //~ } - //~ assert(tmp == res); - - return res; -} - -size_t Bitvector::count( size_t first, size_t last ) const -{ - // Boundary checks. - if( first >= size_ || last > size_ || first > last ) { - throw std::invalid_argument( - "Cannot compute pop count for Bitvector of size " + std::to_string( size_ ) + - " within invalid range [" + std::to_string(first) + "," + std::to_string( last ) + ")" - ); - } - assert( first < size_ ); - assert( last <= size_ ); - assert( first <= last ); - - // Check special case, as we might otherwise access invalid data at the boundaries. - if( first == last ) { - return 0; - } - assert( last > 0 ); - - // We need to mask the first bits of the first word and last bits of the last word - // before counting, and then can process the in-between words normally. - // If first and last are the same word, we need special treatment as well. - - // Get word indices, and bit position indices within those words. - // The last word is the one where the bit before last is, as last is past-the-end. - // However, the bit index is still meant to be past-the-end, to use the proper mask. - auto const f_wrd_idx = first / IntSize; - auto const l_wrd_idx = (last - 1) / IntSize; - auto const f_bit_idx = first % IntSize; - auto const l_bit_idx = last % IntSize; - assert( f_wrd_idx < data_.size() ); - assert( l_wrd_idx < data_.size() ); - assert( f_bit_idx < ones_mask_.size() ); - assert( l_bit_idx < ones_mask_.size() ); - - // Get the two words at the boundary. We later check if they are the same, - // so we do not repeat the code here, and treat the special case later. - auto f_word = data_[ f_wrd_idx ]; - auto l_word = data_[ l_wrd_idx ]; - - // Mask out the beginning and end, respectively. - // Remove all bits before the first index, and all bits after and including the last index. - // No special case needed here, as the 0th mask is idempotent. - // That's because it's the mask that we also use for unset_padding_(), - // we are basically doing the same here. - f_word &= ~ones_mask_[ f_bit_idx ]; - if( l_bit_idx != 0 ) { - l_word &= ones_mask_[ l_bit_idx ]; - } - - // Finally, count up all the parts. - size_t result = 0; - if( f_wrd_idx == l_wrd_idx ) { - // Same word. Mask out the bits we don't want, using only the bits that remained after - // filtering both words (which are the same, just different ends of the word), then count. - result = pop_count( f_word & l_word ); - } else { - // Count the first and last word, and then add everything in between the two. - result = pop_count( f_word ) + pop_count( l_word ); - for( size_t i = f_wrd_idx + 1; i < l_wrd_idx; ++i ) { - result += pop_count( data_[i] ); - } - } - return result; -} - -bool Bitvector::any_set() const -{ - for( auto word : data_ ) { - if( word > 0 ) { - return true; - } - } - return false; -} - -size_t Bitvector::find_first_set() const -{ - return find_next_set( 0 ); -} - -size_t Bitvector::find_last_set() const -{ - // We currently do not have a find_prev_set() function. If we implement one at some point, - // this function here should instead simply call that one. For now however, we implement - // only what we need here. - - // Find the last word that is non-zero. We use that unsigned ints have defined underflow - // behavior, so if we wrap around at word index 0, we just go to int max, and stop. - // This also works for the boundary case of an empty bitvector. - size_t wrd_idx = data_.size() - 1; - while( wrd_idx < data_.size() && data_[wrd_idx] == 0 ) { - --wrd_idx; - } - - // Boundary condition: No bits set, or empty vector - if( wrd_idx >= data_.size() ) { - return Bitvector::npos; - } - - // Now find the last bit in the word. For now, we do a simple bit-based loop, - // but we could speed this up with a similar ctz/clz approach as in find_next_set(). - // We start at either the end position of the word index that we found above, - // or, if that's the last word of the vector, we start at its last bit. - assert( wrd_idx < data_.size() ); - assert( data_[wrd_idx] != 0 ); - auto bit_idx = std::min( wrd_idx * IntSize + IntSize - 1, size_ - 1 ); - while( bit_idx < data_.size() * IntSize ) { - if( get(bit_idx) ) { - return bit_idx; - } - --bit_idx; - } - - // We must have found a bit above. - throw std::runtime_error( "Internal error in Bitvector::find_last_set()" ); -} - -size_t Bitvector::find_next_set( size_t start ) const -{ - // Boundary check - if( start >= size_ ) { - // We mimic the behaviour of std::string::find(), which just never finds anything - // when used beyond the string, but also does not throw an exception in such cases. - return Bitvector::npos; - - // Alternative: Throw an exception. - // throw std::invalid_argument( - // "Invalid call to find_next_set() with start==" + std::to_string(start) + - // " and a Bitvector of size " + std::to_string( size_ ) - // ); - } - - // Helper function to find the index of the first set bit in a non-zero word. - auto find_next_set_in_word_ = []( IntType word ) { - assert( word != 0 ); - - // We use ffs here, see https://man7.org/linux/man-pages/man3/ffs.3.html - // It returns the _position_ of the bit, so we need to subtract 1 to get the index. - // Alternatively, we could use __builtin_ctz, which returns the number of trailing - // zeros in the given word, but is a compiler intrinsic, so let's stay with POSIX. - - // Check the size of the input and call the appropriate ffs function. - // Any good compiler will see through this and make this constexpr. - // In C++17, we could do this ourselves ;-) - if( sizeof(word) <= sizeof(unsigned int) ) { - return ffs( static_cast( word )) - 1; - } else if( sizeof(word) <= sizeof(unsigned long) ) { - return ffsl( static_cast( word )) - 1; - } else { - return ffsll(word) - 1; - } - }; - - // First see if there is anything in the word at the start position. - // We assume that this function might be called on a dense bitvector, - // where the given position is already set, so we check that as a shortcut. - if( get( start )) { - return start; - } - - // If that did not work, we see if there is anything in the current word. - auto wrd_idx = start / IntSize; - auto bit_idx = start % IntSize; - assert( wrd_idx < data_.size() ); - assert( bit_idx < ones_mask_.size() ); - auto word = data_[wrd_idx]; - - // For this, we remove the bits before start, and then test the rest. - // Mask out the beginning of the word, and find the next bit on the remainder. - // Remove all bits before the first index. - word &= ~( ones_mask_[ bit_idx ]); - if( word != 0 ) { - return wrd_idx * IntSize + find_next_set_in_word_( word ); - } - - // We did not find a bit in the word of the start. So now we look for the first word - // after the start one that has bits set, and return its first set bit position. - ++wrd_idx; - while( wrd_idx < data_.size() && data_[wrd_idx] == 0 ) { - ++wrd_idx; - } - if( wrd_idx == data_.size() ) { - return Bitvector::npos; - - } - assert( wrd_idx < data_.size() ); - assert( data_[wrd_idx] != 0 ); - return wrd_idx * IntSize + find_next_set_in_word_( data_[wrd_idx] ); -} - -size_t Bitvector::hash() const -{ - std::size_t res = 0; - for( auto word : data_ ) { - res = hash_combine( res, word ); - } - return res; -} - -Bitvector::IntType Bitvector::x_hash() const -{ - IntType res = 0; - for( auto word : data_ ) { - res ^= word; - } - return res; -} - -void Bitvector::negate() -{ - // flip all bits. - for( size_t i = 0; i < data_.size(); ++i ) { - data_[i] = ~ data_[i]; - } - - // reset the surplus bits at the end of the vector. - unset_padding_(); -} - -void Bitvector::normalize() -{ - if( size_ > 0 && get(0) ) { - negate(); - } -} - -void Bitvector::set_all( const bool value ) -{ - // set according to flag. - const auto v = value ? all_1_ : all_0_; - for( size_t i = 0; i < data_.size(); ++i ) { - data_[i] = v; - } - - // if we initialized with true, we need to unset the surplus bits at the end! - if( value ) { - unset_padding_(); - } -} - -Bitvector Bitvector::make_random_bitvector( size_t size ) -{ - // Generate random unit64 values - static std::random_device rd; - static std::mt19937_64 engine(rd()); - static std::uniform_int_distribution dist( - std::numeric_limits::min(), - std::numeric_limits::max() - ); - static_assert( std::is_same::value, "Bitvector::IntType != uint64_t" ); - - // Iterate through all value types and assign random values. - // Way faster than setting individual bits. We only need to unset the remainder in the end. - auto result = Bitvector( size ); - for( auto& d : result.data_ ) { - d = dist(engine); - } - result.unset_padding_(); - return result; -} - -// ============================================================================= -// Internal Members -// ============================================================================= - -void Bitvector::unset_padding_() -{ - // Only apply if there are actual padding bits. - if(( size_ % IntSize ) == 0 ) { - assert( size_ / IntSize == data_.size() ); - return; - } - - // In the other cases, unset the padding. - assert( size_ / IntSize + 1 == data_.size() ); - assert( size_ % IntSize < ones_mask_.size() ); - data_.back() &= ones_mask_[ size_ % IntSize ]; - - // other versions that might be helpful if i messed up with this little/big endian stuff... - // first one is slow but definitely works, second one is fast, but might have the same - // issue as the used version above (which currently works perfectly). - //~ for( size_t i = size_ % IntSize; i < IntSize; ++i ) { - //~ data_.back() &= ~bit_mask_[i]; - //~ } - //~ data_.back() &= bit_mask_[size_ % IntSize] - 1; -} - -// ============================================================================= -// Dump and Debug -// ============================================================================= - -std::string Bitvector::dump() const -{ - std::string res = "[" + std::to_string(size_) + "]\n"; - for( size_t i = 0; i < size_; ++i ) { - res += (*this)[i] ? "1" : "0"; - if( (i+1) % 64 == 0 ) { - res += "\n"; - } else if( (i+1) % 8 == 0 ) { - res += " "; - } - } - return res; -} - -std::string Bitvector::dump_int(IntType x) const -{ - std::string res = ""; - for( size_t i = 0; i < IntSize; ++i ) { - res += (x & bit_mask_[i] ? "1" : "0"); - if( (i+1) % 8 == 0 ) { - res += " "; - } - } - return res; -} - } // namespace utils } // namespace genesis diff --git a/lib/genesis/utils/math/bitvector.hpp b/lib/genesis/utils/math/bitvector.hpp index 6a89ca66..9062ae6e 100644 --- a/lib/genesis/utils/math/bitvector.hpp +++ b/lib/genesis/utils/math/bitvector.hpp @@ -19,9 +19,9 @@ along with this program. If not, see . Contact: - Lucas Czech - Department of Plant Biology, Carnegie Institution For Science - 260 Panama Street, Stanford, CA 94305, USA + Lucas Czech + University of Copenhagen, Globe Institute, Section for GeoGenetics + Oster Voldgade 5-7, 1350 Copenhagen K, Denmark */ /** @@ -33,9 +33,9 @@ #include #include +#include #include #include -#include #include #include #include @@ -43,12 +43,6 @@ namespace genesis { namespace utils { -// ================================================================================================= -// Forward Declarations -// ================================================================================================= - -class Deserializer; - // ================================================================================================= // Bitvector // ================================================================================================= @@ -61,13 +55,54 @@ class Bitvector // Typedefs, Enums, Constants // --------------------------------------------------------- + /** + * @brief Internally used 64 bit unsigned int type. + */ using IntType = uint64_t; - static const size_t IntSize = sizeof(IntType) * 8; /** - * @brief Value to indicate that find_next_set() did not find any set bits. + * @brief Number of bits per word used for storing data. + */ + static const size_t IntSize = sizeof(IntType) * CHAR_BIT; + static_assert( CHAR_BIT == 8, "CHAR_BIT != 8" ); + + /** + * @brief Constant word with no bits sets. + */ + static const IntType ALL_0; + + /** + * @brief Constant word with all bits sets. */ - static const size_t npos = std::numeric_limits::max(); + static const IntType ALL_1; + + /** + * @brief Bitmasks that contains a single bit at each of the 64 positions. + * + * The first entry BIT_MASKS[0] has the first bit set, etc. + */ + static const std::array BIT_MASKS; + + /** + * @brief Bitmask that contains as many ones as the position in the array tells. + * + * The element at position `i` contains `i` many ones, starting from the right. + * + * ONES_MASKS[ 0] --> 0 ones: 0000...0000 + * ONES_MASKS[ 1] --> 1 one: 0000...0001 + * ONES_MASKS[ 2] --> 2 ones: 0000...0011 + * ONES_MASKS[ 3] --> 3 ones: 0000...0111 + * ... + * ONES_MASKS[63] --> 63 ones: 0111...1111 + * + * This mask is used for unsetting the padding bits in unset_padding_bits(). + */ + static const std::array ONES_MASKS; + + /** + * @brief Value to indicate an invalid position in the bitvector. + */ + static constexpr size_t npos = std::numeric_limits::max(); // --------------------------------------------------------- // Constructor and Rule of Five @@ -154,10 +189,8 @@ class Bitvector Bitvector& operator= (Bitvector const&) = default; Bitvector& operator= (Bitvector&&) = default; - friend Deserializer& operator>>( Deserializer& deserializer, Bitvector& bv ); - // --------------------------------------------------------- - // Single Bit Functions + // Bit Operations // --------------------------------------------------------- /** @@ -165,8 +198,8 @@ class Bitvector */ inline bool operator [] ( size_t index ) const { assert( index / IntSize < data_.size() ); - assert( index % IntSize < bit_mask_.size() ); - return static_cast (data_[index / IntSize] & bit_mask_[index % IntSize]); + assert( index % IntSize < BIT_MASKS.size() ); + return static_cast (data_[index / IntSize] & BIT_MASKS[index % IntSize]); } /** @@ -182,8 +215,8 @@ class Bitvector } assert( index / IntSize < data_.size() ); - assert( index % IntSize < bit_mask_.size() ); - return static_cast (data_[index / IntSize] & bit_mask_[index % IntSize]); + assert( index % IntSize < BIT_MASKS.size() ); + return static_cast (data_[index / IntSize] & BIT_MASKS[index % IntSize]); } /** @@ -199,8 +232,20 @@ class Bitvector } assert( index / IntSize < data_.size() ); - assert( index % IntSize < bit_mask_.size() ); - data_[index / IntSize] |= bit_mask_[index % IntSize]; + assert( index % IntSize < BIT_MASKS.size() ); + data_[index / IntSize] |= BIT_MASKS[index % IntSize]; + } + + /** + * @brief Set the value of a single bit to a given bool value, with boundary check. + */ + inline void set( size_t index, bool value ) + { + if( value ) { + set( index ); + } else { + unset( index ); + } } /** @@ -216,20 +261,8 @@ class Bitvector } assert( index / IntSize < data_.size() ); - assert( index % IntSize < bit_mask_.size() ); - data_[index / IntSize] &= ~(bit_mask_[index % IntSize]); - } - - /** - * @brief Set the value of a single bit to a given bool value, with boundary check. - */ - inline void set( size_t index, bool value ) - { - if( value ) { - set( index ); - } else { - unset( index ); - } + assert( index % IntSize < BIT_MASKS.size() ); + data_[index / IntSize] &= ~(BIT_MASKS[index % IntSize]); } /** @@ -247,7 +280,12 @@ class Bitvector * * but faster for anything beyond a few bits, as we operate on whole words internally. */ - void set( size_t first, size_t last, bool value = true ); + void set_range( size_t first, size_t last, bool value = true ); + + /** + * @brief Set all the bits to a specified @p value. + */ + void set_all( bool value ); /** * @brief Flip (negate) the value of a single bit, with boundary check. @@ -262,8 +300,8 @@ class Bitvector } assert( index / IntSize < data_.size() ); - assert( index % IntSize < bit_mask_.size() ); - data_[index / IntSize] ^= bit_mask_[index % IntSize]; + assert( index % IntSize < BIT_MASKS.size() ); + data_[index / IntSize] ^= BIT_MASKS[index % IntSize]; } /** @@ -274,21 +312,15 @@ class Bitvector return flip( index ); } - // --------------------------------------------------------- - // Operators - // --------------------------------------------------------- - - Bitvector& operator &= (Bitvector const& rhs); - Bitvector& operator |= (Bitvector const& rhs); - Bitvector& operator ^= (Bitvector const& rhs); - Bitvector operator ~ () const; - - friend Bitvector operator & (Bitvector const& lhs, Bitvector const& rhs); - friend Bitvector operator | (Bitvector const& lhs, Bitvector const& rhs); - friend Bitvector operator ^ (Bitvector const& lhs, Bitvector const& rhs); + /** + * @brief Flip all bits. Alias for invert(). + */ + void negate(); - bool operator == (const Bitvector &other) const; - bool operator != (const Bitvector &other) const; + /** + * @brief Flip all bits. Alias for negate(). + */ + void invert(); // --------------------------------------------------------- // Other Functions @@ -315,232 +347,82 @@ class Bitvector } /** - * @brief Count the number of set bits in the Bitvector, that is, its Hamming weight, - * or population count (popcnt). - */ - size_t count() const; - - /** - * @brief Count the number of set bits between a range of indices in the Bitvector, - * that is, its Hamming weight, or population count (popcnt), for that range. - * - * The range @p first to @p last is zero-based, with last being the past-the-end index. - * Hence, this is the same as + * @brief Get the underlying data, i.e., the vector of uints, in which the data is stored. * - * size_t count = 0; - * for( size_t i = first; i < last; ++i ) { - * if( bitvector.get( i )) { - * ++count; - * } - * } - * - * but faster, as we use whole-word counting internally. - */ - size_t count( size_t first, size_t last ) const; - - /** - * @brief Return if any bits are set at all. + * Note that the last bits of the last word might be padding that does not actually belong + * to the bitvector data. They shall be 0 at all times. */ - bool any_set() const; - - /** - * @brief Return the index of the first bit in the Bitvector that is set. - * - * If no such position exists (because all bits are `false`), Bitvector::npos is returned. - */ - size_t find_first_set() const; + std::vector const& data() const + { + return data_; + } /** - * @brief Return the index of the last bit in the Bitvector that is set. + * @brief Get a non-const reference to the underlying data, i.e., the vector of units. * - * If no such position exists (because all bits are `false`), Bitvector::npos is returned. - */ - size_t find_last_set() const; - - /** - * @brief Return the index of the next position in the Bitvector that is set. + * This is meant to allow external functions such as serialization/deserialization to access + * the underlying data. It is hence important that any function uses this data with care, + * and does not break the assumptions and invariants expected in the bitvector. In particular, + * the size of the data returned here needs to match the number of bits of the bitvector, + * plus the needed padding to get to a full word size, and any padding bits shall be 0. * - * This returns the first position starting at @p start, including @p start itself, that is set. - * If no such position exists (because all following bits are `false`), or if @p start is beyond - * the length of the vector, Bitvector::npos is returned instead. + * We do not allow for resizing an existing instance. Similarly, we hence recommend to not + * change the size of the data from the outside, and always call unset_padding_bits() should + * any operation on the data cause the padding to be non-zero (such as inverting words). */ - size_t find_next_set( size_t start ) const; - - /** - * @brief Return an std::hash value for the Bitvector. - */ - size_t hash() const; + std::vector& data() + { + return data_; + } /** - * @brief Return a hash value of type IntType that is quicker to calculate than hash(). + * @brief Unset all bits to zero that are not actively used at the end of the data vector. * - * This can be used for obtaining a simple hash using xor of the words. - * The avalanche effect is of course not present, but for many applications, this hash is - * good enough and quite useful. - */ - IntType x_hash() const; - - /** - * @brief Flip all bits. - */ - void negate(); - - /** - * @brief Bring the Bitvector in a normalized form, where the first bit is always zero. + * The data buffer always contains a multiple of IntSize many bits, thus there might be surplus + * bits at its end for padding. In case we do operations with Bitvectors of different size, or + * when negating bits, these might be affected, so we need to reset them to zero in these cases. * - * If the first bit is zero, nothing happens. However, if is is one, the whole Bitvector is flipped - * using negate(). + * This function, similar to the non-const data() function, is meant for external operations + * that need access to internal data. This unset function is idempotent, and it is fine to call + * it on any bitvector. */ - void normalize(); - - /** - * @brief Set all the bits to a specified @p value. - */ - void set_all(const bool value = false); - - std::string dump() const; - std::string dump_int(IntType x) const; - - std::vector const& data() const - { - return data_; - } + void unset_padding_bits(); /** * @brief For a given numer of bits, compute the size of the internally used vector. * - * This is mostly meant as a helper for data operatins such as serialization and deserialization. + * This is mostly meant as a helper for data operations such as serialization and deserialization. */ static size_t get_vector_size( size_t bit_size ) { return (bit_size / IntSize) + (bit_size % IntSize == 0 ? 0 : 1); } - /** - * @brief Create a Bitvector of a given @p size, with randomly initialized bits, mostly for testing. - */ - static Bitvector make_random_bitvector( size_t size ); - // --------------------------------------------------------- - // Internal Members + // Operators // --------------------------------------------------------- -private: - - /** - * @brief Internal function that sets all bits to zero that are not actively used. - * - * The data_ buffer always contains a multiple of IntSize many bits, thus there might be surplus - * bits at its end for padding. In case we do operations with Bitvectors of different size, or - * when negating bits, these might be affected, so we need to reset them to zero in these cases. - */ - void unset_padding_(); - - static const IntType all_0_; - static const IntType all_1_; + Bitvector& operator &= (Bitvector const& rhs); + Bitvector& operator |= (Bitvector const& rhs); + Bitvector& operator ^= (Bitvector const& rhs); + Bitvector operator ~ () const; - /** - * @brief Bitmask that contains a single bit at each of the 64 positions. - */ - static const std::array bit_mask_; + friend Bitvector operator & (Bitvector const& lhs, Bitvector const& rhs); + friend Bitvector operator | (Bitvector const& lhs, Bitvector const& rhs); + friend Bitvector operator ^ (Bitvector const& lhs, Bitvector const& rhs); - /** - * @brief Bitmask that contains as many ones as the position in the array tells. - * - * The element at position `i` contains `i` many ones, starting from the right. - * - * ones_mask_[ 0] --> 0 ones: 0000...0000 - * ones_mask_[ 1] --> 1 one: 0000...0001 - * ones_mask_[ 2] --> 2 ones: 0000...0011 - * ones_mask_[ 3] --> 3 ones: 0000...0111 - * ... - * ones_mask_[63] --> 63 ones: 0111...1111 - * - * This mask is used for unsetting the padding bits in unset_padding_(). - */ - static const std::array ones_mask_; + bool operator == (const Bitvector &other) const; + bool operator != (const Bitvector &other) const; // --------------------------------------------------------- // Data Members // --------------------------------------------------------- - size_t size_ = 0; + size_t size_ = 0; std::vector data_; }; -// ============================================================================= -// Hashing -// ============================================================================= - -/** - * @brief Helper structure that yields the hash of a given Bitvector. - * - * It is meant to be used in containers such as `unordered_set` or `unordered_map` - * that can make use of custom hash functions for the key objects. By default, - * these containers use a specialization of the `std::hash` template, which we also offer, - * and that also uses the Bitvector::hash() function. - * - * Hence, this class here is slightly redundant, as it gives the same result as just using - * the `std::hash` specialization. Still, it might be useful to have. - * - * See also BitvectorXhash for an alternative version that uses Bitvector::x_hash() instead. - */ -struct BitvectorHash -{ - std::size_t operator() ( genesis::utils::Bitvector const& value ) const - { - return value.hash(); - } -}; - -/** - * @brief Helper structure that yields the x_hash of a given Bitvector. - * - * It is meant to be used in containers such as `unordered_set` or `unordered_map` - * that can make use of custom hash functions for the key objects. Using this class instead - * of the standard `std::hash` specialization, the Bitvector::x_hash() function is used instead - * of the standard hash() function. It is hence faster to compute, but without avalanche effect. - * - * In some use cases, this might be preferrable - we however recommend to test this, in order to - * make sure that colliding hashes do not slow down the performance in the end. - * - * Note that the function needs to cast from Bitvector::IntType to std::size_t. - * On most modern systems, these are expecte to be the same, i.e., 64 bit unsigned integers. - * However, this might cause problem on systems where this is not the case. - */ -struct BitvectorXhash -{ - std::size_t operator() ( genesis::utils::Bitvector const& value ) const - { - return static_cast( value.x_hash() ); - } -}; - } // namespace utils } // namespace genesis -// ============================================================================= -// Namespace std extension -// ============================================================================= - -namespace std { - -/** - * @brief Specialization of std::hash for the Bitvector class. - * - * It uses Bitvector::hash() for the hashing. See also BitvectorHash for an alternative class - * that does the same, but resides in the same namespace as Bitvector, and see BitvectorXhash - * for a variant that uses Bitvector::x_hash() instead as the hashing function. - */ -template<> -struct hash -{ - std::size_t operator() ( genesis::utils::Bitvector const& value ) const - { - return value.hash(); - } -}; - -} // namespace std - #endif // include guard diff --git a/lib/genesis/utils/math/bitvector/functions.cpp b/lib/genesis/utils/math/bitvector/functions.cpp new file mode 100644 index 00000000..68d78fd1 --- /dev/null +++ b/lib/genesis/utils/math/bitvector/functions.cpp @@ -0,0 +1,522 @@ +/* + Genesis - A toolkit for working with phylogenetic data. + Copyright (C) 2014-2024 Lucas Czech + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + + Contact: + Lucas Czech + University of Copenhagen, Globe Institute, Section for GeoGenetics + Oster Voldgade 5-7, 1350 Copenhagen K, Denmark +*/ + +/** + * @brief + * + * @file + * @ingroup utils + */ + +#include "genesis/utils/math/bitvector/functions.hpp" +#include "genesis/utils/math/bitvector.hpp" +#include "genesis/utils/core/std.hpp" +#include "genesis/utils/math/bit.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace genesis { +namespace utils { + +// ================================================================================================= +// Bitvector Functions +// ================================================================================================= + +// ------------------------------------------------------------------------- +// Creation +// ------------------------------------------------------------------------- + +std::vector make_bool_vector_from_indices( std::vector const& indices, size_t size ) +{ + // Get the largest element of the vector. If it's empty, we return an all-false vector. + auto max_it = std::max_element( indices.begin(), indices.end() ); + if( max_it == indices.end() ) { + return std::vector( size, false ); + } + size_t target_size = *max_it + 1; + if( size > 0 ) { + if( target_size > size ) { + throw std::invalid_argument( + "Cannot use make_bool_vector_from_indices() with size " + std::to_string( size ) + + " that is smaller than required to include the larged index " + + std::to_string( *max_it ) + " in the list of indices (zero-based)." + ); + } + target_size = size; + } + + // Fill a bool vector, setting all positions to true + // that are indicated by the indices, pun intended. + auto result = std::vector( target_size, false ); + for( auto const& idx : indices ) { + assert( idx < result.size() ); + result[idx] = true; + } + return result; +} + +Bitvector make_random_bitvector( size_t size ) +{ + // Generate random unit64 values + static std::random_device rd; + static std::mt19937_64 engine(rd()); + static std::uniform_int_distribution dist( + std::numeric_limits::min(), + std::numeric_limits::max() + ); + static_assert( std::is_same::value, "Bitvector::IntType != uint64_t" ); + + // Iterate through all value types and assign random values. + // Way faster than setting individual bits. We only need to unset the remainder in the end. + auto result = Bitvector( size ); + for( auto& d : result.data() ) { + d = dist(engine); + } + result.unset_padding_bits(); + return result; +} + +// ------------------------------------------------------------------------- +// Modification +// ------------------------------------------------------------------------- + +void negate( Bitvector& bv ) +{ + return bv.negate(); +} + +void invert( Bitvector& bv ) +{ + return bv.negate(); +} + +void normalize( Bitvector& bv ) +{ + if( bv.size() > 0 && bv.get(0) ) { + bv.negate(); + } +} + +// ------------------------------------------------------------------------- +// Hashing +// ------------------------------------------------------------------------- + +size_t bitvector_hash( Bitvector const& bv ) +{ + std::size_t res = 0; + for( auto word : bv.data() ) { + res = hash_combine( res, word ); + } + return res; +} + +Bitvector::IntType bitvector_x_hash( Bitvector const& bv ) +{ + Bitvector::IntType res = 0; + for( auto word : bv.data() ) { + res ^= word; + } + return res; +} + +// ------------------------------------------------------------------------- +// Pop Count +// ------------------------------------------------------------------------- + +size_t pop_count( Bitvector const& bv ) +{ + // Use bit trickery to count quickly. + size_t res = 0; + for( auto x : bv.data() ) { + res += pop_count(x); + } + + // safe, but slow version... + //~ size_t tmp = 0; + //~ for( size_t i = 0; i < size_; ++i ) { + //~ if (get(i)) { + //~ ++tmp; + //~ } + //~ } + //~ assert(tmp == res); + + return res; +} + +size_t pop_count( Bitvector const& bv, size_t first, size_t last ) +{ + // Boundary checks. + if( first >= bv.size() || last > bv.size() || first > last ) { + throw std::invalid_argument( + "Cannot compute pop count for Bitvector of size " + std::to_string( bv.size() ) + + " within invalid range [" + std::to_string(first) + "," + std::to_string( last ) + ")" + ); + } + assert( first < bv.size() ); + assert( last <= bv.size() ); + assert( first <= last ); + + // Check special case, as we might otherwise access invalid data at the boundaries. + if( first == last ) { + return 0; + } + assert( last > 0 ); + + // We need to mask the first bits of the first word and last bits of the last word + // before counting, and then can process the in-between words normally. + // If first and last are the same word, we need special treatment as well. + + // Get word indices, and bit position indices within those words. + // The last word is the one where the bit before last is, as last is past-the-end. + // However, the bit index is still meant to be past-the-end, to use the proper mask. + auto const f_wrd_idx = first / Bitvector::IntSize; + auto const l_wrd_idx = (last - 1) / Bitvector::IntSize; + auto const f_bit_idx = first % Bitvector::IntSize; + auto const l_bit_idx = last % Bitvector::IntSize; + assert( f_wrd_idx < bv.data().size() ); + assert( l_wrd_idx < bv.data().size() ); + assert( f_bit_idx < Bitvector::ONES_MASKS.size() ); + assert( l_bit_idx < Bitvector::ONES_MASKS.size() ); + + // Get the two words at the boundary. We later check if they are the same, + // so we do not repeat the code here, and treat the special case later. + auto f_word = bv.data()[ f_wrd_idx ]; + auto l_word = bv.data()[ l_wrd_idx ]; + + // Mask out the beginning and end, respectively. + // Remove all bits before the first index, and all bits after and including the last index. + // No special case needed here, as the 0th mask is idempotent. + // That's because it's the mask that we also use for unset_padding_bits(), + // we are basically doing the same here. + f_word &= ~Bitvector::ONES_MASKS[ f_bit_idx ]; + if( l_bit_idx != 0 ) { + l_word &= Bitvector::ONES_MASKS[ l_bit_idx ]; + } + + // Finally, count up all the parts. + size_t result = 0; + if( f_wrd_idx == l_wrd_idx ) { + // Same word. Mask out the bits we don't want, using only the bits that remained after + // filtering both words (which are the same, just different ends of the word), then count. + result = pop_count( f_word & l_word ); + } else { + // Count the first and last word, and then add everything in between the two. + result = pop_count( f_word ) + pop_count( l_word ); + for( size_t i = f_wrd_idx + 1; i < l_wrd_idx; ++i ) { + result += pop_count( bv.data()[i] ); + } + } + return result; +} + +// ------------------------------------------------------------------------- +// Set Operators +// ------------------------------------------------------------------------- + +Bitvector set_minus( Bitvector const& lhs, Bitvector const& rhs ) +{ + return lhs & (~rhs); +} + +Bitvector symmetric_difference( Bitvector const& lhs, Bitvector const& rhs ) +{ + return (lhs | rhs) & ~(lhs & rhs); +} + +bool is_strict_subset( Bitvector const& sub, Bitvector const& super ) +{ + // Not really efficient. We could stop early in the comparison instead. + // But as of now, we do not need the speed, so let's keep it simple instead. + // Same for the other variants of this function below. + return ((sub & super) == sub) && (pop_count(sub) < pop_count(super)); +} + +bool is_strict_superset( Bitvector const& super, Bitvector const& sub ) +{ + return is_strict_subset( sub, super ); +} + +bool is_subset( Bitvector const& sub, Bitvector const& super ) +{ + return (sub == super) || is_strict_subset(sub, super); +} + +bool is_superset( Bitvector const& super, Bitvector const& sub ) +{ + return (super == sub) || is_strict_superset(super, sub); +} + +// ------------------------------------------------------------------------- +// Distances +// ------------------------------------------------------------------------- + +double jaccard_similarity( Bitvector const& lhs, Bitvector const& rhs ) +{ + if( lhs.size() != rhs.size() ) { + throw std::invalid_argument( + "Cannot compute Jaccard similarity between Bitvectors of different size" + ); + } + + // Shorthands + auto const& lhs_data = lhs.data(); + auto const& rhs_data = rhs.data(); + + // Use the bitvector data directly to count + // the number of bits in the intersection and the union + size_t sum_i = 0; + size_t sum_u = 0; + for( size_t i = 0; i < lhs_data.size(); ++i ) { + sum_i += pop_count( lhs_data[i] & rhs_data[i] ); + sum_u += pop_count( lhs_data[i] | rhs_data[i] ); + } + + // Alternative (10x slower) way using operations on the whole bitvectors. + // auto const sum_i = pop_count( lhs & rhs ); + // auto const sum_u = pop_count( lhs | rhs ); + + // Compute the index, taking care of the edge case. + if( sum_u == 0 ) { + assert( sum_i == 0 ); + return 0.0; + } + return static_cast( sum_i ) / static_cast( sum_u ); +} + +double jaccard_distance( Bitvector const& lhs, Bitvector const& rhs ) +{ + return 1.0 - jaccard_similarity( lhs, rhs ); +} + +size_t hamming_distance( Bitvector const& lhs, Bitvector const& rhs ) +{ + if( lhs.size() != rhs.size() ) { + throw std::invalid_argument( + "Cannot compute Hamming distance between Bitvectors of different size" + ); + } + + // Shorthands + auto const& lhs_data = lhs.data(); + auto const& rhs_data = rhs.data(); + + // Use the bitvector data directly to count the number of resulting bits. + size_t sum = 0; + for( size_t i = 0; i < lhs_data.size(); ++i ) { + sum += pop_count( lhs_data[i] ^ rhs_data[i] ); + } + return sum; +} + +// ------------------------------------------------------------------------- +// Fit Set Bits +// ------------------------------------------------------------------------- + +bool any_set( Bitvector const& bv ) +{ + for( auto word : bv.data() ) { + if( word > 0 ) { + return true; + } + } + return false; +} + +size_t find_first_set( Bitvector const& bv ) +{ + return find_next_set( bv, 0 ); +} + +size_t find_last_set( Bitvector const& bv ) +{ + auto const& data = bv.data(); + if( data.size() == 0 ) { + return Bitvector::npos; + } + + // We currently do not have a find_prev_set() function. If we implement one at some point, + // this function here should instead simply call that one. For now however, we implement + // only what we need here. + + // Find the last word that is non-zero. We use that unsigned ints have defined underflow + // behavior, so if we wrap around at word index 0, we just go to int max, and stop. + // This also works for the boundary case of an empty bitvector. + size_t wrd_idx = data.size() - 1; + while( wrd_idx < data.size() && data[wrd_idx] == 0 ) { + --wrd_idx; + } + + // Boundary condition: No bits set, or empty vector + if( wrd_idx >= data.size() ) { + return Bitvector::npos; + } + + // Now find the last bit in the word. For now, we do a simple bit-based loop, + // but we could speed this up with a similar ctz/clz approach as in find_next_set(). + // We start at either the end position of the word index that we found above, + // or, if that's the last word of the vector, we start at its last bit. + assert( wrd_idx < data.size() ); + assert( data[wrd_idx] != 0 ); + auto bit_idx = std::min( wrd_idx * Bitvector::IntSize + Bitvector::IntSize - 1, bv.size() - 1 ); + while( bit_idx < data.size() * Bitvector::IntSize ) { + if( bv.get(bit_idx) ) { + return bit_idx; + } + --bit_idx; + } + + // We must have found a bit above. + throw std::runtime_error( "Internal error in Bitvector::find_last_set()" ); +} + +size_t find_next_set( Bitvector const& bv, size_t start ) +{ + // Boundary check + if( start >= bv.size() ) { + // We mimic the behaviour of std::string::find(), which just never finds anything + // when used beyond the string, but also does not throw an exception in such cases. + return Bitvector::npos; + + // Alternative: Throw an exception. + // throw std::invalid_argument( + // "Invalid call to find_next_set() with start==" + std::to_string(start) + + // " and a Bitvector of size " + std::to_string( size_ ) + // ); + } + + // Helper function to find the index of the first set bit in a non-zero word. + auto find_next_set_in_word_ = []( Bitvector::IntType word ) { + assert( word != 0 ); + + // We use ffs here, see https://man7.org/linux/man-pages/man3/ffs.3.html + // It returns the _position_ of the bit, so we need to subtract 1 to get the index. + // Alternatively, we could use __builtin_ctz, which returns the number of trailing + // zeros in the given word, but is a compiler intrinsic, so let's stay with POSIX. + + // Check the size of the input and call the appropriate ffs function. + // Any good compiler will see through this and make this constexpr. + // In C++17, we could do this ourselves ;-) + if( sizeof(word) <= sizeof(unsigned int) ) { + return ffs( static_cast( word )) - 1; + } else if( sizeof(word) <= sizeof(unsigned long) ) { + return ffsl( static_cast( word )) - 1; + } else { + return ffsll(word) - 1; + } + }; + + // First see if there is anything in the word at the start position. + // We assume that this function might be called on a dense bitvector, + // where the given position is already set, so we check that as a shortcut. + if( bv.get( start )) { + return start; + } + + // If that did not work, we see if there is anything in the current word. + auto wrd_idx = start / Bitvector::IntSize; + auto bit_idx = start % Bitvector::IntSize; + assert( wrd_idx < bv.data().size() ); + assert( bit_idx < Bitvector::ONES_MASKS.size() ); + auto word = bv.data()[wrd_idx]; + + // For this, we remove the bits before start, and then test the rest. + // Mask out the beginning of the word, and find the next bit on the remainder. + // Remove all bits before the first index. + word &= ~( Bitvector::ONES_MASKS[ bit_idx ]); + if( word != 0 ) { + return wrd_idx * Bitvector::IntSize + find_next_set_in_word_( word ); + } + + // We did not find a bit in the word of the start. So now we look for the first word + // after the start one that has bits set, and return its first set bit position. + ++wrd_idx; + while( wrd_idx < bv.data().size() && bv.data()[wrd_idx] == 0 ) { + ++wrd_idx; + } + if( wrd_idx == bv.data().size() ) { + return Bitvector::npos; + + } + assert( wrd_idx < bv.data().size() ); + assert( bv.data()[wrd_idx] != 0 ); + return wrd_idx * Bitvector::IntSize + find_next_set_in_word_( bv.data()[wrd_idx] ); +} + +// ------------------------------------------------------------------------- +// Sorting +// ------------------------------------------------------------------------- + +// bool lexicographically_compare_helper_( Bitvector const& lhs, Bitvector const& rhs, bool on_equal ) +// { +// // Deactivated at the moment, as this does not take care of the typical little endian-ness +// // of modern computers, and hence yields wrong results... +// +// // Local helper function to avoid code duplication. +// if( lhs.size() != rhs.size() ) { +// throw std::runtime_error( +// "Cannot use lexicographical comparison functions on Bitvectors of different size." +// ); +// } +// for( size_t i = 0; i < lhs.data().size(); ++i ) { +// if( lhs.data()[i] < rhs.data()[i] ) { +// return true; +// } else if( lhs.data()[i] > rhs.data()[i] ) { +// return false; +// } +// } +// +// // If we are here, all of the above comparisons shows that lhs == rhs. +// assert( lhs == rhs ); +// return on_equal; +// } +// +// bool is_lexicographically_less( Bitvector const& lhs, Bitvector const& rhs ) +// { +// return lexicographically_compare_helper_( lhs, rhs, false ); +// } +// +// bool is_lexicographically_less_or_equal( Bitvector const& lhs, Bitvector const& rhs ) +// { +// return lexicographically_compare_helper_( lhs, rhs, true ); +// } +// +// bool is_lexicographically_greater( Bitvector const& lhs, Bitvector const& rhs ) +// { +// return lexicographically_compare_helper_( rhs, lhs, false ); +// } +// +// bool is_lexicographically_greater_or_equal( Bitvector const& lhs, Bitvector const& rhs ) +// { +// return lexicographically_compare_helper_( rhs, lhs, true ); +// } + +} // namespace utils +} // namespace genesis diff --git a/lib/genesis/utils/math/bitvector/functions.hpp b/lib/genesis/utils/math/bitvector/functions.hpp new file mode 100644 index 00000000..cfb86957 --- /dev/null +++ b/lib/genesis/utils/math/bitvector/functions.hpp @@ -0,0 +1,267 @@ +#ifndef GENESIS_UTILS_MATH_BITVECTOR_FUNCTIONS_H_ +#define GENESIS_UTILS_MATH_BITVECTOR_FUNCTIONS_H_ + +/* + Genesis - A toolkit for working with phylogenetic data. + Copyright (C) 2014-2024 Lucas Czech + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + + Contact: + Lucas Czech + University of Copenhagen, Globe Institute, Section for GeoGenetics + Oster Voldgade 5-7, 1350 Copenhagen K, Denmark +*/ + +/** + * @brief + * + * @file + * @ingroup utils + */ + +#include "genesis/utils/math/bitvector.hpp" + +#include +#include +#include + +namespace genesis { +namespace utils { + +// ================================================================================================= +// Bitvector Functions +// ================================================================================================= + +// ------------------------------------------------------------------------- +// Creation +// ------------------------------------------------------------------------- + +/** + * @brief Helper function to create a bool vector from a set of indices to be set to `true`. + * + * The function expectes a list of indices. It returns a bool vector with the size of the largest + * index, or the provided @p size (if set to a value > 0), where all positions of these indices are + * `true`, and all other positions are `false`. + */ +std::vector make_bool_vector_from_indices( + std::vector const& indices, + size_t size = 0 +); + +/** + * @brief Create a Bitvector of a given @p size, with randomly initialized bits, mostly for testing. + */ +Bitvector make_random_bitvector( size_t size ); + +// ------------------------------------------------------------------------- +// Modification +// ------------------------------------------------------------------------- + +/** +* @brief Flip all bits. Alias for invert(). +*/ +void negate( Bitvector& bv ); + +/** +* @brief Flip all bits. Alias for negate(). +*/ +void invert( Bitvector& bv ); + +/** +* @brief Bring the Bitvector in a normalized form, where the first bit is always zero. +* +* If the first bit is zero, nothing happens. However, if is is one, the whole Bitvector is flipped +* using negate(). +*/ +void normalize( Bitvector& bv ); + +// ------------------------------------------------------------------------- +// Hashing +// ------------------------------------------------------------------------- + +/** +* @brief Return an std::hash value for the Bitvector. +*/ +size_t bitvector_hash( Bitvector const& bv ); + +/** +* @brief Return a hash value of type IntType that is quicker to calculate than hash(). +* +* This can be used for obtaining a simple hash using xor of the words. +* The avalanche effect is of course not present, but for many applications, this hash is +* good enough and quite useful. +*/ +Bitvector::IntType bitvector_x_hash( Bitvector const& bv ); + +// ------------------------------------------------------------------------- +// Pop Count +// ------------------------------------------------------------------------- + +/** + * @brief Count the number of set bits in the Bitvector, that is, its Hamming weight, + * or population count (popcnt). + */ +size_t pop_count( Bitvector const& bv ); + +/** + * @brief Count the number of set bits between a range of indices in the Bitvector, + * that is, its Hamming weight, or population count (popcnt), for that range. + * + * The range @p first to @p last is zero-based, with last being the past-the-end index. + * Hence, this is the same as + * + * size_t count = 0; + * for( size_t i = first; i < last; ++i ) { + * if( bitvector.get( i )) { + * ++count; + * } + * } + * + * but faster, as we use whole-word counting internally. + */ +size_t pop_count( Bitvector const& bv, size_t first, size_t last ); + +// ------------------------------------------------------------------------- +// Set Operators +// ------------------------------------------------------------------------- + +/** + * @brief Compute the set minus `lhs & (~rhs)` between two Bitvector%s. + */ +Bitvector set_minus( Bitvector const& lhs, Bitvector const& rhs ); + +/** + * @brief Compute the symmetric differeence `(lhs | rhs) & ~(lhs & rhs)` between two Bitvector%s. + */ +Bitvector symmetric_difference( Bitvector const& lhs, Bitvector const& rhs ); + +/** + * @brief Strict subset. + */ +bool is_strict_subset( Bitvector const& sub, Bitvector const& super ); + +/** + * @brief Strict superset. + */ +bool is_strict_superset( Bitvector const& super, Bitvector const& sub ); + +/** + * @brief Subset or equal. + */ +bool is_subset( Bitvector const& sub, Bitvector const& super ); + +/** + * @brief Superset or equal. + */ +bool is_superset( Bitvector const& super, Bitvector const& sub ); + +// ------------------------------------------------------------------------- +// Distances +// ------------------------------------------------------------------------- + +/** + * @brief Compute the Jaccard index (Jaccard similarity coefficient) for two Bitvector%s + * of the same size. + * + * This is simply the count of bits in the intersection divided by the count of bits in the union + * of the Bitvectors. + */ +double jaccard_similarity( Bitvector const& lhs, Bitvector const& rhs ); + +/** + * @brief Compute the Jaccard distance for two Bitvector%s of the same size. + * + * This dissimilarity is simply 1 - jaccard_similarity(). + */ +double jaccard_distance( Bitvector const& lhs, Bitvector const& rhs ); + +/** + * @brief Compute the Hamming distance between two Bitvector%s, + * i.e., the Hamming weight (pop count) of the `xor` of the inputs. + */ +size_t hamming_distance( Bitvector const& lhs, Bitvector const& rhs ); + +// ------------------------------------------------------------------------- +// Fit Set Bits +// ------------------------------------------------------------------------- + +/** + * @brief Return if any bits are set at all. + */ +bool any_set( Bitvector const& bv ); + +/** + * @brief Return the index of the first bit in the Bitvector that is set. + * + * If no such position exists (because all bits are `false`), Bitvector::npos + * is returned. + */ +size_t find_first_set( Bitvector const& bv ); + +/** + * @brief Return the index of the last bit in the Bitvector that is set. + * + * If no such position exists (because all bits are `false`), Bitvector::npos + * is returned. + */ +size_t find_last_set( Bitvector const& bv ); + +/** + * @brief Return the index of the next position in the Bitvector that is set. + * + * This returns the first position starting at @p start, including @p start itself, that is set. + * If no such position exists (because all following bits are `false`), or if @p start is beyond + * the length of the vector, Bitvector::npos is returned instead. + */ +size_t find_next_set( Bitvector const& bv, size_t start ); + +// ------------------------------------------------------------------------- +// Sorting +// ------------------------------------------------------------------------- + +// /* * +// * @brief Return whether @p lhs is lexicographically less than @p rhs. +// * +// * This function consideres the bits ordered with most significant bits towards the left, so that +// * `0001 < 1000` for example. Both Bitvector%s have to have the same length (this could be +// * implemented for Bitvectors of different size, but that is not needed as of now, so it isn't). +// */ +// bool is_lexicographically_less( Bitvector const& lhs, Bitvector const& rhs ); +// +// /* * +// * @brief Return whether @p lhs is lexicographically less than @p rhs, or equal to it. +// * +// * @copydetails is_lexicographically_less() +// */ +// bool is_lexicographically_less_or_equal( Bitvector const& lhs, Bitvector const& rhs ); +// +// /* * +// * @brief Return whether @p lhs is lexicographically greater than @p rhs. +// * +// * @copydetails is_lexicographically_less() +// */ +// bool is_lexicographically_greater( Bitvector const& lhs, Bitvector const& rhs ); +// +// /* * +// * @brief Return whether @p lhs is lexicographically greater than @p rhs, or equal to it. +// * +// * @copydetails is_lexicographically_less() +// */ +// bool is_lexicographically_greater_or_equal( Bitvector const& lhs, Bitvector const& rhs ); + +} // namespace utils +} // namespace genesis + +#endif // include guard diff --git a/lib/genesis/utils/math/bitvector/operators.cpp b/lib/genesis/utils/math/bitvector/operators.cpp index a8d70a62..763bfae4 100644 --- a/lib/genesis/utils/math/bitvector/operators.cpp +++ b/lib/genesis/utils/math/bitvector/operators.cpp @@ -154,152 +154,23 @@ Bitvector bitwise_xor( } // ------------------------------------------------------------------------- -// Set Operators +// Input and Output // ------------------------------------------------------------------------- -Bitvector set_minus( Bitvector const& lhs, Bitvector const& rhs ) -{ - return lhs & (~rhs); -} - -Bitvector symmetric_difference( Bitvector const& lhs, Bitvector const& rhs ) -{ - return (lhs | rhs) & ~(lhs & rhs); -} - -bool is_strict_subset( Bitvector const& sub, Bitvector const& super ) -{ - // Not really efficient. We could stop early in the comparison instead. - // But as of now, we do not need the speed, so let's keep it simple instead. - // Same for the other variants of this function below. - return ((sub & super) == sub) && (sub.count() < super.count()); -} - -bool is_strict_superset( Bitvector const& super, Bitvector const& sub ) -{ - return is_strict_subset( sub, super ); -} - -bool is_subset( Bitvector const& sub, Bitvector const& super ) -{ - return (sub == super) || is_strict_subset(sub, super); -} - -bool is_superset( Bitvector const& super, Bitvector const& sub ) -{ - return (super == sub) || is_strict_superset(super, sub); -} - -double jaccard_similarity( Bitvector const& lhs, Bitvector const& rhs ) -{ - if( lhs.size() != rhs.size() ) { - throw std::invalid_argument( - "Cannot compute Jaccard similarity between Bitvectors of different size" - ); - } - - // Shorthands - auto const& lhs_data = lhs.data(); - auto const& rhs_data = rhs.data(); - - // Use the bitvector data directly to count - // the number of bits in the intersection and the union - size_t sum_i = 0; - size_t sum_u = 0; - for( size_t i = 0; i < lhs_data.size(); ++i ) { - sum_i += pop_count( lhs_data[i] & rhs_data[i] ); - sum_u += pop_count( lhs_data[i] | rhs_data[i] ); - } - - // Alternative (10x slower) way using bitvector operations. - // auto const sum_i = ( lhs & rhs ).count(); - // auto const sum_u = ( lhs | rhs ).count(); - - // Compute the index, taking care of the edge case. - if( sum_u == 0 ) { - assert( sum_i == 0 ); - return 0.0; - } - return static_cast( sum_i ) / static_cast( sum_u ); -} - -double jaccard_distance( Bitvector const& lhs, Bitvector const& rhs ) -{ - return 1.0 - jaccard_similarity( lhs, rhs ); -} - -size_t hamming_distance( Bitvector const& lhs, Bitvector const& rhs ) +std::string to_bit_string( Bitvector const& bv, bool with_size ) { - if( lhs.size() != rhs.size() ) { - throw std::invalid_argument( - "Cannot compute Hamming distance between Bitvectors of different size" - ); - } - - // Shorthands - auto const& lhs_data = lhs.data(); - auto const& rhs_data = rhs.data(); - - // Use the bitvector data directly to count the number of resulting bits. - size_t sum = 0; - for( size_t i = 0; i < lhs_data.size(); ++i ) { - sum += pop_count( lhs_data[i] ^ rhs_data[i] ); + std::string res = ( with_size ? "[" + std::to_string( bv.size() ) + "]\n" : "" ); + for( size_t i = 0; i < bv.size(); ++i ) { + res += bv[i] ? "1" : "0"; + if( i+1 < bv.size() && (i+1) % 64 == 0 ) { + res += "\n"; + } else if( i+1 < bv.size() && (i+1) % 8 == 0 ) { + res += " "; + } } - return sum; + return res; } -// ------------------------------------------------------------------------- -// Sorting -// ------------------------------------------------------------------------- - -// bool lexicographically_compare_helper_( Bitvector const& lhs, Bitvector const& rhs, bool on_equal ) -// { -// // Deactivated at the moment, as this does not take care of the typical little endian-ness -// // of modern computers, and hence yields wrong results... -// -// // Local helper function to avoid code duplication. -// if( lhs.size() != rhs.size() ) { -// throw std::runtime_error( -// "Cannot use lexicographical comparison functions on Bitvectors of different size." -// ); -// } -// for( size_t i = 0; i < lhs.data().size(); ++i ) { -// if( lhs.data()[i] < rhs.data()[i] ) { -// return true; -// } else if( lhs.data()[i] > rhs.data()[i] ) { -// return false; -// } -// } -// -// // If we are here, all of the above comparisons shows that lhs == rhs. -// assert( lhs == rhs ); -// return on_equal; -// } -// -// bool is_lexicographically_less( Bitvector const& lhs, Bitvector const& rhs ) -// { -// return lexicographically_compare_helper_( lhs, rhs, false ); -// } -// -// bool is_lexicographically_less_or_equal( Bitvector const& lhs, Bitvector const& rhs ) -// { -// return lexicographically_compare_helper_( lhs, rhs, true ); -// } -// -// bool is_lexicographically_greater( Bitvector const& lhs, Bitvector const& rhs ) -// { -// return lexicographically_compare_helper_( rhs, lhs, false ); -// } -// -// bool is_lexicographically_greater_or_equal( Bitvector const& lhs, Bitvector const& rhs ) -// { -// return lexicographically_compare_helper_( rhs, lhs, true ); -// } - -// ------------------------------------------------------------------------- -// Input and Output -// ------------------------------------------------------------------------- - std::ostream& operator << (std::ostream& s, Bitvector const& bv) { for( size_t i = 0; i < bv.size(); ++i ) { @@ -359,7 +230,7 @@ Deserializer& operator>>( Deserializer& deserializer, Bitvector& bv ) } if( bv.data_.size() > 0 ) { auto const back = bv.data_.back(); - bv.unset_padding_(); + bv.unset_padding_bits(); if( bv.data_.back() != back ) { throw std::invalid_argument( "Invalid (de)serialization of Bitvector where last bits after the actual size were set" @@ -370,34 +241,5 @@ Deserializer& operator>>( Deserializer& deserializer, Bitvector& bv ) return deserializer; } -std::vector make_bool_vector_from_indices( std::vector const& indices, size_t size ) -{ - // Get the largest element of the vector. If it's empty, we return an all-false vector. - auto max_it = std::max_element( indices.begin(), indices.end() ); - if( max_it == indices.end() ) { - return std::vector( size, false ); - } - size_t target_size = *max_it + 1; - if( size > 0 ) { - if( target_size > size ) { - throw std::invalid_argument( - "Cannot use make_bool_vector_from_indices() with size " + std::to_string( size ) + - " that is smaller than required to include the larged index " + - std::to_string( *max_it ) + " in the list of indices (zero-based)." - ); - } - target_size = size; - } - - // Fill a bool vector, setting all positions to true - // that are indicated by the indices, pun intended. - auto result = std::vector( target_size, false ); - for( auto const& idx : indices ) { - assert( idx < result.size() ); - result[idx] = true; - } - return result; -} - } // namespace utils } // namespace genesis diff --git a/lib/genesis/utils/math/bitvector/operators.hpp b/lib/genesis/utils/math/bitvector/operators.hpp index b75bca75..fbd6f4ec 100644 --- a/lib/genesis/utils/math/bitvector/operators.hpp +++ b/lib/genesis/utils/math/bitvector/operators.hpp @@ -19,9 +19,9 @@ along with this program. If not, see . Contact: - Lucas Czech - Department of Plant Biology, Carnegie Institution For Science - 260 Panama Street, Stanford, CA 94305, USA + Lucas Czech + University of Copenhagen, Globe Institute, Section for GeoGenetics + Oster Voldgade 5-7, 1350 Copenhagen K, Denmark */ /** @@ -32,6 +32,7 @@ */ #include "genesis/utils/math/bitvector.hpp" +#include "genesis/utils/math/bitvector/functions.hpp" #include "genesis/utils/io/deserializer.hpp" #include "genesis/utils/io/serializer.hpp" @@ -131,98 +132,16 @@ Bitvector bitwise_xor( ); // ------------------------------------------------------------------------- -// Set Operators +// Input and Output // ------------------------------------------------------------------------- /** - * @brief Compute the set minus `lhs & (~rhs)` between two Bitvector%s. - */ -Bitvector set_minus( Bitvector const& lhs, Bitvector const& rhs ); - -/** - * @brief Compute the symmetric differeence `(lhs | rhs) & ~(lhs & rhs)` between two Bitvector%s. - */ -Bitvector symmetric_difference( Bitvector const& lhs, Bitvector const& rhs ); - -/** - * @brief Strict subset. - */ -bool is_strict_subset( Bitvector const& sub, Bitvector const& super ); - -/** - * @brief Strict superset. - */ -bool is_strict_superset( Bitvector const& super, Bitvector const& sub ); - -/** - * @brief Subset or equal. - */ -bool is_subset( Bitvector const& sub, Bitvector const& super ); - -/** - * @brief Superset or equal. - */ -bool is_superset( Bitvector const& super, Bitvector const& sub ); - -/** - * @brief Compute the Jaccard index (Jaccard similarity coefficient) for two Bitvector%s - * of the same size. - * - * This is simply the count of bits in the intersection divided by the count of bits in the union - * of the Bitvectors. - */ -double jaccard_similarity( Bitvector const& lhs, Bitvector const& rhs ); - -/** - * @brief Compute the Jaccard distance for two Bitvector%s of the same size. + * @brief Print the bits of a Bitvector to a string. * - * This dissimilarity is simply 1 - jaccard_similarity(). + * If @p with_size is set, the string is prefixed by the size of the string in square brackets, + * such as `[5] 01011`, which is useful for debugging. */ -double jaccard_distance( Bitvector const& lhs, Bitvector const& rhs ); - -/** - * @brief Compute the Hamming distance between two Bitvector%s, - * i.e., the Hamming weight (pop count) of the `xor` of the inputs. - */ -size_t hamming_distance( Bitvector const& lhs, Bitvector const& rhs ); - -// ------------------------------------------------------------------------- -// Sorting -// ------------------------------------------------------------------------- - -// /* * -// * @brief Return whether @p lhs is lexicographically less than @p rhs. -// * -// * This function consideres the bits ordered with most significant bits towards the left, so that -// * `0001 < 1000` for example. Both Bitvector%s have to have the same length (this could be -// * implemented for Bitvectors of different size, but that is not needed as of now, so it isn't). -// */ -// bool is_lexicographically_less( Bitvector const& lhs, Bitvector const& rhs ); -// -// /* * -// * @brief Return whether @p lhs is lexicographically less than @p rhs, or equal to it. -// * -// * @copydetails is_lexicographically_less() -// */ -// bool is_lexicographically_less_or_equal( Bitvector const& lhs, Bitvector const& rhs ); -// -// /* * -// * @brief Return whether @p lhs is lexicographically greater than @p rhs. -// * -// * @copydetails is_lexicographically_less() -// */ -// bool is_lexicographically_greater( Bitvector const& lhs, Bitvector const& rhs ); -// -// /* * -// * @brief Return whether @p lhs is lexicographically greater than @p rhs, or equal to it. -// * -// * @copydetails is_lexicographically_less() -// */ -// bool is_lexicographically_greater_or_equal( Bitvector const& lhs, Bitvector const& rhs ); - -// ------------------------------------------------------------------------- -// Input and Output -// ------------------------------------------------------------------------- +std::string to_bit_string( Bitvector const& bv, bool with_size = false ); /** * @brief Insertion operator that outputs a Bitvector as a string of '0's and '1's. @@ -265,19 +184,79 @@ inline size_t serialized_bitvector_size( Bitvector const& bv ) return serialized_bitvector_size( bv.size() ); } +// ------------------------------------------------------------------------- +// Hashing +// ------------------------------------------------------------------------- + /** - * @brief Helper function to create a bool vector from a set of indices to be set to `true`. + * @brief Helper structure that yields the hash of a given Bitvector. + * + * It is meant to be used in containers such as `unordered_set` or `unordered_map` + * that can make use of custom hash functions for the key objects. By default, + * these containers use a specialization of the `std::hash` template, which we also offer, + * and that also uses the Bitvector::hash() function. * - * The function expectes a list of indices. It returns a bool vector with the size of the largest - * index, or the provided @p size (if set to a value > 0), where all positions of these indices are - * `true`, and all other positions are `false`. + * Hence, this class here is slightly redundant, as it gives the same result as just using + * the `std::hash` specialization. Still, it might be useful to have. + * + * See also BitvectorXhash for an alternative version that uses Bitvector::x_hash() instead. */ -std::vector make_bool_vector_from_indices( - std::vector const& indices, - size_t size = 0 -); +struct BitvectorHash +{ + std::size_t operator() ( genesis::utils::Bitvector const& value ) const + { + return genesis::utils::bitvector_hash(value); + } +}; + +/** + * @brief Helper structure that yields the x_hash of a given Bitvector. + * + * It is meant to be used in containers such as `unordered_set` or `unordered_map` + * that can make use of custom hash functions for the key objects. Using this class instead + * of the standard `std::hash` specialization, the Bitvector::x_hash() function is used instead + * of the standard hash() function. It is hence faster to compute, but without avalanche effect. + * + * In some use cases, this might be preferrable - we however recommend to test this, in order to + * make sure that colliding hashes do not slow down the performance in the end. + * + * Note that the function needs to cast from Bitvector::IntType to std::size_t. + * On most modern systems, these are expecte to be the same, i.e., 64 bit unsigned integers. + * However, this might cause problem on systems where this is not the case. + */ +struct BitvectorXhash +{ + std::size_t operator() ( genesis::utils::Bitvector const& value ) const + { + return static_cast( bitvector_x_hash( value )); + } +}; } // namespace utils } // namespace genesis +// ============================================================================= +// Namespace std extension +// ============================================================================= + +namespace std { + +/** + * @brief Specialization of std::hash for the Bitvector class. + * + * It uses Bitvector::hash() for the hashing. See also BitvectorHash for an alternative class + * that does the same, but resides in the same namespace as Bitvector, and see BitvectorXhash + * for a variant that uses Bitvector::x_hash() instead as the hashing function. + */ +template<> +struct hash +{ + std::size_t operator() ( genesis::utils::Bitvector const& value ) const + { + return genesis::utils::bitvector_hash(value); + } +}; + +} // namespace std + #endif // include guard diff --git a/test/src/population/genome_locus_set.cpp b/test/src/population/genome_locus_set.cpp index ab27904c..8c719db6 100644 --- a/test/src/population/genome_locus_set.cpp +++ b/test/src/population/genome_locus_set.cpp @@ -35,6 +35,7 @@ #include "genesis/population/function/genome_region.hpp" #include "genesis/population/genome_locus_set.hpp" #include "genesis/population/genome_region.hpp" +#include "genesis/utils/math/bitvector/operators.hpp" using namespace genesis::population; using namespace genesis::utils; @@ -79,6 +80,10 @@ void test_genome_locus_set_operators_( auto result = list_1; result.set_intersect( list_2 ); + // LOG_DBG << to_bit_string( list_1.chromosome_positions( "A" )); + // LOG_DBG << to_bit_string( list_2.chromosome_positions( "A" )); + // LOG_DBG << to_bit_string( result.chromosome_positions( "A" )); + EXPECT_FALSE( result.is_covered( "A" )); EXPECT_FALSE( result.is_covered( "A", 0 )); EXPECT_FALSE( result.is_covered( "A", 4 )); @@ -121,6 +126,10 @@ void test_genome_locus_set_operators_( auto result = list_1; result.set_union( list_2 ); + // LOG_DBG << to_bit_string( list_1.chromosome_positions( "A" )); + // LOG_DBG << to_bit_string( list_2.chromosome_positions( "A" )); + // LOG_DBG << to_bit_string( result.chromosome_positions( "A" )); + EXPECT_TRUE( result.is_covered( "A" )); EXPECT_TRUE( result.is_covered( "A", 0 )); EXPECT_TRUE( result.is_covered( "A", 4 )); diff --git a/test/src/population/region_window_stream.cpp b/test/src/population/region_window_stream.cpp index 069fd4f9..5e0fbb20 100644 --- a/test/src/population/region_window_stream.cpp +++ b/test/src/population/region_window_stream.cpp @@ -36,6 +36,7 @@ #include "genesis/population/window/window.hpp" #include "genesis/utils/containers/generic_input_stream.hpp" #include "genesis/utils/math/bitvector.hpp" +#include "genesis/utils/math/bitvector/functions.hpp" #include "genesis/utils/math/common.hpp" #include "genesis/utils/math/random.hpp" #include "genesis/utils/text/string.hpp" @@ -303,7 +304,7 @@ void run_region_window_test_( } // We expect the result to be empty. // If not, there are variants that were not in the window. - EXPECT_EQ( 0, bv.count() ); + EXPECT_EQ( 0, pop_count( bv )); } // For each variant in the window, decrement its cover count. diff --git a/test/src/population/variant_gapless_input_stream.cpp b/test/src/population/variant_gapless_input_stream.cpp index 6d3395f3..5ab9104d 100644 --- a/test/src/population/variant_gapless_input_stream.cpp +++ b/test/src/population/variant_gapless_input_stream.cpp @@ -43,6 +43,8 @@ #include "genesis/utils/core/algorithm.hpp" #include "genesis/utils/core/std.hpp" #include "genesis/utils/math/bitvector.hpp" +#include "genesis/utils/math/bitvector/functions.hpp" +#include "genesis/utils/math/bitvector/operators.hpp" #include "genesis/utils/math/common.hpp" #include "genesis/utils/math/random.hpp" #include "genesis/utils/text/string.hpp" @@ -311,7 +313,7 @@ std::map test_gapless_input_stream_make_ramdom_bitvector for( auto s : selection ) { bv.set(s); } - EXPECT_EQ( filled, bv.count() ); + EXPECT_EQ( filled, pop_count( bv )); result[std::string( 1, 'A' + i )] = bv; } return result; @@ -326,7 +328,7 @@ std::vector test_gapless_input_stream_make_variants_( std::vector result; size_t total_variants = 0; for( auto const& bv : bitvectors ) { - total_variants += bv.second.count(); + total_variants += pop_count( bv.second ); for( size_t i = 0; i < bv.second.size(); ++i ) { if( !bv.second.get(i) ) { @@ -417,7 +419,7 @@ void test_gapless_input_stream_random_() } LOG_DBG << "var_bvs"; for( auto const& fp : var_bvs ) { - LOG_DBG1 << fp.first << ":" << fp.second.dump(); + LOG_DBG1 << fp.first << ":" << to_bit_string( fp.second ); } LOG_DBG << "vars"; for( auto const& var : vars ) { @@ -497,7 +499,7 @@ void test_gapless_input_stream_random_() } EXPECT_EQ( total_variants, present_variants + missing_variants ); for( auto const& fp : found_positions ) { - EXPECT_GT( fp.second.count(), 0 ); + EXPECT_GT( pop_count( fp.second ), 0 ); } // Check found positions @@ -515,7 +517,7 @@ void test_gapless_input_stream_random_() EXPECT_EQ( num_reg_chrs * 10, total_variants ); } for( auto const& fp : found_positions ) { - EXPECT_EQ( 10, fp.second.count() ); + EXPECT_EQ( 10, pop_count( fp.second )); } } else { // Without ref genome or seq dict, we only see the positions up until the last in the @@ -526,7 +528,7 @@ void test_gapless_input_stream_random_() auto const past_last = find_position_past_last_true_(bv.second); exp_total_variants += past_last; EXPECT_TRUE( found_positions.count(bv.first) > 0 ); - EXPECT_EQ( past_last, found_positions[bv.first].count() ); + EXPECT_EQ( past_last, pop_count( found_positions[bv.first] )); // Now scan the positions. All before the last need to true, all after false. EXPECT_LE( past_last, 10 ); @@ -543,7 +545,7 @@ void test_gapless_input_stream_random_() // Count up all the set positions in the original bitvectors to get the present variant count. size_t exp_present_variants = 0; for( auto const& bv : var_bvs ) { - exp_present_variants += bv.second.count(); + exp_present_variants += pop_count( bv.second ); } EXPECT_EQ( exp_present_variants, present_variants ); } diff --git a/test/src/utils/math/bitvector.cpp b/test/src/utils/math/bitvector.cpp index b067f665..0369e2ab 100644 --- a/test/src/utils/math/bitvector.cpp +++ b/test/src/utils/math/bitvector.cpp @@ -16,9 +16,9 @@ along with this program. If not, see . Contact: - Lucas Czech - Department of Plant Biology, Carnegie Institution For Science - 260 Panama Street, Stanford, CA 94305, USA + Lucas Czech + University of Copenhagen, Globe Institute, Section for GeoGenetics + Oster Voldgade 5-7, 1350 Copenhagen K, Denmark */ /** @@ -32,7 +32,9 @@ #include "genesis/utils/io/deserializer.hpp" #include "genesis/utils/io/serializer.hpp" +#include "genesis/utils/math/bit.hpp" #include "genesis/utils/math/bitvector.hpp" +#include "genesis/utils/math/bitvector/functions.hpp" #include "genesis/utils/math/bitvector/operators.hpp" #include "genesis/utils/math/random.hpp" #include "genesis/utils/text/string.hpp" @@ -46,6 +48,42 @@ using namespace genesis::utils; +TEST( Bits, ToBitString ) +{ + EXPECT_EQ( + "00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000", + to_bit_string( 0 ) + ); + EXPECT_EQ( + "00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000001", + to_bit_string( 1 ) + ); + EXPECT_EQ( + "00000001 10110110 10011011 01001011 10101100 11010000 01011111 00010101", + to_bit_string( 123456789123456789ULL ) + ); +} + +TEST( Bitvector, ToBitString ) +{ + EXPECT_EQ( + "", + to_bit_string( Bitvector( "" )) + ); + EXPECT_EQ( + "01010101 0101", + to_bit_string( Bitvector( "010101010101" )) + ); + EXPECT_EQ( + "01010101 01011010", + to_bit_string( Bitvector( "0101010101011010" )) + ); + EXPECT_EQ( + "01100", + to_bit_string( Bitvector( 5, { 1, 2 })) + ); +} + // Very slow. We use the built-in function of Bitvector now. // Bitvector make_random_bitvector_( size_t size ) // { @@ -88,14 +126,14 @@ TEST( Bitvector, Arithmetics ) EXPECT_EQ( Bitvector( "000011110000" ), ~bv2 ); // Test pop counting - EXPECT_EQ( 6, bv0.count() ); - EXPECT_EQ( 6, bv1.count() ); - EXPECT_EQ( 8, bv2.count() ); + EXPECT_EQ( 6, pop_count(bv0) ); + EXPECT_EQ( 6, pop_count(bv1) ); + EXPECT_EQ( 8, pop_count(bv2) ); // Test inverse as well, which also tests that the padding is 0 - EXPECT_EQ( 6, (~bv0).count() ); - EXPECT_EQ( 6, (~bv1).count() ); - EXPECT_EQ( 4, (~bv2).count() ); + EXPECT_EQ( 6, pop_count(~bv0) ); + EXPECT_EQ( 6, pop_count(~bv1) ); + EXPECT_EQ( 4, pop_count(~bv2) ); // Test some inequality as well EXPECT_FALSE( bv0 == bv1 ); @@ -323,7 +361,7 @@ TEST( Bitvector, JaccardIndexSpeed ) // LOG_TIME << "make bvs"; auto bvs = std::vector( n ); for( size_t i = 0; i < n; ++i ) { - bvs[i] = Bitvector::make_random_bitvector( s ); + bvs[i] = make_random_bitvector( s ); } // LOG_TIME << "compute jaccard"; @@ -373,7 +411,7 @@ TEST( Bitvector, SetRange ) { // Use the function to test auto bv = Bitvector( s, false ); - bv.set( f, l, true ); + bv.set_range( f, l, true ); // Make expected version using slow setter auto ex = Bitvector( s, false ); @@ -394,7 +432,7 @@ TEST( Bitvector, SetRange ) { // Use the function to test auto bv = Bitvector( s, true ); - bv.set( f, l, false ); + bv.set_range( f, l, false ); // Make expected version using slow setter auto ex = Bitvector( s, true ); @@ -419,58 +457,58 @@ TEST( Bitvector, CountRange ) { // 0 word Bitvector const bv_0( 0, true ); - EXPECT_ANY_THROW( bv_0.count( 0, 0 )); - EXPECT_ANY_THROW( bv_0.count( 0, 1 )); - EXPECT_ANY_THROW( bv_0.count( 1, 1 )); - EXPECT_ANY_THROW( bv_0.count( 1, 0 )); + EXPECT_ANY_THROW( pop_count( bv_0, 0, 0 )); + EXPECT_ANY_THROW( pop_count( bv_0, 0, 1 )); + EXPECT_ANY_THROW( pop_count( bv_0, 1, 1 )); + EXPECT_ANY_THROW( pop_count( bv_0, 1, 0 )); // 0.5 word Bitvector const bv_32( 32, true ); - EXPECT_EQ( 0, bv_32.count( 0, 0 )); - EXPECT_EQ( 0, bv_32.count( 1, 1 )); - EXPECT_EQ( 1, bv_32.count( 0, 1 )); - EXPECT_EQ( 1, bv_32.count( 31, 32 )); - EXPECT_EQ( 32, bv_32.count( 0, 32 )); + EXPECT_EQ( 0, pop_count( bv_32, 0, 0 )); + EXPECT_EQ( 0, pop_count( bv_32, 1, 1 )); + EXPECT_EQ( 1, pop_count( bv_32, 0, 1 )); + EXPECT_EQ( 1, pop_count( bv_32, 31, 32 )); + EXPECT_EQ( 32, pop_count( bv_32, 0, 32 )); - EXPECT_ANY_THROW( bv_32.count( 1, 0 )); - EXPECT_ANY_THROW( bv_32.count( 0, 33 )); - EXPECT_ANY_THROW( bv_32.count( 33, 33 )); + EXPECT_ANY_THROW( pop_count( bv_32, 1, 0 )); + EXPECT_ANY_THROW( pop_count( bv_32, 0, 33 )); + EXPECT_ANY_THROW( pop_count( bv_32, 33, 33 )); // 1 word Bitvector const bv_64( 64, true ); - EXPECT_EQ( 0, bv_64.count( 0, 0 )); - EXPECT_EQ( 0, bv_64.count( 1, 1 )); - EXPECT_EQ( 1, bv_64.count( 0, 1 )); - EXPECT_EQ( 1, bv_64.count( 63, 64 )); - EXPECT_EQ( 64, bv_64.count( 0, 64 )); + EXPECT_EQ( 0, pop_count( bv_64, 0, 0 )); + EXPECT_EQ( 0, pop_count( bv_64, 1, 1 )); + EXPECT_EQ( 1, pop_count( bv_64, 0, 1 )); + EXPECT_EQ( 1, pop_count( bv_64, 63, 64 )); + EXPECT_EQ( 64, pop_count( bv_64, 0, 64 )); - EXPECT_ANY_THROW( bv_64.count( 1, 0 )); - EXPECT_ANY_THROW( bv_64.count( 0, 65 )); - EXPECT_ANY_THROW( bv_64.count( 65, 64 )); + EXPECT_ANY_THROW( pop_count( bv_64, 1, 0 )); + EXPECT_ANY_THROW( pop_count( bv_64, 0, 65 )); + EXPECT_ANY_THROW( pop_count( bv_64, 65, 64 )); // 1.5 word Bitvector const bv_96( 96, true ); - EXPECT_EQ( 0, bv_96.count( 0, 0 )); - EXPECT_EQ( 0, bv_96.count( 1, 1 )); - EXPECT_EQ( 1, bv_96.count( 0, 1 )); - EXPECT_EQ( 1, bv_96.count( 95, 96 )); - EXPECT_EQ( 96, bv_96.count( 0, 96 )); + EXPECT_EQ( 0, pop_count( bv_96, 0, 0 )); + EXPECT_EQ( 0, pop_count( bv_96, 1, 1 )); + EXPECT_EQ( 1, pop_count( bv_96, 0, 1 )); + EXPECT_EQ( 1, pop_count( bv_96, 95, 96 )); + EXPECT_EQ( 96, pop_count( bv_96, 0, 96 )); - EXPECT_ANY_THROW( bv_96.count( 1, 0 )); - EXPECT_ANY_THROW( bv_96.count( 0, 97 )); - EXPECT_ANY_THROW( bv_96.count( 97, 97 )); + EXPECT_ANY_THROW( pop_count( bv_96, 1, 0 )); + EXPECT_ANY_THROW( pop_count( bv_96, 0, 97 )); + EXPECT_ANY_THROW( pop_count( bv_96, 97, 97 )); // 2.5 word Bitvector const bv_160( 160, true ); - EXPECT_EQ( 0, bv_160.count( 0, 0 )); - EXPECT_EQ( 0, bv_160.count( 1, 1 )); - EXPECT_EQ( 1, bv_160.count( 0, 1 )); - EXPECT_EQ( 1, bv_160.count( 159, 160 )); - EXPECT_EQ( 160, bv_160.count( 0, 160 )); - - EXPECT_ANY_THROW( bv_160.count( 1, 0 )); - EXPECT_ANY_THROW( bv_160.count( 0, 161 )); - EXPECT_ANY_THROW( bv_160.count( 161, 161 )); + EXPECT_EQ( 0, pop_count( bv_160, 0, 0 )); + EXPECT_EQ( 0, pop_count( bv_160, 1, 1 )); + EXPECT_EQ( 1, pop_count( bv_160, 0, 1 )); + EXPECT_EQ( 1, pop_count( bv_160, 159, 160 )); + EXPECT_EQ( 160, pop_count( bv_160, 0, 160 )); + + EXPECT_ANY_THROW( pop_count( bv_160, 1, 0 )); + EXPECT_ANY_THROW( pop_count( bv_160, 0, 161 )); + EXPECT_ANY_THROW( pop_count( bv_160, 161, 161 )); } TEST( Bitvector, CountRangeFuzzy ) @@ -489,7 +527,7 @@ TEST( Bitvector, CountRangeFuzzy ) } // Get some random bits - auto const bv = Bitvector::make_random_bitvector( size ); + auto const bv = make_random_bitvector( size ); // Get random positions between which to count. size_t s = std::rand() % size; @@ -502,7 +540,7 @@ TEST( Bitvector, CountRangeFuzzy ) ASSERT_LE( s, e ); // Get the count of bits between the two. - auto const cnt = bv.count( s, e ); + auto const cnt = pop_count( bv, s, e ); // Same, but slow, for comparison. size_t exp = 0; @@ -523,8 +561,8 @@ TEST( Bitvector, FindNextSet ) // 0 word Bitvector bv_0( 0 ); - EXPECT_EQ( max, bv_0.find_next_set( 0 )); - EXPECT_EQ( max, bv_0.find_next_set( 1 )); + EXPECT_EQ( max, find_next_set( bv_0, 0 )); + EXPECT_EQ( max, find_next_set( bv_0, 1 )); // 0.5 word Bitvector bv_32( 32 ); @@ -532,47 +570,47 @@ TEST( Bitvector, FindNextSet ) bv_32.set( 1 ); bv_32.set( 16 ); bv_32.set( 31 ); - EXPECT_EQ( 0, bv_32.find_next_set( 0 )); - EXPECT_EQ( 1, bv_32.find_next_set( 1 )); - EXPECT_EQ( 16, bv_32.find_next_set( 2 )); - EXPECT_EQ( 16, bv_32.find_next_set( 15 )); - EXPECT_EQ( 16, bv_32.find_next_set( 16 )); - EXPECT_EQ( 31, bv_32.find_next_set( 17 )); - EXPECT_EQ( 31, bv_32.find_next_set( 30 )); - EXPECT_EQ( 31, bv_32.find_next_set( 31 )); - EXPECT_EQ( max, bv_32.find_next_set( 32 )); - EXPECT_EQ( max, bv_32.find_next_set( 63 )); - EXPECT_EQ( max, bv_32.find_next_set( 64 )); + EXPECT_EQ( 0, find_next_set( bv_32, 0 )); + EXPECT_EQ( 1, find_next_set( bv_32, 1 )); + EXPECT_EQ( 16, find_next_set( bv_32, 2 )); + EXPECT_EQ( 16, find_next_set( bv_32, 15 )); + EXPECT_EQ( 16, find_next_set( bv_32, 16 )); + EXPECT_EQ( 31, find_next_set( bv_32, 17 )); + EXPECT_EQ( 31, find_next_set( bv_32, 30 )); + EXPECT_EQ( 31, find_next_set( bv_32, 31 )); + EXPECT_EQ( max, find_next_set( bv_32, 32 )); + EXPECT_EQ( max, find_next_set( bv_32, 63 )); + EXPECT_EQ( max, find_next_set( bv_32, 64 )); // 1 word Bitvector bv_64( 64 ); bv_64.set( 63 ); - EXPECT_EQ( 63, bv_64.find_next_set( 0 )); - EXPECT_EQ( 63, bv_64.find_next_set( 62 )); - EXPECT_EQ( 63, bv_64.find_next_set( 63 )); - EXPECT_EQ( max, bv_64.find_next_set( 64 )); + EXPECT_EQ( 63, find_next_set( bv_64, 0 )); + EXPECT_EQ( 63, find_next_set( bv_64, 62 )); + EXPECT_EQ( 63, find_next_set( bv_64, 63 )); + EXPECT_EQ( max, find_next_set( bv_64, 64 )); // 1.5 word Bitvector bv_96( 96 ); bv_96.set( 64 ); bv_96.set( 95 ); - EXPECT_EQ( 64, bv_96.find_next_set( 0 )); - EXPECT_EQ( 64, bv_96.find_next_set( 63 )); - EXPECT_EQ( 64, bv_96.find_next_set( 64 )); - EXPECT_EQ( 95, bv_96.find_next_set( 65 )); - EXPECT_EQ( 95, bv_96.find_next_set( 95 )); - EXPECT_EQ( max, bv_96.find_next_set( 96 )); + EXPECT_EQ( 64, find_next_set( bv_96, 0 )); + EXPECT_EQ( 64, find_next_set( bv_96, 63 )); + EXPECT_EQ( 64, find_next_set( bv_96, 64 )); + EXPECT_EQ( 95, find_next_set( bv_96, 65 )); + EXPECT_EQ( 95, find_next_set( bv_96, 95 )); + EXPECT_EQ( max, find_next_set( bv_96, 96 )); // 2.5 word Bitvector bv_160( 160 ); bv_160.set( 63 ); bv_160.set( 130 ); - EXPECT_EQ( 63, bv_160.find_next_set( 0 )); - EXPECT_EQ( 63, bv_160.find_next_set( 63 )); - EXPECT_EQ( 130, bv_160.find_next_set( 64 )); - EXPECT_EQ( 130, bv_160.find_next_set( 129 )); - EXPECT_EQ( 130, bv_160.find_next_set( 130 )); - EXPECT_EQ( max, bv_160.find_next_set( 131 )); + EXPECT_EQ( 63, find_next_set( bv_160, 0 )); + EXPECT_EQ( 63, find_next_set( bv_160, 63 )); + EXPECT_EQ( 130, find_next_set( bv_160, 64 )); + EXPECT_EQ( 130, find_next_set( bv_160, 129 )); + EXPECT_EQ( 130, find_next_set( bv_160, 130 )); + EXPECT_EQ( max, find_next_set( bv_160, 131 )); } TEST( Bitvector, FindNextSetFuzzy ) @@ -625,10 +663,10 @@ TEST( Bitvector, FindNextSetFuzzy ) next_in_vec = find_next_set_val_( selection, i ); } - EXPECT_EQ( next_in_vec, bv.find_next_set(i) ) << " at " << i; + EXPECT_EQ( next_in_vec, find_next_set( bv, i )) << " at " << i; } - EXPECT_EQ( std::numeric_limits::max(), bv.find_next_set( size )); - EXPECT_EQ( std::numeric_limits::max(), bv.find_next_set( size + 1 )); + EXPECT_EQ( std::numeric_limits::max(), find_next_set( bv, size )); + EXPECT_EQ( std::numeric_limits::max(), find_next_set( bv, size + 1 )); } } @@ -638,16 +676,16 @@ TEST( Bitvector, FindFirstLastSet ) for( size_t l = 0; l <= 1024; ++l ) { // Test no bit being set auto bv = Bitvector( l ); - EXPECT_EQ( Bitvector::npos, bv.find_first_set() ); - EXPECT_EQ( Bitvector::npos, bv.find_last_set() ); + EXPECT_EQ( Bitvector::npos, find_first_set( bv )); + EXPECT_EQ( Bitvector::npos, find_last_set( bv )); // Test exactly one bit being set, for all bits. for( size_t s = 0; s < l; ++s ) { // LOG_DBG << "l==" << l << " s==" << s; auto bv = Bitvector( l ); bv.set( s ); - EXPECT_EQ( s, bv.find_first_set() ); - EXPECT_EQ( s, bv.find_last_set() ); + EXPECT_EQ( s, find_first_set( bv )); + EXPECT_EQ( s, find_last_set( bv )); } } } @@ -659,10 +697,10 @@ TEST( Bitvector, Serialization ) // We test that a container of bitvectors also works, and internally test // different sizes that are either exact boundaries, or some arbitrary values. std::vector bvs; - bvs.push_back( Bitvector::make_random_bitvector( 42 )); - bvs.push_back( Bitvector::make_random_bitvector( 0 )); - bvs.push_back( Bitvector::make_random_bitvector( 512 )); - bvs.push_back( Bitvector::make_random_bitvector( 710 )); + bvs.push_back( make_random_bitvector( 42 )); + bvs.push_back( make_random_bitvector( 0 )); + bvs.push_back( make_random_bitvector( 512 )); + bvs.push_back( make_random_bitvector( 710 )); // Serialize std::ostringstream out;