Skip to content

Commit

Permalink
Merge pull request #15 from ssciwr/fix_9_uint16_distances
Browse files Browse the repository at this point in the history
use uint8 for distances to reduce memory use
  • Loading branch information
lkeegan authored Jun 22, 2021
2 parents 50fc382 + a86ec8d commit 1f7ae99
Show file tree
Hide file tree
Showing 12 changed files with 83 additions and 33 deletions.
14 changes: 1 addition & 13 deletions include/hamming/hamming.hh
Original file line number Diff line number Diff line change
@@ -1,24 +1,12 @@
#ifndef _HAMMING_HH
#define _HAMMING_HH

#include<array>
#include<cstdint>
#include"hamming/hamming_types.hh"
#include<string>
#include<vector>

namespace hamming {

/// Container pairing a sample count with a flat vector of integer results.
/// Only declarations live here; the member functions are defined elsewhere.
struct DataSet {
  /// Construct from a list of strings.
  /// NOTE(review): `clear_input_data` presumably lets the constructor release
  /// the caller's strings once they have been consumed -- confirm in the .cc.
  DataSet(std::vector<std::string>& data, bool clear_input_data = false);

  /// Construct from a single string.
  /// NOTE(review): presumably a file name to load from -- confirm in the .cc.
  DataSet(const std::string& filename);

  /// NOTE(review): presumably writes the data set to the named file -- confirm.
  void dump(const std::string& filename);

  /// Look up one integer value by a pair of indices.
  /// NOTE(review): presumably the distance between the two indexed samples.
  int operator[](const std::array<std::size_t, 2>& index) const;

  std::size_t nsamples;     // number of samples
  std::vector<int> result;  // flat storage for the computed values
};

// Free-function factories returning a DataSet (declarations only).
// NOTE(review): parameter meanings below are inferred from the function names
// and should be confirmed against the definitions.

// Build a DataSet from an in-memory list of strings.
DataSet from_stringlist(std::vector<std::string>& data);

// Build a DataSet from a CSV source identified by `filename`.
DataSet from_csv(const std::string& filename);

// Build a DataSet from a FASTA source identified by `filename`.
// `n` defaults to 0; presumably 0 means "no limit" -- TODO confirm.
DataSet from_fasta(const std::string& filename, std::size_t n = 0);
Expand Down
26 changes: 26 additions & 0 deletions include/hamming/hamming_types.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#ifndef _HAMMING_TYPES_HH
#define _HAMMING_TYPES_HH

#include<array>
#include<cstdint>
#include<string>
#include<vector>

namespace hamming {

/// Integer type used to store each entry of DataSet::result.
/// It is 8 bits wide, so values above 255 cannot be represented.
/// Spelled std::uint8_t because <cstdint> only guarantees the std:: name.
using DistIntType = std::uint8_t;

/// Container pairing a sample count with a flat vector of DistIntType results.
/// Only declarations live here; the member functions are defined elsewhere.
struct DataSet
{
  /// Construct from a list of strings.
  /// NOTE(review): `clear_input_data` presumably lets the constructor release
  /// the caller's strings once they have been consumed -- confirm in the .cc.
  DataSet(std::vector<std::string>& data, bool clear_input_data = false);

  /// Construct from a single string.
  /// NOTE(review): presumably a file name to load from -- confirm in the .cc.
  DataSet(const std::string& filename);

  /// NOTE(review): presumably writes the data set to the named file -- confirm.
  void dump(const std::string& filename);

  /// Look up one value by a pair of indices; the result is widened to int.
  /// NOTE(review): presumably the distance between the two indexed samples.
  int operator[](const std::array<std::size_t, 2>& index) const;

  std::size_t nsamples;             // number of samples
  std::vector<DistIntType> result;  // flat storage, one DistIntType per entry
};

} // namespace hamming

#endif
2 changes: 1 addition & 1 deletion src/distance_avx2.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#include "distance_avx2.hh"
#include "hamming_impl.hh"
#include "hamming_impl_types.hh"
#include <immintrin.h>

namespace hamming {
Expand Down
4 changes: 3 additions & 1 deletion src/distance_avx2.hh
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
#include <cstdint>
#include <vector>

#include "hamming_impl_types.hh"

namespace hamming {

// Distance between two byte sequences, returned as int.
// NOTE(review): presumably the AVX2 implementation of the Hamming distance
// over packed genes -- confirm in distance_avx2.cc.
// Signature spelled with the raw element type.
int distance_avx2(const std::vector<std::uint8_t>& a,
                  const std::vector<std::uint8_t>& b);

// Same function spelled with the project's GeneBlock alias
// (declared in hamming_impl_types.hh).
int distance_avx2(const std::vector<GeneBlock>& a,
                  const std::vector<GeneBlock>& b);

} // namespace hamming

Expand Down
1 change: 0 additions & 1 deletion src/distance_avx512.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#include "distance_avx512.hh"
#include "hamming_impl.hh"
#include <immintrin.h>

namespace hamming {
Expand Down
4 changes: 3 additions & 1 deletion src/distance_avx512.hh
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
#include <cstdint>
#include <vector>

#include "hamming_impl_types.hh"

namespace hamming {

// Distance between two byte sequences, returned as int.
// NOTE(review): presumably the AVX-512 implementation of the Hamming distance
// over packed genes -- confirm in distance_avx512.cc.
// Signature spelled with the raw element type.
int distance_avx512(const std::vector<std::uint8_t>& a,
                    const std::vector<std::uint8_t>& b);

// Same function spelled with the project's GeneBlock alias
// (declared in hamming_impl_types.hh).
int distance_avx512(const std::vector<GeneBlock>& a,
                    const std::vector<GeneBlock>& b);

} // namespace hamming

Expand Down
1 change: 0 additions & 1 deletion src/distance_sse2.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#include "distance_sse2.hh"
#include "hamming_impl.hh"
#include <immintrin.h>

namespace hamming {
Expand Down
4 changes: 3 additions & 1 deletion src/distance_sse2.hh
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
#include <cstdint>
#include <vector>

#include "hamming_impl_types.hh"

namespace hamming {

// Distance between two byte sequences, returned as int.
// NOTE(review): presumably the SSE2 implementation of the Hamming distance
// over packed genes -- confirm in distance_sse2.cc.
// Signature spelled with the raw element type.
int distance_sse2(const std::vector<std::uint8_t>& a,
                  const std::vector<std::uint8_t>& b);

// Same function spelled with the project's GeneBlock alias
// (declared in hamming_impl_types.hh).
int distance_sse2(const std::vector<GeneBlock>& a,
                  const std::vector<GeneBlock>& b);

} // namespace hamming

Expand Down
18 changes: 13 additions & 5 deletions src/hamming_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,15 @@ std::array<GeneBlock, 256> lookupTable()
return lookup;
}

std::vector<int> distances(std::vector<std::string>& data, bool clear_input_data){
std::vector<int> result((data.size() - 1) * data.size()/2, 0);
// Range-checked conversion of a distance computed as int to DistIntType.
//
// Returns x converted to DistIntType when it is representable.
// Throws std::runtime_error when x is negative (it would otherwise wrap
// silently into a valid-looking distance) or when x exceeds the largest
// value DistIntType can hold.
static DistIntType safe_int_cast(int x){
  if(x < 0){
    throw std::runtime_error("Error: Distance is negative");
  }
  if(x > std::numeric_limits<DistIntType>::max()){
    throw std::runtime_error("Error: Distance is too large for chosen integer type");
  }
  return static_cast<DistIntType>(x);
}

std::vector<DistIntType> distances(std::vector<std::string>& data, bool clear_input_data){
std::vector<DistIntType> result((data.size() - 1) * data.size()/2, 0);
auto sparse = to_sparse_data(data);
std::size_t nsamples{data.size()};
std::size_t sample_length{data[0].size()};
Expand All @@ -59,8 +66,9 @@ std::vector<int> distances(std::vector<std::string>& data, bool clear_input_data
#endif
for(std::size_t i=0; i<nsamples; ++i){
std::size_t offset{i * (i - 1) / 2};
for(std::size_t j=0; j<i; ++j)
result[offset + j] = distance_sparse(sparse[i], sparse[j]);
for(std::size_t j=0; j<i; ++j){
result[offset + j] = safe_int_cast(distance_sparse(sparse[i], sparse[j]));
}
}
return result;
}
Expand Down Expand Up @@ -93,7 +101,7 @@ std::vector<int> distances(std::vector<std::string>& data, bool clear_input_data
for(std::size_t i=0; i<nsamples; ++i){
std::size_t offset{i * (i - 1) / 2};
for(std::size_t j=0; j<i; ++j)
result[offset + j] = distance_func(dense[i], dense[j]);
result[offset + j] = safe_int_cast(distance_func(dense[i], dense[j]));
}
return result;
}
Expand Down
12 changes: 4 additions & 8 deletions src/hamming_impl.hh
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,14 @@
#include<string>
#include<vector>

namespace hamming {
#include"hamming/hamming_types.hh"
#include"hamming_impl_types.hh"

// 4-bit representation of gene:
using GeneBlock = std::uint_fast8_t;
using SparseData = std::vector<std::size_t>;
constexpr std::size_t n_bits_per_gene{4};
constexpr GeneBlock mask_gene0{0x0f};
constexpr GeneBlock mask_gene1{0xf0};
namespace hamming {

std::array<GeneBlock, 256> lookupTable();

std::vector<int> distances(std::vector<std::string>& data, bool clear_input_data);
std::vector<DistIntType> distances(std::vector<std::string>& data, bool clear_input_data);

int distance_sparse(const SparseData& a, const SparseData& b);

Expand Down
18 changes: 18 additions & 0 deletions src/hamming_impl_types.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#ifndef _HAMMING_IMPL_TYPES_HH
#define _HAMMING_IMPL_TYPES_HH

#include<array>
#include<cstddef>
#include<cstdint>
#include<vector>

namespace hamming {

// Storage unit for packed genes: each gene takes n_bits_per_gene (4) bits,
// so one 8-bit GeneBlock has room for two genes.
using GeneBlock = std::uint8_t;

// Sequence of std::size_t values used by the sparse code path.
// NOTE(review): presumably positions (and/or values) of the entries that
// differ from a reference -- confirm against to_sparse_data.
using SparseData = std::vector<std::size_t>;

// Width in bits of a single encoded gene.
constexpr std::size_t n_bits_per_gene{4};

// Bit masks selecting the low nibble and the high nibble of a GeneBlock.
constexpr GeneBlock mask_gene0{0x0f};
constexpr GeneBlock mask_gene1{0xf0};

} // namespace hamming

#endif
12 changes: 11 additions & 1 deletion src/hamming_t.cc
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ TEST_CASE("from_csv reproduces correct data", "[hamming]") {
std::mt19937 gen(12345);
std::vector<std::string> data(10);
for(auto& d : data)
d = make_test_string(1000, gen);
d = make_test_string(201, gen);

DataSet ref(data);
char tmp_file_name[L_tmpnam];
Expand All @@ -183,3 +183,13 @@ TEST_CASE("from_csv reproduces correct data", "[hamming]") {
}
std::remove(tmp_file_name);
}

// Constructing a DataSet from two sequences whose length is one more than the
// largest value DistIntType can hold, and which use a different character at
// every position, must throw with the overflow error message.
TEST_CASE("throws on distance integer overflow", "[hamming]") {
  // Widen before adding 1 so the length computation itself cannot overflow,
  // whatever integer type DistIntType is.
  const auto n = static_cast<std::string::size_type>(std::numeric_limits<DistIntType>::max()) + 1;
  std::vector<std::string> data(2);
  data[0] = std::string(n, 'A');
  data[1] = std::string(n, 'T');
  std::string msg{"Error: Distance is too large for chosen integer type"};
  REQUIRE_THROWS_WITH(DataSet(data), msg);
}

0 comments on commit 1f7ae99

Please sign in to comment.