-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add NEON simd implementation for arm64 cpus (#114)
- resolves #110
- Loading branch information
Showing 9 changed files with 202 additions and 11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
#pragma once | ||
|
||
#include <cstdint> | ||
#include <vector> | ||
|
||
#include "hamming/hamming_impl_types.hh" | ||
|
||
namespace hamming { | ||
|
||
/// Distance between two vectors of GeneBlocks, computed with NEON simd | ||
/// intrinsics (the implementation for arm64 cpus). | ||
/// | ||
/// @param a first vector of GeneBlocks | ||
/// @param b second vector of GeneBlocks | ||
/// @return the distance between a and b as an int | ||
/// | ||
/// NOTE(review): a and b are presumably expected to hold the same number of | ||
/// GeneBlocks - confirm against the callers. | ||
int distance_neon(const std::vector<GeneBlock> &a, | ||
const std::vector<GeneBlock> &b); | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
#include "hamming/distance_neon.hh" | ||
#include <algorithm> | ||
#include <arm_neon.h> | ||
#include <cstddef> | ||
|
||
namespace hamming { | ||
|
||
/// Distance between two vectors of GeneBlocks using NEON simd intrinsics.
///
/// Every GeneBlock holds a lower and an upper gene, selected by mask_gene0 and
/// mask_gene1. A gene adds 1 to the distance when the bitwise AND of the two
/// inputs has no bit set inside that gene's mask.
///
/// @param a first vector of GeneBlocks
/// @param b second vector of GeneBlocks; expected to have the same size as a.
///          If the sizes differ only the common prefix is compared, so
///          neither vector is ever read past its end.
/// @return the number of genes counted as different
int distance_neon(const std::vector<GeneBlock> &a,
                  const std::vector<GeneBlock> &b) {
  // a 128-bit register holds 16 GeneBlocks, i.e. 32 genes
  constexpr std::size_t n_geneblocks{16};
  // each partial distance count is stored in a uint8 lane, so max value = 255,
  // and a lane can grow by at most 2 per iteration, so the lanes are flushed
  // after at most 127 iterations (max value 254) to avoid overflow
  constexpr std::size_t n_inner{127};
  // only the part present in both inputs is compared
  const std::size_t n_blocks{std::min(a.size(), b.size())};
  // number of complete 16-GeneBlock chunks that go through the simd path
  const std::size_t n_iter{n_blocks / n_geneblocks};
  // mask to select LSB of each uint8 lane
  const uint8x16_t lsb = vdupq_n_u8(1);
  // mask to select lower gene from each GeneBlock
  const uint8x16_t mask0 = vdupq_n_u8(mask_gene0);
  // mask to select upper gene from each GeneBlock
  const uint8x16_t mask1 = vdupq_n_u8(mask_gene1);
  int r{0};
  // walk the chunks in batches of at most n_inner; no batch is ever empty
  for (std::size_t begin = 0; begin < n_iter; begin += n_inner) {
    const std::size_t end{std::min(begin + n_inner, n_iter)};
    // vector of partial distance counts for this batch
    uint8x16_t r_s = vdupq_n_u8(0);
    for (std::size_t i = begin; i < end; ++i) {
      // load 16 GeneBlocks from each input
      const uint8x16_t r_a = vld1q_u8(a.data() + n_geneblocks * i);
      const uint8x16_t r_b = vld1q_u8(b.data() + n_geneblocks * i);
      // a & b
      const uint8x16_t r_ab = vandq_u8(r_a, r_b);
      // mask each gene and compare with zero to get either 00000000 or
      // 11111111 per lane
      const uint8x16_t diff0 = vceqzq_u8(vandq_u8(r_ab, mask0));
      const uint8x16_t diff1 = vceqzq_u8(vandq_u8(r_ab, mask1));
      // only keep the LSB of each lane to get either 0 or 1, and add these
      // values to the distance counts
      r_s = vaddq_u8(r_s, vandq_u8(diff0, lsb));
      r_s = vaddq_u8(r_s, vandq_u8(diff1, lsb));
    }
    // sum the 16 partial counts in r_s & add to r
    r += vaddlvq_u8(r_s);
  }
  // do last partial chunk without simd intrinsics
  for (std::size_t i = n_geneblocks * n_iter; i < n_blocks; ++i) {
    const auto c{static_cast<GeneBlock>(a[i] & b[i])};
    r += static_cast<int>((c & mask_gene0) == 0);
    r += static_cast<int>((c & mask_gene1) == 0);
  }
  return r;
}
|
||
} // namespace hamming |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
#include "bench.hh" | ||
#include "hamming/distance_neon.hh" | ||
#include "hamming/hamming.hh" | ||
#include "hamming/hamming_impl.hh" | ||
#ifdef HAMMING_WITH_OPENMP | ||
#include <omp.h> | ||
#endif | ||
|
||
using namespace hamming; | ||
|
||
/// Benchmark distance_neon() on two gene vectors built from strings of
/// length state.range(0), generated from a fixed-seed mt19937.
static void bench_distance_neon(benchmark::State &state) {
#ifdef HAMMING_WITH_OPENMP
  // limit OpenMP to a single thread for this benchmark
  omp_set_num_threads(1);
#endif
  std::mt19937 gen(12345);
  const int64_t n{state.range(0)};
  const auto s1{from_string(make_string(n, gen))};
  const auto s2{from_string(make_string(n, gen))};
  for (auto _ : state) {
    // keep a fresh result per iteration: summing the distances over all
    // iterations can overflow a signed int for the larger inputs
    int d{distance_neon(s1, s2)};
    // make the result observable so the measured call cannot be discarded
    benchmark::DoNotOptimize(d);
  }
  state.SetComplexityN(n);
}
|
||
// Register the benchmark with input sizes ranging from 4096 to 4194304 and | ||
// request a complexity report against the N passed to SetComplexityN. | ||
BENCHMARK(bench_distance_neon)->Range(4096, 4194304)->Complexity(); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#include "hamming/distance_neon.hh" | ||
#include "tests.hh" | ||
|
||
using namespace hamming; | ||
|
||
// The distance between a gene vector and itself must be zero.
// The sizes sit on either side of powers of two; 4063-4065 additionally
// straddle 127 simd iterations of 32 genes, the point where distance_neon()
// flushes its uint8 partial counts (assumes n counts genes - TODO confirm
// against make_gene_vector).
TEST_CASE("distance_neon() returns all return zero for identical vectors",
          "[impl][distance][neon]") {
  std::mt19937 gen(12345);
  for (int n :
       {1,       2,       3,      4,      5,      6,      7,      8,
        9,       10,      11,     12,     13,     14,     15,     16,
        17,      18,      19,     20,     31,     32,     33,     63,
        64,      65,      127,    128,    129,    254,    255,    256,
        257,     511,     512,    513,    1023,   1024,   1025,   2047,
        2048,    2049,    4063,   4064,   4065,   4095,   4096,   4097,
        8191,    8192,    8193,   32767,  32768,  32769,  65535,  65536,
        65537,   131071,  131072, 131073, 262143, 262144, 262145, 524287,
        524288,  524289,  1048575, 1048576, 1048577}) {
    CAPTURE(n);
    const auto g1{make_gene_vector(n, gen)};
    REQUIRE(distance_neon(g1, g1) == 0);
  }
}
|
||
// A string of n A's compared with a string of n G's differs at every
// position, so the distance must be exactly n.
// The sizes sit on either side of powers of two; 4063-4065 additionally
// straddle 127 simd iterations of 32 genes, the point where distance_neon()
// flushes its uint8 partial counts.
TEST_CASE("distance_neon() all return n for n A's and n G's",
          "[impl][distance][neon]") {
  for (int n :
       {1,       2,       3,      4,      5,      6,      7,      8,
        9,       10,      11,     12,     13,     14,     15,     16,
        17,      18,      19,     20,     31,     32,     33,     63,
        64,      65,      127,    128,    129,    254,    255,    256,
        257,     511,     512,    513,    1023,   1024,   1025,   2047,
        2048,    2049,    4063,   4064,   4065,   4095,   4096,   4097,
        8191,    8192,    8193,   32767,  32768,  32769,  65535,  65536,
        65537,   131071,  131072, 131073, 262143, 262144, 262145, 524287,
        524288,  524289,  1048575, 1048576, 1048577}) {
    CAPTURE(n);
    const auto all_a = from_string(std::string(n, 'A'));
    const auto all_g = from_string(std::string(n, 'G'));
    REQUIRE(distance_neon(all_a, all_g) == n);
  }
}
|
||
// Cross-check: distance_neon() must agree with distance_cpp() on pairs of
// gene vectors drawn from a fixed-seed generator.
// The sizes sit on either side of powers of two; 4063-4065 additionally
// straddle 127 simd iterations of 32 genes, the point where distance_neon()
// flushes its uint8 partial counts (assumes n counts genes - TODO confirm
// against make_gene_vector).
TEST_CASE("distance_neon() returns same as distance_cpp() for random vectors",
          "[impl][distance][neon]") {
  std::mt19937 gen(12345);
  for (int n :
       {1,       2,       3,      4,      5,      6,      7,      8,
        9,       10,      11,     12,     13,     14,     15,     16,
        17,      18,      19,     20,     31,     32,     33,     63,
        64,      65,      127,    128,    129,    254,    255,    256,
        257,     511,     512,    513,    1023,   1024,   1025,   2047,
        2048,    2049,    4063,   4064,   4065,   4095,   4096,   4097,
        8191,    8192,    8193,   32767,  32768,  32769,  65535,  65536,
        65537,   131071,  131072, 131073, 262143, 262144, 262145, 524287,
        524288,  524289,  1048575, 1048576, 1048577}) {
    CAPTURE(n);
    const auto g1{make_gene_vector(n, gen)};
    const auto g2{make_gene_vector(n, gen)};
    REQUIRE(distance_neon(g1, g2) == distance_cpp(g1, g2));
  }
}