Skip to content

Commit

Permalink
Add NEON simd implementation for arm64 cpus (#114)
Browse files Browse the repository at this point in the history
- resolves #110
  • Loading branch information
lkeegan authored Nov 10, 2023
1 parent 606837f commit 1f218de
Show file tree
Hide file tree
Showing 9 changed files with 202 additions and 11 deletions.
6 changes: 5 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,16 @@ jobs:
include:
- os: ubuntu-latest
open-mp: "ON"
neon: "OFF"
- os: macos-latest
open-mp: "OFF"
neon: "OFF"
- os: macos-arm64-ssc
open-mp: "OFF"
neon: "ON"
- os: windows-latest
open-mp: "OFF"
neon: "OFF"

steps:
- uses: actions/checkout@v4
Expand All @@ -51,7 +55,7 @@ jobs:
- name: configure cmake
shell: bash
working-directory: ${{runner.workspace}}/build
run: cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DBUILD_TESTING=ON -DHAMMING_BUILD_BENCHMARKS=ON -DHAMMING_WITH_SSE2=$HAMMING_WITH_SSE2 -DHAMMING_WITH_AVX2=$HAMMING_WITH_AVX2 -DHAMMING_WITH_AVX512=$HAMMING_WITH_AVX512 -DHAMMING_BUILD_PYTHON=ON -DHAMMING_WITH_OPENMP=${{ matrix.open-mp }}
run: cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DBUILD_TESTING=ON -DHAMMING_BUILD_BENCHMARKS=ON -DHAMMING_WITH_SSE2=$HAMMING_WITH_SSE2 -DHAMMING_WITH_AVX2=$HAMMING_WITH_AVX2 -DHAMMING_WITH_AVX512=$HAMMING_WITH_AVX512 -DHAMMING_WITH_NEON=${{ matrix.neon }} -DHAMMING_BUILD_PYTHON=ON -DHAMMING_WITH_OPENMP=${{ matrix.open-mp }}

- name: build
shell: bash
Expand Down
10 changes: 7 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,19 @@ set(HAMMING_BUILD_PYTHON

set(HAMMING_WITH_SSE2
yes
CACHE BOOL "Enable SSE2 optimized code")
CACHE BOOL "Enable SSE2 optimized code on x86_64 CPUs")

set(HAMMING_WITH_AVX2
yes
CACHE BOOL "Enable AVX2 optimized code")
CACHE BOOL "Enable AVX2 optimized code on x86_64 CPUs")

set(HAMMING_WITH_AVX512
yes
CACHE BOOL "Enable AVX512 optimized code")
CACHE BOOL "Enable AVX512 optimized code on x86_64 CPUs")

set(HAMMING_WITH_NEON
no
CACHE BOOL "Enable NEON optimized code on Arm64 CPUs")

# Add git submodules
add_subdirectory(ext)
Expand Down
13 changes: 13 additions & 0 deletions include/hamming/distance_neon.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#pragma once

#include <cstdint>
#include <vector>

#include "hamming/hamming_impl_types.hh"

namespace hamming {

int distance_neon(const std::vector<GeneBlock> &a,
const std::vector<GeneBlock> &b);

}
14 changes: 8 additions & 6 deletions include/hamming/hamming_impl.hh
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@
#define _HAMMING_IMPL_HH

#include <array>
#if defined(__aarch64__) || defined(_M_ARM64)
#include <cpuinfo_aarch64.h>
#else
#if !(defined(__aarch64__) || defined(_M_ARM64))
#include <cpuinfo_x86.h>
#endif
#include <cstdint>
Expand All @@ -14,6 +12,9 @@
#ifdef HAMMING_WITH_OPENMP
#include <omp.h>
#endif
#ifdef HAMMING_WITH_NEON
#include "hamming/distance_neon.hh"
#endif
#ifdef HAMMING_WITH_SSE2
#include "hamming/distance_sse2.hh"
#endif
Expand Down Expand Up @@ -101,11 +102,11 @@ std::vector<DistIntType> distances(std::vector<std::string> &data,
const std::vector<GeneBlock> &b) = distance_cpp;

#if defined(__aarch64__) || defined(_M_ARM64)
const auto features = cpu_features::GetAarch64Info().features;
#ifdef HAMMING_WITH_NEON
distance_func = distance_neon;
#endif
#else
const auto features = cpu_features::GetX86Info().features;
#endif

#ifdef HAMMING_WITH_SSE2
if (features.sse2) {
distance_func = distance_sse2;
Expand All @@ -121,6 +122,7 @@ std::vector<DistIntType> distances(std::vector<std::string> &data,
distance_func = distance_avx512;
}
#endif
#endif

#ifdef HAMMING_WITH_OPENMP
#pragma omp parallel for schedule(static, 1)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,4 @@ environment = { CMAKE_ARGS="-DHAMMING_WITH_OPENMP=ON" }

[[tool.cibuildwheel.overrides]]
select = "*-macosx_arm64*"
environment = { CMAKE_ARGS="-DHAMMING_WITH_SSE2=OFF -DHAMMING_WITH_AVX2=OFF -DHAMMING_WITH_AVX512=OFF" }
environment = { CMAKE_ARGS="-DHAMMING_WITH_SSE2=OFF -DHAMMING_WITH_AVX2=OFF -DHAMMING_WITH_AVX512=OFF -DHAMMING_WITH_NEON=ON" }
15 changes: 15 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@ if(HAMMING_WITH_AVX512)
target_link_libraries(hamming PRIVATE distance_avx512)
endif()

if(HAMMING_WITH_NEON)
target_compile_definitions(hamming PUBLIC HAMMING_WITH_NEON)
add_library(distance_neon STATIC distance_neon.cc)
target_include_directories(distance_neon PUBLIC ../include)
target_link_libraries(hamming PRIVATE distance_neon)
endif()

# Build library benchmarks
if(HAMMING_BUILD_BENCHMARKS)
add_executable(bench bench.cc hamming_bench.cc hamming_impl_bench.cc)
Expand All @@ -49,6 +56,10 @@ if(HAMMING_BUILD_BENCHMARKS)
target_sources(bench PRIVATE distance_avx512_bench.cc)
target_link_libraries(bench PRIVATE distance_avx512)
endif()
if(HAMMING_WITH_NEON)
target_sources(bench PRIVATE distance_neon_bench.cc)
target_link_libraries(bench PRIVATE distance_neon)
endif()
target_link_libraries(bench PRIVATE hamming benchmark::benchmark
CpuFeatures::cpu_features)
endif()
Expand All @@ -69,6 +80,10 @@ if(BUILD_TESTING)
target_sources(tests PRIVATE distance_avx512_t.cc)
target_link_libraries(tests PRIVATE distance_avx512)
endif()
if(HAMMING_WITH_NEON)
target_sources(tests PRIVATE distance_neon_t.cc)
target_link_libraries(tests PRIVATE distance_neon)
endif()
target_link_libraries(tests PRIVATE hamming Catch2::Catch2
CpuFeatures::cpu_features)
catch_discover_tests(tests)
Expand Down
65 changes: 65 additions & 0 deletions src/distance_neon.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#include "hamming/distance_neon.hh"
#include <arm_neon.h>

namespace hamming {

int distance_neon(const std::vector<GeneBlock> &a,
const std::vector<GeneBlock> &b) {
// distance implementation using NEON simd intrinsics
// a 128-bit register holds 16 GeneBlocks, i.e. 32 genes
constexpr std::size_t n_geneblocks{16};
int r{0};
// mask to select LSB of each gene
const uint8x16_t lsb = vdupq_n_u8(1);
// mask to select lower gene from each GeneBlock
const uint8x16_t mask0 = vdupq_n_u8(mask_gene0);
// mask to select upper gene from each GeneBlock
const uint8x16_t mask1 = vdupq_n_u8(mask_gene1);
// vector of partial distance counts
uint8x16_t r_s;
// work registers
uint8x16_t r_a;
uint8x16_t r_b;
// each iteration processes 16 GeneBlocks
std::size_t n_iter{a.size() / n_geneblocks};
// each partial distance count is stored in a uint8, so max value = 255,
// and the value can be increased by at most 2 with each iteration,
// so we do 127 inner iterations for a max value of 254 to avoid overflow
std::size_t n_inner{127};
std::size_t n_outer{1 + n_iter / n_inner};
for (std::size_t j = 0; j < n_outer; ++j) {
std::size_t n{std::min((j + 1) * n_inner, n_iter)};
r_s = vdupq_n_u8(0);
for (std::size_t i = j * n_inner; i < n; ++i) {
// load a[i], b[i] into registers
r_a = vld1q_u8(a.data() + n_geneblocks * i);
r_b = vld1q_u8(b.data() + n_geneblocks * i);
// a[i] & b[i]
r_a = vandq_u8(r_a, r_b);
// mask lower genes
r_b = vandq_u8(r_a, mask0);
// mask upper genes
r_a = vandq_u8(r_a, mask1);
// compare genes with zero to get either 00000000 or 11111111
r_a = vceqzq_u8(r_a);
r_b = vceqzq_u8(r_b);
// only keep LSB for each uint8 to get either 0 or 1
r_a = vandq_u8(r_a, lsb);
r_b = vandq_u8(r_b, lsb);
// add these values to distance counts
r_s = vaddq_u8(r_s, r_a);
r_s = vaddq_u8(r_s, r_b);
}
// sum the 16 distances in r_s & add to r
r += vaddlvq_u8(r_s);
}
// do last partial block without simd intrinsics
for (std::size_t i = n_geneblocks * n_iter; i < a.size(); ++i) {
auto c{static_cast<GeneBlock>(a[i] & b[i])};
r += static_cast<int>((c & mask_gene0) == 0);
r += static_cast<int>((c & mask_gene1) == 0);
}
return r;
}

} // namespace hamming
26 changes: 26 additions & 0 deletions src/distance_neon_bench.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#include "bench.hh"
#include "hamming/distance_neon.hh"
#include "hamming/hamming.hh"
#include "hamming/hamming_impl.hh"
#ifdef HAMMING_WITH_OPENMP
#include <omp.h>
#endif

using namespace hamming;

static void bench_distance_neon(benchmark::State &state) {
#ifdef HAMMING_WITH_OPENMP
omp_set_num_threads(1);
#endif
std::mt19937 gen(12345);
int64_t n{state.range(0)};
auto s1{from_string(make_string(n, gen))};
auto s2{from_string(make_string(n, gen))};
int d{0};
for (auto _ : state) {
d += distance_neon(s1, s2);
}
state.SetComplexityN(n);
}

BENCHMARK(bench_distance_neon)->Range(4096, 4194304)->Complexity();
62 changes: 62 additions & 0 deletions src/distance_neon_t.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#include "hamming/distance_neon.hh"
#include "tests.hh"

using namespace hamming;

TEST_CASE("distance_neon() returns all return zero for identical vectors",
"[impl][distance][neon]") {
std::mt19937 gen(12345);
for (int n :
{1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 31, 32, 33, 63,
64, 65, 127, 128, 129, 254, 255, 256,
256, 511, 512, 513, 1023, 1024, 1025, 2047,
2048, 2049, 4095, 4096, 4097, 8191, 8192, 8193,
32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072,
131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575,
1048576, 1048577}) {
CAPTURE(n);
auto g1{make_gene_vector(n, gen)};
REQUIRE(distance_neon(g1, g1) == 0);
}
}

TEST_CASE("distance_neon() all return n for n A's and n G's",
"[impl][distance][neon]") {
for (int n :
{1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 31, 32, 33, 63,
64, 65, 127, 128, 129, 254, 255, 256,
256, 511, 512, 513, 1023, 1024, 1025, 2047,
2048, 2049, 4095, 4096, 4097, 8191, 8192, 8193,
32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072,
131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575,
1048576, 1048577}) {
CAPTURE(n);
auto g1 = from_string(std::string(n, 'A'));
auto g2 = from_string(std::string(n, 'G'));
REQUIRE(distance_neon(g1, g2) == n);
}
}

TEST_CASE("distance_neon() returns same as distance_cpp() for random vectors",
"[impl][distance][neon]") {
std::mt19937 gen(12345);
for (int n :
{1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 31, 32, 33, 63,
64, 65, 127, 128, 129, 254, 255, 256,
256, 511, 512, 513, 1023, 1024, 1025, 2047,
2048, 2049, 4095, 4096, 4097, 8191, 8192, 8193,
32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072,
131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575,
1048576, 1048577}) {
CAPTURE(n);
auto g1{make_gene_vector(n, gen)};
auto g2{make_gene_vector(n, gen)};
REQUIRE(distance_neon(g1, g2) == distance_cpp(g1, g2));
}
}

0 comments on commit 1f218de

Please sign in to comment.