Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add NEON simd implementation for arm64 cpus #114

Merged
merged 1 commit into from
Nov 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,16 @@ jobs:
include:
- os: ubuntu-latest
open-mp: "ON"
neon: "OFF"
- os: macos-latest
open-mp: "OFF"
neon: "OFF"
- os: macos-arm64-ssc
open-mp: "OFF"
neon: "ON"
- os: windows-latest
open-mp: "OFF"
neon: "OFF"

steps:
- uses: actions/checkout@v4
Expand All @@ -51,7 +55,7 @@ jobs:
- name: configure cmake
shell: bash
working-directory: ${{runner.workspace}}/build
run: cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DBUILD_TESTING=ON -DHAMMING_BUILD_BENCHMARKS=ON -DHAMMING_WITH_SSE2=$HAMMING_WITH_SSE2 -DHAMMING_WITH_AVX2=$HAMMING_WITH_AVX2 -DHAMMING_WITH_AVX512=$HAMMING_WITH_AVX512 -DHAMMING_BUILD_PYTHON=ON -DHAMMING_WITH_OPENMP=${{ matrix.open-mp }}
run: cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DBUILD_TESTING=ON -DHAMMING_BUILD_BENCHMARKS=ON -DHAMMING_WITH_SSE2=$HAMMING_WITH_SSE2 -DHAMMING_WITH_AVX2=$HAMMING_WITH_AVX2 -DHAMMING_WITH_AVX512=$HAMMING_WITH_AVX512 -DHAMMING_WITH_NEON=${{ matrix.neon }} -DHAMMING_BUILD_PYTHON=ON -DHAMMING_WITH_OPENMP=${{ matrix.open-mp }}

- name: build
shell: bash
Expand Down
10 changes: 7 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,19 @@ set(HAMMING_BUILD_PYTHON

set(HAMMING_WITH_SSE2
yes
CACHE BOOL "Enable SSE2 optimized code")
CACHE BOOL "Enable SSE2 optimized code on x86_64 CPUs")

set(HAMMING_WITH_AVX2
yes
CACHE BOOL "Enable AVX2 optimized code")
CACHE BOOL "Enable AVX2 optimized code on x86_64 CPUs")

set(HAMMING_WITH_AVX512
yes
CACHE BOOL "Enable AVX512 optimized code")
CACHE BOOL "Enable AVX512 optimized code on x86_64 CPUs")

set(HAMMING_WITH_NEON
no
CACHE BOOL "Enable NEON optimized code on Arm64 CPUs")

# Add git submodules
add_subdirectory(ext)
Expand Down
13 changes: 13 additions & 0 deletions include/hamming/distance_neon.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#pragma once

#include <cstdint>
#include <vector>

#include "hamming/hamming_impl_types.hh"

namespace hamming {

int distance_neon(const std::vector<GeneBlock> &a,
const std::vector<GeneBlock> &b);

}
14 changes: 8 additions & 6 deletions include/hamming/hamming_impl.hh
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@
#define _HAMMING_IMPL_HH

#include <array>
#if defined(__aarch64__) || defined(_M_ARM64)
#include <cpuinfo_aarch64.h>
#else
#if !(defined(__aarch64__) || defined(_M_ARM64))
#include <cpuinfo_x86.h>
#endif
#include <cstdint>
Expand All @@ -14,6 +12,9 @@
#ifdef HAMMING_WITH_OPENMP
#include <omp.h>
#endif
#ifdef HAMMING_WITH_NEON
#include "hamming/distance_neon.hh"
#endif
#ifdef HAMMING_WITH_SSE2
#include "hamming/distance_sse2.hh"
#endif
Expand Down Expand Up @@ -101,11 +102,11 @@ std::vector<DistIntType> distances(std::vector<std::string> &data,
const std::vector<GeneBlock> &b) = distance_cpp;

#if defined(__aarch64__) || defined(_M_ARM64)
const auto features = cpu_features::GetAarch64Info().features;
#ifdef HAMMING_WITH_NEON
distance_func = distance_neon;
#endif
#else
const auto features = cpu_features::GetX86Info().features;
#endif

#ifdef HAMMING_WITH_SSE2
if (features.sse2) {
distance_func = distance_sse2;
Expand All @@ -121,6 +122,7 @@ std::vector<DistIntType> distances(std::vector<std::string> &data,
distance_func = distance_avx512;
}
#endif
#endif

#ifdef HAMMING_WITH_OPENMP
#pragma omp parallel for schedule(static, 1)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,4 @@ environment = { CMAKE_ARGS="-DHAMMING_WITH_OPENMP=ON" }

[[tool.cibuildwheel.overrides]]
select = "*-macosx_arm64*"
environment = { CMAKE_ARGS="-DHAMMING_WITH_SSE2=OFF -DHAMMING_WITH_AVX2=OFF -DHAMMING_WITH_AVX512=OFF" }
environment = { CMAKE_ARGS="-DHAMMING_WITH_SSE2=OFF -DHAMMING_WITH_AVX2=OFF -DHAMMING_WITH_AVX512=OFF -DHAMMING_WITH_NEON=ON" }
15 changes: 15 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@ if(HAMMING_WITH_AVX512)
target_link_libraries(hamming PRIVATE distance_avx512)
endif()

if(HAMMING_WITH_NEON)
target_compile_definitions(hamming PUBLIC HAMMING_WITH_NEON)
add_library(distance_neon STATIC distance_neon.cc)
target_include_directories(distance_neon PUBLIC ../include)
target_link_libraries(hamming PRIVATE distance_neon)
endif()

# Build library benchmarks
if(HAMMING_BUILD_BENCHMARKS)
add_executable(bench bench.cc hamming_bench.cc hamming_impl_bench.cc)
Expand All @@ -49,6 +56,10 @@ if(HAMMING_BUILD_BENCHMARKS)
target_sources(bench PRIVATE distance_avx512_bench.cc)
target_link_libraries(bench PRIVATE distance_avx512)
endif()
if(HAMMING_WITH_NEON)
target_sources(bench PRIVATE distance_neon_bench.cc)
target_link_libraries(bench PRIVATE distance_neon)
endif()
target_link_libraries(bench PRIVATE hamming benchmark::benchmark
CpuFeatures::cpu_features)
endif()
Expand All @@ -69,6 +80,10 @@ if(BUILD_TESTING)
target_sources(tests PRIVATE distance_avx512_t.cc)
target_link_libraries(tests PRIVATE distance_avx512)
endif()
if(HAMMING_WITH_NEON)
target_sources(tests PRIVATE distance_neon_t.cc)
target_link_libraries(tests PRIVATE distance_neon)
endif()
target_link_libraries(tests PRIVATE hamming Catch2::Catch2
CpuFeatures::cpu_features)
catch_discover_tests(tests)
Expand Down
65 changes: 65 additions & 0 deletions src/distance_neon.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#include "hamming/distance_neon.hh"
#include <arm_neon.h>

namespace hamming {

int distance_neon(const std::vector<GeneBlock> &a,
const std::vector<GeneBlock> &b) {
// distance implementation using NEON simd intrinsics
// a 128-bit register holds 16 GeneBlocks, i.e. 32 genes
constexpr std::size_t n_geneblocks{16};
int r{0};
// mask to select LSB of each gene
const uint8x16_t lsb = vdupq_n_u8(1);
// mask to select lower gene from each GeneBlock
const uint8x16_t mask0 = vdupq_n_u8(mask_gene0);
// mask to select upper gene from each GeneBlock
const uint8x16_t mask1 = vdupq_n_u8(mask_gene1);
// vector of partial distance counts
uint8x16_t r_s;
// work registers
uint8x16_t r_a;
uint8x16_t r_b;
// each iteration processes 16 GeneBlocks
std::size_t n_iter{a.size() / n_geneblocks};
// each partial distance count is stored in a uint8, so max value = 255,
// and the value can be increased by at most 2 with each iteration,
// so we do 127 inner iterations for a max value of 254 to avoid overflow
std::size_t n_inner{127};
std::size_t n_outer{1 + n_iter / n_inner};
for (std::size_t j = 0; j < n_outer; ++j) {
std::size_t n{std::min((j + 1) * n_inner, n_iter)};
r_s = vdupq_n_u8(0);
for (std::size_t i = j * n_inner; i < n; ++i) {
// load a[i], b[i] into registers
r_a = vld1q_u8(a.data() + n_geneblocks * i);
r_b = vld1q_u8(b.data() + n_geneblocks * i);
// a[i] & b[i]
r_a = vandq_u8(r_a, r_b);
// mask lower genes
r_b = vandq_u8(r_a, mask0);
// mask upper genes
r_a = vandq_u8(r_a, mask1);
// compare genes with zero to get either 00000000 or 11111111
r_a = vceqzq_u8(r_a);
r_b = vceqzq_u8(r_b);
// only keep LSB for each uint8 to get either 0 or 1
r_a = vandq_u8(r_a, lsb);
r_b = vandq_u8(r_b, lsb);
// add these values to distance counts
r_s = vaddq_u8(r_s, r_a);
r_s = vaddq_u8(r_s, r_b);
}
// sum the 16 distances in r_s & add to r
r += vaddlvq_u8(r_s);
}
// do last partial block without simd intrinsics
for (std::size_t i = n_geneblocks * n_iter; i < a.size(); ++i) {
auto c{static_cast<GeneBlock>(a[i] & b[i])};
r += static_cast<int>((c & mask_gene0) == 0);
r += static_cast<int>((c & mask_gene1) == 0);
}
return r;
}

} // namespace hamming
26 changes: 26 additions & 0 deletions src/distance_neon_bench.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#include "bench.hh"
#include "hamming/distance_neon.hh"
#include "hamming/hamming.hh"
#include "hamming/hamming_impl.hh"
#ifdef HAMMING_WITH_OPENMP
#include <omp.h>
#endif

using namespace hamming;

static void bench_distance_neon(benchmark::State &state) {
#ifdef HAMMING_WITH_OPENMP
omp_set_num_threads(1);
#endif
std::mt19937 gen(12345);
int64_t n{state.range(0)};
auto s1{from_string(make_string(n, gen))};
auto s2{from_string(make_string(n, gen))};
int d{0};
for (auto _ : state) {
d += distance_neon(s1, s2);
}
state.SetComplexityN(n);
}

BENCHMARK(bench_distance_neon)->Range(4096, 4194304)->Complexity();
62 changes: 62 additions & 0 deletions src/distance_neon_t.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#include "hamming/distance_neon.hh"
#include "tests.hh"

using namespace hamming;

TEST_CASE("distance_neon() returns all return zero for identical vectors",
"[impl][distance][neon]") {
std::mt19937 gen(12345);
for (int n :
{1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 31, 32, 33, 63,
64, 65, 127, 128, 129, 254, 255, 256,
256, 511, 512, 513, 1023, 1024, 1025, 2047,
2048, 2049, 4095, 4096, 4097, 8191, 8192, 8193,
32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072,
131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575,
1048576, 1048577}) {
CAPTURE(n);
auto g1{make_gene_vector(n, gen)};
REQUIRE(distance_neon(g1, g1) == 0);
}
}

TEST_CASE("distance_neon() all return n for n A's and n G's",
"[impl][distance][neon]") {
for (int n :
{1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 31, 32, 33, 63,
64, 65, 127, 128, 129, 254, 255, 256,
256, 511, 512, 513, 1023, 1024, 1025, 2047,
2048, 2049, 4095, 4096, 4097, 8191, 8192, 8193,
32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072,
131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575,
1048576, 1048577}) {
CAPTURE(n);
auto g1 = from_string(std::string(n, 'A'));
auto g2 = from_string(std::string(n, 'G'));
REQUIRE(distance_neon(g1, g2) == n);
}
}

TEST_CASE("distance_neon() returns same as distance_cpp() for random vectors",
"[impl][distance][neon]") {
std::mt19937 gen(12345);
for (int n :
{1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 31, 32, 33, 63,
64, 65, 127, 128, 129, 254, 255, 256,
256, 511, 512, 513, 1023, 1024, 1025, 2047,
2048, 2049, 4095, 4096, 4097, 8191, 8192, 8193,
32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072,
131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575,
1048576, 1048577}) {
CAPTURE(n);
auto g1{make_gene_vector(n, gen)};
auto g2{make_gene_vector(n, gen)};
REQUIRE(distance_neon(g1, g2) == distance_cpp(g1, g2));
}
}