From b2067d8d3c77405f972701d7fa35baec0c0a631f Mon Sep 17 00:00:00 2001
From: Harry Phillips
Date: Mon, 11 Feb 2019 15:34:50 +0000
Subject: [PATCH] Working

---
 .gitignore                  |    1 +
 .gitmodules                 |    3 +
 .travis.yml                 |   28 +
 Cargo.toml                  |    2 +-
 build.rs                    |   26 +-
 build.sh                    |    6 +
 scavenger                   |    1 +
 src/c/SSE2NEON.h            | 1690 ------------------------------
 src/c/common.c              |   15 -
 src/c/common.h              |   65 --
 src/c/mshabal_128_avx.c     |  966 ------------------
 src/c/mshabal_128_avx.h     |  174 ----
 src/c/mshabal_128_neon.c    |  963 ------------------
 src/c/mshabal_128_neon.h    |  174 ----
 src/c/mshabal_128_sse2.c    |  963 ------------------
 src/c/mshabal_128_sse2.h    |  174 ----
 src/c/mshabal_256_avx2.c    | 1086 --------------------
 src/c/mshabal_256_avx2.h    |  179 ----
 src/c/mshabal_512_avx512f.c | 1318 ------------------------
 src/c/mshabal_512_avx512f.h |  195 ----
 src/c/shabal.c              |   13 -
 src/c/shabal.h              |    7 -
 src/c/shabal_avx.c          |   75 --
 src/c/shabal_avx.h          |    9 -
 src/c/shabal_avx2.c         |   95 --
 src/c/shabal_avx2.h         |    9 -
 src/c/shabal_avx512f.c      |  138 ---
 src/c/shabal_avx512f.h      |    9 -
 src/c/shabal_neon.c         |   75 --
 src/c/shabal_neon.h         |    9 -
 src/c/shabal_sse2.c         |   75 --
 src/c/shabal_sse2.h         |    9 -
 src/c/sph_shabal.c          |  693 -------------
 src/c/sph_shabal.h          |  133 ---
 src/c/sph_types.h           | 1912 -----------------------------------
 src/lib.rs                  |   74 +-
 36 files changed, 106 insertions(+), 11258 deletions(-)
 create mode 100644 .gitmodules
 create mode 100644 .travis.yml
 create mode 100755 build.sh
 create mode 160000 scavenger
 delete mode 100644 src/c/SSE2NEON.h
 delete mode 100644 src/c/common.c
 delete mode 100644 src/c/common.h
 delete mode 100644 src/c/mshabal_128_avx.c
 delete mode 100644 src/c/mshabal_128_avx.h
 delete mode 100644 src/c/mshabal_128_neon.c
 delete mode 100644 src/c/mshabal_128_neon.h
 delete mode 100644 src/c/mshabal_128_sse2.c
 delete mode 100644 src/c/mshabal_128_sse2.h
 delete mode 100644 src/c/mshabal_256_avx2.c
 delete mode 100644 src/c/mshabal_256_avx2.h
 delete mode 100644 src/c/mshabal_512_avx512f.c
 delete mode 100644 src/c/mshabal_512_avx512f.h
 delete mode 100644 src/c/shabal.c
 delete mode 100644 src/c/shabal.h
 delete mode 100644 src/c/shabal_avx.c
 delete mode 100644 src/c/shabal_avx.h
 delete mode 100644 src/c/shabal_avx2.c
 delete mode 100644 src/c/shabal_avx2.h
 delete mode 100644 src/c/shabal_avx512f.c
 delete mode 100644 src/c/shabal_avx512f.h
 delete mode 100644 src/c/shabal_neon.c
 delete mode 100644 src/c/shabal_neon.h
 delete mode 100644 src/c/shabal_sse2.c
 delete mode 100644 src/c/shabal_sse2.h
 delete mode 100644 src/c/sph_shabal.c
 delete mode 100644 src/c/sph_shabal.h
 delete mode 100644 src/c/sph_types.h

diff --git a/.gitignore b/.gitignore
index 6936990..22676ce 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 /target
 **/*.rs.bk
 Cargo.lock
+.idea/
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..0aef76e
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "scavenger"]
+	path = scavenger
+	url = https://github.com/PoC-Consortium/scavenger
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..ce3ac59
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,28 @@
+language: rust
+cache: cargo
+
+before_deploy:
+  - ./build.sh
+
+matrix:
+  include:
+    - os: osx
+      before_install:
+        - brew install gcc
+    - os: linux
+      before_install:
+        - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
+        - sudo apt-get update && sudo apt-get install -y gcc-8
+        - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 800 --slave /usr/bin/g++ g++ /usr/bin/g++-8
+
+deploy:
+  provider: releases
+  api_key: $API_KEY
+  file: ${TRAVIS_OS_NAME}.zip
+  skip_cleanup: true
+  on:
+    branch: master
+    tags: true
+
+notifications:
+  email: false
diff --git a/Cargo.toml b/Cargo.toml
index b282948..afaacc3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,7 +6,7 @@ edition = "2018"
 build = "build.rs"
 
 [lib]
-name = "shabal_lib"
+name = "shabal"
 crate-type = ["dylib"]
 
 [features]
diff --git a/build.rs b/build.rs
index 6128eaf..d27a725 100644
--- a/build.rs
+++ b/build.rs
@@ -27,9 +27,9 @@ fn main() {
     let mut config = shared_config.clone();
 
     config
-        .file("src/c/sph_shabal.c")
-        .file("src/c/shabal.c")
-        .file("src/c/common.c")
+        .file("scavenger/src/c/sph_shabal.c")
+        .file("scavenger/src/c/shabal.c")
+        .file("scavenger/src/c/common.c")
         .compile("shabal");
 
     generate_bindings();
@@ -43,8 +43,8 @@ fn main() {
         config.flag("-mfpu=neon");
 
         config
-            .file("src/c/mshabal_128_neon.c")
-            .file("src/c/shabal_neon.c")
+            .file("scavenger/src/c/mshabal_128_neon.c")
+            .file("scavenger/src/c/shabal_neon.c")
             .compile("shabal_neon");
     }
 }
@@ -59,8 +59,8 @@ fn main() {
         config.flag("-msse2");
 
         config
-            .file("src/c/mshabal_128_sse2.c")
-            .file("src/c/shabal_sse2.c")
+            .file("scavenger/src/c/mshabal_128_sse2.c")
+            .file("scavenger/src/c/shabal_sse2.c")
             .compile("shabal_sse2");
 
         let mut config = shared_config.clone();
@@ -72,8 +72,8 @@ fn main() {
        config.flag("-mavx");
 
         config
-            .file("src/c/mshabal_128_avx.c")
-            .file("src/c/shabal_avx.c")
+            .file("scavenger/src/c/mshabal_128_avx.c")
+            .file("scavenger/src/c/shabal_avx.c")
             .compile("shabal_avx");
 
         let mut config = shared_config.clone();
@@ -85,8 +85,8 @@ fn main() {
         config.flag("-mavx2");
 
         config
-            .file("src/c/mshabal_256_avx2.c")
-            .file("src/c/shabal_avx2.c")
+            .file("scavenger/src/c/mshabal_256_avx2.c")
+            .file("scavenger/src/c/shabal_avx2.c")
             .compile("shabal_avx2");
 
         let mut config = shared_config.clone();
@@ -98,8 +98,8 @@ fn main() {
         config.flag("-mavx512f");
 
         config
-            .file("src/c/mshabal_512_avx512f.c")
-            .file("src/c/shabal_avx512f.c")
+            .file("scavenger/src/c/mshabal_512_avx512f.c")
+            .file("scavenger/src/c/shabal_avx512f.c")
             .compile("shabal_avx512f");
     }
 }
diff --git a/build.sh b/build.sh
new file mode 100755
index 0000000..7948af8
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+cargo build --release --features "simd"
+
+cd target/release
+zip ../../${TRAVIS_OS_NAME}.zip libshabal.*
diff --git a/scavenger b/scavenger
new file mode 160000
index 0000000..775b0ea
--- /dev/null
+++ b/scavenger
@@ -0,0 +1 @@
+Subproject commit 775b0ea52fdc22c81ad2cc0ec20a787c20922aef
diff --git a/src/c/SSE2NEON.h b/src/c/SSE2NEON.h
deleted file mode 100644
index 60495ed..0000000
--- a/src/c/SSE2NEON.h
+++ /dev/null
@@ -1,1690 +0,0 @@
-#ifndef SSE2NEON_H
-#define SSE2NEON_H
-
-// This header file provides a simple API translation layer
-// between SSE intrinsics and their corresponding ARM NEON versions
-//
-// This header file does not (yet) translate *all* of the SSE intrinsics.
-// Since this is in support of a specific porting effort, I have only
-// included the intrinsics I needed to get my port to work.
-//
-// Questions/Comments/Feedback send to: jratcliffscarab@gmail.com
-//
-// If you want to improve or add to this project, send me an
-// email and I will probably approve your access to the depot.
-//
-// Project is located here:
-//
-// https://github.com/jratcliff63367/sse2neon
-//
-// Show your appreciation for open source by sending me a bitcoin tip to the following
-// address.
-//
-// TipJar: 1PzgWDSyq4pmdAXRH8SPUtta4SWGrt4B1p :
-// https://blockchain.info/address/1PzgWDSyq4pmdAXRH8SPUtta4SWGrt4B1p
-//
-//
-// Contributors to this project are:
-//
-// John W. Ratcliff     : jratcliffscarab@gmail.com
-// Brandon Rowlett      : browlett@nvidia.com
-// Ken Fast             : kfast@gdeb.com
-// Eric van Beurden     : evanbeurden@nvidia.com
-// Alexander Potylitsin : apotylitsin@nvidia.com
-//
-//
-// *********************************************************************************************************************
-// apoty: March 17, 2017
-// Most of the current version was changed to fix issues and potential issues.
-// All unit tests were rewritten as a part of the forge lib project to cover all implemented functions.
-// *********************************************************************************************************************
-// Release notes for January 20, 2017 version:
-//
-// The unit tests have been refactored. They no longer assert on an error; instead they return a pass/fail condition.
-// The unit-tests now test 10,000 random float and int values against each intrinsic.
-//
-// SSE2NEON now supports 95 SSE intrinsics. 39 of them have formal unit tests which have been implemented and
-// fully tested on NEON/ARM. The remaining 56 still need unit tests implemented.
-//
-// A struct is now defined in this header file called 'SIMDVec' which can be used by applications which
-// attempt to access the contents of an _m128 struct directly. It is important to note that accessing the __m128
-// struct directly is considered bad coding practice by Microsoft: @see: https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
-//
-// However, some legacy source code may try to access the contents of an __m128 struct directly so the developer
-// can use the SIMDVec as an alias for it. Any casting must be done manually by the developer, as you cannot
-// cast or otherwise alias the base NEON data type for intrinsic operations.
-//
-// A bug was found with the _mm_shuffle_ps intrinsic. If the shuffle permutation was not one of the ones with
-// a custom/unique implementation, causing it to fall through to the default shuffle implementation, it was failing
-// to return the correct value. This is now fixed.
-//
-// A bug was found with the _mm_cvtps_epi32 intrinsic. This converts floating point values to integers.
-// It was not honoring the correct rounding mode. In SSE the default rounding mode when converting from float to int
-// is to use 'round to even' otherwise known as 'bankers rounding'. ARMv7 did not support this feature but ARMv8 does.
-// As it stands today, this header file assumes ARMv8. If you are trying to target really old ARM devices, you may get
-// a build error.
-//
-// Support for a number of new intrinsics was added; however, none of them yet have unit-tests to 100% confirm they are
-// producing the correct results on NEON. These unit tests will be added as soon as possible.
-//
-// Here is the list of new intrinsics which have been added:
-//
-// _mm_cvtss_f32 : extracts the lower order floating point value from the parameter
-// _mm_add_ss : adds the scalar single - precision floating point values of a and b
-// _mm_div_ps : Divides the four single - precision, floating - point values of a and b.
-// _mm_div_ss : Divides the scalar single - precision floating point value of a by b.
-// _mm_sqrt_ss : Computes the approximation of the square root of the scalar single - precision floating point value of in.
-// _mm_rsqrt_ps : Computes the approximations of the reciprocal square roots of the four single - precision floating point values of in.
-// _mm_comilt_ss : Compares the lower single - precision floating point scalar values of a and b using a less than operation
-// _mm_comigt_ss : Compares the lower single - precision floating point scalar values of a and b using a greater than operation.
-// _mm_comile_ss : Compares the lower single - precision floating point scalar values of a and b using a less than or equal operation.
-// _mm_comige_ss : Compares the lower single - precision floating point scalar values of a and b using a greater than or equal operation.
-// _mm_comieq_ss : Compares the lower single - precision floating point scalar values of a and b using an equality operation.
-// _mm_comineq_s : Compares the lower single - precision floating point scalar values of a and b using an inequality operation
-// _mm_unpackhi_epi8 : Interleaves the upper 8 signed or unsigned 8 - bit integers in a with the upper 8 signed or unsigned 8 - bit integers in b.
-// _mm_unpackhi_epi16: Interleaves the upper 4 signed or unsigned 16 - bit integers in a with the upper 4 signed or unsigned 16 - bit integers in b.
-//
-// *********************************************************************************************************************
-/*
-** The MIT license:
-**
-** Permission is hereby granted, free of charge, to any person obtaining a copy
-** of this software and associated documentation files (the "Software"), to deal
-** in the Software without restriction, including without limitation the rights
-** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-** copies of the Software, and to permit persons to whom the Software is furnished
-** to do so, subject to the following conditions:
-**
-** The above copyright notice and this permission notice shall be included in all
-** copies or substantial portions of the Software.
-
-** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-** WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-** CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-#define ENABLE_CPP_VERSION 0
-
-#if defined(__GNUC__) || defined(__clang__)
-# pragma push_macro("FORCE_INLINE")
-# pragma push_macro("ALIGN_STRUCT")
-# define FORCE_INLINE static inline __attribute__((always_inline))
-# define ALIGN_STRUCT(x) __attribute__((aligned(x)))
-#else
-# error "Macro name collisions may happen with unknown compiler"
-# define FORCE_INLINE static inline
-# define ALIGN_STRUCT(x) __declspec(align(x))
-#endif
-
-#include <stdint.h>
-#include "arm_neon.h"
-
-
-/*******************************************************/
-/* MACRO for shuffle parameter for _mm_shuffle_ps().   */
-/* Argument fp3 is a digit[0123] that represents the fp*/
-/* from argument "b" of mm_shuffle_ps that will be     */
-/* placed in fp3 of result. fp2 is the same for fp2 in */
-/* result. fp1 is a digit[0123] that represents the fp */
-/* from argument "a" of mm_shuffle_ps that will be     */
-/* placed in fp1 of result. fp0 is the same for fp0 of */
-/* result                                              */
-/*******************************************************/
-#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
-    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))

-/* indicate immediate constant argument in a given range */
-#define __constrange(a,b) \
-    const
-
-typedef float32x4_t __m128;
-typedef int32x4_t __m128i;
-
-
-// ******************************************
-// type-safe casting between types
-// ******************************************
-
-#define vreinterpretq_m128_f16(x) \
-    vreinterpretq_f32_f16(x)
-
-#define vreinterpretq_m128_f32(x) \
-    (x)
-
-#define vreinterpretq_m128_f64(x) \
-    vreinterpretq_f32_f64(x)
-
-
-#define vreinterpretq_m128_u8(x) \
-    vreinterpretq_f32_u8(x)
-
-#define vreinterpretq_m128_u16(x) \
-    vreinterpretq_f32_u16(x)
-
-#define vreinterpretq_m128_u32(x) \
-    vreinterpretq_f32_u32(x)
-
-#define vreinterpretq_m128_u64(x) \
-    vreinterpretq_f32_u64(x)
-
-
-#define vreinterpretq_m128_s8(x) \
-    vreinterpretq_f32_s8(x)
-
-#define vreinterpretq_m128_s16(x) \
-    vreinterpretq_f32_s16(x)
-
-#define vreinterpretq_m128_s32(x) \
-    vreinterpretq_f32_s32(x)
-
-#define vreinterpretq_m128_s64(x) \
-    vreinterpretq_f32_s64(x)
-
-
-#define vreinterpretq_f16_m128(x) \
-    vreinterpretq_f16_f32(x)
-
-#define vreinterpretq_f32_m128(x) \
-    (x)
-
-#define vreinterpretq_f64_m128(x) \
-    vreinterpretq_f64_f32(x)
-
-
-#define vreinterpretq_u8_m128(x) \
-    vreinterpretq_u8_f32(x)
-
-#define vreinterpretq_u16_m128(x) \
-    vreinterpretq_u16_f32(x)
-
-#define vreinterpretq_u32_m128(x) \
-    vreinterpretq_u32_f32(x)
-
-#define vreinterpretq_u64_m128(x) \
-    vreinterpretq_u64_f32(x)
-
-
-#define vreinterpretq_s8_m128(x) \
-    vreinterpretq_s8_f32(x)
-
-#define vreinterpretq_s16_m128(x) \
-    vreinterpretq_s16_f32(x)
-
-#define vreinterpretq_s32_m128(x) \
-    vreinterpretq_s32_f32(x)
-
-#define vreinterpretq_s64_m128(x) \
-    vreinterpretq_s64_f32(x)
-
-
-#define vreinterpretq_m128i_s8(x) \
-    vreinterpretq_s32_s8(x)
-
-#define vreinterpretq_m128i_s16(x) \
-    vreinterpretq_s32_s16(x)
-
-#define vreinterpretq_m128i_s32(x) \
-    (x)
-
-#define vreinterpretq_m128i_s64(x) \
-    vreinterpretq_s32_s64(x)
-
-
-#define vreinterpretq_m128i_u8(x) \
-    vreinterpretq_s32_u8(x)
-
-#define vreinterpretq_m128i_u16(x) \
-    vreinterpretq_s32_u16(x)
-
-#define vreinterpretq_m128i_u32(x) \
-    vreinterpretq_s32_u32(x)
-
-#define vreinterpretq_m128i_u64(x) \
-    vreinterpretq_s32_u64(x)
-
-
-#define vreinterpretq_s8_m128i(x) \
-    vreinterpretq_s8_s32(x)
-
-#define vreinterpretq_s16_m128i(x) \
-    vreinterpretq_s16_s32(x)
-
-#define vreinterpretq_s32_m128i(x) \
-    (x)
-
-#define vreinterpretq_s64_m128i(x) \
-    vreinterpretq_s64_s32(x)
-
-
-#define vreinterpretq_u8_m128i(x) \
-    vreinterpretq_u8_s32(x)
-
-#define vreinterpretq_u16_m128i(x) \
-    vreinterpretq_u16_s32(x)
-
-#define vreinterpretq_u32_m128i(x) \
-    vreinterpretq_u32_s32(x)
-
-#define vreinterpretq_u64_m128i(x) \
-    vreinterpretq_u64_s32(x)
-
-
-// union intended to allow direct access to an __m128 variable using the names that the MSVC
-// compiler provides. This union should really only be used when trying to access the members
-// of the vector as integer values. GCC/clang allow native access to the float members through
-// a simple array access operator (in C since 4.6, in C++ since 4.8).
-//
-// Ideally direct accesses to SIMD vectors should not be used since it can cause a performance
-// hit. If it really is needed however, the original __m128 variable can be aliased with a
-// pointer to this union and used to access individual components. The use of this union should
-// be hidden behind a macro that is used throughout the codebase to access the members instead
-// of always declaring this type of variable.
-typedef union ALIGN_STRUCT(16) SIMDVec
-{
-    float       m128_f32[4];    // as floats - do not use this. Added for convenience.
-    int8_t      m128_i8[16];    // as signed 8-bit integers.
-    int16_t     m128_i16[8];    // as signed 16-bit integers.
-    int32_t     m128_i32[4];    // as signed 32-bit integers.
-    int64_t     m128_i64[2];    // as signed 64-bit integers.
-    uint8_t     m128_u8[16];    // as unsigned 8-bit integers.
-    uint16_t    m128_u16[8];    // as unsigned 16-bit integers.
-    uint32_t    m128_u32[4];    // as unsigned 32-bit integers.
-    uint64_t    m128_u64[2];    // as unsigned 64-bit integers.
-} SIMDVec;
-
-
-// ******************************************
-// Set/get methods
-// ******************************************
-
-// extracts the lower order floating point value from the parameter : https://msdn.microsoft.com/en-us/library/bb514059%28v=vs.120%29.aspx?f=255&MSPPError=-2147217396
-FORCE_INLINE float _mm_cvtss_f32(__m128 a)
-{
-    return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-}
-
-// Sets the 128-bit value to zero https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_setzero_si128()
-{
-    return vreinterpretq_m128i_s32(vdupq_n_s32(0));
-}
-
-// Clears the four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_setzero_ps(void)
-{
-    return vreinterpretq_m128_f32(vdupq_n_f32(0));
-}
-
-// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_set1_ps(float _w)
-{
-    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
-}
-
-// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_set_ps1(float _w)
-{
-    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
-}
-
-// Sets the four single-precision, floating-point values to the four inputs. https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
-{
-    float __attribute__((aligned(16))) data[4] = { x, y, z, w };
-    return vreinterpretq_m128_f32(vld1q_f32(data));
-}
-
-// Sets the four single-precision, floating-point values to the four inputs in reverse order. https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_setr_ps(float w, float z , float y , float x )
-{
-    float __attribute__ ((aligned (16))) data[4] = { w, z, y, x };
-    return vreinterpretq_m128_f32(vld1q_f32(data));
-}
-
-
-//added by hasindu
-//Sets the 4 signed 32-bit integer values in reverse order https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
-{
-    int32_t __attribute__((aligned(16))) data[4] = { i3, i2, i1, i0 };
-    return vreinterpretq_m128i_s32(vld1q_s32(data));
-}
-
-//following added by hasindu
-//Sets the 16 signed 8-bit integer values to b.https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_set1_epi8(char w)
-{
-    return vreinterpretq_m128i_s8(vdupq_n_s8(w));
-}
-
-
-//following added by hasindu
-//Sets the 8 signed 16-bit integer values to w. https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_set1_epi16(short w)
-{
-    return vreinterpretq_m128i_s16(vdupq_n_s16(w));
-}
-
-//following added by hasindu
-//Sets the 8 signed 16-bit integer values. https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_set_epi16(short i7, short i6, short i5, short i4, short i3, short i2, short i1, short i0)
-{
-    int16_t __attribute__((aligned(16))) data[8] = { i0, i1, i2, i3, i4, i5, i6, i7 };
-    return vreinterpretq_m128i_s16(vld1q_s16(data));
-}
-
-
-// Sets the 4 signed 32-bit integer values to i. https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_set1_epi32(int _i)
-{
-    return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
-}
-
-// Sets the 4 signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
-{
-    int32_t __attribute__((aligned(16))) data[4] = { i0, i1, i2, i3 };
-    return vreinterpretq_m128i_s32(vld1q_s32(data));
-}
-
-// Stores four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
-FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
-{
-    vst1q_f32(p, vreinterpretq_f32_m128(a));
-}
-
-// Stores four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
-FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
-{
-    vst1q_f32(p, vreinterpretq_f32_m128(a));
-}
-
-// Stores four 32-bit integer values (as a __m128i value) at the address p. https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
-FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
-{
-    vst1q_s32((int32_t*) p, vreinterpretq_s32_m128i(a));
-}
-
-//added by hasindu (verify this for requirement of alignment)
-// Stores four 32-bit integer values (as a __m128i value) at the address p. https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
-FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
-{
-    vst1q_s32((int32_t*) p, vreinterpretq_s32_m128i(a));
-}
-
-// Stores the lower single - precision, floating - point value. https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
-FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
-{
-    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
-}
-
-// Reads the lower 64 bits of b and stores them into the lower 64 bits of a. https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
-FORCE_INLINE void _mm_storel_epi64(__m128i* a, __m128i b)
-{
-    uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
-    uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
-    *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
-}
-
-// Loads a single single-precision, floating-point value, copying it into all four words https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_load1_ps(const float * p)
-{
-    return vreinterpretq_m128_f32(vld1q_dup_f32(p));
-}
-
-// Loads four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_load_ps(const float * p)
-{
-    return vreinterpretq_m128_f32(vld1q_f32(p));
-}
-
-// Loads four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_loadu_ps(const float * p)
-{
-    // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are equivalent for neon
-    return vreinterpretq_m128_f32(vld1q_f32(p));
-}
-
-// Loads a single - precision, floating - point value into the low word and clears the upper three words. https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_load_ss(const float * p)
-{
-    return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
-}
-
-
-// ******************************************
-// Logic/Binary operations
-// ******************************************
-
-// Compares for inequality. https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32( vmvnq_u32( vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)) ) );
-}
-
-// Computes the bitwise AND-NOT of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_s32( vbicq_s32(vreinterpretq_s32_m128(b), vreinterpretq_s32_m128(a)) ); // *NOTE* argument swap
-}
-
-// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the 128-bit value in a. https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32( vbicq_s32(vreinterpretq_s32_m128i(b), vreinterpretq_s32_m128i(a)) ); // *NOTE* argument swap
-}
-
-// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in b. https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32( vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)) );
-}
-
-// Computes the bitwise AND of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_s32( vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)) );
-}
-
-// Computes the bitwise OR of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_s32( vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)) );
-}
-
-// Computes bitwise EXOR (exclusive-or) of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_s32( veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)) );
-}
-
-// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32( vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)) );
-}
-
-// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32( veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)) );
-}
-
-// NEON does not provide this method
-// Creates a 4-bit mask from the most significant bits of the four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
-FORCE_INLINE int _mm_movemask_ps(__m128 a)
-{
-#if ENABLE_CPP_VERSION // I am not yet convinced that the NEON version is faster than the C version of this
-    uint32x4_t &ia = *(uint32x4_t *)&a;
-    return (ia[0] >> 31) | ((ia[1] >> 30) & 2) | ((ia[2] >> 29) & 4) | ((ia[3] >> 28) & 8);
-#else
-    static const uint32x4_t movemask = { 1, 2, 4, 8 };
-    static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
-    uint32x4_t t0 = vreinterpretq_u32_m128(a);
-    uint32x4_t t1 = vtstq_u32(t0, highbit);
-    uint32x4_t t2 = vandq_u32(t1, movemask);
-    uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
-    return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
-#endif
-}
-
-// Takes the upper 64 bits of a and places it in the low end of the result
-// Takes the lower 64 bits of b and places it into the high end of the result.
-FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
-{
-    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
-}
-
-// takes the lower two 32-bit values from a and swaps them and places in high end of result
-// takes the higher two 32 bit values from b and swaps them and places in low end of result.
-FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
-{
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
-{
-    float32x2_t a21 = vget_high_f32(vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
-    float32x2_t b03 = vget_low_f32(vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
-    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
-{
-    float32x2_t a03 = vget_low_f32(vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
-    float32x2_t b21 = vget_high_f32(vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
-    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
-{
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
-{
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
-{
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
-}
-
-// keeps the low 64 bits of a in the low and puts the high 64 bits of b in the high
-FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
-{
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
-{
-    float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
-{
-    float32x2_t a22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
-{
-    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
-    float32x2_t b22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
-    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
-{
-    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-    float32x2_t a22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
-    float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* apoty: TODO: use vzip ?*/
-    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
-{
-    float32x2_t a33 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
-    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
-{
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
-{
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32_t b2 = vgetq_lane_f32(b, 2);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
-{
-    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32_t b2 = vgetq_lane_f32(b, 2);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
-}
-
-// NEON does not support a general purpose permute intrinsic
-// Currently I am not sure whether the C implementation is faster or slower than the NEON version.
-// Note, this has to be expanded as a template because the shuffle value must be an immediate value.
-// The same is true on SSE as well.
-// Selects four specific single-precision, floating-point values from a and b, based on the mask i. https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
-#if ENABLE_CPP_VERSION // I am not convinced that the NEON version is faster than the C version yet.
-FORCE_INLINE __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, __constrange(0,255) int imm)
-{
-    __m128 ret;
-    ret[0] = a[imm & 0x3];
-    ret[1] = a[(imm >> 2) & 0x3];
-    ret[2] = b[(imm >> 4) & 0x03];
-    ret[3] = b[(imm >> 6) & 0x03];
-    return ret;
-}
-#else
-#define _mm_shuffle_ps_default(a, b, imm) \
-({ \
-    float32x4_t ret; \
-    ret = vmovq_n_f32(vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & 0x3)); \
-    ret = vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), ret, 1); \
-    ret = vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), ret, 2); \
-    ret = vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), ret, 3); \
-    vreinterpretq_m128_f32(ret); \
-})
-#endif
-
-//FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) int imm)
-#define _mm_shuffle_ps(a, b, imm) \
-({ \
-    __m128 ret; \
-    switch (imm) \
-    { \
-        case _MM_SHUFFLE(1, 0, 3, 2): ret = _mm_shuffle_ps_1032((a), (b)); break; \
-        case _MM_SHUFFLE(2, 3, 0, 1): ret = _mm_shuffle_ps_2301((a), (b)); break; \
-        case _MM_SHUFFLE(0, 3, 2, 1): ret = _mm_shuffle_ps_0321((a), (b)); break; \
-        case _MM_SHUFFLE(2, 1, 0, 3): ret = _mm_shuffle_ps_2103((a), (b)); break; \
-        case _MM_SHUFFLE(1, 0, 1, 0): ret = _mm_shuffle_ps_1010((a), (b)); break; \
-        case _MM_SHUFFLE(1, 0, 0, 1): ret = _mm_shuffle_ps_1001((a), (b)); break; \
-        case _MM_SHUFFLE(0, 1, 0, 1): ret = _mm_shuffle_ps_0101((a), (b)); break; \
-        case _MM_SHUFFLE(3, 2, 1, 0): ret = _mm_shuffle_ps_3210((a), (b)); break; \
-        case _MM_SHUFFLE(0, 0, 1, 1): ret = _mm_shuffle_ps_0011((a), (b)); break; \
-        case _MM_SHUFFLE(0, 0, 2, 2): ret = _mm_shuffle_ps_0022((a), (b)); break; \
-        case _MM_SHUFFLE(2, 2, 0, 0): ret = _mm_shuffle_ps_2200((a), (b)); break; \
-        case _MM_SHUFFLE(3, 2, 0, 2): ret = _mm_shuffle_ps_3202((a), (b)); break; \
-        case _MM_SHUFFLE(1, 1, 3, 3): ret = _mm_shuffle_ps_1133((a), (b)); break; \
-        case _MM_SHUFFLE(2, 0, 1, 0): ret = _mm_shuffle_ps_2010((a), (b)); break; \
-        case _MM_SHUFFLE(2, 0, 0, 1): ret = _mm_shuffle_ps_2001((a), (b)); break; \
-        case _MM_SHUFFLE(2, 0, 3, 2): ret = _mm_shuffle_ps_2032((a), (b)); break; \
-        default: ret = _mm_shuffle_ps_default((a), (b), (imm)); break; \
-    } \
-    ret; \
-})
-
-// Takes the upper 64 bits of a and places it in the low end of the result
-// Takes the lower 64 bits of a and places it into the high end of the result.
-FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
-{
-    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
-}
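
// A scalar sketch (illustrative only, not part of the original header): the
// _mm_shuffle_ps dispatch above keys on the _MM_SHUFFLE immediate, which packs
// four 2-bit lane selectors into one byte; bits 1:0 and 3:2 pick the two low
// result lanes from a, and bits 5:4 and 7:6 pick the two high result lanes
// from b. Decoded in plain C:
//
//   int imm = _MM_SHUFFLE(1, 0, 3, 2);   /* (1<<6)|(0<<4)|(3<<2)|2 == 0x4E */
//   float a[4] = {10, 11, 12, 13}, b[4] = {20, 21, 22, 23}, r[4];
//   r[0] = a[imm & 0x3];                 /* a[2] == 12 */
//   r[1] = a[(imm >> 2) & 0x3];          /* a[3] == 13 */
//   r[2] = b[(imm >> 4) & 0x3];          /* b[0] == 20 */
//   r[3] = b[(imm >> 6) & 0x3];          /* b[1] == 21 */
//
// which is exactly the _MM_SHUFFLE(1, 0, 3, 2) case handled by
// _mm_shuffle_ps_1032 above.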
-
-// takes the lower two 32-bit values from a and swaps them and places in low end of result
-// takes the higher two 32 bit values from a and swaps them and places in high end of result.
-FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
-{
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
-    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
-}
-
-// rotates the least significant 32 bits into the most significant 32 bits, and shifts the rest down
-FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
-{
-    return vreinterpretq_m128i_s32(vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
-}
-
-// rotates the most significant 32 bits into the least significant 32 bits, and shifts the rest up
-FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
-{
-    return vreinterpretq_m128i_s32(vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
-}
-
-// gets the lower 64 bits of a, and places it in the upper 64 bits
-// gets the lower 64 bits of a and places it in the lower 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
-{
-    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
-}
-
-// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the lower 64 bits
-// gets the lower 64 bits of a, and places it in the upper 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
-{
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
-}
-
-// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the upper 64 bits
-// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the lower 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
-{
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
-{
-    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
-    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
-    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
-{
-    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
-{
-    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
-    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
-}
-
-//FORCE_INLINE __m128i _mm_shuffle_epi32_default(__m128i a, __constrange(0,255) int imm)
-#if ENABLE_CPP_VERSION
-FORCE_INLINE __m128i _mm_shuffle_epi32_default(__m128i a, __constrange(0,255) int imm)
-{
-    __m128i ret;
-    ret[0] = a[imm & 0x3];
-    ret[1] = a[(imm >> 2) & 0x3];
-    ret[2] = a[(imm >> 4) & 0x03];
-    ret[3] = a[(imm >> 6) & 0x03];
-    return ret;
-}
-#else
-#define _mm_shuffle_epi32_default(a, imm) \
-({ \
-    int32x4_t ret; \
-    ret = vmovq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & 0x3)); \
-    ret = vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), ret, 1); \
-    ret = vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), ret, 2); \
-    ret = vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), ret, 3); \
-    vreinterpretq_m128i_s32(ret); \
-})
-#endif
-
-//FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255) int imm)
-#if defined(__aarch64__)
-#define _mm_shuffle_epi32_splat(a, imm) \
-({ \
-    vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
-})
-#else
-#define _mm_shuffle_epi32_splat(a, imm) \
-({ \
-    vreinterpretq_m128i_s32(vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
-})
-#endif
-
-// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
-//FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_shuffle_epi32(a, imm) \
-({ \
-    __m128i ret; \
-    switch (imm) \
-    { \
-        case _MM_SHUFFLE(1, 0, 3, 2): ret = _mm_shuffle_epi_1032((a)); break; \
-        case _MM_SHUFFLE(2, 3, 0, 1): ret = _mm_shuffle_epi_2301((a)); break; \
-        case _MM_SHUFFLE(0, 3, 2, 1): ret = _mm_shuffle_epi_0321((a)); break; \
-        case _MM_SHUFFLE(2, 1, 0, 3): ret = _mm_shuffle_epi_2103((a)); break; \
-        case _MM_SHUFFLE(1, 0, 1, 0): ret = _mm_shuffle_epi_1010((a)); break; \
-        case _MM_SHUFFLE(1, 0, 0, 1): ret = _mm_shuffle_epi_1001((a)); break; \
-        case _MM_SHUFFLE(0, 1, 0, 1): ret = _mm_shuffle_epi_0101((a)); break; \
-        case _MM_SHUFFLE(2, 2, 1, 1): ret = _mm_shuffle_epi_2211((a)); break; \
-        case _MM_SHUFFLE(0, 1, 2, 2): ret = _mm_shuffle_epi_0122((a)); break; \
-        case _MM_SHUFFLE(3, 3, 3, 2): ret = _mm_shuffle_epi_3332((a)); break; \
-        case _MM_SHUFFLE(0, 0, 0, 0): ret = _mm_shuffle_epi32_splat((a),0); break; \
-        case _MM_SHUFFLE(1, 1, 1, 1): ret = _mm_shuffle_epi32_splat((a),1); break; \
-        case _MM_SHUFFLE(2, 2, 2, 2): ret = _mm_shuffle_epi32_splat((a),2); break; \
-        case _MM_SHUFFLE(3, 3, 3, 3): ret = _mm_shuffle_epi32_splat((a),3); break; \
-        default: ret = _mm_shuffle_epi32_default((a), (imm)); break; \
-    } \
-    ret; \
-})
-
-// Shuffles the upper 4 signed or unsigned 16 - bit integers in a as specified by imm. https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
-//FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a, __constrange(0,255) int imm)
-#define _mm_shufflehi_epi16_function(a, imm) \
-({ \
-    int16x8_t ret = vreinterpretq_s16_s32(a); \
-    int16x4_t highBits = vget_high_s16(ret); \
-    ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & 0x3), ret, 4); \
-    ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, 5); \
-    ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, 6); \
-    ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, 7); \
-    vreinterpretq_s32_s16(ret); \
-})
-
-//FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, __constrange(0,255) int imm)
-#define _mm_shufflehi_epi16(a, imm) \
-    _mm_shufflehi_epi16_function((a), (imm))
-
-
-//added by hasindu
-//Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while shifting in zeros. https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
-#define _mm_slli_epi16(a, imm) \
-({ \
-    __m128i ret; \
-    if ((imm) <= 0) {\
-        ret = a; \
-    } \
-    else if ((imm) > 15) { \
-        ret = _mm_setzero_si128(); \
-    } \
-    else { \
-        ret = vreinterpretq_m128i_s16(vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \
-    } \
-    ret; \
-})
-
-
-
-// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while shifting in zeros. : https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
-//FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_slli_epi32(a, imm) \
-({ \
-    __m128i ret; \
-    if ((imm) <= 0) {\
-        ret = a; \
-    } \
-    else if ((imm) > 31) { \
-        ret = _mm_setzero_si128(); \
-    } \
-    else { \
-        ret = vreinterpretq_m128i_s32(vshlq_n_s32(vreinterpretq_s32_m128i(a), (imm))); \
-    } \
-    ret; \
-})
-
-
-//added by hasindu
-// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits while shifting in zeros.
-//https://msdn.microsoft.com/en-us/library/6tcwd38t(v=vs.90).aspx
-#define _mm_srli_epi16(a, imm) \
-({ \
-    __m128i ret; \
-    if ((imm) <= 0) { \
-        ret = a; \
-    } \
-    else if ((imm) > 15) { \
-        ret = _mm_setzero_si128(); \
-    } \
-    else { \
-        ret = vreinterpretq_m128i_u16(vshrq_n_u16(vreinterpretq_u16_m128i(a), (imm))); \
-    } \
-    ret; \
-})
-
-
-//Shifts the 4 signed or unsigned 32-bit integers in a right by count bits while shifting in zeros. https://msdn.microsoft.com/en-us/library/w486zcfa(v=vs.100).aspx
-//FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_srli_epi32(a, imm) \
-({ \
-    __m128i ret; \
-    if ((imm) <= 0) { \
-        ret = a; \
-    } \
-    else if ((imm)> 31) { \
-        ret = _mm_setzero_si128(); \
-    } \
-    else { \
-        ret = vreinterpretq_m128i_u32(vshrq_n_u32(vreinterpretq_u32_m128i(a), (imm))); \
-    } \
-    ret; \
-})
-
-// Shifts the 4 signed 32 - bit integers in a right by count bits while shifting in the sign bit. https://msdn.microsoft.com/en-us/library/z1939387(v=vs.100).aspx
-//FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_srai_epi32(a, imm) \
-({ \
-    __m128i ret; \
-    if ((imm) <= 0) { \
-        ret = a; \
-    } \
-    else if ((imm) > 31) { \
-        ret = vreinterpretq_m128i_s32(vshrq_n_s32(vreinterpretq_s32_m128i(a), 16)); \
-        ret = vreinterpretq_m128i_s32(vshrq_n_s32(vreinterpretq_s32_m128i(ret), 16)); \
-    } \
-    else { \
-        ret = vreinterpretq_m128i_s32(vshrq_n_s32(vreinterpretq_s32_m128i(a), (imm))); \
-    } \
-    ret; \
-})
-
-// Shifts the 128 - bit value in a right by imm bytes while shifting in zeros. imm must be an immediate. https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
-//FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
-#define _mm_srli_si128(a, imm) \
-({ \
-    __m128i ret; \
-    if ((imm) <= 0) { \
-        ret = a; \
-    } \
-    else if ((imm) > 15) { \
-        ret = _mm_setzero_si128(); \
-    } \
-    else { \
-        ret = vreinterpretq_m128i_s8(vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
-    } \
-    ret; \
-})
-
-// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm must be an immediate. https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
-//FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
-#define _mm_slli_si128(a, imm) \
-({ \
-    __m128i ret; \
-    if ((imm) <= 0) { \
-        ret = a; \
-    } \
-    else if ((imm) > 15) { \
-        ret = _mm_setzero_si128(); \
-    } \
-    else { \
-        ret = vreinterpretq_m128i_s8(vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
-    } \
-    ret; \
-})
-
-// NEON does not provide a version of this function, here is an article about some ways to repro the results.
-// http://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
-// Creates a 16-bit mask from the most significant bits of the 16 signed or unsigned 8-bit integers in a and zero extends the upper bits. https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
-FORCE_INLINE int _mm_movemask_epi8(__m128i _a)
-{
-    uint8x16_t input = vreinterpretq_u8_m128i(_a);
-    static const int8_t __attribute__((aligned(16))) xr[8] = { -7, -6, -5, -4, -3, -2, -1, 0 };
-    uint8x8_t mask_and = vdup_n_u8(0x80);
-    int8x8_t mask_shift = vld1_s8(xr);
-
-    uint8x8_t lo = vget_low_u8(input);
-    uint8x8_t hi = vget_high_u8(input);
-
-    lo = vand_u8(lo, mask_and);
-    lo = vshl_u8(lo, mask_shift);
-
-    hi = vand_u8(hi, mask_and);
-    hi = vshl_u8(hi, mask_shift);
-
-    lo = vpadd_u8(lo, lo);
-    lo = vpadd_u8(lo, lo);
-    lo = vpadd_u8(lo, lo);
-
-    hi = vpadd_u8(hi, hi);
-    hi = vpadd_u8(hi, hi);
-    hi = vpadd_u8(hi, hi);
-
-    return ((hi[0] << 8) | (lo[0] & 0xFF));
-}
-
-
-// ******************************************
-// Math operations
-// ******************************************
-
-// Subtracts the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or unsigned 32-bit integers of a. https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-//added by hasindu
-FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-//added by hasindu
-//Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit integers of a and saturates. https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-//added by hasindu
-//Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit integers of a and saturates. https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
-FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Adds the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// adds the scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
-{
-    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
-    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
-    //the upper values in the result must be the remnants of <a>.
-    return vreinterpretq_m128_f32(vaddq_f32(a, value));
-}
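
// A scalar model of _mm_movemask_epi8 above (illustrative only, not part of
// the original header): the NEON version ANDs each byte with 0x80, shifts
// lane i right by (7 - i) so its sign bit lands at bit position i, then folds
// the lanes together with pairwise adds; since every lane contributes a
// distinct bit, the adds behave like ORs. In plain C the same result is:
//
//   int movemask_epi8_scalar(const uint8_t v[16])
//   {
//       int mask = 0;
//       for (int i = 0; i < 16; ++i)
//           mask |= (v[i] >> 7) << i;   /* bit i = sign bit of byte i */
//       return mask;
//   }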
-
-// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or unsigned 32-bit integers in b. https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-//added by hasindu
-// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or unsigned 8-bit integers in b. https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
-FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-//added by hasindu
-// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b and saturates. https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-//added by hasindu
-//Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in b and saturates. https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-
-// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or unsigned 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or unsigned 32-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(vmulq_s32(vreinterpretq_s32_m128i(a),vreinterpretq_s32_m128i(b)));
-}
-
-// Multiplies the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Divides the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
-{
-    float32x4_t recip0 = vrecpeq_f32(vreinterpretq_f32_m128(b));
-    float32x4_t recip1 = vmulq_f32(recip0, vrecpsq_f32(recip0, vreinterpretq_f32_m128(b)));
-    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip1));
-}
-
-// Divides the scalar single-precision floating point value of a by b. https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
-{
-    float32_t value = vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
-    return vreinterpretq_m128_f32(vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
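
// A scalar model of the reciprocal refinement used by _mm_div_ps above and by
// recipq_newton below (illustrative only, not part of the original header):
// vrecpeq_f32 returns a rough estimate r of 1/x, and each vrecpsq_f32 step
// computes (2 - x*r), so r = r * (2 - x*r) is one Newton-Raphson iteration
// that roughly doubles the number of correct bits:
//
//   float recip_newton(float x, float r, int n)   /* r: coarse seed for 1/x */
//   {
//       for (int i = 0; i < n; ++i)
//           r = r * (2.0f - x * r);   /* the step vrecpsq_f32 implements */
//       return r;
//   }
//
// e.g. recip_newton(3.0f, 0.3f, 3) converges to 0.33333334f, so two or three
// steps recover full single precision from a coarse estimate.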
-
-// This version does additional iterations to improve accuracy. Between 1 and 4 recommended.
-// Computes the approximations of reciprocals of the four single-precision, floating-point values of a. https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx
-FORCE_INLINE __m128 recipq_newton(__m128 in, int n)
-{
-    int i;
-    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
-    for (i = 0; i < n; ++i)
-    {
-        recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
-    }
-    return vreinterpretq_m128_f32(recip);
-}
-
-// Computes the approximations of reciprocals of the four single-precision, floating-point values of a. https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
-{
-    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
-    return vreinterpretq_m128_f32(recip);
-}
-
-// Computes the approximations of square roots of the four single-precision, floating-point values of a. First computes reciprocal square roots and then reciprocals of the four values. https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
-{
-    float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
-    float32x4_t sq = vrecpeq_f32(recipsq);
-    // ??? use step versions of both sqrt and recip for better accuracy?
-    return vreinterpretq_m128_f32(sq);
-}
-
-// Computes the approximation of the square root of the scalar single-precision floating point value of in. https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
-{
-    float32_t value = vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
-    return vreinterpretq_m128_f32(vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
-}
-
-// Computes the approximations of the reciprocal square roots of the four single-precision floating point values of in. https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
-{
-    return vreinterpretq_m128_f32(vrsqrteq_f32(vreinterpretq_f32_m128(in)));
-}
-
-// Computes the maximums of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Computes the minima of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Computes the maximum of the two lower scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
-{
-    float32_t value = vgetq_lane_f32(vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0);
-    return vreinterpretq_m128_f32(vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
-{
-    float32_t value = vgetq_lane_f32(vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0);
-    return vreinterpretq_m128_f32(vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
-
-//added by hasindu
-//Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the 16 unsigned 8-bit integers from b. https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-//added by hasindu
-//Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the 16 unsigned 8-bit integers from b. https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-
-// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8 signed 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-//added by hasindu
-//Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8 signed 16-bit integers from b. https://msdn.microsoft.com/en-us/library/3x060h7c(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-
-// epi versions of min/max
-// Computes the pairwise maxima of the four signed 32-bit integer values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Computes the pairwise minima of the four signed 32-bit integer values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit integers from b, keeping the high 16 bits of each 32-bit product. https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
-{
-    /* apoty: issue with large values because of result saturation */
-    //int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)); /* =2*a*b */
-    //return vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
-    int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
-    int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
-    int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
-    int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
-    int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
-    int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
-    uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
-    return vreinterpretq_m128i_u16(r.val[1]);
-}
-
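/* Editor's aside -- a minimal illustrative sketch, not part of the
   original header. Per lane, _mm_mulhi_epi16 widens, multiplies and
   keeps the high half, which is what the vmull_s16 + vuzpq_u16
   sequence above reconstructs; mulhi16_scalar is a hypothetical name. */
static int16_t mulhi16_scalar(int16_t a, int16_t b)
{
    /* widen to 32 bits so the full product survives, then take bits 31..16
       (relies on arithmetic right shift, as on the targets in question) */
    return (int16_t)(((int32_t)a * (int32_t)b) >> 16);
}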
-// Computes the pairwise addition of the single-precision, floating-point values of a and b.
-//https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
-FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); //AArch64
-#else
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
-#endif
-}
-
-// ******************************************
-// Compare operations
-// ******************************************
-
-// Compares for less than. https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compares for greater than. https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compares for greater than or equal. https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compares for less than or equal. https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compares for equality. https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-
-//added by hasindu
-//Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or unsigned 8-bit integers in b for equality. https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_cmpeq_epi8 (__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-//added by hasindu
-//Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or unsigned 16-bit integers in b for equality.
-//https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmpeq_epi16 (__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-//added by hasindu
-//Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers in b for less than. https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_cmplt_epi8 (__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-
-//added by hasindu
-//Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers in b for greater than.
https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u8(vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); -} - -//added by hasindu -//Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers in b for greater than. https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u16(vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -} - - -// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers in b for less than. https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u32(vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers in b for greater than. https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u32(vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -// Compares the four 32-bit floats in a and b to check if any values are NaN. Ordered compare between each value returns true for "orderable" and false for "not orderable" (NaN). https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx -// see also: -// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean -// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics -FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b ) -{ - // Note: NEON does not have ordered compare builtin - // Need to compare a eq a and b eq b to check for NaN - // Do AND of results to get final - uint32x4_t ceqaa = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t ceqbb = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb)); -} - -// Compares the lower single-precision floating point scalar values of a and b using a less than operation. : https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx -// Important note!! The documentation on MSDN is incorrect! If either of the values is a NAN the docs say you will get a one, but in fact, it will return a zero!! -FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) -{ - uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); - uint32x4_t a_lt_b = vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); - return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0) ? 1 : 0; -} - -// Compares the lower single-precision floating point scalar values of a and b using a greater than operation. 
: https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx -FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) -{ - //return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0); - uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_gt_b = vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); - return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0; -} - -// Compares the lower single-precision floating point scalar values of a and b using a less than or equal operation. : https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx -FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) -{ - //return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0); - uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); - uint32x4_t a_le_b = vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); - return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0) ? 1 : 0; -} - -// Compares the lower single-precision floating point scalar values of a and b using a greater than or equal operation. : https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx -FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) -{ - //return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0); - uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_ge_b = vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); - return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0; -} - -// Compares the lower single-precision floating point scalar values of a and b using an equality operation. : https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx -FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) -{ - //return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0); - uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); - uint32x4_t a_eq_b = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); - return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0) ? 1 : 0; -} - -// Compares the lower single-precision floating point scalar values of a and b using an inequality operation. 
: https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx -FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) -{ - //return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0); - uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); - return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0) ? 1 : 0; -} - -// according to the documentation, these intrinsics behave the same as the non-'u' versions. We'll just alias them here. -#define _mm_ucomilt_ss _mm_comilt_ss -#define _mm_ucomile_ss _mm_comile_ss -#define _mm_ucomigt_ss _mm_comigt_ss -#define _mm_ucomige_ss _mm_comige_ss -#define _mm_ucomieq_ss _mm_comieq_ss -#define _mm_ucomineq_ss _mm_comineq_ss - -// ****************************************** -// Conversions -// ****************************************** - -// Converts the four single-precision, floating-point values of a to signed 32-bit integer values using truncate. https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) -{ - return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); -} - -// Converts the four signed 32-bit integer values of a to single-precision, floating-point values https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) -{ - return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a))); -} - -// Converts the four unsigned 8-bit integers in the lower 32 bits to four unsigned 32-bit integers. https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx -FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) -{ - uint8x16_t u8x16 = vreinterpretq_u8_s32(a); /* xxxx xxxx xxxx DCBA */ - uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ - uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ - return vreinterpretq_s32_u32(u32x4); -} - -// Converts the four signed 16-bit integers in the lower 64 bits to four signed 32-bit integers. https://msdn.microsoft.com/en-us/library/bb514079%28v=vs.100%29.aspx -FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a) -{ - return vreinterpretq_m128i_s32(vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a)))); -} - -// Converts the four single-precision, floating-point values of a to signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx -// *NOTE*. The default rounding mode on SSE is 'round to even', which ArmV7 does not support! -// It is supported on ARMv8 however. 
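/* Editor's aside -- a minimal illustrative sketch, not part of the
   original header. The ARMv7 fallback below reconstructs IEEE
   round-half-to-even by hand; a scalar model of that behaviour
   (round_even_scalar is a hypothetical name; truncf comes from
   <math.h> and int32_t from <stdint.h>): */
static int32_t round_even_scalar(float a)
{
    float t = truncf(a);                /* [a], truncated toward zero */
    float delta = a - t;                /* signed fractional part */
    if (delta == 0.5f || delta == -0.5f) {
        int32_t r = (int32_t)t;
        if (r & 1)                      /* exact tie on an odd value: */
            r += (a > 0.0f) ? 1 : -1;   /* step to the even neighbour */
        return r;
    }
    return (int32_t)(a + (a >= 0.0f ? 0.5f : -0.5f)); /* plain round-to-nearest */
}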
-FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) -{ -#if defined(__aarch64__) - return vcvtnq_s32_f32(a); -#else - uint32x4_t signmask = vdupq_n_u32(0x80000000); - float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), vdupq_n_f32(0.5f)); /* +/- 0.5 */ - int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ - int32x4_t r_trunc = vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ - int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ - int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ - float32x4_t delta = vsubq_f32(vreinterpretq_f32_m128(a), vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ - uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ - return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal)); -#endif -} - -// Moves the least significant 32 bits of a to a 32-bit integer. https://msdn.microsoft.com/en-us/library/5z7a9642%28v=vs.90%29.aspx -FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) -{ - return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); -} - -// Moves 32-bit integer a to the least significant 32 bits of an __m128 object, zero extending the upper bits. https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) -{ - return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0)); -} - - -// Applies a type cast to reinterpret four 32-bit floating point values passed in as a 128-bit parameter as packed 32-bit integers. https://msdn.microsoft.com/en-us/library/bb514099.aspx -FORCE_INLINE __m128i _mm_castps_si128(__m128 a) -{ - return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a)); -} - -// Applies a type cast to reinterpret four 32-bit integers passed in as a 128-bit parameter as packed 32-bit floating point values. https://msdn.microsoft.com/en-us/library/bb514029.aspx -FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a) -{ - return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a)); -} - -// Loads 128-bit value. : https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx -FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) -{ - return vreinterpretq_m128i_s32(vld1q_s32((int32_t *)p)); -} - -//added by hasindu (verify this for requirement of alignment) -// Loads 128-bit value. : https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx -FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) -{ - return vreinterpretq_m128i_s32(vld1q_s32((int32_t *)p)); -} - - -// ****************************************** -// Miscellaneous Operations -// ****************************************** - -// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and saturates. https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s8(vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)), vqmovn_s16(vreinterpretq_s16_m128i(b)))); -} - -// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned integers and saturates. 
https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx -FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) -{ - return vreinterpretq_m128i_u8(vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)), vqmovun_s16(vreinterpretq_s16_m128i(b)))); -} - -// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers and saturates. https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s16(vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)), vqmovn_s32(vreinterpretq_s32_m128i(b)))); -} - -// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower 8 signed or unsigned 8-bit integers in b. https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) -{ - int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a))); - int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b))); - int8x8x2_t result = vzip_s8(a1, b1); - return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); -} - -// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the lower 4 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) -{ - int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a)); - int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b)); - int16x4x2_t result = vzip_s16(a1, b1); - return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); -} - -// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the lower 2 signed or unsigned 32 - bit integers in b. https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx -FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) -{ - int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a)); - int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b)); - int32x2x2_t result = vzip_s32(a1, b1); - return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); -} - -// Selects and interleaves the lower two single-precision, floating-point values from a and b. https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx -FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) -{ - float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a)); - float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b)); - float32x2x2_t result = vzip_f32(a1, b1); - return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); -} - -// Selects and interleaves the upper two single-precision, floating-point values from a and b. https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx -FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) -{ - float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a)); - float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b)); - float32x2x2_t result = vzip_f32(a1, b1); - return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); -} - -// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper 8 signed or unsigned 8-bit integers in b. 
https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
-{
-    int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
-    int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
-    int8x8x2_t result = vzip_s8(a1, b1);
-    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
-}
-
-// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the upper 4 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
-{
-    int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
-    int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
-    int16x4x2_t result = vzip_s16(a1, b1);
-    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
-}
-
-// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the upper 2 signed or unsigned 32-bit integers in b. https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
-{
-    int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
-    int32x2x2_t result = vzip_s32(a1, b1);
-    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
-}
-
-// Extracts the selected signed or unsigned 16-bit integer from a and zero extends. https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
-//FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
-#define _mm_extract_epi16(a, imm) \
-({ \
-    (vgetq_lane_s16(vreinterpretq_s16_m128i(a), (imm)) & 0x0000ffffUL); \
-})
-
-// Inserts the least significant 16 bits of b into the selected 16-bit integer of a. https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
-//FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, const int b, __constrange(0,8) int imm)
-#define _mm_insert_epi16(a, b, imm) \
-({ \
-    vreinterpretq_m128i_s16(vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
-})
-
-// ******************************************
-// Streaming Extensions
-// ******************************************
-
-// Guarantees that every preceding store is globally visible before any subsequent store. https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
-FORCE_INLINE void _mm_sfence(void)
-{
-    __sync_synchronize();
-}
-
-// Stores the data in a to the address p without polluting the caches. If the cache line containing address p is already in the cache, the cache will be updated. Address p must be 16-byte aligned. https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
-FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
-{
-    *p = a;
-}
-
-// Cache line containing p is flushed and invalidated from all caches in the coherency domain. https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
-/*
-FORCE_INLINE void _mm_clflush(void const*p)
-{
-    // no corollary for Neon?
-} -*/ - -#if defined(__GNUC__) || defined(__clang__) -# pragma pop_macro("ALIGN_STRUCT") -# pragma pop_macro("FORCE_INLINE") -#endif - -#endif diff --git a/src/c/common.c b/src/c/common.c deleted file mode 100644 index 4bb8ef1..0000000 --- a/src/c/common.c +++ /dev/null @@ -1,15 +0,0 @@ -#include "common.h" -#include - -void write_seed(char seed[32], uint64_t numeric_id) { - numeric_id = bswap_64(numeric_id); - memmove(&seed[0], &numeric_id, 8); - memset(&seed[8], 0, 8); - seed[16] = -128; // shabal message termination bit - memset(&seed[17], 0, 15); -} - -void write_term(char term[32]) { - term[0] = -128; // shabal message termination bit - memset(&term[1], 0, 31); -} diff --git a/src/c/common.h b/src/c/common.h deleted file mode 100644 index a7aa9b8..0000000 --- a/src/c/common.h +++ /dev/null @@ -1,65 +0,0 @@ -#include - -#pragma once - -#ifdef _MSC_VER - -#include -#define bswap_32(x) _byteswap_ulong(x) -#define bswap_64(x) _byteswap_uint64(x) - -#elif defined(__APPLE__) - -// Mac OS X / Darwin features -#include -#define bswap_32(x) OSSwapInt32(x) -#define bswap_64(x) OSSwapInt64(x) - -#elif defined(__sun) || defined(sun) - -#include -#define bswap_32(x) BSWAP_32(x) -#define bswap_64(x) BSWAP_64(x) - -#elif defined(__FreeBSD__) - -#include -#define bswap_32(x) bswap32(x) -#define bswap_64(x) bswap64(x) - -#elif defined(__OpenBSD__) - -#include -#define bswap_32(x) swap32(x) -#define bswap_64(x) swap64(x) - -#elif defined(__NetBSD__) - -#include -#include -#if defined(__BSWAP_RENAME) && !defined(__bswap_32) -#define bswap_32(x) bswap32(x) -#define bswap_64(x) bswap64(x) -#endif - -#else - -#include - -#endif - -#define HASH_SIZE 32 -#define HASH_CAP 4096 -#define NUM_SCOOPS 4096 -#define SCOOP_SIZE 64 -#define NONCE_SIZE (HASH_CAP * SCOOP_SIZE) // 4096*64 - -void write_seed(char seed[32], uint64_t numeric_id); - -void write_term(char term[32]); - -#define SET_BEST_DEADLINE(d, o) \ - if ((d) < *best_deadline) { \ - *best_deadline = (d); \ - *best_offset = (o); \ - } diff --git a/src/c/mshabal_128_avx.c b/src/c/mshabal_128_avx.c deleted file mode 100644 index f4c239c..0000000 --- a/src/c/mshabal_128_avx.c +++ /dev/null @@ -1,966 +0,0 @@ -/* - * Parallel implementation of Shabal, using the AVX unit. This code - * compiles and runs on x86 architectures, in 32-bit or 64-bit mode, - * which possess a AVX-compatible SIMD unit. - * - * - * (c) 2010 SAPHIR project. This software is provided 'as-is', without - * any epxress or implied warranty. In no event will the authors be held - * liable for any damages arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to no restriction. 
- * - * Technical remarks and questions can be addressed to: - * - */ - -#include -#include -#include -#include "mshabal_128_avx.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef _MSC_VER -#pragma warning(disable : 4146) -#endif - -typedef mshabal_u32 u32; - -#define C32(x) ((u32)x##UL) -#define T32(x) ((x)&C32(0xFFFFFFFF)) -#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n)))) - -static void mshabal_compress_avx(mshabal128_context *sc, const unsigned char *buf0, - const unsigned char *buf1, const unsigned char *buf2, - const unsigned char *buf3, size_t num) { - _mm256_zeroupper(); - union { - u32 words[16 * MSHABAL128_VECTOR_SIZE]; - __m128i data[16]; - } u; - size_t j; - __m128i A[12], B[16], C[16]; - __m128i one; - - for (j = 0; j < 12; j++) A[j] = _mm_loadu_si128((__m128i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm_loadu_si128((__m128i *)sc->state + j + 12); - C[j] = _mm_loadu_si128((__m128i *)sc->state + j + 28); - } - one = _mm_set1_epi32(C32(0xFFFFFFFF)); - -#define M(i) _mm_load_si128(u.data + i) - - while (num-- > 0) { - for (j = 0; j < 16 * MSHABAL128_VECTOR_SIZE; j += MSHABAL128_VECTOR_SIZE) { - u.words[j + 0] = *(u32 *)(buf0 + j); - u.words[j + 1] = *(u32 *)(buf1 + j); - u.words[j + 2] = *(u32 *)(buf2 + j); - u.words[j + 3] = *(u32 *)(buf3 + j); - } - - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - -#define PP(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m128i tt; \ - tt = _mm_or_si128(_mm_slli_epi32(xa1, 15), _mm_srli_epi32(xa1, 17)); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 2), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(xa0, tt), xc); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 1), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(tt, xb1), _mm_xor_si128(_mm_andnot_si128(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm_or_si128(_mm_slli_epi32(tt, 1), _mm_srli_epi32(tt, 31)); \ - xb0 = _mm_xor_si128(tt, _mm_xor_si128(xa0, one)); \ - } while (0) - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], 
C[0x5], M(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB(xb, xc, xm) \ - do { \ - __m128i tmp; 
\ - tmp = xb; \ - xb = _mm_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M(0xF)); - - buf0 += 64; - buf1 += 64; - buf2 += 64; - buf3 += 64; - if (++sc->Wlow == 0) sc->Whigh++; - } - - for (j = 0; j < 12; j++) _mm_storeu_si128((__m128i *)sc->state + j, A[j]); - for (j = 0; j < 16; j++) { - _mm_storeu_si128((__m128i *)sc->state + j + 12, B[j]); - _mm_storeu_si128((__m128i *)sc->state + j + 28, C[j]); - } -#undef M -} - -void mshabal_init_avx(mshabal128_context *sc, unsigned out_size) { - unsigned u; - - memset(sc->state, 0, sizeof sc->state); - memset(sc->buf0, 0, sizeof sc->buf0); - memset(sc->buf1, 0, sizeof sc->buf1); - memset(sc->buf2, 0, sizeof sc->buf2); - memset(sc->buf3, 0, sizeof sc->buf3); - for (u = 0; u < 16; u++) { - sc->buf0[4 * u + 0] = (out_size + u); - sc->buf0[4 * u + 1] = (out_size + u) >> 8; - sc->buf1[4 * u + 0] = (out_size + u); - sc->buf1[4 * u + 1] = (out_size + u) >> 8; - sc->buf2[4 * u + 0] = (out_size + u); - sc->buf2[4 * u + 1] = (out_size + u) >> 8; - sc->buf3[4 * u + 0] = (out_size + u); - sc->buf3[4 * u + 1] = (out_size + u) >> 8; - } - sc->Whigh = sc->Wlow = C32(0xFFFFFFFF); - mshabal_compress_avx(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, 1); - for (u = 0; u < 16; u++) { - sc->buf0[4 * u + 0] = (out_size + u + 16); - sc->buf0[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf1[4 * u + 0] = (out_size + u + 16); - sc->buf1[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf2[4 * u + 0] = (out_size + u + 16); - sc->buf2[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf3[4 * u + 0] = (out_size + u + 16); - sc->buf3[4 * u + 1] = (out_size + u + 16) >> 8; - } - mshabal_compress_avx(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, 1); - sc->ptr = 0; - sc->out_size = out_size; -} - -void mshabal_avx(mshabal128_context *sc, const void *data0, const void *data1, const void *data2, - const void *data3, size_t len) { - size_t ptr, num; - - if (data0 == NULL) { - if (data1 == NULL) { - if (data2 == NULL) { - if (data3 == NULL) { - return; - } else { - data0 = data3; - } - } else { - data0 = data2; - } - } else { - data0 = data1; - } - } - - if (data1 == NULL) data1 = data0; - if (data2 == NULL) data2 = data0; - if (data3 == NULL) data3 = data0; - - ptr = sc->ptr; - if (ptr != 0) { - size_t clen = (sizeof sc->buf0 - ptr); - if (clen > len) { - memcpy(sc->buf0 + ptr, data0, len); - memcpy(sc->buf1 + ptr, data1, len); - memcpy(sc->buf2 + ptr, data2, len); - memcpy(sc->buf3 + ptr, data3, len); - sc->ptr = ptr + len; - return; - } else { - memcpy(sc->buf0 + ptr, data0, clen); - memcpy(sc->buf1 + ptr, data1, clen); - memcpy(sc->buf2 + ptr, data2, clen); - memcpy(sc->buf3 + ptr, data3, clen); - mshabal_compress_avx(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, 1); - data0 = (const unsigned char *)data0 + clen; - data1 = (const unsigned char *)data1 + clen; - data2 = (const unsigned char *)data2 + clen; - data3 = (const unsigned char *)data3 + clen; - len -= clen; - } 
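/* Editor's note on the flow at this point (not in the original file):
   a previously buffered partial block, if any, has just been topped up
   to 64 bytes and compressed. The code below then feeds whole 64-byte
   blocks (len >> 6 of them) to mshabal_compress_avx straight from the
   caller's pointers, and finally stashes the len & 63 leftover bytes
   in buf0..buf3 so a later mshabal_avx or mshabal_close_avx call can
   complete the block. */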
- } - - num = len >> 6; - if (num != 0) { - mshabal_compress_avx(sc, data0, data1, data2, data3, num); - data0 = (const unsigned char *)data0 + (num << 6); - data1 = (const unsigned char *)data1 + (num << 6); - data2 = (const unsigned char *)data2 + (num << 6); - data3 = (const unsigned char *)data3 + (num << 6); - } - len &= 63; - memcpy(sc->buf0, data0, len); - memcpy(sc->buf1, data1, len); - memcpy(sc->buf2, data2, len); - memcpy(sc->buf3, data3, len); - sc->ptr = len; -} - -void mshabal_close_avx(mshabal128_context *sc, unsigned ub0, unsigned ub1, unsigned ub2, unsigned ub3, - unsigned n, void *dst0, void *dst1, void *dst2, void *dst3) { - size_t ptr, off; - unsigned z, out_size_w32; - - z = 0x80 >> n; - ptr = sc->ptr; - sc->buf0[ptr] = (ub0 & -z) | z; - sc->buf1[ptr] = (ub1 & -z) | z; - sc->buf2[ptr] = (ub2 & -z) | z; - sc->buf3[ptr] = (ub3 & -z) | z; - ptr++; - memset(sc->buf0 + ptr, 0, (sizeof sc->buf0) - ptr); - memset(sc->buf1 + ptr, 0, (sizeof sc->buf1) - ptr); - memset(sc->buf2 + ptr, 0, (sizeof sc->buf2) - ptr); - memset(sc->buf3 + ptr, 0, (sizeof sc->buf3) - ptr); - for (z = 0; z < 4; z++) { - mshabal_compress_avx(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, 1); - if (sc->Wlow-- == 0) sc->Whigh--; - } - out_size_w32 = sc->out_size >> 5; - off = MSHABAL128_VECTOR_SIZE * (28 + (16 - out_size_w32)); - if (dst0 != NULL) { - u32 *out; - - out = (u32 *)dst0; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL128_VECTOR_SIZE + 0]; - } - if (dst1 != NULL) { - u32 *out; - - out = (u32 *)dst1; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL128_VECTOR_SIZE + 1]; - } - if (dst2 != NULL) { - u32 *out; - - out = (u32 *)dst2; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL128_VECTOR_SIZE + 2]; - } - if (dst3 != NULL) { - u32 *out; - - out = (u32 *)dst3; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL128_VECTOR_SIZE + 3]; - } -} - -// Shabal routine optimized for plotting and hashing -void mshabal_hash_fast_avx(mshabal128_context_fast *sc, void *message, void *termination, - void *dst, unsigned num) { - _mm256_zeroupper(); - union input { - u32 words[16 * MSHABAL128_VECTOR_SIZE]; - __m128i data[16]; - }; - size_t j; - __m128i A[12], B[16], C[16]; - __m128i one; - - for (j = 0; j < 12; j++) A[j] = _mm_loadu_si128((__m128i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm_loadu_si128((__m128i *)sc->state + j + 12); - C[j] = _mm_loadu_si128((__m128i *)sc->state + j + 28); - } - one = _mm_set1_epi32(C32(0xFFFFFFFF)); - - // round 1 -#define M(i) _mm_load_si128((__m128i *)message + i) - - while (num-- > 0) { - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - -#define PP(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m128i tt; \ - tt = _mm_or_si128(_mm_slli_epi32(xa1, 15), _mm_srli_epi32(xa1, 17)); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 2), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(xa0, tt), xc); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 1), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(tt, xb1), _mm_xor_si128(_mm_andnot_si128(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm_or_si128(_mm_slli_epi32(tt, 1), _mm_srli_epi32(tt, 31)); \ - xb0 = _mm_xor_si128(tt, _mm_xor_si128(xa0, one)); \ - } while (0) - - PP(A[0x0], A[0xB], B[0x0], B[0xD], 
B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = 
_mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB(xb, xc, xm) \ - do { \ - __m128i tmp; \ - tmp = xb; \ - xb = _mm_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M(0xF)); - - // move data pointer - message = (__m128i *)message + 16; - - if (++sc->Wlow == 0) sc->Whigh++; - } - - // round 2-5 -#define M2(i) _mm_load_si128((__m128i *)termination + i) - - for (int k = 0; k < 4; k++) { - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M2(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], 
M2(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], 
C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - - SWAP_AND_SUB(B[0x0], C[0x0], M2(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M2(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M2(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M2(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M2(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M2(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M2(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M2(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M2(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M2(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M2(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M2(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M2(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M2(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M2(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M2(0xF)); - - if (++sc->Wlow == 0) sc->Whigh++; - - if (sc->Wlow-- == 0) sc->Whigh--; - } - - // download SIMD aligned hashes - for (j = 0; j < 8; j++) { - _mm_storeu_si128((__m128i *)dst + j, C[j + 8]); - } - - // reset Wlow & Whigh - sc->Wlow = 1; - sc->Whigh = 0; -} - -// Shabal routine optimized for mining -void mshabal_deadline_fast_avx(mshabal128_context_fast *sc, void *message, void *termination, void *dst0, - void *dst1, void *dst2, void *dst3) { - _mm256_zeroupper(); - union input { - u32 words[16 * MSHABAL128_VECTOR_SIZE]; - __m128i data[16]; - }; - size_t j; - __m128i A[12], B[16], C[16]; - __m128i one; - - for (j = 0; j < 12; j++) A[j] = _mm_loadu_si128((__m128i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm_loadu_si128((__m128i *)sc->state + j + 12); - C[j] = _mm_loadu_si128((__m128i *)sc->state + j + 28); - } - one = _mm_set1_epi32(C32(0xFFFFFFFF)); - - // round 1 -#define M(i) _mm_load_si128((__m128i *)message + i) - - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - -#define PP(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m128i tt; \ - tt = _mm_or_si128(_mm_slli_epi32(xa1, 15), _mm_srli_epi32(xa1, 17)); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 2), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(xa0, tt), xc); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 1), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(tt, xb1), _mm_xor_si128(_mm_andnot_si128(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm_or_si128(_mm_slli_epi32(tt, 1), _mm_srli_epi32(tt, 31)); \ - xb0 = _mm_xor_si128(tt, _mm_xor_si128(xa0, one)); \ - } while (0) - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - 
PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] 
= _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB(xb, xc, xm) \ - do { \ - __m128i tmp; \ - tmp = xb; \ - xb = _mm_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M(0xF)); - if (++sc->Wlow == 0) sc->Whigh++; - - // round 2-5 -#define M2(i) _mm_load_si128((__m128i *)termination + i) - - for (int k = 0; k < 4; k++) { - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M2(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x0], 
A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - - SWAP_AND_SUB(B[0x0], C[0x0], M2(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M2(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M2(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M2(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M2(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M2(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M2(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M2(0x7)); - 
SWAP_AND_SUB(B[0x8], C[0x8], M2(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M2(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M2(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M2(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M2(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M2(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M2(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M2(0xF)); - - if (++sc->Wlow == 0) sc->Whigh++; - - if (sc->Wlow-- == 0) sc->Whigh--; - } - - // download SIMD aligned deadlines - u32 simd_dst[8]; - _mm_storeu_si128((__m128i *)&simd_dst[0], C[8]); - _mm_storeu_si128((__m128i *)&simd_dst[4], C[9]); - - // unpack SIMD data - unsigned z; - for (z = 0; z < 2; z++) { - unsigned y = z * MSHABAL128_VECTOR_SIZE; - ((u32 *)dst0)[z] = simd_dst[y + 0]; - ((u32 *)dst1)[z] = simd_dst[y + 1]; - ((u32 *)dst2)[z] = simd_dst[y + 2]; - ((u32 *)dst3)[z] = simd_dst[y + 3]; - } - - // reset Wlow & Whigh - sc->Wlow = 1; - sc->Whigh = 0; -} - -#ifdef __cplusplus -} -#endif diff --git a/src/c/mshabal_128_avx.h b/src/c/mshabal_128_avx.h deleted file mode 100644 index 5a869db..0000000 --- a/src/c/mshabal_128_avx.h +++ /dev/null @@ -1,174 +0,0 @@ -/* - * A parallel implementation of Shabal, for platforms with AVX. - * - * This is the header file for an implementation of the Shabal family - * of hash functions, designed for maximum parallel speed. It processes - * up to four instances of Shabal in parallel, using the AVX unit. - * Total bandwidth appears to be up to twice that of a plain 32-bit - * Shabal implementation. - * - * A computation uses a mshabal_context structure. That structure is - * supposed to be allocated and released by the caller, e.g. as a - * local or global variable, or on the heap. The structure contents - * are initialized with mshabal_init(). Once the structure has been - * initialized, data is input as chunks, with the mshabal() functions. - * Chunks for the four parallel instances are provided simultaneously - * and must have the same length. It is allowed not to use some of the - * instances; the corresponding parameters in mshabal() are then NULL. - * However, using NULL as a chunk for one of the instances effectively - * deactivates that instance; this cannot be used to "skip" a chunk - * for one instance. - * - * The computation is finalized with mshabal_close(). Some extra message - * bits (0 to 7) can be input. The outputs of the four parallel instances - * are written in the provided buffers. There again, NULL can be - * provided as a parameter if the output of one of the instances is not - * needed. - * - * A mshabal_context instance is self-contained and holds no pointer. - * Thus, it can be cloned (e.g. with memcpy()) or moved (as long as - * proper alignment is maintained). This implementation uses no state - * variable beyond the context instance; thus, it is thread-safe and - * reentrant. - * - * The Shabal specification defines Shabal with output sizes of 192, - * 224, 256, 384 and 512 bits. This code accepts all those sizes, as - * well as any output size which is a multiple of 32, between 32 and - * 512 (inclusive). - * - * Parameters are not validated. Thus, undefined behaviour occurs if - * any of the "shall" or "must" clauses in this documentation is - * violated. - * - * - * (c) 2010 SAPHIR project. This software is provided 'as-is', without - * any express or implied warranty. In no event will the authors be held - * liable for any damages arising from the use of this software.
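As a quick orientation for the API contract documented above, here is a minimal, hypothetical usage sketch of the four-lane AVX variant declared in this header. The 64-byte inputs, fill values and 256-bit output size are illustrative choices, not taken from the original sources; the call sequence follows the documented init/update/close contract:

#include <stdio.h>
#include <string.h>
#include "mshabal_128_avx.h"

int main(void) {
    mshabal128_context sc;
    unsigned char in0[64], in1[64], in2[64], in3[64];     /* four equal-length messages */
    unsigned char out0[32], out1[32], out2[32], out3[32]; /* room for 256-bit digests */

    memset(in0, 0xA0, sizeof in0);  /* illustrative message contents */
    memset(in1, 0xA1, sizeof in1);
    memset(in2, 0xA2, sizeof in2);
    memset(in3, 0xA3, sizeof in3);

    mshabal_init_avx(&sc, 256);                        /* output size is given in bits */
    mshabal_avx(&sc, in0, in1, in2, in3, sizeof in0);  /* all four chunks share one length */
    mshabal_close_avx(&sc, 0, 0, 0, 0, 0,              /* no extra message bits */
                      out0, out1, out2, out3);         /* one digest per instance */

    for (size_t i = 0; i < sizeof out0; i++) printf("%02x", out0[i]);
    printf("\n");
    return 0;
}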
 - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to no restriction. - * - * Technical remarks and questions can be addressed to: - * - */ - -#ifndef MSHABAL_H__ -#define MSHABAL_H__ - -#include <stddef.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * We need an integer type with width 32-bit or more (preferably, with - * a width of exactly 32 bits). - */ -#if defined __STDC__ && __STDC_VERSION__ >= 199901L -#include <stdint.h> -#ifdef UINT32_MAX -typedef uint32_t mshabal_u32; -#else -typedef uint_fast32_t mshabal_u32; -#endif -#else -#if ((UINT_MAX >> 11) >> 11) >= 0x3FF -typedef unsigned int mshabal_u32; -#else -typedef unsigned long mshabal_u32; -#endif -#endif - -#define MSHABAL128_VECTOR_SIZE 4 - -/* - * The context structure for a Shabal computation. Contents are - * private. Such a structure should be allocated and released by - * the caller, in any memory area. - */ -typedef struct { - unsigned char buf0[64]; - unsigned char buf1[64]; - unsigned char buf2[64]; - unsigned char buf3[64]; - size_t ptr; - mshabal_u32 state[(12 + 16 + 16) * MSHABAL128_VECTOR_SIZE]; - mshabal_u32 Whigh, Wlow; - unsigned out_size; -} mshabal128_context; - -#pragma pack(1) -typedef struct { - mshabal_u32 state[(12 + 16 + 16) * MSHABAL128_VECTOR_SIZE]; - mshabal_u32 Whigh, Wlow; - unsigned out_size; -} mshabal128_context_fast; -#pragma pack() - -/* - * Initialize a context structure. The output size must be a multiple - * of 32, between 32 and 512 (inclusive). The output size is expressed - * in bits. - */ -void mshabal_init_avx(mshabal128_context *sc, unsigned out_size); - -/* - * Process some more data bytes; four chunks of data, pointed to by - * data0, data1, data2 and data3, are processed. The four chunks have - * the same length of "len" bytes. For efficiency, it is best if data is - * processed by medium-sized chunks, e.g. a few kilobytes at a time. - * - * The "len" data bytes shall all be accessible. If "len" is zero, this - * function does nothing and ignores the data* arguments. - * Otherwise, if one of the data* arguments is NULL, then the - * corresponding instance is deactivated (the final value obtained from - * that instance is undefined). - */ -void mshabal_avx(mshabal128_context *sc, const void *data0, const void *data1, const void *data2, - const void *data3, size_t len); - -/* - * Terminate the Shabal computation incarnated by the provided context - * structure. "n" shall be a value between 0 and 7 (inclusive): this is - * the number of extra bits to extract from ub0, ub1, ub2 and ub3, and - * append at the end of the input message for each of the four parallel - * instances. Bits in "ub*" are taken in big-endian format: first bit is - * the one of numerical value 128, second bit has numerical value 64, - * and so on. Other bits in "ub*" are ignored. For most applications, - * input messages will consist of a sequence of bytes, and the "ub*" and - * "n" parameters will be zero. - * - * The Shabal output for each of the parallel instances is written out - * in the areas pointed to by, respectively, dst0, dst1, dst2 and dst3. - * These areas shall be wide enough to accommodate the result (result - * size was specified as a parameter to mshabal_init()). It is acceptable - * to use NULL for any of those pointers, if the result from the - * corresponding instance is not needed. - * - * After this call, the context structure is invalid. The caller shall - * release it, or reinitialize it with mshabal_init(). The mshabal_close() - * function does NOT imply a hidden call to mshabal_init(). - */ -void mshabal_close_avx(mshabal128_context *sc, unsigned ub0, unsigned ub1, unsigned ub2, - unsigned ub3, unsigned n, void *dst0, void *dst1, void *dst2, - void *dst3); - -/* - * optimised Shabal routine for PoC mining - */ -void mshabal_deadline_fast_avx(mshabal128_context_fast *sc, void *message, void *termination, void *dst0, - void *dst1, void *dst2, void *dst3); - -/* - * optimised Shabal routine for PoC plotting and hashing - */ -void mshabal_hash_fast_avx(mshabal128_context_fast *sc, void *message, void *termination, - void *dst, unsigned num); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/src/c/mshabal_128_neon.c b/src/c/mshabal_128_neon.c deleted file mode 100644 index 62ea371..0000000 --- a/src/c/mshabal_128_neon.c +++ /dev/null @@ -1,963 +0,0 @@ -/* - * Parallel implementation of Shabal, using the NEON unit. This code - * compiles and runs on ARM architectures, in 32-bit or 64-bit mode, - * which possess a NEON-compatible SIMD unit (the SSE2 intrinsics used - * below are mapped onto NEON by the SSE2NEON.h translation header). - * - * - * (c) 2010 SAPHIR project. This software is provided 'as-is', without - * any express or implied warranty. In no event will the authors be held - * liable for any damages arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to no restriction. - * - * Technical remarks and questions can be addressed to: - * - */ - -#include "SSE2NEON.h" -#include <stddef.h> -#include <string.h> -#include "mshabal_128_neon.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef _MSC_VER -#pragma warning(disable : 4146) -#endif - -typedef mshabal_u32 u32; - -#define C32(x) ((u32)x##UL) -#define T32(x) ((x)&C32(0xFFFFFFFF)) -#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n)))) - -static void mshabal_compress_neon(mshabal128_context *sc, const unsigned char *buf0, - const unsigned char *buf1, const unsigned char *buf2, - const unsigned char *buf3, size_t num) { - union { - u32 words[16 * MSHABAL128_VECTOR_SIZE]; - __m128i data[16]; - } u; - size_t j; - __m128i A[12], B[16], C[16]; - __m128i one; - - for (j = 0; j < 12; j++) A[j] = _mm_loadu_si128((__m128i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm_loadu_si128((__m128i *)sc->state + j + 12); - C[j] = _mm_loadu_si128((__m128i *)sc->state + j + 28); - } - one = _mm_set1_epi32(C32(0xFFFFFFFF)); - -#define M(i) _mm_load_si128(u.data + i) - - while (num-- > 0) { - for (j = 0; j < 16 * MSHABAL128_VECTOR_SIZE; j += MSHABAL128_VECTOR_SIZE) { - u.words[j + 0] = *(u32 *)(buf0 + j); - u.words[j + 1] = *(u32 *)(buf1 + j); - u.words[j + 2] = *(u32 *)(buf2 + j); - u.words[j + 3] = *(u32 *)(buf3 + j); - } - - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - -#define PP(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m128i tt; \ - tt = _mm_or_si128(_mm_slli_epi32(xa1, 15), _mm_srli_epi32(xa1, 17)); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 2), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(xa0, tt), xc); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 1), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(tt, xb1), _mm_xor_si128(_mm_andnot_si128(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \
- tt = _mm_or_si128(_mm_slli_epi32(tt, 1), _mm_srli_epi32(tt, 31)); \ - xb0 = _mm_xor_si128(tt, _mm_xor_si128(xa0, one)); \ - } while (0) - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - 
A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB(xb, xc, xm) \ - do { \ - __m128i tmp; \ - tmp = xb; \ - xb = _mm_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M(0xF)); - - buf0 += 64; - buf1 += 64; - buf2 += 64; - buf3 += 64; - if (++sc->Wlow == 0) sc->Whigh++; - } - - for (j = 0; j < 12; j++) _mm_storeu_si128((__m128i *)sc->state + j, A[j]); - for (j = 0; j < 16; j++) { - _mm_storeu_si128((__m128i *)sc->state + j + 12, B[j]); - _mm_storeu_si128((__m128i *)sc->state + j + 28, C[j]); - } -#undef M -} - -void mshabal_init_neon(mshabal128_context *sc, unsigned out_size) { - unsigned u; - - memset(sc->state, 0, sizeof sc->state); - memset(sc->buf0, 0, sizeof sc->buf0); - memset(sc->buf1, 0, sizeof sc->buf1); - memset(sc->buf2, 0, sizeof sc->buf2); - memset(sc->buf3, 0, sizeof sc->buf3); - for (u = 0; u < 16; u++) { - sc->buf0[4 * u + 0] = (out_size + u); - sc->buf0[4 * u + 1] = (out_size + u) >> 8; - sc->buf1[4 * u + 0] = (out_size + u); - sc->buf1[4 * u + 1] = (out_size + u) >> 8; - sc->buf2[4 * u + 0] = (out_size + u); - sc->buf2[4 * u + 1] = (out_size + u) >> 8; - sc->buf3[4 * u + 0] = (out_size + u); - sc->buf3[4 * u + 1] = (out_size + u) >> 8; - } - sc->Whigh = sc->Wlow = C32(0xFFFFFFFF); - mshabal_compress_neon(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, 1); - for (u = 0; u < 16; u++) { - sc->buf0[4 * u + 0] = (out_size + u + 16); - sc->buf0[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf1[4 * u + 0] = (out_size + u + 16); - sc->buf1[4 * u + 1] = 
(out_size + u + 16) >> 8; - sc->buf2[4 * u + 0] = (out_size + u + 16); - sc->buf2[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf3[4 * u + 0] = (out_size + u + 16); - sc->buf3[4 * u + 1] = (out_size + u + 16) >> 8; - } - mshabal_compress_neon(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, 1); - sc->ptr = 0; - sc->out_size = out_size; -} - -void mshabal_neon(mshabal128_context *sc, const void *data0, const void *data1, const void *data2, - const void *data3, size_t len) { - size_t ptr, num; - - if (data0 == NULL) { - if (data1 == NULL) { - if (data2 == NULL) { - if (data3 == NULL) { - return; - } else { - data0 = data3; - } - } else { - data0 = data2; - } - } else { - data0 = data1; - } - } - - if (data1 == NULL) data1 = data0; - if (data2 == NULL) data2 = data0; - if (data3 == NULL) data3 = data0; - - ptr = sc->ptr; - if (ptr != 0) { - size_t clen = (sizeof sc->buf0 - ptr); - if (clen > len) { - memcpy(sc->buf0 + ptr, data0, len); - memcpy(sc->buf1 + ptr, data1, len); - memcpy(sc->buf2 + ptr, data2, len); - memcpy(sc->buf3 + ptr, data3, len); - sc->ptr = ptr + len; - return; - } else { - memcpy(sc->buf0 + ptr, data0, clen); - memcpy(sc->buf1 + ptr, data1, clen); - memcpy(sc->buf2 + ptr, data2, clen); - memcpy(sc->buf3 + ptr, data3, clen); - mshabal_compress_neon(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, 1); - data0 = (const unsigned char *)data0 + clen; - data1 = (const unsigned char *)data1 + clen; - data2 = (const unsigned char *)data2 + clen; - data3 = (const unsigned char *)data3 + clen; - len -= clen; - } - } - - num = len >> 6; - if (num != 0) { - mshabal_compress_neon(sc,data0, data1, data2, data3, num); - data0 = (const unsigned char *)data0 + (num << 6); - data1 = (const unsigned char *)data1 + (num << 6); - data2 = (const unsigned char *)data2 + (num << 6); - data3 = (const unsigned char *)data3 + (num << 6); - } - len &= 63; - memcpy(sc->buf0, data0, len); - memcpy(sc->buf1, data1, len); - memcpy(sc->buf2, data2, len); - memcpy(sc->buf3, data3, len); - sc->ptr = len; -} - -void mshabal_close_neon(mshabal128_context *sc, unsigned ub0, unsigned ub1, unsigned ub2, unsigned ub3, - unsigned n, void *dst0, void *dst1, void *dst2, void *dst3) { - size_t ptr, off; - unsigned z, out_size_w32; - - z = 0x80 >> n; - ptr = sc->ptr; - sc->buf0[ptr] = (ub0 & -z) | z; - sc->buf1[ptr] = (ub1 & -z) | z; - sc->buf2[ptr] = (ub2 & -z) | z; - sc->buf3[ptr] = (ub3 & -z) | z; - ptr++; - memset(sc->buf0 + ptr, 0, (sizeof sc->buf0) - ptr); - memset(sc->buf1 + ptr, 0, (sizeof sc->buf1) - ptr); - memset(sc->buf2 + ptr, 0, (sizeof sc->buf2) - ptr); - memset(sc->buf3 + ptr, 0, (sizeof sc->buf3) - ptr); - for (z = 0; z < 4; z++) { - mshabal_compress_neon(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, 1); - if (sc->Wlow-- == 0) sc->Whigh--; - } - out_size_w32 = sc->out_size >> 5; - off = MSHABAL128_VECTOR_SIZE * (28 + (16 - out_size_w32)); - if (dst0 != NULL) { - u32 *out; - - out = (u32 *)dst0; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL128_VECTOR_SIZE + 0]; - } - if (dst1 != NULL) { - u32 *out; - - out = (u32 *)dst1; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL128_VECTOR_SIZE + 1]; - } - if (dst2 != NULL) { - u32 *out; - - out = (u32 *)dst2; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL128_VECTOR_SIZE + 2]; - } - if (dst3 != NULL) { - u32 *out; - - out = (u32 *)dst3; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL128_VECTOR_SIZE + 3]; - } -} - -// Shabal routine optimized for plotting and 
hashing -void mshabal_hash_fast_neon(mshabal128_context_fast *sc, void *message, void *termination, - void *dst, unsigned num) { - union input { - u32 words[16 * MSHABAL128_VECTOR_SIZE]; - __m128i data[16]; - }; - size_t j; - __m128i A[12], B[16], C[16]; - __m128i one; - - for (j = 0; j < 12; j++) A[j] = _mm_loadu_si128((__m128i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm_loadu_si128((__m128i *)sc->state + j + 12); - C[j] = _mm_loadu_si128((__m128i *)sc->state + j + 28); - } - one = _mm_set1_epi32(C32(0xFFFFFFFF)); - - // round 1 -#define M(i) _mm_load_si128((__m128i *)message + i) - - while (num-- > 0) { - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - -#define PP(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m128i tt; \ - tt = _mm_or_si128(_mm_slli_epi32(xa1, 15), _mm_srli_epi32(xa1, 17)); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 2), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(xa0, tt), xc); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 1), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(tt, xb1), _mm_xor_si128(_mm_andnot_si128(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm_or_si128(_mm_slli_epi32(tt, 1), _mm_srli_epi32(tt, 31)); \ - xb0 = _mm_xor_si128(tt, _mm_xor_si128(xa0, one)); \ - } while (0) - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], 
B[0x3], C[0xB], M(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB(xb, xc, xm) \ - do { \ - __m128i tmp; \ - tmp = xb; \ - xb = _mm_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M(0xE)); - 
SWAP_AND_SUB(B[0xF], C[0xF], M(0xF)); - - // move data pointer - message = (__m128i *)message + 16; - - if (++sc->Wlow == 0) sc->Whigh++; - } - - // round 2-5 -#define M2(i) _mm_load_si128((__m128i *)termination + i) - - for (int k = 0; k < 4; k++) { - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M2(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0x6], A[0x5], B[0xA], 
B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - - SWAP_AND_SUB(B[0x0], C[0x0], M2(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M2(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M2(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M2(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M2(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M2(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M2(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M2(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M2(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M2(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M2(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M2(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M2(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M2(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M2(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M2(0xF)); - - if (++sc->Wlow == 0) sc->Whigh++; - - if (sc->Wlow-- == 0) sc->Whigh--; - } - - // download SIMD aligned hashes - for (j = 0; j < 8; j++) { - _mm_storeu_si128((__m128i *)dst + j, C[j + 8]); - } - - // reset Wlow & Whigh - sc->Wlow = 1; - sc->Whigh = 0; -} - -// Shabal routine optimized for mining -void mshabal_deadline_fast_neon(mshabal128_context_fast *sc, void *message, void *termination, void *dst0, - void *dst1, void *dst2, void *dst3) { - union input { - u32 words[16 * MSHABAL128_VECTOR_SIZE]; - __m128i data[16]; - }; - size_t j; - __m128i A[12], B[16], C[16]; - __m128i one; - - for (j = 0; j < 12; j++) A[j] = _mm_loadu_si128((__m128i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm_loadu_si128((__m128i *)sc->state + j + 12); - C[j] = _mm_loadu_si128((__m128i *)sc->state + j + 28); - } - one = _mm_set1_epi32(C32(0xFFFFFFFF)); - - // round 1 -#define M(i) _mm_load_si128((__m128i *)message + i) - - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], 
M(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - -#define PP(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m128i tt; \ - tt = _mm_or_si128(_mm_slli_epi32(xa1, 15), _mm_srli_epi32(xa1, 17)); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 2), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(xa0, tt), xc); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 1), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(tt, xb1), _mm_xor_si128(_mm_andnot_si128(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm_or_si128(_mm_slli_epi32(tt, 1), _mm_srli_epi32(tt, 31)); \ - xb0 = _mm_xor_si128(tt, _mm_xor_si128(xa0, one)); \ - } while (0) - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x3], A[0x2], 
B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB(xb, xc, xm) \ - do { \ - __m128i tmp; \ - tmp = xb; \ - xb = _mm_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M(0xF)); - if (++sc->Wlow == 0) sc->Whigh++; - - // round 2-5 -#define M2(i) _mm_load_si128((__m128i *)termination + i) - - for (int k = 0; k < 4; k++) { - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M2(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - 
PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = 
_mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - - SWAP_AND_SUB(B[0x0], C[0x0], M2(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M2(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M2(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M2(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M2(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M2(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M2(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M2(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M2(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M2(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M2(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M2(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M2(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M2(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M2(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M2(0xF)); - - if (++sc->Wlow == 0) sc->Whigh++; - - if (sc->Wlow-- == 0) sc->Whigh--; - } - - // download SIMD aligned deadlines - u32 simd_dst[8]; - _mm_storeu_si128((__m128i *)&simd_dst[0], C[8]); - _mm_storeu_si128((__m128i *)&simd_dst[4], C[9]); - - // unpack SIMD data - unsigned z; - for (z = 0; z < 2; z++) { - unsigned y = z * MSHABAL128_VECTOR_SIZE; - ((u32 *)dst0)[z] = simd_dst[y + 0]; - ((u32 *)dst1)[z] = simd_dst[y + 1]; - ((u32 *)dst2)[z] = simd_dst[y + 2]; - ((u32 *)dst3)[z] = simd_dst[y + 3]; - } - - // reset Wlow & Whigh - sc->Wlow = 1; - sc->Whigh = 0; -} - -#ifdef __cplusplus -} -#endif diff --git a/src/c/mshabal_128_neon.h b/src/c/mshabal_128_neon.h deleted file mode 100644 index d230981..0000000 --- a/src/c/mshabal_128_neon.h +++ /dev/null @@ -1,174 +0,0 @@ -/* - * A parallel implementation of Shabal, for platforms with NEON. - * - * This is the header file for an implementation of the Shabal family - * of hash functions, designed for maximum parallel speed. It processes - * up to four instances of Shabal in parallel, using the NEON unit. - * Total bandwidth appears to be up to twice that of a plain 32-bit - * Shabal implementation. - * - * A computation uses a mshabal_context structure. That structure is - * supposed to be allocated and released by the caller, e.g. as a - * local or global variable, or on the heap. The structure contents - * are initialized with mshabal_init(). Once the structure has been - * initialized, data is input as chunks, with the mshabal() functions. - * Chunks for the four parallel instances are provided simultaneously - * and must have the same length. It is allowed not to use some of the - * instances; the corresponding parameters in mshabal() are then NULL. - * However, using NULL as a chunk for one of the instances effectively - * deactivates that instance; this cannot be used to "skip" a chunk - * for one instance. - * - * The computation is finalized with mshabal_close(). Some extra message - * bits (0 to 7) can be input. The outputs of the four parallel instances - * are written in the provided buffers. There again, NULL can be - * provided as a parameter if the output of one of the instances is not - * needed. - * - * A mshabal_context instance is self-contained and holds no pointer. - * Thus, it can be cloned (e.g. with memcpy()) or moved (as long as - * proper alignment is maintained). This implementation uses no state - * variable beyond the context instance; thus, it is thread-safe and - * reentrant. - * - * The Shabal specification defines Shabal with output sizes of 192, - * 224, 256, 384 and 512 bits. This code accepts all those sizes, as - * well as any output size which is a multiple of 32, between 32 and - * 512 (inclusive). - * - * Parameters are not validated. Thus, undefined behaviour occurs if - * any of the "shall" or "must" clauses in this documentation is - * violated. - * - * - * (c) 2010 SAPHIR project. This software is provided 'as-is', without - * any express or implied warranty. In no event will the authors be held - * liable for any damages arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to no restriction. - * - * Technical remarks and questions can be addressed to: - * - */ - -#ifndef MSHABAL_H__ -#define MSHABAL_H__ - -#include <stddef.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * We need an integer type with width 32-bit or more (preferably, with - * a width of exactly 32 bits). - */ -#if defined __STDC__ && __STDC_VERSION__ >= 199901L -#include <stdint.h> -#ifdef UINT32_MAX -typedef uint32_t mshabal_u32; -#else -typedef uint_fast32_t mshabal_u32; -#endif -#else -#if ((UINT_MAX >> 11) >> 11) >= 0x3FF -typedef unsigned int mshabal_u32; -#else -typedef unsigned long mshabal_u32; -#endif -#endif - -#define MSHABAL128_VECTOR_SIZE 4 - -/* - * The context structure for a Shabal computation. Contents are - * private. Such a structure should be allocated and released by - * the caller, in any memory area. - */ -typedef struct { - unsigned char buf0[64]; - unsigned char buf1[64]; - unsigned char buf2[64]; - unsigned char buf3[64]; - size_t ptr; - mshabal_u32 state[(12 + 16 + 16) * MSHABAL128_VECTOR_SIZE]; - mshabal_u32 Whigh, Wlow; - unsigned out_size; -} mshabal128_context; - -#pragma pack(1) -typedef struct { - mshabal_u32 state[(12 + 16 + 16) * MSHABAL128_VECTOR_SIZE]; - mshabal_u32 Whigh, Wlow; - unsigned out_size; -} mshabal128_context_fast; -#pragma pack() - -/* - * Initialize a context structure. The output size must be a multiple - * of 32, between 32 and 512 (inclusive). The output size is expressed - * in bits. - */ -void mshabal_init_neon(mshabal128_context *sc, unsigned out_size); - -/* - * Process some more data bytes; four chunks of data, pointed to by - * data0, data1, data2 and data3, are processed. The four chunks have - * the same length of "len" bytes. For efficiency, it is best if data is - * processed by medium-sized chunks, e.g. a few kilobytes at a time. - * - * The "len" data bytes shall all be accessible. If "len" is zero, this - * function does nothing and ignores the data* arguments.
- * Otherwise, if one of the data* arguments is NULL, then the - * corresponding instance is deactivated (the final value obtained from - * that instance is undefined). - */ -void mshabal_neon(mshabal128_context *sc, const void *data0, const void *data1, const void *data2, - const void *data3, size_t len); - -/* - * Terminate the Shabal computation incarnated by the provided context - * structure. "n" shall be a value between 0 and 7 (inclusive): this is - * the number of extra bits to extract from ub0, ub1, ub2 and ub3, and - * append at the end of the input message for each of the four parallel - * instances. Bits in "ub*" are taken in big-endian format: first bit is - * the one of numerical value 128, second bit has numerical value 64, - * and so on. Other bits in "ub*" are ignored. For most applications, - * input messages will consist of sequences of bytes, and the "ub*" and - * "n" parameters will be zero. - * - * The Shabal output for each of the parallel instances is written out - * in the areas pointed to by, respectively, dst0, dst1, dst2 and dst3. - * These areas shall be wide enough to accommodate the result (result - * size was specified as parameter to mshabal_init()). It is acceptable - * to use NULL for any of those pointers, if the result from the - * corresponding instance is not needed. - * - * After this call, the context structure is invalid. The caller shall - * release it, or reinitialize it with mshabal_init(). The mshabal_close() - * function does NOT imply a hidden call to mshabal_init(). - */ -void mshabal_close_neon(mshabal128_context *sc, unsigned ub0, unsigned ub1, unsigned ub2, - unsigned ub3, unsigned n, void *dst0, void *dst1, void *dst2, - void *dst3); - -/* - * optimised Shabal routine for PoC mining - */ -void mshabal_deadline_fast_neon(mshabal128_context_fast *sc, void *message, void *termination, void *dst0, - void *dst1, void *dst2, void *dst3); - -/* - * optimised Shabal routine for PoC plotting and hashing - */ -void mshabal_hash_fast_neon(mshabal128_context_fast *sc, void *message, void *termination, - void *dst, unsigned num); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/src/c/mshabal_128_sse2.c b/src/c/mshabal_128_sse2.c deleted file mode 100644 index e147a75..0000000 --- a/src/c/mshabal_128_sse2.c +++ /dev/null @@ -1,963 +0,0 @@ -/* - * Parallel implementation of Shabal, using the SSE2 unit. This code - * compiles and runs on x86 architectures, in 32-bit or 64-bit mode, - * which possess an SSE2-compatible SIMD unit. - * - * - * (c) 2010 SAPHIR project. This software is provided 'as-is', without - * any express or implied warranty. In no event will the authors be held - * liable for any damages arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to no restriction.
- * - * Technical remarks and questions can be addressed to: - * - */ - -#include <stddef.h> -#include <string.h> -#include <emmintrin.h> -#include "mshabal_128_sse2.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef _MSC_VER -#pragma warning(disable : 4146) -#endif - -typedef mshabal_u32 u32; - -#define C32(x) ((u32)x##UL) -#define T32(x) ((x)&C32(0xFFFFFFFF)) -#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n)))) - -static void mshabal_compress_sse2(mshabal128_context *sc, const unsigned char *buf0, - const unsigned char *buf1, const unsigned char *buf2, - const unsigned char *buf3, size_t num) { - union { - u32 words[16 * MSHABAL128_VECTOR_SIZE]; - __m128i data[16]; - } u; - size_t j; - __m128i A[12], B[16], C[16]; - __m128i one; - - for (j = 0; j < 12; j++) A[j] = _mm_loadu_si128((__m128i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm_loadu_si128((__m128i *)sc->state + j + 12); - C[j] = _mm_loadu_si128((__m128i *)sc->state + j + 28); - } - one = _mm_set1_epi32(C32(0xFFFFFFFF)); - -#define M(i) _mm_load_si128(u.data + i) - - while (num-- > 0) { - for (j = 0; j < 16 * MSHABAL128_VECTOR_SIZE; j += MSHABAL128_VECTOR_SIZE) { - u.words[j + 0] = *(u32 *)(buf0 + j); - u.words[j + 1] = *(u32 *)(buf1 + j); - u.words[j + 2] = *(u32 *)(buf2 + j); - u.words[j + 3] = *(u32 *)(buf3 + j); - } - - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - -#define PP(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m128i tt; \ - tt = _mm_or_si128(_mm_slli_epi32(xa1, 15), _mm_srli_epi32(xa1, 17)); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 2), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(xa0, tt), xc); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 1), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(tt, xb1), _mm_xor_si128(_mm_andnot_si128(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm_or_si128(_mm_slli_epi32(tt, 1), _mm_srli_epi32(tt, 31)); \ - xb0 = _mm_xor_si128(tt, _mm_xor_si128(xa0, one)); \ - } while (0) - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); -
PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB(xb, xc, xm) \ - do { \ - __m128i tmp; \ - tmp = xb; \ - 
xb = _mm_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M(0xF)); - - buf0 += 64; - buf1 += 64; - buf2 += 64; - buf3 += 64; - if (++sc->Wlow == 0) sc->Whigh++; - } - - for (j = 0; j < 12; j++) _mm_storeu_si128((__m128i *)sc->state + j, A[j]); - for (j = 0; j < 16; j++) { - _mm_storeu_si128((__m128i *)sc->state + j + 12, B[j]); - _mm_storeu_si128((__m128i *)sc->state + j + 28, C[j]); - } -#undef M -} - -void mshabal_init_sse2(mshabal128_context *sc, unsigned out_size) { - unsigned u; - - memset(sc->state, 0, sizeof sc->state); - memset(sc->buf0, 0, sizeof sc->buf0); - memset(sc->buf1, 0, sizeof sc->buf1); - memset(sc->buf2, 0, sizeof sc->buf2); - memset(sc->buf3, 0, sizeof sc->buf3); - for (u = 0; u < 16; u++) { - sc->buf0[4 * u + 0] = (out_size + u); - sc->buf0[4 * u + 1] = (out_size + u) >> 8; - sc->buf1[4 * u + 0] = (out_size + u); - sc->buf1[4 * u + 1] = (out_size + u) >> 8; - sc->buf2[4 * u + 0] = (out_size + u); - sc->buf2[4 * u + 1] = (out_size + u) >> 8; - sc->buf3[4 * u + 0] = (out_size + u); - sc->buf3[4 * u + 1] = (out_size + u) >> 8; - } - sc->Whigh = sc->Wlow = C32(0xFFFFFFFF); - mshabal_compress_sse2(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, 1); - for (u = 0; u < 16; u++) { - sc->buf0[4 * u + 0] = (out_size + u + 16); - sc->buf0[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf1[4 * u + 0] = (out_size + u + 16); - sc->buf1[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf2[4 * u + 0] = (out_size + u + 16); - sc->buf2[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf3[4 * u + 0] = (out_size + u + 16); - sc->buf3[4 * u + 1] = (out_size + u + 16) >> 8; - } - mshabal_compress_sse2(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, 1); - sc->ptr = 0; - sc->out_size = out_size; -} - -void mshabal_sse2(mshabal128_context *sc, const void *data0, const void *data1, const void *data2, - const void *data3, size_t len) { - size_t ptr, num; - - if (data0 == NULL) { - if (data1 == NULL) { - if (data2 == NULL) { - if (data3 == NULL) { - return; - } else { - data0 = data3; - } - } else { - data0 = data2; - } - } else { - data0 = data1; - } - } - - if (data1 == NULL) data1 = data0; - if (data2 == NULL) data2 = data0; - if (data3 == NULL) data3 = data0; - - ptr = sc->ptr; - if (ptr != 0) { - size_t clen = (sizeof sc->buf0 - ptr); - if (clen > len) { - memcpy(sc->buf0 + ptr, data0, len); - memcpy(sc->buf1 + ptr, data1, len); - memcpy(sc->buf2 + ptr, data2, len); - memcpy(sc->buf3 + ptr, data3, len); - sc->ptr = ptr + len; - return; - } else { - memcpy(sc->buf0 + ptr, data0, clen); - memcpy(sc->buf1 + ptr, data1, clen); - memcpy(sc->buf2 + ptr, data2, clen); - memcpy(sc->buf3 + ptr, data3, clen); - mshabal_compress_sse2(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, 1); - data0 = (const unsigned char *)data0 + clen; - data1 = (const unsigned char *)data1 + clen; - data2 = (const unsigned char *)data2 + clen; - data3 = (const unsigned char *)data3 + clen; - len -= clen; - } - } - - num 
= len >> 6; - if (num != 0) { - mshabal_compress_sse2(sc, data0, data1, data2, data3, num); - data0 = (const unsigned char *)data0 + (num << 6); - data1 = (const unsigned char *)data1 + (num << 6); - data2 = (const unsigned char *)data2 + (num << 6); - data3 = (const unsigned char *)data3 + (num << 6); - } - len &= 63; - memcpy(sc->buf0, data0, len); - memcpy(sc->buf1, data1, len); - memcpy(sc->buf2, data2, len); - memcpy(sc->buf3, data3, len); - sc->ptr = len; -} - -void mshabal_close_sse2(mshabal128_context *sc, unsigned ub0, unsigned ub1, unsigned ub2, unsigned ub3, - unsigned n, void *dst0, void *dst1, void *dst2, void *dst3) { - size_t ptr, off; - unsigned z, out_size_w32; - - z = 0x80 >> n; - ptr = sc->ptr; - sc->buf0[ptr] = (ub0 & -z) | z; - sc->buf1[ptr] = (ub1 & -z) | z; - sc->buf2[ptr] = (ub2 & -z) | z; - sc->buf3[ptr] = (ub3 & -z) | z; - ptr++; - memset(sc->buf0 + ptr, 0, (sizeof sc->buf0) - ptr); - memset(sc->buf1 + ptr, 0, (sizeof sc->buf1) - ptr); - memset(sc->buf2 + ptr, 0, (sizeof sc->buf2) - ptr); - memset(sc->buf3 + ptr, 0, (sizeof sc->buf3) - ptr); - for (z = 0; z < 4; z++) { - mshabal_compress_sse2(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, 1); - if (sc->Wlow-- == 0) sc->Whigh--; - } - out_size_w32 = sc->out_size >> 5; - off = MSHABAL128_VECTOR_SIZE * (28 + (16 - out_size_w32)); - if (dst0 != NULL) { - u32 *out; - - out = (u32 *)dst0; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL128_VECTOR_SIZE + 0]; - } - if (dst1 != NULL) { - u32 *out; - - out = (u32 *)dst1; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL128_VECTOR_SIZE + 1]; - } - if (dst2 != NULL) { - u32 *out; - - out = (u32 *)dst2; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL128_VECTOR_SIZE + 2]; - } - if (dst3 != NULL) { - u32 *out; - - out = (u32 *)dst3; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL128_VECTOR_SIZE + 3]; - } -} - -// Shabal routine optimized for plotting and hashing -void mshabal_hash_fast_sse2(mshabal128_context_fast *sc, void *message, void *termination, - void *dst, unsigned num) { - union input { - u32 words[16 * MSHABAL128_VECTOR_SIZE]; - __m128i data[16]; - }; - size_t j; - __m128i A[12], B[16], C[16]; - __m128i one; - - for (j = 0; j < 12; j++) A[j] = _mm_loadu_si128((__m128i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm_loadu_si128((__m128i *)sc->state + j + 12); - C[j] = _mm_loadu_si128((__m128i *)sc->state + j + 28); - } - one = _mm_set1_epi32(C32(0xFFFFFFFF)); - - // round 1 -#define M(i) _mm_load_si128((__m128i *)message + i) - - while (num-- > 0) { - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - -#define PP(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m128i tt; \ - tt = _mm_or_si128(_mm_slli_epi32(xa1, 15), _mm_srli_epi32(xa1, 17)); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 2), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(xa0, tt), xc); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 1), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(tt, xb1), _mm_xor_si128(_mm_andnot_si128(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm_or_si128(_mm_slli_epi32(tt, 1), _mm_srli_epi32(tt, 31)); \ - xb0 = _mm_xor_si128(tt, _mm_xor_si128(xa0, one)); \ - } while (0) - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - 
PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] 
= _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB(xb, xc, xm) \ - do { \ - __m128i tmp; \ - tmp = xb; \ - xb = _mm_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M(0xF)); - - // move data pointer - message = (__m128i *)message + 16; - - if (++sc->Wlow == 0) sc->Whigh++; - } - - // round 2-5 -#define M2(i) _mm_load_si128((__m128i *)termination + i) - - for (int k = 0; k < 4; k++) { - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M2(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0x2], A[0x1], B[0xE], 
B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = 
_mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - - SWAP_AND_SUB(B[0x0], C[0x0], M2(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M2(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M2(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M2(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M2(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M2(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M2(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M2(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M2(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M2(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M2(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M2(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M2(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M2(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M2(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M2(0xF)); - - if (++sc->Wlow == 0) sc->Whigh++; - - if (sc->Wlow-- == 0) sc->Whigh--; - } - - // download SIMD aligned hashes - for (j = 0; j < 8; j++) { - _mm_storeu_si128((__m128i *)dst + j, C[j + 8]); - } - - // reset Wlow & Whigh - sc->Wlow = 1; - sc->Whigh = 0; -} - -// Shabal routine optimized for mining -void mshabal_deadline_fast_sse2(mshabal128_context_fast *sc, void *message, void *termination, void *dst0, - void *dst1, void *dst2, void *dst3) { - union input { - u32 words[16 * MSHABAL128_VECTOR_SIZE]; - __m128i data[16]; - }; - size_t j; - __m128i A[12], B[16], C[16]; - __m128i one; - - for (j = 0; j < 12; j++) A[j] = _mm_loadu_si128((__m128i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm_loadu_si128((__m128i *)sc->state + j + 12); - C[j] = _mm_loadu_si128((__m128i *)sc->state + j + 28); - } - one = _mm_set1_epi32(C32(0xFFFFFFFF)); - - // round 1 -#define M(i) _mm_load_si128((__m128i *)message + i) - - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - -#define PP(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m128i tt; \ - tt = _mm_or_si128(_mm_slli_epi32(xa1, 15), _mm_srli_epi32(xa1, 17)); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 2), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(xa0, tt), xc); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 1), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(tt, xb1), _mm_xor_si128(_mm_andnot_si128(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm_or_si128(_mm_slli_epi32(tt, 1), _mm_srli_epi32(tt, 31)); \ - xb0 = _mm_xor_si128(tt, _mm_xor_si128(xa0, one)); \ - } while (0) - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], 
C[0xE], M(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = 
_mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB(xb, xc, xm) \ - do { \ - __m128i tmp; \ - tmp = xb; \ - xb = _mm_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M(0xF)); - if (++sc->Wlow == 0) sc->Whigh++; - - // round 2-5 -#define M2(i) _mm_load_si128((__m128i *)termination + i) - - for (int k = 0; k < 4; k++) { - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M2(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], 
M2(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - - SWAP_AND_SUB(B[0x0], C[0x0], M2(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M2(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M2(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M2(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M2(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M2(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M2(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M2(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M2(0x8)); - SWAP_AND_SUB(B[0x9], 
C[0x9], M2(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M2(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M2(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M2(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M2(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M2(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M2(0xF)); - - if (++sc->Wlow == 0) sc->Whigh++; - - if (sc->Wlow-- == 0) sc->Whigh--; - } - - // download SIMD aligned deadlines - u32 simd_dst[8]; - _mm_storeu_si128((__m128i *)&simd_dst[0], C[8]); - _mm_storeu_si128((__m128i *)&simd_dst[4], C[9]); - - // unpack SIMD data - unsigned z; - for (z = 0; z < 2; z++) { - unsigned y = z * MSHABAL128_VECTOR_SIZE; - ((u32 *)dst0)[z] = simd_dst[y + 0]; - ((u32 *)dst1)[z] = simd_dst[y + 1]; - ((u32 *)dst2)[z] = simd_dst[y + 2]; - ((u32 *)dst3)[z] = simd_dst[y + 3]; - } - - // reset Wlow & Whigh - sc->Wlow = 1; - sc->Whigh = 0; -} - -#ifdef __cplusplus -} -#endif diff --git a/src/c/mshabal_128_sse2.h b/src/c/mshabal_128_sse2.h deleted file mode 100644 index 5874469..0000000 --- a/src/c/mshabal_128_sse2.h +++ /dev/null @@ -1,174 +0,0 @@ -/* - * A parallel implementation of Shabal, for platforms with SSE2. - * - * This is the header file for an implementation of the Shabal family - * of hash functions, designed for maximum parallel speed. It processes - * up to four instances of Shabal in parallel, using the SSE2 unit. - * Total bandwidth appears to be up to twice that of a plain 32-bit - * Shabal implementation. - * - * A computation uses a mshabal_context structure. That structure is - * supposed to be allocated and released by the caller, e.g. as a - * local or global variable, or on the heap. The structure contents - * are initialized with mshabal_init(). Once the structure has been - * initialized, data is input as chunks, with the mshabal() functions. - * Chunks for the four parallel instances are provided simultaneously - * and must have the same length. It is allowed not to use some of the - * instances; the corresponding parameters in mshabal() are then NULL. - * However, using NULL as a chunk for one of the instances effectively - * deactivates that instance; this cannot be used to "skip" a chunk - * for one instance. - * - * The computation is finalized with mshabal_close(). Some extra message - * bits (0 to 7) can be input. The outputs of the four parallel instances - * are written in the provided buffers. There again, NULL can be - * provided as parameter if the output of one of the instances is not - * needed. - * - * A mshabal_context instance is self-contained and holds no pointer. - * Thus, it can be cloned (e.g. with memcpy()) or moved (as long as - * proper alignment is maintained). This implementation uses no state - * variable beyond the context instance; thus, it is thread-safe and - * reentrant. - * - * The Shabal specification defines Shabal with output sizes of 192, - * 224, 256, 384 and 512 bits. This code accepts all those sizes, as - * well as any output size which is a multiple of 32, between 32 and - * 512 (inclusive). - * - * Parameters are not validated. Thus, undefined behaviour occurs if - * any of the "shall" or "must" clauses in this documentation is - * violated. - * - * - * (c) 2010 SAPHIR project. This software is provided 'as-is', without - * any express or implied warranty. In no event will the authors be held - * liable for any damages arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to no restriction.
- * - * Technical remarks and questions can be addressed to: - * - */ - -#ifndef MSHABAL_H__ -#define MSHABAL_H__ - -#include <stddef.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * We need an integer type with width 32-bit or more (preferably, with - * a width of exactly 32 bits). - */ -#if defined __STDC__ && __STDC_VERSION__ >= 199901L -#include <stdint.h> -#ifdef UINT32_MAX -typedef uint32_t mshabal_u32; -#else -typedef uint_fast32_t mshabal_u32; -#endif -#else -#if ((UINT_MAX >> 11) >> 11) >= 0x3FF -typedef unsigned int mshabal_u32; -#else -typedef unsigned long mshabal_u32; -#endif -#endif - -#define MSHABAL128_VECTOR_SIZE 4 - -/* - * The context structure for a Shabal computation. Contents are - * private. Such a structure should be allocated and released by - * the caller, in any memory area. - */ -typedef struct { - unsigned char buf0[64]; - unsigned char buf1[64]; - unsigned char buf2[64]; - unsigned char buf3[64]; - size_t ptr; - mshabal_u32 state[(12 + 16 + 16) * MSHABAL128_VECTOR_SIZE]; - mshabal_u32 Whigh, Wlow; - unsigned out_size; -} mshabal128_context; - -#pragma pack(1) -typedef struct { - mshabal_u32 state[(12 + 16 + 16) * MSHABAL128_VECTOR_SIZE]; - mshabal_u32 Whigh, Wlow; - unsigned out_size; -} mshabal128_context_fast; -#pragma pack() - -/* - * Initialize a context structure. The output size must be a multiple - * of 32, between 32 and 512 (inclusive). The output size is expressed - * in bits. - */ -void mshabal_init_sse2(mshabal128_context *sc, unsigned out_size); - -/* - * Process some more data bytes; four chunks of data, pointed to by - * data0, data1, data2 and data3, are processed. The four chunks have - * the same length of "len" bytes. For efficiency, it is best if data is - * processed by medium-sized chunks, e.g. a few kilobytes at a time. - * - * The "len" data bytes shall all be accessible. If "len" is zero, then - * this function does nothing and ignores the data* arguments. - * Otherwise, if one of the data* arguments is NULL, then the - * corresponding instance is deactivated (the final value obtained from - * that instance is undefined). - */ -void mshabal_sse2(mshabal128_context *sc, const void *data0, const void *data1, const void *data2, - const void *data3, size_t len); - -/* - * Terminate the Shabal computation incarnated by the provided context - * structure. "n" shall be a value between 0 and 7 (inclusive): this is - * the number of extra bits to extract from ub0, ub1, ub2 and ub3, and - * append at the end of the input message for each of the four parallel - * instances. Bits in "ub*" are taken in big-endian format: first bit is - * the one of numerical value 128, second bit has numerical value 64, - * and so on. Other bits in "ub*" are ignored. For most applications, - * input messages will consist of sequences of bytes, and the "ub*" and - * "n" parameters will be zero. - * - * The Shabal output for each of the parallel instances is written out - * in the areas pointed to by, respectively, dst0, dst1, dst2 and dst3. - * These areas shall be wide enough to accommodate the result (result - * size was specified as parameter to mshabal_init()). It is acceptable - * to use NULL for any of those pointers, if the result from the - * corresponding instance is not needed. - * - * After this call, the context structure is invalid. The caller shall - * release it, or reinitialize it with mshabal_init(). The mshabal_close() - * function does NOT imply a hidden call to mshabal_init().
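A minimal usage sketch of the four-lane streaming API documented above (an illustrative addition, assuming a 256-bit output and four equal-length inputs; in0..in3, inlen and the d* digest buffers are placeholders, not names from the original header):

    mshabal128_context sc;
    unsigned char d0[32], d1[32], d2[32], d3[32];
    /* one context drives four independent Shabal-256 computations */
    mshabal_init_sse2(&sc, 256);
    mshabal_sse2(&sc, in0, in1, in2, in3, inlen);  /* equal-length chunks, one per lane */
    mshabal_close_sse2(&sc, 0, 0, 0, 0, 0,         /* ub0..ub3 and n are 0: no extra bits */
                       d0, d1, d2, d3);            /* one 32-byte digest per lane */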
- */ -void mshabal_close_sse2(mshabal128_context *sc, unsigned ub0, unsigned ub1, unsigned ub2, - unsigned ub3, unsigned n, void *dst0, void *dst1, void *dst2, - void *dst3); - -/* - * optimised Shabal routine for PoC plotting and hashing - */ -void mshabal_hash_fast_sse2(mshabal128_context_fast *sc, void *message, void *termination, - void *dst, unsigned num); - -/* - * optimised Shabal routine for PoC mining - */ -void mshabal_deadline_fast_sse2(mshabal128_context_fast *sc, void *message, void *termination, void *dst0, - void *dst1, void *dst2, void *dst3); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/src/c/mshabal_256_avx2.c b/src/c/mshabal_256_avx2.c deleted file mode 100644 index 2081c09..0000000 --- a/src/c/mshabal_256_avx2.c +++ /dev/null @@ -1,1086 +0,0 @@ -/* - * Parallel implementation of Shabal, using the AVX2 unit. This code - * compiles and runs on x86 architectures, in 32-bit or 64-bit mode, - * which possess an AVX2-compatible SIMD unit. - * - * - * (c) 2010 SAPHIR project. This software is provided 'as-is', without - * any express or implied warranty. In no event will the authors be held - * liable for any damages arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to no restriction. - * - * Technical remarks and questions can be addressed to: - * - */ - -#include <stddef.h> -#include <string.h> -#include <immintrin.h> -#include "mshabal_256_avx2.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef _MSC_VER -#pragma warning(disable : 4146) -#endif - -typedef mshabal_u32 u32; - -#define C32(x) ((u32)x##UL) -#define T32(x) ((x)&C32(0xFFFFFFFF)) -#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n)))) - -static void mshabal_compress_avx2(mshabal256_context *sc, const unsigned char *buf0, - const unsigned char *buf1, const unsigned char *buf2, - const unsigned char *buf3, const unsigned char *buf4, - const unsigned char *buf5, const unsigned char *buf6, - const unsigned char *buf7, size_t num) { - union { - u32 words[16 * MSHABAL256_VECTOR_SIZE]; - __m256i data[16]; - } u; - size_t j; - __m256i A[12], B[16], C[16]; - __m256i one; - - for (j = 0; j < 12; j++) A[j] = _mm256_loadu_si256((__m256i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm256_loadu_si256((__m256i *)sc->state + j + 12); - C[j] = _mm256_loadu_si256((__m256i *)sc->state + j + 28); - } - one = _mm256_set1_epi32(C32(0xFFFFFFFF)); - -#define M(i) _mm256_load_si256(u.data + i) - - while (num-- > 0) { - for (j = 0; j < 16 * MSHABAL256_VECTOR_SIZE; j += MSHABAL256_VECTOR_SIZE) { - size_t o = j / 2; - u.words[j + 0] = *(u32 *)(buf0 + o); - u.words[j + 1] = *(u32 *)(buf1 + o); - u.words[j + 2] = *(u32 *)(buf2 + o); - u.words[j + 3] = *(u32 *)(buf3 + o); - u.words[j + 4] = *(u32 *)(buf4 + o); - u.words[j + 5] = *(u32 *)(buf5 + o); - u.words[j + 6] = *(u32 *)(buf6 + o); - u.words[j + 7] = *(u32 *)(buf7 + o); - } - - for (j = 0; j < 16; j++) B[j] = _mm256_add_epi32(B[j], M(j)); - - A[0] = _mm256_xor_si256(A[0], _mm256_set1_epi32(sc->Wlow)); - A[1] = _mm256_xor_si256(A[1], _mm256_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm256_or_si256(_mm256_slli_epi32(B[j], 17), _mm256_srli_epi32(B[j], 15)); - -#define PP256(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m256i tt; \ - tt = _mm256_or_si256(_mm256_slli_epi32(xa1, 15), _mm256_srli_epi32(xa1, 17)); \ - tt = _mm256_add_epi32(_mm256_slli_epi32(tt, 2), tt); \ - tt =
_mm256_xor_si256(_mm256_xor_si256(xa0, tt), xc); \ - tt = _mm256_add_epi32(_mm256_slli_epi32(tt, 1), tt); \ - tt = _mm256_xor_si256(_mm256_xor_si256(tt, xb1), \ - _mm256_xor_si256(_mm256_andnot_si256(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm256_or_si256(_mm256_slli_epi32(tt, 1), _mm256_srli_epi32(tt, 31)); \ - xb0 = _mm256_xor_si256(tt, _mm256_xor_si256(xa0, one)); \ - } while (0) - - PP256(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP256(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP256(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP256(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP256(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP256(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP256(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP256(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP256(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP256(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP256(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP256(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP256(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP256(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP256(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP256(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP256(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP256(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP256(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP256(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP256(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP256(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP256(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP256(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP256(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP256(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP256(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP256(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP256(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP256(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP256(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP256(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP256(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP256(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP256(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP256(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP256(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP256(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP256(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP256(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP256(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP256(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP256(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - 
PP256(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP256(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP256(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP256(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP256(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm256_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm256_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm256_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB256(xb, xc, xm) \ - do { \ - __m256i tmp; \ - tmp = xb; \ - xb = _mm256_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB256(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB256(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB256(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB256(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB256(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB256(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB256(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB256(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB256(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB256(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB256(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB256(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB256(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB256(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB256(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB256(B[0xF], C[0xF], M(0xF)); - - buf0 += 64; - buf1 += 64; - buf2 += 64; - buf3 += 64; - buf4 += 64; - buf5 += 64; - buf6 += 64; - buf7 += 64; - if (++sc->Wlow == 0) sc->Whigh++; - } - - for (j = 0; j < 12; j++) _mm256_storeu_si256((__m256i *)sc->state + j, A[j]); - for (j = 0; j < 16; j++) { - _mm256_storeu_si256((__m256i *)sc->state + j + 12, B[j]); - _mm256_storeu_si256((__m256i *)sc->state + j + 28, C[j]); - } - -#undef M -} - -void mshabal_init_avx2(mshabal256_context *sc, unsigned out_size) { - unsigned u; - - memset(sc->state, 0, sizeof sc->state); - memset(sc->buf0, 0, sizeof sc->buf0); - memset(sc->buf1, 0, sizeof sc->buf1); - memset(sc->buf2, 0, sizeof sc->buf2); - memset(sc->buf3, 0, sizeof sc->buf3); - memset(sc->buf4, 0, sizeof sc->buf4); - 
memset(sc->buf5, 0, sizeof sc->buf5); - memset(sc->buf6, 0, sizeof sc->buf6); - memset(sc->buf7, 0, sizeof sc->buf7); - for (u = 0; u < 16; u++) { - sc->buf0[4 * u + 0] = (out_size + u); - sc->buf0[4 * u + 1] = (out_size + u) >> 8; - sc->buf1[4 * u + 0] = (out_size + u); - sc->buf1[4 * u + 1] = (out_size + u) >> 8; - sc->buf2[4 * u + 0] = (out_size + u); - sc->buf2[4 * u + 1] = (out_size + u) >> 8; - sc->buf3[4 * u + 0] = (out_size + u); - sc->buf3[4 * u + 1] = (out_size + u) >> 8; - sc->buf4[4 * u + 0] = (out_size + u); - sc->buf4[4 * u + 1] = (out_size + u) >> 8; - sc->buf5[4 * u + 0] = (out_size + u); - sc->buf5[4 * u + 1] = (out_size + u) >> 8; - sc->buf6[4 * u + 0] = (out_size + u); - sc->buf6[4 * u + 1] = (out_size + u) >> 8; - sc->buf7[4 * u + 0] = (out_size + u); - sc->buf7[4 * u + 1] = (out_size + u) >> 8; - } - sc->Whigh = sc->Wlow = C32(0xFFFFFFFF); - mshabal_compress_avx2(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, sc->buf4, sc->buf5, - sc->buf6, sc->buf7, 1); - for (u = 0; u < 16; u++) { - sc->buf0[4 * u + 0] = (out_size + u + 16); - sc->buf0[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf1[4 * u + 0] = (out_size + u + 16); - sc->buf1[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf2[4 * u + 0] = (out_size + u + 16); - sc->buf2[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf3[4 * u + 0] = (out_size + u + 16); - sc->buf3[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf4[4 * u + 0] = (out_size + u + 16); - sc->buf4[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf5[4 * u + 0] = (out_size + u + 16); - sc->buf5[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf6[4 * u + 0] = (out_size + u + 16); - sc->buf6[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf7[4 * u + 0] = (out_size + u + 16); - sc->buf7[4 * u + 1] = (out_size + u + 16) >> 8; - } - mshabal_compress_avx2(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, sc->buf4, sc->buf5, - sc->buf6, sc->buf7, 1); - sc->ptr = 0; - sc->out_size = out_size; -} - -void mshabal_avx2(mshabal256_context *sc, const void *data0, const void *data1, const void *data2, const void *data3, - const void *data4, const void *data5, const void *data6, const void *data7, size_t len) { - size_t ptr, num; - - if (data0 == NULL) { - if (data1 == NULL) { - if (data2 == NULL) { - if (data3 == NULL) { - if (data4 == NULL) { - if (data5 == NULL) { - if (data6 == NULL) { - if (data7 == NULL) { - return; - } else { - data0 = data7; - } - } else { - data0 = data6; - } - } else { - data0 = data5; - } - } else { - data0 = data4; - } - } else { - data0 = data3; - } - } else { - data0 = data2; - } - } else { - data0 = data1; - } - } - - if (data1 == NULL) data1 = data0; - if (data2 == NULL) data2 = data0; - if (data3 == NULL) data3 = data0; - if (data4 == NULL) data4 = data0; - if (data5 == NULL) data5 = data0; - if (data6 == NULL) data6 = data0; - if (data7 == NULL) data7 = data0; - - ptr = sc->ptr; - if (ptr != 0) { - size_t clen = (sizeof sc->buf0 - ptr); - if (clen > len) { - memcpy(sc->buf0 + ptr, data0, len); - memcpy(sc->buf1 + ptr, data1, len); - memcpy(sc->buf2 + ptr, data2, len); - memcpy(sc->buf3 + ptr, data3, len); - memcpy(sc->buf4 + ptr, data4, len); - memcpy(sc->buf5 + ptr, data5, len); - memcpy(sc->buf6 + ptr, data6, len); - memcpy(sc->buf7 + ptr, data7, len); - sc->ptr = ptr + len; - return; - } else { - memcpy(sc->buf0 + ptr, data0, clen); - memcpy(sc->buf1 + ptr, data1, clen); - memcpy(sc->buf2 + ptr, data2, clen); - memcpy(sc->buf3 + ptr, data3, clen); - memcpy(sc->buf4 + ptr, data4, clen); - memcpy(sc->buf5 + ptr, data5, clen); - memcpy(sc->buf6 + ptr, 
data6, clen); - memcpy(sc->buf7 + ptr, data7, clen); - mshabal_compress_avx2(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, sc->buf4, sc->buf5, - sc->buf6, sc->buf7, 1); - data0 = (const unsigned char *)data0 + clen; - data1 = (const unsigned char *)data1 + clen; - data2 = (const unsigned char *)data2 + clen; - data3 = (const unsigned char *)data3 + clen; - data4 = (const unsigned char *)data4 + clen; - data5 = (const unsigned char *)data5 + clen; - data6 = (const unsigned char *)data6 + clen; - data7 = (const unsigned char *)data7 + clen; - len -= clen; - } - } - - num = len >> 6; - if (num != 0) { - mshabal_compress_avx2(sc, data0, data1, data2, data3, data4, data5, data6, data7, num); - data0 = (const unsigned char *)data0 + (num << 6); - data1 = (const unsigned char *)data1 + (num << 6); - data2 = (const unsigned char *)data2 + (num << 6); - data3 = (const unsigned char *)data3 + (num << 6); - data4 = (const unsigned char *)data4 + (num << 6); - data5 = (const unsigned char *)data5 + (num << 6); - data6 = (const unsigned char *)data6 + (num << 6); - data7 = (const unsigned char *)data7 + (num << 6); - } - len &= (size_t)63; - memcpy(sc->buf0, data0, len); - memcpy(sc->buf1, data1, len); - memcpy(sc->buf2, data2, len); - memcpy(sc->buf3, data3, len); - memcpy(sc->buf4, data4, len); - memcpy(sc->buf5, data5, len); - memcpy(sc->buf6, data6, len); - memcpy(sc->buf7, data7, len); - sc->ptr = len; -} - -void mshabal_close_avx2(mshabal256_context *sc, unsigned ub0, unsigned ub1, unsigned ub2, - unsigned ub3, unsigned ub4, unsigned ub5, unsigned ub6, unsigned ub7, - unsigned n, void *dst0, void *dst1, void *dst2, void *dst3, void *dst4, - void *dst5, void *dst6, void *dst7) { - size_t ptr, off; - unsigned z, out_size_w32; - - z = 0x80 >> n; - ptr = sc->ptr; - sc->buf0[ptr] = (ub0 & -z) | z; - sc->buf1[ptr] = (ub1 & -z) | z; - sc->buf2[ptr] = (ub2 & -z) | z; - sc->buf3[ptr] = (ub3 & -z) | z; - sc->buf4[ptr] = (ub4 & -z) | z; - sc->buf5[ptr] = (ub5 & -z) | z; - sc->buf6[ptr] = (ub6 & -z) | z; - sc->buf7[ptr] = (ub7 & -z) | z; - ptr++; - memset(sc->buf0 + ptr, 0, (sizeof sc->buf0) - ptr); - memset(sc->buf1 + ptr, 0, (sizeof sc->buf1) - ptr); - memset(sc->buf2 + ptr, 0, (sizeof sc->buf2) - ptr); - memset(sc->buf3 + ptr, 0, (sizeof sc->buf3) - ptr); - memset(sc->buf4 + ptr, 0, (sizeof sc->buf4) - ptr); - memset(sc->buf5 + ptr, 0, (sizeof sc->buf5) - ptr); - memset(sc->buf6 + ptr, 0, (sizeof sc->buf6) - ptr); - memset(sc->buf7 + ptr, 0, (sizeof sc->buf7) - ptr); - for (z = 0; z < 4; z++) { - mshabal_compress_avx2(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, sc->buf4, sc->buf5, - sc->buf6, sc->buf7, 1); - if (sc->Wlow-- == 0) sc->Whigh--; - } - out_size_w32 = sc->out_size >> 5; - off = MSHABAL256_VECTOR_SIZE * (28 + (16 - out_size_w32)); - if (dst0 != NULL) { - u32 *out; - - out = (u32 *)dst0; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL256_VECTOR_SIZE + 0]; - } - if (dst1 != NULL) { - u32 *out; - - out = (u32 *)dst1; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL256_VECTOR_SIZE + 1]; - } - if (dst2 != NULL) { - u32 *out; - - out = (u32 *)dst2; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL256_VECTOR_SIZE + 2]; - } - if (dst3 != NULL) { - u32 *out; - - out = (u32 *)dst3; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL256_VECTOR_SIZE + 3]; - } - if (dst4 != NULL) { - u32 *out; - - out = (u32 *)dst4; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL256_VECTOR_SIZE + 
4]; - } - if (dst5 != NULL) { - u32 *out; - - out = (u32 *)dst5; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL256_VECTOR_SIZE + 5]; - } - if (dst6 != NULL) { - u32 *out; - - out = (u32 *)dst6; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL256_VECTOR_SIZE + 6]; - } - if (dst7 != NULL) { - u32 *out; - - out = (u32 *)dst7; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL256_VECTOR_SIZE + 7]; - } -} - -// Shabal routines optimized for plotting and hashing -void mshabal_hash_fast_avx2(mshabal256_context_fast *sc, void *message, void *termination, - void *dst, unsigned num) { - union input { - u32 words[16 * MSHABAL256_VECTOR_SIZE]; - __m256i data[16]; - }; - - size_t j; - __m256i A[12], B[16], C[16]; - __m256i one; - - for (j = 0; j < 12; j++) A[j] = _mm256_loadu_si256((__m256i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm256_loadu_si256((__m256i *)sc->state + j + 12); - C[j] = _mm256_loadu_si256((__m256i *)sc->state + j + 28); - } - one = _mm256_set1_epi32(C32(0xFFFFFFFF)); - - // round 1 -#define M(i) _mm256_loadu_si256((__m256i *)message + i) - - while (num-- > 0) { - for (j = 0; j < 16; j++) B[j] = _mm256_add_epi32(B[j], M(j)); - - A[0] = _mm256_xor_si256(A[0], _mm256_set1_epi32(sc->Wlow)); - A[1] = _mm256_xor_si256(A[1], _mm256_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm256_or_si256(_mm256_slli_epi32(B[j], 17), _mm256_srli_epi32(B[j], 15)); - -#define PP256(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m256i tt; \ - tt = _mm256_or_si256(_mm256_slli_epi32(xa1, 15), _mm256_srli_epi32(xa1, 17)); \ - tt = _mm256_add_epi32(_mm256_slli_epi32(tt, 2), tt); \ - tt = _mm256_xor_si256(_mm256_xor_si256(xa0, tt), xc); \ - tt = _mm256_add_epi32(_mm256_slli_epi32(tt, 1), tt); \ - tt = _mm256_xor_si256(_mm256_xor_si256(tt, xb1), \ - _mm256_xor_si256(_mm256_andnot_si256(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm256_or_si256(_mm256_slli_epi32(tt, 1), _mm256_srli_epi32(tt, 31)); \ - xb0 = _mm256_xor_si256(tt, _mm256_xor_si256(xa0, one)); \ - } while (0) - - PP256(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP256(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP256(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP256(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP256(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP256(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP256(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP256(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP256(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP256(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP256(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP256(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP256(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP256(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP256(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP256(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP256(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP256(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP256(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP256(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], 
B[0x9], C[0x5], M(0x3)); - PP256(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP256(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP256(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP256(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP256(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP256(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP256(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP256(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP256(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP256(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP256(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP256(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP256(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP256(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP256(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP256(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP256(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP256(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP256(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP256(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP256(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP256(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP256(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP256(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP256(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP256(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP256(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP256(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm256_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm256_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm256_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm256_add_epi32(A[0x3], 
C[0x6]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB256(xb, xc, xm) \ - do { \ - __m256i tmp; \ - tmp = xb; \ - xb = _mm256_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB256(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB256(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB256(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB256(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB256(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB256(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB256(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB256(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB256(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB256(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB256(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB256(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB256(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB256(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB256(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB256(B[0xF], C[0xF], M(0xF)); - - // move data pointer - message = (__m256i *)message + 16; - - if (++sc->Wlow == 0) sc->Whigh++; - } - - // round 2-5 -#define M2(i) _mm256_load_si256((__m256i *)termination + i) - - for (int k = 0; k < 4; k++) { - for (j = 0; j < 16; j++) B[j] = _mm256_add_epi32(B[j], M2(j)); - - A[0] = _mm256_xor_si256(A[0], _mm256_set1_epi32(sc->Wlow)); - A[1] = _mm256_xor_si256(A[1], _mm256_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm256_or_si256(_mm256_slli_epi32(B[j], 17), _mm256_srli_epi32(B[j], 15)); - - PP256(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP256(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP256(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP256(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP256(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP256(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP256(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP256(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP256(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP256(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP256(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP256(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP256(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP256(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP256(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP256(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP256(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP256(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP256(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP256(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP256(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP256(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP256(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP256(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP256(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP256(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP256(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP256(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], 
B[0x1], C[0xD], M2(0xB)); - PP256(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP256(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP256(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP256(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP256(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP256(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP256(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP256(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP256(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP256(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP256(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP256(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP256(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP256(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP256(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP256(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP256(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP256(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP256(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP256(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - A[0xB] = _mm256_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm256_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm256_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0x3]); - - SWAP_AND_SUB256(B[0x0], C[0x0], M2(0x0)); - SWAP_AND_SUB256(B[0x1], C[0x1], M2(0x1)); - SWAP_AND_SUB256(B[0x2], C[0x2], M2(0x2)); - SWAP_AND_SUB256(B[0x3], C[0x3], M2(0x3)); - SWAP_AND_SUB256(B[0x4], C[0x4], M2(0x4)); - SWAP_AND_SUB256(B[0x5], C[0x5], M2(0x5)); - SWAP_AND_SUB256(B[0x6], C[0x6], M2(0x6)); - SWAP_AND_SUB256(B[0x7], C[0x7], M2(0x7)); - SWAP_AND_SUB256(B[0x8], C[0x8], M2(0x8)); - SWAP_AND_SUB256(B[0x9], 
C[0x9], M2(0x9)); - SWAP_AND_SUB256(B[0xA], C[0xA], M2(0xA)); - SWAP_AND_SUB256(B[0xB], C[0xB], M2(0xB)); - SWAP_AND_SUB256(B[0xC], C[0xC], M2(0xC)); - SWAP_AND_SUB256(B[0xD], C[0xD], M2(0xD)); - SWAP_AND_SUB256(B[0xE], C[0xE], M2(0xE)); - SWAP_AND_SUB256(B[0xF], C[0xF], M2(0xF)); - - if (++sc->Wlow == 0) sc->Whigh++; - - if (sc->Wlow-- == 0) sc->Whigh--; - } - - // download SIMD aligned hashes - for (j = 0; j < 8; j++) { - _mm256_storeu_si256((__m256i *)dst + j, C[j+8]); - } - - // reset Wlow & Whigh - sc->Wlow = 1; - sc->Whigh = 0; -} - -// Shabal routine optimized for mining -void mshabal_deadline_fast_avx2(mshabal256_context_fast *sc, void *message, void *termination, void *dst0, void *dst1, void *dst2, - void *dst3, void *dst4, void *dst5, void *dst6, void *dst7) { - union input { - u32 words[16 * MSHABAL256_VECTOR_SIZE]; - __m256i data[16]; - }; - size_t j; - __m256i A[12], B[16], C[16]; - __m256i one; - - for (j = 0; j < 12; j++) A[j] = _mm256_loadu_si256((__m256i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm256_loadu_si256((__m256i *)sc->state + j + 12); - C[j] = _mm256_loadu_si256((__m256i *)sc->state + j + 28); - } - one = _mm256_set1_epi32(C32(0xFFFFFFFF)); - - // round 1 -#define M(i) _mm256_loadu_si256((__m256i *)message + i) - - for (j = 0; j < 16; j++) B[j] = _mm256_add_epi32(B[j], M(j)); - - A[0] = _mm256_xor_si256(A[0], _mm256_set1_epi32(sc->Wlow)); - A[1] = _mm256_xor_si256(A[1], _mm256_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm256_or_si256(_mm256_slli_epi32(B[j], 17), _mm256_srli_epi32(B[j], 15)); - -#define PP256(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m256i tt; \ - tt = _mm256_or_si256(_mm256_slli_epi32(xa1, 15), _mm256_srli_epi32(xa1, 17)); \ - tt = _mm256_add_epi32(_mm256_slli_epi32(tt, 2), tt); \ - tt = _mm256_xor_si256(_mm256_xor_si256(xa0, tt), xc); \ - tt = _mm256_add_epi32(_mm256_slli_epi32(tt, 1), tt); \ - tt = _mm256_xor_si256(_mm256_xor_si256(tt, xb1), \ - _mm256_xor_si256(_mm256_andnot_si256(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm256_or_si256(_mm256_slli_epi32(tt, 1), _mm256_srli_epi32(tt, 31)); \ - xb0 = _mm256_xor_si256(tt, _mm256_xor_si256(xa0, one)); \ - } while (0) - - PP256(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP256(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP256(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP256(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP256(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP256(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP256(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP256(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP256(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP256(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP256(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP256(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP256(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP256(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP256(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP256(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP256(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP256(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP256(A[0x6], A[0x5], 
B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP256(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP256(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP256(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP256(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP256(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP256(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP256(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP256(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP256(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP256(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP256(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP256(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP256(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP256(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP256(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP256(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP256(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP256(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP256(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP256(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP256(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP256(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP256(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP256(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP256(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP256(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP256(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP256(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP256(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm256_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm256_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm256_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x9]); - A[0x5] = 
_mm256_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB256(xb, xc, xm) \ - do { \ - __m256i tmp; \ - tmp = xb; \ - xb = _mm256_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB256(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB256(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB256(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB256(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB256(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB256(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB256(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB256(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB256(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB256(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB256(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB256(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB256(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB256(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB256(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB256(B[0xF], C[0xF], M(0xF)); - - if (++sc->Wlow == 0) sc->Whigh++; - - // round 2-5 -#define M2(i) _mm256_load_si256((__m256i *)termination + i) - - for (int k = 0; k < 4; k++) { - for (j = 0; j < 16; j++) B[j] = _mm256_add_epi32(B[j], M2(j)); - - A[0] = _mm256_xor_si256(A[0], _mm256_set1_epi32(sc->Wlow)); - A[1] = _mm256_xor_si256(A[1], _mm256_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm256_or_si256(_mm256_slli_epi32(B[j], 17), _mm256_srli_epi32(B[j], 15)); - - PP256(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP256(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP256(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP256(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP256(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP256(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP256(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP256(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP256(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP256(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP256(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP256(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP256(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP256(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP256(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP256(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP256(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP256(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP256(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP256(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP256(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP256(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP256(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP256(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP256(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP256(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP256(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - 
PP256(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP256(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP256(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP256(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP256(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP256(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP256(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP256(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP256(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP256(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP256(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP256(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP256(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP256(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP256(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP256(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP256(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP256(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP256(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP256(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP256(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - A[0xB] = _mm256_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm256_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm256_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0x3]); - - SWAP_AND_SUB256(B[0x0], C[0x0], M2(0x0)); - SWAP_AND_SUB256(B[0x1], C[0x1], M2(0x1)); - SWAP_AND_SUB256(B[0x2], C[0x2], M2(0x2)); - SWAP_AND_SUB256(B[0x3], C[0x3], M2(0x3)); - SWAP_AND_SUB256(B[0x4], C[0x4], M2(0x4)); - SWAP_AND_SUB256(B[0x5], C[0x5], M2(0x5)); - SWAP_AND_SUB256(B[0x6], C[0x6], M2(0x6)); - SWAP_AND_SUB256(B[0x7], C[0x7], M2(0x7)); - 
SWAP_AND_SUB256(B[0x8], C[0x8], M2(0x8)); - SWAP_AND_SUB256(B[0x9], C[0x9], M2(0x9)); - SWAP_AND_SUB256(B[0xA], C[0xA], M2(0xA)); - SWAP_AND_SUB256(B[0xB], C[0xB], M2(0xB)); - SWAP_AND_SUB256(B[0xC], C[0xC], M2(0xC)); - SWAP_AND_SUB256(B[0xD], C[0xD], M2(0xD)); - SWAP_AND_SUB256(B[0xE], C[0xE], M2(0xE)); - SWAP_AND_SUB256(B[0xF], C[0xF], M2(0xF)); - - if (++sc->Wlow == 0) sc->Whigh++; - - if (sc->Wlow-- == 0) sc->Whigh--; - } - - // download SIMD aligned deadlines - u32 simd_dst[16]; - _mm256_storeu_si256((__m256i *)&simd_dst[0], C[8]); - _mm256_storeu_si256((__m256i *)&simd_dst[8], C[9]); - - // unpack SIMD data - unsigned z; - for (z = 0; z < 2; z++) { - unsigned y = z * MSHABAL256_VECTOR_SIZE; - ((u32 *)dst0)[z] = simd_dst[y + 0]; - ((u32 *)dst1)[z] = simd_dst[y + 1]; - ((u32 *)dst2)[z] = simd_dst[y + 2]; - ((u32 *)dst3)[z] = simd_dst[y + 3]; - ((u32 *)dst4)[z] = simd_dst[y + 4]; - ((u32 *)dst5)[z] = simd_dst[y + 5]; - ((u32 *)dst6)[z] = simd_dst[y + 6]; - ((u32 *)dst7)[z] = simd_dst[y + 7]; - } - - // reset Wlow & Whigh - sc->Wlow = 1; - sc->Whigh = 0; -} - -#ifdef __cplusplus -} -#endif diff --git a/src/c/mshabal_256_avx2.h b/src/c/mshabal_256_avx2.h deleted file mode 100644 index 4c0cb38..0000000 --- a/src/c/mshabal_256_avx2.h +++ /dev/null @@ -1,179 +0,0 @@ -/* - * A parallel implementation of Shabal, for platforms with AVX2. - * - * This is the header file for an implementation of the Shabal family - * of hash functions, designed for maximum parallel speed. It processes - * up to eight instances of Shabal in parallel, using the AVX2 unit. - * Total bandwidth appears to be up to twice that of a plain 32-bit - * Shabal implementation. - * - * A computation uses a mshabal_context structure. That structure is - * supposed to be allocated and released by the caller, e.g. as a - * local or global variable, or on the heap. The structure contents - * are initialized with mshabal_init(). Once the structure has been - * initialized, data is input as chunks, with the mshabal() functions. - * Chunks for the eight parallel instances are provided simultaneously - * and must have the same length. It is allowed not to use some of the - * instances; the corresponding parameters in mshabal() are then NULL. - * However, using NULL as a chunk for one of the instances effectively - * deactivates that instance; this cannot be used to "skip" a chunk - * for one instance. - * - * The computation is finalized with mshabal_close(). Some extra message - * bits (0 to 7) can be input. The outputs of the eight parallel instances - * are written in the provided buffers. There again, NULL can be - * provided as parameter if the output of one of the instances is not - * needed. - * - * A mshabal_context instance is self-contained and holds no pointer. - * Thus, it can be cloned (e.g. with memcpy()) or moved (as long as - * proper alignment is maintained). This implementation uses no state - * variable beyond the context instance; thus, it is thread-safe and - * reentrant. - * - * The Shabal specification defines Shabal with output sizes of 192, - * 224, 256, 384 and 512 bits. This code accepts all those sizes, as - * well as any output size which is a multiple of 32, between 32 and - * 512 (inclusive). - * - * Parameters are not validated. Thus, undefined behaviour occurs if - * any of the "shall" or "must" clauses in this documentation is - * violated. - * - * - * (c) 2010 SAPHIR project. This software is provided 'as-is', without - * any express or implied warranty.
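Aside: PP256 above is Shabal's permutation element evaluated once per 32-bit lane, so the eight interleaved instances advance in lockstep. As a reading aid, here is a scalar sketch of a single lane inferred from the macro body; the names rotl32 and perm_elt are illustrative, not part of the deleted sources:

#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

/* One lane of PP256: the *5u and *3u correspond to the shift-and-add pairs
   in the macro, _mm256_andnot_si256(xb3, xb2) is (~xb3 & xb2), and the final
   XOR with the all-ones vector `one` is a bitwise complement. */
static void perm_elt(uint32_t *xa0, uint32_t xa1, uint32_t *xb0,
                     uint32_t xb1, uint32_t xb2, uint32_t xb3,
                     uint32_t xc, uint32_t xm) {
    *xa0 = ((*xa0 ^ (rotl32(xa1, 15) * 5u) ^ xc) * 3u) ^ xb1 ^ (xb2 & ~xb3) ^ xm;
    *xb0 = ~(rotl32(*xb0, 1) ^ *xa0);
}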
In no event will the authors be held - * liable for any damages arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to no restriction. - * - * Technical remarks and questions can be addressed to: - * - */ - -#ifndef MSHABAL_H__ -#define MSHABAL_H__ - -#include <stddef.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * We need an integer type with width 32-bit or more (preferably, with - * a width of exactly 32 bits). - */ -#if defined __STDC__ && __STDC_VERSION__ >= 199901L -#include <stdint.h> -#ifdef UINT32_MAX -typedef uint32_t mshabal_u32; -#else -typedef uint_fast32_t mshabal_u32; -#endif -#else -#if ((UINT_MAX >> 11) >> 11) >= 0x3FF -typedef unsigned int mshabal_u32; -#else -typedef unsigned long mshabal_u32; -#endif -#endif - -#define MSHABAL256_VECTOR_SIZE 8 - -/* - * The context structure for a Shabal computation. Contents are - * private. Such a structure should be allocated and released by - * the caller, in any memory area. - */ -typedef struct { - unsigned char buf0[64]; - unsigned char buf1[64]; - unsigned char buf2[64]; - unsigned char buf3[64]; - unsigned char buf4[64]; - unsigned char buf5[64]; - unsigned char buf6[64]; - unsigned char buf7[64]; - size_t ptr; - mshabal_u32 state[(12 + 16 + 16) * MSHABAL256_VECTOR_SIZE]; - mshabal_u32 Whigh, Wlow; - unsigned out_size; -} mshabal256_context; - -#pragma pack(1) -typedef struct { - mshabal_u32 state[(12 + 16 + 16) * MSHABAL256_VECTOR_SIZE]; - mshabal_u32 Whigh, Wlow; - unsigned out_size; -} mshabal256_context_fast; -#pragma pack() - -/* - * Initialize a context structure. The output size must be a multiple - * of 32, between 32 and 512 (inclusive). The output size is expressed - * in bits. - */ -void mshabal_init_avx2(mshabal256_context *sc, unsigned out_size); - -/* - * Process some more data bytes; eight chunks of data, pointed to by - * data0 through data7, are processed. The eight chunks have - * the same length of "len" bytes. For efficiency, it is best if data is - * processed by medium-sized chunks, e.g. a few kilobytes at a time. - * - * The "len" data bytes shall all be accessible. If "len" is zero, this - * function does nothing and ignores the data* arguments. - * Otherwise, if one of the data* arguments is NULL, then the - * corresponding instance is deactivated (the final value obtained from - * that instance is undefined). - */ -void mshabal_avx2(mshabal256_context *sc, const void *data0, const void *data1, const void *data2, const void *data3, - const void *data4, const void *data5, const void *data6, const void *data7, size_t len); - -/* - * Terminate the Shabal computation incarnated by the provided context - * structure. "n" shall be a value between 0 and 7 (inclusive): this is - * the number of extra bits to extract from ub0 through ub7, and - * append at the end of the input message for each of the eight parallel - * instances. Bits in "ub*" are taken in big-endian format: first bit is - * the one of numerical value 128, second bit has numerical value 64, - * and so on. Other bits in "ub*" are ignored. For most applications, - * input messages will consist of sequences of bytes, and the "ub*" and - * "n" parameters will be zero. - * - * The Shabal output for each of the parallel instances is written out - * in the areas pointed to by, respectively, dst0 through dst7.
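A worked example of the trailing-bit convention may help: to append the three extra bits 1, 0, 1 to instance 0 (and three zero bits to the other instances), pass n = 3 and place the bits at the top of ub0. The context and destination pointers below are hypothetical:

/* Hedged sketch: finalize with three extra message bits 1,0,1 on instance 0. */
unsigned ub0 = 0xA0; /* 0b101xxxxx - bits are taken from the top down */
mshabal_close_avx2(&sc, ub0, 0, 0, 0, 0, 0, 0, 0, 3,
                   dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
/* Internally the padding byte for instance 0 becomes (ub0 & -z) | z with
   z = 0x80 >> 3 = 0x10, i.e. 0xB0: the three message bits, then the
   mandatory '1' padding bit, then zeros. */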
- * These areas shall be wide enough to accommodate the result (result - * size was specified as parameter to mshabal_init()). It is acceptable - * to use NULL for any of those pointers, if the result from the - * corresponding instance is not needed. - * - * After this call, the context structure is invalid. The caller shall - * release it, or reinitialize it with mshabal_init(). The mshabal_close() - * function does NOT imply a hidden call to mshabal_init(). - */ -void mshabal_close_avx2(mshabal256_context *sc, unsigned ub0, unsigned ub1, unsigned ub2, - unsigned ub3, unsigned ub4, unsigned ub5, unsigned ub6, unsigned ub7, - unsigned n, void *dst0, void *dst1, void *dst2, void *dst3, void *dst4, - void *dst5, void *dst6, void *dst7); - -/* - * optimised Shabal routine for PoC plotting and hashing - */ -void mshabal256_openclose_fast(mshabal256_context_fast *sc, void *message, void *termination, - void *dst, unsigned len); - -/* - * optimised Shabal routine for PoC mining - */ -void mshabal_deadline_fast_avx2(mshabal256_context_fast *sc, void *message, void *termination, void *dst0, - void *dst1, void *dst2, void *dst3, void *dst4, void *dst5, - void *dst6, void *dst7); -#ifdef __cplusplus -} -#endif - -#endif diff --git a/src/c/mshabal_512_avx512f.c b/src/c/mshabal_512_avx512f.c deleted file mode 100644 index 06c7c7b..0000000 --- a/src/c/mshabal_512_avx512f.c +++ /dev/null @@ -1,1318 +0,0 @@ -/* - * Parallel implementation of Shabal, using the AVX512f unit. This code - * compiles and runs on x86 architectures, in 32-bit or 64-bit mode, - * which possess an AVX512f-compatible SIMD unit. - * - * - * (c) 2010 SAPHIR project. This software is provided 'as-is', without - * any express or implied warranty. In no event will the authors be held - * liable for any damages arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to no restriction.
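Taken together, the three declarations support a simple one-shot pattern; a minimal caller-side sketch follows (the wrapper hash8_256 and its buffers are illustrative assumptions, not code from this repository):

#include <stddef.h>
/* assumes "mshabal_256_avx2.h" is included */

/* Hash eight same-length messages in parallel; dig[i] receives the
   256-bit (32-byte) Shabal digest of m[i]. */
static void hash8_256(const void *m[8], size_t len, unsigned char dig[8][32]) {
    mshabal256_context sc;
    mshabal_init_avx2(&sc, 256); /* output size in bits */
    mshabal_avx2(&sc, m[0], m[1], m[2], m[3], m[4], m[5], m[6], m[7], len);
    mshabal_close_avx2(&sc, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* no extra bits */
                       dig[0], dig[1], dig[2], dig[3],
                       dig[4], dig[5], dig[6], dig[7]);
}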
- * - * Technical remarks and questions can be addressed to: - * - */ - -#include <stddef.h> -#include <string.h> -#include <immintrin.h> -#include "mshabal_512_avx512f.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef _MSC_VER -#pragma warning(disable : 4146) -#endif - -typedef mshabal_u32 u32; - -#define C32(x) ((u32)x##UL) -#define T32(x) ((x)&C32(0xFFFFFFFF)) -#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n)))) - -static void mshabal_compress_avx512f(mshabal512_context *sc, const unsigned char *buf0, - const unsigned char *buf1, const unsigned char *buf2, - const unsigned char *buf3, const unsigned char *buf4, - const unsigned char *buf5, const unsigned char *buf6, - const unsigned char *buf7, const unsigned char *buf8, - const unsigned char *buf9, const unsigned char *buf10, - const unsigned char *buf11, const unsigned char *buf12, - const unsigned char *buf13, const unsigned char *buf14, - const unsigned char *buf15, size_t num) { - union { - u32 words[16 * MSHABAL512_VECTOR_SIZE]; - __m512i data[16]; - } u; - size_t j; - __m512i A[12], B[16], C[16]; - __m512i one; - - for (j = 0; j < 12; j++) A[j] = _mm512_loadu_si512((__m512i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm512_loadu_si512((__m512i *)sc->state + j + 12); - C[j] = _mm512_loadu_si512((__m512i *)sc->state + j + 28); - } - one = _mm512_set1_epi32(C32(0xFFFFFFFF)); - -#define M(i) _mm512_load_si512(u.data + i) - - while (num-- > 0) { - for (j = 0; j < 16 * MSHABAL512_VECTOR_SIZE; j += MSHABAL512_VECTOR_SIZE) { - size_t o = j / 4; - u.words[j + 0] = *(u32 *)(buf0 + o); - u.words[j + 1] = *(u32 *)(buf1 + o); - u.words[j + 2] = *(u32 *)(buf2 + o); - u.words[j + 3] = *(u32 *)(buf3 + o); - u.words[j + 4] = *(u32 *)(buf4 + o); - u.words[j + 5] = *(u32 *)(buf5 + o); - u.words[j + 6] = *(u32 *)(buf6 + o); - u.words[j + 7] = *(u32 *)(buf7 + o); - u.words[j + 8] = *(u32 *)(buf8 + o); - u.words[j + 9] = *(u32 *)(buf9 + o); - u.words[j + 10] = *(u32 *)(buf10 + o); - u.words[j + 11] = *(u32 *)(buf11 + o); - u.words[j + 12] = *(u32 *)(buf12 + o); - u.words[j + 13] = *(u32 *)(buf13 + o); - u.words[j + 14] = *(u32 *)(buf14 + o); - u.words[j + 15] = *(u32 *)(buf15 + o); - } - - for (j = 0; j < 16; j++) B[j] = _mm512_add_epi32(B[j], M(j)); - - A[0] = _mm512_xor_si512(A[0], _mm512_set1_epi32(sc->Wlow)); - A[1] = _mm512_xor_si512(A[1], _mm512_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm512_or_si512(_mm512_slli_epi32(B[j], 17), _mm512_srli_epi32(B[j], 15)); - -#define PP512(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m512i tt; \ - tt = _mm512_or_si512(_mm512_slli_epi32(xa1, 15), _mm512_srli_epi32(xa1, 17)); \ - tt = _mm512_add_epi32(_mm512_slli_epi32(tt, 2), tt); \ - tt = _mm512_xor_si512(_mm512_xor_si512(xa0, tt), xc); \ - tt = _mm512_add_epi32(_mm512_slli_epi32(tt, 1), tt); \ - tt = _mm512_xor_si512(_mm512_xor_si512(tt, xb1), \ - _mm512_xor_si512(_mm512_andnot_si512(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm512_or_si512(_mm512_slli_epi32(tt, 1), _mm512_srli_epi32(tt, 31)); \ - xb0 = _mm512_xor_si512(tt, _mm512_xor_si512(xa0, one)); \ - } while (0) - - PP512(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP512(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP512(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP512(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP512(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP512(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP512(A[0x6], A[0x5],
B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP512(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP512(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP512(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP512(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP512(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP512(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP512(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP512(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP512(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP512(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP512(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP512(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP512(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP512(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP512(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP512(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP512(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP512(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP512(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP512(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP512(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP512(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP512(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP512(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP512(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP512(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP512(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP512(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP512(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP512(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP512(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP512(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP512(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP512(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP512(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP512(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP512(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP512(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP512(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP512(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP512(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm512_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0xC]); - A[0x0] = 
_mm512_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm512_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm512_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB512(xb, xc, xm) \ - do { \ - __m512i tmp; \ - tmp = xb; \ - xb = _mm512_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB512(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB512(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB512(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB512(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB512(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB512(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB512(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB512(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB512(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB512(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB512(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB512(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB512(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB512(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB512(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB512(B[0xF], C[0xF], M(0xF)); - - buf0 += 64; - buf1 += 64; - buf2 += 64; - buf3 += 64; - buf4 += 64; - buf5 += 64; - buf6 += 64; - buf7 += 64; - buf8 += 64; - buf9 += 64; - buf10 += 64; - buf11 += 64; - buf12 += 64; - buf13 += 64; - buf14 += 64; - buf15 += 64; - if (++sc->Wlow == 0) sc->Whigh++; - } - - for (j = 0; j < 12; j++) _mm512_storeu_si512((__m512i *)sc->state + j, A[j]); - for (j = 0; j < 16; j++) { - _mm512_storeu_si512((__m512i *)sc->state + j + 12, B[j]); - _mm512_storeu_si512((__m512i *)sc->state + j + 28, C[j]); - } - -#undef M -} - -void mshabal_init_avx512f(mshabal512_context *sc, unsigned out_size) { - unsigned u; - - memset(sc->state, 0, sizeof sc->state); - memset(sc->buf0, 0, sizeof sc->buf0); - memset(sc->buf1, 0, sizeof sc->buf1); - memset(sc->buf2, 0, sizeof sc->buf2); - memset(sc->buf3, 0, sizeof sc->buf3); - memset(sc->buf4, 0, sizeof sc->buf4); - memset(sc->buf5, 0, sizeof sc->buf5); - memset(sc->buf6, 0, sizeof sc->buf6); - memset(sc->buf7, 0, sizeof sc->buf7); - memset(sc->buf8, 0, sizeof sc->buf8); - memset(sc->buf9, 0, sizeof sc->buf9); - memset(sc->buf10, 0, sizeof sc->buf10); - memset(sc->buf11, 0, sizeof sc->buf11); - memset(sc->buf12, 0, sizeof sc->buf12); - memset(sc->buf13, 0, sizeof sc->buf13); - memset(sc->buf14, 0, sizeof sc->buf14); - memset(sc->buf15, 0, sizeof sc->buf15); - for (u = 0; u < 16; u++) { - sc->buf0[4 * u + 0] = (out_size + u); - sc->buf0[4 * u + 1] = (out_size + u) >> 8; - sc->buf1[4 * u + 0] = (out_size + u); - sc->buf1[4 * u + 1] = (out_size + u) >> 8; - sc->buf2[4 * u + 0] = (out_size + u); - sc->buf2[4 * u + 1] = (out_size + u) >> 8; - sc->buf3[4 * u 
+ 0] = (out_size + u); - sc->buf3[4 * u + 1] = (out_size + u) >> 8; - sc->buf4[4 * u + 0] = (out_size + u); - sc->buf4[4 * u + 1] = (out_size + u) >> 8; - sc->buf5[4 * u + 0] = (out_size + u); - sc->buf5[4 * u + 1] = (out_size + u) >> 8; - sc->buf6[4 * u + 0] = (out_size + u); - sc->buf6[4 * u + 1] = (out_size + u) >> 8; - sc->buf7[4 * u + 0] = (out_size + u); - sc->buf7[4 * u + 1] = (out_size + u) >> 8; - sc->buf8[4 * u + 0] = (out_size + u); - sc->buf8[4 * u + 1] = (out_size + u) >> 8; - sc->buf9[4 * u + 0] = (out_size + u); - sc->buf9[4 * u + 1] = (out_size + u) >> 8; - sc->buf10[4 * u + 0] = (out_size + u); - sc->buf10[4 * u + 1] = (out_size + u) >> 8; - sc->buf11[4 * u + 0] = (out_size + u); - sc->buf11[4 * u + 1] = (out_size + u) >> 8; - sc->buf12[4 * u + 0] = (out_size + u); - sc->buf12[4 * u + 1] = (out_size + u) >> 8; - sc->buf13[4 * u + 0] = (out_size + u); - sc->buf13[4 * u + 1] = (out_size + u) >> 8; - sc->buf14[4 * u + 0] = (out_size + u); - sc->buf14[4 * u + 1] = (out_size + u) >> 8; - sc->buf15[4 * u + 0] = (out_size + u); - sc->buf15[4 * u + 1] = (out_size + u) >> 8; - } - sc->Whigh = sc->Wlow = C32(0xFFFFFFFF); - mshabal_compress_avx512f(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, sc->buf4, sc->buf5, - sc->buf6, sc->buf7, sc->buf8, sc->buf9, sc->buf10, sc->buf11, - sc->buf12, sc->buf13, sc->buf14, sc->buf15, 1); - for (u = 0; u < 16; u++) { - sc->buf0[4 * u + 0] = (out_size + u + 16); - sc->buf0[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf1[4 * u + 0] = (out_size + u + 16); - sc->buf1[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf2[4 * u + 0] = (out_size + u + 16); - sc->buf2[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf3[4 * u + 0] = (out_size + u + 16); - sc->buf3[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf4[4 * u + 0] = (out_size + u + 16); - sc->buf4[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf5[4 * u + 0] = (out_size + u + 16); - sc->buf5[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf6[4 * u + 0] = (out_size + u + 16); - sc->buf6[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf7[4 * u + 0] = (out_size + u + 16); - sc->buf7[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf8[4 * u + 0] = (out_size + u + 16); - sc->buf8[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf9[4 * u + 0] = (out_size + u + 16); - sc->buf9[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf10[4 * u + 0] = (out_size + u + 16); - sc->buf10[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf11[4 * u + 0] = (out_size + u + 16); - sc->buf11[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf12[4 * u + 0] = (out_size + u + 16); - sc->buf12[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf13[4 * u + 0] = (out_size + u + 16); - sc->buf13[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf14[4 * u + 0] = (out_size + u + 16); - sc->buf14[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf15[4 * u + 0] = (out_size + u + 16); - sc->buf15[4 * u + 1] = (out_size + u + 16) >> 8; - } - mshabal_compress_avx512f(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, sc->buf4, sc->buf5, - sc->buf6, sc->buf7, sc->buf8, sc->buf9, sc->buf10, sc->buf11, - sc->buf12, sc->buf13, sc->buf14, sc->buf15, 1); - sc->ptr = 0; - sc->out_size = out_size; -} - -void mshabal_avx512f(mshabal512_context *sc, const void *data0, const void *data1, const void *data2, - const void *data3, const void *data4, const void *data5, const void *data6, const void *data7, - const void *data8, const void *data9, const void *data10, const void *data11, const void *data12, - const void *data13, const void *data14, const void *data15, size_t len) { - size_t 
ptr, num; - - if (data0 == NULL) { - if (data1 == NULL) { - if (data2 == NULL) { - if (data3 == NULL) { - if (data4 == NULL) { - if (data5 == NULL) { - if (data6 == NULL) { - if (data7 == NULL) { - if (data8 == NULL) { - if (data9 == NULL) { - if (data10 == NULL) { - if (data11 == NULL) { - if (data12 == NULL) { - if (data13 == NULL) { - if (data14 == NULL) { - if (data15 == NULL) { - return; - } else { - data0 = data15; - } - } else { - data0 = data14; - } - } else { - data0 = data13; - } - } else { - data0 = data12; - } - } else { - data0 = data11; - } - } else { - data0 = data10; - } - } else { - data0 = data9; - } - } else { - data0 = data8; - } - } else { - data0 = data7; - } - } else { - data0 = data6; - } - } else { - data0 = data5; - } - } else { - data0 = data4; - } - } else { - data0 = data3; - } - } else { - data0 = data2; - } - } else { - data0 = data1; - } - } - - if (data1 == NULL) data1 = data0; - if (data2 == NULL) data2 = data0; - if (data3 == NULL) data3 = data0; - if (data4 == NULL) data4 = data0; - if (data5 == NULL) data5 = data0; - if (data6 == NULL) data6 = data0; - if (data7 == NULL) data7 = data0; - if (data8 == NULL) data8 = data0; - if (data9 == NULL) data9 = data0; - if (data10 == NULL) data10 = data0; - if (data11 == NULL) data11 = data0; - if (data12 == NULL) data12 = data0; - if (data13 == NULL) data13 = data0; - if (data14 == NULL) data14 = data0; - if (data15 == NULL) data15 = data0; - - ptr = sc->ptr; - if (ptr != 0) { - size_t clen = (sizeof sc->buf0 - ptr); - if (clen > len) { - memcpy(sc->buf0 + ptr, data0, len); - memcpy(sc->buf1 + ptr, data1, len); - memcpy(sc->buf2 + ptr, data2, len); - memcpy(sc->buf3 + ptr, data3, len); - memcpy(sc->buf4 + ptr, data4, len); - memcpy(sc->buf5 + ptr, data5, len); - memcpy(sc->buf6 + ptr, data6, len); - memcpy(sc->buf7 + ptr, data7, len); - memcpy(sc->buf8 + ptr, data8, len); - memcpy(sc->buf9 + ptr, data9, len); - memcpy(sc->buf10 + ptr, data10, len); - memcpy(sc->buf11 + ptr, data11, len); - memcpy(sc->buf12 + ptr, data12, len); - memcpy(sc->buf13 + ptr, data13, len); - memcpy(sc->buf14 + ptr, data14, len); - memcpy(sc->buf15 + ptr, data15, len); - sc->ptr = ptr + len; - return; - } else { - memcpy(sc->buf0 + ptr, data0, clen); - memcpy(sc->buf1 + ptr, data1, clen); - memcpy(sc->buf2 + ptr, data2, clen); - memcpy(sc->buf3 + ptr, data3, clen); - memcpy(sc->buf4 + ptr, data4, clen); - memcpy(sc->buf5 + ptr, data5, clen); - memcpy(sc->buf6 + ptr, data6, clen); - memcpy(sc->buf7 + ptr, data7, clen); - memcpy(sc->buf8 + ptr, data8, clen); - memcpy(sc->buf9 + ptr, data9, clen); - memcpy(sc->buf10 + ptr, data10, clen); - memcpy(sc->buf11 + ptr, data11, clen); - memcpy(sc->buf12 + ptr, data12, clen); - memcpy(sc->buf13 + ptr, data13, clen); - memcpy(sc->buf14 + ptr, data14, clen); - memcpy(sc->buf15 + ptr, data15, clen); - mshabal_compress_avx512f(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, sc->buf4, sc->buf5, - sc->buf6, sc->buf7, sc->buf8, sc->buf9, sc->buf10, sc->buf11, - sc->buf12, sc->buf13, sc->buf14, sc->buf15, 1); - data0 = (const unsigned char *)data0 + clen; - data1 = (const unsigned char *)data1 + clen; - data2 = (const unsigned char *)data2 + clen; - data3 = (const unsigned char *)data3 + clen; - data4 = (const unsigned char *)data4 + clen; - data5 = (const unsigned char *)data5 + clen; - data6 = (const unsigned char *)data6 + clen; - data7 = (const unsigned char *)data7 + clen; - data8 = (const unsigned char *)data8 + clen; - data9 = (const unsigned char *)data9 + clen; - data10 = (const unsigned char *)data10 
+ clen; - data11 = (const unsigned char *)data11 + clen; - data12 = (const unsigned char *)data12 + clen; - data13 = (const unsigned char *)data13 + clen; - data14 = (const unsigned char *)data14 + clen; - data15 = (const unsigned char *)data15 + clen; - len -= clen; - } - } - - num = len >> 6; - if (num != 0) { - mshabal_compress_avx512f(sc, data0, data1, data2, data3, data4, data5, data6, data7, - data8, data9, data10, data11, data12, data13, data14, data15, num); - data0 = (const unsigned char *)data0 + (num << 6); - data1 = (const unsigned char *)data1 + (num << 6); - data2 = (const unsigned char *)data2 + (num << 6); - data3 = (const unsigned char *)data3 + (num << 6); - data4 = (const unsigned char *)data4 + (num << 6); - data5 = (const unsigned char *)data5 + (num << 6); - data6 = (const unsigned char *)data6 + (num << 6); - data7 = (const unsigned char *)data7 + (num << 6); - data8 = (const unsigned char *)data8 + (num << 6); - data9 = (const unsigned char *)data9 + (num << 6); - data10 = (const unsigned char *)data10 + (num << 6); - data11 = (const unsigned char *)data11 + (num << 6); - data12 = (const unsigned char *)data12 + (num << 6); - data13 = (const unsigned char *)data13 + (num << 6); - data14 = (const unsigned char *)data14 + (num << 6); - data15 = (const unsigned char *)data15 + (num << 6); - } - len &= (size_t)63; - memcpy(sc->buf0, data0, len); - memcpy(sc->buf1, data1, len); - memcpy(sc->buf2, data2, len); - memcpy(sc->buf3, data3, len); - memcpy(sc->buf4, data4, len); - memcpy(sc->buf5, data5, len); - memcpy(sc->buf6, data6, len); - memcpy(sc->buf7, data7, len); - memcpy(sc->buf8, data8, len); - memcpy(sc->buf9, data9, len); - memcpy(sc->buf10, data10, len); - memcpy(sc->buf11, data11, len); - memcpy(sc->buf12, data12, len); - memcpy(sc->buf13, data13, len); - memcpy(sc->buf14, data14, len); - memcpy(sc->buf15, data15, len); - sc->ptr = len; -} - -void mshabal_close_avx512f(mshabal512_context *sc, unsigned ub0, unsigned ub1, unsigned ub2, - unsigned ub3, unsigned ub4, unsigned ub5, unsigned ub6, unsigned ub7, - unsigned ub8, unsigned ub9, unsigned ub10, unsigned ub11, unsigned ub12, - unsigned ub13, unsigned ub14, unsigned ub15, - unsigned n, void *dst0, void *dst1, void *dst2, void *dst3, void *dst4, - void *dst5, void *dst6, void *dst7, void *dst8, void *dst9, void *dst10, - void *dst11, void *dst12, void *dst13, void *dst14, void *dst15) { - size_t ptr, off; - unsigned z, out_size_w32; - - z = 0x80 >> n; - ptr = sc->ptr; - sc->buf0[ptr] = (ub0 & -z) | z; - sc->buf1[ptr] = (ub1 & -z) | z; - sc->buf2[ptr] = (ub2 & -z) | z; - sc->buf3[ptr] = (ub3 & -z) | z; - sc->buf4[ptr] = (ub4 & -z) | z; - sc->buf5[ptr] = (ub5 & -z) | z; - sc->buf6[ptr] = (ub6 & -z) | z; - sc->buf7[ptr] = (ub7 & -z) | z; - sc->buf8[ptr] = (ub8 & -z) | z; - sc->buf9[ptr] = (ub9 & -z) | z; - sc->buf10[ptr] = (ub10 & -z) | z; - sc->buf11[ptr] = (ub11 & -z) | z; - sc->buf12[ptr] = (ub12 & -z) | z; - sc->buf13[ptr] = (ub13 & -z) | z; - sc->buf14[ptr] = (ub14 & -z) | z; - sc->buf15[ptr] = (ub15 & -z) | z; - ptr++; - memset(sc->buf0 + ptr, 0, (sizeof sc->buf0) - ptr); - memset(sc->buf1 + ptr, 0, (sizeof sc->buf1) - ptr); - memset(sc->buf2 + ptr, 0, (sizeof sc->buf2) - ptr); - memset(sc->buf3 + ptr, 0, (sizeof sc->buf3) - ptr); - memset(sc->buf4 + ptr, 0, (sizeof sc->buf4) - ptr); - memset(sc->buf5 + ptr, 0, (sizeof sc->buf5) - ptr); - memset(sc->buf6 + ptr, 0, (sizeof sc->buf6) - ptr); - memset(sc->buf7 + ptr, 0, (sizeof sc->buf7) - ptr); - memset(sc->buf8 + ptr, 0, (sizeof sc->buf8) - ptr); - 
memset(sc->buf9 + ptr, 0, (sizeof sc->buf9) - ptr); - memset(sc->buf10 + ptr, 0, (sizeof sc->buf10) - ptr); - memset(sc->buf11 + ptr, 0, (sizeof sc->buf11) - ptr); - memset(sc->buf12 + ptr, 0, (sizeof sc->buf12) - ptr); - memset(sc->buf13 + ptr, 0, (sizeof sc->buf13) - ptr); - memset(sc->buf14 + ptr, 0, (sizeof sc->buf14) - ptr); - memset(sc->buf15 + ptr, 0, (sizeof sc->buf15) - ptr); - for (z = 0; z < 4; z++) { - mshabal_compress_avx512f(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, sc->buf4, sc->buf5, - sc->buf6, sc->buf7, sc->buf8, sc->buf9, sc->buf10, sc->buf11, sc->buf12, sc->buf13, - sc->buf14, sc->buf15, 1); - if (sc->Wlow-- == 0) sc->Whigh--; - } - out_size_w32 = sc->out_size >> 5; - off = MSHABAL512_VECTOR_SIZE * (28 + (16 - out_size_w32)); - if (dst0 != NULL) { - u32 *out; - - out = (u32 *)dst0; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 0]; - } - if (dst1 != NULL) { - u32 *out; - - out = (u32 *)dst1; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 1]; - } - if (dst2 != NULL) { - u32 *out; - - out = (u32 *)dst2; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 2]; - } - if (dst3 != NULL) { - u32 *out; - - out = (u32 *)dst3; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 3]; - } - if (dst4 != NULL) { - u32 *out; - - out = (u32 *)dst4; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 4]; - } - if (dst5 != NULL) { - u32 *out; - - out = (u32 *)dst5; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 5]; - } - if (dst6 != NULL) { - u32 *out; - - out = (u32 *)dst6; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 6]; - } - if (dst7 != NULL) { - u32 *out; - - out = (u32 *)dst7; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 7]; - } - if (dst8 != NULL) { - u32 *out; - - out = (u32 *)dst8; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 8]; - } - if (dst9 != NULL) { - u32 *out; - - out = (u32 *)dst9; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 9]; - } - if (dst10 != NULL) { - u32 *out; - - out = (u32 *)dst10; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 10]; - } - if (dst11 != NULL) { - u32 *out; - - out = (u32 *)dst11; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 11]; - } - if (dst12 != NULL) { - u32 *out; - - out = (u32 *)dst12; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 12]; - } - if (dst13 != NULL) { - u32 *out; - - out = (u32 *)dst13; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 13]; - } - if (dst14 != NULL) { - u32 *out; - - out = (u32 *)dst14; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 14]; - } - if (dst15 != NULL) { - u32 *out; - - out = (u32 *)dst15; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 15]; - } -} - -// Shabal routine optimized for plotting and hashing -void mshabal_hash_fast_avx512f(mshabal512_context_fast *sc, void *message, void *termination, - void *dst, unsigned num) { - union input { - u32 words[16 * MSHABAL512_VECTOR_SIZE]; - __m512i 
data[16]; - }; - size_t j; - __m512i A[12], B[16], C[16]; - __m512i one; - - for (j = 0; j < 12; j++) A[j] = _mm512_loadu_si512((__m512i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm512_loadu_si512((__m512i *)sc->state + j + 12); - C[j] = _mm512_loadu_si512((__m512i *)sc->state + j + 28); - } - one = _mm512_set1_epi32(C32(0xFFFFFFFF)); - - // round 1 -#define M(i) _mm512_load_si512((__m512i *)message + i) - - while (num-- > 0) { - for (j = 0; j < 16; j++) B[j] = _mm512_add_epi32(B[j], M(j)); - - A[0] = _mm512_xor_si512(A[0], _mm512_set1_epi32(sc->Wlow)); - A[1] = _mm512_xor_si512(A[1], _mm512_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm512_or_si512(_mm512_slli_epi32(B[j], 17), _mm512_srli_epi32(B[j], 15)); - -#define PP512(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m512i tt; \ - tt = _mm512_or_si512(_mm512_slli_epi32(xa1, 15), _mm512_srli_epi32(xa1, 17)); \ - tt = _mm512_add_epi32(_mm512_slli_epi32(tt, 2), tt); \ - tt = _mm512_xor_si512(_mm512_xor_si512(xa0, tt), xc); \ - tt = _mm512_add_epi32(_mm512_slli_epi32(tt, 1), tt); \ - tt = _mm512_xor_si512(_mm512_xor_si512(tt, xb1), \ - _mm512_xor_si512(_mm512_andnot_si512(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm512_or_si512(_mm512_slli_epi32(tt, 1), _mm512_srli_epi32(tt, 31)); \ - xb0 = _mm512_xor_si512(tt, _mm512_xor_si512(xa0, one)); \ - } while (0) - - PP512(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP512(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP512(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP512(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP512(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP512(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP512(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP512(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP512(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP512(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP512(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP512(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP512(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP512(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP512(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP512(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP512(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP512(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP512(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP512(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP512(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP512(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP512(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP512(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP512(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP512(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP512(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP512(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP512(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP512(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], 
B[0x3], C[0xB], M(0xD)); - PP512(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP512(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP512(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP512(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP512(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP512(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP512(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP512(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP512(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP512(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP512(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP512(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP512(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP512(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP512(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP512(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP512(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP512(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm512_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm512_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm512_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB512(xb, xc, xm) \ - do { \ - __m512i tmp; \ - tmp = xb; \ - xb = _mm512_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB512(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB512(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB512(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB512(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB512(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB512(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB512(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB512(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB512(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB512(B[0x9], C[0x9], M(0x9)); - 
SWAP_AND_SUB512(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB512(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB512(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB512(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB512(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB512(B[0xF], C[0xF], M(0xF)); - - // move data pointer - message = (__m512i *)message + 16; - - if (++sc->Wlow == 0) sc->Whigh++; - } - - // round 2-5 -#define M2(i) _mm512_load_si512((__m512i *)termination + i) - - for (int k = 0; k < 4; k++) { - for (j = 0; j < 16; j++) B[j] = _mm512_add_epi32(B[j], M2(j)); - - A[0] = _mm512_xor_si512(A[0], _mm512_set1_epi32(sc->Wlow)); - A[1] = _mm512_xor_si512(A[1], _mm512_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm512_or_si512(_mm512_slli_epi32(B[j], 17), _mm512_srli_epi32(B[j], 15)); - - PP512(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP512(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP512(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP512(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP512(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP512(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP512(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP512(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP512(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP512(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP512(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP512(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP512(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP512(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP512(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP512(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP512(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP512(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP512(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP512(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP512(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP512(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP512(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP512(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP512(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP512(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP512(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP512(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP512(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP512(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP512(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP512(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP512(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP512(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP512(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP512(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP512(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP512(A[0x1], A[0x0], B[0x5], 
B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP512(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP512(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP512(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP512(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP512(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP512(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP512(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP512(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP512(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP512(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - A[0xB] = _mm512_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm512_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm512_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0x3]); - - SWAP_AND_SUB512(B[0x0], C[0x0], M2(0x0)); - SWAP_AND_SUB512(B[0x1], C[0x1], M2(0x1)); - SWAP_AND_SUB512(B[0x2], C[0x2], M2(0x2)); - SWAP_AND_SUB512(B[0x3], C[0x3], M2(0x3)); - SWAP_AND_SUB512(B[0x4], C[0x4], M2(0x4)); - SWAP_AND_SUB512(B[0x5], C[0x5], M2(0x5)); - SWAP_AND_SUB512(B[0x6], C[0x6], M2(0x6)); - SWAP_AND_SUB512(B[0x7], C[0x7], M2(0x7)); - SWAP_AND_SUB512(B[0x8], C[0x8], M2(0x8)); - SWAP_AND_SUB512(B[0x9], C[0x9], M2(0x9)); - SWAP_AND_SUB512(B[0xA], C[0xA], M2(0xA)); - SWAP_AND_SUB512(B[0xB], C[0xB], M2(0xB)); - SWAP_AND_SUB512(B[0xC], C[0xC], M2(0xC)); - SWAP_AND_SUB512(B[0xD], C[0xD], M2(0xD)); - SWAP_AND_SUB512(B[0xE], C[0xE], M2(0xE)); - SWAP_AND_SUB512(B[0xF], C[0xF], M2(0xF)); - - if (++sc->Wlow == 0) sc->Whigh++; - - if (sc->Wlow-- == 0) sc->Whigh--; - } - - // download SIMD aligned hashes - for (j = 0; j < 8; j++) { - _mm512_storeu_si512((__m512i *)dst + j, C[j+8]); - } - - // reset Wlow & Whigh - sc->Wlow = 1; - sc->Whigh = 0; -} - -// Shabal routine optimized for mining -void mshabal_deadline_fast_avx512f(mshabal512_context_fast *sc, void *message, void *termination, void *dst0, - void *dst1, void *dst2, void 
*dst3, void *dst4, void *dst5, - void *dst6, void *dst7, void *dst8, void *dst9, void *dst10, - void *dst11, void *dst12, void *dst13, void *dst14, - void *dst15) { - union input { - u32 words[16 * MSHABAL512_VECTOR_SIZE]; - __m512i data[16]; - }; - size_t j; - __m512i A[12], B[16], C[16]; - __m512i one; - - for (j = 0; j < 12; j++) A[j] = _mm512_loadu_si512((__m512i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm512_loadu_si512((__m512i *)sc->state + j + 12); - C[j] = _mm512_loadu_si512((__m512i *)sc->state + j + 28); - } - one = _mm512_set1_epi32(C32(0xFFFFFFFF)); - - // round 1 -#define M(i) _mm512_load_si512((__m512i *)message + i) - - for (j = 0; j < 16; j++) B[j] = _mm512_add_epi32(B[j], M(j)); - - A[0] = _mm512_xor_si512(A[0], _mm512_set1_epi32(sc->Wlow)); - A[1] = _mm512_xor_si512(A[1], _mm512_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm512_or_si512(_mm512_slli_epi32(B[j], 17), _mm512_srli_epi32(B[j], 15)); - -#define PP512(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m512i tt; \ - tt = _mm512_or_si512(_mm512_slli_epi32(xa1, 15), _mm512_srli_epi32(xa1, 17)); \ - tt = _mm512_add_epi32(_mm512_slli_epi32(tt, 2), tt); \ - tt = _mm512_xor_si512(_mm512_xor_si512(xa0, tt), xc); \ - tt = _mm512_add_epi32(_mm512_slli_epi32(tt, 1), tt); \ - tt = _mm512_xor_si512(_mm512_xor_si512(tt, xb1), \ - _mm512_xor_si512(_mm512_andnot_si512(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm512_or_si512(_mm512_slli_epi32(tt, 1), _mm512_srli_epi32(tt, 31)); \ - xb0 = _mm512_xor_si512(tt, _mm512_xor_si512(xa0, one)); \ - } while (0) - - PP512(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP512(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP512(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP512(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP512(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP512(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP512(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP512(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP512(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP512(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP512(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP512(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP512(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP512(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP512(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP512(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP512(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP512(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP512(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP512(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP512(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP512(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP512(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP512(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP512(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP512(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP512(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], 
C[0xE], M(0xA)); - PP512(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP512(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP512(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP512(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP512(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP512(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP512(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP512(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP512(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP512(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP512(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP512(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP512(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP512(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP512(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP512(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP512(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP512(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP512(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP512(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP512(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm512_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm512_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm512_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB512(xb, xc, xm) \ - do { \ - __m512i tmp; \ - tmp = xb; \ - xb = _mm512_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB512(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB512(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB512(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB512(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB512(B[0x4], C[0x4], M(0x4)); - 
SWAP_AND_SUB512(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB512(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB512(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB512(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB512(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB512(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB512(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB512(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB512(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB512(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB512(B[0xF], C[0xF], M(0xF)); - - if (++sc->Wlow == 0) sc->Whigh++; - - // round 2-5 -#define M2(i) _mm512_load_si512((__m512i *)termination + i) - - for (int k = 0; k < 4; k++) { - for (j = 0; j < 16; j++) B[j] = _mm512_add_epi32(B[j], M2(j)); - - A[0] = _mm512_xor_si512(A[0], _mm512_set1_epi32(sc->Wlow)); - A[1] = _mm512_xor_si512(A[1], _mm512_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm512_or_si512(_mm512_slli_epi32(B[j], 17), _mm512_srli_epi32(B[j], 15)); - - PP512(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP512(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP512(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP512(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP512(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP512(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP512(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP512(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP512(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP512(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP512(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP512(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP512(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP512(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP512(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP512(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP512(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP512(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP512(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP512(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP512(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP512(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP512(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP512(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP512(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP512(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP512(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP512(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP512(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP512(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP512(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP512(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP512(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP512(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP512(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP512(A[0xB], A[0xA], B[0x3], 
B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP512(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP512(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP512(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP512(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP512(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP512(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP512(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP512(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP512(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP512(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP512(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP512(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - A[0xB] = _mm512_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm512_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm512_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0x3]); - - SWAP_AND_SUB512(B[0x0], C[0x0], M2(0x0)); - SWAP_AND_SUB512(B[0x1], C[0x1], M2(0x1)); - SWAP_AND_SUB512(B[0x2], C[0x2], M2(0x2)); - SWAP_AND_SUB512(B[0x3], C[0x3], M2(0x3)); - SWAP_AND_SUB512(B[0x4], C[0x4], M2(0x4)); - SWAP_AND_SUB512(B[0x5], C[0x5], M2(0x5)); - SWAP_AND_SUB512(B[0x6], C[0x6], M2(0x6)); - SWAP_AND_SUB512(B[0x7], C[0x7], M2(0x7)); - SWAP_AND_SUB512(B[0x8], C[0x8], M2(0x8)); - SWAP_AND_SUB512(B[0x9], C[0x9], M2(0x9)); - SWAP_AND_SUB512(B[0xA], C[0xA], M2(0xA)); - SWAP_AND_SUB512(B[0xB], C[0xB], M2(0xB)); - SWAP_AND_SUB512(B[0xC], C[0xC], M2(0xC)); - SWAP_AND_SUB512(B[0xD], C[0xD], M2(0xD)); - SWAP_AND_SUB512(B[0xE], C[0xE], M2(0xE)); - SWAP_AND_SUB512(B[0xF], C[0xF], M2(0xF)); - - if (++sc->Wlow == 0) sc->Whigh++; - - if (sc->Wlow-- == 0) sc->Whigh--; - } - - // download SIMD aligned deadlines - u32 simd_dst[32]; - _mm512_storeu_si512((__m512i *)&simd_dst[0], C[8]); - _mm512_storeu_si512((__m512i *)&simd_dst[16], C[9]); - - // unpack SIMD data - unsigned z; - for (z = 
0; z < 2; z++) { - unsigned y = z * MSHABAL512_VECTOR_SIZE; - ((u32 *)dst0)[z] = simd_dst[y + 0]; - ((u32 *)dst1)[z] = simd_dst[y + 1]; - ((u32 *)dst2)[z] = simd_dst[y + 2]; - ((u32 *)dst3)[z] = simd_dst[y + 3]; - ((u32 *)dst4)[z] = simd_dst[y + 4]; - ((u32 *)dst5)[z] = simd_dst[y + 5]; - ((u32 *)dst6)[z] = simd_dst[y + 6]; - ((u32 *)dst7)[z] = simd_dst[y + 7]; - ((u32 *)dst8)[z] = simd_dst[y + 8]; - ((u32 *)dst9)[z] = simd_dst[y + 9]; - ((u32 *)dst10)[z] = simd_dst[y + 10]; - ((u32 *)dst11)[z] = simd_dst[y + 11]; - ((u32 *)dst12)[z] = simd_dst[y + 12]; - ((u32 *)dst13)[z] = simd_dst[y + 13]; - ((u32 *)dst14)[z] = simd_dst[y + 14]; - ((u32 *)dst15)[z] = simd_dst[y + 15]; - } - - // reset Wlow & Whigh - sc->Wlow = 1; - sc->Whigh = 0; -} - -#ifdef __cplusplus -} -#endif diff --git a/src/c/mshabal_512_avx512f.h b/src/c/mshabal_512_avx512f.h deleted file mode 100644 index 57455f5..0000000 --- a/src/c/mshabal_512_avx512f.h +++ /dev/null @@ -1,195 +0,0 @@ -/* - * A parallel implementation of Shabal, for platforms with AVX512F. - * - * This is the header file for an implementation of the Shabal family - * of hash functions, designed for maximum parallel speed. It processes - * up to sixteen instances of Shabal in parallel, using the AVX512F unit. - * Total bandwidth appears to be up to twice that of a plain 32-bit - * Shabal implementation. - * - * A computation uses a mshabal_context structure. That structure is - * supposed to be allocated and released by the caller, e.g. as a - * local or global variable, or on the heap. The structure contents - * are initialized with mshabal_init(). Once the structure has been - * initialized, data is input as chunks, with the mshabal() functions. - * Chunks for the sixteen parallel instances are provided simultaneously - * and must have the same length. It is allowed not to use some of the - * instances; the corresponding parameters in mshabal() are then NULL. - * However, using NULL as a chunk for one of the instances effectively - * deactivates that instance; this cannot be used to "skip" a chunk - * for one instance. - * - * The computation is finalized with mshabal_close(). Some extra message - * bits (0 to 7) can be input. The outputs of the sixteen parallel instances - * are written in the provided buffers. There again, NULL can be - * provided as parameter if the output of one of the instances is not - * needed. - * - * A mshabal_context instance is self-contained and holds no pointer. - * Thus, it can be cloned (e.g. with memcpy()) or moved (as long as - * proper alignment is maintained). This implementation uses no state - * variable beyond the context instance; thus, it is thread-safe and - * reentrant. - * - * The Shabal specification defines Shabal with output sizes of 192, - * 224, 256, 384 and 512 bits. This code accepts all those sizes, as - * well as any output size which is a multiple of 32, between 32 and - * 512 (inclusive). - * - * Parameters are not validated. Thus, undefined behaviour occurs if - * any of the "shall" or "must" clauses in this documentation is - * violated. - * - * - * (c) 2010 SAPHIR project. This software is provided 'as-is', without - * any express or implied warranty. In no event will the authors be held - * liable for any damages arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to no restriction. 
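The header text above describes the allocate/init/input/close flow only in prose, so a minimal usage sketch of the sixteen-lane API declared in this header may help; the helper name hash16, the 64-byte message length and the 256-bit output size are illustrative assumptions, not part of the original source:

    #include "mshabal_512_avx512f.h"

    /* Hash sixteen 64-byte messages in parallel into sixteen 256-bit digests. */
    static void hash16(const unsigned char msg[16][64], unsigned char dig[16][32]) {
        mshabal512_context sc;
        mshabal_init_avx512f(&sc, 256);
        mshabal_avx512f(&sc, msg[0], msg[1], msg[2], msg[3], msg[4], msg[5],
                        msg[6], msg[7], msg[8], msg[9], msg[10], msg[11],
                        msg[12], msg[13], msg[14], msg[15], 64);
        /* byte-aligned input: no extra bits, so every ub* and n are zero */
        mshabal_close_avx512f(&sc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                              dig[0], dig[1], dig[2], dig[3], dig[4], dig[5],
                              dig[6], dig[7], dig[8], dig[9], dig[10], dig[11],
                              dig[12], dig[13], dig[14], dig[15]);
    }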
- * - * Technical remarks and questions can be addressed to: - */ - -#ifndef MSHABAL_H__ -#define MSHABAL_H__ - -#include <stddef.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * We need an integer type with width 32-bit or more (preferably, with - * a width of exactly 32 bits). - */ -#if defined __STDC__ && __STDC_VERSION__ >= 199901L -#include <stdint.h> -#ifdef UINT32_MAX -typedef uint32_t mshabal_u32; -#else -typedef uint_fast32_t mshabal_u32; -#endif -#else -#if ((UINT_MAX >> 11) >> 11) >= 0x3FF -typedef unsigned int mshabal_u32; -#else -typedef unsigned long mshabal_u32; -#endif -#endif - -#define MSHABAL512_VECTOR_SIZE 16 - -/* - * The context structure for a Shabal computation. Contents are - * private. Such a structure should be allocated and released by - * the caller, in any memory area. - */ -typedef struct { - unsigned char buf0[64]; - unsigned char buf1[64]; - unsigned char buf2[64]; - unsigned char buf3[64]; - unsigned char buf4[64]; - unsigned char buf5[64]; - unsigned char buf6[64]; - unsigned char buf7[64]; - unsigned char buf8[64]; - unsigned char buf9[64]; - unsigned char buf10[64]; - unsigned char buf11[64]; - unsigned char buf12[64]; - unsigned char buf13[64]; - unsigned char buf14[64]; - unsigned char buf15[64]; - size_t ptr; - mshabal_u32 state[(12 + 16 + 16) * MSHABAL512_VECTOR_SIZE]; - mshabal_u32 Whigh, Wlow; - unsigned out_size; -} mshabal512_context; - -#pragma pack(1) -typedef struct { - mshabal_u32 state[(12 + 16 + 16) * MSHABAL512_VECTOR_SIZE]; - mshabal_u32 Whigh, Wlow; - unsigned out_size; -} mshabal512_context_fast; -#pragma pack() - -/* - * Initialize a context structure. The output size must be a multiple - * of 32, between 32 and 512 (inclusive). The output size is expressed - * in bits. - */ -void mshabal_init_avx512f(mshabal512_context *sc, unsigned out_size); - -/* - * Process some more data bytes; sixteen chunks of data, pointed to by - * data0 through data15, are processed. The sixteen chunks have - * the same length of "len" bytes. For efficiency, it is best if data is - * processed by medium-sized chunks, e.g. a few kilobytes at a time. - * - * The "len" data bytes shall all be accessible. If "len" is zero, this - * function does nothing and ignores the data* arguments. - * Otherwise, if one of the data* arguments is NULL, then the - * corresponding instance is deactivated (the final value obtained from - * that instance is undefined). - */ -void mshabal_avx512f(mshabal512_context *sc, const void *data0, const void *data1, const void *data2, const void *data3, - const void *data4, const void *data5, const void *data6, const void *data7, const void *data8, const void *data9, - const void *data10, const void *data11, const void *data12, const void *data13, const void *data14, - const void *data15, size_t len); - -/* - * Terminate the Shabal computation incarnated by the provided context - * structure. "n" shall be a value between 0 and 7 (inclusive): this is - * the number of extra bits to extract from ub0 through ub15, and - * append at the end of the input message for each of the sixteen parallel - * instances. Bits in "ub*" are taken in big-endian format: first bit is - * the one of numerical value 128, second bit has numerical value 64, - * and so on. Other bits in "ub*" are ignored. For most applications, - * input messages will consist of a sequence of bytes, and the "ub*" and - * "n" parameters will be zero. 
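Since the lane-interleaved state layout that the close routine reads back is easy to get wrong, here is a scalar sketch of the per-lane digest extraction, consistent with the extraction loops in mshabal_close_avx512f above; the helper name extract_lane is hypothetical:

    #include "mshabal_512_avx512f.h"

    /* state[] holds (12 + 16 + 16) vectors of sixteen lane-interleaved
     * 32-bit words, so word z of lane k lives at
     * state[off + z * MSHABAL512_VECTOR_SIZE + k].
     * out_size_w32 is the digest size in 32-bit words (out_size >> 5). */
    static void extract_lane(const mshabal_u32 *state, unsigned out_size_w32,
                             unsigned lane, mshabal_u32 *out) {
        size_t off = MSHABAL512_VECTOR_SIZE * (28 + (16 - out_size_w32));
        for (unsigned z = 0; z < out_size_w32; z++)
            out[z] = state[off + z * MSHABAL512_VECTOR_SIZE + lane];
    }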
- * - * The Shabal output for each of the parallel instances is written out - * in the areas pointed to by, respectively, dst0 through dst15. - * These areas shall be wide enough to accommodate the result (result - * size was specified as parameter to mshabal_init()). It is acceptable - * to use NULL for any of those pointers, if the result from the - * corresponding instance is not needed. - * - * After this call, the context structure is invalid. The caller shall - * release it, or reinitialize it with mshabal_init(). The mshabal_close() - * function does NOT imply a hidden call to mshabal_init(). - */ -void mshabal_close_avx512f(mshabal512_context *sc, unsigned ub0, unsigned ub1, unsigned ub2, - unsigned ub3, unsigned ub4, unsigned ub5, unsigned ub6, unsigned ub7, - unsigned ub8, unsigned ub9, unsigned ub10, unsigned ub11, unsigned ub12, - unsigned ub13, unsigned ub14, unsigned ub15, unsigned n, void *dst0, - void *dst1, void *dst2, void *dst3, void *dst4, void *dst5, void *dst6, - void *dst7, void *dst8, void *dst9, void *dst10, void *dst11, - void *dst12, void *dst13, void *dst14, void *dst15); - -/* - * optimised Shabal routine for PoC plotting and hashing - */ -void mshabal_hash_fast_avx512f(mshabal512_context_fast *sc, void *message, void *termination, - void *dst, unsigned len); - -/* - * optimised Shabal routine for PoC mining - */ -void mshabal_deadline_fast_avx512f(mshabal512_context_fast *sc, void *message, void *termination, void *dst0, - void *dst1, void *dst2, void *dst3, void *dst4, void *dst5, - void *dst6, void *dst7, void *dst8, void *dst9, void *dst10, - void *dst11, void *dst12, void *dst13, void *dst14, - void *dst15); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/src/c/shabal.c b/src/c/shabal.c deleted file mode 100644 index c817e29..0000000 --- a/src/c/shabal.c +++ /dev/null @@ -1,13 +0,0 @@ -#include "shabal.h" -#include <stdint.h> -#include "common.h" -#include "sph_shabal.h" - -void find_best_deadline_sph(char *scoops, uint64_t nonce_count, char *gensig, - uint64_t *best_deadline, uint64_t *best_offset) { - uint64_t dl = 0; - for (uint64_t i = 0; i < nonce_count; i++){ - sph_shabal_deadline_fast(&scoops[i * 64], gensig, &dl); - SET_BEST_DEADLINE(dl, i); - } -} \ No newline at end of file diff --git a/src/c/shabal.h b/src/c/shabal.h deleted file mode 100644 index ebb741f..0000000 --- a/src/c/shabal.h +++ /dev/null @@ -1,7 +0,0 @@ -#pragma once - -#include <stddef.h> -#include <stdint.h> - -void find_best_deadline_sph(char *scoops, uint64_t nonce_count, char *gensig, - uint64_t *best_deadline, uint64_t *best_offset); diff --git a/src/c/shabal_avx.c b/src/c/shabal_avx.c deleted file mode 100644 index 70e29c8..0000000 --- a/src/c/shabal_avx.c +++ /dev/null @@ -1,75 +0,0 @@ -#include "shabal_avx.h" -#include <immintrin.h> -#include <string.h> -#include "common.h" -#include "mshabal_128_avx.h" -#include "sph_shabal.h" - -mshabal128_context global_128; -mshabal128_context_fast global_128_fast; - -void init_shabal_avx() { - mshabal_init_avx(&global_128, 256); - global_128_fast.out_size = global_128.out_size; - for (uint64_t i = 0; i < 176; i++) global_128_fast.state[i] = global_128.state[i]; - global_128_fast.Whigh = global_128.Whigh; - global_128_fast.Wlow = global_128.Wlow; -} - -void find_best_deadline_avx(char *scoops, uint64_t nonce_count, char *gensig, - uint64_t *best_deadline, uint64_t *best_offset) { - uint64_t d0 = 0, d1 = 0, d2 = 0, d3 = 0; - char term[32]; - write_term(term); - - // local copy of global fast context - mshabal128_context_fast x; - memcpy(&x, &global_128_fast, 
sizeof(global_128_fast)); - - // prepare shabal inputs - union { - mshabal_u32 words[16 * MSHABAL128_VECTOR_SIZE]; - __m128i data[16]; - } u1, u2; - - for (uint64_t i = 0; i < 16 * MSHABAL128_VECTOR_SIZE / 2; i += MSHABAL128_VECTOR_SIZE) { - size_t o = i; - u1.words[i + 0] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 1] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 2] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 3] = *(mshabal_u32 *)(gensig + o); - u2.words[i + 0 + 32] = *(mshabal_u32 *)(term + o); - u2.words[i + 1 + 32] = *(mshabal_u32 *)(term + o); - u2.words[i + 2 + 32] = *(mshabal_u32 *)(term + o); - u2.words[i + 3 + 32] = *(mshabal_u32 *)(term + o); - } - - for (uint64_t i = 0; i < nonce_count;) { - if (i + 4 <= nonce_count) { - // load and align data for SIMD - for (uint64_t j = 0; j < 16 * MSHABAL128_VECTOR_SIZE / 2; j += MSHABAL128_VECTOR_SIZE) { - size_t o = j; - u1.words[j + 0 + 32] = *(mshabal_u32 *)(&scoops[(i + 0) * 64] + o); - u1.words[j + 1 + 32] = *(mshabal_u32 *)(&scoops[(i + 1) * 64] + o); - u1.words[j + 2 + 32] = *(mshabal_u32 *)(&scoops[(i + 2) * 64] + o); - u1.words[j + 3 + 32] = *(mshabal_u32 *)(&scoops[(i + 3) * 64] + o); - u2.words[j + 0] = *(mshabal_u32 *)(&scoops[(i + 0) * 64 + 32] + o); - u2.words[j + 1] = *(mshabal_u32 *)(&scoops[(i + 1) * 64 + 32] + o); - u2.words[j + 2] = *(mshabal_u32 *)(&scoops[(i + 2) * 64 + 32] + o); - u2.words[j + 3] = *(mshabal_u32 *)(&scoops[(i + 3) * 64 + 32] + o); - } - - mshabal_deadline_fast_avx(&x, &u1, &u2, &d0, &d1, &d2, &d3); - - SET_BEST_DEADLINE(d0, i + 0); - SET_BEST_DEADLINE(d1, i + 1); - SET_BEST_DEADLINE(d2, i + 2); - SET_BEST_DEADLINE(d3, i + 3); - i += 4; - } else { - sph_shabal_deadline_fast(&scoops[i * 64], gensig, &d0); - SET_BEST_DEADLINE(d0, i); - i++; - } - } -} diff --git a/src/c/shabal_avx.h b/src/c/shabal_avx.h deleted file mode 100644 index b6b7dd4..0000000 --- a/src/c/shabal_avx.h +++ /dev/null @@ -1,9 +0,0 @@ -#pragma once - -#include <stddef.h> -#include <stdint.h> - -void init_shabal_avx(); - -void find_best_deadline_avx(char *scoops, uint64_t nonce_count, char *gensig, - uint64_t *best_deadline, uint64_t *best_offset); diff --git a/src/c/shabal_avx2.c b/src/c/shabal_avx2.c deleted file mode 100644 index 4802bff..0000000 --- a/src/c/shabal_avx2.c +++ /dev/null @@ -1,95 +0,0 @@ -#include "shabal_avx2.h" -#include <immintrin.h> -#include <string.h> -#include "common.h" -#include "mshabal_256_avx2.h" -#include "sph_shabal.h" - -mshabal256_context global_256; -mshabal256_context_fast global_256_fast; - -void init_shabal_avx2() { - mshabal_init_avx2(&global_256, 256); - global_256_fast.out_size = global_256.out_size; - for (uint64_t i = 0; i < 352; i++) global_256_fast.state[i] = global_256.state[i]; - global_256_fast.Whigh = global_256.Whigh; - global_256_fast.Wlow = global_256.Wlow; -} - -void find_best_deadline_avx2(char *scoops, uint64_t nonce_count, char *gensig, - uint64_t *best_deadline, uint64_t *best_offset) { - uint64_t d0 = 0, d1 = 0, d2 = 0, d3 = 0, d4 = 0, d5 = 0, d6 = 0, d7 = 0; - char term[32]; - write_term(term); - - // local copy of global fast context - mshabal256_context_fast x; - memcpy(&x, &global_256_fast, sizeof(global_256_fast)); - - // prepare shabal inputs - union { - mshabal_u32 words[16 * MSHABAL256_VECTOR_SIZE]; - __m256i data[16]; - } u1, u2; - - for (uint64_t i = 0; i < 16 * MSHABAL256_VECTOR_SIZE / 2; i += MSHABAL256_VECTOR_SIZE) { - size_t o = i / 2; - u1.words[i + 0] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 1] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 2] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 3] = 
*(mshabal_u32 *)(gensig + o); - u1.words[i + 4] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 5] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 6] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 7] = *(mshabal_u32 *)(gensig + o); - u2.words[i + 0 + 64] = *(mshabal_u32 *)(term + o); - u2.words[i + 1 + 64] = *(mshabal_u32 *)(term + o); - u2.words[i + 2 + 64] = *(mshabal_u32 *)(term + o); - u2.words[i + 3 + 64] = *(mshabal_u32 *)(term + o); - u2.words[i + 4 + 64] = *(mshabal_u32 *)(term + o); - u2.words[i + 5 + 64] = *(mshabal_u32 *)(term + o); - u2.words[i + 6 + 64] = *(mshabal_u32 *)(term + o); - u2.words[i + 7 + 64] = *(mshabal_u32 *)(term + o); - } - - for (uint64_t i = 0; i < nonce_count;) { - if (i + 8 <= nonce_count) { - // load and align data for SIMD - for (uint64_t j = 0; j < 16 * MSHABAL256_VECTOR_SIZE / 2; j += MSHABAL256_VECTOR_SIZE) { - size_t o = j / 2; - u1.words[j + 0 + 64] = *(mshabal_u32 *)(&scoops[(i + 0) * 64] + o); - u1.words[j + 1 + 64] = *(mshabal_u32 *)(&scoops[(i + 1) * 64] + o); - u1.words[j + 2 + 64] = *(mshabal_u32 *)(&scoops[(i + 2) * 64] + o); - u1.words[j + 3 + 64] = *(mshabal_u32 *)(&scoops[(i + 3) * 64] + o); - u1.words[j + 4 + 64] = *(mshabal_u32 *)(&scoops[(i + 4) * 64] + o); - u1.words[j + 5 + 64] = *(mshabal_u32 *)(&scoops[(i + 5) * 64] + o); - u1.words[j + 6 + 64] = *(mshabal_u32 *)(&scoops[(i + 6) * 64] + o); - u1.words[j + 7 + 64] = *(mshabal_u32 *)(&scoops[(i + 7) * 64] + o); - u2.words[j + 0] = *(mshabal_u32 *)(&scoops[(i + 0) * 64 + 32] + o); - u2.words[j + 1] = *(mshabal_u32 *)(&scoops[(i + 1) * 64 + 32] + o); - u2.words[j + 2] = *(mshabal_u32 *)(&scoops[(i + 2) * 64 + 32] + o); - u2.words[j + 3] = *(mshabal_u32 *)(&scoops[(i + 3) * 64 + 32] + o); - u2.words[j + 4] = *(mshabal_u32 *)(&scoops[(i + 4) * 64 + 32] + o); - u2.words[j + 5] = *(mshabal_u32 *)(&scoops[(i + 5) * 64 + 32] + o); - u2.words[j + 6] = *(mshabal_u32 *)(&scoops[(i + 6) * 64 + 32] + o); - u2.words[j + 7] = *(mshabal_u32 *)(&scoops[(i + 7) * 64 + 32] + o); - } - - mshabal_deadline_fast_avx2(&x, &u1, &u2, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); - - SET_BEST_DEADLINE(d0, i + 0); - SET_BEST_DEADLINE(d1, i + 1); - SET_BEST_DEADLINE(d2, i + 2); - SET_BEST_DEADLINE(d3, i + 3); - SET_BEST_DEADLINE(d4, i + 4); - SET_BEST_DEADLINE(d5, i + 5); - SET_BEST_DEADLINE(d6, i + 6); - SET_BEST_DEADLINE(d7, i + 7); - i += 8; - } else { - sph_shabal_deadline_fast(&scoops[i * 64], gensig, &d0); - SET_BEST_DEADLINE(d0, i); - i++; - } - } -} diff --git a/src/c/shabal_avx2.h b/src/c/shabal_avx2.h deleted file mode 100644 index fa433b6..0000000 --- a/src/c/shabal_avx2.h +++ /dev/null @@ -1,9 +0,0 @@ -#pragma once - -#include <stddef.h> -#include <stdint.h> - -void init_shabal_avx2(); - -void find_best_deadline_avx2(char *scoops, uint64_t nonce_count, char *gensig, - uint64_t *best_deadline, uint64_t *best_offset); diff --git a/src/c/shabal_avx512f.c b/src/c/shabal_avx512f.c deleted file mode 100644 index 3b7b1b3..0000000 --- a/src/c/shabal_avx512f.c +++ /dev/null @@ -1,138 +0,0 @@ -#include "shabal_avx512f.h" -#include <immintrin.h> -#include <string.h> -#include "common.h" -#include "mshabal_512_avx512f.h" -#include "sph_shabal.h" - -mshabal512_context global_512; -mshabal512_context_fast global_512_fast; - -void init_shabal_avx512f() { - mshabal_init_avx512f(&global_512, 256); - global_512_fast.out_size = global_512.out_size; - for (uint64_t i = 0; i < 704; i++) global_512_fast.state[i] = global_512.state[i]; - global_512_fast.Whigh = global_512.Whigh; - global_512_fast.Wlow = global_512.Wlow; -} - -void find_best_deadline_avx512f(char *scoops, 
uint64_t nonce_count, char *gensig, - uint64_t *best_deadline, uint64_t *best_offset) { - uint64_t d0 = 0, d1 = 0, d2 = 0, d3 = 0, d4 = 0, d5 = 0, d6 = 0, d7 = 0, d8 = 0, d9 = 0, - d10 = 0, d11 = 0, d12 = 0, d13 = 0, d14 = 0, d15 = 0; - char term[32]; - write_term(term); - - // local copy of global fast context - mshabal512_context_fast x; - memcpy(&x, &global_512_fast, sizeof(global_512_fast)); - - // prepare shabal inputs - union { - mshabal_u32 words[16 * MSHABAL512_VECTOR_SIZE]; - __m512i data[16]; - } u1, u2; - - for (uint64_t i = 0; i < 16 * MSHABAL512_VECTOR_SIZE / 2; i += MSHABAL512_VECTOR_SIZE) { - size_t o = i / 4; - u1.words[i + 0] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 1] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 2] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 3] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 4] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 5] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 6] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 7] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 8] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 9] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 10] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 11] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 12] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 13] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 14] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 15] = *(mshabal_u32 *)(gensig + o); - u2.words[i + 0 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 1 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 2 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 3 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 4 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 5 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 6 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 7 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 8 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 9 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 10 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 11 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 12 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 13 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 14 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 15 + 128] = *(mshabal_u32 *)(term + o); - } - - for (uint64_t i = 0; i < nonce_count;) { - if (i + 16 <= nonce_count) { - // load and align data for SIMD - - for (uint64_t j = 0; j < 16 * MSHABAL512_VECTOR_SIZE / 2; j += MSHABAL512_VECTOR_SIZE) { - size_t o = j / 4; - u1.words[j + 0 + 128] = *(mshabal_u32 *)(&scoops[(i + 0) * 64] + o); - u1.words[j + 1 + 128] = *(mshabal_u32 *)(&scoops[(i + 1) * 64] + o); - u1.words[j + 2 + 128] = *(mshabal_u32 *)(&scoops[(i + 2) * 64] + o); - u1.words[j + 3 + 128] = *(mshabal_u32 *)(&scoops[(i + 3) * 64] + o); - u1.words[j + 4 + 128] = *(mshabal_u32 *)(&scoops[(i + 4) * 64] + o); - u1.words[j + 5 + 128] = *(mshabal_u32 *)(&scoops[(i + 5) * 64] + o); - u1.words[j + 6 + 128] = *(mshabal_u32 *)(&scoops[(i + 6) * 64] + o); - u1.words[j + 7 + 128] = *(mshabal_u32 *)(&scoops[(i + 7) * 64] + o); - u1.words[j + 8 + 128] = *(mshabal_u32 *)(&scoops[(i + 8) * 64] + o); - u1.words[j + 9 + 128] = *(mshabal_u32 *)(&scoops[(i + 9) * 64] + o); - u1.words[j + 10 + 128] = *(mshabal_u32 *)(&scoops[(i + 10) * 64] + o); - u1.words[j + 11 + 128] = *(mshabal_u32 *)(&scoops[(i + 11) * 64] + o); - u1.words[j + 12 + 128] = *(mshabal_u32 *)(&scoops[(i + 12) * 64] + o); - u1.words[j + 13 + 128] = *(mshabal_u32 *)(&scoops[(i + 13) * 64] + o); - u1.words[j + 
14 + 128] = *(mshabal_u32 *)(&scoops[(i + 14) * 64] + o); - u1.words[j + 15 + 128] = *(mshabal_u32 *)(&scoops[(i + 15) * 64] + o); - u2.words[j + 0] = *(mshabal_u32 *)(&scoops[(i + 0) * 64 + 32] + o); - u2.words[j + 1] = *(mshabal_u32 *)(&scoops[(i + 1) * 64 + 32] + o); - u2.words[j + 2] = *(mshabal_u32 *)(&scoops[(i + 2) * 64 + 32] + o); - u2.words[j + 3] = *(mshabal_u32 *)(&scoops[(i + 3) * 64 + 32] + o); - u2.words[j + 4] = *(mshabal_u32 *)(&scoops[(i + 4) * 64 + 32] + o); - u2.words[j + 5] = *(mshabal_u32 *)(&scoops[(i + 5) * 64 + 32] + o); - u2.words[j + 6] = *(mshabal_u32 *)(&scoops[(i + 6) * 64 + 32] + o); - u2.words[j + 7] = *(mshabal_u32 *)(&scoops[(i + 7) * 64 + 32] + o); - u2.words[j + 8] = *(mshabal_u32 *)(&scoops[(i + 8) * 64 + 32] + o); - u2.words[j + 9] = *(mshabal_u32 *)(&scoops[(i + 9) * 64 + 32] + o); - u2.words[j + 10] = *(mshabal_u32 *)(&scoops[(i + 10) * 64 + 32] + o); - u2.words[j + 11] = *(mshabal_u32 *)(&scoops[(i + 11) * 64 + 32] + o); - u2.words[j + 12] = *(mshabal_u32 *)(&scoops[(i + 12) * 64 + 32] + o); - u2.words[j + 13] = *(mshabal_u32 *)(&scoops[(i + 13) * 64 + 32] + o); - u2.words[j + 14] = *(mshabal_u32 *)(&scoops[(i + 14) * 64 + 32] + o); - u2.words[j + 15] = *(mshabal_u32 *)(&scoops[(i + 15) * 64 + 32] + o); - } - - mshabal_deadline_fast_avx512f(&x, &u1, &u2, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, - &d8, &d9, &d10, &d11, &d12, &d13, &d14, &d15); - - SET_BEST_DEADLINE(d0, i + 0); - SET_BEST_DEADLINE(d1, i + 1); - SET_BEST_DEADLINE(d2, i + 2); - SET_BEST_DEADLINE(d3, i + 3); - SET_BEST_DEADLINE(d4, i + 4); - SET_BEST_DEADLINE(d5, i + 5); - SET_BEST_DEADLINE(d6, i + 6); - SET_BEST_DEADLINE(d7, i + 7); - SET_BEST_DEADLINE(d8, i + 8); - SET_BEST_DEADLINE(d9, i + 9); - SET_BEST_DEADLINE(d10, i + 10); - SET_BEST_DEADLINE(d11, i + 11); - SET_BEST_DEADLINE(d12, i + 12); - SET_BEST_DEADLINE(d13, i + 13); - SET_BEST_DEADLINE(d14, i + 14); - SET_BEST_DEADLINE(d15, i + 15); - i += 16; - } else { - sph_shabal_deadline_fast(&scoops[i * 64], gensig, &d0); - SET_BEST_DEADLINE(d0, i); - i++; - } - } -} diff --git a/src/c/shabal_avx512f.h b/src/c/shabal_avx512f.h deleted file mode 100644 index 0a6f1ed..0000000 --- a/src/c/shabal_avx512f.h +++ /dev/null @@ -1,9 +0,0 @@ -#pragma once - -#include -#include - -void init_shabal_avx512f(); - -void find_best_deadline_avx512f(char *scoops, uint64_t nonce_count, char *gensig, - uint64_t *best_deadline, uint64_t *best_offset); diff --git a/src/c/shabal_neon.c b/src/c/shabal_neon.c deleted file mode 100644 index 313642b..0000000 --- a/src/c/shabal_neon.c +++ /dev/null @@ -1,75 +0,0 @@ -#include "shabal_neon.h" -#include "SSE2NEON.h" -#include -#include "common.h" -#include "mshabal_128_neon.h" -#include "sph_shabal.h" - -mshabal128_context global_128; -mshabal128_context_fast global_128_fast; - -void init_shabal_neon() { - mshabal_init_neon(&global_128, 256); - global_128_fast.out_size = global_128.out_size; - for (uint64_t i = 0; i < 176; i++) global_128_fast.state[i] = global_128.state[i]; - global_128_fast.Whigh = global_128.Whigh; - global_128_fast.Wlow = global_128.Wlow; -} - -void find_best_deadline_neon(char *scoops, uint64_t nonce_count, char *gensig, - uint64_t *best_deadline, uint64_t *best_offset) { - uint64_t d0 = 0, d1 = 0, d2 = 0, d3 = 0; - char term[32]; - write_term(term); - - // local copy of global fast context - mshabal128_context_fast x; - memcpy(&x, &global_128_fast, sizeof(global_128_fast)); - - // prepare shabal inputs - union { - mshabal_u32 words[16 * MSHABAL128_VECTOR_SIZE]; - __m128i data[16]; - } u1, u2; - 
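    /*
     * Layout note, as a minimal sketch (assuming MSHABAL128_VECTOR_SIZE == 4):
     * mshabal interleaves its four lanes word by word, so 32-bit word w of
     * lane l lives at words[w * 4 + l], and broadcasting one word v to every
     * lane amounts to
     *
     *     for (size_t l = 0; l < 4; l++)
     *         words[w * 4 + l] = v;
     *
     * The generation signature and the termination block are identical for
     * every nonce, so the loop below fills their lanes once, up front; only
     * the per-nonce scoop halves are reloaded inside the main loop. The AVX2
     * and AVX-512F variants above follow the same pattern with 8 and 16 lanes.
     */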
- for (uint64_t i = 0; i < 16 * MSHABAL128_VECTOR_SIZE / 2; i += MSHABAL128_VECTOR_SIZE) { - size_t o = i; - u1.words[i + 0] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 1] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 2] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 3] = *(mshabal_u32 *)(gensig + o); - u2.words[i + 0 + 32] = *(mshabal_u32 *)(term + o); - u2.words[i + 1 + 32] = *(mshabal_u32 *)(term + o); - u2.words[i + 2 + 32] = *(mshabal_u32 *)(term + o); - u2.words[i + 3 + 32] = *(mshabal_u32 *)(term + o); - } - - for (uint64_t i = 0; i < nonce_count;) { - if (i + 4 <= nonce_count) { - // load and align data for SIMD - for (uint64_t j = 0; j < 16 * MSHABAL128_VECTOR_SIZE / 2; j += MSHABAL128_VECTOR_SIZE) { - size_t o = j; - u1.words[j + 0 + 32] = *(mshabal_u32 *)(&scoops[(i + 0) * 64] + o); - u1.words[j + 1 + 32] = *(mshabal_u32 *)(&scoops[(i + 1) * 64] + o); - u1.words[j + 2 + 32] = *(mshabal_u32 *)(&scoops[(i + 2) * 64] + o); - u1.words[j + 3 + 32] = *(mshabal_u32 *)(&scoops[(i + 3) * 64] + o); - u2.words[j + 0] = *(mshabal_u32 *)(&scoops[(i + 0) * 64 + 32] + o); - u2.words[j + 1] = *(mshabal_u32 *)(&scoops[(i + 1) * 64 + 32] + o); - u2.words[j + 2] = *(mshabal_u32 *)(&scoops[(i + 2) * 64 + 32] + o); - u2.words[j + 3] = *(mshabal_u32 *)(&scoops[(i + 3) * 64 + 32] + o); - } - - mshabal_deadline_fast_neon(&x, &u1, &u2, &d0, &d1, &d2, &d3); - - SET_BEST_DEADLINE(d0, i + 0); - SET_BEST_DEADLINE(d1, i + 1); - SET_BEST_DEADLINE(d2, i + 2); - SET_BEST_DEADLINE(d3, i + 3); - i += 4; - } else { - sph_shabal_deadline_fast(&scoops[i * 64], gensig, &d0); - SET_BEST_DEADLINE(d0, i); - i++; - } - } -} diff --git a/src/c/shabal_neon.h b/src/c/shabal_neon.h deleted file mode 100644 index faca1c1..0000000 --- a/src/c/shabal_neon.h +++ /dev/null @@ -1,9 +0,0 @@ -#pragma once - -#include -#include - -void init_shabal_neon(); - -void find_best_deadline_neon(char *scoops, uint64_t nonce_count, char *gensig, - uint64_t *best_deadline, uint64_t *best_offset); diff --git a/src/c/shabal_sse2.c b/src/c/shabal_sse2.c deleted file mode 100644 index 4f5e593..0000000 --- a/src/c/shabal_sse2.c +++ /dev/null @@ -1,75 +0,0 @@ -#include "shabal_sse2.h" -#include -#include -#include "common.h" -#include "mshabal_128_sse2.h" -#include "sph_shabal.h" - -mshabal128_context global_128; -mshabal128_context_fast global_128_fast; - -void init_shabal_sse2() { - mshabal_init_sse2(&global_128, 256); - global_128_fast.out_size = global_128.out_size; - for (uint64_t i = 0; i < 176; i++) global_128_fast.state[i] = global_128.state[i]; - global_128_fast.Whigh = global_128.Whigh; - global_128_fast.Wlow = global_128.Wlow; -} - -void find_best_deadline_sse2(char *scoops, uint64_t nonce_count, char *gensig, - uint64_t *best_deadline, uint64_t *best_offset) { - uint64_t d0 = 0, d1 = 0, d2 = 0, d3 = 0; - char term[32]; - write_term(term); - - // local copy of global fast context - mshabal128_context_fast x; - memcpy(&x, &global_128_fast, sizeof(global_128_fast)); - - // prepare shabal inputs - union { - mshabal_u32 words[16 * MSHABAL128_VECTOR_SIZE]; - __m128i data[16]; - } u1, u2; - - for (uint64_t i = 0; i < 16 * MSHABAL128_VECTOR_SIZE / 2; i += MSHABAL128_VECTOR_SIZE) { - size_t o = i; - u1.words[i + 0] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 1] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 2] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 3] = *(mshabal_u32 *)(gensig + o); - u2.words[i + 0 + 32] = *(mshabal_u32 *)(term + o); - u2.words[i + 1 + 32] = *(mshabal_u32 *)(term + o); - u2.words[i + 2 + 32] = *(mshabal_u32 *)(term 
+ o); - u2.words[i + 3 + 32] = *(mshabal_u32 *)(term + o); - } - - for (uint64_t i = 0; i < nonce_count;) { - if (i + 4 <= nonce_count) { - // load and align data for SIMD - for (uint64_t j = 0; j < 16 * MSHABAL128_VECTOR_SIZE / 2; j += MSHABAL128_VECTOR_SIZE) { - size_t o = j; - u1.words[j + 0 + 32] = *(mshabal_u32 *)(&scoops[(i + 0) * 64] + o); - u1.words[j + 1 + 32] = *(mshabal_u32 *)(&scoops[(i + 1) * 64] + o); - u1.words[j + 2 + 32] = *(mshabal_u32 *)(&scoops[(i + 2) * 64] + o); - u1.words[j + 3 + 32] = *(mshabal_u32 *)(&scoops[(i + 3) * 64] + o); - u2.words[j + 0] = *(mshabal_u32 *)(&scoops[(i + 0) * 64 + 32] + o); - u2.words[j + 1] = *(mshabal_u32 *)(&scoops[(i + 1) * 64 + 32] + o); - u2.words[j + 2] = *(mshabal_u32 *)(&scoops[(i + 2) * 64 + 32] + o); - u2.words[j + 3] = *(mshabal_u32 *)(&scoops[(i + 3) * 64 + 32] + o); - } - - mshabal_deadline_fast_sse2(&x, &u1, &u2, &d0, &d1, &d2, &d3); - - SET_BEST_DEADLINE(d0, i + 0); - SET_BEST_DEADLINE(d1, i + 1); - SET_BEST_DEADLINE(d2, i + 2); - SET_BEST_DEADLINE(d3, i + 3); - i += 4; - } else { - sph_shabal_deadline_fast(&scoops[i * 64], gensig, &d0); - SET_BEST_DEADLINE(d0, i); - i++; - } - } -} diff --git a/src/c/shabal_sse2.h b/src/c/shabal_sse2.h deleted file mode 100644 index 56aa827..0000000 --- a/src/c/shabal_sse2.h +++ /dev/null @@ -1,9 +0,0 @@ -#pragma once - -#include -#include - -void init_shabal_sse2(); - -void find_best_deadline_sse2(char *scoops, uint64_t nonce_count, char *gensig, - uint64_t *best_deadline, uint64_t *best_offset); diff --git a/src/c/sph_shabal.c b/src/c/sph_shabal.c deleted file mode 100644 index c1507b7..0000000 --- a/src/c/sph_shabal.c +++ /dev/null @@ -1,693 +0,0 @@ -/* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */ -/* - * Shabal implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include -#include - -#include "sph_shabal.h" - -#ifdef _MSC_VER -#pragma warning(disable : 4146) -#endif - -/* - * Part of this code was automatically generated (the part between - * the "BEGIN" and "END" markers). - */ - -#define sM 16 - -//#define C32 SPH_C32 -//#define T32 SPH_T32 - -//#define O1 13 -//#define O2 9 -//#define O3 6 - -/* - * We copy the state into local variables, so that the compiler knows - * that it can optimize them at will. 
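 * (The DECL_STATE / READ_STATE / WRITE_STATE macros that follow do exactly
 * that: the whole A/B/C state is mirrored into plain locals for the
 * duration of a call, then written back once at the end.)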
- */ - -/* BEGIN -- automatically generated code. */ - -#define DECL_STATE \ - sph_u32 A00, A01, A02, A03, A04, A05, A06, A07, A08, A09, A0A, A0B; \ - sph_u32 B0, B1, B2, B3, B4, B5, B6, B7, B8, B9, BA, BB, BC, BD, BE, BF; \ - sph_u32 C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, CA, CB, CC, CD, CE, CF; \ - sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; \ - sph_u32 Wlow, Whigh; - -#define READ_STATE(state) \ - do { \ - A00 = (state)->A[0]; \ - A01 = (state)->A[1]; \ - A02 = (state)->A[2]; \ - A03 = (state)->A[3]; \ - A04 = (state)->A[4]; \ - A05 = (state)->A[5]; \ - A06 = (state)->A[6]; \ - A07 = (state)->A[7]; \ - A08 = (state)->A[8]; \ - A09 = (state)->A[9]; \ - A0A = (state)->A[10]; \ - A0B = (state)->A[11]; \ - B0 = (state)->B[0]; \ - B1 = (state)->B[1]; \ - B2 = (state)->B[2]; \ - B3 = (state)->B[3]; \ - B4 = (state)->B[4]; \ - B5 = (state)->B[5]; \ - B6 = (state)->B[6]; \ - B7 = (state)->B[7]; \ - B8 = (state)->B[8]; \ - B9 = (state)->B[9]; \ - BA = (state)->B[10]; \ - BB = (state)->B[11]; \ - BC = (state)->B[12]; \ - BD = (state)->B[13]; \ - BE = (state)->B[14]; \ - BF = (state)->B[15]; \ - C0 = (state)->C[0]; \ - C1 = (state)->C[1]; \ - C2 = (state)->C[2]; \ - C3 = (state)->C[3]; \ - C4 = (state)->C[4]; \ - C5 = (state)->C[5]; \ - C6 = (state)->C[6]; \ - C7 = (state)->C[7]; \ - C8 = (state)->C[8]; \ - C9 = (state)->C[9]; \ - CA = (state)->C[10]; \ - CB = (state)->C[11]; \ - CC = (state)->C[12]; \ - CD = (state)->C[13]; \ - CE = (state)->C[14]; \ - CF = (state)->C[15]; \ - Wlow = (state)->Wlow; \ - Whigh = (state)->Whigh; \ - } while (0) - -#define WRITE_STATE(state) \ - do { \ - (state)->A[0] = A00; \ - (state)->A[1] = A01; \ - (state)->A[2] = A02; \ - (state)->A[3] = A03; \ - (state)->A[4] = A04; \ - (state)->A[5] = A05; \ - (state)->A[6] = A06; \ - (state)->A[7] = A07; \ - (state)->A[8] = A08; \ - (state)->A[9] = A09; \ - (state)->A[10] = A0A; \ - (state)->A[11] = A0B; \ - (state)->B[0] = B0; \ - (state)->B[1] = B1; \ - (state)->B[2] = B2; \ - (state)->B[3] = B3; \ - (state)->B[4] = B4; \ - (state)->B[5] = B5; \ - (state)->B[6] = B6; \ - (state)->B[7] = B7; \ - (state)->B[8] = B8; \ - (state)->B[9] = B9; \ - (state)->B[10] = BA; \ - (state)->B[11] = BB; \ - (state)->B[12] = BC; \ - (state)->B[13] = BD; \ - (state)->B[14] = BE; \ - (state)->B[15] = BF; \ - (state)->C[0] = C0; \ - (state)->C[1] = C1; \ - (state)->C[2] = C2; \ - (state)->C[3] = C3; \ - (state)->C[4] = C4; \ - (state)->C[5] = C5; \ - (state)->C[6] = C6; \ - (state)->C[7] = C7; \ - (state)->C[8] = C8; \ - (state)->C[9] = C9; \ - (state)->C[10] = CA; \ - (state)->C[11] = CB; \ - (state)->C[12] = CC; \ - (state)->C[13] = CD; \ - (state)->C[14] = CE; \ - (state)->C[15] = CF; \ - (state)->Wlow = Wlow; \ - (state)->Whigh = Whigh; \ - } while (0) - -#define DECODE_BLOCK \ - do { \ - M0 = sph_dec32le_aligned(buf + 0); \ - M1 = sph_dec32le_aligned(buf + 4); \ - M2 = sph_dec32le_aligned(buf + 8); \ - M3 = sph_dec32le_aligned(buf + 12); \ - M4 = sph_dec32le_aligned(buf + 16); \ - M5 = sph_dec32le_aligned(buf + 20); \ - M6 = sph_dec32le_aligned(buf + 24); \ - M7 = sph_dec32le_aligned(buf + 28); \ - M8 = sph_dec32le_aligned(buf + 32); \ - M9 = sph_dec32le_aligned(buf + 36); \ - MA = sph_dec32le_aligned(buf + 40); \ - MB = sph_dec32le_aligned(buf + 44); \ - MC = sph_dec32le_aligned(buf + 48); \ - MD = sph_dec32le_aligned(buf + 52); \ - ME = sph_dec32le_aligned(buf + 56); \ - MF = sph_dec32le_aligned(buf + 60); \ - } while (0) - -#define INPUT_BLOCK_ADD \ - do { \ - B0 = SPH_T32(B0 + M0); \ - B1 = SPH_T32(B1 + 
M1); \ - B2 = SPH_T32(B2 + M2); \ - B3 = SPH_T32(B3 + M3); \ - B4 = SPH_T32(B4 + M4); \ - B5 = SPH_T32(B5 + M5); \ - B6 = SPH_T32(B6 + M6); \ - B7 = SPH_T32(B7 + M7); \ - B8 = SPH_T32(B8 + M8); \ - B9 = SPH_T32(B9 + M9); \ - BA = SPH_T32(BA + MA); \ - BB = SPH_T32(BB + MB); \ - BC = SPH_T32(BC + MC); \ - BD = SPH_T32(BD + MD); \ - BE = SPH_T32(BE + ME); \ - BF = SPH_T32(BF + MF); \ - } while (0) - -#define INPUT_BLOCK_SUB \ - do { \ - C0 = SPH_T32(C0 - M0); \ - C1 = SPH_T32(C1 - M1); \ - C2 = SPH_T32(C2 - M2); \ - C3 = SPH_T32(C3 - M3); \ - C4 = SPH_T32(C4 - M4); \ - C5 = SPH_T32(C5 - M5); \ - C6 = SPH_T32(C6 - M6); \ - C7 = SPH_T32(C7 - M7); \ - C8 = SPH_T32(C8 - M8); \ - C9 = SPH_T32(C9 - M9); \ - CA = SPH_T32(CA - MA); \ - CB = SPH_T32(CB - MB); \ - CC = SPH_T32(CC - MC); \ - CD = SPH_T32(CD - MD); \ - CE = SPH_T32(CE - ME); \ - CF = SPH_T32(CF - MF); \ - } while (0) - -#define XOR_W \ - do { \ - A00 ^= Wlow; \ - A01 ^= Whigh; \ - } while (0) - -#define SWAP(v1, v2) \ - do { \ - sph_u32 tmp = (v1); \ - (v1) = (v2); \ - (v2) = tmp; \ - } while (0) - -#define SWAP_BC \ - do { \ - SWAP(B0, C0); \ - SWAP(B1, C1); \ - SWAP(B2, C2); \ - SWAP(B3, C3); \ - SWAP(B4, C4); \ - SWAP(B5, C5); \ - SWAP(B6, C6); \ - SWAP(B7, C7); \ - SWAP(B8, C8); \ - SWAP(B9, C9); \ - SWAP(BA, CA); \ - SWAP(BB, CB); \ - SWAP(BC, CC); \ - SWAP(BD, CD); \ - SWAP(BE, CE); \ - SWAP(BF, CF); \ - } while (0) - -#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - xa0 = SPH_T32((xa0 ^ (((xa1 << 15) | (xa1 >> 17)) * 5U) ^ xc) * 3U) ^ xb1 ^ (xb2 & ~xb3) ^ \ - xm; \ - xb0 = SPH_T32(~(((xb0 << 1) | (xb0 >> 31)) ^ xa0)); \ - } while (0) - -#define PERM_STEP_0 \ - do { \ - PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \ - PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \ - PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \ - PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \ - PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \ - PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \ - PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \ - PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \ - PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \ - PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \ - PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \ - PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \ - PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \ - PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \ - PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \ - PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \ - } while (0) - -#define PERM_STEP_1 \ - do { \ - PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \ - PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \ - PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \ - PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \ - PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \ - PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \ - PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \ - PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \ - PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \ - PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \ - PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \ - PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \ - PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \ - PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \ - PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \ - PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \ - } while (0) - -#define PERM_STEP_2 \ - do { \ - PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \ - PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \ - PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \ - PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \ - PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \ - 
PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \ - PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \ - PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \ - PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \ - PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \ - PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \ - PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \ - PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \ - PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \ - PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \ - PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \ - } while (0) - -#define APPLY_P \ - do { \ - B0 = SPH_T32(B0 << 17) | (B0 >> 15); \ - B1 = SPH_T32(B1 << 17) | (B1 >> 15); \ - B2 = SPH_T32(B2 << 17) | (B2 >> 15); \ - B3 = SPH_T32(B3 << 17) | (B3 >> 15); \ - B4 = SPH_T32(B4 << 17) | (B4 >> 15); \ - B5 = SPH_T32(B5 << 17) | (B5 >> 15); \ - B6 = SPH_T32(B6 << 17) | (B6 >> 15); \ - B7 = SPH_T32(B7 << 17) | (B7 >> 15); \ - B8 = SPH_T32(B8 << 17) | (B8 >> 15); \ - B9 = SPH_T32(B9 << 17) | (B9 >> 15); \ - BA = SPH_T32(BA << 17) | (BA >> 15); \ - BB = SPH_T32(BB << 17) | (BB >> 15); \ - BC = SPH_T32(BC << 17) | (BC >> 15); \ - BD = SPH_T32(BD << 17) | (BD >> 15); \ - BE = SPH_T32(BE << 17) | (BE >> 15); \ - BF = SPH_T32(BF << 17) | (BF >> 15); \ - PERM_STEP_0; \ - PERM_STEP_1; \ - PERM_STEP_2; \ - A0B = SPH_T32(A0B + C6); \ - A0A = SPH_T32(A0A + C5); \ - A09 = SPH_T32(A09 + C4); \ - A08 = SPH_T32(A08 + C3); \ - A07 = SPH_T32(A07 + C2); \ - A06 = SPH_T32(A06 + C1); \ - A05 = SPH_T32(A05 + C0); \ - A04 = SPH_T32(A04 + CF); \ - A03 = SPH_T32(A03 + CE); \ - A02 = SPH_T32(A02 + CD); \ - A01 = SPH_T32(A01 + CC); \ - A00 = SPH_T32(A00 + CB); \ - A0B = SPH_T32(A0B + CA); \ - A0A = SPH_T32(A0A + C9); \ - A09 = SPH_T32(A09 + C8); \ - A08 = SPH_T32(A08 + C7); \ - A07 = SPH_T32(A07 + C6); \ - A06 = SPH_T32(A06 + C5); \ - A05 = SPH_T32(A05 + C4); \ - A04 = SPH_T32(A04 + C3); \ - A03 = SPH_T32(A03 + C2); \ - A02 = SPH_T32(A02 + C1); \ - A01 = SPH_T32(A01 + C0); \ - A00 = SPH_T32(A00 + CF); \ - A0B = SPH_T32(A0B + CE); \ - A0A = SPH_T32(A0A + CD); \ - A09 = SPH_T32(A09 + CC); \ - A08 = SPH_T32(A08 + CB); \ - A07 = SPH_T32(A07 + CA); \ - A06 = SPH_T32(A06 + C9); \ - A05 = SPH_T32(A05 + C8); \ - A04 = SPH_T32(A04 + C7); \ - A03 = SPH_T32(A03 + C6); \ - A02 = SPH_T32(A02 + C5); \ - A01 = SPH_T32(A01 + C4); \ - A00 = SPH_T32(A00 + C3); \ - } while (0) - -#define INCR_W \ - do { \ - if ((Wlow = SPH_T32(Wlow + 1)) == 0) Whigh = SPH_T32(Whigh + 1); \ - } while (0) - -static const sph_u32 A_init_256[] = {SPH_C32(0x52F84552), SPH_C32(0xE54B7999), SPH_C32(0x2D8EE3EC), - SPH_C32(0xB9645191), SPH_C32(0xE0078B86), SPH_C32(0xBB7C44C9), - SPH_C32(0xD2B5C1CA), SPH_C32(0xB0D2EB8C), SPH_C32(0x14CE5A45), - SPH_C32(0x22AF50DC), SPH_C32(0xEFFDBC6B), SPH_C32(0xEB21B74A)}; - -static const sph_u32 B_init_256[] = { - SPH_C32(0xB555C6EE), SPH_C32(0x3E710596), SPH_C32(0xA72A652F), SPH_C32(0x9301515F), - SPH_C32(0xDA28C1FA), SPH_C32(0x696FD868), SPH_C32(0x9CB6BF72), SPH_C32(0x0AFE4002), - SPH_C32(0xA6E03615), SPH_C32(0x5138C1D4), SPH_C32(0xBE216306), SPH_C32(0xB38B8890), - SPH_C32(0x3EA8B96B), SPH_C32(0x3299ACE4), SPH_C32(0x30924DD4), SPH_C32(0x55CB34A5)}; - -static const sph_u32 C_init_256[] = { - SPH_C32(0xB405F031), SPH_C32(0xC4233EBA), SPH_C32(0xB3733979), SPH_C32(0xC0DD9D55), - SPH_C32(0xC51C28AE), SPH_C32(0xA327B8E1), SPH_C32(0x56C56167), SPH_C32(0xED614433), - SPH_C32(0x88B59D60), SPH_C32(0x60E2CEBA), SPH_C32(0x758B4B8B), SPH_C32(0x83E82A7F), - SPH_C32(0xBC968828), SPH_C32(0xE6E00BF7), SPH_C32(0xBA839E55), SPH_C32(0x9B491C60)}; - -/* END -- automatically generated 
code. */ - -void sph_shabal256_init(sph_shabal_context* cc) { - /* - * We have precomputed initial states for all the supported - * output bit lengths. - */ - // const sph_u32 *A_init, *B_init, *C_init; - // sph_shabal_context *sc; - - // A_init = A_init_256; - // B_init = B_init_256; - // C_init = C_init_256; - - // sc = (sph_shabal_context *) cc; - memcpy(cc->A, A_init_256, sizeof cc->A); - memcpy(cc->B, B_init_256, sizeof cc->B); - memcpy(cc->C, C_init_256, sizeof cc->C); - cc->Wlow = 1; - cc->Whigh = 0; - cc->ptr = 0; -} - -void sph_shabal256(void* cc, const unsigned char* data, size_t len) { - sph_shabal_context* sc; - unsigned char* buf; - size_t ptr; - DECL_STATE - - sc = (sph_shabal_context*)cc; - buf = sc->buf; - ptr = sc->ptr; - - /* - * We do not want to copy the state to local variables if the - * amount of data is less than what is needed to complete the - * current block. Note that it is anyway suboptimal to call - * this method many times for small chunks of data. - */ - if (len < (sizeof sc->buf) - ptr) { - memcpy(buf + ptr, data, len); - ptr += len; - sc->ptr = ptr; - return; - } - - READ_STATE(sc); - while (len > 0) { - size_t clen; - - clen = (sizeof sc->buf) - ptr; - if (clen > len) clen = len; - memcpy(buf + ptr, data, clen); - ptr += clen; - data += clen; - len -= clen; - if (ptr == sizeof sc->buf) { - DECODE_BLOCK; - INPUT_BLOCK_ADD; - XOR_W; - APPLY_P; - INPUT_BLOCK_SUB; - SWAP_BC; - INCR_W; - ptr = 0; - } - } - WRITE_STATE(sc); - sc->ptr = ptr; -} - -static void shabal_close(void* cc, unsigned ub, unsigned n, void* dst, unsigned size_words) { - sph_shabal_context* sc; - unsigned char* buf; - size_t ptr; - unsigned z; - union { - unsigned char tmp_out[64]; - sph_u32 dummy; - } u; - size_t out_len; - DECL_STATE - - sc = (sph_shabal_context*)cc; - buf = sc->buf; - ptr = sc->ptr; - z = 0x80 >> n; - buf[ptr] = ((ub & -z) | z) & 0xFF; - memset(buf + ptr + 1, 0, (sizeof sc->buf) - (ptr + 1)); - READ_STATE(sc); - DECODE_BLOCK; - INPUT_BLOCK_ADD; - XOR_W; - APPLY_P; - //#pragma loop(hint_parallel(3)) - for (int i = 0; i < 3; i++) { - SWAP_BC; - XOR_W; - APPLY_P; - } - - /* - * We just use our local variables; no need to go through - * the state structure. In order to share some code, we - * emit the relevant words into a temporary buffer, which - * we finally copy into the destination array. 
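 * (The three SWAP_BC / XOR_W / APPLY_P iterations above implement Shabal's
 * finalization: the permutation is applied three extra times to the final
 * block, with the counter W left unchanged, before the digest is read out
 * of the B words.)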
- */ - - sph_enc32le_aligned(u.tmp_out + 32, B8); - - sph_enc32le_aligned(u.tmp_out + 36, B9); - - sph_enc32le_aligned(u.tmp_out + 40, BA); - sph_enc32le_aligned(u.tmp_out + 44, BB); - sph_enc32le_aligned(u.tmp_out + 48, BC); - sph_enc32le_aligned(u.tmp_out + 52, BD); - sph_enc32le_aligned(u.tmp_out + 56, BE); - sph_enc32le_aligned(u.tmp_out + 60, BF); - - out_len = size_words << 2; - memcpy(dst, u.tmp_out + (sizeof u.tmp_out) - out_len, out_len); - // sph_shabal256_init(sc, size_words << 5); -} - -/* see sph_shabal.h */ -void sph_shabal256_close(void* cc, void* dst) { shabal_close(cc, 0, 0, dst, 8); } - -/* see sph_shabal.h */ -void sph_shabal256_addbits_and_close(void* cc, unsigned ub, unsigned n, void* dst) { - shabal_close(cc, ub, n, dst, 8); -} - -// Shabal routines optimized for plotting and hashing -void sph_shabal_hash_fast(void *message, void *termination, void* dst, unsigned num) { - sph_u32 - A00 = A_init_256[0], A01 = A_init_256[1], A02 = A_init_256[2], A03 = A_init_256[3], - A04 = A_init_256[4], A05 = A_init_256[5], A06 = A_init_256[6], A07 = A_init_256[7], - A08 = A_init_256[8], A09 = A_init_256[9], A0A = A_init_256[10], A0B = A_init_256[11]; - sph_u32 - B0 = B_init_256[0], B1 = B_init_256[1], B2 = B_init_256[2], B3 = B_init_256[3], - B4 = B_init_256[4], B5 = B_init_256[5], B6 = B_init_256[6], B7 = B_init_256[7], - B8 = B_init_256[8], B9 = B_init_256[9], BA = B_init_256[10], BB = B_init_256[11], - BC = B_init_256[12], BD = B_init_256[13], BE = B_init_256[14], BF = B_init_256[15]; - sph_u32 - C0 = C_init_256[0], C1 = C_init_256[1], C2 = C_init_256[2], C3 = C_init_256[3], - C4 = C_init_256[4], C5 = C_init_256[5], C6 = C_init_256[6], C7 = C_init_256[7], - C8 = C_init_256[8], C9 = C_init_256[9], CA = C_init_256[10], CB = C_init_256[11], - CC = C_init_256[12], CD = C_init_256[13], CE = C_init_256[14], CF = C_init_256[15]; - sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; - sph_u32 Wlow = 1, Whigh = 0; - - while (num-- > 0) { - M0 = ((unsigned int *)message)[0]; - M1 = ((unsigned int *)message)[1]; - M2 = ((unsigned int *)message)[2]; - M3 = ((unsigned int *)message)[3]; - M4 = ((unsigned int *)message)[4]; - M5 = ((unsigned int *)message)[5]; - M6 = ((unsigned int *)message)[6]; - M7 = ((unsigned int *)message)[7]; - M8 = ((unsigned int *)message)[8]; - M9 = ((unsigned int *)message)[9]; - MA = ((unsigned int *)message)[10]; - MB = ((unsigned int *)message)[11]; - MC = ((unsigned int *)message)[12]; - MD = ((unsigned int *)message)[13]; - ME = ((unsigned int *)message)[14]; - MF = ((unsigned int *)message)[15]; - - INPUT_BLOCK_ADD; - XOR_W; - APPLY_P; - INPUT_BLOCK_SUB; - SWAP_BC; - INCR_W; - - message = (unsigned int *)message + 16; - } - - M0 = ((unsigned int *)termination)[0]; - M1 = ((unsigned int *)termination)[1]; - M2 = ((unsigned int *)termination)[2]; - M3 = ((unsigned int *)termination)[3]; - M4 = ((unsigned int *)termination)[4]; - M5 = ((unsigned int *)termination)[5]; - M6 = ((unsigned int *)termination)[6]; - M7 = ((unsigned int *)termination)[7]; - M8 = ((unsigned int *)termination)[8]; - M9 = ((unsigned int *)termination)[9]; - MA = ((unsigned int *)termination)[10]; - MB = ((unsigned int *)termination)[11]; - MC = ((unsigned int *)termination)[12]; - MD = ((unsigned int *)termination)[13]; - ME = ((unsigned int *)termination)[14]; - MF = ((unsigned int *)termination)[15]; - - INPUT_BLOCK_ADD; - XOR_W; - APPLY_P; - - for (int i = 0; i < 3; i++) { - SWAP_BC; - XOR_W; - APPLY_P; - } - - sph_enc32le_aligned((sph_u32 *)dst, B8); - 
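    /*
     * The 256-bit digest is the last eight B words (B8..BF), serialized
     * little-endian, so dst receives the full 32-byte Shabal-256 output
     * here. The mining variant sph_shabal_deadline_fast below keeps only
     * B8/B9: the first eight digest bytes, which form the 64-bit deadline.
     */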
sph_enc32le_aligned((sph_u32 *)dst + 1, B9); - sph_enc32le_aligned((sph_u32 *)dst + 2, BA); - sph_enc32le_aligned((sph_u32 *)dst + 3, BB); - sph_enc32le_aligned((sph_u32 *)dst + 4, BC); - sph_enc32le_aligned((sph_u32 *)dst + 5, BD); - sph_enc32le_aligned((sph_u32 *)dst + 6, BE); - sph_enc32le_aligned((sph_u32 *)dst + 7, BF); -} - -// Shabal routines optimized for mining -void sph_shabal_deadline_fast(void *scoop_data, void *gen_sig, void *dst) { - sph_u32 - A00 = A_init_256[0], A01 = A_init_256[1], A02 = A_init_256[2], A03 = A_init_256[3], - A04 = A_init_256[4], A05 = A_init_256[5], A06 = A_init_256[6], A07 = A_init_256[7], - A08 = A_init_256[8], A09 = A_init_256[9], A0A = A_init_256[10], A0B = A_init_256[11]; - sph_u32 - B0 = B_init_256[0], B1 = B_init_256[1], B2 = B_init_256[2], B3 = B_init_256[3], - B4 = B_init_256[4], B5 = B_init_256[5], B6 = B_init_256[6], B7 = B_init_256[7], - B8 = B_init_256[8], B9 = B_init_256[9], BA = B_init_256[10], BB = B_init_256[11], - BC = B_init_256[12], BD = B_init_256[13], BE = B_init_256[14], BF = B_init_256[15]; - sph_u32 - C0 = C_init_256[0], C1 = C_init_256[1], C2 = C_init_256[2], C3 = C_init_256[3], - C4 = C_init_256[4], C5 = C_init_256[5], C6 = C_init_256[6], C7 = C_init_256[7], - C8 = C_init_256[8], C9 = C_init_256[9], CA = C_init_256[10], CB = C_init_256[11], - CC = C_init_256[12], CD = C_init_256[13], CE = C_init_256[14], CF = C_init_256[15]; - sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; - sph_u32 Wlow = 1, Whigh = 0; - - M0 = ((unsigned int *)gen_sig)[0]; - M1 = ((unsigned int *)gen_sig)[1]; - M2 = ((unsigned int *)gen_sig)[2]; - M3 = ((unsigned int *)gen_sig)[3]; - M4 = ((unsigned int *)gen_sig)[4]; - M5 = ((unsigned int *)gen_sig)[5]; - M6 = ((unsigned int *)gen_sig)[6]; - M7 = ((unsigned int *)gen_sig)[7]; - M8 = ((unsigned int *)scoop_data)[0]; - M9 = ((unsigned int *)scoop_data)[1]; - MA = ((unsigned int *)scoop_data)[2]; - MB = ((unsigned int *)scoop_data)[3]; - MC = ((unsigned int *)scoop_data)[4]; - MD = ((unsigned int *)scoop_data)[5]; - ME = ((unsigned int *)scoop_data)[6]; - MF = ((unsigned int *)scoop_data)[7]; - - INPUT_BLOCK_ADD; - XOR_W; - APPLY_P; - INPUT_BLOCK_SUB; - SWAP_BC; - INCR_W; - - M0 = ((unsigned int *)scoop_data)[8]; - M1 = ((unsigned int *)scoop_data)[9]; - M2 = ((unsigned int *)scoop_data)[10]; - M3 = ((unsigned int *)scoop_data)[11]; - M4 = ((unsigned int *)scoop_data)[12]; - M5 = ((unsigned int *)scoop_data)[13]; - M6 = ((unsigned int *)scoop_data)[14]; - M7 = ((unsigned int *)scoop_data)[15]; - M8 = 0x80; - M9 = MA = MB = MC = MD = ME = MF = 0; - - INPUT_BLOCK_ADD; - XOR_W; - APPLY_P; - - for (int i = 0; i < 3; i++) { - SWAP_BC; - XOR_W; - APPLY_P; - } - - sph_enc32le_aligned((sph_u32 *)dst, B8); - sph_enc32le_aligned((sph_u32 *)dst + 1, B9); -} \ No newline at end of file diff --git a/src/c/sph_shabal.h b/src/c/sph_shabal.h deleted file mode 100644 index ca6c772..0000000 --- a/src/c/sph_shabal.h +++ /dev/null @@ -1,133 +0,0 @@ -/* $Id: sph_shabal.h 175 2010-05-07 16:03:20Z tp $ */ -/** - * Shabal interface. Shabal is a family of functions which differ by - * their output size; this implementation defines Shabal for output - * sizes 192, 224, 256, 384 and 512 bits. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @file sph_shabal.h - * @author Thomas Pornin - */ - -#ifndef SPH_SHABAL_H__ -#define SPH_SHABAL_H__ - -#include <stddef.h> -#include "sph_types.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/** - * Output size (in bits) for Shabal-256. - */ -#define SPH_SIZE_shabal256 256 - -/** - * This structure is a context for Shabal computations: it contains the - * intermediate values and some data from the last entered block. Once - * a Shabal computation has been performed, the context can be reused for - * another computation. - * - * The contents of this structure are private. A running Shabal computation - * can be cloned by copying the context (e.g. with a simple - * memcpy()). - */ -typedef struct { -#ifndef DOXYGEN_IGNORE - unsigned char buf[64]; /* first field, for alignment */ - size_t ptr; - sph_u32 A[12], B[16], C[16]; - sph_u32 Whigh, Wlow; -#endif -} sph_shabal_context; - -/** - * Type for a Shabal-256 context (identical to the common context). - */ -typedef sph_shabal_context sph_shabal256_context; - -/** - * Initialize a Shabal-256 context. This process performs no memory - * allocation. - * - * @param cc the Shabal-256 context (pointer to a - * sph_shabal256_context) - */ -void sph_shabal256_init(sph_shabal_context* cc); - -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). - * - * @param cc the Shabal-256 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_shabal256(void* cc, const unsigned char* data, size_t len); - -/** - * Terminate the current Shabal-256 computation and output the result into - * the provided buffer. The destination buffer must be wide enough to - * accommodate the result (32 bytes). The context is automatically - * reinitialized. - * - * @param cc the Shabal-256 context - * @param dst the destination buffer - */ -void sph_shabal256_close(void* cc, void* dst); - -/** - * Add a few additional bits (0 to 7) to the current computation, then - * terminate it and output the result in the provided buffer, which must - * be wide enough to accommodate the result (32 bytes).
If bit number i - * in ub has value 2^i, then the extra bits are those - * numbered 7 downto 8-n (this is the big-endian convention at the byte - * level). The context is automatically reinitialized. - * - * @param cc the Shabal-256 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the destination buffer - */ -void sph_shabal256_addbits_and_close(void* cc, unsigned ub, unsigned n, void* dst); - -/* - * optimised Shabal routine for PoC plotting and hashing - */ -void sph_shabal_hash_fast(void *message, void *termination, void* dst, unsigned num); - -/* - * optimised Shabal routine for PoC mining - */ -void sph_shabal_deadline_fast(void *scoop_data, void *gen_sig, void *dst); - -#ifdef __cplusplus -} -#endif -#endif diff --git a/src/c/sph_types.h b/src/c/sph_types.h deleted file mode 100644 index b2bdef3..0000000 --- a/src/c/sph_types.h +++ /dev/null @@ -1,1912 +0,0 @@ -/* $Id: sph_types.h 260 2011-07-21 01:02:38Z tp $ */ -/** - * Basic type definitions. - * - * This header file defines the generic integer types that will be used - * for the implementation of hash functions; it also contains helper - * functions which encode and decode multi-byte integer values, using - * either little-endian or big-endian conventions. - * - * This file contains a compile-time test on the size of a byte - * (the unsigned char C type). If bytes are not octets, - * i.e. if they do not have a size of exactly 8 bits, then compilation - * is aborted. Architectures where bytes are not octets are relatively - * rare, even in the embedded devices market. We forbid non-octet bytes - * because there is no clear convention on how octet streams are encoded - * on such systems. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @file sph_types.h - * @author Thomas Pornin - */ - -#ifndef SPH_TYPES_H__ -#define SPH_TYPES_H__ - -#include - -/* - * All our I/O functions are defined over octet streams. We do not know - * how to handle input data if bytes are not octets. 
- */ -#if CHAR_BIT != 8 -#error This code requires 8-bit bytes -#endif - -/* ============= BEGIN documentation block for Doxygen ============ */ - -#ifdef DOXYGEN_IGNORE - -/** @mainpage sphlib C code documentation - * - * @section overview Overview - * - * sphlib is a library which contains implementations of - * various cryptographic hash functions. These pages have been generated - * with doxygen and - * document the API for the C implementations. - * - * The API is described in appropriate header files, which are available - * in the "Files" section. Each hash function family has its own header, - * whose name begins with "sph_" and contains the family - * name. For instance, the API for the RIPEMD hash functions is available - * in the header file sph_ripemd.h. - * - * @section principles API structure and conventions - * - * @subsection io Input/output conventions - * - * In all generality, hash functions operate over strings of bits. - * Individual bits are rarely encountered in C programming or actual - * communication protocols; most protocols converge on the ubiquitous - * "octet" which is a group of eight bits. Data is thus expressed as a - * stream of octets. The C programming language contains the notion of a - * "byte", which is a data unit managed under the type "unsigned - * char". The C standard prescribes that a byte should hold at - * least eight bits, but possibly more. Most modern architectures, even - * in the embedded world, feature eight-bit bytes, i.e. map bytes to - * octets. - * - * Nevertheless, for some of the implemented hash functions, an extra - * API has been added, which allows the input of arbitrary sequences of - * bits: when the computation is about to be closed, 1 to 7 extra bits - * can be added. The functions for which this API is implemented include - * the SHA-2 functions and all SHA-3 candidates. - * - * sphlib defines hash functions which may hash octet streams, - * i.e. streams of bits where the number of bits is a multiple of eight. - * The data input functions in the sphlib API expect data - * as anonymous pointers ("const void *") with a length - * (of type "size_t") which gives the input data chunk length - * in bytes. A byte is assumed to be an octet; the sph_types.h - * header contains a compile-time test which prevents compilation on - * architectures where this property is not met. - * - * The hash function output is also converted into bytes. All currently - * implemented hash functions have an output width which is a multiple of - * eight, and this is likely to remain true for new designs. - * - * Most hash functions internally convert input data into 32-bit or 64-bit - * words, using either little-endian or big-endian conversion. The hash - * output also often consists of such words, which are encoded into output - * bytes with a similar endianness convention. Some hash functions have - * been only loosely specified on that subject; when necessary, - * sphlib has been tested against published "reference" - * implementations in order to use the same conventions. - * - * @subsection shortname Function short name - * - * Each implemented hash function has a "short name" which is used - * internally to derive the identifiers for the functions and context - * structures which the function uses. For instance, MD5 has the short - * name "md5". Short names are listed in the next section, - * for the implemented hash functions.
In subsequent sections, the - * short name will be assumed to be "XXX": replace with the - * actual hash function name to get the C identifier. - * - * Note: some functions within the same family share the same core - * elements, such as update function or context structure. Correspondingly, - * some of the defined types or functions may actually be macros which - * transparently evaluate to another type or function name. - * - * @subsection context Context structure - * - * Each implemented hash function has its own context structure, available - * under the type name "sph_XXX_context" for the hash function - * with short name "XXX". This structure holds all needed - * state for a running hash computation. - * - * The contents of these structures are meant to be opaque, and private - * to the implementation. However, these contents are specified in the - * header files so that application code which uses sphlib - * may access the size of those structures. - * - * The caller is responsible for allocating the context structure, - * whether by dynamic allocation (malloc() or equivalent), - * static allocation (a global permanent variable), as an automatic - * variable ("on the stack"), or by any other means which ensures proper - * structure alignment. sphlib code performs no dynamic - * allocation by itself. - * - * The context must be initialized before use, using the - * sph_XXX_init() function. This function sets the context - * state to proper initial values for hashing. - * - * Since all state data is contained within the context structure, - * sphlib is thread-safe and reentrant: several hash - * computations may be performed in parallel, provided that they do not - * operate on the same context. Moreover, a running computation can be - * cloned by copying the context (with a simple memcpy()): - * the context and its clone are then independent and may be updated - * with new data and/or closed without interfering with each other. - * Similarly, a context structure can be moved in memory at will: - * context structures contain no pointer, in particular no pointer to - * themselves. - * - * @subsection dataio Data input - * - * Hashed data is input with the sph_XXX() function, which - * takes as parameters a pointer to the context, a pointer to the data - * to hash, and the number of data bytes to hash. The context is updated - * with the new data. - * - * Data can be input in one or several calls, with arbitrary input lengths. - * However, it is best, performance wise, to input data by relatively big - * chunks (say a few kilobytes), because this allows sphlib to - * optimize things and avoid internal copying. - * - * When all data has been input, the context can be closed with - * sph_XXX_close(). The hash output is computed and written - * into the provided buffer. The caller must take care to provide a - * buffer of appropriate length; e.g., when using SHA-1, the output is - * a 20-byte word, therefore the output buffer must be at least 20-byte - * long. - * - * For some hash functions, the sph_XXX_addbits_and_close() - * function can be used instead of sph_XXX_close(). This - * function can take a few extra bits to be added at - * the end of the input message. This allows hashing messages with a - * bit length which is not a multiple of 8. The extra bits are provided - * as an unsigned integer value, and a bit count. The bit count must be - * between 0 and 7, inclusive. The extra bits are provided as bits 7 to - * 0 (bits of numerical value 128, 64, 32... downto 0), in that order.
- * For instance, to add three bits of value 1, 1 and 0, the unsigned - * integer will have value 192 (1*128 + 1*64 + 0*32) and the bit count - * will be 3. - * - * The SPH_SIZE_XXX macro is defined for each hash function; - * it evaluates to the function output size, expressed in bits. For instance, - * SPH_SIZE_sha1 evaluates to 160. - * - * When closed, the context is automatically reinitialized and can be - * immediately used for another computation. It is not necessary to call - * sph_XXX_init() after a close. Note that - * sph_XXX_init() can still be called to "reset" a context, - * i.e. forget previously input data, and get back to the initial state. - * - * @subsection alignment Data alignment - * - * "Alignment" is a property of data, which is said to be "properly - * aligned" when its emplacement in memory is such that the data can - * be optimally read by full words. This depends on the type of access; - * basically, some hash functions will read data by 32-bit or 64-bit - * words. sphlib does not mandate such alignment for input - * data, but using aligned data can substantially improve performance. - * - * As a rule, it is best to input data by chunks whose length (in bytes) - * is a multiple of eight, and which begins at "generally aligned" - * addresses, such as the base address returned by a call to - * malloc(). - * - * @section functions Implemented functions - * - * We give here the list of implemented functions. They are grouped by - * family; to each family corresponds a specific header file. Each - * individual function has its associated "short name". Please refer to - * the documentation for that header file to get details on the hash - * function denomination and provenance. - * - * Note: the functions marked with a '(64)' in the list below are - * available only if the C compiler provides an integer type of length - * 64 bits or more. Such a type is mandatory in the latest C standard - * (ISO 9899:1999, aka "C99") and is present in several older compilers - * as well, so chances are that such a type is available. 
- * - * - HAVAL family: file sph_haval.h - * - HAVAL-128/3 (128-bit, 3 passes): short name: haval128_3 - * - HAVAL-128/4 (128-bit, 4 passes): short name: haval128_4 - * - HAVAL-128/5 (128-bit, 5 passes): short name: haval128_5 - * - HAVAL-160/3 (160-bit, 3 passes): short name: haval160_3 - * - HAVAL-160/4 (160-bit, 4 passes): short name: haval160_4 - * - HAVAL-160/5 (160-bit, 5 passes): short name: haval160_5 - * - HAVAL-192/3 (192-bit, 3 passes): short name: haval192_3 - * - HAVAL-192/4 (192-bit, 4 passes): short name: haval192_4 - * - HAVAL-192/5 (192-bit, 5 passes): short name: haval192_5 - * - HAVAL-224/3 (224-bit, 3 passes): short name: haval224_3 - * - HAVAL-224/4 (224-bit, 4 passes): short name: haval224_4 - * - HAVAL-224/5 (224-bit, 5 passes): short name: haval224_5 - * - HAVAL-256/3 (256-bit, 3 passes): short name: haval256_3 - * - HAVAL-256/4 (256-bit, 4 passes): short name: haval256_4 - * - HAVAL-256/5 (256-bit, 5 passes): short name: haval256_5 - * - MD2: file sph_md2.h, short name: md2 - * - MD4: file sph_md4.h, short name: md4 - * - MD5: file sph_md5.h, short name: md5 - * - PANAMA: file sph_panama.h, short name: panama - * - RadioGatun family: file sph_radiogatun.h - * - RadioGatun[32]: short name: radiogatun32 - * - RadioGatun[64]: short name: radiogatun64 (64) - * - RIPEMD family: file sph_ripemd.h - * - RIPEMD: short name: ripemd - * - RIPEMD-128: short name: ripemd128 - * - RIPEMD-160: short name: ripemd160 - * - SHA-0: file sph_sha0.h, short name: sha0 - * - SHA-1: file sph_sha1.h, short name: sha1 - * - SHA-2 family, 32-bit hashes: file sph_sha2.h - * - SHA-224: short name: sha224 - * - SHA-256: short name: sha256 - * - SHA-384: short name: sha384 (64) - * - SHA-512: short name: sha512 (64) - * - Tiger family: file sph_tiger.h - * - Tiger: short name: tiger (64) - * - Tiger2: short name: tiger2 (64) - * - WHIRLPOOL family: file sph_whirlpool.h - * - WHIRLPOOL-0: short name: whirlpool0 (64) - * - WHIRLPOOL-1: short name: whirlpool1 (64) - * - WHIRLPOOL: short name: whirlpool (64) - * - * The fourteen second-round SHA-3 candidates are also implemented; - * when applicable, the implementations follow the "final" specifications - * as published for the third round of the SHA-3 competition (BLAKE, - * Groestl, JH, Keccak and Skein have been tweaked for third round). 
- * - * - BLAKE family: file sph_blake.h - * - BLAKE-224: short name: blake224 - * - BLAKE-256: short name: blake256 - * - BLAKE-384: short name: blake384 - * - BLAKE-512: short name: blake512 - * - BMW (Blue Midnight Wish) family: file sph_bmw.h - * - BMW-224: short name: bmw224 - * - BMW-256: short name: bmw256 - * - BMW-384: short name: bmw384 (64) - * - BMW-512: short name: bmw512 (64) - * - CubeHash family: file sph_cubehash.h (specified as - * CubeHash16/32 in the CubeHash specification) - * - CubeHash-224: short name: cubehash224 - * - CubeHash-256: short name: cubehash256 - * - CubeHash-384: short name: cubehash384 - * - CubeHash-512: short name: cubehash512 - * - ECHO family: file sph_echo.h - * - ECHO-224: short name: echo224 - * - ECHO-256: short name: echo256 - * - ECHO-384: short name: echo384 - * - ECHO-512: short name: echo512 - * - Fugue family: file sph_fugue.h - * - Fugue-224: short name: fugue224 - * - Fugue-256: short name: fugue256 - * - Fugue-384: short name: fugue384 - * - Fugue-512: short name: fugue512 - * - Groestl family: file sph_groestl.h - * - Groestl-224: short name: groestl224 - * - Groestl-256: short name: groestl256 - * - Groestl-384: short name: groestl384 - * - Groestl-512: short name: groestl512 - * - Hamsi family: file sph_hamsi.h - * - Hamsi-224: short name: hamsi224 - * - Hamsi-256: short name: hamsi256 - * - Hamsi-384: short name: hamsi384 - * - Hamsi-512: short name: hamsi512 - * - JH family: file sph_jh.h - * - JH-224: short name: jh224 - * - JH-256: short name: jh256 - * - JH-384: short name: jh384 - * - JH-512: short name: jh512 - * - Keccak family: file sph_keccak.h - * - Keccak-224: short name: keccak224 - * - Keccak-256: short name: keccak256 - * - Keccak-384: short name: keccak384 - * - Keccak-512: short name: keccak512 - * - Luffa family: file sph_luffa.h - * - Luffa-224: short name: luffa224 - * - Luffa-256: short name: luffa256 - * - Luffa-384: short name: luffa384 - * - Luffa-512: short name: luffa512 - * - Shabal family: file sph_shabal.h - * - Shabal-192: short name: shabal192 - * - Shabal-224: short name: shabal224 - * - Shabal-256: short name: shabal256 - * - Shabal-384: short name: shabal384 - * - Shabal-512: short name: shabal512 - * - SHAvite-3 family: file sph_shavite.h - * - SHAvite-224 (nominally "SHAvite-3 with 224-bit output"): - * short name: shabal224 - * - SHAvite-256 (nominally "SHAvite-3 with 256-bit output"): - * short name: shabal256 - * - SHAvite-384 (nominally "SHAvite-3 with 384-bit output"): - * short name: shabal384 - * - SHAvite-512 (nominally "SHAvite-3 with 512-bit output"): - * short name: shabal512 - * - SIMD family: file sph_simd.h - * - SIMD-224: short name: simd224 - * - SIMD-256: short name: simd256 - * - SIMD-384: short name: simd384 - * - SIMD-512: short name: simd512 - * - Skein family: file sph_skein.h - * - Skein-224 (nominally specified as Skein-512-224): short name: - * skein224 (64) - * - Skein-256 (nominally specified as Skein-512-256): short name: - * skein256 (64) - * - Skein-384 (nominally specified as Skein-512-384): short name: - * skein384 (64) - * - Skein-512 (nominally specified as Skein-512-512): short name: - * skein512 (64) - * - * For the second-round SHA-3 candidates, the functions are as specified - * for round 2, i.e. with the "tweaks" that some candidates added - * between round 1 and round 2. Also, some of the submitted packages for - * round 2 contained errors, in the specification, reference code, or - * both. sphlib implements the corrected versions. 
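 *
 * As a short, minimal illustration of the conventions above (assuming an
 * application-supplied buffer data of data_len bytes, and using the
 * Shabal-256 functions declared in sph_shabal.h):
 *
 *     sph_shabal256_context cc;
 *     unsigned char digest[32];
 *
 *     sph_shabal256_init(&cc);
 *     sph_shabal256(&cc, data, data_len);   // may be called repeatedly
 *     sph_shabal256_close(&cc, digest);     // context is auto-reinitialized
 *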
- */ - -/** @hideinitializer - * Unsigned integer type whose length is at least 32 bits; on most - * architectures, it will have a width of exactly 32 bits. Unsigned C - * types implement arithmetics modulo a power of 2; use the - * SPH_T32() macro to ensure that the value is truncated - * to exactly 32 bits. Unless otherwise specified, all macros and - * functions which accept sph_u32 values assume that these - * values fit on 32 bits, i.e. do not exceed 2^32-1, even on architectures - * where sph_u32 is larger than that. - */ -typedef __arch_dependant__ sph_u32; - -/** @hideinitializer - * Signed integer type corresponding to sph_u32; it has - * width 32 bits or more. - */ -typedef __arch_dependant__ sph_s32; - -/** @hideinitializer - * Unsigned integer type whose length is at least 64 bits; on most - * architectures which feature such a type, it will have a width of - * exactly 64 bits. C99-compliant platforms will have this type; it - * is also defined when the GNU compiler (gcc) is used, and on - * platforms where unsigned long is large enough. If this - * type is not available, then some hash functions which depend on - * a 64-bit type will not be available (most notably SHA-384, SHA-512, - * Tiger and WHIRLPOOL). - */ -typedef __arch_dependant__ sph_u64; - -/** @hideinitializer - * Signed integer type corresponding to sph_u64; it has - * width 64 bits or more. - */ -typedef __arch_dependant__ sph_s64; - -/** - * This macro expands the token x into a suitable - * constant expression of type sph_u32. Depending on - * how this type is defined, a suffix such as UL may - * be appended to the argument. - * - * @param x the token to expand into a suitable constant expression - */ -#define SPH_C32(x) - -/** - * Truncate a 32-bit value to exactly 32 bits. On most systems, this is - * a no-op, recognized as such by the compiler. - * - * @param x the value to truncate (of type sph_u32) - */ -#define SPH_T32(x) - -/** - * Rotate a 32-bit value by a number of bits to the left. The rotate - * count must reside between 1 and 31. This macro assumes that its - * first argument fits in 32 bits (no extra bit allowed on machines where - * sph_u32 is wider); both arguments may be evaluated - * several times. - * - * @param x the value to rotate (of type sph_u32) - * @param n the rotation count (between 1 and 31, inclusive) - */ -#define SPH_ROTL32(x, n) - -/** - * Rotate a 32-bit value by a number of bits to the right. The rotate - * count must reside between 1 and 31. This macro assumes that its - * first argument fits in 32 bits (no extra bit allowed on machines where - * sph_u32 is wider); both arguments may be evaluated - * several times. - * - * @param x the value to rotate (of type sph_u32) - * @param n the rotation count (between 1 and 31, inclusive) - */ -#define SPH_ROTR32(x, n) - -/** - * This macro is defined on systems for which a 64-bit type has been - * detected, and is used for sph_u64. - */ -#define SPH_64 - -/** - * This macro is defined on systems where the "native" integer size is - * 64 bits (64-bit values fit in one register). - */ -#define SPH_64_TRUE - -/** - * This macro expands the token x into a suitable - * constant expression of type sph_u64. Depending on - * how this type is defined, a suffix such as ULL may - * be appended to the argument. This macro is defined only if a - * 64-bit type was detected and used for sph_u64. - * - * @param x the token to expand into a suitable constant expression - */ -#define SPH_C64(x) - -/** - * Truncate a 64-bit value to exactly 64 bits.
On most systems, this is - * a no-op, recognized as such by the compiler. This macro is defined only - * if a 64-bit type was detected and used for sph_u64. - * - * @param x the value to truncate (of type sph_u64) - */ -#define SPH_T64(x) - -/** - * Rotate a 64-bit value by a number of bits to the left. The rotate - * count must reside between 1 and 63. This macro assumes that its - * first argument fits in 64 bits (no extra bit allowed on machines where - * sph_u64 is wider); both arguments may be evaluated - * several times. This macro is defined only if a 64-bit type was detected - * and used for sph_u64. - * - * @param x the value to rotate (of type sph_u64) - * @param n the rotation count (between 1 and 63, inclusive) - */ -#define SPH_ROTL64(x, n) - -/** - * Rotate a 64-bit value by a number of bits to the right. The rotate - * count must reside between 1 and 63. This macro assumes that its - * first argument fits in 64 bits (no extra bit allowed on machines where - * sph_u64 is wider); both arguments may be evaluated - * several times. This macro is defined only if a 64-bit type was detected - * and used for sph_u64. - * - * @param x the value to rotate (of type sph_u64) - * @param n the rotation count (between 1 and 63, inclusive) - */ -#define SPH_ROTR64(x, n) - -/** - * This macro evaluates to inline or an equivalent construction, - * if available on the compilation platform, or to nothing otherwise. This - * is used to declare inline functions, for which the compiler should - * endeavour to include the code directly in the caller. Inline functions - * are typically defined in header files as replacement for macros. - */ -#define SPH_INLINE - -/** - * This macro is defined if the platform has been detected as using - * little-endian convention. This implies that the sph_u32 - * type (and the sph_u64 type also, if it is defined) has - * an exact width (i.e. exactly 32-bit, respectively 64-bit). - */ -#define SPH_LITTLE_ENDIAN - -/** - * This macro is defined if the platform has been detected as using - * big-endian convention. This implies that the sph_u32 - * type (and the sph_u64 type also, if it is defined) has - * an exact width (i.e. exactly 32-bit, respectively 64-bit). - */ -#define SPH_BIG_ENDIAN - -/** - * This macro is defined if 32-bit words (and 64-bit words, if defined) - * can be read from and written to memory efficiently in little-endian - * convention. This is the case for little-endian platforms, and also - * for the big-endian platforms which have special little-endian access - * opcodes (e.g. Ultrasparc). - */ -#define SPH_LITTLE_FAST - -/** - * This macro is defined if 32-bit words (and 64-bit words, if defined) - * can be read from and written to memory efficiently in big-endian - * convention. This is the case for big-endian platforms, and also - * for the little-endian platforms which have special big-endian access - * opcodes. - */ -#define SPH_BIG_FAST - -/** - * On some platforms, this macro is defined to an unsigned integer type - * into which pointer values may be cast. The resulting value can then - * be tested for being a multiple of 2, 4 or 8, indicating an aligned - * pointer for, respectively, 16-bit, 32-bit or 64-bit memory accesses. - */ -#define SPH_UPTR - -/** - * When defined, this macro indicates that unaligned memory accesses - * are possible with only a minor penalty, and thus should be preferred - * over strategies which first copy data to an aligned buffer. - */ -#define SPH_UNALIGNED - -/** - * Byte-swap a 32-bit word (i.e.
0x12345678 becomes - * 0x78563412). This is an inline function which resorts - * to inline assembly on some platforms, for better performance. - * - * @param x the 32-bit value to byte-swap - * @return the byte-swapped value - */ -static inline sph_u32 sph_bswap32(sph_u32 x); - -/** - * Byte-swap a 64-bit word. This is an inline function which resorts - * to inline assembly on some platforms, for better performance. This - * function is defined only if a suitable 64-bit type was found for - * sph_u64 - * - * @param x the 64-bit value to byte-swap - * @return the byte-swapped value - */ -static inline sph_u64 sph_bswap64(sph_u64 x); - -/** - * Decode a 16-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). - * - * @param src the source address - * @return the decoded value - */ -static inline unsigned sph_dec16le(const void* src); - -/** - * Encode a 16-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc16le(void* dst, unsigned val); - -/** - * Decode a 16-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). - * - * @param src the source address - * @return the decoded value - */ -static inline unsigned sph_dec16be(const void* src); - -/** - * Encode a 16-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc16be(void* dst, unsigned val); - -/** - * Decode a 32-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u32 sph_dec32le(const void* src); - -/** - * Decode a 32-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). This function assumes that the - * source address is suitably aligned for a direct access, if the platform - * supports such things; it can thus be marginally faster than the generic - * sph_dec32le() function. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u32 sph_dec32le_aligned(const void* src); - -/** - * Encode a 32-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc32le(void* dst, sph_u32 val); - -/** - * Encode a 32-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). This function assumes that the - * destination address is suitably aligned for a direct access, if the - * platform supports such things; it can thus be marginally faster than - * the generic sph_enc32le() function. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc32le_aligned(void* dst, sph_u32 val); - -/** - * Decode a 32-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u32 sph_dec32be(const void* src); - -/** - * Decode a 32-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). 
This function assumes that the - * source address is suitably aligned for a direct access, if the platform - * supports such things; it can thus be marginally faster than the generic - * sph_dec32be() function. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u32 sph_dec32be_aligned(const void* src); - -/** - * Encode a 32-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc32be(void* dst, sph_u32 val); - -/** - * Encode a 32-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). This function assumes that the - * destination address is suitably aligned for a direct access, if the - * platform supports such things; it can thus be marginally faster than - * the generic sph_enc32be() function. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc32be_aligned(void* dst, sph_u32 val); - -/** - * Decode a 64-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u64 sph_dec64le(const void* src); - -/** - * Decode a 64-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). This function assumes that the - * source address is suitably aligned for a direct access, if the platform - * supports such things; it can thus be marginally faster than the generic - * sph_dec64le() function. This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u64 sph_dec64le_aligned(const void* src); - -/** - * Encode a 64-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc64le(void* dst, sph_u64 val); - -/** - * Encode a 64-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). This function assumes that the - * destination address is suitably aligned for a direct access, if the - * platform supports such things; it can thus be marginally faster than - * the generic sph_enc64le() function. This function is defined - * only if a suitable 64-bit type was detected and used for - * sph_u64. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc64le_aligned(void* dst, sph_u64 val); - -/** - * Decode a 64-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u64 sph_dec64be(const void* src); - -/** - * Decode a 64-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). 
This function assumes that the
- * source address is suitably aligned for a direct access, if the platform
- * supports such things; it can thus be marginally faster than the generic
- * sph_dec64be() function. This function is defined only
- * if a suitable 64-bit type was detected and used for sph_u64.
- *
- * @param src the source address
- * @return the decoded value
- */
-static inline sph_u64 sph_dec64be_aligned(const void* src);
-
-/**
- * Encode a 64-bit unsigned value into memory, in big-endian convention
- * (most significant byte comes first). This function is defined only
- * if a suitable 64-bit type was detected and used for sph_u64.
- *
- * @param dst the destination buffer
- * @param val the value to encode
- */
-static inline void sph_enc64be(void* dst, sph_u64 val);
-
-/**
- * Encode a 64-bit unsigned value into memory, in big-endian convention
- * (most significant byte comes first). This function assumes that the
- * destination address is suitably aligned for a direct access, if the
- * platform supports such things; it can thus be marginally faster than
- * the generic sph_enc64be() function. This function is defined
- * only if a suitable 64-bit type was detected and used for
- * sph_u64.
- *
- * @param dst the destination buffer
- * @param val the value to encode
- */
-static inline void sph_enc64be_aligned(void* dst, sph_u64 val);
-
-#endif
-
-/* ============== END documentation block for Doxygen ============= */
-
-#ifndef DOXYGEN_IGNORE
-
-/*
- * We want to define the types "sph_u32" and "sph_u64" which hold
- * unsigned values of at least, respectively, 32 and 64 bits. These
- * tests should select appropriate types for most platforms. The
- * macro "SPH_64" is defined if the 64-bit type is supported.
- */
-
-#undef SPH_64
-#undef SPH_64_TRUE
-
-#if defined __STDC__ && __STDC_VERSION__ >= 199901L
-
-/*
- * On C99 implementations, we can use <stdint.h> to get an exact 64-bit
- * type, if any, or otherwise use a wider type (which must exist, for
- * C99 conformance).
- */
-
-#include <stdint.h>
-
-#ifdef UINT32_MAX
-typedef uint32_t sph_u32;
-typedef int32_t sph_s32;
-#else
-typedef uint_fast32_t sph_u32;
-typedef int_fast32_t sph_s32;
-#endif
-#if !SPH_NO_64
-#ifdef UINT64_MAX
-typedef uint64_t sph_u64;
-typedef int64_t sph_s64;
-#else
-typedef uint_fast64_t sph_u64;
-typedef int_fast64_t sph_s64;
-#endif
-#endif
-
-#define SPH_C32(x) ((sph_u32)(x))
-#if !SPH_NO_64
-#define SPH_C64(x) ((sph_u64)(x))
-#define SPH_64 1
-#endif
-
-#else
-
-/*
- * On non-C99 systems, we use "unsigned int" if it is wide enough,
- * "unsigned long" otherwise. This supports all "reasonable" architectures.
- * We have to be cautious: pre-C99 preprocessors handle constants
- * differently in '#if' expressions. Hence the shifts to test UINT_MAX.
- */
-
-#if ((UINT_MAX >> 11) >> 11) >= 0x3FF
-
-typedef unsigned int sph_u32;
-typedef int sph_s32;
-
-#define SPH_C32(x) ((sph_u32)(x##U))
-
-#else
-
-typedef unsigned long sph_u32;
-typedef long sph_s32;
-
-#define SPH_C32(x) ((sph_u32)(x##UL))
-
-#endif
-
-#if !SPH_NO_64
-
-/*
- * We want a 64-bit type. We use "unsigned long" if it is wide enough (as
- * is common on 64-bit architectures such as AMD64, Alpha or Sparcv9),
- * "unsigned long long" otherwise, if available. We use ULLONG_MAX to
- * test whether "unsigned long long" is available; we also know that
- * gcc features this type, even if the libc headers do not know it.
- */
-
-#if ((ULONG_MAX >> 31) >> 31) >= 3
-
-typedef unsigned long sph_u64;
-typedef long sph_s64;
-
-#define SPH_C64(x) ((sph_u64)(x##UL))
-
-#define SPH_64 1
-
-#elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__
-
-typedef unsigned long long sph_u64;
-typedef long long sph_s64;
-
-#define SPH_C64(x) ((sph_u64)(x##ULL))
-
-#define SPH_64 1
-
-#else
-
-/*
- * No 64-bit type...
- */
-
-#endif
-
-#endif
-
-#endif
-
-/*
- * If the "unsigned long" type has length 64 bits or more, then this is
- * a "true" 64-bit architecture. This is also true with Visual C on
- * amd64, even though the "long" type is limited to 32 bits.
- */
-#if SPH_64 && (((ULONG_MAX >> 31) >> 31) >= 3 || defined _M_X64)
-#define SPH_64_TRUE 1
-#endif
-
-/*
- * Implementation note: some processors have specific opcodes to perform
- * a rotation. Recent versions of gcc recognize the expressions below and
- * use the relevant opcodes, when appropriate.
- */
-
-#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
-#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
-#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n)))
-
-#if SPH_64
-
-#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
-#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n))))
-#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n)))
-
-#endif
-
-#ifndef DOXYGEN_IGNORE
-/*
- * Define SPH_INLINE to be an "inline" qualifier, if available. We define
- * some small macro-like functions which benefit greatly from being inlined.
- */
-#if (defined __STDC__ && __STDC_VERSION__ >= 199901L) || defined __GNUC__
-#define SPH_INLINE inline
-#elif defined _MSC_VER
-#define SPH_INLINE __inline
-#else
-#define SPH_INLINE
-#endif
-#endif
-
-/*
- * We define some macros which qualify the architecture. These macros
- * may be explicitly set externally (e.g. as compiler parameters). The
- * code below sets those macros if they are not already defined.
- *
- * Most macros are boolean, thus evaluate to either zero or non-zero.
- * The SPH_UPTR macro is special, in that it evaluates to a C type,
- * or is not defined.
- *
- * SPH_UPTR if defined: unsigned type to cast pointers into
- *
- * SPH_UNALIGNED non-zero if unaligned accesses are efficient
- * SPH_LITTLE_ENDIAN non-zero if architecture is known to be little-endian
- * SPH_BIG_ENDIAN non-zero if architecture is known to be big-endian
- * SPH_LITTLE_FAST non-zero if little-endian decoding is fast
- * SPH_BIG_FAST non-zero if big-endian decoding is fast
- *
- * If SPH_UPTR is defined, then encoding and decoding of 32-bit and 64-bit
- * values will try to be "smart". Either SPH_LITTLE_ENDIAN or SPH_BIG_ENDIAN
- * _must_ be non-zero in those situations. The 32-bit and 64-bit types
- * _must_ also have an exact width.
- *
- * SPH_SPARCV9_GCC_32 UltraSPARC-compatible with gcc, 32-bit mode
- * SPH_SPARCV9_GCC_64 UltraSPARC-compatible with gcc, 64-bit mode
- * SPH_SPARCV9_GCC UltraSPARC-compatible with gcc
- * SPH_I386_GCC x86-compatible (32-bit) with gcc
- * SPH_I386_MSVC x86-compatible (32-bit) with Microsoft Visual C
- * SPH_AMD64_GCC x86-compatible (64-bit) with gcc
- * SPH_AMD64_MSVC x86-compatible (64-bit) with Microsoft Visual C
- * SPH_PPC32_GCC PowerPC, 32-bit, with gcc
- * SPH_PPC64_GCC PowerPC, 64-bit, with gcc
- *
- * TODO: enhance automatic detection, for more architectures and compilers.
- * Endianness is the most important. SPH_UNALIGNED and SPH_UPTR help with
- * some very fast functions (e.g. MD4) when using unaligned input data.
- * The CPU-specific-with-GCC macros are useful only for inline assembly, - * normally restrained to this header file. - */ - -/* - * 32-bit x86, aka "i386 compatible". - */ -#if defined __i386__ || defined _M_IX86 - -#define SPH_DETECT_UNALIGNED 1 -#define SPH_DETECT_LITTLE_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u32 -#ifdef __GNUC__ -#define SPH_DETECT_I386_GCC 1 -#endif -#ifdef _MSC_VER -#define SPH_DETECT_I386_MSVC 1 -#endif - -/* - * 64-bit x86, hereafter known as "amd64". - */ -#elif defined __x86_64 || defined _M_X64 - -#define SPH_DETECT_UNALIGNED 1 -#define SPH_DETECT_LITTLE_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u64 -#ifdef __GNUC__ -#define SPH_DETECT_AMD64_GCC 1 -#endif -#ifdef _MSC_VER -#define SPH_DETECT_AMD64_MSVC 1 -#endif - -/* - * 64-bit Sparc architecture (implies v9). - */ -#elif ((defined __sparc__ || defined __sparc) && defined __arch64__) || defined __sparcv9 - -#define SPH_DETECT_BIG_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u64 -#ifdef __GNUC__ -#define SPH_DETECT_SPARCV9_GCC_64 1 -#define SPH_DETECT_LITTLE_FAST 1 -#endif - -/* - * 32-bit Sparc. - */ -#elif (defined __sparc__ || defined __sparc) && !(defined __sparcv9 || defined __arch64__) - -#define SPH_DETECT_BIG_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u32 -#if defined __GNUC__ && defined __sparc_v9__ -#define SPH_DETECT_SPARCV9_GCC_32 1 -#define SPH_DETECT_LITTLE_FAST 1 -#endif - -/* - * ARM, little-endian. - */ -#elif defined __arm__ && __ARMEL__ - -#define SPH_DETECT_LITTLE_ENDIAN 1 - -/* - * MIPS, little-endian. - */ -#elif MIPSEL || _MIPSEL || __MIPSEL || __MIPSEL__ - -#define SPH_DETECT_LITTLE_ENDIAN 1 - -/* - * MIPS, big-endian. - */ -#elif MIPSEB || _MIPSEB || __MIPSEB || __MIPSEB__ - -#define SPH_DETECT_BIG_ENDIAN 1 - -/* - * PowerPC. - */ -#elif defined __powerpc__ || defined __POWERPC__ || defined __ppc__ || defined _ARCH_PPC - -/* - * Note: we do not declare cross-endian access to be "fast": even if - * using inline assembly, implementation should still assume that - * keeping the decoded word in a temporary is faster than decoding - * it again. - */ -#if defined __GNUC__ -#if SPH_64_TRUE -#define SPH_DETECT_PPC64_GCC 1 -#else -#define SPH_DETECT_PPC32_GCC 1 -#endif -#endif - -#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN -#define SPH_DETECT_BIG_ENDIAN 1 -#elif defined __LITTLE_ENDIAN__ || defined _LITTLE_ENDIAN -#define SPH_DETECT_LITTLE_ENDIAN 1 -#endif - -/* - * Itanium, 64-bit. 
- */ -#elif defined __ia64 || defined __ia64__ || defined __itanium__ || defined _M_IA64 - -#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN -#define SPH_DETECT_BIG_ENDIAN 1 -#else -#define SPH_DETECT_LITTLE_ENDIAN 1 -#endif -#if defined __LP64__ || defined _LP64 -#define SPH_DETECT_UPTR sph_u64 -#else -#define SPH_DETECT_UPTR sph_u32 -#endif - -#endif - -#if defined SPH_DETECT_SPARCV9_GCC_32 || defined SPH_DETECT_SPARCV9_GCC_64 -#define SPH_DETECT_SPARCV9_GCC 1 -#endif - -#if defined SPH_DETECT_UNALIGNED && !defined SPH_UNALIGNED -#define SPH_UNALIGNED SPH_DETECT_UNALIGNED -#endif -#if defined SPH_DETECT_UPTR && !defined SPH_UPTR -#define SPH_UPTR SPH_DETECT_UPTR -#endif -#if defined SPH_DETECT_LITTLE_ENDIAN && !defined SPH_LITTLE_ENDIAN -#define SPH_LITTLE_ENDIAN SPH_DETECT_LITTLE_ENDIAN -#endif -#if defined SPH_DETECT_BIG_ENDIAN && !defined SPH_BIG_ENDIAN -#define SPH_BIG_ENDIAN SPH_DETECT_BIG_ENDIAN -#endif -#if defined SPH_DETECT_LITTLE_FAST && !defined SPH_LITTLE_FAST -#define SPH_LITTLE_FAST SPH_DETECT_LITTLE_FAST -#endif -#if defined SPH_DETECT_BIG_FAST && !defined SPH_BIG_FAST -#define SPH_BIG_FAST SPH_DETECT_BIG_FAST -#endif -#if defined SPH_DETECT_SPARCV9_GCC_32 && !defined SPH_SPARCV9_GCC_32 -#define SPH_SPARCV9_GCC_32 SPH_DETECT_SPARCV9_GCC_32 -#endif -#if defined SPH_DETECT_SPARCV9_GCC_64 && !defined SPH_SPARCV9_GCC_64 -#define SPH_SPARCV9_GCC_64 SPH_DETECT_SPARCV9_GCC_64 -#endif -#if defined SPH_DETECT_SPARCV9_GCC && !defined SPH_SPARCV9_GCC -#define SPH_SPARCV9_GCC SPH_DETECT_SPARCV9_GCC -#endif -#if defined SPH_DETECT_I386_GCC && !defined SPH_I386_GCC -#define SPH_I386_GCC SPH_DETECT_I386_GCC -#endif -#if defined SPH_DETECT_I386_MSVC && !defined SPH_I386_MSVC -#define SPH_I386_MSVC SPH_DETECT_I386_MSVC -#endif -#if defined SPH_DETECT_AMD64_GCC && !defined SPH_AMD64_GCC -#define SPH_AMD64_GCC SPH_DETECT_AMD64_GCC -#endif -#if defined SPH_DETECT_AMD64_MSVC && !defined SPH_AMD64_MSVC -#define SPH_AMD64_MSVC SPH_DETECT_AMD64_MSVC -#endif -#if defined SPH_DETECT_PPC32_GCC && !defined SPH_PPC32_GCC -#define SPH_PPC32_GCC SPH_DETECT_PPC32_GCC -#endif -#if defined SPH_DETECT_PPC64_GCC && !defined SPH_PPC64_GCC -#define SPH_PPC64_GCC SPH_DETECT_PPC64_GCC -#endif - -#if SPH_LITTLE_ENDIAN && !defined SPH_LITTLE_FAST -#define SPH_LITTLE_FAST 1 -#endif -#if SPH_BIG_ENDIAN && !defined SPH_BIG_FAST -#define SPH_BIG_FAST 1 -#endif - -#if defined SPH_UPTR && !(SPH_LITTLE_ENDIAN || SPH_BIG_ENDIAN) -#error SPH_UPTR defined, but endianness is not known. -#endif - -#if SPH_I386_GCC && !SPH_NO_ASM - -/* - * On x86 32-bit, with gcc, we use the bswapl opcode to byte-swap 32-bit - * values. - */ - -static SPH_INLINE sph_u32 sph_bswap32(sph_u32 x) { - __asm__ __volatile__("bswapl %0" : "=r"(x) : "0"(x)); - return x; -} - -#if SPH_64 - -static SPH_INLINE sph_u64 sph_bswap64(sph_u64 x) { - return ((sph_u64)sph_bswap32((sph_u32)x) << 32) | (sph_u64)sph_bswap32((sph_u32)(x >> 32)); -} - -#endif - -#elif SPH_AMD64_GCC && !SPH_NO_ASM - -/* - * On x86 64-bit, with gcc, we use the bswapl opcode to byte-swap 32-bit - * and 64-bit values. - */ - -static SPH_INLINE sph_u32 sph_bswap32(sph_u32 x) { - __asm__ __volatile__("bswapl %0" : "=r"(x) : "0"(x)); - return x; -} - -#if SPH_64 - -static SPH_INLINE sph_u64 sph_bswap64(sph_u64 x) { - __asm__ __volatile__("bswapq %0" : "=r"(x) : "0"(x)); - return x; -} - -#endif - -/* - * Disabled code. Apparently, Microsoft Visual C 2005 is smart enough - * to generate proper opcodes for endianness swapping with the pure C - * implementation below. 
- * - -#elif SPH_I386_MSVC && !SPH_NO_ASM - -static __inline sph_u32 __declspec(naked) __fastcall -sph_bswap32(sph_u32 x) -{ - __asm { - bswap ecx - mov eax,ecx - ret - } -} - -#if SPH_64 - -static SPH_INLINE sph_u64 -sph_bswap64(sph_u64 x) -{ - return ((sph_u64)sph_bswap32((sph_u32)x) << 32) - | (sph_u64)sph_bswap32((sph_u32)(x >> 32)); -} - -#endif - - * - * [end of disabled code] - */ - -#else - -static SPH_INLINE sph_u32 sph_bswap32(sph_u32 x) { - x = SPH_T32((x << 16) | (x >> 16)); - x = ((x & SPH_C32(0xFF00FF00)) >> 8) | ((x & SPH_C32(0x00FF00FF)) << 8); - return x; -} - -#if SPH_64 - -/** - * Byte-swap a 64-bit value. - * - * @param x the input value - * @return the byte-swapped value - */ -static SPH_INLINE sph_u64 sph_bswap64(sph_u64 x) { - x = SPH_T64((x << 32) | (x >> 32)); - x = ((x & SPH_C64(0xFFFF0000FFFF0000)) >> 16) | ((x & SPH_C64(0x0000FFFF0000FFFF)) << 16); - x = ((x & SPH_C64(0xFF00FF00FF00FF00)) >> 8) | ((x & SPH_C64(0x00FF00FF00FF00FF)) << 8); - return x; -} - -#endif - -#endif - -#if SPH_SPARCV9_GCC && !SPH_NO_ASM - -/* - * On UltraSPARC systems, native ordering is big-endian, but it is - * possible to perform little-endian read accesses by specifying the - * address space 0x88 (ASI_PRIMARY_LITTLE). Basically, either we use - * the opcode "lda [%reg]0x88,%dst", where %reg is the register which - * contains the source address and %dst is the destination register, - * or we use "lda [%reg+imm]%asi,%dst", which uses the %asi register - * to get the address space name. The latter format is better since it - * combines an addition and the actual access in a single opcode; but - * it requires the setting (and subsequent resetting) of %asi, which is - * slow. Some operations (i.e. MD5 compression function) combine many - * successive little-endian read accesses, which may share the same - * %asi setting. The macros below contain the appropriate inline - * assembly. - */ - -#define SPH_SPARCV9_SET_ASI \ - sph_u32 sph_sparcv9_asi; \ - __asm__ __volatile__("rd %%asi,%0\n\twr %%g0,0x88,%%asi" : "=r"(sph_sparcv9_asi)); - -#define SPH_SPARCV9_RESET_ASI __asm__ __volatile__("wr %%g0,%0,%%asi" : : "r"(sph_sparcv9_asi)); - -#define SPH_SPARCV9_DEC32LE(base, idx) \ - ({ \ - sph_u32 sph_sparcv9_tmp; \ - __asm__ __volatile__("lda [%1+" #idx "*4]%%asi,%0" : "=r"(sph_sparcv9_tmp) : "r"(base)); \ - sph_sparcv9_tmp; \ - }) - -#endif - -static SPH_INLINE void sph_enc16be(void* dst, unsigned val) { - ((unsigned char*)dst)[0] = (val >> 8); - ((unsigned char*)dst)[1] = val; -} - -static SPH_INLINE unsigned sph_dec16be(const void* src) { - return ((unsigned)(((const unsigned char*)src)[0]) << 8) | - (unsigned)(((const unsigned char*)src)[1]); -} - -static SPH_INLINE void sph_enc16le(void* dst, unsigned val) { - ((unsigned char*)dst)[0] = val; - ((unsigned char*)dst)[1] = val >> 8; -} - -static SPH_INLINE unsigned sph_dec16le(const void* src) { - return (unsigned)(((const unsigned char*)src)[0]) | - ((unsigned)(((const unsigned char*)src)[1]) << 8); -} - -/** - * Encode a 32-bit value into the provided buffer (big endian convention). 
- * - * @param dst the destination buffer - * @param val the 32-bit value to encode - */ -static SPH_INLINE void sph_enc32be(void* dst, sph_u32 val) { -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_LITTLE_ENDIAN - val = sph_bswap32(val); -#endif - *(sph_u32*)dst = val; -#else - if (((SPH_UPTR)dst & 3) == 0) { -#if SPH_LITTLE_ENDIAN - val = sph_bswap32(val); -#endif - *(sph_u32*)dst = val; - } else { - ((unsigned char*)dst)[0] = (val >> 24); - ((unsigned char*)dst)[1] = (val >> 16); - ((unsigned char*)dst)[2] = (val >> 8); - ((unsigned char*)dst)[3] = val; - } -#endif -#else - ((unsigned char*)dst)[0] = (val >> 24); - ((unsigned char*)dst)[1] = (val >> 16); - ((unsigned char*)dst)[2] = (val >> 8); - ((unsigned char*)dst)[3] = val; -#endif -} - -/** - * Encode a 32-bit value into the provided buffer (big endian convention). - * The destination buffer must be properly aligned. - * - * @param dst the destination buffer (32-bit aligned) - * @param val the value to encode - */ -static SPH_INLINE void sph_enc32be_aligned(void* dst, sph_u32 val) { -#if SPH_LITTLE_ENDIAN - *(sph_u32*)dst = sph_bswap32(val); -#elif SPH_BIG_ENDIAN - *(sph_u32*)dst = val; -#else - ((unsigned char*)dst)[0] = (val >> 24); - ((unsigned char*)dst)[1] = (val >> 16); - ((unsigned char*)dst)[2] = (val >> 8); - ((unsigned char*)dst)[3] = val; -#endif -} - -/** - * Decode a 32-bit value from the provided buffer (big endian convention). - * - * @param src the source buffer - * @return the decoded value - */ -static SPH_INLINE sph_u32 sph_dec32be(const void* src) { -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_LITTLE_ENDIAN - return sph_bswap32(*(const sph_u32*)src); -#else - return *(const sph_u32*)src; -#endif -#else - if (((SPH_UPTR)src & 3) == 0) { -#if SPH_LITTLE_ENDIAN - return sph_bswap32(*(const sph_u32*)src); -#else - return *(const sph_u32*)src; -#endif - } else { - return ((sph_u32)(((const unsigned char*)src)[0]) << 24) | - ((sph_u32)(((const unsigned char*)src)[1]) << 16) | - ((sph_u32)(((const unsigned char*)src)[2]) << 8) | - (sph_u32)(((const unsigned char*)src)[3]); - } -#endif -#else - return ((sph_u32)(((const unsigned char*)src)[0]) << 24) | - ((sph_u32)(((const unsigned char*)src)[1]) << 16) | - ((sph_u32)(((const unsigned char*)src)[2]) << 8) | - (sph_u32)(((const unsigned char*)src)[3]); -#endif -} - -/** - * Decode a 32-bit value from the provided buffer (big endian convention). - * The source buffer must be properly aligned. - * - * @param src the source buffer (32-bit aligned) - * @return the decoded value - */ -static SPH_INLINE sph_u32 sph_dec32be_aligned(const void* src) { -#if SPH_LITTLE_ENDIAN - return sph_bswap32(*(const sph_u32*)src); -#elif SPH_BIG_ENDIAN - return *(const sph_u32*)src; -#else - return ((sph_u32)(((const unsigned char*)src)[0]) << 24) | - ((sph_u32)(((const unsigned char*)src)[1]) << 16) | - ((sph_u32)(((const unsigned char*)src)[2]) << 8) | - (sph_u32)(((const unsigned char*)src)[3]); -#endif -} - -/** - * Encode a 32-bit value into the provided buffer (little endian convention). 
- * - * @param dst the destination buffer - * @param val the 32-bit value to encode - */ -static SPH_INLINE void sph_enc32le(void* dst, sph_u32 val) { -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_BIG_ENDIAN - val = sph_bswap32(val); -#endif - *(sph_u32*)dst = val; -#else - if (((SPH_UPTR)dst & 3) == 0) { -#if SPH_BIG_ENDIAN - val = sph_bswap32(val); -#endif - *(sph_u32*)dst = val; - } else { - ((unsigned char*)dst)[0] = val; - ((unsigned char*)dst)[1] = (val >> 8); - ((unsigned char*)dst)[2] = (val >> 16); - ((unsigned char*)dst)[3] = (val >> 24); - } -#endif -#else - ((unsigned char*)dst)[0] = val; - ((unsigned char*)dst)[1] = (val >> 8); - ((unsigned char*)dst)[2] = (val >> 16); - ((unsigned char*)dst)[3] = (val >> 24); -#endif -} - -/** - * Encode a 32-bit value into the provided buffer (little endian convention). - * The destination buffer must be properly aligned. - * - * @param dst the destination buffer (32-bit aligned) - * @param val the value to encode - */ -static SPH_INLINE void sph_enc32le_aligned(void* dst, sph_u32 val) { -#if SPH_LITTLE_ENDIAN - *(sph_u32*)dst = val; -#elif SPH_BIG_ENDIAN - *(sph_u32*)dst = sph_bswap32(val); -#else - ((unsigned char*)dst)[0] = val; - ((unsigned char*)dst)[1] = (val >> 8); - ((unsigned char*)dst)[2] = (val >> 16); - ((unsigned char*)dst)[3] = (val >> 24); -#endif -} - -/** - * Decode a 32-bit value from the provided buffer (little endian convention). - * - * @param src the source buffer - * @return the decoded value - */ -static SPH_INLINE sph_u32 sph_dec32le(const void* src) { -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_BIG_ENDIAN - return sph_bswap32(*(const sph_u32*)src); -#else - return *(const sph_u32*)src; -#endif -#else - if (((SPH_UPTR)src & 3) == 0) { -#if SPH_BIG_ENDIAN -#if SPH_SPARCV9_GCC && !SPH_NO_ASM - sph_u32 tmp; - - /* - * "__volatile__" is needed here because without it, - * gcc-3.4.3 miscompiles the code and performs the - * access before the test on the address, thus triggering - * a bus error... - */ - __asm__ __volatile__("lda [%1]0x88,%0" : "=r"(tmp) : "r"(src)); - return tmp; -/* - * On PowerPC, this turns out not to be worth the effort: the inline - * assembly makes GCC optimizer uncomfortable, which tends to nullify - * the decoding gains. - * - * For most hash functions, using this inline assembly trick changes - * hashing speed by less than 5% and often _reduces_ it. The biggest - * gains are for MD4 (+11%) and CubeHash (+30%). For all others, it is - * less then 10%. The speed gain on CubeHash is probably due to the - * chronic shortage of registers that CubeHash endures; for the other - * functions, the generic code appears to be efficient enough already. - * -#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM - sph_u32 tmp; - - __asm__ __volatile__ ( - "lwbrx %0,0,%1" : "=r" (tmp) : "r" (src)); - return tmp; - */ -#else - return sph_bswap32(*(const sph_u32*)src); -#endif -#else - return *(const sph_u32*)src; -#endif - } else { - return (sph_u32)(((const unsigned char*)src)[0]) | - ((sph_u32)(((const unsigned char*)src)[1]) << 8) | - ((sph_u32)(((const unsigned char*)src)[2]) << 16) | - ((sph_u32)(((const unsigned char*)src)[3]) << 24); - } -#endif -#else - return (sph_u32)(((const unsigned char*)src)[0]) | - ((sph_u32)(((const unsigned char*)src)[1]) << 8) | - ((sph_u32)(((const unsigned char*)src)[2]) << 16) | - ((sph_u32)(((const unsigned char*)src)[3]) << 24); -#endif -} - -/** - * Decode a 32-bit value from the provided buffer (little endian convention). 
- * The source buffer must be properly aligned. - * - * @param src the source buffer (32-bit aligned) - * @return the decoded value - */ -static SPH_INLINE sph_u32 sph_dec32le_aligned(const void* src) { -#if SPH_LITTLE_ENDIAN - return *(const sph_u32*)src; -#elif SPH_BIG_ENDIAN -#if SPH_SPARCV9_GCC && !SPH_NO_ASM - sph_u32 tmp; - - __asm__ __volatile__("lda [%1]0x88,%0" : "=r"(tmp) : "r"(src)); - return tmp; -/* - * Not worth it generally. - * -#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM - sph_u32 tmp; - - __asm__ __volatile__ ("lwbrx %0,0,%1" : "=r" (tmp) : "r" (src)); - return tmp; - */ -#else - return sph_bswap32(*(const sph_u32*)src); -#endif -#else - return (sph_u32)(((const unsigned char*)src)[0]) | - ((sph_u32)(((const unsigned char*)src)[1]) << 8) | - ((sph_u32)(((const unsigned char*)src)[2]) << 16) | - ((sph_u32)(((const unsigned char*)src)[3]) << 24); -#endif -} - -#if SPH_64 - -/** - * Encode a 64-bit value into the provided buffer (big endian convention). - * - * @param dst the destination buffer - * @param val the 64-bit value to encode - */ -static SPH_INLINE void sph_enc64be(void* dst, sph_u64 val) { -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_LITTLE_ENDIAN - val = sph_bswap64(val); -#endif - *(sph_u64*)dst = val; -#else - if (((SPH_UPTR)dst & 7) == 0) { -#if SPH_LITTLE_ENDIAN - val = sph_bswap64(val); -#endif - *(sph_u64*)dst = val; - } else { - ((unsigned char*)dst)[0] = (val >> 56); - ((unsigned char*)dst)[1] = (val >> 48); - ((unsigned char*)dst)[2] = (val >> 40); - ((unsigned char*)dst)[3] = (val >> 32); - ((unsigned char*)dst)[4] = (val >> 24); - ((unsigned char*)dst)[5] = (val >> 16); - ((unsigned char*)dst)[6] = (val >> 8); - ((unsigned char*)dst)[7] = val; - } -#endif -#else - ((unsigned char*)dst)[0] = (val >> 56); - ((unsigned char*)dst)[1] = (val >> 48); - ((unsigned char*)dst)[2] = (val >> 40); - ((unsigned char*)dst)[3] = (val >> 32); - ((unsigned char*)dst)[4] = (val >> 24); - ((unsigned char*)dst)[5] = (val >> 16); - ((unsigned char*)dst)[6] = (val >> 8); - ((unsigned char*)dst)[7] = val; -#endif -} - -/** - * Encode a 64-bit value into the provided buffer (big endian convention). - * The destination buffer must be properly aligned. - * - * @param dst the destination buffer (64-bit aligned) - * @param val the value to encode - */ -static SPH_INLINE void sph_enc64be_aligned(void* dst, sph_u64 val) { -#if SPH_LITTLE_ENDIAN - *(sph_u64*)dst = sph_bswap64(val); -#elif SPH_BIG_ENDIAN - *(sph_u64*)dst = val; -#else - ((unsigned char*)dst)[0] = (val >> 56); - ((unsigned char*)dst)[1] = (val >> 48); - ((unsigned char*)dst)[2] = (val >> 40); - ((unsigned char*)dst)[3] = (val >> 32); - ((unsigned char*)dst)[4] = (val >> 24); - ((unsigned char*)dst)[5] = (val >> 16); - ((unsigned char*)dst)[6] = (val >> 8); - ((unsigned char*)dst)[7] = val; -#endif -} - -/** - * Decode a 64-bit value from the provided buffer (big endian convention). 
- * - * @param src the source buffer - * @return the decoded value - */ -static SPH_INLINE sph_u64 sph_dec64be(const void* src) { -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_LITTLE_ENDIAN - return sph_bswap64(*(const sph_u64*)src); -#else - return *(const sph_u64*)src; -#endif -#else - if (((SPH_UPTR)src & 7) == 0) { -#if SPH_LITTLE_ENDIAN - return sph_bswap64(*(const sph_u64*)src); -#else - return *(const sph_u64*)src; -#endif - } else { - return ((sph_u64)(((const unsigned char*)src)[0]) << 56) | - ((sph_u64)(((const unsigned char*)src)[1]) << 48) | - ((sph_u64)(((const unsigned char*)src)[2]) << 40) | - ((sph_u64)(((const unsigned char*)src)[3]) << 32) | - ((sph_u64)(((const unsigned char*)src)[4]) << 24) | - ((sph_u64)(((const unsigned char*)src)[5]) << 16) | - ((sph_u64)(((const unsigned char*)src)[6]) << 8) | - (sph_u64)(((const unsigned char*)src)[7]); - } -#endif -#else - return ((sph_u64)(((const unsigned char*)src)[0]) << 56) | - ((sph_u64)(((const unsigned char*)src)[1]) << 48) | - ((sph_u64)(((const unsigned char*)src)[2]) << 40) | - ((sph_u64)(((const unsigned char*)src)[3]) << 32) | - ((sph_u64)(((const unsigned char*)src)[4]) << 24) | - ((sph_u64)(((const unsigned char*)src)[5]) << 16) | - ((sph_u64)(((const unsigned char*)src)[6]) << 8) | - (sph_u64)(((const unsigned char*)src)[7]); -#endif -} - -/** - * Decode a 64-bit value from the provided buffer (big endian convention). - * The source buffer must be properly aligned. - * - * @param src the source buffer (64-bit aligned) - * @return the decoded value - */ -static SPH_INLINE sph_u64 sph_dec64be_aligned(const void* src) { -#if SPH_LITTLE_ENDIAN - return sph_bswap64(*(const sph_u64*)src); -#elif SPH_BIG_ENDIAN - return *(const sph_u64*)src; -#else - return ((sph_u64)(((const unsigned char*)src)[0]) << 56) | - ((sph_u64)(((const unsigned char*)src)[1]) << 48) | - ((sph_u64)(((const unsigned char*)src)[2]) << 40) | - ((sph_u64)(((const unsigned char*)src)[3]) << 32) | - ((sph_u64)(((const unsigned char*)src)[4]) << 24) | - ((sph_u64)(((const unsigned char*)src)[5]) << 16) | - ((sph_u64)(((const unsigned char*)src)[6]) << 8) | - (sph_u64)(((const unsigned char*)src)[7]); -#endif -} - -/** - * Encode a 64-bit value into the provided buffer (little endian convention). - * - * @param dst the destination buffer - * @param val the 64-bit value to encode - */ -static SPH_INLINE void sph_enc64le(void* dst, sph_u64 val) { -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_BIG_ENDIAN - val = sph_bswap64(val); -#endif - *(sph_u64*)dst = val; -#else - if (((SPH_UPTR)dst & 7) == 0) { -#if SPH_BIG_ENDIAN - val = sph_bswap64(val); -#endif - *(sph_u64*)dst = val; - } else { - ((unsigned char*)dst)[0] = val; - ((unsigned char*)dst)[1] = (val >> 8); - ((unsigned char*)dst)[2] = (val >> 16); - ((unsigned char*)dst)[3] = (val >> 24); - ((unsigned char*)dst)[4] = (val >> 32); - ((unsigned char*)dst)[5] = (val >> 40); - ((unsigned char*)dst)[6] = (val >> 48); - ((unsigned char*)dst)[7] = (val >> 56); - } -#endif -#else - ((unsigned char*)dst)[0] = val; - ((unsigned char*)dst)[1] = (val >> 8); - ((unsigned char*)dst)[2] = (val >> 16); - ((unsigned char*)dst)[3] = (val >> 24); - ((unsigned char*)dst)[4] = (val >> 32); - ((unsigned char*)dst)[5] = (val >> 40); - ((unsigned char*)dst)[6] = (val >> 48); - ((unsigned char*)dst)[7] = (val >> 56); -#endif -} - -/** - * Encode a 64-bit value into the provided buffer (little endian convention). - * The destination buffer must be properly aligned. 
- * - * @param dst the destination buffer (64-bit aligned) - * @param val the value to encode - */ -static SPH_INLINE void sph_enc64le_aligned(void* dst, sph_u64 val) { -#if SPH_LITTLE_ENDIAN - *(sph_u64*)dst = val; -#elif SPH_BIG_ENDIAN - *(sph_u64*)dst = sph_bswap64(val); -#else - ((unsigned char*)dst)[0] = val; - ((unsigned char*)dst)[1] = (val >> 8); - ((unsigned char*)dst)[2] = (val >> 16); - ((unsigned char*)dst)[3] = (val >> 24); - ((unsigned char*)dst)[4] = (val >> 32); - ((unsigned char*)dst)[5] = (val >> 40); - ((unsigned char*)dst)[6] = (val >> 48); - ((unsigned char*)dst)[7] = (val >> 56); -#endif -} - -/** - * Decode a 64-bit value from the provided buffer (little endian convention). - * - * @param src the source buffer - * @return the decoded value - */ -static SPH_INLINE sph_u64 sph_dec64le(const void* src) { -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_BIG_ENDIAN - return sph_bswap64(*(const sph_u64*)src); -#else - return *(const sph_u64*)src; -#endif -#else - if (((SPH_UPTR)src & 7) == 0) { -#if SPH_BIG_ENDIAN -#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM - sph_u64 tmp; - - __asm__ __volatile__("ldxa [%1]0x88,%0" : "=r"(tmp) : "r"(src)); - return tmp; -/* - * Not worth it generally. - * -#elif SPH_PPC32_GCC && !SPH_NO_ASM - return (sph_u64)sph_dec32le_aligned(src) - | ((sph_u64)sph_dec32le_aligned( - (const char *)src + 4) << 32); -#elif SPH_PPC64_GCC && !SPH_NO_ASM - sph_u64 tmp; - - __asm__ __volatile__ ( - "ldbrx %0,0,%1" : "=r" (tmp) : "r" (src)); - return tmp; - */ -#else - return sph_bswap64(*(const sph_u64*)src); -#endif -#else - return *(const sph_u64*)src; -#endif - } else { - return (sph_u64)(((const unsigned char*)src)[0]) | - ((sph_u64)(((const unsigned char*)src)[1]) << 8) | - ((sph_u64)(((const unsigned char*)src)[2]) << 16) | - ((sph_u64)(((const unsigned char*)src)[3]) << 24) | - ((sph_u64)(((const unsigned char*)src)[4]) << 32) | - ((sph_u64)(((const unsigned char*)src)[5]) << 40) | - ((sph_u64)(((const unsigned char*)src)[6]) << 48) | - ((sph_u64)(((const unsigned char*)src)[7]) << 56); - } -#endif -#else - return (sph_u64)(((const unsigned char*)src)[0]) | - ((sph_u64)(((const unsigned char*)src)[1]) << 8) | - ((sph_u64)(((const unsigned char*)src)[2]) << 16) | - ((sph_u64)(((const unsigned char*)src)[3]) << 24) | - ((sph_u64)(((const unsigned char*)src)[4]) << 32) | - ((sph_u64)(((const unsigned char*)src)[5]) << 40) | - ((sph_u64)(((const unsigned char*)src)[6]) << 48) | - ((sph_u64)(((const unsigned char*)src)[7]) << 56); -#endif -} - -/** - * Decode a 64-bit value from the provided buffer (little endian convention). - * The source buffer must be properly aligned. - * - * @param src the source buffer (64-bit aligned) - * @return the decoded value - */ -static SPH_INLINE sph_u64 sph_dec64le_aligned(const void* src) { -#if SPH_LITTLE_ENDIAN - return *(const sph_u64*)src; -#elif SPH_BIG_ENDIAN -#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM - sph_u64 tmp; - - __asm__ __volatile__("ldxa [%1]0x88,%0" : "=r"(tmp) : "r"(src)); - return tmp; -/* - * Not worth it generally. 
- * -#elif SPH_PPC32_GCC && !SPH_NO_ASM - return (sph_u64)sph_dec32le_aligned(src) - | ((sph_u64)sph_dec32le_aligned((const char *)src + 4) << 32); -#elif SPH_PPC64_GCC && !SPH_NO_ASM - sph_u64 tmp; - - __asm__ __volatile__ ("ldbrx %0,0,%1" : "=r" (tmp) : "r" (src)); - return tmp; - */ -#else - return sph_bswap64(*(const sph_u64*)src); -#endif -#else - return (sph_u64)(((const unsigned char*)src)[0]) | - ((sph_u64)(((const unsigned char*)src)[1]) << 8) | - ((sph_u64)(((const unsigned char*)src)[2]) << 16) | - ((sph_u64)(((const unsigned char*)src)[3]) << 24) | - ((sph_u64)(((const unsigned char*)src)[4]) << 32) | - ((sph_u64)(((const unsigned char*)src)[5]) << 40) | - ((sph_u64)(((const unsigned char*)src)[6]) << 48) | - ((sph_u64)(((const unsigned char*)src)[7]) << 56); -#endif -} - -#endif - -#endif /* Doxygen excluded block */ - -#endif diff --git a/src/lib.rs b/src/lib.rs index 65680a3..6f902bd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,5 @@ -use libc::{c_void, uint64_t}; +extern crate libc; +use libc::{c_char, uint64_t}; #[macro_use] extern crate cfg_if; @@ -6,9 +7,9 @@ use std::u64; extern "C" { pub fn find_best_deadline_sph( - scoops: *mut c_void, + scoops: *const c_char, nonce_count: uint64_t, - gensig: *const c_void, + gensig: *const c_char, best_deadline: *mut uint64_t, best_offset: *mut uint64_t, ) -> (); @@ -17,34 +18,38 @@ extern "C" { cfg_if! { if #[cfg(feature = "simd")] { extern "C" { + pub fn init_shabal_avx512f() -> (); pub fn find_best_deadline_avx512f( - scoops: *mut c_void, + scoops: *const c_char, nonce_count: uint64_t, - gensig: *const c_void, + gensig: *const c_char, best_deadline: *mut uint64_t, best_offset: *mut uint64_t, ) -> (); + pub fn init_shabal_avx2() -> (); pub fn find_best_deadline_avx2( - scoops: *mut c_void, + scoops: *const c_char, nonce_count: uint64_t, - gensig: *const c_void, + gensig: *const c_char, best_deadline: *mut uint64_t, best_offset: *mut uint64_t, ) -> (); + pub fn init_shabal_avx() -> (); pub fn find_best_deadline_avx( - scoops: *mut c_void, + scoops: *const c_char, nonce_count: uint64_t, - gensig: *const c_void, + gensig: *const c_char, best_deadline: *mut uint64_t, best_offset: *mut uint64_t, ) -> (); + pub fn init_shabal_sse2() -> (); pub fn find_best_deadline_sse2( - scoops: *mut c_void, + scoops: *const c_char, nonce_count: uint64_t, - gensig: *const c_void, + gensig: *const c_char, best_deadline: *mut uint64_t, best_offset: *mut uint64_t, ) -> (); @@ -55,10 +60,11 @@ cfg_if! { cfg_if! { if #[cfg(feature = "neon")] { extern "C" { + pub fn init_shabal_neon() -> (); pub fn find_best_deadline_neon( - scoops: *mut c_void, + scoops: *const c_char, nonce_count: uint64_t, - gensig: *const c_void, + gensig: *const c_char, best_deadline: *mut uint64_t, best_offset: *mut uint64_t, ) -> (); @@ -67,10 +73,10 @@ cfg_if! 
{ } #[no_mangle] -pub extern fn find_best_deadline( - scoops: *mut c_void, +pub extern fn shabal_findBestDeadlineDirect( + scoops: *const c_char, nonce_count: uint64_t, - gensig: *const c_void, + gensig: *const c_char, best_deadline: *mut uint64_t, best_offset: *mut uint64_t, ) { @@ -155,14 +161,40 @@ pub extern fn find_best_deadline( } #[no_mangle] -pub extern fn find_best_deadline_assisted( - scoops: *mut c_void, +pub extern fn shabal_init() { + #[cfg(feature = "simd")] + unsafe { + if is_x86_feature_detected!("avx512f") { + init_shabal_avx512f(); + } else if is_x86_feature_detected!("avx2") { + init_shabal_avx2(); + } else if is_x86_feature_detected!("avx") { + init_shabal_avx(); + } else if is_x86_feature_detected!("sse2") { + init_shabal_sse2(); + } + } + #[cfg(feature = "neon")] + unsafe { + #[cfg(target_arch = "arm")] + let neon = is_arm_feature_detected!("neon"); + #[cfg(target_arch = "aarch64")] + let neon = true; + + if neon { + init_shabal_neon(); + } + } +} + +#[no_mangle] +pub extern fn shabal_findBestDeadline( + scoops: *const c_char, nonce_count: uint64_t, - gensig: *const c_void, + gensig: *const c_char, ) -> uint64_t { let mut deadline: u64 = u64::MAX; let mut offset: u64 = 0; - find_best_deadline(scoops, nonce_count, gensig, &mut deadline, &mut offset); - println!("scoop length is {}, best deadline is {}, best offset is {}", deadline, offset); + shabal_findBestDeadlineDirect(scoops, nonce_count, gensig, &mut deadline, &mut offset); return offset; }
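
Usage note (not part of the patch): the sketch below shows how a C consumer might drive the renamed exports once this change lands. It is illustrative only; the 64-byte scoop and 32-byte generation-signature sizes are assumptions taken from Burst plot conventions, not something this diff defines.

/* Hypothetical caller of the library built from this crate. */
#include <stdint.h>
#include <stdio.h>

/* C-ABI exports declared with #[no_mangle] in src/lib.rs above. */
extern void shabal_init(void);
extern uint64_t shabal_findBestDeadline(const char *scoops,
                                        uint64_t nonce_count,
                                        const char *gensig);

int main(void) {
    /* Assumed sizes: 64 bytes per scoop, 32-byte generation signature.
     * Static storage, so both buffers start zero-filled for the demo. */
    static char scoops[4 * 64];
    static char gensig[32];

    shabal_init(); /* one-time dispatch: AVX-512F > AVX2 > AVX > SSE2, or NEON */
    uint64_t best_offset = shabal_findBestDeadline(scoops, 4, gensig);
    printf("best offset: %llu\n", (unsigned long long)best_offset);
    return 0;
}

Note that shabal_findBestDeadline returns only the best offset; a caller that also needs the deadline value itself should use shabal_findBestDeadlineDirect, which writes both results through its out-pointers.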