From b2067d8d3c77405f972701d7fa35baec0c0a631f Mon Sep 17 00:00:00 2001
From: Harry Phillips
Date: Mon, 11 Feb 2019 15:34:50 +0000
Subject: [PATCH] Working

---
 .gitignore                  |    1 +
 .gitmodules                 |    3 +
 .travis.yml                 |   28 +
 Cargo.toml                  |    2 +-
 build.rs                    |   26 +-
 build.sh                    |    6 +
 scavenger                   |    1 +
 src/c/SSE2NEON.h            | 1690 ------------------------------
 src/c/common.c              |   15 -
 src/c/common.h              |   65 --
 src/c/mshabal_128_avx.c     |  966 ------------------
 src/c/mshabal_128_avx.h     |  174 ----
 src/c/mshabal_128_neon.c    |  963 ------------------
 src/c/mshabal_128_neon.h    |  174 ----
 src/c/mshabal_128_sse2.c    |  963 ------------------
 src/c/mshabal_128_sse2.h    |  174 ----
 src/c/mshabal_256_avx2.c    | 1086 --------------------
 src/c/mshabal_256_avx2.h    |  179 ----
 src/c/mshabal_512_avx512f.c | 1318 ------------------------
 src/c/mshabal_512_avx512f.h |  195 ----
 src/c/shabal.c              |   13 -
 src/c/shabal.h              |    7 -
 src/c/shabal_avx.c          |   75 --
 src/c/shabal_avx.h          |    9 -
 src/c/shabal_avx2.c         |   95 --
 src/c/shabal_avx2.h         |    9 -
 src/c/shabal_avx512f.c      |  138 ---
 src/c/shabal_avx512f.h      |    9 -
 src/c/shabal_neon.c         |   75 --
 src/c/shabal_neon.h         |    9 -
 src/c/shabal_sse2.c         |   75 --
 src/c/shabal_sse2.h         |    9 -
 src/c/sph_shabal.c          |  693 -------------
 src/c/sph_shabal.h          |  133 ---
 src/c/sph_types.h           | 1912 -----------------------------------
 src/lib.rs                  |   74 +-
 36 files changed, 106 insertions(+), 11258 deletions(-)
 create mode 100644 .gitmodules
 create mode 100644 .travis.yml
 create mode 100755 build.sh
 create mode 160000 scavenger
 delete mode 100644 src/c/SSE2NEON.h
 delete mode 100644 src/c/common.c
 delete mode 100644 src/c/common.h
 delete mode 100644 src/c/mshabal_128_avx.c
 delete mode 100644 src/c/mshabal_128_avx.h
 delete mode 100644 src/c/mshabal_128_neon.c
 delete mode 100644 src/c/mshabal_128_neon.h
 delete mode 100644 src/c/mshabal_128_sse2.c
 delete mode 100644 src/c/mshabal_128_sse2.h
 delete mode 100644 src/c/mshabal_256_avx2.c
 delete mode 100644 src/c/mshabal_256_avx2.h
 delete mode 100644 src/c/mshabal_512_avx512f.c
 delete mode 100644 src/c/mshabal_512_avx512f.h
 delete mode 100644 src/c/shabal.c
 delete mode 100644 src/c/shabal.h
 delete mode 100644 src/c/shabal_avx.c
 delete mode 100644 src/c/shabal_avx.h
 delete mode 100644 src/c/shabal_avx2.c
 delete mode 100644 src/c/shabal_avx2.h
 delete mode 100644 src/c/shabal_avx512f.c
 delete mode 100644 src/c/shabal_avx512f.h
 delete mode 100644 src/c/shabal_neon.c
 delete mode 100644 src/c/shabal_neon.h
 delete mode 100644 src/c/shabal_sse2.c
 delete mode 100644 src/c/shabal_sse2.h
 delete mode 100644 src/c/sph_shabal.c
 delete mode 100644 src/c/sph_shabal.h
 delete mode 100644 src/c/sph_types.h

diff --git a/.gitignore b/.gitignore
index 6936990..22676ce 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 /target
 **/*.rs.bk
 Cargo.lock
+.idea/
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..0aef76e
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "scavenger"]
+	path = scavenger
+	url = https://github.com/PoC-Consortium/scavenger
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..ce3ac59
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,28 @@
+language: rust
+cache: cargo
+
+before_deploy:
+  - ./build.sh
+
+matrix:
+  include:
+    - os: osx
+      before_install:
+        - brew install gcc
+    - os: linux
+      before_install:
+        - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
+        - sudo apt-get update && sudo apt-get install -y gcc-8
+        - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 800 --slave /usr/bin/g++ g++ /usr/bin/g++-8
+
+deploy:
+  provider: releases
+  api_key: $API_KEY
+  file: ${TRAVIS_OS_NAME}.zip
+  skip_cleanup: true
+  on:
+    branch: master
+    tags: true
+
+notifications:
+  email: false
diff --git a/Cargo.toml b/Cargo.toml
index b282948..afaacc3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,7 +6,7 @@ edition = "2018"
 build = "build.rs"
 
 [lib]
-name = "shabal_lib"
+name = "shabal"
 crate-type = ["dylib"]
 
 [features]
diff --git a/build.rs b/build.rs
index 6128eaf..d27a725 100644
--- a/build.rs
+++ b/build.rs
@@ -27,9 +27,9 @@ fn main() {
     let mut config = shared_config.clone();
 
     config
-        .file("src/c/sph_shabal.c")
-        .file("src/c/shabal.c")
-        .file("src/c/common.c")
+        .file("scavenger/src/c/sph_shabal.c")
+        .file("scavenger/src/c/shabal.c")
+        .file("scavenger/src/c/common.c")
         .compile("shabal");
 
     generate_bindings();
@@ -43,8 +43,8 @@ fn main() {
         config.flag("-mfpu=neon");
 
         config
-            .file("src/c/mshabal_128_neon.c")
-            .file("src/c/shabal_neon.c")
+            .file("scavenger/src/c/mshabal_128_neon.c")
+            .file("scavenger/src/c/shabal_neon.c")
             .compile("shabal_neon");
     }
 }
@@ -59,8 +59,8 @@ fn main() {
         config.flag("-msse2");
 
         config
-            .file("src/c/mshabal_128_sse2.c")
-            .file("src/c/shabal_sse2.c")
+            .file("scavenger/src/c/mshabal_128_sse2.c")
+            .file("scavenger/src/c/shabal_sse2.c")
             .compile("shabal_sse2");
 
         let mut config = shared_config.clone();
@@ -72,8 +72,8 @@ fn main() {
        config.flag("-mavx");
 
         config
-            .file("src/c/mshabal_128_avx.c")
-            .file("src/c/shabal_avx.c")
+            .file("scavenger/src/c/mshabal_128_avx.c")
+            .file("scavenger/src/c/shabal_avx.c")
             .compile("shabal_avx");
 
         let mut config = shared_config.clone();
@@ -85,8 +85,8 @@ fn main() {
         config.flag("-mavx2");
 
         config
-            .file("src/c/mshabal_256_avx2.c")
-            .file("src/c/shabal_avx2.c")
+            .file("scavenger/src/c/mshabal_256_avx2.c")
+            .file("scavenger/src/c/shabal_avx2.c")
             .compile("shabal_avx2");
 
         let mut config = shared_config.clone();
@@ -98,8 +98,8 @@ fn main() {
         config.flag("-mavx512f");
 
         config
-            .file("src/c/mshabal_512_avx512f.c")
-            .file("src/c/shabal_avx512f.c")
+            .file("scavenger/src/c/mshabal_512_avx512f.c")
+            .file("scavenger/src/c/shabal_avx512f.c")
             .compile("shabal_avx512f");
     }
 }
diff --git a/build.sh b/build.sh
new file mode 100755
index 0000000..7948af8
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+cargo build --release --features "simd"
+
+cd target/release
+zip ../../${TRAVIS_OS_NAME}.zip libshabal.*
diff --git a/scavenger b/scavenger
new file mode 160000
index 0000000..775b0ea
--- /dev/null
+++ b/scavenger
@@ -0,0 +1 @@
+Subproject commit 775b0ea52fdc22c81ad2cc0ec20a787c20922aef
diff --git a/src/c/SSE2NEON.h b/src/c/SSE2NEON.h
deleted file mode 100644
index 60495ed..0000000
--- a/src/c/SSE2NEON.h
+++ /dev/null
@@ -1,1690 +0,0 @@
-#ifndef SSE2NEON_H
-#define SSE2NEON_H
-
-// This header file provides a simple API translation layer
-// between SSE intrinsics and their corresponding ARM NEON versions
-//
-// This header file does not (yet) translate *all* of the SSE intrinsics.
-// Since this is in support of a specific porting effort, I have only
-// included the intrinsics I needed to get my port to work.
-//
-// Questions/Comments/Feedback send to: jratcliffscarab@gmail.com
-//
-// If you want to improve or add to this project, send me an
-// email and I will probably approve your access to the depot.
-//
-// Project is located here:
-//
-// https://github.com/jratcliff63367/sse2neon
-//
-// Show your appreciation for open source by sending me a bitcoin tip to the following
-// address.
-//
-// TipJar: 1PzgWDSyq4pmdAXRH8SPUtta4SWGrt4B1p :
-// https://blockchain.info/address/1PzgWDSyq4pmdAXRH8SPUtta4SWGrt4B1p
-//
-//
-// Contributors to this project are:
-//
-// John W. Ratcliff     : jratcliffscarab@gmail.com
-// Brandon Rowlett      : browlett@nvidia.com
-// Ken Fast             : kfast@gdeb.com
-// Eric van Beurden     : evanbeurden@nvidia.com
-// Alexander Potylitsin : apotylitsin@nvidia.com
-//
-//
-// *********************************************************************************************************************
-// apoty: March 17, 2017
-// Most of the current version was changed to fix issues and potential issues.
-// All unit tests were rewritten as a part of the forge lib project to cover all implemented functions.
-// *********************************************************************************************************************
-// Release notes for January 20, 2017 version:
-//
-// The unit tests have been refactored. They no longer assert on an error; instead they return a pass/fail condition.
-// The unit-tests now test 10,000 random float and int values against each intrinsic.
-//
-// SSE2NEON now supports 95 SSE intrinsics. 39 of them have formal unit tests which have been implemented and
-// fully tested on NEON/ARM. The remaining 56 still need unit tests implemented.
-//
-// A struct is now defined in this header file called 'SIMDVec' which can be used by applications which
-// attempt to access the contents of an _m128 struct directly. It is important to note that accessing the __m128
-// struct directly is considered bad coding practice by Microsoft: @see: https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
-//
-// However, some legacy source code may try to access the contents of an __m128 struct directly so the developer
-// can use the SIMDVec as an alias for it. Any casting must be done manually by the developer, as you cannot
-// cast or otherwise alias the base NEON data type for intrinsic operations.
-//
-// A bug was found with the _mm_shuffle_ps intrinsic. If the shuffle permutation was not one of the ones with
-// a custom/unique implementation, causing it to fall through to the default shuffle implementation, it was failing
-// to return the correct value. This is now fixed.
-//
-// A bug was found with the _mm_cvtps_epi32 intrinsic. This converts floating point values to integers.
-// It was not honoring the correct rounding mode. In SSE the default rounding mode when converting from float to int
-// is to use 'round to even' otherwise known as 'bankers rounding'. ARMv7 did not support this feature but ARMv8 does.
-// As it stands today, this header file assumes ARMv8. If you are trying to target really old ARM devices, you may get
-// a build error.
-//
-// Support for a number of new intrinsics was added; however, none of them yet have unit-tests to 100% confirm they are
-// producing the correct results on NEON. These unit tests will be added as soon as possible.
-//
-// Here is the list of new intrinsics which have been added:
-//
-// _mm_cvtss_f32 : extracts the lower order floating point value from the parameter
-// _mm_add_ss : adds the scalar single - precision floating point values of a and b
-// _mm_div_ps : Divides the four single - precision, floating - point values of a and b.
-// _mm_div_ss : Divides the scalar single - precision floating point value of a by b.
-// _mm_sqrt_ss : Computes the approximation of the square root of the scalar single - precision floating point value of in.
-// _mm_rsqrt_ps : Computes the approximations of the reciprocal square roots of the four single - precision floating point values of in.
-// _mm_comilt_ss : Compares the lower single - precision floating point scalar values of a and b using a less than operation
-// _mm_comigt_ss : Compares the lower single - precision floating point scalar values of a and b using a greater than operation.
-// _mm_comile_ss : Compares the lower single - precision floating point scalar values of a and b using a less than or equal operation.
-// _mm_comige_ss : Compares the lower single - precision floating point scalar values of a and b using a greater than or equal operation.
-// _mm_comieq_ss : Compares the lower single - precision floating point scalar values of a and b using an equality operation.
-// _mm_comineq_s : Compares the lower single - precision floating point scalar values of a and b using an inequality operation
-// _mm_unpackhi_epi8 : Interleaves the upper 8 signed or unsigned 8 - bit integers in a with the upper 8 signed or unsigned 8 - bit integers in b.
-// _mm_unpackhi_epi16: Interleaves the upper 4 signed or unsigned 16 - bit integers in a with the upper 4 signed or unsigned 16 - bit integers in b.
-//
-// *********************************************************************************************************************
-/*
-** The MIT license:
-**
-** Permission is hereby granted, free of charge, to any person obtaining a copy
-** of this software and associated documentation files (the "Software"), to deal
-** in the Software without restriction, including without limitation the rights
-** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-** copies of the Software, and to permit persons to whom the Software is furnished
-** to do so, subject to the following conditions:
-**
-** The above copyright notice and this permission notice shall be included in all
-** copies or substantial portions of the Software.
-
-** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-** WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-** CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-#define ENABLE_CPP_VERSION 0
-
-#if defined(__GNUC__) || defined(__clang__)
-# pragma push_macro("FORCE_INLINE")
-# pragma push_macro("ALIGN_STRUCT")
-# define FORCE_INLINE static inline __attribute__((always_inline))
-# define ALIGN_STRUCT(x) __attribute__((aligned(x)))
-#else
-# error "Macro name collisions may happen with unknown compiler"
-# define FORCE_INLINE static inline
-# define ALIGN_STRUCT(x) __declspec(align(x))
-#endif
-
-#include <stdint.h>
-#include "arm_neon.h"
-
-
-/*******************************************************/
-/* MACRO for shuffle parameter for _mm_shuffle_ps().   */
-/* Argument fp3 is a digit[0123] that represents the fp*/
-/* from argument "b" of mm_shuffle_ps that will be     */
-/* placed in fp3 of result. fp2 is the same for fp2 in */
-/* result. fp1 is a digit[0123] that represents the fp */
-/* from argument "a" of mm_shuffle_ps that will be     */
-/* placed in fp1 of result. fp0 is the same for fp0 of */
-/* result                                              */
-/*******************************************************/
-#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
-    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))

-/* indicate immediate constant argument in a given range */
-#define __constrange(a,b) \
-    const
-
-typedef float32x4_t __m128;
-typedef int32x4_t __m128i;
-
-
-// ******************************************
-// type-safe casting between types
-// ******************************************
-
-#define vreinterpretq_m128_f16(x) \
-    vreinterpretq_f32_f16(x)
-
-#define vreinterpretq_m128_f32(x) \
-    (x)
-
-#define vreinterpretq_m128_f64(x) \
-    vreinterpretq_f32_f64(x)
-
-
-#define vreinterpretq_m128_u8(x) \
-    vreinterpretq_f32_u8(x)
-
-#define vreinterpretq_m128_u16(x) \
-    vreinterpretq_f32_u16(x)
-
-#define vreinterpretq_m128_u32(x) \
-    vreinterpretq_f32_u32(x)
-
-#define vreinterpretq_m128_u64(x) \
-    vreinterpretq_f32_u64(x)
-
-
-#define vreinterpretq_m128_s8(x) \
-    vreinterpretq_f32_s8(x)
-
-#define vreinterpretq_m128_s16(x) \
-    vreinterpretq_f32_s16(x)
-
-#define vreinterpretq_m128_s32(x) \
-    vreinterpretq_f32_s32(x)
-
-#define vreinterpretq_m128_s64(x) \
-    vreinterpretq_f32_s64(x)
-
-
-#define vreinterpretq_f16_m128(x) \
-    vreinterpretq_f16_f32(x)
-
-#define vreinterpretq_f32_m128(x) \
-    (x)
-
-#define vreinterpretq_f64_m128(x) \
-    vreinterpretq_f64_f32(x)
-
-
-#define vreinterpretq_u8_m128(x) \
-    vreinterpretq_u8_f32(x)
-
-#define vreinterpretq_u16_m128(x) \
-    vreinterpretq_u16_f32(x)
-
-#define vreinterpretq_u32_m128(x) \
-    vreinterpretq_u32_f32(x)
-
-#define vreinterpretq_u64_m128(x) \
-    vreinterpretq_u64_f32(x)
-
-
-#define vreinterpretq_s8_m128(x) \
-    vreinterpretq_s8_f32(x)
-
-#define vreinterpretq_s16_m128(x) \
-    vreinterpretq_s16_f32(x)
-
-#define vreinterpretq_s32_m128(x) \
-    vreinterpretq_s32_f32(x)
-
-#define vreinterpretq_s64_m128(x) \
-    vreinterpretq_s64_f32(x)
-
-
-#define vreinterpretq_m128i_s8(x) \
-    vreinterpretq_s32_s8(x)
-
-#define vreinterpretq_m128i_s16(x) \
-    vreinterpretq_s32_s16(x)
-
-#define vreinterpretq_m128i_s32(x) \
-    (x)
-
-#define vreinterpretq_m128i_s64(x) \
-    vreinterpretq_s32_s64(x)
-
-
-#define vreinterpretq_m128i_u8(x) \
-    vreinterpretq_s32_u8(x)
-
-#define vreinterpretq_m128i_u16(x) \
-    vreinterpretq_s32_u16(x)
-
-#define vreinterpretq_m128i_u32(x) \
-    vreinterpretq_s32_u32(x)
-
-#define vreinterpretq_m128i_u64(x) \
-    vreinterpretq_s32_u64(x)
-
-
-#define vreinterpretq_s8_m128i(x) \
-    vreinterpretq_s8_s32(x)
-
-#define vreinterpretq_s16_m128i(x) \
-    vreinterpretq_s16_s32(x)
-
-#define vreinterpretq_s32_m128i(x) \
-    (x)
-
-#define vreinterpretq_s64_m128i(x) \
-    vreinterpretq_s64_s32(x)
-
-
-#define vreinterpretq_u8_m128i(x) \
-    vreinterpretq_u8_s32(x)
-
-#define vreinterpretq_u16_m128i(x) \
-    vreinterpretq_u16_s32(x)
-
-#define vreinterpretq_u32_m128i(x) \
-    vreinterpretq_u32_s32(x)
-
-#define vreinterpretq_u64_m128i(x) \
-    vreinterpretq_u64_s32(x)
-
-
-// union intended to allow direct access to an __m128 variable using the names that the MSVC
-// compiler provides. This union should really only be used when trying to access the members
-// of the vector as integer values. GCC/clang allow native access to the float members through
-// a simple array access operator (in C since 4.6, in C++ since 4.8).
-//
-// Ideally direct accesses to SIMD vectors should not be used since it can cause a performance
-// hit. If it really is needed however, the original __m128 variable can be aliased with a
-// pointer to this union and used to access individual components. The use of this union should
-// be hidden behind a macro that is used throughout the codebase to access the members instead
-// of always declaring this type of variable.
-typedef union ALIGN_STRUCT(16) SIMDVec
-{
-    float       m128_f32[4];    // as floats - do not use this. Added for convenience.
-    int8_t      m128_i8[16];    // as signed 8-bit integers.
-    int16_t     m128_i16[8];    // as signed 16-bit integers.
-    int32_t     m128_i32[4];    // as signed 32-bit integers.
-    int64_t     m128_i64[2];    // as signed 64-bit integers.
-    uint8_t     m128_u8[16];    // as unsigned 8-bit integers.
-    uint16_t    m128_u16[8];    // as unsigned 16-bit integers.
-    uint32_t    m128_u32[4];    // as unsigned 32-bit integers.
-    uint64_t    m128_u64[2];    // as unsigned 64-bit integers.
-} SIMDVec;
-
-
-// ******************************************
-// Set/get methods
-// ******************************************
-
-// extracts the lower order floating point value from the parameter : https://msdn.microsoft.com/en-us/library/bb514059%28v=vs.120%29.aspx?f=255&MSPPError=-2147217396
-FORCE_INLINE float _mm_cvtss_f32(__m128 a)
-{
-    return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-}
-
-// Sets the 128-bit value to zero https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_setzero_si128()
-{
-    return vreinterpretq_m128i_s32(vdupq_n_s32(0));
-}
-
-// Clears the four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_setzero_ps(void)
-{
-    return vreinterpretq_m128_f32(vdupq_n_f32(0));
-}
-
-// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_set1_ps(float _w)
-{
-    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
-}
-
-// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_set_ps1(float _w)
-{
-    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
-}
-
-// Sets the four single-precision, floating-point values to the four inputs. https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
-{
-    float __attribute__((aligned(16))) data[4] = { x, y, z, w };
-    return vreinterpretq_m128_f32(vld1q_f32(data));
-}
-
-// Sets the four single-precision, floating-point values to the four inputs in reverse order. https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_setr_ps(float w, float z , float y , float x )
-{
-    float __attribute__ ((aligned (16))) data[4] = { w, z, y, x };
-    return vreinterpretq_m128_f32(vld1q_f32(data));
-}
-
-
-//added by hasindu
-//Sets the 4 signed 32-bit integer values in reverse order https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
-{
-    int32_t __attribute__((aligned(16))) data[4] = { i3, i2, i1, i0 };
-    return vreinterpretq_m128i_s32(vld1q_s32(data));
-}
-
-//following added by hasindu
-//Sets the 16 signed 8-bit integer values to b.https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_set1_epi8(char w)
-{
-    return vreinterpretq_m128i_s8(vdupq_n_s8(w));
-}
-
-
-//following added by hasindu
-//Sets the 8 signed 16-bit integer values to w. https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_set1_epi16(short w)
-{
-    return vreinterpretq_m128i_s16(vdupq_n_s16(w));
-}
-
-//following added by hasindu
-//Sets the 8 signed 16-bit integer values. https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_set_epi16(short i7, short i6, short i5, short i4, short i3, short i2, short i1, short i0)
-{
-    int16_t __attribute__((aligned(16))) data[8] = { i0, i1, i2, i3, i4, i5, i6, i7 };
-    return vreinterpretq_m128i_s16(vld1q_s16(data));
-}
-
-
-// Sets the 4 signed 32-bit integer values to i. https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_set1_epi32(int _i)
-{
-    return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
-}
-
-// Sets the 4 signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
-{
-    int32_t __attribute__((aligned(16))) data[4] = { i0, i1, i2, i3 };
-    return vreinterpretq_m128i_s32(vld1q_s32(data));
-}
-
-// Stores four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
-FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
-{
-    vst1q_f32(p, vreinterpretq_f32_m128(a));
-}
-
-// Stores four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
-FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
-{
-    vst1q_f32(p, vreinterpretq_f32_m128(a));
-}
-
-// Stores four 32-bit integer values (as a __m128i value) at the address p. https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
-FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
-{
-    vst1q_s32((int32_t*) p, vreinterpretq_s32_m128i(a));
-}
-
-//added by hasindu (verify this for requirement of alignment)
-// Stores four 32-bit integer values (as a __m128i value) at the address p. https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
-FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
-{
-    vst1q_s32((int32_t*) p, vreinterpretq_s32_m128i(a));
-}
-
-// Stores the lower single - precision, floating - point value. https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
-FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
-{
-    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
-}
-
-// Reads the lower 64 bits of b and stores them into the lower 64 bits of a. https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
-FORCE_INLINE void _mm_storel_epi64(__m128i* a, __m128i b)
-{
-    uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
-    uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
-    *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
-}
-
-// Loads a single single-precision, floating-point value, copying it into all four words https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_load1_ps(const float * p)
-{
-    return vreinterpretq_m128_f32(vld1q_dup_f32(p));
-}
-
-// Loads four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_load_ps(const float * p)
-{
-    return vreinterpretq_m128_f32(vld1q_f32(p));
-}
-
-// Loads four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_loadu_ps(const float * p)
-{
-    // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are equivalent for neon
-    return vreinterpretq_m128_f32(vld1q_f32(p));
-}
-
-// Loads a single - precision, floating - point value into the low word and clears the upper three words. https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_load_ss(const float * p)
-{
-    return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
-}
-
-
-// ******************************************
-// Logic/Binary operations
-// ******************************************
-
-// Compares for inequality. https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32( vmvnq_u32( vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)) ) );
-}
-
-// Computes the bitwise AND-NOT of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_s32( vbicq_s32(vreinterpretq_s32_m128(b), vreinterpretq_s32_m128(a)) ); // *NOTE* argument swap
-}
-
-// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the 128-bit value in a. https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32( vbicq_s32(vreinterpretq_s32_m128i(b), vreinterpretq_s32_m128i(a)) ); // *NOTE* argument swap
-}
-
-// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in b. https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32( vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)) );
-}
-
-// Computes the bitwise AND of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_s32( vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)) );
-}
-
-// Computes the bitwise OR of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_s32( vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)) );
-}
-
-// Computes bitwise EXOR (exclusive-or) of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_s32( veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)) );
-}
-
-// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32( vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)) );
-}
-
-// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32( veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)) );
-}
-
-// NEON does not provide this method
-// Creates a 4-bit mask from the most significant bits of the four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
-FORCE_INLINE int _mm_movemask_ps(__m128 a)
-{
-#if ENABLE_CPP_VERSION // I am not yet convinced that the NEON version is faster than the C version of this
-    uint32x4_t &ia = *(uint32x4_t *)&a;
-    return (ia[0] >> 31) | ((ia[1] >> 30) & 2) | ((ia[2] >> 29) & 4) | ((ia[3] >> 28) & 8);
-#else
-    static const uint32x4_t movemask = { 1, 2, 4, 8 };
-    static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
-    uint32x4_t t0 = vreinterpretq_u32_m128(a);
-    uint32x4_t t1 = vtstq_u32(t0, highbit);
-    uint32x4_t t2 = vandq_u32(t1, movemask);
-    uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
-    return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
-#endif
-}
-
-// Takes the upper 64 bits of a and places it in the low end of the result
-// Takes the lower 64 bits of b and places it into the high end of the result.
-FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
-{
-    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
-}
-
-// takes the lower two 32-bit values from a and swaps them and places in high end of result
-// takes the higher two 32 bit values from b and swaps them and places in low end of result.
-FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
-{
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
-{
-    float32x2_t a21 = vget_high_f32(vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
-    float32x2_t b03 = vget_low_f32(vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
-    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
-{
-    float32x2_t a03 = vget_low_f32(vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
-    float32x2_t b21 = vget_high_f32(vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
-    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
-{
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
-{
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
-{
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
-}
-
-// keeps the low 64 bits of a in the low and puts the high 64 bits of b in the high
-FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
-{
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
-{
-    float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
-{
-    float32x2_t a22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
-{
-    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
-    float32x2_t b22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
-    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
-{
-    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-    float32x2_t a22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
-    float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* apoty: TODO: use vzip ?*/
-    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
-{
-    float32x2_t a33 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
-    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
-{
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
-{
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32_t b2 = vgetq_lane_f32(b, 2);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
-{
-    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32_t b2 = vgetq_lane_f32(b, 2);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
-}
-
-// NEON does not support a general purpose permute intrinsic
-// Currently I am not sure whether the C implementation is faster or slower than the NEON version.
-// Note, this has to be expanded as a template because the shuffle value must be an immediate value.
-// The same is true on SSE as well.
-// Selects four specific single-precision, floating-point values from a and b, based on the mask i. https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
-#if ENABLE_CPP_VERSION // I am not convinced that the NEON version is faster than the C version yet.
-FORCE_INLINE __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, __constrange(0,255) int imm)
-{
-    __m128 ret;
-    ret[0] = a[imm & 0x3];
-    ret[1] = a[(imm >> 2) & 0x3];
-    ret[2] = b[(imm >> 4) & 0x03];
-    ret[3] = b[(imm >> 6) & 0x03];
-    return ret;
-}
-#else
-#define _mm_shuffle_ps_default(a, b, imm) \
-({ \
-    float32x4_t ret; \
-    ret = vmovq_n_f32(vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & 0x3)); \
-    ret = vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), ret, 1); \
-    ret = vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), ret, 2); \
-    ret = vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), ret, 3); \
-    vreinterpretq_m128_f32(ret); \
-})
-#endif
-
-//FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) int imm)
-#define _mm_shuffle_ps(a, b, imm) \
-({ \
-    __m128 ret; \
-    switch (imm) \
-    { \
-        case _MM_SHUFFLE(1, 0, 3, 2): ret = _mm_shuffle_ps_1032((a), (b)); break; \
-        case _MM_SHUFFLE(2, 3, 0, 1): ret = _mm_shuffle_ps_2301((a), (b)); break; \
-        case _MM_SHUFFLE(0, 3, 2, 1): ret = _mm_shuffle_ps_0321((a), (b)); break; \
-        case _MM_SHUFFLE(2, 1, 0, 3): ret = _mm_shuffle_ps_2103((a), (b)); break; \
-        case _MM_SHUFFLE(1, 0, 1, 0): ret = _mm_shuffle_ps_1010((a), (b)); break; \
-        case _MM_SHUFFLE(1, 0, 0, 1): ret = _mm_shuffle_ps_1001((a), (b)); break; \
-        case _MM_SHUFFLE(0, 1, 0, 1): ret = _mm_shuffle_ps_0101((a), (b)); break; \
-        case _MM_SHUFFLE(3, 2, 1, 0): ret = _mm_shuffle_ps_3210((a), (b)); break; \
-        case _MM_SHUFFLE(0, 0, 1, 1): ret = _mm_shuffle_ps_0011((a), (b)); break; \
-        case _MM_SHUFFLE(0, 0, 2, 2): ret = _mm_shuffle_ps_0022((a), (b)); break; \
-        case _MM_SHUFFLE(2, 2, 0, 0): ret = _mm_shuffle_ps_2200((a), (b)); break; \
-        case _MM_SHUFFLE(3, 2, 0, 2): ret = _mm_shuffle_ps_3202((a), (b)); break; \
-        case _MM_SHUFFLE(1, 1, 3, 3): ret = _mm_shuffle_ps_1133((a), (b)); break; \
-        case _MM_SHUFFLE(2, 0, 1, 0): ret = _mm_shuffle_ps_2010((a), (b)); break; \
-        case _MM_SHUFFLE(2, 0, 0, 1): ret = _mm_shuffle_ps_2001((a), (b)); break; \
-        case _MM_SHUFFLE(2, 0, 3, 2): ret = _mm_shuffle_ps_2032((a), (b)); break; \
-        default: ret = _mm_shuffle_ps_default((a), (b), (imm)); break; \
-    } \
-    ret; \
-})
-
-// Takes the upper 64 bits of a and places it in the low end of the result
-// Takes the lower 64 bits of a and places it into the high end of the result.
-FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
-{
-    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
-}
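
// A scalar sketch (illustrative only, not part of the original header): the
// _mm_shuffle_ps dispatch above keys on the _MM_SHUFFLE immediate, which packs
// four 2-bit lane selectors into one byte; bits 1:0 and 3:2 pick the two low
// result lanes from a, and bits 5:4 and 7:6 pick the two high result lanes
// from b. Decoded in plain C:
//
//   int imm = _MM_SHUFFLE(1, 0, 3, 2);   /* (1<<6)|(0<<4)|(3<<2)|2 == 0x4E */
//   float a[4] = {10, 11, 12, 13}, b[4] = {20, 21, 22, 23}, r[4];
//   r[0] = a[imm & 0x3];                 /* a[2] == 12 */
//   r[1] = a[(imm >> 2) & 0x3];          /* a[3] == 13 */
//   r[2] = b[(imm >> 4) & 0x3];          /* b[0] == 20 */
//   r[3] = b[(imm >> 6) & 0x3];          /* b[1] == 21 */
//
// which is exactly the _MM_SHUFFLE(1, 0, 3, 2) case handled by
// _mm_shuffle_ps_1032 above.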
-
-// takes the lower two 32-bit values from a and swaps them and places in low end of result
-// takes the higher two 32 bit values from a and swaps them and places in high end of result.
-FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
-{
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
-    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
-}
-
-// rotates the least significant 32 bits into the most significant 32 bits, and shifts the rest down
-FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
-{
-    return vreinterpretq_m128i_s32(vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
-}
-
-// rotates the most significant 32 bits into the least significant 32 bits, and shifts the rest up
-FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
-{
-    return vreinterpretq_m128i_s32(vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
-}
-
-// gets the lower 64 bits of a, and places it in the upper 64 bits
-// gets the lower 64 bits of a and places it in the lower 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
-{
-    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
-}
-
-// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the lower 64 bits
-// gets the lower 64 bits of a, and places it in the upper 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
-{
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
-}
-
-// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the upper 64 bits
-// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the lower 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
-{
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
-{
-    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
-    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
-    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
-{
-    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
-{
-    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
-    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
-}
-
-//FORCE_INLINE __m128i _mm_shuffle_epi32_default(__m128i a, __constrange(0,255) int imm)
-#if ENABLE_CPP_VERSION
-FORCE_INLINE __m128i _mm_shuffle_epi32_default(__m128i a, __constrange(0,255) int imm)
-{
-    __m128i ret;
-    ret[0] = a[imm & 0x3];
-    ret[1] = a[(imm >> 2) & 0x3];
-    ret[2] = a[(imm >> 4) & 0x03];
-    ret[3] = a[(imm >> 6) & 0x03];
-    return ret;
-}
-#else
-#define _mm_shuffle_epi32_default(a, imm) \
-({ \
-    int32x4_t ret; \
-    ret = vmovq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & 0x3)); \
-    ret = vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), ret, 1); \
-    ret = vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), ret, 2); \
-    ret = vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), ret, 3); \
-    vreinterpretq_m128i_s32(ret); \
-})
-#endif
-
-//FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255) int imm)
-#if defined(__aarch64__)
-#define _mm_shuffle_epi32_splat(a, imm) \
-({ \
-    vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
-})
-#else
-#define _mm_shuffle_epi32_splat(a, imm) \
-({ \
-    vreinterpretq_m128i_s32(vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
-})
-#endif
-
-// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
-//FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_shuffle_epi32(a, imm) \
-({ \
-    __m128i ret; \
-    switch (imm) \
-    { \
-        case _MM_SHUFFLE(1, 0, 3, 2): ret = _mm_shuffle_epi_1032((a)); break; \
-        case _MM_SHUFFLE(2, 3, 0, 1): ret = _mm_shuffle_epi_2301((a)); break; \
-        case _MM_SHUFFLE(0, 3, 2, 1): ret = _mm_shuffle_epi_0321((a)); break; \
-        case _MM_SHUFFLE(2, 1, 0, 3): ret = _mm_shuffle_epi_2103((a)); break; \
-        case _MM_SHUFFLE(1, 0, 1, 0): ret = _mm_shuffle_epi_1010((a)); break; \
-        case _MM_SHUFFLE(1, 0, 0, 1): ret = _mm_shuffle_epi_1001((a)); break; \
-        case _MM_SHUFFLE(0, 1, 0, 1): ret = _mm_shuffle_epi_0101((a)); break; \
-        case _MM_SHUFFLE(2, 2, 1, 1): ret = _mm_shuffle_epi_2211((a)); break; \
-        case _MM_SHUFFLE(0, 1, 2, 2): ret = _mm_shuffle_epi_0122((a)); break; \
-        case _MM_SHUFFLE(3, 3, 3, 2): ret = _mm_shuffle_epi_3332((a)); break; \
-        case _MM_SHUFFLE(0, 0, 0, 0): ret = _mm_shuffle_epi32_splat((a),0); break; \
-        case _MM_SHUFFLE(1, 1, 1, 1): ret = _mm_shuffle_epi32_splat((a),1); break; \
-        case _MM_SHUFFLE(2, 2, 2, 2): ret = _mm_shuffle_epi32_splat((a),2); break; \
-        case _MM_SHUFFLE(3, 3, 3, 3): ret = _mm_shuffle_epi32_splat((a),3); break; \
-        default: ret = _mm_shuffle_epi32_default((a), (imm)); break; \
-    } \
-    ret; \
-})
-
-// Shuffles the upper 4 signed or unsigned 16 - bit integers in a as specified by imm. https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
-//FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a, __constrange(0,255) int imm)
-#define _mm_shufflehi_epi16_function(a, imm) \
-({ \
-    int16x8_t ret = vreinterpretq_s16_s32(a); \
-    int16x4_t highBits = vget_high_s16(ret); \
-    ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & 0x3), ret, 4); \
-    ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, 5); \
-    ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, 6); \
-    ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, 7); \
-    vreinterpretq_s32_s16(ret); \
-})
-
-//FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, __constrange(0,255) int imm)
-#define _mm_shufflehi_epi16(a, imm) \
-    _mm_shufflehi_epi16_function((a), (imm))
-
-
-//added by hasindu
-//Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while shifting in zeros. https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
-#define _mm_slli_epi16(a, imm) \
-({ \
-    __m128i ret; \
-    if ((imm) <= 0) {\
-        ret = a; \
-    } \
-    else if ((imm) > 15) { \
-        ret = _mm_setzero_si128(); \
-    } \
-    else { \
-        ret = vreinterpretq_m128i_s16(vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \
-    } \
-    ret; \
-})
-
-
-
-// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while shifting in zeros. : https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
-//FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_slli_epi32(a, imm) \
-({ \
-    __m128i ret; \
-    if ((imm) <= 0) {\
-        ret = a; \
-    } \
-    else if ((imm) > 31) { \
-        ret = _mm_setzero_si128(); \
-    } \
-    else { \
-        ret = vreinterpretq_m128i_s32(vshlq_n_s32(vreinterpretq_s32_m128i(a), (imm))); \
-    } \
-    ret; \
-})
-
-
-//added by hasindu
-// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits while shifting in zeros.
-//https://msdn.microsoft.com/en-us/library/6tcwd38t(v=vs.90).aspx
-#define _mm_srli_epi16(a, imm) \
-({ \
-    __m128i ret; \
-    if ((imm) <= 0) { \
-        ret = a; \
-    } \
-    else if ((imm) > 15) { \
-        ret = _mm_setzero_si128(); \
-    } \
-    else { \
-        ret = vreinterpretq_m128i_u16(vshrq_n_u16(vreinterpretq_u16_m128i(a), (imm))); \
-    } \
-    ret; \
-})
-
-
-//Shifts the 4 signed or unsigned 32-bit integers in a right by count bits while shifting in zeros. https://msdn.microsoft.com/en-us/library/w486zcfa(v=vs.100).aspx
-//FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_srli_epi32(a, imm) \
-({ \
-    __m128i ret; \
-    if ((imm) <= 0) { \
-        ret = a; \
-    } \
-    else if ((imm)> 31) { \
-        ret = _mm_setzero_si128(); \
-    } \
-    else { \
-        ret = vreinterpretq_m128i_u32(vshrq_n_u32(vreinterpretq_u32_m128i(a), (imm))); \
-    } \
-    ret; \
-})
-
-// Shifts the 4 signed 32 - bit integers in a right by count bits while shifting in the sign bit. https://msdn.microsoft.com/en-us/library/z1939387(v=vs.100).aspx
-//FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_srai_epi32(a, imm) \
-({ \
-    __m128i ret; \
-    if ((imm) <= 0) { \
-        ret = a; \
-    } \
-    else if ((imm) > 31) { \
-        ret = vreinterpretq_m128i_s32(vshrq_n_s32(vreinterpretq_s32_m128i(a), 16)); \
-        ret = vreinterpretq_m128i_s32(vshrq_n_s32(vreinterpretq_s32_m128i(ret), 16)); \
-    } \
-    else { \
-        ret = vreinterpretq_m128i_s32(vshrq_n_s32(vreinterpretq_s32_m128i(a), (imm))); \
-    } \
-    ret; \
-})
-
-// Shifts the 128 - bit value in a right by imm bytes while shifting in zeros. imm must be an immediate. https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
-//FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
-#define _mm_srli_si128(a, imm) \
-({ \
-    __m128i ret; \
-    if ((imm) <= 0) { \
-        ret = a; \
-    } \
-    else if ((imm) > 15) { \
-        ret = _mm_setzero_si128(); \
-    } \
-    else { \
-        ret = vreinterpretq_m128i_s8(vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
-    } \
-    ret; \
-})
-
-// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm must be an immediate. https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
-//FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
-#define _mm_slli_si128(a, imm) \
-({ \
-    __m128i ret; \
-    if ((imm) <= 0) { \
-        ret = a; \
-    } \
-    else if ((imm) > 15) { \
-        ret = _mm_setzero_si128(); \
-    } \
-    else { \
-        ret = vreinterpretq_m128i_s8(vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
-    } \
-    ret; \
-})
-
-// NEON does not provide a version of this function, here is an article about some ways to repro the results.
-// http://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
-// Creates a 16-bit mask from the most significant bits of the 16 signed or unsigned 8-bit integers in a and zero extends the upper bits. https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
-FORCE_INLINE int _mm_movemask_epi8(__m128i _a)
-{
-    uint8x16_t input = vreinterpretq_u8_m128i(_a);
-    static const int8_t __attribute__((aligned(16))) xr[8] = { -7, -6, -5, -4, -3, -2, -1, 0 };
-    uint8x8_t mask_and = vdup_n_u8(0x80);
-    int8x8_t mask_shift = vld1_s8(xr);
-
-    uint8x8_t lo = vget_low_u8(input);
-    uint8x8_t hi = vget_high_u8(input);
-
-    lo = vand_u8(lo, mask_and);
-    lo = vshl_u8(lo, mask_shift);
-
-    hi = vand_u8(hi, mask_and);
-    hi = vshl_u8(hi, mask_shift);
-
-    lo = vpadd_u8(lo, lo);
-    lo = vpadd_u8(lo, lo);
-    lo = vpadd_u8(lo, lo);
-
-    hi = vpadd_u8(hi, hi);
-    hi = vpadd_u8(hi, hi);
-    hi = vpadd_u8(hi, hi);
-
-    return ((hi[0] << 8) | (lo[0] & 0xFF));
-}
-
-
-// ******************************************
-// Math operations
-// ******************************************
-
-// Subtracts the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or unsigned 32-bit integers of a. https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-//added by hasindu
-FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-//added by hasindu
-//Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit integers of a and saturates. https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-//added by hasindu
-//Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit integers of a and saturates. https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
-FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Adds the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// adds the scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
-{
-    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
-    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
-    //the upper values in the result must be the remnants of <a>.
-    return vreinterpretq_m128_f32(vaddq_f32(a, value));
-}
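
// A scalar model of _mm_movemask_epi8 above (illustrative only, not part of
// the original header): the NEON version ANDs each byte with 0x80, shifts
// lane i right by (7 - i) so its sign bit lands at bit position i, then folds
// the lanes together with pairwise adds; since every lane contributes a
// distinct bit, the adds behave like ORs. In plain C the same result is:
//
//   int movemask_epi8_scalar(const uint8_t v[16])
//   {
//       int mask = 0;
//       for (int i = 0; i < 16; ++i)
//           mask |= (v[i] >> 7) << i;   /* bit i = sign bit of byte i */
//       return mask;
//   }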
-
-// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or unsigned 32-bit integers in b. https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-//added by hasindu
-// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or unsigned 8-bit integers in b. https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
-FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-//added by hasindu
-// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b and saturates. https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-//added by hasindu
-//Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in b and saturates. https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-
-// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or unsigned 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or unsigned 32-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(vmulq_s32(vreinterpretq_s32_m128i(a),vreinterpretq_s32_m128i(b)));
-}
-
-// Multiplies the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Divides the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
-{
-    float32x4_t recip0 = vrecpeq_f32(vreinterpretq_f32_m128(b));
-    float32x4_t recip1 = vmulq_f32(recip0, vrecpsq_f32(recip0, vreinterpretq_f32_m128(b)));
-    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip1));
-}
-
-// Divides the scalar single-precision floating point value of a by b. https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
-{
-    float32_t value = vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
-    return vreinterpretq_m128_f32(vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
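
// A scalar model of the reciprocal refinement used by _mm_div_ps above and by
// recipq_newton below (illustrative only, not part of the original header):
// vrecpeq_f32 returns a rough estimate r of 1/x, and each vrecpsq_f32 step
// computes (2 - x*r), so r = r * (2 - x*r) is one Newton-Raphson iteration
// that roughly doubles the number of correct bits:
//
//   float recip_newton(float x, float r, int n)   /* r: coarse seed for 1/x */
//   {
//       for (int i = 0; i < n; ++i)
//           r = r * (2.0f - x * r);   /* the step vrecpsq_f32 implements */
//       return r;
//   }
//
// e.g. recip_newton(3.0f, 0.3f, 3) converges to 0.33333334f, so two or three
// steps recover full single precision from a coarse estimate.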
-
-// This version does additional iterations to improve accuracy. Between 1 and 4 recommended.
-// Computes the approximations of reciprocals of the four single-precision, floating-point values of a. https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx
-FORCE_INLINE __m128 recipq_newton(__m128 in, int n)
-{
-    int i;
-    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
-    for (i = 0; i < n; ++i)
-    {
-        recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
-    }
-    return vreinterpretq_m128_f32(recip);
-}
-
-// Computes the approximations of reciprocals of the four single-precision, floating-point values of a. https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
-{
-    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
-    return vreinterpretq_m128_f32(recip);
-}
-
-// Computes the approximations of square roots of the four single-precision, floating-point values of a. First computes reciprocal square roots and then reciprocals of the four values. https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
-{
-    float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
-    float32x4_t sq = vrecpeq_f32(recipsq);
-    // ??? use step versions of both sqrt and recip for better accuracy?
-    return vreinterpretq_m128_f32(sq);
-}
-
-// Computes the approximation of the square root of the scalar single-precision floating point value of in. https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
-{
-    float32_t value = vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
-    return vreinterpretq_m128_f32(vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
-}
-
-// Computes the approximations of the reciprocal square roots of the four single-precision floating point values of in. https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
-{
-    return vreinterpretq_m128_f32(vrsqrteq_f32(vreinterpretq_f32_m128(in)));
-}
-
-// Computes the maximums of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Computes the minima of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Computes the maximum of the two lower scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
-{
-    float32_t value = vgetq_lane_f32(vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0);
-    return vreinterpretq_m128_f32(vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
-{
-    float32_t value = vgetq_lane_f32(vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0);
-    return vreinterpretq_m128_f32(vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
-
-//added by hasindu
-//Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the 16 unsigned 8-bit integers from b. https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-//added by hasindu
-//Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the 16 unsigned 8-bit integers from b. https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-
-// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8 signed 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-//added by hasindu
-//Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8 signed 16-bit integers from b. https://msdn.microsoft.com/en-us/library/3x060h7c(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-
-// epi versions of min/max
-// Computes the pairwise maxima of the four signed 32-bit integer values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Computes the pairwise minima of the four signed 32-bit integer values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit integers from b, keeping the high 16 bits of each 32-bit product. https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
-{
-    /* apoty: issue with large values because of result saturation */
-    //int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)); /* =2*a*b */
-    //return vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
-    int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
-    int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
-    int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
-    int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
-    int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
-    int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
-    uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
-    return vreinterpretq_m128i_u16(r.val[1]);
-}
-
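/* Editor's aside -- a minimal illustrative sketch, not part of the
   original header. Per lane, _mm_mulhi_epi16 widens, multiplies and
   keeps the high half, which is what the vmull_s16 + vuzpq_u16
   sequence above reconstructs; mulhi16_scalar is a hypothetical name. */
static int16_t mulhi16_scalar(int16_t a, int16_t b)
{
    /* widen to 32 bits so the full product survives, then take bits 31..16
       (relies on arithmetic right shift, as on the targets in question) */
    return (int16_t)(((int32_t)a * (int32_t)b) >> 16);
}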
-// Computes the pairwise addition of the single-precision, floating-point values of a and b.
-//https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
-FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); //AArch64
-#else
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
-#endif
-}
-
-// ******************************************
-// Compare operations
-// ******************************************
-
-// Compares for less than. https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compares for greater than. https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compares for greater than or equal. https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compares for less than or equal. https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compares for equality. https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-
-//added by hasindu
-//Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or unsigned 8-bit integers in b for equality. https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_cmpeq_epi8 (__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-//added by hasindu
-//Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or unsigned 16-bit integers in b for equality.
-//https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmpeq_epi16 (__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-//added by hasindu
-//Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers in b for less than. https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_cmplt_epi8 (__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-
-//added by hasindu
-//Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers in b for greater than.
https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u8(vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); -} - -//added by hasindu -//Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers in b for greater than. https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u16(vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -} - - -// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers in b for less than. https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u32(vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers in b for greater than. https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u32(vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -// Compares the four 32-bit floats in a and b to check if any values are NaN. Ordered compare between each value returns true for "orderable" and false for "not orderable" (NaN). https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx -// see also: -// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean -// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics -FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b ) -{ - // Note: NEON does not have ordered compare builtin - // Need to compare a eq a and b eq b to check for NaN - // Do AND of results to get final - uint32x4_t ceqaa = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t ceqbb = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb)); -} - -// Compares the lower single-precision floating point scalar values of a and b using a less than operation. : https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx -// Important note!! The documentation on MSDN is incorrect! If either of the values is a NAN the docs say you will get a one, but in fact, it will return a zero!! -FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) -{ - uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); - uint32x4_t a_lt_b = vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); - return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0) ? 1 : 0; -} - -// Compares the lower single-precision floating point scalar values of a and b using a greater than operation. 
: https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx -FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) -{ - //return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0); - uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_gt_b = vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); - return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0; -} - -// Compares the lower single-precision floating point scalar values of a and b using a less than or equal operation. : https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx -FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) -{ - //return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0); - uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); - uint32x4_t a_le_b = vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); - return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0) ? 1 : 0; -} - -// Compares the lower single-precision floating point scalar values of a and b using a greater than or equal operation. : https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx -FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) -{ - //return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0); - uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_ge_b = vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); - return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0; -} - -// Compares the lower single-precision floating point scalar values of a and b using an equality operation. : https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx -FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) -{ - //return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0); - uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); - uint32x4_t a_eq_b = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); - return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0) ? 1 : 0; -} - -// Compares the lower single-precision floating point scalar values of a and b using an inequality operation. 
: https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx -FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) -{ - //return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0); - uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); - return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0) ? 1 : 0; -} - -// according to the documentation, these intrinsics behave the same as the non-'u' versions. We'll just alias them here. -#define _mm_ucomilt_ss _mm_comilt_ss -#define _mm_ucomile_ss _mm_comile_ss -#define _mm_ucomigt_ss _mm_comigt_ss -#define _mm_ucomige_ss _mm_comige_ss -#define _mm_ucomieq_ss _mm_comieq_ss -#define _mm_ucomineq_ss _mm_comineq_ss - -// ****************************************** -// Conversions -// ****************************************** - -// Converts the four single-precision, floating-point values of a to signed 32-bit integer values using truncate. https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) -{ - return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); -} - -// Converts the four signed 32-bit integer values of a to single-precision, floating-point values https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) -{ - return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a))); -} - -// Converts the four unsigned 8-bit integers in the lower 32 bits to four unsigned 32-bit integers. https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx -FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) -{ - uint8x16_t u8x16 = vreinterpretq_u8_s32(a); /* xxxx xxxx xxxx DCBA */ - uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ - uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ - return vreinterpretq_s32_u32(u32x4); -} - -// Converts the four signed 16-bit integers in the lower 64 bits to four signed 32-bit integers. https://msdn.microsoft.com/en-us/library/bb514079%28v=vs.100%29.aspx -FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a) -{ - return vreinterpretq_m128i_s32(vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a)))); -} - -// Converts the four single-precision, floating-point values of a to signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx -// *NOTE*. The default rounding mode on SSE is 'round to even', which ArmV7 does not support! -// It is supported on ARMv8 however. 
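/* Editor's aside -- a minimal illustrative sketch, not part of the
   original header. The ARMv7 fallback below reconstructs IEEE
   round-half-to-even by hand; a scalar model of that behaviour
   (round_even_scalar is a hypothetical name; truncf comes from
   <math.h> and int32_t from <stdint.h>): */
static int32_t round_even_scalar(float a)
{
    float t = truncf(a);                /* [a], truncated toward zero */
    float delta = a - t;                /* signed fractional part */
    if (delta == 0.5f || delta == -0.5f) {
        int32_t r = (int32_t)t;
        if (r & 1)                      /* exact tie on an odd value: */
            r += (a > 0.0f) ? 1 : -1;   /* step to the even neighbour */
        return r;
    }
    return (int32_t)(a + (a >= 0.0f ? 0.5f : -0.5f)); /* plain round-to-nearest */
}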
-FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) -{ -#if defined(__aarch64__) - return vcvtnq_s32_f32(a); -#else - uint32x4_t signmask = vdupq_n_u32(0x80000000); - float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), vdupq_n_f32(0.5f)); /* +/- 0.5 */ - int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ - int32x4_t r_trunc = vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ - int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ - int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ - float32x4_t delta = vsubq_f32(vreinterpretq_f32_m128(a), vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ - uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ - return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal)); -#endif -} - -// Moves the least significant 32 bits of a to a 32-bit integer. https://msdn.microsoft.com/en-us/library/5z7a9642%28v=vs.90%29.aspx -FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) -{ - return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); -} - -// Moves 32-bit integer a to the least significant 32 bits of an __m128 object, zero extending the upper bits. https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) -{ - return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0)); -} - - -// Applies a type cast to reinterpret four 32-bit floating point values passed in as a 128-bit parameter as packed 32-bit integers. https://msdn.microsoft.com/en-us/library/bb514099.aspx -FORCE_INLINE __m128i _mm_castps_si128(__m128 a) -{ - return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a)); -} - -// Applies a type cast to reinterpret four 32-bit integers passed in as a 128-bit parameter as packed 32-bit floating point values. https://msdn.microsoft.com/en-us/library/bb514029.aspx -FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a) -{ - return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a)); -} - -// Loads 128-bit value. : https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx -FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) -{ - return vreinterpretq_m128i_s32(vld1q_s32((int32_t *)p)); -} - -//added by hasindu (verify this for requirement of alignment) -// Loads 128-bit value. : https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx -FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) -{ - return vreinterpretq_m128i_s32(vld1q_s32((int32_t *)p)); -} - - -// ****************************************** -// Miscellaneous Operations -// ****************************************** - -// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and saturates. https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s8(vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)), vqmovn_s16(vreinterpretq_s16_m128i(b)))); -} - -// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned integers and saturates. 
https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx -FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) -{ - return vreinterpretq_m128i_u8(vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)), vqmovun_s16(vreinterpretq_s16_m128i(b)))); -} - -// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers and saturates. https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s16(vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)), vqmovn_s32(vreinterpretq_s32_m128i(b)))); -} - -// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower 8 signed or unsigned 8-bit integers in b. https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) -{ - int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a))); - int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b))); - int8x8x2_t result = vzip_s8(a1, b1); - return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); -} - -// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the lower 4 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) -{ - int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a)); - int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b)); - int16x4x2_t result = vzip_s16(a1, b1); - return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); -} - -// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the lower 2 signed or unsigned 32 - bit integers in b. https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx -FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) -{ - int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a)); - int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b)); - int32x2x2_t result = vzip_s32(a1, b1); - return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); -} - -// Selects and interleaves the lower two single-precision, floating-point values from a and b. https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx -FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) -{ - float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a)); - float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b)); - float32x2x2_t result = vzip_f32(a1, b1); - return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); -} - -// Selects and interleaves the upper two single-precision, floating-point values from a and b. https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx -FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) -{ - float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a)); - float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b)); - float32x2x2_t result = vzip_f32(a1, b1); - return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); -} - -// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper 8 signed or unsigned 8-bit integers in b. 
https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
-{
-    int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
-    int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
-    int8x8x2_t result = vzip_s8(a1, b1);
-    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
-}
-
-// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the upper 4 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
-{
-    int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
-    int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
-    int16x4x2_t result = vzip_s16(a1, b1);
-    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
-}
-
-// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the upper 2 signed or unsigned 32-bit integers in b. https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
-{
-    int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
-    int32x2x2_t result = vzip_s32(a1, b1);
-    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
-}
-
-// Extracts the selected signed or unsigned 16-bit integer from a and zero extends. https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
-//FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
-#define _mm_extract_epi16(a, imm) \
-({ \
-    (vgetq_lane_s16(vreinterpretq_s16_m128i(a), (imm)) & 0x0000ffffUL); \
-})
-
-// Inserts the least significant 16 bits of b into the selected 16-bit integer of a. https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
-//FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, const int b, __constrange(0,8) int imm)
-#define _mm_insert_epi16(a, b, imm) \
-({ \
-    vreinterpretq_m128i_s16(vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
-})
-
-// ******************************************
-// Streaming Extensions
-// ******************************************
-
-// Guarantees that every preceding store is globally visible before any subsequent store. https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
-FORCE_INLINE void _mm_sfence(void)
-{
-    __sync_synchronize();
-}
-
-// Stores the data in a to the address p without polluting the caches. If the cache line containing address p is already in the cache, the cache will be updated. Address p must be 16-byte aligned. https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
-FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
-{
-    *p = a;
-}
-
-// Cache line containing p is flushed and invalidated from all caches in the coherency domain. https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
-/*
-FORCE_INLINE void _mm_clflush(void const*p)
-{
-    // no corollary for Neon?
-} -*/ - -#if defined(__GNUC__) || defined(__clang__) -# pragma pop_macro("ALIGN_STRUCT") -# pragma pop_macro("FORCE_INLINE") -#endif - -#endif diff --git a/src/c/common.c b/src/c/common.c deleted file mode 100644 index 4bb8ef1..0000000 --- a/src/c/common.c +++ /dev/null @@ -1,15 +0,0 @@ -#include "common.h" -#include - -void write_seed(char seed[32], uint64_t numeric_id) { - numeric_id = bswap_64(numeric_id); - memmove(&seed[0], &numeric_id, 8); - memset(&seed[8], 0, 8); - seed[16] = -128; // shabal message termination bit - memset(&seed[17], 0, 15); -} - -void write_term(char term[32]) { - term[0] = -128; // shabal message termination bit - memset(&term[1], 0, 31); -} diff --git a/src/c/common.h b/src/c/common.h deleted file mode 100644 index a7aa9b8..0000000 --- a/src/c/common.h +++ /dev/null @@ -1,65 +0,0 @@ -#include - -#pragma once - -#ifdef _MSC_VER - -#include -#define bswap_32(x) _byteswap_ulong(x) -#define bswap_64(x) _byteswap_uint64(x) - -#elif defined(__APPLE__) - -// Mac OS X / Darwin features -#include -#define bswap_32(x) OSSwapInt32(x) -#define bswap_64(x) OSSwapInt64(x) - -#elif defined(__sun) || defined(sun) - -#include -#define bswap_32(x) BSWAP_32(x) -#define bswap_64(x) BSWAP_64(x) - -#elif defined(__FreeBSD__) - -#include -#define bswap_32(x) bswap32(x) -#define bswap_64(x) bswap64(x) - -#elif defined(__OpenBSD__) - -#include -#define bswap_32(x) swap32(x) -#define bswap_64(x) swap64(x) - -#elif defined(__NetBSD__) - -#include -#include -#if defined(__BSWAP_RENAME) && !defined(__bswap_32) -#define bswap_32(x) bswap32(x) -#define bswap_64(x) bswap64(x) -#endif - -#else - -#include - -#endif - -#define HASH_SIZE 32 -#define HASH_CAP 4096 -#define NUM_SCOOPS 4096 -#define SCOOP_SIZE 64 -#define NONCE_SIZE (HASH_CAP * SCOOP_SIZE) // 4096*64 - -void write_seed(char seed[32], uint64_t numeric_id); - -void write_term(char term[32]); - -#define SET_BEST_DEADLINE(d, o) \ - if ((d) < *best_deadline) { \ - *best_deadline = (d); \ - *best_offset = (o); \ - } diff --git a/src/c/mshabal_128_avx.c b/src/c/mshabal_128_avx.c deleted file mode 100644 index f4c239c..0000000 --- a/src/c/mshabal_128_avx.c +++ /dev/null @@ -1,966 +0,0 @@ -/* - * Parallel implementation of Shabal, using the AVX unit. This code - * compiles and runs on x86 architectures, in 32-bit or 64-bit mode, - * which possess a AVX-compatible SIMD unit. - * - * - * (c) 2010 SAPHIR project. This software is provided 'as-is', without - * any epxress or implied warranty. In no event will the authors be held - * liable for any damages arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to no restriction. 
- * - * Technical remarks and questions can be addressed to: - * - */ - -#include -#include -#include -#include "mshabal_128_avx.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef _MSC_VER -#pragma warning(disable : 4146) -#endif - -typedef mshabal_u32 u32; - -#define C32(x) ((u32)x##UL) -#define T32(x) ((x)&C32(0xFFFFFFFF)) -#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n)))) - -static void mshabal_compress_avx(mshabal128_context *sc, const unsigned char *buf0, - const unsigned char *buf1, const unsigned char *buf2, - const unsigned char *buf3, size_t num) { - _mm256_zeroupper(); - union { - u32 words[16 * MSHABAL128_VECTOR_SIZE]; - __m128i data[16]; - } u; - size_t j; - __m128i A[12], B[16], C[16]; - __m128i one; - - for (j = 0; j < 12; j++) A[j] = _mm_loadu_si128((__m128i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm_loadu_si128((__m128i *)sc->state + j + 12); - C[j] = _mm_loadu_si128((__m128i *)sc->state + j + 28); - } - one = _mm_set1_epi32(C32(0xFFFFFFFF)); - -#define M(i) _mm_load_si128(u.data + i) - - while (num-- > 0) { - for (j = 0; j < 16 * MSHABAL128_VECTOR_SIZE; j += MSHABAL128_VECTOR_SIZE) { - u.words[j + 0] = *(u32 *)(buf0 + j); - u.words[j + 1] = *(u32 *)(buf1 + j); - u.words[j + 2] = *(u32 *)(buf2 + j); - u.words[j + 3] = *(u32 *)(buf3 + j); - } - - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - -#define PP(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m128i tt; \ - tt = _mm_or_si128(_mm_slli_epi32(xa1, 15), _mm_srli_epi32(xa1, 17)); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 2), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(xa0, tt), xc); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 1), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(tt, xb1), _mm_xor_si128(_mm_andnot_si128(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm_or_si128(_mm_slli_epi32(tt, 1), _mm_srli_epi32(tt, 31)); \ - xb0 = _mm_xor_si128(tt, _mm_xor_si128(xa0, one)); \ - } while (0) - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], 
C[0x5], M(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB(xb, xc, xm) \ - do { \ - __m128i tmp; 
\ - tmp = xb; \ - xb = _mm_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M(0xF)); - - buf0 += 64; - buf1 += 64; - buf2 += 64; - buf3 += 64; - if (++sc->Wlow == 0) sc->Whigh++; - } - - for (j = 0; j < 12; j++) _mm_storeu_si128((__m128i *)sc->state + j, A[j]); - for (j = 0; j < 16; j++) { - _mm_storeu_si128((__m128i *)sc->state + j + 12, B[j]); - _mm_storeu_si128((__m128i *)sc->state + j + 28, C[j]); - } -#undef M -} - -void mshabal_init_avx(mshabal128_context *sc, unsigned out_size) { - unsigned u; - - memset(sc->state, 0, sizeof sc->state); - memset(sc->buf0, 0, sizeof sc->buf0); - memset(sc->buf1, 0, sizeof sc->buf1); - memset(sc->buf2, 0, sizeof sc->buf2); - memset(sc->buf3, 0, sizeof sc->buf3); - for (u = 0; u < 16; u++) { - sc->buf0[4 * u + 0] = (out_size + u); - sc->buf0[4 * u + 1] = (out_size + u) >> 8; - sc->buf1[4 * u + 0] = (out_size + u); - sc->buf1[4 * u + 1] = (out_size + u) >> 8; - sc->buf2[4 * u + 0] = (out_size + u); - sc->buf2[4 * u + 1] = (out_size + u) >> 8; - sc->buf3[4 * u + 0] = (out_size + u); - sc->buf3[4 * u + 1] = (out_size + u) >> 8; - } - sc->Whigh = sc->Wlow = C32(0xFFFFFFFF); - mshabal_compress_avx(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, 1); - for (u = 0; u < 16; u++) { - sc->buf0[4 * u + 0] = (out_size + u + 16); - sc->buf0[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf1[4 * u + 0] = (out_size + u + 16); - sc->buf1[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf2[4 * u + 0] = (out_size + u + 16); - sc->buf2[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf3[4 * u + 0] = (out_size + u + 16); - sc->buf3[4 * u + 1] = (out_size + u + 16) >> 8; - } - mshabal_compress_avx(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, 1); - sc->ptr = 0; - sc->out_size = out_size; -} - -void mshabal_avx(mshabal128_context *sc, const void *data0, const void *data1, const void *data2, - const void *data3, size_t len) { - size_t ptr, num; - - if (data0 == NULL) { - if (data1 == NULL) { - if (data2 == NULL) { - if (data3 == NULL) { - return; - } else { - data0 = data3; - } - } else { - data0 = data2; - } - } else { - data0 = data1; - } - } - - if (data1 == NULL) data1 = data0; - if (data2 == NULL) data2 = data0; - if (data3 == NULL) data3 = data0; - - ptr = sc->ptr; - if (ptr != 0) { - size_t clen = (sizeof sc->buf0 - ptr); - if (clen > len) { - memcpy(sc->buf0 + ptr, data0, len); - memcpy(sc->buf1 + ptr, data1, len); - memcpy(sc->buf2 + ptr, data2, len); - memcpy(sc->buf3 + ptr, data3, len); - sc->ptr = ptr + len; - return; - } else { - memcpy(sc->buf0 + ptr, data0, clen); - memcpy(sc->buf1 + ptr, data1, clen); - memcpy(sc->buf2 + ptr, data2, clen); - memcpy(sc->buf3 + ptr, data3, clen); - mshabal_compress_avx(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, 1); - data0 = (const unsigned char *)data0 + clen; - data1 = (const unsigned char *)data1 + clen; - data2 = (const unsigned char *)data2 + clen; - data3 = (const unsigned char *)data3 + clen; - len -= clen; - } 
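/* Editor's note on the flow at this point (not in the original file):
   a previously buffered partial block, if any, has just been topped up
   to 64 bytes and compressed. The code below then feeds whole 64-byte
   blocks (len >> 6 of them) to mshabal_compress_avx straight from the
   caller's pointers, and finally stashes the len & 63 leftover bytes
   in buf0..buf3 so a later mshabal_avx or mshabal_close_avx call can
   complete the block. */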
- } - - num = len >> 6; - if (num != 0) { - mshabal_compress_avx(sc, data0, data1, data2, data3, num); - data0 = (const unsigned char *)data0 + (num << 6); - data1 = (const unsigned char *)data1 + (num << 6); - data2 = (const unsigned char *)data2 + (num << 6); - data3 = (const unsigned char *)data3 + (num << 6); - } - len &= 63; - memcpy(sc->buf0, data0, len); - memcpy(sc->buf1, data1, len); - memcpy(sc->buf2, data2, len); - memcpy(sc->buf3, data3, len); - sc->ptr = len; -} - -void mshabal_close_avx(mshabal128_context *sc, unsigned ub0, unsigned ub1, unsigned ub2, unsigned ub3, - unsigned n, void *dst0, void *dst1, void *dst2, void *dst3) { - size_t ptr, off; - unsigned z, out_size_w32; - - z = 0x80 >> n; - ptr = sc->ptr; - sc->buf0[ptr] = (ub0 & -z) | z; - sc->buf1[ptr] = (ub1 & -z) | z; - sc->buf2[ptr] = (ub2 & -z) | z; - sc->buf3[ptr] = (ub3 & -z) | z; - ptr++; - memset(sc->buf0 + ptr, 0, (sizeof sc->buf0) - ptr); - memset(sc->buf1 + ptr, 0, (sizeof sc->buf1) - ptr); - memset(sc->buf2 + ptr, 0, (sizeof sc->buf2) - ptr); - memset(sc->buf3 + ptr, 0, (sizeof sc->buf3) - ptr); - for (z = 0; z < 4; z++) { - mshabal_compress_avx(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, 1); - if (sc->Wlow-- == 0) sc->Whigh--; - } - out_size_w32 = sc->out_size >> 5; - off = MSHABAL128_VECTOR_SIZE * (28 + (16 - out_size_w32)); - if (dst0 != NULL) { - u32 *out; - - out = (u32 *)dst0; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL128_VECTOR_SIZE + 0]; - } - if (dst1 != NULL) { - u32 *out; - - out = (u32 *)dst1; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL128_VECTOR_SIZE + 1]; - } - if (dst2 != NULL) { - u32 *out; - - out = (u32 *)dst2; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL128_VECTOR_SIZE + 2]; - } - if (dst3 != NULL) { - u32 *out; - - out = (u32 *)dst3; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL128_VECTOR_SIZE + 3]; - } -} - -// Shabal routine optimized for plotting and hashing -void mshabal_hash_fast_avx(mshabal128_context_fast *sc, void *message, void *termination, - void *dst, unsigned num) { - _mm256_zeroupper(); - union input { - u32 words[16 * MSHABAL128_VECTOR_SIZE]; - __m128i data[16]; - }; - size_t j; - __m128i A[12], B[16], C[16]; - __m128i one; - - for (j = 0; j < 12; j++) A[j] = _mm_loadu_si128((__m128i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm_loadu_si128((__m128i *)sc->state + j + 12); - C[j] = _mm_loadu_si128((__m128i *)sc->state + j + 28); - } - one = _mm_set1_epi32(C32(0xFFFFFFFF)); - - // round 1 -#define M(i) _mm_load_si128((__m128i *)message + i) - - while (num-- > 0) { - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - -#define PP(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m128i tt; \ - tt = _mm_or_si128(_mm_slli_epi32(xa1, 15), _mm_srli_epi32(xa1, 17)); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 2), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(xa0, tt), xc); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 1), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(tt, xb1), _mm_xor_si128(_mm_andnot_si128(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm_or_si128(_mm_slli_epi32(tt, 1), _mm_srli_epi32(tt, 31)); \ - xb0 = _mm_xor_si128(tt, _mm_xor_si128(xa0, one)); \ - } while (0) - - PP(A[0x0], A[0xB], B[0x0], B[0xD], 
B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = 
_mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB(xb, xc, xm) \ - do { \ - __m128i tmp; \ - tmp = xb; \ - xb = _mm_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M(0xF)); - - // move data pointer - message = (__m128i *)message + 16; - - if (++sc->Wlow == 0) sc->Whigh++; - } - - // round 2-5 -#define M2(i) _mm_load_si128((__m128i *)termination + i) - - for (int k = 0; k < 4; k++) { - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M2(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], 
M2(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], 
C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - - SWAP_AND_SUB(B[0x0], C[0x0], M2(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M2(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M2(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M2(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M2(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M2(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M2(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M2(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M2(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M2(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M2(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M2(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M2(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M2(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M2(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M2(0xF)); - - if (++sc->Wlow == 0) sc->Whigh++; - - if (sc->Wlow-- == 0) sc->Whigh--; - } - - // download SIMD aligned hashes - for (j = 0; j < 8; j++) { - _mm_storeu_si128((__m128i *)dst + j, C[j + 8]); - } - - // reset Wlow & Whigh - sc->Wlow = 1; - sc->Whigh = 0; -} - -// Shabal routine optimized for mining -void mshabal_deadline_fast_avx(mshabal128_context_fast *sc, void *message, void *termination, void *dst0, - void *dst1, void *dst2, void *dst3) { - _mm256_zeroupper(); - union input { - u32 words[16 * MSHABAL128_VECTOR_SIZE]; - __m128i data[16]; - }; - size_t j; - __m128i A[12], B[16], C[16]; - __m128i one; - - for (j = 0; j < 12; j++) A[j] = _mm_loadu_si128((__m128i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm_loadu_si128((__m128i *)sc->state + j + 12); - C[j] = _mm_loadu_si128((__m128i *)sc->state + j + 28); - } - one = _mm_set1_epi32(C32(0xFFFFFFFF)); - - // round 1 -#define M(i) _mm_load_si128((__m128i *)message + i) - - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - -#define PP(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m128i tt; \ - tt = _mm_or_si128(_mm_slli_epi32(xa1, 15), _mm_srli_epi32(xa1, 17)); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 2), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(xa0, tt), xc); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 1), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(tt, xb1), _mm_xor_si128(_mm_andnot_si128(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm_or_si128(_mm_slli_epi32(tt, 1), _mm_srli_epi32(tt, 31)); \ - xb0 = _mm_xor_si128(tt, _mm_xor_si128(xa0, one)); \ - } while (0) - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - 
PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] 
= _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB(xb, xc, xm) \ - do { \ - __m128i tmp; \ - tmp = xb; \ - xb = _mm_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M(0xF)); - if (++sc->Wlow == 0) sc->Whigh++; - - // round 2-5 -#define M2(i) _mm_load_si128((__m128i *)termination + i) - - for (int k = 0; k < 4; k++) { - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M2(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x0], 
A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - - SWAP_AND_SUB(B[0x0], C[0x0], M2(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M2(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M2(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M2(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M2(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M2(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M2(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M2(0x7)); - 
SWAP_AND_SUB(B[0x8], C[0x8], M2(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M2(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M2(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M2(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M2(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M2(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M2(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M2(0xF)); - - if (++sc->Wlow == 0) sc->Whigh++; - - if (sc->Wlow-- == 0) sc->Whigh--; - } - - // download SIMD aligned deadlines - u32 simd_dst[8]; - _mm_storeu_si128((__m128i *)&simd_dst[0], C[8]); - _mm_storeu_si128((__m128i *)&simd_dst[4], C[9]); - - // unpack SIMD data - unsigned z; - for (z = 0; z < 2; z++) { - unsigned y = z * MSHABAL128_VECTOR_SIZE; - ((u32 *)dst0)[z] = simd_dst[y + 0]; - ((u32 *)dst1)[z] = simd_dst[y + 1]; - ((u32 *)dst2)[z] = simd_dst[y + 2]; - ((u32 *)dst3)[z] = simd_dst[y + 3]; - } - - // reset Wlow & Whigh - sc->Wlow = 1; - sc->Whigh = 0; -} - -#ifdef __cplusplus -} -#endif diff --git a/src/c/mshabal_128_avx.h b/src/c/mshabal_128_avx.h deleted file mode 100644 index 5a869db..0000000 --- a/src/c/mshabal_128_avx.h +++ /dev/null @@ -1,174 +0,0 @@ -/* - * A parallel implementation of Shabal, for platforms with AVX. - * - * This is the header file for an implementation of the Shabal family - * of hash functions, designed for maximum parallel speed. It processes - * up to four instances of Shabal in parallel, using the AVX unit. - * Total bandwidth appears to be up to twice that of a plain 32-bit - * Shabal implementation. - * - * A computation uses a mshabal_context structure. That structure is - * supposed to be allocated and released by the caller, e.g. as a - * local or global variable, or on the heap. The structure contents - * are initialized with mshabal_init(). Once the structure has been - * initialized, data is input as chunks, with the mshabal() functions. - * Chunks for the four parallel instances are provided simultaneously - * and must have the same length. It is allowed not to use some of the - * instances; the corresponding parameters in mshabal() are then NULL. - * However, using NULL as a chunk for one of the instances effectively - * deactivates that instance; this cannot be used to "skip" a chunk - * for one instance. - * - * The computation is finalized with mshabal_close(). Some extra message - * bits (0 to 7) can be input. The outputs of the four parallel instances - * are written in the provided buffers. There again, NULL can be - * provided as a parameter if the output of one of the instances is not - * needed. - * - * A mshabal_context instance is self-contained and holds no pointer. - * Thus, it can be cloned (e.g. with memcpy()) or moved (as long as - * proper alignment is maintained). This implementation uses no state - * variable beyond the context instance; thus, it is thread-safe and - * reentrant. - * - * The Shabal specification defines Shabal with output sizes of 192, - * 224, 256, 384 and 512 bits. This code accepts all those sizes, as - * well as any output size which is a multiple of 32, between 32 and - * 512 (inclusive). - * - * Parameters are not validated. Thus, undefined behaviour occurs if - * any of the "shall" or "must" clauses in this documentation is - * violated. - * - * - * (c) 2010 SAPHIR project. This software is provided 'as-is', without - * any express or implied warranty. In no event will the authors be held - * liable for any damages arising from the use of this software.
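As a quick orientation for the API contract documented above, here is a minimal, hypothetical usage sketch of the four-lane AVX variant declared in this header. The 64-byte inputs, fill values and 256-bit output size are illustrative choices, not taken from the original sources; the call sequence follows the documented init/update/close contract:

#include <stdio.h>
#include <string.h>
#include "mshabal_128_avx.h"

int main(void) {
    mshabal128_context sc;
    unsigned char in0[64], in1[64], in2[64], in3[64];     /* four equal-length messages */
    unsigned char out0[32], out1[32], out2[32], out3[32]; /* room for 256-bit digests */

    memset(in0, 0xA0, sizeof in0);  /* illustrative message contents */
    memset(in1, 0xA1, sizeof in1);
    memset(in2, 0xA2, sizeof in2);
    memset(in3, 0xA3, sizeof in3);

    mshabal_init_avx(&sc, 256);                        /* output size is given in bits */
    mshabal_avx(&sc, in0, in1, in2, in3, sizeof in0);  /* all four chunks share one length */
    mshabal_close_avx(&sc, 0, 0, 0, 0, 0,              /* no extra message bits */
                      out0, out1, out2, out3);         /* one digest per instance */

    for (size_t i = 0; i < sizeof out0; i++) printf("%02x", out0[i]);
    printf("\n");
    return 0;
}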
 - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to no restriction. - * - * Technical remarks and questions can be addressed to: - * - */ - -#ifndef MSHABAL_H__ -#define MSHABAL_H__ - -#include <stddef.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * We need an integer type with width 32-bit or more (preferably, with - * a width of exactly 32 bits). - */ -#if defined __STDC__ && __STDC_VERSION__ >= 199901L -#include <stdint.h> -#ifdef UINT32_MAX -typedef uint32_t mshabal_u32; -#else -typedef uint_fast32_t mshabal_u32; -#endif -#else -#if ((UINT_MAX >> 11) >> 11) >= 0x3FF -typedef unsigned int mshabal_u32; -#else -typedef unsigned long mshabal_u32; -#endif -#endif - -#define MSHABAL128_VECTOR_SIZE 4 - -/* - * The context structure for a Shabal computation. Contents are - * private. Such a structure should be allocated and released by - * the caller, in any memory area. - */ -typedef struct { - unsigned char buf0[64]; - unsigned char buf1[64]; - unsigned char buf2[64]; - unsigned char buf3[64]; - size_t ptr; - mshabal_u32 state[(12 + 16 + 16) * MSHABAL128_VECTOR_SIZE]; - mshabal_u32 Whigh, Wlow; - unsigned out_size; -} mshabal128_context; - -#pragma pack(1) -typedef struct { - mshabal_u32 state[(12 + 16 + 16) * MSHABAL128_VECTOR_SIZE]; - mshabal_u32 Whigh, Wlow; - unsigned out_size; -} mshabal128_context_fast; -#pragma pack() - -/* - * Initialize a context structure. The output size must be a multiple - * of 32, between 32 and 512 (inclusive). The output size is expressed - * in bits. - */ -void mshabal_init_avx(mshabal128_context *sc, unsigned out_size); - -/* - * Process some more data bytes; four chunks of data, pointed to by - * data0, data1, data2 and data3, are processed. The four chunks have - * the same length of "len" bytes. For efficiency, it is best if data is - * processed by medium-sized chunks, e.g. a few kilobytes at a time. - * - * The "len" data bytes shall all be accessible. If "len" is zero, this - * function does nothing and ignores the data* arguments. - * Otherwise, if one of the data* arguments is NULL, then the - * corresponding instance is deactivated (the final value obtained from - * that instance is undefined). - */ -void mshabal_avx(mshabal128_context *sc, const void *data0, const void *data1, const void *data2, - const void *data3, size_t len); - -/* - * Terminate the Shabal computation incarnated by the provided context - * structure. "n" shall be a value between 0 and 7 (inclusive): this is - * the number of extra bits to extract from ub0, ub1, ub2 and ub3, and - * append at the end of the input message for each of the four parallel - * instances. Bits in "ub*" are taken in big-endian format: first bit is - * the one of numerical value 128, second bit has numerical value 64, - * and so on. Other bits in "ub*" are ignored. For most applications, - * input messages will consist of a sequence of bytes, and the "ub*" and - * "n" parameters will be zero. - * - * The Shabal output for each of the parallel instances is written out - * in the areas pointed to by, respectively, dst0, dst1, dst2 and dst3. - * These areas shall be wide enough to accommodate the result (result - * size was specified as a parameter to mshabal_init()). It is acceptable - * to use NULL for any of those pointers, if the result from the - * corresponding instance is not needed. - * - * After this call, the context structure is invalid. The caller shall - * release it, or reinitialize it with mshabal_init(). The mshabal_close() - * function does NOT imply a hidden call to mshabal_init(). - */ -void mshabal_close_avx(mshabal128_context *sc, unsigned ub0, unsigned ub1, unsigned ub2, - unsigned ub3, unsigned n, void *dst0, void *dst1, void *dst2, - void *dst3); - -/* - * optimised Shabal routine for PoC mining - */ -void mshabal_deadline_fast_avx(mshabal128_context_fast *sc, void *message, void *termination, void *dst0, - void *dst1, void *dst2, void *dst3); - -/* - * optimised Shabal routine for PoC plotting and hashing - */ -void mshabal_hash_fast_avx(mshabal128_context_fast *sc, void *message, void *termination, - void *dst, unsigned num); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/src/c/mshabal_128_neon.c b/src/c/mshabal_128_neon.c deleted file mode 100644 index 62ea371..0000000 --- a/src/c/mshabal_128_neon.c +++ /dev/null @@ -1,963 +0,0 @@ -/* - * Parallel implementation of Shabal, using the NEON unit. This code - * compiles and runs on ARM architectures, in 32-bit or 64-bit mode, - * which possess a NEON-compatible SIMD unit (the SSE2 intrinsics used - * below are mapped onto NEON by the SSE2NEON.h translation header). - * - * - * (c) 2010 SAPHIR project. This software is provided 'as-is', without - * any express or implied warranty. In no event will the authors be held - * liable for any damages arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to no restriction. - * - * Technical remarks and questions can be addressed to: - * - */ - -#include "SSE2NEON.h" -#include <stddef.h> -#include <string.h> -#include "mshabal_128_neon.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef _MSC_VER -#pragma warning(disable : 4146) -#endif - -typedef mshabal_u32 u32; - -#define C32(x) ((u32)x##UL) -#define T32(x) ((x)&C32(0xFFFFFFFF)) -#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n)))) - -static void mshabal_compress_neon(mshabal128_context *sc, const unsigned char *buf0, - const unsigned char *buf1, const unsigned char *buf2, - const unsigned char *buf3, size_t num) { - union { - u32 words[16 * MSHABAL128_VECTOR_SIZE]; - __m128i data[16]; - } u; - size_t j; - __m128i A[12], B[16], C[16]; - __m128i one; - - for (j = 0; j < 12; j++) A[j] = _mm_loadu_si128((__m128i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm_loadu_si128((__m128i *)sc->state + j + 12); - C[j] = _mm_loadu_si128((__m128i *)sc->state + j + 28); - } - one = _mm_set1_epi32(C32(0xFFFFFFFF)); - -#define M(i) _mm_load_si128(u.data + i) - - while (num-- > 0) { - for (j = 0; j < 16 * MSHABAL128_VECTOR_SIZE; j += MSHABAL128_VECTOR_SIZE) { - u.words[j + 0] = *(u32 *)(buf0 + j); - u.words[j + 1] = *(u32 *)(buf1 + j); - u.words[j + 2] = *(u32 *)(buf2 + j); - u.words[j + 3] = *(u32 *)(buf3 + j); - } - - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - -#define PP(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m128i tt; \ - tt = _mm_or_si128(_mm_slli_epi32(xa1, 15), _mm_srli_epi32(xa1, 17)); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 2), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(xa0, tt), xc); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 1), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(tt, xb1), _mm_xor_si128(_mm_andnot_si128(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \
- tt = _mm_or_si128(_mm_slli_epi32(tt, 1), _mm_srli_epi32(tt, 31)); \ - xb0 = _mm_xor_si128(tt, _mm_xor_si128(xa0, one)); \ - } while (0) - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - 
A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB(xb, xc, xm) \ - do { \ - __m128i tmp; \ - tmp = xb; \ - xb = _mm_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M(0xF)); - - buf0 += 64; - buf1 += 64; - buf2 += 64; - buf3 += 64; - if (++sc->Wlow == 0) sc->Whigh++; - } - - for (j = 0; j < 12; j++) _mm_storeu_si128((__m128i *)sc->state + j, A[j]); - for (j = 0; j < 16; j++) { - _mm_storeu_si128((__m128i *)sc->state + j + 12, B[j]); - _mm_storeu_si128((__m128i *)sc->state + j + 28, C[j]); - } -#undef M -} - -void mshabal_init_neon(mshabal128_context *sc, unsigned out_size) { - unsigned u; - - memset(sc->state, 0, sizeof sc->state); - memset(sc->buf0, 0, sizeof sc->buf0); - memset(sc->buf1, 0, sizeof sc->buf1); - memset(sc->buf2, 0, sizeof sc->buf2); - memset(sc->buf3, 0, sizeof sc->buf3); - for (u = 0; u < 16; u++) { - sc->buf0[4 * u + 0] = (out_size + u); - sc->buf0[4 * u + 1] = (out_size + u) >> 8; - sc->buf1[4 * u + 0] = (out_size + u); - sc->buf1[4 * u + 1] = (out_size + u) >> 8; - sc->buf2[4 * u + 0] = (out_size + u); - sc->buf2[4 * u + 1] = (out_size + u) >> 8; - sc->buf3[4 * u + 0] = (out_size + u); - sc->buf3[4 * u + 1] = (out_size + u) >> 8; - } - sc->Whigh = sc->Wlow = C32(0xFFFFFFFF); - mshabal_compress_neon(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, 1); - for (u = 0; u < 16; u++) { - sc->buf0[4 * u + 0] = (out_size + u + 16); - sc->buf0[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf1[4 * u + 0] = (out_size + u + 16); - sc->buf1[4 * u + 1] = 
(out_size + u + 16) >> 8; - sc->buf2[4 * u + 0] = (out_size + u + 16); - sc->buf2[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf3[4 * u + 0] = (out_size + u + 16); - sc->buf3[4 * u + 1] = (out_size + u + 16) >> 8; - } - mshabal_compress_neon(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, 1); - sc->ptr = 0; - sc->out_size = out_size; -} - -void mshabal_neon(mshabal128_context *sc, const void *data0, const void *data1, const void *data2, - const void *data3, size_t len) { - size_t ptr, num; - - if (data0 == NULL) { - if (data1 == NULL) { - if (data2 == NULL) { - if (data3 == NULL) { - return; - } else { - data0 = data3; - } - } else { - data0 = data2; - } - } else { - data0 = data1; - } - } - - if (data1 == NULL) data1 = data0; - if (data2 == NULL) data2 = data0; - if (data3 == NULL) data3 = data0; - - ptr = sc->ptr; - if (ptr != 0) { - size_t clen = (sizeof sc->buf0 - ptr); - if (clen > len) { - memcpy(sc->buf0 + ptr, data0, len); - memcpy(sc->buf1 + ptr, data1, len); - memcpy(sc->buf2 + ptr, data2, len); - memcpy(sc->buf3 + ptr, data3, len); - sc->ptr = ptr + len; - return; - } else { - memcpy(sc->buf0 + ptr, data0, clen); - memcpy(sc->buf1 + ptr, data1, clen); - memcpy(sc->buf2 + ptr, data2, clen); - memcpy(sc->buf3 + ptr, data3, clen); - mshabal_compress_neon(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, 1); - data0 = (const unsigned char *)data0 + clen; - data1 = (const unsigned char *)data1 + clen; - data2 = (const unsigned char *)data2 + clen; - data3 = (const unsigned char *)data3 + clen; - len -= clen; - } - } - - num = len >> 6; - if (num != 0) { - mshabal_compress_neon(sc,data0, data1, data2, data3, num); - data0 = (const unsigned char *)data0 + (num << 6); - data1 = (const unsigned char *)data1 + (num << 6); - data2 = (const unsigned char *)data2 + (num << 6); - data3 = (const unsigned char *)data3 + (num << 6); - } - len &= 63; - memcpy(sc->buf0, data0, len); - memcpy(sc->buf1, data1, len); - memcpy(sc->buf2, data2, len); - memcpy(sc->buf3, data3, len); - sc->ptr = len; -} - -void mshabal_close_neon(mshabal128_context *sc, unsigned ub0, unsigned ub1, unsigned ub2, unsigned ub3, - unsigned n, void *dst0, void *dst1, void *dst2, void *dst3) { - size_t ptr, off; - unsigned z, out_size_w32; - - z = 0x80 >> n; - ptr = sc->ptr; - sc->buf0[ptr] = (ub0 & -z) | z; - sc->buf1[ptr] = (ub1 & -z) | z; - sc->buf2[ptr] = (ub2 & -z) | z; - sc->buf3[ptr] = (ub3 & -z) | z; - ptr++; - memset(sc->buf0 + ptr, 0, (sizeof sc->buf0) - ptr); - memset(sc->buf1 + ptr, 0, (sizeof sc->buf1) - ptr); - memset(sc->buf2 + ptr, 0, (sizeof sc->buf2) - ptr); - memset(sc->buf3 + ptr, 0, (sizeof sc->buf3) - ptr); - for (z = 0; z < 4; z++) { - mshabal_compress_neon(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, 1); - if (sc->Wlow-- == 0) sc->Whigh--; - } - out_size_w32 = sc->out_size >> 5; - off = MSHABAL128_VECTOR_SIZE * (28 + (16 - out_size_w32)); - if (dst0 != NULL) { - u32 *out; - - out = (u32 *)dst0; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL128_VECTOR_SIZE + 0]; - } - if (dst1 != NULL) { - u32 *out; - - out = (u32 *)dst1; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL128_VECTOR_SIZE + 1]; - } - if (dst2 != NULL) { - u32 *out; - - out = (u32 *)dst2; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL128_VECTOR_SIZE + 2]; - } - if (dst3 != NULL) { - u32 *out; - - out = (u32 *)dst3; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL128_VECTOR_SIZE + 3]; - } -} - -// Shabal routine optimized for plotting and 
hashing -void mshabal_hash_fast_neon(mshabal128_context_fast *sc, void *message, void *termination, - void *dst, unsigned num) { - union input { - u32 words[16 * MSHABAL128_VECTOR_SIZE]; - __m128i data[16]; - }; - size_t j; - __m128i A[12], B[16], C[16]; - __m128i one; - - for (j = 0; j < 12; j++) A[j] = _mm_loadu_si128((__m128i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm_loadu_si128((__m128i *)sc->state + j + 12); - C[j] = _mm_loadu_si128((__m128i *)sc->state + j + 28); - } - one = _mm_set1_epi32(C32(0xFFFFFFFF)); - - // round 1 -#define M(i) _mm_load_si128((__m128i *)message + i) - - while (num-- > 0) { - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - -#define PP(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m128i tt; \ - tt = _mm_or_si128(_mm_slli_epi32(xa1, 15), _mm_srli_epi32(xa1, 17)); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 2), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(xa0, tt), xc); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 1), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(tt, xb1), _mm_xor_si128(_mm_andnot_si128(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm_or_si128(_mm_slli_epi32(tt, 1), _mm_srli_epi32(tt, 31)); \ - xb0 = _mm_xor_si128(tt, _mm_xor_si128(xa0, one)); \ - } while (0) - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], 
B[0x3], C[0xB], M(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB(xb, xc, xm) \ - do { \ - __m128i tmp; \ - tmp = xb; \ - xb = _mm_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M(0xE)); - 
SWAP_AND_SUB(B[0xF], C[0xF], M(0xF)); - - // move data pointer - message = (__m128i *)message + 16; - - if (++sc->Wlow == 0) sc->Whigh++; - } - - // round 2-5 -#define M2(i) _mm_load_si128((__m128i *)termination + i) - - for (int k = 0; k < 4; k++) { - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M2(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0x6], A[0x5], B[0xA], 
B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - - SWAP_AND_SUB(B[0x0], C[0x0], M2(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M2(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M2(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M2(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M2(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M2(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M2(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M2(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M2(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M2(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M2(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M2(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M2(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M2(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M2(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M2(0xF)); - - if (++sc->Wlow == 0) sc->Whigh++; - - if (sc->Wlow-- == 0) sc->Whigh--; - } - - // download SIMD aligned hashes - for (j = 0; j < 8; j++) { - _mm_storeu_si128((__m128i *)dst + j, C[j + 8]); - } - - // reset Wlow & Whigh - sc->Wlow = 1; - sc->Whigh = 0; -} - -// Shabal routine optimized for mining -void mshabal_deadline_fast_neon(mshabal128_context_fast *sc, void *message, void *termination, void *dst0, - void *dst1, void *dst2, void *dst3) { - union input { - u32 words[16 * MSHABAL128_VECTOR_SIZE]; - __m128i data[16]; - }; - size_t j; - __m128i A[12], B[16], C[16]; - __m128i one; - - for (j = 0; j < 12; j++) A[j] = _mm_loadu_si128((__m128i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm_loadu_si128((__m128i *)sc->state + j + 12); - C[j] = _mm_loadu_si128((__m128i *)sc->state + j + 28); - } - one = _mm_set1_epi32(C32(0xFFFFFFFF)); - - // round 1 -#define M(i) _mm_load_si128((__m128i *)message + i) - - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], 
M(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - -#define PP(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m128i tt; \ - tt = _mm_or_si128(_mm_slli_epi32(xa1, 15), _mm_srli_epi32(xa1, 17)); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 2), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(xa0, tt), xc); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 1), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(tt, xb1), _mm_xor_si128(_mm_andnot_si128(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm_or_si128(_mm_slli_epi32(tt, 1), _mm_srli_epi32(tt, 31)); \ - xb0 = _mm_xor_si128(tt, _mm_xor_si128(xa0, one)); \ - } while (0) - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x3], A[0x2], 
B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB(xb, xc, xm) \ - do { \ - __m128i tmp; \ - tmp = xb; \ - xb = _mm_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M(0xF)); - if (++sc->Wlow == 0) sc->Whigh++; - - // round 2-5 -#define M2(i) _mm_load_si128((__m128i *)termination + i) - - for (int k = 0; k < 4; k++) { - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M2(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - 
PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = 
_mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - - SWAP_AND_SUB(B[0x0], C[0x0], M2(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M2(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M2(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M2(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M2(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M2(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M2(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M2(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M2(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M2(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M2(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M2(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M2(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M2(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M2(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M2(0xF)); - - if (++sc->Wlow == 0) sc->Whigh++; - - if (sc->Wlow-- == 0) sc->Whigh--; - } - - // download SIMD aligned deadlines - u32 simd_dst[8]; - _mm_storeu_si128((__m128i *)&simd_dst[0], C[8]); - _mm_storeu_si128((__m128i *)&simd_dst[4], C[9]); - - // unpack SIMD data - unsigned z; - for (z = 0; z < 2; z++) { - unsigned y = z * MSHABAL128_VECTOR_SIZE; - ((u32 *)dst0)[z] = simd_dst[y + 0]; - ((u32 *)dst1)[z] = simd_dst[y + 1]; - ((u32 *)dst2)[z] = simd_dst[y + 2]; - ((u32 *)dst3)[z] = simd_dst[y + 3]; - } - - // reset Wlow & Whigh - sc->Wlow = 1; - sc->Whigh = 0; -} - -#ifdef __cplusplus -} -#endif diff --git a/src/c/mshabal_128_neon.h b/src/c/mshabal_128_neon.h deleted file mode 100644 index d230981..0000000 --- a/src/c/mshabal_128_neon.h +++ /dev/null @@ -1,174 +0,0 @@ -/* - * A parallel implementation of Shabal, for platforms with NEON. - * - * This is the header file for an implementation of the Shabal family - * of hash functions, designed for maximum parallel speed. It processes - * up to four instances of Shabal in parallel, using the NEON unit. - * Total bandwidth appears to be up to twice that of a plain 32-bit - * Shabal implementation. - * - * A computation uses a mshabal_context structure. That structure is - * supposed to be allocated and released by the caller, e.g. as a - * local or global variable, or on the heap. The structure contents - * are initialized with mshabal_init(). Once the structure has been - * initialized, data is input as chunks, with the mshabal() functions. - * Chunks for the four parallel instances are provided simultaneously - * and must have the same length. It is allowed not to use some of the - * instances; the corresponding parameters in mshabal() are then NULL. - * However, using NULL as a chunk for one of the instances effectively - * deactivates that instance; this cannot be used to "skip" a chunk - * for one instance. - * - * The computation is finalized with mshabal_close(). Some extra message - * bits (0 to 7) can be input. The outputs of the four parallel instances - * are written in the provided buffers. There again, NULL can be - * provided as a parameter if the output of one of the instances is not - * needed. - * - * A mshabal_context instance is self-contained and holds no pointer. - * Thus, it can be cloned (e.g. with memcpy()) or moved (as long as - * proper alignment is maintained). This implementation uses no state - * variable beyond the context instance; thus, it is thread-safe and - * reentrant. - * - * The Shabal specification defines Shabal with output sizes of 192, - * 224, 256, 384 and 512 bits. This code accepts all those sizes, as - * well as any output size which is a multiple of 32, between 32 and - * 512 (inclusive). - * - * Parameters are not validated. Thus, undefined behaviour occurs if - * any of the "shall" or "must" clauses in this documentation is - * violated. - * - * - * (c) 2010 SAPHIR project. This software is provided 'as-is', without - * any express or implied warranty. In no event will the authors be held - * liable for any damages arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to no restriction. - * - * Technical remarks and questions can be addressed to: - * - */ - -#ifndef MSHABAL_H__ -#define MSHABAL_H__ - -#include <stddef.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * We need an integer type with width 32-bit or more (preferably, with - * a width of exactly 32 bits). - */ -#if defined __STDC__ && __STDC_VERSION__ >= 199901L -#include <stdint.h> -#ifdef UINT32_MAX -typedef uint32_t mshabal_u32; -#else -typedef uint_fast32_t mshabal_u32; -#endif -#else -#if ((UINT_MAX >> 11) >> 11) >= 0x3FF -typedef unsigned int mshabal_u32; -#else -typedef unsigned long mshabal_u32; -#endif -#endif - -#define MSHABAL128_VECTOR_SIZE 4 - -/* - * The context structure for a Shabal computation. Contents are - * private. Such a structure should be allocated and released by - * the caller, in any memory area. - */ -typedef struct { - unsigned char buf0[64]; - unsigned char buf1[64]; - unsigned char buf2[64]; - unsigned char buf3[64]; - size_t ptr; - mshabal_u32 state[(12 + 16 + 16) * MSHABAL128_VECTOR_SIZE]; - mshabal_u32 Whigh, Wlow; - unsigned out_size; -} mshabal128_context; - -#pragma pack(1) -typedef struct { - mshabal_u32 state[(12 + 16 + 16) * MSHABAL128_VECTOR_SIZE]; - mshabal_u32 Whigh, Wlow; - unsigned out_size; -} mshabal128_context_fast; -#pragma pack() - -/* - * Initialize a context structure. The output size must be a multiple - * of 32, between 32 and 512 (inclusive). The output size is expressed - * in bits. - */ -void mshabal_init_neon(mshabal128_context *sc, unsigned out_size); - -/* - * Process some more data bytes; four chunks of data, pointed to by - * data0, data1, data2 and data3, are processed. The four chunks have - * the same length of "len" bytes. For efficiency, it is best if data is - * processed by medium-sized chunks, e.g. a few kilobytes at a time. - * - * The "len" data bytes shall all be accessible. If "len" is zero, this - * function does nothing and ignores the data* arguments.
- * Otherwise, if one of the data* arguments is NULL, then the - * corresponding instance is deactivated (the final value obtained from - * that instance is undefined). - */ -void mshabal_neon(mshabal128_context *sc, const void *data0, const void *data1, const void *data2, - const void *data3, size_t len); - -/* - * Terminate the Shabal computation incarnated by the provided context - * structure. "n" shall be a value between 0 and 7 (inclusive): this is - * the number of extra bits to extract from ub0, ub1, ub2 and ub3, and - * append at the end of the input message for each of the four parallel - * instances. Bits in "ub*" are taken in big-endian format: first bit is - * the one of numerical value 128, second bit has numerical value 64, - * and so on. Other bits in "ub*" are ignored. For most applications, - * input messages will consist of sequences of bytes, and the "ub*" and - * "n" parameters will be zero. - * - * The Shabal output for each of the parallel instances is written out - * in the areas pointed to by, respectively, dst0, dst1, dst2 and dst3. - * These areas shall be wide enough to accommodate the result (result - * size was specified as parameter to mshabal_init()). It is acceptable - * to use NULL for any of those pointers, if the result from the - * corresponding instance is not needed. - * - * After this call, the context structure is invalid. The caller shall - * release it, or reinitialize it with mshabal_init(). The mshabal_close() - * function does NOT imply a hidden call to mshabal_init(). - */ -void mshabal_close_neon(mshabal128_context *sc, unsigned ub0, unsigned ub1, unsigned ub2, - unsigned ub3, unsigned n, void *dst0, void *dst1, void *dst2, - void *dst3); - -/* - * optimised Shabal routine for PoC mining - */ -void mshabal_deadline_fast_neon(mshabal128_context_fast *sc, void *message, void *termination, void *dst0, - void *dst1, void *dst2, void *dst3); - -/* - * optimised Shabal routine for PoC plotting and hashing - */ -void mshabal_hash_fast_neon(mshabal128_context_fast *sc, void *message, void *termination, - void *dst, unsigned num); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/src/c/mshabal_128_sse2.c b/src/c/mshabal_128_sse2.c deleted file mode 100644 index e147a75..0000000 --- a/src/c/mshabal_128_sse2.c +++ /dev/null @@ -1,963 +0,0 @@ -/* - * Parallel implementation of Shabal, using the SSE2 unit. This code - * compiles and runs on x86 architectures, in 32-bit or 64-bit mode, - * which possess an SSE2-compatible SIMD unit. - * - * - * (c) 2010 SAPHIR project. This software is provided 'as-is', without - * any express or implied warranty. In no event will the authors be held - * liable for any damages arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to no restriction.
- * - * Technical remarks and questions can be addressed to: - * - */ - -#include <stddef.h> -#include <string.h> -#include <emmintrin.h> -#include "mshabal_128_sse2.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef _MSC_VER -#pragma warning(disable : 4146) -#endif - -typedef mshabal_u32 u32; - -#define C32(x) ((u32)x##UL) -#define T32(x) ((x)&C32(0xFFFFFFFF)) -#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n)))) - -static void mshabal_compress_sse2(mshabal128_context *sc, const unsigned char *buf0, - const unsigned char *buf1, const unsigned char *buf2, - const unsigned char *buf3, size_t num) { - union { - u32 words[16 * MSHABAL128_VECTOR_SIZE]; - __m128i data[16]; - } u; - size_t j; - __m128i A[12], B[16], C[16]; - __m128i one; - - for (j = 0; j < 12; j++) A[j] = _mm_loadu_si128((__m128i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm_loadu_si128((__m128i *)sc->state + j + 12); - C[j] = _mm_loadu_si128((__m128i *)sc->state + j + 28); - } - one = _mm_set1_epi32(C32(0xFFFFFFFF)); - -#define M(i) _mm_load_si128(u.data + i) - - while (num-- > 0) { - for (j = 0; j < 16 * MSHABAL128_VECTOR_SIZE; j += MSHABAL128_VECTOR_SIZE) { - u.words[j + 0] = *(u32 *)(buf0 + j); - u.words[j + 1] = *(u32 *)(buf1 + j); - u.words[j + 2] = *(u32 *)(buf2 + j); - u.words[j + 3] = *(u32 *)(buf3 + j); - } - - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - -#define PP(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m128i tt; \ - tt = _mm_or_si128(_mm_slli_epi32(xa1, 15), _mm_srli_epi32(xa1, 17)); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 2), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(xa0, tt), xc); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 1), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(tt, xb1), _mm_xor_si128(_mm_andnot_si128(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm_or_si128(_mm_slli_epi32(tt, 1), _mm_srli_epi32(tt, 31)); \ - xb0 = _mm_xor_si128(tt, _mm_xor_si128(xa0, one)); \ - } while (0) - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); -
PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB(xb, xc, xm) \ - do { \ - __m128i tmp; \ - tmp = xb; \ - 
xb = _mm_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M(0xF)); - - buf0 += 64; - buf1 += 64; - buf2 += 64; - buf3 += 64; - if (++sc->Wlow == 0) sc->Whigh++; - } - - for (j = 0; j < 12; j++) _mm_storeu_si128((__m128i *)sc->state + j, A[j]); - for (j = 0; j < 16; j++) { - _mm_storeu_si128((__m128i *)sc->state + j + 12, B[j]); - _mm_storeu_si128((__m128i *)sc->state + j + 28, C[j]); - } -#undef M -} - -void mshabal_init_sse2(mshabal128_context *sc, unsigned out_size) { - unsigned u; - - memset(sc->state, 0, sizeof sc->state); - memset(sc->buf0, 0, sizeof sc->buf0); - memset(sc->buf1, 0, sizeof sc->buf1); - memset(sc->buf2, 0, sizeof sc->buf2); - memset(sc->buf3, 0, sizeof sc->buf3); - for (u = 0; u < 16; u++) { - sc->buf0[4 * u + 0] = (out_size + u); - sc->buf0[4 * u + 1] = (out_size + u) >> 8; - sc->buf1[4 * u + 0] = (out_size + u); - sc->buf1[4 * u + 1] = (out_size + u) >> 8; - sc->buf2[4 * u + 0] = (out_size + u); - sc->buf2[4 * u + 1] = (out_size + u) >> 8; - sc->buf3[4 * u + 0] = (out_size + u); - sc->buf3[4 * u + 1] = (out_size + u) >> 8; - } - sc->Whigh = sc->Wlow = C32(0xFFFFFFFF); - mshabal_compress_sse2(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, 1); - for (u = 0; u < 16; u++) { - sc->buf0[4 * u + 0] = (out_size + u + 16); - sc->buf0[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf1[4 * u + 0] = (out_size + u + 16); - sc->buf1[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf2[4 * u + 0] = (out_size + u + 16); - sc->buf2[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf3[4 * u + 0] = (out_size + u + 16); - sc->buf3[4 * u + 1] = (out_size + u + 16) >> 8; - } - mshabal_compress_sse2(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, 1); - sc->ptr = 0; - sc->out_size = out_size; -} - -void mshabal_sse2(mshabal128_context *sc, const void *data0, const void *data1, const void *data2, - const void *data3, size_t len) { - size_t ptr, num; - - if (data0 == NULL) { - if (data1 == NULL) { - if (data2 == NULL) { - if (data3 == NULL) { - return; - } else { - data0 = data3; - } - } else { - data0 = data2; - } - } else { - data0 = data1; - } - } - - if (data1 == NULL) data1 = data0; - if (data2 == NULL) data2 = data0; - if (data3 == NULL) data3 = data0; - - ptr = sc->ptr; - if (ptr != 0) { - size_t clen = (sizeof sc->buf0 - ptr); - if (clen > len) { - memcpy(sc->buf0 + ptr, data0, len); - memcpy(sc->buf1 + ptr, data1, len); - memcpy(sc->buf2 + ptr, data2, len); - memcpy(sc->buf3 + ptr, data3, len); - sc->ptr = ptr + len; - return; - } else { - memcpy(sc->buf0 + ptr, data0, clen); - memcpy(sc->buf1 + ptr, data1, clen); - memcpy(sc->buf2 + ptr, data2, clen); - memcpy(sc->buf3 + ptr, data3, clen); - mshabal_compress_sse2(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, 1); - data0 = (const unsigned char *)data0 + clen; - data1 = (const unsigned char *)data1 + clen; - data2 = (const unsigned char *)data2 + clen; - data3 = (const unsigned char *)data3 + clen; - len -= clen; - } - } - - num 
= len >> 6; - if (num != 0) { - mshabal_compress_sse2(sc, data0, data1, data2, data3, num); - data0 = (const unsigned char *)data0 + (num << 6); - data1 = (const unsigned char *)data1 + (num << 6); - data2 = (const unsigned char *)data2 + (num << 6); - data3 = (const unsigned char *)data3 + (num << 6); - } - len &= 63; - memcpy(sc->buf0, data0, len); - memcpy(sc->buf1, data1, len); - memcpy(sc->buf2, data2, len); - memcpy(sc->buf3, data3, len); - sc->ptr = len; -} - -void mshabal_close_sse2(mshabal128_context *sc, unsigned ub0, unsigned ub1, unsigned ub2, unsigned ub3, - unsigned n, void *dst0, void *dst1, void *dst2, void *dst3) { - size_t ptr, off; - unsigned z, out_size_w32; - - z = 0x80 >> n; - ptr = sc->ptr; - sc->buf0[ptr] = (ub0 & -z) | z; - sc->buf1[ptr] = (ub1 & -z) | z; - sc->buf2[ptr] = (ub2 & -z) | z; - sc->buf3[ptr] = (ub3 & -z) | z; - ptr++; - memset(sc->buf0 + ptr, 0, (sizeof sc->buf0) - ptr); - memset(sc->buf1 + ptr, 0, (sizeof sc->buf1) - ptr); - memset(sc->buf2 + ptr, 0, (sizeof sc->buf2) - ptr); - memset(sc->buf3 + ptr, 0, (sizeof sc->buf3) - ptr); - for (z = 0; z < 4; z++) { - mshabal_compress_sse2(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, 1); - if (sc->Wlow-- == 0) sc->Whigh--; - } - out_size_w32 = sc->out_size >> 5; - off = MSHABAL128_VECTOR_SIZE * (28 + (16 - out_size_w32)); - if (dst0 != NULL) { - u32 *out; - - out = (u32 *)dst0; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL128_VECTOR_SIZE + 0]; - } - if (dst1 != NULL) { - u32 *out; - - out = (u32 *)dst1; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL128_VECTOR_SIZE + 1]; - } - if (dst2 != NULL) { - u32 *out; - - out = (u32 *)dst2; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL128_VECTOR_SIZE + 2]; - } - if (dst3 != NULL) { - u32 *out; - - out = (u32 *)dst3; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL128_VECTOR_SIZE + 3]; - } -} - -// Shabal routine optimized for plotting and hashing -void mshabal_hash_fast_sse2(mshabal128_context_fast *sc, void *message, void *termination, - void *dst, unsigned num) { - union input { - u32 words[16 * MSHABAL128_VECTOR_SIZE]; - __m128i data[16]; - }; - size_t j; - __m128i A[12], B[16], C[16]; - __m128i one; - - for (j = 0; j < 12; j++) A[j] = _mm_loadu_si128((__m128i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm_loadu_si128((__m128i *)sc->state + j + 12); - C[j] = _mm_loadu_si128((__m128i *)sc->state + j + 28); - } - one = _mm_set1_epi32(C32(0xFFFFFFFF)); - - // round 1 -#define M(i) _mm_load_si128((__m128i *)message + i) - - while (num-- > 0) { - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - -#define PP(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m128i tt; \ - tt = _mm_or_si128(_mm_slli_epi32(xa1, 15), _mm_srli_epi32(xa1, 17)); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 2), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(xa0, tt), xc); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 1), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(tt, xb1), _mm_xor_si128(_mm_andnot_si128(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm_or_si128(_mm_slli_epi32(tt, 1), _mm_srli_epi32(tt, 31)); \ - xb0 = _mm_xor_si128(tt, _mm_xor_si128(xa0, one)); \ - } while (0) - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - 
PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] 
= _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB(xb, xc, xm) \ - do { \ - __m128i tmp; \ - tmp = xb; \ - xb = _mm_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M(0xF)); - - // move data pointer - message = (__m128i *)message + 16; - - if (++sc->Wlow == 0) sc->Whigh++; - } - - // round 2-5 -#define M2(i) _mm_load_si128((__m128i *)termination + i) - - for (int k = 0; k < 4; k++) { - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M2(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0x2], A[0x1], B[0xE], 
B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = 
_mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - - SWAP_AND_SUB(B[0x0], C[0x0], M2(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M2(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M2(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M2(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M2(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M2(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M2(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M2(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M2(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M2(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M2(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M2(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M2(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M2(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M2(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M2(0xF)); - - if (++sc->Wlow == 0) sc->Whigh++; - - if (sc->Wlow-- == 0) sc->Whigh--; - } - - // download SIMD aligned hashes - for (j = 0; j < 8; j++) { - _mm_storeu_si128((__m128i *)dst + j, C[j + 8]); - } - - // reset Wlow & Whigh - sc->Wlow = 1; - sc->Whigh = 0; -} - -// Shabal routine optimized for mining -void mshabal_deadline_fast_sse2(mshabal128_context_fast *sc, void *message, void *termination, void *dst0, - void *dst1, void *dst2, void *dst3) { - union input { - u32 words[16 * MSHABAL128_VECTOR_SIZE]; - __m128i data[16]; - }; - size_t j; - __m128i A[12], B[16], C[16]; - __m128i one; - - for (j = 0; j < 12; j++) A[j] = _mm_loadu_si128((__m128i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm_loadu_si128((__m128i *)sc->state + j + 12); - C[j] = _mm_loadu_si128((__m128i *)sc->state + j + 28); - } - one = _mm_set1_epi32(C32(0xFFFFFFFF)); - - // round 1 -#define M(i) _mm_load_si128((__m128i *)message + i) - - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - -#define PP(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m128i tt; \ - tt = _mm_or_si128(_mm_slli_epi32(xa1, 15), _mm_srli_epi32(xa1, 17)); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 2), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(xa0, tt), xc); \ - tt = _mm_add_epi32(_mm_slli_epi32(tt, 1), tt); \ - tt = _mm_xor_si128(_mm_xor_si128(tt, xb1), _mm_xor_si128(_mm_andnot_si128(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm_or_si128(_mm_slli_epi32(tt, 1), _mm_srli_epi32(tt, 31)); \ - xb0 = _mm_xor_si128(tt, _mm_xor_si128(xa0, one)); \ - } while (0) - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], 
C[0xE], M(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = 
_mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB(xb, xc, xm) \ - do { \ - __m128i tmp; \ - tmp = xb; \ - xb = _mm_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M(0xF)); - if (++sc->Wlow == 0) sc->Whigh++; - - // round 2-5 -#define M2(i) _mm_load_si128((__m128i *)termination + i) - - for (int k = 0; k < 4; k++) { - for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M2(j)); - - A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); - A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); - - PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], 
M2(0x8)); - PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); - - SWAP_AND_SUB(B[0x0], C[0x0], M2(0x0)); - SWAP_AND_SUB(B[0x1], C[0x1], M2(0x1)); - SWAP_AND_SUB(B[0x2], C[0x2], M2(0x2)); - SWAP_AND_SUB(B[0x3], C[0x3], M2(0x3)); - SWAP_AND_SUB(B[0x4], C[0x4], M2(0x4)); - SWAP_AND_SUB(B[0x5], C[0x5], M2(0x5)); - SWAP_AND_SUB(B[0x6], C[0x6], M2(0x6)); - SWAP_AND_SUB(B[0x7], C[0x7], M2(0x7)); - SWAP_AND_SUB(B[0x8], C[0x8], M2(0x8)); - SWAP_AND_SUB(B[0x9], 
C[0x9], M2(0x9)); - SWAP_AND_SUB(B[0xA], C[0xA], M2(0xA)); - SWAP_AND_SUB(B[0xB], C[0xB], M2(0xB)); - SWAP_AND_SUB(B[0xC], C[0xC], M2(0xC)); - SWAP_AND_SUB(B[0xD], C[0xD], M2(0xD)); - SWAP_AND_SUB(B[0xE], C[0xE], M2(0xE)); - SWAP_AND_SUB(B[0xF], C[0xF], M2(0xF)); - - if (++sc->Wlow == 0) sc->Whigh++; - - if (sc->Wlow-- == 0) sc->Whigh--; - } - - // download SIMD aligned deadlines - u32 simd_dst[8]; - _mm_storeu_si128((__m128i *)&simd_dst[0], C[8]); - _mm_storeu_si128((__m128i *)&simd_dst[4], C[9]); - - // unpack SIMD data - unsigned z; - for (z = 0; z < 2; z++) { - unsigned y = z * MSHABAL128_VECTOR_SIZE; - ((u32 *)dst0)[z] = simd_dst[y + 0]; - ((u32 *)dst1)[z] = simd_dst[y + 1]; - ((u32 *)dst2)[z] = simd_dst[y + 2]; - ((u32 *)dst3)[z] = simd_dst[y + 3]; - } - - // reset Wlow & Whigh - sc->Wlow = 1; - sc->Whigh = 0; -} - -#ifdef __cplusplus -} -#endif diff --git a/src/c/mshabal_128_sse2.h b/src/c/mshabal_128_sse2.h deleted file mode 100644 index 5874469..0000000 --- a/src/c/mshabal_128_sse2.h +++ /dev/null @@ -1,174 +0,0 @@ -/* - * A parallel implementation of Shabal, for platforms with SSE2. - * - * This is the header file for an implementation of the Shabal family - * of hash functions, designed for maximum parallel speed. It processes - * up to four instances of Shabal in parallel, using the SSE2 unit. - * Total bandwidth appears to be up to twice that of a plain 32-bit - * Shabal implementation. - * - * A computation uses a mshabal_context structure. That structure is - * supposed to be allocated and released by the caller, e.g. as a - * local or global variable, or on the heap. The structure contents - * are initialized with mshabal_init(). Once the structure has been - * initialized, data is input as chunks, with the mshabal() functions. - * Chunks for the four parallel instances are provided simultaneously - * and must have the same length. It is allowed not to use some of the - * instances; the corresponding parameters in mshabal() are then NULL. - * However, using NULL as a chunk for one of the instances effectively - * deactivates that instance; this cannot be used to "skip" a chunk - * for one instance. - * - * The computation is finalized with mshabal_close(). Some extra message - * bits (0 to 7) can be input. The outputs of the four parallel instances - * are written in the provided buffers. There again, NULL can be - * provided as parameter if the output of one of the instances is not - * needed. - * - * A mshabal_context instance is self-contained and holds no pointer. - * Thus, it can be cloned (e.g. with memcpy()) or moved (as long as - * proper alignment is maintained). This implementation uses no state - * variable beyond the context instance; thus, it is thread-safe and - * reentrant. - * - * The Shabal specification defines Shabal with output sizes of 192, - * 224, 256, 384 and 512 bits. This code accepts all those sizes, as - * well as any output size which is a multiple of 32, between 32 and - * 512 (inclusive). - * - * Parameters are not validated. Thus, undefined behaviour occurs if - * any of the "shall" or "must" clauses in this documentation is - * violated. - * - * - * (c) 2010 SAPHIR project. This software is provided 'as-is', without - * any express or implied warranty. In no event will the authors be held - * liable for any damages arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to no restriction.
- * - * Technical remarks and questions can be addressed to: - * - */ - -#ifndef MSHABAL_H__ -#define MSHABAL_H__ - -#include <stddef.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * We need an integer type with width 32-bit or more (preferably, with - * a width of exactly 32 bits). - */ -#if defined __STDC__ && __STDC_VERSION__ >= 199901L -#include <stdint.h> -#ifdef UINT32_MAX -typedef uint32_t mshabal_u32; -#else -typedef uint_fast32_t mshabal_u32; -#endif -#else -#if ((UINT_MAX >> 11) >> 11) >= 0x3FF -typedef unsigned int mshabal_u32; -#else -typedef unsigned long mshabal_u32; -#endif -#endif - -#define MSHABAL128_VECTOR_SIZE 4 - -/* - * The context structure for a Shabal computation. Contents are - * private. Such a structure should be allocated and released by - * the caller, in any memory area. - */ -typedef struct { - unsigned char buf0[64]; - unsigned char buf1[64]; - unsigned char buf2[64]; - unsigned char buf3[64]; - size_t ptr; - mshabal_u32 state[(12 + 16 + 16) * MSHABAL128_VECTOR_SIZE]; - mshabal_u32 Whigh, Wlow; - unsigned out_size; -} mshabal128_context; - -#pragma pack(1) -typedef struct { - mshabal_u32 state[(12 + 16 + 16) * MSHABAL128_VECTOR_SIZE]; - mshabal_u32 Whigh, Wlow; - unsigned out_size; -} mshabal128_context_fast; -#pragma pack() - -/* - * Initialize a context structure. The output size must be a multiple - * of 32, between 32 and 512 (inclusive). The output size is expressed - * in bits. - */ -void mshabal_init_sse2(mshabal128_context *sc, unsigned out_size); - -/* - * Process some more data bytes; four chunks of data, pointed to by - * data0, data1, data2 and data3, are processed. The four chunks have - * the same length of "len" bytes. For efficiency, it is best if data is - * processed by medium-sized chunks, e.g. a few kilobytes at a time. - * - * The "len" data bytes shall all be accessible. If "len" is zero, then - * this function does nothing and ignores the data* arguments. - * Otherwise, if one of the data* arguments is NULL, then the - * corresponding instance is deactivated (the final value obtained from - * that instance is undefined). - */ -void mshabal_sse2(mshabal128_context *sc, const void *data0, const void *data1, const void *data2, - const void *data3, size_t len); - -/* - * Terminate the Shabal computation incarnated by the provided context - * structure. "n" shall be a value between 0 and 7 (inclusive): this is - * the number of extra bits to extract from ub0, ub1, ub2 and ub3, and - * append at the end of the input message for each of the four parallel - * instances. Bits in "ub*" are taken in big-endian format: first bit is - * the one of numerical value 128, second bit has numerical value 64, - * and so on. Other bits in "ub*" are ignored. For most applications, - * input messages will consist of sequences of bytes, and the "ub*" and - * "n" parameters will be zero. - * - * The Shabal output for each of the parallel instances is written out - * in the areas pointed to by, respectively, dst0, dst1, dst2 and dst3. - * These areas shall be wide enough to accommodate the result (result - * size was specified as parameter to mshabal_init()). It is acceptable - * to use NULL for any of those pointers, if the result from the - * corresponding instance is not needed. - * - * After this call, the context structure is invalid. The caller shall - * release it, or reinitialize it with mshabal_init(). The mshabal_close() - * function does NOT imply a hidden call to mshabal_init().
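A minimal usage sketch of the four-lane streaming API documented above (an illustrative addition, assuming a 256-bit output and four equal-length inputs; in0..in3, inlen and the d* digest buffers are placeholders, not names from the original header):

    mshabal128_context sc;
    unsigned char d0[32], d1[32], d2[32], d3[32];
    /* one context drives four independent Shabal-256 computations */
    mshabal_init_sse2(&sc, 256);
    mshabal_sse2(&sc, in0, in1, in2, in3, inlen);  /* equal-length chunks, one per lane */
    mshabal_close_sse2(&sc, 0, 0, 0, 0, 0,         /* ub0..ub3 and n are 0: no extra bits */
                       d0, d1, d2, d3);            /* one 32-byte digest per lane */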
- */ -void mshabal_close_sse2(mshabal128_context *sc, unsigned ub0, unsigned ub1, unsigned ub2, - unsigned ub3, unsigned n, void *dst0, void *dst1, void *dst2, - void *dst3); - -/* - * optimised Shabal routine for PoC plotting and hashing - */ -void mshabal_hash_fast_sse2(mshabal128_context_fast *sc, void *message, void *termination, - void *dst, unsigned num); - -/* - * optimised Shabal routine for PoC mining - */ -void mshabal_deadline_fast_sse2(mshabal128_context_fast *sc, void *message, void *termination, void *dst0, - void *dst1, void *dst2, void *dst3); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/src/c/mshabal_256_avx2.c b/src/c/mshabal_256_avx2.c deleted file mode 100644 index 2081c09..0000000 --- a/src/c/mshabal_256_avx2.c +++ /dev/null @@ -1,1086 +0,0 @@ -/* - * Parallel implementation of Shabal, using the AVX2 unit. This code - * compiles and runs on x86 architectures, in 32-bit or 64-bit mode, - * which possess an AVX2-compatible SIMD unit. - * - * - * (c) 2010 SAPHIR project. This software is provided 'as-is', without - * any express or implied warranty. In no event will the authors be held - * liable for any damages arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to no restriction. - * - * Technical remarks and questions can be addressed to: - * - */ - -#include <stddef.h> -#include <string.h> -#include <immintrin.h> -#include "mshabal_256_avx2.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef _MSC_VER -#pragma warning(disable : 4146) -#endif - -typedef mshabal_u32 u32; - -#define C32(x) ((u32)x##UL) -#define T32(x) ((x)&C32(0xFFFFFFFF)) -#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n)))) - -static void mshabal_compress_avx2(mshabal256_context *sc, const unsigned char *buf0, - const unsigned char *buf1, const unsigned char *buf2, - const unsigned char *buf3, const unsigned char *buf4, - const unsigned char *buf5, const unsigned char *buf6, - const unsigned char *buf7, size_t num) { - union { - u32 words[16 * MSHABAL256_VECTOR_SIZE]; - __m256i data[16]; - } u; - size_t j; - __m256i A[12], B[16], C[16]; - __m256i one; - - for (j = 0; j < 12; j++) A[j] = _mm256_loadu_si256((__m256i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm256_loadu_si256((__m256i *)sc->state + j + 12); - C[j] = _mm256_loadu_si256((__m256i *)sc->state + j + 28); - } - one = _mm256_set1_epi32(C32(0xFFFFFFFF)); - -#define M(i) _mm256_load_si256(u.data + i) - - while (num-- > 0) { - for (j = 0; j < 16 * MSHABAL256_VECTOR_SIZE; j += MSHABAL256_VECTOR_SIZE) { - size_t o = j / 2; - u.words[j + 0] = *(u32 *)(buf0 + o); - u.words[j + 1] = *(u32 *)(buf1 + o); - u.words[j + 2] = *(u32 *)(buf2 + o); - u.words[j + 3] = *(u32 *)(buf3 + o); - u.words[j + 4] = *(u32 *)(buf4 + o); - u.words[j + 5] = *(u32 *)(buf5 + o); - u.words[j + 6] = *(u32 *)(buf6 + o); - u.words[j + 7] = *(u32 *)(buf7 + o); - } - - for (j = 0; j < 16; j++) B[j] = _mm256_add_epi32(B[j], M(j)); - - A[0] = _mm256_xor_si256(A[0], _mm256_set1_epi32(sc->Wlow)); - A[1] = _mm256_xor_si256(A[1], _mm256_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm256_or_si256(_mm256_slli_epi32(B[j], 17), _mm256_srli_epi32(B[j], 15)); - -#define PP256(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m256i tt; \ - tt = _mm256_or_si256(_mm256_slli_epi32(xa1, 15), _mm256_srli_epi32(xa1, 17)); \ - tt = _mm256_add_epi32(_mm256_slli_epi32(tt, 2), tt); \ - tt =
_mm256_xor_si256(_mm256_xor_si256(xa0, tt), xc); \ - tt = _mm256_add_epi32(_mm256_slli_epi32(tt, 1), tt); \ - tt = _mm256_xor_si256(_mm256_xor_si256(tt, xb1), \ - _mm256_xor_si256(_mm256_andnot_si256(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm256_or_si256(_mm256_slli_epi32(tt, 1), _mm256_srli_epi32(tt, 31)); \ - xb0 = _mm256_xor_si256(tt, _mm256_xor_si256(xa0, one)); \ - } while (0) - - PP256(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP256(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP256(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP256(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP256(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP256(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP256(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP256(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP256(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP256(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP256(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP256(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP256(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP256(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP256(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP256(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP256(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP256(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP256(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP256(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP256(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP256(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP256(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP256(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP256(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP256(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP256(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP256(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP256(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP256(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP256(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP256(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP256(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP256(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP256(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP256(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP256(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP256(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP256(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP256(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP256(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP256(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP256(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - 
PP256(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP256(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP256(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP256(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP256(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm256_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm256_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm256_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB256(xb, xc, xm) \ - do { \ - __m256i tmp; \ - tmp = xb; \ - xb = _mm256_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB256(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB256(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB256(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB256(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB256(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB256(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB256(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB256(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB256(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB256(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB256(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB256(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB256(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB256(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB256(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB256(B[0xF], C[0xF], M(0xF)); - - buf0 += 64; - buf1 += 64; - buf2 += 64; - buf3 += 64; - buf4 += 64; - buf5 += 64; - buf6 += 64; - buf7 += 64; - if (++sc->Wlow == 0) sc->Whigh++; - } - - for (j = 0; j < 12; j++) _mm256_storeu_si256((__m256i *)sc->state + j, A[j]); - for (j = 0; j < 16; j++) { - _mm256_storeu_si256((__m256i *)sc->state + j + 12, B[j]); - _mm256_storeu_si256((__m256i *)sc->state + j + 28, C[j]); - } - -#undef M -} - -void mshabal_init_avx2(mshabal256_context *sc, unsigned out_size) { - unsigned u; - - memset(sc->state, 0, sizeof sc->state); - memset(sc->buf0, 0, sizeof sc->buf0); - memset(sc->buf1, 0, sizeof sc->buf1); - memset(sc->buf2, 0, sizeof sc->buf2); - memset(sc->buf3, 0, sizeof sc->buf3); - memset(sc->buf4, 0, sizeof sc->buf4); - 
memset(sc->buf5, 0, sizeof sc->buf5); - memset(sc->buf6, 0, sizeof sc->buf6); - memset(sc->buf7, 0, sizeof sc->buf7); - for (u = 0; u < 16; u++) { - sc->buf0[4 * u + 0] = (out_size + u); - sc->buf0[4 * u + 1] = (out_size + u) >> 8; - sc->buf1[4 * u + 0] = (out_size + u); - sc->buf1[4 * u + 1] = (out_size + u) >> 8; - sc->buf2[4 * u + 0] = (out_size + u); - sc->buf2[4 * u + 1] = (out_size + u) >> 8; - sc->buf3[4 * u + 0] = (out_size + u); - sc->buf3[4 * u + 1] = (out_size + u) >> 8; - sc->buf4[4 * u + 0] = (out_size + u); - sc->buf4[4 * u + 1] = (out_size + u) >> 8; - sc->buf5[4 * u + 0] = (out_size + u); - sc->buf5[4 * u + 1] = (out_size + u) >> 8; - sc->buf6[4 * u + 0] = (out_size + u); - sc->buf6[4 * u + 1] = (out_size + u) >> 8; - sc->buf7[4 * u + 0] = (out_size + u); - sc->buf7[4 * u + 1] = (out_size + u) >> 8; - } - sc->Whigh = sc->Wlow = C32(0xFFFFFFFF); - mshabal_compress_avx2(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, sc->buf4, sc->buf5, - sc->buf6, sc->buf7, 1); - for (u = 0; u < 16; u++) { - sc->buf0[4 * u + 0] = (out_size + u + 16); - sc->buf0[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf1[4 * u + 0] = (out_size + u + 16); - sc->buf1[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf2[4 * u + 0] = (out_size + u + 16); - sc->buf2[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf3[4 * u + 0] = (out_size + u + 16); - sc->buf3[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf4[4 * u + 0] = (out_size + u + 16); - sc->buf4[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf5[4 * u + 0] = (out_size + u + 16); - sc->buf5[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf6[4 * u + 0] = (out_size + u + 16); - sc->buf6[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf7[4 * u + 0] = (out_size + u + 16); - sc->buf7[4 * u + 1] = (out_size + u + 16) >> 8; - } - mshabal_compress_avx2(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, sc->buf4, sc->buf5, - sc->buf6, sc->buf7, 1); - sc->ptr = 0; - sc->out_size = out_size; -} - -void mshabal_avx2(mshabal256_context *sc, const void *data0, const void *data1, const void *data2, const void *data3, - const void *data4, const void *data5, const void *data6, const void *data7, size_t len) { - size_t ptr, num; - - if (data0 == NULL) { - if (data1 == NULL) { - if (data2 == NULL) { - if (data3 == NULL) { - if (data4 == NULL) { - if (data5 == NULL) { - if (data6 == NULL) { - if (data7 == NULL) { - return; - } else { - data0 = data7; - } - } else { - data0 = data6; - } - } else { - data0 = data5; - } - } else { - data0 = data4; - } - } else { - data0 = data3; - } - } else { - data0 = data2; - } - } else { - data0 = data1; - } - } - - if (data1 == NULL) data1 = data0; - if (data2 == NULL) data2 = data0; - if (data3 == NULL) data3 = data0; - if (data4 == NULL) data4 = data0; - if (data5 == NULL) data5 = data0; - if (data6 == NULL) data6 = data0; - if (data7 == NULL) data7 = data0; - - ptr = sc->ptr; - if (ptr != 0) { - size_t clen = (sizeof sc->buf0 - ptr); - if (clen > len) { - memcpy(sc->buf0 + ptr, data0, len); - memcpy(sc->buf1 + ptr, data1, len); - memcpy(sc->buf2 + ptr, data2, len); - memcpy(sc->buf3 + ptr, data3, len); - memcpy(sc->buf4 + ptr, data4, len); - memcpy(sc->buf5 + ptr, data5, len); - memcpy(sc->buf6 + ptr, data6, len); - memcpy(sc->buf7 + ptr, data7, len); - sc->ptr = ptr + len; - return; - } else { - memcpy(sc->buf0 + ptr, data0, clen); - memcpy(sc->buf1 + ptr, data1, clen); - memcpy(sc->buf2 + ptr, data2, clen); - memcpy(sc->buf3 + ptr, data3, clen); - memcpy(sc->buf4 + ptr, data4, clen); - memcpy(sc->buf5 + ptr, data5, clen); - memcpy(sc->buf6 + ptr, 
data6, clen); - memcpy(sc->buf7 + ptr, data7, clen); - mshabal_compress_avx2(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, sc->buf4, sc->buf5, - sc->buf6, sc->buf7, 1); - data0 = (const unsigned char *)data0 + clen; - data1 = (const unsigned char *)data1 + clen; - data2 = (const unsigned char *)data2 + clen; - data3 = (const unsigned char *)data3 + clen; - data4 = (const unsigned char *)data4 + clen; - data5 = (const unsigned char *)data5 + clen; - data6 = (const unsigned char *)data6 + clen; - data7 = (const unsigned char *)data7 + clen; - len -= clen; - } - } - - num = len >> 6; - if (num != 0) { - mshabal_compress_avx2(sc, data0, data1, data2, data3, data4, data5, data6, data7, num); - data0 = (const unsigned char *)data0 + (num << 6); - data1 = (const unsigned char *)data1 + (num << 6); - data2 = (const unsigned char *)data2 + (num << 6); - data3 = (const unsigned char *)data3 + (num << 6); - data4 = (const unsigned char *)data4 + (num << 6); - data5 = (const unsigned char *)data5 + (num << 6); - data6 = (const unsigned char *)data6 + (num << 6); - data7 = (const unsigned char *)data7 + (num << 6); - } - len &= (size_t)63; - memcpy(sc->buf0, data0, len); - memcpy(sc->buf1, data1, len); - memcpy(sc->buf2, data2, len); - memcpy(sc->buf3, data3, len); - memcpy(sc->buf4, data4, len); - memcpy(sc->buf5, data5, len); - memcpy(sc->buf6, data6, len); - memcpy(sc->buf7, data7, len); - sc->ptr = len; -} - -void mshabal_close_avx2(mshabal256_context *sc, unsigned ub0, unsigned ub1, unsigned ub2, - unsigned ub3, unsigned ub4, unsigned ub5, unsigned ub6, unsigned ub7, - unsigned n, void *dst0, void *dst1, void *dst2, void *dst3, void *dst4, - void *dst5, void *dst6, void *dst7) { - size_t ptr, off; - unsigned z, out_size_w32; - - z = 0x80 >> n; - ptr = sc->ptr; - sc->buf0[ptr] = (ub0 & -z) | z; - sc->buf1[ptr] = (ub1 & -z) | z; - sc->buf2[ptr] = (ub2 & -z) | z; - sc->buf3[ptr] = (ub3 & -z) | z; - sc->buf4[ptr] = (ub4 & -z) | z; - sc->buf5[ptr] = (ub5 & -z) | z; - sc->buf6[ptr] = (ub6 & -z) | z; - sc->buf7[ptr] = (ub7 & -z) | z; - ptr++; - memset(sc->buf0 + ptr, 0, (sizeof sc->buf0) - ptr); - memset(sc->buf1 + ptr, 0, (sizeof sc->buf1) - ptr); - memset(sc->buf2 + ptr, 0, (sizeof sc->buf2) - ptr); - memset(sc->buf3 + ptr, 0, (sizeof sc->buf3) - ptr); - memset(sc->buf4 + ptr, 0, (sizeof sc->buf4) - ptr); - memset(sc->buf5 + ptr, 0, (sizeof sc->buf5) - ptr); - memset(sc->buf6 + ptr, 0, (sizeof sc->buf6) - ptr); - memset(sc->buf7 + ptr, 0, (sizeof sc->buf7) - ptr); - for (z = 0; z < 4; z++) { - mshabal_compress_avx2(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, sc->buf4, sc->buf5, - sc->buf6, sc->buf7, 1); - if (sc->Wlow-- == 0) sc->Whigh--; - } - out_size_w32 = sc->out_size >> 5; - off = MSHABAL256_VECTOR_SIZE * (28 + (16 - out_size_w32)); - if (dst0 != NULL) { - u32 *out; - - out = (u32 *)dst0; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL256_VECTOR_SIZE + 0]; - } - if (dst1 != NULL) { - u32 *out; - - out = (u32 *)dst1; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL256_VECTOR_SIZE + 1]; - } - if (dst2 != NULL) { - u32 *out; - - out = (u32 *)dst2; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL256_VECTOR_SIZE + 2]; - } - if (dst3 != NULL) { - u32 *out; - - out = (u32 *)dst3; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL256_VECTOR_SIZE + 3]; - } - if (dst4 != NULL) { - u32 *out; - - out = (u32 *)dst4; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL256_VECTOR_SIZE + 
4]; - } - if (dst5 != NULL) { - u32 *out; - - out = (u32 *)dst5; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL256_VECTOR_SIZE + 5]; - } - if (dst6 != NULL) { - u32 *out; - - out = (u32 *)dst6; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL256_VECTOR_SIZE + 6]; - } - if (dst7 != NULL) { - u32 *out; - - out = (u32 *)dst7; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL256_VECTOR_SIZE + 7]; - } -} - -// Shabal routines optimized for plotting and hashing -void mshabal_hash_fast_avx2(mshabal256_context_fast *sc, void *message, void *termination, - void *dst, unsigned num) { - union input { - u32 words[16 * MSHABAL256_VECTOR_SIZE]; - __m256i data[16]; - }; - - size_t j; - __m256i A[12], B[16], C[16]; - __m256i one; - - for (j = 0; j < 12; j++) A[j] = _mm256_loadu_si256((__m256i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm256_loadu_si256((__m256i *)sc->state + j + 12); - C[j] = _mm256_loadu_si256((__m256i *)sc->state + j + 28); - } - one = _mm256_set1_epi32(C32(0xFFFFFFFF)); - - // round 1 -#define M(i) _mm256_loadu_si256((__m256i *)message + i) - - while (num-- > 0) { - for (j = 0; j < 16; j++) B[j] = _mm256_add_epi32(B[j], M(j)); - - A[0] = _mm256_xor_si256(A[0], _mm256_set1_epi32(sc->Wlow)); - A[1] = _mm256_xor_si256(A[1], _mm256_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm256_or_si256(_mm256_slli_epi32(B[j], 17), _mm256_srli_epi32(B[j], 15)); - -#define PP256(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m256i tt; \ - tt = _mm256_or_si256(_mm256_slli_epi32(xa1, 15), _mm256_srli_epi32(xa1, 17)); \ - tt = _mm256_add_epi32(_mm256_slli_epi32(tt, 2), tt); \ - tt = _mm256_xor_si256(_mm256_xor_si256(xa0, tt), xc); \ - tt = _mm256_add_epi32(_mm256_slli_epi32(tt, 1), tt); \ - tt = _mm256_xor_si256(_mm256_xor_si256(tt, xb1), \ - _mm256_xor_si256(_mm256_andnot_si256(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm256_or_si256(_mm256_slli_epi32(tt, 1), _mm256_srli_epi32(tt, 31)); \ - xb0 = _mm256_xor_si256(tt, _mm256_xor_si256(xa0, one)); \ - } while (0) - - PP256(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP256(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP256(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP256(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP256(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP256(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP256(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP256(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP256(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP256(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP256(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP256(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP256(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP256(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP256(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP256(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP256(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP256(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP256(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP256(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], 
B[0x9], C[0x5], M(0x3)); - PP256(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP256(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP256(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP256(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP256(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP256(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP256(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP256(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP256(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP256(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP256(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP256(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP256(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP256(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP256(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP256(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP256(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP256(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP256(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP256(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP256(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP256(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP256(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP256(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP256(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP256(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP256(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP256(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm256_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm256_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm256_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm256_add_epi32(A[0x3], 
C[0x6]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB256(xb, xc, xm) \ - do { \ - __m256i tmp; \ - tmp = xb; \ - xb = _mm256_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB256(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB256(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB256(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB256(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB256(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB256(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB256(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB256(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB256(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB256(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB256(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB256(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB256(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB256(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB256(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB256(B[0xF], C[0xF], M(0xF)); - - // move data pointer - message = (__m256i *)message + 16; - - if (++sc->Wlow == 0) sc->Whigh++; - } - - // round 2-5 -#define M2(i) _mm256_load_si256((__m256i *)termination + i) - - for (int k = 0; k < 4; k++) { - for (j = 0; j < 16; j++) B[j] = _mm256_add_epi32(B[j], M2(j)); - - A[0] = _mm256_xor_si256(A[0], _mm256_set1_epi32(sc->Wlow)); - A[1] = _mm256_xor_si256(A[1], _mm256_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm256_or_si256(_mm256_slli_epi32(B[j], 17), _mm256_srli_epi32(B[j], 15)); - - PP256(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP256(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP256(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP256(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP256(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP256(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP256(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP256(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP256(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP256(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP256(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP256(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP256(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP256(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP256(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP256(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP256(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP256(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP256(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP256(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP256(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP256(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP256(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP256(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP256(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP256(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP256(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP256(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], 
B[0x1], C[0xD], M2(0xB)); - PP256(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP256(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP256(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP256(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP256(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP256(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP256(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP256(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP256(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP256(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP256(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP256(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP256(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP256(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP256(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP256(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP256(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP256(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP256(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP256(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - A[0xB] = _mm256_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm256_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm256_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0x3]); - - SWAP_AND_SUB256(B[0x0], C[0x0], M2(0x0)); - SWAP_AND_SUB256(B[0x1], C[0x1], M2(0x1)); - SWAP_AND_SUB256(B[0x2], C[0x2], M2(0x2)); - SWAP_AND_SUB256(B[0x3], C[0x3], M2(0x3)); - SWAP_AND_SUB256(B[0x4], C[0x4], M2(0x4)); - SWAP_AND_SUB256(B[0x5], C[0x5], M2(0x5)); - SWAP_AND_SUB256(B[0x6], C[0x6], M2(0x6)); - SWAP_AND_SUB256(B[0x7], C[0x7], M2(0x7)); - SWAP_AND_SUB256(B[0x8], C[0x8], M2(0x8)); - SWAP_AND_SUB256(B[0x9], 
C[0x9], M2(0x9)); - SWAP_AND_SUB256(B[0xA], C[0xA], M2(0xA)); - SWAP_AND_SUB256(B[0xB], C[0xB], M2(0xB)); - SWAP_AND_SUB256(B[0xC], C[0xC], M2(0xC)); - SWAP_AND_SUB256(B[0xD], C[0xD], M2(0xD)); - SWAP_AND_SUB256(B[0xE], C[0xE], M2(0xE)); - SWAP_AND_SUB256(B[0xF], C[0xF], M2(0xF)); - - if (++sc->Wlow == 0) sc->Whigh++; - - if (sc->Wlow-- == 0) sc->Whigh--; - } - - // download SIMD aligned hashes - for (j = 0; j < 8; j++) { - _mm256_storeu_si256((__m256i *)dst + j, C[j+8]); - } - - // reset Wlow & Whigh - sc->Wlow = 1; - sc->Whigh = 0; -} - -// Shabal routine optimized for mining -void mshabal_deadline_fast_avx2(mshabal256_context_fast *sc, void *message, void *termination, void *dst0, void *dst1, void *dst2, - void *dst3, void *dst4, void *dst5, void *dst6, void *dst7) { - union input { - u32 words[16 * MSHABAL256_VECTOR_SIZE]; - __m256i data[16]; - }; - size_t j; - __m256i A[12], B[16], C[16]; - __m256i one; - - for (j = 0; j < 12; j++) A[j] = _mm256_loadu_si256((__m256i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm256_loadu_si256((__m256i *)sc->state + j + 12); - C[j] = _mm256_loadu_si256((__m256i *)sc->state + j + 28); - } - one = _mm256_set1_epi32(C32(0xFFFFFFFF)); - - // round 1 -#define M(i) _mm256_loadu_si256((__m256i *)message + i) - - for (j = 0; j < 16; j++) B[j] = _mm256_add_epi32(B[j], M(j)); - - A[0] = _mm256_xor_si256(A[0], _mm256_set1_epi32(sc->Wlow)); - A[1] = _mm256_xor_si256(A[1], _mm256_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm256_or_si256(_mm256_slli_epi32(B[j], 17), _mm256_srli_epi32(B[j], 15)); - -#define PP256(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m256i tt; \ - tt = _mm256_or_si256(_mm256_slli_epi32(xa1, 15), _mm256_srli_epi32(xa1, 17)); \ - tt = _mm256_add_epi32(_mm256_slli_epi32(tt, 2), tt); \ - tt = _mm256_xor_si256(_mm256_xor_si256(xa0, tt), xc); \ - tt = _mm256_add_epi32(_mm256_slli_epi32(tt, 1), tt); \ - tt = _mm256_xor_si256(_mm256_xor_si256(tt, xb1), \ - _mm256_xor_si256(_mm256_andnot_si256(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm256_or_si256(_mm256_slli_epi32(tt, 1), _mm256_srli_epi32(tt, 31)); \ - xb0 = _mm256_xor_si256(tt, _mm256_xor_si256(xa0, one)); \ - } while (0) - - PP256(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP256(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP256(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP256(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP256(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP256(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP256(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP256(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP256(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP256(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP256(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP256(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP256(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP256(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP256(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP256(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP256(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP256(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP256(A[0x6], A[0x5], 
B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP256(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP256(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP256(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP256(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP256(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP256(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP256(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP256(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP256(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP256(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP256(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP256(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP256(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP256(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP256(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP256(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP256(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP256(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP256(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP256(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP256(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP256(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP256(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP256(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP256(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP256(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP256(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP256(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP256(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm256_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm256_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm256_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x9]); - A[0x5] = 
_mm256_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB256(xb, xc, xm) \ - do { \ - __m256i tmp; \ - tmp = xb; \ - xb = _mm256_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB256(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB256(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB256(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB256(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB256(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB256(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB256(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB256(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB256(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB256(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB256(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB256(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB256(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB256(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB256(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB256(B[0xF], C[0xF], M(0xF)); - - if (++sc->Wlow == 0) sc->Whigh++; - - // round 2-5 -#define M2(i) _mm256_load_si256((__m256i *)termination + i) - - for (int k = 0; k < 4; k++) { - for (j = 0; j < 16; j++) B[j] = _mm256_add_epi32(B[j], M2(j)); - - A[0] = _mm256_xor_si256(A[0], _mm256_set1_epi32(sc->Wlow)); - A[1] = _mm256_xor_si256(A[1], _mm256_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm256_or_si256(_mm256_slli_epi32(B[j], 17), _mm256_srli_epi32(B[j], 15)); - - PP256(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP256(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP256(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP256(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP256(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP256(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP256(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP256(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP256(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP256(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP256(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP256(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP256(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP256(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP256(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP256(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP256(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP256(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP256(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP256(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP256(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP256(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP256(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP256(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP256(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP256(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP256(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - 
PP256(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP256(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP256(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP256(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP256(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP256(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP256(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP256(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP256(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP256(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP256(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP256(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP256(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP256(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP256(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP256(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP256(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP256(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP256(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP256(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP256(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - A[0xB] = _mm256_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm256_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm256_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm256_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm256_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm256_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm256_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm256_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm256_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm256_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm256_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm256_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm256_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm256_add_epi32(A[0x0], C[0x3]); - - SWAP_AND_SUB256(B[0x0], C[0x0], M2(0x0)); - SWAP_AND_SUB256(B[0x1], C[0x1], M2(0x1)); - SWAP_AND_SUB256(B[0x2], C[0x2], M2(0x2)); - SWAP_AND_SUB256(B[0x3], C[0x3], M2(0x3)); - SWAP_AND_SUB256(B[0x4], C[0x4], M2(0x4)); - SWAP_AND_SUB256(B[0x5], C[0x5], M2(0x5)); - SWAP_AND_SUB256(B[0x6], C[0x6], M2(0x6)); - SWAP_AND_SUB256(B[0x7], C[0x7], M2(0x7)); - 
SWAP_AND_SUB256(B[0x8], C[0x8], M2(0x8)); - SWAP_AND_SUB256(B[0x9], C[0x9], M2(0x9)); - SWAP_AND_SUB256(B[0xA], C[0xA], M2(0xA)); - SWAP_AND_SUB256(B[0xB], C[0xB], M2(0xB)); - SWAP_AND_SUB256(B[0xC], C[0xC], M2(0xC)); - SWAP_AND_SUB256(B[0xD], C[0xD], M2(0xD)); - SWAP_AND_SUB256(B[0xE], C[0xE], M2(0xE)); - SWAP_AND_SUB256(B[0xF], C[0xF], M2(0xF)); - - if (++sc->Wlow == 0) sc->Whigh++; - - if (sc->Wlow-- == 0) sc->Whigh--; - } - - // download SIMD aligned deadlines - u32 simd_dst[16]; - _mm256_storeu_si256((__m256i *)&simd_dst[0], C[8]); - _mm256_storeu_si256((__m256i *)&simd_dst[8], C[9]); - - // unpack SIMD data - unsigned z; - for (z = 0; z < 2; z++) { - unsigned y = z * MSHABAL256_VECTOR_SIZE; - ((u32 *)dst0)[z] = simd_dst[y + 0]; - ((u32 *)dst1)[z] = simd_dst[y + 1]; - ((u32 *)dst2)[z] = simd_dst[y + 2]; - ((u32 *)dst3)[z] = simd_dst[y + 3]; - ((u32 *)dst4)[z] = simd_dst[y + 4]; - ((u32 *)dst5)[z] = simd_dst[y + 5]; - ((u32 *)dst6)[z] = simd_dst[y + 6]; - ((u32 *)dst7)[z] = simd_dst[y + 7]; - } - - // reset Wlow & Whigh - sc->Wlow = 1; - sc->Whigh = 0; -} - -#ifdef __cplusplus -} -#endif diff --git a/src/c/mshabal_256_avx2.h b/src/c/mshabal_256_avx2.h deleted file mode 100644 index 4c0cb38..0000000 --- a/src/c/mshabal_256_avx2.h +++ /dev/null @@ -1,179 +0,0 @@ -/* - * A parallel implementation of Shabal, for platforms with AVX2. - * - * This is the header file for an implementation of the Shabal family - * of hash functions, designed for maximum parallel speed. It processes - * up to eight instances of Shabal in parallel, using the AVX2 unit. - * Total bandwidth appears to be up to twice that of a plain 32-bit - * Shabal implementation. - * - * A computation uses a mshabal_context structure. That structure is - * supposed to be allocated and released by the caller, e.g. as a - * local or global variable, or on the heap. The structure contents - * are initialized with mshabal_init(). Once the structure has been - * initialized, data is input as chunks, with the mshabal() functions. - * Chunks for the eight parallel instances are provided simultaneously - * and must have the same length. It is allowed not to use some of the - * instances; the corresponding parameters in mshabal() are then NULL. - * However, using NULL as a chunk for one of the instances effectively - * deactivates that instance; this cannot be used to "skip" a chunk - * for one instance. - * - * The computation is finalized with mshabal_close(). Some extra message - * bits (0 to 7) can be input. The outputs of the eight parallel instances - * are written in the provided buffers. There again, NULL can be - * provided as parameter if the output of one of the instances is not - * needed. - * - * A mshabal_context instance is self-contained and holds no pointer. - * Thus, it can be cloned (e.g. with memcpy()) or moved (as long as - * proper alignment is maintained). This implementation uses no state - * variable beyond the context instance; thus, it is thread-safe and - * reentrant. - * - * The Shabal specification defines Shabal with output sizes of 192, - * 224, 256, 384 and 512 bits. This code accepts all those sizes, as - * well as any output size which is a multiple of 32, between 32 and - * 512 (inclusive). - * - * Parameters are not validated. Thus, undefined behaviour occurs if - * any of the "shall" or "must" clauses in this documentation is - * violated. - * - * - * (c) 2010 SAPHIR project. This software is provided 'as-is', without - * any express or implied warranty.
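Aside: PP256 above is Shabal's permutation element evaluated once per 32-bit lane, so the eight interleaved instances advance in lockstep. As a reading aid, here is a scalar sketch of a single lane inferred from the macro body; the names rotl32 and perm_elt are illustrative, not part of the deleted sources:

#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

/* One lane of PP256: the *5u and *3u correspond to the shift-and-add pairs
   in the macro, _mm256_andnot_si256(xb3, xb2) is (~xb3 & xb2), and the final
   XOR with the all-ones vector `one` is a bitwise complement. */
static void perm_elt(uint32_t *xa0, uint32_t xa1, uint32_t *xb0,
                     uint32_t xb1, uint32_t xb2, uint32_t xb3,
                     uint32_t xc, uint32_t xm) {
    *xa0 = ((*xa0 ^ (rotl32(xa1, 15) * 5u) ^ xc) * 3u) ^ xb1 ^ (xb2 & ~xb3) ^ xm;
    *xb0 = ~(rotl32(*xb0, 1) ^ *xa0);
}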
In no event will the authors be held - * liable for any damages arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to no restriction. - * - * Technical remarks and questions can be addressed to: - * - */ - -#ifndef MSHABAL_H__ -#define MSHABAL_H__ - -#include <stddef.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * We need an integer type with width 32-bit or more (preferably, with - * a width of exactly 32 bits). - */ -#if defined __STDC__ && __STDC_VERSION__ >= 199901L -#include <stdint.h> -#ifdef UINT32_MAX -typedef uint32_t mshabal_u32; -#else -typedef uint_fast32_t mshabal_u32; -#endif -#else -#if ((UINT_MAX >> 11) >> 11) >= 0x3FF -typedef unsigned int mshabal_u32; -#else -typedef unsigned long mshabal_u32; -#endif -#endif - -#define MSHABAL256_VECTOR_SIZE 8 - -/* - * The context structure for a Shabal computation. Contents are - * private. Such a structure should be allocated and released by - * the caller, in any memory area. - */ -typedef struct { - unsigned char buf0[64]; - unsigned char buf1[64]; - unsigned char buf2[64]; - unsigned char buf3[64]; - unsigned char buf4[64]; - unsigned char buf5[64]; - unsigned char buf6[64]; - unsigned char buf7[64]; - size_t ptr; - mshabal_u32 state[(12 + 16 + 16) * MSHABAL256_VECTOR_SIZE]; - mshabal_u32 Whigh, Wlow; - unsigned out_size; -} mshabal256_context; - -#pragma pack(1) -typedef struct { - mshabal_u32 state[(12 + 16 + 16) * MSHABAL256_VECTOR_SIZE]; - mshabal_u32 Whigh, Wlow; - unsigned out_size; -} mshabal256_context_fast; -#pragma pack() - -/* - * Initialize a context structure. The output size must be a multiple - * of 32, between 32 and 512 (inclusive). The output size is expressed - * in bits. - */ -void mshabal_init_avx2(mshabal256_context *sc, unsigned out_size); - -/* - * Process some more data bytes; eight chunks of data, pointed to by - * data0 through data7, are processed. The eight chunks have - * the same length of "len" bytes. For efficiency, it is best if data is - * processed by medium-sized chunks, e.g. a few kilobytes at a time. - * - * The "len" data bytes shall all be accessible. If "len" is zero, this - * function does nothing and ignores the data* arguments. - * Otherwise, if one of the data* arguments is NULL, then the - * corresponding instance is deactivated (the final value obtained from - * that instance is undefined). - */ -void mshabal_avx2(mshabal256_context *sc, const void *data0, const void *data1, const void *data2, const void *data3, - const void *data4, const void *data5, const void *data6, const void *data7, size_t len); - -/* - * Terminate the Shabal computation incarnated by the provided context - * structure. "n" shall be a value between 0 and 7 (inclusive): this is - * the number of extra bits to extract from ub0 through ub7, and - * append at the end of the input message for each of the eight parallel - * instances. Bits in "ub*" are taken in big-endian format: first bit is - * the one of numerical value 128, second bit has numerical value 64, - * and so on. Other bits in "ub*" are ignored. For most applications, - * input messages will consist of sequences of bytes, and the "ub*" and - * "n" parameters will be zero. - * - * The Shabal output for each of the parallel instances is written out - * in the areas pointed to by, respectively, dst0 through dst7.
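A worked example of the trailing-bit convention may help: to append the three extra bits 1, 0, 1 to instance 0 (and three zero bits to the other instances), pass n = 3 and place the bits at the top of ub0. The context and destination pointers below are hypothetical:

/* Hedged sketch: finalize with three extra message bits 1,0,1 on instance 0. */
unsigned ub0 = 0xA0; /* 0b101xxxxx - bits are taken from the top down */
mshabal_close_avx2(&sc, ub0, 0, 0, 0, 0, 0, 0, 0, 3,
                   dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
/* Internally the padding byte for instance 0 becomes (ub0 & -z) | z with
   z = 0x80 >> 3 = 0x10, i.e. 0xB0: the three message bits, then the
   mandatory '1' padding bit, then zeros. */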
- * These areas shall be wide enough to accommodate the result (result - * size was specified as parameter to mshabal_init()). It is acceptable - * to use NULL for any of those pointers, if the result from the - * corresponding instance is not needed. - * - * After this call, the context structure is invalid. The caller shall - * release it, or reinitialize it with mshabal_init(). The mshabal_close() - * function does NOT imply a hidden call to mshabal_init(). - */ -void mshabal_close_avx2(mshabal256_context *sc, unsigned ub0, unsigned ub1, unsigned ub2, - unsigned ub3, unsigned ub4, unsigned ub5, unsigned ub6, unsigned ub7, - unsigned n, void *dst0, void *dst1, void *dst2, void *dst3, void *dst4, - void *dst5, void *dst6, void *dst7); - -/* - * optimised Shabal routine for PoC plotting and hashing - */ -void mshabal256_openclose_fast(mshabal256_context_fast *sc, void *message, void *termination, - void *dst, unsigned len); - -/* - * optimised Shabal routine for PoC mining - */ -void mshabal_deadline_fast_avx2(mshabal256_context_fast *sc, void *message, void *termination, void *dst0, - void *dst1, void *dst2, void *dst3, void *dst4, void *dst5, - void *dst6, void *dst7); -#ifdef __cplusplus -} -#endif - -#endif diff --git a/src/c/mshabal_512_avx512f.c b/src/c/mshabal_512_avx512f.c deleted file mode 100644 index 06c7c7b..0000000 --- a/src/c/mshabal_512_avx512f.c +++ /dev/null @@ -1,1318 +0,0 @@ -/* - * Parallel implementation of Shabal, using the AVX512f unit. This code - * compiles and runs on x86 architectures, in 32-bit or 64-bit mode, - * which possess an AVX512f-compatible SIMD unit. - * - * - * (c) 2010 SAPHIR project. This software is provided 'as-is', without - * any express or implied warranty. In no event will the authors be held - * liable for any damages arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to no restriction.
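Taken together, the three declarations support a simple one-shot pattern; a minimal caller-side sketch follows (the wrapper hash8_256 and its buffers are illustrative assumptions, not code from this repository):

#include <stddef.h>
/* assumes "mshabal_256_avx2.h" is included */

/* Hash eight same-length messages in parallel; dig[i] receives the
   256-bit (32-byte) Shabal digest of m[i]. */
static void hash8_256(const void *m[8], size_t len, unsigned char dig[8][32]) {
    mshabal256_context sc;
    mshabal_init_avx2(&sc, 256); /* output size in bits */
    mshabal_avx2(&sc, m[0], m[1], m[2], m[3], m[4], m[5], m[6], m[7], len);
    mshabal_close_avx2(&sc, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* no extra bits */
                       dig[0], dig[1], dig[2], dig[3],
                       dig[4], dig[5], dig[6], dig[7]);
}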
- * - * Technical remarks and questions can be addressed to: - * - */ - -#include <stddef.h> -#include <string.h> -#include <immintrin.h> -#include "mshabal_512_avx512f.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef _MSC_VER -#pragma warning(disable : 4146) -#endif - -typedef mshabal_u32 u32; - -#define C32(x) ((u32)x##UL) -#define T32(x) ((x)&C32(0xFFFFFFFF)) -#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n)))) - -static void mshabal_compress_avx512f(mshabal512_context *sc, const unsigned char *buf0, - const unsigned char *buf1, const unsigned char *buf2, - const unsigned char *buf3, const unsigned char *buf4, - const unsigned char *buf5, const unsigned char *buf6, - const unsigned char *buf7, const unsigned char *buf8, - const unsigned char *buf9, const unsigned char *buf10, - const unsigned char *buf11, const unsigned char *buf12, - const unsigned char *buf13, const unsigned char *buf14, - const unsigned char *buf15, size_t num) { - union { - u32 words[16 * MSHABAL512_VECTOR_SIZE]; - __m512i data[16]; - } u; - size_t j; - __m512i A[12], B[16], C[16]; - __m512i one; - - for (j = 0; j < 12; j++) A[j] = _mm512_loadu_si512((__m512i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm512_loadu_si512((__m512i *)sc->state + j + 12); - C[j] = _mm512_loadu_si512((__m512i *)sc->state + j + 28); - } - one = _mm512_set1_epi32(C32(0xFFFFFFFF)); - -#define M(i) _mm512_load_si512(u.data + i) - - while (num-- > 0) { - for (j = 0; j < 16 * MSHABAL512_VECTOR_SIZE; j += MSHABAL512_VECTOR_SIZE) { - size_t o = j / 4; - u.words[j + 0] = *(u32 *)(buf0 + o); - u.words[j + 1] = *(u32 *)(buf1 + o); - u.words[j + 2] = *(u32 *)(buf2 + o); - u.words[j + 3] = *(u32 *)(buf3 + o); - u.words[j + 4] = *(u32 *)(buf4 + o); - u.words[j + 5] = *(u32 *)(buf5 + o); - u.words[j + 6] = *(u32 *)(buf6 + o); - u.words[j + 7] = *(u32 *)(buf7 + o); - u.words[j + 8] = *(u32 *)(buf8 + o); - u.words[j + 9] = *(u32 *)(buf9 + o); - u.words[j + 10] = *(u32 *)(buf10 + o); - u.words[j + 11] = *(u32 *)(buf11 + o); - u.words[j + 12] = *(u32 *)(buf12 + o); - u.words[j + 13] = *(u32 *)(buf13 + o); - u.words[j + 14] = *(u32 *)(buf14 + o); - u.words[j + 15] = *(u32 *)(buf15 + o); - } - - for (j = 0; j < 16; j++) B[j] = _mm512_add_epi32(B[j], M(j)); - - A[0] = _mm512_xor_si512(A[0], _mm512_set1_epi32(sc->Wlow)); - A[1] = _mm512_xor_si512(A[1], _mm512_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm512_or_si512(_mm512_slli_epi32(B[j], 17), _mm512_srli_epi32(B[j], 15)); - -#define PP512(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m512i tt; \ - tt = _mm512_or_si512(_mm512_slli_epi32(xa1, 15), _mm512_srli_epi32(xa1, 17)); \ - tt = _mm512_add_epi32(_mm512_slli_epi32(tt, 2), tt); \ - tt = _mm512_xor_si512(_mm512_xor_si512(xa0, tt), xc); \ - tt = _mm512_add_epi32(_mm512_slli_epi32(tt, 1), tt); \ - tt = _mm512_xor_si512(_mm512_xor_si512(tt, xb1), \ - _mm512_xor_si512(_mm512_andnot_si512(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm512_or_si512(_mm512_slli_epi32(tt, 1), _mm512_srli_epi32(tt, 31)); \ - xb0 = _mm512_xor_si512(tt, _mm512_xor_si512(xa0, one)); \ - } while (0) - - PP512(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP512(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP512(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP512(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP512(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP512(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP512(A[0x6], A[0x5],
B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP512(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP512(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP512(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP512(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP512(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP512(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP512(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP512(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP512(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP512(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP512(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP512(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP512(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP512(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP512(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP512(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP512(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP512(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP512(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP512(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP512(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP512(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP512(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP512(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP512(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP512(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP512(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP512(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP512(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP512(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP512(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP512(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP512(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP512(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP512(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP512(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP512(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP512(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP512(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP512(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP512(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm512_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0xC]); - A[0x0] = 
_mm512_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm512_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm512_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB512(xb, xc, xm) \ - do { \ - __m512i tmp; \ - tmp = xb; \ - xb = _mm512_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB512(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB512(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB512(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB512(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB512(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB512(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB512(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB512(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB512(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB512(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB512(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB512(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB512(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB512(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB512(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB512(B[0xF], C[0xF], M(0xF)); - - buf0 += 64; - buf1 += 64; - buf2 += 64; - buf3 += 64; - buf4 += 64; - buf5 += 64; - buf6 += 64; - buf7 += 64; - buf8 += 64; - buf9 += 64; - buf10 += 64; - buf11 += 64; - buf12 += 64; - buf13 += 64; - buf14 += 64; - buf15 += 64; - if (++sc->Wlow == 0) sc->Whigh++; - } - - for (j = 0; j < 12; j++) _mm512_storeu_si512((__m512i *)sc->state + j, A[j]); - for (j = 0; j < 16; j++) { - _mm512_storeu_si512((__m512i *)sc->state + j + 12, B[j]); - _mm512_storeu_si512((__m512i *)sc->state + j + 28, C[j]); - } - -#undef M -} - -void mshabal_init_avx512f(mshabal512_context *sc, unsigned out_size) { - unsigned u; - - memset(sc->state, 0, sizeof sc->state); - memset(sc->buf0, 0, sizeof sc->buf0); - memset(sc->buf1, 0, sizeof sc->buf1); - memset(sc->buf2, 0, sizeof sc->buf2); - memset(sc->buf3, 0, sizeof sc->buf3); - memset(sc->buf4, 0, sizeof sc->buf4); - memset(sc->buf5, 0, sizeof sc->buf5); - memset(sc->buf6, 0, sizeof sc->buf6); - memset(sc->buf7, 0, sizeof sc->buf7); - memset(sc->buf8, 0, sizeof sc->buf8); - memset(sc->buf9, 0, sizeof sc->buf9); - memset(sc->buf10, 0, sizeof sc->buf10); - memset(sc->buf11, 0, sizeof sc->buf11); - memset(sc->buf12, 0, sizeof sc->buf12); - memset(sc->buf13, 0, sizeof sc->buf13); - memset(sc->buf14, 0, sizeof sc->buf14); - memset(sc->buf15, 0, sizeof sc->buf15); - for (u = 0; u < 16; u++) { - sc->buf0[4 * u + 0] = (out_size + u); - sc->buf0[4 * u + 1] = (out_size + u) >> 8; - sc->buf1[4 * u + 0] = (out_size + u); - sc->buf1[4 * u + 1] = (out_size + u) >> 8; - sc->buf2[4 * u + 0] = (out_size + u); - sc->buf2[4 * u + 1] = (out_size + u) >> 8; - sc->buf3[4 * u 
+ 0] = (out_size + u); - sc->buf3[4 * u + 1] = (out_size + u) >> 8; - sc->buf4[4 * u + 0] = (out_size + u); - sc->buf4[4 * u + 1] = (out_size + u) >> 8; - sc->buf5[4 * u + 0] = (out_size + u); - sc->buf5[4 * u + 1] = (out_size + u) >> 8; - sc->buf6[4 * u + 0] = (out_size + u); - sc->buf6[4 * u + 1] = (out_size + u) >> 8; - sc->buf7[4 * u + 0] = (out_size + u); - sc->buf7[4 * u + 1] = (out_size + u) >> 8; - sc->buf8[4 * u + 0] = (out_size + u); - sc->buf8[4 * u + 1] = (out_size + u) >> 8; - sc->buf9[4 * u + 0] = (out_size + u); - sc->buf9[4 * u + 1] = (out_size + u) >> 8; - sc->buf10[4 * u + 0] = (out_size + u); - sc->buf10[4 * u + 1] = (out_size + u) >> 8; - sc->buf11[4 * u + 0] = (out_size + u); - sc->buf11[4 * u + 1] = (out_size + u) >> 8; - sc->buf12[4 * u + 0] = (out_size + u); - sc->buf12[4 * u + 1] = (out_size + u) >> 8; - sc->buf13[4 * u + 0] = (out_size + u); - sc->buf13[4 * u + 1] = (out_size + u) >> 8; - sc->buf14[4 * u + 0] = (out_size + u); - sc->buf14[4 * u + 1] = (out_size + u) >> 8; - sc->buf15[4 * u + 0] = (out_size + u); - sc->buf15[4 * u + 1] = (out_size + u) >> 8; - } - sc->Whigh = sc->Wlow = C32(0xFFFFFFFF); - mshabal_compress_avx512f(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, sc->buf4, sc->buf5, - sc->buf6, sc->buf7, sc->buf8, sc->buf9, sc->buf10, sc->buf11, - sc->buf12, sc->buf13, sc->buf14, sc->buf15, 1); - for (u = 0; u < 16; u++) { - sc->buf0[4 * u + 0] = (out_size + u + 16); - sc->buf0[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf1[4 * u + 0] = (out_size + u + 16); - sc->buf1[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf2[4 * u + 0] = (out_size + u + 16); - sc->buf2[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf3[4 * u + 0] = (out_size + u + 16); - sc->buf3[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf4[4 * u + 0] = (out_size + u + 16); - sc->buf4[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf5[4 * u + 0] = (out_size + u + 16); - sc->buf5[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf6[4 * u + 0] = (out_size + u + 16); - sc->buf6[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf7[4 * u + 0] = (out_size + u + 16); - sc->buf7[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf8[4 * u + 0] = (out_size + u + 16); - sc->buf8[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf9[4 * u + 0] = (out_size + u + 16); - sc->buf9[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf10[4 * u + 0] = (out_size + u + 16); - sc->buf10[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf11[4 * u + 0] = (out_size + u + 16); - sc->buf11[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf12[4 * u + 0] = (out_size + u + 16); - sc->buf12[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf13[4 * u + 0] = (out_size + u + 16); - sc->buf13[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf14[4 * u + 0] = (out_size + u + 16); - sc->buf14[4 * u + 1] = (out_size + u + 16) >> 8; - sc->buf15[4 * u + 0] = (out_size + u + 16); - sc->buf15[4 * u + 1] = (out_size + u + 16) >> 8; - } - mshabal_compress_avx512f(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, sc->buf4, sc->buf5, - sc->buf6, sc->buf7, sc->buf8, sc->buf9, sc->buf10, sc->buf11, - sc->buf12, sc->buf13, sc->buf14, sc->buf15, 1); - sc->ptr = 0; - sc->out_size = out_size; -} - -void mshabal_avx512f(mshabal512_context *sc, const void *data0, const void *data1, const void *data2, - const void *data3, const void *data4, const void *data5, const void *data6, const void *data7, - const void *data8, const void *data9, const void *data10, const void *data11, const void *data12, - const void *data13, const void *data14, const void *data15, size_t len) { - size_t 
ptr, num; - - if (data0 == NULL) { - if (data1 == NULL) { - if (data2 == NULL) { - if (data3 == NULL) { - if (data4 == NULL) { - if (data5 == NULL) { - if (data6 == NULL) { - if (data7 == NULL) { - if (data8 == NULL) { - if (data9 == NULL) { - if (data10 == NULL) { - if (data11 == NULL) { - if (data12 == NULL) { - if (data13 == NULL) { - if (data14 == NULL) { - if (data15 == NULL) { - return; - } else { - data0 = data15; - } - } else { - data0 = data14; - } - } else { - data0 = data13; - } - } else { - data0 = data12; - } - } else { - data0 = data11; - } - } else { - data0 = data10; - } - } else { - data0 = data9; - } - } else { - data0 = data8; - } - } else { - data0 = data7; - } - } else { - data0 = data6; - } - } else { - data0 = data5; - } - } else { - data0 = data4; - } - } else { - data0 = data3; - } - } else { - data0 = data2; - } - } else { - data0 = data1; - } - } - - if (data1 == NULL) data1 = data0; - if (data2 == NULL) data2 = data0; - if (data3 == NULL) data3 = data0; - if (data4 == NULL) data4 = data0; - if (data5 == NULL) data5 = data0; - if (data6 == NULL) data6 = data0; - if (data7 == NULL) data7 = data0; - if (data8 == NULL) data8 = data0; - if (data9 == NULL) data9 = data0; - if (data10 == NULL) data10 = data0; - if (data11 == NULL) data11 = data0; - if (data12 == NULL) data12 = data0; - if (data13 == NULL) data13 = data0; - if (data14 == NULL) data14 = data0; - if (data15 == NULL) data15 = data0; - - ptr = sc->ptr; - if (ptr != 0) { - size_t clen = (sizeof sc->buf0 - ptr); - if (clen > len) { - memcpy(sc->buf0 + ptr, data0, len); - memcpy(sc->buf1 + ptr, data1, len); - memcpy(sc->buf2 + ptr, data2, len); - memcpy(sc->buf3 + ptr, data3, len); - memcpy(sc->buf4 + ptr, data4, len); - memcpy(sc->buf5 + ptr, data5, len); - memcpy(sc->buf6 + ptr, data6, len); - memcpy(sc->buf7 + ptr, data7, len); - memcpy(sc->buf8 + ptr, data8, len); - memcpy(sc->buf9 + ptr, data9, len); - memcpy(sc->buf10 + ptr, data10, len); - memcpy(sc->buf11 + ptr, data11, len); - memcpy(sc->buf12 + ptr, data12, len); - memcpy(sc->buf13 + ptr, data13, len); - memcpy(sc->buf14 + ptr, data14, len); - memcpy(sc->buf15 + ptr, data15, len); - sc->ptr = ptr + len; - return; - } else { - memcpy(sc->buf0 + ptr, data0, clen); - memcpy(sc->buf1 + ptr, data1, clen); - memcpy(sc->buf2 + ptr, data2, clen); - memcpy(sc->buf3 + ptr, data3, clen); - memcpy(sc->buf4 + ptr, data4, clen); - memcpy(sc->buf5 + ptr, data5, clen); - memcpy(sc->buf6 + ptr, data6, clen); - memcpy(sc->buf7 + ptr, data7, clen); - memcpy(sc->buf8 + ptr, data8, clen); - memcpy(sc->buf9 + ptr, data9, clen); - memcpy(sc->buf10 + ptr, data10, clen); - memcpy(sc->buf11 + ptr, data11, clen); - memcpy(sc->buf12 + ptr, data12, clen); - memcpy(sc->buf13 + ptr, data13, clen); - memcpy(sc->buf14 + ptr, data14, clen); - memcpy(sc->buf15 + ptr, data15, clen); - mshabal_compress_avx512f(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, sc->buf4, sc->buf5, - sc->buf6, sc->buf7, sc->buf8, sc->buf9, sc->buf10, sc->buf11, - sc->buf12, sc->buf13, sc->buf14, sc->buf15, 1); - data0 = (const unsigned char *)data0 + clen; - data1 = (const unsigned char *)data1 + clen; - data2 = (const unsigned char *)data2 + clen; - data3 = (const unsigned char *)data3 + clen; - data4 = (const unsigned char *)data4 + clen; - data5 = (const unsigned char *)data5 + clen; - data6 = (const unsigned char *)data6 + clen; - data7 = (const unsigned char *)data7 + clen; - data8 = (const unsigned char *)data8 + clen; - data9 = (const unsigned char *)data9 + clen; - data10 = (const unsigned char *)data10 
+ clen; - data11 = (const unsigned char *)data11 + clen; - data12 = (const unsigned char *)data12 + clen; - data13 = (const unsigned char *)data13 + clen; - data14 = (const unsigned char *)data14 + clen; - data15 = (const unsigned char *)data15 + clen; - len -= clen; - } - } - - num = len >> 6; - if (num != 0) { - mshabal_compress_avx512f(sc, data0, data1, data2, data3, data4, data5, data6, data7, - data8, data9, data10, data11, data12, data13, data14, data15, num); - data0 = (const unsigned char *)data0 + (num << 6); - data1 = (const unsigned char *)data1 + (num << 6); - data2 = (const unsigned char *)data2 + (num << 6); - data3 = (const unsigned char *)data3 + (num << 6); - data4 = (const unsigned char *)data4 + (num << 6); - data5 = (const unsigned char *)data5 + (num << 6); - data6 = (const unsigned char *)data6 + (num << 6); - data7 = (const unsigned char *)data7 + (num << 6); - data8 = (const unsigned char *)data8 + (num << 6); - data9 = (const unsigned char *)data9 + (num << 6); - data10 = (const unsigned char *)data10 + (num << 6); - data11 = (const unsigned char *)data11 + (num << 6); - data12 = (const unsigned char *)data12 + (num << 6); - data13 = (const unsigned char *)data13 + (num << 6); - data14 = (const unsigned char *)data14 + (num << 6); - data15 = (const unsigned char *)data15 + (num << 6); - } - len &= (size_t)63; - memcpy(sc->buf0, data0, len); - memcpy(sc->buf1, data1, len); - memcpy(sc->buf2, data2, len); - memcpy(sc->buf3, data3, len); - memcpy(sc->buf4, data4, len); - memcpy(sc->buf5, data5, len); - memcpy(sc->buf6, data6, len); - memcpy(sc->buf7, data7, len); - memcpy(sc->buf8, data8, len); - memcpy(sc->buf9, data9, len); - memcpy(sc->buf10, data10, len); - memcpy(sc->buf11, data11, len); - memcpy(sc->buf12, data12, len); - memcpy(sc->buf13, data13, len); - memcpy(sc->buf14, data14, len); - memcpy(sc->buf15, data15, len); - sc->ptr = len; -} - -void mshabal_close_avx512f(mshabal512_context *sc, unsigned ub0, unsigned ub1, unsigned ub2, - unsigned ub3, unsigned ub4, unsigned ub5, unsigned ub6, unsigned ub7, - unsigned ub8, unsigned ub9, unsigned ub10, unsigned ub11, unsigned ub12, - unsigned ub13, unsigned ub14, unsigned ub15, - unsigned n, void *dst0, void *dst1, void *dst2, void *dst3, void *dst4, - void *dst5, void *dst6, void *dst7, void *dst8, void *dst9, void *dst10, - void *dst11, void *dst12, void *dst13, void *dst14, void *dst15) { - size_t ptr, off; - unsigned z, out_size_w32; - - z = 0x80 >> n; - ptr = sc->ptr; - sc->buf0[ptr] = (ub0 & -z) | z; - sc->buf1[ptr] = (ub1 & -z) | z; - sc->buf2[ptr] = (ub2 & -z) | z; - sc->buf3[ptr] = (ub3 & -z) | z; - sc->buf4[ptr] = (ub4 & -z) | z; - sc->buf5[ptr] = (ub5 & -z) | z; - sc->buf6[ptr] = (ub6 & -z) | z; - sc->buf7[ptr] = (ub7 & -z) | z; - sc->buf8[ptr] = (ub8 & -z) | z; - sc->buf9[ptr] = (ub9 & -z) | z; - sc->buf10[ptr] = (ub10 & -z) | z; - sc->buf11[ptr] = (ub11 & -z) | z; - sc->buf12[ptr] = (ub12 & -z) | z; - sc->buf13[ptr] = (ub13 & -z) | z; - sc->buf14[ptr] = (ub14 & -z) | z; - sc->buf15[ptr] = (ub15 & -z) | z; - ptr++; - memset(sc->buf0 + ptr, 0, (sizeof sc->buf0) - ptr); - memset(sc->buf1 + ptr, 0, (sizeof sc->buf1) - ptr); - memset(sc->buf2 + ptr, 0, (sizeof sc->buf2) - ptr); - memset(sc->buf3 + ptr, 0, (sizeof sc->buf3) - ptr); - memset(sc->buf4 + ptr, 0, (sizeof sc->buf4) - ptr); - memset(sc->buf5 + ptr, 0, (sizeof sc->buf5) - ptr); - memset(sc->buf6 + ptr, 0, (sizeof sc->buf6) - ptr); - memset(sc->buf7 + ptr, 0, (sizeof sc->buf7) - ptr); - memset(sc->buf8 + ptr, 0, (sizeof sc->buf8) - ptr); - 
memset(sc->buf9 + ptr, 0, (sizeof sc->buf9) - ptr); - memset(sc->buf10 + ptr, 0, (sizeof sc->buf10) - ptr); - memset(sc->buf11 + ptr, 0, (sizeof sc->buf11) - ptr); - memset(sc->buf12 + ptr, 0, (sizeof sc->buf12) - ptr); - memset(sc->buf13 + ptr, 0, (sizeof sc->buf13) - ptr); - memset(sc->buf14 + ptr, 0, (sizeof sc->buf14) - ptr); - memset(sc->buf15 + ptr, 0, (sizeof sc->buf15) - ptr); - for (z = 0; z < 4; z++) { - mshabal_compress_avx512f(sc, sc->buf0, sc->buf1, sc->buf2, sc->buf3, sc->buf4, sc->buf5, - sc->buf6, sc->buf7, sc->buf8, sc->buf9, sc->buf10, sc->buf11, sc->buf12, sc->buf13, - sc->buf14, sc->buf15, 1); - if (sc->Wlow-- == 0) sc->Whigh--; - } - out_size_w32 = sc->out_size >> 5; - off = MSHABAL512_VECTOR_SIZE * (28 + (16 - out_size_w32)); - if (dst0 != NULL) { - u32 *out; - - out = (u32 *)dst0; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 0]; - } - if (dst1 != NULL) { - u32 *out; - - out = (u32 *)dst1; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 1]; - } - if (dst2 != NULL) { - u32 *out; - - out = (u32 *)dst2; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 2]; - } - if (dst3 != NULL) { - u32 *out; - - out = (u32 *)dst3; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 3]; - } - if (dst4 != NULL) { - u32 *out; - - out = (u32 *)dst4; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 4]; - } - if (dst5 != NULL) { - u32 *out; - - out = (u32 *)dst5; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 5]; - } - if (dst6 != NULL) { - u32 *out; - - out = (u32 *)dst6; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 6]; - } - if (dst7 != NULL) { - u32 *out; - - out = (u32 *)dst7; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 7]; - } - if (dst8 != NULL) { - u32 *out; - - out = (u32 *)dst8; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 8]; - } - if (dst9 != NULL) { - u32 *out; - - out = (u32 *)dst9; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 9]; - } - if (dst10 != NULL) { - u32 *out; - - out = (u32 *)dst10; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 10]; - } - if (dst11 != NULL) { - u32 *out; - - out = (u32 *)dst11; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 11]; - } - if (dst12 != NULL) { - u32 *out; - - out = (u32 *)dst12; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 12]; - } - if (dst13 != NULL) { - u32 *out; - - out = (u32 *)dst13; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 13]; - } - if (dst14 != NULL) { - u32 *out; - - out = (u32 *)dst14; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 14]; - } - if (dst15 != NULL) { - u32 *out; - - out = (u32 *)dst15; - for (z = 0; z < out_size_w32; z++) - out[z] = sc->state[off + z * MSHABAL512_VECTOR_SIZE + 15]; - } -} - -// Shabal routine optimized for plotting and hashing -void mshabal_hash_fast_avx512f(mshabal512_context_fast *sc, void *message, void *termination, - void *dst, unsigned num) { - union input { - u32 words[16 * MSHABAL512_VECTOR_SIZE]; - __m512i 
data[16]; - }; - size_t j; - __m512i A[12], B[16], C[16]; - __m512i one; - - for (j = 0; j < 12; j++) A[j] = _mm512_loadu_si512((__m512i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm512_loadu_si512((__m512i *)sc->state + j + 12); - C[j] = _mm512_loadu_si512((__m512i *)sc->state + j + 28); - } - one = _mm512_set1_epi32(C32(0xFFFFFFFF)); - - // round 1 -#define M(i) _mm512_load_si512((__m512i *)message + i) - - while (num-- > 0) { - for (j = 0; j < 16; j++) B[j] = _mm512_add_epi32(B[j], M(j)); - - A[0] = _mm512_xor_si512(A[0], _mm512_set1_epi32(sc->Wlow)); - A[1] = _mm512_xor_si512(A[1], _mm512_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm512_or_si512(_mm512_slli_epi32(B[j], 17), _mm512_srli_epi32(B[j], 15)); - -#define PP512(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m512i tt; \ - tt = _mm512_or_si512(_mm512_slli_epi32(xa1, 15), _mm512_srli_epi32(xa1, 17)); \ - tt = _mm512_add_epi32(_mm512_slli_epi32(tt, 2), tt); \ - tt = _mm512_xor_si512(_mm512_xor_si512(xa0, tt), xc); \ - tt = _mm512_add_epi32(_mm512_slli_epi32(tt, 1), tt); \ - tt = _mm512_xor_si512(_mm512_xor_si512(tt, xb1), \ - _mm512_xor_si512(_mm512_andnot_si512(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm512_or_si512(_mm512_slli_epi32(tt, 1), _mm512_srli_epi32(tt, 31)); \ - xb0 = _mm512_xor_si512(tt, _mm512_xor_si512(xa0, one)); \ - } while (0) - - PP512(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP512(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP512(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP512(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP512(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP512(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP512(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP512(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP512(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP512(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP512(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP512(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP512(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP512(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP512(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP512(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP512(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP512(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP512(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP512(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP512(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP512(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP512(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP512(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP512(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP512(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP512(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP512(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP512(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP512(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], 
B[0x3], C[0xB], M(0xD)); - PP512(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP512(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP512(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP512(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP512(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP512(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP512(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP512(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP512(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP512(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP512(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP512(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP512(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP512(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP512(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP512(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP512(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP512(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm512_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm512_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm512_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB512(xb, xc, xm) \ - do { \ - __m512i tmp; \ - tmp = xb; \ - xb = _mm512_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB512(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB512(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB512(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB512(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB512(B[0x4], C[0x4], M(0x4)); - SWAP_AND_SUB512(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB512(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB512(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB512(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB512(B[0x9], C[0x9], M(0x9)); - 
SWAP_AND_SUB512(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB512(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB512(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB512(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB512(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB512(B[0xF], C[0xF], M(0xF)); - - // move data pointer - message = (__m512i *)message + 16; - - if (++sc->Wlow == 0) sc->Whigh++; - } - - // round 2-5 -#define M2(i) _mm512_load_si512((__m512i *)termination + i) - - for (int k = 0; k < 4; k++) { - for (j = 0; j < 16; j++) B[j] = _mm512_add_epi32(B[j], M2(j)); - - A[0] = _mm512_xor_si512(A[0], _mm512_set1_epi32(sc->Wlow)); - A[1] = _mm512_xor_si512(A[1], _mm512_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm512_or_si512(_mm512_slli_epi32(B[j], 17), _mm512_srli_epi32(B[j], 15)); - - PP512(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP512(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP512(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP512(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP512(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP512(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP512(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP512(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP512(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP512(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP512(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP512(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP512(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP512(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP512(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP512(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP512(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP512(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP512(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP512(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP512(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP512(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP512(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP512(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP512(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP512(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP512(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP512(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP512(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP512(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP512(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP512(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP512(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP512(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP512(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP512(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP512(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP512(A[0x1], A[0x0], B[0x5], 
B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP512(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP512(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP512(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP512(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP512(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP512(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP512(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP512(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP512(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP512(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - A[0xB] = _mm512_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm512_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm512_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0x3]); - - SWAP_AND_SUB512(B[0x0], C[0x0], M2(0x0)); - SWAP_AND_SUB512(B[0x1], C[0x1], M2(0x1)); - SWAP_AND_SUB512(B[0x2], C[0x2], M2(0x2)); - SWAP_AND_SUB512(B[0x3], C[0x3], M2(0x3)); - SWAP_AND_SUB512(B[0x4], C[0x4], M2(0x4)); - SWAP_AND_SUB512(B[0x5], C[0x5], M2(0x5)); - SWAP_AND_SUB512(B[0x6], C[0x6], M2(0x6)); - SWAP_AND_SUB512(B[0x7], C[0x7], M2(0x7)); - SWAP_AND_SUB512(B[0x8], C[0x8], M2(0x8)); - SWAP_AND_SUB512(B[0x9], C[0x9], M2(0x9)); - SWAP_AND_SUB512(B[0xA], C[0xA], M2(0xA)); - SWAP_AND_SUB512(B[0xB], C[0xB], M2(0xB)); - SWAP_AND_SUB512(B[0xC], C[0xC], M2(0xC)); - SWAP_AND_SUB512(B[0xD], C[0xD], M2(0xD)); - SWAP_AND_SUB512(B[0xE], C[0xE], M2(0xE)); - SWAP_AND_SUB512(B[0xF], C[0xF], M2(0xF)); - - if (++sc->Wlow == 0) sc->Whigh++; - - if (sc->Wlow-- == 0) sc->Whigh--; - } - - // download SIMD aligned hashes - for (j = 0; j < 8; j++) { - _mm512_storeu_si512((__m512i *)dst + j, C[j+8]); - } - - // reset Wlow & Whigh - sc->Wlow = 1; - sc->Whigh = 0; -} - -// Shabal routine optimized for mining -void mshabal_deadline_fast_avx512f(mshabal512_context_fast *sc, void *message, void *termination, void *dst0, - void *dst1, void *dst2, void 
*dst3, void *dst4, void *dst5, - void *dst6, void *dst7, void *dst8, void *dst9, void *dst10, - void *dst11, void *dst12, void *dst13, void *dst14, - void *dst15) { - union input { - u32 words[16 * MSHABAL512_VECTOR_SIZE]; - __m512i data[16]; - }; - size_t j; - __m512i A[12], B[16], C[16]; - __m512i one; - - for (j = 0; j < 12; j++) A[j] = _mm512_loadu_si512((__m512i *)sc->state + j); - for (j = 0; j < 16; j++) { - B[j] = _mm512_loadu_si512((__m512i *)sc->state + j + 12); - C[j] = _mm512_loadu_si512((__m512i *)sc->state + j + 28); - } - one = _mm512_set1_epi32(C32(0xFFFFFFFF)); - - // round 1 -#define M(i) _mm512_load_si512((__m512i *)message + i) - - for (j = 0; j < 16; j++) B[j] = _mm512_add_epi32(B[j], M(j)); - - A[0] = _mm512_xor_si512(A[0], _mm512_set1_epi32(sc->Wlow)); - A[1] = _mm512_xor_si512(A[1], _mm512_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm512_or_si512(_mm512_slli_epi32(B[j], 17), _mm512_srli_epi32(B[j], 15)); - -#define PP512(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - __m512i tt; \ - tt = _mm512_or_si512(_mm512_slli_epi32(xa1, 15), _mm512_srli_epi32(xa1, 17)); \ - tt = _mm512_add_epi32(_mm512_slli_epi32(tt, 2), tt); \ - tt = _mm512_xor_si512(_mm512_xor_si512(xa0, tt), xc); \ - tt = _mm512_add_epi32(_mm512_slli_epi32(tt, 1), tt); \ - tt = _mm512_xor_si512(_mm512_xor_si512(tt, xb1), \ - _mm512_xor_si512(_mm512_andnot_si512(xb3, xb2), xm)); \ - xa0 = tt; \ - tt = xb0; \ - tt = _mm512_or_si512(_mm512_slli_epi32(tt, 1), _mm512_srli_epi32(tt, 31)); \ - xb0 = _mm512_xor_si512(tt, _mm512_xor_si512(xa0, one)); \ - } while (0) - - PP512(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP512(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP512(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP512(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP512(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP512(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP512(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP512(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP512(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP512(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP512(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP512(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP512(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP512(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP512(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP512(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP512(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP512(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP512(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP512(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP512(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP512(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP512(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP512(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP512(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP512(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP512(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], 
C[0xE], M(0xA)); - PP512(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP512(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP512(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP512(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP512(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - PP512(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); - PP512(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); - PP512(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); - PP512(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); - PP512(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); - PP512(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); - PP512(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); - PP512(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); - PP512(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); - PP512(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); - PP512(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); - PP512(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); - PP512(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); - PP512(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); - PP512(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); - PP512(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); - - A[0xB] = _mm512_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm512_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm512_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0x3]); - -#define SWAP_AND_SUB512(xb, xc, xm) \ - do { \ - __m512i tmp; \ - tmp = xb; \ - xb = _mm512_sub_epi32(xc, xm); \ - xc = tmp; \ - } while (0) - - SWAP_AND_SUB512(B[0x0], C[0x0], M(0x0)); - SWAP_AND_SUB512(B[0x1], C[0x1], M(0x1)); - SWAP_AND_SUB512(B[0x2], C[0x2], M(0x2)); - SWAP_AND_SUB512(B[0x3], C[0x3], M(0x3)); - SWAP_AND_SUB512(B[0x4], C[0x4], M(0x4)); - 
SWAP_AND_SUB512(B[0x5], C[0x5], M(0x5)); - SWAP_AND_SUB512(B[0x6], C[0x6], M(0x6)); - SWAP_AND_SUB512(B[0x7], C[0x7], M(0x7)); - SWAP_AND_SUB512(B[0x8], C[0x8], M(0x8)); - SWAP_AND_SUB512(B[0x9], C[0x9], M(0x9)); - SWAP_AND_SUB512(B[0xA], C[0xA], M(0xA)); - SWAP_AND_SUB512(B[0xB], C[0xB], M(0xB)); - SWAP_AND_SUB512(B[0xC], C[0xC], M(0xC)); - SWAP_AND_SUB512(B[0xD], C[0xD], M(0xD)); - SWAP_AND_SUB512(B[0xE], C[0xE], M(0xE)); - SWAP_AND_SUB512(B[0xF], C[0xF], M(0xF)); - - if (++sc->Wlow == 0) sc->Whigh++; - - // round 2-5 -#define M2(i) _mm512_load_si512((__m512i *)termination + i) - - for (int k = 0; k < 4; k++) { - for (j = 0; j < 16; j++) B[j] = _mm512_add_epi32(B[j], M2(j)); - - A[0] = _mm512_xor_si512(A[0], _mm512_set1_epi32(sc->Wlow)); - A[1] = _mm512_xor_si512(A[1], _mm512_set1_epi32(sc->Whigh)); - - for (j = 0; j < 16; j++) - B[j] = _mm512_or_si512(_mm512_slli_epi32(B[j], 17), _mm512_srli_epi32(B[j], 15)); - - PP512(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP512(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP512(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP512(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP512(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP512(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP512(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP512(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP512(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP512(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP512(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP512(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP512(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP512(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP512(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP512(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP512(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP512(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP512(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP512(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP512(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP512(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP512(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP512(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP512(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP512(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP512(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP512(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP512(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP512(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP512(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP512(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - PP512(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M2(0x0)); - PP512(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M2(0x1)); - PP512(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M2(0x2)); - PP512(A[0xB], A[0xA], B[0x3], 
B[0x0], B[0xC], B[0x9], C[0x5], M2(0x3)); - PP512(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M2(0x4)); - PP512(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M2(0x5)); - PP512(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M2(0x6)); - PP512(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M2(0x7)); - PP512(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M2(0x8)); - PP512(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M2(0x9)); - PP512(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M2(0xA)); - PP512(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M2(0xB)); - PP512(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M2(0xC)); - PP512(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M2(0xD)); - PP512(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M2(0xE)); - PP512(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M2(0xF)); - - A[0xB] = _mm512_add_epi32(A[0xB], C[0x6]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0x5]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0x4]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0x3]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0x2]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x1]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x0]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0xF]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0xE]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0xD]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0xC]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0xB]); - A[0xB] = _mm512_add_epi32(A[0xB], C[0xA]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0x9]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0x8]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0x7]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0x6]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x5]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x4]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0x3]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0x2]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0x1]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0x0]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0xF]); - A[0xB] = _mm512_add_epi32(A[0xB], C[0xE]); - A[0xA] = _mm512_add_epi32(A[0xA], C[0xD]); - A[0x9] = _mm512_add_epi32(A[0x9], C[0xC]); - A[0x8] = _mm512_add_epi32(A[0x8], C[0xB]); - A[0x7] = _mm512_add_epi32(A[0x7], C[0xA]); - A[0x6] = _mm512_add_epi32(A[0x6], C[0x9]); - A[0x5] = _mm512_add_epi32(A[0x5], C[0x8]); - A[0x4] = _mm512_add_epi32(A[0x4], C[0x7]); - A[0x3] = _mm512_add_epi32(A[0x3], C[0x6]); - A[0x2] = _mm512_add_epi32(A[0x2], C[0x5]); - A[0x1] = _mm512_add_epi32(A[0x1], C[0x4]); - A[0x0] = _mm512_add_epi32(A[0x0], C[0x3]); - - SWAP_AND_SUB512(B[0x0], C[0x0], M2(0x0)); - SWAP_AND_SUB512(B[0x1], C[0x1], M2(0x1)); - SWAP_AND_SUB512(B[0x2], C[0x2], M2(0x2)); - SWAP_AND_SUB512(B[0x3], C[0x3], M2(0x3)); - SWAP_AND_SUB512(B[0x4], C[0x4], M2(0x4)); - SWAP_AND_SUB512(B[0x5], C[0x5], M2(0x5)); - SWAP_AND_SUB512(B[0x6], C[0x6], M2(0x6)); - SWAP_AND_SUB512(B[0x7], C[0x7], M2(0x7)); - SWAP_AND_SUB512(B[0x8], C[0x8], M2(0x8)); - SWAP_AND_SUB512(B[0x9], C[0x9], M2(0x9)); - SWAP_AND_SUB512(B[0xA], C[0xA], M2(0xA)); - SWAP_AND_SUB512(B[0xB], C[0xB], M2(0xB)); - SWAP_AND_SUB512(B[0xC], C[0xC], M2(0xC)); - SWAP_AND_SUB512(B[0xD], C[0xD], M2(0xD)); - SWAP_AND_SUB512(B[0xE], C[0xE], M2(0xE)); - SWAP_AND_SUB512(B[0xF], C[0xF], M2(0xF)); - - if (++sc->Wlow == 0) sc->Whigh++; - - if (sc->Wlow-- == 0) sc->Whigh--; - } - - // download SIMD aligned deadlines - u32 simd_dst[32]; - _mm512_storeu_si512((__m512i *)&simd_dst[0], C[8]); - _mm512_storeu_si512((__m512i *)&simd_dst[16], C[9]); - - // unpack SIMD data - unsigned z; - for (z = 
0; z < 2; z++) { - unsigned y = z * MSHABAL512_VECTOR_SIZE; - ((u32 *)dst0)[z] = simd_dst[y + 0]; - ((u32 *)dst1)[z] = simd_dst[y + 1]; - ((u32 *)dst2)[z] = simd_dst[y + 2]; - ((u32 *)dst3)[z] = simd_dst[y + 3]; - ((u32 *)dst4)[z] = simd_dst[y + 4]; - ((u32 *)dst5)[z] = simd_dst[y + 5]; - ((u32 *)dst6)[z] = simd_dst[y + 6]; - ((u32 *)dst7)[z] = simd_dst[y + 7]; - ((u32 *)dst8)[z] = simd_dst[y + 8]; - ((u32 *)dst9)[z] = simd_dst[y + 9]; - ((u32 *)dst10)[z] = simd_dst[y + 10]; - ((u32 *)dst11)[z] = simd_dst[y + 11]; - ((u32 *)dst12)[z] = simd_dst[y + 12]; - ((u32 *)dst13)[z] = simd_dst[y + 13]; - ((u32 *)dst14)[z] = simd_dst[y + 14]; - ((u32 *)dst15)[z] = simd_dst[y + 15]; - } - - // reset Wlow & Whigh - sc->Wlow = 1; - sc->Whigh = 0; -} - -#ifdef __cplusplus -} -#endif diff --git a/src/c/mshabal_512_avx512f.h b/src/c/mshabal_512_avx512f.h deleted file mode 100644 index 57455f5..0000000 --- a/src/c/mshabal_512_avx512f.h +++ /dev/null @@ -1,195 +0,0 @@ -/* - * A parallel implementation of Shabal, for platforms with AVX512F. - * - * This is the header file for an implementation of the Shabal family - * of hash functions, designed for maximum parallel speed. It processes - * up to sixteen instances of Shabal in parallel, using the AVX512F unit. - * Total bandwidth appears to be up to twice that of a plain 32-bit - * Shabal implementation. - * - * A computation uses a mshabal_context structure. That structure is - * supposed to be allocated and released by the caller, e.g. as a - * local or global variable, or on the heap. The structure contents - * are initialized with mshabal_init(). Once the structure has been - * initialized, data is input as chunks, with the mshabal() functions. - * Chunks for the sixteen parallel instances are provided simultaneously - * and must have the same length. It is allowed not to use some of the - * instances; the corresponding parameters in mshabal() are then NULL. - * However, using NULL as a chunk for one of the instances effectively - * deactivates that instance; this cannot be used to "skip" a chunk - * for one instance. - * - * The computation is finalized with mshabal_close(). Some extra message - * bits (0 to 7) can be input. The outputs of the sixteen parallel instances - * are written in the provided buffers. There again, NULL can be - * provided as parameter if the output of one of the instances is not - * needed. - * - * A mshabal_context instance is self-contained and holds no pointer. - * Thus, it can be cloned (e.g. with memcpy()) or moved (as long as - * proper alignment is maintained). This implementation uses no state - * variable beyond the context instance; thus, it is thread-safe and - * reentrant. - * - * The Shabal specification defines Shabal with output sizes of 192, - * 224, 256, 384 and 512 bits. This code accepts all those sizes, as - * well as any output size which is a multiple of 32, between 32 and - * 512 (inclusive). - * - * Parameters are not validated. Thus, undefined behaviour occurs if - * any of the "shall" or "must" clauses in this documentation is - * violated. - * - * - * (c) 2010 SAPHIR project. This software is provided 'as-is', without - * any express or implied warranty. In no event will the authors be held - * liable for any damages arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to no restriction. 
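The header text above describes the allocate/init/input/close flow only in prose, so a minimal usage sketch of the sixteen-lane API declared in this header may help; the helper name hash16, the 64-byte message length and the 256-bit output size are illustrative assumptions, not part of the original source:

    #include "mshabal_512_avx512f.h"

    /* Hash sixteen 64-byte messages in parallel into sixteen 256-bit digests. */
    static void hash16(const unsigned char msg[16][64], unsigned char dig[16][32]) {
        mshabal512_context sc;
        mshabal_init_avx512f(&sc, 256);
        mshabal_avx512f(&sc, msg[0], msg[1], msg[2], msg[3], msg[4], msg[5],
                        msg[6], msg[7], msg[8], msg[9], msg[10], msg[11],
                        msg[12], msg[13], msg[14], msg[15], 64);
        /* byte-aligned input: no extra bits, so every ub* and n are zero */
        mshabal_close_avx512f(&sc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                              dig[0], dig[1], dig[2], dig[3], dig[4], dig[5],
                              dig[6], dig[7], dig[8], dig[9], dig[10], dig[11],
                              dig[12], dig[13], dig[14], dig[15]);
    }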
- * - * Technical remarks and questions can be addressed to: - */ - -#ifndef MSHABAL_H__ -#define MSHABAL_H__ - -#include <stddef.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * We need an integer type with width 32-bit or more (preferably, with - * a width of exactly 32 bits). - */ -#if defined __STDC__ && __STDC_VERSION__ >= 199901L -#include <stdint.h> -#ifdef UINT32_MAX -typedef uint32_t mshabal_u32; -#else -typedef uint_fast32_t mshabal_u32; -#endif -#else -#if ((UINT_MAX >> 11) >> 11) >= 0x3FF -typedef unsigned int mshabal_u32; -#else -typedef unsigned long mshabal_u32; -#endif -#endif - -#define MSHABAL512_VECTOR_SIZE 16 - -/* - * The context structure for a Shabal computation. Contents are - * private. Such a structure should be allocated and released by - * the caller, in any memory area. - */ -typedef struct { - unsigned char buf0[64]; - unsigned char buf1[64]; - unsigned char buf2[64]; - unsigned char buf3[64]; - unsigned char buf4[64]; - unsigned char buf5[64]; - unsigned char buf6[64]; - unsigned char buf7[64]; - unsigned char buf8[64]; - unsigned char buf9[64]; - unsigned char buf10[64]; - unsigned char buf11[64]; - unsigned char buf12[64]; - unsigned char buf13[64]; - unsigned char buf14[64]; - unsigned char buf15[64]; - size_t ptr; - mshabal_u32 state[(12 + 16 + 16) * MSHABAL512_VECTOR_SIZE]; - mshabal_u32 Whigh, Wlow; - unsigned out_size; -} mshabal512_context; - -#pragma pack(1) -typedef struct { - mshabal_u32 state[(12 + 16 + 16) * MSHABAL512_VECTOR_SIZE]; - mshabal_u32 Whigh, Wlow; - unsigned out_size; -} mshabal512_context_fast; -#pragma pack() - -/* - * Initialize a context structure. The output size must be a multiple - * of 32, between 32 and 512 (inclusive). The output size is expressed - * in bits. - */ -void mshabal_init_avx512f(mshabal512_context *sc, unsigned out_size); - -/* - * Process some more data bytes; sixteen chunks of data, pointed to by - * data0 through data15, are processed. The sixteen chunks have - * the same length of "len" bytes. For efficiency, it is best if data is - * processed by medium-sized chunks, e.g. a few kilobytes at a time. - * - * The "len" data bytes shall all be accessible. If "len" is zero, this - * function does nothing and ignores the data* arguments. - * Otherwise, if one of the data* arguments is NULL, then the - * corresponding instance is deactivated (the final value obtained from - * that instance is undefined). - */ -void mshabal_avx512f(mshabal512_context *sc, const void *data0, const void *data1, const void *data2, const void *data3, - const void *data4, const void *data5, const void *data6, const void *data7, const void *data8, const void *data9, - const void *data10, const void *data11, const void *data12, const void *data13, const void *data14, - const void *data15, size_t len); - -/* - * Terminate the Shabal computation incarnated by the provided context - * structure. "n" shall be a value between 0 and 7 (inclusive): this is - * the number of extra bits to extract from ub0 through ub15, and - * append at the end of the input message for each of the sixteen parallel - * instances. Bits in "ub*" are taken in big-endian format: first bit is - * the one of numerical value 128, second bit has numerical value 64, - * and so on. Other bits in "ub*" are ignored. For most applications, - * input messages will consist of a sequence of bytes, and the "ub*" and - * "n" parameters will be zero. 
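Since the lane-interleaved state layout that the close routine reads back is easy to get wrong, here is a scalar sketch of the per-lane digest extraction, consistent with the extraction loops in mshabal_close_avx512f above; the helper name extract_lane is hypothetical:

    #include "mshabal_512_avx512f.h"

    /* state[] holds (12 + 16 + 16) vectors of sixteen lane-interleaved
     * 32-bit words, so word z of lane k lives at
     * state[off + z * MSHABAL512_VECTOR_SIZE + k].
     * out_size_w32 is the digest size in 32-bit words (out_size >> 5). */
    static void extract_lane(const mshabal_u32 *state, unsigned out_size_w32,
                             unsigned lane, mshabal_u32 *out) {
        size_t off = MSHABAL512_VECTOR_SIZE * (28 + (16 - out_size_w32));
        for (unsigned z = 0; z < out_size_w32; z++)
            out[z] = state[off + z * MSHABAL512_VECTOR_SIZE + lane];
    }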
- * - * The Shabal output for each of the parallel instances is written out - * in the areas pointed to by, respectively, dst0 through dst15. - * These areas shall be wide enough to accommodate the result (result - * size was specified as parameter to mshabal_init()). It is acceptable - * to use NULL for any of those pointers, if the result from the - * corresponding instance is not needed. - * - * After this call, the context structure is invalid. The caller shall - * release it, or reinitialize it with mshabal_init(). The mshabal_close() - * function does NOT imply a hidden call to mshabal_init(). - */ -void mshabal_close_avx512f(mshabal512_context *sc, unsigned ub0, unsigned ub1, unsigned ub2, - unsigned ub3, unsigned ub4, unsigned ub5, unsigned ub6, unsigned ub7, - unsigned ub8, unsigned ub9, unsigned ub10, unsigned ub11, unsigned ub12, - unsigned ub13, unsigned ub14, unsigned ub15, unsigned n, void *dst0, - void *dst1, void *dst2, void *dst3, void *dst4, void *dst5, void *dst6, - void *dst7, void *dst8, void *dst9, void *dst10, void *dst11, - void *dst12, void *dst13, void *dst14, void *dst15); - -/* - * optimised Shabal routine for PoC plotting and hashing - */ -void mshabal_hash_fast_avx512f(mshabal512_context_fast *sc, void *message, void *termination, - void *dst, unsigned len); - -/* - * optimised Shabal routine for PoC mining - */ -void mshabal_deadline_fast_avx512f(mshabal512_context_fast *sc, void *message, void *termination, void *dst0, - void *dst1, void *dst2, void *dst3, void *dst4, void *dst5, - void *dst6, void *dst7, void *dst8, void *dst9, void *dst10, - void *dst11, void *dst12, void *dst13, void *dst14, - void *dst15); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/src/c/shabal.c b/src/c/shabal.c deleted file mode 100644 index c817e29..0000000 --- a/src/c/shabal.c +++ /dev/null @@ -1,13 +0,0 @@ -#include "shabal.h" -#include <stdint.h> -#include "common.h" -#include "sph_shabal.h" - -void find_best_deadline_sph(char *scoops, uint64_t nonce_count, char *gensig, - uint64_t *best_deadline, uint64_t *best_offset) { - uint64_t dl = 0; - for (uint64_t i = 0; i < nonce_count; i++){ - sph_shabal_deadline_fast(&scoops[i * 64], gensig, &dl); - SET_BEST_DEADLINE(dl, i); - } -} \ No newline at end of file diff --git a/src/c/shabal.h b/src/c/shabal.h deleted file mode 100644 index ebb741f..0000000 --- a/src/c/shabal.h +++ /dev/null @@ -1,7 +0,0 @@ -#pragma once - -#include <stddef.h> -#include <stdint.h> - -void find_best_deadline_sph(char *scoops, uint64_t nonce_count, char *gensig, - uint64_t *best_deadline, uint64_t *best_offset); diff --git a/src/c/shabal_avx.c b/src/c/shabal_avx.c deleted file mode 100644 index 70e29c8..0000000 --- a/src/c/shabal_avx.c +++ /dev/null @@ -1,75 +0,0 @@ -#include "shabal_avx.h" -#include <immintrin.h> -#include <string.h> -#include "common.h" -#include "mshabal_128_avx.h" -#include "sph_shabal.h" - -mshabal128_context global_128; -mshabal128_context_fast global_128_fast; - -void init_shabal_avx() { - mshabal_init_avx(&global_128, 256); - global_128_fast.out_size = global_128.out_size; - for (uint64_t i = 0; i < 176; i++) global_128_fast.state[i] = global_128.state[i]; - global_128_fast.Whigh = global_128.Whigh; - global_128_fast.Wlow = global_128.Wlow; -} - -void find_best_deadline_avx(char *scoops, uint64_t nonce_count, char *gensig, - uint64_t *best_deadline, uint64_t *best_offset) { - uint64_t d0 = 0, d1 = 0, d2 = 0, d3 = 0; - char term[32]; - write_term(term); - - // local copy of global fast context - mshabal128_context_fast x; - memcpy(&x, &global_128_fast, 
sizeof(global_128_fast)); - - // prepare shabal inputs - union { - mshabal_u32 words[16 * MSHABAL128_VECTOR_SIZE]; - __m128i data[16]; - } u1, u2; - - for (uint64_t i = 0; i < 16 * MSHABAL128_VECTOR_SIZE / 2; i += MSHABAL128_VECTOR_SIZE) { - size_t o = i; - u1.words[i + 0] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 1] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 2] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 3] = *(mshabal_u32 *)(gensig + o); - u2.words[i + 0 + 32] = *(mshabal_u32 *)(term + o); - u2.words[i + 1 + 32] = *(mshabal_u32 *)(term + o); - u2.words[i + 2 + 32] = *(mshabal_u32 *)(term + o); - u2.words[i + 3 + 32] = *(mshabal_u32 *)(term + o); - } - - for (uint64_t i = 0; i < nonce_count;) { - if (i + 4 <= nonce_count) { - // load and align data for SIMD - for (uint64_t j = 0; j < 16 * MSHABAL128_VECTOR_SIZE / 2; j += MSHABAL128_VECTOR_SIZE) { - size_t o = j; - u1.words[j + 0 + 32] = *(mshabal_u32 *)(&scoops[(i + 0) * 64] + o); - u1.words[j + 1 + 32] = *(mshabal_u32 *)(&scoops[(i + 1) * 64] + o); - u1.words[j + 2 + 32] = *(mshabal_u32 *)(&scoops[(i + 2) * 64] + o); - u1.words[j + 3 + 32] = *(mshabal_u32 *)(&scoops[(i + 3) * 64] + o); - u2.words[j + 0] = *(mshabal_u32 *)(&scoops[(i + 0) * 64 + 32] + o); - u2.words[j + 1] = *(mshabal_u32 *)(&scoops[(i + 1) * 64 + 32] + o); - u2.words[j + 2] = *(mshabal_u32 *)(&scoops[(i + 2) * 64 + 32] + o); - u2.words[j + 3] = *(mshabal_u32 *)(&scoops[(i + 3) * 64 + 32] + o); - } - - mshabal_deadline_fast_avx(&x, &u1, &u2, &d0, &d1, &d2, &d3); - - SET_BEST_DEADLINE(d0, i + 0); - SET_BEST_DEADLINE(d1, i + 1); - SET_BEST_DEADLINE(d2, i + 2); - SET_BEST_DEADLINE(d3, i + 3); - i += 4; - } else { - sph_shabal_deadline_fast(&scoops[i * 64], gensig, &d0); - SET_BEST_DEADLINE(d0, i); - i++; - } - } -} diff --git a/src/c/shabal_avx.h b/src/c/shabal_avx.h deleted file mode 100644 index b6b7dd4..0000000 --- a/src/c/shabal_avx.h +++ /dev/null @@ -1,9 +0,0 @@ -#pragma once - -#include <stddef.h> -#include <stdint.h> - -void init_shabal_avx(); - -void find_best_deadline_avx(char *scoops, uint64_t nonce_count, char *gensig, - uint64_t *best_deadline, uint64_t *best_offset); diff --git a/src/c/shabal_avx2.c b/src/c/shabal_avx2.c deleted file mode 100644 index 4802bff..0000000 --- a/src/c/shabal_avx2.c +++ /dev/null @@ -1,95 +0,0 @@ -#include "shabal_avx2.h" -#include <immintrin.h> -#include <string.h> -#include "common.h" -#include "mshabal_256_avx2.h" -#include "sph_shabal.h" - -mshabal256_context global_256; -mshabal256_context_fast global_256_fast; - -void init_shabal_avx2() { - mshabal_init_avx2(&global_256, 256); - global_256_fast.out_size = global_256.out_size; - for (uint64_t i = 0; i < 352; i++) global_256_fast.state[i] = global_256.state[i]; - global_256_fast.Whigh = global_256.Whigh; - global_256_fast.Wlow = global_256.Wlow; -} - -void find_best_deadline_avx2(char *scoops, uint64_t nonce_count, char *gensig, - uint64_t *best_deadline, uint64_t *best_offset) { - uint64_t d0 = 0, d1 = 0, d2 = 0, d3 = 0, d4 = 0, d5 = 0, d6 = 0, d7 = 0; - char term[32]; - write_term(term); - - // local copy of global fast context - mshabal256_context_fast x; - memcpy(&x, &global_256_fast, sizeof(global_256_fast)); - - // prepare shabal inputs - union { - mshabal_u32 words[16 * MSHABAL256_VECTOR_SIZE]; - __m256i data[16]; - } u1, u2; - - for (uint64_t i = 0; i < 16 * MSHABAL256_VECTOR_SIZE / 2; i += MSHABAL256_VECTOR_SIZE) { - size_t o = i / 2; - u1.words[i + 0] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 1] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 2] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 3] = 
*(mshabal_u32 *)(gensig + o); - u1.words[i + 4] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 5] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 6] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 7] = *(mshabal_u32 *)(gensig + o); - u2.words[i + 0 + 64] = *(mshabal_u32 *)(term + o); - u2.words[i + 1 + 64] = *(mshabal_u32 *)(term + o); - u2.words[i + 2 + 64] = *(mshabal_u32 *)(term + o); - u2.words[i + 3 + 64] = *(mshabal_u32 *)(term + o); - u2.words[i + 4 + 64] = *(mshabal_u32 *)(term + o); - u2.words[i + 5 + 64] = *(mshabal_u32 *)(term + o); - u2.words[i + 6 + 64] = *(mshabal_u32 *)(term + o); - u2.words[i + 7 + 64] = *(mshabal_u32 *)(term + o); - } - - for (uint64_t i = 0; i < nonce_count;) { - if (i + 8 <= nonce_count) { - // load and align data for SIMD - for (uint64_t j = 0; j < 16 * MSHABAL256_VECTOR_SIZE / 2; j += MSHABAL256_VECTOR_SIZE) { - size_t o = j / 2; - u1.words[j + 0 + 64] = *(mshabal_u32 *)(&scoops[(i + 0) * 64] + o); - u1.words[j + 1 + 64] = *(mshabal_u32 *)(&scoops[(i + 1) * 64] + o); - u1.words[j + 2 + 64] = *(mshabal_u32 *)(&scoops[(i + 2) * 64] + o); - u1.words[j + 3 + 64] = *(mshabal_u32 *)(&scoops[(i + 3) * 64] + o); - u1.words[j + 4 + 64] = *(mshabal_u32 *)(&scoops[(i + 4) * 64] + o); - u1.words[j + 5 + 64] = *(mshabal_u32 *)(&scoops[(i + 5) * 64] + o); - u1.words[j + 6 + 64] = *(mshabal_u32 *)(&scoops[(i + 6) * 64] + o); - u1.words[j + 7 + 64] = *(mshabal_u32 *)(&scoops[(i + 7) * 64] + o); - u2.words[j + 0] = *(mshabal_u32 *)(&scoops[(i + 0) * 64 + 32] + o); - u2.words[j + 1] = *(mshabal_u32 *)(&scoops[(i + 1) * 64 + 32] + o); - u2.words[j + 2] = *(mshabal_u32 *)(&scoops[(i + 2) * 64 + 32] + o); - u2.words[j + 3] = *(mshabal_u32 *)(&scoops[(i + 3) * 64 + 32] + o); - u2.words[j + 4] = *(mshabal_u32 *)(&scoops[(i + 4) * 64 + 32] + o); - u2.words[j + 5] = *(mshabal_u32 *)(&scoops[(i + 5) * 64 + 32] + o); - u2.words[j + 6] = *(mshabal_u32 *)(&scoops[(i + 6) * 64 + 32] + o); - u2.words[j + 7] = *(mshabal_u32 *)(&scoops[(i + 7) * 64 + 32] + o); - } - - mshabal_deadline_fast_avx2(&x, &u1, &u2, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); - - SET_BEST_DEADLINE(d0, i + 0); - SET_BEST_DEADLINE(d1, i + 1); - SET_BEST_DEADLINE(d2, i + 2); - SET_BEST_DEADLINE(d3, i + 3); - SET_BEST_DEADLINE(d4, i + 4); - SET_BEST_DEADLINE(d5, i + 5); - SET_BEST_DEADLINE(d6, i + 6); - SET_BEST_DEADLINE(d7, i + 7); - i += 8; - } else { - sph_shabal_deadline_fast(&scoops[i * 64], gensig, &d0); - SET_BEST_DEADLINE(d0, i); - i++; - } - } -} diff --git a/src/c/shabal_avx2.h b/src/c/shabal_avx2.h deleted file mode 100644 index fa433b6..0000000 --- a/src/c/shabal_avx2.h +++ /dev/null @@ -1,9 +0,0 @@ -#pragma once - -#include <stddef.h> -#include <stdint.h> - -void init_shabal_avx2(); - -void find_best_deadline_avx2(char *scoops, uint64_t nonce_count, char *gensig, - uint64_t *best_deadline, uint64_t *best_offset); diff --git a/src/c/shabal_avx512f.c b/src/c/shabal_avx512f.c deleted file mode 100644 index 3b7b1b3..0000000 --- a/src/c/shabal_avx512f.c +++ /dev/null @@ -1,138 +0,0 @@ -#include "shabal_avx512f.h" -#include <immintrin.h> -#include <string.h> -#include "common.h" -#include "mshabal_512_avx512f.h" -#include "sph_shabal.h" - -mshabal512_context global_512; -mshabal512_context_fast global_512_fast; - -void init_shabal_avx512f() { - mshabal_init_avx512f(&global_512, 256); - global_512_fast.out_size = global_512.out_size; - for (uint64_t i = 0; i < 704; i++) global_512_fast.state[i] = global_512.state[i]; - global_512_fast.Whigh = global_512.Whigh; - global_512_fast.Wlow = global_512.Wlow; -} - -void find_best_deadline_avx512f(char *scoops, 
uint64_t nonce_count, char *gensig, - uint64_t *best_deadline, uint64_t *best_offset) { - uint64_t d0 = 0, d1 = 0, d2 = 0, d3 = 0, d4 = 0, d5 = 0, d6 = 0, d7 = 0, d8 = 0, d9 = 0, - d10 = 0, d11 = 0, d12 = 0, d13 = 0, d14 = 0, d15 = 0; - char term[32]; - write_term(term); - - // local copy of global fast context - mshabal512_context_fast x; - memcpy(&x, &global_512_fast, sizeof(global_512_fast)); - - // prepare shabal inputs - union { - mshabal_u32 words[16 * MSHABAL512_VECTOR_SIZE]; - __m512i data[16]; - } u1, u2; - - for (uint64_t i = 0; i < 16 * MSHABAL512_VECTOR_SIZE / 2; i += MSHABAL512_VECTOR_SIZE) { - size_t o = i / 4; - u1.words[i + 0] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 1] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 2] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 3] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 4] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 5] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 6] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 7] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 8] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 9] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 10] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 11] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 12] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 13] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 14] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 15] = *(mshabal_u32 *)(gensig + o); - u2.words[i + 0 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 1 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 2 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 3 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 4 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 5 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 6 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 7 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 8 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 9 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 10 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 11 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 12 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 13 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 14 + 128] = *(mshabal_u32 *)(term + o); - u2.words[i + 15 + 128] = *(mshabal_u32 *)(term + o); - } - - for (uint64_t i = 0; i < nonce_count;) { - if (i + 16 <= nonce_count) { - // load and align data for SIMD - - for (uint64_t j = 0; j < 16 * MSHABAL512_VECTOR_SIZE / 2; j += MSHABAL512_VECTOR_SIZE) { - size_t o = j / 4; - u1.words[j + 0 + 128] = *(mshabal_u32 *)(&scoops[(i + 0) * 64] + o); - u1.words[j + 1 + 128] = *(mshabal_u32 *)(&scoops[(i + 1) * 64] + o); - u1.words[j + 2 + 128] = *(mshabal_u32 *)(&scoops[(i + 2) * 64] + o); - u1.words[j + 3 + 128] = *(mshabal_u32 *)(&scoops[(i + 3) * 64] + o); - u1.words[j + 4 + 128] = *(mshabal_u32 *)(&scoops[(i + 4) * 64] + o); - u1.words[j + 5 + 128] = *(mshabal_u32 *)(&scoops[(i + 5) * 64] + o); - u1.words[j + 6 + 128] = *(mshabal_u32 *)(&scoops[(i + 6) * 64] + o); - u1.words[j + 7 + 128] = *(mshabal_u32 *)(&scoops[(i + 7) * 64] + o); - u1.words[j + 8 + 128] = *(mshabal_u32 *)(&scoops[(i + 8) * 64] + o); - u1.words[j + 9 + 128] = *(mshabal_u32 *)(&scoops[(i + 9) * 64] + o); - u1.words[j + 10 + 128] = *(mshabal_u32 *)(&scoops[(i + 10) * 64] + o); - u1.words[j + 11 + 128] = *(mshabal_u32 *)(&scoops[(i + 11) * 64] + o); - u1.words[j + 12 + 128] = *(mshabal_u32 *)(&scoops[(i + 12) * 64] + o); - u1.words[j + 13 + 128] = *(mshabal_u32 *)(&scoops[(i + 13) * 64] + o); - u1.words[j + 
14 + 128] = *(mshabal_u32 *)(&scoops[(i + 14) * 64] + o); - u1.words[j + 15 + 128] = *(mshabal_u32 *)(&scoops[(i + 15) * 64] + o); - u2.words[j + 0] = *(mshabal_u32 *)(&scoops[(i + 0) * 64 + 32] + o); - u2.words[j + 1] = *(mshabal_u32 *)(&scoops[(i + 1) * 64 + 32] + o); - u2.words[j + 2] = *(mshabal_u32 *)(&scoops[(i + 2) * 64 + 32] + o); - u2.words[j + 3] = *(mshabal_u32 *)(&scoops[(i + 3) * 64 + 32] + o); - u2.words[j + 4] = *(mshabal_u32 *)(&scoops[(i + 4) * 64 + 32] + o); - u2.words[j + 5] = *(mshabal_u32 *)(&scoops[(i + 5) * 64 + 32] + o); - u2.words[j + 6] = *(mshabal_u32 *)(&scoops[(i + 6) * 64 + 32] + o); - u2.words[j + 7] = *(mshabal_u32 *)(&scoops[(i + 7) * 64 + 32] + o); - u2.words[j + 8] = *(mshabal_u32 *)(&scoops[(i + 8) * 64 + 32] + o); - u2.words[j + 9] = *(mshabal_u32 *)(&scoops[(i + 9) * 64 + 32] + o); - u2.words[j + 10] = *(mshabal_u32 *)(&scoops[(i + 10) * 64 + 32] + o); - u2.words[j + 11] = *(mshabal_u32 *)(&scoops[(i + 11) * 64 + 32] + o); - u2.words[j + 12] = *(mshabal_u32 *)(&scoops[(i + 12) * 64 + 32] + o); - u2.words[j + 13] = *(mshabal_u32 *)(&scoops[(i + 13) * 64 + 32] + o); - u2.words[j + 14] = *(mshabal_u32 *)(&scoops[(i + 14) * 64 + 32] + o); - u2.words[j + 15] = *(mshabal_u32 *)(&scoops[(i + 15) * 64 + 32] + o); - } - - mshabal_deadline_fast_avx512f(&x, &u1, &u2, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, - &d8, &d9, &d10, &d11, &d12, &d13, &d14, &d15); - - SET_BEST_DEADLINE(d0, i + 0); - SET_BEST_DEADLINE(d1, i + 1); - SET_BEST_DEADLINE(d2, i + 2); - SET_BEST_DEADLINE(d3, i + 3); - SET_BEST_DEADLINE(d4, i + 4); - SET_BEST_DEADLINE(d5, i + 5); - SET_BEST_DEADLINE(d6, i + 6); - SET_BEST_DEADLINE(d7, i + 7); - SET_BEST_DEADLINE(d8, i + 8); - SET_BEST_DEADLINE(d9, i + 9); - SET_BEST_DEADLINE(d10, i + 10); - SET_BEST_DEADLINE(d11, i + 11); - SET_BEST_DEADLINE(d12, i + 12); - SET_BEST_DEADLINE(d13, i + 13); - SET_BEST_DEADLINE(d14, i + 14); - SET_BEST_DEADLINE(d15, i + 15); - i += 16; - } else { - sph_shabal_deadline_fast(&scoops[i * 64], gensig, &d0); - SET_BEST_DEADLINE(d0, i); - i++; - } - } -} diff --git a/src/c/shabal_avx512f.h b/src/c/shabal_avx512f.h deleted file mode 100644 index 0a6f1ed..0000000 --- a/src/c/shabal_avx512f.h +++ /dev/null @@ -1,9 +0,0 @@ -#pragma once - -#include -#include - -void init_shabal_avx512f(); - -void find_best_deadline_avx512f(char *scoops, uint64_t nonce_count, char *gensig, - uint64_t *best_deadline, uint64_t *best_offset); diff --git a/src/c/shabal_neon.c b/src/c/shabal_neon.c deleted file mode 100644 index 313642b..0000000 --- a/src/c/shabal_neon.c +++ /dev/null @@ -1,75 +0,0 @@ -#include "shabal_neon.h" -#include "SSE2NEON.h" -#include -#include "common.h" -#include "mshabal_128_neon.h" -#include "sph_shabal.h" - -mshabal128_context global_128; -mshabal128_context_fast global_128_fast; - -void init_shabal_neon() { - mshabal_init_neon(&global_128, 256); - global_128_fast.out_size = global_128.out_size; - for (uint64_t i = 0; i < 176; i++) global_128_fast.state[i] = global_128.state[i]; - global_128_fast.Whigh = global_128.Whigh; - global_128_fast.Wlow = global_128.Wlow; -} - -void find_best_deadline_neon(char *scoops, uint64_t nonce_count, char *gensig, - uint64_t *best_deadline, uint64_t *best_offset) { - uint64_t d0 = 0, d1 = 0, d2 = 0, d3 = 0; - char term[32]; - write_term(term); - - // local copy of global fast context - mshabal128_context_fast x; - memcpy(&x, &global_128_fast, sizeof(global_128_fast)); - - // prepare shabal inputs - union { - mshabal_u32 words[16 * MSHABAL128_VECTOR_SIZE]; - __m128i data[16]; - } u1, u2; - 
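    /*
     * Layout note, as a minimal sketch (assuming MSHABAL128_VECTOR_SIZE == 4):
     * mshabal interleaves its four lanes word by word, so 32-bit word w of
     * lane l lives at words[w * 4 + l], and broadcasting one word v to every
     * lane amounts to
     *
     *     for (size_t l = 0; l < 4; l++)
     *         words[w * 4 + l] = v;
     *
     * The generation signature and the termination block are identical for
     * every nonce, so the loop below fills their lanes once, up front; only
     * the per-nonce scoop halves are reloaded inside the main loop. The AVX2
     * and AVX-512F variants above follow the same pattern with 8 and 16 lanes.
     */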
- for (uint64_t i = 0; i < 16 * MSHABAL128_VECTOR_SIZE / 2; i += MSHABAL128_VECTOR_SIZE) { - size_t o = i; - u1.words[i + 0] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 1] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 2] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 3] = *(mshabal_u32 *)(gensig + o); - u2.words[i + 0 + 32] = *(mshabal_u32 *)(term + o); - u2.words[i + 1 + 32] = *(mshabal_u32 *)(term + o); - u2.words[i + 2 + 32] = *(mshabal_u32 *)(term + o); - u2.words[i + 3 + 32] = *(mshabal_u32 *)(term + o); - } - - for (uint64_t i = 0; i < nonce_count;) { - if (i + 4 <= nonce_count) { - // load and align data for SIMD - for (uint64_t j = 0; j < 16 * MSHABAL128_VECTOR_SIZE / 2; j += MSHABAL128_VECTOR_SIZE) { - size_t o = j; - u1.words[j + 0 + 32] = *(mshabal_u32 *)(&scoops[(i + 0) * 64] + o); - u1.words[j + 1 + 32] = *(mshabal_u32 *)(&scoops[(i + 1) * 64] + o); - u1.words[j + 2 + 32] = *(mshabal_u32 *)(&scoops[(i + 2) * 64] + o); - u1.words[j + 3 + 32] = *(mshabal_u32 *)(&scoops[(i + 3) * 64] + o); - u2.words[j + 0] = *(mshabal_u32 *)(&scoops[(i + 0) * 64 + 32] + o); - u2.words[j + 1] = *(mshabal_u32 *)(&scoops[(i + 1) * 64 + 32] + o); - u2.words[j + 2] = *(mshabal_u32 *)(&scoops[(i + 2) * 64 + 32] + o); - u2.words[j + 3] = *(mshabal_u32 *)(&scoops[(i + 3) * 64 + 32] + o); - } - - mshabal_deadline_fast_neon(&x, &u1, &u2, &d0, &d1, &d2, &d3); - - SET_BEST_DEADLINE(d0, i + 0); - SET_BEST_DEADLINE(d1, i + 1); - SET_BEST_DEADLINE(d2, i + 2); - SET_BEST_DEADLINE(d3, i + 3); - i += 4; - } else { - sph_shabal_deadline_fast(&scoops[i * 64], gensig, &d0); - SET_BEST_DEADLINE(d0, i); - i++; - } - } -} diff --git a/src/c/shabal_neon.h b/src/c/shabal_neon.h deleted file mode 100644 index faca1c1..0000000 --- a/src/c/shabal_neon.h +++ /dev/null @@ -1,9 +0,0 @@ -#pragma once - -#include -#include - -void init_shabal_neon(); - -void find_best_deadline_neon(char *scoops, uint64_t nonce_count, char *gensig, - uint64_t *best_deadline, uint64_t *best_offset); diff --git a/src/c/shabal_sse2.c b/src/c/shabal_sse2.c deleted file mode 100644 index 4f5e593..0000000 --- a/src/c/shabal_sse2.c +++ /dev/null @@ -1,75 +0,0 @@ -#include "shabal_sse2.h" -#include -#include -#include "common.h" -#include "mshabal_128_sse2.h" -#include "sph_shabal.h" - -mshabal128_context global_128; -mshabal128_context_fast global_128_fast; - -void init_shabal_sse2() { - mshabal_init_sse2(&global_128, 256); - global_128_fast.out_size = global_128.out_size; - for (uint64_t i = 0; i < 176; i++) global_128_fast.state[i] = global_128.state[i]; - global_128_fast.Whigh = global_128.Whigh; - global_128_fast.Wlow = global_128.Wlow; -} - -void find_best_deadline_sse2(char *scoops, uint64_t nonce_count, char *gensig, - uint64_t *best_deadline, uint64_t *best_offset) { - uint64_t d0 = 0, d1 = 0, d2 = 0, d3 = 0; - char term[32]; - write_term(term); - - // local copy of global fast context - mshabal128_context_fast x; - memcpy(&x, &global_128_fast, sizeof(global_128_fast)); - - // prepare shabal inputs - union { - mshabal_u32 words[16 * MSHABAL128_VECTOR_SIZE]; - __m128i data[16]; - } u1, u2; - - for (uint64_t i = 0; i < 16 * MSHABAL128_VECTOR_SIZE / 2; i += MSHABAL128_VECTOR_SIZE) { - size_t o = i; - u1.words[i + 0] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 1] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 2] = *(mshabal_u32 *)(gensig + o); - u1.words[i + 3] = *(mshabal_u32 *)(gensig + o); - u2.words[i + 0 + 32] = *(mshabal_u32 *)(term + o); - u2.words[i + 1 + 32] = *(mshabal_u32 *)(term + o); - u2.words[i + 2 + 32] = *(mshabal_u32 *)(term 
+ o); - u2.words[i + 3 + 32] = *(mshabal_u32 *)(term + o); - } - - for (uint64_t i = 0; i < nonce_count;) { - if (i + 4 <= nonce_count) { - // load and align data for SIMD - for (uint64_t j = 0; j < 16 * MSHABAL128_VECTOR_SIZE / 2; j += MSHABAL128_VECTOR_SIZE) { - size_t o = j; - u1.words[j + 0 + 32] = *(mshabal_u32 *)(&scoops[(i + 0) * 64] + o); - u1.words[j + 1 + 32] = *(mshabal_u32 *)(&scoops[(i + 1) * 64] + o); - u1.words[j + 2 + 32] = *(mshabal_u32 *)(&scoops[(i + 2) * 64] + o); - u1.words[j + 3 + 32] = *(mshabal_u32 *)(&scoops[(i + 3) * 64] + o); - u2.words[j + 0] = *(mshabal_u32 *)(&scoops[(i + 0) * 64 + 32] + o); - u2.words[j + 1] = *(mshabal_u32 *)(&scoops[(i + 1) * 64 + 32] + o); - u2.words[j + 2] = *(mshabal_u32 *)(&scoops[(i + 2) * 64 + 32] + o); - u2.words[j + 3] = *(mshabal_u32 *)(&scoops[(i + 3) * 64 + 32] + o); - } - - mshabal_deadline_fast_sse2(&x, &u1, &u2, &d0, &d1, &d2, &d3); - - SET_BEST_DEADLINE(d0, i + 0); - SET_BEST_DEADLINE(d1, i + 1); - SET_BEST_DEADLINE(d2, i + 2); - SET_BEST_DEADLINE(d3, i + 3); - i += 4; - } else { - sph_shabal_deadline_fast(&scoops[i * 64], gensig, &d0); - SET_BEST_DEADLINE(d0, i); - i++; - } - } -} diff --git a/src/c/shabal_sse2.h b/src/c/shabal_sse2.h deleted file mode 100644 index 56aa827..0000000 --- a/src/c/shabal_sse2.h +++ /dev/null @@ -1,9 +0,0 @@ -#pragma once - -#include -#include - -void init_shabal_sse2(); - -void find_best_deadline_sse2(char *scoops, uint64_t nonce_count, char *gensig, - uint64_t *best_deadline, uint64_t *best_offset); diff --git a/src/c/sph_shabal.c b/src/c/sph_shabal.c deleted file mode 100644 index c1507b7..0000000 --- a/src/c/sph_shabal.c +++ /dev/null @@ -1,693 +0,0 @@ -/* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */ -/* - * Shabal implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include -#include - -#include "sph_shabal.h" - -#ifdef _MSC_VER -#pragma warning(disable : 4146) -#endif - -/* - * Part of this code was automatically generated (the part between - * the "BEGIN" and "END" markers). - */ - -#define sM 16 - -//#define C32 SPH_C32 -//#define T32 SPH_T32 - -//#define O1 13 -//#define O2 9 -//#define O3 6 - -/* - * We copy the state into local variables, so that the compiler knows - * that it can optimize them at will. 
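 * (The DECL_STATE / READ_STATE / WRITE_STATE macros that follow do exactly
 * that: the whole A/B/C state is mirrored into plain locals for the
 * duration of a call, then written back once at the end.)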
- */ - -/* BEGIN -- automatically generated code. */ - -#define DECL_STATE \ - sph_u32 A00, A01, A02, A03, A04, A05, A06, A07, A08, A09, A0A, A0B; \ - sph_u32 B0, B1, B2, B3, B4, B5, B6, B7, B8, B9, BA, BB, BC, BD, BE, BF; \ - sph_u32 C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, CA, CB, CC, CD, CE, CF; \ - sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; \ - sph_u32 Wlow, Whigh; - -#define READ_STATE(state) \ - do { \ - A00 = (state)->A[0]; \ - A01 = (state)->A[1]; \ - A02 = (state)->A[2]; \ - A03 = (state)->A[3]; \ - A04 = (state)->A[4]; \ - A05 = (state)->A[5]; \ - A06 = (state)->A[6]; \ - A07 = (state)->A[7]; \ - A08 = (state)->A[8]; \ - A09 = (state)->A[9]; \ - A0A = (state)->A[10]; \ - A0B = (state)->A[11]; \ - B0 = (state)->B[0]; \ - B1 = (state)->B[1]; \ - B2 = (state)->B[2]; \ - B3 = (state)->B[3]; \ - B4 = (state)->B[4]; \ - B5 = (state)->B[5]; \ - B6 = (state)->B[6]; \ - B7 = (state)->B[7]; \ - B8 = (state)->B[8]; \ - B9 = (state)->B[9]; \ - BA = (state)->B[10]; \ - BB = (state)->B[11]; \ - BC = (state)->B[12]; \ - BD = (state)->B[13]; \ - BE = (state)->B[14]; \ - BF = (state)->B[15]; \ - C0 = (state)->C[0]; \ - C1 = (state)->C[1]; \ - C2 = (state)->C[2]; \ - C3 = (state)->C[3]; \ - C4 = (state)->C[4]; \ - C5 = (state)->C[5]; \ - C6 = (state)->C[6]; \ - C7 = (state)->C[7]; \ - C8 = (state)->C[8]; \ - C9 = (state)->C[9]; \ - CA = (state)->C[10]; \ - CB = (state)->C[11]; \ - CC = (state)->C[12]; \ - CD = (state)->C[13]; \ - CE = (state)->C[14]; \ - CF = (state)->C[15]; \ - Wlow = (state)->Wlow; \ - Whigh = (state)->Whigh; \ - } while (0) - -#define WRITE_STATE(state) \ - do { \ - (state)->A[0] = A00; \ - (state)->A[1] = A01; \ - (state)->A[2] = A02; \ - (state)->A[3] = A03; \ - (state)->A[4] = A04; \ - (state)->A[5] = A05; \ - (state)->A[6] = A06; \ - (state)->A[7] = A07; \ - (state)->A[8] = A08; \ - (state)->A[9] = A09; \ - (state)->A[10] = A0A; \ - (state)->A[11] = A0B; \ - (state)->B[0] = B0; \ - (state)->B[1] = B1; \ - (state)->B[2] = B2; \ - (state)->B[3] = B3; \ - (state)->B[4] = B4; \ - (state)->B[5] = B5; \ - (state)->B[6] = B6; \ - (state)->B[7] = B7; \ - (state)->B[8] = B8; \ - (state)->B[9] = B9; \ - (state)->B[10] = BA; \ - (state)->B[11] = BB; \ - (state)->B[12] = BC; \ - (state)->B[13] = BD; \ - (state)->B[14] = BE; \ - (state)->B[15] = BF; \ - (state)->C[0] = C0; \ - (state)->C[1] = C1; \ - (state)->C[2] = C2; \ - (state)->C[3] = C3; \ - (state)->C[4] = C4; \ - (state)->C[5] = C5; \ - (state)->C[6] = C6; \ - (state)->C[7] = C7; \ - (state)->C[8] = C8; \ - (state)->C[9] = C9; \ - (state)->C[10] = CA; \ - (state)->C[11] = CB; \ - (state)->C[12] = CC; \ - (state)->C[13] = CD; \ - (state)->C[14] = CE; \ - (state)->C[15] = CF; \ - (state)->Wlow = Wlow; \ - (state)->Whigh = Whigh; \ - } while (0) - -#define DECODE_BLOCK \ - do { \ - M0 = sph_dec32le_aligned(buf + 0); \ - M1 = sph_dec32le_aligned(buf + 4); \ - M2 = sph_dec32le_aligned(buf + 8); \ - M3 = sph_dec32le_aligned(buf + 12); \ - M4 = sph_dec32le_aligned(buf + 16); \ - M5 = sph_dec32le_aligned(buf + 20); \ - M6 = sph_dec32le_aligned(buf + 24); \ - M7 = sph_dec32le_aligned(buf + 28); \ - M8 = sph_dec32le_aligned(buf + 32); \ - M9 = sph_dec32le_aligned(buf + 36); \ - MA = sph_dec32le_aligned(buf + 40); \ - MB = sph_dec32le_aligned(buf + 44); \ - MC = sph_dec32le_aligned(buf + 48); \ - MD = sph_dec32le_aligned(buf + 52); \ - ME = sph_dec32le_aligned(buf + 56); \ - MF = sph_dec32le_aligned(buf + 60); \ - } while (0) - -#define INPUT_BLOCK_ADD \ - do { \ - B0 = SPH_T32(B0 + M0); \ - B1 = SPH_T32(B1 + 
M1); \ - B2 = SPH_T32(B2 + M2); \ - B3 = SPH_T32(B3 + M3); \ - B4 = SPH_T32(B4 + M4); \ - B5 = SPH_T32(B5 + M5); \ - B6 = SPH_T32(B6 + M6); \ - B7 = SPH_T32(B7 + M7); \ - B8 = SPH_T32(B8 + M8); \ - B9 = SPH_T32(B9 + M9); \ - BA = SPH_T32(BA + MA); \ - BB = SPH_T32(BB + MB); \ - BC = SPH_T32(BC + MC); \ - BD = SPH_T32(BD + MD); \ - BE = SPH_T32(BE + ME); \ - BF = SPH_T32(BF + MF); \ - } while (0) - -#define INPUT_BLOCK_SUB \ - do { \ - C0 = SPH_T32(C0 - M0); \ - C1 = SPH_T32(C1 - M1); \ - C2 = SPH_T32(C2 - M2); \ - C3 = SPH_T32(C3 - M3); \ - C4 = SPH_T32(C4 - M4); \ - C5 = SPH_T32(C5 - M5); \ - C6 = SPH_T32(C6 - M6); \ - C7 = SPH_T32(C7 - M7); \ - C8 = SPH_T32(C8 - M8); \ - C9 = SPH_T32(C9 - M9); \ - CA = SPH_T32(CA - MA); \ - CB = SPH_T32(CB - MB); \ - CC = SPH_T32(CC - MC); \ - CD = SPH_T32(CD - MD); \ - CE = SPH_T32(CE - ME); \ - CF = SPH_T32(CF - MF); \ - } while (0) - -#define XOR_W \ - do { \ - A00 ^= Wlow; \ - A01 ^= Whigh; \ - } while (0) - -#define SWAP(v1, v2) \ - do { \ - sph_u32 tmp = (v1); \ - (v1) = (v2); \ - (v2) = tmp; \ - } while (0) - -#define SWAP_BC \ - do { \ - SWAP(B0, C0); \ - SWAP(B1, C1); \ - SWAP(B2, C2); \ - SWAP(B3, C3); \ - SWAP(B4, C4); \ - SWAP(B5, C5); \ - SWAP(B6, C6); \ - SWAP(B7, C7); \ - SWAP(B8, C8); \ - SWAP(B9, C9); \ - SWAP(BA, CA); \ - SWAP(BB, CB); \ - SWAP(BC, CC); \ - SWAP(BD, CD); \ - SWAP(BE, CE); \ - SWAP(BF, CF); \ - } while (0) - -#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ - do { \ - xa0 = SPH_T32((xa0 ^ (((xa1 << 15) | (xa1 >> 17)) * 5U) ^ xc) * 3U) ^ xb1 ^ (xb2 & ~xb3) ^ \ - xm; \ - xb0 = SPH_T32(~(((xb0 << 1) | (xb0 >> 31)) ^ xa0)); \ - } while (0) - -#define PERM_STEP_0 \ - do { \ - PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \ - PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \ - PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \ - PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \ - PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \ - PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \ - PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \ - PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \ - PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \ - PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \ - PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \ - PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \ - PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \ - PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \ - PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \ - PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \ - } while (0) - -#define PERM_STEP_1 \ - do { \ - PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \ - PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \ - PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \ - PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \ - PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \ - PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \ - PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \ - PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \ - PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \ - PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \ - PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \ - PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \ - PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \ - PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \ - PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \ - PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \ - } while (0) - -#define PERM_STEP_2 \ - do { \ - PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \ - PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \ - PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \ - PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \ - PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \ - 
PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \ - PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \ - PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \ - PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \ - PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \ - PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \ - PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \ - PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \ - PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \ - PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \ - PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \ - } while (0) - -#define APPLY_P \ - do { \ - B0 = SPH_T32(B0 << 17) | (B0 >> 15); \ - B1 = SPH_T32(B1 << 17) | (B1 >> 15); \ - B2 = SPH_T32(B2 << 17) | (B2 >> 15); \ - B3 = SPH_T32(B3 << 17) | (B3 >> 15); \ - B4 = SPH_T32(B4 << 17) | (B4 >> 15); \ - B5 = SPH_T32(B5 << 17) | (B5 >> 15); \ - B6 = SPH_T32(B6 << 17) | (B6 >> 15); \ - B7 = SPH_T32(B7 << 17) | (B7 >> 15); \ - B8 = SPH_T32(B8 << 17) | (B8 >> 15); \ - B9 = SPH_T32(B9 << 17) | (B9 >> 15); \ - BA = SPH_T32(BA << 17) | (BA >> 15); \ - BB = SPH_T32(BB << 17) | (BB >> 15); \ - BC = SPH_T32(BC << 17) | (BC >> 15); \ - BD = SPH_T32(BD << 17) | (BD >> 15); \ - BE = SPH_T32(BE << 17) | (BE >> 15); \ - BF = SPH_T32(BF << 17) | (BF >> 15); \ - PERM_STEP_0; \ - PERM_STEP_1; \ - PERM_STEP_2; \ - A0B = SPH_T32(A0B + C6); \ - A0A = SPH_T32(A0A + C5); \ - A09 = SPH_T32(A09 + C4); \ - A08 = SPH_T32(A08 + C3); \ - A07 = SPH_T32(A07 + C2); \ - A06 = SPH_T32(A06 + C1); \ - A05 = SPH_T32(A05 + C0); \ - A04 = SPH_T32(A04 + CF); \ - A03 = SPH_T32(A03 + CE); \ - A02 = SPH_T32(A02 + CD); \ - A01 = SPH_T32(A01 + CC); \ - A00 = SPH_T32(A00 + CB); \ - A0B = SPH_T32(A0B + CA); \ - A0A = SPH_T32(A0A + C9); \ - A09 = SPH_T32(A09 + C8); \ - A08 = SPH_T32(A08 + C7); \ - A07 = SPH_T32(A07 + C6); \ - A06 = SPH_T32(A06 + C5); \ - A05 = SPH_T32(A05 + C4); \ - A04 = SPH_T32(A04 + C3); \ - A03 = SPH_T32(A03 + C2); \ - A02 = SPH_T32(A02 + C1); \ - A01 = SPH_T32(A01 + C0); \ - A00 = SPH_T32(A00 + CF); \ - A0B = SPH_T32(A0B + CE); \ - A0A = SPH_T32(A0A + CD); \ - A09 = SPH_T32(A09 + CC); \ - A08 = SPH_T32(A08 + CB); \ - A07 = SPH_T32(A07 + CA); \ - A06 = SPH_T32(A06 + C9); \ - A05 = SPH_T32(A05 + C8); \ - A04 = SPH_T32(A04 + C7); \ - A03 = SPH_T32(A03 + C6); \ - A02 = SPH_T32(A02 + C5); \ - A01 = SPH_T32(A01 + C4); \ - A00 = SPH_T32(A00 + C3); \ - } while (0) - -#define INCR_W \ - do { \ - if ((Wlow = SPH_T32(Wlow + 1)) == 0) Whigh = SPH_T32(Whigh + 1); \ - } while (0) - -static const sph_u32 A_init_256[] = {SPH_C32(0x52F84552), SPH_C32(0xE54B7999), SPH_C32(0x2D8EE3EC), - SPH_C32(0xB9645191), SPH_C32(0xE0078B86), SPH_C32(0xBB7C44C9), - SPH_C32(0xD2B5C1CA), SPH_C32(0xB0D2EB8C), SPH_C32(0x14CE5A45), - SPH_C32(0x22AF50DC), SPH_C32(0xEFFDBC6B), SPH_C32(0xEB21B74A)}; - -static const sph_u32 B_init_256[] = { - SPH_C32(0xB555C6EE), SPH_C32(0x3E710596), SPH_C32(0xA72A652F), SPH_C32(0x9301515F), - SPH_C32(0xDA28C1FA), SPH_C32(0x696FD868), SPH_C32(0x9CB6BF72), SPH_C32(0x0AFE4002), - SPH_C32(0xA6E03615), SPH_C32(0x5138C1D4), SPH_C32(0xBE216306), SPH_C32(0xB38B8890), - SPH_C32(0x3EA8B96B), SPH_C32(0x3299ACE4), SPH_C32(0x30924DD4), SPH_C32(0x55CB34A5)}; - -static const sph_u32 C_init_256[] = { - SPH_C32(0xB405F031), SPH_C32(0xC4233EBA), SPH_C32(0xB3733979), SPH_C32(0xC0DD9D55), - SPH_C32(0xC51C28AE), SPH_C32(0xA327B8E1), SPH_C32(0x56C56167), SPH_C32(0xED614433), - SPH_C32(0x88B59D60), SPH_C32(0x60E2CEBA), SPH_C32(0x758B4B8B), SPH_C32(0x83E82A7F), - SPH_C32(0xBC968828), SPH_C32(0xE6E00BF7), SPH_C32(0xBA839E55), SPH_C32(0x9B491C60)}; - -/* END -- automatically generated 
code. */ - -void sph_shabal256_init(sph_shabal_context* cc) { - /* - * We have precomputed initial states for all the supported - * output bit lengths. - */ - // const sph_u32 *A_init, *B_init, *C_init; - // sph_shabal_context *sc; - - // A_init = A_init_256; - // B_init = B_init_256; - // C_init = C_init_256; - - // sc = (sph_shabal_context *) cc; - memcpy(cc->A, A_init_256, sizeof cc->A); - memcpy(cc->B, B_init_256, sizeof cc->B); - memcpy(cc->C, C_init_256, sizeof cc->C); - cc->Wlow = 1; - cc->Whigh = 0; - cc->ptr = 0; -} - -void sph_shabal256(void* cc, const unsigned char* data, size_t len) { - sph_shabal_context* sc; - unsigned char* buf; - size_t ptr; - DECL_STATE - - sc = (sph_shabal_context*)cc; - buf = sc->buf; - ptr = sc->ptr; - - /* - * We do not want to copy the state to local variables if the - * amount of data is less than what is needed to complete the - * current block. Note that it is anyway suboptimal to call - * this method many times for small chunks of data. - */ - if (len < (sizeof sc->buf) - ptr) { - memcpy(buf + ptr, data, len); - ptr += len; - sc->ptr = ptr; - return; - } - - READ_STATE(sc); - while (len > 0) { - size_t clen; - - clen = (sizeof sc->buf) - ptr; - if (clen > len) clen = len; - memcpy(buf + ptr, data, clen); - ptr += clen; - data += clen; - len -= clen; - if (ptr == sizeof sc->buf) { - DECODE_BLOCK; - INPUT_BLOCK_ADD; - XOR_W; - APPLY_P; - INPUT_BLOCK_SUB; - SWAP_BC; - INCR_W; - ptr = 0; - } - } - WRITE_STATE(sc); - sc->ptr = ptr; -} - -static void shabal_close(void* cc, unsigned ub, unsigned n, void* dst, unsigned size_words) { - sph_shabal_context* sc; - unsigned char* buf; - size_t ptr; - unsigned z; - union { - unsigned char tmp_out[64]; - sph_u32 dummy; - } u; - size_t out_len; - DECL_STATE - - sc = (sph_shabal_context*)cc; - buf = sc->buf; - ptr = sc->ptr; - z = 0x80 >> n; - buf[ptr] = ((ub & -z) | z) & 0xFF; - memset(buf + ptr + 1, 0, (sizeof sc->buf) - (ptr + 1)); - READ_STATE(sc); - DECODE_BLOCK; - INPUT_BLOCK_ADD; - XOR_W; - APPLY_P; - //#pragma loop(hint_parallel(3)) - for (int i = 0; i < 3; i++) { - SWAP_BC; - XOR_W; - APPLY_P; - } - - /* - * We just use our local variables; no need to go through - * the state structure. In order to share some code, we - * emit the relevant words into a temporary buffer, which - * we finally copy into the destination array. 
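 * (The three SWAP_BC / XOR_W / APPLY_P iterations above implement Shabal's
 * finalization: the permutation is applied three extra times to the final
 * block, with the counter W left unchanged, before the digest is read out
 * of the B words.)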
- */ - - sph_enc32le_aligned(u.tmp_out + 32, B8); - - sph_enc32le_aligned(u.tmp_out + 36, B9); - - sph_enc32le_aligned(u.tmp_out + 40, BA); - sph_enc32le_aligned(u.tmp_out + 44, BB); - sph_enc32le_aligned(u.tmp_out + 48, BC); - sph_enc32le_aligned(u.tmp_out + 52, BD); - sph_enc32le_aligned(u.tmp_out + 56, BE); - sph_enc32le_aligned(u.tmp_out + 60, BF); - - out_len = size_words << 2; - memcpy(dst, u.tmp_out + (sizeof u.tmp_out) - out_len, out_len); - // sph_shabal256_init(sc, size_words << 5); -} - -/* see sph_shabal.h */ -void sph_shabal256_close(void* cc, void* dst) { shabal_close(cc, 0, 0, dst, 8); } - -/* see sph_shabal.h */ -void sph_shabal256_addbits_and_close(void* cc, unsigned ub, unsigned n, void* dst) { - shabal_close(cc, ub, n, dst, 8); -} - -// Shabal routines optimized for plotting and hashing -void sph_shabal_hash_fast(void *message, void *termination, void* dst, unsigned num) { - sph_u32 - A00 = A_init_256[0], A01 = A_init_256[1], A02 = A_init_256[2], A03 = A_init_256[3], - A04 = A_init_256[4], A05 = A_init_256[5], A06 = A_init_256[6], A07 = A_init_256[7], - A08 = A_init_256[8], A09 = A_init_256[9], A0A = A_init_256[10], A0B = A_init_256[11]; - sph_u32 - B0 = B_init_256[0], B1 = B_init_256[1], B2 = B_init_256[2], B3 = B_init_256[3], - B4 = B_init_256[4], B5 = B_init_256[5], B6 = B_init_256[6], B7 = B_init_256[7], - B8 = B_init_256[8], B9 = B_init_256[9], BA = B_init_256[10], BB = B_init_256[11], - BC = B_init_256[12], BD = B_init_256[13], BE = B_init_256[14], BF = B_init_256[15]; - sph_u32 - C0 = C_init_256[0], C1 = C_init_256[1], C2 = C_init_256[2], C3 = C_init_256[3], - C4 = C_init_256[4], C5 = C_init_256[5], C6 = C_init_256[6], C7 = C_init_256[7], - C8 = C_init_256[8], C9 = C_init_256[9], CA = C_init_256[10], CB = C_init_256[11], - CC = C_init_256[12], CD = C_init_256[13], CE = C_init_256[14], CF = C_init_256[15]; - sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; - sph_u32 Wlow = 1, Whigh = 0; - - while (num-- > 0) { - M0 = ((unsigned int *)message)[0]; - M1 = ((unsigned int *)message)[1]; - M2 = ((unsigned int *)message)[2]; - M3 = ((unsigned int *)message)[3]; - M4 = ((unsigned int *)message)[4]; - M5 = ((unsigned int *)message)[5]; - M6 = ((unsigned int *)message)[6]; - M7 = ((unsigned int *)message)[7]; - M8 = ((unsigned int *)message)[8]; - M9 = ((unsigned int *)message)[9]; - MA = ((unsigned int *)message)[10]; - MB = ((unsigned int *)message)[11]; - MC = ((unsigned int *)message)[12]; - MD = ((unsigned int *)message)[13]; - ME = ((unsigned int *)message)[14]; - MF = ((unsigned int *)message)[15]; - - INPUT_BLOCK_ADD; - XOR_W; - APPLY_P; - INPUT_BLOCK_SUB; - SWAP_BC; - INCR_W; - - message = (unsigned int *)message + 16; - } - - M0 = ((unsigned int *)termination)[0]; - M1 = ((unsigned int *)termination)[1]; - M2 = ((unsigned int *)termination)[2]; - M3 = ((unsigned int *)termination)[3]; - M4 = ((unsigned int *)termination)[4]; - M5 = ((unsigned int *)termination)[5]; - M6 = ((unsigned int *)termination)[6]; - M7 = ((unsigned int *)termination)[7]; - M8 = ((unsigned int *)termination)[8]; - M9 = ((unsigned int *)termination)[9]; - MA = ((unsigned int *)termination)[10]; - MB = ((unsigned int *)termination)[11]; - MC = ((unsigned int *)termination)[12]; - MD = ((unsigned int *)termination)[13]; - ME = ((unsigned int *)termination)[14]; - MF = ((unsigned int *)termination)[15]; - - INPUT_BLOCK_ADD; - XOR_W; - APPLY_P; - - for (int i = 0; i < 3; i++) { - SWAP_BC; - XOR_W; - APPLY_P; - } - - sph_enc32le_aligned((sph_u32 *)dst, B8); - 
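    /*
     * The 256-bit digest is the last eight B words (B8..BF), serialized
     * little-endian, so dst receives the full 32-byte Shabal-256 output
     * here. The mining variant sph_shabal_deadline_fast below keeps only
     * B8/B9: the first eight digest bytes, which form the 64-bit deadline.
     */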
sph_enc32le_aligned((sph_u32 *)dst + 1, B9); - sph_enc32le_aligned((sph_u32 *)dst + 2, BA); - sph_enc32le_aligned((sph_u32 *)dst + 3, BB); - sph_enc32le_aligned((sph_u32 *)dst + 4, BC); - sph_enc32le_aligned((sph_u32 *)dst + 5, BD); - sph_enc32le_aligned((sph_u32 *)dst + 6, BE); - sph_enc32le_aligned((sph_u32 *)dst + 7, BF); -} - -// Shabal routines optimized for mining -void sph_shabal_deadline_fast(void *scoop_data, void *gen_sig, void *dst) { - sph_u32 - A00 = A_init_256[0], A01 = A_init_256[1], A02 = A_init_256[2], A03 = A_init_256[3], - A04 = A_init_256[4], A05 = A_init_256[5], A06 = A_init_256[6], A07 = A_init_256[7], - A08 = A_init_256[8], A09 = A_init_256[9], A0A = A_init_256[10], A0B = A_init_256[11]; - sph_u32 - B0 = B_init_256[0], B1 = B_init_256[1], B2 = B_init_256[2], B3 = B_init_256[3], - B4 = B_init_256[4], B5 = B_init_256[5], B6 = B_init_256[6], B7 = B_init_256[7], - B8 = B_init_256[8], B9 = B_init_256[9], BA = B_init_256[10], BB = B_init_256[11], - BC = B_init_256[12], BD = B_init_256[13], BE = B_init_256[14], BF = B_init_256[15]; - sph_u32 - C0 = C_init_256[0], C1 = C_init_256[1], C2 = C_init_256[2], C3 = C_init_256[3], - C4 = C_init_256[4], C5 = C_init_256[5], C6 = C_init_256[6], C7 = C_init_256[7], - C8 = C_init_256[8], C9 = C_init_256[9], CA = C_init_256[10], CB = C_init_256[11], - CC = C_init_256[12], CD = C_init_256[13], CE = C_init_256[14], CF = C_init_256[15]; - sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; - sph_u32 Wlow = 1, Whigh = 0; - - M0 = ((unsigned int *)gen_sig)[0]; - M1 = ((unsigned int *)gen_sig)[1]; - M2 = ((unsigned int *)gen_sig)[2]; - M3 = ((unsigned int *)gen_sig)[3]; - M4 = ((unsigned int *)gen_sig)[4]; - M5 = ((unsigned int *)gen_sig)[5]; - M6 = ((unsigned int *)gen_sig)[6]; - M7 = ((unsigned int *)gen_sig)[7]; - M8 = ((unsigned int *)scoop_data)[0]; - M9 = ((unsigned int *)scoop_data)[1]; - MA = ((unsigned int *)scoop_data)[2]; - MB = ((unsigned int *)scoop_data)[3]; - MC = ((unsigned int *)scoop_data)[4]; - MD = ((unsigned int *)scoop_data)[5]; - ME = ((unsigned int *)scoop_data)[6]; - MF = ((unsigned int *)scoop_data)[7]; - - INPUT_BLOCK_ADD; - XOR_W; - APPLY_P; - INPUT_BLOCK_SUB; - SWAP_BC; - INCR_W; - - M0 = ((unsigned int *)scoop_data)[8]; - M1 = ((unsigned int *)scoop_data)[9]; - M2 = ((unsigned int *)scoop_data)[10]; - M3 = ((unsigned int *)scoop_data)[11]; - M4 = ((unsigned int *)scoop_data)[12]; - M5 = ((unsigned int *)scoop_data)[13]; - M6 = ((unsigned int *)scoop_data)[14]; - M7 = ((unsigned int *)scoop_data)[15]; - M8 = 0x80; - M9 = MA = MB = MC = MD = ME = MF = 0; - - INPUT_BLOCK_ADD; - XOR_W; - APPLY_P; - - for (int i = 0; i < 3; i++) { - SWAP_BC; - XOR_W; - APPLY_P; - } - - sph_enc32le_aligned((sph_u32 *)dst, B8); - sph_enc32le_aligned((sph_u32 *)dst + 1, B9); -} \ No newline at end of file diff --git a/src/c/sph_shabal.h b/src/c/sph_shabal.h deleted file mode 100644 index ca6c772..0000000 --- a/src/c/sph_shabal.h +++ /dev/null @@ -1,133 +0,0 @@ -/* $Id: sph_shabal.h 175 2010-05-07 16:03:20Z tp $ */ -/** - * Shabal interface. Shabal is a family of functions which differ by - * their output size; this implementation defines Shabal for output - * sizes 192, 224, 256, 384 and 512 bits. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @file sph_shabal.h - * @author Thomas Pornin - */ - -#ifndef SPH_SHABAL_H__ -#define SPH_SHABAL_H__ - -#include <stddef.h> -#include "sph_types.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/** - * Output size (in bits) for Shabal-256. - */ -#define SPH_SIZE_shabal256 256 - -/** - * This structure is a context for Shabal computations: it contains the - * intermediate values and some data from the last entered block. Once - * a Shabal computation has been performed, the context can be reused for - * another computation. - * - * The contents of this structure are private. A running Shabal computation - * can be cloned by copying the context (e.g. with a simple - * memcpy()). - */ -typedef struct { -#ifndef DOXYGEN_IGNORE - unsigned char buf[64]; /* first field, for alignment */ - size_t ptr; - sph_u32 A[12], B[16], C[16]; - sph_u32 Whigh, Wlow; -#endif -} sph_shabal_context; - -/** - * Type for a Shabal-256 context (identical to the common context). - */ -typedef sph_shabal_context sph_shabal256_context; - -/** - * Initialize a Shabal-256 context. This process performs no memory - * allocation. - * - * @param cc the Shabal-256 context (pointer to a - * sph_shabal256_context) - */ -void sph_shabal256_init(sph_shabal_context* cc); - -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). - * - * @param cc the Shabal-256 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_shabal256(void* cc, const unsigned char* data, size_t len); - -/** - * Terminate the current Shabal-256 computation and output the result into - * the provided buffer. The destination buffer must be wide enough to - * accommodate the result (32 bytes). The context is automatically - * reinitialized. - * - * @param cc the Shabal-256 context - * @param dst the destination buffer - */ -void sph_shabal256_close(void* cc, void* dst); - -/** - * Add a few additional bits (0 to 7) to the current computation, then - * terminate it and output the result in the provided buffer, which must - * be wide enough to accommodate the result (32 bytes).
If bit number i - * in ub has value 2^i, then the extra bits are those - * numbered 7 downto 8-n (this is the big-endian convention at the byte - * level). The context is automatically reinitialized. - * - * @param cc the Shabal-256 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the destination buffer - */ -void sph_shabal256_addbits_and_close(void* cc, unsigned ub, unsigned n, void* dst); - -/* - * optimised Shabal routine for PoC plotting and hashing - */ -void sph_shabal_hash_fast(void *message, void *termination, void* dst, unsigned num); - -/* - * optimised Shabal routine for PoC mining - */ -void sph_shabal_deadline_fast(void *scoop_data, void *gen_sig, void *dst); - -#ifdef __cplusplus -} -#endif -#endif diff --git a/src/c/sph_types.h b/src/c/sph_types.h deleted file mode 100644 index b2bdef3..0000000 --- a/src/c/sph_types.h +++ /dev/null @@ -1,1912 +0,0 @@ -/* $Id: sph_types.h 260 2011-07-21 01:02:38Z tp $ */ -/** - * Basic type definitions. - * - * This header file defines the generic integer types that will be used - * for the implementation of hash functions; it also contains helper - * functions which encode and decode multi-byte integer values, using - * either little-endian or big-endian conventions. - * - * This file contains a compile-time test on the size of a byte - * (the unsigned char C type). If bytes are not octets, - * i.e. if they do not have a size of exactly 8 bits, then compilation - * is aborted. Architectures where bytes are not octets are relatively - * rare, even in the embedded devices market. We forbid non-octet bytes - * because there is no clear convention on how octet streams are encoded - * on such systems. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @file sph_types.h - * @author Thomas Pornin - */ - -#ifndef SPH_TYPES_H__ -#define SPH_TYPES_H__ - -#include - -/* - * All our I/O functions are defined over octet streams. We do not know - * how to handle input data if bytes are not octets. 
- */ -#if CHAR_BIT != 8 -#error This code requires 8-bit bytes -#endif - -/* ============= BEGIN documentation block for Doxygen ============ */ - -#ifdef DOXYGEN_IGNORE - -/** @mainpage sphlib C code documentation - * - * @section overview Overview - * - * sphlib is a library which contains implementations of - * various cryptographic hash functions. These pages have been generated - * with doxygen and - * document the API for the C implementations. - * - * The API is described in appropriate header files, which are available - * in the "Files" section. Each hash function family has its own header, - * whose name begins with "sph_" and contains the family - * name. For instance, the API for the RIPEMD hash functions is available - * in the header file sph_ripemd.h. - * - * @section principles API structure and conventions - * - * @subsection io Input/output conventions - * - * In all generality, hash functions operate over strings of bits. - * Individual bits are rarely encountered in C programming or actual - * communication protocols; most protocols converge on the ubiquitous - * "octet" which is a group of eight bits. Data is thus expressed as a - * stream of octets. The C programming language contains the notion of a - * "byte", which is a data unit managed under the type "unsigned - * char". The C standard prescribes that a byte should hold at - * least eight bits, but possibly more. Most modern architectures, even - * in the embedded world, feature eight-bit bytes, i.e. map bytes to - * octets. - * - * Nevertheless, for some of the implemented hash functions, an extra - * API has been added, which allows the input of arbitrary sequences of - * bits: when the computation is about to be closed, 1 to 7 extra bits - * can be added. The functions for which this API is implemented include - * the SHA-2 functions and all SHA-3 candidates. - * - * sphlib defines hash functions which may hash octet streams, - * i.e. streams of bits where the number of bits is a multiple of eight. - * The data input functions in the sphlib API expect data - * as anonymous pointers ("const void *") with a length - * (of type "size_t") which gives the input data chunk length - * in bytes. A byte is assumed to be an octet; the sph_types.h - * header contains a compile-time test which prevents compilation on - * architectures where this property is not met. - * - * The hash function output is also converted into bytes. All currently - * implemented hash functions have an output width which is a multiple of - * eight, and this is likely to remain true for new designs. - * - * Most hash functions internally convert input data into 32-bit or 64-bit - * words, using either little-endian or big-endian conversion. The hash - * output also often consists of such words, which are encoded into output - * bytes with a similar endianness convention. Some hash functions have - * been only loosely specified on that subject; when necessary, - * sphlib has been tested against published "reference" - * implementations in order to use the same conventions. - * - * @subsection shortname Function short name - * - * Each implemented hash function has a "short name" which is used - * internally to derive the identifiers for the functions and context - * structures which the function uses. For instance, MD5 has the short - * name "md5". Short names are listed in the next section, - * for the implemented hash functions.
In subsequent sections, the - * short name will be assumed to be "XXX": replace with the - * actual hash function name to get the C identifier. - * - * Note: some functions within the same family share the same core - * elements, such as update function or context structure. Correspondingly, - * some of the defined types or functions may actually be macros which - * transparently evaluate to another type or function name. - * - * @subsection context Context structure - * - * Each implemented hash function has its own context structure, available - * under the type name "sph_XXX_context" for the hash function - * with short name "XXX". This structure holds all needed - * state for a running hash computation. - * - * The contents of these structures are meant to be opaque, and private - * to the implementation. However, these contents are specified in the - * header files so that application code which uses sphlib - * may access the size of those structures. - * - * The caller is responsible for allocating the context structure, - * whether by dynamic allocation (malloc() or equivalent), - * static allocation (a global permanent variable), as an automatic - * variable ("on the stack"), or by any other means which ensures proper - * structure alignment. sphlib code performs no dynamic - * allocation by itself. - * - * The context must be initialized before use, using the - * sph_XXX_init() function. This function sets the context - * state to proper initial values for hashing. - * - * Since all state data is contained within the context structure, - * sphlib is thread-safe and reentrant: several hash - * computations may be performed in parallel, provided that they do not - * operate on the same context. Moreover, a running computation can be - * cloned by copying the context (with a simple memcpy()): - * the context and its clone are then independent and may be updated - * with new data and/or closed without interfering with each other. - * Similarly, a context structure can be moved in memory at will: - * context structures contain no pointer, in particular no pointer to - * themselves. - * - * @subsection dataio Data input - * - * Hashed data is input with the sph_XXX() function, which - * takes as parameters a pointer to the context, a pointer to the data - * to hash, and the number of data bytes to hash. The context is updated - * with the new data. - * - * Data can be input in one or several calls, with arbitrary input lengths. - * However, it is best, performance wise, to input data by relatively big - * chunks (say a few kilobytes), because this allows sphlib to - * optimize things and avoid internal copying. - * - * When all data has been input, the context can be closed with - * sph_XXX_close(). The hash output is computed and written - * into the provided buffer. The caller must take care to provide a - * buffer of appropriate length; e.g., when using SHA-1, the output is - * a 20-byte word, therefore the output buffer must be at least 20-byte - * long. - * - * For some hash functions, the sph_XXX_addbits_and_close() - * function can be used instead of sph_XXX_close(). This - * function can take a few extra bits to be added at - * the end of the input message. This allows hashing messages with a - * bit length which is not a multiple of 8. The extra bits are provided - * as an unsigned integer value, and a bit count. The bit count must be - * between 0 and 7, inclusive. The extra bits are provided as bits 7 to - * 0 (bits of numerical value 128, 64, 32... downto 0), in that order.
- * For instance, to add three bits of value 1, 1 and 0, the unsigned - * integer will have value 192 (1*128 + 1*64 + 0*32) and the bit count - * will be 3. - * - * The SPH_SIZE_XXX macro is defined for each hash function; - * it evaluates to the function output size, expressed in bits. For instance, - * SPH_SIZE_sha1 evaluates to 160. - * - * When closed, the context is automatically reinitialized and can be - * immediately used for another computation. It is not necessary to call - * sph_XXX_init() after a close. Note that - * sph_XXX_init() can still be called to "reset" a context, - * i.e. forget previously input data, and get back to the initial state. - * - * @subsection alignment Data alignment - * - * "Alignment" is a property of data, which is said to be "properly - * aligned" when its emplacement in memory is such that the data can - * be optimally read by full words. This depends on the type of access; - * basically, some hash functions will read data by 32-bit or 64-bit - * words. sphlib does not mandate such alignment for input - * data, but using aligned data can substantially improve performance. - * - * As a rule, it is best to input data by chunks whose length (in bytes) - * is a multiple of eight, and which begins at "generally aligned" - * addresses, such as the base address returned by a call to - * malloc(). - * - * @section functions Implemented functions - * - * We give here the list of implemented functions. They are grouped by - * family; to each family corresponds a specific header file. Each - * individual function has its associated "short name". Please refer to - * the documentation for that header file to get details on the hash - * function denomination and provenance. - * - * Note: the functions marked with a '(64)' in the list below are - * available only if the C compiler provides an integer type of length - * 64 bits or more. Such a type is mandatory in the latest C standard - * (ISO 9899:1999, aka "C99") and is present in several older compilers - * as well, so chances are that such a type is available. 
- * - * - HAVAL family: file sph_haval.h - * - HAVAL-128/3 (128-bit, 3 passes): short name: haval128_3 - * - HAVAL-128/4 (128-bit, 4 passes): short name: haval128_4 - * - HAVAL-128/5 (128-bit, 5 passes): short name: haval128_5 - * - HAVAL-160/3 (160-bit, 3 passes): short name: haval160_3 - * - HAVAL-160/4 (160-bit, 4 passes): short name: haval160_4 - * - HAVAL-160/5 (160-bit, 5 passes): short name: haval160_5 - * - HAVAL-192/3 (192-bit, 3 passes): short name: haval192_3 - * - HAVAL-192/4 (192-bit, 4 passes): short name: haval192_4 - * - HAVAL-192/5 (192-bit, 5 passes): short name: haval192_5 - * - HAVAL-224/3 (224-bit, 3 passes): short name: haval224_3 - * - HAVAL-224/4 (224-bit, 4 passes): short name: haval224_4 - * - HAVAL-224/5 (224-bit, 5 passes): short name: haval224_5 - * - HAVAL-256/3 (256-bit, 3 passes): short name: haval256_3 - * - HAVAL-256/4 (256-bit, 4 passes): short name: haval256_4 - * - HAVAL-256/5 (256-bit, 5 passes): short name: haval256_5 - * - MD2: file sph_md2.h, short name: md2 - * - MD4: file sph_md4.h, short name: md4 - * - MD5: file sph_md5.h, short name: md5 - * - PANAMA: file sph_panama.h, short name: panama - * - RadioGatun family: file sph_radiogatun.h - * - RadioGatun[32]: short name: radiogatun32 - * - RadioGatun[64]: short name: radiogatun64 (64) - * - RIPEMD family: file sph_ripemd.h - * - RIPEMD: short name: ripemd - * - RIPEMD-128: short name: ripemd128 - * - RIPEMD-160: short name: ripemd160 - * - SHA-0: file sph_sha0.h, short name: sha0 - * - SHA-1: file sph_sha1.h, short name: sha1 - * - SHA-2 family, 32-bit hashes: file sph_sha2.h - * - SHA-224: short name: sha224 - * - SHA-256: short name: sha256 - * - SHA-384: short name: sha384 (64) - * - SHA-512: short name: sha512 (64) - * - Tiger family: file sph_tiger.h - * - Tiger: short name: tiger (64) - * - Tiger2: short name: tiger2 (64) - * - WHIRLPOOL family: file sph_whirlpool.h - * - WHIRLPOOL-0: short name: whirlpool0 (64) - * - WHIRLPOOL-1: short name: whirlpool1 (64) - * - WHIRLPOOL: short name: whirlpool (64) - * - * The fourteen second-round SHA-3 candidates are also implemented; - * when applicable, the implementations follow the "final" specifications - * as published for the third round of the SHA-3 competition (BLAKE, - * Groestl, JH, Keccak and Skein have been tweaked for third round). 
- * - * - BLAKE family: file sph_blake.h - * - BLAKE-224: short name: blake224 - * - BLAKE-256: short name: blake256 - * - BLAKE-384: short name: blake384 - * - BLAKE-512: short name: blake512 - * - BMW (Blue Midnight Wish) family: file sph_bmw.h - * - BMW-224: short name: bmw224 - * - BMW-256: short name: bmw256 - * - BMW-384: short name: bmw384 (64) - * - BMW-512: short name: bmw512 (64) - * - CubeHash family: file sph_cubehash.h (specified as - * CubeHash16/32 in the CubeHash specification) - * - CubeHash-224: short name: cubehash224 - * - CubeHash-256: short name: cubehash256 - * - CubeHash-384: short name: cubehash384 - * - CubeHash-512: short name: cubehash512 - * - ECHO family: file sph_echo.h - * - ECHO-224: short name: echo224 - * - ECHO-256: short name: echo256 - * - ECHO-384: short name: echo384 - * - ECHO-512: short name: echo512 - * - Fugue family: file sph_fugue.h - * - Fugue-224: short name: fugue224 - * - Fugue-256: short name: fugue256 - * - Fugue-384: short name: fugue384 - * - Fugue-512: short name: fugue512 - * - Groestl family: file sph_groestl.h - * - Groestl-224: short name: groestl224 - * - Groestl-256: short name: groestl256 - * - Groestl-384: short name: groestl384 - * - Groestl-512: short name: groestl512 - * - Hamsi family: file sph_hamsi.h - * - Hamsi-224: short name: hamsi224 - * - Hamsi-256: short name: hamsi256 - * - Hamsi-384: short name: hamsi384 - * - Hamsi-512: short name: hamsi512 - * - JH family: file sph_jh.h - * - JH-224: short name: jh224 - * - JH-256: short name: jh256 - * - JH-384: short name: jh384 - * - JH-512: short name: jh512 - * - Keccak family: file sph_keccak.h - * - Keccak-224: short name: keccak224 - * - Keccak-256: short name: keccak256 - * - Keccak-384: short name: keccak384 - * - Keccak-512: short name: keccak512 - * - Luffa family: file sph_luffa.h - * - Luffa-224: short name: luffa224 - * - Luffa-256: short name: luffa256 - * - Luffa-384: short name: luffa384 - * - Luffa-512: short name: luffa512 - * - Shabal family: file sph_shabal.h - * - Shabal-192: short name: shabal192 - * - Shabal-224: short name: shabal224 - * - Shabal-256: short name: shabal256 - * - Shabal-384: short name: shabal384 - * - Shabal-512: short name: shabal512 - * - SHAvite-3 family: file sph_shavite.h - * - SHAvite-224 (nominally "SHAvite-3 with 224-bit output"): - * short name: shabal224 - * - SHAvite-256 (nominally "SHAvite-3 with 256-bit output"): - * short name: shabal256 - * - SHAvite-384 (nominally "SHAvite-3 with 384-bit output"): - * short name: shabal384 - * - SHAvite-512 (nominally "SHAvite-3 with 512-bit output"): - * short name: shabal512 - * - SIMD family: file sph_simd.h - * - SIMD-224: short name: simd224 - * - SIMD-256: short name: simd256 - * - SIMD-384: short name: simd384 - * - SIMD-512: short name: simd512 - * - Skein family: file sph_skein.h - * - Skein-224 (nominally specified as Skein-512-224): short name: - * skein224 (64) - * - Skein-256 (nominally specified as Skein-512-256): short name: - * skein256 (64) - * - Skein-384 (nominally specified as Skein-512-384): short name: - * skein384 (64) - * - Skein-512 (nominally specified as Skein-512-512): short name: - * skein512 (64) - * - * For the second-round SHA-3 candidates, the functions are as specified - * for round 2, i.e. with the "tweaks" that some candidates added - * between round 1 and round 2. Also, some of the submitted packages for - * round 2 contained errors, in the specification, reference code, or - * both. sphlib implements the corrected versions. 
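 *
 * As a short, minimal illustration of the conventions above (assuming an
 * application-supplied buffer data of data_len bytes, and using the
 * Shabal-256 functions declared in sph_shabal.h):
 *
 *     sph_shabal256_context cc;
 *     unsigned char digest[32];
 *
 *     sph_shabal256_init(&cc);
 *     sph_shabal256(&cc, data, data_len);   // may be called repeatedly
 *     sph_shabal256_close(&cc, digest);     // context is auto-reinitialized
 *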
- */ - -/** @hideinitializer - * Unsigned integer type whose length is at least 32 bits; on most - * architectures, it will have a width of exactly 32 bits. Unsigned C - * types implement arithmetics modulo a power of 2; use the - * SPH_T32() macro to ensure that the value is truncated - * to exactly 32 bits. Unless otherwise specified, all macros and - * functions which accept sph_u32 values assume that these - * values fit on 32 bits, i.e. do not exceed 2^32-1, even on architectures - * where sph_u32 is larger than that. - */ -typedef __arch_dependant__ sph_u32; - -/** @hideinitializer - * Signed integer type corresponding to sph_u32; it has - * width 32 bits or more. - */ -typedef __arch_dependant__ sph_s32; - -/** @hideinitializer - * Unsigned integer type whose length is at least 64 bits; on most - * architectures which feature such a type, it will have a width of - * exactly 64 bits. C99-compliant platforms will have this type; it - * is also defined when the GNU compiler (gcc) is used, and on - * platforms where unsigned long is large enough. If this - * type is not available, then some hash functions which depend on - * a 64-bit type will not be available (most notably SHA-384, SHA-512, - * Tiger and WHIRLPOOL). - */ -typedef __arch_dependant__ sph_u64; - -/** @hideinitializer - * Signed integer type corresponding to sph_u64; it has - * width 64 bits or more. - */ -typedef __arch_dependant__ sph_s64; - -/** - * This macro expands the token x into a suitable - * constant expression of type sph_u32. Depending on - * how this type is defined, a suffix such as UL may - * be appended to the argument. - * - * @param x the token to expand into a suitable constant expression - */ -#define SPH_C32(x) - -/** - * Truncate a 32-bit value to exactly 32 bits. On most systems, this is - * a no-op, recognized as such by the compiler. - * - * @param x the value to truncate (of type sph_u32) - */ -#define SPH_T32(x) - -/** - * Rotate a 32-bit value by a number of bits to the left. The rotate - * count must reside between 1 and 31. This macro assumes that its - * first argument fits in 32 bits (no extra bit allowed on machines where - * sph_u32 is wider); both arguments may be evaluated - * several times. - * - * @param x the value to rotate (of type sph_u32) - * @param n the rotation count (between 1 and 31, inclusive) - */ -#define SPH_ROTL32(x, n) - -/** - * Rotate a 32-bit value by a number of bits to the right. The rotate - * count must reside between 1 and 31. This macro assumes that its - * first argument fits in 32 bits (no extra bit allowed on machines where - * sph_u32 is wider); both arguments may be evaluated - * several times. - * - * @param x the value to rotate (of type sph_u32) - * @param n the rotation count (between 1 and 31, inclusive) - */ -#define SPH_ROTR32(x, n) - -/** - * This macro is defined on systems for which a 64-bit type has been - * detected, and is used for sph_u64. - */ -#define SPH_64 - -/** - * This macro is defined on systems where the "native" integer size is - * 64 bits (64-bit values fit in one register). - */ -#define SPH_64_TRUE - -/** - * This macro expands the token x into a suitable - * constant expression of type sph_u64. Depending on - * how this type is defined, a suffix such as ULL may - * be appended to the argument. This macro is defined only if a - * 64-bit type was detected and used for sph_u64. - * - * @param x the token to expand into a suitable constant expression - */ -#define SPH_C64(x) - -/** - * Truncate a 64-bit value to exactly 64 bits.
On most systems, this is - * a no-op, recognized as such by the compiler. This macro is defined only - * if a 64-bit type was detected and used for sph_u64. - * - * @param x the value to truncate (of type sph_u64) - */ -#define SPH_T64(x) - -/** - * Rotate a 64-bit value by a number of bits to the left. The rotate - * count must reside between 1 and 63. This macro assumes that its - * first argument fits in 64 bits (no extra bit allowed on machines where - * sph_u64 is wider); both arguments may be evaluated - * several times. This macro is defined only if a 64-bit type was detected - * and used for sph_u64. - * - * @param x the value to rotate (of type sph_u64) - * @param n the rotation count (between 1 and 63, inclusive) - */ -#define SPH_ROTL64(x, n) - -/** - * Rotate a 64-bit value by a number of bits to the right. The rotate - * count must reside between 1 and 63. This macro assumes that its - * first argument fits in 64 bits (no extra bit allowed on machines where - * sph_u64 is wider); both arguments may be evaluated - * several times. This macro is defined only if a 64-bit type was detected - * and used for sph_u64. - * - * @param x the value to rotate (of type sph_u64) - * @param n the rotation count (between 1 and 63, inclusive) - */ -#define SPH_ROTR64(x, n) - -/** - * This macro evaluates to inline or an equivalent construction, - * if available on the compilation platform, or to nothing otherwise. This - * is used to declare inline functions, for which the compiler should - * endeavour to include the code directly in the caller. Inline functions - * are typically defined in header files as replacement for macros. - */ -#define SPH_INLINE - -/** - * This macro is defined if the platform has been detected as using - * little-endian convention. This implies that the sph_u32 - * type (and the sph_u64 type also, if it is defined) has - * an exact width (i.e. exactly 32-bit, respectively 64-bit). - */ -#define SPH_LITTLE_ENDIAN - -/** - * This macro is defined if the platform has been detected as using - * big-endian convention. This implies that the sph_u32 - * type (and the sph_u64 type also, if it is defined) has - * an exact width (i.e. exactly 32-bit, respectively 64-bit). - */ -#define SPH_BIG_ENDIAN - -/** - * This macro is defined if 32-bit words (and 64-bit words, if defined) - * can be read from and written to memory efficiently in little-endian - * convention. This is the case for little-endian platforms, and also - * for the big-endian platforms which have special little-endian access - * opcodes (e.g. Ultrasparc). - */ -#define SPH_LITTLE_FAST - -/** - * This macro is defined if 32-bit words (and 64-bit words, if defined) - * can be read from and written to memory efficiently in big-endian - * convention. This is the case for big-endian platforms, and also - * for the little-endian platforms which have special big-endian access - * opcodes. - */ -#define SPH_BIG_FAST - -/** - * On some platforms, this macro is defined to an unsigned integer type - * into which pointer values may be cast. The resulting value can then - * be tested for being a multiple of 2, 4 or 8, indicating an aligned - * pointer for, respectively, 16-bit, 32-bit or 64-bit memory accesses. - */ -#define SPH_UPTR - -/** - * When defined, this macro indicates that unaligned memory accesses - * are possible with only a minor penalty, and thus should be preferred - * over strategies which first copy data to an aligned buffer. - */ -#define SPH_UNALIGNED - -/** - * Byte-swap a 32-bit word (i.e.
0x12345678 becomes - * 0x78563412). This is an inline function which resorts - * to inline assembly on some platforms, for better performance. - * - * @param x the 32-bit value to byte-swap - * @return the byte-swapped value - */ -static inline sph_u32 sph_bswap32(sph_u32 x); - -/** - * Byte-swap a 64-bit word. This is an inline function which resorts - * to inline assembly on some platforms, for better performance. This - * function is defined only if a suitable 64-bit type was found for - * sph_u64 - * - * @param x the 64-bit value to byte-swap - * @return the byte-swapped value - */ -static inline sph_u64 sph_bswap64(sph_u64 x); - -/** - * Decode a 16-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). - * - * @param src the source address - * @return the decoded value - */ -static inline unsigned sph_dec16le(const void* src); - -/** - * Encode a 16-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc16le(void* dst, unsigned val); - -/** - * Decode a 16-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). - * - * @param src the source address - * @return the decoded value - */ -static inline unsigned sph_dec16be(const void* src); - -/** - * Encode a 16-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc16be(void* dst, unsigned val); - -/** - * Decode a 32-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u32 sph_dec32le(const void* src); - -/** - * Decode a 32-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). This function assumes that the - * source address is suitably aligned for a direct access, if the platform - * supports such things; it can thus be marginally faster than the generic - * sph_dec32le() function. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u32 sph_dec32le_aligned(const void* src); - -/** - * Encode a 32-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc32le(void* dst, sph_u32 val); - -/** - * Encode a 32-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). This function assumes that the - * destination address is suitably aligned for a direct access, if the - * platform supports such things; it can thus be marginally faster than - * the generic sph_enc32le() function. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc32le_aligned(void* dst, sph_u32 val); - -/** - * Decode a 32-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u32 sph_dec32be(const void* src); - -/** - * Decode a 32-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). 
This function assumes that the - * source address is suitably aligned for a direct access, if the platform - * supports such things; it can thus be marginally faster than the generic - * sph_dec32be() function. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u32 sph_dec32be_aligned(const void* src); - -/** - * Encode a 32-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc32be(void* dst, sph_u32 val); - -/** - * Encode a 32-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). This function assumes that the - * destination address is suitably aligned for a direct access, if the - * platform supports such things; it can thus be marginally faster than - * the generic sph_enc32be() function. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc32be_aligned(void* dst, sph_u32 val); - -/** - * Decode a 64-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u64 sph_dec64le(const void* src); - -/** - * Decode a 64-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). This function assumes that the - * source address is suitably aligned for a direct access, if the platform - * supports such things; it can thus be marginally faster than the generic - * sph_dec64le() function. This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u64 sph_dec64le_aligned(const void* src); - -/** - * Encode a 64-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc64le(void* dst, sph_u64 val); - -/** - * Encode a 64-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). This function assumes that the - * destination address is suitably aligned for a direct access, if the - * platform supports such things; it can thus be marginally faster than - * the generic sph_enc64le() function. This function is defined - * only if a suitable 64-bit type was detected and used for - * sph_u64. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc64le_aligned(void* dst, sph_u64 val); - -/** - * Decode a 64-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u64 sph_dec64be(const void* src); - -/** - * Decode a 64-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). 
This function assumes that the
- * source address is suitably aligned for a direct access, if the platform
- * supports such things; it can thus be marginally faster than the generic
- * sph_dec64be() function. This function is defined only
- * if a suitable 64-bit type was detected and used for sph_u64.
- *
- * @param src the source address
- * @return the decoded value
- */
-static inline sph_u64 sph_dec64be_aligned(const void* src);
-
-/**
- * Encode a 64-bit unsigned value into memory, in big-endian convention
- * (most significant byte comes first). This function is defined only
- * if a suitable 64-bit type was detected and used for sph_u64.
- *
- * @param dst the destination buffer
- * @param val the value to encode
- */
-static inline void sph_enc64be(void* dst, sph_u64 val);
-
-/**
- * Encode a 64-bit unsigned value into memory, in big-endian convention
- * (most significant byte comes first). This function assumes that the
- * destination address is suitably aligned for a direct access, if the
- * platform supports such things; it can thus be marginally faster than
- * the generic sph_enc64be() function. This function is defined
- * only if a suitable 64-bit type was detected and used for
- * sph_u64.
- *
- * @param dst the destination buffer
- * @param val the value to encode
- */
-static inline void sph_enc64be_aligned(void* dst, sph_u64 val);
-
-#endif
-
-/* ============== END documentation block for Doxygen ============= */
-
-#ifndef DOXYGEN_IGNORE
-
-/*
- * We want to define the types "sph_u32" and "sph_u64" which hold
- * unsigned values of at least, respectively, 32 and 64 bits. These
- * tests should select appropriate types for most platforms. The
- * macro "SPH_64" is defined if the 64-bit type is supported.
- */
-
-#undef SPH_64
-#undef SPH_64_TRUE
-
-#if defined __STDC__ && __STDC_VERSION__ >= 199901L
-
-/*
- * On C99 implementations, we can use <stdint.h> to get an exact 64-bit
- * type, if any, or otherwise use a wider type (which must exist, for
- * C99 conformance).
- */
-
-#include <stdint.h>
-
-#ifdef UINT32_MAX
-typedef uint32_t sph_u32;
-typedef int32_t sph_s32;
-#else
-typedef uint_fast32_t sph_u32;
-typedef int_fast32_t sph_s32;
-#endif
-#if !SPH_NO_64
-#ifdef UINT64_MAX
-typedef uint64_t sph_u64;
-typedef int64_t sph_s64;
-#else
-typedef uint_fast64_t sph_u64;
-typedef int_fast64_t sph_s64;
-#endif
-#endif
-
-#define SPH_C32(x) ((sph_u32)(x))
-#if !SPH_NO_64
-#define SPH_C64(x) ((sph_u64)(x))
-#define SPH_64 1
-#endif
-
-#else
-
-/*
- * On non-C99 systems, we use "unsigned int" if it is wide enough,
- * "unsigned long" otherwise. This supports all "reasonable" architectures.
- * We have to be cautious: pre-C99 preprocessors handle constants
- * differently in '#if' expressions. Hence the shifts to test UINT_MAX.
- */
-
-#if ((UINT_MAX >> 11) >> 11) >= 0x3FF
-
-typedef unsigned int sph_u32;
-typedef int sph_s32;
-
-#define SPH_C32(x) ((sph_u32)(x##U))
-
-#else
-
-typedef unsigned long sph_u32;
-typedef long sph_s32;
-
-#define SPH_C32(x) ((sph_u32)(x##UL))
-
-#endif
-
-#if !SPH_NO_64
-
-/*
- * We want a 64-bit type. We use "unsigned long" if it is wide enough (as
- * is common on 64-bit architectures such as AMD64, Alpha or Sparcv9),
- * "unsigned long long" otherwise, if available. We use ULLONG_MAX to
- * test whether "unsigned long long" is available; we also know that
- * gcc features this type, even if the libc headers do not know it.
- */
-
-#if ((ULONG_MAX >> 31) >> 31) >= 3
-
-typedef unsigned long sph_u64;
-typedef long sph_s64;
-
-#define SPH_C64(x) ((sph_u64)(x##UL))
-
-#define SPH_64 1
-
-#elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__
-
-typedef unsigned long long sph_u64;
-typedef long long sph_s64;
-
-#define SPH_C64(x) ((sph_u64)(x##ULL))
-
-#define SPH_64 1
-
-#else
-
-/*
- * No 64-bit type...
- */
-
-#endif
-
-#endif
-
-#endif
-
-/*
- * If the "unsigned long" type has length 64 bits or more, then this is
- * a "true" 64-bit architecture. This is also true with Visual C on
- * amd64, even though the "long" type is limited to 32 bits.
- */
-#if SPH_64 && (((ULONG_MAX >> 31) >> 31) >= 3 || defined _M_X64)
-#define SPH_64_TRUE 1
-#endif
-
-/*
- * Implementation note: some processors have specific opcodes to perform
- * a rotation. Recent versions of gcc recognize the expressions below and
- * use the relevant opcodes, when appropriate.
- */
-
-#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
-#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
-#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n)))
-
-#if SPH_64
-
-#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
-#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n))))
-#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n)))
-
-#endif
-
-#ifndef DOXYGEN_IGNORE
-/*
- * Define SPH_INLINE to be an "inline" qualifier, if available. We define
- * some small macro-like functions which benefit greatly from being inlined.
- */
-#if (defined __STDC__ && __STDC_VERSION__ >= 199901L) || defined __GNUC__
-#define SPH_INLINE inline
-#elif defined _MSC_VER
-#define SPH_INLINE __inline
-#else
-#define SPH_INLINE
-#endif
-#endif
-
-/*
- * We define some macros which qualify the architecture. These macros
- * may be explicitly set externally (e.g. as compiler parameters). The
- * code below sets those macros if they are not already defined.
- *
- * Most macros are boolean, thus evaluate to either zero or non-zero.
- * The SPH_UPTR macro is special, in that it evaluates to a C type,
- * or is not defined.
- *
- * SPH_UPTR if defined: unsigned type to cast pointers into
- *
- * SPH_UNALIGNED non-zero if unaligned accesses are efficient
- * SPH_LITTLE_ENDIAN non-zero if architecture is known to be little-endian
- * SPH_BIG_ENDIAN non-zero if architecture is known to be big-endian
- * SPH_LITTLE_FAST non-zero if little-endian decoding is fast
- * SPH_BIG_FAST non-zero if big-endian decoding is fast
- *
- * If SPH_UPTR is defined, then encoding and decoding of 32-bit and 64-bit
- * values will try to be "smart". Either SPH_LITTLE_ENDIAN or SPH_BIG_ENDIAN
- * _must_ be non-zero in those situations. The 32-bit and 64-bit types
- * _must_ also have an exact width.
- *
- * SPH_SPARCV9_GCC_32 UltraSPARC-compatible with gcc, 32-bit mode
- * SPH_SPARCV9_GCC_64 UltraSPARC-compatible with gcc, 64-bit mode
- * SPH_SPARCV9_GCC UltraSPARC-compatible with gcc
- * SPH_I386_GCC x86-compatible (32-bit) with gcc
- * SPH_I386_MSVC x86-compatible (32-bit) with Microsoft Visual C
- * SPH_AMD64_GCC x86-compatible (64-bit) with gcc
- * SPH_AMD64_MSVC x86-compatible (64-bit) with Microsoft Visual C
- * SPH_PPC32_GCC PowerPC, 32-bit, with gcc
- * SPH_PPC64_GCC PowerPC, 64-bit, with gcc
- *
- * TODO: enhance automatic detection, for more architectures and compilers.
- * Endianness is the most important. SPH_UNALIGNED and SPH_UPTR help with
- * some very fast functions (e.g. MD4) when using unaligned input data.
- * The CPU-specific-with-GCC macros are useful only for inline assembly, - * normally restrained to this header file. - */ - -/* - * 32-bit x86, aka "i386 compatible". - */ -#if defined __i386__ || defined _M_IX86 - -#define SPH_DETECT_UNALIGNED 1 -#define SPH_DETECT_LITTLE_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u32 -#ifdef __GNUC__ -#define SPH_DETECT_I386_GCC 1 -#endif -#ifdef _MSC_VER -#define SPH_DETECT_I386_MSVC 1 -#endif - -/* - * 64-bit x86, hereafter known as "amd64". - */ -#elif defined __x86_64 || defined _M_X64 - -#define SPH_DETECT_UNALIGNED 1 -#define SPH_DETECT_LITTLE_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u64 -#ifdef __GNUC__ -#define SPH_DETECT_AMD64_GCC 1 -#endif -#ifdef _MSC_VER -#define SPH_DETECT_AMD64_MSVC 1 -#endif - -/* - * 64-bit Sparc architecture (implies v9). - */ -#elif ((defined __sparc__ || defined __sparc) && defined __arch64__) || defined __sparcv9 - -#define SPH_DETECT_BIG_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u64 -#ifdef __GNUC__ -#define SPH_DETECT_SPARCV9_GCC_64 1 -#define SPH_DETECT_LITTLE_FAST 1 -#endif - -/* - * 32-bit Sparc. - */ -#elif (defined __sparc__ || defined __sparc) && !(defined __sparcv9 || defined __arch64__) - -#define SPH_DETECT_BIG_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u32 -#if defined __GNUC__ && defined __sparc_v9__ -#define SPH_DETECT_SPARCV9_GCC_32 1 -#define SPH_DETECT_LITTLE_FAST 1 -#endif - -/* - * ARM, little-endian. - */ -#elif defined __arm__ && __ARMEL__ - -#define SPH_DETECT_LITTLE_ENDIAN 1 - -/* - * MIPS, little-endian. - */ -#elif MIPSEL || _MIPSEL || __MIPSEL || __MIPSEL__ - -#define SPH_DETECT_LITTLE_ENDIAN 1 - -/* - * MIPS, big-endian. - */ -#elif MIPSEB || _MIPSEB || __MIPSEB || __MIPSEB__ - -#define SPH_DETECT_BIG_ENDIAN 1 - -/* - * PowerPC. - */ -#elif defined __powerpc__ || defined __POWERPC__ || defined __ppc__ || defined _ARCH_PPC - -/* - * Note: we do not declare cross-endian access to be "fast": even if - * using inline assembly, implementation should still assume that - * keeping the decoded word in a temporary is faster than decoding - * it again. - */ -#if defined __GNUC__ -#if SPH_64_TRUE -#define SPH_DETECT_PPC64_GCC 1 -#else -#define SPH_DETECT_PPC32_GCC 1 -#endif -#endif - -#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN -#define SPH_DETECT_BIG_ENDIAN 1 -#elif defined __LITTLE_ENDIAN__ || defined _LITTLE_ENDIAN -#define SPH_DETECT_LITTLE_ENDIAN 1 -#endif - -/* - * Itanium, 64-bit. 
- */ -#elif defined __ia64 || defined __ia64__ || defined __itanium__ || defined _M_IA64 - -#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN -#define SPH_DETECT_BIG_ENDIAN 1 -#else -#define SPH_DETECT_LITTLE_ENDIAN 1 -#endif -#if defined __LP64__ || defined _LP64 -#define SPH_DETECT_UPTR sph_u64 -#else -#define SPH_DETECT_UPTR sph_u32 -#endif - -#endif - -#if defined SPH_DETECT_SPARCV9_GCC_32 || defined SPH_DETECT_SPARCV9_GCC_64 -#define SPH_DETECT_SPARCV9_GCC 1 -#endif - -#if defined SPH_DETECT_UNALIGNED && !defined SPH_UNALIGNED -#define SPH_UNALIGNED SPH_DETECT_UNALIGNED -#endif -#if defined SPH_DETECT_UPTR && !defined SPH_UPTR -#define SPH_UPTR SPH_DETECT_UPTR -#endif -#if defined SPH_DETECT_LITTLE_ENDIAN && !defined SPH_LITTLE_ENDIAN -#define SPH_LITTLE_ENDIAN SPH_DETECT_LITTLE_ENDIAN -#endif -#if defined SPH_DETECT_BIG_ENDIAN && !defined SPH_BIG_ENDIAN -#define SPH_BIG_ENDIAN SPH_DETECT_BIG_ENDIAN -#endif -#if defined SPH_DETECT_LITTLE_FAST && !defined SPH_LITTLE_FAST -#define SPH_LITTLE_FAST SPH_DETECT_LITTLE_FAST -#endif -#if defined SPH_DETECT_BIG_FAST && !defined SPH_BIG_FAST -#define SPH_BIG_FAST SPH_DETECT_BIG_FAST -#endif -#if defined SPH_DETECT_SPARCV9_GCC_32 && !defined SPH_SPARCV9_GCC_32 -#define SPH_SPARCV9_GCC_32 SPH_DETECT_SPARCV9_GCC_32 -#endif -#if defined SPH_DETECT_SPARCV9_GCC_64 && !defined SPH_SPARCV9_GCC_64 -#define SPH_SPARCV9_GCC_64 SPH_DETECT_SPARCV9_GCC_64 -#endif -#if defined SPH_DETECT_SPARCV9_GCC && !defined SPH_SPARCV9_GCC -#define SPH_SPARCV9_GCC SPH_DETECT_SPARCV9_GCC -#endif -#if defined SPH_DETECT_I386_GCC && !defined SPH_I386_GCC -#define SPH_I386_GCC SPH_DETECT_I386_GCC -#endif -#if defined SPH_DETECT_I386_MSVC && !defined SPH_I386_MSVC -#define SPH_I386_MSVC SPH_DETECT_I386_MSVC -#endif -#if defined SPH_DETECT_AMD64_GCC && !defined SPH_AMD64_GCC -#define SPH_AMD64_GCC SPH_DETECT_AMD64_GCC -#endif -#if defined SPH_DETECT_AMD64_MSVC && !defined SPH_AMD64_MSVC -#define SPH_AMD64_MSVC SPH_DETECT_AMD64_MSVC -#endif -#if defined SPH_DETECT_PPC32_GCC && !defined SPH_PPC32_GCC -#define SPH_PPC32_GCC SPH_DETECT_PPC32_GCC -#endif -#if defined SPH_DETECT_PPC64_GCC && !defined SPH_PPC64_GCC -#define SPH_PPC64_GCC SPH_DETECT_PPC64_GCC -#endif - -#if SPH_LITTLE_ENDIAN && !defined SPH_LITTLE_FAST -#define SPH_LITTLE_FAST 1 -#endif -#if SPH_BIG_ENDIAN && !defined SPH_BIG_FAST -#define SPH_BIG_FAST 1 -#endif - -#if defined SPH_UPTR && !(SPH_LITTLE_ENDIAN || SPH_BIG_ENDIAN) -#error SPH_UPTR defined, but endianness is not known. -#endif - -#if SPH_I386_GCC && !SPH_NO_ASM - -/* - * On x86 32-bit, with gcc, we use the bswapl opcode to byte-swap 32-bit - * values. - */ - -static SPH_INLINE sph_u32 sph_bswap32(sph_u32 x) { - __asm__ __volatile__("bswapl %0" : "=r"(x) : "0"(x)); - return x; -} - -#if SPH_64 - -static SPH_INLINE sph_u64 sph_bswap64(sph_u64 x) { - return ((sph_u64)sph_bswap32((sph_u32)x) << 32) | (sph_u64)sph_bswap32((sph_u32)(x >> 32)); -} - -#endif - -#elif SPH_AMD64_GCC && !SPH_NO_ASM - -/* - * On x86 64-bit, with gcc, we use the bswapl opcode to byte-swap 32-bit - * and 64-bit values. - */ - -static SPH_INLINE sph_u32 sph_bswap32(sph_u32 x) { - __asm__ __volatile__("bswapl %0" : "=r"(x) : "0"(x)); - return x; -} - -#if SPH_64 - -static SPH_INLINE sph_u64 sph_bswap64(sph_u64 x) { - __asm__ __volatile__("bswapq %0" : "=r"(x) : "0"(x)); - return x; -} - -#endif - -/* - * Disabled code. Apparently, Microsoft Visual C 2005 is smart enough - * to generate proper opcodes for endianness swapping with the pure C - * implementation below. 
- * - -#elif SPH_I386_MSVC && !SPH_NO_ASM - -static __inline sph_u32 __declspec(naked) __fastcall -sph_bswap32(sph_u32 x) -{ - __asm { - bswap ecx - mov eax,ecx - ret - } -} - -#if SPH_64 - -static SPH_INLINE sph_u64 -sph_bswap64(sph_u64 x) -{ - return ((sph_u64)sph_bswap32((sph_u32)x) << 32) - | (sph_u64)sph_bswap32((sph_u32)(x >> 32)); -} - -#endif - - * - * [end of disabled code] - */ - -#else - -static SPH_INLINE sph_u32 sph_bswap32(sph_u32 x) { - x = SPH_T32((x << 16) | (x >> 16)); - x = ((x & SPH_C32(0xFF00FF00)) >> 8) | ((x & SPH_C32(0x00FF00FF)) << 8); - return x; -} - -#if SPH_64 - -/** - * Byte-swap a 64-bit value. - * - * @param x the input value - * @return the byte-swapped value - */ -static SPH_INLINE sph_u64 sph_bswap64(sph_u64 x) { - x = SPH_T64((x << 32) | (x >> 32)); - x = ((x & SPH_C64(0xFFFF0000FFFF0000)) >> 16) | ((x & SPH_C64(0x0000FFFF0000FFFF)) << 16); - x = ((x & SPH_C64(0xFF00FF00FF00FF00)) >> 8) | ((x & SPH_C64(0x00FF00FF00FF00FF)) << 8); - return x; -} - -#endif - -#endif - -#if SPH_SPARCV9_GCC && !SPH_NO_ASM - -/* - * On UltraSPARC systems, native ordering is big-endian, but it is - * possible to perform little-endian read accesses by specifying the - * address space 0x88 (ASI_PRIMARY_LITTLE). Basically, either we use - * the opcode "lda [%reg]0x88,%dst", where %reg is the register which - * contains the source address and %dst is the destination register, - * or we use "lda [%reg+imm]%asi,%dst", which uses the %asi register - * to get the address space name. The latter format is better since it - * combines an addition and the actual access in a single opcode; but - * it requires the setting (and subsequent resetting) of %asi, which is - * slow. Some operations (i.e. MD5 compression function) combine many - * successive little-endian read accesses, which may share the same - * %asi setting. The macros below contain the appropriate inline - * assembly. - */ - -#define SPH_SPARCV9_SET_ASI \ - sph_u32 sph_sparcv9_asi; \ - __asm__ __volatile__("rd %%asi,%0\n\twr %%g0,0x88,%%asi" : "=r"(sph_sparcv9_asi)); - -#define SPH_SPARCV9_RESET_ASI __asm__ __volatile__("wr %%g0,%0,%%asi" : : "r"(sph_sparcv9_asi)); - -#define SPH_SPARCV9_DEC32LE(base, idx) \ - ({ \ - sph_u32 sph_sparcv9_tmp; \ - __asm__ __volatile__("lda [%1+" #idx "*4]%%asi,%0" : "=r"(sph_sparcv9_tmp) : "r"(base)); \ - sph_sparcv9_tmp; \ - }) - -#endif - -static SPH_INLINE void sph_enc16be(void* dst, unsigned val) { - ((unsigned char*)dst)[0] = (val >> 8); - ((unsigned char*)dst)[1] = val; -} - -static SPH_INLINE unsigned sph_dec16be(const void* src) { - return ((unsigned)(((const unsigned char*)src)[0]) << 8) | - (unsigned)(((const unsigned char*)src)[1]); -} - -static SPH_INLINE void sph_enc16le(void* dst, unsigned val) { - ((unsigned char*)dst)[0] = val; - ((unsigned char*)dst)[1] = val >> 8; -} - -static SPH_INLINE unsigned sph_dec16le(const void* src) { - return (unsigned)(((const unsigned char*)src)[0]) | - ((unsigned)(((const unsigned char*)src)[1]) << 8); -} - -/** - * Encode a 32-bit value into the provided buffer (big endian convention). 
- * - * @param dst the destination buffer - * @param val the 32-bit value to encode - */ -static SPH_INLINE void sph_enc32be(void* dst, sph_u32 val) { -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_LITTLE_ENDIAN - val = sph_bswap32(val); -#endif - *(sph_u32*)dst = val; -#else - if (((SPH_UPTR)dst & 3) == 0) { -#if SPH_LITTLE_ENDIAN - val = sph_bswap32(val); -#endif - *(sph_u32*)dst = val; - } else { - ((unsigned char*)dst)[0] = (val >> 24); - ((unsigned char*)dst)[1] = (val >> 16); - ((unsigned char*)dst)[2] = (val >> 8); - ((unsigned char*)dst)[3] = val; - } -#endif -#else - ((unsigned char*)dst)[0] = (val >> 24); - ((unsigned char*)dst)[1] = (val >> 16); - ((unsigned char*)dst)[2] = (val >> 8); - ((unsigned char*)dst)[3] = val; -#endif -} - -/** - * Encode a 32-bit value into the provided buffer (big endian convention). - * The destination buffer must be properly aligned. - * - * @param dst the destination buffer (32-bit aligned) - * @param val the value to encode - */ -static SPH_INLINE void sph_enc32be_aligned(void* dst, sph_u32 val) { -#if SPH_LITTLE_ENDIAN - *(sph_u32*)dst = sph_bswap32(val); -#elif SPH_BIG_ENDIAN - *(sph_u32*)dst = val; -#else - ((unsigned char*)dst)[0] = (val >> 24); - ((unsigned char*)dst)[1] = (val >> 16); - ((unsigned char*)dst)[2] = (val >> 8); - ((unsigned char*)dst)[3] = val; -#endif -} - -/** - * Decode a 32-bit value from the provided buffer (big endian convention). - * - * @param src the source buffer - * @return the decoded value - */ -static SPH_INLINE sph_u32 sph_dec32be(const void* src) { -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_LITTLE_ENDIAN - return sph_bswap32(*(const sph_u32*)src); -#else - return *(const sph_u32*)src; -#endif -#else - if (((SPH_UPTR)src & 3) == 0) { -#if SPH_LITTLE_ENDIAN - return sph_bswap32(*(const sph_u32*)src); -#else - return *(const sph_u32*)src; -#endif - } else { - return ((sph_u32)(((const unsigned char*)src)[0]) << 24) | - ((sph_u32)(((const unsigned char*)src)[1]) << 16) | - ((sph_u32)(((const unsigned char*)src)[2]) << 8) | - (sph_u32)(((const unsigned char*)src)[3]); - } -#endif -#else - return ((sph_u32)(((const unsigned char*)src)[0]) << 24) | - ((sph_u32)(((const unsigned char*)src)[1]) << 16) | - ((sph_u32)(((const unsigned char*)src)[2]) << 8) | - (sph_u32)(((const unsigned char*)src)[3]); -#endif -} - -/** - * Decode a 32-bit value from the provided buffer (big endian convention). - * The source buffer must be properly aligned. - * - * @param src the source buffer (32-bit aligned) - * @return the decoded value - */ -static SPH_INLINE sph_u32 sph_dec32be_aligned(const void* src) { -#if SPH_LITTLE_ENDIAN - return sph_bswap32(*(const sph_u32*)src); -#elif SPH_BIG_ENDIAN - return *(const sph_u32*)src; -#else - return ((sph_u32)(((const unsigned char*)src)[0]) << 24) | - ((sph_u32)(((const unsigned char*)src)[1]) << 16) | - ((sph_u32)(((const unsigned char*)src)[2]) << 8) | - (sph_u32)(((const unsigned char*)src)[3]); -#endif -} - -/** - * Encode a 32-bit value into the provided buffer (little endian convention). 
- * - * @param dst the destination buffer - * @param val the 32-bit value to encode - */ -static SPH_INLINE void sph_enc32le(void* dst, sph_u32 val) { -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_BIG_ENDIAN - val = sph_bswap32(val); -#endif - *(sph_u32*)dst = val; -#else - if (((SPH_UPTR)dst & 3) == 0) { -#if SPH_BIG_ENDIAN - val = sph_bswap32(val); -#endif - *(sph_u32*)dst = val; - } else { - ((unsigned char*)dst)[0] = val; - ((unsigned char*)dst)[1] = (val >> 8); - ((unsigned char*)dst)[2] = (val >> 16); - ((unsigned char*)dst)[3] = (val >> 24); - } -#endif -#else - ((unsigned char*)dst)[0] = val; - ((unsigned char*)dst)[1] = (val >> 8); - ((unsigned char*)dst)[2] = (val >> 16); - ((unsigned char*)dst)[3] = (val >> 24); -#endif -} - -/** - * Encode a 32-bit value into the provided buffer (little endian convention). - * The destination buffer must be properly aligned. - * - * @param dst the destination buffer (32-bit aligned) - * @param val the value to encode - */ -static SPH_INLINE void sph_enc32le_aligned(void* dst, sph_u32 val) { -#if SPH_LITTLE_ENDIAN - *(sph_u32*)dst = val; -#elif SPH_BIG_ENDIAN - *(sph_u32*)dst = sph_bswap32(val); -#else - ((unsigned char*)dst)[0] = val; - ((unsigned char*)dst)[1] = (val >> 8); - ((unsigned char*)dst)[2] = (val >> 16); - ((unsigned char*)dst)[3] = (val >> 24); -#endif -} - -/** - * Decode a 32-bit value from the provided buffer (little endian convention). - * - * @param src the source buffer - * @return the decoded value - */ -static SPH_INLINE sph_u32 sph_dec32le(const void* src) { -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_BIG_ENDIAN - return sph_bswap32(*(const sph_u32*)src); -#else - return *(const sph_u32*)src; -#endif -#else - if (((SPH_UPTR)src & 3) == 0) { -#if SPH_BIG_ENDIAN -#if SPH_SPARCV9_GCC && !SPH_NO_ASM - sph_u32 tmp; - - /* - * "__volatile__" is needed here because without it, - * gcc-3.4.3 miscompiles the code and performs the - * access before the test on the address, thus triggering - * a bus error... - */ - __asm__ __volatile__("lda [%1]0x88,%0" : "=r"(tmp) : "r"(src)); - return tmp; -/* - * On PowerPC, this turns out not to be worth the effort: the inline - * assembly makes GCC optimizer uncomfortable, which tends to nullify - * the decoding gains. - * - * For most hash functions, using this inline assembly trick changes - * hashing speed by less than 5% and often _reduces_ it. The biggest - * gains are for MD4 (+11%) and CubeHash (+30%). For all others, it is - * less then 10%. The speed gain on CubeHash is probably due to the - * chronic shortage of registers that CubeHash endures; for the other - * functions, the generic code appears to be efficient enough already. - * -#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM - sph_u32 tmp; - - __asm__ __volatile__ ( - "lwbrx %0,0,%1" : "=r" (tmp) : "r" (src)); - return tmp; - */ -#else - return sph_bswap32(*(const sph_u32*)src); -#endif -#else - return *(const sph_u32*)src; -#endif - } else { - return (sph_u32)(((const unsigned char*)src)[0]) | - ((sph_u32)(((const unsigned char*)src)[1]) << 8) | - ((sph_u32)(((const unsigned char*)src)[2]) << 16) | - ((sph_u32)(((const unsigned char*)src)[3]) << 24); - } -#endif -#else - return (sph_u32)(((const unsigned char*)src)[0]) | - ((sph_u32)(((const unsigned char*)src)[1]) << 8) | - ((sph_u32)(((const unsigned char*)src)[2]) << 16) | - ((sph_u32)(((const unsigned char*)src)[3]) << 24); -#endif -} - -/** - * Decode a 32-bit value from the provided buffer (little endian convention). 
- * The source buffer must be properly aligned. - * - * @param src the source buffer (32-bit aligned) - * @return the decoded value - */ -static SPH_INLINE sph_u32 sph_dec32le_aligned(const void* src) { -#if SPH_LITTLE_ENDIAN - return *(const sph_u32*)src; -#elif SPH_BIG_ENDIAN -#if SPH_SPARCV9_GCC && !SPH_NO_ASM - sph_u32 tmp; - - __asm__ __volatile__("lda [%1]0x88,%0" : "=r"(tmp) : "r"(src)); - return tmp; -/* - * Not worth it generally. - * -#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM - sph_u32 tmp; - - __asm__ __volatile__ ("lwbrx %0,0,%1" : "=r" (tmp) : "r" (src)); - return tmp; - */ -#else - return sph_bswap32(*(const sph_u32*)src); -#endif -#else - return (sph_u32)(((const unsigned char*)src)[0]) | - ((sph_u32)(((const unsigned char*)src)[1]) << 8) | - ((sph_u32)(((const unsigned char*)src)[2]) << 16) | - ((sph_u32)(((const unsigned char*)src)[3]) << 24); -#endif -} - -#if SPH_64 - -/** - * Encode a 64-bit value into the provided buffer (big endian convention). - * - * @param dst the destination buffer - * @param val the 64-bit value to encode - */ -static SPH_INLINE void sph_enc64be(void* dst, sph_u64 val) { -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_LITTLE_ENDIAN - val = sph_bswap64(val); -#endif - *(sph_u64*)dst = val; -#else - if (((SPH_UPTR)dst & 7) == 0) { -#if SPH_LITTLE_ENDIAN - val = sph_bswap64(val); -#endif - *(sph_u64*)dst = val; - } else { - ((unsigned char*)dst)[0] = (val >> 56); - ((unsigned char*)dst)[1] = (val >> 48); - ((unsigned char*)dst)[2] = (val >> 40); - ((unsigned char*)dst)[3] = (val >> 32); - ((unsigned char*)dst)[4] = (val >> 24); - ((unsigned char*)dst)[5] = (val >> 16); - ((unsigned char*)dst)[6] = (val >> 8); - ((unsigned char*)dst)[7] = val; - } -#endif -#else - ((unsigned char*)dst)[0] = (val >> 56); - ((unsigned char*)dst)[1] = (val >> 48); - ((unsigned char*)dst)[2] = (val >> 40); - ((unsigned char*)dst)[3] = (val >> 32); - ((unsigned char*)dst)[4] = (val >> 24); - ((unsigned char*)dst)[5] = (val >> 16); - ((unsigned char*)dst)[6] = (val >> 8); - ((unsigned char*)dst)[7] = val; -#endif -} - -/** - * Encode a 64-bit value into the provided buffer (big endian convention). - * The destination buffer must be properly aligned. - * - * @param dst the destination buffer (64-bit aligned) - * @param val the value to encode - */ -static SPH_INLINE void sph_enc64be_aligned(void* dst, sph_u64 val) { -#if SPH_LITTLE_ENDIAN - *(sph_u64*)dst = sph_bswap64(val); -#elif SPH_BIG_ENDIAN - *(sph_u64*)dst = val; -#else - ((unsigned char*)dst)[0] = (val >> 56); - ((unsigned char*)dst)[1] = (val >> 48); - ((unsigned char*)dst)[2] = (val >> 40); - ((unsigned char*)dst)[3] = (val >> 32); - ((unsigned char*)dst)[4] = (val >> 24); - ((unsigned char*)dst)[5] = (val >> 16); - ((unsigned char*)dst)[6] = (val >> 8); - ((unsigned char*)dst)[7] = val; -#endif -} - -/** - * Decode a 64-bit value from the provided buffer (big endian convention). 
- * - * @param src the source buffer - * @return the decoded value - */ -static SPH_INLINE sph_u64 sph_dec64be(const void* src) { -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_LITTLE_ENDIAN - return sph_bswap64(*(const sph_u64*)src); -#else - return *(const sph_u64*)src; -#endif -#else - if (((SPH_UPTR)src & 7) == 0) { -#if SPH_LITTLE_ENDIAN - return sph_bswap64(*(const sph_u64*)src); -#else - return *(const sph_u64*)src; -#endif - } else { - return ((sph_u64)(((const unsigned char*)src)[0]) << 56) | - ((sph_u64)(((const unsigned char*)src)[1]) << 48) | - ((sph_u64)(((const unsigned char*)src)[2]) << 40) | - ((sph_u64)(((const unsigned char*)src)[3]) << 32) | - ((sph_u64)(((const unsigned char*)src)[4]) << 24) | - ((sph_u64)(((const unsigned char*)src)[5]) << 16) | - ((sph_u64)(((const unsigned char*)src)[6]) << 8) | - (sph_u64)(((const unsigned char*)src)[7]); - } -#endif -#else - return ((sph_u64)(((const unsigned char*)src)[0]) << 56) | - ((sph_u64)(((const unsigned char*)src)[1]) << 48) | - ((sph_u64)(((const unsigned char*)src)[2]) << 40) | - ((sph_u64)(((const unsigned char*)src)[3]) << 32) | - ((sph_u64)(((const unsigned char*)src)[4]) << 24) | - ((sph_u64)(((const unsigned char*)src)[5]) << 16) | - ((sph_u64)(((const unsigned char*)src)[6]) << 8) | - (sph_u64)(((const unsigned char*)src)[7]); -#endif -} - -/** - * Decode a 64-bit value from the provided buffer (big endian convention). - * The source buffer must be properly aligned. - * - * @param src the source buffer (64-bit aligned) - * @return the decoded value - */ -static SPH_INLINE sph_u64 sph_dec64be_aligned(const void* src) { -#if SPH_LITTLE_ENDIAN - return sph_bswap64(*(const sph_u64*)src); -#elif SPH_BIG_ENDIAN - return *(const sph_u64*)src; -#else - return ((sph_u64)(((const unsigned char*)src)[0]) << 56) | - ((sph_u64)(((const unsigned char*)src)[1]) << 48) | - ((sph_u64)(((const unsigned char*)src)[2]) << 40) | - ((sph_u64)(((const unsigned char*)src)[3]) << 32) | - ((sph_u64)(((const unsigned char*)src)[4]) << 24) | - ((sph_u64)(((const unsigned char*)src)[5]) << 16) | - ((sph_u64)(((const unsigned char*)src)[6]) << 8) | - (sph_u64)(((const unsigned char*)src)[7]); -#endif -} - -/** - * Encode a 64-bit value into the provided buffer (little endian convention). - * - * @param dst the destination buffer - * @param val the 64-bit value to encode - */ -static SPH_INLINE void sph_enc64le(void* dst, sph_u64 val) { -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_BIG_ENDIAN - val = sph_bswap64(val); -#endif - *(sph_u64*)dst = val; -#else - if (((SPH_UPTR)dst & 7) == 0) { -#if SPH_BIG_ENDIAN - val = sph_bswap64(val); -#endif - *(sph_u64*)dst = val; - } else { - ((unsigned char*)dst)[0] = val; - ((unsigned char*)dst)[1] = (val >> 8); - ((unsigned char*)dst)[2] = (val >> 16); - ((unsigned char*)dst)[3] = (val >> 24); - ((unsigned char*)dst)[4] = (val >> 32); - ((unsigned char*)dst)[5] = (val >> 40); - ((unsigned char*)dst)[6] = (val >> 48); - ((unsigned char*)dst)[7] = (val >> 56); - } -#endif -#else - ((unsigned char*)dst)[0] = val; - ((unsigned char*)dst)[1] = (val >> 8); - ((unsigned char*)dst)[2] = (val >> 16); - ((unsigned char*)dst)[3] = (val >> 24); - ((unsigned char*)dst)[4] = (val >> 32); - ((unsigned char*)dst)[5] = (val >> 40); - ((unsigned char*)dst)[6] = (val >> 48); - ((unsigned char*)dst)[7] = (val >> 56); -#endif -} - -/** - * Encode a 64-bit value into the provided buffer (little endian convention). - * The destination buffer must be properly aligned. 
- * - * @param dst the destination buffer (64-bit aligned) - * @param val the value to encode - */ -static SPH_INLINE void sph_enc64le_aligned(void* dst, sph_u64 val) { -#if SPH_LITTLE_ENDIAN - *(sph_u64*)dst = val; -#elif SPH_BIG_ENDIAN - *(sph_u64*)dst = sph_bswap64(val); -#else - ((unsigned char*)dst)[0] = val; - ((unsigned char*)dst)[1] = (val >> 8); - ((unsigned char*)dst)[2] = (val >> 16); - ((unsigned char*)dst)[3] = (val >> 24); - ((unsigned char*)dst)[4] = (val >> 32); - ((unsigned char*)dst)[5] = (val >> 40); - ((unsigned char*)dst)[6] = (val >> 48); - ((unsigned char*)dst)[7] = (val >> 56); -#endif -} - -/** - * Decode a 64-bit value from the provided buffer (little endian convention). - * - * @param src the source buffer - * @return the decoded value - */ -static SPH_INLINE sph_u64 sph_dec64le(const void* src) { -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_BIG_ENDIAN - return sph_bswap64(*(const sph_u64*)src); -#else - return *(const sph_u64*)src; -#endif -#else - if (((SPH_UPTR)src & 7) == 0) { -#if SPH_BIG_ENDIAN -#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM - sph_u64 tmp; - - __asm__ __volatile__("ldxa [%1]0x88,%0" : "=r"(tmp) : "r"(src)); - return tmp; -/* - * Not worth it generally. - * -#elif SPH_PPC32_GCC && !SPH_NO_ASM - return (sph_u64)sph_dec32le_aligned(src) - | ((sph_u64)sph_dec32le_aligned( - (const char *)src + 4) << 32); -#elif SPH_PPC64_GCC && !SPH_NO_ASM - sph_u64 tmp; - - __asm__ __volatile__ ( - "ldbrx %0,0,%1" : "=r" (tmp) : "r" (src)); - return tmp; - */ -#else - return sph_bswap64(*(const sph_u64*)src); -#endif -#else - return *(const sph_u64*)src; -#endif - } else { - return (sph_u64)(((const unsigned char*)src)[0]) | - ((sph_u64)(((const unsigned char*)src)[1]) << 8) | - ((sph_u64)(((const unsigned char*)src)[2]) << 16) | - ((sph_u64)(((const unsigned char*)src)[3]) << 24) | - ((sph_u64)(((const unsigned char*)src)[4]) << 32) | - ((sph_u64)(((const unsigned char*)src)[5]) << 40) | - ((sph_u64)(((const unsigned char*)src)[6]) << 48) | - ((sph_u64)(((const unsigned char*)src)[7]) << 56); - } -#endif -#else - return (sph_u64)(((const unsigned char*)src)[0]) | - ((sph_u64)(((const unsigned char*)src)[1]) << 8) | - ((sph_u64)(((const unsigned char*)src)[2]) << 16) | - ((sph_u64)(((const unsigned char*)src)[3]) << 24) | - ((sph_u64)(((const unsigned char*)src)[4]) << 32) | - ((sph_u64)(((const unsigned char*)src)[5]) << 40) | - ((sph_u64)(((const unsigned char*)src)[6]) << 48) | - ((sph_u64)(((const unsigned char*)src)[7]) << 56); -#endif -} - -/** - * Decode a 64-bit value from the provided buffer (little endian convention). - * The source buffer must be properly aligned. - * - * @param src the source buffer (64-bit aligned) - * @return the decoded value - */ -static SPH_INLINE sph_u64 sph_dec64le_aligned(const void* src) { -#if SPH_LITTLE_ENDIAN - return *(const sph_u64*)src; -#elif SPH_BIG_ENDIAN -#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM - sph_u64 tmp; - - __asm__ __volatile__("ldxa [%1]0x88,%0" : "=r"(tmp) : "r"(src)); - return tmp; -/* - * Not worth it generally. 
- * -#elif SPH_PPC32_GCC && !SPH_NO_ASM - return (sph_u64)sph_dec32le_aligned(src) - | ((sph_u64)sph_dec32le_aligned((const char *)src + 4) << 32); -#elif SPH_PPC64_GCC && !SPH_NO_ASM - sph_u64 tmp; - - __asm__ __volatile__ ("ldbrx %0,0,%1" : "=r" (tmp) : "r" (src)); - return tmp; - */ -#else - return sph_bswap64(*(const sph_u64*)src); -#endif -#else - return (sph_u64)(((const unsigned char*)src)[0]) | - ((sph_u64)(((const unsigned char*)src)[1]) << 8) | - ((sph_u64)(((const unsigned char*)src)[2]) << 16) | - ((sph_u64)(((const unsigned char*)src)[3]) << 24) | - ((sph_u64)(((const unsigned char*)src)[4]) << 32) | - ((sph_u64)(((const unsigned char*)src)[5]) << 40) | - ((sph_u64)(((const unsigned char*)src)[6]) << 48) | - ((sph_u64)(((const unsigned char*)src)[7]) << 56); -#endif -} - -#endif - -#endif /* Doxygen excluded block */ - -#endif diff --git a/src/lib.rs b/src/lib.rs index 65680a3..6f902bd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,5 @@ -use libc::{c_void, uint64_t}; +extern crate libc; +use libc::{c_char, uint64_t}; #[macro_use] extern crate cfg_if; @@ -6,9 +7,9 @@ use std::u64; extern "C" { pub fn find_best_deadline_sph( - scoops: *mut c_void, + scoops: *const c_char, nonce_count: uint64_t, - gensig: *const c_void, + gensig: *const c_char, best_deadline: *mut uint64_t, best_offset: *mut uint64_t, ) -> (); @@ -17,34 +18,38 @@ extern "C" { cfg_if! { if #[cfg(feature = "simd")] { extern "C" { + pub fn init_shabal_avx512f() -> (); pub fn find_best_deadline_avx512f( - scoops: *mut c_void, + scoops: *const c_char, nonce_count: uint64_t, - gensig: *const c_void, + gensig: *const c_char, best_deadline: *mut uint64_t, best_offset: *mut uint64_t, ) -> (); + pub fn init_shabal_avx2() -> (); pub fn find_best_deadline_avx2( - scoops: *mut c_void, + scoops: *const c_char, nonce_count: uint64_t, - gensig: *const c_void, + gensig: *const c_char, best_deadline: *mut uint64_t, best_offset: *mut uint64_t, ) -> (); + pub fn init_shabal_avx() -> (); pub fn find_best_deadline_avx( - scoops: *mut c_void, + scoops: *const c_char, nonce_count: uint64_t, - gensig: *const c_void, + gensig: *const c_char, best_deadline: *mut uint64_t, best_offset: *mut uint64_t, ) -> (); + pub fn init_shabal_sse2() -> (); pub fn find_best_deadline_sse2( - scoops: *mut c_void, + scoops: *const c_char, nonce_count: uint64_t, - gensig: *const c_void, + gensig: *const c_char, best_deadline: *mut uint64_t, best_offset: *mut uint64_t, ) -> (); @@ -55,10 +60,11 @@ cfg_if! { cfg_if! { if #[cfg(feature = "neon")] { extern "C" { + pub fn init_shabal_neon() -> (); pub fn find_best_deadline_neon( - scoops: *mut c_void, + scoops: *const c_char, nonce_count: uint64_t, - gensig: *const c_void, + gensig: *const c_char, best_deadline: *mut uint64_t, best_offset: *mut uint64_t, ) -> (); @@ -67,10 +73,10 @@ cfg_if! 
{ } #[no_mangle] -pub extern fn find_best_deadline( - scoops: *mut c_void, +pub extern fn shabal_findBestDeadlineDirect( + scoops: *const c_char, nonce_count: uint64_t, - gensig: *const c_void, + gensig: *const c_char, best_deadline: *mut uint64_t, best_offset: *mut uint64_t, ) { @@ -155,14 +161,40 @@ pub extern fn find_best_deadline( } #[no_mangle] -pub extern fn find_best_deadline_assisted( - scoops: *mut c_void, +pub extern fn shabal_init() { + #[cfg(feature = "simd")] + unsafe { + if is_x86_feature_detected!("avx512f") { + init_shabal_avx512f(); + } else if is_x86_feature_detected!("avx2") { + init_shabal_avx2(); + } else if is_x86_feature_detected!("avx") { + init_shabal_avx(); + } else if is_x86_feature_detected!("sse2") { + init_shabal_sse2(); + } + } + #[cfg(feature = "neon")] + unsafe { + #[cfg(target_arch = "arm")] + let neon = is_arm_feature_detected!("neon"); + #[cfg(target_arch = "aarch64")] + let neon = true; + + if neon { + init_shabal_neon(); + } + } +} + +#[no_mangle] +pub extern fn shabal_findBestDeadline( + scoops: *const c_char, nonce_count: uint64_t, - gensig: *const c_void, + gensig: *const c_char, ) -> uint64_t { let mut deadline: u64 = u64::MAX; let mut offset: u64 = 0; - find_best_deadline(scoops, nonce_count, gensig, &mut deadline, &mut offset); - println!("scoop length is {}, best deadline is {}, best offset is {}", deadline, offset); + shabal_findBestDeadlineDirect(scoops, nonce_count, gensig, &mut deadline, &mut offset); return offset; }
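
Usage note (not part of the patch): the sketch below shows how a C consumer might drive the renamed exports once this change lands. It is illustrative only; the 64-byte scoop and 32-byte generation-signature sizes are assumptions taken from Burst plot conventions, not something this diff defines.

/* Hypothetical caller of the library built from this crate. */
#include <stdint.h>
#include <stdio.h>

/* C-ABI exports declared with #[no_mangle] in src/lib.rs above. */
extern void shabal_init(void);
extern uint64_t shabal_findBestDeadline(const char *scoops,
                                        uint64_t nonce_count,
                                        const char *gensig);

int main(void) {
    /* Assumed sizes: 64 bytes per scoop, 32-byte generation signature.
     * Static storage, so both buffers start zero-filled for the demo. */
    static char scoops[4 * 64];
    static char gensig[32];

    shabal_init(); /* one-time dispatch: AVX-512F > AVX2 > AVX > SSE2, or NEON */
    uint64_t best_offset = shabal_findBestDeadline(scoops, 4, gensig);
    printf("best offset: %llu\n", (unsigned long long)best_offset);
    return 0;
}

Note that shabal_findBestDeadline returns only the best offset; a caller that also needs the deadline value itself should use shabal_findBestDeadlineDirect, which writes both results through its out-pointers.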