diff --git a/Makefile.am b/Makefile.am index 4da1cee2..74a8f0fd 100644 --- a/Makefile.am +++ b/Makefile.am @@ -16,6 +16,7 @@ bin_PROGRAMS = cpuminer dist_man_MANS = cpuminer.1 cpuminer_SOURCES = \ + dummy.cpp \ cpu-miner.c \ util.c \ api.c \ @@ -113,7 +114,6 @@ cpuminer_SOURCES = \ algo/lyra2/phi2-4way.c \ algo/lyra2/phi2.c \ algo/m7m/m7m.c \ - algo/m7m/magimath.cpp \ algo/nist5/nist5-gate.c \ algo/nist5/nist5-4way.c \ algo/nist5/nist5.c \ @@ -289,7 +289,7 @@ if HAVE_WINDOWS endif cpuminer_LDFLAGS = @LDFLAGS@ -cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp +cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES) cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 5ea62c97..5927c158 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -75,6 +75,15 @@ If not what makes it happen or not happen? Change Log ---------- +v24.1 + +#414: Fix bug in Merkle error handling. +#416: Change $nproc to $(nproc) in build scripts. +#420: Change some inline function definitions to static inline. +#413: Fix formatting error for share result log when using no-color. +Faster 2-way interleaving. +Clean up sha256 architecture targeting. + v23.15 Fixed x11gost (sib) algo for all architectures, broken in v3.23.4. diff --git a/algo-gate-api.h b/algo-gate-api.h index 045b306e..e9ac10e0 100644 --- a/algo-gate-api.h +++ b/algo-gate-api.h @@ -107,16 +107,16 @@ typedef uint32_t set_t; // AVX10_256 is compatible with AVX2 + VAES // return set containing all elements from sets a & b -inline set_t set_union ( set_t a, set_t b ) { return a | b; } +static inline set_t set_union ( set_t a, set_t b ) { return a | b; } // return set containing common elements from sets a & b -inline set_t set_intsec ( set_t a, set_t b) { return a & b; } +static inline set_t set_intsec ( set_t a, set_t b) { return a & b; } // all elements in set a are included in set b -inline bool set_incl ( set_t a, set_t b ) { return (a & b) == a; } +static inline bool set_incl ( set_t a, set_t b ) { return (a & b) == a; } // no elements in set a are included in set b -inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; } +static inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; } typedef struct { diff --git a/algo/m7m/m7m.c b/algo/m7m/m7m.c index f5660152..9098f983 100644 --- a/algo/m7m/m7m.c +++ b/algo/m7m/m7m.c @@ -21,7 +21,7 @@ #define EPS1 DBL_EPSILON #define EPS2 3.0e-11 -inline double exp_n( double xt ) +static inline double exp_n( double xt ) { if ( xt < -700.0 ) return 0; @@ -33,7 +33,7 @@ inline double exp_n( double xt ) return exp( xt ); } -inline double exp_n2( double x1, double x2 ) +static inline double exp_n2( double x1, double x2 ) { double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8, p5 = 37., p6 = 700.; diff --git a/algo/m7m/magimath.cpp b/algo/m7m/magimath.cpp deleted file mode 100644 index c8c64e23..00000000 --- a/algo/m7m/magimath.cpp +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (c) 2014 The Magi developers -// Distributed under the MIT/X11 software license, see the accompanying -// file COPYING or http://www.opensource.org/licenses/mit-license.php. 
- -#include -#include -#include -#include -#include -#include -#include - -#include "magimath.h" - -#define EPS1 (std::numeric_limits<double>::epsilon()) -#define EPS2 3.0e-11 - -static void gauleg(double x1, double x2, double x[], double w[], const int n) -{ - int m,j,i; - double z1, z, xm, xl, pp, p3, p2, p1; - m=(n+1)/2; - xm=0.5*(x2+x1); - xl=0.5*(x2-x1); - for (i=1;i<=m;i++) { - z=cos(3.141592654*(i-0.25)/(n+0.5)); - do { - p1=1.0; - p2=0.0; - for (j=1;j<=n;j++) { - p3=p2; - p2=p1; - p1=((2.0*j-1.0)*z*p2-(j-1.0)*p3)/j; - } - pp=n*(z*p1-p2)/(z*z-1.0); - z1=z; - z=z1-p1/pp; - } while (fabs(z-z1) > EPS2); - x[i]=xm-xl*z; - x[n+1-i]=xm+xl*z; - w[i]=2.0*xl/((1.0-z*z)*pp*pp); - w[n+1-i]=w[i]; - } -} - -static double GaussianQuad_N(double func(const double), const double a2, const double b2, const int NptGQ) -{ - double s=0.0; -#ifdef _MSC_VER -#define SW_DIVS 23 - double x[SW_DIVS+1], w[SW_DIVS+1]; -#else - double x[NptGQ+1], w[NptGQ+1]; -#endif - - gauleg(a2, b2, x, w, NptGQ); - - for (int j=1; j<=NptGQ; j++) { - s += w[j]*func(x[j]); - } - - return s; -} - -static double swit_(double wvnmb) -{ - return pow( (5.55243*(exp_n(-0.3*wvnmb/15.762) - exp_n(-0.6*wvnmb/15.762)))*wvnmb, 0.5) - / 1034.66 * pow(sin(wvnmb/65.), 2.); -} - -uint32_t sw_(int nnounce, int divs) -{ - double wmax = ((sqrt((double)(nnounce))*(1.+EPS1))/450+100); - return ((uint32_t)(GaussianQuad_N(swit_, 0., wmax, divs)*(1.+EPS1)*1.e6)); -} diff --git a/algo/m7m/magimath.h b/algo/m7m/magimath.h deleted file mode 100644 index b57eb806..00000000 --- a/algo/m7m/magimath.h +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2014 The Magi developers -// Distributed under the MIT/X11 software license, see the accompanying -// file COPYING or http://www.opensource.org/licenses/mit-license.php. -#ifndef MAGI_MATH_H -#define MAGI_MATH_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -uint32_t sw_(int nnounce, int divs); - -#ifdef __cplusplus -} -#endif - - -inline double exp_n(double xt) -{ - double p1 = -700.0, p3 = -0.8e-8, p4 = 0.8e-8, p6 = 700.0; - if(xt < p1) - return 0; - else if(xt > p6) - return 1e200; - else if(xt > p3 && xt < p4) - return (1.0 + xt); - else - return exp(xt); -} - -// 1 / (1 + exp(x1-x2)) -inline double exp_n2(double x1, double x2) -{ - double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8, p5 = 37., p6 = 700.; - double xt = x1 - x2; - if (xt < p1+1.e-200) - return 1.; - else if (xt > p1 && xt < p2 + 1.e-200) - return ( 1. - exp(xt) ); - else if (xt > p2 && xt < p3 + 1.e-200) - return ( 1. / (1. + exp(xt)) ); - else if (xt > p3 && xt < p4) - return ( 1. / (2. + xt) ); - else if (xt > p4 - 1.e-200 && xt < p5) - return ( exp(-xt) / (1. 
+ exp(-xt)) ); - else if (xt > p5 - 1.e-200 && xt < p6) - return ( exp(-xt) ); - else //if (xt > p6 - 1.e-200) - return 0.; -} - -#endif diff --git a/algo/sha/sha1-hash.c b/algo/sha/sha1-hash.c index 6f1928df..cfe02d30 100644 --- a/algo/sha/sha1-hash.c +++ b/algo/sha/sha1-hash.c @@ -205,7 +205,7 @@ void sha1_x86_sha_transform_be( uint32_t *state_out, const void *input, #endif -#if defined(__aarch64__) && defined(__ARM_FEATURE_SHA2) +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2) #define sha1_neon_rounds( state_out, data, state_in ) \ { \ diff --git a/algo/sha/sha256-hash.c b/algo/sha/sha256-hash.c index 51971101..e2c10e94 100644 --- a/algo/sha/sha256-hash.c +++ b/algo/sha/sha256-hash.c @@ -1,6 +1,6 @@ #include "sha256-hash.h" -#if ( defined(__x86_64__) && defined(__SHA__) ) || defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2) +#if ( defined(__x86_64__) && defined(__SHA__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2) ) static const uint32_t SHA256_IV[8] = { @@ -189,7 +189,7 @@ static const uint32_t SHA256_IV[8] = _mm_store_si128( (__m128i*) &state_out[4], STATE1 ); \ } -void sha256_opt_transform_le( uint32_t *state_out, const void *input, +void sha256_x86_sha_transform_le( uint32_t *state_out, const void *input, const uint32_t *state_in ) { #define load_msg( m, i ) casti_v128( m, i ) @@ -197,7 +197,7 @@ void sha256_opt_transform_le( uint32_t *state_out, const void *input, #undef load_msg } -void sha256_opt_transform_be( uint32_t *state_out, const void *input, +void sha256_x86_sha_transform_be( uint32_t *state_out, const void *input, const uint32_t *state_in ) { #define load_msg( m, i ) v128_bswap32( casti_v128( m, i ) ) @@ -517,7 +517,7 @@ void sha256_opt_transform_be( uint32_t *state_out, const void *input, _mm_store_si128( (__m128i*) &out_Y[4], STATE1_Y ); \ } -void sha256_ni2x_transform_le( uint32_t *out_X, uint32_t*out_Y, +void sha256_x86_x2sha_transform_le( uint32_t *out_X, uint32_t*out_Y, const void *msg_X, const void *msg_Y, const uint32_t *in_X, const uint32_t *in_Y ) { @@ -526,7 +526,7 @@ void sha256_ni2x_transform_le( uint32_t *out_X, uint32_t*out_Y, #undef load_msg } -void sha256_ni2x_transform_be( uint32_t *out_X, uint32_t*out_Y, +void sha256_x86_x2sha_transform_be( uint32_t *out_X, uint32_t*out_Y, const void *msg_X, const void *msg_Y, const uint32_t *in_X, const uint32_t *in_Y ) { @@ -541,7 +541,7 @@ void sha256_ni2x_transform_be( uint32_t *out_X, uint32_t*out_Y, // The goal is to avoid any redundant processing in final. Prehash is almost // 4 rounds total, only missing the final addition of the nonce. // Nonce must be set to zero for prehash. 
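// Editor's sketch (not part of the patch): the intended call pattern for the
// prehash/final-rounds pair, using the renamed functions declared below in
// sha256-hash.h. Buffer names, and the nonce sitting at word 3 of the final
// block, are illustrative assumptions.
//
//    uint32_t mid[8], save[8];
//    ((uint32_t*)block)[3] = 0;      // zero the nonce word before prehash
//    sha256_x86_sha_prehash_3rounds( mid, block, save, istate );
//    for ( uint32_t n = first_nonce; n < max_nonce; n += 2 )
//    {  // block_X & block_Y are per-lane copies of block
//       ((uint32_t*)block_X)[3] = n;
//       ((uint32_t*)block_Y)[3] = n + 1;
//       sha256_x86_x2sha_final_rounds( hash_X, hash_Y, block_X, block_Y,
//                                      mid, mid, save, save );
//    }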
-void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg, +void sha256_x86_sha_prehash_3rounds( uint32_t *ostate, const void *msg, uint32_t *sstate, const uint32_t *istate ) { __m128i STATE0, STATE1, MSG, TMP; @@ -569,7 +569,7 @@ void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg, casti_m128i( ostate, 1 ) = STATE1; } -void sha256_ni2x_final_rounds( uint32_t *out_X, uint32_t *out_Y, +void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y, const void *msg_X, const void *msg_Y, const uint32_t *state_mid_X, const uint32_t *state_mid_Y, const uint32_t *state_save_X, const uint32_t *state_save_Y ) diff --git a/algo/sha/sha256-hash.h b/algo/sha/sha256-hash.h index b0917982..4a201110 100644 --- a/algo/sha/sha256-hash.h +++ b/algo/sha/sha256-hash.h @@ -5,27 +5,21 @@ #include "simd-utils.h" #include "cpuminer-config.h" -// generic interface +static const uint32_t SHA256_IV[8]; + +#if defined(__x86_64__) && defined(__SHA__) typedef struct { - unsigned char buf[64]; /* first field, for alignment */ + unsigned char buf[64]; uint32_t state[8]; uint64_t count; } sha256_context __attribute__((aligned(64))); -static const uint32_t SHA256_IV[8]; - void sha256_full( void *hash, const void *data, size_t len ); void sha256_update( sha256_context *ctx, const void *data, size_t len ); void sha256_final( sha256_context *ctx, void *hash ); void sha256_ctx_init( sha256_context *ctx ); -void sha256_transform_le( uint32_t *state_out, const uint32_t *data, - const uint32_t *state_in ); -void sha256_transform_be( uint32_t *state_out, const uint32_t *data, - const uint32_t *state_in ); - -#if defined(__x86_64__) && defined(__SHA__) void sha256_x86_sha_transform_le( uint32_t *state_out, const void *input, const uint32_t *state_in ); @@ -50,14 +44,6 @@ void sha256_x86_x2sha_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y const uint32_t *state_mid_X, const uint32_t *state_mid_Y, const uint32_t *state_save_X, const uint32_t *state_save_Y ); -// Temporary during name transition -#define sha256_opt_transform_le sha256_x86_sha_transform_le -#define sha256_opt_transform_be sha256_x86_sha_transform_be -#define sha256_ni2x_transform_le sha256_x86_x2sha_transform_le -#define sha256_ni2x_transform_be sha256_x86_x2sha_transform_be -#define sha256_ni_prehash_3rounds sha256_x86_sha_prehash_3rounds -#define sha256_ni2x_final_rounds sha256_x86_x2sha_final_rounds - // generic API #define sha256_transform_le sha256_x86_sha_transform_le #define sha256_transform_be sha256_x86_sha_transform_be @@ -68,6 +54,20 @@ void sha256_x86_x2sha_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y #elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2) +// SHA-256 AArch64 with NEON & SHA2 + +typedef struct +{ + unsigned char buf[64]; + uint32_t state[8]; + uint64_t count; +} sha256_context __attribute__((aligned(64))); + +void sha256_full( void *hash, const void *data, size_t len ); +void sha256_update( sha256_context *ctx, const void *data, size_t len ); +void sha256_final( sha256_context *ctx, void *hash ); +void sha256_ctx_init( sha256_context *ctx ); + void sha256_neon_sha_transform_be( uint32_t *state_out, const void *input, const uint32_t *state_in ); void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input, @@ -89,14 +89,6 @@ void sha256_neon_x2sha_final_rounds( uint32_t *state_out_X, const uint32_t *state_mid_X, const uint32_t *state_mid_Y, const uint32_t *state_save_X, const uint32_t *state_save_Y ); -// Temporary during name transition -#define sha256_transform_le 
sha256_neon_sha_transform_le -#define sha256_transform_be sha256_neon_sha_transform_be -#define sha256_2x_transform_le sha256_neon_x2sha_transform_le -#define sha256_2x_transform_be sha256_neon_x2sha_transform_be -#define sha256_prehash_3rounds sha256_neon_sha_prehash_3rounds -#define sha256_2x_final_rounds sha256_neon_x2sha_final_rounds - // generic API #define sha256_transform_le sha256_neon_sha_transform_le #define sha256_transform_be sha256_neon_sha_transform_be @@ -106,9 +98,11 @@ void sha256_neon_x2sha_final_rounds( uint32_t *state_out_X, #define sha256_2x_final_rounds sha256_neon_x2sha_final_rounds #else + // without HW acceleration... #include "sph_sha2.h" +#define sha256_context sph_sha256_context #define sha256_full sph_sha256_full #define sha256_ctx_init sph_sha256_init #define sha256_update sph_sha256 @@ -117,12 +111,11 @@ void sha256_neon_x2sha_final_rounds( uint32_t *state_out_X, #define sha256_transform_be sph_sha256_transform_be #define sha256_prehash_3rounds sph_sha256_prehash_3rounds - #endif #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -// SHA-256 16 way +// SHA-256 16 way x86_64 typedef struct { @@ -147,7 +140,7 @@ void sha256_16x32_final_rounds( __m512i *state_out, const __m512i *data, int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data, const __m512i *state_in, const uint32_t *target ); -#define sha256_16way_context sha256_16x32_context +#define sha256_16way_context sha256_16x32_context #define sha256_16way_init sha256_16x32_init #define sha256_16way_update sha256_16x32_update #define sha256_16way_close sha256_16x32_close @@ -162,7 +155,7 @@ int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data, #if defined (__AVX2__) -// SHA-256 8 way +// SHA-256 8 way x86_64 typedef struct { @@ -201,7 +194,7 @@ int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data, #endif // AVX2 -// SHA-256 4 way +// SHA-256 4 way x86_64 with SSE2 or AArch64 with NEON typedef struct { diff --git a/algo/x11/timetravel-gate.c b/algo/x11/timetravel-gate.c index 370ef394..f4996925 100644 --- a/algo/x11/timetravel-gate.c +++ b/algo/x11/timetravel-gate.c @@ -16,14 +16,14 @@ bool register_timetravel_algo( algo_gate_t* gate ) return true; }; -inline void tt_swap( int *a, int *b ) +static inline void tt_swap( int *a, int *b ) { int c = *a; *a = *b; *b = c; } -inline void reverse( int *pbegin, int *pend ) +static inline void reverse( int *pbegin, int *pend ) { while ( (pbegin != pend) && (pbegin != --pend) ) { diff --git a/algo/x11/timetravel10-gate.c b/algo/x11/timetravel10-gate.c index 8c212298..b3214823 100644 --- a/algo/x11/timetravel10-gate.c +++ b/algo/x11/timetravel10-gate.c @@ -16,14 +16,14 @@ bool register_timetravel10_algo( algo_gate_t* gate ) return true; }; -inline void tt10_swap( int *a, int *b ) +static inline void tt10_swap( int *a, int *b ) { int c = *a; *a = *b; *b = c; } -inline void reverse( int *pbegin, int *pend ) +static inline void reverse( int *pbegin, int *pend ) { while ( (pbegin != pend) && (pbegin != --pend) ) { diff --git a/algo/x16/x16rv2-4way.c b/algo/x16/x16rv2-4way.c index ff55048b..b79b4a3c 100644 --- a/algo/x16/x16rv2-4way.c +++ b/algo/x16/x16rv2-4way.c @@ -730,7 +730,7 @@ typedef union _x16rv2_4way_context_overlay x16rv2_4way_context_overlay; static __thread x16rv2_4way_context_overlay x16rv2_ctx; // Pad the 24 bytes tiger hash to 64 bytes -inline void padtiger512( uint32_t* hash ) +static inline void padtiger512( uint32_t* hash ) { for ( int i = 6; 
i < 16; i++ ) hash[i] = 0; } diff --git a/algo/x16/x16rv2.c b/algo/x16/x16rv2.c index b7e51b16..4ad201bc 100644 --- a/algo/x16/x16rv2.c +++ b/algo/x16/x16rv2.c @@ -33,7 +33,7 @@ union _x16rv2_context_overlay typedef union _x16rv2_context_overlay x16rv2_context_overlay; // Pad the 24 bytes tiger hash to 64 bytes -inline void padtiger512(uint32_t* hash) { +static inline void padtiger512(uint32_t* hash) { for (int i = (24/4); i < (64/4); i++) hash[i] = 0; } diff --git a/arm-build.sh b/arm-build.sh index b8a8a7b6..ecb297bc 100755 --- a/arm-build.sh +++ b/arm-build.sh @@ -9,6 +9,6 @@ rm -f config.status CFLAGS="-O3 -march=native -Wall -flax-vector-conversions" ./configure --with-curl -make -j $nproc +make -j $(nproc) strip -s cpuminer diff --git a/armbuild-all.sh b/armbuild-all.sh index 328cb33b..328fd71c 100755 --- a/armbuild-all.sh +++ b/armbuild-all.sh @@ -10,33 +10,33 @@ make distclean || echo clean rm -f config.status ./autogen.sh || echo done CFLAGS="-O3 -march=armv8-a+crypto+sha2+aes -Wall -flax-vector-conversions" ./configure --with-curl -make -j $nproc +make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-armv8-aes-sha2 make clean || echo clean rm -f config.status CFLAGS="-O3 -march=armv8-a+crypto+sha2 -Wall -flax-vector-conversions" ./configure --with-curl -make -j $nproc +make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-armv8-sha2 make clean || echo clean rm -f config.status CFLAGS="-O3 -march=armv8-a+crypto+aes -Wall -flax-vector-conversions" ./configure --with-curl -make -j $nproc +make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-armv8-aes make clean || echo clean rm -f config.status CFLAGS="-O3 -march=armv8-a -Wall -flax-vector-conversions" ./configure --with-curl -make -j $nproc +make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-armv8 make clean || echo clean rm -f config.status CFLAGS="-O3 -march=native -Wall -flax-vector-conversions" ./configure --with-curl -make -j $nproc +make -j $(nproc) strip -s cpuminer diff --git a/build-allarch.sh b/build-allarch.sh index 7f85a2e8..7fbebc27 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -13,7 +13,7 @@ rm -f config.status CFLAGS="-O3 -march=icelake-client -Wall" ./configure --with-curl # Rocketlake needs gcc-11 #CFLAGS="-O3 -march=rocketlake -Wall" ./configure --with-curl -make -j $nproc +make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-avx512-sha-vaes @@ -34,7 +34,7 @@ rm -f config.status # Incomplete list of Zen4 AVX512 extensions but includes all extensions used by cpuminer. 
CFLAGS="-O3 -march=znver3 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512bitalg -mavx512vpopcntdq -Wall" ./configure --with-curl #CFLAGS="-O3 -march=znver2 -mvaes -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -Wall" ./configure --with-curl -make -j $nproc +make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-zen4 @@ -43,7 +43,7 @@ make clean || echo clean rm -f config.status #CFLAGS="-O3 -march=znver2 -mvaes" ./configure --with-curl CFLAGS="-O3 -march=znver3 -fno-common " ./configure --with-curl -make -j $nproc +make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-zen3 @@ -51,7 +51,7 @@ mv cpuminer cpuminer-zen3 make clean || echo clean rm -f config.status CFLAGS="-O3 -march=skylake-avx512 -maes -Wall" ./configure --with-curl -make -j $nproc +make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-avx512 @@ -60,7 +60,7 @@ make clean || echo done rm -f config.status # vaes doesn't include aes CFLAGS="-O3 -maes -mavx2 -msha -mvaes -Wall" ./configure --with-curl -make -j $nproc +make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-avx2-sha-vaes @@ -69,7 +69,7 @@ make clean || echo done rm -f config.status #CFLAGS="-O3 -march=znver1 -maes -Wall" ./configure --with-curl CFLAGS="-O3 -maes -mavx2 -msha -Wall" ./configure --with-curl -make -j $nproc +make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-avx2-sha @@ -78,7 +78,7 @@ make clean || echo clean rm -f config.status # GCC 9 doesn't include AES with core-avx2 CFLAGS="-O3 -march=core-avx2 -maes -Wall" ./configure --with-curl -make -j $nproc +make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-avx2 @@ -86,7 +86,7 @@ mv cpuminer cpuminer-avx2 make clean || echo clean rm -f config.status CFLAGS="-O3 -march=corei7-avx -maes -Wall" ./configure --with-curl -make -j $nproc +make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-avx @@ -94,7 +94,7 @@ mv cpuminer cpuminer-avx make clean || echo clean rm -f config.status CFLAGS="-O3 -march=westmere -maes -Wall" ./configure --with-curl -make -j $nproc +make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-aes-sse42 @@ -102,7 +102,7 @@ mv cpuminer cpuminer-aes-sse42 make clean || echo clean rm -f config.status CFLAGS="-O3 -march=corei7 -Wall" ./configure --with-curl -make -j $nproc +make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-sse42 @@ -110,7 +110,7 @@ mv cpuminer cpuminer-sse42 make clean || echo clean rm -f config.status CFLAGS="-O3 -march=core2 -Wall" ./configure --with-curl -make -j $nproc +make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-ssse3 @@ -118,7 +118,7 @@ mv cpuminer cpuminer-ssse3 make clean || echo clean rm -f config.status CFLAGS="-O3 -msse2 -Wall" ./configure --with-curl -make -j $nproc +make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-sse2 @@ -126,7 +126,7 @@ mv cpuminer cpuminer-sse2 make clean || echo clean rm -f config.status CFLAGS="-O3 -march=x86-64 -Wall" ./configure --with-curl -make -j $nproc +make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-x64 @@ -134,6 +134,6 @@ mv cpuminer cpuminer-x64 make clean || echo done rm -f config.status CFLAGS="-O3 -march=native -Wall" ./configure --with-curl -make -j $nproc +make -j $(nproc) strip -s cpuminer diff --git a/build-armv8.sh b/build-armv8.sh index 0d68f1f8..512d72fd 100755 --- a/build-armv8.sh +++ b/build-armv8.sh @@ -10,6 +10,6 @@ rm -f config.status CFLAGS="-O2 -march=armv8-a+crypto+sha2+aes -Wall -flax-vector-conversions" ./configure --with-curl --host=aarch64-cortexa76-elf --build=x86_64-pc-linux-gnu --target=aarch64-cortexa76-elf 
#CFLAGS="-O2 -march=armv8-a+crypto+sha2+aes -Wall -flax-vector-conversions" ./configure --with-curl -make -j $nproc +make -j $(nproc) strip -s cpuminer diff --git a/build-avx2.sh b/build-avx2.sh index aeca8889..25ac4b39 100755 --- a/build-avx2.sh +++ b/build-avx2.sh @@ -22,6 +22,6 @@ rm -f config.status CFLAGS="-O3 -march=haswell -maes -Wall" ./configure --with-curl #CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl -make -j $nproc +make -j $(nproc) strip -s cpuminer diff --git a/build-msys2.sh b/build-msys2.sh index eee2b8db..ed57783a 100755 --- a/build-msys2.sh +++ b/build-msys2.sh @@ -6,5 +6,5 @@ make distclean || echo clean rm -f config.status ./autogen.sh || echo done CFLAGS="-O3 -march=native -Wall -D_WIN32_WINNT=0x0601" ./configure --with-curl -make -j 4 +make -j $(nproc) strip -s cpuminer diff --git a/build.sh b/build.sh index c6f895c5..5e5cfa91 100755 --- a/build.sh +++ b/build.sh @@ -15,6 +15,6 @@ rm -f config.status #CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr CFLAGS="-O3 -march=native -Wall" ./configure --with-curl -make -j $nproc +make -j $(nproc) strip -s cpuminer diff --git a/clean-all.sh b/clean-all.sh index 855b54f2..8364e408 100755 --- a/clean-all.sh +++ b/clean-all.sh @@ -2,7 +2,7 @@ # # make clean and rm all the targetted executables. -rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 > /dev/null +rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-aes-sha2 cpuminer-armv8-sha2 > /dev/null rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe cpuminer-x64.exe > /dev/null diff --git a/configure b/configure index f919fd81..dd6ce7fc 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.15. +# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.1. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -608,8 +608,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='23.15' -PACKAGE_STRING='cpuminer-opt 23.15' +PACKAGE_VERSION='24.1' +PACKAGE_STRING='cpuminer-opt 24.1' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 23.15 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 24.1 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... 
@@ -1432,7 +1432,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 23.15:";; + short | recursive ) echo "Configuration of cpuminer-opt 24.1:";; esac cat <<\_ACEOF @@ -1538,7 +1538,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 23.15 +cpuminer-opt configure 24.1 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 23.15, which was +It was created by cpuminer-opt $as_me 24.1, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3593,7 +3593,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='23.15' + VERSION='24.1' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 23.15, which was +This file was extended by cpuminer-opt $as_me 24.1, which was generated by GNU Autoconf 2.71. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 23.15 +cpuminer-opt config.status 24.1 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 1d7b3d16..09fbbdb2 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [23.15]) +AC_INIT([cpuminer-opt], [24.1]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/configure~ b/configure~ index e8088bfc..dab87859 100755 --- a/configure~ +++ b/configure~ @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.14. +# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.16. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -608,8 +608,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='23.14' -PACKAGE_STRING='cpuminer-opt 23.14' +PACKAGE_VERSION='23.16' +PACKAGE_STRING='cpuminer-opt 23.16' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 23.14 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 23.16 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... 
@@ -1432,7 +1432,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 23.14:";; + short | recursive ) echo "Configuration of cpuminer-opt 23.16:";; esac cat <<\_ACEOF @@ -1538,7 +1538,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 23.14 +cpuminer-opt configure 23.16 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 23.14, which was +It was created by cpuminer-opt $as_me 23.16, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3593,7 +3593,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='23.14' + VERSION='23.16' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 23.14, which was +This file was extended by cpuminer-opt $as_me 23.16, which was generated by GNU Autoconf 2.71. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 23.14 +cpuminer-opt config.status 23.16 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/cpu-miner.c b/cpu-miner.c index 5d87a683..ad7cc03e 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -1294,7 +1294,7 @@ static int share_result( int result, struct work *work, const char *bell = !result && opt_bell ? &ASCII_BELL : ""; applog( LOG_INFO, "%s%d %s%s %s%s %s%s %s%s%s, %.3f sec (%dms)", bell, my_stats.share_count, acol, ares, scol, sres, rcol, rres, - bcol, bres, CL_N, share_time, latency ); + bcol, bres, use_colors ? CL_N : "", share_time, latency ); if ( unlikely( !( opt_quiet || result || stale ) ) ) { applog2( LOG_INFO, "%sReject reason: %s", bell, reason ? reason : "" ); diff --git a/dummy.cpp b/dummy.cpp new file mode 100644 index 00000000..31139546 --- /dev/null +++ b/dummy.cpp @@ -0,0 +1,6 @@ +// This file exists to force the use of g++ as the linker, which in turn +// links the math library whenever math.h is included. gcc will not link +// math automatically, so without this file linking fails for m7m.c. +// Linking math manually (-lm) lets gcc do the linking on Linux, but on +// Windows the result segfaults. Until that is solved this file must +// continue to exist. 
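[Editor's note: a minimal standalone sketch, not from the patch, of the link failure dummy.cpp works around; the file name is hypothetical. gcc does not implicitly link libm, so a libm call that cannot be folded at compile time fails to link unless -lm is given, while driving the link with g++ succeeds because libstdc++ pulls in libm on typical GNU toolchains.]

   /* libm-demo.c: "gcc libm-demo.c" fails on Linux with
      "undefined reference to `pow'", while "gcc libm-demo.c -lm"
      or "g++ libm-demo.c" links cleanly. */
   #include <math.h>
   #include <stdio.h>

   int main(void)
   {
      volatile double x = 2.0;   /* volatile keeps pow() from being folded */
      printf( "%f\n", pow( x, 0.5 ) );
      return 0;
   }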
diff --git a/miner.h b/miner.h index 5d0cebf0..9ca8b936 100644 --- a/miner.h +++ b/miner.h @@ -185,13 +185,13 @@ static inline bool is_windows(void) */ #endif -static inline uint32_t swab32(uint32_t v) +static inline uint32_t swab32(uint32_t x) { #ifdef WANT_BUILTIN_BSWAP - return __builtin_bswap32(v); + return __builtin_bswap32(x); #else - return ( (x << 24) & 0xff000000u ) | ( (x << 8) & 0x00ff0000u ) - | ( (x >> 8) & 0x0000ff00u ) | ( (x >> 24) & 0x000000ffu ) + return ( ( (x) << 24 ) & 0xff000000u ) | ( ( (x) << 8 ) & 0x00ff0000u ) + | ( ( (x) >> 8 ) & 0x0000ff00u ) | ( ( (x) >> 24 ) & 0x000000ffu ) // return bswap_32(v); diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h index bcf85dbe..d5a87fd8 100644 --- a/simd-utils/intrlv.h +++ b/simd-utils/intrlv.h @@ -86,29 +86,28 @@ static inline void extr_lane_2x32( void *dst, const void *src, // 4x32 -#if ( defined(__x86_64__) && defined(__SSE4_1__) ) || ( defined(__aarch64__) && defined(__ARM_NEON) ) +#if ( defined(__x86_64__) && defined(__SSE2__) ) || ( defined(__aarch64__) && defined(__ARM_NEON) ) #define ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ) \ - D0 = v128_movlane32( S0, 1, S1, 0 ); \ - D1 = v128_movlane32( S1, 0, S0, 1 ); \ - D2 = v128_movlane32( S2, 0, S0, 2 ); \ - D3 = v128_movlane32( S3, 0, S0, 3 ); \ - D0 = v128_movlane32( D0, 2, S2, 0 ); \ - D1 = v128_movlane32( D1, 2, S2, 1 ); \ - D2 = v128_movlane32( D2, 1, S1, 2 ); \ - D3 = v128_movlane32( D3, 1, S1, 3 ); \ - D0 = v128_movlane32( D0, 3, S3, 0 ); \ - D1 = v128_movlane32( D1, 3, S3, 1 ); \ - D2 = v128_movlane32( D2, 3, S3, 2 ); \ - D3 = v128_movlane32( D3, 2, S2, 3 ); - -#define LOAD_SRCE( S0, S1, S2, S3, src0, i0, src1, i1, src2, i2, src3, i3 ) \ +{ \ + v128_t T; \ + T = v128_unpacklo32( S0, S1 ); \ + S0 = v128_unpackhi32( S0, S1 ); \ + S1 = v128_unpacklo32( S2, S3 ); \ + S2 = v128_unpackhi32( S2, S3 ); \ + D0 = v128_unpacklo64( T, S1 ); \ + D1 = v128_unpackhi64( T, S1 ); \ + D2 = v128_unpacklo64( S0, S2 ); \ + D3 = v128_unpackhi64( S0, S2 ); \ +} + +#define LOAD_SRCE_4x32( S0, S1, S2, S3, src0, i0, src1, i1, src2, i2, src3, i3 ) \ S0 = v128_load( (const v128_t*)(src0) + (i0) ); \ S1 = v128_load( (const v128_t*)(src1) + (i1) ); \ S2 = v128_load( (const v128_t*)(src2) + (i2) ); \ S3 = v128_load( (const v128_t*)(src3) + (i3) ); -#define STORE_DEST( D0, D1, D2, D3, dst0, i0, dst1, i1, dst2, i2, dst3, i3 ) \ +#define STOR_DEST_4x32( D0, D1, D2, D3, dst0, i0, dst1, i1, dst2, i2, dst3, i3 ) \ v128_store( (v128_t*)(dst0) + (i0), D0 ); \ v128_store( (v128_t*)(dst1) + (i1), D1 ); \ v128_store( (v128_t*)(dst2) + (i2), D2 ); \ @@ -119,39 +118,39 @@ static inline void intrlv_4x32( void *dst, const void *src0, const void *src1, { v128_t D0, D1, D2, D3, S0, S1, S2, S3; - LOAD_SRCE( S0, S1, S2, S3, src0, 0, src1, 0, src2, 0, src3, 0 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src0, 0, src1, 0, src2, 0, src3, 0 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src0, 1, src1, 1, src2, 1, src3, 1 ); - STORE_DEST( D0, D1, D2, D3, dst, 0, dst, 1, dst, 2, dst, 3 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src0, 1, src1, 1, src2, 1, src3, 1 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 0, dst, 1, dst, 2, dst, 3 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst, 4, dst, 5, dst, 6, dst, 7 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 4, dst, 5, dst, 6, dst, 7 ); if ( bit_len <= 256 ) return; - LOAD_SRCE( S0, S1, S2, S3, src0, 2, src1, 2, src2, 2, src3, 2 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src0, 2, src1, 2, src2, 2, src3, 2 ); ILEAVE_4x32( D0, D1, D2, 
D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src0, 3, src1, 3, src2, 3, src3, 3 ); - STORE_DEST( D0, D1, D2, D3, dst, 8, dst, 9, dst, 10, dst, 11 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src0, 3, src1, 3, src2, 3, src3, 3 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 8, dst, 9, dst, 10, dst, 11 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst, 12, dst, 13, dst, 14, dst, 15 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 12, dst, 13, dst, 14, dst, 15 ); if ( bit_len <= 512 ) return; - LOAD_SRCE( S0, S1, S2, S3, src0, 4, src1, 4, src2, 4, src3, 4 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src0, 4, src1, 4, src2, 4, src3, 4 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst, 16, dst, 17, dst, 18, dst, 19 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 16, dst, 17, dst, 18, dst, 19 ); if ( bit_len <= 640 ) return; - LOAD_SRCE( S0, S1, S2, S3, src0, 5, src1, 5, src2, 5, src3, 5 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src0, 5, src1, 5, src2, 5, src3, 5 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src0, 6, src1, 6, src2, 6, src3, 6 ); - STORE_DEST( D0, D1, D2, D3, dst, 20, dst, 21, dst, 22, dst, 23 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src0, 6, src1, 6, src2, 6, src3, 6 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 20, dst, 21, dst, 22, dst, 23 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src0, 7, src1, 7, src2, 7, src3, 7 ); - STORE_DEST( D0, D1, D2, D3, dst, 24, dst, 25, dst, 26, dst, 27 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src0, 7, src1, 7, src2, 7, src3, 7 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 24, dst, 25, dst, 26, dst, 27 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst, 28, dst, 29, dst, 30, dst, 31 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 28, dst, 29, dst, 30, dst, 31 ); // if ( bit_len <= 1024 ) return; } @@ -161,18 +160,18 @@ static inline void intrlv_4x32_512( void *dst, const void *src0, { v128_t D0, D1, D2, D3, S0, S1, S2, S3; - LOAD_SRCE( S0, S1, S2, S3, src0, 0, src1, 0, src2, 0, src3, 0 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src0, 0, src1, 0, src2, 0, src3, 0 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src0, 1, src1, 1, src2, 1, src3, 1 ); - STORE_DEST( D0, D1, D2, D3, dst, 0, dst, 1, dst, 2, dst, 3 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src0, 1, src1, 1, src2, 1, src3, 1 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 0, dst, 1, dst, 2, dst, 3 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src0, 2, src1, 2, src2, 2, src3, 2 ); - STORE_DEST( D0, D1, D2, D3, dst, 4, dst, 5, dst, 6, dst, 7 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src0, 2, src1, 2, src2, 2, src3, 2 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 4, dst, 5, dst, 6, dst, 7 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src0, 3, src1, 3, src2, 3, src3, 3 ); - STORE_DEST( D0, D1, D2, D3, dst, 8, dst, 9, dst, 10, dst, 11 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src0, 3, src1, 3, src2, 3, src3, 3 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 8, dst, 9, dst, 10, dst, 11 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst, 12, dst, 13, dst, 14, dst, 15 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 12, dst, 13, dst, 14, dst, 15 ); } static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2, @@ -180,39 +179,39 @@ static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2, { v128_t D0, D1, D2, D3, S0, S1, S2, S3; - LOAD_SRCE( S0, S1, S2, S3, src, 0, src, 1, src, 2, 
src, 3 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 0, src, 1, src, 2, src, 3 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 4, src, 5, src, 6, src, 7 ); - STORE_DEST( D0, D1, D2, D3, dst0, 0, dst1, 0, dst2, 0, dst3, 0 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 4, src, 5, src, 6, src, 7 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst0, 0, dst1, 0, dst2, 0, dst3, 0 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst0, 1, dst1, 1, dst2, 1, dst3, 1 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst0, 1, dst1, 1, dst2, 1, dst3, 1 ); if ( bit_len <= 256 ) return; - LOAD_SRCE( S0, S1, S2, S3, src, 8, src, 9, src, 10, src, 11 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 8, src, 9, src, 10, src, 11 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 12, src, 13, src, 14, src, 15 ); - STORE_DEST( D0, D1, D2, D3, dst0, 2, dst1, 2, dst2, 2, dst3, 2 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 12, src, 13, src, 14, src, 15 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst0, 2, dst1, 2, dst2, 2, dst3, 2 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst0, 3, dst1, 3, dst2, 3, dst3, 3 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst0, 3, dst1, 3, dst2, 3, dst3, 3 ); if ( bit_len <= 512 ) return; - LOAD_SRCE( S0, S1, S2, S3, src, 16, src, 17, src, 18, src, 19 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 16, src, 17, src, 18, src, 19 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst0, 4, dst1, 4, dst2, 4, dst3, 4 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst0, 4, dst1, 4, dst2, 4, dst3, 4 ); if ( bit_len <= 640 ) return; - LOAD_SRCE( S0, S1, S2, S3, src, 20, src, 21, src, 22, src, 23 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 20, src, 21, src, 22, src, 23 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 24, src, 25, src, 26, src, 27 ); - STORE_DEST( D0, D1, D2, D3, dst0, 5, dst1, 5, dst2, 5, dst3, 5 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 24, src, 25, src, 26, src, 27 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst0, 5, dst1, 5, dst2, 5, dst3, 5 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 28, src, 29, src, 30, src, 31 ); - STORE_DEST( D0, D1, D2, D3, dst0, 6, dst1, 6, dst2, 6, dst3, 6 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 28, src, 29, src, 30, src, 31 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst0, 6, dst1, 6, dst2, 6, dst3, 6 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst0, 7, dst1, 7, dst2, 7, dst3, 7 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst0, 7, dst1, 7, dst2, 7, dst3, 7 ); // if ( bit_len <= 1024 ) return; } @@ -222,21 +221,21 @@ static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2, { v128_t D0, D1, D2, D3, S0, S1, S2, S3; - LOAD_SRCE( S0, S1, S2, S3, src, 0, src, 1, src, 2, src, 3 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 0, src, 1, src, 2, src, 3 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 4, src, 5, src, 6, src, 7 ); - STORE_DEST( D0, D1, D2, D3, dst0, 0, dst1, 0, dst2, 0, dst3, 0 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 4, src, 5, src, 6, src, 7 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst0, 0, dst1, 0, dst2, 0, dst3, 0 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 8, src, 9, src, 10, src, 11 ); - STORE_DEST( D0, D1, D2, D3, dst0, 1, dst1, 1, dst2, 1, dst3, 1 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 8, src, 9, src, 10, src, 11 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst0, 1, dst1, 1, dst2, 
1, dst3, 1 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 12, src, 13, src, 14, src, 15 ); - STORE_DEST( D0, D1, D2, D3, dst0, 2, dst1, 2, dst2, 2, dst3, 2 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 12, src, 13, src, 14, src, 15 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst0, 2, dst1, 2, dst2, 2, dst3, 2 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst0, 3, dst1, 3, dst2, 3, dst3, 3 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst0, 3, dst1, 3, dst2, 3, dst3, 3 ); } -#else // SSE2 +#else // !SSE2 && !NEON static inline void intrlv_4x32( void *dst, const void *src0, const void *src1, const void *src2, const void *src3, const int bit_len ) @@ -381,7 +380,7 @@ static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2, d0[15] = s[ 60]; d1[15] = s[ 61]; d2[15] = s[ 62]; d3[15] = s[ 63]; } -#endif // SSE4_1 or NEON else SSE2 +#endif // SSE2 or NEON else static inline void extr_lane_4x32( void *d, const void *s, const int lane, const int bit_len ) @@ -410,7 +409,7 @@ static inline void extr_lane_4x32( void *d, const void *s, static inline void v128_bswap32_80( void *d, void *s ) { const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b, - 0x0405060700010203 ); + 0x0405060700010203 ); casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), bswap_shuf ); casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), bswap_shuf ); casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), bswap_shuf ); @@ -560,6 +559,132 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src ) // 8x32 + +#if defined(__AVX2__) + +#define ILEAVE_8x32( D0, D1, D2, D3, D4, D5, D6, D7, \ + S0, S1, S2, S3, S4, S5, S6, S7 ) \ +{ \ + D0 = _mm256_unpacklo_epi32( S0, S1 ); \ + D1 = _mm256_unpackhi_epi32( S0, S1 ); \ + D2 = _mm256_unpacklo_epi32( S2, S3 ); \ + D3 = _mm256_unpackhi_epi32( S2, S3 ); \ + D4 = _mm256_unpacklo_epi32( S4, S5 ); \ + D5 = _mm256_unpackhi_epi32( S4, S5 ); \ + D6 = _mm256_unpacklo_epi32( S6, S7 ); \ + D7 = _mm256_unpackhi_epi32( S6, S7 ); \ +\ + S0 = _mm256_unpacklo_epi64( D0, D2 ); \ + S1 = _mm256_unpackhi_epi64( D0, D2 ); \ + S2 = _mm256_unpacklo_epi64( D1, D3 ); \ + S3 = _mm256_unpackhi_epi64( D1, D3 ); \ + S4 = _mm256_unpacklo_epi64( D4, D6 ); \ + S5 = _mm256_unpackhi_epi64( D4, D6 ); \ + S6 = _mm256_unpacklo_epi64( D5, D7 ); \ + S7 = _mm256_unpackhi_epi64( D5, D7 ); \ +\ + D0 = v256_unpacklo128( S0, S4 ); \ + D1 = v256_unpackhi128( S0, S4 ); \ + D2 = v256_unpacklo128( S1, S5 ); \ + D3 = v256_unpackhi128( S1, S5 ); \ + D4 = v256_unpacklo128( S2, S6 ); \ + D5 = v256_unpackhi128( S2, S6 ); \ + D6 = v256_unpacklo128( S3, S7 ); \ + D7 = v256_unpackhi128( S3, S7 ); \ +} + +#define LOAD_SRCE_8x32( S0, S1, S2, S3, S4, S5, S6, S7, \ + src0, i0, src1, i1, src2, i2, src3, i3, \ + src4, i4, src5, i5, src6, i6, src7, i7 ) \ + S0 = _mm256_load_si256( (const __m256i*)(src0) + (i0) ); \ + S1 = _mm256_load_si256( (const __m256i*)(src1) + (i1) ); \ + S2 = _mm256_load_si256( (const __m256i*)(src2) + (i2) ); \ + S3 = _mm256_load_si256( (const __m256i*)(src3) + (i3) ); \ + S4 = _mm256_load_si256( (const __m256i*)(src4) + (i4) ); \ + S5 = _mm256_load_si256( (const __m256i*)(src5) + (i5) ); \ + S6 = _mm256_load_si256( (const __m256i*)(src6) + (i6) ); \ + S7 = _mm256_load_si256( (const __m256i*)(src7) + (i7) ); + +#define STOR_DEST_8x32( D0, D1, D2, D3, D4, D5, D6, D7, \ + dst0, i0, dst1, i1, dst2, i2, dst3, i3, \ + dst4, i4, dst5, i5, dst6, i6, dst7, i7 ) \ + _mm256_store_si256( (__m256i*)(dst0) + (i0), D0 ); \ + 
_mm256_store_si256( (__m256i*)(dst1) + (i1), D1 ); \ + _mm256_store_si256( (__m256i*)(dst2) + (i2), D2 ); \ + _mm256_store_si256( (__m256i*)(dst3) + (i3), D3 ); \ + _mm256_store_si256( (__m256i*)(dst4) + (i4), D4 ); \ + _mm256_store_si256( (__m256i*)(dst5) + (i5), D5 ); \ + _mm256_store_si256( (__m256i*)(dst6) + (i6), D6 ); \ + _mm256_store_si256( (__m256i*)(dst7) + (i7), D7 ); + +static inline void intrlv_8x32_256( void *dst, const void *s0, const void *s1, + const void *s2, const void *s3, const void *s4, const void *s5, + const void *s6, const void *s7 ) +{ + __m256i D0, D1, D2, D3, D4, D5, D6, D7, + S0, S1, S2, S3, S4, S5, S6, S7; + + LOAD_SRCE_8x32( S0, S1, S2, S3, S4, S5, S6, S7, + s0, 0, s1, 0, s2, 0, s3, 0, s4, 0, s5, 0, s6, 0, s7, 0 ); + ILEAVE_8x32( D0, D1, D2, D3, D4, D5, D6, D7, S0, S1, S2, S3, S4, S5, S6, S7 ); + STOR_DEST_8x32( D0, D1, D2, D3, D4, D5, D6, D7, + dst, 0, dst, 4, dst, 1, dst, 5, dst, 2, dst, 6, dst, 3, dst, 7 ); +} + +static inline void intrlv_8x32_512( void *dst, const void *s0, const void *s1, + const void *s2, const void *s3, const void *s4, const void *s5, + const void *s6, const void *s7 ) +{ + __m256i D0, D1, D2, D3, D4, D5, D6, D7, + S0, S1, S2, S3, S4, S5, S6, S7; + + LOAD_SRCE_8x32( S0, S1, S2, S3, S4, S5, S6, S7, + s0, 0, s1, 0, s2, 0, s3, 0, s4, 0, s5, 0, s6, 0, s7, 0 ); + ILEAVE_8x32( D0, D1, D2, D3, D4, D5, D6, D7, S0, S1, S2, S3, S4, S5, S6, S7 ); + LOAD_SRCE_8x32( S0, S1, S2, S3, S4, S5, S6, S7, + s0, 1, s1, 1, s2, 1, s3, 1, s4, 1, s5, 1, s6, 1, s7, 1 ); + STOR_DEST_8x32( D0, D1, D2, D3, D4, D5, D6, D7, + dst, 0, dst, 4, dst, 1, dst, 5, dst, 2, dst, 6, dst, 3, dst, 7 ); + ILEAVE_8x32( D0, D1, D2, D3, D4, D5, D6, D7, S0, S1, S2, S3, S4, S5, S6, S7 ); + STOR_DEST_8x32( D0, D1, D2, D3, D4, D5, D6, D7, + dst, 8, dst,12, dst, 9, dst,13, dst,10, dst,14, dst,11, dst,15 ); +} + +static inline void dintrlv_8x32_256( void *dst0, void *dst1, void *dst2, + void *dst3, void *dst4, void *dst5, void *dst6, void *dst7, + const void *src ) +{ + __m256i D0, D1, D2, D3, D4, D5, D6, D7, + S0, S1, S2, S3, S4, S5, S6, S7; + + LOAD_SRCE_8x32( S0, S1, S2, S3, S4, S5, S6, S7, + src, 0, src, 1, src, 2, src, 3, src, 4, src, 5, src, 6, src, 7 ); + ILEAVE_8x32( D0, D1, D2, D3, D4, D5, D6, D7, S0, S1, S2, S3, S4, S5, S6, S7 ); + STOR_DEST_8x32( D0, D1, D2, D3, D4, D5, D6, D7, + dst0,0, dst4,0, dst1,0, dst5,0, dst2,0, dst6,0, dst3,0, dst7,0 ); +} + +static inline void dintrlv_8x32_512( void *dst0, void *dst1, void *dst2, + void *dst3, void *dst4, void *dst5, void *dst6, void *dst7, + const void *src ) +{ + __m256i D0, D1, D2, D3, D4, D5, D6, D7, + S0, S1, S2, S3, S4, S5, S6, S7; + + LOAD_SRCE_8x32( S0, S1, S2, S3, S4, S5, S6, S7, + src, 0, src, 1, src, 2, src, 3, src, 4, src, 5, src, 6, src, 7 ); + ILEAVE_8x32( D0, D1, D2, D3, D4, D5, D6, D7, S0, S1, S2, S3, S4, S5, S6, S7 ); + LOAD_SRCE_8x32( S0, S1, S2, S3, S4, S5, S6, S7, + src, 8, src, 9, src,10, src,11, src,12, src,13, src,14, src,15 ); + STOR_DEST_8x32( D0, D1, D2, D3, D4, D5, D6, D7, + dst0,0, dst4,0, dst1,0, dst5,0, dst2,0, dst6,0, dst3,0, dst7,0 ); + ILEAVE_8x32( D0, D1, D2, D3, D4, D5, D6, D7, S0, S1, S2, S3, S4, S5, S6, S7 ); + STOR_DEST_8x32( D0, D1, D2, D3, D4, D5, D6, D7, + dst0,1, dst4,1, dst1,1, dst5,1, dst2,1, dst6,1, dst3,1, dst7,1 ); +} + +#endif + #if defined(__SSE4_1__) static inline void intrlv_8x32( void *dst, const void *s0, const void *s1, @@ -568,98 +693,99 @@ static inline void intrlv_8x32( void *dst, const void *s0, const void *s1, { v128_t D0, D1, D2, D3, S0, S1, S2, S3; - LOAD_SRCE( S0, S1, S2, S3, 
s0, 0, s1, 0, s2, 0, s3, 0 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s0, 0, s1, 0, s2, 0, s3, 0 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s4, 0, s5, 0, s6, 0, s7, 0 ); - STORE_DEST( D0, D1, D2, D3, dst, 0, dst, 2, dst, 4, dst, 6 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s4, 0, s5, 0, s6, 0, s7, 0 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 0, dst, 2, dst, 4, dst, 6 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s0, 1, s1, 1, s2, 1, s3, 1 ); - STORE_DEST( D0, D1, D2, D3, dst, 1, dst, 3, dst, 5, dst, 7 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s0, 1, s1, 1, s2, 1, s3, 1 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 1, dst, 3, dst, 5, dst, 7 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s4, 1, s5, 1, s6, 1, s7, 1 ); - STORE_DEST( D0, D1, D2, D3, dst, 8, dst, 10, dst, 12, dst, 14 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s4, 1, s5, 1, s6, 1, s7, 1 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 8, dst, 10, dst, 12, dst, 14 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst, 9, dst, 11, dst, 13, dst, 15 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 9, dst, 11, dst, 13, dst, 15 ); if ( bit_len <= 256 ) return; - LOAD_SRCE( S0, S1, S2, S3, s0, 2, s1, 2, s2, 2, s3, 2 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s0, 2, s1, 2, s2, 2, s3, 2 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s4, 2, s5, 2, s6, 2, s7, 2 ); - STORE_DEST( D0, D1, D2, D3, dst, 16, dst, 18, dst, 20, dst, 22 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s4, 2, s5, 2, s6, 2, s7, 2 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 16, dst, 18, dst, 20, dst, 22 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s0, 3, s1, 3, s2, 3, s3, 3 ); - STORE_DEST( D0, D1, D2, D3, dst, 17, dst, 19, dst, 21, dst, 23 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s0, 3, s1, 3, s2, 3, s3, 3 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 17, dst, 19, dst, 21, dst, 23 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s4, 3, s5, 3, s6, 3, s7, 3 ); - STORE_DEST( D0, D1, D2, D3, dst, 24, dst, 26, dst, 28, dst, 30 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s4, 3, s5, 3, s6, 3, s7, 3 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 24, dst, 26, dst, 28, dst, 30 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst, 25, dst, 27, dst, 29, dst, 31 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 25, dst, 27, dst, 29, dst, 31 ); if ( bit_len <= 512 ) return; - LOAD_SRCE( S0, S1, S2, S3, s0, 4, s1, 4, s2, 4, s3, 4 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s0, 4, s1, 4, s2, 4, s3, 4 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s4, 4, s5, 4, s6, 4, s7, 4 ); - STORE_DEST( D0, D1, D2, D3, dst, 32, dst, 34, dst, 36, dst, 38 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s4, 4, s5, 4, s6, 4, s7, 4 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 32, dst, 34, dst, 36, dst, 38 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst, 33, dst, 35, dst, 37, dst, 39 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 33, dst, 35, dst, 37, dst, 39 ); if ( bit_len <= 640 ) return; - LOAD_SRCE( S0, S1, S2, S3, s0, 5, s1, 5, s2, 5, s3, 5 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s0, 5, s1, 5, s2, 5, s3, 5 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s4, 5, s5, 5, s6, 5, s7, 5 ); - STORE_DEST( D0, D1, D2, D3, dst, 40, dst, 42, dst, 44, dst, 46 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s4, 5, s5, 5, s6, 5, s7, 5 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 40, dst, 42, 
dst, 44, dst, 46 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s0, 6, s1, 6, s2, 6, s3, 6 ); - STORE_DEST( D0, D1, D2, D3, dst, 41, dst, 43, dst, 45, dst, 47 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s0, 6, s1, 6, s2, 6, s3, 6 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 41, dst, 43, dst, 45, dst, 47 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s4, 6, s5, 6, s6, 6, s7, 6 ); - STORE_DEST( D0, D1, D2, D3, dst, 48, dst, 50, dst, 52, dst, 54 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s4, 6, s5, 6, s6, 6, s7, 6 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 48, dst, 50, dst, 52, dst, 54 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s0, 7, s1, 7, s2, 7, s3, 7 ); - STORE_DEST( D0, D1, D2, D3, dst, 49, dst, 51, dst, 53, dst, 55 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s0, 7, s1, 7, s2, 7, s3, 7 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 49, dst, 51, dst, 53, dst, 55 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s4, 7, s5, 7, s6, 7, s7, 7 ); - STORE_DEST( D0, D1, D2, D3, dst, 56, dst, 58, dst, 60, dst, 62 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s4, 7, s5, 7, s6, 7, s7, 7 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 56, dst, 58, dst, 60, dst, 62 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst, 57, dst, 59, dst, 61, dst, 63 ); - + STOR_DEST_4x32( D0, D1, D2, D3, dst, 57, dst, 59, dst, 61, dst, 63 ); // if ( bit_len <= 1024 ) return; } +/* static inline void intrlv_8x32_512( void *dst, const void *s0, const void *s1, const void *s2, const void *s3, const void *s4, const void *s5, const void *s6, const void *s7 ) { v128_t D0, D1, D2, D3, S0, S1, S2, S3; - LOAD_SRCE( S0, S1, S2, S3, s0, 0, s1, 0, s2, 0, s3, 0 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s0, 0, s1, 0, s2, 0, s3, 0 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s4, 0, s5, 0, s6, 0, s7, 0 ); - STORE_DEST( D0, D1, D2, D3, dst, 0, dst, 2, dst, 4, dst, 6 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s4, 0, s5, 0, s6, 0, s7, 0 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 0, dst, 2, dst, 4, dst, 6 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s0, 1, s1, 1, s2, 1, s3, 1 ); - STORE_DEST( D0, D1, D2, D3, dst, 1, dst, 3, dst, 5, dst, 7 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s0, 1, s1, 1, s2, 1, s3, 1 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 1, dst, 3, dst, 5, dst, 7 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s4, 1, s5, 1, s6, 1, s7, 1 ); - STORE_DEST( D0, D1, D2, D3, dst, 8, dst, 10, dst, 12, dst, 14 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s4, 1, s5, 1, s6, 1, s7, 1 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 8, dst, 10, dst, 12, dst, 14 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s0, 2, s1, 2, s2, 2, s3, 2 ); - STORE_DEST( D0, D1, D2, D3, dst, 9, dst, 11, dst, 13, dst, 15 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s0, 2, s1, 2, s2, 2, s3, 2 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 9, dst, 11, dst, 13, dst, 15 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s4, 2, s5, 2, s6, 2, s7, 2 ); - STORE_DEST( D0, D1, D2, D3, dst, 16, dst, 18, dst, 20, dst, 22 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s4, 2, s5, 2, s6, 2, s7, 2 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 16, dst, 18, dst, 20, dst, 22 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s0, 3, s1, 3, s2, 3, s3, 3 ); - STORE_DEST( D0, D1, D2, D3, dst, 17, dst, 19, dst, 21, dst, 23 ); + LOAD_SRCE_4x32( S0, 
S1, S2, S3, s0, 3, s1, 3, s2, 3, s3, 3 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 17, dst, 19, dst, 21, dst, 23 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s4, 3, s5, 3, s6, 3, s7, 3 ); - STORE_DEST( D0, D1, D2, D3, dst, 24, dst, 26, dst, 28, dst, 30 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s4, 3, s5, 3, s6, 3, s7, 3 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 24, dst, 26, dst, 28, dst, 30 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst, 25, dst, 27, dst, 29, dst, 31 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 25, dst, 27, dst, 29, dst, 31 ); } +*/ static inline void dintrlv_8x32( void *dst0, void *dst1, void *dst2, void *dst3, void *dst4, void *dst5, void *dst6, void *dst7, const void *src, @@ -667,98 +793,99 @@ static inline void dintrlv_8x32( void *dst0, void *dst1, void *dst2, void *dst3, { v128_t D0, D1, D2, D3, S0, S1, S2, S3; - LOAD_SRCE( S0, S1, S2, S3, src, 0, src, 2, src, 4, src, 6 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 0, src, 2, src, 4, src, 6 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 1, src, 3, src, 5, src, 7 ); - STORE_DEST( D0, D1, D2, D3, dst0, 0, dst1, 0, dst2, 0, dst3, 0 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 1, src, 3, src, 5, src, 7 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst0, 0, dst1, 0, dst2, 0, dst3, 0 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 8, src, 10, src, 12, src, 14 ); - STORE_DEST( D0, D1, D2, D3, dst4, 0, dst5, 0, dst6, 0, dst7, 0 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 8, src, 10, src, 12, src, 14 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst4, 0, dst5, 0, dst6, 0, dst7, 0 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 9, src, 11, src, 13, src, 15 ); - STORE_DEST( D0, D1, D2, D3, dst0, 1, dst1, 1, dst2, 1, dst3, 1 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 9, src, 11, src, 13, src, 15 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst0, 1, dst1, 1, dst2, 1, dst3, 1 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst4, 1, dst5, 1, dst6, 1, dst7, 1 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst4, 1, dst5, 1, dst6, 1, dst7, 1 ); if ( bit_len <= 256 ) return; - LOAD_SRCE( S0, S1, S2, S3, src, 16, src, 18, src, 20, src, 22 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 16, src, 18, src, 20, src, 22 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 17, src, 19, src, 21, src, 23 ); - STORE_DEST( D0, D1, D2, D3, dst0, 2, dst1, 2, dst2, 2, dst3, 2 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 17, src, 19, src, 21, src, 23 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst0, 2, dst1, 2, dst2, 2, dst3, 2 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 24, src, 26, src, 28, src, 30 ); - STORE_DEST( D0, D1, D2, D3, dst4, 2, dst5, 2, dst6, 2, dst7, 2 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 24, src, 26, src, 28, src, 30 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst4, 2, dst5, 2, dst6, 2, dst7, 2 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 25, src, 27, src, 29, src, 31 ); - STORE_DEST( D0, D1, D2, D3, dst0, 3, dst1, 3, dst2, 3, dst3, 3 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 25, src, 27, src, 29, src, 31 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst0, 3, dst1, 3, dst2, 3, dst3, 3 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst4, 3, dst5, 3, dst6, 3, dst7, 3 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst4, 3, dst5, 3, dst6, 3, dst7, 3 ); if ( bit_len <= 512 ) return; 
- LOAD_SRCE( S0, S1, S2, S3, src, 32, src, 34, src, 36, src, 38 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 32, src, 34, src, 36, src, 38 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 33, src, 35, src, 37, src, 39 ); - STORE_DEST( D0, D1, D2, D3, dst0, 4, dst1, 4, dst2, 4, dst3, 4 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 33, src, 35, src, 37, src, 39 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst0, 4, dst1, 4, dst2, 4, dst3, 4 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst4, 4, dst5, 4, dst6, 4, dst7, 4 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst4, 4, dst5, 4, dst6, 4, dst7, 4 ); if ( bit_len <= 640 ) return; - LOAD_SRCE( S0, S1, S2, S3, src, 40, src, 42, src, 44, src, 46 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 40, src, 42, src, 44, src, 46 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 41, src, 43, src, 45, src, 47 ); - STORE_DEST( D0, D1, D2, D3, dst0, 5, dst1, 5, dst2, 5, dst3, 5 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 41, src, 43, src, 45, src, 47 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst0, 5, dst1, 5, dst2, 5, dst3, 5 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 48, src, 50, src, 52, src, 54 ); - STORE_DEST( D0, D1, D2, D3, dst4, 5, dst5, 5, dst6, 5, dst7, 5 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 48, src, 50, src, 52, src, 54 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst4, 5, dst5, 5, dst6, 5, dst7, 5 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 49, src, 51, src, 53, src, 55 ); - STORE_DEST( D0, D1, D2, D3, dst0, 6, dst1, 6, dst2, 6, dst3, 6 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 49, src, 51, src, 53, src, 55 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst0, 6, dst1, 6, dst2, 6, dst3, 6 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 56, src, 58, src, 60, src, 62 ); - STORE_DEST( D0, D1, D2, D3, dst4, 6, dst5, 6, dst6, 6, dst7, 6 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 56, src, 58, src, 60, src, 62 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst4, 6, dst5, 6, dst6, 6, dst7, 6 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 57, src, 59, src, 61, src, 63 ); - STORE_DEST( D0, D1, D2, D3, dst0, 7, dst1, 7, dst2, 7, dst3, 7 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 57, src, 59, src, 61, src, 63 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst0, 7, dst1, 7, dst2, 7, dst3, 7 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst4, 7, dst5, 7, dst6, 7, dst7, 7 ); - + STOR_DEST_4x32( D0, D1, D2, D3, dst4, 7, dst5, 7, dst6, 7, dst7, 7 ); // if ( bit_len <= 1024 ) return; } +/* static inline void dintrlv_8x32_512( void *dst0, void *dst1, void *dst2, void *dst3, void *dst4, void *dst5, void *dst6, void *dst7, const void *src ) { v128_t D0, D1, D2, D3, S0, S1, S2, S3; - LOAD_SRCE( S0, S1, S2, S3, src, 0, src, 2, src, 4, src, 6 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 0, src, 2, src, 4, src, 6 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 1, src, 3, src, 5, src, 7 ); - STORE_DEST( D0, D1, D2, D3, dst0, 0, dst1, 0, dst2, 0, dst3, 0 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 1, src, 3, src, 5, src, 7 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst0, 0, dst1, 0, dst2, 0, dst3, 0 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 8, src, 10, src, 12, src, 14 ); - STORE_DEST( D0, D1, D2, D3, dst4, 0, dst5, 0, dst6, 0, dst7, 0 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 8, src, 10, 
src, 12, src, 14 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst4, 0, dst5, 0, dst6, 0, dst7, 0 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 9, src, 11, src, 13, src, 15 ); - STORE_DEST( D0, D1, D2, D3, dst0, 1, dst1, 1, dst2, 1, dst3, 1 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 9, src, 11, src, 13, src, 15 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst0, 1, dst1, 1, dst2, 1, dst3, 1 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 16, src, 18, src, 20, src, 22 ); - STORE_DEST( D0, D1, D2, D3, dst4, 1, dst5, 1, dst6, 1, dst7, 1 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 16, src, 18, src, 20, src, 22 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst4, 1, dst5, 1, dst6, 1, dst7, 1 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 17, src, 19, src, 21, src, 23 ); - STORE_DEST( D0, D1, D2, D3, dst0, 2, dst1, 2, dst2, 2, dst3, 2 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 17, src, 19, src, 21, src, 23 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst0, 2, dst1, 2, dst2, 2, dst3, 2 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 24, src, 26, src, 28, src, 30 ); - STORE_DEST( D0, D1, D2, D3, dst4, 2, dst5, 2, dst6, 2, dst7, 2 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 24, src, 26, src, 28, src, 30 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst4, 2, dst5, 2, dst6, 2, dst7, 2 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 25, src, 27, src, 29, src, 31 ); - STORE_DEST( D0, D1, D2, D3, dst0, 3, dst1, 3, dst2, 3, dst3, 3 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 25, src, 27, src, 29, src, 31 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst0, 3, dst1, 3, dst2, 3, dst3, 3 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst4, 3, dst5, 3, dst6, 3, dst7, 3 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst4, 3, dst5, 3, dst6, 3, dst7, 3 ); } +*/ #endif // SSE4_1 @@ -923,111 +1050,111 @@ static inline void intrlv_16x32( void *dst, const void *s00, { v128_t D0, D1, D2, D3, S0, S1, S2, S3; - LOAD_SRCE( S0, S1, S2, S3, s00, 0, s01, 0, s02, 0, s03, 0 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s00, 0, s01, 0, s02, 0, s03, 0 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s04, 0, s05, 0, s06, 0, s07, 0 ); - STORE_DEST( D0, D1, D2, D3, dst, 0, dst, 4, dst, 8, dst, 12 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s04, 0, s05, 0, s06, 0, s07, 0 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 0, dst, 4, dst, 8, dst, 12 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s08, 0, s09, 0, s10, 0, s11, 0 ); - STORE_DEST( D0, D1, D2, D3, dst, 1, dst, 5, dst, 9, dst, 13 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s08, 0, s09, 0, s10, 0, s11, 0 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 1, dst, 5, dst, 9, dst, 13 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s12, 0, s13, 0, s14, 0, s15, 0 ); - STORE_DEST( D0, D1, D2, D3, dst, 2, dst, 6, dst, 10, dst, 14 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s12, 0, s13, 0, s14, 0, s15, 0 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 2, dst, 6, dst, 10, dst, 14 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s00, 1, s01, 1, s02, 1, s03, 1 ); - STORE_DEST( D0, D1, D2, D3, dst, 3, dst, 7, dst, 11, dst, 15 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s00, 1, s01, 1, s02, 1, s03, 1 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 3, dst, 7, dst, 11, dst, 15 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s04, 1, s05, 1, s06, 1, s07, 1 ); - 
STORE_DEST( D0, D1, D2, D3, dst, 16, dst, 20, dst, 24, dst, 28 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s04, 1, s05, 1, s06, 1, s07, 1 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 16, dst, 20, dst, 24, dst, 28 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s08, 1, s09, 1, s10, 1, s11, 1 ); - STORE_DEST( D0, D1, D2, D3, dst, 17, dst, 21, dst, 25, dst, 29 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s08, 1, s09, 1, s10, 1, s11, 1 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 17, dst, 21, dst, 25, dst, 29 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s12, 1, s13, 1, s14, 1, s15, 1 ); - STORE_DEST( D0, D1, D2, D3, dst, 18, dst, 22, dst, 26, dst, 30 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s12, 1, s13, 1, s14, 1, s15, 1 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 18, dst, 22, dst, 26, dst, 30 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst, 19, dst, 23, dst, 27, dst, 31 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 19, dst, 23, dst, 27, dst, 31 ); if ( bit_len <= 256 ) return; - LOAD_SRCE( S0, S1, S2, S3, s00, 2, s01, 2, s02, 2, s03, 2 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s00, 2, s01, 2, s02, 2, s03, 2 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s04, 2, s05, 2, s06, 2, s07, 2 ); - STORE_DEST( D0, D1, D2, D3, dst, 32, dst, 36, dst, 40, dst, 44 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s04, 2, s05, 2, s06, 2, s07, 2 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 32, dst, 36, dst, 40, dst, 44 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s08, 2, s09, 2, s10, 2, s11, 2 ); - STORE_DEST( D0, D1, D2, D3, dst, 33, dst, 37, dst, 41, dst, 45 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s08, 2, s09, 2, s10, 2, s11, 2 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 33, dst, 37, dst, 41, dst, 45 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s12, 2, s13, 2, s14, 2, s15, 2 ); - STORE_DEST( D0, D1, D2, D3, dst, 34, dst, 38, dst, 42, dst, 46 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s12, 2, s13, 2, s14, 2, s15, 2 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 34, dst, 38, dst, 42, dst, 46 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s00, 3, s01, 3, s02, 3, s03, 3 ); - STORE_DEST( D0, D1, D2, D3, dst, 35, dst, 39, dst, 43, dst, 47 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s00, 3, s01, 3, s02, 3, s03, 3 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 35, dst, 39, dst, 43, dst, 47 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s04, 3, s05, 3, s06, 3, s07, 3 ); - STORE_DEST( D0, D1, D2, D3, dst, 48, dst, 52, dst, 56, dst, 60 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s04, 3, s05, 3, s06, 3, s07, 3 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 48, dst, 52, dst, 56, dst, 60 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s08, 3, s09, 3, s10, 3, s11, 3 ); - STORE_DEST( D0, D1, D2, D3, dst, 49, dst, 53, dst, 57, dst, 61 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s08, 3, s09, 3, s10, 3, s11, 3 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 49, dst, 53, dst, 57, dst, 61 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s12, 3, s13, 3, s14, 3, s15, 3 ); - STORE_DEST( D0, D1, D2, D3, dst, 50, dst, 54, dst, 58, dst, 62 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s12, 3, s13, 3, s14, 3, s15, 3 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 50, dst, 54, dst, 58, dst, 62 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst, 51, dst, 55, dst, 59, dst, 63 ); + STOR_DEST_4x32( D0, 
D1, D2, D3, dst, 51, dst, 55, dst, 59, dst, 63 ); if ( bit_len <= 512 ) return; - LOAD_SRCE( S0, S1, S2, S3, s00, 4, s01, 4, s02, 4, s03, 4 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s00, 4, s01, 4, s02, 4, s03, 4 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s04, 4, s05, 4, s06, 4, s07, 4 ); - STORE_DEST( D0, D1, D2, D3, dst, 64, dst, 68, dst, 72, dst, 76 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s04, 4, s05, 4, s06, 4, s07, 4 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 64, dst, 68, dst, 72, dst, 76 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s08, 4, s09, 4, s10, 4, s11, 4 ); - STORE_DEST( D0, D1, D2, D3, dst, 65, dst, 69, dst, 73, dst, 77 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s08, 4, s09, 4, s10, 4, s11, 4 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 65, dst, 69, dst, 73, dst, 77 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s12, 4, s13, 4, s14, 4, s15, 4 ); - STORE_DEST( D0, D1, D2, D3, dst, 66, dst, 70, dst, 74, dst, 78 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s12, 4, s13, 4, s14, 4, s15, 4 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 66, dst, 70, dst, 74, dst, 78 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst, 67, dst, 71, dst, 75, dst, 79 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 67, dst, 71, dst, 75, dst, 79 ); if ( bit_len <= 640 ) return; - LOAD_SRCE( S0, S1, S2, S3, s00, 5, s01, 5, s02, 5, s03, 5 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s00, 5, s01, 5, s02, 5, s03, 5 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s04, 5, s05, 5, s06, 5, s07, 5 ); - STORE_DEST( D0, D1, D2, D3, dst, 80, dst, 84, dst, 88, dst, 92 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s04, 5, s05, 5, s06, 5, s07, 5 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 80, dst, 84, dst, 88, dst, 92 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s08, 5, s09, 5, s10, 5, s11, 5 ); - STORE_DEST( D0, D1, D2, D3, dst, 81, dst, 85, dst, 89, dst, 93 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s08, 5, s09, 5, s10, 5, s11, 5 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 81, dst, 85, dst, 89, dst, 93 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s12, 5, s13, 5, s14, 5, s15, 5 ); - STORE_DEST( D0, D1, D2, D3, dst, 82, dst, 86, dst, 90, dst, 94 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s12, 5, s13, 5, s14, 5, s15, 5 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 82, dst, 86, dst, 90, dst, 94 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s00, 6, s01, 6, s02, 6, s03, 6 ); - STORE_DEST( D0, D1, D2, D3, dst, 83, dst, 87, dst, 91, dst, 95 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s00, 6, s01, 6, s02, 6, s03, 6 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 83, dst, 87, dst, 91, dst, 95 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s04, 6, s05, 6, s06, 6, s07, 6 ); - STORE_DEST( D0, D1, D2, D3, dst, 96, dst, 100, dst, 104, dst, 108 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s04, 6, s05, 6, s06, 6, s07, 6 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 96, dst, 100, dst, 104, dst, 108 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s08, 6, s09, 6, s10, 6, s11, 6 ); - STORE_DEST( D0, D1, D2, D3, dst, 97, dst, 101, dst, 105, dst, 109 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s08, 6, s09, 6, s10, 6, s11, 6 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 97, dst, 101, dst, 105, dst, 109 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s12, 6, s13, 6, s14, 6, s15, 6 ); - 
STORE_DEST( D0, D1, D2, D3, dst, 98, dst, 102, dst, 106, dst, 110 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s12, 6, s13, 6, s14, 6, s15, 6 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 98, dst, 102, dst, 106, dst, 110 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s00, 7, s01, 7, s02, 7, s03, 7 ); - STORE_DEST( D0, D1, D2, D3, dst, 99, dst, 103, dst, 107, dst, 111 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s00, 7, s01, 7, s02, 7, s03, 7 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 99, dst, 103, dst, 107, dst, 111 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s04, 7, s05, 7, s06, 7, s07, 7 ); - STORE_DEST( D0, D1, D2, D3, dst, 112, dst, 116, dst, 120, dst, 124 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s04, 7, s05, 7, s06, 7, s07, 7 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 112, dst, 116, dst, 120, dst, 124 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s08, 7, s09, 7, s10, 7, s11, 7 ); - STORE_DEST( D0, D1, D2, D3, dst, 113, dst, 117, dst, 121, dst, 125 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s08, 7, s09, 7, s10, 7, s11, 7 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 113, dst, 117, dst, 121, dst, 125 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s12, 7, s13, 7, s14, 7, s15, 7 ); - STORE_DEST( D0, D1, D2, D3, dst, 114, dst, 118, dst, 122, dst, 126 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s12, 7, s13, 7, s14, 7, s15, 7 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 114, dst, 118, dst, 122, dst, 126 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst, 115, dst, 119, dst, 123, dst, 127 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 115, dst, 119, dst, 123, dst, 127 ); // if ( bit_len <= 1024 ) return; } @@ -1041,54 +1168,54 @@ static inline void intrlv_16x32_512( void *dst, const void *s00, { v128_t D0, D1, D2, D3, S0, S1, S2, S3; - LOAD_SRCE( S0, S1, S2, S3, s00, 0, s01, 0, s02, 0, s03, 0 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s00, 0, s01, 0, s02, 0, s03, 0 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s04, 0, s05, 0, s06, 0, s07, 0 ); - STORE_DEST( D0, D1, D2, D3, dst, 0, dst, 4, dst, 8, dst, 12 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s04, 0, s05, 0, s06, 0, s07, 0 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 0, dst, 4, dst, 8, dst, 12 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s08, 0, s09, 0, s10, 0, s11, 0 ); - STORE_DEST( D0, D1, D2, D3, dst, 1, dst, 5, dst, 9, dst, 13 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s08, 0, s09, 0, s10, 0, s11, 0 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 1, dst, 5, dst, 9, dst, 13 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s12, 0, s13, 0, s14, 0, s15, 0 ); - STORE_DEST( D0, D1, D2, D3, dst, 2, dst, 6, dst, 10, dst, 14 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s12, 0, s13, 0, s14, 0, s15, 0 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 2, dst, 6, dst, 10, dst, 14 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s00, 1, s01, 1, s02, 1, s03, 1 ); - STORE_DEST( D0, D1, D2, D3, dst, 3, dst, 7, dst, 11, dst, 15 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s00, 1, s01, 1, s02, 1, s03, 1 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 3, dst, 7, dst, 11, dst, 15 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s04, 1, s05, 1, s06, 1, s07, 1 ); - STORE_DEST( D0, D1, D2, D3, dst, 16, dst, 20, dst, 24, dst, 28 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s04, 1, s05, 1, s06, 1, s07, 1 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 16, dst, 
20, dst, 24, dst, 28 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s08, 1, s09, 1, s10, 1, s11, 1 ); - STORE_DEST( D0, D1, D2, D3, dst, 17, dst, 21, dst, 25, dst, 29 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s08, 1, s09, 1, s10, 1, s11, 1 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 17, dst, 21, dst, 25, dst, 29 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s12, 1, s13, 1, s14, 1, s15, 1 ); - STORE_DEST( D0, D1, D2, D3, dst, 18, dst, 22, dst, 26, dst, 30 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s12, 1, s13, 1, s14, 1, s15, 1 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 18, dst, 22, dst, 26, dst, 30 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s00, 2, s01, 2, s02, 2, s03, 2 ); - STORE_DEST( D0, D1, D2, D3, dst, 19, dst, 23, dst, 27, dst, 31 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s00, 2, s01, 2, s02, 2, s03, 2 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 19, dst, 23, dst, 27, dst, 31 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s04, 2, s05, 2, s06, 2, s07, 2 ); - STORE_DEST( D0, D1, D2, D3, dst, 32, dst, 36, dst, 40, dst, 44 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s04, 2, s05, 2, s06, 2, s07, 2 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 32, dst, 36, dst, 40, dst, 44 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s08, 2, s09, 2, s10, 2, s11, 2 ); - STORE_DEST( D0, D1, D2, D3, dst, 33, dst, 37, dst, 41, dst, 45 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s08, 2, s09, 2, s10, 2, s11, 2 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 33, dst, 37, dst, 41, dst, 45 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s12, 2, s13, 2, s14, 2, s15, 2 ); - STORE_DEST( D0, D1, D2, D3, dst, 34, dst, 38, dst, 42, dst, 46 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s12, 2, s13, 2, s14, 2, s15, 2 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 34, dst, 38, dst, 42, dst, 46 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s00, 3, s01, 3, s02, 3, s03, 3 ); - STORE_DEST( D0, D1, D2, D3, dst, 35, dst, 39, dst, 43, dst, 47 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s00, 3, s01, 3, s02, 3, s03, 3 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 35, dst, 39, dst, 43, dst, 47 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s04, 3, s05, 3, s06, 3, s07, 3 ); - STORE_DEST( D0, D1, D2, D3, dst, 48, dst, 52, dst, 56, dst, 60 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s04, 3, s05, 3, s06, 3, s07, 3 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 48, dst, 52, dst, 56, dst, 60 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s08, 3, s09, 3, s10, 3, s11, 3 ); - STORE_DEST( D0, D1, D2, D3, dst, 49, dst, 53, dst, 57, dst, 61 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s08, 3, s09, 3, s10, 3, s11, 3 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 49, dst, 53, dst, 57, dst, 61 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, s12, 3, s13, 3, s14, 3, s15, 3 ); - STORE_DEST( D0, D1, D2, D3, dst, 50, dst, 54, dst, 58, dst, 62 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, s12, 3, s13, 3, s14, 3, s15, 3 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 50, dst, 54, dst, 58, dst, 62 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst, 51, dst, 55, dst, 59, dst, 63 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst, 51, dst, 55, dst, 59, dst, 63 ); } static inline void dintrlv_16x32( void *dst00, void *dst01, void *dst02, @@ -1099,111 +1226,111 @@ static inline void dintrlv_16x32( void *dst00, void *dst01, 
void *dst02, { v128_t D0, D1, D2, D3, S0, S1, S2, S3; - LOAD_SRCE( S0, S1, S2, S3, src, 0, src, 4, src, 8, src, 12 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 0, src, 4, src, 8, src, 12 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 1, src, 5, src, 9, src, 13 ); - STORE_DEST( D0, D1, D2, D3, dst00, 0, dst01, 0, dst02, 0, dst03, 0 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 1, src, 5, src, 9, src, 13 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst00, 0, dst01, 0, dst02, 0, dst03, 0 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 2, src, 6, src, 10, src, 14 ); - STORE_DEST( D0, D1, D2, D3, dst04, 0, dst05, 0, dst06, 0, dst07, 0 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 2, src, 6, src, 10, src, 14 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst04, 0, dst05, 0, dst06, 0, dst07, 0 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 3, src, 7, src, 11, src, 15 ); - STORE_DEST( D0, D1, D2, D3, dst08, 0, dst09, 0, dst10, 0, dst11, 0 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 3, src, 7, src, 11, src, 15 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst08, 0, dst09, 0, dst10, 0, dst11, 0 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 16, src, 20, src, 24, src, 28 ); - STORE_DEST( D0, D1, D2, D3, dst12, 0, dst13, 0, dst14, 0, dst15, 0 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 16, src, 20, src, 24, src, 28 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst12, 0, dst13, 0, dst14, 0, dst15, 0 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 17, src, 21, src, 25, src, 29 ); - STORE_DEST( D0, D1, D2, D3, dst00, 1, dst01, 1, dst02, 1, dst03, 1 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 17, src, 21, src, 25, src, 29 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst00, 1, dst01, 1, dst02, 1, dst03, 1 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 18, src, 22, src, 26, src, 30 ); - STORE_DEST( D0, D1, D2, D3, dst04, 1, dst05, 1, dst06, 1, dst07, 1 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 18, src, 22, src, 26, src, 30 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst04, 1, dst05, 1, dst06, 1, dst07, 1 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 19, src, 23, src, 27, src, 31 ); - STORE_DEST( D0, D1, D2, D3, dst08, 1, dst09, 1, dst10, 1, dst11, 1 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 19, src, 23, src, 27, src, 31 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst08, 1, dst09, 1, dst10, 1, dst11, 1 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst12, 1, dst13, 1, dst14, 1, dst15, 1 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst12, 1, dst13, 1, dst14, 1, dst15, 1 ); if ( bit_len <= 256 ) return; - LOAD_SRCE( S0, S1, S2, S3, src, 32, src, 36, src, 40, src, 44 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 32, src, 36, src, 40, src, 44 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 33, src, 37, src, 41, src, 45 ); - STORE_DEST( D0, D1, D2, D3, dst00, 2, dst01, 2, dst02, 2, dst03, 2 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 33, src, 37, src, 41, src, 45 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst00, 2, dst01, 2, dst02, 2, dst03, 2 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 34, src, 38, src, 42, src, 46 ); - STORE_DEST( D0, D1, D2, D3, dst04, 2, dst05, 2, dst06, 2, dst07, 2 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 34, src, 38, src, 42, src, 46 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst04, 2, dst05, 2, dst06, 2, dst07, 2 ); 
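// Interleave and de-interleave share the same ILEAVE_4x32 kernel: a 4x4 transpose is its own inverse, so only the load/store index patterns differ between the two directions. The 16 lane variants run that kernel on four groups of four lanes, with the vector stride raised from 2 to 4.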
ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 35, src, 39, src, 43, src, 47 ); - STORE_DEST( D0, D1, D2, D3, dst08, 2, dst09, 2, dst10, 2, dst11, 2 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 35, src, 39, src, 43, src, 47 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst08, 2, dst09, 2, dst10, 2, dst11, 2 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 48, src, 52, src, 56, src, 60 ); - STORE_DEST( D0, D1, D2, D3, dst12, 2, dst13, 2, dst14, 2, dst15, 2 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 48, src, 52, src, 56, src, 60 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst12, 2, dst13, 2, dst14, 2, dst15, 2 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 49, src, 53, src, 57, src, 61 ); - STORE_DEST( D0, D1, D2, D3, dst00, 3, dst01, 3, dst02, 3, dst03, 3 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 49, src, 53, src, 57, src, 61 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst00, 3, dst01, 3, dst02, 3, dst03, 3 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 50, src, 54, src, 58, src, 62 ); - STORE_DEST( D0, D1, D2, D3, dst04, 3, dst05, 3, dst06, 3, dst07, 3 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 50, src, 54, src, 58, src, 62 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst04, 3, dst05, 3, dst06, 3, dst07, 3 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 51, src, 55, src, 59, src, 63 ); - STORE_DEST( D0, D1, D2, D3, dst08, 3, dst09, 3, dst10, 3, dst11, 3 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 51, src, 55, src, 59, src, 63 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst08, 3, dst09, 3, dst10, 3, dst11, 3 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst12, 3, dst13, 3, dst14, 3, dst15, 3 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst12, 3, dst13, 3, dst14, 3, dst15, 3 ); if ( bit_len <= 512 ) return; - LOAD_SRCE( S0, S1, S2, S3, src, 64, src, 68, src, 72, src, 76 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 64, src, 68, src, 72, src, 76 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 65, src, 69, src, 73, src, 77 ); - STORE_DEST( D0, D1, D2, D3, dst00, 4, dst01, 4, dst02, 4, dst03, 4 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 65, src, 69, src, 73, src, 77 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst00, 4, dst01, 4, dst02, 4, dst03, 4 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 66, src, 70, src, 74, src, 78 ); - STORE_DEST( D0, D1, D2, D3, dst04, 4, dst05, 4, dst06, 4, dst07, 4 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 66, src, 70, src, 74, src, 78 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst04, 4, dst05, 4, dst06, 4, dst07, 4 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 67, src, 71, src, 75, src, 79 ); - STORE_DEST( D0, D1, D2, D3, dst08, 4, dst09, 4, dst10, 4, dst11, 4 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 67, src, 71, src, 75, src, 79 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst08, 4, dst09, 4, dst10, 4, dst11, 4 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst12, 4, dst13, 4, dst14, 4, dst15, 4 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst12, 4, dst13, 4, dst14, 4, dst15, 4 ); if ( bit_len <= 640 ) return; - LOAD_SRCE( S0, S1, S2, S3, src, 80, src, 84, src, 88, src, 92 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 80, src, 84, src, 88, src, 92 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 81, src, 85, src, 89, src, 93 ); - STORE_DEST( D0, D1, D2, D3, dst00, 5, 
dst01, 5, dst02, 5, dst03, 5 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 81, src, 85, src, 89, src, 93 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst00, 5, dst01, 5, dst02, 5, dst03, 5 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 82, src, 86, src, 90, src, 94 ); - STORE_DEST( D0, D1, D2, D3, dst04, 5, dst05, 5, dst06, 5, dst07, 5 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 82, src, 86, src, 90, src, 94 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst04, 5, dst05, 5, dst06, 5, dst07, 5 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 83, src, 87, src, 91, src, 95 ); - STORE_DEST( D0, D1, D2, D3, dst08, 5, dst09, 5, dst10, 5, dst11, 5 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 83, src, 87, src, 91, src, 95 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst08, 5, dst09, 5, dst10, 5, dst11, 5 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst12, 5, dst13, 5, dst14, 5, dst15, 5 ); - LOAD_SRCE( S0, S1, S2, S3, src, 96, src, 100, src, 104, src, 108 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst12, 5, dst13, 5, dst14, 5, dst15, 5 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 96, src, 100, src, 104, src, 108 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 97, src, 101, src, 105, src, 109 ); - STORE_DEST( D0, D1, D2, D3, dst00, 6, dst01, 6, dst02, 6, dst03, 6 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 97, src, 101, src, 105, src, 109 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst00, 6, dst01, 6, dst02, 6, dst03, 6 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 98, src, 102, src, 106, src, 110 ); - STORE_DEST( D0, D1, D2, D3, dst04, 6, dst05, 6, dst06, 6, dst07, 6 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 98, src, 102, src, 106, src, 110 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst04, 6, dst05, 6, dst06, 6, dst07, 6 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 99, src, 103, src, 107, src, 111 ); - STORE_DEST( D0, D1, D2, D3, dst08, 6, dst09, 6, dst10, 6, dst11, 6 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 99, src, 103, src, 107, src, 111 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst08, 6, dst09, 6, dst10, 6, dst11, 6 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 112, src, 116, src, 120, src, 124 ); - STORE_DEST( D0, D1, D2, D3, dst12, 6, dst13, 6, dst14, 6, dst15, 6 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 112, src, 116, src, 120, src, 124 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst12, 6, dst13, 6, dst14, 6, dst15, 6 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 113, src, 117, src, 121, src, 125 ); - STORE_DEST( D0, D1, D2, D3, dst00, 7, dst01, 7, dst02, 7, dst03, 7 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 113, src, 117, src, 121, src, 125 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst00, 7, dst01, 7, dst02, 7, dst03, 7 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 114, src, 118, src, 122, src, 126 ); - STORE_DEST( D0, D1, D2, D3, dst04, 7, dst05, 7, dst06, 7, dst07, 7 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 114, src, 118, src, 122, src, 126 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst04, 7, dst05, 7, dst06, 7, dst07, 7 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 115, src, 119, src, 123, src, 127 ); - STORE_DEST( D0, D1, D2, D3, dst08, 7, dst09, 7, dst10, 7, dst11, 7 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 115, src, 119, src, 123, src, 127 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst08, 7, dst09, 7, 
dst10, 7, dst11, 7 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst12, 7, dst13, 7, dst14, 7, dst15, 7 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst12, 7, dst13, 7, dst14, 7, dst15, 7 ); // if ( bit_len <= 1024 ) return; } @@ -1216,54 +1343,54 @@ static inline void dintrlv_16x32_512( void *dst00, void *dst01, void *dst02, { v128_t D0, D1, D2, D3, S0, S1, S2, S3; - LOAD_SRCE( S0, S1, S2, S3, src, 0, src, 4, src, 8, src, 12 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 0, src, 4, src, 8, src, 12 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 1, src, 5, src, 9, src, 13 ); - STORE_DEST( D0, D1, D2, D3, dst00, 0, dst01, 0, dst02, 0, dst03, 0 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 1, src, 5, src, 9, src, 13 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst00, 0, dst01, 0, dst02, 0, dst03, 0 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 2, src, 6, src, 10, src, 14 ); - STORE_DEST( D0, D1, D2, D3, dst04, 0, dst05, 0, dst06, 0, dst07, 0 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 2, src, 6, src, 10, src, 14 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst04, 0, dst05, 0, dst06, 0, dst07, 0 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 3, src, 7, src, 11, src, 15 ); - STORE_DEST( D0, D1, D2, D3, dst08, 0, dst09, 0, dst10, 0, dst11, 0 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 3, src, 7, src, 11, src, 15 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst08, 0, dst09, 0, dst10, 0, dst11, 0 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 16, src, 20, src, 24, src, 28 ); - STORE_DEST( D0, D1, D2, D3, dst12, 0, dst13, 0, dst14, 0, dst15, 0 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 16, src, 20, src, 24, src, 28 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst12, 0, dst13, 0, dst14, 0, dst15, 0 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 17, src, 21, src, 25, src, 29 ); - STORE_DEST( D0, D1, D2, D3, dst00, 1, dst01, 1, dst02, 1, dst03, 1 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 17, src, 21, src, 25, src, 29 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst00, 1, dst01, 1, dst02, 1, dst03, 1 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 18, src, 22, src, 26, src, 30 ); - STORE_DEST( D0, D1, D2, D3, dst04, 1, dst05, 1, dst06, 1, dst07, 1 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 18, src, 22, src, 26, src, 30 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst04, 1, dst05, 1, dst06, 1, dst07, 1 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 19, src, 23, src, 27, src, 31 ); - STORE_DEST( D0, D1, D2, D3, dst08, 1, dst09, 1, dst10, 1, dst11, 1 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 19, src, 23, src, 27, src, 31 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst08, 1, dst09, 1, dst10, 1, dst11, 1 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 32, src, 36, src, 40, src, 44 ); - STORE_DEST( D0, D1, D2, D3, dst12, 1, dst13, 1, dst14, 1, dst15, 1 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 32, src, 36, src, 40, src, 44 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst12, 1, dst13, 1, dst14, 1, dst15, 1 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 33, src, 37, src, 41, src, 45 ); - STORE_DEST( D0, D1, D2, D3, dst00, 2, dst01, 2, dst02, 2, dst03, 2 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 33, src, 37, src, 41, src, 45 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst00, 2, dst01, 2, dst02, 2, dst03, 2 ); ILEAVE_4x32( D0, D1, D2, D3, S0, 
S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 34, src, 38, src, 42, src, 46 ); - STORE_DEST( D0, D1, D2, D3, dst04, 2, dst05, 2, dst06, 2, dst07, 2 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 34, src, 38, src, 42, src, 46 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst04, 2, dst05, 2, dst06, 2, dst07, 2 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 35, src, 39, src, 43, src, 47 ); - STORE_DEST( D0, D1, D2, D3, dst08, 2, dst09, 2, dst10, 2, dst11, 2 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 35, src, 39, src, 43, src, 47 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst08, 2, dst09, 2, dst10, 2, dst11, 2 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 48, src, 52, src, 56, src, 60 ); - STORE_DEST( D0, D1, D2, D3, dst12, 2, dst13, 2, dst14, 2, dst15, 2 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 48, src, 52, src, 56, src, 60 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst12, 2, dst13, 2, dst14, 2, dst15, 2 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 49, src, 53, src, 57, src, 61 ); - STORE_DEST( D0, D1, D2, D3, dst00, 3, dst01, 3, dst02, 3, dst03, 3 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 49, src, 53, src, 57, src, 61 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst00, 3, dst01, 3, dst02, 3, dst03, 3 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 50, src, 54, src, 58, src, 62 ); - STORE_DEST( D0, D1, D2, D3, dst04, 3, dst05, 3, dst06, 3, dst07, 3 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 50, src, 54, src, 58, src, 62 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst04, 3, dst05, 3, dst06, 3, dst07, 3 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - LOAD_SRCE( S0, S1, S2, S3, src, 51, src, 55, src, 59, src, 63 ); - STORE_DEST( D0, D1, D2, D3, dst08, 3, dst09, 3, dst10, 3, dst11, 3 ); + LOAD_SRCE_4x32( S0, S1, S2, S3, src, 51, src, 55, src, 59, src, 63 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst08, 3, dst09, 3, dst10, 3, dst11, 3 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); - STORE_DEST( D0, D1, D2, D3, dst12, 3, dst13, 3, dst14, 3, dst15, 3 ); + STOR_DEST_4x32( D0, D1, D2, D3, dst12, 3, dst13, 3, dst14, 3, dst15, 3 ); } #endif // SSE4_1 @@ -1422,6 +1549,63 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src ) // 2x64 SSE2, NEON +static inline void intrlv_2x64( void *dst, const void *src0, + const void *src1, const int bit_len ) +{ + v128u64_t *d = (v128u64_t*)dst; + const v128u64_t *s0 = (const v128u64_t*)src0; + const v128u64_t *s1 = (const v128u64_t*)src1; + + d[ 0] = v128_unpacklo64( s0[0], s1[0] ); + d[ 1] = v128_unpackhi64( s0[0], s1[0] ); + d[ 2] = v128_unpacklo64( s0[1], s1[1] ); + d[ 3] = v128_unpackhi64( s0[1], s1[1] ); + if ( bit_len <= 256 ) return; + d[ 4] = v128_unpacklo64( s0[2], s1[2] ); + d[ 5] = v128_unpackhi64( s0[2], s1[2] ); + d[ 6] = v128_unpacklo64( s0[3], s1[3] ); + d[ 7] = v128_unpackhi64( s0[3], s1[3] ); + if ( bit_len <= 512 ) return; + d[ 8] = v128_unpacklo64( s0[4], s1[4] ); + d[ 9] = v128_unpackhi64( s0[4], s1[4] ); + if ( bit_len <= 640 ) return; + d[10] = v128_unpacklo64( s0[5], s1[5] ); + d[11] = v128_unpackhi64( s0[5], s1[5] ); + d[12] = v128_unpacklo64( s0[6], s1[6] ); + d[13] = v128_unpackhi64( s0[6], s1[6] ); + d[14] = v128_unpacklo64( s0[7], s1[7] ); + d[15] = v128_unpackhi64( s0[7], s1[7] ); +} + +static inline void dintrlv_2x64( void *dst0, void *dst1, + const void *src, const int bit_len ) +{ + v128u64_t *d0 = (v128u64_t*)dst0; + v128u64_t *d1 = (v128u64_t*)dst1; + const v128u64_t *s = (const v128u64_t*)src; +
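+ // Each interleaved vector s[j] holds word j of lane 0 and lane 1, so + // unpacklo64( s[2i], s[2i+1] ) gathers the two lane 0 words and + // unpackhi64 the two lane 1 words: a 2x2 transpose of 64 bit elements. +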
d0[0] = v128_unpacklo64( s[ 0], s[ 1] ); + d1[0] = v128_unpackhi64( s[ 0], s[ 1] ); + d0[1] = v128_unpacklo64( s[ 2], s[ 3] ); + d1[1] = v128_unpackhi64( s[ 2], s[ 3] ); + if ( bit_len <= 256 ) return; + d0[2] = v128_unpacklo64( s[ 4], s[ 5] ); + d1[2] = v128_unpackhi64( s[ 4], s[ 5] ); + d0[3] = v128_unpacklo64( s[ 6], s[ 7] ); + d1[3] = v128_unpackhi64( s[ 6], s[ 7] ); + if ( bit_len <= 512 ) return; + d0[4] = v128_unpacklo64( s[ 8], s[ 9] ); + d1[4] = v128_unpackhi64( s[ 8], s[ 9] ); + if ( bit_len <= 640 ) return; + d0[5] = v128_unpacklo64( s[10], s[11] ); + d1[5] = v128_unpackhi64( s[10], s[11] ); + d0[6] = v128_unpacklo64( s[12], s[13] ); + d1[6] = v128_unpackhi64( s[12], s[13] ); + d0[7] = v128_unpacklo64( s[14], s[15] ); + d1[7] = v128_unpackhi64( s[14], s[15] ); +} + +/* static inline void intrlv_2x64( void *dst, const void *src0, const void *src1, const int bit_len ) { @@ -1440,7 +1624,8 @@ static inline void intrlv_2x64( void *dst, const void *src0, d[24] = s0[12]; d[25] = s1[12]; d[26] = s0[13]; d[27] = s1[13]; d[28] = s0[14]; d[29] = s1[14]; d[30] = s0[15]; d[31] = s1[15]; } - +*/ +/* static inline void dintrlv_2x64( void *dst0, void *dst1, const void *src, const int bit_len ) { @@ -1460,6 +1645,7 @@ static inline void dintrlv_2x64( void *dst0, void *dst1, d0[12] = s[24]; d1[12] = s[25]; d0[13] = s[26]; d1[13] = s[27]; d0[14] = s[28]; d1[14] = s[29]; d0[15] = s[30]; d1[15] = s[31]; } +*/ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src ) { @@ -1542,6 +1728,61 @@ static inline void extr_lane_2x64( void *dst, const void *src, // 4x64 (AVX2) +#if defined(__AVX2__) + +#define ILEAVE_4x64( D0, D1, D2, D3, S0, S1, S2, S3 ) \ +{ \ + __m256i T; \ + T = _mm256_unpacklo_epi64( S0, S1 ); \ + S0 = _mm256_unpackhi_epi64( S0, S1 ); \ + S1 = _mm256_unpacklo_epi64( S2, S3 ); \ + S2 = _mm256_unpackhi_epi64( S2, S3 ); \ + D0 = v256_unpacklo128( T, S1 ); \ + D1 = v256_unpacklo128( S0, S2 ); \ + D2 = v256_unpackhi128( T, S1 ); \ + D3 = v256_unpackhi128( S0, S2 ); \ +} + +#define LOAD_SRCE_4x64( S0, S1, S2, S3, src0, i0, src1, i1, src2, i2, src3, i3 ) \ + S0 = _mm256_load_si256( (const __m256i*)(src0) + (i0) ); \ + S1 = _mm256_load_si256( (const __m256i*)(src1) + (i1) ); \ + S2 = _mm256_load_si256( (const __m256i*)(src2) + (i2) ); \ + S3 = _mm256_load_si256( (const __m256i*)(src3) + (i3) ); + +#define STOR_DEST_4x64( D0, D1, D2, D3, dst0, i0, dst1, i1, dst2, i2, dst3, i3 ) \ + _mm256_store_si256( (__m256i*)(dst0) + (i0), D0 ); \ + _mm256_store_si256( (__m256i*)(dst1) + (i1), D1 ); \ + _mm256_store_si256( (__m256i*)(dst2) + (i2), D2 ); \ + _mm256_store_si256( (__m256i*)(dst3) + (i3), D3 ); + +/* +static inline void intrlv_4x64_512( void *dst, const void *src0, + const void *src1, const void *src2, const void *src3 ) +{ + __m256i D0, D1, D2, D3, S0, S1, S2, S3; + LOAD_SRCE_4x64( S0, S1, S2, S3, src0, 0, src1, 0, src2, 0, src3, 0 ); + ILEAVE_4x64( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE_4x64( S0, S1, S2, S3, src0, 1, src1, 1, src2, 1, src3, 1 ); + STOR_DEST_4x64( D0, D1, D2, D3, dst, 0, dst, 1, dst, 2, dst, 3 ); + ILEAVE_4x64( D0, D1, D2, D3, S0, S1, S2, S3 ); + STOR_DEST_4x64( D0, D1, D2, D3, dst, 4, dst, 5, dst, 6, dst, 7 ); +} +*/ +/* +static inline void dintrlv_4x64_512( void *dst0, void *dst1, void *dst2, + void *dst3, const void *src ) +{ + __m256i D0, D1, D2, D3, S0, S1, S2, S3; + LOAD_SRCE_4x64( S0, S1, S2, S3, src, 0, src, 1, src, 2, src, 3 ); + ILEAVE_4x64( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE_4x64( S0, S1, S2, S3, src, 4, src, 5, src, 6, src, 7 ); + 
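+ // ILEAVE_4x64 is a 4x4 transpose of 64 bit elements: the epi64 unpacks + // transpose within each 128 bit lane, then v256_unpacklo128 / + // v256_unpackhi128 swap whole lanes to finish the transpose. +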
STOR_DEST_4x64( D0, D1, D2, D3, dst0, 0, dst1, 0, dst2, 0, dst3, 0 ); + ILEAVE_4x64( D0, D1, D2, D3, S0, S1, S2, S3 ); + STOR_DEST_4x64( D0, D1, D2, D3, dst0, 1, dst1, 1, dst2, 1, dst3, 1 ); +} +*/ +#endif + #if defined(__SSE2__) static inline void intrlv_4x64( void *dst, const void *src0, @@ -1590,6 +1831,7 @@ static inline void intrlv_4x64( void *dst, const void *src0, d[31] = v128_unpackhi64( s2[7], s3[7] ); } + static inline void intrlv_4x64_512( void *dst, const void *src0, const void *src1, const void *src2, const void *src3 ) { @@ -1616,6 +1858,7 @@ static inline void intrlv_4x64_512( void *dst, const void *src0, d[15] = v128_unpackhi64( s2[3], s3[3] ); } + static inline void dintrlv_4x64( void *dst0, void *dst1, void *dst2, void *dst3, const void *src, const int bit_len ) { @@ -1661,6 +1904,7 @@ static inline void dintrlv_4x64( void *dst0, void *dst1, void *dst2, d3[7] = v128_unpackhi64( s[29], s[31] ); } + static inline void dintrlv_4x64_512( void *dst0, void *dst1, void *dst2, void *dst3, const void *src ) { @@ -3318,7 +3562,13 @@ do { \ #endif // AVX512 #undef ILEAVE_4x32 -#undef LOAD_SRCE -#undef ILEAVE_STORE_DEST +#undef LOAD_SRCE_4x32 +#undef STOR_DEST_4x32 +#undef ILEAVE_8x32 +#undef LOAD_SRCE_8x32 +#undef STOR_DEST_8x32 +#undef ILEAVE_4x64 +#undef LOAD_SRCE_4x64 +#undef STOR_DEST_4x64 #endif // INTERLEAVE_H__ diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index 655c729e..2fa9898f 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -141,7 +141,7 @@ #define v128_aesdeclast _mm_aesdeclast_si128 #define v128_aesdeclast_nokey(v) _mm_aesdeclast_si128( v, v128_zero ) -// Used instead if casting. +// Used instead of casting. typedef union { v128_t v128; diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 1a894c15..c568dafa 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -429,6 +429,19 @@ static inline __m256i mm256_not( const __m256i v ) #endif // AVX512 else AVX2 +#if defined(__AVX2__) + +// 128 bit version of unpack +#define v256_unpacklo128( v1, v0 ) _mm256_permute2x128_si256( v1, v0, 0x20 ) +#define v256_unpackhi128( v1, v0 ) _mm256_permute2x128_si256( v1, v0, 0x31 ) + +#else + +#define v256_unpacklo128( v1, v0 ) _mm256_permute2f128_si256( v1, v0, 0x20 ) +#define v256_unpackhi128( v1, v0 ) _mm256_permute2f128_si256( v1, v0, 0x31 ) + +#endif + // // Cross lane shuffles // diff --git a/simd-utils/simd-int.h b/simd-utils/simd-int.h index 2a4ce0b5..95650c59 100644 --- a/simd-utils/simd-int.h +++ b/simd-utils/simd-int.h @@ -164,11 +164,10 @@ static inline uint32_t ror32( uint32_t a, const int c ) // obsolete test // Compiler check for __int128 support // Configure also has a test for int128. -#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) ) +//#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) ) #define GCC_INT128 1 -#endif +//#endif -// obsolte test #if !defined(GCC_INT128) #warning "__int128 not supported, requires GCC-4.8 or newer." 
#endif diff --git a/sysinfos.c b/sysinfos.c index c054a85b..18fb507f 100644 --- a/sysinfos.c +++ b/sysinfos.c @@ -14,6 +14,7 @@ #include #include #include "miner.h" +#include "simd-utils.h" #if defined(__aarch64__) && !defined(__APPLE__) // for arm's "cpuid" @@ -223,8 +224,8 @@ static inline int cpu_fanpercent() #define AVX512_F_Flag (1<<16) #define AVX512_DQ_Flag (1<<17) #define AVX512_IFMA_Flag (1<<21) -#define AVX512_PF_Flag (1<<26) -#define AVX512_ER_Flag (1<<27) +#define AVX512_PF_Flag (1<<26) // obsolete +#define AVX512_ER_Flag (1<<27) // obsolete #define AVX512_CD_Flag (1<<28) #define SHA_Flag (1<<29) #define AVX512_BW_Flag (1<<30) @@ -237,8 +238,8 @@ static inline int cpu_fanpercent() #define AVX512_BITALG_Flag (1<<12) #define AVX512_VPOPCNTDQ_Flag (1<<14) // EDX -#define AVX512_4VNNIW_Flag (1<< 2) -#define AVX512_4FMAPS_Flag (1<< 3) +#define AVX512_4VNNIW_Flag (1<< 2) // obsolete +#define AVX512_4FMAPS_Flag (1<< 3) // obsolete #define AVX512_VP2INTERSECT_Flag (1<< 8) #define AMX_BF16_Flag (1<<22) #define AVX512_FP16_Flag (1<<23) @@ -557,10 +558,15 @@ static inline bool has_aes_ni() #elif defined(__aarch64__) && !defined(__APPLE__) if ( has_neon() ) { - unsigned int cpu_info[4] = { 0 }; +#if defined(KERNEL_HWCAP_AES) + return true; +#else + return false; +#endif +/* unsigned int cpu_info[4] = { 0 }; cpuid( 0, 0, cpu_info ); return cpu_info[0] & HWCAP_AES; - } +*/ } return false; #else return false; @@ -602,10 +608,15 @@ static inline bool has_sha() #elif defined(__aarch64__) && !defined(__APPLE__) if ( has_neon() ) { - unsigned int cpu_info[4] = { 0 }; +#if defined(KERNEL_HWCAP_SHA2) + return true; +#else + return false; +#endif +/* unsigned int cpu_info[4] = { 0 }; cpuid( 0, 0, cpu_info ); return cpu_info[0] & HWCAP_SHA2; - } +*/ } return false; #else return false; diff --git a/util.c b/util.c index 7f0ca6a4..c1161c32 100644 --- a/util.c +++ b/util.c @@ -2075,11 +2075,7 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) const char *s = json_string_value( json_array_get( merkle_arr, i ) ); if ( !s || strlen(s) != 64 ) { - for ( int j = sctx->job.merkle_buf_size; j > 0; j-- ) - free( sctx->job.merkle[i] ); - free( sctx->job.merkle ); - sctx->job.merkle_count = - sctx->job.merkle_buf_size = 0; + sctx->job.merkle_count = 0; pthread_mutex_unlock( &sctx->work_lock ); applog( LOG_ERR, "Stratum notify: invalid Merkle branch" ); goto out;
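Note on the util.c hunk (the #414 fix): the removed error path looped j down from merkle_buf_size but freed sctx->job.merkle[i] on every pass, repeatedly freeing the one branch being validated instead of each branch in turn, then freed the merkle array itself while the job kept the dangling pointer and a stale buf_size. Resetting merkle_count alone keeps the allocated buffer for the next notify. For comparison, a correct full teardown would have to free each element by its own index; the sketch below is illustrative only, with merkle_buf and merkle_buf_free as stand-in names for the relevant fields of struct stratum_job, not code from this commit.

#include <stdlib.h>

// Stand-in for the merkle fields of the stratum job (illustrative; the
// real struct has many more members).
struct merkle_buf
{
   char **merkle;          // hex encoded merkle branches
   int    merkle_count;    // branches used by the current job
   int    merkle_buf_size; // allocated capacity
};

// Free each branch by its own index j, then the array, then clear all
// state so nothing dangles after the error path. Assumes every slot up
// to merkle_buf_size holds a live allocation or NULL (free(NULL) is a
// no-op).
static void merkle_buf_free( struct merkle_buf *job )
{
   for ( int j = 0; j < job->merkle_buf_size; j++ )
      free( job->merkle[j] );
   free( job->merkle );
   job->merkle = NULL;
   job->merkle_count = job->merkle_buf_size = 0;
}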