diff --git a/Makefile b/Makefile
index 9cbd3b99..0a80e68d 100644
--- a/Makefile
+++ b/Makefile
@@ -57,7 +57,7 @@ ARCH_CFLAGS := $(ARCH_CFLAGS)+$(subst $(COMMA),+,$(FEATURE))
 endif
 endif
 
-CXXFLAGS += -Wall -Wcast-qual -I. $(ARCH_CFLAGS) -std=gnu++14
+CXXFLAGS += -Wall -Wcast-qual -Wconversion -I. $(ARCH_CFLAGS) -std=gnu++14
 LDFLAGS += -lm
 OBJS = \
 	tests/binding.o \
diff --git a/sse2neon.h b/sse2neon.h
index 788b3304..9458b521 100644
--- a/sse2neon.h
+++ b/sse2neon.h
@@ -128,17 +128,17 @@
 #include <stdint.h>
 #include <stdlib.h>
 
-FORCE_INLINE double sse2neon_recast_u64_f64(uint64_t u64)
+FORCE_INLINE double sse2neon_recast_u64_f64(uint64_t val)
 {
-    double f64;
-    memcpy(&f64, &u64, sizeof(uint64_t));
-    return f64;
+    double tmp;
+    memcpy(&tmp, &val, sizeof(uint64_t));
+    return tmp;
 }
 
-FORCE_INLINE int64_t sse2neon_recast_f64_s64(double f64)
+FORCE_INLINE int64_t sse2neon_recast_f64_s64(double val)
 {
-    int64_t i64;
-    memcpy(&i64, &f64, sizeof(uint64_t));
-    return i64;
+    int64_t tmp;
+    memcpy(&tmp, &val, sizeof(uint64_t));
+    return tmp;
 }
 #if defined(_WIN32) && !defined(__MINGW32__)
@@ -2407,7 +2407,7 @@ FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
     uint64x1_t t = vpaddl_u32(vpaddl_u16(
         vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
     return vreinterpret_m64_u16(
-        vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
+        vset_lane_u16((uint16_t) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
 }
 
 // Macro: Set the flush zero bits of the MXCSR control and status register to
@@ -5312,7 +5312,7 @@ FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
     if (_sse2neon_unlikely(imm & ~15))
         return _mm_setzero_si128();
     return vreinterpretq_m128i_s16(
-        vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
+        vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16((int16_t) imm)));
 }
 
 // Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
@@ -5391,7 +5391,7 @@ FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
     if (_sse2neon_unlikely(c & ~15))
         return _mm_cmplt_epi16(a, _mm_setzero_si128());
     return vreinterpretq_m128i_s16(
-        vshlq_s16((int16x8_t) a, vdupq_n_s16((int) -c)));
+        vshlq_s16((int16x8_t) a, vdupq_n_s16((int16_t) -c)));
 }
 
 // Shift packed 32-bit integers in a right by count while shifting in sign bits,
@@ -5411,7 +5411,7 @@ FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
 FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
 {
-    const int count = (imm & ~15) ? 15 : imm;
+    const int16_t count = (imm & ~15) ? 15 : (int16_t) imm;
     return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
 }
 
@@ -5473,13 +5473,13 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
 // Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
 // store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
-#define _mm_srli_epi16(a, imm)                                                 \
-    _sse2neon_define0(                                                         \
-        __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) {        \
-            ret = _mm_setzero_si128();                                         \
-        } else {                                                               \
-            ret = vreinterpretq_m128i_u16(                                     \
-                vshlq_u16(vreinterpretq_u16_m128i(_a), vdupq_n_s16(-(imm))));  \
+#define _mm_srli_epi16(a, imm)                                                \
+    _sse2neon_define0(                                                        \
+        __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) {       \
+            ret = _mm_setzero_si128();                                        \
+        } else {                                                              \
+            ret = vreinterpretq_m128i_u16(vshlq_u16(                          \
+                vreinterpretq_u16_m128i(_a), vdupq_n_s16((int16_t) - (imm)))); \
         } _sse2neon_return(ret);)
 
 // Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
@@ -7765,9 +7765,9 @@ FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128
 FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
 {
-    int64x2_t s64 =
+    int64x2_t s64_vec =
         vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a));
-    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+    return !(vgetq_lane_s64(s64_vec, 0) | vgetq_lane_s64(s64_vec, 1));
 }
 
 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
@@ -7785,9 +7785,9 @@ FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128
 FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
 {
-    int64x2_t s64 =
+    int64x2_t s64_vec =
         vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
-    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+    return !(vgetq_lane_s64(s64_vec, 0) | vgetq_lane_s64(s64_vec, 1));
 }
 
 /* SSE4.2 */
@@ -7955,40 +7955,40 @@ static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = {
             SSE2NEON_CAT(u, size)))                                           \
     } while (0)
 
-#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type)                                     \
-    static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \
-                                                int lb)                       \
-    {                                                                         \
-        __m128i mtx[16];                                                      \
-        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),          \
-                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));            \
-        return SSE2NEON_CAT(                                                  \
-            _sse2neon_aggregate_equal_any_,                                   \
-            SSE2NEON_CAT(                                                     \
-                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                        \
-                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,       \
-                                             type))))(la, lb, mtx);           \
+#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type)                                    \
+    static uint16_t _sse2neon_cmp_##type##_equal_any(__m128i a, int la,      \
+                                                     __m128i b, int lb)      \
+    {                                                                        \
+        __m128i mtx[16];                                                     \
+        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),         \
+                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));           \
+        return SSE2NEON_CAT(                                                 \
+            _sse2neon_aggregate_equal_any_,                                  \
+            SSE2NEON_CAT(                                                    \
+                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                       \
+                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,      \
+                                             type))))(la, lb, mtx);          \
     }
 
-#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word)           \
-    static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \
-                                                 int lb)                      \
-    {                                                                         \
-        __m128i mtx[16];                                                      \
-        PCMPSTR_RANGES(                                                       \
-            a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),  \
-            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word);     \
-        return SSE2NEON_CAT(                                                  \
-            _sse2neon_aggregate_ranges_,                                      \
-            SSE2NEON_CAT(                                                     \
-                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                        \
-                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,       \
-                                             type))))(la, lb, mtx);           \
+#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word)          \
+    static uint16_t _sse2neon_cmp_##us##type##_ranges(__m128i a, int la,     \
+                                                      __m128i b, int lb)     \
+    {                                                                        \
+        __m128i mtx[16];                                                     \
+        PCMPSTR_RANGES(                                                      \
+            a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
+            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word);    \
+        return SSE2NEON_CAT(                                                 \
+            _sse2neon_aggregate_ranges_,                                     \
+            SSE2NEON_CAT(                                                    \
+                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                       \
+                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,      \
+                                             type))))(la, lb, mtx);          \
     }
 
 #define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type)                                 \
-    static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la,        \
-                                                    __m128i b, int lb)        \
+    static uint16_t _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la,   \
+                                                         __m128i b, int lb)   \
     {                                                                         \
         __m128i mtx[16];                                                      \
         PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),          \
@@ -8002,29 +8002,34 @@ static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = {
             SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx);      \
     }
 
-static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])
+static uint16_t _sse2neon_aggregate_equal_any_8x16(int la,
+                                                   int lb,
+                                                   __m128i mtx[16])
 {
-    int res = 0;
+    uint16_t res = 0;
     int m = (1 << la) - 1;
     uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
-    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
+    uint8x8_t t_lo = vtst_u8(vdup_n_u8((uint8_t) (m & 0xff)), vec_mask);
+    uint8x8_t t_hi = vtst_u8(vdup_n_u8((uint8_t) (m >> 8)), vec_mask);
     uint8x16_t vec = vcombine_u8(t_lo, t_hi);
     for (int j = 0; j < lb; j++) {
         mtx[j] = vreinterpretq_m128i_u8(
             vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
         mtx[j] = vreinterpretq_m128i_u8(
            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
-        int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
+        uint16_t tmp =
+            _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
         res |= (tmp << j);
     }
     return res;
 }
 
-static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
+static uint16_t _sse2neon_aggregate_equal_any_16x8(int la,
+                                                   int lb,
+                                                   __m128i mtx[16])
 {
-    int res = 0;
-    int m = (1 << la) - 1;
+    uint16_t res = 0;
+    uint16_t m = (uint16_t) (1 << la) - 1;
     uint16x8_t vec =
         vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
     for (int j = 0; j < lb; j++) {
@@ -8032,7 +8037,8 @@ static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
             vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
         mtx[j] = vreinterpretq_m128i_u16(
             vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
-        int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
+        uint16_t tmp =
+            _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
         res |= (tmp << j);
     }
     return res;
@@ -8046,10 +8052,10 @@ static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
 
 SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)
 
-static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
+static uint16_t _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
 {
-    int res = 0;
-    int m = (1 << la) - 1;
+    uint16_t res = 0;
+    uint16_t m = (uint16_t) (1 << la) - 1;
     uint16x8_t vec =
         vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
     for (int j = 0; j < lb; j++) {
@@ -8062,23 +8068,23 @@ static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
         uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),
                                        vreinterpretq_u32_m128i(tmp));
 #if defined(__aarch64__) || defined(_M_ARM64)
-        int t = vaddvq_u32(vec_res) ? 1 : 0;
+        uint16_t t = vaddvq_u32(vec_res) ? 1 : 0;
 #else
         uint64x2_t sumh = vpaddlq_u32(vec_res);
-        int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
+        uint16_t t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
 #endif
         res |= (t << j);
     }
     return res;
 }
 
-static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
+static uint16_t _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
 {
-    int res = 0;
-    int m = (1 << la) - 1;
+    uint16_t res = 0;
+    uint16_t m = (uint16_t) ((1 << la) - 1);
     uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
-    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
+    uint8x8_t t_lo = vtst_u8(vdup_n_u8((uint8_t) (m & 0xff)), vec_mask);
+    uint8x8_t t_hi = vtst_u8(vdup_n_u8((uint8_t) (m >> 8)), vec_mask);
     uint8x16_t vec = vcombine_u8(t_lo, t_hi);
     for (int j = 0; j < lb; j++) {
         mtx[j] = vreinterpretq_m128i_u8(
@@ -8089,7 +8095,7 @@ static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
             vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));
         uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),
                                        vreinterpretq_u16_m128i(tmp));
-        int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
+        uint16_t t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
         res |= (t << j);
     }
     return res;
@@ -8111,22 +8117,25 @@ SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)
 #undef SSE2NEON_CMP_RANGES_IS_BYTE
 #undef SSE2NEON_CMP_RANGES_IS_WORD
 
-static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
+static uint16_t _sse2neon_cmp_byte_equal_each(__m128i a,
+                                              int la,
+                                              __m128i b,
+                                              int lb)
 {
     uint8x16_t mtx =
         vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
-    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
-    int m1 = 0x10000 - (1 << la);
-    int tb = 0x10000 - (1 << lb);
+    uint16_t m0 = (la < lb) ? 0 : (uint16_t) ((1 << la) - (1 << lb));
+    uint16_t m1 = (uint16_t) (0x10000 - (1 << la));
+    uint16_t tb = (uint16_t) (0x10000 - (1 << lb));
     uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
     uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
     vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);
-    vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);
-    vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);
-    vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);
-    tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);
-    tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);
+    vec0_lo = vtst_u8(vdup_n_u8((uint8_t) m0), vec_mask);
+    vec0_hi = vtst_u8(vdup_n_u8((uint8_t) (m0 >> 8)), vec_mask);
+    vec1_lo = vtst_u8(vdup_n_u8((uint8_t) m1), vec_mask);
+    vec1_hi = vtst_u8(vdup_n_u8((uint8_t) (m1 >> 8)), vec_mask);
+    tmp_lo = vtst_u8(vdup_n_u8((uint8_t) tb), vec_mask);
+    tmp_hi = vtst_u8(vdup_n_u8((uint8_t) (tb >> 8)), vec_mask);
 
     res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
     res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
@@ -8135,17 +8144,20 @@ static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
     res_lo = vand_u8(res_lo, vec_mask);
     res_hi = vand_u8(res_hi, vec_mask);
 
-    int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8);
-    return res;
+    return _sse2neon_vaddv_u8(res_lo) +
+           (uint16_t) (_sse2neon_vaddv_u8(res_hi) << 8);
 }
 
-static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
+static uint16_t _sse2neon_cmp_word_equal_each(__m128i a,
+                                              int la,
+                                              __m128i b,
+                                              int lb)
 {
     uint16x8_t mtx =
         vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
-    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
-    int m1 = 0x100 - (1 << la);
-    int tb = 0x100 - (1 << lb);
+    uint16_t m0 = (uint16_t) ((la < lb) ? 0 : ((1 << la) - (1 << lb)));
+    uint16_t m1 = (uint16_t) (0x100 - (1 << la));
+    uint16_t tb = (uint16_t) (0x100 - (1 << lb));
     uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);
     uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);
     uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);
@@ -8160,18 +8172,22 @@ static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
 #define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0
 
 #define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type)  \
-    static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes(   \
-        int bound, int la, int lb, __m128i mtx[16])                            \
+    static uint16_t                                                            \
+    _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes(              \
+        int bound, int la, int lb, __m128i mtx[16])                            \
     {                                                                          \
-        int res = 0;                                                           \
-        int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la);          \
+        uint16_t res = 0;                                                      \
+        uint16_t m1 =                                                          \
+            (uint16_t) (SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la));  \
         uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)(                   \
            vld1_u##size(_sse2neon_cmpestr_mask##size##b),                     \
            vld1q_u##size(_sse2neon_cmpestr_mask##size##b));                   \
        uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)(    \
-            vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask),       \
-                             vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \
-            vtstq_u##size(vdupq_n_u##size(m1), vec_mask));                     \
+            vcombine_u##size(                                                  \
+                vtst_u##size(vdup_n_u##size((uint##size##_t) m1), vec_mask),   \
+                vtst_u##size(vdup_n_u##size((uint##size##_t)(m1 >> 8)),        \
+                             vec_mask)),                                       \
+            vtstq_u##size(vdupq_n_u##size((uint##size##_t) m1), vec_mask));    \
         uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \
         uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0);      \
         for (int j = 0; j < lb; j++) {                                         \
@@ -8188,7 +8204,7 @@ static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
             int val = 1;                                                       \
             for (int j = 0, k = i; j < bound - i && k < bound; j++, k++)       \
                 val &= ptr[k * bound + j];                                     \
-            res += val << i;                                                   \
+            res += (uint16_t) (val << i);                                      \
         }                                                                      \
         return res;                                                            \
     }
@@ -8235,14 +8251,17 @@ enum {
     SSE2NEON_CMPESTR_LIST
 #undef _
 };
-typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
+typedef uint16_t (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
 static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
 #define _(name, func_suffix) _sse2neon_##func_suffix,
     SSE2NEON_CMPESTR_LIST
 #undef _
 };
 
-FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
+FORCE_INLINE uint16_t _sse2neon_sido_negative(int res,
+                                              int lb,
+                                              int imm8,
+                                              int bound)
 {
     switch (imm8 & 0x30) {
     case _SIDD_NEGATIVE_POLARITY:
@@ -8255,7 +8274,7 @@ FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
         break;
     }
 
-    return res & ((bound == 8) ? 0xFF : 0xFFFF);
+    return (uint16_t) (res & ((bound == 8) ? 0xFF : 0xFFFF));
 }
 
 FORCE_INLINE int _sse2neon_clz(unsigned int x)
@@ -8319,10 +8338,10 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
 // As the only difference of PCMPESTR* and PCMPISTR* is the way to calculate the
 // length of string, we use SSE2NEON_CMP{I,E}STRX_GET_LEN to get the length of
 // string a and b.
-#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)                             \
-    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);                                   \
-    SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb);                                   \
-    int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb);            \
+#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)                            \
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);                                  \
+    SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb);                                  \
+    uint16_t r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb);      \
     r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)
 
 #define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8)                       \
@@ -8339,8 +8358,8 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
             dst = vreinterpretq_m128i_u16(vbslq_u16(                          \
                 tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst)));         \
         } else {                                                              \
-            uint8x16_t vec_r2 =                                               \
-                vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8));               \
+            uint8x16_t vec_r2 = vcombine_u8(vdup_n_u8((uint8_t) r2),          \
+                                            vdup_n_u8((uint8_t) (r2 >> 8)));  \
             uint8x16_t tmp =                                                  \
                 vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b));         \
             dst = vreinterpretq_m128i_u8(                                     \
@@ -8351,8 +8370,8 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
         dst = vreinterpretq_m128i_u16(                                        \
             vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0));    \
     } else {                                                                  \
-        dst = vreinterpretq_m128i_u8(                                         \
-            vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0));        \
+        dst = vreinterpretq_m128i_u8(vsetq_lane_u8(                           \
+            (uint8_t) (r2 & 0xff), vreinterpretq_u8_m128i(dst), 0));          \
     }                                                                         \
     }                                                                         \
     return dst
@@ -8576,8 +8595,8 @@ FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
     (defined(_M_ARM64) && !defined(__clang__))
     crc = __crc32ch(crc, v);
 #else
-    crc = _mm_crc32_u8(crc, v & 0xff);
-    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
+    crc = _mm_crc32_u8(crc, (uint8_t) (v & 0xff));
+    crc = _mm_crc32_u8(crc, (uint8_t) ((v >> 8) & 0xff));
 #endif
     return crc;
 }
@@ -8595,8 +8614,8 @@ FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
     (defined(_M_ARM64) && !defined(__clang__))
     crc = __crc32cw(crc, v);
 #else
-    crc = _mm_crc32_u16(crc, v & 0xffff);
-    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
+    crc = _mm_crc32_u16(crc, (uint16_t) (v & 0xffff));
+    crc = _mm_crc32_u16(crc, (uint16_t) ((v >> 16) & 0xffff));
 #endif
     return crc;
 }
@@ -8613,8 +8632,8 @@ FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
 #elif (defined(_M_ARM64) && !defined(__clang__))
     crc = __crc32cd((uint32_t) crc, v);
 #else
-    crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
-    crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
+    crc = _mm_crc32_u32((uint32_t) (crc), (uint32_t) (v & 0xffffffff));
+    crc = _mm_crc32_u32((uint32_t) (crc), (uint32_t) ((v >> 32) & 0xffffffff));
 #endif
     return crc;
 }
diff --git a/tests/impl.cpp b/tests/impl.cpp
index 93d53b3c..3140b5aa 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -531,15 +531,20 @@ int32_t comineq_ss(float a, float b)
     return (a != b);
 }
 
-static inline int16_t saturate_16(int32_t a)
-{
-    int32_t max = (1 << 15) - 1;
-    int32_t min = -(1 << 15);
-    if (a > max)
-        return max;
-    if (a < min)
-        return min;
-    return a;
+static inline int16_t saturate_i16(int32_t a)
+{
+    if (a > INT16_MAX)
+        return INT16_MAX;
+    if (a < INT16_MIN)
+        return INT16_MIN;
+    return (int16_t) a;
+}
+
+static inline uint16_t saturate_u16(uint32_t a)
+{
+    if (a > UINT16_MAX)
+        return UINT16_MAX;
+    return (uint16_t) a;
 }
 
 uint32_t canonical_crc32_u8(uint32_t crc, uint8_t v)
@@ -556,22 +561,23 @@ uint32_t canonical_crc32_u8(uint32_t crc, uint8_t v)
 
 uint32_t canonical_crc32_u16(uint32_t crc, uint16_t v)
 {
-    crc = canonical_crc32_u8(crc, v & 0xff);
-    crc = canonical_crc32_u8(crc, (v >> 8) & 0xff);
+    crc = canonical_crc32_u8(crc, (uint8_t) (v & 0xff));
+    crc = canonical_crc32_u8(crc, (uint8_t) ((v >> 8) & 0xff));
     return crc;
 }
 
 uint32_t canonical_crc32_u32(uint32_t crc, uint32_t v)
 {
-    crc = canonical_crc32_u16(crc, v & 0xffff);
-    crc = canonical_crc32_u16(crc, (v >> 16) & 0xffff);
+    crc = canonical_crc32_u16(crc, (uint16_t) (v & 0xffff));
+    crc = canonical_crc32_u16(crc, (uint16_t) (v >> 16) & 0xffff);
     return crc;
 }
 
 uint64_t canonical_crc32_u64(uint64_t crc, uint64_t v)
 {
-    crc = canonical_crc32_u32((uint32_t) (crc), v & 0xffffffff);
-    crc = canonical_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
+    crc = canonical_crc32_u32((uint32_t) (crc), (uint32_t) (v & 0xffffffff));
+    crc = canonical_crc32_u32((uint32_t) (crc),
+                              (uint32_t) ((v >> 32) & 0xffffffff));
     return crc;
 }
 
@@ -670,14 +676,14 @@ inline __m128i aesdec_128_reference(__m128i a, __m128i b)
         g = v[i][2];
         h = v[i][3];
 
-        v[i][0] = MULTIPLY(e, 0x0e) ^ MULTIPLY(f, 0x0b) ^ MULTIPLY(g, 0x0d) ^
-                  MULTIPLY(h, 0x09);
-        v[i][1] = MULTIPLY(e, 0x09) ^ MULTIPLY(f, 0x0e) ^ MULTIPLY(g, 0x0b) ^
-                  MULTIPLY(h, 0x0d);
-        v[i][2] = MULTIPLY(e, 0x0d) ^ MULTIPLY(f, 0x09) ^ MULTIPLY(g, 0x0e) ^
-                  MULTIPLY(h, 0x0b);
-        v[i][3] = MULTIPLY(e, 0x0b) ^ MULTIPLY(f, 0x0d) ^ MULTIPLY(g, 0x09) ^
-                  MULTIPLY(h, 0x0e);
+        v[i][0] = (uint8_t) (MULTIPLY(e, 0x0e) ^ MULTIPLY(f, 0x0b) ^
+                             MULTIPLY(g, 0x0d) ^ MULTIPLY(h, 0x09));
+        v[i][1] = (uint8_t) (MULTIPLY(e, 0x09) ^ MULTIPLY(f, 0x0e) ^
+                             MULTIPLY(g, 0x0b) ^ MULTIPLY(h, 0x0d));
+        v[i][2] = (uint8_t) (MULTIPLY(e, 0x0d) ^ MULTIPLY(f, 0x09) ^
+                             MULTIPLY(g, 0x0e) ^ MULTIPLY(h, 0x0b));
+        v[i][3] = (uint8_t) (MULTIPLY(e, 0x0b) ^ MULTIPLY(f, 0x0d) ^
+                             MULTIPLY(g, 0x09) ^ MULTIPLY(h, 0x0e));
     }
 
     for (i = 0; i < 16; ++i) {
@@ -774,11 +780,11 @@ static std::pair<uint64_t, uint64_t> clmul_64(uint64_t x, uint64_t y)
     // B = 2
     // m = 32
     // x = (x1 * B^m) + x0
-    uint32_t x0 = x & 0xffffffff;
-    uint32_t x1 = x >> 32;
+    uint32_t x0 = (uint32_t) (x & 0xffffffff);
+    uint32_t x1 = (uint32_t) (x >> 32);
     // y = (y1 * B^m) + y0
-    uint32_t y0 = y & 0xffffffff;
-    uint32_t y1 = y >> 32;
+    uint32_t y0 = (uint32_t) (y & 0xffffffff);
+    uint32_t y1 = (uint32_t) (y >> 32);
 
     // z0 = x0 * y0
     uint64_t z0 = clmul_32(x0, y0);
@@ -891,10 +897,10 @@ result_t test_mm_avg_pu16(const SSE2NEONTestImpl &impl, uint32_t iter)
     const uint16_t *_a = (const uint16_t *) impl.mTestIntPointer1;
     const uint16_t *_b = (const uint16_t *) impl.mTestIntPointer2;
     uint16_t d[4];
-    d[0] = (_a[0] + _b[0] + 1) >> 1;
-    d[1] = (_a[1] + _b[1] + 1) >> 1;
-    d[2] = (_a[2] + _b[2] + 1) >> 1;
-    d[3] = (_a[3] + _b[3] + 1) >> 1;
+    d[0] = (uint16_t) ((_a[0] + _b[0] + 1) >> 1);
+    d[1] = (uint16_t) ((_a[1] + _b[1] + 1) >> 1);
+    d[2] = (uint16_t) ((_a[2] + _b[2] + 1) >> 1);
+    d[3] = (uint16_t) ((_a[3] + _b[3] + 1) >> 1);
 
     __m64 a = load_m64(_a);
     __m64 b = load_m64(_b);
@@ -908,14 +914,14 @@ result_t test_mm_avg_pu8(const SSE2NEONTestImpl &impl, uint32_t iter)
     const uint8_t *_a = (const uint8_t *) impl.mTestIntPointer1;
     const uint8_t *_b = (const uint8_t *) impl.mTestIntPointer2;
     uint8_t d[8];
-    d[0] = (_a[0] + _b[0] + 1) >> 1;
-    d[1] = (_a[1] + _b[1] + 1) >> 1;
-    d[2] = (_a[2] + _b[2] + 1) >> 1;
-    d[3] = (_a[3] + _b[3] + 1) >> 1;
-    d[4] = (_a[4] + _b[4] + 1) >> 1;
-    d[5] = (_a[5] + _b[5] + 1) >> 1;
-    d[6] = (_a[6] + _b[6] + 1) >> 1;
-    d[7] = (_a[7] + _b[7] + 1) >> 1;
+    d[0] = (uint8_t) ((_a[0] + _b[0] + 1) >> 1);
+    d[1] = (uint8_t) ((_a[1] + _b[1] + 1) >> 1);
+    d[2] = (uint8_t) ((_a[2] + _b[2] + 1) >> 1);
+    d[3] = (uint8_t) ((_a[3] + _b[3] + 1) >> 1);
+    d[4] = (uint8_t) ((_a[4] + _b[4] + 1) >> 1);
+    d[5] = (uint8_t) ((_a[5] + _b[5] + 1) >> 1);
+    d[6] = (uint8_t) ((_a[6] + _b[6] + 1) >> 1);
+    d[7] = (uint8_t) ((_a[7] + _b[7] + 1) >> 1);
 
     __m64 a = load_m64(_a);
     __m64 b = load_m64(_b);
@@ -2634,7 +2640,7 @@ result_t test_m_psadbw(const SSE2NEONTestImpl &impl, uint32_t iter)
     const uint8_t *_b = (const uint8_t *) impl.mTestIntPointer2;
     uint16_t d = 0;
     for (int i = 0; i < 8; i++) {
-        d += abs(_a[i] - _b[i]);
+        d += (uint16_t) abs(_a[i] - _b[i]);
     }
 
     __m64 a = load_m64(_a);
@@ -2714,7 +2720,7 @@ result_t test_mm_sad_pu8(const SSE2NEONTestImpl &impl, uint32_t iter)
     const uint8_t *_b = (const uint8_t *) impl.mTestIntPointer2;
     uint16_t d = 0;
     for (int i = 0; i < 8; i++) {
-        d += abs(_a[i] - _b[i]);
+        d += (uint16_t) abs(_a[i] - _b[i]);
    }
 
     __m64 a = load_m64(_a);
@@ -3394,47 +3400,15 @@ result_t test_mm_adds_epi16(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     const int16_t *_a = (const int16_t *) impl.mTestIntPointer1;
     const int16_t *_b = (const int16_t *) impl.mTestIntPointer2;
-    int32_t d[8];
-    d[0] = (int32_t) _a[0] + (int32_t) _b[0];
-    if (d[0] > 32767)
-        d[0] = 32767;
-    if (d[0] < -32768)
-        d[0] = -32768;
-    d[1] = (int32_t) _a[1] + (int32_t) _b[1];
-    if (d[1] > 32767)
-        d[1] = 32767;
-    if (d[1] < -32768)
-        d[1] = -32768;
-    d[2] = (int32_t) _a[2] + (int32_t) _b[2];
-    if (d[2] > 32767)
-        d[2] = 32767;
-    if (d[2] < -32768)
-        d[2] = -32768;
-    d[3] = (int32_t) _a[3] + (int32_t) _b[3];
-    if (d[3] > 32767)
-        d[3] = 32767;
-    if (d[3] < -32768)
-        d[3] = -32768;
-    d[4] = (int32_t) _a[4] + (int32_t) _b[4];
-    if (d[4] > 32767)
-        d[4] = 32767;
-    if (d[4] < -32768)
-        d[4] = -32768;
-    d[5] = (int32_t) _a[5] + (int32_t) _b[5];
-    if (d[5] > 32767)
-        d[5] = 32767;
-    if (d[5] < -32768)
-        d[5] = -32768;
-    d[6] = (int32_t) _a[6] + (int32_t) _b[6];
-    if (d[6] > 32767)
-        d[6] = 32767;
-    if (d[6] < -32768)
-        d[6] = -32768;
-    d[7] = (int32_t) _a[7] + (int32_t) _b[7];
-    if (d[7] > 32767)
-        d[7] = 32767;
-    if (d[7] < -32768)
-        d[7] = -32768;
+    int16_t d[8];
+    d[0] = saturate_i16((int32_t) _a[0] + (int32_t) _b[0]);
+    d[1] = saturate_i16((int32_t) _a[1] + (int32_t) _b[1]);
+    d[2] = saturate_i16((int32_t) _a[2] + (int32_t) _b[2]);
+    d[3] = saturate_i16((int32_t) _a[3] + (int32_t) _b[3]);
+    d[4] = saturate_i16((int32_t) _a[4] + (int32_t) _b[4]);
+    d[5] = saturate_i16((int32_t) _a[5] + (int32_t) _b[5]);
+    d[6] = saturate_i16((int32_t) _a[6] + (int32_t) _b[6]);
+    d[7] = saturate_i16((int32_t) _a[7] + (int32_t) _b[7]);
 
     __m128i a = load_m128i(_a);
     __m128i b = load_m128i(_b);
@@ -3466,19 +3440,18 @@ result_t test_mm_adds_epi8(const SSE2NEONTestImpl &impl, uint32_t iter)
 
 result_t test_mm_adds_epu16(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
-    uint32_t max = 0xFFFF;
     const uint16_t *_a = (const uint16_t *) impl.mTestIntPointer1;
     const uint16_t *_b = (const uint16_t *) impl.mTestIntPointer2;
 
     uint16_t d[8];
-    d[0] = (uint32_t) _a[0] + (uint32_t) _b[0] > max ? max : _a[0] + _b[0];
-    d[1] = (uint32_t) _a[1] + (uint32_t) _b[1] > max ? max : _a[1] + _b[1];
-    d[2] = (uint32_t) _a[2] + (uint32_t) _b[2] > max ? max : _a[2] + _b[2];
-    d[3] = (uint32_t) _a[3] + (uint32_t) _b[3] > max ? max : _a[3] + _b[3];
-    d[4] = (uint32_t) _a[4] + (uint32_t) _b[4] > max ? max : _a[4] + _b[4];
-    d[5] = (uint32_t) _a[5] + (uint32_t) _b[5] > max ? max : _a[5] + _b[5];
-    d[6] = (uint32_t) _a[6] + (uint32_t) _b[6] > max ? max : _a[6] + _b[6];
-    d[7] = (uint32_t) _a[7] + (uint32_t) _b[7] > max ? max : _a[7] + _b[7];
+    d[0] = saturate_u16((uint32_t) _a[0] + (uint32_t) _b[0]);
+    d[1] = saturate_u16((uint32_t) _a[1] + (uint32_t) _b[1]);
+    d[2] = saturate_u16((uint32_t) _a[2] + (uint32_t) _b[2]);
+    d[3] = saturate_u16((uint32_t) _a[3] + (uint32_t) _b[3]);
+    d[4] = saturate_u16((uint32_t) _a[4] + (uint32_t) _b[4]);
+    d[5] = saturate_u16((uint32_t) _a[5] + (uint32_t) _b[5]);
+    d[6] = saturate_u16((uint32_t) _a[6] + (uint32_t) _b[6]);
+    d[7] = saturate_u16((uint32_t) _a[7] + (uint32_t) _b[7]);
 
     __m128i a = load_m128i(_a);
     __m128i b = load_m128i(_b);
@@ -3631,17 +3604,17 @@ result_t test_mm_andnot_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
 
 result_t test_mm_avg_epu16(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
-    const int16_t *_a = (const int16_t *) impl.mTestIntPointer1;
-    const int16_t *_b = (const int16_t *) impl.mTestIntPointer2;
+    const uint16_t *_a = (const uint16_t *) impl.mTestIntPointer1;
+    const uint16_t *_b = (const uint16_t *) impl.mTestIntPointer2;
     uint16_t d[8];
-    d[0] = ((uint16_t) _a[0] + (uint16_t) _b[0] + 1) >> 1;
-    d[1] = ((uint16_t) _a[1] + (uint16_t) _b[1] + 1) >> 1;
-    d[2] = ((uint16_t) _a[2] + (uint16_t) _b[2] + 1) >> 1;
-    d[3] = ((uint16_t) _a[3] + (uint16_t) _b[3] + 1) >> 1;
-    d[4] = ((uint16_t) _a[4] + (uint16_t) _b[4] + 1) >> 1;
-    d[5] = ((uint16_t) _a[5] + (uint16_t) _b[5] + 1) >> 1;
-    d[6] = ((uint16_t) _a[6] + (uint16_t) _b[6] + 1) >> 1;
-    d[7] = ((uint16_t) _a[7] + (uint16_t) _b[7] + 1) >> 1;
+    d[0] = (uint16_t) ((_a[0] + _b[0] + 1) >> 1);
+    d[1] = (uint16_t) ((_a[1] + _b[1] + 1) >> 1);
+    d[2] = (uint16_t) ((_a[2] + _b[2] + 1) >> 1);
+    d[3] = (uint16_t) ((_a[3] + _b[3] + 1) >> 1);
+    d[4] = (uint16_t) ((_a[4] + _b[4] + 1) >> 1);
+    d[5] = (uint16_t) ((_a[5] + _b[5] + 1) >> 1);
+    d[6] = (uint16_t) ((_a[6] + _b[6] + 1) >> 1);
+    d[7] = (uint16_t) ((_a[7] + _b[7] + 1) >> 1);
     __m128i a = load_m128i(_a);
     __m128i b = load_m128i(_b);
     __m128i c = _mm_avg_epu16(a, b);
@@ -3650,25 +3623,25 @@ result_t test_mm_avg_epu16(const SSE2NEONTestImpl &impl, uint32_t iter)
 
 result_t test_mm_avg_epu8(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
-    const int8_t *_a = (const int8_t *) impl.mTestIntPointer1;
-    const int8_t *_b = (const int8_t *) impl.mTestIntPointer2;
+    const uint8_t *_a = (const uint8_t *) impl.mTestIntPointer1;
+    const uint8_t *_b = (const uint8_t *) impl.mTestIntPointer2;
     uint8_t d[16];
-    d[0] = ((uint8_t) _a[0] + (uint8_t) _b[0] + 1) >> 1;
-    d[1] = ((uint8_t) _a[1] + (uint8_t) _b[1] + 1) >> 1;
-    d[2] = ((uint8_t) _a[2] + (uint8_t) _b[2] + 1) >> 1;
-    d[3] = ((uint8_t) _a[3] + (uint8_t) _b[3] + 1) >> 1;
-    d[4] = ((uint8_t) _a[4] + (uint8_t) _b[4] + 1) >> 1;
-    d[5] = ((uint8_t) _a[5] + (uint8_t) _b[5] + 1) >> 1;
-    d[6] = ((uint8_t) _a[6] + (uint8_t) _b[6] + 1) >> 1;
-    d[7] = ((uint8_t) _a[7] + (uint8_t) _b[7] + 1) >> 1;
-    d[8] = ((uint8_t) _a[8] + (uint8_t) _b[8] + 1) >> 1;
-    d[9] = ((uint8_t) _a[9] + (uint8_t) _b[9] + 1) >> 1;
-    d[10] = ((uint8_t) _a[10] + (uint8_t) _b[10] + 1) >> 1;
-    d[11] = ((uint8_t) _a[11] + (uint8_t) _b[11] + 1) >> 1;
-    d[12] = ((uint8_t) _a[12] + (uint8_t) _b[12] + 1) >> 1;
-    d[13] = ((uint8_t) _a[13] + (uint8_t) _b[13] + 1) >> 1;
-    d[14] = ((uint8_t) _a[14] + (uint8_t) _b[14] + 1) >> 1;
-    d[15] = ((uint8_t) _a[15] + (uint8_t) _b[15] + 1) >> 1;
+    d[0] = (uint8_t) ((_a[0] + _b[0] + 1) >> 1);
+    d[1] = (uint8_t) ((_a[1] + _b[1] + 1) >> 1);
+    d[2] = (uint8_t) ((_a[2] + _b[2] + 1) >> 1);
+    d[3] = (uint8_t) ((_a[3] + _b[3] + 1) >> 1);
+    d[4] = (uint8_t) ((_a[4] + _b[4] + 1) >> 1);
+    d[5] = (uint8_t) ((_a[5] + _b[5] + 1) >> 1);
+    d[6] = (uint8_t) ((_a[6] + _b[6] + 1) >> 1);
+    d[7] = (uint8_t) ((_a[7] + _b[7] + 1) >> 1);
+    d[8] = (uint8_t) ((_a[8] + _b[8] + 1) >> 1);
+    d[9] = (uint8_t) ((_a[9] + _b[9] + 1) >> 1);
+    d[10] = (uint8_t) ((_a[10] + _b[10] + 1) >> 1);
+    d[11] = (uint8_t) ((_a[11] + _b[11] + 1) >> 1);
+    d[12] = (uint8_t) ((_a[12] + _b[12] + 1) >> 1);
+    d[13] = (uint8_t) ((_a[13] + _b[13] + 1) >> 1);
+    d[14] = (uint8_t) ((_a[14] + _b[14] + 1) >> 1);
+    d[15] = (uint8_t) ((_a[15] + _b[15] + 1) >> 1);
     __m128i a = load_m128i(_a);
     __m128i b = load_m128i(_b);
     __m128i c = _mm_avg_epu8(a, b);
@@ -5357,14 +5330,14 @@ result_t test_mm_movemask_epi8(const SSE2NEONTestImpl &impl, uint32_t iter)
 result_t test_mm_movemask_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     const double *_a = (const double *) impl.mTestFloatPointer1;
-    unsigned int _c = 0;
-    _c |= ((*(const uint64_t *) _a) >> 63) & 0x1;
+    int _c = 0;
+    _c |= (int) (((*(const uint64_t *) _a) >> 63) & 0x1);
     _c |= (((*(const uint64_t *) (_a + 1)) >> 62) & 0x2);
 
     __m128d a = load_m128d(_a);
     int c = _mm_movemask_pd(a);
 
-    ASSERT_RETURN((unsigned int) c == _c);
+    ASSERT_RETURN(c == _c);
 
     return TEST_SUCCESS;
 }
@@ -5644,10 +5617,10 @@ result_t test_mm_sad_epu8(const SSE2NEONTestImpl &impl, uint32_t iter)
     uint16_t d0 = 0;
     uint16_t d1 = 0;
     for (int i = 0; i < 8; i++) {
-        d0 += abs(_a[i] - _b[i]);
+        d0 += (uint16_t) abs(_a[i] - _b[i]);
     }
     for (int i = 8; i < 16; i++) {
-        d1 += abs(_a[i] - _b[i]);
+        d1 += (uint16_t) abs(_a[i] - _b[i]);
     }
 
     const __m128i a = load_m128i(_a);
@@ -5984,21 +5957,21 @@ result_t test_mm_sll_epi16(const SSE2NEONTestImpl &impl, uint32_t iter)
     const int16_t *_a = (const int16_t *) impl.mTestIntPointer1;
     __m128i a, b, c;
     uint8_t idx;
-#define TEST_IMPL(IDX)                          \
-    uint16_t d##IDX[8];                         \
-    idx = IDX;                                  \
-    d##IDX[0] = (idx > 15) ? 0 : _a[0] << idx;  \
-    d##IDX[1] = (idx > 15) ? 0 : _a[1] << idx;  \
-    d##IDX[2] = (idx > 15) ? 0 : _a[2] << idx;  \
-    d##IDX[3] = (idx > 15) ? 0 : _a[3] << idx;  \
-    d##IDX[4] = (idx > 15) ? 0 : _a[4] << idx;  \
-    d##IDX[5] = (idx > 15) ? 0 : _a[5] << idx;  \
-    d##IDX[6] = (idx > 15) ? 0 : _a[6] << idx;  \
-    d##IDX[7] = (idx > 15) ? 0 : _a[7] << idx;  \
-                                                \
-    a = load_m128i(_a);                         \
-    b = _mm_set1_epi64x(IDX);                   \
-    c = _mm_sll_epi16(a, b);                    \
+#define TEST_IMPL(IDX)                                       \
+    uint16_t d##IDX[8];                                      \
+    idx = IDX;                                               \
+    d##IDX[0] = (idx > 15) ? 0 : (uint16_t) (_a[0] << idx);  \
+    d##IDX[1] = (idx > 15) ? 0 : (uint16_t) (_a[1] << idx);  \
+    d##IDX[2] = (idx > 15) ? 0 : (uint16_t) (_a[2] << idx);  \
+    d##IDX[3] = (idx > 15) ? 0 : (uint16_t) (_a[3] << idx);  \
+    d##IDX[4] = (idx > 15) ? 0 : (uint16_t) (_a[4] << idx);  \
+    d##IDX[5] = (idx > 15) ? 0 : (uint16_t) (_a[5] << idx);  \
+    d##IDX[6] = (idx > 15) ? 0 : (uint16_t) (_a[6] << idx);  \
+    d##IDX[7] = (idx > 15) ? 0 : (uint16_t) (_a[7] << idx);  \
+                                                             \
+    a = load_m128i(_a);                                      \
+    b = _mm_set1_epi64x(IDX);                                \
+    c = _mm_sll_epi16(a, b);                                 \
     CHECK_RESULT(VALIDATE_INT16_M128(c, d##IDX))
 
     IMM_64_ITER
@@ -6312,18 +6285,18 @@ result_t test_mm_srl_epi64(const SSE2NEONTestImpl &impl, uint32_t iter)
 
 result_t test_mm_srli_epi16(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
-    const int16_t *_a = (const int16_t *) impl.mTestIntPointer1;
+    const uint16_t *_a = (const uint16_t *) impl.mTestIntPointer1;
     const int count = (int) (iter % 18 - 1);  // range: -1 ~ 16
 
     int16_t d[8];
-    d[0] = count & (~15) ? 0 : (uint16_t) (_a[0]) >> count;
-    d[1] = count & (~15) ? 0 : (uint16_t) (_a[1]) >> count;
-    d[2] = count & (~15) ? 0 : (uint16_t) (_a[2]) >> count;
-    d[3] = count & (~15) ? 0 : (uint16_t) (_a[3]) >> count;
-    d[4] = count & (~15) ? 0 : (uint16_t) (_a[4]) >> count;
-    d[5] = count & (~15) ? 0 : (uint16_t) (_a[5]) >> count;
-    d[6] = count & (~15) ? 0 : (uint16_t) (_a[6]) >> count;
-    d[7] = count & (~15) ? 0 : (uint16_t) (_a[7]) >> count;
+    d[0] = count & (~15) ? 0 : (int16_t) (_a[0] >> count);
+    d[1] = count & (~15) ? 0 : (int16_t) (_a[1] >> count);
+    d[2] = count & (~15) ? 0 : (int16_t) (_a[2] >> count);
+    d[3] = count & (~15) ? 0 : (int16_t) (_a[3] >> count);
+    d[4] = count & (~15) ? 0 : (int16_t) (_a[4] >> count);
+    d[5] = count & (~15) ? 0 : (int16_t) (_a[5] >> count);
+    d[6] = count & (~15) ? 0 : (int16_t) (_a[6] >> count);
+    d[7] = count & (~15) ? 0 : (int16_t) (_a[7] >> count);
 
     __m128i a = load_m128i(_a);
     __m128i c = _mm_srli_epi16(a, count);
@@ -6333,14 +6306,14 @@ result_t test_mm_srli_epi16(const SSE2NEONTestImpl &impl, uint32_t iter)
 
 result_t test_mm_srli_epi32(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
-    const int32_t *_a = (const int32_t *) impl.mTestIntPointer1;
+    const uint32_t *_a = (const uint32_t *) impl.mTestIntPointer1;
     const int count = (int) (iter % 34 - 1);  // range: -1 ~ 32
 
     int32_t d[4];
-    d[0] = count & (~31) ? 0 : (uint32_t) (_a[0]) >> count;
-    d[1] = count & (~31) ? 0 : (uint32_t) (_a[1]) >> count;
-    d[2] = count & (~31) ? 0 : (uint32_t) (_a[2]) >> count;
-    d[3] = count & (~31) ? 0 : (uint32_t) (_a[3]) >> count;
+    d[0] = count & (~31) ? 0 : (int32_t) (_a[0] >> count);
+    d[1] = count & (~31) ? 0 : (int32_t) (_a[1] >> count);
+    d[2] = count & (~31) ? 0 : (int32_t) (_a[2] >> count);
+    d[3] = count & (~31) ? 0 : (int32_t) (_a[3] >> count);
 
     __m128i a = load_m128i(_a);
     __m128i c = _mm_srli_epi32(a, count);
@@ -6701,20 +6674,12 @@ result_t test_mm_sub_si64(const SSE2NEONTestImpl &impl, uint32_t iter)
 
 result_t test_mm_subs_epi16(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
-    int32_t max = 32767;
-    int32_t min = -32768;
     const int16_t *_a = (const int16_t *) impl.mTestIntPointer1;
     const int16_t *_b = (const int16_t *) impl.mTestIntPointer2;
 
     int16_t d[8];
     for (int i = 0; i < 8; i++) {
-        int32_t res = (int32_t) _a[i] - (int32_t) _b[i];
-        if (res > max)
-            d[i] = max;
-        else if (res < min)
-            d[i] = min;
-        else
-            d[i] = (int16_t) res;
+        d[i] = saturate_i16((int32_t) _a[i] - (int32_t) _b[i]);
     }
 
     __m128i a = load_m128i(_a);
@@ -7260,7 +7225,7 @@ result_t test_mm_abs_epi16(const SSE2NEONTestImpl &impl, uint32_t iter)
     __m128i a = load_m128i(_a);
     __m128i c = _mm_abs_epi16(a);
 
-    uint32_t d[8];
+    uint16_t d[8];
     d[0] = (_a[0] < 0) ? -_a[0] : _a[0];
     d[1] = (_a[1] < 0) ? -_a[1] : _a[1];
     d[2] = (_a[2] < 0) ? -_a[2] : _a[2];
@@ -7294,7 +7259,7 @@ result_t test_mm_abs_epi8(const SSE2NEONTestImpl &impl, uint32_t iter)
     __m128i a = load_m128i(_a);
     __m128i c = _mm_abs_epi8(a);
 
-    uint32_t d[16];
+    uint8_t d[16];
     for (int i = 0; i < 16; i++) {
         d[i] = (_a[i] < 0) ? -_a[i] : _a[i];
     }
@@ -7308,7 +7273,7 @@ result_t test_mm_abs_pi16(const SSE2NEONTestImpl &impl, uint32_t iter)
     __m64 a = load_m64(_a);
     __m64 c = _mm_abs_pi16(a);
 
-    uint32_t d[4];
+    uint16_t d[4];
     d[0] = (_a[0] < 0) ? -_a[0] : _a[0];
     d[1] = (_a[1] < 0) ? -_a[1] : _a[1];
     d[2] = (_a[2] < 0) ? -_a[2] : _a[2];
@@ -7336,7 +7301,7 @@ result_t test_mm_abs_pi8(const SSE2NEONTestImpl &impl, uint32_t iter)
     __m64 a = load_m64(_a);
     __m64 c = _mm_abs_pi8(a);
 
-    uint32_t d[8];
+    uint8_t d[8];
     d[0] = (_a[0] < 0) ? -_a[0] : _a[0];
     d[1] = (_a[1] < 0) ? -_a[1] : _a[1];
     d[2] = (_a[2] < 0) ? -_a[2] : _a[2];
@@ -7641,23 +7606,14 @@ result_t test_mm_hsubs_epi16(const SSE2NEONTestImpl &impl, uint32_t iter)
     const int16_t *_b = (const int16_t *) impl.mTestIntPointer1;
 
     int16_t d16[8];
-    int32_t d32[8];
-    d32[0] = (int32_t) _a[0] - (int32_t) _a[1];
-    d32[1] = (int32_t) _a[2] - (int32_t) _a[3];
-    d32[2] = (int32_t) _a[4] - (int32_t) _a[5];
-    d32[3] = (int32_t) _a[6] - (int32_t) _a[7];
-    d32[4] = (int32_t) _b[0] - (int32_t) _b[1];
-    d32[5] = (int32_t) _b[2] - (int32_t) _b[3];
-    d32[6] = (int32_t) _b[4] - (int32_t) _b[5];
-    d32[7] = (int32_t) _b[6] - (int32_t) _b[7];
-    for (int i = 0; i < 8; i++) {
-        if (d32[i] > (int32_t) INT16_MAX)
-            d16[i] = INT16_MAX;
-        else if (d32[i] < (int32_t) INT16_MIN)
-            d16[i] = INT16_MIN;
-        else
-            d16[i] = (int16_t) d32[i];
-    }
+    d16[0] = saturate_i16((int32_t) _a[0] - (int32_t) _a[1]);
+    d16[1] = saturate_i16((int32_t) _a[2] - (int32_t) _a[3]);
+    d16[2] = saturate_i16((int32_t) _a[4] - (int32_t) _a[5]);
+    d16[3] = saturate_i16((int32_t) _a[6] - (int32_t) _a[7]);
+    d16[4] = saturate_i16((int32_t) _b[0] - (int32_t) _b[1]);
+    d16[5] = saturate_i16((int32_t) _b[2] - (int32_t) _b[3]);
+    d16[6] = saturate_i16((int32_t) _b[4] - (int32_t) _b[5]);
+    d16[7] = saturate_i16((int32_t) _b[6] - (int32_t) _b[7]);
 
     __m128i a = load_m128i(_a);
     __m128i b = load_m128i(_b);
@@ -7671,19 +7627,11 @@ result_t test_mm_hsubs_pi16(const SSE2NEONTestImpl &impl, uint32_t iter)
     const int16_t *_a = (const int16_t *) impl.mTestIntPointer1;
     const int16_t *_b = (const int16_t *) impl.mTestIntPointer1;
 
-    int32_t _d[4];
-    _d[0] = (int32_t) _a[0] - (int32_t) _a[1];
-    _d[1] = (int32_t) _a[2] - (int32_t) _a[3];
-    _d[2] = (int32_t) _b[0] - (int32_t) _b[1];
-    _d[3] = (int32_t) _b[2] - (int32_t) _b[3];
-
-    for (int i = 0; i < 4; i++) {
-        if (_d[i] > (int32_t) INT16_MAX) {
-            _d[i] = INT16_MAX;
-        } else if (_d[i] < (int32_t) INT16_MIN) {
-            _d[i] = INT16_MIN;
-        }
-    }
+    int16_t _d[4];
+    _d[0] = saturate_i16((int32_t) _a[0] - (int32_t) _a[1]);
+    _d[1] = saturate_i16((int32_t) _a[2] - (int32_t) _a[3]);
+    _d[2] = saturate_i16((int32_t) _b[0] - (int32_t) _b[1]);
+    _d[3] = saturate_i16((int32_t) _b[2] - (int32_t) _b[3]);
 
     __m64 a = load_m64(_a);
     __m64 b = load_m64(_b);
@@ -7714,14 +7662,14 @@ result_t test_mm_maddubs_epi16(const SSE2NEONTestImpl &impl, uint32_t iter)
     int32_t d15 = (int32_t) (_a[15] * _b[15]);
 
     int16_t e[8];
-    e[0] = saturate_16(d0 + d1);
-    e[1] = saturate_16(d2 + d3);
-    e[2] = saturate_16(d4 + d5);
-    e[3] = saturate_16(d6 + d7);
-    e[4] = saturate_16(d8 + d9);
-    e[5] = saturate_16(d10 + d11);
-    e[6] = saturate_16(d12 + d13);
-    e[7] = saturate_16(d14 + d15);
+    e[0] = saturate_i16(d0 + d1);
+    e[1] = saturate_i16(d2 + d3);
+    e[2] = saturate_i16(d4 + d5);
+    e[3] = saturate_i16(d6 + d7);
+    e[4] = saturate_i16(d8 + d9);
+    e[5] = saturate_i16(d10 + d11);
+    e[6] = saturate_i16(d12 + d13);
+    e[7] = saturate_i16(d14 + d15);
 
     __m128i a = load_m128i(_a);
     __m128i b = load_m128i(_b);
@@ -7743,10 +7691,10 @@ result_t test_mm_maddubs_pi16(const SSE2NEONTestImpl &impl, uint32_t iter)
     int16_t d7 = (int16_t) (_a[7] * _b[7]);
 
     int16_t e[4];
-    e[0] = saturate_16(d0 + d1);
-    e[1] = saturate_16(d2 + d3);
-    e[2] = saturate_16(d4 + d5);
-    e[3] = saturate_16(d6 + d7);
+    e[0] = saturate_i16(d0 + d1);
+    e[1] = saturate_i16(d2 + d3);
+    e[2] = saturate_i16(d4 + d5);
+    e[3] = saturate_i16(d6 + d7);
 
     __m64 a = load_m64(_a);
     __m64 b = load_m64(_b);
@@ -7762,10 +7710,11 @@ result_t test_mm_mulhrs_epi16(const SSE2NEONTestImpl &impl, uint32_t iter)
     __m128i a = load_m128i(_a);
     __m128i b = load_m128i(_b);
 
-    int32_t _c[8];
+    int16_t _c[8];
     for (int i = 0; i < 8; i++) {
-        _c[i] =
-            (((((int32_t) _a[i] * (int32_t) _b[i]) >> 14) + 1) & 0x1FFFE) >> 1;
+        _c[i] = (int16_t) ((((((int32_t) _a[i] * (int32_t) _b[i]) >> 14) + 1) &
+                            0x1FFFE) >>
+                           1);
     }
 
     __m128i c = _mm_mulhrs_epi16(a, b);
@@ -7779,10 +7728,11 @@ result_t test_mm_mulhrs_pi16(const SSE2NEONTestImpl &impl, uint32_t iter)
     __m64 a = load_m64(_a);
     __m64 b = load_m64(_b);
 
-    int32_t _c[4];
+    int16_t _c[4];
     for (int i = 0; i < 4; i++) {
-        _c[i] =
-            (((((int32_t) _a[i] * (int32_t) _b[i]) >> 14) + 1) & 0x1FFFE) >> 1;
+        _c[i] = (int16_t) ((((((int32_t) _a[i] * (int32_t) _b[i]) >> 14) + 1) &
+                            0x1FFFE) >>
+                           1);
     }
 
     __m64 c = _mm_mulhrs_pi16(a, b);
@@ -8870,18 +8820,18 @@ result_t test_mm_mpsadbw_epu8(const SSE2NEONTestImpl &impl, uint32_t iter)
     __m128i a = load_m128i(_a);
     __m128i b = load_m128i(_b);
     __m128i c;
-#define TEST_IMPL(IDX)                                                     \
-    uint8_t a_offset##IDX = ((IDX >> 2) & 0x1) * 4;                        \
-    uint8_t b_offset##IDX = (IDX & 0x3) * 4;                               \
-                                                                           \
-    uint16_t d##IDX[8] = {};                                               \
-    for (int i = 0; i < 8; i++) {                                          \
-        for (int j = 0; j < 4; j++) {                                      \
-            d##IDX[i] +=                                                   \
-                abs(_a[(a_offset##IDX + i) + j] - _b[b_offset##IDX + j]);  \
-        }                                                                  \
-    }                                                                      \
-    c = _mm_mpsadbw_epu8(a, b, IDX);                                       \
+#define TEST_IMPL(IDX)                                                    \
+    uint8_t a_offset##IDX = ((IDX >> 2) & 0x1) * 4;                       \
+    uint8_t b_offset##IDX = (IDX & 0x3) * 4;                              \
+                                                                          \
+    uint16_t d##IDX[8] = {};                                              \
+    for (int i = 0; i < 8; i++) {                                         \
+        for (int j = 0; j < 4; j++) {                                     \
+            d##IDX[i] += (uint16_t) abs(_a[(a_offset##IDX + i) + j] -     \
+                                        _b[b_offset##IDX + j]);           \
+        }                                                                 \
+    }                                                                     \
+    c = _mm_mpsadbw_epu8(a, b, IDX);                                      \
     CHECK_RESULT(VALIDATE_UINT16_M128(c, d##IDX));
 
     IMM_8_ITER
@@ -11587,7 +11537,7 @@ result_t test_mm_cmpistrz(const SSE2NEONTestImpl &impl, uint32_t iter)
 result_t test_mm_crc32_u16(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     uint32_t crc = *(const uint32_t *) impl.mTestIntPointer1;
-    uint16_t v = iter;
+    uint16_t v = (uint16_t) iter;
     uint32_t result = _mm_crc32_u16(crc, v);
     ASSERT_RETURN(result == canonical_crc32_u16(crc, v));
     return TEST_SUCCESS;
@@ -11614,7 +11564,7 @@ result_t test_mm_crc32_u64(const SSE2NEONTestImpl &impl, uint32_t iter)
 result_t test_mm_crc32_u8(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     uint32_t crc = *(const uint32_t *) impl.mTestIntPointer1;
-    uint8_t v = iter;
+    uint8_t v = (uint8_t) iter;
     uint32_t result = _mm_crc32_u8(crc, v);
     ASSERT_RETURN(result == canonical_crc32_u8(crc, v));
     return TEST_SUCCESS;
@@ -11697,14 +11647,14 @@ result_t test_mm_aesimc_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
         g = v[i][2];
         h = v[i][3];
 
-        v[i][0] = MULTIPLY(e, 0x0e) ^ MULTIPLY(f, 0x0b) ^ MULTIPLY(g, 0x0d) ^
-                  MULTIPLY(h, 0x09);
-        v[i][1] = MULTIPLY(e, 0x09) ^ MULTIPLY(f, 0x0e) ^ MULTIPLY(g, 0x0b) ^
-                  MULTIPLY(h, 0x0d);
-        v[i][2] = MULTIPLY(e, 0x0d) ^ MULTIPLY(f, 0x09) ^ MULTIPLY(g, 0x0e) ^
-                  MULTIPLY(h, 0x0b);
-        v[i][3] = MULTIPLY(e, 0x0b) ^ MULTIPLY(f, 0x0d) ^ MULTIPLY(g, 0x09) ^
-                  MULTIPLY(h, 0x0e);
+        v[i][0] = (uint8_t) (MULTIPLY(e, 0x0e) ^ MULTIPLY(f, 0x0b) ^
+                             MULTIPLY(g, 0x0d) ^ MULTIPLY(h, 0x09));
+        v[i][1] = (uint8_t) (MULTIPLY(e, 0x09) ^ MULTIPLY(f, 0x0e) ^
+                             MULTIPLY(g, 0x0b) ^ MULTIPLY(h, 0x0d));
+        v[i][2] = (uint8_t) (MULTIPLY(e, 0x0d) ^ MULTIPLY(f, 0x09) ^
+                             MULTIPLY(g, 0x0e) ^ MULTIPLY(h, 0x0b));
+        v[i][3] = (uint8_t) (MULTIPLY(e, 0x0b) ^ MULTIPLY(f, 0x0d) ^
+                             MULTIPLY(g, 0x09) ^ MULTIPLY(h, 0x0e));
     }
 
     __m128i result_reference = _mm_loadu_si128((const __m128i *) v);
diff --git a/tests/main.cpp b/tests/main.cpp
index 953d4c36..844353e0 100644
--- a/tests/main.cpp
+++ b/tests/main.cpp
@@ -32,7 +32,8 @@ int main(int /*argc*/, const char ** /*argv*/)
            "Ignored: %d\n"
            "Coverage rate: %.2f%%\n",
            passCount, failedCount, ignoreCount,
-           (float) passCount / (passCount + failedCount + ignoreCount) * 100);
+           (float) passCount / (float) (passCount + failedCount + ignoreCount) *
+               100);
 
     return failedCount ? -1 : 0;
 }