Skip to content

Commit 3bd4587

Browse files
committed
simplify simd
1 parent 32e906d commit 3bd4587

File tree

1 file changed

+13
-22
lines changed

1 file changed

+13
-22
lines changed

cp-algo/util/simd.hpp

+13-22
Original file line numberDiff line numberDiff line change
@@ -12,54 +12,45 @@ namespace cp_algo {
1212
// Shorthand for simd instantiated with uint32_t and 4.
// (simd itself is declared outside this view; presumably the second argument is the lane count — TODO confirm.)
using u32x4 = simd<uint32_t, 4>;
1313
// Shorthand for simd instantiated with double and 4; this is the vector type
// taken and/or returned by abs, lround, round and rotate_right in this header.
using dx4 = simd<double, 4>;
1414

15-
dx4 abs(dx4 a) {
16-
#ifdef __AVX2__
17-
return _mm256_and_pd(a, dx4{} + 1/0.);
18-
#else
15+
// Absolute value of a dx4, written with a plain comparison and negation
// rather than a target-specific intrinsic. Forced inline.
// Assumes dx4 is a GNU vector type, so the comparison and the ?: below are
// applied lane by lane — TODO confirm against the simd declaration.
[[gnu::always_inline]] inline dx4 abs(dx4 a) {
1916
// Yields -a where a compares below zero and a itself everywhere else.
// NOTE(review): a lane holding -0.0 does not compare below zero, so it is
// returned with its sign bit still set.
return a < 0 ? -a : a;
20-
#endif
2117
}
2218

23-
i64x4 lround(dx4 x) {
19+
// Converts a dx4 to an i64x4 of rounded integers without a dedicated
// conversion instruction, using the "magic constant" technique from the
// Stack Overflow answer linked below. Forced inline.
[[gnu::always_inline]] inline i64x4 lround(dx4 x) {
2420
// https://stackoverflow.com/a/77376595
2521
// Every lane of magic holds 3 * 2^51 as a double.
static constexpr dx4 magic = dx4() + double(3ULL << 51);
2622
// The cast is presumably a bit-for-bit reinterpretation (GNU vector cast),
// not a numeric conversion: adding magic pushes the rounded integer into
// the low mantissa bits, and subtracting magic's own bit pattern leaves it.
// NOTE(review): presumably only exact while |x| stays well below 2^51 and
// follows the current floating-point rounding mode — confirm with the link.
return i64x4(x + magic) - i64x4(magic);
2723
}
2824

29-
dx4 round(dx4 a) {
25+
// Rounds a dx4 to integral values and returns them as doubles. Forced inline.
// Two implementations are selected at compile time by the __AVX2__ macro.
[[gnu::always_inline]] inline dx4 round(dx4 a) {
3026
#ifdef __AVX2__
3127
// Hardware rounding: round-to-nearest mode with floating-point exceptions
// suppressed, as requested by the two _MM_FROUND_* flags.
return _mm256_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
3228
#else
3329
// Fallback: round to 64-bit integers via lround, then convert those
// integers back to doubles element by element.
return __builtin_convertvector(lround(a), dx4);
3430
#endif
3531
}
3632

37-
u64x4 montgomery_reduce(u64x4 x, u64x4 mod, u64x4 imod) {
38-
#ifndef __AVX2__
39-
auto x_ninv = _mm256_mul_epu32(__m256i(x), __m256i(imod));
40-
auto x_res = _mm256_add_epi64(__m256i(x), _mm256_mul_epu32(x_ninv, __m256i(mod)));
41-
return u64x4(_mm256_bsrli_epi128(x_res, 4));
42-
#else
43-
33+
// Montgomery-style reduction step on a u64x4: computes
// (x + ((x * imod) restricted to 32-bit pieces) * mod) >> 32. Forced inline.
// Presumably imod holds -mod^{-1} modulo 2^32 so that the low 32 bits of the
// sum cancel before the shift — TODO confirm against the callers.
// No final conditional subtraction of mod is performed here.
[[gnu::always_inline]] inline u64x4 montgomery_reduce(u64x4 x, u64x4 mod, u64x4 imod) {
4434
// View x and imod as u32x8 and multiply element-wise, so each product is
// truncated to 32 bits; then view the result as u64x4 again.
auto x_ninv = u64x4(u32x8(x) * u32x8(imod));
45-
return (x + x_ninv * mod) >> 32;
35+
#ifdef __AVX2__
36+
// _mm256_mul_epu32 multiplies only the low 32 bits of each 64-bit lane and
// produces full 64-bit products.
auto x_res = __m256i(x) + _mm256_mul_epu32(__m256i(x_ninv), __m256i(mod));
37+
#else
38+
// Generic path: ordinary 64-bit multiplication of x_ninv by mod.
// NOTE(review): this matches the AVX2 path only when the upper 32 bits of
// every imod and mod lane are zero — confirm callers guarantee that.
auto x_res = x + x_ninv * mod;
4639
#endif
40+
// Keep the upper 32 bits of each 64-bit lane of the sum.
return u64x4(x_res) >> 32;
4741
}
4842

49-
u64x4 montgomery_mul(u64x4 x, u64x4 y, u64x4 mod, u64x4 imod) {
43+
// Multiplies x by y and passes the product through montgomery_reduce with
// the given mod and imod. Forced inline.
[[gnu::always_inline]] inline u64x4 montgomery_mul(u64x4 x, u64x4 y, u64x4 mod, u64x4 imod) {
5044
#ifdef __AVX2__
5145
// _mm256_mul_epu32 uses only the low 32 bits of each 64-bit lane of x and y
// and yields the full 64-bit products.
return montgomery_reduce(u64x4(_mm256_mul_epu32(__m256i(x), __m256i(y))), mod, imod);
5246
#else
5347
// Generic path: plain 64-bit multiplication of x by y.
// NOTE(review): equals the AVX2 path only if the upper 32 bits of x and y
// lanes are zero — presumably operands are already below 2^32; confirm.
return montgomery_reduce(x * y, mod, imod);
5448
#endif
5549
}
5650

57-
dx4 rotate_right(dx4 x) {
58-
#ifdef __AVX2__
59-
return _mm256_permute4x64_pd(x, _MM_SHUFFLE(2, 1, 0, 3));
60-
#else
61-
return __builtin_shufflevector(x, x, 3, 0, 1, 2);
62-
#endif
51+
// Rotates the elements of x by one position towards higher indices:
// the result is {x[3], x[0], x[1], x[2]}. Forced inline.
[[gnu::always_inline]] inline dx4 rotate_right(dx4 x) {
52+
// Element i of the result is taken from x[shuffler[i]].
static constexpr u64x4 shuffler = {3, 0, 1, 2};
53+
// NOTE(review): __builtin_shuffle is a GCC builtin; Clang offers
// __builtin_shufflevector instead — confirm the set of supported compilers.
return __builtin_shuffle(x, shuffler);
6354
}
6455
}
6556
#endif // CP_ALGO_UTIL_SIMD_HPP

0 commit comments

Comments
 (0)