@@ -12,54 +12,45 @@ namespace cp_algo {
     using u32x4 = simd<uint32_t, 4>;
     using dx4 = simd<double, 4>;
 
-    dx4 abs(dx4 a) {
-#ifdef __AVX2__
-        return _mm256_and_pd(a, dx4{} + 1/0.);
-#else
+    [[gnu::always_inline]] inline dx4 abs(dx4 a) {
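+        // per-lane select: the vector comparison a < 0 yields a lane mask,
+        // so each lane independently becomes -a or a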
         return a < 0 ? -a : a;
-#endif
     }
 
-    i64x4 lround(dx4 x) {
+    [[gnu::always_inline]] inline i64x4 lround(dx4 x) {
         // https://stackoverflow.com/a/77376595
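+        // magic = 1.5 * 2^52, so x + magic lands in [2^52, 2^53), where the
+        // ulp is exactly 1: round(x) ends up in the low mantissa bits, and
+        // the i64 bit-cast subtraction extracts it; valid for |x| < 2^51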
         static constexpr dx4 magic = dx4() + double(3ULL << 51);
         return i64x4(x + magic) - i64x4(magic);
     }
 
-    dx4 round(dx4 a) {
+    [[gnu::always_inline]] inline dx4 round(dx4 a) {
 #ifdef __AVX2__
         return _mm256_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 #else
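+        // under the default FP environment, lround above also rounds halves
+        // to even, so this fallback matches _MM_FROUND_TO_NEAREST_INT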
         return __builtin_convertvector(lround(a), dx4);
 #endif
     }
 
-    u64x4 montgomery_reduce(u64x4 x, u64x4 mod, u64x4 imod) {
-#ifndef __AVX2__
-        auto x_ninv = _mm256_mul_epu32(__m256i(x), __m256i(imod));
-        auto x_res = _mm256_add_epi64(__m256i(x), _mm256_mul_epu32(x_ninv, __m256i(mod)));
-        return u64x4(_mm256_bsrli_epi128(x_res, 4));
-#else
-
+    [[gnu::always_inline]] inline u64x4 montgomery_reduce(u64x4 x, u64x4 mod, u64x4 imod) {
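+        // Montgomery reduction by R = 2^32 in each 64-bit lane: assuming
+        // imod = -mod^{-1} (mod 2^32), x + (x * imod mod 2^32) * mod is
+        // divisible by 2^32, and for x < mod * 2^32 the shifted result is
+        // congruent to x / 2^32 (mod mod) and lies in [0, 2 * mod)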
         auto x_ninv = u64x4(u32x8(x) * u32x8(imod));
-        return (x + x_ninv * mod) >> 32;
+#ifdef __AVX2__
+        auto x_res = __m256i(x) + _mm256_mul_epu32(__m256i(x_ninv), __m256i(mod));
+#else
+        auto x_res = x + x_ninv * mod;
 #endif
+        return u64x4(x_res) >> 32;
     }
 
-    u64x4 montgomery_mul(u64x4 x, u64x4 y, u64x4 mod, u64x4 imod) {
+    [[gnu::always_inline]] inline u64x4 montgomery_mul(u64x4 x, u64x4 y, u64x4 mod, u64x4 imod) {
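+        // x and y are assumed to fit in the low 32 bits of each lane, so the
+        // plain 64-bit product equals the widening _mm256_mul_epu32 product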
 #ifdef __AVX2__
         return montgomery_reduce(u64x4(_mm256_mul_epu32(__m256i(x), __m256i(y))), mod, imod);
 #else
         return montgomery_reduce(x * y, mod, imod);
 #endif
     }
 
-    dx4 rotate_right(dx4 x) {
-#ifdef __AVX2__
-        return _mm256_permute4x64_pd(x, _MM_SHUFFLE(2, 1, 0, 3));
-#else
-        return __builtin_shufflevector(x, x, 3, 0, 1, 2);
-#endif
+    [[gnu::always_inline]] inline dx4 rotate_right(dx4 x) {
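+        // cyclic lane rotation {a, b, c, d} -> {d, a, b, c}; GCC's generic
+        // __builtin_shuffle leaves instruction selection (e.g. vpermpd) to
+        // the compiler instead of hand-written intrinsics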
+        static constexpr u64x4 shuffler = {3, 0, 1, 2};
+        return __builtin_shuffle(x, shuffler);
     }
 }
 #endif // CP_ALGO_UTIL_SIMD_HPP