From 862399c598c3410ead7cfa28f1e1107e8aeb3901 Mon Sep 17 00:00:00 2001 From: Hiroshi Hatake Date: Mon, 16 Dec 2024 20:11:18 +0900 Subject: [PATCH 1/7] simd: Implement RVV (RISC-V "Vector") intrinsics Signed-off-by: Hiroshi Hatake --- include/fluent-bit/flb_simd.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/include/fluent-bit/flb_simd.h b/include/fluent-bit/flb_simd.h index 9ae53b115fe..d7f7950dada 100644 --- a/include/fluent-bit/flb_simd.h +++ b/include/fluent-bit/flb_simd.h @@ -59,6 +59,13 @@ typedef __m128i flb_vector32; typedef uint8x16_t flb_vector8; typedef uint32x4_t flb_vector32; +#elif defined(__riscv) && (__riscv_v_intrinsic >= 10000) +#include +#define FLB_SIMD_RVV +typedef vuint8m1_t flb_vector8; +typedef vuint32m1_t flb_vector32; +#define RVV_VEC_INST_LEN 16 + #else /* * If no SIMD instructions are available, we can in some cases emulate vector @@ -94,6 +101,8 @@ static inline void flb_vector8_load(flb_vector8 *v, const uint8_t *s) *v = _mm_loadu_si128((const __m128i *) s); #elif defined(FLB_SIMD_NEON) *v = vld1q_u8(s); +#elif defined(FLB_SIMD_RVV) + *v = __riscv_vle8_v_u8m1(s, 16); #else memset(v, 0, sizeof(flb_vector8)); #endif @@ -129,6 +138,8 @@ static inline flb_vector8 flb_vector8_ssub(const flb_vector8 v1, const flb_vecto return _mm_subs_epu8(v1, v2); #elif defined(FLB_SIMD_NEON) return vqsubq_u8(v1, v2); +#elif defined(FLB_SIMD_RVV) + return __riscv_vssubu_vv_u8m1(v1, v2, 16); #endif } #endif /* ! FLB_SIMD_NONE */ @@ -144,6 +155,9 @@ static inline flb_vector8 flb_vector8_eq(const flb_vector8 v1, const flb_vector8 return _mm_cmpeq_epi8(v1, v2); #elif defined(FLB_SIMD_NEON) return vceqq_u8(v1, v2); +#elif defined(FLB_SIMD_RVV) + vbool8_t ret = __riscv_vmseq_vv_u8m1_b8(v1, v2, 16); + return __riscv_vmerge_vvm_u8m1(__riscv_vmv_v_x_u8m1(0, 16), __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), ret, 16); #endif } #endif /* ! FLB_SIMD_NONE */ @@ -155,6 +169,9 @@ static inline flb_vector32 flb_vector32_eq(const flb_vector32 v1, const flb_vect return _mm_cmpeq_epi32(v1, v2); #elif defined(FLB_SIMD_NEON) return vceqq_u32(v1, v2); +#elif defined(FLB_SIMD_RVV) + vbool32_t ret = __riscv_vmseq_vv_u32m1_b32(v1, v2, 4); + return __riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), ret, 4); #endif } #endif /* ! FLB_SIMD_NONE */ @@ -168,6 +185,8 @@ static inline flb_vector8 flb_vector8_broadcast(const uint8_t c) return _mm_set1_epi8(c); #elif defined(FLB_SIMD_NEON) return vdupq_n_u8(c); +#elif defined(FLB_SIMD_RVV) + return __riscv_vmv_v_x_u8m1(c, 16); #else return ~UINT64CONST(0) / 0xFF * c; #endif @@ -182,6 +201,8 @@ static inline bool flb_vector8_is_highbit_set(const flb_vector8 v) return _mm_movemask_epi8(v) != 0; #elif defined(FLB_SIMD_NEON) return vmaxvq_u8(v) > 0x7F; +#elif defined(FLB_SIMD_RVV) + return __riscv_vmv_x_s_u8m1_u8(__riscv_vredmaxu_vs_u8m1_u8m1(v, __riscv_vmv_v_x_u8m1(0, 16), 16)); #else return v & flb_vector8_broadcast(0x80); #endif @@ -249,6 +270,8 @@ static inline char *flb_simd_info() return "SSE2"; #elif defined(FLB_SIMD_NEON) return "NEON"; + #elif defined(FLB_SIMD_RVV) + return "RVV"; #elif defined(FLB_SIMD_NONE) return "none"; #else From 3dfa772559d3792202a1626b6f65e52c9781d97c Mon Sep 17 00:00:00 2001 From: Hiroshi Hatake Date: Mon, 16 Dec 2024 20:12:51 +0900 Subject: [PATCH 2/7] build: Add C flags for enabling RVV intristics Signed-off-by: Hiroshi Hatake --- cmake/riscv64.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/riscv64.cmake b/cmake/riscv64.cmake index c4a8fb15751..a8e32515095 100644 --- a/cmake/riscv64.cmake +++ b/cmake/riscv64.cmake @@ -5,4 +5,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv64)") message(WARNING "LuaJIT is disabled, this platform does not support built-in LuaJIT and system provided one neither.") set(FLB_LUAJIT OFF) endif() + if(FLB_SIMD) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64gcv_zba") + endif() endif () From 12590148b4bcb26ebc2c3f187a3d216989282c70 Mon Sep 17 00:00:00 2001 From: Hiroshi Hatake Date: Mon, 16 Dec 2024 20:13:26 +0900 Subject: [PATCH 3/7] utils: Adopt to use RVV extentions for RISC-V Signed-off-by: Hiroshi Hatake --- src/flb_utils.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/flb_utils.c b/src/flb_utils.c index 55b5bf6cae8..7ba7da534c5 100644 --- a/src/flb_utils.c +++ b/src/flb_utils.c @@ -801,6 +801,11 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_ off_t offset = 0; char tmp[16]; char *p; +#if defined(FLB_SIMD_RVV) + const size_t inst_len = RVV_VEC_INST_LEN; +#else + const size_t inst_len = sizeof(flb_vector8); +#endif /* to encode codepoints > 0xFFFF */ uint16_t high; @@ -816,10 +821,10 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_ p = buf + *off; /* align length to the nearest multiple of the vector size for safe SIMD processing */ - vlen = str_len & ~(sizeof(flb_vector8) - 1); + vlen = str_len & ~(inst_len - 1); for (i = 0;;) { /* SIMD optimization: Process chunk of input string */ - for (; i < vlen; i += sizeof(flb_vector8)) { + for (; i < vlen; i += inst_len) { flb_vector8 chunk; flb_vector8_load(&chunk, (const uint8_t *)&str[i]); @@ -851,7 +856,7 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_ } /* Process remaining characters one by one */ - for (b = 0; b < sizeof(flb_vector8); b++) { + for (b = 0; b < inst_len; b++) { if (i >= str_len) { /* all characters has been processed */ goto done; From 78f87beacd9b8b1b8cdb6b830d525131e9dc72a6 Mon Sep 17 00:00:00 2001 From: Hiroshi Hatake Date: Mon, 16 Dec 2024 20:21:23 +0900 Subject: [PATCH 4/7] simd: Add comments for RVV intrinsics Signed-off-by: Hiroshi Hatake --- include/fluent-bit/flb_simd.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/fluent-bit/flb_simd.h b/include/fluent-bit/flb_simd.h index d7f7950dada..86a223fb24e 100644 --- a/include/fluent-bit/flb_simd.h +++ b/include/fluent-bit/flb_simd.h @@ -60,6 +60,18 @@ typedef uint8x16_t flb_vector8; typedef uint32x4_t flb_vector32; #elif defined(__riscv) && (__riscv_v_intrinsic >= 10000) +/* + * We use RVV (RISC-V "Vector") instructions if the compiler provides + * access to them (as indicated by __riscv_v_intrinsic) and using with + * -march=rv64gcv_zba flag. RVV extension is currently optional for + * risc-v processors. If the processors can handle this RVV + * intrinsics, this extension is able to use on that platform. + * However, there is a few RISC-V prosessors to support RVV + * extensions. + * If there is no RISC-V processor which supports RVV extensions, + * qemu-riscv with -cpu rv64,v=true,zba=true,vlen=128 flags could be + * able to emulate such extensions. + */ #include #define FLB_SIMD_RVV typedef vuint8m1_t flb_vector8; From 9e1a594bd4db97a6ff44ae041b0b3000df44886a Mon Sep 17 00:00:00 2001 From: Hiroshi Hatake Date: Mon, 16 Dec 2024 20:37:01 +0900 Subject: [PATCH 5/7] simd: Specify RVV v0.11 requirement Signed-off-by: Hiroshi Hatake --- include/fluent-bit/flb_simd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/fluent-bit/flb_simd.h b/include/fluent-bit/flb_simd.h index 86a223fb24e..68a50913273 100644 --- a/include/fluent-bit/flb_simd.h +++ b/include/fluent-bit/flb_simd.h @@ -59,7 +59,7 @@ typedef __m128i flb_vector32; typedef uint8x16_t flb_vector8; typedef uint32x4_t flb_vector32; -#elif defined(__riscv) && (__riscv_v_intrinsic >= 10000) +#elif defined(__riscv) && (__riscv_v_intrinsic >= 11000) /* * We use RVV (RISC-V "Vector") instructions if the compiler provides * access to them (as indicated by __riscv_v_intrinsic) and using with From 2b094f5b17e91c6f51096062ed8d169d9742027f Mon Sep 17 00:00:00 2001 From: Hiroshi Hatake Date: Tue, 17 Dec 2024 11:08:09 +0900 Subject: [PATCH 6/7] simd: Assume VLEN length of RISC-V as 128 Signed-off-by: Hiroshi Hatake --- include/fluent-bit/flb_simd.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/fluent-bit/flb_simd.h b/include/fluent-bit/flb_simd.h index 68a50913273..a0fa3f3b32b 100644 --- a/include/fluent-bit/flb_simd.h +++ b/include/fluent-bit/flb_simd.h @@ -76,7 +76,9 @@ typedef uint32x4_t flb_vector32; #define FLB_SIMD_RVV typedef vuint8m1_t flb_vector8; typedef vuint32m1_t flb_vector32; -#define RVV_VEC_INST_LEN 16 + +/* Currently, VLEN is assumed to 128. */ +#define RVV_VEC_INST_LEN (128 / 8) /* 16 */ #else /* From fa27ea440d06c9c171445ff6454e48b4faa613b5 Mon Sep 17 00:00:00 2001 From: Hiroshi Hatake Date: Wed, 18 Dec 2024 14:08:54 +0900 Subject: [PATCH 7/7] simd: utils: Use macro to avoid magic numbers Signed-off-by: Hiroshi Hatake --- include/fluent-bit/flb_simd.h | 26 ++++++++++++++++---------- src/flb_utils.c | 2 +- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/include/fluent-bit/flb_simd.h b/include/fluent-bit/flb_simd.h index a0fa3f3b32b..19e212c67b6 100644 --- a/include/fluent-bit/flb_simd.h +++ b/include/fluent-bit/flb_simd.h @@ -77,8 +77,8 @@ typedef uint32x4_t flb_vector32; typedef vuint8m1_t flb_vector8; typedef vuint32m1_t flb_vector32; -/* Currently, VLEN is assumed to 128. */ -#define RVV_VEC_INST_LEN (128 / 8) /* 16 */ +#define RVV_VEC8_INST_LEN (128 / 8) /* 16 */ +#define RVV_VEC32_INST_LEN (128 / 8 / 4) /* 4 */ #else /* @@ -116,7 +116,7 @@ static inline void flb_vector8_load(flb_vector8 *v, const uint8_t *s) #elif defined(FLB_SIMD_NEON) *v = vld1q_u8(s); #elif defined(FLB_SIMD_RVV) - *v = __riscv_vle8_v_u8m1(s, 16); + *v = __riscv_vle8_v_u8m1(s, RVV_VEC8_INST_LEN); #else memset(v, 0, sizeof(flb_vector8)); #endif @@ -153,7 +153,7 @@ static inline flb_vector8 flb_vector8_ssub(const flb_vector8 v1, const flb_vecto #elif defined(FLB_SIMD_NEON) return vqsubq_u8(v1, v2); #elif defined(FLB_SIMD_RVV) - return __riscv_vssubu_vv_u8m1(v1, v2, 16); + return __riscv_vssubu_vv_u8m1(v1, v2, RVV_VEC8_INST_LEN); #endif } #endif /* ! FLB_SIMD_NONE */ @@ -170,8 +170,10 @@ static inline flb_vector8 flb_vector8_eq(const flb_vector8 v1, const flb_vector8 #elif defined(FLB_SIMD_NEON) return vceqq_u8(v1, v2); #elif defined(FLB_SIMD_RVV) - vbool8_t ret = __riscv_vmseq_vv_u8m1_b8(v1, v2, 16); - return __riscv_vmerge_vvm_u8m1(__riscv_vmv_v_x_u8m1(0, 16), __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), ret, 16); + vbool8_t ret = __riscv_vmseq_vv_u8m1_b8(v1, v2, RVV_VEC8_INST_LEN); + return __riscv_vmerge_vvm_u8m1(__riscv_vmv_v_x_u8m1(0, RVV_VEC8_INST_LEN), + __riscv_vmv_v_x_u8m1(UINT8_MAX, RVV_VEC8_INST_LEN), + ret, RVV_VEC8_INST_LEN); #endif } #endif /* ! FLB_SIMD_NONE */ @@ -184,8 +186,10 @@ static inline flb_vector32 flb_vector32_eq(const flb_vector32 v1, const flb_vect #elif defined(FLB_SIMD_NEON) return vceqq_u32(v1, v2); #elif defined(FLB_SIMD_RVV) - vbool32_t ret = __riscv_vmseq_vv_u32m1_b32(v1, v2, 4); - return __riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), ret, 4); + vbool32_t ret = __riscv_vmseq_vv_u32m1_b32(v1, v2, RVV_VEC32_INST_LEN); + return __riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0, RVV_VEC32_INST_LEN), + __riscv_vmv_v_x_u32m1(UINT32_MAX, RVV_VEC32_INST_LEN), + ret, RVV_VEC32_INST_LEN); #endif } #endif /* ! FLB_SIMD_NONE */ @@ -200,7 +204,7 @@ static inline flb_vector8 flb_vector8_broadcast(const uint8_t c) #elif defined(FLB_SIMD_NEON) return vdupq_n_u8(c); #elif defined(FLB_SIMD_RVV) - return __riscv_vmv_v_x_u8m1(c, 16); + return __riscv_vmv_v_x_u8m1(c, RVV_VEC8_INST_LEN); #else return ~UINT64CONST(0) / 0xFF * c; #endif @@ -216,7 +220,9 @@ static inline bool flb_vector8_is_highbit_set(const flb_vector8 v) #elif defined(FLB_SIMD_NEON) return vmaxvq_u8(v) > 0x7F; #elif defined(FLB_SIMD_RVV) - return __riscv_vmv_x_s_u8m1_u8(__riscv_vredmaxu_vs_u8m1_u8m1(v, __riscv_vmv_v_x_u8m1(0, 16), 16)); + return __riscv_vmv_x_s_u8m1_u8(__riscv_vredmaxu_vs_u8m1_u8m1(v, + __riscv_vmv_v_x_u8m1(0, RVV_VEC8_INST_LEN), + RVV_VEC8_INST_LEN)); #else return v & flb_vector8_broadcast(0x80); #endif diff --git a/src/flb_utils.c b/src/flb_utils.c index 7ba7da534c5..20d09b04d5c 100644 --- a/src/flb_utils.c +++ b/src/flb_utils.c @@ -802,7 +802,7 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_ char tmp[16]; char *p; #if defined(FLB_SIMD_RVV) - const size_t inst_len = RVV_VEC_INST_LEN; + const size_t inst_len = RVV_VEC8_INST_LEN; #else const size_t inst_len = sizeof(flb_vector8); #endif