fluent · cosmo0920 · Dec 16, 2024 · Dec 16, 2024 · Dec 16, 2024 · Dec 16, 2024
@@ -5,4 +5,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv64)")
     message(WARNING "LuaJIT is disabled, this platform does not support built-in LuaJIT and system provided one neither.")
     set(FLB_LUAJIT OFF)
   endif()
+  if(FLB_SIMD)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64gcv_zba")
+  endif()
 endif ()
@@ -59,6 +59,27 @@ typedef __m128i flb_vector32;
 typedef uint8x16_t flb_vector8;
 typedef uint32x4_t flb_vector32;
 
+#elif defined(__riscv) && (__riscv_v_intrinsic >= 11000)
+/*
+ * We use RVV (RISC-V "Vector") instructions if the compiler provides
+ * access to them (as indicated by __riscv_v_intrinsic) and using with
+ * -march=rv64gcv_zba flag. RVV extension is currently optional for
+ * risc-v processors. If the processors can handle this RVV
+ * intrinsics, this extension is able to use on that platform.
+ * However, there is a few RISC-V prosessors to support RVV
+ * extensions.
+ * If there is no RISC-V processor which supports RVV extensions,
+ * qemu-riscv with -cpu rv64,v=true,zba=true,vlen=128 flags could be
+ * able to emulate such extensions.
+ */
+#include <riscv_vector.h>
+#define FLB_SIMD_RVV
+typedef vuint8m1_t flb_vector8;
+typedef vuint32m1_t flb_vector32;
+
+#define RVV_VEC8_INST_LEN  (128 / 8)     /* 16 */
+#define RVV_VEC32_INST_LEN (128 / 8 / 4) /*  4 */
+
 #else
 /*
  * If no SIMD instructions are available, we can in some cases emulate vector
@@ -94,6 +115,8 @@ static inline void flb_vector8_load(flb_vector8 *v, const uint8_t *s)
 	*v = _mm_loadu_si128((const __m128i *) s);
 #elif defined(FLB_SIMD_NEON)
 	*v = vld1q_u8(s);
+#elif defined(FLB_SIMD_RVV)
+	*v = __riscv_vle8_v_u8m1(s, RVV_VEC8_INST_LEN);
 #else
 	memset(v, 0, sizeof(flb_vector8));
 #endif
@@ -129,6 +152,8 @@ static inline flb_vector8 flb_vector8_ssub(const flb_vector8 v1, const flb_vecto
 	return _mm_subs_epu8(v1, v2);
 #elif defined(FLB_SIMD_NEON)
 	return vqsubq_u8(v1, v2);
+#elif defined(FLB_SIMD_RVV)
+	return __riscv_vssubu_vv_u8m1(v1, v2, RVV_VEC8_INST_LEN);
 #endif
 }
 #endif /* ! FLB_SIMD_NONE */
@@ -144,6 +169,11 @@ static inline flb_vector8 flb_vector8_eq(const flb_vector8 v1, const flb_vector8
 	return _mm_cmpeq_epi8(v1, v2);
 #elif defined(FLB_SIMD_NEON)
 	return vceqq_u8(v1, v2);
+#elif defined(FLB_SIMD_RVV)
+	vbool8_t ret = __riscv_vmseq_vv_u8m1_b8(v1, v2, RVV_VEC8_INST_LEN);
+	return __riscv_vmerge_vvm_u8m1(__riscv_vmv_v_x_u8m1(0, RVV_VEC8_INST_LEN),
+								   __riscv_vmv_v_x_u8m1(UINT8_MAX, RVV_VEC8_INST_LEN),
+								   ret, RVV_VEC8_INST_LEN);
 #endif
 }
 #endif /* ! FLB_SIMD_NONE */
@@ -155,6 +185,11 @@ static inline flb_vector32 flb_vector32_eq(const flb_vector32 v1, const flb_vect
 	return _mm_cmpeq_epi32(v1, v2);
 #elif defined(FLB_SIMD_NEON)
 	return vceqq_u32(v1, v2);
+#elif defined(FLB_SIMD_RVV)
+	vbool32_t ret = __riscv_vmseq_vv_u32m1_b32(v1, v2, RVV_VEC32_INST_LEN);
+	return __riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0, RVV_VEC32_INST_LEN),
+									__riscv_vmv_v_x_u32m1(UINT32_MAX, RVV_VEC32_INST_LEN),
+									ret, RVV_VEC32_INST_LEN);
 #endif
 }
 #endif /* ! FLB_SIMD_NONE */
@@ -168,6 +203,8 @@ static inline flb_vector8 flb_vector8_broadcast(const uint8_t c)
 	return _mm_set1_epi8(c);
 #elif defined(FLB_SIMD_NEON)
 	return vdupq_n_u8(c);
+#elif defined(FLB_SIMD_RVV)
+	return __riscv_vmv_v_x_u8m1(c, RVV_VEC8_INST_LEN);
 #else
 	return ~UINT64CONST(0) / 0xFF * c;
 #endif
@@ -182,6 +219,10 @@ static inline bool flb_vector8_is_highbit_set(const flb_vector8 v)
 	return _mm_movemask_epi8(v) != 0;
 #elif defined(FLB_SIMD_NEON)
 	return vmaxvq_u8(v) > 0x7F;
+#elif defined(FLB_SIMD_RVV)
+	return __riscv_vmv_x_s_u8m1_u8(__riscv_vredmaxu_vs_u8m1_u8m1(v,
+																 __riscv_vmv_v_x_u8m1(0, RVV_VEC8_INST_LEN),
+																 RVV_VEC8_INST_LEN));
 #else
 	return v & flb_vector8_broadcast(0x80);
 #endif
@@ -249,6 +290,8 @@ static inline char *flb_simd_info()
 			return "SSE2";
 		#elif defined(FLB_SIMD_NEON)
 			return "NEON";
+		#elif defined(FLB_SIMD_RVV)
+			return "RVV";
 		#elif defined(FLB_SIMD_NONE)
 			return "none";
 		#else

@@ -801,6 +801,11 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_
     off_t offset = 0;
     char tmp[16];
     char *p;
+#if defined(FLB_SIMD_RVV)
+    const size_t inst_len = RVV_VEC8_INST_LEN;
+#else
+    const size_t inst_len = sizeof(flb_vector8);
+#endif
 
     /* to encode codepoints > 0xFFFF */
     uint16_t high;
@@ -816,10 +821,10 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_
     p = buf + *off;
 
     /* align length to the nearest multiple of the vector size for safe SIMD processing */
-    vlen = str_len & ~(sizeof(flb_vector8) - 1);
+    vlen = str_len & ~(inst_len - 1);
     for (i = 0;;) {
         /* SIMD optimization: Process chunk of input string */
-        for (; i < vlen; i += sizeof(flb_vector8)) {
+        for (; i < vlen; i += inst_len) {
             flb_vector8 chunk;
             flb_vector8_load(&chunk, (const uint8_t *)&str[i]);
 
@@ -851,7 +856,7 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_
         }
 
         /* Process remaining characters one by one */
-        for (b = 0; b < sizeof(flb_vector8); b++) {
+        for (b = 0; b < inst_len; b++) {
             if (i >= str_len) {
                 /* all characters has been processed */
                 goto done;