From b9cf63503e01158f1df8dbb18c18a0ed3a5a4e01 Mon Sep 17 00:00:00 2001
From: Yoan Picchi
Date: Mon, 22 Jul 2024 16:14:00 +0000
Subject: [PATCH 1/2] FDR unflip the domain mask

The domain mask was being flipped, then unflipped, without the flipped
state ever being used. This patch removes the unnecessary flipping.

Signed-off-by: Yoan Picchi
---
 src/fdr/fdr.c | 34 ++++++++++++++++------------------
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c
index 302487a31..a4fddfc05 100644
--- a/src/fdr/fdr.c
+++ b/src/fdr/fdr.c
@@ -143,11 +143,10 @@ m128 getInitState(const struct FDR *fdr, u8 len_history, const u64a *ft,
 
 static really_inline
 void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
-                       UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
+                       UNUSED const u8 *end_ptr, uint16_t domain_mask,
                        const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
     /* +1: the zones ensure that we can read the byte at z->end */
     assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
-    u64a domain_mask = ~domain_mask_flipped;
 
     u64a it_hi = *(const u64a *)itPtr;
     u64a it_lo = *(const u64a *)(itPtr + 8);
@@ -212,24 +211,24 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
 
 static really_inline
 void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr,
-                       UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
+                       UNUSED const u8 *end_ptr, uint16_t domain_mask,
                        const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
     assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
 
-    u64a reach0 = andn(domain_mask_flipped, itPtr);
-    u64a reach2 = andn(domain_mask_flipped, itPtr + 2);
-    u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
-    u64a reach6 = andn(domain_mask_flipped, itPtr + 6);
+    u64a reach0 = domain_mask & unaligned_load_u32(itPtr);
+    u64a reach2 = domain_mask & unaligned_load_u32(itPtr + 2);
+    u64a reach4 = domain_mask & unaligned_load_u32(itPtr + 4);
+    u64a reach6 = domain_mask & unaligned_load_u32(itPtr + 6);
 
     m128 st0 = load_m128_from_u64a(ft + reach0);
     m128 st2 = load_m128_from_u64a(ft + reach2);
     m128 st4 = load_m128_from_u64a(ft + reach4);
     m128 st6 = load_m128_from_u64a(ft + reach6);
 
-    u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
-    u64a reach10 = andn(domain_mask_flipped, itPtr + 10);
-    u64a reach12 = andn(domain_mask_flipped, itPtr + 12);
-    u64a reach14 = andn(domain_mask_flipped, itPtr + 14);
+    u64a reach8 = domain_mask & unaligned_load_u32(itPtr + 8);
+    u64a reach10 = domain_mask & unaligned_load_u32(itPtr + 10);
+    u64a reach12 = domain_mask & unaligned_load_u32(itPtr + 12);
+    u64a reach14 = domain_mask & unaligned_load_u32(itPtr + 14);
 
     m128 st8 = load_m128_from_u64a(ft + reach8);
     m128 st10 = load_m128_from_u64a(ft + reach10);
@@ -265,14 +264,14 @@ void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr,
 
 static really_inline
 void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr,
-                       UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
+                       UNUSED const u8 *end_ptr, uint16_t domain_mask,
                        const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
     assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
 
-    u64a reach0 = andn(domain_mask_flipped, itPtr);
-    u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
-    u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
-    u64a reach12 = andn(domain_mask_flipped, itPtr + 12);
+    u64a reach0 = domain_mask & unaligned_load_u32(itPtr);
+    u64a reach4 = domain_mask & unaligned_load_u32(itPtr + 4);
+    u64a reach8 = domain_mask & unaligned_load_u32(itPtr + 8);
+    u64a reach12 = domain_mask & unaligned_load_u32(itPtr + 12);
 
     m128 st0 = load_m128_from_u64a(ft + reach0);
     m128 st4 = load_m128_from_u64a(ft + reach4);
@@ -683,7 +682,7 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend,
         __builtin_prefetch(itPtr + ITER_BYTES);                             \
         u64a conf0;                                                         \
         u64a conf8;                                                         \
-        get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_flipped,         \
+        get_conf_fn(itPtr, start_ptr, end_ptr, fdr->domainMask,             \
                     ft, &conf0, &conf8, &s);                                \
         do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr,             \
                        &last_match_id, zz);                                 \
@@ -703,7 +702,6 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
 
     u32 floodBackoff = FLOOD_BACKOFF_START;
     u32 last_match_id = INVALID_MATCH_ID;
-    u32 domain_mask_flipped = ~fdr->domainMask;
     u8 stride = fdr->stride;
     const u64a *ft =
         (const u64a *)((const u8 *)fdr + ROUNDUP_CL(sizeof(struct FDR)));

From b1dea77ea405575ffb9d5075297083f5bec78ec2 Mon Sep 17 00:00:00 2001
From: Yoan Picchi
Date: Fri, 19 Jul 2024 16:33:25 +0000
Subject: [PATCH 2/2] FDR vectorise get_conf_stride's loads for NEON

get_conf_stride_1 loads 16 consecutive bytes and applies a mask and a
shift to each of them. We can do that easily in a vectorised way
instead. This speeds up FDR by around 5%. get_conf_stride_2 also
benefits from it, but with less data to process, the overhead of
vectorisation limits most of the gain.

Signed-off-by: Yoan Picchi
---
 src/fdr/fdr.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 66 insertions(+), 9 deletions(-)

diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c
index a4fddfc05..526acc054 100644
--- a/src/fdr/fdr.c
+++ b/src/fdr/fdr.c
@@ -147,7 +147,35 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
                        const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
     /* +1: the zones ensure that we can read the byte at z->end */
     assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
-
+#if defined(HAVE_NEON)
+    uint8x16_t input = vld1q_u8(itPtr);
+    uint8x16_t shifted_input = vextq_u8(input, vdupq_n_u8(0), 1);
+
+    uint16x8_t even = vreinterpretq_u16_u8(input);
+    uint16x8_t odd = vreinterpretq_u16_u8(shifted_input);
+    // Between those two we have 15 of the 16 values; the last one will still use a scalar load.
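+    // On a little-endian target, reinterpreting the loaded bytes as uint16x8_t
+    // gives the 16-bit windows at even offsets (0, 2, ..., 14); the byte-shifted
+    // copy gives the windows at odd offsets (1, 3, ..., 13).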
+
+    uint16x8_t vect_domain_mask = vdupq_n_u16(domain_mask);
+    even = vandq_u16(vect_domain_mask, even);
+    odd = vandq_u16(vect_domain_mask, odd);
+
+    uint16_t reach0 = vgetq_lane_u16(even, 0);
+    uint16_t reach1 = vgetq_lane_u16(odd, 0);
+    uint16_t reach2 = vgetq_lane_u16(even, 1);
+    uint16_t reach3 = vgetq_lane_u16(odd, 1);
+    uint16_t reach4 = vgetq_lane_u16(even, 2);
+    uint16_t reach5 = vgetq_lane_u16(odd, 2);
+    uint16_t reach6 = vgetq_lane_u16(even, 3);
+    uint16_t reach7 = vgetq_lane_u16(odd, 3);
+    uint16_t reach8 = vgetq_lane_u16(even, 4);
+    uint16_t reach9 = vgetq_lane_u16(odd, 4);
+    uint16_t reach10 = vgetq_lane_u16(even, 5);
+    uint16_t reach11 = vgetq_lane_u16(odd, 5);
+    uint16_t reach12 = vgetq_lane_u16(even, 6);
+    uint16_t reach13 = vgetq_lane_u16(odd, 6);
+    uint16_t reach14 = vgetq_lane_u16(even, 7);
+    uint16_t reach15 = domain_mask & unaligned_load_u16(itPtr + 15);
+#else
     u64a it_hi = *(const u64a *)itPtr;
     u64a it_lo = *(const u64a *)(itPtr + 8);
     u64a reach0 = domain_mask & it_hi;
@@ -166,6 +194,7 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
     u64a reach13 = domain_mask & (it_lo >> 40);
     u64a reach14 = domain_mask & (it_lo >> 48);
     u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15);
+#endif
 
     m128 st0 = load_m128_from_u64a(ft + reach0);
     m128 st1 = lshiftbyte_m128(load_m128_from_u64a(ft + reach1), 1);
@@ -215,25 +244,53 @@ void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr,
                        const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
     assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
 
-    u64a reach0 = domain_mask & unaligned_load_u32(itPtr);
-    u64a reach2 = domain_mask & unaligned_load_u32(itPtr + 2);
-    u64a reach4 = domain_mask & unaligned_load_u32(itPtr + 4);
-    u64a reach6 = domain_mask & unaligned_load_u32(itPtr + 6);
+#if defined(HAVE_NEON)
+    uint8x16_t input = vld1q_u8(itPtr);
+    uint16x8_t even = vreinterpretq_u16_u8(input);
+
+    uint16x8_t vect_domain_mask = vdupq_n_u16(domain_mask);
+    even = vandq_u16(vect_domain_mask, even);
+
+    uint16_t reach0 = vgetq_lane_u16(even, 0);
+    uint16_t reach2 = vgetq_lane_u16(even, 1);
+    uint16_t reach4 = vgetq_lane_u16(even, 2);
+    uint16_t reach6 = vgetq_lane_u16(even, 3);
 
     m128 st0 = load_m128_from_u64a(ft + reach0);
     m128 st2 = load_m128_from_u64a(ft + reach2);
     m128 st4 = load_m128_from_u64a(ft + reach4);
     m128 st6 = load_m128_from_u64a(ft + reach6);
 
-    u64a reach8 = domain_mask & unaligned_load_u32(itPtr + 8);
-    u64a reach10 = domain_mask & unaligned_load_u32(itPtr + 10);
-    u64a reach12 = domain_mask & unaligned_load_u32(itPtr + 12);
-    u64a reach14 = domain_mask & unaligned_load_u32(itPtr + 14);
+    uint16_t reach8 = vgetq_lane_u16(even, 4);
+    uint16_t reach10 = vgetq_lane_u16(even, 5);
+    uint16_t reach12 = vgetq_lane_u16(even, 6);
+    uint16_t reach14 = vgetq_lane_u16(even, 7);
+
+    m128 st8 = load_m128_from_u64a(ft + reach8);
+    m128 st10 = load_m128_from_u64a(ft + reach10);
+    m128 st12 = load_m128_from_u64a(ft + reach12);
+    m128 st14 = load_m128_from_u64a(ft + reach14);
+#else
+    u64a reach0 = domain_mask & unaligned_load_u32(itPtr);
+    u64a reach2 = domain_mask & unaligned_load_u32(itPtr + 2);
+    u64a reach4 = domain_mask & unaligned_load_u32(itPtr + 4);
+    u64a reach6 = domain_mask & unaligned_load_u32(itPtr + 6);
+
+    m128 st0 = load_m128_from_u64a(ft + reach0);
+    m128 st2 = load_m128_from_u64a(ft + reach2);
+    m128 st4 = load_m128_from_u64a(ft + reach4);
+    m128 st6 = load_m128_from_u64a(ft + reach6);
+
+    u64a reach8 = domain_mask & unaligned_load_u32(itPtr + 8);
+    u64a reach10 = domain_mask & unaligned_load_u32(itPtr + 10);
+    u64a reach12 = domain_mask & unaligned_load_u32(itPtr + 12);
+    u64a reach14 = domain_mask & unaligned_load_u32(itPtr + 14);
 
     m128 st8 = load_m128_from_u64a(ft + reach8);
     m128 st10 = load_m128_from_u64a(ft + reach10);
     m128 st12 = load_m128_from_u64a(ft + reach12);
     m128 st14 = load_m128_from_u64a(ft + reach14);
+#endif
 
     st2 = lshiftbyte_m128(st2, 2);
     st4 = lshiftbyte_m128(st4, 4);