diff --git a/difftest/online_drive/src/dpi.rs b/difftest/online_drive/src/dpi.rs index b4c17e8ae..0170b4583 100644 --- a/difftest/online_drive/src/dpi.rs +++ b/difftest/online_drive/src/dpi.rs @@ -49,7 +49,7 @@ unsafe fn load_from_payload( let data = &byte_vec[strb_width_in_byte..]; let strb_width_in_bit = std::cmp::min(8, data_width_in_byte); - let mut masks: Vec = strobe + let masks: Vec = strobe .into_iter() .flat_map(|strb| { let mask: Vec = (0..strb_width_in_bit).map(|i| (strb & (1 << i)) != 0).collect(); diff --git a/tests/rvv_bench/_include/bench.h b/tests/rvv_bench/_include/bench.h deleted file mode 100644 index 126346d4a..000000000 --- a/tests/rvv_bench/_include/bench.h +++ /dev/null @@ -1,170 +0,0 @@ -#include "config.h" -#include "nolibc.h" - -#ifndef BENCH_NEXT - #define BENCH_NEXT NEXT -#endif - -#define MX(f, F) f(F##_m1) f(F##_m2) f(F##_m4) f(F##_m8) -#define STR(x) STR_(x) -#define STR_(x) #x - -#define ROTL(x, n) (((x) << (n)) | ((x) >> (8 * sizeof(x) - (n)))) - -#if defined(__clang__) || defined(__GNUC__) || defined(__INTEL_COMPILER) - -#define BENCH_CLOBBER() ({ __asm volatile("" ::: "memory"); }) -#define BENCH_VOLATILE(x) \ - ({ __asm volatile("" : "+g"(x) : "g"(x) : "memory"); }) -#define BENCH_VOLATILE_REG(x) \ - ({ __asm volatile("" : "+r"(x) : "r"(x) : "memory"); }) -#define BENCH_VOLATILE_MEM(x) \ - ({ __asm volatile("" : "+m"(x) : "m"(x) : "memory"); }) -#define BENCH_FENCE() ({ __asm volatile("fence.i"); }) - -#define BENCH_MAY_ALIAS __attribute__((__may_alias__)) - -#else - -#define BENCH_CLOBBER() -#define BENCH_CLOBBER_WITH(x) (bench__use_ptr(&(x)), BENCH_CLOBBER()) -#define BENCH_CLOBBER_WITH_REG(x) (bench__use_ptr(&(x)), BENCH_CLOBBER()) -#define BENCH_CLOBBER_WITH_MEM(x) (bench__use_ptr(&(x)), BENCH_CLOBBER()) -static void bench_use_ptr(char const volatile *x) {} - -#define BENCH_MAY_ALIAS - -#endif - -static int compare_ux(void const *a, void const *b) { - ux A = *(ux *)a, B = *(ux *)b; - return A < B ? -1 : A > B ? 1 : 0; -} - -typedef struct { - ux x, y, z; -} RandState; -static RandState randState = {123, 456, 789}; - -/* RomuDuoJr, see https://romu-random.org/ */ -static ux urand(void) { - ux xp = randState.x, yp = randState.y, zp = randState.z; - randState.x = 3323815723u * zp; - randState.y = ROTL(yp - xp, 6); - randState.z = ROTL(zp - yp, 22); - return xp; -} - -typedef struct { - char const *name; - void *func; -} Impl; -typedef struct { - size_t N; - char const *name; - ux (*func)(void *, size_t); -} Bench; - -static unsigned char *mem = 0; - -void bench_main(void); -ux checksum(size_t n); -void init(void); - -static void memrand(void *ptr, size_t n) { - unsigned char *p = ptr; -#ifdef __GNUC__ - typedef ux __attribute__((__may_alias__)) uxa; - for (; n && (uintptr_t)p % sizeof(uxa); --n) - *p++ = urand(); - uxa *px = (uxa *)p; - for (; n > sizeof(ux); n -= sizeof(ux)) - *px++ = urand(); - p = (unsigned char *)px; -#endif - while (n--) - *p++ = urand(); -} - -#if __STDC_HOSTED__ -#include -#else -static ux heap[1 + MAX_MEM / sizeof(ux)]; -#endif - -int test(void) { - -#if __STDC_HOSTED__ - mem = malloc(MAX_MEM); -#else - mem = (unsigned char *)heap; -#endif - - size_t x; - randState.x ^= rv_cycles() * 7; - randState.y += rv_cycles() ^ (uintptr_t)&x + 666 * (uintptr_t)mem; - - /* initialize memory */ - memrand(mem, MAX_MEM); - - init(); - bench_main(); -#if __STDC_HOSTED__ - free(mem); -#endif - return 0; -} - -static fx bench_time(size_t n, Impl impl, Bench bench) { - static ux arr[MAX_REPEATS]; - size_t total = 0, repeats = 0; - for (; repeats < MAX_REPEATS; ++repeats) { - total += arr[repeats] = bench.func(impl.func, n); - if (repeats > MIN_REPEATS && total > STOP_CYCLES) - break; - } -#if MAX_REPEATS > 4 - qsort(arr, repeats, sizeof *arr, compare_ux); - ux sum = 0, count = 0; - for (size_t i = repeats * 0.2f; i < repeats * 0.8f; ++i, ++count) - sum += arr[i]; -#else - ux sum = 0, count = repeats; - for (size_t i = 0; i < repeats; ++i) - sum += arr[i]; -#endif - return n / ((fx)sum / count); -} - -static void bench_run(size_t nImpls, Impl *impls, size_t nBenches, - Bench *benches) { - for (Bench *b = benches; b != benches + nBenches; ++b) { - size_t N = b->N; - for (Impl *i = impls; i != impls + nImpls; ++i) { - printf("["); - for (size_t n = 1; n < N; n = BENCH_NEXT(n)) { - ux si = 0, s0 = 0; - printf("%f, ", bench_time(n, *i, *b)); - } - printf("],\n"); - } - printf("]\n},\n"); - } -} - -#define TIME \ - for (ux beg = rv_cycles(), _once = 1; _once; \ - BENCH_FENCE(), _cycles += rv_cycles() - beg, _once = 0) - -#define BENCH(name) \ - ux bench_##name(void *_func, size_t n) { \ - Func *f = _func; \ - ux _cycles = 0; -#define BENCH_END \ - return _cycles; \ - } - -#define BENCH_MAIN(impls, benches) \ - void bench_main(void) { \ - bench_run(ARR_LEN(impls), impls, ARR_LEN(benches), benches); \ - } diff --git a/tests/rvv_bench/_include/config.h b/tests/rvv_bench/_include/config.h deleted file mode 100644 index 44f1009b0..000000000 --- a/tests/rvv_bench/_include/config.h +++ /dev/null @@ -1,25 +0,0 @@ -/* processor specific configs */ -#define HAS_E64 (__riscv_v_elen >= 64) -#define HAS_F16 0 - -/* the maximum number of bytes to allocate, minimum of 4096 */ -#define MAX_MEM (4096 * 8) -/* the byte count for the next run */ -#define NEXT(c) (c + c / 3 + 3) - -/* minimum number of repeats, to sample median from */ -#define MIN_REPEATS 1 -/* maxium number of repeats, executed until more than STOP_TIME has elapsed */ -#define MAX_REPEATS 1 - -/* stop repeats early afer this many cycles have elapsed */ -#define STOP_CYCLES (1024 * 1024 * 500) - -/* custom scaling factors for benchmarks, these are used to make sure each - * benchmark approximately takes the same amount of time. */ - -#define SCALE_mandelbrot(N) ((N) / 10) -#define SCALE_mergelines(N) ((N) / 10) - -/* benchmark specific configurations */ -#define mandelbrot_ITER 100 diff --git a/tests/rvv_bench/_include/nolibc.h b/tests/rvv_bench/_include/nolibc.h deleted file mode 100644 index 88f31d136..000000000 --- a/tests/rvv_bench/_include/nolibc.h +++ /dev/null @@ -1,80 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -#include -#include -#include - -#if __riscv_xlen == 32 -typedef uint32_t ux; -typedef float fx; -#define IF64(...) -#elif __riscv_xlen == 64 -typedef uint64_t ux; -typedef double fx; -#define IF64(...) __VA_ARGS__ -#else -#error "unsupported XLEN" -#endif -#define ARR_LEN(x) (sizeof x / sizeof *(x)) - -static void memwrite(void const *ptr, size_t len) { - fwrite(ptr, 1, len, stdout); -} - -static size_t memread(void *ptr, size_t len) { - return fread(ptr, 1, len, stdin); -} - -static inline ux rv_cycles(void) { - ux cycle; - __asm volatile("csrr %0, mcycle" : "=r"(cycle)); - return cycle; -} - -static void memswap(void *a, void *b, size_t size) { - unsigned char *A = (unsigned char *)a, *B = (unsigned char *)b; - unsigned char *aEnd = A + size; - while (A < aEnd) { - unsigned char temp = *A; - *A++ = *B; - *B++ = temp; - } -} - -static ux usqrt(ux y) { - ux L = 0, R = y + 1; - while (L != R - 1) { - ux M = (L + R) / 2; - if (M * M <= y) - L = M; - else - R = M; - } - return L; -} - -static ux uhash(ux x) { -#if __riscv_xlen == 32 - /* MurmurHash3 32-bit finalizer */ - x ^= x >> 16; - x *= 0x85ebca6b; - x ^= x >> 13; - x *= 0xc2b2ae35; - x ^= x >> 16; -#else - /* splitmix64 finalizer */ - x ^= x >> 30; - x *= 0xbf58476d1ce4e5b9U; - x ^= x >> 27; - x *= 0x94d049bb133111ebU; - x ^= x >> 31; -#endif - return x; -} - -#define IFHOSTED(...) __VA_ARGS__ diff --git a/tests/rvv_bench/_include/template.S b/tests/rvv_bench/_include/template.S deleted file mode 100644 index eabdd5017..000000000 --- a/tests/rvv_bench/_include/template.S +++ /dev/null @@ -1,80 +0,0 @@ -#define HAS_RVV_1_0 1 -#include "config.h" -.text -.balign 8 - -#define CAT_(a,b) a##b -#define CAT(a,b) CAT_(a,b) - -#define STR(x) #x -#define STRe(x) STR(x) - -#define MX_N 0 -#include STRe(INC) - -#undef MX_N - -#define MX_N 1 -#define MX8(x) x##m8 -#define MX4(x) x##m4 -#define MX2(x) x##m2 -#define MX(x) x##m1 -#if HAS_RVV_1_0 -#define MXf2(x) x##mf2 -#define MXf4(x) x##mf4 -# define MXf8(x) x##mf8 -#endif -#include STRe(INC) - -#undef MX_N -#undef MX8 -#undef MX4 -#undef MX2 -#undef MX -#undef MXf2 -#undef MXf4 -#undef MXf8 - -#define MX_N 2 -#define MX4(x) x##m8 -#define MX2(x) x##m4 -#define MX(x) x##m2 -#define MXf2(x) x##m1 -#if HAS_RVV_1_0 -#define MXf4(x) x##mf2 -# define MXf8(x) x##mf4 -#endif -#include STRe(INC) - -#undef MX_N -#undef MX4 -#undef MX2 -#undef MX -#undef MXf2 -#undef MXf4 -#undef MXf8 - -#define MX_N 4 -#define MX2(x) x##m8 -#define MX(x) x##m4 -#define MXf2(x) x##m2 -#define MXf4(x) x##m1 -#if HAS_RVV_1_0 -# define MXf8(x) x##mf2 -#endif -#include STRe(INC) - -#undef MX_N -#undef MX2 -#undef MX -#undef MXf2 -#undef MXf4 -#undef MXf8 - -#define MX_N 8 -#define MX(x) x##m8 -#define MXf2(x) x##m4 -#define MXf4(x) x##m2 -#define MXf8(x) x##m1 -#include STRe(INC) - diff --git a/tests/rvv_bench/_include/thirdparty/boring.c b/tests/rvv_bench/_include/thirdparty/boring.c deleted file mode 100644 index e7cea237e..000000000 --- a/tests/rvv_bench/_include/thirdparty/boring.c +++ /dev/null @@ -1,383 +0,0 @@ -/* Copyright (c) 2014, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -// Adapted from the public domain, estream code by D. Bernstein. - -#include "boring.h" - - -extern void *memcpy(void *restrict dest, void const *restrict src, size_t n); - -#define U8TO32_LITTLE(p) \ - (((uint32_t)((p)[0])) | ((uint32_t)((p)[1]) << 8) | \ - ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24)) - -// sigma contains the ChaCha constants, which happen to be an ASCII string. -static const uint8_t sigma[16] = { 'e', 'x', 'p', 'a', 'n', 'd', ' ', '3', - '2', '-', 'b', 'y', 't', 'e', ' ', 'k' }; - -#define ROTATE(v, n) (((v) << (n)) | ((v) >> (32 - (n)))) - -// QUARTERROUND updates a, b, c, d with a ChaCha "quarter" round. -#define QUARTERROUND(a, b, c, d) \ - x[a] += x[b]; x[d] = ROTATE(x[d] ^ x[a], 16); \ - x[c] += x[d]; x[b] = ROTATE(x[b] ^ x[c], 12); \ - x[a] += x[b]; x[d] = ROTATE(x[d] ^ x[a], 8); \ - x[c] += x[d]; x[b] = ROTATE(x[b] ^ x[c], 7); - -#define U32TO8_LITTLE(p, v) \ - { \ - (p)[0] = (v >> 0) & 0xff; \ - (p)[1] = (v >> 8) & 0xff; \ - (p)[2] = (v >> 16) & 0xff; \ - (p)[3] = (v >> 24) & 0xff; \ - } - -// chacha_core performs 20 rounds of ChaCha on the input words in -// |input| and writes the 64 output bytes to |output|. -static void chacha_core(uint8_t output[64], const uint32_t input[16]) { - uint32_t x[16]; - int i; - - memcpy(x, input, sizeof(uint32_t) * 16); - for (i = 20; i > 0; i -= 2) { - QUARTERROUND(0, 4, 8, 12) - QUARTERROUND(1, 5, 9, 13) - QUARTERROUND(2, 6, 10, 14) - QUARTERROUND(3, 7, 11, 15) - QUARTERROUND(0, 5, 10, 15) - QUARTERROUND(1, 6, 11, 12) - QUARTERROUND(2, 7, 8, 13) - QUARTERROUND(3, 4, 9, 14) - } - - for (i = 0; i < 16; ++i) { - x[i] += input[i]; - } - for (i = 0; i < 16; ++i) { - U32TO8_LITTLE(output + 4 * i, x[i]); - } -} - -void boring_chacha20(uint8_t *out, const uint8_t *in, size_t in_len, - const uint8_t key[32], const uint8_t nonce[12], - uint32_t counter) { - - uint32_t input[16]; - uint8_t buf[64]; - size_t todo, i; - - input[0] = U8TO32_LITTLE(sigma + 0); - input[1] = U8TO32_LITTLE(sigma + 4); - input[2] = U8TO32_LITTLE(sigma + 8); - input[3] = U8TO32_LITTLE(sigma + 12); - - input[4] = U8TO32_LITTLE(key + 0); - input[5] = U8TO32_LITTLE(key + 4); - input[6] = U8TO32_LITTLE(key + 8); - input[7] = U8TO32_LITTLE(key + 12); - - input[8] = U8TO32_LITTLE(key + 16); - input[9] = U8TO32_LITTLE(key + 20); - input[10] = U8TO32_LITTLE(key + 24); - input[11] = U8TO32_LITTLE(key + 28); - - input[12] = counter; - input[13] = U8TO32_LITTLE(nonce + 0); - input[14] = U8TO32_LITTLE(nonce + 4); - input[15] = U8TO32_LITTLE(nonce + 8); - - while (in_len > 0) { - todo = sizeof(buf); - if (in_len < todo) { - todo = in_len; - } - - chacha_core(buf, input); - for (i = 0; i < todo; i++) { - out[i] = in[i] ^ buf[i]; - } - - out += todo; - in += todo; - in_len -= todo; - - input[12]++; - } -} - -///// poly1305 - -static uint32_t U8TO32_LE(const uint8_t *m) { - uint32_t r; - memcpy(&r, m, sizeof(r)); - return r; -} - -static void U32TO8_LE(uint8_t *m, uint32_t v) { - memcpy(m, &v, sizeof(v)); -} - - -static uint64_t mul32x32_64(uint32_t a, uint32_t b) { return (uint64_t)a * b; } - -struct poly1305_state_st { - uint32_t r0, r1, r2, r3, r4; - uint32_t s1, s2, s3, s4; - uint32_t h0, h1, h2, h3, h4; - uint8_t buf[16]; - unsigned int buf_used; - uint8_t key[16]; -}; - -static inline struct poly1305_state_st *poly1305_aligned_state( - poly1305_state *state) { - return (struct poly1305_state_st *)(((uintptr_t)state + 63) & ~63); -} - -static void poly1305_update(struct poly1305_state_st *state, const uint8_t *in, - size_t len) { - uint32_t t0, t1, t2, t3; - uint64_t t[5]; - uint32_t b; - uint64_t c; - size_t j; - uint8_t mp[16]; - - if (len < 16) { - goto poly1305_donna_atmost15bytes; - } - - poly1305_donna_16bytes: - t0 = U8TO32_LE(in); - t1 = U8TO32_LE(in + 4); - t2 = U8TO32_LE(in + 8); - t3 = U8TO32_LE(in + 12); - - in += 16; - len -= 16; - - state->h0 += t0 & 0x3ffffff; - state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; - state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; - state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; - state->h4 += (t3 >> 8) | (1 << 24); - - poly1305_donna_mul: - t[0] = mul32x32_64(state->h0, state->r0) + mul32x32_64(state->h1, state->s4) + - mul32x32_64(state->h2, state->s3) + mul32x32_64(state->h3, state->s2) + - mul32x32_64(state->h4, state->s1); - t[1] = mul32x32_64(state->h0, state->r1) + mul32x32_64(state->h1, state->r0) + - mul32x32_64(state->h2, state->s4) + mul32x32_64(state->h3, state->s3) + - mul32x32_64(state->h4, state->s2); - t[2] = mul32x32_64(state->h0, state->r2) + mul32x32_64(state->h1, state->r1) + - mul32x32_64(state->h2, state->r0) + mul32x32_64(state->h3, state->s4) + - mul32x32_64(state->h4, state->s3); - t[3] = mul32x32_64(state->h0, state->r3) + mul32x32_64(state->h1, state->r2) + - mul32x32_64(state->h2, state->r1) + mul32x32_64(state->h3, state->r0) + - mul32x32_64(state->h4, state->s4); - t[4] = mul32x32_64(state->h0, state->r4) + mul32x32_64(state->h1, state->r3) + - mul32x32_64(state->h2, state->r2) + mul32x32_64(state->h3, state->r1) + - mul32x32_64(state->h4, state->r0); - - state->h0 = (uint32_t)t[0] & 0x3ffffff; - c = (t[0] >> 26); - t[1] += c; - state->h1 = (uint32_t)t[1] & 0x3ffffff; - b = (uint32_t)(t[1] >> 26); - t[2] += b; - state->h2 = (uint32_t)t[2] & 0x3ffffff; - b = (uint32_t)(t[2] >> 26); - t[3] += b; - state->h3 = (uint32_t)t[3] & 0x3ffffff; - b = (uint32_t)(t[3] >> 26); - t[4] += b; - state->h4 = (uint32_t)t[4] & 0x3ffffff; - b = (uint32_t)(t[4] >> 26); - state->h0 += b * 5; - - if (len >= 16) { - goto poly1305_donna_16bytes; - } - - // final bytes - poly1305_donna_atmost15bytes: - if (!len) { - return; - } - - for (j = 0; j < len; j++) { - mp[j] = in[j]; - } - mp[j++] = 1; - for (; j < 16; j++) { - mp[j] = 0; - } - len = 0; - - t0 = U8TO32_LE(mp + 0); - t1 = U8TO32_LE(mp + 4); - t2 = U8TO32_LE(mp + 8); - t3 = U8TO32_LE(mp + 12); - - state->h0 += t0 & 0x3ffffff; - state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; - state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; - state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; - state->h4 += (t3 >> 8); - - goto poly1305_donna_mul; -} - -void boring_poly1305_init(poly1305_state *statep, const uint8_t key[32]) { - struct poly1305_state_st *state = poly1305_aligned_state(statep); - uint32_t t0, t1, t2, t3; - - t0 = U8TO32_LE(key + 0); - t1 = U8TO32_LE(key + 4); - t2 = U8TO32_LE(key + 8); - t3 = U8TO32_LE(key + 12); - - // precompute multipliers - state->r0 = t0 & 0x3ffffff; - t0 >>= 26; - t0 |= t1 << 6; - state->r1 = t0 & 0x3ffff03; - t1 >>= 20; - t1 |= t2 << 12; - state->r2 = t1 & 0x3ffc0ff; - t2 >>= 14; - t2 |= t3 << 18; - state->r3 = t2 & 0x3f03fff; - t3 >>= 8; - state->r4 = t3 & 0x00fffff; - - state->s1 = state->r1 * 5; - state->s2 = state->r2 * 5; - state->s3 = state->r3 * 5; - state->s4 = state->r4 * 5; - - // init state - state->h0 = 0; - state->h1 = 0; - state->h2 = 0; - state->h3 = 0; - state->h4 = 0; - - state->buf_used = 0; - memcpy(state->key, key + 16, sizeof(state->key)); -} - -void boring_poly1305_update(poly1305_state *statep, const uint8_t *in, - size_t in_len) { - unsigned int i; - struct poly1305_state_st *state = poly1305_aligned_state(statep); - - if (state->buf_used) { - unsigned todo = 16 - state->buf_used; - if (todo > in_len) { - todo = (unsigned)in_len; - } - for (i = 0; i < todo; i++) { - state->buf[state->buf_used + i] = in[i]; - } - state->buf_used += todo; - in_len -= todo; - in += todo; - - if (state->buf_used == 16) { - poly1305_update(state, state->buf, 16); - state->buf_used = 0; - } - } - - if (in_len >= 16) { - size_t todo = in_len & ~0xf; - poly1305_update(state, in, todo); - in += todo; - in_len &= 0xf; - } - - if (in_len) { - for (i = 0; i < in_len; i++) { - state->buf[i] = in[i]; - } - state->buf_used = (unsigned)in_len; - } -} - -void boring_poly1305_finish(poly1305_state *statep, uint8_t mac[16]) { - struct poly1305_state_st *state = poly1305_aligned_state(statep); - uint64_t f0, f1, f2, f3; - uint32_t g0, g1, g2, g3, g4; - uint32_t b, nb; - - if (state->buf_used) { - poly1305_update(state, state->buf, state->buf_used); - } - - b = state->h0 >> 26; - state->h0 = state->h0 & 0x3ffffff; - state->h1 += b; - b = state->h1 >> 26; - state->h1 = state->h1 & 0x3ffffff; - state->h2 += b; - b = state->h2 >> 26; - state->h2 = state->h2 & 0x3ffffff; - state->h3 += b; - b = state->h3 >> 26; - state->h3 = state->h3 & 0x3ffffff; - state->h4 += b; - b = state->h4 >> 26; - state->h4 = state->h4 & 0x3ffffff; - state->h0 += b * 5; - - g0 = state->h0 + 5; - b = g0 >> 26; - g0 &= 0x3ffffff; - g1 = state->h1 + b; - b = g1 >> 26; - g1 &= 0x3ffffff; - g2 = state->h2 + b; - b = g2 >> 26; - g2 &= 0x3ffffff; - g3 = state->h3 + b; - b = g3 >> 26; - g3 &= 0x3ffffff; - g4 = state->h4 + b - (1 << 26); - - b = (g4 >> 31) - 1; - nb = ~b; - state->h0 = (state->h0 & nb) | (g0 & b); - state->h1 = (state->h1 & nb) | (g1 & b); - state->h2 = (state->h2 & nb) | (g2 & b); - state->h3 = (state->h3 & nb) | (g3 & b); - state->h4 = (state->h4 & nb) | (g4 & b); - - f0 = ((state->h0) | (state->h1 << 26)) + (uint64_t)U8TO32_LE(&state->key[0]); - f1 = ((state->h1 >> 6) | (state->h2 << 20)) + - (uint64_t)U8TO32_LE(&state->key[4]); - f2 = ((state->h2 >> 12) | (state->h3 << 14)) + - (uint64_t)U8TO32_LE(&state->key[8]); - f3 = ((state->h3 >> 18) | (state->h4 << 8)) + - (uint64_t)U8TO32_LE(&state->key[12]); - - U32TO8_LE(&mac[0], f0); - f1 += (f0 >> 32); - U32TO8_LE(&mac[4], f1); - f2 += (f1 >> 32); - U32TO8_LE(&mac[8], f2); - f3 += (f2 >> 32); - U32TO8_LE(&mac[12], f3); -} diff --git a/tests/rvv_bench/_include/thirdparty/boring.h b/tests/rvv_bench/_include/thirdparty/boring.h deleted file mode 100644 index 3fb2300b6..000000000 --- a/tests/rvv_bench/_include/thirdparty/boring.h +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2014, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include -#include - -void boring_chacha20(uint8_t *out, const uint8_t *in, - size_t in_len, const uint8_t key[32], - const uint8_t nonce[12], uint32_t counter); - -typedef uint8_t poly1305_state[512]; - -void boring_poly1305_init(poly1305_state *state, - const uint8_t key[32]); - -void boring_poly1305_update(poly1305_state *state, - const uint8_t *in, size_t in_len); - -void boring_poly1305_finish(poly1305_state *state, - uint8_t mac[16]); diff --git a/tests/rvv_bench/_include/thirdparty/rvv-rollback.S b/tests/rvv_bench/_include/thirdparty/rvv-rollback.S deleted file mode 100644 index e941604bb..000000000 --- a/tests/rvv_bench/_include/thirdparty/rvv-rollback.S +++ /dev/null @@ -1,255 +0,0 @@ -# rvv-rollback.S -- A minimal benchmarking library -# Olaf Bernstein -# Distributed under the MIT license, see license at the end of the file. -# New versions available at https://gist.github.com/camel-cdr/cfd9ba2b8754b521edf4892fe19c7031 -# Conversions taken from https://github.com/RISCVtestbed/rvv-rollback - -.macro vle32.v a:vararg - vlw.v \a -.endm -.macro vle16.v a:vararg - vlh.v \a -.endm -.macro vle8.v a:vararg - vlb.v \a -.endm -.macro vle32ff.v a:vararg - vlwff.v \a -.endm -.macro vle16ff.v a:vararg - vlhff.v \a -.endm -.macro vle8ff.v a:vararg - vlbff.v \a -.endm -.macro vse32.v a:vararg - vsw.v \a -.endm -.macro vse16.v a:vararg - vsh.v \a -.endm -.macro vse8.v a:vararg - vsb.v \a -.endm -.macro vluxei32.v a:vararg - vlxw.v \a -.endm -.macro vluxei16.v a:vararg - vlxh.v \a -.endm -.macro vluxei8.v a:vararg - vlxb.v \a -.endm -.macro vsuxei32.v a:vararg - vsuxw.v \a -.endm -.macro vsuxei16.v a:vararg - vsuxh.v \a -.endm -.macro vsuxei8.v a:vararg - vsuxb.v \a -.endm -.macro vlse32.v a:vararg - vlsw.v \a -.endm -.macro vlse16.v a:vararg - vlsh.v \a -.endm -.macro vlse8.v a:vararg - vlsb.v \a -.endm -.macro vsse32.v a:vararg - vssw.v \a -.endm -.macro vsse16.v a:vararg - vssh.v \a -.endm -.macro vsse8.v a:vararg - vssb.v \a -.endm -.macro vloxei32.v a:vararg - vlxw.v \a -.endm -.macro vloxei16.v a:vararg - vlxh.v \a -.endm -.macro vloxei8.v a:vararg - vlxb.v \a -.endm -.macro vsoxei32.v a:vararg - vsxw.v \a -.endm -.macro vsoxei16.v a:vararg - vsxh.v \a -.endm -.macro vsoxei8.v a:vararg - vsxb.v \a -.endm -.macro vfncvt.xu.f.w a:vararg - vfncvt.xu.f.v \a -.endm -.macro vfncvt.x.f.w a:vararg - vfncvt.x.f.v \a -.endm -.macro vfncvt.f.xu.w a:vararg - vfncvt.f.xu.v \a -.endm -.macro vfncvt.f.x.w a:vararg - vfncvt.f.x.v \a -.endm -.macro vfncvt.f.f.w a:vararg - vfncvt.f.f.v \a -.endm -.macro vfredusum a:vararg - vfredsum \a -.endm -.macro vfwredusum.vs a:vararg - vfwredsum.vs \a -.endm -.macro vnclip.wv a:vararg - vnclip.vv \a -.endm -.macro vnclip.wx a:vararg - vnclip.vx \a -.endm -.macro vnclip.wi a:vararg - vnclip.vi \a -.endm -.macro vnclipu.wv a:vararg - vnclipu.vv \a -.endm -.macro vnclipu.wx a:vararg - vnclipu.vx \a -.endm -.macro vnclipu.wi a:vararg - vnclipu.vi \a -.endm -.macro vnsra.wv a:vararg - vnsra.vv \a -.endm -.macro vnsra.wx a:vararg - vnsra.vx \a -.endm -.macro vnsra.wi a:vararg - vnsra.vi \a -.endm -.macro vnsrl.wv a:vararg - vnsrl.vv \a -.endm -.macro vnsrl.wx a:vararg - vnsrl.vx \a -.endm -.macro vnsrl.wi a:vararg - vnsrl.vi \a -.endm -.macro vmandn.mm a:vararg - vmandnot.mm \a -.endm -.macro vmorn.mm a:vararg - vmornot.mm \a -.endm -.macro vmmv.m a:vararg - vmcpy.m \a -.endm -.macro vcpop.m a:vararg - vmpopc.m \a -.endm -.macro vpop.m a:vararg - vmpopc.m \a -.endm -.macro vfirst.m a:vararg - vmfirst.m \a -.endm - -.macro define_for_all_nf prefix suffix prefix2 suffix2 - .macro \prefix\()2\suffix a:vararg - \prefix2\()2\suffix2 \a - .endm - .macro \prefix\()3\suffix a:vararg - \prefix2\()3\suffix2 \a - .endm - .macro \prefix\()4\suffix a:vararg - \prefix2\()4\suffix2 \a - .endm - .macro \prefix\()5\suffix a:vararg - \prefix2\()5\suffix2 \a - .endm - .macro \prefix\()6\suffix a:vararg - \prefix2\()6\suffix2 \a - .endm - .macro \prefix\()7\suffix a:vararg - \prefix2\()7\suffix2 \a - .endm - .macro \prefix\()8\suffix a:vararg - \prefix2\()8\suffix2 \a - .endm -.endm -define_for_all_nf vlseg e8.v vlseg b.v -define_for_all_nf vlseg e16.v vlseg h.v -define_for_all_nf vlseg e32.v vlseg w.v - -define_for_all_nf vsseg e8.v vsseg b.v -define_for_all_nf vsseg e16.v vsseg h.v -define_for_all_nf vsseg e32.v vsseg w.v - -define_for_all_nf vlsseg e8.v vlsseg bu.v -define_for_all_nf vlsseg e16.v vlsseg hu.v -define_for_all_nf vlsseg e32.v vlsseg wu.v - -define_for_all_nf vssseg e8.v vssseg b.v -define_for_all_nf vssseg e16.v vssseg h.v -define_for_all_nf vssseg e32.v vssseg w.v - -define_for_all_nf vloxseg e8.v vlxseg b.v -define_for_all_nf vloxseg e16.v vlxseg h.v -define_for_all_nf vloxseg e32.v vlxseg w.v -define_for_all_nf vluxseg e8.v vlxseg b.v -define_for_all_nf vluxseg e16.v vlxseg h.v -define_for_all_nf vluxseg e32.v vlxseg w.v - -define_for_all_nf vsoxseg e8.v vsxseg b.v -define_for_all_nf vsoxseg e16.v vsxseg h.v -define_for_all_nf vsoxseg e32.v vsxseg w.v -define_for_all_nf vsuxseg e8.v vsxseg b.v -define_for_all_nf vsuxseg e16.v vsxseg h.v -define_for_all_nf vsuxseg e32.v vsxseg w.v - - -.macro vsetvl0p7 rd, rs1, rs2, T=1, M=1 - vsetvl \rd, \rs1, \rs2 -.endm -.macro vsetvli0p7 rd, rs1, e=e8, m=m1, T=1, M=1 - .ifc \m, mf2 - NOT SUPPORTED IN rvv0.7 - .endif - .ifc \m, mf4 - NOT SUPPORTED IN rvv0.7 - .endif - .ifc \m, mf8 - NOT SUPPORTED IN rvv0.7 - .endif - vsetvli \rd, \rs1, \e, \m -.endm - -#define vsetvl vsetvl0p7 -#define vsetvli vsetvli0p7 - - - -# Copyright (c) 2023 Olaf Berstein -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - diff --git a/tests/rvv_bench/ascii_to_utf16/ascii_to_utf16.S b/tests/rvv_bench/ascii_to_utf16/ascii_to_utf16.S deleted file mode 100644 index b363d7830..000000000 --- a/tests/rvv_bench/ascii_to_utf16/ascii_to_utf16.S +++ /dev/null @@ -1,68 +0,0 @@ -#ifdef MX - -#if MX_N == 4 || MX_N == 2 || MX_N == 1 - -.global MX(ascii_to_utf16_rvv_vsseg_) -.type MX(ascii_to_utf16_rvv_vsseg_), @function -MX(ascii_to_utf16_rvv_vsseg_): - vsetvli t0, x0, e8, MX2(), ta, ma - vmv.v.i v0, 0 -1: - vsetvli t0, a2, e8, MX(), ta, ma - vle8.v v0, (a1) - vsseg2e8.v v0, (a0) - add a1, a1, t0 - sub a2, a2, t0 - slli t0, t0, 1 - add a0, a0, t0 - bnez a2, 1b - ret - - - -.global MX(ascii_to_utf16_rvv_ext_) -.type MX(ascii_to_utf16_rvv_ext_), @function -MX(ascii_to_utf16_rvv_ext_): -1: - vsetvli t0, a2, e8, MX(), ta, ma - vle8.v v0, (a1) -#if HAS_RVV_1_0 - vsetvli x0, x0, e16, MX2(), ta, ma - vzext.vf2 v8, v0 -#else - vwaddu.vx v8, v0, x0 - vsetvli x0, a2, e16, MX2(), ta, ma -#endif - vse16.v v8, (a0) - add a1, a1, t0 - sub a2, a2, t0 - slli t0, t0, 1 - add a0, a0, t0 - bnez a2, 1b - ret - - -.global MX(ascii_to_utf16_rvv_vss_) -.type MX(ascii_to_utf16_rvv_vss_), @function -MX(ascii_to_utf16_rvv_vss_): - vsetvli t0, x0, e8, MX2(), ta, ma - vmv.v.i v0, 0 - li a3, 2 -1: - vsetvli t0, a2, e16, MX2(), ta, ma - vse16.v v0, (a0) - - vsetvli t0, a2, e8, MX(), ta, ma - vle8.v v8, (a1) - vsse8.v v8, (a0), a3 - - add a1, a1, t0 - sub a2, a2, t0 - slli t0, t0, 1 - add a0, a0, t0 - bnez a2, 1b - ret - -#endif -#endif - diff --git a/tests/rvv_bench/ascii_to_utf16/ascii_to_utf16.c b/tests/rvv_bench/ascii_to_utf16/ascii_to_utf16.c deleted file mode 100644 index fc3fba747..000000000 --- a/tests/rvv_bench/ascii_to_utf16/ascii_to_utf16.c +++ /dev/null @@ -1,63 +0,0 @@ -#include "bench.h" - -void -ascii_to_utf16_scalar(uint16_t *restrict dest, uint8_t const *restrict src, size_t len) -{ - while (len--) *dest++ = *src++, BENCH_CLOBBER(); -} - -void -ascii_to_utf16_scalar_autovec(uint16_t *restrict dest, uint8_t const *restrict src, size_t len) -{ - while (len--) *dest++ = *src++; -} - -#define IMPLS(f) \ - f(scalar) f(scalar_autovec) \ - f(rvv_ext_m1) f(rvv_ext_m2) f(rvv_ext_m4) \ - f(rvv_vsseg_m1) f(rvv_vsseg_m2) f(rvv_vsseg_m4) \ - f(rvv_vss_m1) f(rvv_vss_m2) f(rvv_vss_m4) \ - -typedef void Func(uint16_t *restrict dest, uint8_t const *restrict src, size_t len); - -#define DECLARE(f) extern Func ascii_to_utf16_##f; -IMPLS(DECLARE) - -#define EXTRACT(f) { #f, &ascii_to_utf16_##f }, -Impl impls[] = { IMPLS(EXTRACT) }; - -uint16_t *dest; -uint8_t *src; - -void init(void) { } - -ux checksum(size_t n) { - ux sum = 0; - for (size_t i = 0; i < n+9; ++i) - sum = uhash(sum) + dest[i]; - return sum; -} - -void common(size_t n, size_t dOff, size_t sOff) { - dest = (uint16_t*)mem + dOff/2; - src = (uint8_t*)(dest + 9 + MAX_MEM/3) + sOff; - memrand(src, n+9); - for (size_t i = 0; i < n+9; ++i) src[i] |= 0x7F; - memset(dest, 1, (n+9)*2); -} - -BENCH(base) { - common(n, urand() & 255, urand() & 255); - TIME f(dest, src, n); -} BENCH_END - -BENCH(aligned) { - common(n, 0, 0); - TIME f(dest, src, n); -} BENCH_END - -Bench benches[] = { - { MAX_MEM/3 - 512-9*2, "ascii to utf16", bench_base }, - { MAX_MEM/3 - 512-9*2, "ascii to utf16 aligned", bench_aligned }, -}; BENCH_MAIN(impls, benches) - diff --git a/tests/rvv_bench/ascii_to_utf32/ascii_to_utf32.S b/tests/rvv_bench/ascii_to_utf32/ascii_to_utf32.S deleted file mode 100644 index 9cf21fad3..000000000 --- a/tests/rvv_bench/ascii_to_utf32/ascii_to_utf32.S +++ /dev/null @@ -1,66 +0,0 @@ -#ifdef MX - -#if MX_N == 2 || MX_N == 1 - -.global MX(ascii_to_utf32_rvv_vsseg_) -MX(ascii_to_utf32_rvv_vsseg_): - vsetvli t0, x0, e8, MX4(), ta, ma - vmv.v.i v0, 0 -1: - vsetvli t0, a2, e8, MX(), ta, ma - vle8.v v0, (a1) - vsseg4e8.v v0, (a0) - add a1, a1, t0 - sub a2, a2, t0 - slli t0, t0, 2 - add a0, a0, t0 - bnez a2, 1b - ret - - -.global MX(ascii_to_utf32_rvv_ext_) -MX(ascii_to_utf32_rvv_ext_): -1: - vsetvli t0, a2, e8, MX(), ta, ma - vle8.v v0, (a1) -#if HAS_RVV_1_0 - vsetvli x0, x0, e32, MX4(), ta, ma - vzext.vf4 v8, v0 -#else - vwaddu.vx v16, v0, x0 - vsetvli x0, a2, e16, MX2(), ta, ma - vwaddu.vx v8, v16, x0 - vsetvli x0, a2, e32, MX4(), ta, ma -#endif - vse32.v v8, (a0) - add a1, a1, t0 - sub a2, a2, t0 - slli t0, t0, 2 - add a0, a0, t0 - bnez a2, 1b - ret - - -.global MX(ascii_to_utf32_rvv_vss_) -MX(ascii_to_utf32_rvv_vss_): - vsetvli t0, x0, e8, MX4(), ta, ma - vmv.v.i v0, 0 - li a3, 4 -1: - vsetvli t0, a2, e32, MX4(), ta, ma - vse32.v v0, (a0) - - vsetvli t0, a2, e8, MX(), ta, ma - vle8.v v8, (a1) - vsse8.v v8, (a0), a3 - - add a1, a1, t0 - sub a2, a2, t0 - slli t0, t0, 2 - add a0, a0, t0 - bnez a2, 1b - ret - -#endif -#endif - diff --git a/tests/rvv_bench/ascii_to_utf32/ascii_to_utf32.c b/tests/rvv_bench/ascii_to_utf32/ascii_to_utf32.c deleted file mode 100644 index 968493037..000000000 --- a/tests/rvv_bench/ascii_to_utf32/ascii_to_utf32.c +++ /dev/null @@ -1,63 +0,0 @@ -#include "bench.h" - -void -ascii_to_utf32_scalar(uint32_t *restrict dest, uint8_t const *restrict src, size_t len) -{ - while (len--) *dest++ = *src++, BENCH_CLOBBER(); -} - -void -ascii_to_utf32_scalar_autovec(uint32_t *restrict dest, uint8_t const *restrict src, size_t len) -{ - while (len--) *dest++ = *src++; -} - -#define IMPLS(f) \ - f(scalar) f(scalar_autovec) \ - f(rvv_ext_m1) f(rvv_ext_m2) \ - f(rvv_vsseg_m1) f(rvv_vsseg_m2) \ - f(rvv_vss_m1) f(rvv_vss_m2) \ - -typedef void Func(uint32_t *restrict dest, uint8_t const *restrict src, size_t len); - -#define DECLARE(f) extern Func ascii_to_utf32_##f; -IMPLS(DECLARE) - -#define EXTRACT(f) { #f, &ascii_to_utf32_##f }, -Impl impls[] = { IMPLS(EXTRACT) }; - -uint32_t *dest; -uint8_t *src; - -void init(void) { } - -ux checksum(size_t n) { - ux sum = 0; - for (size_t i = 0; i < n+9; ++i) - sum = uhash(sum) + dest[i]; - return sum; -} - -void common(size_t n, size_t dOff, size_t sOff) { - dest = (uint32_t*)mem + dOff/4; - src = (uint8_t*)(dest + 9 + MAX_MEM/5) + sOff; - memrand(src, n+9); - for (size_t i = 0; i < n+9; ++i) src[i] |= 0x7F; - memset(dest, 1, (n+9)*4); -} - -BENCH(base) { - common(n, urand() & 255, urand() & 255); - TIME f(dest, src, n); -} BENCH_END - -BENCH(aligned) { - common(n, 0, 0); - TIME f(dest, src, n); -} BENCH_END - -Bench benches[] = { - { MAX_MEM/5 - 512-9*2, "ascii to utf32", bench_base }, - { MAX_MEM/5 - 512-9*2, "ascii to utf32 aligned", bench_aligned }, -}; BENCH_MAIN(impls, benches) - diff --git a/tests/rvv_bench/byteswap/byteswap.S b/tests/rvv_bench/byteswap/byteswap.S deleted file mode 100644 index 79154ef68..000000000 --- a/tests/rvv_bench/byteswap/byteswap.S +++ /dev/null @@ -1,81 +0,0 @@ -/* - * TODO: This currently only works for VLEN<=256. - * I think rvv 1.0 should only vrgatherei16.vv here in the future. - */ - -#ifdef MX - - -# a0 = ptr, a1 = len -.global MX(byteswap32_rvv_gather_) -MX(byteswap32_rvv_gather_): - vsetvli t0, x0, e8, MX(), ta, ma - vid.v v0 - vand.vi v8, v0, 3 - vrsub.vi v8, v8, 3 - vsrl.vi v0, v0, 2 - vsll.vi v0, v0, 2 - vadd.vv v0, v0, v8 # i/8*8 + (7-1%8) -1: - vsetvli t0, a1, e32, MX(), ta, ma - vle32.v v8, (a0) - slli t1, t0, 2 - vsetvli x0, t1, e8, MX(), ta, ma - vrgather.vv v16, v8, v0 - vsetvli x0, t0, e32, MX(), ta, ma - vse32.v v16, (a0) - sub a1, a1, t0 - add a0, a0, t1 - bnez a1, 1b - ret -#endif - -#if MX_N == 2 - -.macro byteswap32_rvv_m1_gathers n - .global byteswap32_rvv_m1_gathers_m\n - byteswap32_rvv_m1_gathers_m\n: - vsetvli t0, x0, e8, m1, ta, ma - vid.v v0 - vand.vi v8, v0, 3 - vrsub.vi v8, v8, 3 - vsrl.vi v0, v0, 2 - vsll.vi v0, v0, 2 - vadd.vv v0, v0, v8 # i/8*8 + (7-1%8) - 1: - vsetvli t0, a1, e32, m\n, ta, ma - vle32.v v8, (a0) - vsetvli t1, x0, e8, m1, ta, ma - vrgather.vv v16, v8, v0 - .ifge \n-2 - vrgather.vv v17, v9, v0 - .ifge \n-4 - vrgather.vv v18, v10, v0 - vrgather.vv v19, v11, v0 - .ifge \n-8 - vrgather.vv v20, v12, v0 - vrgather.vv v21, v13, v0 - vrgather.vv v22, v14, v0 - vrgather.vv v23, v15, v0 - .endif - .endif - .endif - vsetvli x0, t0, e32, m\n, ta, ma - vse32.v v16, (a0) - sub a1, a1, t0 - slli t0, t0, 2 - add a0, a0, t0 - bnez a1, 1b - ret -.endm - -byteswap32_rvv_m1_gathers 2 -#endif -#if MX_N == 4 -byteswap32_rvv_m1_gathers 4 -#endif -#if MX_N == 8 -byteswap32_rvv_m1_gathers 8 -#endif - - diff --git a/tests/rvv_bench/byteswap/byteswap.c b/tests/rvv_bench/byteswap/byteswap.c deleted file mode 100644 index dff204b72..000000000 --- a/tests/rvv_bench/byteswap/byteswap.c +++ /dev/null @@ -1,79 +0,0 @@ -#include "bench.h" - -void -byteswap32_scalar(uint32_t *ptr, size_t n) -{ - for (uint8_t *p = (uint8_t*)ptr; n--; p += 4) { - uint8_t p0 = p[0], p1 = p[1], p2 = p[2], p3 = p[3]; - p[3] = p0; BENCH_CLOBBER(); - p[2] = p1; BENCH_CLOBBER(); - p[1] = p2; BENCH_CLOBBER(); - p[0] = p3; BENCH_CLOBBER(); - } -} - -void -byteswap32_scalar_autovec(uint32_t *ptr, size_t n) -{ - for (uint8_t *p = (uint8_t*)ptr; n--; p += 4) { - uint8_t p0 = p[0], p1 = p[1], p2 = p[2], p3 = p[3]; - p[3] = p0; - p[2] = p1; - p[1] = p2; - p[0] = p3; - } -} - -#if __riscv_zbb -void -byteswap32_SWAR_rev8(uint32_t *ptr, size_t n) -{ - while (n--) { - *ptr = __builtin_bswap32(*ptr); - ++ptr; - BENCH_CLOBBER(); - } -} -#define REV8(f) f(SWAR_rev8) -#else -#define REV8(f) -#endif - - -#define IMPLS(f) \ - f(scalar) \ - f(scalar_autovec) \ - REV8(f) \ - MX(f, rvv_gather) \ - f(rvv_m1_gathers_m2) \ - f(rvv_m1_gathers_m4) \ - f(rvv_m1_gathers_m8) \ - -typedef void Func(uint32_t *ptr, size_t n); - -#define DECLARE(f) extern Func byteswap32_##f; -IMPLS(DECLARE) - -#define EXTRACT(f) { #f, &byteswap32_##f }, -Impl impls[] = { IMPLS(EXTRACT) }; - -uint32_t *ptr; - -void init(void) { ptr = (uint32_t*)mem; } - -ux checksum(size_t n) { - ux sum = 0; - for (size_t i = 0; i < n; ++i) - sum = uhash(sum) + ptr[i]; - return sum; -} - -BENCH(base) { - memrand(ptr, n * sizeof *ptr); - TIME f(ptr, n); -} BENCH_END - -Bench benches[] = { - { MAX_MEM/4, "byteswap32", bench_base } -}; BENCH_MAIN(impls, benches) - diff --git a/tests/rvv_bench/chacha20/chacha20.S b/tests/rvv_bench/chacha20/chacha20.S deleted file mode 100644 index 9c62caeba..000000000 --- a/tests/rvv_bench/chacha20/chacha20.S +++ /dev/null @@ -1,5 +0,0 @@ -#ifndef MX -#if __riscv_xlen >= 64 -#include "rvv-chacha-poly/vchacha.s" -#endif -#endif diff --git a/tests/rvv_bench/chacha20/chacha20.c b/tests/rvv_bench/chacha20/chacha20.c deleted file mode 100644 index 7d6328b54..000000000 --- a/tests/rvv_bench/chacha20/chacha20.c +++ /dev/null @@ -1,61 +0,0 @@ -#include "bench.h" -#if __riscv_xlen >= 64 -#include "../thirdparty/boring.h" - -uint8_t *dest, *src; -uint8_t key[32], nonce[12]; -uint32_t counter; - - -extern void vector_chacha20( - uint8_t *out, const uint8_t *in, - size_t in_len, const uint8_t key[32], - const uint8_t nonce[12], uint32_t counter); - -static void -chacha20_boring(void *restrict dest, void const *restrict src, size_t n) { - boring_chacha20(dest, src, n, key, nonce, counter); -} - -static void -chacha20_rvv(void *restrict dest, void const *restrict src, size_t n) { - vector_chacha20(dest, src, n, key, nonce, counter); -} - -typedef void *Func(void *restrict dest, void const *restrict src, size_t n); - -Impl impls[] = { - { "boring", &chacha20_boring }, - { "rvv", &chacha20_rvv }, -}; - -void init(void) { - memrand(key, sizeof key); - memrand(nonce, sizeof nonce); - counter = 0; -} - -ux checksum(size_t n) { - ux sum = 0; - for (size_t i = 0; i < n+16; ++i) - sum = uhash(sum) + mem[i]; - return sum; -} - -BENCH(aligned) { - memset(mem, 0, n+16); - TIME f(mem, mem + MAX_MEM/2 + 16, n); -} BENCH_END - -Bench benches[] = { - { MAX_MEM/2 - 16, "chacha20 aligned", bench_aligned } -}; BENCH_MAIN(impls, benches) - - -#include "../thirdparty/boring.c" -#else -void init(void) {} -Impl impls[] = {}; -Bench benches[] = {}; -BENCH_MAIN(impls, benches) -#endif diff --git a/tests/rvv_bench/default.nix b/tests/rvv_bench/default.nix index 3d4b86775..71d00ecf7 100644 --- a/tests/rvv_bench/default.nix +++ b/tests/rvv_bench/default.nix @@ -1,39 +1,65 @@ { lib +, fetchFromGitHub , linkerScript , makeBuilder -, findAndBuild , t1main -, makeEmuResult +, isFp }: let - include = ./_include; + src = fetchFromGitHub { + owner = "camel-cdr"; + repo = "rvv-bench"; + rev = "5dc20c3596b3aa8412804e2d169d1b175bae927a"; + hash = "sha256-5A079sl4g7FIWgCYykLgTZXrmyfIblyXtxeh1AwqKiU="; + fetchSubmodules = true; + }; + + nonFpCases = [ + "ascii_to_utf16" + "ascii_to_utf32" + "byteswap" + "chacha20" + "memcpy" + "memset" + "mergelines" + "poly1305" + "strlen" + "utf8_count" + ]; + + fpCases = [ + "mandelbrot" + ]; + + cases = nonFpCases ++ (lib.optionals isFp fpCases); + builder = makeBuilder { casePrefix = "rvv_bench"; }; - build = { caseName, sourcePath }: + build = caseName: let drv = builder { - inherit caseName; + inherit caseName src; - src = sourcePath; + isFp = lib.elem caseName fpCases; - isFp = lib.pathExists (lib.path.append sourcePath "isFp"); + patches = [ ./t1_runtime.patch ]; buildPhase = '' runHook preBuild + pushd bench >/dev/null - $CC -E -DINC=$PWD/${caseName}.S -E ${include}/template.S -o functions.S - $CC -I${include} ${caseName}.c -T${linkerScript} ${t1main} functions.S -o $pname.elf + $CC -E -DINC=$PWD/${caseName}.S template.S -E -o functions.S + $CC ${caseName}.c -T${linkerScript} ${t1main} functions.S -o ../$pname.elf + popd >/dev/null runHook postBuild ''; - meta.description = "test case '${caseName}', written in C intrinsic"; - - passthru.emu-result = makeEmuResult drv; + meta.description = "test case '${caseName}' from rvv-bench"; }; in drv; in -findAndBuild ./. build +lib.genAttrs cases build diff --git a/tests/rvv_bench/mandelbrot/isFp b/tests/rvv_bench/mandelbrot/isFp deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/rvv_bench/mandelbrot/mandelbrot.S b/tests/rvv_bench/mandelbrot/mandelbrot.S deleted file mode 100644 index 55224666a..000000000 --- a/tests/rvv_bench/mandelbrot/mandelbrot.S +++ /dev/null @@ -1,358 +0,0 @@ -#if 0 - -void -mandelbrot_rvv(size_t width, size_t maxIter, uint32_t *res) -{ - vfloat32m2_t cx, cy, zx, zy, zx2, zy2; - vuint32m2_t viter; - vbool16_t mask; - - for (size_t y = 0; y < width; ++y) { - size_t vl, x = width; - while (x > 0) { - x -= vl = __riscv_vsetvl_e32m2(x); - - mask = __riscv_vmset_m_b16(vl); - viter = __riscv_vmv_v_x_u32m2(0, vl); - - cx = __riscv_vfcvt_f_xu_v_f32m2(__riscv_vadd_vx_u32m2(__riscv_viota_m_u32m2(mask, vl), x, vl), vl); - cy = __riscv_vfmv_v_f_f32m2(y, vl); - - cx = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vf_f32m2(cx, 2.0f / width, vl), -1.5f, vl); - cy = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vf_f32m2(cy, 2.0f / width, vl), -1, vl); - - zx = zy = zx2 = zy2 = __riscv_vfmv_v_f_f32m2(0, vl); - - size_t iter = 0; - while (iter < maxIter && __riscv_vfirst_m_b16(mask, vl) >= 0) { - mask = __riscv_vmflt_vf_f32m2_b16(__riscv_vfadd_vv_f32m2(zx2, zy2, vl), 4, vl); - zx2 = __riscv_vfadd_vv_f32m2(__riscv_vfsub_vv_f32m2(zx2, zy2, vl), cx, vl); - zy = __riscv_vfmacc_vv_f32m2(cy, __riscv_vfadd_vv_f32m2(zx, zx, vl), zy, vl); - zx = zx2; - zx2 = __riscv_vfmul_vv_f32m2(zx, zx, vl); - zy2 = __riscv_vfmul_vv_f32m2(zy, zy, vl); - ++iter; - viter = __riscv_vmerge_vxm_u32m2(viter, iter, mask, vl); - } - __riscv_vse32_v_u32m2(res + x, viter, vl); - } - res += width; - } -} - -#endif - -#if MX_N > 0 && MX_N <= 2 - -#if HAS_F16 -.global MX(mandelbrot_rvv_f16_) # generated by clang -MX(rvv_f16_m1p5): - .half 0xbe00 # half -1.5 -MX(rvv_f16_m1): - .half 0xbc00 # half -1 -MX(rvv_f16_p4): - .half 0x4400 # half 4 -MX(mandelbrot_rvv_f16_): - beqz a0, MX(rvv_f16_13) - beqz a1, MX(rvv_f16_9) - li a7, 0 - fcvt.s.wu fa2, a0 - lui a3, 262144 - fmv.w.x fa1, a3 - la a3, MX(rvv_f16_m1p5) - flh fa5, (a3) - la a3, MX(rvv_f16_m1) - flh fa4, (a3) - la a3, MX(rvv_f16_p4) - flh fa3, (a3) - fdiv.s fa2, fa1, fa2 - fcvt.h.s fa2, fa2 - slli a6, a0, 2 - j MX(rvv_f16_4) -MX(rvv_f16_3): - addi a7, a7, 1 - add a2, a2, a6 - beq a7, a0, MX(rvv_f16_13) -MX(rvv_f16_4): - fcvt.s.wu fa1, a7 - fcvt.h.s fa1, fa1 - mv t0, a0 - j MX(rvv_f16_6) -MX(rvv_f16_5): - slli a3, t0, 2 - add a3, a3, a2 - vsetvli zero, zero, e32, MX2(), ta, ma - vse32.v v8, (a3) - beqz t0, MX(rvv_f16_3) -MX(rvv_f16_6): - vsetvli a3, t0, e32, MX2(), ta, ma - sub t0, t0, a3 - vmset.m v0 - vmv.v.i v8, 0 - vsetvli zero, zero, e16, MX(), ta, ma - viota.m v12, v0 - vadd.vx v12, v12, t0 - vfcvt.f.xu.v v12, v12 - vfmv.v.f v14, fa1 - vfmul.vf v12, v12, fa2 - vfadd.vf v12, v12, fa5 - vfmul.vf v14, v14, fa2 - vfadd.vf v14, v14, fa4 - vmv.v.i v20, 0 - li a4, 1 - mv a3, a1 - vmv.v.i v16, 0 - vmv.v.i v18, 0 - vmv.v.i v22, 0 -MX(rvv_f16_7): -#if HAS_RVV_1_0 || MX_N >= 2 - vsetvli zero, zero, e8, MXf2(), ta, ma -#else - vsetvli zero, zero, e8, m1, ta, ma -#endif - vfirst.m a5, v0 - bltz a5, MX(rvv_f16_5) - vsetvli zero, zero, e16, MX(), ta, ma - vfadd.vv v24, v18, v22 - vmflt.vf v0, v24, fa3 - vfsub.vv v18, v18, v22 - vfadd.vv v20, v20, v20 - vfadd.vv v24, v18, v12 - vfmadd.vv v16, v20, v14 - vfmul.vv v18, v24, v24 - vfmul.vv v22, v16, v16 - vsetvli zero, zero, e32, MX2(), ta, ma - vmerge.vxm v8, v8, a4, v0 - addi a3, a3, -1 - addi a4, a4, 1 -#if HAS_RVV_1_0 - vmv2r.v v20, v24 -#else - vsetvli zero, zero, e32, m2 - vmv.v.v v20, v24 -#endif - bnez a3, MX(rvv_f16_7) - j MX(rvv_f16_5) -MX(rvv_f16_9): - slli a3, a0, 2 -MX(rvv_f16_10): - mv a4, a0 -MX(rvv_f16_11): - vsetvli a5, a4, e32, MX2(), ta, ma - sub a4, a4, a5 - vmv.v.i v8, 0 - slli a5, a4, 2 - add a5, a5, a2 - vse32.v v8, (a5) - bnez a4, MX(rvv_f16_11) - addi a1, a1, 1 - add a2, a2, a3 - bne a1, a0, MX(rvv_f16_10) -MX(rvv_f16_13): - ret -#endif - - -.global MX(mandelbrot_rvv_f32_) # generated by clang -MX(mandelbrot_rvv_f32_): - beqz a0, MX(rvv_f32_13) - beqz a1, MX(rvv_f32_9) - li a7, 0 - fcvt.s.wu fa5, a0 - lui a3, 262144 - fmv.w.x fa4, a3 - fdiv.s fa5, fa4, fa5 - lui a3, 785408 - fmv.w.x fa4, a3 - lui a3, 784384 - fmv.w.x fa3, a3 - lui a3, 264192 - fmv.w.x fa2, a3 - slli a6, a0, 2 - j MX(rvv_f32_4) -MX(rvv_f32_3): - addi a7, a7, 1 - add a2, a2, a6 - beq a7, a0, MX(rvv_f32_13) -MX(rvv_f32_4): - fcvt.s.wu fa1, a7 - mv t0, a0 - j MX(rvv_f32_6) -MX(rvv_f32_5): - slli a3, t0, 2 - add a3, a3, a2 - vsetvli zero, zero, e32, MX(), ta, ma - vse32.v v8, (a3) - beqz t0, MX(rvv_f32_3) -MX(rvv_f32_6): - vsetvli t1, t0, e32, MX(), ta, ma - sub t0, t0, t1 - vmset.m v0 - vmv.v.i v8, 0 - viota.m v10, v0 - vadd.vx v10, v10, t0 - vfcvt.f.xu.v v10, v10 - vfmv.v.f v12, fa1 - vfmul.vf v10, v10, fa5 - vfadd.vf v10, v10, fa4 - vfmul.vf v12, v12, fa5 - vfadd.vf v12, v12, fa3 - vmv.v.i v18, 0 - li a3, 1 - mv a5, a1 - vmv.v.i v14, 0 - vmv.v.i v16, 0 - vmv.v.i v20, 0 -MX(rvv_f32_7): -#if HAS_RVV_1_0 - vsetvli zero, t1, e8, MXf4(), ta, ma -#else - vsetvli zero, t1, e8, m1, ta, ma -#endif - vfirst.m a4, v0 - bltz a4, MX(rvv_f32_5) - vsetvli zero, zero, e32, MX(), ta, ma - vfadd.vv v22, v16, v20 - vmflt.vf v0, v22, fa2 - vfsub.vv v16, v16, v20 - vfadd.vv v18, v18, v18 - vfadd.vv v22, v16, v10 - vfmadd.vv v14, v18, v12 - vfmul.vv v16, v22, v22 - vfmul.vv v20, v14, v14 - vmerge.vxm v8, v8, a3, v0 - addi a5, a5, -1 - addi a3, a3, 1 - vmv.v.v v18, v22 - bnez a5, MX(rvv_f32_7) - j MX(rvv_f32_5) -MX(rvv_f32_9): - slli a3, a0, 2 -MX(rvv_f32_10): - mv a4, a0 -MX(rvv_f32_11): - vsetvli a5, a4, e32, MX(), ta, ma - sub a4, a4, a5 - vmv.v.i v8, 0 - slli a5, a4, 2 - add a5, a5, a2 - vse32.v v8, (a5) - bnez a4, MX(rvv_f32_11) - addi a1, a1, 1 - add a2, a2, a3 - bne a1, a0, MX(rvv_f32_10) -MX(rvv_f32_13): - ret - -#endif - -#if MX_N == 2 && HAS_E64 - -.global MX(mandelbrot_rvv_f64_) # generated by clang -MX(rvv_f64_m1p5): - .quad 0xbff8000000000000 # double -1.5 -MX(rvv_f64_m1): - .quad 0xbff0000000000000 # double -1 -MX(rvv_f64_p4): - .quad 0x4010000000000000 # double 4 -MX(mandelbrot_rvv_f64_): - beqz a0, MX(rvv_f64_13) - beqz a1, MX(rvv_f64_9) - li a7, 0 - fcvt.s.wu fa2, a0 - lui a3, 262144 - fmv.w.x fa1, a3 - la a3, MX(rvv_f64_m1p5) - fld fa5, (a3) - la a3, MX(rvv_f64_m1) - fld fa4, (a3) - la a3, MX(rvv_f64_p4) - fld fa3, (a3) - fdiv.s fa2, fa1, fa2 - fcvt.d.s fa2, fa2 - slli a6, a0, 2 - j MX(rvv_f64_4) -MX(rvv_f64_3): - addi a7, a7, 1 - add a2, a2, a6 - beq a7, a0, MX(rvv_f64_13) -MX(rvv_f64_4): - fcvt.d.wu fa1, a7 - mv t0, a0 - j MX(rvv_f64_6) -MX(rvv_f64_5): - slli a3, t0, 2 - add a3, a3, a2 - vsetvli zero, zero, e32, m1, ta, ma - vse32.v v8, (a3) - beqz t0, MX(rvv_f64_3) -MX(rvv_f64_6): - vsetvli a3, t0, e32, m1, ta, ma - sub t0, t0, a3 - vmset.m v0 - vmv.v.i v8, 0 - vsetvli zero, zero, e64, m2, ta, ma - viota.m v10, v0 - vadd.vx v10, v10, t0 - vfcvt.f.xu.v v10, v10 - vfmv.v.f v12, fa1 - vfmul.vf v10, v10, fa2 - vfadd.vf v10, v10, fa5 - vfmul.vf v12, v12, fa2 - vfadd.vf v12, v12, fa4 - vmv.v.i v18, 0 - li a4, 1 - mv a3, a1 - vmv.v.i v14, 0 - vmv.v.i v16, 0 - vmv.v.i v20, 0 -MX(rvv_f64_7): -#if HAS_RVV_1_0 - vsetvli zero, zero, e8, MXf8(), ta, ma -#else - vsetvli zero, t1, e8, m1, ta, ma -#endif - vfirst.m a5, v0 - bltz a5, MX(rvv_f64_5) - vsetvli zero, zero, e64, m2, ta, ma - vfadd.vv v22, v16, v20 - vmflt.vf v0, v22, fa3 - vfsub.vv v16, v16, v20 - vfadd.vv v18, v18, v18 - vfadd.vv v22, v16, v10 - vfmadd.vv v14, v18, v12 - vfmul.vv v16, v22, v22 - vfmul.vv v20, v14, v14 - vsetvli zero, zero, e32, m1, ta, ma - vmerge.vxm v8, v8, a4, v0 - addi a3, a3, -1 - addi a4, a4, 1 -#if HAS_RVV_1_0 - vmv2r.v v18, v22 -#else - vsetvli zero, zero, e32, m2 - vmv.v.v v18, v22 -#endif - bnez a3, MX(rvv_f64_7) - j MX(rvv_f64_5) -MX(rvv_f64_9): - slli a3, a0, 2 -MX(rvv_f64_10): - mv a4, a0 -MX(rvv_f64_11): - vsetvli a5, a4, e32, m1, ta, ma - sub a4, a4, a5 - vmv.v.i v8, 0 - slli a5, a4, 2 - add a5, a5, a2 - vse32.v v8, (a5) - bnez a4, MX(rvv_f64_11) - addi a1, a1, 1 - add a2, a2, a3 - bne a1, a0, MX(rvv_f64_10) -MX(rvv_f64_13): - ret - -#endif - - diff --git a/tests/rvv_bench/mandelbrot/mandelbrot.c b/tests/rvv_bench/mandelbrot/mandelbrot.c deleted file mode 100644 index f182eba0f..000000000 --- a/tests/rvv_bench/mandelbrot/mandelbrot.c +++ /dev/null @@ -1,94 +0,0 @@ -#include "bench.h" - -void -mandelbrot_scalar_f32(size_t width, size_t maxIter, uint32_t *res) -{ - for (size_t y = 0; y < width; ++y) - for (size_t x = 0; x < width; ++x) { - float cx = x * 2.0f / width - 1.5; - float cy = y * 2.0f / width - 1; - size_t iter = 0; - float zx = 0, zy = 0, zxS = 0, zyS = 0; - - BENCH_VOLATILE_REG(cy); - while (zxS + zyS <= 4 && iter < maxIter) { - zxS = zxS - zyS + cx; - zy = 2 * zx * zy + cy; - zx = zxS; - BENCH_VOLATILE_REG(zx); - zxS = zx*zx; - zyS = zy*zy; - ++iter; - BENCH_CLOBBER(); - } - *res++ = iter; - } -} - -#if __riscv_xlen >= 64 -void -mandelbrot_scalar_f64(size_t width, size_t maxIter, uint32_t *res) -{ - for (size_t y = 0; y < width; ++y) - for (size_t x = 0; x < width; ++x) { - double cx = x * 2.0 / width - 1.5; - double cy = y * 2.0 / width - 1; - size_t iter = 0; - double zx = 0, zy = 0, zxS = 0, zyS = 0; - - BENCH_VOLATILE_REG(cy); - while (zxS + zyS <= 4 && iter < maxIter) { - zxS = zxS - zyS + cx; - zy = 2 * zx * zy + cy; - zx = zxS; - BENCH_VOLATILE_REG(zx); - zxS = zx*zx; - zyS = zy*zy; - ++iter; - } - *res++ = iter; - } -} -#endif - -#if HAS_F16 -# define IMPLS_F16(f) f(rvv_f16_m1) f(rvv_f16_m2) -#else -# define IMPLS_F16(f) -#endif - -#define IMPLS(f) \ - f(rvv_f32_m1) \ - f(scalar_f32) \ - IF64(f(scalar_f64)) \ - IMPLS_F16(f) \ - f(rvv_f32_m2) \ - IF64(f(rvv_f64_m2)) \ - -typedef void Func(size_t width, size_t maxIter, uint32_t *res); - -#define DECLARE(f) extern Func mandelbrot_##f; -IMPLS(DECLARE) - -#define EXTRACT(f) { #f, &mandelbrot_##f }, -Impl impls[] = { IMPLS(EXTRACT) }; - -uint32_t *dest; -void init(void) { memset(mem, 0, MAX_MEM); dest = (uint32_t*)mem; } - -/* disabled, because of rounding errors, please independently verify */ -ux checksum(size_t n) { return 0; } - -BENCH(base) { - n = usqrt(n); - TIME f(n, mandelbrot_ITER, dest); -} BENCH_END - -Bench benches[] = { - { - SCALE_mandelbrot(MAX_MEM / 4), - "mandelbrot "STR(mandelbrot_ITER), - bench_base - }, -}; BENCH_MAIN(impls, benches) - diff --git a/tests/rvv_bench/memcpy/memcpy.S b/tests/rvv_bench/memcpy/memcpy.S deleted file mode 100644 index 6511a0493..000000000 --- a/tests/rvv_bench/memcpy/memcpy.S +++ /dev/null @@ -1,153 +0,0 @@ -#if 0 -void *memcpy_rvv(void *restrict dest, void const *restrict src, size_t n) { - unsigned char *d = dest; - unsigned char const *s = src; - for (size_t vl; n > 0; n -= vl, s += vl, d += vl) { - vl = __riscv_vsetvl_e8m8(n); - vuint8m8_t vec_src = __riscv_vle8_v_u8m8(s, vl); - __riscv_vse8_v_u8m8(d, vec_src, vl); - } - return dest; -} -#endif - - -#ifdef MX - -# a0 = dest, a1 = src, a2 = len -.global MX(memcpy_rvv_) -MX(memcpy_rvv_): - mv a3, a0 -1: - vsetvli t0, a2, e8, MX(), ta, ma - vle8.v v0, (a1) - add a1, a1, t0 - sub a2, a2, t0 - vse8.v v0, (a3) - add a3, a3, t0 - bnez a2, 1b - ret - -.global MX(memcpy_rvv_align_dest_) -MX(memcpy_rvv_align_dest_): - mv a3, a0 -#if HAS_RVV_1_0 - csrr t0, vlenb -#else - vsetvli t0, zero, e8, m1, ta, ma # vlenb -#endif - bltu a2, t0, 2f # len < vlenb - # align dest to vlenb - sub t1, zero, a0 - addi t2, t0, -1 - and t1, t1, t2 #align = (-dest) & (vlenb-1) - vsetvli t0, t1, e8, MX(), ta, ma -1: - vle8.v v0, (a1) - add a1, a1, t0 - sub a2, a2, t0 - vse8.v v0, (a3) - add a3, a3, t0 -2: - vsetvli t0, a2, e8, MX(), ta, ma - bnez a2, 1b - ret - -.global MX(memcpy_rvv_align_src_) -MX(memcpy_rvv_align_src_): - mv a3, a0 -#if HAS_RVV_1_0 - csrr t0, vlenb -#else - vsetvli t0, zero, e8, m1, ta, ma # vlen -#endif - bltu a2, t0, 2f # len < vlen - # align src to vlen - sub t1, zero, a1 - addi t2, t0, -1 - and t1, t1, t2 # align = (-src) & (vlen-1) - vsetvli t0, t1, e8, MX(), ta, ma -1: - vle8.v v0, (a1) - add a1, a1, t0 - sub a2, a2, t0 - vse8.v v0, (a3) - add a3, a3, t0 -2: - vsetvli t0, a2, e8, MX(), ta, ma - bnez a2, 1b - ret - -# combination of memcpy_rvv_align_dest and memcpy_rvv -.global MX(memcpy_rvv_align_dest_hybrid_) -MX(memcpy_rvv_align_dest_hybrid_): - mv a3, a0 -#if HAS_RVV_1_0 - csrr t0, vlenb -#else - vsetvli t0, zero, e8, m1, ta, ma # vlen -#endif - slli t1, t0, 8 # skip costly division for more values - bltu a2, t1, 2f # len < vlen - sub t1, zero, a0 - addi t2, t0, -1 - and t1, t1, t2 # align = (-dest) & (vlen-1) - vsetvli t0, t1, e8, MX(), ta, ma # align dest to vlen -1: - vle8.v v0, (a1) - add a1, a1, t0 - sub a2, a2, t0 - vse8.v v0, (a3) - add a3, a3, t0 -2: - vsetvli t0, a2, e8, MX(), ta, ma - bnez a2, 1b - ret - - -.global MX(memcpy_rvv_tail_) -MX(memcpy_rvv_tail_): - vsetvli t0, a2, e8, MX(), ta, ma - remu a3, a2, t0 # tail = n % vlenb - sub a2, a2, a3 # n -= tail - add a4, a0, a2 # end = dest + n - mv a2, a0 # n = dest -1: - vle8.v v8, (a1) - add a1, a1, t0 # src += vlenb - vse8.v v8, (a2) - add a2, a2, t0 # dest += vlenb - bltu a2, a4, 1b # dest < end - # copy tail - vsetvli zero, a3, e8, MX(), ta, ma - vle8.v v8, (a1) - vse8.v v8, (a2) - ret - -# this is supposed to test how well the implementation handles -# operations with an vl smaller than VLMAX -.global MX(memcpy_rvv_128_) -MX(memcpy_rvv_128_): - li t0, 128/8 - bgt a2, t0, 1f - mv t0, a2 -1: - vsetvli t0, t0, e8, MX(), ta, ma - remu a3, a2, t0 # tail = n % vlenb - sub a2, a2, a3 # n -= tail - add a4, a0, a2 # end = dest + n - mv a2, a0 # n = dest -1: - vle8.v v8, (a1) - add a1, a1, t0 # src += vlenb - vse8.v v8, (a2) - add a2, a2, t0 # dest += vlenb - bltu a2, a4, 1b # dest < end - # copy tail - vsetvli zero, a3, e8, MX(), ta, ma - vle8.v v8, (a1) - vse8.v v8, (a2) - ret - -#endif - diff --git a/tests/rvv_bench/memcpy/memcpy.c b/tests/rvv_bench/memcpy/memcpy.c deleted file mode 100644 index 60a977c71..000000000 --- a/tests/rvv_bench/memcpy/memcpy.c +++ /dev/null @@ -1,197 +0,0 @@ -#include "bench.h" - -void * -memcpy_scalar(void *restrict dest, void const *restrict src, size_t n) -{ - unsigned char *d = dest; - unsigned char const *s = src; - while (n--) *d++ = *s++, BENCH_CLOBBER(); - return dest; -} - -void * -memcpy_scalar_autovec(void *restrict dest, void const *restrict src, size_t n) -{ - unsigned char *d = dest; - unsigned char const *s = src; - while (n--) *d++ = *s++; - return dest; -} - -/* https://git.musl-libc.org/cgit/musl/tree/src/string/memcpy.c */ -void * -memcpy_musl(void *restrict dest, void const *restrict src, size_t n) -{ - unsigned char *d = dest; - unsigned char const *s = src; - -#ifdef __GNUC__ - -#if __BYTE_ORDER == __LITTLE_ENDIAN -#define LS >> -#define RS << -#else -#define LS << -#define RS >> -#endif - - typedef uint32_t __attribute__((__may_alias__)) u32; - uint32_t w, x; - - for (; (uintptr_t)s % 4 && n; n--) *d++ = *s++; - - if ((uintptr_t)d % 4 == 0) { - for (; n>=16; s+=16, d+=16, n-=16) { - *(u32 *)(d+0) = *(u32 *)(s+0); - *(u32 *)(d+4) = *(u32 *)(s+4); - *(u32 *)(d+8) = *(u32 *)(s+8); - *(u32 *)(d+12) = *(u32 *)(s+12); - } - if (n&8) { - *(u32 *)(d+0) = *(u32 *)(s+0); - *(u32 *)(d+4) = *(u32 *)(s+4); - d += 8; s += 8; - } - if (n&4) { - *(u32 *)(d+0) = *(u32 *)(s+0); - d += 4; s += 4; - } - if (n&2) { - *d++ = *s++; *d++ = *s++; - } - if (n&1) { - *d = *s; - } - return dest; - } - - if (n >= 32) switch ((uintptr_t)d % 4) { - case 1: - w = *(u32 *)s; - *d++ = *s++; - *d++ = *s++; - *d++ = *s++; - n -= 3; - for (; n>=17; s+=16, d+=16, n-=16) { - x = *(u32 *)(s+1); - *(u32 *)(d+0) = (w LS 24) | (x RS 8); - w = *(u32 *)(s+5); - *(u32 *)(d+4) = (x LS 24) | (w RS 8); - x = *(u32 *)(s+9); - *(u32 *)(d+8) = (w LS 24) | (x RS 8); - w = *(u32 *)(s+13); - *(u32 *)(d+12) = (x LS 24) | (w RS 8); - } - break; - case 2: - w = *(u32 *)s; - *d++ = *s++; - *d++ = *s++; - n -= 2; - for (; n>=18; s+=16, d+=16, n-=16) { - x = *(u32 *)(s+2); - *(u32 *)(d+0) = (w LS 16) | (x RS 16); - w = *(u32 *)(s+6); - *(u32 *)(d+4) = (x LS 16) | (w RS 16); - x = *(u32 *)(s+10); - *(u32 *)(d+8) = (w LS 16) | (x RS 16); - w = *(u32 *)(s+14); - *(u32 *)(d+12) = (x LS 16) | (w RS 16); - } - break; - case 3: - w = *(u32 *)s; - *d++ = *s++; - n -= 1; - for (; n>=19; s+=16, d+=16, n-=16) { - x = *(u32 *)(s+3); - *(u32 *)(d+0) = (w LS 8) | (x RS 24); - w = *(u32 *)(s+7); - *(u32 *)(d+4) = (x LS 8) | (w RS 24); - x = *(u32 *)(s+11); - *(u32 *)(d+8) = (w LS 8) | (x RS 24); - w = *(u32 *)(s+15); - *(u32 *)(d+12) = (x LS 8) | (w RS 24); - } - break; - } - if (n&16) { - *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; - *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; - *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; - *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; - } - if (n&8) { - *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; - *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; - } - if (n&4) { - *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; - } - if (n&2) { - *d++ = *s++; *d++ = *s++; - } - if (n&1) { - *d = *s; - } - return dest; -#endif - - while (n--) { *d++ = *s++; BENCH_CLOBBER(); } - return dest; -} - -#define memcpy_libc memcpy - -#define IMPLS(f) \ - IFHOSTED(f(libc)) \ - f(musl) \ - f(scalar) \ - f(scalar_autovec) \ - MX(f, rvv) \ - MX(f, rvv_align_dest) \ - MX(f, rvv_align_src) \ - MX(f, rvv_align_dest_hybrid) \ - MX(f, rvv_tail) \ - MX(f, rvv_128) \ - -typedef void *Func(void *restrict dest, void const *restrict src, size_t n); - -#define DECLARE(f) extern Func memcpy_##f; -IMPLS(DECLARE) - -#define EXTRACT(f) { #f, &memcpy_##f }, -Impl impls[] = { IMPLS(EXTRACT) }; - -uint8_t *dest, *src; -ux last; - -void init(void) { } - -ux checksum(size_t n) { - ux sum = last; - for (size_t i = 0; i < n+9; ++i) - sum = uhash(sum) + dest[i]; - return sum; -} - -void common(size_t n, size_t dOff, size_t sOff) { - dest = mem + dOff; src = dest + MAX_MEM/2 + sOff + 9; - memset(dest, 0, n+9); -} - -BENCH(base) { - common(n, urand() & 255, urand() & 255); - TIME last = (uintptr_t)f(dest, src, n); -} BENCH_END - -BENCH(aligned) { - common(n, 0, 0); - TIME last = (uintptr_t)f(dest, src, n); -} BENCH_END - -Bench benches[] = { - { MAX_MEM/2 - 521, "memcpy", bench_base }, - { MAX_MEM/2 - 521, "memcpy aligned", bench_aligned} -}; BENCH_MAIN(impls, benches) - diff --git a/tests/rvv_bench/memset/memset.S b/tests/rvv_bench/memset/memset.S deleted file mode 100644 index 3d00eae62..000000000 --- a/tests/rvv_bench/memset/memset.S +++ /dev/null @@ -1,96 +0,0 @@ -#if 0 -void *memset(void *dst, int n, size_t len) { - unsigned char *d = dst; - vuint8m8_t v = __riscv_vmv_v_x_u8m8((uint8_t)n, __riscv_vsetvlmax_e8m8()); - for (size_t vl; len > 0; len -= vl, d += vl) { - vl = __riscv_vsetvl_e8m8(len); - __riscv_vse8_v_u8m8(d, v, vl); - } - return dst; -} -#endif - -#ifdef MX - -.global MX(memset_rvv_) -MX(memset_rvv_): - vsetvli a3, zero, e8, MX(), ta, ma - vmv.v.x v8, a1 - mv a1, a0 -1: - vsetvli a3, a2, e8, MX(), ta, ma - vse8.v v8, (a1) - sub a2, a2, a3 - add a1, a1, a3 - bnez a2, 1b - ret - - -.global MX(memset_rvv_align_) -MX(memset_rvv_align_): - vsetvli t0, zero, e8, m1, ta, ma # vlen - vmv.v.x v8, a1 - mv a1, a0 - vsetvli t0, zero, e8, MX(), ta, ma # vlen - bltu a2, t0, 2f # len < vlen - # align dest to vlen - sub t1, zero, a0 - remu t1, t1, t0 # align = (-dest) % vlen - vsetvli t0, t1, e8, MX(), ta, ma -1: - vse8.v v8, (a1) - sub a2, a2, t0 - add a1, a1, t0 -2: - vsetvli t0, a2, e8, MX(), ta, ma - bnez a2, 1b - ret - -.global MX(memset_rvv_tail_) -MX(memset_rvv_tail_): - vsetvli t0, a2, e8, MX(), ta, ma - vmv.v.x v8, a1 - remu a3, a2, t0 # tail = n % vlenb - sub a2, a2, a3 # n -= tail - add a4, a0, a2 # end = dest + n - mv a2, a0 # n = dest -1: - vse8.v v8, (a2) - add a2, a2, t0 # dest += vlenb - bltu a2, a4, 1b # dest < end - # handle tail - vsetvli zero, a3, e8, MX(), ta, ma - vse8.v v8, (a2) - ret - -.global MX(memset_rvv_tail_4x_) -MX(memset_rvv_tail_4x_): - vsetvli t0, a2, e8, MX(), ta, ma - vmv.v.x v8, a1 - slli t1, t0, 2 - mv a5, a0 - mv a3, a2 - bltu a2, t1, 2f - remu a3, a2, t1 # tail = n % (vlenb*4) - sub a2, a2, a3 # n -= tail - add a4, a0, a2 # end = dest + n -1: - vse8.v v8, (a5) - add a5, a5, t0 # dest += vlenb - vse8.v v8, (a5) - add a5, a5, t0 # dest += vlenb - vse8.v v8, (a5) - add a5, a5, t0 # dest += vlenb - vse8.v v8, (a5) - add a5, a5, t0 # dest += vlenb - bltu a5, a4, 1b # dest < end - # handle tail -2: - vsetvli a4, a3, e8, MX(), ta, ma - vse8.v v8, (a5) - sub a3, a3, a4 - add a5, a5, a4 - bnez a3, 2b - ret - -#endif diff --git a/tests/rvv_bench/memset/memset.c b/tests/rvv_bench/memset/memset.c deleted file mode 100644 index 9b2f7c463..000000000 --- a/tests/rvv_bench/memset/memset.c +++ /dev/null @@ -1,163 +0,0 @@ -#include "bench.h" - -void * -memset_scalar(void *dest, int c, size_t n) -{ - unsigned char *d = dest; - while (n--) *d++ = c, BENCH_CLOBBER(); - return dest; -} - -void * -memset_scalar_autovec(void *dest, int c, size_t n) -{ - unsigned char *d = dest; - while (n--) *d++ = c; - return dest; -} - -/* https://git.musl-libc.org/cgit/musl/tree/src/string/memset.c */ -#if __riscv_xlen >= 64 -void * -memset_musl(void *dest, int c, size_t n) -{ - unsigned char *s = dest; - size_t k; - - /* Fill head and tail with minimal branching. Each - * conditional ensures that all the subsequently used - * offsets are well-defined and in the dest region. */ - - if (!n) return dest; - s[0] = c; - s[n-1] = c; - if (n <= 2) return dest; - s[1] = c; - s[2] = c; - s[n-2] = c; - s[n-3] = c; - if (n <= 6) return dest; - s[3] = c; - s[n-4] = c; - if (n <= 8) return dest; - - /* Advance pointer to align it at a 4-byte boundary, - * and truncate n to a multiple of 4. The previous code - * already took care of any head/tail that get cut off - * by the alignment. */ - - k = -(uintptr_t)s & 3; - s += k; - n -= k; - n &= -4; - -#ifdef __GNUC__ - typedef uint32_t __attribute__((__may_alias__)) u32; - typedef uint64_t __attribute__((__may_alias__)) u64; - - u32 c32 = ((u32)-1)/255 * (unsigned char)c; - - /* In preparation to copy 32 bytes at a time, aligned on - * an 8-byte bounary, fill head/tail up to 28 bytes each. - * As in the initial byte-based head/tail fill, each - * conditional below ensures that the subsequent offsets - * are valid (e.g. !(n<=24) implies n>=28). */ - - *(u32 *)(s+0) = c32; - *(u32 *)(s+n-4) = c32; - if (n <= 8) return dest; - *(u32 *)(s+4) = c32; - *(u32 *)(s+8) = c32; - *(u32 *)(s+n-12) = c32; - *(u32 *)(s+n-8) = c32; - if (n <= 24) return dest; - *(u32 *)(s+12) = c32; - *(u32 *)(s+16) = c32; - *(u32 *)(s+20) = c32; - *(u32 *)(s+24) = c32; - *(u32 *)(s+n-28) = c32; - *(u32 *)(s+n-24) = c32; - *(u32 *)(s+n-20) = c32; - *(u32 *)(s+n-16) = c32; - - /* Align to a multiple of 8 so we can fill 64 bits at a time, - * and avoid writing the same bytes twice as much as is - * practical without introducing additional branching. */ - - k = 24 + ((uintptr_t)s & 4); - s += k; - n -= k; - - /* If this loop is reached, 28 tail bytes have already been - * filled, so any remainder when n drops below 32 can be - * safely ignored. */ - - u64 c64 = c32 | ((u64)c32 << 32); - for (; n >= 32; n-=32, s+=32) { - *(u64 *)(s+0) = c64; - *(u64 *)(s+8) = c64; - *(u64 *)(s+16) = c64; - *(u64 *)(s+24) = c64; - } -#else - /* Pure C fallback with no aliasing violations. */ - while (n--) *s++ = c; -#endif - - return dest; -} -#endif - -#define memset_libc memset - -#define IMPLS(f) \ - IFHOSTED(f(libc)) \ - IF64(f(musl)) \ - f(scalar) \ - f(scalar_autovec) \ - MX(f, rvv) \ - MX(f, rvv_align) \ - MX(f, rvv_tail) \ - MX(f, rvv_tail_4x) \ - -typedef void *Func(void *dest, int c, size_t n); - -#define DECLARE(f) extern Func memset_##f; -IMPLS(DECLARE) - -#define EXTRACT(f) { #f, &memset_##f }, -Impl impls[] = { IMPLS(EXTRACT) }; - -uint8_t *dest; -ux last; -char c; - -void init(void) { c = urand(); } - -ux checksum(size_t n) { - ux sum = last; - for (size_t i = 0; i < n+9; ++i) - sum = uhash(sum) + dest[i]; - return sum; -} - -void common(size_t n, size_t off) { - dest = mem + off; - memset(dest, c+3, n+9); -} - -BENCH(base) { - common(n, urand() & 511); - TIME last = (uintptr_t)f(dest, c, n); -} BENCH_END - -BENCH(aligned) { - common(n, 0); - TIME last = (uintptr_t)f(dest, c, n); -} BENCH_END - -Bench benches[] = { - { MAX_MEM - 521, "memset", bench_base }, - { MAX_MEM - 521, "memset aligned", bench_aligned} -}; BENCH_MAIN(impls, benches) - diff --git a/tests/rvv_bench/mergelines/mergelines.S b/tests/rvv_bench/mergelines/mergelines.S deleted file mode 100644 index 051a0d7de..000000000 --- a/tests/rvv_bench/mergelines/mergelines.S +++ /dev/null @@ -1,179 +0,0 @@ -#if 0 -size_t -mergelines_rvv(char *str, size_t len) -{ - uint8_t *dest = (uint8_t*)str; - uint8_t *src = (uint8_t*)str; - char last = 0; - - vuint8m8_t v, u, d; - vbool1_t m; - - for (size_t vl, VL; len > 1; ) { - VL = vl = __riscv_vsetvl_e8m8(len); - - char next = len > vl ? src[vl] : 0; - v = __riscv_vle8_v_u8m8(src, vl); - u = __riscv_vslide1up_vx_u8m8(v, last, vl); - d = __riscv_vslide1down_vx_u8m8(v, next, vl); - - m = __riscv_vmor_mm_b1(__riscv_vmsne_vx_u8m8_b1(u, '\\', vl), __riscv_vmsne_vx_u8m8_b1(v, '\n', vl), vl); - #if DO_SKIP - if (likely(__riscv_vcpop_m_b1(m, vl) == vl && next != '\n')) - goto skip; - #endif - m = __riscv_vmand_mm_b1( - m, - __riscv_vmor_mm_b1(__riscv_vmsne_vx_u8m8_b1(v, '\\', vl), __riscv_vmsne_vx_u8m8_b1(d, '\n', vl), vl), - vl); - - v = __riscv_vcompress_vm_u8m8(v, m, vl); - vl = __riscv_vcpop_m_b1(m, vl); - skip: - __riscv_vse8_v_u8m8(dest, v, vl); - dest += vl; src += VL; len -= VL; - last = src[-1]; - } - - if (len > 0 && !(last == '\\' && *src == '\n')) *dest++ = *src++; - return (dest - (uint8_t*)str); -} -#endif - -#ifdef MX - -.global MX(mergelines_rvv_) # generated by clang -MX(mergelines_rvv_): - li a2, 2 - bltu a1, a2, MX(rvv_6) - li t0, 0 - li a7, 92 - li a6, 1 - mv a2, a0 - mv a4, a0 - j MX(rvv_4) -MX(rvv_2): # in Loop: Header=BB0_4 Depth=1 - add a3, a4, a5 - lbu t1, 0(a3) -MX(rvv_3): # in Loop: Header=BB0_4 Depth=1 - vle8.v v8, (a4) - add a3, a4, a5 - vslide1up.vx v16, v8, t0 - vslide1down.vx v24, v8, t1 - vmsne.vx v0, v16, a7 - vmsne.vi v16, v8, 10 - vmor.mm v16, v0, v16 - vmsne.vx v17, v8, a7 - vmsne.vi v18, v24, 10 - vmor.mm v17, v17, v18 - vmand.mm v16, v16, v17 - vcompress.vm v24, v8, v16 - vcpop.m a4, v16 - vsetvli zero, a4, e8, MX(), ta, ma - vse8.v v24, (a2) - lbu t0, -1(a3) - sub a1, a1, a5 - add a2, a2, a4 - mv a4, a3 - bgeu a6, a1, MX(rvv_8) -MX(rvv_4): # =>This Inner Loop Header: Depth=1 - vsetvli a5, a1, e8, MX(), ta, ma - bltu a5, a1, MX(rvv_2) - li t1, 0 - j MX(rvv_3) -MX(rvv_6): - mv a2, a0 - beqz a1, MX(rvv_10) - lbu a1, 0(a0) - mv a2, a0 - j MX(rvv_11) -MX(rvv_8): - beqz a1, MX(rvv_10) - lbu a1, 0(a3) - xori a3, t0, 92 - xori a4, a1, 10 - or a3, a3, a4 - bnez a3, MX(rvv_11) -MX(rvv_10): - sub a0, a2, a0 - ret -MX(rvv_11): - addi a3, a2, 1 - sb a1, 0(a2) - sub a0, a3, a0 - ret - - -.global MX(mergelines_rvv_skip_) # generated by clang -MX(mergelines_rvv_skip_): - li a2, 2 - bltu a1, a2, MX(rvv_skip_9) - li a5, 0 - li a6, 92 - li a7, 1 - mv t1, a0 - mv a3, a0 -MX(rvv_skip_2): # =>This Inner Loop Header: Depth=1 - vsetvli a4, a1, e8, MX(), ta, ma - bgeu a4, a1, MX(rvv_skip_4) - add a2, a3, a4 - lbu t0, 0(a2) - j MX(rvv_skip_5) -MX(rvv_skip_4): # in Loop: Header=BB0_2 Depth=1 - li t0, 0 -MX(rvv_skip_5): # in Loop: Header=BB0_2 Depth=1 - vle8.v v8, (a3) - vslide1up.vx v16, v8, a5 - vmsne.vx v24, v16, a6 - vmsne.vi v16, v8, 10 - vmor.mm v16, v24, v16 - vcpop.m a2, v16 - xor a2, a2, a4 - seqz a2, a2 - addi a5, t0, -10 - snez a5, a5 - and a2, a2, a5 - beqz a2, MX(rvv_skip_8) - mv a2, a4 -MX(rvv_skip_7): # in Loop: Header=BB0_2 Depth=1 - add a3, a3, a4 - vsetvli zero, a2, e8, MX(), ta, ma - vse8.v v8, (t1) - lbu a5, -1(a3) - sub a1, a1, a4 - add t1, t1, a2 - bltu a7, a1, MX(rvv_skip_2) - j MX(rvv_skip_11) -MX(rvv_skip_8): # in Loop: Header=BB0_2 Depth=1 - vslide1down.vx v24, v8, t0 - vmsne.vx v17, v8, a6 - vmsne.vi v18, v24, 10 - vmor.mm v17, v17, v18 - vmand.mm v16, v16, v17 - vcompress.vm v24, v8, v16 - vcpop.m a2, v16 - vmv.v.v v8, v24 - j MX(rvv_skip_7) -MX(rvv_skip_9): - mv t1, a0 - beqz a1, MX(rvv_skip_13) - lbu a1, 0(a0) - mv t1, a0 - j MX(rvv_skip_14) -MX(rvv_skip_11): - beqz a1, MX(rvv_skip_13) - lbu a1, 0(a3) - xori a2, a5, 92 - xori a3, a1, 10 - or a2, a2, a3 - bnez a2, MX(rvv_skip_14) -MX(rvv_skip_13): - sub a0, t1, a0 - ret -MX(rvv_skip_14): - addi a2, t1, 1 - sb a1, 0(t1) - sub a0, a2, a0 - ret - -#endif diff --git a/tests/rvv_bench/mergelines/mergelines.c b/tests/rvv_bench/mergelines/mergelines.c deleted file mode 100644 index 2d1d2078d..000000000 --- a/tests/rvv_bench/mergelines/mergelines.c +++ /dev/null @@ -1,75 +0,0 @@ -#include "bench.h" - -size_t -mergelines_scalar(char *str, size_t len) -{ - char *dest = str; - char *src = str; - - while (len > 1) { - if (src[0] == '\\' && src[1] == '\n') - src += 2, len -= 2; - else - *dest++ = *src++, --len; - BENCH_CLOBBER(); - } - if (len > 0) - *dest++ = *src++; - return dest - str; -} - -#define IMPLS(f) \ - MX(f, rvv) \ - f(scalar) \ - MX(f, rvv_skip) \ - -typedef size_t Func(char *buf, size_t len); - -#define DECLARE(f) extern Func mergelines_##f; -IMPLS(DECLARE) - -#define EXTRACT(f) { #f, &mergelines_##f }, -Impl impls[] = { IMPLS(EXTRACT) }; - -char *str; -ux last; - -void init(void) { } -ux checksum(size_t n) { return last; } - -void common(size_t n, char const *chars, size_t nChars) { - str = (char*)mem + (urand() & 255); - for (size_t i = 0; i < n; ++i) - str[i] = chars[urand() % nChars]; -} - -BENCH(2_3) { - common(n, "\\\na", 3); - TIME last = (uintptr_t)f(str, n); -} BENCH_END - -BENCH(2_16) { - common(n, "\\\nabcdefgh", 16); - TIME last = (uintptr_t)f(str, n); -} BENCH_END - -BENCH(2_32) { - common(n, "\\\nabcdefgh123456789", 32); - TIME last = (uintptr_t)f(str, n); -} BENCH_END - -BENCH(2_256) { - str = (char*)mem + (urand() & 255); - for (size_t i = 0; i < n; ++i) - str[i] = urand() & 0xff; - TIME last = (uintptr_t)f(str, n); -} BENCH_END - -#define COUNT SCALE_mergelines(MAX_MEM) - 256 -Bench benches[] = { - { COUNT, "mergelines 2/3", bench_2_3 }, - { COUNT, "mergelines 2/16", bench_2_16 }, - { COUNT, "mergelines 2/32", bench_2_32 }, - { COUNT, "mergelines 2/256", bench_2_256 } -}; BENCH_MAIN(impls, benches) - diff --git a/tests/rvv_bench/poly1305/poly1305.S b/tests/rvv_bench/poly1305/poly1305.S deleted file mode 100644 index e5b332e02..000000000 --- a/tests/rvv_bench/poly1305/poly1305.S +++ /dev/null @@ -1,5 +0,0 @@ -#ifndef MX -#if __riscv_xlen >= 64 -#include "rvv-chacha-poly/vpoly.s" -#endif -#endif diff --git a/tests/rvv_bench/poly1305/poly1305.c b/tests/rvv_bench/poly1305/poly1305.c deleted file mode 100644 index 72849ac75..000000000 --- a/tests/rvv_bench/poly1305/poly1305.c +++ /dev/null @@ -1,64 +0,0 @@ -#include "bench.h" -#if __riscv_xlen >= 64 -#include "thirdparty/boring.h" - -uint8_t *src; -uint8_t key[32], sig[16]; - -extern uint64_t -vector_poly1305(const uint8_t* in, size_t len, - const uint8_t key[32], uint8_t sig[16]); - -static void -poly1305_boring(void const *src, size_t n) { - poly1305_state state; - boring_poly1305_init(&state, key); - boring_poly1305_update(&state, src, n); - boring_poly1305_finish(&state, sig); -} - -static void -poly1305_rvv(void const *src, size_t n) { - vector_poly1305(src, n, key, sig); -} - -typedef void *Func(void const *src, size_t n); - -Impl impls[] = { - { "boring", &poly1305_boring }, -#if HAS_E64 - { "rvv", &poly1305_rvv }, -#endif -}; - -void init(void) { - memrand(key, sizeof key); - memrand(sig, sizeof sig); -} - -ux checksum(size_t n) { - ux sum = 0; - for (size_t i = 0; i < ARR_LEN(sig); ++i) - sum = uhash(sum) + sig[i]; - return sum; -} - -BENCH(aligned) { - for (size_t i = 0; i < 256; ++i) - mem[urand()%n] = urand(); - n = (15+n) & -16; - TIME f(mem, n); -} BENCH_END - -Bench benches[] = { - { MAX_MEM, "poly1305 aligned", bench_aligned } -}; BENCH_MAIN(impls, benches) - - -#include "../thirdparty/boring.c" -#else -void init(void) {} -Impl impls[] = {}; -Bench benches[] = {}; -BENCH_MAIN(impls, benches) -#endif diff --git a/tests/rvv_bench/strlen/strlen.S b/tests/rvv_bench/strlen/strlen.S deleted file mode 100644 index d639e5a80..000000000 --- a/tests/rvv_bench/strlen/strlen.S +++ /dev/null @@ -1,91 +0,0 @@ -#if 0 -size_t strlen_rvv(char *src) { - size_t vlmax = __riscv_vsetvlmax_e8m8(); - char *p = src; - long first = -1; - size_t vl; - while (first < 0) { - vuint8m8_t v = __riscv_vle8ff_v_u8m8((uint8_t*)p, &vl, vlmax); - first = __riscv_vfirst_m_b1(__riscv_vmseq_vx_u8m8_b1(v, 0, vl), vl); - p += vl; - } - p -= vl - first; - return (size_t)(p - src); -} - -#define PAGE_SIZE 4096 -size_t strlen_rvv_page_aligned_(char *src) { - char *p = src; - long first = 0; - - size_t n = 0 - ((uintptr_t)src | -4096); - size_t vl; - for (; n > 0; n -= vl) { - vl = __riscv_vsetvl_e8m8(n); - vuint8m8_t v = __riscv_vle8_v_u8m8((uint8_t*)p, vl); - first = __riscv_vfirst_m_b1(__riscv_vmseq_vx_u8m8_b1(v, 0, vl), vl); - p += vl; - if (first >= 0) { - goto end; - } - } - vl = __riscv_vsetvlmax_e8m8(); - do { - vuint8m8_t v = __riscv_vle8_v_u8m8((uint8_t*)p, vl); - first = __riscv_vfirst_m_b1(__riscv_vmseq_vx_u8m8_b1(v, 0, vl), vl); - p += vl; - } while (first < 0); -end: - p -= vl - first; - return (size_t)(p - src); -} -#endif - - -#ifdef MX - -.global MX(strlen_rvv_) -MX(strlen_rvv_): - mv a3, a0 -1: - vsetvli a1, x0, e8, MX(), ta, ma - vle8ff.v v8, (a3) - csrr a1, vl - vmseq.vi v0, v8, 0 - vfirst.m a2, v0 - add a3, a3, a1 # end += vl - bltz a2, 1b - add a0, a0, a1 # start += vl - add a3, a3, a2 # end += idx - sub a0, a3, a0 # start - end - ret - -.global MX(strlen_rvv_page_aligned_) # generated by clang -MX(strlen_rvv_page_aligned_): - lui a1, 1048575 - or a1, a1, a0 - neg a4, a1 - mv a1, a0 -1: - vsetvli a2, a4, e8, MX(), ta, ma - vle8.v v8, (a1) - vmseq.vi v16, v8, 0 - vfirst.m a3, v16 - add a1, a1, a2 - bgez a3, 1f - sub a4, a4, a2 - bnez a4, 1b - vsetvli a2, zero, e8, MX(), ta, ma -2: - vle8.v v8, (a1) - vmseq.vi v16, v8, 0 - vfirst.m a3, v16 - add a1, a1, a2 - bltz a3, 2b -1: - sub a1, a1, a2 - sub a0, a3, a0 - add a0, a0, a1 - ret - -#endif diff --git a/tests/rvv_bench/strlen/strlen.c b/tests/rvv_bench/strlen/strlen.c deleted file mode 100644 index 709e84b6f..000000000 --- a/tests/rvv_bench/strlen/strlen.c +++ /dev/null @@ -1,76 +0,0 @@ -#include "bench.h" - -size_t -strlen_scalar(char const *s) -{ - char const *a = s; - while (*s) ++s, BENCH_CLOBBER(); - return s - a; -} - -size_t -strlen_scalar_autovec(char const *s) -{ - char const *a = s; - while (*s) ++s; - return s - a; -} - -/* https://git.musl-libc.org/cgit/musl/tree/src/string/strlen.c */ -#define ONES ((size_t)-1/UCHAR_MAX) -#define HIGHS (ONES * (UCHAR_MAX/2+1)) -#define HASZERO(x) (((x)-ONES) & ~(x) & HIGHS) -size_t -strlen_musl(char const *s) -{ - char const *a = s; -#ifdef __GNUC__ - typedef size_t __attribute__((__may_alias__)) word; - word const *w; - for (; (uintptr_t)s % sizeof *w; s++) if (!*s) return s-a; - for (w = (void const*)s; !HASZERO(*w); w++); - s = (void const*)w; -#endif - for (; *s; s++); - return s-a; -} - -#define strlen_libc strlen - -#define IMPLS(f) \ - f(scalar) \ - f(scalar_autovec) \ - IFHOSTED(f(libc)) \ - f(musl) \ - MX(f, rvv_page_aligned) \ - MX(f, rvv) \ - - -typedef size_t Func(char const *s); - -#define DECLARE(f) extern Func strlen_##f; -IMPLS(DECLARE) - -#define EXTRACT(f) { #f, &strlen_##f }, -Impl impls[] = { IMPLS(EXTRACT) }; - -ux last; - -void init(void) { - for (size_t i = 0; i < MAX_MEM; ++i) - mem[i] += !mem[i]; // remove null bytes -} - -ux checksum(size_t n) { return last; } - -BENCH(base) { - char *p = (char*)mem + (urand() % 511); - p[n] = 0; - TIME last = f(p); - p[n] = urand() | 1; -} BENCH_END - -Bench benches[] = { - { MAX_MEM - 521, "strlen", bench_base }, -}; BENCH_MAIN(impls, benches) - diff --git a/tests/rvv_bench/t1_runtime.patch b/tests/rvv_bench/t1_runtime.patch new file mode 100644 index 000000000..aac973b20 --- /dev/null +++ b/tests/rvv_bench/t1_runtime.patch @@ -0,0 +1,29 @@ +diff --git a/nolibc.h b/nolibc.h +index 94d4235..06f2c0f 100644 +--- a/nolibc.h ++++ b/nolibc.h +@@ -64,7 +64,7 @@ memread(void *ptr, size_t len) + return fread(ptr, 1, len, stdin); + } + #ifndef ENABLE_RDCYCLE_HACK +-int main(void) { ++int test(void) { + int x = nolibc_main(); + print_flush(); + exit(x); +@@ -158,13 +158,8 @@ void _start(void) { + static inline ux + rv_cycles(void) + { +- ux cycle; +-#ifdef READ_MCYCLE +- __asm volatile ("csrr %0, mcycle" : "=r"(cycle)); +-#else +- __asm volatile ("csrr %0, cycle" : "=r"(cycle)); +-#endif +- return cycle; ++ // TODO: support cycle ++ return 0; + } + + diff --git a/tests/rvv_bench/utf8_count/utf8_count.S b/tests/rvv_bench/utf8_count/utf8_count.S deleted file mode 100644 index 41a079693..000000000 --- a/tests/rvv_bench/utf8_count/utf8_count.S +++ /dev/null @@ -1,213 +0,0 @@ -#if 0 -size_t utf8_count_rvv(char const *buf, size_t len) { - size_t sum = 0; - for (size_t vl; len > 0; len -= vl, buf += vl) { - vl = __riscv_vsetvl_e8m8(len); - vint8m8_t v = __riscv_vle8_v_i8m8((void*)buf, vl); - vbool1_t mask = __riscv_vmsgt_vx_i8m8_b1(v, -65, vl); - sum += __riscv_vcpop_m_b1(mask, vl); - } - return sum; -} -#endif - -#ifdef MX - -.global MX(utf8_count_rvv_) -MX(utf8_count_rvv_): - li a2, 0 - li a3, -65 -1: - vsetvli a4, a1, e8, MX(), ta, ma - vle8.v v8, (a0) - vmsgt.vx v16, v8, a3 - vcpop.m a5, v16 - add a2, a2, a5 - sub a1, a1, a4 - add a0, a0, a4 - bnez a1, 1b - mv a0, a2 - ret - -.global MX(utf8_count_rvv_align_) -MX(utf8_count_rvv_align_): - mv a2, a0 - li a0, 0 - li a3, -65 - vsetvli t0, zero, e8, MX(), ta, ma # vlen - bltu a1, t0, 2f # len < vlen - # align dest to vlen - sub t1, zero, a2 - remu t1, t1, t0 # align = (-dest) % vlen - vsetvli t0, t1, e8, MX(), ta, ma -1: - vle8.v v8,(a2) - vmsgt.vx v16, v8, a3 - vcpop.m a4, v16 - add a0, a0, a4 - sub a1, a1, t0 - add a2, a2, t0 -2: - vsetvli t0, a1, e8, MX(), ta, ma - bnez a1, 1b - ret - -.global MX(utf8_count_rvv_tail_) -MX(utf8_count_rvv_tail_): - vsetvli t0, a1, e8, MX(), ta, ma - remu a2, a1, t0 # tail = n % vlenb - sub a1, a1, a2 # n -= tail - add a3, a0, a1 # end = dest + n - mv a1, a0 # n = dest - li a0, 0 - li t1, -65 -1: - vle8.v v8, (a1) - vmsgt.vx v16, v8, t1 - vcpop.m t2, v16 - add a0, a0, t2 - add a1, a1, t0 # src += vlenb - bltu a1, a3, 1b # dest < end - # copy tail - vsetvli zero, a2, e8, MX(), ta, ma - vle8.v v8, (a1) - vmsgt.vx v16, v8, t1 - vcpop.m t2, v16 - add a0, a0, t2 - ret - -# this is supposed to test how well the implementation handles -# operations with an vl smaller than VLMAX -.global MX(utf8_count_rvv_128_) -MX(utf8_count_rvv_128_): - li t0, 128/8 - bgt a1, t0, 1f - mv t0, a1 -1: - vsetvli t0, t0, e8, MX(), ta, ma - remu a2, a1, t0 # tail = n % vlenb - sub a1, a1, a2 # n -= tail - add a3, a0, a1 # end = dest + n - mv a1, a0 # n = dest - li a0, 0 - li t1, -65 -1: - vle8.v v8, (a1) - vmsgt.vx v16, v8, t1 - vcpop.m t2, v16 - add a0, a0, t2 - add a1, a1, t0 # src += vlenb - bltu a1, a3, 1b # dest < end - # copy tail - vsetvli zero, a2, e8, MX(), ta, ma - vle8.v v8, (a1) - vmsgt.vx v16, v8, t1 - vcpop.m t2, v16 - add a0, a0, t2 - ret - - -.global MX(utf8_count_rvv_4x_) -MX(utf8_count_rvv_4x_): - mv a2, a0 - li a0, 0 - li a6, -65 -1: - vsetvli a4, a1, e8, MX(), ta, ma - vle8.v v8, (a2) - vmsgt.vx v16, v8, a6 - vcpop.m a7, v16 - sub a1, a1, a4 - add a2, a2, a4 - vsetvli a4, a1, e8, MX(), ta, ma - vle8.v v8, (a2) - vmsgt.vx v16, v8, a6 - vcpop.m a3, v16 - sub a1, a1, a4 - add a2, a2, a4 - vsetvli a4, a1, e8, MX(), ta, ma - vle8.v v8, (a2) - vmsgt.vx v16, v8, a6 - vcpop.m a5, v16 - sub a1, a1, a4 - add a2, a2, a4 - vsetvli a4, a1, e8, MX(), ta, ma - vle8.v v8, (a2) - add a0, a0, a7 - add a0, a0, a3 - add a0, a0, a5 - vmsgt.vx v16, v8, a6 - vcpop.m a3, v16 - add a0, a0, a3 - sub a1, a1, a4 - add a2, a2, a4 - bnez a1, 1b - ret - -// gcc generated from unrolled intrinsics implementation: -// https://godbolt.org/z/q75c6r3Ta -.global MX(utf8_count_rvv_4x_tail_) -MX(utf8_count_rvv_4x_tail_): - vsetvli a5, zero, e8, MX(), ta, ma - slli t3, a5, 2 - add a1, a0, a1 - add a2, a0, t3 - mv a4, a0 - bltu a1, a2, 5f - slli t4, a5, 1 - add t5, t4, a5 - li a0, 0 - li a6, -65 -1: - add a3, a5, a4 - vsetvli zero, zero, e8, MX(), ta, ma - add a7, t4, a4 - vle8.v v8, (a4) - vle8.v v16, (a3) - vmsgt.vx v8, v8, a6 - vmsgt.vx v16, v16, a6 - vcpop.m a3, v8 - vcpop.m t1, v16 - add a3, a3, t1 - vle8.v v8, (a7) - add a4, t5, a4 - vmsgt.vx v8, v8, a6 - vcpop.m a7, v8 - add a3, a3, a7 - vle8.v v8, (a4) - mv a4, a2 - vmsgt.vx v8, v8, a6 - add a2, a2, t3 - vcpop.m a7, v8 - add a3, a3, a7 - add a0, a0, a3 - bgeu a1, a2, 1b -2: - sub a3, a1, a4 - beq a1, a4, 4f - li a2, 0 - li a1, -65 -3: - vsetvli a5, a3, e8, MX(), ta, ma - sub a3, a3, a5 - vle8.v v8, (a4) - add a4, a4, a5 - vmsgt.vx v8, v8, a1 - vcpop.m a5, v8 - add a2, a2, a5 - bne a3, zero, 3b - add a0, a0, a2 -4: - ret -5: - li a0, 0 - j 2b - - - - -#endif - - - - diff --git a/tests/rvv_bench/utf8_count/utf8_count.c b/tests/rvv_bench/utf8_count/utf8_count.c deleted file mode 100644 index ebe2e678c..000000000 --- a/tests/rvv_bench/utf8_count/utf8_count.c +++ /dev/null @@ -1,135 +0,0 @@ -#include "bench.h" - -size_t -utf8_count_scalar(char const *str, size_t len) -{ - uint8_t const *p = (uint8_t const*)str; - size_t count = 0; - while (len--) count += (*p++ & 0xc0) != 0x80, BENCH_CLOBBER(); - return count; -} - -size_t -utf8_count_scalar_autovec(char const *str, size_t len) -{ - uint8_t const *p = (uint8_t const*)str; - size_t count = 0; - while (len--) count += (*p++ & 0xc0) != 0x80; - return count; -} - -#define GEN_SWAR(name, popc, clobber) \ - size_t \ - utf8_count_##name(char const *str, size_t len) \ - { \ - ux const BENCH_MAY_ALIAS *u; \ - size_t count = 0, tail = 0; \ -\ - uint8_t const *u8 = (uint8_t const*)str; \ - if (len < sizeof *u) { \ - tail = len; \ - goto skip; \ - } \ -\ - tail = sizeof *u - (uintptr_t)str % sizeof *u; \ -\ - len -= tail; \ - while (tail--) \ - count += (*u8++ & 0xC0) != 0x80, clobber; \ -\ - u = (ux const*)u8; \ - tail = len % sizeof *u; \ -\ - for (len /= sizeof *u; len--; ++u) { \ - ux b1 = ~*u & (ux)0x8080808080808080; \ - ux b2 = *u & (ux)0x4040404040404040; \ - count += popc((b1 >> 1) | b2); \ - clobber; \ - } \ -\ - u8 = (uint8_t const*)u; \ - skip: \ - while (tail--) \ - count += (*u8++ & 0xC0) != 0x80, clobber; \ - return count; \ - } - -#if __riscv_zbb -GEN_SWAR(SWAR_popc,__builtin_popcountll,BENCH_CLOBBER()) -GEN_SWAR(SWAR_popc_autovec,__builtin_popcountll,(void)0) -# define POPC(f) f(SWAR_popc) f(SWAR_popc_autovec) -#else -# define POPC(f) -#endif - -static inline int -upopcnt(ux x) -{ - /* 2-bit sums */ - x -= (x >> 1) & (-(ux)1/3); - /* 4-bit sums */ - x = (x & (-(ux)1/15*3)) + ((x >> 2) & (-(ux)1/15*3)); - /* 8-bit sums */ - x = (x + (x >> 4)) & (-(ux)1/255*15); - BENCH_VOLATILE_REG(x); - /* now we can just add the sums together, because can't overflow, - * since there can't be more than 255 bits set */ - x += (x >> 8); /* 16-bit sums */ - x += (x >> 16); /* sum 16-bit sums */ - IF64(x += (x >> 32)); /* sum 32-bit sums */ - return x & 127; -} - - -GEN_SWAR(SWAR_popc_bithack,upopcnt,BENCH_CLOBBER()) -GEN_SWAR(SWAR_popc_bithack_autovec,upopcnt,(void)0) - - -#define IMPLS(f) \ - MX(f, rvv) \ - f(scalar) \ - f(scalar_autovec) \ - POPC(f) \ - f(SWAR_popc_bithack) \ - f(SWAR_popc_bithack_autovec) \ - MX(f, rvv_align) \ - MX(f, rvv_tail) \ - MX(f, rvv_128) \ - MX(f, rvv_4x) \ - MX(f, rvv_4x_tail) \ - -typedef size_t Func(char const *str, size_t len); - -#define DECLARE(f) extern Func utf8_count_##f; -IMPLS(DECLARE) - -#define EXTRACT(f) { #f, &utf8_count_##f }, -Impl impls[] = { IMPLS(EXTRACT) }; - -char *str; -ux last; - -void init(void) { } -ux checksum(size_t n) { return last; } - -void common(size_t n, size_t off) { - str = (char*)mem + off; - memrand(str, n + 9); -} - -BENCH(base) { - common(n, urand() & 511); - TIME last = (uintptr_t)f(str, n); -} BENCH_END - -BENCH(aligned) { - common(n, 0); - TIME last = (uintptr_t)f(str, n); -} BENCH_END - -Bench benches[] = { - { MAX_MEM - 521, "utf8 count", bench_base }, - { MAX_MEM - 521, "utf8 count aligned", bench_aligned } -}; BENCH_MAIN(impls, benches) - -