diff --git a/src/ballet/blake3/blake3.c b/src/ballet/blake3/blake3.c index 56d43c5105..992140b011 100644 --- a/src/ballet/blake3/blake3.c +++ b/src/ballet/blake3/blake3.c @@ -87,8 +87,8 @@ INLINE output_t make_output(const uint32_t input_cv[8], INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) { uint32_t cv_words[8]; fd_memcpy(cv_words, self->input_cv, 32); - blake3_compress_in_place(cv_words, self->block, self->block_len, - self->counter, self->flags); + fd_blake3_compress_in_place(cv_words, self->block, self->block_len, + self->counter, self->flags); store_cv_words(cv, cv_words); } @@ -98,8 +98,8 @@ INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out, size_t offset_within_block = seek % 64; uint8_t wide_buf[64]; while (out_len > 0) { - blake3_compress_xof(self->input_cv, self->block, self->block_len, - output_block_counter, self->flags | ROOT, wide_buf); + fd_blake3_compress_xof(self->input_cv, self->block, self->block_len, + output_block_counter, self->flags | ROOT, wide_buf); size_t available_bytes = 64 - offset_within_block; size_t fd_memcpy_len; if (out_len > available_bytes) { @@ -122,7 +122,7 @@ INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input, input += take; input_len -= take; if (input_len > 0) { - blake3_compress_in_place( + fd_blake3_compress_in_place( self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, self->flags | chunk_state_maybe_start_flag(self)); #pragma GCC diagnostic push @@ -135,9 +135,9 @@ INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input, } while (input_len > BLAKE3_BLOCK_LEN) { - blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, - self->chunk_counter, - self->flags | chunk_state_maybe_start_flag(self)); + fd_blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, + self->chunk_counter, + self->flags | chunk_state_maybe_start_flag(self)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wconversion" self->blocks_compressed += 1; @@ -195,9 +195,9 @@ INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len, chunks_array_len += 1; } - blake3_hash_many(chunks_array, chunks_array_len, - BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, - true, flags, CHUNK_START, CHUNK_END, out); + fd_blake3_hash_many(chunks_array, chunks_array_len, + BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, + true, flags, CHUNK_START, CHUNK_END, out); // Hash the remaining partial chunk, if there is one. Note that the empty // chunk (meaning the empty message) is a different codepath. @@ -238,12 +238,12 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, parents_array_len += 1; } - blake3_hash_many(parents_array, parents_array_len, 1, key, - 0, // Parents always use counter 0. - false, flags | PARENT, - 0, // Parents have no start flags. - 0, // Parents have no end flags. - out); + fd_blake3_hash_many(parents_array, parents_array_len, 1, key, + 0, // Parents always use counter 0. + false, flags | PARENT, + 0, // Parents have no start flags. + 0, // Parents have no end flags. + out); // If there's an odd child left over, it becomes an output. if (num_chaining_values > 2 * parents_array_len) { @@ -273,16 +273,16 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, // Why not just have the caller split the input on the first update(), instead // of implementing this special rule? Because we don't want to limit SIMD or // multi-threading parallelism for that update(). -static size_t blake3_compress_subtree_wide(const uint8_t *input, - size_t input_len, - const uint32_t key[8], - uint64_t chunk_counter, - uint8_t flags, uint8_t *out) { +static size_t fd_blake3_compress_subtree_wide(const uint8_t *input, + size_t input_len, + const uint32_t key[8], + uint64_t chunk_counter, + uint8_t flags, uint8_t *out) { // Note that the single chunk case does *not* bump the SIMD degree up to 2 // when it is 1. If this implementation adds multi-threading in the future, // this gives us the option of multi-threading even the 2-chunk case, which // can help performance on smaller platforms. - if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) { + if (input_len <= fd_blake3_simd_degree() * BLAKE3_CHUNK_LEN) { return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, out); } @@ -301,7 +301,7 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input, // account for the special case of returning 2 outputs when the SIMD degree // is 1. uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; - size_t degree = blake3_simd_degree(); + size_t degree = fd_blake3_simd_degree(); if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { // The special case: We always use a degree of at least two, to make // sure there are two outputs. Except, as noted above, at the chunk @@ -313,9 +313,9 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input, // Recurse! If this implementation adds multi-threading support in the // future, this is where it will go. - size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key, - chunk_counter, flags, cv_array); - size_t right_n = blake3_compress_subtree_wide( + size_t left_n = fd_blake3_compress_subtree_wide(input, left_input_len, key, + chunk_counter, flags, cv_array); + size_t right_n = fd_blake3_compress_subtree_wide( right_input, right_input_len, key, right_chunk_counter, flags, right_cvs); // The special case again. If simd_degree=1, then we'll have left_n=1 and @@ -350,8 +350,8 @@ INLINE void compress_subtree_to_parent_node( #endif uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; - size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, - chunk_counter, flags, cv_array); + size_t num_cvs = fd_blake3_compress_subtree_wide(input, input_len, key, + chunk_counter, flags, cv_array); assert(num_cvs <= MAX_SIMD_DEGREE_OR_2); // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, @@ -378,29 +378,29 @@ INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8], self->cv_stack_len = 0; } -void blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); } +void fd_blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); } -void blake3_hasher_init_keyed(blake3_hasher *self, - const uint8_t key[BLAKE3_KEY_LEN]) { +void fd_blake3_hasher_init_keyed(blake3_hasher *self, + const uint8_t key[BLAKE3_KEY_LEN]) { uint32_t key_words[8]; load_key_words(key, key_words); hasher_init_base(self, key_words, KEYED_HASH); } -void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, - size_t context_len) { +void fd_blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, + size_t context_len) { blake3_hasher context_hasher; hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT); - blake3_hasher_update(&context_hasher, context, context_len); + fd_blake3_hasher_update(&context_hasher, context, context_len); uint8_t context_key[BLAKE3_KEY_LEN]; - blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN); + fd_blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN); uint32_t context_key_words[8]; load_key_words(context_key, context_key_words); hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL); } -void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) { - blake3_hasher_init_derive_key_raw(self, context, strlen(context)); +void fd_blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) { + fd_blake3_hasher_init_derive_key_raw(self, context, strlen(context)); } // As described in hasher_push_cv() below, we do "lazy merging", delaying @@ -470,8 +470,8 @@ INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN], #pragma GCC diagnostic pop } -void blake3_hasher_update(blake3_hasher *self, const void *input, - size_t input_len) { +void fd_blake3_hasher_update(blake3_hasher *self, const void *input, + size_t input_len) { // Explicitly checking for zero avoids causing UB by passing a null pointer // to fd_memcpy. This comes up in practice with things like: // std::vector v; @@ -579,13 +579,13 @@ void blake3_hasher_update(blake3_hasher *self, const void *input, } } -void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, - size_t out_len) { - blake3_hasher_finalize_seek(self, 0, out, out_len); +void fd_blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, + size_t out_len) { + fd_blake3_hasher_finalize_seek(self, 0, out, out_len); } -void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, - uint8_t *out, size_t out_len) { +void fd_blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, + uint8_t *out, size_t out_len) { // Explicitly checking for zero avoids causing UB by passing a null pointer // to fd_memcpy. This comes up in practice with things like: // std::vector v; @@ -631,7 +631,7 @@ void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, output_root_bytes(&output, seek, out, out_len); } -void blake3_hasher_reset(blake3_hasher *self) { +void fd_blake3_hasher_reset(blake3_hasher *self) { chunk_state_reset(&self->chunk, self->key, 0); self->cv_stack_len = 0; } diff --git a/src/ballet/blake3/blake3.h b/src/ballet/blake3/blake3.h index 3d0ca76c6d..fcaee5b9a1 100644 --- a/src/ballet/blake3/blake3.h +++ b/src/ballet/blake3/blake3.h @@ -41,20 +41,20 @@ typedef struct { uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; } blake3_hasher; -const char *blake3_version(void); -void blake3_hasher_init(blake3_hasher *self); -void blake3_hasher_init_keyed(blake3_hasher *self, - const uint8_t key[BLAKE3_KEY_LEN]); -void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context); -void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, - size_t context_len); -void blake3_hasher_update(blake3_hasher *self, const void *input, - size_t input_len); -void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, - size_t out_len); -void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, - uint8_t *out, size_t out_len); -void blake3_hasher_reset(blake3_hasher *self); +const char *fd_blake3_version(void); +void fd_blake3_hasher_init(blake3_hasher *self); +void fd_blake3_hasher_init_keyed(blake3_hasher *self, + const uint8_t key[BLAKE3_KEY_LEN]); +void fd_blake3_hasher_init_derive_key(blake3_hasher *self, const char *context); +void fd_blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, + size_t context_len); +void fd_blake3_hasher_update(blake3_hasher *self, const void *input, + size_t input_len); +void fd_blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, + size_t out_len); +void fd_blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, + uint8_t *out, size_t out_len); +void fd_blake3_hasher_reset(blake3_hasher *self); #ifdef __cplusplus } diff --git a/src/ballet/blake3/blake3_avx2.c b/src/ballet/blake3/blake3_avx2.c index eae0bc8fae..b25901d1d8 100644 --- a/src/ballet/blake3/blake3_avx2.c +++ b/src/ballet/blake3/blake3_avx2.c @@ -232,10 +232,10 @@ INLINE void load_counters(uint64_t counter, bool increment_counter, } static -void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { +void fd_blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { __m256i h_vecs[8] = { set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), @@ -291,27 +291,27 @@ void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks, } #if FD_HAS_AVX -void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#else -void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, +void fd_blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); +#else +void fd_blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); #endif /* FD_HAS_AVX */ -void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) { +void fd_blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { while (num_inputs >= DEGREE) { - blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); + fd_blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); if (increment_counter) { counter += DEGREE; } @@ -320,11 +320,11 @@ void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, out = &out[DEGREE * BLAKE3_OUT_LEN]; } #if FD_HAS_AVX - blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, out); + fd_blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); #else - blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, - out); + fd_blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); #endif } diff --git a/src/ballet/blake3/blake3_avx512.c b/src/ballet/blake3/blake3_avx512.c index 71bedc6b47..d634687d69 100644 --- a/src/ballet/blake3/blake3_avx512.c +++ b/src/ballet/blake3/blake3_avx512.c @@ -288,10 +288,10 @@ INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], undiagonalize(&rows[0], &rows[2], &rows[3]); } -void blake3_compress_xof_avx512(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]) { +void fd_blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]) { __m128i rows[4]; compress_pre(rows, cv, block, block_len, counter, flags); storeu_128(xor_128(rows[0], rows[2]), &out[0]); @@ -300,10 +300,10 @@ void blake3_compress_xof_avx512(const uint32_t cv[8], storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]); } -void blake3_compress_in_place_avx512(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { +void fd_blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { __m128i rows[4]; compress_pre(rows, cv, block, block_len, counter, flags); storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]); @@ -493,10 +493,10 @@ INLINE void load_counters4(uint64_t counter, bool increment_counter, } static -void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { +void fd_blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { __m128i h_vecs[8] = { set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), @@ -748,10 +748,10 @@ INLINE void load_counters8(uint64_t counter, bool increment_counter, } static -void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { +void fd_blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { __m256i h_vecs[8] = { set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]), set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]), @@ -1074,11 +1074,11 @@ INLINE void load_counters16(uint64_t counter, bool increment_counter, } static -void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, - uint8_t *out) { +void fd_blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, + uint8_t *out) { __m512i h_vecs[8] = { set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]), set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]), @@ -1167,8 +1167,8 @@ INLINE void hash_one_avx512(const uint8_t *input, size_t blocks, if (blocks == 1) { block_flags |= flags_end; } - blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter, - block_flags); + fd_blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); input = &input[BLAKE3_BLOCK_LEN]; blocks -= 1; block_flags = flags; @@ -1176,14 +1176,14 @@ INLINE void hash_one_avx512(const uint8_t *input, size_t blocks, memcpy(out, cv, BLAKE3_OUT_LEN); } -void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) { +void fd_blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { while (num_inputs >= 16) { - blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); + fd_blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); if (increment_counter) { counter += 16; } @@ -1192,8 +1192,8 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, out = &out[16 * BLAKE3_OUT_LEN]; } while (num_inputs >= 8) { - blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); + fd_blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); if (increment_counter) { counter += 8; } @@ -1202,8 +1202,8 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, out = &out[8 * BLAKE3_OUT_LEN]; } while (num_inputs >= 4) { - blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); + fd_blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); if (increment_counter) { counter += 4; } diff --git a/src/ballet/blake3/blake3_dispatch.c b/src/ballet/blake3/blake3_dispatch.c index d5d8a05072..1f0dc25538 100644 --- a/src/ballet/blake3/blake3_dispatch.c +++ b/src/ballet/blake3/blake3_dispatch.c @@ -19,62 +19,62 @@ #define MAYBE_UNUSED(x) (void)((x)) -void blake3_compress_in_place(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { +void fd_blake3_compress_in_place(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { #if FD_HAS_AVX512 - blake3_compress_in_place_avx512(cv, block, block_len, counter, flags); + fd_blake3_compress_in_place_avx512(cv, block, block_len, counter, flags); #elif FD_HAS_AVX - blake3_compress_in_place_sse41(cv, block, block_len, counter, flags); + fd_blake3_compress_in_place_sse41(cv, block, block_len, counter, flags); #elif FD_HAS_SSE - blake3_compress_in_place_sse2(cv, block, block_len, counter, flags); + fd_blake3_compress_in_place_sse2(cv, block, block_len, counter, flags); #else - blake3_compress_in_place_portable(cv, block, block_len, counter, flags); + fd_blake3_compress_in_place_portable(cv, block, block_len, counter, flags); #endif } -void blake3_compress_xof(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags, - uint8_t out[64]) { +void fd_blake3_compress_xof(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[64]) { #if FD_HAS_AVX512 - blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); + fd_blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); #elif FD_HAS_AVX - blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out); + fd_blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out); #elif FD_HAS_SSE - blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out); + fd_blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out); #else - blake3_compress_xof_portable(cv, block, block_len, counter, flags, out); + fd_blake3_compress_xof_portable(cv, block, block_len, counter, flags, out); #endif } -void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { +void fd_blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { #if FD_HAS_AVX512 - blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, - out); + fd_blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); #elif FD_HAS_AVX - blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, - out); + fd_blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); #elif FD_HAS_SSE /* TODO use sse4.1 here? */ - blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, - out); + fd_blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); #else - blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, - out); + fd_blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); #endif } // The dynamically detected SIMD degree of the current platform. -size_t blake3_simd_degree(void) { +size_t fd_blake3_simd_degree(void) { #if FD_HAS_AVX return 8; #else diff --git a/src/ballet/blake3/blake3_impl.h b/src/ballet/blake3/blake3_impl.h index 3f636c11aa..1056c798e7 100644 --- a/src/ballet/blake3/blake3_impl.h +++ b/src/ballet/blake3/blake3_impl.h @@ -147,102 +147,102 @@ INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) { store32(&bytes_out[7 * 4], cv_words[7]); } -void blake3_compress_in_place(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); +void fd_blake3_compress_in_place(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); -void blake3_compress_xof(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags, - uint8_t out[64]); +void fd_blake3_compress_xof(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[64]); -void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out); +void fd_blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); -size_t blake3_simd_degree(void); +size_t fd_blake3_simd_degree(void); // Declarations for implementation-specific functions. -void blake3_compress_in_place_portable(uint32_t cv[8], +void fd_blake3_compress_in_place_portable(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +void fd_blake3_compress_xof_portable(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]); + +void fd_blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); + +#if FD_HAS_X86 +#if FD_HAS_SSE +void fd_blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); +void fd_blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]); +void fd_blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif /* FD_HAS_SSE */ +#if FD_HAS_AVX +void fd_blake3_compress_in_place_sse41(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags); - -void blake3_compress_xof_portable(const uint32_t cv[8], +void fd_blake3_compress_xof_sse41(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]); - -void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, +void fd_blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); +void fd_blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif /* FD_HAS_AVX */ +#if FD_HAS_AVX512 +void fd_blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); -#if FD_HAS_X86 -#if FD_HAS_SSE -void blake3_compress_in_place_sse2(uint32_t cv[8], +void fd_blake3_compress_xof_avx512(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, - uint8_t flags); -void blake3_compress_xof_sse2(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]); -void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#endif /* FD_HAS_SSE */ -#if FD_HAS_AVX -void blake3_compress_in_place_sse41(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); -void blake3_compress_xof_sse41(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]); -void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -#endif /* FD_HAS_AVX */ -#if FD_HAS_AVX512 -void blake3_compress_in_place_avx512(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); - -void blake3_compress_xof_avx512(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]); - -void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); + uint8_t flags, uint8_t out[64]); + +void fd_blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); #endif /* FD_HAS_AVX512 */ #endif /* FD_HAS_X86 */ #if BLAKE3_USE_NEON == 1 -void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); +void fd_blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); #endif diff --git a/src/ballet/blake3/blake3_portable.c b/src/ballet/blake3/blake3_portable.c index 603ed7edb9..48b5ded259 100644 --- a/src/ballet/blake3/blake3_portable.c +++ b/src/ballet/blake3/blake3_portable.c @@ -84,10 +84,10 @@ INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8], round_fn(state, &block_words[0], 6); } -void blake3_compress_in_place_portable(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { +void fd_blake3_compress_in_place_portable(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { uint32_t state[16]; compress_pre(state, cv, block, block_len, counter, flags); cv[0] = state[0] ^ state[8]; @@ -100,10 +100,10 @@ void blake3_compress_in_place_portable(uint32_t cv[8], cv[7] = state[7] ^ state[15]; } -void blake3_compress_xof_portable(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]) { +void fd_blake3_compress_xof_portable(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]) { uint32_t state[16]; compress_pre(state, cv, block, block_len, counter, flags); @@ -136,8 +136,8 @@ INLINE void hash_one_portable(const uint8_t *input, size_t blocks, if (blocks == 1) { block_flags |= flags_end; } - blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, - block_flags); + fd_blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); input = &input[BLAKE3_BLOCK_LEN]; blocks -= 1; block_flags = flags; @@ -145,11 +145,11 @@ INLINE void hash_one_portable(const uint8_t *input, size_t blocks, store_cv_words(out, cv); } -void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) { +void fd_blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { while (num_inputs > 0) { hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out); diff --git a/src/ballet/blake3/blake3_sse2.c b/src/ballet/blake3/blake3_sse2.c index 6af6e9bb5f..3eda716736 100644 --- a/src/ballet/blake3/blake3_sse2.c +++ b/src/ballet/blake3/blake3_sse2.c @@ -261,20 +261,20 @@ INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], undiagonalize(&rows[0], &rows[2], &rows[3]); } -void blake3_compress_in_place_sse2(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { +void fd_blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { __m128i rows[4]; compress_pre(rows, cv, block, block_len, counter, flags); storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); } -void blake3_compress_xof_sse2(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]) { +void fd_blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]) { __m128i rows[4]; compress_pre(rows, cv, block, block_len, counter, flags); storeu(xorv(rows[0], rows[2]), &out[0]); @@ -461,10 +461,10 @@ INLINE void load_counters(uint64_t counter, bool increment_counter, } static -void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { +void fd_blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { __m128i h_vecs[8] = { set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), @@ -533,8 +533,8 @@ INLINE void hash_one_sse2(const uint8_t *input, size_t blocks, if (blocks == 1) { block_flags |= flags_end; } - blake3_compress_in_place_sse2(cv, input, BLAKE3_BLOCK_LEN, counter, - block_flags); + fd_blake3_compress_in_place_sse2(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); input = &input[BLAKE3_BLOCK_LEN]; blocks -= 1; block_flags = flags; @@ -542,14 +542,14 @@ INLINE void hash_one_sse2(const uint8_t *input, size_t blocks, memcpy(out, cv, BLAKE3_OUT_LEN); } -void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) { +void fd_blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { while (num_inputs >= DEGREE) { - blake3_hash4_sse2(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); + fd_blake3_hash4_sse2(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); if (increment_counter) { counter += DEGREE; } diff --git a/src/ballet/blake3/blake3_sse41.c b/src/ballet/blake3/blake3_sse41.c index 8d0c13868e..37412702b4 100644 --- a/src/ballet/blake3/blake3_sse41.c +++ b/src/ballet/blake3/blake3_sse41.c @@ -255,20 +255,20 @@ INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], undiagonalize(&rows[0], &rows[2], &rows[3]); } -void blake3_compress_in_place_sse41(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { +void fd_blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { __m128i rows[4]; compress_pre(rows, cv, block, block_len, counter, flags); storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); } -void blake3_compress_xof_sse41(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]) { +void fd_blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]) { __m128i rows[4]; compress_pre(rows, cv, block, block_len, counter, flags); storeu(xorv(rows[0], rows[2]), &out[0]); @@ -455,10 +455,10 @@ INLINE void load_counters(uint64_t counter, bool increment_counter, } static -void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { +void fd_blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { __m128i h_vecs[8] = { set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), @@ -527,8 +527,8 @@ INLINE void hash_one_sse41(const uint8_t *input, size_t blocks, if (blocks == 1) { block_flags |= flags_end; } - blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter, - block_flags); + fd_blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); input = &input[BLAKE3_BLOCK_LEN]; blocks -= 1; block_flags = flags; @@ -536,14 +536,14 @@ INLINE void hash_one_sse41(const uint8_t *input, size_t blocks, memcpy(out, cv, BLAKE3_OUT_LEN); } -void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) { +void fd_blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { while (num_inputs >= DEGREE) { - blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); + fd_blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); if (increment_counter) { counter += DEGREE; } diff --git a/src/ballet/blake3/fd_blake3.c b/src/ballet/blake3/fd_blake3.c index be57f7b2da..c0df9d4176 100644 --- a/src/ballet/blake3/fd_blake3.c +++ b/src/ballet/blake3/fd_blake3.c @@ -118,7 +118,7 @@ fd_blake3_delete( void * shsha ) { fd_blake3_t * fd_blake3_init( fd_blake3_t * sha ) { - blake3_hasher_init( &sha->hasher ); + fd_blake3_hasher_init( &sha->hasher ); return sha; } @@ -126,21 +126,21 @@ fd_blake3_t * fd_blake3_append( fd_blake3_t * sha, void const * data, ulong sz ) { - blake3_hasher_update( &sha->hasher, data, sz); + fd_blake3_hasher_update( &sha->hasher, data, sz); return sha; } void * fd_blake3_fini( fd_blake3_t * sha, void * hash ) { - blake3_hasher_finalize( &sha->hasher, (uchar *) hash, 32 ); + fd_blake3_hasher_finalize( &sha->hasher, (uchar *) hash, 32 ); return hash; } void * fd_blake3_fini_512( fd_blake3_t * sha, void * hash ) { - blake3_hasher_finalize( &sha->hasher, (uchar *) hash, 64 ); + fd_blake3_hasher_finalize( &sha->hasher, (uchar *) hash, 64 ); return hash; } @@ -148,6 +148,6 @@ void * fd_blake3_fini_varlen( fd_blake3_t * sha, void * hash, ulong hash_len ) { - blake3_hasher_finalize( &sha->hasher, (uchar *) hash, hash_len ); + fd_blake3_hasher_finalize( &sha->hasher, (uchar *) hash, hash_len ); return hash; }