hs1-siv: add SSE2 version of Hasher::update_block
Despite my best efforts I seem unable to get LLVM to emit vectorized
code, even though it would obviously be beneficial.

I suspect LLVM is thrown off by the 64-bit multiply, which is missing
from the SSE2 instruction set. It did take me a while to figure out that
casting each __m128i of the accumulator to [u64; 2] ends up being the
most performant way to sum its two lanes.

The SSE2 version is about 20% faster for me, so it is a substantial
improvement.

Also, inline(always) on pretty much everything is now beneficial,
whereas before it led to significant regressions. It does create a fair
bit of code bloat, though.
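
For reference, the two tricks described above look roughly like this in
isolation (condensed from the diff below; nh_step mirrors the new SSE2
helper, while lane_sum is an illustrative name for the [u64; 2] cast
that the real code does inline):

    #[cfg(target_feature = "sse2")]
    use core::arch::x86_64::*;

    // One NH step: add key and message words lane-wise (wrapping 32-bit adds),
    // then form the two widening 32x32 -> 64-bit products a*c and b*d of the
    // summed lanes. SSE2 has no packed 64-bit multiply, but _mm_mul_epu32
    // multiplies the even 32-bit lanes, so the shuffles move a/b and c/d into
    // those lanes first.
    #[cfg(target_feature = "sse2")]
    #[inline(always)]
    unsafe fn nh_step(x: &[u32; 4], y: &[u32; 4]) -> __m128i {
        let x = x.as_ptr().cast::<__m128i>().read_unaligned();
        let y = y.as_ptr().cast::<__m128i>().read_unaligned();
        let xy = _mm_add_epi32(x, y);
        let a_b = _mm_shuffle_epi32::<0b00_01_00_00>(xy); // even lanes: a, b
        let c_d = _mm_shuffle_epi32::<0b00_11_00_10>(xy); // even lanes: c, d
        _mm_mul_epu32(a_b, c_d) // two u64 lanes: [a*c, b*d]
    }

    // Summing the two u64 lanes of an accumulator vector: reinterpreting the
    // __m128i as [u64; 2] and adding in scalar code is what turned out fastest.
    #[cfg(target_feature = "sse2")]
    #[inline(always)]
    unsafe fn lane_sum(v: &__m128i) -> u64 {
        let &[ac, bd] = &*(v as *const __m128i as *const [u64; 2]);
        ac.wrapping_add(bd)
    }

In the actual change both pieces live inside update_block_sse2: the
per-chunk products are accumulated with _mm_add_epi64 and the lanes are
only summed once at the end.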
Demindiro committed Jan 24, 2025
1 parent 323b7ee commit 1ff89ad
Showing 2 changed files with 80 additions and 27 deletions.
58 changes: 31 additions & 27 deletions hs1-siv/src/hash.rs
@@ -1,9 +1,10 @@
-use super::{
-    mask, Array, ArraySize, False, Gr, Hs1HashKey, Hs1Params, PhantomData, Quot, True, B16, U4,
-};
+use super::{mask, Array, False, Gr, Hs1HashKey, Hs1Params, PhantomData, Quot, True, B16, U4};
 use aead::array::typenum::Unsigned;
 use core::mem;
 
+#[cfg(target_feature = "sse2")]
+mod sse2;
+
 #[derive(Clone)]
 pub struct Hasher<P: Hs1Params> {
     k: Hs1HashKey<P>,
@@ -53,41 +54,52 @@ impl<P: Hs1Params> Hasher<P> {
     pub fn new(k: &Hs1HashKey<P>) -> Self {
         Self {
             k: k.clone(),
-            h: array_from_iter(core::iter::repeat(1)),
-            block: Array::default(),
+            h: Array::from_fn(|_| 1),
+            block: Default::default(),
             bytes: 0,
             _marker: PhantomData,
         }
     }
 
+    #[inline(always)]
     fn update_block(&mut self) -> &mut Self {
         assert!(usize::from(self.bytes) <= self.block_u8().len());
 
+        #[cfg(target_feature = "sse2")]
+        if true {
+            // SAFETY: sse2 is supported
+            unsafe {
+                return self.update_block_sse2();
+            }
+        }
+
         #[inline(always)]
-        fn nh_step(&[ax, bx, cx, dx]: &[u32; 4], &[ay, by, cy, dy]: &[u32; 4]) -> u64 {
-            let d = u64::from(dx.wrapping_add(dy));
-            let c = u64::from(cx.wrapping_add(cy));
-            let b = u64::from(bx.wrapping_add(by));
+        fn nh_step(&[ax, bx, cx, dx]: &[u32; 4], &[ay, by, cy, dy]: &[u32; 4]) -> [u64; 2] {
             let a = u64::from(ax.wrapping_add(ay));
-            (a * c).wrapping_add(b * d)
+            let b = u64::from(bx.wrapping_add(by));
+            let c = u64::from(cx.wrapping_add(cy));
+            let d = u64::from(dx.wrapping_add(dy));
+            [a * c, b * d]
         }
 
         let m_ints = &self.block;
 
        let block16_count = usize::from(((self.bytes + 15) / 16).max(1));
 
-        let mut nh = Array::<u64, P::T>::default();
+        let mut nh = Array::<[u64; 2], P::T>::default();
         for (i0, m_ints_i) in m_ints.chunks_exact(4).enumerate().take(block16_count) {
-            for (nh_i, k_n_i_i) in nh.iter_mut().zip(self.k.nh.chunks_exact(4).skip(i0)) {
+            for ([nh_i0, nh_i1], k_n_i_i) in nh.iter_mut().zip(self.k.nh.chunks_exact(4).skip(i0)) {
                 let k_n_i_i = k_n_i_i.try_into().expect("exactly 4 elements");
                 let m_ints_i = m_ints_i.try_into().expect("exactly 4 elements");
-                let s = nh_step(k_n_i_i, m_ints_i);
-                *nh_i = nh_i.wrapping_add(s);
+                let [s0, s1] = nh_step(k_n_i_i, m_ints_i);
+                *nh_i0 = nh_i0.wrapping_add(s0);
+                *nh_i1 = nh_i1.wrapping_add(s1);
             }
         }
 
         nh.iter()
-            .map(|nh_i| (nh_i + (u64::from(self.bytes) & mask(4))) & mask(60))
+            .map(|&[ac, bd]| ac.wrapping_add(bd))
+            .map(|nh_i| (nh_i.wrapping_add(u64::from(self.bytes) & mask(4))) & mask(60))
             .zip(self.k.poly.iter())
             .zip(self.h.iter_mut())
             .for_each(|((a_i, &k_p_i), h_i)| *h_i = poly_step(*h_i, a_i, k_p_i));
@@ -97,6 +109,7 @@ impl<P: Hs1Params> Hasher<P> {
         self
     }
 
+    #[inline(always)]
     pub fn update<'a>(&'a mut self, bytes: &[u8]) -> &'a mut Self {
         assert!(usize::from(self.bytes) < self.block_u8().len());
         let start = usize::from(self.bytes);
@@ -123,6 +136,7 @@ impl<P: Hs1Params> Hasher<P> {
         self
     }
 
+    #[inline(always)]
     pub(crate) fn pad_to(&mut self, bits: u8) -> &mut Self {
         debug_assert!(1 << bits <= B16::<P>::to_u8());
         let m = mask(bits) as u8;
@@ -131,6 +145,7 @@ impl<P: Hs1Params> Hasher<P> {
     }
 
     // TODO &mut self helps avoid needing to clone(), but might be unintuitive
+    #[inline(always)]
     pub fn finalize(&mut self) -> Array<Output<P>, P::T> {
         // TODO we need to handle empty data properly
         // However, see the note in crate::test::test_vectors::hash_me_empty
@@ -146,6 +161,7 @@ impl<P: Hs1Params> Hasher<P> {
         out
     }
 
+    #[inline(always)]
     fn block_u8(&mut self) -> &mut Array<u8, B16<P>> {
         const {
             assert!(
@@ -177,18 +193,6 @@ const fn poly_finalize(a: u64) -> u64 {
     a & c
 }
 
-#[inline(always)]
-fn array_from_iter<I, L>(it: I) -> Array<I::Item, L>
-where
-    I: IntoIterator,
-    L: ArraySize,
-    I::Item: Default,
-{
-    let mut v = Array::<I::Item, L>::default();
-    v.iter_mut().zip(it).for_each(|(w, r)| *w = r);
-    v
-}
-
 #[cfg(test)]
 mod test {
     #[test]
49 changes: 49 additions & 0 deletions hs1-siv/src/hash/sse2.rs
@@ -0,0 +1,49 @@
+use super::{mask, poly_step, Array, Hasher, Hs1Params};
+use core::arch::x86_64::*;
+
+impl<P: Hs1Params> Hasher<P> {
+    #[inline(always)]
+    #[cfg(target_feature = "sse2")]
+    pub(super) unsafe fn update_block_sse2(&mut self) -> &mut Self {
+        assert!(usize::from(self.bytes) <= self.block_u8().len());
+
+        #[inline(always)]
+        unsafe fn nh_step(x: &[u32; 4], y: &[u32; 4]) -> __m128i {
+            let x = x.as_ptr().cast::<__m128i>().read_unaligned();
+            let y = y.as_ptr().cast::<__m128i>().read_unaligned();
+            let xy = _mm_add_epi32(x, y);
+
+            let a_b = _mm_shuffle_epi32::<0b00_01_00_00>(xy);
+            let c_d = _mm_shuffle_epi32::<0b00_11_00_10>(xy);
+            _mm_mul_epu32(a_b, c_d)
+        }
+
+        let m_ints = &self.block;
+
+        let block16_count = usize::from(((self.bytes + 15) / 16).max(1));
+
+        let mut nh: Array<__m128i, P::T> = Array::from_fn(|_| _mm_setzero_si128());
+        for (i0, m_ints_i) in m_ints.chunks_exact(4).enumerate().take(block16_count) {
+            for (nh_i, k_n_i_i) in nh.iter_mut().zip(self.k.nh.chunks_exact(4).skip(i0)) {
+                let k_n_i_i = k_n_i_i.try_into().expect("exactly 4 elements");
+                let m_ints_i = m_ints_i.try_into().expect("exactly 4 elements");
+                let s = nh_step(k_n_i_i, m_ints_i);
+                *nh_i = _mm_add_epi64(*nh_i, s);
+            }
+        }
+
+        nh.iter()
+            .map(|nh_i| {
+                let &[ac, bd] = &*(nh_i as *const _ as *const [u64; 2]);
+                ac.wrapping_add(bd)
+            })
+            .map(|nh_i| (nh_i.wrapping_add(u64::from(self.bytes) & mask(4))) & mask(60))
+            .zip(self.k.poly.iter())
+            .zip(self.h.iter_mut())
+            .for_each(|((a_i, &k_p_i), h_i)| *h_i = poly_step(*h_i, a_i, k_p_i));
+
+        self.bytes = 0;
+
+        self
+    }
+}
