Skip to content

Commit

Permalink
Merge pull request #602 from robertknight/remove-gather-mask
Browse files Browse the repository at this point in the history
Remove `SimdFloat::gather_mask` operation
  • Loading branch information
robertknight authored Feb 18, 2025
2 parents 8de5d1d + 1b80fcc commit 8e5ed66
Show file tree
Hide file tree
Showing 6 changed files with 0 additions and 85 deletions.
33 changes: 0 additions & 33 deletions rten-simd/src/arch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,36 +15,3 @@ mod aarch64;
#[cfg(target_arch = "wasm32")]
#[cfg(target_feature = "simd128")]
pub mod wasm;

use crate::{Simd, SimdMask};

/// Fallback implementation for [`SimdFloat::gather_mask`], for CPUs where
/// a native gather implementation is unavailable or unusable.
///
/// The caller must set `LEN` to `S::LEN`.
///
/// # Safety
///
/// See notes in [`SimdFloat::gather_mask`]. In particular, `src` must point
/// to a non-empty buffer, so that `src[0]` is valid.
#[inline]
unsafe fn simd_gather_mask<
M: SimdMask,
S: Simd<Mask = M>,
SI: Simd<Elem = i32, Mask = M>,
const LEN: usize,
>(
src: *const S::Elem,
offsets: SI,
mask: M,
) -> S {
// Set offset to zero where masked out. `src` is required to point to
// a non-empty buffer, so index zero can be loaded as a dummy. This avoids
// an unpredictable branch.
let offsets = offsets.select(SI::zero(), mask);
let mut offset_array = [0; LEN];
offsets.store(offset_array.as_mut_ptr());

let values: [S::Elem; LEN] = std::array::from_fn(|i| *src.add(offset_array[i] as usize));
S::load(values.as_ptr()).select(S::zero(), mask)
}
5 changes: 0 additions & 5 deletions rten-simd/src/arch/aarch64.rs
Original file line number Diff line number Diff line change
Expand Up @@ -292,11 +292,6 @@ impl SimdFloat for float32x4_t {
vminq_f32(self, rhs)
}

#[inline]
unsafe fn gather_mask(src: *const f32, offsets: Self::Int, mask: Self::Mask) -> Self {
super::simd_gather_mask::<_, _, _, { Self::LEN.unwrap() }>(src, offsets, mask)
}

#[inline]
unsafe fn sum(self) -> f32 {
vaddvq_f32(self)
Expand Down
9 changes: 0 additions & 9 deletions rten-simd/src/arch/scalar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -257,15 +257,6 @@ impl SimdFloat for f32 {
f32::min(self, rhs)
}

#[inline]
unsafe fn gather_mask(ptr: *const f32, offset: i32, mask: Self::Mask) -> Self {
if mask {
*ptr.add(offset as usize)
} else {
0.
}
}

#[inline]
unsafe fn sum(self) -> f32 {
self
Expand Down
5 changes: 0 additions & 5 deletions rten-simd/src/arch/wasm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -327,11 +327,6 @@ impl SimdFloat for v128f {
Self(f32x4_min(self.0, rhs.0))
}

#[inline]
unsafe fn gather_mask(src: *const f32, offsets: Self::Int, mask: Self::Mask) -> Self {
super::simd_gather_mask::<_, _, _, { Self::LEN.unwrap() }>(src, offsets, mask)
}

#[inline]
unsafe fn sum(self) -> f32 {
// See https://github.com/WebAssembly/simd/issues/20.
Expand Down
20 changes: 0 additions & 20 deletions rten-simd/src/arch/x86_64.rs
Original file line number Diff line number Diff line change
Expand Up @@ -372,20 +372,6 @@ impl SimdFloat for __m256 {
_mm256_min_ps(self, rhs)
}

#[inline]
#[target_feature(enable = "avx2")]
unsafe fn gather_mask(src: *const f32, offsets: Self::Int, mask: Self::Mask) -> Self {
// AVX2 has a gather instruction, but we don't use it because on some
// Intel CPUs it is slower than regular loads due to a mitigation for
// the Gather Data Sampling (GDS) vulnerability.
//
// From initial testing it appears that AVX512 is not affected to the
// same extent, so using an emulated gather may not pay off there.
//
// See https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/gather-data-sampling.html
super::simd_gather_mask::<_, _, _, { Self::LEN.unwrap() }>(src, offsets, mask)
}

#[inline]
#[target_feature(enable = "avx2")]
unsafe fn sum(self) -> f32 {
Expand Down Expand Up @@ -760,12 +746,6 @@ impl SimdFloat for __m512 {
_mm512_min_ps(self, rhs)
}

#[inline]
#[target_feature(enable = "avx512f")]
unsafe fn gather_mask(ptr: *const f32, offsets: Self::Int, mask: Self::Mask) -> Self {
_mm512_mask_i32gather_ps::<4>(Self::zero(), mask, offsets, ptr as *const u8)
}

#[inline]
#[target_feature(enable = "avx512f")]
unsafe fn sum(self) -> f32 {
Expand Down
13 changes: 0 additions & 13 deletions rten-simd/src/vec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -353,19 +353,6 @@ pub trait SimdFloat: Simd<Elem = f32> {
/// differences in results depending on the architecture.
unsafe fn sum(self) -> f32;

/// Load `Self::LEN` values from the base memory address at `ptr` plus
/// offsets in `offsets`, excluding elements where `mask` is off.
///
/// Offsets are expressed in terms of elements, not bytes. Elements of the
/// result are set to zero where the mask is off.
///
/// # Safety
///
/// All offsets in `offsets` and the offset zero must be valid for indexing
/// into `ptr`. The requirement for offset zero to be valid is needed on
/// architectures which do not have a gather instruction.
unsafe fn gather_mask(ptr: *const f32, offsets: Self::Int, mask: Self::Mask) -> Self;

/// Reduce the elements in this vector to a single value using `f`, then
/// return a new vector with the accumulated value broadcast to each lane.
#[inline]
Expand Down

0 comments on commit 8e5ed66

Please sign in to comment.