Merge pull request #602 from robertknight/remove-gather-mask

Remove `SimdFloat::gather_mask` operation
robertknight · Feb 18, 2025 · 8e5ed66 · 8e5ed66
2 parents 8de5d1d + 1b80fcc
commit 8e5ed66
Show file tree

Hide file tree

Showing 6 changed files with 0 additions and 85 deletions.
diff --git a/rten-simd/src/arch.rs b/rten-simd/src/arch.rs
@@ -15,36 +15,3 @@ mod aarch64;
 #[cfg(target_arch = "wasm32")]
 #[cfg(target_feature = "simd128")]
 pub mod wasm;
-
-use crate::{Simd, SimdMask};
-
-/// Fallback implementation for [`SimdFloat::gather_mask`], for CPUs where
-/// a native gather implementation is unavailable or unusable.
-///
-/// The caller must set `LEN` to `S::LEN`.
-///
-/// # Safety
-///
-/// See notes in [`SimdFloat::gather_mask`]. In particular, `src` must point
-/// to a non-empty buffer, so that `src[0]` is valid.
-#[inline]
-unsafe fn simd_gather_mask<
-    M: SimdMask,
-    S: Simd<Mask = M>,
-    SI: Simd<Elem = i32, Mask = M>,
-    const LEN: usize,
->(
-    src: *const S::Elem,
-    offsets: SI,
-    mask: M,
-) -> S {
-    // Set offset to zero where masked out. `src` is required to point to
-    // a non-empty buffer, so index zero can be loaded as a dummy. This avoids
-    // an unpredictable branch.
-    let offsets = offsets.select(SI::zero(), mask);
-    let mut offset_array = [0; LEN];
-    offsets.store(offset_array.as_mut_ptr());
-
-    let values: [S::Elem; LEN] = std::array::from_fn(|i| *src.add(offset_array[i] as usize));
-    S::load(values.as_ptr()).select(S::zero(), mask)
-}
diff --git a/rten-simd/src/arch/aarch64.rs b/rten-simd/src/arch/aarch64.rs
@@ -292,11 +292,6 @@ impl SimdFloat for float32x4_t {
         vminq_f32(self, rhs)
     }
 
-    #[inline]
-    unsafe fn gather_mask(src: *const f32, offsets: Self::Int, mask: Self::Mask) -> Self {
-        super::simd_gather_mask::<_, _, _, { Self::LEN.unwrap() }>(src, offsets, mask)
-    }
-
     #[inline]
     unsafe fn sum(self) -> f32 {
         vaddvq_f32(self)

diff --git a/rten-simd/src/arch/scalar.rs b/rten-simd/src/arch/scalar.rs
@@ -257,15 +257,6 @@ impl SimdFloat for f32 {
         f32::min(self, rhs)
     }
 
-    #[inline]
-    unsafe fn gather_mask(ptr: *const f32, offset: i32, mask: Self::Mask) -> Self {
-        if mask {
-            *ptr.add(offset as usize)
-        } else {
-            0.
-        }
-    }
-
     #[inline]
     unsafe fn sum(self) -> f32 {
         self

diff --git a/rten-simd/src/arch/wasm.rs b/rten-simd/src/arch/wasm.rs
@@ -327,11 +327,6 @@ impl SimdFloat for v128f {
         Self(f32x4_min(self.0, rhs.0))
     }
 
-    #[inline]
-    unsafe fn gather_mask(src: *const f32, offsets: Self::Int, mask: Self::Mask) -> Self {
-        super::simd_gather_mask::<_, _, _, { Self::LEN.unwrap() }>(src, offsets, mask)
-    }
-
     #[inline]
     unsafe fn sum(self) -> f32 {
         // See https://github.com/WebAssembly/simd/issues/20.

diff --git a/rten-simd/src/arch/x86_64.rs b/rten-simd/src/arch/x86_64.rs
@@ -372,20 +372,6 @@ impl SimdFloat for __m256 {
         _mm256_min_ps(self, rhs)
     }
 
-    #[inline]
-    #[target_feature(enable = "avx2")]
-    unsafe fn gather_mask(src: *const f32, offsets: Self::Int, mask: Self::Mask) -> Self {
-        // AVX2 has a gather instruction, but we don't use it because on some
-        // Intel CPUs it is slower than regular loads due to a mitigation for
-        // the Gather Data Sampling (GDS) vulnerability.
-        //
-        // From initial testing it appears that AVX512 is not affected to the
-        // same extent, so using an emulated gather may not pay off there.
-        //
-        // See https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/gather-data-sampling.html
-        super::simd_gather_mask::<_, _, _, { Self::LEN.unwrap() }>(src, offsets, mask)
-    }
-
     #[inline]
     #[target_feature(enable = "avx2")]
     unsafe fn sum(self) -> f32 {
@@ -760,12 +746,6 @@ impl SimdFloat for __m512 {
         _mm512_min_ps(self, rhs)
     }
 
-    #[inline]
-    #[target_feature(enable = "avx512f")]
-    unsafe fn gather_mask(ptr: *const f32, offsets: Self::Int, mask: Self::Mask) -> Self {
-        _mm512_mask_i32gather_ps::<4>(Self::zero(), mask, offsets, ptr as *const u8)
-    }
-
     #[inline]
     #[target_feature(enable = "avx512f")]
     unsafe fn sum(self) -> f32 {

diff --git a/rten-simd/src/vec.rs b/rten-simd/src/vec.rs
@@ -353,19 +353,6 @@ pub trait SimdFloat: Simd<Elem = f32> {
     /// differences in results depending on the architecture.
     unsafe fn sum(self) -> f32;
 
-    /// Load `Self::LEN` values from the base memory address at `ptr` plus
-    /// offsets in `offsets`, excluding elements where `mask` is off.
-    ///
-    /// Offsets are expressed in terms of elements, not bytes. Elements of the
-    /// result are set to zero where the mask is off.
-    ///
-    /// # Safety
-    ///
-    /// All offsets in `offsets` and the offset zero must be valid for indexing
-    /// into `ptr`. The requirement for offset zero to be valid is needed on
-    /// architectures which do not have a gather instruction.
-    unsafe fn gather_mask(ptr: *const f32, offsets: Self::Int, mask: Self::Mask) -> Self;
-
     /// Reduce the elements in this vector to a single value using `f`, then
     /// return a new vector with the accumulated value broadcast to each lane.
     #[inline]