Merge pull request #19786 from hrydgard/even-more-depth-work
DepthRaster: Better guardband rejection, parallelize triangle setup
hrydgard authored Dec 31, 2024
2 parents 0fa8865 + f85d7db commit bbc933b
Showing 4 changed files with 336 additions and 207 deletions.
130 changes: 88 additions & 42 deletions Common/Math/CrossSIMD.h
@@ -121,12 +121,6 @@ struct Vec4S32 {
void Store2(int *dst) { _mm_storel_epi64((__m128i *)dst, v); }
void StoreAligned(int *dst) { _mm_store_si128((__m128i *)dst, v);}

// Swaps the two lower elements. Useful for reversing triangles..
Vec4S32 SwapLowerElements() {
return Vec4S32{
_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 0, 1))
};
}
Vec4S32 SignBits32ToMask() {
return Vec4S32{
_mm_srai_epi32(v, 31)
@@ -137,13 +131,19 @@ struct Vec4S32 {
// On SSE2, much faster than _mm_mullo_epi32_SSE2.
// On NEON though, it'll read the full 32 bits, so beware.
// See https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/.
Vec4S32 MulAsS16(Vec4S32 other) const {
Vec4S32 Mul16(Vec4S32 other) const {
// Note that we only need to mask one of the inputs, so we get zeroes - multiplying
// by zero is zero, so it doesn't matter what the upper halfword of each 32-bit word is
// in the other register.
return Vec4S32{ _mm_madd_epi16(v, _mm_and_si128(other.v, _mm_set1_epi32(0x0000FFFF))) };
}

Vec4S32 SignExtend16() const { return Vec4S32{ _mm_srai_epi32(_mm_slli_epi32(v, 16), 16) }; }
// NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output.
Vec4S32 Min16(Vec4S32 other) const { return Vec4S32{ _mm_min_epi16(v, other.v) }; }
Vec4S32 Max16(Vec4S32 other) const { return Vec4S32{ _mm_max_epi16(v, other.v) }; }
Vec4S32 FixupAfterMinMax() const { return SignExtend16(); }

Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ _mm_add_epi32(v, other.v) }; }
Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ _mm_sub_epi32(v, other.v) }; }
Vec4S32 operator |(Vec4S32 other) const { return Vec4S32{ _mm_or_si128(v, other.v) }; }
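
For context on the 16-bit helpers above: Mul16 only guarantees the product of the low halfwords on SSE2 (it goes through _mm_madd_epi16), while the NEON version multiplies full 32-bit lanes, so callers should keep inputs within 16-bit range for identical behavior. Likewise, a chain of Min16/Max16 leaves the upper halfwords unspecified on SSE2 until FixupAfterMinMax re-sign-extends them. A minimal usage sketch with hypothetical names, not taken from this commit:

// Hedged sketch: clamp values known to fit in 16 bits, then restore valid 32-bit lanes.
// FixupAfterMinMax is SignExtend16 on SSE2 and a no-op on NEON, where Min16/Max16
// are true 32-bit operations.
inline Vec4S32 ClampSmall(Vec4S32 value, Vec4S32 lo, Vec4S32 hi) {
	Vec4S32 clamped = value.Max16(lo).Min16(hi);
	return clamped.FixupAfterMinMax();
}
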
@@ -152,6 +152,18 @@ struct Vec4S32 {
// TODO: andnot
void operator +=(Vec4S32 other) { v = _mm_add_epi32(v, other.v); }
void operator -=(Vec4S32 other) { v = _mm_sub_epi32(v, other.v); }
void operator &=(Vec4S32 other) { v = _mm_and_si128(v, other.v); }
void operator |=(Vec4S32 other) { v = _mm_or_si128(v, other.v); }
void operator ^=(Vec4S32 other) { v = _mm_xor_si128(v, other.v); }

Vec4S32 AndNot(Vec4S32 inverted) const { return Vec4S32{ _mm_andnot_si128(inverted.v, v) }; } // NOTE: with _mm_andnot, the first parameter is the one that gets inverted before the AND is performed.
Vec4S32 Mul(Vec4S32 other) const { return *this * other; }

template<int imm>
Vec4S32 Shl() const { return Vec4S32{ _mm_slli_epi32(v, imm) }; }

// NOTE: May be slow.
int operator[](size_t index) const { return ((int *)&v)[index]; }

// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; } // (ab3,ab2,ab1,ab0)
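
Since _mm_andnot_si128 inverts its first operand, the member AndNot(x) computes this & ~x on both backends. A hedged sketch (hypothetical helper, not from the commit) combining it with SignBits32ToMask:

// Hedged sketch: zero the lanes that are negative; all other lanes pass through.
inline Vec4S32 ZeroNegativeLanes(Vec4S32 a) {
	Vec4S32 negMask = a.SignBits32ToMask();  // all-ones where the lane is negative
	return a.AndNot(negMask);                // a & ~negMask
}
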
@@ -216,10 +228,14 @@ struct Vec4F32 {
void operator -=(Vec4F32 other) { v = _mm_sub_ps(v, other.v); }
void operator *=(Vec4F32 other) { v = _mm_mul_ps(v, other.v); }
void operator /=(Vec4F32 other) { v = _mm_div_ps(v, other.v); }
void operator &=(Vec4S32 other) { v = _mm_and_ps(v, _mm_castsi128_ps(other.v)); }
Vec4F32 operator *(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; }
// NOTE: May be slow.
float operator[](size_t index) const { return ((float *)&v)[index]; }

Vec4F32 Mul(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; }
Vec4F32 Recip() { return Vec4F32{ _mm_rcp_ps(v) }; }
Vec4F32 RecipApprox() const { return Vec4F32{ _mm_rcp_ps(v) }; }
Vec4F32 Recip() const { return Vec4F32{ _mm_div_ps(_mm_set1_ps(1.0f), v) }; }
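
The Recip/RecipApprox split trades speed for accuracy: Recip is now an exact divide, while RecipApprox keeps the old _mm_rcp_ps estimate (roughly 12 bits of relative precision). A hedged usage sketch with hypothetical names, not from the commit:

// Hedged sketch: a perspective divide where the fast estimate is typically good enough.
inline Vec4F32 PerspectiveDivideApprox(Vec4F32 xyzw, Vec4F32 w) {
	return xyzw * w.RecipApprox();  // switch to w.Recip() where exact results matter
}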

Vec4F32 Clamp(float lower, float higher) {
return Vec4F32{
@@ -238,13 +254,6 @@
return Vec4F32{ _mm_or_ps(_mm_and_ps(v, _mm_load_ps((const float *)mask)), _mm_load_ps((const float *)onelane3)) };
}

// Swaps the two lower elements. Useful for reversing triangles..
Vec4F32 SwapLowerElements() {
return Vec4F32{
_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 2, 0, 1))
};
}

inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
return Vec4F32{ _mm_add_ps(
_mm_add_ps(
@@ -261,6 +270,19 @@
static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
_MM_TRANSPOSE4_PS(col0.v, col1.v, col2.v, col3.v);
}

// This is here because ARM64 can do this very efficiently.
static void LoadTranspose(const float *src, Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
col0.v = _mm_loadu_ps(src);
col1.v = _mm_loadu_ps(src + 4);
col2.v = _mm_loadu_ps(src + 8);
col3.v = _mm_loadu_ps(src + 12);
_MM_TRANSPOSE4_PS(col0.v, col1.v, col2.v, col3.v);
}
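
LoadTranspose reads 16 consecutive floats and returns them column-major; the point of making it a separate entry point is that ARM64 can fuse the load and transpose into a single vld4q_f32. A hedged sketch of turning four packed XYZW vertices into SoA lanes (hypothetical names, not from the commit):

// Hedged sketch: x ends up as {v0.x, v1.x, v2.x, v3.x}, and similarly for y/z/w.
inline void LoadFourVertsSoA(const float *verts16, Vec4F32 &x, Vec4F32 &y, Vec4F32 &z, Vec4F32 &w) {
	Vec4F32::LoadTranspose(verts16, x, y, z, w);
}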

Vec4S32 CompareEq(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmpeq_ps(v, other.v)) }; }
Vec4S32 CompareLt(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmplt_ps(v, other.v)) }; }
Vec4S32 CompareGt(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmpgt_ps(v, other.v)) }; }
};
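
The new CompareEq/CompareLt/CompareGt helpers return per-lane masks as Vec4S32, which pair with the bitwise operators (including the new Vec4F32 &= Vec4S32 overload) to write branch-free selects. A hedged sketch, not from the commit:

// Hedged sketch: keep depth values only in lanes that pass a greater-than test;
// failing lanes become 0.0f because all-zero bits are positive zero.
inline Vec4F32 MaskedDepth(Vec4F32 depth, Vec4F32 threshold) {
	Vec4S32 pass = depth.CompareGt(threshold);  // all-ones where depth > threshold
	Vec4F32 result = depth;
	result &= pass;
	return result;
}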

inline Vec4S32 Vec4S32FromF32(Vec4F32 f) { return Vec4S32{ _mm_cvttps_epi32(f.v) }; }
@@ -309,6 +331,12 @@ struct Vec4U16 {
Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ _mm_max_epu16_SSE2(v, other.v) }; }
Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ _mm_min_epu16_SSE2(v, other.v) }; }
Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ _mm_cmplt_epu16(v, other.v) }; }

inline Vec4U16 AndNot(Vec4U16 inverted) {
return Vec4U16{
_mm_andnot_si128(inverted.v, v) // NOTE: with _mm_andnot, the first parameter is the one that gets inverted before the AND is performed.
};
}
};

struct Vec8U16 {
@@ -328,12 +356,6 @@ inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
};
}

inline Vec4U16 AndNot(Vec4U16 a, Vec4U16 inverted) {
return Vec4U16{
_mm_andnot_si128(inverted.v, a.v) // NOTE: with andnot, the first parameter is inverted, and then and is performed.
};
}

#elif PPSSPP_ARCH(ARM_NEON)

struct Mat4F32 {
@@ -443,23 +465,30 @@ struct Vec4S32 {
void Store2(int *dst) { vst1_s32(dst, vget_low_s32(v)); }
void StoreAligned(int *dst) { vst1q_s32(dst, v); }

// Swaps the two lower elements, but NOT the two upper ones. Useful for reversing triangles..
// This is quite awkward on ARM64 :/ Maybe there's a better solution?
Vec4S32 SwapLowerElements() {
int32x2_t upper = vget_high_s32(v);
int32x2_t lowerSwapped = vrev64_s32(vget_low_s32(v));
return Vec4S32{ vcombine_s32(lowerSwapped, upper) };
};

// Warning: Unlike on x86, this is a full 32-bit multiplication.
Vec4S32 MulAsS16(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
Vec4S32 Mul16(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }

Vec4S32 SignExtend16() const { return Vec4S32{ vshrq_n_s32(vshlq_n_s32(v, 16), 16) }; }
// NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output (on SSE2 at least).
Vec4S32 Min16(Vec4S32 other) const { return Vec4S32{ vminq_s32(v, other.v) }; }
Vec4S32 Max16(Vec4S32 other) const { return Vec4S32{ vmaxq_s32(v, other.v) }; }
Vec4S32 FixupAfterMinMax() const { return Vec4S32{ v }; }

// NOTE: May be slow.
int operator[](size_t index) const { return ((int *)&v)[index]; }

Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ vaddq_s32(v, other.v) }; }
Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ vsubq_s32(v, other.v) }; }
Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
Vec4S32 operator |(Vec4S32 other) const { return Vec4S32{ vorrq_s32(v, other.v) }; }
Vec4S32 operator &(Vec4S32 other) const { return Vec4S32{ vandq_s32(v, other.v) }; }
Vec4S32 operator ^(Vec4S32 other) const { return Vec4S32{ veorq_s32(v, other.v) }; }
Vec4S32 AndNot(Vec4S32 inverted) const { return Vec4S32{ vandq_s32(v, vmvnq_s32(inverted.v))}; }
Vec4S32 Mul(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
void operator &=(Vec4S32 other) { v = vandq_s32(v, other.v); }

template<int imm>
Vec4S32 Shl() const { return Vec4S32{ vshlq_n_s32(v, imm) }; }

void operator +=(Vec4S32 other) { v = vaddq_s32(v, other.v); }
void operator -=(Vec4S32 other) { v = vsubq_s32(v, other.v); }
@@ -508,6 +537,9 @@ struct Vec4F32 {
return Vec4F32{ vcvtq_f32_s32(other.v) };
}

// NOTE: May be slow.
float operator[](size_t index) const { return ((float *)&v)[index]; }

Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ vaddq_f32(v, other.v) }; }
Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ vsubq_f32(v, other.v) }; }
Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ vmulq_f32(v, other.v) }; }
@@ -517,19 +549,27 @@
void operator -=(Vec4F32 other) { v = vsubq_f32(v, other.v); }
void operator *=(Vec4F32 other) { v = vmulq_f32(v, other.v); }
void operator /=(Vec4F32 other) { v = vmulq_f32(v, other.Recip().v); }
void operator &=(Vec4S32 other) { v = vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(v), other.v)); }
Vec4F32 operator *(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; }

Vec4F32 Mul(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; }

Vec4F32 Recip() {
Vec4F32 Recip() const {
float32x4_t recip = vrecpeq_f32(v);
// Use a couple Newton-Raphson steps to refine the estimate.
// May be able to get away with only one refinement, not sure!
// To save one iteration at the expense of accuracy, use RecipApprox().
recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
return Vec4F32{ recip };
}

Vec4F32 RecipApprox() const {
float32x4_t recip = vrecpeq_f32(v);
// To approximately match the precision of x86-64's rcpps, do a single iteration.
recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
return Vec4F32{ recip };
}
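
For reference, vrecpsq_f32(v, x) computes 2 - v*x, so each refinement line above is one Newton-Raphson step x' = x*(2 - v*x) applied to the roughly 8-bit hardware estimate from vrecpeq_f32; two steps approach full single precision, while the single step in RecipApprox lands near the precision of x86's rcpps. A hedged scalar illustration, not part of the commit:

// Hedged scalar sketch of the refinement; 'estimate' stands in for vrecpeq_f32.
inline float NewtonRaphsonRecipStep(float v, float estimate) {
	return estimate * (2.0f - v * estimate);  // error roughly squares with each step
}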

Vec4F32 Clamp(float lower, float higher) {
return Vec4F32{
vminq_f32(vmaxq_f32(v, vdupq_n_f32(lower)), vdupq_n_f32(higher))
@@ -544,12 +584,11 @@
return Vec4F32{ vsetq_lane_f32(1.0f, v, 3) };
}

// Swaps the two lower elements, but NOT the two upper ones. Useful for reversing triangles..
// This is quite awkward on ARM64 :/ Maybe there's a better solution?
Vec4F32 SwapLowerElements() {
float32x2_t lowerSwapped = vrev64_f32(vget_low_f32(v));
return Vec4F32{ vcombine_f32(lowerSwapped, vget_high_f32(v)) };
};
Vec4S32 CompareEq(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vceqq_f32(v, other.v)) }; }
Vec4S32 CompareLt(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcltq_f32(v, other.v)) }; }
Vec4S32 CompareGt(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcgtq_f32(v, other.v)) }; }
Vec4S32 CompareLe(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcleq_f32(v, other.v)) }; }
Vec4S32 CompareGe(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcgeq_f32(v, other.v)) }; }

// One of many possible solutions. Sometimes we could also use vld4q_f32 probably..
static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
@@ -573,6 +612,15 @@
#endif
}

static void LoadTranspose(const float *src, Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
// The optimizer hopefully gets rid of the copies below.
float32x4x4_t r = vld4q_f32(src);
col0.v = r.val[0];
col1.v = r.val[1];
col2.v = r.val[2];
col3.v = r.val[3];
}

inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
#if PPSSPP_ARCH(ARM64_NEON)
float32x4_t sum = vaddq_f32(
@@ -644,6 +692,8 @@ struct Vec4U16 {
Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ vmax_u16(v, other.v) }; }
Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ vmin_u16(v, other.v) }; }
Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ vclt_u16(v, other.v) }; }

Vec4U16 AndNot(Vec4U16 inverted) { return Vec4U16{ vand_u16(v, vmvn_u16(inverted.v)) }; }
};

inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
Expand All @@ -652,10 +702,6 @@ inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
return Vec4U16{ result };
}

inline Vec4U16 AndNot(Vec4U16 a, Vec4U16 inverted) {
return Vec4U16{ vand_u16(a.v, vmvn_u16(inverted.v)) };
}

struct Vec8U16 {
uint16x8_t v;

34 changes: 17 additions & 17 deletions Common/Math/SIMDHeaders.h
@@ -88,29 +88,29 @@ static inline uint32x4_t vcgezq_f32(float32x4_t v) {
// May later figure out how to use the appropriate ones depending on compile flags.

inline __m128i _mm_mullo_epi32_SSE2(const __m128i v0, const __m128i v1) {
__m128i a13 = _mm_shuffle_epi32(v0, 0xF5); // (-,a3,-,a1)
__m128i b13 = _mm_shuffle_epi32(v1, 0xF5); // (-,b3,-,b1)
__m128i prod02 = _mm_mul_epu32(v0, v1); // (-,a2*b2,-,a0*b0)
__m128i prod13 = _mm_mul_epu32(a13, b13); // (-,a3*b3,-,a1*b1)
__m128i prod01 = _mm_unpacklo_epi32(prod02, prod13); // (-,-,a1*b1,a0*b0)
__m128i prod23 = _mm_unpackhi_epi32(prod02, prod13); // (-,-,a3*b3,a2*b2)
return _mm_unpacklo_epi64(prod01, prod23);
__m128i a13 = _mm_shuffle_epi32(v0, 0xF5); // (-,a3,-,a1)
__m128i b13 = _mm_shuffle_epi32(v1, 0xF5); // (-,b3,-,b1)
__m128i prod02 = _mm_mul_epu32(v0, v1); // (-,a2*b2,-,a0*b0)
__m128i prod13 = _mm_mul_epu32(a13, b13); // (-,a3*b3,-,a1*b1)
__m128i prod01 = _mm_unpacklo_epi32(prod02, prod13); // (-,-,a1*b1,a0*b0)
__m128i prod23 = _mm_unpackhi_epi32(prod02, prod13); // (-,-,a3*b3,a2*b2)
return _mm_unpacklo_epi64(prod01, prod23);
}
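
SSE2 has no packed 32-bit low multiply, only the widening _mm_mul_epu32 on the even lanes, so the shuffle/unpack sequence above reassembles a lane-wise low-32-bit product. As a hedged scalar reference (not part of the commit), the result it is meant to match is:

// Hedged scalar reference: keep the low 32 bits of each 32x32 product.
static inline void mullo_epi32_reference(const int32_t a[4], const int32_t b[4], int32_t out[4]) {
	for (int i = 0; i < 4; i++) {
		uint32_t low = (uint32_t)((uint64_t)(uint32_t)a[i] * (uint32_t)b[i]);  // modulo-2^32 product
		out[i] = (int32_t)low;  // reinterpret as signed (two's complement)
	}
}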

inline __m128i _mm_max_epu16_SSE2(const __m128i v0, const __m128i v1) {
return _mm_xor_si128(
_mm_max_epi16(
_mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
_mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
_mm_set1_epi16((int16_t)0x8000));
return _mm_xor_si128(
_mm_max_epi16(
_mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
_mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
_mm_set1_epi16((int16_t)0x8000));
}

inline __m128i _mm_min_epu16_SSE2(const __m128i v0, const __m128i v1) {
return _mm_xor_si128(
_mm_min_epi16(
_mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
_mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
_mm_set1_epi16((int16_t)0x8000));
return _mm_xor_si128(
_mm_min_epi16(
_mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
_mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
_mm_set1_epi16((int16_t)0x8000));
}
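
Both fallbacks use the classic bias trick: XOR with 0x8000 maps unsigned 16-bit values onto the signed range while preserving order, so the signed min/max instructions available on SSE2 can substitute for the missing unsigned ones, with a final XOR undoing the bias. A hedged scalar illustration, not part of the commit:

// Hedged scalar sketch: 0x0000 -> -32768 and 0xFFFF -> +32767, so signed max matches unsigned order.
static inline uint16_t max_epu16_scalar(uint16_t a, uint16_t b) {
	int16_t sa = (int16_t)(a ^ 0x8000);
	int16_t sb = (int16_t)(b ^ 0x8000);
	return (uint16_t)(((sa > sb) ? sa : sb) ^ 0x8000);
}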

// SSE2 replacement for half of a _mm_packus_epi32 but without the saturation.