Merge pull request #19786 from hrydgard/even-more-depth-work
DepthRaster: Better guardband rejection, parallelize triangle setup
hrydgard authored Dec 31, 2024
2 parents 0fa8865 + f85d7db commit bbc933b
Showing 4 changed files with 336 additions and 207 deletions.
130 changes: 88 additions & 42 deletions Common/Math/CrossSIMD.h
@@ -121,12 +121,6 @@ struct Vec4S32 {
void Store2(int *dst) { _mm_storel_epi64((__m128i *)dst, v); }
void StoreAligned(int *dst) { _mm_store_si128((__m128i *)dst, v);}

// Swaps the two lower elements. Useful for reversing triangles..
Vec4S32 SwapLowerElements() {
return Vec4S32{
_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 0, 1))
};
}
Vec4S32 SignBits32ToMask() {
return Vec4S32{
_mm_srai_epi32(v, 31)
@@ -137,13 +131,19 @@ struct Vec4S32 {
// On SSE2, much faster than _mm_mullo_epi32_SSE2.
// On NEON though, it'll read the full 32 bits, so beware.
// See https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/.
Vec4S32 MulAsS16(Vec4S32 other) const {
Vec4S32 Mul16(Vec4S32 other) const {
// Note that we only need to mask one of the inputs, so we get zeroes - multiplying
// by zero is zero, so it doesn't matter what the upper halfword of each 32-bit word is
// in the other register.
return Vec4S32{ _mm_madd_epi16(v, _mm_and_si128(other.v, _mm_set1_epi32(0x0000FFFF))) };
}

Vec4S32 SignExtend16() const { return Vec4S32{ _mm_srai_epi32(_mm_slli_epi32(v, 16), 16) }; }
// NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output.
Vec4S32 Min16(Vec4S32 other) const { return Vec4S32{ _mm_min_epi16(v, other.v) }; }
Vec4S32 Max16(Vec4S32 other) const { return Vec4S32{ _mm_max_epi16(v, other.v) }; }
Vec4S32 FixupAfterMinMax() const { return SignExtend16(); }

Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ _mm_add_epi32(v, other.v) }; }
Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ _mm_sub_epi32(v, other.v) }; }
Vec4S32 operator |(Vec4S32 other) const { return Vec4S32{ _mm_or_si128(v, other.v) }; }
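
For context on the 16-bit helpers above: Mul16 only guarantees the product of the low halfwords on SSE2 (it goes through _mm_madd_epi16), while the NEON version multiplies full 32-bit lanes, so callers should keep inputs within 16-bit range for identical behavior. Likewise, a chain of Min16/Max16 leaves the upper halfwords unspecified on SSE2 until FixupAfterMinMax re-sign-extends them. A minimal usage sketch with hypothetical names, not taken from this commit:

// Hedged sketch: clamp values known to fit in 16 bits, then restore valid 32-bit lanes.
// FixupAfterMinMax is SignExtend16 on SSE2 and a no-op on NEON, where Min16/Max16
// are true 32-bit operations.
inline Vec4S32 ClampSmall(Vec4S32 value, Vec4S32 lo, Vec4S32 hi) {
	Vec4S32 clamped = value.Max16(lo).Min16(hi);
	return clamped.FixupAfterMinMax();
}
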
@@ -152,6 +152,18 @@ struct Vec4S32 {
// TODO: andnot
void operator +=(Vec4S32 other) { v = _mm_add_epi32(v, other.v); }
void operator -=(Vec4S32 other) { v = _mm_sub_epi32(v, other.v); }
void operator &=(Vec4S32 other) { v = _mm_and_si128(v, other.v); }
void operator |=(Vec4S32 other) { v = _mm_or_si128(v, other.v); }
void operator ^=(Vec4S32 other) { v = _mm_xor_si128(v, other.v); }

Vec4S32 AndNot(Vec4S32 inverted) const { return Vec4S32{ _mm_andnot_si128(inverted.v, v) }; } // NOTE: with _mm_andnot, the first parameter is the one that gets inverted before the AND is performed.
Vec4S32 Mul(Vec4S32 other) const { return *this * other; }

template<int imm>
Vec4S32 Shl() const { return Vec4S32{ _mm_slli_epi32(v, imm) }; }

// NOTE: May be slow.
int operator[](size_t index) const { return ((int *)&v)[index]; }

// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; } // (ab3,ab2,ab1,ab0)
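
Since _mm_andnot_si128 inverts its first operand, the member AndNot(x) computes this & ~x on both backends. A hedged sketch (hypothetical helper, not from the commit) combining it with SignBits32ToMask:

// Hedged sketch: zero the lanes that are negative; all other lanes pass through.
inline Vec4S32 ZeroNegativeLanes(Vec4S32 a) {
	Vec4S32 negMask = a.SignBits32ToMask();  // all-ones where the lane is negative
	return a.AndNot(negMask);                // a & ~negMask
}
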
@@ -216,10 +228,14 @@ struct Vec4F32 {
void operator -=(Vec4F32 other) { v = _mm_sub_ps(v, other.v); }
void operator *=(Vec4F32 other) { v = _mm_mul_ps(v, other.v); }
void operator /=(Vec4F32 other) { v = _mm_div_ps(v, other.v); }
void operator &=(Vec4S32 other) { v = _mm_and_ps(v, _mm_castsi128_ps(other.v)); }
Vec4F32 operator *(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; }
// NOTE: May be slow.
float operator[](size_t index) const { return ((float *)&v)[index]; }

Vec4F32 Mul(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; }
Vec4F32 Recip() { return Vec4F32{ _mm_rcp_ps(v) }; }
Vec4F32 RecipApprox() const { return Vec4F32{ _mm_rcp_ps(v) }; }
Vec4F32 Recip() const { return Vec4F32{ _mm_div_ps(_mm_set1_ps(1.0f), v) }; }
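
The Recip/RecipApprox split trades speed for accuracy: Recip is now an exact divide, while RecipApprox keeps the old _mm_rcp_ps estimate (roughly 12 bits of relative precision). A hedged usage sketch with hypothetical names, not from the commit:

// Hedged sketch: a perspective divide where the fast estimate is typically good enough.
inline Vec4F32 PerspectiveDivideApprox(Vec4F32 xyzw, Vec4F32 w) {
	return xyzw * w.RecipApprox();  // switch to w.Recip() where exact results matter
}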

Vec4F32 Clamp(float lower, float higher) {
return Vec4F32{
@@ -238,13 +254,6 @@
return Vec4F32{ _mm_or_ps(_mm_and_ps(v, _mm_load_ps((const float *)mask)), _mm_load_ps((const float *)onelane3)) };
}

// Swaps the two lower elements. Useful for reversing triangles..
Vec4F32 SwapLowerElements() {
return Vec4F32{
_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 2, 0, 1))
};
}

inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
return Vec4F32{ _mm_add_ps(
_mm_add_ps(
@@ -261,6 +270,19 @@
static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
_MM_TRANSPOSE4_PS(col0.v, col1.v, col2.v, col3.v);
}

// This is here because ARM64 can do this very efficiently.
static void LoadTranspose(const float *src, Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
col0.v = _mm_loadu_ps(src);
col1.v = _mm_loadu_ps(src + 4);
col2.v = _mm_loadu_ps(src + 8);
col3.v = _mm_loadu_ps(src + 12);
_MM_TRANSPOSE4_PS(col0.v, col1.v, col2.v, col3.v);
}
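
LoadTranspose reads 16 consecutive floats and returns them column-major; the point of making it a separate entry point is that ARM64 can fuse the load and transpose into a single vld4q_f32. A hedged sketch of turning four packed XYZW vertices into SoA lanes (hypothetical names, not from the commit):

// Hedged sketch: x ends up as {v0.x, v1.x, v2.x, v3.x}, and similarly for y/z/w.
inline void LoadFourVertsSoA(const float *verts16, Vec4F32 &x, Vec4F32 &y, Vec4F32 &z, Vec4F32 &w) {
	Vec4F32::LoadTranspose(verts16, x, y, z, w);
}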

Vec4S32 CompareEq(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmpeq_ps(v, other.v)) }; }
Vec4S32 CompareLt(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmplt_ps(v, other.v)) }; }
Vec4S32 CompareGt(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmpgt_ps(v, other.v)) }; }
};
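
The new CompareEq/CompareLt/CompareGt helpers return per-lane masks as Vec4S32, which pair with the bitwise operators (including the new Vec4F32 &= Vec4S32 overload) to write branch-free selects. A hedged sketch, not from the commit:

// Hedged sketch: keep depth values only in lanes that pass a greater-than test;
// failing lanes become 0.0f because all-zero bits are positive zero.
inline Vec4F32 MaskedDepth(Vec4F32 depth, Vec4F32 threshold) {
	Vec4S32 pass = depth.CompareGt(threshold);  // all-ones where depth > threshold
	Vec4F32 result = depth;
	result &= pass;
	return result;
}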

inline Vec4S32 Vec4S32FromF32(Vec4F32 f) { return Vec4S32{ _mm_cvttps_epi32(f.v) }; }
@@ -309,6 +331,12 @@ struct Vec4U16 {
Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ _mm_max_epu16_SSE2(v, other.v) }; }
Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ _mm_min_epu16_SSE2(v, other.v) }; }
Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ _mm_cmplt_epu16(v, other.v) }; }

inline Vec4U16 AndNot(Vec4U16 inverted) {
return Vec4U16{
_mm_andnot_si128(inverted.v, v) // NOTE: with _mm_andnot, the first parameter is the one that gets inverted before the AND is performed.
};
}
};

struct Vec8U16 {
@@ -328,12 +356,6 @@ inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
};
}

inline Vec4U16 AndNot(Vec4U16 a, Vec4U16 inverted) {
return Vec4U16{
_mm_andnot_si128(inverted.v, a.v) // NOTE: with andnot, the first parameter is inverted, and then and is performed.
};
}

#elif PPSSPP_ARCH(ARM_NEON)

struct Mat4F32 {
@@ -443,23 +465,30 @@ struct Vec4S32 {
void Store2(int *dst) { vst1_s32(dst, vget_low_s32(v)); }
void StoreAligned(int *dst) { vst1q_s32(dst, v); }

// Swaps the two lower elements, but NOT the two upper ones. Useful for reversing triangles..
// This is quite awkward on ARM64 :/ Maybe there's a better solution?
Vec4S32 SwapLowerElements() {
int32x2_t upper = vget_high_s32(v);
int32x2_t lowerSwapped = vrev64_s32(vget_low_s32(v));
return Vec4S32{ vcombine_s32(lowerSwapped, upper) };
};

// Warning: Unlike on x86, this is a full 32-bit multiplication.
Vec4S32 MulAsS16(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
Vec4S32 Mul16(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }

Vec4S32 SignExtend16() const { return Vec4S32{ vshrq_n_s32(vshlq_n_s32(v, 16), 16) }; }
// NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output (on SSE2 at least).
Vec4S32 Min16(Vec4S32 other) const { return Vec4S32{ vminq_s32(v, other.v) }; }
Vec4S32 Max16(Vec4S32 other) const { return Vec4S32{ vmaxq_s32(v, other.v) }; }
Vec4S32 FixupAfterMinMax() const { return Vec4S32{ v }; }

// NOTE: May be slow.
int operator[](size_t index) const { return ((int *)&v)[index]; }

Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ vaddq_s32(v, other.v) }; }
Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ vsubq_s32(v, other.v) }; }
Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
Vec4S32 operator |(Vec4S32 other) const { return Vec4S32{ vorrq_s32(v, other.v) }; }
Vec4S32 operator &(Vec4S32 other) const { return Vec4S32{ vandq_s32(v, other.v) }; }
Vec4S32 operator ^(Vec4S32 other) const { return Vec4S32{ veorq_s32(v, other.v) }; }
Vec4S32 AndNot(Vec4S32 inverted) const { return Vec4S32{ vandq_s32(v, vmvnq_s32(inverted.v))}; }
Vec4S32 Mul(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
void operator &=(Vec4S32 other) { v = vandq_s32(v, other.v); }

template<int imm>
Vec4S32 Shl() const { return Vec4S32{ vshlq_n_s32(v, imm) }; }

void operator +=(Vec4S32 other) { v = vaddq_s32(v, other.v); }
void operator -=(Vec4S32 other) { v = vsubq_s32(v, other.v); }
@@ -508,6 +537,9 @@ struct Vec4F32 {
return Vec4F32{ vcvtq_f32_s32(other.v) };
}

// NOTE: May be slow.
float operator[](size_t index) const { return ((float *)&v)[index]; }

Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ vaddq_f32(v, other.v) }; }
Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ vsubq_f32(v, other.v) }; }
Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ vmulq_f32(v, other.v) }; }
@@ -517,19 +549,27 @@
void operator -=(Vec4F32 other) { v = vsubq_f32(v, other.v); }
void operator *=(Vec4F32 other) { v = vmulq_f32(v, other.v); }
void operator /=(Vec4F32 other) { v = vmulq_f32(v, other.Recip().v); }
void operator &=(Vec4S32 other) { v = vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(v), other.v)); }
Vec4F32 operator *(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; }

Vec4F32 Mul(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; }

Vec4F32 Recip() {
Vec4F32 Recip() const {
float32x4_t recip = vrecpeq_f32(v);
// Use a couple Newton-Raphson steps to refine the estimate.
// May be able to get away with only one refinement, not sure!
// To save one iteration at the expense of accuracy, use RecipApprox().
recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
return Vec4F32{ recip };
}

Vec4F32 RecipApprox() const {
float32x4_t recip = vrecpeq_f32(v);
// To approximately match the precision of x86-64's rcpps, do a single iteration.
recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
return Vec4F32{ recip };
}
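
For reference, vrecpsq_f32(v, x) computes 2 - v*x, so each refinement line above is one Newton-Raphson step x' = x*(2 - v*x) applied to the roughly 8-bit hardware estimate from vrecpeq_f32; two steps approach full single precision, while the single step in RecipApprox lands near the precision of x86's rcpps. A hedged scalar illustration, not part of the commit:

// Hedged scalar sketch of the refinement; 'estimate' stands in for vrecpeq_f32.
inline float NewtonRaphsonRecipStep(float v, float estimate) {
	return estimate * (2.0f - v * estimate);  // error roughly squares with each step
}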

Vec4F32 Clamp(float lower, float higher) {
return Vec4F32{
vminq_f32(vmaxq_f32(v, vdupq_n_f32(lower)), vdupq_n_f32(higher))
@@ -544,12 +584,11 @@
return Vec4F32{ vsetq_lane_f32(1.0f, v, 3) };
}

// Swaps the two lower elements, but NOT the two upper ones. Useful for reversing triangles..
// This is quite awkward on ARM64 :/ Maybe there's a better solution?
Vec4F32 SwapLowerElements() {
float32x2_t lowerSwapped = vrev64_f32(vget_low_f32(v));
return Vec4F32{ vcombine_f32(lowerSwapped, vget_high_f32(v)) };
};
Vec4S32 CompareEq(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vceqq_f32(v, other.v)) }; }
Vec4S32 CompareLt(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcltq_f32(v, other.v)) }; }
Vec4S32 CompareGt(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcgtq_f32(v, other.v)) }; }
Vec4S32 CompareLe(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcleq_f32(v, other.v)) }; }
Vec4S32 CompareGe(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcgeq_f32(v, other.v)) }; }

// One of many possible solutions. Sometimes we could also use vld4q_f32 probably..
static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
@@ -573,6 +612,15 @@
#endif
}

static void LoadTranspose(const float *src, Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
// The optimizer hopefully gets rid of the copies below.
float32x4x4_t r = vld4q_f32(src);
col0.v = r.val[0];
col1.v = r.val[1];
col2.v = r.val[2];
col3.v = r.val[3];
}

inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
#if PPSSPP_ARCH(ARM64_NEON)
float32x4_t sum = vaddq_f32(
@@ -644,6 +692,8 @@ struct Vec4U16 {
Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ vmax_u16(v, other.v) }; }
Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ vmin_u16(v, other.v) }; }
Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ vclt_u16(v, other.v) }; }

Vec4U16 AndNot(Vec4U16 inverted) { return Vec4U16{ vand_u16(v, vmvn_u16(inverted.v)) }; }
};

inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
Expand All @@ -652,10 +702,6 @@ inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
return Vec4U16{ result };
}

inline Vec4U16 AndNot(Vec4U16 a, Vec4U16 inverted) {
return Vec4U16{ vand_u16(a.v, vmvn_u16(inverted.v)) };
}

struct Vec8U16 {
uint16x8_t v;

34 changes: 17 additions & 17 deletions Common/Math/SIMDHeaders.h
@@ -88,29 +88,29 @@ static inline uint32x4_t vcgezq_f32(float32x4_t v) {
// May later figure out how to use the appropriate ones depending on compile flags.

inline __m128i _mm_mullo_epi32_SSE2(const __m128i v0, const __m128i v1) {
__m128i a13 = _mm_shuffle_epi32(v0, 0xF5); // (-,a3,-,a1)
__m128i b13 = _mm_shuffle_epi32(v1, 0xF5); // (-,b3,-,b1)
__m128i prod02 = _mm_mul_epu32(v0, v1); // (-,a2*b2,-,a0*b0)
__m128i prod13 = _mm_mul_epu32(a13, b13); // (-,a3*b3,-,a1*b1)
__m128i prod01 = _mm_unpacklo_epi32(prod02, prod13); // (-,-,a1*b1,a0*b0)
__m128i prod23 = _mm_unpackhi_epi32(prod02, prod13); // (-,-,a3*b3,a2*b2)
return _mm_unpacklo_epi64(prod01, prod23);
__m128i a13 = _mm_shuffle_epi32(v0, 0xF5); // (-,a3,-,a1)
__m128i b13 = _mm_shuffle_epi32(v1, 0xF5); // (-,b3,-,b1)
__m128i prod02 = _mm_mul_epu32(v0, v1); // (-,a2*b2,-,a0*b0)
__m128i prod13 = _mm_mul_epu32(a13, b13); // (-,a3*b3,-,a1*b1)
__m128i prod01 = _mm_unpacklo_epi32(prod02, prod13); // (-,-,a1*b1,a0*b0)
__m128i prod23 = _mm_unpackhi_epi32(prod02, prod13); // (-,-,a3*b3,a2*b2)
return _mm_unpacklo_epi64(prod01, prod23);
}
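
SSE2 has no packed 32-bit low multiply, only the widening _mm_mul_epu32 on the even lanes, so the shuffle/unpack sequence above reassembles a lane-wise low-32-bit product. As a hedged scalar reference (not part of the commit), the result it is meant to match is:

// Hedged scalar reference: keep the low 32 bits of each 32x32 product.
static inline void mullo_epi32_reference(const int32_t a[4], const int32_t b[4], int32_t out[4]) {
	for (int i = 0; i < 4; i++) {
		uint32_t low = (uint32_t)((uint64_t)(uint32_t)a[i] * (uint32_t)b[i]);  // modulo-2^32 product
		out[i] = (int32_t)low;  // reinterpret as signed (two's complement)
	}
}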

inline __m128i _mm_max_epu16_SSE2(const __m128i v0, const __m128i v1) {
return _mm_xor_si128(
_mm_max_epi16(
_mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
_mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
_mm_set1_epi16((int16_t)0x8000));
return _mm_xor_si128(
_mm_max_epi16(
_mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
_mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
_mm_set1_epi16((int16_t)0x8000));
}

inline __m128i _mm_min_epu16_SSE2(const __m128i v0, const __m128i v1) {
return _mm_xor_si128(
_mm_min_epi16(
_mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
_mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
_mm_set1_epi16((int16_t)0x8000));
return _mm_xor_si128(
_mm_min_epi16(
_mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
_mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
_mm_set1_epi16((int16_t)0x8000));
}
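
Both fallbacks use the classic bias trick: XOR with 0x8000 maps unsigned 16-bit values onto the signed range while preserving order, so the signed min/max instructions available on SSE2 can substitute for the missing unsigned ones, with a final XOR undoing the bias. A hedged scalar illustration, not part of the commit:

// Hedged scalar sketch: 0x0000 -> -32768 and 0xFFFF -> +32767, so signed max matches unsigned order.
static inline uint16_t max_epu16_scalar(uint16_t a, uint16_t b) {
	int16_t sa = (int16_t)(a ^ 0x8000);
	int16_t sb = (int16_t)(b ^ 0x8000);
	return (uint16_t)(((sa > sb) ? sa : sb) ^ 0x8000);
}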

// SSE2 replacement for half of a _mm_packus_epi32 but without the saturation.