Merge pull request #19777 from hrydgard/improve-triangle-culling

Depth raster: Improve triangle culling in the "clipping" step
hrydgard · Dec 29, 2024 · 70cddec · 70cddec
2 parents 0b82405 + ef934df
commit 70cddec
Show file tree

Hide file tree

Showing 4 changed files with 69 additions and 40 deletions.
diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
@@ -59,33 +59,6 @@ struct Mat4x3F32 {
 	__m128 data2;
 };
 
-// TODO: Check if loading b by 4s and shuffling is cheaper.
-inline Mat4F32 MulMem4x4By4x4(const float *a, Mat4F32 b) {
-	Mat4F32 result;
-
-	__m128 r_col = _mm_mul_ps(b.col0, _mm_set1_ps(a[0]));
-	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_set1_ps(a[1])));
-	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_set1_ps(a[2])));
-	result.col0 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_set1_ps(a[3])));
-
-	r_col = _mm_mul_ps(b.col0, _mm_set1_ps(a[4]));
-	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_set1_ps(a[5])));
-	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_set1_ps(a[6])));
-	result.col1 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_set1_ps(a[7])));
-
-	r_col = _mm_mul_ps(b.col0, _mm_set1_ps(a[8]));
-	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_set1_ps(a[9])));
-	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_set1_ps(a[10])));
-	result.col2 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_set1_ps(a[11])));
-
-	r_col = _mm_mul_ps(b.col0, _mm_set1_ps(a[12]));
-	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_set1_ps(a[13])));
-	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_set1_ps(a[14])));
-	result.col3 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_set1_ps(a[15])));
-
-	return result;
-}
-
 inline Mat4F32 Mul4x4By4x4(Mat4F32 a, Mat4F32 b) {
 	Mat4F32 result;
 
@@ -182,6 +155,10 @@ struct Vec4S32 {
 
 	// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
 	Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; }  // (ab3,ab2,ab1,ab0)
+
+	Vec4S32 CompareEq(Vec4S32 other) const { return Vec4S32{ _mm_cmpeq_epi32(v, other.v) }; }  // Per-lane mask: all bits set where this == other, zero elsewhere.
+	Vec4S32 CompareLt(Vec4S32 other) const { return Vec4S32{ _mm_cmplt_epi32(v, other.v) }; }  // Per-lane mask: all bits set where this < other (signed compare).
+	Vec4S32 CompareGt(Vec4S32 other) const { return Vec4S32{ _mm_cmpgt_epi32(v, other.v) }; }  // Per-lane mask: all bits set where this > other (signed compare).
 };
 
 inline bool AnyZeroSignBit(Vec4S32 value) {
@@ -233,6 +210,8 @@ struct Vec4F32 {
 	Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ _mm_add_ps(v, other.v) }; }
 	Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ _mm_sub_ps(v, other.v) }; }
 	Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ _mm_mul_ps(v, other.v) }; }
+	Vec4F32 Min(Vec4F32 other) const { return Vec4F32{ _mm_min_ps(v, other.v) }; }  // Per-lane minimum. NOTE(review): with a NaN input _mm_min_ps yields the second operand (other) - confirm this matches the NEON path.
+	Vec4F32 Max(Vec4F32 other) const { return Vec4F32{ _mm_max_ps(v, other.v) }; }  // Per-lane maximum. NOTE(review): same NaN caveat as Min.
 	void operator +=(Vec4F32 other) { v = _mm_add_ps(v, other.v); }
 	void operator -=(Vec4F32 other) { v = _mm_sub_ps(v, other.v); }
 	void operator *=(Vec4F32 other) { v = _mm_mul_ps(v, other.v); }
@@ -287,6 +266,10 @@ struct Vec4F32 {
 inline Vec4S32 Vec4S32FromF32(Vec4F32 f) { return Vec4S32{ _mm_cvttps_epi32(f.v) }; }
 inline Vec4F32 Vec4F32FromS32(Vec4S32 f) { return Vec4F32{ _mm_cvtepi32_ps(f.v) }; }
 
+inline bool AnyZeroSignBit(Vec4F32 value) {  // Returns true if at least one of the four float lanes has its sign bit clear.
+	return _mm_movemask_ps(value.v) != 0xF;  // movemask gathers the four lane sign bits into bits 0..3; 0xF means every lane has it set.
+}
+
 // Make sure the W component of scale is 1.0f.
 inline void ScaleInplace(Mat4F32 &m, Vec4F32 scale) {
 	m.col0 = _mm_mul_ps(m.col0, scale.v);
@@ -480,6 +463,11 @@ struct Vec4S32 {
 
 	void operator +=(Vec4S32 other) { v = vaddq_s32(v, other.v); }
 	void operator -=(Vec4S32 other) { v = vsubq_s32(v, other.v); }
+
+	Vec4S32 CompareEq(Vec4S32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vceqq_s32(v, other.v)) }; }  // Per-lane mask: all bits set where this == other. NEON compares return uint32x4_t, so reinterpret explicitly instead of relying on lax vector conversions.
+	Vec4S32 CompareLt(Vec4S32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcltq_s32(v, other.v)) }; }  // Per-lane mask: all bits set where this < other (signed compare).
+	Vec4S32 CompareGt(Vec4S32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcgtq_s32(v, other.v)) }; }  // Per-lane mask: all bits set where this > other (signed compare).
+	Vec4S32 CompareGtZero() const { return Vec4S32{ vreinterpretq_s32_u32(vcgtq_s32(v, vdupq_n_s32(0))) }; }  // Per-lane mask: all bits set where this > 0.
 };
 
 struct Vec4F32 {
@@ -523,6 +511,8 @@ struct Vec4F32 {
 	Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ vaddq_f32(v, other.v) }; }
 	Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ vsubq_f32(v, other.v) }; }
 	Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ vmulq_f32(v, other.v) }; }
+	Vec4F32 Min(Vec4F32 other) const { return Vec4F32{ vminq_f32(v, other.v) }; }  // Per-lane minimum. NOTE(review): vminq_f32 is documented to return NaN if either input is NaN, which may differ from the SSE path - confirm callers never pass NaN.
+	Vec4F32 Max(Vec4F32 other) const { return Vec4F32{ vmaxq_f32(v, other.v) }; }  // Per-lane maximum. NOTE(review): same NaN caveat as Min.
 	void operator +=(Vec4F32 other) { v = vaddq_f32(v, other.v); }
 	void operator -=(Vec4F32 other) { v = vsubq_f32(v, other.v); }
 	void operator *=(Vec4F32 other) { v = vmulq_f32(v, other.v); }
@@ -623,6 +613,14 @@ inline bool AnyZeroSignBit(Vec4S32 value) {
 	return (mask & 0x80000000) == 0;
 }
 
+inline bool AnyZeroSignBit(Vec4F32 value) {  // Returns true if at least one of the four float lanes has its sign bit clear.
+	int32x4_t ival = vreinterpretq_s32_f32(value.v);  // Reinterpret the float bits as integers; no numeric conversion happens.
+	int32x2_t prod = vand_s32(vget_low_s32(ival), vget_high_s32(ival));  // Bitwise AND of the low and high halves (an AND, not a product, despite the name).
+	int mask = vget_lane_s32(prod, 0) & vget_lane_s32(prod, 1);  // AND of all four original lanes.
+	return (mask & 0x80000000) == 0;  // Bit 31 survives the ANDs only if every lane had its sign bit set.
+}
+
+
 struct Vec4U16 {
 	uint16x4_t v;  // 64 bits.
 

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
@@ -99,8 +99,7 @@ struct Edge {
 enum class TriangleResult {
 	OK,
 	NoPixels,
-	Backface,
-	TooSmall,
+	SmallOrBackface,
 };
 
 constexpr int MIN_TWICE_TRI_AREA = 10;
@@ -130,16 +129,14 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor
 	int maxY = std::min(std::max(std::max(v0y, v1y), v2y), (int)scissor.y2);
 	if (maxX == minX || maxY == minY) {
 		// No pixels, or outside screen.
+		// Most of these are now gone in the initial pass.
 		return TriangleResult::NoPixels;
 	}
 
 	// TODO: Cull really small triangles here - we can increase the threshold a bit probably.
 	int triArea = (v1y - v2y) * v0x + (v2x - v1x) * v0y + (v1x * v2y - v2x * v1y);
-	if (triArea < 0) {
-		return TriangleResult::Backface;
-	}
 	if (triArea < MIN_TWICE_TRI_AREA) {
-		return TriangleResult::TooSmall;  // Or zero area.
+		return TriangleResult::SmallOrBackface;  // Or zero area.
 	}
 
 	float oneOverTriArea = 1.0f / (float)triArea;
@@ -332,8 +329,15 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 
 	int collected = 0;
 	int planeCulled = 0;
+	int boxCulled = 0;
 	const float *verts[12];  // four triangles at a time!
 	const int count = draw.vertexCount;
+
+	Vec4F32 scissorX1 = Vec4F32::Splat((float)draw.scissor.x1);
+	Vec4F32 scissorY1 = Vec4F32::Splat((float)draw.scissor.y1);
+	Vec4F32 scissorX2 = Vec4F32::Splat((float)draw.scissor.x2);
+	Vec4F32 scissorY2 = Vec4F32::Splat((float)draw.scissor.y2);
+
 	for (int i = 0; i < count; i += 3) {
 		// Collect valid triangles into buffer.
 		const float *v0 = transformed + indexBuffer[i] * 4;
@@ -397,6 +401,30 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 		y2 *= recipW2;
 		z2 = (z2 * recipW2).Clamp(0.0f, 65535.0f);
 
+		// Check bounding box size (clamped to screen edges). Cast to integer for crude rounding (and to match the rasterizer).
+		Vec4S32 minX = Vec4S32FromF32(x0.Min(x1.Min(x2)).Max(scissorX1));
+		Vec4S32 minY = Vec4S32FromF32(y0.Min(y1.Min(y2)).Max(scissorY1));
+		Vec4S32 maxX = Vec4S32FromF32(x0.Max(x1.Max(x2)).Min(scissorX2));
+		Vec4S32 maxY = Vec4S32FromF32(y0.Max(y1.Max(y2)).Min(scissorY2));
+
+		// If every one of the four triangles has an empty bounding box (min == max) in at least one dimension, none of them can cover a pixel, so the whole batch can be skipped early.
+		Vec4S32 eqMask = minX.CompareEq(maxX) | minY.CompareEq(maxY);
+		// Otherwise we just proceed to triangle setup with all four for now. Later might want to
+		// compact the remaining triangles... Or do more checking here.
+		// We could also save the computed boxes for later..
+		if (!AnyZeroSignBit(eqMask)) {
+			boxCulled += 4;
+			continue;
+		}
+
+		// Floating point triangle area. Can't be reused for the integer-snapped raster reliably (though may work...)
+		// Still good for culling early and pretty cheap to compute.
+		Vec4F32 triArea = (y1 - y2) * x0 + (x2 - x1) * y0 + (x1 * y2 - x2 * y1) - Vec4F32::Splat((float)MIN_TWICE_TRI_AREA);
+		if (!AnyZeroSignBit(triArea)) {
+			gpuStats.numDepthRasterEarlySize += 4;
+			continue;
+		}
+
 		Vec4S32FromF32(x0).Store(tx + outCount);
 		Vec4S32FromF32(x1).Store(tx + outCount + 4);
 		Vec4S32FromF32(x2).Store(tx + outCount + 8);
@@ -426,6 +454,7 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 	}
 
 	gpuStats.numDepthRasterZCulled += planeCulled;
+	gpuStats.numDepthEarlyBoxCulled += boxCulled;
 	return outCount;
 }
 
@@ -446,7 +475,7 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, const int *tx, con
 		break;
 	case GE_PRIM_TRIANGLES:
 	{
-		int stats[4]{};
+		int stats[3]{};
 		// Batches of 4 triangles, as output by the clip function.
 		for (int i = 0; i < count; i += 12) {
 			switch (draw.compareMode) {
@@ -467,9 +496,8 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, const int *tx, con
 			}
 			}
 		}
-		gpuStats.numDepthRasterBackface += stats[(int)TriangleResult::Backface];
 		gpuStats.numDepthRasterNoPixels += stats[(int)TriangleResult::NoPixels];
-		gpuStats.numDepthRasterTooSmall += stats[(int)TriangleResult::TooSmall];
+		gpuStats.numDepthRasterTooSmall += stats[(int)TriangleResult::SmallOrBackface];
 		gpuStats.numDepthRasterPrims += stats[(int)TriangleResult::OK];
 		break;
 	}

diff --git a/GPU/GPU.h b/GPU/GPU.h
@@ -111,10 +111,11 @@ struct GPUStatistics {
 		msPrepareDepth = 0.0f;
 		msRasterizeDepth = 0.0f;
 		numDepthRasterPrims = 0;
-		numDepthRasterBackface = 0;
+		numDepthRasterEarlySize = 0;
 		numDepthRasterNoPixels = 0;
 		numDepthRasterTooSmall = 0;
 		numDepthRasterZCulled = 0;
+		numDepthEarlyBoxCulled = 0;
 		vertexGPUCycles = 0;
 		otherGPUCycles = 0;
 	}
@@ -159,10 +160,11 @@ struct GPUStatistics {
 	int vertexGPUCycles;
 	int otherGPUCycles;
 	int numDepthRasterPrims;
-	int numDepthRasterBackface;
+	int numDepthRasterEarlySize;
 	int numDepthRasterNoPixels;
 	int numDepthRasterTooSmall;
 	int numDepthRasterZCulled;
+	int numDepthEarlyBoxCulled;
 	// Flip count. Doesn't really belong here.
 	int numFlips;
 };

diff --git a/GPU/GPUCommonHW.cpp b/GPU/GPUCommonHW.cpp
@@ -1801,7 +1801,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
 		"replacer: tracks %d references, %d unique textures\n"
 		"Cpy: depth %d, color %d, reint %d, blend %d, self %d\n"
 		"GPU cycles: %d (%0.1f per vertex)\n"
-		"Z-rast: %0.2f/%0.2f ms, %d prim, %d nopix, %d small, %d back, %d zcull\n%s",
+		"Z-rast: %0.2f/%0.2f ms, %d prim, %d nopix, %d small, %d earlysize, %d zcull, %d box\n%s",
 		gpuStats.msProcessingDisplayLists * 1000.0f,
 		gpuStats.numDrawSyncs,
 		gpuStats.numListSyncs,
@@ -1843,8 +1843,9 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
 		gpuStats.numDepthRasterPrims,
 		gpuStats.numDepthRasterNoPixels,
 		gpuStats.numDepthRasterTooSmall,
-		gpuStats.numDepthRasterBackface,
+		gpuStats.numDepthRasterEarlySize,
 		gpuStats.numDepthRasterZCulled,
+		gpuStats.numDepthEarlyBoxCulled,
 		debugRecording_ ? "(debug-recording)" : ""
 	);
 }