From f70889ad49f27871188153a83c6363270396e16e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Sun, 29 Dec 2024 13:42:25 +0100
Subject: [PATCH 01/15] Better triangle area calculation, thanks fp64 for the
 reminder

---
 GPU/Common/DepthRaster.cpp | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index b464e3012bfe..f68ac6f89b93 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -132,19 +132,19 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor
 	// are slow on SSE2.
 
 	// NOTE: Triangles are stored in groups of 4.
-	int v0x = tx[0];
-	int v0y = ty[0];
-	int v1x = tx[4];
-	int v1y = ty[4];
-	int v2x = tx[8];
-	int v2y = ty[8];
+	int x0 = tx[0];
+	int y0 = ty[0];
+	int x1 = tx[4];
+	int y1 = ty[4];
+	int x2 = tx[8];
+	int y2 = ty[8];
 
 	// use fixed-point only for X and Y.  Avoid work for Z and W.
 	// We use 4x1 tiles for simplicity.
-	int minX = std::max(std::min(std::min(v0x, v1x), v2x), (int)scissor.x1) & ~3;
-	int maxX = std::min(std::max(std::max(v0x, v1x), v2x) + 3, (int)scissor.x2) & ~3;
-	int minY = std::max(std::min(std::min(v0y, v1y), v2y), (int)scissor.y1);
-	int maxY = std::min(std::max(std::max(v0y, v1y), v2y), (int)scissor.y2);
+	int minX = std::max(std::min(std::min(x0, x1), x2), (int)scissor.x1) & ~3;
+	int maxX = std::min(std::max(std::max(x0, x1), x2) + 3, (int)scissor.x2) & ~3;
+	int minY = std::max(std::min(std::min(y0, y1), y2), (int)scissor.y1);
+	int maxY = std::min(std::max(std::max(y0, y1), y2), (int)scissor.y2);
 	if (maxX == minX || maxY == minY) {
 		// No pixels, or outside screen.
 		// Most of these are now gone in the initial pass.
@@ -152,7 +152,7 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor
 	}
 
 	// TODO: Cull really small triangles here - we can increase the threshold a bit probably.
-	int triArea = (v1y - v2y) * v0x + (v2x - v1x) * v0y + (v1x * v2y - v2x * v1y);
+	int triArea = (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0);
 	if (triArea < MIN_TWICE_TRI_AREA) {
 		return TriangleResult::SmallOrBackface;  // Or zero area.
 	}
@@ -161,9 +161,9 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor
 
 	Edge e01, e12, e20;
 
-	Vec4S32 w0_row = e12.init(v1x, v1y, v2x, v2y, minX, minY);
-	Vec4S32 w1_row = e20.init(v2x, v2y, v0x, v0y, minX, minY);
-	Vec4S32 w2_row = e01.init(v0x, v0y, v1x, v1y, minX, minY);
+	Vec4S32 w0_row = e12.init(x1, y1, x2, y2, minX, minY);
+	Vec4S32 w1_row = e20.init(x2, y2, x0, y0, minX, minY);
+	Vec4S32 w2_row = e01.init(x0, y0, x1, y1, minX, minY);
 
 	// Prepare to interpolate Z
 	Vec4F32 zz0 = Vec4F32::Splat(tz[0]);
@@ -435,10 +435,10 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 			continue;
 		}
 
-		// Floating point triangle area. Can't be reused for the integer-snapped raster reliably (though may work...)
+		// Floating point double triangle area. Can't be reused for the integer-snapped raster reliably (though may work...)
 		// Still good for culling early and pretty cheap to compute.
-		Vec4F32 triArea = (y1 - y2) * x0 + (x2 - x1) * y0 + (x1 * y2 - x2 * y1) - Vec4F32::Splat((float)MIN_TWICE_TRI_AREA);
-		if (!AnyZeroSignBit(triArea)) {
+		Vec4F32 doubleTriArea = (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0) - Vec4F32::Splat((float)MIN_TWICE_TRI_AREA);
+		if (!AnyZeroSignBit(doubleTriArea)) {
 			gpuStats.numDepthRasterEarlySize += 4;
 			continue;
 		}

From 1195c630c3023ebc78784246386f27fe033cf8d4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Sun, 29 Dec 2024 14:01:13 +0100
Subject: [PATCH 02/15] Some variable renaming

---
 GPU/Common/DepthRaster.cpp | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index f68ac6f89b93..afd2718a4921 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -85,29 +85,29 @@ static void DepthRasterRect(uint16_t *dest, int stride, const DepthScissor sciss
 	}
 }
 
-alignas(16) static const int zero123[4]  = {0, 1, 2, 3};
+alignas(16) static const int zero123[4] = {0, 1, 2, 3};
+
+constexpr int stepXSize = 4;
+constexpr int stepYSize = 1;
 
 struct Edge {
 	// Dimensions of our pixel group
-	static const int stepXSize = 4;
-	static const int stepYSize = 1;
-
 	Vec4S32 oneStepX;
 	Vec4S32 oneStepY;
 
-	Vec4S32 init(int v0x, int v0y, int v1x, int v1y, int p0x, int p0y) {
+	Vec4S32 init(int xa, int ya, int xb, int yb, int originX, int originY) {
 		// Edge setup
-		int A = v0y - v1y;
-		int B = v1x - v0x;
-		int C = v0x * v1y - v0y * v1x;
+		int A = ya - yb;
+		int B = xb - xa;
+		int C = xa * yb - ya * xb;
 
 		// Step deltas
 		oneStepX = Vec4S32::Splat(A * stepXSize);
 		oneStepY = Vec4S32::Splat(B * stepYSize);
 
 		// x/y values for initial pixel block. Add horizontal offsets.
-		Vec4S32 x = Vec4S32::Splat(p0x) + Vec4S32::LoadAligned(zero123);
-		Vec4S32 y = Vec4S32::Splat(p0y);
+		Vec4S32 x = Vec4S32::Splat(originX) + Vec4S32::LoadAligned(zero123);
+		Vec4S32 y = Vec4S32::Splat(originY);
 
 		// Edge function values at origin
 		return Vec4S32::Splat(A) * x + Vec4S32::Splat(B) * y + Vec4S32::Splat(C);
@@ -175,7 +175,7 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor
 	Vec4F32 zrow = zz0 + Vec4F32FromS32(w1_row) * zz1 + Vec4F32FromS32(w2_row) * zz2;
 
 	// Rasterize
-	for (int y = minY; y <= maxY; y += Edge::stepYSize, w0_row += e12.oneStepY, w1_row += e20.oneStepY, w2_row += e01.oneStepY, zrow += zdeltaY) {
+	for (int y = minY; y <= maxY; y += stepYSize, w0_row += e12.oneStepY, w1_row += e20.oneStepY, w2_row += e01.oneStepY, zrow += zdeltaY) {
 		// Barycentric coordinates at start of row
 		Vec4S32 w0 = w0_row;
 		Vec4S32 w1 = w1_row;
@@ -184,7 +184,7 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor
 
 		uint16_t *rowPtr = depthBuf + stride * y;
 
-		for (int x = minX; x <= maxX; x += Edge::stepXSize, w0 += e12.oneStepX, w1 += e20.oneStepX, w2 += e01.oneStepX, zs += zdeltaX) {
+		for (int x = minX; x <= maxX; x += stepXSize, w0 += e12.oneStepX, w1 += e20.oneStepX, w2 += e01.oneStepX, zs += zdeltaX) {
 			// If p is on or inside all edges for any pixels,
 			// render those pixels.
 			Vec4S32 signCalc = w0 | w1 | w2;

From 69b35e914690d51a06fdf8979e33d1e0996aa7aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Sun, 29 Dec 2024 14:10:08 +0100
Subject: [PATCH 03/15] Inline edge calculations

---
 GPU/Common/DepthRaster.cpp | 80 +++++++++++++++++++++++---------------
 1 file changed, 48 insertions(+), 32 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index afd2718a4921..b9a94afcf739 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -90,30 +90,6 @@ alignas(16) static const int zero123[4] = {0, 1, 2, 3};
 constexpr int stepXSize = 4;
 constexpr int stepYSize = 1;
 
-struct Edge {
-	// Dimensions of our pixel group
-	Vec4S32 oneStepX;
-	Vec4S32 oneStepY;
-
-	Vec4S32 init(int xa, int ya, int xb, int yb, int originX, int originY) {
-		// Edge setup
-		int A = ya - yb;
-		int B = xb - xa;
-		int C = xa * yb - ya * xb;
-
-		// Step deltas
-		oneStepX = Vec4S32::Splat(A * stepXSize);
-		oneStepY = Vec4S32::Splat(B * stepYSize);
-
-		// x/y values for initial pixel block. Add horizontal offsets.
-		Vec4S32 x = Vec4S32::Splat(originX) + Vec4S32::LoadAligned(zero123);
-		Vec4S32 y = Vec4S32::Splat(originY);
-
-		// Edge function values at origin
-		return Vec4S32::Splat(A) * x + Vec4S32::Splat(B) * y + Vec4S32::Splat(C);
-	}
-};
-
 enum class TriangleResult {
 	OK,
 	NoPixels,
@@ -159,23 +135,63 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor
 
 	float oneOverTriArea = 1.0f / (float)triArea;
 
-	Edge e01, e12, e20;
+	// Edge setup
+	int A12 = y1 - y2;
+	int B12 = x2 - x1;
+	int C12 = x1 * y2 - y1 * x2;
+
+	// Step deltas
+	Vec4S32 oneStepX12 = Vec4S32::Splat(A12 * stepXSize);
+	Vec4S32 oneStepY12 = Vec4S32::Splat(B12 * stepYSize);
+
+	// x/y values for initial pixel block. Add horizontal offsets.
+	Vec4S32 x12 = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123);
+	Vec4S32 y12 = Vec4S32::Splat(minY);
+
+	// Edge function values at origin
+
+	// Edge setup
+	int A20 = y2 - y0;
+	int B20 = x0 - x2;
+	int C20 = x2 * y0 - y2 * x0;
+
+	// Step deltas
+	Vec4S32 oneStepX20 = Vec4S32::Splat(A20 * stepXSize);
+	Vec4S32 oneStepY20 = Vec4S32::Splat(B20 * stepYSize);
+
+	// x/y values for initial pixel block. Add horizontal offsets.
+	Vec4S32 x20 = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123);
+	Vec4S32 y20 = Vec4S32::Splat(minY);
+
+	// Edge setup
+	int A01 = y0 - y1;
+	int B01 = x1 - x0;
+	int C01 = x0 * y1 - y0 * x1;
+
+	// Step deltas
+	Vec4S32 oneStepX01 = Vec4S32::Splat(A01 * stepXSize);
+	Vec4S32 oneStepY01 = Vec4S32::Splat(B01 * stepYSize);
+
+	// x/y values for initial pixel block. Add horizontal offsets.
+	Vec4S32 x01 = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123);
+	Vec4S32 y01 = Vec4S32::Splat(minY);
 
-	Vec4S32 w0_row = e12.init(x1, y1, x2, y2, minX, minY);
-	Vec4S32 w1_row = e20.init(x2, y2, x0, y0, minX, minY);
-	Vec4S32 w2_row = e01.init(x0, y0, x1, y1, minX, minY);
+	// Edge function values at origin
+	Vec4S32 w0_row = Vec4S32::Splat(A12) * x12 + Vec4S32::Splat(B12) * y12 + Vec4S32::Splat(C12);
+	Vec4S32 w1_row = Vec4S32::Splat(A20) * x20 + Vec4S32::Splat(B20) * y20 + Vec4S32::Splat(C20);
+	Vec4S32 w2_row = Vec4S32::Splat(A01) * x01 + Vec4S32::Splat(B01) * y01 + Vec4S32::Splat(C01);
 
 	// Prepare to interpolate Z
 	Vec4F32 zz0 = Vec4F32::Splat(tz[0]);
 	Vec4F32 zz1 = Vec4F32::Splat((tz[4] - tz[0]) * oneOverTriArea);
 	Vec4F32 zz2 = Vec4F32::Splat((tz[8] - tz[0]) * oneOverTriArea);
 
-	Vec4F32 zdeltaX = zz1 * Vec4F32FromS32(e20.oneStepX) + zz2 * Vec4F32FromS32(e01.oneStepX);
-	Vec4F32 zdeltaY = zz1 * Vec4F32FromS32(e20.oneStepY) + zz2 * Vec4F32FromS32(e01.oneStepY);
+	Vec4F32 zdeltaX = zz1 * Vec4F32FromS32(oneStepX20) + zz2 * Vec4F32FromS32(oneStepX01);
+	Vec4F32 zdeltaY = zz1 * Vec4F32FromS32(oneStepY20) + zz2 * Vec4F32FromS32(oneStepY01);
 	Vec4F32 zrow = zz0 + Vec4F32FromS32(w1_row) * zz1 + Vec4F32FromS32(w2_row) * zz2;
 
 	// Rasterize
-	for (int y = minY; y <= maxY; y += stepYSize, w0_row += e12.oneStepY, w1_row += e20.oneStepY, w2_row += e01.oneStepY, zrow += zdeltaY) {
+	for (int y = minY; y <= maxY; y += stepYSize, w0_row += oneStepY12, w1_row += oneStepY20, w2_row += oneStepY01, zrow += zdeltaY) {
 		// Barycentric coordinates at start of row
 		Vec4S32 w0 = w0_row;
 		Vec4S32 w1 = w1_row;
@@ -184,7 +200,7 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor
 
 		uint16_t *rowPtr = depthBuf + stride * y;
 
-		for (int x = minX; x <= maxX; x += stepXSize, w0 += e12.oneStepX, w1 += e20.oneStepX, w2 += e01.oneStepX, zs += zdeltaX) {
+		for (int x = minX; x <= maxX; x += stepXSize, w0 += oneStepX12, w1 += oneStepX20, w2 += oneStepX01, zs += zdeltaX) {
 			// If p is on or inside all edges for any pixels,
 			// render those pixels.
 			Vec4S32 signCalc = w0 | w1 | w2;

From d435945b7c4ce058efffdc12e3d8baee5302afc2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Sun, 29 Dec 2024 16:43:07 +0100
Subject: [PATCH 04/15] Simplify

---
 GPU/Common/DepthRaster.cpp | 55 +++++++++++++++++---------------------
 1 file changed, 25 insertions(+), 30 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index b9a94afcf739..e07246ac32f5 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -140,55 +140,50 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor
 	int B12 = x2 - x1;
 	int C12 = x1 * y2 - y1 * x2;
 
-	// Step deltas
-	Vec4S32 oneStepX12 = Vec4S32::Splat(A12 * stepXSize);
-	Vec4S32 oneStepY12 = Vec4S32::Splat(B12 * stepYSize);
-
-	// x/y values for initial pixel block. Add horizontal offsets.
-	Vec4S32 x12 = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123);
-	Vec4S32 y12 = Vec4S32::Splat(minY);
-
-	// Edge function values at origin
-
 	// Edge setup
 	int A20 = y2 - y0;
 	int B20 = x0 - x2;
 	int C20 = x2 * y0 - y2 * x0;
 
-	// Step deltas
-	Vec4S32 oneStepX20 = Vec4S32::Splat(A20 * stepXSize);
-	Vec4S32 oneStepY20 = Vec4S32::Splat(B20 * stepYSize);
-
-	// x/y values for initial pixel block. Add horizontal offsets.
-	Vec4S32 x20 = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123);
-	Vec4S32 y20 = Vec4S32::Splat(minY);
-
 	// Edge setup
 	int A01 = y0 - y1;
 	int B01 = x1 - x0;
 	int C01 = x0 * y1 - y0 * x1;
 
+	// Prepare to interpolate Z
+	float zbase = tz[0];
+	float z_20 = (tz[4] - tz[0]) * oneOverTriArea;
+	float z_01 = (tz[8] - tz[0]) * oneOverTriArea;
+
+	// Step deltas
+	Vec4S32 oneStepX12 = Vec4S32::Splat(A12 * stepXSize);
+	Vec4S32 oneStepY12 = Vec4S32::Splat(B12 * stepYSize);
+
+	// Step deltas
+	Vec4S32 oneStepX20 = Vec4S32::Splat(A20 * stepXSize);
+	Vec4S32 oneStepY20 = Vec4S32::Splat(B20 * stepYSize);
+
 	// Step deltas
 	Vec4S32 oneStepX01 = Vec4S32::Splat(A01 * stepXSize);
 	Vec4S32 oneStepY01 = Vec4S32::Splat(B01 * stepYSize);
 
 	// x/y values for initial pixel block. Add horizontal offsets.
-	Vec4S32 x01 = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123);
-	Vec4S32 y01 = Vec4S32::Splat(minY);
+	Vec4S32 initialX = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123);
+	int initialY = minY;
+
+	// Convert per-triangle values to wide registers.
 
 	// Edge function values at origin
-	Vec4S32 w0_row = Vec4S32::Splat(A12) * x12 + Vec4S32::Splat(B12) * y12 + Vec4S32::Splat(C12);
-	Vec4S32 w1_row = Vec4S32::Splat(A20) * x20 + Vec4S32::Splat(B20) * y20 + Vec4S32::Splat(C20);
-	Vec4S32 w2_row = Vec4S32::Splat(A01) * x01 + Vec4S32::Splat(B01) * y01 + Vec4S32::Splat(C01);
+	Vec4S32 w0_row = Vec4S32::Splat(A12) * initialX + Vec4S32::Splat(B12 * initialY + C12);
+	Vec4S32 w1_row = Vec4S32::Splat(A20) * initialX + Vec4S32::Splat(B20 * initialY + C20);
+	Vec4S32 w2_row = Vec4S32::Splat(A01) * initialX + Vec4S32::Splat(B01 * initialY + C01);
 
-	// Prepare to interpolate Z
-	Vec4F32 zz0 = Vec4F32::Splat(tz[0]);
-	Vec4F32 zz1 = Vec4F32::Splat((tz[4] - tz[0]) * oneOverTriArea);
-	Vec4F32 zz2 = Vec4F32::Splat((tz[8] - tz[0]) * oneOverTriArea);
+	Vec4F32 z_20_v = Vec4F32::Splat(z_20);
+	Vec4F32 z_01_v = Vec4F32::Splat(z_01);
 
-	Vec4F32 zdeltaX = zz1 * Vec4F32FromS32(oneStepX20) + zz2 * Vec4F32FromS32(oneStepX01);
-	Vec4F32 zdeltaY = zz1 * Vec4F32FromS32(oneStepY20) + zz2 * Vec4F32FromS32(oneStepY01);
-	Vec4F32 zrow = zz0 + Vec4F32FromS32(w1_row) * zz1 + Vec4F32FromS32(w2_row) * zz2;
+	Vec4F32 zdeltaX = z_20_v * Vec4F32FromS32(oneStepX20) + z_01_v * Vec4F32FromS32(oneStepX01);
+	Vec4F32 zdeltaY = z_20_v * Vec4F32FromS32(oneStepY20) + z_01_v * Vec4F32FromS32(oneStepY01);
+	Vec4F32 zrow = Vec4F32::Splat(zbase) + Vec4F32FromS32(w1_row) * z_20 + Vec4F32FromS32(w2_row) * z_01;
 
 	// Rasterize
 	for (int y = minY; y <= maxY; y += stepYSize, w0_row += oneStepY12, w1_row += oneStepY20, w2_row += oneStepY01, zrow += zdeltaY) {

From 2eed309d29224ad266411d3816f8544eb784d71b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Sun, 29 Dec 2024 16:55:04 +0100
Subject: [PATCH 05/15] Simplify more

---
 GPU/Common/DepthRaster.cpp | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index e07246ac32f5..52bfb3035347 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -156,16 +156,12 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor
 	float z_01 = (tz[8] - tz[0]) * oneOverTriArea;
 
 	// Step deltas
-	Vec4S32 oneStepX12 = Vec4S32::Splat(A12 * stepXSize);
-	Vec4S32 oneStepY12 = Vec4S32::Splat(B12 * stepYSize);
-
-	// Step deltas
-	Vec4S32 oneStepX20 = Vec4S32::Splat(A20 * stepXSize);
-	Vec4S32 oneStepY20 = Vec4S32::Splat(B20 * stepYSize);
-
-	// Step deltas
-	Vec4S32 oneStepX01 = Vec4S32::Splat(A01 * stepXSize);
-	Vec4S32 oneStepY01 = Vec4S32::Splat(B01 * stepYSize);
+	int stepX12 = A12 * stepXSize;
+	int stepY12 = B12 * stepYSize;
+	int stepX20 = A20 * stepXSize;
+	int stepY20 = B20 * stepYSize;
+	int stepX01 = A01 * stepXSize;
+	int stepY01 = B01 * stepYSize;
 
 	// x/y values for initial pixel block. Add horizontal offsets.
 	Vec4S32 initialX = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123);
@@ -178,13 +174,16 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor
 	Vec4S32 w1_row = Vec4S32::Splat(A20) * initialX + Vec4S32::Splat(B20 * initialY + C20);
 	Vec4S32 w2_row = Vec4S32::Splat(A01) * initialX + Vec4S32::Splat(B01 * initialY + C01);
 
-	Vec4F32 z_20_v = Vec4F32::Splat(z_20);
-	Vec4F32 z_01_v = Vec4F32::Splat(z_01);
-
-	Vec4F32 zdeltaX = z_20_v * Vec4F32FromS32(oneStepX20) + z_01_v * Vec4F32FromS32(oneStepX01);
-	Vec4F32 zdeltaY = z_20_v * Vec4F32FromS32(oneStepY20) + z_01_v * Vec4F32FromS32(oneStepY01);
+	Vec4F32 zdeltaX = Vec4F32::Splat(z_20 * (float)stepX20 + z_01 * (float)stepX01);
+	Vec4F32 zdeltaY = Vec4F32::Splat(z_20 * (float)stepY20 + z_01 * (float)stepY01);
 	Vec4F32 zrow = Vec4F32::Splat(zbase) + Vec4F32FromS32(w1_row) * z_20 + Vec4F32FromS32(w2_row) * z_01;
 
+	Vec4S32 oneStepX12 = Vec4S32::Splat(stepX12);
+	Vec4S32 oneStepY12 = Vec4S32::Splat(stepY12);
+	Vec4S32 oneStepX20 = Vec4S32::Splat(stepX20);
+	Vec4S32 oneStepY20 = Vec4S32::Splat(stepY20);
+	Vec4S32 oneStepX01 = Vec4S32::Splat(stepX01);
+	Vec4S32 oneStepY01 = Vec4S32::Splat(stepY01);
 	// Rasterize
 	for (int y = minY; y <= maxY; y += stepYSize, w0_row += oneStepY12, w1_row += oneStepY20, w2_row += oneStepY01, zrow += zdeltaY) {
 		// Barycentric coordinates at start of row

From 373569bf64e33977404e8b48402a0ef271f66f8d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Sun, 29 Dec 2024 17:51:44 +0100
Subject: [PATCH 06/15] More prep. Add triangle loop.

---
 GPU/Common/DepthRaster.cpp | 140 ++++++++++++++++++++-----------------
 1 file changed, 74 insertions(+), 66 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index 52bfb3035347..2eb4da83b9af 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -104,8 +104,9 @@ constexpr int MIN_TWICE_TRI_AREA = 10;
 template<ZCompareMode compareMode>
 TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) {
 	// BEGIN triangle setup. This should be done SIMD, four triangles at a time.
-	// Due to the many multiplications, we might want to do it in floating point as 32-bit integer muls
-	// are slow on SSE2.
+	// 16x16->32 multiplications are doable on SSE2, which should be all we need.
+
+	// We use 4x1 SIMD tiles for simplicity. 2x2 would be ideal but stores/loads get annoying.
 
 	// NOTE: Triangles are stored in groups of 4.
 	int x0 = tx[0];
@@ -115,12 +116,11 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor
 	int x2 = tx[8];
 	int y2 = ty[8];
 
-	// use fixed-point only for X and Y.  Avoid work for Z and W.
-	// We use 4x1 tiles for simplicity.
 	int minX = std::max(std::min(std::min(x0, x1), x2), (int)scissor.x1) & ~3;
 	int maxX = std::min(std::max(std::max(x0, x1), x2) + 3, (int)scissor.x2) & ~3;
 	int minY = std::max(std::min(std::min(y0, y1), y2), (int)scissor.y1);
 	int maxY = std::min(std::max(std::max(y0, y1), y2), (int)scissor.y2);
+
 	if (maxX == minX || maxY == minY) {
 		// No pixels, or outside screen.
 		// Most of these are now gone in the initial pass.
@@ -150,11 +150,6 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor
 	int B01 = x1 - x0;
 	int C01 = x0 * y1 - y0 * x1;
 
-	// Prepare to interpolate Z
-	float zbase = tz[0];
-	float z_20 = (tz[4] - tz[0]) * oneOverTriArea;
-	float z_01 = (tz[8] - tz[0]) * oneOverTriArea;
-
 	// Step deltas
 	int stepX12 = A12 * stepXSize;
 	int stepY12 = B12 * stepYSize;
@@ -163,67 +158,80 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor
 	int stepX01 = A01 * stepXSize;
 	int stepY01 = B01 * stepYSize;
 
-	// x/y values for initial pixel block. Add horizontal offsets.
-	Vec4S32 initialX = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123);
-	int initialY = minY;
-
-	// Convert per-triangle values to wide registers.
+	// Prepare to interpolate Z
+	float zbase = tz[0];
+	float z_20 = (tz[4] - tz[0]) * oneOverTriArea;
+	float z_01 = (tz[8] - tz[0]) * oneOverTriArea;
+	float zdx = z_20 * (float)stepX20 + z_01 * (float)stepX01;
+	float zdy = z_20 * (float)stepY20 + z_01 * (float)stepY01;
 
 	// Edge function values at origin
-	Vec4S32 w0_row = Vec4S32::Splat(A12) * initialX + Vec4S32::Splat(B12 * initialY + C12);
-	Vec4S32 w1_row = Vec4S32::Splat(A20) * initialX + Vec4S32::Splat(B20 * initialY + C20);
-	Vec4S32 w2_row = Vec4S32::Splat(A01) * initialX + Vec4S32::Splat(B01 * initialY + C01);
-
-	Vec4F32 zdeltaX = Vec4F32::Splat(z_20 * (float)stepX20 + z_01 * (float)stepX01);
-	Vec4F32 zdeltaY = Vec4F32::Splat(z_20 * (float)stepY20 + z_01 * (float)stepY01);
-	Vec4F32 zrow = Vec4F32::Splat(zbase) + Vec4F32FromS32(w1_row) * z_20 + Vec4F32FromS32(w2_row) * z_01;
-
-	Vec4S32 oneStepX12 = Vec4S32::Splat(stepX12);
-	Vec4S32 oneStepY12 = Vec4S32::Splat(stepY12);
-	Vec4S32 oneStepX20 = Vec4S32::Splat(stepX20);
-	Vec4S32 oneStepY20 = Vec4S32::Splat(stepY20);
-	Vec4S32 oneStepX01 = Vec4S32::Splat(stepX01);
-	Vec4S32 oneStepY01 = Vec4S32::Splat(stepY01);
-	// Rasterize
-	for (int y = minY; y <= maxY; y += stepYSize, w0_row += oneStepY12, w1_row += oneStepY20, w2_row += oneStepY01, zrow += zdeltaY) {
-		// Barycentric coordinates at start of row
-		Vec4S32 w0 = w0_row;
-		Vec4S32 w1 = w1_row;
-		Vec4S32 w2 = w2_row;
-		Vec4F32 zs = zrow;
-
-		uint16_t *rowPtr = depthBuf + stride * y;
-
-		for (int x = minX; x <= maxX; x += stepXSize, w0 += oneStepX12, w1 += oneStepX20, w2 += oneStepX01, zs += zdeltaX) {
-			// If p is on or inside all edges for any pixels,
-			// render those pixels.
-			Vec4S32 signCalc = w0 | w1 | w2;
-			if (!AnyZeroSignBit(signCalc)) {
-				continue;
-			}
-
-			Vec4U16 bufferValues = Vec4U16::Load(rowPtr + x);
-			Vec4U16 shortMaskInv = SignBits32ToMaskU16(signCalc);
-			// Now, the mask has 1111111 where we should preserve the contents of the depth buffer.
+	// TODO: We could SIMD the second part here.
+	for (int t = 0; t < 1; t++) {
+		// Check for bad triangle.
+		if (triArea[t] == 0) {
+			continue;
+		}
 
-			Vec4U16 shortZ = Vec4U16::FromVec4F32(zs);
+		// Convert per-triangle values to wide registers.
+		Vec4S32 initialX = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123);
+		int initialY = minY;
+
+		Vec4S32 w0_row = Vec4S32::Splat(A12) * initialX + Vec4S32::Splat(B12 * initialY + C12);
+		Vec4S32 w1_row = Vec4S32::Splat(A20) * initialX + Vec4S32::Splat(B20 * initialY + C20);
+		Vec4S32 w2_row = Vec4S32::Splat(A01) * initialX + Vec4S32::Splat(B01 * initialY + C01);
+
+		Vec4F32 zrow = Vec4F32::Splat(zbase) + Vec4F32FromS32(w1_row) * z_20 + Vec4F32FromS32(w2_row) * z_01;
+		Vec4F32 zdeltaX = Vec4F32::Splat(zdx);
+		Vec4F32 zdeltaY = Vec4F32::Splat(zdy);
+
+		Vec4S32 oneStepX12 = Vec4S32::Splat(stepX12);
+		Vec4S32 oneStepY12 = Vec4S32::Splat(stepY12);
+		Vec4S32 oneStepX20 = Vec4S32::Splat(stepX20);
+		Vec4S32 oneStepY20 = Vec4S32::Splat(stepY20);
+		Vec4S32 oneStepX01 = Vec4S32::Splat(stepX01);
+		Vec4S32 oneStepY01 = Vec4S32::Splat(stepY01);
+		// Rasterize
+		for (int y = minY; y <= maxY; y += stepYSize, w0_row += oneStepY12, w1_row += oneStepY20, w2_row += oneStepY01, zrow += zdeltaY) {
+			// Barycentric coordinates at start of row
+			Vec4S32 w0 = w0_row;
+			Vec4S32 w1 = w1_row;
+			Vec4S32 w2 = w2_row;
+			Vec4F32 zs = zrow;
+
+			uint16_t *rowPtr = depthBuf + stride * y;
+
+			for (int x = minX; x <= maxX; x += stepXSize, w0 += oneStepX12, w1 += oneStepX20, w2 += oneStepX01, zs += zdeltaX) {
+				// If p is on or inside all edges for any pixels,
+				// render those pixels.
+				Vec4S32 signCalc = w0 | w1 | w2;
+				if (!AnyZeroSignBit(signCalc)) {
+					continue;
+				}
 
-			// This switch is on a templated constant, so should collapse away.
-			switch (compareMode) {
-			case ZCompareMode::Greater:
-				// To implement the greater/greater-than comparison, we can combine mask and max.
-				// Unfortunately there's no unsigned max on SSE2, it's synthesized by xoring 0x8000 on input and output.
-				// We use AndNot to zero out Z results, before doing Max with the buffer.
-				AndNot(shortZ, shortMaskInv).Max(bufferValues).Store(rowPtr + x);
-				break;
-			case ZCompareMode::Less:  // UNTESTED
-				// This time, we OR the mask and use .Min.
-				(shortZ | shortMaskInv).Min(bufferValues).Store(rowPtr + x);
-				break;
-			case ZCompareMode::Always:  // UNTESTED
-				// This could be replaced with a vblend operation.
-				((bufferValues & shortMaskInv) | AndNot(shortZ, shortMaskInv)).Store(rowPtr + x);
-				break;
+				Vec4U16 bufferValues = Vec4U16::Load(rowPtr + x);
+				Vec4U16 shortMaskInv = SignBits32ToMaskU16(signCalc);
+				// Now, the mask has 1111111 where we should preserve the contents of the depth buffer.
+
+				Vec4U16 shortZ = Vec4U16::FromVec4F32(zs);
+
+				// This switch is on a templated constant, so should collapse away.
+				switch (compareMode) {
+				case ZCompareMode::Greater:
+					// To implement the greater/greater-than comparison, we can combine mask and max.
+					// Unfortunately there's no unsigned max on SSE2, it's synthesized by xoring 0x8000 on input and output.
+					// We use AndNot to zero out Z results, before doing Max with the buffer.
+					AndNot(shortZ, shortMaskInv).Max(bufferValues).Store(rowPtr + x);
+					break;
+				case ZCompareMode::Less:  // UNTESTED
+					// This time, we OR the mask and use .Min.
+					(shortZ | shortMaskInv).Min(bufferValues).Store(rowPtr + x);
+					break;
+				case ZCompareMode::Always:  // UNTESTED
+					// This could be replaced with a vblend operation.
+					((bufferValues & shortMaskInv) | AndNot(shortZ, shortMaskInv)).Store(rowPtr + x);
+					break;
+				}
 			}
 		}
 	}

From de09dec9d1da211afb95133ce2421f77ae537949 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Mon, 30 Dec 2024 13:01:08 +0100
Subject: [PATCH 07/15] Move branches out of triangle setup

---
 GPU/Common/DepthRaster.cpp | 46 +++++++++++++++++++++-----------------
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index 2eb4da83b9af..e54e4496694a 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -109,29 +109,23 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor
 	// We use 4x1 SIMD tiles for simplicity. 2x2 would be ideal but stores/loads get annoying.
 
 	// NOTE: Triangles are stored in groups of 4.
-	int x0 = tx[0];
-	int y0 = ty[0];
-	int x1 = tx[4];
-	int y1 = ty[4];
-	int x2 = tx[8];
-	int y2 = ty[8];
-
-	int minX = std::max(std::min(std::min(x0, x1), x2), (int)scissor.x1) & ~3;
-	int maxX = std::min(std::max(std::max(x0, x1), x2) + 3, (int)scissor.x2) & ~3;
-	int minY = std::max(std::min(std::min(y0, y1), y2), (int)scissor.y1);
-	int maxY = std::min(std::max(std::max(y0, y1), y2), (int)scissor.y2);
-
-	if (maxX == minX || maxY == minY) {
-		// No pixels, or outside screen.
-		// Most of these are now gone in the initial pass.
-		return TriangleResult::NoPixels;
-	}
+	float x0 = tx[0];
+	float y0 = ty[0];
+	float x1 = tx[4];
+	float y1 = ty[4];
+	float x2 = tx[8];
+	float y2 = ty[8];
+
+	// Load the entire scissor rect into one SIMD register.
+	// Vec4F32 scissor = Vec4F32::LoadConvertS16(&scissor.x1);
+
+	int minX = (int)std::max(std::min(std::min(x0, x1), x2), (float)scissor.x1) & ~3;
+	int maxX = (int)std::min(std::max(std::max(x0, x1), x2) + 3, (float)scissor.x2) & ~3;
+	int minY = (int)std::max(std::min(std::min(y0, y1), y2), (float)scissor.y1);
+	int maxY = (int)std::min(std::max(std::max(y0, y1), y2), (float)scissor.y2);
 
 	// TODO: Cull really small triangles here - we can increase the threshold a bit probably.
 	int triArea = (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0);
-	if (triArea < MIN_TWICE_TRI_AREA) {
-		return TriangleResult::SmallOrBackface;  // Or zero area.
-	}
 
 	float oneOverTriArea = 1.0f / (float)triArea;
 
@@ -169,10 +163,20 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor
 	// TODO: We could SIMD the second part here.
 	for (int t = 0; t < 1; t++) {
 		// Check for bad triangle.
-		if (triArea[t] == 0) {
+		if (triArea /*[t]*/ <= 0) {
 			continue;
 		}
 
+		if (maxX == minX || maxY == minY) {
+			// No pixels, or outside screen.
+			// Most of these are now gone in the initial pass.
+			return TriangleResult::NoPixels;
+		}
+
+		if (triArea < MIN_TWICE_TRI_AREA) {
+			return TriangleResult::SmallOrBackface;  // Or zero area.
+		}
+
 		// Convert per-triangle values to wide registers.
 		Vec4S32 initialX = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123);
 		int initialY = minY;

From c3ac798545e848cfc02e16a5532745e1af66a260 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Mon, 30 Dec 2024 16:46:14 +0100
Subject: [PATCH 08/15] More crosssimd

---
 Common/Math/CrossSIMD.h | 67 ++++++++++++++++++++++-------------------
 unittest/UnitTest.cpp   | 16 ++++++++++
 2 files changed, 52 insertions(+), 31 deletions(-)

diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
index a8b68ba0c127..4daddea68125 100644
--- a/Common/Math/CrossSIMD.h
+++ b/Common/Math/CrossSIMD.h
@@ -121,12 +121,6 @@ struct Vec4S32 {
 	void Store2(int *dst) { _mm_storel_epi64((__m128i *)dst, v); }
 	void StoreAligned(int *dst) { _mm_store_si128((__m128i *)dst, v);}
 
-	// Swaps the two lower elements. Useful for reversing triangles..
-	Vec4S32 SwapLowerElements() {
-		return Vec4S32{
-			_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 0, 1))
-		};
-	}
 	Vec4S32 SignBits32ToMask() {
 		return Vec4S32{
 			_mm_srai_epi32(v, 31)
@@ -144,6 +138,12 @@ struct Vec4S32 {
 		return Vec4S32{ _mm_madd_epi16(v, _mm_and_si128(other.v, _mm_set1_epi32(0x0000FFFF))) };
 	}
 
+	Vec4S32 SignExtend16() const { return Vec4S32{ _mm_srai_epi32(_mm_slli_epi32(v, 16), 16) }; }
+	// NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output.
+	Vec4S32 Min16(Vec4S32 other) const { return Vec4S32{ _mm_min_epi16(v, other.v) }; }
+	Vec4S32 Max16(Vec4S32 other) const { return Vec4S32{ _mm_max_epi16(v, other.v) }; }
+	Vec4S32 FixupAfterMinMax() const { return SignExtend16(); }
+
 	Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ _mm_add_epi32(v, other.v) }; }
 	Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ _mm_sub_epi32(v, other.v) }; }
 	Vec4S32 operator |(Vec4S32 other) const { return Vec4S32{ _mm_or_si128(v, other.v) }; }
@@ -153,6 +153,11 @@ struct Vec4S32 {
 	void operator +=(Vec4S32 other) { v = _mm_add_epi32(v, other.v); }
 	void operator -=(Vec4S32 other) { v = _mm_sub_epi32(v, other.v); }
 
+	Vec4S32 operator <<(int imm) const { return Vec4S32{ _mm_slli_epi32(v, imm) }; }
+
+	// NOTE: May be slow.
+	int operator[](size_t index) const { return ((int *)&v)[index]; }
+
 	// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
 	Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; }  // (ab3,ab2,ab1,ab0)
 
@@ -217,9 +222,12 @@ struct Vec4F32 {
 	void operator *=(Vec4F32 other) { v = _mm_mul_ps(v, other.v); }
 	void operator /=(Vec4F32 other) { v = _mm_div_ps(v, other.v); }
 	Vec4F32 operator *(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; }
+	// NOTE: May be slow.
+	float operator[](size_t index) const { return ((float *)&v)[index]; }
 
 	Vec4F32 Mul(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; }
-	Vec4F32 Recip() { return Vec4F32{ _mm_rcp_ps(v) }; }
+	Vec4F32 RecipApprox() const { return Vec4F32{ _mm_rcp_ps(v) }; }
+	Vec4F32 Recip() const { return Vec4F32{ _mm_div_ps(_mm_set1_ps(1.0f), v) }; }
 
 	Vec4F32 Clamp(float lower, float higher) {
 		return Vec4F32{
@@ -238,13 +246,6 @@ struct Vec4F32 {
 		return Vec4F32{ _mm_or_ps(_mm_and_ps(v, _mm_load_ps((const float *)mask)), _mm_load_ps((const float *)onelane3)) };
 	}
 
-	// Swaps the two lower elements. Useful for reversing triangles..
-	Vec4F32 SwapLowerElements() {
-		return Vec4F32{
-			_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 2, 0, 1))
-		};
-	}
-
 	inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
 		return Vec4F32{ _mm_add_ps(
 			_mm_add_ps(
@@ -443,17 +444,18 @@ struct Vec4S32 {
 	void Store2(int *dst) { vst1_s32(dst, vget_low_s32(v)); }
 	void StoreAligned(int *dst) { vst1q_s32(dst, v); }
 
-	// Swaps the two lower elements, but NOT the two upper ones. Useful for reversing triangles..
-	// This is quite awkward on ARM64 :/ Maybe there's a better solution?
-	Vec4S32 SwapLowerElements() {
-		int32x2_t upper = vget_high_s32(v);
-		int32x2_t lowerSwapped = vrev64_s32(vget_low_s32(v));
-		return Vec4S32{ vcombine_s32(lowerSwapped, upper) };
-	};
-
 	// Warning: Unlike on x86, this is a full 32-bit multiplication.
 	Vec4S32 MulAsS16(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
 
+	Vec4S32 SignExtend16() const { return Vec4S32{ vshrq_n_s32(vshlq_n_s32(v, 16), 16) }; }
+	// NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output (on SSE2 at least).
+	Vec4S32 Min16(Vec4S32 other) const { return Vec4S32{ vminq_s32(v, other.v) }; }
+	Vec4S32 Max16(Vec4S32 other) const { return Vec4S32{ vmaxq_s32(v, other.v) }; }
+	Vec4S32 FixupAfterMinMax() const { return Vec4S32{ v }; }
+
+	// NOTE: May be slow.
+	int operator[](size_t index) const { return ((int *)&v)[index]; }
+
 	Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ vaddq_s32(v, other.v) }; }
 	Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ vsubq_s32(v, other.v) }; }
 	Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
@@ -508,6 +510,9 @@ struct Vec4F32 {
 		return Vec4F32{ vcvtq_f32_s32(other.v) };
 	}
 
+	// NOTE: May be slow.
+	float operator[](size_t index) const { return ((float *)&v)[index]; }
+
 	Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ vaddq_f32(v, other.v) }; }
 	Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ vsubq_f32(v, other.v) }; }
 	Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ vmulq_f32(v, other.v) }; }
@@ -521,15 +526,22 @@ struct Vec4F32 {
 
 	Vec4F32 Mul(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; }
 
-	Vec4F32 Recip() {
+	Vec4F32 Recip() const {
 		float32x4_t recip = vrecpeq_f32(v);
 		// Use a couple Newton-Raphson steps to refine the estimate.
-		// May be able to get away with only one refinement, not sure!
+		// To save one iteration at the expense of accuracy, use RecipApprox().
 		recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
 		recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
 		return Vec4F32{ recip };
 	}
 
+	Vec4F32 RecipApprox() const {
+		float32x4_t recip = vrecpeq_f32(v);
+		// To approximately match the precision of x86-64's rcpps, do a single iteration.
+		recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
+		return Vec4F32{ recip };
+	}
+
 	Vec4F32 Clamp(float lower, float higher) {
 		return Vec4F32{
 			vminq_f32(vmaxq_f32(v, vdupq_n_f32(lower)), vdupq_n_f32(higher))
@@ -544,13 +556,6 @@ struct Vec4F32 {
 		return Vec4F32{ vsetq_lane_f32(1.0f, v, 3) };
 	}
 
-	// Swaps the two lower elements, but NOT the two upper ones. Useful for reversing triangles..
-	// This is quite awkward on ARM64 :/ Maybe there's a better solution?
-	Vec4F32 SwapLowerElements() {
-		float32x2_t lowerSwapped = vrev64_f32(vget_low_f32(v));
-		return Vec4F32{ vcombine_f32(lowerSwapped, vget_high_f32(v)) };
-	};
-
 	// One of many possible solutions. Sometimes we could also use vld4q_f32 probably..
 	static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
 #if PPSSPP_ARCH(ARM64_NEON)
diff --git a/unittest/UnitTest.cpp b/unittest/UnitTest.cpp
index a087d205b96b..fb45e9c6f076 100644
--- a/unittest/UnitTest.cpp
+++ b/unittest/UnitTest.cpp
@@ -56,6 +56,7 @@
 #include "Common/Buffer.h"
 #include "Common/File/Path.h"
 #include "Common/Math/SIMDHeaders.h"
+#include "Common/Math/CrossSIMD.h"
 // Get some more instructions for testing
 #if PPSSPP_ARCH(SSE2)
 #include <immintrin.h>
@@ -1124,6 +1125,21 @@ bool TestSIMD() {
 	EXPECT_EQ_INT(testdata2[2], 0x8888777766665555);
 	EXPECT_EQ_INT(testdata2[2], 0x8888777766665555);
 #endif
+
+	const int testval[2][4] = {
+		{ 0x1000, 0x2000, 0x3000, 0x7000 },
+		{ -0x1000, -0x2000, -0x3000, -0x7000 }
+	};
+
+	for (int i = 0; i < 2; i++) {
+		Vec4S32 s = Vec4S32::Load(testval[i]);
+		Vec4S32 square = s * s;
+		Vec4S32 square16 = s.Mul16(s);
+		EXPECT_EQ_INT(square[0], square16[0]);
+		EXPECT_EQ_INT(square[1], square16[1]);
+		EXPECT_EQ_INT(square[2], square16[2]);
+		EXPECT_EQ_INT(square[3], square16[3]);
+	}
 	return true;
 }
 

From 36c5065d5da8fef84539ff917567b73bc26517b5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Mon, 30 Dec 2024 16:50:24 +0100
Subject: [PATCH 09/15] Add crude guardband culling to depth rasterizer

---
 GPU/Common/DepthRaster.cpp | 41 ++++++++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 15 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index e54e4496694a..768c44a5f4d7 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -90,7 +90,7 @@ alignas(16) static const int zero123[4] = {0, 1, 2, 3};
 constexpr int stepXSize = 4;
 constexpr int stepYSize = 1;
 
-enum class TriangleResult {
+enum class TriangleStat {
 	OK,
 	NoPixels,
 	SmallOrBackface,
@@ -102,7 +102,7 @@ constexpr int MIN_TWICE_TRI_AREA = 10;
 // Started with the scalar version, will SIMD-ify later.
 // x1/y1 etc are the scissor rect.
 template<ZCompareMode compareMode>
-TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) {
+TriangleStat DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) {
 	// BEGIN triangle setup. This should be done SIMD, four triangles at a time.
 	// 16x16->32 multiplications are doable on SSE2, which should be all we need.
 
@@ -170,11 +170,11 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor
 		if (maxX == minX || maxY == minY) {
 			// No pixels, or outside screen.
 			// Most of these are now gone in the initial pass.
-			return TriangleResult::NoPixels;
+			return TriangleStat::NoPixels;
 		}
 
 		if (triArea < MIN_TWICE_TRI_AREA) {
-			return TriangleResult::SmallOrBackface;  // Or zero area.
+			return TriangleStat::SmallOrBackface;  // Or zero area.
 		}
 
 		// Convert per-triangle values to wide registers.
@@ -239,13 +239,13 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor
 			}
 		}
 	}
-	return TriangleResult::OK;
+	return TriangleStat::OK;
 }
 
 template<ZCompareMode compareMode>
 inline void DepthRaster4Triangles(int stats[4], uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) {
 	for (int i = 0; i < 4; i++) {
-		TriangleResult result = DepthRasterTriangle<compareMode>(depthBuf, stride, scissor, tx + i, ty + i, tz + i);
+		TriangleStat result = DepthRasterTriangle<compareMode>(depthBuf, stride, scissor, tx + i, ty + i, tz + i);
 		stats[(int)result]++;
 	}
 }
@@ -373,6 +373,11 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 	const float *verts[12];  // four triangles at a time!
 	const int count = draw.vertexCount;
 
+	// Not exactly the same guardband as on the real PSP, but good enough to prevent 16-bit overflow in raster.
+	// This is slightly off-center since we are already in screen space, but whatever. We compensate a little for it in the bottom right.
+	Vec4S32 guardBandTopLeft = Vec4S32::Splat(-2048);
+	Vec4S32 guardBandBottomRight = Vec4S32::Splat(2348);
+
 	Vec4F32 scissorX1 = Vec4F32::Splat((float)scissor.x1);
 	Vec4F32 scissorY1 = Vec4F32::Splat((float)scissor.y1);
 	Vec4F32 scissorX2 = Vec4F32::Splat((float)scissor.x2);
@@ -457,6 +462,11 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 			continue;
 		}
 
+		// Create a mask to kill coordinates of triangles that poke outside the guardband.
+		Vec4S32 inGuardBand =
+			(minX.CompareGt(guardBandTopLeft) & maxX.CompareLt(guardBandBottomRight)) &
+			(minY.CompareGt(guardBandTopLeft) & maxY.CompareLt(guardBandBottomRight));
+
 		// Floating point double triangle area. Can't be reused for the integer-snapped raster reliably (though may work...)
 		// Still good for culling early and pretty cheap to compute.
 		Vec4F32 doubleTriArea = (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0) - Vec4F32::Splat((float)MIN_TWICE_TRI_AREA);
@@ -465,9 +475,10 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 			continue;
 		}
 
-		Vec4S32FromF32(x0).Store(tx + outCount);
-		Vec4S32FromF32(x1).Store(tx + outCount + 4);
-		Vec4S32FromF32(x2).Store(tx + outCount + 8);
+		// Note: If any triangle is outside the guardband, (just) its X coords get zeroed, and it'll later get rejected.
+		(Vec4S32FromF32(x0) & inGuardBand).Store(tx + outCount);
+		(Vec4S32FromF32(x1) & inGuardBand).Store(tx + outCount + 4);
+		(Vec4S32FromF32(x2) & inGuardBand).Store(tx + outCount + 8);
 		Vec4S32FromF32(y0).Store(ty + outCount);
 		Vec4S32FromF32(y1).Store(ty + outCount + 4);
 		Vec4S32FromF32(y2).Store(ty + outCount + 8);
@@ -479,9 +490,9 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 
 		if (!cullEnabled) {
 			// If culling is off, store the triangles again, in the opposite order.
-			Vec4S32FromF32(x0).Store(tx + outCount);
-			Vec4S32FromF32(x2).Store(tx + outCount + 4);
-			Vec4S32FromF32(x1).Store(tx + outCount + 8);
+			(Vec4S32FromF32(x0) & inGuardBand).Store(tx + outCount);
+			(Vec4S32FromF32(x2) & inGuardBand).Store(tx + outCount + 4);
+			(Vec4S32FromF32(x1) & inGuardBand).Store(tx + outCount + 8);
 			Vec4S32FromF32(y0).Store(ty + outCount);
 			Vec4S32FromF32(y2).Store(ty + outCount + 4);
 			Vec4S32FromF32(y1).Store(ty + outCount + 8);
@@ -536,9 +547,9 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, const int *tx, con
 			}
 			}
 		}
-		gpuStats.numDepthRasterNoPixels += stats[(int)TriangleResult::NoPixels];
-		gpuStats.numDepthRasterTooSmall += stats[(int)TriangleResult::SmallOrBackface];
-		gpuStats.numDepthRasterPrims += stats[(int)TriangleResult::OK];
+		gpuStats.numDepthRasterNoPixels += stats[(int)TriangleStat::NoPixels];
+		gpuStats.numDepthRasterTooSmall += stats[(int)TriangleStat::SmallOrBackface];
+		gpuStats.numDepthRasterPrims += stats[(int)TriangleStat::OK];
 		break;
 	}
 	default:

From bcab17fcf38784ec6654cea1cd7e7ad21510a064 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Mon, 30 Dec 2024 17:07:51 +0100
Subject: [PATCH 10/15] Parallelize triangle setup. However, some glitches
 appear...

---
 GPU/Common/DepthRaster.cpp | 153 ++++++++++++++++++-------------------
 1 file changed, 75 insertions(+), 78 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index 768c44a5f4d7..604c1fa37853 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -90,6 +90,9 @@ alignas(16) static const int zero123[4] = {0, 1, 2, 3};
 constexpr int stepXSize = 4;
 constexpr int stepYSize = 1;
 
+constexpr int stepXShift = 2;
+constexpr int stepYShift = 0;
+
 enum class TriangleStat {
 	OK,
 	NoPixels,
@@ -98,105 +101,104 @@ enum class TriangleStat {
 
 constexpr int MIN_TWICE_TRI_AREA = 10;
 
-// Adapted from Intel's depth rasterizer example.
-// Started with the scalar version, will SIMD-ify later.
-// x1/y1 etc are the scissor rect.
+// A mix of ideas from Intel's sample and ryg's rasterizer blog series.
 template<ZCompareMode compareMode>
-TriangleStat DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) {
-	// BEGIN triangle setup. This should be done SIMD, four triangles at a time.
+void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) {
+	// BEGIN triangle setup. This is done using SIMD, four triangles at a time.
 	// 16x16->32 multiplications are doable on SSE2, which should be all we need.
 
 	// We use 4x1 SIMD tiles for simplicity. 2x2 would be ideal but stores/loads get annoying.
 
 	// NOTE: Triangles are stored in groups of 4.
-	float x0 = tx[0];
-	float y0 = ty[0];
-	float x1 = tx[4];
-	float y1 = ty[4];
-	float x2 = tx[8];
-	float y2 = ty[8];
-
-	// Load the entire scissor rect into one SIMD register.
-	// Vec4F32 scissor = Vec4F32::LoadConvertS16(&scissor.x1);
+	Vec4S32 x0 = Vec4S32::LoadAligned(tx);
+	Vec4S32 y0 = Vec4S32::LoadAligned(ty);
+	Vec4S32 x1 = Vec4S32::LoadAligned(tx + 4);
+	Vec4S32 y1 = Vec4S32::LoadAligned(ty + 4);
+	Vec4S32 x2 = Vec4S32::LoadAligned(tx + 8);
+	Vec4S32 y2 = Vec4S32::LoadAligned(ty + 8);
 
-	int minX = (int)std::max(std::min(std::min(x0, x1), x2), (float)scissor.x1) & ~3;
-	int maxX = (int)std::min(std::max(std::max(x0, x1), x2) + 3, (float)scissor.x2) & ~3;
-	int minY = (int)std::max(std::min(std::min(y0, y1), y2), (float)scissor.y1);
-	int maxY = (int)std::min(std::max(std::max(y0, y1), y2), (float)scissor.y2);
+	Vec4S32 minX = x0.Min16(x1).Min16(x2).Max16(Vec4S32::Splat(scissor.x1)).FixupAfterMinMax();
+	Vec4S32 maxX = x0.Max16(x1).Max16(x2).Min16(Vec4S32::Splat(scissor.x2)).FixupAfterMinMax();
+	Vec4S32 minY = y0.Min16(y1).Min16(y2).Max16(Vec4S32::Splat(scissor.y1)).FixupAfterMinMax();
+	Vec4S32 maxY = y0.Max16(y1).Max16(y2).Min16(Vec4S32::Splat(scissor.y2)).FixupAfterMinMax();
 
-	// TODO: Cull really small triangles here - we can increase the threshold a bit probably.
-	int triArea = (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0);
-
-	float oneOverTriArea = 1.0f / (float)triArea;
+	Vec4S32 triArea = (x1 - x0).MulAsS16(y2 - y0) - (x2 - x0).MulAsS16(y1 - y0);
+	// Probably not worth checking triArea here as we already did the approximatly same check previously.
 
 	// Edge setup
-	int A12 = y1 - y2;
-	int B12 = x2 - x1;
-	int C12 = x1 * y2 - y1 * x2;
+	Vec4S32 A12 = y1 - y2;
+	Vec4S32 B12 = x2 - x1;
+	Vec4S32 C12 = x1.MulAsS16(y2) - y1.MulAsS16(x2);
 
 	// Edge setup
-	int A20 = y2 - y0;
-	int B20 = x0 - x2;
-	int C20 = x2 * y0 - y2 * x0;
+	Vec4S32 A20 = y2 - y0;
+	Vec4S32 B20 = x0 - x2;
+	Vec4S32 C20 = x2.MulAsS16(y0) - y2.MulAsS16(x0);
 
 	// Edge setup
-	int A01 = y0 - y1;
-	int B01 = x1 - x0;
-	int C01 = x0 * y1 - y0 * x1;
+	Vec4S32 A01 = y0 - y1;
+	Vec4S32 B01 = x1 - x0;
+	Vec4S32 C01 = x0.MulAsS16(y1) - y0.MulAsS16(x1);
 
 	// Step deltas
-	int stepX12 = A12 * stepXSize;
-	int stepY12 = B12 * stepYSize;
-	int stepX20 = A20 * stepXSize;
-	int stepY20 = B20 * stepYSize;
-	int stepX01 = A01 * stepXSize;
-	int stepY01 = B01 * stepYSize;
+	Vec4S32 stepX12 = A12 << stepXShift;
+	Vec4S32 stepY12 = B12 << stepYShift;
+	Vec4S32 stepX20 = A20 << stepXShift;
+	Vec4S32 stepY20 = B20 << stepYShift;
+	Vec4S32 stepX01 = A01 << stepXShift;
+	Vec4S32 stepY01 = B01 << stepYShift;
 
 	// Prepare to interpolate Z
-	float zbase = tz[0];
-	float z_20 = (tz[4] - tz[0]) * oneOverTriArea;
-	float z_01 = (tz[8] - tz[0]) * oneOverTriArea;
-	float zdx = z_20 * (float)stepX20 + z_01 * (float)stepX01;
-	float zdy = z_20 * (float)stepY20 + z_01 * (float)stepY01;
+	Vec4F32 oneOverTriArea = Vec4F32FromS32(triArea).Recip();
+	Vec4F32 zbase = Vec4F32::LoadAligned(tz);
+	Vec4F32 z_20 = (Vec4F32::LoadAligned(tz + 4) - zbase) * oneOverTriArea;
+	Vec4F32 z_01 = (Vec4F32::LoadAligned(tz + 8) - zbase) * oneOverTriArea;
+	Vec4F32 zdx = z_20 * Vec4F32FromS32(stepX20) + z_01 * Vec4F32FromS32(stepX01);
+	Vec4F32 zdy = z_20 * Vec4F32FromS32(stepY20) + z_01 * Vec4F32FromS32(stepY01);
 
 	// Edge function values at origin
 	// TODO: We could SIMD the second part here.
-	for (int t = 0; t < 1; t++) {
+	// Using operator[] on the vectors actually seems to result in pretty good code.
+	for (int t = 0; t < 4; t++) {
 		// Check for bad triangle.
-		if (triArea /*[t]*/ <= 0) {
-			continue;
-		}
-
-		if (maxX == minX || maxY == minY) {
+		if (maxX[t] <= minX[t] || maxY[t] <= minY[t]) {
 			// No pixels, or outside screen.
 			// Most of these are now gone in the initial pass.
-			return TriangleStat::NoPixels;
+			stats[(int)TriangleStat::NoPixels]++;
+			continue;
 		}
 
-		if (triArea < MIN_TWICE_TRI_AREA) {
-			return TriangleStat::SmallOrBackface;  // Or zero area.
+		if (triArea[t] < MIN_TWICE_TRI_AREA) {
+			stats[(int)TriangleStat::SmallOrBackface]++;  // Or zero area.
+			continue;
 		}
 
+		const int minXT = minX[t] & ~3;
+		const int maxXT = maxX[t] & ~3;
+
+		const int minYT = minY[t];
+		const int maxYT = maxY[t];
+
 		// Convert per-triangle values to wide registers.
-		Vec4S32 initialX = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123);
-		int initialY = minY;
-
-		Vec4S32 w0_row = Vec4S32::Splat(A12) * initialX + Vec4S32::Splat(B12 * initialY + C12);
-		Vec4S32 w1_row = Vec4S32::Splat(A20) * initialX + Vec4S32::Splat(B20 * initialY + C20);
-		Vec4S32 w2_row = Vec4S32::Splat(A01) * initialX + Vec4S32::Splat(B01 * initialY + C01);
-
-		Vec4F32 zrow = Vec4F32::Splat(zbase) + Vec4F32FromS32(w1_row) * z_20 + Vec4F32FromS32(w2_row) * z_01;
-		Vec4F32 zdeltaX = Vec4F32::Splat(zdx);
-		Vec4F32 zdeltaY = Vec4F32::Splat(zdy);
-
-		Vec4S32 oneStepX12 = Vec4S32::Splat(stepX12);
-		Vec4S32 oneStepY12 = Vec4S32::Splat(stepY12);
-		Vec4S32 oneStepX20 = Vec4S32::Splat(stepX20);
-		Vec4S32 oneStepY20 = Vec4S32::Splat(stepY20);
-		Vec4S32 oneStepX01 = Vec4S32::Splat(stepX01);
-		Vec4S32 oneStepY01 = Vec4S32::Splat(stepY01);
+		Vec4S32 initialX = Vec4S32::Splat(minXT) + Vec4S32::LoadAligned(zero123);
+		int initialY = minY[t];
+
+		Vec4S32 w0_row = Vec4S32::Splat(A12[t]).MulAsS16(initialX) + Vec4S32::Splat(B12[t] * initialY + C12[t]);
+		Vec4S32 w1_row = Vec4S32::Splat(A20[t]).MulAsS16(initialX) + Vec4S32::Splat(B20[t] * initialY + C20[t]);
+		Vec4S32 w2_row = Vec4S32::Splat(A01[t]).MulAsS16(initialX) + Vec4S32::Splat(B01[t] * initialY + C01[t]);
+
+		Vec4F32 zrow = Vec4F32::Splat(zbase[t]) + Vec4F32FromS32(w1_row) * z_20[t] + Vec4F32FromS32(w2_row) * z_01[t];
+		Vec4F32 zdeltaX = Vec4F32::Splat(zdx[t]);
+		Vec4F32 zdeltaY = Vec4F32::Splat(zdy[t]);
+
+		Vec4S32 oneStepX12 = Vec4S32::Splat(stepX12[t]);
+		Vec4S32 oneStepY12 = Vec4S32::Splat(stepY12[t]);
+		Vec4S32 oneStepX20 = Vec4S32::Splat(stepX20[t]);
+		Vec4S32 oneStepY20 = Vec4S32::Splat(stepY20[t]);
+		Vec4S32 oneStepX01 = Vec4S32::Splat(stepX01[t]);
+		Vec4S32 oneStepY01 = Vec4S32::Splat(stepY01[t]);
 		// Rasterize
-		for (int y = minY; y <= maxY; y += stepYSize, w0_row += oneStepY12, w1_row += oneStepY20, w2_row += oneStepY01, zrow += zdeltaY) {
+		for (int y = minYT; y <= maxYT; y += stepYSize, w0_row += oneStepY12, w1_row += oneStepY20, w2_row += oneStepY01, zrow += zdeltaY) {
 			// Barycentric coordinates at start of row
 			Vec4S32 w0 = w0_row;
 			Vec4S32 w1 = w1_row;
@@ -205,10 +207,12 @@ TriangleStat DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor sc
 
 			uint16_t *rowPtr = depthBuf + stride * y;
 
-			for (int x = minX; x <= maxX; x += stepXSize, w0 += oneStepX12, w1 += oneStepX20, w2 += oneStepX01, zs += zdeltaX) {
+			for (int x = minXT; x <= maxXT; x += stepXSize, w0 += oneStepX12, w1 += oneStepX20, w2 += oneStepX01, zs += zdeltaX) {
 				// If p is on or inside all edges for any pixels,
 				// render those pixels.
 				Vec4S32 signCalc = w0 | w1 | w2;
+
+				// TODO: Check if this check is profitable. Maybe only for big triangles?
 				if (!AnyZeroSignBit(signCalc)) {
 					continue;
 				}
@@ -238,15 +242,8 @@ TriangleStat DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor sc
 				}
 			}
 		}
-	}
-	return TriangleStat::OK;
-}
 
-template<ZCompareMode compareMode>
-inline void DepthRaster4Triangles(int stats[4], uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) {
-	for (int i = 0; i < 4; i++) {
-		TriangleStat result = DepthRasterTriangle<compareMode>(depthBuf, stride, scissor, tx + i, ty + i, tz + i);
-		stats[(int)result]++;
+		stats[(int)TriangleStat::OK]++;
 	}
 }
 

From f5cc41caabb753114c0b9015455fb923c339b9d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Mon, 30 Dec 2024 17:21:34 +0100
Subject: [PATCH 11/15] More CrossSIMD (breaking change)

---
 Common/Math/CrossSIMD.h   | 61 ++++++++++++++++++++++++++++++---------
 Common/Math/SIMDHeaders.h | 34 +++++++++++-----------
 2 files changed, 65 insertions(+), 30 deletions(-)

diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
index 4daddea68125..b412574192fb 100644
--- a/Common/Math/CrossSIMD.h
+++ b/Common/Math/CrossSIMD.h
@@ -131,7 +131,7 @@ struct Vec4S32 {
 	// On SSE2, much faster than _mm_mullo_epi32_SSE2.
 	// On NEON though, it'll read the full 32 bits, so beware.
 	// See https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/.
-	Vec4S32 MulAsS16(Vec4S32 other) const {
+	Vec4S32 Mul16(Vec4S32 other) const {
 		// Note that we only need to mask one of the inputs, so we get zeroes - multiplying
 		// by zero is zero, so it doesn't matter what the upper halfword of each 32-bit word is
 		// in the other register.
@@ -153,7 +153,11 @@ struct Vec4S32 {
 	void operator +=(Vec4S32 other) { v = _mm_add_epi32(v, other.v); }
 	void operator -=(Vec4S32 other) { v = _mm_sub_epi32(v, other.v); }
 
-	Vec4S32 operator <<(int imm) const { return Vec4S32{ _mm_slli_epi32(v, imm) }; }
+	Vec4S32 AndNot(Vec4S32 inverted) const { return Vec4S32{ _mm_andnot_si128(inverted.v, v) }; }  // NOTE: with _mm_andnot, the first parameter is inverted, and then and is performed.
+	Vec4S32 Mul(Vec4S32 other) const { return *this * other; }
+
+	template<int imm>
+	Vec4S32 Shl() const { return Vec4S32{ _mm_slli_epi32(v, imm) }; }
 
 	// NOTE: May be slow.
 	int operator[](size_t index) const { return ((int *)&v)[index]; }
@@ -221,6 +225,7 @@ struct Vec4F32 {
 	void operator -=(Vec4F32 other) { v = _mm_sub_ps(v, other.v); }
 	void operator *=(Vec4F32 other) { v = _mm_mul_ps(v, other.v); }
 	void operator /=(Vec4F32 other) { v = _mm_div_ps(v, other.v); }
+	void operator &=(Vec4S32 other) { v = _mm_and_ps(v, _mm_castsi128_ps(other.v)); }
 	Vec4F32 operator *(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; }
 	// NOTE: May be slow.
 	float operator[](size_t index) const { return ((float *)&v)[index]; }
@@ -262,6 +267,19 @@ struct Vec4F32 {
 	static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
 		_MM_TRANSPOSE4_PS(col0.v, col1.v, col2.v, col3.v);
 	}
+
+	// This is here because ARM64 can do this very efficiently.
+	static void LoadTranspose(const float *src, Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
+		col0.v = _mm_loadu_ps(src);
+		col1.v = _mm_loadu_ps(src + 4);
+		col2.v = _mm_loadu_ps(src + 8);
+		col3.v = _mm_loadu_ps(src + 12);
+		_MM_TRANSPOSE4_PS(col0.v, col1.v, col2.v, col3.v);
+	}
+
+	Vec4S32 CompareEq(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmpeq_ps(v, other.v)) }; }
+	Vec4S32 CompareLt(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmplt_ps(v, other.v)) }; }
+	Vec4S32 CompareGt(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmpgt_ps(v, other.v)) }; }
 };
 
 inline Vec4S32 Vec4S32FromF32(Vec4F32 f) { return Vec4S32{ _mm_cvttps_epi32(f.v) }; }
@@ -310,6 +328,12 @@ struct Vec4U16 {
 	Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ _mm_max_epu16_SSE2(v, other.v) }; }
 	Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ _mm_min_epu16_SSE2(v, other.v) }; }
 	Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ _mm_cmplt_epu16(v, other.v) }; }
+
+	inline Vec4U16 AndNot(Vec4U16 inverted) {
+		return Vec4U16{
+			_mm_andnot_si128(inverted.v, v)  // NOTE: with _mm_andnot, the first parameter is inverted, and then and is performed.
+		};
+	}
 };
 
 struct Vec8U16 {
@@ -329,12 +353,6 @@ inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
 	};
 }
 
-inline Vec4U16 AndNot(Vec4U16 a, Vec4U16 inverted) {
-	return Vec4U16{
-		_mm_andnot_si128(inverted.v, a.v)  // NOTE: with andnot, the first parameter is inverted, and then and is performed.
-	};
-}
-
 #elif PPSSPP_ARCH(ARM_NEON)
 
 struct Mat4F32 {
@@ -445,7 +463,7 @@ struct Vec4S32 {
 	void StoreAligned(int *dst) { vst1q_s32(dst, v); }
 
 	// Warning: Unlike on x86, this is a full 32-bit multiplication.
-	Vec4S32 MulAsS16(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
+	Vec4S32 Mul16(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
 
 	Vec4S32 SignExtend16() const { return Vec4S32{ vshrq_n_s32(vshlq_n_s32(v, 16), 16) }; }
 	// NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output (on SSE2 at least).
@@ -462,6 +480,11 @@ struct Vec4S32 {
 	Vec4S32 operator |(Vec4S32 other) const { return Vec4S32{ vorrq_s32(v, other.v) }; }
 	Vec4S32 operator &(Vec4S32 other) const { return Vec4S32{ vandq_s32(v, other.v) }; }
 	Vec4S32 operator ^(Vec4S32 other) const { return Vec4S32{ veorq_s32(v, other.v) }; }
+	Vec4S32 AndNot(Vec4S32 inverted) const { return Vec4S32{ vandq_s32(v, vmvnq_s32(inverted.v))}; }
+	Vec4S32 Mul(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
+
+	template<int imm>
+	Vec4S32 Shl() const { return Vec4S32{ vshlq_n_s32(v, imm) }; }
 
 	void operator +=(Vec4S32 other) { v = vaddq_s32(v, other.v); }
 	void operator -=(Vec4S32 other) { v = vsubq_s32(v, other.v); }
@@ -522,6 +545,7 @@ struct Vec4F32 {
 	void operator -=(Vec4F32 other) { v = vsubq_f32(v, other.v); }
 	void operator *=(Vec4F32 other) { v = vmulq_f32(v, other.v); }
 	void operator /=(Vec4F32 other) { v = vmulq_f32(v, other.Recip().v); }
+	void operator &=(Vec4S32 other) { v = vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(v), other.v)); }
 	Vec4F32 operator *(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; }
 
 	Vec4F32 Mul(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; }
@@ -556,6 +580,10 @@ struct Vec4F32 {
 		return Vec4F32{ vsetq_lane_f32(1.0f, v, 3) };
 	}
 
+	Vec4S32 CompareEq(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vceqq_f32(v, other.v)) }; }
+	Vec4S32 CompareLt(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcltq_f32(v, other.v)) }; }
+	Vec4S32 CompareGt(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcgtq_f32(v, other.v)) }; }
+
 	// One of many possible solutions. Sometimes we could also use vld4q_f32 probably..
 	static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
 #if PPSSPP_ARCH(ARM64_NEON)
@@ -578,6 +606,15 @@ struct Vec4F32 {
 #endif
 	}
 
+	static void LoadTranspose(const float *src, Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
+		// The optimizer hopefully gets rid of the copies below.
+		float32x4x4_t r = vld4q_f32(src);
+		col0.v = r.val[0];
+		col1.v = r.val[1];
+		col2.v = r.val[2];
+		col3.v = r.val[3];
+	}
+
 	inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
 #if PPSSPP_ARCH(ARM64_NEON)
 		float32x4_t sum = vaddq_f32(
@@ -649,6 +686,8 @@ struct Vec4U16 {
 	Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ vmax_u16(v, other.v) }; }
 	Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ vmin_u16(v, other.v) }; }
 	Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ vclt_u16(v, other.v) }; }
+
+	Vec4U16 AndNot(Vec4U16 inverted) { return Vec4U16{ vand_u16(v, vmvn_u16(inverted.v)) }; }
 };
 
 inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
@@ -657,10 +696,6 @@ inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
 	return Vec4U16{ result };
 }
 
-inline Vec4U16 AndNot(Vec4U16 a, Vec4U16 inverted) {
-	return Vec4U16{ vand_u16(a.v, vmvn_u16(inverted.v)) };
-}
-
 struct Vec8U16 {
 	uint16x8_t v;
 
diff --git a/Common/Math/SIMDHeaders.h b/Common/Math/SIMDHeaders.h
index cb63b89eefac..82b18e558c2d 100644
--- a/Common/Math/SIMDHeaders.h
+++ b/Common/Math/SIMDHeaders.h
@@ -88,29 +88,29 @@ static inline uint32x4_t vcgezq_f32(float32x4_t v) {
 // May later figure out how to use the appropriate ones depending on compile flags.
 
 inline __m128i _mm_mullo_epi32_SSE2(const __m128i v0, const __m128i v1) {
-       __m128i a13 = _mm_shuffle_epi32(v0, 0xF5);             // (-,a3,-,a1)
-       __m128i b13 = _mm_shuffle_epi32(v1, 0xF5);             // (-,b3,-,b1)
-       __m128i prod02 = _mm_mul_epu32(v0, v1);                // (-,a2*b2,-,a0*b0)
-       __m128i prod13 = _mm_mul_epu32(a13, b13);              // (-,a3*b3,-,a1*b1)
-       __m128i prod01 = _mm_unpacklo_epi32(prod02, prod13);   // (-,-,a1*b1,a0*b0)
-       __m128i prod23 = _mm_unpackhi_epi32(prod02, prod13);   // (-,-,a3*b3,a2*b2)
-       return _mm_unpacklo_epi64(prod01, prod23);
+	__m128i a13 = _mm_shuffle_epi32(v0, 0xF5);             // (-,a3,-,a1)
+	__m128i b13 = _mm_shuffle_epi32(v1, 0xF5);             // (-,b3,-,b1)
+	__m128i prod02 = _mm_mul_epu32(v0, v1);                // (-,a2*b2,-,a0*b0)
+	__m128i prod13 = _mm_mul_epu32(a13, b13);              // (-,a3*b3,-,a1*b1)
+	__m128i prod01 = _mm_unpacklo_epi32(prod02, prod13);   // (-,-,a1*b1,a0*b0)
+	__m128i prod23 = _mm_unpackhi_epi32(prod02, prod13);   // (-,-,a3*b3,a2*b2)
+	return _mm_unpacklo_epi64(prod01, prod23);
 }
 
 inline __m128i _mm_max_epu16_SSE2(const __m128i v0, const __m128i v1) {
-       return _mm_xor_si128(
-               _mm_max_epi16(
-                       _mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
-                       _mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
-               _mm_set1_epi16((int16_t)0x8000));
+	return _mm_xor_si128(
+		_mm_max_epi16(
+			_mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
+			_mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
+		_mm_set1_epi16((int16_t)0x8000));
 }
 
 inline __m128i _mm_min_epu16_SSE2(const __m128i v0, const __m128i v1) {
-       return _mm_xor_si128(
-               _mm_min_epi16(
-                       _mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
-                       _mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
-               _mm_set1_epi16((int16_t)0x8000));
+	return _mm_xor_si128(
+		_mm_min_epi16(
+			_mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
+			_mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
+		_mm_set1_epi16((int16_t)0x8000));
 }
 
 // SSE2 replacement for half of a _mm_packus_epi32 but without the saturation.

From e0991a70707e350397a70b1a7acb2b410bdb8c6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Tue, 31 Dec 2024 01:20:45 +0100
Subject: [PATCH 12/15] DepthRaster: Improved guardband rejection, fixing
 glitches.

---
 GPU/Common/DepthRaster.cpp | 119 ++++++++++++++++++++++++-------------
 1 file changed, 78 insertions(+), 41 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index 604c1fa37853..f75f9cd8c57a 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -117,36 +117,37 @@ void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthSc
 	Vec4S32 x2 = Vec4S32::LoadAligned(tx + 8);
 	Vec4S32 y2 = Vec4S32::LoadAligned(ty + 8);
 
+	// FixupAfterMinMax is just 16->32 sign extension, in case the current platform (like SSE2) just has 16-bit min/max operations.
 	Vec4S32 minX = x0.Min16(x1).Min16(x2).Max16(Vec4S32::Splat(scissor.x1)).FixupAfterMinMax();
 	Vec4S32 maxX = x0.Max16(x1).Max16(x2).Min16(Vec4S32::Splat(scissor.x2)).FixupAfterMinMax();
 	Vec4S32 minY = y0.Min16(y1).Min16(y2).Max16(Vec4S32::Splat(scissor.y1)).FixupAfterMinMax();
 	Vec4S32 maxY = y0.Max16(y1).Max16(y2).Min16(Vec4S32::Splat(scissor.y2)).FixupAfterMinMax();
 
-	Vec4S32 triArea = (x1 - x0).MulAsS16(y2 - y0) - (x2 - x0).MulAsS16(y1 - y0);
+	Vec4S32 triArea = (x1 - x0).Mul16(y2 - y0) - (x2 - x0).Mul16(y1 - y0);
 	// Probably not worth checking triArea here as we already did the approximatly same check previously.
 
 	// Edge setup
 	Vec4S32 A12 = y1 - y2;
 	Vec4S32 B12 = x2 - x1;
-	Vec4S32 C12 = x1.MulAsS16(y2) - y1.MulAsS16(x2);
+	Vec4S32 C12 = x1.Mul16(y2) - y1.Mul16(x2);
 
 	// Edge setup
 	Vec4S32 A20 = y2 - y0;
 	Vec4S32 B20 = x0 - x2;
-	Vec4S32 C20 = x2.MulAsS16(y0) - y2.MulAsS16(x0);
+	Vec4S32 C20 = x2.Mul16(y0) - y2.Mul16(x0);
 
 	// Edge setup
 	Vec4S32 A01 = y0 - y1;
 	Vec4S32 B01 = x1 - x0;
-	Vec4S32 C01 = x0.MulAsS16(y1) - y0.MulAsS16(x1);
+	Vec4S32 C01 = x0.Mul16(y1) - y0.Mul16(x1);
 
 	// Step deltas
-	Vec4S32 stepX12 = A12 << stepXShift;
-	Vec4S32 stepY12 = B12 << stepYShift;
-	Vec4S32 stepX20 = A20 << stepXShift;
-	Vec4S32 stepY20 = B20 << stepYShift;
-	Vec4S32 stepX01 = A01 << stepXShift;
-	Vec4S32 stepY01 = B01 << stepYShift;
+	Vec4S32 stepX12 = A12.Shl<stepXShift>();
+	Vec4S32 stepY12 = B12.Shl<stepYShift>();
+	Vec4S32 stepX20 = A20.Shl<stepXShift>();
+	Vec4S32 stepY20 = B20.Shl<stepYShift>();
+	Vec4S32 stepX01 = A01.Shl<stepXShift>();
+	Vec4S32 stepY01 = B01.Shl<stepYShift>();
 
 	// Prepare to interpolate Z
 	Vec4F32 oneOverTriArea = Vec4F32FromS32(triArea).Recip();
@@ -163,7 +164,8 @@ void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthSc
 		// Check for bad triangle.
 		if (maxX[t] <= minX[t] || maxY[t] <= minY[t]) {
 			// No pixels, or outside screen.
-			// Most of these are now gone in the initial pass.
+			// Most of these are now gone in the initial pass, but not all since we cull
+			// in 4-groups there.
 			stats[(int)TriangleStat::NoPixels]++;
 			continue;
 		}
@@ -182,10 +184,16 @@ void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthSc
 		// Convert per-triangle values to wide registers.
 		Vec4S32 initialX = Vec4S32::Splat(minXT) + Vec4S32::LoadAligned(zero123);
 		int initialY = minY[t];
+		_dbg_assert_(A12[t] < 32767);
+		_dbg_assert_(A12[t] > -32767);
+		_dbg_assert_(A20[t] < 32767);
+		_dbg_assert_(A20[t] > -32767);
+		_dbg_assert_(A01[t] < 32767);
+		_dbg_assert_(A01[t] > -32767);
 
-		Vec4S32 w0_row = Vec4S32::Splat(A12[t]).MulAsS16(initialX) + Vec4S32::Splat(B12[t] * initialY + C12[t]);
-		Vec4S32 w1_row = Vec4S32::Splat(A20[t]).MulAsS16(initialX) + Vec4S32::Splat(B20[t] * initialY + C20[t]);
-		Vec4S32 w2_row = Vec4S32::Splat(A01[t]).MulAsS16(initialX) + Vec4S32::Splat(B01[t] * initialY + C01[t]);
+		Vec4S32 w0_row = Vec4S32::Splat(A12[t]).Mul16(initialX) + Vec4S32::Splat(B12[t] * initialY + C12[t]);
+		Vec4S32 w1_row = Vec4S32::Splat(A20[t]).Mul16(initialX) + Vec4S32::Splat(B20[t] * initialY + C20[t]);
+		Vec4S32 w2_row = Vec4S32::Splat(A01[t]).Mul16(initialX) + Vec4S32::Splat(B01[t] * initialY + C01[t]);
 
 		Vec4F32 zrow = Vec4F32::Splat(zbase[t]) + Vec4F32FromS32(w1_row) * z_20[t] + Vec4F32FromS32(w2_row) * z_01[t];
 		Vec4F32 zdeltaX = Vec4F32::Splat(zdx[t]);
@@ -229,7 +237,7 @@ void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthSc
 					// To implement the greater/greater-than comparison, we can combine mask and max.
 					// Unfortunately there's no unsigned max on SSE2, it's synthesized by xoring 0x8000 on input and output.
 					// We use AndNot to zero out Z results, before doing Max with the buffer.
-					AndNot(shortZ, shortMaskInv).Max(bufferValues).Store(rowPtr + x);
+					shortZ.AndNot(shortMaskInv).Max(bufferValues).Store(rowPtr + x);
 					break;
 				case ZCompareMode::Less:  // UNTESTED
 					// This time, we OR the mask and use .Min.
@@ -237,7 +245,7 @@ void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthSc
 					break;
 				case ZCompareMode::Always:  // UNTESTED
 					// This could be replaced with a vblend operation.
-					((bufferValues & shortMaskInv) | AndNot(shortZ, shortMaskInv)).Store(rowPtr + x);
+					((bufferValues & shortMaskInv) | shortZ.AndNot(shortMaskInv)).Store(rowPtr + x);
 					break;
 				}
 			}
@@ -362,7 +370,7 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 	}
 	const bool cullEnabled = draw.cullEnabled;
 
-	static const float zerovec[4] = {};
+	static const float zerovec[4] = {0.0f, 0.0f, 0.0f, 1.0f};
 
 	int collected = 0;
 	int planeCulled = 0;
@@ -371,22 +379,35 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 	const int count = draw.vertexCount;
 
 	// Not exactly the same guardband as on the real PSP, but good enough to prevent 16-bit overflow in raster.
-	// This is slightly off-center since we are already in screen space, but whatever. We compensate a little for it in the bottom right.
-	Vec4S32 guardBandTopLeft = Vec4S32::Splat(-2048);
-	Vec4S32 guardBandBottomRight = Vec4S32::Splat(2348);
+	// This is slightly off-center since we are already in screen space, but whatever.
+	Vec4S32 guardBandTopLeft = Vec4S32::Splat(-4096);
+	Vec4S32 guardBandBottomRight = Vec4S32::Splat(4096);
 
 	Vec4F32 scissorX1 = Vec4F32::Splat((float)scissor.x1);
 	Vec4F32 scissorY1 = Vec4F32::Splat((float)scissor.y1);
 	Vec4F32 scissorX2 = Vec4F32::Splat((float)scissor.x2);
 	Vec4F32 scissorY2 = Vec4F32::Splat((float)scissor.y2);
 
+	// Add cheap pre-projection pre-checks for bad triangle here. Not much we can do safely other than checking W.
+	auto validVert = [](const float *v) -> bool {
+		if (v[3] <= 0.0f /* || v[2] <= 0.0f */) {
+			return false;
+		}
+		/*
+		if (v[2] >= 65535.0f * v[3]) {
+			return false;
+		}*/
+		return true;
+	};
+
 	for (int i = 0; i < count; i += 3) {
 		// Collect valid triangles into buffer.
 		const float *v0 = transformed + indexBuffer[i] * 4;
 		const float *v1 = transformed + indexBuffer[i + (1 ^ flipCull)] * 4;
 		const float *v2 = transformed + indexBuffer[i + (2 ^ flipCull)] * 4;
-		// Don't collect triangle if any vertex is behind the 0 plane.
-		if (v0[3] > 0.0f && v1[3] > 0.0f && v2[3] > 0.0f) {
+		// Don't collect triangle if any vertex is beyond the planes.
+		// TODO: Optimize this somehow.
+		if (validVert(v0) && validVert(v1) && validVert(v2)) {
 			verts[collected] = v0;
 			verts[collected + 1] = v1;
 			verts[collected + 2] = v2;
@@ -404,6 +425,7 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 		}
 
 		if (collected != 12) {
+			// Fetch more!
 			continue;
 		}
 
@@ -435,47 +457,53 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 		Vec4F32 recipW2 = w2.Recip();
 		x0 *= recipW0;
 		y0 *= recipW0;
-		z0 = (z0 * recipW0).Clamp(0.0f, 65535.0f);
+		z0 *= recipW0;
 		x1 *= recipW1;
 		y1 *= recipW1;
-		z1 = (z1 * recipW1).Clamp(0.0f, 65535.0f);
+		z1 *= recipW1;
 		x2 *= recipW2;
 		y2 *= recipW2;
-		z2 = (z2 * recipW2).Clamp(0.0f, 65535.0f);
+		z2 *= recipW2;
 
-		// Check bounding box size (clamped to screen edges). Cast to integer for crude rounding (and to match the rasterizer).
-		Vec4S32 minX = Vec4S32FromF32(x0.Min(x1.Min(x2)).Max(scissorX1));
-		Vec4S32 minY = Vec4S32FromF32(y0.Min(y1.Min(y2)).Max(scissorY1));
-		Vec4S32 maxX = Vec4S32FromF32(x0.Max(x1.Max(x2)).Min(scissorX2));
-		Vec4S32 maxY = Vec4S32FromF32(y0.Max(y1.Max(y2)).Min(scissorY2));
+		// Check bounding box size. Cast to integer for crude rounding (and to approximately match the rasterizer).
+		Vec4S32 minX = Vec4S32FromF32(x0.Min(x1.Min(x2)));
+		Vec4S32 minY = Vec4S32FromF32(y0.Min(y1.Min(y2)));
+		Vec4S32 maxX = Vec4S32FromF32(x0.Max(x1.Max(x2)));
+		Vec4S32 maxY = Vec4S32FromF32(y0.Max(y1.Max(y2)));
 
-		// If all are equal in any dimension, all four triangles are tiny nonsense (or outside the scissor) and can be skipped early.
+		// If all are equal in any dimension, all four triangles are tiny nonsense and can be skipped early.
 		Vec4S32 eqMask = minX.CompareEq(maxX) | minY.CompareEq(maxY);
-		// Otherwise we just proceed to triangle setup with all four for now. Later might want to
-		// compact the remaining triangles... Or do more checking here.
+
+		// Otherwise we just proceed to triangle setup with all four for now.
 		// We could also save the computed boxes for later..
+		// TODO: Merge into below checks? Though nice with an early out.
 		if (!AnyZeroSignBit(eqMask)) {
 			boxCulled += 4;
 			continue;
 		}
 
-		// Create a mask to kill coordinates of triangles that poke outside the guardband.
+		// Create a mask to kill coordinates of triangles that poke outside the guardband (or are just empty).
 		Vec4S32 inGuardBand =
-			(minX.CompareGt(guardBandTopLeft) & maxX.CompareLt(guardBandBottomRight)) &
-			(minY.CompareGt(guardBandTopLeft) & maxY.CompareLt(guardBandBottomRight));
+			((minX.CompareGt(guardBandTopLeft) & maxX.CompareLt(guardBandBottomRight)) &
+				(minY.CompareGt(guardBandTopLeft) & maxY.CompareLt(guardBandBottomRight))).AndNot(eqMask);
+
+		// It's enough to smash one coordinate to make future checks (like the tri area check) fail.
+		x0 &= inGuardBand;
+		x1 &= inGuardBand;
+		x2 &= inGuardBand;
 
 		// Floating point double triangle area. Can't be reused for the integer-snapped raster reliably (though may work...)
 		// Still good for culling early and pretty cheap to compute.
-		Vec4F32 doubleTriArea = (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0) - Vec4F32::Splat((float)MIN_TWICE_TRI_AREA);
+		Vec4F32 doubleTriArea = (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0) - Vec4F32::Splat((float)(MIN_TWICE_TRI_AREA + 2));
 		if (!AnyZeroSignBit(doubleTriArea)) {
 			gpuStats.numDepthRasterEarlySize += 4;
 			continue;
 		}
 
 		// Note: If any triangle is outside the guardband, (just) its X coords get zeroed, and it'll later get rejected.
-		(Vec4S32FromF32(x0) & inGuardBand).Store(tx + outCount);
-		(Vec4S32FromF32(x1) & inGuardBand).Store(tx + outCount + 4);
-		(Vec4S32FromF32(x2) & inGuardBand).Store(tx + outCount + 8);
+		Vec4S32FromF32(x0).Store(tx + outCount);
+		Vec4S32FromF32(x1).Store(tx + outCount + 4);
+		Vec4S32FromF32(x2).Store(tx + outCount + 8);
 		Vec4S32FromF32(y0).Store(ty + outCount);
 		Vec4S32FromF32(y1).Store(ty + outCount + 4);
 		Vec4S32FromF32(y2).Store(ty + outCount + 8);
@@ -483,10 +511,19 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 		z1.Store(tz + outCount + 4);
 		z2.Store(tz + outCount + 8);
 
+#ifdef _DEBUG
+		// Sanity check: every stored X coord must fit in 16 bits (X is zeroed for rejected triangles).
+		// NOTE(review): Y is not asserted - rejected triangles only get X smashed, so their Y may be out of range.
+		for (int j = 0; j < 12; j++) {
+			_dbg_assert_(tx[outCount + j] < 32767);
+			_dbg_assert_(tx[outCount + j] >= -32768);
+		}
+#endif
+
 		outCount += 12;
 
 		if (!cullEnabled) {
-			// If culling is off, store the triangles again, in the opposite order.
+			// If culling is off, store the triangles again, with the first two vertices swapped.
 			(Vec4S32FromF32(x0) & inGuardBand).Store(tx + outCount);
 			(Vec4S32FromF32(x2) & inGuardBand).Store(tx + outCount + 4);
 			(Vec4S32FromF32(x1) & inGuardBand).Store(tx + outCount + 8);

From 7ddd7024f4047daf47eb015f18ddebb70d326e43 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Tue, 31 Dec 2024 02:17:08 +0100
Subject: [PATCH 13/15] Revert unintentional change. Warning fix

---
 GPU/Common/DepthRaster.cpp | 2 +-
 unittest/UnitTest.cpp      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index f75f9cd8c57a..775d4108a358 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -494,7 +494,7 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 
 		// Floating point double triangle area. Can't be reused for the integer-snapped raster reliably (though may work...)
 		// Still good for culling early and pretty cheap to compute.
-		Vec4F32 doubleTriArea = (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0) - Vec4F32::Splat((float)(MIN_TWICE_TRI_AREA + 2));
+		Vec4F32 doubleTriArea = (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0) - Vec4F32::Splat((float)(MIN_TWICE_TRI_AREA));
 		if (!AnyZeroSignBit(doubleTriArea)) {
 			gpuStats.numDepthRasterEarlySize += 4;
 			continue;
diff --git a/unittest/UnitTest.cpp b/unittest/UnitTest.cpp
index fb45e9c6f076..475b785abd1f 100644
--- a/unittest/UnitTest.cpp
+++ b/unittest/UnitTest.cpp
@@ -1049,7 +1049,7 @@ CharQueue GetQueue() {
 
 bool TestCharQueue() {
 	// We use a tiny block size for testing.
-	CharQueue queue = std::move(GetQueue());
+	CharQueue queue = GetQueue();
 
 	// Add 16 chars.
 	queue.push_back("abcdefghijkl");

From dee5fe69906930fd9278269837e641323eb31912 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Tue, 31 Dec 2024 02:30:05 +0100
Subject: [PATCH 14/15] Fix issue in Midnight Club where Z now wrapped around
 at a distance, after removing the clamp. Might as well cull.

---
 Common/Math/CrossSIMD.h    |  5 +++++
 GPU/Common/DepthRaster.cpp | 13 ++++++++-----
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
index b412574192fb..4393635b05b0 100644
--- a/Common/Math/CrossSIMD.h
+++ b/Common/Math/CrossSIMD.h
@@ -152,6 +152,9 @@ struct Vec4S32 {
 	// TODO: andnot
 	void operator +=(Vec4S32 other) { v = _mm_add_epi32(v, other.v); }
 	void operator -=(Vec4S32 other) { v = _mm_sub_epi32(v, other.v); }
+	void operator &=(Vec4S32 other) { v = _mm_and_si128(v, other.v); }
+	void operator |=(Vec4S32 other) { v = _mm_or_si128(v, other.v); }
+	void operator ^=(Vec4S32 other) { v = _mm_xor_si128(v, other.v); }
 
 	Vec4S32 AndNot(Vec4S32 inverted) const { return Vec4S32{ _mm_andnot_si128(inverted.v, v) }; }  // NOTE: with _mm_andnot, the first parameter is inverted, and then and is performed.
 	Vec4S32 Mul(Vec4S32 other) const { return *this * other; }
@@ -583,6 +586,8 @@ struct Vec4F32 {
 	Vec4S32 CompareEq(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vceqq_f32(v, other.v)) }; }
 	Vec4S32 CompareLt(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcltq_f32(v, other.v)) }; }
 	Vec4S32 CompareGt(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcgtq_f32(v, other.v)) }; }
+	Vec4S32 CompareLe(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcleq_f32(v, other.v)) }; }
+	Vec4S32 CompareGe(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcgeq_f32(v, other.v)) }; }
 
 	// One of many possible solutions. Sometimes we could also use vld4q_f32 probably..
 	static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index 775d4108a358..9ccb5c9b87a0 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -383,14 +383,14 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 	Vec4S32 guardBandTopLeft = Vec4S32::Splat(-4096);
 	Vec4S32 guardBandBottomRight = Vec4S32::Splat(4096);
 
-	Vec4F32 scissorX1 = Vec4F32::Splat((float)scissor.x1);
-	Vec4F32 scissorY1 = Vec4F32::Splat((float)scissor.y1);
-	Vec4F32 scissorX2 = Vec4F32::Splat((float)scissor.x2);
-	Vec4F32 scissorY2 = Vec4F32::Splat((float)scissor.y2);
+	Vec4S32 scissorX1 = Vec4S32::Splat((int)scissor.x1);  // Integer scissor bounds, compared against the integer bounding boxes.
+	Vec4S32 scissorY1 = Vec4S32::Splat((int)scissor.y1);
+	Vec4S32 scissorX2 = Vec4S32::Splat((int)scissor.x2);
+	Vec4S32 scissorY2 = Vec4S32::Splat((int)scissor.y2);
 
 	// Add cheap pre-projection pre-checks for bad triangle here. Not much we can do safely other than checking W.
 	auto validVert = [](const float *v) -> bool {
-		if (v[3] <= 0.0f /* || v[2] <= 0.0f */) {
+		if (v[3] <= 0.0f || v[2] <= 0.0f) {
 			return false;
 		}
 		/*
@@ -487,6 +487,9 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
 			((minX.CompareGt(guardBandTopLeft) & maxX.CompareLt(guardBandBottomRight)) &
 				(minY.CompareGt(guardBandTopLeft) & maxY.CompareLt(guardBandBottomRight))).AndNot(eqMask);
 
+		// Create another mask to kill off-screen triangles. Not perfectly accurate.
+		inGuardBand &= (maxX.CompareGt(scissorX1) & minX.CompareLt(scissorX2)) & (maxY.CompareGt(scissorY1) & minY.CompareLt(scissorY2));
+
 		// It's enough to smash one coordinate to make future checks (like the tri area check) fail.
 		x0 &= inGuardBand;
 		x1 &= inGuardBand;

From f85d7db5b1588c46845e89d762feb90f184bdd63 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Tue, 31 Dec 2024 02:34:16 +0100
Subject: [PATCH 15/15] Comment fixes, buildfix

---
 Common/Math/CrossSIMD.h    |  1 +
 GPU/Common/DepthRaster.cpp | 15 ++++++---------
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
index 4393635b05b0..556d9e3b1aee 100644
--- a/Common/Math/CrossSIMD.h
+++ b/Common/Math/CrossSIMD.h
@@ -485,6 +485,7 @@ struct Vec4S32 {
 	Vec4S32 operator ^(Vec4S32 other) const { return Vec4S32{ veorq_s32(v, other.v) }; }
 	Vec4S32 AndNot(Vec4S32 inverted) const { return Vec4S32{ vandq_s32(v, vmvnq_s32(inverted.v))}; }
 	Vec4S32 Mul(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
+	void operator &=(Vec4S32 other) { v = vandq_s32(v, other.v); }
 
 	template<int imm>
 	Vec4S32 Shl() const { return Vec4S32{ vshlq_n_s32(v, imm) }; }
diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index 9ccb5c9b87a0..8bf6e2586c5b 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -104,7 +104,7 @@ constexpr int MIN_TWICE_TRI_AREA = 10;
 // A mix of ideas from Intel's sample and ryg's rasterizer blog series.
 template<ZCompareMode compareMode>
 void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) {
-	// BEGIN triangle setup. This is done using SIMD, four triangles at a time.
+	// Triangle setup. This is done using SIMD, four triangles at a time.
 	// 16x16->32 multiplications are doable on SSE2, which should be all we need.
 
 	// We use 4x1 SIMD tiles for simplicity. 2x2 would be ideal but stores/loads get annoying.
@@ -124,19 +124,16 @@ void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthSc
 	Vec4S32 maxY = y0.Max16(y1).Max16(y2).Min16(Vec4S32::Splat(scissor.y2)).FixupAfterMinMax();
 
 	Vec4S32 triArea = (x1 - x0).Mul16(y2 - y0) - (x2 - x0).Mul16(y1 - y0);
-	// Probably not worth checking triArea here as we already did the approximatly same check previously.
 
 	// Edge setup
 	Vec4S32 A12 = y1 - y2;
 	Vec4S32 B12 = x2 - x1;
 	Vec4S32 C12 = x1.Mul16(y2) - y1.Mul16(x2);
 
-	// Edge setup
 	Vec4S32 A20 = y2 - y0;
 	Vec4S32 B20 = x0 - x2;
 	Vec4S32 C20 = x2.Mul16(y0) - y2.Mul16(x0);
 
-	// Edge setup
 	Vec4S32 A01 = y0 - y1;
 	Vec4S32 B01 = x1 - x0;
 	Vec4S32 C01 = x0.Mul16(y1) - y0.Mul16(x1);
@@ -157,11 +154,10 @@ void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthSc
 	Vec4F32 zdx = z_20 * Vec4F32FromS32(stepX20) + z_01 * Vec4F32FromS32(stepX01);
 	Vec4F32 zdy = z_20 * Vec4F32FromS32(stepY20) + z_01 * Vec4F32FromS32(stepY01);
 
-	// Edge function values at origin
-	// TODO: We could SIMD the second part here.
-	// Using operator[] on the vectors actually seems to result in pretty good code.
+	// Shared setup is done, now loop per-triangle in the group of four.
 	for (int t = 0; t < 4; t++) {
 		// Check for bad triangle.
+		// Using operator[] on the vectors actually seems to result in pretty good code.
 		if (maxX[t] <= minX[t] || maxY[t] <= minY[t]) {
 			// No pixels, or outside screen.
 			// Most of these are now gone in the initial pass, but not all since we cull
@@ -181,7 +177,7 @@ void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthSc
 		const int minYT = minY[t];
 		const int maxYT = maxY[t];
 
-		// Convert per-triangle values to wide registers.
+		// Convert to wide registers.
 		Vec4S32 initialX = Vec4S32::Splat(minXT) + Vec4S32::LoadAligned(zero123);
 		int initialY = minY[t];
 		_dbg_assert_(A12[t] < 32767);
@@ -191,6 +187,7 @@ void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthSc
 		_dbg_assert_(A01[t] < 32767);
 		_dbg_assert_(A01[t] > -32767);
 
+		// TODO: The latter subexpression can be broken out of this loop, but reduces block size flexibility.
 		Vec4S32 w0_row = Vec4S32::Splat(A12[t]).Mul16(initialX) + Vec4S32::Splat(B12[t] * initialY + C12[t]);
 		Vec4S32 w1_row = Vec4S32::Splat(A20[t]).Mul16(initialX) + Vec4S32::Splat(B20[t] * initialY + C20[t]);
 		Vec4S32 w2_row = Vec4S32::Splat(A01[t]).Mul16(initialX) + Vec4S32::Splat(B01[t] * initialY + C01[t]);
@@ -239,7 +236,7 @@ void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthSc
 					// We use AndNot to zero out Z results, before doing Max with the buffer.
 					shortZ.AndNot(shortMaskInv).Max(bufferValues).Store(rowPtr + x);
 					break;
-				case ZCompareMode::Less:  // UNTESTED
+				case ZCompareMode::Less:
 					// This time, we OR the mask and use .Min.
 					(shortZ | shortMaskInv).Min(bufferValues).Store(rowPtr + x);
 					break;