From f70889ad49f27871188153a83c6363270396e16e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sun, 29 Dec 2024 13:42:25 +0100 Subject: [PATCH 01/15] Better triangle area calculation, thanks fp64 for the reminder --- GPU/Common/DepthRaster.cpp | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index b464e3012bfe..f68ac6f89b93 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -132,19 +132,19 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor // are slow on SSE2. // NOTE: Triangles are stored in groups of 4. - int v0x = tx[0]; - int v0y = ty[0]; - int v1x = tx[4]; - int v1y = ty[4]; - int v2x = tx[8]; - int v2y = ty[8]; + int x0 = tx[0]; + int y0 = ty[0]; + int x1 = tx[4]; + int y1 = ty[4]; + int x2 = tx[8]; + int y2 = ty[8]; // use fixed-point only for X and Y. Avoid work for Z and W. // We use 4x1 tiles for simplicity. - int minX = std::max(std::min(std::min(v0x, v1x), v2x), (int)scissor.x1) & ~3; - int maxX = std::min(std::max(std::max(v0x, v1x), v2x) + 3, (int)scissor.x2) & ~3; - int minY = std::max(std::min(std::min(v0y, v1y), v2y), (int)scissor.y1); - int maxY = std::min(std::max(std::max(v0y, v1y), v2y), (int)scissor.y2); + int minX = std::max(std::min(std::min(x0, x1), x2), (int)scissor.x1) & ~3; + int maxX = std::min(std::max(std::max(x0, x1), x2) + 3, (int)scissor.x2) & ~3; + int minY = std::max(std::min(std::min(y0, y1), y2), (int)scissor.y1); + int maxY = std::min(std::max(std::max(y0, y1), y2), (int)scissor.y2); if (maxX == minX || maxY == minY) { // No pixels, or outside screen. // Most of these are now gone in the initial pass. @@ -152,7 +152,7 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor } // TODO: Cull really small triangles here - we can increase the threshold a bit probably. - int triArea = (v1y - v2y) * v0x + (v2x - v1x) * v0y + (v1x * v2y - v2x * v1y); + int triArea = (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0); if (triArea < MIN_TWICE_TRI_AREA) { return TriangleResult::SmallOrBackface; // Or zero area. } @@ -161,9 +161,9 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor Edge e01, e12, e20; - Vec4S32 w0_row = e12.init(v1x, v1y, v2x, v2y, minX, minY); - Vec4S32 w1_row = e20.init(v2x, v2y, v0x, v0y, minX, minY); - Vec4S32 w2_row = e01.init(v0x, v0y, v1x, v1y, minX, minY); + Vec4S32 w0_row = e12.init(x1, y1, x2, y2, minX, minY); + Vec4S32 w1_row = e20.init(x2, y2, x0, y0, minX, minY); + Vec4S32 w2_row = e01.init(x0, y0, x1, y1, minX, minY); // Prepare to interpolate Z Vec4F32 zz0 = Vec4F32::Splat(tz[0]); @@ -435,10 +435,10 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr continue; } - // Floating point triangle area. Can't be reused for the integer-snapped raster reliably (though may work...) + // Floating point double triangle area. Can't be reused for the integer-snapped raster reliably (though may work...) // Still good for culling early and pretty cheap to compute. - Vec4F32 triArea = (y1 - y2) * x0 + (x2 - x1) * y0 + (x1 * y2 - x2 * y1) - Vec4F32::Splat((float)MIN_TWICE_TRI_AREA); - if (!AnyZeroSignBit(triArea)) { + Vec4F32 doubleTriArea = (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0) - Vec4F32::Splat((float)MIN_TWICE_TRI_AREA); + if (!AnyZeroSignBit(doubleTriArea)) { gpuStats.numDepthRasterEarlySize += 4; continue; } From 1195c630c3023ebc78784246386f27fe033cf8d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sun, 29 Dec 2024 14:01:13 +0100 Subject: [PATCH 02/15] Some variable renaming --- GPU/Common/DepthRaster.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index f68ac6f89b93..afd2718a4921 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -85,29 +85,29 @@ static void DepthRasterRect(uint16_t *dest, int stride, const DepthScissor sciss } } -alignas(16) static const int zero123[4] = {0, 1, 2, 3}; +alignas(16) static const int zero123[4] = {0, 1, 2, 3}; + +constexpr int stepXSize = 4; +constexpr int stepYSize = 1; struct Edge { // Dimensions of our pixel group - static const int stepXSize = 4; - static const int stepYSize = 1; - Vec4S32 oneStepX; Vec4S32 oneStepY; - Vec4S32 init(int v0x, int v0y, int v1x, int v1y, int p0x, int p0y) { + Vec4S32 init(int xa, int ya, int xb, int yb, int originX, int originY) { // Edge setup - int A = v0y - v1y; - int B = v1x - v0x; - int C = v0x * v1y - v0y * v1x; + int A = ya - yb; + int B = xb - xa; + int C = xa * yb - ya * xb; // Step deltas oneStepX = Vec4S32::Splat(A * stepXSize); oneStepY = Vec4S32::Splat(B * stepYSize); // x/y values for initial pixel block. Add horizontal offsets. - Vec4S32 x = Vec4S32::Splat(p0x) + Vec4S32::LoadAligned(zero123); - Vec4S32 y = Vec4S32::Splat(p0y); + Vec4S32 x = Vec4S32::Splat(originX) + Vec4S32::LoadAligned(zero123); + Vec4S32 y = Vec4S32::Splat(originY); // Edge function values at origin return Vec4S32::Splat(A) * x + Vec4S32::Splat(B) * y + Vec4S32::Splat(C); @@ -175,7 +175,7 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor Vec4F32 zrow = zz0 + Vec4F32FromS32(w1_row) * zz1 + Vec4F32FromS32(w2_row) * zz2; // Rasterize - for (int y = minY; y <= maxY; y += Edge::stepYSize, w0_row += e12.oneStepY, w1_row += e20.oneStepY, w2_row += e01.oneStepY, zrow += zdeltaY) { + for (int y = minY; y <= maxY; y += stepYSize, w0_row += e12.oneStepY, w1_row += e20.oneStepY, w2_row += e01.oneStepY, zrow += zdeltaY) { // Barycentric coordinates at start of row Vec4S32 w0 = w0_row; Vec4S32 w1 = w1_row; @@ -184,7 +184,7 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor uint16_t *rowPtr = depthBuf + stride * y; - for (int x = minX; x <= maxX; x += Edge::stepXSize, w0 += e12.oneStepX, w1 += e20.oneStepX, w2 += e01.oneStepX, zs += zdeltaX) { + for (int x = minX; x <= maxX; x += stepXSize, w0 += e12.oneStepX, w1 += e20.oneStepX, w2 += e01.oneStepX, zs += zdeltaX) { // If p is on or inside all edges for any pixels, // render those pixels. Vec4S32 signCalc = w0 | w1 | w2; From 69b35e914690d51a06fdf8979e33d1e0996aa7aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sun, 29 Dec 2024 14:10:08 +0100 Subject: [PATCH 03/15] Inline edge calculations --- GPU/Common/DepthRaster.cpp | 80 +++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 32 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index afd2718a4921..b9a94afcf739 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -90,30 +90,6 @@ alignas(16) static const int zero123[4] = {0, 1, 2, 3}; constexpr int stepXSize = 4; constexpr int stepYSize = 1; -struct Edge { - // Dimensions of our pixel group - Vec4S32 oneStepX; - Vec4S32 oneStepY; - - Vec4S32 init(int xa, int ya, int xb, int yb, int originX, int originY) { - // Edge setup - int A = ya - yb; - int B = xb - xa; - int C = xa * yb - ya * xb; - - // Step deltas - oneStepX = Vec4S32::Splat(A * stepXSize); - oneStepY = Vec4S32::Splat(B * stepYSize); - - // x/y values for initial pixel block. Add horizontal offsets. - Vec4S32 x = Vec4S32::Splat(originX) + Vec4S32::LoadAligned(zero123); - Vec4S32 y = Vec4S32::Splat(originY); - - // Edge function values at origin - return Vec4S32::Splat(A) * x + Vec4S32::Splat(B) * y + Vec4S32::Splat(C); - } -}; - enum class TriangleResult { OK, NoPixels, @@ -159,23 +135,63 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor float oneOverTriArea = 1.0f / (float)triArea; - Edge e01, e12, e20; + // Edge setup + int A12 = y1 - y2; + int B12 = x2 - x1; + int C12 = x1 * y2 - y1 * x2; + + // Step deltas + Vec4S32 oneStepX12 = Vec4S32::Splat(A12 * stepXSize); + Vec4S32 oneStepY12 = Vec4S32::Splat(B12 * stepYSize); + + // x/y values for initial pixel block. Add horizontal offsets. + Vec4S32 x12 = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123); + Vec4S32 y12 = Vec4S32::Splat(minY); + + // Edge function values at origin + + // Edge setup + int A20 = y2 - y0; + int B20 = x0 - x2; + int C20 = x2 * y0 - y2 * x0; + + // Step deltas + Vec4S32 oneStepX20 = Vec4S32::Splat(A20 * stepXSize); + Vec4S32 oneStepY20 = Vec4S32::Splat(B20 * stepYSize); + + // x/y values for initial pixel block. Add horizontal offsets. + Vec4S32 x20 = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123); + Vec4S32 y20 = Vec4S32::Splat(minY); + + // Edge setup + int A01 = y0 - y1; + int B01 = x1 - x0; + int C01 = x0 * y1 - y0 * x1; + + // Step deltas + Vec4S32 oneStepX01 = Vec4S32::Splat(A01 * stepXSize); + Vec4S32 oneStepY01 = Vec4S32::Splat(B01 * stepYSize); + + // x/y values for initial pixel block. Add horizontal offsets. + Vec4S32 x01 = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123); + Vec4S32 y01 = Vec4S32::Splat(minY); - Vec4S32 w0_row = e12.init(x1, y1, x2, y2, minX, minY); - Vec4S32 w1_row = e20.init(x2, y2, x0, y0, minX, minY); - Vec4S32 w2_row = e01.init(x0, y0, x1, y1, minX, minY); + // Edge function values at origin + Vec4S32 w0_row = Vec4S32::Splat(A12) * x12 + Vec4S32::Splat(B12) * y12 + Vec4S32::Splat(C12); + Vec4S32 w1_row = Vec4S32::Splat(A20) * x20 + Vec4S32::Splat(B20) * y20 + Vec4S32::Splat(C20); + Vec4S32 w2_row = Vec4S32::Splat(A01) * x01 + Vec4S32::Splat(B01) * y01 + Vec4S32::Splat(C01); // Prepare to interpolate Z Vec4F32 zz0 = Vec4F32::Splat(tz[0]); Vec4F32 zz1 = Vec4F32::Splat((tz[4] - tz[0]) * oneOverTriArea); Vec4F32 zz2 = Vec4F32::Splat((tz[8] - tz[0]) * oneOverTriArea); - Vec4F32 zdeltaX = zz1 * Vec4F32FromS32(e20.oneStepX) + zz2 * Vec4F32FromS32(e01.oneStepX); - Vec4F32 zdeltaY = zz1 * Vec4F32FromS32(e20.oneStepY) + zz2 * Vec4F32FromS32(e01.oneStepY); + Vec4F32 zdeltaX = zz1 * Vec4F32FromS32(oneStepX20) + zz2 * Vec4F32FromS32(oneStepX01); + Vec4F32 zdeltaY = zz1 * Vec4F32FromS32(oneStepY20) + zz2 * Vec4F32FromS32(oneStepY01); Vec4F32 zrow = zz0 + Vec4F32FromS32(w1_row) * zz1 + Vec4F32FromS32(w2_row) * zz2; // Rasterize - for (int y = minY; y <= maxY; y += stepYSize, w0_row += e12.oneStepY, w1_row += e20.oneStepY, w2_row += e01.oneStepY, zrow += zdeltaY) { + for (int y = minY; y <= maxY; y += stepYSize, w0_row += oneStepY12, w1_row += oneStepY20, w2_row += oneStepY01, zrow += zdeltaY) { // Barycentric coordinates at start of row Vec4S32 w0 = w0_row; Vec4S32 w1 = w1_row; @@ -184,7 +200,7 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor uint16_t *rowPtr = depthBuf + stride * y; - for (int x = minX; x <= maxX; x += stepXSize, w0 += e12.oneStepX, w1 += e20.oneStepX, w2 += e01.oneStepX, zs += zdeltaX) { + for (int x = minX; x <= maxX; x += stepXSize, w0 += oneStepX12, w1 += oneStepX20, w2 += oneStepX01, zs += zdeltaX) { // If p is on or inside all edges for any pixels, // render those pixels. Vec4S32 signCalc = w0 | w1 | w2; From d435945b7c4ce058efffdc12e3d8baee5302afc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sun, 29 Dec 2024 16:43:07 +0100 Subject: [PATCH 04/15] Simplify --- GPU/Common/DepthRaster.cpp | 55 +++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 30 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index b9a94afcf739..e07246ac32f5 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -140,55 +140,50 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor int B12 = x2 - x1; int C12 = x1 * y2 - y1 * x2; - // Step deltas - Vec4S32 oneStepX12 = Vec4S32::Splat(A12 * stepXSize); - Vec4S32 oneStepY12 = Vec4S32::Splat(B12 * stepYSize); - - // x/y values for initial pixel block. Add horizontal offsets. - Vec4S32 x12 = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123); - Vec4S32 y12 = Vec4S32::Splat(minY); - - // Edge function values at origin - // Edge setup int A20 = y2 - y0; int B20 = x0 - x2; int C20 = x2 * y0 - y2 * x0; - // Step deltas - Vec4S32 oneStepX20 = Vec4S32::Splat(A20 * stepXSize); - Vec4S32 oneStepY20 = Vec4S32::Splat(B20 * stepYSize); - - // x/y values for initial pixel block. Add horizontal offsets. - Vec4S32 x20 = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123); - Vec4S32 y20 = Vec4S32::Splat(minY); - // Edge setup int A01 = y0 - y1; int B01 = x1 - x0; int C01 = x0 * y1 - y0 * x1; + // Prepare to interpolate Z + float zbase = tz[0]; + float z_20 = (tz[4] - tz[0]) * oneOverTriArea; + float z_01 = (tz[8] - tz[0]) * oneOverTriArea; + + // Step deltas + Vec4S32 oneStepX12 = Vec4S32::Splat(A12 * stepXSize); + Vec4S32 oneStepY12 = Vec4S32::Splat(B12 * stepYSize); + + // Step deltas + Vec4S32 oneStepX20 = Vec4S32::Splat(A20 * stepXSize); + Vec4S32 oneStepY20 = Vec4S32::Splat(B20 * stepYSize); + // Step deltas Vec4S32 oneStepX01 = Vec4S32::Splat(A01 * stepXSize); Vec4S32 oneStepY01 = Vec4S32::Splat(B01 * stepYSize); // x/y values for initial pixel block. Add horizontal offsets. - Vec4S32 x01 = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123); - Vec4S32 y01 = Vec4S32::Splat(minY); + Vec4S32 initialX = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123); + int initialY = minY; + + // Convert per-triangle values to wide registers. // Edge function values at origin - Vec4S32 w0_row = Vec4S32::Splat(A12) * x12 + Vec4S32::Splat(B12) * y12 + Vec4S32::Splat(C12); - Vec4S32 w1_row = Vec4S32::Splat(A20) * x20 + Vec4S32::Splat(B20) * y20 + Vec4S32::Splat(C20); - Vec4S32 w2_row = Vec4S32::Splat(A01) * x01 + Vec4S32::Splat(B01) * y01 + Vec4S32::Splat(C01); + Vec4S32 w0_row = Vec4S32::Splat(A12) * initialX + Vec4S32::Splat(B12 * initialY + C12); + Vec4S32 w1_row = Vec4S32::Splat(A20) * initialX + Vec4S32::Splat(B20 * initialY + C20); + Vec4S32 w2_row = Vec4S32::Splat(A01) * initialX + Vec4S32::Splat(B01 * initialY + C01); - // Prepare to interpolate Z - Vec4F32 zz0 = Vec4F32::Splat(tz[0]); - Vec4F32 zz1 = Vec4F32::Splat((tz[4] - tz[0]) * oneOverTriArea); - Vec4F32 zz2 = Vec4F32::Splat((tz[8] - tz[0]) * oneOverTriArea); + Vec4F32 z_20_v = Vec4F32::Splat(z_20); + Vec4F32 z_01_v = Vec4F32::Splat(z_01); - Vec4F32 zdeltaX = zz1 * Vec4F32FromS32(oneStepX20) + zz2 * Vec4F32FromS32(oneStepX01); - Vec4F32 zdeltaY = zz1 * Vec4F32FromS32(oneStepY20) + zz2 * Vec4F32FromS32(oneStepY01); - Vec4F32 zrow = zz0 + Vec4F32FromS32(w1_row) * zz1 + Vec4F32FromS32(w2_row) * zz2; + Vec4F32 zdeltaX = z_20_v * Vec4F32FromS32(oneStepX20) + z_01_v * Vec4F32FromS32(oneStepX01); + Vec4F32 zdeltaY = z_20_v * Vec4F32FromS32(oneStepY20) + z_01_v * Vec4F32FromS32(oneStepY01); + Vec4F32 zrow = Vec4F32::Splat(zbase) + Vec4F32FromS32(w1_row) * z_20 + Vec4F32FromS32(w2_row) * z_01; // Rasterize for (int y = minY; y <= maxY; y += stepYSize, w0_row += oneStepY12, w1_row += oneStepY20, w2_row += oneStepY01, zrow += zdeltaY) { From 2eed309d29224ad266411d3816f8544eb784d71b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sun, 29 Dec 2024 16:55:04 +0100 Subject: [PATCH 05/15] Simplify more --- GPU/Common/DepthRaster.cpp | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index e07246ac32f5..52bfb3035347 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -156,16 +156,12 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor float z_01 = (tz[8] - tz[0]) * oneOverTriArea; // Step deltas - Vec4S32 oneStepX12 = Vec4S32::Splat(A12 * stepXSize); - Vec4S32 oneStepY12 = Vec4S32::Splat(B12 * stepYSize); - - // Step deltas - Vec4S32 oneStepX20 = Vec4S32::Splat(A20 * stepXSize); - Vec4S32 oneStepY20 = Vec4S32::Splat(B20 * stepYSize); - - // Step deltas - Vec4S32 oneStepX01 = Vec4S32::Splat(A01 * stepXSize); - Vec4S32 oneStepY01 = Vec4S32::Splat(B01 * stepYSize); + int stepX12 = A12 * stepXSize; + int stepY12 = B12 * stepYSize; + int stepX20 = A20 * stepXSize; + int stepY20 = B20 * stepYSize; + int stepX01 = A01 * stepXSize; + int stepY01 = B01 * stepYSize; // x/y values for initial pixel block. Add horizontal offsets. Vec4S32 initialX = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123); @@ -178,13 +174,16 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor Vec4S32 w1_row = Vec4S32::Splat(A20) * initialX + Vec4S32::Splat(B20 * initialY + C20); Vec4S32 w2_row = Vec4S32::Splat(A01) * initialX + Vec4S32::Splat(B01 * initialY + C01); - Vec4F32 z_20_v = Vec4F32::Splat(z_20); - Vec4F32 z_01_v = Vec4F32::Splat(z_01); - - Vec4F32 zdeltaX = z_20_v * Vec4F32FromS32(oneStepX20) + z_01_v * Vec4F32FromS32(oneStepX01); - Vec4F32 zdeltaY = z_20_v * Vec4F32FromS32(oneStepY20) + z_01_v * Vec4F32FromS32(oneStepY01); + Vec4F32 zdeltaX = Vec4F32::Splat(z_20 * (float)stepX20 + z_01 * (float)stepX01); + Vec4F32 zdeltaY = Vec4F32::Splat(z_20 * (float)stepY20 + z_01 * (float)stepY01); Vec4F32 zrow = Vec4F32::Splat(zbase) + Vec4F32FromS32(w1_row) * z_20 + Vec4F32FromS32(w2_row) * z_01; + Vec4S32 oneStepX12 = Vec4S32::Splat(stepX12); + Vec4S32 oneStepY12 = Vec4S32::Splat(stepY12); + Vec4S32 oneStepX20 = Vec4S32::Splat(stepX20); + Vec4S32 oneStepY20 = Vec4S32::Splat(stepY20); + Vec4S32 oneStepX01 = Vec4S32::Splat(stepX01); + Vec4S32 oneStepY01 = Vec4S32::Splat(stepY01); // Rasterize for (int y = minY; y <= maxY; y += stepYSize, w0_row += oneStepY12, w1_row += oneStepY20, w2_row += oneStepY01, zrow += zdeltaY) { // Barycentric coordinates at start of row From 373569bf64e33977404e8b48402a0ef271f66f8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sun, 29 Dec 2024 17:51:44 +0100 Subject: [PATCH 06/15] More prep. Add triangle loop. --- GPU/Common/DepthRaster.cpp | 140 ++++++++++++++++++++----------------- 1 file changed, 74 insertions(+), 66 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index 52bfb3035347..2eb4da83b9af 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -104,8 +104,9 @@ constexpr int MIN_TWICE_TRI_AREA = 10; template TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) { // BEGIN triangle setup. This should be done SIMD, four triangles at a time. - // Due to the many multiplications, we might want to do it in floating point as 32-bit integer muls - // are slow on SSE2. + // 16x16->32 multiplications are doable on SSE2, which should be all we need. + + // We use 4x1 SIMD tiles for simplicity. 2x2 would be ideal but stores/loads get annoying. // NOTE: Triangles are stored in groups of 4. int x0 = tx[0]; @@ -115,12 +116,11 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor int x2 = tx[8]; int y2 = ty[8]; - // use fixed-point only for X and Y. Avoid work for Z and W. - // We use 4x1 tiles for simplicity. int minX = std::max(std::min(std::min(x0, x1), x2), (int)scissor.x1) & ~3; int maxX = std::min(std::max(std::max(x0, x1), x2) + 3, (int)scissor.x2) & ~3; int minY = std::max(std::min(std::min(y0, y1), y2), (int)scissor.y1); int maxY = std::min(std::max(std::max(y0, y1), y2), (int)scissor.y2); + if (maxX == minX || maxY == minY) { // No pixels, or outside screen. // Most of these are now gone in the initial pass. @@ -150,11 +150,6 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor int B01 = x1 - x0; int C01 = x0 * y1 - y0 * x1; - // Prepare to interpolate Z - float zbase = tz[0]; - float z_20 = (tz[4] - tz[0]) * oneOverTriArea; - float z_01 = (tz[8] - tz[0]) * oneOverTriArea; - // Step deltas int stepX12 = A12 * stepXSize; int stepY12 = B12 * stepYSize; @@ -163,67 +158,80 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor int stepX01 = A01 * stepXSize; int stepY01 = B01 * stepYSize; - // x/y values for initial pixel block. Add horizontal offsets. - Vec4S32 initialX = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123); - int initialY = minY; - - // Convert per-triangle values to wide registers. + // Prepare to interpolate Z + float zbase = tz[0]; + float z_20 = (tz[4] - tz[0]) * oneOverTriArea; + float z_01 = (tz[8] - tz[0]) * oneOverTriArea; + float zdx = z_20 * (float)stepX20 + z_01 * (float)stepX01; + float zdy = z_20 * (float)stepY20 + z_01 * (float)stepY01; // Edge function values at origin - Vec4S32 w0_row = Vec4S32::Splat(A12) * initialX + Vec4S32::Splat(B12 * initialY + C12); - Vec4S32 w1_row = Vec4S32::Splat(A20) * initialX + Vec4S32::Splat(B20 * initialY + C20); - Vec4S32 w2_row = Vec4S32::Splat(A01) * initialX + Vec4S32::Splat(B01 * initialY + C01); - - Vec4F32 zdeltaX = Vec4F32::Splat(z_20 * (float)stepX20 + z_01 * (float)stepX01); - Vec4F32 zdeltaY = Vec4F32::Splat(z_20 * (float)stepY20 + z_01 * (float)stepY01); - Vec4F32 zrow = Vec4F32::Splat(zbase) + Vec4F32FromS32(w1_row) * z_20 + Vec4F32FromS32(w2_row) * z_01; - - Vec4S32 oneStepX12 = Vec4S32::Splat(stepX12); - Vec4S32 oneStepY12 = Vec4S32::Splat(stepY12); - Vec4S32 oneStepX20 = Vec4S32::Splat(stepX20); - Vec4S32 oneStepY20 = Vec4S32::Splat(stepY20); - Vec4S32 oneStepX01 = Vec4S32::Splat(stepX01); - Vec4S32 oneStepY01 = Vec4S32::Splat(stepY01); - // Rasterize - for (int y = minY; y <= maxY; y += stepYSize, w0_row += oneStepY12, w1_row += oneStepY20, w2_row += oneStepY01, zrow += zdeltaY) { - // Barycentric coordinates at start of row - Vec4S32 w0 = w0_row; - Vec4S32 w1 = w1_row; - Vec4S32 w2 = w2_row; - Vec4F32 zs = zrow; - - uint16_t *rowPtr = depthBuf + stride * y; - - for (int x = minX; x <= maxX; x += stepXSize, w0 += oneStepX12, w1 += oneStepX20, w2 += oneStepX01, zs += zdeltaX) { - // If p is on or inside all edges for any pixels, - // render those pixels. - Vec4S32 signCalc = w0 | w1 | w2; - if (!AnyZeroSignBit(signCalc)) { - continue; - } - - Vec4U16 bufferValues = Vec4U16::Load(rowPtr + x); - Vec4U16 shortMaskInv = SignBits32ToMaskU16(signCalc); - // Now, the mask has 1111111 where we should preserve the contents of the depth buffer. + // TODO: We could SIMD the second part here. + for (int t = 0; t < 1; t++) { + // Check for bad triangle. + if (triArea[t] == 0) { + continue; + } - Vec4U16 shortZ = Vec4U16::FromVec4F32(zs); + // Convert per-triangle values to wide registers. + Vec4S32 initialX = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123); + int initialY = minY; + + Vec4S32 w0_row = Vec4S32::Splat(A12) * initialX + Vec4S32::Splat(B12 * initialY + C12); + Vec4S32 w1_row = Vec4S32::Splat(A20) * initialX + Vec4S32::Splat(B20 * initialY + C20); + Vec4S32 w2_row = Vec4S32::Splat(A01) * initialX + Vec4S32::Splat(B01 * initialY + C01); + + Vec4F32 zrow = Vec4F32::Splat(zbase) + Vec4F32FromS32(w1_row) * z_20 + Vec4F32FromS32(w2_row) * z_01; + Vec4F32 zdeltaX = Vec4F32::Splat(zdx); + Vec4F32 zdeltaY = Vec4F32::Splat(zdy); + + Vec4S32 oneStepX12 = Vec4S32::Splat(stepX12); + Vec4S32 oneStepY12 = Vec4S32::Splat(stepY12); + Vec4S32 oneStepX20 = Vec4S32::Splat(stepX20); + Vec4S32 oneStepY20 = Vec4S32::Splat(stepY20); + Vec4S32 oneStepX01 = Vec4S32::Splat(stepX01); + Vec4S32 oneStepY01 = Vec4S32::Splat(stepY01); + // Rasterize + for (int y = minY; y <= maxY; y += stepYSize, w0_row += oneStepY12, w1_row += oneStepY20, w2_row += oneStepY01, zrow += zdeltaY) { + // Barycentric coordinates at start of row + Vec4S32 w0 = w0_row; + Vec4S32 w1 = w1_row; + Vec4S32 w2 = w2_row; + Vec4F32 zs = zrow; + + uint16_t *rowPtr = depthBuf + stride * y; + + for (int x = minX; x <= maxX; x += stepXSize, w0 += oneStepX12, w1 += oneStepX20, w2 += oneStepX01, zs += zdeltaX) { + // If p is on or inside all edges for any pixels, + // render those pixels. + Vec4S32 signCalc = w0 | w1 | w2; + if (!AnyZeroSignBit(signCalc)) { + continue; + } - // This switch is on a templated constant, so should collapse away. - switch (compareMode) { - case ZCompareMode::Greater: - // To implement the greater/greater-than comparison, we can combine mask and max. - // Unfortunately there's no unsigned max on SSE2, it's synthesized by xoring 0x8000 on input and output. - // We use AndNot to zero out Z results, before doing Max with the buffer. - AndNot(shortZ, shortMaskInv).Max(bufferValues).Store(rowPtr + x); - break; - case ZCompareMode::Less: // UNTESTED - // This time, we OR the mask and use .Min. - (shortZ | shortMaskInv).Min(bufferValues).Store(rowPtr + x); - break; - case ZCompareMode::Always: // UNTESTED - // This could be replaced with a vblend operation. - ((bufferValues & shortMaskInv) | AndNot(shortZ, shortMaskInv)).Store(rowPtr + x); - break; + Vec4U16 bufferValues = Vec4U16::Load(rowPtr + x); + Vec4U16 shortMaskInv = SignBits32ToMaskU16(signCalc); + // Now, the mask has 1111111 where we should preserve the contents of the depth buffer. + + Vec4U16 shortZ = Vec4U16::FromVec4F32(zs); + + // This switch is on a templated constant, so should collapse away. + switch (compareMode) { + case ZCompareMode::Greater: + // To implement the greater/greater-than comparison, we can combine mask and max. + // Unfortunately there's no unsigned max on SSE2, it's synthesized by xoring 0x8000 on input and output. + // We use AndNot to zero out Z results, before doing Max with the buffer. + AndNot(shortZ, shortMaskInv).Max(bufferValues).Store(rowPtr + x); + break; + case ZCompareMode::Less: // UNTESTED + // This time, we OR the mask and use .Min. + (shortZ | shortMaskInv).Min(bufferValues).Store(rowPtr + x); + break; + case ZCompareMode::Always: // UNTESTED + // This could be replaced with a vblend operation. + ((bufferValues & shortMaskInv) | AndNot(shortZ, shortMaskInv)).Store(rowPtr + x); + break; + } } } } From de09dec9d1da211afb95133ce2421f77ae537949 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Mon, 30 Dec 2024 13:01:08 +0100 Subject: [PATCH 07/15] Move branches out of triangle setup --- GPU/Common/DepthRaster.cpp | 46 +++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index 2eb4da83b9af..e54e4496694a 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -109,29 +109,23 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor // We use 4x1 SIMD tiles for simplicity. 2x2 would be ideal but stores/loads get annoying. // NOTE: Triangles are stored in groups of 4. - int x0 = tx[0]; - int y0 = ty[0]; - int x1 = tx[4]; - int y1 = ty[4]; - int x2 = tx[8]; - int y2 = ty[8]; - - int minX = std::max(std::min(std::min(x0, x1), x2), (int)scissor.x1) & ~3; - int maxX = std::min(std::max(std::max(x0, x1), x2) + 3, (int)scissor.x2) & ~3; - int minY = std::max(std::min(std::min(y0, y1), y2), (int)scissor.y1); - int maxY = std::min(std::max(std::max(y0, y1), y2), (int)scissor.y2); - - if (maxX == minX || maxY == minY) { - // No pixels, or outside screen. - // Most of these are now gone in the initial pass. - return TriangleResult::NoPixels; - } + float x0 = tx[0]; + float y0 = ty[0]; + float x1 = tx[4]; + float y1 = ty[4]; + float x2 = tx[8]; + float y2 = ty[8]; + + // Load the entire scissor rect into one SIMD register. + // Vec4F32 scissor = Vec4F32::LoadConvertS16(&scissor.x1); + + int minX = (int)std::max(std::min(std::min(x0, x1), x2), (float)scissor.x1) & ~3; + int maxX = (int)std::min(std::max(std::max(x0, x1), x2) + 3, (float)scissor.x2) & ~3; + int minY = (int)std::max(std::min(std::min(y0, y1), y2), (float)scissor.y1); + int maxY = (int)std::min(std::max(std::max(y0, y1), y2), (float)scissor.y2); // TODO: Cull really small triangles here - we can increase the threshold a bit probably. int triArea = (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0); - if (triArea < MIN_TWICE_TRI_AREA) { - return TriangleResult::SmallOrBackface; // Or zero area. - } float oneOverTriArea = 1.0f / (float)triArea; @@ -169,10 +163,20 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor // TODO: We could SIMD the second part here. for (int t = 0; t < 1; t++) { // Check for bad triangle. - if (triArea[t] == 0) { + if (triArea /*[t]*/ <= 0) { continue; } + if (maxX == minX || maxY == minY) { + // No pixels, or outside screen. + // Most of these are now gone in the initial pass. + return TriangleResult::NoPixels; + } + + if (triArea < MIN_TWICE_TRI_AREA) { + return TriangleResult::SmallOrBackface; // Or zero area. + } + // Convert per-triangle values to wide registers. Vec4S32 initialX = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123); int initialY = minY; From c3ac798545e848cfc02e16a5532745e1af66a260 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Mon, 30 Dec 2024 16:46:14 +0100 Subject: [PATCH 08/15] More crosssimd --- Common/Math/CrossSIMD.h | 67 ++++++++++++++++++++++------------------- unittest/UnitTest.cpp | 16 ++++++++++ 2 files changed, 52 insertions(+), 31 deletions(-) diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index a8b68ba0c127..4daddea68125 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -121,12 +121,6 @@ struct Vec4S32 { void Store2(int *dst) { _mm_storel_epi64((__m128i *)dst, v); } void StoreAligned(int *dst) { _mm_store_si128((__m128i *)dst, v);} - // Swaps the two lower elements. Useful for reversing triangles.. - Vec4S32 SwapLowerElements() { - return Vec4S32{ - _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 0, 1)) - }; - } Vec4S32 SignBits32ToMask() { return Vec4S32{ _mm_srai_epi32(v, 31) @@ -144,6 +138,12 @@ struct Vec4S32 { return Vec4S32{ _mm_madd_epi16(v, _mm_and_si128(other.v, _mm_set1_epi32(0x0000FFFF))) }; } + Vec4S32 SignExtend16() const { return Vec4S32{ _mm_srai_epi32(_mm_slli_epi32(v, 16), 16) }; } + // NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output. + Vec4S32 Min16(Vec4S32 other) const { return Vec4S32{ _mm_min_epi16(v, other.v) }; } + Vec4S32 Max16(Vec4S32 other) const { return Vec4S32{ _mm_max_epi16(v, other.v) }; } + Vec4S32 FixupAfterMinMax() const { return SignExtend16(); } + Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ _mm_add_epi32(v, other.v) }; } Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ _mm_sub_epi32(v, other.v) }; } Vec4S32 operator |(Vec4S32 other) const { return Vec4S32{ _mm_or_si128(v, other.v) }; } @@ -153,6 +153,11 @@ struct Vec4S32 { void operator +=(Vec4S32 other) { v = _mm_add_epi32(v, other.v); } void operator -=(Vec4S32 other) { v = _mm_sub_epi32(v, other.v); } + Vec4S32 operator <<(int imm) const { return Vec4S32{ _mm_slli_epi32(v, imm) }; } + + // NOTE: May be slow. + int operator[](size_t index) const { return ((int *)&v)[index]; } + // NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow. Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; } // (ab3,ab2,ab1,ab0) @@ -217,9 +222,12 @@ struct Vec4F32 { void operator *=(Vec4F32 other) { v = _mm_mul_ps(v, other.v); } void operator /=(Vec4F32 other) { v = _mm_div_ps(v, other.v); } Vec4F32 operator *(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; } + // NOTE: May be slow. + float operator[](size_t index) const { return ((float *)&v)[index]; } Vec4F32 Mul(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; } - Vec4F32 Recip() { return Vec4F32{ _mm_rcp_ps(v) }; } + Vec4F32 RecipApprox() const { return Vec4F32{ _mm_rcp_ps(v) }; } + Vec4F32 Recip() const { return Vec4F32{ _mm_div_ps(_mm_set1_ps(1.0f), v) }; } Vec4F32 Clamp(float lower, float higher) { return Vec4F32{ @@ -238,13 +246,6 @@ struct Vec4F32 { return Vec4F32{ _mm_or_ps(_mm_and_ps(v, _mm_load_ps((const float *)mask)), _mm_load_ps((const float *)onelane3)) }; } - // Swaps the two lower elements. Useful for reversing triangles.. - Vec4F32 SwapLowerElements() { - return Vec4F32{ - _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 2, 0, 1)) - }; - } - inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) { return Vec4F32{ _mm_add_ps( _mm_add_ps( @@ -443,17 +444,18 @@ struct Vec4S32 { void Store2(int *dst) { vst1_s32(dst, vget_low_s32(v)); } void StoreAligned(int *dst) { vst1q_s32(dst, v); } - // Swaps the two lower elements, but NOT the two upper ones. Useful for reversing triangles.. - // This is quite awkward on ARM64 :/ Maybe there's a better solution? - Vec4S32 SwapLowerElements() { - int32x2_t upper = vget_high_s32(v); - int32x2_t lowerSwapped = vrev64_s32(vget_low_s32(v)); - return Vec4S32{ vcombine_s32(lowerSwapped, upper) }; - }; - // Warning: Unlike on x86, this is a full 32-bit multiplication. Vec4S32 MulAsS16(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; } + Vec4S32 SignExtend16() const { return Vec4S32{ vshrq_n_s32(vshlq_n_s32(v, 16), 16) }; } + // NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output (on SSE2 at least). + Vec4S32 Min16(Vec4S32 other) const { return Vec4S32{ vminq_s32(v, other.v) }; } + Vec4S32 Max16(Vec4S32 other) const { return Vec4S32{ vmaxq_s32(v, other.v) }; } + Vec4S32 FixupAfterMinMax() const { return Vec4S32{ v }; } + + // NOTE: May be slow. + int operator[](size_t index) const { return ((int *)&v)[index]; } + Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ vaddq_s32(v, other.v) }; } Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ vsubq_s32(v, other.v) }; } Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; } @@ -508,6 +510,9 @@ struct Vec4F32 { return Vec4F32{ vcvtq_f32_s32(other.v) }; } + // NOTE: May be slow. + float operator[](size_t index) const { return ((float *)&v)[index]; } + Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ vaddq_f32(v, other.v) }; } Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ vsubq_f32(v, other.v) }; } Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ vmulq_f32(v, other.v) }; } @@ -521,15 +526,22 @@ struct Vec4F32 { Vec4F32 Mul(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; } - Vec4F32 Recip() { + Vec4F32 Recip() const { float32x4_t recip = vrecpeq_f32(v); // Use a couple Newton-Raphson steps to refine the estimate. - // May be able to get away with only one refinement, not sure! + // To save one iteration at the expense of accuracy, use RecipApprox(). recip = vmulq_f32(vrecpsq_f32(v, recip), recip); recip = vmulq_f32(vrecpsq_f32(v, recip), recip); return Vec4F32{ recip }; } + Vec4F32 RecipApprox() const { + float32x4_t recip = vrecpeq_f32(v); + // To approximately match the precision of x86-64's rcpps, do a single iteration. + recip = vmulq_f32(vrecpsq_f32(v, recip), recip); + return Vec4F32{ recip }; + } + Vec4F32 Clamp(float lower, float higher) { return Vec4F32{ vminq_f32(vmaxq_f32(v, vdupq_n_f32(lower)), vdupq_n_f32(higher)) @@ -544,13 +556,6 @@ struct Vec4F32 { return Vec4F32{ vsetq_lane_f32(1.0f, v, 3) }; } - // Swaps the two lower elements, but NOT the two upper ones. Useful for reversing triangles.. - // This is quite awkward on ARM64 :/ Maybe there's a better solution? - Vec4F32 SwapLowerElements() { - float32x2_t lowerSwapped = vrev64_f32(vget_low_f32(v)); - return Vec4F32{ vcombine_f32(lowerSwapped, vget_high_f32(v)) }; - }; - // One of many possible solutions. Sometimes we could also use vld4q_f32 probably.. static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) { #if PPSSPP_ARCH(ARM64_NEON) diff --git a/unittest/UnitTest.cpp b/unittest/UnitTest.cpp index a087d205b96b..fb45e9c6f076 100644 --- a/unittest/UnitTest.cpp +++ b/unittest/UnitTest.cpp @@ -56,6 +56,7 @@ #include "Common/Buffer.h" #include "Common/File/Path.h" #include "Common/Math/SIMDHeaders.h" +#include "Common/Math/CrossSIMD.h" // Get some more instructions for testing #if PPSSPP_ARCH(SSE2) #include @@ -1124,6 +1125,21 @@ bool TestSIMD() { EXPECT_EQ_INT(testdata2[2], 0x8888777766665555); EXPECT_EQ_INT(testdata2[2], 0x8888777766665555); #endif + + const int testval[2][4] = { + { 0x1000, 0x2000, 0x3000, 0x7000 }, + { -0x1000, -0x2000, -0x3000, -0x7000 } + }; + + for (int i = 0; i < 2; i++) { + Vec4S32 s = Vec4S32::Load(testval[i]); + Vec4S32 square = s * s; + Vec4S32 square16 = s.Mul16(s); + EXPECT_EQ_INT(square[0], square16[0]); + EXPECT_EQ_INT(square[1], square16[1]); + EXPECT_EQ_INT(square[2], square16[2]); + EXPECT_EQ_INT(square[3], square16[3]); + } return true; } From 36c5065d5da8fef84539ff917567b73bc26517b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Mon, 30 Dec 2024 16:50:24 +0100 Subject: [PATCH 09/15] Add crude guardband culling to depth rasterizer --- GPU/Common/DepthRaster.cpp | 41 ++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index e54e4496694a..768c44a5f4d7 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -90,7 +90,7 @@ alignas(16) static const int zero123[4] = {0, 1, 2, 3}; constexpr int stepXSize = 4; constexpr int stepYSize = 1; -enum class TriangleResult { +enum class TriangleStat { OK, NoPixels, SmallOrBackface, @@ -102,7 +102,7 @@ constexpr int MIN_TWICE_TRI_AREA = 10; // Started with the scalar version, will SIMD-ify later. // x1/y1 etc are the scissor rect. template -TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) { +TriangleStat DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) { // BEGIN triangle setup. This should be done SIMD, four triangles at a time. // 16x16->32 multiplications are doable on SSE2, which should be all we need. @@ -170,11 +170,11 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor if (maxX == minX || maxY == minY) { // No pixels, or outside screen. // Most of these are now gone in the initial pass. - return TriangleResult::NoPixels; + return TriangleStat::NoPixels; } if (triArea < MIN_TWICE_TRI_AREA) { - return TriangleResult::SmallOrBackface; // Or zero area. + return TriangleStat::SmallOrBackface; // Or zero area. } // Convert per-triangle values to wide registers. @@ -239,13 +239,13 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor } } } - return TriangleResult::OK; + return TriangleStat::OK; } template inline void DepthRaster4Triangles(int stats[4], uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) { for (int i = 0; i < 4; i++) { - TriangleResult result = DepthRasterTriangle(depthBuf, stride, scissor, tx + i, ty + i, tz + i); + TriangleStat result = DepthRasterTriangle(depthBuf, stride, scissor, tx + i, ty + i, tz + i); stats[(int)result]++; } } @@ -373,6 +373,11 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr const float *verts[12]; // four triangles at a time! const int count = draw.vertexCount; + // Not exactly the same guardband as on the real PSP, but good enough to prevent 16-bit overflow in raster. + // This is slightly off-center since we are already in screen space, but whatever. We compensate a little for it in the bottom right. + Vec4S32 guardBandTopLeft = Vec4S32::Splat(-2048); + Vec4S32 guardBandBottomRight = Vec4S32::Splat(2348); + Vec4F32 scissorX1 = Vec4F32::Splat((float)scissor.x1); Vec4F32 scissorY1 = Vec4F32::Splat((float)scissor.y1); Vec4F32 scissorX2 = Vec4F32::Splat((float)scissor.x2); @@ -457,6 +462,11 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr continue; } + // Create a mask to kill coordinates of triangles that poke outside the guardband. + Vec4S32 inGuardBand = + (minX.CompareGt(guardBandTopLeft) & maxX.CompareLt(guardBandBottomRight)) & + (minY.CompareGt(guardBandTopLeft) & maxY.CompareLt(guardBandBottomRight)); + // Floating point double triangle area. Can't be reused for the integer-snapped raster reliably (though may work...) // Still good for culling early and pretty cheap to compute. Vec4F32 doubleTriArea = (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0) - Vec4F32::Splat((float)MIN_TWICE_TRI_AREA); @@ -465,9 +475,10 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr continue; } - Vec4S32FromF32(x0).Store(tx + outCount); - Vec4S32FromF32(x1).Store(tx + outCount + 4); - Vec4S32FromF32(x2).Store(tx + outCount + 8); + // Note: If any triangle is outside the guardband, (just) its X coords get zeroed, and it'll later get rejected. + (Vec4S32FromF32(x0) & inGuardBand).Store(tx + outCount); + (Vec4S32FromF32(x1) & inGuardBand).Store(tx + outCount + 4); + (Vec4S32FromF32(x2) & inGuardBand).Store(tx + outCount + 8); Vec4S32FromF32(y0).Store(ty + outCount); Vec4S32FromF32(y1).Store(ty + outCount + 4); Vec4S32FromF32(y2).Store(ty + outCount + 8); @@ -479,9 +490,9 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr if (!cullEnabled) { // If culling is off, store the triangles again, in the opposite order. - Vec4S32FromF32(x0).Store(tx + outCount); - Vec4S32FromF32(x2).Store(tx + outCount + 4); - Vec4S32FromF32(x1).Store(tx + outCount + 8); + (Vec4S32FromF32(x0) & inGuardBand).Store(tx + outCount); + (Vec4S32FromF32(x2) & inGuardBand).Store(tx + outCount + 4); + (Vec4S32FromF32(x1) & inGuardBand).Store(tx + outCount + 8); Vec4S32FromF32(y0).Store(ty + outCount); Vec4S32FromF32(y2).Store(ty + outCount + 4); Vec4S32FromF32(y1).Store(ty + outCount + 8); @@ -536,9 +547,9 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, const int *tx, con } } } - gpuStats.numDepthRasterNoPixels += stats[(int)TriangleResult::NoPixels]; - gpuStats.numDepthRasterTooSmall += stats[(int)TriangleResult::SmallOrBackface]; - gpuStats.numDepthRasterPrims += stats[(int)TriangleResult::OK]; + gpuStats.numDepthRasterNoPixels += stats[(int)TriangleStat::NoPixels]; + gpuStats.numDepthRasterTooSmall += stats[(int)TriangleStat::SmallOrBackface]; + gpuStats.numDepthRasterPrims += stats[(int)TriangleStat::OK]; break; } default: From bcab17fcf38784ec6654cea1cd7e7ad21510a064 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Mon, 30 Dec 2024 17:07:51 +0100 Subject: [PATCH 10/15] Parallelize triangle setup. However, some glitches appear... --- GPU/Common/DepthRaster.cpp | 153 ++++++++++++++++++------------------- 1 file changed, 75 insertions(+), 78 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index 768c44a5f4d7..604c1fa37853 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -90,6 +90,9 @@ alignas(16) static const int zero123[4] = {0, 1, 2, 3}; constexpr int stepXSize = 4; constexpr int stepYSize = 1; +constexpr int stepXShift = 2; +constexpr int stepYShift = 0; + enum class TriangleStat { OK, NoPixels, @@ -98,105 +101,104 @@ enum class TriangleStat { constexpr int MIN_TWICE_TRI_AREA = 10; -// Adapted from Intel's depth rasterizer example. -// Started with the scalar version, will SIMD-ify later. -// x1/y1 etc are the scissor rect. +// A mix of ideas from Intel's sample and ryg's rasterizer blog series. template -TriangleStat DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) { - // BEGIN triangle setup. This should be done SIMD, four triangles at a time. +void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) { + // BEGIN triangle setup. This is done using SIMD, four triangles at a time. // 16x16->32 multiplications are doable on SSE2, which should be all we need. // We use 4x1 SIMD tiles for simplicity. 2x2 would be ideal but stores/loads get annoying. // NOTE: Triangles are stored in groups of 4. - float x0 = tx[0]; - float y0 = ty[0]; - float x1 = tx[4]; - float y1 = ty[4]; - float x2 = tx[8]; - float y2 = ty[8]; - - // Load the entire scissor rect into one SIMD register. - // Vec4F32 scissor = Vec4F32::LoadConvertS16(&scissor.x1); + Vec4S32 x0 = Vec4S32::LoadAligned(tx); + Vec4S32 y0 = Vec4S32::LoadAligned(ty); + Vec4S32 x1 = Vec4S32::LoadAligned(tx + 4); + Vec4S32 y1 = Vec4S32::LoadAligned(ty + 4); + Vec4S32 x2 = Vec4S32::LoadAligned(tx + 8); + Vec4S32 y2 = Vec4S32::LoadAligned(ty + 8); - int minX = (int)std::max(std::min(std::min(x0, x1), x2), (float)scissor.x1) & ~3; - int maxX = (int)std::min(std::max(std::max(x0, x1), x2) + 3, (float)scissor.x2) & ~3; - int minY = (int)std::max(std::min(std::min(y0, y1), y2), (float)scissor.y1); - int maxY = (int)std::min(std::max(std::max(y0, y1), y2), (float)scissor.y2); + Vec4S32 minX = x0.Min16(x1).Min16(x2).Max16(Vec4S32::Splat(scissor.x1)).FixupAfterMinMax(); + Vec4S32 maxX = x0.Max16(x1).Max16(x2).Min16(Vec4S32::Splat(scissor.x2)).FixupAfterMinMax(); + Vec4S32 minY = y0.Min16(y1).Min16(y2).Max16(Vec4S32::Splat(scissor.y1)).FixupAfterMinMax(); + Vec4S32 maxY = y0.Max16(y1).Max16(y2).Min16(Vec4S32::Splat(scissor.y2)).FixupAfterMinMax(); - // TODO: Cull really small triangles here - we can increase the threshold a bit probably. - int triArea = (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0); - - float oneOverTriArea = 1.0f / (float)triArea; + Vec4S32 triArea = (x1 - x0).MulAsS16(y2 - y0) - (x2 - x0).MulAsS16(y1 - y0); + // Probably not worth checking triArea here as we already did the approximatly same check previously. // Edge setup - int A12 = y1 - y2; - int B12 = x2 - x1; - int C12 = x1 * y2 - y1 * x2; + Vec4S32 A12 = y1 - y2; + Vec4S32 B12 = x2 - x1; + Vec4S32 C12 = x1.MulAsS16(y2) - y1.MulAsS16(x2); // Edge setup - int A20 = y2 - y0; - int B20 = x0 - x2; - int C20 = x2 * y0 - y2 * x0; + Vec4S32 A20 = y2 - y0; + Vec4S32 B20 = x0 - x2; + Vec4S32 C20 = x2.MulAsS16(y0) - y2.MulAsS16(x0); // Edge setup - int A01 = y0 - y1; - int B01 = x1 - x0; - int C01 = x0 * y1 - y0 * x1; + Vec4S32 A01 = y0 - y1; + Vec4S32 B01 = x1 - x0; + Vec4S32 C01 = x0.MulAsS16(y1) - y0.MulAsS16(x1); // Step deltas - int stepX12 = A12 * stepXSize; - int stepY12 = B12 * stepYSize; - int stepX20 = A20 * stepXSize; - int stepY20 = B20 * stepYSize; - int stepX01 = A01 * stepXSize; - int stepY01 = B01 * stepYSize; + Vec4S32 stepX12 = A12 << stepXShift; + Vec4S32 stepY12 = B12 << stepYShift; + Vec4S32 stepX20 = A20 << stepXShift; + Vec4S32 stepY20 = B20 << stepYShift; + Vec4S32 stepX01 = A01 << stepXShift; + Vec4S32 stepY01 = B01 << stepYShift; // Prepare to interpolate Z - float zbase = tz[0]; - float z_20 = (tz[4] - tz[0]) * oneOverTriArea; - float z_01 = (tz[8] - tz[0]) * oneOverTriArea; - float zdx = z_20 * (float)stepX20 + z_01 * (float)stepX01; - float zdy = z_20 * (float)stepY20 + z_01 * (float)stepY01; + Vec4F32 oneOverTriArea = Vec4F32FromS32(triArea).Recip(); + Vec4F32 zbase = Vec4F32::LoadAligned(tz); + Vec4F32 z_20 = (Vec4F32::LoadAligned(tz + 4) - zbase) * oneOverTriArea; + Vec4F32 z_01 = (Vec4F32::LoadAligned(tz + 8) - zbase) * oneOverTriArea; + Vec4F32 zdx = z_20 * Vec4F32FromS32(stepX20) + z_01 * Vec4F32FromS32(stepX01); + Vec4F32 zdy = z_20 * Vec4F32FromS32(stepY20) + z_01 * Vec4F32FromS32(stepY01); // Edge function values at origin // TODO: We could SIMD the second part here. - for (int t = 0; t < 1; t++) { + // Using operator[] on the vectors actually seems to result in pretty good code. + for (int t = 0; t < 4; t++) { // Check for bad triangle. - if (triArea /*[t]*/ <= 0) { - continue; - } - - if (maxX == minX || maxY == minY) { + if (maxX[t] <= minX[t] || maxY[t] <= minY[t]) { // No pixels, or outside screen. // Most of these are now gone in the initial pass. - return TriangleStat::NoPixels; + stats[(int)TriangleStat::NoPixels]++; + continue; } - if (triArea < MIN_TWICE_TRI_AREA) { - return TriangleStat::SmallOrBackface; // Or zero area. + if (triArea[t] < MIN_TWICE_TRI_AREA) { + stats[(int)TriangleStat::SmallOrBackface]++; // Or zero area. + continue; } + const int minXT = minX[t] & ~3; + const int maxXT = maxX[t] & ~3; + + const int minYT = minY[t]; + const int maxYT = maxY[t]; + // Convert per-triangle values to wide registers. - Vec4S32 initialX = Vec4S32::Splat(minX) + Vec4S32::LoadAligned(zero123); - int initialY = minY; - - Vec4S32 w0_row = Vec4S32::Splat(A12) * initialX + Vec4S32::Splat(B12 * initialY + C12); - Vec4S32 w1_row = Vec4S32::Splat(A20) * initialX + Vec4S32::Splat(B20 * initialY + C20); - Vec4S32 w2_row = Vec4S32::Splat(A01) * initialX + Vec4S32::Splat(B01 * initialY + C01); - - Vec4F32 zrow = Vec4F32::Splat(zbase) + Vec4F32FromS32(w1_row) * z_20 + Vec4F32FromS32(w2_row) * z_01; - Vec4F32 zdeltaX = Vec4F32::Splat(zdx); - Vec4F32 zdeltaY = Vec4F32::Splat(zdy); - - Vec4S32 oneStepX12 = Vec4S32::Splat(stepX12); - Vec4S32 oneStepY12 = Vec4S32::Splat(stepY12); - Vec4S32 oneStepX20 = Vec4S32::Splat(stepX20); - Vec4S32 oneStepY20 = Vec4S32::Splat(stepY20); - Vec4S32 oneStepX01 = Vec4S32::Splat(stepX01); - Vec4S32 oneStepY01 = Vec4S32::Splat(stepY01); + Vec4S32 initialX = Vec4S32::Splat(minXT) + Vec4S32::LoadAligned(zero123); + int initialY = minY[t]; + + Vec4S32 w0_row = Vec4S32::Splat(A12[t]).MulAsS16(initialX) + Vec4S32::Splat(B12[t] * initialY + C12[t]); + Vec4S32 w1_row = Vec4S32::Splat(A20[t]).MulAsS16(initialX) + Vec4S32::Splat(B20[t] * initialY + C20[t]); + Vec4S32 w2_row = Vec4S32::Splat(A01[t]).MulAsS16(initialX) + Vec4S32::Splat(B01[t] * initialY + C01[t]); + + Vec4F32 zrow = Vec4F32::Splat(zbase[t]) + Vec4F32FromS32(w1_row) * z_20[t] + Vec4F32FromS32(w2_row) * z_01[t]; + Vec4F32 zdeltaX = Vec4F32::Splat(zdx[t]); + Vec4F32 zdeltaY = Vec4F32::Splat(zdy[t]); + + Vec4S32 oneStepX12 = Vec4S32::Splat(stepX12[t]); + Vec4S32 oneStepY12 = Vec4S32::Splat(stepY12[t]); + Vec4S32 oneStepX20 = Vec4S32::Splat(stepX20[t]); + Vec4S32 oneStepY20 = Vec4S32::Splat(stepY20[t]); + Vec4S32 oneStepX01 = Vec4S32::Splat(stepX01[t]); + Vec4S32 oneStepY01 = Vec4S32::Splat(stepY01[t]); // Rasterize - for (int y = minY; y <= maxY; y += stepYSize, w0_row += oneStepY12, w1_row += oneStepY20, w2_row += oneStepY01, zrow += zdeltaY) { + for (int y = minYT; y <= maxYT; y += stepYSize, w0_row += oneStepY12, w1_row += oneStepY20, w2_row += oneStepY01, zrow += zdeltaY) { // Barycentric coordinates at start of row Vec4S32 w0 = w0_row; Vec4S32 w1 = w1_row; @@ -205,10 +207,12 @@ TriangleStat DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor sc uint16_t *rowPtr = depthBuf + stride * y; - for (int x = minX; x <= maxX; x += stepXSize, w0 += oneStepX12, w1 += oneStepX20, w2 += oneStepX01, zs += zdeltaX) { + for (int x = minXT; x <= maxXT; x += stepXSize, w0 += oneStepX12, w1 += oneStepX20, w2 += oneStepX01, zs += zdeltaX) { // If p is on or inside all edges for any pixels, // render those pixels. Vec4S32 signCalc = w0 | w1 | w2; + + // TODO: Check if this check is profitable. Maybe only for big triangles? if (!AnyZeroSignBit(signCalc)) { continue; } @@ -238,15 +242,8 @@ TriangleStat DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor sc } } } - } - return TriangleStat::OK; -} -template -inline void DepthRaster4Triangles(int stats[4], uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) { - for (int i = 0; i < 4; i++) { - TriangleStat result = DepthRasterTriangle(depthBuf, stride, scissor, tx + i, ty + i, tz + i); - stats[(int)result]++; + stats[(int)TriangleStat::OK]++; } } From f5cc41caabb753114c0b9015455fb923c339b9d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Mon, 30 Dec 2024 17:21:34 +0100 Subject: [PATCH 11/15] More CrossSIMD (breaking change) --- Common/Math/CrossSIMD.h | 61 ++++++++++++++++++++++++++++++--------- Common/Math/SIMDHeaders.h | 34 +++++++++++----------- 2 files changed, 65 insertions(+), 30 deletions(-) diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index 4daddea68125..b412574192fb 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -131,7 +131,7 @@ struct Vec4S32 { // On SSE2, much faster than _mm_mullo_epi32_SSE2. // On NEON though, it'll read the full 32 bits, so beware. // See https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/. - Vec4S32 MulAsS16(Vec4S32 other) const { + Vec4S32 Mul16(Vec4S32 other) const { // Note that we only need to mask one of the inputs, so we get zeroes - multiplying // by zero is zero, so it doesn't matter what the upper halfword of each 32-bit word is // in the other register. @@ -153,7 +153,11 @@ struct Vec4S32 { void operator +=(Vec4S32 other) { v = _mm_add_epi32(v, other.v); } void operator -=(Vec4S32 other) { v = _mm_sub_epi32(v, other.v); } - Vec4S32 operator <<(int imm) const { return Vec4S32{ _mm_slli_epi32(v, imm) }; } + Vec4S32 AndNot(Vec4S32 inverted) const { return Vec4S32{ _mm_andnot_si128(inverted.v, v) }; } // NOTE: with _mm_andnot, the first parameter is inverted, and then and is performed. + Vec4S32 Mul(Vec4S32 other) const { return *this * other; } + + template + Vec4S32 Shl() const { return Vec4S32{ _mm_slli_epi32(v, imm) }; } // NOTE: May be slow. int operator[](size_t index) const { return ((int *)&v)[index]; } @@ -221,6 +225,7 @@ struct Vec4F32 { void operator -=(Vec4F32 other) { v = _mm_sub_ps(v, other.v); } void operator *=(Vec4F32 other) { v = _mm_mul_ps(v, other.v); } void operator /=(Vec4F32 other) { v = _mm_div_ps(v, other.v); } + void operator &=(Vec4S32 other) { v = _mm_and_ps(v, _mm_castsi128_ps(other.v)); } Vec4F32 operator *(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; } // NOTE: May be slow. float operator[](size_t index) const { return ((float *)&v)[index]; } @@ -262,6 +267,19 @@ struct Vec4F32 { static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) { _MM_TRANSPOSE4_PS(col0.v, col1.v, col2.v, col3.v); } + + // This is here because ARM64 can do this very efficiently. + static void LoadTranspose(const float *src, Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) { + col0.v = _mm_loadu_ps(src); + col1.v = _mm_loadu_ps(src + 4); + col2.v = _mm_loadu_ps(src + 8); + col3.v = _mm_loadu_ps(src + 12); + _MM_TRANSPOSE4_PS(col0.v, col1.v, col2.v, col3.v); + } + + Vec4S32 CompareEq(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmpeq_ps(v, other.v)) }; } + Vec4S32 CompareLt(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmplt_ps(v, other.v)) }; } + Vec4S32 CompareGt(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmpgt_ps(v, other.v)) }; } }; inline Vec4S32 Vec4S32FromF32(Vec4F32 f) { return Vec4S32{ _mm_cvttps_epi32(f.v) }; } @@ -310,6 +328,12 @@ struct Vec4U16 { Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ _mm_max_epu16_SSE2(v, other.v) }; } Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ _mm_min_epu16_SSE2(v, other.v) }; } Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ _mm_cmplt_epu16(v, other.v) }; } + + inline Vec4U16 AndNot(Vec4U16 inverted) { + return Vec4U16{ + _mm_andnot_si128(inverted.v, v) // NOTE: with _mm_andnot, the first parameter is inverted, and then and is performed. + }; + } }; struct Vec8U16 { @@ -329,12 +353,6 @@ inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) { }; } -inline Vec4U16 AndNot(Vec4U16 a, Vec4U16 inverted) { - return Vec4U16{ - _mm_andnot_si128(inverted.v, a.v) // NOTE: with andnot, the first parameter is inverted, and then and is performed. - }; -} - #elif PPSSPP_ARCH(ARM_NEON) struct Mat4F32 { @@ -445,7 +463,7 @@ struct Vec4S32 { void StoreAligned(int *dst) { vst1q_s32(dst, v); } // Warning: Unlike on x86, this is a full 32-bit multiplication. - Vec4S32 MulAsS16(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; } + Vec4S32 Mul16(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; } Vec4S32 SignExtend16() const { return Vec4S32{ vshrq_n_s32(vshlq_n_s32(v, 16), 16) }; } // NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output (on SSE2 at least). @@ -462,6 +480,11 @@ struct Vec4S32 { Vec4S32 operator |(Vec4S32 other) const { return Vec4S32{ vorrq_s32(v, other.v) }; } Vec4S32 operator &(Vec4S32 other) const { return Vec4S32{ vandq_s32(v, other.v) }; } Vec4S32 operator ^(Vec4S32 other) const { return Vec4S32{ veorq_s32(v, other.v) }; } + Vec4S32 AndNot(Vec4S32 inverted) const { return Vec4S32{ vandq_s32(v, vmvnq_s32(inverted.v))}; } + Vec4S32 Mul(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; } + + template + Vec4S32 Shl() const { return Vec4S32{ vshlq_n_s32(v, imm) }; } void operator +=(Vec4S32 other) { v = vaddq_s32(v, other.v); } void operator -=(Vec4S32 other) { v = vsubq_s32(v, other.v); } @@ -522,6 +545,7 @@ struct Vec4F32 { void operator -=(Vec4F32 other) { v = vsubq_f32(v, other.v); } void operator *=(Vec4F32 other) { v = vmulq_f32(v, other.v); } void operator /=(Vec4F32 other) { v = vmulq_f32(v, other.Recip().v); } + void operator &=(Vec4S32 other) { v = vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(v), other.v)); } Vec4F32 operator *(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; } Vec4F32 Mul(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; } @@ -556,6 +580,10 @@ struct Vec4F32 { return Vec4F32{ vsetq_lane_f32(1.0f, v, 3) }; } + Vec4S32 CompareEq(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vceqq_f32(v, other.v)) }; } + Vec4S32 CompareLt(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcltq_f32(v, other.v)) }; } + Vec4S32 CompareGt(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcgtq_f32(v, other.v)) }; } + // One of many possible solutions. Sometimes we could also use vld4q_f32 probably.. static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) { #if PPSSPP_ARCH(ARM64_NEON) @@ -578,6 +606,15 @@ struct Vec4F32 { #endif } + static void LoadTranspose(const float *src, Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) { + // The optimizer hopefully gets rid of the copies below. + float32x4x4_t r = vld4q_f32(src); + col0.v = r.val[0]; + col1.v = r.val[1]; + col2.v = r.val[2]; + col3.v = r.val[3]; + } + inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) { #if PPSSPP_ARCH(ARM64_NEON) float32x4_t sum = vaddq_f32( @@ -649,6 +686,8 @@ struct Vec4U16 { Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ vmax_u16(v, other.v) }; } Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ vmin_u16(v, other.v) }; } Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ vclt_u16(v, other.v) }; } + + Vec4U16 AndNot(Vec4U16 inverted) { return Vec4U16{ vand_u16(v, vmvn_u16(inverted.v)) }; } }; inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) { @@ -657,10 +696,6 @@ inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) { return Vec4U16{ result }; } -inline Vec4U16 AndNot(Vec4U16 a, Vec4U16 inverted) { - return Vec4U16{ vand_u16(a.v, vmvn_u16(inverted.v)) }; -} - struct Vec8U16 { uint16x8_t v; diff --git a/Common/Math/SIMDHeaders.h b/Common/Math/SIMDHeaders.h index cb63b89eefac..82b18e558c2d 100644 --- a/Common/Math/SIMDHeaders.h +++ b/Common/Math/SIMDHeaders.h @@ -88,29 +88,29 @@ static inline uint32x4_t vcgezq_f32(float32x4_t v) { // May later figure out how to use the appropriate ones depending on compile flags. inline __m128i _mm_mullo_epi32_SSE2(const __m128i v0, const __m128i v1) { - __m128i a13 = _mm_shuffle_epi32(v0, 0xF5); // (-,a3,-,a1) - __m128i b13 = _mm_shuffle_epi32(v1, 0xF5); // (-,b3,-,b1) - __m128i prod02 = _mm_mul_epu32(v0, v1); // (-,a2*b2,-,a0*b0) - __m128i prod13 = _mm_mul_epu32(a13, b13); // (-,a3*b3,-,a1*b1) - __m128i prod01 = _mm_unpacklo_epi32(prod02, prod13); // (-,-,a1*b1,a0*b0) - __m128i prod23 = _mm_unpackhi_epi32(prod02, prod13); // (-,-,a3*b3,a2*b2) - return _mm_unpacklo_epi64(prod01, prod23); + __m128i a13 = _mm_shuffle_epi32(v0, 0xF5); // (-,a3,-,a1) + __m128i b13 = _mm_shuffle_epi32(v1, 0xF5); // (-,b3,-,b1) + __m128i prod02 = _mm_mul_epu32(v0, v1); // (-,a2*b2,-,a0*b0) + __m128i prod13 = _mm_mul_epu32(a13, b13); // (-,a3*b3,-,a1*b1) + __m128i prod01 = _mm_unpacklo_epi32(prod02, prod13); // (-,-,a1*b1,a0*b0) + __m128i prod23 = _mm_unpackhi_epi32(prod02, prod13); // (-,-,a3*b3,a2*b2) + return _mm_unpacklo_epi64(prod01, prod23); } inline __m128i _mm_max_epu16_SSE2(const __m128i v0, const __m128i v1) { - return _mm_xor_si128( - _mm_max_epi16( - _mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)), - _mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))), - _mm_set1_epi16((int16_t)0x8000)); + return _mm_xor_si128( + _mm_max_epi16( + _mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)), + _mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))), + _mm_set1_epi16((int16_t)0x8000)); } inline __m128i _mm_min_epu16_SSE2(const __m128i v0, const __m128i v1) { - return _mm_xor_si128( - _mm_min_epi16( - _mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)), - _mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))), - _mm_set1_epi16((int16_t)0x8000)); + return _mm_xor_si128( + _mm_min_epi16( + _mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)), + _mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))), + _mm_set1_epi16((int16_t)0x8000)); } // SSE2 replacement for half of a _mm_packus_epi32 but without the saturation. From e0991a70707e350397a70b1a7acb2b410bdb8c6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Tue, 31 Dec 2024 01:20:45 +0100 Subject: [PATCH 12/15] DepthRaster: Improved guardband rejection, fixing glitches. --- GPU/Common/DepthRaster.cpp | 119 ++++++++++++++++++++++++------------- 1 file changed, 78 insertions(+), 41 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index 604c1fa37853..f75f9cd8c57a 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -117,36 +117,37 @@ void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthSc Vec4S32 x2 = Vec4S32::LoadAligned(tx + 8); Vec4S32 y2 = Vec4S32::LoadAligned(ty + 8); + // FixupAfterMinMax is just 16->32 sign extension, in case the current platform (like SSE2) just has 16-bit min/max operations. Vec4S32 minX = x0.Min16(x1).Min16(x2).Max16(Vec4S32::Splat(scissor.x1)).FixupAfterMinMax(); Vec4S32 maxX = x0.Max16(x1).Max16(x2).Min16(Vec4S32::Splat(scissor.x2)).FixupAfterMinMax(); Vec4S32 minY = y0.Min16(y1).Min16(y2).Max16(Vec4S32::Splat(scissor.y1)).FixupAfterMinMax(); Vec4S32 maxY = y0.Max16(y1).Max16(y2).Min16(Vec4S32::Splat(scissor.y2)).FixupAfterMinMax(); - Vec4S32 triArea = (x1 - x0).MulAsS16(y2 - y0) - (x2 - x0).MulAsS16(y1 - y0); + Vec4S32 triArea = (x1 - x0).Mul16(y2 - y0) - (x2 - x0).Mul16(y1 - y0); // Probably not worth checking triArea here as we already did the approximatly same check previously. // Edge setup Vec4S32 A12 = y1 - y2; Vec4S32 B12 = x2 - x1; - Vec4S32 C12 = x1.MulAsS16(y2) - y1.MulAsS16(x2); + Vec4S32 C12 = x1.Mul16(y2) - y1.Mul16(x2); // Edge setup Vec4S32 A20 = y2 - y0; Vec4S32 B20 = x0 - x2; - Vec4S32 C20 = x2.MulAsS16(y0) - y2.MulAsS16(x0); + Vec4S32 C20 = x2.Mul16(y0) - y2.Mul16(x0); // Edge setup Vec4S32 A01 = y0 - y1; Vec4S32 B01 = x1 - x0; - Vec4S32 C01 = x0.MulAsS16(y1) - y0.MulAsS16(x1); + Vec4S32 C01 = x0.Mul16(y1) - y0.Mul16(x1); // Step deltas - Vec4S32 stepX12 = A12 << stepXShift; - Vec4S32 stepY12 = B12 << stepYShift; - Vec4S32 stepX20 = A20 << stepXShift; - Vec4S32 stepY20 = B20 << stepYShift; - Vec4S32 stepX01 = A01 << stepXShift; - Vec4S32 stepY01 = B01 << stepYShift; + Vec4S32 stepX12 = A12.Shl(); + Vec4S32 stepY12 = B12.Shl(); + Vec4S32 stepX20 = A20.Shl(); + Vec4S32 stepY20 = B20.Shl(); + Vec4S32 stepX01 = A01.Shl(); + Vec4S32 stepY01 = B01.Shl(); // Prepare to interpolate Z Vec4F32 oneOverTriArea = Vec4F32FromS32(triArea).Recip(); @@ -163,7 +164,8 @@ void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthSc // Check for bad triangle. if (maxX[t] <= minX[t] || maxY[t] <= minY[t]) { // No pixels, or outside screen. - // Most of these are now gone in the initial pass. + // Most of these are now gone in the initial pass, but not all since we cull + // in 4-groups there. stats[(int)TriangleStat::NoPixels]++; continue; } @@ -182,10 +184,16 @@ void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthSc // Convert per-triangle values to wide registers. Vec4S32 initialX = Vec4S32::Splat(minXT) + Vec4S32::LoadAligned(zero123); int initialY = minY[t]; + _dbg_assert_(A12[t] < 32767); + _dbg_assert_(A12[t] > -32767); + _dbg_assert_(A20[t] < 32767); + _dbg_assert_(A20[t] > -32767); + _dbg_assert_(A01[t] < 32767); + _dbg_assert_(A01[t] > -32767); - Vec4S32 w0_row = Vec4S32::Splat(A12[t]).MulAsS16(initialX) + Vec4S32::Splat(B12[t] * initialY + C12[t]); - Vec4S32 w1_row = Vec4S32::Splat(A20[t]).MulAsS16(initialX) + Vec4S32::Splat(B20[t] * initialY + C20[t]); - Vec4S32 w2_row = Vec4S32::Splat(A01[t]).MulAsS16(initialX) + Vec4S32::Splat(B01[t] * initialY + C01[t]); + Vec4S32 w0_row = Vec4S32::Splat(A12[t]).Mul16(initialX) + Vec4S32::Splat(B12[t] * initialY + C12[t]); + Vec4S32 w1_row = Vec4S32::Splat(A20[t]).Mul16(initialX) + Vec4S32::Splat(B20[t] * initialY + C20[t]); + Vec4S32 w2_row = Vec4S32::Splat(A01[t]).Mul16(initialX) + Vec4S32::Splat(B01[t] * initialY + C01[t]); Vec4F32 zrow = Vec4F32::Splat(zbase[t]) + Vec4F32FromS32(w1_row) * z_20[t] + Vec4F32FromS32(w2_row) * z_01[t]; Vec4F32 zdeltaX = Vec4F32::Splat(zdx[t]); @@ -229,7 +237,7 @@ void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthSc // To implement the greater/greater-than comparison, we can combine mask and max. // Unfortunately there's no unsigned max on SSE2, it's synthesized by xoring 0x8000 on input and output. // We use AndNot to zero out Z results, before doing Max with the buffer. - AndNot(shortZ, shortMaskInv).Max(bufferValues).Store(rowPtr + x); + shortZ.AndNot(shortMaskInv).Max(bufferValues).Store(rowPtr + x); break; case ZCompareMode::Less: // UNTESTED // This time, we OR the mask and use .Min. @@ -237,7 +245,7 @@ void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthSc break; case ZCompareMode::Always: // UNTESTED // This could be replaced with a vblend operation. - ((bufferValues & shortMaskInv) | AndNot(shortZ, shortMaskInv)).Store(rowPtr + x); + ((bufferValues & shortMaskInv) | shortZ.AndNot(shortMaskInv)).Store(rowPtr + x); break; } } @@ -362,7 +370,7 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr } const bool cullEnabled = draw.cullEnabled; - static const float zerovec[4] = {}; + static const float zerovec[4] = {0.0f, 0.0f, 0.0f, 1.0f}; int collected = 0; int planeCulled = 0; @@ -371,22 +379,35 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr const int count = draw.vertexCount; // Not exactly the same guardband as on the real PSP, but good enough to prevent 16-bit overflow in raster. - // This is slightly off-center since we are already in screen space, but whatever. We compensate a little for it in the bottom right. - Vec4S32 guardBandTopLeft = Vec4S32::Splat(-2048); - Vec4S32 guardBandBottomRight = Vec4S32::Splat(2348); + // This is slightly off-center since we are already in screen space, but whatever. + Vec4S32 guardBandTopLeft = Vec4S32::Splat(-4096); + Vec4S32 guardBandBottomRight = Vec4S32::Splat(4096); Vec4F32 scissorX1 = Vec4F32::Splat((float)scissor.x1); Vec4F32 scissorY1 = Vec4F32::Splat((float)scissor.y1); Vec4F32 scissorX2 = Vec4F32::Splat((float)scissor.x2); Vec4F32 scissorY2 = Vec4F32::Splat((float)scissor.y2); + // Add cheap pre-projection pre-checks for bad triangle here. Not much we can do safely other than checking W. + auto validVert = [](const float *v) -> bool { + if (v[3] <= 0.0f /* || v[2] <= 0.0f */) { + return false; + } + /* + if (v[2] >= 65535.0f * v[3]) { + return false; + }*/ + return true; + }; + for (int i = 0; i < count; i += 3) { // Collect valid triangles into buffer. const float *v0 = transformed + indexBuffer[i] * 4; const float *v1 = transformed + indexBuffer[i + (1 ^ flipCull)] * 4; const float *v2 = transformed + indexBuffer[i + (2 ^ flipCull)] * 4; - // Don't collect triangle if any vertex is behind the 0 plane. - if (v0[3] > 0.0f && v1[3] > 0.0f && v2[3] > 0.0f) { + // Don't collect triangle if any vertex is beyond the planes. + // TODO: Optimize this somehow. + if (validVert(v0) && validVert(v1) && validVert(v2)) { verts[collected] = v0; verts[collected + 1] = v1; verts[collected + 2] = v2; @@ -404,6 +425,7 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr } if (collected != 12) { + // Fetch more! continue; } @@ -435,47 +457,53 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr Vec4F32 recipW2 = w2.Recip(); x0 *= recipW0; y0 *= recipW0; - z0 = (z0 * recipW0).Clamp(0.0f, 65535.0f); + z0 *= recipW0; x1 *= recipW1; y1 *= recipW1; - z1 = (z1 * recipW1).Clamp(0.0f, 65535.0f); + z1 *= recipW1; x2 *= recipW2; y2 *= recipW2; - z2 = (z2 * recipW2).Clamp(0.0f, 65535.0f); + z2 *= recipW2; - // Check bounding box size (clamped to screen edges). Cast to integer for crude rounding (and to match the rasterizer). - Vec4S32 minX = Vec4S32FromF32(x0.Min(x1.Min(x2)).Max(scissorX1)); - Vec4S32 minY = Vec4S32FromF32(y0.Min(y1.Min(y2)).Max(scissorY1)); - Vec4S32 maxX = Vec4S32FromF32(x0.Max(x1.Max(x2)).Min(scissorX2)); - Vec4S32 maxY = Vec4S32FromF32(y0.Max(y1.Max(y2)).Min(scissorY2)); + // Check bounding box size. Cast to integer for crude rounding (and to approximately match the rasterizer). + Vec4S32 minX = Vec4S32FromF32(x0.Min(x1.Min(x2))); + Vec4S32 minY = Vec4S32FromF32(y0.Min(y1.Min(y2))); + Vec4S32 maxX = Vec4S32FromF32(x0.Max(x1.Max(x2))); + Vec4S32 maxY = Vec4S32FromF32(y0.Max(y1.Max(y2))); - // If all are equal in any dimension, all four triangles are tiny nonsense (or outside the scissor) and can be skipped early. + // If all are equal in any dimension, all four triangles are tiny nonsense and can be skipped early. Vec4S32 eqMask = minX.CompareEq(maxX) | minY.CompareEq(maxY); - // Otherwise we just proceed to triangle setup with all four for now. Later might want to - // compact the remaining triangles... Or do more checking here. + + // Otherwise we just proceed to triangle setup with all four for now. // We could also save the computed boxes for later.. + // TODO: Merge into below checks? Though nice with an early out. if (!AnyZeroSignBit(eqMask)) { boxCulled += 4; continue; } - // Create a mask to kill coordinates of triangles that poke outside the guardband. + // Create a mask to kill coordinates of triangles that poke outside the guardband (or are just empty). Vec4S32 inGuardBand = - (minX.CompareGt(guardBandTopLeft) & maxX.CompareLt(guardBandBottomRight)) & - (minY.CompareGt(guardBandTopLeft) & maxY.CompareLt(guardBandBottomRight)); + ((minX.CompareGt(guardBandTopLeft) & maxX.CompareLt(guardBandBottomRight)) & + (minY.CompareGt(guardBandTopLeft) & maxY.CompareLt(guardBandBottomRight))).AndNot(eqMask); + + // It's enough to smash one coordinate to make future checks (like the tri area check) fail. + x0 &= inGuardBand; + x1 &= inGuardBand; + x2 &= inGuardBand; // Floating point double triangle area. Can't be reused for the integer-snapped raster reliably (though may work...) // Still good for culling early and pretty cheap to compute. - Vec4F32 doubleTriArea = (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0) - Vec4F32::Splat((float)MIN_TWICE_TRI_AREA); + Vec4F32 doubleTriArea = (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0) - Vec4F32::Splat((float)(MIN_TWICE_TRI_AREA + 2)); if (!AnyZeroSignBit(doubleTriArea)) { gpuStats.numDepthRasterEarlySize += 4; continue; } // Note: If any triangle is outside the guardband, (just) its X coords get zeroed, and it'll later get rejected. - (Vec4S32FromF32(x0) & inGuardBand).Store(tx + outCount); - (Vec4S32FromF32(x1) & inGuardBand).Store(tx + outCount + 4); - (Vec4S32FromF32(x2) & inGuardBand).Store(tx + outCount + 8); + Vec4S32FromF32(x0).Store(tx + outCount); + Vec4S32FromF32(x1).Store(tx + outCount + 4); + Vec4S32FromF32(x2).Store(tx + outCount + 8); Vec4S32FromF32(y0).Store(ty + outCount); Vec4S32FromF32(y1).Store(ty + outCount + 4); Vec4S32FromF32(y2).Store(ty + outCount + 8); @@ -483,10 +511,19 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr z1.Store(tz + outCount + 4); z2.Store(tz + outCount + 8); +#ifdef _DEBUG + for (int i = 0; i < 12; i++) { + _dbg_assert_(tx[outCount + i] < 32767); + _dbg_assert_(tx[outCount + i] >= -32768); + _dbg_assert_(tx[outCount + i] < 32767); + _dbg_assert_(tx[outCount + i] >= -32768); + } +#endif + outCount += 12; if (!cullEnabled) { - // If culling is off, store the triangles again, in the opposite order. + // If culling is off, store the triangles again, with the first two vertices swapped. (Vec4S32FromF32(x0) & inGuardBand).Store(tx + outCount); (Vec4S32FromF32(x2) & inGuardBand).Store(tx + outCount + 4); (Vec4S32FromF32(x1) & inGuardBand).Store(tx + outCount + 8); From 7ddd7024f4047daf47eb015f18ddebb70d326e43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Tue, 31 Dec 2024 02:17:08 +0100 Subject: [PATCH 13/15] Revert unintentional change. Warning fix --- GPU/Common/DepthRaster.cpp | 2 +- unittest/UnitTest.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index f75f9cd8c57a..775d4108a358 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -494,7 +494,7 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr // Floating point double triangle area. Can't be reused for the integer-snapped raster reliably (though may work...) // Still good for culling early and pretty cheap to compute. - Vec4F32 doubleTriArea = (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0) - Vec4F32::Splat((float)(MIN_TWICE_TRI_AREA + 2)); + Vec4F32 doubleTriArea = (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0) - Vec4F32::Splat((float)(MIN_TWICE_TRI_AREA)); if (!AnyZeroSignBit(doubleTriArea)) { gpuStats.numDepthRasterEarlySize += 4; continue; diff --git a/unittest/UnitTest.cpp b/unittest/UnitTest.cpp index fb45e9c6f076..475b785abd1f 100644 --- a/unittest/UnitTest.cpp +++ b/unittest/UnitTest.cpp @@ -1049,7 +1049,7 @@ CharQueue GetQueue() { bool TestCharQueue() { // We use a tiny block size for testing. - CharQueue queue = std::move(GetQueue()); + CharQueue queue = GetQueue(); // Add 16 chars. queue.push_back("abcdefghijkl"); From dee5fe69906930fd9278269837e641323eb31912 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Tue, 31 Dec 2024 02:30:05 +0100 Subject: [PATCH 14/15] Fix issue in Midnight Club where Z now wrapped around at a distance, after removing the clamp. Might as well cull. --- Common/Math/CrossSIMD.h | 5 +++++ GPU/Common/DepthRaster.cpp | 13 ++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index b412574192fb..4393635b05b0 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -152,6 +152,9 @@ struct Vec4S32 { // TODO: andnot void operator +=(Vec4S32 other) { v = _mm_add_epi32(v, other.v); } void operator -=(Vec4S32 other) { v = _mm_sub_epi32(v, other.v); } + void operator &=(Vec4S32 other) { v = _mm_and_si128(v, other.v); } + void operator |=(Vec4S32 other) { v = _mm_or_si128(v, other.v); } + void operator ^=(Vec4S32 other) { v = _mm_xor_si128(v, other.v); } Vec4S32 AndNot(Vec4S32 inverted) const { return Vec4S32{ _mm_andnot_si128(inverted.v, v) }; } // NOTE: with _mm_andnot, the first parameter is inverted, and then and is performed. Vec4S32 Mul(Vec4S32 other) const { return *this * other; } @@ -583,6 +586,8 @@ struct Vec4F32 { Vec4S32 CompareEq(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vceqq_f32(v, other.v)) }; } Vec4S32 CompareLt(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcltq_f32(v, other.v)) }; } Vec4S32 CompareGt(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcgtq_f32(v, other.v)) }; } + Vec4S32 CompareLe(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcleq_f32(v, other.v)) }; } + Vec4S32 CompareGe(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcgeq_f32(v, other.v)) }; } // One of many possible solutions. Sometimes we could also use vld4q_f32 probably.. static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) { diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index 775d4108a358..9ccb5c9b87a0 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -383,14 +383,14 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr Vec4S32 guardBandTopLeft = Vec4S32::Splat(-4096); Vec4S32 guardBandBottomRight = Vec4S32::Splat(4096); - Vec4F32 scissorX1 = Vec4F32::Splat((float)scissor.x1); - Vec4F32 scissorY1 = Vec4F32::Splat((float)scissor.y1); - Vec4F32 scissorX2 = Vec4F32::Splat((float)scissor.x2); - Vec4F32 scissorY2 = Vec4F32::Splat((float)scissor.y2); + Vec4S32 scissorX1 = Vec4S32::Splat((float)scissor.x1); + Vec4S32 scissorY1 = Vec4S32::Splat((float)scissor.y1); + Vec4S32 scissorX2 = Vec4S32::Splat((float)scissor.x2); + Vec4S32 scissorY2 = Vec4S32::Splat((float)scissor.y2); // Add cheap pre-projection pre-checks for bad triangle here. Not much we can do safely other than checking W. auto validVert = [](const float *v) -> bool { - if (v[3] <= 0.0f /* || v[2] <= 0.0f */) { + if (v[3] <= 0.0f || v[2] <= 0.0f) { return false; } /* @@ -487,6 +487,9 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr ((minX.CompareGt(guardBandTopLeft) & maxX.CompareLt(guardBandBottomRight)) & (minY.CompareGt(guardBandTopLeft) & maxY.CompareLt(guardBandBottomRight))).AndNot(eqMask); + // Create another mask to kill off-screen triangles. Not perfectly accurate. + inGuardBand &= (maxX.CompareGt(scissorX1) & minX.CompareLt(scissorX2)) & (maxY.CompareGt(scissorY1) & minY.CompareLt(scissorY2)); + // It's enough to smash one coordinate to make future checks (like the tri area check) fail. x0 &= inGuardBand; x1 &= inGuardBand; From f85d7db5b1588c46845e89d762feb90f184bdd63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Tue, 31 Dec 2024 02:34:16 +0100 Subject: [PATCH 15/15] Comment fixes, buildfix --- Common/Math/CrossSIMD.h | 1 + GPU/Common/DepthRaster.cpp | 15 ++++++--------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index 4393635b05b0..556d9e3b1aee 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -485,6 +485,7 @@ struct Vec4S32 { Vec4S32 operator ^(Vec4S32 other) const { return Vec4S32{ veorq_s32(v, other.v) }; } Vec4S32 AndNot(Vec4S32 inverted) const { return Vec4S32{ vandq_s32(v, vmvnq_s32(inverted.v))}; } Vec4S32 Mul(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; } + void operator &=(Vec4S32 other) { v = vandq_s32(v, other.v); } template Vec4S32 Shl() const { return Vec4S32{ vshlq_n_s32(v, imm) }; } diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index 9ccb5c9b87a0..8bf6e2586c5b 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -104,7 +104,7 @@ constexpr int MIN_TWICE_TRI_AREA = 10; // A mix of ideas from Intel's sample and ryg's rasterizer blog series. template void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) { - // BEGIN triangle setup. This is done using SIMD, four triangles at a time. + // Triangle setup. This is done using SIMD, four triangles at a time. // 16x16->32 multiplications are doable on SSE2, which should be all we need. // We use 4x1 SIMD tiles for simplicity. 2x2 would be ideal but stores/loads get annoying. @@ -124,19 +124,16 @@ void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthSc Vec4S32 maxY = y0.Max16(y1).Max16(y2).Min16(Vec4S32::Splat(scissor.y2)).FixupAfterMinMax(); Vec4S32 triArea = (x1 - x0).Mul16(y2 - y0) - (x2 - x0).Mul16(y1 - y0); - // Probably not worth checking triArea here as we already did the approximatly same check previously. // Edge setup Vec4S32 A12 = y1 - y2; Vec4S32 B12 = x2 - x1; Vec4S32 C12 = x1.Mul16(y2) - y1.Mul16(x2); - // Edge setup Vec4S32 A20 = y2 - y0; Vec4S32 B20 = x0 - x2; Vec4S32 C20 = x2.Mul16(y0) - y2.Mul16(x0); - // Edge setup Vec4S32 A01 = y0 - y1; Vec4S32 B01 = x1 - x0; Vec4S32 C01 = x0.Mul16(y1) - y0.Mul16(x1); @@ -157,11 +154,10 @@ void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthSc Vec4F32 zdx = z_20 * Vec4F32FromS32(stepX20) + z_01 * Vec4F32FromS32(stepX01); Vec4F32 zdy = z_20 * Vec4F32FromS32(stepY20) + z_01 * Vec4F32FromS32(stepY01); - // Edge function values at origin - // TODO: We could SIMD the second part here. - // Using operator[] on the vectors actually seems to result in pretty good code. + // Shared setup is done, now loop per-triangle in the group of four. for (int t = 0; t < 4; t++) { // Check for bad triangle. + // Using operator[] on the vectors actually seems to result in pretty good code. if (maxX[t] <= minX[t] || maxY[t] <= minY[t]) { // No pixels, or outside screen. // Most of these are now gone in the initial pass, but not all since we cull @@ -181,7 +177,7 @@ void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthSc const int minYT = minY[t]; const int maxYT = maxY[t]; - // Convert per-triangle values to wide registers. + // Convert to wide registers. Vec4S32 initialX = Vec4S32::Splat(minXT) + Vec4S32::LoadAligned(zero123); int initialY = minY[t]; _dbg_assert_(A12[t] < 32767); @@ -191,6 +187,7 @@ void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthSc _dbg_assert_(A01[t] < 32767); _dbg_assert_(A01[t] > -32767); + // TODO: The latter subexpression can be broken out of this loop, but reduces block size flexibility. Vec4S32 w0_row = Vec4S32::Splat(A12[t]).Mul16(initialX) + Vec4S32::Splat(B12[t] * initialY + C12[t]); Vec4S32 w1_row = Vec4S32::Splat(A20[t]).Mul16(initialX) + Vec4S32::Splat(B20[t] * initialY + C20[t]); Vec4S32 w2_row = Vec4S32::Splat(A01[t]).Mul16(initialX) + Vec4S32::Splat(B01[t] * initialY + C01[t]); @@ -239,7 +236,7 @@ void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthSc // We use AndNot to zero out Z results, before doing Max with the buffer. shortZ.AndNot(shortMaskInv).Max(bufferValues).Store(rowPtr + x); break; - case ZCompareMode::Less: // UNTESTED + case ZCompareMode::Less: // This time, we OR the mask and use .Min. (shortZ | shortMaskInv).Min(bufferValues).Store(rowPtr + x); break;