Skip to content

Commit

Permalink
Merge pull request #19777 from hrydgard/improve-triangle-culling
Browse files Browse the repository at this point in the history
Depth raster: Improve triangle culling in the "clipping" step
  • Loading branch information
hrydgard authored Dec 29, 2024
2 parents 0b82405 + ef934df commit 70cddec
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 40 deletions.
52 changes: 25 additions & 27 deletions Common/Math/CrossSIMD.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,33 +59,6 @@ struct Mat4x3F32 {
__m128 data2;
};

// TODO: Check if loading b by 4s and shuffling is cheaper.
// Multiplies a 4x4 matrix read from memory (16 consecutive floats at `a`) by `b`.
// Result column k is b.col0*a[4k] + b.col1*a[4k+1] + b.col2*a[4k+2] + b.col3*a[4k+3].
inline Mat4F32 MulMem4x4By4x4(const float *a, Mat4F32 b) {
	// Weighted sum of b's four columns, using four consecutive scalars as weights.
	// The terms are accumulated in col0, col1, col2, col3 order.
	auto combineColumns = [&b](const float *weights) -> __m128 {
		__m128 acc = _mm_mul_ps(b.col0, _mm_set1_ps(weights[0]));
		acc = _mm_add_ps(acc, _mm_mul_ps(b.col1, _mm_set1_ps(weights[1])));
		acc = _mm_add_ps(acc, _mm_mul_ps(b.col2, _mm_set1_ps(weights[2])));
		return _mm_add_ps(acc, _mm_mul_ps(b.col3, _mm_set1_ps(weights[3])));
	};

	Mat4F32 product;
	product.col0 = combineColumns(a);
	product.col1 = combineColumns(a + 4);
	product.col2 = combineColumns(a + 8);
	product.col3 = combineColumns(a + 12);
	return product;
}

inline Mat4F32 Mul4x4By4x4(Mat4F32 a, Mat4F32 b) {
Mat4F32 result;

Expand Down Expand Up @@ -182,6 +155,10 @@ struct Vec4S32 {

// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
Vec4S32 operator *(Vec4S32 other) const {
	// Lane-wise product, result lanes are (ab3,ab2,ab1,ab0).
	const auto product = _mm_mullo_epi32_SSE2(v, other.v);
	return Vec4S32{ product };
}

// Lane-wise equality test; the result is a per-lane mask (all bits set where equal).
Vec4S32 CompareEq(Vec4S32 other) const {
	const __m128i mask = _mm_cmpeq_epi32(v, other.v);
	return Vec4S32{ mask };
}
// Lane-wise signed "this < other" test; the result is a per-lane mask.
Vec4S32 CompareLt(Vec4S32 other) const {
	const __m128i mask = _mm_cmplt_epi32(v, other.v);
	return Vec4S32{ mask };
}
// Lane-wise signed "this > other" test; the result is a per-lane mask.
Vec4S32 CompareGt(Vec4S32 other) const {
	const __m128i mask = _mm_cmpgt_epi32(v, other.v);
	return Vec4S32{ mask };
}
};

inline bool AnyZeroSignBit(Vec4S32 value) {
Expand Down Expand Up @@ -233,6 +210,8 @@ struct Vec4F32 {
// Lane-wise float addition.
Vec4F32 operator +(Vec4F32 other) const {
	const __m128 sum = _mm_add_ps(v, other.v);
	return Vec4F32{ sum };
}
// Lane-wise float subtraction (this - other).
Vec4F32 operator -(Vec4F32 other) const {
	const __m128 difference = _mm_sub_ps(v, other.v);
	return Vec4F32{ difference };
}
// Lane-wise float multiplication.
Vec4F32 operator *(Vec4F32 other) const {
	const __m128 product = _mm_mul_ps(v, other.v);
	return Vec4F32{ product };
}
// Lane-wise minimum of this and other.
Vec4F32 Min(Vec4F32 other) const {
	const __m128 smaller = _mm_min_ps(v, other.v);
	return Vec4F32{ smaller };
}
// Lane-wise maximum of this and other.
Vec4F32 Max(Vec4F32 other) const {
	const __m128 larger = _mm_max_ps(v, other.v);
	return Vec4F32{ larger };
}
// In-place lane-wise float addition.
void operator +=(Vec4F32 other) {
	const __m128 sum = _mm_add_ps(v, other.v);
	v = sum;
}
// In-place lane-wise float subtraction.
void operator -=(Vec4F32 other) {
	const __m128 difference = _mm_sub_ps(v, other.v);
	v = difference;
}
// In-place lane-wise float multiplication.
void operator *=(Vec4F32 other) {
	const __m128 product = _mm_mul_ps(v, other.v);
	v = product;
}
Expand Down Expand Up @@ -287,6 +266,10 @@ struct Vec4F32 {
// Converts four floats to four 32-bit integers using the truncating conversion (_mm_cvttps_epi32).
inline Vec4S32 Vec4S32FromF32(Vec4F32 f) {
	const __m128i truncated = _mm_cvttps_epi32(f.v);
	return Vec4S32{ truncated };
}
// Converts four 32-bit signed integers to four floats.
inline Vec4F32 Vec4F32FromS32(Vec4S32 f) {
	const __m128 asFloat = _mm_cvtepi32_ps(f.v);
	return Vec4F32{ asFloat };
}

// Returns true if at least one of the four lanes has its sign bit clear.
inline bool AnyZeroSignBit(Vec4F32 value) {
	// movemask gathers the four lane sign bits into the low nibble; 0xF means all are set.
	const int signBits = _mm_movemask_ps(value.v);
	return signBits != 0xF;
}

// Make sure the W component of scale is 1.0f.
inline void ScaleInplace(Mat4F32 &m, Vec4F32 scale) {
m.col0 = _mm_mul_ps(m.col0, scale.v);
Expand Down Expand Up @@ -480,6 +463,11 @@ struct Vec4S32 {

// In-place lane-wise 32-bit integer addition.
void operator +=(Vec4S32 other) {
	const int32x4_t sum = vaddq_s32(v, other.v);
	v = sum;
}
// In-place lane-wise 32-bit integer subtraction.
void operator -=(Vec4S32 other) {
	const int32x4_t difference = vsubq_s32(v, other.v);
	v = difference;
}

// Lane-wise equality test; the result is a per-lane mask (all bits set where equal).
Vec4S32 CompareEq(Vec4S32 other) const {
	// vceqq_s32 returns uint32x4_t. Reinterpret the bits as the signed vector type held in v,
	// so this also builds on compilers that reject implicit vector conversions.
	return Vec4S32{ vreinterpretq_s32_u32(vceqq_s32(v, other.v)) };
}
// Lane-wise signed "this < other" test; the result is a per-lane mask.
Vec4S32 CompareLt(Vec4S32 other) const {
	// vcltq_s32 returns uint32x4_t. Reinterpret the bits as the signed vector type held in v,
	// so this also builds on compilers that reject implicit vector conversions.
	return Vec4S32{ vreinterpretq_s32_u32(vcltq_s32(v, other.v)) };
}
// Lane-wise signed "this > other" test; the result is a per-lane mask.
Vec4S32 CompareGt(Vec4S32 other) const {
	// vcgtq_s32 returns uint32x4_t. Reinterpret the bits as the signed vector type held in v,
	// so this also builds on compilers that reject implicit vector conversions.
	return Vec4S32{ vreinterpretq_s32_u32(vcgtq_s32(v, other.v)) };
}
// Lane-wise "this > 0" test; the result is a per-lane mask.
Vec4S32 CompareGtZero() const {
	// vcgtq_s32 returns uint32x4_t. Reinterpret the bits as the signed vector type held in v,
	// so this also builds on compilers that reject implicit vector conversions.
	return Vec4S32{ vreinterpretq_s32_u32(vcgtq_s32(v, vdupq_n_s32(0))) };
}
};

struct Vec4F32 {
Expand Down Expand Up @@ -523,6 +511,8 @@ struct Vec4F32 {
// Lane-wise float addition.
Vec4F32 operator +(Vec4F32 other) const {
	const float32x4_t sum = vaddq_f32(v, other.v);
	return Vec4F32{ sum };
}
// Lane-wise float subtraction (this - other).
Vec4F32 operator -(Vec4F32 other) const {
	const float32x4_t difference = vsubq_f32(v, other.v);
	return Vec4F32{ difference };
}
// Lane-wise float multiplication.
Vec4F32 operator *(Vec4F32 other) const {
	const float32x4_t product = vmulq_f32(v, other.v);
	return Vec4F32{ product };
}
// Lane-wise minimum of this and other.
Vec4F32 Min(Vec4F32 other) const {
	const float32x4_t smaller = vminq_f32(v, other.v);
	return Vec4F32{ smaller };
}
// Lane-wise maximum of this and other.
Vec4F32 Max(Vec4F32 other) const {
	const float32x4_t larger = vmaxq_f32(v, other.v);
	return Vec4F32{ larger };
}
// In-place lane-wise float addition.
void operator +=(Vec4F32 other) {
	const float32x4_t sum = vaddq_f32(v, other.v);
	v = sum;
}
// In-place lane-wise float subtraction.
void operator -=(Vec4F32 other) {
	const float32x4_t difference = vsubq_f32(v, other.v);
	v = difference;
}
// In-place lane-wise float multiplication.
void operator *=(Vec4F32 other) {
	const float32x4_t product = vmulq_f32(v, other.v);
	v = product;
}
Expand Down Expand Up @@ -623,6 +613,14 @@ inline bool AnyZeroSignBit(Vec4S32 value) {
return (mask & 0x80000000) == 0;
}

// Returns true if at least one of the four lanes has its sign bit clear.
inline bool AnyZeroSignBit(Vec4F32 value) {
	// Work on the raw bit patterns so the sign bits can be combined with integer ANDs.
	const int32x4_t bits = vreinterpretq_s32_f32(value.v);
	// AND the four lanes together: the sign bit survives only if every lane has it set.
	const int32x2_t halves = vand_s32(vget_low_s32(bits), vget_high_s32(bits));
	const int combined = vget_lane_s32(halves, 0) & vget_lane_s32(halves, 1);
	// A non-negative int has bit 31 clear, i.e. some lane had a zero sign bit.
	return combined >= 0;
}


struct Vec4U16 {
uint16x4_t v; // 64 bits.

Expand Down
46 changes: 37 additions & 9 deletions GPU/Common/DepthRaster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,7 @@ struct Edge {
// Outcome of DepthRasterTriangle for a single triangle. DepthRasterScreenVerts casts
// these values to int and uses them as indices into its local stats array.
enum class TriangleResult {
// Not rejected; counted into gpuStats.numDepthRasterPrims.
OK,
// The scissor-clamped bounding box is empty in X or Y (maxX == minX or maxY == minY).
NoPixels,
// Signed (twice) triangle area is negative.
Backface,
// Non-negative (twice) area below MIN_TWICE_TRI_AREA, including zero area.
TooSmall,
// (Twice) area below MIN_TWICE_TRI_AREA: negative, zero or simply tiny.
SmallOrBackface,
};

constexpr int MIN_TWICE_TRI_AREA = 10;
Expand Down Expand Up @@ -130,16 +129,14 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor
int maxY = std::min(std::max(std::max(v0y, v1y), v2y), (int)scissor.y2);
if (maxX == minX || maxY == minY) {
// No pixels, or outside screen.
// Most of these are now rejected earlier, by the bounding-box check in the clipping pass.
return TriangleResult::NoPixels;
}

// TODO: Cull really small triangles here - we can increase the threshold a bit probably.
int triArea = (v1y - v2y) * v0x + (v2x - v1x) * v0y + (v1x * v2y - v2x * v1y);
if (triArea < 0) {
return TriangleResult::Backface;
}
if (triArea < MIN_TWICE_TRI_AREA) {
return TriangleResult::TooSmall; // Or zero area.
return TriangleResult::SmallOrBackface; // Or zero area.
}

float oneOverTriArea = 1.0f / (float)triArea;
Expand Down Expand Up @@ -332,8 +329,15 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr

int collected = 0;
int planeCulled = 0;
int boxCulled = 0;
const float *verts[12]; // four triangles at a time!
const int count = draw.vertexCount;

Vec4F32 scissorX1 = Vec4F32::Splat((float)draw.scissor.x1);
Vec4F32 scissorY1 = Vec4F32::Splat((float)draw.scissor.y1);
Vec4F32 scissorX2 = Vec4F32::Splat((float)draw.scissor.x2);
Vec4F32 scissorY2 = Vec4F32::Splat((float)draw.scissor.y2);

for (int i = 0; i < count; i += 3) {
// Collect valid triangles into buffer.
const float *v0 = transformed + indexBuffer[i] * 4;
Expand Down Expand Up @@ -397,6 +401,30 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
y2 *= recipW2;
z2 = (z2 * recipW2).Clamp(0.0f, 65535.0f);

// Check bounding box size (clamped to screen edges). Cast to integer for crude rounding (and to match the rasterizer).
Vec4S32 minX = Vec4S32FromF32(x0.Min(x1.Min(x2)).Max(scissorX1));
Vec4S32 minY = Vec4S32FromF32(y0.Min(y1.Min(y2)).Max(scissorY1));
Vec4S32 maxX = Vec4S32FromF32(x0.Max(x1.Max(x2)).Min(scissorX2));
Vec4S32 maxY = Vec4S32FromF32(y0.Max(y1.Max(y2)).Min(scissorY2));

// If every one of the four triangles has min == max in at least one dimension, none of them covers any pixels and the whole batch can be skipped early.
Vec4S32 eqMask = minX.CompareEq(maxX) | minY.CompareEq(maxY);
// Otherwise we just proceed to triangle setup with all four for now. Later might want to
// compact the remaining triangles... Or do more checking here.
// We could also save the computed boxes for later.
if (!AnyZeroSignBit(eqMask)) {
boxCulled += 4;
continue;
}

// Floating point triangle area. Can't be reused for the integer-snapped raster reliably (though may work...)
// Still good for culling early and pretty cheap to compute.
Vec4F32 triArea = (y1 - y2) * x0 + (x2 - x1) * y0 + (x1 * y2 - x2 * y1) - Vec4F32::Splat((float)MIN_TWICE_TRI_AREA);
if (!AnyZeroSignBit(triArea)) {
gpuStats.numDepthRasterEarlySize += 4;
continue;
}

Vec4S32FromF32(x0).Store(tx + outCount);
Vec4S32FromF32(x1).Store(tx + outCount + 4);
Vec4S32FromF32(x2).Store(tx + outCount + 8);
Expand Down Expand Up @@ -426,6 +454,7 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
}

gpuStats.numDepthRasterZCulled += planeCulled;
gpuStats.numDepthEarlyBoxCulled += boxCulled;
return outCount;
}

Expand All @@ -446,7 +475,7 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, const int *tx, con
break;
case GE_PRIM_TRIANGLES:
{
int stats[4]{};
int stats[3]{};
// Batches of 4 triangles, as output by the clip function.
for (int i = 0; i < count; i += 12) {
switch (draw.compareMode) {
Expand All @@ -467,9 +496,8 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, const int *tx, con
}
}
}
gpuStats.numDepthRasterBackface += stats[(int)TriangleResult::Backface];
gpuStats.numDepthRasterNoPixels += stats[(int)TriangleResult::NoPixels];
gpuStats.numDepthRasterTooSmall += stats[(int)TriangleResult::TooSmall];
gpuStats.numDepthRasterTooSmall += stats[(int)TriangleResult::SmallOrBackface];
gpuStats.numDepthRasterPrims += stats[(int)TriangleResult::OK];
break;
}
Expand Down
6 changes: 4 additions & 2 deletions GPU/GPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,10 +111,11 @@ struct GPUStatistics {
msPrepareDepth = 0.0f;
msRasterizeDepth = 0.0f;
numDepthRasterPrims = 0;
numDepthRasterBackface = 0;
numDepthRasterEarlySize = 0;
numDepthRasterNoPixels = 0;
numDepthRasterTooSmall = 0;
numDepthRasterZCulled = 0;
numDepthEarlyBoxCulled = 0;
vertexGPUCycles = 0;
otherGPUCycles = 0;
}
Expand Down Expand Up @@ -159,10 +160,11 @@ struct GPUStatistics {
int vertexGPUCycles;
int otherGPUCycles;
int numDepthRasterPrims;
int numDepthRasterBackface;
int numDepthRasterEarlySize;
int numDepthRasterNoPixels;
int numDepthRasterTooSmall;
int numDepthRasterZCulled;
int numDepthEarlyBoxCulled;
// Flip count. Doesn't really belong here.
int numFlips;
};
Expand Down
5 changes: 3 additions & 2 deletions GPU/GPUCommonHW.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1801,7 +1801,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
"replacer: tracks %d references, %d unique textures\n"
"Cpy: depth %d, color %d, reint %d, blend %d, self %d\n"
"GPU cycles: %d (%0.1f per vertex)\n"
"Z-rast: %0.2f/%0.2f ms, %d prim, %d nopix, %d small, %d back, %d zcull\n%s",
"Z-rast: %0.2f/%0.2f ms, %d prim, %d nopix, %d small, %d earlysize, %d zcull, %d box\n%s",
gpuStats.msProcessingDisplayLists * 1000.0f,
gpuStats.numDrawSyncs,
gpuStats.numListSyncs,
Expand Down Expand Up @@ -1843,8 +1843,9 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
gpuStats.numDepthRasterPrims,
gpuStats.numDepthRasterNoPixels,
gpuStats.numDepthRasterTooSmall,
gpuStats.numDepthRasterBackface,
gpuStats.numDepthRasterEarlySize,
gpuStats.numDepthRasterZCulled,
gpuStats.numDepthEarlyBoxCulled,
debugRecording_ ? "(debug-recording)" : ""
);
}

0 comments on commit 70cddec

Please sign in to comment.