Skip to content

Commit

Permalink
Compute and cull by triangle area early before writing 4-groups of triangles
Browse files Browse the repository at this point in the history
  • Loading branch information
hrydgard committed Dec 29, 2024
1 parent e8786fc commit ef934df
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 13 deletions.
12 changes: 12 additions & 0 deletions Common/Math/CrossSIMD.h
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,10 @@ struct Vec4F32 {
// Converts the four float lanes of `f` to signed 32-bit integers using the
// truncating conversion intrinsic (_mm_cvttps_epi32).
inline Vec4S32 Vec4S32FromF32(Vec4F32 f) {
	const auto truncatedLanes = _mm_cvttps_epi32(f.v);
	return Vec4S32{ truncatedLanes };
}
// Converts the four signed 32-bit integer lanes of `f` to floats
// (_mm_cvtepi32_ps).
inline Vec4F32 Vec4F32FromS32(Vec4S32 f) {
	const auto floatLanes = _mm_cvtepi32_ps(f.v);
	return Vec4F32{ floatLanes };
}

// Returns true if at least one of the four float lanes has a clear sign bit.
// _mm_movemask_ps packs the four sign bits into the low nibble, so anything
// other than 0xF means some lane's sign bit is zero.
inline bool AnyZeroSignBit(Vec4F32 value) {
	const int signBits = _mm_movemask_ps(value.v);
	const bool allSignBitsSet = (signBits == 0xF);
	return !allSignBitsSet;
}

// Make sure the W component of scale is 1.0f.
inline void ScaleInplace(Mat4F32 &m, Vec4F32 scale) {
m.col0 = _mm_mul_ps(m.col0, scale.v);
Expand Down Expand Up @@ -609,6 +613,14 @@ inline bool AnyZeroSignBit(Vec4S32 value) {
return (mask & 0x80000000) == 0;
}

// Returns true if at least one of the four float lanes has a clear sign bit.
// The lanes are reinterpreted as integers and ANDed together; the sign bit of
// the combined value stays set only when it was set in every lane.
inline bool AnyZeroSignBit(Vec4F32 value) {
	const int32x4_t laneBits = vreinterpretq_s32_f32(value.v);
	// Fold 4 lanes -> 2 lanes.
	const int32x2_t foldedPair = vand_s32(vget_low_s32(laneBits), vget_high_s32(laneBits));
	// Fold 2 lanes -> 1 scalar.
	const int firstLane = vget_lane_s32(foldedPair, 0);
	const int secondLane = vget_lane_s32(foldedPair, 1);
	const int combined = firstLane & secondLane;
	return (combined & 0x80000000) == 0;
}


struct Vec4U16 {
uint16x4_t v; // 64 bits.

Expand Down
22 changes: 13 additions & 9 deletions GPU/Common/DepthRaster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,7 @@ struct Edge {
// Outcome reported by DepthRasterTriangle for a single triangle.
// The enumerators are also cast to int and used as indices into a local
// stats array when the per-result counters are accumulated into gpuStats.
enum class TriangleResult {
OK,
NoPixels,
Backface,
TooSmall,
SmallOrBackface,
};

// Culling threshold for the triangle area expression (per the name, twice the
// triangle's area). Triangles whose computed value is below this are rejected
// instead of rasterized; the same constant is subtracted in the SIMD early-out.
constexpr int MIN_TWICE_TRI_AREA = 10;
Expand Down Expand Up @@ -130,16 +129,14 @@ TriangleResult DepthRasterTriangle(uint16_t *depthBuf, int stride, DepthScissor
int maxY = std::min(std::max(std::max(v0y, v1y), v2y), (int)scissor.y2);
if (maxX == minX || maxY == minY) {
// No pixels, or outside screen.
// Most of these are now gone in the initial pass.
return TriangleResult::NoPixels;
}

// TODO: Cull really small triangles here - we can increase the threshold a bit probably.
int triArea = (v1y - v2y) * v0x + (v2x - v1x) * v0y + (v1x * v2y - v2x * v1y);
if (triArea < 0) {
return TriangleResult::Backface;
}
if (triArea < MIN_TWICE_TRI_AREA) {
return TriangleResult::TooSmall; // Or zero area.
return TriangleResult::SmallOrBackface; // Or zero area.
}

float oneOverTriArea = 1.0f / (float)triArea;
Expand Down Expand Up @@ -420,6 +417,14 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *tr
continue;
}

// Floating point triangle area. Can't be reused for the integer-snapped raster reliably (though may work...)
// Still good for culling early and pretty cheap to compute.
Vec4F32 triArea = (y1 - y2) * x0 + (x2 - x1) * y0 + (x1 * y2 - x2 * y1) - Vec4F32::Splat((float)MIN_TWICE_TRI_AREA);
if (!AnyZeroSignBit(triArea)) {
gpuStats.numDepthRasterEarlySize += 4;
continue;
}

Vec4S32FromF32(x0).Store(tx + outCount);
Vec4S32FromF32(x1).Store(tx + outCount + 4);
Vec4S32FromF32(x2).Store(tx + outCount + 8);
Expand Down Expand Up @@ -470,7 +475,7 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, const int *tx, con
break;
case GE_PRIM_TRIANGLES:
{
int stats[4]{};
int stats[3]{};
// Batches of 4 triangles, as output by the clip function.
for (int i = 0; i < count; i += 12) {
switch (draw.compareMode) {
Expand All @@ -491,9 +496,8 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, const int *tx, con
}
}
}
gpuStats.numDepthRasterBackface += stats[(int)TriangleResult::Backface];
gpuStats.numDepthRasterNoPixels += stats[(int)TriangleResult::NoPixels];
gpuStats.numDepthRasterTooSmall += stats[(int)TriangleResult::TooSmall];
gpuStats.numDepthRasterTooSmall += stats[(int)TriangleResult::SmallOrBackface];
gpuStats.numDepthRasterPrims += stats[(int)TriangleResult::OK];
break;
}
Expand Down
4 changes: 2 additions & 2 deletions GPU/GPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ struct GPUStatistics {
msPrepareDepth = 0.0f;
msRasterizeDepth = 0.0f;
numDepthRasterPrims = 0;
numDepthRasterBackface = 0;
numDepthRasterEarlySize = 0;
numDepthRasterNoPixels = 0;
numDepthRasterTooSmall = 0;
numDepthRasterZCulled = 0;
Expand Down Expand Up @@ -160,7 +160,7 @@ struct GPUStatistics {
int vertexGPUCycles;
int otherGPUCycles;
int numDepthRasterPrims;
int numDepthRasterBackface;
int numDepthRasterEarlySize;
int numDepthRasterNoPixels;
int numDepthRasterTooSmall;
int numDepthRasterZCulled;
Expand Down
4 changes: 2 additions & 2 deletions GPU/GPUCommonHW.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1801,7 +1801,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
"replacer: tracks %d references, %d unique textures\n"
"Cpy: depth %d, color %d, reint %d, blend %d, self %d\n"
"GPU cycles: %d (%0.1f per vertex)\n"
"Z-rast: %0.2f/%0.2f ms, %d prim, %d nopix, %d small, %d back, %d zcull, %d box\n%s",
"Z-rast: %0.2f/%0.2f ms, %d prim, %d nopix, %d small, %d earlysize, %d zcull, %d box\n%s",
gpuStats.msProcessingDisplayLists * 1000.0f,
gpuStats.numDrawSyncs,
gpuStats.numListSyncs,
Expand Down Expand Up @@ -1843,7 +1843,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
gpuStats.numDepthRasterPrims,
gpuStats.numDepthRasterNoPixels,
gpuStats.numDepthRasterTooSmall,
gpuStats.numDepthRasterBackface,
gpuStats.numDepthRasterEarlySize,
gpuStats.numDepthRasterZCulled,
gpuStats.numDepthEarlyBoxCulled,
debugRecording_ ? "(debug-recording)" : ""
Expand Down

0 comments on commit ef934df

Please sign in to comment.