Skip to content

Commit

Permalink
Fastcull: SSE/NEON-optimize 16-bit position conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
hrydgard committed Dec 5, 2023
1 parent d1ab98d commit 746d320
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 3 deletions.
25 changes: 24 additions & 1 deletion GPU/Common/DrawEngineCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,6 @@ bool DrawEngineCommon::TestBoundingBox(const void *vdata, const void *inds, int
bool DrawEngineCommon::TestBoundingBoxFast(const void *vdata, int vertexCount, u32 vertType) {
SimpleVertex *corners = (SimpleVertex *)(decoded_ + 65536 * 12);
float *verts = (float *)(decoded_ + 65536 * 18);
int vertStride = 3;

// Although this may lead to drawing that shouldn't happen, the viewport is more complex on VR.
// Let's always say objects are within bounds.
Expand All @@ -418,6 +417,9 @@ bool DrawEngineCommon::TestBoundingBoxFast(const void *vdata, int vertexCount, u
VertexDecoder *dec = GetVertexDecoder(vertType);
int stride = dec->VertexSize();
int offset = dec->posoff;
int vertStride = 3;

// TODO: Possibly do the plane tests directly against the source formats instead of converting.
switch (vertType & GE_VTYPE_POS_MASK) {
case GE_VTYPE_POS_8BIT:
for (int i = 0; i < vertexCount; i++) {
Expand All @@ -428,13 +430,34 @@ bool DrawEngineCommon::TestBoundingBoxFast(const void *vdata, int vertexCount, u
}
break;
case GE_VTYPE_POS_16BIT:
{
#if defined(_M_SSE)
__m128 scaleFactor = _mm_set1_ps(1.0f / 32768.0f);
for (int i = 0; i < vertexCount; i++) {
const s16 *data = ((const s16 *)((const s8 *)vdata + i * stride + offset));
__m128i bits = _mm_castpd_si128(_mm_load_sd((const double *)data));
// Sign extension. Ugly without SSE4.
bits = _mm_srai_epi32(_mm_unpacklo_epi16(bits, bits), 16);
__m128 pos = _mm_mul_ps(_mm_cvtepi32_ps(bits), scaleFactor);
_mm_storeu_ps(verts + i * 3, pos); // TODO: use stride 4 to avoid clashing writes?
}
#elif PPSSPP_ARCH(ARM_NEON)
for (int i = 0; i < vertexCount; i++) {
const int16_t *dataPtr = ((const int16_t *)((const s8 *)vdata + i * stride + offset));
int32x4_t data = vmovl_s16(vld1_s16(dataPtr));
float32x4_t pos = vcvtq_n_s32_f32(data, 15); // This does the division by 32768.0f, effectively.
vst1q_f32(verts + i * 3, pos);
}
#else
for (int i = 0; i < vertexCount; i++) {
const s16 *data = ((const s16 *)((const s8 *)vdata + i * stride + offset));
for (int j = 0; j < 3; j++) {
verts[i * 3 + j] = data[j] * (1.0f / 32768.0f);
}
}
#endif
break;
}
case GE_VTYPE_POS_FLOAT:
// No need to copy in this case, we can just read directly from the source format with a stride.
verts = (float *)((uint8_t *)vdata + offset);
Expand Down
4 changes: 2 additions & 2 deletions GPU/Common/VertexDecoderX86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1503,7 +1503,7 @@ void VertexDecoderJitCache::Jit_AnyS8ToFloat(int srcoff) {
if (cpu_info.bSSE4_1) {
PMOVSXBD(XMM1, R(XMM1));
} else {
PUNPCKLBW(XMM1, R(XMM3));
PUNPCKLBW(XMM1, R(XMM3)); // TODO: XMM1, XMM1, same below, then just PSRAD?
PUNPCKLWD(XMM1, R(XMM3));
PSLLD(XMM1, 24);
PSRAD(XMM1, 24);
Expand All @@ -1525,7 +1525,7 @@ void VertexDecoderJitCache::Jit_AnyS16ToFloat(int srcoff) {
if (cpu_info.bSSE4_1) {
PMOVSXWD(XMM1, R(XMM1));
} else {
PUNPCKLWD(XMM1, R(XMM3));
PUNPCKLWD(XMM1, R(XMM1)); // TODO: XMM1, XMM1 then just PSRAD?
PSLLD(XMM1, 16);
PSRAD(XMM1, 16);
}
Expand Down

0 comments on commit 746d320

Please sign in to comment.