Skip to content

Commit

Permalink
More NEON
Browse files (browse the repository at this point in the history)
  • Loading branch information
hrydgard committed Dec 9, 2023
1 parent 82faaa0 commit 55d4875
Showing 1 changed file with 10 additions and 11 deletions.
21 changes: 10 additions & 11 deletions GPU/Common/DrawEngineCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -442,11 +442,12 @@ bool DrawEngineCommon::TestBoundingBoxFast(const void *vdata, int vertexCount, u
__m128 pos = _mm_mul_ps(_mm_cvtepi32_ps(bits), scaleFactor);
_mm_storeu_ps(verts + i * 3, pos); // TODO: use stride 4 to avoid clashing writes?
}
#elif PPSSPP_ARCH(ARM_NEON)
#elif 0 && PPSSPP_ARCH(ARM_NEON)
__m128 scaleFactor = vdupq_n_f32(1.0f / 32768.0f);
for (int i = 0; i < vertexCount; i++) {
const int16_t *dataPtr = ((const int16_t *)((const s8 *)vdata + i * stride + offset));
int32x4_t data = vmovl_s16(vld1_s16(dataPtr));
float32x4_t pos = vcvtq_n_s32_f32(data, 15); // This does the division by 32768.0f, effectively.
float32x4_t pos = vmulq_f32(scaleFactor, vcvtq_s32_f32(data)); // This does the division by 32768.0f, effectively.
vst1q_f32(verts + i * 3, pos);
}
#else
Expand Down Expand Up @@ -522,28 +523,26 @@ bool DrawEngineCommon::TestBoundingBoxFast(const void *vdata, int vertexCount, u
uint32x4_t inside = vdupq_n_u32(0);
for (int i = 0; i < vertexCount; i++) {
const float *pos = verts + i * vertStride;
float32x4_t objpos = vld1q_f32(pos);
float32x4_t worldpos = vaddq_f32(
vaddq_f32(
vmulq_f32(worldX, vdupq_n_u32(pos[0])),
vmulq_f32(worldY, vdupq_n_u32(pos[1]))
vmulq_laneq_f32(worldX, objpos, 0),
vmulq_laneq_f32(worldY, objpos, 1)
),
vaddq_f32(
vmulq_f32(worldZ, vdupq_n_u32(pos[2])),
vmulq_laneq_f32(worldZ, objpos, 2),
worldW
)
);
// OK, now we check it against the four planes.
// This is really curiously similar to a matrix multiplication (well, it is one).
float32x4_t posX = vdupq_laneq_f32(worldpos, 0);
float32x4_t posY = vdupq_laneq_f32(worldpos, 1);
float32x4_t posZ = vdupq_laneq_f32(worldpos, 2);
float32x4_t planeDist = vaddq_f32(
vaddq_f32(
vmulq_f32(planeX, posX),
vmulq_f32(planeY, posY)
vmulq_laneq_f32(planeX, worldpos, 0),
vmulq_laneq_f32(planeY, worldpos, 1)
),
vaddq_f32(
vmulq_f32(planeZ, posZ),
vmulq_laneq_f32(planeZ, worldpos, 2),
planeW
)
);
Expand Down

0 comments on commit 55d4875

Please sign in to comment.