From d31044103d3dacf0b8bbaa25696dcef7d354b660 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 24 Jan 2025 13:36:58 +0100 Subject: [PATCH 1/8] Add basic vscode .launch file, for easy debugging on mac --- .vscode/launch.json | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 .vscode/launch.json diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 000000000000..28b43427b97b --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,26 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "(lldb) Launch", + "type": "cppdbg", + "request": "launch", + "program": "", + "osx": { + "program": "${workspaceFolder}/build/PPSSPPSDL.app/Contents/MacOS/PPSSPPSDL" + }, + "linux": { + "program": "${workspaceRoot}/build/PPSSPPSDL" + }, + "args": [], + "stopAtEntry": false, + "cwd": "${workspaceFolder}", + "environment": [], + "externalConsole": false, + "MIMode": "lldb" + } + ] +} From 9d164b71fbb3ba6d9d9589f3b628c8c76dd98754 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 24 Jan 2025 15:02:22 +0100 Subject: [PATCH 2/8] Some new CrossSIMD operations --- Common/Math/CrossSIMD.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index a62f5d7e5e46..9f19d891232a 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -185,6 +185,19 @@ struct Vec4F32 { static Vec4F32 Load(const float *src) { return Vec4F32{ _mm_loadu_ps(src) }; } static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ _mm_load_ps(src) }; } + static Vec4F32 LoadS8Norm(const int8_t *src) { + __m128i value = _mm_set1_epi32(*((uint32_t *)src)); + __m128i value32 = _mm_unpacklo_epi16(_mm_unpacklo_epi8(value, value), value); + // Sign extension. A bit ugly without SSE4. + value32 = _mm_srai_epi32(value32, 24); + return Vec4F32 { _mm_mul_ps(_mm_cvtepi32_ps(value32), _mm_set1_ps(1.0f / 128.0f)) }; + } + static Vec4F32 LoadS16Norm(const int16_t *src) { // Divides by 32768.0f + __m128i bits = _mm_castpd_si128(_mm_load_sd((const double *)src)); + // Sign extension. A bit ugly without SSE4. 
+ bits = _mm_srai_epi32(_mm_unpacklo_epi16(bits, bits), 16); + return Vec4F32 { _mm_mul_ps(_mm_cvtepi32_ps(bits), _mm_set1_ps(1.0f / 32768.0f)) }; + } void Store(float *dst) { _mm_storeu_ps(dst, v); } void Store2(float *dst) { _mm_storel_epi64((__m128i *)dst, _mm_castps_si128(v)); } void StoreAligned (float *dst) { _mm_store_ps(dst, v); } @@ -506,6 +519,14 @@ struct Vec4F32 { static Vec4F32 Splat(float lane) { return Vec4F32{ vdupq_n_f32(lane) }; } static Vec4F32 Load(const float *src) { return Vec4F32{ vld1q_f32(src) }; } + static Vec4F32 LoadS8Norm(const int8_t *src) { + const int8x8_t value = (int8x8_t)vdup_n_u32(*((uint32_t *)src)); + const int16x8_t value16 = vmovl_s8(value); + return Vec4F32 { vcvtq_n_f32_s32(vmovl_s16(vget_low_u16(value16)), 7) }; + } + static Vec4F32 LoadS16Norm(const int16_t *src) { // Divides by 32768.0f + return Vec4F32 { vcvtq_n_f32_s32(vmovl_s16(vld1_s16(src)), 15) }; + } static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ vld1q_f32(src) }; } void Store(float *dst) { vst1q_f32(dst, v); } void Store2(float *dst) { vst1_f32(dst, vget_low_f32(v)); } @@ -727,12 +748,25 @@ struct Vec8U16 { struct Vec4S32 { s32 v[4]; + static Vec4F32 Zero() { return Vec4F32{ { 0.0f, 0.0f, 0.0f, 0.0f } }; } + static Vec4F32 Splat(float lane) { return Vec4F32{ { lane, lane, lane, lane } }; } + + static Vec4F32 Load(const float *src) { return Vec4F32{ { src[0], src[1], src[2], src[3] } }; } + static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ { src[0], src[1], src[2], src[3] } }; } + void Store(float *dst) { memcpy(dst, v, sizeof(Vec4S32)); } + void Store2(float *dst) { memcpy(dst, v, 2 * sizeof(s32)); } + void StoreAligned(float *dst) { memcpy(dst, v, sizeof(Vec4S32)); } + void Store3(float *dst) { memcpy(dst, v, 3 * sizeof(s32)); } + Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ { v[0] + other.v[0], v[1] + other.v[1], v[2] + other.v[2], v[3] + other.v[3], } }; } Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ { v[0] - other.v[0], v[1] - other.v[1], v[2] - other.v[2], v[3] - other.v[3], } }; } + Vec4S32 operator *(Vec4S32 other) const { + return Vec4S32{ { v[0] * other.v[0], v[1] * other.v[1], v[2] * other.v[2], v[3] * other.v[3], } }; + } }; #endif From 74501b06b6af1a650ffde3a14c3671c58339776f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 24 Jan 2025 16:12:21 +0100 Subject: [PATCH 3/8] CrossSIMD: Add more no-simd fallback types --- Common/Math/CrossSIMD.h | 175 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 168 insertions(+), 7 deletions(-) diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index 9f19d891232a..f70be31d85d7 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -699,7 +699,6 @@ inline bool AnyZeroSignBit(Vec4F32 value) { #endif } - struct Vec4U16 { uint16x4_t v; // 64 bits. 
@@ -745,9 +744,126 @@ struct Vec8U16 { #else + +struct Mat4F32 { + Mat4F32() {} + Mat4F32(const float *src) { + memcpy(m, src, sizeof(m));) + } + void Store(float *dest) { + memcpy(dest, m, sizeof(m)); + } + static Mat4F32 Load4x3(const float *src) { + m[0] = src[0]; + m[1] = src[1]; + m[2] = src[2]; + m[3] = 0.0f; + m[0] = src[3]; + m[1] = src[4]; + m[2] = src[5]; + m[3] = 0.0f; + m[0] = src[6]; + m[1] = src[7]; + m[2] = src[8]; + m[3] = 0.0f; + m[0] = src[9]; + m[1] = src[10]; + m[2] = src[11]; + m[3] = 1.0f; + return result; + } + + // cols are consecutive + float m[16]; +}; + struct Vec4S32 { s32 v[4]; + static Vec4S32 Zero() { return Vec4S32{ vdupq_n_s32(0) }; } + static Vec4S32 Splat(int lane) { return Vec4S32{ vdupq_n_s32(lane) }; } + + static Vec4S32 Load(const int *src) { return Vec4S32{ vld1q_s32(src) }; } + static Vec4S32 LoadAligned(const int *src) { return Vec4S32{ vld1q_s32(src) }; } + void Store(int *dst) { vst1q_s32(dst, v); } + void Store2(int *dst) { vst1_s32(dst, vget_low_s32(v)); } + void StoreAligned(int *dst) { vst1q_s32(dst, v); } + + // Warning: Unlike on x86, this is a full 32-bit multiplication. + Vec4S32 Mul16(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; } + + Vec4S32 SignExtend16() const { return Vec4S32{ vshrq_n_s32(vshlq_n_s32(v, 16), 16) }; } + // NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output (on SSE2 at least). + Vec4S32 Min16(Vec4S32 other) const { return Vec4S32{ vminq_s32(v, other.v) }; } + Vec4S32 Max16(Vec4S32 other) const { return Vec4S32{ vmaxq_s32(v, other.v) }; } + Vec4S32 FixupAfterMinMax() const { return Vec4S32{ v }; } + + // NOTE: May be slow. + int operator[](size_t index) const { return ((int *)&v)[index]; } + + Vec4S32 operator +(Vec4S32 other) const { + return Vec4S32{ { v[0] + other.v[0], v[1] + other.v[1], v[2] + other.v[2], v[3] + other.v[3], } }; + } + Vec4S32 operator -(Vec4S32 other) const { + return Vec4S32{ { v[0] - other.v[0], v[1] - other.v[1], v[2] - other.v[2], v[3] - other.v[3], } }; + } + Vec4S32 operator *(Vec4S32 other) const { + return Vec4S32{ { v[0] * other.v[0], v[1] * other.v[1], v[2] * other.v[2], v[3] * other.v[3], } }; + } + // TODO: Can optimize the bitwise ones with 64-bit operations. + Vec4S32 operator |(Vec4S32 other) const { + return Vec4S32{ { v[0] | other.v[0], v[1] | other.v[1], v[2] | other.v[2], v[3] | other.v[3], } }; + } + Vec4S32 operator &(Vec4S32 other) const { + return Vec4S32{ { v[0] & other.v[0], v[1] & other.v[1], v[2] & other.v[2], v[3] & other.v[3], } }; + } + Vec4S32 operator ^(Vec4S32 other) const { + return Vec4S32{ { v[0] ^ other.v[0], v[1] ^ other.v[1], v[2] ^ other.v[2], v[3] ^ other.v[3], } }; + } + Vec4S32 AndNot(Vec4S32 other) const { + return Vec4S32{ { v[0] & ~other.v[0], v[1] & ~other.v[1], v[2] & ~other.v[2], v[3] & ~other.v[3], } }; + } + Vec4S32 Mul(Vec4S32 other) const { return *this * other; } + void operator &=(Vec4S32 other) { for (int i = 0; i < 4; i++) v[i] &= other.v[i]; } + void operator +=(Vec4S32 other) { for (int i = 0; i < 4; i++) v[i] += other.v[i]; } + void operator -=(Vec4S32 other) { for (int i = 0; i < 4; i++) v[i] -= other.v[i]; } + + template + Vec4S32 Shl() const { return Vec4S32{ { v[0] << imm, v[1] << imm, v[2] << imm, v[3] << imm } }; } + + Vec4S32 CompareEq(Vec4S32 other) const { + Vec4S32 out; + for (int i = 0; i < 4; i++) { + out[i] = v[i] == other.v[i] ? 
0xFFFFFFFF : 0; + } + return out; + } + Vec4S32 CompareLt(Vec4S32 other) const { + Vec4S32 out; + for (int i = 0; i < 4; i++) { + out[i] = v[i] < other.v[i] ? 0xFFFFFFFF : 0; + } + return out; + } + Vec4S32 CompareGt(Vec4S32 other) const { + Vec4S32 out; + for (int i = 0; i < 4; i++) { + out[i] = v[i] > other.v[i] ? 0xFFFFFFFF : 0; + } + return out; + } + Vec4S32 CompareGtZero() const { + Vec4S32 out; + for (int i = 0; i < 4; i++) { + out[i] = v[i] > 0 ? 0xFFFFFFFF : 0; + } + return out; + } +}; + +struct Vec4F32 { + float v[4]; + static Vec4F32 Zero() { return Vec4F32{ { 0.0f, 0.0f, 0.0f, 0.0f } }; } static Vec4F32 Splat(float lane) { return Vec4F32{ { lane, lane, lane, lane } }; } @@ -757,16 +873,61 @@ struct Vec4S32 { void Store2(float *dst) { memcpy(dst, v, 2 * sizeof(s32)); } void StoreAligned(float *dst) { memcpy(dst, v, sizeof(Vec4S32)); } void Store3(float *dst) { memcpy(dst, v, 3 * sizeof(s32)); } +} - Vec4S32 operator +(Vec4S32 other) const { - return Vec4S32{ { v[0] + other.v[0], v[1] + other.v[1], v[2] + other.v[2], v[3] + other.v[3], } }; +struct Vec4U16 { + uint16_t v[4]; // 64 bits. + + static Vec4U16 Zero() { return Vec4U16{}; } + static Vec4U16 Splat(uint16_t lane) { return Vec4U16{ { lane, lane, lane, lane } }; } + + static Vec4U16 Load(const uint16_t *mem) { return Vec4U16{ { mem[0], mem[1], mem[2], mem[3] }}; } + void Store(uint16_t *mem) { memcpy(mem, 8, v); } + + static Vec4U16 FromVec4S32(Vec4S32 v) { + return Vec4U16{ { (uint16_t)v.v[0], (uint16_t)v.v[1], (uint16_t)v.v[2], (uint16_t)v.v[3] }}; } - Vec4S32 operator -(Vec4S32 other) const { - return Vec4S32{ { v[0] - other.v[0], v[1] - other.v[1], v[2] - other.v[2], v[3] - other.v[3], } }; + static Vec4U16 FromVec4F32(Vec4F32 v) { + return Vec4U16{ { (uint16_t)v.v[0], (uint16_t)v.v[1], (uint16_t)v.v[2], (uint16_t)v.v[3] }}; } - Vec4S32 operator *(Vec4S32 other) const { - return Vec4S32{ { v[0] * other.v[0], v[1] * other.v[1], v[2] * other.v[2], v[3] * other.v[3], } }; + + Vec4U16 operator |(Vec4U16 other) const { return Vec4U16{ { v[0] | other.v[0], v[1] | other.v[1], v[2] | other.v[2], v[3] | other.v[3], } }; } + Vec4U16 operator &(Vec4U16 other) const { return Vec4U16{ { v[0] & other.v[0], v[1] & other.v[1], v[2] & other.v[2], v[3] & other.v[3], } }; } + Vec4U16 operator ^(Vec4U16 other) const { return Vec4U16{ { v[0] ^ other.v[0], v[1] ^ other.v[1], v[2] ^ other.v[2], v[3] ^ other.v[3], } }; } + +/* + Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ vmax_u16(v, other.v) }; } + Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ vmin_u16(v, other.v) }; } + Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ vclt_u16(v, other.v) }; } + + Vec4U16 AndNot(Vec4U16 inverted) { return Vec4U16{ vand_u16(v, vmvn_u16(inverted.v)) }; } + */ +}; + +inline bool AnyZeroSignBit(Vec4S32 value) { + for (int i = 0; i < 4; i++) { + if (value.v[i] >= 0) { + return true; + } } + return false; +} + +inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) { + return Vec4U16{ { (uint16_t)(v.v[0] >> 31), (uint16_t)(v.v[1] >> 31), (uint16_t)(v.v[2] >> 31), (uint16_t)(v.v[3] >> 31), } }; +} + +struct Vec8U16 { + uint16_t v[8]; + + static Vec8U16 Zero() { return Vec8U16{}; } + static Vec8U16 Splat(uint16_t value) { return Vec8U16{ { + value, value, value, value, value, value, value, value, + }}; } + + static Vec8U16 Load(const uint16_t *mem) { Vec8U16 tmp; memcpy(tmp.v, mem, sizeof(v)); } + void Store(uint16_t *mem) { memcpy(mem, v, sizeof(v)); } }; + #endif From 32df2f7a038401ebf2dba4a3f063e5c2110f69ee Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sun, 26 Jan 2025 17:54:26 +0100 Subject: [PATCH 4/8] Remove unused functions --- GPU/Software/TransformUnit.cpp | 8 -------- GPU/Software/TransformUnit.h | 2 -- 2 files changed, 10 deletions(-) diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp index 63b4ae54d4dc..abcba1bb0d2a 100644 --- a/GPU/Software/TransformUnit.cpp +++ b/GPU/Software/TransformUnit.cpp @@ -155,14 +155,6 @@ WorldCoords TransformUnit::ModelToWorldNormal(const ModelCoords &coords) { return Norm3ByMatrix43(coords, gstate.worldMatrix); } -ViewCoords TransformUnit::WorldToView(const WorldCoords &coords) { - return Vec3ByMatrix43(coords, gstate.viewMatrix); -} - -ClipCoords TransformUnit::ViewToClip(const ViewCoords &coords) { - return Vec3ByMatrix44(coords, gstate.projMatrix); -} - template static ScreenCoords ClipToScreenInternal(Vec3f scaled, const ClipCoords &coords, bool *outside_range_flag) { ScreenCoords ret; diff --git a/GPU/Software/TransformUnit.h b/GPU/Software/TransformUnit.h index 95be3b2e1911..be859e4fe892 100644 --- a/GPU/Software/TransformUnit.h +++ b/GPU/Software/TransformUnit.h @@ -121,8 +121,6 @@ class TransformUnit { static WorldCoords ModelToWorldNormal(const ModelCoords& coords); static WorldCoords ModelToWorld(const ModelCoords& coords); - static ViewCoords WorldToView(const WorldCoords& coords); - static ClipCoords ViewToClip(const ViewCoords& coords); static ScreenCoords ClipToScreen(const ClipCoords &coords, bool *outsideRangeFlag); static inline DrawingCoords ScreenToDrawing(int x, int y) { DrawingCoords ret; From 2aaa1e5379e620a267dcd0ebb9244013ce4e5ef1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sun, 26 Jan 2025 17:55:18 +0100 Subject: [PATCH 5/8] CrossSIMD: Expand the no-simd path --- Common/Math/CrossSIMD.h | 400 ++++++++++++++++++++++++++++++++-------- 1 file changed, 324 insertions(+), 76 deletions(-) diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index f70be31d85d7..24d851cde005 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -207,8 +207,6 @@ struct Vec4F32 { _mm_store_ss(dst + 2, _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2))); } - static Vec4F32 LoadVec2(const float *src) { return Vec4F32{ _mm_castsi128_ps(_mm_loadl_epi64((const __m128i *)src)) }; } - static Vec4F32 LoadConvertS16(const int16_t *src) { // Note: will load 8 bytes __m128i value = _mm_loadl_epi64((const __m128i *)src); // 16-bit to 32-bit, use the upper words and an arithmetic shift right to sign extend @@ -250,7 +248,7 @@ struct Vec4F32 { Vec4F32 RecipApprox() const { return Vec4F32{ _mm_rcp_ps(v) }; } Vec4F32 Recip() const { return Vec4F32{ _mm_div_ps(_mm_set1_ps(1.0f), v) }; } - Vec4F32 Clamp(float lower, float higher) { + Vec4F32 Clamp(float lower, float higher) const { return Vec4F32{ _mm_min_ps(_mm_max_ps(v, _mm_set1_ps(lower)), _mm_set1_ps(higher)) }; @@ -537,8 +535,6 @@ struct Vec4F32 { dst[2] = vgetq_lane_f32(v, 2); } - static Vec4F32 LoadVec2(const float *src) { return Vec4F32{ vcombine_f32(vld1_f32(src), vdup_n_f32(0.0f)) }; } // TODO: Feels like there should be a better way. 
- static Vec4F32 LoadConvertS16(const int16_t *src) { int16x4_t value = vld1_s16(src); return Vec4F32{ vcvtq_f32_s32(vmovl_s16(value)) }; @@ -591,7 +587,7 @@ struct Vec4F32 { return Vec4F32{ recip }; } - Vec4F32 Clamp(float lower, float higher) { + Vec4F32 Clamp(float lower, float higher) const { return Vec4F32{ vminq_f32(vmaxq_f32(v, vdupq_n_f32(lower)), vdupq_n_f32(higher)) }; @@ -744,33 +740,35 @@ struct Vec8U16 { #else +// Fake SIMD by using scalar. struct Mat4F32 { Mat4F32() {} Mat4F32(const float *src) { - memcpy(m, src, sizeof(m));) + memcpy(m, src, sizeof(m)); } void Store(float *dest) { memcpy(dest, m, sizeof(m)); } static Mat4F32 Load4x3(const float *src) { - m[0] = src[0]; - m[1] = src[1]; - m[2] = src[2]; - m[3] = 0.0f; - m[0] = src[3]; - m[1] = src[4]; - m[2] = src[5]; - m[3] = 0.0f; - m[0] = src[6]; - m[1] = src[7]; - m[2] = src[8]; - m[3] = 0.0f; - m[0] = src[9]; - m[1] = src[10]; - m[2] = src[11]; - m[3] = 1.0f; - return result; + Mat4F32 mat; + mat.m[0] = src[0]; + mat.m[1] = src[1]; + mat.m[2] = src[2]; + mat.m[3] = 0.0f; + mat.m[0] = src[3]; + mat.m[1] = src[4]; + mat.m[2] = src[5]; + mat.m[3] = 0.0f; + mat.m[0] = src[6]; + mat.m[1] = src[7]; + mat.m[2] = src[8]; + mat.m[3] = 0.0f; + mat.m[0] = src[9]; + mat.m[1] = src[10]; + mat.m[2] = src[11]; + mat.m[3] = 1.0f; + return mat; } // cols are consecutive @@ -778,28 +776,45 @@ struct Mat4F32 { }; struct Vec4S32 { - s32 v[4]; + int32_t v[4]; - static Vec4S32 Zero() { return Vec4S32{ vdupq_n_s32(0) }; } - static Vec4S32 Splat(int lane) { return Vec4S32{ vdupq_n_s32(lane) }; } + static Vec4S32 Zero() { return Vec4S32{}; } + static Vec4S32 Splat(int lane) { return Vec4S32{ { lane, lane, lane, lane } }; } - static Vec4S32 Load(const int *src) { return Vec4S32{ vld1q_s32(src) }; } - static Vec4S32 LoadAligned(const int *src) { return Vec4S32{ vld1q_s32(src) }; } - void Store(int *dst) { vst1q_s32(dst, v); } - void Store2(int *dst) { vst1_s32(dst, vget_low_s32(v)); } - void StoreAligned(int *dst) { vst1q_s32(dst, v); } + static Vec4S32 Load(const int *src) { return Vec4S32{ { src[0], src[1], src[2], src[3] }}; } + static Vec4S32 LoadAligned(const int *src) { return Load(src); } + void Store(int *dst) { memcpy(dst, v, sizeof(v)); } + void Store2(int *dst) { memcpy(dst, v, sizeof(v[0]) * 2); } + void StoreAligned(int *dst) { memcpy(dst, v, sizeof(v)); } - // Warning: Unlike on x86, this is a full 32-bit multiplication. - Vec4S32 Mul16(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; } + // Warning: Unlike on x86 SSE2, this is a full 32-bit multiplication. + Vec4S32 Mul16(Vec4S32 other) const { return Vec4S32{ { v[0] * other.v[0], v[1] * other.v[1], v[2] * other.v[2], v[3] * other.v[3] } }; } - Vec4S32 SignExtend16() const { return Vec4S32{ vshrq_n_s32(vshlq_n_s32(v, 16), 16) }; } + Vec4S32 SignExtend16() const { + Vec4S32 tmp; + for (int i = 0; i < 4; i++) { + tmp.v[i] = (int32_t)(int16_t)v[i]; + } + return tmp; + } // NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output (on SSE2 at least). - Vec4S32 Min16(Vec4S32 other) const { return Vec4S32{ vminq_s32(v, other.v) }; } - Vec4S32 Max16(Vec4S32 other) const { return Vec4S32{ vmaxq_s32(v, other.v) }; } - Vec4S32 FixupAfterMinMax() const { return Vec4S32{ v }; } + Vec4S32 Min16(Vec4S32 other) const { + Vec4S32 tmp; + for (int i = 0; i < 4; i++) { + tmp.v[i] = other.v[i] < v[i] ? 
other.v[i] : v[i]; + } + return tmp; + } + Vec4S32 Max16(Vec4S32 other) const { + Vec4S32 tmp; + for (int i = 0; i < 4; i++) { + tmp.v[i] = other.v[i] > v[i] ? other.v[i] : v[i]; + } + return tmp; + } + Vec4S32 FixupAfterMinMax() const { return *this; } - // NOTE: May be slow. - int operator[](size_t index) const { return ((int *)&v)[index]; } + int operator[](size_t index) const { return v[index]; } Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ { v[0] + other.v[0], v[1] + other.v[1], v[2] + other.v[2], v[3] + other.v[3], } }; @@ -824,6 +839,7 @@ struct Vec4S32 { return Vec4S32{ { v[0] & ~other.v[0], v[1] & ~other.v[1], v[2] & ~other.v[2], v[3] & ~other.v[3], } }; } Vec4S32 Mul(Vec4S32 other) const { return *this * other; } + void operator &=(Vec4S32 other) { for (int i = 0; i < 4; i++) v[i] &= other.v[i]; } void operator +=(Vec4S32 other) { for (int i = 0; i < 4; i++) v[i] += other.v[i]; } void operator -=(Vec4S32 other) { for (int i = 0; i < 4; i++) v[i] -= other.v[i]; } @@ -831,31 +847,31 @@ struct Vec4S32 { template Vec4S32 Shl() const { return Vec4S32{ { v[0] << imm, v[1] << imm, v[2] << imm, v[3] << imm } }; } - Vec4S32 CompareEq(Vec4S32 other) const { + Vec4S32 CompareEq(Vec4S32 other) const { Vec4S32 out; for (int i = 0; i < 4; i++) { - out[i] = v[i] == other.v[i] ? 0xFFFFFFFF : 0; + out.v[i] = v[i] == other.v[i] ? 0xFFFFFFFF : 0; } return out; } - Vec4S32 CompareLt(Vec4S32 other) const { + Vec4S32 CompareLt(Vec4S32 other) const { Vec4S32 out; for (int i = 0; i < 4; i++) { - out[i] = v[i] < other.v[i] ? 0xFFFFFFFF : 0; + out.v[i] = v[i] < other.v[i] ? 0xFFFFFFFF : 0; } return out; } - Vec4S32 CompareGt(Vec4S32 other) const { + Vec4S32 CompareGt(Vec4S32 other) const { Vec4S32 out; for (int i = 0; i < 4; i++) { - out[i] = v[i] > other.v[i] ? 0xFFFFFFFF : 0; + out.v[i] = v[i] > other.v[i] ? 0xFFFFFFFF : 0; } return out; } - Vec4S32 CompareGtZero() const { + Vec4S32 CompareGtZero() const { Vec4S32 out; for (int i = 0; i < 4; i++) { - out[i] = v[i] > 0 ? 0xFFFFFFFF : 0; + out.v[i] = v[i] > 0 ? 0xFFFFFFFF : 0; } return out; } @@ -864,44 +880,209 @@ struct Vec4S32 { struct Vec4F32 { float v[4]; - static Vec4F32 Zero() { return Vec4F32{ { 0.0f, 0.0f, 0.0f, 0.0f } }; } + static Vec4F32 Zero() { return Vec4F32{}; } static Vec4F32 Splat(float lane) { return Vec4F32{ { lane, lane, lane, lane } }; } static Vec4F32 Load(const float *src) { return Vec4F32{ { src[0], src[1], src[2], src[3] } }; } static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ { src[0], src[1], src[2], src[3] } }; } - void Store(float *dst) { memcpy(dst, v, sizeof(Vec4S32)); } - void Store2(float *dst) { memcpy(dst, v, 2 * sizeof(s32)); } - void StoreAligned(float *dst) { memcpy(dst, v, sizeof(Vec4S32)); } - void Store3(float *dst) { memcpy(dst, v, 3 * sizeof(s32)); } -} + static Vec4F32 LoadS8Norm(const int8_t *src) { + Vec4F32 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = (float)src[i] * (1.0f / 128.0f); + } + return temp; + } + static Vec4F32 LoadS16Norm(const int16_t *src) { // Divides by 32768.0f + Vec4F32 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = (float)src[i] * (1.0f / 32768.0f); + } + return temp; + } + void Store(float *dst) { memcpy(dst, v, sizeof(v)); } + void Store2(float *dst) { memcpy(dst, v, sizeof(v[0]) * 2); } + void StoreAligned(float *dst) { memcpy(dst, v, sizeof(v)); } + void Store3(float *dst) { + memcpy(dst, v, sizeof(v[0]) * 3); + } -struct Vec4U16 { - uint16_t v[4]; // 64 bits. 
+ static Vec4F32 LoadConvertS16(const int16_t *src) { + Vec4F32 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = (float)src[i]; + } + return temp; + } - static Vec4U16 Zero() { return Vec4U16{}; } - static Vec4U16 Splat(uint16_t lane) { return Vec4U16{ { lane, lane, lane, lane } }; } + static Vec4F32 LoadConvertS8(const int8_t *src) { // Note: will load 8 bytes, not 4. Only the first 4 bytes will be used. + Vec4F32 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = (float)src[i]; + } + return temp; + } - static Vec4U16 Load(const uint16_t *mem) { return Vec4U16{ { mem[0], mem[1], mem[2], mem[3] }}; } - void Store(uint16_t *mem) { memcpy(mem, 8, v); } + static Vec4F32 LoadF24x3_One(const uint32_t *src) { + uint32_t shifted[4] = { src[0] << 8, src[1] << 8, src[2] << 8, 0 }; + Vec4F32 temp; + memcpy(temp.v, shifted, sizeof(temp.v)); + return temp; + } - static Vec4U16 FromVec4S32(Vec4S32 v) { - return Vec4U16{ { (uint16_t)v.v[0], (uint16_t)v.v[1], (uint16_t)v.v[2], (uint16_t)v.v[3] }}; + static Vec4F32 FromVec4S32(Vec4S32 src) { + Vec4F32 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = (float)src[i]; + } + return temp; } - static Vec4U16 FromVec4F32(Vec4F32 v) { - return Vec4U16{ { (uint16_t)v.v[0], (uint16_t)v.v[1], (uint16_t)v.v[2], (uint16_t)v.v[3] }}; + + float operator[](size_t index) const { return v[index]; } + + Vec4F32 operator +(Vec4F32 other) const { + return Vec4F32{ { v[0] + other.v[0], v[1] + other.v[1], v[2] + other.v[2], v[3] + other.v[3], } }; + } + Vec4F32 operator -(Vec4F32 other) const { + return Vec4F32{ { v[0] - other.v[0], v[1] - other.v[1], v[2] - other.v[2], v[3] - other.v[3], } }; + } + Vec4F32 operator *(Vec4F32 other) const { + return Vec4F32{ { v[0] * other.v[0], v[1] * other.v[1], v[2] * other.v[2], v[3] * other.v[3], } }; + } + Vec4F32 Min(Vec4F32 other) const { + Vec4F32 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = v[i] < other.v[i] ? v[i] : other.v[i]; + } + return temp; + } + Vec4F32 Max(Vec4F32 other) const { + Vec4F32 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = v[i] > other.v[i] ? 
v[i] : other.v[i]; + } + return temp; + } + void operator +=(Vec4F32 other) { + for (int i = 0; i < 4; i++) { + v[i] += other.v[i]; + } + } + void operator -=(Vec4F32 other) { + for (int i = 0; i < 4; i++) { + v[i] -= other.v[i]; + } + } + void operator *=(Vec4F32 other) { + for (int i = 0; i < 4; i++) { + v[i] *= other.v[i]; + } + } + void operator /=(Vec4F32 other) { + for (int i = 0; i < 4; i++) { + v[i] /= other.v[i]; + } + } + // void operator &=(Vec4S32 other) { v = vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(v), other.v)); } + Vec4F32 operator *(float f) const { + return Vec4F32{ { v[0] * f, v[1] * f, v[2] * f, v[3] * f } }; } - Vec4U16 operator |(Vec4U16 other) const { return Vec4U16{ { v[0] | other.v[0], v[1] | other.v[1], v[2] | other.v[2], v[3] | other.v[3], } }; } - Vec4U16 operator &(Vec4U16 other) const { return Vec4U16{ { v[0] & other.v[0], v[1] & other.v[1], v[2] & other.v[2], v[3] & other.v[3], } }; } - Vec4U16 operator ^(Vec4U16 other) const { return Vec4U16{ { v[0] ^ other.v[0], v[1] ^ other.v[1], v[2] ^ other.v[2], v[3] ^ other.v[3], } }; } + Vec4F32 Mul(float f) const { + return Vec4F32{ { v[0] * f, v[1] * f, v[2] * f, v[3] * f } }; + } -/* - Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ vmax_u16(v, other.v) }; } - Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ vmin_u16(v, other.v) }; } - Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ vclt_u16(v, other.v) }; } + Vec4F32 Recip() const { + return Vec4F32{ { 1.0f / v[0], 1.0f / v[1], 1.0f / v[2], 1.0f / v[3] } }; + } - Vec4U16 AndNot(Vec4U16 inverted) { return Vec4U16{ vand_u16(v, vmvn_u16(inverted.v)) }; } - */ + Vec4F32 RecipApprox() const { + return Vec4F32{ { 1.0f / v[0], 1.0f / v[1], 1.0f / v[2], 1.0f / v[3] } }; + } + + Vec4F32 Clamp(float lower, float higher) const { + Vec4F32 temp; + for (int i = 0; i < 4; i++) { + if (v[i] > higher) { + temp.v[i] = higher; + } else if (v[i] < lower) { + temp.v[i] = lower; + } else { + temp.v[i] = v[i]; + } + } + return temp; + } + + Vec4F32 WithLane3Zero() const { + return Vec4F32{ { v[0], v[1], v[2], 0.0f } }; + } + + Vec4F32 WithLane3One() const { + return Vec4F32{ { v[0], v[1], v[2], 1.0f } }; + } + + Vec4S32 CompareEq(Vec4F32 other) const { + Vec4S32 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = v[i] == other.v[i] ? 0xFFFFFFFF : 0; + } + return temp; + } + Vec4S32 CompareLt(Vec4F32 other) const { + Vec4S32 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = v[i] < other.v[i] ? 0xFFFFFFFF : 0; + } + return temp; + } + Vec4S32 CompareGt(Vec4F32 other) const { + Vec4S32 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = v[i] > other.v[i] ? 0xFFFFFFFF : 0; + } + return temp; + } + Vec4S32 CompareLe(Vec4F32 other) const { + Vec4S32 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = v[i] <= other.v[i] ? 0xFFFFFFFF : 0; + } + return temp; + } + Vec4S32 CompareGe(Vec4F32 other) const { + Vec4S32 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = v[i] >= other.v[i] ? 0xFFFFFFFF : 0; + } + return temp; + } + + // In-place transpose. Fast on SIMD, not ideal on not. 
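+		// A scalar 4x4 transpose needs each of the six off-diagonal element pairs swapped exactly once; swapping a pair a second time undoes the first swap.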
+ static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) { + std::swap(col0.v[1], col1.v[0]); + std::swap(col0.v[2], col2.v[0]); + std::swap(col0.v[3], col3.v[0]); + + std::swap(col1.v[0], col0.v[1]); + std::swap(col1.v[2], col2.v[1]); + std::swap(col1.v[3], col3.v[1]); + + std::swap(col2.v[0], col0.v[2]); + std::swap(col2.v[1], col1.v[2]); + std::swap(col2.v[3], col3.v[2]); + + std::swap(col3.v[0], col0.v[3]); + std::swap(col3.v[1], col1.v[3]); + std::swap(col3.v[2], col2.v[3]); + } + + inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) { + float x = m.m[0] * v[0] + m.m[4] * v[1] + m.m[8] * v[2] + m.m[12]; + float y = m.m[1] * v[0] + m.m[5] * v[1] + m.m[9] * v[2] + m.m[13]; + float z = m.m[2] * v[0] + m.m[6] * v[1] + m.m[10] * v[2] + m.m[14]; + + return Vec4F32{ { x, y, z, 1.0f } }; + } }; inline bool AnyZeroSignBit(Vec4S32 value) { @@ -913,10 +1094,65 @@ inline bool AnyZeroSignBit(Vec4S32 value) { return false; } -inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) { - return Vec4U16{ { (uint16_t)(v.v[0] >> 31), (uint16_t)(v.v[1] >> 31), (uint16_t)(v.v[2] >> 31), (uint16_t)(v.v[3] >> 31), } }; +inline bool AnyZeroSignBit(Vec4F32 value) { + for (int i = 0; i < 4; i++) { + if (value.v[i] >= 0.0f) { + return true; + } + } + return false; } +struct Vec4U16 { + uint16_t v[4]; // 64 bits. + + static Vec4U16 Zero() { return Vec4U16{}; } + static Vec4U16 Splat(uint16_t lane) { return Vec4U16{ { lane, lane, lane, lane } }; } + + static Vec4U16 Load(const uint16_t *mem) { return Vec4U16{ { mem[0], mem[1], mem[2], mem[3] }}; } + void Store(uint16_t *mem) { memcpy(mem, v, sizeof(v)); } + + static Vec4U16 FromVec4S32(Vec4S32 v) { + return Vec4U16{ { (uint16_t)v.v[0], (uint16_t)v.v[1], (uint16_t)v.v[2], (uint16_t)v.v[3] }}; + } + static Vec4U16 FromVec4F32(Vec4F32 v) { + return Vec4U16{ { (uint16_t)v.v[0], (uint16_t)v.v[1], (uint16_t)v.v[2], (uint16_t)v.v[3] }}; + } + + Vec4U16 operator |(Vec4U16 other) const { return Vec4U16{ { (uint16_t)(v[0] | other.v[0]), (uint16_t)(v[1] | other.v[1]), (uint16_t)(v[2] | other.v[2]), (uint16_t)(v[3] | other.v[3]), } }; } + Vec4U16 operator &(Vec4U16 other) const { return Vec4U16{ { (uint16_t)(v[0] & other.v[0]), (uint16_t)(v[1] & other.v[1]), (uint16_t)(v[2] & other.v[2]), (uint16_t)(v[3] & other.v[3]), } }; } + Vec4U16 operator ^(Vec4U16 other) const { return Vec4U16{ { (uint16_t) (v[0] ^ other.v[0]), (uint16_t)(v[1] ^ other.v[1]), (uint16_t)(v[2] ^ other.v[2]), (uint16_t)(v[3] ^ other.v[3]), } }; } + + Vec4U16 Max(Vec4U16 other) const { + Vec4U16 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = v[i] > other.v[i] ? v[i] : other.v[i]; + } + return temp; + } + Vec4U16 Min(Vec4U16 other) const { + Vec4U16 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = v[i] < other.v[i] ? v[i] : other.v[i]; + } + return temp; + } + Vec4U16 CompareLT(Vec4U16 other) const { + Vec4U16 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = v[i] < other.v[i] ? 
0xFFFF : 0; + } + return temp; + } + Vec4U16 AndNot(Vec4U16 other) const { + Vec4U16 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = v[i] & ~other.v[i]; + } + return temp; + } +}; + struct Vec8U16 { uint16_t v[8]; @@ -925,9 +1161,21 @@ struct Vec8U16 { value, value, value, value, value, value, value, value, }}; } - static Vec8U16 Load(const uint16_t *mem) { Vec8U16 tmp; memcpy(tmp.v, mem, sizeof(v)); } + static Vec8U16 Load(const uint16_t *mem) { Vec8U16 tmp; memcpy(tmp.v, mem, sizeof(v)); return tmp; } void Store(uint16_t *mem) { memcpy(mem, v, sizeof(v)); } }; +inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) { + return Vec4U16{ { (uint16_t)(v.v[0] >> 31), (uint16_t)(v.v[1] >> 31), (uint16_t)(v.v[2] >> 31), (uint16_t)(v.v[3] >> 31), } }; +} + +inline Vec4S32 Vec4S32FromF32(Vec4F32 f) { + return Vec4S32{ { (int32_t)f.v[0], (int32_t)f.v[1], (int32_t)f.v[2], (int32_t)f.v[3] } }; +} + +inline Vec4F32 Vec4F32FromS32(Vec4S32 f) { + return Vec4F32{ { (float)f.v[0], (float)f.v[1], (float)f.v[2], (float)f.v[3] } }; +} + #endif From acd5b24924e65cd9f87be2b542d12bb21efaaa28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Tue, 28 Jan 2025 10:39:04 +0100 Subject: [PATCH 6/8] Complete CrossSIMD non-simd fallback (although buggy, it seems). Minor ARM64 opt. --- Common/Math/CrossSIMD.h | 72 ++++++++++++++++++++++++++++++++++++++++- Core/Config.cpp | 11 +++++++ UI/EmuScreen.cpp | 2 ++ 3 files changed, 84 insertions(+), 1 deletion(-) diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index 24d851cde005..66c78a51b290 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -565,7 +565,12 @@ struct Vec4F32 { void operator +=(Vec4F32 other) { v = vaddq_f32(v, other.v); } void operator -=(Vec4F32 other) { v = vsubq_f32(v, other.v); } void operator *=(Vec4F32 other) { v = vmulq_f32(v, other.v); } +#if PPSSPP_ARCH(ARM64_NEON) + void operator /=(Vec4F32 other) { v = vdivq_f32(v, other.v); } +#else + // ARM32 doesn't have vdivq. void operator /=(Vec4F32 other) { v = vmulq_f32(v, other.Recip().v); } +#endif void operator &=(Vec4S32 other) { v = vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(v), other.v)); } Vec4F32 operator *(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; } @@ -775,6 +780,15 @@ struct Mat4F32 { float m[16]; }; +// The columns are consecutive but missing the last row (implied 0,0,0,1). +// This is just intermediate storage for multiplication. +struct Mat4x3F32 { + Mat4x3F32(const float *matrix) { + memcpy(m, matrix, 12 * sizeof(float)); + } + float m[12]; +}; + struct Vec4S32 { int32_t v[4]; @@ -982,7 +996,15 @@ struct Vec4F32 { v[i] /= other.v[i]; } } - // void operator &=(Vec4S32 other) { v = vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(v), other.v)); } + void operator &=(Vec4S32 other) { + // TODO: This can be done simpler, although with some ugly casts. + for (int i = 0; i < 4; i++) { + uint32_t val; + memcpy(&val, &v[i], 4); + val &= other.v[i]; + memcpy(&v[i], &val, 4); + } + } Vec4F32 operator *(float f) const { return Vec4F32{ { v[0] * f, v[1] * f, v[2] * f, v[3] * f } }; } @@ -1177,5 +1199,53 @@ inline Vec4F32 Vec4F32FromS32(Vec4S32 f) { return Vec4F32{ { (float)f.v[0], (float)f.v[1], (float)f.v[2], (float)f.v[3] } }; } +// Make sure the W component of scale is 1.0f. 
+inline void ScaleInplace(Mat4F32 &m, Vec4F32 scale) { + for (int i = 0; i < 4; i++) { + m.m[i * 4 + 0] *= scale.v[0]; + m.m[i * 4 + 1] *= scale.v[1]; + m.m[i * 4 + 2] *= scale.v[2]; + m.m[i * 4 + 3] *= scale.v[3]; + } +} + +inline void TranslateAndScaleInplace(Mat4F32 &m, Vec4F32 scale, Vec4F32 translate) { + for (int i = 0; i < 4; i++) { + m.m[i * 4 + 0] = m.m[i * 4 + 0] * scale.v[0] + translate.v[0] * m.m[i * 4 + 3]; + m.m[i * 4 + 1] = m.m[i * 4 + 1] * scale.v[1] + translate.v[1] * m.m[i * 4 + 3]; + m.m[i * 4 + 2] = m.m[i * 4 + 2] * scale.v[2] + translate.v[2] * m.m[i * 4 + 3]; + m.m[i * 4 + 3] = m.m[i * 4 + 3] * scale.v[3] + translate.v[3] * m.m[i * 4 + 3]; + } +} + +inline Mat4F32 Mul4x4By4x4(Mat4F32 a, Mat4F32 b) { + Mat4F32 result; + + for (int j = 0; j < 4; j++) { + for (int i = 0; i < 4; i++) { + float sum = 0.0f; + for (int k = 0; k < 4; k++) { + sum += b.m[i * 4 + k] * a.m[k * 4 + j]; + } + result.m[j * 4 + i] = sum; + } + } + return result; +} + +inline Mat4F32 Mul4x3By4x4(Mat4x3F32 a, Mat4F32 b) { + Mat4F32 result; + + for (int j = 0; j < 4; j++) { + for (int i = 0; i < 4; i++) { + float sum = 0.0f; + for (int k = 0; k < 3; k++) { + sum += b.m[i * 4 + k] * a.m[k * 3 + j]; + } + result.m[j * 4 + i] = sum + b.m[i * 4 + 3]; + } + } + return result; +} #endif diff --git a/Core/Config.cpp b/Core/Config.cpp index d6d15e22f34f..bad932ae0f01 100644 --- a/Core/Config.cpp +++ b/Core/Config.cpp @@ -140,11 +140,22 @@ std::string DefaultLangRegion() { } static int DefaultDepthRaster() { + +// For 64-bit ARM and x86 with SIMD, enable depth raster. +#if PPSSPP_ARCH(ARM64_NEON) || PPSSPP_ARCH(SSE2) + #if PPSSPP_PLATFORM(ANDROID) || PPSSPP_PLATFORM(IOS) return (int)DepthRasterMode::LOW_QUALITY; #else return (int)DepthRasterMode::DEFAULT; #endif + +#else + + // 32-bit ARM or no SIMD, the depth raster will be too slow. + return (int)DepthRasterMode::OFF; + +#endif } std::string CreateRandMAC() { diff --git a/UI/EmuScreen.cpp b/UI/EmuScreen.cpp index 95824d2ceb5b..695305a085e0 100644 --- a/UI/EmuScreen.cpp +++ b/UI/EmuScreen.cpp @@ -679,6 +679,8 @@ static void ShowFpsLimitNotice() { case FPSLimit::CUSTOM2: fpsLimit = g_Config.iFpsLimit2; break; + default: + break; } // Now display it. 
From 9caf37d28f742848d1764e6cdf5555cfd556f2b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Tue, 28 Jan 2025 10:53:18 +0100 Subject: [PATCH 7/8] Turn off depth raster if SIMD not available --- GPU/Common/DrawEngineCommon.cpp | 4 ++++ unittest/UnitTest.cpp | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp index 838281290667..4b44d43f44e4 100644 --- a/GPU/Common/DrawEngineCommon.cpp +++ b/GPU/Common/DrawEngineCommon.cpp @@ -56,6 +56,7 @@ DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) { decIndex_ = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); indexGen.Setup(decIndex_); +#if PPSSPP_ARCH(SSE2) || PPSSPP_ARCH(ARM_NEON) switch ((DepthRasterMode)g_Config.iDepthRasterMode) { case DepthRasterMode::DEFAULT: case DepthRasterMode::LOW_QUALITY: @@ -67,6 +68,9 @@ DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) { case DepthRasterMode::OFF: useDepthRaster_ = false; } +#else + useDepthRaster_ = false; +#endif if (useDepthRaster_) { depthDraws_.reserve(256); } diff --git a/unittest/UnitTest.cpp b/unittest/UnitTest.cpp index 475b785abd1f..d5cc9b050bac 100644 --- a/unittest/UnitTest.cpp +++ b/unittest/UnitTest.cpp @@ -466,7 +466,7 @@ bool TestVFPUSinCos() { return true; } -bool TestMatrixTranspose() { +bool TestVFPUMatrixTranspose() { MatrixSize sz = M_4x4; int matrix = 0; // M000 u8 cols[4]; @@ -489,6 +489,7 @@ bool TestMatrixTranspose() { return true; } +// TODO: Hook this up again! void TestGetMatrix(int matrix, MatrixSize sz) { INFO_LOG(Log::System, "Testing matrix %s", GetMatrixNotation(matrix, sz).c_str()); u8 fullMatrix[16]; @@ -1182,7 +1183,7 @@ TestItem availableTests[] = { TEST_ITEM(Parsers), TEST_ITEM(IRPassSimplify), TEST_ITEM(Jit), - TEST_ITEM(MatrixTranspose), + TEST_ITEM(VFPUMatrixTranspose), TEST_ITEM(ParseLBN), TEST_ITEM(QuickTexHash), TEST_ITEM(CLZ), From c78fa60431a2e21792d4cca37a693866112d9417 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Tue, 28 Jan 2025 10:56:52 +0100 Subject: [PATCH 8/8] Add better way to check if CrossSIMD has been natively implemented --- Common/Math/CrossSIMD.h | 2 ++ GPU/Common/DrawEngineCommon.cpp | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index 66c78a51b290..d554e20f6f70 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -745,6 +745,8 @@ struct Vec8U16 { #else +#define CROSSSIMD_SLOW 1 + // Fake SIMD by using scalar. struct Mat4F32 { diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp index 4b44d43f44e4..c27c79e52853 100644 --- a/GPU/Common/DrawEngineCommon.cpp +++ b/GPU/Common/DrawEngineCommon.cpp @@ -56,7 +56,9 @@ DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) { decIndex_ = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); indexGen.Setup(decIndex_); -#if PPSSPP_ARCH(SSE2) || PPSSPP_ARCH(ARM_NEON) +#ifdef CROSSSIMD_SLOW + useDepthRaster_ = false; +#else switch ((DepthRasterMode)g_Config.iDepthRasterMode) { case DepthRasterMode::DEFAULT: case DepthRasterMode::LOW_QUALITY: @@ -68,8 +70,6 @@ DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) { case DepthRasterMode::OFF: useDepthRaster_ = false; } -#else - useDepthRaster_ = false; #endif if (useDepthRaster_) { depthDraws_.reserve(256);
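
Note (not part of the patch series): the scalar fallback is intended to be API-compatible with the SSE2/NEON paths, so existing call sites compile unchanged and only speed-sensitive code needs to check CROSSSIMD_SLOW, as DrawEngineCommon now does. Below is a minimal, hypothetical sanity check of that fallback — the function name TestCrossSIMDScalarFallback and the expected values are illustrative assumptions, not taken from these patches — written in the same style as the existing unittest functions (e.g. TestVFPUMatrixTranspose):

#include <cstring>
#include "Common/Math/CrossSIMD.h"

// Hypothetical sanity check for the scalar CrossSIMD fallback; not part of these patches.
static bool TestCrossSIMDScalarFallback() {
	const float a[4] = { 1.0f, -2.0f, 3.0f, 4.0f };
	float out[4];

	// Same calls as on the SSE2/NEON paths: Load -> arithmetic -> Clamp -> Store.
	Vec4F32 v = Vec4F32::Load(a);
	v = (v * 2.0f).Clamp(-1.0f, 5.0f);
	v.Store(out);
	// 2*a clamped to [-1, 5] should give { 2, -1, 5, 5 }.
	if (out[0] != 2.0f || out[1] != -1.0f || out[2] != 5.0f || out[3] != 5.0f)
		return false;

#ifdef CROSSSIMD_SLOW
	// On the scalar path, multiplying the identity by itself must return the identity.
	const float ident[16] = { 1, 0, 0, 0,  0, 1, 0, 0,  0, 0, 1, 0,  0, 0, 0, 1 };
	Mat4F32 m(ident);
	float back[16];
	Mul4x4By4x4(m, m).Store(back);
	if (memcmp(back, ident, sizeof(ident)) != 0)
		return false;
#endif
	return true;
}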