From d31044103d3dacf0b8bbaa25696dcef7d354b660 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 24 Jan 2025 13:36:58 +0100 Subject: [PATCH 1/8] Add basic vscode .launch file, for easy debugging on mac --- .vscode/launch.json | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 .vscode/launch.json diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 000000000000..28b43427b97b --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,26 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "(lldb) Launch", + "type": "cppdbg", + "request": "launch", + "program": "", + "osx": { + "program": "${workspaceFolder}/build/PPSSPPSDL.app/Contents/MacOS/PPSSPPSDL" + }, + "linux": { + "program": "${workspaceRoot}/build/PPSSPPSDL" + }, + "args": [], + "stopAtEntry": false, + "cwd": "${workspaceFolder}", + "environment": [], + "externalConsole": false, + "MIMode": "lldb" + } + ] +} From 9d164b71fbb3ba6d9d9589f3b628c8c76dd98754 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 24 Jan 2025 15:02:22 +0100 Subject: [PATCH 2/8] Some new CrossSIMD operations --- Common/Math/CrossSIMD.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index a62f5d7e5e46..9f19d891232a 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -185,6 +185,19 @@ struct Vec4F32 { static Vec4F32 Load(const float *src) { return Vec4F32{ _mm_loadu_ps(src) }; } static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ _mm_load_ps(src) }; } + static Vec4F32 LoadS8Norm(const int8_t *src) { + __m128i value = _mm_set1_epi32(*((uint32_t *)src)); + __m128i value32 = _mm_unpacklo_epi16(_mm_unpacklo_epi8(value, value), value); + // Sign extension. A bit ugly without SSE4. + value32 = _mm_srai_epi32(value32, 24); + return Vec4F32 { _mm_mul_ps(_mm_cvtepi32_ps(value32), _mm_set1_ps(1.0f / 128.0f)) }; + } + static Vec4F32 LoadS16Norm(const int16_t *src) { // Divides by 32768.0f + __m128i bits = _mm_castpd_si128(_mm_load_sd((const double *)src)); + // Sign extension. A bit ugly without SSE4. 
+ bits = _mm_srai_epi32(_mm_unpacklo_epi16(bits, bits), 16); + return Vec4F32 { _mm_mul_ps(_mm_cvtepi32_ps(bits), _mm_set1_ps(1.0f / 32768.0f)) }; + } void Store(float *dst) { _mm_storeu_ps(dst, v); } void Store2(float *dst) { _mm_storel_epi64((__m128i *)dst, _mm_castps_si128(v)); } void StoreAligned (float *dst) { _mm_store_ps(dst, v); } @@ -506,6 +519,14 @@ struct Vec4F32 { static Vec4F32 Splat(float lane) { return Vec4F32{ vdupq_n_f32(lane) }; } static Vec4F32 Load(const float *src) { return Vec4F32{ vld1q_f32(src) }; } + static Vec4F32 LoadS8Norm(const int8_t *src) { + const int8x8_t value = (int8x8_t)vdup_n_u32(*((uint32_t *)src)); + const int16x8_t value16 = vmovl_s8(value); + return Vec4F32 { vcvtq_n_f32_s32(vmovl_s16(vget_low_u16(value16)), 7) }; + } + static Vec4F32 LoadS16Norm(const int16_t *src) { // Divides by 32768.0f + return Vec4F32 { vcvtq_n_f32_s32(vmovl_s16(vld1_s16(src)), 15) }; + } static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ vld1q_f32(src) }; } void Store(float *dst) { vst1q_f32(dst, v); } void Store2(float *dst) { vst1_f32(dst, vget_low_f32(v)); } @@ -727,12 +748,25 @@ struct Vec8U16 { struct Vec4S32 { s32 v[4]; + static Vec4F32 Zero() { return Vec4F32{ { 0.0f, 0.0f, 0.0f, 0.0f } }; } + static Vec4F32 Splat(float lane) { return Vec4F32{ { lane, lane, lane, lane } }; } + + static Vec4F32 Load(const float *src) { return Vec4F32{ { src[0], src[1], src[2], src[3] } }; } + static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ { src[0], src[1], src[2], src[3] } }; } + void Store(float *dst) { memcpy(dst, v, sizeof(Vec4S32)); } + void Store2(float *dst) { memcpy(dst, v, 2 * sizeof(s32)); } + void StoreAligned(float *dst) { memcpy(dst, v, sizeof(Vec4S32)); } + void Store3(float *dst) { memcpy(dst, v, 3 * sizeof(s32)); } + Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ { v[0] + other.v[0], v[1] + other.v[1], v[2] + other.v[2], v[3] + other.v[3], } }; } Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ { v[0] - other.v[0], v[1] - other.v[1], v[2] - other.v[2], v[3] - other.v[3], } }; } + Vec4S32 operator *(Vec4S32 other) const { + return Vec4S32{ { v[0] * other.v[0], v[1] * other.v[1], v[2] * other.v[2], v[3] * other.v[3], } }; + } }; #endif From 74501b06b6af1a650ffde3a14c3671c58339776f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 24 Jan 2025 16:12:21 +0100 Subject: [PATCH 3/8] CrossSIMD: Add more no-simd fallback types --- Common/Math/CrossSIMD.h | 175 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 168 insertions(+), 7 deletions(-) diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index 9f19d891232a..f70be31d85d7 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -699,7 +699,6 @@ inline bool AnyZeroSignBit(Vec4F32 value) { #endif } - struct Vec4U16 { uint16x4_t v; // 64 bits. 
@@ -745,9 +744,126 @@ struct Vec8U16 { #else + +struct Mat4F32 { + Mat4F32() {} + Mat4F32(const float *src) { + memcpy(m, src, sizeof(m));) + } + void Store(float *dest) { + memcpy(dest, m, sizeof(m)); + } + static Mat4F32 Load4x3(const float *src) { + m[0] = src[0]; + m[1] = src[1]; + m[2] = src[2]; + m[3] = 0.0f; + m[0] = src[3]; + m[1] = src[4]; + m[2] = src[5]; + m[3] = 0.0f; + m[0] = src[6]; + m[1] = src[7]; + m[2] = src[8]; + m[3] = 0.0f; + m[0] = src[9]; + m[1] = src[10]; + m[2] = src[11]; + m[3] = 1.0f; + return result; + } + + // cols are consecutive + float m[16]; +}; + struct Vec4S32 { s32 v[4]; + static Vec4S32 Zero() { return Vec4S32{ vdupq_n_s32(0) }; } + static Vec4S32 Splat(int lane) { return Vec4S32{ vdupq_n_s32(lane) }; } + + static Vec4S32 Load(const int *src) { return Vec4S32{ vld1q_s32(src) }; } + static Vec4S32 LoadAligned(const int *src) { return Vec4S32{ vld1q_s32(src) }; } + void Store(int *dst) { vst1q_s32(dst, v); } + void Store2(int *dst) { vst1_s32(dst, vget_low_s32(v)); } + void StoreAligned(int *dst) { vst1q_s32(dst, v); } + + // Warning: Unlike on x86, this is a full 32-bit multiplication. + Vec4S32 Mul16(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; } + + Vec4S32 SignExtend16() const { return Vec4S32{ vshrq_n_s32(vshlq_n_s32(v, 16), 16) }; } + // NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output (on SSE2 at least). + Vec4S32 Min16(Vec4S32 other) const { return Vec4S32{ vminq_s32(v, other.v) }; } + Vec4S32 Max16(Vec4S32 other) const { return Vec4S32{ vmaxq_s32(v, other.v) }; } + Vec4S32 FixupAfterMinMax() const { return Vec4S32{ v }; } + + // NOTE: May be slow. + int operator[](size_t index) const { return ((int *)&v)[index]; } + + Vec4S32 operator +(Vec4S32 other) const { + return Vec4S32{ { v[0] + other.v[0], v[1] + other.v[1], v[2] + other.v[2], v[3] + other.v[3], } }; + } + Vec4S32 operator -(Vec4S32 other) const { + return Vec4S32{ { v[0] - other.v[0], v[1] - other.v[1], v[2] - other.v[2], v[3] - other.v[3], } }; + } + Vec4S32 operator *(Vec4S32 other) const { + return Vec4S32{ { v[0] * other.v[0], v[1] * other.v[1], v[2] * other.v[2], v[3] * other.v[3], } }; + } + // TODO: Can optimize the bitwise ones with 64-bit operations. + Vec4S32 operator |(Vec4S32 other) const { + return Vec4S32{ { v[0] | other.v[0], v[1] | other.v[1], v[2] | other.v[2], v[3] | other.v[3], } }; + } + Vec4S32 operator &(Vec4S32 other) const { + return Vec4S32{ { v[0] & other.v[0], v[1] & other.v[1], v[2] & other.v[2], v[3] & other.v[3], } }; + } + Vec4S32 operator ^(Vec4S32 other) const { + return Vec4S32{ { v[0] ^ other.v[0], v[1] ^ other.v[1], v[2] ^ other.v[2], v[3] ^ other.v[3], } }; + } + Vec4S32 AndNot(Vec4S32 other) const { + return Vec4S32{ { v[0] & ~other.v[0], v[1] & ~other.v[1], v[2] & ~other.v[2], v[3] & ~other.v[3], } }; + } + Vec4S32 Mul(Vec4S32 other) const { return *this * other; } + void operator &=(Vec4S32 other) { for (int i = 0; i < 4; i++) v[i] &= other.v[i]; } + void operator +=(Vec4S32 other) { for (int i = 0; i < 4; i++) v[i] += other.v[i]; } + void operator -=(Vec4S32 other) { for (int i = 0; i < 4; i++) v[i] -= other.v[i]; } + + template + Vec4S32 Shl() const { return Vec4S32{ { v[0] << imm, v[1] << imm, v[2] << imm, v[3] << imm } }; } + + Vec4S32 CompareEq(Vec4S32 other) const { + Vec4S32 out; + for (int i = 0; i < 4; i++) { + out[i] = v[i] == other.v[i] ? 
0xFFFFFFFF : 0; + } + return out; + } + Vec4S32 CompareLt(Vec4S32 other) const { + Vec4S32 out; + for (int i = 0; i < 4; i++) { + out[i] = v[i] < other.v[i] ? 0xFFFFFFFF : 0; + } + return out; + } + Vec4S32 CompareGt(Vec4S32 other) const { + Vec4S32 out; + for (int i = 0; i < 4; i++) { + out[i] = v[i] > other.v[i] ? 0xFFFFFFFF : 0; + } + return out; + } + Vec4S32 CompareGtZero() const { + Vec4S32 out; + for (int i = 0; i < 4; i++) { + out[i] = v[i] > 0 ? 0xFFFFFFFF : 0; + } + return out; + } +}; + +struct Vec4F32 { + float v[4]; + static Vec4F32 Zero() { return Vec4F32{ { 0.0f, 0.0f, 0.0f, 0.0f } }; } static Vec4F32 Splat(float lane) { return Vec4F32{ { lane, lane, lane, lane } }; } @@ -757,16 +873,61 @@ struct Vec4S32 { void Store2(float *dst) { memcpy(dst, v, 2 * sizeof(s32)); } void StoreAligned(float *dst) { memcpy(dst, v, sizeof(Vec4S32)); } void Store3(float *dst) { memcpy(dst, v, 3 * sizeof(s32)); } +} - Vec4S32 operator +(Vec4S32 other) const { - return Vec4S32{ { v[0] + other.v[0], v[1] + other.v[1], v[2] + other.v[2], v[3] + other.v[3], } }; +struct Vec4U16 { + uint16_t v[4]; // 64 bits. + + static Vec4U16 Zero() { return Vec4U16{}; } + static Vec4U16 Splat(uint16_t lane) { return Vec4U16{ { lane, lane, lane, lane } }; } + + static Vec4U16 Load(const uint16_t *mem) { return Vec4U16{ { mem[0], mem[1], mem[2], mem[3] }}; } + void Store(uint16_t *mem) { memcpy(mem, 8, v); } + + static Vec4U16 FromVec4S32(Vec4S32 v) { + return Vec4U16{ { (uint16_t)v.v[0], (uint16_t)v.v[1], (uint16_t)v.v[2], (uint16_t)v.v[3] }}; } - Vec4S32 operator -(Vec4S32 other) const { - return Vec4S32{ { v[0] - other.v[0], v[1] - other.v[1], v[2] - other.v[2], v[3] - other.v[3], } }; + static Vec4U16 FromVec4F32(Vec4F32 v) { + return Vec4U16{ { (uint16_t)v.v[0], (uint16_t)v.v[1], (uint16_t)v.v[2], (uint16_t)v.v[3] }}; } - Vec4S32 operator *(Vec4S32 other) const { - return Vec4S32{ { v[0] * other.v[0], v[1] * other.v[1], v[2] * other.v[2], v[3] * other.v[3], } }; + + Vec4U16 operator |(Vec4U16 other) const { return Vec4U16{ { v[0] | other.v[0], v[1] | other.v[1], v[2] | other.v[2], v[3] | other.v[3], } }; } + Vec4U16 operator &(Vec4U16 other) const { return Vec4U16{ { v[0] & other.v[0], v[1] & other.v[1], v[2] & other.v[2], v[3] & other.v[3], } }; } + Vec4U16 operator ^(Vec4U16 other) const { return Vec4U16{ { v[0] ^ other.v[0], v[1] ^ other.v[1], v[2] ^ other.v[2], v[3] ^ other.v[3], } }; } + +/* + Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ vmax_u16(v, other.v) }; } + Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ vmin_u16(v, other.v) }; } + Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ vclt_u16(v, other.v) }; } + + Vec4U16 AndNot(Vec4U16 inverted) { return Vec4U16{ vand_u16(v, vmvn_u16(inverted.v)) }; } + */ +}; + +inline bool AnyZeroSignBit(Vec4S32 value) { + for (int i = 0; i < 4; i++) { + if (value.v[i] >= 0) { + return true; + } } + return false; +} + +inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) { + return Vec4U16{ { (uint16_t)(v.v[0] >> 31), (uint16_t)(v.v[1] >> 31), (uint16_t)(v.v[2] >> 31), (uint16_t)(v.v[3] >> 31), } }; +} + +struct Vec8U16 { + uint16_t v[8]; + + static Vec8U16 Zero() { return Vec8U16{}; } + static Vec8U16 Splat(uint16_t value) { return Vec8U16{ { + value, value, value, value, value, value, value, value, + }}; } + + static Vec8U16 Load(const uint16_t *mem) { Vec8U16 tmp; memcpy(tmp.v, mem, sizeof(v)); } + void Store(uint16_t *mem) { memcpy(mem, v, sizeof(v)); } }; + #endif From 32df2f7a038401ebf2dba4a3f063e5c2110f69ee Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sun, 26 Jan 2025 17:54:26 +0100 Subject: [PATCH 4/8] Remove unused functions --- GPU/Software/TransformUnit.cpp | 8 -------- GPU/Software/TransformUnit.h | 2 -- 2 files changed, 10 deletions(-) diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp index 63b4ae54d4dc..abcba1bb0d2a 100644 --- a/GPU/Software/TransformUnit.cpp +++ b/GPU/Software/TransformUnit.cpp @@ -155,14 +155,6 @@ WorldCoords TransformUnit::ModelToWorldNormal(const ModelCoords &coords) { return Norm3ByMatrix43(coords, gstate.worldMatrix); } -ViewCoords TransformUnit::WorldToView(const WorldCoords &coords) { - return Vec3ByMatrix43(coords, gstate.viewMatrix); -} - -ClipCoords TransformUnit::ViewToClip(const ViewCoords &coords) { - return Vec3ByMatrix44(coords, gstate.projMatrix); -} - template static ScreenCoords ClipToScreenInternal(Vec3f scaled, const ClipCoords &coords, bool *outside_range_flag) { ScreenCoords ret; diff --git a/GPU/Software/TransformUnit.h b/GPU/Software/TransformUnit.h index 95be3b2e1911..be859e4fe892 100644 --- a/GPU/Software/TransformUnit.h +++ b/GPU/Software/TransformUnit.h @@ -121,8 +121,6 @@ class TransformUnit { static WorldCoords ModelToWorldNormal(const ModelCoords& coords); static WorldCoords ModelToWorld(const ModelCoords& coords); - static ViewCoords WorldToView(const WorldCoords& coords); - static ClipCoords ViewToClip(const ViewCoords& coords); static ScreenCoords ClipToScreen(const ClipCoords &coords, bool *outsideRangeFlag); static inline DrawingCoords ScreenToDrawing(int x, int y) { DrawingCoords ret; From 2aaa1e5379e620a267dcd0ebb9244013ce4e5ef1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sun, 26 Jan 2025 17:55:18 +0100 Subject: [PATCH 5/8] CrossSIMD: Expand the no-simd path --- Common/Math/CrossSIMD.h | 400 ++++++++++++++++++++++++++++++++-------- 1 file changed, 324 insertions(+), 76 deletions(-) diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index f70be31d85d7..24d851cde005 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -207,8 +207,6 @@ struct Vec4F32 { _mm_store_ss(dst + 2, _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2))); } - static Vec4F32 LoadVec2(const float *src) { return Vec4F32{ _mm_castsi128_ps(_mm_loadl_epi64((const __m128i *)src)) }; } - static Vec4F32 LoadConvertS16(const int16_t *src) { // Note: will load 8 bytes __m128i value = _mm_loadl_epi64((const __m128i *)src); // 16-bit to 32-bit, use the upper words and an arithmetic shift right to sign extend @@ -250,7 +248,7 @@ struct Vec4F32 { Vec4F32 RecipApprox() const { return Vec4F32{ _mm_rcp_ps(v) }; } Vec4F32 Recip() const { return Vec4F32{ _mm_div_ps(_mm_set1_ps(1.0f), v) }; } - Vec4F32 Clamp(float lower, float higher) { + Vec4F32 Clamp(float lower, float higher) const { return Vec4F32{ _mm_min_ps(_mm_max_ps(v, _mm_set1_ps(lower)), _mm_set1_ps(higher)) }; @@ -537,8 +535,6 @@ struct Vec4F32 { dst[2] = vgetq_lane_f32(v, 2); } - static Vec4F32 LoadVec2(const float *src) { return Vec4F32{ vcombine_f32(vld1_f32(src), vdup_n_f32(0.0f)) }; } // TODO: Feels like there should be a better way. 
- static Vec4F32 LoadConvertS16(const int16_t *src) { int16x4_t value = vld1_s16(src); return Vec4F32{ vcvtq_f32_s32(vmovl_s16(value)) }; @@ -591,7 +587,7 @@ struct Vec4F32 { return Vec4F32{ recip }; } - Vec4F32 Clamp(float lower, float higher) { + Vec4F32 Clamp(float lower, float higher) const { return Vec4F32{ vminq_f32(vmaxq_f32(v, vdupq_n_f32(lower)), vdupq_n_f32(higher)) }; @@ -744,33 +740,35 @@ struct Vec8U16 { #else +// Fake SIMD by using scalar. struct Mat4F32 { Mat4F32() {} Mat4F32(const float *src) { - memcpy(m, src, sizeof(m));) + memcpy(m, src, sizeof(m)); } void Store(float *dest) { memcpy(dest, m, sizeof(m)); } static Mat4F32 Load4x3(const float *src) { - m[0] = src[0]; - m[1] = src[1]; - m[2] = src[2]; - m[3] = 0.0f; - m[0] = src[3]; - m[1] = src[4]; - m[2] = src[5]; - m[3] = 0.0f; - m[0] = src[6]; - m[1] = src[7]; - m[2] = src[8]; - m[3] = 0.0f; - m[0] = src[9]; - m[1] = src[10]; - m[2] = src[11]; - m[3] = 1.0f; - return result; + Mat4F32 mat; + mat.m[0] = src[0]; + mat.m[1] = src[1]; + mat.m[2] = src[2]; + mat.m[3] = 0.0f; + mat.m[0] = src[3]; + mat.m[1] = src[4]; + mat.m[2] = src[5]; + mat.m[3] = 0.0f; + mat.m[0] = src[6]; + mat.m[1] = src[7]; + mat.m[2] = src[8]; + mat.m[3] = 0.0f; + mat.m[0] = src[9]; + mat.m[1] = src[10]; + mat.m[2] = src[11]; + mat.m[3] = 1.0f; + return mat; } // cols are consecutive @@ -778,28 +776,45 @@ struct Mat4F32 { }; struct Vec4S32 { - s32 v[4]; + int32_t v[4]; - static Vec4S32 Zero() { return Vec4S32{ vdupq_n_s32(0) }; } - static Vec4S32 Splat(int lane) { return Vec4S32{ vdupq_n_s32(lane) }; } + static Vec4S32 Zero() { return Vec4S32{}; } + static Vec4S32 Splat(int lane) { return Vec4S32{ { lane, lane, lane, lane } }; } - static Vec4S32 Load(const int *src) { return Vec4S32{ vld1q_s32(src) }; } - static Vec4S32 LoadAligned(const int *src) { return Vec4S32{ vld1q_s32(src) }; } - void Store(int *dst) { vst1q_s32(dst, v); } - void Store2(int *dst) { vst1_s32(dst, vget_low_s32(v)); } - void StoreAligned(int *dst) { vst1q_s32(dst, v); } + static Vec4S32 Load(const int *src) { return Vec4S32{ { src[0], src[1], src[2], src[3] }}; } + static Vec4S32 LoadAligned(const int *src) { return Load(src); } + void Store(int *dst) { memcpy(dst, v, sizeof(v)); } + void Store2(int *dst) { memcpy(dst, v, sizeof(v[0]) * 2); } + void StoreAligned(int *dst) { memcpy(dst, v, sizeof(v)); } - // Warning: Unlike on x86, this is a full 32-bit multiplication. - Vec4S32 Mul16(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; } + // Warning: Unlike on x86 SSE2, this is a full 32-bit multiplication. + Vec4S32 Mul16(Vec4S32 other) const { return Vec4S32{ { v[0] * other.v[0], v[1] * other.v[1], v[2] * other.v[2], v[3] * other.v[3] } }; } - Vec4S32 SignExtend16() const { return Vec4S32{ vshrq_n_s32(vshlq_n_s32(v, 16), 16) }; } + Vec4S32 SignExtend16() const { + Vec4S32 tmp; + for (int i = 0; i < 4; i++) { + tmp.v[i] = (int32_t)(int16_t)v[i]; + } + return tmp; + } // NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output (on SSE2 at least). - Vec4S32 Min16(Vec4S32 other) const { return Vec4S32{ vminq_s32(v, other.v) }; } - Vec4S32 Max16(Vec4S32 other) const { return Vec4S32{ vmaxq_s32(v, other.v) }; } - Vec4S32 FixupAfterMinMax() const { return Vec4S32{ v }; } + Vec4S32 Min16(Vec4S32 other) const { + Vec4S32 tmp; + for (int i = 0; i < 4; i++) { + tmp.v[i] = other.v[i] < v[i] ? 
other.v[i] : v[i]; + } + return tmp; + } + Vec4S32 Max16(Vec4S32 other) const { + Vec4S32 tmp; + for (int i = 0; i < 4; i++) { + tmp.v[i] = other.v[i] > v[i] ? other.v[i] : v[i]; + } + return tmp; + } + Vec4S32 FixupAfterMinMax() const { return *this; } - // NOTE: May be slow. - int operator[](size_t index) const { return ((int *)&v)[index]; } + int operator[](size_t index) const { return v[index]; } Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ { v[0] + other.v[0], v[1] + other.v[1], v[2] + other.v[2], v[3] + other.v[3], } }; @@ -824,6 +839,7 @@ struct Vec4S32 { return Vec4S32{ { v[0] & ~other.v[0], v[1] & ~other.v[1], v[2] & ~other.v[2], v[3] & ~other.v[3], } }; } Vec4S32 Mul(Vec4S32 other) const { return *this * other; } + void operator &=(Vec4S32 other) { for (int i = 0; i < 4; i++) v[i] &= other.v[i]; } void operator +=(Vec4S32 other) { for (int i = 0; i < 4; i++) v[i] += other.v[i]; } void operator -=(Vec4S32 other) { for (int i = 0; i < 4; i++) v[i] -= other.v[i]; } @@ -831,31 +847,31 @@ struct Vec4S32 { template Vec4S32 Shl() const { return Vec4S32{ { v[0] << imm, v[1] << imm, v[2] << imm, v[3] << imm } }; } - Vec4S32 CompareEq(Vec4S32 other) const { + Vec4S32 CompareEq(Vec4S32 other) const { Vec4S32 out; for (int i = 0; i < 4; i++) { - out[i] = v[i] == other.v[i] ? 0xFFFFFFFF : 0; + out.v[i] = v[i] == other.v[i] ? 0xFFFFFFFF : 0; } return out; } - Vec4S32 CompareLt(Vec4S32 other) const { + Vec4S32 CompareLt(Vec4S32 other) const { Vec4S32 out; for (int i = 0; i < 4; i++) { - out[i] = v[i] < other.v[i] ? 0xFFFFFFFF : 0; + out.v[i] = v[i] < other.v[i] ? 0xFFFFFFFF : 0; } return out; } - Vec4S32 CompareGt(Vec4S32 other) const { + Vec4S32 CompareGt(Vec4S32 other) const { Vec4S32 out; for (int i = 0; i < 4; i++) { - out[i] = v[i] > other.v[i] ? 0xFFFFFFFF : 0; + out.v[i] = v[i] > other.v[i] ? 0xFFFFFFFF : 0; } return out; } - Vec4S32 CompareGtZero() const { + Vec4S32 CompareGtZero() const { Vec4S32 out; for (int i = 0; i < 4; i++) { - out[i] = v[i] > 0 ? 0xFFFFFFFF : 0; + out.v[i] = v[i] > 0 ? 0xFFFFFFFF : 0; } return out; } @@ -864,44 +880,209 @@ struct Vec4S32 { struct Vec4F32 { float v[4]; - static Vec4F32 Zero() { return Vec4F32{ { 0.0f, 0.0f, 0.0f, 0.0f } }; } + static Vec4F32 Zero() { return Vec4F32{}; } static Vec4F32 Splat(float lane) { return Vec4F32{ { lane, lane, lane, lane } }; } static Vec4F32 Load(const float *src) { return Vec4F32{ { src[0], src[1], src[2], src[3] } }; } static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ { src[0], src[1], src[2], src[3] } }; } - void Store(float *dst) { memcpy(dst, v, sizeof(Vec4S32)); } - void Store2(float *dst) { memcpy(dst, v, 2 * sizeof(s32)); } - void StoreAligned(float *dst) { memcpy(dst, v, sizeof(Vec4S32)); } - void Store3(float *dst) { memcpy(dst, v, 3 * sizeof(s32)); } -} + static Vec4F32 LoadS8Norm(const int8_t *src) { + Vec4F32 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = (float)src[i] * (1.0f / 128.0f); + } + return temp; + } + static Vec4F32 LoadS16Norm(const int16_t *src) { // Divides by 32768.0f + Vec4F32 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = (float)src[i] * (1.0f / 32768.0f); + } + return temp; + } + void Store(float *dst) { memcpy(dst, v, sizeof(v)); } + void Store2(float *dst) { memcpy(dst, v, sizeof(v[0]) * 2); } + void StoreAligned(float *dst) { memcpy(dst, v, sizeof(v)); } + void Store3(float *dst) { + memcpy(dst, v, sizeof(v[0]) * 3); + } -struct Vec4U16 { - uint16_t v[4]; // 64 bits. 
+ static Vec4F32 LoadConvertS16(const int16_t *src) { + Vec4F32 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = (float)src[i]; + } + return temp; + } - static Vec4U16 Zero() { return Vec4U16{}; } - static Vec4U16 Splat(uint16_t lane) { return Vec4U16{ { lane, lane, lane, lane } }; } + static Vec4F32 LoadConvertS8(const int8_t *src) { // Note: will load 8 bytes, not 4. Only the first 4 bytes will be used. + Vec4F32 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = (float)src[i]; + } + return temp; + } - static Vec4U16 Load(const uint16_t *mem) { return Vec4U16{ { mem[0], mem[1], mem[2], mem[3] }}; } - void Store(uint16_t *mem) { memcpy(mem, 8, v); } + static Vec4F32 LoadF24x3_One(const uint32_t *src) { + uint32_t shifted[4] = { src[0] << 8, src[1] << 8, src[2] << 8, 0 }; + Vec4F32 temp; + memcpy(temp.v, shifted, sizeof(temp.v)); + return temp; + } - static Vec4U16 FromVec4S32(Vec4S32 v) { - return Vec4U16{ { (uint16_t)v.v[0], (uint16_t)v.v[1], (uint16_t)v.v[2], (uint16_t)v.v[3] }}; + static Vec4F32 FromVec4S32(Vec4S32 src) { + Vec4F32 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = (float)src[i]; + } + return temp; } - static Vec4U16 FromVec4F32(Vec4F32 v) { - return Vec4U16{ { (uint16_t)v.v[0], (uint16_t)v.v[1], (uint16_t)v.v[2], (uint16_t)v.v[3] }}; + + float operator[](size_t index) const { return v[index]; } + + Vec4F32 operator +(Vec4F32 other) const { + return Vec4F32{ { v[0] + other.v[0], v[1] + other.v[1], v[2] + other.v[2], v[3] + other.v[3], } }; + } + Vec4F32 operator -(Vec4F32 other) const { + return Vec4F32{ { v[0] - other.v[0], v[1] - other.v[1], v[2] - other.v[2], v[3] - other.v[3], } }; + } + Vec4F32 operator *(Vec4F32 other) const { + return Vec4F32{ { v[0] * other.v[0], v[1] * other.v[1], v[2] * other.v[2], v[3] * other.v[3], } }; + } + Vec4F32 Min(Vec4F32 other) const { + Vec4F32 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = v[i] < other.v[i] ? v[i] : other.v[i]; + } + return temp; + } + Vec4F32 Max(Vec4F32 other) const { + Vec4F32 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = v[i] > other.v[i] ? 
v[i] : other.v[i]; + } + return temp; + } + void operator +=(Vec4F32 other) { + for (int i = 0; i < 4; i++) { + v[i] += other.v[i]; + } + } + void operator -=(Vec4F32 other) { + for (int i = 0; i < 4; i++) { + v[i] -= other.v[i]; + } + } + void operator *=(Vec4F32 other) { + for (int i = 0; i < 4; i++) { + v[i] *= other.v[i]; + } + } + void operator /=(Vec4F32 other) { + for (int i = 0; i < 4; i++) { + v[i] /= other.v[i]; + } + } + // void operator &=(Vec4S32 other) { v = vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(v), other.v)); } + Vec4F32 operator *(float f) const { + return Vec4F32{ { v[0] * f, v[1] * f, v[2] * f, v[3] * f } }; } - Vec4U16 operator |(Vec4U16 other) const { return Vec4U16{ { v[0] | other.v[0], v[1] | other.v[1], v[2] | other.v[2], v[3] | other.v[3], } }; } - Vec4U16 operator &(Vec4U16 other) const { return Vec4U16{ { v[0] & other.v[0], v[1] & other.v[1], v[2] & other.v[2], v[3] & other.v[3], } }; } - Vec4U16 operator ^(Vec4U16 other) const { return Vec4U16{ { v[0] ^ other.v[0], v[1] ^ other.v[1], v[2] ^ other.v[2], v[3] ^ other.v[3], } }; } + Vec4F32 Mul(float f) const { + return Vec4F32{ { v[0] * f, v[1] * f, v[2] * f, v[3] * f } }; + } -/* - Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ vmax_u16(v, other.v) }; } - Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ vmin_u16(v, other.v) }; } - Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ vclt_u16(v, other.v) }; } + Vec4F32 Recip() const { + return Vec4F32{ { 1.0f / v[0], 1.0f / v[1], 1.0f / v[2], 1.0f / v[3] } }; + } - Vec4U16 AndNot(Vec4U16 inverted) { return Vec4U16{ vand_u16(v, vmvn_u16(inverted.v)) }; } - */ + Vec4F32 RecipApprox() const { + return Vec4F32{ { 1.0f / v[0], 1.0f / v[1], 1.0f / v[2], 1.0f / v[3] } }; + } + + Vec4F32 Clamp(float lower, float higher) const { + Vec4F32 temp; + for (int i = 0; i < 4; i++) { + if (v[i] > higher) { + temp.v[i] = higher; + } else if (v[i] < lower) { + temp.v[i] = lower; + } else { + temp.v[i] = v[i]; + } + } + return temp; + } + + Vec4F32 WithLane3Zero() const { + return Vec4F32{ { v[0], v[1], v[2], 0.0f } }; + } + + Vec4F32 WithLane3One() const { + return Vec4F32{ { v[0], v[1], v[2], 1.0f } }; + } + + Vec4S32 CompareEq(Vec4F32 other) const { + Vec4S32 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = v[i] == other.v[i] ? 0xFFFFFFFF : 0; + } + return temp; + } + Vec4S32 CompareLt(Vec4F32 other) const { + Vec4S32 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = v[i] < other.v[i] ? 0xFFFFFFFF : 0; + } + return temp; + } + Vec4S32 CompareGt(Vec4F32 other) const { + Vec4S32 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = v[i] > other.v[i] ? 0xFFFFFFFF : 0; + } + return temp; + } + Vec4S32 CompareLe(Vec4F32 other) const { + Vec4S32 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = v[i] <= other.v[i] ? 0xFFFFFFFF : 0; + } + return temp; + } + Vec4S32 CompareGe(Vec4F32 other) const { + Vec4S32 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = v[i] >= other.v[i] ? 0xFFFFFFFF : 0; + } + return temp; + } + + // In-place transpose. Fast on SIMD, not ideal on not. 
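+		// A scalar 4x4 transpose needs each of the six off-diagonal element pairs swapped exactly once; swapping a pair a second time undoes the first swap.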
+ static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) { + std::swap(col0.v[1], col1.v[0]); + std::swap(col0.v[2], col2.v[0]); + std::swap(col0.v[3], col3.v[0]); + + std::swap(col1.v[0], col0.v[1]); + std::swap(col1.v[2], col2.v[1]); + std::swap(col1.v[3], col3.v[1]); + + std::swap(col2.v[0], col0.v[2]); + std::swap(col2.v[1], col1.v[2]); + std::swap(col2.v[3], col3.v[2]); + + std::swap(col3.v[0], col0.v[3]); + std::swap(col3.v[1], col1.v[3]); + std::swap(col3.v[2], col2.v[3]); + } + + inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) { + float x = m.m[0] * v[0] + m.m[4] * v[1] + m.m[8] * v[2] + m.m[12]; + float y = m.m[1] * v[0] + m.m[5] * v[1] + m.m[9] * v[2] + m.m[13]; + float z = m.m[2] * v[0] + m.m[6] * v[1] + m.m[10] * v[2] + m.m[14]; + + return Vec4F32{ { x, y, z, 1.0f } }; + } }; inline bool AnyZeroSignBit(Vec4S32 value) { @@ -913,10 +1094,65 @@ inline bool AnyZeroSignBit(Vec4S32 value) { return false; } -inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) { - return Vec4U16{ { (uint16_t)(v.v[0] >> 31), (uint16_t)(v.v[1] >> 31), (uint16_t)(v.v[2] >> 31), (uint16_t)(v.v[3] >> 31), } }; +inline bool AnyZeroSignBit(Vec4F32 value) { + for (int i = 0; i < 4; i++) { + if (value.v[i] >= 0.0f) { + return true; + } + } + return false; } +struct Vec4U16 { + uint16_t v[4]; // 64 bits. + + static Vec4U16 Zero() { return Vec4U16{}; } + static Vec4U16 Splat(uint16_t lane) { return Vec4U16{ { lane, lane, lane, lane } }; } + + static Vec4U16 Load(const uint16_t *mem) { return Vec4U16{ { mem[0], mem[1], mem[2], mem[3] }}; } + void Store(uint16_t *mem) { memcpy(mem, v, sizeof(v)); } + + static Vec4U16 FromVec4S32(Vec4S32 v) { + return Vec4U16{ { (uint16_t)v.v[0], (uint16_t)v.v[1], (uint16_t)v.v[2], (uint16_t)v.v[3] }}; + } + static Vec4U16 FromVec4F32(Vec4F32 v) { + return Vec4U16{ { (uint16_t)v.v[0], (uint16_t)v.v[1], (uint16_t)v.v[2], (uint16_t)v.v[3] }}; + } + + Vec4U16 operator |(Vec4U16 other) const { return Vec4U16{ { (uint16_t)(v[0] | other.v[0]), (uint16_t)(v[1] | other.v[1]), (uint16_t)(v[2] | other.v[2]), (uint16_t)(v[3] | other.v[3]), } }; } + Vec4U16 operator &(Vec4U16 other) const { return Vec4U16{ { (uint16_t)(v[0] & other.v[0]), (uint16_t)(v[1] & other.v[1]), (uint16_t)(v[2] & other.v[2]), (uint16_t)(v[3] & other.v[3]), } }; } + Vec4U16 operator ^(Vec4U16 other) const { return Vec4U16{ { (uint16_t) (v[0] ^ other.v[0]), (uint16_t)(v[1] ^ other.v[1]), (uint16_t)(v[2] ^ other.v[2]), (uint16_t)(v[3] ^ other.v[3]), } }; } + + Vec4U16 Max(Vec4U16 other) const { + Vec4U16 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = v[i] > other.v[i] ? v[i] : other.v[i]; + } + return temp; + } + Vec4U16 Min(Vec4U16 other) const { + Vec4U16 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = v[i] < other.v[i] ? v[i] : other.v[i]; + } + return temp; + } + Vec4U16 CompareLT(Vec4U16 other) const { + Vec4U16 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = v[i] < other.v[i] ? 
0xFFFF : 0; + } + return temp; + } + Vec4U16 AndNot(Vec4U16 other) const { + Vec4U16 temp; + for (int i = 0; i < 4; i++) { + temp.v[i] = v[i] & ~other.v[i]; + } + return temp; + } +}; + struct Vec8U16 { uint16_t v[8]; @@ -925,9 +1161,21 @@ struct Vec8U16 { value, value, value, value, value, value, value, value, }}; } - static Vec8U16 Load(const uint16_t *mem) { Vec8U16 tmp; memcpy(tmp.v, mem, sizeof(v)); } + static Vec8U16 Load(const uint16_t *mem) { Vec8U16 tmp; memcpy(tmp.v, mem, sizeof(v)); return tmp; } void Store(uint16_t *mem) { memcpy(mem, v, sizeof(v)); } }; +inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) { + return Vec4U16{ { (uint16_t)(v.v[0] >> 31), (uint16_t)(v.v[1] >> 31), (uint16_t)(v.v[2] >> 31), (uint16_t)(v.v[3] >> 31), } }; +} + +inline Vec4S32 Vec4S32FromF32(Vec4F32 f) { + return Vec4S32{ { (int32_t)f.v[0], (int32_t)f.v[1], (int32_t)f.v[2], (int32_t)f.v[3] } }; +} + +inline Vec4F32 Vec4F32FromS32(Vec4S32 f) { + return Vec4F32{ { (float)f.v[0], (float)f.v[1], (float)f.v[2], (float)f.v[3] } }; +} + #endif From acd5b24924e65cd9f87be2b542d12bb21efaaa28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Tue, 28 Jan 2025 10:39:04 +0100 Subject: [PATCH 6/8] Complete CrossSIMD non-simd fallback (although buggy, it seems). Minor ARM64 opt. --- Common/Math/CrossSIMD.h | 72 ++++++++++++++++++++++++++++++++++++++++- Core/Config.cpp | 11 +++++++ UI/EmuScreen.cpp | 2 ++ 3 files changed, 84 insertions(+), 1 deletion(-) diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index 24d851cde005..66c78a51b290 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -565,7 +565,12 @@ struct Vec4F32 { void operator +=(Vec4F32 other) { v = vaddq_f32(v, other.v); } void operator -=(Vec4F32 other) { v = vsubq_f32(v, other.v); } void operator *=(Vec4F32 other) { v = vmulq_f32(v, other.v); } +#if PPSSPP_ARCH(ARM64_NEON) + void operator /=(Vec4F32 other) { v = vdivq_f32(v, other.v); } +#else + // ARM32 doesn't have vdivq. void operator /=(Vec4F32 other) { v = vmulq_f32(v, other.Recip().v); } +#endif void operator &=(Vec4S32 other) { v = vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(v), other.v)); } Vec4F32 operator *(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; } @@ -775,6 +780,15 @@ struct Mat4F32 { float m[16]; }; +// The columns are consecutive but missing the last row (implied 0,0,0,1). +// This is just intermediate storage for multiplication. +struct Mat4x3F32 { + Mat4x3F32(const float *matrix) { + memcpy(m, matrix, 12 * sizeof(float)); + } + float m[12]; +}; + struct Vec4S32 { int32_t v[4]; @@ -982,7 +996,15 @@ struct Vec4F32 { v[i] /= other.v[i]; } } - // void operator &=(Vec4S32 other) { v = vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(v), other.v)); } + void operator &=(Vec4S32 other) { + // TODO: This can be done simpler, although with some ugly casts. + for (int i = 0; i < 4; i++) { + uint32_t val; + memcpy(&val, &v[i], 4); + val &= other.v[i]; + memcpy(&v[i], &val, 4); + } + } Vec4F32 operator *(float f) const { return Vec4F32{ { v[0] * f, v[1] * f, v[2] * f, v[3] * f } }; } @@ -1177,5 +1199,53 @@ inline Vec4F32 Vec4F32FromS32(Vec4S32 f) { return Vec4F32{ { (float)f.v[0], (float)f.v[1], (float)f.v[2], (float)f.v[3] } }; } +// Make sure the W component of scale is 1.0f. 
+inline void ScaleInplace(Mat4F32 &m, Vec4F32 scale) { + for (int i = 0; i < 4; i++) { + m.m[i * 4 + 0] *= scale.v[0]; + m.m[i * 4 + 1] *= scale.v[1]; + m.m[i * 4 + 2] *= scale.v[2]; + m.m[i * 4 + 3] *= scale.v[3]; + } +} + +inline void TranslateAndScaleInplace(Mat4F32 &m, Vec4F32 scale, Vec4F32 translate) { + for (int i = 0; i < 4; i++) { + m.m[i * 4 + 0] = m.m[i * 4 + 0] * scale.v[0] + translate.v[0] * m.m[i * 4 + 3]; + m.m[i * 4 + 1] = m.m[i * 4 + 1] * scale.v[1] + translate.v[1] * m.m[i * 4 + 3]; + m.m[i * 4 + 2] = m.m[i * 4 + 2] * scale.v[2] + translate.v[2] * m.m[i * 4 + 3]; + m.m[i * 4 + 3] = m.m[i * 4 + 3] * scale.v[3] + translate.v[3] * m.m[i * 4 + 3]; + } +} + +inline Mat4F32 Mul4x4By4x4(Mat4F32 a, Mat4F32 b) { + Mat4F32 result; + + for (int j = 0; j < 4; j++) { + for (int i = 0; i < 4; i++) { + float sum = 0.0f; + for (int k = 0; k < 4; k++) { + sum += b.m[i * 4 + k] * a.m[k * 4 + j]; + } + result.m[j * 4 + i] = sum; + } + } + return result; +} + +inline Mat4F32 Mul4x3By4x4(Mat4x3F32 a, Mat4F32 b) { + Mat4F32 result; + + for (int j = 0; j < 4; j++) { + for (int i = 0; i < 4; i++) { + float sum = 0.0f; + for (int k = 0; k < 3; k++) { + sum += b.m[i * 4 + k] * a.m[k * 3 + j]; + } + result.m[j * 4 + i] = sum + b.m[i * 4 + 3]; + } + } + return result; +} #endif diff --git a/Core/Config.cpp b/Core/Config.cpp index d6d15e22f34f..bad932ae0f01 100644 --- a/Core/Config.cpp +++ b/Core/Config.cpp @@ -140,11 +140,22 @@ std::string DefaultLangRegion() { } static int DefaultDepthRaster() { + +// For 64-bit ARM and x86 with SIMD, enable depth raster. +#if PPSSPP_ARCH(ARM64_NEON) || PPSSPP_ARCH(SSE2) + #if PPSSPP_PLATFORM(ANDROID) || PPSSPP_PLATFORM(IOS) return (int)DepthRasterMode::LOW_QUALITY; #else return (int)DepthRasterMode::DEFAULT; #endif + +#else + + // 32-bit ARM or no SIMD, the depth raster will be too slow. + return (int)DepthRasterMode::OFF; + +#endif } std::string CreateRandMAC() { diff --git a/UI/EmuScreen.cpp b/UI/EmuScreen.cpp index 95824d2ceb5b..695305a085e0 100644 --- a/UI/EmuScreen.cpp +++ b/UI/EmuScreen.cpp @@ -679,6 +679,8 @@ static void ShowFpsLimitNotice() { case FPSLimit::CUSTOM2: fpsLimit = g_Config.iFpsLimit2; break; + default: + break; } // Now display it. 
From 9caf37d28f742848d1764e6cdf5555cfd556f2b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Tue, 28 Jan 2025 10:53:18 +0100 Subject: [PATCH 7/8] Turn off depth raster if SIMD not available --- GPU/Common/DrawEngineCommon.cpp | 4 ++++ unittest/UnitTest.cpp | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp index 838281290667..4b44d43f44e4 100644 --- a/GPU/Common/DrawEngineCommon.cpp +++ b/GPU/Common/DrawEngineCommon.cpp @@ -56,6 +56,7 @@ DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) { decIndex_ = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); indexGen.Setup(decIndex_); +#if PPSSPP_ARCH(SSE2) || PPSSPP_ARCH(ARM_NEON) switch ((DepthRasterMode)g_Config.iDepthRasterMode) { case DepthRasterMode::DEFAULT: case DepthRasterMode::LOW_QUALITY: @@ -67,6 +68,9 @@ DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) { case DepthRasterMode::OFF: useDepthRaster_ = false; } +#else + useDepthRaster_ = false; +#endif if (useDepthRaster_) { depthDraws_.reserve(256); } diff --git a/unittest/UnitTest.cpp b/unittest/UnitTest.cpp index 475b785abd1f..d5cc9b050bac 100644 --- a/unittest/UnitTest.cpp +++ b/unittest/UnitTest.cpp @@ -466,7 +466,7 @@ bool TestVFPUSinCos() { return true; } -bool TestMatrixTranspose() { +bool TestVFPUMatrixTranspose() { MatrixSize sz = M_4x4; int matrix = 0; // M000 u8 cols[4]; @@ -489,6 +489,7 @@ bool TestMatrixTranspose() { return true; } +// TODO: Hook this up again! void TestGetMatrix(int matrix, MatrixSize sz) { INFO_LOG(Log::System, "Testing matrix %s", GetMatrixNotation(matrix, sz).c_str()); u8 fullMatrix[16]; @@ -1182,7 +1183,7 @@ TestItem availableTests[] = { TEST_ITEM(Parsers), TEST_ITEM(IRPassSimplify), TEST_ITEM(Jit), - TEST_ITEM(MatrixTranspose), + TEST_ITEM(VFPUMatrixTranspose), TEST_ITEM(ParseLBN), TEST_ITEM(QuickTexHash), TEST_ITEM(CLZ), From c78fa60431a2e21792d4cca37a693866112d9417 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Tue, 28 Jan 2025 10:56:52 +0100 Subject: [PATCH 8/8] Add better way to check if CrossSIMD has been natively implemented --- Common/Math/CrossSIMD.h | 2 ++ GPU/Common/DrawEngineCommon.cpp | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index 66c78a51b290..d554e20f6f70 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -745,6 +745,8 @@ struct Vec8U16 { #else +#define CROSSSIMD_SLOW 1 + // Fake SIMD by using scalar. struct Mat4F32 { diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp index 4b44d43f44e4..c27c79e52853 100644 --- a/GPU/Common/DrawEngineCommon.cpp +++ b/GPU/Common/DrawEngineCommon.cpp @@ -56,7 +56,9 @@ DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) { decIndex_ = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); indexGen.Setup(decIndex_); -#if PPSSPP_ARCH(SSE2) || PPSSPP_ARCH(ARM_NEON) +#ifdef CROSSSIMD_SLOW + useDepthRaster_ = false; +#else switch ((DepthRasterMode)g_Config.iDepthRasterMode) { case DepthRasterMode::DEFAULT: case DepthRasterMode::LOW_QUALITY: @@ -68,8 +70,6 @@ DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) { case DepthRasterMode::OFF: useDepthRaster_ = false; } -#else - useDepthRaster_ = false; #endif if (useDepthRaster_) { depthDraws_.reserve(256);
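
Note (not part of the patch series): the scalar fallback is intended to be API-compatible with the SSE2/NEON paths, so existing call sites compile unchanged and only speed-sensitive code needs to check CROSSSIMD_SLOW, as DrawEngineCommon now does. Below is a minimal, hypothetical sanity check of that fallback — the function name TestCrossSIMDScalarFallback and the expected values are illustrative assumptions, not taken from these patches — written in the same style as the existing unittest functions (e.g. TestVFPUMatrixTranspose):

#include <cstring>
#include "Common/Math/CrossSIMD.h"

// Hypothetical sanity check for the scalar CrossSIMD fallback; not part of these patches.
static bool TestCrossSIMDScalarFallback() {
	const float a[4] = { 1.0f, -2.0f, 3.0f, 4.0f };
	float out[4];

	// Same calls as on the SSE2/NEON paths: Load -> arithmetic -> Clamp -> Store.
	Vec4F32 v = Vec4F32::Load(a);
	v = (v * 2.0f).Clamp(-1.0f, 5.0f);
	v.Store(out);
	// 2*a clamped to [-1, 5] should give { 2, -1, 5, 5 }.
	if (out[0] != 2.0f || out[1] != -1.0f || out[2] != 5.0f || out[3] != 5.0f)
		return false;

#ifdef CROSSSIMD_SLOW
	// On the scalar path, multiplying the identity by itself must return the identity.
	const float ident[16] = { 1, 0, 0, 0,  0, 1, 0, 0,  0, 0, 1, 0,  0, 0, 0, 1 };
	Mat4F32 m(ident);
	float back[16];
	Mul4x4By4x4(m, m).Store(back);
	if (memcmp(back, ident, sizeof(ident)) != 0)
		return false;
#endif
	return true;
}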