Skip to content

Commit

Permalink
feat: reworking simd support
Browse files Browse the repository at this point in the history
Signed-off-by: Michael Pollind <[email protected]>
  • Loading branch information
pollend committed Aug 2, 2024
1 parent a273901 commit 64f15ce
Show file tree
Hide file tree
Showing 42 changed files with 3,805 additions and 1,563 deletions.
113 changes: 61 additions & 52 deletions Forge/Math/Internal/SimdTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@
#define TF_SIMDI_MAX 0xFFFFFFFF
#define TF_SIMDF_MAX 0xFFFFFFFF

typedef __m128 TSimdFloat32x4;
typedef __m128i TSimdInt32x4;
typedef __m128 Tsimd_f32x4_t;
typedef __m128i Tsimd_i32x4_t;

typedef __m128 TSimdFloat32x3;
typedef __m128i TSimdInt32x3;
typedef __m128 Tsimd_f32x3_t;
typedef __m128i Tsimd_i32x3_t;

typedef __m128 TSimdFloat32x2;
typedef __m128i TSimdInt32x2;
typedef __m128 Tsimd_f32x2_t;
typedef __m128i Tsimd_i32x2_t;
#elif defined(TF_FEATURE_CPU_NEON)
#include <arm_neon.h>

Expand All @@ -33,14 +33,14 @@

#define TF_SIMDI_MAX 0xFFFFFFFF

typedef float32x4_t TSimdFloat32x4;
typedef int32x4_t TSimdInt32x4;
typedef float32x4_t Tsimd_f32x4_t;
typedef int32x4_t Tsimd_i32x4_t;

typedef float32x4_t TSimdFloat32x3;
typedef int32x4_t TSimdInt32x3;
typedef float32x4_t Tsimd_f32x3_t;
typedef int32x4_t Tsimd_i32x3_t;

typedef float32x2_t TSimdFloat32x2;
typedef int32x2_t TSimdInt32x2;
typedef float32x2_t Tsimd_f32x2_t;
typedef int32x2_t Tsimd_i32x2_t;
#elif defined(TF_FEATURE_CPU_SCALAR)
#include <cmath>

Expand All @@ -49,35 +49,44 @@

#define TF_SIMDI_MAX 0xFFFFFFFF

typedef struct { float v[4]; } TSimdFloat32x4;
typedef struct { int32_t v[4]; } TSimdInt32x4;
typedef struct { float v[4]; } Tsimd_f32x4_t;
typedef struct { int32_t v[4]; } Tsimd_i32x4_t;

typedef struct { float v[3]; } TSimdFloat32x3;
typedef struct { int32_t v[3]; } TSimdInt32x3;
typedef struct { float v[3]; } Tsimd_f32x3_t;
typedef struct { int32_t v[3]; } Tsimd_i32x3_t;

typedef struct { float v[2]; } TSimdFloat32x2;
typedef struct { int32_t v[2]; } TSimdInt32x2;
typedef struct { float v[2]; } Tsimd_f32x2_t;
typedef struct { int32_t v[2]; } Tsimd_i32x2_t;
#endif

// TODO: keep it simple only implement square matricies
// everything is column major

struct TSimdFloat4 {
TSimdFloat32x4 mRow;
struct TSimdQuatFloat {
Tsimd_f32x4_t mValue;
};

struct TSimdQuatFloat {
TSimdFloat32x4 mValue;
struct Tsimd_f32x4x4_s {
union {
struct {
Tsimd_f32x4_t mCol0;
Tsimd_f32x4_t mCol1;
Tsimd_f32x4_t mCol2;
Tsimd_f32x4_t mCol3;
};
Tsimd_f32x4_t mCol[4];
};
};


struct TSimdFloat4x1 {
union
{
struct
{
TSimdFloat32x4 mCol0;
Tsimd_f32x4_t mCol0;
};
TSimdFloat32x4 mCol[1];
Tsimd_f32x4_t mCol[1];
};
};

Expand All @@ -87,21 +96,21 @@ struct TSimdFloat4x2
{
struct
{
TSimdFloat32x4 mCol0;
TSimdFloat32x4 mCol1;
Tsimd_f32x4_t mCol0;
Tsimd_f32x4_t mCol1;
};
TSimdFloat32x4 mCol[2];
Tsimd_f32x4_t mCol[2];
};
};

struct TSimdFloat4x3 {
union {
struct {
TSimdFloat32x4 mCol0;
TSimdFloat32x4 mCol1;
TSimdFloat32x4 mCol2;
Tsimd_f32x4_t mCol0;
Tsimd_f32x4_t mCol1;
Tsimd_f32x4_t mCol2;
};
TSimdFloat32x4 mCol[3];
Tsimd_f32x4_t mCol[3];
};
};

Expand All @@ -111,26 +120,26 @@ struct TSimdFloat4x4
{
struct
{
TSimdFloat32x4 mCol0;
TSimdFloat32x4 mCol1;
TSimdFloat32x4 mCol2;
TSimdFloat32x4 mCol3;
Tsimd_f32x4_t mCol0;
Tsimd_f32x4_t mCol1;
Tsimd_f32x4_t mCol2;
Tsimd_f32x4_t mCol3;
};
TSimdFloat32x4 mCol[4];
Tsimd_f32x4_t mCol[4];
};
};

struct TSimdFloat3
{
TSimdFloat32x3 mRow;
Tsimd_f32x3_t mRow;
};

struct TSimdFloat3x1 {
union {
struct {
TSimdFloat32x3 mCol0;
Tsimd_f32x3_t mCol0;
};
TSimdFloat32x3 mCol[1];
Tsimd_f32x3_t mCol[1];
};
};

Expand All @@ -140,10 +149,10 @@ struct TSimdFloat3x2
{
struct
{
TSimdFloat32x3 mCol0;
TSimdFloat32x3 mCol1;
Tsimd_f32x3_t mCol0;
Tsimd_f32x3_t mCol1;
};
TSimdFloat32x3 mCol[2];
Tsimd_f32x3_t mCol[2];
};
};

Expand All @@ -153,36 +162,36 @@ struct TSimdFloat3x3
{
struct
{
TSimdFloat32x3 mCol0;
TSimdFloat32x3 mCol1;
TSimdFloat32x3 mCol2;
Tsimd_f32x3_t mCol0;
Tsimd_f32x3_t mCol1;
Tsimd_f32x3_t mCol2;
};
TSimdFloat32x3 mCol[3];
Tsimd_f32x3_t mCol[3];
};
};

struct TSimdFloat2 {
TSimdFloat32x2 mRow;
Tsimd_f32x2_t mRow;
};

struct TSimdFloat2x1
{
union {
struct {
TSimdFloat32x2 mCol0;
Tsimd_f32x2_t mCol0;
};
TSimdFloat32x2 mCol[1];
Tsimd_f32x2_t mCol[1];
};
};

struct TSimdFloat2x2
{
union {
struct {
TSimdFloat32x2 mCol0;
TSimdFloat32x2 mCol1;
Tsimd_f32x2_t mCol0;
Tsimd_f32x2_t mCol1;
};
TSimdFloat32x2 mCol[2];
Tsimd_f32x2_t mCol[2];
};
};

Expand Down
92 changes: 46 additions & 46 deletions Forge/Math/Internal/TF_Simd32x2_neon.inl
Original file line number Diff line number Diff line change
Expand Up @@ -4,84 +4,84 @@
#include "../TF_Simd32x2.h"
#endif

inline TSimdInt32x2 tfSimd2iSelect(TSimdInt32x2 arg0, TSimdInt32x2 arg1, TSimdInt32x2 mask) { return vbsl_s32(mask, arg1, arg1); }
inline TSimdFloat32x2 tfSimd2fSelect(TSimdFloat32x2 arg0, TSimdFloat32x2 arg1, TSimdFloat32x2 mask) { return vbsl_f32(mask, arg1, arg2); }
inline Tsimd_i32x2_t tfS32x2ISelect(Tsimd_i32x2_t arg0, Tsimd_i32x2_t arg1, Tsimd_i32x2_t mask) { return vbsl_s32(mask, arg1, arg1); }
inline Tsimd_f32x2_t tfS32x2FSelect(Tsimd_f32x2_t arg0, Tsimd_f32x2_t arg1, Tsimd_f32x2_t mask) { return vbsl_f32(mask, arg1, arg2); }

inline TSimdFloat32x2 tfSimd2fZero() { return vmov_n_f32(0); }
inline TSimdInt32x2 tfSimd2iZero() { return vmov_n_s32(0); }
inline Tsimd_f32x2_t tfS32x2FZero() { return vmov_n_f32(0); }
inline Tsimd_i32x2_t tfS32x2IZero() { return vmov_n_s32(0); }

inline TSimdInt32x2 tfSimd2iNot(TSimdInt32x2 value) { return vmvn_s32(value); }
inline TSimdInt32x2 tfSimd2iAnd(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return vand_s32(arg1, arg2); }
inline TSimdInt32x2 tfSimd2iAndNot(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return vand_s32(vmvn_s32(arg1), arg2); }
inline TSimdInt32x2 tfSimd2iOr(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return vorr_s32(arg1, arg2); }
inline TSimdInt32x2 tfSimd2iXor(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return veor_s32(arg1, arg2); }
inline Tsimd_i32x2_t tfS32x2INot(Tsimd_i32x2_t value) { return vmvn_s32(value); }
inline Tsimd_i32x2_t tfS32x2IAnd(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return vand_s32(arg1, arg2); }
inline Tsimd_i32x2_t tfS32x2IAndNot(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return vand_s32(vmvn_s32(arg1), arg2); }
inline Tsimd_i32x2_t tfS32x2IOr(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return vorr_s32(arg1, arg2); }
inline Tsimd_i32x2_t tfS32x2IXor(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return veor_s32(arg1, arg2); }

inline TSimdFloat32x2 tfSimd2fNot(TSimdFloat32x2 value) { return vreinterpret_f32_s32(vmvn_s32(vreinterpret_s32_f32(value))); }
inline TSimdFloat32x2 tfSimd2fAnd(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) {
inline Tsimd_f32x2_t tfS32x2FNot(Tsimd_f32x2_t value) { return vreinterpret_f32_s32(vmvn_s32(vreinterpret_s32_f32(value))); }
inline Tsimd_f32x2_t tfS32x2FAnd(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) {
return vreinterpret_f32_s32(vand_s32(vreinterpret_s32_f32(arg1), vreinterpret_s32_f32(arg2)));
}
inline TSimdFloat32x2 tfSimd2fAndNot(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) {
inline Tsimd_f32x2_t tfS32x2FAndNot(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) {
return vreinterpret_f32_s32(vand_s32(vmvn_s32(vreinterpret_s32_f32(arg1)), vreinterpret_s32_f32(arg2)));
}
inline TSimdFloat32x2 tfSimd2fOr(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) {
inline Tsimd_f32x2_t tfS32x2FOr(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) {
return vreinterpret_f32_s32(vorr_s32(vreinterpret_s32_f32(arg1), vreinterpret_s32_f32(arg2)));
}
inline TSimdFloat32x2 tfSimd2fXor(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) {
inline Tsimd_f32x2_t tfS32x2FXor(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) {
return vreinterpret_f32_s32(veor_s32(vreinterpret_s32_f32(arg1), vreinterpret_s32_f32(arg2)));
}

inline TSimdFloat32x2 tfSimd2fFloor(TSimdFloat32x2 value) { return vrndm_f32(value); }
inline TSimdFloat32x2 tfSimd2fCeil(TSimdFloat32x2 value) { return vrndp_f32(value); }
inline TSimdFloat32x2 tfSimd2fRound(TSimdFloat32x2 value) { return vrndn_f32(value); }
inline TSimdFloat32x2 tfSimd2fTruncate(TSimdFloat32x2 value) { return tfSimd2iToSimd2f(tfSimd2fToSimd2i(value)); }
inline TSimdFloat32x2 tfSimd2fMin(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { return vmin_f32(arg1, arg2); }
inline TSimdFloat32x2 tfSimd2fMax(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { return vmax_f32(arg1, arg2); }
inline TSimdFloat32x2 tfSimd2fClamp(TSimdFloat32x2 value, TSimdFloat32x2 min, TSimdFloat32x2 max) {
return tfSimd2fMax(min, tfSimd2fMin(value, max));
inline Tsimd_f32x2_t tfS32x2FFloor(Tsimd_f32x2_t value) { return vrndm_f32(value); }
inline Tsimd_f32x2_t tfS32x2FCeil(Tsimd_f32x2_t value) { return vrndp_f32(value); }
inline Tsimd_f32x2_t tfS32x2FRound(Tsimd_f32x2_t value) { return vrndn_f32(value); }
inline Tsimd_f32x2_t tfS32x2FTruncate(Tsimd_f32x2_t value) { return tfS32x2IToSimd2f(tfS32x2FToSimd2i(value)); }
inline Tsimd_f32x2_t tfS32x2FMin(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return vmin_f32(arg1, arg2); }
inline Tsimd_f32x2_t tfS32x2FMax(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return vmax_f32(arg1, arg2); }
inline Tsimd_f32x2_t tfS32x2FClamp(Tsimd_f32x2_t value, Tsimd_f32x2_t min, Tsimd_f32x2_t max) {
return tfS32x2FMax(min, tfS32x2FMin(value, max));
}

inline TSimdInt32x2 tfSimd2fToSimd2i(TSimdFloat32x2 value) { return vreinterpret_s32_f32(value); }
inline Tsimd_i32x2_t tfS32x2FToSimd2i(Tsimd_f32x2_t value) { return vreinterpret_s32_f32(value); }

inline TSimdFloat32x2 tfSimd2iToSimd2f(TSimdInt32x2 value) { return vreinterpret_f32_s32(value); }
inline Tsimd_f32x2_t tfS32x2IToSimd2f(Tsimd_i32x2_t value) { return vreinterpret_f32_s32(value); }

inline float tfSimd2fSelectIndex0(TSimdFloat32x2 value) { return vget_lane_f32(value, 0); }
inline float tfS32x2FSelectIndex0(Tsimd_f32x2_t value) { return vget_lane_f32(value, 0); }

inline float tfSimd2fSelectIndex1(TSimdFloat32x2 value) { return vget_lane_f32(value, 1); }
inline float tfS32x2FSelectIndex1(Tsimd_f32x2_t value) { return vget_lane_f32(value, 1); }

inline TSimdFloat32x2 tfSimd2fAdd(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { return vadd_f32(arg1, arg2); }
inline TSimdFloat32x2 tfSimd2fSub(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { return vsub_f32(arg1, arg2); }
inline TSimdFloat32x2 tfSimd2fMul(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { return vmul_f32(arg1, arg2); }
inline TSimdFloat32x2 tfSimd2fMadd(TSimdFloat32x2 mul1, TSimdFloat32x2 mul2, TSimdFloat32x2 add) { return vmla_f32(add, mul1, mul2); }
inline TSimdFloat32x2 tfSimd2fDiv(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { return vdiv_f32(arg1, arg2); }
inline Tsimd_f32x2_t tfS32x2FAdd(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return vadd_f32(arg1, arg2); }
inline Tsimd_f32x2_t tfS32x2FSub(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return vsub_f32(arg1, arg2); }
inline Tsimd_f32x2_t tfS32x2FMul(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return vmul_f32(arg1, arg2); }
inline Tsimd_f32x2_t tfS32x2FMadd(Tsimd_f32x2_t mul1, Tsimd_f32x2_t mul2, Tsimd_f32x2_t add) { return vmla_f32(add, mul1, mul2); }
inline Tsimd_f32x2_t tfS32x2FDiv(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return vdiv_f32(arg1, arg2); }

inline TSimdFloat32x2 tfSimd2fAbs(TSimdFloat32x2 value) { return vabs_f32(value); }
inline Tsimd_f32x2_t tfS32x2FAbs(Tsimd_f32x2_t value) { return vabs_f32(value); }

inline TSimdFloat32x2 tfSimdFloat2Load(float x, float y) {
inline Tsimd_f32x2_t tfSimdFloat2Load(float x, float y) {
const float values[2] = { x, y };
return vld1_f32(values);
}

inline TSimdInt32x2 tfSimd2iLoadImmediate(int32_t x, int32_t y) {
inline Tsimd_i32x2_t tfS32x2ILoadImmediate(int32_t x, int32_t y) {
const int32_t values[2] = { x, y };
return vld1_s32(values);
}

inline TSimdFloat32x2 tfSimd2fSplatIndex0(TSimdFloat32x2 value) { return vdup_lane_f32(value, 0); }
inline Tsimd_f32x2_t tfS32x2FSplatIndex0(Tsimd_f32x2_t value) { return vdup_lane_f32(value, 0); }

inline TSimdFloat32x2 tfSimd2fSplatIndex1(TSimdFloat32x2 value) { return vdup_lane_f32(value, 1); }
inline Tsimd_f32x2_t tfS32x2FSplatIndex1(Tsimd_f32x2_t value) { return vdup_lane_f32(value, 1); }

inline TSimdInt32x2 tfSimd2iSplat(int32_t value) { return vdup_n_s32(value); }
inline Tsimd_i32x2_t tfS32x2ISplat(int32_t value) { return vdup_n_s32(value); }

inline TSimdFloat32x2 tfSimd2fSplat(float value) { return vdup_n_f32(value); }
inline Tsimd_f32x2_t tfS32x2FSplat(float value) { return vdup_n_f32(value); }

inline TSimdInt32x2 tfSimd2iCmpEq(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return vceq_s32(arg1, arg2); }
inline TSimdInt32x2 tfSimd2iCmpNeq(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return vmvn_s32(vceq_s32(arg1, arg2)); }
inline TSimdInt32x2 tfSimd2iCmpGt(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return vcgt_s32(arg1, arg2); }
inline TSimdInt32x2 tfSimd2iCmpGtEq(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return vcgt_s32(arg1, arg2); }
inline TSimdInt32x2 tfSimd2iCmpLt(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return vclt_s32(arg1, arg2); }
inline TSimdInt32x2 tfSimd2iCmpLtEq(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return vcle_s32(arg1, arg2); }
inline Tsimd_i32x2_t tfS32x2ICmpEq(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return vceq_s32(arg1, arg2); }
inline Tsimd_i32x2_t tfS32x2ICmpNeq(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return vmvn_s32(vceq_s32(arg1, arg2)); }
inline Tsimd_i32x2_t tfS32x2ICmpGt(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return vcgt_s32(arg1, arg2); }
inline Tsimd_i32x2_t tfS32x2ICmpGtEq(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return vcgt_s32(arg1, arg2); }
inline Tsimd_i32x2_t tfS32x2ICmpLt(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return vclt_s32(arg1, arg2); }
inline Tsimd_i32x2_t tfS32x2ICmpLtEq(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return vcle_s32(arg1, arg2); }

inline bool tfSimd2fCmpAllEq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) {
inline bool tfS32x2FCmpAllEq(TSimd32Fx4 arg1, TSimd32Fx4 arg2) {
// for (int i = 0; i < 2; i++) {
// if (arg1.v[i] != arg2.v[i]) {
// return false;
Expand All @@ -90,7 +90,7 @@ inline bool tfSimd2fCmpAllEq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) {
return true;
}

inline bool tfSimd2iCmpAllEq(TSimdInt32x2 arg1, TSimdInt32x2 arg2) {
inline bool tfS32x2ICmpAllEq(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) {
// for (int i = 0; i < 2; i++) {
// if (arg1.v[i] != arg2.v[i]) {
// return false;
Expand Down
Loading

0 comments on commit 64f15ce

Please sign in to comment.