diff --git a/Forge/Math/Internal/SimdTypes.h b/Forge/Math/Internal/SimdTypes.h index 42102681fe..1dfb4cd6e3 100644 --- a/Forge/Math/Internal/SimdTypes.h +++ b/Forge/Math/Internal/SimdTypes.h @@ -17,14 +17,14 @@ #define TF_SIMDI_MAX 0xFFFFFFFF #define TF_SIMDF_MAX 0xFFFFFFFF - typedef __m128 TSimdFloat32x4; - typedef __m128i TSimdInt32x4; + typedef __m128 Tsimd_f32x4_t; + typedef __m128i Tsimd_i32x4_t; - typedef __m128 TSimdFloat32x3; - typedef __m128i TSimdInt32x3; + typedef __m128 Tsimd_f32x3_t; + typedef __m128i Tsimd_i32x3_t; - typedef __m128 TSimdFloat32x2; - typedef __m128i TSimdInt32x2; + typedef __m128 Tsimd_f32x2_t; + typedef __m128i Tsimd_i32x2_t; #elif defined(TF_FEATURE_CPU_NEON) #include @@ -33,14 +33,14 @@ #define TF_SIMDI_MAX 0xFFFFFFFF - typedef float32x4_t TSimdFloat32x4; - typedef int32x4_t TSimdInt32x4; + typedef float32x4_t Tsimd_f32x4_t; + typedef int32x4_t Tsimd_i32x4_t; - typedef float32x4_t TSimdFloat32x3; - typedef int32x4_t TSimdInt32x3; + typedef float32x4_t Tsimd_f32x3_t; + typedef int32x4_t Tsimd_i32x3_t; - typedef float32x2_t TSimdFloat32x2; - typedef int32x2_t TSimdInt32x2; + typedef float32x2_t Tsimd_f32x2_t; + typedef int32x2_t Tsimd_i32x2_t; #elif defined(TF_FEATURE_CPU_SCALAR) #include @@ -49,35 +49,44 @@ #define TF_SIMDI_MAX 0xFFFFFFFF - typedef struct { float v[4]; } TSimdFloat32x4; - typedef struct { int32_t v[4]; } TSimdInt32x4; + typedef struct { float v[4]; } Tsimd_f32x4_t; + typedef struct { int32_t v[4]; } Tsimd_i32x4_t; - typedef struct { float v[3]; } TSimdFloat32x3; - typedef struct { int32_t v[3]; } TSimdInt32x3; + typedef struct { float v[3]; } Tsimd_f32x3_t; + typedef struct { int32_t v[3]; } Tsimd_i32x3_t; - typedef struct { float v[2]; } TSimdFloat32x2; - typedef struct { int32_t v[2]; } TSimdInt32x2; + typedef struct { float v[2]; } Tsimd_f32x2_t; + typedef struct { int32_t v[2]; } Tsimd_i32x2_t; #endif // TODO: keep it simple only implement square matricies // everything is column major -struct TSimdFloat4 { - TSimdFloat32x4 mRow; +struct TSimdQuatFloat { + Tsimd_f32x4_t mValue; }; -struct TSimdQuatFloat { - TSimdFloat32x4 mValue; +struct Tsimd_f32x4x4_s { + union { + struct { + Tsimd_f32x4_t mCol0; + Tsimd_f32x4_t mCol1; + Tsimd_f32x4_t mCol2; + Tsimd_f32x4_t mCol3; + }; + Tsimd_f32x4_t mCol[4]; + }; }; + struct TSimdFloat4x1 { union { struct { - TSimdFloat32x4 mCol0; + Tsimd_f32x4_t mCol0; }; - TSimdFloat32x4 mCol[1]; + Tsimd_f32x4_t mCol[1]; }; }; @@ -87,21 +96,21 @@ struct TSimdFloat4x2 { struct { - TSimdFloat32x4 mCol0; - TSimdFloat32x4 mCol1; + Tsimd_f32x4_t mCol0; + Tsimd_f32x4_t mCol1; }; - TSimdFloat32x4 mCol[2]; + Tsimd_f32x4_t mCol[2]; }; }; struct TSimdFloat4x3 { union { struct { - TSimdFloat32x4 mCol0; - TSimdFloat32x4 mCol1; - TSimdFloat32x4 mCol2; + Tsimd_f32x4_t mCol0; + Tsimd_f32x4_t mCol1; + Tsimd_f32x4_t mCol2; }; - TSimdFloat32x4 mCol[3]; + Tsimd_f32x4_t mCol[3]; }; }; @@ -111,26 +120,26 @@ struct TSimdFloat4x4 { struct { - TSimdFloat32x4 mCol0; - TSimdFloat32x4 mCol1; - TSimdFloat32x4 mCol2; - TSimdFloat32x4 mCol3; + Tsimd_f32x4_t mCol0; + Tsimd_f32x4_t mCol1; + Tsimd_f32x4_t mCol2; + Tsimd_f32x4_t mCol3; }; - TSimdFloat32x4 mCol[4]; + Tsimd_f32x4_t mCol[4]; }; }; struct TSimdFloat3 { - TSimdFloat32x3 mRow; + Tsimd_f32x3_t mRow; }; struct TSimdFloat3x1 { union { struct { - TSimdFloat32x3 mCol0; + Tsimd_f32x3_t mCol0; }; - TSimdFloat32x3 mCol[1]; + Tsimd_f32x3_t mCol[1]; }; }; @@ -140,10 +149,10 @@ struct TSimdFloat3x2 { struct { - TSimdFloat32x3 mCol0; - TSimdFloat32x3 mCol1; + Tsimd_f32x3_t mCol0; + Tsimd_f32x3_t 
mCol1; }; - TSimdFloat32x3 mCol[2]; + Tsimd_f32x3_t mCol[2]; }; }; @@ -153,25 +162,25 @@ struct TSimdFloat3x3 { struct { - TSimdFloat32x3 mCol0; - TSimdFloat32x3 mCol1; - TSimdFloat32x3 mCol2; + Tsimd_f32x3_t mCol0; + Tsimd_f32x3_t mCol1; + Tsimd_f32x3_t mCol2; }; - TSimdFloat32x3 mCol[3]; + Tsimd_f32x3_t mCol[3]; }; }; struct TSimdFloat2 { - TSimdFloat32x2 mRow; + Tsimd_f32x2_t mRow; }; struct TSimdFloat2x1 { union { struct { - TSimdFloat32x2 mCol0; + Tsimd_f32x2_t mCol0; }; - TSimdFloat32x2 mCol[1]; + Tsimd_f32x2_t mCol[1]; }; }; @@ -179,10 +188,10 @@ struct TSimdFloat2x2 { union { struct { - TSimdFloat32x2 mCol0; - TSimdFloat32x2 mCol1; + Tsimd_f32x2_t mCol0; + Tsimd_f32x2_t mCol1; }; - TSimdFloat32x2 mCol[2]; + Tsimd_f32x2_t mCol[2]; }; }; diff --git a/Forge/Math/Internal/TF_Simd32x2_neon.inl b/Forge/Math/Internal/TF_Simd32x2_neon.inl index fa3c57b9f9..c3c6803429 100644 --- a/Forge/Math/Internal/TF_Simd32x2_neon.inl +++ b/Forge/Math/Internal/TF_Simd32x2_neon.inl @@ -4,84 +4,84 @@ #include "../TF_Simd32x2.h" #endif -inline TSimdInt32x2 tfSimd2iSelect(TSimdInt32x2 arg0, TSimdInt32x2 arg1, TSimdInt32x2 mask) { return vbsl_s32(mask, arg1, arg1); } -inline TSimdFloat32x2 tfSimd2fSelect(TSimdFloat32x2 arg0, TSimdFloat32x2 arg1, TSimdFloat32x2 mask) { return vbsl_f32(mask, arg1, arg2); } +inline Tsimd_i32x2_t tfS32x2ISelect(Tsimd_i32x2_t arg0, Tsimd_i32x2_t arg1, Tsimd_i32x2_t mask) { return vbsl_s32(mask, arg1, arg0); } +inline Tsimd_f32x2_t tfS32x2FSelect(Tsimd_f32x2_t arg0, Tsimd_f32x2_t arg1, Tsimd_f32x2_t mask) { return vbsl_f32(mask, arg1, arg0); } -inline TSimdFloat32x2 tfSimd2fZero() { return vmov_n_f32(0); } -inline TSimdInt32x2 tfSimd2iZero() { return vmov_n_s32(0); } +inline Tsimd_f32x2_t tfS32x2FZero() { return vmov_n_f32(0); } +inline Tsimd_i32x2_t tfS32x2IZero() { return vmov_n_s32(0); } -inline TSimdInt32x2 tfSimd2iNot(TSimdInt32x2 value) { return vmvn_s32(value); } -inline TSimdInt32x2 tfSimd2iAnd(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return vand_s32(arg1, arg2); } -inline TSimdInt32x2 tfSimd2iAndNot(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return vand_s32(vmvn_s32(arg1), arg2); } -inline TSimdInt32x2 tfSimd2iOr(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return vorr_s32(arg1, arg2); } -inline TSimdInt32x2 tfSimd2iXor(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return veor_s32(arg1, arg2); } +inline Tsimd_i32x2_t tfS32x2INot(Tsimd_i32x2_t value) { return vmvn_s32(value); } +inline Tsimd_i32x2_t tfS32x2IAnd(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return vand_s32(arg1, arg2); } +inline Tsimd_i32x2_t tfS32x2IAndNot(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return vand_s32(vmvn_s32(arg1), arg2); } +inline Tsimd_i32x2_t tfS32x2IOr(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return vorr_s32(arg1, arg2); } +inline Tsimd_i32x2_t tfS32x2IXor(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return veor_s32(arg1, arg2); } -inline TSimdFloat32x2 tfSimd2fNot(TSimdFloat32x2 value) { return vreinterpret_f32_s32(vmvn_s32(vreinterpret_s32_f32(value))); } -inline TSimdFloat32x2 tfSimd2fAnd(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { +inline Tsimd_f32x2_t tfS32x2FNot(Tsimd_f32x2_t value) { return vreinterpret_f32_s32(vmvn_s32(vreinterpret_s32_f32(value))); } +inline Tsimd_f32x2_t tfS32x2FAnd(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return vreinterpret_f32_s32(vand_s32(vreinterpret_s32_f32(arg1), vreinterpret_s32_f32(arg2))); } -inline TSimdFloat32x2 tfSimd2fAndNot(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { +inline Tsimd_f32x2_t tfS32x2FAndNot(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return 
vreinterpret_f32_s32(vand_s32(vmvn_s32(vreinterpret_s32_f32(arg1)), vreinterpret_s32_f32(arg2))); } -inline TSimdFloat32x2 tfSimd2fOr(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { +inline Tsimd_f32x2_t tfS32x2FOr(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return vreinterpret_f32_s32(vorr_s32(vreinterpret_s32_f32(arg1), vreinterpret_s32_f32(arg2))); } -inline TSimdFloat32x2 tfSimd2fXor(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { +inline Tsimd_f32x2_t tfS32x2FXor(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return vreinterpret_f32_s32(veor_s32(vreinterpret_s32_f32(arg1), vreinterpret_s32_f32(arg2))); } -inline TSimdFloat32x2 tfSimd2fFloor(TSimdFloat32x2 value) { return vrndm_f32(value); } -inline TSimdFloat32x2 tfSimd2fCeil(TSimdFloat32x2 value) { return vrndp_f32(value); } -inline TSimdFloat32x2 tfSimd2fRound(TSimdFloat32x2 value) { return vrndn_f32(value); } -inline TSimdFloat32x2 tfSimd2fTruncate(TSimdFloat32x2 value) { return tfSimd2iToSimd2f(tfSimd2fToSimd2i(value)); } -inline TSimdFloat32x2 tfSimd2fMin(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { return vmin_f32(arg1, arg2); } -inline TSimdFloat32x2 tfSimd2fMax(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { return vmax_f32(arg1, arg2); } -inline TSimdFloat32x2 tfSimd2fClamp(TSimdFloat32x2 value, TSimdFloat32x2 min, TSimdFloat32x2 max) { - return tfSimd2fMax(min, tfSimd2fMin(value, max)); +inline Tsimd_f32x2_t tfS32x2FFloor(Tsimd_f32x2_t value) { return vrndm_f32(value); } +inline Tsimd_f32x2_t tfS32x2FCeil(Tsimd_f32x2_t value) { return vrndp_f32(value); } +inline Tsimd_f32x2_t tfS32x2FRound(Tsimd_f32x2_t value) { return vrndn_f32(value); } +inline Tsimd_f32x2_t tfS32x2FTruncate(Tsimd_f32x2_t value) { return tfS32x2IToSimd2f(tfS32x2FToSimd2i(value)); } +inline Tsimd_f32x2_t tfS32x2FMin(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return vmin_f32(arg1, arg2); } +inline Tsimd_f32x2_t tfS32x2FMax(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return vmax_f32(arg1, arg2); } +inline Tsimd_f32x2_t tfS32x2FClamp(Tsimd_f32x2_t value, Tsimd_f32x2_t min, Tsimd_f32x2_t max) { + return tfS32x2FMax(min, tfS32x2FMin(value, max)); } -inline TSimdInt32x2 tfSimd2fToSimd2i(TSimdFloat32x2 value) { return vreinterpret_s32_f32(value); } +inline Tsimd_i32x2_t tfS32x2FToSimd2i(Tsimd_f32x2_t value) { return vreinterpret_s32_f32(value); } -inline TSimdFloat32x2 tfSimd2iToSimd2f(TSimdInt32x2 value) { return vreinterpret_f32_s32(value); } +inline Tsimd_f32x2_t tfS32x2IToSimd2f(Tsimd_i32x2_t value) { return vreinterpret_f32_s32(value); } -inline float tfSimd2fSelectIndex0(TSimdFloat32x2 value) { return vget_lane_f32(value, 0); } +inline float tfS32x2FSelectIndex0(Tsimd_f32x2_t value) { return vget_lane_f32(value, 0); } -inline float tfSimd2fSelectIndex1(TSimdFloat32x2 value) { return vget_lane_f32(value, 1); } +inline float tfS32x2FSelectIndex1(Tsimd_f32x2_t value) { return vget_lane_f32(value, 1); } -inline TSimdFloat32x2 tfSimd2fAdd(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { return vadd_f32(arg1, arg2); } -inline TSimdFloat32x2 tfSimd2fSub(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { return vsub_f32(arg1, arg2); } -inline TSimdFloat32x2 tfSimd2fMul(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { return vmul_f32(arg1, arg2); } -inline TSimdFloat32x2 tfSimd2fMadd(TSimdFloat32x2 mul1, TSimdFloat32x2 mul2, TSimdFloat32x2 add) { return vmla_f32(add, mul1, mul2); } -inline TSimdFloat32x2 tfSimd2fDiv(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { return vdiv_f32(arg1, arg2); } +inline Tsimd_f32x2_t tfS32x2FAdd(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return vadd_f32(arg1, arg2); } +inline 
Tsimd_f32x2_t tfS32x2FSub(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return vsub_f32(arg1, arg2); } +inline Tsimd_f32x2_t tfS32x2FMul(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return vmul_f32(arg1, arg2); } +inline Tsimd_f32x2_t tfS32x2FMadd(Tsimd_f32x2_t mul1, Tsimd_f32x2_t mul2, Tsimd_f32x2_t add) { return vmla_f32(add, mul1, mul2); } +inline Tsimd_f32x2_t tfS32x2FDiv(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return vdiv_f32(arg1, arg2); } -inline TSimdFloat32x2 tfSimd2fAbs(TSimdFloat32x2 value) { return vabs_f32(value); } +inline Tsimd_f32x2_t tfS32x2FAbs(Tsimd_f32x2_t value) { return vabs_f32(value); } -inline TSimdFloat32x2 tfSimdFloat2Load(float x, float y) { +inline Tsimd_f32x2_t tfSimdFloat2Load(float x, float y) { const float values[2] = { x, y }; return vld1_f32(values); } -inline TSimdInt32x2 tfSimd2iLoadImmediate(int32_t x, int32_t y) { +inline Tsimd_i32x2_t tfS32x2ILoadImmediate(int32_t x, int32_t y) { const int32_t values[2] = { x, y }; return vld1_s32(values); } -inline TSimdFloat32x2 tfSimd2fSplatIndex0(TSimdFloat32x2 value) { return vdup_lane_f32(value, 0); } +inline Tsimd_f32x2_t tfS32x2FSplatIndex0(Tsimd_f32x2_t value) { return vdup_lane_f32(value, 0); } -inline TSimdFloat32x2 tfSimd2fSplatIndex1(TSimdFloat32x2 value) { return vdup_lane_f32(value, 1); } +inline Tsimd_f32x2_t tfS32x2FSplatIndex1(Tsimd_f32x2_t value) { return vdup_lane_f32(value, 1); } -inline TSimdInt32x2 tfSimd2iSplat(int32_t value) { return vdup_n_s32(value); } +inline Tsimd_i32x2_t tfS32x2ISplat(int32_t value) { return vdup_n_s32(value); } -inline TSimdFloat32x2 tfSimd2fSplat(float value) { return vdup_n_f32(value); } +inline Tsimd_f32x2_t tfS32x2FSplat(float value) { return vdup_n_f32(value); } -inline TSimdInt32x2 tfSimd2iCmpEq(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return vceq_s32(arg1, arg2); } -inline TSimdInt32x2 tfSimd2iCmpNeq(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return vmvn_s32(vceq_s32(arg1, arg2)); } -inline TSimdInt32x2 tfSimd2iCmpGt(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return vcgt_s32(arg1, arg2); } -inline TSimdInt32x2 tfSimd2iCmpGtEq(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return vcgt_s32(arg1, arg2); } -inline TSimdInt32x2 tfSimd2iCmpLt(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return vclt_s32(arg1, arg2); } -inline TSimdInt32x2 tfSimd2iCmpLtEq(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return vcle_s32(arg1, arg2); } +inline Tsimd_i32x2_t tfS32x2ICmpEq(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return vceq_s32(arg1, arg2); } +inline Tsimd_i32x2_t tfS32x2ICmpNeq(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return vmvn_s32(vceq_s32(arg1, arg2)); } +inline Tsimd_i32x2_t tfS32x2ICmpGt(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return vcgt_s32(arg1, arg2); } +inline Tsimd_i32x2_t tfS32x2ICmpGtEq(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return vcge_s32(arg1, arg2); } +inline Tsimd_i32x2_t tfS32x2ICmpLt(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return vclt_s32(arg1, arg2); } +inline Tsimd_i32x2_t tfS32x2ICmpLtEq(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return vcle_s32(arg1, arg2); } -inline bool tfSimd2fCmpAllEq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { +inline bool tfS32x2FCmpAllEq(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { // for (int i = 0; i < 2; i++) { // if (arg1.v[i] != arg2.v[i]) { // return false; @@ -90,7 +90,7 @@ inline bool tfSimd2fCmpAllEq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return true; } -inline bool tfSimd2iCmpAllEq(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { +inline bool tfS32x2ICmpAllEq(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { // for (int i = 0; i < 2; i++) { // if (arg1.v[i] != 
arg2.v[i]) { // return false; diff --git a/Forge/Math/Internal/TF_Simd32x2_scalar.inl b/Forge/Math/Internal/TF_Simd32x2_scalar.inl index 712d255bea..e3270f0d05 100644 --- a/Forge/Math/Internal/TF_Simd32x2_scalar.inl +++ b/Forge/Math/Internal/TF_Simd32x2_scalar.inl @@ -4,138 +4,138 @@ #include "../TF_Simd32x2.h" #endif -inline TSimdInt32x2 tfSimd2iSelect(TSimdInt32x2 arg0, TSimdInt32x2 arg1, TSimdInt32x2 mask) { +inline Tsimd_i32x2_t tfS32x2ISelect(Tsimd_i32x2_t arg0, Tsimd_i32x2_t arg1, Tsimd_i32x2_t mask) { return { (mask.v[0] == 0) ? arg0.v[0] : arg1.v[0], (mask.v[1] == 0) ? arg0.v[1] : arg1.v[1] }; } -inline TSimdFloat32x2 tfSimd2fSelect(TSimdFloat32x2 arg0, TSimdFloat32x2 arg1, TSimdFloat32x2 mask) { - TSimdInt32x2 intMask = tfSimd2fToSimd2i(mask); +inline Tsimd_f32x2_t tfS32x2FSelect(Tsimd_f32x2_t arg0, Tsimd_f32x2_t arg1, Tsimd_f32x2_t mask) { + Tsimd_i32x2_t intMask = tfS32x2FToSimd2i(mask); return { (intMask.v[0] == 0) ? arg0.v[0] : arg1.v[0], (intMask.v[1] == 0) ? arg0.v[1] : arg1.v[1] }; } -inline TSimdFloat32x2 tfSimd2fZero() { return { 0, 0 }; } -inline TSimdInt32x2 tfSimd2iZero() { return { 0, 0 }; } +inline Tsimd_f32x2_t tfS32x2FZero() { return { 0, 0 }; } +inline Tsimd_i32x2_t tfS32x2IZero() { return { 0, 0 }; } -inline TSimdInt32x2 tfSimd2iNot(TSimdInt32x2 value) { return { ~value.v[0], ~value.v[1] }; } -inline TSimdInt32x2 tfSimd2iAnd(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return { arg1.v[0] & arg2.v[0], arg1.v[1] & arg2.v[1] }; } -inline TSimdInt32x2 tfSimd2iAndNot(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return { ~arg1.v[0] & arg2.v[0], ~arg1.v[1] & arg2.v[1] }; } -inline TSimdInt32x2 tfSimd2iOr(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return { arg1.v[0] | arg2.v[0], arg1.v[1] | arg2.v[1] }; } -inline TSimdInt32x2 tfSimd2iXor(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return { arg1.v[0] ^ arg2.v[0], arg1.v[1] ^ arg2.v[1] }; } +inline Tsimd_i32x2_t tfS32x2INot(Tsimd_i32x2_t value) { return { ~value.v[0], ~value.v[1] }; } +inline Tsimd_i32x2_t tfS32x2IAnd(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return { arg1.v[0] & arg2.v[0], arg1.v[1] & arg2.v[1] }; } +inline Tsimd_i32x2_t tfS32x2IAndNot(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return { ~arg1.v[0] & arg2.v[0], ~arg1.v[1] & arg2.v[1] }; } +inline Tsimd_i32x2_t tfS32x2IOr(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return { arg1.v[0] | arg2.v[0], arg1.v[1] | arg2.v[1] }; } +inline Tsimd_i32x2_t tfS32x2IXor(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return { arg1.v[0] ^ arg2.v[0], arg1.v[1] ^ arg2.v[1] }; } -inline TSimdFloat32x2 tfSimd2fNot(TSimdFloat32x2 value) { - TSimdInt32x2 result = { { ~((int32_t)value.v[0]), ~((int32_t)value.v[1]) } }; - return tfSimd2iToSimd2f(result); +inline Tsimd_f32x2_t tfS32x2FNot(Tsimd_f32x2_t value) { + Tsimd_i32x2_t result = { { ~((int32_t)value.v[0]), ~((int32_t)value.v[1]) } }; + return tfS32x2IToSimd2f(result); } -inline TSimdFloat32x2 tfSimd2fAnd(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { - TSimdInt32x2 result = { ((int32_t)arg1.v[0]) & ((int32_t)arg2.v[0]), ((int32_t)arg1.v[1]) & ((int32_t)arg2.v[1]) }; - return tfSimd2iToSimd2f(result); +inline Tsimd_f32x2_t tfS32x2FAnd(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { + Tsimd_i32x2_t result = { ((int32_t)arg1.v[0]) & ((int32_t)arg2.v[0]), ((int32_t)arg1.v[1]) & ((int32_t)arg2.v[1]) }; + return tfS32x2IToSimd2f(result); } -inline TSimdFloat32x2 tfSimd2fAndNot(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { - TSimdInt32x2 result = { { ~((int32_t)arg1.v[0]) & ((int32_t)arg2.v[0]), ~((int32_t)arg1.v[1]) & ((int32_t)arg2.v[1]) } }; - return 
tfSimd2iToSimd2f(result); +inline Tsimd_f32x2_t tfS32x2FAndNot(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { + Tsimd_i32x2_t result = { { ~((int32_t)arg1.v[0]) & ((int32_t)arg2.v[0]), ~((int32_t)arg1.v[1]) & ((int32_t)arg2.v[1]) } }; + return tfS32x2IToSimd2f(result); } -inline TSimdFloat32x2 tfSimd2fOr(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { - TSimdInt32x2 result = { { ((int32_t)arg1.v[0]) | ((int32_t)arg2.v[0]), ((int32_t)arg1.v[1]) | ((int32_t)arg2.v[1]) } }; - return tfSimd2iToSimd2f(result); +inline Tsimd_f32x2_t tfS32x2FOr(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { + Tsimd_i32x2_t result = { { ((int32_t)arg1.v[0]) | ((int32_t)arg2.v[0]), ((int32_t)arg1.v[1]) | ((int32_t)arg2.v[1]) } }; + return tfS32x2IToSimd2f(result); } -inline TSimdFloat32x2 tfSimd2fXor(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { - TSimdInt32x2 result = { { ((int32_t)arg1.v[0]) ^ ((int32_t)arg2.v[0]), ((int32_t)arg1.v[1]) ^ ((int32_t)arg2.v[1]) } }; - return tfSimd2iToSimd2f(result); +inline Tsimd_f32x2_t tfS32x2FXor(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { + Tsimd_i32x2_t result = { { ((int32_t)arg1.v[0]) ^ ((int32_t)arg2.v[0]), ((int32_t)arg1.v[1]) ^ ((int32_t)arg2.v[1]) } }; + return tfS32x2IToSimd2f(result); } -inline TSimdFloat32x2 tfSimd2fFloor(TSimdFloat32x2 value) { return { { floorf(value.v[0]), floorf(value.v[1]) } }; } -inline TSimdFloat32x2 tfSimd2fCeil(TSimdFloat32x2 value) { return { { ceilf(value.v[0]), ceilf(value.v[1]) } }; } -inline TSimdFloat32x2 tfSimd2fRound(TSimdFloat32x2 value) { +inline Tsimd_f32x2_t tfS32x2FFloor(Tsimd_f32x2_t value) { return { { floorf(value.v[0]), floorf(value.v[1]) } }; } +inline Tsimd_f32x2_t tfS32x2FCeil(Tsimd_f32x2_t value) { return { { ceilf(value.v[0]), ceilf(value.v[1]) } }; } +inline Tsimd_f32x2_t tfS32x2FRound(Tsimd_f32x2_t value) { // While 'roundf' may seem the obvious choice here, it rounds halfway cases // away from zero regardless of the current rounding mode, but 'rintf' uses // the current rounding mode which is consistent with other implementations. 
return { { rintf(value.v[0]), rintf(value.v[1]) } }; } -inline TSimdFloat32x2 tfSimd2fTruncate(TSimdFloat32x2 value) { return tfSimd2iToSimd2f(tfSimd2fToSimd2i(value)); } -inline TSimdFloat32x2 tfSimd2fMin(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { +inline Tsimd_f32x2_t tfS32x2FTruncate(Tsimd_f32x2_t value) { return tfS32x2IToSimd2f(tfS32x2FToSimd2i(value)); } +inline Tsimd_f32x2_t tfS32x2FMin(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return { { fminf(arg1.v[0], arg2.v[0]), fminf(arg1.v[1], arg2.v[1]) } }; } -inline TSimdFloat32x2 tfSimd2fMax(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { +inline Tsimd_f32x2_t tfS32x2FMax(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return { { fmaxf(arg1.v[0], arg2.v[0]), fmaxf(arg1.v[1], arg2.v[1]) } }; } -inline TSimdFloat32x2 tfSimd2fClamp(TSimdFloat32x2 value, TSimdFloat32x2 min, TSimdFloat32x2 max) { - return tfSimd2fMax(min, tfSimd2fMin(value, max)); +inline Tsimd_f32x2_t tfS32x2FClamp(Tsimd_f32x2_t value, Tsimd_f32x2_t min, Tsimd_f32x2_t max) { + return tfS32x2FMax(min, tfS32x2FMin(value, max)); } -inline TSimdInt32x2 tfSimd2fToSimd2i(TSimdFloat32x2 value) { return { (int32_t)value.v[0], (int32_t)value.v[1] }; } +inline Tsimd_i32x2_t tfS32x2FToSimd2i(Tsimd_f32x2_t value) { return { (int32_t)value.v[0], (int32_t)value.v[1] }; } -inline TSimdFloat32x2 tfSimd2iToSimd2f(TSimdInt32x2 value) { return { (float)value.v[0], (float)value.v[1] }; } +inline Tsimd_f32x2_t tfS32x2IToSimd2f(Tsimd_i32x2_t value) { return { (float)value.v[0], (float)value.v[1] }; } -inline float tfSimd2fSelectIndex0(TSimdFloat32x2 value) { return value.v[0]; } +inline float tfS32x2FSelectIndex0(Tsimd_f32x2_t value) { return value.v[0]; } -inline float tfSimd2fSelectIndex1(TSimdFloat32x2 value) { return value.v[1]; } +inline float tfS32x2FSelectIndex1(Tsimd_f32x2_t value) { return value.v[1]; } -inline TSimdFloat32x2 tfSimd2fAdd(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { +inline Tsimd_f32x2_t tfS32x2FAdd(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return { arg1.v[0] + arg2.v[0], arg1.v[1] + arg2.v[1], }; } -inline TSimdFloat32x2 tfSimd2fSub(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { +inline Tsimd_f32x2_t tfS32x2FSub(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return { arg1.v[0] - arg2.v[0], arg1.v[1] - arg2.v[1], }; } -inline TSimdFloat32x2 tfSimd2fMul(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { +inline Tsimd_f32x2_t tfS32x2FMul(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return { arg1.v[0] * arg2.v[0], arg1.v[1] * arg2.v[1], }; } -inline TSimdFloat32x2 tfSimd2fMadd(TSimdFloat32x2 mul1, TSimdFloat32x2 mul2, TSimdFloat32x2 add) { - return tfSimd2fAdd(tfSimd2fMul(mul1, mul2), add); +inline Tsimd_f32x2_t tfS32x2FMadd(Tsimd_f32x2_t mul1, Tsimd_f32x2_t mul2, Tsimd_f32x2_t add) { + return tfS32x2FAdd(tfS32x2FMul(mul1, mul2), add); } -inline TSimdFloat32x2 tfSimd2fDiv(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { +inline Tsimd_f32x2_t tfS32x2FDiv(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return { arg1.v[0] / arg2.v[0], arg1.v[1] / arg2.v[1], }; } -inline TSimdFloat32x2 tfSimd2fAbs(TSimdFloat32x2 value) { +inline Tsimd_f32x2_t tfS32x2FAbs(Tsimd_f32x2_t value) { return { abs(value.v[0]), abs(value.v[1]), }; } -inline TSimdFloat32x2 tfSimdFloat2Load(float x, float y) { return { x, y }; } +inline Tsimd_f32x2_t tfSimdFloat2Load(float x, float y) { return { x, y }; } -inline TSimdInt32x2 tfSimd2iLoadImmediate(int32_t x, int32_t y) { return { x, y }; } +inline Tsimd_i32x2_t tfS32x2ILoadImmediate(int32_t x, int32_t y) { return { x, y }; } -inline TSimdFloat32x2 tfSimd2fSplatIndex0(TSimdFloat32x2 value) { 
return { value.v[0], value.v[0] }; } -inline TSimdFloat32x2 tfSimd2fSplatIndex1(TSimdFloat32x2 value) { return { value.v[1], value.v[1] }; } +inline Tsimd_f32x2_t tfS32x2FSplatIndex0(Tsimd_f32x2_t value) { return { value.v[0], value.v[0] }; } +inline Tsimd_f32x2_t tfS32x2FSplatIndex1(Tsimd_f32x2_t value) { return { value.v[1], value.v[1] }; } -static inline TSimdFloat32x4 tfSimdFloat2To4Splat0(TSimdFloat32x2 value) { return { value.v[0], value.v[0], value.v[0], value.v[0] }; } -static inline TSimdFloat32x4 tfSimdFloat2To4Splat1(TSimdFloat32x2 value) { return { value.v[1], value.v[1], value.v[1], value.v[1] }; } +static inline Tsimd_f32x4_t tfSimdFloat2To4Splat0(Tsimd_f32x2_t value) { return { value.v[0], value.v[0], value.v[0], value.v[0] }; } +static inline Tsimd_f32x4_t tfSimdFloat2To4Splat1(Tsimd_f32x2_t value) { return { value.v[1], value.v[1], value.v[1], value.v[1] }; } -inline TSimdInt32x2 tfSimd2iSplat(int32_t value) { return { value, value }; } -inline TSimdFloat32x2 tfSimd2fSplat(float value) { return { value, value }; } +inline Tsimd_i32x2_t tfS32x2ISplat(int32_t value) { return { value, value }; } +inline Tsimd_f32x2_t tfS32x2FSplat(float value) { return { value, value }; } -inline TSimdInt32x2 tfSimd2iCmpEq(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { +inline Tsimd_i32x2_t tfS32x2ICmpEq(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return { { (arg1.v[0] == arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] == arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; } -inline TSimdInt32x2 tfSimd2iCmpNeq(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { +inline Tsimd_i32x2_t tfS32x2ICmpNeq(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return { { (arg1.v[0] != arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] != arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; } -inline TSimdInt32x2 tfSimd2iCmpGt(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { +inline Tsimd_i32x2_t tfS32x2ICmpGt(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return { { (arg1.v[0] > arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] > arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; } -inline TSimdInt32x2 tfSimd2iCmpGtEq(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { +inline Tsimd_i32x2_t tfS32x2ICmpGtEq(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return { { (arg1.v[0] >= arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] >= arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; } -inline TSimdInt32x2 tfSimd2iCmpLt(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { +inline Tsimd_i32x2_t tfS32x2ICmpLt(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return { { (arg1.v[0] < arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] < arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; } -inline TSimdInt32x2 tfSimd2iCmpLtEq(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { +inline Tsimd_i32x2_t tfS32x2ICmpLtEq(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return { { (arg1.v[0] < arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] < arg2.v[1]) ? 
(int32_t)0xFFFFFFFF : 0x00000000 } }; } -inline bool tfSimd2fCmpAllEq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { +inline bool tfS32x2FCmpAllEq(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { for (int i = 0; i < 2; i++) { if (arg1.v[i] != arg2.v[i]) { return false; @@ -144,7 +144,7 @@ inline bool tfSimd2fCmpAllEq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return true; } -inline bool tfSimd2iCmpAllEq(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { +inline bool tfS32x2ICmpAllEq(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { for (int i = 0; i < 2; i++) { if (arg1.v[i] != arg2.v[i]) { return false; @@ -153,7 +153,7 @@ inline bool tfSimd2iCmpAllEq(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return true; } -static inline bool tfSimdFloat32x2CmpAllLt(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { +static inline bool tfS32x2FCmpAllLt(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { for (int i = 0; i < 2; i++) { if (arg1.v[i] >= arg2.v[i]) { return false; diff --git a/Forge/Math/Internal/TF_Simd32x2_sse.inl b/Forge/Math/Internal/TF_Simd32x2_sse.inl index c2d7715297..9cd5011245 100644 --- a/Forge/Math/Internal/TF_Simd32x2_sse.inl +++ b/Forge/Math/Internal/TF_Simd32x2_sse.inl @@ -4,105 +4,105 @@ #include "../TF_Simd32x2.h" #endif -inline TSimdInt32x2 tfSimd2iSelect(TSimdInt32x2 arg0, TSimdInt32x2 arg1, TSimdInt32x2 mask) { return _mm_blendv_epi8(arg0, arg1, mask); } -inline TSimdFloat32x2 tfSimd2fSelect(TSimdFloat32x2 arg0, TSimdFloat32x2 arg1, TSimdFloat32x2 mask) { +inline Tsimd_i32x2_t tfS32x2ISelect(Tsimd_i32x2_t arg0, Tsimd_i32x2_t arg1, Tsimd_i32x2_t mask) { return _mm_blendv_epi8(arg0, arg1, mask); } +inline Tsimd_f32x2_t tfS32x2FSelect(Tsimd_f32x2_t arg0, Tsimd_f32x2_t arg1, Tsimd_f32x2_t mask) { return _mm_blendv_ps(arg0, arg1, mask); } -inline TSimdFloat32x2 tfSimd2fZero() { return _mm_setzero_ps(); } -inline TSimdInt32x2 tfSimd2iZero() { return _mm_setzero_si128(); } +inline Tsimd_f32x2_t tfS32x2FZero() { return _mm_setzero_ps(); } +inline Tsimd_i32x2_t tfS32x2IZero() { return _mm_setzero_si128(); } -inline TSimdInt32x2 tfSimd2iNot(TSimdInt32x2 value) { - const TSimdInt32x2 invert = tfSimd2iSplat(TF_SIMDI_MAX); +inline Tsimd_i32x2_t tfS32x2INot(Tsimd_i32x2_t value) { + const Tsimd_i32x2_t invert = tfS32x2ISplat(TF_SIMDI_MAX); return _mm_andnot_si128(value, invert); } -inline TSimdInt32x2 tfSimd2iAnd(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return _mm_and_si128(arg1, arg2); } -inline TSimdInt32x2 tfSimd2iAndNot(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return _mm_andnot_si128(arg1, arg2); } -inline TSimdInt32x2 tfSimd2iOr(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return _mm_or_si128(arg1, arg2); } -inline TSimdInt32x2 tfSimd2iXor(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return _mm_xor_si128(arg1, arg2); } +inline Tsimd_i32x2_t tfS32x2IAnd(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return _mm_and_si128(arg1, arg2); } +inline Tsimd_i32x2_t tfS32x2IAndNot(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return _mm_andnot_si128(arg1, arg2); } +inline Tsimd_i32x2_t tfS32x2IOr(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return _mm_or_si128(arg1, arg2); } +inline Tsimd_i32x2_t tfS32x2IXor(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return _mm_xor_si128(arg1, arg2); } -inline TSimdFloat32x2 tfSimd2fNot(TSimdFloat32x2 value) { - const TSimdFloat32x2 invert = tfSimd2fSplat((float)(0xFFFFFFFF)); +inline Tsimd_f32x2_t tfS32x2FNot(Tsimd_f32x2_t value) { + const Tsimd_f32x2_t invert = tfS32x2FSplat((float)(0xFFFFFFFF)); return _mm_andnot_ps(value, invert); } -inline TSimdFloat32x2 tfSimd2fAnd(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { return 
_mm_and_ps(arg1, arg2); } -inline TSimdFloat32x2 tfSimd2fAndNot(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { return _mm_andnot_ps(arg1, arg2); } -inline TSimdFloat32x2 tfSimd2fOr(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { return _mm_or_ps(arg1, arg2); } -inline TSimdFloat32x2 tfSimd2fXor(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { return _mm_xor_ps(arg1, arg2); } - -inline TSimdFloat32x2 tfSimd2fFloor(TSimdFloat32x2 value) { return _mm_floor_ps(value); } -inline TSimdFloat32x2 tfSimd2fCeil(TSimdFloat32x2 value) { return _mm_ceil_ps(value); } -inline TSimdFloat32x2 tfSimd2fRound(TSimdFloat32x2 value) { return _mm_round_ps(value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } -inline TSimdFloat32x2 tfSimd2fTruncate(TSimdFloat32x2 value) { return tfSimd2iToSimd2f(tfSimd2fToSimd2i(value)); } -inline TSimdFloat32x2 tfSimd2fMin(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { return _mm_min_ps(arg1, arg2); } -inline TSimdFloat32x2 tfSimd2fMax(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { return _mm_max_ps(arg1, arg2); } -inline TSimdFloat32x2 tfSimd2fClamp(TSimdFloat32x2 value, TSimdFloat32x2 min, TSimdFloat32x2 max) { - return tfSimd2fMax(min, tfSimd2fMin(value, max)); +inline Tsimd_f32x2_t tfS32x2FAnd(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return _mm_and_ps(arg1, arg2); } +inline Tsimd_f32x2_t tfS32x2FAndNot(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return _mm_andnot_ps(arg1, arg2); } +inline Tsimd_f32x2_t tfS32x2FOr(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return _mm_or_ps(arg1, arg2); } +inline Tsimd_f32x2_t tfS32x2FXor(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return _mm_xor_ps(arg1, arg2); } + +inline Tsimd_f32x2_t tfS32x2FFloor(Tsimd_f32x2_t value) { return _mm_floor_ps(value); } +inline Tsimd_f32x2_t tfS32x2FCeil(Tsimd_f32x2_t value) { return _mm_ceil_ps(value); } +inline Tsimd_f32x2_t tfS32x2FRound(Tsimd_f32x2_t value) { return _mm_round_ps(value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } +inline Tsimd_f32x2_t tfS32x2FTruncate(Tsimd_f32x2_t value) { return tfS32x2IToSimd2f(tfS32x2FToSimd2i(value)); } +inline Tsimd_f32x2_t tfS32x2FMin(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return _mm_min_ps(arg1, arg2); } +inline Tsimd_f32x2_t tfS32x2FMax(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return _mm_max_ps(arg1, arg2); } +inline Tsimd_f32x2_t tfS32x2FClamp(Tsimd_f32x2_t value, Tsimd_f32x2_t min, Tsimd_f32x2_t max) { + return tfS32x2FMax(min, tfS32x2FMin(value, max)); } -inline TSimdInt32x2 tfSimd2fToSimd2i(TSimdFloat32x2 value) { return _mm_castps_si128(value); } +inline Tsimd_i32x2_t tfS32x2FToSimd2i(Tsimd_f32x2_t value) { return _mm_castps_si128(value); } -inline TSimdFloat32x2 tfSimd2iToSimd2f(TSimdInt32x2 value) { return _mm_castsi128_ps(value); } +inline Tsimd_f32x2_t tfS32x2IToSimd2f(Tsimd_i32x2_t value) { return _mm_castsi128_ps(value); } -inline float tfSimd2fSelectIndex0(TSimdFloat32x2 value) { return _mm_cvtss_f32(value); } +inline float tfS32x2FSelectIndex0(Tsimd_f32x2_t value) { return _mm_cvtss_f32(value); } -inline float tfSimd2fSelectIndex1(TSimdFloat32x2 value) { return tfSimd2fSelectIndex0(tfSimd2fSplatIndex1(value)); } +inline float tfS32x2FSelectIndex1(Tsimd_f32x2_t value) { return tfS32x2FSelectIndex0(tfS32x2FSplatIndex1(value)); } -inline TSimdFloat32x2 tfSimd2fAdd(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { return _mm_add_ps(arg1, arg2); } -inline TSimdFloat32x2 tfSimd2fSub(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { return _mm_sub_ps(arg1, arg2); } -inline TSimdFloat32x2 tfSimd2fMul(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { return _mm_mul_ps(arg1, arg2); } 
-inline TSimdFloat32x2 tfSimd2fMadd(TSimdFloat32x2 mul1, TSimdFloat32x2 mul2, TSimdFloat32x2 add) { +inline Tsimd_f32x2_t tfS32x2FAdd(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return _mm_add_ps(arg1, arg2); } +inline Tsimd_f32x2_t tfS32x2FSub(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return _mm_sub_ps(arg1, arg2); } +inline Tsimd_f32x2_t tfS32x2FMul(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return _mm_mul_ps(arg1, arg2); } +inline Tsimd_f32x2_t tfS32x2FMadd(Tsimd_f32x2_t mul1, Tsimd_f32x2_t mul2, Tsimd_f32x2_t add) { #if 0 return _mm_fmadd_ps(mul1, mul2, add); // Requires FMA CPUID #else - return tfSimd2fAdd(tfSimd2fMul(mul1, mul2), add); + return tfS32x2FAdd(tfS32x2FMul(mul1, mul2), add); #endif } -inline TSimdFloat32x2 tfSimd2fDiv(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2) { return _mm_div_ps(arg1, arg2); } +inline Tsimd_f32x2_t tfS32x2FDiv(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2) { return _mm_div_ps(arg1, arg2); } -inline TSimdFloat32x2 tfSimd2fAbs(TSimdFloat32x2 value) { - const TSimdFloat32x4 signMask = tfSimd2iToSimd2f(tfSimd2iSplat(0x7FFFFFFF)); +inline Tsimd_f32x2_t tfS32x2FAbs(Tsimd_f32x2_t value) { + const Tsimd_f32x4_t signMask = tfS32x2IToSimd2f(tfS32x2ISplat(0x7FFFFFFF)); return _mm_and_ps(value, signMask); } -inline TSimdFloat32x2 tfSimdFloat2Load(float x, float y) { return _mm_set_ps(0.0f, 0.0f, y, x); } +inline Tsimd_f32x2_t tfSimdFloat2Load(float x, float y) { return _mm_set_ps(0.0f, 0.0f, y, x); } -inline TSimdInt32x2 tfSimd2iLoadImmediate(int32_t x, int32_t y) { return _mm_set_epi32(0.0f, 0.0f, x, y); } +inline Tsimd_i32x2_t tfS32x2ILoadImmediate(int32_t x, int32_t y) { return _mm_set_epi32(0.0f, 0.0f, x, y); } -inline TSimdFloat32x2 tfSimd2fSplatIndex0(TSimdFloat32x2 value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(0, 0, 0, 0)); } -inline TSimdFloat32x2 tfSimd2fSplatIndex1(TSimdFloat32x2 value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(1, 1, 1, 1)); } +inline Tsimd_f32x2_t tfS32x2FSplatIndex0(Tsimd_f32x2_t value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(0, 0, 0, 0)); } +inline Tsimd_f32x2_t tfS32x2FSplatIndex1(Tsimd_f32x2_t value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(1, 1, 1, 1)); } -static inline TSimdFloat32x4 tfSimdFloat2To4Splat0(TSimdFloat32x2 value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(0, 0, 0, 0)); } -static inline TSimdFloat32x4 tfSimdFloat2To4Splat1(TSimdFloat32x2 value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(1, 1, 1, 1)); } +static inline Tsimd_f32x4_t tfSimdFloat2To4Splat0(Tsimd_f32x2_t value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(0, 0, 0, 0)); } +static inline Tsimd_f32x4_t tfSimdFloat2To4Splat1(Tsimd_f32x2_t value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(1, 1, 1, 1)); } -inline TSimdInt32x2 tfSimd2iSplat(int32_t value) { return _mm_set1_epi32(value); } -inline TSimdFloat32x2 tfSimd2fSplat(float value) { return _mm_set1_ps(value); } +inline Tsimd_i32x2_t tfS32x2ISplat(int32_t value) { return _mm_set1_epi32(value); } +inline Tsimd_f32x2_t tfS32x2FSplat(float value) { return _mm_set1_ps(value); } -inline TSimdInt32x2 tfSimd2iCmpEq(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return _mm_cmpeq_epi32(arg1, arg2); } -inline TSimdInt32x2 tfSimd2iCmpNeq(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { +inline Tsimd_i32x2_t tfS32x2ICmpEq(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return _mm_cmpeq_epi32(arg1, arg2); } +inline Tsimd_i32x2_t tfS32x2ICmpNeq(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return _mm_xor_si128(_mm_cmpeq_epi32(arg1, arg2), _mm_set1_epi32((int32_t)0xFFFFFFFF)); } -inline 
TSimdInt32x2 tfSimd2iCmpGt(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return _mm_cmpgt_epi32(arg1, arg2); } -inline TSimdInt32x2 tfSimd2iCmpGtEq(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { +inline Tsimd_i32x2_t tfS32x2ICmpGt(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return _mm_cmpgt_epi32(arg1, arg2); } +inline Tsimd_i32x2_t tfS32x2ICmpGtEq(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return _mm_or_si128(_mm_cmpgt_epi32(arg1, arg2), _mm_cmpeq_epi32(arg1, arg2)); } -inline TSimdInt32x2 tfSimd2iCmpLt(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { return _mm_cmplt_epi32(arg1, arg2); } -inline TSimdInt32x2 tfSimd2iCmpLtEq(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { +inline Tsimd_i32x2_t tfS32x2ICmpLt(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return _mm_cmplt_epi32(arg1, arg2); } +inline Tsimd_i32x2_t tfS32x2ICmpLtEq(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { return _mm_or_si128(_mm_cmplt_epi32(arg1, arg2), _mm_cmpeq_epi32(arg1, arg2)); } -inline bool tfSimd2fCmpAllEq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - TSimdFloat32x4 compare = tfSimd2fCmpEq(arg1, arg2); +inline bool tfS32x2FCmpAllEq(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + Tsimd_f32x4_t compare = tfS32x2FCmpEq(arg1, arg2); return (_mm_movemask_ps(compare) & 0b0011) == 0b0011; } -inline bool tfSimd2iCmpAllEq(TSimdInt32x2 arg1, TSimdInt32x2 arg2) { - const TSimdInt32x2 compare = tfSimd2iCmpEq(arg1, arg2); +inline bool tfS32x2ICmpAllEq(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2) { + const Tsimd_i32x2_t compare = tfS32x2ICmpEq(arg1, arg2); return (_mm_movemask_epi8(compare) & 0b0011) == 0b0011; } -static inline bool tfSimdFloat32x2CmpAllLt(TSimdFloat32x2 a, TSimdFloat32x2 b) { - TSimdFloat32x3 compare = tfSimd2fCmpLt(a, b); +static inline bool tfS32x2FCmpAllLt(Tsimd_f32x2_t a, Tsimd_f32x2_t b) { + Tsimd_f32x3_t compare = tfS32x2FCmpLt(a, b); return (_mm_movemask_ps(compare) & 0b0011) == 0b0011; } diff --git a/Forge/Math/Internal/TF_Simd32x3_neon.inl b/Forge/Math/Internal/TF_Simd32x3_neon.inl index 37f3f941ca..0dd34ca426 100644 --- a/Forge/Math/Internal/TF_Simd32x3_neon.inl +++ b/Forge/Math/Internal/TF_Simd32x3_neon.inl @@ -4,93 +4,93 @@ #include "../TF_Simd32x3.h" #endif -inline TSimdInt32x3 tfSimd3iSelect(TSimdInt32x3 arg0, TSimdInt32x3 arg1, TSimdInt32x3 mask) { return vbslq_s32(mask, arg0, arg1); } -inline TSimdFloat32x3 tfSimd3fSelect(TSimdFloat32x3 arg0, TSimdFloat32x3 arg1, TSimdFloat32x3 mask) { return vbslq_f32(mask, arg1, arg1); } +inline Tsimd_i32x3_t tfS32x3iSelect(Tsimd_i32x3_t arg0, Tsimd_i32x3_t arg1, Tsimd_i32x3_t mask) { return vbslq_s32(mask, arg0, arg1); } +inline Tsimd_f32x3_t tfS32x3FSelect(Tsimd_f32x3_t arg0, Tsimd_f32x3_t arg1, Tsimd_f32x3_t mask) { return vbslq_f32(mask, arg1, arg1); } -inline TSimdFloat32x3 tfSimd3fZero() { return vmovq_n_f32(0.0f); } -inline TSimdInt32x3 tfSimd3iZero() { return vmovq_n_s32(0); } +inline Tsimd_f32x3_t tfS32x3FZero() { return vmovq_n_f32(0.0f); } +inline Tsimd_i32x3_t tfS32x3iZero() { return vmovq_n_s32(0); } -inline TSimdInt32x3 tfSimd3iNot(TSimdInt32x3 value) { return vmvnq_s32(value); } -inline TSimdInt32x3 tfSimd3iAnd(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { return vandq_s32(arg1, arg2); } -inline TSimdInt32x3 tfSimd3iAndNot(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { return vandq_s32(vmvnq_s32(arg1), arg2); } -inline TSimdInt32x3 tfSimd3iOr(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { return vorrq_s32(arg1, arg2); } -inline TSimdInt32x3 tfSimd3iXor(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { return veorq_s32(arg1, arg2); } +inline Tsimd_i32x3_t tfS32x3iNot(Tsimd_i32x3_t value) { return vmvnq_s32(value); } +inline 
Tsimd_i32x3_t tfS32x3iAnd(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return vandq_s32(arg1, arg2); } +inline Tsimd_i32x3_t tfS32x3iAndNot(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return vandq_s32(vmvnq_s32(arg1), arg2); } +inline Tsimd_i32x3_t tfS32x3iOr(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return vorrq_s32(arg1, arg2); } +inline Tsimd_i32x3_t tfS32x3iXor(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return veorq_s32(arg1, arg2); } -inline TSimdFloat32x3 tfSimd3fNot(TSimdFloat32x3 value) { return vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(value))); } -inline TSimdFloat32x3 tfSimd3fAnd(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { +inline Tsimd_f32x3_t tfS32x3FNot(Tsimd_f32x3_t value) { return vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(value))); } +inline Tsimd_f32x3_t tfS32x3FAnd(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(arg1), vreinterpretq_s32_f32(arg2))); } -inline TSimdFloat32x3 tfSimd3fAndNot(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { +inline Tsimd_f32x3_t tfS32x3FAndNot(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return vreinterpretq_f32_s32(vandq_s32(vmvnq_s32(vreinterpretq_s32_f32(arg1)), vreinterpretq_s32_f32(arg2))); } -inline TSimdFloat32x3 tfSimd3fOr(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { +inline Tsimd_f32x3_t tfS32x3FOr(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(arg1), vreinterpretq_s32_f32(arg2))); } -inline TSimdFloat32x3 tfSimd3fXor(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { +inline Tsimd_f32x3_t tfS32x3FXor(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(arg1), vreinterpretq_s32_f32(arg2))); } -inline TSimdFloat32x3 tfSimd3fFloor(TSimdFloat32x3 value) { return vrndmq_f32(value); } -inline TSimdFloat32x3 tfSimd3fCeil(TSimdFloat32x3 value) { return vrndpq_f32(value); } -inline TSimdFloat32x3 tfSimd3fRound(TSimdFloat32x3 value) { return vrndnq_f32(value); } -inline TSimdFloat32x3 tfSimd3fTruncate(TSimdFloat32x3 value) { return tfSimd3iToSimd3f(tfSimd3fToSimd3i(value)); } -inline TSimdFloat32x3 tfSimd3fMin(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { return vminq_f32(arg1, arg2); } -inline TSimdFloat32x3 tfSimd3fMax(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { return vmaxq_f32(arg1, arg2); } -inline TSimdFloat32x3 tfSimd3fClamp(TSimdFloat32x3 value, TSimdFloat32x3 min, TSimdFloat32x3 max) { - return tfSimd3fMax(min, tfSimd3fMin(value, max)); +inline Tsimd_f32x3_t tfS32x3FFloor(Tsimd_f32x3_t value) { return vrndmq_f32(value); } +inline Tsimd_f32x3_t tfS32x3FCeil(Tsimd_f32x3_t value) { return vrndpq_f32(value); } +inline Tsimd_f32x3_t tfS32x3FRound(Tsimd_f32x3_t value) { return vrndnq_f32(value); } +inline Tsimd_f32x3_t tfS32x3FTruncate(Tsimd_f32x3_t value) { return tfS32x3iToSimd3f(tfS32x3FToSimd3i(value)); } +inline Tsimd_f32x3_t tfS32x3FMin(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return vminq_f32(arg1, arg2); } +inline Tsimd_f32x3_t tfS32x3FMax(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return vmaxq_f32(arg1, arg2); } +inline Tsimd_f32x3_t tfS32x3FClamp(Tsimd_f32x3_t value, Tsimd_f32x3_t min, Tsimd_f32x3_t max) { + return tfS32x3FMax(min, tfS32x3FMin(value, max)); } -inline TSimdInt32x3 tfSimd3fToSimd3i(TSimdFloat32x3 value) { return vreinterpretq_f32_s32(value); } +inline Tsimd_i32x3_t tfS32x3FToSimd3i(Tsimd_f32x3_t value) { return vreinterpretq_s32_f32(value); } -inline TSimdFloat32x3 tfSimd3iToSimd3f(TSimdInt32x3 value) { return vreinterpretq_s32_f32(value); } +inline 
Tsimd_f32x3_t tfS32x3iToSimd3f(Tsimd_i32x3_t value) { return vreinterpretq_f32_s32(value); } -inline float tfSimd3fSelectIndex0(TSimdFloat32x3 value) { return vgetq_lane_f32(value, 0); } +inline float tfS32x3FSelectIndex0(Tsimd_f32x3_t value) { return vgetq_lane_f32(value, 0); } -inline float tfSimd3fSelectIndex1(TSimdFloat32x3 value) { return vgetq_lane_f32(value, 1); } +inline float tfS32x3FSelectIndex1(Tsimd_f32x3_t value) { return vgetq_lane_f32(value, 1); } -inline float tfSimd3fSelectIndex2(TSimdFloat32x3 value) { return vgetq_lane_f32(value, 2); } +inline float tfS32x3FSelectIndex2(Tsimd_f32x3_t value) { return vgetq_lane_f32(value, 2); } -inline TSimdFloat32x3 tfSimd3fAdd(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { return vaddq_f32(arg1, arg2); } -inline TSimdFloat32x3 tfSimd3fSub(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { return vsubq_f32(arg1, arg2); } -inline TSimdFloat32x3 tfSimd3fMul(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { return vmulq_f32(arg1, arg2); } -inline TSimdFloat32x3 tfSimd3fMadd(TSimdFloat32x3 mul1, TSimdFloat32x3 mul2, TSimdFloat32x3 add) { return vmlaq_f32(add, mul1, mul2); } -inline TSimdFloat32x3 tfSimd3fDiv(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { return vdivq_f32(arg1, arg2); } +inline Tsimd_f32x3_t tfS32x3FAdd(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return vaddq_f32(arg1, arg2); } +inline Tsimd_f32x3_t tfS32x3FSub(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return vsubq_f32(arg1, arg2); } +inline Tsimd_f32x3_t tfS32x3FMul(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return vmulq_f32(arg1, arg2); } +inline Tsimd_f32x3_t tfS32x3FMadd(Tsimd_f32x3_t mul1, Tsimd_f32x3_t mul2, Tsimd_f32x3_t add) { return vmlaq_f32(add, mul1, mul2); } +inline Tsimd_f32x3_t tfS32x3FDiv(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return vdivq_f32(arg1, arg2); } -inline TSimdFloat32x3 tfSimd3fAbs(TSimdFloat32x3 value) { return vabsq_f32(value); } +inline Tsimd_f32x3_t tfS32x3FAbs(Tsimd_f32x3_t value) { return vabsq_f32(value); } -inline TSimdFloat32x3 tfSimdFloat3Load(float x, float y, float z) { +inline Tsimd_f32x3_t tfSimdFloat3Load(float x, float y, float z) { const float values[4] = { x, y, z, 0.0f }; return vld1q_f32(values); } -inline TSimdInt32x3 tfSimdInt3Load(int32_t x, int32_t y, int32_t z) { +inline Tsimd_i32x3_t tfSimdInt3Load(int32_t x, int32_t y, int32_t z) { const int32_t values[4] = { x, y, z, 0 }; return vld1q_s32(values); } -inline TSimdFloat32x2 tfSimd3fToSimd2f(TSimdFloat32x3 value) { return vget_low_f32(value); } +inline Tsimd_f32x2_t tfS32x3FToSimd2f(Tsimd_f32x3_t value) { return vget_low_f32(value); } -static inline TSimdFloat32x4 tfSimdFloat3To4Splat0(TSimdFloat32x3 value) { return vdupq_laneq_f32(value, 0); } -static inline TSimdFloat32x4 tfSimdFloat3To4Splat1(TSimdFloat32x3 value) { return vdupq_laneq_f32(value, 1); } -static inline TSimdFloat32x4 tfSimdFloat3To4Splat2(TSimdFloat32x3 value) { return vdupq_laneq_f32(value, 2); } +static inline Tsimd_f32x4_t tfS32x3FTo32x4FSplat0(Tsimd_f32x3_t value) { return vdupq_laneq_f32(value, 0); } +static inline Tsimd_f32x4_t tfS32x3FTo32x4FSplat1(Tsimd_f32x3_t value) { return vdupq_laneq_f32(value, 1); } +static inline Tsimd_f32x4_t tfS32x3FTo32x4FSplat2(Tsimd_f32x3_t value) { return vdupq_laneq_f32(value, 2); } -inline TSimdFloat32x3 tfSimd3fSplatIndex0(TSimdFloat32x3 value) { return vdupq_laneq_f32(value, 0); } +inline Tsimd_f32x3_t tfS32x3FSplatIndex0(Tsimd_f32x3_t value) { return vdupq_laneq_f32(value, 0); } -inline TSimdFloat32x3 tfSimd3fSplatIndex1(TSimdFloat32x3 value) { return vdupq_laneq_f32(value, 1); } 
+inline Tsimd_f32x3_t tfS32x3FSplatIndex1(Tsimd_f32x3_t value) { return vdupq_laneq_f32(value, 1); } -inline TSimdFloat32x3 tfSimd3fSplatIndex2(TSimdFloat32x3 value) { return vdupq_laneq_f32(value, 2); } +inline Tsimd_f32x3_t tfS32x3FSplatIndex2(Tsimd_f32x3_t value) { return vdupq_laneq_f32(value, 2); } -inline TSimdInt32x3 tfSimd3iSplat(int32_t value) { return vdupq_n_s32(value); } +inline Tsimd_i32x3_t tfS32x3iSplat(int32_t value) { return vdupq_n_s32(value); } -inline TSimdFloat32x3 tfSimd3fSplat(float value) { return vdupq_n_f32(value); } +inline Tsimd_f32x3_t tfS32x3FSplat(float value) { return vdupq_n_f32(value); } -inline TSimdInt32x3 tfSimd3iCmpEq(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { return vceqq_s32(arg1, arg2); } -inline TSimdInt32x3 tfSimd3iCmpNeq(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { return vmvnq_s32(vceqq_s32(arg1, arg2)); } -inline TSimdInt32x3 tfSimd3iCmpGt(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { return vcgtq_s32(arg1, arg2); } -inline TSimdInt32x3 tfSimd3iCmpGtEq(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { return vcgeq_s32(arg1, arg2); } -inline TSimdInt32x3 tfSimd3iCmpLt(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { return vcltq_s32(arg1, arg2); } -inline TSimdInt32x3 tfSimd3iCmpLtEq(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { return vcleq_s32(arg1, arg2); } -inline bool tfSimd3fCmpAllEq(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { +inline Tsimd_i32x3_t tfS32x3iCmpEq(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return vceqq_s32(arg1, arg2); } +inline Tsimd_i32x3_t tfS32x3iCmpNeq(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return vmvnq_s32(vceqq_s32(arg1, arg2)); } +inline Tsimd_i32x3_t tfS32x3iCmpGt(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return vcgtq_s32(arg1, arg2); } +inline Tsimd_i32x3_t tfS32x3iCmpGtEq(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return vcgeq_s32(arg1, arg2); } +inline Tsimd_i32x3_t tfS32x3iCmpLt(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return vcltq_s32(arg1, arg2); } +inline Tsimd_i32x3_t tfS32x3iCmpLtEq(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return vcleq_s32(arg1, arg2); } +inline bool tfS32x3FCmpAllEq(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { // for (int i = 0; i < 3; i++) { // if (arg1.v[i] != arg2.v[i]) { // return false; @@ -99,7 +99,7 @@ inline bool tfSimd3fCmpAllEq(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { return true; } -inline bool tfSimd3iCmpAllEq(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { +inline bool tfS32x3iCmpAllEq(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { // for (int i = 0; i < 3; i++) { // if (arg1.v[i] != arg2.v[i]) { // return false; diff --git a/Forge/Math/Internal/TF_Simd32x3_scalar.inl b/Forge/Math/Internal/TF_Simd32x3_scalar.inl index ffae5b895c..b0f6c85ed9 100644 --- a/Forge/Math/Internal/TF_Simd32x3_scalar.inl +++ b/Forge/Math/Internal/TF_Simd32x3_scalar.inl @@ -4,115 +4,115 @@ #include "../TF_Simd32x3.h" #endif -inline TSimdInt32x3 tfSimd3iSelect(TSimdInt32x3 arg0, TSimdInt32x3 arg1, TSimdInt32x3 mask) { +inline Tsimd_i32x3_t tfS32x3iSelect(Tsimd_i32x3_t arg0, Tsimd_i32x3_t arg1, Tsimd_i32x3_t mask) { return { (mask.v[0] == 0) ? arg0.v[0] : arg1.v[0], (mask.v[1] == 0) ? arg0.v[1] : arg1.v[1], (mask.v[2] == 0) ? arg0.v[2] : arg1.v[2] }; } -inline TSimdFloat32x3 tfSimd3fSelect(TSimdFloat32x3 arg0, TSimdFloat32x3 arg1, TSimdFloat32x3 mask) { - TSimdInt32x3 intMask = tfSimd3fToSimd3i(mask); +inline Tsimd_f32x3_t tfS32x3FSelect(Tsimd_f32x3_t arg0, Tsimd_f32x3_t arg1, Tsimd_f32x3_t mask) { + Tsimd_i32x3_t intMask = tfS32x3FToSimd3i(mask); return { (intMask.v[0] == 0) ? arg0.v[0] : arg1.v[0], (intMask.v[1] == 0) ? 
arg0.v[1] : arg1.v[1], (intMask.v[2] == 0) ? arg0.v[2] : arg1.v[2] }; } -inline TSimdFloat32x3 tfSimd3fZero() { return { 0, 0, 0 }; } -inline TSimdInt32x3 tfSimd3iZero() { return { 0, 0, 0 }; } +inline Tsimd_f32x3_t tfS32x3FZero() { return { 0, 0, 0 }; } +inline Tsimd_i32x3_t tfS32x3iZero() { return { 0, 0, 0 }; } -inline TSimdInt32x3 tfSimd3iNot(TSimdInt32x3 value) { return { ~value.v[0], ~value.v[1], ~value.v[2] }; } -inline TSimdInt32x3 tfSimd3iAnd(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { +inline Tsimd_i32x3_t tfS32x3iNot(Tsimd_i32x3_t value) { return { ~value.v[0], ~value.v[1], ~value.v[2] }; } +inline Tsimd_i32x3_t tfS32x3iAnd(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return { arg1.v[0] & arg2.v[0], arg1.v[1] & arg2.v[1], arg1.v[2] & arg2.v[2] }; } -inline TSimdInt32x3 tfSimd3iAndNot(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { +inline Tsimd_i32x3_t tfS32x3iAndNot(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return { ~arg1.v[0] & arg2.v[0], ~arg1.v[1] & arg2.v[1], ~arg1.v[2] & arg2.v[2] }; } -inline TSimdInt32x3 tfSimd3iOr(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { +inline Tsimd_i32x3_t tfS32x3iOr(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return { arg1.v[0] | arg2.v[0], arg1.v[1] | arg2.v[1], arg1.v[2] | arg2.v[2] }; } -inline TSimdInt32x3 tfSimd3iXor(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { +inline Tsimd_i32x3_t tfS32x3iXor(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return { arg1.v[0] ^ arg2.v[0], arg1.v[1] ^ arg2.v[1], arg1.v[2] ^ arg2.v[2] }; } -inline TSimdFloat32x3 tfSimd3fNot(TSimdFloat32x3 value) { - TSimdInt32x3 result = { { ~((int32_t)value.v[0]), ~((int32_t)value.v[1]), ~((int32_t)value.v[2]) } }; - return tfSimd3iToSimd3f(result); +inline Tsimd_f32x3_t tfS32x3FNot(Tsimd_f32x3_t value) { + Tsimd_i32x3_t result = { { ~((int32_t)value.v[0]), ~((int32_t)value.v[1]), ~((int32_t)value.v[2]) } }; + return tfS32x3iToSimd3f(result); } -inline TSimdFloat32x3 tfSimd3fAnd(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { - TSimdInt32x3 result = { ((int32_t)arg1.v[0]) & ((int32_t)arg2.v[0]), ((int32_t)arg1.v[1]) & ((int32_t)arg2.v[1]), +inline Tsimd_f32x3_t tfS32x3FAnd(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { + Tsimd_i32x3_t result = { ((int32_t)arg1.v[0]) & ((int32_t)arg2.v[0]), ((int32_t)arg1.v[1]) & ((int32_t)arg2.v[1]), ((int32_t)arg1.v[2]) & ((int32_t)arg2.v[2]) }; - return tfSimd3iToSimd3f(result); + return tfS32x3iToSimd3f(result); } -inline TSimdFloat32x3 tfSimd3fAndNot(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { - TSimdInt32x3 result = { { ~((int32_t)arg1.v[0]) & ((int32_t)arg2.v[0]), ~((int32_t)arg1.v[1]) & ((int32_t)arg2.v[1]), +inline Tsimd_f32x3_t tfS32x3FAndNot(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { + Tsimd_i32x3_t result = { { ~((int32_t)arg1.v[0]) & ((int32_t)arg2.v[0]), ~((int32_t)arg1.v[1]) & ((int32_t)arg2.v[1]), ~((int32_t)arg1.v[2]) & ((int32_t)arg2.v[2]) } }; - return tfSimd3iToSimd3f(result); + return tfS32x3iToSimd3f(result); } -inline TSimdFloat32x3 tfSimd3fOr(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { - TSimdInt32x3 result = { { ((int32_t)arg1.v[0]) | ((int32_t)arg2.v[0]), ((int32_t)arg1.v[1]) | ((int32_t)arg2.v[1]), +inline Tsimd_f32x3_t tfS32x3FOr(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { + Tsimd_i32x3_t result = { { ((int32_t)arg1.v[0]) | ((int32_t)arg2.v[0]), ((int32_t)arg1.v[1]) | ((int32_t)arg2.v[1]), ((int32_t)arg1.v[2]) | ((int32_t)arg2.v[2]) } }; - return tfSimd3iToSimd3f(result); + return tfS32x3iToSimd3f(result); } -inline TSimdFloat32x3 tfSimd3fXor(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { - TSimdInt32x3 result = { { ((int32_t)arg1.v[0]) ^ ((int32_t)arg2.v[0]), 
((int32_t)arg1.v[1]) ^ ((int32_t)arg2.v[1]), +inline Tsimd_f32x3_t tfS32x3FXor(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { + Tsimd_i32x3_t result = { { ((int32_t)arg1.v[0]) ^ ((int32_t)arg2.v[0]), ((int32_t)arg1.v[1]) ^ ((int32_t)arg2.v[1]), ((int32_t)arg1.v[2]) ^ ((int32_t)arg2.v[2]) } }; - return tfSimd3iToSimd3f(result); + return tfS32x3iToSimd3f(result); } -inline TSimdFloat32x3 tfSimd3fFloor(TSimdFloat32x3 value) { return { { floorf(value.v[0]), floorf(value.v[1]), floorf(value.v[2]) } }; } -inline TSimdFloat32x3 tfSimd3fCeil(TSimdFloat32x3 value) { return { { ceilf(value.v[0]), ceilf(value.v[1]), ceilf(value.v[2]) } }; } -inline TSimdFloat32x3 tfSimd3fRound(TSimdFloat32x3 value) { +inline Tsimd_f32x3_t tfS32x3FFloor(Tsimd_f32x3_t value) { return { { floorf(value.v[0]), floorf(value.v[1]), floorf(value.v[2]) } }; } +inline Tsimd_f32x3_t tfS32x3FCeil(Tsimd_f32x3_t value) { return { { ceilf(value.v[0]), ceilf(value.v[1]), ceilf(value.v[2]) } }; } +inline Tsimd_f32x3_t tfS32x3FRound(Tsimd_f32x3_t value) { // While 'roundf' may seem the obvious choice here, it rounds halfway cases // away from zero regardless of the current rounding mode, but 'rintf' uses // the current rounding mode which is consistent with other implementations. return { { rintf(value.v[0]), rintf(value.v[1]), rintf(value.v[2]) } }; } -inline TSimdFloat32x3 tfSimd3fTruncate(TSimdFloat32x3 value) { return tfSimd3iToSimd3f(tfSimd3fToSimd3i(value)); } -inline TSimdFloat32x3 tfSimd3fMin(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { +inline Tsimd_f32x3_t tfS32x3FTruncate(Tsimd_f32x3_t value) { return tfS32x3iToSimd3f(tfS32x3FToSimd3i(value)); } +inline Tsimd_f32x3_t tfS32x3FMin(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return { { fminf(arg1.v[0], arg2.v[0]), fminf(arg1.v[1], arg2.v[1]), fminf(arg1.v[2], arg2.v[2]) } }; } -inline TSimdFloat32x3 tfSimd3fMax(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { +inline Tsimd_f32x3_t tfS32x3FMax(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return { { fmaxf(arg1.v[0], arg2.v[0]), fmaxf(arg1.v[1], arg2.v[1]), fmaxf(arg1.v[2], arg2.v[2]) } }; } -inline TSimdFloat32x3 tfSimd3fClamp(TSimdFloat32x3 value, TSimdFloat32x3 min, TSimdFloat32x3 max) { - return tfSimd3fMax(min, tfSimd3fMin(value, max)); +inline Tsimd_f32x3_t tfS32x3FClamp(Tsimd_f32x3_t value, Tsimd_f32x3_t min, Tsimd_f32x3_t max) { + return tfS32x3FMax(min, tfS32x3FMin(value, max)); } -inline TSimdInt32x3 tfSimd3fToSimd3i(TSimdFloat32x3 value) { return { (int32_t)value.v[0], (int32_t)value.v[1], (int32_t)value.v[2] }; } +inline Tsimd_i32x3_t tfS32x3FToSimd3i(Tsimd_f32x3_t value) { return { (int32_t)value.v[0], (int32_t)value.v[1], (int32_t)value.v[2] }; } -inline TSimdFloat32x3 tfSimd3iToSimd3f(TSimdInt32x3 value) { return { (float)value.v[0], (float)value.v[1], (float)value.v[2] }; } +inline Tsimd_f32x3_t tfS32x3iToSimd3f(Tsimd_i32x3_t value) { return { (float)value.v[0], (float)value.v[1], (float)value.v[2] }; } -inline float tfSimd3fSelectIndex0(TSimdFloat32x3 value) { return value.v[0]; } +inline float tfS32x3FSelectIndex0(Tsimd_f32x3_t value) { return value.v[0]; } -inline float tfSimd3fSelectIndex1(TSimdFloat32x3 value) { return value.v[1]; } +inline float tfS32x3FSelectIndex1(Tsimd_f32x3_t value) { return value.v[1]; } -inline float tfSimd3fSelectIndex2(TSimdFloat32x3 value) { return value.v[2]; } +inline float tfS32x3FSelectIndex2(Tsimd_f32x3_t value) { return value.v[2]; } -inline TSimdFloat32x3 tfSimd3fAdd(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { +inline Tsimd_f32x3_t tfS32x3FAdd(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return 
{ arg1.v[0] + arg2.v[0], arg1.v[1] + arg2.v[1], arg1.v[2] + arg2.v[2], }; } -inline TSimdFloat32x3 tfSimd3fSub(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { +inline Tsimd_f32x3_t tfS32x3FSub(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return { arg1.v[0] - arg2.v[0], arg1.v[1] - arg2.v[1], arg1.v[2] - arg2.v[2], }; } -inline TSimdFloat32x3 tfSimd3fMul(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { +inline Tsimd_f32x3_t tfS32x3FMul(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return { arg1.v[0] * arg2.v[0], arg1.v[1] * arg2.v[1], arg1.v[2] * arg2.v[2], }; } -inline TSimdFloat32x3 tfSimd3fMadd(TSimdFloat32x3 mul1, TSimdFloat32x3 mul2, TSimdFloat32x3 add) { - return tfSimd3fAdd(tfSimd3fMul(mul1, mul2), add); +inline Tsimd_f32x3_t tfS32x3FMadd(Tsimd_f32x3_t mul1, Tsimd_f32x3_t mul2, Tsimd_f32x3_t add) { + return tfS32x3FAdd(tfS32x3FMul(mul1, mul2), add); } -inline TSimdFloat32x3 tfSimd3fDiv(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { +inline Tsimd_f32x3_t tfS32x3FDiv(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return { arg1.v[0] / arg2.v[0], arg1.v[1] / arg2.v[1], arg1.v[2] / arg2.v[2] }; } -inline TSimdFloat32x3 tfSimd3fAbs(TSimdFloat32x3 value) { +inline Tsimd_f32x3_t tfS32x3FAbs(Tsimd_f32x3_t value) { return { abs(value.v[0]), abs(value.v[1]), @@ -120,47 +120,47 @@ inline TSimdFloat32x3 tfSimd3fAbs(TSimdFloat32x3 value) { }; } -inline TSimdFloat32x3 tfSimdFloat3x32Load(float x, float y, float z) { return { x, y, z }; } -inline TSimdInt32x3 tfSimdInt3x32Load(int32_t x, int32_t y, int32_t z) { return { x, y, z }; } +inline Tsimd_f32x3_t tfSimd3x32FLoad(float x, float y, float z) { return { x, y, z }; } +inline Tsimd_i32x3_t tfSimd3x32ILoad(int32_t x, int32_t y, int32_t z) { return { x, y, z }; } -inline TSimdFloat32x2 tfSimd3fToSimd2f(TSimdFloat32x3 value) { return { value.v[0], value.v[1] }; } +inline Tsimd_f32x2_t tfS32x3FToSimd2f(Tsimd_f32x3_t value) { return { value.v[0], value.v[1] }; } -static inline TSimdFloat32x4 tfSimdFloat3To4Splat0(TSimdFloat32x3 value) { return { value.v[0], value.v[0], value.v[0], value.v[0] }; } -static inline TSimdFloat32x4 tfSimdFloat3To4Splat1(TSimdFloat32x3 value) { return { value.v[1], value.v[1], value.v[1], value.v[1] }; } -static inline TSimdFloat32x4 tfSimdFloat3To4Splat2(TSimdFloat32x3 value) { return { value.v[2], value.v[2], value.v[2], value.v[2] }; } +static inline Tsimd_f32x4_t tfS32x3FTo32x4FSplat0(Tsimd_f32x3_t value) { return { value.v[0], value.v[0], value.v[0], value.v[0] }; } +static inline Tsimd_f32x4_t tfS32x3FTo32x4FSplat1(Tsimd_f32x3_t value) { return { value.v[1], value.v[1], value.v[1], value.v[1] }; } +static inline Tsimd_f32x4_t tfS32x3FTo32x4FSplat2(Tsimd_f32x3_t value) { return { value.v[2], value.v[2], value.v[2], value.v[2] }; } -inline TSimdFloat32x3 tfSimd3fSplatIndex0(TSimdFloat32x3 value) { return { value.v[0], value.v[0], value.v[0] }; } -inline TSimdFloat32x3 tfSimd3fSplatIndex1(TSimdFloat32x3 value) { return { value.v[1], value.v[1], value.v[1] }; } -inline TSimdFloat32x3 tfSimd3fSplatIndex2(TSimdFloat32x3 value) { return { value.v[2], value.v[2], value.v[2] }; } +inline Tsimd_f32x3_t tfS32x3FSplatIndex0(Tsimd_f32x3_t value) { return { value.v[0], value.v[0], value.v[0] }; } +inline Tsimd_f32x3_t tfS32x3FSplatIndex1(Tsimd_f32x3_t value) { return { value.v[1], value.v[1], value.v[1] }; } +inline Tsimd_f32x3_t tfS32x3FSplatIndex2(Tsimd_f32x3_t value) { return { value.v[2], value.v[2], value.v[2] }; } -inline TSimdInt32x3 tfSimd3iSplat(int32_t value) { return { value, value, value }; } -inline TSimdFloat32x3 tfSimd3fSplat(float value) { 
return { value, value, value }; } +inline Tsimd_i32x3_t tfS32x3iSplat(int32_t value) { return { value, value, value }; } +inline Tsimd_f32x3_t tfS32x3FSplat(float value) { return { value, value, value }; } -inline TSimdInt32x3 tfSimd3iCmpEq(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { +inline Tsimd_i32x3_t tfS32x3iCmpEq(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return { { (arg1.v[0] == arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] == arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[2] == arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; } -inline TSimdInt32x3 tfSimd3iCmpNeq(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { +inline Tsimd_i32x3_t tfS32x3iCmpNeq(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return { { (arg1.v[0] != arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] != arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[2] != arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; } -inline TSimdInt32x3 tfSimd3iCmpGt(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { +inline Tsimd_i32x3_t tfS32x3iCmpGt(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return { { (arg1.v[0] > arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] > arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[2] > arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; } -inline TSimdInt32x3 tfSimd3iCmpGtEq(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { +inline Tsimd_i32x3_t tfS32x3iCmpGtEq(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return { { (arg1.v[0] >= arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] >= arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[2] >= arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; } -inline TSimdInt32x3 tfSimd3iCmpLt(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { +inline Tsimd_i32x3_t tfS32x3iCmpLt(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return { { (arg1.v[0] < arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] < arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[2] < arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; } -inline TSimdInt32x3 tfSimd3iCmpLtEq(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { +inline Tsimd_i32x3_t tfS32x3iCmpLtEq(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return { { (arg1.v[0] <= arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] <= arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[2] <= arg2.v[2]) ?
(int32_t)0xFFFFFFFF : 0x00000000 } }; } -inline bool tfSimd3fCmpAllEq(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { +inline bool tfS32x3FCmpAllEq(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { for (int i = 0; i < 3; i++) { if (arg1.v[i] != arg2.v[i]) { return false; @@ -169,7 +169,7 @@ inline bool tfSimd3fCmpAllEq(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { return true; } -inline bool tfSimd3iCmpAllEq(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { +inline bool tfS32x3iCmpAllEq(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { for (int i = 0; i < 3; i++) { if (arg1.v[i] != arg2.v[i]) { return false; @@ -178,7 +178,7 @@ inline bool tfSimd3iCmpAllEq(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { return true; } -static inline bool tfSimdFloat32x3CmpAllLt(TSimdFloat32x3 a, TSimdFloat32x3 b) { +static inline bool tfSimdFloat32x3CmpAllLt(Tsimd_f32x3_t a, Tsimd_f32x3_t b) { for (int i = 0; i < 3; i++) { if (a.v[i] >= b.v[i]) { return false; @@ -187,13 +187,13 @@ static inline bool tfSimdFloat32x3CmpAllLt(TSimdFloat32x3 a, TSimdFloat32x3 b) { return true; } -static inline TSimdFloat32x3 tfSimdFloat3x32ReplaceIndex0ByValue(TSimdFloat32x3 input, float value) { +static inline Tsimd_f32x3_t tfSimd3x32FReplaceIndex0ByValue(Tsimd_f32x3_t input, float value) { return {value, input.v[1], input.v[2]}; } -static inline TSimdFloat32x3 tfSimdFloat3x32ReplaceIndex1ByValue(TSimdFloat32x3 input, float value){ +static inline Tsimd_f32x3_t tfSimd3x32FReplaceIndex1ByValue(Tsimd_f32x3_t input, float value){ return {input.v[0], value, input.v[2]}; }; -static inline TSimdFloat32x3 tfSimdFloat3x32ReplaceIndex2ByValue(TSimdFloat32x3 input, float value){ +static inline Tsimd_f32x3_t tfSimd3x32FReplaceIndex2ByValue(Tsimd_f32x3_t input, float value){ return {input.v[0], input.v[1], value}; }; diff --git a/Forge/Math/Internal/TF_Simd32x3_sse.inl b/Forge/Math/Internal/TF_Simd32x3_sse.inl index 04c50f531a..9b114cdaea 100644 --- a/Forge/Math/Internal/TF_Simd32x3_sse.inl +++ b/Forge/Math/Internal/TF_Simd32x3_sse.inl @@ -4,111 +4,111 @@ #include "../TF_Simd32x3.h" #endif -inline TSimdInt32x3 tfSimd3iSelect(TSimdInt32x3 arg0, TSimdInt32x3 arg1, TSimdInt32x3 mask) { return _mm_blendv_epi8(arg0, arg1, mask); } -inline TSimdFloat32x3 tfSimd3fSelect(TSimdFloat32x3 arg0, TSimdFloat32x3 arg1, TSimdFloat32x3 mask) { +inline Tsimd_i32x3_t tfS32x3iSelect(Tsimd_i32x3_t arg0, Tsimd_i32x3_t arg1, Tsimd_i32x3_t mask) { return _mm_blendv_epi8(arg0, arg1, mask); } +inline Tsimd_f32x3_t tfS32x3FSelect(Tsimd_f32x3_t arg0, Tsimd_f32x3_t arg1, Tsimd_f32x3_t mask) { return _mm_blendv_ps(arg0, arg1, mask); } -inline TSimdFloat32x3 tfSimd3fZero() { return _mm_setzero_ps(); } -inline TSimdInt32x3 tfSimd3iZero() { return _mm_setzero_si128(); } +inline Tsimd_f32x3_t tfS32x3FZero() { return _mm_setzero_ps(); } +inline Tsimd_i32x3_t tfS32x3iZero() { return _mm_setzero_si128(); } -inline TSimdInt32x3 tfSimd3iNot(TSimdInt32x3 value) { return _mm_andnot_si128(value, _mm_set1_epi32(TF_SIMDI_MAX)); } -inline TSimdInt32x3 tfSimd3iAnd(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { return _mm_and_si128(arg1, arg2); } -inline TSimdInt32x3 tfSimd3iAndNot(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { return _mm_andnot_si128(arg1, arg2); } -inline TSimdInt32x3 tfSimd3iOr(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { return _mm_or_si128(arg1, arg2); } -inline TSimdInt32x3 tfSimd3iXor(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { return _mm_xor_si128(arg1, arg2); } +inline Tsimd_i32x3_t tfS32x3iNot(Tsimd_i32x3_t value) { return _mm_andnot_si128(value, _mm_set1_epi32(TF_SIMDI_MAX)); } +inline Tsimd_i32x3_t 
tfS32x3iAnd(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return _mm_and_si128(arg1, arg2); } +inline Tsimd_i32x3_t tfS32x3iAndNot(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return _mm_andnot_si128(arg1, arg2); } +inline Tsimd_i32x3_t tfS32x3iOr(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return _mm_or_si128(arg1, arg2); } +inline Tsimd_i32x3_t tfS32x3iXor(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return _mm_xor_si128(arg1, arg2); } -inline TSimdFloat32x3 tfSimd3fNot(TSimdFloat32x3 value) { - const TSimdFloat32x3 invert = tfSimd3fSplat((float)(0xFFFFFFFF)); +inline Tsimd_f32x3_t tfS32x3FNot(Tsimd_f32x3_t value) { + const Tsimd_f32x3_t invert = tfS32x3FSplat((float)(0xFFFFFFFF)); return _mm_andnot_ps(value, invert); } -inline TSimdFloat32x3 tfSimd3fAnd(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { return _mm_and_ps(arg1, arg2); } -inline TSimdFloat32x3 tfSimd3fAndNot(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { return _mm_andnot_ps(arg1, arg2); } -inline TSimdFloat32x3 tfSimd3fOr(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { return _mm_or_ps(arg1, arg2); } -inline TSimdFloat32x3 tfSimd3fXor(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { return _mm_xor_ps(arg1, arg2); } - -inline TSimdFloat32x3 tfSimd3fFloor(TSimdFloat32x3 value) { return _mm_floor_ps(value); } -inline TSimdFloat32x3 tfSimd3fCeil(TSimdFloat32x3 value) { return _mm_ceil_ps(value); } -inline TSimdFloat32x3 tfSimd3fRound(TSimdFloat32x3 value) { return _mm_round_ps(value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } -inline TSimdFloat32x3 tfSimd3fTruncate(TSimdFloat32x3 value) { return tfSimd3iToSimd3f(tfSimd3fToSimd3i(value)); } -inline TSimdFloat32x3 tfSimd3fMin(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { return _mm_min_ps(arg1, arg2); } -inline TSimdFloat32x3 tfSimd3fMax(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { return _mm_max_ps(arg1, arg2); } -inline TSimdFloat32x3 tfSimd3fClamp(TSimdFloat32x3 value, TSimdFloat32x3 min, TSimdFloat32x3 max) { - return tfSimd3fMax(min, tfSimd3fMin(value, max)); +inline Tsimd_f32x3_t tfS32x3FAnd(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return _mm_and_ps(arg1, arg2); } +inline Tsimd_f32x3_t tfS32x3FAndNot(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return _mm_andnot_ps(arg1, arg2); } +inline Tsimd_f32x3_t tfS32x3FOr(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return _mm_or_ps(arg1, arg2); } +inline Tsimd_f32x3_t tfS32x3FXor(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return _mm_xor_ps(arg1, arg2); } + +inline Tsimd_f32x3_t tfS32x3FFloor(Tsimd_f32x3_t value) { return _mm_floor_ps(value); } +inline Tsimd_f32x3_t tfS32x3FCeil(Tsimd_f32x3_t value) { return _mm_ceil_ps(value); } +inline Tsimd_f32x3_t tfS32x3FRound(Tsimd_f32x3_t value) { return _mm_round_ps(value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } +inline Tsimd_f32x3_t tfS32x3FTruncate(Tsimd_f32x3_t value) { return tfS32x3iToSimd3f(tfS32x3FToSimd3i(value)); } +inline Tsimd_f32x3_t tfS32x3FMin(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return _mm_min_ps(arg1, arg2); } +inline Tsimd_f32x3_t tfS32x3FMax(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return _mm_max_ps(arg1, arg2); } +inline Tsimd_f32x3_t tfS32x3FClamp(Tsimd_f32x3_t value, Tsimd_f32x3_t min, Tsimd_f32x3_t max) { + return tfS32x3FMax(min, tfS32x3FMin(value, max)); } -inline TSimdInt32x3 tfSimd3fToSimd3i(TSimdFloat32x3 value) { return _mm_castps_si128(value); } -inline TSimdFloat32x3 tfSimd3iToSimd3f(TSimdInt32x3 value) { return _mm_castsi128_ps(value); } +inline Tsimd_i32x3_t tfS32x3FToSimd3i(Tsimd_f32x3_t value) { return _mm_castps_si128(value); } +inline Tsimd_f32x3_t 
tfS32x3iToSimd3f(Tsimd_i32x3_t value) { return _mm_castsi128_ps(value); } -inline float tfSimd3fSelectIndex0(TSimdFloat32x3 value) { return _mm_cvtss_f32(value); } -inline float tfSimd3fSelectIndex1(TSimdFloat32x3 value) { return tfSimd3fSelectIndex0(tfSimd3fSplatIndex1(value)); } -inline float tfSimd3fSelectIndex2(TSimdFloat32x3 value) { return tfSimd3fSelectIndex0(tfSimd3fSplatIndex2(value)); } +inline float tfS32x3FSelectIndex0(Tsimd_f32x3_t value) { return _mm_cvtss_f32(value); } +inline float tfS32x3FSelectIndex1(Tsimd_f32x3_t value) { return tfS32x3FSelectIndex0(tfS32x3FSplatIndex1(value)); } +inline float tfS32x3FSelectIndex2(Tsimd_f32x3_t value) { return tfS32x3FSelectIndex0(tfS32x3FSplatIndex2(value)); } -static inline TSimdFloat32x3 tfSimdFloat3x32ReplaceIndex0ByValue(TSimdFloat32x3 input, float value) { - return _mm_blend_ps(input, tfSimd3fSplat(value), 0b0001); +static inline Tsimd_f32x3_t tfSimd3x32FReplaceIndex0ByValue(Tsimd_f32x3_t input, float value) { + return _mm_blend_ps(input, tfS32x3FSplat(value), 0b0001); } -static inline TSimdFloat32x3 tfSimdFloat3x32ReplaceIndex1ByValue(TSimdFloat32x3 input, float value) { - return _mm_blend_ps(input, tfSimd3fSplat(value), 0b0010); +static inline Tsimd_f32x3_t tfSimd3x32FReplaceIndex1ByValue(Tsimd_f32x3_t input, float value) { + return _mm_blend_ps(input, tfS32x3FSplat(value), 0b0010); } -static inline TSimdFloat32x3 tfSimdFloat3x32ReplaceIndex2ByValue(TSimdFloat32x3 input, float value) { - return _mm_blend_ps(input, tfSimd3fSplat(value), 0b0100); +static inline Tsimd_f32x3_t tfSimd3x32FReplaceIndex2ByValue(Tsimd_f32x3_t input, float value) { + return _mm_blend_ps(input, tfS32x3FSplat(value), 0b0100); } -inline TSimdFloat32x3 tfSimd3fAdd(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { return _mm_add_ps(arg1, arg2); } -inline TSimdFloat32x3 tfSimd3fSub(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { return _mm_sub_ps(arg1, arg2); } -inline TSimdFloat32x3 tfSimd3fMul(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { return _mm_mul_ps(arg1, arg2); } -inline TSimdFloat32x3 tfSimd3fMadd(TSimdFloat32x3 mul1, TSimdFloat32x3 mul2, TSimdFloat32x3 add) { +inline Tsimd_f32x3_t tfS32x3FAdd(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return _mm_add_ps(arg1, arg2); } +inline Tsimd_f32x3_t tfS32x3FSub(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return _mm_sub_ps(arg1, arg2); } +inline Tsimd_f32x3_t tfS32x3FMul(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return _mm_mul_ps(arg1, arg2); } +inline Tsimd_f32x3_t tfS32x3FMadd(Tsimd_f32x3_t mul1, Tsimd_f32x3_t mul2, Tsimd_f32x3_t add) { #if 0 return _mm_fmadd_ps(mul1, mul2, add); // Requires FMA CPUID #else - return tfSimd3fAdd(tfSimd3fMul(mul1, mul2), add); + return tfS32x3FAdd(tfS32x3FMul(mul1, mul2), add); #endif } -inline TSimdFloat32x3 tfSimd3fDiv(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { return _mm_div_ps(arg1, arg2); } +inline Tsimd_f32x3_t tfS32x3FDiv(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { return _mm_div_ps(arg1, arg2); } -inline TSimdFloat32x3 tfSimd3fAbs(TSimdFloat32x3 value) { - const TSimdFloat32x4 signMask = tfSimd3iToSimd3f(tfSimd3iSplat(0x7FFFFFFF)); +inline Tsimd_f32x3_t tfS32x3FAbs(Tsimd_f32x3_t value) { + const Tsimd_f32x4_t signMask = tfS32x3iToSimd3f(tfS32x3iSplat(0x7FFFFFFF)); return _mm_and_ps(value, signMask); } -inline TSimdFloat32x3 tfSimdFloat3x32Load(float x, float y, float z) { return _mm_set_ps(0.0f, z, y, x); } -inline TSimdInt32x3 tfSimdInt3x32Load(int32_t x, int32_t y, int32_t z) { return _mm_set_epi32(0.0f, x, y, z); } +inline Tsimd_f32x3_t tfSimd3x32FLoad(float x, float y, float z) { 
return _mm_set_ps(0.0f, z, y, x); } +inline Tsimd_i32x3_t tfSimd3x32ILoad(int32_t x, int32_t y, int32_t z) { return _mm_set_epi32(0, z, y, x); } -inline TSimdFloat32x2 tfSimd3fToSimd2f(TSimdFloat32x3 value) { return value; } +inline Tsimd_f32x2_t tfS32x3FToSimd2f(Tsimd_f32x3_t value) { return value; } -static inline TSimdFloat32x4 tfSimdFloat3To4Splat0(TSimdFloat32x3 value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(0, 0, 0, 0)); } -static inline TSimdFloat32x4 tfSimdFloat3To4Splat1(TSimdFloat32x3 value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(1, 1, 1, 1)); } -static inline TSimdFloat32x4 tfSimdFloat3To4Splat2(TSimdFloat32x3 value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(2, 2, 2, 2)); } +static inline Tsimd_f32x4_t tfS32x3FTo32x4FSplat0(Tsimd_f32x3_t value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(0, 0, 0, 0)); } +static inline Tsimd_f32x4_t tfS32x3FTo32x4FSplat1(Tsimd_f32x3_t value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(1, 1, 1, 1)); } +static inline Tsimd_f32x4_t tfS32x3FTo32x4FSplat2(Tsimd_f32x3_t value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(2, 2, 2, 2)); } -inline TSimdFloat32x3 tfSimd3fSplatIndex0(TSimdFloat32x3 value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(0, 0, 0, 0)); } -inline TSimdFloat32x3 tfSimd3fSplatIndex1(TSimdFloat32x3 value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(1, 1, 1, 1)); } -inline TSimdFloat32x3 tfSimd3fSplatIndex2(TSimdFloat32x3 value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(2, 2, 2, 2)); } +inline Tsimd_f32x3_t tfS32x3FSplatIndex0(Tsimd_f32x3_t value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(0, 0, 0, 0)); } +inline Tsimd_f32x3_t tfS32x3FSplatIndex1(Tsimd_f32x3_t value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(1, 1, 1, 1)); } +inline Tsimd_f32x3_t tfS32x3FSplatIndex2(Tsimd_f32x3_t value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(2, 2, 2, 2)); } -inline TSimdInt32x3 tfSimd3iSplat(int32_t value) { return _mm_set1_epi32(value); } -inline TSimdFloat32x3 tfSimd3fSplat(float value) { return _mm_set1_ps(value); } +inline Tsimd_i32x3_t tfS32x3iSplat(int32_t value) { return _mm_set1_epi32(value); } +inline Tsimd_f32x3_t tfS32x3FSplat(float value) { return _mm_set1_ps(value); } -inline TSimdInt32x3 tfSimd3iCmpEq(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { return _mm_cmpeq_epi32(arg1, arg2); } -inline TSimdInt32x3 tfSimd3iCmpNeq(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { +inline Tsimd_i32x3_t tfS32x3iCmpEq(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return _mm_cmpeq_epi32(arg1, arg2); } +inline Tsimd_i32x3_t tfS32x3iCmpNeq(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return _mm_xor_si128(_mm_cmpeq_epi32(arg1, arg2), _mm_set1_epi32((int32_t)0xFFFFFFFF)); } -inline TSimdInt32x3 tfSimd3iCmpGt(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { return _mm_cmpgt_epi32(arg1, arg2); } -inline TSimdInt32x3 tfSimd3iCmpGtEq(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { +inline Tsimd_i32x3_t tfS32x3iCmpGt(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return _mm_cmpgt_epi32(arg1, arg2); } +inline Tsimd_i32x3_t tfS32x3iCmpGtEq(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return _mm_or_si128(_mm_cmpgt_epi32(arg1, arg2), _mm_cmpeq_epi32(arg1, arg2)); } -inline TSimdInt32x3 tfSimd3iCmpLt(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { return _mm_cmplt_epi32(arg1, arg2); } -inline TSimdInt32x3 tfSimd3iCmpLtEq(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { +inline Tsimd_i32x3_t tfS32x3iCmpLt(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { return _mm_cmplt_epi32(arg1, arg2); } +inline Tsimd_i32x3_t tfS32x3iCmpLtEq(Tsimd_i32x3_t arg1,
Tsimd_i32x3_t arg2) { return _mm_or_si128(_mm_cmplt_epi32(arg1, arg2), _mm_cmpeq_epi32(arg1, arg2)); } -inline bool tfSimd3fCmpAllEq(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2) { - TSimdFloat32x3 compare = tfSimd3fCmpEq(arg1, arg2); +inline bool tfS32x3FCmpAllEq(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2) { + Tsimd_f32x3_t compare = tfS32x3FCmpEq(arg1, arg2); return (_mm_movemask_ps(compare) & 0b0111) == 0b0111; } -inline bool tfSimd3iCmpAllEq(TSimdInt32x3 arg1, TSimdInt32x3 arg2) { - const TSimdInt32x3 compare = tfSimd3iCmpEq(arg1, arg2); +inline bool tfS32x3iCmpAllEq(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2) { + const Tsimd_i32x3_t compare = tfS32x3iCmpEq(arg1, arg2); return (_mm_movemask_epi8(compare) & 0b0111) == 0b0111; } -static inline bool tfSimdFloat32x3CmpAllLt(TSimdFloat32x3 a, TSimdFloat32x3 b) { - TSimdFloat32x3 compare = tfSimd3fCmpLt(a, b); +static inline bool tfSimdFloat32x3CmpAllLt(Tsimd_f32x3_t a, Tsimd_f32x3_t b) { + Tsimd_f32x3_t compare = tfS32x3FCmpLt(a, b); return (_mm_movemask_ps(compare) & 0b0111) == 0b0111; } diff --git a/Forge/Math/Internal/TF_Simd32x4_neon.inl b/Forge/Math/Internal/TF_Simd32x4_neon.inl index 5b8fa6e924..cc92670a69 100644 --- a/Forge/Math/Internal/TF_Simd32x4_neon.inl +++ b/Forge/Math/Internal/TF_Simd32x4_neon.inl @@ -4,118 +4,118 @@ #include "../TF_Simd32x4.h" #endif -static inline TSimdFloat32x4 tfSimd4fReplaceIndex0ByValue(TSimdFloat32x4 input, float value) { return vsetq_lane_f32(value, input, 0); } -static inline TSimdFloat32x4 tfSimd4fReplaceIndex1ByValue(TSimdFloat32x4 input, float value) { return vsetq_lane_f32(value, input, 1); } -static inline TSimdFloat32x4 tfSimd4fReplaceIndex2ByValue(TSimdFloat32x4 input, float value) { return vsetq_lane_f32(value, input, 2); } -static inline TSimdFloat32x4 tfSimd4fReplaceIndex3ByValue(TSimdFloat32x4 input, float value) { return vsetq_lane_f32(value, input, 3); } - -inline TSimdInt32x4 tfSimd4iSelect(TSimdInt32x4 arg0, TSimdInt32x4 arg1, TSimdInt32x4 mask) { return vbslq_s32(mask, arg1, arg1); } -inline TSimdFloat32x4 tfSimd4fSelect(TSimdFloat32x4 arg0, TSimdFloat32x4 arg1, TSimdFloat32x4 mask) { return vbslq_f32(mask, arg1, arg1); } - -inline TSimdFloat32x4 tfSimd4fZero() { return vmovq_n_f32(0.0f); } -inline TSimdInt32x4 tfSimd4iZero() { return vmovq_n_s32(0); } - -inline TSimdInt32x4 tfSimd4iNot(TSimdInt32x4 value) { return vmvnq_s32(value); } -inline TSimdInt32x4 tfSimd4iAnd(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { return vandq_s32(arg1, arg2); } -inline TSimdInt32x4 tfSimd4iAndNot(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { return vandq_s32(vmvnq_s32(arg1), arg2); } -inline TSimdInt32x4 tfSimd4iOr(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { return vorrq_s32(arg1, arg2); } -inline TSimdInt32x4 tfSimd4iXor(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { return veorq_s32(arg1, arg2); } - -inline TSimdFloat32x4 tfSimd4fNot(TSimdFloat32x4 value) { return vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(value))); } -inline TSimdFloat32x4 tfSimd4fAnd(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - return vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(arg1), vreinterpretq_s32_f32(arg2))); -} -inline TSimdFloat32x4 tfSimd4fAndNot(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - return vreinterpretq_f32_s32(vandq_s32(vmvnq_s32(vreinterpretq_s32_f32(arg1)), vreinterpretq_s32_f32(arg2))); -} -inline TSimdFloat32x4 tfSimd4fOr(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - return vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(arg1), vreinterpretq_s32_f32(arg2))); -} -inline TSimdFloat32x4 tfSimd4fXor(TSimdFloat32x4 arg1, 
TSimdFloat32x4 arg2) { - return vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(arg1), vreinterpretq_s32_f32(arg2))); -} - -inline TSimdFloat32x4 tfSimd4fFloor(TSimdFloat32x4 value) { return vrndmq_f32(value); } -inline TSimdFloat32x4 tfSimd4fCeil(TSimdFloat32x4 value) { return vrndpq_f32(value); } -inline TSimdFloat32x4 tfSimd4fRound(TSimdFloat32x4 value) { return vrndnq_f32(value); } -inline TSimdFloat32x4 tfSimd4fTruncate(TSimdFloat32x4 value) { return tfSimd4iToSimd4f(tfSimd4fToSimd4i(value)); } -inline TSimdFloat32x4 tfSimd4fMin(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return vminq_f32(arg1, arg2); } -inline TSimdFloat32x4 tfSimd4fMax(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return vmaxq_f32(arg1, arg2); } -inline TSimdFloat32x4 tfSimd4fClamp(TSimdFloat32x4 value, TSimdFloat32x4 min, TSimdFloat32x4 max) { - return tfSimd4fMax(min, tfSimd4fMin(value, max)); -} - -inline TSimdInt32x4 tfSimd4fToSimd4i(TSimdFloat32x4 value) { return vreinterpretq_f32_s32(value); } - -inline TSimdFloat32x4 tfSimd4iToSimd4f(TSimdInt32x4 value) { return vreinterpretq_s32_f32(value); } - -inline float tfSimd4fSelectIndex0(TSimdFloat32x4 value) { return vgetq_lane_f32(value, 0); } -inline float tfSimd4fSelectIndex1(TSimdFloat32x4 value) { return vgetq_lane_f32(value, 1); } -inline float tfSimd4fSelectIndex2(TSimdFloat32x4 value) { return vgetq_lane_f32(value, 2); } -inline float tfSimd4fSelectIndex3(TSimdFloat32x4 value) { return vgetq_lane_f32(value, 3); } - -inline TSimdFloat32x4 tfSimd4fAdd(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return vaddq_f32(arg1, arg2); } -inline TSimdFloat32x4 tfSimd4fSub(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return vsubq_f32(arg1, arg2); } -inline TSimdFloat32x4 tfSimd4fMul(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return vmulq_f32(arg1, arg2); } -inline TSimdFloat32x4 tfSimd4fMadd(TSimdFloat32x4 mul1, TSimdFloat32x4 mul2, TSimdFloat32x4 add) { return vmlaq_f32(add, mul1, mul2); } - -inline TSimdFloat32x4 tfSimd4fDiv(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return vdivq_f32(arg1, arg2); } - -inline TSimdFloat32x4 tfSimd4fAbs(TSimdFloat32x4 value) { return vabsq_f32(value); } -inline TSimdFloat32x4 tfSimdFloat4x32Load(float x, float y, float z, float w) { - const float values[4] = { x, y, z, w }; - return vld1q_f32(values); -} - -inline TSimdInt32x4 tfSimdInt4x32Load(int32_t x, int32_t y, int32_t z, int32_t w) { - const int32_t values[4] = { x, y, z, w }; - return vld1q_s32(values); -} - -inline TSimdFloat32x2 tfSimd4fToSimd2f(TSimdFloat32x4 value) { return vget_low_f32(value); } - -inline TSimdFloat32x3 tfSimd4fToSimd3f(TSimdFloat32x4 value) { return value; } - -inline TSimdFloat32x4 tfSimd4fSplatIndex0(TSimdFloat32x4 value) { return vdupq_laneq_f32(value, 0); } - -inline TSimdFloat32x4 tfSimd4fSplatIndex1(TSimdFloat32x4 value) { return vdupq_laneq_f32(value, 1); } - -inline TSimdFloat32x4 tfSimd4fSplatIndex2(TSimdFloat32x4 value) { return vdupq_laneq_f32(value, 2); } - -inline TSimdFloat32x4 tfSimd4fSplatIndex3(TSimdFloat32x4 value) { return vdupq_laneq_f32(value, 3); } - -inline TSimdInt32x4 tfSimd4iSplat(int32_t value) { return vdupq_n_s32(value); } - -inline TSimdFloat32x4 tfSimd4fSplat(float value) { return vdupq_n_f32(value); } - -inline TSimdFloat32x4 tfSimd4fCmpEq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return vreinterpretq_f32_s32(vceqq_f32(arg1, arg2)); } -inline TSimdFloat32x4 tfSimd4fCmpNeq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - return vreinterpretq_f32_s32(vmvnq_s32(vceqq_f32(arg1, arg2))); -} -inline TSimdFloat32x4 
tfSimd4fCmpGt(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return vreinterpretq_f32_s32(vcgtq_f32(arg1, arg2)); } -inline TSimdFloat32x4 tfSimd4fCmpGtEq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return vreinterpretq_f32_s32(vcgeq_f32(arg1, arg2)); } -inline TSimdFloat32x4 tfSimd4fCmpLt(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return vreinterpretq_f32_s32(vcltq_f32(arg1, arg2)); } -inline TSimdFloat32x4 tfSimd4fCmpLtEq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return vreinterpretq_f32_s32(vcleq_f32(arg1, arg2)); } - -inline TSimdInt32x4 tfSimd4iCmpEq(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { return vceqq_s32(arg1, arg2); } -inline TSimdInt32x4 tfSimd4iCmpNeq(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { return vmvnq_s32(vceqq_s32(arg1, arg2)); } -inline TSimdInt32x4 tfSimd4iCmpGt(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { return vcgtq_s32(arg1, arg2); } -inline TSimdInt32x4 tfSimd4iCmpGtEq(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { return vcgeq_s32(arg1, arg2); } -inline TSimdInt32x4 tfSimd4iCmpLt(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { return vcltq_s32(arg1, arg2); } -inline TSimdInt32x4 tfSimd4iCmpLtEq(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { return vcleq_s32(arg1, arg2); } - -inline bool tfSimd4fCmpAllLt(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - return vminv_u32(vcltq_f32(arg1, arg2)) != 0; -} - -inline bool tfSimd4fCmpAllGt(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - return vminv_u32(vcgtq_f32(arg1, arg2)) != 0; -} - -inline bool tfSimd4fCmpAllEq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - return vminv_u32(vceqq_f32(arg1, arg2)) != 0; -} - -inline bool tfSimd4iCmpAllEq(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { - return vminv_u32(vceqq_s32(arg1, arg2)) != 0; -} +//static inline TSimd32Fx4 tfSimd4fReplaceIndex0ByValue(TSimd32Fx4 input, float value) { return vsetq_lane_f32(value, input, 0); } +//static inline TSimd32Fx4 tfSimd4fReplaceIndex1ByValue(TSimd32Fx4 input, float value) { return vsetq_lane_f32(value, input, 1); } +//static inline TSimd32Fx4 tfSimd4fReplaceIndex2ByValue(TSimd32Fx4 input, float value) { return vsetq_lane_f32(value, input, 2); } +//static inline TSimd32Fx4 tfSimd4fReplaceIndex3ByValue(TSimd32Fx4 input, float value) { return vsetq_lane_f32(value, input, 3); } +// +//inline TSimd32Ix4 tfSimd4iSelect(TSimd32Ix4 arg0, TSimd32Ix4 arg1, TSimd32Ix4 mask) { return vbslq_s32(mask, arg1, arg1); } +//inline TSimd32Fx4 tfSimd4fSelect(TSimd32Fx4 arg0, TSimd32Fx4 arg1, TSimd32Fx4 mask) { return vbslq_f32(mask, arg1, arg1); } +// +//inline TSimd32Fx4 tfSimd4fZero() { return vmovq_n_f32(0.0f); } +//inline TSimd32Ix4 tfSimd4iZero() { return vmovq_n_s32(0); } +// +//inline TSimd32Ix4 tfSimd4iNot(TSimd32Ix4 value) { return vmvnq_s32(value); } +//inline TSimd32Ix4 tfSimd4iAnd(TSimd32Ix4 arg1, TSimd32Ix4 arg2) { return vandq_s32(arg1, arg2); } +//inline TSimd32Ix4 tfSimd4iAndNot(TSimd32Ix4 arg1, TSimd32Ix4 arg2) { return vandq_s32(vmvnq_s32(arg1), arg2); } +//inline TSimd32Ix4 tfSimd4iOr(TSimd32Ix4 arg1, TSimd32Ix4 arg2) { return vorrq_s32(arg1, arg2); } +//inline TSimd32Ix4 tfSimd4iXor(TSimd32Ix4 arg1, TSimd32Ix4 arg2) { return veorq_s32(arg1, arg2); } +// +//inline TSimd32Fx4 tfSimd4fNot(TSimd32Fx4 value) { return vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(value))); } +//inline TSimd32Fx4 tfSimd4fAnd(TSimd32Fx4 arg1, TSimd32Fx4 arg2) { +// return vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(arg1), vreinterpretq_s32_f32(arg2))); +//} +//inline TSimd32Fx4 tfSimd4fAndNot(TSimd32Fx4 arg1, TSimd32Fx4 arg2) { +// return 
vreinterpretq_f32_s32(vandq_s32(vmvnq_s32(vreinterpretq_s32_f32(arg1)), vreinterpretq_s32_f32(arg2))); +//} +//inline TSimd32Fx4 tfSimd4fOr(TSimd32Fx4 arg1, TSimd32Fx4 arg2) { +// return vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(arg1), vreinterpretq_s32_f32(arg2))); +//} +//inline TSimd32Fx4 tfSimd4fXor(TSimd32Fx4 arg1, TSimd32Fx4 arg2) { +// return vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(arg1), vreinterpretq_s32_f32(arg2))); +//} +// +//inline TSimd32Fx4 tfSimd4fFloor(TSimd32Fx4 value) { return vrndmq_f32(value); } +//inline TSimd32Fx4 tfSimd4fCeil(TSimd32Fx4 value) { return vrndpq_f32(value); } +//inline TSimd32Fx4 tfSimd4fRound(TSimd32Fx4 value) { return vrndnq_f32(value); } +//inline TSimd32Fx4 tfSimd4fTruncate(TSimd32Fx4 value) { return tfSimd4iToSimd4f(tfSimd4fToSimd4i(value)); } +//inline TSimd32Fx4 tfSimd4fMin(TSimd32Fx4 arg1, TSimd32Fx4 arg2) { return vminq_f32(arg1, arg2); } +//inline TSimd32Fx4 tfSimd4fMax(TSimd32Fx4 arg1, TSimd32Fx4 arg2) { return vmaxq_f32(arg1, arg2); } +//inline TSimd32Fx4 tfSimd4fClamp(TSimd32Fx4 value, TSimd32Fx4 min, TSimd32Fx4 max) { +// return tfSimd4fMax(min, tfSimd4fMin(value, max)); +//} +// +//inline TSimd32Ix4 tfSimd4fToSimd4i(TSimd32Fx4 value) { return vreinterpretq_f32_s32(value); } +// +//inline TSimd32Fx4 tfSimd4iToSimd4f(TSimd32Ix4 value) { return vreinterpretq_s32_f32(value); } +// +//inline float tfSimd4fSelectIndex0(TSimd32Fx4 value) { return vgetq_lane_f32(value, 0); } +//inline float tfSimd4fSelectIndex1(TSimd32Fx4 value) { return vgetq_lane_f32(value, 1); } +//inline float tfSimd4fSelectIndex2(TSimd32Fx4 value) { return vgetq_lane_f32(value, 2); } +//inline float tfSimd4fSelectIndex3(TSimd32Fx4 value) { return vgetq_lane_f32(value, 3); } +// +//inline TSimd32Fx4 tfSimd4fAdd(TSimd32Fx4 arg1, TSimd32Fx4 arg2) { return vaddq_f32(arg1, arg2); } +//inline TSimd32Fx4 tfSimd4fSub(TSimd32Fx4 arg1, TSimd32Fx4 arg2) { return vsubq_f32(arg1, arg2); } +//inline TSimd32Fx4 tfSimd4fMul(TSimd32Fx4 arg1, TSimd32Fx4 arg2) { return vmulq_f32(arg1, arg2); } +//inline TSimd32Fx4 tfSimd4fMadd(TSimd32Fx4 mul1, TSimd32Fx4 mul2, TSimd32Fx4 add) { return vmlaq_f32(add, mul1, mul2); } +// +//inline TSimd32Fx4 tfSimd4fDiv(TSimd32Fx4 arg1, TSimd32Fx4 arg2) { return vdivq_f32(arg1, arg2); } +// +//inline TSimd32Fx4 tfSimd4fAbs(TSimd32Fx4 value) { return vabsq_f32(value); } +//inline TSimd32Fx4 tfSimdFloat4x32Load(float x, float y, float z, float w) { +// const float values[4] = { x, y, z, w }; +// return vld1q_f32(values); +//} +// +//inline TSimd32Ix4 tfSimdInt4x32Load(int32_t x, int32_t y, int32_t z, int32_t w) { +// const int32_t values[4] = { x, y, z, w }; +// return vld1q_s32(values); +//} +// +//inline Tsimd_f32x2_t tfSimd4fToSimd2f(TSimd32Fx4 value) { return vget_low_f32(value); } +// +//inline TSimd32Fx3 tfSimd4fToSimd3f(TSimd32Fx4 value) { return value; } +// +//inline TSimd32Fx4 tfSimd4fSplatIndex0(TSimd32Fx4 value) { return vdupq_laneq_f32(value, 0); } +// +//inline TSimd32Fx4 tfSimd4fSplatIndex1(TSimd32Fx4 value) { return vdupq_laneq_f32(value, 1); } +// +//inline TSimd32Fx4 tfSimd4fSplatIndex2(TSimd32Fx4 value) { return vdupq_laneq_f32(value, 2); } +// +//inline TSimd32Fx4 tfSimd4fSplatIndex3(TSimd32Fx4 value) { return vdupq_laneq_f32(value, 3); } +// +//inline TSimd32Ix4 tfSimd4iSplat(int32_t value) { return vdupq_n_s32(value); } +// +//inline TSimd32Fx4 tfSimd4fSplat(float value) { return vdupq_n_f32(value); } +// +//inline TSimd32Fx4 tfSimd4fCmpEq(TSimd32Fx4 arg1, TSimd32Fx4 arg2) { return 
vreinterpretq_f32_s32(vceqq_f32(arg1, arg2)); } +//inline TSimd32Fx4 tfSimd4fCmpNeq(TSimd32Fx4 arg1, TSimd32Fx4 arg2) { +// return vreinterpretq_f32_s32(vmvnq_s32(vceqq_f32(arg1, arg2))); +//} +//inline TSimd32Fx4 tfSimd4fCmpGt(TSimd32Fx4 arg1, TSimd32Fx4 arg2) { return vreinterpretq_f32_s32(vcgtq_f32(arg1, arg2)); } +//inline TSimd32Fx4 tfSimd4fCmpGtEq(TSimd32Fx4 arg1, TSimd32Fx4 arg2) { return vreinterpretq_f32_s32(vcgeq_f32(arg1, arg2)); } +//inline TSimd32Fx4 tfSimd4fCmpLt(TSimd32Fx4 arg1, TSimd32Fx4 arg2) { return vreinterpretq_f32_s32(vcltq_f32(arg1, arg2)); } +//inline TSimd32Fx4 tfSimd4fCmpLtEq(TSimd32Fx4 arg1, TSimd32Fx4 arg2) { return vreinterpretq_f32_s32(vcleq_f32(arg1, arg2)); } +// +//inline TSimd32Ix4 tfSimd4iCmpEq(TSimd32Ix4 arg1, TSimd32Ix4 arg2) { return vceqq_s32(arg1, arg2); } +//inline TSimd32Ix4 tfSimd4iCmpNeq(TSimd32Ix4 arg1, TSimd32Ix4 arg2) { return vmvnq_s32(vceqq_s32(arg1, arg2)); } +//inline TSimd32Ix4 tfSimd4iCmpGt(TSimd32Ix4 arg1, TSimd32Ix4 arg2) { return vcgtq_s32(arg1, arg2); } +//inline TSimd32Ix4 tfSimd4iCmpGtEq(TSimd32Ix4 arg1, TSimd32Ix4 arg2) { return vcgeq_s32(arg1, arg2); } +//inline TSimd32Ix4 tfSimd4iCmpLt(TSimd32Ix4 arg1, TSimd32Ix4 arg2) { return vcltq_s32(arg1, arg2); } +//inline TSimd32Ix4 tfSimd4iCmpLtEq(TSimd32Ix4 arg1, TSimd32Ix4 arg2) { return vcleq_s32(arg1, arg2); } +// +//inline bool tfSimd4fCmpAllLt(TSimd32Fx4 arg1, TSimd32Fx4 arg2) { +// return vminv_u32(vcltq_f32(arg1, arg2)) != 0; +//} +// +//inline bool tfSimd4fCmpAllGt(TSimd32Fx4 arg1, TSimd32Fx4 arg2) { +// return vminv_u32(vcgtq_f32(arg1, arg2)) != 0; +//} +// +//inline bool tfSimd4fCmpAllEq(TSimd32Fx4 arg1, TSimd32Fx4 arg2) { +// return vminv_u32(vceqq_f32(arg1, arg2)) != 0; +//} +// +//inline bool tfSimd4iCmpAllEq(TSimd32Ix4 arg1, TSimd32Ix4 arg2) { +// return vminv_u32(vceqq_s32(arg1, arg2)) != 0; +//} diff --git a/Forge/Math/Internal/TF_Simd32x4_scalar.inl b/Forge/Math/Internal/TF_Simd32x4_scalar.inl index 4528168ed0..26dc6de116 100644 --- a/Forge/Math/Internal/TF_Simd32x4_scalar.inl +++ b/Forge/Math/Internal/TF_Simd32x4_scalar.inl @@ -1,253 +1,508 @@ #if defined(__CLANGD__) -#include "Forge/TF_Config.h" -#undef TF_FEATURE_CPU_NEON -#undef TF_FEATURE_CPU_SSE #define TF_FEATURE_CPU_SCALAR +#include "Forge/TF_Config.h" #include "../TF_Simd32x4.h" #endif -static inline TSimdFloat32x4 tfSimd4fReplaceIndex0ByValue(TSimdFloat32x4 input, float value) { - return { value, input.v[1], input.v[2], input.v[3] }; -} -static inline TSimdFloat32x4 tfSimd4fReplaceIndex1ByValue(TSimdFloat32x4 input, float value) { - return { input.v[0], value, input.v[2], input.v[3] }; -} -static inline TSimdFloat32x4 tfSimd4fReplaceIndex2ByValue(TSimdFloat32x4 input, float value) { - return { input.v[0], input.v[1], value, input.v[3] }; -} -static inline TSimdFloat32x4 tfSimd4fReplaceIndex3ByValue(TSimdFloat32x4 input, float value) { - return { input.v[0], input.v[1], input.v[2], value }; -} +// Tsimd_f32x4_t +static inline Tsimd_f32x4_t tfSimdZero_f32x4() { return {0,0,0,0}; } +static inline Tsimd_f32x4_t tfSimdLoad_f32x4(float x, float y, float z, float w) { return { x, y, z, w }; } +static inline Tsimd_f32x4_t tfSimdSplat_f32x4(float value) { return { value, value, value, value }; } -inline TSimdInt32x4 tfSimd4iSelect(TSimdInt32x4 arg0, TSimdInt32x4 arg1, TSimdInt32x4 mask) { - return { (mask.v[0] == 0) ? arg0.v[0] : arg1.v[0], (mask.v[1] == 0) ? arg0.v[1] : arg1.v[1], (mask.v[2] == 0) ? arg0.v[2] : arg1.v[2], - (mask.v[3] == 0) ? 
arg0.v[3] : arg1.v[3] }; +static inline Tsimd_f32x4_t tfSimdSplat0_f32x4(Tsimd_f32x4_t value) { return { value.v[0], value.v[0], value.v[0], value.v[0] }; } +static inline Tsimd_f32x4_t tfSimdSplat1_f32x4(Tsimd_f32x4_t value) { return { value.v[1], value.v[1], value.v[1], value.v[1] }; } +static inline Tsimd_f32x4_t tfSimdSplat2_f32x4(Tsimd_f32x4_t value) { return { value.v[2], value.v[2], value.v[2], value.v[2] }; } +static inline Tsimd_f32x4_t tfSimdSplat3_f32x4(Tsimd_f32x4_t value) { return { value.v[3], value.v[3], value.v[3], value.v[3] }; } + +static inline Tsimd_f32x4_t tfSimdDot_f32x4(Tsimd_f32x4_t a,Tsimd_f32x4_t b) { + const float result = tfSimdDot_f32x4_f32(a, b); + return { result, result, result, result }; } -inline TSimdFloat32x4 tfSimd4fSelect(TSimdFloat32x4 arg0, TSimdFloat32x4 arg1, TSimdFloat32x4 mask) { - TSimdInt32x4 intMask = tfSimd4fToSimd4i(mask); - return { (intMask.v[0] == 0) ? arg0.v[0] : arg1.v[0], (intMask.v[1] == 0) ? arg0.v[1] : arg1.v[1], - (intMask.v[2] == 0) ? arg0.v[2] : arg1.v[2], (intMask.v[3] == 0) ? arg0.v[3] : arg1.v[3] }; + +static inline float tfSimdDot_f32x4_f32(Tsimd_f32x4_t a, Tsimd_f32x4_t b) { + const float result = (a.v[0] * b.v[0]) + (a.v[1] * b.v[1]) + (a.v[2] * b.v[2]) + (a.v[3] * b.v[3]); + return result; } -inline TSimdFloat32x4 tfSimd4fZero() { return { 0, 0, 0, 0 }; } -inline TSimdInt32x4 tfSimd4iZero() { return { 0, 0, 0, 0 }; } +static inline float tfSimdSelect_f32x4(Tsimd_f32x4_t value, int index) {ASSERT(index < 4); return value.v[index];} +static inline float tfSimdSelect0_f32x4(Tsimd_f32x4_t value) { return value.v[0]; } +static inline float tfSimdSelect1_f32x4(Tsimd_f32x4_t value) { return value.v[1]; } +static inline float tfSimdSelect2_f32x4(Tsimd_f32x4_t value) { return value.v[2]; } +static inline float tfSimdSelect3_f32x4(Tsimd_f32x4_t value) { return value.v[3]; } -inline TSimdInt32x4 tfSimd4iNot(TSimdInt32x4 value) { return { ~value.v[0], ~value.v[1], ~value.v[2], ~value.v[3] }; } -inline TSimdInt32x4 tfSimd4iAnd(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { - return { arg1.v[0] & arg2.v[0], arg1.v[1] & arg2.v[1], arg1.v[2] & arg2.v[2], arg1.v[3] & arg2.v[3] }; -} -inline TSimdInt32x4 tfSimd4iAndNot(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { - return { ~arg1.v[0] & arg2.v[0], ~arg1.v[1] & arg2.v[1], ~arg1.v[2] & arg2.v[2], ~arg1.v[3] & arg2.v[3] }; -} -inline TSimdInt32x4 tfSimd4iOr(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { - return { arg1.v[0] | arg2.v[0], arg1.v[1] | arg2.v[1], arg1.v[2] | arg2.v[2], arg1.v[3] | arg2.v[3] }; -} -inline TSimdInt32x4 tfSimd4iXor(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { - return { arg1.v[0] ^ arg2.v[0], arg1.v[1] ^ arg2.v[1], arg1.v[2] ^ arg2.v[2], arg1.v[3] ^ arg2.v[3] }; +static inline Tsimd_f32x4_t tfSimdAdd_f32x4(Tsimd_f32x4_t a, Tsimd_f32x4_t b) { return { + a.v[0] + b.v[0], + a.v[1] + b.v[1], + a.v[2] + b.v[2], + a.v[3] + b.v[3] +}; } +static inline Tsimd_f32x4_t tfSimdMul_f32x4(Tsimd_f32x4_t a, Tsimd_f32x4_t b) { + return { a.v[0] * b.v[0], a.v[1] * b.v[1], a.v[2] * b.v[2], a.v[3] * b.v[3] }; +} +static inline Tsimd_f32x4_t tfSimdDiv_f32x4(Tsimd_f32x4_t a, Tsimd_f32x4_t b) { + return { a.v[0] / b.v[0], a.v[1] / b.v[1], a.v[2] / b.v[2], a.v[3] / b.v[3] }; +} +static inline Tsimd_f32x4_t tfSimdAbs_f32x4(Tsimd_f32x4_t a) { return { fabsf(a.v[0]), fabsf(a.v[1]), fabsf(a.v[2]), fabsf(a.v[3]) }; } +static inline Tsimd_f32x4_t tfSimdMadd_f32x4(Tsimd_f32x4_t a, Tsimd_f32x4_t b, Tsimd_f32x4_t c) { + return { (a.v[0] * b.v[0]) + c.v[0], (a.v[1] * b.v[1]) + c.v[1], (a.v[2] * b.v[2]) + c.v[2], (a.v[3] * 
b.v[3]) + +c.v[3] }; } -inline TSimdFloat32x4 tfSimd4fNot(TSimdFloat32x4 value) { - TSimdInt32x4 result = { { ~((int32_t)value.v[0]), ~((int32_t)value.v[1]), ~((int32_t)value.v[2]), ~((int32_t)value.v[3]) } }; - return tfSimd4iToSimd4f(result); -} -inline TSimdFloat32x4 tfSimd4fAnd(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - TSimdInt32x4 result = { ((int32_t)arg1.v[0]) & ((int32_t)arg2.v[0]), ((int32_t)arg1.v[1]) & ((int32_t)arg2.v[1]), - ((int32_t)arg1.v[2]) & ((int32_t)arg2.v[2]), ((int32_t)arg1.v[3]) & ((int32_t)arg2.v[3]) }; - return tfSimd4iToSimd4f(result); +static inline Tsimd_f32x4_t tfSimdNot_f32x4(Tsimd_f32x4_t value) { + return { + (float)(~((int32_t)value.v[0])), + (float)(~((int32_t)value.v[1])), + (float)(~((int32_t)value.v[2])), + (float)(~((int32_t)value.v[3])) }; +} +static inline Tsimd_f32x4_t tfSimdAnd_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return { + (float)(((int32_t)arg1.v[0]) & ((int32_t)arg2.v[0])), + (float)(((int32_t)arg1.v[1]) & ((int32_t)arg2.v[1])), + (float)(((int32_t)arg1.v[2]) & ((int32_t)arg2.v[2])), + (float)(((int32_t)arg1.v[3]) & ((int32_t)arg2.v[3])) }; +} +static inline Tsimd_f32x4_t tfSimdAndNot_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return { + (float)(~((int32_t)arg1.v[0]) & ((int32_t)arg2.v[0])), + (float)(~((int32_t)arg1.v[1]) & ((int32_t)arg2.v[1])), + (float)(~((int32_t)arg1.v[2]) & ((int32_t)arg2.v[2])), + (float)(~((int32_t)arg1.v[3]) & ((int32_t)arg2.v[3])) }; + } -inline TSimdFloat32x4 tfSimd4fAndNot(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - TSimdInt32x4 result = { { ~((int32_t)arg1.v[0]) & ((int32_t)arg2.v[0]), ~((int32_t)arg1.v[1]) & ((int32_t)arg2.v[1]), - ~((int32_t)arg1.v[2]) & ((int32_t)arg2.v[2]), ~((int32_t)arg1.v[3]) & ((int32_t)arg2.v[3]) } }; - return tfSimd4iToSimd4f(result); +static inline Tsimd_f32x4_t tfSimdOr_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return { + (float)(((int32_t)arg1.v[0]) | ((int32_t)arg2.v[0])), + (float)(((int32_t)arg1.v[1]) | ((int32_t)arg2.v[1])), + (float)(((int32_t)arg1.v[2]) | ((int32_t)arg2.v[2])), + (float)(((int32_t)arg1.v[3]) | ((int32_t)arg2.v[3])) }; } -inline TSimdFloat32x4 tfSimd4fOr(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - TSimdInt32x4 result = { { ((int32_t)arg1.v[0]) | ((int32_t)arg2.v[0]), ((int32_t)arg1.v[1]) | ((int32_t)arg2.v[1]), - ((int32_t)arg1.v[2]) | ((int32_t)arg2.v[2]), ((int32_t)arg1.v[3]) | ((int32_t)arg2.v[3]) } }; - return tfSimd4iToSimd4f(result); +static inline Tsimd_f32x4_t tfSimdXor_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return { + (float)(((int32_t)arg1.v[0]) ^ ((int32_t)arg2.v[0])), + (float)(((int32_t)arg1.v[1]) ^ ((int32_t)arg2.v[1])), + (float)(((int32_t)arg1.v[2]) ^ ((int32_t)arg2.v[2])), + (float)(((int32_t)arg1.v[3]) ^ ((int32_t)arg2.v[3])) }; } -inline TSimdFloat32x4 tfSimd4fXor(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - TSimdInt32x4 result = { { ((int32_t)arg1.v[0]) ^ ((int32_t)arg2.v[0]), ((int32_t)arg1.v[1]) ^ ((int32_t)arg2.v[1]), - ((int32_t)arg1.v[2]) ^ ((int32_t)arg2.v[2]), ((int32_t)arg1.v[3]) ^ ((int32_t)arg2.v[3]) } }; - return tfSimd4iToSimd4f(result); + +static inline Tsimd_f32x4_t tfSimdCmpEq_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return tfSimd_i32x4_To_f32x4({ (arg1.v[0] == arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[1] == arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[2] == arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[3] == arg2.v[3]) ?
(int32_t)0xFFFFFFFF : 0x00000000 }); +} +static inline Tsimd_f32x4_t tfSimdCmpNeq_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return tfSimd_i32x4_To_f32x4({ (arg1.v[0] != arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[1] != arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[2] != arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[3] != arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 }); +} +static inline Tsimd_f32x4_t tfSimdCmpGt_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return tfSimd_i32x4_To_f32x4({ + (arg1.v[0] > arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[1] > arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[2] > arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[3] > arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 }); +} +static inline Tsimd_f32x4_t tfSimdCmpGtEq_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return tfSimd_i32x4_To_f32x4({ (arg1.v[0] >= arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[1] >= arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[2] >= arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[3] >= arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 }); +} +static inline Tsimd_f32x4_t tfSimdCmpLt_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return tfSimd_i32x4_To_f32x4({ + (arg1.v[0] < arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[1] < arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[2] < arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[3] < arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 }); +} +static inline Tsimd_f32x4_t tfSimdCmpLtEq_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return tfSimd_i32x4_To_f32x4({ + (arg1.v[0] <= arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[1] <= arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[2] <= arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[3] <= arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 }); } -inline TSimdFloat32x4 tfSimd4fFloor(TSimdFloat32x4 value) { - return { { floorf(value.v[0]), floorf(value.v[1]), floorf(value.v[2]), floorf(value.v[3]) } }; +static inline bool tfSimdCmpAllEq_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2){ + return arg1.v[0] == arg2.v[0] && arg1.v[1] == arg2.v[1] && arg1.v[2] == arg2.v[2] && arg1.v[3] == arg2.v[3]; } -inline TSimdFloat32x4 tfSimd4fCeil(TSimdFloat32x4 value) { - return { { ceilf(value.v[0]), ceilf(value.v[1]), ceilf(value.v[2]), ceilf(value.v[3]) } }; +static inline bool tfSimdCmpAllNeq_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2){ + return arg1.v[0] != arg2.v[0] && arg1.v[1] != arg2.v[1] && arg1.v[2] != arg2.v[2] && arg1.v[3] != arg2.v[3]; } -inline TSimdFloat32x4 tfSimd4fRound(TSimdFloat32x4 value) { - // While 'roundf' may seem the obvious choice here, it rounds halfway cases - // away from zero regardless of the current rounding mode, but 'rintf' uses - // the current rounding mode which is consistent with other implementations. 
- return { { rintf(value.v[0]), rintf(value.v[1]), rintf(value.v[2]), rintf(value.v[3]) } }; +static inline bool tfSimdCmpAllGt_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return arg1.v[0] > arg2.v[0] && arg1.v[1] > arg2.v[1] && arg1.v[2] > arg2.v[2] && arg1.v[3] > arg2.v[3]; } -inline TSimdFloat32x4 tfSimd4fTruncate(TSimdFloat32x4 value) { return tfSimd4iToSimd4f(tfSimd4fToSimd4i(value)); } -inline TSimdFloat32x4 tfSimd4fMin(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - return { { fminf(arg1.v[0], arg2.v[0]), fminf(arg1.v[1], arg2.v[1]), fminf(arg1.v[2], arg2.v[2]), fminf(arg1.v[3], arg2.v[3]) } }; +static inline bool tfSimdCmpAllGtEq_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2){ + return arg1.v[0] >= arg2.v[0] && arg1.v[1] >= arg2.v[1] && arg1.v[2] >= arg2.v[2] && arg1.v[3] >= arg2.v[3]; } -inline TSimdFloat32x4 tfSimd4fMax(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - return { { fmaxf(arg1.v[0], arg2.v[0]), fmaxf(arg1.v[1], arg2.v[1]), fmaxf(arg1.v[2], arg2.v[2]), fmaxf(arg1.v[3], arg2.v[3]) } }; +static inline bool tfSimdCmpAllLt_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return arg1.v[0] < arg2.v[0] && arg1.v[1] < arg2.v[1] && arg1.v[2] < arg2.v[2] && arg1.v[3] < arg2.v[3]; } -inline TSimdFloat32x4 tfSimd4fClamp(TSimdFloat32x4 value, TSimdFloat32x4 min, TSimdFloat32x4 max) { - return tfSimd4fMax(min, tfSimd4fMin(value, max)); +static inline bool tfSimdCmpAllLtEq_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2){ + return arg1.v[0] <= arg2.v[0] && arg1.v[1] <= arg2.v[1] && arg1.v[2] <= arg2.v[2] && arg1.v[3] <= arg2.v[3]; } -inline TSimdInt32x4 tfSimd4fToSimd4i(TSimdFloat32x4 value) { - return { (int32_t)value.v[0], (int32_t)value.v[1], (int32_t)value.v[2], (int32_t)value.v[3] }; -} -inline TSimdFloat32x4 tfSimd4iToSimd4f(TSimdInt32x4 value) { - return { (float)value.v[0], (float)value.v[1], (float)value.v[2], (float)value.v[3] }; -} -inline float tfSimd4fSelectIndex0(TSimdFloat32x4 value) { return value.v[0]; } -inline float tfSimd4fSelectIndex1(TSimdFloat32x4 value) { return value.v[1]; } -inline float tfSimd4fSelectIndex2(TSimdFloat32x4 value) { return value.v[2]; } -inline float tfSimd4fSelectIndex3(TSimdFloat32x4 value) { return value.v[3]; } +// Tsimd_i32x4_t +static inline Tsimd_i32x4_t tfSimdLoad_i32x4(int32_t x, int32_t y, int32_t z, int32_t w) { return { x, y, z, w }; } +static inline Tsimd_i32x4_t tfSimdSplat_i32x4(int32_t value) { return { value, value, value, value }; } -inline TSimdFloat32x4 tfSimd4fAdd(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - return { - arg1.v[0] + arg2.v[0], - arg1.v[1] + arg2.v[1], - arg1.v[2] + arg2.v[2], - arg1.v[3] + arg2.v[3], - }; +static inline Tsimd_i32x4_t tfSimdSplat0_i32x4(Tsimd_i32x4_t value){ return { value.v[0], value.v[0], value.v[0], value.v[0] }; } +static inline Tsimd_i32x4_t tfSimdSplat1_i32x4(Tsimd_i32x4_t value){ return { value.v[1], value.v[1], value.v[1], value.v[1] }; } +static inline Tsimd_i32x4_t tfSimdSplat2_i32x4(Tsimd_i32x4_t value){ return { value.v[2], value.v[2], value.v[2], value.v[2] }; } +static inline Tsimd_i32x4_t tfSimdSplat3_i32x4(Tsimd_i32x4_t value){ return { value.v[3], value.v[3], value.v[3], value.v[3] }; } + +static inline int32_t tfSimdSelect_i32x4(Tsimd_i32x4_t value, int index) { + ASSERT(index < 4); + return value.v[index]; +} +static inline int32_t tfSimdSelect0_i32x4(Tsimd_i32x4_t value) { return value.v[0]; } +static inline int32_t tfSimdSelect1_i32x4(Tsimd_i32x4_t value) { return value.v[1]; } +static inline int32_t tfSimdSelect2_i32x4(Tsimd_i32x4_t value) { return value.v[2]; } +static 
inline int32_t tfSimdSelect3_i32x4(Tsimd_i32x4_t value) { return value.v[3]; } + +static inline Tsimd_i32x4_t tfSimdAdd_i32x4(Tsimd_i32x4_t a, Tsimd_i32x4_t b) { + return { a.v[0] + b.v[0], a.v[1] + b.v[1], a.v[2] + b.v[2], a.v[3] + b.v[3] }; } -inline TSimdFloat32x4 tfSimd4fSub(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - return { - arg1.v[0] - arg2.v[0], - arg1.v[1] - arg2.v[1], - arg1.v[2] - arg2.v[2], - arg1.v[3] - arg2.v[3], - }; +static inline Tsimd_i32x4_t tfSimdMul_i32x4(Tsimd_i32x4_t a, Tsimd_i32x4_t b) { + return { a.v[0] * b.v[0], a.v[1] * b.v[1], a.v[2] * b.v[2], a.v[3] * b.v[3] }; } -inline TSimdFloat32x4 tfSimd4fMul(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { +static inline Tsimd_i32x4_t tfSimdAbs_i32x4(Tsimd_i32x4_t a) { return { - arg1.v[0] * arg2.v[0], - arg1.v[1] * arg2.v[1], - arg1.v[2] * arg2.v[2], - arg1.v[3] * arg2.v[3], + abs(a.v[0]), + abs(a.v[1]), + abs(a.v[2]), + abs(a.v[3]), }; } -inline TSimdFloat32x4 tfSimd4fMadd(TSimdFloat32x4 mul1, TSimdFloat32x4 mul2, TSimdFloat32x4 add) { - return tfSimd4fAdd(tfSimd4fMul(mul1, mul2), add); +static inline Tsimd_i32x4_t tfSimdMadd_i32x4(Tsimd_i32x4_t a, Tsimd_i32x4_t b, Tsimd_i32x4_t c) { + return { (a.v[0] * b.v[0]) + c.v[0], (a.v[1] * b.v[1]) + c.v[1], (a.v[2] * b.v[2]) + c.v[2], (a.v[3] * b.v[3]) + +c.v[3] }; } -inline TSimdFloat32x4 tfSimd4fDiv(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - return { - arg1.v[0] / arg2.v[0], - arg1.v[1] / arg2.v[1], - arg1.v[2] / arg2.v[2], - arg1.v[3] / arg2.v[3], - }; +static inline Tsimd_i32x4_t tfSimdNot_i32x4(Tsimd_i32x4_t value) { + return { ~(int32_t)value.v[0], ~(int32_t)value.v[1], ~(int32_t)value.v[2], ~(int32_t)value.v[3] }; } - -inline TSimdFloat32x4 tfSimd4fAbs(TSimdFloat32x4 value) { - return { - abs(value.v[0]), - abs(value.v[1]), - abs(value.v[2]), - abs(value.v[3]), - }; +static inline Tsimd_i32x4_t tfSimdAnd_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { + return { arg1.v[0] & arg2.v[0], arg1.v[1] & arg2.v[1], arg1.v[2] & arg2.v[2], arg1.v[3] & arg2.v[3] }; +} +static inline Tsimd_i32x4_t tfSimdAndNot_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { + return { ~arg1.v[0] & arg2.v[0], ~arg1.v[1] & arg2.v[1], ~arg1.v[2] & arg2.v[2], ~arg1.v[3] & arg2.v[3] }; +} +static inline Tsimd_i32x4_t tfSimdOr_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { + return { arg1.v[0] | arg2.v[0], arg1.v[1] | arg2.v[1], arg1.v[2] | arg2.v[2], arg1.v[3] | arg2.v[3] }; +} +static inline Tsimd_i32x4_t tfSimdXor_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { + return { arg1.v[0] ^ arg2.v[0], arg1.v[1] ^ arg2.v[1], arg1.v[2] ^ arg2.v[2], arg1.v[3] ^ arg2.v[3] }; } -inline TSimdFloat32x4 tfSimdFloat4x32Load(float x, float y, float z, float w) { return { x, y, z, w }; } -inline TSimdInt32x4 tfSimdInt4x32Load(int32_t x, int32_t y, int32_t z, int32_t w) { return { x, y, z, w }; } - -inline TSimdFloat32x2 tfSimd4fToSimd2f(TSimdFloat32x4 value) { return { value.v[0], value.v[1] }; } -inline TSimdFloat32x3 tfSimd4fToSimd3f(TSimdFloat32x4 value) { return { value.v[0], value.v[1], value.v[2] }; } - -inline TSimdFloat32x4 tfSimd4fSplatIndex0(TSimdFloat32x4 value) { return { value.v[0], value.v[0], value.v[0], value.v[0] }; } -inline TSimdFloat32x4 tfSimd4fSplatIndex1(TSimdFloat32x4 value) { return { value.v[1], value.v[1], value.v[1], value.v[1] }; } -inline TSimdFloat32x4 tfSimd4fSplatIndex2(TSimdFloat32x4 value) { return { value.v[2], value.v[2], value.v[2], value.v[2] }; } -inline TSimdFloat32x4 tfSimd4fSplatIndex3(TSimdFloat32x4 value) { return { value.v[3], value.v[3], value.v[3], value.v[3] }; } - 
-inline TSimdInt32x4 tfSimd4iSplat(int32_t value) { return { value, value, value, value }; } -inline TSimdFloat32x4 tfSimd4fSplat(float value) { return { value, value, value, value }; } -inline TSimdFloat32x4 tfSimd4fCmpEq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - return { { (arg1.v[0] == arg2.v[0]) ? (float)0xFFFFFFFF : 0x00000000, (arg1.v[1] == arg2.v[1]) ? (float)0xFFFFFFFF : 0x00000000, - (arg1.v[2] == arg2.v[2]) ? (float)0xFFFFFFFF : 0x00000000, (arg1.v[3] == arg2.v[3]) ? (float)0xFFFFFFFF : 0x00000000 } }; +static inline Tsimd_i32x4_t tfSimdCmpEq_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { + return { (arg1.v[0] == arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] == arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[2] == arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[3] == arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 }; } -inline TSimdFloat32x4 tfSimd4fCmpNeq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - return { { (arg1.v[0] != arg2.v[0]) ? (float)0xFFFFFFFF : 0x00000000, (arg1.v[1] != arg2.v[1]) ? (float)0xFFFFFFFF : 0x00000000, - (arg1.v[2] != arg2.v[2]) ? (float)0xFFFFFFFF : 0x00000000, (arg1.v[3] != arg2.v[3]) ? (float)0xFFFFFFFF : 0x00000000 } }; +static inline Tsimd_i32x4_t tfSimdCmpNeq_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { + return { (arg1.v[0] != arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] != arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[2] != arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[3] != arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 }; } -inline TSimdFloat32x4 tfSimd4fCmpGt(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - return { { (arg1.v[0] > arg2.v[0]) ? (float)0xFFFFFFFF : 0x00000000, (arg1.v[1] > arg2.v[1]) ? (float)0xFFFFFFFF : 0x00000000, - (arg1.v[2] > arg2.v[2]) ? (float)0xFFFFFFFF : 0x00000000, (arg1.v[3] > arg2.v[3]) ? (float)0xFFFFFFFF : 0x00000000 } }; +static inline Tsimd_i32x4_t tfSimdCmpGt_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { + return { (arg1.v[0] > arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] > arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[2] > arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[3] > arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 }; } -inline TSimdFloat32x4 tfSimd4fCmpGtEq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - return { { (arg1.v[0] >= arg2.v[0]) ? (float)0xFFFFFFFF : 0x00000000, (arg1.v[1] >= arg2.v[1]) ? (float)0xFFFFFFFF : 0x00000000, - (arg1.v[2] >= arg2.v[2]) ? (float)0xFFFFFFFF : 0x00000000, (arg1.v[3] >= arg2.v[3]) ? (float)0xFFFFFFFF : 0x00000000 } }; +static inline Tsimd_i32x4_t tfSimdCmpGtEq_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { + return { (arg1.v[0] >= arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] >= arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[2] >= arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[3] >= arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 }; } -inline TSimdFloat32x4 tfSimd4fCmpLt(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - return { { (arg1.v[0] < arg2.v[0]) ? (float)0xFFFFFFFF : 0x00000000, (arg1.v[1] < arg2.v[1]) ? (float)0xFFFFFFFF : 0x00000000, - (arg1.v[2] < arg2.v[2]) ? (float)0xFFFFFFFF : 0x00000000, (arg1.v[3] < arg2.v[3]) ? (float)0xFFFFFFFF : 0x00000000 } }; +static inline Tsimd_i32x4_t tfSimdCmpLt_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { + return { (arg1.v[0] < arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] < arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[2] < arg2.v[2]) ? 
(int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[3] < arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 }; } -inline TSimdFloat32x4 tfSimd4fCmpLtEq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - return { { (arg1.v[0] <= arg2.v[0]) ? (float)0xFFFFFFFF : 0x00000000, (arg1.v[1] <= arg2.v[1]) ? (float)0xFFFFFFFF : 0x00000000, - (arg1.v[2] <= arg2.v[2]) ? (float)0xFFFFFFFF : 0x00000000, (arg1.v[3] <= arg2.v[3]) ? (float)0xFFFFFFFF : 0x00000000 } }; +static inline Tsimd_i32x4_t tfSimdCmpLtEq_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { + return { (arg1.v[0] <= arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] <= arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[2] <= arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[3] <= arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 }; } -inline TSimdInt32x4 tfSimd4iCmpEq(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { - return { { (arg1.v[0] == arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] == arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, - (arg1.v[2] == arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[3] == arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; +static inline bool tfSimdCmpAllEq_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { + return arg1.v[0] == arg2.v[0] && arg1.v[1] == arg2.v[1] && arg1.v[2] == arg2.v[2] && arg1.v[3] == arg2.v[3]; } -inline TSimdInt32x4 tfSimd4iCmpNeq(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { - return { { (arg1.v[0] != arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] != arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, - (arg1.v[2] != arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[3] != arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; +static inline bool tfSimdCmpAllNeq_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { + return arg1.v[0] != arg2.v[0] && arg1.v[1] != arg2.v[1] && arg1.v[2] != arg2.v[2] && arg1.v[3] != arg2.v[3]; } -inline TSimdInt32x4 tfSimd4iCmpGt(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { - return { { (arg1.v[0] > arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] > arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, - (arg1.v[2] > arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[3] > arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; +static inline bool tfSimdCmpAllGt_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { + return arg1.v[0] > arg2.v[0] && arg1.v[1] > arg2.v[1] && arg1.v[2] > arg2.v[2] && arg1.v[3] > arg2.v[3]; } -inline TSimdInt32x4 tfSimd4iCmpGtEq(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { - return { { (arg1.v[0] >= arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] >= arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, - (arg1.v[2] >= arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[3] >= arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; +static inline bool tfSimdCmpAllGtEq_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { + return arg1.v[0] >= arg2.v[0] && arg1.v[1] >= arg2.v[1] && arg1.v[2] >= arg2.v[2] && arg1.v[3] >= arg2.v[3]; } -inline TSimdInt32x4 tfSimd4iCmpLt(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { - return { { (arg1.v[0] < arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] < arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, - (arg1.v[2] < arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[3] < arg2.v[3]) ? 
(int32_t)0xFFFFFFFF : 0x00000000 } }; +static inline bool tfSimdCmpAllLt_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { + return arg1.v[0] < arg2.v[0] && arg1.v[1] < arg2.v[1] && arg1.v[2] < arg2.v[2] && arg1.v[3] < arg2.v[3]; } -inline TSimdInt32x4 tfSimd4iCmpLtEq(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { - return { { (arg1.v[0] < arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] < arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, - (arg1.v[2] < arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[3] < arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; +static inline bool tfSimdCmpAllLtEq_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { + return arg1.v[0] <= arg2.v[0] && arg1.v[1] <= arg2.v[1] && arg1.v[2] <= arg2.v[2] && arg1.v[3] <= arg2.v[3]; } -inline bool tfSimd4fCmpAllLt(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - for (int i = 0; i < 4; i++) { - if (arg1.v[i] >= arg2.v[i]) { - return false; - } - } - return true; +static inline Tsimd_i32x4_t tfSimd_f32x4_To_i32x4(Tsimd_f32x4_t a) { + union int_float { + Tsimd_f32x4_t a; + Tsimd_i32x4_t b; + } conversion; + conversion.a = a; + COMPILE_ASSERT(sizeof(Tsimd_f32x4_t) == sizeof(Tsimd_i32x4_t)); + return conversion.b; +} +static inline Tsimd_f32x4_t tfSimd_i32x4_To_f32x4(Tsimd_i32x4_t a) { + union int_float { + Tsimd_f32x4_t a; + Tsimd_i32x4_t b; + } conversion; + conversion.b = a; + COMPILE_ASSERT(sizeof(Tsimd_f32x4_t) == sizeof(Tsimd_i32x4_t)); + return conversion.a; } -inline bool tfSimd4fCmpAllGt(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - for (int i = 0; i < 4; i++) { - if (arg1.v[i] <= arg2.v[i]) { - return false; - } - } - return true; -} -inline bool tfSimd4fCmpAllEq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - for (int i = 0; i < 4; i++) { - if (arg1.v[i] != arg2.v[i]) { - return false; - } - } - return true; -} - -inline bool tfSimd4iCmpAllEq(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { - for (int i = 0; i < 4; i++) { - if (arg1.v[i] != arg2.v[i]) { - return false; - } - } - return true; -} +//static inline Tsimd_f32x4_t tfS32x4FReplaceIndex0ByValue(Tsimd_f32x4_t input, float value) { +// return { value, input.v[1], input.v[2], input.v[3] }; +//} +//static inline Tsimd_f32x4_t tfS32x4FReplaceIndex1ByValue(Tsimd_f32x4_t input, float value) { +// return { input.v[0], value, input.v[2], input.v[3] }; +//} +//static inline Tsimd_f32x4_t tfS32x4FReplaceIndex2ByValue(Tsimd_f32x4_t input, float value) { +// return { input.v[0], input.v[1], value, input.v[3] }; +//} +//static inline Tsimd_f32x4_t tfS32x4FReplaceIndex3ByValue(Tsimd_f32x4_t input, float value) { +// return { input.v[0], input.v[1], input.v[2], value }; +//} +// +//inline Tsimd_i32x4_t Tsimd_i32x4_tSelect(Tsimd_i32x4_t arg0, Tsimd_i32x4_t arg1, Tsimd_i32x4_t mask) { +// return { (mask.v[0] == 0) ? arg0.v[0] : arg1.v[0], (mask.v[1] == 0) ? arg0.v[1] : arg1.v[1], (mask.v[2] == 0) ? arg0.v[2] : arg1.v[2], +// (mask.v[3] == 0) ? arg0.v[3] : arg1.v[3] }; +//} +//inline Tsimd_f32x4_t tfS32x4FSelect(Tsimd_f32x4_t arg0, Tsimd_f32x4_t arg1, Tsimd_f32x4_t mask) { +// Tsimd_i32x4_t intMask = tfS32x4FToS32x4I(mask); +// return { (intMask.v[0] == 0) ? arg0.v[0] : arg1.v[0], (intMask.v[1] == 0) ? arg0.v[1] : arg1.v[1], +// (intMask.v[2] == 0) ? arg0.v[2] : arg1.v[2], (intMask.v[3] == 0) ? 
arg0.v[3] : arg1.v[3] }; +//} +// +//inline Tsimd_f32x4_t tfS32x4FZero() { return { 0, 0, 0, 0 }; } +//inline Tsimd_i32x4_t Tsimd_i32x4_tZero() { return { 0, 0, 0, 0 }; } +// +//inline Tsimd_i32x4_t Tsimd_i32x4_tNot(Tsimd_i32x4_t value) { return { ~value.v[0], ~value.v[1], ~value.v[2], ~value.v[3] }; } +//inline Tsimd_i32x4_t Tsimd_i32x4_tAnd(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { +// return { arg1.v[0] & arg2.v[0], arg1.v[1] & arg2.v[1], arg1.v[2] & arg2.v[2], arg1.v[3] & arg2.v[3] }; +//} +//inline Tsimd_i32x4_t Tsimd_i32x4_tAndNot(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { +// return { ~arg1.v[0] & arg2.v[0], ~arg1.v[1] & arg2.v[1], ~arg1.v[2] & arg2.v[2], ~arg1.v[3] & arg2.v[3] }; +//} +//inline Tsimd_i32x4_t Tsimd_i32x4_tOr(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { +// return { arg1.v[0] | arg2.v[0], arg1.v[1] | arg2.v[1], arg1.v[2] | arg2.v[2], arg1.v[3] | arg2.v[3] }; +//} +//inline Tsimd_i32x4_t Tsimd_i32x4_tXor(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { +// return { arg1.v[0] ^ arg2.v[0], arg1.v[1] ^ arg2.v[1], arg1.v[2] ^ arg2.v[2], arg1.v[3] ^ arg2.v[3] }; +//} +// +//inline Tsimd_f32x4_t tfS32x4FNot(Tsimd_f32x4_t value) { +// Tsimd_i32x4_t result = { { ~((int32_t)value.v[0]), ~((int32_t)value.v[1]), ~((int32_t)value.v[2]), ~((int32_t)value.v[3]) } }; +// return Tsimd_i32x4_tToSimd4f(result); +//} +//inline Tsimd_f32x4_t tfS32x4FAnd(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { +// Tsimd_i32x4_t result = { ((int32_t)arg1.v[0]) & ((int32_t)arg2.v[0]), ((int32_t)arg1.v[1]) & ((int32_t)arg2.v[1]), +// ((int32_t)arg1.v[2]) & ((int32_t)arg2.v[2]), ((int32_t)arg1.v[3]) & ((int32_t)arg2.v[3]) }; +// return Tsimd_i32x4_tToSimd4f(result); +//} +//inline Tsimd_f32x4_t tfS32x4FAndNot(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { +// Tsimd_i32x4_t result = { { ~((int32_t)arg1.v[0]) & ((int32_t)arg2.v[0]), ~((int32_t)arg1.v[1]) & ((int32_t)arg2.v[1]), +// ~((int32_t)arg1.v[2]) & ((int32_t)arg2.v[2]), ~((int32_t)arg1.v[3]) & ((int32_t)arg2.v[3]) } }; +// return Tsimd_i32x4_tToSimd4f(result); +//} +//inline Tsimd_f32x4_t tfS32x4FOr(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { +// Tsimd_i32x4_t result = { { ((int32_t)arg1.v[0]) | ((int32_t)arg2.v[0]), ((int32_t)arg1.v[1]) | ((int32_t)arg2.v[1]), +// ((int32_t)arg1.v[2]) | ((int32_t)arg2.v[2]), ((int32_t)arg1.v[3]) | ((int32_t)arg2.v[3]) } }; +// return Tsimd_i32x4_tToSimd4f(result); +//} +//inline Tsimd_f32x4_t tfS32x4FXor(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { +// Tsimd_i32x4_t result = { { ((int32_t)arg1.v[0]) ^ ((int32_t)arg2.v[0]), ((int32_t)arg1.v[1]) ^ ((int32_t)arg2.v[1]), +// ((int32_t)arg1.v[2]) ^ ((int32_t)arg2.v[2]), ((int32_t)arg1.v[3]) ^ ((int32_t)arg2.v[3]) } }; +// return Tsimd_i32x4_tToSimd4f(result); +//} +// +//inline Tsimd_f32x4_t tfS32x4FFloor(Tsimd_f32x4_t value) { +// return { { floorf(value.v[0]), floorf(value.v[1]), floorf(value.v[2]), floorf(value.v[3]) } }; +//} +//inline Tsimd_f32x4_t tfS32x4FCeil(Tsimd_f32x4_t value) { +// return { { ceilf(value.v[0]), ceilf(value.v[1]), ceilf(value.v[2]), ceilf(value.v[3]) } }; +//} +//inline Tsimd_f32x4_t tfS32x4FRound(Tsimd_f32x4_t value) { +// // While 'roundf' may seem the obvious choice here, it rounds halfway cases +// // away from zero regardless of the current rounding mode, but 'rintf' uses +// // the current rounding mode which is consistent with other implementations. 
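+//    // For example, with the default round-to-nearest-even mode, roundf(2.5f) yields 3.0f
+//    // while rintf(2.5f) yields 2.0f, which is what _mm_round_ps(_MM_FROUND_TO_NEAREST_INT)
+//    // and vrndnq_f32 produce on the SSE and NEON paths.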
+// return { { rintf(value.v[0]), rintf(value.v[1]), rintf(value.v[2]), rintf(value.v[3]) } }; +//} +//inline Tsimd_f32x4_t tfS32x4FTruncate(Tsimd_f32x4_t value) { return Tsimd_i32x4_tToSimd4f(tfS32x4FToS32x4I(value)); } +//inline Tsimd_f32x4_t tfS32x4FMin(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { +// return { { fminf(arg1.v[0], arg2.v[0]), fminf(arg1.v[1], arg2.v[1]), fminf(arg1.v[2], arg2.v[2]), fminf(arg1.v[3], arg2.v[3]) } }; +//} +//inline Tsimd_f32x4_t tfS32x4FMax(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { +// return { { fmaxf(arg1.v[0], arg2.v[0]), fmaxf(arg1.v[1], arg2.v[1]), fmaxf(arg1.v[2], arg2.v[2]), fmaxf(arg1.v[3], arg2.v[3]) } }; +//} +//inline Tsimd_f32x4_t tfS32x4FClamp(Tsimd_f32x4_t value, Tsimd_f32x4_t min, Tsimd_f32x4_t max) { +// return tfS32x4FMax(min, tfS32x4FMin(value, max)); +//} +// +//inline Tsimd_i32x4_t tfS32x4FToS32x4I(Tsimd_f32x4_t value) { +// return { (int32_t)value.v[0], (int32_t)value.v[1], (int32_t)value.v[2], (int32_t)value.v[3] }; +//} +// +//inline Tsimd_f32x4_t Tsimd_i32x4_tToSimd4f(Tsimd_i32x4_t value) { +// return { (float)value.v[0], (float)value.v[1], (float)value.v[2], (float)value.v[3] }; +//} +// +//inline float tfS32x4FSelectIndex0(Tsimd_f32x4_t value) { return value.v[0]; } +//inline float tfS32x4FSelectIndex1(Tsimd_f32x4_t value) { return value.v[1]; } +//inline float tfS32x4FSelectIndex2(Tsimd_f32x4_t value) { return value.v[2]; } +//inline float tfS32x4FSelectIndex3(Tsimd_f32x4_t value) { return value.v[3]; } +// +//inline Tsimd_f32x4_t tfS32x4FAdd(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { +// return { +// arg1.v[0] + arg2.v[0], +// arg1.v[1] + arg2.v[1], +// arg1.v[2] + arg2.v[2], +// arg1.v[3] + arg2.v[3], +// }; +//} +//inline Tsimd_f32x4_t tfS32x4FSub(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { +// return { +// arg1.v[0] - arg2.v[0], +// arg1.v[1] - arg2.v[1], +// arg1.v[2] - arg2.v[2], +// arg1.v[3] - arg2.v[3], +// }; +//} +//inline Tsimd_f32x4_t tfS32x4FMul(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { +// return { +// arg1.v[0] * arg2.v[0], +// arg1.v[1] * arg2.v[1], +// arg1.v[2] * arg2.v[2], +// arg1.v[3] * arg2.v[3], +// }; +//} +//inline Tsimd_f32x4_t tfS32x4FMadd(Tsimd_f32x4_t mul1, Tsimd_f32x4_t mul2, Tsimd_f32x4_t add) { +// return tfS32x4FAdd(tfS32x4FMul(mul1, mul2), add); +//} +// +//inline Tsimd_f32x4_t tfS32x4FDiv(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { +// return { +// arg1.v[0] / arg2.v[0], +// arg1.v[1] / arg2.v[1], +// arg1.v[2] / arg2.v[2], +// arg1.v[3] / arg2.v[3], +// }; +//} +// +//inline Tsimd_f32x4_t tfS32x4FAbs(Tsimd_f32x4_t value) { +// return { +// abs(value.v[0]), +// abs(value.v[1]), +// abs(value.v[2]), +// abs(value.v[3]), +// }; +//} +//inline Tsimd_f32x4_t tfS32x4FLoad(float x, float y, float z, float w) { return { x, y, z, w }; } +//inline Tsimd_i32x4_t tfSimdInt4x32Load(int32_t x, int32_t y, int32_t z, int32_t w) { return { x, y, z, w }; } +// +//inline Tsimd_f32x2_t tfS32x4FToS32x2F(Tsimd_f32x4_t value) { return { value.v[0], value.v[1] }; } +//inline Tsimd_f32x3_t tfS32x4FToS32x3F(Tsimd_f32x4_t value) { return { value.v[0], value.v[1], value.v[2] }; } +// +//inline Tsimd_f32x4_t tfS32x4FSplatIndex0(Tsimd_f32x4_t value) { return { value.v[0], value.v[0], value.v[0], value.v[0] }; } +//inline Tsimd_f32x4_t tfS32x4FSplatIndex1(Tsimd_f32x4_t value) { return { value.v[1], value.v[1], value.v[1], value.v[1] }; } +//inline Tsimd_f32x4_t tfS32x4FSplatIndex2(Tsimd_f32x4_t value) { return { value.v[2], value.v[2], value.v[2], value.v[2] }; } +//inline Tsimd_f32x4_t tfS32x4FSplatIndex3(Tsimd_f32x4_t value) { 
return { value.v[3], value.v[3], value.v[3], value.v[3] }; } +// +//inline Tsimd_i32x4_t Tsimd_i32x4_tSplat(int32_t value) { return { value, value, value, value }; } +//inline Tsimd_f32x4_t tfS32x4FSplat(float value) { return { value, value, value, value }; } +// +//inline Tsimd_f32x4_t tfS32x4FCmpEq(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { +// return { { (arg1.v[0] == arg2.v[0]) ? (float)0xFFFFFFFF : 0x00000000, (arg1.v[1] == arg2.v[1]) ? (float)0xFFFFFFFF : 0x00000000, +// (arg1.v[2] == arg2.v[2]) ? (float)0xFFFFFFFF : 0x00000000, (arg1.v[3] == arg2.v[3]) ? (float)0xFFFFFFFF : 0x00000000 } }; +//} +//inline Tsimd_f32x4_t tfS32x4FCmpNeq(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { +// return { { (arg1.v[0] != arg2.v[0]) ? (float)0xFFFFFFFF : 0x00000000, (arg1.v[1] != arg2.v[1]) ? (float)0xFFFFFFFF : 0x00000000, +// (arg1.v[2] != arg2.v[2]) ? (float)0xFFFFFFFF : 0x00000000, (arg1.v[3] != arg2.v[3]) ? (float)0xFFFFFFFF : 0x00000000 } }; +//} +//inline Tsimd_f32x4_t tfS32x4FCmpGt(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { +// return { { (arg1.v[0] > arg2.v[0]) ? (float)0xFFFFFFFF : 0x00000000, (arg1.v[1] > arg2.v[1]) ? (float)0xFFFFFFFF : 0x00000000, +// (arg1.v[2] > arg2.v[2]) ? (float)0xFFFFFFFF : 0x00000000, (arg1.v[3] > arg2.v[3]) ? (float)0xFFFFFFFF : 0x00000000 } }; +//} +//inline Tsimd_f32x4_t tfS32x4FCmpGtEq(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { +// return { { (arg1.v[0] >= arg2.v[0]) ? (float)0xFFFFFFFF : 0x00000000, (arg1.v[1] >= arg2.v[1]) ? (float)0xFFFFFFFF : 0x00000000, +// (arg1.v[2] >= arg2.v[2]) ? (float)0xFFFFFFFF : 0x00000000, (arg1.v[3] >= arg2.v[3]) ? (float)0xFFFFFFFF : 0x00000000 } }; +//} +//inline Tsimd_f32x4_t tfS32x4FCmpLt(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { +// return { { (arg1.v[0] < arg2.v[0]) ? (float)0xFFFFFFFF : 0x00000000, (arg1.v[1] < arg2.v[1]) ? (float)0xFFFFFFFF : 0x00000000, +// (arg1.v[2] < arg2.v[2]) ? (float)0xFFFFFFFF : 0x00000000, (arg1.v[3] < arg2.v[3]) ? (float)0xFFFFFFFF : 0x00000000 } }; +//} +//inline Tsimd_f32x4_t tfS32x4FCmpLtEq(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { +// return { { (arg1.v[0] <= arg2.v[0]) ? (float)0xFFFFFFFF : 0x00000000, (arg1.v[1] <= arg2.v[1]) ? (float)0xFFFFFFFF : 0x00000000, +// (arg1.v[2] <= arg2.v[2]) ? (float)0xFFFFFFFF : 0x00000000, (arg1.v[3] <= arg2.v[3]) ? (float)0xFFFFFFFF : 0x00000000 } }; +//} +// +//inline Tsimd_i32x4_t Tsimd_i32x4_tCmpEq(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { +// return { { (arg1.v[0] == arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] == arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, +// (arg1.v[2] == arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[3] == arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; +//} +//inline Tsimd_i32x4_t Tsimd_i32x4_tCmpNeq(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { +// return { { (arg1.v[0] != arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] != arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, +// (arg1.v[2] != arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[3] != arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; +//} +//inline Tsimd_i32x4_t Tsimd_i32x4_tCmpGt(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { +// return { { (arg1.v[0] > arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] > arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, +// (arg1.v[2] > arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[3] > arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; +//} +//inline Tsimd_i32x4_t Tsimd_i32x4_tCmpGtEq(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { +// return { { (arg1.v[0] >= arg2.v[0]) ? 
(int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] >= arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, +// (arg1.v[2] >= arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[3] >= arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; +//} +//inline Tsimd_i32x4_t Tsimd_i32x4_tCmpLt(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { +// return { { (arg1.v[0] < arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] < arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, +// (arg1.v[2] < arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[3] < arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; +//} +//inline Tsimd_i32x4_t Tsimd_i32x4_tCmpLtEq(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { +// return { { (arg1.v[0] < arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[1] < arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, +// (arg1.v[2] < arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, (arg1.v[3] < arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; +//} +// +//inline bool tfS32x4FCmpAllLt(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { +// for (int i = 0; i < 4; i++) { +// if (arg1.v[i] >= arg2.v[i]) { +// return false; +// } +// } +// return true; +//} +// +//inline bool tfS32x4FCmpAllGt(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { +// for (int i = 0; i < 4; i++) { +// if (arg1.v[i] <= arg2.v[i]) { +// return false; +// } +// } +// return true; +//} +// +//inline bool tfS32x4FCmpAllEq(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { +// for (int i = 0; i < 4; i++) { +// if (arg1.v[i] != arg2.v[i]) { +// return false; +// } +// } +// return true; +//} +// +//inline bool Tsimd_i32x4_tCmpAllEq(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { +// for (int i = 0; i < 4; i++) { +// if (arg1.v[i] != arg2.v[i]) { +// return false; +// } +// } +// return true; +//} diff --git a/Forge/Math/Internal/TF_Simd32x4_sse.inl b/Forge/Math/Internal/TF_Simd32x4_sse.inl index 674226a58d..2fe489b8d4 100644 --- a/Forge/Math/Internal/TF_Simd32x4_sse.inl +++ b/Forge/Math/Internal/TF_Simd32x4_sse.inl @@ -4,123 +4,291 @@ #include "../TF_Simd32x4.h" #endif -static inline TSimdFloat32x4 tfSimd4fReplaceIndex0ByValue(TSimdFloat32x4 input, float value) { - return _mm_blend_ps(input, tfSimd4fSplat(value), 0b0001); -} -static inline TSimdFloat32x4 tfSimd4fReplaceIndex1ByValue(TSimdFloat32x4 input, float value) { - return _mm_blend_ps(input, tfSimd4fSplat(value), 0b0010); -} -static inline TSimdFloat32x4 tfSimd4fReplaceIndex2ByValue(TSimdFloat32x4 input, float value) { - return _mm_blend_ps(input, tfSimd4fSplat(value), 0b0100); -} -static inline TSimdFloat32x4 tfSimd4fReplaceIndex3ByValue(TSimdFloat32x4 input, float value) { - return _mm_blend_ps(input, tfSimd4fSplat(value), 0b1000); -} +// Tsimd_f32x4_t +static inline Tsimd_f32x4_t tfSimdLoad_f32x4(float x, float y, float z, float w) { return _mm_set_ps(w, z, y, x); } +static inline Tsimd_f32x4_t tfSimdZero_f32x4() { return _mm_setzero_ps(); } +static inline Tsimd_f32x4_t tfSimdSplat_f32x4(float value) { return _mm_set1_ps(value); } -inline TSimdInt32x4 tfSimd4iSelect(TSimdInt32x4 arg0, TSimdInt32x4 arg1, TSimdInt32x4 mask) { return _mm_blendv_epi8(arg0, arg1, mask); } -inline TSimdFloat32x4 tfSimd4fSelect(TSimdFloat32x4 arg0, TSimdFloat32x4 arg1, TSimdFloat32x4 mask) { - return _mm_blendv_ps(arg0, arg1, mask); -} - -inline TSimdFloat32x4 tfSimd4fZero() { return _mm_setzero_ps(); } -inline TSimdInt32x4 tfSimd4iZero() { return _mm_setzero_si128(); } +static inline Tsimd_f32x4_t tfSimdSplat0_f32x4(Tsimd_f32x4_t value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(0, 0, 0, 0)); } +static inline Tsimd_f32x4_t 
tfSimdSplat1_f32x4(Tsimd_f32x4_t value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(1, 1, 1, 1)); } +static inline Tsimd_f32x4_t tfSimdSplat2_f32x4(Tsimd_f32x4_t value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(2, 2, 2, 2)); }; +static inline Tsimd_f32x4_t tfSimdSplat3_f32x4(Tsimd_f32x4_t value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(3, 3, 3, 3)); } -inline TSimdInt32x4 tfSimd4iNot(TSimdInt32x4 value) { return _mm_andnot_si128(value, _mm_set1_epi32(TF_SIMDI_MAX)); } -inline TSimdInt32x4 tfSimd4iAnd(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { return _mm_and_si128(arg1, arg2); } -inline TSimdInt32x4 tfSimd4iAndNot(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { return _mm_andnot_si128(arg1, arg2); } -inline TSimdInt32x4 tfSimd4iOr(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { return _mm_or_si128(arg1, arg2); } -inline TSimdInt32x4 tfSimd4iXor(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { return _mm_xor_si128(arg1, arg2); } -inline TSimdFloat32x4 tfSimd4fNot(TSimdFloat32x4 value) { return _mm_andnot_ps(value, _mm_set1_ps((float)(TF_SIMDF_MAX))); } -inline TSimdFloat32x4 tfSimd4fAnd(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return _mm_and_ps(arg1, arg2); } -inline TSimdFloat32x4 tfSimd4fAndNot(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return _mm_andnot_ps(arg1, arg2); } -inline TSimdFloat32x4 tfSimd4fOr(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return _mm_or_ps(arg1, arg2); } -inline TSimdFloat32x4 tfSimd4fXor(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return _mm_xor_ps(arg1, arg2); } +static inline Tsimd_f32x4_t tfSimdDot_f32x4(Tsimd_f32x4_t a,Tsimd_f32x4_t b) { + Tsimd_f32x4_t x2 = _mm_mul_ps(a, b); + Tsimd_f32x4_t tmp = _mm_hadd_ps(x2, x2); + return _mm_hadd_ps(tmp, tmp); +} -inline TSimdFloat32x4 tfSimd4fFloor(TSimdFloat32x4 value) { return _mm_floor_ps(value); } -inline TSimdFloat32x4 tfSimd4fCeil(TSimdFloat32x4 value) { return _mm_ceil_ps(value); } -inline TSimdFloat32x4 tfSimd4fRound(TSimdFloat32x4 value) { return _mm_round_ps(value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } -inline TSimdFloat32x4 tfSimd4fTruncate(TSimdFloat32x4 value) { return tfSimd4iToSimd4f(tfSimd4fToSimd4i(value)); } -inline TSimdFloat32x4 tfSimd4fMin(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return _mm_min_ps(arg1, arg2); } -inline TSimdFloat32x4 tfSimd4fMax(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return _mm_max_ps(arg1, arg2); } -inline TSimdFloat32x4 tfSimd4fClamp(TSimdFloat32x4 value, TSimdFloat32x4 min, TSimdFloat32x4 max) { - return tfSimd4fMax(min, tfSimd4fMin(value, max)); +static inline float tfSimdDot_f32x4_f32(Tsimd_f32x4_t a,Tsimd_f32x4_t b) { + return _mm_cvtss_f32(tfSimdDot_f32x4(a,b)); } -inline TSimdInt32x4 tfSimd4fToSimd4i(TSimdFloat32x4 value) { return _mm_castps_si128(value); } -inline TSimdFloat32x4 tfSimd4iToSimd4f(TSimdInt32x4 value) { return _mm_castsi128_ps(value); } +static inline float tfSimdSelect_f32x4(Tsimd_f32x4_t value, int index) { + ASSERT(index < 4); + switch (index) { + case 0: return tfSimdSelect0_f32x4(value); + case 1: return tfSimdSelect1_f32x4(value); + case 2: return tfSimdSelect2_f32x4(value); + case 3: return tfSimdSelect3_f32x4(value); + } + return {}; +} +static inline float tfSimdSelect0_f32x4(Tsimd_f32x4_t value) { return _mm_cvtss_f32(value); } +static inline float tfSimdSelect1_f32x4(Tsimd_f32x4_t value) { return _mm_cvtss_f32(tfSimdSplat1_f32x4(value)); } +static inline float tfSimdSelect2_f32x4(Tsimd_f32x4_t value) { return _mm_cvtss_f32(tfSimdSplat2_f32x4(value)); } +static inline float tfSimdSelect3_f32x4(Tsimd_f32x4_t value) { return 
_mm_cvtss_f32(tfSimdSplat3_f32x4(value)); } -inline float tfSimd4fSelectIndex0(TSimdFloat32x4 value) { return _mm_cvtss_f32(value); } -inline float tfSimd4fSelectIndex1(TSimdFloat32x4 value) { return tfSimd4fSelectIndex0(tfSimd4fSplatIndex1(value)); } -inline float tfSimd4fSelectIndex2(TSimdFloat32x4 value) { return tfSimd4fSelectIndex0(tfSimd4fSplatIndex2(value)); } -inline float tfSimd4fSelectIndex3(TSimdFloat32x4 value) { return tfSimd4fSelectIndex0(tfSimd4fSplatIndex3(value)); } +static inline Tsimd_f32x4_t tfSimdAdd_f32x4(Tsimd_f32x4_t a, Tsimd_f32x4_t b) { return _mm_add_ps(a, b); } +static inline Tsimd_f32x4_t tfSimdMul_f32x4(Tsimd_f32x4_t a, Tsimd_f32x4_t b) { return _mm_mul_ps(a, b); } +static inline Tsimd_f32x4_t tfSimdDiv_f32x4(Tsimd_f32x4_t a, Tsimd_f32x4_t b) { return _mm_div_ps(a, b); } +static inline Tsimd_f32x4_t tfSimdAbs_f32x4(Tsimd_f32x4_t a) { + const __m128 signMask = tfSimd_i32x4_To_f32x4(tfSimdSplat_i32x4(0x7FFFFFFF)); + return tfSimdAnd_f32x4(a, signMask); +} +static inline Tsimd_f32x4_t tfSimdMadd_f32x4(Tsimd_f32x4_t a, Tsimd_f32x4_t b, Tsimd_f32x4_t c) { + return tfSimdAdd_f32x4(tfSimdMul_f32x4(a, b), c); +} -inline TSimdFloat32x4 tfSimd4fAdd(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return _mm_add_ps(arg1, arg2); } -inline TSimdFloat32x4 tfSimd4fSub(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return _mm_sub_ps(arg1, arg2); } -inline TSimdFloat32x4 tfSimd4fMul(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return _mm_mul_ps(arg1, arg2); } -inline TSimdFloat32x4 tfSimd4fMadd(TSimdFloat32x4 mul1, TSimdFloat32x4 mul2, TSimdFloat32x4 add) { -#if 0 - return _mm_fmadd_ps(mul1, mul2, add); // Requires FMA CPUID -#else - return tfSimd4fAdd(tfSimd4fMul(mul1, mul2), add); -#endif +static inline Tsimd_f32x4_t tfSimdNot_f32x4(Tsimd_f32x4_t value) { + return _mm_andnot_ps(value, tfSimd_i32x4_To_f32x4(tfSimdSplat_i32x4((int32_t)(0xFFFFFFFF)))); } +static inline Tsimd_f32x4_t tfSimdAnd_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_and_ps(arg1, arg2); } +static inline Tsimd_f32x4_t tfSimdAndNot_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_andnot_ps(arg1, arg2); } +static inline Tsimd_f32x4_t tfSimdOr_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_or_ps(arg1, arg2); } +static inline Tsimd_f32x4_t tfSimdXor_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_xor_ps(arg1, arg2); } -inline TSimdFloat32x4 tfSimdFloat4x32Div(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return _mm_div_ps(arg1, arg2); } +static inline Tsimd_f32x4_t tfSimdCmpEq_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_cmpeq_ps(arg1, arg2); } +static inline Tsimd_f32x4_t tfSimdCmpNeq_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_cmpneq_ps(arg1, arg2); } +static inline Tsimd_f32x4_t tfSimdCmpGt_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_cmpgt_ps(arg1, arg2); } +static inline Tsimd_f32x4_t tfSimdCmpGtEq_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_cmpge_ps(arg1, arg2); } +static inline Tsimd_f32x4_t tfSimdCmpLt_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_cmplt_ps(arg1, arg2); } +static inline Tsimd_f32x4_t tfSimdCmpLtEq_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_cmple_ps(arg1, arg2); } -inline TSimdFloat32x4 tfSimd4fAbs(TSimdFloat32x4 value) { - const TSimdFloat32x4 signMask = tfSimd4iToSimd4f(tfSimd4iSplat(0x7FFFFFFF)); - return _mm_and_ps(value, signMask); +static inline bool tfSimdCmpAllEq_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return (_mm_movemask_ps(tfSimdCmpEq_f32x4(arg1, 
arg2)) & 0xf) == 0xf; +} +static inline bool tfSimdCmpAllNeq_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return (_mm_movemask_ps(tfSimdCmpNeq_f32x4(arg1, arg2)) & 0xf) == 0xf; +} +static inline bool tfSimdCmpAllGt_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return (_mm_movemask_ps(tfSimdCmpGt_f32x4(arg1, arg2)) & 0xf) == 0xf; +} +static inline bool tfSimdCmpAllGtEq_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return (_mm_movemask_ps(tfSimdCmpGtEq_f32x4(arg1, arg2)) & 0xf) == 0xf; +} +static inline bool tfSimdCmpAllLt_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return (_mm_movemask_ps(tfSimdCmpLt_f32x4(arg1, arg2)) & 0xf) == 0xf; +} +static inline bool tfSimdCmpAllLtEq_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return (_mm_movemask_ps(tfSimdCmpLtEq_f32x4(arg1, arg2)) & 0xf) == 0xf; } -inline TSimdFloat32x4 tfSimdFloat4x32Load(float x, float y, float z, float w) { return _mm_set_ps(w, z, y, x); } -inline TSimdInt32x4 tfSimdInt4x32Load(int32_t x, int32_t y, int32_t z, int32_t w) { return _mm_set_epi32(w, z, y, x); } -inline TSimdFloat32x2 tfSimd4fToSimd2f(TSimdFloat32x4 value) { return value; } -inline TSimdFloat32x3 tfSimd4fToSimd3f(TSimdFloat32x4 value) { return value; } +// Tsimd_i32x4_t +static inline Tsimd_i32x4_t tfSimdLoad_i32x4(int32_t x, int32_t y, int32_t z, int32_t w) { return _mm_set_epi32(w, z, y, x); } +static inline Tsimd_i32x4_t tfSimdSplat_i32x4(int32_t value) { return _mm_set1_epi32(value); } +static inline Tsimd_i32x4_t tfSimdSplat0_i32x4(Tsimd_i32x4_t value) { return _mm_shuffle_epi32(value, _MM_SHUFFLE(0, 0, 0, 0)); } +static inline Tsimd_i32x4_t tfSimdSplat1_i32x4(Tsimd_i32x4_t value) { return _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 1, 1, 1)); } +static inline Tsimd_i32x4_t tfSimdSplat2_i32x4(Tsimd_i32x4_t value) { return _mm_shuffle_epi32(value, _MM_SHUFFLE(2, 2, 2, 2)); } +static inline Tsimd_i32x4_t tfSimdSplat3_i32x4(Tsimd_i32x4_t value) { return _mm_shuffle_epi32(value, _MM_SHUFFLE(3, 3, 3, 3)); } -inline TSimdFloat32x4 tfSimd4fSplatIndex0(TSimdFloat32x4 value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(0, 0, 0, 0)); } -inline TSimdFloat32x4 tfSimd4fSplatIndex1(TSimdFloat32x4 value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(1, 1, 1, 1)); } -inline TSimdFloat32x4 tfSimd4fSplatIndex2(TSimdFloat32x4 value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(2, 2, 2, 2)); } -inline TSimdFloat32x4 tfSimd4fSplatIndex3(TSimdFloat32x4 value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(3, 3, 3, 3)); } +static inline int32_t tfSimdSelect_i32x4(Tsimd_i32x4_t value, int index) { + ASSERT(index < 4); + switch(index) { + case 0: return tfSimdSelect0_i32x4(value); + case 1: return tfSimdSelect1_i32x4(value); + case 2: return tfSimdSelect2_i32x4(value); + case 3: return tfSimdSelect3_i32x4(value); + } + return {}; -inline TSimdInt32x4 tfSimd4iSplat(int32_t value) { return _mm_set1_epi32(value); } -inline TSimdFloat32x4 tfSimd4fSplat(float value) { return _mm_set1_ps(value); } +} +static inline int32_t tfSimdSelect0_i32x4(Tsimd_i32x4_t value) { return _mm_cvtsi128_si32(value); } +static inline int32_t tfSimdSelect1_i32x4(Tsimd_i32x4_t value) { return _mm_cvtsi128_si32(tfSimdSplat1_i32x4(value)); }; +static inline int32_t tfSimdSelect2_i32x4(Tsimd_i32x4_t value) { return _mm_cvtsi128_si32(tfSimdSplat2_i32x4(value)); }; +static inline int32_t tfSimdSelect3_i32x4(Tsimd_i32x4_t value) { return _mm_cvtsi128_si32(tfSimdSplat3_i32x4(value)); }; -inline TSimdFloat32x4 tfSimd4fCmpEq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return 
_mm_cmpeq_ps(arg1, arg2); }
-inline TSimdFloat32x4 tfSimd4fCmpNeq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return _mm_cmpneq_ps(arg1, arg2); }
-inline TSimdFloat32x4 tfSimd4fCmpGt(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return _mm_cmpgt_ps(arg1, arg2); }
-inline TSimdFloat32x4 tfSimd4fCmpGtEq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return _mm_cmpge_ps(arg1, arg2); }
-inline TSimdFloat32x4 tfSimd4fCmpLt(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return _mm_cmplt_ps(arg1, arg2); }
-inline TSimdFloat32x4 tfSimd4fCmpLtEq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { return _mm_cmple_ps(arg1, arg2); }
+static inline Tsimd_i32x4_t tfSimd_f32x4_To_i32x4(Tsimd_f32x4_t a) { return _mm_castps_si128(a); }
+static inline Tsimd_f32x4_t tfSimd_i32x4_To_f32x4(Tsimd_i32x4_t a) { return _mm_castsi128_ps(a); }
-inline TSimdInt32x4 tfSimd4iCmpEq(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { return _mm_cmpeq_epi32(arg1, arg2); }
-inline TSimdInt32x4 tfSimd4iCmpNeq(TSimdInt32x4 arg1, TSimdInt32x4 arg2) {
-    return _mm_xor_si128(_mm_cmpeq_epi32(arg1, arg2), _mm_set1_epi32((int32_t)0xFFFFFFFF));
+static inline Tsimd_i32x4_t tfSimdAdd_i32x4(Tsimd_i32x4_t a, Tsimd_i32x4_t b) { return _mm_add_epi32(a, b); }
+// _mm_mullo_epi32 (SSE4.1) is the lane-wise 32-bit multiply that matches the scalar and NEON
+// implementations; _mm_mul_epi32 would widen only the even lanes to 64-bit products.
+static inline Tsimd_i32x4_t tfSimdMul_i32x4(Tsimd_i32x4_t a, Tsimd_i32x4_t b) { return _mm_mullo_epi32(a, b); }
+static inline Tsimd_i32x4_t tfSimdAbs_i32x4(Tsimd_i32x4_t a) { return _mm_abs_epi32(a); }
+static inline Tsimd_i32x4_t tfSimdMadd_i32x4(Tsimd_i32x4_t a, Tsimd_i32x4_t b, Tsimd_i32x4_t c) {
+    return tfSimdAdd_i32x4(tfSimdMul_i32x4(a, b), c);
 }
-inline TSimdInt32x4 tfSimd4iCmpGt(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { return _mm_cmpgt_epi32(arg1, arg2); }
-inline TSimdInt32x4 tfSimd4iCmpGtEq(TSimdInt32x4 arg1, TSimdInt32x4 arg2) {
+static inline Tsimd_i32x4_t tfSimdNot_i32x4(Tsimd_i32x4_t value) { return _mm_andnot_si128(value, tfSimdSplat_i32x4((int32_t)0xFFFFFFFF)); }
+static inline Tsimd_i32x4_t tfSimdAnd_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { return _mm_and_si128(arg1, arg2); }
+static inline Tsimd_i32x4_t tfSimdAndNot_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { return _mm_andnot_si128(arg1, arg2); }
+static inline Tsimd_i32x4_t tfSimdOr_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { return _mm_or_si128(arg1, arg2); }
+static inline Tsimd_i32x4_t tfSimdXor_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { return _mm_xor_si128(arg1, arg2); }
+
+static inline Tsimd_i32x4_t tfSimdCmpEq_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { return _mm_cmpeq_epi32(arg1, arg2); }
+static inline Tsimd_i32x4_t tfSimdCmpNeq_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) {
+    return tfSimdNot_i32x4(tfSimdCmpEq_i32x4(arg1, arg2));
+}
+static inline Tsimd_i32x4_t tfSimdCmpGt_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { return _mm_cmpgt_epi32(arg1, arg2); }
+
+static inline Tsimd_i32x4_t tfSimdCmpGtEq_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) {
     return _mm_or_si128(_mm_cmpgt_epi32(arg1, arg2), _mm_cmpeq_epi32(arg1, arg2));
 }
-inline TSimdInt32x4 tfSimd4iCmpLt(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { return _mm_cmplt_epi32(arg1, arg2); }
-inline TSimdInt32x4 tfSimd4iCmpLtEq(TSimdInt32x4 arg1, TSimdInt32x4 arg2) {
+static inline Tsimd_i32x4_t tfSimdCmpLt_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { return _mm_cmplt_epi32(arg1, arg2); }
+static inline Tsimd_i32x4_t tfSimdCmpLtEq_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) {
     return _mm_or_si128(_mm_cmplt_epi32(arg1, arg2), _mm_cmpeq_epi32(arg1, arg2));
 }
-inline bool tfSimd4fCmpAllLt(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) {
-    TSimdFloat32x4 compare = 
tfSimd4fCmpLt(arg1, arg2); - return (_mm_movemask_ps(compare) & 0xf) == 0xf; -} -inline bool tfSimd4fCmpAllGt(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - TSimdFloat32x4 compare = tfSimd4fCmpGt(arg1, arg2); - return (_mm_movemask_ps(compare) & 0xf) == 0xf; +static inline bool tfSimdCmpAllEq_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { + return (_mm_movemask_epi8(tfSimdCmpEq_i32x4(arg1, arg2)) & 0xFFFF) == 0xFFFF; } - -inline bool tfSimd4fCmpAllEq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2) { - TSimdFloat32x4 compare = tfSimd4fCmpEq(arg1, arg2); - return (_mm_movemask_ps(compare) & 0xf) == 0xf; +static inline bool tfSimdCmpAllNeq_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { + return (_mm_movemask_epi8(tfSimdCmpNeq_i32x4(arg1, arg2)) & 0xFFFF) == 0xFFFF; } - -inline bool tfSimd4iCmpAllEq(TSimdInt32x4 arg1, TSimdInt32x4 arg2) { - const TSimdInt32x4 compare = tfSimd4iCmpEq(arg1, arg2); - return (_mm_movemask_epi8(compare) & 0xf) == 0xf; +static inline bool tfSimdCmpAllGt_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { + return (_mm_movemask_epi8(tfSimdCmpGt_i32x4(arg1, arg2))& 0xFFFF) == 0xFFFF; +} +static inline bool tfSimdCmpAllGtEq_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { + return (_mm_movemask_epi8(tfSimdCmpGtEq_i32x4(arg1, arg2))& 0xFFFF) == 0xFFFF; } +static inline bool tfSimdCmpAllLt_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { + return (_mm_movemask_epi8(tfSimdCmpLt_i32x4(arg1, arg2)) & 0xFFFF) == 0xFFFF; +} +static inline bool tfSimdCmpAllLtEq_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { + return (_mm_movemask_epi8(tfSimdCmpLtEq_i32x4(arg1, arg2))& 0xFFFF) == 0xFFFF; +} + +// DELETE everything below +//static inline Tsimd_f32x4_t tfS32x4FReplaceIndex0(Tsimd_f32x4_t input, Tsimd_f32x4_t value) { +// return _mm_blend_ps(input, value, 0b0001); +//} +//static inline Tsimd_f32x4_t tfS32x4FReplaceIndex1(Tsimd_f32x4_t input, Tsimd_f32x4_t value){ +// return _mm_blend_ps(input, value, 0b0010); +//} +//static inline Tsimd_f32x4_t tfS32x4FReplaceIndex2(Tsimd_f32x4_t input, Tsimd_f32x4_t value){ +// return _mm_blend_ps(input, value, 0b0100); +//} +//static inline Tsimd_f32x4_t tfS32x4FReplaceIndex3(Tsimd_f32x4_t input, Tsimd_f32x4_t value){ +// return _mm_blend_ps(input, value, 0b1000); +//} +// +// +//static inline Tsimd_f32x4_t tfS32x4FReplaceIndex0ByValue(Tsimd_f32x4_t input, float value) { +// return _mm_blend_ps(input, tfS32x4FSplat(value), 0b0001); +//} +//static inline Tsimd_f32x4_t tfSimdFloat4ReplaceIndex1ByValue(Tsimd_f32x4_t input, float value) { +// return _mm_blend_ps(input, tfS32x4FSplat(value), 0b0010); +//} +//static inline Tsimd_f32x4_t tfSimd4fReplaceIndex2ByValue(Tsimd_f32x4_t input, float value) { +// return _mm_blend_ps(input, tfS32x4FSplat(value), 0b0100); +//} +//static inline Tsimd_f32x4_t tfSimd4fReplaceIndex3ByValue(Tsimd_f32x4_t input, float value) { +// return _mm_blend_ps(input, tfS32x4FSplat(value), 0b1000); +//} +// +//inline Tsimd_i32x4_t tfSimd4iSelect(Tsimd_i32x4_t arg0, Tsimd_i32x4_t arg1, Tsimd_i32x4_t mask) { return _mm_blendv_epi8(arg0, arg1, mask); } +//inline Tsimd_f32x4_t tfSimd4fSelect(Tsimd_f32x4_t arg0, Tsimd_f32x4_t arg1, Tsimd_f32x4_t mask) { +// return _mm_blendv_ps(arg0, arg1, mask); +//} +// +//inline Tsimd_f32x4_t tfSimd4fZero() { return _mm_setzero_ps(); } +//inline Tsimd_i32x4_t tfSimd4iZero() { return _mm_setzero_si128(); } +// +//inline Tsimd_i32x4_t tfSimd4iNot(Tsimd_i32x4_t value) { return _mm_andnot_si128(value, _mm_set1_epi32(TF_SIMDI_MAX)); } +//inline Tsimd_i32x4_t tfSimd4iAnd(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { 
return _mm_and_si128(arg1, arg2); } +//inline Tsimd_i32x4_t tfSimd4iAndNot(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { return _mm_andnot_si128(arg1, arg2); } +//inline Tsimd_i32x4_t tfSimd4iOr(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { return _mm_or_si128(arg1, arg2); } +//inline Tsimd_i32x4_t tfSimd4iXor(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { return _mm_xor_si128(arg1, arg2); } +// +//inline Tsimd_f32x4_t tfSimd4fNot(Tsimd_f32x4_t value) { return _mm_andnot_ps(value, _mm_set1_ps((float)(TF_SIMDF_MAX))); } +//inline Tsimd_f32x4_t tfSimd4fAnd(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_and_ps(arg1, arg2); } +//inline Tsimd_f32x4_t tfSimd4fAndNot(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_andnot_ps(arg1, arg2); } +//inline Tsimd_f32x4_t tfSimd4fOr(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_or_ps(arg1, arg2); } +//inline Tsimd_f32x4_t tfSimd4fXor(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_xor_ps(arg1, arg2); } +// +//inline Tsimd_f32x4_t tfSimd4fFloor(Tsimd_f32x4_t value) { return _mm_floor_ps(value); } +//inline Tsimd_f32x4_t tfSimd4fCeil(Tsimd_f32x4_t value) { return _mm_ceil_ps(value); } +//inline Tsimd_f32x4_t tfSimd4fRound(Tsimd_f32x4_t value) { return _mm_round_ps(value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } +//inline Tsimd_f32x4_t tfSimd4fTruncate(Tsimd_f32x4_t value) { return tfSimd4iToSimd4f(tfSimd4fToSimd4i(value)); } +//inline Tsimd_f32x4_t tfSimd4fMin(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_min_ps(arg1, arg2); } +//inline Tsimd_f32x4_t tfSimd4fMax(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_max_ps(arg1, arg2); } +//inline Tsimd_f32x4_t tfSimd4fClamp(Tsimd_f32x4_t value, Tsimd_f32x4_t min, Tsimd_f32x4_t max) { +// return tfSimd4fMax(min, tfSimd4fMin(value, max)); +//} +// +//inline Tsimd_i32x4_t tfSimd4fToSimd4i(Tsimd_f32x4_t value) { return _mm_castps_si128(value); } +//inline Tsimd_f32x4_t tfSimd4iToSimd4f(Tsimd_i32x4_t value) { return _mm_castsi128_ps(value); } +// +//inline float tfSimd4fSelectIndex0(Tsimd_f32x4_t value) { return _mm_cvtss_f32(value); } +//inline float tfSimd4fSelectIndex1(Tsimd_f32x4_t value) { return tfSimd4fSelectIndex0(tfSimd4fSplatIndex1(value)); } +//inline float tfSimd4fSelectIndex2(Tsimd_f32x4_t value) { return tfSimd4fSelectIndex0(tfSimd4fSplatIndex2(value)); } +//inline float tfSimd4fSelectIndex3(Tsimd_f32x4_t value) { return tfSimd4fSelectIndex0(tfSimd4fSplatIndex3(value)); } +// +//inline Tsimd_f32x4_t tfSimd4fAdd(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_add_ps(arg1, arg2); } +//inline Tsimd_f32x4_t tfSimd4fSub(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_sub_ps(arg1, arg2); } +//inline Tsimd_f32x4_t tfSimd4fMul(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_mul_ps(arg1, arg2); } +//inline Tsimd_f32x4_t tfSimd4fMadd(Tsimd_f32x4_t mul1, Tsimd_f32x4_t mul2, Tsimd_f32x4_t add) { +//#if 0 +// return _mm_fmadd_ps(mul1, mul2, add); // Requires FMA CPUID +//#else +// return tfSimd4fAdd(tfSimd4fMul(mul1, mul2), add); +//#endif +//} +// +//inline Tsimd_f32x4_t tfSimdFloat4x32Div(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_div_ps(arg1, arg2); } +// +//inline Tsimd_f32x4_t tfSimd4fAbs(Tsimd_f32x4_t value) { +// const Tsimd_f32x4_t signMask = tfSimd4iToSimd4f(tfSimd4iSplat(0x7FFFFFFF)); +// return _mm_and_ps(value, signMask); +//} +//inline Tsimd_f32x4_t tfSimdFloat4x32Load(float x, float y, float z, float w) { return _mm_set_ps(w, z, y, x); } +//inline Tsimd_i32x4_t tfSimdInt4x32Load(int32_t x, int32_t y, int32_t z, int32_t w) { return _mm_set_epi32(w, 
z, y, x); } +// +//inline Tsimd_f32x2_t tfSimd4fToSimd2f(Tsimd_f32x4_t value) { return value; } +//inline Tsimd_f32x3_t tfSimd4fToSimd3f(Tsimd_f32x4_t value) { return value; } +// +//inline Tsimd_f32x4_t tfSimd4fSplatIndex0(Tsimd_f32x4_t value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(0, 0, 0, 0)); } +//inline Tsimd_f32x4_t tfSimd4fSplatIndex1(Tsimd_f32x4_t value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(1, 1, 1, 1)); } +//inline Tsimd_f32x4_t tfSimd4fSplatIndex2(Tsimd_f32x4_t value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(2, 2, 2, 2)); } +//inline Tsimd_f32x4_t tfSimd4fSplatIndex3(Tsimd_f32x4_t value) { return _mm_shuffle_ps(value, value, _MM_SHUFFLE(3, 3, 3, 3)); } +// +//inline Tsimd_i32x4_t tfSimd4iSplat(int32_t value) { return _mm_set1_epi32(value); } +//inline Tsimd_f32x4_t tfSimd4fSplat(float value) { return _mm_set1_ps(value); } +// +//inline Tsimd_f32x4_t tfSimd4fCmpEq(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_cmpeq_ps(arg1, arg2); } +//inline Tsimd_f32x4_t tfSimd4fCmpNeq(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_cmpneq_ps(arg1, arg2); } +//inline Tsimd_f32x4_t tfSimd4fCmpGt(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_cmpgt_ps(arg1, arg2); } +//inline Tsimd_f32x4_t tfSimd4fCmpGtEq(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_cmpge_ps(arg1, arg2); } +//inline Tsimd_f32x4_t tfSimd4fCmpLt(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_cmplt_ps(arg1, arg2); } +//inline Tsimd_f32x4_t tfSimd4fCmpLtEq(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return _mm_cmple_ps(arg1, arg2); } +// +//inline Tsimd_i32x4_t tfSimd4iCmpEq(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { return _mm_cmpeq_epi32(arg1, arg2); } +//inline Tsimd_i32x4_t tfSimd4iCmpNeq(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { +// return _mm_xor_si128(_mm_cmpeq_epi32(arg1, arg2), _mm_set1_epi32((int32_t)0xFFFFFFFF)); +//} +//inline Tsimd_i32x4_t tfSimd4iCmpGt(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { return _mm_cmpgt_epi32(arg1, arg2); } +//inline Tsimd_i32x4_t tfSimd4iCmpGtEq(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { +// return _mm_or_si128(_mm_cmpgt_epi32(arg1, arg2), _mm_cmpeq_epi32(arg1, arg2)); +//} +//inline Tsimd_i32x4_t tfSimd4iCmpLt(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { return _mm_cmplt_epi32(arg1, arg2); } +//inline Tsimd_i32x4_t tfSimd4iCmpLtEq(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { +// return _mm_or_si128(_mm_cmplt_epi32(arg1, arg2), _mm_cmpeq_epi32(arg1, arg2)); +//} +//inline bool tfSimd4fCmpAllLt(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { +// Tsimd_f32x4_t compare = tfSimd4fCmpLt(arg1, arg2); +// return (_mm_movemask_ps(compare) & 0xf) == 0xf; +//} +// +//inline bool tfSimd4fCmpAllGt(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { +// Tsimd_f32x4_t compare = tfSimd4fCmpGt(arg1, arg2); +// return (_mm_movemask_ps(compare) & 0xf) == 0xf; +//} +// +//inline bool tfSimd4fCmpAllEq(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { +// Tsimd_f32x4_t compare = tfSimd4fCmpEq(arg1, arg2); +// return (_mm_movemask_ps(compare) & 0xf) == 0xf; +//} +// +//inline bool tfSimd4iCmpAllEq(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { +// const Tsimd_i32x4_t compare = tfSimd4iCmpEq(arg1, arg2); +// return (_mm_movemask_epi8(compare) & 0xf) == 0xf; +//} diff --git a/Forge/Math/Internal/TF_Simd4x32_neon.inl b/Forge/Math/Internal/TF_Simd4x32_neon.inl new file mode 100644 index 0000000000..22a4c10ed2 --- /dev/null +++ b/Forge/Math/Internal/TF_Simd4x32_neon.inl @@ -0,0 +1,121 @@ +#if defined(__CLANGD__) +#define TF_FEATURE_CPU_NEON +#include "Forge/TF_Config.h" +#include 
"../TF_Simd32x4.h" +#endif + +static inline Tsimd_f32x4_t tfSimd4fReplaceIndex0ByValue(Tsimd_f32x4_t input, float value) { return vsetq_lane_f32(value, input, 0); } +static inline Tsimd_f32x4_t tfSimd4fReplaceIndex1ByValue(Tsimd_f32x4_t input, float value) { return vsetq_lane_f32(value, input, 1); } +static inline Tsimd_f32x4_t tfSimd4fReplaceIndex2ByValue(Tsimd_f32x4_t input, float value) { return vsetq_lane_f32(value, input, 2); } +static inline Tsimd_f32x4_t tfSimd4fReplaceIndex3ByValue(Tsimd_f32x4_t input, float value) { return vsetq_lane_f32(value, input, 3); } + +inline Tsimd_i32x4_t tfSimd4iSelect(Tsimd_i32x4_t arg0, Tsimd_i32x4_t arg1, Tsimd_i32x4_t mask) { return vbslq_s32(mask, arg1, arg1); } +inline Tsimd_f32x4_t tfSimd4fSelect(Tsimd_f32x4_t arg0, Tsimd_f32x4_t arg1, Tsimd_f32x4_t mask) { return vbslq_f32(mask, arg1, arg1); } + +inline Tsimd_f32x4_t tfSimd4fZero() { return vmovq_n_f32(0.0f); } +inline Tsimd_i32x4_t tfSimd4iZero() { return vmovq_n_s32(0); } + +inline Tsimd_i32x4_t tfSimd4iNot(Tsimd_i32x4_t value) { return vmvnq_s32(value); } +inline Tsimd_i32x4_t tfSimd4iAnd(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { return vandq_s32(arg1, arg2); } +inline Tsimd_i32x4_t tfSimd4iAndNot(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { return vandq_s32(vmvnq_s32(arg1), arg2); } +inline Tsimd_i32x4_t tfSimd4iOr(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { return vorrq_s32(arg1, arg2); } +inline Tsimd_i32x4_t tfSimd4iXor(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { return veorq_s32(arg1, arg2); } + +inline Tsimd_f32x4_t tfSimd4fNot(Tsimd_f32x4_t value) { return vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(value))); } +inline Tsimd_f32x4_t tfSimd4fAnd(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(arg1), vreinterpretq_s32_f32(arg2))); +} +inline Tsimd_f32x4_t tfSimd4fAndNot(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return vreinterpretq_f32_s32(vandq_s32(vmvnq_s32(vreinterpretq_s32_f32(arg1)), vreinterpretq_s32_f32(arg2))); +} +inline Tsimd_f32x4_t tfSimd4fOr(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(arg1), vreinterpretq_s32_f32(arg2))); +} +inline Tsimd_f32x4_t tfSimd4fXor(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(arg1), vreinterpretq_s32_f32(arg2))); +} + +inline Tsimd_f32x4_t tfSimd4fFloor(Tsimd_f32x4_t value) { return vrndmq_f32(value); } +inline Tsimd_f32x4_t tfSimd4fCeil(Tsimd_f32x4_t value) { return vrndpq_f32(value); } +inline Tsimd_f32x4_t tfSimd4fRound(Tsimd_f32x4_t value) { return vrndnq_f32(value); } +inline Tsimd_f32x4_t tfSimd4fTruncate(Tsimd_f32x4_t value) { return tfSimd4iToSimd4f(tfSimd4fToSimd4i(value)); } +inline Tsimd_f32x4_t tfSimd4fMin(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return vminq_f32(arg1, arg2); } +inline Tsimd_f32x4_t tfSimd4fMax(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return vmaxq_f32(arg1, arg2); } +inline Tsimd_f32x4_t tfSimd4fClamp(Tsimd_f32x4_t value, Tsimd_f32x4_t min, Tsimd_f32x4_t max) { + return tfSimd4fMax(min, tfSimd4fMin(value, max)); +} + +inline Tsimd_i32x4_t tfSimd4fToSimd4i(Tsimd_f32x4_t value) { return vreinterpretq_f32_s32(value); } + +inline Tsimd_f32x4_t tfSimd4iToSimd4f(Tsimd_i32x4_t value) { return vreinterpretq_s32_f32(value); } + +inline float tfS32x4FSelectIndex0(Tsimd_f32x4_t value) { return vgetq_lane_f32(value, 0); } +inline float tfS32x4FSelectIndex1(Tsimd_f32x4_t value) { return vgetq_lane_f32(value, 1); } +inline float 
tfS32x4FSelectIndex2(Tsimd_f32x4_t value) { return vgetq_lane_f32(value, 2); } +inline float tfS32x4FSelectIndex3(Tsimd_f32x4_t value) { return vgetq_lane_f32(value, 3); } + +inline Tsimd_f32x4_t tfSimd4fAdd(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return vaddq_f32(arg1, arg2); } +inline Tsimd_f32x4_t tfSimd4fSub(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return vsubq_f32(arg1, arg2); } +inline Tsimd_f32x4_t tfSimd4fMul(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return vmulq_f32(arg1, arg2); } +inline Tsimd_f32x4_t tfSimd4fMadd(Tsimd_f32x4_t mul1, Tsimd_f32x4_t mul2, Tsimd_f32x4_t add) { return vmlaq_f32(add, mul1, mul2); } + +inline Tsimd_f32x4_t tfSimd4fDiv(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return vdivq_f32(arg1, arg2); } + +inline Tsimd_f32x4_t tfSimd4fAbs(Tsimd_f32x4_t value) { return vabsq_f32(value); } +inline Tsimd_f32x4_t tfSimdFloat4x32Load(float x, float y, float z, float w) { + const float values[4] = { x, y, z, w }; + return vld1q_f32(values); +} + +inline Tsimd_i32x4_t tfSimdInt4x32Load(int32_t x, int32_t y, int32_t z, int32_t w) { + const int32_t values[4] = { x, y, z, w }; + return vld1q_s32(values); +} + +inline Tsimd_f32x2_t tfSimd4fToSimd2f(Tsimd_f32x4_t value) { return vget_low_f32(value); } + +inline Tsimd_f32x3_t tfSimd4fToSimd3f(Tsimd_f32x4_t value) { return value; } + +inline Tsimd_f32x4_t tfSimd4fSplatIndex0(Tsimd_f32x4_t value) { return vdupq_laneq_f32(value, 0); } + +inline Tsimd_f32x4_t tfSimd4fSplatIndex1(Tsimd_f32x4_t value) { return vdupq_laneq_f32(value, 1); } + +inline Tsimd_f32x4_t tfSimd4fSplatIndex2(Tsimd_f32x4_t value) { return vdupq_laneq_f32(value, 2); } + +inline Tsimd_f32x4_t tfSimd4fSplatIndex3(Tsimd_f32x4_t value) { return vdupq_laneq_f32(value, 3); } + +inline Tsimd_i32x4_t tfSimd4iSplat(int32_t value) { return vdupq_n_s32(value); } + +inline Tsimd_f32x4_t tfSimd4fSplat(float value) { return vdupq_n_f32(value); } + +inline Tsimd_f32x4_t tfSimd4fCmpEq(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return vreinterpretq_f32_s32(vceqq_f32(arg1, arg2)); } +inline Tsimd_f32x4_t tfSimd4fCmpNeq(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return vreinterpretq_f32_s32(vmvnq_s32(vceqq_f32(arg1, arg2))); +} +inline Tsimd_f32x4_t tfSimd4fCmpGt(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return vreinterpretq_f32_s32(vcgtq_f32(arg1, arg2)); } +inline Tsimd_f32x4_t tfSimd4fCmpGtEq(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return vreinterpretq_f32_s32(vcgeq_f32(arg1, arg2)); } +inline Tsimd_f32x4_t tfSimd4fCmpLt(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return vreinterpretq_f32_s32(vcltq_f32(arg1, arg2)); } +inline Tsimd_f32x4_t tfSimd4fCmpLtEq(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { return vreinterpretq_f32_s32(vcleq_f32(arg1, arg2)); } + +inline Tsimd_i32x4_t tfSimd4iCmpEq(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { return vceqq_s32(arg1, arg2); } +inline Tsimd_i32x4_t tfSimd4iCmpNeq(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { return vmvnq_s32(vceqq_s32(arg1, arg2)); } +inline Tsimd_i32x4_t tfSimd4iCmpGt(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { return vcgtq_s32(arg1, arg2); } +inline Tsimd_i32x4_t tfSimd4iCmpGtEq(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { return vcgeq_s32(arg1, arg2); } +inline Tsimd_i32x4_t tfSimd4iCmpLt(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { return vcltq_s32(arg1, arg2); } +inline Tsimd_i32x4_t tfSimd4iCmpLtEq(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { return vcleq_s32(arg1, arg2); } + +inline bool tfSimd4fCmpAllLt(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return vminv_u32(vcltq_f32(arg1, arg2)) != 0; +} + +inline bool 
tfSimd4fCmpAllGt(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return vminv_u32(vcgtq_f32(arg1, arg2)) != 0; +} + +inline bool tfSimd4fCmpAllEq(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2) { + return vminv_u32(vceqq_f32(arg1, arg2)) != 0; +} + +inline bool tfSimd4iCmpAllEq(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2) { + return vminv_u32(vceqq_s32(arg1, arg2)) != 0; +} diff --git a/Forge/Math/Internal/TF_SimdFloat.inl b/Forge/Math/Internal/TF_SimdFloat.inl index d1f0f38f48..8c8973863f 100644 --- a/Forge/Math/Internal/TF_SimdFloat.inl +++ b/Forge/Math/Internal/TF_SimdFloat.inl @@ -17,19 +17,19 @@ static inline TSimdFloat4 tfVectorMul3x4F(const TSimdFloat4x3 a0, const TSimdFloat3 a1) { - TSimdFloat32x4 xxxx = tfSimdFloat3To4Splat0(a1.mRow); - TSimdFloat32x4 yyyy = tfSimdFloat3To4Splat1(a1.mRow); - TSimdFloat32x4 zzzz = tfSimdFloat3To4Splat2(a1.mRow); - TSimdFloat32x4 res = tfSimd4fMul(a0.mCol0, xxxx); + Tsimd_f32x4_t xxxx = tfS32x3FTo32x4FSplat0(a1.mRow); + Tsimd_f32x4_t yyyy = tfS32x3FTo32x4FSplat1(a1.mRow); + Tsimd_f32x4_t zzzz = tfS32x3FTo32x4FSplat2(a1.mRow); + Tsimd_f32x4_t res = tfSimd4fMul(a0.mCol0, xxxx); res = tfSimd4fMadd(a0.mCol1, yyyy, res); res = tfSimd4fMadd(a0.mCol2, zzzz, res); return { res }; } static inline TSimdFloat4 tfVectorMul2x4F(const TSimdFloat4x2 a0, const TSimdFloat2 a1) { - TSimdFloat32x4 xxxx = tfSimdFloat2To4Splat0(a1.mRow); - TSimdFloat32x4 yyyy = tfSimdFloat2To4Splat1(a1.mRow); - TSimdFloat32x4 res = tfSimd4fMul(a0.mCol0, xxxx); + Tsimd_f32x4_t xxxx = tfSimdFloat2To4Splat0(a1.mRow); + Tsimd_f32x4_t yyyy = tfSimdFloat2To4Splat1(a1.mRow); + Tsimd_f32x4_t res = tfSimd4fMul(a0.mCol0, xxxx); res = tfSimd4fMadd(a0.mCol1, yyyy, res); return { res }; } @@ -38,12 +38,12 @@ static inline TSimdFloat4 tfVectorMul2x4F(const TSimdFloat4x2 a0, const TSimdFlo * Multiplication of a 3x3 matrix and a 4 element vector **/ static inline TSimdFloat3 tfVectorMul3x3F(const TSimdFloat3x3 a0, const TSimdFloat3 a1) { - TSimdFloat32x3 xxx = tfSimd3fSplatIndex0(a1.mRow); - TSimdFloat32x3 yyy = tfSimd3fSplatIndex1(a1.mRow); - TSimdFloat32x3 zzz = tfSimd3fSplatIndex2(a1.mRow); - TSimdFloat32x3 res = tfSimd3fMul(a0.mCol0, xxx); - res = tfSimd3fMadd(a0.mCol1, yyy, res); - res = tfSimd3fMadd(a0.mCol2, zzz, res); + Tsimd_f32x3_t xxx = tfS32x3FSplatIndex0(a1.mRow); + Tsimd_f32x3_t yyy = tfS32x3FSplatIndex1(a1.mRow); + Tsimd_f32x3_t zzz = tfS32x3FSplatIndex2(a1.mRow); + Tsimd_f32x3_t res = tfS32x3FMul(a0.mCol0, xxx); + res = tfS32x3FMadd(a0.mCol1, yyy, res); + res = tfS32x3FMadd(a0.mCol2, zzz, res); return { res }; } @@ -52,19 +52,19 @@ static inline TSimdFloat3 tfVectorMul3x3F(const TSimdFloat3x3 a0, const TSimdFlo static inline TSimdFloat3 tfGetRowSimd3x4F(TSimdFloat4x3 input, int row) { ASSERT(row >= 0 && row < 4); switch(row) { - case 0: return {tfSimdFloat3x32Load( + case 0: return {tfSimd3x32FLoad( tfSimd4fSelectIndex0(input.mCol0), tfSimd4fSelectIndex0(input.mCol1), tfSimd4fSelectIndex0(input.mCol2))}; - case 1: return {tfSimdFloat3x32Load( + case 1: return {tfSimd3x32FLoad( tfSimd4fSelectIndex1(input.mCol0), tfSimd4fSelectIndex1(input.mCol1), tfSimd4fSelectIndex1(input.mCol2))}; - case 2: return {tfSimdFloat3x32Load( + case 2: return {tfSimd3x32FLoad( tfSimd4fSelectIndex2(input.mCol0), tfSimd4fSelectIndex2(input.mCol1), tfSimd4fSelectIndex2(input.mCol2))}; - case 3: return {tfSimdFloat3x32Load( + case 3: return {tfSimd3x32FLoad( tfSimd4fSelectIndex3(input.mCol0), tfSimd4fSelectIndex3(input.mCol1), tfSimd4fSelectIndex3(input.mCol2))}; @@ -73,34 +73,6 @@ static inline TSimdFloat3 
tfGetRowSimd3x4F(TSimdFloat4x3 input, int row) { } -static inline TSimdFloat4 tfGetRowSimd4x4F(TSimdFloat4x4 input, int row) { - ASSERT(row >= 0 && row < 4); - switch(row) { - case 0: return {tfSimdFloat4x32Load( - tfSimd4fSelectIndex0(input.mCol0), - tfSimd4fSelectIndex0(input.mCol1), - tfSimd4fSelectIndex0(input.mCol2), - tfSimd4fSelectIndex0(input.mCol3) - )}; - case 1: return {tfSimdFloat4x32Load( - tfSimd4fSelectIndex1(input.mCol0), - tfSimd4fSelectIndex1(input.mCol1), - tfSimd4fSelectIndex1(input.mCol2), - tfSimd4fSelectIndex1(input.mCol3))}; - case 2: return {tfSimdFloat4x32Load( - tfSimd4fSelectIndex2(input.mCol0), - tfSimd4fSelectIndex2(input.mCol1), - tfSimd4fSelectIndex2(input.mCol2), - tfSimd4fSelectIndex2(input.mCol3))}; - case 3: return {tfSimdFloat4x32Load( - tfSimd4fSelectIndex3(input.mCol0), - tfSimd4fSelectIndex3(input.mCol1), - tfSimd4fSelectIndex3(input.mCol2), - tfSimd4fSelectIndex3(input.mCol3))}; - } - return {}; - -} static inline TSimdFloat2 tfGetRowSimd2x4F(TSimdFloat4x2 input, int row) { @@ -134,35 +106,13 @@ static inline float tfGetRowSimd1x4F(TSimdFloat4x1 input, int row) { } -static inline void tfSetElemSimd4x4F(TSimdFloat4x4* input, int col, int row, float value){ - ASSERT(col >= 0 && col < 4); - ASSERT(row >= 0 && row < 4); - switch (row) - { - case 0: input->mCol[col] = tfSimd4fReplaceIndex0ByValue(input->mCol[col], value); break; - case 1: input->mCol[col] = tfSimd4fReplaceIndex1ByValue(input->mCol[col], value); break; - case 2: input->mCol[col] = tfSimd4fReplaceIndex2ByValue(input->mCol[col], value); break; - case 3: input->mCol[col] = tfSimd4fReplaceIndex3ByValue(input->mCol[col], value); break; - } -} -static inline void tfSetElemSimd3x4F(TSimdFloat4x3* input, int col, int row, float value){ - ASSERT(col >= 0 && col < 3); - ASSERT(row >= 0 && row < 4); - switch (row) - { - case 0: input->mCol[col] = tfSimd4fReplaceIndex0ByValue(input->mCol[col], value); break; - case 1: input->mCol[col] = tfSimd4fReplaceIndex1ByValue(input->mCol[col], value); break; - case 2: input->mCol[col] = tfSimd4fReplaceIndex2ByValue(input->mCol[col], value); break; - case 3: input->mCol[col] = tfSimd4fReplaceIndex3ByValue(input->mCol[col], value); break; - } -} static inline void tfSetElemSimd2x4F(TSimdFloat4x2* input, int col, int row, float value){ ASSERT(col >= 0 && col < 2); ASSERT(row >= 0 && row < 4); switch (row) { - case 0: input->mCol[col] = tfSimd4fReplaceIndex0ByValue(input->mCol[col], value); break; - case 1: input->mCol[col] = tfSimd4fReplaceIndex1ByValue(input->mCol[col], value); break; + case 0: input->mCol[col] = tfSimdFloat4ReplaceIndex0ByValue(input->mCol[col], value); break; + case 1: input->mCol[col] = tfSimdFloat4ReplaceIndex1ByValue(input->mCol[col], value); break; case 2: input->mCol[col] = tfSimd4fReplaceIndex2ByValue(input->mCol[col], value); break; case 3: input->mCol[col] = tfSimd4fReplaceIndex3ByValue(input->mCol[col], value); break; } @@ -172,8 +122,8 @@ static inline void tfSetElemSimd1x4F(TSimdFloat4x1* input, int col, int row, flo ASSERT(row >= 0 && row < 4); switch (row) { - case 0: input->mCol[col] = tfSimd4fReplaceIndex0ByValue(input->mCol[col], value); break; - case 1: input->mCol[col] = tfSimd4fReplaceIndex1ByValue(input->mCol[col], value); break; + case 0: input->mCol[col] = tfSimdFloat4ReplaceIndex0ByValue(input->mCol[col], value); break; + case 1: input->mCol[col] = tfSimdFloat4ReplaceIndex1ByValue(input->mCol[col], value); break; case 2: input->mCol[col] = tfSimd4fReplaceIndex2ByValue(input->mCol[col], value); break; case 3: 
input->mCol[col] = tfSimd4fReplaceIndex3ByValue(input->mCol[col], value); break; } @@ -261,9 +211,9 @@ static inline TSimdFloat3x3 tfLoadSimd3x3F( float m20, float m21, float m22 ) { TSimdFloat3x3 res; - res.mCol0 = tfSimdFloat3x32Load(m00, m10, m20); - res.mCol1 = tfSimdFloat3x32Load(m01, m11, m21); - res.mCol2 = tfSimdFloat3x32Load(m02, m12, m22); + res.mCol0 = tfSimd3x32FLoad(m00, m10, m20); + res.mCol1 = tfSimd3x32FLoad(m01, m11, m21); + res.mCol2 = tfSimd3x32FLoad(m02, m12, m22); return res; } static inline TSimdFloat3x2 tfLoadSimd2x3F( @@ -272,8 +222,8 @@ static inline TSimdFloat3x2 tfLoadSimd2x3F( float m20, float m21 ) { TSimdFloat3x2 res; - res.mCol0 = tfSimdFloat3x32Load(m00, m10, m20); - res.mCol1 = tfSimdFloat3x32Load(m01, m11, m21); + res.mCol0 = tfSimd3x32FLoad(m00, m10, m20); + res.mCol1 = tfSimd3x32FLoad(m01, m11, m21); return res; } static inline TSimdFloat3x1 tfLoadSimd1x3F( @@ -282,7 +232,7 @@ static inline TSimdFloat3x1 tfLoadSimd1x3F( float m20 ) { TSimdFloat3x1 res; - res.mCol0 = tfSimdFloat3x32Load(m00, m10, m20); + res.mCol0 = tfSimd3x32FLoad(m00, m10, m20); return res; } @@ -291,9 +241,9 @@ static inline void tfSetElemSimd3x3F(TSimdFloat3x3* input, int col, int row, flo ASSERT(row >= 0 && row < 3); switch (row) { - case 0: input->mCol[col] = tfSimdFloat3x32ReplaceIndex0ByValue(input->mCol[col], value); break; - case 1: input->mCol[col] = tfSimdFloat3x32ReplaceIndex1ByValue(input->mCol[col], value); break; - case 2: input->mCol[col] = tfSimdFloat3x32ReplaceIndex2ByValue(input->mCol[col], value); break; + case 0: input->mCol[col] = tfSimd3x32FReplaceIndex0ByValue(input->mCol[col], value); break; + case 1: input->mCol[col] = tfSimd3x32FReplaceIndex1ByValue(input->mCol[col], value); break; + case 2: input->mCol[col] = tfSimd3x32FReplaceIndex2ByValue(input->mCol[col], value); break; } } @@ -303,9 +253,9 @@ static inline void tfSetElemSimd2x3F(TSimdFloat3x2* input, int col, int row, flo ASSERT(row >= 0 && row < 3); switch (row) { - case 0: input->mCol[col] = tfSimdFloat3x32ReplaceIndex0ByValue(input->mCol[col], value); break; - case 1: input->mCol[col] = tfSimdFloat3x32ReplaceIndex1ByValue(input->mCol[col], value); break; - case 2: input->mCol[col] = tfSimdFloat3x32ReplaceIndex2ByValue(input->mCol[col], value); break; + case 0: input->mCol[col] = tfSimd3x32FReplaceIndex0ByValue(input->mCol[col], value); break; + case 1: input->mCol[col] = tfSimd3x32FReplaceIndex1ByValue(input->mCol[col], value); break; + case 2: input->mCol[col] = tfSimd3x32FReplaceIndex2ByValue(input->mCol[col], value); break; } } @@ -314,9 +264,9 @@ static inline void tfSetElemSimd1x3F(TSimdFloat3x1* input, int col, int row, flo ASSERT(row >= 0 && row < 3); switch (row) { - case 0: input->mCol[col] = tfSimdFloat3x32ReplaceIndex0ByValue(input->mCol[col], value); break; - case 1: input->mCol[col] = tfSimdFloat3x32ReplaceIndex1ByValue(input->mCol[col], value); break; - case 2: input->mCol[col] = tfSimdFloat3x32ReplaceIndex2ByValue(input->mCol[col], value); break; + case 0: input->mCol[col] = tfSimd3x32FReplaceIndex0ByValue(input->mCol[col], value); break; + case 1: input->mCol[col] = tfSimd3x32FReplaceIndex1ByValue(input->mCol[col], value); break; + case 2: input->mCol[col] = tfSimd3x32FReplaceIndex2ByValue(input->mCol[col], value); break; } } @@ -324,27 +274,27 @@ static inline float tfGetElemSimd3F(TSimdFloat3 a, int elem) { ASSERT(elem >= 0 && elem < 3); switch (elem) { case 0: - return tfSimd3fSelectIndex0(a.mRow); + return tfS32x3FSelectIndex0(a.mRow); case 1: - return tfSimd3fSelectIndex1(a.mRow); + 
return tfS32x3FSelectIndex1(a.mRow); case 2: - return tfSimd3fSelectIndex2(a.mRow); + return tfS32x3FSelectIndex2(a.mRow); } return 0; } -static inline float tfGetXSimd3F(TSimdFloat3 a) { return tfSimd3fSelectIndex0(a.mRow); } -static inline float tfGetYSimd3F(TSimdFloat3 a) { return tfSimd3fSelectIndex1(a.mRow); } -static inline float tfGetZSimd3F(TSimdFloat3 a) { return tfSimd3fSelectIndex2(a.mRow); } +static inline float tfGetXSimd3F(TSimdFloat3 a) { return tfS32x3FSelectIndex0(a.mRow); } +static inline float tfGetYSimd3F(TSimdFloat3 a) { return tfS32x3FSelectIndex1(a.mRow); } +static inline float tfGetZSimd3F(TSimdFloat3 a) { return tfS32x3FSelectIndex2(a.mRow); } static inline bool tfIsCloseSimd3F(TSimdFloat3 a, TSimdFloat3 b, float epsilon) { - return tfSimdFloat32x3CmpAllLt(tfSimd3fAbs(tfSimd3fSub(a.mRow, b.mRow)), tfSimd3fSplat(epsilon)); + return tfSimdFloat32x3CmpAllLt(tfS32x3FAbs(tfS32x3FSub(a.mRow, b.mRow)), tfS32x3FSplat(epsilon)); } static inline float tfVectorDot3F(TSimdFloat3 a0, TSimdFloat3 a1) { - TSimdFloat32x3 x2 = tfSimd3fMul(a0.mRow, a1.mRow); - TSimdFloat32x3 xy = tfSimd3fAdd(tfSimd3fSplatIndex1(x2), x2); - TSimdFloat32x3 xyz = tfSimd3fAdd(tfSimd3fSplatIndex2(x2), xy); - return tfSimd3fSelectIndex0(xyz); + Tsimd_f32x3_t x2 = tfS32x3FMul(a0.mRow, a1.mRow); + Tsimd_f32x3_t xy = tfS32x3FAdd(tfS32x3FSplatIndex1(x2), x2); + Tsimd_f32x3_t xyz = tfS32x3FAdd(tfS32x3FSplatIndex2(x2), xy); + return tfS32x3FSelectIndex0(xyz); } @@ -385,10 +335,10 @@ static inline TSimdFloat2 tfLoadSimd2F(float x, float y) { } static inline TSimdFloat2 tfVectorMul2x2F(const TSimdFloat2x2 a0, const TSimdFloat2 a1) { - TSimdFloat32x2 xx = tfSimd2fSplatIndex0(a1.mRow); - TSimdFloat32x2 yy = tfSimd2fSplatIndex1(a1.mRow); - TSimdFloat32x2 res = tfSimd2fMul(a0.mCol0, xx); - res = tfSimd2fMadd(a0.mCol1, yy, res); + Tsimd_f32x2_t xx = tfS32x2FSplatIndex0(a1.mRow); + Tsimd_f32x2_t yy = tfS32x2FSplatIndex1(a1.mRow); + Tsimd_f32x2_t res = tfS32x2FMul(a0.mCol0, xx); + res = tfS32x2FMadd(a0.mCol1, yy, res); return { res }; } @@ -396,24 +346,24 @@ static inline float tfGetElemSimd2F(TSimdFloat2 a, int elem) { ASSERT(elem >= 0 && elem < 2); switch (elem) { case 0: - return tfSimd2fSelectIndex0(a.mRow); + return tfS32x2FSelectIndex0(a.mRow); case 1: - return tfSimd2fSelectIndex1(a.mRow); + return tfS32x2FSelectIndex1(a.mRow); } return 0; } -static inline float tfGetXSimd2F(TSimdFloat2 a) { return tfSimd2fSelectIndex0(a.mRow); } -static inline float tfGetYSimd2F(TSimdFloat2 a) { return tfSimd2fSelectIndex1(a.mRow); } +static inline float tfGetXSimd2F(TSimdFloat2 a) { return tfS32x2FSelectIndex0(a.mRow); } +static inline float tfGetYSimd2F(TSimdFloat2 a) { return tfS32x2FSelectIndex1(a.mRow); } -static inline TSimdFloat3 tfVectorEleDiv3F(TSimdFloat3 a0, TSimdFloat3 a1) { return { tfSimd3fDiv(a0.mRow, a1.mRow) }; } -static inline TSimdFloat2 tfVectorEleDiv2F(TSimdFloat2 a0, TSimdFloat2 a1) { return { tfSimd2fDiv(a0.mRow, a1.mRow) }; } +static inline TSimdFloat3 tfVectorEleDiv3F(TSimdFloat3 a0, TSimdFloat3 a1) { return { tfS32x3FDiv(a0.mRow, a1.mRow) }; } +static inline TSimdFloat2 tfVectorEleDiv2F(TSimdFloat2 a0, TSimdFloat2 a1) { return { tfS32x2FDiv(a0.mRow, a1.mRow) }; } -static inline TSimdFloat3 tfVectorEleSub3F(TSimdFloat3 a0, TSimdFloat3 a1) { return { tfSimd3fSub(a0.mRow, a1.mRow) }; } -static inline TSimdFloat2 tfVectorEleSub2F(TSimdFloat2 a0, TSimdFloat2 a1) { return { tfSimd2fSub(a0.mRow, a1.mRow) }; } +static inline TSimdFloat3 tfVectorEleSub3F(TSimdFloat3 a0, TSimdFloat3 a1) { return { 
tfS32x3FSub(a0.mRow, a1.mRow) }; }
+static inline TSimdFloat2 tfVectorEleSub2F(TSimdFloat2 a0, TSimdFloat2 a1) { return { tfS32x2FSub(a0.mRow, a1.mRow) }; }
-static inline TSimdFloat3 tfVectorEleAdd3F(TSimdFloat3 a0, TSimdFloat3 a1) { return { tfSimd3fAdd(a0.mRow, a1.mRow) }; }
-static inline TSimdFloat2 tfVectorEleAdd2F(TSimdFloat2 a0, TSimdFloat2 a1) { return { tfSimd2fAdd(a0.mRow, a1.mRow) }; }
+static inline TSimdFloat3 tfVectorEleAdd3F(TSimdFloat3 a0, TSimdFloat3 a1) { return { tfS32x3FAdd(a0.mRow, a1.mRow) }; }
+static inline TSimdFloat2 tfVectorEleAdd2F(TSimdFloat2 a0, TSimdFloat2 a1) { return { tfS32x2FAdd(a0.mRow, a1.mRow) }; }
-static inline TSimdFloat3 tfVectorEleMul3F(TSimdFloat3 a0, TSimdFloat3 a1) { return { tfSimd3fMul(a0.mRow, a1.mRow) }; }
-static inline TSimdFloat2 tfVectorEleMul2F(TSimdFloat2 a0, TSimdFloat2 a1) { return { tfSimd2fMul(a0.mRow, a1.mRow) }; }
+static inline TSimdFloat3 tfVectorEleMul3F(TSimdFloat3 a0, TSimdFloat3 a1) { return { tfS32x3FMul(a0.mRow, a1.mRow) }; }
+static inline TSimdFloat2 tfVectorEleMul2F(TSimdFloat2 a0, TSimdFloat2 a1) { return { tfS32x2FMul(a0.mRow, a1.mRow) }; }
diff --git a/Forge/Math/Internal/TF_SimdFloat3.inl b/Forge/Math/Internal/TF_SimdFloat3.inl
index 8aec56c142..6bcb4ba6df 100644
--- a/Forge/Math/Internal/TF_SimdFloat3.inl
+++ b/Forge/Math/Internal/TF_SimdFloat3.inl
@@ -4,14 +4,14 @@
 #include "Forge/Math/TF_Simd32x3.h"
-static inline TSimdFloat3 tfLoadZeroSimd3x3F() {
-    TSimdFloat3 res;
-    res.mRow = tfSimd3fZero();
+static inline TSimdFloat3x3 tfLoadZeroSimd3x3F() {
+    TSimdFloat3x3 res;
+    res.mCol0 = tfS32x3FZero();
+    res.mCol1 = tfS32x3FZero();
+    res.mCol2 = tfS32x3FZero();
     return res;
 }
 static inline TSimdFloat3 tfLoadSimd3F(float x, float y, float z) {
     TSimdFloat3 res;
-    res.mRow = tfSimdFloat3x32Load(x, y, z);
+    res.mRow = tfSimd3x32FLoad(x, y, z);
     return res;
 }
diff --git a/Forge/Math/Internal/TF_SimdFloat4.inl b/Forge/Math/Internal/TF_SimdFloat4.inl
index 6f5aecc9a0..3f249dd684 100644
--- a/Forge/Math/Internal/TF_SimdFloat4.inl
+++ b/Forge/Math/Internal/TF_SimdFloat4.inl
@@ -14,17 +14,17 @@ static inline float tfGetElemSimd4F(TSimdFloat4 a, int elem) {
     ASSERT(elem >= 0 && elem < 4);
     switch (elem) {
         case 0:
-            return tfSimd4fSelectIndex0(a.mRow);
+            return tfS32x4FSelectIndex0(a.mRow);
         case 1:
-            return tfSimd4fSelectIndex1(a.mRow);
+            return tfS32x4FSelectIndex1(a.mRow);
         case 2:
-            return tfSimd4fSelectIndex2(a.mRow);
+            return tfS32x4FSelectIndex2(a.mRow);
         case 3:
-            return tfSimd4fSelectIndex3(a.mRow);
+            return tfS32x4FSelectIndex3(a.mRow);
     }
     return 0;
 }
-static inline float tfGetXSimd4F(TSimdFloat4 a) { return tfSimd4fSelectIndex0(a.mRow); }
-static inline float tfGetYSimd4F(TSimdFloat4 a) { return tfSimd4fSelectIndex1(a.mRow); }
-static inline float tfGetZSimd4F(TSimdFloat4 a) { return tfSimd4fSelectIndex2(a.mRow); }
-static inline float tfGetWSimd4F(TSimdFloat4 a) { return tfSimd4fSelectIndex3(a.mRow); }
+static inline float tfGetXSimd4F(TSimdFloat4 a) { return tfS32x4FSelectIndex0(a.mRow); }
+static inline float tfGetYSimd4F(TSimdFloat4 a) { return tfS32x4FSelectIndex1(a.mRow); }
+static inline float tfGetZSimd4F(TSimdFloat4 a) { return tfS32x4FSelectIndex2(a.mRow); }
+static inline float tfGetWSimd4F(TSimdFloat4 a) { return tfS32x4FSelectIndex3(a.mRow); }
diff --git a/Forge/Math/Internal/TF_SimdFloat4x4.inl b/Forge/Math/Internal/TF_SimdFloat4x4.inl
index 0aa2f9c727..53cab22b42 100644
--- a/Forge/Math/Internal/TF_SimdFloat4x4.inl
+++ b/Forge/Math/Internal/TF_SimdFloat4x4.inl
@@ -5,12 +5,66 @@
 #include "Forge/Math/TF_Simd32x4.h"
-static inline TSimdFloat4 tfVectorMul4x4F(const TSimdFloat4x4 a0, const TSimdFloat4 a1) {
-    TSimdFloat32x4 xxxx = tfSimd4fSplatIndex0(a1.mRow);
-    TSimdFloat32x4 yyyy = tfSimd4fSplatIndex1(a1.mRow);
-    TSimdFloat32x4 zzzz = tfSimd4fSplatIndex2(a1.mRow);
-    TSimdFloat32x4 wwww = tfSimd4fSplatIndex3(a1.mRow);
-    TSimdFloat32x4 res = tfSimd4fMul(a0.mCol0, xxxx);
+static inline TSimdFloat4 tfGetRowSimd4x4F(TSimdFloat4x4 input, int row) {
+    ASSERT(row >= 0 && row < 4);
+    switch(row) {
+        case 0: return {tfS32x4FLoad(
+            tfS32x4FSelectIndex0(input.mCol0),
+            tfS32x4FSelectIndex0(input.mCol1),
+            tfS32x4FSelectIndex0(input.mCol2),
+            tfS32x4FSelectIndex0(input.mCol3)
+        )};
+        case 1: return {tfS32x4FLoad(
+            tfS32x4FSelectIndex1(input.mCol0),
+            tfS32x4FSelectIndex1(input.mCol1),
+            tfS32x4FSelectIndex1(input.mCol2),
+            tfS32x4FSelectIndex1(input.mCol3))};
+        case 2: return {tfS32x4FLoad(
+            tfS32x4FSelectIndex2(input.mCol0),
+            tfS32x4FSelectIndex2(input.mCol1),
+            tfS32x4FSelectIndex2(input.mCol2),
+            tfS32x4FSelectIndex2(input.mCol3))};
+        case 3: return {tfS32x4FLoad(
+            tfS32x4FSelectIndex3(input.mCol0),
+            tfS32x4FSelectIndex3(input.mCol1),
+            tfS32x4FSelectIndex3(input.mCol2),
+            tfS32x4FSelectIndex3(input.mCol3))};
+    }
+    return {};
+}
+
+static inline void tfSetRowSimd4x4F(TSimdFloat4x4* input, int row, TSimdFloat4 value) {
+    // ASSERT(row >= 0 && row < 4);
+    // switch (row) {
+    //     case 0:
+    //         input->mCol0 = tfS32x4FReplaceIndex0(input->mCol0, tfS32x4FSplatIndex0(value));
+    // }
+}
+static inline TSimdFloat4 tfGetColumnSimd4x4F(int column);
+static inline void tfSetColumnSimd4x4F(TSimdFloat4x4* input, int row);
+static inline void tfSetElemSimd4x4F(TSimdFloat4x4* input, int col, int row, float value);
+
+
+//static inline void tfSetElemSimd3x4F(TSimdFloat4x3* input, int col, int row, float value){
+//    ASSERT(col >= 0 && col < 3);
+//    ASSERT(row >= 0 && row < 4);
+//    switch (row)
+//    {
+//        case 0: input->mCol[col] = tfSimd4fReplaceIndex0ByValue(input->mCol[col], value); break;
+//        case 1: input->mCol[col] = tfSimd4fReplaceIndex1ByValue(input->mCol[col], value); break;
+//        case 2: input->mCol[col] = tfSimd4fReplaceIndex2ByValue(input->mCol[col], value); break;
+//        case 3: input->mCol[col] = tfSimd4fReplaceIndex3ByValue(input->mCol[col], value); break;
+//    }
+//}
+
+
+static inline TSimdFloat4 tfVectorMulSimd4x4F(const TSimdFloat4x4 a0, const TSimdFloat4 a1) {
+    Tsimd_f32x4_t xxxx = tfSimd4fSplatIndex0(a1.mRow);
+    Tsimd_f32x4_t yyyy = tfSimd4fSplatIndex1(a1.mRow);
+    Tsimd_f32x4_t zzzz = tfSimd4fSplatIndex2(a1.mRow);
+    Tsimd_f32x4_t wwww = tfSimd4fSplatIndex3(a1.mRow);
+    Tsimd_f32x4_t res = tfSimd4fMul(a0.mCol0, xxxx);
     res = tfSimd4fMadd(a0.mCol1, yyyy, res);
     res = tfSimd4fMadd(a0.mCol2, zzzz, res);
     res = tfSimd4fMadd(a0.mCol3, wwww, res);
@@ -37,30 +91,30 @@ static inline TSimdFloat4x4 tfLoadIdentitySimd4x4F()
     return value;
 }
-static inline TSimdFloat4x4 tfMatMul4x4F_4x4F(TSimdFloat4x4 a0, TSimdFloat4x4 a1) {
+static inline TSimdFloat4x4 tfMulSimd4x4F_4x4F(TSimdFloat4x4 a0, TSimdFloat4x4 a1) {
     TSimdFloat4x4 res;
-    res.mCol0 = tfVectorMul4x4F(a0, TSimdFloat4{ a1.mCol0 }).mRow;
-    res.mCol1 = tfVectorMul4x4F(a0, TSimdFloat4{ a1.mCol1 }).mRow;
-    res.mCol2 = tfVectorMul4x4F(a0, TSimdFloat4{ a1.mCol2 }).mRow;
-    res.mCol3 = tfVectorMul4x4F(a0, TSimdFloat4{ a1.mCol3 }).mRow;
+    res.mCol0 = tfVectorMulSimd4x4F(a0, TSimdFloat4{ a1.mCol0 }).mRow;
+    res.mCol1 = tfVectorMulSimd4x4F(a0, TSimdFloat4{ a1.mCol1 }).mRow;
+    res.mCol2 = tfVectorMulSimd4x4F(a0, TSimdFloat4{ a1.mCol2 }).mRow;
+    res.mCol3 = tfVectorMulSimd4x4F(a0, TSimdFloat4{ a1.mCol3 }).mRow;
     return
res; } -static inline TSimdFloat4x3 tfMatMul4x4F_3x4F(TSimdFloat4x4 a0, TSimdFloat4x3 a1) { +static inline TSimdFloat4x3 tfMulSimd4x4F_3x4F(TSimdFloat4x4 a0, TSimdFloat4x3 a1) { TSimdFloat4x3 res; - res.mCol0 = tfVectorMul4x4F(a0, TSimdFloat4{ a1.mCol0 }).mRow; - res.mCol1 = tfVectorMul4x4F(a0, TSimdFloat4{ a1.mCol1 }).mRow; - res.mCol2 = tfVectorMul4x4F(a0, TSimdFloat4{ a1.mCol2 }).mRow; + res.mCol0 = tfVectorMulSimd4x4F(a0, TSimdFloat4{ a1.mCol0 }).mRow; + res.mCol1 = tfVectorMulSimd4x4F(a0, TSimdFloat4{ a1.mCol1 }).mRow; + res.mCol2 = tfVectorMulSimd4x4F(a0, TSimdFloat4{ a1.mCol2 }).mRow; return res; } -static inline TSimdFloat4x2 tfMatMul4x4F_2x4F(TSimdFloat4x4 a0, TSimdFloat4x2 a1) { +static inline TSimdFloat4x2 tfMulSimd4x4F_2x4F(TSimdFloat4x4 a0, TSimdFloat4x2 a1) { TSimdFloat4x2 res; - res.mCol0 = tfVectorMul4x4F(a0, TSimdFloat4{ a1.mCol0 }).mRow; - res.mCol1 = tfVectorMul4x4F(a0, TSimdFloat4{ a1.mCol1 }).mRow; + res.mCol0 = tfVectorMulSimd4x4F(a0, TSimdFloat4{ a1.mCol0 }).mRow; + res.mCol1 = tfVectorMulSimd4x4F(a0, TSimdFloat4{ a1.mCol1 }).mRow; return res; } -static inline TSimdFloat4x1 tfMatMul4x4F_1x4F(TSimdFloat4x4 a0, TSimdFloat4x1 a1) { +static inline TSimdFloat4x1 tfMulSimd4x4F_1x4F(TSimdFloat4x4 a0, TSimdFloat4x1 a1) { TSimdFloat4x1 res; - res.mCol0 = tfVectorMul4x4F(a0, TSimdFloat4{ a1.mCol0 }).mRow; + res.mCol0 = tfVectorMulSimd4x4F(a0, TSimdFloat4{ a1.mCol0 }).mRow; return res; } diff --git a/Forge/Math/Internal/TF_SimdFloat4x4_neon.inl b/Forge/Math/Internal/TF_SimdFloat4x4_neon.inl index f177452c3b..40cefac482 100644 --- a/Forge/Math/Internal/TF_SimdFloat4x4_neon.inl +++ b/Forge/Math/Internal/TF_SimdFloat4x4_neon.inl @@ -4,15 +4,15 @@ #include "../TF_SimdFloat4x4.h" #endif -static inline TSimdFloat4x4 tfMatTranpose4x4F(TSimdFloat4x4 a0) { +static inline TSimdFloat4x4 tfTransposeSimd4x4F(TSimdFloat4x4 a0) { // abcd aecg aeim // efgh -> bfdh -> bfjn // ijkl imko cgko // mnop jnlp dhlp - const TSimdFloat32x4 tmp0 = vtrn1q_f32(a0.mCol0, a0.mCol1); - const TSimdFloat32x4 tmp1 = vtrn2q_f32(a0.mCol0, a0.mCol1); - const TSimdFloat32x4 tmp2 = vtrn1q_f32(a0.mCol2, a0.mCol3); - const TSimdFloat32x4 tmp3 = vtrn2q_f32(a0.mCol2, a0.mCol3); + const TSimd32Fx4 tmp0 = vtrn1q_f32(a0.mCol0, a0.mCol1); + const TSimd32Fx4 tmp1 = vtrn2q_f32(a0.mCol0, a0.mCol1); + const TSimd32Fx4 tmp2 = vtrn1q_f32(a0.mCol2, a0.mCol3); + const TSimd32Fx4 tmp3 = vtrn2q_f32(a0.mCol2, a0.mCol3); TSimdFloat4x4 result; result.mCol0 = vtrn1q_f64(tmp0, tmp2); result.mCol1 = vtrn1q_f64(tmp1, tmp3); diff --git a/Forge/Math/Internal/TF_SimdFloat4x4_scalar.inl b/Forge/Math/Internal/TF_SimdFloat4x4_scalar.inl index b52a058436..4d29a1b84b 100644 --- a/Forge/Math/Internal/TF_SimdFloat4x4_scalar.inl +++ b/Forge/Math/Internal/TF_SimdFloat4x4_scalar.inl @@ -4,11 +4,11 @@ #include "../TF_SimdFloat4x4.h" #endif -static inline TSimdFloat4x4 tfMatTranpose4x4F(TSimdFloat4x4 a0) { - TSimdFloat32x4 cols0 = { { a0.mCol0.v[0], a0.mCol1.v[0], a0.mCol2.v[0], a0.mCol3.v[0] } }; - TSimdFloat32x4 cols1 = { { a0.mCol0.v[1], a0.mCol1.v[1], a0.mCol2.v[1], a0.mCol3.v[1] } }; - TSimdFloat32x4 cols2 = { { a0.mCol0.v[2], a0.mCol1.v[2], a0.mCol2.v[2], a0.mCol3.v[2] } }; - TSimdFloat32x4 cols3 = { { a0.mCol0.v[3], a0.mCol1.v[3], a0.mCol2.v[3], a0.mCol3.v[3] } }; +static inline TSimdFloat4x4 tfTransposeSimd4x4F(TSimdFloat4x4 a0) { + Tsimd_f32x4_t cols0 = { { a0.mCol0.v[0], a0.mCol1.v[0], a0.mCol2.v[0], a0.mCol3.v[0] } }; + Tsimd_f32x4_t cols1 = { { a0.mCol0.v[1], a0.mCol1.v[1], a0.mCol2.v[1], a0.mCol3.v[1] } }; + Tsimd_f32x4_t cols2 = { { 
a0.mCol0.v[2], a0.mCol1.v[2], a0.mCol2.v[2], a0.mCol3.v[2] } }; + Tsimd_f32x4_t cols3 = { { a0.mCol0.v[3], a0.mCol1.v[3], a0.mCol2.v[3], a0.mCol3.v[3] } }; TSimdFloat4x4 result; result.mCol0 = cols0; result.mCol1 = cols1; diff --git a/Forge/Math/Internal/TF_SimdFloat4x4_sse.inl b/Forge/Math/Internal/TF_SimdFloat4x4_sse.inl index 5a178a0914..751cde3813 100644 --- a/Forge/Math/Internal/TF_SimdFloat4x4_sse.inl +++ b/Forge/Math/Internal/TF_SimdFloat4x4_sse.inl @@ -4,11 +4,11 @@ #include "../TF_SimdFloat4x4.h" #endif -static inline TSimdFloat4x4 tfMatTranpose4x4F(TSimdFloat4x4 a0) { - TSimdFloat32x4 tmp0 = _mm_shuffle_ps(a0.mCol0, a0.mCol1, 0x44); - TSimdFloat32x4 tmp2 = _mm_shuffle_ps(a0.mCol0, a0.mCol1, 0xEE); - TSimdFloat32x4 tmp1 = _mm_shuffle_ps(a0.mCol2, a0.mCol3, 0x44); - TSimdFloat32x4 tmp3 = _mm_shuffle_ps(a0.mCol2, a0.mCol3, 0xEE); +static inline TSimdFloat4x4 tfTransposeSimd4x4F(TSimdFloat4x4 a0) { + Tsimd_f32x4_t tmp0 = _mm_shuffle_ps(a0.mCol0, a0.mCol1, 0x44); + Tsimd_f32x4_t tmp2 = _mm_shuffle_ps(a0.mCol0, a0.mCol1, 0xEE); + Tsimd_f32x4_t tmp1 = _mm_shuffle_ps(a0.mCol2, a0.mCol3, 0x44); + Tsimd_f32x4_t tmp3 = _mm_shuffle_ps(a0.mCol2, a0.mCol3, 0xEE); TSimdFloat4x4 result; result.mCol0 = _mm_shuffle_ps(tmp0, tmp1, 0x88); result.mCol1 = _mm_shuffle_ps(tmp0, tmp1, 0xDD); diff --git a/Forge/Math/Internal/TF_SimdFloat_sse.inl b/Forge/Math/Internal/TF_SimdFloat_sse.inl index d559c9b678..ce6e055ca2 100644 --- a/Forge/Math/Internal/TF_SimdFloat_sse.inl +++ b/Forge/Math/Internal/TF_SimdFloat_sse.inl @@ -5,9 +5,9 @@ #endif -static inline TSimdFloat32x4 ___Simd4x324Dot4(TSimdFloat4 a0, TSimdFloat4 a1) { - TSimdFloat32x4 x2 = _mm_mul_ps(a0.mRow, a1.mRow); - TSimdFloat32x4 tmp = _mm_add_ps(x2, _mm_shuffle_ps(x2, x2, _MM_SHUFFLE(2, 3, 0, 1))); +static inline Tsimd_f32x4_t ___Simd4x324Dot4(TSimdFloat4 a0, TSimdFloat4 a1) { + Tsimd_f32x4_t x2 = _mm_mul_ps(a0.mRow, a1.mRow); + Tsimd_f32x4_t tmp = _mm_add_ps(x2, _mm_shuffle_ps(x2, x2, _MM_SHUFFLE(2, 3, 0, 1))); return _mm_add_ps(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 2, 3))); } diff --git a/Forge/Math/TF_Simd32x2.h b/Forge/Math/TF_Simd32x2.h index 571186ae2b..58921c93f4 100644 --- a/Forge/Math/TF_Simd32x2.h +++ b/Forge/Math/TF_Simd32x2.h @@ -12,75 +12,75 @@ #include "Forge/Math/Internal/SimdTypes.h" -inline TSimdFloat32x2 tfSimd2fSplat(float value); -inline TSimdInt32x2 tfSimd2iSplat(int32_t value); - -inline TSimdFloat32x2 tfSimd2fZero(); -inline TSimdInt32x2 tfSimd2iZero(); - -inline TSimdInt32x2 tfSimd2fToSimd2i(TSimdFloat32x2 value); -inline TSimdFloat32x2 tfSimd2iToSimd2f(TSimdInt32x2 value); - -inline TSimdFloat32x2 tfSimd2fSplatIndex0(TSimdFloat32x2 value); -inline TSimdFloat32x2 tfSimd2fSplatIndex1(TSimdFloat32x2 value); - -static inline TSimdFloat32x4 tfSimdFloat2To4Splat0(TSimdFloat32x2 value); -static inline TSimdFloat32x4 tfSimdFloat2To4Splat1(TSimdFloat32x2 value); - -inline TSimdFloat32x2 tfSimd2fSelect(TSimdFloat32x2 arg0, TSimdFloat32x2 arg1, TSimdFloat32x2 mask); -inline TSimdInt32x2 tfSimd2iSelect(TSimdInt32x2 arg0, TSimdInt32x2 arg1, TSimdInt32x2 mask); - -inline float tfSimd2fSelectIndex0(TSimdFloat32x2 value); -inline float tfSimd2fSelectIndex1(TSimdFloat32x2 value); - -inline TSimdFloat32x2 tfSimdFloat2x32Load(float x, float y); -inline TSimdInt32x2 tfSimdInt2x32Load(int32_t x, int32_t y); - -inline TSimdFloat32x2 tfSimd2fAdd(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2); -inline TSimdFloat32x2 tfSimd2fSub(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2); -inline TSimdFloat32x2 tfSimd2fMul(TSimdFloat32x2 arg1, TSimdFloat32x2 
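/*
 * [Editor's note, illustration only] The SSE tfTransposeSimd4x4F above is the standard 4x4
 * shuffle ladder: masks 0x44 / 0xEE pair up the low / high halves of two columns, and
 * 0x88 / 0xDD then gather the even / odd lanes of those pairs. A scalar reference with a
 * hypothetical column-major float[4][4] layout, for comparison:
 */
static inline void refTranspose4x4(const float m[4][4], float t[4][4]) {
    for (int c = 0; c < 4; ++c)
        for (int r = 0; r < 4; ++r)
            t[c][r] = m[r][c]; // column c, row r of the result takes column r, row c of the input
}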
arg2); -inline TSimdFloat32x2 tfSimd2fMadd(TSimdFloat32x2 mul1, TSimdFloat32x2 mul2, TSimdFloat32x2 add); -inline TSimdFloat32x2 tfSimd2fDiv(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2); -inline TSimdFloat32x2 tfSimd2fAbs(TSimdFloat32x2 value); - -inline TSimdFloat32x2 tfSimd2fNot(TSimdFloat32x2 value); -inline TSimdFloat32x2 tfSimd2fAnd(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2); -inline TSimdFloat32x2 tfSimd2fAndNot(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2); -inline TSimdFloat32x2 tfSimd2fOr(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2); -inline TSimdFloat32x2 tfSimd2fXor(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2); - -inline TSimdInt32x2 tfSimd2iNot(TSimdInt32x2 value); -inline TSimdInt32x2 tfSimd2iAnd(TSimdInt32x2 arg1, TSimdInt32x2 arg2); -inline TSimdInt32x2 tfSimd2iAndNot(TSimdInt32x2 arg1, TSimdInt32x2 arg2); -inline TSimdInt32x2 tfSimd2iOr(TSimdInt32x2 arg1, TSimdInt32x2 arg2); -inline TSimdInt32x2 tfSimd2iXor(TSimdInt32x2 arg1, TSimdInt32x2 arg2); - -inline TSimdFloat32x2 tfSimd2fFloor(TSimdFloat32x2 value); -inline TSimdFloat32x2 tfSimd2fCeil(TSimdFloat32x2 value); -inline TSimdFloat32x2 tfSimd2fRound(TSimdFloat32x2 value); // Ties to even (banker's rounding) -inline TSimdFloat32x2 tfSimd2fTruncate(TSimdFloat32x2 value); -inline TSimdFloat32x2 tfSimd2fMin(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2); -inline TSimdFloat32x2 tfSimd2fMax(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2); -inline TSimdFloat32x2 tfSimd2fClamp(TSimdFloat32x2 value, TSimdFloat32x2 min, TSimdFloat32x2 max); - -inline TSimdInt32x2 tfSimd2iCmpEq(TSimdInt32x2 arg1, TSimdInt32x2 arg2); -inline TSimdInt32x2 tfSimd2iCmpNeq(TSimdInt32x2 arg1, TSimdInt32x2 arg2); -inline TSimdInt32x2 tfSimd2iCmpGt(TSimdInt32x2 arg1, TSimdInt32x2 arg2); -inline TSimdInt32x2 tfSimd2iCmpGtEq(TSimdInt32x2 arg1, TSimdInt32x2 arg2); -inline TSimdInt32x2 tfSimd2iCmpLt(TSimdInt32x2 arg1, TSimdInt32x2 arg2); -inline TSimdInt32x2 tfSimd2iCmpLtEq(TSimdInt32x2 arg1, TSimdInt32x2 arg2); - -inline TSimdFloat32x2 tfSimd2fCmpEq(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2); -inline TSimdFloat32x2 tfSimd2fCmpNeq(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2); -inline TSimdFloat32x2 tfSimd2fCmpGt(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2); -inline TSimdFloat32x2 tfSimd2fCmpGtEq(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2); -inline TSimdFloat32x2 tfSimd2fCmpLt(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2); -inline TSimdFloat32x2 tfSimd2fCmpLtEq(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2); - -inline bool tfSimd2iCmpAllEq(TSimdInt32x2 arg1, TSimdInt32x2 arg2); -inline bool tfSimd2fCmpAllEq(TSimdFloat32x2 arg1, TSimdFloat32x2 arg2); - -static inline bool tfSimdFloat32x2CmpAllLt(TSimdFloat32x2 a, TSimdFloat32x2 b); +inline Tsimd_f32x2_t tfS32x2FSplat(float value); +inline Tsimd_i32x2_t tfS32x2ISplat(int32_t value); + +inline Tsimd_f32x2_t tfS32x2FZero(); +inline Tsimd_i32x2_t tfS32x2IZero(); + +inline Tsimd_i32x2_t tfS32x2FToSimd2i(Tsimd_f32x2_t value); +inline Tsimd_f32x2_t tfS32x2IToSimd2f(Tsimd_i32x2_t value); + +inline Tsimd_f32x2_t tfS32x2FSplatIndex0(Tsimd_f32x2_t value); +inline Tsimd_f32x2_t tfS32x2FSplatIndex1(Tsimd_f32x2_t value); + +static inline Tsimd_f32x4_t tfSimdFloat2To4Splat0(Tsimd_f32x2_t value); +static inline Tsimd_f32x4_t tfSimdFloat2To4Splat1(Tsimd_f32x2_t value); + +inline Tsimd_f32x2_t tfS32x2FSelect(Tsimd_f32x2_t arg0, Tsimd_f32x2_t arg1, Tsimd_f32x2_t mask); +inline Tsimd_i32x2_t tfS32x2ISelect(Tsimd_i32x2_t arg0, Tsimd_i32x2_t arg1, Tsimd_i32x2_t mask); + +inline float tfS32x2FSelectIndex0(Tsimd_f32x2_t value); +inline float tfS32x2FSelectIndex1(Tsimd_f32x2_t 
value); + +inline Tsimd_f32x2_t tfSimdFloat2x32Load(float x, float y); +inline Tsimd_i32x2_t tfSimdInt2x32Load(int32_t x, int32_t y); + +inline Tsimd_f32x2_t tfS32x2FAdd(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2); +inline Tsimd_f32x2_t tfS32x2FSub(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2); +inline Tsimd_f32x2_t tfS32x2FMul(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2); +inline Tsimd_f32x2_t tfS32x2FMadd(Tsimd_f32x2_t mul1, Tsimd_f32x2_t mul2, Tsimd_f32x2_t add); +inline Tsimd_f32x2_t tfS32x2FDiv(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2); +inline Tsimd_f32x2_t tfS32x2FAbs(Tsimd_f32x2_t value); + +inline Tsimd_f32x2_t tfS32x2FNot(Tsimd_f32x2_t value); +inline Tsimd_f32x2_t tfS32x2FAnd(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2); +inline Tsimd_f32x2_t tfS32x2FAndNot(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2); +inline Tsimd_f32x2_t tfS32x2FOr(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2); +inline Tsimd_f32x2_t tfS32x2FXor(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2); + +inline Tsimd_i32x2_t tfS32x2INot(Tsimd_i32x2_t value); +inline Tsimd_i32x2_t tfS32x2IAnd(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2); +inline Tsimd_i32x2_t tfS32x2IAndNot(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2); +inline Tsimd_i32x2_t tfS32x2IOr(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2); +inline Tsimd_i32x2_t tfS32x2IXor(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2); + +inline Tsimd_f32x2_t tfS32x2FFloor(Tsimd_f32x2_t value); +inline Tsimd_f32x2_t tfS32x2FCeil(Tsimd_f32x2_t value); +inline Tsimd_f32x2_t tfS32x2FRound(Tsimd_f32x2_t value); // Ties to even (banker's rounding) +inline Tsimd_f32x2_t tfS32x2FTruncate(Tsimd_f32x2_t value); +inline Tsimd_f32x2_t tfS32x2FMin(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2); +inline Tsimd_f32x2_t tfS32x2FMax(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2); +inline Tsimd_f32x2_t tfS32x2FClamp(Tsimd_f32x2_t value, Tsimd_f32x2_t min, Tsimd_f32x2_t max); + +inline Tsimd_i32x2_t tfS32x2ICmpEq(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2); +inline Tsimd_i32x2_t tfS32x2ICmpNeq(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2); +inline Tsimd_i32x2_t tfS32x2ICmpGt(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2); +inline Tsimd_i32x2_t tfS32x2ICmpGtEq(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2); +inline Tsimd_i32x2_t tfS32x2ICmpLt(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2); +inline Tsimd_i32x2_t tfS32x2ICmpLtEq(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2); + +inline Tsimd_f32x2_t tfS32x2FCmpEq(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2); +inline Tsimd_f32x2_t tfS32x2FCmpNeq(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2); +inline Tsimd_f32x2_t tfS32x2FCmpGt(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2); +inline Tsimd_f32x2_t tfS32x2FCmpGtEq(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2); +inline Tsimd_f32x2_t tfS32x2FCmpLt(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2); +inline Tsimd_f32x2_t tfS32x2FCmpLtEq(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2); + +inline bool tfS32x2ICmpAllEq(Tsimd_i32x2_t arg1, Tsimd_i32x2_t arg2); +inline bool tfS32x2FCmpAllEq(Tsimd_f32x2_t arg1, Tsimd_f32x2_t arg2); + +static inline bool tfS32x2FCmpAllLt(Tsimd_f32x2_t a, Tsimd_f32x2_t b); #if defined(TF_FEATURE_CPU_SSE) #include "Internal/TF_Simd32x2_sse.inl" diff --git a/Forge/Math/TF_Simd32x3.h b/Forge/Math/TF_Simd32x3.h index c21048a654..f5f5eedd7b 100644 --- a/Forge/Math/TF_Simd32x3.h +++ b/Forge/Math/TF_Simd32x3.h @@ -11,83 +11,83 @@ #include "Forge/Math/Internal/SimdTypes.h" -inline TSimdFloat32x3 tfSimd3fSplat(float value); -inline TSimdInt32x3 tfSimd3iSplat(int32_t value); - -inline TSimdFloat32x3 tfSimd3fZero(); -inline TSimdInt32x3 tfSimd3iZero(); - -inline TSimdFloat32x2 tfSimd3fToSimd2f(TSimdFloat32x3 value); -inline TSimdInt32x3 
tfSimd3fToSimd3i(TSimdFloat32x3 value); -inline TSimdFloat32x3 tfSimd3iToSimd3f(TSimdInt32x3 value); - -inline TSimdFloat32x3 tfSimd3fSplatIndex0(TSimdFloat32x3 value); -inline TSimdFloat32x3 tfSimd3fSplatIndex1(TSimdFloat32x3 value); -inline TSimdFloat32x3 tfSimd3fSplatIndex2(TSimdFloat32x3 value); - -static inline TSimdFloat32x4 tfSimdFloat3To4Splat0(TSimdFloat32x3 value); -static inline TSimdFloat32x4 tfSimdFloat3To4Splat1(TSimdFloat32x3 value); -static inline TSimdFloat32x4 tfSimdFloat3To4Splat2(TSimdFloat32x3 value); - -inline TSimdFloat32x3 tfSimd3fSelect(TSimdFloat32x3 arg0, TSimdFloat32x3 arg1, TSimdFloat32x3 mask); -inline TSimdInt32x3 tfSimd3iSelect(TSimdInt32x3 arg0, TSimdInt32x3 arg1, TSimdInt32x3 mask); - -inline float tfSimd3fSelectIndex0(TSimdFloat32x3 value); -inline float tfSimd3fSelectIndex1(TSimdFloat32x3 value); -inline float tfSimd3fSelectIndex2(TSimdFloat32x3 value); - -static inline TSimdFloat32x3 tfSimdFloat3x32ReplaceIndex0ByValue(TSimdFloat32x3 input, float value); -static inline TSimdFloat32x3 tfSimdFloat3x32ReplaceIndex1ByValue(TSimdFloat32x3 input, float value); -static inline TSimdFloat32x3 tfSimdFloat3x32ReplaceIndex2ByValue(TSimdFloat32x3 input, float value); - -inline TSimdFloat32x3 tfSimdFloat3x32Load(float x, float y, float z); -inline TSimdInt32x3 tfSimdInt3x32Load(int32_t x, int32_t y, int32_t z); - -inline TSimdFloat32x3 tfSimd3fAdd(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2); -inline TSimdFloat32x3 tfSimd3fSub(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2); -inline TSimdFloat32x3 tfSimd3fMul(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2); -inline TSimdFloat32x3 tfSimd3fMadd(TSimdFloat32x3 mul1, TSimdFloat32x3 mul2, TSimdFloat32x3 add); -inline TSimdFloat32x3 tfSimd3fDiv(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2); -inline TSimdFloat32x3 tfSimd3fAbs(TSimdFloat32x3 value); - -inline TSimdFloat32x3 tfSimd3fNot(TSimdFloat32x3 value); -inline TSimdFloat32x3 tfSimd3fAnd(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2); -inline TSimdFloat32x3 tfSimd3fAndNot(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2); -inline TSimdFloat32x3 tfSimd3fOr(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2); -inline TSimdFloat32x3 tfSimd3fXor(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2); - -inline TSimdInt32x3 tfSimd3iNot(TSimdInt32x3 value); -inline TSimdInt32x3 tfSimd3iAnd(TSimdInt32x3 arg1, TSimdInt32x3 arg2); -inline TSimdInt32x3 tfSimd3iAndNot(TSimdInt32x3 arg1, TSimdInt32x3 arg2); -inline TSimdInt32x3 tfSimd3iOr(TSimdInt32x3 arg1, TSimdInt32x3 arg2); -inline TSimdInt32x3 tfSimd3iXor(TSimdInt32x3 arg1, TSimdInt32x3 arg2); - -inline TSimdFloat32x3 tfSimd3fFloor(TSimdFloat32x3 value); -inline TSimdFloat32x3 tfSimd3fCeil(TSimdFloat32x3 value); -inline TSimdFloat32x3 tfSimd3fRound(TSimdFloat32x3 value); // Ties to even (banker's rounding) -inline TSimdFloat32x3 tfSimd3fTruncate(TSimdFloat32x3 value); -inline TSimdFloat32x3 tfSimd3fMin(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2); -inline TSimdFloat32x3 tfSimd3fMax(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2); -inline TSimdFloat32x3 tfSimd3fClamp(TSimdFloat32x3 value, TSimdFloat32x3 min, TSimdFloat32x3 max); - -inline TSimdInt32x3 tfSimd3iCmpEq(TSimdInt32x3 arg1, TSimdInt32x3 arg2); -inline TSimdInt32x3 tfSimd3iCmpNeq(TSimdInt32x3 arg1, TSimdInt32x3 arg2); -inline TSimdInt32x3 tfSimd3iCmpGt(TSimdInt32x3 arg1, TSimdInt32x3 arg2); -inline TSimdInt32x3 tfSimd3iCmpGtEq(TSimdInt32x3 arg1, TSimdInt32x3 arg2); -inline TSimdInt32x3 tfSimd3iCmpLt(TSimdInt32x3 arg1, TSimdInt32x3 arg2); -inline TSimdInt32x3 tfSimd3iCmpLtEq(TSimdInt32x3 arg1, TSimdInt32x3 arg2); - -inline 
TSimdFloat32x3 tfSimd3fCmpEq(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2); -inline TSimdFloat32x3 tfSimd3fCmpNeq(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2); -inline TSimdFloat32x3 tfSimd3fCmpGt(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2); -inline TSimdFloat32x3 tfSimd3fCmpGtEq(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2); -inline TSimdFloat32x3 tfSimd3fCmpLt(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2); -inline TSimdFloat32x3 tfSimd3fCmpLtEq(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2); - -inline bool tfSimd3iCmpAllEq(TSimdInt32x3 arg1, TSimdInt32x3 arg2); -inline bool tfSimd3fCmpAllEq(TSimdFloat32x3 arg1, TSimdFloat32x3 arg2); - -static inline bool tfSimdFloat32x3CmpAllLt(TSimdFloat32x3 a, TSimdFloat32x3 b); +inline Tsimd_f32x3_t tfS32x3FSplat(float value); +inline Tsimd_i32x3_t tfS32x3iSplat(int32_t value); + +inline Tsimd_f32x3_t tfS32x3FZero(); +inline Tsimd_i32x3_t tfS32x3iZero(); + +inline Tsimd_f32x2_t tfS32x3FToSimd2f(Tsimd_f32x3_t value); +inline Tsimd_i32x3_t tfS32x3FToSimd3i(Tsimd_f32x3_t value); +inline Tsimd_f32x3_t tfS32x3iToSimd3f(Tsimd_i32x3_t value); + +inline Tsimd_f32x3_t tfS32x3FSplatIndex0(Tsimd_f32x3_t value); +inline Tsimd_f32x3_t tfS32x3FSplatIndex1(Tsimd_f32x3_t value); +inline Tsimd_f32x3_t tfS32x3FSplatIndex2(Tsimd_f32x3_t value); + +static inline Tsimd_f32x4_t tfS32x3FTo32x4FSplat0(Tsimd_f32x3_t value); +static inline Tsimd_f32x4_t tfS32x3FTo32x4FSplat1(Tsimd_f32x3_t value); +static inline Tsimd_f32x4_t tfS32x3FTo32x4FSplat2(Tsimd_f32x3_t value); + +inline Tsimd_f32x3_t tfS32x3FSelect(Tsimd_f32x3_t arg0, Tsimd_f32x3_t arg1, Tsimd_f32x3_t mask); +inline Tsimd_i32x3_t tfS32x3iSelect(Tsimd_i32x3_t arg0, Tsimd_i32x3_t arg1, Tsimd_i32x3_t mask); + +inline float tfS32x3FSelectIndex0(Tsimd_f32x3_t value); +inline float tfS32x3FSelectIndex1(Tsimd_f32x3_t value); +inline float tfS32x3FSelectIndex2(Tsimd_f32x3_t value); + +static inline Tsimd_f32x3_t tfSimd3x32FReplaceIndex0ByValue(Tsimd_f32x3_t input, float value); +static inline Tsimd_f32x3_t tfSimd3x32FReplaceIndex1ByValue(Tsimd_f32x3_t input, float value); +static inline Tsimd_f32x3_t tfSimd3x32FReplaceIndex2ByValue(Tsimd_f32x3_t input, float value); + +inline Tsimd_f32x3_t tfSimd3x32FLoad(float x, float y, float z); +inline Tsimd_i32x3_t tfSimd3x32ILoad(int32_t x, int32_t y, int32_t z); + +inline Tsimd_f32x3_t tfS32x3FAdd(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2); +inline Tsimd_f32x3_t tfS32x3FSub(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2); +inline Tsimd_f32x3_t tfS32x3FMul(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2); +inline Tsimd_f32x3_t tfS32x3FMadd(Tsimd_f32x3_t mul1, Tsimd_f32x3_t mul2, Tsimd_f32x3_t add); +inline Tsimd_f32x3_t tfS32x3FDiv(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2); +inline Tsimd_f32x3_t tfS32x3FAbs(Tsimd_f32x3_t value); + +inline Tsimd_f32x3_t tfS32x3FNot(Tsimd_f32x3_t value); +inline Tsimd_f32x3_t tfS32x3FAnd(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2); +inline Tsimd_f32x3_t tfS32x3FAndNot(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2); +inline Tsimd_f32x3_t tfS32x3FOr(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2); +inline Tsimd_f32x3_t tfS32x3FXor(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2); + +inline Tsimd_i32x3_t tfS32x3iNot(Tsimd_i32x3_t value); +inline Tsimd_i32x3_t tfS32x3iAnd(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2); +inline Tsimd_i32x3_t tfS32x3iAndNot(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2); +inline Tsimd_i32x3_t tfS32x3iOr(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2); +inline Tsimd_i32x3_t tfS32x3iXor(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2); + +inline Tsimd_f32x3_t tfS32x3FFloor(Tsimd_f32x3_t value); +inline Tsimd_f32x3_t 
tfS32x3FCeil(Tsimd_f32x3_t value); +inline Tsimd_f32x3_t tfS32x3FRound(Tsimd_f32x3_t value); // Ties to even (banker's rounding) +inline Tsimd_f32x3_t tfS32x3FTruncate(Tsimd_f32x3_t value); +inline Tsimd_f32x3_t tfS32x3FMin(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2); +inline Tsimd_f32x3_t tfS32x3FMax(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2); +inline Tsimd_f32x3_t tfS32x3FClamp(Tsimd_f32x3_t value, Tsimd_f32x3_t min, Tsimd_f32x3_t max); + +inline Tsimd_i32x3_t tfS32x3iCmpEq(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2); +inline Tsimd_i32x3_t tfS32x3iCmpNeq(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2); +inline Tsimd_i32x3_t tfS32x3iCmpGt(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2); +inline Tsimd_i32x3_t tfS32x3iCmpGtEq(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2); +inline Tsimd_i32x3_t tfS32x3iCmpLt(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2); +inline Tsimd_i32x3_t tfS32x3iCmpLtEq(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2); + +inline Tsimd_f32x3_t tfS32x3FCmpEq(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2); +inline Tsimd_f32x3_t tfS32x3FCmpNeq(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2); +inline Tsimd_f32x3_t tfS32x3FCmpGt(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2); +inline Tsimd_f32x3_t tfS32x3FCmpGtEq(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2); +inline Tsimd_f32x3_t tfS32x3FCmpLt(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2); +inline Tsimd_f32x3_t tfS32x3FCmpLtEq(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2); + +inline bool tfS32x3iCmpAllEq(Tsimd_i32x3_t arg1, Tsimd_i32x3_t arg2); +inline bool tfS32x3FCmpAllEq(Tsimd_f32x3_t arg1, Tsimd_f32x3_t arg2); + +static inline bool tfSimdFloat32x3CmpAllLt(Tsimd_f32x3_t a, Tsimd_f32x3_t b); #if defined(TF_FEATURE_CPU_SSE) #include "Internal/TF_Simd32x3_sse.inl" diff --git a/Forge/Math/TF_Simd32x4.h b/Forge/Math/TF_Simd32x4.h index e55e6d7d74..09c5648a16 100644 --- a/Forge/Math/TF_Simd32x4.h +++ b/Forge/Math/TF_Simd32x4.h @@ -12,84 +12,182 @@ #include "Forge/Math/Internal/SimdTypes.h" #include "Forge/TF_Log.h" -inline TSimdFloat32x4 tfSimd4fSplat(float value); -inline TSimdInt32x4 tfSimd4iSplat(int32_t value); - -inline TSimdFloat32x4 tfSimd4fZero(); -inline TSimdInt32x4 tfSimd4iZero(); - -inline TSimdFloat32x2 tfSimd4fToSimd2f(TSimdFloat32x4 value); -inline TSimdFloat32x3 tfSimd4fToSimd3f(TSimdFloat32x4 value); -inline TSimdInt32x4 tfSimd4fToSimd4i(TSimdFloat32x4 value); -inline TSimdFloat32x4 tfSimd4iToSimd4f(TSimdInt32x4 value); - -inline TSimdFloat32x4 tfSimd4fSplatIndex0(TSimdFloat32x4 value); -inline TSimdFloat32x4 tfSimd4fSplatIndex1(TSimdFloat32x4 value); -inline TSimdFloat32x4 tfSimd4fSplatIndex2(TSimdFloat32x4 value); -inline TSimdFloat32x4 tfSimd4fSplatIndex3(TSimdFloat32x4 value); - -inline TSimdFloat32x4 tfSimd4fSelect(TSimdFloat32x4 arg0, TSimdFloat32x4 arg1, TSimdFloat32x4 mask); -inline TSimdInt32x4 tfSimd4iSelect(TSimdInt32x4 arg0, TSimdInt32x4 arg1, TSimdInt32x4 mask); - -inline float tfSimd4fSelectIndex0(TSimdFloat32x4 value); -inline float tfSimd4fSelectIndex1(TSimdFloat32x4 value); -inline float tfSimd4fSelectIndex2(TSimdFloat32x4 value); -inline float tfSimd4fSelectIndex3(TSimdFloat32x4 value); - -static inline TSimdFloat32x4 tfSimd4fReplaceIndex0ByValue(TSimdFloat32x4 input, float value); -static inline TSimdFloat32x4 tfSimd4fReplaceIndex1ByValue(TSimdFloat32x4 input, float value); -static inline TSimdFloat32x4 tfSimd4fReplaceIndex2ByValue(TSimdFloat32x4 input, float value); -static inline TSimdFloat32x4 tfSimd4fReplaceIndex3ByValue(TSimdFloat32x4 input, float value); - -inline TSimdFloat32x4 tfSimdFloat4x32Load(float x, float y, float z, float w); -inline TSimdInt32x4 
tfSimdInt4x32Load(int32_t x, int32_t y, int32_t z, int32_t w); - -inline TSimdFloat32x4 tfSimd4fAdd(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2); -inline TSimdFloat32x4 tfSimd4fSub(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2); -inline TSimdFloat32x4 tfSimd4fMul(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2); -inline TSimdFloat32x4 tfSimd4fMadd(TSimdFloat32x4 mul1, TSimdFloat32x4 mul2, TSimdFloat32x4 add); -inline TSimdFloat32x4 tfSimdFloat4x32Div(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2); -inline TSimdFloat32x4 tfSimd4fAbs(TSimdFloat32x4 value); - -inline TSimdFloat32x4 tfSimd4fNot(TSimdFloat32x4 value); -inline TSimdFloat32x4 tfSimd4fAnd(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2); -inline TSimdFloat32x4 tfSimd4fAndNot(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2); -inline TSimdFloat32x4 tfSimd4fOr(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2); -inline TSimdFloat32x4 tfSimd4fXor(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2); - -inline TSimdInt32x4 tfSimd4iNot(TSimdInt32x4 value); -inline TSimdInt32x4 tfSimd4iAnd(TSimdInt32x4 arg1, TSimdInt32x4 arg2); -inline TSimdInt32x4 tfSimd4iAndNot(TSimdInt32x4 arg1, TSimdInt32x4 arg2); -inline TSimdInt32x4 tfSimd4iOr(TSimdInt32x4 arg1, TSimdInt32x4 arg2); -inline TSimdInt32x4 tfSimd4iXor(TSimdInt32x4 arg1, TSimdInt32x4 arg2); - -inline TSimdFloat32x4 tfSimd4fFloor(TSimdFloat32x4 value); -inline TSimdFloat32x4 tfSimd4fCeil(TSimdFloat32x4 value); -inline TSimdFloat32x4 tfSimd4fRound(TSimdFloat32x4 value); // Ties to even (banker's rounding) -inline TSimdFloat32x4 tfSimd4fTruncate(TSimdFloat32x4 value); -inline TSimdFloat32x4 tfSimd4fMin(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2); -inline TSimdFloat32x4 tfSimd4fMax(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2); -inline TSimdFloat32x4 tfSimd4fClamp(TSimdFloat32x4 value, TSimdFloat32x4 min, TSimdFloat32x4 max); - -inline TSimdInt32x4 tfSimd4iCmpEq(TSimdInt32x4 arg1, TSimdInt32x4 arg2); -inline TSimdInt32x4 tfSimd4iCmpNeq(TSimdInt32x4 arg1, TSimdInt32x4 arg2); -inline TSimdInt32x4 tfSimd4iCmpGt(TSimdInt32x4 arg1, TSimdInt32x4 arg2); -inline TSimdInt32x4 tfSimd4iCmpGtEq(TSimdInt32x4 arg1, TSimdInt32x4 arg2); -inline TSimdInt32x4 tfSimd4iCmpLt(TSimdInt32x4 arg1, TSimdInt32x4 arg2); -inline TSimdInt32x4 tfSimd4iCmpLtEq(TSimdInt32x4 arg1, TSimdInt32x4 arg2); - -inline TSimdFloat32x4 tfSimd4fCmpEq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2); -inline TSimdFloat32x4 tfSimd4fCmpNeq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2); -inline TSimdFloat32x4 tfSimd4fCmpGt(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2); -inline TSimdFloat32x4 tfSimd4fCmpGtEq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2); -inline TSimdFloat32x4 tfSimd4fCmpLt(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2); -inline TSimdFloat32x4 tfSimd4fCmpLtEq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2); - -inline bool tfSimd4iCmpAllEq(TSimdInt32x4 arg1, TSimdInt32x4 arg2); - -inline bool tfSimd4fCmpAllEq(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2); -inline bool tfSimd4fCmpAllLt(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2); -inline bool tfSimd4fCmpAllGt(TSimdFloat32x4 arg1, TSimdFloat32x4 arg2); +// Tsimd_f32x4_t +static inline Tsimd_f32x4_t tfSimdLoad_f32x4(float x, float y, float z, float w); +static inline Tsimd_f32x4_t tfSimdZero_f32x4(); + +static inline Tsimd_f32x4_t tfSimdSplat_f32x4(float value); +static inline Tsimd_f32x4_t tfSimdSplat0_f32x4(Tsimd_f32x4_t value); +static inline Tsimd_f32x4_t tfSimdSplat1_f32x4(Tsimd_f32x4_t value); +static inline Tsimd_f32x4_t tfSimdSplat2_f32x4(Tsimd_f32x4_t value); +static inline Tsimd_f32x4_t tfSimdSplat3_f32x4(Tsimd_f32x4_t value); + +static inline Tsimd_f32x4_t 
tfSimdDot_f32x4(Tsimd_f32x4_t a,Tsimd_f32x4_t b); +static inline float tfSimdDot_f32x4_f32(Tsimd_f32x4_t a,Tsimd_f32x4_t b); + +static inline float tfSimdSelect_f32x4(Tsimd_f32x4_t value, int index); +static inline float tfSimdSelect0_f32x4(Tsimd_f32x4_t value); +static inline float tfSimdSelect1_f32x4(Tsimd_f32x4_t value); +static inline float tfSimdSelect2_f32x4(Tsimd_f32x4_t value); +static inline float tfSimdSelect3_f32x4(Tsimd_f32x4_t value); + +static inline Tsimd_f32x4_t tfSimdAdd_f32x4(Tsimd_f32x4_t a, Tsimd_f32x4_t b); +static inline Tsimd_f32x4_t tfSimdMul_f32x4(Tsimd_f32x4_t a, Tsimd_f32x4_t b); +static inline Tsimd_f32x4_t tfSimdDiv_f32x4(Tsimd_f32x4_t a, Tsimd_f32x4_t b); +static inline Tsimd_f32x4_t tfSimdAbs_f32x4(Tsimd_f32x4_t a); +static inline Tsimd_f32x4_t tfSimdMadd_f32x4(Tsimd_f32x4_t a, Tsimd_f32x4_t b, Tsimd_f32x4_t c); + +static inline Tsimd_f32x4_t tfSimdNot_f32x4(Tsimd_f32x4_t value); +static inline Tsimd_f32x4_t tfSimdAnd_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +static inline Tsimd_f32x4_t tfSimdAndNot_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +static inline Tsimd_f32x4_t tfSimdOr_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +static inline Tsimd_f32x4_t tfSimdXor_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); + +static inline Tsimd_f32x4_t tfSimdCmpEq_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +static inline Tsimd_f32x4_t tfSimdCmpNeq_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +static inline Tsimd_f32x4_t tfSimdCmpGt_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +static inline Tsimd_f32x4_t tfSimdCmpGtEq_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +static inline Tsimd_f32x4_t tfSimdCmpLt_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +static inline Tsimd_f32x4_t tfSimdCmpLtEq_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); + +static inline bool tfSimdCmpAllEq_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +static inline bool tfSimdCmpAllNeq_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +static inline bool tfSimdCmpAllGt_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +static inline bool tfSimdCmpAllGtEq_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +static inline bool tfSimdCmpAllLt_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +static inline bool tfSimdCmpAllLtEq_f32x4(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); + +// Tsimd_i32x4_t +static inline Tsimd_i32x4_t tfSimdLoad_i32x4(int32_t x, int32_t y, int32_t z, int32_t w); + + +static inline Tsimd_i32x4_t tfSimdSplat_i32x4(int32_t value); +static inline Tsimd_i32x4_t tfSimdSplat0_i32x4(Tsimd_i32x4_t value); +static inline Tsimd_i32x4_t tfSimdSplat1_i32x4(Tsimd_i32x4_t value); +static inline Tsimd_i32x4_t tfSimdSplat2_i32x4(Tsimd_i32x4_t value); +static inline Tsimd_i32x4_t tfSimdSplat3_i32x4(Tsimd_i32x4_t value); + +static inline int32_t tfSimdSelect_i32x4(Tsimd_i32x4_t value, int index); +static inline int32_t tfSimdSelect0_i32x4(Tsimd_i32x4_t value); +static inline int32_t tfSimdSelect1_i32x4(Tsimd_i32x4_t value); +static inline int32_t tfSimdSelect2_i32x4(Tsimd_i32x4_t value); +static inline int32_t tfSimdSelect3_i32x4(Tsimd_i32x4_t value); + +static inline Tsimd_i32x4_t tfSimdAdd_i32x4(Tsimd_i32x4_t a, Tsimd_i32x4_t b); +static inline Tsimd_i32x4_t tfSimdMul_i32x4(Tsimd_i32x4_t a, Tsimd_i32x4_t b); +static inline Tsimd_i32x4_t tfSimdAbs_i32x4(Tsimd_i32x4_t a); +static inline Tsimd_i32x4_t tfSimdMadd_i32x4(Tsimd_i32x4_t a, Tsimd_i32x4_t b, Tsimd_i32x4_t c); + +static inline Tsimd_i32x4_t tfSimdNot_i32x4(Tsimd_i32x4_t value); +static inline Tsimd_i32x4_t tfSimdAnd_i32x4(Tsimd_i32x4_t 
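/*
 * [Editor's sketch, not part of the patch] Example of how the renamed f32x4 API above is
 * meant to compose; it only uses functions declared in this header and assumes the per-ISA
 * .inl files provide the definitions. A production caller would prefer tfSimdDot_f32x4_f32.
 */
static inline float exampleDot_f32x4(Tsimd_f32x4_t a, Tsimd_f32x4_t b) {
    Tsimd_f32x4_t p = tfSimdMul_f32x4(a, b);                  // lane-wise products
    return tfSimdSelect0_f32x4(p) + tfSimdSelect1_f32x4(p)    // horizontal sum by lane extraction
         + tfSimdSelect2_f32x4(p) + tfSimdSelect3_f32x4(p);
}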
arg1, Tsimd_i32x4_t arg2); +static inline Tsimd_i32x4_t tfSimdAndNot_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); +static inline Tsimd_i32x4_t tfSimdOr_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); +static inline Tsimd_i32x4_t tfSimdXor_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); + +static inline Tsimd_i32x4_t tfSimdCmpEq_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); +static inline Tsimd_i32x4_t tfSimdCmpNeq_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); +static inline Tsimd_i32x4_t tfSimdCmpGt_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); +static inline Tsimd_i32x4_t tfSimdCmpGtEq_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); +static inline Tsimd_i32x4_t tfSimdCmpLt_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); +static inline Tsimd_i32x4_t tfSimdCmpLtEq_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); + +static inline bool tfSimdCmpAllEq_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); +static inline bool tfSimdCmpAllNeq_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); +static inline bool tfSimdCmpAllGt_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); +static inline bool tfSimdCmpAllGtEq_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); +static inline bool tfSimdCmpAllLt_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); +static inline bool tfSimdCmpAllLtEq_i32x4(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); + +// -------------------------------------------- + +static inline Tsimd_i32x4_t tfSimd_f32x4_To_i32x4(Tsimd_f32x4_t a); +static inline Tsimd_f32x4_t tfSimd_i32x4_To_f32x4(Tsimd_i32x4_t a); + + + +//inline Tsimd_f32x4_t tfS32x4FSplat(float value); +//inline Tsimd_i32x4_t Tsimd_i32x4_tSplat(int32_t value); +// +//inline Tsimd_f32x4_t tfS32x4FLoad(float x, float y, float z, float w); +//inline Tsimd_i32x4_t tfSimdInt4x32Load(int32_t x, int32_t y, int32_t z, int32_t w); +// +//inline Tsimd_f32x4_t tfS32x4FZero(); +//inline Tsimd_i32x4_t Tsimd_i32x4_tZero(); +// +//inline Tsimd_f32x2_t tfS32x4FToS32x2F(Tsimd_f32x4_t value); +//inline Tsimd_f32x3_t tfS32x4FToS32x3F(Tsimd_f32x4_t value); +//inline Tsimd_i32x4_t tfS32x4FToS32x4I(Tsimd_f32x4_t value); +//inline Tsimd_f32x4_t Tsimd_i32x4_tToSimd4f(Tsimd_i32x4_t value); +// +//inline Tsimd_f32x4_t tfS32x4FSplatIndex0(Tsimd_f32x4_t value); +//inline Tsimd_f32x4_t tfS32x4FSplatIndex1(Tsimd_f32x4_t value); +//inline Tsimd_f32x4_t tfS32x4FSplatIndex2(Tsimd_f32x4_t value); +//inline Tsimd_f32x4_t tfS32x4FSplatIndex3(Tsimd_f32x4_t value); +// +//inline Tsimd_f32x4_t tfS32x4FSelect(Tsimd_f32x4_t arg0, Tsimd_f32x4_t arg1, Tsimd_f32x4_t mask); +//inline Tsimd_i32x4_t Tsimd_i32x4_tSelect(Tsimd_i32x4_t arg0, Tsimd_i32x4_t arg1, Tsimd_i32x4_t mask); +// +//inline float tfS32x4FSelectIndex0(Tsimd_f32x4_t value); +//inline float tfS32x4FSelectIndex1(Tsimd_f32x4_t value); +//inline float tfS32x4FSelectIndex2(Tsimd_f32x4_t value); +//inline float tfS32x4FSelectIndex3(Tsimd_f32x4_t value); +// +//static inline Tsimd_f32x4_t tfS32x4FReplaceIndex0ByValue(Tsimd_f32x4_t input, float value); +//static inline Tsimd_f32x4_t tfS32x4FReplaceIndex1ByValue(Tsimd_f32x4_t input, float value); +//static inline Tsimd_f32x4_t tfS32x4FReplaceIndex2ByValue(Tsimd_f32x4_t input, float value); +//static inline Tsimd_f32x4_t tfS32x4FReplaceIndex3ByValue(Tsimd_f32x4_t input, float value); +// +//static inline Tsimd_f32x4_t tfS32x4FReplaceIndex0(Tsimd_f32x4_t input, Tsimd_f32x4_t value); +//static inline Tsimd_f32x4_t tfS32x4FReplaceIndex1(Tsimd_f32x4_t input, Tsimd_f32x4_t value); +//static inline Tsimd_f32x4_t tfS32x4FReplaceIndex2(Tsimd_f32x4_t input, Tsimd_f32x4_t value); +//static inline 
Tsimd_f32x4_t tfS32x4FReplaceIndex3(Tsimd_f32x4_t input, Tsimd_f32x4_t value); +// +//inline Tsimd_f32x4_t tfS32x4FAdd(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +//inline Tsimd_f32x4_t tfS32x4FSub(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +//inline Tsimd_f32x4_t tfS32x4FMul(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +//inline Tsimd_f32x4_t tfS32x4FMadd(Tsimd_f32x4_t mul1, Tsimd_f32x4_t mul2, Tsimd_f32x4_t add); +//inline Tsimd_f32x4_t tfS32x4FDiv(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +//inline Tsimd_f32x4_t tfS32x4FAbs(Tsimd_f32x4_t value); +// +//inline Tsimd_f32x4_t tfS32x4FNot(Tsimd_f32x4_t value); +//inline Tsimd_f32x4_t tfS32x4FAnd(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +//inline Tsimd_f32x4_t tfS32x4FAndNot(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +//inline Tsimd_f32x4_t tfS32x4FOr(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +//inline Tsimd_f32x4_t tfS32x4FXor(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +// +//inline Tsimd_i32x4_t Tsimd_i32x4_tNot(Tsimd_i32x4_t value); +//inline Tsimd_i32x4_t Tsimd_i32x4_tAnd(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); +//inline Tsimd_i32x4_t Tsimd_i32x4_tAndNot(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); +//inline Tsimd_i32x4_t Tsimd_i32x4_tOr(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); +//inline Tsimd_i32x4_t Tsimd_i32x4_tXor(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); +// +//inline Tsimd_f32x4_t tfS32x4FFloor(Tsimd_f32x4_t value); +//inline Tsimd_f32x4_t tfS32x4FCeil(Tsimd_f32x4_t value); +//inline Tsimd_f32x4_t tfS32x4FRound(Tsimd_f32x4_t value); // Ties to even (banker's rounding) +//inline Tsimd_f32x4_t tfS32x4FTruncate(Tsimd_f32x4_t value); +//inline Tsimd_f32x4_t tfS32x4FMin(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +//inline Tsimd_f32x4_t tfS32x4FMax(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +//inline Tsimd_f32x4_t tfS32x4FClamp(Tsimd_f32x4_t value, Tsimd_f32x4_t min, Tsimd_f32x4_t max); +// +//inline Tsimd_i32x4_t Tsimd_i32x4_tCmpEq(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); +//inline Tsimd_i32x4_t Tsimd_i32x4_tCmpNeq(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); +//inline Tsimd_i32x4_t Tsimd_i32x4_tCmpGt(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); +//inline Tsimd_i32x4_t Tsimd_i32x4_tCmpGtEq(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); +//inline Tsimd_i32x4_t Tsimd_i32x4_tCmpLt(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); +//inline Tsimd_i32x4_t Tsimd_i32x4_tCmpLtEq(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); +// +//inline Tsimd_f32x4_t tfS32x4FCmpEq(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +//inline Tsimd_f32x4_t tfS32x4FCmpNeq(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +//inline Tsimd_f32x4_t tfS32x4FCmpGt(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +//inline Tsimd_f32x4_t tfS32x4FCmpGtEq(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +//inline Tsimd_f32x4_t tfS32x4FCmpLt(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +//inline Tsimd_f32x4_t tfS32x4FCmpLtEq(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +// +//inline bool Tsimd_i32x4_tCmpAllEq(Tsimd_i32x4_t arg1, Tsimd_i32x4_t arg2); +// +//inline bool tfS32x4FCmpAllEq(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +//inline bool tfS32x4FCmpAllLt(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); +//inline bool tfS32x4FCmpAllGt(Tsimd_f32x4_t arg1, Tsimd_f32x4_t arg2); #if defined(TF_FEATURE_CPU_SSE) #include "Internal/TF_Simd32x4_sse.inl" diff --git a/Forge/Math/TF_Simd32x4x4.h b/Forge/Math/TF_Simd32x4x4.h new file mode 100644 index 0000000000..44274b8abe --- /dev/null +++ b/Forge/Math/TF_Simd32x4x4.h @@ -0,0 +1,45 @@ +#pragma once +#ifndef TF_MATH_SIMD_FLOAT4x4_H +#define TF_MATH_SIMD_FLOAT4x4_H + +#include "Internal/SimdTypes.h" + +//static inline TSimdFloat4x4 
tfLoadZeroSimd4x4F(); +//static inline TSimdFloat4x4 tfLoadIdentitySimd4x4F(); +//static inline TSimdFloat4x4 tfLoadSimd4x4F( +// float m00, float m01, float m02, float m03, +// float m10, float m11, float m12, float m13, +// float m20, float m21, float m22, float m23, +// float m30, float m31, float m32, float m33); +// +//static inline TSimdFloat4x4 tfAddPerElemSimd4x4(TSimdFloat4x4 a0, TSimdFloat4x4 a1); +//static inline TSimdFloat4x4 tfMulPerElemSimd4x4(TSimdFloat4x4 a0, TSimdFloat4x4 a1); +//static inline TSimdFloat4x4 tfDivPerElemSimd4x4(TSimdFloat4x4 a0, TSimdFloat4x4 a1); +// +//static inline TSimdFloat4 tfVectorMulSimd4x4F(const TSimdFloat4x4 a0, const TSimdFloat4 a1); +// +//static inline TSimdFloat4x4 tfMulSimd4x4F_4x4F(TSimdFloat4x4 a0, TSimdFloat4x4 a1); +//static inline TSimdFloat4x3 tfMulSimd4x4F_3x4F(TSimdFloat4x4 a0, TSimdFloat4x3 a1); +//static inline TSimdFloat4x2 tfMulSimd4x4F_2x4F(TSimdFloat4x4 a0, TSimdFloat4x2 a1); +//static inline TSimdFloat4x1 tfMulSimd4x4F_1x4F(TSimdFloat4x4 a0, TSimdFloat4x1 a1); +//static inline TSimdFloat4x4 tfTransposeSimd4x4F(TSimdFloat4x4 a0); +//static inline TSimdFloat4x1 tfInverseFullSimd4x4F(TSimdFloat4x4 a0); +// +//static inline TSimdFloat4 tfGetRowSimd4x4F(TSimdFloat4x4 input, int row); +//static inline void tfSetRowSimd4x4F(TSimdFloat4x4* input, int row, TSimdFloat4 value); +//static inline TSimdFloat4 tfGetColumnSimd4x4F(int column); +//static inline void tfSetColumnSimd4x4F(TSimdFloat4x4* input, int row); +//static inline void tfSetElemSimd4x4F(TSimdFloat4x4* input, int col, int row, float value); +// +//static inline bool tfIsCloseSimd4x4F(TSimdFloat4x4 a, TSimdFloat4x4 b, float epsilon); + + +#include "Internal/TF_SimdFloat4x4.inl" +#if defined(TF_FEATURE_CPU_SSE) +#include "Internal/TF_SimdFloat4x4_sse.inl" +#elif defined(TF_FEATURE_CPU_NEON) +#include "Internal/TF_SimdFloat4x4_neon.inl" +#else +#include "Internal/TF_SimdFloat4x4_scalar.inl" +#endif +#endif diff --git a/Forge/Math/TF_SimdFloat.h b/Forge/Math/TF_SimdFloat.h index 174a8840f8..ead63701f4 100644 --- a/Forge/Math/TF_SimdFloat.h +++ b/Forge/Math/TF_SimdFloat.h @@ -28,28 +28,13 @@ static inline float tfGetYSimd2F(TSimdFloat2 a); static inline bool tfIsCloseSimd2F(TSimdFloat2 a, TSimdFloat2 b, float epsilon); -// ----------------------------------------------------------- -// TSimdFloat3 -// ----------------------------------------------------------- - - -// ----------------------------------------------------------- -// TSimdFloat4 -// ----------------------------------------------------------- #include "TF_SimdFloat3.h" #include "TF_SimdFloat4.h" +#include "TF_SimdFloat3x3.h" -// ----------------------------------------------------------- -// TSimdFloat3x3 -// ----------------------------------------------------------- -static inline TSimdFloat3x3 tfLoadIdentitySimd3x4F(); -static inline void tfSetElemSimd3x3F(TSimdFloat3x3* input, int col, int row, float value); - -// ----------------------------------------------------------- -// TSimdFloat3x4 -// ----------------------------------------------------------- +static inline TSimdFloat4x3 tfLoadIdentitySimd4x3F(); -static inline TSimdFloat4x3 tfLoadSimd3x4F(float m00, float m01, float m02, float m10, float m11, float m12, float m20, float m21, +static inline TSimdFloat4x3 tfLoadSimd4x3F(float m00, float m01, float m02, float m10, float m11, float m12, float m20, float m21, float m22, float m30, float m31, float m32); // ----------------------------------------------------------- @@ -60,46 +45,27 @@ static inline 
TSimdFloat4x3 tfLoadSimd3x4F(float m00, float m01, float m02, flo static inline TSimdFloat4x2 tfLoadSimd4x2F(float m00, float m01, float m10, float m11, float m20, float m21, float m30, float m31); static inline TSimdFloat4x1 tfLoadSimd4x1F(float m00, float m10, float m20, float m30); -static inline TSimdFloat3x3 tfLoadSimd3x3F(float m00, float m01, float m02, float m10, float m11, float m12, float m20, float m21, - float m22); static inline TSimdFloat3x2 tfLoadSimd2x3F(float m00, float m01, float m10, float m11, float m20, float m21); static inline TSimdFloat3x1 tfLoadSimd1x3F(float m00, float m10, float m20); - static inline void tfSetElemSimd2x3F(TSimdFloat3x2* input, int col, int row, float value); static inline void tfSetElemSimd1x3F(TSimdFloat3x1* input, int col, int row, float value); static inline TSimdFloat2x2 tfLoadSimd2x2F(float m00, float m01, float m10, float m11); static inline TSimdFloat2x1 tfLoadSimd2x1F(float m00, float m10); -static inline TSimdFloat4x4 tfTransposeSimd4x4F(TSimdFloat4x4 input); - static inline bool tfIsCloseSimd3x4F(TSimdFloat4x3 a, TSimdFloat4x3 b, float epsilon); static inline bool tfIsCloseSimd2x4F(TSimdFloat4x2 a, TSimdFloat4x2 b, float epsilon); static inline bool tfIsCloseSimd1x4F(TSimdFloat4x1 a, TSimdFloat4x1 b, float epsilon); -static inline bool tfIsCloseSimd4F(TSimdFloat4 a, TSimdFloat4 b, float epsilon); - static inline void tfSetElemSimd3x4F(TSimdFloat4x3* input, int col, int row, float value); static inline void tfSetElemSimd2x4F(TSimdFloat4x2* input, int col, int row, float value); static inline void tfSetElemSimd1x4F(TSimdFloat4x1* input, int col, int row, float value); -static inline TSimdFloat4 tfGetRowSimd4x4F(TSimdFloat4x4 input, int row); static inline TSimdFloat3 tfGetRowSimd3x4F(TSimdFloat4x3 input, int row); static inline TSimdFloat2 tfGetRowSimd2x4F(TSimdFloat4x2 input, int row); static inline float tfGetRowSimd1x4F(TSimdFloat4x1 input, int row); -static inline float tfVectorDot4F(TSimdFloat4 a0, TSimdFloat4 a1); -static inline float tfVectorLengthSq4F(TSimdFloat4 a0, TSimdFloat4 a1); -static inline float tfVectorLength4F(TSimdFloat4 a0, TSimdFloat4 a1); - -static inline TSimdFloat4 tfVectorEleDiv4F(TSimdFloat4 a0, TSimdFloat4 a1); -static inline TSimdFloat4 tfVectorEleAdd4F(TSimdFloat4 a0, TSimdFloat4 a1); -static inline TSimdFloat4 tfVectorEleSub4F(TSimdFloat4 a0, TSimdFloat4 a1); -static inline TSimdFloat4 tfVectorEleMul4F(TSimdFloat4 a0, TSimdFloat4 a1); - -static inline bool tfIsCloseSimd4x4F(TSimdFloat4x4 a, TSimdFloat4x4 b, float epsilon); -static inline void tfSetElemSimd4x4F(TSimdFloat4x4* input, int col, int row, float value); // conviences if cpp is avaliable diff --git a/Forge/Math/TF_SimdFloat3.h b/Forge/Math/TF_SimdFloat3.h index 258e73d2ac..0937349dce 100644 --- a/Forge/Math/TF_SimdFloat3.h +++ b/Forge/Math/TF_SimdFloat3.h @@ -4,7 +4,6 @@ #include "Internal/SimdTypes.h" -static inline TSimdFloat3 tfLoadZeroSimd3x3F(); static inline TSimdFloat3 tfLoadSimd3F(float x, float y, float z); static inline float tfVectorDot3F(TSimdFloat3 a0, TSimdFloat3 a1); diff --git a/Forge/Math/TF_SimdFloat3x3.h b/Forge/Math/TF_SimdFloat3x3.h new file mode 100644 index 0000000000..cd943881ca --- /dev/null +++ b/Forge/Math/TF_SimdFloat3x3.h @@ -0,0 +1,21 @@ +#pragma once +#ifndef TF_MATH_SIMD_FLOAT3x3_H +#define TF_MATH_SIMD_FLOAT3x3_H + +#include "Internal/SimdTypes.h" + + +static inline TSimdFloat3x3 tfLoadZeroSimd3x3F(); +static inline TSimdFloat3x3 tfLoadIdentitySimd3x3F(); +static inline TSimdFloat3x3 tfLoadSimd3x3F( + float m00, 
float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22); + + +static inline TSimdFloat4x3 tfMatMul3x3F_3x3F(TSimdFloat3x3 a0, TSimdFloat3x3 a1); +static inline TSimdFloat4x2 tfMatMul3x3F_3x2F(TSimdFloat3x3 a0, TSimdFloat3x2 a1); + +static inline void tfSetElemSimd3x3F(TSimdFloat3x3* input, int col, int row, float value); + +#endif diff --git a/Forge/Math/TF_SimdFloat4.h b/Forge/Math/TF_SimdFloat4.h index 8b4bcaeac8..46362b2652 100644 --- a/Forge/Math/TF_SimdFloat4.h +++ b/Forge/Math/TF_SimdFloat4.h @@ -12,6 +12,17 @@ static inline float tfGetYSimd4F(TSimdFloat4 a); static inline float tfGetZSimd4F(TSimdFloat4 a); static inline float tfGetWSimd4F(TSimdFloat4 a); +static inline bool tfIsCloseSimd4F(TSimdFloat4 a, TSimdFloat4 b, float epsilon); + +static inline float tfVectorDot4F(TSimdFloat4 a0, TSimdFloat4 a1); +static inline float tfVectorLengthSq4F(TSimdFloat4 a0, TSimdFloat4 a1); +static inline float tfVectorLength4F(TSimdFloat4 a0, TSimdFloat4 a1); + +static inline TSimdFloat4 tfVectorEleDiv4F(TSimdFloat4 a0, TSimdFloat4 a1); +static inline TSimdFloat4 tfVectorEleAdd4F(TSimdFloat4 a0, TSimdFloat4 a1); +static inline TSimdFloat4 tfVectorEleSub4F(TSimdFloat4 a0, TSimdFloat4 a1); +static inline TSimdFloat4 tfVectorEleMul4F(TSimdFloat4 a0, TSimdFloat4 a1); + #include "Internal/TF_SimdFloat4.inl" #endif diff --git a/Forge/Math/TF_SimdFloat4x2.h b/Forge/Math/TF_SimdFloat4x2.h new file mode 100644 index 0000000000..e69de29bb2 diff --git a/Forge/Math/TF_SimdFloat4x3.h b/Forge/Math/TF_SimdFloat4x3.h new file mode 100644 index 0000000000..e69de29bb2 diff --git a/Forge/Math/TF_SimdFloat4x4.h b/Forge/Math/TF_SimdFloat4x4.h deleted file mode 100644 index 25a5bcc8fd..0000000000 --- a/Forge/Math/TF_SimdFloat4x4.h +++ /dev/null @@ -1,32 +0,0 @@ -#pragma once -#ifndef TF_MATH_SIMD_FLOAT4x4_H -#define TF_MATH_SIMD_FLOAT4x4_H - -#include "Internal/SimdTypes.h" - -static inline TSimdFloat4x4 tfLoadZeroSimd4x4F(); -static inline TSimdFloat4x4 tfLoadIdentitySimd4x4F(); -static inline TSimdFloat4x4 tfLoadSimd4x4F( - float m00, float m01, float m02, float m03, - float m10, float m11, float m12, float m13, - float m20, float m21, float m22, float m23, - float m30, float m31, float m32, float m33); - -static inline TSimdFloat4 tfVectorMul4x4F(const TSimdFloat4x4 a0, const TSimdFloat4 a1); -static inline TSimdFloat4x4 tfMatMul4x4F_4x4F(TSimdFloat4x4 a0, TSimdFloat4x4 a1); -static inline TSimdFloat4x3 tfMatMul4x4F_3x4F(TSimdFloat4x4 a0, TSimdFloat4x3 a1); -static inline TSimdFloat4x2 tfMatMul4x4F_2x4F(TSimdFloat4x4 a0, TSimdFloat4x2 a1); -static inline TSimdFloat4x1 tfMatMul4x4F_1x4F(TSimdFloat4x4 a0, TSimdFloat4x1 a1); -static inline TSimdFloat4x4 tfMatTranpose4x4F(TSimdFloat4x4 a0); - -static inline TSimdFloat4x1 tfMatInverseFull4x4F(TSimdFloat4x4 a0); - -#include "Internal/TF_SimdFloat4x4.inl" -#if defined(TF_FEATURE_CPU_SSE) -#include "Internal/TF_SimdFloat4x4_sse.inl" -#elif defined(TF_FEATURE_CPU_NEON) -#include "Internal/TF_SimdFloat4x4_neon.inl" -#else -#include "Internal/TF_SimdFloat4x4_scalar.inl" -#endif -#endif diff --git a/Forge/Math/TF_SimdMath4.h b/Forge/Math/TF_SimdMath4.h new file mode 100644 index 0000000000..197fc57ea2 --- /dev/null +++ b/Forge/Math/TF_SimdMath4.h @@ -0,0 +1,566 @@ +/* + * Copyright (c) Contributors to the Open 3D Engine Project. + * For complete copyright and license terms please see the LICENSE at the root of this distribution. 
+ * + * SPDX-License-Identifier: Apache-2.0 OR MIT + * + */ +#include "Forge/Math/Internal/Types.h" + +inline TSimd32fx4 tfSimd4fSplat(float value); +inline TSimdi32x4 tfSimd4iSplat(int32_t value); + +inline TSimd32fx4 tfSimd4fZero(); +inline TSimdi32x4 tfSimd4iZero(); + +inline Simd_FloatType tfSimd4fToSimd1f(TSimd32fx4 value); +inline TSimdf32x2 tfSimd4fToSimd2f(TSimd32fx4 value); +inline TSimdf32x3 tfSimd4fToSimd3f(TSimd32fx4 value); +inline TSimdi32x4 tfSimd4fToSimd4i(TSimd32fx4 value); +inline TSimd32fx4 tfSimd4iToSimd4f(TSimdi32x4 value); + +inline TSimd32fx4 tfSimd4fSplatIndex0(TSimd32fx4 value); +inline TSimd32fx4 tfSimd4fSplatIndex1(TSimd32fx4 value); +inline TSimd32fx4 tfSimd4fSplatIndex2(TSimd32fx4 value); +inline TSimd32fx4 tfSimd4fSplatIndex3(TSimd32fx4 value); + +inline TSimd32fx4 tfSimd4fSelect(TSimd32fx4 arg0, TSimd32fx4 arg1, TSimd32fx4 mask); +inline TSimdi32x4 tfSimd4iSelect(TSimdi32x4 arg0, TSimdi32x4 arg1, TSimdi32x4 mask); + +inline float tfS32x4FSelectIndex0(TSimd32fx4 value); +inline float tfS32x4FSelectIndex1(TSimd32fx4 value); +inline float tfS32x4FSelectIndex2(TSimd32fx4 value); +inline float tfS32x4FSelectIndex3(TSimd32fx4 value); + +inline TSimd32fx4 tfSimdFloat4Load(float x, float y, float z, float w); +inline TSimdi32x4 tfSimdInt4Load(int32_t x, int32_t y, int32_t z, int32_t w); + +inline TSimd32fx4 tfSimd4fAdd(TSimd32fx4 arg1, TSimd32fx4 arg2); +inline TSimd32fx4 tfSimd4fSub(TSimd32fx4 arg1, TSimd32fx4 arg2); +inline TSimd32fx4 tfSimd4fMul(TSimd32fx4 arg1, TSimd32fx4 arg2); +inline TSimd32fx4 tfSimd4fMadd(TSimd32fx4 mul1, TSimd32fx4 mul2, TSimd32fx4 add); +inline TSimd32fx4 tfSimd4fDiv(TSimd32fx4 arg1, TSimd32fx4 arg2); +inline TSimd32fx4 tfSimd4fAbs(TSimd32fx4 value); + +inline TSimd32fx4 tfSimd4fNot(TSimd32fx4 value); +inline TSimd32fx4 tfSimd4fAnd(TSimd32fx4 arg1, TSimd32fx4 arg2); +inline TSimd32fx4 tfSimd4fAndNot(TSimd32fx4 arg1, TSimd32fx4 arg2); +inline TSimd32fx4 tfSimd4fOr(TSimd32fx4 arg1, TSimd32fx4 arg2); +inline TSimd32fx4 tfSimd4fXor(TSimd32fx4 arg1, TSimd32fx4 arg2); + +inline TSimdi32x4 tfSimd4iNot(TSimdi32x4 value); +inline TSimdi32x4 tfSimd4iAnd(TSimdi32x4 arg1, TSimdi32x4 arg2); +inline TSimdi32x4 tfSimd4iAndNot(TSimdi32x4 arg1, TSimdi32x4 arg2); +inline TSimdi32x4 tfSimd4iOr(TSimdi32x4 arg1, TSimdi32x4 arg2); +inline TSimdi32x4 tfSimd4iXor(TSimdi32x4 arg1, TSimdi32x4 arg2); + +inline TSimd32fx4 tfSimd4fFloor(TSimd32fx4 value); +inline TSimd32fx4 tfSimd4fCeil(TSimd32fx4 value); +inline TSimd32fx4 tfSimd4fRound(TSimd32fx4 value); // Ties to even (banker's rounding) +inline TSimd32fx4 tfSimd4fTruncate(TSimd32fx4 value); +inline TSimd32fx4 tfSimd4fMin(TSimd32fx4 arg1, TSimd32fx4 arg2); +inline TSimd32fx4 tfSimd4fMax(TSimd32fx4 arg1, TSimd32fx4 arg2); +inline TSimd32fx4 tfSimd4fClamp(TSimd32fx4 value, TSimd32fx4 min, TSimd32fx4 max); + +inline TSimdi32x4 tfSimd4iCmpEq(TSimdi32x4 arg1, TSimdi32x4 arg2); +inline TSimdi32x4 tfSimd4iCmpNeq(TSimdi32x4 arg1, TSimdi32x4 arg2); +inline TSimdi32x4 tfSimd4iCmpGt(TSimdi32x4 arg1, TSimdi32x4 arg2); +inline TSimdi32x4 tfSimd4iCmpGtEq(TSimdi32x4 arg1, TSimdi32x4 arg2); +inline TSimdi32x4 tfSimd4iCmpLt(TSimdi32x4 arg1, TSimdi32x4 arg2); +inline TSimdi32x4 tfSimd4iCmpLtEq(TSimdi32x4 arg1, TSimdi32x4 arg2); + +inline TSimd32fx4 tfSimd4fCmpEq(TSimd32fx4 arg1, TSimd32fx4 arg2); +inline TSimd32fx4 tfSimd4fCmpNeq(TSimd32fx4 arg1, TSimd32fx4 arg2); +inline TSimd32fx4 tfSimd4fCmpGt(TSimd32fx4 arg1, TSimd32fx4 arg2); +inline TSimd32fx4 tfSimd4fCmpGtEq(TSimd32fx4 arg1, TSimd32fx4 arg2); +inline TSimd32fx4 
tfSimd4fCmpLt(TSimd32fx4 arg1, TSimd32fx4 arg2); +inline TSimd32fx4 tfSimd4fCmpLtEq(TSimd32fx4 arg1, TSimd32fx4 arg2); + +inline bool tfSimd4iCmpAllEq(TSimdi32x4 arg1, TSimdi32x4 arg2); +inline bool tfSimd4fCmpAllEq(TSimd32fx4 arg1, TSimd32fx4 arg2); + +// ---------------------------------------------------------------- +// --------------------- Implementaion ---------------------------- +// ---------------------------------------------------------------- +inline TSimdi32x4 tfSimd4iSelect(TSimdi32x4 arg0, TSimdi32x4 arg1, TSimdi32x4 mask) { +#if defined(TF_FEATURE_CPU_SSE) + return _mm_blendv_epi8(arg0, arg1, mask); +#else + return {(mask.v[0] == 0) ? arg0.v[0] : arg1.v[0] + , (mask.v[1] == 0) ? arg0.v[1] : arg1.v[1] + , (mask.v[2] == 0) ? arg0.v[2] : arg1.v[2] + , (mask.v[3] == 0) ? arg0.v[3] : arg1.v[3] }; +#endif +} +inline TSimd32fx4 tfSimd4fSelect(TSimd32fx4 arg0, TSimd32fx4 arg1, TSimd32fx4 mask) +{ +#if defined(TF_FEATURE_CPU_SSE) + return _mm_blendv_ps(arg0, arg1, mask); +#else + TSimdi32x4 intMask = tfSimd4fToSimd4i(mask); + return { (intMask.v[0] == 0) ? arg0.v[0] : arg1.v[0] + , (intMask.v[1] == 0) ? arg0.v[1] : arg1.v[1] + , (intMask.v[2] == 0) ? arg0.v[2] : arg1.v[2] + , (intMask.v[3] == 0) ? arg0.v[3] : arg1.v[3] }; +#endif +} + +inline TSimd32fx4 tfSimd4fZero() { return tfSimd4iToSimd4f(tfSimd4iZero()); } +inline TSimdi32x4 tfSimd4iZero() +{ +#if defined(TF_FEATURE_CPU_SSE) + return _mm_setzero_si128(); +#else + return { 0, 0, 0, 0 }; +#endif +} + +inline TSimdi32x4 tfSimd4iNot(TSimdi32x4 value) +{ +#if defined(TF_FEATURE_CPU_SSE) + const TSimdi32x4 invert = tfSimd4iSplat(TF_SIMDI_MAX); + return _mm_andnot_si128(value, invert); +#else + return { ~value.v[0], ~value.v[1], ~value.v[2], ~value.v[3] }; +#endif +} +inline TSimdi32x4 tfSimd4iAnd(TSimdi32x4 arg1, TSimdi32x4 arg2){ +#if defined(TF_FEATURE_CPU_SSE) + return _mm_and_si128(arg1, arg2); +#else + return { arg1.v[0] & arg2.v[0], arg1.v[1] & arg2.v[1], arg1.v[2] & arg2.v[2], arg1.v[3] & arg2.v[3] }; +#endif + +} +inline TSimdi32x4 tfSimd4iAndNot(TSimdi32x4 arg1, TSimdi32x4 arg2) { +#if defined(TF_FEATURE_CPU_SSE) + return _mm_andnot_si128(arg1, arg2); +#else + return { ~arg1.v[0] & arg2.v[0], ~arg1.v[1] & arg2.v[1], ~arg1.v[2] & arg2.v[2], ~arg1.v[3] & arg2.v[3] }; +#endif +} +inline TSimdi32x4 tfSimd4iOr(TSimdi32x4 arg1, TSimdi32x4 arg2){ +#if defined(TF_FEATURE_CPU_SSE) + return _mm_or_si128(arg1, arg2); +#else + return { arg1.v[0] | arg2.v[0], arg1.v[1] | arg2.v[1], arg1.v[2] | arg2.v[2], arg1.v[3] | arg2.v[3] }; +#endif +} +inline TSimdi32x4 tfSimd4iXor(TSimdi32x4 arg1, TSimdi32x4 arg2){ +#if defined(TF_FEATURE_CPU_SSE) + return _mm_xor_si128(arg1, arg2); +#else + return { arg1.v[0] ^ arg2.v[0], arg1.v[1] ^ arg2.v[1], arg1.v[2] ^ arg2.v[2], arg1.v[3] ^ arg2.v[3] }; +#endif +} + +inline TSimd32fx4 tfSimd4fNot(TSimd32fx4 value) { +#if defined(TF_FEATURE_CPU_SSE) + const TSimd32fx4 invert = tfSimd4fSplat((float)(0xFFFFFFFF)); + return _mm_andnot_ps(value, invert); +#else + TSimdi32x4 result = { { ~((int32_t)value.v[0]), ~((int32_t)value.v[1]), + ~((int32_t)value.v[2]), ~((int32_t)value.v[3]) } }; + return tfSimd4iToSimd4f(result); +#endif +} +inline TSimd32fx4 tfSimd4fAnd(TSimd32fx4 arg1, TSimd32fx4 arg2) { +#if defined(TF_FEATURE_CPU_SSE) + return _mm_and_ps(arg1, arg2); +#else + TSimdi32x4 result = { ((int32_t)arg1.v[0]) & ((int32_t)arg2.v[0]), ((int32_t)arg1.v[1]) & ((int32_t)arg2.v[1]), + ((int32_t)arg1.v[2]) & ((int32_t)arg2.v[2]), ((int32_t)arg1.v[3]) & ((int32_t)arg2.v[3]) }; + return tfSimd4iToSimd4f(result); 
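+    // Note: unlike the _mm_and_ps path above, this scalar fallback operates on
+    // value-converted integers ((int32_t) casts to and from float) rather than on
+    // the raw IEEE-754 bit patterns, so the SSE and scalar paths can disagree when
+    // these float overloads are used for mask manipulation.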
+#endif +} +inline TSimd32fx4 tfSimd4fAndNot(TSimd32fx4 arg1, TSimd32fx4 arg2) { +#if defined(TF_FEATURE_CPU_SSE) + return _mm_andnot_ps(arg1, arg2); +#else + TSimdi32x4 result = { { ~((int32_t)arg1.v[0]) & ((int32_t)arg2.v[0]), + ~((int32_t)arg1.v[1]) & ((int32_t)arg2.v[1]), + ~((int32_t)arg1.v[2]) & ((int32_t)arg2.v[2]), + ~((int32_t)arg1.v[3]) & ((int32_t)arg2.v[3]) } }; + return tfSimd4iToSimd4f(result); +#endif +} +inline TSimd32fx4 tfSimd4fOr(TSimd32fx4 arg1, TSimd32fx4 arg2) { +#if defined(TF_FEATURE_CPU_SSE) + return _mm_or_ps(arg1, arg2); +#else + TSimdi32x4 result = { { ((int32_t)arg1.v[0]) | ((int32_t)arg2.v[0]), + ((int32_t)arg1.v[1]) | ((int32_t)arg2.v[1]), + ((int32_t)arg1.v[2]) | ((int32_t)arg2.v[2]), + ((int32_t)arg1.v[3]) | ((int32_t)arg2.v[3]) } }; + return tfSimd4iToSimd4f(result); +#endif +} +inline TSimd32fx4 tfSimd4fXor(TSimd32fx4 arg1, TSimd32fx4 arg2) { +#if defined(TF_FEATURE_CPU_SSE) + return _mm_xor_ps(arg1, arg2); +#else + TSimdi32x4 result = { { ((int32_t)arg1.v[0]) ^ ((int32_t)arg2.v[0]), + ((int32_t)arg1.v[1]) ^ ((int32_t)arg2.v[1]), + ((int32_t)arg1.v[2]) ^ ((int32_t)arg2.v[2]), + ((int32_t)arg1.v[3]) ^ ((int32_t)arg2.v[3]) } }; + return tfSimd4iToSimd4f(result); +#endif +} + +inline TSimd32fx4 tfSimd4fFloor(TSimd32fx4 value) { +#if defined(TF_FEATURE_CPU_SSE) + return _mm_floor_ps(value); +#else + return { { floorf(value.v[0]), floorf(value.v[1]), floorf(value.v[2]), floorf(value.v[3]) } }; +#endif +} +inline TSimd32fx4 tfSimd4fCeil(TSimd32fx4 value) { +#if defined(TF_FEATURE_CPU_SSE) + return _mm_ceil_ps(value); +#else + return { { ceilf(value.v[0]), ceilf(value.v[1]), ceilf(value.v[2]), ceilf(value.v[3]) } }; +#endif +} +inline TSimd32fx4 tfSimd4fRound(TSimd32fx4 value) { +#if defined(TF_FEATURE_CPU_SSE) + return _mm_round_ps(value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +#else + // While 'roundf' may seem the obvious choice here, it rounds halfway cases + // away from zero regardless of the current rounding mode, but 'rintf' uses + // the current rounding mode which is consistent with other implementations. 
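+    // For example, under the default round-to-nearest-even mode:
+    //   rintf(2.5f) == 2.0f and rintf(3.5f) == 4.0f, whereas roundf(2.5f) == 3.0f.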
+ return { { rintf(value.v[0]), rintf(value.v[1]), rintf(value.v[2]), rintf(value.v[3]) } }; +#endif +} +inline TSimd32fx4 tfSimd4fTruncate(TSimd32fx4 value) { return tfSimd4iToSimd4f(tfSimd4fToSimd4i(value)); } +inline TSimd32fx4 tfSimd4fMin(TSimd32fx4 arg1, TSimd32fx4 arg2) +{ +#if defined(TF_FEATURE_CPU_SSE) + return _mm_min_ps(arg1, arg2); +#else + return { { fminf(arg1.v[0], arg2.v[0]), fminf(arg1.v[1], arg2.v[1]), fminf(arg1.v[2], arg2.v[2]), fminf(arg1.v[3], arg2.v[3]) } }; +#endif +} +inline TSimd32fx4 tfSimd4fMax(TSimd32fx4 arg1, TSimd32fx4 arg2) { +#if defined(TF_FEATURE_CPU_SSE) + return _mm_max_ps(arg1, arg2); +#else + return { { fmaxf(arg1.v[0], arg2.v[0]), fmaxf(arg1.v[1], arg2.v[1]), fmaxf(arg1.v[2], arg2.v[2]), fmaxf(arg1.v[3], arg2.v[3]) } }; +#endif +} +inline TSimd32fx4 tfSimd4fClamp(TSimd32fx4 value, TSimd32fx4 min, TSimd32fx4 max) +{ + return tfSimd4fMax(min, tfSimd4fMin(value, max)); +} + +inline TSimdi32x4 tfSimd4fToSimd4i(TSimd32fx4 value) +{ +#if defined(TF_FEATURE_CPU_SSE) + return _mm_castps_si128(value); +#elif defined(TF_FEATURE_CPU_SCALAR) + return { (int32_t)value.v[0], (int32_t)value.v[1], (int32_t)value.v[2], (int32_t)value.v[3] }; +#endif +} + +inline TSimd32fx4 tfSimd4iToSimd4f(TSimdi32x4 value) +{ +#if defined(TF_FEATURE_CPU_SSE) + return _mm_castsi128_ps(value); +#elif defined(TF_FEATURE_CPU_SCALAR) + return { (float)value.v[0], (float)value.v[1], (float)value.v[2], (float)value.v[3] }; +#endif +} + +inline float tfS32x4FSelectIndex0(TSimd32fx4 value) { +#if defined(TF_FEATURE_CPU_SSE) + return _mm_cvtss_f32(value); +#elif defined(TF_FEATURE_CPU_SCALAR) + return value.v[0]; +#endif +} +inline float tfS32x4FSelectIndex1(TSimd32fx4 value) { return tfS32x4FSelectIndex0(tfSimd4fSplatIndex1(value)); } +inline float tfS32x4FSelectIndex2(TSimd32fx4 value) { return tfS32x4FSelectIndex0(tfSimd4fSplatIndex2(value)); } +inline float tfS32x4FSelectIndex3(TSimd32fx4 value) { return tfS32x4FSelectIndex0(tfSimd4fSplatIndex3(value)); } + +inline TSimd32fx4 tfSimd4fAdd(TSimd32fx4 arg1, TSimd32fx4 arg2) +{ +#if defined(TF_FEATURE_CPU_SSE) + return _mm_add_ps(arg1, arg2); +#else + return { + arg1.v[0] + arg2.v[0], + arg1.v[1] + arg2.v[1], + arg1.v[2] + arg2.v[2], + arg1.v[3] + arg2.v[3], + }; +#endif +} +inline TSimd32fx4 tfSimd4fSub(TSimd32fx4 arg1, TSimd32fx4 arg2) +{ +#if defined(TF_FEATURE_CPU_SSE) + return _mm_sub_ps(arg1, arg2); +#else + return { + arg1.v[0] - arg2.v[0], + arg1.v[1] - arg2.v[1], + arg1.v[2] - arg2.v[2], + arg1.v[3] - arg2.v[3], + }; +#endif +} +inline TSimd32fx4 tfSimd4fMul(TSimd32fx4 arg1, TSimd32fx4 arg2) { +#if defined(TF_FEATURE_CPU_SSE) + return _mm_mul_ps(arg1, arg2); +#else + return { + arg1.v[0] * arg2.v[0], + arg1.v[1] * arg2.v[1], + arg1.v[2] * arg2.v[2], + arg1.v[3] * arg2.v[3], + }; +#endif + +} +inline TSimd32fx4 tfSimd4fMadd(TSimd32fx4 mul1, TSimd32fx4 mul2, TSimd32fx4 add) +{ +#if 0 + return _mm_fmadd_ps(mul1, mul2, add); // Requires FMA CPUID +#else + return tfSimd4fAdd(tfSimd4fMul(mul1, mul2), add); +#endif +} +inline TSimd32fx4 tfSimd4fDiv(TSimd32fx4 arg1, TSimd32fx4 arg2) { +#if defined(TF_FEATURE_CPU_SSE) + return _mm_div_ps(arg1, arg2); +#else + return { + arg1.v[0] / arg2.v[0], + arg1.v[1] / arg2.v[1], + arg1.v[2] / arg2.v[2], + arg1.v[3] / arg2.v[3], + }; +#endif +} + +inline TSimd32fx4 tfSimd4fAbs(TSimd32fx4 value) { +#if defined(TF_FEATURE_CPU_SSE) + return value; + //return _mm_abs_epi32(value); +#else + return { + abs(value.v[0]), + abs(value.v[1]), + abs(value.v[2]), + abs(value.v[3]), + }; +#endif +} +inline TSimd32fx4 
tfSimdFloat4Load(float x, float y, float z, float w) { +#if defined(TF_FEATURE_CPU_SSE) + return _mm_set_ps(w, z, y, x); +#else + return { x, y, z, w }; +#endif +} + + +inline TSimdi32x4 tfSimdInt4Load(int32_t x, int32_t y, int32_t z, int32_t w) { +#if defined(TF_FEATURE_CPU_SSE) + return _mm_set_epi32(w, z, y, x); +#else + return { x, y, z, w }; +#endif +} + +inline Simd_FloatType tfSimd4fToSimd1f(TSimd32fx4 value) +{ +#if defined(TF_FEATURE_CPU_SSE) + return value; +#else + return value.v[0]; +#endif +} + +inline TSimdf32x2 tfSimd4fToSimd2f(TSimd32fx4 value) +{ +#if defined(TF_FEATURE_CPU_SSE) + return value; +#else + return {value.v[0], value.v[1]}; +#endif +} + +inline TSimdf32x3 tfSimd4fToSimd3f(TSimd32fx4 value) +{ +#if defined(TF_FEATURE_CPU_SSE) + return value; +#else + return {value.v[0], value.v[1], value.v[2]}; +#endif +} + +inline TSimd32fx4 tfSimd4fSplatIndex0(TSimd32fx4 value) +{ +#if defined(TF_FEATURE_CPU_SSE) + return _mm_shuffle_ps(value, value, _MM_SHUFFLE(0, 0, 0, 0)); +#else + return {value.v[0],value.v[0],value.v[0],value.v[0]}; +#endif +} + +inline TSimd32fx4 tfSimd4fSplatIndex1(TSimd32fx4 value) +{ +#if defined(TF_FEATURE_CPU_SSE) + return _mm_shuffle_ps(value, value, _MM_SHUFFLE(1, 1, 1, 1)); +#else + return {value.v[1],value.v[1],value.v[1],value.v[1]}; +#endif +} + +inline TSimd32fx4 tfSimd4fSplatIndex2(TSimd32fx4 value) +{ +#if defined(TF_FEATURE_CPU_SSE) + return _mm_shuffle_ps(value, value, _MM_SHUFFLE(2, 2, 2, 2)); +#else + return {value.v[2],value.v[2],value.v[2],value.v[2]}; +#endif +} + +inline TSimd32fx4 tfSimd4fSplatIndex3(TSimd32fx4 value) +{ +#if defined(TF_FEATURE_CPU_SSE) + return _mm_shuffle_ps(value, value, _MM_SHUFFLE(3, 3, 3, 3)); +#else + return {value.v[3],value.v[3],value.v[3],value.v[3]}; +#endif +} + +inline TSimdi32x4 tfSimd4iSplat(int32_t value) +{ +#if defined(TF_FEATURE_CPU_SSE) + return _mm_set1_epi32(value); +#else + return { value, value, value, value }; +#endif +} + +inline TSimd32fx4 tfSimd4fSplat(float value) +{ +#if defined(TF_FEATURE_CPU_SSE) + return _mm_set1_ps(value); +#else + return { value, value, value, value }; +#endif +} + +inline TSimdi32x4 tfSimdSplat4i(int32_t value) +{ +#if defined(TF_FEATURE_CPU_SSE) + return _mm_set1_epi32(value); +#else + return { value, value, value, value }; +#endif +} + + +inline TSimdi32x4 tfSimd4iCmpEq(TSimdi32x4 arg1, TSimdi32x4 arg2) { +#if defined(TF_FEATURE_CPU_SSE) + return _mm_cmpeq_epi32(arg1, arg2); +#else + return { { (arg1.v[0] == arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[1] == arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[2] == arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[3] == arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; +#endif +} +inline TSimdi32x4 tfSimd4iCmpNeq(TSimdi32x4 arg1, TSimdi32x4 arg2) { +#if defined(TF_FEATURE_CPU_SSE) + return _mm_xor_si128( + _mm_cmpeq_epi32(arg1, arg2), + _mm_set1_epi32((int32_t)0xFFFFFFFF)); +#else + return { { (arg1.v[0] != arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[1] != arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[2] != arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[3] != arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; +#endif +} +inline TSimdi32x4 tfSimd4iCmpGt(TSimdi32x4 arg1, TSimdi32x4 arg2) { +#if defined(TF_FEATURE_CPU_SSE) + return _mm_cmpgt_epi32(arg1, arg2); +#else + return { { (arg1.v[0] > arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[1] > arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[2] > arg2.v[2]) ? 
(int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[3] > arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; +#endif +} +inline TSimdi32x4 tfSimd4iCmpGtEq(TSimdi32x4 arg1, TSimdi32x4 arg2) { +#if defined(TF_FEATURE_CPU_SSE) + return _mm_or_si128( + _mm_cmpgt_epi32(arg1, arg2), + _mm_cmpeq_epi32(arg1, arg2)); +#else + return { { (arg1.v[0] >= arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[1] >= arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[2] >= arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[3] >= arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; +#endif +} +inline TSimdi32x4 tfSimd4iCmpLt(TSimdi32x4 arg1, TSimdi32x4 arg2) { +#if defined(TF_FEATURE_CPU_SSE) + return _mm_cmplt_epi32(arg1, arg2); +#else + return { { (arg1.v[0] < arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[1] < arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[2] < arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[3] < arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; +#endif +} +inline TSimdi32x4 tfSimd4iCmpLtEq(TSimdi32x4 arg1, TSimdi32x4 arg2) { +#if defined(TF_FEATURE_CPU_SSE) + return _mm_or_si128( + _mm_cmplt_epi32(arg1, arg2), + _mm_cmpeq_epi32(arg1, arg2)); +#else + return { { (arg1.v[0] < arg2.v[0]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[1] < arg2.v[1]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[2] < arg2.v[2]) ? (int32_t)0xFFFFFFFF : 0x00000000, + (arg1.v[3] < arg2.v[3]) ? (int32_t)0xFFFFFFFF : 0x00000000 } }; +#endif + +} + + + + +//inline bool tfSimd4fCmpAllEq(TSimd32fx4 arg1, TSimd32fx4 arg2) { +//#if defined(TF_FEATURE_CPU_SSE) +// const TSimd32fx4 compare = tfSimd4fCmpAllEq(arg1, arg2); +// return (_mm_movemask_epi8(compare) & 0xf) == 0xf; +//#else +// for(int i = 0; i < 4; i++) { +// if (arg1.v[i] != arg2.v[i]) +// { +// return false; +// } +// } +// return true; +//#endif +// +//} + +inline bool tfSimd4iCmpAllEq(TSimdi32x4 arg1, TSimdi32x4 arg2) +{ +#if defined(TF_FEATURE_CPU_SSE) + const TSimdi32x4 compare = tfSimd4iCmpEq(arg1, arg2); + return (_mm_movemask_epi8(compare) & 0xf) == 0xf; +#else + for(int i = 0; i < 4; i++) { + if (arg1.v[i] != arg2.v[i]) + { + return false; + } + } + return true; +#endif +} diff --git a/Forge/Math/TF_SimdQuat32x4.h b/Forge/Math/TF_SimdQuat32x4.h new file mode 100644 index 0000000000..e69de29bb2 diff --git a/Forge/tests/Math/BUCK b/Forge/tests/Math/BUCK index 2b5ea6503b..41ca447827 100644 --- a/Forge/tests/Math/BUCK +++ b/Forge/tests/Math/BUCK @@ -2,12 +2,11 @@ load(":defs.bzl", "cxx_math_simd_test") math_utils_header = ["TF_MathUtils.h"] -cxx_math_simd_test(name = "TF_SimdFloat2x32Test",srcs = ["TF_SimdFloat2x32Test.cpp"]) -cxx_math_simd_test(name = "TF_SimdFloat3x32Test",srcs = ["TF_SimdFloat3x32Test.cpp"]) -cxx_math_simd_test(name = "TF_SimdFloat4x32Test",srcs = ["TF_SimdFloat4x32Test.cpp"]) +# cxx_math_simd_test(name = "TF_SimdFloat2x32Test",srcs = ["TF_SimdFloat2x32Test.cpp"]) +# cxx_math_simd_test(name = "TF_SimdFloat3x32Test",srcs = ["TF_SimdFloat3x32Test.cpp"]) +# cxx_math_simd_test(name = "TF_SimdFloat4x32Test",srcs = ["TF_SimdFloat4x32Test.cpp"]) +# cxx_math_simd_test(name = "TF_SimdFloat3Test",srcs = ["TF_SimdFloat3Test.cpp"]) +# cxx_math_simd_test(name = "TF_SimdFloat4x4Test", srcs = ["TF_SimdFloat4x4Test.cpp"]) -cxx_math_simd_test(name = "TF_SimdFloat4Test",srcs = ["TF_SimdFloat4Test.cpp"]) -cxx_math_simd_test(name = "TF_SimdFloat3Test",srcs = ["TF_SimdFloat3Test.cpp"]) - -cxx_math_simd_test(name = "TF_SimdFloat4x4Test", srcs = ["TF_SimdFloat4x4Test.cpp"]) +cxx_math_simd_test(name = "TF_Simdf32x4Test", 
srcs = ["TF_Simd32x4Test.cpp"]) diff --git a/Forge/tests/Math/TF_MathUtils.h b/Forge/tests/Math/TF_MathUtils.h index e0456f82d2..b63f42f440 100644 --- a/Forge/tests/Math/TF_MathUtils.h +++ b/Forge/tests/Math/TF_MathUtils.h @@ -19,35 +19,47 @@ EXPECT_EQ(__a.getW(), __b.getW()); \ } while(false); +#define LOG_FORMAT_SIMD_32x4f(input) \ + "%.3f, %.3f, %.3f, %.3f", tfSimdSelect0_f32x4(input), tfSimdSelect1_f32x4(input), tfSimdSelect2_f32x4(input), tfSimdSelect3_f32x4(input) +#define LOG_FORMAT_SIMD_32x4i(input) \ + "%d, %d, %d, %d", tfSimdSelect0_i32x4(input), tfSimdSelect1_i32x4(input), tfSimdSelect2_i32x4(input), tfSimdSelect3_i32x4(input) -static inline void debugPrintSimd4F(TSimdFloat4 input) { - DLOGF(LogLevel::eDEBUG, "%.3f, %.3f, %.3f, %.3f", - tfSimd4fSelectIndex0(input.mRow), - tfSimd4fSelectIndex1(input.mRow), - tfSimd4fSelectIndex2(input.mRow), - tfSimd4fSelectIndex3(input.mRow)); +#define LOG_SIMD_32x4x4f(input, LOG, ...) \ + LOG(__VA_ARGS__, "%.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f", tfSimdSelectIndex0_f32x4(input.mCol0), \ + tfSimdSelectIndex1_f32x4(input.mCol1), tfSimdSelectIndex2_f32x4(input.mCol2), tfSimdSelectIndex3_f32x4(input.mCol3), \ + tfSimdSelectIndex0_f32x4(input.mCol0), tfSimdSelectIndex1_f32x4(input.mCol1), tfSimdSelectIndex2_f32x4(input.mCol2), \ + tfSimdSelectIndex3_f32x4(input.mCol3), tfSimdSelectIndex0_f32x4(input.mCol0), tfSimdSelectIndex1_f32x4(input.mCol1), \ + tfSimdSelectIndex2_f32x4(input.mCol2), tfSimdSelectIndex3_f32x4(input.mCol3), tfSimdSelectIndex0_f32x4(input.mCol0), \ + tfSimdSelectIndex1_f32x4(input.mCol1), tfSimdSelectIndex2_f32x4(input.mCol2), tfSimdSelectIndex3_f32x4(input.mCol3)) + +static inline void debugPrintSimd4F(Tsimd_f32x4_t input) { + // DLOGF(LogLevel::eDEBUG, "%.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f", + // tfSimdSelectIndex0_f32x4(input), + // tfSimdSelectIndex1_f32x4(input), + // tfSimdSelectIndex2_f32x4(input), + // tfSimdSelectIndex3_f32x4(input)); } -static inline void debugPrintSimd4x4F(TSimdFloat4x4 input) { - DLOGF(LogLevel::eDEBUG,"%.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f", - tfSimd4fSelectIndex0(input.mCol0), - tfSimd4fSelectIndex0(input.mCol1), - tfSimd4fSelectIndex0(input.mCol2), - tfSimd4fSelectIndex0(input.mCol3), +static inline void debugPrintSimd4x4F(struct Tsimd_f32x4x4_s input) { + // DLOGF(LogLevel::eDEBUG,"%.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f", + // tfSimdSelectIndex0_f32x4(input.mCol0), + // tfSimdSelectIndex1_f32x4(input.mCol1), + // tfSimdSelectIndex2_f32x4(input.mCol2), + // tfSimdSelectIndex3_f32x4(input.mCol3), - tfSimd4fSelectIndex1(input.mCol0), - tfSimd4fSelectIndex1(input.mCol1), - tfSimd4fSelectIndex1(input.mCol2), - tfSimd4fSelectIndex1(input.mCol3), - - tfSimd4fSelectIndex2(input.mCol0), - tfSimd4fSelectIndex2(input.mCol1), - tfSimd4fSelectIndex2(input.mCol2), - tfSimd4fSelectIndex2(input.mCol3), - - tfSimd4fSelectIndex3(input.mCol0), - tfSimd4fSelectIndex3(input.mCol1), - tfSimd4fSelectIndex3(input.mCol2), - tfSimd4fSelectIndex3(input.mCol3) - ); + // tfSimdSelectIndex0_f32x4(input.mCol0), + // tfSimdSelectIndex1_f32x4(input.mCol1), + // tfSimdSelectIndex2_f32x4(input.mCol2), + // tfSimdSelectIndex3_f32x4(input.mCol3), + // + // tfSimdSelectIndex0_f32x4(input.mCol0), + // tfSimdSelectIndex1_f32x4(input.mCol1), + // tfSimdSelectIndex2_f32x4(input.mCol2), + // tfSimdSelectIndex3_f32x4(input.mCol3), + // + // tfSimdSelectIndex0_f32x4(input.mCol0), + // 
tfSimdSelectIndex1_f32x4(input.mCol1), + // tfSimdSelectIndex2_f32x4(input.mCol2), + // tfSimdSelectIndex3_f32x4(input.mCol3) + // ); } diff --git a/Forge/tests/Math/TF_Simd2Test.cpp b/Forge/tests/Math/TF_Simd2Test.cpp new file mode 100644 index 0000000000..5f59d0bf63 --- /dev/null +++ b/Forge/tests/Math/TF_Simd2Test.cpp @@ -0,0 +1,162 @@ +/* + * Copyright (c) Contributors to the Open 3D Engine Project. + * For complete copyright and license terms please see the LICENSE at the root of this distribution. + * + * SPDX-License-Identifier: Apache-2.0 OR MIT + * + */ +#include "TF_TestMain.h" +#include "utest.h" + +#include "Forge/Math/TF_Simd32x2.h" +#include "TF_MathUtils.h" + + +UTEST(TF_Simd2, tfS32x2ICmpGt) +{ + struct { + Tsimd_i32x2_t a; + Tsimd_i32x2_t b; + Tsimd_i32x2_t test; + } tests[] = { + // ... existing test cases ... + // Edge cases: + {tfS32x2ILoadImmediate(INT32_MIN, INT32_MIN), tfS32x2ILoadImmediate(INT32_MAX, INT32_MAX), tfS32x2ILoadImmediate(TF_SIMD_FALSE, TF_SIMD_FALSE)}, + {tfS32x2ILoadImmediate(INT32_MAX, INT32_MAX), tfS32x2ILoadImmediate(INT32_MIN, INT32_MIN), tfS32x2ILoadImmediate(TF_SIMD_TRUE, TF_SIMD_TRUE)}, + // Mixed values: + {tfS32x2ILoadImmediate(0, -1), tfS32x2ILoadImmediate(1, 0), tfS32x2ILoadImmediate(TF_SIMD_FALSE, TF_SIMD_FALSE)}, + // All elements equal: + {tfS32x2ILoadImmediate(42, 42), tfS32x2ILoadImmediate(42, 42), tfS32x2ILoadImmediate(TF_SIMD_FALSE, TF_SIMD_FALSE)}, + // Different element types: + {tfS32x2ILoadImmediate(0, 1), tfS32x2ILoadImmediate(3, 2), tfS32x2ILoadImmediate(TF_SIMD_FALSE, TF_SIMD_FALSE)}, + }; + for(size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { + EXPECT_TRUE(tfS32x2ICmpAllEq(tfS32x2ICmpGt(tests[i].a, tests[i].b), tests[i].test)); + } +} + +UTEST(TF_Simd2, tfS32x2ICmpLt) +{ + struct { + Tsimd_i32x2_t a; + Tsimd_i32x2_t b; + Tsimd_i32x2_t test; + } tests[] = { + // Less than + {tfS32x2ILoadImmediate(12, 13), tfS32x2ILoadImmediate(16, 17), tfS32x2ILoadImmediate(TF_SIMD_TRUE, TF_SIMD_TRUE)}, + // Mixed less than and equal + {tfS32x2ILoadImmediate(125, -12), tfS32x2ILoadImmediate(125, 13), tfS32x2ILoadImmediate(TF_SIMD_FALSE, TF_SIMD_TRUE)}, + // All elements greater than or equal + {tfS32x2ILoadImmediate(1, 2), tfS32x2ILoadImmediate(-1, 0), tfS32x2ILoadImmediate(TF_SIMD_FALSE, TF_SIMD_FALSE)}, + // Edge cases: + {tfS32x2ILoadImmediate(INT32_MIN, INT32_MIN), tfS32x2ILoadImmediate(INT32_MAX, INT32_MAX), tfS32x2ILoadImmediate(TF_SIMD_TRUE, TF_SIMD_TRUE)}, + {tfS32x2ILoadImmediate(INT32_MAX, INT32_MAX), tfS32x2ILoadImmediate(INT32_MIN, INT32_MIN), tfS32x2ILoadImmediate(TF_SIMD_FALSE, TF_SIMD_FALSE)}, + }; + for(size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { + EXPECT_TRUE(tfS32x2ICmpAllEq(tfS32x2ICmpLt(tests[i].a, tests[i].b), tests[i].test)); + } +} + + + +UTEST(TF_Simd2, tfS32x2ICmpEq) +{ + struct { + Tsimd_i32x2_t a; + Tsimd_i32x2_t b; + Tsimd_i32x2_t test; + } tests[] = { + // Equal elements + {tfS32x2ILoadImmediate(12, 13), tfS32x2ILoadImmediate(12, 13), tfS32x2ILoadImmediate(TF_SIMD_TRUE, TF_SIMD_TRUE)}, + // Mixed equal and unequal elements + {tfS32x2ILoadImmediate(125, -12), tfS32x2ILoadImmediate(125, -12), tfS32x2ILoadImmediate(TF_SIMD_TRUE, TF_SIMD_TRUE)}, + // All elements unequal + {tfS32x2ILoadImmediate(1, 2), tfS32x2ILoadImmediate(5, 6), tfS32x2ILoadImmediate(TF_SIMD_FALSE, TF_SIMD_FALSE)}, + // Edge cases: + {tfS32x2ILoadImmediate(INT32_MIN, INT32_MIN), tfS32x2ILoadImmediate(INT32_MIN, INT32_MIN), tfS32x2ILoadImmediate(TF_SIMD_TRUE, TF_SIMD_TRUE)}, + {tfS32x2ILoadImmediate(INT32_MAX, INT32_MAX), 
tfS32x2ILoadImmediate(INT32_MAX, INT32_MAX), tfS32x2ILoadImmediate(TF_SIMD_TRUE, TF_SIMD_TRUE)}, + }; + for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) + { + EXPECT_TRUE(tfS32x2ICmpAllEq(tfS32x2ICmpEq(tests[i].a, tests[i].b), tests[i].test)); + } +} + +UTEST(TF_Simd2, tfS32x2FZero) +{ + Tsimd_f32x2_t value = tfS32x2FZero(); + EXPECT_NEAR(tfS32x2FSelectIndex0(value), 0.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x2FSelectIndex1(value), 0.0f, DEFAULT_EPSILON); +} + +UTEST(TF_Simd2, tfS32x2FSplat) +{ + Tsimd_f32x2_t value = tfS32x2FSplat(23.f); + EXPECT_NEAR(tfS32x2FSelectIndex0(value), 23.f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x2FSelectIndex1(value), 23.f, DEFAULT_EPSILON); + Tsimd_f32x2_t value1 = tfS32x2FSplat(5.1f); + EXPECT_NEAR(tfS32x2FSelectIndex0(value1), 5.1f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x2FSelectIndex1(value1), 5.1f, DEFAULT_EPSILON); + + Tsimd_f32x2_t value2 = tfSimdFloat2Load(5.1f, 1.0f); + EXPECT_NEAR(tfS32x2FSelectIndex0(value2), 5.1f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x2FSelectIndex1(value2), 1.0f, DEFAULT_EPSILON); +} + +UTEST(TF_Simd2, tfS32x3iNot) { + struct { + Tsimd_i32x2_t test; + Tsimd_i32x2_t expect; + } tests[] = { + {tfS32x2ILoadImmediate(0xFFFFFFFF, 0x0000FFFF), tfS32x2ILoadImmediate(0, 0xFFFF0000)}, + }; + for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) + { + EXPECT_TRUE(tfS32x2ICmpAllEq(tfS32x2INot(tests[i].test), tests[i].expect)); + } +} + +UTEST(TF_Simd2, tfS32x2ISelect) { + struct { + Tsimd_i32x2_t a; + Tsimd_i32x2_t b; + Tsimd_i32x2_t mask; + Tsimd_i32x2_t expect; + } tests[] = { + {tfS32x2ILoadImmediate(10, 11), tfS32x2ILoadImmediate(123, -149), tfS32x2ILoadImmediate(0,0), tfS32x2ILoadImmediate(10, 11)}, + {tfS32x2ILoadImmediate(10, 11), tfS32x2ILoadImmediate(123, -149), tfS32x2ILoadImmediate(TF_SIMD_TRUE,0), tfS32x2ILoadImmediate(123, 11)}, + {tfS32x2ILoadImmediate(10, 11), tfS32x2ILoadImmediate(123, -149), tfS32x2ILoadImmediate(TF_SIMD_TRUE,0), tfS32x2ILoadImmediate(123, 11)}, + {tfS32x2ILoadImmediate(10, 11), tfS32x2ILoadImmediate(123, -149), tfS32x2ILoadImmediate(TF_SIMD_TRUE,TF_SIMD_TRUE), tfS32x2ILoadImmediate(123, -149)}, + }; + for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) + { + EXPECT_TRUE(tfS32x2ICmpAllEq(tfS32x2ISelect(tests[i].a, tests[i].b, tests[i].mask), tests[i].expect)); + } +} + +//UTEST(TF_Simd2, tfS32x3FSelect) { +// struct { +// TSimd32fx4 a; +// TSimd32fx4 b; +// TSimd32fx4 mask; +// TSimd32fx4 expect; +// } tests[] = { +// {tfS32x3FLoadImmediate(10, 11, -13, 32), tfS32x3FLoadImmediate(123, -149, 0, 12), tfS32x3FLoadImmediate(0,0,0,0), tfS32x3FLoadImmediate(10, 11, -13, 32)}, +// {tfS32x3FLoadImmediate(10, 11, -13, 32), tfS32x3FLoadImmediate(123, -149, 0, 12), tfS32x3FLoadImmediate(TF_SIMD_TRUE,0,0,0), tfS32x3FLoadImmediate(123, 11, -13, 32)}, +// {tfS32x3FLoadImmediate(10, 11, -13, 32), tfS32x3FLoadImmediate(123, -149, 0, 12), tfS32x3FLoadImmediate(TF_SIMD_TRUE,0,TF_SIMD_TRUE,0), tfS32x3FLoadImmediate(123, 11, 0, 32)}, +// {tfS32x3FLoadImmediate(10, 11, -13, 32), tfS32x3FLoadImmediate(123, -149, 0, 12), tfS32x3FLoadImmediate(TF_SIMD_TRUE,TF_SIMD_TRUE,TF_SIMD_TRUE,TF_SIMD_TRUE), tfS32x3FLoadImmediate(123, -149, 0, 12)}, +// }; +// for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) +// { +// //EXPECT_TRUE(tfS32x3FCmp(tfS32x2ISelect(tests[i].a, tests[i].b, tests[i].mask), tests[i].expect)); +// } +//} + + +#include "Forge/Mem/TF_Memory.h" +#include "Forge/TF_FileSystem.h" +#include "Forge/TF_Log.h" +UTEST_STATE(); +TF_UTEST_MAIN("TF_TF_Simd2") + + diff --git a/Forge/tests/Math/TF_Simd32x4Test.cpp 
b/Forge/tests/Math/TF_Simd32x4Test.cpp new file mode 100644 index 0000000000..affadf2384 --- /dev/null +++ b/Forge/tests/Math/TF_Simd32x4Test.cpp @@ -0,0 +1,756 @@ +/* + * Copyright (c) Contributors to the Open 3D Engine Project. + * For complete copyright and license terms please see the LICENSE at the root of this distribution. + * + * SPDX-License-Identifier: Apache-2.0 OR MIT + * + */ +#include "TF_TestMain.h" +#include "utest.h" + +#include "Forge/Math/TF_Simd32x4.h" +#include "TF_MathUtils.h" + +UTEST(Tsimd_f32x4_t, Zero) +{ + Tsimd_f32x4_t value = tfSimdZero_f32x4(); + EXPECT_NEAR(tfSimdSelect_f32x4(value, 0), 0, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect_f32x4(value, 1), 0, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect_f32x4(value, 2), 0, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect_f32x4(value, 3), 0, DEFAULT_EPSILON); + +} + +UTEST(Tsimd_f32x4_t, Load_Select) +{ + Tsimd_f32x4_t value = tfSimdLoad_f32x4(123.0,12.f,45.f,12.5f); + + EXPECT_NEAR(tfSimdSelect_f32x4(value, 0), 123.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect_f32x4(value, 1), 12.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect_f32x4(value, 2), 45.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect_f32x4(value, 3), 12.5f, DEFAULT_EPSILON); + + EXPECT_NEAR(tfSimdSelect0_f32x4(value), 123.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect1_f32x4(value), 12.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect2_f32x4(value), 45.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect3_f32x4(value), 12.5f, DEFAULT_EPSILON); +} + +UTEST(Tsimd_i32x4_t, Splat) { + Tsimd_i32x4_t value = tfSimdLoad_i32x4(123, 12, 45, 12); + { + Tsimd_i32x4_t test = tfSimdSplat_i32x4(123); + EXPECT_NEAR(tfSimdSelect0_i32x4(test), 123, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect1_i32x4(test), 123, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect2_i32x4(test), 123, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect3_i32x4(test), 123, DEFAULT_EPSILON); + } + { + Tsimd_i32x4_t test = tfSimdSplat0_i32x4(value); + EXPECT_NEAR(tfSimdSelect0_i32x4(test), 123, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect1_i32x4(test), 123, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect2_i32x4(test), 123, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect3_i32x4(test), 123, DEFAULT_EPSILON); + } + + { + Tsimd_i32x4_t test = tfSimdSplat1_i32x4(value); + EXPECT_NEAR(tfSimdSelect0_i32x4(test), 12, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect1_i32x4(test), 12, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect2_i32x4(test), 12, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect3_i32x4(test), 12, DEFAULT_EPSILON); + } + + { + Tsimd_i32x4_t test = tfSimdSplat2_i32x4(value); + EXPECT_NEAR(tfSimdSelect0_i32x4(test), 45, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect1_i32x4(test), 45, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect2_i32x4(test), 45, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect3_i32x4(test), 45, DEFAULT_EPSILON); + } + + { + Tsimd_i32x4_t test = tfSimdSplat3_i32x4(value); + EXPECT_NEAR(tfSimdSelect0_i32x4(test), 12, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect1_i32x4(test), 12, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect2_i32x4(test), 12, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect3_i32x4(test), 12, DEFAULT_EPSILON); + } +} + +UTEST(Tsimd_f32x4_t, Splat) { + Tsimd_f32x4_t value = tfSimdLoad_f32x4(123.0,12.f,45.f,12.5f); + { + Tsimd_f32x4_t test = tfSimdSplat_f32x4(123.0f); + EXPECT_NEAR(tfSimdSelect0_f32x4(test), 123.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect1_f32x4(test), 123.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect2_f32x4(test), 123.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect3_f32x4(test), 
123.0f, DEFAULT_EPSILON); + + } + { + Tsimd_f32x4_t test = tfSimdSplat0_f32x4(value); + EXPECT_NEAR(tfSimdSelect0_f32x4(test), 123.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect1_f32x4(test), 123.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect2_f32x4(test), 123.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect3_f32x4(test), 123.0f, DEFAULT_EPSILON); + } + + { + Tsimd_f32x4_t test = tfSimdSplat1_f32x4(value); + EXPECT_NEAR(tfSimdSelect0_f32x4(test), 12.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect1_f32x4(test), 12.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect2_f32x4(test), 12.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect3_f32x4(test), 12.0f, DEFAULT_EPSILON); + } + + { + Tsimd_f32x4_t test = tfSimdSplat2_f32x4(value); + EXPECT_NEAR(tfSimdSelect0_f32x4(test), 45.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect1_f32x4(test), 45.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect2_f32x4(test), 45.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect3_f32x4(test), 45.0f, DEFAULT_EPSILON); + } + + { + Tsimd_f32x4_t test = tfSimdSplat3_f32x4(value); + EXPECT_NEAR(tfSimdSelect0_f32x4(test), 12.5f, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect1_f32x4(test), 12.5f, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect2_f32x4(test), 12.5f, DEFAULT_EPSILON); + EXPECT_NEAR(tfSimdSelect3_f32x4(test), 12.5f, DEFAULT_EPSILON); + } +} + + +UTEST(Tsimd_i32x4_t, Load_Select) +{ + Tsimd_i32x4_t value = tfSimdLoad_i32x4(123,12,45,165); + + EXPECT_EQ(tfSimdSelect_i32x4(value, 0), 123); + EXPECT_EQ(tfSimdSelect_i32x4(value, 1), 12); + EXPECT_EQ(tfSimdSelect_i32x4(value, 2), 45); + EXPECT_EQ(tfSimdSelect_i32x4(value, 3), 165); + + EXPECT_EQ(tfSimdSelect0_i32x4(value), 123); + EXPECT_EQ(tfSimdSelect1_i32x4(value), 12); + EXPECT_EQ(tfSimdSelect2_i32x4(value), 45); + EXPECT_EQ(tfSimdSelect3_i32x4(value), 165); +} + +UTEST(Tsimd_f32x4_t, tfSimdCmpGt_f32x4) +{ + struct { + Tsimd_f32x4_t a; + Tsimd_f32x4_t b; + Tsimd_i32x4_t test; + } tests[] = { + // Mixed values: + // Mixed values: + {tfSimdLoad_f32x4(0, -1, 1, 0), tfSimdLoad_f32x4(1, 0, -1, 0), tfSimdLoad_i32x4(0, 0, -1, 0)}, + // All elements greater: + {tfSimdLoad_f32x4(2, 3, 4, 5), tfSimdLoad_f32x4(1, 2, 3, 4), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + // All elements equal: + {tfSimdLoad_f32x4(1, 1, 1, 1), tfSimdLoad_f32x4(1, 1, 1, 1), tfSimdLoad_i32x4(0, 0, 0, 0)}, + // All elements less: + {tfSimdLoad_f32x4(1, 2, 3, 4), tfSimdLoad_f32x4(2, 3, 4, 5), tfSimdLoad_i32x4(0, 0, 0, 0)}, + // Mixed positive and negative values: + {tfSimdLoad_f32x4(-1, 2, -3, 4), tfSimdLoad_f32x4(-2, 1, -4, 3), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + // Zero comparison: + {tfSimdLoad_f32x4(0, 0, 0, 0), tfSimdLoad_f32x4(0, 0, 0, 0), tfSimdLoad_i32x4(0, 0, 0, 0)}, + }; + for(size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { + Tsimd_i32x4_t result = tfSimd_f32x4_To_i32x4(tfSimdCmpGt_f32x4(tests[i].a, tests[i].b)); + DLOGF(LogLevel::eDEBUG,LOG_FORMAT_SIMD_32x4i(result)); + EXPECT_TRUE(tfSimdCmpAllEq_i32x4(result, tests[i].test)); + } +} + +UTEST(Tsimd_f32x4_t, tfSimdCmpEq_f32x4) +{ + struct { + Tsimd_f32x4_t a; + Tsimd_f32x4_t b; + Tsimd_i32x4_t test; + } tests[] = { + // Mixed values: + {tfSimdLoad_f32x4(0, -1, 1, 0), tfSimdLoad_f32x4(1, 0, -1, 0), tfSimdLoad_i32x4(0, 0, 0, -1)}, + // All elements equal: + {tfSimdLoad_f32x4(1, 1, 1, 1), tfSimdLoad_f32x4(1, 1, 1, 1), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + // All elements different: + {tfSimdLoad_f32x4(1, 2, 3, 4), tfSimdLoad_f32x4(5, 6, 7, 8), tfSimdLoad_i32x4(0, 0, 0, 0)}, + // Mixed positive and negative values: + {tfSimdLoad_f32x4(-1, 2, -3, 4), 
tfSimdLoad_f32x4(-1, 2, -3, 4), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + // Zero comparison: + {tfSimdLoad_f32x4(0, 0, 0, 0), tfSimdLoad_f32x4(0, 0, 0, 0), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + }; + for(size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { + Tsimd_i32x4_t result = tfSimd_f32x4_To_i32x4(tfSimdCmpEq_f32x4(tests[i].a, tests[i].b)); + DLOGF(LogLevel::eDEBUG, LOG_FORMAT_SIMD_32x4i(result)); + EXPECT_TRUE(tfSimdCmpAllEq_i32x4(result, tests[i].test)); + } +} + +UTEST(Tsimd_f32x4_t, tfSimdCmpNeq_f32x4) +{ + struct { + Tsimd_f32x4_t a; + Tsimd_f32x4_t b; + Tsimd_i32x4_t test; + } tests[] = { + // Mixed values: + {tfSimdLoad_f32x4(0, -1, 1, 0), tfSimdLoad_f32x4(1, 0, -1, 0), tfSimdLoad_i32x4(-1, -1, -1, 0)}, + // All elements equal: + {tfSimdLoad_f32x4(1, 1, 1, 1), tfSimdLoad_f32x4(1, 1, 1, 1), tfSimdLoad_i32x4(0, 0, 0, 0)}, + // All elements different: + {tfSimdLoad_f32x4(1, 2, 3, 4), tfSimdLoad_f32x4(5, 6, 7, 8), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + // Mixed positive and negative values: + {tfSimdLoad_f32x4(-1, 2, -3, 4), tfSimdLoad_f32x4(-1, 2, -3, 4), tfSimdLoad_i32x4(0, 0, 0, 0)}, + // Zero comparison: + {tfSimdLoad_f32x4(0, 0, 0, 0), tfSimdLoad_f32x4(0, 0, 0, 0), tfSimdLoad_i32x4(0, 0, 0, 0)}, + }; + for(size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { + Tsimd_i32x4_t result = tfSimd_f32x4_To_i32x4(tfSimdCmpNeq_f32x4(tests[i].a, tests[i].b)); + DLOGF(LogLevel::eDEBUG, LOG_FORMAT_SIMD_32x4i(result)); + EXPECT_TRUE(tfSimdCmpAllEq_i32x4(result, tests[i].test)); + } +} + +UTEST(Tsimd_f32x4_t, tfSimdCmpGtEq_f32x4) +{ + struct { + Tsimd_f32x4_t a; + Tsimd_f32x4_t b; + Tsimd_i32x4_t test; + } tests[] = { + // Mixed values: + {tfSimdLoad_f32x4(12, 13, 14, 15), tfSimdLoad_f32x4(0, 0, 0, 0), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + // All elements greater or equal: + {tfSimdLoad_f32x4(2, 3, 4, 5), tfSimdLoad_f32x4(1, 2, 3, 4), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + // All elements equal: + {tfSimdLoad_f32x4(1, 1, 1, 1), tfSimdLoad_f32x4(1, 1, 1, 1), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + // All elements less: + {tfSimdLoad_f32x4(1, 2, 3, 4), tfSimdLoad_f32x4(2, 3, 4, 5), tfSimdLoad_i32x4(0, 0, 0, 0)}, + // Mixed positive and negative values: + {tfSimdLoad_f32x4(-1, 2, -3, 4), tfSimdLoad_f32x4(-2, 1, -4, 3), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + // Zero comparison: + {tfSimdLoad_f32x4(0, 0, 0, 0), tfSimdLoad_f32x4(0, 0, 0, 0), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + }; + for(size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { + Tsimd_i32x4_t result = tfSimd_f32x4_To_i32x4(tfSimdCmpGtEq_f32x4(tests[i].a, tests[i].b)); + DLOGF(LogLevel::eDEBUG, LOG_FORMAT_SIMD_32x4i(result)); + EXPECT_TRUE(tfSimdCmpAllEq_i32x4(result, tests[i].test)); + } +} + +UTEST(Tsimd_f32x4_t, tfSimdCmpLt_f32x4) +{ + struct { + Tsimd_f32x4_t a; + Tsimd_f32x4_t b; + Tsimd_i32x4_t test; + } tests[] = { + // Mixed values: + {tfSimdLoad_f32x4(0, -1, 1, 0), tfSimdLoad_f32x4(1, 0, -1, 0), tfSimdLoad_i32x4(-1, -1, 0, 0)}, + // All elements less: + {tfSimdLoad_f32x4(1, 2, 3, 4), tfSimdLoad_f32x4(2, 3, 4, 5), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + // All elements equal: + {tfSimdLoad_f32x4(1, 1, 1, 1), tfSimdLoad_f32x4(1, 1, 1, 1), tfSimdLoad_i32x4(0, 0, 0, 0)}, + // All elements greater: + {tfSimdLoad_f32x4(2, 3, 4, 5), tfSimdLoad_f32x4(1, 2, 3, 4), tfSimdLoad_i32x4(0, 0, 0, 0)}, + // Mixed positive and negative values: + {tfSimdLoad_f32x4(-2, 1, -4, 3), tfSimdLoad_f32x4(-1, 2, -3, 4), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + // Zero comparison: + {tfSimdLoad_f32x4(0, 0, 0, 0), tfSimdLoad_f32x4(0, 0, 0, 0), tfSimdLoad_i32x4(0, 0, 0, 0)}, + }; + for(size_t i = 
0; i < TF_ARRAY_COUNT(tests); i++) { + Tsimd_i32x4_t result = tfSimd_f32x4_To_i32x4(tfSimdCmpLt_f32x4(tests[i].a, tests[i].b)); + DLOGF(LogLevel::eDEBUG, LOG_FORMAT_SIMD_32x4i(result)); + EXPECT_TRUE(tfSimdCmpAllEq_i32x4(result, tests[i].test)); + } +} + +UTEST(Tsimd_f32x4_t, tfSimdCmpLtEq_f32x4) +{ + struct { + Tsimd_f32x4_t a; + Tsimd_f32x4_t b; + Tsimd_i32x4_t test; + } tests[] = { + // Mixed values: + {tfSimdLoad_f32x4(0, -1, 1, 0), tfSimdLoad_f32x4(1, 0, -1, 0), tfSimdLoad_i32x4(-1, -1, 0, -1)}, + // All elements less or equal: + {tfSimdLoad_f32x4(1, 2, 3, 4), tfSimdLoad_f32x4(2, 3, 4, 5), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + // All elements equal: + {tfSimdLoad_f32x4(1, 1, 1, 1), tfSimdLoad_f32x4(1, 1, 1, 1), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + // All elements greater: + {tfSimdLoad_f32x4(2, 3, 4, 5), tfSimdLoad_f32x4(1, 2, 3, 4), tfSimdLoad_i32x4(0, 0, 0, 0)}, + // Mixed positive and negative values: + {tfSimdLoad_f32x4(-2, 1, -4, 3), tfSimdLoad_f32x4(-1, 2, -3, 4), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + // Zero comparison: + {tfSimdLoad_f32x4(0, 0, 0, 0), tfSimdLoad_f32x4(0, 0, 0, 0), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + }; + for(size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { + Tsimd_i32x4_t result = tfSimd_f32x4_To_i32x4(tfSimdCmpLtEq_f32x4(tests[i].a, tests[i].b)); + DLOGF(LogLevel::eDEBUG, LOG_FORMAT_SIMD_32x4i(result)); + EXPECT_TRUE(tfSimdCmpAllEq_i32x4(result, tests[i].test)); + } +} + + +UTEST(Tsimd_i32x4_t, tfSimdCmpGt_i32x4) +{ + struct { + Tsimd_i32x4_t a; + Tsimd_i32x4_t b; + Tsimd_i32x4_t test; + } tests[] = { + // Mixed values: + {tfSimdLoad_i32x4(0, -1, 1, 0), tfSimdLoad_i32x4(1, 0, -1, 0), tfSimdLoad_i32x4(0, 0, -1, 0)}, + // All elements greater: + {tfSimdLoad_i32x4(2, 3, 4, 5), tfSimdLoad_i32x4(1, 2, 3, 4), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + // All elements equal: + {tfSimdLoad_i32x4(1, 1, 1, 1), tfSimdLoad_i32x4(1, 1, 1, 1), tfSimdLoad_i32x4(0, 0, 0, 0)}, + // All elements less: + {tfSimdLoad_i32x4(1, 2, 3, 4), tfSimdLoad_i32x4(2, 3, 4, 5), tfSimdLoad_i32x4(0, 0, 0, 0)}, + // Mixed positive and negative values: + {tfSimdLoad_i32x4(-1, 2, -3, 4), tfSimdLoad_i32x4(-2, 1, -4, 3), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + // Zero comparison: + {tfSimdLoad_i32x4(0, 0, 0, 0), tfSimdLoad_i32x4(0, 0, 0, 0), tfSimdLoad_i32x4(0, 0, 0, 0)}, + }; + for(size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { + Tsimd_i32x4_t result = tfSimdCmpGt_i32x4(tests[i].a, tests[i].b); + DLOGF(LogLevel::eDEBUG, LOG_FORMAT_SIMD_32x4i(result)); + EXPECT_TRUE(tfSimdCmpAllEq_i32x4(result, tests[i].test)); + } +} + +UTEST(Tsimd_i32x4_t, tfSimdCmpGtEq_i32x4) +{ + struct { + Tsimd_i32x4_t a; + Tsimd_i32x4_t b; + Tsimd_i32x4_t test; + } tests[] = { + // Mixed values: + {tfSimdLoad_i32x4(0, -1, 1, 0), tfSimdLoad_i32x4(1, 0, -1, 0), tfSimdLoad_i32x4(0, 0, -1, -1)}, + // All elements greater or equal: + {tfSimdLoad_i32x4(2, 3, 4, 5), tfSimdLoad_i32x4(1, 2, 3, 4), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + // All elements equal: + {tfSimdLoad_i32x4(1, 1, 1, 1), tfSimdLoad_i32x4(1, 1, 1, 1), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + // All elements less: + {tfSimdLoad_i32x4(1, 2, 3, 4), tfSimdLoad_i32x4(2, 3, 4, 5), tfSimdLoad_i32x4(0, 0, 0, 0)}, + // Mixed positive and negative values: + {tfSimdLoad_i32x4(-1, 2, -3, 4), tfSimdLoad_i32x4(-2, 1, -4, 3), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + // Zero comparison: + {tfSimdLoad_i32x4(0, 0, 0, 0), tfSimdLoad_i32x4(0, 0, 0, 0), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + }; + for(size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { + Tsimd_i32x4_t result = tfSimdCmpGtEq_i32x4(tests[i].a, 
tests[i].b); + DLOGF(LogLevel::eDEBUG, LOG_FORMAT_SIMD_32x4i(result)); + EXPECT_TRUE(tfSimdCmpAllEq_i32x4(result, tests[i].test)); + } +} + +UTEST(Tsimd_i32x4_t, tfSimdCmpLtEq_i32x4) +{ + struct { + Tsimd_i32x4_t a; + Tsimd_i32x4_t b; + Tsimd_i32x4_t test; + } tests[] = { + // Mixed values: + {tfSimdLoad_i32x4(0, -1, 1, 0), tfSimdLoad_i32x4(1, 0, -1, 0), tfSimdLoad_i32x4(-1, -1, 0, -1)}, + // All elements less or equal: + {tfSimdLoad_i32x4(1, 2, 3, 4), tfSimdLoad_i32x4(2, 3, 4, 5), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + // All elements equal: + {tfSimdLoad_i32x4(1, 1, 1, 1), tfSimdLoad_i32x4(1, 1, 1, 1), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + // All elements greater: + {tfSimdLoad_i32x4(2, 3, 4, 5), tfSimdLoad_i32x4(1, 2, 3, 4), tfSimdLoad_i32x4(0, 0, 0, 0)}, + // Mixed positive and negative values: + {tfSimdLoad_i32x4(-1, 2, -3, 4), tfSimdLoad_i32x4(-2, 1, -4, 3), tfSimdLoad_i32x4(0, 0, 0, 0)}, + // Zero comparison: + {tfSimdLoad_i32x4(0, 0, 0, 0), tfSimdLoad_i32x4(0, 0, 0, 0), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + }; + for(size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { + Tsimd_i32x4_t result = tfSimdCmpLtEq_i32x4(tests[i].a, tests[i].b); + DLOGF(LogLevel::eDEBUG, LOG_FORMAT_SIMD_32x4i(result)); + EXPECT_TRUE(tfSimdCmpAllEq_i32x4(result, tests[i].test)); + } +} + +UTEST(Tsimd_i32x4_t, tfSimdCmpLt_i32x4) +{ + struct { + Tsimd_i32x4_t a; + Tsimd_i32x4_t b; + Tsimd_i32x4_t test; + } tests[] = { + // Mixed values: + {tfSimdLoad_i32x4(0, -1, 1, 0), tfSimdLoad_i32x4(1, 0, -1, 0), tfSimdLoad_i32x4(-1, -1, 0, 0)}, + // All elements less: + {tfSimdLoad_i32x4(1, 2, 3, 4), tfSimdLoad_i32x4(2, 3, 4, 5), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + // All elements equal: + {tfSimdLoad_i32x4(1, 1, 1, 1), tfSimdLoad_i32x4(1, 1, 1, 1), tfSimdLoad_i32x4(0, 0, 0, 0)}, + // All elements greater: + {tfSimdLoad_i32x4(2, 3, 4, 5), tfSimdLoad_i32x4(1, 2, 3, 4), tfSimdLoad_i32x4(0, 0, 0, 0)}, + // Mixed positive and negative values: + {tfSimdLoad_i32x4(-1, 2, -3, 4), tfSimdLoad_i32x4(-2, 1, -4, 3), tfSimdLoad_i32x4(0, 0, 0, 0)}, + // Zero comparison: + {tfSimdLoad_i32x4(0, 0, 0, 0), tfSimdLoad_i32x4(0, 0, 0, 0), tfSimdLoad_i32x4(0, 0, 0, 0)}, + }; + for(size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { + Tsimd_i32x4_t result = tfSimdCmpLt_i32x4(tests[i].a, tests[i].b); + DLOGF(LogLevel::eDEBUG, LOG_FORMAT_SIMD_32x4i(result)); + EXPECT_TRUE(tfSimdCmpAllEq_i32x4(result, tests[i].test)); + } +} + + +UTEST(Tsimd_i32x4_t, tfSimdCmpEq_i32x4) +{ + struct { + Tsimd_i32x4_t a; + Tsimd_i32x4_t b; + Tsimd_i32x4_t test; + } tests[] = { + // Mixed values: + {tfSimdLoad_i32x4(0, -1, 1, 0), tfSimdLoad_i32x4(1, 0, -1, 0), tfSimdLoad_i32x4(0, 0, 0, -1)}, + // All elements equal: + {tfSimdLoad_i32x4(1, 1, 1, 1), tfSimdLoad_i32x4(1, 1, 1, 1), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + // All elements not equal: + {tfSimdLoad_i32x4(2, 3, 4, 5), tfSimdLoad_i32x4(1, 2, 3, 4), tfSimdLoad_i32x4(0, 0, 0, 0)}, + // Mixed positive and negative values: + {tfSimdLoad_i32x4(-1, 2, -3, 4), tfSimdLoad_i32x4(-1, 2, -3, 4), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + // Zero comparison: + {tfSimdLoad_i32x4(0, 0, 0, 0), tfSimdLoad_i32x4(0, 0, 0, 0), tfSimdLoad_i32x4(-1, -1, -1, -1)}, + }; + for(size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { + Tsimd_i32x4_t result = tfSimdCmpEq_i32x4(tests[i].a, tests[i].b); + DLOGF(LogLevel::eDEBUG, LOG_FORMAT_SIMD_32x4i(result)); + EXPECT_TRUE(tfSimdCmpAllEq_i32x4(result, tests[i].test)); + } +} + + +UTEST(Tsimd_f32x4_t, tfSimdCmpAllGt_f32x4) +{ + EXPECT_TRUE(tfSimdCmpAllGt_f32x4(tfSimdLoad_f32x4(16, 16, 17.1f, 13.0f), 
tfSimdLoad_f32x4(2, 3, 4, 5))); + EXPECT_FALSE(tfSimdCmpAllGt_f32x4(tfSimdLoad_f32x4(2.0f, 0, 0, 0), tfSimdLoad_f32x4(2.0f, 3, 4, 5))); + EXPECT_FALSE(tfSimdCmpAllGt_f32x4(tfSimdLoad_f32x4(5, 6, 7, 8), tfSimdLoad_f32x4(4, 7, 6, 9))); + EXPECT_FALSE(tfSimdCmpAllGt_f32x4(tfSimdLoad_f32x4(1, 1, 1, 1), tfSimdLoad_f32x4(1, 1, 1, 1))); + EXPECT_FALSE(tfSimdCmpAllGt_f32x4(tfSimdLoad_f32x4(1, 2, 3, 4), tfSimdLoad_f32x4(2, 3, 4, 5))); + EXPECT_TRUE(tfSimdCmpAllGt_f32x4(tfSimdLoad_f32x4(1, -2, 3, -4), tfSimdLoad_f32x4(0, -3, 2, -5))); + EXPECT_FALSE(tfSimdCmpAllGt_f32x4(tfSimdLoad_f32x4(0, 0, 0, 0), tfSimdLoad_f32x4(0, 0, 0, 0))); +} + +UTEST(Tsimd_f32x4_t, tfSimdCmpAllEq_f32x4) +{ + EXPECT_TRUE(tfSimdCmpAllEq_f32x4(tfSimdLoad_f32x4(1, 1, 1, 1), tfSimdLoad_f32x4(1, 1, 1, 1))); + EXPECT_FALSE(tfSimdCmpAllEq_f32x4(tfSimdLoad_f32x4(1, 2, 3, 4), tfSimdLoad_f32x4(1, 2, 3, 5))); + EXPECT_TRUE(tfSimdCmpAllEq_f32x4(tfSimdLoad_f32x4(0, 0, 0, 0), tfSimdLoad_f32x4(0, 0, 0, 0))); +} + +UTEST(Tsimd_f32x4_t, tfSimdCmpAllNeq_f32x4) +{ + EXPECT_FALSE(tfSimdCmpAllNeq_f32x4(tfSimdLoad_f32x4(1, 1, 1, 1), tfSimdLoad_f32x4(1, 1, 1, 1))); + EXPECT_FALSE(tfSimdCmpAllLt_f32x4(tfSimdLoad_f32x4(0,0,0,0), tfSimdLoad_f32x4(0, 12, 33, 44))); + EXPECT_TRUE(tfSimdCmpAllNeq_f32x4(tfSimdLoad_f32x4(32, 45, 13, 4), tfSimdLoad_f32x4(1, 2, 3, 5))); + EXPECT_FALSE(tfSimdCmpAllNeq_f32x4(tfSimdLoad_f32x4(0, 0, 0, 0), tfSimdLoad_f32x4(0, 0, 0, 0))); +} + +UTEST(Tsimd_f32x4_t, tfSimdCmpAllGtEq_f32x4) +{ + EXPECT_TRUE(tfSimdCmpAllGtEq_f32x4(tfSimdLoad_f32x4(2, 3, 4, 5), tfSimdLoad_f32x4(1, 2, 3, 4))); + EXPECT_FALSE(tfSimdCmpAllGtEq_f32x4(tfSimdLoad_f32x4(1, 2, 3, 4), tfSimdLoad_f32x4(1, 2, 3, 5))); + EXPECT_TRUE(tfSimdCmpAllGtEq_f32x4(tfSimdLoad_f32x4(1, 1, 1, 1), tfSimdLoad_f32x4(1, 1, 1, 1))); +} + +UTEST(Tsimd_f32x4_t, tfSimdCmpAllLt_f32x4) +{ + EXPECT_TRUE(tfSimdCmpAllLt_f32x4(tfSimdLoad_f32x4(1, 2, 3, 4), tfSimdLoad_f32x4(2, 3, 4, 5))); + EXPECT_FALSE(tfSimdCmpAllLt_f32x4(tfSimdLoad_f32x4(2, 3, 4, 5), tfSimdLoad_f32x4(1, 2, 3, 4))); + EXPECT_FALSE(tfSimdCmpAllLt_f32x4(tfSimdLoad_f32x4(1, 1, 1, 1), tfSimdLoad_f32x4(1, 1, 1, 1))); +} + +UTEST(Tsimd_f32x4_t, tfSimdCmpAllLtEq_f32x4) +{ + EXPECT_TRUE(tfSimdCmpAllLtEq_f32x4(tfSimdLoad_f32x4(1, 2, 3, 4), tfSimdLoad_f32x4(2, 3, 4, 5))); + EXPECT_FALSE(tfSimdCmpAllLtEq_f32x4(tfSimdLoad_f32x4(2, 3, 4, 5), tfSimdLoad_f32x4(1, 2, 3, 4))); + EXPECT_TRUE(tfSimdCmpAllLtEq_f32x4(tfSimdLoad_f32x4(1, 1, 1, 1), tfSimdLoad_f32x4(1, 1, 1, 1))); +} + +UTEST(Tsimd_i32x4_t, tfSimdCmpAllGt_i32x4) +{ + EXPECT_TRUE(tfSimdCmpAllGt_i32x4(tfSimdLoad_i32x4(16, 16, 17, 13), tfSimdLoad_i32x4(2, 3, 4, 5))); + EXPECT_FALSE(tfSimdCmpAllGt_i32x4(tfSimdLoad_i32x4(2, 0, 0, 0), tfSimdLoad_i32x4(2, 3, 4, 5))); + EXPECT_FALSE(tfSimdCmpAllGt_i32x4(tfSimdLoad_i32x4(5, 6, 7, 8), tfSimdLoad_i32x4(4, 7, 6, 9))); + EXPECT_FALSE(tfSimdCmpAllGt_i32x4(tfSimdLoad_i32x4(1, 1, 1, 1), tfSimdLoad_i32x4(1, 1, 1, 1))); + EXPECT_FALSE(tfSimdCmpAllGt_i32x4(tfSimdLoad_i32x4(1, 2, 3, 4), tfSimdLoad_i32x4(2, 3, 4, 5))); + EXPECT_TRUE(tfSimdCmpAllGt_i32x4(tfSimdLoad_i32x4(1, -2, 3, -4), tfSimdLoad_i32x4(0, -3, 2, -5))); + EXPECT_FALSE(tfSimdCmpAllGt_i32x4(tfSimdLoad_i32x4(0, 0, 0, 0), tfSimdLoad_i32x4(0, 0, 0, 0))); +} + +UTEST(Tsimd_i32x4_t, tfSimdCmpAllEq_i32x4) +{ + EXPECT_TRUE(tfSimdCmpAllEq_i32x4(tfSimdLoad_i32x4(1, 1, 1, 1), tfSimdLoad_i32x4(1, 1, 1, 1))); + EXPECT_FALSE(tfSimdCmpAllEq_i32x4(tfSimdLoad_i32x4(1, 2, 3, 4), tfSimdLoad_i32x4(1, 2, 3, 5))); + EXPECT_TRUE(tfSimdCmpAllEq_i32x4(tfSimdLoad_i32x4(0, 0, 0, 0), tfSimdLoad_i32x4(0, 0, 0, 
0))); +} + +UTEST(Tsimd_i32x4_t, tfSimdCmpAllNeq_i32x4) +{ + EXPECT_FALSE(tfSimdCmpAllNeq_i32x4(tfSimdLoad_i32x4(1, 1, 1, 1), tfSimdLoad_i32x4(1, 1, 1, 1))); + EXPECT_FALSE(tfSimdCmpAllNeq_i32x4(tfSimdLoad_i32x4(0, 0, 0, 0), tfSimdLoad_i32x4(0, 12, 33, 44))); + EXPECT_TRUE(tfSimdCmpAllNeq_i32x4(tfSimdLoad_i32x4(32, 45, 13, 4), tfSimdLoad_i32x4(1, 2, 3, 5))); + EXPECT_FALSE(tfSimdCmpAllNeq_i32x4(tfSimdLoad_i32x4(0, 0, 0, 0), tfSimdLoad_i32x4(0, 0, 0, 0))); +} + +UTEST(Tsimd_i32x4_t, tfSimdCmpAllGtEq_i32x4) +{ + EXPECT_TRUE(tfSimdCmpAllGtEq_i32x4(tfSimdLoad_i32x4(2, 3, 4, 5), tfSimdLoad_i32x4(1, 2, 3, 4))); + EXPECT_FALSE(tfSimdCmpAllGtEq_i32x4(tfSimdLoad_i32x4(1, 2, 3, 4), tfSimdLoad_i32x4(1, 2, 3, 5))); + EXPECT_TRUE(tfSimdCmpAllGtEq_i32x4(tfSimdLoad_i32x4(1, 1, 1, 1), tfSimdLoad_i32x4(1, 1, 1, 1))); +} + +UTEST(Tsimd_i32x4_t, tfSimdCmpAllLt_i32x4) +{ + EXPECT_TRUE(tfSimdCmpAllLt_i32x4(tfSimdLoad_i32x4(1, 2, 3, 4), tfSimdLoad_i32x4(2, 3, 4, 5))); + EXPECT_FALSE(tfSimdCmpAllLt_i32x4(tfSimdLoad_i32x4(2, 3, 4, 5), tfSimdLoad_i32x4(1, 2, 3, 4))); + EXPECT_FALSE(tfSimdCmpAllLt_i32x4(tfSimdLoad_i32x4(1, 1, 1, 1), tfSimdLoad_i32x4(1, 1, 1, 1))); +} + +UTEST(Tsimd_i32x4_t, tfSimdCmpAllLtEq_i32x4) +{ + EXPECT_TRUE(tfSimdCmpAllLtEq_i32x4(tfSimdLoad_i32x4(1, 2, 3, 4), tfSimdLoad_i32x4(2, 3, 4, 5))); + EXPECT_FALSE(tfSimdCmpAllLtEq_i32x4(tfSimdLoad_i32x4(2, 3, 4, 5), tfSimdLoad_i32x4(1, 2, 3, 4))); + EXPECT_TRUE(tfSimdCmpAllLtEq_i32x4(tfSimdLoad_i32x4(1, 1, 1, 1), tfSimdLoad_i32x4(1, 1, 1, 1))); +} + + +UTEST(Tsimd_i32x4_t, tfSimdNot_i32x4) +{ + struct { + Tsimd_i32x4_t input; + Tsimd_i32x4_t expected; + } tests[] = { + {tfSimdLoad_i32x4(0, 0, 0, 0), tfSimdLoad_i32x4(~0, ~0, ~0, ~0)}, + {tfSimdLoad_i32x4(1, 2, 3, 4), tfSimdLoad_i32x4(~1, ~2, ~3, ~4)}, + {tfSimdLoad_i32x4(-1, -2, -3, -4), tfSimdLoad_i32x4(~-1, ~-2, ~-3, ~-4)}, + {tfSimdLoad_i32x4(123, 456, 789, 101112), tfSimdLoad_i32x4(~123, ~456, ~789, ~101112)}, + {tfSimdLoad_i32x4(0xFFFFFFFF, 0xFFFFFFFE, 0xFFFFFFFD, 0xFFFFFFFC), tfSimdLoad_i32x4(~0xFFFFFFFF, ~0xFFFFFFFE, ~0xFFFFFFFD, ~0xFFFFFFFC)}, + }; + + for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { + Tsimd_i32x4_t result = tfSimdNot_i32x4(tests[i].input); + EXPECT_TRUE(tfSimdCmpAllEq_i32x4(result, tests[i].expected)); + } +} + + +UTEST(Tsimd_i32x4_t, tfSimdAnd_i32x4) +{ + struct { + Tsimd_i32x4_t arg1; + Tsimd_i32x4_t arg2; + Tsimd_i32x4_t expected; + } tests[] = { + {tfSimdLoad_i32x4(0, 0, 0, 0), tfSimdLoad_i32x4(0, 0, 0, 0), tfSimdLoad_i32x4(0, 0, 0, 0)}, + {tfSimdLoad_i32x4(1, 2, 3, 4), tfSimdLoad_i32x4(4, 3, 2, 1), tfSimdLoad_i32x4(0, 2, 2, 0)}, + {tfSimdLoad_i32x4(-1, -2, -3, -4), tfSimdLoad_i32x4(1, 2, 3, 4), tfSimdLoad_i32x4(1, 2, 1, 4)}, + {tfSimdLoad_i32x4(0xFFFFFFFF, 0xFFFFFFFE, 0xFFFFFFFD, 0xFFFFFFFC), tfSimdLoad_i32x4(0xFFFFFFFB, 0xFFFFFFFA, 0xFFFFFFF9, 0xFFFFFFF8), tfSimdLoad_i32x4(0xFFFFFFFB, 0xFFFFFFFA, 0xFFFFFFF9, 0xFFFFFFF8)}, + }; + + for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { + Tsimd_i32x4_t result = tfSimdAnd_i32x4(tests[i].arg1, tests[i].arg2); + DLOGF(LogLevel::eDEBUG, LOG_FORMAT_SIMD_32x4i(result)); + EXPECT_TRUE(tfSimdCmpAllEq_i32x4(result, tests[i].expected)); + } +} + + +UTEST(Tsimd_i32x4_t, tfSimdAndNot_i32x4) +{ + struct { + Tsimd_i32x4_t arg1; + Tsimd_i32x4_t arg2; + Tsimd_i32x4_t expected; + } tests[] = { + {tfSimdLoad_i32x4(0, 0, 0, 0), tfSimdLoad_i32x4(0, 0, 0, 0), tfSimdLoad_i32x4(0, 0, 0, 0)}, + {tfSimdLoad_i32x4(1, 2, 3, 4), tfSimdLoad_i32x4(4, 3, 2, 1), tfSimdLoad_i32x4(4, 1, 0, 1)}, + {tfSimdLoad_i32x4(-1, -2, -3, -4), tfSimdLoad_i32x4(1, 2, 
3, 4), tfSimdLoad_i32x4(0, 0, 2, 0)}, + {tfSimdLoad_i32x4(123, 456, 789, 101112), tfSimdLoad_i32x4(654, 321, 987, 654321), tfSimdLoad_i32x4(644, 1, 202, 553217)}, + }; + + for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { + Tsimd_i32x4_t result = tfSimdAndNot_i32x4(tests[i].arg1, tests[i].arg2); + DLOGF(LogLevel::eDEBUG, LOG_FORMAT_SIMD_32x4i(result)); + EXPECT_TRUE(tfSimdCmpAllEq_i32x4(result, tests[i].expected)); + } +} + +UTEST(Tsimd_i32x4_t, tfSimdOr_i32x4) +{ + struct { + Tsimd_i32x4_t arg1; + Tsimd_i32x4_t arg2; + Tsimd_i32x4_t expected; + } tests[] = { + {tfSimdLoad_i32x4(0, 0, 0, 0), tfSimdLoad_i32x4(0, 0, 0, 0), tfSimdLoad_i32x4(0, 0, 0, 0)}, + {tfSimdLoad_i32x4(1, 2, 3, 4), tfSimdLoad_i32x4(4, 3, 2, 1), tfSimdLoad_i32x4(5, 3, 3, 5)}, + {tfSimdLoad_i32x4(-1, -2, -3, -4), tfSimdLoad_i32x4(1, 2, 3, 4), tfSimdLoad_i32x4(-1, -2, -1, -4)}, + }; + + for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { + Tsimd_i32x4_t result = tfSimdOr_i32x4(tests[i].arg1, tests[i].arg2); + DLOGF(LogLevel::eDEBUG, LOG_FORMAT_SIMD_32x4i(result)); + EXPECT_TRUE(tfSimdCmpAllEq_i32x4(result, tests[i].expected)); + } +} + + +UTEST(Tsimd_i32x4_t, tfSimdXor_i32x4) +{ + struct { + Tsimd_i32x4_t arg1; + Tsimd_i32x4_t arg2; + Tsimd_i32x4_t expected; + } tests[] = { + {tfSimdLoad_i32x4(0, 0, 0, 0), tfSimdLoad_i32x4(0, 0, 0, 0), tfSimdLoad_i32x4(0, 0, 0, 0)}, + {tfSimdLoad_i32x4(1, 2, 3, 4), tfSimdLoad_i32x4(4, 3, 2, 1), tfSimdLoad_i32x4(5, 1, 1, 5)}, + {tfSimdLoad_i32x4(-1, -2, -3, -4), tfSimdLoad_i32x4(1, 2, 3, 4), tfSimdLoad_i32x4(-2, -4, -2, -8)}, + {tfSimdLoad_i32x4(123, 456, 789, 101112), tfSimdLoad_i32x4(654, 321, 987, 654321), tfSimdLoad_i32x4(757,137,206,553225)}, + }; + + for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { + Tsimd_i32x4_t result = tfSimdXor_i32x4(tests[i].arg1, tests[i].arg2); + DLOGF(LogLevel::eDEBUG, LOG_FORMAT_SIMD_32x4i(result)); + EXPECT_TRUE(tfSimdCmpAllEq_i32x4(result, tests[i].expected)); + } +} + +UTEST(Tsimd_f32x4_t, tfSimdAdd_f32x4) +{ + struct { + Tsimd_f32x4_t a; + Tsimd_f32x4_t b; + Tsimd_f32x4_t expected; + } tests[] = { + {tfSimdLoad_f32x4(0.0f, 0.0f, 0.0f, 0.0f), tfSimdLoad_f32x4(0.0f, 0.0f, 0.0f, 0.0f), tfSimdLoad_f32x4(0.0f, 0.0f, 0.0f, 0.0f)}, + {tfSimdLoad_f32x4(1.0f, 2.0f, 3.0f, 4.0f), tfSimdLoad_f32x4(4.0f, 3.0f, 2.0f, 1.0f), tfSimdLoad_f32x4(5.0f, 5.0f, 5.0f, 5.0f)}, + {tfSimdLoad_f32x4(-1.0f, -2.0f, -3.0f, -4.0f), tfSimdLoad_f32x4(1.0f, 2.0f, 3.0f, 4.0f), tfSimdLoad_f32x4(0.0f, 0.0f, 0.0f, 0.0f)}, + {tfSimdLoad_f32x4(123.45f, 456.78f, 789.01f, 101112.13f), tfSimdLoad_f32x4(654.32f, 321.09f, 987.65f, 654321.98f), tfSimdLoad_f32x4(777.77f, 777.87f, 1776.66f, 755434.11f)}, + {tfSimdLoad_f32x4(0.1f, 0.2f, 0.3f, 0.4f), tfSimdLoad_f32x4(0.9f, 0.8f, 0.7f, 0.6f), tfSimdLoad_f32x4(1.0f, 1.0f, 1.0f, 1.0f)}, + }; + + for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { + Tsimd_f32x4_t result = tfSimdAdd_f32x4(tests[i].a, tests[i].b); + EXPECT_TRUE(tfSimdCmpAllEq_f32x4(result, tests[i].expected)); + } +} + +UTEST(Tsimd_f32x4_t, tfSimdMul_f32x4) +{ + struct { + Tsimd_f32x4_t a; + Tsimd_f32x4_t b; + Tsimd_f32x4_t expected; + } tests[] = { + {tfSimdLoad_f32x4(1.0f, 2.0f, 3.0f, 4.0f), tfSimdLoad_f32x4(1.0f, 2.0f, 3.0f, 4.0f), tfSimdLoad_f32x4(1.0f, 4.0f, 9.0f, 16.0f)}, + {tfSimdLoad_f32x4(0.0f, 0.0f, 0.0f, 0.0f), tfSimdLoad_f32x4(1.0f, 2.0f, 3.0f, 4.0f), tfSimdLoad_f32x4(0.0f, 0.0f, 0.0f, 0.0f)}, + {tfSimdLoad_f32x4(-1.0f, -2.0f, -3.0f, -4.0f), tfSimdLoad_f32x4(1.0f, 2.0f, 3.0f, 4.0f), tfSimdLoad_f32x4(-1.0f, -4.0f, -9.0f, -16.0f)}, + {tfSimdLoad_f32x4(1.5f, 2.5f, 3.5f, 4.5f), 
tfSimdLoad_f32x4(2.0f, 2.0f, 2.0f, 2.0f), tfSimdLoad_f32x4(3.0f, 5.0f, 7.0f, 9.0f)}, + {tfSimdLoad_f32x4(1.0f, 0.0f, -1.0f, 0.0f), tfSimdLoad_f32x4(0.0f, 1.0f, 0.0f, -1.0f), tfSimdLoad_f32x4(0.0f, 0.0f, 0.0f, 0.0f)}, + }; + + for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { + Tsimd_f32x4_t result = tfSimdMul_f32x4(tests[i].a, tests[i].b); + EXPECT_TRUE(tfSimdCmpAllEq_f32x4(result, tests[i].expected)); + } +} + +UTEST(Tsimd_f32x4_t, tfSimdDiv_f32x4) +{ + struct { + Tsimd_f32x4_t a; + Tsimd_f32x4_t b; + Tsimd_f32x4_t expected; + } tests[] = { + {tfSimdLoad_f32x4(4.0f, 9.0f, 16.0f, 25.0f), tfSimdLoad_f32x4(2.0f, 3.0f, 4.0f, 5.0f), tfSimdLoad_f32x4(2.0f, 3.0f, 4.0f, 5.0f)}, + {tfSimdLoad_f32x4(1.0f, 1.0f, 1.0f, 1.0f), tfSimdLoad_f32x4(1.0f, 1.0f, 1.0f, 1.0f), tfSimdLoad_f32x4(1.0f, 1.0f, 1.0f, 1.0f)}, + {tfSimdLoad_f32x4(-4.0f, -9.0f, -16.0f, -25.0f), tfSimdLoad_f32x4(2.0f, 3.0f, 4.0f, 5.0f), tfSimdLoad_f32x4(-2.0f, -3.0f, -4.0f, -5.0f)}, + {tfSimdLoad_f32x4(0.0f, 0.0f, 0.0f, 0.0f), tfSimdLoad_f32x4(1.0f, 2.0f, 3.0f, 4.0f), tfSimdLoad_f32x4(0.0f, 0.0f, 0.0f, 0.0f)}, + {tfSimdLoad_f32x4(1.0f, 2.0f, 3.0f, 4.0f), tfSimdLoad_f32x4(1.0f, 2.0f, 3.0f, 4.0f), tfSimdLoad_f32x4(1.0f, 1.0f, 1.0f, 1.0f)}, + }; + + for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { + Tsimd_f32x4_t result = tfSimdDiv_f32x4(tests[i].a, tests[i].b); + EXPECT_TRUE(tfSimdCmpAllEq_f32x4(result, tests[i].expected)); + } +} + +UTEST(Tsimd_f32x4_t, tfSimdAbs_f32x4) +{ + struct { + Tsimd_f32x4_t input; + Tsimd_f32x4_t expected; + } tests[] = { + {tfSimdLoad_f32x4(-1.0f, -2.0f, -3.0f, -4.0f), tfSimdLoad_f32x4(1.0f, 2.0f, 3.0f, 4.0f)}, + {tfSimdLoad_f32x4(1.0f, -2.0f, 3.0f, -4.0f), tfSimdLoad_f32x4(1.0f, 2.0f, 3.0f, 4.0f)}, + {tfSimdLoad_f32x4(0.0f, -0.0f, 0.0f, -0.0f), tfSimdLoad_f32x4(0.0f, 0.0f, 0.0f, 0.0f)}, + {tfSimdLoad_f32x4(5.0f, -6.0f, 7.0f, -8.0f), tfSimdLoad_f32x4(5.0f, 6.0f, 7.0f, 8.0f)}, + {tfSimdLoad_f32x4(-9.0f, 10.0f, -11.0f, 12.0f), tfSimdLoad_f32x4(9.0f, 10.0f, 11.0f, 12.0f)}, + }; + + for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { + Tsimd_f32x4_t result = tfSimdAbs_f32x4(tests[i].input); + EXPECT_TRUE(tfSimdCmpAllEq_f32x4(result, tests[i].expected)); + } +} + +UTEST(Tsimd_f32x4_t, tfSimdMadd_f32x4) +{ + struct { + Tsimd_f32x4_t a; + Tsimd_f32x4_t b; + Tsimd_f32x4_t c; + Tsimd_f32x4_t expected; + } tests[] = { + {tfSimdLoad_f32x4(1.0f, 2.0f, 3.0f, 4.0f), tfSimdLoad_f32x4(5.0f, 6.0f, 7.0f, 8.0f), tfSimdLoad_f32x4(9.0f, 10.0f, 11.0f, 12.0f), tfSimdLoad_f32x4(14.0f, 22.0f, 32.0f, 44.0f)}, + {tfSimdLoad_f32x4(-1.0f, -2.0f, -3.0f, -4.0f), tfSimdLoad_f32x4(5.0f, 6.0f, 7.0f, 8.0f), tfSimdLoad_f32x4(9.0f, 10.0f, 11.0f, 12.0f), tfSimdLoad_f32x4(4.0f, -2.0f, -10.0f, -20.0f)}, + {tfSimdLoad_f32x4(0.0f, 0.0f, 0.0f, 0.0f), tfSimdLoad_f32x4(5.0f, 6.0f, 7.0f, 8.0f), tfSimdLoad_f32x4(9.0f, 10.0f, 11.0f, 12.0f), tfSimdLoad_f32x4(9.0f, 10.0f, 11.0f, 12.0f)}, + {tfSimdLoad_f32x4(1.0f, 2.0f, 3.0f, 4.0f), tfSimdLoad_f32x4(0.0f, 0.0f, 0.0f, 0.0f), tfSimdLoad_f32x4(9.0f, 10.0f, 11.0f, 12.0f), tfSimdLoad_f32x4(9.0f, 10.0f, 11.0f, 12.0f)}, + {tfSimdLoad_f32x4(1.0f, 2.0f, 3.0f, 4.0f), tfSimdLoad_f32x4(5.0f, 6.0f, 7.0f, 8.0f), tfSimdLoad_f32x4(0.0f, 0.0f, 0.0f, 0.0f), tfSimdLoad_f32x4(5.0f, 12.0f, 21.0f, 32.0f)}, + }; + + for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { + Tsimd_f32x4_t result = tfSimdMadd_f32x4(tests[i].a, tests[i].b, tests[i].c); + EXPECT_TRUE(tfSimdCmpAllEq_f32x4(result, tests[i].expected)); + } +} + +UTEST(Tsimd_f32x4_t, tfSimdDot_f32x4) +{ + Tsimd_f32x4_t a = tfSimdLoad_f32x4(1.0f, 2.0f, 3.0f, 4.0f); + 
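// tfSimdDot_f32x4 is expected to broadcast the dot product into every lane (the expected vector below is 70 in all four); the scalar form is covered by tfSimdDot_f32x4_f32 in the next test. +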
Tsimd_f32x4_t b = tfSimdLoad_f32x4(5.0f, 6.0f, 7.0f, 8.0f); + Tsimd_f32x4_t result = tfSimdDot_f32x4(a, b); + Tsimd_f32x4_t expected = tfSimdLoad_f32x4(70.0f, 70.0f, 70.0f, 70.0f); // 1*5 + 2*6 + 3*7 + 4*8 = 70 + + EXPECT_TRUE(tfSimdCmpAllEq_f32x4(result, expected)); +} + +UTEST(Tsimd_f32x4_t, tfSimdDot_f32x4_f32) +{ + Tsimd_f32x4_t a = tfSimdLoad_f32x4(1.0f, 2.0f, 3.0f, 4.0f); + Tsimd_f32x4_t b = tfSimdLoad_f32x4(5.0f, 6.0f, 7.0f, 8.0f); + float result = tfSimdDot_f32x4_f32(a, b); + float expected = 70.0f; // 1*5 + 2*6 + 3*7 + 4*8 = 70 + + EXPECT_EQ(result, expected); +} + +#include "Forge/Mem/TF_Memory.h" +#include "Forge/TF_FileSystem.h" +#include "Forge/TF_Log.h" +UTEST_STATE(); +TF_UTEST_MAIN("TF_Simd4") + diff --git a/Forge/tests/Math/TF_Simd3Test.cpp b/Forge/tests/Math/TF_Simd3Test.cpp new file mode 100644 index 0000000000..f445f561d2 --- /dev/null +++ b/Forge/tests/Math/TF_Simd3Test.cpp @@ -0,0 +1,165 @@ +/* + * Copyright (c) Contributors to the Open 3D Engine Project. + * For complete copyright and license terms please see the LICENSE at the root of this distribution. + * + * SPDX-License-Identifier: Apache-2.0 OR MIT + * + */ +#include "TF_TestMain.h" +#include "utest.h" + +#include "Forge/Math/TF_Simd32x3.h" +#include "TF_MathUtils.h" + + +UTEST(TF_Simd3, tfS32x3iCmpGt) +{ + struct { + Tsimd_i32x3_t a; + Tsimd_i32x3_t b; + Tsimd_i32x3_t test; + } tests[] = { + // ... existing test cases ... + // Edge cases: + {tfSimdInt3Load(INT32_MIN, INT32_MIN, INT32_MIN), tfSimdInt3Load(INT32_MAX, INT32_MAX, INT32_MAX), tfSimdInt3Load(TF_SIMD_FALSE, TF_SIMD_FALSE, TF_SIMD_FALSE)}, + {tfSimdInt3Load(INT32_MAX, INT32_MAX, INT32_MAX), tfSimdInt3Load(INT32_MIN, INT32_MIN, INT32_MIN), tfSimdInt3Load(TF_SIMD_TRUE, TF_SIMD_TRUE, TF_SIMD_TRUE)}, + // Mixed values: + {tfSimdInt3Load(0, -1, 1), tfSimdInt3Load(1, 0, -1), tfSimdInt3Load(TF_SIMD_FALSE, TF_SIMD_FALSE, TF_SIMD_TRUE)}, + // All elements equal: + {tfSimdInt3Load(42, 42, 42), tfSimdInt3Load(42, 42, 42), tfSimdInt3Load(TF_SIMD_FALSE, TF_SIMD_FALSE, TF_SIMD_FALSE)}, + // Different element types: + {tfSimdInt3Load(0, 1, 2), tfSimdInt3Load(3, 2, 1), tfSimdInt3Load(TF_SIMD_FALSE, TF_SIMD_FALSE, TF_SIMD_TRUE)}, + }; + for(size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { + EXPECT_TRUE(tfS32x3iCmpAllEq(tfS32x3iCmpGt(tests[i].a, tests[i].b), tests[i].test)); + } +} + +UTEST(TF_Simd3, tfS32x3iCmpLt) +{ + struct { + Tsimd_i32x3_t a; + Tsimd_i32x3_t b; + Tsimd_i32x3_t test; + } tests[] = { + // Less than + {tfSimdInt3Load(12, 13, 14), tfSimdInt3Load(16, 17, 18), tfSimdInt3Load(TF_SIMD_TRUE, TF_SIMD_TRUE, TF_SIMD_TRUE)}, + // Mixed less than and equal + {tfSimdInt3Load(125, -12, 153), tfSimdInt3Load(125, 13, 153), tfSimdInt3Load(TF_SIMD_FALSE, TF_SIMD_TRUE, TF_SIMD_FALSE)}, + // All elements greater than or equal + {tfSimdInt3Load(1, 2, 3), tfSimdInt3Load(-1, 0, 1), tfSimdInt3Load(TF_SIMD_FALSE, TF_SIMD_FALSE, TF_SIMD_FALSE)}, + // Edge cases: + {tfSimdInt3Load(INT32_MIN, INT32_MIN, INT32_MIN), tfSimdInt3Load(INT32_MAX, INT32_MAX, INT32_MAX), tfSimdInt3Load(TF_SIMD_TRUE, TF_SIMD_TRUE, TF_SIMD_TRUE)}, + {tfSimdInt3Load(INT32_MAX, INT32_MAX, INT32_MAX), tfSimdInt3Load(INT32_MIN, INT32_MIN, INT32_MIN), tfSimdInt3Load(TF_SIMD_FALSE, TF_SIMD_FALSE, TF_SIMD_FALSE)}, + }; + for(size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { + EXPECT_TRUE(tfS32x3iCmpAllEq(tfS32x3iCmpLt(tests[i].a, tests[i].b), tests[i].test)); + } +} + + + +UTEST(TF_Simd3, tfS32x3iCmpEq) +{ + struct { + Tsimd_i32x3_t a; + Tsimd_i32x3_t b; + Tsimd_i32x3_t test; + } tests[] = { + // Equal elements + 
{tfSimdInt3Load(12, 13, 14), tfSimdInt3Load(12, 13, 14), tfSimdInt3Load(TF_SIMD_TRUE, TF_SIMD_TRUE, TF_SIMD_TRUE)}, + // Mixed equal and unequal elements + {tfSimdInt3Load(125, -12, 153), tfSimdInt3Load(125, -12, 14), tfSimdInt3Load(TF_SIMD_TRUE, TF_SIMD_TRUE, TF_SIMD_FALSE)}, + // All elements unequal + {tfSimdInt3Load(1, 2, 3), tfSimdInt3Load(5, 6, 7), tfSimdInt3Load(TF_SIMD_FALSE, TF_SIMD_FALSE, TF_SIMD_FALSE)}, + // Edge cases: + {tfSimdInt3Load(INT32_MIN, INT32_MIN, INT32_MIN), tfSimdInt3Load(INT32_MIN, INT32_MIN, INT32_MIN), tfSimdInt3Load(TF_SIMD_TRUE, TF_SIMD_TRUE, TF_SIMD_TRUE)}, + {tfSimdInt3Load(INT32_MAX, INT32_MAX, INT32_MAX), tfSimdInt3Load(INT32_MAX, INT32_MAX, INT32_MAX), tfSimdInt3Load(TF_SIMD_TRUE, TF_SIMD_TRUE, TF_SIMD_TRUE)}, + }; + for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) + { + EXPECT_TRUE(tfS32x3iCmpAllEq(tfS32x3iCmpEq(tests[i].a, tests[i].b), tests[i].test)); + } +} + +UTEST(TF_Simd3, tfS32x3FZero) +{ + Tsimd_f32x3_t value = tfS32x3FZero(); + EXPECT_NEAR(tfS32x3FSelectIndex0(value), 0.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x3FSelectIndex1(value), 0.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x3FSelectIndex2(value), 0.0f, DEFAULT_EPSILON); +} + +UTEST(TF_Simd3, tfSimdSplat4f) +{ + Tsimd_f32x3_t value = tfS32x3FSplat(23.f); + EXPECT_NEAR(tfS32x3FSelectIndex0(value), 23.f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x3FSelectIndex1(value), 23.f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x3FSelectIndex2(value), 23.f, DEFAULT_EPSILON); + Tsimd_f32x3_t value1 = tfS32x3FSplat(5.1f); + EXPECT_NEAR(tfS32x3FSelectIndex0(value1), 5.1f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x3FSelectIndex1(value1), 5.1f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x3FSelectIndex2(value1), 5.1f, DEFAULT_EPSILON); + + Tsimd_f32x3_t value2 = tfSimdFloat3Load(5.1f, 1.0f, 2.0f); + EXPECT_NEAR(tfS32x3FSelectIndex0(value2), 5.1f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x3FSelectIndex1(value2), 1.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x3FSelectIndex2(value2), 2.0f, DEFAULT_EPSILON); +} + +UTEST(TF_Simd3, tfS32x3iNot) { + struct { + Tsimd_i32x3_t test; + Tsimd_i32x3_t expect; + } tests[] = { + {tfSimdInt3Load(0xFFFFFFFF, 0x0000FFFF, 0xFFFF0000), tfSimdInt3Load(0, 0xFFFF0000, 0x0000FFFF)}, + }; + for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) + { + EXPECT_TRUE(tfS32x3iCmpAllEq(tfS32x3iNot(tests[i].test), tests[i].expect)); + } +} + +UTEST(TF_Simd3, tfS32x3iSelect) { + struct { + Tsimd_i32x3_t a; + Tsimd_i32x3_t b; + Tsimd_i32x3_t mask; + Tsimd_i32x3_t expect; + } tests[] = { + {tfSimdInt3Load(10, 11, -13), tfSimdInt3Load(123, -149, 0), tfSimdInt3Load(0,0,0), tfSimdInt3Load(10, 11, -13)}, + {tfSimdInt3Load(10, 11, -13), tfSimdInt3Load(123, -149, 0), tfSimdInt3Load(TF_SIMD_TRUE,0,0), tfSimdInt3Load(123, 11, -13)}, + {tfSimdInt3Load(10, 11, -13), tfSimdInt3Load(123, -149, 0), tfSimdInt3Load(TF_SIMD_TRUE,0,TF_SIMD_TRUE), tfSimdInt3Load(123, 11, 0)}, + {tfSimdInt3Load(10, 11, -13), tfSimdInt3Load(123, -149, 0), tfSimdInt3Load(TF_SIMD_TRUE,TF_SIMD_TRUE,TF_SIMD_TRUE), tfSimdInt3Load(123, -149, 0)}, + }; + for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) + { + EXPECT_TRUE(tfS32x3iCmpAllEq(tfS32x3iSelect(tests[i].a, tests[i].b, tests[i].mask), tests[i].expect)); + } +} + +//UTEST(TF_Simd3, tfS32x3FSelect) { +// struct { +// Tsimd_f32x3_t a; +// Tsimd_f32x3_t b; +// Tsimd_f32x3_t mask; +// Tsimd_f32x3_t expect; +// } tests[] = { +// {tfSimdFloat4Load(10, 11, -13, 32), tfSimdFloat4Load(123, -149, 0, 12), tfSimdFloat4Load(0,0,0,0), tfSimdFloat4Load(10, 11, -13, 32)}, +// {tfSimdFloat4Load(10, 11, -13, 32), 
tfSimdFloat4Load(123, -149, 0, 12), tfSimdFloat4Load(TF_SIMD_TRUE,0,0,0), tfSimdFloat4Load(123, 11, -13, 32)}, +// {tfSimdFloat4Load(10, 11, -13, 32), tfSimdFloat4Load(123, -149, 0, 12), tfSimdFloat4Load(TF_SIMD_TRUE,0,TF_SIMD_TRUE,0), tfSimdFloat4Load(123, 11, 0, 32)}, +// {tfSimdFloat4Load(10, 11, -13, 32), tfSimdFloat4Load(123, -149, 0, 12), tfSimdFloat4Load(TF_SIMD_TRUE,TF_SIMD_TRUE,TF_SIMD_TRUE,TF_SIMD_TRUE), tfSimdFloat4Load(123, -149, 0, 12)}, +// }; +// for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) +// { +// //EXPECT_TRUE(tfS32x3FCmp(tfS32x3iSelect(tests[i].a, tests[i].b, tests[i].mask), tests[i].expect)); +// } +//} + + +#include "Forge/Mem/TF_Memory.h" +#include "Forge/TF_FileSystem.h" +#include "Forge/TF_Log.h" +UTEST_STATE(); +TF_UTEST_MAIN("TF_Simd3") + diff --git a/Forge/tests/Math/TF_SimdFloat2x32Test.cpp b/Forge/tests/Math/TF_SimdFloat2x32Test.cpp index 2c18054f5d..5f59d0bf63 100644 --- a/Forge/tests/Math/TF_SimdFloat2x32Test.cpp +++ b/Forge/tests/Math/TF_SimdFloat2x32Test.cpp @@ -12,143 +12,143 @@ #include "TF_MathUtils.h" -UTEST(TF_Simd2, tfSimd2iCmpGt) +UTEST(TF_Simd2, tfS32x2ICmpGt) { struct { - TSimdInt32x2 a; - TSimdInt32x2 b; - TSimdInt32x2 test; + Tsimd_i32x2_t a; + Tsimd_i32x2_t b; + Tsimd_i32x2_t test; } tests[] = { // ... existing test cases ... // Edge cases: - {tfSimd2iLoadImmediate(INT32_MIN, INT32_MIN), tfSimd2iLoadImmediate(INT32_MAX, INT32_MAX), tfSimd2iLoadImmediate(TF_SIMD_FALSE, TF_SIMD_FALSE)}, - {tfSimd2iLoadImmediate(INT32_MAX, INT32_MAX), tfSimd2iLoadImmediate(INT32_MIN, INT32_MIN), tfSimd2iLoadImmediate(TF_SIMD_TRUE, TF_SIMD_TRUE)}, + {tfS32x2ILoadImmediate(INT32_MIN, INT32_MIN), tfS32x2ILoadImmediate(INT32_MAX, INT32_MAX), tfS32x2ILoadImmediate(TF_SIMD_FALSE, TF_SIMD_FALSE)}, + {tfS32x2ILoadImmediate(INT32_MAX, INT32_MAX), tfS32x2ILoadImmediate(INT32_MIN, INT32_MIN), tfS32x2ILoadImmediate(TF_SIMD_TRUE, TF_SIMD_TRUE)}, // Mixed values: - {tfSimd2iLoadImmediate(0, -1), tfSimd2iLoadImmediate(1, 0), tfSimd2iLoadImmediate(TF_SIMD_FALSE, TF_SIMD_FALSE)}, + {tfS32x2ILoadImmediate(0, -1), tfS32x2ILoadImmediate(1, 0), tfS32x2ILoadImmediate(TF_SIMD_FALSE, TF_SIMD_FALSE)}, // All elements equal: - {tfSimd2iLoadImmediate(42, 42), tfSimd2iLoadImmediate(42, 42), tfSimd2iLoadImmediate(TF_SIMD_FALSE, TF_SIMD_FALSE)}, + {tfS32x2ILoadImmediate(42, 42), tfS32x2ILoadImmediate(42, 42), tfS32x2ILoadImmediate(TF_SIMD_FALSE, TF_SIMD_FALSE)}, // Different element types: - {tfSimd2iLoadImmediate(0, 1), tfSimd2iLoadImmediate(3, 2), tfSimd2iLoadImmediate(TF_SIMD_FALSE, TF_SIMD_FALSE)}, + {tfS32x2ILoadImmediate(0, 1), tfS32x2ILoadImmediate(3, 2), tfS32x2ILoadImmediate(TF_SIMD_FALSE, TF_SIMD_FALSE)}, }; for(size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { - EXPECT_TRUE(tfSimd2iCmpAllEq(tfSimd2iCmpGt(tests[i].a, tests[i].b), tests[i].test)); + EXPECT_TRUE(tfS32x2ICmpAllEq(tfS32x2ICmpGt(tests[i].a, tests[i].b), tests[i].test)); } } -UTEST(TF_Simd2, tfSimd2iCmpLt) +UTEST(TF_Simd2, tfS32x2ICmpLt) { struct { - TSimdInt32x2 a; - TSimdInt32x2 b; - TSimdInt32x2 test; + Tsimd_i32x2_t a; + Tsimd_i32x2_t b; + Tsimd_i32x2_t test; } tests[] = { // Less than - {tfSimd2iLoadImmediate(12, 13), tfSimd2iLoadImmediate(16, 17), tfSimd2iLoadImmediate(TF_SIMD_TRUE, TF_SIMD_TRUE)}, + {tfS32x2ILoadImmediate(12, 13), tfS32x2ILoadImmediate(16, 17), tfS32x2ILoadImmediate(TF_SIMD_TRUE, TF_SIMD_TRUE)}, // Mixed less than and equal - {tfSimd2iLoadImmediate(125, -12), tfSimd2iLoadImmediate(125, 13), tfSimd2iLoadImmediate(TF_SIMD_FALSE, TF_SIMD_TRUE)}, + {tfS32x2ILoadImmediate(125, -12), 
tfS32x2ILoadImmediate(125, 13), tfS32x2ILoadImmediate(TF_SIMD_FALSE, TF_SIMD_TRUE)}, // All elements greater than or equal - {tfSimd2iLoadImmediate(1, 2), tfSimd2iLoadImmediate(-1, 0), tfSimd2iLoadImmediate(TF_SIMD_FALSE, TF_SIMD_FALSE)}, + {tfS32x2ILoadImmediate(1, 2), tfS32x2ILoadImmediate(-1, 0), tfS32x2ILoadImmediate(TF_SIMD_FALSE, TF_SIMD_FALSE)}, // Edge cases: - {tfSimd2iLoadImmediate(INT32_MIN, INT32_MIN), tfSimd2iLoadImmediate(INT32_MAX, INT32_MAX), tfSimd2iLoadImmediate(TF_SIMD_TRUE, TF_SIMD_TRUE)}, - {tfSimd2iLoadImmediate(INT32_MAX, INT32_MAX), tfSimd2iLoadImmediate(INT32_MIN, INT32_MIN), tfSimd2iLoadImmediate(TF_SIMD_FALSE, TF_SIMD_FALSE)}, + {tfS32x2ILoadImmediate(INT32_MIN, INT32_MIN), tfS32x2ILoadImmediate(INT32_MAX, INT32_MAX), tfS32x2ILoadImmediate(TF_SIMD_TRUE, TF_SIMD_TRUE)}, + {tfS32x2ILoadImmediate(INT32_MAX, INT32_MAX), tfS32x2ILoadImmediate(INT32_MIN, INT32_MIN), tfS32x2ILoadImmediate(TF_SIMD_FALSE, TF_SIMD_FALSE)}, }; for(size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { - EXPECT_TRUE(tfSimd2iCmpAllEq(tfSimd2iCmpLt(tests[i].a, tests[i].b), tests[i].test)); + EXPECT_TRUE(tfS32x2ICmpAllEq(tfS32x2ICmpLt(tests[i].a, tests[i].b), tests[i].test)); } } -UTEST(TF_Simd2, tfSimd2iCmpEq) +UTEST(TF_Simd2, tfS32x2ICmpEq) { struct { - TSimdInt32x2 a; - TSimdInt32x2 b; - TSimdInt32x2 test; + Tsimd_i32x2_t a; + Tsimd_i32x2_t b; + Tsimd_i32x2_t test; } tests[] = { // Equal elements - {tfSimd2iLoadImmediate(12, 13), tfSimd2iLoadImmediate(12, 13), tfSimd2iLoadImmediate(TF_SIMD_TRUE, TF_SIMD_TRUE)}, + {tfS32x2ILoadImmediate(12, 13), tfS32x2ILoadImmediate(12, 13), tfS32x2ILoadImmediate(TF_SIMD_TRUE, TF_SIMD_TRUE)}, // Mixed equal and unequal elements - {tfSimd2iLoadImmediate(125, -12), tfSimd2iLoadImmediate(125, -12), tfSimd2iLoadImmediate(TF_SIMD_TRUE, TF_SIMD_TRUE)}, + {tfS32x2ILoadImmediate(125, -12), tfS32x2ILoadImmediate(125, -12), tfS32x2ILoadImmediate(TF_SIMD_TRUE, TF_SIMD_TRUE)}, // All elements unequal - {tfSimd2iLoadImmediate(1, 2), tfSimd2iLoadImmediate(5, 6), tfSimd2iLoadImmediate(TF_SIMD_FALSE, TF_SIMD_FALSE)}, + {tfS32x2ILoadImmediate(1, 2), tfS32x2ILoadImmediate(5, 6), tfS32x2ILoadImmediate(TF_SIMD_FALSE, TF_SIMD_FALSE)}, // Edge cases: - {tfSimd2iLoadImmediate(INT32_MIN, INT32_MIN), tfSimd2iLoadImmediate(INT32_MIN, INT32_MIN), tfSimd2iLoadImmediate(TF_SIMD_TRUE, TF_SIMD_TRUE)}, - {tfSimd2iLoadImmediate(INT32_MAX, INT32_MAX), tfSimd2iLoadImmediate(INT32_MAX, INT32_MAX), tfSimd2iLoadImmediate(TF_SIMD_TRUE, TF_SIMD_TRUE)}, + {tfS32x2ILoadImmediate(INT32_MIN, INT32_MIN), tfS32x2ILoadImmediate(INT32_MIN, INT32_MIN), tfS32x2ILoadImmediate(TF_SIMD_TRUE, TF_SIMD_TRUE)}, + {tfS32x2ILoadImmediate(INT32_MAX, INT32_MAX), tfS32x2ILoadImmediate(INT32_MAX, INT32_MAX), tfS32x2ILoadImmediate(TF_SIMD_TRUE, TF_SIMD_TRUE)}, }; for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { - EXPECT_TRUE(tfSimd2iCmpAllEq(tfSimd2iCmpEq(tests[i].a, tests[i].b), tests[i].test)); + EXPECT_TRUE(tfS32x2ICmpAllEq(tfS32x2ICmpEq(tests[i].a, tests[i].b), tests[i].test)); } } -UTEST(TF_Simd2, tfSimd2fZero) +UTEST(TF_Simd2, tfS32x2FZero) { - TSimdFloat32x2 value = tfSimd2fZero(); - EXPECT_NEAR(tfSimd2fSelectIndex0(value), 0.0f, DEFAULT_EPSILON); - EXPECT_NEAR(tfSimd2fSelectIndex1(value), 0.0f, DEFAULT_EPSILON); + Tsimd_f32x2_t value = tfS32x2FZero(); + EXPECT_NEAR(tfS32x2FSelectIndex0(value), 0.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x2FSelectIndex1(value), 0.0f, DEFAULT_EPSILON); } -UTEST(TF_Simd2, tfSimd2fSplat) +UTEST(TF_Simd2, tfS32x2FSplat) { - TSimdFloat32x2 value = tfSimd2fSplat(23.f); - 
EXPECT_NEAR(tfSimd2fSelectIndex0(value), 23.f, DEFAULT_EPSILON); - EXPECT_NEAR(tfSimd2fSelectIndex1(value), 23.f, DEFAULT_EPSILON); - TSimdFloat32x2 value1 = tfSimd2fSplat(5.1f); - EXPECT_NEAR(tfSimd2fSelectIndex0(value1), 5.1f, DEFAULT_EPSILON); - EXPECT_NEAR(tfSimd2fSelectIndex1(value1), 5.1f, DEFAULT_EPSILON); - - TSimdFloat32x2 value2 = tfSimdFloat2Load(5.1f, 1.0f); - EXPECT_NEAR(tfSimd2fSelectIndex0(value2), 5.1f, DEFAULT_EPSILON); - EXPECT_NEAR(tfSimd2fSelectIndex1(value2), 1.0f, DEFAULT_EPSILON); + Tsimd_f32x2_t value = tfS32x2FSplat(23.f); + EXPECT_NEAR(tfS32x2FSelectIndex0(value), 23.f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x2FSelectIndex1(value), 23.f, DEFAULT_EPSILON); + Tsimd_f32x2_t value1 = tfS32x2FSplat(5.1f); + EXPECT_NEAR(tfS32x2FSelectIndex0(value1), 5.1f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x2FSelectIndex1(value1), 5.1f, DEFAULT_EPSILON); + + Tsimd_f32x2_t value2 = tfSimdFloat2Load(5.1f, 1.0f); + EXPECT_NEAR(tfS32x2FSelectIndex0(value2), 5.1f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x2FSelectIndex1(value2), 1.0f, DEFAULT_EPSILON); } -UTEST(TF_Simd2, tfSimd3iNot) { +UTEST(TF_Simd2, tfS32x2INot) { struct { - TSimdInt32x2 test; - TSimdInt32x2 expect; + Tsimd_i32x2_t test; + Tsimd_i32x2_t expect; } tests[] = { - {tfSimd2iLoadImmediate(0xFFFFFFFF, 0x0000FFFF), tfSimd2iLoadImmediate(0, 0xFFFF0000)}, + {tfS32x2ILoadImmediate(0xFFFFFFFF, 0x0000FFFF), tfS32x2ILoadImmediate(0, 0xFFFF0000)}, }; for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { - EXPECT_TRUE(tfSimd2iCmpAllEq(tfSimd2iNot(tests[i].test), tests[i].expect)); + EXPECT_TRUE(tfS32x2ICmpAllEq(tfS32x2INot(tests[i].test), tests[i].expect)); } } -UTEST(TF_Simd2, tfSimd2iSelect) { +UTEST(TF_Simd2, tfS32x2ISelect) { struct { - TSimdInt32x2 a; - TSimdInt32x2 b; - TSimdInt32x2 mask; - TSimdInt32x2 expect; + Tsimd_i32x2_t a; + Tsimd_i32x2_t b; + Tsimd_i32x2_t mask; + Tsimd_i32x2_t expect; } tests[] = { - {tfSimd2iLoadImmediate(10, 11), tfSimd2iLoadImmediate(123, -149), tfSimd2iLoadImmediate(0,0), tfSimd2iLoadImmediate(10, 11)}, - {tfSimd2iLoadImmediate(10, 11), tfSimd2iLoadImmediate(123, -149), tfSimd2iLoadImmediate(TF_SIMD_TRUE,0), tfSimd2iLoadImmediate(123, 11)}, - {tfSimd2iLoadImmediate(10, 11), tfSimd2iLoadImmediate(123, -149), tfSimd2iLoadImmediate(TF_SIMD_TRUE,0), tfSimd2iLoadImmediate(123, 11)}, - {tfSimd2iLoadImmediate(10, 11), tfSimd2iLoadImmediate(123, -149), tfSimd2iLoadImmediate(TF_SIMD_TRUE,TF_SIMD_TRUE), tfSimd2iLoadImmediate(123, -149)}, + {tfS32x2ILoadImmediate(10, 11), tfS32x2ILoadImmediate(123, -149), tfS32x2ILoadImmediate(0,0), tfS32x2ILoadImmediate(10, 11)}, + {tfS32x2ILoadImmediate(10, 11), tfS32x2ILoadImmediate(123, -149), tfS32x2ILoadImmediate(TF_SIMD_TRUE,0), tfS32x2ILoadImmediate(123, 11)}, + {tfS32x2ILoadImmediate(10, 11), tfS32x2ILoadImmediate(123, -149), tfS32x2ILoadImmediate(TF_SIMD_TRUE,0), tfS32x2ILoadImmediate(123, 11)}, + {tfS32x2ILoadImmediate(10, 11), tfS32x2ILoadImmediate(123, -149), tfS32x2ILoadImmediate(TF_SIMD_TRUE,TF_SIMD_TRUE), tfS32x2ILoadImmediate(123, -149)}, }; for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { - EXPECT_TRUE(tfSimd2iCmpAllEq(tfSimd2iSelect(tests[i].a, tests[i].b, tests[i].mask), tests[i].expect)); + EXPECT_TRUE(tfS32x2ICmpAllEq(tfS32x2ISelect(tests[i].a, tests[i].b, tests[i].mask), tests[i].expect)); } } -//UTEST(TF_Simd2, tfSimd3fSelect) { +//UTEST(TF_Simd2, tfS32x3FSelect) { // struct { // TSimd32fx4 a; // TSimd32fx4 b; // TSimd32fx4 mask; // TSimd32fx4 expect; // } tests[] = { -// {tfSimd3fLoadImmediate(10, 11, -13, 32), tfSimd3fLoadImmediate(123, -149, 0, 12), 
tfSimd3fLoadImmediate(0,0,0,0), tfSimd3fLoadImmediate(10, 11, -13, 32)}, -// {tfSimd3fLoadImmediate(10, 11, -13, 32), tfSimd3fLoadImmediate(123, -149, 0, 12), tfSimd3fLoadImmediate(TF_SIMD_TRUE,0,0,0), tfSimd3fLoadImmediate(123, 11, -13, 32)}, -// {tfSimd3fLoadImmediate(10, 11, -13, 32), tfSimd3fLoadImmediate(123, -149, 0, 12), tfSimd3fLoadImmediate(TF_SIMD_TRUE,0,TF_SIMD_TRUE,0), tfSimd3fLoadImmediate(123, 11, 0, 32)}, -// {tfSimd3fLoadImmediate(10, 11, -13, 32), tfSimd3fLoadImmediate(123, -149, 0, 12), tfSimd3fLoadImmediate(TF_SIMD_TRUE,TF_SIMD_TRUE,TF_SIMD_TRUE,TF_SIMD_TRUE), tfSimd3fLoadImmediate(123, -149, 0, 12)}, +// {tfS32x3FLoadImmediate(10, 11, -13, 32), tfS32x3FLoadImmediate(123, -149, 0, 12), tfS32x3FLoadImmediate(0,0,0,0), tfS32x3FLoadImmediate(10, 11, -13, 32)}, +// {tfS32x3FLoadImmediate(10, 11, -13, 32), tfS32x3FLoadImmediate(123, -149, 0, 12), tfS32x3FLoadImmediate(TF_SIMD_TRUE,0,0,0), tfS32x3FLoadImmediate(123, 11, -13, 32)}, +// {tfS32x3FLoadImmediate(10, 11, -13, 32), tfS32x3FLoadImmediate(123, -149, 0, 12), tfS32x3FLoadImmediate(TF_SIMD_TRUE,0,TF_SIMD_TRUE,0), tfS32x3FLoadImmediate(123, 11, 0, 32)}, +// {tfS32x3FLoadImmediate(10, 11, -13, 32), tfS32x3FLoadImmediate(123, -149, 0, 12), tfS32x3FLoadImmediate(TF_SIMD_TRUE,TF_SIMD_TRUE,TF_SIMD_TRUE,TF_SIMD_TRUE), tfS32x3FLoadImmediate(123, -149, 0, 12)}, // }; // for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) // { -// //EXPECT_TRUE(tfSimd3fCmp(tfSimd2iSelect(tests[i].a, tests[i].b, tests[i].mask), tests[i].expect)); +// //EXPECT_TRUE(tfS32x3FCmp(tfS32x2ISelect(tests[i].a, tests[i].b, tests[i].mask), tests[i].expect)); // } //} diff --git a/Forge/tests/Math/TF_SimdFloat3x32Test.cpp b/Forge/tests/Math/TF_SimdFloat3x32Test.cpp index d9e28b0580..5f6dcfadba 100644 --- a/Forge/tests/Math/TF_SimdFloat3x32Test.cpp +++ b/Forge/tests/Math/TF_SimdFloat3x32Test.cpp @@ -12,138 +12,138 @@ #include "TF_MathUtils.h" -UTEST(TF_Simd3, tfSimd3iCmpGt) +UTEST(TF_Simd3, tfS32x3iCmpGt) { struct { - TSimdInt32x3 a; - TSimdInt32x3 b; - TSimdInt32x3 test; + Tsimd_i32x3_t a; + Tsimd_i32x3_t b; + Tsimd_i32x3_t test; } tests[] = { // ... existing test cases ... 
// Edge cases: - {tfSimdInt3x32Load(INT32_MIN, INT32_MIN, INT32_MIN), tfSimdInt3x32Load(INT32_MAX, INT32_MAX, INT32_MAX), tfSimdInt3x32Load(TF_SIMD_FALSE, TF_SIMD_FALSE, TF_SIMD_FALSE)}, - {tfSimdInt3x32Load(INT32_MAX, INT32_MAX, INT32_MAX), tfSimdInt3x32Load(INT32_MIN, INT32_MIN, INT32_MIN), tfSimdInt3x32Load(TF_SIMD_TRUE, TF_SIMD_TRUE, TF_SIMD_TRUE)}, + {tfSimd3x32ILoad(INT32_MIN, INT32_MIN, INT32_MIN), tfSimd3x32ILoad(INT32_MAX, INT32_MAX, INT32_MAX), tfSimd3x32ILoad(TF_SIMD_FALSE, TF_SIMD_FALSE, TF_SIMD_FALSE)}, + {tfSimd3x32ILoad(INT32_MAX, INT32_MAX, INT32_MAX), tfSimd3x32ILoad(INT32_MIN, INT32_MIN, INT32_MIN), tfSimd3x32ILoad(TF_SIMD_TRUE, TF_SIMD_TRUE, TF_SIMD_TRUE)}, // Mixed values: - {tfSimdInt3x32Load(0, -1, 1), tfSimdInt3x32Load(1, 0, -1), tfSimdInt3x32Load(TF_SIMD_FALSE, TF_SIMD_FALSE, TF_SIMD_TRUE)}, + {tfSimd3x32ILoad(0, -1, 1), tfSimd3x32ILoad(1, 0, -1), tfSimd3x32ILoad(TF_SIMD_FALSE, TF_SIMD_FALSE, TF_SIMD_TRUE)}, // All elements equal: - {tfSimdInt3x32Load(42, 42, 42), tfSimdInt3x32Load(42, 42, 42), tfSimdInt3x32Load(TF_SIMD_FALSE, TF_SIMD_FALSE, TF_SIMD_FALSE)}, + {tfSimd3x32ILoad(42, 42, 42), tfSimd3x32ILoad(42, 42, 42), tfSimd3x32ILoad(TF_SIMD_FALSE, TF_SIMD_FALSE, TF_SIMD_FALSE)}, // Different element types: - {tfSimdInt3x32Load(0, 1, 2), tfSimdInt3x32Load(3, 2, 1), tfSimdInt3x32Load(TF_SIMD_FALSE, TF_SIMD_FALSE, TF_SIMD_TRUE)}, + {tfSimd3x32ILoad(0, 1, 2), tfSimd3x32ILoad(3, 2, 1), tfSimd3x32ILoad(TF_SIMD_FALSE, TF_SIMD_FALSE, TF_SIMD_TRUE)}, }; for(size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { - EXPECT_TRUE(tfSimd3iCmpAllEq(tfSimd3iCmpGt(tests[i].a, tests[i].b), tests[i].test)); + EXPECT_TRUE(tfS32x3iCmpAllEq(tfS32x3iCmpGt(tests[i].a, tests[i].b), tests[i].test)); } } -UTEST(TF_Simd3, tfSimd3iCmpLt) +UTEST(TF_Simd3, tfS32x3iCmpLt) { struct { - TSimdInt32x3 a; - TSimdInt32x3 b; - TSimdInt32x3 test; + Tsimd_i32x3_t a; + Tsimd_i32x3_t b; + Tsimd_i32x3_t test; } tests[] = { // Less than - {tfSimdInt3x32Load(12, 13, 14), tfSimdInt3x32Load(16, 17, 18), tfSimdInt3x32Load(TF_SIMD_TRUE, TF_SIMD_TRUE, TF_SIMD_TRUE)}, + {tfSimd3x32ILoad(12, 13, 14), tfSimd3x32ILoad(16, 17, 18), tfSimd3x32ILoad(TF_SIMD_TRUE, TF_SIMD_TRUE, TF_SIMD_TRUE)}, // Mixed less than and equal - {tfSimdInt3x32Load(125, -12, 153), tfSimdInt3x32Load(125, 13, 153), tfSimdInt3x32Load(TF_SIMD_FALSE, TF_SIMD_TRUE, TF_SIMD_FALSE)}, + {tfSimd3x32ILoad(125, -12, 153), tfSimd3x32ILoad(125, 13, 153), tfSimd3x32ILoad(TF_SIMD_FALSE, TF_SIMD_TRUE, TF_SIMD_FALSE)}, // All elements greater than or equal - {tfSimdInt3x32Load(1, 2, 3), tfSimdInt3x32Load(-1, 0, 1), tfSimdInt3x32Load(TF_SIMD_FALSE, TF_SIMD_FALSE, TF_SIMD_FALSE)}, + {tfSimd3x32ILoad(1, 2, 3), tfSimd3x32ILoad(-1, 0, 1), tfSimd3x32ILoad(TF_SIMD_FALSE, TF_SIMD_FALSE, TF_SIMD_FALSE)}, // Edge cases: - {tfSimdInt3x32Load(INT32_MIN, INT32_MIN, INT32_MIN), tfSimdInt3x32Load(INT32_MAX, INT32_MAX, INT32_MAX), tfSimdInt3x32Load(TF_SIMD_TRUE, TF_SIMD_TRUE, TF_SIMD_TRUE)}, - {tfSimdInt3x32Load(INT32_MAX, INT32_MAX, INT32_MAX), tfSimdInt3x32Load(INT32_MIN, INT32_MIN, INT32_MIN), tfSimdInt3x32Load(TF_SIMD_FALSE, TF_SIMD_FALSE, TF_SIMD_FALSE)}, + {tfSimd3x32ILoad(INT32_MIN, INT32_MIN, INT32_MIN), tfSimd3x32ILoad(INT32_MAX, INT32_MAX, INT32_MAX), tfSimd3x32ILoad(TF_SIMD_TRUE, TF_SIMD_TRUE, TF_SIMD_TRUE)}, + {tfSimd3x32ILoad(INT32_MAX, INT32_MAX, INT32_MAX), tfSimd3x32ILoad(INT32_MIN, INT32_MIN, INT32_MIN), tfSimd3x32ILoad(TF_SIMD_FALSE, TF_SIMD_FALSE, TF_SIMD_FALSE)}, }; for(size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { - 
EXPECT_TRUE(tfSimd3iCmpAllEq(tfSimd3iCmpLt(tests[i].a, tests[i].b), tests[i].test)); + EXPECT_TRUE(tfS32x3iCmpAllEq(tfS32x3iCmpLt(tests[i].a, tests[i].b), tests[i].test)); } } -UTEST(TF_Simd3, tfSimd3iCmpEq) +UTEST(TF_Simd3, tfS32x3iCmpEq) { struct { - TSimdInt32x3 a; - TSimdInt32x3 b; - TSimdInt32x3 test; + Tsimd_i32x3_t a; + Tsimd_i32x3_t b; + Tsimd_i32x3_t test; } tests[] = { // Equal elements - {tfSimdInt3x32Load(12, 13, 14), tfSimdInt3x32Load(12, 13, 14), tfSimdInt3x32Load(TF_SIMD_TRUE, TF_SIMD_TRUE, TF_SIMD_TRUE)}, + {tfSimd3x32ILoad(12, 13, 14), tfSimd3x32ILoad(12, 13, 14), tfSimd3x32ILoad(TF_SIMD_TRUE, TF_SIMD_TRUE, TF_SIMD_TRUE)}, // Mixed equal and unequal elements - {tfSimdInt3x32Load(125, -12, 153), tfSimdInt3x32Load(125, -12, 14), tfSimdInt3x32Load(TF_SIMD_TRUE, TF_SIMD_TRUE, TF_SIMD_FALSE)}, + {tfSimd3x32ILoad(125, -12, 153), tfSimd3x32ILoad(125, -12, 14), tfSimd3x32ILoad(TF_SIMD_TRUE, TF_SIMD_TRUE, TF_SIMD_FALSE)}, // All elements unequal - {tfSimdInt3x32Load(1, 2, 3), tfSimdInt3x32Load(5, 6, 7), tfSimdInt3x32Load(TF_SIMD_FALSE, TF_SIMD_FALSE, TF_SIMD_FALSE)}, + {tfSimd3x32ILoad(1, 2, 3), tfSimd3x32ILoad(5, 6, 7), tfSimd3x32ILoad(TF_SIMD_FALSE, TF_SIMD_FALSE, TF_SIMD_FALSE)}, // Edge cases: - {tfSimdInt3x32Load(INT32_MIN, INT32_MIN, INT32_MIN), tfSimdInt3x32Load(INT32_MIN, INT32_MIN, INT32_MIN), tfSimdInt3x32Load(TF_SIMD_TRUE, TF_SIMD_TRUE, TF_SIMD_TRUE)}, - {tfSimdInt3x32Load(INT32_MAX, INT32_MAX, INT32_MAX), tfSimdInt3x32Load(INT32_MAX, INT32_MAX, INT32_MAX), tfSimdInt3x32Load(TF_SIMD_TRUE, TF_SIMD_TRUE, TF_SIMD_TRUE)}, + {tfSimd3x32ILoad(INT32_MIN, INT32_MIN, INT32_MIN), tfSimd3x32ILoad(INT32_MIN, INT32_MIN, INT32_MIN), tfSimd3x32ILoad(TF_SIMD_TRUE, TF_SIMD_TRUE, TF_SIMD_TRUE)}, + {tfSimd3x32ILoad(INT32_MAX, INT32_MAX, INT32_MAX), tfSimd3x32ILoad(INT32_MAX, INT32_MAX, INT32_MAX), tfSimd3x32ILoad(TF_SIMD_TRUE, TF_SIMD_TRUE, TF_SIMD_TRUE)}, }; for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { - EXPECT_TRUE(tfSimd3iCmpAllEq(tfSimd3iCmpEq(tests[i].a, tests[i].b), tests[i].test)); + EXPECT_TRUE(tfS32x3iCmpAllEq(tfS32x3iCmpEq(tests[i].a, tests[i].b), tests[i].test)); } } -UTEST(TF_Simd3, tfSimd3fZero) +UTEST(TF_Simd3, tfS32x3FZero) { - TSimdFloat32x3 value = tfSimd3fZero(); - EXPECT_NEAR(tfSimd3fSelectIndex0(value), 0.0f, DEFAULT_EPSILON); - EXPECT_NEAR(tfSimd3fSelectIndex1(value), 0.0f, DEFAULT_EPSILON); - EXPECT_NEAR(tfSimd3fSelectIndex2(value), 0.0f, DEFAULT_EPSILON); + Tsimd_f32x3_t value = tfS32x3FZero(); + EXPECT_NEAR(tfS32x3FSelectIndex0(value), 0.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x3FSelectIndex1(value), 0.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x3FSelectIndex2(value), 0.0f, DEFAULT_EPSILON); } UTEST(TF_Simd3, tfSimdSplat4f) { - TSimdFloat32x3 value = tfSimd3fSplat(23.f); - EXPECT_NEAR(tfSimd3fSelectIndex0(value), 23.f, DEFAULT_EPSILON); - EXPECT_NEAR(tfSimd3fSelectIndex1(value), 23.f, DEFAULT_EPSILON); - EXPECT_NEAR(tfSimd3fSelectIndex2(value), 23.f, DEFAULT_EPSILON); - TSimdFloat32x3 value1 = tfSimd3fSplat(5.1f); - EXPECT_NEAR(tfSimd3fSelectIndex0(value1), 5.1f, DEFAULT_EPSILON); - EXPECT_NEAR(tfSimd3fSelectIndex1(value1), 5.1f, DEFAULT_EPSILON); - EXPECT_NEAR(tfSimd3fSelectIndex2(value1), 5.1f, DEFAULT_EPSILON); - - TSimdFloat32x3 value2 = tfSimdFloat3x32Load(5.1f, 1.0f, 2.0f); - EXPECT_NEAR(tfSimd3fSelectIndex0(value2), 5.1f, DEFAULT_EPSILON); - EXPECT_NEAR(tfSimd3fSelectIndex1(value2), 1.0f, DEFAULT_EPSILON); - EXPECT_NEAR(tfSimd3fSelectIndex2(value2), 2.0f, DEFAULT_EPSILON); + Tsimd_f32x3_t value = tfS32x3FSplat(23.f); + 
EXPECT_NEAR(tfS32x3FSelectIndex0(value), 23.f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x3FSelectIndex1(value), 23.f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x3FSelectIndex2(value), 23.f, DEFAULT_EPSILON); + Tsimd_f32x3_t value1 = tfS32x3FSplat(5.1f); + EXPECT_NEAR(tfS32x3FSelectIndex0(value1), 5.1f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x3FSelectIndex1(value1), 5.1f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x3FSelectIndex2(value1), 5.1f, DEFAULT_EPSILON); + + Tsimd_f32x3_t value2 = tfSimd3x32FLoad(5.1f, 1.0f, 2.0f); + EXPECT_NEAR(tfS32x3FSelectIndex0(value2), 5.1f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x3FSelectIndex1(value2), 1.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x3FSelectIndex2(value2), 2.0f, DEFAULT_EPSILON); } -UTEST(TF_Simd3, tfSimd3iNot) { +UTEST(TF_Simd3, tfS32x3iNot) { struct { - TSimdInt32x3 test; - TSimdInt32x3 expect; + Tsimd_i32x3_t test; + Tsimd_i32x3_t expect; } tests[] = { - {tfSimdInt3x32Load(0xFFFFFFFF, 0x0000FFFF, 0xFFFF0000), tfSimdInt3x32Load(0, 0xFFFF0000, 0x0000FFFF)}, + {tfSimd3x32ILoad(0xFFFFFFFF, 0x0000FFFF, 0xFFFF0000), tfSimd3x32ILoad(0, 0xFFFF0000, 0x0000FFFF)}, }; for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { - EXPECT_TRUE(tfSimd3iCmpAllEq(tfSimd3iNot(tests[i].test), tests[i].expect)); + EXPECT_TRUE(tfS32x3iCmpAllEq(tfS32x3iNot(tests[i].test), tests[i].expect)); } } -UTEST(TF_Simd3, tfSimd3iSelect) { +UTEST(TF_Simd3, tfS32x3iSelect) { struct { - TSimdInt32x3 a; - TSimdInt32x3 b; - TSimdInt32x3 mask; - TSimdInt32x3 expect; + Tsimd_i32x3_t a; + Tsimd_i32x3_t b; + Tsimd_i32x3_t mask; + Tsimd_i32x3_t expect; } tests[] = { - {tfSimdInt3x32Load(10, 11, -13), tfSimdInt3x32Load(123, -149, 0), tfSimdInt3x32Load(0,0,0), tfSimdInt3x32Load(10, 11, -13)}, - {tfSimdInt3x32Load(10, 11, -13), tfSimdInt3x32Load(123, -149, 0), tfSimdInt3x32Load(TF_SIMD_TRUE,0,0), tfSimdInt3x32Load(123, 11, -13)}, - {tfSimdInt3x32Load(10, 11, -13), tfSimdInt3x32Load(123, -149, 0), tfSimdInt3x32Load(TF_SIMD_TRUE,0,TF_SIMD_TRUE), tfSimdInt3x32Load(123, 11, 0)}, - {tfSimdInt3x32Load(10, 11, -13), tfSimdInt3x32Load(123, -149, 0), tfSimdInt3x32Load(TF_SIMD_TRUE,TF_SIMD_TRUE,TF_SIMD_TRUE), tfSimdInt3x32Load(123, -149, 0)}, + {tfSimd3x32ILoad(10, 11, -13), tfSimd3x32ILoad(123, -149, 0), tfSimd3x32ILoad(0,0,0), tfSimd3x32ILoad(10, 11, -13)}, + {tfSimd3x32ILoad(10, 11, -13), tfSimd3x32ILoad(123, -149, 0), tfSimd3x32ILoad(TF_SIMD_TRUE,0,0), tfSimd3x32ILoad(123, 11, -13)}, + {tfSimd3x32ILoad(10, 11, -13), tfSimd3x32ILoad(123, -149, 0), tfSimd3x32ILoad(TF_SIMD_TRUE,0,TF_SIMD_TRUE), tfSimd3x32ILoad(123, 11, 0)}, + {tfSimd3x32ILoad(10, 11, -13), tfSimd3x32ILoad(123, -149, 0), tfSimd3x32ILoad(TF_SIMD_TRUE,TF_SIMD_TRUE,TF_SIMD_TRUE), tfSimd3x32ILoad(123, -149, 0)}, }; for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { - EXPECT_TRUE(tfSimd3iCmpAllEq(tfSimd3iSelect(tests[i].a, tests[i].b, tests[i].mask), tests[i].expect)); + EXPECT_TRUE(tfS32x3iCmpAllEq(tfS32x3iSelect(tests[i].a, tests[i].b, tests[i].mask), tests[i].expect)); } } -//UTEST(TF_Simd3, tfSimd3fSelect) { +//UTEST(TF_Simd3, tfS32x3FSelect) { // struct { -// TSimdFloat32x3 a; -// TSimdFloat32x3 b; -// TSimdFloat32x3 mask; -// TSimdFloat32x3 expect; +// Tsimd_f32x3_t a; +// Tsimd_f32x3_t b; +// Tsimd_f32x3_t mask; +// Tsimd_f32x3_t expect; // } tests[] = { // {tfSimdFloat4Load(10, 11, -13, 32), tfSimdFloat4Load(123, -149, 0, 12), tfSimdFloat4Load(0,0,0,0), tfSimdFloat4Load(10, 11, -13, 32)}, // {tfSimdFloat4Load(10, 11, -13, 32), tfSimdFloat4Load(123, -149, 0, 12), tfSimdFloat4Load(TF_SIMD_TRUE,0,0,0), tfSimdFloat4Load(123, 11, -13, 32)}, @@ -152,7 
+152,7 @@ UTEST(TF_Simd3, tfSimd3iSelect) { // }; // for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) // { -// //EXPECT_TRUE(tfSimd3fCmp(tfSimd3iSelect(tests[i].a, tests[i].b, tests[i].mask), tests[i].expect)); +// //EXPECT_TRUE(tfS32x3FCmp(tfS32x3iSelect(tests[i].a, tests[i].b, tests[i].mask), tests[i].expect)); // } //} diff --git a/Forge/tests/Math/TF_SimdFloat4Test.cpp b/Forge/tests/Math/TF_SimdFloat4Test.cpp deleted file mode 100644 index e7f7bad4eb..0000000000 --- a/Forge/tests/Math/TF_SimdFloat4Test.cpp +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) Contributors to the Open 3D Engine Project. - * For complete copyright and license terms please see the LICENSE at the root of this distribution. - * - * SPDX-License-Identifier: Apache-2.0 OR MIT - * - */ -#include "TF_TestMain.h" -#include "utest.h" - -#include "Forge/Math/TF_SimdFloat.h" -#include "TF_MathUtils.h" - - -UTEST(TSimdFloat4 , tfLoadSimd4F) -{ - TSimdFloat4 value = tfLoadSimd4F(123.0,12.f,45.f,12.5f); - - EXPECT_NEAR(tfGetElemSimd4F(value, 0), 123.0f, DEFAULT_EPSILON); - EXPECT_NEAR(tfGetElemSimd4F(value, 1), 12.0f, DEFAULT_EPSILON); - EXPECT_NEAR(tfGetElemSimd4F(value, 2), 45.0f, DEFAULT_EPSILON); - EXPECT_NEAR(tfGetElemSimd4F(value, 3), 12.5f, DEFAULT_EPSILON); - - EXPECT_NEAR(tfGetXSimd4F(value), 123.0f, DEFAULT_EPSILON); - EXPECT_NEAR(tfGetYSimd4F(value), 12.0f, DEFAULT_EPSILON); - EXPECT_NEAR(tfGetZSimd4F(value), 45.0f, DEFAULT_EPSILON); - EXPECT_NEAR(tfGetWSimd4F(value), 12.5f, DEFAULT_EPSILON); -} - -UTEST(TF_Matrix, tfVectorEleAdd4F) -{ - struct { - TSimdFloat4 a; - TSimdFloat4 b; - TSimdFloat4 test; - } tests[] = { - { tfLoadSimd4F(1, 0, 0, 0), tfLoadSimd4F(0, 1, 0, 0), tfLoadSimd4F(1, 1, 0, 0) }, // Original test case - { tfLoadSimd4F(2, 3, 4, 5), tfLoadSimd4F(6, 7, 8, 9), tfLoadSimd4F(8, 10, 12, 14) }, // Test with larger numbers - { tfLoadSimd4F(-1, 2, -3, 4), tfLoadSimd4F(5, -6, 7, -8), tfLoadSimd4F(4, -4, 4, -4) }, // Test with negative numbers - }; - - for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { - TSimdFloat4 result = tfVectorEleAdd4F(tests[i].a, tests[i].b); - debugPrintSimd4F(result); - EXPECT_TRUE(tfIsCloseSimd4F(result, tests[i].test, DEFAULT_EPSILON)); - } -} - - - -UTEST(TSimdFloat4, tfGetRowSimd4x4F) { - TSimdFloat4x4 mat = - tfLoadSimd4x4F(1.0f, 2.0f, 3.0f, 4.0f, - 5.0f, 6.0f, 7.0f, 8.0f, - 9.0f, 10.0f, 11.0f, 12.0f, - 13.0f, 14.0f, 15.0f, 16.0f); - - EXPECT_TRUE(tfIsCloseSimd4F(tfGetRowSimd4x4F(mat, 0), tfLoadSimd4F(1.0f, 2.0f, 3.0f, 4.0f), DEFAULT_EPSILON)); - EXPECT_TRUE(tfIsCloseSimd4F(tfGetRowSimd4x4F(mat, 1), tfLoadSimd4F(5.0f, 6.0f, 7.0f, 8.0f), DEFAULT_EPSILON)); - EXPECT_TRUE(tfIsCloseSimd4F(tfGetRowSimd4x4F(mat, 2), tfLoadSimd4F(9.0f, 10.0f, 11.0f, 12.0f), DEFAULT_EPSILON)); - EXPECT_TRUE(tfIsCloseSimd4F(tfGetRowSimd4x4F(mat, 3), tfLoadSimd4F(13.0f, 14.0f, 15.0f, 16.0f), DEFAULT_EPSILON)); -} - -//UTEST(TSimdFloat4, tfIsCloseSimd3x4F) { -// TSimdFloat4x3 mat = -// tfLoadSimd3x4F(1.0f, 2.0f, 3.0f, -// 5.0f, 6.0f, 7.0f, -// 9.0f, 10.0f, 11.0f, -// 13.0f, 14.0f, 15.0f ); -// -// EXPECT_TRUE(tfIsCloseSimd3F(tfGetRowSimd3x4F(mat, 0), tfLoadSimd4F(1.0f, 2.0f, 3.0f, 4.0f), DEFAULT_EPSILON)); -// EXPECT_TRUE(tfIsCloseSimd3F(tfGetRowSimd3x4F(mat, 1), tfLoadSimd4F(5.0f, 6.0f, 7.0f, 8.0f), DEFAULT_EPSILON)); -// EXPECT_TRUE(tfIsCloseSimd3F(tfGetRowSimd3x4F(mat, 2), tfLoadSimd4F(9.0f, 10.0f, 11.0f, 12.0f), DEFAULT_EPSILON)); -// EXPECT_TRUE(tfIsCloseSimd3F(tfGetRowSimd3x4F(mat, 3), tfLoadSimd4F(13.0f, 14.0f, 15.0f, 16.0f), DEFAULT_EPSILON)); -//} - - -#include "Forge/Mem/TF_Memory.h" 
-#include "Forge/TF_FileSystem.h" -#include "Forge/TF_Log.h" -UTEST_STATE(); -TF_UTEST_MAIN("TF_Simd4") - diff --git a/Forge/tests/Math/TF_SimdFloat4x32Test.cpp b/Forge/tests/Math/TF_SimdFloat4x32Test.cpp index 07e59c29e4..82aab45c84 100644 --- a/Forge/tests/Math/TF_SimdFloat4x32Test.cpp +++ b/Forge/tests/Math/TF_SimdFloat4x32Test.cpp @@ -14,9 +14,9 @@ UTEST(TF_Simd4, tfSimd4iCmpGt) { struct { - TSimdInt32x4 a; - TSimdInt32x4 b; - TSimdInt32x4 test; + Tsimd_i32x4_t a; + Tsimd_i32x4_t b; + Tsimd_i32x4_t test; } tests[] = { // ... existing test cases ... // Edge cases: @@ -37,9 +37,9 @@ UTEST(TF_Simd4, tfSimd4iCmpGt) UTEST(TF_Simd4, tfSimd4iCmpLt) { struct { - TSimdInt32x4 a; - TSimdInt32x4 b; - TSimdInt32x4 test; + Tsimd_i32x4_t a; + Tsimd_i32x4_t b; + Tsimd_i32x4_t test; } tests[] = { // Less than {tfSimdInt4x32Load(12, 13, 14, 15), tfSimdInt4x32Load(16, 17, 18, 19), tfSimdInt4x32Load(TF_SIMD_TRUE, TF_SIMD_TRUE, TF_SIMD_TRUE, TF_SIMD_TRUE)}, @@ -61,9 +61,9 @@ UTEST(TF_Simd4, tfSimd4iCmpLt) UTEST(TF_Simd4, tfSimd4iCmpEq) { struct { - TSimdInt32x4 a; - TSimdInt32x4 b; - TSimdInt32x4 test; + Tsimd_i32x4_t a; + Tsimd_i32x4_t b; + Tsimd_i32x4_t test; } tests[] = { // Equal elements {tfSimdInt4x32Load(12, 13, 14, 15), tfSimdInt4x32Load(12, 13, 14, 15), tfSimdInt4x32Load(TF_SIMD_TRUE, TF_SIMD_TRUE, TF_SIMD_TRUE, TF_SIMD_TRUE)}, @@ -77,58 +77,58 @@ UTEST(TF_Simd4, tfSimd4iCmpEq) }; for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { - EXPECT_TRUE(tfSimd4iCmpAllEq(tfSimd4iCmpEq(tests[i].a, tests[i].b), tests[i].test)); + EXPECT_TRUE(tfSimdCmpAllEq_i32x4(tfSimdCmpEq_i32x4(tests[i].a, tests[i].b), tests[i].test)); } } UTEST(TF_Simd4, tfSimd4fZero) { - TSimdFloat32x4 value = tfSimd4fZero(); - EXPECT_NEAR(tfSimd4fSelectIndex0(value), 0.0f, DEFAULT_EPSILON); - EXPECT_NEAR(tfSimd4fSelectIndex1(value), 0.0f, DEFAULT_EPSILON); - EXPECT_NEAR(tfSimd4fSelectIndex2(value), 0.0f, DEFAULT_EPSILON); - EXPECT_NEAR(tfSimd4fSelectIndex3(value), 0.0f, DEFAULT_EPSILON); + Tsimd_f32x4_t value = tfS32x4FZero(); + EXPECT_NEAR(tfS32x4FSelectIndex0(value), 0.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x4FSelectIndex1(value), 0.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x4FSelectIndex2(value), 0.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x4FSelectIndex3(value), 0.0f, DEFAULT_EPSILON); } UTEST(TF_Simd4, tfSimdSplat4f) { - TSimdFloat32x4 value = tfSimd4fSplat(23.f); - EXPECT_NEAR(tfSimd4fSelectIndex0(value), 23.f, DEFAULT_EPSILON); - EXPECT_NEAR(tfSimd4fSelectIndex1(value), 23.f, DEFAULT_EPSILON); - EXPECT_NEAR(tfSimd4fSelectIndex2(value), 23.f, DEFAULT_EPSILON); - EXPECT_NEAR(tfSimd4fSelectIndex3(value), 23.f, DEFAULT_EPSILON); - TSimdFloat32x4 value1 = tfSimd4fSplat(5.1f); - EXPECT_NEAR(tfSimd4fSelectIndex0(value1), 5.1f, DEFAULT_EPSILON); - EXPECT_NEAR(tfSimd4fSelectIndex1(value1), 5.1f, DEFAULT_EPSILON); - EXPECT_NEAR(tfSimd4fSelectIndex2(value1), 5.1f, DEFAULT_EPSILON); - EXPECT_NEAR(tfSimd4fSelectIndex3(value1), 5.1f, DEFAULT_EPSILON); + Tsimd_f32x4_t value = tfS32x4FSplat(23.f); + EXPECT_NEAR(tfS32x4FSelectIndex0(value), 23.f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x4FSelectIndex1(value), 23.f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x4FSelectIndex2(value), 23.f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x4FSelectIndex3(value), 23.f, DEFAULT_EPSILON); + Tsimd_f32x4_t value1 = tfS32x4FSplat(5.1f); + EXPECT_NEAR(tfS32x4FSelectIndex0(value1), 5.1f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x4FSelectIndex1(value1), 5.1f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x4FSelectIndex2(value1), 5.1f, DEFAULT_EPSILON); + 
EXPECT_NEAR(tfS32x4FSelectIndex3(value1), 5.1f, DEFAULT_EPSILON); - TSimdFloat32x4 value2 = tfSimdFloat4x32Load(5.1f, 1.0f, 2.0f, 3.0f); - EXPECT_NEAR(tfSimd4fSelectIndex0(value2), 5.1f, DEFAULT_EPSILON); - EXPECT_NEAR(tfSimd4fSelectIndex1(value2), 1.0f, DEFAULT_EPSILON); - EXPECT_NEAR(tfSimd4fSelectIndex2(value2), 2.0f, DEFAULT_EPSILON); - EXPECT_NEAR(tfSimd4fSelectIndex3(value2), 3.0f, DEFAULT_EPSILON); + Tsimd_f32x4_t value2 = tfS32x4FLoad(5.1f, 1.0f, 2.0f, 3.0f); + EXPECT_NEAR(tfS32x4FSelectIndex0(value2), 5.1f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x4FSelectIndex1(value2), 1.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x4FSelectIndex2(value2), 2.0f, DEFAULT_EPSILON); + EXPECT_NEAR(tfS32x4FSelectIndex3(value2), 3.0f, DEFAULT_EPSILON); } UTEST(TF_Simd4, tfSimd4iNot) { struct { - TSimdInt32x4 test; - TSimdInt32x4 expect; + Tsimd_i32x4_t test; + Tsimd_i32x4_t expect; } tests[] = { {tfSimdInt4x32Load(0xFFFFFFFF, 0x0000FFFF, 0xFFFF0000, 0x000000FF), tfSimdInt4x32Load(0, 0xFFFF0000, 0x0000FFFF, 0xFFFFFF00)}, }; for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { - EXPECT_TRUE(tfSimd4iCmpAllEq(tfSimd4iNot(tests[i].test), tests[i].expect)); + EXPECT_TRUE(tfSimdCmpAllEq_i32x4(tfSimdNot_i32x4(tests[i].test), tests[i].expect)); } } UTEST(TF_Simd4, tfSimd4iSelect) { struct { - TSimdInt32x4 a; - TSimdInt32x4 b; - TSimdInt32x4 mask; - TSimdInt32x4 expect; + Tsimd_i32x4_t a; + Tsimd_i32x4_t b; + Tsimd_i32x4_t mask; + Tsimd_i32x4_t expect; } tests[] = { {tfSimdInt4x32Load(10, 11, -13, 32), tfSimdInt4x32Load(123, -149, 0, 12), tfSimdInt4x32Load(0,0,0,0), tfSimdInt4x32Load(10, 11, -13, 32)}, {tfSimdInt4x32Load(10, 11, -13, 32), tfSimdInt4x32Load(123, -149, 0, 12), tfSimdInt4x32Load(TF_SIMD_TRUE,0,0,0), tfSimdInt4x32Load(123, 11, -13, 32)}, @@ -143,10 +143,10 @@ UTEST(TF_Simd4, tfSimd4iSelect) { //UTEST(TF_Simd4, tfSimd4fSelect) { // struct { -// TSimdFloat32x4 a; -// TSimdFloat32x4 b; -// TSimdFloat32x4 mask; -// TSimdFloat32x4 expect; +// Tsimd_f32x4_t a; +// Tsimd_f32x4_t b; +// Tsimd_f32x4_t mask; +// Tsimd_f32x4_t expect; // } tests[] = { // {tfSimdFloat4Load(10, 11, -13, 32), tfSimdFloat4Load(123, -149, 0, 12), tfSimdFloat4Load(0,0,0,0), tfSimdFloat4Load(10, 11, -13, 32)}, // {tfSimdFloat4Load(10, 11, -13, 32), tfSimdFloat4Load(123, -149, 0, 12), tfSimdFloat4Load(TF_SIMD_TRUE,0,0,0), tfSimdFloat4Load(123, 11, -13, 32)}, diff --git a/Forge/tests/Math/TF_SimdFloat4x4Test.cpp b/Forge/tests/Math/TF_SimdFloat4x4Test.cpp index 78fa023e9f..832cf3f22d 100644 --- a/Forge/tests/Math/TF_SimdFloat4x4Test.cpp +++ b/Forge/tests/Math/TF_SimdFloat4x4Test.cpp @@ -34,7 +34,7 @@ UTEST(TF_SimdFloat4x4, tfMatTranpose4x4F) for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { - TSimdFloat4x4 result = tfMatTranpose4x4F(tests[i].a); + TSimdFloat4x4 result = tfTransposeSimd4x4F(tests[i].a); debugPrintSimd4x4F(result); EXPECT_TRUE(tfIsCloseSimd4x4F(result, tests[i].test, DEFAULT_EPSILON)); } @@ -68,7 +68,7 @@ UTEST(TF_SimdFloat4x4, tfMatMul4x4F_4x4F) for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { - TSimdFloat4x4 result = tfMatMul4x4F_4x4F(tests[i].a, tests[i].b); + TSimdFloat4x4 result = tfMulSimd4x4F_4x4F(tests[i].a, tests[i].b); debugPrintSimd4x4F(result); EXPECT_TRUE(tfIsCloseSimd4x4F(result, tests[i].test, DEFAULT_EPSILON)); } @@ -137,7 +137,7 @@ UTEST(TF_SimdFloat4x4, tfVectorMul4x4F) }; for (size_t i = 0; i < TF_ARRAY_COUNT(tests); i++) { - TSimdFloat4 result = tfVectorMul4x4F(tests[i].a, tests[i].b); + TSimdFloat4 result = tfVectorMulSimd4x4F(tests[i].a, tests[i].b); debugPrintSimd4F(result); 
EXPECT_TRUE(tfIsCloseSimd4F(result, tests[i].test, DEFAULT_EPSILON)); }