Skip to content

Commit

Permalink
[ESIMD] Optimize the simd stride constructor (#12553)
Browse files Browse the repository at this point in the history
simd(base, stride) calls previously were lowered into a long sequence of
INSERT and ADD operations. That sequence is replaced with a vector
equivalent:
  vbase = broadcast base
  vstride = broadcast stride
  vstride_coef = {0, 1, 2, 3, ... N-1}
  vec_result = vbase + vstride * vstride_coef;

---------

Signed-off-by: Klochkov, Vyacheslav N <[email protected]>
  • Loading branch information
v-klochkov authored Feb 5, 2024
1 parent 20aee78 commit e9a1ace
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 41 deletions.
37 changes: 21 additions & 16 deletions sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,13 +124,23 @@ constexpr vector_type_t<T, N> make_vector(const T (&&Arr)[N]) {
}

template <class T, int N, size_t... Is>
constexpr vector_type_t<T, N> make_vector_impl(T Base, T Stride,
std::index_sequence<Is...>) {
return vector_type_t<T, N>{(T)(Base + ((T)Is) * Stride)...};
constexpr auto make_vector_impl(T Base, T Stride, std::index_sequence<Is...>) {
if constexpr (std::is_integral_v<T> && N <= 3) {
// This sequence is a bit more efficient for integral types and N <= 3.
return vector_type_t<T, N>{(T)(Base + ((T)Is) * Stride)...};
} else {
using CppT = typename element_type_traits<T>::EnclosingCppT;
CppT BaseCpp = Base;
CppT StrideCpp = Stride;
vector_type_t<CppT, N> VBase = BaseCpp;
vector_type_t<CppT, N> VStride = StrideCpp;
vector_type_t<CppT, N> VStrideCoef{(CppT)(Is)...};
vector_type_t<CppT, N> Result{VBase + VStride * VStrideCoef};
return wrapper_type_converter<T>::template to_vector<N>(Result);
}
}

template <class T, int N>
constexpr vector_type_t<T, N> make_vector(T Base, T Stride) {
template <class T, int N> constexpr auto make_vector(T Base, T Stride) {
return make_vector_impl<T, N>(Base, Stride, std::make_index_sequence<N>{});
}

Expand Down Expand Up @@ -265,18 +275,13 @@ class [[__sycl_detail__::__uses_aspects__(
/// are initialized with the arithmetic progression defined by the arguments.
/// For example, <code>simd<int, 4> x(1, 3)</code> will initialize x to the
/// <code>{1, 4, 7, 10}</code> sequence.
/// @param Val The start of the progression.
/// If Ty is a floating-point type and \p Base or \p Step is +/-inf or nan,
/// then this constructor has undefined behavior.
/// @param Base The start of the progression.
/// @param Step The step of the progression.
simd_obj_impl(Ty Val, Ty Step) noexcept {
__esimd_dbg_print(simd_obj_impl(Ty Val, Ty Step));
if constexpr (is_wrapper_elem_type_v<Ty> || !std::is_integral_v<Ty>) {
for (int i = 0; i < N; ++i) {
M_data[i] = bitcast_to_raw_type(Val);
Val = binary_op<BinOp::add, Ty>(Val, Step);
}
} else {
M_data = make_vector<Ty, N>(Val, Step);
}
simd_obj_impl(Ty Base, Ty Step) noexcept {
__esimd_dbg_print(simd_obj_impl(Ty Base, Ty Step));
M_data = make_vector<Ty, N>(Base, Step);
}

/// Broadcast constructor. Given value is type-converted to the
Expand Down
16 changes: 3 additions & 13 deletions sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,18 +246,8 @@ class run_test {
});
queue.wait_and_throw();

// Verify the base value was passed as-is
if (!are_bitwise_equal(result[0], base_value)) {
passed = false;
log::fail(TestDescriptionT(data_type, BaseVal, Step),
"Unexpected value at index 0, retrieved: ", result[0],
", expected: ", base_value);
}

// Verify the step value works as expected being passed to the fill
// constructor.
DataT expected_value = base_value;
for (size_t i = 1; i < result.size(); ++i) {
// Verify the fill constructor.
for (size_t i = 0; i < result.size(); ++i) {
if constexpr (BaseVal == init_val::nan || Step == init_val::nan) {

if (!std::isnan(result[i])) {
Expand All @@ -268,7 +258,7 @@ class run_test {
}
} else {

expected_value += step_value;
DataT expected_value = base_value + (DataT)i * step_value;
if (!are_bitwise_equal(result[i], expected_value)) {
passed = false;
log::fail(TestDescriptionT(data_type, BaseVal, Step),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@
// The test verifies that simd fill constructor has no precision differences.
// The test performs the following actions:
// - call simd with predefined base and step values
// - bitwise comparing that output[0] value is equal to base value and
// output[i] is equal to output[i -1] + step_value
// - bitwise comparing that output[i] is equal to base + i * step_value.

#include "ctor_fill.hpp"

Expand Down
14 changes: 13 additions & 1 deletion sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,10 +112,22 @@ int main(int, char **) {
}
{
const auto types = get_tested_types<tested_types::fp>();
{
const auto base_values =
ctors::get_init_values_pack<init_val::negative>();
const auto step_values =
ctors::get_init_values_pack<init_val::positive>();
passed &= for_all_combinations<ctors::run_test>(
types, sizes, contexts, base_values, step_values, queue);
}
// The test cases below have never been guaranteed to behave in any particular
// way when the base or step value is inf or nan. They may or may not
// produce the results expected by the checks in this test.
{
const auto base_values =
ctors::get_init_values_pack<init_val::neg_inf>();
const auto step_values = ctors::get_init_values_pack<init_val::max>();
const auto step_values =
ctors::get_init_values_pack<init_val::positive>();
passed &= for_all_combinations<ctors::run_test>(
types, sizes, contexts, base_values, step_values, queue);
}
Expand Down
58 changes: 49 additions & 9 deletions sycl/test/esimd/ctor_codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,22 +24,62 @@ SYCL_EXTERNAL auto foo(double i) SYCL_ESIMD_FUNCTION {
// CHECK-NEXT: }
}

// Base + step constructor, FP element type, loops expected - don't check.
SYCL_EXTERNAL auto bar() SYCL_ESIMD_FUNCTION {
simd<double, 2> val(17, 3);
return val;
// Const base + step constructor, FP element type.
// Base (1.0) and step (3.0) are compile-time constants, so the CHECK lines
// below expect the whole 64-element progression 1, 4, 7, ..., 190 to be
// emitted as one constant vector stored to the result pointer, immediately
// followed by the return.
SYCL_EXTERNAL auto double_base_step_const() SYCL_ESIMD_FUNCTION {
// CHECK: define dso_local spir_func void @_Z22double_base_step_constv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} {
return simd<double, 64>{1.0, 3.0};
// CHECK: store <64 x double> <double 1.000000e+00, double 4.000000e+00, double 7.000000e+00, double 1.000000e+01, double 1.300000e+01, double 1.600000e+01, double 1.900000e+01, double 2.200000e+01, double 2.500000e+01, double 2.800000e+01, double 3.100000e+01, double 3.400000e+01, double 3.700000e+01, double 4.000000e+01, double 4.300000e+01, double 4.600000e+01, double 4.900000e+01, double 5.200000e+01, double 5.500000e+01, double 5.800000e+01, double 6.100000e+01, double 6.400000e+01, double 6.700000e+01, double 7.000000e+01, double 7.300000e+01, double 7.600000e+01, double 7.900000e+01, double 8.200000e+01, double 8.500000e+01, double 8.800000e+01, double 9.100000e+01, double 9.400000e+01, double 9.700000e+01, double 1.000000e+02, double 1.030000e+02, double 1.060000e+02, double 1.090000e+02, double 1.120000e+02, double 1.150000e+02, double 1.180000e+02, double 1.210000e+02, double 1.240000e+02, double 1.270000e+02, double 1.300000e+02, double 1.330000e+02, double 1.360000e+02, double 1.390000e+02, double 1.420000e+02, double 1.450000e+02, double 1.480000e+02, double 1.510000e+02, double 1.540000e+02, double 1.570000e+02, double 1.600000e+02, double 1.630000e+02, double 1.660000e+02, double 1.690000e+02, double 1.720000e+02, double 1.750000e+02, double 1.780000e+02, double 1.810000e+02, double 1.840000e+02, double 1.870000e+02, double 1.900000e+02>, ptr addrspace(4) %[[RES]]
// CHECK-NEXT: ret void
}

// Variable base + step constructor, FP element type.
// Base and step are runtime arguments. The CHECK lines below expect:
//   1. base broadcast to <32 x double> (insertelement + zero-mask shuffle),
//   2. step broadcast the same way,
//   3. a single llvm.fmuladd combining the step vector, the constant
//      coefficient vector {0, 1, ..., 31} and the base vector,
//   4. the fmuladd result stored to the result pointer, then the return.
SYCL_EXTERNAL auto double_base_step_var(double base, double step) SYCL_ESIMD_FUNCTION {
// CHECK: define dso_local spir_func void @_Z20double_base_step_vardd({{.*}} %[[RES:[a-zA-Z0-9_\.]+]], double noundef %[[BASE:[a-zA-Z0-9_\.]+]], double noundef %[[STEP:[a-zA-Z0-9_\.]+]]){{.*}} {
return simd<double, 32>{base, step};
// CHECK: %[[BASE_VEC_TMP:[a-zA-Z0-9_\.]+]] = insertelement <32 x double> poison, double %[[BASE]], i64 0
// CHECK: %[[BASE_VEC:[a-zA-Z0-9_\.]+]] = shufflevector <32 x double> %[[BASE_VEC_TMP]], <32 x double> poison, <32 x i32> zeroinitializer
// CHECK: %[[STEP_VEC_TMP:[a-zA-Z0-9_\.]+]] = insertelement <32 x double> poison, double %[[STEP]], i64 0
// CHECK: %[[STEP_VEC:[a-zA-Z0-9_\.]+]] = shufflevector <32 x double> %[[STEP_VEC_TMP]], <32 x double> poison, <32 x i32> zeroinitializer
// CHECK: %[[FMA_VEC:[a-zA-Z0-9_\.]+]] = tail call noundef <32 x double> @llvm.fmuladd.v32f64(<32 x double> %[[STEP_VEC]], <32 x double> <double 0.000000e+00, double 1.000000e+00, double 2.000000e+00, double 3.000000e+00, double 4.000000e+00, double 5.000000e+00, double 6.000000e+00, double 7.000000e+00, double 8.000000e+00, double 9.000000e+00, double 1.000000e+01, double 1.100000e+01, double 1.200000e+01, double 1.300000e+01, double 1.400000e+01, double 1.500000e+01, double 1.600000e+01, double 1.700000e+01, double 1.800000e+01, double 1.900000e+01, double 2.000000e+01, double 2.100000e+01, double 2.200000e+01, double 2.300000e+01, double 2.400000e+01, double 2.500000e+01, double 2.600000e+01, double 2.700000e+01, double 2.800000e+01, double 2.900000e+01, double 3.000000e+01, double 3.100000e+01>, <32 x double> %[[BASE_VEC]])
// CHECK: store <32 x double> %[[FMA_VEC]], ptr addrspace(4) %[[RES]]
// CHECK-NEXT: ret void
}

// Base + step constructor, integer element type, no loops expected - check.
SYCL_EXTERNAL auto baz() SYCL_ESIMD_FUNCTION {
// CHECK: define dso_local spir_func void @_Z3bazv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} {
simd<int, 2> val(17, 3);
// Const base + step constructor, integer element type.
SYCL_EXTERNAL auto int_base_step_const() SYCL_ESIMD_FUNCTION {
// CHECK: define dso_local spir_func void @_Z19int_base_step_constv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} {
simd<int, 16> val(17, 3);
return val;
// CHECK: store <2 x i32> <i32 17, i32 20>, ptr addrspace(4) %[[RES]]
// CHECK: store <16 x i32> <i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 50, i32 53, i32 56, i32 59, i32 62>, ptr addrspace(4) %[[RES]]
// CHECK-NEXT: ret void
// CHECK-NEXT: }
}

// Variable base + step constructor, integer element type, N = 32.
// Base and step are runtime arguments. The CHECK lines below expect:
//   1. base broadcast to <32 x i32> (insertelement + zero-mask shuffle),
//   2. step broadcast the same way,
//   3. the step vector multiplied by the constant coefficients {0, 1, ..., 31},
//   4. the base vector added to that product,
//   5. the sum stored to the result pointer, then the return.
SYCL_EXTERNAL auto int_base_step_var(int base, int step) SYCL_ESIMD_FUNCTION {
// CHECK: define dso_local spir_func void @_Z17int_base_step_varii({{.*}} %[[RES:[a-zA-Z0-9_\.]+]], i32 noundef %[[BASE:[a-zA-Z0-9_\.]+]], i32 noundef %[[STEP:[a-zA-Z0-9_\.]+]]){{.*}} {
return simd<int, 32>{base, step};
// CHECK: %[[BASE_VEC_TMP:[a-zA-Z0-9_\.]+]] = insertelement <32 x i32> poison, i32 %[[BASE]], i64 0
// CHECK: %[[BASE_VEC:[a-zA-Z0-9_\.]+]] = shufflevector <32 x i32> %[[BASE_VEC_TMP]], <32 x i32> poison, <32 x i32> zeroinitializer
// CHECK: %[[STEP_VEC_TMP:[a-zA-Z0-9_\.]+]] = insertelement <32 x i32> poison, i32 %[[STEP]], i64 0
// CHECK: %[[STEP_VEC:[a-zA-Z0-9_\.]+]] = shufflevector <32 x i32> %[[STEP_VEC_TMP]], <32 x i32> poison, <32 x i32> zeroinitializer
// CHECK: %[[MUL_VEC:[a-zA-Z0-9_\.]+]] = mul <32 x i32> %[[STEP_VEC]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
// CHECK: %[[ADD_VEC:[a-zA-Z0-9_\.]+]] = add <32 x i32> %[[BASE_VEC]], %[[MUL_VEC]]
// CHECK: store <32 x i32> %[[ADD_VEC]], ptr addrspace(4) %[[RES]]
// CHECK-NEXT: ret void
}

// Variable base + step constructor, integer element type, small vector (N = 2).
// Unlike the N = 32 case, the CHECK lines below expect no broadcast/multiply:
// element 0 is the base inserted directly, element 1 is a scalar
// `base + step` add inserted at index 1, and the resulting <2 x i32> is stored
// to the result pointer, immediately followed by the return.
SYCL_EXTERNAL auto int_base_step_var_n2(int base, int step) SYCL_ESIMD_FUNCTION {
// CHECK: define dso_local spir_func void @_Z20int_base_step_var_n2ii({{.*}} %[[RES:[a-zA-Z0-9_\.]+]], i32 noundef %[[BASE:[a-zA-Z0-9_\.]+]], i32 noundef %[[STEP:[a-zA-Z0-9_\.]+]]){{.*}} {
return simd<int, 2>{base, step};
// CHECK: %[[BASE_VEC_TMP1:[a-zA-Z0-9_\.]+]] = insertelement <2 x i32> poison, i32 %[[BASE]], i64 0
// CHECK: %[[BASE_INC:[a-zA-Z0-9_\.]+]] = add nsw i32 %[[BASE]], %[[STEP]]
// CHECK: %[[RESULT_VEC:[a-zA-Z0-9_\.]+]] = insertelement <2 x i32> %[[BASE_VEC_TMP1]], i32 %[[BASE_INC]], i64 1
// CHECK: store <2 x i32> %[[RESULT_VEC]], ptr addrspace(4) %[[RES]]
// CHECK-NEXT: ret void
}

// Broadcast constructor, FP element type, no loops expected - check.
SYCL_EXTERNAL auto gee() SYCL_ESIMD_FUNCTION {
// CHECK: define dso_local spir_func void @_Z3geev({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} {
Expand Down

0 comments on commit e9a1ace

Please sign in to comment.