From e9a1ace965cbdf6e27e4cd468a1d157c2db640b4 Mon Sep 17 00:00:00 2001 From: Vyacheslav Klochkov Date: Mon, 5 Feb 2024 17:46:49 -0600 Subject: [PATCH] [ESIMD] Optimize the simd stride constructor (#12553) simd(base, stride) calls previously were lowered into a long sequence of INSERT and ADD operations. That sequence is replaced with a vector equivalent: vbase = broadcast base vstride = broadcast stride vstride_coef = {0, 1, 2, 3, ... N-1} vec_result = vbase + vstride * vstride_coef; --------- Signed-off-by: Klochkov, Vyacheslav N --- .../ext/intel/esimd/detail/simd_obj_impl.hpp | 37 +++++++----- .../ESIMD/api/functional/ctors/ctor_fill.hpp | 16 +---- .../ctors/ctor_fill_accuracy_fp.cpp | 3 +- .../api/functional/ctors/ctor_fill_core.cpp | 14 ++++- sycl/test/esimd/ctor_codegen.cpp | 58 ++++++++++++++++--- 5 files changed, 87 insertions(+), 41 deletions(-) diff --git a/sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp b/sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp index ae6137c01fe12..5efd958789f72 100644 --- a/sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp +++ b/sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp @@ -124,13 +124,23 @@ constexpr vector_type_t make_vector(const T (&&Arr)[N]) { } template -constexpr vector_type_t make_vector_impl(T Base, T Stride, - std::index_sequence) { - return vector_type_t{(T)(Base + ((T)Is) * Stride)...}; +constexpr auto make_vector_impl(T Base, T Stride, std::index_sequence) { + if constexpr (std::is_integral_v && N <= 3) { + // This sequence is a bit more efficient for integral types and N <= 3. 
+ return vector_type_t{(T)(Base + ((T)Is) * Stride)...}; + } else { + using CppT = typename element_type_traits::EnclosingCppT; + CppT BaseCpp = Base; + CppT StrideCpp = Stride; + vector_type_t VBase = BaseCpp; + vector_type_t VStride = StrideCpp; + vector_type_t VStrideCoef{(CppT)(Is)...}; + vector_type_t Result{VBase + VStride * VStrideCoef}; + return wrapper_type_converter::template to_vector(Result); + } } -template -constexpr vector_type_t make_vector(T Base, T Stride) { +template constexpr auto make_vector(T Base, T Stride) { return make_vector_impl(Base, Stride, std::make_index_sequence{}); } @@ -265,18 +275,13 @@ class [[__sycl_detail__::__uses_aspects__( /// are initialized with the arithmetic progression defined by the arguments. /// For example, simd x(1, 3) will initialize x to the /// {1, 4, 7, 10} sequence. - /// @param Val The start of the progression. + /// If Ty is a floating-point type and \p Base or \p Step is +/-inf or nan, + /// then this constructor has undefined behavior. + /// @param Base The start of the progression. /// @param Step The step of the progression. - simd_obj_impl(Ty Val, Ty Step) noexcept { - __esimd_dbg_print(simd_obj_impl(Ty Val, Ty Step)); - if constexpr (is_wrapper_elem_type_v || !std::is_integral_v) { - for (int i = 0; i < N; ++i) { - M_data[i] = bitcast_to_raw_type(Val); - Val = binary_op(Val, Step); - } - } else { - M_data = make_vector(Val, Step); - } + simd_obj_impl(Ty Base, Ty Step) noexcept { + __esimd_dbg_print(simd_obj_impl(Ty Base, Ty Step)); + M_data = make_vector(Base, Step); } /// Broadcast constructor. 
Given value is type-converted to the diff --git a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill.hpp b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill.hpp index 8ae03e7a9a577..b3c81ec144530 100644 --- a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill.hpp +++ b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill.hpp @@ -246,18 +246,8 @@ class run_test { }); queue.wait_and_throw(); - // Verify the base value was passed as-is - if (!are_bitwise_equal(result[0], base_value)) { - passed = false; - log::fail(TestDescriptionT(data_type, BaseVal, Step), - "Unexpected value at index 0, retrieved: ", result[0], - ", expected: ", base_value); - } - - // Verify the step value works as expected being passed to the fill - // constructor. - DataT expected_value = base_value; - for (size_t i = 1; i < result.size(); ++i) { + // Verify every element produced by the fill constructor. + for (size_t i = 0; i < result.size(); ++i) { if constexpr (BaseVal == init_val::nan || Step == init_val::nan) { if (!std::isnan(result[i])) { @@ -268,7 +258,7 @@ class run_test { } } else { - expected_value += step_value; + DataT expected_value = base_value + (DataT)i * step_value; if (!are_bitwise_equal(result[i], expected_value)) { passed = false; log::fail(TestDescriptionT(data_type, BaseVal, Step), diff --git a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_accuracy_fp.cpp b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_accuracy_fp.cpp index 9fcac2c603c30..9d34849e106f8 100644 --- a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_accuracy_fp.cpp +++ b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_accuracy_fp.cpp @@ -15,8 +15,7 @@ // The test verifies that simd fill constructor has no precision differences. // The test do the following actions: // - call simd with predefined base and step values -// - bitwise comparing that output[0] value is equal to base value and -// output[i] is equal to output[i -1] + step_value +// - bitwise comparing that output[i] is equal to base + i * step_value. 
#include "ctor_fill.hpp" diff --git a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_core.cpp b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_core.cpp index 7d2fc4f7592d8..f8417718a0962 100644 --- a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_core.cpp +++ b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_core.cpp @@ -112,10 +112,22 @@ int main(int, char **) { } { const auto types = get_tested_types(); + { + const auto base_values = + ctors::get_init_values_pack(); + const auto step_values = + ctors::get_init_values_pack(); + passed &= for_all_combinations( + types, sizes, contexts, base_values, step_values, queue); + } + // The test cases below have never been guaranteed to work in any + // particular way with base and step values set to inf or nan. They may + // or may not work as expected by the checks in this test. { const auto base_values = ctors::get_init_values_pack(); - const auto step_values = ctors::get_init_values_pack(); + const auto step_values = + ctors::get_init_values_pack(); passed &= for_all_combinations( types, sizes, contexts, base_values, step_values, queue); } diff --git a/sycl/test/esimd/ctor_codegen.cpp b/sycl/test/esimd/ctor_codegen.cpp index cf86da2a5aad5..9e75e76fdf972 100644 --- a/sycl/test/esimd/ctor_codegen.cpp +++ b/sycl/test/esimd/ctor_codegen.cpp @@ -24,22 +24,62 @@ SYCL_EXTERNAL auto foo(double i) SYCL_ESIMD_FUNCTION { // CHECK-NEXT: } } -// Base + step constructor, FP element type, loops exected - don't check. -SYCL_EXTERNAL auto bar() SYCL_ESIMD_FUNCTION { - simd val(17, 3); - return val; +// Const base + step constructor, FP element type. +SYCL_EXTERNAL auto double_base_step_const() SYCL_ESIMD_FUNCTION { + // CHECK: define dso_local spir_func void @_Z22double_base_step_constv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { + return simd{1.0, 3.0}; + // CHECK: store <64 x double> , ptr addrspace(4) %[[RES]] + // CHECK-NEXT: ret void +} + +// Variable base + step constructor, FP element type. 
+SYCL_EXTERNAL auto double_base_step_var(double base, double step) SYCL_ESIMD_FUNCTION { + // CHECK: define dso_local spir_func void @_Z20double_base_step_vardd({{.*}} %[[RES:[a-zA-Z0-9_\.]+]], double noundef %[[BASE:[a-zA-Z0-9_\.]+]], double noundef %[[STEP:[a-zA-Z0-9_\.]+]]){{.*}} { + return simd{base, step}; + // CHECK: %[[BASE_VEC_TMP:[a-zA-Z0-9_\.]+]] = insertelement <32 x double> poison, double %[[BASE]], i64 0 + // CHECK: %[[BASE_VEC:[a-zA-Z0-9_\.]+]] = shufflevector <32 x double> %[[BASE_VEC_TMP]], <32 x double> poison, <32 x i32> zeroinitializer + // CHECK: %[[STEP_VEC_TMP:[a-zA-Z0-9_\.]+]] = insertelement <32 x double> poison, double %[[STEP]], i64 0 + // CHECK: %[[STEP_VEC:[a-zA-Z0-9_\.]+]] = shufflevector <32 x double> %[[STEP_VEC_TMP]], <32 x double> poison, <32 x i32> zeroinitializer + // CHECK: %[[FMA_VEC:[a-zA-Z0-9_\.]+]] = tail call noundef <32 x double> @llvm.fmuladd.v32f64(<32 x double> %[[STEP_VEC]], <32 x double> , <32 x double> %[[BASE_VEC]]) + // CHECK: store <32 x double> %[[FMA_VEC]], ptr addrspace(4) %[[RES]] + // CHECK-NEXT: ret void } -// Base + step constructor, integer element type, no loops exected - check. -SYCL_EXTERNAL auto baz() SYCL_ESIMD_FUNCTION { - // CHECK: define dso_local spir_func void @_Z3bazv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { - simd val(17, 3); +// Const base + step constructor, integer element type. +SYCL_EXTERNAL auto int_base_step_const() SYCL_ESIMD_FUNCTION { + // CHECK: define dso_local spir_func void @_Z19int_base_step_constv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { + simd val(17, 3); return val; - // CHECK: store <2 x i32> , ptr addrspace(4) %[[RES]] + // CHECK: store <16 x i32> , ptr addrspace(4) %[[RES]] // CHECK-NEXT: ret void // CHECK-NEXT: } } +// Variable base + step constructor, integer element type. 
+SYCL_EXTERNAL auto int_base_step_var(int base, int step) SYCL_ESIMD_FUNCTION { + // CHECK: define dso_local spir_func void @_Z17int_base_step_varii({{.*}} %[[RES:[a-zA-Z0-9_\.]+]], i32 noundef %[[BASE:[a-zA-Z0-9_\.]+]], i32 noundef %[[STEP:[a-zA-Z0-9_\.]+]]){{.*}} { + return simd{base, step}; + // CHECK: %[[BASE_VEC_TMP:[a-zA-Z0-9_\.]+]] = insertelement <32 x i32> poison, i32 %[[BASE]], i64 0 + // CHECK: %[[BASE_VEC:[a-zA-Z0-9_\.]+]] = shufflevector <32 x i32> %[[BASE_VEC_TMP]], <32 x i32> poison, <32 x i32> zeroinitializer + // CHECK: %[[STEP_VEC_TMP:[a-zA-Z0-9_\.]+]] = insertelement <32 x i32> poison, i32 %[[STEP]], i64 0 + // CHECK: %[[STEP_VEC:[a-zA-Z0-9_\.]+]] = shufflevector <32 x i32> %[[STEP_VEC_TMP]], <32 x i32> poison, <32 x i32> zeroinitializer + // CHECK: %[[MUL_VEC:[a-zA-Z0-9_\.]+]] = mul <32 x i32> %[[STEP_VEC]], + // CHECK: %[[ADD_VEC:[a-zA-Z0-9_\.]+]] = add <32 x i32> %[[BASE_VEC]], %[[MUL_VEC]] + // CHECK: store <32 x i32> %[[ADD_VEC]], ptr addrspace(4) %[[RES]] + // CHECK-NEXT: ret void +} + +// Variable base + step constructor, integer element type, N = 2. +SYCL_EXTERNAL auto int_base_step_var_n2(int base, int step) SYCL_ESIMD_FUNCTION { + // CHECK: define dso_local spir_func void @_Z20int_base_step_var_n2ii({{.*}} %[[RES:[a-zA-Z0-9_\.]+]], i32 noundef %[[BASE:[a-zA-Z0-9_\.]+]], i32 noundef %[[STEP:[a-zA-Z0-9_\.]+]]){{.*}} { + return simd{base, step}; + // CHECK: %[[BASE_VEC_TMP1:[a-zA-Z0-9_\.]+]] = insertelement <2 x i32> poison, i32 %[[BASE]], i64 0 + // CHECK: %[[BASE_INC:[a-zA-Z0-9_\.]+]] = add nsw i32 %[[BASE]], %[[STEP]] + // CHECK: %[[RESULT_VEC:[a-zA-Z0-9_\.]+]] = insertelement <2 x i32> %[[BASE_VEC_TMP1]], i32 %[[BASE_INC]], i64 1 + // CHECK: store <2 x i32> %[[RESULT_VEC]], ptr addrspace(4) %[[RES]] + // CHECK-NEXT: ret void +} + // Broadcast constructor, FP element type, no loops exected - check. SYCL_EXTERNAL auto gee() SYCL_ESIMD_FUNCTION { // CHECK: define dso_local spir_func void @_Z3geev({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} {