Skip to content

Commit

Permalink
[SYCL][COMPAT] Ensure launched kernels are fully inlined (#15941)
Browse files Browse the repository at this point in the history
This PR defines & uses a custom `syclcompat::detail::apply_helper` with
`[[clang::always_inline]]` to ensure kernels are inlined.
  • Loading branch information
joeatodd authored Nov 6, 2024
1 parent cda38de commit dc181bb
Show file tree
Hide file tree
Showing 3 changed files with 117 additions and 5 deletions.
23 changes: 19 additions & 4 deletions sycl/include/syclcompat/launch_policy.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,17 @@ launch_policy(dim3, dim3, Ts...) -> launch_policy<
detail::has_type<local_mem_size, std::tuple<Ts...>>::value>;

namespace detail {
// Custom std::apply helpers to enable inlining
template <class F, class Tuple, size_t... Is>
__syclcompat_inline__ constexpr void apply_expand(F f, Tuple t,
std::index_sequence<Is...>) {
[[clang::always_inline]] f(get<Is>(t)...);
}

template <class F, class Tuple>
__syclcompat_inline__ constexpr void apply_helper(F f, Tuple t) {
apply_expand(f, t, std::make_index_sequence<std::tuple_size<Tuple>{}>{});
}

template <auto F, typename Range, typename KProps, bool HasLocalMem,
typename... Args>
Expand All @@ -211,12 +222,16 @@ struct KernelFunctor {
operator()(syclcompat::detail::range_to_item_t<Range>) const {
if constexpr (HasLocalMem) {
char *local_mem_ptr = static_cast<char *>(
_local_acc.template get_multi_ptr<sycl::access::decorated::no>().get());
std::apply(
[lmem_ptr = local_mem_ptr](auto &&...args) { F(args..., lmem_ptr); },
_local_acc.template get_multi_ptr<sycl::access::decorated::no>()
.get());
apply_helper(
[lmem_ptr = local_mem_ptr](auto &&...args) {
[[clang::always_inline]] F(args..., lmem_ptr);
},
_argument_tuple);
} else {
std::apply([](auto &&...args) { F(args...); }, _argument_tuple);
apply_helper([](auto &&...args) { [[clang::always_inline]] F(args...); },
_argument_tuple);
}
}

Expand Down
2 changes: 1 addition & 1 deletion sycl/test/syclcompat/launch/kernel_properties.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
// We need hardware which can support at least 2 sub-group sizes, since that
// hardware (presumably) supports the `intel_reqd_sub_group_size` attribute.
// REQUIRES: sg-32 && sg-16
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} -o - | FileCheck %s
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} %s -o - | FileCheck %s
#include <sycl/ext/oneapi/kernel_properties/properties.hpp>
#include <sycl/detail/core.hpp>
#include <sycl/ext/oneapi/properties/properties.hpp>
Expand Down
97 changes: 97 additions & 0 deletions sycl/test/syclcompat/launch/launch_inlining.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/***************************************************************************
*
* Copyright (C) Codeplay Software Ltd.
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM
* Exceptions. See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* SYCLcompat API
*
* launch_inlining.cpp
*
* Description:
* Ensure kernels are inlined
**************************************************************************/
// RUN: %clangxx -fsycl -fgpu-inline-threshold=0 %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} %s -o - | FileCheck %s
// We set -fgpu-inline-threshold=0 to disable heuristic inlining for the
// purposes of the test
#include <sycl/detail/core.hpp>
#include <sycl/group_barrier.hpp>
#include <syclcompat/launch.hpp>
#include <syclcompat/memory.hpp>

namespace compat_exp = syclcompat::experimental;
namespace sycl_exp = sycl::ext::oneapi::experimental;
namespace sycl_intel_exp = sycl::ext::intel::experimental;

static constexpr int LOCAL_MEM_SIZE = 1024;

// CHECK: define {{.*}}spir_kernel{{.*}}write_mem_kernel{{.*}} {
// CHECK-NOT: call {{.*}}write_mem_kernel
// CHECK: }

template <typename T> void write_mem_kernel(T *data, int num_elements) {
const int id =
sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_global_id(0);
if (id < num_elements) {
data[id] = static_cast<T>(id);
}
};

// CHECK: define {{.*}}spir_kernel{{.*}}dynamic_local_mem_typed_kernel{{.*}} {
// CHECK-NOT: call {{.*}}dynamic_local_mem_typed_kernel
// CHECK: }
template <typename T>
void dynamic_local_mem_typed_kernel(T *data, char *local_mem) {
constexpr size_t num_elements = LOCAL_MEM_SIZE / sizeof(T);
T *typed_local_mem = reinterpret_cast<T *>(local_mem);

const int id =
sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_global_id(0);
if (id < num_elements) {
typed_local_mem[id] = static_cast<T>(id);
}
sycl::group_barrier(sycl::ext::oneapi::this_work_item::get_work_group<1>());
if (id < num_elements) {
data[id] = typed_local_mem[num_elements - id - 1];
}
};

int test_write_mem() {
compat_exp::launch_policy my_dim3_config(syclcompat::dim3{32},
syclcompat::dim3{32});

const int memsize = 1024;
int *d_a = (int *)syclcompat::malloc(memsize);
compat_exp::launch<write_mem_kernel<int>>(my_dim3_config, d_a,
memsize / sizeof(int))
.wait();

syclcompat::free(d_a);
return 0;
}

int test_lmem_launch() {
int local_mem_size = LOCAL_MEM_SIZE;

size_t num_elements = local_mem_size / sizeof(int);
int *d_a = (int *)syclcompat::malloc(local_mem_size);

compat_exp::launch_policy my_config(
sycl::nd_range<1>{{256}, {256}},
compat_exp::local_mem_size(local_mem_size));

compat_exp::launch<dynamic_local_mem_typed_kernel<int>>(my_config, d_a)
.wait();

syclcompat::free(d_a);

return 0;
}

0 comments on commit dc181bb

Please sign in to comment.