Skip to content

Commit

Permalink
[SYCL][ESIMD]Implement slm_scatter accepting compile time properties (#…
Browse files Browse the repository at this point in the history
…12591)

This implements the unified memory API for slm_scatter with local memory
  • Loading branch information
fineg74 authored Feb 7, 2024
1 parent e53e5d5 commit 9617939
Show file tree
Hide file tree
Showing 6 changed files with 539 additions and 38 deletions.
216 changes: 193 additions & 23 deletions sycl/include/sycl/ext/intel/esimd/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2685,11 +2685,13 @@ block_store(AccessorT acc, simd<T, N> vals, simd_mask<1> pred,
namespace detail {
template <typename T, int N, typename AccessorTy>
ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t<
(sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) &&
(std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
is_accessor_with_v<AccessorTy, detail::accessor_mode_cap::can_write>)>
std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
is_accessor_with_v<AccessorTy, detail::accessor_mode_cap::can_write>>
scatter_impl(AccessorTy acc, simd<T, N> vals, simd<uint32_t, N> offsets,
uint32_t glob_offset, simd_mask<N> mask) {

static_assert(sizeof(T) <= 4 && detail::isPowerOf2(N, 32),
"Unexpected type or vector length");
constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
// TODO (performance) use hardware-supported scale once BE supports it
constexpr int16_t scale = 0;
Expand Down Expand Up @@ -2820,10 +2822,9 @@ gather_impl(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
/// @return is a vector of type T and size N * NElts.
///
template <typename T, int NElts, lsc_data_size DS, int N>
__ESIMD_API __ESIMD_NS::simd<T, N * NElts>
slm_gather_impl(__ESIMD_NS::simd<uint32_t, N> offsets,
__ESIMD_NS::simd_mask<N> pred,
__ESIMD_NS::simd<T, N * NElts> pass_thru) {
__ESIMD_API simd<T, N * NElts> slm_gather_impl(simd<uint32_t, N> offsets,
simd_mask<N> pred,
simd<T, N * NElts> pass_thru) {
check_lsc_vector_size<NElts>();
check_lsc_data_size<T, DS>();
constexpr uint16_t AddressScale = 1;
Expand All @@ -2832,16 +2833,46 @@ slm_gather_impl(__ESIMD_NS::simd<uint32_t, N> offsets,
constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
using MsgT = typename lsc_expand_type<T>::type;
__ESIMD_NS::simd<MsgT, N * NElts> PassThruExpanded =
lsc_format_input<MsgT>(pass_thru);
__ESIMD_NS::simd<MsgT, N * NElts> Result =
simd<MsgT, N * NElts> PassThruExpanded = lsc_format_input<MsgT>(pass_thru);
simd<MsgT, N * NElts> Result =
__esimd_lsc_load_merge_slm<MsgT, cache_hint::none, cache_hint::none,
AddressScale, ImmOffset, EDS, LSCVS,
Transposed, N>(pred.data(), offsets.data(),
PassThruExpanded.data());
return lsc_format_ret<T>(Result);
}

/// SLM scatter implementation.
/// Supported platforms: DG2, PVC
/// VISA instruction: lsc_store.slm
///
/// Scatters (stores) elements to the Shared Local Memory (SLM).
///
/// @tparam T is the element type.
/// @tparam NElts is the number of elements to store per address.
/// @tparam DS is the data size.
/// @tparam N is the number of channels (platform dependent).
/// @param offsets is the vector of zero-based byte offsets into the SLM
/// buffer, one per channel.
/// @param vals is the vector of values to store (N * NElts elements).
/// @param pred is the vector of per-channel predicates.
///
template <typename T, int NElts, lsc_data_size DS, int N>
__ESIMD_API void slm_scatter_impl(simd<uint32_t, N> offsets,
                                  simd<T, N * NElts> vals, simd_mask<N> pred) {
  // Validate the requested vector length and data size for this element type.
  check_lsc_vector_size<NElts>();
  check_lsc_data_size<T, DS>();

  // Element type actually passed to the store intrinsic.
  using StoreT = typename lsc_expand_type<T>::type;

  // Fixed message parameters: unit address scale, no immediate offset,
  // non-transposed layout and no cache hints.
  constexpr uint16_t Scale = 1;
  constexpr int ImmediateOffset = 0;
  constexpr lsc_data_order Order = lsc_data_order::nontranspose;

  // Message parameters derived from the template arguments.
  constexpr lsc_vector_size VecSize = to_lsc_vector_size<NElts>();
  constexpr lsc_data_size DataSize =
      expand_data_size(finalize_data_size<T, DS>());

  // Convert the user values to the message element type before storing.
  simd<StoreT, N * NElts> Payload = lsc_format_input<StoreT, T>(vals);
  __esimd_lsc_store_slm<StoreT, cache_hint::none, cache_hint::none, Scale,
                        ImmediateOffset, DataSize, VecSize, Order, N>(
      pred.data(), offsets.data(), Payload.data());
}

} // namespace detail

/// @endcond ESIMD_DETAIL
Expand Down Expand Up @@ -3903,7 +3934,7 @@ slm_gather(simd<uint32_t, N / VS> byte_offsets, simd_mask<N / VS> mask,
static_assert(Alignment >= sizeof(T),
"slm_gather() requires at least element-size alignment");

if constexpr (VS > 1 || (!detail::isPowerOf2(N, 32) &&
if constexpr (VS > 1 || (!(detail::isPowerOf2(N, 32) && sizeof(T) <= 4) &&
!detail::isMaskedGatherScatterLLVMAvailable())) {
simd<T, N> PassThru; // Intentionally undefined
return detail::slm_gather_impl<T, VS, detail::lsc_data_size::default_size>(
Expand Down Expand Up @@ -4118,7 +4149,7 @@ slm_gather(OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
/// @param byte_offsets the vector of 32-bit offsets in bytes.
/// For each i, (byte_offsets[i]) must be element size aligned.
/// @param props The optional compile-time properties. Only 'alignment'
/// and cache hint properties are used.
/// property is used.
/// @return A vector of elements read.
template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
typename PropertyListT =
Expand All @@ -4141,17 +4172,156 @@ template <typename T> __ESIMD_API T slm_scalar_load(uint32_t offset) {
return Res[0];
}

/// Scatter operation over the Shared Local Memory.
/// This API has almost the same interface as the @ref accessor_scatter
/// "accessor-based scatter", except that it does not have the accessor and
/// the global offset parameters.
///
template <typename T, int N>
__ESIMD_API std::enable_if_t<(N == 1 || N == 8 || N == 16 || N == 32) &&
(sizeof(T) <= 4)>
slm_scatter(simd<uint32_t, N> offsets, simd<T, N> vals, simd_mask<N> mask = 1) {
detail::LocalAccessorMarker acc;
detail::scatter_impl<T, N>(acc, vals, offsets, 0, mask);
/// template <typename T, int N, int VS = 1,
/// typename PropertyListT = empty_properties_t>
/// void slm_scatter(simd<uint32_t, N / VS> byte_offsets,
/// simd<T, N> vals, simd_mask<N / VS> mask,
/// PropertyListT props = {}); // (slm-sc-1)
/// void slm_scatter(simd<uint32_t, N / VS> byte_offsets,
/// simd<T, N> vals, PropertyListT props = {}); // (slm-sc-2)
///
/// The next 2 functions are variations of the first 2 above (slm-sc-1,2)
/// and were added only to support simd_view instead of simd for byte_offsets.
/// template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
/// typename PropertyListT = empty_properties_t>
/// void slm_scatter(OffsetSimdViewT byte_offsets,
/// simd<T, N> vals, simd_mask<N / VS> mask,
/// PropertyListT props = {}); // (slm-sc-3)
/// void slm_scatter(OffsetSimdViewT byte_offsets,
/// simd<T, N> vals, PropertyListT props = {}); // (slm-sc-4)

/// template <typename T, int N, int VS = 1,
///           typename PropertyListT = empty_properties_t>
/// void slm_scatter(simd<uint32_t, N / VS> byte_offsets,
///                  simd<T, N> vals, simd_mask<N / VS> mask,
///                  PropertyListT props = {}); // (slm-sc-1)
/// Stores ("scatters") elements of the type 'T' to Shared Local Memory
/// locations addressed by byte offsets \p byte_offsets. Storage of any element
/// can be disabled via the input vector of predicates \p mask.
/// If mask[i] is unset, then the storage to (byte_offsets[i]) is skipped.
/// @tparam T Element type.
/// @tparam N Number of elements to write.
/// @tparam VS Vector size. It can also be read as the number of writes per each
/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
/// @param byte_offsets the vector of 32-bit offsets in bytes.
/// For each i, (byte_offsets[i]) must be element size aligned.
/// If the alignment property is not passed, then it is assumed that each
/// accessed address is aligned by element-size.
/// @param vals The vector of values to store.
/// @param mask The access mask.
/// @param props The optional compile-time properties. Only 'alignment' property
/// is used.
template <typename T, int N, int VS = 1,
          typename PropertyListT =
              ext::oneapi::experimental::detail::empty_properties_t>
__ESIMD_API std::enable_if_t<
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
            simd_mask<N / VS> mask, PropertyListT props = {}) {
  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");

  // The alignment property defaults to the element size when not supplied.
  constexpr size_t ByteAlignment =
      detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
  static_assert(ByteAlignment >= sizeof(T),
                "slm_scatter() requires at least element-size alignment");

  // The accessor-style scatter_impl path is taken only for one element per
  // address, a vector length accepted by detail::isPowerOf2(N, 32) and
  // elements of at most 4 bytes. Every other combination uses the LSC
  // lowering.
  constexpr bool UseScatterImpl =
      VS <= 1 && detail::isPowerOf2(N, 32) && sizeof(T) <= 4;

  if constexpr (UseScatterImpl) {
    detail::LocalAccessorMarker acc;
    detail::scatter_impl<T, N>(acc, vals, byte_offsets, 0, mask);
  } else {
    __ESIMD_DNS::slm_scatter_impl<T, VS, detail::lsc_data_size::default_size>(
        byte_offsets, vals, mask);
  }
}

/// template <typename T, int N, int VS = 1,
///           typename PropertyListT = empty_properties_t>
/// void slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
///                  PropertyListT props = {}); // (slm-sc-2)
/// Stores ("scatters") elements of the type 'T' to Shared Local Memory
/// locations addressed by byte offsets \p byte_offsets.
/// This overload takes no mask.
/// @tparam T Element type.
/// @tparam N Number of elements to write.
/// @tparam VS Vector size. It can also be read as the number of writes per each
/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
/// @param byte_offsets the vector of 32-bit offsets in bytes.
/// For each i, (byte_offsets[i]) must be element size aligned.
/// @param vals The vector of values to store.
/// @param props The optional compile-time properties. Only 'alignment'
/// property is used.
template <typename T, int N, int VS = 1,
          typename PropertyListT =
              ext::oneapi::experimental::detail::empty_properties_t>
__ESIMD_API std::enable_if_t<
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
            PropertyListT props = {}) {
  // Build a mask from the scalar 1 (intended to enable every address) and
  // forward to the masked overload that takes simd byte offsets.
  simd_mask<N / VS> AllEnabled = 1;
  slm_scatter<T, N, VS>(byte_offsets, vals, AllEnabled, props);
}

/// template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
///           typename PropertyListT = empty_properties_t>
/// void slm_scatter(
///     OffsetSimdViewT byte_offsets, simd<T, N> vals,
///     simd_mask<N / VS> mask, PropertyListT props = {}); // (slm-sc-3)
/// Stores ("scatters") elements of the type 'T' to Shared Local Memory
/// locations addressed by byte offsets \p byte_offsets.
/// Storage to any element's memory location can be disabled via the
/// input vector of predicates \p mask. If mask[i] is unset, then the storage to
/// (byte_offsets[i]) is skipped.
/// @tparam T Element type.
/// @tparam N Number of elements to write.
/// @tparam VS Vector size. It can also be read as the number of writes per each
/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
/// @param byte_offsets the simd_view of 32-bit offsets in bytes.
/// For each i, (byte_offsets[i]) must be element size aligned.
/// If the alignment property is not passed, then it is assumed that each
/// accessed address is aligned by element-size.
/// @param vals The vector of values to store.
/// @param mask The access mask.
/// @param props The optional compile-time properties. Only 'alignment'
/// property is used.
template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
          typename PropertyListT =
              ext::oneapi::experimental::detail::empty_properties_t>
__ESIMD_API std::enable_if_t<
    detail::is_simd_view_type_v<OffsetSimdViewT> &&
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
slm_scatter(OffsetSimdViewT byte_offsets, simd<T, N> vals,
            simd_mask<N / VS> mask, PropertyListT props = {}) {
  // Read the offsets out of the view, then forward to the overload that
  // takes simd byte offsets and a mask.
  const auto OffsetsVec = byte_offsets.read();
  slm_scatter<T, N, VS>(OffsetsVec, vals, mask, props);
}

/// template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
///           typename PropertyListT = empty_properties_t>
/// void slm_scatter(
///     OffsetSimdViewT byte_offsets, simd<T, N> vals,
///     PropertyListT props = {}); // (slm-sc-4)
/// Stores ("scatters") elements of the type 'T' to Shared Local Memory
/// locations addressed by byte offsets \p byte_offsets.
/// This overload takes no mask and returns nothing.
/// @tparam T Element type.
/// @tparam N Number of elements to write.
/// @tparam VS Vector size. It can also be read as the number of writes per each
/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
/// @param byte_offsets the simd_view of 32-bit offsets in bytes.
/// For each i, (byte_offsets[i]) must be element size aligned.
/// @param vals The vector of values to store.
/// @param props The optional compile-time properties. Only 'alignment'
/// property is used.
template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
          typename PropertyListT =
              ext::oneapi::experimental::detail::empty_properties_t>
__ESIMD_API std::enable_if_t<
    detail::is_simd_view_type_v<OffsetSimdViewT> &&
    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
slm_scatter(OffsetSimdViewT byte_offsets, simd<T, N> vals,
            PropertyListT props = {}) {
  // Read the offsets out of the view and forward to the mask-less overload
  // that takes simd byte offsets. The function is void, so the call is a plain
  // statement rather than a `return` of a void expression.
  slm_scatter<T, N, VS>(byte_offsets.read(), vals, props);
}

/// Store a scalar value into the Shared Local Memory.
Expand Down
16 changes: 1 addition & 15 deletions sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1424,21 +1424,7 @@ template <typename T, int NElts = 1,
__ESIMD_API void lsc_slm_scatter(__ESIMD_NS::simd<uint32_t, N> offsets,
__ESIMD_NS::simd<T, N * NElts> vals,
__ESIMD_NS::simd_mask<N> pred = 1) {
detail::check_lsc_vector_size<NElts>();
detail::check_lsc_data_size<T, DS>();
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS =
detail::expand_data_size(detail::finalize_data_size<T, DS>());
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::nontranspose;
using MsgT = typename detail::lsc_expand_type<T>::type;
using CstT = __ESIMD_DNS::uint_type_t<sizeof(T)>;
__ESIMD_NS::simd<MsgT, N * NElts> Tmp = vals.template bit_cast_view<CstT>();
__esimd_lsc_store_slm<MsgT, cache_hint::none, cache_hint::none, _AddressScale,
_ImmOffset, _DS, _VS, _Transposed, N>(
pred.data(), offsets.data(), Tmp.data());
__ESIMD_DNS::slm_scatter_impl<T, NElts, DS>(offsets, vals, pred);
}

/// Transposed SLM scatter with 1 channel.
Expand Down
Loading

0 comments on commit 9617939

Please sign in to comment.