Skip to content

Commit

Permalink
[UR][CUDA][HIP] Fix incorrect outputs and improve performance of queu…
Browse files Browse the repository at this point in the history
…e::fill

Update the UR tag to fix queue::fill for the CUDA and HIP backends, which was
previously producing incorrect outputs for any pattern size other than 1, 2,
or a multiple of 4 bytes. A new optimisation is also added which speeds up
the fill greatly if the pattern equals to the first word repeated throughout
(e.g. all zeros).

Add a new e2e test to validate queue::fill outputs for any pattern size
between 1 and 32 bytes. This test fails for CUDA and HIP before the UR change
and passes with this PR. Other backends already worked correctly.
  • Loading branch information
rafbiels committed Nov 5, 2024
1 parent 4452bc1 commit fc676e8
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 8 deletions.
2 changes: 1 addition & 1 deletion sycl/cmake/modules/FetchUnifiedRuntime.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ if(SYCL_UR_USE_FETCH_CONTENT)
CACHE PATH "Path to external '${name}' adapter source dir" FORCE)
endfunction()

set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
set(UNIFIED_RUNTIME_REPO "https://github.com/rafbiels/unified-runtime.git")
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake)

set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES")
Expand Down
12 changes: 5 additions & 7 deletions sycl/cmake/modules/UnifiedRuntimeTag.cmake
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
# commit b0a9e2be61ad42d3447f1f246120ab25119a03e0
# Merge: fa8cc8ec a0cf2ce2
# Author: Callum Fare <[email protected]>
# Date: Mon Nov 4 10:00:08 2024 +0000
# Merge pull request #2165 from aarongreig/aaron/makeUSMPoolsOptional
# Make USM pools optional with a device query to report support.
set(UNIFIED_RUNTIME_TAG b0a9e2be61ad42d3447f1f246120ab25119a03e0)
# commit cc528219b26ab63bf5804bfa4659f9a202f6da34
# Author: Rafal Bielski <[email protected]>
# Date: Thu Oct 31 23:13:30 2024 +0000
# Fix incorrect outputs and improve performance of commonMemSetLargePattern
set(UNIFIED_RUNTIME_TAG rafbiels/improve-memset)
79 changes: 79 additions & 0 deletions sycl/test-e2e/USM/fill_any_size.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
// RUN: %{build} -o %t1.out
// RUN: %{run} %t1.out

/**
* Test of the queue::fill interface with a range of pattern sizes and values.
*
* Loops over pattern sizes from 1 to MaxPatternSize bytes and calls queue::fill
* with std::array<uint8_t,Size> for the pattern. Two pattern values are tested,
* all zeros and value=index+42. The output is copied back to host and
* validated.
*/

#include <array>
#include <cstdio>
#include <sycl/detail/core.hpp>
#include <sycl/usm.hpp>

constexpr size_t MaxPatternSize{32}; // bytes
constexpr size_t NumElements{10};
constexpr size_t NumRepeats{1};
constexpr bool verbose{false};

template <size_t PatternSize, bool SameValue>
int test(sycl::queue &q, uint8_t firstValue = 0) {
using T = std::array<uint8_t, PatternSize>;
T value{};
for (unsigned int i{0}; i < PatternSize; ++i) {
if constexpr (SameValue) {
value[0] = firstValue;
} else {
value[i] = firstValue + i;
}
}

T *dptr{sycl::malloc_device<T>(NumElements, q)};
for (size_t repeat{0}; repeat < NumRepeats; ++repeat) {
q.fill(dptr, value, NumElements).wait();
}

std::array<T, NumElements> host{};
q.copy<T>(dptr, host.data(), NumElements).wait();
bool pass{true};
for (unsigned int i{0}; i < NumElements; ++i) {
for (unsigned int j{0}; j < PatternSize; ++j) {
if (host[i][j] != value[j]) {
pass = false;
}
}
}
sycl::free(dptr, q);

if (!pass || verbose) {
printf("Pattern size %3lu bytes, %s values (initial %3u) %s\n", PatternSize,
(SameValue ? " equal" : "varied"), firstValue,
(pass ? "== PASS ==" : "== FAIL =="));
}

return !pass;
}

template <size_t Size> int testOneSize(sycl::queue &q) {
return test<Size, true>(q, 0) + test<Size, false>(q, 42);
}

template <size_t... Sizes>
int testSizes(sycl::queue &q, std::index_sequence<Sizes...>) {
return (testOneSize<1u + Sizes>(q) + ...);
}

int main() {
sycl::queue q{};
int failures = testSizes(q, std::make_index_sequence<MaxPatternSize>{});
if (failures > 0) {
printf("%d / %lu tests failed\n", failures, 2 * MaxPatternSize);
} else {
printf("All %lu tests passed\n", 2 * MaxPatternSize);
}
return failures;
}

0 comments on commit fc676e8

Please sign in to comment.