-
Notifications
You must be signed in to change notification settings - Fork 744
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[UR][CUDA][HIP] Fix incorrect outputs and improve performance of queu…
…e::fill Update the UR tag to fix queue::fill for the CUDA and HIP backends, which was previously producing incorrect outputs for any pattern size other than 1, 2, or a multiple of 4 bytes. A new optimisation is also added which speeds up the fill greatly if the pattern equals to the first word repeated throughout (e.g. all zeros). Add a new e2e test to validate queue::fill outputs for any pattern size between 1 and 32 bytes. This test fails for CUDA and HIP before the UR change and passes with this PR. Other backends already worked correctly.
- Loading branch information
Showing
3 changed files
with
85 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,5 @@ | ||
# commit b0a9e2be61ad42d3447f1f246120ab25119a03e0 | ||
# Merge: fa8cc8ec a0cf2ce2 | ||
# Author: Callum Fare <[email protected]> | ||
# Date: Mon Nov 4 10:00:08 2024 +0000 | ||
# Merge pull request #2165 from aarongreig/aaron/makeUSMPoolsOptional | ||
# Make USM pools optional with a device query to report support. | ||
set(UNIFIED_RUNTIME_TAG b0a9e2be61ad42d3447f1f246120ab25119a03e0) | ||
# commit cc528219b26ab63bf5804bfa4659f9a202f6da34 | ||
# Author: Rafal Bielski <[email protected]> | ||
# Date: Thu Oct 31 23:13:30 2024 +0000 | ||
# Fix incorrect outputs and improve performance of commonMemSetLargePattern | ||
set(UNIFIED_RUNTIME_TAG rafbiels/improve-memset) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
// RUN: %{build} -o %t1.out | ||
// RUN: %{run} %t1.out | ||
|
||
/** | ||
* Test of the queue::fill interface with a range of pattern sizes and values. | ||
* | ||
* Loops over pattern sizes from 1 to MaxPatternSize bytes and calls queue::fill | ||
* with std::array<uint8_t,Size> for the pattern. Two pattern values are tested, | ||
* all zeros and value=index+42. The output is copied back to host and | ||
* validated. | ||
*/ | ||
|
||
#include <array> | ||
#include <cstdio> | ||
#include <sycl/detail/core.hpp> | ||
#include <sycl/usm.hpp> | ||
|
||
constexpr size_t MaxPatternSize{32}; // bytes | ||
constexpr size_t NumElements{10}; | ||
constexpr size_t NumRepeats{1}; | ||
constexpr bool verbose{false}; | ||
|
||
template <size_t PatternSize, bool SameValue> | ||
int test(sycl::queue &q, uint8_t firstValue = 0) { | ||
using T = std::array<uint8_t, PatternSize>; | ||
T value{}; | ||
for (unsigned int i{0}; i < PatternSize; ++i) { | ||
if constexpr (SameValue) { | ||
value[0] = firstValue; | ||
} else { | ||
value[i] = firstValue + i; | ||
} | ||
} | ||
|
||
T *dptr{sycl::malloc_device<T>(NumElements, q)}; | ||
for (size_t repeat{0}; repeat < NumRepeats; ++repeat) { | ||
q.fill(dptr, value, NumElements).wait(); | ||
} | ||
|
||
std::array<T, NumElements> host{}; | ||
q.copy<T>(dptr, host.data(), NumElements).wait(); | ||
bool pass{true}; | ||
for (unsigned int i{0}; i < NumElements; ++i) { | ||
for (unsigned int j{0}; j < PatternSize; ++j) { | ||
if (host[i][j] != value[j]) { | ||
pass = false; | ||
} | ||
} | ||
} | ||
sycl::free(dptr, q); | ||
|
||
if (!pass || verbose) { | ||
printf("Pattern size %3lu bytes, %s values (initial %3u) %s\n", PatternSize, | ||
(SameValue ? " equal" : "varied"), firstValue, | ||
(pass ? "== PASS ==" : "== FAIL ==")); | ||
} | ||
|
||
return !pass; | ||
} | ||
|
||
template <size_t Size> int testOneSize(sycl::queue &q) { | ||
return test<Size, true>(q, 0) + test<Size, false>(q, 42); | ||
} | ||
|
||
template <size_t... Sizes> | ||
int testSizes(sycl::queue &q, std::index_sequence<Sizes...>) { | ||
return (testOneSize<1u + Sizes>(q) + ...); | ||
} | ||
|
||
int main() { | ||
sycl::queue q{}; | ||
int failures = testSizes(q, std::make_index_sequence<MaxPatternSize>{}); | ||
if (failures > 0) { | ||
printf("%d / %lu tests failed\n", failures, 2 * MaxPatternSize); | ||
} else { | ||
printf("All %lu tests passed\n", 2 * MaxPatternSize); | ||
} | ||
return failures; | ||
} |