Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix incorrect outputs and improve performance of commonMemSetLargePattern #2273

Merged
merged 1 commit into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 61 additions & 25 deletions source/adapters/cuda/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -961,35 +961,71 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect(

// CUDA has no memset functions that allow setting values more than 4 bytes. UR
// API lets you pass an arbitrary "pattern" to the buffer fill, which can be
// more than 4 bytes. We must break up the pattern into 1, 2 or 4-byte values
// and set the buffer using multiple strided calls.
//
// Parameters:
//   Stream      - stream on which every fill operation is enqueued.
//   PatternSize - size of the pattern in bytes.
//   Size        - number of bytes to fill. Only Size / PatternSize whole
//                 repetitions of the pattern are written by the strided
//                 fills. NOTE(review): callers presumably guarantee that Size
//                 is a non-zero multiple of PatternSize and that
//                 PatternSize != 0 (it is used as a divisor) - confirm.
//   pPattern    - pattern bytes; dereferenced on the host in this function.
//   Ptr         - device address of the first byte to fill.
//
// Returns UR_RESULT_SUCCESS once all fills have been enqueued. Failures of
// the individual CUDA calls are routed through UR_CHECK_ERROR.
ur_result_t commonMemSetLargePattern(CUstream Stream, uint32_t PatternSize,
                                     size_t Size, const void *pPattern,
                                     CUdeviceptr Ptr) {
  // Find the largest supported word size into which the pattern can be divided
  auto BackendWordSize = PatternSize % 4u == 0u   ? 4u
                         : PatternSize % 2u == 0u ? 2u
                                                  : 1u;

  // Calculate the number of words in the pattern, the stride, and the number of
  // times the pattern needs to be applied. BackendWordSize divides PatternSize
  // exactly, so Pitch is equal to PatternSize in bytes.
  auto NumberOfSteps = PatternSize / BackendWordSize;
  auto Pitch = NumberOfSteps * BackendWordSize;
  auto Height = Size / PatternSize;

  // Same implementation works for any pattern word type (uint8_t, uint16_t,
  // uint32_t). continuousMemset / stridedMemset are the CUDA 1D / 2D memset
  // entry points matching the width of *pPatternWords.
  auto memsetImpl = [BackendWordSize, NumberOfSteps, Pitch, Height, Size, Ptr,
                     &Stream](const auto *pPatternWords,
                              auto &&continuousMemset, auto &&stridedMemset) {
    // If the pattern is 1 word or the first word is repeated throughout, a fast
    // continuous fill can be used without the need for slower strided fills
    bool UseOnlyFirstValue{true};
    for (auto Step{1u}; (Step < NumberOfSteps) && UseOnlyFirstValue; ++Step) {
      if (*(pPatternWords + Step) != *pPatternWords) {
        UseOnlyFirstValue = false;
      }
    }
    auto OptimizedNumberOfSteps{UseOnlyFirstValue ? 1u : NumberOfSteps};

    // Fill the pattern in steps of BackendWordSize bytes. Use a continuous
    // fill in the first step because it's faster than a strided fill. Then,
    // overwrite the other values in subsequent steps.
    for (auto Step{0u}; Step < OptimizedNumberOfSteps; ++Step) {
      if (Step == 0) {
        // Writes the first pattern word into every word of the destination.
        UR_CHECK_ERROR(continuousMemset(Ptr, *(pPatternWords),
                                        Size / BackendWordSize, Stream));
      } else {
        // One word wide, Height rows, rows Pitch bytes apart: overwrites word
        // number Step of every repetition of the pattern.
        UR_CHECK_ERROR(stridedMemset(Ptr + Step * BackendWordSize, Pitch,
                                     *(pPatternWords + Step), 1u, Height,
                                     Stream));
      }
    }
  };

  // Apply the implementation to the chosen pattern word type
  switch (BackendWordSize) {
  case 4u: {
    memsetImpl(static_cast<const uint32_t *>(pPattern), cuMemsetD32Async,
               cuMemsetD2D32Async);
    break;
  }
  case 2u: {
    memsetImpl(static_cast<const uint16_t *>(pPattern), cuMemsetD16Async,
               cuMemsetD2D16Async);
    break;
  }
  default: {
    memsetImpl(static_cast<const uint8_t *>(pPattern), cuMemsetD8Async,
               cuMemsetD2D8Async);
    break;
  }
  }

  return UR_RESULT_SUCCESS;
}

Expand Down
81 changes: 62 additions & 19 deletions source/adapters/hip/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -712,25 +712,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect(

static inline void memsetRemainPattern(hipStream_t Stream, uint32_t PatternSize,
size_t Size, const void *pPattern,
hipDeviceptr_t Ptr) {
hipDeviceptr_t Ptr,
uint32_t StartOffset) {
// Calculate the number of times the pattern needs to be applied
auto Height = Size / PatternSize;

// Calculate the number of patterns, stride and the number of times the
// pattern needs to be applied.
auto NumberOfSteps = PatternSize / sizeof(uint8_t);
auto Pitch = NumberOfSteps * sizeof(uint8_t);
auto Height = Size / NumberOfSteps;

for (auto step = 4u; step < NumberOfSteps; ++step) {
for (auto step = StartOffset; step < PatternSize; ++step) {
// take 1 byte of the pattern
auto Value = *(static_cast<const uint8_t *>(pPattern) + step);

// offset the pointer to the part of the buffer we want to write to
auto OffsetPtr = reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(Ptr) +
(step * sizeof(uint8_t)));
auto OffsetPtr =
reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(Ptr) + step);

// set all of the pattern chunks
UR_CHECK_ERROR(hipMemset2DAsync(OffsetPtr, Pitch, Value, sizeof(uint8_t),
Height, Stream));
UR_CHECK_ERROR(
hipMemset2DAsync(OffsetPtr, PatternSize, Value, 1u, Height, Stream));
}
}

Expand All @@ -743,11 +740,55 @@ static inline void memsetRemainPattern(hipStream_t Stream, uint32_t PatternSize,
ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize,
size_t Size, const void *pPattern,
hipDeviceptr_t Ptr) {
// Find the largest supported word size into which the pattern can be divided
auto BackendWordSize = PatternSize % 4u == 0u ? 4u
: PatternSize % 2u == 0u ? 2u
: 1u;

// Calculate the number of patterns
auto NumberOfSteps = PatternSize / BackendWordSize;

// If the pattern is 1 word or the first word is repeated throughout, a fast
// continuous fill can be used without the need for slower strided fills
bool UseOnlyFirstValue{true};
auto checkIfFirstWordRepeats = [&UseOnlyFirstValue,
NumberOfSteps](const auto *pPatternWords) {
for (auto Step{1u}; (Step < NumberOfSteps) && UseOnlyFirstValue; ++Step) {
if (*(pPatternWords + Step) != *pPatternWords) {
UseOnlyFirstValue = false;
}
}
};

// Get 4-byte chunk of the pattern and call hipMemsetD32Async
auto Count32 = Size / sizeof(uint32_t);
auto Value = *(static_cast<const uint32_t *>(pPattern));
UR_CHECK_ERROR(hipMemsetD32Async(Ptr, Value, Count32, Stream));
// Use a continuous fill for the first word in the pattern because it's faster
// than a strided fill. Then, overwrite the other values in subsequent steps.
switch (BackendWordSize) {
case 4u: {
auto *pPatternWords = static_cast<const uint32_t *>(pPattern);
checkIfFirstWordRepeats(pPatternWords);
UR_CHECK_ERROR(
hipMemsetD32Async(Ptr, *pPatternWords, Size / BackendWordSize, Stream));
break;
}
case 2u: {
auto *pPatternWords = static_cast<const uint16_t *>(pPattern);
checkIfFirstWordRepeats(pPatternWords);
UR_CHECK_ERROR(
hipMemsetD16Async(Ptr, *pPatternWords, Size / BackendWordSize, Stream));
break;
}
default: {
auto *pPatternWords = static_cast<const uint8_t *>(pPattern);
checkIfFirstWordRepeats(pPatternWords);
UR_CHECK_ERROR(
hipMemsetD8Async(Ptr, *pPatternWords, Size / BackendWordSize, Stream));
break;
}
}

if (UseOnlyFirstValue) {
return UR_RESULT_SUCCESS;
}

// There is a bug in ROCm prior to 6.0.0 version which causes hipMemset2D
// to behave incorrectly when acting on host pinned memory.
Expand All @@ -761,7 +802,7 @@ ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize,
// we need to check that isManaged attribute is false.
if (ptrAttribs.hostPointer && !ptrAttribs.isManaged) {
const auto NumOfCopySteps = Size / PatternSize;
const auto Offset = sizeof(uint32_t);
const auto Offset = BackendWordSize;
const auto LeftPatternSize = PatternSize - Offset;
const auto OffsetPatternPtr = reinterpret_cast<const void *>(
reinterpret_cast<const uint8_t *>(pPattern) + Offset);
Expand All @@ -776,10 +817,12 @@ ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize,
Stream));
}
} else {
memsetRemainPattern(Stream, PatternSize, Size, pPattern, Ptr);
memsetRemainPattern(Stream, PatternSize, Size, pPattern, Ptr,
BackendWordSize);
}
#else
memsetRemainPattern(Stream, PatternSize, Size, pPattern, Ptr);
memsetRemainPattern(Stream, PatternSize, Size, pPattern, Ptr,
BackendWordSize);
#endif
return UR_RESULT_SUCCESS;
}
Expand Down
Loading