diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index f5ae19b965..fc3d0220e8 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -961,35 +961,71 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( // CUDA has no memset functions that allow setting values more than 4 bytes. UR // API lets you pass an arbitrary "pattern" to the buffer fill, which can be -// more than 4 bytes. We must break up the pattern into 1 byte values, and set -// the buffer using multiple strided calls. The first 4 patterns are set using -// cuMemsetD32Async then all subsequent 1 byte patterns are set using -// cuMemset2DAsync which is called for each pattern. +// more than 4 bytes. We must break up the pattern into 1, 2 or 4-byte values +// and set the buffer using multiple strided calls. ur_result_t commonMemSetLargePattern(CUstream Stream, uint32_t PatternSize, size_t Size, const void *pPattern, CUdeviceptr Ptr) { - // Calculate the number of patterns, stride, number of times the pattern - // needs to be applied, and the number of times the first 32 bit pattern - // needs to be applied. - auto NumberOfSteps = PatternSize / sizeof(uint8_t); - auto Pitch = NumberOfSteps * sizeof(uint8_t); - auto Height = Size / NumberOfSteps; - auto Count32 = Size / sizeof(uint32_t); - - // Get 4-byte chunk of the pattern and call cuMemsetD32Async - auto Value = *(static_cast(pPattern)); - UR_CHECK_ERROR(cuMemsetD32Async(Ptr, Value, Count32, Stream)); - for (auto step = 4u; step < NumberOfSteps; ++step) { - // take 1 byte of the pattern - Value = *(static_cast(pPattern) + step); - - // offset the pointer to the part of the buffer we want to write to - auto OffsetPtr = Ptr + (step * sizeof(uint8_t)); - - // set all of the pattern chunks - UR_CHECK_ERROR(cuMemsetD2D8Async(OffsetPtr, Pitch, Value, sizeof(uint8_t), - Height, Stream)); + // Find the largest supported word size into which the pattern can be divided + auto BackendWordSize = PatternSize % 4u == 0u ? 4u + : PatternSize % 2u == 0u ? 2u + : 1u; + + // Calculate the number of words in the pattern, the stride, and the number of + // times the pattern needs to be applied + auto NumberOfSteps = PatternSize / BackendWordSize; + auto Pitch = NumberOfSteps * BackendWordSize; + auto Height = Size / PatternSize; + + // Same implementation works for any pattern word type (uint8_t, uint16_t, + // uint32_t) + auto memsetImpl = [BackendWordSize, NumberOfSteps, Pitch, Height, Size, Ptr, + &Stream](const auto *pPatternWords, + auto &&continuousMemset, auto &&stridedMemset) { + // If the pattern is 1 word or the first word is repeated throughout, a fast + // continuous fill can be used without the need for slower strided fills + bool UseOnlyFirstValue{true}; + for (auto Step{1u}; (Step < NumberOfSteps) && UseOnlyFirstValue; ++Step) { + if (*(pPatternWords + Step) != *pPatternWords) { + UseOnlyFirstValue = false; + } + } + auto OptimizedNumberOfSteps{UseOnlyFirstValue ? 1u : NumberOfSteps}; + + // Fill the pattern in steps of BackendWordSize bytes. Use a continuous + // fill in the first step because it's faster than a strided fill. Then, + // overwrite the other values in subsequent steps. + for (auto Step{0u}; Step < OptimizedNumberOfSteps; ++Step) { + if (Step == 0) { + UR_CHECK_ERROR(continuousMemset(Ptr, *(pPatternWords), + Size / BackendWordSize, Stream)); + } else { + UR_CHECK_ERROR(stridedMemset(Ptr + Step * BackendWordSize, Pitch, + *(pPatternWords + Step), 1u, Height, + Stream)); + } + } + }; + + // Apply the implementation to the chosen pattern word type + switch (BackendWordSize) { + case 4u: { + memsetImpl(static_cast(pPattern), cuMemsetD32Async, + cuMemsetD2D32Async); + break; + } + case 2u: { + memsetImpl(static_cast(pPattern), cuMemsetD16Async, + cuMemsetD2D16Async); + break; + } + default: { + memsetImpl(static_cast(pPattern), cuMemsetD8Async, + cuMemsetD2D8Async); + break; } + } + return UR_RESULT_SUCCESS; } diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index ae362372d3..025a3f41f4 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -712,25 +712,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( static inline void memsetRemainPattern(hipStream_t Stream, uint32_t PatternSize, size_t Size, const void *pPattern, - hipDeviceptr_t Ptr) { + hipDeviceptr_t Ptr, + uint32_t StartOffset) { + // Calculate the number of times the pattern needs to be applied + auto Height = Size / PatternSize; - // Calculate the number of patterns, stride and the number of times the - // pattern needs to be applied. - auto NumberOfSteps = PatternSize / sizeof(uint8_t); - auto Pitch = NumberOfSteps * sizeof(uint8_t); - auto Height = Size / NumberOfSteps; - - for (auto step = 4u; step < NumberOfSteps; ++step) { + for (auto step = StartOffset; step < PatternSize; ++step) { // take 1 byte of the pattern auto Value = *(static_cast(pPattern) + step); // offset the pointer to the part of the buffer we want to write to - auto OffsetPtr = reinterpret_cast(reinterpret_cast(Ptr) + - (step * sizeof(uint8_t))); + auto OffsetPtr = + reinterpret_cast(reinterpret_cast(Ptr) + step); // set all of the pattern chunks - UR_CHECK_ERROR(hipMemset2DAsync(OffsetPtr, Pitch, Value, sizeof(uint8_t), - Height, Stream)); + UR_CHECK_ERROR( + hipMemset2DAsync(OffsetPtr, PatternSize, Value, 1u, Height, Stream)); } } @@ -743,11 +740,55 @@ static inline void memsetRemainPattern(hipStream_t Stream, uint32_t PatternSize, ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize, size_t Size, const void *pPattern, hipDeviceptr_t Ptr) { + // Find the largest supported word size into which the pattern can be divided + auto BackendWordSize = PatternSize % 4u == 0u ? 4u + : PatternSize % 2u == 0u ? 2u + : 1u; + + // Calculate the number of patterns + auto NumberOfSteps = PatternSize / BackendWordSize; + + // If the pattern is 1 word or the first word is repeated throughout, a fast + // continuous fill can be used without the need for slower strided fills + bool UseOnlyFirstValue{true}; + auto checkIfFirstWordRepeats = [&UseOnlyFirstValue, + NumberOfSteps](const auto *pPatternWords) { + for (auto Step{1u}; (Step < NumberOfSteps) && UseOnlyFirstValue; ++Step) { + if (*(pPatternWords + Step) != *pPatternWords) { + UseOnlyFirstValue = false; + } + } + }; - // Get 4-byte chunk of the pattern and call hipMemsetD32Async - auto Count32 = Size / sizeof(uint32_t); - auto Value = *(static_cast(pPattern)); - UR_CHECK_ERROR(hipMemsetD32Async(Ptr, Value, Count32, Stream)); + // Use a continuous fill for the first word in the pattern because it's faster + // than a strided fill. Then, overwrite the other values in subsequent steps. + switch (BackendWordSize) { + case 4u: { + auto *pPatternWords = static_cast(pPattern); + checkIfFirstWordRepeats(pPatternWords); + UR_CHECK_ERROR( + hipMemsetD32Async(Ptr, *pPatternWords, Size / BackendWordSize, Stream)); + break; + } + case 2u: { + auto *pPatternWords = static_cast(pPattern); + checkIfFirstWordRepeats(pPatternWords); + UR_CHECK_ERROR( + hipMemsetD16Async(Ptr, *pPatternWords, Size / BackendWordSize, Stream)); + break; + } + default: { + auto *pPatternWords = static_cast(pPattern); + checkIfFirstWordRepeats(pPatternWords); + UR_CHECK_ERROR( + hipMemsetD8Async(Ptr, *pPatternWords, Size / BackendWordSize, Stream)); + break; + } + } + + if (UseOnlyFirstValue) { + return UR_RESULT_SUCCESS; + } // There is a bug in ROCm prior to 6.0.0 version which causes hipMemset2D // to behave incorrectly when acting on host pinned memory. @@ -761,7 +802,7 @@ ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize, // we need to check that isManaged attribute is false. if (ptrAttribs.hostPointer && !ptrAttribs.isManaged) { const auto NumOfCopySteps = Size / PatternSize; - const auto Offset = sizeof(uint32_t); + const auto Offset = BackendWordSize; const auto LeftPatternSize = PatternSize - Offset; const auto OffsetPatternPtr = reinterpret_cast( reinterpret_cast(pPattern) + Offset); @@ -776,10 +817,12 @@ ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize, Stream)); } } else { - memsetRemainPattern(Stream, PatternSize, Size, pPattern, Ptr); + memsetRemainPattern(Stream, PatternSize, Size, pPattern, Ptr, + BackendWordSize); } #else - memsetRemainPattern(Stream, PatternSize, Size, pPattern, Ptr); + memsetRemainPattern(Stream, PatternSize, Size, pPattern, Ptr, + BackendWordSize); #endif return UR_RESULT_SUCCESS; }