-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge remote-tracking branch 'upstream/main'
- Loading branch information
Showing
49 changed files
with
605 additions
and
736 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,313 @@ | ||
#include <cassert> | ||
#include <cstring> | ||
#include <iostream> | ||
#include <vector> | ||
|
||
#include "hip/hip_runtime.h" | ||
|
||
// Tests for the functional correctness of the lowering of memmove in device | ||
// code, including moves with overlapping source and destination ranges. Various | ||
// memmoves are performed on device side and the result of each is compared to | ||
// the corresponding operation on the host. | ||
// Global, shared, and stack memory is tested. | ||
|
||
#define VERBOSE 0 | ||
|
||
#define CHKHIP(r) \ | ||
if (r != hipSuccess) { \ | ||
std::cerr << hipGetErrorString(r) << std::endl; \ | ||
abort(); \ | ||
} | ||
|
||
using item_type = uint8_t; | ||
|
||
// Maximal number of bytes to copy with a memmove call, used to allocate | ||
// buffers. | ||
#define MAX_BYTES_PER_THREAD 2048 | ||
|
||
// LDS is small, so run only smaller tests there. | ||
#define MAX_BYTES_PER_THREAD_SHARED 128 | ||
|
||
// Number of threads that move started in parallel. | ||
#define NUM_MOVE_THREADS (2 * 32) | ||
|
||
// Size of blocks in the grid used for move threads. If the number of threads is | ||
// smaller than this, it is used instead. | ||
#define BLOCK_SIZE 256 | ||
|
||
#define ALLOC_SIZE (2 * NUM_MOVE_THREADS * MAX_BYTES_PER_THREAD) | ||
|
||
#define ALLOC_SIZE_SHARED (2 * NUM_MOVE_THREADS * MAX_BYTES_PER_THREAD_SHARED) | ||
|
||
#define TESTED_FUNCTION __builtin_memmove | ||
|
||
enum AddressSpace { | ||
GLOBAL = 0, | ||
SHARED = 1, | ||
STACK = 2, | ||
}; | ||
|
||
static const char *as_names[] = { | ||
"global", | ||
"shared", | ||
"stack", | ||
}; | ||
|
||
static constexpr size_t get_stride(size_t bytes_per_thread) { | ||
return 2 * bytes_per_thread; | ||
} | ||
|
||
__global__ void init_kernel(item_type *buf_device, size_t alloc_size) { | ||
for (size_t i = 0; i < alloc_size; ++i) { | ||
buf_device[i] = (item_type)i; | ||
} | ||
} | ||
|
||
template <size_t SZ> | ||
__global__ void move_kernel_global_const(item_type *buf_device, size_t src_idx, | ||
size_t dst_idx, size_t dyn_sz) { | ||
(void)dyn_sz; | ||
int tid = blockDim.x * blockIdx.x + threadIdx.x; | ||
if (tid >= NUM_MOVE_THREADS) | ||
return; | ||
item_type *thread_buf = buf_device + get_stride(SZ) * tid; | ||
TESTED_FUNCTION(thread_buf + dst_idx, thread_buf + src_idx, SZ); | ||
} | ||
|
||
template <size_t SZ> | ||
__global__ void move_kernel_shared_const(item_type *buf_device, size_t src_idx, | ||
size_t dst_idx, size_t dyn_sz) { | ||
(void)dyn_sz; | ||
__shared__ item_type buf_shared[ALLOC_SIZE_SHARED]; | ||
int tid = blockDim.x * blockIdx.x + threadIdx.x; | ||
if (tid >= NUM_MOVE_THREADS) | ||
return; | ||
constexpr size_t stride = get_stride(SZ); | ||
item_type *thread_buf = buf_device + stride * tid; | ||
item_type *thread_buf_shared = buf_shared + stride * tid; | ||
// Copy the original data to shared memory. | ||
__builtin_memcpy(thread_buf_shared, thread_buf, stride); | ||
// Perform the move there. | ||
TESTED_FUNCTION(thread_buf_shared + dst_idx, thread_buf_shared + src_idx, SZ); | ||
// Copy the modified data back to global memory. | ||
__builtin_memcpy(thread_buf, thread_buf_shared, stride); | ||
} | ||
|
||
template <size_t SZ> | ||
__global__ void move_kernel_stack_const(item_type *buf_device, size_t src_idx, | ||
size_t dst_idx, size_t dyn_sz) { | ||
(void)dyn_sz; | ||
constexpr size_t stride = get_stride(SZ); | ||
item_type buf_stack[stride]; | ||
int tid = blockDim.x * blockIdx.x + threadIdx.x; | ||
if (tid >= NUM_MOVE_THREADS) | ||
return; | ||
item_type *thread_buf = buf_device + stride * tid; | ||
// Copy the original data to the stack. | ||
__builtin_memcpy(buf_stack, thread_buf, stride); | ||
// Perform the move there. | ||
TESTED_FUNCTION(buf_stack + dst_idx, buf_stack + src_idx, SZ); | ||
// Copy the modified data back to global memory. | ||
__builtin_memcpy(thread_buf, buf_stack, stride); | ||
} | ||
|
||
__global__ void move_kernel_global_var(item_type *buf_device, size_t src_idx, | ||
size_t dst_idx, size_t dyn_sz) { | ||
int tid = blockDim.x * blockIdx.x + threadIdx.x; | ||
if (tid >= NUM_MOVE_THREADS) | ||
return; | ||
item_type *thread_buf = buf_device + get_stride(dyn_sz) * tid; | ||
TESTED_FUNCTION(thread_buf + dst_idx, thread_buf + src_idx, dyn_sz); | ||
} | ||
|
||
__global__ void move_kernel_shared_var(item_type *buf_device, size_t src_idx, | ||
size_t dst_idx, size_t dyn_sz) { | ||
__shared__ item_type buf_shared[ALLOC_SIZE_SHARED]; | ||
int tid = blockDim.x * blockIdx.x + threadIdx.x; | ||
if (tid >= NUM_MOVE_THREADS) | ||
return; | ||
size_t stride = get_stride(dyn_sz); | ||
item_type *thread_buf = buf_device + stride * tid; | ||
item_type *thread_buf_shared = buf_shared + stride * tid; | ||
// Copy the original data to shared memory. | ||
__builtin_memcpy(thread_buf_shared, thread_buf, stride); | ||
// perform the move there | ||
TESTED_FUNCTION(thread_buf_shared + dst_idx, thread_buf_shared + src_idx, | ||
dyn_sz); | ||
// Copy the modified data back to global memory. | ||
__builtin_memcpy(thread_buf, thread_buf_shared, stride); | ||
} | ||
|
||
template <size_t SZ> | ||
__global__ void move_kernel_stack_var(item_type *buf_device, size_t src_idx, | ||
size_t dst_idx, size_t dyn_sz) { | ||
// We use the static SZ to allocate a fixed-size stack variable. | ||
constexpr size_t stride = get_stride(SZ); | ||
item_type buf_stack[stride]; | ||
int tid = blockDim.x * blockIdx.x + threadIdx.x; | ||
if (tid >= NUM_MOVE_THREADS) | ||
return; | ||
item_type *thread_buf = buf_device + stride * tid; | ||
// Copy the original data to the stack. | ||
__builtin_memcpy(buf_stack, thread_buf, stride); | ||
// perform the move there | ||
TESTED_FUNCTION(buf_stack + dst_idx, buf_stack + src_idx, dyn_sz); | ||
// Copy the modified data back to global memory. | ||
__builtin_memcpy(thread_buf, buf_stack, stride); | ||
} | ||
|
||
template <size_t SZ> | ||
bool run_test(item_type *buf_reference, item_type *buf_host, | ||
item_type *buf_device, size_t src_idx, size_t dst_idx, | ||
bool const_size, AddressSpace AS, size_t &differing_pos) { | ||
// Initialize device buffer. | ||
hipLaunchKernelGGL(init_kernel, dim3(1), dim3(1), 0, 0, buf_device, | ||
ALLOC_SIZE); | ||
CHKHIP(hipDeviceSynchronize()); | ||
|
||
// Set up the reference buffer. | ||
for (size_t i = 0; i < ALLOC_SIZE; ++i) | ||
buf_reference[i] = (item_type)i; | ||
|
||
// Simulate multi-threaded device-side memmove on the host. | ||
for (size_t tid = 0; tid < NUM_MOVE_THREADS; ++tid) { | ||
item_type *thread_buf = buf_reference + get_stride(SZ) * tid; | ||
std::memmove(thread_buf + dst_idx, thread_buf + src_idx, SZ); | ||
} | ||
|
||
// Do the device-side memmove. | ||
int block_size = std::min(BLOCK_SIZE, NUM_MOVE_THREADS); | ||
int num_blocks = (NUM_MOVE_THREADS + block_size - 1) / block_size; | ||
|
||
switch (AS) { | ||
case AddressSpace::GLOBAL: | ||
hipLaunchKernelGGL(const_size ? move_kernel_global_const<SZ> | ||
: move_kernel_global_var, | ||
dim3(num_blocks), dim3(block_size), 0, 0, buf_device, | ||
src_idx, dst_idx, SZ); | ||
break; | ||
case AddressSpace::SHARED: | ||
hipLaunchKernelGGL(const_size ? move_kernel_shared_const<SZ> | ||
: move_kernel_shared_var, | ||
dim3(num_blocks), dim3(block_size), 0, 0, buf_device, | ||
src_idx, dst_idx, SZ); | ||
break; | ||
case AddressSpace::STACK: | ||
hipLaunchKernelGGL(const_size ? move_kernel_stack_const<SZ> | ||
: move_kernel_stack_var<SZ>, | ||
dim3(num_blocks), dim3(block_size), 0, 0, buf_device, | ||
src_idx, dst_idx, SZ); | ||
break; | ||
}; | ||
CHKHIP(hipDeviceSynchronize()); | ||
|
||
// Fetch the result into buf_host. | ||
CHKHIP(hipMemcpy(buf_host, buf_device, ALLOC_SIZE, hipMemcpyDeviceToHost)); | ||
|
||
// Compare to the reference. | ||
bool success = true; | ||
for (size_t i = 0; i < ALLOC_SIZE; ++i) { | ||
if (buf_host[i] != buf_reference[i]) { | ||
differing_pos = i; | ||
success = false; | ||
break; | ||
} | ||
} | ||
|
||
return success; | ||
} | ||
|
||
template <size_t SZ> | ||
int run_tests(item_type *buf_reference, item_type *buf_host, | ||
item_type *buf_device, AddressSpace AS) { | ||
if (AS == AddressSpace::SHARED && SZ > MAX_BYTES_PER_THREAD_SHARED) { | ||
// LDS is too small for these tests. | ||
return 0; | ||
} | ||
assert(SZ <= MAX_BYTES_PER_THREAD && | ||
"Increase MAX_BYTES_PER_THREAD for larger sizes"); | ||
|
||
std::vector<std::pair<size_t, size_t>> index_combinations = { | ||
{0, 1}, {0, SZ}, {0, SZ - 1}, {1, 0}, {SZ, 0}, {SZ - 1, 0}, | ||
}; | ||
if (SZ > 16) { | ||
index_combinations.emplace_back(0, 16); | ||
index_combinations.emplace_back(16, 0); | ||
} | ||
|
||
int nerrs = 0; | ||
|
||
size_t differing_pos = 0; | ||
auto test_index_combinations = [&](bool const_size) { | ||
for (const auto &[src_idx, dst_idx] : index_combinations) { | ||
bool success = run_test<SZ>(buf_reference, buf_host, buf_device, src_idx, | ||
dst_idx, const_size, AS, differing_pos); | ||
nerrs += !success; | ||
if (VERBOSE || !success) { | ||
std::cout << "- moving [" << src_idx << ", " << (src_idx + SZ - 1) | ||
<< "] -> [" << dst_idx << ", " << (dst_idx + SZ - 1) << "]"; | ||
if (!VERBOSE) { | ||
std::cout << " with " << (const_size ? "static" : "dynamic") | ||
<< " size in " << as_names[AS] << " memory"; | ||
} | ||
std::cout << ":"; | ||
if (success) { | ||
std::cout << " successful\n"; | ||
} else { | ||
std::cout << " failed\n -> first difference at index " | ||
<< differing_pos << '\n'; | ||
} | ||
} | ||
} | ||
}; | ||
|
||
if (VERBOSE) | ||
std::cout << "running tests for dynamic move length " << SZ << " in " | ||
<< as_names[AS] << " memory\n"; | ||
test_index_combinations(false); | ||
|
||
// Different paths in codegen are taken if the move length is statically | ||
// known. | ||
if (VERBOSE) | ||
std::cout << "running tests for static move length " << SZ << " in " | ||
<< as_names[AS] << " memory\n"; | ||
test_index_combinations(true); | ||
|
||
return nerrs; | ||
} | ||
|
||
int main(void) { | ||
item_type *buf_device; | ||
CHKHIP(hipMalloc(&buf_device, ALLOC_SIZE)); | ||
|
||
std::unique_ptr<item_type> buf_host(new item_type[ALLOC_SIZE]); | ||
std::unique_ptr<item_type> buf_reference(new item_type[ALLOC_SIZE]); | ||
|
||
int nerrs = 0; | ||
for (AddressSpace AS : | ||
{AddressSpace::GLOBAL, AddressSpace::SHARED, AddressSpace::STACK}) { | ||
nerrs += run_tests<64>(buf_reference.get(), buf_host.get(), buf_device, AS); | ||
nerrs += run_tests<66>(buf_reference.get(), buf_host.get(), buf_device, AS); | ||
nerrs += run_tests<73>(buf_reference.get(), buf_host.get(), buf_device, AS); | ||
nerrs += run_tests<3>(buf_reference.get(), buf_host.get(), buf_device, AS); | ||
nerrs += run_tests<1>(buf_reference.get(), buf_host.get(), buf_device, AS); | ||
|
||
// Move lengths that are large enough for the IR lowering in the constant | ||
// case, with simple residual, no residual, and maximal residual: | ||
nerrs += | ||
run_tests<1025>(buf_reference.get(), buf_host.get(), buf_device, AS); | ||
nerrs += | ||
run_tests<1040>(buf_reference.get(), buf_host.get(), buf_device, AS); | ||
nerrs += | ||
run_tests<1039>(buf_reference.get(), buf_host.get(), buf_device, AS); | ||
} | ||
|
||
CHKHIP(hipFree(buf_device)); | ||
|
||
if (nerrs != 0) { | ||
std::cout << nerrs << " errors\n"; | ||
return 1; | ||
} | ||
std::cout << "PASSED!\n"; | ||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
PASSED! | ||
exit 0 |
Oops, something went wrong.