diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml
index 161c674b6..d0ff649fd 100644
--- a/.github/workflows/ci-linux.yml
+++ b/.github/workflows/ci-linux.yml
@@ -161,8 +161,9 @@ jobs:
             -DCMAKE_BUILD_TYPE=Release \
             "-Dhsa-runtime64_DIR=$hsa_runtime64_ROOT/lib64/cmake/hsa-runtime64" \
             -S "$PWD" -B "$build_dir"
+          cmake --build "$build_dir" --target aie_hsa_dispatch_test
 
-          ! cmake --build "$build_dir" --target aie_hsa_dispatch_test
+          "$build_dir"/aie_hsa_dispatch_test $PWD
 
           popd
diff --git a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc
index fca55e4a4..5d54d35ac 100644
--- a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc
+++ b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc
@@ -111,8 +111,8 @@ void load_pdi_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
   bin_file.read(reinterpret_cast<char *>(*buf), size);
 }
 
-void load_dpu_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
-                   void **buf) {
+void load_instr_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
+                     void **buf, uint32_t &num_instr) {
   std::ifstream bin_file(file_name,
                          std::ios::binary | std::ios::ate | std::ios::in);
 
@@ -129,6 +129,7 @@ void load_dpu_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
   auto r = hsa_amd_memory_pool_allocate(mem_pool, size, 0, buf);
   assert(r == HSA_STATUS_SUCCESS);
   std::memcpy(*buf, pdi_vec.data(), pdi_vec.size() * sizeof(uint32_t));
+  num_instr = pdi_vec.size();
 }
 
 } // namespace
 
@@ -144,9 +145,9 @@ int main(int argc, char **argv) {
   hsa_amd_memory_pool_t global_dev_mem_pool{0};
   // System memory pool. Used for allocating kernel argument data.
   hsa_amd_memory_pool_t global_kernarg_mem_pool{0};
-  const std::string dpu_inst_file_name(sourcePath / "add_one_insts.txt");
+  const std::string instr_inst_file_name(sourcePath / "add_one_insts.txt");
   const std::string pdi_file_name(sourcePath / "add_one.pdi");
-  uint32_t *dpu_inst_buf(nullptr);
+  uint32_t *instr_inst_buf(nullptr);
   uint64_t *pdi_buf(nullptr);
 
   assert(aie_agents.empty());
@@ -164,8 +165,6 @@ int main(int argc, char **argv) {
   // Find the AIE agents in the system.
   r = hsa_iterate_agents(get_aie_agents, &aie_agents);
   assert(r == HSA_STATUS_SUCCESS);
-  // assert(hsa_iterate_agents(get_cpu_agents, &aie_agents) ==
-  // HSA_STATUS_SUCCESS);
   assert(aie_agents.size() == 1);
 
   const auto &aie_agent = aie_agents.front();
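The `get_aie_agents` callback used above lives outside these hunks. For context, a minimal sketch of such an iteration callback, assuming the AIE-enabled headers expose an `HSA_DEVICE_TYPE_AIE` enumerator (an assumption; the enumerator is not shown in this patch):

static hsa_status_t get_aie_agents(hsa_agent_t agent, void *data) {
  auto *agents = static_cast<std::vector<hsa_agent_t> *>(data);
  hsa_device_type_t type;
  // Query the device type and keep only AIE agents.
  hsa_status_t r = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
  if (r != HSA_STATUS_SUCCESS) return r;
  if (type == HSA_DEVICE_TYPE_AIE) agents->push_back(agent);
  return HSA_STATUS_SUCCESS;
}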
@@ -190,14 +189,22 @@ int main(int argc, char **argv) {
   assert(r == HSA_STATUS_SUCCESS);
   assert(global_kernarg_mem_pool.handle);
 
+  // Getting the maximum size of the queue so we can submit that many
+  // consecutive packets.
+  uint32_t aie_max_queue_size;
+  r = hsa_agent_get_info(aie_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
+                         &aie_max_queue_size);
+  assert(r == HSA_STATUS_SUCCESS);
+  int num_pkts = aie_max_queue_size;
+
   // Load the DPU and PDI files into a global pool that doesn't support kernel
   // args (DEV BO).
-  load_dpu_file(global_dev_mem_pool, dpu_inst_file_name,
-                reinterpret_cast<void **>(&dpu_inst_buf));
-  uint32_t dpu_handle = 0;
-  r = hsa_amd_get_handle_from_vaddr(dpu_inst_buf, &dpu_handle);
+  uint32_t num_instr;
+  load_instr_file(global_dev_mem_pool, instr_inst_file_name,
+                  reinterpret_cast<void **>(&instr_inst_buf), num_instr);
+  uint32_t instr_handle = 0;
+  r = hsa_amd_get_handle_from_vaddr(instr_inst_buf, &instr_handle);
   assert(r == HSA_STATUS_SUCCESS);
-  assert(dpu_handle != 0);
+  assert(instr_handle != 0);
 
   load_pdi_file(global_dev_mem_pool, pdi_file_name,
                 reinterpret_cast<void **>(&pdi_buf));
@@ -222,85 +229,99 @@ int main(int argc, char **argv) {
   constexpr std::size_t data_buffer_size =
       num_data_elements * sizeof(std::uint32_t);
 
-  std::uint32_t *input = {};
-  r = hsa_amd_memory_pool_allocate(global_dev_mem_pool, data_buffer_size, 0,
-                                   reinterpret_cast<void **>(&input));
-  assert(r == HSA_STATUS_SUCCESS);
-  std::uint32_t input_handle = {};
-  r = hsa_amd_get_handle_from_vaddr(input, &input_handle);
-  assert(r == HSA_STATUS_SUCCESS);
-  assert(input_handle != 0);
-
-  std::uint32_t *output = {};
-  r = hsa_amd_memory_pool_allocate(global_dev_mem_pool, data_buffer_size, 0,
-                                   reinterpret_cast<void **>(&output));
-  assert(r == HSA_STATUS_SUCCESS);
-  std::uint32_t output_handle = {};
-  r = hsa_amd_get_handle_from_vaddr(output, &output_handle);
-  assert(r == HSA_STATUS_SUCCESS);
-  assert(output_handle != 0);
+  std::vector<std::uint32_t *> input(num_pkts);
+  std::vector<std::uint32_t *> output(num_pkts);
+  std::vector<hsa_amd_aie_ert_start_kernel_data_t *> cmd_payloads(num_pkts);
+  std::vector<std::uint32_t> input_handle(num_pkts);
+  std::vector<std::uint32_t> output_handle(num_pkts);
+
+  uint64_t wr_idx = 0;
+  uint64_t packet_id = 0;
+
+  for (int pkt_iter = 0; pkt_iter < num_pkts; pkt_iter++) {
+    r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size,
+                                     0,
+                                     reinterpret_cast<void **>(&input[pkt_iter]));
+    assert(r == HSA_STATUS_SUCCESS);
+    r = hsa_amd_get_handle_from_vaddr(input[pkt_iter], &input_handle[pkt_iter]);
+    assert(r == HSA_STATUS_SUCCESS);
+    assert(input_handle[pkt_iter] != 0);
+
+    r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size,
+                                     0,
+                                     reinterpret_cast<void **>(&output[pkt_iter]));
+    assert(r == HSA_STATUS_SUCCESS);
+    r = hsa_amd_get_handle_from_vaddr(output[pkt_iter], &output_handle[pkt_iter]);
+    assert(r == HSA_STATUS_SUCCESS);
+    assert(output_handle[pkt_iter] != 0);
+
+    for (std::size_t i = 0; i < num_data_elements; i++) {
+      *(input[pkt_iter] + i) = i * (pkt_iter + 1);
+      *(output[pkt_iter] + i) = 0xDEFACE;
+    }
 
-  for (std::size_t i = 0; i < num_data_elements; i++) {
-    *(input + i) = i;
-    *(output + i) = 0xDEFACE;
+    // Getting a slot in the queue
+    wr_idx = hsa_queue_add_write_index_relaxed(aie_queue, 1);
+    packet_id = wr_idx % aie_queue->size;
+
+    // Creating a packet to store the command
+    hsa_amd_aie_ert_packet_t *cmd_pkt =
+        static_cast<hsa_amd_aie_ert_packet_t *>(aie_queue->base_address) +
+        packet_id;
+    assert(r == HSA_STATUS_SUCCESS);
+    cmd_pkt->state = HSA_AMD_AIE_ERT_STATE_NEW;
+    cmd_pkt->count = 0xA; // # of arguments to put in command
+    cmd_pkt->opcode = HSA_AMD_AIE_ERT_START_CU;
+    cmd_pkt->header.AmdFormat = HSA_AMD_PACKET_TYPE_AIE_ERT;
+    cmd_pkt->header.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
+                             << HSA_PACKET_HEADER_TYPE;
+
+    // Creating the payload for the packet
+    hsa_amd_aie_ert_start_kernel_data_t *cmd_payload = NULL;
+    assert(r == HSA_STATUS_SUCCESS);
+    r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0,
+                                     reinterpret_cast<void **>(&cmd_payload));
+    assert(r == HSA_STATUS_SUCCESS);
+    // Selecting the PDI to use with this command
+    cmd_payload->cu_mask = 0x1;
+    // Transaction opcode
+    cmd_payload->data[0] = 0x3;
+    cmd_payload->data[1] = 0x0;
+    cmd_payload->data[2] = instr_handle;
+    cmd_payload->data[3] = 0x0;
+    cmd_payload->data[4] = num_instr;
+    cmd_payload->data[5] = input_handle[pkt_iter];
+    cmd_payload->data[6] = 0;
+    cmd_payload->data[7] = output_handle[pkt_iter];
+    cmd_payload->data[8] = 0;
+    cmd_pkt->payload_data = reinterpret_cast<uint64_t>(cmd_payload);
+
+    // Keeping track of payloads so we can free them at the end
+    cmd_payloads[pkt_iter] = cmd_payload;
   }
 
-  ///////////////////////////////////// Creating the cmd packet
-  // Creating a packet to store the command
-  hsa_amd_aie_ert_packet_t *cmd_pkt = NULL;
-  r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0,
-                                   reinterpret_cast<void **>(&cmd_pkt));
-  assert(r == HSA_STATUS_SUCCESS);
-  cmd_pkt->state = HSA_AMD_AIE_ERT_STATE_NEW;
-  cmd_pkt->count = 0xA; // # of arguments to put in command
-  cmd_pkt->opcode = HSA_AMD_AIE_ERT_START_CU;
-  cmd_pkt->header.AmdFormat = HSA_AMD_PACKET_TYPE_AIE_ERT;
-  cmd_pkt->header.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
-                           << HSA_PACKET_HEADER_TYPE;
-
-  // Creating the payload for the packet
-  hsa_amd_aie_ert_start_kernel_data_t *cmd_payload = NULL;
-  uint32_t cmd_handle;
-  r = hsa_amd_get_handle_from_vaddr(reinterpret_cast<void *>(cmd_pkt),
-                                    &cmd_handle);
-  assert(r == HSA_STATUS_SUCCESS);
-  r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0,
-                                   reinterpret_cast<void **>(&cmd_payload));
-  assert(r == HSA_STATUS_SUCCESS);
-  cmd_payload->cu_mask = 0x1;  // Selecting the PDI to use with this command
-  cmd_payload->data[0] = 0x3;  // Transaction opcode
-  cmd_payload->data[1] = 0x0;
-  cmd_payload->data[2] = dpu_handle;
-  cmd_payload->data[3] = 0x0;
-  cmd_payload->data[4] = 0x44; // Size of DPU instruction
-  cmd_payload->data[5] = input_handle;
-  cmd_payload->data[6] = 0;
-  cmd_payload->data[7] = output_handle;
-  cmd_payload->data[8] = 0;
-  cmd_pkt->payload_data = reinterpret_cast<uint64_t>(cmd_payload);
-
-  uint64_t wr_idx = hsa_queue_add_write_index_relaxed(aie_queue, 1);
-  uint64_t packet_id = wr_idx % aie_queue->size;
-  reinterpret_cast<hsa_amd_aie_ert_packet_t *>(
-      aie_queue->base_address)[packet_id] = *cmd_pkt;
+  // Ringing the doorbell to dispatch each packet we added to
+  // the queue
   hsa_signal_store_screlease(aie_queue->doorbell_signal, wr_idx);
 
-  for (std::size_t i = 0; i < num_data_elements; i++) {
-    const auto expected = *(input + i) + 1;
-    const auto result = *(output + i);
-    assert(result == expected);
+  for (int pkt_iter = 0; pkt_iter < num_pkts; pkt_iter++) {
+    for (std::size_t i = 0; i < num_data_elements; i++) {
+      const auto expected = *(input[pkt_iter] + i) + 1;
+      const auto result = *(output[pkt_iter] + i);
+      assert(result == expected);
+    }
+
+    r = hsa_amd_memory_pool_free(output[pkt_iter]);
+    assert(r == HSA_STATUS_SUCCESS);
+    r = hsa_amd_memory_pool_free(input[pkt_iter]);
+    assert(r == HSA_STATUS_SUCCESS);
+    r = hsa_amd_memory_pool_free(cmd_payloads[pkt_iter]);
+    assert(r == HSA_STATUS_SUCCESS);
   }
 
   r = hsa_queue_destroy(aie_queue);
   assert(r == HSA_STATUS_SUCCESS);
 
-  r = hsa_amd_memory_pool_free(output);
-  assert(r == HSA_STATUS_SUCCESS);
-  r = hsa_amd_memory_pool_free(input);
-  assert(r == HSA_STATUS_SUCCESS);
   r = hsa_amd_memory_pool_free(pdi_buf);
   assert(r == HSA_STATUS_SUCCESS);
-  r = hsa_amd_memory_pool_free(dpu_inst_buf);
+  r = hsa_amd_memory_pool_free(instr_inst_buf);
   assert(r == HSA_STATUS_SUCCESS);
 
   r = hsa_shut_down();
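The dispatch loop above wires a fixed ten-word HSA_AMD_AIE_ERT_START_CU payload per packet. A hypothetical helper collecting that wiring in one place; the field meanings are inferred from this test and from RegisterCmdBOs later in the patch, not from a published spec:

// Hypothetical helper; mirrors the assignments in the dispatch loop above.
void fill_start_cu_payload(hsa_amd_aie_ert_start_kernel_data_t *p,
                           uint32_t instr_handle, uint32_t num_instr,
                           uint32_t input_handle, uint32_t output_handle) {
  p->cu_mask = 0x1;           // run on CU/PDI 0
  p->data[0] = 0x3;           // transaction opcode
  p->data[1] = 0x0;
  p->data[2] = instr_handle;  // instruction-sequence BO (rebased to a device address at submit)
  p->data[3] = 0x0;
  p->data[4] = num_instr;     // length of the instruction sequence
  p->data[5] = input_handle;  // operand BOs occupy two words each:
  p->data[6] = 0;             // low word = handle, high word patched at submit
  p->data[7] = output_handle;
  p->data[8] = 0;
}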
diff --git a/runtime/hsa-runtime/core/common/hsa_table_interface.cpp b/runtime/hsa-runtime/core/common/hsa_table_interface.cpp
index fc42ac8a1..69b938e66 100644
--- a/runtime/hsa-runtime/core/common/hsa_table_interface.cpp
+++ b/runtime/hsa-runtime/core/common/hsa_table_interface.cpp
@@ -922,6 +922,19 @@ uint32_t HSA_API
       wait_hint, satisfying_value);
 }
 
+// Mirrors AMD Extension APIs.
+hsa_status_t
+hsa_amd_queue_hw_ctx_config(const hsa_queue_t *queue,
+                            hsa_amd_queue_hw_ctx_config_param_t config_type,
+                            void *args) {
+  return amdExtTable->hsa_amd_queue_hw_ctx_config_fn(queue, config_type, args);
+}
+
+// Mirrors AMD Extension APIs.
+hsa_status_t hsa_amd_get_handle_from_vaddr(void* ptr, uint32_t* handle) {
+  return amdExtTable->hsa_amd_get_handle_from_vaddr_fn(ptr, handle);
+}
+
 // Mirrors Amd Extension Apis
 hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
                                                uint32_t num_cu_mask_count,
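These two wrappers only forward into the AMD extension table. A minimal usage sketch for the new handle query, assuming `pool` is a memory pool backed by the XDNA driver (hypothetical pool and size):

// Allocate from an XDNA-backed pool, then recover the driver BO handle so it
// can be referenced from an AIE command payload.
uint32_t *buf = nullptr;
hsa_status_t r = hsa_amd_memory_pool_allocate(
    pool, 4096, 0, reinterpret_cast<void **>(&buf));
assert(r == HSA_STATUS_SUCCESS);

uint32_t handle = 0;
r = hsa_amd_get_handle_from_vaddr(buf, &handle);
assert(r == HSA_STATUS_SUCCESS && handle != 0);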
diff --git a/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp b/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp
index 0de256664..f405bf502 100644
--- a/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp
+++ b/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp
@@ -61,6 +61,8 @@ namespace AMD {
 KfdDriver::KfdDriver(std::string devnode_name)
     : core::Driver(core::DriverType::KFD, devnode_name) {}
 
+hsa_status_t KfdDriver::Init() { return HSA_STATUS_SUCCESS; }
+
 hsa_status_t KfdDriver::DiscoverDriver() {
   if (hsaKmtOpenKFD() == HSAKMT_STATUS_SUCCESS) {
     std::unique_ptr<core::Driver> kfd_drv(new KfdDriver("/dev/kfd"));
@@ -74,6 +76,10 @@ hsa_status_t KfdDriver::QueryKernelModeDriver(core::DriverQuery query) {
   return HSA_STATUS_SUCCESS;
 }
 
+hsa_status_t KfdDriver::GetAgentProperties(core::Agent &agent) const {
+  return HSA_STATUS_SUCCESS;
+}
+
 hsa_status_t
 KfdDriver::GetMemoryProperties(uint32_t node_id,
                                core::MemoryRegion &mem_region) const {
@@ -97,6 +103,11 @@ KfdDriver::AllocateMemory(const core::MemoryRegion &mem_region,
     kmt_alloc_flags.ui32.NonPaged = 1;
   }
 
+  if (m_region.IsLocalMemory() &&
+      (alloc_flags & core::MemoryRegion::AllocateMemoryOnly)) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+
   // Allocating a memory handle for virtual memory
   kmt_alloc_flags.ui32.NoAddress =
       !!(alloc_flags & core::MemoryRegion::AllocateMemoryOnly);
@@ -230,7 +241,7 @@ hsa_status_t KfdDriver::FreeMemory(void *mem, size_t size) {
   return FreeKfdMemory(mem, size) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
 }
 
-hsa_status_t KfdDriver::CreateQueue(core::Queue &queue) {
+hsa_status_t KfdDriver::CreateQueue(core::Queue &queue) const {
   return HSA_STATUS_SUCCESS;
 }
 
@@ -238,6 +249,19 @@ hsa_status_t KfdDriver::DestroyQueue(core::Queue &queue) const {
   return HSA_STATUS_SUCCESS;
 }
 
+hsa_status_t
+KfdDriver::ConfigHwCtx(core::Queue &queue,
+                       hsa_amd_queue_hw_ctx_config_param_t config_type,
+                       void *args) {
+  // Only AIE queues support this for now.
+  return HSA_STATUS_ERROR_INVALID_AGENT;
+}
+
+hsa_status_t KfdDriver::GetHandleFromVaddr(void* ptr, uint32_t* handle) {
+  // Only the AIE driver supports this for now.
+  return HSA_STATUS_ERROR_INVALID_AGENT;
+}
+
 void *KfdDriver::AllocateKfdMemory(const HsaMemFlags &flags, uint32_t node_id,
                                    size_t size) {
   void *mem = nullptr;
diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp
index 308ffe7aa..12f140e5c 100644
--- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp
+++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp
@@ -43,12 +43,16 @@
 #include "core/inc/amd_xdna_driver.h"
 
 #include
+#include
+#include
 #include
 #include
 
+#include "core/inc/amd_aie_aql_queue.h"
 #include "core/inc/amd_memory_region.h"
 #include "core/inc/runtime.h"
+#include "core/util/utils.h"
 #include "uapi/amdxdna_accel.h"
 
 namespace rocr {
 namespace AMD {
 
 XdnaDriver::XdnaDriver(std::string devnode_name)
     : core::Driver(core::DriverType::XDNA, devnode_name) {}
 
+XdnaDriver::~XdnaDriver() { FreeDeviceHeap(); }
+
 hsa_status_t XdnaDriver::DiscoverDriver() {
   const int max_minor_num(64);
   const std::string devnode_prefix("/dev/accel/accel");
@@ -67,6 +73,7 @@ hsa_status_t XdnaDriver::DiscoverDriver() {
     if (xdna_drv->Open() == HSA_STATUS_SUCCESS) {
       if (xdna_drv->QueryKernelModeDriver(
               core::DriverQuery::GET_DRIVER_VERSION) == HSA_STATUS_SUCCESS) {
+        static_cast<XdnaDriver *>(xdna_drv.get())->Init();
         core::Runtime::runtime_singleton_->RegisterDriver(xdna_drv);
         return HSA_STATUS_SUCCESS;
       } else {
@@ -78,6 +85,12 @@ hsa_status_t XdnaDriver::DiscoverDriver() {
   return HSA_STATUS_ERROR;
 }
 
+uint64_t XdnaDriver::GetDevHeapByteSize() {
+  return dev_heap_size;
+}
+
+hsa_status_t XdnaDriver::Init() { return InitDeviceHeap(); }
+
 hsa_status_t XdnaDriver::QueryKernelModeDriver(core::DriverQuery query) {
   switch (query) {
   case core::DriverQuery::GET_DRIVER_VERSION:
@@ -88,6 +101,31 @@ hsa_status_t XdnaDriver::QueryKernelModeDriver(core::DriverQuery query) {
   return HSA_STATUS_SUCCESS;
 }
 
+hsa_status_t XdnaDriver::GetAgentProperties(core::Agent &agent) const {
+  if (agent.device_type() != core::Agent::DeviceType::kAmdAieDevice) {
+    return HSA_STATUS_ERROR_INVALID_AGENT;
+  }
+
+  auto &aie_agent(static_cast<AieAgent &>(agent));
+
+  amdxdna_drm_query_aie_metadata aie_metadata{0};
+  amdxdna_drm_get_info get_info_args{
+      .param = DRM_AMDXDNA_QUERY_AIE_METADATA,
+      .buffer_size = sizeof(aie_metadata),
+      .buffer = reinterpret_cast<uintptr_t>(&aie_metadata)};
+
+  if (ioctl(fd_, DRM_IOCTL_AMDXDNA_GET_INFO, &get_info_args) < 0) {
+    return HSA_STATUS_ERROR;
+  }
+
+  // Right now we can only target N-1 columns, so putting this
+  // here as a workaround.
+  aie_agent.SetNumCols(aie_metadata.cols - 1);
+  aie_agent.SetNumCoreRows(aie_metadata.core.row_count);
+
+  return HSA_STATUS_SUCCESS;
+}
+
 hsa_status_t
 XdnaDriver::GetMemoryProperties(uint32_t node_id,
                                 core::MemoryRegion &mem_region) const {
@@ -98,18 +136,153 @@ hsa_status_t
 XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region,
                            core::MemoryRegion::AllocateFlags alloc_flags,
                            void **mem, size_t size, uint32_t node_id) {
+  const auto &region = static_cast<const MemoryRegion &>(mem_region);
+  amdxdna_drm_create_bo create_bo_args{.size = size};
+  amdxdna_drm_get_bo_info get_bo_info_args{0};
+  drm_gem_close close_bo_args{0};
+  void *mapped_mem(nullptr);
+
+  if (!region.IsSystem()) {
+    return HSA_STATUS_ERROR_INVALID_REGION;
+  }
+
+  if (region.kernarg()) {
+    create_bo_args.type = AMDXDNA_BO_SHMEM;
+  } else {
+    create_bo_args.type = AMDXDNA_BO_DEV;
+  }
+
+  if (ioctl(fd_, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_bo_args) < 0) {
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
+
+  get_bo_info_args.handle = create_bo_args.handle;
+  // In case we need to close this BO to avoid leaks due to some error after
+  // creation.
+  close_bo_args.handle = create_bo_args.handle;
+
+  if (ioctl(fd_, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_bo_info_args) < 0) {
+    // Close the BO in the case we can't get info about it.
+    ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_bo_args);
+    return HSA_STATUS_ERROR;
+  }
+
+  /// TODO: For now we always map the memory and keep a mapping from handles
+  /// to VA memory addresses. Once we can support the separate VMEM call to
+  /// map handles we can fix this.
+  if (region.kernarg()) {
+    mapped_mem = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_,
+                      get_bo_info_args.map_offset);
+    if (mapped_mem == MAP_FAILED) {
+      // Close the BO in the case when a mapping fails and we got a BO handle.
+      ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_bo_args);
+      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+    }
+  } else {
+    mapped_mem = reinterpret_cast<void *>(get_bo_info_args.vaddr);
+  }
+
+  if (alloc_flags & core::MemoryRegion::AllocateMemoryOnly) {
+    *mem = reinterpret_cast<void *>(create_bo_args.handle);
+  } else {
+    *mem = mapped_mem;
+  }
+
+  vmem_handle_mappings.emplace(create_bo_args.handle, mapped_mem);
+  vmem_handle_mappings_reverse.emplace(mapped_mem, create_bo_args.handle);
+
   return HSA_STATUS_SUCCESS;
 }
 
-hsa_status_t XdnaDriver::FreeMemory(void *mem, size_t size) {
+hsa_status_t XdnaDriver::FreeMemory(void* ptr, size_t size) {
+  auto it = vmem_handle_mappings_reverse.find(ptr);
+  if (it == vmem_handle_mappings_reverse.end())
+    return HSA_STATUS_ERROR_INVALID_ALLOCATION;
+
+  // TODO:ypapadop-amd: need to unmap memory, but we don't know if it's mapped
+  // or not as we don't have region information
+
+  auto handle = it->second;
+
+  drm_gem_close close_args = {};
+  close_args.handle = handle;
+  if (ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_args) < 0) {
+    return HSA_STATUS_ERROR;
+  }
+
+  vmem_handle_mappings.erase(handle);
+  vmem_handle_mappings_reverse.erase(it);
+
   return HSA_STATUS_SUCCESS;
 }
 
-hsa_status_t XdnaDriver::CreateQueue(core::Queue &queue) {
+hsa_status_t XdnaDriver::CreateQueue(core::Queue &queue) const {
+  if (!AieAqlQueue::IsType(&queue)) {
+    return HSA_STATUS_ERROR_INVALID_QUEUE;
+  }
+
+  auto &aie_queue(static_cast<AieAqlQueue &>(queue));
+  auto &aie_agent(aie_queue.GetAgent());
+
+  // Currently we do not leverage QoS information.
+  amdxdna_qos_info qos_info{0};
+  amdxdna_drm_create_hwctx create_hwctx_args{
+      .ext = 0,
+      .ext_flags = 0,
+      .qos_p = reinterpret_cast<uintptr_t>(&qos_info),
+      .umq_bo = 0,
+      .log_buf_bo = 0,
+      // TODO: Make this configurable.
+      .max_opc = 0x800,
+      // This field is for the number of core tiles.
+      .num_tiles = static_cast<uint32_t>(aie_agent.GetNumCores()),
+      .mem_size = 0,
+      .umq_doorbell = 0};
+
+  if (ioctl(fd_, DRM_IOCTL_AMDXDNA_CREATE_HWCTX, &create_hwctx_args) < 0) {
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
+
+  aie_queue.SetHwCtxHandle(create_hwctx_args.handle);
+
   return HSA_STATUS_SUCCESS;
 }
 
 hsa_status_t XdnaDriver::DestroyQueue(core::Queue &queue) const {
+  if (!AieAqlQueue::IsType(&queue)) {
+    return HSA_STATUS_ERROR_INVALID_QUEUE;
+  }
+
+  auto &aie_queue(static_cast<AieAqlQueue &>(queue));
+  amdxdna_drm_destroy_hwctx destroy_hwctx_args{.handle =
+                                                   aie_queue.GetHwCtxHandle()};
+
+  if (ioctl(fd_, DRM_IOCTL_AMDXDNA_DESTROY_HWCTX, &destroy_hwctx_args) < 0) {
+    return HSA_STATUS_ERROR;
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+hsa_status_t
+XdnaDriver::ConfigHwCtx(core::Queue &queue,
+                        hsa_amd_queue_hw_ctx_config_param_t config_type,
+                        void *args) {
+  switch (config_type) {
+  case HSA_AMD_QUEUE_AIE_ERT_HW_CXT_CONFIG_CU:
+    return ConfigHwCtxCU(
+        queue,
+        *reinterpret_cast<hsa_amd_aie_ert_hw_ctx_config_cu_param_t *>(args));
+  default:
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+}
+
+hsa_status_t XdnaDriver::GetHandleFromVaddr(void* ptr, uint32_t* handle) {
+  auto it = vmem_handle_mappings_reverse.find(ptr);
+  if (it == vmem_handle_mappings_reverse.end())
+    return HSA_STATUS_ERROR_INVALID_ALLOCATION;
+  *handle = it->second;
   return HSA_STATUS_SUCCESS;
 }
 
@@ -128,5 +301,131 @@ hsa_status_t XdnaDriver::QueryDriverVersion() {
   return HSA_STATUS_SUCCESS;
 }
 
+hsa_status_t XdnaDriver::InitDeviceHeap() {
+  amdxdna_drm_create_bo create_bo_args{
+      .type = AMDXDNA_BO_DEV_HEAP,
+      .vaddr = reinterpret_cast<uintptr_t>(nullptr),
+      .size = dev_heap_size};
+  amdxdna_drm_get_bo_info get_bo_info_args{0};
+  drm_gem_close close_bo_args{0};
+
+  if (ioctl(fd_, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_bo_args) < 0) {
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
+
+  get_bo_info_args.handle = create_bo_args.handle;
+  // In case we need to close this BO to avoid leaks due to some error after
+  // creation.
+  close_bo_args.handle = create_bo_args.handle;
+
+  if (ioctl(fd_, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_bo_info_args) < 0) {
+    // Close the BO in the case we can't get info about it.
+    ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_bo_args);
+    return HSA_STATUS_ERROR;
+  }
+
+  dev_heap_parent = mmap(0, dev_heap_align * 2 - 1, PROT_READ | PROT_WRITE,
+                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+  if (dev_heap_parent == MAP_FAILED) {
+    // Close the BO in the case when a mapping fails and we got a BO handle.
+    ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_bo_args);
+    dev_heap_parent = nullptr;
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
+
+  void *addr_aligned(reinterpret_cast<void *>(
+      AlignUp(reinterpret_cast<uintptr_t>(dev_heap_parent), dev_heap_align)));
+
+  dev_heap_aligned =
+      mmap(addr_aligned, dev_heap_size, PROT_READ | PROT_WRITE,
+           MAP_SHARED | MAP_FIXED, fd_, get_bo_info_args.map_offset);
+
+  if (dev_heap_aligned == MAP_FAILED) {
+    // Close the BO in the case when a mapping fails and we got a BO handle.
+    ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_bo_args);
+    // Unmap the dev_heap_parent.
+    dev_heap_aligned = nullptr;
+    FreeDeviceHeap();
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+hsa_status_t XdnaDriver::GetHandleMappings(
+    std::unordered_map<uint32_t, void *> &vmem_handle_mappings) {
+  vmem_handle_mappings = this->vmem_handle_mappings;
+  return HSA_STATUS_SUCCESS;
+}
+
+hsa_status_t XdnaDriver::GetFd(int &fd) {
+  fd = fd_;
+  return HSA_STATUS_SUCCESS;
+}
+
+hsa_status_t XdnaDriver::FreeDeviceHeap() {
+  if (dev_heap_parent) {
+    munmap(dev_heap_parent, dev_heap_align * 2 - 1);
+    dev_heap_parent = nullptr;
+  }
+
+  if (dev_heap_aligned) {
+    munmap(dev_heap_aligned, dev_heap_size);
+    dev_heap_aligned = nullptr;
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+hsa_status_t XdnaDriver::ConfigHwCtxCU(
+    core::Queue &queue,
+    hsa_amd_aie_ert_hw_ctx_config_cu_param_t &config_cu_param) {
+  if (!AieAqlQueue::IsType(&queue)) {
+    return HSA_STATUS_ERROR_INVALID_QUEUE;
+  }
+
+  auto &aie_queue(static_cast<AieAqlQueue &>(queue));
+
+  size_t config_cu_param_size(sizeof(amdxdna_hwctx_param_config_cu) +
+                              config_cu_param.num_cus *
+                                  sizeof(amdxdna_cu_config));
+
+  amdxdna_hwctx_param_config_cu *xdna_config_cu_param =
+      reinterpret_cast<amdxdna_hwctx_param_config_cu *>(
+          malloc(config_cu_param_size));
+  if (xdna_config_cu_param == nullptr) {
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
+  MAKE_SCOPE_GUARD([xdna_config_cu_param] { free(xdna_config_cu_param); });
+
+  xdna_config_cu_param->num_cus = config_cu_param.num_cus;
+
+  for (int i = 0; i < xdna_config_cu_param->num_cus; ++i) {
+    xdna_config_cu_param->cu_configs[i].cu_bo =
+        config_cu_param.cu_configs[i].cu_config_bo;
+    xdna_config_cu_param->cu_configs[i].cu_func =
+        config_cu_param.cu_configs[i].cu_func;
+
+    // sync configuration buffer
+    amdxdna_drm_sync_bo sync_args = {};
+    sync_args.handle = xdna_config_cu_param->cu_configs[i].cu_bo;
+    if (ioctl(fd_, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_args) < 0) {
+      return HSA_STATUS_ERROR;
+    }
+  }
+
+  amdxdna_drm_config_hwctx config_hw_ctx_args{
+      .handle = aie_queue.GetHwCtxHandle(),
+      .param_type = DRM_AMDXDNA_HWCTX_CONFIG_CU,
+      .param_val = reinterpret_cast<uintptr_t>(xdna_config_cu_param),
+      .param_val_size = static_cast<uint32_t>(config_cu_param_size)};
+
+  if (ioctl(fd_, DRM_IOCTL_AMDXDNA_CONFIG_HWCTX, &config_hw_ctx_args) < 0) {
+    return HSA_STATUS_ERROR;
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
 } // namespace AMD
 } // namespace rocr
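InitDeviceHeap above obtains a 64 MB-aligned mapping of the heap BO without any special mmap alignment flag: it over-reserves anonymous memory and re-maps with MAP_FIXED at the first aligned address inside the reservation. The same technique in isolation, as a self-contained sketch (assumes size <= align and align is a power of two):

#include <cstdint>
#include <sys/mman.h>

void *map_aligned(int fd, off_t offset, size_t size, size_t align) {
  // Reserve align * 2 - 1 bytes so that at least one align-multiple must
  // fall inside the reservation.
  void *parent = mmap(nullptr, align * 2 - 1, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (parent == MAP_FAILED) return nullptr;
  uintptr_t aligned =
      (reinterpret_cast<uintptr_t>(parent) + align - 1) & ~(align - 1);
  // Place the real mapping at the aligned address inside the reservation.
  void *out = mmap(reinterpret_cast<void *>(aligned), size,
                   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, offset);
  return out == MAP_FAILED ? nullptr : out;
}

As FreeDeviceHeap shows, the parent reservation and the fixed mapping then have to be unmapped separately.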
diff --git a/runtime/hsa-runtime/core/inc/amd_aie_agent.h b/runtime/hsa-runtime/core/inc/amd_aie_agent.h
index c3add8a76..d99b71ed7 100644
--- a/runtime/hsa-runtime/core/inc/amd_aie_agent.h
+++ b/runtime/hsa-runtime/core/inc/amd_aie_agent.h
@@ -86,18 +86,53 @@ class AieAgent : public core::Agent {
     return regions_;
   }
 
+  /// @brief Getter for the AIE system allocator.
+  const std::function<void *(size_t, size_t, core::MemoryRegion::AllocateFlags)> &
+  system_allocator() const {
+    return system_allocator_;
+  }
+
+  /// @brief Getter for the AIE system deallocator.
+  const std::function<void(void *)> &system_deallocator() const {
+    return system_deallocator_;
+  }
+
+  // AIE agent methods.
+  /// @brief Get the number of columns on this AIE agent.
+  int GetNumCols() const { return num_cols_; }
+  void SetNumCols(int num_cols) { num_cols_ = num_cols; }
+  /// @brief Get the number of core tile rows on this AIE agent.
+  int GetNumCoreRows() const { return num_core_rows_; }
+  void SetNumCoreRows(int num_core_rows) { num_core_rows_ = num_core_rows; }
+  /// @brief Get the number of core tiles on this AIE agent.
+  int GetNumCores() const { return num_cols_ * num_core_rows_; }
+
 private:
-  // @brief Query the driver to get the region list owned by this agent.
+  /// @brief Query the driver to get the region list owned by this agent.
   void InitRegionList();
 
+  /// @brief Setup the memory allocators used by this agent.
+  void InitAllocators();
+
+  /// @brief Query the driver to get properties for this AIE agent.
+  void GetAgentProperties();
+
   std::vector<const core::MemoryRegion *> regions_;
 
+  std::function<void *(size_t, size_t, core::MemoryRegion::AllocateFlags)>
+      system_allocator_;
+
+  std::function<void(void *)> system_deallocator_;
+
   const hsa_profile_t profile_ = HSA_PROFILE_BASE;
 
-  static const uint32_t maxQueues_ = 8;
-  static const uint32_t minAqlSize_ = 0x40;
-  static const uint32_t maxAqlSize_ = 0x40;
-  uint32_t max_queues_;
-  uintptr_t device_heap_vaddr_ = 0;
+  const uint32_t min_aql_size_ = 0x40;
+  const uint32_t max_aql_size_ = 0x40;
+  const uint32_t max_queues_ = 1;
+
+  /// @brief Number of columns in the AIE array.
+  int num_cols_ = 0;
+  /// @brief Number of rows of core tiles in the AIE array. Not all rows in a
+  /// column are cores. Some can be memory or shim tiles.
+  int num_core_rows_ = 0;
 };
 
 } // namespace AMD
diff --git a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h
index 7e59112d5..224b85d7c 100644
--- a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h
+++ b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h
@@ -43,11 +43,41 @@
 #ifndef HSA_RUNTIME_CORE_INC_AMD_HW_AQL_AIE_COMMAND_PROCESSOR_H_
 #define HSA_RUNTIME_CORE_INC_AMD_HW_AQL_AIE_COMMAND_PROCESSOR_H_
 
+#include
+
 #include "core/inc/amd_aie_agent.h"
 #include "core/inc/queue.h"
 #include "core/inc/runtime.h"
 #include "core/inc/signal.h"
-#include "core/util/locks.h"
+
+/*
+ * Interpretation of the beginning of data payload for ERT_CMD_CHAIN in
+ * amdxdna_cmd. The rest of the payload in amdxdna_cmd is cmd BO handles.
+ */
+struct amdxdna_cmd_chain {
+  __u32 command_count;
+  __u32 submit_index;
+  __u32 error_index;
+  __u32 reserved[3];
+  __u64 data[] __counted_by(command_count);
+};
+
+/* Exec buffer command header format */
+struct amdxdna_cmd {
+  union {
+    struct {
+      __u32 state : 4;
+      __u32 unused : 6;
+      __u32 extra_cu_masks : 2;
+      __u32 count : 11;
+      __u32 opcode : 5;
+      __u32 reserved : 4;
+    };
+    __u32 header;
+  };
+  __u32 data[] __counted_by(count);
+};
 
 namespace rocr {
 namespace AMD {
@@ -55,7 +85,9 @@ namespace AMD {
 /// @brief Encapsulates HW AIE AQL Command Processor functionality. It
 /// provides the interface for things such as doorbells, queue read and
 /// write pointers, and a buffer.
-class AieAqlQueue : public core::Queue, public core::DoorbellSignal {
+class AieAqlQueue : public core::Queue,
+                    private core::LocalSignal,
+                    core::DoorbellSignal {
  public:
   static __forceinline bool IsType(core::Signal *signal) {
     return signal->IsType(&rtti_id_);
@@ -67,7 +99,7 @@ class AieAqlQueue : public core::Queue, public core::DoorbellSignal {
   AieAqlQueue() = delete;
   AieAqlQueue(AieAgent *agent, size_t req_size_pkts, uint32_t node_id);
-  ~AieAqlQueue();
+  ~AieAqlQueue() override;
 
   hsa_status_t Inactivate() override;
   hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) override;
@@ -95,6 +127,16 @@ class AieAqlQueue : public core::Queue, public core::DoorbellSignal {
   hsa_status_t GetInfo(hsa_queue_info_attribute_t attribute,
                        void *value) override;
 
+  // AIE-specific API
+  AieAgent &GetAgent() const { return agent_; }
+  void SetHwCtxHandle(uint32_t hw_ctx_handle) {
+    hw_ctx_handle_ = hw_ctx_handle;
+  }
+  uint32_t GetHwCtxHandle() const { return hw_ctx_handle_; }
+
+  hsa_status_t ConfigHwCtx(hsa_amd_queue_hw_ctx_config_param_t config_type,
+                           void *args) override;
+
   // GPU-specific queue functions are unsupported.
   hsa_status_t GetCUMasking(uint32_t num_cu_mask_count,
                             uint32_t *cu_mask) override;
 
@@ -105,29 +147,71 @@ class AieAqlQueue : public core::Queue, public core::DoorbellSignal {
                    hsa_fence_scope_t releaseFence = HSA_FENCE_SCOPE_NONE,
                    hsa_signal_t *signal = NULL) override;
 
-  core::SharedQueue *shared_queue_;
-  core::SharedSignal *shared_signal_;
-  /// ID of the queue used in communication with the AMD AIR driver.
-  uint32_t queue_id_;
-  /// ID of the doorbell used in communication with the AMD AIR driver.
-  uint32_t doorbell_id_;
-  /// Pointer to the hardware doorbell for this queue.
-  uint64_t *hardware_doorbell_ptr_;
-  /// ID of AIE device on which this queue has been mapped.
-  uint32_t node_id_;
-  /// Queue size in bytes.
-  uint32_t queue_size_bytes_;
+  uint64_t queue_id_ = INVALID_QUEUEID;
+  /// @brief ID of AIE device on which this queue has been mapped.
+  uint32_t node_id_ = std::numeric_limits<uint32_t>::max();
+  /// @brief Queue size in bytes.
+  uint32_t queue_size_bytes_ = std::numeric_limits<uint32_t>::max();
 
  protected:
   bool _IsA(Queue::rtti_t id) const override { return id == &rtti_id_; }
 
  private:
-  core::SharedQueue *CreateSharedQueue(AieAgent *agent, size_t req_size_pkts,
-                                       uint32_t node_id);
-  core::SharedSignal *CreateSharedSignal(AieAgent *agent);
+  AieAgent &agent_;
+
+  /// @brief Base of the queue's ring buffer storage.
+  void *ring_buf_ = nullptr;
+
+  static hsa_status_t SubmitCmd(
+      uint32_t hw_ctx_handle, int fd, void *queue_base,
+      uint64_t read_dispatch_id, uint64_t write_dispatch_id,
+      std::unordered_map<uint32_t, void *> &vmem_handle_mappings);
+
+  /// @brief Creates a command BO and returns a pointer to the memory and
+  /// the corresponding handle
+  ///
+  /// @param size size of memory to allocate
+  /// @param handle A pointer to the BO handle
+  /// @param cmd A pointer to the buffer
+  static hsa_status_t CreateCmd(uint32_t size, uint32_t *handle,
+                                amdxdna_cmd **cmd, int fd);
+
+  /// @brief Adds all BOs in a command packet payload to a vector
+  /// and replaces the handles with a virtual address
+  ///
+  /// @param count Number of entries in the command
+  /// @param bo_args A pointer to a vector that contains all bo handles
+  /// @param cmd_pkt_payload A pointer to the payload of the command
+  static void RegisterCmdBOs(
+      uint32_t count, std::vector<uint32_t> &bo_args,
+      hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload,
+      std::unordered_map<uint32_t, void *> &vmem_handle_mappings);
+
+  /// @brief Syncs all BOs referenced in bo_args
+  ///
+  /// @param bo_args vector containing handles of BOs to sync
+  static hsa_status_t SyncBos(std::vector<uint32_t> &bo_args, int fd);
+
+  /// @brief Executes a command and waits for its completion
+  ///
+  /// @param exec_cmd Structure containing the details of the command to execute
+  /// @param hw_ctx_handle the handle of the hardware context to run this
+  /// command
+  static hsa_status_t ExecCmdAndWait(amdxdna_drm_exec_cmd *exec_cmd,
+                                     uint32_t hw_ctx_handle, int fd);
+
+  /// @brief Handle for an application context on the AIE device.
+  ///
+  /// Each user queue will have an associated context. This handle is assigned
+  /// by the driver on context creation.
+  ///
+  /// TODO: For now we support a single context that allocates all core tiles in
+  /// the array. In the future we can make the number of tiles configurable so
+  /// that multiple workloads with different core tile configurations can
+  /// execute on the AIE agent at the same time.
+  uint32_t hw_ctx_handle_ = std::numeric_limits<uint32_t>::max();
 
-  AieAgent *agent_;
-  /// Indicates if queue is active.
+  /// @brief Indicates if queue is active.
   std::atomic<bool> active_;
 
   static int rtti_id_;
 };
 
@@ -135,4 +219,4 @@ class AieAqlQueue : public core::Queue, public core::DoorbellSignal {
 } // namespace AMD
 } // namespace rocr
 
-#endif // header guard
+#endif // HSA_RUNTIME_CORE_INC_AMD_HW_AQL_AIE_COMMAND_PROCESSOR_H_
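The `amdxdna_cmd` struct declared at the top of this header aliases the packed bitfield with a raw 32-bit `header` word through the union. The layout the queue code relies on can be expressed with shifts, as in this sketch (assuming the usual LSB-first bitfield allocation on little-endian targets, which is what the driver depends on):

#include <cstdint>

// state: bits 0-3, unused: 4-9, extra_cu_masks: 10-11,
// count: 12-22, opcode: 23-27, reserved: 28-31.
inline uint32_t cmd_state(uint32_t header)  { return header & 0xF; }
inline uint32_t cmd_count(uint32_t header)  { return (header >> 12) & 0x7FF; }
inline uint32_t cmd_opcode(uint32_t header) { return (header >> 23) & 0x1F; }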
diff --git a/runtime/hsa-runtime/core/inc/amd_aql_queue.h b/runtime/hsa-runtime/core/inc/amd_aql_queue.h
index 09f14f941..1e0da0549 100644
--- a/runtime/hsa-runtime/core/inc/amd_aql_queue.h
+++ b/runtime/hsa-runtime/core/inc/amd_aql_queue.h
@@ -178,6 +178,17 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
   /// @return uint64_t Value of write index before the update
   uint64_t AddWriteIndexRelease(uint64_t value) override;
 
+  /// @brief Configure the hardware context of a queue.
+  ///
+  /// @param config_type Specify the parameter type. Used to interpret @p args.
+  ///
+  /// @param args Queue-specific args for configuring the hardware context.
+  hsa_status_t ConfigHwCtx(hsa_amd_queue_hw_ctx_config_param_t config_type,
+                           void *args) override {
+    // Currently only supported by AIE queues.
+    return HSA_STATUS_ERROR_INVALID_QUEUE;
+  }
+
   /// @brief Set CU Masking
   ///
   /// @param num_cu_mask_count size of mask bit array
diff --git a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
index 608017c11..a7193d18d 100644
--- a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
+++ b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
@@ -234,7 +234,7 @@ class GpuAgent : public GpuAgentInt {
   GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode,
            uint32_t index);
 
   // @brief GPU agent destructor.
-  ~GpuAgent();
+  ~GpuAgent() override;
 
   // @brief Ensure blits are ready (performance hint).
   void PreloadBlits() override;
@@ -507,14 +507,14 @@ class GpuAgent : public GpuAgentInt {
   hsa_status_t EnableDmaProfiling(bool enable) override;
 
   hsa_status_t PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb,
-                                       void* cb_data);
-  hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session);
+                                       void *cb_data) override;
+  hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession &session) override;
   hsa_status_t PcSamplingCreateFromId(HsaPcSamplingTraceId pcsId,
-                                      pcs::PcsRuntime::PcSamplingSession& session);
-  hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session);
-  hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session);
-  hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session);
-  hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session);
+                                      pcs::PcsRuntime::PcSamplingSession &session) override;
+  hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession &session) override;
+  hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession &session) override;
+  hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession &session) override;
+  hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession &session) override;
   hsa_status_t PcSamplingFlushHostTrapDeviceBuffers(pcs::PcsRuntime::PcSamplingSession& session);
 
   static void PcSamplingThreadRun(void* agent);
@@ -793,4 +793,4 @@ class GpuAgent : public GpuAgentInt {
 } // namespace amd
 } // namespace rocr
 
-#endif // header guard
+#endif // HSA_RUNTIME_CORE_INC_AMD_GPU_AGENT_H_
diff --git a/runtime/hsa-runtime/core/inc/amd_kfd_driver.h b/runtime/hsa-runtime/core/inc/amd_kfd_driver.h
index 1939c0511..bd6f376fb 100644
--- a/runtime/hsa-runtime/core/inc/amd_kfd_driver.h
+++ b/runtime/hsa-runtime/core/inc/amd_kfd_driver.h
@@ -66,7 +66,9 @@ class KfdDriver : public core::Driver {
 
   static hsa_status_t DiscoverDriver();
 
+  hsa_status_t Init() override;
   hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override;
+  hsa_status_t GetAgentProperties(core::Agent &agent) const override;
 
   hsa_status_t GetMemoryProperties(uint32_t node_id,
                                    core::MemoryRegion &mem_region) const override;
@@ -75,8 +77,12 @@ class KfdDriver : public core::Driver {
                               void **mem, size_t size, uint32_t node_id) override;
 
   hsa_status_t FreeMemory(void *mem, size_t size) override;
-  hsa_status_t CreateQueue(core::Queue &queue) override;
+  hsa_status_t CreateQueue(core::Queue &queue) const override;
   hsa_status_t DestroyQueue(core::Queue &queue) const override;
+  hsa_status_t ConfigHwCtx(core::Queue &queue,
+                           hsa_amd_queue_hw_ctx_config_param_t config_type,
+                           void *args) override;
+  hsa_status_t GetHandleFromVaddr(void* ptr, uint32_t* handle) override;
 
  private:
   /// @brief Allocate agent accessible memory (system / local memory).
diff --git a/runtime/hsa-runtime/core/inc/amd_memory_region.h b/runtime/hsa-runtime/core/inc/amd_memory_region.h
index b052d5c38..bb6b76dae 100644
--- a/runtime/hsa-runtime/core/inc/amd_memory_region.h
+++ b/runtime/hsa-runtime/core/inc/amd_memory_region.h
@@ -143,7 +143,8 @@ class MemoryRegion : public core::MemoryRegion {
   }
 
   __forceinline bool IsSystem() const {
-    return mem_props_.HeapType == HSA_HEAPTYPE_SYSTEM;
+    return ((mem_props_.HeapType == HSA_HEAPTYPE_SYSTEM) ||
+            (mem_props_.HeapType == HSA_HEAPTYPE_DEVICE_SVM));
   }
 
   __forceinline bool IsLDS() const {
diff --git a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h
index c45b33b11..79cbaa710 100644
--- a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h
+++ b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h
@@ -43,9 +43,11 @@
 #define HSA_RUNTIME_CORE_INC_AMD_XDNA_DRIVER_H_
 
 #include <string>
+#include <unordered_map>
 
 #include "core/inc/driver.h"
 #include "core/inc/memory_region.h"
+#include "core/driver/xdna/uapi/amdxdna_accel.h"
 
 namespace rocr {
 namespace core {
@@ -58,10 +60,20 @@ class XdnaDriver : public core::Driver {
  public:
   XdnaDriver() = delete;
   XdnaDriver(std::string devnode_name);
+  ~XdnaDriver();
 
   static hsa_status_t DiscoverDriver();
 
+  /// @brief Returns the size of the dev heap in bytes.
+  static uint64_t GetDevHeapByteSize();
+
+  hsa_status_t Init() override;
   hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override;
 
+  hsa_status_t GetHandleMappings(
+      std::unordered_map<uint32_t, void *> &vmem_handle_mappings);
+  hsa_status_t GetFd(int &fd);
+
+  hsa_status_t GetAgentProperties(core::Agent &agent) const override;
   hsa_status_t GetMemoryProperties(uint32_t node_id,
                                    core::MemoryRegion &mem_region) const override;
 
@@ -70,11 +82,54 @@ class XdnaDriver : public core::Driver {
                               void **mem, size_t size, uint32_t node_id) override;
 
   hsa_status_t FreeMemory(void *mem, size_t size) override;
-  hsa_status_t CreateQueue(core::Queue &queue) override;
+
+  /// @brief Creates a context on the AIE device for this queue.
+  /// @param queue Queue whose on-device context is being created.
+  /// @return hsa_status_t
+  hsa_status_t CreateQueue(core::Queue &queue) const override;
   hsa_status_t DestroyQueue(core::Queue &queue) const override;
 
+  hsa_status_t ConfigHwCtx(core::Queue &queue,
+                           hsa_amd_queue_hw_ctx_config_param_t config_type,
+                           void *args) override;
+
+  hsa_status_t GetHandleFromVaddr(void* ptr, uint32_t* handle) override;
+
  private:
   hsa_status_t QueryDriverVersion();
+
+  /// @brief Allocate device accessible heap space.
+  ///
+  /// Allocate and map a buffer object (BO) that the AIE device can access.
+  hsa_status_t InitDeviceHeap();
+  hsa_status_t FreeDeviceHeap();
+
+  /// @brief Configures the CUs associated with the HW context for this queue.
+  ///
+  /// @param config_cu_param CU configuration information.
+  hsa_status_t
+  ConfigHwCtxCU(core::Queue &queue,
+                hsa_amd_aie_ert_hw_ctx_config_cu_param_t &config_cu_param);
+
+  /// TODO: Probably remove this in the future and rely on the core Runtime
+  /// object to track handle allocations. Using the VMEM API for mapping XDNA
+  /// driver handles requires a bit more refactoring. So rely on the XDNA driver
+  /// to manage some of this for now.
+  std::unordered_map<uint32_t, void *> vmem_handle_mappings;
+
+  // TODO: Remove this once we move to the vmem API
+  std::unordered_map<void *, uint32_t> vmem_handle_mappings_reverse;
+
+  /// @brief Virtual address range allocated for the device heap.
+  ///
+  /// Allocate a large enough space so we can carve out the device heap in
+  /// this range and ensure it is aligned to 64MB. Currently, AIE2 supports
+  /// a 48MB device heap that must be aligned to 64MB.
+  void *dev_heap_parent = nullptr;
+
+  /// @brief The aligned device heap.
+  void *dev_heap_aligned = nullptr;
+
+  static constexpr size_t dev_heap_size = 48 * 1024 * 1024;
+  static constexpr size_t dev_heap_align = 64 * 1024 * 1024;
 };
 
 } // namespace AMD
diff --git a/runtime/hsa-runtime/core/inc/driver.h b/runtime/hsa-runtime/core/inc/driver.h
index 8c22a39b0..580a12ccb 100644
--- a/runtime/hsa-runtime/core/inc/driver.h
+++ b/runtime/hsa-runtime/core/inc/driver.h
@@ -48,6 +48,7 @@
 #include "core/inc/memory_region.h"
 
 #include "inc/hsa.h"
+#include "inc/hsa_ext_amd.h"
 
 namespace rocr {
 namespace core {
@@ -74,6 +75,9 @@ class Driver {
   Driver(DriverType kernel_driver_type, std::string devnode_name);
   virtual ~Driver() = default;
 
+  /// @brief Initialize the driver's state after opening.
+  virtual hsa_status_t Init() = 0;
+
   /// @brief Query the kernel-model driver.
   /// @retval HSA_STATUS_SUCCESS if the kernel-model driver query was
   /// successful.
@@ -91,6 +95,13 @@ class Driver {
   /// @retval DriverVersionInfo containing the driver's version information.
   const DriverVersionInfo &Version() const { return version_; }
 
+  /// @brief Get the properties of a specific agent and initialize the agent
+  /// object.
+  /// @param agent Agent whose properties we're getting.
+  /// @retval HSA_STATUS_SUCCESS if the driver successfully returns the agent's
+  /// properties.
+  virtual hsa_status_t GetAgentProperties(Agent &agent) const = 0;
+
   /// @brief Get the memory properties of a specific node.
   /// @param node_id Node ID of the agent
   /// @param[in, out] mem_region MemoryRegion object whose properties will be
@@ -113,10 +124,22 @@ class Driver {
 
   virtual hsa_status_t FreeMemory(void *mem, size_t size) = 0;
 
-  virtual hsa_status_t CreateQueue(Queue &queue) = 0;
+  virtual hsa_status_t CreateQueue(Queue &queue) const = 0;
 
   virtual hsa_status_t DestroyQueue(Queue &queue) const = 0;
 
+  /// @brief Configure the hardware context for a queue.
+  /// @param[in] queue The queue whose context is being configured.
+  /// @param[in] config_type Type for the @p args argument. Tells the driver
+  /// how to interpret the args.
+  /// @param[in] args Arguments for configuring the queue's hardware context.
+  /// @p config_type tells how to interpret args.
+  virtual hsa_status_t
+  ConfigHwCtx(Queue &queue, hsa_amd_queue_hw_ctx_config_param_t config_type,
+              void *args) = 0;
+
+  virtual hsa_status_t GetHandleFromVaddr(void* ptr, uint32_t* handle) = 0;
+
   /// Unique identifier for supported kernel-mode drivers.
   const DriverType kernel_driver_type_;
 
diff --git a/runtime/hsa-runtime/core/inc/host_queue.h b/runtime/hsa-runtime/core/inc/host_queue.h
index ce0bfbbcc..3ec1f3daf 100644
--- a/runtime/hsa-runtime/core/inc/host_queue.h
+++ b/runtime/hsa-runtime/core/inc/host_queue.h
@@ -144,6 +144,12 @@ class HostQueue : public Queue {
                   std::memory_order_release);
   }
 
+  hsa_status_t ConfigHwCtx(hsa_amd_queue_hw_ctx_config_param_t config_type,
+                           void *args) override {
+    // Currently only supported by AIE queues.
+    return HSA_STATUS_ERROR_INVALID_QUEUE;
+  }
+
   hsa_status_t SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) override {
     return HSA_STATUS_ERROR_INVALID_QUEUE;
   }
diff --git a/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h b/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h
index 5109d3976..d10300e89 100644
--- a/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h
+++ b/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h
@@ -108,6 +108,15 @@ uint32_t
                             hsa_wait_state_t wait_hint,
                             hsa_signal_value_t* satisfying_value);
 
+// Mirrors AMD Extension APIs.
+hsa_status_t
+hsa_amd_queue_hw_ctx_config(const hsa_queue_t *queue,
+                            hsa_amd_queue_hw_ctx_config_param_t config_type,
+                            void *args);
+
+// Mirrors AMD Extension APIs.
+hsa_status_t hsa_amd_get_handle_from_vaddr(void* ptr, uint32_t* handle);
+
 // Mirrors Amd Extension Apis
 hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
                                        uint32_t num_cu_mask_count,
diff --git a/runtime/hsa-runtime/core/inc/intercept_queue.h b/runtime/hsa-runtime/core/inc/intercept_queue.h
index 8088d5e92..0638507c7 100644
--- a/runtime/hsa-runtime/core/inc/intercept_queue.h
+++ b/runtime/hsa-runtime/core/inc/intercept_queue.h
@@ -114,6 +114,12 @@ class QueueWrapper : public Queue {
   uint64_t AddWriteIndexRelease(uint64_t value) override {
     return wrapped->AddWriteIndexRelease(value);
   }
+
+  hsa_status_t ConfigHwCtx(hsa_amd_queue_hw_ctx_config_param_t config_type,
+                           void *args) override {
+    return wrapped->ConfigHwCtx(config_type, args);
+  }
+
   hsa_status_t SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) override {
     return wrapped->SetCUMasking(num_cu_mask_count, cu_mask);
   }
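With the driver hook above and the `Queue::ConfigHwCtx` virtual added below, user code reaches the hardware context through `hsa_amd_queue_hw_ctx_config`. A sketch of configuring one CU, under explicit assumptions: `pdi_bo_handle` is hypothetical, and `cu_configs` is assumed to be an inline array inside the param struct (its element type is not visible in these hunks; only the fields `cu_config_bo` and `cu_func` appear, in `XdnaDriver::ConfigHwCtxCU`):

// Hypothetical usage of the new extension entry point.
hsa_amd_aie_ert_hw_ctx_config_cu_param_t param = {};
param.num_cus = 1;
param.cu_configs[0].cu_config_bo = pdi_bo_handle; // BO holding the PDI
param.cu_configs[0].cu_func = 0;                  // function index inside the PDI

hsa_status_t r = hsa_amd_queue_hw_ctx_config(
    queue, HSA_AMD_QUEUE_AIE_ERT_HW_CXT_CONFIG_CU, &param);
assert(r == HSA_STATUS_SUCCESS);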
diff --git a/runtime/hsa-runtime/core/inc/queue.h b/runtime/hsa-runtime/core/inc/queue.h
index 43bbe13e7..8979277ed 100644
--- a/runtime/hsa-runtime/core/inc/queue.h
+++ b/runtime/hsa-runtime/core/inc/queue.h
@@ -330,6 +330,14 @@ class Queue : public Checked<0xFA3906A679F9DB49>, private LocalQueue {
   /// @return uint64_t Value of write index before the update
   virtual uint64_t AddWriteIndexRelease(uint64_t value) = 0;
 
+  /// @brief Configure the hardware context of a queue.
+  ///
+  /// @param config_type Specify the parameter type. Used to interpret @p args.
+  ///
+  /// @param args Queue-specific args for configuring the hardware context.
+  virtual hsa_status_t
+  ConfigHwCtx(hsa_amd_queue_hw_ctx_config_param_t config_type, void *args) = 0;
+
   /// @brief Set CU Masking
   ///
   /// @param num_cu_mask_count size of mask bit array
diff --git a/runtime/hsa-runtime/core/inc/runtime.h b/runtime/hsa-runtime/core/inc/runtime.h
index 981bd4852..3e3f48294 100644
--- a/runtime/hsa-runtime/core/inc/runtime.h
+++ b/runtime/hsa-runtime/core/inc/runtime.h
@@ -403,6 +403,8 @@ class Runtime {
                                  const core::MemoryRegion** mem_region,
                                  hsa_amd_memory_type_t* type);
 
+  hsa_status_t GetHandleFromVaddr(void* ptr, uint32_t* handle);
+
   hsa_status_t EnableLogging(uint8_t* flags, void* file);
 
   const std::vector<Agent*>& cpu_agents() { return cpu_agents_; }
diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp
index b98de4da2..4bce61323 100644
--- a/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp
@@ -3,7 +3,7 @@
 // The University of Illinois/NCSA
 // Open Source License (NCSA)
 //
-// Copyright (c) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2024, Advanced Micro Devices, Inc. All rights reserved.
 //
 // Developed by:
 //
@@ -42,17 +42,23 @@
 
 #include "core/inc/amd_aie_agent.h"
 
+#include
+
 #include "core/inc/amd_aie_aql_queue.h"
+#include "core/inc/amd_memory_region.h"
+#include "core/inc/amd_xdna_driver.h"
 #include "core/inc/driver.h"
+#include "core/inc/runtime.h"
 
 namespace rocr {
 namespace AMD {
 
 AieAgent::AieAgent(uint32_t node)
     : core::Agent(core::DriverType::XDNA, node,
-                  core::Agent::DeviceType::kAmdAieDevice),
-      max_queues_(core::Runtime::runtime_singleton_->flag().max_queues()) {
+                  core::Agent::DeviceType::kAmdAieDevice) {
   InitRegionList();
+  InitAllocators();
+  GetAgentProperties();
 }
 
 AieAgent::~AieAgent() {
@@ -81,7 +87,8 @@ hsa_status_t AieAgent::IterateRegion(
 hsa_status_t AieAgent::IterateCache(hsa_status_t (*callback)(hsa_cache_t cache,
                                                              void *data),
                                     void *data) const {
-  return HSA_STATUS_SUCCESS;
+  // AIE has no caches.
+  return HSA_STATUS_ERROR_INVALID_CACHE;
 }
 
 hsa_status_t AieAgent::GetInfo(hsa_agent_info_t attribute, void *value) const {
@@ -104,25 +111,41 @@ hsa_status_t AieAgent::GetInfo(hsa_agent_info_t attribute, void *value) const {
   case HSA_AGENT_INFO_MACHINE_MODEL:
     *reinterpret_cast<hsa_machine_model_t *>(value) = HSA_MACHINE_MODEL_LARGE;
     break;
-  case HSA_AGENT_INFO_PROFILE:
+  case HSA_AGENT_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES:
+  case HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE:
+    // TODO: validate if this is true.
+    *reinterpret_cast<hsa_default_float_rounding_mode_t *>(value) =
+        HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR;
+    break;
+  case HSA_AGENT_INFO_PROFILE:
     *reinterpret_cast<hsa_profile_t *>(value) = profile_;
     break;
   case HSA_AGENT_INFO_WAVEFRONT_SIZE:
+    *reinterpret_cast<uint32_t *>(value) = 0;
+    break;
   case HSA_AGENT_INFO_WORKGROUP_MAX_DIM:
+    std::memset(value, 0, sizeof(uint16_t) * 3);
+    break;
   case HSA_AGENT_INFO_WORKGROUP_MAX_SIZE:
+    *reinterpret_cast<uint32_t *>(value) = 0;
+    break;
   case HSA_AGENT_INFO_GRID_MAX_DIM:
+    std::memset(value, 0, sizeof(hsa_dim3_t));
+    break;
   case HSA_AGENT_INFO_GRID_MAX_SIZE:
+    *reinterpret_cast<uint32_t *>(value) = 0;
+    break;
   case HSA_AGENT_INFO_FBARRIER_MAX_SIZE:
     *reinterpret_cast<uint32_t *>(value) = 0;
     break;
   case HSA_AGENT_INFO_QUEUES_MAX:
-    *reinterpret_cast<uint32_t *>(value) = maxQueues_;
+    *reinterpret_cast<uint32_t *>(value) = max_queues_;
     break;
   case HSA_AGENT_INFO_QUEUE_MIN_SIZE:
-    *reinterpret_cast<uint32_t *>(value) = minAqlSize_;
+    *reinterpret_cast<uint32_t *>(value) = min_aql_size_;
     break;
   case HSA_AGENT_INFO_QUEUE_MAX_SIZE:
-    *reinterpret_cast<uint32_t *>(value) = maxAqlSize_;
+    *reinterpret_cast<uint32_t *>(value) = max_aql_size_;
     break;
   case HSA_AGENT_INFO_QUEUE_TYPE:
     *reinterpret_cast<hsa_queue_type32_t *>(value) = HSA_QUEUE_TYPE_SINGLE;
@@ -142,6 +165,36 @@ hsa_status_t AieAgent::GetInfo(hsa_agent_info_t attribute, void *value) const {
   case HSA_AGENT_INFO_VERSION_MINOR:
     *reinterpret_cast<uint16_t *>(value) = 0;
     break;
+  case HSA_AMD_AGENT_INFO_CHIP_ID:
+    *reinterpret_cast<uint32_t *>(value) = 0;
+    break;
+  case HSA_AMD_AGENT_INFO_CACHELINE_SIZE:
+    *reinterpret_cast<uint32_t *>(value) = 0;
+    break;
+  case HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT:
+    *reinterpret_cast<uint32_t *>(value) = 0;
+    break;
+  case HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY:
+    *reinterpret_cast<uint32_t *>(value) = 0;
+    break;
+  case HSA_AMD_AGENT_INFO_DRIVER_NODE_ID:
+    *reinterpret_cast<uint32_t *>(value) = node_id();
+    break;
+  case HSA_AMD_AGENT_INFO_MAX_ADDRESS_WATCH_POINTS:
+    *reinterpret_cast<uint32_t *>(value) = 0;
+    break;
+  case HSA_AMD_AGENT_INFO_BDFID:
+    *reinterpret_cast<uint32_t *>(value) = 0;
+    break;
+  case HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU:
+    *reinterpret_cast<uint32_t *>(value) = 0;
+    break;
+  case HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES:
+    *reinterpret_cast<uint32_t *>(value) = 0;
+    break;
+  case HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE:
+    *reinterpret_cast<uint32_t *>(value) = 0;
+    break;
   case HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS:
   case HSA_EXT_AGENT_INFO_IMAGE_1DA_MAX_ELEMENTS:
   case HSA_EXT_AGENT_INFO_IMAGE_1DB_MAX_ELEMENTS:
@@ -158,6 +211,68 @@ hsa_status_t AieAgent::GetInfo(hsa_agent_info_t attribute, void *value) const {
     std::strcpy(reinterpret_cast<char *>(value), product_name_info_.c_str());
     break;
   }
+  case HSA_AMD_AGENT_INFO_UUID: {
+    // TODO: uuid_value needs to be obtained somehow
+    uint64_t uuid_value = 0;
+
+    if (uuid_value == 0) {
+      static const char uuid_tmp[] = "AIE-XX";
+      snprintf(static_cast<char *>(value), sizeof(uuid_tmp), "%s", uuid_tmp);
+      break;
+    }
+
+    // Device supports UUID, build UUID string to return.
+    constexpr std::size_t max_uuid_length = 36;
+    static const char uuid_tmp[] = "AIE-";
+    snprintf(static_cast<char *>(value), max_uuid_length + sizeof(uuid_tmp),
+             "%s%036lX", uuid_tmp, uuid_value);
+    break;
+  }
+  case HSA_AMD_AGENT_INFO_ASIC_REVISION:
+    *reinterpret_cast<uint32_t *>(value) = 0;
+    break;
+  case HSA_AMD_AGENT_INFO_SVM_DIRECT_HOST_ACCESS:
+    // Requires the AIE agent's regions to be populated first.
+    assert(regions_.size() != 0 && "No device local memory found!");
+    *reinterpret_cast<bool *>(value) = true;
+    break;
+  case HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY:
+    return core::Runtime::runtime_singleton_->GetSystemInfo(
+        HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, value);
+    break;
+  case HSA_AMD_AGENT_INFO_ASIC_FAMILY_ID:
+    *static_cast<uint32_t *>(value) = 0;
+    break;
+  case HSA_AMD_AGENT_INFO_UCODE_VERSION:
+    *static_cast<uint32_t *>(value) = 0;
+    break;
+  case HSA_AMD_AGENT_INFO_SDMA_UCODE_VERSION:
+    *static_cast<uint32_t *>(value) = 0;
+    break;
+  case HSA_AMD_AGENT_INFO_NUM_SDMA_ENG:
+    *static_cast<uint32_t *>(value) = 0;
+    break;
+  case HSA_AMD_AGENT_INFO_NUM_SDMA_XGMI_ENG:
+    *static_cast<uint32_t *>(value) = 0;
+    break;
+  case HSA_AMD_AGENT_INFO_IOMMU_SUPPORT:
+    *static_cast<hsa_amd_iommu_version_t *>(value) = HSA_IOMMU_SUPPORT_NONE;
+    break;
+  case HSA_AMD_AGENT_INFO_NUM_XCC:
+    *static_cast<uint32_t *>(value) = 0;
+    break;
+  case HSA_AMD_AGENT_INFO_DRIVER_UID:
+    *static_cast<uint32_t *>(value) = 0;
+    break;
+  case HSA_AMD_AGENT_INFO_NEAREST_CPU:
+    static_cast<hsa_agent_t *>(value)->handle = 0;
+    break;
+  case HSA_AMD_AGENT_INFO_MEMORY_PROPERTIES:
+    memset(value, 0, sizeof(uint8_t) * 8);
+    break;
+  case HSA_AMD_AGENT_INFO_AQL_EXTENSIONS:
+    memset(value, 0, sizeof(uint8_t) * 8);
+    break;
   default:
     *reinterpret_cast<uint32_t *>(value) = 0;
     return HSA_STATUS_ERROR_INVALID_ARGUMENT;
@@ -175,7 +290,7 @@ hsa_status_t AieAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type,
     return HSA_STATUS_ERROR_INVALID_ARGUMENT;
   }
 
-  if (size < minAqlSize_ || size > maxAqlSize_) {
+  if (size < min_aql_size_ || size > max_aql_size_) {
     return HSA_STATUS_ERROR_INVALID_ARGUMENT;
   }
 
@@ -185,7 +300,57 @@ hsa_status_t AieAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type,
   return HSA_STATUS_SUCCESS;
 }
 
-void AieAgent::InitRegionList() {}
+void AieAgent::InitRegionList() {
+  /// TODO: Find a way to set the other memory properties in a reasonable way.
+  /// This should be easier once the ROCt source is incorporated into the
+  /// ROCr source. Since the AIE itself currently has no memory regions of
+  /// its own all memory is just the system DRAM.
+
+  /// For allocating kernel arguments or other objects that only need
+  /// system memory.
+  HsaMemoryProperties sys_mem_props = {};
+  sys_mem_props.HeapType = HSA_HEAPTYPE_SYSTEM;
+
+  /// For allocating memory for programmable device image (PDI) files. These
+  /// need to be mapped to the device so the hardware can access the PDIs.
+  HsaMemoryProperties dev_mem_props = {};
+  dev_mem_props.HeapType = HSA_HEAPTYPE_DEVICE_SVM;
+  dev_mem_props.SizeInBytes = XdnaDriver::GetDevHeapByteSize();
+
+  /// As of now the AIE devices support coarse-grain memory regions that
+  /// require explicit sync operations.
+  regions_.reserve(2);
+  regions_.push_back(
+      new MemoryRegion(false, true, false, false, true, this, sys_mem_props));
+  regions_.push_back(
+      new MemoryRegion(false, false, false, false, true, this, dev_mem_props));
+}
+
+void AieAgent::GetAgentProperties() {
+  core::Runtime::runtime_singleton_->AgentDriver(driver_type)
+      .GetAgentProperties(*this);
+}
+
+void AieAgent::InitAllocators() {
+  for (const auto *region : regions()) {
+    const MemoryRegion *amd_mem_region(
+        static_cast<const MemoryRegion *>(region));
+    if (amd_mem_region->kernarg()) {
+      system_allocator_ =
+          [region](size_t size, size_t align,
+                   core::MemoryRegion::AllocateFlags alloc_flags) -> void * {
+        void *mem(nullptr);
+        return (core::Runtime::runtime_singleton_->AllocateMemory(
+                    region, size, alloc_flags, &mem) == HSA_STATUS_SUCCESS)
+                   ? mem
+                   : nullptr;
+      };
+
+      system_deallocator_ = [](void *ptr) {
+        core::Runtime::runtime_singleton_->FreeMemory(ptr);
+      };
+
+      break;
+    }
+  }
+}
 
 } // namespace AMD
 } // namespace rocr
diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp
index 166415d07..283b5af60 100644
--- a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp
@@ -41,28 +41,51 @@
 ////////////////////////////////////////////////////////////////////////////////
 
 #include "core/inc/amd_aie_aql_queue.h"
+#include "core/inc/amd_xdna_driver.h"
 
 #ifdef __linux__
 #include
 #include
 #include
-#include
-#include
+#include
 #endif
 
 #ifdef _WIN32
 #include
 #endif
 
-#include
-#include
-#include
+#include
 
 #include "core/inc/queue.h"
 #include "core/inc/runtime.h"
 #include "core/inc/signal.h"
 #include "core/util/utils.h"
 
+// The number of arguments in the packet payload before we start passing
+// operands
+constexpr int NON_OPERAND_COUNT = 6;
+
+// Used to transform an address into a device address
+constexpr int DEV_ADDR_BASE = 0x04000000;
+constexpr int DEV_ADDR_OFFSET_MASK = 0x02FFFFFF;
+
+// The driver places a structure before each command in a command chain.
+// Need to increase the size of the command by the size of this structure.
+// In the following xdna driver sources you can see where this is implemented:
+// Commit hash: eddd92c0f61592c576a500f16efa24eb23667c23
+// https://github.com/amd/xdna-driver/blob/main/src/driver/amdxdna/aie2_msg_priv.h#L387-L391
+// https://github.com/amd/xdna-driver/blob/main/src/driver/amdxdna/aie2_message.c#L637
+constexpr int CMD_COUNT_SIZE_INCREASE = 3;
+
+// Index of command payload where the instruction sequence
+// address is located
+constexpr int CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX = 2;
+
+// Environment variable to define job submission timeout
+constexpr const char *TIMEOUT_ENV_VAR = "ROCR_AIE_TIMEOUT";
+constexpr int DEFAULT_TIMEOUT_VAL = 50;
+char *timeout_env_var_ptr = getenv(TIMEOUT_ENV_VAR);
+int timeout_val = timeout_env_var_ptr == nullptr ? DEFAULT_TIMEOUT_VAL
+                                                 : atoi(timeout_env_var_ptr);
+
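The submission timeout can thus be tuned per run, e.g. `ROCR_AIE_TIMEOUT=500 ./aie_hsa_dispatch_test <src-dir>`; unset, it stays at 50 (the units are whatever the driver's wait ioctl expects, presumably milliseconds). One caveat of plain `atoi()` is that non-numeric input silently becomes 0. A slightly more defensive variant, as a sketch that is not part of this patch:

#include <cstdlib>

int parse_timeout_env() {
  const char *s = std::getenv("ROCR_AIE_TIMEOUT");
  if (s == nullptr) return DEFAULT_TIMEOUT_VAL;
  char *end = nullptr;
  long v = std::strtol(s, &end, 10);
  // Fall back to the default on garbage or non-positive values.
  return (end == s || v <= 0) ? DEFAULT_TIMEOUT_VAL : static_cast<int>(v);
}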
+
 namespace rocr {
 namespace AMD {
 
@@ -70,23 +93,60 @@ int AieAqlQueue::rtti_id_ = 0;
 
 AieAqlQueue::AieAqlQueue(AieAgent *agent, size_t req_size_pkts,
                          uint32_t node_id)
-    : Queue(0, 0), DoorbellSignal(CreateSharedSignal(agent)), agent_(agent),
-      active_(false) {
+    : Queue(0, 0), LocalSignal(0, false), DoorbellSignal(signal()),
+      agent_(*agent), active_(false) {
+  if (agent_.device_type() != core::Agent::DeviceType::kAmdAieDevice) {
+    throw AMD::hsa_exception(
+        HSA_STATUS_ERROR_INVALID_AGENT,
+        "Attempting to create an AIE queue on a non-AIE agent.");
+  }
+  queue_size_bytes_ = req_size_pkts * sizeof(core::AqlPacket);
+  ring_buf_ = agent_.system_allocator()(queue_size_bytes_, 4096,
+                                        core::MemoryRegion::AllocateNoFlags);
+
+  if (!ring_buf_) {
+    throw AMD::hsa_exception(
+        HSA_STATUS_ERROR_INVALID_QUEUE_CREATION,
+        "Could not allocate a ring buffer for an AIE queue.");
+  }
+
+  // Populate hsa_queue_t fields.
+  amd_queue_.hsa_queue.type = HSA_QUEUE_TYPE_SINGLE;
+  amd_queue_.hsa_queue.id = INVALID_QUEUEID;
   amd_queue_.hsa_queue.doorbell_signal = Signal::Convert(this);
-  amd_queue_.hsa_queue.size = 0x40;
+  amd_queue_.hsa_queue.size = req_size_pkts;
+  amd_queue_.hsa_queue.base_address = ring_buf_;
 
+  // Populate AMD queue fields.
+  amd_queue_.write_dispatch_id = 0;
+  amd_queue_.read_dispatch_id = 0;
 
-  signal_.hardware_doorbell_ptr =
-      reinterpret_cast<uint64_t *>(hardware_doorbell_ptr_);
+  signal_.hardware_doorbell_ptr = nullptr;
   signal_.kind = AMD_SIGNAL_KIND_DOORBELL;
   signal_.queue_ptr = &amd_queue_;
 
   active_ = true;
+
+  core::Runtime::runtime_singleton_->AgentDriver(agent_.driver_type)
+      .CreateQueue(*this);
 }
 
-AieAqlQueue::~AieAqlQueue() { Inactivate(); }
+AieAqlQueue::~AieAqlQueue() {
+  AieAqlQueue::Inactivate();
+  if (ring_buf_) {
+    agent_.system_deallocator()(ring_buf_);
+  }
+}
 
 hsa_status_t AieAqlQueue::Inactivate() {
   bool active(active_.exchange(false, std::memory_order_relaxed));
-  return HSA_STATUS_SUCCESS;
+  hsa_status_t status(HSA_STATUS_SUCCESS);
+
+  if (active) {
+    status = core::Runtime::runtime_singleton_->AgentDriver(agent_.driver_type)
+                 .DestroyQueue(*this);
+    hw_ctx_handle_ = std::numeric_limits<uint32_t>::max();
+  }
+
+  return status;
 }
 
 hsa_status_t AieAqlQueue::SetPriority(HSA_QUEUE_PRIORITY priority) {
@@ -163,8 +223,258 @@ uint64_t AieAqlQueue::AddWriteIndexAcqRel(uint64_t value) {
 }
 
 void AieAqlQueue::StoreRelaxed(hsa_signal_value_t value) {
-  atomic::Store(signal_.hardware_doorbell_ptr, uint64_t(value),
-                std::memory_order_release);
+  std::unordered_map<uint32_t, void *> vmem_handle_mappings;
+
+  auto &driver = static_cast<XdnaDriver &>(
+      core::Runtime::runtime_singleton_->AgentDriver(agent_.driver_type));
+  if (driver.GetHandleMappings(vmem_handle_mappings) != HSA_STATUS_SUCCESS) {
+    return;
+  }
+
+  int fd = 0;
+  if (driver.GetFd(fd) != HSA_STATUS_SUCCESS) {
+    return;
+  }
+
+  SubmitCmd(hw_ctx_handle_, fd, amd_queue_.hsa_queue.base_address,
+            amd_queue_.read_dispatch_id, amd_queue_.write_dispatch_id,
+            vmem_handle_mappings);
+}
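
StoreRelaxed is the doorbell hook, so an ordinary HSA client submission is what drives SubmitCmd. The sketch below is standard user code, not part of this file:

// Reserve a slot, fill the packet, then ring the doorbell; the doorbell
// store lands in AieAqlQueue::StoreRelaxed above and triggers SubmitCmd.
uint64_t wr_idx = hsa_queue_add_write_index_relaxed(queue, 1);
uint64_t slot = wr_idx % queue->size;
// ... write an hsa_amd_aie_ert_packet_t into slot `slot` of queue->base_address ...
hsa_signal_store_relaxed(queue->doorbell_signal, wr_idx);
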
+
+hsa_status_t AieAqlQueue::SyncBos(std::vector<uint32_t> &bo_args, int fd) {
+  for (unsigned int bo_arg : bo_args) {
+    amdxdna_drm_sync_bo sync_params = {};
+    sync_params.handle = bo_arg;
+    if (ioctl(fd, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_params))
+      return HSA_STATUS_ERROR;
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+hsa_status_t AieAqlQueue::ExecCmdAndWait(amdxdna_drm_exec_cmd *exec_cmd,
+                                         uint32_t hw_ctx_handle, int fd) {
+  // Submit the cmd
+  if (ioctl(fd, DRM_IOCTL_AMDXDNA_EXEC_CMD, exec_cmd))
+    return HSA_STATUS_ERROR;
+
+  // Wait for the command to finish, bounded by the configured timeout.
+  amdxdna_drm_wait_cmd wait_cmd = {};
+  wait_cmd.hwctx = hw_ctx_handle;
+  wait_cmd.timeout = timeout_val;
+  wait_cmd.seq = exec_cmd->seq;
+
+  if (ioctl(fd, DRM_IOCTL_AMDXDNA_WAIT_CMD, &wait_cmd))
+    return HSA_STATUS_ERROR;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+void AieAqlQueue::RegisterCmdBOs(
+    uint32_t count, std::vector<uint32_t> &bo_args,
+    hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload,
+    std::unordered_map<uint32_t, void *> &vmem_handle_mappings) {
+  // This is the index where the operand addresses start in a command
+  const int operand_starting_index = 5;
+
+  // Count the operands in the command payload. Each operand is 64 bits
+  // (two 32-bit payload fields), so divide the remaining fields by two.
+  uint32_t num_operands = (count - NON_OPERAND_COUNT) / 2;
+
+  // Keep track of the handles before we submit the packet
+  bo_args.push_back(
+      cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX]);
+
+  // Walk all operands in the command, recording each handle and rewriting
+  // the handle pair into the operand's virtual address. Operands start at
+  // `operand_starting_index` and occupy two 32-bit fields each, so the loop
+  // steps by two fields per operand.
+  for (uint32_t operand_iter = 0; operand_iter < num_operands; operand_iter++) {
+    bo_args.push_back(
+        cmd_pkt_payload->data[operand_starting_index + 2 * operand_iter]);
+    // clang-format off
+    cmd_pkt_payload->data[operand_starting_index + 2 * operand_iter + 1] =
+        (uint64_t)vmem_handle_mappings[cmd_pkt_payload->data[operand_starting_index + 2 * operand_iter]] >> 32 & 0xFFFFFFFF;
+    cmd_pkt_payload->data[operand_starting_index + 2 * operand_iter] =
+        (uint64_t)vmem_handle_mappings[cmd_pkt_payload->data[operand_starting_index + 2 * operand_iter]] & 0xFFFFFFFF;
+    // clang-format on
+  }
+
+  // Transform the instruction sequence address into a device address
+  cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX] =
+      DEV_ADDR_BASE |
+      (reinterpret_cast<uint64_t>(
+           vmem_handle_mappings
+               [cmd_pkt_payload
+                    ->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX]]) &
+       DEV_ADDR_OFFSET_MASK);
+}
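
A worked example of the operand rewrite RegisterCmdBOs performs, splitting one 64-bit virtual address across the two 32-bit payload fields (the address value is arbitrary):

uint64_t va = 0x00007F2A12345678ULL;
uint32_t lo = static_cast<uint32_t>(va & 0xFFFFFFFF);         // -> data[idx]:     0x12345678
uint32_t hi = static_cast<uint32_t>((va >> 32) & 0xFFFFFFFF); // -> data[idx + 1]: 0x00007F2A
// The driver reassembles the address from the two fields:
uint64_t reassembled = (static_cast<uint64_t>(hi) << 32) | lo;
assert(reassembled == va);
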
+
+hsa_status_t AieAqlQueue::CreateCmd(uint32_t size, uint32_t *handle,
+                                    amdxdna_cmd **cmd, int fd) {
+  // Creating the command
+  amdxdna_drm_create_bo create_cmd_bo = {};
+  create_cmd_bo.type = AMDXDNA_BO_CMD;
+  create_cmd_bo.size = size;
+  if (ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_bo))
+    return HSA_STATUS_ERROR;
+
+  amdxdna_drm_get_bo_info cmd_bo_get_bo_info = {};
+  cmd_bo_get_bo_info.handle = create_cmd_bo.handle;
+  if (ioctl(fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_bo_get_bo_info))
+    return HSA_STATUS_ERROR;
+
+  *cmd = static_cast<amdxdna_cmd *>(mmap(nullptr, create_cmd_bo.size,
+                                         PROT_READ | PROT_WRITE, MAP_SHARED,
+                                         fd, cmd_bo_get_bo_info.map_offset));
+  if (*cmd == MAP_FAILED)
+    return HSA_STATUS_ERROR;
+  *handle = create_cmd_bo.handle;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+hsa_status_t AieAqlQueue::SubmitCmd(
+    uint32_t hw_ctx_handle, int fd, void *queue_base, uint64_t read_dispatch_id,
+    uint64_t write_dispatch_id,
+    std::unordered_map<uint32_t, void *> &vmem_handle_mappings) {
+  uint64_t cur_id = read_dispatch_id;
+  while (cur_id < write_dispatch_id) {
+    hsa_amd_aie_ert_packet_t *pkt =
+        static_cast<hsa_amd_aie_ert_packet_t *>(queue_base) + cur_id;
+
+    // Get the packet header information
+    if (pkt->header.header != HSA_PACKET_TYPE_VENDOR_SPECIFIC ||
+        pkt->header.AmdFormat != HSA_AMD_PACKET_TYPE_AIE_ERT)
+      return HSA_STATUS_ERROR;
+
+    // Get the payload information
+    switch (pkt->opcode) {
+      case HSA_AMD_AIE_ERT_START_CU: {
+        std::vector<uint32_t> bo_args;
+        std::vector<uint32_t> cmd_handles;
+        std::vector<uint32_t> cmd_sizes;
+        std::vector<amdxdna_cmd *> cmds;
+
+        // Peek at the following packets and count how many contiguous
+        // HSA_AMD_AIE_ERT_START_CU packets there are. All of them can be
+        // combined into a single chain.
+        int num_cont_start_cu_pkts = 1;
+        for (uint64_t peek_pkt_id = cur_id + 1; peek_pkt_id < write_dispatch_id;
+             peek_pkt_id++) {
+          hsa_amd_aie_ert_packet_t *peek_pkt =
+              static_cast<hsa_amd_aie_ert_packet_t *>(queue_base) + peek_pkt_id;
+          if (peek_pkt->opcode != HSA_AMD_AIE_ERT_START_CU) {
+            break;
+          }
+          num_cont_start_cu_pkts++;
+        }
+
+        // Iterate over the contiguous HSA_AMD_AIE_ERT_START_CU packets and
+        // build one command per packet.
+        for (uint64_t pkt_iter = cur_id;
+             pkt_iter < cur_id + num_cont_start_cu_pkts; pkt_iter++) {
+          // Getting the current command packet
+          hsa_amd_aie_ert_packet_t *pkt =
+              static_cast<hsa_amd_aie_ert_packet_t *>(queue_base) + pkt_iter;
+          hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload =
+              reinterpret_cast<hsa_amd_aie_ert_start_kernel_data_t *>(
+                  pkt->payload_data);
+
+          // Add the handles for all of the BOs to bo_args as well as rewrite
+          // the command payload handles to contain the actual virtual addresses
+          RegisterCmdBOs(pkt->count, bo_args, cmd_pkt_payload,
+                         vmem_handle_mappings);
+
+          // Creating a packet that contains the command to execute the kernel
+          uint32_t cmd_bo_handle = 0;
+          amdxdna_cmd *cmd = nullptr;
+          uint32_t cmd_size = sizeof(amdxdna_cmd) + pkt->count * sizeof(uint32_t);
+          if (CreateCmd(cmd_size, &cmd_bo_handle, &cmd, fd))
+            return HSA_STATUS_ERROR;
+
+          // Filling in the fields of the command
+          cmd->state = pkt->state;
+          cmd->extra_cu_masks = 0;
+
+          // The driver places a structure before each command in a command
+          // chain, so grow the reported size of the command accordingly.
+          cmd->count = pkt->count + CMD_COUNT_SIZE_INCREASE;
+          cmd->opcode = pkt->opcode;
+          cmd->data[0] = cmd_pkt_payload->cu_mask;
+          memcpy((cmd->data + 1), cmd_pkt_payload->data,
+                 pkt->count * sizeof(uint32_t));
+
+          // Keeping track of the handle
+          cmd_handles.push_back(cmd_bo_handle);
+          cmds.push_back(cmd);
+          cmd_sizes.push_back(cmd_size);
+        }
+
+        // Creating a packet that contains the command chain
+        uint32_t cmd_chain_bo_handle = 0;
+        amdxdna_cmd *cmd_chain = nullptr;
+        int cmd_chain_size = (cmd_handles.size() + 1) * sizeof(uint32_t);
+        if (CreateCmd(cmd_chain_size, &cmd_chain_bo_handle, &cmd_chain, fd))
+          return HSA_STATUS_ERROR;
+
+        // Writing information to the command buffer
+        amdxdna_cmd_chain *cmd_chain_payload =
+            reinterpret_cast<amdxdna_cmd_chain *>(cmd_chain->data);
+
+        // Creating a command chain
+        cmd_chain->state = HSA_AMD_AIE_ERT_STATE_NEW;
+        cmd_chain->extra_cu_masks = 0;
+        cmd_chain->count =
+            sizeof(amdxdna_cmd_chain) + cmd_handles.size() * sizeof(uint64_t);
+        cmd_chain->opcode = HSA_AMD_AIE_ERT_CMD_CHAIN;
+        cmd_chain_payload->command_count = cmd_handles.size();
+        cmd_chain_payload->submit_index = 0;
+        cmd_chain_payload->error_index = 0;
+        for (size_t i = 0; i < cmd_handles.size(); i++) {
+          cmd_chain_payload->data[i] = cmd_handles[i];
+        }
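
For reference, the storage one chain occupies is the amdxdna_cmd header, the amdxdna_cmd_chain payload header, and one 64-bit handle slot per chained command. A sketch of that arithmetic under this patch's UAPI structs, worth comparing against the cmd_chain_size computed above:

static size_t ChainBoSize(size_t num_cmds) {
  return sizeof(amdxdna_cmd)             // chain command header
         + sizeof(amdxdna_cmd_chain)     // command_count / submit_index / error_index / reserved
         + num_cmds * sizeof(uint64_t);  // one handle slot per chained command
}
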
+
+        // Syncing BOs before we execute the command
+        if (SyncBos(bo_args, fd))
+          return HSA_STATUS_ERROR;
+
+        // Remove duplicates in the BO container. The driver will report an
+        // error if we provide the same BO handle multiple times, which can
+        // happen when BOs are shared across jobs.
+        std::sort(bo_args.begin(), bo_args.end());
+        bo_args.erase(std::unique(bo_args.begin(), bo_args.end()),
+                      bo_args.end());
+
+        // Filling in the fields to execute the command chain
+        amdxdna_drm_exec_cmd exec_cmd_0 = {};
+        exec_cmd_0.ext = 0;
+        exec_cmd_0.ext_flags = 0;
+        exec_cmd_0.hwctx = hw_ctx_handle;
+        exec_cmd_0.type = AMDXDNA_CMD_SUBMIT_EXEC_BUF;
+        exec_cmd_0.cmd_handles = cmd_chain_bo_handle;
+        exec_cmd_0.args = (uint64_t)bo_args.data();
+        exec_cmd_0.cmd_count = 1;
+        exec_cmd_0.arg_count = bo_args.size();
+
+        // Execute all commands in the command chain, capturing the status so
+        // the BOs are still unmapped and closed on failure.
+        hsa_status_t exec_status =
+            ExecCmdAndWait(&exec_cmd_0, hw_ctx_handle, fd);
+
+        // Unmapping and closing the cmd BOs
+        drm_gem_close close_bo_args{0};
+        for (size_t i = 0; i < cmd_handles.size(); i++) {
+          munmap(cmds[i], cmd_sizes[i]);
+          close_bo_args.handle = cmd_handles[i];
+          ioctl(fd, DRM_IOCTL_GEM_CLOSE, &close_bo_args);
+        }
+
+        // Unmapping and closing the cmd_chain BO
+        munmap(cmd_chain, cmd_chain_size);
+        close_bo_args.handle = cmd_chain_bo_handle;
+        ioctl(fd, DRM_IOCTL_GEM_CLOSE, &close_bo_args);
+
+        if (exec_status != HSA_STATUS_SUCCESS)
+          return exec_status;
+
+        // Syncing BOs after we execute the command
+        if (SyncBos(bo_args, fd))
+          return HSA_STATUS_ERROR;
+
+        cur_id += num_cont_start_cu_pkts;
+        break;
+      }
+      default: {
+        return HSA_STATUS_ERROR;
+      }
+    }
+  }
+
+  return HSA_STATUS_SUCCESS;
 }
 
 void AieAqlQueue::StoreRelease(hsa_signal_value_t value) {
@@ -175,38 +485,25 @@ void AieAqlQueue::StoreRelease(hsa_signal_value_t value) {
 
 hsa_status_t AieAqlQueue::GetInfo(hsa_queue_info_attribute_t attribute,
                                   void *value) {
   switch (attribute) {
-  case HSA_AMD_QUEUE_INFO_AGENT:
-    *(reinterpret_cast<hsa_agent_t *>(value)) = agent_->public_handle();
-    break;
-  case HSA_AMD_QUEUE_INFO_DOORBELL_ID:
-    // Hardware doorbell supports AQL semantics.
-    *(reinterpret_cast<uint64_t *>(value)) =
-        reinterpret_cast<uint64_t>(signal_.hardware_doorbell_ptr);
-    break;
-  default:
-    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    case HSA_AMD_QUEUE_INFO_AGENT:
+      *static_cast<hsa_agent_t *>(value) = agent_.public_handle();
+      break;
+    case HSA_AMD_QUEUE_INFO_DOORBELL_ID:
+      // Hardware doorbell supports AQL semantics.
+      *static_cast<uint64_t *>(value) =
+          reinterpret_cast<uint64_t>(signal_.hardware_doorbell_ptr);
+      break;
+    default:
+      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
   }
 
   return HSA_STATUS_SUCCESS;
 }
 
-core::SharedQueue *AieAqlQueue::CreateSharedQueue(AieAgent *agent,
-                                                  size_t req_size_pkts,
-                                                  uint32_t node_id) {
-  queue_size_bytes_ = req_size_pkts * sizeof(core::AqlPacket);
-
-  if (!IsPowerOfTwo(queue_size_bytes_)) {
-    throw AMD::hsa_exception(
-        HSA_STATUS_ERROR_INVALID_QUEUE_CREATION,
-        "Requested queue with non-power of two packet capacity.\n");
-  }
-
-  node_id_ = node_id;
-
-  return nullptr;
-}
-
-core::SharedSignal *AieAqlQueue::CreateSharedSignal(AieAgent *agent) {
-  return nullptr;
+hsa_status_t
+AieAqlQueue::ConfigHwCtx(hsa_amd_queue_hw_ctx_config_param_t config_type,
+                         void *args) {
+  return core::Runtime::runtime_singleton_->AgentDriver(agent_.driver_type)
+      .ConfigHwCtx(*this, config_type, args);
 }
 
 hsa_status_t AieAqlQueue::GetCUMasking(uint32_t num_cu_mask_count,
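
The manual munmap/GEM_CLOSE cleanup in SubmitCmd could be wrapped in an RAII helper so every early return releases the BOs. A sketch, illustrative and not part of the patch, assuming the same ioctl interfaces:

// Owns one mmapped command BO and releases it on scope exit.
class ScopedCmdBo {
 public:
  ScopedCmdBo(int fd, uint32_t handle, void *map, size_t size)
      : fd_(fd), handle_(handle), map_(map), size_(size) {}
  ~ScopedCmdBo() {
    if (map_) munmap(map_, size_);
    drm_gem_close close_args{};
    close_args.handle = handle_;
    ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_args);
  }
  ScopedCmdBo(const ScopedCmdBo &) = delete;
  ScopedCmdBo &operator=(const ScopedCmdBo &) = delete;

 private:
  int fd_;
  uint32_t handle_;
  void *map_;
  size_t size_;
};
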
diff --git a/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp b/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp
index d54ff4b4f..6ac7e55cc 100644
--- a/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp
@@ -208,6 +208,7 @@ hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute,
     case HSA_REGION_INFO_SEGMENT:
       switch (mem_props_.HeapType) {
         case HSA_HEAPTYPE_SYSTEM:
+        case HSA_HEAPTYPE_DEVICE_SVM:
         case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE:
         case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC:
           *((hsa_region_segment_t*)value) = HSA_REGION_SEGMENT_GLOBAL;
@@ -223,6 +224,7 @@ hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute,
     case HSA_REGION_INFO_GLOBAL_FLAGS:
       switch (mem_props_.HeapType) {
         case HSA_HEAPTYPE_SYSTEM:
+        case HSA_HEAPTYPE_DEVICE_SVM:
         case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC:
         case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: {
           uint32_t ret = 0;
@@ -246,6 +248,7 @@ hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute,
     case HSA_REGION_INFO_ALLOC_MAX_SIZE:
       switch (mem_props_.HeapType) {
         case HSA_HEAPTYPE_SYSTEM:
+        case HSA_HEAPTYPE_DEVICE_SVM:
           *((size_t*)value) = max_sysmem_alloc_size_;
           break;
         case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE:
@@ -260,6 +263,7 @@ hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute,
     case HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED:
       switch (mem_props_.HeapType) {
         case HSA_HEAPTYPE_SYSTEM:
+        case HSA_HEAPTYPE_DEVICE_SVM:
        case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE:
         case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC:
           *((bool*)value) = true;
@@ -272,6 +276,7 @@ hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute,
     case HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE:
       switch (mem_props_.HeapType) {
         case HSA_HEAPTYPE_SYSTEM:
+        case HSA_HEAPTYPE_DEVICE_SVM:
         case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE:
         case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC:
           *((size_t*)value) = kPageSize_;
@@ -284,6 +289,7 @@ hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute,
     case HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT:
       switch (mem_props_.HeapType) {
         case HSA_HEAPTYPE_SYSTEM:
+        case HSA_HEAPTYPE_DEVICE_SVM:
         case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE:
         case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC:
           *((size_t*)value) = kPageSize_;
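
With the HSA_HEAPTYPE_DEVICE_SVM cases added above, an AIE agent's SVM-backed region answers the standard region queries. A quick check, illustrative only:

hsa_status_t dump_region(hsa_region_t region, void *data) {
  hsa_region_segment_t segment;
  bool alloc_allowed = false;
  hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment);
  hsa_region_get_info(region, HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED, &alloc_allowed);
  // The DEVICE_SVM-backed region now reports HSA_REGION_SEGMENT_GLOBAL
  // with alloc_allowed == true.
  return HSA_STATUS_SUCCESS;
}

// Usage: hsa_agent_iterate_regions(aie_agent, dump_region, nullptr);
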
diff --git a/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp b/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp
index 02d1e6bee..b4985c3be 100644
--- a/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp
+++ b/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp
@@ -80,7 +80,7 @@ void HsaApiTable::Init() {
   // they can add preprocessor macros on the new functions
   constexpr size_t expected_core_api_table_size = 1016;
-  constexpr size_t expected_amd_ext_table_size = 584;
+  constexpr size_t expected_amd_ext_table_size = 600;
   constexpr size_t expected_image_ext_table_size = 120;
   constexpr size_t expected_finalizer_ext_table_size = 64;
   constexpr size_t expected_tools_table_size = 64;
@@ -406,6 +406,8 @@ void HsaApiTable::UpdateAmdExts() {
   amd_ext_api.hsa_amd_signal_async_handler_fn = AMD::hsa_amd_signal_async_handler;
   amd_ext_api.hsa_amd_async_function_fn = AMD::hsa_amd_async_function;
   amd_ext_api.hsa_amd_signal_wait_any_fn = AMD::hsa_amd_signal_wait_any;
+  amd_ext_api.hsa_amd_queue_hw_ctx_config_fn = AMD::hsa_amd_queue_hw_ctx_config;
+  amd_ext_api.hsa_amd_get_handle_from_vaddr_fn = AMD::hsa_amd_get_handle_from_vaddr;
   amd_ext_api.hsa_amd_queue_cu_set_mask_fn = AMD::hsa_amd_queue_cu_set_mask;
   amd_ext_api.hsa_amd_queue_cu_get_mask_fn = AMD::hsa_amd_queue_cu_get_mask;
   amd_ext_api.hsa_amd_memory_pool_get_info_fn = AMD::hsa_amd_memory_pool_get_info;
diff --git a/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp b/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp
index cdc046095..0184e175e 100644
--- a/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp
+++ b/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp
@@ -618,6 +618,35 @@ hsa_status_t hsa_amd_async_function(void (*callback)(void* arg), void* arg) {
   CATCH;
 }
 
+hsa_status_t
+hsa_amd_queue_hw_ctx_config(const hsa_queue_t *queue,
+                            hsa_amd_queue_hw_ctx_config_param_t config_type,
+                            void *args) {
+  TRY;
+  IS_OPEN();
+
+  IS_BAD_PTR(args);
+  core::Queue *cmd_queue = core::Queue::Convert(queue);
+  IS_VALID(cmd_queue);
+
+  return cmd_queue->ConfigHwCtx(config_type, args);
+
+  CATCH;
+}
+
+hsa_status_t hsa_amd_get_handle_from_vaddr(void* ptr, uint32_t* handle) {
+  TRY;
+  IS_OPEN();
+
+  IS_BAD_PTR(ptr);
+  IS_BAD_PTR(handle);
+
+  return core::Runtime::runtime_singleton_->GetHandleFromVaddr(ptr, handle);
+
+  CATCH;
+}
+
 hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count,
                                        const uint32_t* cu_mask) {
   TRY;
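
The table-size bump is exactly the two new 8-byte function pointers registered above; a compile-time check makes the arithmetic explicit (assumes LP64 pointer sizes):

static_assert(584 + 2 * sizeof(void (*)()) == 600,
              "AmdExtTable grew by hsa_amd_queue_hw_ctx_config_fn and "
              "hsa_amd_get_handle_from_vaddr_fn");
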
diff --git a/runtime/hsa-runtime/core/runtime/runtime.cpp b/runtime/hsa-runtime/core/runtime/runtime.cpp
index e25cf5565..a266df260 100644
--- a/runtime/hsa-runtime/core/runtime/runtime.cpp
+++ b/runtime/hsa-runtime/core/runtime/runtime.cpp
@@ -3111,20 +3111,22 @@ hsa_status_t Runtime::VMemoryHandleCreate(const MemoryRegion* region, size_t siz
                                           uint64_t flags_unused,
                                           hsa_amd_vmem_alloc_handle_t* memoryOnlyHandle) {
   const AMD::MemoryRegion* memRegion = static_cast<const AMD::MemoryRegion*>(region);
-  if (!memRegion->IsLocalMemory()) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
 
   if (!IsMultipleOf(size, memRegion->GetPageSize())) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
 
   ScopedAcquire lock(&memory_lock_);
-  void* thunk_handle;
-  hsa_status_t status = region->Allocate(size, alloc_flags, &thunk_handle, 0);
+  void *user_mode_driver_handle;
+  hsa_status_t status =
+      region->Allocate(size, alloc_flags, &user_mode_driver_handle, 0);
   if (status == HSA_STATUS_SUCCESS) {
     memory_handle_map_.emplace(std::piecewise_construct,
-                               std::forward_as_tuple(thunk_handle),
-                               std::forward_as_tuple(region, size, flags_unused, thunk_handle, alloc_flags));
+                               std::forward_as_tuple(user_mode_driver_handle),
+                               std::forward_as_tuple(region, size, flags_unused,
+                                                     user_mode_driver_handle,
+                                                     alloc_flags));
 
-    *memoryOnlyHandle = MemoryHandle::Convert(thunk_handle);
+    *memoryOnlyHandle = MemoryHandle::Convert(user_mode_driver_handle);
   }
   return status;
 }
@@ -3603,6 +3605,17 @@ hsa_status_t Runtime::VMemoryGetAllocPropertiesFromHandle(hsa_amd_vmem_alloc_han
   return HSA_STATUS_SUCCESS;
 }
 
+hsa_status_t Runtime::GetHandleFromVaddr(void* ptr, uint32_t* handle) {
+  auto it = allocation_map_.find(ptr);
+  if (it == allocation_map_.end()) {
+    return HSA_STATUS_ERROR_INVALID_ALLOCATION;
+  }
+
+  auto* agent = it->second.region->owner();
+  auto& driver = AgentDriver(agent->driver_type);
+  return driver.GetHandleFromVaddr(ptr, handle);
+}
+
 hsa_status_t Runtime::EnableLogging(uint8_t* flags, void* file) {
   memcpy(log_flags, flags, sizeof(log_flags));
diff --git a/runtime/hsa-runtime/hsacore.so.def b/runtime/hsa-runtime/hsacore.so.def
index 3a575367f..c63d0e7d4 100644
--- a/runtime/hsa-runtime/hsacore.so.def
+++ b/runtime/hsa-runtime/hsacore.so.def
@@ -178,6 +178,8 @@ global:
   hsa_amd_signal_async_handler;
   hsa_amd_async_function;
   hsa_amd_image_get_info_max_dim;
+  hsa_amd_queue_hw_ctx_config;
+  hsa_amd_get_handle_from_vaddr;
   hsa_amd_queue_cu_set_mask;
   hsa_amd_queue_cu_get_mask;
   hsa_amd_memory_fill;
diff --git a/runtime/hsa-runtime/inc/hsa_api_trace.h b/runtime/hsa-runtime/inc/hsa_api_trace.h
index e0063e6da..a5d85e85c 100644
--- a/runtime/hsa-runtime/inc/hsa_api_trace.h
+++ b/runtime/hsa-runtime/inc/hsa_api_trace.h
@@ -204,6 +204,8 @@ struct AmdExtTable {
   decltype(hsa_amd_signal_async_handler)* hsa_amd_signal_async_handler_fn;
   decltype(hsa_amd_async_function)* hsa_amd_async_function_fn;
   decltype(hsa_amd_signal_wait_any)* hsa_amd_signal_wait_any_fn;
+  decltype(hsa_amd_queue_hw_ctx_config)* hsa_amd_queue_hw_ctx_config_fn;
+  decltype(hsa_amd_get_handle_from_vaddr)* hsa_amd_get_handle_from_vaddr_fn;
   decltype(hsa_amd_queue_cu_set_mask)* hsa_amd_queue_cu_set_mask_fn;
   decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info_fn;
   decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools_fn;
diff --git a/runtime/hsa-runtime/inc/hsa_ext_amd.h b/runtime/hsa-runtime/inc/hsa_ext_amd.h
index 1c3d006e8..be7e61b41 100644
--- a/runtime/hsa-runtime/inc/hsa_ext_amd.h
+++ b/runtime/hsa-runtime/inc/hsa_ext_amd.h
@@ -106,6 +106,12 @@ typedef enum {
    * queues created from AMD GPU Agents support this packet.
    */
   HSA_AMD_PACKET_TYPE_BARRIER_VALUE = 2,
+  /**
+   * Packet used to send commands to an AIE agent's embedded runtime (ERT). The
+   * ERT is responsible for, among other things, handling dispatches. Only
+   * queues created on AIE agents support this packet.
+   */
+  HSA_AMD_PACKET_TYPE_AIE_ERT = 3
 } hsa_amd_packet_type_t;
 
 /**
@@ -194,6 +200,277 @@ typedef struct hsa_amd_barrier_value_packet_s {
   hsa_signal_t completion_signal;
 } hsa_amd_barrier_value_packet_t;
 
+/**
+ * State of an AIE ERT command.
+ */
+typedef enum {
+  /**
+   * Set by the host before submitting a command to the scheduler.
+   */
+  HSA_AMD_AIE_ERT_STATE_NEW = 1,
+  /**
+   * Internal scheduler state.
+   */
+  HSA_AMD_AIE_ERT_STATE_QUEUED = 2,
+  /**
+   * Internal scheduler state.
+   */
+  HSA_AMD_AIE_ERT_STATE_RUNNING = 3,
+  /**
+   * Set by the scheduler when a command completes.
+   */
+  HSA_AMD_AIE_ERT_STATE_COMPLETED = 4,
+  /**
+   * Set by the scheduler if a command failed.
+   */
+  HSA_AMD_AIE_ERT_STATE_ERROR = 5,
+  /**
+   * Set by the scheduler if a command aborted.
+   */
+  HSA_AMD_AIE_ERT_STATE_ABORT = 6,
+  /**
+   * Internal scheduler state.
+   */
+  HSA_AMD_AIE_ERT_STATE_SUBMITTED = 7,
+  /**
+   * Set by the scheduler on a timeout and reset.
+   */
+  HSA_AMD_AIE_ERT_STATE_TIMEOUT = 8,
+  /**
+   * Set by the scheduler on a timeout when the reset fails.
+   */
+  HSA_AMD_AIE_ERT_STATE_NORESPONSE = 9,
+  /**
+   * Set by the scheduler if a soft kernel reported an error.
+   */
+  HSA_AMD_AIE_ERT_STATE_SKERROR = 10,
+  /**
+   * Set by the scheduler if a soft kernel crashed.
+   */
+  HSA_AMD_AIE_ERT_STATE_SKCRASHED = 11,
+  HSA_AMD_AIE_ERT_STATE_MAX
+} hsa_amd_aie_ert_state;
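
A host-side waiter polling a command's state word would treat the following states as terminal. This is a sketch; which states end a wait is an interpretation of the enum above, not something the patch defines:

static bool IsTerminalErtState(hsa_amd_aie_ert_state s) {
  switch (s) {
    case HSA_AMD_AIE_ERT_STATE_COMPLETED:
    case HSA_AMD_AIE_ERT_STATE_ERROR:
    case HSA_AMD_AIE_ERT_STATE_ABORT:
    case HSA_AMD_AIE_ERT_STATE_TIMEOUT:
    case HSA_AMD_AIE_ERT_STATE_NORESPONSE:
    case HSA_AMD_AIE_ERT_STATE_SKERROR:
    case HSA_AMD_AIE_ERT_STATE_SKCRASHED:
      return true;
    default:
      return false;  // NEW/QUEUED/RUNNING/SUBMITTED are still in flight.
  }
}
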
+
+/**
+ * Opcode types for HSA AIE ERT commands.
+ */
+typedef enum {
+  /**
+   * Start a workgroup on a compute unit (CU).
+   */
+  HSA_AMD_AIE_ERT_START_CU = 0,
+  /**
+   * Currently aliased to HSA_AMD_AIE_ERT_START_CU.
+   */
+  HSA_AMD_AIE_ERT_START_KERNEL = 0,
+  /**
+   * Configure command scheduler.
+   */
+  HSA_AMD_AIE_ERT_CONFIGURE = 2,
+  HSA_AMD_AIE_ERT_EXIT = 3,
+  HSA_AMD_AIE_ERT_ABORT = 4,
+  /**
+   * Execute a specified CU after writing.
+   */
+  HSA_AMD_AIE_ERT_EXEC_WRITE = 5,
+  /**
+   * Get stats about a CU's execution.
+   */
+  HSA_AMD_AIE_ERT_CU_STAT = 6,
+  /**
+   * Start KDMA CU or P2P.
+   */
+  HSA_AMD_AIE_ERT_START_COPYBO = 7,
+  /**
+   * Configure a soft kernel.
+   */
+  HSA_AMD_AIE_ERT_SK_CONFIG = 8,
+  /**
+   * Start a soft kernel.
+   */
+  HSA_AMD_AIE_ERT_SK_START = 9,
+  /**
+   * Unconfigure a soft kernel.
+   */
+  HSA_AMD_AIE_ERT_SK_UNCONFIG = 10,
+  /**
+   * Initialize a CU.
+   */
+  HSA_AMD_AIE_ERT_INIT_CU = 11,
+  HSA_AMD_AIE_ERT_START_FA = 12,
+  HSA_AMD_AIE_ERT_CLK_CALIB = 13,
+  HSA_AMD_AIE_ERT_MB_VALIDATE = 14,
+  /**
+   * Same as HSA_AMD_AIE_ERT_START_CU but with a key-value pair.
+   */
+  HSA_AMD_AIE_ERT_START_KEY_VAL = 15,
+  HSA_AMD_AIE_ERT_ACCESS_TEST_C = 16,
+  HSA_AMD_AIE_ERT_ACCESS_TEST = 17,
+  /**
+   * Instruction buffer command format.
+   */
+  HSA_AMD_AIE_ERT_START_DPU = 18,
+  /**
+   * Command chain.
+   */
+  HSA_AMD_AIE_ERT_CMD_CHAIN = 19,
+  /**
+   * Instruction buffer command format on NPU.
+   */
+  HSA_AMD_AIE_ERT_START_NPU = 20,
+  /**
+   * Instruction buffer command with pre-emption format on the NPU.
+   */
+  HSA_AMD_AIE_ERT_START_NPU_PREEMPT = 21
+} hsa_amd_aie_ert_cmd_opcode_t;
+
+/**
+ * Command types for HSA AMD AIE ERT.
+ */
+typedef enum {
+  /**
+   * Default command type.
+   */
+  HSA_AMD_AIE_ERT_CMD_TYPE_DEFAULT = 0,
+  /**
+   * Command processed by kernel domain scheduler (KDS) locally.
+   */
+  HSA_AMD_AIE_ERT_CMD_TYPE_KDS_LOCAL = 1,
+  /**
+   * Control command using a reserved command queue slot.
+   */
+  HSA_AMD_AIE_ERT_CMD_TYPE_CTRL = 2,
+  /**
+   * CU command.
+   */
+  HSA_AMD_AIE_ERT_CMD_TYPE_CU = 3,
+  /**
+   * Soft CU (SCU) command.
+   */
+  HSA_AMD_AIE_ERT_CMD_TYPE_SCU = 4
+} hsa_amd_aie_ert_cmd_type_t;
+
+/**
+ * Format for start kernel packet header.
+ */
+typedef struct hsa_amd_aie_ert_start_kernel_header_s {
+  uint32_t state : 4;
+  /**
+   * Enable the driver to record timestamps for the various states the
+   * command goes through. The stat data is appended after the command data.
+   */
+  uint32_t stat_enabled : 1;
+  uint32_t unused : 5;
+  /**
+   * Extra CU masks in addition to the mandatory mask.
+   */
+  uint32_t extra_cu_masks : 2;
+  uint32_t count : 11;
+  uint32_t opcode : 5;
+  uint32_t type : 4;
+} hsa_amd_aie_ert_start_kernel_header_t;
+
+/**
+ * Payload data for AIE ERT start kernel packets (i.e., when the opcode is
+ * HSA_AMD_AIE_ERT_START_KERNEL).
+ */
+typedef struct hsa_amd_aie_ert_start_kernel_data_s {
+  /**
+   * Mandatory CU mask.
+   */
+  uint32_t cu_mask;
+  /**
+   * Since the CU mask takes up one DWORD, this is count - 1 DWORDs
+   * (i.e., the remainder of the start kernel payload data).
+   */
+  uint32_t data[];
+} hsa_amd_aie_ert_start_kernel_data_t;
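
The header bitfields pack into a single DWORD; a worked sketch of filling one with arbitrary example values:

hsa_amd_aie_ert_start_kernel_header_t hdr = {};
hdr.state = HSA_AMD_AIE_ERT_STATE_NEW;        // 4 bits
hdr.extra_cu_masks = 0;                       // 2 bits
hdr.count = 0xA;                              // 11 bits: number of payload DWORDs
hdr.opcode = HSA_AMD_AIE_ERT_START_CU;        // 5 bits
hdr.type = HSA_AMD_AIE_ERT_CMD_TYPE_DEFAULT;  // 4 bits
// 4 + 1 + 5 + 2 + 11 + 5 + 4 bits == 32:
static_assert(sizeof(hsa_amd_aie_ert_start_kernel_header_t) == sizeof(uint32_t),
              "start-kernel header packs into one DWORD");
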
+
+/**
+ * Payload data for AIE ERT command chain packets (i.e., when the opcode is
+ * HSA_AMD_AIE_ERT_CMD_CHAIN). A command chain is a buffer of commands parsed
+ * by the ERT.
+ */
+typedef struct hsa_amd_aie_ert_command_chain_data_s {
+  /**
+   * Number of commands in the chain.
+   */
+  uint32_t command_count;
+  /**
+   * Index of the last successfully submitted command in the chain.
+   */
+  uint32_t submit_index;
+  /**
+   * Index of the failing command if the command status is not completed.
+   */
+  uint32_t error_index;
+  uint32_t reserved[3];
+  /**
+   * Address of each command in the chain.
+   */
+  uint64_t data[];
+} hsa_amd_aie_ert_command_chain_data_t;
+
+/**
+ * AMD AIE ERT packet. Used for sending a command to an AIE agent.
+ */
+typedef struct hsa_amd_aie_ert_packet_s {
+  /**
+   * AMD vendor specific packet header.
+   */
+  hsa_amd_vendor_packet_header_t header;
+  /**
+   * Format interpreted by the ERT to understand the command and its
+   * payload data.
+   */
+  struct {
+    /**
+     * Current state of a command.
+     */
+    uint32_t state : 4;
+    /**
+     * Flexible field that can be interpreted on a per-command basis.
+     */
+    uint32_t custom : 8;
+    /**
+     * Number of DWORDs in the payload data.
+     */
+    uint32_t count : 11;
+    /**
+     * Opcode identifying the command.
+     */
+    uint32_t opcode : 5;
+    /**
+     * Type of a command (currently 0).
+     */
+    uint32_t type : 4;
+  };
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved0;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved1;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved2;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved3;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved4;
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved5;
+  /**
+   * Address of the packet data payload. ERT commands contain arbitrarily
+   * sized data payloads.
+   */
+  uint64_t payload_data;
+} hsa_amd_aie_ert_packet_t;
+
 /** @} */
 
 /** \defgroup error-codes Error codes
@@ -1042,6 +1319,65 @@ hsa_status_t HSA_API hsa_amd_image_get_info_max_dim(hsa_agent_t agent,
  * @{
  */
 
+/**
+ * @brief Hardware context configuration for one AIE CU.
+ */
+typedef struct hsa_amd_aie_ert_hw_ctx_cu_config_s {
+  /**
+   * @brief CU configuration BO handle.
+   */
+  uint32_t cu_config_bo;
+  /**
+   * @brief Function of a CU.
+   */
+  uint8_t cu_func;
+  uint8_t reserved[3];
+} hsa_amd_aie_ert_hw_ctx_cu_config_t;
+
+typedef struct hsa_amd_aie_ert_hw_ctx_config_cu_param_s {
+  /**
+   * @brief Number of CUs to configure.
+   */
+  uint16_t num_cus;
+  uint16_t reserved[3];
+  /**
+   * @brief List of CU configurations.
+   */
+  hsa_amd_aie_ert_hw_ctx_cu_config_t *cu_configs;
+} hsa_amd_aie_ert_hw_ctx_config_cu_param_t;
+
+/**
+ * @brief Specify a hardware context configuration parameter type for a queue.
+ */
+typedef enum {
+  /**
+   * @brief Configure the CUs assigned to the AIE ERT HW context.
+   */
+  HSA_AMD_QUEUE_AIE_ERT_HW_CXT_CONFIG_CU = 0
+} hsa_amd_queue_hw_ctx_config_param_t;
+
+/**
+ * @brief Configures the hardware context of a queue.
+ *
+ * @details This can be used to send configuration data to the queue so it can
+ * configure various hardware components that support the queue. The payload
+ * used to describe the configuration is interpreted on a per-queue-type basis.
+ *
+ * @param[in] queue HSA queue whose HW context is being configured.
+ *
+ * @param[in] config_type Specifies the type of the configuration. Used to
+ * determine how to interpret @p args.
+ *
+ * @param[in] args Configuration payload. Interpreted by the queue based on
+ * @p config_type.
+ */
+hsa_status_t HSA_API hsa_amd_queue_hw_ctx_config(
+    const hsa_queue_t *queue, hsa_amd_queue_hw_ctx_config_param_t config_type,
+    void *args);
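
A usage sketch for the new entry point. This is illustrative; `pdi_bo_handle` is a hypothetical BO handle for an already-loaded PDI, e.g. obtained via hsa_amd_get_handle_from_vaddr declared below:

hsa_amd_aie_ert_hw_ctx_cu_config_t cu_config = {};
cu_config.cu_config_bo = pdi_bo_handle;  // BO holding the CU configuration
cu_config.cu_func = 0;

hsa_amd_aie_ert_hw_ctx_config_cu_param_t cu_param = {};
cu_param.num_cus = 1;
cu_param.cu_configs = &cu_config;

hsa_status_t r = hsa_amd_queue_hw_ctx_config(
    queue, HSA_AMD_QUEUE_AIE_ERT_HW_CXT_CONFIG_CU, &cu_param);
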
+
+/**
+ * @brief Returns the user-mode driver (BO) handle backing a virtual address
+ * that was allocated through the HSA runtime.
+ *
+ * @param[in] ptr Base virtual address of the allocation.
+ *
+ * @param[out] handle Driver handle associated with @p ptr.
+ */
+hsa_status_t HSA_API hsa_amd_get_handle_from_vaddr(void* ptr, uint32_t* handle);
+
 /**
  * @brief Set a queue's CU affinity mask.
  *