From 180a8bb68a85d68604c80220af074b21b6c0c86b Mon Sep 17 00:00:00 2001 From: eddierichter-amd Date: Mon, 2 Sep 2024 18:04:03 -0600 Subject: [PATCH] Adding soft queue dispatch logic to dispatch commands to AIE agents (#2) --- .../core/driver/xdna/amd_xdna_driver.cpp | 21 +- .../hsa-runtime/core/inc/amd_aie_aql_queue.h | 76 ++++- runtime/hsa-runtime/core/inc/amd_gpu_agent.h | 18 +- .../hsa-runtime/core/inc/amd_xdna_driver.h | 8 +- .../core/runtime/amd_aie_aql_queue.cpp | 301 ++++++++++++++++-- 5 files changed, 387 insertions(+), 37 deletions(-) diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index 303312932..4ba196b5f 100644 --- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -118,7 +118,9 @@ hsa_status_t XdnaDriver::GetAgentProperties(core::Agent &agent) const { return HSA_STATUS_ERROR; } - aie_agent.SetNumCols(aie_metadata.cols); + // Right now can only target N-1 columns so putting this + // here as a workaround + aie_agent.SetNumCols(aie_metadata.cols - 1); aie_agent.SetNumCoreRows(aie_metadata.core.row_count); return HSA_STATUS_SUCCESS; @@ -351,6 +353,16 @@ hsa_status_t XdnaDriver::InitDeviceHeap() { return HSA_STATUS_SUCCESS; } +hsa_status_t XdnaDriver::GetHandleMappings(std::unordered_map &vmem_handle_mappings) { + vmem_handle_mappings = this->vmem_handle_mappings; + return HSA_STATUS_SUCCESS; +} + +hsa_status_t XdnaDriver::GetFd(int &fd) { + fd = fd_; + return HSA_STATUS_SUCCESS; +} + hsa_status_t XdnaDriver::FreeDeviceHeap() { if (dev_heap_parent) { munmap(dev_heap_parent, dev_heap_align * 2 - 1); @@ -388,6 +400,13 @@ hsa_status_t XdnaDriver::ConfigHwCtxCU( config_cu_param.cu_configs[i].cu_config_bo; xdna_config_cu_param->cu_configs[i].cu_func = config_cu_param.cu_configs[i].cu_func; + + // sync configuration buffer + amdxdna_drm_sync_bo sync_args = {}; + sync_args.handle = xdna_config_cu_param->cu_configs[i].cu_bo; + if (ioctl(fd_, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_args) < 0) { + return HSA_STATUS_ERROR; + } } amdxdna_drm_config_hwctx config_hw_ctx_args{ diff --git a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h index 70f05e28a..224b85d7c 100644 --- a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h +++ b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h @@ -49,7 +49,35 @@ #include "core/inc/queue.h" #include "core/inc/runtime.h" #include "core/inc/signal.h" -#include "core/util/locks.h" + +/* + * Interpretation of the beginning of data payload for ERT_CMD_CHAIN in + * amdxdna_cmd. The rest of the payload in amdxdna_cmd is cmd BO handles. + */ +struct amdxdna_cmd_chain { + __u32 command_count; + __u32 submit_index; + __u32 error_index; + __u32 reserved[3]; + __u64 data[] __counted_by(command_count); +}; + + +/* Exec buffer command header format */ +struct amdxdna_cmd { + union { + struct { + __u32 state : 4; + __u32 unused : 6; + __u32 extra_cu_masks : 2; + __u32 count : 11; + __u32 opcode : 5; + __u32 reserved : 4; + }; + __u32 header; + }; + __u32 data[] __counted_by(count); +}; namespace rocr { namespace AMD { @@ -71,7 +99,7 @@ class AieAqlQueue : public core::Queue, AieAqlQueue() = delete; AieAqlQueue(AieAgent *agent, size_t req_size_pkts, uint32_t node_id); - ~AieAqlQueue(); + ~AieAqlQueue() override; hsa_status_t Inactivate() override; hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) override; @@ -100,7 +128,7 @@ class AieAqlQueue : public core::Queue, void *value) override; // AIE-specific API - AieAgent &GetAgent() { return agent_; } + AieAgent &GetAgent() const { return agent_; } void SetHwCtxHandle(uint32_t hw_ctx_handle) { hw_ctx_handle_ = hw_ctx_handle; } @@ -119,7 +147,7 @@ class AieAqlQueue : public core::Queue, hsa_fence_scope_t releaseFence = HSA_FENCE_SCOPE_NONE, hsa_signal_t *signal = NULL) override; - uint32_t queue_id_ = INVALID_QUEUEID; + uint64_t queue_id_ = INVALID_QUEUEID; /// @brief ID of AIE device on which this queue has been mapped. uint32_t node_id_ = std::numeric_limits::max(); /// @brief Queue size in bytes. @@ -134,6 +162,44 @@ class AieAqlQueue : public core::Queue, /// @brief Base of the queue's ring buffer storage. void *ring_buf_ = nullptr; + static hsa_status_t SubmitCmd( + uint32_t hw_ctx_handle, int fd, void *queue_base, + uint64_t read_dispatch_id, uint64_t write_dispatch_id, + std::unordered_map &vmem_handle_mappings); + + /// @brief Creates a command BO and returns a pointer to the memory and + // the corresponding handle + /// + /// @param size size of memory to allocate + /// @param handle A pointer to the BO handle + /// @param cmd A pointer to the buffer + static hsa_status_t CreateCmd(uint32_t size, uint32_t *handle, + amdxdna_cmd **cmd, int fd); + + /// @brief Adds all BOs in a command packet payload to a vector + /// and replaces the handles with a virtual address + /// + /// @param count Number of entries in the command + /// @param bo_args A pointer to a vector that contains all bo handles + /// @param cmd_pkt_payload A pointer to the payload of the command + static void RegisterCmdBOs( + uint32_t count, std::vector &bo_args, + hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload, + std::unordered_map &vmem_handle_mappings); + + /// @brief Syncs all BOs referenced in bo_args + /// + /// @param bo_args vector containing handles of BOs to sync + static hsa_status_t SyncBos(std::vector &bo_args, int fd); + + /// @brief Executes a command and waits for its completion + /// + /// @param exec_cmd Structure containing the details of the command to execute + /// @param hw_ctx_handle the handle of the hardware context to run this + /// command + static hsa_status_t ExecCmdAndWait(amdxdna_drm_exec_cmd *exec_cmd, + uint32_t hw_ctx_handle, int fd); + /// @brief Handle for an application context on the AIE device. /// /// Each user queue will have an associated context. This handle is assigned @@ -153,4 +219,4 @@ class AieAqlQueue : public core::Queue, } // namespace AMD } // namespace rocr -#endif // header guard +#endif // HSA_RUNTIME_CORE_INC_AMD_HW_AQL_AIE_COMMAND_PROCESSOR_H_ diff --git a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h index 608017c11..a7193d18d 100644 --- a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h +++ b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h @@ -234,7 +234,7 @@ class GpuAgent : public GpuAgentInt { GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode, uint32_t index); // @brief GPU agent destructor. - ~GpuAgent(); + ~GpuAgent() override; // @brief Ensure blits are ready (performance hint). void PreloadBlits() override; @@ -507,14 +507,14 @@ class GpuAgent : public GpuAgentInt { hsa_status_t EnableDmaProfiling(bool enable) override; hsa_status_t PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb, - void* cb_data); - hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session); + void *cb_data) override; + hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession &session) override; hsa_status_t PcSamplingCreateFromId(HsaPcSamplingTraceId pcsId, - pcs::PcsRuntime::PcSamplingSession& session); - hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session); - hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session); - hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session); - hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session); + pcs::PcsRuntime::PcSamplingSession &session) override; + hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession &session) override; + hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession &session) override; + hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession &session) override; + hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession &session) override; hsa_status_t PcSamplingFlushHostTrapDeviceBuffers(pcs::PcsRuntime::PcSamplingSession& session); static void PcSamplingThreadRun(void* agent); @@ -793,4 +793,4 @@ class GpuAgent : public GpuAgentInt { } // namespace amd } // namespace rocr -#endif // header guard +#endif // HSA_RUNTIME_CORE_INC_AMD_GPU_AGENT_H_ diff --git a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h index 28572e135..79cbaa710 100644 --- a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h +++ b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h @@ -47,6 +47,7 @@ #include "core/inc/driver.h" #include "core/inc/memory_region.h" +#include "core/driver/xdna/uapi/amdxdna_accel.h" namespace rocr { namespace core { @@ -69,6 +70,9 @@ class XdnaDriver : public core::Driver { hsa_status_t Init() override; hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override; + hsa_status_t GetHandleMappings(std::unordered_map &vmem_handle_mappings); + hsa_status_t GetFd(int &fd); + hsa_status_t GetAgentProperties(core::Agent &agent) const override; hsa_status_t GetMemoryProperties(uint32_t node_id, @@ -126,10 +130,6 @@ class XdnaDriver : public core::Driver { void *dev_heap_aligned = nullptr; static constexpr size_t dev_heap_size = 48 * 1024 * 1024; static constexpr size_t dev_heap_align = 64 * 1024 * 1024; - - /// @brief DRM buffer object handle for the device heap. Assigned by the - /// kernel-mode driver. - uint32_t dev_heap_handle = 0; }; } // namespace AMD diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp index e8562f226..b2f8fd2d0 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp @@ -41,28 +41,52 @@ //////////////////////////////////////////////////////////////////////////////// #include "core/inc/amd_aie_aql_queue.h" +#include "core/inc/amd_xdna_driver.h" #ifdef __linux__ #include #include #include -#include -#include +#include #endif #ifdef _WIN32 #include #endif -#include -#include -#include +#include #include "core/inc/queue.h" #include "core/inc/runtime.h" #include "core/inc/signal.h" #include "core/util/utils.h" +// The number of arguments in the packet payload before we start passing operands +constexpr int NON_OPERAND_COUNT = 6; + +// Used to transform an address into a device address +constexpr int DEV_ADDR_BASE = 0x04000000; +constexpr int DEV_ADDR_OFFSET_MASK = 0x02FFFFFF; + +// BO size allocated for commands +constexpr int CMD_SIZE = 64; + +// This is a temp workaround. For some reason the first command count in a chain +// needs to be a larger than it actually is, assuming there is some other data +// structure at the beginning +// TODO: Look more into this +constexpr int FIRST_CMD_COUNT_SIZE_INCREASE = 5; + +// Index of command payload where the instruction sequence +// address is located +constexpr int CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX = 2; + +// Environment variable to define job submission timeout +constexpr const char *TIMEOUT_ENV_VAR = "ROCR_AIE_TIMEOUT"; +constexpr int DEFAULT_TIMEOUT_VAL = 50; +char *timeout_env_var_ptr = getenv(TIMEOUT_ENV_VAR); +int timeout_val = timeout_env_var_ptr == nullptr ? DEFAULT_TIMEOUT_VAL : atoi(timeout_env_var_ptr); + namespace rocr { namespace AMD { @@ -106,7 +130,7 @@ AieAqlQueue::AieAqlQueue(AieAgent *agent, size_t req_size_pkts, .CreateQueue(*this); } -AieAqlQueue::~AieAqlQueue() { Inactivate(); } +AieAqlQueue::~AieAqlQueue() { AieAqlQueue::Inactivate(); } hsa_status_t AieAqlQueue::Inactivate() { bool active(active_.exchange(false, std::memory_order_relaxed)); @@ -195,8 +219,249 @@ uint64_t AieAqlQueue::AddWriteIndexAcqRel(uint64_t value) { } void AieAqlQueue::StoreRelaxed(hsa_signal_value_t value) { - atomic::Store(signal_.hardware_doorbell_ptr, uint64_t(value), - std::memory_order_release); + std::unordered_map vmem_handle_mappings; + if (reinterpret_cast( + core::Runtime::runtime_singleton_->AgentDriver(agent_.driver_type)) + .GetHandleMappings(vmem_handle_mappings) != HSA_STATUS_SUCCESS) { + return; + } + + int fd = 0; + if (reinterpret_cast( + core::Runtime::runtime_singleton_->AgentDriver(agent_.driver_type)) + .GetFd(fd) != HSA_STATUS_SUCCESS) { + return; + } + + SubmitCmd(hw_ctx_handle_, fd, amd_queue_.hsa_queue.base_address, + amd_queue_.read_dispatch_id, amd_queue_.write_dispatch_id, + vmem_handle_mappings); +} + +hsa_status_t AieAqlQueue::SyncBos(std::vector &bo_args, int fd) { + for (unsigned int bo_arg : bo_args) { + amdxdna_drm_sync_bo sync_params = {}; + sync_params.handle = bo_arg; + if (ioctl(fd, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_params)) + return HSA_STATUS_ERROR; + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t AieAqlQueue::ExecCmdAndWait(amdxdna_drm_exec_cmd *exec_cmd, + uint32_t hw_ctx_handle, int fd) { + // Submit the cmd + if (ioctl(fd, DRM_IOCTL_AMDXDNA_EXEC_CMD, exec_cmd)) + return HSA_STATUS_ERROR; + + // Waiting for command to finish + amdxdna_drm_wait_cmd wait_cmd = {}; + wait_cmd.hwctx = hw_ctx_handle; + wait_cmd.timeout = timeout_val; + wait_cmd.seq = exec_cmd->seq; + + if (ioctl(fd, DRM_IOCTL_AMDXDNA_WAIT_CMD, &wait_cmd)) + return HSA_STATUS_ERROR; + + return HSA_STATUS_SUCCESS; +} + +void AieAqlQueue::RegisterCmdBOs( + uint32_t count, std::vector &bo_args, + hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload, + std::unordered_map &vmem_handle_mappings) { + // This is the index where the operand addresses start in a command + const int operand_starting_index = 5; + + // Counting the number of operands in the command payload. + // Operands are 64-bits so we need to divide by two + uint32_t num_operands = (count - NON_OPERAND_COUNT) / 2; + + // Keep track of the handles before we submit the packet + bo_args.push_back(cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX]); + + // Going through all of the operands in the command, keeping track of the + // handles and turning the handles into addresses. The starting index of + // the operands in a command is `operand_starting_index` and the fields + // are 32-bits we need to iterate over every two + for (int operand_iter = 0; operand_iter < num_operands; operand_iter++) { + bo_args.push_back( + cmd_pkt_payload->data[operand_starting_index + 2 * operand_iter]); + // clang-format off + cmd_pkt_payload->data[operand_starting_index + 2 * operand_iter + 1] = + (uint64_t)vmem_handle_mappings[cmd_pkt_payload->data[operand_starting_index + 2 * operand_iter]] >> 32 & 0xFFFFFFFF; + cmd_pkt_payload->data[operand_starting_index + 2 * operand_iter] = + (uint64_t)vmem_handle_mappings[cmd_pkt_payload->data[operand_starting_index + 2 * operand_iter]] & 0xFFFFFFFF; + // clang-format on + } + + // Transform the instruction sequence address into device address + cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX] = + DEV_ADDR_BASE | + (reinterpret_cast( + vmem_handle_mappings + [cmd_pkt_payload + ->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX]]) & + DEV_ADDR_OFFSET_MASK); +} + +hsa_status_t AieAqlQueue::CreateCmd(uint32_t size, uint32_t *handle, + amdxdna_cmd **cmd, int fd) { + // Creating the command + amdxdna_drm_create_bo create_cmd_bo = {}; + create_cmd_bo.type = AMDXDNA_BO_CMD, + create_cmd_bo.size = CMD_SIZE; + if (ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_bo)) + return HSA_STATUS_ERROR; + + amdxdna_drm_get_bo_info cmd_bo_get_bo_info = {}; + cmd_bo_get_bo_info.handle = create_cmd_bo.handle; + if (ioctl(fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_bo_get_bo_info)) + return HSA_STATUS_ERROR; + + *cmd = static_cast(mmap(nullptr, create_cmd_bo.size, + PROT_READ | PROT_WRITE, MAP_SHARED, fd, + cmd_bo_get_bo_info.map_offset)); + *handle = create_cmd_bo.handle; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t AieAqlQueue::SubmitCmd( + uint32_t hw_ctx_handle, int fd, void *queue_base, uint64_t read_dispatch_id, + uint64_t write_dispatch_id, + std::unordered_map &vmem_handle_mappings) { + uint64_t cur_id = read_dispatch_id; + while (cur_id < write_dispatch_id) { + hsa_amd_aie_ert_packet_t *pkt = + static_cast(queue_base) + cur_id; + + // Get the packet header information + if (pkt->header.header != HSA_PACKET_TYPE_VENDOR_SPECIFIC || + pkt->header.AmdFormat != HSA_AMD_PACKET_TYPE_AIE_ERT) + return HSA_STATUS_ERROR; + + // Get the payload information + switch (pkt->opcode) { + case HSA_AMD_AIE_ERT_START_CU: { + + std::vector bo_args; + std::vector cmd_handles; + + // Iterating over future packets and seeing how many contiguous HSA_AMD_AIE_ERT_START_CU + // packets there are. All can be combined into a single chain. + int num_cont_start_cu_pkts = 1; + for (int peak_pkt_id = cur_id + 1; peak_pkt_id < write_dispatch_id; peak_pkt_id++) { + if (pkt->opcode != HSA_AMD_AIE_ERT_START_CU) { + break; + } + num_cont_start_cu_pkts++; + } + + // Iterating over all the contiguous HSA_AMD_AIE_ERT_CMD_CHAIN packets + for (int pkt_iter = cur_id; pkt_iter < cur_id + num_cont_start_cu_pkts; pkt_iter++) { + + // Getting the current command packet + hsa_amd_aie_ert_packet_t *pkt = + static_cast(queue_base) + pkt_iter; + hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload = + reinterpret_cast( + pkt->payload_data); + + // Add the handles for all of the BOs to bo_args as well as rewrite + // the command payload handles to contain the actual virtual addresses + RegisterCmdBOs(pkt->count, bo_args, cmd_pkt_payload, vmem_handle_mappings); + + // Creating a packet that contains the command to execute the kernel + uint32_t cmd_bo_handle = 0; + amdxdna_cmd *cmd = nullptr; + if (CreateCmd(64, &cmd_bo_handle, &cmd, fd)) + return HSA_STATUS_ERROR; + + // Filling in the fields of the command + cmd->state = pkt->state; + cmd->extra_cu_masks = 0; + + // For some reason the first count needs to be a little larger than + // it actually is, assuming there is some other data structure at the + // beginning + // TODO: Look more into this + if (pkt_iter == cur_id) { + cmd->count = pkt->count + FIRST_CMD_COUNT_SIZE_INCREASE; + } + else { + cmd->count = pkt->count; + } + cmd->opcode = pkt->opcode; + cmd->data[0] = cmd_pkt_payload->cu_mask; + memcpy((cmd->data + 1), cmd_pkt_payload->data, 4 * pkt->count); + + // Keeping track of the handle + cmd_handles.push_back(cmd_bo_handle); + } + + // Creating a packet that contains the command chain + uint32_t cmd_chain_bo_handle = 0; + amdxdna_cmd *cmd_chain = nullptr; + int cmd_chain_size = (cmd_handles.size() + 1) * sizeof(uint32_t); + if (CreateCmd(cmd_chain_size, &cmd_chain_bo_handle, &cmd_chain, fd)) + return HSA_STATUS_ERROR; + + // Writing information to the command buffer + amdxdna_cmd_chain *cmd_chain_payload = reinterpret_cast(cmd_chain->data); + + // Creating a command chain + cmd_chain->state = HSA_AMD_AIE_ERT_STATE_NEW; + cmd_chain->extra_cu_masks = 0; + // TODO: Figure out why this is the value + cmd_chain->count = 0xA; + cmd_chain->opcode = HSA_AMD_AIE_ERT_CMD_CHAIN; + cmd_chain_payload->command_count = cmd_handles.size(); + cmd_chain_payload->submit_index = 0; + cmd_chain_payload->error_index = 0; + for (int i = 0; i < cmd_handles.size(); i++) { + cmd_chain_payload->data[i] = cmd_handles[i]; + } + + // Syncing BOs before we execute the command + if (SyncBos(bo_args, fd)) + return HSA_STATUS_ERROR; + + // Removing duplicates in the bo container. The driver will report + // an error if we provide the same BO handle multiple times. + // This can happen if any of the BOs are the same across jobs + std::sort(bo_args.begin(), bo_args.end()); + bo_args.erase(std::unique(bo_args.begin(), bo_args.end()), bo_args.end()); + + // Filling in the fields to execute the command chain + amdxdna_drm_exec_cmd exec_cmd_0 = {}; + exec_cmd_0.ext = 0; + exec_cmd_0.ext_flags = 0; + exec_cmd_0.hwctx = hw_ctx_handle; + exec_cmd_0.type = AMDXDNA_CMD_SUBMIT_EXEC_BUF; + exec_cmd_0.cmd_handles = cmd_chain_bo_handle; + exec_cmd_0.args = (uint64_t)bo_args.data(); + exec_cmd_0.cmd_count = 1; + exec_cmd_0.arg_count = bo_args.size(); + + // Executing all commands in the command chain + ExecCmdAndWait(&exec_cmd_0, hw_ctx_handle, fd); + + // Syncing BOs after we execute the command + if (SyncBos(bo_args, fd)) + return HSA_STATUS_ERROR; + + cur_id += num_cont_start_cu_pkts; + break; + } + default: { + return HSA_STATUS_ERROR; + } + } + } + + return HSA_STATUS_SUCCESS; } void AieAqlQueue::StoreRelease(hsa_signal_value_t value) { @@ -207,16 +472,16 @@ void AieAqlQueue::StoreRelease(hsa_signal_value_t value) { hsa_status_t AieAqlQueue::GetInfo(hsa_queue_info_attribute_t attribute, void *value) { switch (attribute) { - case HSA_AMD_QUEUE_INFO_AGENT: - *(reinterpret_cast(value)) = agent_.public_handle(); - break; - case HSA_AMD_QUEUE_INFO_DOORBELL_ID: - // Hardware doorbell supports AQL semantics. - *(reinterpret_cast(value)) = - reinterpret_cast(signal_.hardware_doorbell_ptr); - break; - default: - return HSA_STATUS_ERROR_INVALID_ARGUMENT; + case HSA_AMD_QUEUE_INFO_AGENT: + *static_cast(value) = agent_.public_handle(); + break; + case HSA_AMD_QUEUE_INFO_DOORBELL_ID: + // Hardware doorbell supports AQL semantics. + *static_cast(value) = + reinterpret_cast(signal_.hardware_doorbell_ptr); + break; + default: + return HSA_STATUS_ERROR_INVALID_ARGUMENT; } return HSA_STATUS_SUCCESS; }