Skip to content

Commit

Permalink
Adding soft queue dispatch logic to dispatch commands to AIE agents (#2)
Browse files Browse the repository at this point in the history
  • Loading branch information
eddierichter-amd authored and makslevental committed Sep 10, 2024
1 parent baa1bd7 commit 180a8bb
Show file tree
Hide file tree
Showing 5 changed files with 387 additions and 37 deletions.
21 changes: 20 additions & 1 deletion runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,9 @@ hsa_status_t XdnaDriver::GetAgentProperties(core::Agent &agent) const {
return HSA_STATUS_ERROR;
}

aie_agent.SetNumCols(aie_metadata.cols);
// Right now can only target N-1 columns so putting this
// here as a workaround
aie_agent.SetNumCols(aie_metadata.cols - 1);
aie_agent.SetNumCoreRows(aie_metadata.core.row_count);

return HSA_STATUS_SUCCESS;
Expand Down Expand Up @@ -351,6 +353,16 @@ hsa_status_t XdnaDriver::InitDeviceHeap() {
return HSA_STATUS_SUCCESS;
}

/// @brief Copies the driver's handle map into the caller-provided map.
///
/// @param vmem_handle_mappings Output map; its previous contents are replaced
/// by a copy of the driver's own vmem_handle_mappings member.
/// @return Always HSA_STATUS_SUCCESS.
hsa_status_t XdnaDriver::GetHandleMappings(
    std::unordered_map<uint32_t, void *> &vmem_handle_mappings) {
  // The parameter shadows the member of the same name, so the member has to
  // be reached through `this`.
  const auto &driver_mappings = this->vmem_handle_mappings;
  vmem_handle_mappings = driver_mappings;

  return HSA_STATUS_SUCCESS;
}

/// @brief Reports the file descriptor held by this driver in fd_.
///
/// @param fd Output; receives the current value of fd_.
/// @return Always HSA_STATUS_SUCCESS.
hsa_status_t XdnaDriver::GetFd(int &fd) {
  const int driver_fd = fd_;
  fd = driver_fd;

  return HSA_STATUS_SUCCESS;
}

hsa_status_t XdnaDriver::FreeDeviceHeap() {
if (dev_heap_parent) {
munmap(dev_heap_parent, dev_heap_align * 2 - 1);
Expand Down Expand Up @@ -388,6 +400,13 @@ hsa_status_t XdnaDriver::ConfigHwCtxCU(
config_cu_param.cu_configs[i].cu_config_bo;
xdna_config_cu_param->cu_configs[i].cu_func =
config_cu_param.cu_configs[i].cu_func;

// sync configuration buffer
amdxdna_drm_sync_bo sync_args = {};
sync_args.handle = xdna_config_cu_param->cu_configs[i].cu_bo;
if (ioctl(fd_, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_args) < 0) {
return HSA_STATUS_ERROR;
}
}

amdxdna_drm_config_hwctx config_hw_ctx_args{
Expand Down
76 changes: 71 additions & 5 deletions runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,35 @@
#include "core/inc/queue.h"
#include "core/inc/runtime.h"
#include "core/inc/signal.h"
#include "core/util/locks.h"

/*
 * Interpretation of the beginning of data payload for ERT_CMD_CHAIN in
 * amdxdna_cmd. The rest of the payload in amdxdna_cmd is cmd BO handles.
 *
 * The struct ends in a flexible array member (`data`) that is annotated as
 * holding `command_count` entries, so the struct itself only describes the
 * fixed-size prefix of the payload.
 */
struct amdxdna_cmd_chain {
// Number of entries in data[] (this is the field named by __counted_by).
__u32 command_count;
// NOTE(review): presumably the index of the command being/last submitted in
// the chain - confirm against the kernel driver.
__u32 submit_index;
// NOTE(review): presumably the index of the command that failed, if any -
// confirm against the kernel driver.
__u32 error_index;
// Reserved words; no use is visible in this file.
__u32 reserved[3];
// Variable-length tail; per the comment above these are cmd BO handles.
__u64 data[] __counted_by(command_count);
};


/*
 * Exec buffer command header format.
 *
 * The first 32-bit word can be read either as a whole (`header`) or through
 * the bit-fields of the anonymous struct; both views share storage through
 * the anonymous union. The bit-field widths add up to exactly 32 bits
 * (4 + 6 + 2 + 11 + 5 + 4). The header word is followed by a flexible array
 * member (`data`) annotated as holding `count` entries.
 */
struct amdxdna_cmd {
union {
struct {
// NOTE(review): presumably the execution state of the command - confirm.
__u32 state : 4;
__u32 unused : 6;
// NOTE(review): presumably the number of additional CU mask words that
// precede the payload - confirm against the kernel driver.
__u32 extra_cu_masks : 2;
// Number of 32-bit entries in data[] (the field named by __counted_by).
__u32 count : 11;
// Command opcode (ERT_CMD_CHAIN is the one referenced by the
// amdxdna_cmd_chain comment in this file).
__u32 opcode : 5;
__u32 reserved : 4;
};
// Whole-word view of the bit-fields above.
__u32 header;
};
// Variable-length command payload.
__u32 data[] __counted_by(count);
};

namespace rocr {
namespace AMD {
Expand All @@ -71,7 +99,7 @@ class AieAqlQueue : public core::Queue,

AieAqlQueue() = delete;
AieAqlQueue(AieAgent *agent, size_t req_size_pkts, uint32_t node_id);
~AieAqlQueue();
~AieAqlQueue() override;

hsa_status_t Inactivate() override;
hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) override;
Expand Down Expand Up @@ -100,7 +128,7 @@ class AieAqlQueue : public core::Queue,
void *value) override;

// AIE-specific API
AieAgent &GetAgent() { return agent_; }
AieAgent &GetAgent() const { return agent_; }
void SetHwCtxHandle(uint32_t hw_ctx_handle) {
hw_ctx_handle_ = hw_ctx_handle;
}
Expand All @@ -119,7 +147,7 @@ class AieAqlQueue : public core::Queue,
hsa_fence_scope_t releaseFence = HSA_FENCE_SCOPE_NONE,
hsa_signal_t *signal = NULL) override;

uint32_t queue_id_ = INVALID_QUEUEID;
uint64_t queue_id_ = INVALID_QUEUEID;
/// @brief ID of AIE device on which this queue has been mapped.
uint32_t node_id_ = std::numeric_limits<uint32_t>::max();
/// @brief Queue size in bytes.
Expand All @@ -134,6 +162,44 @@ class AieAqlQueue : public core::Queue,
/// @brief Base of the queue's ring buffer storage.
void *ring_buf_ = nullptr;

/// @brief Submits commands from the queue to the AIE device.
///
/// NOTE(review): the parameter descriptions below are inferred from the
/// parameter names only; confirm against the definition.
///
/// @param hw_ctx_handle handle of the hardware context to submit to
/// @param fd file descriptor used to reach the driver
/// @param queue_base base address of the queue's packet storage
/// @param read_dispatch_id presumably the first packet index to process
/// @param write_dispatch_id presumably one past the last packet index to
/// process
/// @param vmem_handle_mappings map keyed by 32-bit handle with pointer values
static hsa_status_t SubmitCmd(
uint32_t hw_ctx_handle, int fd, void *queue_base,
uint64_t read_dispatch_id, uint64_t write_dispatch_id,
std::unordered_map<uint32_t, void *> &vmem_handle_mappings);

/// @brief Creates a command BO and returns a pointer to the memory and
/// the corresponding handle
///
/// @param size size of memory to allocate
/// @param handle A pointer that receives the BO handle
/// @param cmd A pointer that receives the address of the command buffer
/// @param fd file descriptor used for the allocation (presumably the XDNA
/// driver's device fd - confirm against the caller)
static hsa_status_t CreateCmd(uint32_t size, uint32_t *handle,
amdxdna_cmd **cmd, int fd);

/// @brief Adds all BOs in a command packet payload to a vector
/// and replaces the handles with a virtual address
///
/// @param count Number of entries in the command
/// @param bo_args A reference to the vector that collects all BO handles
/// @param cmd_pkt_payload A pointer to the payload of the command
/// @param vmem_handle_mappings map keyed by 32-bit handle with pointer values
/// (presumably used to translate each BO handle to its virtual address -
/// confirm against the definition)
static void RegisterCmdBOs(
uint32_t count, std::vector<uint32_t> &bo_args,
hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload,
std::unordered_map<uint32_t, void *> &vmem_handle_mappings);

/// @brief Syncs all BOs referenced in bo_args
///
/// @param bo_args vector containing handles of BOs to sync
/// @param fd file descriptor used to issue the sync (presumably the XDNA
/// driver's device fd - confirm against the caller)
static hsa_status_t SyncBos(std::vector<uint32_t> &bo_args, int fd);

/// @brief Executes a command and waits for its completion
///
/// @param exec_cmd Structure containing the details of the command to execute
/// @param hw_ctx_handle the handle of the hardware context to run this
/// command
/// @param fd file descriptor used to submit the command (presumably the XDNA
/// driver's device fd - confirm against the caller)
static hsa_status_t ExecCmdAndWait(amdxdna_drm_exec_cmd *exec_cmd,
uint32_t hw_ctx_handle, int fd);

/// @brief Handle for an application context on the AIE device.
///
/// Each user queue will have an associated context. This handle is assigned
Expand All @@ -153,4 +219,4 @@ class AieAqlQueue : public core::Queue,
} // namespace AMD
} // namespace rocr

#endif // header guard
#endif // HSA_RUNTIME_CORE_INC_AMD_HW_AQL_AIE_COMMAND_PROCESSOR_H_
18 changes: 9 additions & 9 deletions runtime/hsa-runtime/core/inc/amd_gpu_agent.h
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ class GpuAgent : public GpuAgentInt {
GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode, uint32_t index);

// @brief GPU agent destructor.
~GpuAgent();
~GpuAgent() override;

// @brief Ensure blits are ready (performance hint).
void PreloadBlits() override;
Expand Down Expand Up @@ -507,14 +507,14 @@ class GpuAgent : public GpuAgentInt {
hsa_status_t EnableDmaProfiling(bool enable) override;

hsa_status_t PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb,
void* cb_data);
hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session);
void *cb_data) override;
hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession &session) override;
hsa_status_t PcSamplingCreateFromId(HsaPcSamplingTraceId pcsId,
pcs::PcsRuntime::PcSamplingSession& session);
hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session);
hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session);
hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session);
hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session);
pcs::PcsRuntime::PcSamplingSession &session) override;
hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession &session) override;
hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession &session) override;
hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession &session) override;
hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession &session) override;
hsa_status_t PcSamplingFlushHostTrapDeviceBuffers(pcs::PcsRuntime::PcSamplingSession& session);

static void PcSamplingThreadRun(void* agent);
Expand Down Expand Up @@ -793,4 +793,4 @@ class GpuAgent : public GpuAgentInt {
} // namespace amd
} // namespace rocr

#endif // header guard
#endif // HSA_RUNTIME_CORE_INC_AMD_GPU_AGENT_H_
8 changes: 4 additions & 4 deletions runtime/hsa-runtime/core/inc/amd_xdna_driver.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@

#include "core/inc/driver.h"
#include "core/inc/memory_region.h"
#include "core/driver/xdna/uapi/amdxdna_accel.h"

namespace rocr {
namespace core {
Expand All @@ -69,6 +70,9 @@ class XdnaDriver : public core::Driver {
hsa_status_t Init() override;
hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override;

/// @brief Copies the driver's handle map into the caller-provided map.
///
/// @param vmem_handle_mappings Output map; its contents are replaced by a
/// copy of the driver's own mappings.
/// @return HSA_STATUS_SUCCESS.
hsa_status_t GetHandleMappings(std::unordered_map<uint32_t, void*> &vmem_handle_mappings);
/// @brief Reports the file descriptor held by this driver.
///
/// @param fd Output; receives the driver's file descriptor.
/// @return HSA_STATUS_SUCCESS.
hsa_status_t GetFd(int &fd);

hsa_status_t GetAgentProperties(core::Agent &agent) const override;
hsa_status_t
GetMemoryProperties(uint32_t node_id,
Expand Down Expand Up @@ -126,10 +130,6 @@ class XdnaDriver : public core::Driver {
void *dev_heap_aligned = nullptr;
static constexpr size_t dev_heap_size = 48 * 1024 * 1024;
static constexpr size_t dev_heap_align = 64 * 1024 * 1024;

/// @brief DRM buffer object handle for the device heap. Assigned by the
/// kernel-mode driver.
uint32_t dev_heap_handle = 0;
};

} // namespace AMD
Expand Down
Loading

0 comments on commit 180a8bb

Please sign in to comment.