From 855c118ec0edd95f93386e339de1f4420f6db70b Mon Sep 17 00:00:00 2001 From: Tony Gutierrez Date: Mon, 19 Aug 2024 15:47:34 +0000 Subject: [PATCH 01/20] rocr/aie: Add support for creating AIE queue context Adds support for initialzing the XDNA driver so that a hardware context can be created for an AIE queue. Right now this initializes the device heap in the driver, gets the relevant tile parameters for the AIE agent, and creates a hardware context that backs the AIE queue. Change-Id: Ib90e1bc67a8637f6db3ff2bebe34677843796417 --- .../core/driver/kfd/amd_kfd_driver.cpp | 8 +- .../core/driver/xdna/amd_xdna_driver.cpp | 139 +++++++++++++++++- runtime/hsa-runtime/core/inc/amd_aie_agent.h | 23 ++- .../hsa-runtime/core/inc/amd_aie_aql_queue.h | 24 ++- runtime/hsa-runtime/core/inc/amd_kfd_driver.h | 4 +- .../hsa-runtime/core/inc/amd_xdna_driver.h | 31 +++- runtime/hsa-runtime/core/inc/driver.h | 12 +- .../core/runtime/amd_aie_agent.cpp | 6 + .../core/runtime/amd_aie_aql_queue.cpp | 19 ++- 9 files changed, 253 insertions(+), 13 deletions(-) diff --git a/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp b/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp index 0de256664..605b02236 100644 --- a/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp +++ b/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp @@ -61,6 +61,8 @@ namespace AMD { KfdDriver::KfdDriver(std::string devnode_name) : core::Driver(core::DriverType::KFD, devnode_name) {} +hsa_status_t KfdDriver::Init() { return HSA_STATUS_SUCCESS; } + hsa_status_t KfdDriver::DiscoverDriver() { if (hsaKmtOpenKFD() == HSAKMT_STATUS_SUCCESS) { std::unique_ptr kfd_drv(new KfdDriver("/dev/kfd")); @@ -74,6 +76,10 @@ hsa_status_t KfdDriver::QueryKernelModeDriver(core::DriverQuery query) { return HSA_STATUS_SUCCESS; } +hsa_status_t KfdDriver::GetAgentProperties(core::Agent &agent) const { + return HSA_STATUS_SUCCESS; +} + hsa_status_t KfdDriver::GetMemoryProperties(uint32_t node_id, core::MemoryRegion &mem_region) 
const { @@ -230,7 +236,7 @@ hsa_status_t KfdDriver::FreeMemory(void *mem, size_t size) { return FreeKfdMemory(mem, size) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; } -hsa_status_t KfdDriver::CreateQueue(core::Queue &queue) { +hsa_status_t KfdDriver::CreateQueue(core::Queue &queue) const { return HSA_STATUS_SUCCESS; } diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index 308ffe7aa..1cbd2b6bb 100644 --- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -43,12 +43,15 @@ #include "core/inc/amd_xdna_driver.h" #include +#include #include #include +#include "core/inc/amd_aie_aql_queue.h" #include "core/inc/amd_memory_region.h" #include "core/inc/runtime.h" +#include "core/util/utils.h" #include "uapi/amdxdna_accel.h" namespace rocr { @@ -57,6 +60,8 @@ namespace AMD { XdnaDriver::XdnaDriver(std::string devnode_name) : core::Driver(core::DriverType::XDNA, devnode_name) {} +XdnaDriver::~XdnaDriver() { FreeDeviceHeap(); } + hsa_status_t XdnaDriver::DiscoverDriver() { const int max_minor_num(64); const std::string devnode_prefix("/dev/accel/accel"); @@ -67,6 +72,7 @@ hsa_status_t XdnaDriver::DiscoverDriver() { if (xdna_drv->Open() == HSA_STATUS_SUCCESS) { if (xdna_drv->QueryKernelModeDriver( core::DriverQuery::GET_DRIVER_VERSION) == HSA_STATUS_SUCCESS) { + static_cast(xdna_drv.get())->Init(); core::Runtime::runtime_singleton_->RegisterDriver(xdna_drv); return HSA_STATUS_SUCCESS; } else { @@ -78,6 +84,8 @@ hsa_status_t XdnaDriver::DiscoverDriver() { return HSA_STATUS_ERROR; } +hsa_status_t XdnaDriver::Init() { return InitDeviceHeap(); } + hsa_status_t XdnaDriver::QueryKernelModeDriver(core::DriverQuery query) { switch (query) { case core::DriverQuery::GET_DRIVER_VERSION: @@ -88,6 +96,29 @@ hsa_status_t XdnaDriver::QueryKernelModeDriver(core::DriverQuery query) { return HSA_STATUS_SUCCESS; } +hsa_status_t 
XdnaDriver::GetAgentProperties(core::Agent &agent) const { + if (agent.device_type() != core::Agent::DeviceType::kAmdAieDevice) { + return HSA_STATUS_ERROR_INVALID_AGENT; + } + + auto &aie_agent(static_cast(agent)); + + amdxdna_drm_query_aie_metadata aie_metadata{0}; + amdxdna_drm_get_info get_info_args{ + .param = DRM_AMDXDNA_QUERY_AIE_METADATA, + .buffer_size = sizeof(aie_metadata), + .buffer = reinterpret_cast(&aie_metadata)}; + + if (ioctl(fd_, DRM_IOCTL_AMDXDNA_GET_INFO, &get_info_args) < 0) { + return HSA_STATUS_ERROR; + } + + aie_agent.SetNumCols(aie_metadata.cols); + aie_agent.SetNumCoreRows(aie_metadata.core.row_count); + + return HSA_STATUS_SUCCESS; +} + hsa_status_t XdnaDriver::GetMemoryProperties(uint32_t node_id, core::MemoryRegion &mem_region) const { @@ -105,11 +136,51 @@ hsa_status_t XdnaDriver::FreeMemory(void *mem, size_t size) { return HSA_STATUS_SUCCESS; } -hsa_status_t XdnaDriver::CreateQueue(core::Queue &queue) { +hsa_status_t XdnaDriver::CreateQueue(core::Queue &queue) const { + if (!AieAqlQueue::IsType(&queue)) { + return HSA_STATUS_ERROR_INVALID_QUEUE; + } + + auto &aie_queue(static_cast(queue)); + auto &aie_agent(aie_queue.GetAgent()); + + // Currently we do not leverage QoS information. + amdxdna_qos_info qos_info{0}; + amdxdna_drm_create_hwctx create_hwctx_args{ + .ext = 0, + .ext_flags = 0, + .qos_p = reinterpret_cast(&qos_info), + .umq_bo = 0, + .log_buf_bo = 0, + // TODO: Make this configurable. + .max_opc = 0x800, + // This field is for the number of core tiles. 
+ .num_tiles = aie_agent.GetNumCores(), + .mem_size = 0, + .umq_doorbell = 0}; + + if (ioctl(fd_, DRM_IOCTL_AMDXDNA_CREATE_HWCTX, &create_hwctx_args) < 0) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + aie_queue.SetHwCtxHandle(create_hwctx_args.handle); + return HSA_STATUS_SUCCESS; } hsa_status_t XdnaDriver::DestroyQueue(core::Queue &queue) const { + if (!AieAqlQueue::IsType(&queue)) { + return HSA_STATUS_ERROR_INVALID_QUEUE; + } + + auto &aie_queue(static_cast(queue)); + amdxdna_drm_destroy_hwctx destroy_hwctx_args{.handle = + aie_queue.GetHwCtxHandle()}; + + if (ioctl(fd_, DRM_IOCTL_AMDXDNA_DESTROY_HWCTX, &destroy_hwctx_args) < 0) { + return HSA_STATUS_ERROR; + } + return HSA_STATUS_SUCCESS; } @@ -128,5 +199,71 @@ hsa_status_t XdnaDriver::QueryDriverVersion() { return HSA_STATUS_SUCCESS; } +hsa_status_t XdnaDriver::InitDeviceHeap() { + amdxdna_drm_create_bo create_bo_args{.type = AMDXDNA_BO_DEV_HEAP, + .vaddr = + reinterpret_cast(nullptr), + .size = dev_heap_size}; + amdxdna_drm_get_bo_info get_bo_info_args{0}; + drm_gem_close close_bo_args{0}; + + if (ioctl(fd_, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_bo_args) < 0) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + get_bo_info_args.handle = create_bo_args.handle; + // In case we need to close this BO to avoid leaks due to some error after + // creation. + close_bo_args.handle = create_bo_args.handle; + + if (ioctl(fd_, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_bo_info_args) < 0) { + // Close the BO in the case we can't get info about it. + ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_bo_args); + return HSA_STATUS_ERROR; + } + + dev_heap_parent = mmap(0, dev_heap_align * 2 - 1, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (dev_heap_parent == MAP_FAILED) { + // Close the BO in the case when a mapping fails and we got a BO handle. 
+ ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_bo_args); + dev_heap_parent = nullptr; + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + void *addr_aligned(reinterpret_cast( + AlignUp(reinterpret_cast(dev_heap_parent), dev_heap_align))); + + dev_heap_aligned = + mmap(addr_aligned, dev_heap_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, fd_, get_bo_info_args.map_offset); + + if (dev_heap_aligned == MAP_FAILED) { + // Close the BO in the case when a mapping fails and we got a BO handle. + ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_bo_args); + // Unmap the dev_heap_parent. + dev_heap_aligned = nullptr; + FreeDeviceHeap(); + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t XdnaDriver::FreeDeviceHeap() { + if (dev_heap_parent) { + munmap(dev_heap_parent, dev_heap_align * 2 - 1); + dev_heap_parent = nullptr; + } + + if (dev_heap_aligned) { + munmap(dev_heap_aligned, dev_heap_size); + dev_heap_aligned = nullptr; + } + + return HSA_STATUS_SUCCESS; +} + } // namespace AMD } // namespace rocr diff --git a/runtime/hsa-runtime/core/inc/amd_aie_agent.h b/runtime/hsa-runtime/core/inc/amd_aie_agent.h index c3add8a76..910400511 100644 --- a/runtime/hsa-runtime/core/inc/amd_aie_agent.h +++ b/runtime/hsa-runtime/core/inc/amd_aie_agent.h @@ -86,18 +86,37 @@ class AieAgent : public core::Agent { return regions_; } + // AIE agent methods. + /// @brief Get the number of columns on this AIE agent. + int GetNumCols() const { return num_cols_; } + void SetNumCols(int num_cols) { num_cols_ = num_cols; } + /// @brief Get the number of core tile rows on this AIE agent. + int GetNumCoreRows() const { return num_core_rows_; } + void SetNumCoreRows(int num_core_rows) { num_core_rows_ = num_core_rows; } + /// @brief Get the number of core tiles on this AIE agent. + int GetNumCores() const { return num_cols_ * num_core_rows_; } + private: - // @brief Query the driver to get the region list owned by this agent. 
+ /// @brief Query the driver to get the region list owned by this agent. void InitRegionList(); + /// @brief Query the driver to get properties for this AIE agent. + void GetAgentProperties(); + std::vector regions_; const hsa_profile_t profile_ = HSA_PROFILE_BASE; - static const uint32_t maxQueues_ = 8; + static const uint32_t maxQueues_ = 1; static const uint32_t minAqlSize_ = 0x40; static const uint32_t maxAqlSize_ = 0x40; uint32_t max_queues_; uintptr_t device_heap_vaddr_ = 0; + + /// @brief Number of columns in the AIE array. + int num_cols_ = 0; + /// @brief Number of rows of core tiles in the AIE array. Not all rows in a + /// column are cores. Some can be memory or shim tiles. + int num_core_rows_ = 0; }; } // namespace AMD diff --git a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h index 7e59112d5..ba03bc899 100644 --- a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h +++ b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h @@ -55,7 +55,9 @@ namespace AMD { /// @brief Encapsulates HW AIE AQL Command Processor functionality. It /// provides the interface for things such as doorbells, queue read and /// write pointers, and a buffer. -class AieAqlQueue : public core::Queue, public core::DoorbellSignal { +class AieAqlQueue : public core::Queue, + private core::LocalSignal, + core::DoorbellSignal { public: static __forceinline bool IsType(core::Signal *signal) { return signal->IsType(&rtti_id_); @@ -95,6 +97,13 @@ class AieAqlQueue : public core::Queue, public core::DoorbellSignal { hsa_status_t GetInfo(hsa_queue_info_attribute_t attribute, void *value) override; + // AIE-specific API + AieAgent &GetAgent() { return agent_; } + void SetHwCtxHandle(uint32_t hw_ctx_handle) { + hw_ctx_handle_ = hw_ctx_handle; + } + uint32_t GetHwCtxHandle() const { return hw_ctx_handle_; } + // GPU-specific queue functions are unsupported. 
hsa_status_t GetCUMasking(uint32_t num_cu_mask_count, uint32_t *cu_mask) override; @@ -126,7 +135,18 @@ class AieAqlQueue : public core::Queue, public core::DoorbellSignal { uint32_t node_id); core::SharedSignal *CreateSharedSignal(AieAgent *agent); - AieAgent *agent_; + AieAgent &agent_; + + /// @brief Handle for an application context on the AIE device. + /// + /// Each user queue will have an associated context. This handle is assigned + /// by the driver on context creation. + /// + /// TODO: For now we support a single context that allocates all core tiles in + /// the array. In the future we can make the number of tiles configurable so + /// that multiple workloads with different core tile configurations can + /// execute on the AIE agent at the same time. + uint32_t hw_ctx_handle_ = std::numeric_limits::max(); /// Indicates if queue is active. std::atomic active_; static int rtti_id_; diff --git a/runtime/hsa-runtime/core/inc/amd_kfd_driver.h b/runtime/hsa-runtime/core/inc/amd_kfd_driver.h index 1939c0511..764384dd8 100644 --- a/runtime/hsa-runtime/core/inc/amd_kfd_driver.h +++ b/runtime/hsa-runtime/core/inc/amd_kfd_driver.h @@ -66,7 +66,9 @@ class KfdDriver : public core::Driver { static hsa_status_t DiscoverDriver(); + hsa_status_t Init() override; hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override; + hsa_status_t GetAgentProperties(core::Agent &agent) const override; hsa_status_t GetMemoryProperties(uint32_t node_id, core::MemoryRegion &mem_region) const override; @@ -75,7 +77,7 @@ class KfdDriver : public core::Driver { void **mem, size_t size, uint32_t node_id) override; hsa_status_t FreeMemory(void *mem, size_t size) override; - hsa_status_t CreateQueue(core::Queue &queue) override; + hsa_status_t CreateQueue(core::Queue &queue) const override; hsa_status_t DestroyQueue(core::Queue &queue) const override; private: diff --git a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h index 
c45b33b11..e08db0e6f 100644 --- a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h +++ b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h @@ -58,10 +58,14 @@ class XdnaDriver : public core::Driver { public: XdnaDriver() = delete; XdnaDriver(std::string devnode_name); + ~XdnaDriver(); static hsa_status_t DiscoverDriver(); + + hsa_status_t Init() override; hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override; + hsa_status_t GetAgentProperties(core::Agent &agent) const override; hsa_status_t GetMemoryProperties(uint32_t node_id, core::MemoryRegion &mem_region) const override; @@ -70,11 +74,36 @@ class XdnaDriver : public core::Driver { void **mem, size_t size, uint32_t node_id) override; hsa_status_t FreeMemory(void *mem, size_t size) override; - hsa_status_t CreateQueue(core::Queue &queue) override; + + /// @brief Creates a context on the AIE device for this queue. + /// @param queue Queue whose on-device context is being created. + /// @return hsa_status_t + hsa_status_t CreateQueue(core::Queue &queue) const override; hsa_status_t DestroyQueue(core::Queue &queue) const override; private: hsa_status_t QueryDriverVersion(); + /// @brief Allocate device accesible heap space. + /// + /// Allocate and map a buffer object (BO) that the AIE device can access. + hsa_status_t InitDeviceHeap(); + hsa_status_t FreeDeviceHeap(); + + /// @brief Virtual address range allocated for the device heap. + /// + /// Allocate a large enough space so we can carve out the device heap in + /// this range and ensure it is aligned to 64MB. Currently, AIE2 supports + /// 48MB device heap and it must be aligned to 64MB. + void *dev_heap_parent = nullptr; + + /// @brief The aligned device heap. + void *dev_heap_aligned = nullptr; + static constexpr size_t dev_heap_size = 48 * 1024 * 1024; + static constexpr size_t dev_heap_align = 64 * 1024 * 1024; + + /// @brief DRM buffer object handle for the device heap. Assigned by the + /// kernel-mode driver. 
+ uint32_t dev_heap_handle = 0; }; } // namespace AMD diff --git a/runtime/hsa-runtime/core/inc/driver.h b/runtime/hsa-runtime/core/inc/driver.h index 8c22a39b0..bbb70ea69 100644 --- a/runtime/hsa-runtime/core/inc/driver.h +++ b/runtime/hsa-runtime/core/inc/driver.h @@ -74,6 +74,9 @@ class Driver { Driver(DriverType kernel_driver_type, std::string devnode_name); virtual ~Driver() = default; + /// @brief Initialize the driver's state after opening. + virtual hsa_status_t Init() = 0; + /// @brief Query the kernel-model driver. /// @retval HSA_STATUS_SUCCESS if the kernel-model driver query was /// successful. @@ -91,6 +94,13 @@ class Driver { /// @retval DriverVersionInfo containing the driver's version information. const DriverVersionInfo &Version() const { return version_; } + /// @brief Get the properties of a specific agent and initialize the agent + /// object. + /// @param agent Agent whose properties we're getting. + /// @retval HSA_STATUS_SUCCESS if the driver successfully returns the agent's + /// properties. + virtual hsa_status_t GetAgentProperties(Agent &agent) const = 0; + /// @brief Get the memory properties of a specific node. 
/// @param node_id Node ID of the agent /// @param[in, out] mem_region MemoryRegion object whose properties will be @@ -113,7 +123,7 @@ class Driver { virtual hsa_status_t FreeMemory(void *mem, size_t size) = 0; - virtual hsa_status_t CreateQueue(Queue &queue) = 0; + virtual hsa_status_t CreateQueue(Queue &queue) const = 0; virtual hsa_status_t DestroyQueue(Queue &queue) const = 0; diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp index b98de4da2..8571ab3a8 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp @@ -53,6 +53,7 @@ AieAgent::AieAgent(uint32_t node) core::Agent::DeviceType::kAmdAieDevice), max_queues_(core::Runtime::runtime_singleton_->flag().max_queues()) { InitRegionList(); + GetAgentProperties(); } AieAgent::~AieAgent() { @@ -187,5 +188,10 @@ hsa_status_t AieAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, void AieAgent::InitRegionList() {} +void AieAgent::GetAgentProperties() { + core::Runtime::runtime_singleton_->AgentDriver(driver_type) + .GetAgentProperties(*this); +} + } // namespace AMD } // namespace rocr diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp index 166415d07..6740a0946 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp @@ -70,8 +70,8 @@ int AieAqlQueue::rtti_id_ = 0; AieAqlQueue::AieAqlQueue(AieAgent *agent, size_t req_size_pkts, uint32_t node_id) - : Queue(0, 0), DoorbellSignal(CreateSharedSignal(agent)), agent_(agent), - active_(false) { + : Queue(0, 0), LocalSignal(0, false), DoorbellSignal(signal()), + agent_(*agent), active_(false) { amd_queue_.hsa_queue.doorbell_signal = Signal::Convert(this); amd_queue_.hsa_queue.size = 0x40; @@ -80,13 +80,24 @@ AieAqlQueue::AieAqlQueue(AieAgent *agent, size_t req_size_pkts, signal_.kind = 
AMD_SIGNAL_KIND_DOORBELL; signal_.queue_ptr = &amd_queue_; active_ = true; + + core::Runtime::runtime_singleton_->AgentDriver(agent_.driver_type) + .CreateQueue(*this); } AieAqlQueue::~AieAqlQueue() { Inactivate(); } hsa_status_t AieAqlQueue::Inactivate() { bool active(active_.exchange(false, std::memory_order_relaxed)); - return HSA_STATUS_SUCCESS; + hsa_status_t status(HSA_STATUS_SUCCESS); + + if (active) { + status = core::Runtime::runtime_singleton_->AgentDriver(agent_.driver_type) + .DestroyQueue(*this); + hw_ctx_handle_ = std::numeric_limits::max(); + } + + return status; } hsa_status_t AieAqlQueue::SetPriority(HSA_QUEUE_PRIORITY priority) { @@ -176,7 +187,7 @@ hsa_status_t AieAqlQueue::GetInfo(hsa_queue_info_attribute_t attribute, void *value) { switch (attribute) { case HSA_AMD_QUEUE_INFO_AGENT: - *(reinterpret_cast(value)) = agent_->public_handle(); + *(reinterpret_cast(value)) = agent_.public_handle(); break; case HSA_AMD_QUEUE_INFO_DOORBELL_ID: // Hardware doorbell supports AQL semantics. From 6121b3763b7236e9abb35ca9e332f18beebd965b Mon Sep 17 00:00:00 2001 From: Tony Gutierrez Date: Mon, 19 Aug 2024 15:49:13 +0000 Subject: [PATCH 02/20] rocr/aie: Add AMD AIE Embedded Runtime vendor packets Adds support for the packet interface for interacting with the Embedded Runtime (ERT) on AIE agents. The ERT is what interprets command packets send to the AIE agent work queues. Change-Id: Id28fb98056b2c046354c446bdc9568d74385bea1 --- runtime/hsa-runtime/inc/hsa_ext_amd.h | 277 ++++++++++++++++++++++++++ 1 file changed, 277 insertions(+) diff --git a/runtime/hsa-runtime/inc/hsa_ext_amd.h b/runtime/hsa-runtime/inc/hsa_ext_amd.h index 1c3d006e8..0f0f5fcf9 100644 --- a/runtime/hsa-runtime/inc/hsa_ext_amd.h +++ b/runtime/hsa-runtime/inc/hsa_ext_amd.h @@ -106,6 +106,12 @@ typedef enum { * queues created from AMD GPU Agents support this packet. */ HSA_AMD_PACKET_TYPE_BARRIER_VALUE = 2, + /** + * Packet used to send commands to an AIE agent's embedded runtime (ERT). 
The + * ERT is responsible for, among other things, handling dispatches. Only + * queues created on AIE agents support this packet. + */ + HSA_AMD_PACKET_TYPE_AIE_ERT = 3 } hsa_amd_packet_type_t; /** @@ -194,6 +200,277 @@ typedef struct hsa_amd_barrier_value_packet_s { hsa_signal_t completion_signal; } hsa_amd_barrier_value_packet_t; +/** + * State of an AIE ERT command. + */ +typedef enum { + /** + * Set by the host before submitting a command to the scheduler. + */ + HSA_AMD_AIE_ERT_STATE_NEW = 1, + /** + * Internal scheduler state. + */ + HSA_AMD_AIE_ERT_STATE_QUEUED = 2, + /** + * Internal scheduler state. + */ + HSA_AMD_AIE_ERT_STATE_RUNNING = 3, + /** + * Set by the scheduler when a command completes. + */ + HSA_AMD_AIE_ERT_STATE_COMPLETED = 4, + /** + * Set by the scheduler if a command failed. + */ + HSA_AMD_AIE_ERT_STATE_ERROR = 5, + /** + * Set by the scheduler if a command aborted. + */ + HSA_AMD_AIE_ERT_STATE_ABORT = 6, + /** + * Internal scheduler state. + */ + HSA_AMD_AIE_ERT_STATE_SUBMITTED = 7, + /** + * Set by the scheduler on a timeout and reset. + */ + HSA_AMD_AIE_ERT_STATE_TIMEOUT = 8, + /** + * Set by the scheduler on a timeout and fail to reset. + */ + HSA_AMD_AIE_ERT_STATE_NORESPONSE = 9, + HSA_AMD_AIE_ERT_STATE_SKERROR = 10, + HSA_AMD_AIE_ERT_STATE_SKCRASHED = 11, + HSA_AMD_AIE_ERT_STATE_MAX +} hsa_amd_aie_ert_state; + +/** + * Opcode types for HSA AIE ERT commands. + */ +typedef enum { + /** + * Start a workgroup on a compute unit (CU). + */ + HSA_AMD_AIE_ERT_START_CU = 0, + /** + * Currently aliased to HSA_AMD_AIE_ERT_START_CU. + */ + HSA_AMD_AIE_ERT_START_KERNEL = 0, + /** + * Configure command scheduler. + */ + HSA_AMD_AIE_ERT_CONFIGURE = 2, + HSA_AMD_AIE_ERT_EXIT = 3, + HSA_AMD_AIE_ERT_ABORT = 4, + /** + * Execute a specified CU after writing. + */ + HSA_AMD_AIE_ERT_EXEC_WRITE = 5, + /** + * Get stats about a CU's execution. + */ + HSA_AMD_AIE_ERT_CU_STAT = 6, + /** + * Start KDMA CU or P2P. 
+ */ + HSA_AMD_AIE_ERT_START_COPYBO = 7, + /** + * Configure a soft kernel. + */ + HSA_AMD_AIE_ERT_SK_CONFIG = 8, + /** + * Start a soft kernel. + */ + HSA_AMD_AIE_ERT_SK_START = 9, + /** + * Unconfigure a soft kernel. + */ + HSA_AMD_AIE_ERT_SK_UNCONFIG = 10, + /** + * Initialize a CU. + */ + HSA_AMD_AIE_ERT_INIT_CU = 11, + HSA_AMD_AIE_ERT_START_FA = 12, + HSA_AMD_AIE_ERT_CLK_CALIB = 13, + HSA_AMD_AIE_ERT_MB_VALIDATE = 14, + /** + * Same as HSA_AMD_AIE_ERT_START_CU but with a key-value pair. + */ + HSA_AMD_AIE_ERT_START_KEY_VAL = 15, + HSA_AMD_AIE_ERT_ACCESS_TEST_C = 16, + HSA_AMD_AIE_ERT_ACCESS_TEST = 17, + /** + * Instruction buffer command format. + */ + HSA_AMD_AIE_ERT_START_DPU = 18, + /** + * Command chain. + */ + HSA_AMD_AIE_ERT_CMD_CHAIN = 19, + /** + * Instruction buffer command format on NPU. + */ + HSA_AMD_AIE_ERT_START_NPU = 20, + /** + * Instruction buffer command with pre-emption format on the NPU. + */ + HSA_AMD_AIE_ERT_START_NPU_PREEMPT = 21 +} hsa_amd_aie_ert_cmd_opcode_t; + +/** + * Command types for HSA AMD AIE ERT. + */ +typedef enum { + /** + * Default command type. + */ + HSA_AMD_AIE_ERT_CMD_TYPE_DEFAULT = 0, + /** + * Command processed by kernel domain scheduler (KDS) locally. + */ + HSA_AMD_AIE_ERT_CMD_TYPE_KDS_LOCAL = 1, + /** + * Control command uses reserved command queue slot. + */ + HSA_AMD_AIE_ERT_CMD_TYPE_CTRL = 2, + /** + * Control command uses reserved command queue slot. + */ + HSA_AMD_AIE_ERT_CMD_TYPE_CU = 3, + /** + * CU command. + */ + HSA_AMD_AIE_ERT_CMD_TYPE_SCU = 4 +} hsa_amd_aie_ert_cmd_type_t; + +/** + * Format for start kernel packet header. + */ +typedef struct hsa_amd_aie_ert_start_kernel_header_s { + uint32_t state : 4; + /** + * Enable driver to record timestamp for various states the + * command has gone through. The stat data is appended after + * the command data. + */ + uint32_t stat_enabled : 1; + uint32_t unused : 5; + /** + * Extra CU masks in addition to the mandatory mask. 
+ */ + uint32_t extra_cu_masks : 2; + uint32_t count : 11; + uint32_t opcode : 5; + uint32_t type : 4; +} hsa_amd_aie_ert_start_kernel_header_t; + +/** + * Payload data for AIE ERT start kernel packets (i.e., when the opcode is + * HSA_AMD_AIE_ERT_START_KERNEL). + */ +typedef struct hsa_amd_aie_ert_start_kernel_data_s { + /** + * Mandatory CU mask. + */ + uint32_t cu_mask; + /** + * Since the CU mask takes up one DWORD this is count - 1 number of DWORDs + * (i.e., the remainder of the start kernel payload data). + */ + uint32_t data[]; +} hsa_amd_aie_ert_start_kernel_data_t; + +/** + * Payload data for AIE ERT command chain packets (i.e., when the opcode is + * HSA_AMD_AIE_ERT_CMD_CHAIN). A command chain is a buffer of commands parsed + * by the ERT. + */ +typedef struct hsa_amd_aie_ert_command_chain_data_s { + /** + * Number of commands in the chain. + */ + uint32_t command_count; + /** + * Index of last successfully submitted command in the chain. + */ + uint32_t submit_index; + /** + * Index of failing command if command status is not completed. + */ + uint32_t error_index; + uint32_t reserved[3]; + /** + * Address of each command in the chain. + */ + uint64_t data[]; +} hsa_amd_aie_ert_command_chain_data_t; + +/** + * AMD AIE ERT packet. Used for sending a command to an AIE agent. + */ +typedef struct hsa_amd_aie_ert_packet_s { + /** + * AMD vendor specific packet header. + */ + hsa_amd_vendor_packet_header_t header; + /** + * Format for packets interpreted by the ERT to understand the command and + * payload data. + */ + struct { + /** + * Current state of a command. + */ + uint32_t state : 4; + /** + * Flexible field that can be interpreted on a per-command basis. + */ + uint32_t custom : 8; + /** + * Number of DWORDs in the payload data. + */ + uint32_t count : 11; + /** + * Opcode identifying the command. + */ + uint32_t opcode : 5; + /** + * Type of a command (currently 0). + */ + uint32_t type : 4; + }; + /** + * Reserved. Must be 0. 
+ */ + uint64_t reserved0; + /** + * Reserved. Must be 0. + */ + uint64_t reserved1; + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + /** + * Reserved. Must be 0. + */ + uint64_t reserved3; + /** + * Reserved. Must be 0. + */ + uint64_t reserved4; + /** + * Reserved. Must be 0. + */ + uint64_t reserved5; + /** + * Address of packet data payload. ERT commands contain arbitrarily sized + * data payloads. + */ + uint64_t payload_data; +} hsa_amd_aie_ert_packet_t; + /** @} */ /** \defgroup error-codes Error codes From 726d631f58cf4e304f129df00932566edbd9b890 Mon Sep 17 00:00:00 2001 From: Tony Gutierrez Date: Mon, 19 Aug 2024 15:49:56 +0000 Subject: [PATCH 03/20] rocr/aie: Init mem regions for AIE agents Change-Id: If180bdbcb3eb659f0d05a710526864494316d7a9 --- .../core/driver/xdna/amd_xdna_driver.cpp | 42 +++++++++++++ runtime/hsa-runtime/core/inc/amd_aie_agent.h | 20 ++++-- .../hsa-runtime/core/inc/amd_memory_region.h | 3 +- .../core/runtime/amd_aie_agent.cpp | 62 ++++++++++++++++--- .../core/runtime/amd_memory_region.cpp | 6 ++ 5 files changed, 119 insertions(+), 14 deletions(-) diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index 1cbd2b6bb..19b165618 100644 --- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -129,6 +129,48 @@ hsa_status_t XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region, core::MemoryRegion::AllocateFlags alloc_flags, void **mem, size_t size, uint32_t node_id) { + const MemoryRegion &m_region(static_cast(mem_region)); + amdxdna_drm_create_bo create_bo_args{.size = size}; + amdxdna_drm_get_bo_info get_bo_info_args{0}; + drm_gem_close close_bo_args{0}; + + if (!m_region.IsSystem()) { + return HSA_STATUS_ERROR_INVALID_REGION; + } + + if (m_region.kernarg()) { + create_bo_args.type = AMDXDNA_BO_CMD; + } else { + create_bo_args.type = AMDXDNA_BO_DEV; + } + + if (ioctl(fd_, 
DRM_IOCTL_AMDXDNA_CREATE_BO, &create_bo_args) < 0) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + get_bo_info_args.handle = create_bo_args.handle; + // In case we need to close this BO to avoid leaks due to some error after + // creation. + close_bo_args.handle = create_bo_args.handle; + + if (ioctl(fd_, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_bo_info_args) < 0) { + // Close the BO in the case we can't get info about it. + ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_bo_args); + return HSA_STATUS_ERROR; + } + + if (m_region.kernarg()) { + *mem = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, + get_bo_info_args.map_offset); + if (*mem == MAP_FAILED) { + // Close the BO in the case when a mapping fails and we got a BO handle. + ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_bo_args); + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + } else { + *mem = reinterpret_cast(get_bo_info_args.vaddr); + } + return HSA_STATUS_SUCCESS; } diff --git a/runtime/hsa-runtime/core/inc/amd_aie_agent.h b/runtime/hsa-runtime/core/inc/amd_aie_agent.h index 910400511..0925a206b 100644 --- a/runtime/hsa-runtime/core/inc/amd_aie_agent.h +++ b/runtime/hsa-runtime/core/inc/amd_aie_agent.h @@ -86,6 +86,13 @@ class AieAgent : public core::Agent { return regions_; } + /// @brief Getter for the AIE system allocator. + const std::function & + system_allocator() const { + return system_allocator_; + } + // AIE agent methods. /// @brief Get the number of columns on this AIE agent. int GetNumCols() const { return num_cols_; } @@ -99,18 +106,21 @@ class AieAgent : public core::Agent { private: /// @brief Query the driver to get the region list owned by this agent. void InitRegionList(); + /// @brief Setup the memory allocators used by this agent. + void InitAllocators(); /// @brief Query the driver to get properties for this AIE agent. 
void GetAgentProperties(); std::vector regions_; + std::function + system_allocator_; const hsa_profile_t profile_ = HSA_PROFILE_BASE; - static const uint32_t maxQueues_ = 1; - static const uint32_t minAqlSize_ = 0x40; - static const uint32_t maxAqlSize_ = 0x40; - uint32_t max_queues_; - uintptr_t device_heap_vaddr_ = 0; + const uint32_t min_aql_size_ = 0x40; + const uint32_t max_aql_size_ = 0x40; + const uint32_t max_queues_ = 1; /// @brief Number of columns in the AIE array. int num_cols_ = 0; diff --git a/runtime/hsa-runtime/core/inc/amd_memory_region.h b/runtime/hsa-runtime/core/inc/amd_memory_region.h index b052d5c38..bb6b76dae 100644 --- a/runtime/hsa-runtime/core/inc/amd_memory_region.h +++ b/runtime/hsa-runtime/core/inc/amd_memory_region.h @@ -143,7 +143,8 @@ class MemoryRegion : public core::MemoryRegion { } __forceinline bool IsSystem() const { - return mem_props_.HeapType == HSA_HEAPTYPE_SYSTEM; + return ((mem_props_.HeapType == HSA_HEAPTYPE_SYSTEM) || + (mem_props_.HeapType == HSA_HEAPTYPE_DEVICE_SVM)); } __forceinline bool IsLDS() const { diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp index 8571ab3a8..29d300410 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp @@ -42,17 +42,21 @@ #include "core/inc/amd_aie_agent.h" +#include + #include "core/inc/amd_aie_aql_queue.h" +#include "core/inc/amd_memory_region.h" #include "core/inc/driver.h" +#include "core/inc/runtime.h" namespace rocr { namespace AMD { AieAgent::AieAgent(uint32_t node) : core::Agent(core::DriverType::XDNA, node, - core::Agent::DeviceType::kAmdAieDevice), - max_queues_(core::Runtime::runtime_singleton_->flag().max_queues()) { + core::Agent::DeviceType::kAmdAieDevice) { InitRegionList(); + InitAllocators(); GetAgentProperties(); } @@ -82,7 +86,8 @@ hsa_status_t AieAgent::IterateRegion( hsa_status_t AieAgent::IterateCache(hsa_status_t 
(*callback)(hsa_cache_t cache, void *data), void *data) const { - return HSA_STATUS_SUCCESS; + // AIE has no caches. + return HSA_STATUS_ERROR_INVALID_CACHE; } hsa_status_t AieAgent::GetInfo(hsa_agent_info_t attribute, void *value) const { @@ -117,13 +122,13 @@ hsa_status_t AieAgent::GetInfo(hsa_agent_info_t attribute, void *value) const { *reinterpret_cast(value) = 0; break; case HSA_AGENT_INFO_QUEUES_MAX: - *reinterpret_cast(value) = maxQueues_; + *reinterpret_cast(value) = max_queues_; break; case HSA_AGENT_INFO_QUEUE_MIN_SIZE: - *reinterpret_cast(value) = minAqlSize_; + *reinterpret_cast(value) = min_aql_size_; break; case HSA_AGENT_INFO_QUEUE_MAX_SIZE: - *reinterpret_cast(value) = maxAqlSize_; + *reinterpret_cast(value) = max_aql_size_; break; case HSA_AGENT_INFO_QUEUE_TYPE: *reinterpret_cast(value) = HSA_QUEUE_TYPE_SINGLE; @@ -176,7 +181,7 @@ hsa_status_t AieAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, return HSA_STATUS_ERROR_INVALID_ARGUMENT; } - if (size < minAqlSize_ || size > maxAqlSize_) { + if (size < min_aql_size_ || size > max_aql_size_) { return HSA_STATUS_ERROR_INVALID_ARGUMENT; } @@ -186,12 +191,53 @@ hsa_status_t AieAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, return HSA_STATUS_SUCCESS; } -void AieAgent::InitRegionList() {} +void AieAgent::InitRegionList() { + /// TODO: Find a way to set the other memory properties in a reasonable way. + /// This should be easier once the ROCt source is incorporated into the + /// ROCr source. Since the AIE itself currently has no memory regions of + /// its own all memory is just the system DRAM. + + /// For allocating kernel arguments or other objects that only need + /// system memory. + HsaMemoryProperties sys_mem_props{ + .HeapType = HSA_HEAPTYPE_SYSTEM, + }; + /// For allocating memory for programmable device image (PDI) files. These + /// need to be mapped to the device so the hardware can access the PDIs. 
+ HsaMemoryProperties dev_mem_props{ + .HeapType = HSA_HEAPTYPE_DEVICE_SVM, + }; + /// As of now the AIE devices support coarse-grain memory regions that require + /// explicit sync operations. + regions_.push_back( + new MemoryRegion(false, true, false, false, true, this, sys_mem_props)); + regions_.push_back( + new MemoryRegion(false, false, false, false, true, this, dev_mem_props)); +} void AieAgent::GetAgentProperties() { core::Runtime::runtime_singleton_->AgentDriver(driver_type) .GetAgentProperties(*this); } +void AieAgent::InitAllocators() { + for (const auto *region : regions()) { + const MemoryRegion *amd_mem_region( + static_cast(region)); + if (amd_mem_region->kernarg()) { + system_allocator_ = + [region](size_t size, size_t align, + core::MemoryRegion::AllocateFlags alloc_flags) -> void * { + void *mem(nullptr); + return (core::Runtime::runtime_singleton_->AllocateMemory( + region, size, alloc_flags, &mem) == HSA_STATUS_SUCCESS) + ? mem + : nullptr; + }; + break; + } + } +} + } // namespace AMD } // namespace rocr diff --git a/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp b/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp index d54ff4b4f..6ac7e55cc 100644 --- a/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp @@ -208,6 +208,7 @@ hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute, case HSA_REGION_INFO_SEGMENT: switch (mem_props_.HeapType) { case HSA_HEAPTYPE_SYSTEM: + case HSA_HEAPTYPE_DEVICE_SVM: case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: *((hsa_region_segment_t*)value) = HSA_REGION_SEGMENT_GLOBAL; @@ -223,6 +224,7 @@ hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute, case HSA_REGION_INFO_GLOBAL_FLAGS: switch (mem_props_.HeapType) { case HSA_HEAPTYPE_SYSTEM: + case HSA_HEAPTYPE_DEVICE_SVM: case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: { uint32_t ret = 0; @@ -246,6 +248,7 @@ 
hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute, case HSA_REGION_INFO_ALLOC_MAX_SIZE: switch (mem_props_.HeapType) { case HSA_HEAPTYPE_SYSTEM: + case HSA_HEAPTYPE_DEVICE_SVM: *((size_t*)value) = max_sysmem_alloc_size_; break; case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: @@ -260,6 +263,7 @@ hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute, case HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: switch (mem_props_.HeapType) { case HSA_HEAPTYPE_SYSTEM: + case HSA_HEAPTYPE_DEVICE_SVM: case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: *((bool*)value) = true; @@ -272,6 +276,7 @@ hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute, case HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: switch (mem_props_.HeapType) { case HSA_HEAPTYPE_SYSTEM: + case HSA_HEAPTYPE_DEVICE_SVM: case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: *((size_t*)value) = kPageSize_; @@ -284,6 +289,7 @@ hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute, case HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: switch (mem_props_.HeapType) { case HSA_HEAPTYPE_SYSTEM: + case HSA_HEAPTYPE_DEVICE_SVM: case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: *((size_t*)value) = kPageSize_; From 93ca2cb9d838a335d9615a83894e975c91445055 Mon Sep 17 00:00:00 2001 From: Tony Gutierrez Date: Mon, 19 Aug 2024 15:50:47 +0000 Subject: [PATCH 04/20] rocr/aie: Allocate AIE queue's ring buf Change-Id: I799a8223d695ec5c0ea2eaea012bc1b5d877e103 --- .../hsa-runtime/core/inc/amd_aie_aql_queue.h | 29 +++++------- .../core/runtime/amd_aie_aql_queue.cpp | 47 ++++++++++--------- 2 files changed, 36 insertions(+), 40 deletions(-) diff --git a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h index ba03bc899..da0bfb43c 100644 --- a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h +++ b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h @@ -43,6 +43,8 @@ #ifndef 
HSA_RUNTIME_CORE_INC_AMD_HW_AQL_AIE_COMMAND_PROCESSOR_H_ #define HSA_RUNTIME_CORE_INC_AMD_HW_AQL_AIE_COMMAND_PROCESSOR_H_ +#include + #include "core/inc/amd_aie_agent.h" #include "core/inc/queue.h" #include "core/inc/runtime.h" @@ -114,29 +116,21 @@ class AieAqlQueue : public core::Queue, hsa_fence_scope_t releaseFence = HSA_FENCE_SCOPE_NONE, hsa_signal_t *signal = NULL) override; - core::SharedQueue *shared_queue_; - core::SharedSignal *shared_signal_; - /// ID of the queue used in communication with the AMD AIR driver. - uint32_t queue_id_; - /// ID of the doorbell used in communication with the AMD AIR driver. - uint32_t doorbell_id_; - /// Pointer to the hardware doorbell for this queue. - uint64_t *hardware_doorbell_ptr_; - /// ID of AIE device on which this queue has been mapped. - uint32_t node_id_; - /// Queue size in bytes. - uint32_t queue_size_bytes_; + uint32_t queue_id_ = INVALID_QUEUEID; + /// @brief ID of AIE device on which this queue has been mapped. + uint32_t node_id_ = std::numeric_limits::max(); + /// @brief Queue size in bytes. + uint32_t queue_size_bytes_ = std::numeric_limits::max(); protected: bool _IsA(Queue::rtti_t id) const override { return id == &rtti_id_; } private: - core::SharedQueue *CreateSharedQueue(AieAgent *agent, size_t req_size_pkts, - uint32_t node_id); - core::SharedSignal *CreateSharedSignal(AieAgent *agent); - AieAgent &agent_; + /// @brief Base of the queue's ring buffer storage. + void *ring_buf_ = nullptr; + /// @brief Handle for an application context on the AIE device. /// /// Each user queue will have an associated context. This handle is assigned @@ -147,7 +141,8 @@ class AieAqlQueue : public core::Queue, /// that multiple workloads with different core tile configurations can /// execute on the AIE agent at the same time. uint32_t hw_ctx_handle_ = std::numeric_limits::max(); - /// Indicates if queue is active. + + /// @brief Indicates if queue is active. 
std::atomic active_; static int rtti_id_; }; diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp index 6740a0946..9459909d3 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp @@ -72,11 +72,32 @@ AieAqlQueue::AieAqlQueue(AieAgent *agent, size_t req_size_pkts, uint32_t node_id) : Queue(0, 0), LocalSignal(0, false), DoorbellSignal(signal()), agent_(*agent), active_(false) { + if (agent_.device_type() != core::Agent::DeviceType::kAmdAieDevice) { + throw AMD::hsa_exception( + HSA_STATUS_ERROR_INVALID_AGENT, + "Attempting to create an AIE queue on a non-AIE agent."); + } + queue_size_bytes_ = req_size_pkts * sizeof(core::AqlPacket); + ring_buf_ = agent_.system_allocator()(queue_size_bytes_, 4096, + core::MemoryRegion::AllocateNoFlags); + + if (!ring_buf_) { + throw AMD::hsa_exception( + HSA_STATUS_ERROR_INVALID_QUEUE_CREATION, + "Could not allocate a ring buffer for an AIE queue."); + } + + // Populate hsa_queue_t fields. + amd_queue_.hsa_queue.type = HSA_QUEUE_TYPE_SINGLE; + amd_queue_.hsa_queue.id = INVALID_QUEUEID; amd_queue_.hsa_queue.doorbell_signal = Signal::Convert(this); - amd_queue_.hsa_queue.size = 0x40; + amd_queue_.hsa_queue.size = req_size_pkts; + amd_queue_.hsa_queue.base_address = ring_buf_; + // Populate AMD queue fields. 
+ amd_queue_.write_dispatch_id = 0; + amd_queue_.read_dispatch_id = 0; - signal_.hardware_doorbell_ptr = - reinterpret_cast(hardware_doorbell_ptr_); + signal_.hardware_doorbell_ptr = nullptr; signal_.kind = AMD_SIGNAL_KIND_DOORBELL; signal_.queue_ptr = &amd_queue_; active_ = true; @@ -200,26 +221,6 @@ hsa_status_t AieAqlQueue::GetInfo(hsa_queue_info_attribute_t attribute, return HSA_STATUS_SUCCESS; } -core::SharedQueue *AieAqlQueue::CreateSharedQueue(AieAgent *agent, - size_t req_size_pkts, - uint32_t node_id) { - queue_size_bytes_ = req_size_pkts * sizeof(core::AqlPacket); - - if (!IsPowerOfTwo(queue_size_bytes_)) { - throw AMD::hsa_exception( - HSA_STATUS_ERROR_INVALID_QUEUE_CREATION, - "Requested queue with non-power of two packet capacity.\n"); - } - - node_id_ = node_id; - - return nullptr; -} - -core::SharedSignal *AieAqlQueue::CreateSharedSignal(AieAgent *agent) { - return nullptr; -} - hsa_status_t AieAqlQueue::GetCUMasking(uint32_t num_cu_mask_count, uint32_t *cu_mask) { assert(false && "AIE AQL queue does not support CU masking."); From 04e2e25792857591f88f78bddd6e9f7397e0c326 Mon Sep 17 00:00:00 2001 From: Tony Gutierrez Date: Mon, 19 Aug 2024 15:51:24 +0000 Subject: [PATCH 05/20] rocr: Add AMD ext for configuring a queue's HW context This adds an AMD extension API for configuring a queue's hardware context. For AIEs the embedded runtime (ERT) creates a hardware context for a queue that has various configurable parameters, notably CUs can be configured. This API could also be used for future GPU HW context configuration (e.g., it could be used as a general way to set the CU masks). 
Change-Id: I7d1ce36d7ad6830f1b0c867bb2646e6e09cf4845 --- .../core/common/hsa_table_interface.cpp | 8 +++ .../core/driver/kfd/amd_kfd_driver.cpp | 8 +++ .../core/driver/xdna/amd_xdna_driver.cpp | 53 ++++++++++++++++++ .../hsa-runtime/core/inc/amd_aie_aql_queue.h | 3 + runtime/hsa-runtime/core/inc/amd_aql_queue.h | 11 ++++ runtime/hsa-runtime/core/inc/amd_kfd_driver.h | 3 + .../hsa-runtime/core/inc/amd_xdna_driver.h | 11 ++++ runtime/hsa-runtime/core/inc/driver.h | 11 ++++ runtime/hsa-runtime/core/inc/host_queue.h | 6 ++ .../hsa-runtime/core/inc/hsa_ext_amd_impl.h | 5 ++ .../hsa-runtime/core/inc/intercept_queue.h | 6 ++ runtime/hsa-runtime/core/inc/queue.h | 8 +++ .../core/runtime/amd_aie_aql_queue.cpp | 7 +++ .../core/runtime/hsa_api_trace.cpp | 3 +- .../hsa-runtime/core/runtime/hsa_ext_amd.cpp | 16 ++++++ runtime/hsa-runtime/hsacore.so.def | 1 + runtime/hsa-runtime/inc/hsa_api_trace.h | 1 + runtime/hsa-runtime/inc/hsa_ext_amd.h | 56 +++++++++++++++++++ 18 files changed, 216 insertions(+), 1 deletion(-) diff --git a/runtime/hsa-runtime/core/common/hsa_table_interface.cpp b/runtime/hsa-runtime/core/common/hsa_table_interface.cpp index fc42ac8a1..fa41e7ba6 100644 --- a/runtime/hsa-runtime/core/common/hsa_table_interface.cpp +++ b/runtime/hsa-runtime/core/common/hsa_table_interface.cpp @@ -922,6 +922,14 @@ uint32_t HSA_API wait_hint, satisfying_value); } +// Mirrors AMD Extension APIs. 
+hsa_status_t +hsa_amd_queue_hw_ctx_config(const hsa_queue_t *queue, + hsa_amd_queue_hw_ctx_config_param_t config_type, + void *args) { + return amdExtTable->hsa_amd_queue_hw_ctx_config_fn(queue, config_type, args); +} + // Mirrors Amd Extension Apis hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count, diff --git a/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp b/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp index 605b02236..776f4c814 100644 --- a/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp +++ b/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp @@ -244,6 +244,14 @@ hsa_status_t KfdDriver::DestroyQueue(core::Queue &queue) const { return HSA_STATUS_SUCCESS; } +hsa_status_t +KfdDriver::ConfigHwCtx(core::Queue &queue, + hsa_amd_queue_hw_ctx_config_param_t config_type, + void *args) { + // Only AIE queues support this for now. + return HSA_STATUS_ERROR_INVALID_QUEUE; +} + void *KfdDriver::AllocateKfdMemory(const HsaMemFlags &flags, uint32_t node_id, size_t size) { void *mem = nullptr; diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index 19b165618..a76b6806b 100644 --- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -45,6 +45,7 @@ #include #include +#include #include #include @@ -226,6 +227,20 @@ hsa_status_t XdnaDriver::DestroyQueue(core::Queue &queue) const { return HSA_STATUS_SUCCESS; } +hsa_status_t +XdnaDriver::ConfigHwCtx(core::Queue &queue, + hsa_amd_queue_hw_ctx_config_param_t config_type, + void *args) { + switch (config_type) { + case HSA_AMD_QUEUE_AIE_ERT_HW_CXT_CONFIG_CU: + return ConfigHwCtxCU( + queue, + *reinterpret_cast(args)); + default: + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } +} + hsa_status_t XdnaDriver::QueryDriverVersion() { amdxdna_drm_query_aie_version aie_version{0, 0}; amdxdna_drm_get_info 
args{DRM_AMDXDNA_QUERY_AIE_VERSION, sizeof(aie_version), @@ -307,5 +322,43 @@ hsa_status_t XdnaDriver::FreeDeviceHeap() { return HSA_STATUS_SUCCESS; } +hsa_status_t XdnaDriver::ConfigHwCtxCU( + core::Queue &queue, + hsa_amd_aie_ert_hw_ctx_config_cu_param_t &config_cu_param) { + if (!AieAqlQueue::IsType(&queue)) { + return HSA_STATUS_ERROR_INVALID_QUEUE; + } + + auto &aie_queue(static_cast(queue)); + + size_t config_cu_param_size(sizeof(amdxdna_hwctx_param_config_cu) + + config_cu_param.num_cus * + sizeof(amdxdna_cu_config)); + + amdxdna_hwctx_param_config_cu *xdna_config_cu_param = + reinterpret_cast( + malloc(config_cu_param_size)); + xdna_config_cu_param->num_cus = config_cu_param.num_cus; + + for (int i = 0; i < xdna_config_cu_param->num_cus; ++i) { + xdna_config_cu_param->cu_configs[i].cu_bo = + config_cu_param.cu_configs[i].cu_config_bo; + xdna_config_cu_param->cu_configs[i].cu_func = + config_cu_param.cu_configs[i].cu_func; + } + + amdxdna_drm_config_hwctx config_hw_ctx_args{ + .handle = aie_queue.GetHwCtxHandle(), + .param_type = DRM_AMDXDNA_HWCTX_CONFIG_CU, + .param_val = reinterpret_cast(xdna_config_cu_param), + .param_val_size = config_cu_param_size}; + + if (ioctl(fd_, DRM_IOCTL_AMDXDNA_CONFIG_HWCTX, &config_hw_ctx_args) < 0) { + return HSA_STATUS_ERROR; + } + + return HSA_STATUS_SUCCESS; +} + } // namespace AMD } // namespace rocr diff --git a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h index da0bfb43c..70f05e28a 100644 --- a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h +++ b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h @@ -106,6 +106,9 @@ class AieAqlQueue : public core::Queue, } uint32_t GetHwCtxHandle() const { return hw_ctx_handle_; } + hsa_status_t ConfigHwCtx(hsa_amd_queue_hw_ctx_config_param_t config_type, + void *args) override; + // GPU-specific queue functions are unsupported. 
hsa_status_t GetCUMasking(uint32_t num_cu_mask_count, uint32_t *cu_mask) override; diff --git a/runtime/hsa-runtime/core/inc/amd_aql_queue.h b/runtime/hsa-runtime/core/inc/amd_aql_queue.h index 09f14f941..1e0da0549 100644 --- a/runtime/hsa-runtime/core/inc/amd_aql_queue.h +++ b/runtime/hsa-runtime/core/inc/amd_aql_queue.h @@ -178,6 +178,17 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo /// @return uint64_t Value of write index before the update uint64_t AddWriteIndexRelease(uint64_t value) override; + /// @brief Configure the hardware context of a queue. + /// + /// @param config_type Specify the parameter type. Used to interpret @p args. + /// + /// @param args Queue-specific args for configuring the hardware context. + hsa_status_t ConfigHwCtx(hsa_amd_queue_hw_ctx_config_param_t config_type, + void *args) override { + // Currently only supported by AIE queues. + return HSA_STATUS_ERROR_INVALID_QUEUE; + } + /// @brief Set CU Masking /// /// @param num_cu_mask_count size of mask bit array diff --git a/runtime/hsa-runtime/core/inc/amd_kfd_driver.h b/runtime/hsa-runtime/core/inc/amd_kfd_driver.h index 764384dd8..190aabae5 100644 --- a/runtime/hsa-runtime/core/inc/amd_kfd_driver.h +++ b/runtime/hsa-runtime/core/inc/amd_kfd_driver.h @@ -79,6 +79,9 @@ class KfdDriver : public core::Driver { hsa_status_t FreeMemory(void *mem, size_t size) override; hsa_status_t CreateQueue(core::Queue &queue) const override; hsa_status_t DestroyQueue(core::Queue &queue) const override; + hsa_status_t ConfigHwCtx(core::Queue &queue, + hsa_amd_queue_hw_ctx_config_param_t config_type, + void *args) override; private: /// @brief Allocate agent accessible memory (system / local memory). 
diff --git a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h index e08db0e6f..398ceeb01 100644 --- a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h +++ b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h @@ -81,6 +81,10 @@ class XdnaDriver : public core::Driver { hsa_status_t CreateQueue(core::Queue &queue) const override; hsa_status_t DestroyQueue(core::Queue &queue) const override; + hsa_status_t ConfigHwCtx(core::Queue &queue, + hsa_amd_queue_hw_ctx_config_param_t config_type, + void *args) override; + private: hsa_status_t QueryDriverVersion(); /// @brief Allocate device accesible heap space. @@ -89,6 +93,13 @@ class XdnaDriver : public core::Driver { hsa_status_t InitDeviceHeap(); hsa_status_t FreeDeviceHeap(); + /// @brief Configures the CUs associated with the HW context for this queue. + /// + /// @param config_cu_param CU configuration information. + hsa_status_t + ConfigHwCtxCU(core::Queue &queue, + hsa_amd_aie_ert_hw_ctx_config_cu_param_t &config_cu_param); + /// @brief Virtual address range allocated for the device heap. /// /// Allocate a large enough space so we can carve out the device heap in diff --git a/runtime/hsa-runtime/core/inc/driver.h b/runtime/hsa-runtime/core/inc/driver.h index bbb70ea69..094eb913d 100644 --- a/runtime/hsa-runtime/core/inc/driver.h +++ b/runtime/hsa-runtime/core/inc/driver.h @@ -48,6 +48,7 @@ #include "core/inc/memory_region.h" #include "inc/hsa.h" +#include "inc/hsa_ext_amd.h" namespace rocr { namespace core { @@ -127,6 +128,16 @@ class Driver { virtual hsa_status_t DestroyQueue(Queue &queue) const = 0; + /// @brief Configure the hardware context for a queue. + /// @param[in] queue The queue whose context is being configured. + /// @param[in] config_type Type for the @p args argument. Tells the driver + /// how to interpret the args. + /// @param[in] args Arguments for configuring the queue's hardware context. + /// @p config_type tells how to interpret args. 
+ virtual hsa_status_t + ConfigHwCtx(Queue &queue, hsa_amd_queue_hw_ctx_config_param_t config_type, + void *args) = 0; + /// Unique identifier for supported kernel-mode drivers. const DriverType kernel_driver_type_; diff --git a/runtime/hsa-runtime/core/inc/host_queue.h b/runtime/hsa-runtime/core/inc/host_queue.h index ce0bfbbcc..3ec1f3daf 100644 --- a/runtime/hsa-runtime/core/inc/host_queue.h +++ b/runtime/hsa-runtime/core/inc/host_queue.h @@ -144,6 +144,12 @@ class HostQueue : public Queue { std::memory_order_release); } + hsa_status_t ConfigHwCtx(hsa_amd_queue_hw_ctx_config_param_t config_type, + void *args) override { + // Currently only supported by AIE queues. + return HSA_STATUS_ERROR_INVALID_QUEUE; + } + hsa_status_t SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) override { return HSA_STATUS_ERROR_INVALID_QUEUE; } diff --git a/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h b/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h index 5109d3976..4d3356a63 100644 --- a/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h +++ b/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h @@ -108,6 +108,11 @@ uint32_t hsa_wait_state_t wait_hint, hsa_signal_value_t* satisfying_value); +// Mirrors AMD Extension APIs. 
+hsa_status_t +hsa_amd_queue_hw_ctx_config(const hsa_queue_t *queue, + hsa_amd_queue_hw_ctx_config_param_t config_type, + void *args); // Mirrors Amd Extension Apis hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count, diff --git a/runtime/hsa-runtime/core/inc/intercept_queue.h b/runtime/hsa-runtime/core/inc/intercept_queue.h index 8088d5e92..0638507c7 100644 --- a/runtime/hsa-runtime/core/inc/intercept_queue.h +++ b/runtime/hsa-runtime/core/inc/intercept_queue.h @@ -114,6 +114,12 @@ class QueueWrapper : public Queue { uint64_t AddWriteIndexRelease(uint64_t value) override { return wrapped->AddWriteIndexRelease(value); } + + hsa_status_t ConfigHwCtx(hsa_amd_queue_hw_ctx_config_param_t config_type, + void *args) override { + return wrapped->ConfigHwCtx(config_type, args); + } + hsa_status_t SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) override { return wrapped->SetCUMasking(num_cu_mask_count, cu_mask); } diff --git a/runtime/hsa-runtime/core/inc/queue.h b/runtime/hsa-runtime/core/inc/queue.h index 43bbe13e7..8979277ed 100644 --- a/runtime/hsa-runtime/core/inc/queue.h +++ b/runtime/hsa-runtime/core/inc/queue.h @@ -330,6 +330,14 @@ class Queue : public Checked<0xFA3906A679F9DB49>, private LocalQueue { /// @return uint64_t Value of write index before the update virtual uint64_t AddWriteIndexRelease(uint64_t value) = 0; + /// @brief Configure the hardware context of a queue. + /// + /// @param config_type Specify the parameter type. Used to interpret @p args. + /// + /// @param args Queue-specific args for configuring the hardware context. 
+ virtual hsa_status_t + ConfigHwCtx(hsa_amd_queue_hw_ctx_config_param_t config_type, void *args) = 0; + /// @brief Set CU Masking /// /// @param num_cu_mask_count size of mask bit array diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp index 9459909d3..e8562f226 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp @@ -221,6 +221,13 @@ hsa_status_t AieAqlQueue::GetInfo(hsa_queue_info_attribute_t attribute, return HSA_STATUS_SUCCESS; } +hsa_status_t +AieAqlQueue::ConfigHwCtx(hsa_amd_queue_hw_ctx_config_param_t config_type, + void *args) { + return core::Runtime::runtime_singleton_->AgentDriver(agent_.driver_type) + .ConfigHwCtx(*this, config_type, args); +} + hsa_status_t AieAqlQueue::GetCUMasking(uint32_t num_cu_mask_count, uint32_t *cu_mask) { assert(false && "AIE AQL queue does not support CU masking."); diff --git a/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp b/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp index 02d1e6bee..cee7ebac2 100644 --- a/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp +++ b/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp @@ -80,7 +80,7 @@ void HsaApiTable::Init() { // they can add preprocessor macros on the new functions constexpr size_t expected_core_api_table_size = 1016; - constexpr size_t expected_amd_ext_table_size = 584; + constexpr size_t expected_amd_ext_table_size = 592; constexpr size_t expected_image_ext_table_size = 120; constexpr size_t expected_finalizer_ext_table_size = 64; constexpr size_t expected_tools_table_size = 64; @@ -406,6 +406,7 @@ void HsaApiTable::UpdateAmdExts() { amd_ext_api.hsa_amd_signal_async_handler_fn = AMD::hsa_amd_signal_async_handler; amd_ext_api.hsa_amd_async_function_fn = AMD::hsa_amd_async_function; amd_ext_api.hsa_amd_signal_wait_any_fn = AMD::hsa_amd_signal_wait_any; + amd_ext_api.hsa_amd_queue_hw_ctx_config_fn = 
AMD::hsa_amd_queue_hw_ctx_config; amd_ext_api.hsa_amd_queue_cu_set_mask_fn = AMD::hsa_amd_queue_cu_set_mask; amd_ext_api.hsa_amd_queue_cu_get_mask_fn = AMD::hsa_amd_queue_cu_get_mask; amd_ext_api.hsa_amd_memory_pool_get_info_fn = AMD::hsa_amd_memory_pool_get_info; diff --git a/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp b/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp index cdc046095..b9207d8bb 100644 --- a/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp +++ b/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp @@ -618,6 +618,22 @@ hsa_status_t hsa_amd_async_function(void (*callback)(void* arg), void* arg) { CATCH; } +hsa_status_t +hsa_amd_queue_hw_ctx_config(const hsa_queue_t *queue, + hsa_amd_queue_hw_ctx_config_param_t config_type, + void *args) { + TRY; + IS_OPEN(); + + IS_BAD_PTR(args); + core::Queue *cmd_queue = core::Queue::Convert(queue); + IS_VALID(cmd_queue); + + return cmd_queue->ConfigHwCtx(config_type, args); + + CATCH; +} + hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count, const uint32_t* cu_mask) { TRY; diff --git a/runtime/hsa-runtime/hsacore.so.def b/runtime/hsa-runtime/hsacore.so.def index 3a575367f..7bd67e619 100644 --- a/runtime/hsa-runtime/hsacore.so.def +++ b/runtime/hsa-runtime/hsacore.so.def @@ -178,6 +178,7 @@ global: hsa_amd_signal_async_handler; hsa_amd_async_function; hsa_amd_image_get_info_max_dim; + hsa_amd_queue_hw_ctx_config; hsa_amd_queue_cu_set_mask; hsa_amd_queue_cu_get_mask; hsa_amd_memory_fill; diff --git a/runtime/hsa-runtime/inc/hsa_api_trace.h b/runtime/hsa-runtime/inc/hsa_api_trace.h index e0063e6da..3cadef74e 100644 --- a/runtime/hsa-runtime/inc/hsa_api_trace.h +++ b/runtime/hsa-runtime/inc/hsa_api_trace.h @@ -204,6 +204,7 @@ struct AmdExtTable { decltype(hsa_amd_signal_async_handler)* hsa_amd_signal_async_handler_fn; decltype(hsa_amd_async_function)* hsa_amd_async_function_fn; decltype(hsa_amd_signal_wait_any)* hsa_amd_signal_wait_any_fn; + 
decltype(hsa_amd_queue_hw_ctx_config) *hsa_amd_queue_hw_ctx_config_fn; decltype(hsa_amd_queue_cu_set_mask)* hsa_amd_queue_cu_set_mask_fn; decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info_fn; decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools_fn; diff --git a/runtime/hsa-runtime/inc/hsa_ext_amd.h b/runtime/hsa-runtime/inc/hsa_ext_amd.h index 0f0f5fcf9..6a4bce698 100644 --- a/runtime/hsa-runtime/inc/hsa_ext_amd.h +++ b/runtime/hsa-runtime/inc/hsa_ext_amd.h @@ -1319,6 +1319,62 @@ hsa_status_t HSA_API hsa_amd_image_get_info_max_dim(hsa_agent_t agent, * @{ */ +/** + * @brief Hardware context configuration for one AIE CU. + */ +typedef struct hsa_amd_aie_ert_hw_ctx_cu_config_s { + /** + * @brief CU configuration BO handle. + */ + uint32_t cu_config_bo; + /** + * @brief Function of a CU. + */ + uint8_t cu_func; + uint8_t reserved[3]; +} hsa_amd_aie_ert_hw_ctx_cu_config_t; + +typedef struct hsa_amd_aie_ert_hw_ctx_config_cu_param_s { + /** + * @brief Number of CUs to configure. + */ + uint16_t num_cus; + uint16_t reserved[3]; + /** + * @brief List of CU configurations. + */ + hsa_amd_aie_ert_hw_ctx_cu_config_t *cu_configs; +} hsa_amd_aie_ert_hw_ctx_config_cu_param_t; + +/** + * @brief Specify a hardware context configuration parameter type for a queue. + */ +typedef enum { + /** + * @brief Configure the CUs assigned to the AIE ERT HW context. + */ + HSA_AMD_QUEUE_AIE_ERT_HW_CXT_CONFIG_CU = 0 +} hsa_amd_queue_hw_ctx_config_param_t; + +/** + * @brief Configures the hardware context of a queue. + * + * @details This can be used to send configuration data to the queue so it can + * configure various hardware components that support the queue. The payload + * used to describe the configuration is interpreted on a per-queue-type basis. + * + * @param[in] queue HSA queue whose HW context is being configured. + * + * @param[in] config_type Specifies the type of the configuration. Used to + * determine how to interpret @p args. 
+ * + * @param[in] args Configuration payload. Will be interpreted by the queue + * based on @p config_type. + */ +hsa_status_t HSA_API hsa_amd_queue_hw_ctx_config( + const hsa_queue_t *queue, hsa_amd_queue_hw_ctx_config_param_t config_type, + void *args); + /** * @brief Set a queue's CU affinity mask. * From a5790de101965464fa6562de777a681857140ccd Mon Sep 17 00:00:00 2001 From: Tony Gutierrez Date: Mon, 19 Aug 2024 15:52:43 +0000 Subject: [PATCH 06/20] rocr/aie: Support VMEM handle creation Adds support for AllocateMemoryOnly inside XDNA driver. Move the IsLocalMemory() check inside the KFD driver since the XDNA driver can, and needs to, create handles on system memory buffer objects. Changed handle variable name from thunk_handle to user_mode_driver_handle, which is more representative if we support non-GPU drivers. Change-Id: I95db9d575afd1ab0ff2de74cea5175d9a12a721b --- .../core/driver/kfd/amd_kfd_driver.cpp | 5 +++++ .../core/driver/xdna/amd_xdna_driver.cpp | 20 +++++++++++++++---- .../hsa-runtime/core/inc/amd_xdna_driver.h | 7 +++++++ runtime/hsa-runtime/core/runtime/runtime.cpp | 14 +++++++------ 4 files changed, 36 insertions(+), 10 deletions(-) diff --git a/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp b/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp index 776f4c814..59ac3cd83 100644 --- a/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp +++ b/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp @@ -103,6 +103,11 @@ KfdDriver::AllocateMemory(const core::MemoryRegion &mem_region, kmt_alloc_flags.ui32.NonPaged = 1; } + if (m_region.IsLocalMemory() && + (alloc_flags & core::MemoryRegion::AllocateMemoryOnly)) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + // Allocating a memory handle for virtual memory kmt_alloc_flags.ui32.NoAddress = !!(alloc_flags & core::MemoryRegion::AllocateMemoryOnly); diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index 
a76b6806b..25071a311 100644 --- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -134,6 +134,7 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region, amdxdna_drm_create_bo create_bo_args{.size = size}; amdxdna_drm_get_bo_info get_bo_info_args{0}; drm_gem_close close_bo_args{0}; + void *mapped_mem(nullptr); if (!m_region.IsSystem()) { return HSA_STATUS_ERROR_INVALID_REGION; @@ -160,18 +161,29 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region, return HSA_STATUS_ERROR; } + /// TODO: For now we always map the memory and keep a mapping from handles + /// to VA memory addresses. Once we can support the separate VMEM call to + /// map handles we can fix this. if (m_region.kernarg()) { - *mem = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, - get_bo_info_args.map_offset); - if (*mem == MAP_FAILED) { + mapped_mem = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, + get_bo_info_args.map_offset); + if (mapped_mem == MAP_FAILED) { // Close the BO in the case when a mapping fails and we got a BO handle. 
ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_bo_args); return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } } else { - *mem = reinterpret_cast(get_bo_info_args.vaddr); + mapped_mem = reinterpret_cast(get_bo_info_args.vaddr); } + if (alloc_flags & core::MemoryRegion::AllocateMemoryOnly) { + *mem = reinterpret_cast(create_bo_args.handle); + } else { + *mem = mapped_mem; + } + + vmem_handle_mappings.emplace(create_bo_args.handle, mapped_mem); + return HSA_STATUS_SUCCESS; } diff --git a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h index 398ceeb01..0d64cea7a 100644 --- a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h +++ b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h @@ -43,6 +43,7 @@ #define HSA_RUNTIME_CORE_INC_AMD_XDNA_DRIVER_H_ #include +#include #include "core/inc/driver.h" #include "core/inc/memory_region.h" @@ -100,6 +101,12 @@ class XdnaDriver : public core::Driver { ConfigHwCtxCU(core::Queue &queue, hsa_amd_aie_ert_hw_ctx_config_cu_param_t &config_cu_param); + /// TODO: Probably remove this in the future and rely on the core Runtime + /// object to track handle allocations. Using the VMEM API for mapping XDNA + /// driver handles requires a bit more refactoring. So rely on the XDNA driver + /// to manage some of this for now. + std::unordered_map vmem_handle_mappings; + /// @brief Virtual address range allocated for the device heap. 
/// /// Allocate a large enough space so we can carve out the device heap in diff --git a/runtime/hsa-runtime/core/runtime/runtime.cpp b/runtime/hsa-runtime/core/runtime/runtime.cpp index e25cf5565..ab53fbe98 100644 --- a/runtime/hsa-runtime/core/runtime/runtime.cpp +++ b/runtime/hsa-runtime/core/runtime/runtime.cpp @@ -3111,20 +3111,22 @@ hsa_status_t Runtime::VMemoryHandleCreate(const MemoryRegion* region, size_t siz uint64_t flags_unused, hsa_amd_vmem_alloc_handle_t* memoryOnlyHandle) { const AMD::MemoryRegion* memRegion = static_cast(region); - if (!memRegion->IsLocalMemory()) return HSA_STATUS_ERROR_INVALID_ARGUMENT; if (!IsMultipleOf(size, memRegion->GetPageSize())) return HSA_STATUS_ERROR_INVALID_ARGUMENT; ScopedAcquire lock(&memory_lock_); - void* thunk_handle; - hsa_status_t status = region->Allocate(size, alloc_flags, &thunk_handle, 0); + void *user_mode_driver_handle; + hsa_status_t status = + region->Allocate(size, alloc_flags, &user_mode_driver_handle, 0); if (status == HSA_STATUS_SUCCESS) { memory_handle_map_.emplace(std::piecewise_construct, - std::forward_as_tuple(thunk_handle), - std::forward_as_tuple(region, size, flags_unused, thunk_handle, alloc_flags)); + std::forward_as_tuple(user_mode_driver_handle), + std::forward_as_tuple(region, size, flags_unused, + user_mode_driver_handle, + alloc_flags)); - *memoryOnlyHandle = MemoryHandle::Convert(thunk_handle); + *memoryOnlyHandle = MemoryHandle::Convert(user_mode_driver_handle); } return status; } From 03edf037b150ecb6c58c921f542a376145e1c7c3 Mon Sep 17 00:00:00 2001 From: Yiannis Papadopoulos Date: Wed, 28 Aug 2024 10:33:36 -0700 Subject: [PATCH 07/20] rocr/aie: Correct reporting of dev heap size Storing the correct dev heap size in the memory region. 
Change-Id: I14b053330c187da1d7d0213256625e50795b9902 --- .../core/driver/xdna/amd_xdna_driver.cpp | 4 ++++ runtime/hsa-runtime/core/inc/amd_xdna_driver.h | 3 +++ .../hsa-runtime/core/runtime/amd_aie_agent.cpp | 15 +++++++++------ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index 25071a311..690c043b4 100644 --- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -85,6 +85,10 @@ hsa_status_t XdnaDriver::DiscoverDriver() { return HSA_STATUS_ERROR; } +uint64_t XdnaDriver::GetDevHeapByteSize() { + return dev_heap_size; +} + hsa_status_t XdnaDriver::Init() { return InitDeviceHeap(); } hsa_status_t XdnaDriver::QueryKernelModeDriver(core::DriverQuery query) { diff --git a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h index 0d64cea7a..871d1185f 100644 --- a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h +++ b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h @@ -63,6 +63,9 @@ class XdnaDriver : public core::Driver { static hsa_status_t DiscoverDriver(); + /// @brief Returns the size of the dev heap in bytes. 
+ static uint64_t GetDevHeapByteSize(); + hsa_status_t Init() override; hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override; diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp index 29d300410..4d2d1f60f 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp @@ -46,6 +46,7 @@ #include "core/inc/amd_aie_aql_queue.h" #include "core/inc/amd_memory_region.h" +#include "core/inc/amd_xdna_driver.h" #include "core/inc/driver.h" #include "core/inc/runtime.h" @@ -199,16 +200,18 @@ void AieAgent::InitRegionList() { /// For allocating kernel arguments or other objects that only need /// system memory. - HsaMemoryProperties sys_mem_props{ - .HeapType = HSA_HEAPTYPE_SYSTEM, - }; + HsaMemoryProperties sys_mem_props = {}; + sys_mem_props.HeapType = HSA_HEAPTYPE_SYSTEM; + /// For allocating memory for programmable device image (PDI) files. These /// need to be mapped to the device so the hardware can access the PDIs. - HsaMemoryProperties dev_mem_props{ - .HeapType = HSA_HEAPTYPE_DEVICE_SVM, - }; + HsaMemoryProperties dev_mem_props = {}; + dev_mem_props.HeapType = HSA_HEAPTYPE_DEVICE_SVM, + dev_mem_props.SizeInBytes = XdnaDriver::GetDevHeapByteSize(); + /// As of now the AIE devices support coarse-grain memory regions that require /// explicit sync operations. 
+ regions_.reserve(2); regions_.push_back( new MemoryRegion(false, true, false, false, true, this, sys_mem_props)); regions_.push_back( From bfaa50e8c2f1947eddd30e91a299f2a350c1a28a Mon Sep 17 00:00:00 2001 From: Yiannis Papadopoulos <102817138+ypapadop-amd@users.noreply.github.com> Date: Tue, 11 Jun 2024 16:13:31 -0400 Subject: [PATCH 08/20] rocr/aie: Missing AIEAgent info cases * Add missing info cases to AIEAgent * Following the single value set from GPUAgent * AIEs have no caches, no cacheline is reported --- .../core/runtime/amd_aie_agent.cpp | 58 ++++++++++++++++++- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp index 4d2d1f60f..cd374c65c 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp @@ -3,7 +3,7 @@ // The University of Illinois/NCSA // Open Source License (NCSA) // -// Copyright (c) 2022-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. // // Developed by: // @@ -111,14 +111,30 @@ hsa_status_t AieAgent::GetInfo(hsa_agent_info_t attribute, void *value) const { case HSA_AGENT_INFO_MACHINE_MODEL: *reinterpret_cast(value) = HSA_MACHINE_MODEL_LARGE; break; - case HSA_AGENT_INFO_PROFILE: + case HSA_AGENT_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES: + case HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE: + // TODO: validate if this is true. 
+ *reinterpret_cast(value) = + HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR; + break; + case HSA_AGENT_INFO_PROFILE: *reinterpret_cast(value) = profile_; break; case HSA_AGENT_INFO_WAVEFRONT_SIZE: + *reinterpret_cast(value) = 0; + break; case HSA_AGENT_INFO_WORKGROUP_MAX_DIM: + std::memset(value, 0, sizeof(uint16_t) * 3); + break; case HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: + *reinterpret_cast(value) = 0; + break; case HSA_AGENT_INFO_GRID_MAX_DIM: + std::memset(value, 0, sizeof(hsa_dim3_t)); + break; case HSA_AGENT_INFO_GRID_MAX_SIZE: + *reinterpret_cast(value) = 0; + break; case HSA_AGENT_INFO_FBARRIER_MAX_SIZE: *reinterpret_cast(value) = 0; break; @@ -149,6 +165,36 @@ hsa_status_t AieAgent::GetInfo(hsa_agent_info_t attribute, void *value) const { case HSA_AGENT_INFO_VERSION_MINOR: *reinterpret_cast(value) = 0; break; + case HSA_AMD_AGENT_INFO_CHIP_ID: + *reinterpret_cast(value) = 0; + break; + case HSA_AMD_AGENT_INFO_CACHELINE_SIZE: + *reinterpret_cast(value) = 0; + break; + case HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: + *reinterpret_cast(value) = 0; + break; + case HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY: + *reinterpret_cast(value) = 0; + break; + case HSA_AMD_AGENT_INFO_DRIVER_NODE_ID: + *reinterpret_cast(value) = node_id(); + break; + case HSA_AMD_AGENT_INFO_MAX_ADDRESS_WATCH_POINTS: + *reinterpret_cast(value) = 0; + break; + case HSA_AMD_AGENT_INFO_BDFID: + *reinterpret_cast(value) = 0; + break; + case HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU: + *reinterpret_cast(value) = 0; + break; + case HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES: + *reinterpret_cast(value) = 0; + break; + case HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE: + *reinterpret_cast(value) = 0; + break; case HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS: case HSA_EXT_AGENT_INFO_IMAGE_1DA_MAX_ELEMENTS: case HSA_EXT_AGENT_INFO_IMAGE_1DB_MAX_ELEMENTS: @@ -165,6 +211,14 @@ hsa_status_t AieAgent::GetInfo(hsa_agent_info_t attribute, void *value) const { std::strcpy(reinterpret_cast(value), product_name_info_.c_str()); break; } + 
case HSA_AMD_AGENT_INFO_ASIC_REVISION: + *reinterpret_cast(value) = 0; + break; + case HSA_AMD_AGENT_INFO_SVM_DIRECT_HOST_ACCESS: + // Requires the AIE agent memory regions to be populated. + assert(regions_.size() != 0 && "No device local memory found!"); + *reinterpret_cast(value) = true; + break; default: *reinterpret_cast(value) = 0; return HSA_STATUS_ERROR_INVALID_ARGUMENT; From 349d6f5a06a940bbf7195a89a47988b5ae12af19 Mon Sep 17 00:00:00 2001 From: eddierichter-amd Date: Thu, 8 Aug 2024 10:18:16 -0600 Subject: [PATCH 09/20] rocr/aie: Handle sideband * Adding a sideband interface to get a handle from a virtual address. This is just temporary and should be removed when we move to the vmem API * Removing unused queue structure * Removing the queue from the arguments of hsa_amd_get_handle_from_vaddr * Free XDNA BOs --------- Co-authored-by: Yiannis Papadopoulos --- .../core/common/hsa_table_interface.cpp | 5 +++ .../core/driver/kfd/amd_kfd_driver.cpp | 7 +++- .../core/driver/xdna/amd_xdna_driver.cpp | 35 ++++++++++++++++--- runtime/hsa-runtime/core/inc/amd_kfd_driver.h | 1 + .../hsa-runtime/core/inc/amd_xdna_driver.h | 5 +++ runtime/hsa-runtime/core/inc/driver.h | 2 ++ .../hsa-runtime/core/inc/hsa_ext_amd_impl.h | 4 +++ runtime/hsa-runtime/core/inc/runtime.h | 2 ++ .../core/runtime/hsa_api_trace.cpp | 3 +- .../hsa-runtime/core/runtime/hsa_ext_amd.cpp | 13 +++++++ runtime/hsa-runtime/core/runtime/runtime.cpp | 11 ++++++ runtime/hsa-runtime/hsacore.so.def | 1 + runtime/hsa-runtime/inc/hsa_api_trace.h | 1 + runtime/hsa-runtime/inc/hsa_ext_amd.h | 3 ++ 14 files changed, 87 insertions(+), 6 deletions(-) diff --git a/runtime/hsa-runtime/core/common/hsa_table_interface.cpp b/runtime/hsa-runtime/core/common/hsa_table_interface.cpp index fa41e7ba6..69b938e66 100644 --- a/runtime/hsa-runtime/core/common/hsa_table_interface.cpp +++ b/runtime/hsa-runtime/core/common/hsa_table_interface.cpp @@ -930,6 +930,11 @@ hsa_amd_queue_hw_ctx_config(const hsa_queue_t *queue, return
amdExtTable->hsa_amd_queue_hw_ctx_config_fn(queue, config_type, args); } +// Mirrors AMD Extension APIs. +hsa_status_t hsa_amd_get_handle_from_vaddr(void* ptr, uint32_t* handle) { + return amdExtTable->hsa_amd_get_handle_from_vaddr_fn(ptr, handle); +} + // Mirrors Amd Extension Apis hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count, diff --git a/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp b/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp index 59ac3cd83..f405bf502 100644 --- a/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp +++ b/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp @@ -254,7 +254,12 @@ KfdDriver::ConfigHwCtx(core::Queue &queue, hsa_amd_queue_hw_ctx_config_param_t config_type, void *args) { // Only AIE queues support this for now. - return HSA_STATUS_ERROR_INVALID_QUEUE; + return HSA_STATUS_ERROR_INVALID_AGENT; +} + +hsa_status_t KfdDriver::GetHandleFromVaddr(void* ptr, uint32_t* handle) { + // Only AIE queues support this for now. 
+ return HSA_STATUS_ERROR_INVALID_AGENT; } void *KfdDriver::AllocateKfdMemory(const HsaMemFlags &flags, uint32_t node_id, diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index 690c043b4..7fbca3d64 100644 --- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -134,17 +134,17 @@ hsa_status_t XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region, core::MemoryRegion::AllocateFlags alloc_flags, void **mem, size_t size, uint32_t node_id) { - const MemoryRegion &m_region(static_cast(mem_region)); + const auto ®ion = static_cast(mem_region); amdxdna_drm_create_bo create_bo_args{.size = size}; amdxdna_drm_get_bo_info get_bo_info_args{0}; drm_gem_close close_bo_args{0}; void *mapped_mem(nullptr); - if (!m_region.IsSystem()) { + if (!region.IsSystem()) { return HSA_STATUS_ERROR_INVALID_REGION; } - if (m_region.kernarg()) { + if (region.kernarg()) { create_bo_args.type = AMDXDNA_BO_CMD; } else { create_bo_args.type = AMDXDNA_BO_DEV; @@ -187,11 +187,30 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region, } vmem_handle_mappings.emplace(create_bo_args.handle, mapped_mem); + vmem_handle_mappings_reverse.emplace(mapped_mem, create_bo_args.handle); return HSA_STATUS_SUCCESS; } -hsa_status_t XdnaDriver::FreeMemory(void *mem, size_t size) { +hsa_status_t XdnaDriver::FreeMemory(void* ptr, size_t size) { + auto it = vmem_handle_mappings_reverse.find(ptr); + if (it == vmem_handle_mappings_reverse.end()) + return HSA_STATUS_ERROR_INVALID_ALLOCATION; + + // TODO:ypapadop-amd: need to unmap memory, but we don't know if it's mapped or not as we don't have + // region information + + auto handle = it->second; + + drm_gem_close close_args = {}; + close_args.handle = handle; + if (ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_args) < 0) { + return HSA_STATUS_ERROR; + } + + vmem_handle_mappings.erase(handle); + 
vmem_handle_mappings_reverse.erase(it); + return HSA_STATUS_SUCCESS; } @@ -257,6 +276,14 @@ XdnaDriver::ConfigHwCtx(core::Queue &queue, } } +hsa_status_t XdnaDriver::GetHandleFromVaddr(void* ptr, uint32_t* handle) { + auto it = vmem_handle_mappings_reverse.find(ptr); + if (it == vmem_handle_mappings_reverse.end()) + return HSA_STATUS_ERROR_INVALID_ALLOCATION; + *handle = it->second; + return HSA_STATUS_SUCCESS; +} + hsa_status_t XdnaDriver::QueryDriverVersion() { amdxdna_drm_query_aie_version aie_version{0, 0}; amdxdna_drm_get_info args{DRM_AMDXDNA_QUERY_AIE_VERSION, sizeof(aie_version), diff --git a/runtime/hsa-runtime/core/inc/amd_kfd_driver.h b/runtime/hsa-runtime/core/inc/amd_kfd_driver.h index 190aabae5..bd6f376fb 100644 --- a/runtime/hsa-runtime/core/inc/amd_kfd_driver.h +++ b/runtime/hsa-runtime/core/inc/amd_kfd_driver.h @@ -82,6 +82,7 @@ class KfdDriver : public core::Driver { hsa_status_t ConfigHwCtx(core::Queue &queue, hsa_amd_queue_hw_ctx_config_param_t config_type, void *args) override; + hsa_status_t GetHandleFromVaddr(void* ptr, uint32_t* handle) override; private: /// @brief Allocate agent accessible memory (system / local memory). diff --git a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h index 871d1185f..28572e135 100644 --- a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h +++ b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h @@ -89,6 +89,8 @@ class XdnaDriver : public core::Driver { hsa_amd_queue_hw_ctx_config_param_t config_type, void *args) override; + hsa_status_t GetHandleFromVaddr(void* ptr, uint32_t* handle) override; + private: hsa_status_t QueryDriverVersion(); /// @brief Allocate device accesible heap space. @@ -110,6 +112,9 @@ class XdnaDriver : public core::Driver { /// to manage some of this for now. 
std::unordered_map vmem_handle_mappings; + // TODO: Remove this once we move to the vmem API + std::unordered_map vmem_handle_mappings_reverse; + /// @brief Virtual address range allocated for the device heap. /// /// Allocate a large enough space so we can carve out the device heap in diff --git a/runtime/hsa-runtime/core/inc/driver.h b/runtime/hsa-runtime/core/inc/driver.h index 094eb913d..580a12ccb 100644 --- a/runtime/hsa-runtime/core/inc/driver.h +++ b/runtime/hsa-runtime/core/inc/driver.h @@ -138,6 +138,8 @@ class Driver { ConfigHwCtx(Queue &queue, hsa_amd_queue_hw_ctx_config_param_t config_type, void *args) = 0; + virtual hsa_status_t GetHandleFromVaddr(void* ptr, uint32_t* handle) = 0; + /// Unique identifier for supported kernel-mode drivers. const DriverType kernel_driver_type_; diff --git a/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h b/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h index 4d3356a63..d10300e89 100644 --- a/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h +++ b/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h @@ -113,6 +113,10 @@ hsa_status_t hsa_amd_queue_hw_ctx_config(const hsa_queue_t *queue, hsa_amd_queue_hw_ctx_config_param_t config_type, void *args); + +// Mirrors AMD Extension APIs. 
+hsa_status_t hsa_amd_get_handle_from_vaddr(void* ptr, uint32_t* handle); + // Mirrors Amd Extension Apis hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count, diff --git a/runtime/hsa-runtime/core/inc/runtime.h b/runtime/hsa-runtime/core/inc/runtime.h index 981bd4852..3e3f48294 100644 --- a/runtime/hsa-runtime/core/inc/runtime.h +++ b/runtime/hsa-runtime/core/inc/runtime.h @@ -403,6 +403,8 @@ class Runtime { const core::MemoryRegion** mem_region, hsa_amd_memory_type_t* type); + hsa_status_t GetHandleFromVaddr(void* ptr, uint32_t* handle); + hsa_status_t EnableLogging(uint8_t* flags, void* file); const std::vector& cpu_agents() { return cpu_agents_; } diff --git a/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp b/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp index cee7ebac2..b4985c3be 100644 --- a/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp +++ b/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp @@ -80,7 +80,7 @@ void HsaApiTable::Init() { // they can add preprocessor macros on the new functions constexpr size_t expected_core_api_table_size = 1016; - constexpr size_t expected_amd_ext_table_size = 592; + constexpr size_t expected_amd_ext_table_size = 600; constexpr size_t expected_image_ext_table_size = 120; constexpr size_t expected_finalizer_ext_table_size = 64; constexpr size_t expected_tools_table_size = 64; @@ -407,6 +407,7 @@ void HsaApiTable::UpdateAmdExts() { amd_ext_api.hsa_amd_async_function_fn = AMD::hsa_amd_async_function; amd_ext_api.hsa_amd_signal_wait_any_fn = AMD::hsa_amd_signal_wait_any; amd_ext_api.hsa_amd_queue_hw_ctx_config_fn = AMD::hsa_amd_queue_hw_ctx_config; + amd_ext_api.hsa_amd_get_handle_from_vaddr_fn = AMD::hsa_amd_get_handle_from_vaddr; amd_ext_api.hsa_amd_queue_cu_set_mask_fn = AMD::hsa_amd_queue_cu_set_mask; amd_ext_api.hsa_amd_queue_cu_get_mask_fn = AMD::hsa_amd_queue_cu_get_mask; amd_ext_api.hsa_amd_memory_pool_get_info_fn = AMD::hsa_amd_memory_pool_get_info; diff --git
a/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp b/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp index b9207d8bb..0184e175e 100644 --- a/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp +++ b/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp @@ -634,6 +634,19 @@ hsa_amd_queue_hw_ctx_config(const hsa_queue_t *queue, CATCH; } + +hsa_status_t hsa_amd_get_handle_from_vaddr(void* ptr, uint32_t* handle) { + TRY; + IS_OPEN(); + + IS_BAD_PTR(ptr); + IS_BAD_PTR(handle); + + return core::Runtime::runtime_singleton_->GetHandleFromVaddr(ptr, handle); + + CATCH; +} + hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count, const uint32_t* cu_mask) { TRY; diff --git a/runtime/hsa-runtime/core/runtime/runtime.cpp b/runtime/hsa-runtime/core/runtime/runtime.cpp index ab53fbe98..a266df260 100644 --- a/runtime/hsa-runtime/core/runtime/runtime.cpp +++ b/runtime/hsa-runtime/core/runtime/runtime.cpp @@ -3605,6 +3605,17 @@ hsa_status_t Runtime::VMemoryGetAllocPropertiesFromHandle(hsa_amd_vmem_alloc_han return HSA_STATUS_SUCCESS; } +hsa_status_t Runtime::GetHandleFromVaddr(void* ptr, uint32_t* handle) { + auto it = allocation_map_.find(ptr); + if (it == allocation_map_.end()) { + return HSA_STATUS_ERROR_INVALID_ALLOCATION; + } + + auto* agent = it->second.region->owner(); + auto& driver = AgentDriver(agent->driver_type); + return driver.GetHandleFromVaddr(ptr, handle); +} + hsa_status_t Runtime::EnableLogging(uint8_t* flags, void* file) { memcpy(log_flags, flags, sizeof(log_flags)); diff --git a/runtime/hsa-runtime/hsacore.so.def b/runtime/hsa-runtime/hsacore.so.def index 7bd67e619..c63d0e7d4 100644 --- a/runtime/hsa-runtime/hsacore.so.def +++ b/runtime/hsa-runtime/hsacore.so.def @@ -179,6 +179,7 @@ global: hsa_amd_async_function; hsa_amd_image_get_info_max_dim; hsa_amd_queue_hw_ctx_config; + hsa_amd_get_handle_from_vaddr; hsa_amd_queue_cu_set_mask; hsa_amd_queue_cu_get_mask; hsa_amd_memory_fill; diff --git a/runtime/hsa-runtime/inc/hsa_api_trace.h 
b/runtime/hsa-runtime/inc/hsa_api_trace.h index 3cadef74e..a5d85e85c 100644 --- a/runtime/hsa-runtime/inc/hsa_api_trace.h +++ b/runtime/hsa-runtime/inc/hsa_api_trace.h @@ -205,6 +205,7 @@ struct AmdExtTable { decltype(hsa_amd_async_function)* hsa_amd_async_function_fn; decltype(hsa_amd_signal_wait_any)* hsa_amd_signal_wait_any_fn; decltype(hsa_amd_queue_hw_ctx_config) *hsa_amd_queue_hw_ctx_config_fn; + decltype(hsa_amd_get_handle_from_vaddr)* hsa_amd_get_handle_from_vaddr_fn; decltype(hsa_amd_queue_cu_set_mask)* hsa_amd_queue_cu_set_mask_fn; decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info_fn; decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools_fn; diff --git a/runtime/hsa-runtime/inc/hsa_ext_amd.h b/runtime/hsa-runtime/inc/hsa_ext_amd.h index 6a4bce698..be7e61b41 100644 --- a/runtime/hsa-runtime/inc/hsa_ext_amd.h +++ b/runtime/hsa-runtime/inc/hsa_ext_amd.h @@ -1375,6 +1375,9 @@ hsa_status_t HSA_API hsa_amd_queue_hw_ctx_config( const hsa_queue_t *queue, hsa_amd_queue_hw_ctx_config_param_t config_type, void *args); + +hsa_status_t HSA_API hsa_amd_get_handle_from_vaddr(void* ptr, uint32_t* handle); + /** * @brief Set a queue's CU affinity mask. * From 6c5b5e59682617de8b40949b04dbb4413eb01d2f Mon Sep 17 00:00:00 2001 From: Yiannis Papadopoulos Date: Thu, 29 Aug 2024 13:29:35 -0400 Subject: [PATCH 10/20] rocr/aie: Fix merge conflict --- runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index 7fbca3d64..303312932 100644 --- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -168,7 +168,7 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region, /// TODO: For now we always map the memory and keep a mapping from handles /// to VA memory addresses. 
Once we can support the separate VMEM call to /// map handles we can fix this. - if (m_region.kernarg()) { + if (region.kernarg()) { mapped_mem = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, get_bo_info_args.map_offset); if (mapped_mem == MAP_FAILED) { From fc4356b36bd3068774469c17b78941aa2dfc8f4c Mon Sep 17 00:00:00 2001 From: Jose Manuel Monsalve Diaz Date: Fri, 9 Aug 2024 14:17:57 -0700 Subject: [PATCH 11/20] rocr/aie: Adding a placeholder UUID for NPU devices --- .../hsa-runtime/core/runtime/amd_aie_agent.cpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp index cd374c65c..6868ef8ec 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp @@ -211,6 +211,24 @@ hsa_status_t AieAgent::GetInfo(hsa_agent_info_t attribute, void *value) const { std::strcpy(reinterpret_cast(value), product_name_info_.c_str()); break; } + case HSA_AMD_AGENT_INFO_UUID: { + // TODO: uuid_value needs to be obtained somehow + uint64_t uuid_value = 0; + + if (uuid_value == 0) { + static const char uuid_tmp[] = "AIE-XX"; + snprintf(static_cast(value), sizeof(uuid_tmp), "%s", uuid_tmp); + break; + } + + // Device supports UUID, build UUID string to return. 
+ constexpr std::size_t max_uuid_length = 36; + static const char uuid_tmp[] = "AIE-"; + snprintf(static_cast(value), max_uuid_length + sizeof(uuid_tmp), + "%s%036lX", uuid_tmp, uuid_value); + break; + } + case HSA_AMD_AGENT_INFO_ASIC_REVISION: *reinterpret_cast(value) = 0; break; From 531e0a2d88d19641279122a5df3edb63e3e587c5 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Sun, 1 Sep 2024 16:30:52 -0500 Subject: [PATCH 12/20] re-enable aie_hsa_dispatch_test --- .github/workflows/ci-linux.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index 161c674b6..d0ff649fd 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -161,8 +161,9 @@ jobs: -DCMAKE_BUILD_TYPE=Release \ "-Dhsa-runtime64_DIR=$hsa_runtime64_ROOT/lib64/cmake/hsa-runtime64" \ -S "$PWD" -B "$build_dir" + cmake --build "$build_dir" --target aie_hsa_dispatch_test - ! cmake --build "$build_dir" --target aie_hsa_dispatch_test + "$build_dir"/aie_hsa_dispatch_test $PWD popd From f01b0e43a9e183b89008e8858bcc431bde2bd7ef Mon Sep 17 00:00:00 2001 From: eddierichter-amd Date: Mon, 2 Sep 2024 18:04:03 -0600 Subject: [PATCH 13/20] Adding soft queue dispatch logic to dispatch commands to AIE agents (#2) --- .../core/driver/xdna/amd_xdna_driver.cpp | 21 +- .../hsa-runtime/core/inc/amd_aie_aql_queue.h | 76 ++++- runtime/hsa-runtime/core/inc/amd_gpu_agent.h | 18 +- .../hsa-runtime/core/inc/amd_xdna_driver.h | 8 +- .../core/runtime/amd_aie_aql_queue.cpp | 301 ++++++++++++++++-- 5 files changed, 387 insertions(+), 37 deletions(-) diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index 303312932..4ba196b5f 100644 --- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -118,7 +118,9 @@ hsa_status_t XdnaDriver::GetAgentProperties(core::Agent &agent) const { 
return HSA_STATUS_ERROR; } - aie_agent.SetNumCols(aie_metadata.cols); + // Right now can only target N-1 columns so putting this + // here as a workaround + aie_agent.SetNumCols(aie_metadata.cols - 1); aie_agent.SetNumCoreRows(aie_metadata.core.row_count); return HSA_STATUS_SUCCESS; @@ -351,6 +353,16 @@ hsa_status_t XdnaDriver::InitDeviceHeap() { return HSA_STATUS_SUCCESS; } +hsa_status_t XdnaDriver::GetHandleMappings(std::unordered_map &vmem_handle_mappings) { + vmem_handle_mappings = this->vmem_handle_mappings; + return HSA_STATUS_SUCCESS; +} + +hsa_status_t XdnaDriver::GetFd(int &fd) { + fd = fd_; + return HSA_STATUS_SUCCESS; +} + hsa_status_t XdnaDriver::FreeDeviceHeap() { if (dev_heap_parent) { munmap(dev_heap_parent, dev_heap_align * 2 - 1); @@ -388,6 +400,13 @@ hsa_status_t XdnaDriver::ConfigHwCtxCU( config_cu_param.cu_configs[i].cu_config_bo; xdna_config_cu_param->cu_configs[i].cu_func = config_cu_param.cu_configs[i].cu_func; + + // sync configuration buffer + amdxdna_drm_sync_bo sync_args = {}; + sync_args.handle = xdna_config_cu_param->cu_configs[i].cu_bo; + if (ioctl(fd_, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_args) < 0) { + return HSA_STATUS_ERROR; + } } amdxdna_drm_config_hwctx config_hw_ctx_args{ diff --git a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h index 70f05e28a..224b85d7c 100644 --- a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h +++ b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h @@ -49,7 +49,35 @@ #include "core/inc/queue.h" #include "core/inc/runtime.h" #include "core/inc/signal.h" -#include "core/util/locks.h" + +/* + * Interpretation of the beginning of data payload for ERT_CMD_CHAIN in + * amdxdna_cmd. The rest of the payload in amdxdna_cmd is cmd BO handles. 
+ */ +struct amdxdna_cmd_chain { + __u32 command_count; + __u32 submit_index; + __u32 error_index; + __u32 reserved[3]; + __u64 data[] __counted_by(command_count); +}; + + +/* Exec buffer command header format */ +struct amdxdna_cmd { + union { + struct { + __u32 state : 4; + __u32 unused : 6; + __u32 extra_cu_masks : 2; + __u32 count : 11; + __u32 opcode : 5; + __u32 reserved : 4; + }; + __u32 header; + }; + __u32 data[] __counted_by(count); +}; namespace rocr { namespace AMD { @@ -71,7 +99,7 @@ class AieAqlQueue : public core::Queue, AieAqlQueue() = delete; AieAqlQueue(AieAgent *agent, size_t req_size_pkts, uint32_t node_id); - ~AieAqlQueue(); + ~AieAqlQueue() override; hsa_status_t Inactivate() override; hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) override; @@ -100,7 +128,7 @@ class AieAqlQueue : public core::Queue, void *value) override; // AIE-specific API - AieAgent &GetAgent() { return agent_; } + AieAgent &GetAgent() const { return agent_; } void SetHwCtxHandle(uint32_t hw_ctx_handle) { hw_ctx_handle_ = hw_ctx_handle; } @@ -119,7 +147,7 @@ class AieAqlQueue : public core::Queue, hsa_fence_scope_t releaseFence = HSA_FENCE_SCOPE_NONE, hsa_signal_t *signal = NULL) override; - uint32_t queue_id_ = INVALID_QUEUEID; + uint64_t queue_id_ = INVALID_QUEUEID; /// @brief ID of AIE device on which this queue has been mapped. uint32_t node_id_ = std::numeric_limits::max(); /// @brief Queue size in bytes. @@ -134,6 +162,44 @@ class AieAqlQueue : public core::Queue, /// @brief Base of the queue's ring buffer storage. 
void *ring_buf_ = nullptr; + static hsa_status_t SubmitCmd( + uint32_t hw_ctx_handle, int fd, void *queue_base, + uint64_t read_dispatch_id, uint64_t write_dispatch_id, + std::unordered_map &vmem_handle_mappings); + + /// @brief Creates a command BO and returns a pointer to the memory and + // the corresponding handle + /// + /// @param size size of memory to allocate + /// @param handle A pointer to the BO handle + /// @param cmd A pointer to the buffer + static hsa_status_t CreateCmd(uint32_t size, uint32_t *handle, + amdxdna_cmd **cmd, int fd); + + /// @brief Adds all BOs in a command packet payload to a vector + /// and replaces the handles with a virtual address + /// + /// @param count Number of entries in the command + /// @param bo_args A pointer to a vector that contains all bo handles + /// @param cmd_pkt_payload A pointer to the payload of the command + static void RegisterCmdBOs( + uint32_t count, std::vector &bo_args, + hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload, + std::unordered_map &vmem_handle_mappings); + + /// @brief Syncs all BOs referenced in bo_args + /// + /// @param bo_args vector containing handles of BOs to sync + static hsa_status_t SyncBos(std::vector &bo_args, int fd); + + /// @brief Executes a command and waits for its completion + /// + /// @param exec_cmd Structure containing the details of the command to execute + /// @param hw_ctx_handle the handle of the hardware context to run this + /// command + static hsa_status_t ExecCmdAndWait(amdxdna_drm_exec_cmd *exec_cmd, + uint32_t hw_ctx_handle, int fd); + /// @brief Handle for an application context on the AIE device. /// /// Each user queue will have an associated context. 
This handle is assigned @@ -153,4 +219,4 @@ class AieAqlQueue : public core::Queue, } // namespace AMD } // namespace rocr -#endif // header guard +#endif // HSA_RUNTIME_CORE_INC_AMD_HW_AQL_AIE_COMMAND_PROCESSOR_H_ diff --git a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h index 608017c11..a7193d18d 100644 --- a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h +++ b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h @@ -234,7 +234,7 @@ class GpuAgent : public GpuAgentInt { GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode, uint32_t index); // @brief GPU agent destructor. - ~GpuAgent(); + ~GpuAgent() override; // @brief Ensure blits are ready (performance hint). void PreloadBlits() override; @@ -507,14 +507,14 @@ class GpuAgent : public GpuAgentInt { hsa_status_t EnableDmaProfiling(bool enable) override; hsa_status_t PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb, - void* cb_data); - hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session); + void *cb_data) override; + hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession &session) override; hsa_status_t PcSamplingCreateFromId(HsaPcSamplingTraceId pcsId, - pcs::PcsRuntime::PcSamplingSession& session); - hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session); - hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session); - hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session); - hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session); + pcs::PcsRuntime::PcSamplingSession &session) override; + hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession &session) override; + hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession &session) override; + hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession &session) override; + hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession &session) 
override; hsa_status_t PcSamplingFlushHostTrapDeviceBuffers(pcs::PcsRuntime::PcSamplingSession& session); static void PcSamplingThreadRun(void* agent); @@ -793,4 +793,4 @@ class GpuAgent : public GpuAgentInt { } // namespace amd } // namespace rocr -#endif // header guard +#endif // HSA_RUNTIME_CORE_INC_AMD_GPU_AGENT_H_ diff --git a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h index 28572e135..79cbaa710 100644 --- a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h +++ b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h @@ -47,6 +47,7 @@ #include "core/inc/driver.h" #include "core/inc/memory_region.h" +#include "core/driver/xdna/uapi/amdxdna_accel.h" namespace rocr { namespace core { @@ -69,6 +70,9 @@ class XdnaDriver : public core::Driver { hsa_status_t Init() override; hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override; + hsa_status_t GetHandleMappings(std::unordered_map &vmem_handle_mappings); + hsa_status_t GetFd(int &fd); + hsa_status_t GetAgentProperties(core::Agent &agent) const override; hsa_status_t GetMemoryProperties(uint32_t node_id, @@ -126,10 +130,6 @@ class XdnaDriver : public core::Driver { void *dev_heap_aligned = nullptr; static constexpr size_t dev_heap_size = 48 * 1024 * 1024; static constexpr size_t dev_heap_align = 64 * 1024 * 1024; - - /// @brief DRM buffer object handle for the device heap. Assigned by the - /// kernel-mode driver. 
- uint32_t dev_heap_handle = 0; }; } // namespace AMD diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp index e8562f226..b2f8fd2d0 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp @@ -41,28 +41,52 @@ //////////////////////////////////////////////////////////////////////////////// #include "core/inc/amd_aie_aql_queue.h" +#include "core/inc/amd_xdna_driver.h" #ifdef __linux__ #include #include #include -#include -#include +#include #endif #ifdef _WIN32 #include #endif -#include -#include -#include +#include #include "core/inc/queue.h" #include "core/inc/runtime.h" #include "core/inc/signal.h" #include "core/util/utils.h" +// The number of arguments in the packet payload before we start passing operands +constexpr int NON_OPERAND_COUNT = 6; + +// Used to transform an address into a device address +constexpr int DEV_ADDR_BASE = 0x04000000; +constexpr int DEV_ADDR_OFFSET_MASK = 0x02FFFFFF; + +// BO size allocated for commands +constexpr int CMD_SIZE = 64; + +// This is a temp workaround. For some reason the first command count in a chain +// needs to be a larger than it actually is, assuming there is some other data +// structure at the beginning +// TODO: Look more into this +constexpr int FIRST_CMD_COUNT_SIZE_INCREASE = 5; + +// Index of command payload where the instruction sequence +// address is located +constexpr int CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX = 2; + +// Environment variable to define job submission timeout +constexpr const char *TIMEOUT_ENV_VAR = "ROCR_AIE_TIMEOUT"; +constexpr int DEFAULT_TIMEOUT_VAL = 50; +char *timeout_env_var_ptr = getenv(TIMEOUT_ENV_VAR); +int timeout_val = timeout_env_var_ptr == nullptr ? 
DEFAULT_TIMEOUT_VAL : atoi(timeout_env_var_ptr); + namespace rocr { namespace AMD { @@ -106,7 +130,7 @@ AieAqlQueue::AieAqlQueue(AieAgent *agent, size_t req_size_pkts, .CreateQueue(*this); } -AieAqlQueue::~AieAqlQueue() { Inactivate(); } +AieAqlQueue::~AieAqlQueue() { AieAqlQueue::Inactivate(); } hsa_status_t AieAqlQueue::Inactivate() { bool active(active_.exchange(false, std::memory_order_relaxed)); @@ -195,8 +219,249 @@ uint64_t AieAqlQueue::AddWriteIndexAcqRel(uint64_t value) { } void AieAqlQueue::StoreRelaxed(hsa_signal_value_t value) { - atomic::Store(signal_.hardware_doorbell_ptr, uint64_t(value), - std::memory_order_release); + std::unordered_map vmem_handle_mappings; + if (reinterpret_cast( + core::Runtime::runtime_singleton_->AgentDriver(agent_.driver_type)) + .GetHandleMappings(vmem_handle_mappings) != HSA_STATUS_SUCCESS) { + return; + } + + int fd = 0; + if (reinterpret_cast( + core::Runtime::runtime_singleton_->AgentDriver(agent_.driver_type)) + .GetFd(fd) != HSA_STATUS_SUCCESS) { + return; + } + + SubmitCmd(hw_ctx_handle_, fd, amd_queue_.hsa_queue.base_address, + amd_queue_.read_dispatch_id, amd_queue_.write_dispatch_id, + vmem_handle_mappings); +} + +hsa_status_t AieAqlQueue::SyncBos(std::vector &bo_args, int fd) { + for (unsigned int bo_arg : bo_args) { + amdxdna_drm_sync_bo sync_params = {}; + sync_params.handle = bo_arg; + if (ioctl(fd, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_params)) + return HSA_STATUS_ERROR; + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t AieAqlQueue::ExecCmdAndWait(amdxdna_drm_exec_cmd *exec_cmd, + uint32_t hw_ctx_handle, int fd) { + // Submit the cmd + if (ioctl(fd, DRM_IOCTL_AMDXDNA_EXEC_CMD, exec_cmd)) + return HSA_STATUS_ERROR; + + // Waiting for command to finish + amdxdna_drm_wait_cmd wait_cmd = {}; + wait_cmd.hwctx = hw_ctx_handle; + wait_cmd.timeout = timeout_val; + wait_cmd.seq = exec_cmd->seq; + + if (ioctl(fd, DRM_IOCTL_AMDXDNA_WAIT_CMD, &wait_cmd)) + return HSA_STATUS_ERROR; + + return HSA_STATUS_SUCCESS; +} + +void 
AieAqlQueue::RegisterCmdBOs( + uint32_t count, std::vector &bo_args, + hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload, + std::unordered_map &vmem_handle_mappings) { + // This is the index where the operand addresses start in a command + const int operand_starting_index = 5; + + // Counting the number of operands in the command payload. + // Operands are 64-bits so we need to divide by two + uint32_t num_operands = (count - NON_OPERAND_COUNT) / 2; + + // Keep track of the handles before we submit the packet + bo_args.push_back(cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX]); + + // Going through all of the operands in the command, keeping track of the + // handles and turning the handles into addresses. The starting index of + // the operands in a command is `operand_starting_index` and the fields + // are 32-bits we need to iterate over every two + for (int operand_iter = 0; operand_iter < num_operands; operand_iter++) { + bo_args.push_back( + cmd_pkt_payload->data[operand_starting_index + 2 * operand_iter]); + // clang-format off + cmd_pkt_payload->data[operand_starting_index + 2 * operand_iter + 1] = + (uint64_t)vmem_handle_mappings[cmd_pkt_payload->data[operand_starting_index + 2 * operand_iter]] >> 32 & 0xFFFFFFFF; + cmd_pkt_payload->data[operand_starting_index + 2 * operand_iter] = + (uint64_t)vmem_handle_mappings[cmd_pkt_payload->data[operand_starting_index + 2 * operand_iter]] & 0xFFFFFFFF; + // clang-format on + } + + // Transform the instruction sequence address into device address + cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX] = + DEV_ADDR_BASE | + (reinterpret_cast( + vmem_handle_mappings + [cmd_pkt_payload + ->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX]]) & + DEV_ADDR_OFFSET_MASK); +} + +hsa_status_t AieAqlQueue::CreateCmd(uint32_t size, uint32_t *handle, + amdxdna_cmd **cmd, int fd) { + // Creating the command + amdxdna_drm_create_bo create_cmd_bo = {}; + create_cmd_bo.type = AMDXDNA_BO_CMD, + 
create_cmd_bo.size = CMD_SIZE; + if (ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_bo)) + return HSA_STATUS_ERROR; + + amdxdna_drm_get_bo_info cmd_bo_get_bo_info = {}; + cmd_bo_get_bo_info.handle = create_cmd_bo.handle; + if (ioctl(fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_bo_get_bo_info)) + return HSA_STATUS_ERROR; + + *cmd = static_cast(mmap(nullptr, create_cmd_bo.size, + PROT_READ | PROT_WRITE, MAP_SHARED, fd, + cmd_bo_get_bo_info.map_offset)); + *handle = create_cmd_bo.handle; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t AieAqlQueue::SubmitCmd( + uint32_t hw_ctx_handle, int fd, void *queue_base, uint64_t read_dispatch_id, + uint64_t write_dispatch_id, + std::unordered_map &vmem_handle_mappings) { + uint64_t cur_id = read_dispatch_id; + while (cur_id < write_dispatch_id) { + hsa_amd_aie_ert_packet_t *pkt = + static_cast(queue_base) + cur_id; + + // Get the packet header information + if (pkt->header.header != HSA_PACKET_TYPE_VENDOR_SPECIFIC || + pkt->header.AmdFormat != HSA_AMD_PACKET_TYPE_AIE_ERT) + return HSA_STATUS_ERROR; + + // Get the payload information + switch (pkt->opcode) { + case HSA_AMD_AIE_ERT_START_CU: { + + std::vector bo_args; + std::vector cmd_handles; + + // Iterating over future packets and seeing how many contiguous HSA_AMD_AIE_ERT_START_CU + // packets there are. All can be combined into a single chain. 
+ int num_cont_start_cu_pkts = 1; + for (int peak_pkt_id = cur_id + 1; peak_pkt_id < write_dispatch_id; peak_pkt_id++) { + if (pkt->opcode != HSA_AMD_AIE_ERT_START_CU) { + break; + } + num_cont_start_cu_pkts++; + } + + // Iterating over all the contiguous HSA_AMD_AIE_ERT_CMD_CHAIN packets + for (int pkt_iter = cur_id; pkt_iter < cur_id + num_cont_start_cu_pkts; pkt_iter++) { + + // Getting the current command packet + hsa_amd_aie_ert_packet_t *pkt = + static_cast(queue_base) + pkt_iter; + hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload = + reinterpret_cast( + pkt->payload_data); + + // Add the handles for all of the BOs to bo_args as well as rewrite + // the command payload handles to contain the actual virtual addresses + RegisterCmdBOs(pkt->count, bo_args, cmd_pkt_payload, vmem_handle_mappings); + + // Creating a packet that contains the command to execute the kernel + uint32_t cmd_bo_handle = 0; + amdxdna_cmd *cmd = nullptr; + if (CreateCmd(64, &cmd_bo_handle, &cmd, fd)) + return HSA_STATUS_ERROR; + + // Filling in the fields of the command + cmd->state = pkt->state; + cmd->extra_cu_masks = 0; + + // For some reason the first count needs to be a little larger than + // it actually is, assuming there is some other data structure at the + // beginning + // TODO: Look more into this + if (pkt_iter == cur_id) { + cmd->count = pkt->count + FIRST_CMD_COUNT_SIZE_INCREASE; + } + else { + cmd->count = pkt->count; + } + cmd->opcode = pkt->opcode; + cmd->data[0] = cmd_pkt_payload->cu_mask; + memcpy((cmd->data + 1), cmd_pkt_payload->data, 4 * pkt->count); + + // Keeping track of the handle + cmd_handles.push_back(cmd_bo_handle); + } + + // Creating a packet that contains the command chain + uint32_t cmd_chain_bo_handle = 0; + amdxdna_cmd *cmd_chain = nullptr; + int cmd_chain_size = (cmd_handles.size() + 1) * sizeof(uint32_t); + if (CreateCmd(cmd_chain_size, &cmd_chain_bo_handle, &cmd_chain, fd)) + return HSA_STATUS_ERROR; + + // Writing information to the command 
buffer + amdxdna_cmd_chain *cmd_chain_payload = reinterpret_cast(cmd_chain->data); + + // Creating a command chain + cmd_chain->state = HSA_AMD_AIE_ERT_STATE_NEW; + cmd_chain->extra_cu_masks = 0; + // TODO: Figure out why this is the value + cmd_chain->count = 0xA; + cmd_chain->opcode = HSA_AMD_AIE_ERT_CMD_CHAIN; + cmd_chain_payload->command_count = cmd_handles.size(); + cmd_chain_payload->submit_index = 0; + cmd_chain_payload->error_index = 0; + for (int i = 0; i < cmd_handles.size(); i++) { + cmd_chain_payload->data[i] = cmd_handles[i]; + } + + // Syncing BOs before we execute the command + if (SyncBos(bo_args, fd)) + return HSA_STATUS_ERROR; + + // Removing duplicates in the bo container. The driver will report + // an error if we provide the same BO handle multiple times. + // This can happen if any of the BOs are the same across jobs + std::sort(bo_args.begin(), bo_args.end()); + bo_args.erase(std::unique(bo_args.begin(), bo_args.end()), bo_args.end()); + + // Filling in the fields to execute the command chain + amdxdna_drm_exec_cmd exec_cmd_0 = {}; + exec_cmd_0.ext = 0; + exec_cmd_0.ext_flags = 0; + exec_cmd_0.hwctx = hw_ctx_handle; + exec_cmd_0.type = AMDXDNA_CMD_SUBMIT_EXEC_BUF; + exec_cmd_0.cmd_handles = cmd_chain_bo_handle; + exec_cmd_0.args = (uint64_t)bo_args.data(); + exec_cmd_0.cmd_count = 1; + exec_cmd_0.arg_count = bo_args.size(); + + // Executing all commands in the command chain + ExecCmdAndWait(&exec_cmd_0, hw_ctx_handle, fd); + + // Syncing BOs after we execute the command + if (SyncBos(bo_args, fd)) + return HSA_STATUS_ERROR; + + cur_id += num_cont_start_cu_pkts; + break; + } + default: { + return HSA_STATUS_ERROR; + } + } + } + + return HSA_STATUS_SUCCESS; } void AieAqlQueue::StoreRelease(hsa_signal_value_t value) { @@ -207,16 +472,16 @@ void AieAqlQueue::StoreRelease(hsa_signal_value_t value) { hsa_status_t AieAqlQueue::GetInfo(hsa_queue_info_attribute_t attribute, void *value) { switch (attribute) { - case HSA_AMD_QUEUE_INFO_AGENT: - 
*(reinterpret_cast(value)) = agent_.public_handle(); - break; - case HSA_AMD_QUEUE_INFO_DOORBELL_ID: - // Hardware doorbell supports AQL semantics. - *(reinterpret_cast(value)) = - reinterpret_cast(signal_.hardware_doorbell_ptr); - break; - default: - return HSA_STATUS_ERROR_INVALID_ARGUMENT; + case HSA_AMD_QUEUE_INFO_AGENT: + *static_cast(value) = agent_.public_handle(); + break; + case HSA_AMD_QUEUE_INFO_DOORBELL_ID: + // Hardware doorbell supports AQL semantics. + *static_cast(value) = + reinterpret_cast(signal_.hardware_doorbell_ptr); + break; + default: + return HSA_STATUS_ERROR_INVALID_ARGUMENT; } return HSA_STATUS_SUCCESS; } From 5089203416c1d5ed89b3aaad4c43a1fb6467e866 Mon Sep 17 00:00:00 2001 From: Yiannis Papadopoulos <102817138+ypapadop-amd@users.noreply.github.com> Date: Wed, 4 Sep 2024 09:37:24 -0400 Subject: [PATCH 14/20] Fix narrowing conversion warnings (#17) --- runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index 4ba196b5f..d21fdbc5f 100644 --- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -235,7 +235,7 @@ hsa_status_t XdnaDriver::CreateQueue(core::Queue &queue) const { // TODO: Make this configurable. .max_opc = 0x800, // This field is for the number of core tiles. 
- .num_tiles = aie_agent.GetNumCores(), + .num_tiles = static_cast(aie_agent.GetNumCores()), .mem_size = 0, .umq_doorbell = 0}; @@ -413,7 +413,7 @@ hsa_status_t XdnaDriver::ConfigHwCtxCU( .handle = aie_queue.GetHwCtxHandle(), .param_type = DRM_AMDXDNA_HWCTX_CONFIG_CU, .param_val = reinterpret_cast(xdna_config_cu_param), - .param_val_size = config_cu_param_size}; + .param_val_size = static_cast(config_cu_param_size)}; if (ioctl(fd_, DRM_IOCTL_AMDXDNA_CONFIG_HWCTX, &config_hw_ctx_args) < 0) { return HSA_STATUS_ERROR; From 5e978ceb46ad8dbed746c31eeb78f6d95000659b Mon Sep 17 00:00:00 2001 From: Yiannis Papadopoulos <102817138+ypapadop-amd@users.noreply.github.com> Date: Wed, 4 Sep 2024 09:44:05 -0400 Subject: [PATCH 15/20] Releasing buffer for XDNA hw context params (#18) --- runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index d21fdbc5f..4982bc910 100644 --- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -393,6 +393,11 @@ hsa_status_t XdnaDriver::ConfigHwCtxCU( amdxdna_hwctx_param_config_cu *xdna_config_cu_param = reinterpret_cast( malloc(config_cu_param_size)); + if (xdna_config_cu_param == nullptr) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + MAKE_SCOPE_GUARD([xdna_config_cu_param] { free(xdna_config_cu_param); }); + xdna_config_cu_param->num_cus = config_cu_param.num_cus; for (int i = 0; i < xdna_config_cu_param->num_cus; ++i) { From 63d45b73cc8fa6274b2921757f44973a40c280db Mon Sep 17 00:00:00 2001 From: eddierichter-amd Date: Thu, 5 Sep 2024 10:20:39 -0600 Subject: [PATCH 16/20] Using BOs of type BO_SHMEM instead of BO_CMD for kernarg memory region (#19) * Using BOs of type BO_SHMEM instead of BO_CMD for kernarg memory region * Changing test to use kernarg memory region for operand allocation --- 
rocrtst/suites/aie/aie_hsa_dispatch_test.cc | 4 ++-- runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc index fca55e4a4..e2b91bf44 100644 --- a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc +++ b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc @@ -223,7 +223,7 @@ int main(int argc, char **argv) { num_data_elements * sizeof(std::uint32_t); std::uint32_t *input = {}; - r = hsa_amd_memory_pool_allocate(global_dev_mem_pool, data_buffer_size, 0, + r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0, reinterpret_cast(&input)); assert(r == HSA_STATUS_SUCCESS); std::uint32_t input_handle = {}; @@ -232,7 +232,7 @@ int main(int argc, char **argv) { assert(input_handle != 0); std::uint32_t *output = {}; - r = hsa_amd_memory_pool_allocate(global_dev_mem_pool, data_buffer_size, 0, + r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0, reinterpret_cast(&output)); assert(r == HSA_STATUS_SUCCESS); std::uint32_t output_handle = {}; diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index 4982bc910..12f140e5c 100644 --- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -147,7 +147,7 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region, } if (region.kernarg()) { - create_bo_args.type = AMDXDNA_BO_CMD; + create_bo_args.type = AMDXDNA_BO_SHMEM; } else { create_bo_args.type = AMDXDNA_BO_DEV; } From c8177042ee24e4e0b5ed2ff1677de8b0e2962293 Mon Sep 17 00:00:00 2001 From: Yiannis Papadopoulos <102817138+ypapadop-amd@users.noreply.github.com> Date: Thu, 5 Sep 2024 15:22:08 -0400 Subject: [PATCH 17/20] Adding GetInfo keys for ROCm 6.2 support (#20) --- .../core/runtime/amd_aie_agent.cpp | 38 ++++++++++++++++++- 1 file 
changed, 37 insertions(+), 1 deletion(-) diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp index 6868ef8ec..6340c2bb6 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp @@ -228,7 +228,6 @@ hsa_status_t AieAgent::GetInfo(hsa_agent_info_t attribute, void *value) const { "%s%036lX", uuid_tmp, uuid_value); break; } - case HSA_AMD_AGENT_INFO_ASIC_REVISION: *reinterpret_cast(value) = 0; break; @@ -237,6 +236,43 @@ hsa_status_t AieAgent::GetInfo(hsa_agent_info_t attribute, void *value) const { assert(regions_.size() != 0 && "No device local memory found!"); *reinterpret_cast(value) = true; break; + case HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY: + return core::Runtime::runtime_singleton_->GetSystemInfo( + HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, value); + break; + case HSA_AMD_AGENT_INFO_ASIC_FAMILY_ID: + *static_cast(value) = 0; + break; + case HSA_AMD_AGENT_INFO_UCODE_VERSION: + *static_cast(value) = 0; + break; + case HSA_AMD_AGENT_INFO_SDMA_UCODE_VERSION: + *static_cast(value) = 0; + break; + case HSA_AMD_AGENT_INFO_NUM_SDMA_ENG: + *static_cast(value) = 0; + break; + case HSA_AMD_AGENT_INFO_NUM_SDMA_XGMI_ENG: + *static_cast(value) = 0; + break; + case HSA_AMD_AGENT_INFO_IOMMU_SUPPORT: + *static_cast(value) = HSA_IOMMU_SUPPORT_NONE; + break; + case HSA_AMD_AGENT_INFO_NUM_XCC: + *static_cast(value) = 0; + break; + case HSA_AMD_AGENT_INFO_DRIVER_UID: + *static_cast(value) = 0; + break; + case HSA_AMD_AGENT_INFO_NEAREST_CPU: + static_cast(value)->handle = 0; + break; + case HSA_AMD_AGENT_INFO_MEMORY_PROPERTIES: + memset(value, 0, sizeof(uint8_t) * 8); + break; + case HSA_AMD_AGENT_INFO_AQL_EXTENSIONS: + memset(value, 0, sizeof(uint8_t) * 8); + break; default: *reinterpret_cast(value) = 0; return HSA_STATUS_ERROR_INVALID_ARGUMENT; From 0757f630910084ef5dbc2ec72bcaa44318f0c536 Mon Sep 17 00:00:00 2001 From: eddierichter-amd Date: Thu, 12 Sep 2024 
08:42:04 -0700 Subject: [PATCH 18/20] Fixing command sizing and changing the test to issue the maximum number of packets the queue supports. (#23) * Fixed workarounds in the AIE soft queue regarding the size of the command chain and the individual commands. * Added the functionality to aie_hsa_dispatch_test.cc to query the size of the AIE queue and issue the maximum number of packets it supports. * Some additional small fixes on aie_hsa_dispatch_test.cc test. * Adding links to driver source --- rocrtst/suites/aie/aie_hsa_dispatch_test.cc | 177 ++++++++++-------- .../core/runtime/amd_aie_aql_queue.cpp | 37 ++-- 2 files changed, 113 insertions(+), 101 deletions(-) diff --git a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc index e2b91bf44..5d54d35ac 100644 --- a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc +++ b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc @@ -111,8 +111,8 @@ void load_pdi_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name, bin_file.read(reinterpret_cast(*buf), size); } -void load_dpu_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name, - void **buf) { +void load_instr_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name, + void **buf, uint32_t &num_instr) { std::ifstream bin_file(file_name, std::ios::binary | std::ios::ate | std::ios::in); @@ -129,6 +129,7 @@ void load_dpu_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name, auto r = hsa_amd_memory_pool_allocate(mem_pool, size, 0, buf); assert(r == HSA_STATUS_SUCCESS); std::memcpy(*buf, pdi_vec.data(), pdi_vec.size() * sizeof(uint32_t)); + num_instr = pdi_vec.size(); } } // namespace @@ -144,9 +145,9 @@ int main(int argc, char **argv) { hsa_amd_memory_pool_t global_dev_mem_pool{0}; // System memory pool. Used for allocating kernel argument data. 
hsa_amd_memory_pool_t global_kernarg_mem_pool{0}; - const std::string dpu_inst_file_name(sourcePath / "add_one_insts.txt"); + const std::string instr_inst_file_name(sourcePath / "add_one_insts.txt"); const std::string pdi_file_name(sourcePath / "add_one.pdi"); - uint32_t *dpu_inst_buf(nullptr); + uint32_t *instr_inst_buf(nullptr); uint64_t *pdi_buf(nullptr); assert(aie_agents.empty()); @@ -164,8 +165,6 @@ int main(int argc, char **argv) { // Find the AIE agents in the system. r = hsa_iterate_agents(get_aie_agents, &aie_agents); assert(r == HSA_STATUS_SUCCESS); - // assert(hsa_iterate_agents(get_cpu_agents, &aie_agents) == - // HSA_STATUS_SUCCESS); assert(aie_agents.size() == 1); const auto &aie_agent = aie_agents.front(); @@ -190,14 +189,22 @@ int main(int argc, char **argv) { assert(r == HSA_STATUS_SUCCESS); assert(global_kernarg_mem_pool.handle); + // Getting the maximum size of the queue so we can submit that many consecutive + // packets. + uint32_t aie_max_queue_size; + r = hsa_agent_get_info(aie_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &aie_max_queue_size); + assert(r == HSA_STATUS_SUCCESS); + int num_pkts = aie_max_queue_size; + // Load the DPU and PDI files into a global pool that doesn't support kernel // args (DEV BO). 
- load_dpu_file(global_dev_mem_pool, dpu_inst_file_name, - reinterpret_cast(&dpu_inst_buf)); - uint32_t dpu_handle = 0; - r = hsa_amd_get_handle_from_vaddr(dpu_inst_buf, &dpu_handle); + uint32_t num_instr; + load_instr_file(global_dev_mem_pool, instr_inst_file_name, + reinterpret_cast(&instr_inst_buf), num_instr); + uint32_t instr_handle = 0; + r = hsa_amd_get_handle_from_vaddr(instr_inst_buf, &instr_handle); assert(r == HSA_STATUS_SUCCESS); - assert(dpu_handle != 0); + assert(instr_handle != 0); load_pdi_file(global_dev_mem_pool, pdi_file_name, reinterpret_cast(&pdi_buf)); @@ -222,85 +229,99 @@ int main(int argc, char **argv) { constexpr std::size_t data_buffer_size = num_data_elements * sizeof(std::uint32_t); - std::uint32_t *input = {}; - r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0, - reinterpret_cast(&input)); - assert(r == HSA_STATUS_SUCCESS); - std::uint32_t input_handle = {}; - r = hsa_amd_get_handle_from_vaddr(input, &input_handle); - assert(r == HSA_STATUS_SUCCESS); - assert(input_handle != 0); - - std::uint32_t *output = {}; - r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0, - reinterpret_cast(&output)); - assert(r == HSA_STATUS_SUCCESS); - std::uint32_t output_handle = {}; - r = hsa_amd_get_handle_from_vaddr(output, &output_handle); - assert(r == HSA_STATUS_SUCCESS); - assert(output_handle != 0); + std::vector input(num_pkts); + std::vector output(num_pkts); + std::vector cmd_payloads(num_pkts); + std::vector input_handle(num_pkts); + std::vector output_handle(num_pkts); + + uint64_t wr_idx = 0; + uint64_t packet_id = 0; + + for (int pkt_iter = 0; pkt_iter < num_pkts; pkt_iter++) { + r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0, + reinterpret_cast(&input[pkt_iter])); + assert(r == HSA_STATUS_SUCCESS); + r = hsa_amd_get_handle_from_vaddr(input[pkt_iter], &input_handle[pkt_iter]); + assert(r == HSA_STATUS_SUCCESS); + assert(input_handle[pkt_iter] != 0); + + r = 
hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0, + reinterpret_cast(&output[pkt_iter])); + assert(r == HSA_STATUS_SUCCESS); + r = hsa_amd_get_handle_from_vaddr(output[pkt_iter], &output_handle[pkt_iter]); + assert(r == HSA_STATUS_SUCCESS); + assert(output_handle[pkt_iter] != 0); + + for (std::size_t i = 0; i < num_data_elements; i++) { + *(input[pkt_iter] + i) = i * (pkt_iter + 1); + *(output[pkt_iter] + i) = 0xDEFACE; + } - for (std::size_t i = 0; i < num_data_elements; i++) { - *(input + i) = i; - *(output + i) = 0xDEFACE; + // Getting a slot in the queue + wr_idx = hsa_queue_add_write_index_relaxed(aie_queue, 1); + packet_id = wr_idx % aie_queue->size; + + // Creating a packet to store the command + hsa_amd_aie_ert_packet_t *cmd_pkt = static_cast( + aie_queue->base_address) + packet_id; + assert(r == HSA_STATUS_SUCCESS); + cmd_pkt->state = HSA_AMD_AIE_ERT_STATE_NEW; + cmd_pkt->count = 0xA; // # of arguments to put in command + cmd_pkt->opcode = HSA_AMD_AIE_ERT_START_CU; + cmd_pkt->header.AmdFormat = HSA_AMD_PACKET_TYPE_AIE_ERT; + cmd_pkt->header.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC + << HSA_PACKET_HEADER_TYPE; + + // Creating the payload for the packet + hsa_amd_aie_ert_start_kernel_data_t *cmd_payload = NULL; + assert(r == HSA_STATUS_SUCCESS); + r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0, + reinterpret_cast(&cmd_payload)); + assert(r == HSA_STATUS_SUCCESS); + // Selecting the PDI to use with this command + cmd_payload->cu_mask = 0x1; + // Transaction opcode + cmd_payload->data[0] = 0x3; + cmd_payload->data[1] = 0x0; + cmd_payload->data[2] = instr_handle; + cmd_payload->data[3] = 0x0; + cmd_payload->data[4] = num_instr; + cmd_payload->data[5] = input_handle[pkt_iter]; + cmd_payload->data[6] = 0; + cmd_payload->data[7] = output_handle[pkt_iter]; + cmd_payload->data[8] = 0; + cmd_pkt->payload_data = reinterpret_cast(cmd_payload); + + // Keeping track of payloads so we can free them at the end + 
cmd_payloads[pkt_iter] = cmd_payload; } - ///////////////////////////////////// Creating the cmd packet - // Creating a packet to store the command - hsa_amd_aie_ert_packet_t *cmd_pkt = NULL; - r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0, - reinterpret_cast(&cmd_pkt)); - assert(r == HSA_STATUS_SUCCESS); - cmd_pkt->state = HSA_AMD_AIE_ERT_STATE_NEW; - cmd_pkt->count = 0xA; // # of arguments to put in command - cmd_pkt->opcode = HSA_AMD_AIE_ERT_START_CU; - cmd_pkt->header.AmdFormat = HSA_AMD_PACKET_TYPE_AIE_ERT; - cmd_pkt->header.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC - << HSA_PACKET_HEADER_TYPE; - - // Creating the payload for the packet - hsa_amd_aie_ert_start_kernel_data_t *cmd_payload = NULL; - uint32_t cmd_handle; - r = hsa_amd_get_handle_from_vaddr(reinterpret_cast(cmd_pkt), - &cmd_handle); - assert(r == HSA_STATUS_SUCCESS); - r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0, - reinterpret_cast(&cmd_payload)); - assert(r == HSA_STATUS_SUCCESS); - cmd_payload->cu_mask = 0x1; // Selecting the PDI to use with this command - cmd_payload->data[0] = 0x3; // Transaction opcode - cmd_payload->data[1] = 0x0; - cmd_payload->data[2] = dpu_handle; - cmd_payload->data[3] = 0x0; - cmd_payload->data[4] = 0x44; // Size of DPU instruction - cmd_payload->data[5] = input_handle; - cmd_payload->data[6] = 0; - cmd_payload->data[7] = output_handle; - cmd_payload->data[8] = 0; - cmd_pkt->payload_data = reinterpret_cast(cmd_payload); - - uint64_t wr_idx = hsa_queue_add_write_index_relaxed(aie_queue, 1); - uint64_t packet_id = wr_idx % aie_queue->size; - reinterpret_cast( - aie_queue->base_address)[packet_id] = *cmd_pkt; + // Ringing the doorbell to dispatch each packet we added to + // the queue hsa_signal_store_screlease(aie_queue->doorbell_signal, wr_idx); - for (std::size_t i = 0; i < num_data_elements; i++) { - const auto expected = *(input + i) + 1; - const auto result = *(output + i); - assert(result == expected); + for (int pkt_iter = 0; 
pkt_iter < num_pkts; pkt_iter++) { + for (std::size_t i = 0; i < num_data_elements; i++) { + const auto expected = *(input[pkt_iter] + i) + 1; + const auto result = *(output[pkt_iter] + i); + assert(result == expected); + } + + r = hsa_amd_memory_pool_free(output[pkt_iter]); + assert(r == HSA_STATUS_SUCCESS); + r = hsa_amd_memory_pool_free(input[pkt_iter]); + assert(r == HSA_STATUS_SUCCESS); + r = hsa_amd_memory_pool_free(cmd_payloads[pkt_iter]); + assert(r == HSA_STATUS_SUCCESS); } r = hsa_queue_destroy(aie_queue); assert(r == HSA_STATUS_SUCCESS); - r = hsa_amd_memory_pool_free(output); - assert(r == HSA_STATUS_SUCCESS); - r = hsa_amd_memory_pool_free(input); - assert(r == HSA_STATUS_SUCCESS); r = hsa_amd_memory_pool_free(pdi_buf); assert(r == HSA_STATUS_SUCCESS); - r = hsa_amd_memory_pool_free(dpu_inst_buf); + r = hsa_amd_memory_pool_free(instr_inst_buf); assert(r == HSA_STATUS_SUCCESS); r = hsa_shut_down(); diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp index b2f8fd2d0..50229daf4 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp @@ -68,14 +68,13 @@ constexpr int NON_OPERAND_COUNT = 6; constexpr int DEV_ADDR_BASE = 0x04000000; constexpr int DEV_ADDR_OFFSET_MASK = 0x02FFFFFF; -// BO size allocated for commands -constexpr int CMD_SIZE = 64; - -// This is a temp workaround. For some reason the first command count in a chain -// needs to be a larger than it actually is, assuming there is some other data -// structure at the beginning -// TODO: Look more into this -constexpr int FIRST_CMD_COUNT_SIZE_INCREASE = 5; +// The driver places a structure before each command in a command chain. +// Need to increase the size of the command by the size of this structure. 
+// In the following xdna driver source you can see where this is implemented: +// Commit hash: eddd92c0f61592c576a500f16efa24eb23667c23 +// https://github.com/amd/xdna-driver/blob/main/src/driver/amdxdna/aie2_msg_priv.h#L387-L391 +// https://github.com/amd/xdna-driver/blob/main/src/driver/amdxdna/aie2_message.c#L637 +constexpr int CMD_COUNT_SIZE_INCREASE = 3; // Index of command payload where the instruction sequence // address is located constexpr int CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX = 2; @@ -311,7 +310,7 @@ hsa_status_t AieAqlQueue::CreateCmd(uint32_t size, uint32_t *handle, // Creating the command amdxdna_drm_create_bo create_cmd_bo = {}; create_cmd_bo.type = AMDXDNA_BO_CMD, - create_cmd_bo.size = CMD_SIZE; + create_cmd_bo.size = size; if (ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_bo)) return HSA_STATUS_ERROR; @@ -345,7 +344,6 @@ hsa_status_t AieAqlQueue::SubmitCmd( // Get the payload information switch (pkt->opcode) { case HSA_AMD_AIE_ERT_START_CU: { - std::vector bo_args; std::vector cmd_handles; @@ -376,23 +374,17 @@ hsa_status_t AieAqlQueue::SubmitCmd( // Creating a packet that contains the command to execute the kernel uint32_t cmd_bo_handle = 0; amdxdna_cmd *cmd = nullptr; - if (CreateCmd(64, &cmd_bo_handle, &cmd, fd)) + uint32_t cmd_size = sizeof(amdxdna_cmd) + pkt->count * sizeof(uint32_t); + if (CreateCmd(cmd_size, &cmd_bo_handle, &cmd, fd)) return HSA_STATUS_ERROR; // Filling in the fields of the command cmd->state = pkt->state; cmd->extra_cu_masks = 0; - // For some reason the first count needs to be a little larger than - // it actually is, assuming there is some other data structure at the - // beginning - // TODO: Look more into this - if (pkt_iter == cur_id) { - cmd->count = pkt->count + FIRST_CMD_COUNT_SIZE_INCREASE; - } - else { - cmd->count = pkt->count; - } + // The driver places a structure before each command in a command chain. + // Need to increase the size of the command by the size of this structure. 
+ cmd->count = pkt->count + CMD_COUNT_SIZE_INCREASE; cmd->opcode = pkt->opcode; cmd->data[0] = cmd_pkt_payload->cu_mask; memcpy((cmd->data + 1), cmd_pkt_payload->data, 4 * pkt->count); @@ -414,8 +406,7 @@ hsa_status_t AieAqlQueue::SubmitCmd( // Creating a command chain cmd_chain->state = HSA_AMD_AIE_ERT_STATE_NEW; cmd_chain->extra_cu_masks = 0; - // TODO: Figure out why this is the value - cmd_chain->count = 0xA; + cmd_chain->count = sizeof(amdxdna_cmd_chain) + cmd_handles.size() * sizeof(uint64_t); cmd_chain->opcode = HSA_AMD_AIE_ERT_CMD_CHAIN; cmd_chain_payload->command_count = cmd_handles.size(); cmd_chain_payload->submit_index = 0; From 793c0382f5f17fd92f404747bcb7b64367f47953 Mon Sep 17 00:00:00 2001 From: Yiannis Papadopoulos <102817138+ypapadop-amd@users.noreply.github.com> Date: Fri, 13 Sep 2024 09:03:51 -0400 Subject: [PATCH 19/20] Avoid incorrect casting (#24) --- .../core/runtime/amd_aie_aql_queue.cpp | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp index 50229daf4..6f796441a 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp @@ -76,8 +76,8 @@ constexpr int DEV_ADDR_OFFSET_MASK = 0x02FFFFFF; // https://github.com/amd/xdna-driver/blob/main/src/driver/amdxdna/aie2_message.c#L637 constexpr int CMD_COUNT_SIZE_INCREASE = 3; -// Index of command payload where the instruction sequence -// address is located +// Index of command payload where the instruction sequence +// address is located constexpr int CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX = 2; // Environment variable to define job submission timeout @@ -219,16 +219,15 @@ uint64_t AieAqlQueue::AddWriteIndexAcqRel(uint64_t value) { void AieAqlQueue::StoreRelaxed(hsa_signal_value_t value) { std::unordered_map vmem_handle_mappings; - if (reinterpret_cast( - 
core::Runtime::runtime_singleton_->AgentDriver(agent_.driver_type)) - .GetHandleMappings(vmem_handle_mappings) != HSA_STATUS_SUCCESS) { + + auto &driver = static_cast( + core::Runtime::runtime_singleton_->AgentDriver(agent_.driver_type)); + if (driver.GetHandleMappings(vmem_handle_mappings) != HSA_STATUS_SUCCESS) { return; } int fd = 0; - if (reinterpret_cast( - core::Runtime::runtime_singleton_->AgentDriver(agent_.driver_type)) - .GetFd(fd) != HSA_STATUS_SUCCESS) { + if (driver.GetFd(fd) != HSA_STATUS_SUCCESS) { return; } @@ -257,7 +256,7 @@ hsa_status_t AieAqlQueue::ExecCmdAndWait(amdxdna_drm_exec_cmd *exec_cmd, // Waiting for command to finish amdxdna_drm_wait_cmd wait_cmd = {}; wait_cmd.hwctx = hw_ctx_handle; - wait_cmd.timeout = timeout_val; + wait_cmd.timeout = timeout_val; wait_cmd.seq = exec_cmd->seq; if (ioctl(fd, DRM_IOCTL_AMDXDNA_WAIT_CMD, &wait_cmd)) @@ -278,7 +277,8 @@ void AieAqlQueue::RegisterCmdBOs( uint32_t num_operands = (count - NON_OPERAND_COUNT) / 2; // Keep track of the handles before we submit the packet - bo_args.push_back(cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX]); + bo_args.push_back( + cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX]); // Going through all of the operands in the command, keeping track of the // handles and turning the handles into addresses. 
The starting index of From c046b7059660a43bb218c169e2277eb00c5127a7 Mon Sep 17 00:00:00 2001 From: eddierichter-amd Date: Sat, 21 Sep 2024 13:41:07 -0600 Subject: [PATCH 20/20] Freeing the commands and the command chain created during dispatch (#29) * Freeing the commands and the command chain created during dispatch * Unmapping and closing cmd BOs as well as freeing the queue ring buffer --- runtime/hsa-runtime/core/inc/amd_aie_agent.h | 6 +++++ .../core/runtime/amd_aie_agent.cpp | 2 ++ .../core/runtime/amd_aie_aql_queue.cpp | 24 ++++++++++++++++++- 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/runtime/hsa-runtime/core/inc/amd_aie_agent.h b/runtime/hsa-runtime/core/inc/amd_aie_agent.h index 0925a206b..d99b71ed7 100644 --- a/runtime/hsa-runtime/core/inc/amd_aie_agent.h +++ b/runtime/hsa-runtime/core/inc/amd_aie_agent.h @@ -93,6 +93,9 @@ class AieAgent : public core::Agent { return system_allocator_; } + /// @brief Getter for the AIE system deallocator. + const std::function& system_deallocator() const { return system_deallocator_; } + // AIE agent methods. /// @brief Get the number of columns on this AIE agent. int GetNumCols() const { return num_cols_; } @@ -117,6 +120,9 @@ class AieAgent : public core::Agent { core::MemoryRegion::AllocateFlags flags)> system_allocator_; + + std::function system_deallocator_; + const hsa_profile_t profile_ = HSA_PROFILE_BASE; const uint32_t min_aql_size_ = 0x40; const uint32_t max_aql_size_ = 0x40; diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp index 6340c2bb6..4bce61323 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp @@ -345,6 +345,8 @@ void AieAgent::InitAllocators() { ? 
mem : nullptr; }; + + system_deallocator_ = [](void* ptr) { core::Runtime::runtime_singleton_->FreeMemory(ptr); }; break; } } diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp index 6f796441a..283b5af60 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp @@ -129,7 +129,12 @@ AieAqlQueue::AieAqlQueue(AieAgent *agent, size_t req_size_pkts, .CreateQueue(*this); } -AieAqlQueue::~AieAqlQueue() { AieAqlQueue::Inactivate(); } +AieAqlQueue::~AieAqlQueue() { + AieAqlQueue::Inactivate(); + if (ring_buf_) { + agent_.system_deallocator()(ring_buf_); + } +} hsa_status_t AieAqlQueue::Inactivate() { bool active(active_.exchange(false, std::memory_order_relaxed)); @@ -346,6 +351,8 @@ hsa_status_t AieAqlQueue::SubmitCmd( case HSA_AMD_AIE_ERT_START_CU: { std::vector bo_args; std::vector cmd_handles; + std::vector cmd_sizes; + std::vector cmds; // Iterating over future packets and seeing how many contiguous HSA_AMD_AIE_ERT_START_CU // packets there are. All can be combined into a single chain. 
@@ -391,6 +398,8 @@ hsa_status_t AieAqlQueue::SubmitCmd( // Keeping track of the handle cmd_handles.push_back(cmd_bo_handle); + cmds.push_back(cmd); + cmd_sizes.push_back(cmd_size); } // Creating a packet that contains the command chain @@ -439,6 +448,19 @@ hsa_status_t AieAqlQueue::SubmitCmd( // Executing all commands in the command chain ExecCmdAndWait(&exec_cmd_0, hw_ctx_handle, fd); + // Unmapping and closing the cmd BOs + drm_gem_close close_bo_args{0}; + for (int i = 0; i < cmd_handles.size(); i++) { + munmap(cmds[i], cmd_sizes[i]); + close_bo_args.handle = cmd_handles[i]; + ioctl(fd, DRM_IOCTL_GEM_CLOSE, &close_bo_args); + } + + // Unmapping and closing the cmd_chain BO + munmap(cmd_chain, cmd_chain_size); + close_bo_args.handle = cmd_chain_bo_handle; + ioctl(fd, DRM_IOCTL_GEM_CLOSE, &close_bo_args); + // Syncing BOs after we execute the command if (SyncBos(bo_args, fd)) return HSA_STATUS_ERROR;