From 180a8bb68a85d68604c80220af074b21b6c0c86b Mon Sep 17 00:00:00 2001
From: eddierichter-amd <eddie.richter@amd.com>
Date: Mon, 2 Sep 2024 18:04:03 -0600
Subject: [PATCH] Adding soft queue dispatch logic to dispatch commands to AIE
 agents (#2)

---
 .../core/driver/xdna/amd_xdna_driver.cpp      |  21 +-
 .../hsa-runtime/core/inc/amd_aie_aql_queue.h  |  76 ++++-
 runtime/hsa-runtime/core/inc/amd_gpu_agent.h  |  18 +-
 .../hsa-runtime/core/inc/amd_xdna_driver.h    |   8 +-
 .../core/runtime/amd_aie_aql_queue.cpp        | 301 ++++++++++++++++--
 5 files changed, 387 insertions(+), 37 deletions(-)

diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp
index 303312932..4ba196b5f 100644
--- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp
+++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp
@@ -118,7 +118,9 @@ hsa_status_t XdnaDriver::GetAgentProperties(core::Agent &agent) const {
     return HSA_STATUS_ERROR;
   }
 
-  aie_agent.SetNumCols(aie_metadata.cols);
+  // Only N-1 columns can be targeted right now, so one fewer column than
+  // the device reports is advertised as a workaround
+  aie_agent.SetNumCols(aie_metadata.cols - 1);
   aie_agent.SetNumCoreRows(aie_metadata.core.row_count);
 
   return HSA_STATUS_SUCCESS;
@@ -351,6 +353,16 @@ hsa_status_t XdnaDriver::InitDeviceHeap() {
   return HSA_STATUS_SUCCESS;
 }
 
+hsa_status_t XdnaDriver::GetHandleMappings(std::unordered_map<uint32_t, void*> &vmem_handle_mappings) {
+  vmem_handle_mappings = this->vmem_handle_mappings;  // copy-assigns the member map into the caller's map
+  return HSA_STATUS_SUCCESS;  // the only value this function returns
+}
+
+hsa_status_t XdnaDriver::GetFd(int &fd) {
+  fd = fd_;  // hands back the fd_ member (also used for this driver's ioctl calls)
+  return HSA_STATUS_SUCCESS;  // the only value this function returns
+}
+
 hsa_status_t XdnaDriver::FreeDeviceHeap() {
   if (dev_heap_parent) {
     munmap(dev_heap_parent, dev_heap_align * 2 - 1);
@@ -388,6 +400,13 @@ hsa_status_t XdnaDriver::ConfigHwCtxCU(
         config_cu_param.cu_configs[i].cu_config_bo;
     xdna_config_cu_param->cu_configs[i].cu_func =
         config_cu_param.cu_configs[i].cu_func;
+
+    // sync configuration buffer
+    amdxdna_drm_sync_bo sync_args = {};
+    sync_args.handle = xdna_config_cu_param->cu_configs[i].cu_bo;
+    if (ioctl(fd_, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_args) < 0) {
+      return HSA_STATUS_ERROR;
+    }
   }
 
   amdxdna_drm_config_hwctx config_hw_ctx_args{
diff --git a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h
index 70f05e28a..224b85d7c 100644
--- a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h
+++ b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h
@@ -49,7 +49,35 @@
 #include "core/inc/queue.h"
 #include "core/inc/runtime.h"
 #include "core/inc/signal.h"
-#include "core/util/locks.h"
+
+/*
+ * Interpretation of the beginning of data payload for ERT_CMD_CHAIN in
+ * amdxdna_cmd. The rest of the payload in amdxdna_cmd is cmd BO handles.
+ */
+struct amdxdna_cmd_chain {
+  __u32 command_count;  // declared element count of data[]
+  __u32 submit_index;
+  __u32 error_index;
+  __u32 reserved[3];
+  __u64 data[] __counted_by(command_count);  // one cmd BO handle per entry
+};
+
+
+/* Exec buffer command header format */
+struct amdxdna_cmd {
+  union {
+    struct {
+      __u32 state : 4;
+      __u32 unused : 6;
+      __u32 extra_cu_masks : 2;
+      __u32 count : 11;  // declared element count of data[]
+      __u32 opcode : 5;
+      __u32 reserved : 4;
+    };
+    __u32 header;  // the six bit-fields above (32 bits in total) viewed as one word
+  };
+  __u32 data[]  __counted_by(count);
+};
 
 namespace rocr {
 namespace AMD {
@@ -71,7 +99,7 @@ class AieAqlQueue : public core::Queue,
 
   AieAqlQueue() = delete;
   AieAqlQueue(AieAgent *agent, size_t req_size_pkts, uint32_t node_id);
-  ~AieAqlQueue();
+  ~AieAqlQueue() override;
 
   hsa_status_t Inactivate() override;
   hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) override;
@@ -100,7 +128,7 @@ class AieAqlQueue : public core::Queue,
                        void *value) override;
 
   // AIE-specific API
-  AieAgent &GetAgent() { return agent_; }
+  AieAgent &GetAgent() const { return agent_; }
   void SetHwCtxHandle(uint32_t hw_ctx_handle) {
     hw_ctx_handle_ = hw_ctx_handle;
   }
@@ -119,7 +147,7 @@ class AieAqlQueue : public core::Queue,
                   hsa_fence_scope_t releaseFence = HSA_FENCE_SCOPE_NONE,
                   hsa_signal_t *signal = NULL) override;
 
-  uint32_t queue_id_ = INVALID_QUEUEID;
+  uint64_t queue_id_ = INVALID_QUEUEID;
   /// @brief ID of AIE device on which this queue has been mapped.
   uint32_t node_id_ = std::numeric_limits<uint32_t>::max();
   /// @brief Queue size in bytes.
@@ -134,6 +162,44 @@ class AieAqlQueue : public core::Queue,
   /// @brief Base of the queue's ring buffer storage.
   void *ring_buf_ = nullptr;
 
+  static hsa_status_t SubmitCmd(
+      uint32_t hw_ctx_handle, int fd, void *queue_base,
+      uint64_t read_dispatch_id, uint64_t write_dispatch_id,
+      std::unordered_map<uint32_t, void *> &vmem_handle_mappings);
+
+  /// @brief Creates a command BO and returns a pointer to the memory and
+  ///        the corresponding handle
+  ///
+  /// @param size size of memory to allocate
+  /// @param handle A pointer to the BO handle
+  /// @param cmd A pointer to the buffer
+  static hsa_status_t CreateCmd(uint32_t size, uint32_t *handle,
+                                amdxdna_cmd **cmd, int fd);
+
+  /// @brief Adds all BOs in a command packet payload to a vector
+  ///         and replaces the handles with a virtual address
+  ///
+  /// @param count Number of entries in the command
+  /// @param bo_args A pointer to a vector that contains all bo handles
+  /// @param cmd_pkt_payload A pointer to the payload of the command
+  static void RegisterCmdBOs(
+      uint32_t count, std::vector<uint32_t> &bo_args,
+      hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload,
+      std::unordered_map<uint32_t, void *> &vmem_handle_mappings);
+
+  /// @brief Syncs all BOs referenced in bo_args
+  ///
+  /// @param bo_args vector containing handles of BOs to sync
+  static hsa_status_t SyncBos(std::vector<uint32_t> &bo_args, int fd);
+
+  /// @brief Executes a command and waits for its completion
+  ///
+  /// @param exec_cmd Structure containing the details of the command to execute
+  /// @param hw_ctx_handle the handle of the hardware context to run this
+  /// command
+  static hsa_status_t ExecCmdAndWait(amdxdna_drm_exec_cmd *exec_cmd,
+                                     uint32_t hw_ctx_handle, int fd);
+
   /// @brief Handle for an application context on the AIE device.
   ///
   /// Each user queue will have an associated context. This handle is assigned
@@ -153,4 +219,4 @@ class AieAqlQueue : public core::Queue,
 } // namespace AMD
 } // namespace rocr
 
-#endif // header guard
+#endif // HSA_RUNTIME_CORE_INC_AMD_HW_AQL_AIE_COMMAND_PROCESSOR_H_
diff --git a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
index 608017c11..a7193d18d 100644
--- a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
+++ b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
@@ -234,7 +234,7 @@ class GpuAgent : public GpuAgentInt {
   GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode, uint32_t index);
 
   // @brief GPU agent destructor.
-  ~GpuAgent();
+  ~GpuAgent() override;
 
   // @brief Ensure blits are ready (performance hint).
   void PreloadBlits() override;
@@ -507,14 +507,14 @@ class GpuAgent : public GpuAgentInt {
   hsa_status_t EnableDmaProfiling(bool enable) override;
 
   hsa_status_t PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb,
-                                       void* cb_data);
-  hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session);
+                          void *cb_data) override;
+  hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession &session) override;
   hsa_status_t PcSamplingCreateFromId(HsaPcSamplingTraceId pcsId,
-                                      pcs::PcsRuntime::PcSamplingSession& session);
-  hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session);
-  hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session);
-  hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session);
-  hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session);
+                         pcs::PcsRuntime::PcSamplingSession &session) override;
+  hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession &session) override;
+  hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession &session) override;
+  hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession &session) override;
+  hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession &session) override;
   hsa_status_t PcSamplingFlushHostTrapDeviceBuffers(pcs::PcsRuntime::PcSamplingSession& session);
 
   static void PcSamplingThreadRun(void* agent);
@@ -793,4 +793,4 @@ class GpuAgent : public GpuAgentInt {
 }  // namespace amd
 }  // namespace rocr
 
-#endif  // header guard
+#endif  // HSA_RUNTIME_CORE_INC_AMD_GPU_AGENT_H_
diff --git a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h
index 28572e135..79cbaa710 100644
--- a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h
+++ b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h
@@ -47,6 +47,7 @@
 
 #include "core/inc/driver.h"
 #include "core/inc/memory_region.h"
+#include "core/driver/xdna/uapi/amdxdna_accel.h"
 
 namespace rocr {
 namespace core {
@@ -69,6 +70,9 @@ class XdnaDriver : public core::Driver {
   hsa_status_t Init() override;
   hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override;
 
+  hsa_status_t GetHandleMappings(std::unordered_map<uint32_t, void*> &vmem_handle_mappings);
+  hsa_status_t GetFd(int &fd);
+
   hsa_status_t GetAgentProperties(core::Agent &agent) const override;
   hsa_status_t
   GetMemoryProperties(uint32_t node_id,
@@ -126,10 +130,6 @@ class XdnaDriver : public core::Driver {
   void *dev_heap_aligned = nullptr;
   static constexpr size_t dev_heap_size = 48 * 1024 * 1024;
   static constexpr size_t dev_heap_align = 64 * 1024 * 1024;
-
-  /// @brief DRM buffer object handle for the device heap. Assigned by the
-  ///        kernel-mode driver.
-  uint32_t dev_heap_handle = 0;
 };
 
 } // namespace AMD
diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp
index e8562f226..b2f8fd2d0 100644
--- a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp
@@ -41,28 +41,52 @@
 ////////////////////////////////////////////////////////////////////////////////
 
 #include "core/inc/amd_aie_aql_queue.h"
+#include "core/inc/amd_xdna_driver.h"
 
 #ifdef __linux__
 #include <fcntl.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
-#include <sys/syscall.h>
-#include <unistd.h>
+#include <sys/ioctl.h>
 #endif
 
 #ifdef _WIN32
 #include <Windows.h>
 #endif
 
-#include <stdio.h>
-#include <string.h>
-#include <thread>
+#include <cstring>
 
 #include "core/inc/queue.h"
 #include "core/inc/runtime.h"
 #include "core/inc/signal.h"
 #include "core/util/utils.h"
 
+// The number of arguments in the packet payload before we start passing operands
+constexpr int NON_OPERAND_COUNT = 6;
+
+// Used to transform an address into a device address
+constexpr int DEV_ADDR_BASE = 0x04000000;
+constexpr int DEV_ADDR_OFFSET_MASK = 0x02FFFFFF;  // NOTE(review): bit 24 is clear - confirm vs 0x03FFFFFF
+
+// BO size allocated for commands
+constexpr int CMD_SIZE = 64;
+
+// This is a temporary workaround. For some reason the count of the first
+// command in a chain needs to be larger than it actually is, presumably
+// because some other data structure sits at the beginning
+// TODO: Look more into this
+constexpr int FIRST_CMD_COUNT_SIZE_INCREASE = 5;
+
+// Index in the command payload where the instruction sequence
+// address is located
+constexpr int CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX = 2;
+
+// Job submission timeout; ROCR_AIE_TIMEOUT overrides the default (read once, file-local)
+constexpr const char *TIMEOUT_ENV_VAR = "ROCR_AIE_TIMEOUT";
+constexpr int DEFAULT_TIMEOUT_VAL = 50;
+static const char *const timeout_env_var_ptr = getenv(TIMEOUT_ENV_VAR);
+static const int timeout_val = timeout_env_var_ptr == nullptr ? DEFAULT_TIMEOUT_VAL : atoi(timeout_env_var_ptr);
+
 namespace rocr {
 namespace AMD {
 
@@ -106,7 +130,7 @@ AieAqlQueue::AieAqlQueue(AieAgent *agent, size_t req_size_pkts,
       .CreateQueue(*this);
 }
 
-AieAqlQueue::~AieAqlQueue() { Inactivate(); }
+AieAqlQueue::~AieAqlQueue() { AieAqlQueue::Inactivate(); }  // qualified call: no virtual dispatch from the destructor
 
 hsa_status_t AieAqlQueue::Inactivate() {
   bool active(active_.exchange(false, std::memory_order_relaxed));
@@ -195,8 +219,249 @@ uint64_t AieAqlQueue::AddWriteIndexAcqRel(uint64_t value) {
 }
 
 void AieAqlQueue::StoreRelaxed(hsa_signal_value_t value) {
-  atomic::Store(signal_.hardware_doorbell_ptr, uint64_t(value),
-                std::memory_order_release);
+  // Fetch the driver's handle mappings and device fd, then dispatch the
+  // packets between the queue's read and write dispatch ids.
+  auto &xdna_driver = reinterpret_cast<XdnaDriver &>(
+      core::Runtime::runtime_singleton_->AgentDriver(agent_.driver_type));
+
+  std::unordered_map<uint32_t, void *> handle_map;
+  const bool have_map =
+      xdna_driver.GetHandleMappings(handle_map) == HSA_STATUS_SUCCESS;
+  if (!have_map) return;
+
+  int device_fd = 0;
+  const bool have_fd = xdna_driver.GetFd(device_fd) == HSA_STATUS_SUCCESS;
+  if (!have_fd) return;
+
+  SubmitCmd(hw_ctx_handle_, device_fd, amd_queue_.hsa_queue.base_address,
+            amd_queue_.read_dispatch_id, amd_queue_.write_dispatch_id,
+            handle_map);
+}
+
+hsa_status_t AieAqlQueue::SyncBos(std::vector<uint32_t> &bo_args, int fd) {
+  // Issue one SYNC_BO ioctl per handle and stop at the first failure
+  for (const uint32_t bo_handle : bo_args) {
+    amdxdna_drm_sync_bo sync_request = {};
+    sync_request.handle = bo_handle;
+    const int rc = ioctl(fd, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_request);
+    if (rc != 0) return HSA_STATUS_ERROR;
+  }
+  return HSA_STATUS_SUCCESS;
+}
+
+hsa_status_t AieAqlQueue::ExecCmdAndWait(amdxdna_drm_exec_cmd *exec_cmd,
+                                         uint32_t hw_ctx_handle, int fd) {
+  // Hand the command to the driver; a non-zero ioctl result is a failure
+  const int submit_rc = ioctl(fd, DRM_IOCTL_AMDXDNA_EXEC_CMD, exec_cmd);
+  if (submit_rc != 0) {
+    return HSA_STATUS_ERROR;
+  }
+
+  // Wait on exec_cmd->seq, read only after the submit ioctl has returned
+  amdxdna_drm_wait_cmd wait_request = {};
+  wait_request.seq = exec_cmd->seq;
+  wait_request.timeout = timeout_val;
+  wait_request.hwctx = hw_ctx_handle;
+
+  const int wait_rc = ioctl(fd, DRM_IOCTL_AMDXDNA_WAIT_CMD, &wait_request);
+  return wait_rc != 0 ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS;
+}
+
+void AieAqlQueue::RegisterCmdBOs(
+    uint32_t count, std::vector<uint32_t> &bo_args,
+    hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload,
+    std::unordered_map<uint32_t, void *> &vmem_handle_mappings) {
+  // This is the index where the operand addresses start in a command
+  const uint32_t operand_starting_index = 5;
+
+  // Counting the number of operands in the command payload. Operands are
+  // 64-bits so we divide by two; a count below NON_OPERAND_COUNT means no
+  // operands (unguarded, the unsigned subtraction would wrap around)
+  const uint32_t non_operand_count = NON_OPERAND_COUNT;
+  const uint32_t num_operands =
+      count > non_operand_count ? (count - non_operand_count) / 2 : 0;
+
+  // Keep track of the handles before we submit the packet
+  const uint32_t instr_handle =
+      cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX];
+  bo_args.push_back(instr_handle);
+
+  // Going through all of the operands in the command, keeping track of the
+  // handles and turning the handles into addresses. Each operand spans two
+  // 32-bit fields: the handle's slot gets the low half, the next the high half
+  for (uint32_t operand_iter = 0; operand_iter < num_operands; operand_iter++) {
+    const uint32_t slot = operand_starting_index + 2 * operand_iter;
+    const uint32_t operand_handle = cmd_pkt_payload->data[slot];
+    bo_args.push_back(operand_handle);
+    const auto operand_addr =
+        reinterpret_cast<uint64_t>(vmem_handle_mappings[operand_handle]);
+    cmd_pkt_payload->data[slot + 1] = operand_addr >> 32 & 0xFFFFFFFF;
+    cmd_pkt_payload->data[slot] = operand_addr & 0xFFFFFFFF;
+  }
+
+  // Transform the instruction sequence address into device address
+  cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX] =
+      DEV_ADDR_BASE |
+      (reinterpret_cast<uint64_t>(vmem_handle_mappings[instr_handle]) &
+       DEV_ADDR_OFFSET_MASK);
+}
+
+hsa_status_t AieAqlQueue::CreateCmd(uint32_t size, uint32_t *handle,
+                                    amdxdna_cmd **cmd, int fd) {
+  // Creating the command BO; requests below CMD_SIZE are rounded up to it
+  amdxdna_drm_create_bo create_cmd_bo = {};
+  create_cmd_bo.type = AMDXDNA_BO_CMD;
+  create_cmd_bo.size = size > static_cast<uint32_t>(CMD_SIZE) ? size : CMD_SIZE;
+  if (ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_bo))
+    return HSA_STATUS_ERROR;
+
+  amdxdna_drm_get_bo_info cmd_bo_get_bo_info = {};
+  cmd_bo_get_bo_info.handle = create_cmd_bo.handle;
+  if (ioctl(fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_bo_get_bo_info))
+    return HSA_STATUS_ERROR;
+
+  void *mapped = mmap(nullptr, create_cmd_bo.size, PROT_READ | PROT_WRITE,
+                      MAP_SHARED, fd, cmd_bo_get_bo_info.map_offset);
+  if (mapped == MAP_FAILED) return HSA_STATUS_ERROR;  // mmap signals failure with MAP_FAILED
+  *cmd = static_cast<amdxdna_cmd *>(mapped);
+  *handle = create_cmd_bo.handle;
+  return HSA_STATUS_SUCCESS;
+}
+
+// Walks the packets in [read_dispatch_id, write_dispatch_id), folds each run of
+// contiguous HSA_AMD_AIE_ERT_START_CU packets into one command chain, submits
+// it and waits for completion. Returns HSA_STATUS_ERROR on a packet with an
+// unexpected header or opcode and on any failed BO creation, sync or submit.
+// NOTE(review): dispatch ids index queue_base directly (no ring wrap-around)
+// and the command BOs created here are never released - confirm intended.
+hsa_status_t AieAqlQueue::SubmitCmd(
+    uint32_t hw_ctx_handle, int fd, void *queue_base, uint64_t read_dispatch_id,
+    uint64_t write_dispatch_id,
+    std::unordered_map<uint32_t, void *> &vmem_handle_mappings) {
+  auto *packets = static_cast<hsa_amd_aie_ert_packet_t *>(queue_base);
+  uint64_t cur_id = read_dispatch_id;
+  while (cur_id < write_dispatch_id) {
+    hsa_amd_aie_ert_packet_t *pkt = packets + cur_id;
+
+    // Get the packet header information
+    if (pkt->header.header != HSA_PACKET_TYPE_VENDOR_SPECIFIC ||
+        pkt->header.AmdFormat != HSA_AMD_PACKET_TYPE_AIE_ERT)
+      return HSA_STATUS_ERROR;
+
+    // Get the payload information
+    switch (pkt->opcode) {
+      case HSA_AMD_AIE_ERT_START_CU: {
+        std::vector<uint32_t> bo_args;
+        std::vector<uint32_t> cmd_handles;
+
+        // Peek at the following packets (not the current one) and count how
+        // many contiguous HSA_AMD_AIE_ERT_START_CU packets there are. All of
+        // them can be combined into a single chain.
+        uint64_t num_cont_start_cu_pkts = 1;
+        for (uint64_t peek_pkt_id = cur_id + 1; peek_pkt_id < write_dispatch_id; peek_pkt_id++) {
+          if (packets[peek_pkt_id].opcode != HSA_AMD_AIE_ERT_START_CU) {
+            break;
+          }
+          num_cont_start_cu_pkts++;
+        }
+
+        // Iterating over all the contiguous HSA_AMD_AIE_ERT_START_CU packets
+        for (uint64_t pkt_iter = cur_id; pkt_iter < cur_id + num_cont_start_cu_pkts; pkt_iter++) {
+          // Getting the current command packet
+          hsa_amd_aie_ert_packet_t *cur_pkt = packets + pkt_iter;
+          hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload =
+              reinterpret_cast<hsa_amd_aie_ert_start_kernel_data_t *>(
+                  cur_pkt->payload_data);
+
+          // Add the handles for all of the BOs to bo_args as well as rewrite
+          // the command payload handles to contain the actual virtual addresses
+          RegisterCmdBOs(cur_pkt->count, bo_args, cmd_pkt_payload, vmem_handle_mappings);
+
+          // Creating a packet that contains the command to execute the kernel
+          uint32_t cmd_bo_handle = 0;
+          amdxdna_cmd *cmd = nullptr;
+          if (CreateCmd(CMD_SIZE, &cmd_bo_handle, &cmd, fd))
+            return HSA_STATUS_ERROR;
+
+          // Filling in the fields of the command
+          cmd->state = cur_pkt->state;
+          cmd->extra_cu_masks = 0;
+
+          // TODO: the first command of a chain needs a count larger than the
+          // packet's own; the reason is not understood yet (see
+          // FIRST_CMD_COUNT_SIZE_INCREASE)
+          cmd->count = (pkt_iter == cur_id)
+                           ? cur_pkt->count + FIRST_CMD_COUNT_SIZE_INCREASE
+                           : cur_pkt->count;
+          cmd->opcode = cur_pkt->opcode;
+          cmd->data[0] = cmd_pkt_payload->cu_mask;
+          memcpy((cmd->data + 1), cmd_pkt_payload->data, 4 * cur_pkt->count);
+
+          // Keeping track of the handle
+          cmd_handles.push_back(cmd_bo_handle);
+        }
+
+        // Creating a packet that contains the command chain
+        uint32_t cmd_chain_bo_handle = 0;
+        amdxdna_cmd *cmd_chain = nullptr;
+        const uint32_t cmd_chain_size = sizeof(amdxdna_cmd) + sizeof(amdxdna_cmd_chain) +
+                                        cmd_handles.size() * sizeof(uint64_t);
+        if (CreateCmd(cmd_chain_size, &cmd_chain_bo_handle, &cmd_chain, fd))
+          return HSA_STATUS_ERROR;
+
+        // Writing information to the command buffer
+        amdxdna_cmd_chain *cmd_chain_payload = reinterpret_cast<amdxdna_cmd_chain *>(cmd_chain->data);
+        cmd_chain->state = HSA_AMD_AIE_ERT_STATE_NEW;
+        cmd_chain->extra_cu_masks = 0;
+        // TODO: Figure out why this is the value
+        cmd_chain->count = 0xA;
+        cmd_chain->opcode = HSA_AMD_AIE_ERT_CMD_CHAIN;
+        cmd_chain_payload->command_count = cmd_handles.size();
+        cmd_chain_payload->submit_index = 0;
+        cmd_chain_payload->error_index = 0;
+        for (size_t i = 0; i < cmd_handles.size(); i++) {
+          cmd_chain_payload->data[i] = cmd_handles[i];
+        }
+
+        // Syncing BOs before we execute the command
+        if (SyncBos(bo_args, fd))
+          return HSA_STATUS_ERROR;
+
+        // Removing duplicates in the bo container. The driver will report
+        // an error if we provide the same BO handle multiple times.
+        // This can happen if any of the BOs are the same across jobs
+        std::sort(bo_args.begin(), bo_args.end());
+        bo_args.erase(std::unique(bo_args.begin(), bo_args.end()), bo_args.end());
+
+        // Filling in the fields to execute the command chain
+        amdxdna_drm_exec_cmd exec_cmd_0 = {};
+        exec_cmd_0.ext = 0;
+        exec_cmd_0.ext_flags = 0;
+        exec_cmd_0.hwctx = hw_ctx_handle;
+        exec_cmd_0.type = AMDXDNA_CMD_SUBMIT_EXEC_BUF;
+        exec_cmd_0.cmd_handles = cmd_chain_bo_handle;
+        exec_cmd_0.args = reinterpret_cast<uint64_t>(bo_args.data());
+        exec_cmd_0.cmd_count = 1;
+        exec_cmd_0.arg_count = bo_args.size();
+
+        // Executing all commands in the command chain; a failed submit or wait is an error
+        if (ExecCmdAndWait(&exec_cmd_0, hw_ctx_handle, fd))
+          return HSA_STATUS_ERROR;
+
+        // Syncing BOs after we execute the command
+        if (SyncBos(bo_args, fd))
+          return HSA_STATUS_ERROR;
+
+        cur_id += num_cont_start_cu_pkts;
+        break;
+      }
+      default: {
+        return HSA_STATUS_ERROR;
+      }
+    }
+  }
+
+  return HSA_STATUS_SUCCESS;
 }
 
 void AieAqlQueue::StoreRelease(hsa_signal_value_t value) {
@@ -207,16 +472,16 @@ void AieAqlQueue::StoreRelease(hsa_signal_value_t value) {
 hsa_status_t AieAqlQueue::GetInfo(hsa_queue_info_attribute_t attribute,
                                   void *value) {
   switch (attribute) {
-  case HSA_AMD_QUEUE_INFO_AGENT:
-    *(reinterpret_cast<hsa_agent_t *>(value)) = agent_.public_handle();
-    break;
-  case HSA_AMD_QUEUE_INFO_DOORBELL_ID:
-    // Hardware doorbell supports AQL semantics.
-    *(reinterpret_cast<uint64_t *>(value)) =
-        reinterpret_cast<uint64_t>(signal_.hardware_doorbell_ptr);
-    break;
-  default:
-    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    case HSA_AMD_QUEUE_INFO_AGENT:  // value is written as an hsa_agent_t
+      *static_cast<hsa_agent_t *>(value) = agent_.public_handle();
+      break;
+    case HSA_AMD_QUEUE_INFO_DOORBELL_ID:  // value is written as a uint64_t
+      // Hardware doorbell supports AQL semantics.
+      *static_cast<uint64_t *>(value) =
+          reinterpret_cast<uint64_t>(signal_.hardware_doorbell_ptr);
+      break;
+    default:  // every other attribute is rejected
+      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
   }
   return HSA_STATUS_SUCCESS;
 }