Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

IREE AIE #14

Open
wants to merge 20 commits into
base: amd-staging
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
855c118
rocr/aie: Add support for creating AIE queue context
atgutier Aug 19, 2024
6121b37
rocr/aie: Add AMD AIE Embedded Runtime vendor packets
atgutier Aug 19, 2024
726d631
rocr/aie: Init mem regions for AIE agents
atgutier Aug 19, 2024
93ca2cb
rocr/aie: Allocate AIE queue's ring buf
atgutier Aug 19, 2024
04e2e25
rocr: Add AMD ext for configuring a queue's HW context
atgutier Aug 19, 2024
a5790de
rocr/aie: Support VMEM handle creation
atgutier Aug 19, 2024
03edf03
rocr/aie: Correct reporting of dev heap size
ypapadop-amd Aug 28, 2024
bfaa50e
rocr/aie: Missing AIEAgent info cases
ypapadop-amd Jun 11, 2024
349d6f5
rocr/aie: Handle sideband
eddierichter-amd Aug 8, 2024
6c5b5e5
rocr/aie: Fix merge conflict
ypapadop-amd Aug 29, 2024
fc4356b
rocr/aie: Adding a placeholder UUID for NPU devices
josemonsalve2 Aug 9, 2024
531e0a2
re-enable aie_hsa_dispatch_test
makslevental Sep 1, 2024
f01b0e4
Adding soft queue dispatch logic to dispatch commands to AIE agents (#2)
eddierichter-amd Sep 3, 2024
5089203
Fix narrowing conversion warnings (#17)
ypapadop-amd Sep 4, 2024
5e978ce
Releasing buffer for XDNA hw context params (#18)
ypapadop-amd Sep 4, 2024
63d45b7
Using BOs of type BO_SHMEM instead of BO_CMD for kernarg memory regio…
eddierichter-amd Sep 5, 2024
c817704
Adding GetInfo keys for ROCm 6.2 support (#20)
ypapadop-amd Sep 5, 2024
0757f63
Fixing command sizing and changing the test to issue the maximum numb…
eddierichter-amd Sep 12, 2024
793c038
Avoid incorrect casting (#24)
ypapadop-amd Sep 13, 2024
c046b70
Freeing the commands and the command chain created during dispatch (#29)
eddierichter-amd Sep 21, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/ci-linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,9 @@ jobs:
-DCMAKE_BUILD_TYPE=Release \
"-Dhsa-runtime64_DIR=$hsa_runtime64_ROOT/lib64/cmake/hsa-runtime64" \
-S "$PWD" -B "$build_dir"
cmake --build "$build_dir" --target aie_hsa_dispatch_test

! cmake --build "$build_dir" --target aie_hsa_dispatch_test
"$build_dir"/aie_hsa_dispatch_test $PWD

popd

Expand Down
177 changes: 99 additions & 78 deletions rocrtst/suites/aie/aie_hsa_dispatch_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,8 @@ void load_pdi_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
bin_file.read(reinterpret_cast<char *>(*buf), size);
}

void load_dpu_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
void **buf) {
void load_instr_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
void **buf, uint32_t &num_instr) {
std::ifstream bin_file(file_name,
std::ios::binary | std::ios::ate | std::ios::in);

Expand All @@ -129,6 +129,7 @@ void load_dpu_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
auto r = hsa_amd_memory_pool_allocate(mem_pool, size, 0, buf);
assert(r == HSA_STATUS_SUCCESS);
std::memcpy(*buf, pdi_vec.data(), pdi_vec.size() * sizeof(uint32_t));
num_instr = pdi_vec.size();
}

} // namespace
Expand All @@ -144,9 +145,9 @@ int main(int argc, char **argv) {
hsa_amd_memory_pool_t global_dev_mem_pool{0};
// System memory pool. Used for allocating kernel argument data.
hsa_amd_memory_pool_t global_kernarg_mem_pool{0};
const std::string dpu_inst_file_name(sourcePath / "add_one_insts.txt");
const std::string instr_inst_file_name(sourcePath / "add_one_insts.txt");
const std::string pdi_file_name(sourcePath / "add_one.pdi");
uint32_t *dpu_inst_buf(nullptr);
uint32_t *instr_inst_buf(nullptr);
uint64_t *pdi_buf(nullptr);

assert(aie_agents.empty());
Expand All @@ -164,8 +165,6 @@ int main(int argc, char **argv) {
// Find the AIE agents in the system.
r = hsa_iterate_agents(get_aie_agents, &aie_agents);
assert(r == HSA_STATUS_SUCCESS);
// assert(hsa_iterate_agents(get_cpu_agents, &aie_agents) ==
// HSA_STATUS_SUCCESS);
assert(aie_agents.size() == 1);

const auto &aie_agent = aie_agents.front();
Expand All @@ -190,14 +189,22 @@ int main(int argc, char **argv) {
assert(r == HSA_STATUS_SUCCESS);
assert(global_kernarg_mem_pool.handle);

// Getting the maximum size of the queue so we can submit that many consecutive
// packets.
uint32_t aie_max_queue_size;
r = hsa_agent_get_info(aie_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &aie_max_queue_size);
assert(r == HSA_STATUS_SUCCESS);
int num_pkts = aie_max_queue_size;

// Load the instruction and PDI files into a global pool that doesn't support
// kernel args (DEV BO).
load_dpu_file(global_dev_mem_pool, dpu_inst_file_name,
reinterpret_cast<void **>(&dpu_inst_buf));
uint32_t dpu_handle = 0;
r = hsa_amd_get_handle_from_vaddr(dpu_inst_buf, &dpu_handle);
uint32_t num_instr;
load_instr_file(global_dev_mem_pool, instr_inst_file_name,
reinterpret_cast<void **>(&instr_inst_buf), num_instr);
uint32_t instr_handle = 0;
r = hsa_amd_get_handle_from_vaddr(instr_inst_buf, &instr_handle);
assert(r == HSA_STATUS_SUCCESS);
assert(dpu_handle != 0);
assert(instr_handle != 0);

load_pdi_file(global_dev_mem_pool, pdi_file_name,
reinterpret_cast<void **>(&pdi_buf));
Expand All @@ -222,85 +229,99 @@ int main(int argc, char **argv) {
constexpr std::size_t data_buffer_size =
num_data_elements * sizeof(std::uint32_t);

std::uint32_t *input = {};
r = hsa_amd_memory_pool_allocate(global_dev_mem_pool, data_buffer_size, 0,
reinterpret_cast<void **>(&input));
assert(r == HSA_STATUS_SUCCESS);
std::uint32_t input_handle = {};
r = hsa_amd_get_handle_from_vaddr(input, &input_handle);
assert(r == HSA_STATUS_SUCCESS);
assert(input_handle != 0);

std::uint32_t *output = {};
r = hsa_amd_memory_pool_allocate(global_dev_mem_pool, data_buffer_size, 0,
reinterpret_cast<void **>(&output));
assert(r == HSA_STATUS_SUCCESS);
std::uint32_t output_handle = {};
r = hsa_amd_get_handle_from_vaddr(output, &output_handle);
assert(r == HSA_STATUS_SUCCESS);
assert(output_handle != 0);
std::vector<uint32_t *> input(num_pkts);
std::vector<uint32_t *> output(num_pkts);
std::vector<hsa_amd_aie_ert_start_kernel_data_t *> cmd_payloads(num_pkts);
std::vector<uint32_t> input_handle(num_pkts);
std::vector<uint32_t> output_handle(num_pkts);

uint64_t wr_idx = 0;
uint64_t packet_id = 0;

for (int pkt_iter = 0; pkt_iter < num_pkts; pkt_iter++) {
r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0,
reinterpret_cast<void **>(&input[pkt_iter]));
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_get_handle_from_vaddr(input[pkt_iter], &input_handle[pkt_iter]);
assert(r == HSA_STATUS_SUCCESS);
assert(input_handle[pkt_iter] != 0);

r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0,
reinterpret_cast<void **>(&output[pkt_iter]));
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_get_handle_from_vaddr(output[pkt_iter], &output_handle[pkt_iter]);
assert(r == HSA_STATUS_SUCCESS);
assert(output_handle[pkt_iter] != 0);

for (std::size_t i = 0; i < num_data_elements; i++) {
*(input[pkt_iter] + i) = i * (pkt_iter + 1);
*(output[pkt_iter] + i) = 0xDEFACE;
}

for (std::size_t i = 0; i < num_data_elements; i++) {
*(input + i) = i;
*(output + i) = 0xDEFACE;
// Getting a slot in the queue
wr_idx = hsa_queue_add_write_index_relaxed(aie_queue, 1);
packet_id = wr_idx % aie_queue->size;

// Creating a packet to store the command
hsa_amd_aie_ert_packet_t *cmd_pkt = static_cast<hsa_amd_aie_ert_packet_t *>(
aie_queue->base_address) + packet_id;
assert(r == HSA_STATUS_SUCCESS);
cmd_pkt->state = HSA_AMD_AIE_ERT_STATE_NEW;
cmd_pkt->count = 0xA; // # of arguments to put in command
cmd_pkt->opcode = HSA_AMD_AIE_ERT_START_CU;
cmd_pkt->header.AmdFormat = HSA_AMD_PACKET_TYPE_AIE_ERT;
cmd_pkt->header.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
<< HSA_PACKET_HEADER_TYPE;

// Creating the payload for the packet
hsa_amd_aie_ert_start_kernel_data_t *cmd_payload = NULL;
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0,
reinterpret_cast<void **>(&cmd_payload));
assert(r == HSA_STATUS_SUCCESS);
// Selecting the PDI to use with this command
cmd_payload->cu_mask = 0x1;
// Transaction opcode
cmd_payload->data[0] = 0x3;
cmd_payload->data[1] = 0x0;
cmd_payload->data[2] = instr_handle;
cmd_payload->data[3] = 0x0;
cmd_payload->data[4] = num_instr;
cmd_payload->data[5] = input_handle[pkt_iter];
cmd_payload->data[6] = 0;
cmd_payload->data[7] = output_handle[pkt_iter];
cmd_payload->data[8] = 0;
cmd_pkt->payload_data = reinterpret_cast<uint64_t>(cmd_payload);

// Keeping track of payloads so we can free them at the end
cmd_payloads[pkt_iter] = cmd_payload;
}

///////////////////////////////////// Creating the cmd packet
// Creating a packet to store the command
hsa_amd_aie_ert_packet_t *cmd_pkt = NULL;
r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0,
reinterpret_cast<void **>(&cmd_pkt));
assert(r == HSA_STATUS_SUCCESS);
cmd_pkt->state = HSA_AMD_AIE_ERT_STATE_NEW;
cmd_pkt->count = 0xA; // # of arguments to put in command
cmd_pkt->opcode = HSA_AMD_AIE_ERT_START_CU;
cmd_pkt->header.AmdFormat = HSA_AMD_PACKET_TYPE_AIE_ERT;
cmd_pkt->header.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
<< HSA_PACKET_HEADER_TYPE;

// Creating the payload for the packet
hsa_amd_aie_ert_start_kernel_data_t *cmd_payload = NULL;
uint32_t cmd_handle;
r = hsa_amd_get_handle_from_vaddr(reinterpret_cast<void *>(cmd_pkt),
&cmd_handle);
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0,
reinterpret_cast<void **>(&cmd_payload));
assert(r == HSA_STATUS_SUCCESS);
cmd_payload->cu_mask = 0x1; // Selecting the PDI to use with this command
cmd_payload->data[0] = 0x3; // Transaction opcode
cmd_payload->data[1] = 0x0;
cmd_payload->data[2] = dpu_handle;
cmd_payload->data[3] = 0x0;
cmd_payload->data[4] = 0x44; // Size of DPU instruction
cmd_payload->data[5] = input_handle;
cmd_payload->data[6] = 0;
cmd_payload->data[7] = output_handle;
cmd_payload->data[8] = 0;
cmd_pkt->payload_data = reinterpret_cast<uint64_t>(cmd_payload);

uint64_t wr_idx = hsa_queue_add_write_index_relaxed(aie_queue, 1);
uint64_t packet_id = wr_idx % aie_queue->size;
reinterpret_cast<hsa_amd_aie_ert_packet_t *>(
aie_queue->base_address)[packet_id] = *cmd_pkt;
// Ringing the doorbell to dispatch each packet we added to
// the queue
hsa_signal_store_screlease(aie_queue->doorbell_signal, wr_idx);

for (std::size_t i = 0; i < num_data_elements; i++) {
const auto expected = *(input + i) + 1;
const auto result = *(output + i);
assert(result == expected);
for (int pkt_iter = 0; pkt_iter < num_pkts; pkt_iter++) {
for (std::size_t i = 0; i < num_data_elements; i++) {
const auto expected = *(input[pkt_iter] + i) + 1;
const auto result = *(output[pkt_iter] + i);
assert(result == expected);
}

r = hsa_amd_memory_pool_free(output[pkt_iter]);
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_free(input[pkt_iter]);
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_free(cmd_payloads[pkt_iter]);
assert(r == HSA_STATUS_SUCCESS);
}

r = hsa_queue_destroy(aie_queue);
assert(r == HSA_STATUS_SUCCESS);

r = hsa_amd_memory_pool_free(output);
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_free(input);
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_free(pdi_buf);
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_free(dpu_inst_buf);
r = hsa_amd_memory_pool_free(instr_inst_buf);
assert(r == HSA_STATUS_SUCCESS);

r = hsa_shut_down();
Expand Down
13 changes: 13 additions & 0 deletions runtime/hsa-runtime/core/common/hsa_table_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -922,6 +922,19 @@ uint32_t HSA_API
wait_hint, satisfying_value);
}

// Mirrors AMD Extension APIs.
// Entry point for hsa_amd_queue_hw_ctx_config: passes queue, config_type and
// args unchanged to the hsa_amd_queue_hw_ctx_config_fn slot of amdExtTable and
// hands back whatever status that slot returns.
hsa_status_t hsa_amd_queue_hw_ctx_config(
    const hsa_queue_t *queue, hsa_amd_queue_hw_ctx_config_param_t config_type,
    void *args) {
  const hsa_status_t status =
      amdExtTable->hsa_amd_queue_hw_ctx_config_fn(queue, config_type, args);
  return status;
}

// Mirrors AMD Extension APIs.
// Entry point for hsa_amd_get_handle_from_vaddr: passes ptr and handle
// unchanged to the hsa_amd_get_handle_from_vaddr_fn slot of amdExtTable and
// hands back whatever status that slot returns.
hsa_status_t hsa_amd_get_handle_from_vaddr(void* ptr, uint32_t* handle) {
  const hsa_status_t status =
      amdExtTable->hsa_amd_get_handle_from_vaddr_fn(ptr, handle);
  return status;
}

// Mirrors Amd Extension Apis
hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
uint32_t num_cu_mask_count,
Expand Down
26 changes: 25 additions & 1 deletion runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ namespace AMD {
KfdDriver::KfdDriver(std::string devnode_name)
: core::Driver(core::DriverType::KFD, devnode_name) {}

// Initialization hook for the KFD driver. This implementation does no work
// and unconditionally reports success.
hsa_status_t KfdDriver::Init() {
  return HSA_STATUS_SUCCESS;
}

hsa_status_t KfdDriver::DiscoverDriver() {
if (hsaKmtOpenKFD() == HSAKMT_STATUS_SUCCESS) {
std::unique_ptr<Driver> kfd_drv(new KfdDriver("/dev/kfd"));
Expand All @@ -74,6 +76,10 @@ hsa_status_t KfdDriver::QueryKernelModeDriver(core::DriverQuery query) {
return HSA_STATUS_SUCCESS;
}

// Agent-property query for the KFD driver. The agent argument is not read or
// modified here; the call unconditionally reports success.
hsa_status_t KfdDriver::GetAgentProperties(core::Agent &agent) const {
  static_cast<void>(agent);  // Intentionally unused.
  return HSA_STATUS_SUCCESS;
}

hsa_status_t
KfdDriver::GetMemoryProperties(uint32_t node_id,
core::MemoryRegion &mem_region) const {
Expand All @@ -97,6 +103,11 @@ KfdDriver::AllocateMemory(const core::MemoryRegion &mem_region,
kmt_alloc_flags.ui32.NonPaged = 1;
}

if (m_region.IsLocalMemory() &&
(alloc_flags & core::MemoryRegion::AllocateMemoryOnly)) {
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}

// Allocating a memory handle for virtual memory
kmt_alloc_flags.ui32.NoAddress =
!!(alloc_flags & core::MemoryRegion::AllocateMemoryOnly);
Expand Down Expand Up @@ -230,14 +241,27 @@ hsa_status_t KfdDriver::FreeMemory(void *mem, size_t size) {
return FreeKfdMemory(mem, size) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
}

hsa_status_t KfdDriver::CreateQueue(core::Queue &queue) {
// Queue-creation hook for the KFD driver. The queue argument is not touched
// here; the call unconditionally reports success.
hsa_status_t KfdDriver::CreateQueue(core::Queue &queue) const {
  static_cast<void>(queue);  // Intentionally unused.
  return HSA_STATUS_SUCCESS;
}

// Queue-destruction hook for the KFD driver. The queue argument is not
// touched here; the call unconditionally reports success.
hsa_status_t KfdDriver::DestroyQueue(core::Queue &queue) const {
  static_cast<void>(queue);  // Intentionally unused.
  return HSA_STATUS_SUCCESS;
}

// Hardware-context configuration hook for the KFD driver.
// Only AIE queues support this for now, so none of the arguments are
// inspected and the call always fails with HSA_STATUS_ERROR_INVALID_AGENT.
hsa_status_t KfdDriver::ConfigHwCtx(
    core::Queue &queue, hsa_amd_queue_hw_ctx_config_param_t config_type,
    void *args) {
  static_cast<void>(queue);        // Intentionally unused.
  static_cast<void>(config_type);  // Intentionally unused.
  static_cast<void>(args);         // Intentionally unused.
  return HSA_STATUS_ERROR_INVALID_AGENT;
}

// Virtual-address-to-handle lookup hook for the KFD driver.
// Only AIE queues support this for now, so ptr is not examined, *handle is
// never written, and the call always fails with
// HSA_STATUS_ERROR_INVALID_AGENT.
hsa_status_t KfdDriver::GetHandleFromVaddr(void* ptr, uint32_t* handle) {
  static_cast<void>(ptr);     // Intentionally unused.
  static_cast<void>(handle);  // Intentionally unused.
  return HSA_STATUS_ERROR_INVALID_AGENT;
}

void *KfdDriver::AllocateKfdMemory(const HsaMemFlags &flags, uint32_t node_id,
size_t size) {
void *mem = nullptr;
Expand Down
Loading