From 91918778617d766f3044e127040f753d3e988388 Mon Sep 17 00:00:00 2001 From: makslevental Date: Fri, 30 Aug 2024 12:27:52 -0500 Subject: [PATCH] [WIP] AIE CI tests --- .github/workflows/ci-linux.yml | 168 ++++++ rocrtst/suites/aie/CMakeLists.txt | 8 + rocrtst/suites/aie/add_one.pdi | Bin 0 -> 3552 bytes rocrtst/suites/aie/add_one_insts.txt | 68 +++ rocrtst/suites/aie/aie_hsa_bare_add_one.cc | 485 +++++++++++++++++ rocrtst/suites/aie/aie_hsa_dispatch_test.cc | 309 +++++++++++ rocrtst/suites/aie/amdxdna_accel.h | 569 ++++++++++++++++++++ rocrtst/suites/aie/hsa_ipu.h | 271 ++++++++++ 8 files changed, 1878 insertions(+) create mode 100644 .github/workflows/ci-linux.yml create mode 100644 rocrtst/suites/aie/CMakeLists.txt create mode 100644 rocrtst/suites/aie/add_one.pdi create mode 100644 rocrtst/suites/aie/add_one_insts.txt create mode 100644 rocrtst/suites/aie/aie_hsa_bare_add_one.cc create mode 100644 rocrtst/suites/aie/aie_hsa_dispatch_test.cc create mode 100644 rocrtst/suites/aie/amdxdna_accel.h create mode 100644 rocrtst/suites/aie/hsa_ipu.h diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml new file mode 100644 index 000000000..f83f3eeac --- /dev/null +++ b/.github/workflows/ci-linux.yml @@ -0,0 +1,168 @@ +name: CI Linux + +on: + workflow_call: + workflow_dispatch: + pull_request: + merge_group: + push: + branches: + - main + +concurrency: + # A PR number if a pull request and otherwise the commit hash. This cancels + # queued and in-progress runs for the same PR (presubmit) or commit + # (postsubmit). 
+ group: ci-build-test-cpp-linux-${{ github.event.number || github.sha }} + cancel-in-progress: true + +jobs: + build: + name: Build (linux) + strategy: + fail-fast: false + matrix: + runs-on: + - ubuntu-22.04 +# - nod-ai-shared-cpubuilder-manylinux-x86_64 + runs-on: ${{ matrix.runs-on }} + steps: + - name: Install tmate + if: ${{ matrix.runs-on == 'nod-ai-shared-cpubuilder-manylinux-x86_64' }} + run: dnf install -y epel-release && dnf install -y tmate + + - name: Configure local git mirrors + if: ${{ matrix.runs-on == 'nod-ai-shared-cpubuilder-manylinux-x86_64' }} + run: | + /gitmirror/scripts/trigger_update_mirrors.sh + /gitmirror/scripts/git_config.sh + + - name: Install deps + if: ${{ matrix.runs-on == 'nod-ai-shared-cpubuilder-manylinux-x86_64' }} + run: | + dnf install -y almalinux-release-devel + yum install -y elfutils-libelf-devel p7zip p7zip-plugins + + - name: Install deps + if: ${{ matrix.runs-on == 'ubuntu-22.04' }} + run: | + sudo apt install -y libelf-dev libnuma-dev libdrm-dev + + - name: Setup Cpp + if: ${{ matrix.runs-on == 'ubuntu-22.04' }} + uses: aminya/setup-cpp@v1 + with: + compiler: llvm-18 + cmake: true + ninja: true + ccache: true + + - name: "Checking out repository" + uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0 + with: + submodules: recursive + + - name: Build and install libnuma + if: ${{ matrix.runs-on == 'nod-ai-shared-cpubuilder-manylinux-x86_64' }} + run: | + curl --silent -L \ + https://github.com/numactl/numactl/releases/download/v2.0.18/numactl-2.0.18.tar.gz \ + -o numactl-2.0.18.tar.gz + tar -xf numactl-2.0.18.tar.gz + pushd numactl-2.0.18 + ./configure + make install + popd + + - name: Hack ROCR + run: | + sed -i 's/amdgcn-amd-amdhsa/amdgcn-amd-amdhsa -nogpulib/g' runtime/hsa-runtime/core/runtime/blit_shaders/CMakeLists.txt + sed -i 's/amdgcn-amd-amdhsa/amdgcn-amd-amdhsa -nogpulib/g' runtime/hsa-runtime/core/runtime/trap_handler/CMakeLists.txt + sed -i 's/amdgcn-amd-amdhsa/amdgcn-amd-amdhsa 
-nogpulib/g' runtime/hsa-runtime/image/blit_src/CMakeLists.txt + + - name: Build ROCR distro + run: | + rocr_dir="$(cd ${{ github.workspace }} && pwd)" + build_rocr_dir="${{ github.workspace }}/rocr-build" + mkdir -p "$build_rocr_dir" + build_rocr_dir="$(cd $build_rocr_dir && pwd)" + rocr_install_dir="${{ github.workspace }}/rocr-install" + + cmake -GNinja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX="$rocr_install_dir" \ + -DClang_DIR=/usr/lib/llvm-18/lib/cmake/clang \ + -DLLVM_DIR=/usr/lib/llvm-18/lib/cmake/llvm \ + -DIMAGE_SUPPORT=OFF \ + -S "$rocr_dir" -B "$build_rocr_dir" + + cmake --build "$build_rocr_dir" --target install + tar -cf rocr-$(git rev-parse --short HEAD).tar rocr-install + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + if: ${{ !cancelled() }} + with: + name: linux_x86_64_distro + path: rocr-*.tar + if-no-files-found: warn + + - name: Setup tmate session + if: ${{ failure() }} + uses: mxschmitt/action-tmate@v3.18 + with: + limit-access-to-actor: true + install-dependencies: ${{ matrix.runs-on == 'ubuntu-22.04' }} + + test_aie: + name: AIE tests + needs: build + strategy: + fail-fast: false + matrix: + runs-on: [linux-phoenix] + runs-on: ${{ matrix.runs-on }} + steps: + - name: "Checking out repository" + uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 + + - name: Download artifacts + uses: actions/download-artifact@v4 + with: + name: linux_x86_64_distro + + - name: Extract artifact + run: | + tar -xf rocr-*.tar + echo hsa_runtime64_ROOT="$PWD/rocr-install" >> $GITHUB_ENV + + - name: Build and run AIE smoke test + run: | + pushd rocrtst/suites/aie + + build_dir="$PWD/build" + mkdir -p $build_dir + cmake -GNinja \ + -DCMAKE_BUILD_TYPE=Release \ + "-Dhsa-runtime64_DIR=$hsa_runtime64_ROOT/lib/cmake/hsa-runtime64" \ + -S "$PWD" -B "$build_dir" + cmake --build "$build_dir" --target aie_hsa_bare_add_one + + "$build_dir"/aie_hsa_bare_add_one $PWD + + popd + + - name: Build AIE test suite + run: | + pushd 
rocrtst/suites/aie + + build_dir="$PWD/build" + mkdir -p $build_dir + cmake -GNinja \ + -DCMAKE_BUILD_TYPE=Release \ + "-Dhsa-runtime64_DIR=$hsa_runtime64_ROOT/lib/cmake/hsa-runtime64" \ + -S "$PWD" -B "$build_dir" + + ! cmake --build "$build_dir" --target aie_hsa_dispatch_test + + popd diff --git a/rocrtst/suites/aie/CMakeLists.txt b/rocrtst/suites/aie/CMakeLists.txt new file mode 100644 index 000000000..b6aaf0cac --- /dev/null +++ b/rocrtst/suites/aie/CMakeLists.txt @@ -0,0 +1,8 @@ +find_package(hsa-runtime64 CONFIG REQUIRED NAMES hsa_runtime64 hsa-runtime64) + +# smoke test +add_executable(aie_hsa_bare_add_one aie_hsa_bare_add_one.cc) + +# hsa test +add_executable(aie_hsa_dispatch_test aie_hsa_dispatch_test.cc) +target_link_libraries(aie_hsa_dispatch_test PUBLIC hsa-runtime64::hsa-runtime64) diff --git a/rocrtst/suites/aie/add_one.pdi b/rocrtst/suites/aie/add_one.pdi new file mode 100644 index 0000000000000000000000000000000000000000..f21475d21f3ef286d180bea6e2a3127c53a48239 GIT binary patch literal 3552 zcmcInO>7%Q6n^VHc)u)J;mr zN*D9Ey|nSJ>$4nB9)I@2xic4DJ$wG#w!`=~^rN}Xp4JYP=rB6pG?ev30Xc@u z`#1R5n!Oo>gH-=>h(OeeIm~^&Xh#q%qCnxcI;BCNT+pPUI!{~s9)j9G)}~J?lRr!; zoswTc9wY)4*Zo9!?a}5die*tzK`x!&;GYv4ww=i)4jmaM`YiaO8}8Zc)Q{y~JQm-L zDCygBKJ;l6zD(IyCVM=ln$PeNp(ce!dB?mT8o{;8dfzr~Z-tFp8)4(-df2#Yhm8^R;mvUV z#=S7+GZUcHlKoI8w%v*TWF>!M{;qwXroEt3Y(=5x6}6z3lbGP-D18AWb`u!u>+Jm) zdSM}!z4gMnCwcrW5*hQxJMyCX4*OF_*_t(Xd&m0c1C>#|->7B?>&%)W|M2J7Q%kBj zTyy>ZMgGce{$_d*FqB39W_m>afJOeULN6@7e8IXWd0ai^^XJHy)ce?wy3QhiQ zx1%Zqo&E66?{4#7o$u=L|7-qBH|}YfyS9e9fphNVr+4W*b?Q{KHKVanH)7j$MtVLu zJ(Uq+KO1!)m+MjH!-kksRk`X7QVyj%)!A_M#MBYO-jxGI!&H;T9Xp)wLlr7+g}J^a z$)emTTO%r6RyA){A_l5!COh5g>GH#?P4KfEz&1EPZ@icumHM`@KJ+`bR(jobeaS1D z0C9}TD;y7d+X7buq!e_k4`?Y(y${cIEIe@BN>k>E`gmsTQhacJ1n&uscX91{EN_0T zr#j#J=#PX(rD;_wU0#>wY@GJ8$NAvrwIL^-UXDq|)Nj#D^FFuQE0k%z*ut|zj;55h zb`b;Tb=rFe@VhXeKr-9;73p+IW}hm*E6Z&R1=b~ON;n~5OTuXh4@r1b!Z``gNO)et zixMtMcv-?L5-v;F1m^YeS`xC}a$Y_{FJY77+k9Ye?IiipB|jwbb(t^u)Lk4w$cW;^ 
ztyB1m0&TgCOw<5@rOV2R-~s>lhC?wOK2D9lfnQ&XZwbETCqw%6Oz^>4{2|9be?Ivd z_|2TF;LmMBFV2pOc|~lCLKbHU=SY`q5fJK>eXhS2vVd~b;>!M;WuJUQ=9}O#zewKr zf96{*KSx|-ceazZev^aoshQqmJ0B$QLZk1Gc`-vU1Zy@yHC$Dzl%1ScrzgV5ja>bqrq txm}u1d&OrI`ih@bM=x@Rk0}1OCp;l62O;x+FCCrFt>BnqERV0WzW_#=ycPfe literal 0 HcmV?d00001 diff --git a/rocrtst/suites/aie/add_one_insts.txt b/rocrtst/suites/aie/add_one_insts.txt new file mode 100644 index 000000000..a5e9f9d33 --- /dev/null +++ b/rocrtst/suites/aie/add_one_insts.txt @@ -0,0 +1,68 @@ +06030100 +00000105 +00000007 +00000110 +00000001 +00000000 +0001D000 +00000030 +00000400 +00000000 +00000000 +00000000 +80000000 +00000000 +00000000 +02000000 +00000081 +00000030 +00000000 +00000000 +00000000 +00000000 +0001D004 +00000000 +00000001 +00000000 +00000000 +00000000 +00000000 +00000000 +0001D204 +00000000 +80000000 +00000018 +00000001 +00000000 +0001D020 +00000030 +00000400 +00000000 +00000000 +00000000 +80000000 +00000000 +00000000 +02000000 +00000081 +00000030 +00000000 +00000000 +00000000 +00000000 +0001D024 +00000000 +00000000 +00000000 +00000000 +00000000 +00000000 +00000000 +0001D214 +00000000 +00000001 +00000018 +00000080 +00000010 +00000000 +00010100 diff --git a/rocrtst/suites/aie/aie_hsa_bare_add_one.cc b/rocrtst/suites/aie/aie_hsa_bare_add_one.cc new file mode 100644 index 000000000..ad41567b0 --- /dev/null +++ b/rocrtst/suites/aie/aie_hsa_bare_add_one.cc @@ -0,0 +1,485 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include + +#include "amdxdna_accel.h" +#include "hsa_ipu.h" + +#define DATA_BUFFER_SIZE (1024 * 4) + +/* + * Interpretation of the beginning of data payload for ERT_CMD_CHAIN in + * amdxdna_cmd. The rest of the payload in amdxdna_cmd is cmd BO handles. 
+ */ +struct amdxdna_cmd_chain { + uint32_t command_count; + uint32_t submit_index; + uint32_t error_index; + uint32_t reserved[3]; + uint64_t data[] __counted_by(command_count); +}; + +/* Exec buffer command header format */ +struct amdxdna_cmd { + union { + struct { + uint32_t state : 4; + uint32_t unused : 6; + uint32_t extra_cu_masks : 2; + uint32_t count : 11; + uint32_t opcode : 5; + uint32_t reserved : 4; + }; + uint32_t header; + }; + uint32_t data[] __counted_by(count); +}; + +// These packets are variable width but using this as a +// maximum size for now +#define PACKET_SIZE 64 + +int main(int argc, char **argv) { + int drv_fd; + int ret; + const char drv_path[] = "/dev/accel/accel0"; + std::string test_dir(argv[1]); + std::string inst_path = test_dir + "/add_one_insts.txt"; + std::string pdi_path_str = test_dir + "/add_one.pdi"; + const char *dpu_inst_path = inst_path.c_str(); + const char *pdi_path = pdi_path_str.c_str(); // Add one kernel + uint32_t heap_handle; + uint32_t major, minor; + + // open the driver + drv_fd = open(drv_path, O_RDWR); + + if (drv_fd < 0) { + printf("Error %i opening %s\n", drv_fd, drv_path); + return -1; + } + + printf("%s open\n", drv_path); + + // get driver version + if (get_driver_version(drv_fd, &major, &minor) < 0) { + printf("Error getting driver version\n"); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + printf("Driver version %u.%u\n", major, minor); + + ///////////////////////////////////////////////////////////////////////////////// + // Step 0: Allocate the necessary BOs. This includes: + // 1. The operands for the two kernels that will be launched + // 2. A heap which contains: + // a. A PDI for the design that will be run + // b. 
Instruction sequences for both runs + + // reserve some device memory for the heap + if (alloc_heap(drv_fd, 48 * 1024 * 1024, &heap_handle) < 0) { + perror("Error allocating device heap"); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + uint64_t pdi_vaddr; + uint64_t pdi_sram_vaddr; + uint32_t pdi_handle; + printf("Loading pdi\n"); + ret = load_pdi(drv_fd, &pdi_vaddr, &pdi_sram_vaddr, &pdi_handle, pdi_path); + if (ret < 0) { + printf("Error %i loading pdi\n", ret); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + uint64_t dpu_0_vaddr; + uint64_t dpu_0_sram_vaddr; + uint32_t dpu_0_handle; + uint32_t num_dpu_0_insts; + printf("Loading dpu inst\n"); + ret = load_instructions(drv_fd, &dpu_0_vaddr, &dpu_0_sram_vaddr, + &dpu_0_handle, dpu_inst_path, &num_dpu_0_insts); + if (ret < 0) { + printf("Error %i loading dpu instructions\n", ret); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + uint64_t dpu_1_vaddr; + uint64_t dpu_1_sram_vaddr; + uint32_t dpu_1_handle; + uint32_t num_dpu_1_insts; + printf("Loading dpu inst\n"); + ret = load_instructions(drv_fd, &dpu_1_vaddr, &dpu_1_sram_vaddr, + &dpu_1_handle, dpu_inst_path, &num_dpu_1_insts); + if (ret < 0) { + printf("Error %i loading dpu instructions\n", ret); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + printf("DPU 0 instructions @: %p\n", (void *)dpu_0_vaddr); + printf("DPU 1 instructions @: %p\n", (void *)dpu_1_vaddr); + printf("PDI file @: %p\n", (void *)pdi_vaddr); + printf("PDI handle @: %d\n", pdi_handle); + + uint64_t input_0; + uint64_t input_0_sram_vaddr; + uint32_t input_0_handle; + ret = create_dev_bo(drv_fd, &input_0, &input_0_sram_vaddr, &input_0_handle, + DATA_BUFFER_SIZE); + printf("Input @: %p\n", (void *)input_0); + if (ret < 0) { + printf("Error %i creating data 0\n", ret); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + uint64_t output_0; + 
uint64_t output_0_sram_vaddr; + uint32_t output_0_handle; + ret = create_dev_bo(drv_fd, &output_0, &output_0_sram_vaddr, &output_0_handle, + DATA_BUFFER_SIZE); + printf("Output @: %p\n", (void *)output_0); + if (ret < 0) { + printf("Error %i creating data 1\n", ret); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + uint64_t input_1; + uint64_t input_1_sram_vaddr; + uint32_t input_1_handle; + ret = create_dev_bo(drv_fd, &input_1, &input_1_sram_vaddr, &input_1_handle, + DATA_BUFFER_SIZE); + printf("Input @: %p\n", (void *)input_1); + if (ret < 0) { + printf("Error %i creating data 0\n", ret); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + uint64_t output_1; + uint64_t output_1_sram_vaddr; + uint32_t output_1_handle; + ret = create_dev_bo(drv_fd, &output_1, &output_1_sram_vaddr, &output_1_handle, + DATA_BUFFER_SIZE); + printf("Output @: %p\n", (void *)output_1); + if (ret < 0) { + printf("Error %i creating data 1\n", ret); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + for (int i = 0; i < DATA_BUFFER_SIZE / sizeof(uint32_t); i++) { + *((uint32_t *)input_0 + i) = i; + *((uint32_t *)input_1 + i) = i + 0xFEEDED1E; + *((uint32_t *)output_0 + i) = 0xDEFACE; + *((uint32_t *)output_1 + i) = 0xDEADBEEF; + } + + // Writing the user buffers + sync_bo(drv_fd, input_0_handle); + sync_bo(drv_fd, output_0_handle); + sync_bo(drv_fd, input_1_handle); + sync_bo(drv_fd, output_1_handle); + + // Performing a sync on the queue descriptor, completion signal, queue buffer + // and config cu bo. + sync_bo(drv_fd, dpu_0_handle); + sync_bo(drv_fd, dpu_1_handle); + sync_bo(drv_fd, pdi_handle); + sync_bo(drv_fd, input_0_handle); + sync_bo(drv_fd, output_0_handle); + + ///////////////////////////////////////////////////////////////////////////////// + // Step 1: Create a user mode queue + // This is going to be where we create a queue where we: + // 1. 
Create and configure a hardware context + // 2. Allocate the queue buffer as a user-mode queue + + // Allocating a structure to store QOS information + amdxdna_qos_info *qos = + (struct amdxdna_qos_info *)malloc(sizeof(struct amdxdna_qos_info)); + // malloc can fail; do not dereference a null QoS pointer. + if (!qos) { + perror("Failed to allocate QoS info"); + close(drv_fd); + return -1; + } + qos->gops = 0; + qos->fps = 0; + qos->dma_bandwidth = 0; + qos->latency = 0; + qos->frame_exec_time = 0; + qos->priority = 0; + + // This is the structure that we pass + amdxdna_drm_create_hwctx create_hw_ctx = { + .ext = 0, + .ext_flags = 0, + .qos_p = (uint64_t)qos, + .umq_bo = 0, + .log_buf_bo = 0, + .max_opc = 0x800, // Not sure what this is but this was the value used + .num_tiles = 4, + .mem_size = 0, + .umq_doorbell = 0, + }; + ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_CREATE_HWCTX, &create_hw_ctx); + if (ret != 0) { + perror("Failed to create hwctx"); + return -1; + } + + // Creating a structure to configure the CU + amdxdna_cu_config cu_config = { + .cu_bo = pdi_handle, + .cu_func = 0, + }; + + // Creating a structure to configure the hardware context + amdxdna_hwctx_param_config_cu param_config_cu; + param_config_cu.num_cus = 1; + param_config_cu.cu_configs[0] = cu_config; + + // sizeof yields size_t, so print it with %zx rather than %lx. + printf("Size of param_config_cu: 0x%zx\n", sizeof(param_config_cu)); + + // Configuring the hardware context with the PDI + amdxdna_drm_config_hwctx config_hw_ctx = { + .handle = create_hw_ctx.handle, + .param_type = DRM_AMDXDNA_HWCTX_CONFIG_CU, + // Pass in the pointer to the param value + .param_val = (uint64_t)&param_config_cu, + // Size of param config CU is 16B + .param_val_size = 0x10, + }; + ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_CONFIG_HWCTX, &config_hw_ctx); + if (ret != 0) { + perror("Failed to config hwctx"); + return -1; + } + + ///////////////////////////////////////////////////////////////////////////////// + // Step 2: Configuring the CMD BOs with the different instruction sequences + amdxdna_drm_create_bo create_cmd_bo_0 = { + .type = AMDXDNA_BO_CMD, + .size = PACKET_SIZE, + }; + int cmd_bo_ret = ioctl(drv_fd, 
DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_bo_0); + if (cmd_bo_ret != 0) { + perror("Failed to create cmd_0"); + return -1; + } + + amdxdna_drm_get_bo_info cmd_bo_0_get_bo_info = {.handle = + create_cmd_bo_0.handle}; + ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_bo_0_get_bo_info); + if (ret != 0) { + perror("Failed to get cmd BO 0 info"); + return -2; + } + + // Writing the first packet to the queue + amdxdna_cmd *cmd_0 = (struct amdxdna_cmd *)mmap( + 0, PACKET_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, drv_fd, + cmd_bo_0_get_bo_info.map_offset); + cmd_0->state = 1; // ERT_CMD_STATE_NEW; + cmd_0->extra_cu_masks = 0; + cmd_0->count = 0xF; // NOTE: For some reason this needs to be larger + cmd_0->opcode = 0x0; // ERT_START_CU; + cmd_0->data[0] = 0x3; // NOTE: This one seems to be skipped + cmd_0->data[1] = 0x3; // Transaction opcode + cmd_0->data[2] = 0x0; + cmd_0->data[3] = dpu_0_sram_vaddr; + cmd_0->data[4] = 0x0; + cmd_0->data[5] = 0x44; // Size of DPU instruction + cmd_0->data[6] = input_0 & 0xFFFFFFFF; // Input low + cmd_0->data[7] = (input_0 >> 32) & 0xFFFFFFFF; // Input high + cmd_0->data[8] = output_0 & 0xFFFFFFFF; // Output low + cmd_0->data[9] = (output_0 >> 32) & 0xFFFFFFFF; // Output high + + // Writing to the second packet of the queue + amdxdna_drm_create_bo create_cmd_bo_1 = { + .type = AMDXDNA_BO_CMD, + .size = PACKET_SIZE, + }; + cmd_bo_ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_bo_1); + if (cmd_bo_ret != 0) { + perror("Failed to create cmd_1"); + return -1; + } + + amdxdna_drm_get_bo_info cmd_bo_1_get_bo_info = {.handle = + create_cmd_bo_1.handle}; + ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_bo_1_get_bo_info); + if (ret != 0) { + perror("Failed to get cmd BO 0 info"); + return -2; + } + + amdxdna_cmd *cmd_1 = (struct amdxdna_cmd *)mmap( + 0, PACKET_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, drv_fd, + cmd_bo_1_get_bo_info.map_offset); + cmd_1->state = 1; // ERT_CMD_STATE_NEW; + cmd_1->extra_cu_masks = 0; + 
cmd_1->count = 10; // Number of commands + cmd_1->opcode = 0x0; // ERT_START_CU; + cmd_1->data[0] = 0x3; // This one seems to be skipped + cmd_1->data[1] = 0x3; // Transaction opcode + cmd_1->data[2] = 0x0; + cmd_1->data[3] = dpu_1_sram_vaddr; + cmd_1->data[4] = 0x0; + cmd_1->data[5] = 0x44; // Size of DPU instruction + cmd_1->data[6] = input_1 & 0xFFFFFFFF; // Input low + cmd_1->data[7] = (input_1 >> 32) & 0xFFFFFFFF; // Input high + cmd_1->data[8] = output_1 & 0xFFFFFFFF; // Output low + cmd_1->data[9] = (output_1 >> 32) & 0xFFFFFFFF; // Output high + + ///////////////////////////////////////////////////////////////////////////////// + // Step 3: Submit commands -- This requires creating a BO_EXEC that contains + // the command chain that points to the instruction sequences just created + + // Allocate a command chain + void *bo_cmd_chain_buf = nullptr; + cmd_bo_ret = posix_memalign(&bo_cmd_chain_buf, 4096, 4096); + if (cmd_bo_ret != 0 || bo_cmd_chain_buf == nullptr) { + printf("[ERROR] Failed to allocate cmd_bo buffer of size %d\n", 4096); + } + + amdxdna_drm_create_bo create_cmd_chain_bo = { + .type = AMDXDNA_BO_CMD, + .size = 4096, + }; + cmd_bo_ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_chain_bo); + if (cmd_bo_ret != 0) { + perror("Failed to create command chain BO"); + return -1; + } + + amdxdna_drm_get_bo_info cmd_chain_bo_get_bo_info = { + .handle = create_cmd_chain_bo.handle}; + ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_chain_bo_get_bo_info); + if (ret != 0) { + perror("Failed to get cmd BO 0 info"); + return -2; + } + + amdxdna_cmd *cmd_chain = + (struct amdxdna_cmd *)mmap(0, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, + drv_fd, cmd_chain_bo_get_bo_info.map_offset); + + // Writing information to the command buffer + amdxdna_cmd_chain *cmd_chain_payload = + (struct amdxdna_cmd_chain *)(cmd_chain->data); + cmd_chain->state = 1; // ERT_CMD_STATE_NEW; + cmd_chain->extra_cu_masks = 0; + cmd_chain->count = 0xA; // TODO: Why is 
this the value? + cmd_chain->opcode = 0x13; // ERT_CMD_CHAIN + cmd_chain_payload->command_count = 2; + cmd_chain_payload->submit_index = 0; + cmd_chain_payload->error_index = 0; + cmd_chain_payload->data[0] = create_cmd_bo_0.handle; + cmd_chain_payload->data[1] = create_cmd_bo_1.handle; + + // Reading the user buffers + sync_bo(drv_fd, create_cmd_chain_bo.handle); + sync_bo(drv_fd, create_cmd_bo_0.handle); + sync_bo(drv_fd, create_cmd_bo_1.handle); + + // Perform a submit cmd + uint32_t bo_args[6] = {dpu_0_handle, dpu_1_handle, input_0_handle, + output_0_handle, input_1_handle, output_1_handle}; + amdxdna_drm_exec_cmd exec_cmd_0 = { + .ext = 0, + .ext_flags = 0, + .hwctx = create_hw_ctx.handle, + .type = AMDXDNA_CMD_SUBMIT_EXEC_BUF, + .cmd_handles = create_cmd_chain_bo.handle, + .args = (uint64_t)bo_args, + .cmd_count = 1, + .arg_count = sizeof(bo_args) / sizeof(uint32_t), + }; + ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_EXEC_CMD, &exec_cmd_0); + if (ret != 0) { + perror("Failed to submit work"); + return -1; + } + + ///////////////////////////////////////////////////////////////////////////////// + // Step 4: Wait for the output + // Use the wait IOCTL to wait for our submission to complete + amdxdna_drm_wait_cmd wait_cmd = { + .hwctx = create_hw_ctx.handle, + .timeout = 50, // 50ms timeout + .seq = exec_cmd_0.seq, + }; + + ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_WAIT_CMD, &wait_cmd); + if (ret != 0) { + perror("Failed to wait"); + return -1; + } + + ///////////////////////////////////////////////////////////////////////////////// + // Step 5: Verify output + + // Reading the user buffers + sync_bo(drv_fd, input_0_handle); + sync_bo(drv_fd, output_0_handle); + sync_bo(drv_fd, input_1_handle); + sync_bo(drv_fd, output_1_handle); + + int errors = 0; + printf("Checking run 0:\n"); + for (int i = 0; i < DATA_BUFFER_SIZE / sizeof(uint32_t); i++) { + uint32_t src = *((uint32_t *)input_0 + i); + uint32_t dst = *((uint32_t *)output_0 + i); + if (src + 1 != dst) { + 
printf("[ERROR] %d: %d + 1 != %d\n", i, src, dst); + errors++; + } + } + + printf("Checking run 1:\n"); + for (int i = 0; i < DATA_BUFFER_SIZE / sizeof(uint32_t); i++) { + uint32_t src = *((uint32_t *)input_1 + i); + uint32_t dst = *((uint32_t *)output_1 + i); + if (src + 1 != dst) { + printf("[ERROR] %d: %d + 1 != %d\n", i, src, dst); + errors++; + } + } + + if (!errors) { + printf("PASS!\n"); + } else { + printf("FAIL! %d/2048\n", errors); + } + + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return 0; +} \ No newline at end of file diff --git a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc new file mode 100644 index 000000000..fca55e4a4 --- /dev/null +++ b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc @@ -0,0 +1,309 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "hsa/hsa.h" +#include "hsa/hsa_ext_amd.h" + +namespace { + +hsa_status_t get_agent(hsa_agent_t agent, std::vector *agents, + hsa_device_type_t requested_dev_type) { + if (!agents || !(requested_dev_type == HSA_DEVICE_TYPE_AIE || + requested_dev_type == HSA_DEVICE_TYPE_GPU || + requested_dev_type == HSA_DEVICE_TYPE_CPU)) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + hsa_device_type_t device_type; + hsa_status_t ret = + hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type); + + if (ret != HSA_STATUS_SUCCESS) { + return ret; + } + + if (device_type == requested_dev_type) { + agents->push_back(agent); + } + + return ret; +} + +hsa_status_t get_aie_agents(hsa_agent_t agent, void *data) { + if (!data) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + auto *aie_agents = reinterpret_cast *>(data); + return get_agent(agent, aie_agents, HSA_DEVICE_TYPE_AIE); +} + 
+hsa_status_t get_coarse_global_mem_pool(hsa_amd_memory_pool_t pool, void *data, + bool kernarg) { + hsa_amd_segment_t segment_type; + auto ret = hsa_amd_memory_pool_get_info( + pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment_type); + if (ret != HSA_STATUS_SUCCESS) { + return ret; + } + + if (segment_type == HSA_AMD_SEGMENT_GLOBAL) { + hsa_amd_memory_pool_global_flag_t global_pool_flags; + ret = hsa_amd_memory_pool_get_info( + pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_pool_flags); + if (ret != HSA_STATUS_SUCCESS) { + return ret; + } + + if (kernarg) { + if ((global_pool_flags & + HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) && + (global_pool_flags & HSA_REGION_GLOBAL_FLAG_KERNARG)) { + *static_cast(data) = pool; + } + } else { + if ((global_pool_flags & + HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) && + !(global_pool_flags & HSA_REGION_GLOBAL_FLAG_KERNARG)) { + *static_cast(data) = pool; + } + } + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t get_coarse_global_dev_mem_pool(hsa_amd_memory_pool_t pool, + void *data) { + return get_coarse_global_mem_pool(pool, data, false); +} + +hsa_status_t get_coarse_global_kernarg_mem_pool(hsa_amd_memory_pool_t pool, + void *data) { + return get_coarse_global_mem_pool(pool, data, true); +} + +void load_pdi_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name, + void **buf) { + std::ifstream bin_file(file_name, + std::ios::binary | std::ios::ate | std::ios::in); + + assert(bin_file.fail() == false); + + auto size(bin_file.tellg()); + + bin_file.seekg(0, std::ios::beg); + auto r = hsa_amd_memory_pool_allocate(mem_pool, size, 0, buf); + assert(r == HSA_STATUS_SUCCESS); + bin_file.read(reinterpret_cast(*buf), size); +} + +void load_dpu_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name, + void **buf) { + std::ifstream bin_file(file_name, + std::ios::binary | std::ios::ate | std::ios::in); + + assert(bin_file.fail() == false); + + auto size(bin_file.tellg()); + bin_file.seekg(0, 
std::ios::beg); + std::vector pdi_vec; + std::string val; + + while (bin_file >> val) { + pdi_vec.push_back(std::stoul(val, nullptr, 16)); + } + auto r = hsa_amd_memory_pool_allocate(mem_pool, size, 0, buf); + assert(r == HSA_STATUS_SUCCESS); + std::memcpy(*buf, pdi_vec.data(), pdi_vec.size() * sizeof(uint32_t)); +} + +} // namespace + +int main(int argc, char **argv) { + std::filesystem::path sourcePath(argv[1]); + // List of AIE agents in the system. + std::vector aie_agents; + // For creating a queue on an AIE agent. + hsa_queue_t *aie_queue(nullptr); + // Memory pool for allocating device-mapped memory. Used for PDI/DPU + // instructions. + hsa_amd_memory_pool_t global_dev_mem_pool{0}; + // System memory pool. Used for allocating kernel argument data. + hsa_amd_memory_pool_t global_kernarg_mem_pool{0}; + const std::string dpu_inst_file_name(sourcePath / "add_one_insts.txt"); + const std::string pdi_file_name(sourcePath / "add_one.pdi"); + uint32_t *dpu_inst_buf(nullptr); + uint64_t *pdi_buf(nullptr); + + assert(aie_agents.empty()); + assert(global_dev_mem_pool.handle == 0); + assert(global_kernarg_mem_pool.handle == 0); + + // Initialize the runtime. + auto r = hsa_init(); + assert(r == HSA_STATUS_SUCCESS); + + assert(sizeof(hsa_kernel_dispatch_packet_s) == + sizeof(hsa_amd_aie_ert_packet_s)); + + // Test a launch of an AIE kernel using the HSA API. + // Find the AIE agents in the system. + r = hsa_iterate_agents(get_aie_agents, &aie_agents); + assert(r == HSA_STATUS_SUCCESS); + // assert(hsa_iterate_agents(get_cpu_agents, &aie_agents) == + // HSA_STATUS_SUCCESS); + assert(aie_agents.size() == 1); + + const auto &aie_agent = aie_agents.front(); + + // Create a queue on the first agent. + r = hsa_queue_create(aie_agent, 64, HSA_QUEUE_TYPE_SINGLE, nullptr, nullptr, + 0, 0, &aie_queue); + assert(r == HSA_STATUS_SUCCESS); + assert(aie_queue); + assert(aie_queue->base_address); + + // Find a pool for DEV BOs. 
This is a global system memory pool that is + // mapped to the device. Will be used for PDIs and DPU instructions. + r = hsa_amd_agent_iterate_memory_pools( + aie_agent, get_coarse_global_dev_mem_pool, &global_dev_mem_pool); + assert(r == HSA_STATUS_SUCCESS); + + // Find a pool that supports kernel args. This is just normal system memory. + // It will be used for commands and input data. + r = hsa_amd_agent_iterate_memory_pools( + aie_agent, get_coarse_global_kernarg_mem_pool, &global_kernarg_mem_pool); + assert(r == HSA_STATUS_SUCCESS); + assert(global_kernarg_mem_pool.handle); + + // Load the DPU and PDI files into a global pool that doesn't support kernel + // args (DEV BO). + load_dpu_file(global_dev_mem_pool, dpu_inst_file_name, + reinterpret_cast(&dpu_inst_buf)); + uint32_t dpu_handle = 0; + r = hsa_amd_get_handle_from_vaddr(dpu_inst_buf, &dpu_handle); + assert(r == HSA_STATUS_SUCCESS); + assert(dpu_handle != 0); + + load_pdi_file(global_dev_mem_pool, pdi_file_name, + reinterpret_cast(&pdi_buf)); + uint32_t pdi_handle = 0; + r = hsa_amd_get_handle_from_vaddr(pdi_buf, &pdi_handle); + assert(r == HSA_STATUS_SUCCESS); + assert(pdi_handle != 0); + + hsa_amd_aie_ert_hw_ctx_cu_config_t cu_config{.cu_config_bo = pdi_handle, + .cu_func = 0}; + + hsa_amd_aie_ert_hw_ctx_config_cu_param_t config_cu_args{ + .num_cus = 1, .cu_configs = &cu_config}; + + // Configure the queue's hardware context. 
+ r = hsa_amd_queue_hw_ctx_config( + aie_queue, HSA_AMD_QUEUE_AIE_ERT_HW_CXT_CONFIG_CU, &config_cu_args); + assert(r == HSA_STATUS_SUCCESS); + + // create inputs / outputs + constexpr std::size_t num_data_elements = 1024; + constexpr std::size_t data_buffer_size = + num_data_elements * sizeof(std::uint32_t); + + std::uint32_t *input = {}; + r = hsa_amd_memory_pool_allocate(global_dev_mem_pool, data_buffer_size, 0, + reinterpret_cast(&input)); + assert(r == HSA_STATUS_SUCCESS); + std::uint32_t input_handle = {}; + r = hsa_amd_get_handle_from_vaddr(input, &input_handle); + assert(r == HSA_STATUS_SUCCESS); + assert(input_handle != 0); + + std::uint32_t *output = {}; + r = hsa_amd_memory_pool_allocate(global_dev_mem_pool, data_buffer_size, 0, + reinterpret_cast(&output)); + assert(r == HSA_STATUS_SUCCESS); + std::uint32_t output_handle = {}; + r = hsa_amd_get_handle_from_vaddr(output, &output_handle); + assert(r == HSA_STATUS_SUCCESS); + assert(output_handle != 0); + + for (std::size_t i = 0; i < num_data_elements; i++) { + *(input + i) = i; + *(output + i) = 0xDEFACE; + } + + ///////////////////////////////////// Creating the cmd packet + // Creating a packet to store the command + hsa_amd_aie_ert_packet_t *cmd_pkt = NULL; + r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0, + reinterpret_cast(&cmd_pkt)); + assert(r == HSA_STATUS_SUCCESS); + cmd_pkt->state = HSA_AMD_AIE_ERT_STATE_NEW; + cmd_pkt->count = 0xA; // # of arguments to put in command + cmd_pkt->opcode = HSA_AMD_AIE_ERT_START_CU; + cmd_pkt->header.AmdFormat = HSA_AMD_PACKET_TYPE_AIE_ERT; + cmd_pkt->header.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC + << HSA_PACKET_HEADER_TYPE; + + // Creating the payload for the packet + hsa_amd_aie_ert_start_kernel_data_t *cmd_payload = NULL; + uint32_t cmd_handle; + r = hsa_amd_get_handle_from_vaddr(reinterpret_cast(cmd_pkt), + &cmd_handle); + assert(r == HSA_STATUS_SUCCESS); + r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0, + 
reinterpret_cast(&cmd_payload)); + assert(r == HSA_STATUS_SUCCESS); + cmd_payload->cu_mask = 0x1; // Selecting the PDI to use with this command + cmd_payload->data[0] = 0x3; // Transaction opcode + cmd_payload->data[1] = 0x0; + cmd_payload->data[2] = dpu_handle; + cmd_payload->data[3] = 0x0; + cmd_payload->data[4] = 0x44; // Size of DPU instruction + cmd_payload->data[5] = input_handle; + cmd_payload->data[6] = 0; + cmd_payload->data[7] = output_handle; + cmd_payload->data[8] = 0; + cmd_pkt->payload_data = reinterpret_cast(cmd_payload); + + uint64_t wr_idx = hsa_queue_add_write_index_relaxed(aie_queue, 1); + uint64_t packet_id = wr_idx % aie_queue->size; + reinterpret_cast( + aie_queue->base_address)[packet_id] = *cmd_pkt; + hsa_signal_store_screlease(aie_queue->doorbell_signal, wr_idx); + + for (std::size_t i = 0; i < num_data_elements; i++) { + const auto expected = *(input + i) + 1; + const auto result = *(output + i); + assert(result == expected); + } + + r = hsa_queue_destroy(aie_queue); + assert(r == HSA_STATUS_SUCCESS); + + r = hsa_amd_memory_pool_free(output); + assert(r == HSA_STATUS_SUCCESS); + r = hsa_amd_memory_pool_free(input); + assert(r == HSA_STATUS_SUCCESS); + r = hsa_amd_memory_pool_free(pdi_buf); + assert(r == HSA_STATUS_SUCCESS); + r = hsa_amd_memory_pool_free(dpu_inst_buf); + assert(r == HSA_STATUS_SUCCESS); + + r = hsa_shut_down(); + assert(r == HSA_STATUS_SUCCESS); + std::cout << "PASS\n"; +} diff --git a/rocrtst/suites/aie/amdxdna_accel.h b/rocrtst/suites/aie/amdxdna_accel.h new file mode 100644 index 000000000..048fd3fe6 --- /dev/null +++ b/rocrtst/suites/aie/amdxdna_accel.h @@ -0,0 +1,569 @@ +/* SPDX-License-Identifier: NCSA OR GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (C) 2022-2024, Advanced Micro Devices, Inc. 
+ */
+
+#ifndef AMDXDNA_ACCEL_H_
+#define AMDXDNA_ACCEL_H_
+
+#include /* NOTE(review): the three include targets are missing in this copy. */
+#include
+#include
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#ifndef __counted_by
+#define __counted_by(cnt)
+#endif
+
+#define AMDXDNA_DRIVER_MAJOR 1
+#define AMDXDNA_DRIVER_MINOR 0
+
+#define AMDXDNA_INVALID_CMD_HANDLE (~0UL)
+#define AMDXDNA_INVALID_ADDR (~0UL)
+#define AMDXDNA_INVALID_CTX_HANDLE 0
+#define AMDXDNA_INVALID_BO_HANDLE 0
+
+/*
+ * The interface can grow/extend over time.
+ * On each struct amdxdna_drm_*, to support potential extension, we defined it
+ * like this.
+ *
+ * Example code:
+ *
+ * struct amdxdna_drm_example_data {
+ *   .ext = (uintptr_t)&example_data_ext;
+ *   ...
+ * };
+ *
+ * There are no extensions yet. Extension structs will be defined in the future.
+ */
+
+enum amdxdna_drm_ioctl_id {
+  DRM_AMDXDNA_CREATE_HWCTX,
+  DRM_AMDXDNA_DESTROY_HWCTX,
+  DRM_AMDXDNA_CONFIG_HWCTX,
+  DRM_AMDXDNA_CREATE_BO,
+  DRM_AMDXDNA_GET_BO_INFO,
+  DRM_AMDXDNA_SYNC_BO,
+  DRM_AMDXDNA_EXEC_CMD,
+  DRM_AMDXDNA_WAIT_CMD,
+  DRM_AMDXDNA_GET_INFO,
+  DRM_AMDXDNA_SET_STATE,
+  DRM_AMDXDNA_NUM_IOCTLS
+};
+
+enum amdxdna_device_type {
+  AMDXDNA_DEV_TYPE_UNKNOWN = -1,
+  AMDXDNA_DEV_TYPE_KMQ,
+  AMDXDNA_DEV_TYPE_UMQ,
+};
+
+/**
+ * struct amdxdna_qos_info - QoS information for driver.
+ * @gops: Giga operations per second.
+ * @fps: Frames per second.
+ * @dma_bandwidth: DMA bandwidth.
+ * @latency: Frame response latency.
+ * @frame_exec_time: Frame execution time.
+ * @priority: Request priority.
+ *
+ * User program can provide QoS hints to driver.
+ */
+struct amdxdna_qos_info {
+  __u32 gops;
+  __u32 fps;
+  __u32 dma_bandwidth;
+  __u32 latency;
+  __u32 frame_exec_time;
+  __u32 priority;
+};
+
+/**
+ * struct amdxdna_drm_create_hwctx - Create hardware context.
+ * @ext: MBZ.
+ * @ext_flags: MBZ.
+ * @qos_p: Address of QoS info.
+ * @umq_bo: BO handle for user mode queue(UMQ).
+ * @log_buf_bo: BO handle for log buffer.
+ * @max_opc: Maximum operations per cycle.
+ * @num_tiles: Number of AIE tiles.
+ * @mem_size: Size of AIE tile memory.
+ * @umq_doorbell: Returned offset of doorbell associated with UMQ.
+ * @handle: Returned hardware context handle.
+ */
+struct amdxdna_drm_create_hwctx {
+  __u64 ext;
+  __u64 ext_flags;
+  __u64 qos_p;
+  __u32 umq_bo;
+  __u32 log_buf_bo;
+  __u32 max_opc;
+  __u32 num_tiles;
+  __u32 mem_size;
+  __u32 umq_doorbell;
+  __u32 handle;
+};
+
+/**
+ * struct amdxdna_drm_destroy_hwctx - Destroy hardware context.
+ * @handle: Hardware context handle.
+ * @pad: MBZ.
+ */
+struct amdxdna_drm_destroy_hwctx {
+  __u32 handle;
+  __u32 pad;
+};
+
+/**
+ * struct amdxdna_cu_config - configuration for one CU
+ * @cu_bo: CU configuration buffer bo handle
+ * @cu_func: Function of a CU
+ * @pad: MBZ
+ */
+struct amdxdna_cu_config {
+  __u32 cu_bo;
+  __u8 cu_func;
+  __u8 pad[3];
+};
+
+/**
+ * struct amdxdna_hwctx_param_config_cu - configuration for CUs in hardware
+ * context
+ * @num_cus: Number of CUs to configure
+ * @pad: MBZ
+ * @cu_configs: Array of CU configurations of struct amdxdna_cu_config
+ */
+struct amdxdna_hwctx_param_config_cu {
+  __u16 num_cus;
+  __u16 pad[3];
+  struct amdxdna_cu_config cu_configs[] __counted_by(num_cus);
+};
+
+enum amdxdna_drm_config_hwctx_param {
+  DRM_AMDXDNA_HWCTX_CONFIG_CU,
+  DRM_AMDXDNA_HWCTX_ASSIGN_DBG_BUF,
+  DRM_AMDXDNA_HWCTX_REMOVE_DBG_BUF,
+  DRM_AMDXDNA_HWCTX_CONFIG_NUM
+};
+
+/**
+ * struct amdxdna_drm_config_hwctx - Configure hardware context.
+ * @handle: hardware context handle.
+ * @param_type: Value in enum amdxdna_drm_config_hwctx_param. Specifies the
+ *              structure passed in via param_val.
+ * @param_val: A structure specified by the param_type struct member.
+ * @param_val_size: Size of the parameter buffer pointed to by the param_val.
+ *                  If param_val is not a pointer, driver can ignore this.
+ *
+ * Note: if the param_val is a pointer pointing to a buffer, the maximum size
+ * of the buffer is 4KiB(PAGE_SIZE).
+ */
+struct amdxdna_drm_config_hwctx {
+  __u32 handle;
+  __u32 param_type;
+  __u64 param_val;
+  __u32 param_val_size;
+  __u32 pad;
+};
+
+/*
+ * AMDXDNA_BO_SHMEM: DRM GEM SHMEM bo
+ * AMDXDNA_BO_DEV_HEAP: Shared host memory to device as heap memory
+ * AMDXDNA_BO_DEV: Allocated from AMDXDNA_BO_DEV_HEAP
+ * AMDXDNA_BO_CMD: User and driver accessible bo
+ * AMDXDNA_BO_DMA: DRM GEM DMA bo
+ */
+enum amdxdna_bo_type {
+  AMDXDNA_BO_INVALID = 0,
+  AMDXDNA_BO_SHMEM,
+  AMDXDNA_BO_DEV_HEAP,
+  AMDXDNA_BO_DEV,
+  AMDXDNA_BO_CMD,
+  AMDXDNA_BO_DMA,
+};
+
+/**
+ * struct amdxdna_drm_create_bo - Create a buffer object.
+ * @flags: Buffer flags. MBZ.
+ * @type: Buffer type.
+ * @vaddr: User VA of buffer if applied, otherwise MBZ.
+ * @size: Size in bytes.
+ * @handle: Returned DRM buffer object handle.
+ */
+struct amdxdna_drm_create_bo {
+  __u64 flags;
+  __u32 type;
+  __u32 _pad;
+  __u64 vaddr;
+  __u64 size;
+  __u32 handle;
+};
+
+/**
+ * struct amdxdna_drm_get_bo_info - Get buffer object information.
+ * @ext: MBZ.
+ * @ext_flags: MBZ.
+ * @handle: DRM buffer object handle.
+ * @map_offset: Returned DRM fake offset for mmap().
+ * @vaddr: Returned user VA of buffer. 0 in case user needs mmap().
+ * @xdna_addr: Returned XDNA device virtual address.
+ */
+struct amdxdna_drm_get_bo_info {
+  __u64 ext;
+  __u64 ext_flags;
+  __u32 handle;
+  __u32 _pad;
+  __u64 map_offset;
+  __u64 vaddr;
+  __u64 xdna_addr;
+};
+
+/**
+ * struct amdxdna_drm_sync_bo - Sync buffer object.
+ * @handle: Buffer object handle.
+ * @direction: Direction of sync, can be from device or to device.
+ * @offset: Offset in the buffer to sync.
+ * @size: Size in bytes.
+ */
+struct amdxdna_drm_sync_bo {
+  __u32 handle;
+#define SYNC_DIRECT_TO_DEVICE 0U
+#define SYNC_DIRECT_FROM_DEVICE 1U
+  __u32 direction;
+  __u64 offset;
+  __u64 size;
+};
+
+enum amdxdna_cmd_type {
+  AMDXDNA_CMD_SUBMIT_EXEC_BUF = 0,
+  AMDXDNA_CMD_SUBMIT_DEPENDENCY,
+  AMDXDNA_CMD_SUBMIT_SIGNAL,
+};
+
+/**
+ * struct amdxdna_drm_exec_cmd - Execute command.
+ * @ext: MBZ.
+ * @ext_flags: MBZ.
+ * @hwctx: Hardware context handle.
+ * @type: One of command type in enum amdxdna_cmd_type.
+ * @cmd_handles: Array of command handles or the command handle itself in case
+ *               of just one.
+ * @args: Array of arguments for all command handles.
+ * @cmd_count: Number of command handles in the cmd_handles array.
+ * @arg_count: Number of arguments in the args array.
+ * @seq: Returned sequence number for this command.
+ */
+struct amdxdna_drm_exec_cmd {
+  __u64 ext;
+  __u64 ext_flags;
+  __u32 hwctx;
+  __u32 type;
+  __u64 cmd_handles;
+  __u64 args;
+  __u32 cmd_count;
+  __u32 arg_count;
+  __u64 seq;
+};
+
+/**
+ * struct amdxdna_drm_wait_cmd - Wait execution command.
+ *
+ * @hwctx: hardware context handle.
+ * @timeout: timeout in ms, 0 implies infinite wait.
+ * @seq: sequence number of the command returned by execute command.
+ *
+ * Wait for the command specified by seq to be completed.
+ * Using AMDXDNA_INVALID_CMD_HANDLE as seq means wait till there is a free slot
+ * to submit a new command.
+ */
+struct amdxdna_drm_wait_cmd {
+  __u32 hwctx;
+  __u32 timeout;
+  __u64 seq;
+};
+
+/**
+ * struct amdxdna_drm_query_aie_status - Query the status of the AIE hardware
+ * @buffer: The user space buffer that will return the AIE status
+ * @buffer_size: The size of the user space buffer
+ * @cols_filled: A bitmap of AIE columns whose data has been returned in the
+ *               buffer.
+ */
+struct amdxdna_drm_query_aie_status {
+  __u64 buffer; /* out */
+  __u32 buffer_size; /* in */
+  __u32 cols_filled; /* out */
+};
+
+/**
+ * struct amdxdna_drm_query_aie_version - Query the version of the AIE hardware
+ * @major: The major version number
+ * @minor: The minor version number
+ */
+struct amdxdna_drm_query_aie_version {
+  __u32 major; /* out */
+  __u32 minor; /* out */
+};
+
+/**
+ * struct amdxdna_drm_query_aie_tile_metadata - Query the metadata of AIE tile
+ * (core, mem, shim)
+ * @row_count: The number of rows.
+ * @row_start: The starting row number.
+ * @dma_channel_count: The number of dma channels.
+ * @lock_count: The number of locks.
+ * @event_reg_count: The number of events.
+ * @pad: MBZ.
+ */
+struct amdxdna_drm_query_aie_tile_metadata {
+  __u16 row_count;
+  __u16 row_start;
+  __u16 dma_channel_count;
+  __u16 lock_count;
+  __u16 event_reg_count;
+  __u16 pad[3];
+};
+
+/**
+ * struct amdxdna_drm_query_aie_metadata - Query the metadata of the AIE
+ * hardware
+ * @col_size: The size of a column in bytes.
+ * @cols: The total number of columns.
+ * @rows: The total number of rows.
+ * @version: The version of the AIE hardware.
+ * @core: The metadata for all core tiles.
+ * @mem: The metadata for all mem tiles.
+ * @shim: The metadata for all shim tiles.
+ */
+struct amdxdna_drm_query_aie_metadata {
+  __u32 col_size;
+  __u16 cols;
+  __u16 rows;
+  struct amdxdna_drm_query_aie_version version;
+  struct amdxdna_drm_query_aie_tile_metadata core;
+  struct amdxdna_drm_query_aie_tile_metadata mem;
+  struct amdxdna_drm_query_aie_tile_metadata shim;
+};
+
+/**
+ * struct amdxdna_drm_query_clock - Metadata for a clock
+ * @name: The clock name.
+ * @freq_mhz: The clock frequency in MHz.
+ * @pad: MBZ.
+ */
+struct amdxdna_drm_query_clock {
+  __u8 name[16];
+  __u32 freq_mhz;
+  __u32 pad;
+};
+
+/**
+ * struct amdxdna_drm_query_clock_metadata - Query metadata for clocks
+ * @mp_npu_clock: The metadata for MP-NPU clock.
+ * @h_clock: The metadata for H clock.
+ */
+struct amdxdna_drm_query_clock_metadata {
+  struct amdxdna_drm_query_clock mp_npu_clock;
+  struct amdxdna_drm_query_clock h_clock;
+};
+
+enum amdxdna_sensor_type { AMDXDNA_SENSOR_TYPE_POWER };
+
+/**
+ * struct amdxdna_drm_query_sensor - The data for single sensor.
+ * @label: The name for a sensor.
+ * @input: The current value of the sensor.
+ * @max: The maximum value possible for the sensor.
+ * @average: The average value of the sensor.
+ * @highest: The highest recorded sensor value for this driver load for the
+ *           sensor.
+ * @status: The sensor status.
+ * @units: The sensor units.
+ * @unitm: Translates value member variables into the correct unit via (pow(10,
+ *         unitm) * value)
+ * @type: The sensor type from enum amdxdna_sensor_type
+ * @pad: MBZ.
+ */
+struct amdxdna_drm_query_sensor {
+  __u8 label[64];
+  __u32 input;
+  __u32 max;
+  __u32 average;
+  __u32 highest;
+  __u8 status[64];
+  __u8 units[16];
+  __s8 unitm;
+  __u8 type;
+  __u8 pad[6];
+};
+
+/**
+ * struct amdxdna_drm_query_hwctx - The data for single context.
+ * @context_id: The ID for this context.
+ * @start_col: The starting column for the partition assigned to this context.
+ * @num_col: The number of columns in the partition assigned to this context.
+ * @pid: The Process ID of the process that created this context.
+ * @command_submissions: The number of commands submitted to this context.
+ * @command_completions: The number of commands completed by this context.
+ * @migrations: The number of times this context has been moved to a different
+ *              partition.
+ * @preemptions: The number of times this context has been preempted by another
+ *               context in the same partition.
+ * @pad: MBZ.
+ */
+struct amdxdna_drm_query_hwctx {
+  __u32 context_id;
+  __u32 start_col;
+  __u32 num_col;
+  __u32 pad;
+  __s64 pid;
+  __u64 command_submissions;
+  __u64 command_completions;
+  __u64 migrations;
+  __u64 preemptions;
+  __u64 errors; /* NOTE(review): not described in the kernel-doc above. */
+};
+
+/**
+ * struct amdxdna_drm_aie_mem - The data for AIE memory read/write
+ * @col: The AIE column index
+ * @row: The AIE row index
+ * @addr: The AIE memory address to read/write
+ * @size: The number of bytes to read/write
+ * @buf_p: The buffer to store read/write data
+ *
+ * This is used for DRM_AMDXDNA_READ_AIE_MEM and DRM_AMDXDNA_WRITE_AIE_MEM
+ * parameters.
+ */
+struct amdxdna_drm_aie_mem {
+  __u32 col;
+  __u32 row;
+  __u32 addr;
+  __u32 size;
+  __u64 buf_p;
+};
+
+/**
+ * struct amdxdna_drm_aie_reg - The data for AIE register read/write
+ * @col: The AIE column index
+ * @row: The AIE row index
+ * @addr: The AIE register address to read/write
+ * @val: The value to write or returned value from AIE
+ *
+ * This is used for DRM_AMDXDNA_READ_AIE_REG and DRM_AMDXDNA_WRITE_AIE_REG
+ * parameters.
+ */
+struct amdxdna_drm_aie_reg {
+  __u32 col;
+  __u32 row;
+  __u32 addr;
+  __u32 val;
+};
+
+enum amdxdna_drm_get_param {
+  DRM_AMDXDNA_QUERY_AIE_STATUS,
+  DRM_AMDXDNA_QUERY_AIE_METADATA,
+  DRM_AMDXDNA_QUERY_AIE_VERSION,
+  DRM_AMDXDNA_QUERY_CLOCK_METADATA,
+  DRM_AMDXDNA_QUERY_SENSORS,
+  DRM_AMDXDNA_QUERY_HW_CONTEXTS,
+  DRM_AMDXDNA_READ_AIE_MEM,
+  DRM_AMDXDNA_READ_AIE_REG,
+  DRM_AMDXDNA_NUM_GET_PARAM,
+};
+
+/**
+ * struct amdxdna_drm_get_info - Get some information from the AIE hardware.
+ * @param: Value in enum amdxdna_drm_get_param. Specifies the structure passed
+ *         in the buffer.
+ * @buffer_size: Size of the input buffer. Size needed/written by the kernel.
+ * @buffer: A structure specified by the param struct member.
+ */
+struct amdxdna_drm_get_info {
+  __u32 param; /* in */
+  __u32 buffer_size; /* in/out */
+  __u64 buffer; /* in/out */
+};
+
+enum amdxdna_power_mode_type {
+  XRT_POWER_MODE_DEFAULT, /**< Fallback to calculated DPM */
+  XRT_POWER_MODE_LOW, /**< Set frequency to lowest DPM */
+  XRT_POWER_MODE_MEDIUM, /**< Set frequency to medium DPM */
+  XRT_POWER_MODE_HIGH, /**< Set frequency to highest DPM */
+};
+
+/**
+ * struct amdxdna_drm_set_power_mode - Set the power mode of the AIE hardware
+ * @power_mode: The power mode from enum amdxdna_power_mode_type
+ * @pad: MBZ.
+ */
+struct amdxdna_drm_set_power_mode {
+  __u8 power_mode;
+  __u8 pad[7];
+};
+
+enum amdxdna_drm_set_param {
+  DRM_AMDXDNA_SET_POWER_MODE,
+  DRM_AMDXDNA_WRITE_AIE_MEM,
+  DRM_AMDXDNA_WRITE_AIE_REG,
+  DRM_AMDXDNA_NUM_SET_PARAM,
+};
+
+/**
+ * struct amdxdna_drm_set_state - Set the state of some component within the AIE
+ * hardware.
+ * @param: Value in enum amdxdna_drm_set_param. Specifies the structure passed
+ *         in the buffer.
+ * @buffer_size: Size of the input buffer.
+ * @buffer: A structure specified by the param struct member.
+ */
+struct amdxdna_drm_set_state {
+  __u32 param; /* in */
+  __u32 buffer_size; /* in */
+  __u64 buffer; /* in */
+};
+
+#define DRM_IOCTL_AMDXDNA_CREATE_HWCTX \
+  DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX, \
+           struct amdxdna_drm_create_hwctx)
+
+#define DRM_IOCTL_AMDXDNA_DESTROY_HWCTX \
+  DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_DESTROY_HWCTX, \
+           struct amdxdna_drm_destroy_hwctx)
+
+#define DRM_IOCTL_AMDXDNA_CONFIG_HWCTX \
+  DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CONFIG_HWCTX, \
+           struct amdxdna_drm_config_hwctx)
+
+#define DRM_IOCTL_AMDXDNA_CREATE_BO \
+  DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_BO, \
+           struct amdxdna_drm_create_bo)
+
+#define DRM_IOCTL_AMDXDNA_GET_BO_INFO \
+  DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_BO_INFO, \
+           struct amdxdna_drm_get_bo_info)
+
+#define DRM_IOCTL_AMDXDNA_SYNC_BO \
+  DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SYNC_BO, struct amdxdna_drm_sync_bo)
+
+#define DRM_IOCTL_AMDXDNA_EXEC_CMD \
+  DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_EXEC_CMD, struct amdxdna_drm_exec_cmd)
+
+#define DRM_IOCTL_AMDXDNA_WAIT_CMD \
+  DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_WAIT_CMD, struct amdxdna_drm_wait_cmd)
+
+#define DRM_IOCTL_AMDXDNA_GET_INFO \
+  DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_INFO, struct amdxdna_drm_get_info)
+
+#define DRM_IOCTL_AMDXDNA_SET_STATE \
+  DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SET_STATE, \
+           struct amdxdna_drm_set_state)
+
+#if defined(__cplusplus)
+} /* extern c end */
+#endif + +#endif /* AMDXDNA_ACCEL_H_ */ diff --git a/rocrtst/suites/aie/hsa_ipu.h b/rocrtst/suites/aie/hsa_ipu.h new file mode 100644 index 000000000..4ba2505e6 --- /dev/null +++ b/rocrtst/suites/aie/hsa_ipu.h @@ -0,0 +1,271 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "amdxdna_accel.h" + +// want to mmap the file + +#define MAX_NUM_INSTRUCTIONS 1024 // Maximum number of dpu or pdi instructions. + +// Dummy packet defines + +int map_doorbell(int fd, uint64_t *doorbell) { + // Mmap the mailbox. + int32_t page_size = 4096; + *doorbell = (uint64_t)mmap(NULL, page_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (doorbell != MAP_FAILED) { + printf("Doorbell mapped\n"); + return 0; + } + + printf("[ERROR] doorbell mmap failed: %s\n", strerror(errno)); + return errno; +} + +void ring_doorbell(uint64_t doorbell) { + int32_t curr_tail = *((int32_t *)doorbell); + *((uint32_t *)doorbell) = curr_tail + 0x94; +} + +int get_driver_version(int fd, __u32 *major, __u32 *minor) { + int ret; + amdxdna_drm_query_aie_version version; + + amdxdna_drm_get_info info_params = { + .param = DRM_AMDXDNA_QUERY_AIE_VERSION, + .buffer_size = sizeof(version), + .buffer = (__u64)&version, + }; + + ret = ioctl(fd, DRM_IOCTL_AMDXDNA_GET_INFO, &info_params); + if (ret == 0) { + *major = version.major; + *minor = version.minor; + } + + return ret; +} + +/* + Allocates a heap on the device by creating a BO of type dev heap +*/ +int alloc_heap(int fd, __u32 size, __u32 *handle) { + int ret; + void *heap_buf = NULL; + const size_t alignment = 64 * 1024 * 1024; + ret = posix_memalign(&heap_buf, alignment, size); + if (ret != 0 || heap_buf == NULL) { + printf("[ERROR] Failed to 
allocate heap buffer of size %d\n", size); + } + + void *dev_heap_parent = mmap(0, alignment * 2 - 1, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (dev_heap_parent == MAP_FAILED) { + dev_heap_parent = nullptr; + return -1; + } + + amdxdna_drm_create_bo create_bo_params = { + .type = AMDXDNA_BO_DEV_HEAP, + .size = size, + }; + + ret = ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_bo_params); + if (ret == 0 && handle) { + *handle = create_bo_params.handle; + } + + amdxdna_drm_get_bo_info get_bo_info = {.handle = create_bo_params.handle}; + ret = ioctl(fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_bo_info); + if (ret != 0) { + perror("Failed to get BO info"); + return -2; + } + + // Need to free the heap buf but still use the address so we can + // ensure alignment + free(heap_buf); + heap_buf = (void *)mmap(heap_buf, size, PROT_READ | PROT_WRITE, MAP_SHARED, + fd, get_bo_info.map_offset); + printf("Heap buffer @: %p\n", heap_buf); + + return ret; +} + +/* + Creates a dev bo which is carved out of the heap bo. 
+*/ +int create_dev_bo(int fd, uint64_t *vaddr, uint64_t *sram_vaddr, __u32 *handle, + __u64 size_in_bytes) { + amdxdna_drm_create_bo create_bo = { + .type = AMDXDNA_BO_DEV, + .size = size_in_bytes, + }; + int ret = ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_bo); + if (ret != 0) { + perror("Failed to create BO"); + return -1; + } + + amdxdna_drm_get_bo_info get_bo_info = {.handle = create_bo.handle}; + ret = ioctl(fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_bo_info); + if (ret != 0) { + perror("Failed to get BO info"); + return -2; + } + + *vaddr = get_bo_info.vaddr; + *sram_vaddr = get_bo_info.xdna_addr; + *handle = create_bo.handle; + return 0; +} + +/* + Creates a shmem bo +*/ +int create_shmem_bo(int fd, uint64_t *vaddr, uint64_t *sram_vaddr, + __u32 *handle, __u64 size_in_bytes) { + const size_t alignment = 64 * 1024 * 1024; + void *shmem_create = NULL; + int ret = posix_memalign(&shmem_create, alignment, size_in_bytes); + if (ret != 0) { + printf("[ERROR] Failed to allocate shmem bo of size %lld\n", size_in_bytes); + } + + // Touching buffer to map page + *(uint32_t *)shmem_create = 0xDEADBEEF; + + printf("Shmem BO @: %p\n", shmem_create); + + amdxdna_drm_create_bo create_bo = {.type = AMDXDNA_BO_SHMEM, + .vaddr = (__u64)shmem_create, + .size = size_in_bytes}; + ret = ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_bo); + if (ret != 0) { + perror("Failed to create BO"); + return -1; + } + + amdxdna_drm_get_bo_info get_bo_info = {.handle = create_bo.handle}; + ret = ioctl(fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_bo_info); + if (ret != 0) { + perror("Failed to get BO info"); + return -2; + } + + *vaddr = (__u64)shmem_create; + *sram_vaddr = get_bo_info.xdna_addr; + *handle = create_bo.handle; + return 0; +} + +/* + Wrapper around synch bo ioctl. 
+*/ +int sync_bo(int fd, __u32 handle) { + amdxdna_drm_sync_bo sync_params = {.handle = handle}; + int ret = ioctl(fd, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_params); + if (ret != 0) { + printf("Synch bo ioctl failed for handle %d\n", handle); + } + return ret; +} + +/* + Create a BO_DEV and populate it with a PDI +*/ + +int load_pdi(int fd, uint64_t *vaddr, uint64_t *sram_addr, __u32 *handle, + const char *path) { + FILE *file = fopen(path, "r"); + if (file == NULL) { + perror("Failed to open instructions file."); + return -1; + } + + fseek(file, 0L, SEEK_END); + ssize_t file_size = ftell(file); + fseek(file, 0L, SEEK_SET); + + printf("Pdi file size: %ld\n", file_size); + + fclose(file); + + // Mmaping the file + int pdi_fd = open(path, O_RDONLY); + uint64_t *file_data = + (uint64_t *)mmap(0, file_size, PROT_READ, MAP_PRIVATE, pdi_fd, 0); + + // Creating a BO_DEV bo to store the pdi file. + int ret = create_dev_bo(fd, vaddr, sram_addr, handle, file_size); + if (ret != 0) { + perror("Failed to create pdi BO"); + return -1; + } + + // copy the file into Bo dev + uint64_t *bo = (uint64_t *)*vaddr; + memcpy(bo, file_data, file_size); + + close(pdi_fd); + return 0; +} + +/* + Create a BO DEV and populate it with instructions whose virtual address is + passed to the driver via an HSA packet. +*/ +int load_instructions(int fd, uint64_t *vaddr, uint64_t *sram_addr, + __u32 *handle, const char *path, __u32 *num_inst) { + // read dpu instructions into an array + FILE *file = fopen(path, "r"); + if (file == NULL) { + perror("Failed to open instructions file."); + return -1; + } + + char *line = NULL; + size_t len = 0; + __u32 inst_array[MAX_NUM_INSTRUCTIONS]; + __u32 inst_counter = 0; + while (getline(&line, &len, file) != -1) { + inst_array[inst_counter++] = strtoul(line, NULL, 16); + if (inst_counter >= MAX_NUM_INSTRUCTIONS) { + perror("Instruction array overflowed."); + return -2; + } + } + fclose(file); + + // Creating a BO_DEV bo to store the instruction. 
+ int ret = + create_dev_bo(fd, vaddr, sram_addr, handle, inst_counter * sizeof(__u32)); + if (ret != 0) { + perror("Failed to create dpu BO"); + return -3; + } + + *num_inst = inst_counter; + + memcpy((__u32 *)*vaddr, inst_array, inst_counter * sizeof(__u32)); + return ret; +}