From d5fa07a168a6631cc62688a21c2e09c7e7edd52c Mon Sep 17 00:00:00 2001
From: anandaravuri <ananda.ravuri@intel.com>
Date: Fri, 15 Sep 2023 11:36:40 -0700
Subject: [PATCH 01/11] Feature: Add CXL host exerciser cache application

Description:
Host Exerciser (HE) is responsible for generating traffic to create scenarios like Cache Hit/Miss in Device or Host Caches with the intention of exercising the path from AFU to the Host via CXL IP at full bandwidth.

Command line options:

Options:
  -h,--help                   Print this help message and exit
  -p,--pci-address TEXT       [<domain>:]<bus>:<device>.<function>
  -l,--log-level TEXT:{trace,debug,info,warning,error,critical,off} [warning]
                              stdout logging level
  -t,--timeout UINT [60000]   test timeout (msec)
  --test UINT:value in {fpgardcachehit->0,fpgardcachemiss->2,fpgawrcachehit->1,fpgawrcachemiss->3,hostrdcachehit->4,hostrdcachemiss->6,hostwrcachehit->5,hostwrcachemiss->7} OR {0,2,1,3,4,6,5,7} [fpgardcachehit]
                              host exerciser cache test {fpgardcachehit, fpgawrcachehit, all}
  --continuousmode BOOLEAN [false]
                              test rollover or test termination
  --contmodetime UINT [1]     Continuous mode time in seconds
  --target UINT:value in {fpga->1,host->0} OR {1,0} [host]
                              host exerciser run on host or fpga
  --stride UINT [0]           Enable stride mode
  --linerepcount UINT:INT in [1 - 256] [10]
                              Line repeat count
  --testall BOOLEAN [false]   Run all tests

Subcommands:
  cache                       run simple cxl he cache test

Signed-off-by: anandaravuri <ananda.ravuri@intel.com>
---
 opae.spec.fedora                              |    1 +
 packaging/opae/deb/opae-extra-tools.install   |    1 +
 samples/CMakeLists.txt                        |    1 +
 samples/cxl_host_exerciser/CMakeLists.txt     |   47 +
 .../cxl_host_exerciser/cxl_host_exerciser.cpp |   48 +
 .../cxl_host_exerciser/cxl_host_exerciser.h   |  504 ++++++++
 .../cxl_host_exerciser_cache.h                |   62 +
 .../cxl_host_exerciser_cmd.h                  | 1013 +++++++++++++++++
 samples/cxl_host_exerciser/dfl-he-cache.h     |  133 +++
 samples/cxl_host_exerciser/he_cache_test.h    |  829 ++++++++++++++
 10 files changed, 2639 insertions(+)
 create mode 100644 samples/cxl_host_exerciser/CMakeLists.txt
 create mode 100644 samples/cxl_host_exerciser/cxl_host_exerciser.cpp
 create mode 100644 samples/cxl_host_exerciser/cxl_host_exerciser.h
 create mode 100644 samples/cxl_host_exerciser/cxl_host_exerciser_cache.h
 create mode 100644 samples/cxl_host_exerciser/cxl_host_exerciser_cmd.h
 create mode 100644 samples/cxl_host_exerciser/dfl-he-cache.h
 create mode 100644 samples/cxl_host_exerciser/he_cache_test.h
diff --git a/opae.spec.fedora b/opae.spec.fedora
index 893bcec6416a..dcdbe10e7fa2 100644
--- a/opae.spec.fedora
+++ b/opae.spec.fedora
@@ -355,6 +355,7 @@ done
 %{_bindir}/mem_tg
 %{_bindir}/ofs.uio
 %{_bindir}/cxl_mem_tg
+%{_bindir}/cxl_host_exerciser
 
 %{python3_sitearch}/opae.diag*
 %{python3_sitearch}/opae/diag*
diff --git a/packaging/opae/deb/opae-extra-tools.install b/packaging/opae/deb/opae-extra-tools.install
index a85827b3f0de..a363035c3704 100644
--- a/packaging/opae/deb/opae-extra-tools.install
+++ b/packaging/opae/deb/opae-extra-tools.install
@@ -18,6 +18,7 @@ usr/bin/fpga_dma_N3000_test
 usr/bin/fpga_dma_test
 usr/bin/host_exerciser
 usr/bin/cxl_mem_tg
+usr/bin/cxl_host_exerciser
 usr/bin/bist
 usr/bin/hps
 usr/bin/hssi
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index d9bce3ec663b..e03073ad863a 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -70,3 +70,4 @@ opae_add_subdirectory(host_exerciser)
 opae_add_subdirectory(n5010-test)
 opae_add_subdirectory(n5010-ctl)
 opae_add_subdirectory(clx_mem_tg)
+opae_add_subdirectory(cxl_host_exerciser)
diff --git a/samples/cxl_host_exerciser/CMakeLists.txt b/samples/cxl_host_exerciser/CMakeLists.txt
new file mode 100644
index 000000000000..adcdf4580a62
--- /dev/null
+++ b/samples/cxl_host_exerciser/CMakeLists.txt
@@ -0,0 +1,47 @@
+## Copyright(c) 2023, Intel Corporation
+##
+## Redistribution  and  use  in source  and  binary  forms,  with  or  without
+## modification, are permitted provided that the following conditions are met:
+##
+## * Redistributions of  source code  must retain the  above copyright notice,
+##   this list of conditions and the following disclaimer.
+## * Redistributions in binary form must reproduce the above copyright notice,
+##   this list of conditions and the following disclaimer in the documentation
+##   and/or other materials provided with the distribution.
+## * Neither the name  of Intel Corporation  nor the names of its contributors
+##   may be used to  endorse or promote  products derived  from this  software
+##   without specific prior written permission.
+##
+## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+## AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,  BUT NOT LIMITED TO,  THE
+## IMPLIED WARRANTIES OF  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+## ARE DISCLAIMED.  IN NO EVENT  SHALL THE COPYRIGHT OWNER  OR CONTRIBUTORS BE
+## LIABLE  FOR  ANY  DIRECT,  INDIRECT,  INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR
+## CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT  NOT LIMITED  TO,  PROCUREMENT  OF
+## SUBSTITUTE GOODS OR SERVICES;  LOSS OF USE,  DATA, OR PROFITS;  OR BUSINESS
+## INTERRUPTION)  HOWEVER CAUSED  AND ON ANY THEORY  OF LIABILITY,  WHETHER IN
+## CONTRACT,  STRICT LIABILITY,  OR TORT  (INCLUDING NEGLIGENCE  OR OTHERWISE)
+## ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  EVEN IF ADVISED OF THE
+## POSSIBILITY OF SUCH DAMAGE.
+
+if (OPAE_WITH_CLI11 AND OPAE_WITH_SPDLOG)
+    # Only built when OPAE_WITH_CLI11 and OPAE_WITH_SPDLOG are both set.
+    opae_add_executable(TARGET cxl_host_exerciser
+        SOURCE cxl_host_exerciser.cpp
+        LIBS
+            opae-c
+            opae-cxx-core
+            ${spdlog_LIBRARIES}
+            ${json-c_LIBRARIES}
+            ${uuid_LIBRARIES}
+            numa
+        COMPONENT samplebin
+    )
+    target_include_directories(cxl_host_exerciser
+        PRIVATE
+            ${OPAE_INCLUDE_PATHS}
+            ${CMAKE_CURRENT_SOURCE_DIR}
+            ${CLI11_INCLUDE_DIRS}
+            ${spdlog_INCLUDE_DIRS}
+    )
+endif()
diff --git a/samples/cxl_host_exerciser/cxl_host_exerciser.cpp b/samples/cxl_host_exerciser/cxl_host_exerciser.cpp
new file mode 100644
index 000000000000..3d5eb10f1604
--- /dev/null
+++ b/samples/cxl_host_exerciser/cxl_host_exerciser.cpp
@@ -0,0 +1,48 @@
+// Copyright(c) 2023, Intel Corporation
+//
+// Redistribution  and  use  in source  and  binary  forms,  with  or  without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of  source code  must retain the  above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name  of Intel Corporation  nor the names of its contributors
+//   may be used to  endorse or promote  products derived  from this  software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,  BUT NOT LIMITED TO,  THE
+// IMPLIED WARRANTIES OF  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.  IN NO EVENT  SHALL THE COPYRIGHT OWNER  OR CONTRIBUTORS BE
+// LIABLE  FOR  ANY  DIRECT,  INDIRECT,  INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR
+// CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT  NOT LIMITED  TO,  PROCUREMENT  OF
+// SUBSTITUTE GOODS OR SERVICES;  LOSS OF USE,  DATA, OR PROFITS;  OR BUSINESS
+// INTERRUPTION)  HOWEVER CAUSED  AND ON ANY THEORY  OF LIABILITY,  WHETHER IN
+// CONTRACT,  STRICT LIABILITY,  OR TORT  (INCLUDING NEGLIGENCE  OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+#include <CLI/CLI.hpp>
+#include <iostream>
+#include <signal.h>
+
+#include "cxl_host_exerciser.h"
+#include "cxl_host_exerciser_cache.h"
+
+void he_sig_handler(int);
+
+int main(int argc, char *argv[]) {
+  host_exerciser::host_exerciser app;
+  app.register_command<host_exerciser::host_exerciser_cache>();
+
+  // Install the SIGINT handler. Value-initialization zeroes the struct, so
+  // no memset() (whose header this file never includes) is required.
+  struct sigaction act_new = {};
+  act_new.sa_handler = he_sig_handler;
+  sigemptyset(&act_new.sa_mask);
+  if (sigaction(SIGINT, &act_new, nullptr) != 0)
+    std::cerr << "failed to install SIGINT handler" << std::endl;
+
+  return app.main(argc, argv);
+}
diff --git a/samples/cxl_host_exerciser/cxl_host_exerciser.h b/samples/cxl_host_exerciser/cxl_host_exerciser.h
new file mode 100644
index 000000000000..293293a30255
--- /dev/null
+++ b/samples/cxl_host_exerciser/cxl_host_exerciser.h
@@ -0,0 +1,504 @@
+// Copyright(c) 2023, Intel Corporation
+//
+// Redistribution  and  use  in source  and  binary  forms,  with  or  without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of  source code  must retain the  above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name  of Intel Corporation  nor the names of its contributors
+//   may be used to  endorse or promote  products derived  from this  software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,  BUT NOT LIMITED TO,  THE
+// IMPLIED WARRANTIES OF  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.  IN NO EVENT  SHALL THE COPYRIGHT OWNER  OR CONTRIBUTORS BE
+// LIABLE  FOR  ANY  DIRECT,  INDIRECT,  INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR
+// CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT  NOT LIMITED  TO,  PROCUREMENT  OF
+// SUBSTITUTE GOODS OR SERVICES;  LOSS OF USE,  DATA, OR PROFITS;  OR BUSINESS
+// INTERRUPTION)  HOWEVER CAUSED  AND ON ANY THEORY  OF LIABILITY,  WHETHER IN
+// CONTRACT,  STRICT LIABILITY,  OR TORT  (INCLUDING NEGLIGENCE  OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+#pragma once
+#include <opae/cxx/core/events.h>
+#include <opae/cxx/core/shared_buffer.h>
+#include <opae/cxx/core/token.h>
+
+#include "he_cache_test.h"
+
+namespace host_exerciser {
+using opae::fpga::types::event;
+using opae::fpga::types::shared_buffer;
+using opae::fpga::types::token;
+
+static const uint64_t HELPBK_TEST_TIMEOUT = 30000;
+static const uint64_t HELPBK_TEST_SLEEP_INVL = 100;
+static const uint64_t CL = 64;
+static const uint64_t KB = 1024;
+static const uint64_t MB = KB * 1024;
+static const uint64_t LOG2_CL = 6;
+
+static const uint64_t BUFFER_SIZE_2MB = 2 * 1024 * 1024;
+
+static const uint64_t FPGA_32KB_CACHE_LINES = (32 * 1024) / 64;
+
+static const uint64_t FPGA_2MB_CACHE_LINES = (2 * 1024 * 1024) / 64;
+
+// Host exerciser CSR offsets
+enum {
+  HE_DFH = 0x0000,
+  HE_ID_L = 0x0008,
+  HE_ID_H = 0x0010,
+  HE_DFH_RSVD0 = 0x0018,
+  HE_DFH_RSVD1 = 0x0020,
+  HE_SCRATCHPAD0 = 0x028,
+  HE_DSM_BASE = 0x030,
+  HE_CTL = 0x038,
+  HE_INFO = 0x040,
+
+  HE_WR_NUM_LINES = 0x048,
+  HE_WR_BYTE_ENABLE = 0x050,
+  HE_WR_CONFIG = 0x058,
+  HE_WR_ADDR_TABLE_CTRL = 0x060,
+  HE_WR_ADDR_TABLE_DATA = 0x068,
+
+  HE_RD_NUM_LINES = 0x070,
+  HE_RD_CONFIG = 0x078,
+  HE_RD_ADDR_TABLE_CTRL = 0x080,
+  HE_RD_ADDR_TABLE_DATA = 0x088,
+  HE_ERROR_STATUS = 0x090,
+
+  HE_ERROR_EXP_DATA = 0x098,
+  HE_ERROR_ACT_DATA0 = 0x0A0,
+  HE_ERROR_ACT_DATA1 = 0x0A8,
+  HE_ERROR_ACT_DATA2 = 0x0B0,
+  HE_ERROR_ACT_DATA3 = 0x0B8,
+  HE_ERROR_ACT_DATA4 = 0x0C0,
+  HE_ERROR_ACT_DATA5 = 0x0C8,
+  HE_ERROR_ACT_DATA6 = 0x0D0,
+  HE_ERROR_ACT_DATA7 = 0x0D8,
+
+};
+
+// configures test mode
+typedef enum {
+  HOST_EXEMODE_READ = 0x0,
+  HOST_EXEMODE_WRITE = 0x1,
+  HOST_EXEMODE_ALL = 0x2,
+} host_exe_mode;
+
+// Read traffic opcodes (he_rd_opcode), followed by write traffic opcodes (he_wr_opcode)
+typedef enum {
+  RD_LINE_I = 0x0,
+  RD_LINE_S = 0x1,
+  RD_LINE_EM = 0x2,
+} he_rd_opcode;
+
+typedef enum {
+  WR_LINE_I = 0x0,
+  WR_LINE_M = 0x1,
+  WR_PUSH_I = 0x2,
+  WR_BARRIER_FRNCE = 0x3,
+  WR_FLUSH_CL = 0x4,
+  WR_FLUSH_CL_HCOH = 0x5,
+  WR_FLUSH_CL_DCOH = 0x6,
+} he_wr_opcode;
+
+// DFH Header
+union he_dfh {
+  enum { offset = HE_DFH };
+  uint64_t value;
+  struct {
+    uint16_t CcipVersionNumber : 12;
+    uint8_t AfuMajVersion : 4;
+    uint32_t NextDfhOffset : 24;
+    uint8_t EOL : 1;
+    uint32_t Reserved : 19;
+    uint8_t FeatureType : 4;
+  };
+};
+
+// DSM BASEL
+union he_dsm_base {
+  enum { offset = HE_DSM_BASE };
+  uint64_t value;
+  struct {
+    uint64_t DsmBase : 64;
+  };
+};
+
+// CSR CTL
+union he_ctl {
+  enum { offset = HE_CTL };
+  uint64_t value;
+  struct {
+    uint64_t ResetL : 1;
+    uint64_t Start : 1;
+    uint64_t ForcedTestCmpl : 1;
+    uint64_t bios_support : 1;
+    uint64_t Reserved : 60;
+  };
+};
+
+// CSR INFO
+union he_info {
+  enum { offset = HE_INFO };
+  uint64_t value;
+  struct {
+    uint64_t write_addr_table_size : 4;
+    uint64_t read_addr_table_size : 4;
+    uint64_t Reserved : 56;
+  };
+};
+
+// HE_WR_NUM_LINES
+union he_wr_num_lines {
+  enum { offset = HE_WR_NUM_LINES };
+  uint64_t value;
+  struct {
+    uint64_t write_num_lines : 16;
+    uint64_t reserved : 48;
+  };
+};
+
+// HE_WR_BYTE_ENABLE
+union he_wr_byte_enable {
+  enum { offset = HE_WR_BYTE_ENABLE };
+  uint64_t value;
+  struct {
+    uint64_t write_byte_enable : 64;
+  };
+};
+
+// HE_WR_CONFIG
+union he_wr_config {
+  enum { offset = HE_WR_CONFIG };
+  uint64_t value;
+  struct {
+    uint64_t write_traffic_enable : 1;
+    uint64_t continuous_mode_enable : 1;
+    uint64_t waitfor_completion : 1;
+    uint64_t preread_sync_enable : 1;
+    uint64_t postread_sync_enable : 1;
+    uint64_t daata_pattern : 2;
+    uint64_t cl_evict_enable : 1;
+    uint64_t opcode : 4;
+    uint64_t line_repeat_count : 8;
+    uint64_t reserved : 44;
+  };
+};
+
+// HE_WR_ADDR_TABLE_CTRL
+union he_wr_addr_table_ctrl {
+  enum { offset = HE_WR_ADDR_TABLE_CTRL };
+  uint64_t value;
+  struct {
+    uint64_t enable_address_table : 1;
+    uint64_t enable_address_stride : 1;
+    uint64_t stride : 2;
+    uint64_t reserved : 60;
+  };
+};
+
+// HE_WR_ADDR_TABLE_DATA
+union he_wr_addr_table_data {
+  enum { offset = HE_WR_ADDR_TABLE_DATA };
+  uint64_t value;
+  struct {
+    uint64_t address_table_value : 64;
+  };
+};
+
+// HE_RD_NUM_LINES
+union he_rd_num_lines {
+  enum { offset = HE_RD_NUM_LINES };
+  uint64_t value;
+  struct {
+    uint64_t read_num_lines : 16;
+    uint64_t reserved : 48;
+  };
+};
+
+// HE_RD_CONFIG
+union he_rd_config {
+  enum { offset = HE_RD_CONFIG };
+  uint64_t value;
+  struct {
+    uint64_t read_traffic_enable : 1;
+    uint64_t continuous_mode_Enable : 1;
+    uint64_t waitfor_completion : 1;
+    uint64_t prewrite_sync_enable : 1;
+    uint64_t postwrite_sync_enable : 1;
+    uint64_t daata_pattern : 2;
+    uint64_t cl_evict_enable : 1;
+    uint64_t opcode : 4;
+    uint64_t line_repeat_count : 8;
+    uint64_t reserved : 44;
+  };
+};
+
+// HE_RD_ADDR_TABLE_CTRL
+union he_rd_addr_table_ctrl {
+  enum { offset = HE_RD_ADDR_TABLE_CTRL };
+  uint64_t value;
+  struct {
+    uint64_t enable_address_table : 1;
+    uint64_t enable_address_stride : 1;
+    uint64_t stride : 2;
+    uint64_t reserved : 60;
+  };
+};
+
+// HE_RD_ADDR_TABLE_DATA
+union he_rd_addr_table_data {
+  enum { offset = HE_RD_ADDR_TABLE_DATA };
+  uint64_t value;
+  struct {
+    uint64_t address_table_value : 64;
+  };
+};
+
+// HE_ERROR_STATUS
+union he_err_status {
+  enum { offset = HE_ERROR_STATUS };
+  uint64_t value;
+  struct {
+    uint64_t data_error : 1;
+    uint64_t rsvd1 : 15;
+    uint64_t err_index : 16;
+    uint64_t rsvd2 : 32;
+  };
+};
+
+// HE DSM status
+struct he_cache_dsm_status {
+  uint32_t test_completed : 1;
+  uint32_t dsm_number : 15;
+  uint32_t res1 : 16;
+  uint32_t err_vector : 32;
+  uint64_t num_ticks : 64;
+  uint32_t num_reads : 32;
+  uint32_t num_writes : 32;
+  uint32_t penalty_start : 32;
+  uint32_t penalty_end : 32;
+  uint32_t actual_data : 32;
+  uint32_t expected_data : 32;
+  uint32_t res5[2];
+};
+
+const std::map<std::string, uint32_t> he_modes = {
+    {"read", HOST_EXEMODE_READ},
+    {"write", HOST_EXEMODE_WRITE},
+    {"all", HOST_EXEMODE_ALL},
+};
+
+// configures test mode
+typedef enum {
+  HE_FPGA_RD_CACHE_HIT = 0x0,
+  HE_FPGA_WR_CACHE_HIT = 0x1,
+
+  HE_FPGA_RD_CACHE_MISS = 0x2,
+  HE_FPGA_WR_CACHE_MISS = 0x3,
+
+  HE_HOST_RD_CACHE_HIT = 0x4,
+  HE_HOST_WR_CACHE_HIT = 0x5,
+
+  HE_HOST_RD_CACHE_MISS = 0x6,
+  HE_HOST_WR_CACHE_MISS = 0x7,
+
+} he_test_mode;
+
+// configures target
+typedef enum {
+  HE_TARGET_HOST = 0x0,
+  HE_TARGET_FPGA = 0x1,
+} he_target;
+
+const std::map<std::string, uint32_t> he_test_modes = {
+    {"fpgardcachehit", HE_FPGA_RD_CACHE_HIT},
+    {"fpgawrcachehit", HE_FPGA_WR_CACHE_HIT},
+    {"fpgardcachemiss", HE_FPGA_RD_CACHE_MISS},
+    {"fpgawrcachemiss", HE_FPGA_WR_CACHE_MISS},
+
+    {"hostrdcachehit", HE_HOST_RD_CACHE_HIT},
+    {"hostwrcachehit", HE_HOST_WR_CACHE_HIT},
+    {"hostrdcachemiss", HE_HOST_RD_CACHE_MISS},
+    {"hostwrcachemiss", HE_HOST_WR_CACHE_MISS},
+};
+
+const std::map<std::string, uint32_t> he_targets = {
+    {"host", HE_TARGET_HOST},
+    {"fpga", HE_TARGET_FPGA},
+};
+
+///////////////////////
+// Bias Support
+typedef enum {
+  HOST_BIOS = 0x0,
+  DEVIC_BIOA = 0x1,
+} he_ctl_bios_support;
+
+// address table size encoding: each value is log2 of the entry count (see addrtable_size)
+typedef enum {
+  HE_ADDRTABLE_SIZE4096 = 0xC,
+  HE_ADDRTABLE_SIZE2048 = 0xB,
+  HE_ADDRTABLE_SIZE1024 = 0xA,
+  HE_ADDRTABLE_SIZE512 = 0x9,
+  HE_ADDRTABLE_SIZE256 = 0x8,
+  HE_ADDRTABLE_SIZE128 = 0x7,
+  HE_ADDRTABLE_SIZE64 = 0x6,
+  HE_ADDRTABLE_SIZE32 = 0x5,
+  HE_ADDRTABLE_SIZE16 = 0x4,
+  HE_ADDRTABLE_SIZE8 = 0x3,
+  HE_ADDRTABLE_SIZE4 = 0x2,
+  HE_ADDRTABLE_SIZE2 = 0x1,
+
+} he_addrtable_size;
+
+// he traffic stage selection ("enable" or "skip" in traffic_enable)
+typedef enum {
+  HE_ENABLE_TRAFFIC_STAGE = 0x0,
+  HE_SIP_SEQ_STAGE = 0x1,
+} he_traffic_enable;
+
+const std::map<std::string, uint32_t> traffic_enable = {
+    {"enable", HE_ENABLE_TRAFFIC_STAGE},
+    {"skip", HE_SIP_SEQ_STAGE},
+
+};
+
+std::map<uint32_t, uint32_t> addrtable_size = {
+    {HE_ADDRTABLE_SIZE4096, 4096}, {HE_ADDRTABLE_SIZE2048, 2048},
+    {HE_ADDRTABLE_SIZE1024, 1024}, {HE_ADDRTABLE_SIZE512, 512},
+    {HE_ADDRTABLE_SIZE256, 256},   {HE_ADDRTABLE_SIZE128, 128},
+    {HE_ADDRTABLE_SIZE64, 64},     {HE_ADDRTABLE_SIZE32, 32},
+    {HE_ADDRTABLE_SIZE16, 16},     {HE_ADDRTABLE_SIZE8, 8},
+    {HE_ADDRTABLE_SIZE4, 4},       {HE_ADDRTABLE_SIZE2, 2},
+
+};
+
+// he data integrity check: disabled or enabled
+typedef enum {
+  HE_DISABLE_DATA_INTEGRITY_CHECK = 0x0,
+  HE_ENABLE_DATA_INTEGRITY_CHECK = 0x1,
+} he_data_integrity_check;
+
+struct MapKeyComparator {
+  // Shorter keys sort first; keys of equal length compare lexicographically.
+  bool operator()(const std::string &a, const std::string &b) const {
+    const auto lhs_len = a.length();
+    const auto rhs_len = b.length();
+    return (lhs_len == rhs_len) ? (a < b) : (lhs_len < rhs_len);
+  }
+};
+
+using test_afu = opae::afu_test::afu;
+using test_command = opae::afu_test::command;
+
+class host_exerciser : public test_afu {
+public:
+  host_exerciser()
+      : test_afu("host_exerciser", nullptr, "warning"), count_(1),
+        he_continuousmode_(false), he_test_all_(false), he_contmodetime_(0),
+        he_clock_mhz_(0), he_linerep_count_(10), he_stide_(0), he_target_(0),
+        he_test_(0) {
+    // Register this application's command-line options on app_.
+    // Test selection; names are mapped to values through he_test_modes.
+    app_.add_option(
+            "--test", he_test_,
+            "host exerciser cache test {fpgardcachehit, fpgawrcachehit, all}")
+        ->transform(CLI::CheckedTransformer(he_test_modes))
+        ->default_val("fpgardcachehit");
+
+    // Configures test rollover or test termination
+    app_.add_option("--continuousmode", he_continuousmode_,
+                    "test rollover or test termination")
+        ->default_val("false");
+
+    // Continuous mode time
+    app_.add_option("--contmodetime", he_contmodetime_,
+                    "Continuous mode time in seconds")
+        ->default_val("1");
+
+    // Target selection; names are mapped to values through he_targets.
+    app_.add_option("--target", he_target_,
+                    "host exerciser run on host or fpga")
+        ->transform(CLI::CheckedTransformer(he_targets))
+        ->default_val("host");
+
+    app_.add_option("--stride", he_stide_, "Enable stride mode")
+        ->default_val("0");
+
+    app_.add_option("--linerepcount", he_linerep_count_, "Line repeat count")
+        ->transform(CLI::Range(1, 256))
+        ->default_val("10");
+
+    // Test all
+    app_.add_option("--testall", he_test_all_, "Run all tests")
+        ->default_val("false");
+  }
+
+  // Runs `test` up to count_ times, stopping at the first non-zero result.
+  virtual int run(CLI::App *app, test_command::ptr_t test) override {
+    int res = exit_codes::not_run;
+
+    logger_->set_pattern("    %v");
+    // Info prints details of an individual run. Turn it on if doing only one
+    // test and the user hasn't changed level from the default.
+    if ((log_level_.compare("warning") == 0) && !he_test_all_)
+      logger_->set_level(spdlog::level::info);
+
+    logger_->info("starting test run, count of {0:d}", count_);
+    uint32_t count = 0;
+    try {
+      while (count < count_) {
+        logger_->debug("starting iteration: {0:d}", count + 1);
+        res = test_afu::run(app, test);
+        count++;
+        logger_->debug("end iteration: {0:d}", count);
+        if (res)
+          break;
+      }
+    } catch (const std::exception &ex) {
+      logger_->error(ex.what());
+      res = exit_codes::exception;
+    }
+
+    auto pass = res == exit_codes::success ? "PASS" : "FAIL";
+    logger_->info("Test {}({}): {}", test->name(), count, pass);
+    spdlog::drop_all();
+    return res;
+  }
+
+public:
+  uint32_t count_;
+  bool he_continuousmode_;
+  bool he_test_all_;
+  uint32_t he_contmodetime_;
+  uint32_t he_clock_mhz_;
+  uint32_t he_linerep_count_;
+  uint32_t he_stide_;
+  uint32_t he_target_;
+  uint32_t he_test_;
+  std::map<uint32_t, uint32_t> limits_;
+
+  // CSR offset of 64-bit word i after base; throws if it overruns limits_[base].
+  uint32_t get_offset(uint32_t base, uint32_t i) const {
+    const uint64_t offset = base + static_cast<uint64_t>(i) * sizeof(uint64_t);
+    const auto limit = limits_.find(base);
+    // Add on the left: "limit - 8" would wrap around for limits below 8.
+    if (limit != limits_.end() && offset + sizeof(uint64_t) > limit->second)
+      throw std::out_of_range("offset out range in csr space");
+    return static_cast<uint32_t>(offset);
+  }
+
+  // True when app_ reports a non-zero count for option_str.
+  bool option_passed(const std::string &option_str) {
+    return app_.count(option_str) != 0;
+  }
+};
+} // end of namespace host_exerciser
diff --git a/samples/cxl_host_exerciser/cxl_host_exerciser_cache.h b/samples/cxl_host_exerciser/cxl_host_exerciser_cache.h
new file mode 100644
index 000000000000..892584ef36b1
--- /dev/null
+++ b/samples/cxl_host_exerciser/cxl_host_exerciser_cache.h
@@ -0,0 +1,62 @@
+// Copyright(c) 2023, Intel Corporation
+//
+// Redistribution  and  use  in source  and  binary  forms,  with  or  without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of  source code  must retain the  above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name  of Intel Corporation  nor the names of its contributors
+//   may be used to  endorse or promote  products derived  from this  software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,  BUT NOT LIMITED TO,  THE
+// IMPLIED WARRANTIES OF  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.  IN NO EVENT  SHALL THE COPYRIGHT OWNER  OR CONTRIBUTORS BE
+// LIABLE  FOR  ANY  DIRECT,  INDIRECT,  INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR
+// CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT  NOT LIMITED  TO,  PROCUREMENT  OF
+// SUBSTITUTE GOODS OR SERVICES;  LOSS OF USE,  DATA, OR PROFITS;  OR BUSINESS
+// INTERRUPTION)  HOWEVER CAUSED  AND ON ANY THEORY  OF LIABILITY,  WHETHER IN
+// CONTRACT,  STRICT LIABILITY,  OR TORT  (INCLUDING NEGLIGENCE  OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+#pragma once
+#include "cxl_host_exerciser.h"
+#include "cxl_host_exerciser_cmd.h"
+#include "he_cache_test.h"
+
+static const char *const HE_CACHE_AFU_ID = "0118E06B-1FA3-49B9-8159-9b5C2EBD4b23";
+
+#define MEM_TG_FEATURE_ID 0x25
+#define MEM_TG_FEATURE_GUIDL 0x81599b5c2ebd4b23
+#define MEM_TG_FEATURE_GUIDH 0x0118e06b1fa349b9
+
+using test_afu = opae::afu_test::afu;
+using opae::fpga::types::shared_buffer;
+
+namespace host_exerciser {
+
+class host_exerciser_cache : public host_exerciser_cmd {
+public:
+  host_exerciser_cache() {}
+
+  virtual ~host_exerciser_cache() {}
+  virtual const char *name() const override { return "cache"; }
+
+  virtual const char *description() const override {
+    return "run simple cxl he cache test";
+  }
+
+  virtual const char *afu_id() const override { return HE_CACHE_AFU_ID; }
+
+  virtual uint64_t featureid() const override { return MEM_TG_FEATURE_ID; }
+
+  virtual uint64_t guidl() const override { return MEM_TG_FEATURE_GUIDL; }
+
+  virtual uint64_t guidh() const override { return MEM_TG_FEATURE_GUIDH; }
+};
+
+} // end of namespace host_exerciser
diff --git a/samples/cxl_host_exerciser/cxl_host_exerciser_cmd.h b/samples/cxl_host_exerciser/cxl_host_exerciser_cmd.h
new file mode 100644
index 000000000000..c9a099694788
--- /dev/null
+++ b/samples/cxl_host_exerciser/cxl_host_exerciser_cmd.h
@@ -0,0 +1,1013 @@
+// Copyright(c) 2023, Intel Corporation
+//
+// Redistribution  and  use  in source  and  binary  forms,  with  or  without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of  source code  must retain the  above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name  of Intel Corporation  nor the names of its contributors
+//   may be used to  endorse or promote  products derived  from this  software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,  BUT NOT LIMITED TO,  THE
+// IMPLIED WARRANTIES OF  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.  IN NO EVENT  SHALL THE COPYRIGHT OWNER  OR CONTRIBUTORS BE
+// LIABLE  FOR  ANY  DIRECT,  INDIRECT,  INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR
+// CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT  NOT LIMITED  TO,  PROCUREMENT  OF
+// SUBSTITUTE GOODS OR SERVICES;  LOSS OF USE,  DATA, OR PROFITS;  OR BUSINESS
+// INTERRUPTION)  HOWEVER CAUSED  AND ON ANY THEORY  OF LIABILITY,  WHETHER IN
+// CONTRACT,  STRICT LIABILITY,  OR TORT  (INCLUDING NEGLIGENCE  OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+#pragma once
+
+#include "cxl_host_exerciser.h"
+#include "he_cache_test.h"
+#include <map>
+#include <numa.h>
+#include <unistd.h>
+
+using test_afu = opae::afu_test::afu;
+using opae::fpga::types::shared_buffer;
+using opae::fpga::types::token;
+namespace fpga = opae::fpga::types;
+
+#define UNUSED_PARAM(x) ((void)x)
+
+// HE exit global flags: he_sig_handler() sets both; the host cache-hit tests also set g_stop_thread to stop he_cache_thread()
+volatile bool g_he_exit = false; // only written (never read) within this header
+volatile static bool g_stop_thread = false; // polled by he_cache_thread(); NOTE(review): never reset to false in this header
+
+// Host exerciser signal handler.
+// Raises both global stop flags (g_he_exit, g_stop_thread) so that the
+// polling loop in he_cache_thread() terminates, then reports the event.
+// Only async-signal-safe functions may be called from a signal handler, so
+// the message is emitted with write(2) rather than printf(): stdio is not
+// async-signal-safe and may deadlock or corrupt its buffers when the signal
+// interrupts another stdio call.
+void he_sig_handler(int) {
+  static const char msg[] = "HE signal handler exit app \n";
+
+  g_he_exit = true;
+  g_stop_thread = true;
+
+  // Best effort only: nothing useful can be done here if the write fails.
+  ssize_t written = write(STDOUT_FILENO, msg, sizeof(msg) - 1);
+  (void)written;
+}
+
+namespace host_exerciser {
+
+std::mutex he_cache_read_mutex; // NOTE(review): not referenced anywhere else in this header - confirm it is still needed
+std::mutex he_cache_write_mutex; // NOTE(review): not referenced anywhere else in this header - confirm it is still needed
+
+class host_exerciser_cmd; // forward declaration; the class is defined right below
+
+void he_cache_thread(uint8_t *buf_ptr, uint64_t len); // defined after the class; std::thread entry point of the host cache-hit tests
+
+class host_exerciser_cmd : public test_command {
+public:
+  // Construct a command with no AFU attached; run() binds host_exe_ later.
+  // The raw buffer pointers are nulled as well so that no pointer member
+  // is left with an indeterminate value (initializers follow the member
+  // declaration order).
+  host_exerciser_cmd()
+      : host_exe_(NULL), dsm_buf_(NULL), rd_buf_(NULL), numa_node_(0) {}
+  virtual ~host_exerciser_cmd() {}
+
+  int he_run_fpga_rd_cache_hit_test() {
+    cout << "********** FPGA Read cache hit test start**********" << endl;
+    /*
+    FPGA read cache hit test.
+    Runs the read engine twice with the same RD_LINE_S configuration over
+    the same read buffer. Returns 0 when both runs complete and -1 when an
+    allocation fails or a run times out; buffers that were allocated are
+    released before returning.
+
+    STEPS
+    1) Program HE_RD_NUM_LINES, HE_RD_CONFIG (RdShared) and
+       HE_RD_ADDR_TABLE_CTRL
+    2) Allocate DSM, Read buffer
+    3) Run test ( AFU copies cache from host memory to FPGA cache)
+    4) Set the same RdShared (CXL) config again
+    5) Run test ( AFU read cache from FPGA cache)
+    */
+
+    // Read HE_INFO (only used to report the read address table size), then
+    // set Read number Lines (written as FPGA_32KB_CACHE_LINES - 1)
+    he_info_.value = host_exe_->read64(HE_INFO);
+    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
+
+    cout << "Numa node:" << numa_node_ << endl;
+    host_exe_->write64(HE_RD_NUM_LINES, FPGA_32KB_CACHE_LINES - 1);
+    cout << "Read number Lines:" << FPGA_32KB_CACHE_LINES - 1 << endl;
+    cout << "Line Repeat Count:" << host_exe_->he_linerep_count_ << endl;
+
+    // set RD_CONFIG RdShared (CXL): read traffic enabled, opcode RD_LINE_S
+    he_rd_cfg_.value = 0;
+    he_rd_cfg_.line_repeat_count = host_exe_->he_linerep_count_;
+    he_rd_cfg_.read_traffic_enable = 1;
+    he_rd_cfg_.opcode = RD_LINE_S;
+    host_exe_->write64(HE_RD_CONFIG, he_rd_cfg_.value);
+
+    // set RD_ADDR_TABLE_CTRL: only the address stride enable bit is set
+    rd_table_ctl_.value = 0;
+    rd_table_ctl_.enable_address_stride = 1;
+    host_exe_->write64(HE_RD_ADDR_TABLE_CTRL, rd_table_ctl_.value);
+
+    // Allocate DSM buffer
+    if (!host_exe_->allocate_dsm()) {
+      cerr << "allocate dsm failed" << endl;
+      return -1;
+    }
+
+    // Allocate Read buffer (BUFFER_SIZE_2MB on numa_node_)
+    if (!host_exe_->allocate_cache_read(BUFFER_SIZE_2MB, numa_node_)) {
+      cerr << "allocate cache read failed" << endl;
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    // Start test (first pass): pulse HE_CTL.Start by writing 1, then 0
+    he_ctl_.Start = 1;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+    he_ctl_.Start = 0;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+
+    // wait for completion; on timeout dump counters/errors and release buffers
+    if (!he_wait_test_completion()) {
+      cerr << "timeout error" << endl;
+      he_perf_counters();
+      host_exerciser_errors();
+      host_exe_->free_cache_read();
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    he_perf_counters();
+
+    cout << "********** AFU Copied host cache to FPGA Cache successfully "
+            "********** "
+         << endl;
+
+    // set RD_CONFIG RdShared (CXL) again for the second pass
+    he_rd_cfg_.value = 0;
+    he_rd_cfg_.line_repeat_count = host_exe_->he_linerep_count_;
+    he_rd_cfg_.read_traffic_enable = 1;
+    he_rd_cfg_.opcode = RD_LINE_S;
+    host_exe_->write64(HE_RD_CONFIG, he_rd_cfg_.value);
+
+    // set RD_ADDR_TABLE_CTRL again for the second pass
+    rd_table_ctl_.value = 0;
+    rd_table_ctl_.enable_address_stride = 1;
+    host_exe_->write64(HE_RD_ADDR_TABLE_CTRL, rd_table_ctl_.value);
+
+    // Start test (second pass): pulse HE_CTL.Start by writing 1, then 0
+    he_ctl_.Start = 1;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+    he_ctl_.Start = 0;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+
+    // wait for completion; on timeout dump counters/errors and release buffers
+    if (!he_wait_test_completion()) {
+      cerr << "timeout error" << endl;
+      he_perf_counters();
+      host_exerciser_errors();
+      host_exe_->free_cache_read();
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    he_perf_counters();
+    host_exe_->free_dsm();
+    host_exe_->free_cache_read();
+
+    cout
+        << "********** AFU reads cache from FPGA Cache successfully ********** "
+        << endl;
+
+    cout << "********** FPGA Read cache hit test end**********" << endl;
+    return 0;
+  }
+
+  // FPGA write cache hit test.
+  //
+  // Pass 1 runs the read engine (opcode RD_LINE_S) over
+  // FPGA_32KB_CACHE_LINES lines, pass 2 runs the write engine (opcode
+  // WR_LINE_M) over the same number of lines.
+  //
+  // STEPS
+  // 1) Program HE_RD_NUM_LINES / HE_WR_NUM_LINES, HE_RD_CONFIG and
+  //    HE_RD_ADDR_TABLE_CTRL
+  // 2) Allocate DSM and the combined read/write buffer
+  // 3) Run test ( AFU copies cache from host memory to FPGA cache)
+  // 4) Set WrLine_M (CXL) config and HE_WR_ADDR_TABLE_CTRL
+  // 5) Run test ( AFU writes to FPGA cache)
+  //
+  // Returns 0 when both passes complete, -1 when an allocation fails or a
+  // pass times out. Every buffer that was allocated is released before
+  // returning.
+  int he_run_fpga_wr_cache_hit_test() {
+    cout << "********** FPGA Write cache hit test start**********" << endl;
+
+    // HE_INFO is only read to report the address table sizes
+    he_info_.value = host_exe_->read64(HE_INFO);
+    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
+    cout << "Write address table size:" << he_info_.write_addr_table_size
+         << endl;
+
+    // Set read and write number of lines
+    host_exe_->write64(HE_RD_NUM_LINES, FPGA_32KB_CACHE_LINES - 1);
+    host_exe_->write64(HE_WR_NUM_LINES, FPGA_32KB_CACHE_LINES - 1);
+    cout << "Read/write number Lines:" << FPGA_32KB_CACHE_LINES - 1 << endl;
+    cout << "Line Repeat Count:" << host_exe_->he_linerep_count_ << endl;
+
+    // set RD_CONFIG RdShared (CXL)
+    he_rd_cfg_.value = 0;
+    he_rd_cfg_.line_repeat_count = host_exe_->he_linerep_count_;
+    he_rd_cfg_.read_traffic_enable = 1;
+    he_rd_cfg_.opcode = RD_LINE_S;
+    host_exe_->write64(HE_RD_CONFIG, he_rd_cfg_.value);
+
+    // set RD_ADDR_TABLE_CTRL
+    rd_table_ctl_.value = 0;
+    rd_table_ctl_.enable_address_stride = 1;
+    host_exe_->write64(HE_RD_ADDR_TABLE_CTRL, rd_table_ctl_.value);
+
+    // Allocate DSM buffer
+    if (!host_exe_->allocate_dsm()) {
+      cerr << "alloc dsm failed" << endl;
+      return -1;
+    }
+
+    // Allocate Read, Write buffer
+    if (!host_exe_->allocate_cache_read_write(BUFFER_SIZE_2MB, numa_node_)) {
+      // Name the buffer that actually failed (it is the combined read/write
+      // buffer, not the read-only one).
+      cerr << "allocate cache read write failed" << endl;
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    // Start test (pass 1): pulse HE_CTL.Start by writing 1, then 0
+    he_ctl_.Start = 1;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+    he_ctl_.Start = 0;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+
+    // wait for completion
+    if (!he_wait_test_completion()) {
+      cerr << "timeout error" << endl;
+      he_perf_counters();
+      host_exerciser_errors();
+      host_exe_->free_cache_read_write();
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    he_perf_counters();
+
+    cout << "********** AFU Copied host cache to FPGA Cache successfully "
+            "********** "
+         << endl;
+
+    // set WR_CONFIG WrLine_M (CXL)
+    he_wr_cfg_.value = 0;
+    he_wr_cfg_.line_repeat_count = host_exe_->he_linerep_count_;
+    he_wr_cfg_.write_traffic_enable = 1;
+    he_wr_cfg_.opcode = WR_LINE_M;
+    host_exe_->write64(HE_WR_CONFIG, he_wr_cfg_.value);
+
+    // Set WR_ADDR_TABLE_CTRL
+    wr_table_ctl_.value = 0;
+    wr_table_ctl_.enable_address_stride = 1;
+    host_exe_->write64(HE_WR_ADDR_TABLE_CTRL, wr_table_ctl_.value);
+
+    // Start test (pass 2): pulse HE_CTL.Start by writing 1, then 0
+    he_ctl_.Start = 1;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+    he_ctl_.Start = 0;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+
+    // wait for completion
+    if (!he_wait_test_completion()) {
+      cerr << "timeout error" << endl;
+      he_perf_counters();
+      host_exerciser_errors();
+      host_exe_->free_cache_read_write();
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    he_perf_counters();
+    cout << "********** AFU Write to  FPGA Cache  successfully ********** "
+         << endl;
+
+    host_exe_->free_cache_read_write();
+    host_exe_->free_dsm();
+
+    cout << "********** FPGA Write cache hit test end**********" << endl;
+
+    return 0;
+  }
+
+  int he_run_fpga_rd_cache_miss_test() {
+    cout << "********** FPGA Read cache miss test start**********" << endl;
+    /*
+    FPGA read cache miss test.
+    Runs the read engine once with opcode RD_LINE_S over
+    FPGA_2MB_CACHE_LINES lines. Returns 0 when the run completes and -1
+    when an allocation fails or the run times out; buffers that were
+    allocated are released before returning.
+
+    STEPS
+    1) Write number of lines more then 32 kb  2mb/64
+    2) Set RdShared (CXL) config and HE_RD_ADDR_TABLE_CTRL
+    3) Allocate DSM, Read buffer, Write buffer
+    4) Run test (Buffer is not present in FPGA - FPGA read Cache miss )
+
+    NOTE(review): the earlier version of this comment also listed a
+    copy-then-evict sequence (RdShared run followed by a write Evict run)
+    before the final read; that sequence is not implemented here.
+    */
+
+    // FPGA_2MB_CACHE_LINES lines are used (presumably 2MB / 64 - confirm)
+
+    // Read HE_INFO (only used to report the read address table size), then
+    // set Read number Lines (written as FPGA_2MB_CACHE_LINES - 1)
+    he_info_.value = host_exe_->read64(HE_INFO);
+    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
+
+    host_exe_->write64(HE_RD_NUM_LINES, FPGA_2MB_CACHE_LINES - 1);
+    cout << "Read number Lines:" << FPGA_2MB_CACHE_LINES - 1 << endl;
+    cout << "Line Repeat Count:" << host_exe_->he_linerep_count_ << endl;
+
+    // set RD_CONFIG RdShared (CXL): read traffic enabled, opcode RD_LINE_S
+    he_rd_cfg_.value = 0;
+    he_rd_cfg_.line_repeat_count = host_exe_->he_linerep_count_;
+    he_rd_cfg_.read_traffic_enable = 1;
+    he_rd_cfg_.opcode = RD_LINE_S;
+    host_exe_->write64(HE_RD_CONFIG, he_rd_cfg_.value);
+
+    // set RD_ADDR_TABLE_CTRL: only the address stride enable bit is set
+    rd_table_ctl_.value = 0;
+    rd_table_ctl_.enable_address_stride = 1;
+    host_exe_->write64(HE_RD_ADDR_TABLE_CTRL, rd_table_ctl_.value);
+
+    // Allocate DSM buffer
+    if (!host_exe_->allocate_dsm()) {
+      cerr << "alloc dsm failed" << endl;
+      return -1;
+    }
+
+    // Allocate Read, Write buffer (BUFFER_SIZE_2MB on numa_node_)
+    if (!host_exe_->allocate_cache_read_write(BUFFER_SIZE_2MB, numa_node_)) {
+      cerr << "allocate cache read write failed" << endl;
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    // start test: pulse HE_CTL.Start by writing 1, then 0
+    he_ctl_.Start = 1;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+    he_ctl_.Start = 0;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+
+    // wait for completion; on timeout dump counters/errors and release buffers
+    if (!he_wait_test_completion()) {
+      cerr << "timeout error" << endl;
+      he_perf_counters();
+      host_exerciser_errors();
+      host_exe_->free_cache_read_write();
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    he_perf_counters();
+    host_exe_->free_cache_read_write();
+    host_exe_->free_dsm();
+
+    cout << "********** AFU Read FPGA Cache Miss successfully ********** "
+         << endl;
+
+    cout << "********** FPGA Read cache miss test end**********" << endl;
+    return 0;
+  }
+
+  // FPGA write cache miss test.
+  //
+  // Runs the write engine once with opcode WR_LINE_M over
+  // FPGA_2MB_CACHE_LINES lines.
+  //
+  // STEPS
+  // 1) Write number of lines more then 32 kb  2mb/64
+  // 2) Set the write (CXL) config and HE_WR_ADDR_TABLE_CTRL
+  // 3) Allocate DSM and the combined read/write buffer
+  // 4) Run test ( Buffer is not present in FPGA - FPGA write Cache miss )
+  //
+  // Returns 0 when the run completes, -1 when an allocation fails or the
+  // run times out. Every buffer that was allocated is released before
+  // returning.
+  int he_run_fpga_wr_cache_miss_test() {
+    cout << "********** FPGA write cache miss test start**********" << endl;
+
+    // HE_INFO is only read to report the address table sizes
+    he_info_.value = host_exe_->read64(HE_INFO);
+    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
+    cout << "Write address table size:" << he_info_.write_addr_table_size
+         << endl;
+
+    // Set write number of lines
+    host_exe_->write64(HE_WR_NUM_LINES, FPGA_2MB_CACHE_LINES - 1);
+    cout << "Read/write number Lines:" << FPGA_2MB_CACHE_LINES - 1 << endl;
+    cout << "Line Repeat Count:" << host_exe_->he_linerep_count_ << endl;
+
+    // set WR_CONFIG
+    he_wr_cfg_.value = 0;
+    he_wr_cfg_.line_repeat_count = host_exe_->he_linerep_count_;
+    he_wr_cfg_.write_traffic_enable = 1;
+    he_wr_cfg_.opcode = WR_LINE_M;
+    host_exe_->write64(HE_WR_CONFIG, he_wr_cfg_.value);
+
+    // Set WR_ADDR_TABLE_CTRL
+    wr_table_ctl_.value = 0;
+    wr_table_ctl_.enable_address_stride = 1;
+    host_exe_->write64(HE_WR_ADDR_TABLE_CTRL, wr_table_ctl_.value);
+
+    // Allocate DSM buffer
+    if (!host_exe_->allocate_dsm()) {
+      cerr << "alloc dsm failed" << endl;
+      return -1;
+    }
+
+    // Allocate Read, Write buffer
+    if (!host_exe_->allocate_cache_read_write(BUFFER_SIZE_2MB, numa_node_)) {
+      // Name the buffer that actually failed (it is the combined read/write
+      // buffer, not the read-only one).
+      cerr << "allocate cache read write failed" << endl;
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    // start test: pulse HE_CTL.Start by writing 1, then 0
+    he_ctl_.Start = 1;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+    he_ctl_.Start = 0;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+
+    // wait for completion
+    if (!he_wait_test_completion()) {
+      cerr << "timeout error" << endl;
+      he_perf_counters();
+      host_exerciser_errors();
+      host_exe_->free_cache_read_write();
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    he_perf_counters();
+    host_exe_->free_cache_read_write();
+    host_exe_->free_dsm();
+
+    cout << "********** AFU Write FPGA Cache Miss successfully ********** "
+         << endl;
+
+    cout << "********** FPGA Write cache miss test end**********" << endl;
+    return 0;
+  }
+
+  int he_run_host_rd_cache_hit_test() {
+    cout << "********** 1 Host LLC Read cache hit test start**********" << endl;
+
+    /*
+    Host LLC read cache hit test.
+    Runs the read engine once with opcode RD_LINE_I while a helper thread
+    (he_cache_thread) keeps reading the read buffer on the host CPU.
+    Returns 0 when the run completes and -1 when an allocation fails or
+    the run times out; the helper thread is stopped and joined and the
+    buffers are released before returning.
+
+    STEPS
+    1) Set RdLine_I (CXL) config and HE_RD_ADDR_TABLE_CTRL
+    2) Allocate DSM, Read buffer
+    3) create thread read buffer
+    4) Run test ( AFU reads from host cache to FPGA cache)
+    */
+
+    // Read HE_INFO (only used to report the address table sizes), then
+    // set Read number Lines (written as FPGA_32KB_CACHE_LINES - 1)
+    he_info_.value = host_exe_->read64(HE_INFO);
+    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
+    cout << "Write address table size:" << he_info_.write_addr_table_size
+         << endl;
+
+    host_exe_->write64(HE_RD_NUM_LINES, FPGA_32KB_CACHE_LINES - 1);
+    cout << "Read number Lines:" << FPGA_32KB_CACHE_LINES - 1 << endl;
+    cout << "Line Repeat Count:" << host_exe_->he_linerep_count_ << endl;
+
+    // set RD_CONFIG RdLine_I (CXL): read traffic enabled, opcode RD_LINE_I
+    he_rd_cfg_.value = 0;
+    he_rd_cfg_.line_repeat_count = host_exe_->he_linerep_count_;
+    he_rd_cfg_.read_traffic_enable = 1;
+    he_rd_cfg_.opcode = RD_LINE_I;
+    host_exe_->write64(HE_RD_CONFIG, he_rd_cfg_.value);
+
+    // set RD_ADDR_TABLE_CTRL: only the address stride enable bit is set
+    rd_table_ctl_.value = 0;
+    rd_table_ctl_.enable_address_stride = 1;
+    host_exe_->write64(HE_RD_ADDR_TABLE_CTRL, rd_table_ctl_.value);
+
+    // Allocate DSM buffer
+    if (!host_exe_->allocate_dsm()) {
+      cerr << "alloc dsm failed" << endl;
+      return -1;
+    }
+
+    // Allocate Read buffer (BUFFER_SIZE_2MB on numa_node_)
+    if (!host_exe_->allocate_cache_read(BUFFER_SIZE_2MB, numa_node_)) {
+      cerr << "allocate cache read failed" << endl;
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    cout << " create thread - moves read buffer to host cache " << endl;
+    std::thread t1(he_cache_thread, host_exe_->get_read(), BUFFER_SIZE_2MB); // NOTE(review): g_stop_thread is not reset first; if already true the thread exits at once
+    sleep(1); // give the helper thread time to run before starting the AFU
+
+    // start: pulse HE_CTL.Start by writing 1, then 0
+    he_ctl_.Start = 1;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+    he_ctl_.Start = 0;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+
+    // wait for completion; on timeout stop the thread and release buffers
+    if (!he_wait_test_completion()) {
+      cerr << "timeout error" << endl;
+      he_perf_counters();
+      host_exerciser_errors();
+
+      g_stop_thread = true;
+      t1.join();
+      sleep(1);
+      host_exe_->free_cache_read();
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    he_perf_counters();
+
+    g_stop_thread = true; // ask he_cache_thread to leave its polling loop
+    t1.join();
+
+    he_perf_counters(); // NOTE(review): second dump of the same DSM status - confirm it is intended
+    sleep(1);
+    host_exe_->free_cache_read();
+    host_exe_->free_dsm();
+
+    cout << "********** AFU Copied host cache to FPGA Cache successfully "
+            "********** "
+         << endl;
+
+    cout << "********** Host LLC cache hit test end**********" << endl;
+    return 0;
+  }
+
+  // Host LLC write cache hit test.
+  //
+  // Runs the write engine once with opcode WR_LINE_I while a helper thread
+  // (he_cache_thread) keeps reading the write buffer on the host CPU.
+  //
+  // STEPS
+  // 1) Set the write (CXL) config and HE_WR_ADDR_TABLE_CTRL
+  // 2) Allocate DSM, Write buffer
+  // 3) create thread that reads the write buffer
+  // 4) Run test ( AFU write to host cache)
+  //
+  // Returns 0 when the run completes, -1 when an allocation fails or the
+  // run times out. The helper thread is stopped and joined and every buffer
+  // that was allocated is released before returning.
+  int he_run_host_wr_cache_hit_test() {
+    cout << "********** Host LLC Write cache hit test start**********" << endl;
+
+    // HE_INFO is only read to report the address table sizes
+    he_info_.value = host_exe_->read64(HE_INFO);
+    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
+    cout << "Write address table size:" << he_info_.write_addr_table_size
+         << endl;
+
+    // Set write number of lines
+    host_exe_->write64(HE_WR_NUM_LINES, FPGA_32KB_CACHE_LINES - 1);
+    cout << "Write number Lines:" << FPGA_32KB_CACHE_LINES - 1 << endl;
+    cout << "Line Repeat Count:" << host_exe_->he_linerep_count_ << endl;
+
+    // set WR_CONFIG
+    he_wr_cfg_.value = 0;
+    he_wr_cfg_.line_repeat_count = host_exe_->he_linerep_count_;
+    he_wr_cfg_.write_traffic_enable = 1;
+    he_wr_cfg_.opcode = WR_LINE_I;
+    host_exe_->write64(HE_WR_CONFIG, he_wr_cfg_.value);
+
+    // set WR_ADDR_TABLE_CTRL
+    wr_table_ctl_.value = 0;
+    wr_table_ctl_.enable_address_stride = 1;
+    host_exe_->write64(HE_WR_ADDR_TABLE_CTRL, wr_table_ctl_.value);
+
+    // Allocate DSM buffer
+    if (!host_exe_->allocate_dsm()) {
+      cerr << "alloc dsm failed" << endl;
+      return -1;
+    }
+
+    // Allocate Write buffer
+    if (!host_exe_->allocate_cache_write(BUFFER_SIZE_2MB, numa_node_)) {
+      // Name the buffer that actually failed (the write buffer).
+      cerr << "allocate cache write failed" << endl;
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    // The helper thread walks the write buffer, not the read buffer.
+    cout << " create thread - moves write buffer to host cache " << endl;
+    std::thread t1(he_cache_thread, host_exe_->get_write(), BUFFER_SIZE_2MB);
+    // give the helper thread time to run before starting the AFU
+    sleep(1);
+
+    // start: pulse HE_CTL.Start by writing 1, then 0
+    he_ctl_.Start = 1;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+    he_ctl_.Start = 0;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+
+    // wait for completion
+    if (!he_wait_test_completion()) {
+      cerr << "timeout error" << endl;
+
+      he_perf_counters();
+      host_exerciser_errors();
+      g_stop_thread = true;
+      t1.join();
+      sleep(1);
+      host_exe_->free_cache_write();
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    // ask he_cache_thread to leave its polling loop
+    g_stop_thread = true;
+    t1.join();
+    he_perf_counters();
+    cout << "********** AFU write  host cache successfully ********** " << endl;
+
+    sleep(1);
+    host_exe_->free_cache_write();
+    host_exe_->free_dsm();
+
+    cout << "********** Host LLC cache hit Write test end**********" << endl;
+    return 0;
+  }
+
+  int he_run_host_rd_cache_miss_test() {
+    cout << "********** Host LLC Read cache miss test start**********" << endl;
+
+    /*
+    Host LLC read cache miss test.
+    Runs the read engine once with opcode RD_LINE_I over a freshly
+    allocated read buffer; no helper thread touches the buffer. Returns 0
+    when the run completes and -1 when an allocation fails or the run
+    times out; buffers that were allocated are released before returning.
+
+    STEPS
+    1) Set RdLine_I (CXL) config and HE_RD_ADDR_TABLE_CTRL
+    2) Allocate DSM, Read buffer
+    3) flush host read buffer cache (currently disabled, see below)
+    4) Run test ( AFU reads from host memory )
+    */
+
+    // Read HE_INFO (only used to report the address table sizes), then
+    // set Read number Lines (written as FPGA_32KB_CACHE_LINES - 1)
+    he_info_.value = host_exe_->read64(HE_INFO);
+    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
+    cout << "Write address table size:" << he_info_.write_addr_table_size
+         << endl;
+
+    host_exe_->write64(HE_RD_NUM_LINES, FPGA_32KB_CACHE_LINES - 1);
+    cout << "Read/write number Lines:" << FPGA_32KB_CACHE_LINES - 1 << endl;
+    cout << "Line Repeat Count:" << host_exe_->he_linerep_count_ << endl;
+
+    // set RD_CONFIG RdLine_I (CXL): read traffic enabled, opcode RD_LINE_I
+    he_rd_cfg_.value = 0;
+    he_rd_cfg_.line_repeat_count = host_exe_->he_linerep_count_;
+    he_rd_cfg_.read_traffic_enable = 1;
+    he_rd_cfg_.opcode = RD_LINE_I;
+    host_exe_->write64(HE_RD_CONFIG, he_rd_cfg_.value);
+
+    // set RD_ADDR_TABLE_CTRL: only the address stride enable bit is set
+    rd_table_ctl_.value = 0;
+    rd_table_ctl_.enable_address_stride = 1;
+    host_exe_->write64(HE_RD_ADDR_TABLE_CTRL, rd_table_ctl_.value);
+
+    // Allocate DSM buffer
+    if (!host_exe_->allocate_dsm()) {
+      cerr << "alloc dsm failed" << endl;
+      return -1;
+    }
+
+    // Allocate Read buffer (BUFFER_SIZE_2MB on numa_node_)
+    if (!host_exe_->allocate_cache_read(BUFFER_SIZE_2MB, numa_node_)) {
+      cerr << "allocate cache read failed" << endl;
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    // flush host cache - NOTE(review): the flush below is disabled, so no explicit flush happens
+    // int status = cacheflush((host_exe_->get_read(), BUFFER_SIZE_2MB, BCACHE);
+
+    // start: pulse HE_CTL.Start by writing 1, then 0
+    he_ctl_.Start = 1;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+    he_ctl_.Start = 0;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+
+    // wait for completion; on timeout dump counters/errors and release buffers
+    if (!he_wait_test_completion()) {
+      cerr << "timeout error" << endl;
+      he_perf_counters();
+      host_exerciser_errors();
+      host_exe_->free_cache_read();
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    he_perf_counters();
+
+    host_exe_->free_cache_read();
+    host_exe_->free_dsm();
+
+    cout << "********** Ran  Host LLC Read cache miss successfully ********** "
+         << endl;
+
+    cout << "********** Host LLC Read cache miss test end**********" << endl;
+    return 0;
+  }
+
+  // Host LLC write cache miss test.
+  //
+  // Runs the write engine once with opcode WR_PUSH_I over a freshly
+  // allocated write buffer; no helper thread touches the buffer.
+  //
+  // STEPS
+  // 1) Set the write (CXL) config and HE_WR_ADDR_TABLE_CTRL
+  // 2) Allocate DSM, write buffer
+  // 3) Run test ( AFU writes to host memory )
+  //
+  // Returns 0 when the run completes, -1 when an allocation fails or the
+  // run times out. Every buffer that was allocated is released before
+  // returning.
+  int he_run_host_wr_cache_miss_test() {
+    cout << "********** Host LLC Write cache miss test start**********" << endl;
+
+    // HE_INFO is only read to report the address table sizes
+    he_info_.value = host_exe_->read64(HE_INFO);
+    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
+    cout << "Write address table size:" << he_info_.write_addr_table_size
+         << endl;
+
+    // Set write number of lines
+    host_exe_->write64(HE_WR_NUM_LINES, FPGA_32KB_CACHE_LINES - 1);
+    cout << "Write number Lines:" << FPGA_32KB_CACHE_LINES - 1 << endl;
+    cout << "Line Repeat Count:" << host_exe_->he_linerep_count_ << endl;
+
+    // set WR_CONFIG
+    he_wr_cfg_.value = 0;
+    he_wr_cfg_.line_repeat_count = host_exe_->he_linerep_count_;
+    he_wr_cfg_.write_traffic_enable = 1;
+    he_wr_cfg_.opcode = WR_PUSH_I;
+    host_exe_->write64(HE_WR_CONFIG, he_wr_cfg_.value);
+
+    // set WR_ADDR_TABLE_CTRL
+    // The value written must be the write-table control word prepared here;
+    // the read-table control word is never set up by this test.
+    wr_table_ctl_.value = 0;
+    wr_table_ctl_.enable_address_stride = 1;
+    host_exe_->write64(HE_WR_ADDR_TABLE_CTRL, wr_table_ctl_.value);
+
+    // Allocate DSM buffer
+    if (!host_exe_->allocate_dsm()) {
+      cerr << "alloc dsm failed" << endl;
+      return -1;
+    }
+
+    // Allocate Write buffer
+    if (!host_exe_->allocate_cache_write(BUFFER_SIZE_2MB, numa_node_)) {
+      // Name the buffer that actually failed (the write buffer).
+      cerr << "allocate cache write failed" << endl;
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    // start: pulse HE_CTL.Start by writing 1, then 0
+    he_ctl_.Start = 1;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+    he_ctl_.Start = 0;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+
+    // wait for completion
+    if (!he_wait_test_completion()) {
+      cerr << "timeout error" << endl;
+      he_perf_counters();
+      host_exerciser_errors();
+      host_exe_->free_cache_write();
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    he_perf_counters();
+
+    host_exe_->free_cache_write();
+    host_exe_->free_dsm();
+
+    cout << "********** Ran  Host LLC Write cache miss successfully ********** "
+         << endl;
+
+    cout << "********** Host LLC Write cache miss test end**********" << endl;
+    return 0;
+  }
+
+  // Print every field of the DSM status block to stdout.
+  // Returns without printing anything when get_dsm() yields a null pointer.
+  void he_perf_counters() {
+    // The AFU updates this memory, hence the volatile view.
+    volatile he_cache_dsm_status *dsm =
+        reinterpret_cast<he_cache_dsm_status *>(
+            (uint8_t *)(host_exe_->get_dsm()));
+    if (dsm == NULL) {
+      return;
+    }
+
+    cout << "\n********* DSM Status CSR Start *********" << endl;
+
+    cout << "test completed :" << dsm->test_completed << endl;
+    cout << "dsm number:" << dsm->dsm_number << endl;
+    cout << "error vector:" << dsm->err_vector << endl;
+    cout << "num ticks:" << dsm->num_ticks << endl;
+    cout << "num reads:" << dsm->num_reads << endl;
+    cout << "num writes:" << dsm->num_writes << endl;
+    cout << "penalty start:" << dsm->penalty_start << endl;
+    cout << "penalty end:" << dsm->penalty_end << endl;
+    cout << "actual data:" << dsm->actual_data << endl;
+    cout << "expected data:" << dsm->expected_data << endl;
+
+    cout << "********* DSM Status CSR end *********" << endl;
+  }
+
+  void host_exerciser_errors() {
+    he_err_status err_status;
+    uint64_t err = 0;
+    if (host_exe_ == NULL)
+      return;
+
+    err_status.value = host_exe_->read64(HE_ERROR_STATUS);
+    if (err_status.data_error == 1) {
+      cout << "Data Integrity Check error occured" << endl;
+    }
+
+    if (err_status.err_index > 0) {
+      cout << "Error occurred at cache line address:" << err_status.err_index
+           << endl;
+    }
+
+    err = host_exe_->read64(HE_ERROR_EXP_DATA);
+    cout << "Error Expected Data:" << err << endl;
+
+    err = host_exe_->read64(HE_ERROR_ACT_DATA0);
+    cout << "Error Expected Data0:" << err << endl;
+
+    err = host_exe_->read64(HE_ERROR_ACT_DATA1);
+    cout << "Error Expected Data1:" << err << endl;
+
+    err = host_exe_->read64(HE_ERROR_ACT_DATA2);
+    cout << "Error Expected Data2:" << err << endl;
+
+    err = host_exe_->read64(HE_ERROR_ACT_DATA3);
+    cout << "Error Expected Data3:" << err << endl;
+
+    err = host_exe_->read64(HE_ERROR_ACT_DATA4);
+    cout << "Error Expected Data4:" << err << endl;
+
+    err = host_exe_->read64(HE_ERROR_ACT_DATA5);
+    cout << "Error Expected Data5:" << err << endl;
+
+    err = host_exe_->read64(HE_ERROR_ACT_DATA6);
+    cout << "Error Expected Data6:" << err << endl;
+
+    err = host_exe_->read64(HE_ERROR_ACT_DATA7);
+    cout << "Error Expected Data7:" << err << endl;
+  }
+
+  // Validate that an AFU has been attached.
+  // Returns 0 when host_exe_ is set and -1 otherwise; no options are
+  // actually parsed here.
+  int parse_input_options() {
+    return (host_exe_ != NULL) ? 0 : -1;
+  }
+
+  // Poll the first DSM byte until its bit 0 becomes set.
+  // Sleeps HELPBK_TEST_SLEEP_INVL microseconds between polls and gives up
+  // after HELPBK_TEST_TIMEOUT polls.
+  // Returns true when the bit was observed set, false on timeout.
+  bool he_wait_test_completion() {
+    // The AFU updates this byte, hence the volatile access.
+    volatile uint8_t *dsm_status = host_exe_->get_dsm();
+    uint32_t polls_left = HELPBK_TEST_TIMEOUT;
+
+    for (;;) {
+      const bool completed = ((*dsm_status) & 0x1) != 0;
+      if (completed) {
+        return true;
+      }
+
+      usleep(HELPBK_TEST_SLEEP_INVL);
+      polls_left -= 1;
+      if (polls_left == 0) {
+        cout << "HE LPBK TIME OUT" << std::endl;
+
+        return false;
+      }
+    }
+  }
+
+  // Select the NUMA node used for buffer allocation and store it in
+  // numa_node_.
+  //
+  // For HE_TARGET_HOST the node of the CPU this thread currently runs on is
+  // used; for any other target the node is hard-coded to 2.
+  //
+  // Returns false when the NUMA API is unavailable or, for the host target,
+  // when the node of the current CPU cannot be determined; numa_node_ is
+  // left untouched in both cases so the caller can choose a fallback.
+  bool verify_numa_node() {
+
+    if (numa_available() < 0) {
+      printf("System does not support NUMA API!\n");
+      return false;
+    }
+
+    printf("SUpported NUMA API!\n");
+
+    int n = numa_max_node();
+    printf("There are %d nodes on your system\n", n + 1);
+
+    int cup_num = sched_getcpu();
+    printf("cup_num:%d\n", cup_num);
+
+    int node = numa_node_of_cpu(cup_num);
+    printf("node:%d\n", node);
+
+    if (host_exe_->he_target_ == HE_TARGET_HOST) {
+      // Both sched_getcpu() and numa_node_of_cpu() report failure with a
+      // negative value; storing it in the unsigned numa_node_ would turn it
+      // into a huge bogus node number.
+      if (node < 0) {
+        printf("Failed to find numa node of cpu:%d\n", cup_num);
+        return false;
+      }
+      numa_node_ = static_cast<uint32_t>(node);
+      printf("HE_TARGET_HOST numa_node_:%u\n", numa_node_);
+
+    } else {
+      // TODO: find the FPGA numa node number instead of hard-coding it
+      numa_node_ = 2;
+      printf("HE_TARGET_FPGA numa_node_:%u\n", numa_node_);
+    }
+
+    int num_task = numa_num_task_nodes();
+    printf("num_task:%d\n", num_task);
+
+    return true;
+  }
+
+  // Entry point of the command.
+  //
+  // Binds the AFU, picks the NUMA node, resets the exerciser through
+  // HE_CTL.ResetL (written as 0, then 1) and dispatches to the test
+  // selected by host_exe_->he_test_.
+  //
+  // afu: AFU to run against; must be a host_exerciser instance.
+  // app: unused.
+  //
+  // Returns the result of the selected test (0 on success, -1 on failure),
+  // -1 when afu is not a host_exerciser, and 0 when he_test_ matches none
+  // of the known tests.
+  virtual int run(test_afu *afu, CLI::App *app) {
+    (void)app;
+
+    host_exe_ = dynamic_cast<host_exerciser *>(afu);
+    // dynamic_cast yields NULL for a foreign AFU type; every statement below
+    // dereferences host_exe_, so bail out early.
+    if (!host_exe_) {
+      cerr << "invalid afu: expected a host_exerciser instance" << endl;
+      return -1;
+    }
+
+    if (!verify_numa_node()) {
+      numa_node_ = 0;
+      cout << "numa nodes are not available, set numa node to 0" << endl;
+    }
+
+    // reset HE cache: drive ResetL to 0, then back to 1
+    he_ctl_.value = 0;
+    he_ctl_.ResetL = 0;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+
+    he_ctl_.value = 0;
+    he_ctl_.ResetL = 1;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+
+    if (host_exe_->he_test_ == HE_FPGA_RD_CACHE_HIT) {
+      return he_run_fpga_rd_cache_hit_test();
+    }
+
+    if (host_exe_->he_test_ == HE_FPGA_WR_CACHE_HIT) {
+      return he_run_fpga_wr_cache_hit_test();
+    }
+
+    if (host_exe_->he_test_ == HE_FPGA_RD_CACHE_MISS) {
+      return he_run_fpga_rd_cache_miss_test();
+    }
+
+    if (host_exe_->he_test_ == HE_FPGA_WR_CACHE_MISS) {
+      return he_run_fpga_wr_cache_miss_test();
+    }
+
+    if (host_exe_->he_test_ == HE_HOST_RD_CACHE_HIT) {
+      return he_run_host_rd_cache_hit_test();
+    }
+
+    if (host_exe_->he_test_ == HE_HOST_WR_CACHE_HIT) {
+      return he_run_host_wr_cache_hit_test();
+    }
+
+    if (host_exe_->he_test_ == HE_HOST_RD_CACHE_MISS) {
+      return he_run_host_rd_cache_miss_test();
+    }
+
+    if (host_exe_->he_test_ == HE_HOST_WR_CACHE_MISS) {
+      return he_run_host_wr_cache_miss_test();
+    }
+
+    return 0;
+  }
+
+protected:
+  host_exerciser *host_exe_; // AFU under test; assigned in run() via dynamic_cast
+  token::ptr_t token_; // NOTE(review): not used by any method in this header
+
+  he_ctl he_ctl_; // value written to the HE_CTL register
+  he_info he_info_; // last value read from the HE_INFO register
+  he_rd_config he_rd_cfg_; // value written to the HE_RD_CONFIG register
+  he_wr_config he_wr_cfg_; // value written to the HE_WR_CONFIG register
+
+  he_rd_addr_table_ctrl rd_table_ctl_; // value written to HE_RD_ADDR_TABLE_CTRL
+  he_wr_addr_table_ctrl wr_table_ctl_; // value written to HE_WR_ADDR_TABLE_CTRL
+  uint8_t *dsm_buf_; // NOTE(review): not used by any method in this header
+  uint8_t *rd_buf_; // NOTE(review): not used by any method in this header
+
+  uint32_t numa_node_; // NUMA node passed to the buffer allocators; set by verify_numa_node() / run()
+};
+};
+
+// Worker thread body used by the host cache-hit tests.
+//
+// Repeatedly reads one 64-bit word from every 64-byte line of
+// [buf_ptr, buf_ptr + len), wrapping around at the end, until g_stop_thread
+// becomes true. The reads go through a volatile pointer so the compiler
+// cannot drop them.
+//
+// buf_ptr: start of the buffer to walk; the function returns at once when
+//          it is NULL.
+// len:     buffer size in bytes; the function returns at once when it is 0.
+//          Buffers shorter than one line are never read.
+void he_cache_thread(uint8_t *buf_ptr, uint64_t len) {
+  // Step between two accesses. It must match the divisor used to compute
+  // cache_lines, otherwise only a fraction of the buffer is visited.
+  const uint64_t cache_line_size = 64;
+
+  cout << "he_cache_thread  enter" << endl;
+  if (buf_ptr == NULL || len == 0) {
+    return;
+  }
+
+  const uint64_t cache_lines = len / cache_line_size;
+  cout << "he_cache_thread  cache_lines:" << cache_lines << endl;
+
+  uint64_t line = 0;
+  while (g_stop_thread != true) {
+    if (line < cache_lines) {
+      // One access per line is enough to touch the whole line.
+      uint64_t value =
+          *((volatile uint64_t *)(buf_ptr + line * cache_line_size));
+      UNUSED_PARAM(value);
+    }
+    line++;
+    if (line >= cache_lines) {
+      line = 0;
+    }
+  }
+
+  cout << "he_cache_thread g_stop_thread " << endl;
+}
+
+} // end of namespace host_exerciser
diff --git a/samples/cxl_host_exerciser/dfl-he-cache.h b/samples/cxl_host_exerciser/dfl-he-cache.h
new file mode 100644
index 000000000000..b95df7414bdb
--- /dev/null
+++ b/samples/cxl_host_exerciser/dfl-he-cache.h
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Header File for host exerciser cache DFL User API
+ *
+ * Copyright (C) 2023 Intel Corporation, Inc.
+ *
+ * Authors:
+ *   Tim Whisonant <tim.whisonant@intel.com>
+ *   Ananda Ravuri <ananda.ravuri@intel.com>
+ *   Russell H. Weight <russell.h.weight@intel.com>
+ */
+
+#ifndef _UAPI_LINUX_HE_CACHE_DFL_H
+#define _UAPI_LINUX_HE_CACHE_DFL_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#define DFL_HE_CACHE_API_VERSION 0
+
+/*
+ * The IOCTL interface for DFL based HE CACHE is designed for extensibility by
+ * embedding the structure length (argsz) and flags into structures passed
+ * between kernel and userspace. This design referenced the VFIO IOCTL
+ * interface (include/uapi/linux/vfio.h).
+ */
+
+#define DFL_HE_CACHE_MAGIC 0xB6
+
+#define DFL_HE_CACHE_BASE 0
+
+/**
+ * DFL_HE_CACHE_GET_API_VERSION - _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 0)
+ *
+ * Report the version of the driver API.
+ * Return: Driver API Version.
+ */
+
+#define DFL_HE_CACHE_GET_API_VERSION                                           \
+  _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 0)
+
+/**
+ * DFL_HE_CACHE_CHECK_EXTENSION - _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 1)
+ *
+ * Check whether an extension is supported.
+ * Return: 0 if not supported, otherwise the extension is supported.
+ */
+
+#define DFL_HE_CACHE_CHECK_EXTENSION                                           \
+  _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 1)
+
+#define DFL_HE_CACHE_GET_REGION_INFO                                           \
+  _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 2)
+
+/**
+ * DFL_HE_CACHE_GET_REGION_INFO - _IO(DFL_HE_CACHE_MAGIC,
+ *                                    DFL_HE_CACHE_BASE + 2)
+ *
+ * Retrieve information about a device memory region.
+ * Caller provides struct dfl_he_cache_region_info with argsz set.
+ * Driver returns the region info in other fields.
+ * Return: 0 on success, -errno on failure.
+ */
+struct dfl_he_cache_region_info {
+  /* Input */
+  __u32 argsz; /* Structure length */
+  /* Output */
+  __u32 flags;                             /* Access permission */
+#define DFL_HE_CACHE_REGION_READ (1 << 0)  /* Region is readable */
+#define DFL_HE_CACHE_REGION_WRITE (1 << 1) /* Region is writable */
+#define DFL_HE_CACHE_REGION_MMAP (1 << 2)  /* Can be mmaped to userspace */
+  __u64 size;                              /* Region size (bytes) */
+  __u64 offset; /* Region offset from start of device fd */
+};
+
+#define DFL_HE_CACHE_SET_DSM_INFO _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 3)
+
+struct dfl_he_cache_dsm_info {
+  /* Input */
+  __u32 argsz;     /* Structure length */
+  __u64 user_addr; /* Process virtual address */
+  __u64 length;    /* Length of mapping (bytes)*/
+};
+
+#define DFL_HE_CACHE_CLEAR_DSM_INFO                                            \
+  _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 4)
+
+#define DFL_HE_CACHE_ALLOC_ADDR_TABLE                                          \
+  _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 5)
+#define DFL_HE_CACHE_FREE_ADDR_TABLE                                           \
+  _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 6)
+#define DFL_HE_CACHE_APPEND_ADDR_TABLE                                         \
+  _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 7)
+
+#define DFL_HE_CACHE_NUM_LINES_MIN 1
+#define DFL_HE_CACHE_NUM_LINES_MAX 0xffff
+
+struct dfl_he_cache_addr_table {
+  /* Input */
+  __u32 argsz; /* Structure length */
+  __u32 flags; /* Address Table ID */
+#define DFL_HE_CACHE_READ_ADDR_TABLE (1 << 0)
+#define DFL_HE_CACHE_WRITE_ADDR_TABLE (1 << 1)
+  __u32 cache_lines; /* Buffer size/offset in cache lines */
+};
+
+#define DFL_HE_CACHE_NUMA_DMA_MAP                                              \
+  _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 12)
+#define DFL_HE_CACHE_NUMA_DMA_UNMAP                                            \
+  _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 13)
+
+#define DFL_ARRAY_MAX_SIZE 0x10
+
+struct dfl_he_cache_dma_map {
+  /* Input */
+  __u32 argsz;                         /* Structure length */
+  __u32 flags;                         /* flags */
+  __u64 user_addr;                     /* Process virtual address */
+  __u64 length;                        /* Length of mapping (bytes)*/
+  __u32 numa_node;                     /* Node 0,1 2 */
+  __u64 csr_array[DFL_ARRAY_MAX_SIZE]; /* CSR  */
+};
+
+struct dfl_he_cache_dma_unmap {
+  /* Input */
+  __u32 argsz;                         /* Structure length */
+  __u32 flags;                         /* flags */
+  __u64 user_addr;                     /* Process virtual address */
+  __u64 length;                        /* Length of mapping (bytes)*/
+  __u64 csr_array[DFL_ARRAY_MAX_SIZE]; /* CSR  */
+};
+
+#endif /* _UAPI_LINUX_HE_CACHE_DFL_H */
diff --git a/samples/cxl_host_exerciser/he_cache_test.h b/samples/cxl_host_exerciser/he_cache_test.h
new file mode 100644
index 000000000000..1e3b4d503c60
--- /dev/null
+++ b/samples/cxl_host_exerciser/he_cache_test.h
@@ -0,0 +1,829 @@
+// Copyright(c) 2023, Intel Corporation
+//
+// Redistribution  and  use  in source  and  binary  forms,  with  or  without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of  source code  must retain the  above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name  of Intel Corporation  nor the names of its contributors
+//   may be used to  endorse or promote  products derived  from this  software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,  BUT NOT LIMITED TO,  THE
+// IMPLIED WARRANTIES OF  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.  IN NO EVENT  SHALL THE COPYRIGHT OWNER  OR CONTRIBUTORS BE
+// LIABLE  FOR  ANY  DIRECT,  INDIRECT,  INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR
+// CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT  NOT LIMITED  TO,  PROCUREMENT  OF
+// SUBSTITUTE GOODS OR SERVICES;  LOSS OF USE,  DATA, OR PROFITS;  OR BUSINESS
+// INTERRUPTION)  HOWEVER CAUSED  AND ON ANY THEORY  OF LIABILITY,  WHETHER IN
+// CONTRACT,  STRICT LIABILITY,  OR TORT  (INCLUDING NEGLIGENCE  OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+#pragma once
+
+#include <CLI/CLI.hpp>
+#include <future>
+#include <glob.h>
+#include <inttypes.h>
+#include <numa.h>
+#include <opae/cxx/core.h>
+#include <regex.h>
+#include <spdlog/sinks/basic_file_sink.h>
+#include <spdlog/sinks/stdout_color_sinks.h>
+#include <spdlog/spdlog.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include "dfl-he-cache.h"
+
+using namespace std;
+
+const char *sbdf_pattern =
+    "(([0-9a-fA-F]{4}):)?([0-9a-fA-F]{2}):([0-9a-fA-F]{2})\\.([0-9])";
+
+enum { MATCHES_SIZE = 6 };
+#define FEATURE_DEV                                                            \
+  "/sys/bus/pci/devices/%s/"                                                   \
+  "fpga_region/region*/dfl-fme*/dfl_dev*/feature_id"
+
+#define MAX_SIZE 256
+
+#define PROTECTION (PROT_READ | PROT_WRITE)
+
+#ifndef MAP_HUGETLB
+#define MAP_HUGETLB 0x40000
+#endif
+#ifndef MAP_HUGE_SHIFT
+#define MAP_HUGE_SHIFT 26
+#endif
+
+#define MAP_2M_HUGEPAGE (0x15 << MAP_HUGE_SHIFT) /* 2 ^ 0x15 = 2M */
+#define MAP_1G_HUGEPAGE (0x1e << MAP_HUGE_SHIFT) /* 2 ^ 0x1e = 1G */
+
+#ifdef __ia64__
+#define ADDR ((void *)(0x8000000000000000UL))
+#define FLAGS_4K (MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED)
+#define FLAGS_2M (FLAGS_4K | MAP_2M_HUGEPAGE | MAP_HUGETLB)
+#define FLAGS_1G (FLAGS_4K | MAP_1G_HUGEPAGE | MAP_HUGETLB)
+#else
+#define ADDR ((void *)(0x0UL))
+#define FLAGS_4K (MAP_PRIVATE | MAP_ANONYMOUS)
+#define FLAGS_2M (FLAGS_4K | MAP_2M_HUGEPAGE | MAP_HUGETLB)
+#define FLAGS_1G (FLAGS_4K | MAP_1G_HUGEPAGE | MAP_HUGETLB)
+#endif
+
+#define KiB(x) ((x)*1024)
+#define MiB(x) ((x)*1024 * 1024)
+#define GiB(x) ((x)*1024 * 1024 * 1024)
+
+#define DFL_HE_CACHE_DSM_BASE 0x030
+#define DFL_HE_CACHE_WR_ADDR_TABLE_DATA 0x068
+#define DFL_HE_CACHE_RD_ADDR_TABLE_DATA 0x088
+
+void *alloc_2mb_hugepage(void) {
+  void *addr;
+
+  addr = mmap(ADDR, MiB(2), PROTECTION, FLAGS_2M, 0, 0);
+  if (addr == MAP_FAILED) {
+    printf("alloc_2mb_hugepage() failed: %s\n", strerror(errno));
+    addr = NULL;
+  }
+
+  return addr;
+}
+void free_memory(void *addr, uint64_t len) { munmap(addr, len); }
+
+void *alloc_32kb_hugepage(void) {
+  // Maps 32 KiB of anonymous memory with FLAGS_4K; returns NULL on failure.
+  void *addr = mmap(ADDR, KiB(32), PROTECTION, FLAGS_4K, 0, 0);
+
+  if (addr == MAP_FAILED) {
+    printf("alloc_32kb_hugepage() failed: %s\n", strerror(errno));
+    addr = NULL;
+  }
+
+  return addr;
+}
+
+void *alloc_4kb_hugepage(void) {
+  // Maps 4 KiB of anonymous memory with FLAGS_4K; returns NULL on failure.
+  void *addr = mmap(ADDR, KiB(4), PROTECTION, FLAGS_4K, 0, 0);
+
+  if (addr == MAP_FAILED) {
+    printf("alloc_4kb_hugepage() failed: %s\n", strerror(errno));
+    addr = NULL;
+  }
+
+  return addr;
+}
+
+// Parses the first line of `path` as hex into *value; false if it cannot.
+bool sysfs_read_u64(const char *path, uint64_t *value) {
+  std::ifstream fs(path, std::ios::in);
+  std::string line;
+  if (!value || !fs.is_open() || !std::getline(fs, line))
+    return false;
+  try {
+    *value = std::stoull(line, nullptr, 16);
+  } catch (const std::exception &) {
+    // Empty or non-hex content: report failure instead of throwing.
+    return false;
+  }
+  return true;
+}
+
+namespace opae {
+namespace afu_test {
+
+namespace fpga = fpga::types;
+
+template <typename T>
+inline bool parse_match_int(const char *s, regmatch_t m, T &v, int radix = 10) {
+  if (m.rm_so == -1 || m.rm_eo == -1)
+    return false;
+  errno = 0;
+  v = std::strtoul(s + m.rm_so, NULL, radix);
+  return errno == 0;
+}
+
+union pcie_address {
+  struct {
+    uint32_t function : 3;
+    uint32_t device : 5;
+    uint32_t bus : 8;
+    uint32_t domain : 16;
+  } fields;
+  uint32_t value;
+
+  static pcie_address parse(const char *s) {
+    auto deleter = [&](regex_t *r) {
+      regfree(r);
+      delete r;
+    };
+    std::unique_ptr<regex_t, decltype(deleter)> re(new regex_t, deleter);
+    regmatch_t matches[MATCHES_SIZE];
+
+    int reg_res = regcomp(re.get(), sbdf_pattern, REG_EXTENDED | REG_ICASE);
+    if (reg_res)
+      throw std::runtime_error("could not compile regex");
+
+    reg_res = regexec(re.get(), s, MATCHES_SIZE, matches, 0);
+    if (reg_res)
+      throw std::runtime_error("pcie address not valid format");
+
+    uint16_t domain, bus, device, function;
+    if (!parse_match_int(s, matches[2], domain, 16))
+      domain = 0;
+    if (!parse_match_int(s, matches[3], bus, 16))
+      throw std::runtime_error("error parsing pcie address");
+    if (!parse_match_int(s, matches[4], device, 16))
+      throw std::runtime_error("error parsing pcie address");
+    if (!parse_match_int(s, matches[5], function))
+      throw std::runtime_error("error parsing; pcie address");
+    pcie_address a;
+    a.fields.domain = domain;
+    a.fields.bus = bus;
+    a.fields.device = device;
+    a.fields.function = function;
+    return a;
+  }
+};
+
+class afu; // forward declaration
+
+class command {
+public:
+  typedef std::shared_ptr<command> ptr_t;
+  command() : running_(true) {}
+  virtual ~command() {}
+  virtual const char *name() const = 0;
+  virtual const char *description() const = 0;
+  virtual int run(afu *afu, CLI::App *app) = 0;
+  virtual void add_options(CLI::App *app) { (void)app; }
+  virtual const char *afu_id() const { return nullptr; }
+
+  virtual uint64_t featureid() const = 0;
+  virtual uint64_t guidl() const = 0;
+  virtual uint64_t guidh() const = 0;
+
+  bool running() const { return running_; }
+  void stop() { running_ = false; }
+
+private:
+  std::atomic<bool> running_;
+};
+
+#if SPDLOG_VERSION >= 10900
+// spdlog version 1.9.0 defines SPDLOG_LEVEL_NAMES as an array of string_view_t.
+// Convert to vector of std::string to be used in CLI::IsMember().
+inline std::vector<std::string> spdlog_levels() {
+  std::vector<spdlog::string_view_t> levels_view = SPDLOG_LEVEL_NAMES;
+  std::vector<std::string> levels_str(levels_view.size());
+  std::transform(levels_view.begin(), levels_view.end(), levels_str.begin(),
+                 [](spdlog::string_view_t sv) {
+                   return std::string(sv.data(), sv.size());
+                 });
+  return levels_str;
+}
+#else
+inline std::vector<std::string> spdlog_levels() { return SPDLOG_LEVEL_NAMES; }
+#endif // SPDLOG_VERSION
+
+class afu {
+public:
+  typedef int (*command_fn)(afu *afu, CLI::App *app);
+  enum exit_codes {
+    success = 0,
+    not_run,
+    not_found,
+    no_access,
+    exception,
+    error
+  };
+
+  afu(const char *name, const char *afu_id = nullptr,
+      const char *log_level = nullptr)
+      : name_(name), afu_id_(afu_id ? afu_id : ""), app_(name_), pci_addr_(""),
+        log_level_(log_level ? log_level : "info"), timeout_msec_(60000),
+        current_command_(nullptr) {
+    if (!afu_id_.empty())
+      app_.add_option("-g,--guid", afu_id_, "GUID")->default_str(afu_id_);
+    app_.add_option("-p,--pci-address", pci_addr_,
+                    "[<domain>:]<bus>:<device>.<function>");
+    app_.add_option("-l,--log-level", log_level_, "stdout logging level")
+        ->default_str(log_level_)
+        ->check(CLI::IsMember(spdlog_levels()));
+    app_.add_option("-t,--timeout", timeout_msec_, "test timeout (msec)")
+        ->default_str(std::to_string(timeout_msec_));
+  }
+  virtual ~afu() {
+    if (logger_)
+      spdlog::drop(logger_->name());
+  }
+
+  CLI::App &cli() { return app_; }
+
+  // Locates the he-cache character device and stores its path in dev_path_.
+  //
+  // Globs FEATURE_DEV (restricted to pci_addr_ when one was given), picks the
+  // first feature_id file equal to current_command()->featureid(), then globs
+  // the sibling he-cache/he-cache* entry to build "/dev/<name>".
+  // Returns 0 on success and a non-zero step code (1-5) on failure.
+  int find_dev_feature() {
+    glob_t pglob;
+    char feature_path[MAX_SIZE] = {0};
+    uint64_t value = 0;
+
+    const char *sbdf = pci_addr_.empty() ? "*:*:*.*" : pci_addr_.c_str();
+    int len = snprintf(feature_path, sizeof(feature_path), FEATURE_DEV, sbdf);
+    // snprintf reports truncation by returning the untruncated length.
+    if (len < 0 || (size_t)len >= sizeof(feature_path)) {
+      cerr << "snprintf buffer overflow" << endl;
+      return pci_addr_.empty() ? 2 : 1;
+    }
+
+    int gres = glob(feature_path, GLOB_NOSORT, NULL, &pglob);
+    if (gres) {
+      cerr << "Failed pattern match" << feature_path << ":" << strerror(errno)
+           << endl;
+      globfree(&pglob);
+      return 3;
+    }
+
+    for (size_t i = 0; i < pglob.gl_pathc; i++) {
+      if (!sysfs_read_u64(pglob.gl_pathv[i], &value)) {
+        cerr << "Failed to read sysfs value" << endl;
+        continue;
+      }
+      if (current_command()->featureid() != value)
+        continue;
+
+      // Matching feature: look for the he-cache node next to feature_id.
+      string str(pglob.gl_pathv[i]);
+      string substr_dev(str.substr(0, str.rfind("/")));
+      globfree(&pglob);
+
+      substr_dev.append("/he-cache/he-cache*");
+      gres = glob(substr_dev.c_str(), GLOB_NOSORT, NULL, &pglob);
+      if (gres) {
+        cerr << "Failed pattern match" << substr_dev.c_str() << ":"
+             << strerror(errno) << endl;
+        globfree(&pglob);
+        return 4;
+      }
+      string str1(pglob.gl_pathv[0]);
+      globfree(&pglob);
+      dev_path_.append("/dev");
+      // Takes at most 13 characters starting at the final '/'.
+      dev_path_.append(str1.substr(str1.rfind("/"), 13));
+      return 0;
+    }
+
+    // No feature matched: release the first glob result before giving up.
+    globfree(&pglob);
+    return 5;
+  }
+
+  void unmap_mmio() {
+    if (mmio_base_) {
+      if (munmap(mmio_base_, rinfo_.size) == -1)
+        cerr << "Failed to unmap MMIO:" << strerror(errno) << endl;
+    }
+  }
+
+  bool map_mmio() {
+    void *user_v;
+    user_v = mmap(NULL, rinfo_.size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_,
+                  rinfo_.offset);
+    if (user_v == MAP_FAILED) {
+      cerr << "Failed to map MMIO:" << strerror(errno) << endl;
+      return false;
+    }
+    mmio_base_ = (uint8_t *)user_v;
+
+    return true;
+  }
+
+  // Opens dev_path_, queries the MMIO region and maps it at mmio_base_.
+  // Returns exit_codes::not_run on success; main() exits on any other value.
+  int open_handle() {
+    cout << "dev_path_:" << dev_path_ << endl;
+
+    fd_ = open(dev_path_.c_str(), O_RDWR);
+    if (fd_ < 0) {
+      cerr << "open() failed:" << strerror(errno) << endl;
+      // Must differ from not_run (== 1), which signals success to main().
+      return exit_codes::no_access;
+    }
+
+    memset(&rinfo_, 0, sizeof(rinfo_));
+    rinfo_.argsz = sizeof(rinfo_);
+    int res = ioctl(fd_, DFL_HE_CACHE_GET_REGION_INFO, &rinfo_);
+    if (res) {
+      cerr << "ioctl() DFL_HE_CACHE_GET_REGION_INFO failed:" << strerror(errno)
+           << endl;
+      close(fd_);
+      return 2;
+    }
+
+    printf("MMIO region flags: 0x%x size: %llu offset: %llu\n", rinfo_.flags,
+           rinfo_.size, rinfo_.offset);
+
+    if (!map_mmio()) {
+      cerr << "mmap failed:" << strerror(errno) << endl;
+      close(fd_);
+      return 3;
+    }
+
+    volatile uint64_t *u64 = (volatile uint64_t *)mmio_base_;
+    printf("DFH     : 0x%016" PRIx64 "\n", *u64);
+    printf("DFH + 8 : 0x%016" PRIx64 "\n", *(u64 + 1));
+    printf("DFH + 16: 0x%016" PRIx64 "\n", *(u64 + 2));
+    printf("DFH + 24: 0x%016" PRIx64 "\n", *(u64 + 3));
+    return exit_codes::not_run;
+  }
+
+  int main(int argc, char *argv[]) {
+    if (!commands_.empty())
+      app_.require_subcommand();
+    CLI11_PARSE(app_, argc, argv);
+
+    command::ptr_t test(nullptr);
+    CLI::App *app = nullptr;
+    for (auto kv : commands_) {
+      if (*kv.first) {
+        app = kv.first;
+        test = kv.second;
+        break;
+      }
+    }
+    if (!test) {
+      std::cerr << "no command specified\n";
+      return exit_codes::not_run;
+    }
+
+    auto console_sink = std::make_shared<spdlog::sinks::stdout_color_sink_mt>();
+    logger_ = std::make_shared<spdlog::logger>(test->name(), console_sink);
+    spdlog::register_logger(logger_);
+    logger_->set_level(spdlog::level::from_str(log_level_));
+    current_command_ = test;
+    if (find_dev_feature() != 0) {
+      cerr << "fails to find feature" << endl;
+      return exit_codes::exception;
+    };
+
+    int res = open_handle();
+    if (res != exit_codes::not_run) {
+      return res;
+    }
+
+    return run(app, test);
+  }
+
+  virtual int run(CLI::App *app, command::ptr_t test) {
+    int res = exit_codes::not_run;
+    current_command_ = test;
+
+    try {
+      std::future<int> f = std::async(std::launch::async, [this, test, app]() {
+        return test->run(this, app);
+      });
+      auto status = f.wait_for(std::chrono::milliseconds(timeout_msec_));
+      if (status == std::future_status::timeout) {
+        std::cerr << "Error: test timed out" << std::endl;
+        current_command_->stop();
+        throw std::runtime_error("timeout");
+      }
+      res = f.get();
+    } catch (std::exception &ex) {
+      res = exit_codes::exception;
+    }
+
+    current_command_.reset();
+    return res;
+  }
+
+  template <class T> CLI::App *register_command() {
+    command::ptr_t cmd(new T());
+    auto sub = app_.add_subcommand(cmd->name(), cmd->description());
+    cmd->add_options(sub);
+    commands_[sub] = cmd;
+    return sub;
+  }
+
+  uint64_t read64(uint32_t offset) {
+    // volatile: the compiler must not cache or elide this device load.
+    return *((volatile uint64_t *)(mmio_base_ + offset));
+  }
+
+  void write64(uint32_t offset, uint64_t value) {
+    // volatile: the compiler must not elide or merge this device store.
+    *((volatile uint64_t *)(mmio_base_ + offset)) = value;
+  }
+
+  uint32_t read32(uint32_t offset) {
+    // 32-bit volatile load; a 64-bit access would also read the next register.
+    return *((volatile uint32_t *)(mmio_base_ + offset));
+  }
+
+  void write32(uint32_t offset, uint32_t value) {
+    // volatile: the compiler must not elide or merge this device store.
+    *((volatile uint32_t *)(mmio_base_ + offset)) = value;
+  }
+
+  command::ptr_t current_command() const { return current_command_; }
+
+  bool open_device() {
+
+    // std::cerr << "open\n" << dev_str;
+    fd_ = open(dev_path_.c_str(), O_RDWR);
+    if (fd_ < 0) {
+      printf("open() failed: %s\n", strerror(errno));
+      return false;
+    }
+
+    return true;
+  }
+
+  bool close_device() {
+    if (fd_ > 0)
+      close(fd_);
+    return true;
+  }
+
+  bool allocate_dsm(size_t len = KiB(4), uint32_t node = 0) {
+    int res = 0;
+    void *ptr = NULL;
+    struct dfl_he_cache_dma_map dma_map;
+    // NOTE(review): the buffer is fixed at 4 KiB but `len` is what gets mapped.
+
+    memset(&dma_map, 0, sizeof(dma_map));
+
+    ptr = alloc_4kb_hugepage();
+    if (!ptr) {
+      cerr << "failed to allocate 4k huge page:" << strerror(errno) << endl;
+      return false;
+    }
+
+    dma_map.argsz = sizeof(dma_map);
+    dma_map.user_addr = (__u64)ptr;
+    dma_map.length = len;
+    dma_map.numa_node = node;
+    dma_map.csr_array[0] = DFL_HE_CACHE_DSM_BASE; // 0x030
+
+    volatile uint64_t *u64 =
+        (volatile uint64_t *)(mmio_base_ + DFL_HE_CACHE_DSM_BASE);
+
+    res = ioctl(fd_, DFL_HE_CACHE_NUMA_DMA_MAP, &dma_map);
+    if (res) {
+      cerr << "ioctl DFL_HE_CACHE_NUMA_NODE_DSM_INFO failed" << strerror(errno)
+           << endl;
+      goto out_free;
+    }
+    printf("DSM_BASE: 0x%016" PRIx64 "\n", *u64);
+
+    dsm_buffer_ = (uint8_t *)ptr;
+    dsm_buf_len_ = len;
+    return true;
+
+  out_free:
+    free_memory(ptr, len);
+    return false;
+  }
+
+  bool free_dsm() {
+    struct dfl_he_cache_dma_unmap dma_unmap;
+    int res = 0;
+
+    // cout << "free_dsm\n" << endl;
+    memset(&dma_unmap, 0, sizeof(dma_unmap));
+
+    dma_unmap.argsz = sizeof(dma_unmap);
+    dma_unmap.user_addr = (__u64)dsm_buffer_;
+    dma_unmap.length = dsm_buf_len_;
+    dma_unmap.csr_array[0] = DFL_HE_CACHE_DSM_BASE; // 0x030
+
+    volatile uint64_t *u64 =
+        (volatile uint64_t *)(mmio_base_ + DFL_HE_CACHE_DSM_BASE);
+
+    res = ioctl(fd_, DFL_HE_CACHE_NUMA_DMA_UNMAP, &dma_unmap);
+    if (res) {
+      cerr << "ioctl DFL_HE_CACHE_NUMA_DMA_UNMAP failed" << strerror(errno)
+           << endl;
+    }
+    printf("DSM_BASE: 0x%016" PRIx64 "\n", *u64);
+    free_memory(dsm_buffer_, dsm_buf_len_);
+
+    return true;
+  }
+
+  bool allocate_cache_read(size_t len = MiB(2), uint32_t numa_node = 0) {
+
+    int res = 0;
+    void *ptr = NULL;
+    struct dfl_he_cache_dma_map dma_map;
+
+    // cout << "allocate_cache_read\n";
+
+    memset(&dma_map, 0, sizeof(dma_map));
+
+    ptr = alloc_2mb_hugepage();
+    if (!ptr) {
+      cerr << "failed to allocate huge pages\n" << endl;
+      return false;
+    }
+
+    cout << "numa_node: " << numa_node << endl;
+
+    dma_map.argsz = sizeof(dma_map);
+    dma_map.user_addr = (__u64)ptr;
+    dma_map.length = len;
+    dma_map.numa_node = numa_node;
+    dma_map.csr_array[0] = DFL_HE_CACHE_RD_ADDR_TABLE_DATA; // 0x88
+
+    volatile uint64_t *u64 =
+        (volatile uint64_t *)(mmio_base_ + DFL_HE_CACHE_RD_ADDR_TABLE_DATA);
+
+    res = ioctl(fd_, DFL_HE_CACHE_NUMA_DMA_MAP, &dma_map);
+    if (res) {
+      cerr << "ioctl DFL_HE_CACHE_NUMA_DMA_MAP failed" << strerror(errno)
+           << endl;
+      goto out_free;
+    }
+    printf("DFL_HE_CACHE_RD_ADDR_TABLE_DATA: 0x%016" PRIx64 "\n", *u64);
+
+    rd_buffer_ = (uint8_t *)ptr;
+    rd_buf_len_ = len;
+    return true;
+
+  out_free:
+    free_memory(ptr, len);
+    return false;
+  }
+
+  bool free_cache_read() {
+    struct dfl_he_cache_dma_unmap dma_unmap;
+    int res = 0;
+
+    memset(&dma_unmap, 0, sizeof(dma_unmap));
+
+    dma_unmap.argsz = sizeof(dma_unmap);
+    dma_unmap.user_addr = (__u64)rd_buffer_;
+    dma_unmap.length = rd_buf_len_;
+    dma_unmap.csr_array[0] = DFL_HE_CACHE_RD_ADDR_TABLE_DATA; // 0x88
+
+    volatile uint64_t *u64 =
+        (volatile uint64_t *)(mmio_base_ + DFL_HE_CACHE_RD_ADDR_TABLE_DATA);
+    res = ioctl(fd_, DFL_HE_CACHE_NUMA_DMA_UNMAP, &dma_unmap);
+    if (res) {
+      cerr << "ioctl DFL_HE_CACHE_NUMA_DMA_UNMAP failed" << strerror(errno)
+           << endl;
+    }
+
+    printf("DFL_HE_CACHE_RD_ADDR_TABLE_DATA: 0x%016" PRIx64 "\n", *u64);
+    free_memory(rd_buffer_, rd_buf_len_);
+
+    return true;
+  }
+
+  // Allocates a 2MB huge page, DMA-maps it for the write address table CSR
+  // and records pointer and length so free_cache_write() can release it.
+  bool allocate_cache_write(size_t len = MiB(2), uint32_t numa_node = 0) {
+    int res;
+    void *ptr;
+    struct dfl_he_cache_dma_map dma_map;
+
+    memset(&dma_map, 0, sizeof(dma_map));
+
+    ptr = alloc_2mb_hugepage();
+    if (!ptr) {
+      cerr << "failed to allocate huge pages\n" << endl;
+      return false;
+    }
+
+    dma_map.argsz = sizeof(dma_map);
+    dma_map.user_addr = (__u64)ptr;
+    dma_map.length = len;
+    dma_map.numa_node = numa_node;
+    dma_map.csr_array[0] = DFL_HE_CACHE_WR_ADDR_TABLE_DATA; // 0x68;
+
+    volatile uint64_t *u64 =
+        (volatile uint64_t *)(mmio_base_ + DFL_HE_CACHE_WR_ADDR_TABLE_DATA);
+
+    res = ioctl(fd_, DFL_HE_CACHE_NUMA_DMA_MAP, &dma_map);
+    if (res) {
+      cerr << "ioctl DFL_HE_CACHE_NUMA_DMA_MAP failed" << strerror(errno)
+           << endl;
+      goto out_free;
+    }
+    printf("\nDFL_HE_CACHE_WR_ADDR_TABLE_DATA: 0x%016" PRIx64 "\n", *u64);
+
+    wr_buffer_ = (uint8_t *)ptr;
+    wr_buf_len_ = len;
+    return true;
+
+  out_free:
+    free_memory(ptr, len);
+    return false;
+  }
+
+  bool free_cache_write() {
+    struct dfl_he_cache_dma_unmap dma_unmap;
+    int res;
+
+    // cout << "free_cache_write" << endl;
+    memset(&dma_unmap, 0, sizeof(dma_unmap));
+
+    dma_unmap.argsz = sizeof(dma_unmap);
+    dma_unmap.user_addr = (__u64)wr_buffer_;
+    dma_unmap.length = wr_buf_len_;
+    dma_unmap.csr_array[0] = DFL_HE_CACHE_WR_ADDR_TABLE_DATA; // 0x68;
+
+    volatile uint64_t *u64 =
+        (volatile uint64_t *)(mmio_base_ + DFL_HE_CACHE_WR_ADDR_TABLE_DATA);
+    res = ioctl(fd_, DFL_HE_CACHE_NUMA_DMA_UNMAP, &dma_unmap);
+    if (res) {
+      cerr << "ioctl DFL_HE_CACHE_NUMA_DMA_UNMAP failed" << strerror(errno)
+           << endl;
+    }
+
+    printf("\nDFL_HE_CACHE_WR_ADDR_TABLE_DATA: 0x%016" PRIx64 "\n", *u64);
+    free_memory(wr_buffer_, wr_buf_len_);
+
+    return true;
+  }
+
+  bool allocate_cache_read_write(size_t len = MiB(2), uint32_t numa_node = 0) {
+
+    int res = 0;
+    void *ptr = NULL;
+    struct dfl_he_cache_dma_map dma_map;
+
+    // cout<< "allocate_cache_read_write";
+
+    memset(&dma_map, 0, sizeof(dma_map));
+    ptr = alloc_2mb_hugepage();
+    if (!ptr) {
+      cerr << "failed to allocate huge pages\n" << endl;
+      return false;
+    }
+
+    dma_map.argsz = sizeof(dma_map);
+    dma_map.user_addr = (__u64)ptr;
+    dma_map.length = len;
+    dma_map.numa_node = numa_node;
+    dma_map.csr_array[0] = DFL_HE_CACHE_RD_ADDR_TABLE_DATA; // 0x88;
+    dma_map.csr_array[1] = DFL_HE_CACHE_WR_ADDR_TABLE_DATA; // 0x68;
+
+    volatile uint64_t *u64_wr =
+        (volatile uint64_t *)(mmio_base_ + DFL_HE_CACHE_WR_ADDR_TABLE_DATA);
+    volatile uint64_t *u64_rd =
+        (volatile uint64_t *)(mmio_base_ + DFL_HE_CACHE_RD_ADDR_TABLE_DATA);
+
+    res = ioctl(fd_, DFL_HE_CACHE_NUMA_DMA_MAP, &dma_map);
+    if (res) {
+      cerr << "ioctl DFL_HE_CACHE_NUMA_DMA_MAP failed" << strerror(errno)
+           << endl;
+      goto out_free;
+    }
+
+    printf("\nDFL_HE_CACHE_WR_ADDR_TABLE_DATA: 0x%016" PRIx64 "\n", *u64_wr);
+    printf("\nDFL_HE_CACHE_RD_ADDR_TABLE_DATAs: 0x%016" PRIx64 "\n", *u64_rd);
+
+    rd_wr_buffer_ = (uint8_t *)ptr;
+    rd_wr_buf_len_ = len;
+
+    return true;
+
+  out_free:
+    free_memory(ptr, len);
+    return false;
+  }
+
+  bool free_cache_read_write() {
+    struct dfl_he_cache_dma_unmap dma_unmap;
+    int res;
+
+    // cout << "free_cache_read_write\n" << endl;
+
+    memset(&dma_unmap, 0, sizeof(dma_unmap));
+
+    dma_unmap.argsz = sizeof(dma_unmap);
+    dma_unmap.user_addr = (__u64)rd_wr_buffer_;
+    dma_unmap.length = rd_wr_buf_len_;
+    dma_unmap.csr_array[0] = DFL_HE_CACHE_RD_ADDR_TABLE_DATA; // 0x88;
+    dma_unmap.csr_array[1] = DFL_HE_CACHE_WR_ADDR_TABLE_DATA; // 0x68;
+
+    volatile uint64_t *u64_wr =
+        (volatile uint64_t *)(mmio_base_ + DFL_HE_CACHE_WR_ADDR_TABLE_DATA);
+    volatile uint64_t *u64_rd =
+        (volatile uint64_t *)(mmio_base_ + DFL_HE_CACHE_RD_ADDR_TABLE_DATA);
+
+    res = ioctl(fd_, DFL_HE_CACHE_NUMA_DMA_UNMAP, &dma_unmap);
+    if (res) {
+      cerr << "ioctl DFL_HE_CACHE_NUMA_DMA_UNMAP failed" << strerror(errno)
+           << endl;
+    }
+
+    printf("\nDFL_HE_CACHE_WR_ADDR_TABLE_DATA: 0x%016" PRIx64 "\n", *u64_wr);
+    printf("\nDFL_HE_CACHE_RD_ADDR_TABLE_DATAs: 0x%016" PRIx64 "\n", *u64_rd);
+
+    free_memory(rd_wr_buffer_, rd_wr_buf_len_);
+    rd_wr_buffer_ = NULL;
+    return true;
+  }
+
+  uint8_t *get_dsm() const { return dsm_buffer_; }
+
+  uint8_t *get_read() const { return rd_buffer_; }
+
+  uint8_t *get_write() const { return wr_buffer_; }
+
+  uint8_t *get_read_write() const { return rd_wr_buffer_; }
+
+protected:
+  std::string name_;
+  std::string afu_id_;
+  CLI::App app_;
+  std::string pci_addr_;
+  std::string log_level_;
+  uint32_t timeout_msec_;
+
+  int fd_ = -1; // device file descriptor; -1 while no device is open
+  uint8_t *mmio_base_ = nullptr;
+  uint64_t mmio_len_ = 0;
+  // Defaults to null/zero: the constructor does not initialize these members.
+  uint8_t *dsm_buffer_ = nullptr;
+  uint64_t dsm_buf_len_ = 0;
+
+  uint8_t *rd_buffer_ = nullptr;
+  uint64_t rd_buf_len_ = 0;
+
+  uint8_t *wr_buffer_ = nullptr;
+  uint64_t wr_buf_len_ = 0;
+
+  uint8_t *rd_wr_buffer_ = nullptr;
+  uint64_t rd_wr_buf_len_ = 0;
+
+  struct dfl_he_cache_region_info rinfo_ = {};
+
+  std::string dev_path_;
+
+  command::ptr_t current_command_;
+  std::map<CLI::App *, command::ptr_t> commands_;
+
+public:
+  std::shared_ptr<spdlog::logger> logger_;
+};
+
+} // end of namespace afu_test
+} // end of namespace opae

From 5b92bb079795f3738d9d608de4619eb83aefb647 Mon Sep 17 00:00:00 2001
From: anandaravuri <ananda.ravuri@intel.com>
Date: Wed, 20 Sep 2023 10:03:20 -0700
Subject: [PATCH 02/11] fix: code rview comments and bugs fixed

Signed-off-by: anandaravuri <ananda.ravuri@intel.com>
---
 samples/cxl_host_exerciser/CMakeLists.txt     |   4 +-
 ...ost_exerciser_cmd.h => cxl_he_cache_cmd.h} | 285 +++++++++++-------
 ...rciser_cache.h => cxl_he_cache_lpbk_cmd.h} |  51 +++-
 samples/cxl_host_exerciser/cxl_he_cmd.h       | 206 +++++++++++++
 .../cxl_host_exerciser/cxl_host_exerciser.cpp |   7 +-
 .../cxl_host_exerciser/cxl_host_exerciser.h   | 135 ++-------
 samples/cxl_host_exerciser/dfl-he-cache.h     |  83 +++--
 samples/cxl_host_exerciser/he_cache_test.h    |   1 -
 8 files changed, 481 insertions(+), 291 deletions(-)
 rename samples/cxl_host_exerciser/{cxl_host_exerciser_cmd.h => cxl_he_cache_cmd.h} (79%)
 rename samples/cxl_host_exerciser/{cxl_host_exerciser_cache.h => cxl_he_cache_lpbk_cmd.h} (67%)
 create mode 100644 samples/cxl_host_exerciser/cxl_he_cmd.h

diff --git a/samples/cxl_host_exerciser/CMakeLists.txt b/samples/cxl_host_exerciser/CMakeLists.txt
index adcdf4580a62..ed8c2f5534dc 100644
--- a/samples/cxl_host_exerciser/CMakeLists.txt
+++ b/samples/cxl_host_exerciser/CMakeLists.txt
@@ -33,7 +33,7 @@ if (OPAE_WITH_CLI11 AND OPAE_WITH_SPDLOG)
             ${spdlog_LIBRARIES}
             ${json-c_LIBRARIES}
             ${uuid_LIBRARIES}
-            numa
+            ${numa_LIBRARIES}
         COMPONENT samplebin
     )
     target_include_directories(cxl_host_exerciser
@@ -42,6 +42,4 @@ if (OPAE_WITH_CLI11 AND OPAE_WITH_SPDLOG)
            ${CMAKE_CURRENT_SOURCE_DIR}
             ${CLI11_INCLUDE_DIRS}
             ${spdlog_INCLUDE_DIRS})
-
-
 endif(OPAE_WITH_CLI11 AND OPAE_WITH_SPDLOG)
diff --git a/samples/cxl_host_exerciser/cxl_host_exerciser_cmd.h b/samples/cxl_host_exerciser/cxl_he_cache_cmd.h
similarity index 79%
rename from samples/cxl_host_exerciser/cxl_host_exerciser_cmd.h
rename to samples/cxl_host_exerciser/cxl_he_cache_cmd.h
index c9a099694788..81460d81bd6d 100644
--- a/samples/cxl_host_exerciser/cxl_host_exerciser_cmd.h
+++ b/samples/cxl_host_exerciser/cxl_he_cache_cmd.h
@@ -25,16 +25,9 @@
 // POSSIBILITY OF SUCH DAMAGE.
 #pragma once
 
+#include "cxl_he_cmd.h"
 #include "cxl_host_exerciser.h"
 #include "he_cache_test.h"
-#include <map>
-#include <numa.h>
-#include <unistd.h>
-
-using test_afu = opae::afu_test::afu;
-using opae::fpga::types::shared_buffer;
-using opae::fpga::types::token;
-namespace fpga = opae::fpga::types;
 
 #define UNUSED_PARAM(x) ((void)x)
 
@@ -51,17 +44,65 @@ void he_sig_handler(int) {
 
 namespace host_exerciser {
 
-std::mutex he_cache_read_mutex;
-std::mutex he_cache_write_mutex;
-
-class host_exerciser_cmd;
-
 void he_cache_thread(uint8_t *buf_ptr, uint64_t len);
 
-class host_exerciser_cmd : public test_command {
+class he_cache_cmd : public he_cmd {
 public:
-  host_exerciser_cmd() : host_exe_(NULL), numa_node_(0) {}
-  virtual ~host_exerciser_cmd() {}
+  he_cache_cmd()
+      : he_continuousmode_(false), he_contmodetime_(0), he_linerep_count_(0),
+        he_stide_(0), he_test_(0), he_test_all_(false) {}
+
+  virtual ~he_cache_cmd() {}
+
+  virtual const char *name() const override { return "cache"; }
+
+  virtual const char *description() const override {
+    return "run simple cxl he cache test";
+  }
+
+  virtual const char *afu_id() const override { return HE_CACHE_AFU_ID; }
+
+  virtual uint64_t featureid() const override { return MEM_TG_FEATURE_ID; }
+
+  virtual uint64_t guidl() const override { return MEM_TG_FEATURE_GUIDL; }
+
+  virtual uint64_t guidh() const override { return MEM_TG_FEATURE_GUIDH; }
+
+  virtual void add_options(CLI::App *app) override {
+    app->add_option(
+           "--test", he_test_,
+           "host exerciser cache test {fpgardcachehit, fpgawrcachehit, all}")
+        ->transform(CLI::CheckedTransformer(he_test_modes))
+        ->default_val("fpgardcachehit");
+
+    // Continuous mode
+    app->add_option("--continuousmode", he_continuousmode_,
+                    "test rollover or test termination")
+        ->default_val("false");
+
+    // Continuous mode time
+    app->add_option("--contmodetime", he_contmodetime_,
+                    "Continuous mode time in seconds")
+        ->default_val("1");
+
+    // target host or fpga
+    app->add_option("--target", he_target_,
+                    "host exerciser run on host or fpga")
+        ->transform(CLI::CheckedTransformer(he_targets))
+        ->default_val("host");
+
+    app->add_option("--stride", he_stide_, "Enable stride mode")
+        ->default_val("0");
+
+    // Line repeat count
+    app->add_option("--linerepcount", he_linerep_count_, "Line repeat count")
+        ->transform(CLI::Range(1, 256))
+        ->default_val("10");
+
+    // Test all
+    app->add_option("--testall", he_test_all_, "Run all tests")
+        ->default_val("false");
+  }
 
   int he_run_fpga_rd_cache_hit_test() {
     cout << "********** FPGA Read cache hit test start**********" << endl;
@@ -69,11 +110,12 @@ class host_exerciser_cmd : public test_command {
     STEPS
     1) Allocate DSM, Read buffer // flush
     2) set cache lines 32kb/64
-    3) set loop count
+    3) set line repeat count
     4) Set RdShared (CXL) config
     5) Run test ( AFU copies cache from host memory to FPGA cache)
-    6) Set RdShared (CXL) config
-    5) Run test ( AFU read cache from FPGA cache)
+    6) set line repeat count
+    7) Set RdShared (CXL) config
+    8) Run test ( AFU read cache from FPGA cache)
     */
 
     // HE_INFO
@@ -82,13 +124,13 @@ class host_exerciser_cmd : public test_command {
     cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
 
     cout << "Numa node:" << numa_node_ << endl;
-    host_exe_->write64(HE_RD_NUM_LINES, FPGA_32KB_CACHE_LINES - 1);
-    cout << "Read number Lines:" << FPGA_32KB_CACHE_LINES - 1 << endl;
-    cout << "Line Repeat Count:" << host_exe_->he_linerep_count_ << endl;
+    host_exe_->write64(HE_RD_NUM_LINES, FPGA_32KB_CACHE_LINES);
+    cout << "Read number Lines:" << FPGA_32KB_CACHE_LINES << endl;
+    cout << "Line Repeat Count:" << he_linerep_count_ << endl;
 
     // set RD_CONFIG RdShared (CXL)
     he_rd_cfg_.value = 0;
-    he_rd_cfg_.line_repeat_count = host_exe_->he_linerep_count_;
+    he_rd_cfg_.line_repeat_count = 1;
     he_rd_cfg_.read_traffic_enable = 1;
     he_rd_cfg_.opcode = RD_LINE_S;
     host_exe_->write64(HE_RD_CONFIG, he_rd_cfg_.value);
@@ -135,7 +177,7 @@ class host_exerciser_cmd : public test_command {
 
     // set RD_CONFIG RdShared (CXL)
     he_rd_cfg_.value = 0;
-    he_rd_cfg_.line_repeat_count = host_exe_->he_linerep_count_;
+    he_rd_cfg_.line_repeat_count = he_linerep_count_;
     he_rd_cfg_.read_traffic_enable = 1;
     he_rd_cfg_.opcode = RD_LINE_S;
     host_exe_->write64(HE_RD_CONFIG, he_rd_cfg_.value);
@@ -180,11 +222,12 @@ class host_exerciser_cmd : public test_command {
     STEPS
     1) Allocate DSM, Read buffer, Write buffer // flush
     2) set cache lines 32kb/64
-    3) set loop count
+    3) set line repeat count
     4) Set RdShared (CXL) config
     5) Run test ( AFU copies cache from host memory to FPGA cache)
-    6) Set WrLine_M/WrPart_M (CXL) config
-    5) Run test ( AFU writes to FPGA cache)
+    6) set line repeat count
+    7) Set WrLine_M/WrPart_M (CXL) config
+    8) Run test ( AFU writes to FPGA cache)
     */
 
     // HE_INFO
@@ -194,14 +237,13 @@ class host_exerciser_cmd : public test_command {
     cout << "Write address table size:" << he_info_.write_addr_table_size
          << endl;
 
-    host_exe_->write64(HE_RD_NUM_LINES, FPGA_32KB_CACHE_LINES - 1);
-    host_exe_->write64(HE_WR_NUM_LINES, FPGA_32KB_CACHE_LINES - 1);
-    cout << "Read/write number Lines:" << FPGA_32KB_CACHE_LINES - 1 << endl;
-    cout << "Line Repeat Count:" << host_exe_->he_linerep_count_ << endl;
+    host_exe_->write64(HE_RD_NUM_LINES, FPGA_32KB_CACHE_LINES);
+    cout << "Read/write number Lines:" << FPGA_32KB_CACHE_LINES << endl;
+    cout << "Line Repeat Count:" << he_linerep_count_ << endl;
 
     // set RD_CONFIG RdShared (CXL)
     he_rd_cfg_.value = 0;
-    he_rd_cfg_.line_repeat_count = host_exe_->he_linerep_count_;
+    he_rd_cfg_.line_repeat_count = 1;
     he_rd_cfg_.read_traffic_enable = 1;
     he_rd_cfg_.opcode = RD_LINE_S;
     host_exe_->write64(HE_RD_CONFIG, he_rd_cfg_.value);
@@ -248,7 +290,7 @@ class host_exerciser_cmd : public test_command {
 
     // set W_CONFIG
     he_wr_cfg_.value = 0;
-    he_wr_cfg_.line_repeat_count = host_exe_->he_linerep_count_;
+    he_wr_cfg_.line_repeat_count = he_linerep_count_;
     he_wr_cfg_.write_traffic_enable = 1;
     he_wr_cfg_.opcode = WR_LINE_M;
     host_exe_->write64(HE_WR_CONFIG, he_wr_cfg_.value);
@@ -258,6 +300,7 @@ class host_exerciser_cmd : public test_command {
     wr_table_ctl_.enable_address_stride = 1;
     host_exe_->write64(HE_WR_ADDR_TABLE_CTRL, wr_table_ctl_.value);
 
+    host_exe_->write64(HE_WR_NUM_LINES, FPGA_32KB_CACHE_LINES - 1);
     // Start test
     he_ctl_.Start = 1;
     host_exe_->write64(HE_CTL, he_ctl_.value);
@@ -291,20 +334,11 @@ class host_exerciser_cmd : public test_command {
     /*
     STEPS
     1) Allocate DSM, Read buffer, Write buffer
-    2) Write number of lines more then 32 kb  2mb/64
-    3) Set RdShared (CXL) config
-    4) Run test (Buffer is not present in FPGA - FPGA read Cache miss )
-
-   // 2) Set RdShared (CXL) config
-    //3) Run test ( AFU copies cache from host memory to FPGA cache)
-    //4) Set write Evict (CXL) config
-    //5) Run test ( AFU Invalidate to FPGA cache)
+    2) Write number of lines more than 32 KB (2 MB / 64)
     3) Set RdShared (CXL) config
     4) Run test (Buffer is not present in FPGA - FPGA read Cache miss )
     */
 
-    // 2MB / 64
-
     // HE_INFO
     // Set Read number Lines
     he_info_.value = host_exe_->read64(HE_INFO);
@@ -312,11 +346,11 @@ class host_exerciser_cmd : public test_command {
 
     host_exe_->write64(HE_RD_NUM_LINES, FPGA_2MB_CACHE_LINES - 1);
     cout << "Read number Lines:" << FPGA_2MB_CACHE_LINES - 1 << endl;
-    cout << "Line Repeat Count:" << host_exe_->he_linerep_count_ << endl;
+    cout << "Line Repeat Count:" << he_linerep_count_ << endl;
 
     // set RD_CONFIG RdShared (CXL)
     he_rd_cfg_.value = 0;
-    he_rd_cfg_.line_repeat_count = host_exe_->he_linerep_count_;
+    he_rd_cfg_.line_repeat_count = he_linerep_count_;
     he_rd_cfg_.read_traffic_enable = 1;
     he_rd_cfg_.opcode = RD_LINE_S;
     host_exe_->write64(HE_RD_CONFIG, he_rd_cfg_.value);
@@ -375,13 +409,6 @@ class host_exerciser_cmd : public test_command {
     2) Write number of lines more then 32 kb  2mb/64
     3) Set WR ItoMWr (CXL) config
     4) Run test ( Buffer is not present in FPGA - FPGA write Cache miss )
-
-    //2) Set RdShared (CXL) config
-    //3) Run test ( AFU copies cache from host to HDM
-    //4) Set write Evict  (CXL) config
-    //5) Run test ( AFU Invalidate to FPGA cache)
-    6) Set WR ItoMWr (CXL) config
-    7) Run test ( Buffer is not present in FPGA - FPGA write Cache miss )
     */
 
     // HE_INFO
@@ -391,13 +418,13 @@ class host_exerciser_cmd : public test_command {
     cout << "Write address table size:" << he_info_.write_addr_table_size
          << endl;
 
-    host_exe_->write64(HE_WR_NUM_LINES, FPGA_2MB_CACHE_LINES - 1);
-    cout << "Read/write number Lines:" << FPGA_2MB_CACHE_LINES - 1 << endl;
-    cout << "Line Repeat Count:" << host_exe_->he_linerep_count_ << endl;
+    host_exe_->write64(HE_WR_NUM_LINES, FPGA_2MB_CACHE_LINES);
+    cout << "Read/write number Lines:" << FPGA_2MB_CACHE_LINES << endl;
+    cout << "Line Repeat Count:" << he_linerep_count_ << endl;
 
     // set W_CONFIG
     he_wr_cfg_.value = 0;
-    he_wr_cfg_.line_repeat_count = host_exe_->he_linerep_count_;
+    he_wr_cfg_.line_repeat_count = he_linerep_count_;
     he_wr_cfg_.write_traffic_enable = 1;
     he_wr_cfg_.opcode = WR_LINE_M;
     host_exe_->write64(HE_WR_CONFIG, he_wr_cfg_.value);
@@ -467,11 +494,11 @@ class host_exerciser_cmd : public test_command {
 
     host_exe_->write64(HE_RD_NUM_LINES, FPGA_32KB_CACHE_LINES - 1);
     cout << "Read number Lines:" << FPGA_32KB_CACHE_LINES - 1 << endl;
-    cout << "Line Repeat Count:" << host_exe_->he_linerep_count_ << endl;
+    cout << "Line Repeat Count:" << he_linerep_count_ << endl;
 
     // set RD_CONFIG RdShared (CXL)
     he_rd_cfg_.value = 0;
-    he_rd_cfg_.line_repeat_count = host_exe_->he_linerep_count_;
+    he_rd_cfg_.line_repeat_count = he_linerep_count_;
     he_rd_cfg_.read_traffic_enable = 1;
     he_rd_cfg_.opcode = RD_LINE_I;
     host_exe_->write64(HE_RD_CONFIG, he_rd_cfg_.value);
@@ -556,11 +583,11 @@ class host_exerciser_cmd : public test_command {
 
     host_exe_->write64(HE_WR_NUM_LINES, FPGA_32KB_CACHE_LINES - 1);
     cout << "Write number Lines:" << FPGA_32KB_CACHE_LINES - 1 << endl;
-    cout << "Line Repeat Count:" << host_exe_->he_linerep_count_ << endl;
+    cout << "Line Repeat Count:" << he_linerep_count_ << endl;
 
     // set RD_CONFIG
     he_wr_cfg_.value = 0;
-    he_wr_cfg_.line_repeat_count = host_exe_->he_linerep_count_;
+    he_wr_cfg_.line_repeat_count = he_linerep_count_;
     he_wr_cfg_.write_traffic_enable = 1;
     he_wr_cfg_.opcode = WR_LINE_I;
     host_exe_->write64(HE_WR_CONFIG, he_wr_cfg_.value);
@@ -640,11 +667,11 @@ class host_exerciser_cmd : public test_command {
 
     host_exe_->write64(HE_RD_NUM_LINES, FPGA_32KB_CACHE_LINES - 1);
     cout << "Read/write number Lines:" << FPGA_32KB_CACHE_LINES - 1 << endl;
-    cout << "Line Repeat Count:" << host_exe_->he_linerep_count_ << endl;
+    cout << "Line Repeat Count:" << he_linerep_count_ << endl;
 
     // set RD_CONFIG
     he_rd_cfg_.value = 0;
-    he_rd_cfg_.line_repeat_count = host_exe_->he_linerep_count_;
+    he_rd_cfg_.line_repeat_count = he_linerep_count_;
     he_rd_cfg_.read_traffic_enable = 1;
     he_rd_cfg_.opcode = RD_LINE_I;
     host_exe_->write64(HE_RD_CONFIG, he_rd_cfg_.value);
@@ -718,11 +745,11 @@ class host_exerciser_cmd : public test_command {
 
     host_exe_->write64(HE_WR_NUM_LINES, FPGA_32KB_CACHE_LINES - 1);
     cout << "Write number Lines:" << FPGA_32KB_CACHE_LINES - 1 << endl;
-    cout << "Line Repeat Count:" << host_exe_->he_linerep_count_ << endl;
+    cout << "Line Repeat Count:" << he_linerep_count_ << endl;
 
     // set RD_CONFIG
     he_wr_cfg_.value = 0;
-    he_wr_cfg_.line_repeat_count = host_exe_->he_linerep_count_;
+    he_wr_cfg_.line_repeat_count = he_linerep_count_;
     he_wr_cfg_.write_traffic_enable = 1;
     he_wr_cfg_.opcode = WR_PUSH_I;
     host_exe_->write64(HE_WR_CONFIG, he_wr_cfg_.value);
@@ -773,6 +800,11 @@ class host_exerciser_cmd : public test_command {
     return 0;
   }
 
+  // Convert number of transactions to bandwidth (GB/s)
+  double he_num_xfers_to_bw(uint64_t num_lines, uint64_t num_ticks) {
+    return (double)(num_lines * 64) / ((1000.0 / he_clock_mhz_ * num_ticks));
+  }
+
   void he_perf_counters() {
     volatile he_cache_dsm_status *dsm_status = NULL;
 
@@ -781,18 +813,26 @@ class host_exerciser_cmd : public test_command {
     if (!dsm_status)
       return;
 
-    std::cout << "\n********* DSM Status CSR Start *********" << std::endl;
-
-    std::cout << "test completed :" << dsm_status->test_completed << std::endl;
-    std::cout << "dsm number:" << dsm_status->dsm_number << std::endl;
-    std::cout << "error vector:" << dsm_status->err_vector << std::endl;
-    std::cout << "num ticks:" << dsm_status->num_ticks << std::endl;
-    std::cout << "num reads:" << dsm_status->num_reads << std::endl;
-    std::cout << "num writes:" << dsm_status->num_writes << std::endl;
-    std::cout << "penalty start:" << dsm_status->penalty_start << std::endl;
-    std::cout << "penalty end:" << dsm_status->penalty_end << std::endl;
-    std::cout << "actual data:" << dsm_status->actual_data << std::endl;
-    std::cout << "expected data:" << dsm_status->expected_data << std::endl;
+    cout << "\n********* DSM Status CSR Start *********" << std::endl;
+
+    cout << "test completed :" << dsm_status->test_completed << endl;
+    cout << "dsm number:" << dsm_status->dsm_number << endl;
+    cout << "error vector:" << dsm_status->err_vector << endl;
+    cout << "num ticks:" << dsm_status->num_ticks << endl;
+    cout << "num reads:" << dsm_status->num_reads << endl;
+    cout << "num writes:" << dsm_status->num_writes << endl;
+    cout << "penalty start:" << dsm_status->penalty_start << endl;
+    cout << "penalty end:" << dsm_status->penalty_end << endl;
+    cout << "actual data:" << dsm_status->actual_data << endl;
+    cout << "expected data:" << dsm_status->expected_data << endl;
+
+    // print bandwidth
+    if (dsm_status->num_ticks > 0) {
+      double perf_data =
+          he_num_xfers_to_bw(dsm_status->num_reads + dsm_status->num_writes,
+                             dsm_status->num_ticks);
+      host_exe_->logger_->info("Bandwidth: {0:0.3f} GB/s", perf_data);
+    }
 
     std::cout << "********* DSM Status CSR end *********" << std::endl;
   }
@@ -871,9 +911,6 @@ class host_exerciser_cmd : public test_command {
       printf("System does not support NUMA API!\n");
       return false;
     }
-
-    printf("SUpported NUMA API!\n");
-
     int n = numa_max_node();
     printf("There are %d nodes on your system\n", n + 1);
 
@@ -883,7 +920,7 @@ class host_exerciser_cmd : public test_command {
     int node = numa_node_of_cpu(cup_num);
     printf("node:%d\n", node);
 
-    if (host_exe_->he_target_ == HE_TARGET_HOST) {
+    if (he_target_ == HE_TARGET_HOST) {
       numa_node_ = node;
       printf("HE_TARGET_HOST numa_node_:%d\n", numa_node_);
 
@@ -893,9 +930,6 @@ class host_exerciser_cmd : public test_command {
       printf("HE_TARGET_FPGA numa_node_:%d\n", numa_node_);
     }
 
-    int num_task = numa_num_task_nodes();
-    printf("num_task:%d\n", num_task);
-
     return true;
   }
 
@@ -919,42 +953,82 @@ class host_exerciser_cmd : public test_command {
     he_ctl_.ResetL = 1;
     host_exe_->write64(HE_CTL, he_ctl_.value);
 
-    if (host_exe_->he_test_ == HE_FPGA_RD_CACHE_HIT) {
+    if (he_test_all_ == true) {
+      int retvalue = 0;
+      ret = he_run_fpga_rd_cache_hit_test();
+      if (ret != 0) {
+        retvalue = ret;
+      }
+      ret = he_run_fpga_wr_cache_hit_test();
+      if (ret != 0) {
+        retvalue = ret;
+      }
+
+      ret = he_run_fpga_rd_cache_miss_test();
+      if (ret != 0) {
+        retvalue = ret;
+      }
+      ret = he_run_fpga_wr_cache_miss_test();
+      if (ret != 0) {
+        retvalue = ret;
+      }
+      ret = he_run_host_rd_cache_hit_test();
+      if (ret != 0) {
+        retvalue = ret;
+      }
+      ret = he_run_host_wr_cache_hit_test();
+      if (ret != 0) {
+        retvalue = ret;
+      }
+
+      ret = he_run_host_rd_cache_miss_test();
+      if (ret != 0) {
+        retvalue = ret;
+      }
+      ret = he_run_host_wr_cache_miss_test();
+      if (ret != 0) {
+        retvalue = ret;
+      }
+
+      return retvalue;
+    }
+
+    if (he_test_ == HE_FPGA_RD_CACHE_HIT) {
       ret = he_run_fpga_rd_cache_hit_test();
       return ret;
     }
 
-    if (host_exe_->he_test_ == HE_FPGA_WR_CACHE_HIT) {
+    if (he_test_ == HE_FPGA_WR_CACHE_HIT) {
       ret = he_run_fpga_wr_cache_hit_test();
       return ret;
     }
 
-    if (host_exe_->he_test_ == HE_FPGA_RD_CACHE_MISS) {
+    if (he_test_ == HE_FPGA_RD_CACHE_MISS) {
       ret = he_run_fpga_rd_cache_miss_test();
       return ret;
     }
 
-    if (host_exe_->he_test_ == HE_FPGA_WR_CACHE_MISS) {
+    if (he_test_ == HE_FPGA_WR_CACHE_MISS) {
       ret = he_run_fpga_wr_cache_miss_test();
       return ret;
     }
 
-    if (host_exe_->he_test_ == HE_HOST_RD_CACHE_HIT) {
+    if (he_test_ == HE_HOST_RD_CACHE_HIT) {
       ret = he_run_host_rd_cache_hit_test();
       return ret;
     }
 
-    if (host_exe_->he_test_ == HE_HOST_WR_CACHE_HIT) {
+    if (he_test_ == HE_HOST_WR_CACHE_HIT) {
       ret = he_run_host_wr_cache_hit_test();
       return ret;
     }
 
-    if (host_exe_->he_test_ == HE_HOST_RD_CACHE_MISS) {
+    if (he_test_ == HE_HOST_RD_CACHE_MISS) {
       ret = he_run_host_rd_cache_miss_test();
       return ret;
     }
 
-    if (host_exe_->he_test_ == HE_HOST_WR_CACHE_MISS) {
+    if (he_test_ == HE_HOST_WR_CACHE_MISS) {
       ret = he_run_host_wr_cache_miss_test();
       return ret;
     }
@@ -963,40 +1037,30 @@ class host_exerciser_cmd : public test_command {
   }
 
 protected:
-  host_exerciser *host_exe_;
-  token::ptr_t token_;
-
-  he_ctl he_ctl_;
-  he_info he_info_;
-  he_rd_config he_rd_cfg_;
-  he_wr_config he_wr_cfg_;
-
-  he_rd_addr_table_ctrl rd_table_ctl_;
-  he_wr_addr_table_ctrl wr_table_ctl_;
-  uint8_t *dsm_buf_;
-  uint8_t *rd_buf_;
-
-  uint32_t numa_node_;
+  bool he_continuousmode_;
+  uint32_t he_contmodetime_;
+  uint32_t he_linerep_count_;
+  uint32_t he_stide_;
+  uint32_t he_target_;
+  uint32_t he_test_;
+  bool he_test_all_;
 };
 
 void he_cache_thread(uint8_t *buf_ptr, uint64_t len) {
-  cout << "he_cache_thread  enter" << endl;
   if (buf_ptr == NULL || len == 0) {
     return;
   }
   uint64_t value;
   UNUSED_PARAM(value);
-  uint64_t cache_lines = len / 64;
+  uint64_t cache_lines = len / CL;
   uint64_t i = 0;
-  cout << "he_cache_thread  cache_lines:" << cache_lines << endl;
 
   while (true) {
 
     if (g_stop_thread == true) {
-      cout << "he_cache_thread g_stop_thread " << endl;
+      // cout << "he_cache_thread g_stop_thread " << endl;
       return;
     }
-    // cout << "he_cache_thread:i "<<i << endl;
     if (i < cache_lines) {
       value = *((volatile uint64_t *)(buf_ptr + i * 8));
     }
@@ -1006,7 +1070,6 @@ void he_cache_thread(uint8_t *buf_ptr, uint64_t len) {
     }
   }
 
-  cout << "he_cache_thread  end" << endl;
   return;
 }
 
diff --git a/samples/cxl_host_exerciser/cxl_host_exerciser_cache.h b/samples/cxl_host_exerciser/cxl_he_cache_lpbk_cmd.h
similarity index 67%
rename from samples/cxl_host_exerciser/cxl_host_exerciser_cache.h
rename to samples/cxl_host_exerciser/cxl_he_cache_lpbk_cmd.h
index 892584ef36b1..82f2dcba91a0 100644
--- a/samples/cxl_host_exerciser/cxl_host_exerciser_cache.h
+++ b/samples/cxl_host_exerciser/cxl_he_cache_lpbk_cmd.h
@@ -24,30 +24,21 @@
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  EVEN IF ADVISED OF THE
 // POSSIBILITY OF SUCH DAMAGE.
 #pragma once
+
 #include "cxl_host_exerciser.h"
-#include "cxl_host_exerciser_cmd.h"
 #include "he_cache_test.h"
 
-const char *HE_CACHE_AFU_ID = "0118E06B-1FA3-49B9-8159-9b5C2EBD4b23";
-
-#define MEM_TG_FEATURE_ID 0x25
-#define MEM_TG_FEATURE_GUIDL 0x81599b5c2ebd4b23
-#define MEM_TG_FEATURE_GUIDH 0x0118e06b1fa349b9
-
-using test_afu = opae::afu_test::afu;
-using opae::fpga::types::shared_buffer;
-
 namespace host_exerciser {
 
-class host_exerciser_cache : public host_exerciser_cmd {
+class he_cache_lpbk_cmd : public he_cmd {
 public:
-  host_exerciser_cache() {}
+  he_cache_lpbk_cmd() {}
+  virtual ~he_cache_lpbk_cmd() {}
 
-  virtual ~host_exerciser_cache() {}
-  virtual const char *name() const override { return "cache"; }
+  virtual const char *name() const override { return "lpbk"; }
 
   virtual const char *description() const override {
-    return "run simple cxl he cache test";
+    return "run simple cxl he lpbk test";
   }
 
   virtual const char *afu_id() const override { return HE_CACHE_AFU_ID; }
@@ -57,6 +48,34 @@ class host_exerciser_cache : public host_exerciser_cmd {
   virtual uint64_t guidl() const override { return MEM_TG_FEATURE_GUIDL; }
 
   virtual uint64_t guidh() const override { return MEM_TG_FEATURE_GUIDH; }
-};
+  virtual void add_options(CLI::App *app) override {
+    // target host or fpga
+    app->add_option("--target", he_target_,
+                    "host exerciser run on host or fpga")
+        ->transform(CLI::CheckedTransformer(he_targets))
+        ->default_val("host");
+  }
 
+  // "lpbk" sub-command entry point: binds the AFU, selects a NUMA node and
+  // toggles the HE reset. Returns 0, or -1 if afu is not a host_exerciser.
+  virtual int run(test_afu *afu, CLI::App *app) {
+    (void)app;
+    cout << "HE LPBK run" << endl;
+    host_exe_ = dynamic_cast<host_exerciser *>(afu);
+    if (host_exe_ == NULL)
+      return -1;
+
+    if (!verify_numa_node()) {
+      numa_node_ = 0;
+      cout << "numa nodes are not available set numa node to 0" << endl;
+    }
+
+    // reset HE cache: write ResetL = 0, then ResetL = 1 (all other bits 0)
+    he_ctl_.value = 0;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+    he_ctl_.ResetL = 1;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+    return 0;
+  }
+};
 } // end of namespace host_exerciser
diff --git a/samples/cxl_host_exerciser/cxl_he_cmd.h b/samples/cxl_host_exerciser/cxl_he_cmd.h
new file mode 100644
index 000000000000..048937fd2dcf
--- /dev/null
+++ b/samples/cxl_host_exerciser/cxl_he_cmd.h
@@ -0,0 +1,206 @@
+// Copyright(c) 2023, Intel Corporation
+//
+// Redistribution  and  use  in source  and  binary  forms,  with  or  without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of  source code  must retain the  above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name  of Intel Corporation  nor the names of its contributors
+//   may be used to  endorse or promote  products derived  from this  software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,  BUT NOT LIMITED TO,  THE
+// IMPLIED WARRANTIES OF  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.  IN NO EVENT  SHALL THE COPYRIGHT OWNER  OR CONTRIBUTORS BE
+// LIABLE  FOR  ANY  DIRECT,  INDIRECT,  INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR
+// CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT  NOT LIMITED  TO,  PROCUREMENT  OF
+// SUBSTITUTE GOODS OR SERVICES;  LOSS OF USE,  DATA, OR PROFITS;  OR BUSINESS
+// INTERRUPTION)  HOWEVER CAUSED  AND ON ANY THEORY  OF LIABILITY,  WHETHER IN
+// CONTRACT,  STRICT LIABILITY,  OR TORT  (INCLUDING NEGLIGENCE  OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+#pragma once
+#include <map>
+#include <numa.h>
+#include <unistd.h>
+
+#include "cxl_he_cmd.h"
+#include "cxl_host_exerciser.h"
+#include "he_cache_test.h"
+
+namespace host_exerciser {
+
+class he_cmd : public test_command {
+public:
+  he_cmd() : host_exe_(NULL), he_clock_mhz_(400), numa_node_(0), he_target_(0) {
+
+    he_ctl_.value = 0;
+    he_info_.value = 0;
+    he_rd_cfg_.value = 0;
+    he_wr_cfg_.value = 0;
+    rd_table_ctl_.value = 0;
+    wr_table_ctl_.value = 0;
+  }
+
+  virtual ~he_cmd() {}
+
+  // Convert number of transactions to bandwidth (GB/s)
+  double he_num_xfers_to_bw(uint64_t num_lines, uint64_t num_ticks) {
+    return (double)(num_lines * 64) / ((1000.0 / he_clock_mhz_ * num_ticks));
+  }
+
+  void he_perf_counters() {
+    volatile he_cache_dsm_status *dsm_status = NULL;
+
+    dsm_status = reinterpret_cast<he_cache_dsm_status *>(
+        (uint8_t *)(host_exe_->get_dsm()));
+    if (!dsm_status)
+      return;
+
+    std::cout << "\n********* DSM Status CSR Start *********" << std::endl;
+
+    std::cout << "test completed :" << dsm_status->test_completed << std::endl;
+    std::cout << "dsm number:" << dsm_status->dsm_number << std::endl;
+    std::cout << "error vector:" << dsm_status->err_vector << std::endl;
+    std::cout << "num ticks:" << dsm_status->num_ticks << std::endl;
+    std::cout << "num reads:" << dsm_status->num_reads << std::endl;
+    std::cout << "num writes:" << dsm_status->num_writes << std::endl;
+    std::cout << "penalty start:" << dsm_status->penalty_start << std::endl;
+    std::cout << "penalty end:" << dsm_status->penalty_end << std::endl;
+    std::cout << "actual data:" << dsm_status->actual_data << std::endl;
+    std::cout << "expected data:" << dsm_status->expected_data << std::endl;
+
+    // print bandwidth
+    if (dsm_status->num_ticks > 0) {
+      double perf_data =
+          he_num_xfers_to_bw(dsm_status->num_reads + dsm_status->num_writes,
+                             dsm_status->num_ticks);
+      host_exe_->logger_->info("Bandwidth: {0:0.3f} GB/s", perf_data);
+    }
+
+    std::cout << "********* DSM Status CSR end *********" << std::endl;
+  }
+
+  void host_exerciser_errors() {
+    he_err_status err_status;
+    uint64_t err = 0;
+    if (host_exe_ == NULL)
+      return;
+
+    err_status.value = host_exe_->read64(HE_ERROR_STATUS);
+    if (err_status.data_error == 1) {
+      cout << "Data Integrity Check error occured" << endl;
+    }
+
+    if (err_status.err_index > 0) {
+      cout << "Error occurred at cache line address:" << err_status.err_index
+           << endl;
+    }
+
+    err = host_exe_->read64(HE_ERROR_EXP_DATA);
+    cout << "Error Expected Data:" << err << endl;
+
+    err = host_exe_->read64(HE_ERROR_ACT_DATA0);
+    cout << "Error Expected Data0:" << err << endl;
+
+    err = host_exe_->read64(HE_ERROR_ACT_DATA1);
+    cout << "Error Expected Data1:" << err << endl;
+
+    err = host_exe_->read64(HE_ERROR_ACT_DATA2);
+    cout << "Error Expected Data2:" << err << endl;
+
+    err = host_exe_->read64(HE_ERROR_ACT_DATA3);
+    cout << "Error Expected Data3:" << err << endl;
+
+    err = host_exe_->read64(HE_ERROR_ACT_DATA4);
+    cout << "Error Expected Data4:" << err << endl;
+
+    err = host_exe_->read64(HE_ERROR_ACT_DATA5);
+    cout << "Error Expected Data5:" << err << endl;
+
+    err = host_exe_->read64(HE_ERROR_ACT_DATA6);
+    cout << "Error Expected Data6:" << err << endl;
+
+    err = host_exe_->read64(HE_ERROR_ACT_DATA7);
+    cout << "Error Expected Data7:" << err << endl;
+  }
+
+  int parse_input_options() {
+
+    if (!host_exe_)
+      return -1;
+
+    return 0;
+  }
+
+  bool he_wait_test_completion() {
+    /* Wait for test completion */
+    uint32_t timeout = HELPBK_TEST_TIMEOUT;
+
+    volatile uint8_t *status_ptr = host_exe_->get_dsm();
+    while (0 == ((*status_ptr) & 0x1)) {
+      usleep(HELPBK_TEST_SLEEP_INVL);
+      if (--timeout == 0) {
+        cout << "HE LPBK TIME OUT" << std::endl;
+
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Picks the NUMA node for the selected target and stores it in numa_node_.
+  // Returns false when libnuma is unusable or the node of the calling CPU
+  // cannot be resolved; numa_node_ is left unchanged in that case.
+  bool verify_numa_node() {
+    if (numa_available() < 0) {
+      printf("System does not support NUMA API!\n");
+      return false;
+    }
+    printf("Supported NUMA API!\n");
+
+    int n = numa_max_node();
+    printf("There are %d nodes on your system\n", n + 1);
+
+    int cup_num = sched_getcpu();
+    printf("cup_num:%d\n", cup_num);
+
+    int node = numa_node_of_cpu(cup_num);
+    printf("node:%d\n", node);
+
+    if (he_target_ == HE_TARGET_HOST) {
+      // Both calls above report failure as -1; never store that in the
+      // unsigned numa_node_.
+      if (node < 0)
+        return false;
+      numa_node_ = node;
+      printf("HE_TARGET_HOST numa_node_:%u\n", numa_node_);
+    } else {
+      // TODO: discover the FPGA NUMA node; it is hard-coded to 2 for now
+      numa_node_ = 2;
+      printf("HE_TARGET_FPGA numa_node_:%u\n", numa_node_);
+    }
+
+    printf("num_config_cpu:%d\n", numa_num_configured_cpus());
+    printf("num_task_nodes:%d\n", numa_num_task_nodes());
+    return true;
+  }
+
+protected:
+  host_exerciser *host_exe_;
+  uint32_t he_clock_mhz_;
+  uint32_t numa_node_;
+  uint32_t he_target_;
+
+  he_ctl he_ctl_;
+  he_info he_info_;
+  he_rd_config he_rd_cfg_;
+  he_wr_config he_wr_cfg_;
+  he_rd_addr_table_ctrl rd_table_ctl_;
+  he_wr_addr_table_ctrl wr_table_ctl_;
+};
+} // end of namespace host_exerciser
diff --git a/samples/cxl_host_exerciser/cxl_host_exerciser.cpp b/samples/cxl_host_exerciser/cxl_host_exerciser.cpp
index 3d5eb10f1604..0f31d9155dce 100644
--- a/samples/cxl_host_exerciser/cxl_host_exerciser.cpp
+++ b/samples/cxl_host_exerciser/cxl_host_exerciser.cpp
@@ -27,14 +27,17 @@
 #include <iostream>
 #include <signal.h>
 
+#include "cxl_he_cache_cmd.h"
+#include "cxl_he_cache_lpbk_cmd.h"
 #include "cxl_host_exerciser.h"
-#include "cxl_host_exerciser_cache.h"
 
 void he_sig_handler(int);
 
 int main(int argc, char *argv[]) {
+
   host_exerciser::host_exerciser app;
-  app.register_command<host_exerciser::host_exerciser_cache>();
+  app.register_command<host_exerciser::he_cache_cmd>();
+  app.register_command<host_exerciser::he_cache_lpbk_cmd>();
 
   // host exerciser signal handler
   struct sigaction act_old, act_new;
diff --git a/samples/cxl_host_exerciser/cxl_host_exerciser.h b/samples/cxl_host_exerciser/cxl_host_exerciser.h
index 293293a30255..adae83320674 100644
--- a/samples/cxl_host_exerciser/cxl_host_exerciser.h
+++ b/samples/cxl_host_exerciser/cxl_host_exerciser.h
@@ -24,28 +24,23 @@
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  EVEN IF ADVISED OF THE
 // POSSIBILITY OF SUCH DAMAGE.
 #pragma once
-#include <opae/cxx/core/events.h>
-#include <opae/cxx/core/shared_buffer.h>
-#include <opae/cxx/core/token.h>
 
 #include "he_cache_test.h"
 
+#define MEM_TG_FEATURE_ID 0x25
+#define MEM_TG_FEATURE_GUIDL 0x81599b5c2ebd4b23
+#define MEM_TG_FEATURE_GUIDH 0x0118e06b1fa349b9
+const char *HE_CACHE_AFU_ID = "0118E06B-1FA3-49B9-8159-9b5C2EBD4b23";
+
 namespace host_exerciser {
-using opae::fpga::types::event;
-using opae::fpga::types::shared_buffer;
-using opae::fpga::types::token;
 
 static const uint64_t HELPBK_TEST_TIMEOUT = 30000;
 static const uint64_t HELPBK_TEST_SLEEP_INVL = 100;
 static const uint64_t CL = 64;
 static const uint64_t KB = 1024;
 static const uint64_t MB = KB * 1024;
-static const uint64_t LOG2_CL = 6;
-
 static const uint64_t BUFFER_SIZE_2MB = 2 * 1024 * 1024;
-
 static const uint64_t FPGA_32KB_CACHE_LINES = (32 * 1024) / 64;
-
 static const uint64_t FPGA_2MB_CACHE_LINES = (2 * 1024 * 1024) / 64;
 
 // Host execiser CSR Offset
@@ -59,19 +54,16 @@ enum {
   HE_DSM_BASE = 0x030,
   HE_CTL = 0x038,
   HE_INFO = 0x040,
-
   HE_WR_NUM_LINES = 0x048,
   HE_WR_BYTE_ENABLE = 0x050,
   HE_WR_CONFIG = 0x058,
   HE_WR_ADDR_TABLE_CTRL = 0x060,
   HE_WR_ADDR_TABLE_DATA = 0x068,
-
   HE_RD_NUM_LINES = 0x070,
   HE_RD_CONFIG = 0x078,
   HE_RD_ADDR_TABLE_CTRL = 0x080,
   HE_RD_ADDR_TABLE_DATA = 0x088,
   HE_ERROR_STATUS = 0x090,
-
   HE_ERROR_EXP_DATA = 0x098,
   HE_ERROR_ACT_DATA0 = 0x0A0,
   HE_ERROR_ACT_DATA1 = 0x0A8,
@@ -81,23 +73,16 @@ enum {
   HE_ERROR_ACT_DATA5 = 0x0C8,
   HE_ERROR_ACT_DATA6 = 0x0D0,
   HE_ERROR_ACT_DATA7 = 0x0D8,
-
 };
 
-// configures test mode
-typedef enum {
-  HOST_EXEMODE_READ = 0x0,
-  HOST_EXEMODE_WRITE = 0x1,
-  HOST_EXEMODE_ALL = 0x2,
-} host_exe_mode;
-
-// Write Traffic Opcode
+// Read Traffic Opcode
 typedef enum {
   RD_LINE_I = 0x0,
   RD_LINE_S = 0x1,
   RD_LINE_EM = 0x2,
 } he_rd_opcode;
 
+// Write Traffic Opcode
 typedef enum {
   WR_LINE_I = 0x0,
   WR_LINE_M = 0x1,
@@ -113,16 +98,16 @@ union he_dfh {
   enum { offset = HE_DFH };
   uint64_t value;
   struct {
-    uint16_t CcipVersionNumber : 12;
-    uint8_t AfuMajVersion : 4;
-    uint32_t NextDfhOffset : 24;
-    uint8_t EOL : 1;
-    uint32_t Reserved : 19;
-    uint8_t FeatureType : 4;
+    uint64_t CcipVersionNumber : 12;
+    uint64_t AfuMajVersion : 4;
+    uint64_t NextDfhOffset : 24;
+    uint64_t EOL : 1;
+    uint64_t Reserved : 19;
+    uint64_t FeatureType : 4;
   };
 };
 
-// DSM BASEL
+// DSM BASE
 union he_dsm_base {
   enum { offset = HE_DSM_BASE };
   uint64_t value;
@@ -139,7 +124,7 @@ union he_ctl {
     uint64_t ResetL : 1;
     uint64_t Start : 1;
     uint64_t ForcedTestCmpl : 1;
-    uint64_t bios_support : 1;
+    uint64_t bias_support : 1;
     uint64_t Reserved : 60;
   };
 };
@@ -184,7 +169,7 @@ union he_wr_config {
     uint64_t waitfor_completion : 1;
     uint64_t preread_sync_enable : 1;
     uint64_t postread_sync_enable : 1;
-    uint64_t daata_pattern : 2;
+    uint64_t data_pattern : 2;
     uint64_t cl_evict_enable : 1;
     uint64_t opcode : 4;
     uint64_t line_repeat_count : 8;
@@ -233,7 +218,7 @@ union he_rd_config {
     uint64_t waitfor_completion : 1;
     uint64_t prewrite_sync_enable : 1;
     uint64_t postwrite_sync_enable : 1;
-    uint64_t daata_pattern : 2;
+    uint64_t data_pattern : 2;
     uint64_t cl_evict_enable : 1;
     uint64_t opcode : 4;
     uint64_t line_repeat_count : 8;
@@ -262,7 +247,7 @@ union he_rd_addr_table_data {
   };
 };
 
-// HE_RD_ADDR_TABLE_DATA
+// ERROR_STATUS
 union he_err_status {
   enum { offset = HE_ERROR_STATUS };
   uint64_t value;
@@ -290,12 +275,6 @@ struct he_cache_dsm_status {
   uint32_t res5[2];
 };
 
-const std::map<std::string, uint32_t> he_modes = {
-    {"read", HOST_EXEMODE_READ},
-    {"write", HOST_EXEMODE_WRITE},
-    {"all", HOST_EXEMODE_ALL},
-};
-
 // configures test mode
 typedef enum {
   HE_FPGA_RD_CACHE_HIT = 0x0,
@@ -323,7 +302,6 @@ const std::map<std::string, uint32_t> he_test_modes = {
     {"fpgawrcachehit", HE_FPGA_WR_CACHE_HIT},
     {"fpgardcachemiss", HE_FPGA_RD_CACHE_MISS},
     {"fpgawrcachemiss", HE_FPGA_WR_CACHE_MISS},
-
     {"hostrdcachehit", HE_HOST_RD_CACHE_HIT},
     {"hostwrcachehit", HE_HOST_WR_CACHE_HIT},
     {"hostrdcachemiss", HE_HOST_RD_CACHE_MISS},
@@ -381,66 +359,13 @@ std::map<uint32_t, uint32_t> addrtable_size = {
 
 };
 
-// he test type
-typedef enum {
-  HE_DISABLE_DATA_INTEGRITY_CHECK = 0x0,
-  HE_ENABLE_DATA_INTEGRITY_CHECK = 0x1,
-} he_data_integrity_check;
-
-struct MapKeyComparator {
-  bool operator()(const std::string &a, const std::string &b) const {
-    if (a.length() != b.length())
-      return (a.length() < b.length());
-    else
-      return (a < b);
-  }
-};
-
 using test_afu = opae::afu_test::afu;
 using test_command = opae::afu_test::command;
 
 class host_exerciser : public test_afu {
 public:
   host_exerciser()
-      : test_afu("host_exerciser", nullptr, "warning"), count_(1),
-      he_continuousmode_(false), he_test_all_(0), he_contmodetime_(0),
-      he_clock_mhz_(0),he_linerep_count_(10), he_stide_(0), he_target_(0), he_test_(0) {
-
-
-    // test
-    app_.add_option(
-            "--test", he_test_,
-            "host exerciser cache test {fpgardcachehit, fpgawrcachehit, all}")
-        ->transform(CLI::CheckedTransformer(he_test_modes))
-        ->default_val("fpgardcachehit");
-
-    // Configures test rollover or test termination
-    app_.add_option("--continuousmode", he_continuousmode_,
-                    "test rollover or test termination")
-        ->default_val("false");
-
-    // Continuous mode time
-    app_.add_option("--contmodetime", he_contmodetime_,
-                    "Continuous mode time in seconds")
-        ->default_val("1");
-
-    app_.add_option("--target", he_target_,
-                    "host exerciser run on host or fpga")
-        ->transform(CLI::CheckedTransformer(he_targets))
-        ->default_val("host");
-
-
-    app_.add_option("--stride", he_stide_, "Enable stride mode")
-        ->default_val("0");
-
-    app_.add_option("--linerepcount", he_linerep_count_, "Line repeat count")
-        ->transform(CLI::Range(1, 256))
-        ->default_val("10");
-
-    // Test all
-    app_.add_option("--testall", he_test_all_, "Run all tests")
-        ->default_val("false");
-  }
+      : test_afu("host_exerciser", nullptr, "warning"), count_(1) {}
 
   virtual int run(CLI::App *app, test_command::ptr_t test) override {
     int res = exit_codes::not_run;
@@ -448,7 +373,7 @@ class host_exerciser : public test_afu {
     logger_->set_pattern("    %v");
     // Info prints details of an individual run. Turn it on if doing only one
     // test and the user hasn't changed level from the default.
-    if ((log_level_.compare("warning") == 0) && !he_test_all_)
+    if ((log_level_.compare("warning") == 0))
       logger_->set_level(spdlog::level::info);
 
     logger_->info("starting test run, count of {0:d}", count_);
@@ -476,24 +401,6 @@ class host_exerciser : public test_afu {
 
 public:
   uint32_t count_;
-  bool he_continuousmode_;
-  bool he_test_all_;
-  uint32_t he_contmodetime_;
-  uint32_t he_clock_mhz_;
-  uint32_t he_linerep_count_;
-  uint32_t he_stide_;
-  uint32_t he_target_;
-  uint32_t he_test_;
-  std::map<uint32_t, uint32_t> limits_;
-
-  uint32_t get_offset(uint32_t base, uint32_t i) const {
-    auto limit = limits_.find(base);
-    auto offset = base + sizeof(uint64_t) * i;
-    if (limit != limits_.end() && offset > limit->second - sizeof(uint64_t)) {
-      throw std::out_of_range("offset out range in csr space");
-    }
-    return offset;
-  }
 
   bool option_passed(std::string option_str) {
     if (app_.count(option_str) == 0)
@@ -501,4 +408,4 @@ class host_exerciser : public test_afu {
     return true;
   }
 };
-} // end of namespace cxl_host_exerciser
+} // namespace host_exerciser
diff --git a/samples/cxl_host_exerciser/dfl-he-cache.h b/samples/cxl_host_exerciser/dfl-he-cache.h
index b95df7414bdb..d6036c832dbf 100644
--- a/samples/cxl_host_exerciser/dfl-he-cache.h
+++ b/samples/cxl_host_exerciser/dfl-he-cache.h
@@ -30,7 +30,7 @@
 #define DFL_HE_CACHE_BASE 0
 
 /**
- * DFL_FPGA_GET_API_VERSION - _IO(DFL_FPGA_MAGIC, DFL_FPGA_BASE + 0)
+ * DFL_HE_CACHE_GET_API_VERSION - _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 0)
  *
  * Report the version of the driver API.
  * Return: Driver API Version.
@@ -40,7 +40,7 @@
   _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 0)
 
 /**
- * DFL_FPGA_CHECK_EXTENSION - _IO(DFL_FPGA_MAGIC, DFL_FPGA_BASE + 1)
+ * DFL_HE_CACHE_CHECK_EXTENSION - _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 1)
  *
  * Check whether an extension is supported.
  * Return: 0 if not supported, otherwise the extension is supported.
@@ -49,18 +49,19 @@
 #define DFL_HE_CACHE_CHECK_EXTENSION                                           \
   _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 1)
 
-#define DFL_HE_CACHE_GET_REGION_INFO                                           \
-  _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 2)
-
 /**
- * FPGA_PORT_GET_REGION_INFO - _IOWR(FPGA_MAGIC, PORT_BASE + 2,
- *                                      struct dfl_he_cache_region_info)
+ * DFL_HE_CACHE_GET_REGION_INFO - _IOWR(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE +
+ * 2, struct dfl_he_cache_region_info)
  *
  * Retrieve information about a device memory region.
- * Caller provides struct dfl_fpga_port_region_info with index value set.
+ * Caller provides struct dfl_he_cache_region_info with flags.
  * Driver returns the region info in other fields.
  * Return: 0 on success, -errno on failure.
  */
+
+#define DFL_HE_CACHE_GET_REGION_INFO                                           \
+  _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 2)
+
 struct dfl_he_cache_region_info {
   /* Input */
   __u32 argsz; /* Structure length */
@@ -73,44 +74,25 @@ struct dfl_he_cache_region_info {
   __u64 offset; /* Region offset from start of device fd */
 };
 
-#define DFL_HE_CACHE_SET_DSM_INFO _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 3)
-
-struct dfl_he_cache_dsm_info {
-  /* Input */
-  __u32 argsz;     /* Structure length */
-  __u64 user_addr; /* Process virtual address */
-  __u64 length;    /* Length of mapping (bytes)*/
-};
-
-#define DFL_HE_CACHE_CLEAR_DSM_INFO                                            \
-  _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 4)
-
-#define DFL_HE_CACHE_ALLOC_ADDR_TABLE                                          \
-  _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 5)
-#define DFL_HE_CACHE_FREE_ADDR_TABLE                                           \
-  _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 6)
-#define DFL_HE_CACHE_APPEND_ADDR_TABLE                                         \
-  _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 7)
-
-#define DFL_HE_CACHE_NUM_LINES_MIN 1
-#define DFL_HE_CACHE_NUM_LINES_MAX 0xffff
-
-struct dfl_he_cache_addr_table {
-  /* Input */
-  __u32 argsz; /* Structure length */
-  __u32 flags; /* Address Table ID */
-#define DFL_HE_CACHE_READ_ADDR_TABLE (1 << 0)
-#define DFL_HE_CACHE_WRITE_ADDR_TABLE (1 << 1)
-  __u32 cache_lines; /* Buffer size/offset in cache lines */
-};
-
-#define DFL_HE_CACHE_NUMA_DMA_MAP                                              \
-  _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 12)
-#define DFL_HE_CACHE_NUMA_DMA_UNMAP                                            \
-  _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 13)
+/**
+* DFL_HE_CACHE_NUMA_DMA_MAP - _IOWR(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 3,
+*                                      struct dfl_he_cache_dma_map)
+*
+* Map the dma memory per user_addr,length and numa node which are provided by
+caller.
+* The driver allocates memory on the numa node, converts the user's virtual
+address
+* to a continuous physical address, and writes the physical address to
+* the host executor's read/write address table CSR.
+
+* This interface only accepts page-size aligned user memory for dma mapping.
+* Return: 0 on success, -errno on failure.
+*/
 
 #define DFL_ARRAY_MAX_SIZE 0x10
 
+#define DFL_HE_CACHE_NUMA_DMA_MAP _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 3)
+
 struct dfl_he_cache_dma_map {
   /* Input */
   __u32 argsz;                         /* Structure length */
@@ -121,13 +103,26 @@ struct dfl_he_cache_dma_map {
   __u64 csr_array[DFL_ARRAY_MAX_SIZE]; /* CSR  */
 };
 
+/**
+ * DFL_HE_CACHE_NUMA_DMA_UNMAP - _IOWR(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE +
+ * 4, struct dfl_he_cache_dma_unmap)
+ *
+ * Unmpas the dma memory per user_addr and length which are provided by caller.
+ * The driver deletes the physical pages of the user address and writes a zero
+ * to the read/write address table CSR.
+ * Return: 0 on success, -errno on failure.
+ */
+
+#define DFL_HE_CACHE_NUMA_DMA_UNMAP                                            \
+  _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 4)
+
 struct dfl_he_cache_dma_unmap {
   /* Input */
   __u32 argsz;                         /* Structure length */
   __u32 flags;                         /* flags */
   __u64 user_addr;                     /* Process virtual address */
   __u64 length;                        /* Length of mapping (bytes)*/
-  __u64 csr_array[DFL_ARRAY_MAX_SIZE]; /* CSR  */
+  __u64 csr_array[DFL_ARRAY_MAX_SIZE]; /* CSR */
 };
 
 #endif /* _UAPI_LINUX_HE_CACHE_DFL_H */
diff --git a/samples/cxl_host_exerciser/he_cache_test.h b/samples/cxl_host_exerciser/he_cache_test.h
index 1e3b4d503c60..db4d3f340d06 100644
--- a/samples/cxl_host_exerciser/he_cache_test.h
+++ b/samples/cxl_host_exerciser/he_cache_test.h
@@ -144,7 +144,6 @@ bool sysfs_read_u64(const char *path, uint64_t *value) {
 namespace opae {
 namespace afu_test {
 
-namespace fpga = fpga::types;
 
 template <typename T>
 inline bool parse_match_int(const char *s, regmatch_t m, T &v, int radix = 10) {

From 79fe277cd1004e0542e98ba6729c73fe6624bc94 Mon Sep 17 00:00:00 2001
From: anandaravuri <ananda.ravuri@intel.com>
Date: Wed, 20 Sep 2023 10:20:25 -0700
Subject: [PATCH 03/11] fix: cmakefile typo

Signed-off-by: anandaravuri <ananda.ravuri@intel.com>
---
 samples/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index 4f3661b4be4e..ab942e774b65 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -69,6 +69,6 @@ opae_add_subdirectory(mem_tg)
 opae_add_subdirectory(host_exerciser)
 opae_add_subdirectory(n5010-test)
 opae_add_subdirectory(n5010-ctl)
-opae_add_subdirectory(clx_mem_tg)
+opae_add_subdirectory(cxl_mem_tg)
 opae_add_subdirectory(cxl_host_exerciser)
 

From 83b05bc7a3b8ac7912394668daed604f7d4fc957 Mon Sep 17 00:00:00 2001
From: anandaravuri <ananda.ravuri@intel.com>
Date: Wed, 20 Sep 2023 13:50:54 -0700
Subject: [PATCH 04/11] fix: ci build errors

Signed-off-by: anandaravuri <ananda.ravuri@intel.com>
---
 samples/cxl_host_exerciser/CMakeLists.txt | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/samples/cxl_host_exerciser/CMakeLists.txt b/samples/cxl_host_exerciser/CMakeLists.txt
index ed8c2f5534dc..b9ee688717ea 100644
--- a/samples/cxl_host_exerciser/CMakeLists.txt
+++ b/samples/cxl_host_exerciser/CMakeLists.txt
@@ -24,22 +24,33 @@
 ## ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  EVEN IF ADVISED OF THE
 ## POSSIBILITY OF SUCH DAMAGE.
 
-if (OPAE_WITH_CLI11 AND OPAE_WITH_SPDLOG)
+if (OPAE_WITH_CLI11 AND OPAE_WITH_SPDLOG AND OPAE_WITH_NUMA)
+
+    if (fmt_LIBRARIES)
+        # if we found fmt before (from CMakeLists.txt)
+        # then we need to find it again from this directory
+        # so we can "import" the fmt::fmt link target
+        find_package(fmt)
+    endif (fmt_LIBRARIES)
+
     opae_add_executable(TARGET cxl_host_exerciser
         SOURCE cxl_host_exerciser.cpp
         LIBS
-            opae-c
             opae-cxx-core
+            opae-c
             ${spdlog_LIBRARIES}
             ${json-c_LIBRARIES}
             ${uuid_LIBRARIES}
             ${numa_LIBRARIES}
+            ${fmt_LIBRARIES}
         COMPONENT samplebin
     )
     target_include_directories(cxl_host_exerciser
         PRIVATE
            ${OPAE_INCLUDE_PATHS}
            ${CMAKE_CURRENT_SOURCE_DIR}
-            ${CLI11_INCLUDE_DIRS}
-            ${spdlog_INCLUDE_DIRS})
-endif(OPAE_WITH_CLI11 AND OPAE_WITH_SPDLOG)
+           ${CLI11_INCLUDE_DIRS}
+           ${numa_INCLUDE_DIRS}
+           ${spdlog_INCLUDE_DIRS})
+
+endif(OPAE_WITH_CLI11 AND OPAE_WITH_SPDLOG AND OPAE_WITH_NUMA)

From 16dbe5b644babdcd0cc62e3075311e09410d25c2 Mon Sep 17 00:00:00 2001
From: anandaravuri <ananda.ravuri@intel.com>
Date: Wed, 20 Sep 2023 14:13:42 -0700
Subject: [PATCH 05/11] fix: ci build error

Signed-off-by: anandaravuri <ananda.ravuri@intel.com>
---
 samples/cxl_host_exerciser/he_cache_test.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/samples/cxl_host_exerciser/he_cache_test.h b/samples/cxl_host_exerciser/he_cache_test.h
index db4d3f340d06..ce9c33b6046b 100644
--- a/samples/cxl_host_exerciser/he_cache_test.h
+++ b/samples/cxl_host_exerciser/he_cache_test.h
@@ -26,22 +26,23 @@
 
 #pragma once
 
-#include <CLI/CLI.hpp>
 #include <future>
 #include <glob.h>
 #include <inttypes.h>
 #include <numa.h>
 #include <opae/cxx/core.h>
 #include <regex.h>
-#include <spdlog/sinks/basic_file_sink.h>
-#include <spdlog/sinks/stdout_color_sinks.h>
-#include <spdlog/spdlog.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
+#include <CLI/CLI.hpp>
+#include <spdlog/spdlog.h>
+#include <spdlog/sinks/stdout_color_sinks.h>
+#include <spdlog/sinks/basic_file_sink.h>
+#include <opae/cxx/core.h>
 
 #include "dfl-he-cache.h"
 

From 688631f4fc7170aedf650bc56b38646621f66b6f Mon Sep 17 00:00:00 2001
From: anandaravuri <ananda.ravuri@intel.com>
Date: Wed, 20 Sep 2023 14:53:37 -0700
Subject: [PATCH 06/11] fix: ci build error

Signed-off-by: anandaravuri <ananda.ravuri@intel.com>
---
 samples/cxl_host_exerciser/CMakeLists.txt | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/samples/cxl_host_exerciser/CMakeLists.txt b/samples/cxl_host_exerciser/CMakeLists.txt
index b9ee688717ea..7298b63b62d9 100644
--- a/samples/cxl_host_exerciser/CMakeLists.txt
+++ b/samples/cxl_host_exerciser/CMakeLists.txt
@@ -45,6 +45,7 @@ if (OPAE_WITH_CLI11 AND OPAE_WITH_SPDLOG AND OPAE_WITH_NUMA)
             ${fmt_LIBRARIES}
         COMPONENT samplebin
     )
+
     target_include_directories(cxl_host_exerciser
         PRIVATE
            ${OPAE_INCLUDE_PATHS}
@@ -53,4 +54,12 @@ if (OPAE_WITH_CLI11 AND OPAE_WITH_SPDLOG AND OPAE_WITH_NUMA)
            ${numa_INCLUDE_DIRS}
            ${spdlog_INCLUDE_DIRS})
 
+    target_compile_options(cxl_host_exerciser PUBLIC
+        -Wno-unused-result
+    )
+
+    target_compile_definitions(cxl_host_exerciser PUBLIC
+        ${spdlog_DEFINITIONS}
+    )
+
 endif(OPAE_WITH_CLI11 AND OPAE_WITH_SPDLOG AND OPAE_WITH_NUMA)

From f96980774b2cf1a98dd2faf290e275680af18c26 Mon Sep 17 00:00:00 2001
From: anandaravuri <ananda.ravuri@intel.com>
Date: Wed, 20 Sep 2023 15:12:35 -0700
Subject: [PATCH 07/11] fix: ci build errors

Signed-off-by: anandaravuri <ananda.ravuri@intel.com>
---
 samples/cxl_host_exerciser/he_cache_test.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/samples/cxl_host_exerciser/he_cache_test.h b/samples/cxl_host_exerciser/he_cache_test.h
index ce9c33b6046b..4841eb9267ab 100644
--- a/samples/cxl_host_exerciser/he_cache_test.h
+++ b/samples/cxl_host_exerciser/he_cache_test.h
@@ -38,6 +38,7 @@
 #include <string.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
+#include <fcntl.h>
 #include <CLI/CLI.hpp>
 #include <spdlog/spdlog.h>
 #include <spdlog/sinks/stdout_color_sinks.h>

From 62a889447f0b913f847928ecec4bad17777edd5f Mon Sep 17 00:00:00 2001
From: Thanneeru Srinivasulu <thanneeru.srinivasulu@intel.com>
Date: Sat, 23 Sep 2023 03:52:19 +0530
Subject: [PATCH 08/11] fix: he cache ioctl

Signed-off-by: Thanneeru Srinivasulu <thanneeru.srinivasulu@intel.com>
---
 libraries/plugins/xfpga/fpga-dfl.h            | 169 +++++++++--
 samples/cxl_host_exerciser/cxl_he_cache_cmd.h | 269 ++++--------------
 samples/cxl_host_exerciser/cxl_he_cmd.h       |  56 ++--
 .../cxl_host_exerciser/cxl_host_exerciser.h   |   5 +-
 samples/cxl_host_exerciser/dfl-he-cache.h     | 128 ---------
 samples/cxl_host_exerciser/he_cache_test.h    | 242 ++++++++--------
 6 files changed, 351 insertions(+), 518 deletions(-)
 delete mode 100644 samples/cxl_host_exerciser/dfl-he-cache.h

diff --git a/libraries/plugins/xfpga/fpga-dfl.h b/libraries/plugins/xfpga/fpga-dfl.h
index fa5af9ae87bc..36fe3b98671d 100644
--- a/libraries/plugins/xfpga/fpga-dfl.h
+++ b/libraries/plugins/xfpga/fpga-dfl.h
@@ -1,28 +1,17 @@
-// Copyright(c) 2017-2020, Intel Corporation
-//
-// Redistribution  and  use  in source  and  binary  forms,  with  or  without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of  source code  must retain the  above copyright notice,
-//   this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the following disclaimer in the documentation
-//   and/or other materials provided with the distribution.
-// * Neither the name  of Intel Corporation  nor the names of its contributors
-//   may be used to  endorse or promote  products derived  from this  software
-//   without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,  BUT NOT LIMITED TO,  THE
-// IMPLIED WARRANTIES OF  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED.  IN NO EVENT  SHALL THE COPYRIGHT OWNER  OR CONTRIBUTORS BE
-// LIABLE  FOR  ANY  DIRECT,  INDIRECT,  INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR
-// CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT  NOT LIMITED  TO,  PROCUREMENT  OF
-// SUBSTITUTE GOODS OR SERVICES;  LOSS OF USE,  DATA, OR PROFITS;  OR BUSINESS
-// INTERRUPTION)  HOWEVER CAUSED  AND ON ANY THEORY  OF LIABILITY,  WHETHER IN
-// CONTRACT,  STRICT LIABILITY,  OR TORT  (INCLUDING NEGLIGENCE  OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Header File for FPGA DFL User API
+ *
+ * Copyright (C) 2017-2018 Intel Corporation, Inc.
+ *
+ * Authors:
+ *   Kang Luwei <luwei.kang@intel.com>
+ *   Zhang Yi <yi.z.zhang@intel.com>
+ *   Wu Hao <hao.wu@intel.com>
+ *   Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ *   Tim Whisonant <tim.whisonant@intel.com>
+ *   Ananda Ravuri <ananda.ravuri@intel.com>
+ */
 
 #ifndef _UAPI_LINUX_FPGA_DFL_H
 #define _UAPI_LINUX_FPGA_DFL_H
@@ -44,6 +33,8 @@
 #define DFL_FPGA_BASE 0
 #define DFL_PORT_BASE 0x40
 #define DFL_FME_BASE 0x80
+#define DFL_PCI_SVA_BASE 0xf8
+#define DFL_CXL_CACHE_BASE 0xA0
 
 /* Common IOCTLs for both FME and AFU file descriptor */
 
@@ -134,12 +125,20 @@ struct dfl_fpga_port_region_info {
  * Map the dma memory per user_addr and length which are provided by caller.
  * Driver fills the iova in provided struct afu_port_dma_map.
  * This interface only accepts page-size aligned user memory for dma mapping.
+ *
+ * Setting only one of DFL_DMA_MAP_FLAG_READ or WRITE limits FPGA-initiated
+ * DMA requests to only reads or only writes. To be backward compatible with
+ * legacy driver, setting neither flag is equivalent to setting both flags:
+ * both read and write requests are permitted.
+ *
  * Return: 0 on success, -errno on failure.
  */
 struct dfl_fpga_port_dma_map {
 	/* Input */
 	__u32 argsz;		/* Structure length */
-	__u32 flags;		/* Zero for now */
+	__u32 flags;
+#define DFL_DMA_MAP_FLAG_READ	(1 << 0)/* readable from device */
+#define DFL_DMA_MAP_FLAG_WRITE	(1 << 1)/* writable from device */
 	__u64 user_addr;        /* Process virtual address */
 	__u64 length;           /* Length of mapping (bytes)*/
 	/* Output */
@@ -169,7 +168,7 @@ struct dfl_fpga_port_dma_unmap {
  *
  * @start: Index of the first irq.
  * @count: The number of eventfd handler.
- * @evtfds: Eventfd handler.
+ * @evtfds: Eventfd handlers.
  */
 struct dfl_fpga_irq_set {
 	__u32 start;
@@ -289,4 +288,120 @@ struct dfl_fpga_fme_port_pr {
 					     DFL_FME_BASE + 4,	\
 					     struct dfl_fpga_irq_set)
 
+/**
+ * DFL_PCI_SVA_BIND_DEV - _IO(DFL_FPGA_MAGIC, DFL_PCI_SVA_BASE + 0)
+ *
+ * Ensure that a PASID is present in the user process and enable the
+ * PASID on the IOMMU domain of the device associated with the file handle.
+ * Returns the PASID on success, -errno on failure.
+ */
+#define DFL_PCI_SVA_BIND_DEV		_IO(DFL_FPGA_MAGIC,	\
+					     DFL_PCI_SVA_BASE + 0)
+
+/**
+ * DFL_PCI_SVA_UNBIND_DEV - _IO(DFL_FPGA_MAGIC,	DFL_PCI_SVA_BASE + 1)
+ *
+ * Unbind the current PASID from the device.
+ */
+#define DFL_PCI_SVA_UNBIND_DEV		_IO(DFL_FPGA_MAGIC,	\
+					    DFL_PCI_SVA_BASE + 1)
+
+ /**
+  * DFL_CXL_CACHE_GET_REGION_INFO - _IOWR(DFL_FPGA_MAGIC, DFL_CXL_CACHE_BASE + 0,
+  *                                      struct dfl_cxl_cache_region_info)
+  *
+  * Retrieve information about a device memory region.
+  * Caller provides struct dfl_cxl_cache_region_info with flags.
+  * Driver returns the region info in other fields.
+  * Return: 0 on success, -errno on failure.
+  */
+
+#define DFL_CXL_CACHE_GET_REGION_INFO _IO(DFL_FPGA_MAGIC, DFL_CXL_CACHE_BASE + 0)
+
+  /**
+   * struct dfl_cxl_cache_region_info - CXL cache region information
+   * @argsz: structure length
+   * @flags: access permission
+   * @size: region size (bytes)
+   * @offset: region offset from start of device fd
+   *
+   * to retrieve  information about a device memory region
+   */
+struct dfl_cxl_cache_region_info {
+	__u32 argsz;
+	__u32 flags;
+#define DFL_CXL_CACHE_REGION_READ	BIT(0)
+#define DFL_CXL_CACHE_REGION_WRITE	BIT(1)
+#define DFL_CXL_CACHE_REGION_MMAP	BIT(2)
+	__u64 size;
+	__u64 offset;
+};
+
+/**
+ * DFL_CXL_CACHE_NUMA_DMA_MAP - _IOWR(DFL_FPGA_MAGIC, DFL_CXL_CACHE_BASE + 1,
+ *                                      struct dfl_cxl_cache_dma_map)
+ *
+ * Map the dma memory per user_addr, length and numa node which are provided by caller
+ * The driver allocates memory on the numa node, converts the user's virtual address
+ * to a continuous physical address, and writes the physical address to
+ * the cxl cache read/write address table CSR.
+ *
+ * This interface only accepts page-size aligned user memory for dma mapping.
+ * Return: 0 on success, -errno on failure.
+ */
+
+#define DFL_ARRAY_MAX_SIZE   0x10
+
+#define DFL_CXL_CACHE_NUMA_DMA_MAP    _IO(DFL_FPGA_MAGIC,  DFL_CXL_CACHE_BASE + 1)
+
+ /**
+  * struct dfl_cxl_cache_dma_map - maps user address to physical address.
+  * @argsz: structure length
+  * @flags: flags
+  * @user_addr: user mmap virtual address
+  * @length: length of mapping (bytes)
+  * @numa_node: Numa node number
+  * @csr_array: array of region address offset
+  *
+  * maps user allocated virtual address to physical address.
+  */
+struct dfl_cxl_cache_dma_map {
+	__u32 argsz;
+	__u32 flags;
+	__u64 user_addr;
+	__u64 length;
+	__u32 numa_node;
+	__u64 csr_array[DFL_ARRAY_MAX_SIZE];
+};
+
+/**
+ * DFL_CXL_CACHE_NUMA_DMA_UNMAP - _IOWR(DFL_FPGA_MAGIC, DFL_CXL_CACHE_BASE + 2,
+ *                                      struct dfl_cxl_cache_dma_unmap)
+ *
+ * Unmaps the dma memory per user_addr and length which are provided by caller
+ * The driver deletes the physical pages of the user address and writes a zero
+ * to the read/write address table CSR.
+ * Return: 0 on success, -errno on failure.
+ */
+
+#define DFL_CXL_CACHE_NUMA_DMA_UNMAP  _IO(DFL_FPGA_MAGIC,  DFL_CXL_CACHE_BASE + 2)
+
+ /**
+  * struct dfl_cxl_cache_dma_unmap - unmaps user allocated memory.
+  * @argsz: structure length
+  * @flags: flags
+  * @user_addr: user mmap virtual address
+  * @length: length of mapping (bytes)
+  * @csr_array: array of region address offset
+  *
+  * unmaps user allocated memory.
+  */
+struct dfl_cxl_cache_dma_unmap {
+	__u32 argsz;
+	__u32 flags;
+	__u64 user_addr;
+	__u64 length;
+	__u64 csr_array[DFL_ARRAY_MAX_SIZE];
+};
+
 #endif /* _UAPI_LINUX_FPGA_DFL_H */
diff --git a/samples/cxl_host_exerciser/cxl_he_cache_cmd.h b/samples/cxl_host_exerciser/cxl_he_cache_cmd.h
index 81460d81bd6d..dd0631b04325 100644
--- a/samples/cxl_host_exerciser/cxl_he_cache_cmd.h
+++ b/samples/cxl_host_exerciser/cxl_he_cache_cmd.h
@@ -39,7 +39,7 @@ volatile static bool g_stop_thread = false;
 void he_sig_handler(int) {
   g_he_exit = true;
   g_stop_thread = true;
-  printf("HE signal handler exit app \n");
+  cout << "HE signal handler exit app" << endl;
 }
 
 namespace host_exerciser {
@@ -69,9 +69,11 @@ class he_cache_cmd : public he_cmd {
   virtual uint64_t guidh() const override { return MEM_TG_FEATURE_GUIDH; }
 
   virtual void add_options(CLI::App *app) override {
+
+    // test mode
     app->add_option(
            "--test", he_test_,
-           "host exerciser cache test {fpgardcachehit, fpgawrcachehit, all}")
+           "host exerciser cache test")
         ->transform(CLI::CheckedTransformer(he_test_modes))
         ->default_val("fpgardcachehit");
 
@@ -121,12 +123,12 @@ class he_cache_cmd : public he_cmd {
     // HE_INFO
     // Set Read number Lines
     he_info_.value = host_exe_->read64(HE_INFO);
-    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
-
-    cout << "Numa node:" << numa_node_ << endl;
     host_exe_->write64(HE_RD_NUM_LINES, FPGA_32KB_CACHE_LINES);
+
     cout << "Read number Lines:" << FPGA_32KB_CACHE_LINES << endl;
     cout << "Line Repeat Count:" << he_linerep_count_ << endl;
+    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
+    cout << "Numa node:" << numa_node_ << endl;
 
     // set RD_CONFIG RdShared (CXL)
     he_rd_cfg_.value = 0;
@@ -172,8 +174,7 @@ class he_cache_cmd : public he_cmd {
     he_perf_counters();
 
     cout << "********** AFU Copied host cache to FPGA Cache successfully "
-            "********** "
-         << endl;
+            "********** " << endl;
 
     // set RD_CONFIG RdShared (CXL)
     he_rd_cfg_.value = 0;
@@ -207,17 +208,15 @@ class he_cache_cmd : public he_cmd {
     host_exe_->free_dsm();
     host_exe_->free_cache_read();
 
-    cout
-        << "********** AFU reads cache from FPGA Cache successfully ********** "
-        << endl;
-
+    cout << "********** AFU reads cache from FPGA Cache successfully"
+        " **********" << endl;
     cout << "********** FPGA Read cache hit test end**********" << endl;
     return 0;
   }
 
   int he_run_fpga_wr_cache_hit_test() {
-    cout << "********** FPGA Write cache hit test start**********" << endl;
 
+    cout << "********** FPGA Write cache hit test start**********" << endl;
     /*
     STEPS
     1) Allocate DSM, Read buffer, Write buffer // flush
@@ -233,13 +232,13 @@ class he_cache_cmd : public he_cmd {
     // HE_INFO
     // Set Read number Lines
     he_info_.value = host_exe_->read64(HE_INFO);
-    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
-    cout << "Write address table size:" << he_info_.write_addr_table_size
-         << endl;
-
     host_exe_->write64(HE_RD_NUM_LINES, FPGA_32KB_CACHE_LINES);
+
     cout << "Read/write number Lines:" << FPGA_32KB_CACHE_LINES << endl;
     cout << "Line Repeat Count:" << he_linerep_count_ << endl;
+    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
+    cout << "Write address table size:" << he_info_.write_addr_table_size
+        << endl;
 
     // set RD_CONFIG RdShared (CXL)
     he_rd_cfg_.value = 0;
@@ -255,7 +254,7 @@ class he_cache_cmd : public he_cmd {
 
     // Allocate DSM buffer
     if (!host_exe_->allocate_dsm()) {
-      cerr << "alloc dsm failed" << endl;
+      cerr << "allocate dsm failed" << endl;
       return -1;
     }
 
@@ -285,8 +284,7 @@ class he_cache_cmd : public he_cmd {
     he_perf_counters();
 
     cout << "********** AFU Copied host cache to FPGA Cache successfully "
-            "********** "
-         << endl;
+            "********** " << endl;
 
     // set W_CONFIG
     he_wr_cfg_.value = 0;
@@ -300,7 +298,7 @@ class he_cache_cmd : public he_cmd {
     wr_table_ctl_.enable_address_stride = 1;
     host_exe_->write64(HE_WR_ADDR_TABLE_CTRL, wr_table_ctl_.value);
 
-    host_exe_->write64(HE_WR_NUM_LINES, FPGA_32KB_CACHE_LINES - 1);
+    host_exe_->write64(HE_WR_NUM_LINES, FPGA_32KB_CACHE_LINES);
     // Start test
     he_ctl_.Start = 1;
     host_exe_->write64(HE_CTL, he_ctl_.value);
@@ -330,6 +328,7 @@ class he_cache_cmd : public he_cmd {
   }
 
   int he_run_fpga_rd_cache_miss_test() {
+
     cout << "********** FPGA Read cache miss test start**********" << endl;
     /*
     STEPS
@@ -342,11 +341,11 @@ class he_cache_cmd : public he_cmd {
     // HE_INFO
     // Set Read number Lines
     he_info_.value = host_exe_->read64(HE_INFO);
-    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
+    host_exe_->write64(HE_RD_NUM_LINES, FPGA_2MB_CACHE_LINES);
 
-    host_exe_->write64(HE_RD_NUM_LINES, FPGA_2MB_CACHE_LINES - 1);
-    cout << "Read number Lines:" << FPGA_2MB_CACHE_LINES - 1 << endl;
+    cout << "Read number Lines:" << FPGA_2MB_CACHE_LINES << endl;
     cout << "Line Repeat Count:" << he_linerep_count_ << endl;
+    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
 
     // set RD_CONFIG RdShared (CXL)
     he_rd_cfg_.value = 0;
@@ -362,13 +361,13 @@ class he_cache_cmd : public he_cmd {
 
     // Allocate DSM buffer
     if (!host_exe_->allocate_dsm()) {
-      cerr << "alloc dsm failed" << endl;
+      cerr << "allocate dsm failed" << endl;
       return -1;
     }
 
-    // Allocate Read, Write buffer
-    if (!host_exe_->allocate_cache_read_write(BUFFER_SIZE_2MB, numa_node_)) {
-      cerr << "allocate cache read write failed" << endl;
+    // Allocate Read buffer
+    if (!host_exe_->allocate_cache_read(BUFFER_SIZE_2MB, numa_node_)) {
+      cerr << "allocate cache read failed" << endl;
       host_exe_->free_dsm();
       return -1;
     }
@@ -384,25 +383,24 @@ class he_cache_cmd : public he_cmd {
       cerr << "timeout error" << endl;
       he_perf_counters();
       host_exerciser_errors();
-      host_exe_->free_cache_read_write();
+      host_exe_->free_cache_read();
       host_exe_->free_dsm();
       return -1;
     }
 
     he_perf_counters();
-    host_exe_->free_cache_read_write();
+    host_exe_->free_cache_read();
     host_exe_->free_dsm();
 
     cout << "********** AFU Read FPGA Cache Miss successfully ********** "
          << endl;
-
     cout << "********** FPGA Read cache miss test end**********" << endl;
     return 0;
   }
 
   int he_run_fpga_wr_cache_miss_test() {
-    cout << "********** FPGA write cache miss test start**********" << endl;
 
+    cout << "********** FPGA write cache miss test start**********" << endl;
     /*
     STEPS
     1) Allocate DSM, Read buffer, Write buffer
@@ -414,13 +412,13 @@ class he_cache_cmd : public he_cmd {
     // HE_INFO
     // Set Read number Lines
     he_info_.value = host_exe_->read64(HE_INFO);
-    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
-    cout << "Write address table size:" << he_info_.write_addr_table_size
-         << endl;
-
     host_exe_->write64(HE_WR_NUM_LINES, FPGA_2MB_CACHE_LINES);
+
     cout << "Read/write number Lines:" << FPGA_2MB_CACHE_LINES << endl;
     cout << "Line Repeat Count:" << he_linerep_count_ << endl;
+    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
+    cout << "Write address table size:" << he_info_.write_addr_table_size
+        << endl;
 
     // set W_CONFIG
     he_wr_cfg_.value = 0;
@@ -436,7 +434,7 @@ class he_cache_cmd : public he_cmd {
 
     // Allocate DSM buffer
     if (!host_exe_->allocate_dsm()) {
-      cerr << "alloc dsm failed" << endl;
+      cerr << "allocate dsm failed" << endl;
       return -1;
     }
 
@@ -469,14 +467,13 @@ class he_cache_cmd : public he_cmd {
 
     cout << "********** AFU Write FPGA Cache Miss successfully ********** "
          << endl;
-
     cout << "********** FPGA Write cache miss test end**********" << endl;
     return 0;
   }
 
   int he_run_host_rd_cache_hit_test() {
-    cout << "********** 1 Host LLC Read cache hit test start**********" << endl;
 
+      cout << "********** 1 Host LLC Read cache hit test start**********" << endl;
     /*
     STEPS
     1) Allocate DSM, Read buffer
@@ -488,13 +485,13 @@ class he_cache_cmd : public he_cmd {
     // HE_INFO
     // Set Read number Lines
     he_info_.value = host_exe_->read64(HE_INFO);
-    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
-    cout << "Write address table size:" << he_info_.write_addr_table_size
-         << endl;
+    host_exe_->write64(HE_RD_NUM_LINES, FPGA_32KB_CACHE_LINES);
 
-    host_exe_->write64(HE_RD_NUM_LINES, FPGA_32KB_CACHE_LINES - 1);
-    cout << "Read number Lines:" << FPGA_32KB_CACHE_LINES - 1 << endl;
+    cout << "Read number Lines:" << FPGA_32KB_CACHE_LINES << endl;
     cout << "Line Repeat Count:" << he_linerep_count_ << endl;
+    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
+    cout << "Write address table size:" << he_info_.write_addr_table_size
+        << endl;
 
     // set RD_CONFIG RdShared (CXL)
     he_rd_cfg_.value = 0;
@@ -545,8 +542,6 @@ class he_cache_cmd : public he_cmd {
       return -1;
     }
 
-    he_perf_counters();
-
     g_stop_thread = true;
     t1.join();
 
@@ -556,14 +551,13 @@ class he_cache_cmd : public he_cmd {
     host_exe_->free_dsm();
 
     cout << "********** AFU Copied host cache to FPGA Cache successfully "
-            "********** "
-         << endl;
-
+            "********** " << endl;
     cout << "********** Host LLC cache hit test end**********" << endl;
     return 0;
   }
 
   int he_run_host_wr_cache_hit_test() {
+
     cout << "********** Host LLC Write cache hit test start**********" << endl;
 
     /*
@@ -577,13 +571,13 @@ class he_cache_cmd : public he_cmd {
     // HE_INFO
     // Set Read number Lines
     he_info_.value = host_exe_->read64(HE_INFO);
-    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
-    cout << "Write address table size:" << he_info_.write_addr_table_size
-         << endl;
 
-    host_exe_->write64(HE_WR_NUM_LINES, FPGA_32KB_CACHE_LINES - 1);
-    cout << "Write number Lines:" << FPGA_32KB_CACHE_LINES - 1 << endl;
+    host_exe_->write64(HE_WR_NUM_LINES, FPGA_32KB_CACHE_LINES);
+    cout << "Write number Lines:" << FPGA_32KB_CACHE_LINES  << endl;
     cout << "Line Repeat Count:" << he_linerep_count_ << endl;
+    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
+    cout << "Write address table size:" << he_info_.write_addr_table_size
+        << endl;
 
     // set RD_CONFIG
     he_wr_cfg_.value = 0;
@@ -661,13 +655,12 @@ class he_cache_cmd : public he_cmd {
     // HE_INFO
     // Set Read number Lines
     he_info_.value = host_exe_->read64(HE_INFO);
-    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
-    cout << "Write address table size:" << he_info_.write_addr_table_size
-         << endl;
-
     host_exe_->write64(HE_RD_NUM_LINES, FPGA_32KB_CACHE_LINES - 1);
     cout << "Read/write number Lines:" << FPGA_32KB_CACHE_LINES - 1 << endl;
     cout << "Line Repeat Count:" << he_linerep_count_ << endl;
+    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
+    cout << "Write address table size:" << he_info_.write_addr_table_size
+        << endl;
 
     // set RD_CONFIG
     he_rd_cfg_.value = 0;
@@ -694,9 +687,6 @@ class he_cache_cmd : public he_cmd {
       return -1;
     }
 
-    // flush host cache
-    // int status = cacheflush((host_exe_->get_read(), BUFFER_SIZE_2MB, BCACHE);
-
     // start
     he_ctl_.Start = 1;
     host_exe_->write64(HE_CTL, he_ctl_.value);
@@ -714,7 +704,6 @@ class he_cache_cmd : public he_cmd {
     }
 
     he_perf_counters();
-
     host_exe_->free_cache_read();
     host_exe_->free_dsm();
 
@@ -726,6 +715,7 @@ class he_cache_cmd : public he_cmd {
   }
 
   int he_run_host_wr_cache_miss_test() {
+
     cout << "********** Host LLC Write cache miss test start**********" << endl;
 
     /*
@@ -739,13 +729,12 @@ class he_cache_cmd : public he_cmd {
     // HE_INFO
     // Set write number Lines
     he_info_.value = host_exe_->read64(HE_INFO);
+    host_exe_->write64(HE_WR_NUM_LINES, 1);
+    cout << "Write number Lines:" << 1 << endl;
+    cout << "Line Repeat Count:" << he_linerep_count_ << endl;
     cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
     cout << "Write address table size:" << he_info_.write_addr_table_size
-         << endl;
-
-    host_exe_->write64(HE_WR_NUM_LINES, FPGA_32KB_CACHE_LINES - 1);
-    cout << "Write number Lines:" << FPGA_32KB_CACHE_LINES - 1 << endl;
-    cout << "Line Repeat Count:" << he_linerep_count_ << endl;
+        << endl;
 
     // set RD_CONFIG
     he_wr_cfg_.value = 0;
@@ -765,7 +754,7 @@ class he_cache_cmd : public he_cmd {
       return -1;
     }
 
-    // Allocate Read buffer
+    // Allocate write buffer
     if (!host_exe_->allocate_cache_write(BUFFER_SIZE_2MB, numa_node_)) {
       cerr << "allocate cache read failed" << endl;
       host_exe_->free_dsm();
@@ -789,7 +778,6 @@ class he_cache_cmd : public he_cmd {
     }
 
     he_perf_counters();
-
     host_exe_->free_cache_write();
     host_exe_->free_dsm();
 
@@ -800,139 +788,6 @@ class he_cache_cmd : public he_cmd {
     return 0;
   }
 
-  // Convert number of transactions to bandwidth (GB/s)
-  double he_num_xfers_to_bw(uint64_t num_lines, uint64_t num_ticks) {
-    return (double)(num_lines * 64) / ((1000.0 / he_clock_mhz_ * num_ticks));
-  }
-
-  void he_perf_counters() {
-    volatile he_cache_dsm_status *dsm_status = NULL;
-
-    dsm_status = reinterpret_cast<he_cache_dsm_status *>(
-        (uint8_t *)(host_exe_->get_dsm()));
-    if (!dsm_status)
-      return;
-
-    cout << "\n********* DSM Status CSR Start *********" << std::endl;
-
-    cout << "test completed :" << dsm_status->test_completed << endl;
-    cout << "dsm number:" << dsm_status->dsm_number << endl;
-    cout << "error vector:" << dsm_status->err_vector << endl;
-    cout << "num ticks:" << dsm_status->num_ticks << endl;
-    cout << "num reads:" << dsm_status->num_reads << endl;
-    cout << "num writes:" << dsm_status->num_writes << endl;
-    cout << "penalty start:" << dsm_status->penalty_start << endl;
-    cout << "penalty end:" << dsm_status->penalty_end << endl;
-    cout << "actual data:" << dsm_status->actual_data << endl;
-    cout << "expected data:" << dsm_status->expected_data << endl;
-
-    // print bandwidth
-    if (dsm_status->num_ticks > 0) {
-      double perf_data =
-          he_num_xfers_to_bw(dsm_status->num_reads + dsm_status->num_writes,
-                             dsm_status->num_ticks);
-      host_exe_->logger_->info("Bandwidth: {0:0.3f} GB/s", perf_data);
-    }
-
-    std::cout << "********* DSM Status CSR end *********" << std::endl;
-  }
-
-  void host_exerciser_errors() {
-    he_err_status err_status;
-    uint64_t err = 0;
-    if (host_exe_ == NULL)
-      return;
-
-    err_status.value = host_exe_->read64(HE_ERROR_STATUS);
-    if (err_status.data_error == 1) {
-      cout << "Data Integrity Check error occured" << endl;
-    }
-
-    if (err_status.err_index > 0) {
-      cout << "Error occurred at cache line address:" << err_status.err_index
-           << endl;
-    }
-
-    err = host_exe_->read64(HE_ERROR_EXP_DATA);
-    cout << "Error Expected Data:" << err << endl;
-
-    err = host_exe_->read64(HE_ERROR_ACT_DATA0);
-    cout << "Error Expected Data0:" << err << endl;
-
-    err = host_exe_->read64(HE_ERROR_ACT_DATA1);
-    cout << "Error Expected Data1:" << err << endl;
-
-    err = host_exe_->read64(HE_ERROR_ACT_DATA2);
-    cout << "Error Expected Data2:" << err << endl;
-
-    err = host_exe_->read64(HE_ERROR_ACT_DATA3);
-    cout << "Error Expected Data3:" << err << endl;
-
-    err = host_exe_->read64(HE_ERROR_ACT_DATA4);
-    cout << "Error Expected Data4:" << err << endl;
-
-    err = host_exe_->read64(HE_ERROR_ACT_DATA5);
-    cout << "Error Expected Data5:" << err << endl;
-
-    err = host_exe_->read64(HE_ERROR_ACT_DATA6);
-    cout << "Error Expected Data6:" << err << endl;
-
-    err = host_exe_->read64(HE_ERROR_ACT_DATA7);
-    cout << "Error Expected Data7:" << err << endl;
-  }
-
-  int parse_input_options() {
-
-    if (!host_exe_)
-      return -1;
-
-    return 0;
-  }
-
-  bool he_wait_test_completion() {
-    /* Wait for test completion */
-    uint32_t timeout = HELPBK_TEST_TIMEOUT;
-
-    volatile uint8_t *status_ptr = host_exe_->get_dsm();
-    while (0 == ((*status_ptr) & 0x1)) {
-      usleep(HELPBK_TEST_SLEEP_INVL);
-      if (--timeout == 0) {
-        cout << "HE LPBK TIME OUT" << std::endl;
-
-        return false;
-      }
-    }
-    return true;
-  }
-
-  bool verify_numa_node() {
-
-    if (numa_available() < 0) {
-      printf("System does not support NUMA API!\n");
-      return false;
-    }
-    int n = numa_max_node();
-    printf("There are %d nodes on your system\n", n + 1);
-
-    int cup_num = sched_getcpu();
-    printf("cup_num:%d\n", cup_num);
-
-    int node = numa_node_of_cpu(cup_num);
-    printf("node:%d\n", node);
-
-    if (he_target_ == HE_TARGET_HOST) {
-      numa_node_ = node;
-      printf("HE_TARGET_HOST numa_node_:%d\n", numa_node_);
-
-    } else {
-      // find fpga numa node numebr
-      numa_node_ = 2;
-      printf("HE_TARGET_FPGA numa_node_:%d\n", numa_node_);
-    }
-
-    return true;
-  }
-
   virtual int run(test_afu *afu, CLI::App *app) {
     (void)app;
     int ret = 0;
@@ -1047,18 +902,18 @@ class he_cache_cmd : public he_cmd {
 };
 
 void he_cache_thread(uint8_t *buf_ptr, uint64_t len) {
+
+    uint64_t value;
+    UNUSED_PARAM(value);
+    uint64_t cache_lines = len / CL;
+    uint64_t i = 0;
+
   if (buf_ptr == NULL || len == 0) {
     return;
   }
-  uint64_t value;
-  UNUSED_PARAM(value);
-  uint64_t cache_lines = len / CL;
-  uint64_t i = 0;
 
   while (true) {
-
     if (g_stop_thread == true) {
-      // cout << "he_cache_thread g_stop_thread " << endl;
       return;
     }
     if (i < cache_lines) {
diff --git a/samples/cxl_host_exerciser/cxl_he_cmd.h b/samples/cxl_host_exerciser/cxl_he_cmd.h
index 048937fd2dcf..9596feec7930 100644
--- a/samples/cxl_host_exerciser/cxl_he_cmd.h
+++ b/samples/cxl_host_exerciser/cxl_he_cmd.h
@@ -61,18 +61,17 @@ class he_cmd : public test_command {
     if (!dsm_status)
       return;
 
-    std::cout << "\n********* DSM Status CSR Start *********" << std::endl;
-
-    std::cout << "test completed :" << dsm_status->test_completed << std::endl;
-    std::cout << "dsm number:" << dsm_status->dsm_number << std::endl;
-    std::cout << "error vector:" << dsm_status->err_vector << std::endl;
-    std::cout << "num ticks:" << dsm_status->num_ticks << std::endl;
-    std::cout << "num reads:" << dsm_status->num_reads << std::endl;
-    std::cout << "num writes:" << dsm_status->num_writes << std::endl;
-    std::cout << "penalty start:" << dsm_status->penalty_start << std::endl;
-    std::cout << "penalty end:" << dsm_status->penalty_end << std::endl;
-    std::cout << "actual data:" << dsm_status->actual_data << std::endl;
-    std::cout << "expected data:" << dsm_status->expected_data << std::endl;
+    cout << "\n********* DSM Status CSR Start *********" << endl;
+    cout << "test completed :" << dsm_status->test_completed << endl;
+    cout << "dsm number:" << dsm_status->dsm_number << endl;
+    cout << "error vector:" << dsm_status->err_vector << endl;
+    cout << "num ticks:" << dsm_status->num_ticks << endl;
+    cout << "num reads:" << dsm_status->num_reads << endl;
+    cout << "num writes:" << dsm_status->num_writes << endl;
+    cout << "penalty start:" << dsm_status->penalty_start << endl;
+    cout << "penalty end:" << dsm_status->penalty_end << endl;
+    cout << "actual data:" << dsm_status->actual_data << endl;
+    cout << "expected data:" << dsm_status->expected_data << endl;
 
     // print bandwidth
     if (dsm_status->num_ticks > 0) {
@@ -82,7 +81,7 @@ class he_cmd : public test_command {
       host_exe_->logger_->info("Bandwidth: {0:0.3f} GB/s", perf_data);
     }
 
-    std::cout << "********* DSM Status CSR end *********" << std::endl;
+    cout << "********* DSM Status CSR end *********" << endl;
   }
 
   void host_exerciser_errors() {
@@ -145,7 +144,7 @@ class he_cmd : public test_command {
     while (0 == ((*status_ptr) & 0x1)) {
       usleep(HELPBK_TEST_SLEEP_INVL);
       if (--timeout == 0) {
-        cout << "HE LPBK TIME OUT" << std::endl;
+        cout << "HE LPBK TIME OUT" << endl;
 
         return false;
       }
@@ -156,37 +155,28 @@ class he_cmd : public test_command {
   bool verify_numa_node() {
 
     if (numa_available() < 0) {
-      printf("System does not support NUMA API!\n");
+      cerr << "System does not support NUMA API" << endl;
       return false;
     }
 
-    printf("SUpported NUMA API!\n");
-
     int n = numa_max_node();
-    printf("There are %d nodes on your system\n", n + 1);
+    cout << "There are " << n + 1 << " nodes on your system" << endl;
 
-    int cup_num = sched_getcpu();
-    printf("cup_num:%d\n", cup_num);
+    int cpu_num = sched_getcpu();
+    cout << "cpu num:" << cpu_num << endl;
 
-    int node = numa_node_of_cpu(cup_num);
-    printf("node:%d\n", node);
+    int numa_node = numa_node_of_cpu(cpu_num);
+    cout << "numa node:" << numa_node << endl;
 
     if (he_target_ == HE_TARGET_HOST) {
-      numa_node_ = node;
-      printf("HE_TARGET_HOST numa_node_:%d\n", numa_node_);
-
+      numa_node_ = numa_node;
+      cout << "HE_TARGET_HOST numa node:" << numa_node_ << endl;
     } else {
-      // find fpga numa node numebr
+      // find fpga numa node number
       numa_node_ = 2;
-      printf("HE_TARGET_FPGA numa_node_:%d\n", numa_node_);
+      cout << "HE_TARGET_FPGA numa node:" << numa_node_ << endl;
     }
 
-    int num_config_cpu = numa_num_configured_cpus();
-    printf("num_config_cpu:%d\n", num_config_cpu);
-
-    int num_task_nodes = numa_num_task_nodes();
-    printf("num_task_nodes:%d\n", num_task_nodes);
-
     return true;
   }
 
diff --git a/samples/cxl_host_exerciser/cxl_host_exerciser.h b/samples/cxl_host_exerciser/cxl_host_exerciser.h
index adae83320674..917e59f798a3 100644
--- a/samples/cxl_host_exerciser/cxl_host_exerciser.h
+++ b/samples/cxl_host_exerciser/cxl_host_exerciser.h
@@ -365,7 +365,7 @@ using test_command = opae::afu_test::command;
 class host_exerciser : public test_afu {
 public:
   host_exerciser()
-      : test_afu("host_exerciser", nullptr, "warning"), count_(1) {}
+      : test_afu("host_exerciser", nullptr, "info"), count_(1) {}
 
   virtual int run(CLI::App *app, test_command::ptr_t test) override {
     int res = exit_codes::not_run;
@@ -374,7 +374,8 @@ class host_exerciser : public test_afu {
     // Info prints details of an individual run. Turn it on if doing only one
     // test and the user hasn't changed level from the default.
     if ((log_level_.compare("warning") == 0))
-      logger_->set_level(spdlog::level::info);
+       logger_->set_level(spdlog::level::info);
+
 
     logger_->info("starting test run, count of {0:d}", count_);
     uint32_t count = 0;
diff --git a/samples/cxl_host_exerciser/dfl-he-cache.h b/samples/cxl_host_exerciser/dfl-he-cache.h
deleted file mode 100644
index d6036c832dbf..000000000000
--- a/samples/cxl_host_exerciser/dfl-he-cache.h
+++ /dev/null
@@ -1,128 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * Header File for host exerciser cache DFL User API
- *
- * Copyright (C) 2023 Intel Corporation, Inc.
- *
- * Authors:
- *   Tim Whisonant <tim.whisonant@intel.com>
- *   Ananda Ravuri <ananda.ravuri@intel.com>
- *   Russell H. Weight <russell.h.weight@intel.com>
- */
-
-#ifndef _UAPI_LINUX_HE_CACHE_DFL_H
-#define _UAPI_LINUX_HE_CACHE_DFL_H
-
-#include <linux/ioctl.h>
-#include <linux/types.h>
-
-#define DFL_HE_CACHE_API_VERSION 0
-
-/*
- * The IOCTL interface for DFL based HE CACHE is designed for extensibility by
- * embedding the structure length (argsz) and flags into structures passed
- * between kernel and userspace. This design referenced the VFIO IOCTL
- * interface (include/uapi/linux/vfio.h).
- */
-
-#define DFL_HE_CACHE_MAGIC 0xB6
-
-#define DFL_HE_CACHE_BASE 0
-
-/**
- * DFL_HE_CACHE_GET_API_VERSION - _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 0)
- *
- * Report the version of the driver API.
- * Return: Driver API Version.
- */
-
-#define DFL_HE_CACHE_GET_API_VERSION                                           \
-  _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 0)
-
-/**
- * DFL_HE_CACHE_CHECK_EXTENSION - _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 1)
- *
- * Check whether an extension is supported.
- * Return: 0 if not supported, otherwise the extension is supported.
- */
-
-#define DFL_HE_CACHE_CHECK_EXTENSION                                           \
-  _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 1)
-
-/**
- * DFL_HE_CACHE_GET_REGION_INFO - _IOWR(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE +
- * 2, struct dfl_he_cache_region_info)
- *
- * Retrieve information about a device memory region.
- * Caller provides struct dfl_he_cache_region_info with flags.
- * Driver returns the region info in other fields.
- * Return: 0 on success, -errno on failure.
- */
-
-#define DFL_HE_CACHE_GET_REGION_INFO                                           \
-  _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 2)
-
-struct dfl_he_cache_region_info {
-  /* Input */
-  __u32 argsz; /* Structure length */
-  /* Output */
-  __u32 flags;                             /* Access permission */
-#define DFL_HE_CACHE_REGION_READ (1 << 0)  /* Region is readable */
-#define DFL_HE_CACHE_REGION_WRITE (1 << 1) /* Region is writable */
-#define DFL_HE_CACHE_REGION_MMAP (1 << 2)  /* Can be mmaped to userspace */
-  __u64 size;                              /* Region size (bytes) */
-  __u64 offset; /* Region offset from start of device fd */
-};
-
-/**
-* DFL_HE_CACHE_NUMA_DMA_MAP - _IOWR(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 3,
-*                                      struct dfl_he_cache_dma_map)
-*
-* Map the dma memory per user_addr,length and numa node which are provided by
-caller.
-* The driver allocates memory on the numa node, converts the user's virtual
-address
-* to a continuous physical address, and writes the physical address to
-* the host executor's read/write address table CSR.
-
-* This interface only accepts page-size aligned user memory for dma mapping.
-* Return: 0 on success, -errno on failure.
-*/
-
-#define DFL_ARRAY_MAX_SIZE 0x10
-
-#define DFL_HE_CACHE_NUMA_DMA_MAP _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 3)
-
-struct dfl_he_cache_dma_map {
-  /* Input */
-  __u32 argsz;                         /* Structure length */
-  __u32 flags;                         /* flags */
-  __u64 user_addr;                     /* Process virtual address */
-  __u64 length;                        /* Length of mapping (bytes)*/
-  __u32 numa_node;                     /* Node 0,1 2 */
-  __u64 csr_array[DFL_ARRAY_MAX_SIZE]; /* CSR  */
-};
-
-/**
- * DFL_HE_CACHE_NUMA_DMA_UNMAP - _IOWR(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE +
- * 4, struct dfl_he_cache_dma_unmap)
- *
- * Unmpas the dma memory per user_addr and length which are provided by caller.
- * The driver deletes the physical pages of the user address and writes a zero
- * to the read/write address table CSR.
- * Return: 0 on success, -errno on failure.
- */
-
-#define DFL_HE_CACHE_NUMA_DMA_UNMAP                                            \
-  _IO(DFL_HE_CACHE_MAGIC, DFL_HE_CACHE_BASE + 4)
-
-struct dfl_he_cache_dma_unmap {
-  /* Input */
-  __u32 argsz;                         /* Structure length */
-  __u32 flags;                         /* flags */
-  __u64 user_addr;                     /* Process virtual address */
-  __u64 length;                        /* Length of mapping (bytes)*/
-  __u64 csr_array[DFL_ARRAY_MAX_SIZE]; /* CSR */
-};
-
-#endif /* _UAPI_LINUX_HE_CACHE_DFL_H */
diff --git a/samples/cxl_host_exerciser/he_cache_test.h b/samples/cxl_host_exerciser/he_cache_test.h
index 4841eb9267ab..9c60e5ad5d62 100644
--- a/samples/cxl_host_exerciser/he_cache_test.h
+++ b/samples/cxl_host_exerciser/he_cache_test.h
@@ -45,7 +45,7 @@
 #include <spdlog/sinks/basic_file_sink.h>
 #include <opae/cxx/core.h>
 
-#include "dfl-he-cache.h"
+#include "../../libraries/plugins/xfpga/fpga-dfl.h"
 
 using namespace std;
 
@@ -87,16 +87,16 @@ enum { MATCHES_SIZE = 6 };
 #define MiB(x) ((x)*1024 * 1024)
 #define GiB(x) ((x)*1024 * 1024 * 1024)
 
-#define DFL_HE_CACHE_DSM_BASE 0x030
-#define DFL_HE_CACHE_WR_ADDR_TABLE_DATA 0x068
-#define DFL_HE_CACHE_RD_ADDR_TABLE_DATA 0x088
+#define DFL_CXL_CACHE_DSM_BASE 0x030
+#define DFL_CXL_CACHE_WR_ADDR_TABLE_DATA 0x068
+#define DFL_CXL_CACHE_RD_ADDR_TABLE_DATA 0x088
 
 void *alloc_2mb_hugepage(void) {
   void *addr;
 
   addr = mmap(ADDR, MiB(2), PROTECTION, FLAGS_2M, 0, 0);
   if (addr == MAP_FAILED) {
-    printf("alloc_2mb_hugepage() failed: %s\n", strerror(errno));
+    cerr << "alloc_2mb_hugepage() failed:" << strerror(errno) << endl;
     addr = NULL;
   }
 
@@ -109,7 +109,7 @@ void *alloc_32kb_hugepage(void) {
 
   addr = mmap(ADDR, KiB(32), PROTECTION, FLAGS_4K, 0, 0);
   if (addr == MAP_FAILED) {
-    printf("alloc_1kb_hugepage() failed: %s\n", strerror(errno));
+    cerr << "alloc_32kb_hugepage() failed:" << strerror(errno) << endl;
     addr = NULL;
   }
 
@@ -121,7 +121,7 @@ void *alloc_4kb_hugepage(void) {
 
   addr = mmap(ADDR, KiB(4), PROTECTION, FLAGS_4K, 0, 0);
   if (addr == MAP_FAILED) {
-    printf("alloc_1kb_hugepage() failed: %s\n", strerror(errno));
+    cerr << "alloc_4kb_hugepage() failed:" << strerror(errno) << endl;
     addr = NULL;
   }
 
@@ -267,6 +267,9 @@ class afu {
         ->default_str(std::to_string(timeout_msec_));
   }
   virtual ~afu() {
+
+    if (fd_ > 0)
+        close(fd_);
     if (logger_)
       spdlog::drop(logger_->name());
   }
@@ -314,7 +317,7 @@ class afu {
         string substr_dev(str.substr(0, str.rfind("/")));
         globfree(&pglob);
 
-        substr_dev.append("/he-cache/he-cache*");
+        substr_dev.append("/dfl-cxl-cache/dfl-cxl-cache*");
         gres = glob(substr_dev.c_str(), GLOB_NOSORT, NULL, &pglob);
         if (gres) {
           cerr << "Failed pattern match" << substr_dev.c_str() << ":"
@@ -325,7 +328,7 @@ class afu {
         string str1(pglob.gl_pathv[0]);
         globfree(&pglob);
         dev_path_.append("/dev");
-        dev_path_.append(str1.substr(str1.rfind("/"), 13));
+        dev_path_.append(str1.substr(str1.rfind("/"))); // whole node name, any instance number
 
         return 0;
       }
@@ -357,7 +360,7 @@ class afu {
   int open_handle() {
 
     int res = 0;
-    cout << "dev_path_:" << dev_path_ << endl;
+    logger_->debug("dev_path_:{0}", dev_path_);
 
     fd_ = open(dev_path_.c_str(), O_RDWR);
     if (fd_ < 0) {
@@ -367,16 +370,15 @@ class afu {
 
     memset(&rinfo_, 0, sizeof(rinfo_));
     rinfo_.argsz = sizeof(rinfo_);
-    res = ioctl(fd_, DFL_HE_CACHE_GET_REGION_INFO, &rinfo_);
+    res = ioctl(fd_, DFL_CXL_CACHE_GET_REGION_INFO, &rinfo_);
     if (res) {
-      cerr << "ioctl() DFL_HE_CACHE_GET_REGION_INFO failed:" << strerror(errno)
+      cerr << "ioctl() DFL_CXL_CACHE_GET_REGION_INFO failed:" << strerror(errno)
            << endl;
       close(fd_);
       return 2;
     }
-
-    printf("MMIO region flags: 0x%x size: %llu offset: %llu\n", rinfo_.flags,
-           rinfo_.size, rinfo_.offset);
+    logger_->debug("MMIO region flags:0x:{0:x} size:0x {1:x} offset:0x {2:x}",
+        rinfo_.flags, rinfo_.size, rinfo_.offset);
 
     if (!map_mmio()) {
       cerr << "mmap failed:" << strerror(errno) << endl;
@@ -385,10 +387,10 @@ class afu {
     }
 
     volatile uint64_t *u64 = (volatile uint64_t *)mmio_base_;
-    printf("DFH     : 0x%016" PRIx64 "\n", *u64);
-    printf("DFH + 8 : 0x%016" PRIx64 "\n", *(u64 + 1));
-    printf("DFH + 16: 0x%016" PRIx64 "\n", *(u64 + 2));
-    printf("DFH + 24: 0x%016" PRIx64 "\n", *(u64 + 3));
+    logger_->debug("DFH     : 0x:{0:X}", *u64);
+    logger_->debug("DFH + 8 : 0x:{0:X}", *(u64 + 1));
+    logger_->debug("DFH + 16: 0x:{0:X}", *(u64 + 2));
+    logger_->debug("DFH + 24: 0x:{0:X}", *(u64 + 3));
 
     return exit_codes::not_run;
   }
@@ -483,54 +485,39 @@ class afu {
 
   command::ptr_t current_command() const { return current_command_; }
 
-  bool open_device() {
-
-    // std::cerr << "open\n" << dev_str;
-    fd_ = open(dev_path_.c_str(), O_RDWR);
-    if (fd_ < 0) {
-      printf("open() failed: %s\n", strerror(errno));
-      return false;
-    }
-
-    return true;
-  }
-
-  bool close_device() {
-    if (fd_ > 0)
-      close(fd_);
-    return true;
-  }
+  bool allocate_dsm(size_t len = KiB(4), uint32_t numa_node = 0) {
 
-  bool allocate_dsm(size_t len = KiB(4), uint32_t node = 0) {
     int res = 0;
     void *ptr = NULL;
-    struct dfl_he_cache_dma_map dma_map;
-    // cout << "allocate_dsm\n";
+    struct dfl_cxl_cache_dma_map dma_map;
 
     memset(&dma_map, 0, sizeof(dma_map));
-
     ptr = alloc_4kb_hugepage();
     if (!ptr) {
-      cerr << "failed to allocate 4k huge page:" << strerror(errno) << endl;
+      cerr << "Fails to allocate 4k huge page:" << strerror(errno) << endl;
       return false;
     }
 
+    cout << "DSM buffer numa node: " << numa_node << endl;
     dma_map.argsz = sizeof(dma_map);
     dma_map.user_addr = (__u64)ptr;
     dma_map.length = len;
-    dma_map.numa_node = node;
-    dma_map.csr_array[0] = DFL_HE_CACHE_DSM_BASE; // 0x030
+    dma_map.numa_node = numa_node;
+    dma_map.csr_array[0] = DFL_CXL_CACHE_DSM_BASE; // 0x030
+
+    logger_->debug("Allocate DSM buffer user addr 0x:{0:x} length : {1:d} numa node : {2:d}",
+        dma_map.user_addr, dma_map.length, dma_map.numa_node);
 
     volatile uint64_t *u64 =
-        (volatile uint64_t *)(mmio_base_ + DFL_HE_CACHE_DSM_BASE);
+        (volatile uint64_t *)(mmio_base_ + DFL_CXL_CACHE_DSM_BASE);
 
-    res = ioctl(fd_, DFL_HE_CACHE_NUMA_DMA_MAP, &dma_map);
+    res = ioctl(fd_, DFL_CXL_CACHE_NUMA_DMA_MAP, &dma_map);
     if (res) {
+      cerr << "ioctl DFL_CXL_CACHE_NUMA_DMA_MAP failed:" << strerror(errno)
+      cerr << "ioctl DFL_CXL_CACHE_NUMA_NODE_DSM_INFO failed" << strerror(errno)
            << endl;
       goto out_free;
     }
-    printf("DSM_BASE: 0x%016" PRIx64 "\n", *u64);
+    logger_->debug("DSM_BASE     : 0x:{0:x}", *u64);
 
     dsm_buffer_ = (uint8_t *)ptr;
     dsm_buf_len_ = len;
@@ -542,28 +529,30 @@ class afu {
   }
 
   bool free_dsm() {
-    struct dfl_he_cache_dma_unmap dma_unmap;
+
     int res = 0;
+    struct dfl_cxl_cache_dma_unmap dma_unmap;
 
-    // cout << "free_dsm\n" << endl;
     memset(&dma_unmap, 0, sizeof(dma_unmap));
-
     dma_unmap.argsz = sizeof(dma_unmap);
     dma_unmap.user_addr = (__u64)dsm_buffer_;
     dma_unmap.length = dsm_buf_len_;
-    dma_unmap.csr_array[0] = DFL_HE_CACHE_DSM_BASE; // 0x030
+    dma_unmap.csr_array[0] = DFL_CXL_CACHE_DSM_BASE; // 0x030
+
+    logger_->debug("free dsm user addr 0x:{0:x} length : {1:d} ",
+        dma_unmap.user_addr, dma_unmap.length);
 
     volatile uint64_t *u64 =
-        (volatile uint64_t *)(mmio_base_ + DFL_HE_CACHE_DSM_BASE);
+        (volatile uint64_t *)(mmio_base_ + DFL_CXL_CACHE_DSM_BASE);
 
-    res = ioctl(fd_, DFL_HE_CACHE_NUMA_DMA_UNMAP, &dma_unmap);
+    res = ioctl(fd_, DFL_CXL_CACHE_NUMA_DMA_UNMAP, &dma_unmap);
     if (res) {
-      cerr << "ioctl DFL_HE_CACHE_NUMA_DMA_UNMAP failed" << strerror(errno)
+      cerr << "ioctl DFL_CXL_CACHE_NUMA_DMA_UNMAP failed:" << strerror(errno)
            << endl;
     }
-    printf("DSM_BASE: 0x%016" PRIx64 "\n", *u64);
-    free_memory(dsm_buffer_, dsm_buf_len_);
 
+    logger_->debug("DSM_BASE     : 0x:{0:x}", *u64);
+    free_memory(dsm_buffer_, dsm_buf_len_);
     return true;
   }
 
@@ -571,37 +560,37 @@ class afu {
 
     int res = 0;
     void *ptr = NULL;
-    struct dfl_he_cache_dma_map dma_map;
-
-    // cout << "allocate_cache_read\n";
+    struct dfl_cxl_cache_dma_map dma_map;
 
     memset(&dma_map, 0, sizeof(dma_map));
-
     ptr = alloc_2mb_hugepage();
     if (!ptr) {
-      cerr << "failed to allocate huge pages\n" << endl;
+      cerr << "Fails to allocate 2MB huge pages" << endl;
       return false;
     }
 
-    cout << "numa_node: " << numa_node << endl;
+    cout << "Read buffer numa node: " << numa_node << endl;
 
     dma_map.argsz = sizeof(dma_map);
     dma_map.user_addr = (__u64)ptr;
     dma_map.length = len;
     dma_map.numa_node = numa_node;
-    dma_map.csr_array[0] = DFL_HE_CACHE_RD_ADDR_TABLE_DATA; // 0x88
+    dma_map.csr_array[0] = DFL_CXL_CACHE_RD_ADDR_TABLE_DATA; // 0x88
+
+    logger_->debug("Allocate read buffer user addr 0x:{0:x} length : {1:d} numa node : {2:d}",
+        dma_map.user_addr, dma_map.length, dma_map.numa_node);
 
     volatile uint64_t *u64 =
-        (volatile uint64_t *)(mmio_base_ + DFL_HE_CACHE_RD_ADDR_TABLE_DATA);
+        (volatile uint64_t *)(mmio_base_ + DFL_CXL_CACHE_RD_ADDR_TABLE_DATA);
 
-    res = ioctl(fd_, DFL_HE_CACHE_NUMA_DMA_MAP, &dma_map);
+    res = ioctl(fd_, DFL_CXL_CACHE_NUMA_DMA_MAP, &dma_map);
     if (res) {
-      cerr << "ioctl DFL_HE_CACHE_NUMA_DMA_MAP failed" << strerror(errno)
+      cerr << "ioctl DFL_CXL_CACHE_NUMA_DMA_MAP failed:" << strerror(errno)
            << endl;
       goto out_free;
     }
-    printf("DFL_HE_CACHE_RD_ADDR_TABLE_DATA: 0x%016" PRIx64 "\n", *u64);
 
+    logger_->debug("DFL_CXL_CACHE_RD_ADDR_TABLE_DATA     : 0x:{0:x}", *u64);
     rd_buffer_ = (uint8_t *)ptr;
     rd_buf_len_ = len;
     return true;
@@ -612,64 +601,70 @@ class afu {
   }
 
   bool free_cache_read() {
-    struct dfl_he_cache_dma_unmap dma_unmap;
+
     int res = 0;
+    struct dfl_cxl_cache_dma_unmap dma_unmap;
 
     memset(&dma_unmap, 0, sizeof(dma_unmap));
-
     dma_unmap.argsz = sizeof(dma_unmap);
     dma_unmap.user_addr = (__u64)rd_buffer_;
     dma_unmap.length = rd_buf_len_;
-    dma_unmap.csr_array[0] = DFL_HE_CACHE_RD_ADDR_TABLE_DATA; // 0x88
+    dma_unmap.csr_array[0] = DFL_CXL_CACHE_RD_ADDR_TABLE_DATA; // 0x88
+
+    logger_->debug("free read user addr 0x:{0:x} length : {1:d} ",
+        dma_unmap.user_addr, dma_unmap.length);
 
     volatile uint64_t *u64 =
-        (volatile uint64_t *)(mmio_base_ + DFL_HE_CACHE_RD_ADDR_TABLE_DATA);
-    res = ioctl(fd_, DFL_HE_CACHE_NUMA_DMA_UNMAP, &dma_unmap);
+        (volatile uint64_t *)(mmio_base_ + DFL_CXL_CACHE_RD_ADDR_TABLE_DATA);
+    res = ioctl(fd_, DFL_CXL_CACHE_NUMA_DMA_UNMAP, &dma_unmap);
     if (res) {
-      cerr << "ioctl DFL_HE_CACHE_NUMA_DMA_UNMAP failed" << strerror(errno)
+      cerr << "ioctl DFL_CXL_CACHE_NUMA_DMA_UNMAP failed" << strerror(errno)
            << endl;
     }
 
-    printf("DFL_HE_CACHE_RD_ADDR_TABLE_DATA: 0x%016" PRIx64 "\n", *u64);
+    logger_->debug("DFL_CXL_CACHE_RD_ADDR_TABLE_DATA     : 0x:{0:x}", *u64);
     free_memory(rd_buffer_, rd_buf_len_);
 
     return true;
   }
 
   bool allocate_cache_write(size_t len = MiB(2), uint32_t numa_node = 0) {
-    int res;
-    void *ptr;
-    struct dfl_he_cache_dma_map dma_map;
 
-    // std::cout << "allocate_cache_write" << endl;
+    int res  = 0;
+    void *ptr = NULL;
+    struct dfl_cxl_cache_dma_map dma_map;
 
     memset(&dma_map, 0, sizeof(dma_map));
-
     ptr = alloc_2mb_hugepage();
     if (!ptr) {
-      cerr << "failed to allocate huge pages\n" << endl;
+      cerr << "Fails to allocate 2MB huge pages" << endl;
       return false;
     }
 
+    cout << "Write buffer numa node: " << numa_node << endl;
     dma_map.argsz = sizeof(dma_map);
     dma_map.user_addr = (__u64)ptr;
     dma_map.length = len;
     dma_map.numa_node = numa_node;
-    dma_map.csr_array[0] = DFL_HE_CACHE_WR_ADDR_TABLE_DATA; // 0x68;
+    dma_map.csr_array[0] = DFL_CXL_CACHE_WR_ADDR_TABLE_DATA; // 0x68;
+
+    logger_->debug("Allocate write buffer user addr 0x:{0:x}\
+        length : {1:d} numa node : {2:d}",
+        dma_map.user_addr, dma_map.length, dma_map.numa_node);
 
     volatile uint64_t *u64 =
-        (volatile uint64_t *)(mmio_base_ + DFL_HE_CACHE_WR_ADDR_TABLE_DATA);
+        (volatile uint64_t *)(mmio_base_ + DFL_CXL_CACHE_WR_ADDR_TABLE_DATA);
 
-    res = ioctl(fd_, DFL_HE_CACHE_NUMA_DMA_MAP, &dma_map);
+    res = ioctl(fd_, DFL_CXL_CACHE_NUMA_DMA_MAP, &dma_map);
     if (res) {
-      cerr << "ioctl DFL_HE_CACHE_NUMA_DMA_MAP failed" << strerror(errno)
+      cerr << "ioctl DFL_CXL_CACHE_NUMA_DMA_MAP failed" << strerror(errno)
            << endl;
       goto out_free;
     }
-    printf("\nDFL_HE_CACHE_WR_ADDR_TABLE_DATA: 0x%016" PRIx64 "\n", *u64);
 
+    logger_->debug("DFL_CXL_CACHE_WR_ADDR_TABLE_DATA     : 0x:{0:x}", *u64);
     wr_buffer_ = (uint8_t *)ptr;
-
+    wr_buf_len_ = len;
     return true;
 
   out_free:
@@ -678,28 +673,29 @@ class afu {
   }
 
   bool free_cache_write() {
-    struct dfl_he_cache_dma_unmap dma_unmap;
-    int res;
 
-    // cout << "free_cache_write" << endl;
-    memset(&dma_unmap, 0, sizeof(dma_unmap));
+    int res = 0;
+    struct dfl_cxl_cache_dma_unmap dma_unmap;
 
+    memset(&dma_unmap, 0, sizeof(dma_unmap));
     dma_unmap.argsz = sizeof(dma_unmap);
     dma_unmap.user_addr = (__u64)wr_buffer_;
     dma_unmap.length = wr_buf_len_;
-    dma_unmap.csr_array[0] = DFL_HE_CACHE_WR_ADDR_TABLE_DATA; // 0x68;
+    dma_unmap.csr_array[0] = DFL_CXL_CACHE_WR_ADDR_TABLE_DATA; // 0x68;
+
+    logger_->debug("free write user addr 0x:{0:x} length : {1:d} ",
+        dma_unmap.user_addr, dma_unmap.length);
 
     volatile uint64_t *u64 =
-        (volatile uint64_t *)(mmio_base_ + DFL_HE_CACHE_WR_ADDR_TABLE_DATA);
-    res = ioctl(fd_, DFL_HE_CACHE_NUMA_DMA_UNMAP, &dma_unmap);
+        (volatile uint64_t *)(mmio_base_ + DFL_CXL_CACHE_WR_ADDR_TABLE_DATA);
+    res = ioctl(fd_, DFL_CXL_CACHE_NUMA_DMA_UNMAP, &dma_unmap);
     if (res) {
-      cerr << "ioctl DFL_HE_CACHE_NUMA_DMA_UNMAP failed" << strerror(errno)
+      cerr << "ioctl DFL_CXL_CACHE_NUMA_DMA_UNMAP failed" << strerror(errno)
            << endl;
     }
 
-    printf("\nDFL_HE_CACHE_WR_ADDR_TABLE_DATA: 0x%016" PRIx64 "\n", *u64);
+    logger_->debug("DFL_CXL_CACHE_WR_ADDR_TABLE_DATA     : 0x:{0:x}", *u64);
     free_memory(wr_buffer_, wr_buf_len_);
-
     return true;
   }
 
@@ -707,38 +703,41 @@ class afu {
 
     int res = 0;
     void *ptr = NULL;
-    struct dfl_he_cache_dma_map dma_map;
-
-    // cout<< "allocate_cache_read_write";
+    struct dfl_cxl_cache_dma_map dma_map;
 
     memset(&dma_map, 0, sizeof(dma_map));
     ptr = alloc_2mb_hugepage();
     if (!ptr) {
-      cerr << "failed to allocate huge pages\n" << endl;
+      cerr << "Fails to allocate 2MB huge pages" << endl;
       return false;
     }
+    cout << "Read/Write buffer numa node: " << numa_node << endl;
 
     dma_map.argsz = sizeof(dma_map);
     dma_map.user_addr = (__u64)ptr;
     dma_map.length = len;
     dma_map.numa_node = numa_node;
-    dma_map.csr_array[0] = DFL_HE_CACHE_RD_ADDR_TABLE_DATA; // 0x88;
-    dma_map.csr_array[1] = DFL_HE_CACHE_WR_ADDR_TABLE_DATA; // 0x68;
+    dma_map.csr_array[0] = DFL_CXL_CACHE_RD_ADDR_TABLE_DATA; // 0x88;
+    dma_map.csr_array[1] = DFL_CXL_CACHE_WR_ADDR_TABLE_DATA; // 0x68;
+
+    logger_->debug("Allocate read/write buffer user addr 0x:{0:x}\
+        length : {1:d} numa node : {2:d}",
+        dma_map.user_addr, dma_map.length, dma_map.numa_node);
 
     volatile uint64_t *u64_wr =
-        (volatile uint64_t *)(mmio_base_ + DFL_HE_CACHE_WR_ADDR_TABLE_DATA);
+        (volatile uint64_t *)(mmio_base_ + DFL_CXL_CACHE_WR_ADDR_TABLE_DATA);
     volatile uint64_t *u64_rd =
-        (volatile uint64_t *)(mmio_base_ + DFL_HE_CACHE_RD_ADDR_TABLE_DATA);
+        (volatile uint64_t *)(mmio_base_ + DFL_CXL_CACHE_RD_ADDR_TABLE_DATA);
 
-    res = ioctl(fd_, DFL_HE_CACHE_NUMA_DMA_MAP, &dma_map);
+    res = ioctl(fd_, DFL_CXL_CACHE_NUMA_DMA_MAP, &dma_map);
     if (res) {
-      cerr << "ioctl DFL_HE_CACHE_NUMA_DMA_MAP failed" << strerror(errno)
+      cerr << "ioctl DFL_CXL_CACHE_NUMA_DMA_MAP failed" << strerror(errno)
            << endl;
       goto out_free;
     }
 
-    printf("\nDFL_HE_CACHE_WR_ADDR_TABLE_DATA: 0x%016" PRIx64 "\n", *u64_wr);
-    printf("\nDFL_HE_CACHE_RD_ADDR_TABLE_DATAs: 0x%016" PRIx64 "\n", *u64_rd);
+    logger_->debug("nDFL_CXL_CACHE_WR_ADDR_TABLE_DATA     : 0x:{0:x}", *u64_rd);
+    logger_->debug("DFL_CXL_CACHE_WR_ADDR_TABLE_DATA     : 0x:{0:x}", *u64_wr);
 
     rd_wr_buffer_ = (uint8_t *)ptr;
     rd_wr_buf_len_ = len;
@@ -751,32 +750,33 @@ class afu {
   }
 
   bool free_cache_read_write() {
-    struct dfl_he_cache_dma_unmap dma_unmap;
-    int res;
 
-    // cout << "free_cache_read_write\n" << endl;
+    int res = 0 ;
+    struct dfl_cxl_cache_dma_unmap dma_unmap;
 
     memset(&dma_unmap, 0, sizeof(dma_unmap));
-
     dma_unmap.argsz = sizeof(dma_unmap);
     dma_unmap.user_addr = (__u64)rd_wr_buffer_;
     dma_unmap.length = rd_wr_buf_len_;
-    dma_unmap.csr_array[0] = DFL_HE_CACHE_RD_ADDR_TABLE_DATA; // 0x88;
-    dma_unmap.csr_array[1] = DFL_HE_CACHE_WR_ADDR_TABLE_DATA; // 0x68;
+    dma_unmap.csr_array[0] = DFL_CXL_CACHE_RD_ADDR_TABLE_DATA; // 0x88;
+    dma_unmap.csr_array[1] = DFL_CXL_CACHE_WR_ADDR_TABLE_DATA; // 0x68;
+
+    logger_->debug("free read/write user addr 0x:{0:x} length : {1:d} ",
+        dma_unmap.user_addr, dma_unmap.length);
 
     volatile uint64_t *u64_wr =
-        (volatile uint64_t *)(mmio_base_ + DFL_HE_CACHE_WR_ADDR_TABLE_DATA);
+        (volatile uint64_t *)(mmio_base_ + DFL_CXL_CACHE_WR_ADDR_TABLE_DATA);
     volatile uint64_t *u64_rd =
-        (volatile uint64_t *)(mmio_base_ + DFL_HE_CACHE_RD_ADDR_TABLE_DATA);
+        (volatile uint64_t *)(mmio_base_ + DFL_CXL_CACHE_RD_ADDR_TABLE_DATA);
 
-    res = ioctl(fd_, DFL_HE_CACHE_NUMA_DMA_UNMAP, &dma_unmap);
+    res = ioctl(fd_, DFL_CXL_CACHE_NUMA_DMA_UNMAP, &dma_unmap);
     if (res) {
-      cerr << "ioctl DFL_HE_CACHE_NUMA_DMA_UNMAP failed" << strerror(errno)
+      cerr << "ioctl DFL_CXL_CACHE_NUMA_DMA_UNMAP failed" << strerror(errno)
            << endl;
     }
 
-    printf("\nDFL_HE_CACHE_WR_ADDR_TABLE_DATA: 0x%016" PRIx64 "\n", *u64_wr);
-    printf("\nDFL_HE_CACHE_RD_ADDR_TABLE_DATAs: 0x%016" PRIx64 "\n", *u64_rd);
+    logger_->debug("nDFL_CXL_CACHE_WR_ADDR_TABLE_DATA     : 0x:{0:x}", *u64_rd);
+    logger_->debug("DFL_CXL_CACHE_WR_ADDR_TABLE_DATA     : 0x:{0:x}", *u64_wr);
 
     free_memory(rd_wr_buffer_, rd_wr_buf_len_);
     rd_wr_buffer_ = NULL;
@@ -815,7 +815,7 @@ class afu {
   uint8_t *rd_wr_buffer_;
   uint64_t rd_wr_buf_len_;
 
-  struct dfl_he_cache_region_info rinfo_;
+  struct dfl_cxl_cache_region_info rinfo_;
 
   std::string dev_path_;
 

From e44dc942a664ad0567bbb97edccd829e90de0781 Mon Sep 17 00:00:00 2001
From: anandaravuri <ananda.ravuri@intel.com>
Date: Tue, 26 Sep 2023 11:23:02 -0700
Subject: [PATCH 09/11] fix: update cxl cache ioctl and review comments

Signed-off-by: anandaravuri <ananda.ravuri@intel.com>
---
 libraries/plugins/xfpga/fpga-dfl.h            |  71 +++---
 samples/cxl_host_exerciser/CMakeLists.txt     |   1 +
 samples/cxl_host_exerciser/cxl_he_cache_cmd.h |  13 --
 samples/cxl_host_exerciser/cxl_he_cmd.h       |  11 +-
 .../cxl_host_exerciser/cxl_host_exerciser.cpp |   5 +-
 samples/cxl_host_exerciser/he_cache_test.h    | 210 ++++++++++--------
 6 files changed, 158 insertions(+), 153 deletions(-)

diff --git a/libraries/plugins/xfpga/fpga-dfl.h b/libraries/plugins/xfpga/fpga-dfl.h
index cec3023c1496..c218f7e27df6 100644
--- a/libraries/plugins/xfpga/fpga-dfl.h
+++ b/libraries/plugins/xfpga/fpga-dfl.h
@@ -33,9 +33,8 @@
 #define DFL_FPGA_BASE 0
 #define DFL_PORT_BASE 0x40
 #define DFL_FME_BASE 0x80
-#define DFL_PCI_SVA_BASE 0xf8
 #define DFL_CXL_CACHE_BASE 0xA0
-
+#define DFL_PCI_SVA_BASE 0xf8
 
 /* Common IOCTLs for both FME and AFU file descriptor */
 
@@ -308,26 +307,26 @@ struct dfl_fpga_fme_port_pr {
 					    DFL_PCI_SVA_BASE + 1)
 
  /**
-  * DFL_CXL_CACHE_GET_REGION_INFO - _IOWR(DFL_FPGA_MAGIC, DFL_CXL_CACHE_BASE + 0,
-  *                                      struct dfl_cxl_cache_region_info)
-  *
-  * Retrieve information about a device memory region.
-  * Caller provides struct dfl_cxl_cache_region_info with flags.
-  * Driver returns the region info in other fields.
-  * Return: 0 on success, -errno on failure.
-  */
+   * DFL_CXL_CACHE_GET_REGION_INFO - _IOWR(DFL_FPGA_MAGIC, DFL_CXL_CACHE_BASE + 0,
+   *                                      struct dfl_cxl_cache_region_info)
+   *
+   * Retrieve information about a device memory region.
+   * Caller provides struct dfl_cxl_cache_region_info with flags.
+   * Driver returns the region info in other fields.
+   * Return: 0 on success, -errno on failure.
+   */
 
 #define DFL_CXL_CACHE_GET_REGION_INFO _IO(DFL_FPGA_MAGIC, DFL_CXL_CACHE_BASE + 0)
 
-  /**
-   * struct dfl_cxl_cache_region_info - CXL cache region information
-   * @argsz: structure length
-   * @flags: access permission
-   * @size: region size (bytes)
-   * @offset: region offset from start of device fd
-   *
-   * to retrieve  information about a device memory region
-   */
+   /**
+	* struct dfl_cxl_cache_region_info - CXL cache region information
+	* @argsz: structure length
+	* @flags: access permission
+	* @size: region size (bytes)
+	* @offset: region offset from start of device fd
+	*
+	* to retrieve  information about a device memory region
+	*/
 struct dfl_cxl_cache_region_info {
 	__u32 argsz;
 	__u32 flags;
@@ -339,24 +338,24 @@ struct dfl_cxl_cache_region_info {
 };
 
 /**
- * DFL_CXL_CACHE_NUMA_DMA_MAP - _IOWR(DFL_FPGA_MAGIC, DFL_CXL_CACHE_BASE + 1,
- *                                      struct dfl_cxl_cache_dma_map)
+ * DFL_CXL_CACHE_NUMA_BUFFER_MAP - _IOWR(DFL_FPGA_MAGIC, DFL_CXL_CACHE_BASE + 1,
+ *                                      struct dfl_cxl_cache_buffer_map)
  *
- * Map the dma memory per user_addr, length and numa node which are provided by caller
- * The driver allocates memory on the numa node, converts the user's virtual address
- * to a continuous physical address, and writes the physical address to
- * the cxl cache read/write address table CSR.
-
- * This interface only accepts page-size aligned user memory for dma mapping.
+ * Map the user memory per user_addr, length and numa node which are
+ * provided by caller. The driver allocates memory on the numa node,
+ * converts the user's virtual address to a continuous physical address,
+ * and writes the physical address to the cxl cache read/write address table CSR.
+ *
+ * This interface only accepts page-size aligned user memory for mapping.
  * Return: 0 on success, -errno on failure.
  */
 
 #define DFL_ARRAY_MAX_SIZE   0x10
 
-#define DFL_CXL_CACHE_NUMA_DMA_MAP    _IO(DFL_FPGA_MAGIC,  DFL_CXL_CACHE_BASE + 1)
+#define DFL_CXL_CACHE_NUMA_BUFFER_MAP    _IO(DFL_FPGA_MAGIC,  DFL_CXL_CACHE_BASE + 1)
 
  /**
-  * struct dfl_cxl_cache_dma_map - maps user address to physical address.
+  * struct dfl_cxl_cache_buffer_map - maps user address to physical address.
   * @argsz: structure length
   * @flags: flags
   * @user_addr: user mmap virtual address
@@ -366,7 +365,7 @@ struct dfl_cxl_cache_region_info {
   *
   * maps user allocated virtual address to physical address.
   */
-struct dfl_cxl_cache_dma_map {
+struct dfl_cxl_cache_buffer_map {
 	__u32 argsz;
 	__u32 flags;
 	__u64 user_addr;
@@ -376,19 +375,19 @@ struct dfl_cxl_cache_dma_map {
 };
 
 /**
- * DFL_CXL_CACHE_NUMA_DMA_UNMAP - _IOWR(DFL_FPGA_MAGIC, DFL_CXL_CACHE_BASE + 1,
- *                                      struct dfl_cxl_cache_dma_unmap)
+ * DFL_CXL_CACHE_NUMA_BUFFER_UNMAP - _IOWR(DFL_FPGA_MAGIC, DFL_CXL_CACHE_BASE + 2,
+ *                                      struct dfl_cxl_cache_buffer_unmap)
  *
- * Unmaps the dma memory per user_addr and length which are provided by caller
+ * Unmaps the user memory per user_addr and length which are provided by caller
  * The driver deletes the physical pages of the user address and writes a zero
  * to the read/write address table CSR.
  * Return: 0 on success, -errno on failure.
  */
 
-#define DFL_CXL_CACHE_NUMA_DMA_UNMAP  _IO(DFL_FPGA_MAGIC,  DFL_CXL_CACHE_BASE + 2)
+#define DFL_CXL_CACHE_NUMA_BUFFER_UNMAP  _IO(DFL_FPGA_MAGIC,  DFL_CXL_CACHE_BASE + 2)
 
  /**
-  * struct dfl_cxl_cache_dma_unmap - unmaps user allocated memory.
+  * struct dfl_cxl_cache_buffer_unmap - unmaps user allocated memory.
   * @argsz: structure length
   * @flags: flags
   * @user_addr: user mmap virtual address
@@ -397,7 +396,7 @@ struct dfl_cxl_cache_dma_map {
   *
   * unmaps user allocated memory.
   */
-struct dfl_cxl_cache_dma_unmap {
+struct dfl_cxl_cache_buffer_unmap {
 	__u32 argsz;
 	__u32 flags;
 	__u64 user_addr;
diff --git a/samples/cxl_host_exerciser/CMakeLists.txt b/samples/cxl_host_exerciser/CMakeLists.txt
index 7298b63b62d9..2bdf25fc0bcd 100644
--- a/samples/cxl_host_exerciser/CMakeLists.txt
+++ b/samples/cxl_host_exerciser/CMakeLists.txt
@@ -50,6 +50,7 @@ if (OPAE_WITH_CLI11 AND OPAE_WITH_SPDLOG AND OPAE_WITH_NUMA)
         PRIVATE
            ${OPAE_INCLUDE_PATHS}
            ${CMAKE_CURRENT_SOURCE_DIR}
+           ${OPAE_LIB_SOURCE}/plugins/xfpga/
            ${CLI11_INCLUDE_DIRS}
            ${numa_INCLUDE_DIRS}
            ${spdlog_INCLUDE_DIRS})
diff --git a/samples/cxl_host_exerciser/cxl_he_cache_cmd.h b/samples/cxl_host_exerciser/cxl_he_cache_cmd.h
index dd0631b04325..5272d5333067 100644
--- a/samples/cxl_host_exerciser/cxl_he_cache_cmd.h
+++ b/samples/cxl_host_exerciser/cxl_he_cache_cmd.h
@@ -163,7 +163,6 @@ class he_cache_cmd : public he_cmd {
 
     // wait for completion
     if (!he_wait_test_completion()) {
-      cerr << "timeout error" << endl;
       he_perf_counters();
       host_exerciser_errors();
       host_exe_->free_cache_read();
@@ -196,7 +195,6 @@ class he_cache_cmd : public he_cmd {
 
     // wait for completion
     if (!he_wait_test_completion()) {
-      cerr << "timeout error" << endl;
       he_perf_counters();
       host_exerciser_errors();
       host_exe_->free_cache_read();
@@ -273,7 +271,6 @@ class he_cache_cmd : public he_cmd {
 
     // wait for completion
     if (!he_wait_test_completion()) {
-      cerr << "timeout error" << endl;
       he_perf_counters();
       host_exerciser_errors();
       host_exe_->free_cache_read_write();
@@ -307,7 +304,6 @@ class he_cache_cmd : public he_cmd {
 
     // wait for completion
     if (!he_wait_test_completion()) {
-      cerr << "timeout error" << endl;
       he_perf_counters();
       host_exerciser_errors();
       host_exe_->free_cache_read_write();
@@ -380,7 +376,6 @@ class he_cache_cmd : public he_cmd {
 
     // wait for completion
     if (!he_wait_test_completion()) {
-      cerr << "timeout error" << endl;
       he_perf_counters();
       host_exerciser_errors();
       host_exe_->free_cache_read();
@@ -453,7 +448,6 @@ class he_cache_cmd : public he_cmd {
 
     // wait for completion
     if (!he_wait_test_completion()) {
-      cerr << "timeout error" << endl;
       he_perf_counters();
       host_exerciser_errors();
       host_exe_->free_cache_read_write();
@@ -530,10 +524,8 @@ class he_cache_cmd : public he_cmd {
 
     // wait for completion
     if (!he_wait_test_completion()) {
-      cerr << "timeout error" << endl;
       he_perf_counters();
       host_exerciser_errors();
-
       g_stop_thread = true;
       t1.join();
       sleep(1);
@@ -616,8 +608,6 @@ class he_cache_cmd : public he_cmd {
 
     // wait for completion
     if (!he_wait_test_completion()) {
-      cerr << "timeout error" << endl;
-
       he_perf_counters();
       host_exerciser_errors();
       g_stop_thread = true;
@@ -695,7 +685,6 @@ class he_cache_cmd : public he_cmd {
 
     // wait for completion
     if (!he_wait_test_completion()) {
-      cerr << "timeout error" << endl;
       he_perf_counters();
       host_exerciser_errors();
       host_exe_->free_cache_read();
@@ -769,7 +758,6 @@ class he_cache_cmd : public he_cmd {
 
     // wait for completion
     if (!he_wait_test_completion()) {
-      cerr << "timeout error" << endl;
       he_perf_counters();
       host_exerciser_errors();
       host_exe_->free_cache_write();
@@ -896,7 +884,6 @@ class he_cache_cmd : public he_cmd {
   uint32_t he_contmodetime_;
   uint32_t he_linerep_count_;
   uint32_t he_stide_;
-  uint32_t he_target_;
   uint32_t he_test_;
   bool he_test_all_;
 };
diff --git a/samples/cxl_host_exerciser/cxl_he_cmd.h b/samples/cxl_host_exerciser/cxl_he_cmd.h
index 9596feec7930..a5efe4b9f641 100644
--- a/samples/cxl_host_exerciser/cxl_he_cmd.h
+++ b/samples/cxl_host_exerciser/cxl_he_cmd.h
@@ -140,12 +140,12 @@ class he_cmd : public test_command {
     /* Wait for test completion */
     uint32_t timeout = HELPBK_TEST_TIMEOUT;
 
+    cout << "Test started ......" << endl;
     volatile uint8_t *status_ptr = host_exe_->get_dsm();
     while (0 == ((*status_ptr) & 0x1)) {
       usleep(HELPBK_TEST_SLEEP_INVL);
       if (--timeout == 0) {
-        cout << "HE LPBK TIME OUT" << endl;
-
+        cout << "HE Cache time out error" << endl;
         return false;
       }
     }
@@ -162,11 +162,8 @@ class he_cmd : public test_command {
     int n = numa_max_node();
     cout << "There are %d nodes on your system:" << n + 1 << endl;
 
-    int cpu_num = sched_getcpu();
-    cout << "cpu num:" << cpu_num << endl;
-
-    int numa_node = numa_node_of_cpu(cpu_num);
-    cout << "numa node:" << numa_node << endl;
+    int numa_node = numa_node_of_cpu(sched_getcpu());
+    cout << "HE Cache app numa node:" << numa_node << endl;
 
     if (he_target_ == HE_TARGET_HOST) {
       numa_node_ = numa_node;
diff --git a/samples/cxl_host_exerciser/cxl_host_exerciser.cpp b/samples/cxl_host_exerciser/cxl_host_exerciser.cpp
index 0f31d9155dce..8fe4eecfad02 100644
--- a/samples/cxl_host_exerciser/cxl_host_exerciser.cpp
+++ b/samples/cxl_host_exerciser/cxl_host_exerciser.cpp
@@ -40,12 +40,11 @@ int main(int argc, char *argv[]) {
   app.register_command<host_exerciser::he_cache_lpbk_cmd>();
 
   // host exerciser signal handler
-  struct sigaction act_old, act_new;
-  memset(&act_old, 0, sizeof(act_old));
+  struct sigaction  act_new;
   memset(&act_new, 0, sizeof(act_new));
 
   act_new.sa_handler = he_sig_handler;
-  sigaction(SIGINT, &act_new, &act_old);
+  sigaction(SIGINT, &act_new, NULL);
 
   return app.main(argc, argv);
 }
diff --git a/samples/cxl_host_exerciser/he_cache_test.h b/samples/cxl_host_exerciser/he_cache_test.h
index 9c60e5ad5d62..845e895c5356 100644
--- a/samples/cxl_host_exerciser/he_cache_test.h
+++ b/samples/cxl_host_exerciser/he_cache_test.h
@@ -45,7 +45,7 @@
 #include <spdlog/sinks/basic_file_sink.h>
 #include <opae/cxx/core.h>
 
-#include "../../libraries/plugins/xfpga/fpga-dfl.h"
+#include "fpga-dfl.h"
 
 using namespace std;
 
@@ -91,41 +91,65 @@ enum { MATCHES_SIZE = 6 };
 #define DFL_CXL_CACHE_WR_ADDR_TABLE_DATA 0x068
 #define DFL_CXL_CACHE_RD_ADDR_TABLE_DATA 0x088
 
-void *alloc_2mb_hugepage(void) {
-  void *addr;
 
-  addr = mmap(ADDR, MiB(2), PROTECTION, FLAGS_2M, 0, 0);
-  if (addr == MAP_FAILED) {
-    cerr << "alloc_2mb_hugepage() failed:" << strerror(errno) << endl;
-    addr = NULL;
-  }
-
-  return addr;
-}
-void free_memory(void *addr, uint64_t len) { munmap(addr, len); }
+// Map `len` bytes (FLAGS_1G / FLAGS_2M / FLAGS_4K, chosen by size) and bind them
+// to `numa_node` via mbind; stores the mapping in *addr and returns true on success.
+bool buffer_allocate(void** addr, uint64_t len, uint32_t numa_node) {
+    void* addr_local = NULL;
+    unsigned long mask[4] = {0};
+    unsigned int bits_per_UL = sizeof(unsigned long) * 8;
+
+    if (numa_node >= 4 * bits_per_UL) { cerr << "Invalid numa node" << endl; return false; }
+    mask[numa_node / bits_per_UL] |= 1UL << (numa_node % bits_per_UL);
+
+    if (len > MiB(2))
+        addr_local = mmap(ADDR, len, PROTECTION, FLAGS_1G, 0, 0);
+    else if (len > KiB(4))
+        addr_local = mmap(ADDR, len, PROTECTION, FLAGS_2M, 0, 0);
+    else
+        addr_local = mmap(ADDR, len, PROTECTION, FLAGS_4K, 0, 0);
+
+    if (addr_local == MAP_FAILED) {
+        if (errno == ENOMEM) {
+            if (len > MiB(2))
+                cerr <<"Could not allocate buffer (no free 1 "
+                    "GiB huge pages)" << endl;
+            else if (len > KiB(4))
+                cerr << "Could not allocate buffer (no free 2 "
+                    "MiB huge pages)" << endl;
+            else
+                cerr <<"Could not allocate buffer (out of "
+                    "memory)" << endl;
+            return false;
+        }
+        cerr << "CXL cache mmap failed:"<< strerror(errno) << endl;
+        return false;
+    }
 
-void *alloc_32kb_hugepage(void) {
-  void *addr;
+    if (addr_local == NULL) {
+        cerr << "Unable to mmap" << endl;
+        return false;
+    }
 
-  addr = mmap(ADDR, KiB(32), PROTECTION, FLAGS_4K, 0, 0);
-  if (addr == MAP_FAILED) {
-    cerr << "alloc_32kb_hugepage() failed:" << strerror(errno) << endl;;
-    addr = NULL;
-  }
+    // mbind mode 2 is MPOL_BIND and flag 1 is MPOL_MF_STRICT (linux/mempolicy.h).
+    long status = syscall(__NR_mbind, addr_local, len, 2, &mask, numa_node + 2, 1);
+    if (status != 0) {
+        cerr << "buffer_allocate(): unable to mbind:" << strerror(errno) << endl;
+        munmap(addr_local, len); // do not leak the mapping when binding fails
+        return false;
+    }
 
-  return addr;
+    *addr = addr_local;
+    return true;
 }
 
-void *alloc_4kb_hugepage(void) {
-  void *addr;
-
-  addr = mmap(ADDR, KiB(4), PROTECTION, FLAGS_4K, 0, 0);
-  if (addr == MAP_FAILED) {
-    cerr << "alloc_4kb_hugepage() failed:" << strerror(errno) << endl;;
-    addr = NULL;
-  }
-
-  return addr;
+// Unmap a buffer mapped by buffer_allocate(); logs and returns false if munmap fails.
+bool buffer_release(void* addr, uint64_t len) {
+    if (munmap(addr, len)) {
+        cerr << "CXL cache unmap failed:" << strerror(errno) << endl;
+        return false;
+    }
+    return true;
 }
 
 bool sysfs_read_u64(const char *path, uint64_t *value) {
@@ -489,13 +513,13 @@ class afu {
 
     int res = 0;
     void *ptr = NULL;
-    struct dfl_cxl_cache_dma_map dma_map;
+    struct dfl_cxl_cache_buffer_map dma_map;
 
     memset(&dma_map, 0, sizeof(dma_map));
-    ptr = alloc_4kb_hugepage();
-    if (!ptr) {
-      cerr << "Fails to allocate 4k huge page:" << strerror(errno) << endl;
-      return false;
+
+    if (!buffer_allocate(&ptr, len, numa_node)) {
+        cerr << "Fails to allocate 4k huge page:" << strerror(errno) << endl;
+        return false;
     }
 
     cout << "DSM buffer numa node: " << numa_node << endl;
@@ -503,17 +527,18 @@ class afu {
     dma_map.user_addr = (__u64)ptr;
     dma_map.length = len;
     dma_map.numa_node = numa_node;
-    dma_map.csr_array[0] = DFL_CXL_CACHE_DSM_BASE; // 0x030
+    dma_map.csr_array[0] = DFL_CXL_CACHE_DSM_BASE;
 
-    logger_->debug("Allocate DSM buffer user addr 0x:{0:x} length : {1:d} numa node : {2:d}",
+    logger_->debug("Allocate DSM buffer user addr 0x:{0:x} length :"
+        "{1:d} numa node : {2:d}",
         dma_map.user_addr, dma_map.length, dma_map.numa_node);
 
     volatile uint64_t *u64 =
         (volatile uint64_t *)(mmio_base_ + DFL_CXL_CACHE_DSM_BASE);
 
-    res = ioctl(fd_, DFL_CXL_CACHE_NUMA_DMA_MAP, &dma_map);
+    res = ioctl(fd_, DFL_CXL_CACHE_NUMA_BUFFER_MAP, &dma_map);
     if (res) {
-      cerr << "ioctl DFL_CXL_CACHE_NUMA_NODE_DSM_INFO failed" << strerror(errno)
+      cerr << "ioctl DFL_CXL_CACHE_NUMA_BUFFER_MAP failed" << strerror(errno)
            << endl;
       goto out_free;
     }
@@ -524,20 +549,20 @@ class afu {
     return true;
 
   out_free:
-    free_memory(ptr, len);
+    buffer_release(ptr, len);
     return false;
   }
 
   bool free_dsm() {
 
     int res = 0;
-    struct dfl_cxl_cache_dma_unmap dma_unmap;
+    struct dfl_cxl_cache_buffer_unmap dma_unmap;
 
     memset(&dma_unmap, 0, sizeof(dma_unmap));
     dma_unmap.argsz = sizeof(dma_unmap);
     dma_unmap.user_addr = (__u64)dsm_buffer_;
     dma_unmap.length = dsm_buf_len_;
-    dma_unmap.csr_array[0] = DFL_CXL_CACHE_DSM_BASE; // 0x030
+    dma_unmap.csr_array[0] = DFL_CXL_CACHE_DSM_BASE;
 
     logger_->debug("free dsm user addr 0x:{0:x} length : {1:d} ",
         dma_unmap.user_addr, dma_unmap.length);
@@ -545,14 +570,14 @@ class afu {
     volatile uint64_t *u64 =
         (volatile uint64_t *)(mmio_base_ + DFL_CXL_CACHE_DSM_BASE);
 
-    res = ioctl(fd_, DFL_CXL_CACHE_NUMA_DMA_UNMAP, &dma_unmap);
+    res = ioctl(fd_, DFL_CXL_CACHE_NUMA_BUFFER_UNMAP, &dma_unmap);
     if (res) {
-      cerr << "ioctl DFL_CXL_CACHE_NUMA_DMA_UNMAP failed" << strerror(errno)
-           << endl;
+      cerr << "ioctl DFL_CXL_CACHE_NUMA_BUFFER_UNMAP failed"
+          << strerror(errno) << endl;
     }
 
     logger_->debug("DSM_BASE     : 0x:{0:x}", *u64);
-    free_memory(dsm_buffer_, dsm_buf_len_);
+    buffer_release(dsm_buffer_, dsm_buf_len_);
     return true;
   }
 
@@ -560,32 +585,32 @@ class afu {
 
     int res = 0;
     void *ptr = NULL;
-    struct dfl_cxl_cache_dma_map dma_map;
+    struct dfl_cxl_cache_buffer_map dma_map;
 
     memset(&dma_map, 0, sizeof(dma_map));
-    ptr = alloc_2mb_hugepage();
-    if (!ptr) {
-      cerr << "Fails to allocate 2MB huge pages" << endl;
-      return false;
-    }
 
+    if (!buffer_allocate(&ptr, len, numa_node)) {
+        cerr << "Fails to allocate 2MB huge page:" << strerror(errno) << endl;
+        return false;
+    }
     cout << "Read buffer numa node: " << numa_node << endl;
 
     dma_map.argsz = sizeof(dma_map);
     dma_map.user_addr = (__u64)ptr;
     dma_map.length = len;
     dma_map.numa_node = numa_node;
-    dma_map.csr_array[0] = DFL_CXL_CACHE_RD_ADDR_TABLE_DATA; // 0x88
+    dma_map.csr_array[0] = DFL_CXL_CACHE_RD_ADDR_TABLE_DATA;
 
-    logger_->debug("Allocate read buffer user addr 0x:{0:x} length : {1:d} numa node : {2:d}",
+    logger_->debug("Allocate read buffer user addr 0x:{0:x} length :"
+        "{1:d} numa node : {2:d}",
         dma_map.user_addr, dma_map.length, dma_map.numa_node);
 
     volatile uint64_t *u64 =
         (volatile uint64_t *)(mmio_base_ + DFL_CXL_CACHE_RD_ADDR_TABLE_DATA);
-
-    res = ioctl(fd_, DFL_CXL_CACHE_NUMA_DMA_MAP, &dma_map);
+    sleep(1);
+    res = ioctl(fd_, DFL_CXL_CACHE_NUMA_BUFFER_MAP, &dma_map);
     if (res) {
-      cerr << "ioctl DFL_CXL_CACHE_NUMA_DMA_MAP failed" << strerror(errno)
+      cerr << "ioctl DFL_CXL_CACHE_NUMA_BUFFER_MAP failed" << strerror(errno)
            << endl;
       goto out_free;
     }
@@ -596,35 +621,34 @@ class afu {
     return true;
 
   out_free:
-    free_memory(ptr, len);
+    buffer_release(ptr, len);
     return false;
   }
 
   bool free_cache_read() {
 
     int res = 0;
-    struct dfl_cxl_cache_dma_unmap dma_unmap;
+    struct dfl_cxl_cache_buffer_unmap dma_unmap;
 
     memset(&dma_unmap, 0, sizeof(dma_unmap));
     dma_unmap.argsz = sizeof(dma_unmap);
     dma_unmap.user_addr = (__u64)rd_buffer_;
     dma_unmap.length = rd_buf_len_;
-    dma_unmap.csr_array[0] = DFL_CXL_CACHE_RD_ADDR_TABLE_DATA; // 0x88
+    dma_unmap.csr_array[0] = DFL_CXL_CACHE_RD_ADDR_TABLE_DATA;
 
     logger_->debug("free read user addr 0x:{0:x} length : {1:d} ",
         dma_unmap.user_addr, dma_unmap.length);
 
     volatile uint64_t *u64 =
         (volatile uint64_t *)(mmio_base_ + DFL_CXL_CACHE_RD_ADDR_TABLE_DATA);
-    res = ioctl(fd_, DFL_CXL_CACHE_NUMA_DMA_UNMAP, &dma_unmap);
+    res = ioctl(fd_, DFL_CXL_CACHE_NUMA_BUFFER_UNMAP, &dma_unmap);
     if (res) {
-      cerr << "ioctl DFL_CXL_CACHE_NUMA_DMA_UNMAP failed" << strerror(errno)
+      cerr << "ioctl DFL_CXL_CACHE_NUMA_BUFFER_UNMAP failed" << strerror(errno)
            << endl;
     }
 
     logger_->debug("DFL_CXL_CACHE_RD_ADDR_TABLE_DATA     : 0x:{0:x}", *u64);
-    free_memory(rd_buffer_, rd_buf_len_);
-
+    buffer_release(rd_buffer_, rd_buf_len_);
     return true;
   }
 
@@ -632,13 +656,12 @@ class afu {
 
     int res  = 0;
     void *ptr = NULL;
-    struct dfl_cxl_cache_dma_map dma_map;
+    struct dfl_cxl_cache_buffer_map dma_map;
 
     memset(&dma_map, 0, sizeof(dma_map));
-    ptr = alloc_2mb_hugepage();
-    if (!ptr) {
-      cerr << "Fails to allocate 2MB huge pages" << endl;
-      return false;
+    if (!buffer_allocate(&ptr, len, numa_node)) {
+        cerr << "Fails to allocate 2MB huge page:" << strerror(errno) << endl;
+        return false;
     }
 
     cout << "Write buffer numa node: " << numa_node << endl;
@@ -646,7 +669,7 @@ class afu {
     dma_map.user_addr = (__u64)ptr;
     dma_map.length = len;
     dma_map.numa_node = numa_node;
-    dma_map.csr_array[0] = DFL_CXL_CACHE_WR_ADDR_TABLE_DATA; // 0x68;
+    dma_map.csr_array[0] = DFL_CXL_CACHE_WR_ADDR_TABLE_DATA;
 
     logger_->debug("Allocate write buffer user addr 0x:{0:x}\
         length : {1:d} numa node : {2:d}",
@@ -655,9 +678,9 @@ class afu {
     volatile uint64_t *u64 =
         (volatile uint64_t *)(mmio_base_ + DFL_CXL_CACHE_WR_ADDR_TABLE_DATA);
 
-    res = ioctl(fd_, DFL_CXL_CACHE_NUMA_DMA_MAP, &dma_map);
+    res = ioctl(fd_, DFL_CXL_CACHE_NUMA_BUFFER_MAP, &dma_map);
     if (res) {
-      cerr << "ioctl DFL_CXL_CACHE_NUMA_DMA_MAP failed" << strerror(errno)
+      cerr << "ioctl DFL_CXL_CACHE_NUMA_BUFFER_MAP failed" << strerror(errno)
            << endl;
       goto out_free;
     }
@@ -668,34 +691,34 @@ class afu {
     return true;
 
   out_free:
-    free_memory(ptr, len);
+    buffer_release(ptr, len);
     return false;
   }
 
   bool free_cache_write() {
 
     int res = 0;
-    struct dfl_cxl_cache_dma_unmap dma_unmap;
+    struct dfl_cxl_cache_buffer_unmap dma_unmap;
 
     memset(&dma_unmap, 0, sizeof(dma_unmap));
     dma_unmap.argsz = sizeof(dma_unmap);
     dma_unmap.user_addr = (__u64)wr_buffer_;
     dma_unmap.length = wr_buf_len_;
-    dma_unmap.csr_array[0] = DFL_CXL_CACHE_WR_ADDR_TABLE_DATA; // 0x68;
+    dma_unmap.csr_array[0] = DFL_CXL_CACHE_WR_ADDR_TABLE_DATA;
 
     logger_->debug("free write user addr 0x:{0:x} length : {1:d} ",
         dma_unmap.user_addr, dma_unmap.length);
 
     volatile uint64_t *u64 =
         (volatile uint64_t *)(mmio_base_ + DFL_CXL_CACHE_WR_ADDR_TABLE_DATA);
-    res = ioctl(fd_, DFL_CXL_CACHE_NUMA_DMA_UNMAP, &dma_unmap);
+    res = ioctl(fd_, DFL_CXL_CACHE_NUMA_BUFFER_UNMAP, &dma_unmap);
     if (res) {
-      cerr << "ioctl DFL_CXL_CACHE_NUMA_DMA_UNMAP failed" << strerror(errno)
+      cerr << "ioctl DFL_CXL_CACHE_NUMA_BUFFER_UNMAP failed" << strerror(errno)
            << endl;
     }
 
     logger_->debug("DFL_CXL_CACHE_WR_ADDR_TABLE_DATA     : 0x:{0:x}", *u64);
-    free_memory(wr_buffer_, wr_buf_len_);
+    buffer_release(wr_buffer_, wr_buf_len_);
     return true;
   }
 
@@ -703,13 +726,12 @@ class afu {
 
     int res = 0;
     void *ptr = NULL;
-    struct dfl_cxl_cache_dma_map dma_map;
+    struct dfl_cxl_cache_buffer_map dma_map;
 
     memset(&dma_map, 0, sizeof(dma_map));
-    ptr = alloc_2mb_hugepage();
-    if (!ptr) {
-      cerr << "Fails to allocate 2MB huge pages" << endl;
-      return false;
+    if (!buffer_allocate(&ptr, len, numa_node)) {
+        cerr << "Fails to allocate 2MB huge page:" << strerror(errno) << endl;
+        return false;
     }
     cout << "Read/Write buffer numa node: " << numa_node << endl;
 
@@ -717,8 +739,8 @@ class afu {
     dma_map.user_addr = (__u64)ptr;
     dma_map.length = len;
     dma_map.numa_node = numa_node;
-    dma_map.csr_array[0] = DFL_CXL_CACHE_RD_ADDR_TABLE_DATA; // 0x88;
-    dma_map.csr_array[1] = DFL_CXL_CACHE_WR_ADDR_TABLE_DATA; // 0x68;
+    dma_map.csr_array[0] = DFL_CXL_CACHE_RD_ADDR_TABLE_DATA;
+    dma_map.csr_array[1] = DFL_CXL_CACHE_WR_ADDR_TABLE_DATA;
 
     logger_->debug("Allocate read/write buffer user addr 0x:{0:x}\
         length : {1:d} numa node : {2:d}",
@@ -729,9 +751,9 @@ class afu {
     volatile uint64_t *u64_rd =
         (volatile uint64_t *)(mmio_base_ + DFL_CXL_CACHE_RD_ADDR_TABLE_DATA);
 
-    res = ioctl(fd_, DFL_CXL_CACHE_NUMA_DMA_MAP, &dma_map);
+    res = ioctl(fd_, DFL_CXL_CACHE_NUMA_BUFFER_MAP, &dma_map);
     if (res) {
-      cerr << "ioctl DFL_CXL_CACHE_NUMA_DMA_MAP failed" << strerror(errno)
+      cerr << "ioctl DFL_CXL_CACHE_NUMA_BUFFER_MAP failed" << strerror(errno)
            << endl;
       goto out_free;
     }
@@ -745,21 +767,21 @@ class afu {
     return true;
 
   out_free:
-    free_memory(ptr, len);
+    buffer_release(ptr, len);
     return false;
   }
 
   bool free_cache_read_write() {
 
     int res = 0 ;
-    struct dfl_cxl_cache_dma_unmap dma_unmap;
+    struct dfl_cxl_cache_buffer_unmap dma_unmap;
 
     memset(&dma_unmap, 0, sizeof(dma_unmap));
     dma_unmap.argsz = sizeof(dma_unmap);
     dma_unmap.user_addr = (__u64)rd_wr_buffer_;
     dma_unmap.length = rd_wr_buf_len_;
-    dma_unmap.csr_array[0] = DFL_CXL_CACHE_RD_ADDR_TABLE_DATA; // 0x88;
-    dma_unmap.csr_array[1] = DFL_CXL_CACHE_WR_ADDR_TABLE_DATA; // 0x68;
+    dma_unmap.csr_array[0] = DFL_CXL_CACHE_RD_ADDR_TABLE_DATA;
+    dma_unmap.csr_array[1] = DFL_CXL_CACHE_WR_ADDR_TABLE_DATA;
 
     logger_->debug("free read/write user addr 0x:{0:x} length : {1:d} ",
         dma_unmap.user_addr, dma_unmap.length);
@@ -769,16 +791,16 @@ class afu {
     volatile uint64_t *u64_rd =
         (volatile uint64_t *)(mmio_base_ + DFL_CXL_CACHE_RD_ADDR_TABLE_DATA);
 
-    res = ioctl(fd_, DFL_CXL_CACHE_NUMA_DMA_UNMAP, &dma_unmap);
+    res = ioctl(fd_, DFL_CXL_CACHE_NUMA_BUFFER_UNMAP, &dma_unmap);
     if (res) {
-      cerr << "ioctl DFL_CXL_CACHE_NUMA_DMA_UNMAP failed" << strerror(errno)
+      cerr << "ioctl DFL_CXL_CACHE_NUMA_BUFFER_UNMAP failed" << strerror(errno)
            << endl;
     }
 
     logger_->debug("nDFL_CXL_CACHE_WR_ADDR_TABLE_DATA     : 0x:{0:x}", *u64_rd);
     logger_->debug("DFL_CXL_CACHE_WR_ADDR_TABLE_DATA     : 0x:{0:x}", *u64_wr);
 
-    free_memory(rd_wr_buffer_, rd_wr_buf_len_);
+    buffer_release(rd_wr_buffer_, rd_wr_buf_len_);
     rd_wr_buffer_ = NULL;
     return true;
   }

From 3cd6e2cb9a633901e448ec7ad0e7f32bc0d3dd9e Mon Sep 17 00:00:00 2001
From: anandaravuri <ananda.ravuri@intel.com>
Date: Tue, 26 Sep 2023 11:48:45 -0700
Subject: [PATCH 10/11] fix: ci build errors

Signed-off-by: anandaravuri <ananda.ravuri@intel.com>
---
 libraries/plugins/xfpga/fpga-dfl.h         | 78 +++++++++++-----------
 samples/cxl_host_exerciser/he_cache_test.h |  4 ++
 2 files changed, 43 insertions(+), 39 deletions(-)

diff --git a/libraries/plugins/xfpga/fpga-dfl.h b/libraries/plugins/xfpga/fpga-dfl.h
index c218f7e27df6..83ab05447bc8 100644
--- a/libraries/plugins/xfpga/fpga-dfl.h
+++ b/libraries/plugins/xfpga/fpga-dfl.h
@@ -306,27 +306,27 @@ struct dfl_fpga_fme_port_pr {
 #define DFL_PCI_SVA_UNBIND_DEV		_IO(DFL_FPGA_MAGIC,	\
 					    DFL_PCI_SVA_BASE + 1)
 
- /**
-   * DFL_CXL_CACHE_GET_REGION_INFO - _IOWR(DFL_FPGA_MAGIC, DFL_CXL_CACHE_BASE + 0,
-   *                                      struct dfl_cxl_cache_region_info)
-   *
-   * Retrieve information about a device memory region.
-   * Caller provides struct dfl_cxl_cache_region_info with flags.
-   * Driver returns the region info in other fields.
-   * Return: 0 on success, -errno on failure.
-   */
+/**
+ * DFL_CXL_CACHE_GET_REGION_INFO - _IOWR(DFL_FPGA_MAGIC, DFL_CXL_CACHE_BASE + 0,
+ *                                      struct dfl_cxl_cache_region_info)
+ *
+ * Retrieve information about a device memory region.
+ * Caller provides struct dfl_cxl_cache_region_info with flags.
+ * Driver returns the region info in other fields.
+ * Return: 0 on success, -errno on failure.
+ */
 
 #define DFL_CXL_CACHE_GET_REGION_INFO _IO(DFL_FPGA_MAGIC, DFL_CXL_CACHE_BASE + 0)
 
-   /**
-	* struct dfl_cxl_cache_region_info - CXL cache region information
-	* @argsz: structure length
-	* @flags: access permission
-	* @size: region size (bytes)
-	* @offset: region offset from start of device fd
-	*
-	* to retrieve  information about a device memory region
-	*/
+/**
+ * struct dfl_cxl_cache_region_info - CXL cache region information
+ * @argsz: structure length
+ * @flags: access permission
+ * @size: region size (bytes)
+ * @offset: region offset from start of device fd
+ *
+ * Used to retrieve information about a device memory region.
+ */
 struct dfl_cxl_cache_region_info {
 	__u32 argsz;
 	__u32 flags;
@@ -354,17 +354,17 @@ struct dfl_cxl_cache_region_info {
 
 #define DFL_CXL_CACHE_NUMA_BUFFER_MAP    _IO(DFL_FPGA_MAGIC,  DFL_CXL_CACHE_BASE + 1)
 
- /**
-  * struct dfl_cxl_cache_buffer_map - maps user address to physical address.
-  * @argsz: structure length
-  * @flags: flags
-  * @user_addr: user mmap virtual address
-  * @length: length of mapping (bytes)
-  * @numa_node: Numa node number
-  * @csr_array: array of region address offset
-  *
-  * maps user allocated virtual address to physical address.
-  */
+/**
+ * struct dfl_cxl_cache_buffer_map - maps user address to physical address.
+ * @argsz: structure length
+ * @flags: flags
+ * @user_addr: user mmap virtual address
+ * @length: length of mapping (bytes)
+ * @numa_node: NUMA node number
+ * @csr_array: array of region address offsets
+ *
+ * Maps a user-allocated virtual address to a physical address.
+ */
 struct dfl_cxl_cache_buffer_map {
 	__u32 argsz;
 	__u32 flags;
@@ -386,16 +386,16 @@ struct dfl_cxl_cache_buffer_map {
 
 #define DFL_CXL_CACHE_NUMA_BUFFER_UNMAP  _IO(DFL_FPGA_MAGIC,  DFL_CXL_CACHE_BASE + 2)
 
- /**
-  * struct dfl_cxl_cache_buffer_unmap - unmaps user allocated memory.
-  * @argsz: structure length
-  * @flags: flags
-  * @user_addr: user mmap virtual address
-  * @length: length of mapping (bytes)
-  * @csr_array: array of region address offset
-  *
-  * unmaps user allocated memory.
-  */
+/**
+ * struct dfl_cxl_cache_buffer_unmap - unmaps user allocated memory.
+ * @argsz: structure length
+ * @flags: flags
+ * @user_addr: user mmap virtual address
+ * @length: length of mapping (bytes)
+ * @csr_array: array of region address offsets
+ *
+ * Unmaps user-allocated memory.
+ */
 struct dfl_cxl_cache_buffer_unmap {
 	__u32 argsz;
 	__u32 flags;
diff --git a/samples/cxl_host_exerciser/he_cache_test.h b/samples/cxl_host_exerciser/he_cache_test.h
index 845e895c5356..900e56bf8f7c 100644
--- a/samples/cxl_host_exerciser/he_cache_test.h
+++ b/samples/cxl_host_exerciser/he_cache_test.h
@@ -39,12 +39,16 @@
 #include <sys/ioctl.h>
 #include <sys/mman.h>
 #include <fcntl.h>
+#include <sys/syscall.h>
+#include <unistd.h>
 #include <CLI/CLI.hpp>
 #include <spdlog/spdlog.h>
 #include <spdlog/sinks/stdout_color_sinks.h>
 #include <spdlog/sinks/basic_file_sink.h>
 #include <opae/cxx/core.h>
 
+
+
 #include "fpga-dfl.h"
 
 using namespace std;

From 4776c2d12dd753f35634241c98c16192afe171bb Mon Sep 17 00:00:00 2001
From: anandaravuri <ananda.ravuri@intel.com>
Date: Tue, 26 Sep 2023 15:36:26 -0700
Subject: [PATCH 11/11] fix: replace original license header fpga-dfl.h

Signed-off-by: anandaravuri <ananda.ravuri@intel.com>
---
 libraries/plugins/xfpga/fpga-dfl.h | 39 +++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 14 deletions(-)

diff --git a/libraries/plugins/xfpga/fpga-dfl.h b/libraries/plugins/xfpga/fpga-dfl.h
index 83ab05447bc8..ab54fd40796b 100644
--- a/libraries/plugins/xfpga/fpga-dfl.h
+++ b/libraries/plugins/xfpga/fpga-dfl.h
@@ -1,17 +1,28 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * Header File for FPGA DFL User API
- *
- * Copyright (C) 2017-2018 Intel Corporation, Inc.
- *
- * Authors:
- *   Kang Luwei <luwei.kang@intel.com>
- *   Zhang Yi <yi.z.zhang@intel.com>
- *   Wu Hao <hao.wu@intel.com>
- *   Xiao Guangrong <guangrong.xiao@linux.intel.com>
- *   Tim Whisonant <tim.whisonant@intel.com>
- *   Ananda Ravuri <ananda.ravuri@intel.com>
- */
+// Copyright(c) 2017-2023, Intel Corporation
+//
+// Redistribution  and  use  in source  and  binary  forms,  with  or  without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of  source code  must retain the  above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name  of Intel Corporation  nor the names of its contributors
+//   may be used to  endorse or promote  products derived  from this  software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,  BUT NOT LIMITED TO,  THE
+// IMPLIED WARRANTIES OF  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.  IN NO EVENT  SHALL THE COPYRIGHT OWNER  OR CONTRIBUTORS BE
+// LIABLE  FOR  ANY  DIRECT,  INDIRECT,  INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR
+// CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT  NOT LIMITED  TO,  PROCUREMENT  OF
+// SUBSTITUTE GOODS OR SERVICES;  LOSS OF USE,  DATA, OR PROFITS;  OR BUSINESS
+// INTERRUPTION)  HOWEVER CAUSED  AND ON ANY THEORY  OF LIABILITY,  WHETHER IN
+// CONTRACT,  STRICT LIABILITY,  OR TORT  (INCLUDING NEGLIGENCE  OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
 
 #ifndef _UAPI_LINUX_FPGA_DFL_H
 #define _UAPI_LINUX_FPGA_DFL_H