diff --git a/libraries/plugins/xfpga/fpga-dfl.h b/libraries/plugins/xfpga/fpga-dfl.h
index 215908bf9742..ab54fd40796b 100644
--- a/libraries/plugins/xfpga/fpga-dfl.h
+++ b/libraries/plugins/xfpga/fpga-dfl.h
@@ -1,4 +1,4 @@
-// Copyright(c) 2017-2020, Intel Corporation
+// Copyright(c) 2017-2023, Intel Corporation
 //
 // Redistribution  and  use  in source  and  binary  forms,  with  or  without
 // modification, are permitted provided that the following conditions are met:
@@ -44,6 +44,7 @@
 #define DFL_FPGA_BASE 0
 #define DFL_PORT_BASE 0x40
 #define DFL_FME_BASE 0x80
+#define DFL_CXL_CACHE_BASE 0xA0
 #define DFL_PCI_SVA_BASE 0xf8
 
 /* Common IOCTLs for both FME and AFU file descriptor */
@@ -135,12 +136,20 @@ struct dfl_fpga_port_region_info {
  * Map the dma memory per user_addr and length which are provided by caller.
  * Driver fills the iova in provided struct afu_port_dma_map.
  * This interface only accepts page-size aligned user memory for dma mapping.
+ *
+ * Setting only one of DFL_DMA_MAP_FLAG_READ or WRITE limits FPGA-initiated
+ * DMA requests to only reads or only writes. To be backward-compatible with
+ * the legacy driver, setting neither flag is equivalent to setting both flags:
+ * both read and write requests are permitted.
+ *
  * Return: 0 on success, -errno on failure.
  */
 struct dfl_fpga_port_dma_map {
 	/* Input */
 	__u32 argsz;		/* Structure length */
-	__u32 flags;		/* Zero for now */
+	__u32 flags;
+#define DFL_DMA_MAP_FLAG_READ	(1 << 0)/* readable from device */
+#define DFL_DMA_MAP_FLAG_WRITE	(1 << 1)/* writable from device */
 	__u64 user_addr;        /* Process virtual address */
 	__u64 length;           /* Length of mapping (bytes)*/
 	/* Output */
@@ -308,4 +317,102 @@ struct dfl_fpga_fme_port_pr {
 #define DFL_PCI_SVA_UNBIND_DEV		_IO(DFL_FPGA_MAGIC,	\
 					    DFL_PCI_SVA_BASE + 1)
 
+/**
+ * DFL_CXL_CACHE_GET_REGION_INFO - _IOWR(DFL_FPGA_MAGIC, DFL_CXL_CACHE_BASE + 0,
+ *                                      struct dfl_cxl_cache_region_info)
+ *
+ * Retrieve information about a device memory region.
+ * Caller provides struct dfl_cxl_cache_region_info with flags.
+ * Driver returns the region info in other fields.
+ * Return: 0 on success, -errno on failure.
+ */
+
+#define DFL_CXL_CACHE_GET_REGION_INFO _IO(DFL_FPGA_MAGIC, DFL_CXL_CACHE_BASE + 0)
+
+/**
+ * struct dfl_cxl_cache_region_info - CXL cache region information
+ * @argsz: structure length
+ * @flags: access permission (DFL_CXL_CACHE_REGION_* bits)
+ * @size: region size (bytes)
+ * @offset: region offset from start of device fd
+ *
+ * Used to retrieve information about a device memory region.
+ */
+struct dfl_cxl_cache_region_info {
+	__u32 argsz;
+	__u32 flags;
+#define DFL_CXL_CACHE_REGION_READ	(1 << 0)
+#define DFL_CXL_CACHE_REGION_WRITE	(1 << 1)
+#define DFL_CXL_CACHE_REGION_MMAP	(1 << 2)
+	__u64 size;
+	__u64 offset;
+};
+
+/**
+ * DFL_CXL_CACHE_NUMA_BUFFER_MAP - _IOWR(DFL_FPGA_MAGIC, DFL_CXL_CACHE_BASE + 1,
+ *                                      struct dfl_cxl_cache_buffer_map)
+ *
+ * Map the user memory per user_addr, length and numa node which are
+ * provided by caller. The driver allocates memory on the numa node,
+ * converts the user's virtual address to a contiguous physical address,
+ * and writes the physical address to the cxl cache read/write address table CSR.
+ *
+ * This interface only accepts page-size aligned user memory for mapping.
+ * Return: 0 on success, -errno on failure.
+ */
+
+#define DFL_ARRAY_MAX_SIZE   0x10
+
+#define DFL_CXL_CACHE_NUMA_BUFFER_MAP    _IO(DFL_FPGA_MAGIC,  DFL_CXL_CACHE_BASE + 1)
+
+/**
+ * struct dfl_cxl_cache_buffer_map - maps user address to physical address.
+ * @argsz: structure length
+ * @flags: flags
+ * @user_addr: user mmap virtual address
+ * @length: length of mapping (bytes)
+ * @numa_node: NUMA node number
+ * @csr_array: array of region address offsets (DFL_ARRAY_MAX_SIZE entries)
+ *
+ * Maps a user-allocated virtual address to a physical address.
+ */
+struct dfl_cxl_cache_buffer_map {
+	__u32 argsz;
+	__u32 flags;
+	__u64 user_addr;
+	__u64 length;
+	__u32 numa_node;	/* NOTE(review): likely followed by implicit padding - confirm layout vs. driver */
+	__u64 csr_array[DFL_ARRAY_MAX_SIZE];
+};
+
+/**
+ * DFL_CXL_CACHE_NUMA_BUFFER_UNMAP - _IOWR(DFL_FPGA_MAGIC, DFL_CXL_CACHE_BASE + 2,
+ *                                      struct dfl_cxl_cache_buffer_unmap)
+ *
+ * Unmaps the user memory per user_addr and length which are provided by caller.
+ * The driver deletes the physical pages of the user address and writes a zero
+ * to the read/write address table CSR.
+ * Return: 0 on success, -errno on failure.
+ */
+
+#define DFL_CXL_CACHE_NUMA_BUFFER_UNMAP  _IO(DFL_FPGA_MAGIC,  DFL_CXL_CACHE_BASE + 2)
+
+/**
+ * struct dfl_cxl_cache_buffer_unmap - unmaps user allocated memory.
+ * @argsz: structure length
+ * @flags: flags
+ * @user_addr: user mmap virtual address
+ * @length: length of mapping (bytes)
+ * @csr_array: array of region address offsets (DFL_ARRAY_MAX_SIZE entries)
+ *
+ * Unmaps user-allocated memory.
+ */
+struct dfl_cxl_cache_buffer_unmap {
+	__u32 argsz;
+	__u32 flags;
+	__u64 user_addr;
+	__u64 length;
+	__u64 csr_array[DFL_ARRAY_MAX_SIZE];
+};
+
 #endif /* _UAPI_LINUX_FPGA_DFL_H */
diff --git a/opae.spec.fedora b/opae.spec.fedora
index 893bcec6416a..dcdbe10e7fa2 100644
--- a/opae.spec.fedora
+++ b/opae.spec.fedora
@@ -355,6 +355,7 @@ done
 %{_bindir}/mem_tg
 %{_bindir}/ofs.uio
 %{_bindir}/cxl_mem_tg
+%{_bindir}/cxl_host_exerciser
 
 %{python3_sitearch}/opae.diag*
 %{python3_sitearch}/opae/diag*
diff --git a/packaging/opae/deb/opae-extra-tools.install b/packaging/opae/deb/opae-extra-tools.install
index a85827b3f0de..a363035c3704 100644
--- a/packaging/opae/deb/opae-extra-tools.install
+++ b/packaging/opae/deb/opae-extra-tools.install
@@ -18,6 +18,7 @@ usr/bin/fpga_dma_N3000_test
 usr/bin/fpga_dma_test
 usr/bin/host_exerciser
 usr/bin/cxl_mem_tg
+usr/bin/cxl_host_exerciser
 usr/bin/bist
 usr/bin/hps
 usr/bin/hssi
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index db5023c44118..ab942e774b65 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -70,3 +70,5 @@ opae_add_subdirectory(host_exerciser)
 opae_add_subdirectory(n5010-test)
 opae_add_subdirectory(n5010-ctl)
 opae_add_subdirectory(cxl_mem_tg)
+opae_add_subdirectory(cxl_host_exerciser)
+
diff --git a/samples/cxl_host_exerciser/CMakeLists.txt b/samples/cxl_host_exerciser/CMakeLists.txt
new file mode 100644
index 000000000000..2bdf25fc0bcd
--- /dev/null
+++ b/samples/cxl_host_exerciser/CMakeLists.txt
@@ -0,0 +1,66 @@
+## Copyright(c) 2023, Intel Corporation
+##
+## Redistribution  and  use  in source  and  binary  forms,  with  or  without
+## modification, are permitted provided that the following conditions are met:
+##
+## * Redistributions of  source code  must retain the  above copyright notice,
+##   this list of conditions and the following disclaimer.
+## * Redistributions in binary form must reproduce the above copyright notice,
+##   this list of conditions and the following disclaimer in the documentation
+##   and/or other materials provided with the distribution.
+## * Neither the name  of Intel Corporation  nor the names of its contributors
+##   may be used to  endorse or promote  products derived  from this  software
+##   without specific prior written permission.
+##
+## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+## AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,  BUT NOT LIMITED TO,  THE
+## IMPLIED WARRANTIES OF  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+## ARE DISCLAIMED.  IN NO EVENT  SHALL THE COPYRIGHT OWNER  OR CONTRIBUTORS BE
+## LIABLE  FOR  ANY  DIRECT,  INDIRECT,  INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR
+## CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT  NOT LIMITED  TO,  PROCUREMENT  OF
+## SUBSTITUTE GOODS OR SERVICES;  LOSS OF USE,  DATA, OR PROFITS;  OR BUSINESS
+## INTERRUPTION)  HOWEVER CAUSED  AND ON ANY THEORY  OF LIABILITY,  WHETHER IN
+## CONTRACT,  STRICT LIABILITY,  OR TORT  (INCLUDING NEGLIGENCE  OR OTHERWISE)
+## ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  EVEN IF ADVISED OF THE
+## POSSIBILITY OF SUCH DAMAGE.
+
+if (OPAE_WITH_CLI11 AND OPAE_WITH_SPDLOG AND OPAE_WITH_NUMA)
+    # cxl_host_exerciser is only defined when CLI11, spdlog and NUMA are enabled
+    if (fmt_LIBRARIES)
+        # if we found fmt before (from CMakeLists.txt)
+        # then we need to find it again from this directory
+        # so we can "import" the fmt::fmt link target
+        find_package(fmt)
+    endif (fmt_LIBRARIES)
+
+    opae_add_executable(TARGET cxl_host_exerciser
+        SOURCE cxl_host_exerciser.cpp
+        LIBS
+            opae-cxx-core
+            opae-c
+            ${spdlog_LIBRARIES}
+            ${json-c_LIBRARIES}
+            ${uuid_LIBRARIES}
+            ${numa_LIBRARIES}
+            ${fmt_LIBRARIES}
+        COMPONENT samplebin
+    )
+    # Header search paths; the xfpga plugin dir is presumably for fpga-dfl.h
+    target_include_directories(cxl_host_exerciser
+        PRIVATE
+           ${OPAE_INCLUDE_PATHS}
+           ${CMAKE_CURRENT_SOURCE_DIR}
+           ${OPAE_LIB_SOURCE}/plugins/xfpga/
+           ${CLI11_INCLUDE_DIRS}
+           ${numa_INCLUDE_DIRS}
+           ${spdlog_INCLUDE_DIRS})
+    # -Wno-unused-result: do not warn when a call's return value is ignored
+    target_compile_options(cxl_host_exerciser PUBLIC
+        -Wno-unused-result
+    )
+
+    target_compile_definitions(cxl_host_exerciser PUBLIC
+        ${spdlog_DEFINITIONS}
+    )
+
+endif(OPAE_WITH_CLI11 AND OPAE_WITH_SPDLOG AND OPAE_WITH_NUMA)
diff --git a/samples/cxl_host_exerciser/cxl_he_cache_cmd.h b/samples/cxl_host_exerciser/cxl_he_cache_cmd.h
new file mode 100644
index 000000000000..5272d5333067
--- /dev/null
+++ b/samples/cxl_host_exerciser/cxl_he_cache_cmd.h
@@ -0,0 +1,918 @@
+// Copyright(c) 2023, Intel Corporation
+//
+// Redistribution  and  use  in source  and  binary  forms,  with  or  without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of  source code  must retain the  above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name  of Intel Corporation  nor the names of its contributors
+//   may be used to  endorse or promote  products derived  from this  software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,  BUT NOT LIMITED TO,  THE
+// IMPLIED WARRANTIES OF  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.  IN NO EVENT  SHALL THE COPYRIGHT OWNER  OR CONTRIBUTORS BE
+// LIABLE  FOR  ANY  DIRECT,  INDIRECT,  INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR
+// CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT  NOT LIMITED  TO,  PROCUREMENT  OF
+// SUBSTITUTE GOODS OR SERVICES;  LOSS OF USE,  DATA, OR PROFITS;  OR BUSINESS
+// INTERRUPTION)  HOWEVER CAUSED  AND ON ANY THEORY  OF LIABILITY,  WHETHER IN
+// CONTRACT,  STRICT LIABILITY,  OR TORT  (INCLUDING NEGLIGENCE  OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+#pragma once
+
+#include "cxl_he_cmd.h"
+#include "cxl_host_exerciser.h"
+#include "he_cache_test.h"
+
+#define UNUSED_PARAM(x) ((void)x)
+
+// HE exit global flags; both are set to true by he_sig_handler
+volatile bool g_he_exit = false; // NOTE(review): non-inline definition in a header - confirm only one TU includes it
+volatile static bool g_stop_thread = false; // also set by the host cache-hit tests before joining their worker thread
+
+void he_sig_handler(int) { // HE signal handler: async-signal-safe calls only
+  static const char msg[] = "HE signal handler exit app\n";
+  g_he_exit = true;
+  g_stop_thread = true;
+  write(STDOUT_FILENO, msg, sizeof(msg) - 1); // write(2), unlike cout, is safe here
+}
+
+namespace host_exerciser {
+
+void he_cache_thread(uint8_t *buf_ptr, uint64_t len); // defined elsewhere; run on a std::thread by the host cache-hit tests
+
+class he_cache_cmd : public he_cmd {
+public:
+  he_cache_cmd() // option fields start zeroed; add_options() installs CLI defaults
+      : he_continuousmode_(false), he_contmodetime_(0), he_linerep_count_(0),
+        he_stide_(0), he_test_(0), he_test_all_(false) {}
+
+  virtual ~he_cache_cmd() {}
+  // Name of this command.
+  virtual const char *name() const override { return "cache"; }
+  // One-line description of this command.
+  virtual const char *description() const override {
+    return "run simple cxl he cache test";
+  }
+  // AFU ID string for this command (HE_CACHE_AFU_ID).
+  virtual const char *afu_id() const override { return HE_CACHE_AFU_ID; }
+  // Feature ID / GUID constants. NOTE(review): MEM_TG_* names here - confirm intended for HE cache.
+  virtual uint64_t featureid() const override { return MEM_TG_FEATURE_ID; }
+
+  virtual uint64_t guidl() const override { return MEM_TG_FEATURE_GUIDL; }
+
+  virtual uint64_t guidh() const override { return MEM_TG_FEATURE_GUIDH; }
+
+  virtual void add_options(CLI::App *app) override {
+    // Registers this command's options on app; every option has a default.
+    // test mode (value is mapped through he_test_modes)
+    app->add_option(
+           "--test", he_test_,
+           "host exerciser cache test")
+        ->transform(CLI::CheckedTransformer(he_test_modes))
+        ->default_val("fpgardcachehit");
+
+    // Continuous mode
+    app->add_option("--continuousmode", he_continuousmode_,
+                    "test rollover or test termination")
+        ->default_val("false");
+
+    // Continuous mode time
+    app->add_option("--contmodetime", he_contmodetime_,
+                    "Continuous mode time in seconds")
+        ->default_val("1");
+
+    // target host or fpga (value is mapped through he_targets)
+    app->add_option("--target", he_target_,
+                    "host exerciser run on host or fpga")
+        ->transform(CLI::CheckedTransformer(he_targets))
+        ->default_val("host");
+    // stride; NOTE(review): not read by the tests visible here - confirm use
+    app->add_option("--stride", he_stide_, "Enable stride mode")
+        ->default_val("0");
+
+    // Line repeat count, restricted to the range 1..256
+    app->add_option("--linerepcount", he_linerep_count_, "Line repeat count")
+        ->transform(CLI::Range(1, 256))
+        ->default_val("10");
+
+    // Test all
+    app->add_option("--testall", he_test_all_, "Run all tests")
+        ->default_val("false");
+  }
+
+  int he_run_fpga_rd_cache_hit_test() {
+    cout << "********** FPGA Read cache hit test start**********" << endl;
+    /*
+    STEPS
+    1) Set read number of lines (FPGA_32KB_CACHE_LINES)
+    2) Set RdShared (CXL) config with line repeat count 1
+    3) Enable address stride in the read address table control
+    4) Allocate DSM, Read buffer
+    5) Run test ( AFU copies cache from host memory to FPGA cache)
+    6) Set RdShared (CXL) config with the configured line repeat count
+    7) Run test ( AFU read cache from FPGA cache)
+    8) Free DSM, Read buffer; return 0 (-1 on any failure)
+    */
+
+    // Read HE_INFO (provides the table size printed below)
+    // Set Read number Lines
+    he_info_.value = host_exe_->read64(HE_INFO);
+    host_exe_->write64(HE_RD_NUM_LINES, FPGA_32KB_CACHE_LINES);
+
+    cout << "Read number Lines:" << FPGA_32KB_CACHE_LINES << endl;
+    cout << "Line Repeat Count:" << he_linerep_count_ << endl;
+    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
+    cout << "Numa node:" << numa_node_ << endl;
+
+    // set RD_CONFIG RdShared (CXL); first pass uses a repeat count of 1
+    he_rd_cfg_.value = 0;
+    he_rd_cfg_.line_repeat_count = 1;
+    he_rd_cfg_.read_traffic_enable = 1;
+    he_rd_cfg_.opcode = RD_LINE_S;
+    host_exe_->write64(HE_RD_CONFIG, he_rd_cfg_.value);
+
+    // set RD_ADDR_TABLE_CTRL
+    rd_table_ctl_.value = 0;
+    rd_table_ctl_.enable_address_stride = 1;
+    host_exe_->write64(HE_RD_ADDR_TABLE_CTRL, rd_table_ctl_.value);
+
+    // Allocate DSM buffer
+    if (!host_exe_->allocate_dsm()) {
+      cerr << "allocate dsm failed" << endl;
+      return -1;
+    }
+
+    // Allocate Read buffer; release the DSM again if this fails
+    if (!host_exe_->allocate_cache_read(BUFFER_SIZE_2MB, numa_node_)) {
+      cerr << "allocate cache read failed" << endl;
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    // Start test (pulse the Start bit: write 1, then 0)
+    he_ctl_.Start = 1;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+    he_ctl_.Start = 0;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+
+    // wait for completion; on failure report counters/errors and free buffers
+    if (!he_wait_test_completion()) {
+      he_perf_counters();
+      host_exerciser_errors();
+      host_exe_->free_cache_read();
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    he_perf_counters();
+
+    cout << "********** AFU Copied host cache to FPGA Cache successfully "
+            "********** " << endl;
+
+    // set RD_CONFIG RdShared (CXL); second pass uses the configured repeat count
+    he_rd_cfg_.value = 0;
+    he_rd_cfg_.line_repeat_count = he_linerep_count_;
+    he_rd_cfg_.read_traffic_enable = 1;
+    he_rd_cfg_.opcode = RD_LINE_S;
+    host_exe_->write64(HE_RD_CONFIG, he_rd_cfg_.value);
+
+    // set RD_ADDR_TABLE_CTRL
+    rd_table_ctl_.value = 0;
+    rd_table_ctl_.enable_address_stride = 1;
+    host_exe_->write64(HE_RD_ADDR_TABLE_CTRL, rd_table_ctl_.value);
+
+    // Start test (pulse the Start bit: write 1, then 0)
+    he_ctl_.Start = 1;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+    he_ctl_.Start = 0;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+
+    // wait for completion; on failure report counters/errors and free buffers
+    if (!he_wait_test_completion()) {
+      he_perf_counters();
+      host_exerciser_errors();
+      host_exe_->free_cache_read();
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    he_perf_counters();
+    host_exe_->free_dsm();
+    host_exe_->free_cache_read();
+
+    cout << "********** AFU reads cache from FPGA Cache successfully"
+        " **********" << endl;
+    cout << "********** FPGA Read cache hit test end**********" << endl;
+    return 0;
+  }
+
+  int he_run_fpga_wr_cache_hit_test() {
+
+    cout << "********** FPGA Write cache hit test start**********" << endl;
+    /*
+    STEPS
+    1) Set read number of lines (FPGA_32KB_CACHE_LINES)
+    2) Set RdShared (CXL) config with line repeat count 1
+    3) Allocate DSM, Read buffer, Write buffer
+    4) Run test ( AFU copies cache from host memory to FPGA cache)
+    5) Set WrLine_M (CXL) config with the configured line repeat count
+    6) Set write number of lines (FPGA_32KB_CACHE_LINES)
+    7) Run test ( AFU writes to FPGA cache)
+    8) Free Read/Write buffers and DSM; return 0 (-1 on any failure)
+    */
+
+    // Read HE_INFO (provides the table sizes printed below)
+    // Set Read number Lines
+    he_info_.value = host_exe_->read64(HE_INFO);
+    host_exe_->write64(HE_RD_NUM_LINES, FPGA_32KB_CACHE_LINES);
+
+    cout << "Read/write number Lines:" << FPGA_32KB_CACHE_LINES << endl;
+    cout << "Line Repeat Count:" << he_linerep_count_ << endl;
+    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
+    cout << "Write address table size:" << he_info_.write_addr_table_size
+        << endl;
+
+    // set RD_CONFIG RdShared (CXL); the fill pass uses a repeat count of 1
+    he_rd_cfg_.value = 0;
+    he_rd_cfg_.line_repeat_count = 1;
+    he_rd_cfg_.read_traffic_enable = 1;
+    he_rd_cfg_.opcode = RD_LINE_S;
+    host_exe_->write64(HE_RD_CONFIG, he_rd_cfg_.value);
+
+    // set RD_ADDR_TABLE_CTRL
+    rd_table_ctl_.value = 0;
+    rd_table_ctl_.enable_address_stride = 1;
+    host_exe_->write64(HE_RD_ADDR_TABLE_CTRL, rd_table_ctl_.value);
+
+    // Allocate DSM buffer
+    if (!host_exe_->allocate_dsm()) {
+      cerr << "allocate dsm failed" << endl;
+      return -1;
+    }
+
+    // Allocate Read, Write buffer; release the DSM again if this fails
+    if (!host_exe_->allocate_cache_read_write(BUFFER_SIZE_2MB, numa_node_)) {
+      cerr << "allocate cache read write failed" << endl;
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    // Start test (pulse the Start bit: write 1, then 0)
+    he_ctl_.Start = 1;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+    he_ctl_.Start = 0;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+
+    // wait for completion; on failure report counters/errors and free buffers
+    if (!he_wait_test_completion()) {
+      he_perf_counters();
+      host_exerciser_errors();
+      host_exe_->free_cache_read_write();
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    he_perf_counters();
+
+    cout << "********** AFU Copied host cache to FPGA Cache successfully "
+            "********** " << endl;
+
+    // set WR_CONFIG WrLine_M (CXL) with the configured repeat count
+    he_wr_cfg_.value = 0;
+    he_wr_cfg_.line_repeat_count = he_linerep_count_;
+    he_wr_cfg_.write_traffic_enable = 1;
+    he_wr_cfg_.opcode = WR_LINE_M;
+    host_exe_->write64(HE_WR_CONFIG, he_wr_cfg_.value);
+
+    // Set WR_ADDR_TABLE_CTRL
+    wr_table_ctl_.value = 0;
+    wr_table_ctl_.enable_address_stride = 1;
+    host_exe_->write64(HE_WR_ADDR_TABLE_CTRL, wr_table_ctl_.value);
+
+    host_exe_->write64(HE_WR_NUM_LINES, FPGA_32KB_CACHE_LINES);
+    // Start test (pulse the Start bit: write 1, then 0)
+    he_ctl_.Start = 1;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+    he_ctl_.Start = 0;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+
+    // wait for completion; on failure report counters/errors and free buffers
+    if (!he_wait_test_completion()) {
+      he_perf_counters();
+      host_exerciser_errors();
+      host_exe_->free_cache_read_write();
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    he_perf_counters();
+    cout << "********** AFU Write to  FPGA Cache  successfully ********** "
+         << endl;
+
+    host_exe_->free_cache_read_write();
+    host_exe_->free_dsm();
+
+    cout << "********** FPGA Write cache hit test end**********" << endl;
+
+    return 0;
+  }
+
+  int he_run_fpga_rd_cache_miss_test() {
+
+    cout << "********** FPGA Read cache miss test start**********" << endl;
+    /*
+    STEPS
+    1) Set read number of lines to FPGA_2MB_CACHE_LINES (more than 32kb/64)
+    2) Set RdShared (CXL) config and enable address stride
+    3) Allocate DSM, Read buffer
+    4) Run test (Buffer is not present in FPGA - FPGA read Cache miss )
+    */
+
+    // Read HE_INFO (provides the table size printed below)
+    // Set Read number Lines
+    he_info_.value = host_exe_->read64(HE_INFO);
+    host_exe_->write64(HE_RD_NUM_LINES, FPGA_2MB_CACHE_LINES);
+
+    cout << "Read number Lines:" << FPGA_2MB_CACHE_LINES << endl;
+    cout << "Line Repeat Count:" << he_linerep_count_ << endl;
+    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
+
+    // set RD_CONFIG RdShared (CXL) with the configured repeat count
+    he_rd_cfg_.value = 0;
+    he_rd_cfg_.line_repeat_count = he_linerep_count_;
+    he_rd_cfg_.read_traffic_enable = 1;
+    he_rd_cfg_.opcode = RD_LINE_S;
+    host_exe_->write64(HE_RD_CONFIG, he_rd_cfg_.value);
+
+    // set RD_ADDR_TABLE_CTRL
+    rd_table_ctl_.value = 0;
+    rd_table_ctl_.enable_address_stride = 1;
+    host_exe_->write64(HE_RD_ADDR_TABLE_CTRL, rd_table_ctl_.value);
+
+    // Allocate DSM buffer
+    if (!host_exe_->allocate_dsm()) {
+      cerr << "allocate dsm failed" << endl;
+      return -1;
+    }
+
+    // Allocate Read buffer; release the DSM again if this fails
+    if (!host_exe_->allocate_cache_read(BUFFER_SIZE_2MB, numa_node_)) {
+      cerr << "allocate cache read failed" << endl;
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    // start test (pulse the Start bit: write 1, then 0)
+    he_ctl_.Start = 1;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+    he_ctl_.Start = 0;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+
+    // wait for completion; on failure report counters/errors and free buffers
+    if (!he_wait_test_completion()) {
+      he_perf_counters();
+      host_exerciser_errors();
+      host_exe_->free_cache_read();
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    he_perf_counters();
+    host_exe_->free_cache_read();
+    host_exe_->free_dsm();
+
+    cout << "********** AFU Read FPGA Cache Miss successfully ********** "
+         << endl;
+    cout << "********** FPGA Read cache miss test end**********" << endl;
+    return 0;
+  }
+
+  int he_run_fpga_wr_cache_miss_test() {
+
+    cout << "********** FPGA write cache miss test start**********" << endl;
+    /*
+    STEPS
+    1) Set write number of lines to FPGA_2MB_CACHE_LINES (more than 32kb/64)
+    2) Set WrLine_M (CXL) config and enable address stride
+    3) Allocate DSM, Read buffer, Write buffer
+    4) Run test ( Buffer is not present in FPGA - FPGA write Cache miss )
+    */
+
+    // Read HE_INFO (provides the table sizes printed below)
+    // Set Write number Lines
+    he_info_.value = host_exe_->read64(HE_INFO);
+    host_exe_->write64(HE_WR_NUM_LINES, FPGA_2MB_CACHE_LINES);
+
+    cout << "Read/write number Lines:" << FPGA_2MB_CACHE_LINES << endl;
+    cout << "Line Repeat Count:" << he_linerep_count_ << endl;
+    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
+    cout << "Write address table size:" << he_info_.write_addr_table_size
+        << endl;
+
+    // set WR_CONFIG WrLine_M (CXL) with the configured repeat count
+    he_wr_cfg_.value = 0;
+    he_wr_cfg_.line_repeat_count = he_linerep_count_;
+    he_wr_cfg_.write_traffic_enable = 1;
+    he_wr_cfg_.opcode = WR_LINE_M;
+    host_exe_->write64(HE_WR_CONFIG, he_wr_cfg_.value);
+
+    // Set WR_ADDR_TABLE_CTRL
+    wr_table_ctl_.value = 0;
+    wr_table_ctl_.enable_address_stride = 1;
+    host_exe_->write64(HE_WR_ADDR_TABLE_CTRL, wr_table_ctl_.value);
+
+    // Allocate DSM buffer
+    if (!host_exe_->allocate_dsm()) {
+      cerr << "allocate dsm failed" << endl;
+      return -1;
+    }
+
+    // Allocate Read, Write buffer; release the DSM again if this fails
+    if (!host_exe_->allocate_cache_read_write(BUFFER_SIZE_2MB, numa_node_)) {
+      cerr << "allocate cache read write failed" << endl;
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    // start test (pulse the Start bit: write 1, then 0)
+    he_ctl_.Start = 1;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+    he_ctl_.Start = 0;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+
+    // wait for completion; on failure report counters/errors and free buffers
+    if (!he_wait_test_completion()) {
+      he_perf_counters();
+      host_exerciser_errors();
+      host_exe_->free_cache_read_write();
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    he_perf_counters();
+    host_exe_->free_cache_read_write();
+    host_exe_->free_dsm();
+
+    cout << "********** AFU Write FPGA Cache Miss successfully ********** "
+         << endl;
+    cout << "********** FPGA Write cache miss test end**********" << endl;
+    return 0;
+  }
+
+  int he_run_host_rd_cache_hit_test() {
+
+    cout << "********** Host LLC Read cache hit test start**********" << endl;
+    /*
+    STEPS
+    1) Set read number of lines; set RdLine_I (CXL) config, address stride
+    2) Allocate DSM, Read buffer
+    3) create thread read buffer (he_cache_thread on the read buffer)
+    4) Run test ( AFU reads from host cache to FPGA cache), then join thread
+    */
+
+    // Read HE_INFO (provides the table sizes printed below)
+    // Set Read number Lines
+    he_info_.value = host_exe_->read64(HE_INFO);
+    host_exe_->write64(HE_RD_NUM_LINES, FPGA_32KB_CACHE_LINES);
+
+    cout << "Read number Lines:" << FPGA_32KB_CACHE_LINES << endl;
+    cout << "Line Repeat Count:" << he_linerep_count_ << endl;
+    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
+    cout << "Write address table size:" << he_info_.write_addr_table_size
+        << endl;
+
+    // set RD_CONFIG RdLine_I (CXL)
+    he_rd_cfg_.value = 0;
+    he_rd_cfg_.line_repeat_count = he_linerep_count_;
+    he_rd_cfg_.read_traffic_enable = 1;
+    he_rd_cfg_.opcode = RD_LINE_I;
+    host_exe_->write64(HE_RD_CONFIG, he_rd_cfg_.value);
+
+    // set RD_ADDR_TABLE_CTRL
+    rd_table_ctl_.value = 0;
+    rd_table_ctl_.enable_address_stride = 1;
+    host_exe_->write64(HE_RD_ADDR_TABLE_CTRL, rd_table_ctl_.value);
+
+    // Allocate DSM buffer
+    if (!host_exe_->allocate_dsm()) {
+      cerr << "alloc dsm failed" << endl;
+      return -1;
+    }
+
+    // Allocate Read buffer; release the DSM again if this fails
+    if (!host_exe_->allocate_cache_read(BUFFER_SIZE_2MB, numa_node_)) {
+      cerr << "allocate cache read failed" << endl;
+      host_exe_->free_dsm();
+      return -1;
+    }
+    // NOTE(review): g_stop_thread is not reset here - confirm he_cache_thread clears it
+    cout << " create thread - moves read buffer to host cache " << endl;
+    std::thread t1(he_cache_thread, host_exe_->get_read(), BUFFER_SIZE_2MB);
+    sleep(1); // presumably lets the worker touch the buffer before the AFU starts
+
+    // start (pulse the Start bit: write 1, then 0)
+    he_ctl_.Start = 1;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+    he_ctl_.Start = 0;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+
+    // wait for completion; on failure stop the worker before freeing its buffer
+    if (!he_wait_test_completion()) {
+      he_perf_counters();
+      host_exerciser_errors();
+      g_stop_thread = true;
+      t1.join();
+      sleep(1);
+      host_exe_->free_cache_read();
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    g_stop_thread = true; // ask the worker to exit, then wait for it
+    t1.join();
+
+    he_perf_counters();
+    sleep(1);
+    host_exe_->free_cache_read();
+    host_exe_->free_dsm();
+
+    cout << "********** AFU Copied host cache to FPGA Cache successfully "
+            "********** " << endl;
+    cout << "********** Host LLC cache hit test end**********" << endl;
+    return 0;
+  }
+
+  int he_run_host_wr_cache_hit_test() {
+
+    cout << "********** Host LLC Write cache hit test start**********" << endl;
+
+    /*
+    STEPS
+    1) Set write number of lines; set WR_LINE_I config, address stride
+    2) Allocate DSM, Write buffer
+    3) create thread on the write buffer (he_cache_thread)
+    4) Run test ( AFU write to host cache), then join thread
+    */
+
+    // Read HE_INFO (provides the table sizes printed below)
+    // Set Write number Lines
+    he_info_.value = host_exe_->read64(HE_INFO);
+
+    host_exe_->write64(HE_WR_NUM_LINES, FPGA_32KB_CACHE_LINES);
+    cout << "Write number Lines:" << FPGA_32KB_CACHE_LINES  << endl;
+    cout << "Line Repeat Count:" << he_linerep_count_ << endl;
+    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
+    cout << "Write address table size:" << he_info_.write_addr_table_size
+        << endl;
+
+    // set WR_CONFIG
+    he_wr_cfg_.value = 0;
+    he_wr_cfg_.line_repeat_count = he_linerep_count_;
+    he_wr_cfg_.write_traffic_enable = 1;
+    he_wr_cfg_.opcode = WR_LINE_I;
+    host_exe_->write64(HE_WR_CONFIG, he_wr_cfg_.value);
+
+    // set WR_ADDR_TABLE_CTRL
+    wr_table_ctl_.value = 0;
+    wr_table_ctl_.enable_address_stride = 1;
+    host_exe_->write64(HE_WR_ADDR_TABLE_CTRL, wr_table_ctl_.value);
+
+    // Allocate DSM buffer
+    if (!host_exe_->allocate_dsm()) {
+      cerr << "alloc dsm failed" << endl;
+      return -1;
+    }
+
+    // Allocate Write buffer; release the DSM again if this fails
+    if (!host_exe_->allocate_cache_write(BUFFER_SIZE_2MB, numa_node_)) {
+      cerr << "allocate cache write failed" << endl;
+      host_exe_->free_dsm();
+      return -1;
+    }
+    // NOTE(review): g_stop_thread is not reset here - confirm he_cache_thread clears it
+    cout << " create thread - moves write buffer to host cache " << endl;
+    std::thread t1(he_cache_thread, host_exe_->get_write(), BUFFER_SIZE_2MB);
+    sleep(1); // presumably lets the worker touch the buffer before the AFU starts
+
+    // start (pulse the Start bit: write 1, then 0)
+    he_ctl_.Start = 1;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+    he_ctl_.Start = 0;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+
+    // wait for completion; on failure stop the worker before freeing its buffer
+    if (!he_wait_test_completion()) {
+      he_perf_counters();
+      host_exerciser_errors();
+      g_stop_thread = true;
+      t1.join();
+      sleep(1);
+      host_exe_->free_cache_write();
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    g_stop_thread = true; // ask the worker to exit, then wait for it
+    t1.join();
+    he_perf_counters();
+    cout << "********** AFU write  host cache successfully ********** " << endl;
+
+    sleep(1);
+    host_exe_->free_cache_write();
+    host_exe_->free_dsm();
+
+    cout << "********** Host LLC cache hit Write test end**********" << endl;
+    return 0;
+  }
+
+  int he_run_host_rd_cache_miss_test() {
+    cout << "********** Host LLC Read cache miss test start**********" << endl;
+
+    /*
+    STEPS
+    1) Set read number of lines (FPGA_32KB_CACHE_LINES - 1)
+    2) Set RdLine_I (CXL) config and enable address stride
+    3) Allocate DSM, Read buffer (NOTE(review): no explicit flush here - confirm)
+    4) Run test ( AFU reads from host cache to FPGA cache)
+    */
+
+    // Read HE_INFO (provides the table sizes printed below)
+    // Set Read number Lines
+    he_info_.value = host_exe_->read64(HE_INFO);
+    host_exe_->write64(HE_RD_NUM_LINES, FPGA_32KB_CACHE_LINES - 1);
+    cout << "Read/write number Lines:" << FPGA_32KB_CACHE_LINES - 1 << endl;
+    cout << "Line Repeat Count:" << he_linerep_count_ << endl;
+    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
+    cout << "Write address table size:" << he_info_.write_addr_table_size
+        << endl;
+
+    // set RD_CONFIG RdLine_I (CXL) with the configured repeat count
+    he_rd_cfg_.value = 0;
+    he_rd_cfg_.line_repeat_count = he_linerep_count_;
+    he_rd_cfg_.read_traffic_enable = 1;
+    he_rd_cfg_.opcode = RD_LINE_I;
+    host_exe_->write64(HE_RD_CONFIG, he_rd_cfg_.value);
+
+    // set RD_ADDR_TABLE_CTRL
+    rd_table_ctl_.value = 0;
+    rd_table_ctl_.enable_address_stride = 1;
+    host_exe_->write64(HE_RD_ADDR_TABLE_CTRL, rd_table_ctl_.value);
+
+    // Allocate DSM buffer
+    if (!host_exe_->allocate_dsm()) {
+      cerr << "alloc dsm failed" << endl;
+      return -1;
+    }
+
+    // Allocate Read buffer; release the DSM again if this fails
+    if (!host_exe_->allocate_cache_read(BUFFER_SIZE_2MB, numa_node_)) {
+      cerr << "allocate cache read failed" << endl;
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    // start (pulse the Start bit: write 1, then 0)
+    he_ctl_.Start = 1;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+    he_ctl_.Start = 0;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+
+    // wait for completion; on failure report counters/errors and free buffers
+    if (!he_wait_test_completion()) {
+      he_perf_counters();
+      host_exerciser_errors();
+      host_exe_->free_cache_read();
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    he_perf_counters();
+    host_exe_->free_cache_read();
+    host_exe_->free_dsm();
+
+    cout << "********** Ran  Host LLC Read cache miss successfully ********** "
+         << endl;
+
+    cout << "********** Host LLC Read cache miss test end**********" << endl;
+    return 0;
+  }
+
+  // Host LLC write cache-miss test; returns 0 on success, -1 on failure.
+  int he_run_host_wr_cache_miss_test() {
+    cout << "********** Host LLC Write cache miss test start**********" << endl;
+
+    /*
+    STEPS (as performed below)
+    1) Program the write line count and the WR_PUSH_I write config
+    2) Enable address stride in the write address table control CSR
+    3) Allocate DSM and write buffers
+    4) Pulse Start, wait for completion, print counters, free buffers
+    */
+
+    // HE_INFO
+    // Set write number Lines
+    he_info_.value = host_exe_->read64(HE_INFO);
+    host_exe_->write64(HE_WR_NUM_LINES, 1);
+    cout << "Write number Lines:" << 1 << endl;
+    cout << "Line Repeat Count:" << he_linerep_count_ << endl;
+    cout << "Read address table size:" << he_info_.read_addr_table_size << endl;
+    cout << "Write address table size:" << he_info_.write_addr_table_size
+        << endl;
+
+    // set WR_CONFIG
+    he_wr_cfg_.value = 0;
+    he_wr_cfg_.line_repeat_count = he_linerep_count_;
+    he_wr_cfg_.write_traffic_enable = 1;
+    he_wr_cfg_.opcode = WR_PUSH_I;
+    host_exe_->write64(HE_WR_CONFIG, he_wr_cfg_.value);
+
+    // set WR_ADDR_TABLE_CTRL (write-side address table control)
+    wr_table_ctl_.value = 0;
+    wr_table_ctl_.enable_address_stride = 1;
+    host_exe_->write64(HE_WR_ADDR_TABLE_CTRL, wr_table_ctl_.value);
+
+    // Allocate DSM buffer
+    if (!host_exe_->allocate_dsm()) {
+      cerr << "alloc dsm failed" << endl;
+      return -1;
+    }
+
+    // Allocate write buffer
+    if (!host_exe_->allocate_cache_write(BUFFER_SIZE_2MB, numa_node_)) {
+      cerr << "allocate cache write failed" << endl;
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    // start
+    he_ctl_.Start = 1;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+    he_ctl_.Start = 0;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+
+    // wait for completion
+    if (!he_wait_test_completion()) {
+      he_perf_counters();
+      host_exerciser_errors();
+      host_exe_->free_cache_write();
+      host_exe_->free_dsm();
+      return -1;
+    }
+
+    he_perf_counters();
+    host_exe_->free_cache_write();
+    host_exe_->free_dsm();
+
+    cout << "********** Ran  Host LLC Write cache miss successfully ********** "
+         << endl;
+
+    cout << "********** Host LLC Write cache miss test end**********" << endl;
+    return 0;
+  }
+
+  // Runs the HE cache tests: resets the HE block, then executes every test
+  // (he_test_all_) or only the one selected by he_test_. Returns -1 when afu is
+  // not a host_exerciser, the last non-zero test result in run-all mode, the
+  // selected test's result otherwise, and 0 when he_test_ matches no test.
+  virtual int run(test_afu *afu, CLI::App *app) {
+    (void)app;
+    int ret = 0;
+
+    host_exe_ = dynamic_cast<host_exerciser *>(afu);
+    if (host_exe_ == nullptr) {
+      cerr << "afu is not a host_exerciser" << endl;
+      return -1;
+    }
+
+    if (!verify_numa_node()) {
+      numa_node_ = 0;
+      cout << "numa nodes are not available set numa node to 0" << endl;
+    }
+
+    // reset HE cache
+    he_ctl_.value = 0;
+    he_ctl_.ResetL = 0;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+
+    he_ctl_.value = 0;
+    he_ctl_.ResetL = 1;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+
+    if (he_test_all_ == true) {
+      int retvalue = 0;
+      ret = he_run_fpga_rd_cache_hit_test();
+      if (ret != 0) {
+        retvalue = ret;
+      }
+      ret = he_run_fpga_wr_cache_hit_test();
+      if (ret != 0) {
+        retvalue = ret;
+      }
+
+      ret = he_run_fpga_rd_cache_miss_test();
+      if (ret != 0) {
+        retvalue = ret;
+      }
+      ret = he_run_fpga_wr_cache_miss_test();
+      if (ret != 0) {
+        retvalue = ret;
+      }
+      ret = he_run_host_rd_cache_hit_test();
+      if (ret != 0) {
+        retvalue = ret;
+      }
+      ret = he_run_host_wr_cache_hit_test();
+      if (ret != 0) {
+        retvalue = ret;
+      }
+
+      ret = he_run_host_rd_cache_miss_test();
+      if (ret != 0) {
+        retvalue = ret;
+      }
+      ret = he_run_host_wr_cache_miss_test();
+      if (ret != 0) {
+        retvalue = ret;
+      }
+
+      return retvalue;
+    }
+
+    if (he_test_ == HE_FPGA_RD_CACHE_HIT) {
+      return he_run_fpga_rd_cache_hit_test();
+    }
+
+    if (he_test_ == HE_FPGA_WR_CACHE_HIT) {
+      return he_run_fpga_wr_cache_hit_test();
+    }
+
+    if (he_test_ == HE_FPGA_RD_CACHE_MISS) {
+      return he_run_fpga_rd_cache_miss_test();
+    }
+
+    if (he_test_ == HE_FPGA_WR_CACHE_MISS) {
+      return he_run_fpga_wr_cache_miss_test();
+    }
+
+    if (he_test_ == HE_HOST_RD_CACHE_HIT) {
+      return he_run_host_rd_cache_hit_test();
+    }
+
+    if (he_test_ == HE_HOST_WR_CACHE_HIT) {
+      return he_run_host_wr_cache_hit_test();
+    }
+
+    if (he_test_ == HE_HOST_RD_CACHE_MISS) {
+      return he_run_host_rd_cache_miss_test();
+    }
+
+    if (he_test_ == HE_HOST_WR_CACHE_MISS) {
+      return he_run_host_wr_cache_miss_test();
+    }
+
+    return 0;
+  }
+
+protected:
+  bool he_continuousmode_; // presumably continuous-mode enable; TODO confirm
+  uint32_t he_contmodetime_; // presumably continuous-mode duration; TODO confirm
+  uint32_t he_linerep_count_; // copied into line_repeat_count of RD/WR config
+  uint32_t he_stide_; // NOTE(review): presumably "stride"; unused in code shown
+  uint32_t he_test_; // selected he_test_mode, dispatched by run()
+  bool he_test_all_; // when true, run() executes every test in sequence
+};
+
+// Thread body: repeatedly reads 64-bit words from buf_ptr until the global
+// g_stop_thread flag is set; returns at once for a NULL buffer or len == 0.
+// NOTE(review): reads advance 8 bytes per step but wrap after len / CL steps,
+// so only the first len / 8 bytes are touched; confirm a CL stride was not meant.
+void he_cache_thread(uint8_t *buf_ptr, uint64_t len) {
+  uint64_t sink;
+  UNUSED_PARAM(sink);
+  const uint64_t num_lines = len / CL;
+
+  if (buf_ptr == NULL || len == 0) {
+    return;
+  }
+
+  uint64_t idx = 0;
+  while (g_stop_thread != true) {
+    if (idx < num_lines) {
+      // volatile access keeps the otherwise unused load from being elided
+      sink = *((volatile uint64_t *)(buf_ptr + idx * 8));
+    }
+    // advance and wrap to the start
+    ++idx;
+    if (idx >= num_lines) {
+      idx = 0;
+    }
+  }
+}
+
+} // end of namespace host_exerciser
diff --git a/samples/cxl_host_exerciser/cxl_he_cache_lpbk_cmd.h b/samples/cxl_host_exerciser/cxl_he_cache_lpbk_cmd.h
new file mode 100644
index 000000000000..82f2dcba91a0
--- /dev/null
+++ b/samples/cxl_host_exerciser/cxl_he_cache_lpbk_cmd.h
@@ -0,0 +1,81 @@
+// Copyright(c) 2023, Intel Corporation
+//
+// Redistribution  and  use  in source  and  binary  forms,  with  or  without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of  source code  must retain the  above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name  of Intel Corporation  nor the names of its contributors
+//   may be used to  endorse or promote  products derived  from this  software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,  BUT NOT LIMITED TO,  THE
+// IMPLIED WARRANTIES OF  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.  IN NO EVENT  SHALL THE COPYRIGHT OWNER  OR CONTRIBUTORS BE
+// LIABLE  FOR  ANY  DIRECT,  INDIRECT,  INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR
+// CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT  NOT LIMITED  TO,  PROCUREMENT  OF
+// SUBSTITUTE GOODS OR SERVICES;  LOSS OF USE,  DATA, OR PROFITS;  OR BUSINESS
+// INTERRUPTION)  HOWEVER CAUSED  AND ON ANY THEORY  OF LIABILITY,  WHETHER IN
+// CONTRACT,  STRICT LIABILITY,  OR TORT  (INCLUDING NEGLIGENCE  OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+#pragma once
+
+#include "cxl_host_exerciser.h"
+#include "he_cache_test.h"
+
+namespace host_exerciser {
+
+// "lpbk" command: in this version run() only resets the HE block.
+class he_cache_lpbk_cmd : public he_cmd {
+public:
+  he_cache_lpbk_cmd() {}
+  virtual ~he_cache_lpbk_cmd() {}
+  virtual const char *name() const override { return "lpbk"; }
+  virtual const char *description() const override {
+    return "run simple cxl he lpbk test";
+  }
+
+  virtual const char *afu_id() const override { return HE_CACHE_AFU_ID; }
+  virtual uint64_t featureid() const override { return MEM_TG_FEATURE_ID; }
+  virtual uint64_t guidl() const override { return MEM_TG_FEATURE_GUIDL; }
+  virtual uint64_t guidh() const override { return MEM_TG_FEATURE_GUIDH; }
+
+  // Registers --target (host|fpga, default "host"), stored in he_target_.
+  virtual void add_options(CLI::App *app) override {
+    app->add_option("--target", he_target_,
+                    "host exerciser run on host or fpga")
+        ->transform(CLI::CheckedTransformer(he_targets))
+        ->default_val("host");
+  }
+
+  // Resets the HE block; returns 0, or -1 if afu is not a host_exerciser.
+  virtual int run(test_afu *afu, CLI::App *app) {
+    (void)app;
+    cout << "HE LPBK run" << endl;
+    host_exe_ = dynamic_cast<host_exerciser *>(afu);
+    if (host_exe_ == nullptr) {
+      cerr << "afu is not a host_exerciser" << endl;
+      return -1;
+    }
+
+    if (!verify_numa_node()) {
+      numa_node_ = 0;
+      cout << "numa nodes are not available set numa node to 0" << endl;
+    }
+
+    // reset HE cache: write ResetL = 0, then ResetL = 1
+    he_ctl_.value = 0;
+    he_ctl_.ResetL = 0;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+    he_ctl_.value = 0;
+    he_ctl_.ResetL = 1;
+    host_exe_->write64(HE_CTL, he_ctl_.value);
+    return 0;
+  }
+};
+} // end of namespace host_exerciser
diff --git a/samples/cxl_host_exerciser/cxl_he_cmd.h b/samples/cxl_host_exerciser/cxl_he_cmd.h
new file mode 100644
index 000000000000..a5efe4b9f641
--- /dev/null
+++ b/samples/cxl_host_exerciser/cxl_he_cmd.h
@@ -0,0 +1,193 @@
+// Copyright(c) 2023, Intel Corporation
+//
+// Redistribution  and  use  in source  and  binary  forms,  with  or  without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of  source code  must retain the  above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name  of Intel Corporation  nor the names of its contributors
+//   may be used to  endorse or promote  products derived  from this  software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,  BUT NOT LIMITED TO,  THE
+// IMPLIED WARRANTIES OF  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.  IN NO EVENT  SHALL THE COPYRIGHT OWNER  OR CONTRIBUTORS BE
+// LIABLE  FOR  ANY  DIRECT,  INDIRECT,  INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR
+// CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT  NOT LIMITED  TO,  PROCUREMENT  OF
+// SUBSTITUTE GOODS OR SERVICES;  LOSS OF USE,  DATA, OR PROFITS;  OR BUSINESS
+// INTERRUPTION)  HOWEVER CAUSED  AND ON ANY THEORY  OF LIABILITY,  WHETHER IN
+// CONTRACT,  STRICT LIABILITY,  OR TORT  (INCLUDING NEGLIGENCE  OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+#pragma once
+#include <map>
+#include <numa.h>
+#include <unistd.h>
+
+#include "cxl_he_cmd.h"
+#include "cxl_host_exerciser.h"
+#include "he_cache_test.h"
+
+namespace host_exerciser {
+
+// Common base for HE commands: CSR shadow registers plus shared helpers.
+class he_cmd : public test_command {
+public:
+  he_cmd() : host_exe_(NULL), he_clock_mhz_(400), numa_node_(0), he_target_(0) {
+    he_ctl_.value = 0;
+    he_info_.value = 0;
+    he_rd_cfg_.value = 0;
+    he_wr_cfg_.value = 0;
+    rd_table_ctl_.value = 0;
+    wr_table_ctl_.value = 0;
+  }
+
+  virtual ~he_cmd() {}
+
+  // Convert a count of 64-byte lines and a tick count to bandwidth (GB/s)
+  double he_num_xfers_to_bw(uint64_t num_lines, uint64_t num_ticks) {
+    return (double)(num_lines * 64) / ((1000.0 / he_clock_mhz_ * num_ticks));
+  }
+
+  // Print the DSM status fields and, if ticks were counted, the bandwidth.
+  void he_perf_counters() {
+    if (host_exe_ == NULL)
+      return;
+    volatile he_cache_dsm_status *dsm_status =
+        reinterpret_cast<he_cache_dsm_status *>(
+            (uint8_t *)(host_exe_->get_dsm()));
+    if (!dsm_status)
+      return;
+
+    cout << "\n********* DSM Status CSR Start *********" << endl;
+    cout << "test completed :" << dsm_status->test_completed << endl;
+    cout << "dsm number:" << dsm_status->dsm_number << endl;
+    cout << "error vector:" << dsm_status->err_vector << endl;
+    cout << "num ticks:" << dsm_status->num_ticks << endl;
+    cout << "num reads:" << dsm_status->num_reads << endl;
+    cout << "num writes:" << dsm_status->num_writes << endl;
+    cout << "penalty start:" << dsm_status->penalty_start << endl;
+    cout << "penalty end:" << dsm_status->penalty_end << endl;
+    cout << "actual data:" << dsm_status->actual_data << endl;
+    cout << "expected data:" << dsm_status->expected_data << endl;
+    // print bandwidth
+    if (dsm_status->num_ticks > 0) {
+      double perf_data =
+          he_num_xfers_to_bw(dsm_status->num_reads + dsm_status->num_writes,
+                             dsm_status->num_ticks);
+      host_exe_->logger_->info("Bandwidth: {0:0.3f} GB/s", perf_data);
+    }
+    cout << "********* DSM Status CSR end *********" << endl;
+  }
+
+  // Print the HE error status CSR and the expected/actual data CSRs.
+  void host_exerciser_errors() {
+    he_err_status err_status;
+    uint64_t err = 0;
+    if (host_exe_ == NULL)
+      return;
+
+    err_status.value = host_exe_->read64(HE_ERROR_STATUS);
+    if (err_status.data_error == 1) {
+      cout << "Data Integrity Check error occured" << endl;
+    }
+    if (err_status.err_index > 0) {
+      cout << "Error occurred at cache line address:" << err_status.err_index
+           << endl;
+    }
+
+    err = host_exe_->read64(HE_ERROR_EXP_DATA);
+    cout << "Error Expected Data:" << err << endl;
+
+    err = host_exe_->read64(HE_ERROR_ACT_DATA0);
+    cout << "Error Actual Data0:" << err << endl;
+
+    err = host_exe_->read64(HE_ERROR_ACT_DATA1);
+    cout << "Error Actual Data1:" << err << endl;
+
+    err = host_exe_->read64(HE_ERROR_ACT_DATA2);
+    cout << "Error Actual Data2:" << err << endl;
+
+    err = host_exe_->read64(HE_ERROR_ACT_DATA3);
+    cout << "Error Actual Data3:" << err << endl;
+
+    err = host_exe_->read64(HE_ERROR_ACT_DATA4);
+    cout << "Error Actual Data4:" << err << endl;
+
+    err = host_exe_->read64(HE_ERROR_ACT_DATA5);
+    cout << "Error Actual Data5:" << err << endl;
+
+    err = host_exe_->read64(HE_ERROR_ACT_DATA6);
+    cout << "Error Actual Data6:" << err << endl;
+
+    err = host_exe_->read64(HE_ERROR_ACT_DATA7);
+    cout << "Error Actual Data7:" << err << endl;
+  }
+
+  // Returns -1 when no host_exerciser is attached, 0 otherwise.
+  int parse_input_options() {
+    if (!host_exe_)
+      return -1;
+
+    return 0;
+  }
+
+  // Poll bit 0 of the DSM until set; false on timeout or if get_dsm() is null.
+  bool he_wait_test_completion() {
+    uint32_t timeout = HELPBK_TEST_TIMEOUT;
+    cout << "Test started ......" << endl;
+    volatile uint8_t *status_ptr = host_exe_->get_dsm();
+    if (status_ptr == NULL)
+      return false;
+    while (0 == ((*status_ptr) & 0x1)) {
+      usleep(HELPBK_TEST_SLEEP_INVL);
+      if (--timeout == 0) {
+        cout << "HE Cache time out error" << endl;
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Select numa_node_ from he_target_; false if the NUMA API is unavailable.
+  bool verify_numa_node() {
+    if (numa_available() < 0) {
+      cerr << "System does not support NUMA API" << endl;
+      return false;
+    }
+
+    int n = numa_max_node();
+    cout << "Number of nodes on your system:" << n + 1 << endl;
+
+    int numa_node = numa_node_of_cpu(sched_getcpu());
+    cout << "HE Cache app numa node:" << numa_node << endl;
+
+    if (he_target_ == HE_TARGET_HOST) {
+      numa_node_ = numa_node;
+      cout << "HE_TARGET_HOST numa node:" << numa_node_ << endl;
+    } else {
+      // TODO: discover the FPGA NUMA node; hard-coded to 2 for now
+      numa_node_ = 2;
+      cout << "HE_TARGET_FPGA numa node:" << numa_node_ << endl;
+    }
+    return true;
+  }
+
+protected:
+  host_exerciser *host_exe_;
+  uint32_t he_clock_mhz_;
+  uint32_t numa_node_;
+  uint32_t he_target_;
+
+  he_ctl he_ctl_;
+  he_info he_info_;
+  he_rd_config he_rd_cfg_;
+  he_wr_config he_wr_cfg_;
+  he_rd_addr_table_ctrl rd_table_ctl_;
+  he_wr_addr_table_ctrl wr_table_ctl_;
+};
+} // end of namespace host_exerciser
diff --git a/samples/cxl_host_exerciser/cxl_host_exerciser.cpp b/samples/cxl_host_exerciser/cxl_host_exerciser.cpp
new file mode 100644
index 000000000000..8fe4eecfad02
--- /dev/null
+++ b/samples/cxl_host_exerciser/cxl_host_exerciser.cpp
@@ -0,0 +1,50 @@
+// Copyright(c) 2023, Intel Corporation
+//
+// Redistribution  and  use  in source  and  binary  forms,  with  or  without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of  source code  must retain the  above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name  of Intel Corporation  nor the names of its contributors
+//   may be used to  endorse or promote  products derived  from this  software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,  BUT NOT LIMITED TO,  THE
+// IMPLIED WARRANTIES OF  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.  IN NO EVENT  SHALL THE COPYRIGHT OWNER  OR CONTRIBUTORS BE
+// LIABLE  FOR  ANY  DIRECT,  INDIRECT,  INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR
+// CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT  NOT LIMITED  TO,  PROCUREMENT  OF
+// SUBSTITUTE GOODS OR SERVICES;  LOSS OF USE,  DATA, OR PROFITS;  OR BUSINESS
+// INTERRUPTION)  HOWEVER CAUSED  AND ON ANY THEORY  OF LIABILITY,  WHETHER IN
+// CONTRACT,  STRICT LIABILITY,  OR TORT  (INCLUDING NEGLIGENCE  OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+#include <CLI/CLI.hpp>
+#include <iostream>
+#include <signal.h>
+
+#include "cxl_he_cache_cmd.h"
+#include "cxl_he_cache_lpbk_cmd.h"
+#include "cxl_host_exerciser.h"
+
+void he_sig_handler(int);
+
+// Registers the cache and lpbk commands, installs the SIGINT handler and
+// hands control to the host_exerciser application.
+int main(int argc, char *argv[]) {
+  host_exerciser::host_exerciser app;
+  app.register_command<host_exerciser::he_cache_cmd>();
+  app.register_command<host_exerciser::he_cache_lpbk_cmd>();
+
+  // host exerciser signal handler
+  struct sigaction sigint_action;
+  memset(&sigint_action, 0, sizeof(sigint_action));
+  sigint_action.sa_handler = he_sig_handler;
+  sigaction(SIGINT, &sigint_action, NULL);
+
+  return app.main(argc, argv);
+}
diff --git a/samples/cxl_host_exerciser/cxl_host_exerciser.h b/samples/cxl_host_exerciser/cxl_host_exerciser.h
new file mode 100644
index 000000000000..917e59f798a3
--- /dev/null
+++ b/samples/cxl_host_exerciser/cxl_host_exerciser.h
@@ -0,0 +1,412 @@
+// Copyright(c) 2023, Intel Corporation
+//
+// Redistribution  and  use  in source  and  binary  forms,  with  or  without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of  source code  must retain the  above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name  of Intel Corporation  nor the names of its contributors
+//   may be used to  endorse or promote  products derived  from this  software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,  BUT NOT LIMITED TO,  THE
+// IMPLIED WARRANTIES OF  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.  IN NO EVENT  SHALL THE COPYRIGHT OWNER  OR CONTRIBUTORS BE
+// LIABLE  FOR  ANY  DIRECT,  INDIRECT,  INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR
+// CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT  NOT LIMITED  TO,  PROCUREMENT  OF
+// SUBSTITUTE GOODS OR SERVICES;  LOSS OF USE,  DATA, OR PROFITS;  OR BUSINESS
+// INTERRUPTION)  HOWEVER CAUSED  AND ON ANY THEORY  OF LIABILITY,  WHETHER IN
+// CONTRACT,  STRICT LIABILITY,  OR TORT  (INCLUDING NEGLIGENCE  OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+#pragma once
+
+#include "he_cache_test.h"
+
+#define MEM_TG_FEATURE_ID 0x25
+#define MEM_TG_FEATURE_GUIDL 0x81599b5c2ebd4b23 // low 64 bits of the AFU ID
+#define MEM_TG_FEATURE_GUIDH 0x0118e06b1fa349b9 // high 64 bits of the AFU ID
+// const pointer: internal linkage, safe to include from several source files
+const char *const HE_CACHE_AFU_ID = "0118E06B-1FA3-49B9-8159-9b5C2EBD4b23";
+namespace host_exerciser {
+
+static const uint64_t HELPBK_TEST_TIMEOUT = 30000; // max completion polls
+static const uint64_t HELPBK_TEST_SLEEP_INVL = 100; // usleep() between polls
+static const uint64_t CL = 64; // bytes per cache line
+static const uint64_t KB = 1024;
+static const uint64_t MB = KB * 1024;
+static const uint64_t BUFFER_SIZE_2MB = 2 * MB;
+static const uint64_t FPGA_32KB_CACHE_LINES = (32 * KB) / CL;
+static const uint64_t FPGA_2MB_CACHE_LINES = BUFFER_SIZE_2MB / CL;
+
+// Host exerciser CSR offsets (consecutive registers are 8 apart)
+enum {
+  HE_DFH = 0x000,
+  HE_ID_L = 0x008,
+  HE_ID_H = 0x010,
+  HE_DFH_RSVD0 = 0x018,
+  HE_DFH_RSVD1 = 0x020,
+  HE_SCRATCHPAD0 = 0x028,
+  HE_DSM_BASE = 0x030,
+  HE_CTL = 0x038,
+  HE_INFO = 0x040,
+  HE_WR_NUM_LINES = 0x048,
+  HE_WR_BYTE_ENABLE = 0x050,
+  HE_WR_CONFIG = 0x058,
+  HE_WR_ADDR_TABLE_CTRL = 0x060,
+  HE_WR_ADDR_TABLE_DATA = 0x068,
+  HE_RD_NUM_LINES = 0x070,
+  HE_RD_CONFIG = 0x078,
+  HE_RD_ADDR_TABLE_CTRL = 0x080,
+  HE_RD_ADDR_TABLE_DATA = 0x088,
+  HE_ERROR_STATUS = 0x090,
+  HE_ERROR_EXP_DATA = 0x098,
+  HE_ERROR_ACT_DATA0 = 0x0A0,
+  HE_ERROR_ACT_DATA1 = 0x0A8,
+  HE_ERROR_ACT_DATA2 = 0x0B0,
+  HE_ERROR_ACT_DATA3 = 0x0B8,
+  HE_ERROR_ACT_DATA4 = 0x0C0,
+  HE_ERROR_ACT_DATA5 = 0x0C8,
+  HE_ERROR_ACT_DATA6 = 0x0D0,
+  HE_ERROR_ACT_DATA7 = 0x0D8,
+};
+
+// Read traffic opcodes (values assigned to he_rd_config::opcode)
+enum he_rd_opcode {
+  RD_LINE_I = 0x0,
+  RD_LINE_S = 0x1,
+  RD_LINE_EM = 0x2,
+};
+
+// Write traffic opcodes (values assigned to he_wr_config::opcode)
+enum he_wr_opcode {
+  WR_LINE_I = 0x0,
+  WR_LINE_M = 0x1,
+  WR_PUSH_I = 0x2,
+  WR_BARRIER_FRNCE = 0x3,
+  WR_FLUSH_CL = 0x4,
+  WR_FLUSH_CL_HCOH = 0x5,
+  WR_FLUSH_CL_DCOH = 0x6,
+};
+
+// DFH header CSR image at offset HE_DFH; bit-field widths total 64 bits
+union he_dfh {
+  enum { offset = HE_DFH };
+  uint64_t value; // raw 64-bit register word
+  struct {
+    uint64_t CcipVersionNumber : 12;
+    uint64_t AfuMajVersion : 4;
+    uint64_t NextDfhOffset : 24;
+    uint64_t EOL : 1;
+    uint64_t Reserved : 19;
+    uint64_t FeatureType : 4;
+  };
+};
+
+// DSM base CSR image at offset HE_DSM_BASE; one 64-bit field
+union he_dsm_base {
+  enum { offset = HE_DSM_BASE };
+  uint64_t value; // raw 64-bit register word
+  struct {
+    uint64_t DsmBase : 64;
+  };
+};
+
+// Control CSR image at offset HE_CTL; bit-field widths total 64 bits
+union he_ctl {
+  enum { offset = HE_CTL };
+  uint64_t value; // raw 64-bit register word
+  struct {
+    uint64_t ResetL : 1; // commands write 0 then 1 to reset the HE block
+    uint64_t Start : 1;  // tests write 1 then 0 to start a run
+    uint64_t ForcedTestCmpl : 1;
+    uint64_t bias_support : 1;
+    uint64_t Reserved : 60;
+  };
+};
+
+// Info CSR image at offset HE_INFO; bit-field widths total 64 bits
+union he_info {
+  enum { offset = HE_INFO };
+  uint64_t value; // raw 64-bit register word
+  struct {
+    uint64_t write_addr_table_size : 4; // presumably a he_addrtable_size code
+    uint64_t read_addr_table_size : 4;  // presumably a he_addrtable_size code
+    uint64_t Reserved : 56;
+  };
+};
+
+// Write line-count CSR image at offset HE_WR_NUM_LINES; widths total 64 bits
+union he_wr_num_lines {
+  enum { offset = HE_WR_NUM_LINES };
+  uint64_t value; // raw 64-bit register word
+  struct {
+    uint64_t write_num_lines : 16;
+    uint64_t reserved : 48;
+  };
+};
+
+// Write byte-enable CSR image at offset HE_WR_BYTE_ENABLE; one 64-bit field
+union he_wr_byte_enable {
+  enum { offset = HE_WR_BYTE_ENABLE };
+  uint64_t value; // raw 64-bit register word
+  struct {
+    uint64_t write_byte_enable : 64;
+  };
+};
+
+// Write config CSR image at offset HE_WR_CONFIG; bit-field widths total 64 bits
+union he_wr_config {
+  enum { offset = HE_WR_CONFIG };
+  uint64_t value; // raw 64-bit register word
+  struct {
+    uint64_t write_traffic_enable : 1;
+    uint64_t continuous_mode_enable : 1;
+    uint64_t waitfor_completion : 1;
+    uint64_t preread_sync_enable : 1;
+    uint64_t postread_sync_enable : 1;
+    uint64_t data_pattern : 2;
+    uint64_t cl_evict_enable : 1;
+    uint64_t opcode : 4; // assigned he_wr_opcode values (e.g. WR_PUSH_I)
+    uint64_t line_repeat_count : 8; // set from the command's repeat count
+    uint64_t reserved : 44;
+  };
+};
+
+// Write address-table control CSR image at offset HE_WR_ADDR_TABLE_CTRL
+union he_wr_addr_table_ctrl {
+  enum { offset = HE_WR_ADDR_TABLE_CTRL };
+  uint64_t value; // raw 64-bit register word
+  struct {
+    uint64_t enable_address_table : 1;
+    uint64_t enable_address_stride : 1; // set to 1 by the write miss test
+    uint64_t stride : 2;
+    uint64_t reserved : 60;
+  };
+};
+
+// Write address-table data CSR image at offset HE_WR_ADDR_TABLE_DATA
+union he_wr_addr_table_data {
+  enum { offset = HE_WR_ADDR_TABLE_DATA };
+  uint64_t value; // raw 64-bit register word
+  struct {
+    uint64_t address_table_value : 64;
+  };
+};
+
+// Read line-count CSR image at offset HE_RD_NUM_LINES; widths total 64 bits
+union he_rd_num_lines {
+  enum { offset = HE_RD_NUM_LINES };
+  uint64_t value; // raw 64-bit register word
+  struct {
+    uint64_t read_num_lines : 16;
+    uint64_t reserved : 48;
+  };
+};
+
+// Read config CSR image at offset HE_RD_CONFIG; bit-field widths total 64 bits
+union he_rd_config {
+  enum { offset = HE_RD_CONFIG };
+  uint64_t value; // raw 64-bit register word
+  struct {
+    uint64_t read_traffic_enable : 1;
+    uint64_t continuous_mode_Enable : 1;
+    uint64_t waitfor_completion : 1;
+    uint64_t prewrite_sync_enable : 1;
+    uint64_t postwrite_sync_enable : 1;
+    uint64_t data_pattern : 2;
+    uint64_t cl_evict_enable : 1;
+    uint64_t opcode : 4; // assigned he_rd_opcode values (e.g. RD_LINE_I)
+    uint64_t line_repeat_count : 8; // set from the command's repeat count
+    uint64_t reserved : 44;
+  };
+};
+
+// Read address-table control CSR image at offset HE_RD_ADDR_TABLE_CTRL
+union he_rd_addr_table_ctrl {
+  enum { offset = HE_RD_ADDR_TABLE_CTRL };
+  uint64_t value; // raw 64-bit register word
+  struct {
+    uint64_t enable_address_table : 1;
+    uint64_t enable_address_stride : 1; // set to 1 by the read miss test
+    uint64_t stride : 2;
+    uint64_t reserved : 60;
+  };
+};
+
+// Read address-table data CSR image at offset HE_RD_ADDR_TABLE_DATA
+union he_rd_addr_table_data {
+  enum { offset = HE_RD_ADDR_TABLE_DATA };
+  uint64_t value; // raw 64-bit register word
+  struct {
+    uint64_t address_table_value : 64;
+  };
+};
+
+// Error status CSR image at offset HE_ERROR_STATUS; widths total 64 bits
+union he_err_status {
+  enum { offset = HE_ERROR_STATUS };
+  uint64_t value; // raw 64-bit register word
+  struct {
+    uint64_t data_error : 1; // 1 is reported as a data integrity error
+    uint64_t rsvd1 : 15;
+    uint64_t err_index : 16; // reported as the failing cache line address
+    uint64_t rsvd2 : 32;
+  };
+};
+
+// Layout the commands overlay on the DSM buffer to print test status
+struct he_cache_dsm_status {
+  uint32_t test_completed : 1; // presumably the bit polled for completion
+  uint32_t dsm_number : 15;
+  uint32_t res1 : 16;
+  uint32_t err_vector : 32;
+  uint64_t num_ticks : 64; // tick count used for the bandwidth figure
+  uint32_t num_reads : 32; // added to num_writes for the bandwidth figure
+  uint32_t num_writes : 32;
+  uint32_t penalty_start : 32;
+  uint32_t penalty_end : 32;
+  uint32_t actual_data : 32;
+  uint32_t expected_data : 32;
+  uint32_t res5[2];
+};
+
+// Test selector values (see the he_test_modes name table)
+enum he_test_mode {
+  // FPGA cache hit
+  HE_FPGA_RD_CACHE_HIT = 0x0,
+  HE_FPGA_WR_CACHE_HIT = 0x1,
+  // FPGA cache miss
+  HE_FPGA_RD_CACHE_MISS = 0x2,
+  HE_FPGA_WR_CACHE_MISS = 0x3,
+  // host cache hit
+  HE_HOST_RD_CACHE_HIT = 0x4,
+  HE_HOST_WR_CACHE_HIT = 0x5,
+  // host cache miss
+  HE_HOST_RD_CACHE_MISS = 0x6,
+  HE_HOST_WR_CACHE_MISS = 0x7,
+};
+
+// Target selector values (see the he_targets name table)
+enum he_target {
+  HE_TARGET_HOST = 0x0,
+  HE_TARGET_FPGA = 0x1,
+};
+
+const std::map<std::string, uint32_t> he_test_modes = { // name -> he_test_mode
+    {"fpgardcachehit", HE_FPGA_RD_CACHE_HIT},
+    {"fpgawrcachehit", HE_FPGA_WR_CACHE_HIT},
+    {"fpgardcachemiss", HE_FPGA_RD_CACHE_MISS},
+    {"fpgawrcachemiss", HE_FPGA_WR_CACHE_MISS},
+    {"hostrdcachehit", HE_HOST_RD_CACHE_HIT},
+    {"hostwrcachehit", HE_HOST_WR_CACHE_HIT},
+    {"hostrdcachemiss", HE_HOST_RD_CACHE_MISS},
+    {"hostwrcachemiss", HE_HOST_WR_CACHE_MISS},
+};
+
+const std::map<std::string, uint32_t> he_targets = { // --target name -> he_target
+    {"host", HE_TARGET_HOST},
+    {"fpga", HE_TARGET_FPGA},
+};
+
+// Bias support values, presumably for he_ctl::bias_support (TODO confirm).
+// NOTE(review): names spell BIOS/BIOA; presumably "bias" was intended.
+enum he_ctl_bios_support {
+  HOST_BIOS = 0x0,
+  DEVIC_BIOA = 0x1,
+};
+
+// Address table size encodings: each value is log2 of the entry count named
+// in the enumerator (0x1 -> 2 entries ... 0xC -> 4096 entries).
+enum he_addrtable_size {
+  HE_ADDRTABLE_SIZE2 = 0x1,
+  HE_ADDRTABLE_SIZE4 = 0x2,
+  HE_ADDRTABLE_SIZE8 = 0x3,
+  HE_ADDRTABLE_SIZE16 = 0x4,
+  HE_ADDRTABLE_SIZE32 = 0x5,
+  HE_ADDRTABLE_SIZE64 = 0x6,
+  HE_ADDRTABLE_SIZE128 = 0x7,
+  HE_ADDRTABLE_SIZE256 = 0x8,
+  HE_ADDRTABLE_SIZE512 = 0x9,
+  HE_ADDRTABLE_SIZE1024 = 0xA,
+  HE_ADDRTABLE_SIZE2048 = 0xB,
+  HE_ADDRTABLE_SIZE4096 = 0xC,
+};
+
+// Traffic stage selector values (see the traffic_enable name table)
+enum he_traffic_enable {
+  HE_ENABLE_TRAFFIC_STAGE = 0x0,
+  HE_SIP_SEQ_STAGE = 0x1,
+};
+
+const std::map<std::string, uint32_t> traffic_enable = {
+    {"enable", HE_ENABLE_TRAFFIC_STAGE},
+    {"skip", HE_SIP_SEQ_STAGE},
+    // NOTE(review): "skip" maps to HE_SIP_SEQ_STAGE; "SIP" looks like a typo
+};
+
+std::map<uint32_t, uint32_t> addrtable_size = {
+    {HE_ADDRTABLE_SIZE4096, 4096}, {HE_ADDRTABLE_SIZE2048, 2048},
+    {HE_ADDRTABLE_SIZE1024, 1024}, {HE_ADDRTABLE_SIZE512, 512},
+    {HE_ADDRTABLE_SIZE256, 256},   {HE_ADDRTABLE_SIZE128, 128},
+    {HE_ADDRTABLE_SIZE64, 64},     {HE_ADDRTABLE_SIZE32, 32},
+    {HE_ADDRTABLE_SIZE16, 16},     {HE_ADDRTABLE_SIZE8, 8},
+    {HE_ADDRTABLE_SIZE4, 4},       {HE_ADDRTABLE_SIZE2, 2},
+    // maps each he_addrtable_size encoding to its entry count
+};
+
+using test_afu = opae::afu_test::afu; // base class of host_exerciser
+using test_command = opae::afu_test::command; // base class of he_cmd
+
+// AFU wrapper that runs the selected test command count_ times.
+class host_exerciser : public test_afu {
+public:
+  host_exerciser()
+      : test_afu("host_exerciser", nullptr, "info"), count_(1) {}
+
+  // Runs `test` up to count_ times (stops on failure); returns the last code.
+  virtual int run(CLI::App *app, test_command::ptr_t test) override {
+    int res = exit_codes::not_run;
+
+    logger_->set_pattern("    %v");
+    // Info prints details of an individual run. Turn it on if doing only one
+    // test and the user hasn't changed level from the default.
+    if ((log_level_.compare("warning") == 0))
+      logger_->set_level(spdlog::level::info);
+
+    logger_->info("starting test run, count of {0:d}", count_);
+    uint32_t count = 0;
+    try {
+      while (count < count_) {
+        logger_->debug("starting iteration: {0:d}", count + 1);
+        res = test_afu::run(app, test);
+        count++;
+        logger_->debug("end iteration: {0:d}", count);
+        if (res)
+          break;
+      }
+    } catch (const std::exception &ex) {
+      // pass what() as an argument so braces in it are not parsed as a format
+      logger_->error("{}", ex.what());
+      res = exit_codes::exception;
+    }
+
+    auto pass = res == exit_codes::success ? "PASS" : "FAIL";
+    logger_->info("Test {}({}): {}", test->name(), count, pass);
+    spdlog::drop_all();
+    return res;
+  }
+
+public:
+  uint32_t count_;
+
+  // True when app_.count() reports the named option at least once.
+  bool option_passed(std::string option_str) {
+    return app_.count(option_str) != 0;
+  }
+};
+} // namespace host_exerciser
diff --git a/samples/cxl_host_exerciser/he_cache_test.h b/samples/cxl_host_exerciser/he_cache_test.h
new file mode 100644
index 000000000000..900e56bf8f7c
--- /dev/null
+++ b/samples/cxl_host_exerciser/he_cache_test.h
@@ -0,0 +1,856 @@
+// Copyright(c) 2023, Intel Corporation
+//
+// Redistribution  and  use  in source  and  binary  forms,  with  or  without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of  source code  must retain the  above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name  of Intel Corporation  nor the names of its contributors
+//   may be used to  endorse or promote  products derived  from this  software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,  BUT NOT LIMITED TO,  THE
+// IMPLIED WARRANTIES OF  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.  IN NO EVENT  SHALL THE COPYRIGHT OWNER  OR CONTRIBUTORS BE
+// LIABLE  FOR  ANY  DIRECT,  INDIRECT,  INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR
+// CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT  NOT LIMITED  TO,  PROCUREMENT  OF
+// SUBSTITUTE GOODS OR SERVICES;  LOSS OF USE,  DATA, OR PROFITS;  OR BUSINESS
+// INTERRUPTION)  HOWEVER CAUSED  AND ON ANY THEORY  OF LIABILITY,  WHETHER IN
+// CONTRACT,  STRICT LIABILITY,  OR TORT  (INCLUDING NEGLIGENCE  OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+#pragma once
+
+#include <fcntl.h>
+#include <glob.h>
+#include <inttypes.h>
+#include <numa.h>
+#include <regex.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <atomic>
+#include <cerrno>
+#include <chrono>
+#include <exception>
+#include <fstream>
+#include <future>
+#include <initializer_list>
+#include <iostream>
+#include <limits>
+#include <map>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <CLI/CLI.hpp>
+#include <spdlog/spdlog.h>
+#include <spdlog/sinks/stdout_color_sinks.h>
+#include <spdlog/sinks/basic_file_sink.h>
+#include <opae/cxx/core.h>
+
+#include "fpga-dfl.h"
+
+using namespace std;
+
+// POSIX extended regular expression for a PCIe address written as
+// "[<domain>:]<bus>:<device>.<function>". Capture groups: 2 = domain
+// (optional), 3 = bus, 4 = device, 5 = function.
+// The pointer itself is const so that this header-defined variable has
+// internal linkage and the header can be included from several translation
+// units without a multiple-definition link error.
+const char *const sbdf_pattern =
+    "(([0-9a-fA-F]{4}):)?([0-9a-fA-F]{2}):([0-9a-fA-F]{2})\\.([0-9])";
+
+// Number of regmatch_t slots handed to regexec() when matching sbdf_pattern.
+enum { MATCHES_SIZE = 6 };
+// printf-style glob pattern for the feature_id sysfs attribute of the DFL
+// devices below a PCI device; %s is replaced by a PCIe address or wildcard.
+#define FEATURE_DEV                                                            \
+  "/sys/bus/pci/devices/%s/"                                                   \
+  "fpga_region/region*/dfl-fme*/dfl_dev*/feature_id"
+
+// Size of the character buffer that receives the formatted FEATURE_DEV path.
+#define MAX_SIZE 256
+
+// Page protection used for every mapping created by buffer_allocate().
+#define PROTECTION (PROT_READ | PROT_WRITE)
+
+// Huge-page mmap flags; defined here only when the system headers did not
+// already define them.
+#ifndef MAP_HUGETLB
+#define MAP_HUGETLB 0x40000
+#endif
+#ifndef MAP_HUGE_SHIFT
+#define MAP_HUGE_SHIFT 26
+#endif
+
+// Huge-page size selectors: log2 of the page size shifted into the mmap flags.
+#define MAP_2M_HUGEPAGE (0x15 << MAP_HUGE_SHIFT) /* 2 ^ 0x15 = 2M */
+#define MAP_1G_HUGEPAGE (0x1e << MAP_HUGE_SHIFT) /* 2 ^ 0x1e = 1G */
+
+// ADDR is the address argument passed to mmap() and FLAGS_4K/2M/1G are the
+// flag sets for regular, 2 MiB and 1 GiB pages. ia64 builds map at a fixed
+// address (MAP_FIXED); every other target passes a null address.
+#ifdef __ia64__
+#define ADDR ((void *)(0x8000000000000000UL))
+#define FLAGS_4K (MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED)
+#define FLAGS_2M (FLAGS_4K | MAP_2M_HUGEPAGE | MAP_HUGETLB)
+#define FLAGS_1G (FLAGS_4K | MAP_1G_HUGEPAGE | MAP_HUGETLB)
+#else
+#define ADDR ((void *)(0x0UL))
+#define FLAGS_4K (MAP_PRIVATE | MAP_ANONYMOUS)
+#define FLAGS_2M (FLAGS_4K | MAP_2M_HUGEPAGE | MAP_HUGETLB)
+#define FLAGS_1G (FLAGS_4K | MAP_1G_HUGEPAGE | MAP_HUGETLB)
+#endif
+
+// Byte-count helpers. The argument is widened to 64 bits before multiplying
+// so that values such as GiB(2) do not overflow a 32-bit int.
+#define KiB(x) ((uint64_t)(x) * 1024)
+#define MiB(x) ((uint64_t)(x) * 1024 * 1024)
+#define GiB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)
+
+// Byte offsets from the mapped MMIO base. They are passed in csr_array of
+// the buffer map/unmap ioctls and read back for debug logging.
+#define DFL_CXL_CACHE_DSM_BASE 0x030
+#define DFL_CXL_CACHE_WR_ADDR_TABLE_DATA 0x068
+#define DFL_CXL_CACHE_RD_ADDR_TABLE_DATA 0x088
+
+
+/**
+ * Create an anonymous private mapping of `len` bytes and bind it to one NUMA
+ * node with the mbind system call.
+ *
+ * The page size follows `len`: more than 2 MiB requests 1 GiB huge pages,
+ * more than 4 KiB requests 2 MiB huge pages, anything else regular pages.
+ *
+ * @param addr      receives the mapping address on success; left untouched
+ *                  on failure
+ * @param len       mapping length in bytes
+ * @param numa_node node to bind to; must fit the 4-word node mask used here
+ * @return true on success, false (with a message on stderr) otherwise
+ */
+inline bool buffer_allocate(void** addr, uint64_t len, uint32_t numa_node)
+{
+    // Numeric values of MPOL_BIND and MPOL_MF_STRICT; the raw syscall is
+    // used, so the libnuma headers that name them are not required here.
+    const long mpol_bind      = 2;
+    const long mpol_mf_strict = 1;
+
+    const unsigned int bits_per_UL = sizeof(unsigned long) * 8;
+    unsigned long mask[4]          = {0, 0, 0, 0};
+    const unsigned int max_nodes   = 4 * bits_per_UL;
+
+    if (addr == NULL) {
+        std::cerr << "buffer_allocate(): null output pointer" << std::endl;
+        return false;
+    }
+
+    // Indexing mask[] with an unchecked node number would write out of bounds.
+    if (numa_node >= max_nodes) {
+        std::cerr << "buffer_allocate(): numa node " << numa_node
+                  << " out of range" << std::endl;
+        return false;
+    }
+    mask[numa_node / bits_per_UL] |= 1UL << (numa_node % bits_per_UL);
+
+    const bool use_1g = len > MiB(2);
+    const bool use_2m = !use_1g && len > KiB(4);
+
+    int flags = FLAGS_4K;
+    if (use_1g)
+        flags = FLAGS_1G;
+    else if (use_2m)
+        flags = FLAGS_2M;
+
+    // fd is -1 as recommended for MAP_ANONYMOUS mappings.
+    void* addr_local = mmap(ADDR, len, PROTECTION, flags, -1, 0);
+
+    if (addr_local == MAP_FAILED) {
+        const int err = errno; // stream output below may clobber errno
+        if (err == ENOMEM) {
+            if (use_1g)
+                std::cerr << "Could not allocate buffer (no free 1 "
+                             "GiB huge pages)" << std::endl;
+            else if (use_2m)
+                std::cerr << "Could not allocate buffer (no free 2 "
+                             "MiB huge pages)" << std::endl;
+            else
+                std::cerr << "Could not allocate buffer (out of "
+                             "memory)" << std::endl;
+            return false;
+        }
+        std::cerr << "CXL cache mmap failed:" << strerror(err) << std::endl;
+        return false;
+    }
+
+    if (addr_local == NULL) {
+        std::cerr << "Unable to mmap" << std::endl;
+        return false;
+    }
+
+    // maxnode is numa_node + 2, as in the original call. Every argument is
+    // widened explicitly because syscall() is variadic.
+    const long status = syscall(__NR_mbind, addr_local, (unsigned long)len,
+                                mpol_bind, &mask[0],
+                                (unsigned long)numa_node + 2, mpol_mf_strict);
+    if (status != 0) {
+        const int err = errno;
+        std::cerr << "buffer_allocate(): unable to mbind:"
+                  << strerror(err) << std::endl;
+        // Do not leak the mapping when the binding step fails.
+        munmap(addr_local, len);
+        return false;
+    }
+
+    *addr = addr_local;
+    return true;
+}
+
+/**
+ * Unmap a buffer obtained from buffer_allocate().
+ *
+ * @param addr start of the mapping
+ * @param len  length that was mapped, in bytes
+ * @return true when munmap() succeeded, false (with the errno text on
+ *         stderr) otherwise
+ */
+inline bool buffer_release(void* addr, uint64_t len)
+{
+    if (munmap(addr, len) != 0) {
+        // The original used a comma instead of <<, which dropped the errno
+        // text from the message.
+        std::cerr << "CXL cache unmap failed:" << strerror(errno) << std::endl;
+        return false;
+    }
+    return true;
+}
+
+/**
+ * Read the first line of a file and parse it as a hexadecimal number.
+ *
+ * @param path  file to read
+ * @param value receives the parsed number; left untouched on failure
+ * @return true on success; false when an argument is null, the file cannot
+ *         be opened, it is empty, or the line is not a valid hex number
+ */
+inline bool sysfs_read_u64(const char *path, uint64_t *value) {
+  if (path == nullptr || value == nullptr)
+    return false;
+
+  std::ifstream fs(path);
+  if (!fs.is_open())
+    return false;
+
+  std::string line;
+  if (!std::getline(fs, line))
+    return false;
+
+  try {
+    // stoull keeps all 64 bits even where unsigned long is 32 bits wide.
+    *value = std::stoull(line, nullptr, 16);
+  } catch (const std::exception &) {
+    // invalid_argument / out_of_range: report failure instead of throwing
+    return false;
+  }
+  return true;
+}
+
+namespace opae {
+namespace afu_test {
+
+
+/**
+ * Convert the text of one regex sub-match to an unsigned integer.
+ *
+ * Only the characters inside the match are parsed, and the whole match must
+ * be a number in `radix` that fits into T.
+ *
+ * @param s     string that was passed to regexec()
+ * @param m     sub-match to convert; rm_so/rm_eo of -1 mean "not matched"
+ * @param v     receives the value; left untouched on failure
+ * @param radix numeric base handed to strtoul()
+ * @return true when a value was stored in `v`
+ */
+template <typename T>
+inline bool parse_match_int(const char *s, regmatch_t m, T &v, int radix = 10) {
+  if (s == nullptr || m.rm_so == -1 || m.rm_eo == -1 || m.rm_eo <= m.rm_so)
+    return false;
+
+  // Copy the match so that strtoul cannot read past its end.
+  const std::string text(s + m.rm_so, static_cast<size_t>(m.rm_eo - m.rm_so));
+
+  char *end = nullptr;
+  errno = 0;
+  const unsigned long parsed = std::strtoul(text.c_str(), &end, radix);
+  if (errno != 0 || end == text.c_str() || *end != '\0')
+    return false;
+
+  // Reject values that would be silently truncated by the narrowing store.
+  if (parsed > static_cast<unsigned long>(std::numeric_limits<T>::max()))
+    return false;
+
+  v = static_cast<T>(parsed);
+  return true;
+}
+
+/**
+ * PCIe address, accessible either as individual bit-fields or as one 32-bit
+ * value.
+ */
+union pcie_address {
+  struct {
+    uint32_t function : 3;
+    uint32_t device : 5;
+    uint32_t bus : 8;
+    uint32_t domain : 16;
+  } fields;
+  uint32_t value;
+
+  /**
+   * Parse "[<domain>:]<bus>:<device>.<function>" as described by
+   * sbdf_pattern. A missing domain is taken as 0.
+   *
+   * @param s text to parse
+   * @return the parsed address
+   * @throws std::runtime_error when the pattern cannot be compiled, `s` does
+   *         not match it, or a component cannot be converted
+   */
+  static pcie_address parse(const char *s) {
+    if (s == nullptr)
+      throw std::runtime_error("pcie address not valid format");
+
+    regex_t re;
+    if (regcomp(&re, sbdf_pattern, REG_EXTENDED | REG_ICASE))
+      throw std::runtime_error("could not compile regex");
+
+    // regfree() may only be used on a successfully compiled pattern, so the
+    // guard is created after regcomp() succeeded. It then releases the
+    // pattern on every exit path, including the throws below.
+    struct regex_guard {
+      regex_t *compiled;
+      ~regex_guard() { regfree(compiled); }
+    } guard{&re};
+
+    regmatch_t matches[MATCHES_SIZE];
+    if (regexec(&re, s, MATCHES_SIZE, matches, 0))
+      throw std::runtime_error("pcie address not valid format");
+
+    uint16_t domain = 0, bus = 0, device = 0, function = 0;
+    if (!parse_match_int(s, matches[2], domain, 16))
+      domain = 0; // the domain part is optional
+    if (!parse_match_int(s, matches[3], bus, 16))
+      throw std::runtime_error("error parsing pcie address");
+    if (!parse_match_int(s, matches[4], device, 16))
+      throw std::runtime_error("error parsing pcie address");
+    if (!parse_match_int(s, matches[5], function))
+      throw std::runtime_error("error parsing pcie address");
+
+    pcie_address a;
+    a.fields.domain = domain;
+    a.fields.bus = bus;
+    a.fields.device = device;
+    a.fields.function = function;
+    return a;
+  }
+};
+
+class afu; // defined further down; only the name is needed here
+
+/**
+ * Interface of one test sub-command.
+ *
+ * Implementations supply a name, a description, the identifiers of the
+ * feature they target and the run() body. The running flag starts out true
+ * and is cleared by stop().
+ */
+class command {
+public:
+  using ptr_t = std::shared_ptr<command>;
+
+  command() : running_{true} {}
+  virtual ~command() = default;
+
+  // Sub-command name and help text.
+  virtual const char *name() const = 0;
+  virtual const char *description() const = 0;
+
+  // Execute the command against `afu`; `app` is the parsed sub-command.
+  virtual int run(afu *afu, CLI::App *app) = 0;
+
+  // Hook for registering command-specific options; does nothing by default.
+  virtual void add_options(CLI::App *) {}
+
+  // AFU identifier of the command; nullptr unless overridden.
+  virtual const char *afu_id() const { return nullptr; }
+
+  // Identifiers supplied by the concrete command.
+  virtual uint64_t featureid() const = 0;
+  virtual uint64_t guidl() const = 0;
+  virtual uint64_t guidh() const = 0;
+
+  // False once stop() has been called.
+  bool running() const { return running_.load(); }
+  void stop() { running_.store(false); }
+
+private:
+  std::atomic<bool> running_;
+};
+
+// Names of the spdlog log levels as std::string, in the form expected by
+// CLI::IsMember().
+#if SPDLOG_VERSION >= 10900
+// From spdlog 1.9.0 on, SPDLOG_LEVEL_NAMES is a list of string_view_t, so
+// each entry has to be copied into a std::string.
+inline std::vector<std::string> spdlog_levels() {
+  const std::vector<spdlog::string_view_t> views = SPDLOG_LEVEL_NAMES;
+
+  std::vector<std::string> names;
+  names.reserve(views.size());
+  for (const auto &view : views)
+    names.emplace_back(view.data(), view.size());
+  return names;
+}
+#else
+// Older releases: the macro converts to a vector of std::string directly.
+inline std::vector<std::string> spdlog_levels() {
+  std::vector<std::string> names = SPDLOG_LEVEL_NAMES;
+  return names;
+}
+#endif // SPDLOG_VERSION
+
+/**
+ * Base class of a CXL cache test application.
+ *
+ * It owns the command-line parser, the logger, the dfl-cxl-cache character
+ * device (file descriptor and mapped MMIO region) and the host buffers that
+ * were registered with the driver. Sub-commands are added with
+ * register_command() and executed through main().
+ */
+class afu {
+public:
+  typedef int (*command_fn)(afu *afu, CLI::App *app);
+  enum exit_codes {
+    success = 0,
+    not_run,
+    not_found,
+    no_access,
+    exception,
+    error
+  };
+
+  /**
+   * Set up the common command-line options.
+   *
+   * @param name      application name shown by the parser
+   * @param afu_id    default GUID; the -g option is only added when non-empty
+   * @param log_level default log level; "info" when null
+   */
+  afu(const char *name, const char *afu_id = nullptr,
+      const char *log_level = nullptr)
+      : name_(name), afu_id_(afu_id ? afu_id : ""), app_(name_), pci_addr_(""),
+        log_level_(log_level ? log_level : "info"), timeout_msec_(60000),
+        current_command_(nullptr) {
+    if (!afu_id_.empty())
+      app_.add_option("-g,--guid", afu_id_, "GUID")->default_str(afu_id_);
+    app_.add_option("-p,--pci-address", pci_addr_,
+                    "[<domain>:]<bus>:<device>.<function>");
+    app_.add_option("-l,--log-level", log_level_, "stdout logging level")
+        ->default_str(log_level_)
+        ->check(CLI::IsMember(spdlog_levels()));
+    app_.add_option("-t,--timeout", timeout_msec_, "test timeout (msec)")
+        ->default_str(std::to_string(timeout_msec_));
+  }
+
+  // Releases the MMIO mapping, the device descriptor and the logger.
+  virtual ~afu() {
+    unmap_mmio();
+    if (fd_ >= 0)
+      close(fd_);
+    if (logger_)
+      spdlog::drop(logger_->name());
+  }
+
+  CLI::App &cli() { return app_; }
+
+  /**
+   * Locate the device node of the feature required by the current command
+   * and store its path in dev_path_.
+   *
+   * The sysfs feature_id files matching FEATURE_DEV are scanned; for the
+   * first one equal to current_command()->featureid() the sibling
+   * dfl-cxl-cache entry is resolved and mapped to "/dev/<entry>".
+   *
+   * @return 0 on success; 1 or 2 when the glob pattern does not fit its
+   *         buffer, 3 or 4 when a glob lookup fails, 5 when no feature
+   *         matched
+   */
+  int find_dev_feature() {
+    // Frees the glob result on every exit path, including exceptions.
+    struct glob_holder {
+      glob_t g;
+      bool active = false;
+      ~glob_holder() {
+        if (active)
+          globfree(&g);
+      }
+    };
+
+    char feature_path[MAX_SIZE] = {0};
+    const bool any_device = pci_addr_.empty();
+    const int written =
+        snprintf(feature_path, sizeof(feature_path), FEATURE_DEV,
+                 any_device ? "*:*:*.*" : pci_addr_.c_str());
+    // snprintf reports truncation through a return value >= the buffer size,
+    // not through a negative value.
+    if (written < 0 || static_cast<size_t>(written) >= sizeof(feature_path)) {
+      std::cerr << "snprintf buffer overflow" << std::endl;
+      return any_device ? 2 : 1;
+    }
+
+    glob_holder features;
+    int gres = glob(feature_path, GLOB_NOSORT, NULL, &features.g);
+    features.active = true;
+    if (gres) {
+      std::cerr << "Failed pattern match" << feature_path << ":"
+                << strerror(errno) << std::endl;
+      return 3;
+    }
+
+    for (size_t i = 0; i < features.g.gl_pathc; i++) {
+      uint64_t value = 0;
+      if (!sysfs_read_u64(features.g.gl_pathv[i], &value)) {
+        std::cerr << "Failed to read sysfs value" << std::endl;
+        continue;
+      }
+      if (current_command()->featureid() != value)
+        continue;
+
+      // Replace the trailing "/feature_id" with the cache device pattern.
+      const std::string feature_file(features.g.gl_pathv[i]);
+      std::string cache_pattern(
+          feature_file.substr(0, feature_file.rfind("/")));
+      cache_pattern.append("/dfl-cxl-cache/dfl-cxl-cache*");
+
+      glob_holder caches;
+      gres = glob(cache_pattern.c_str(), GLOB_NOSORT, NULL, &caches.g);
+      caches.active = true;
+      if (gres) {
+        std::cerr << "Failed pattern match" << cache_pattern.c_str() << ":"
+                  << strerror(errno) << std::endl;
+        return 4;
+      }
+
+      const std::string cache_entry(caches.g.gl_pathv[0]);
+      const std::string::size_type slash = cache_entry.rfind("/");
+      if (slash == std::string::npos)
+        return 4;
+
+      // Use the complete last path component; a fixed length would cut off
+      // instance numbers with more than one digit. Assign rather than append
+      // so that a repeated call does not grow the path.
+      dev_path_ = "/dev";
+      dev_path_.append(cache_entry.substr(slash));
+      return 0;
+    }
+
+    return 5;
+  }
+
+  // Unmap the MMIO region if it is mapped; safe to call more than once.
+  void unmap_mmio() {
+    if (mmio_base_) {
+      if (munmap(mmio_base_, rinfo_.size) == -1)
+        std::cerr << "Failed to unmap MMIO:" << strerror(errno) << std::endl;
+      mmio_base_ = nullptr;
+      mmio_len_ = 0;
+    }
+  }
+
+  // Map the MMIO region described by rinfo_ through fd_.
+  bool map_mmio() {
+    void *user_v = mmap(NULL, rinfo_.size, PROT_READ | PROT_WRITE, MAP_SHARED,
+                        fd_, rinfo_.offset);
+    if (user_v == MAP_FAILED) {
+      std::cerr << "Failed to map MMIO:" << strerror(errno) << std::endl;
+      return false;
+    }
+    mmio_base_ = static_cast<uint8_t *>(user_v);
+    mmio_len_ = rinfo_.size;
+
+    return true;
+  }
+
+  /**
+   * Open dev_path_, query the MMIO region and map it.
+   *
+   * @return exit_codes::not_run when the device is ready (the value main()
+   *         checks for before running the command); exit_codes::no_access
+   *         when open() fails, 2 when the region query fails, 3 when the
+   *         mapping fails
+   */
+  int open_handle() {
+    logger_->debug("dev_path_:{0}", dev_path_);
+
+    fd_ = open(dev_path_.c_str(), O_RDWR);
+    if (fd_ < 0) {
+      std::cerr << "open() failed:" << strerror(errno) << std::endl;
+      // Must differ from not_run, which is the success value of this
+      // function; returning 1 here made main() continue without a device.
+      return exit_codes::no_access;
+    }
+
+    memset(&rinfo_, 0, sizeof(rinfo_));
+    rinfo_.argsz = sizeof(rinfo_);
+    if (ioctl(fd_, DFL_CXL_CACHE_GET_REGION_INFO, &rinfo_)) {
+      std::cerr << "ioctl() DFL_CXL_CACHE_GET_REGION_INFO failed:"
+                << strerror(errno) << std::endl;
+      close_fd();
+      return 2;
+    }
+    logger_->debug("MMIO region flags:0x:{0:x} size:0x {1:x} offset:0x {2:x}",
+        rinfo_.flags, rinfo_.size, rinfo_.offset);
+
+    if (!map_mmio()) {
+      std::cerr << "mmap failed:" << strerror(errno) << std::endl;
+      close_fd();
+      return 3;
+    }
+
+    logger_->debug("DFH     : 0x:{0:X}", read64(0));
+    logger_->debug("DFH + 8 : 0x:{0:X}", read64(8));
+    logger_->debug("DFH + 16: 0x:{0:X}", read64(16));
+    logger_->debug("DFH + 24: 0x:{0:X}", read64(24));
+
+    return exit_codes::not_run;
+  }
+
+  /**
+   * Parse the command line, create the logger, find and open the device and
+   * run the selected sub-command.
+   *
+   * @return result of run(), or an exit code describing why it was not
+   *         reached
+   */
+  int main(int argc, char *argv[]) {
+    if (!commands_.empty())
+      app_.require_subcommand();
+    CLI11_PARSE(app_, argc, argv);
+
+    command::ptr_t test(nullptr);
+    CLI::App *app = nullptr;
+    for (const auto &kv : commands_) {
+      if (*kv.first) {
+        app = kv.first;
+        test = kv.second;
+        break;
+      }
+    }
+    if (!test) {
+      std::cerr << "no command specified\n";
+      return exit_codes::not_run;
+    }
+
+    auto console_sink = std::make_shared<spdlog::sinks::stdout_color_sink_mt>();
+    logger_ = std::make_shared<spdlog::logger>(test->name(), console_sink);
+    spdlog::register_logger(logger_);
+    logger_->set_level(spdlog::level::from_str(log_level_));
+    current_command_ = test;
+    if (find_dev_feature() != 0) {
+      std::cerr << "fails to find feature" << std::endl;
+      return exit_codes::exception;
+    }
+
+    const int res = open_handle();
+    if (res != exit_codes::not_run) {
+      return res;
+    }
+
+    return run(app, test);
+  }
+
+  /**
+   * Run `test` on a separate thread and wait up to timeout_msec_ for it.
+   *
+   * @return the command's result, or exit_codes::exception on timeout or
+   *         when an exception escapes the command
+   */
+  virtual int run(CLI::App *app, command::ptr_t test) {
+    int res = exit_codes::not_run;
+    current_command_ = test;
+
+    try {
+      std::future<int> f = std::async(std::launch::async, [this, test, app]() {
+        return test->run(this, app);
+      });
+      const auto status = f.wait_for(std::chrono::milliseconds(timeout_msec_));
+      if (status == std::future_status::timeout) {
+        std::cerr << "Error: test timed out" << std::endl;
+        current_command_->stop();
+        res = exit_codes::exception;
+        // The future returned by std::async blocks in its destructor until
+        // the task has finished, so the command is done when we leave here.
+      } else {
+        res = f.get();
+      }
+    } catch (const std::exception &ex) {
+      // Report the reason instead of dropping it silently.
+      std::cerr << "Error: " << ex.what() << std::endl;
+      res = exit_codes::exception;
+    }
+
+    current_command_.reset();
+    return res;
+  }
+
+  // Create a command of type T and register it as a sub-command.
+  template <class T> CLI::App *register_command() {
+    command::ptr_t cmd(new T());
+    auto sub = app_.add_subcommand(cmd->name(), cmd->description());
+    cmd->add_options(sub);
+    commands_[sub] = cmd;
+    return sub;
+  }
+
+  // MMIO accessors. `offset` is in bytes from the MMIO base. The accesses are
+  // volatile so that the compiler neither drops nor merges them.
+  uint64_t read64(uint32_t offset) {
+    return *reinterpret_cast<volatile uint64_t *>(mmio_base_ + offset);
+  }
+
+  void write64(uint32_t offset, uint64_t value) {
+    *reinterpret_cast<volatile uint64_t *>(mmio_base_ + offset) = value;
+  }
+
+  uint32_t read32(uint32_t offset) {
+    // A 32-bit read; the original performed a 64-bit access here.
+    return *reinterpret_cast<volatile uint32_t *>(mmio_base_ + offset);
+  }
+
+  void write32(uint32_t offset, uint32_t value) {
+    *reinterpret_cast<volatile uint32_t *>(mmio_base_ + offset) = value;
+  }
+
+  command::ptr_t current_command() const { return current_command_; }
+
+  /**
+   * Allocate the DSM buffer on `numa_node` and register it with the driver
+   * for the DSM base register.
+   *
+   * @return true on success; on failure nothing stays allocated
+   */
+  bool allocate_dsm(size_t len = KiB(4), uint32_t numa_node = 0) {
+    void *ptr = nullptr;
+    if (!buffer_allocate(&ptr, len, numa_node)) {
+      std::cerr << "Fails to allocate 4k huge page:" << strerror(errno)
+                << std::endl;
+      return false;
+    }
+    std::cout << "DSM buffer numa node: " << numa_node << std::endl;
+
+    if (!map_to_device(ptr, len, numa_node, "DSM",
+                       {{"DSM_BASE", DFL_CXL_CACHE_DSM_BASE}})) {
+      buffer_release(ptr, len);
+      return false;
+    }
+    dsm_buffer_ = static_cast<uint8_t *>(ptr);
+    dsm_buf_len_ = len;
+    return true;
+  }
+
+  // Unregister and release the DSM buffer. Failures are reported on stderr;
+  // the function always returns true and does nothing without a buffer.
+  bool free_dsm() {
+    return unmap_from_device(dsm_buffer_, dsm_buf_len_, "dsm",
+                             {{"DSM_BASE", DFL_CXL_CACHE_DSM_BASE}});
+  }
+
+  /**
+   * Allocate the read buffer on `numa_node` and register it with the driver
+   * for the read address table register.
+   *
+   * @return true on success; on failure nothing stays allocated
+   */
+  bool allocate_cache_read(size_t len = MiB(2), uint32_t numa_node = 0) {
+    void *ptr = nullptr;
+    if (!buffer_allocate(&ptr, len, numa_node)) {
+      std::cerr << "Fails to allocate 2MB huge page:" << strerror(errno)
+                << std::endl;
+      return false;
+    }
+    std::cout << "Read buffer numa node: " << numa_node << std::endl;
+
+    // NOTE(review): one-second delay kept from the original code; its
+    // purpose is not visible here - confirm whether it is still required.
+    sleep(1);
+    if (!map_to_device(ptr, len, numa_node, "read",
+                       {{"DFL_CXL_CACHE_RD_ADDR_TABLE_DATA",
+                         DFL_CXL_CACHE_RD_ADDR_TABLE_DATA}})) {
+      buffer_release(ptr, len);
+      return false;
+    }
+    rd_buffer_ = static_cast<uint8_t *>(ptr);
+    rd_buf_len_ = len;
+    return true;
+  }
+
+  // Unregister and release the read buffer; see free_dsm() for the contract.
+  bool free_cache_read() {
+    return unmap_from_device(rd_buffer_, rd_buf_len_, "read",
+                             {{"DFL_CXL_CACHE_RD_ADDR_TABLE_DATA",
+                               DFL_CXL_CACHE_RD_ADDR_TABLE_DATA}});
+  }
+
+  /**
+   * Allocate the write buffer on `numa_node` and register it with the driver
+   * for the write address table register.
+   *
+   * @return true on success; on failure nothing stays allocated
+   */
+  bool allocate_cache_write(size_t len = MiB(2), uint32_t numa_node = 0) {
+    void *ptr = nullptr;
+    if (!buffer_allocate(&ptr, len, numa_node)) {
+      std::cerr << "Fails to allocate 2MB huge page:" << strerror(errno)
+                << std::endl;
+      return false;
+    }
+    std::cout << "Write buffer numa node: " << numa_node << std::endl;
+
+    if (!map_to_device(ptr, len, numa_node, "write",
+                       {{"DFL_CXL_CACHE_WR_ADDR_TABLE_DATA",
+                         DFL_CXL_CACHE_WR_ADDR_TABLE_DATA}})) {
+      buffer_release(ptr, len);
+      return false;
+    }
+    wr_buffer_ = static_cast<uint8_t *>(ptr);
+    wr_buf_len_ = len;
+    return true;
+  }
+
+  // Unregister and release the write buffer; see free_dsm() for the contract.
+  bool free_cache_write() {
+    return unmap_from_device(wr_buffer_, wr_buf_len_, "write",
+                             {{"DFL_CXL_CACHE_WR_ADDR_TABLE_DATA",
+                               DFL_CXL_CACHE_WR_ADDR_TABLE_DATA}});
+  }
+
+  /**
+   * Allocate one buffer on `numa_node` and register it with the driver for
+   * both the read and the write address table registers.
+   *
+   * @return true on success; on failure nothing stays allocated
+   */
+  bool allocate_cache_read_write(size_t len = MiB(2), uint32_t numa_node = 0) {
+    void *ptr = nullptr;
+    if (!buffer_allocate(&ptr, len, numa_node)) {
+      std::cerr << "Fails to allocate 2MB huge page:" << strerror(errno)
+                << std::endl;
+      return false;
+    }
+    std::cout << "Read/Write buffer numa node: " << numa_node << std::endl;
+
+    if (!map_to_device(ptr, len, numa_node, "read/write",
+                       {{"DFL_CXL_CACHE_RD_ADDR_TABLE_DATA",
+                         DFL_CXL_CACHE_RD_ADDR_TABLE_DATA},
+                        {"DFL_CXL_CACHE_WR_ADDR_TABLE_DATA",
+                         DFL_CXL_CACHE_WR_ADDR_TABLE_DATA}})) {
+      buffer_release(ptr, len);
+      return false;
+    }
+    rd_wr_buffer_ = static_cast<uint8_t *>(ptr);
+    rd_wr_buf_len_ = len;
+    return true;
+  }
+
+  // Unregister and release the shared read/write buffer; see free_dsm() for
+  // the contract.
+  bool free_cache_read_write() {
+    return unmap_from_device(rd_wr_buffer_, rd_wr_buf_len_, "read/write",
+                             {{"DFL_CXL_CACHE_RD_ADDR_TABLE_DATA",
+                               DFL_CXL_CACHE_RD_ADDR_TABLE_DATA},
+                              {"DFL_CXL_CACHE_WR_ADDR_TABLE_DATA",
+                               DFL_CXL_CACHE_WR_ADDR_TABLE_DATA}});
+  }
+
+  // Buffer accessors; each returns nullptr while no such buffer is allocated.
+  uint8_t *get_dsm() const { return dsm_buffer_; }
+
+  uint8_t *get_read() const { return rd_buffer_; }
+
+  uint8_t *get_write() const { return wr_buffer_; }
+
+  uint8_t *get_read_write() const { return rd_wr_buffer_; }
+
+private:
+  // A register passed to the driver: label used in log output and byte
+  // offset from the MMIO base.
+  struct csr_ref {
+    const char *name;
+    uint32_t offset;
+  };
+
+  // Close the device descriptor and mark it invalid so that the destructor
+  // does not close it a second time.
+  void close_fd() {
+    if (fd_ >= 0) {
+      close(fd_);
+      fd_ = -1;
+    }
+  }
+
+  // Register an allocated buffer with the driver for the given registers.
+  // `tag` only appears in the debug log. Returns false when the ioctl fails;
+  // the caller keeps ownership of `ptr` in either case.
+  bool map_to_device(void *ptr, size_t len, uint32_t numa_node,
+                     const char *tag, std::initializer_list<csr_ref> csrs) {
+    struct dfl_cxl_cache_buffer_map dma_map;
+    memset(&dma_map, 0, sizeof(dma_map));
+    dma_map.argsz = sizeof(dma_map);
+    dma_map.user_addr = reinterpret_cast<uintptr_t>(ptr);
+    dma_map.length = len;
+    dma_map.numa_node = numa_node;
+    size_t slot = 0;
+    for (const csr_ref &csr : csrs)
+      dma_map.csr_array[slot++] = csr.offset;
+
+    logger_->debug("Allocate {0} buffer user addr 0x:{1:x} length :"
+        "{2:d} numa node : {3:d}",
+        tag, dma_map.user_addr, dma_map.length, dma_map.numa_node);
+
+    if (ioctl(fd_, DFL_CXL_CACHE_NUMA_BUFFER_MAP, &dma_map)) {
+      std::cerr << "ioctl DFL_CXL_CACHE_NUMA_BUFFER_MAP failed"
+                << strerror(errno) << std::endl;
+      return false;
+    }
+
+    // Read the registers back for the debug log.
+    for (const csr_ref &csr : csrs)
+      logger_->debug("{0}     : 0x:{1:x}", csr.name, read64(csr.offset));
+    return true;
+  }
+
+  // Unregister `buffer` from the driver, unmap it and reset `buffer` and
+  // `length`. Does nothing when `buffer` is null. Failures are reported on
+  // stderr; the return value is always true.
+  bool unmap_from_device(uint8_t *&buffer, uint64_t &length, const char *tag,
+                         std::initializer_list<csr_ref> csrs) {
+    if (buffer == nullptr)
+      return true;
+
+    struct dfl_cxl_cache_buffer_unmap dma_unmap;
+    memset(&dma_unmap, 0, sizeof(dma_unmap));
+    dma_unmap.argsz = sizeof(dma_unmap);
+    dma_unmap.user_addr = reinterpret_cast<uintptr_t>(buffer);
+    dma_unmap.length = length;
+    size_t slot = 0;
+    for (const csr_ref &csr : csrs)
+      dma_unmap.csr_array[slot++] = csr.offset;
+
+    logger_->debug("free {0} user addr 0x:{1:x} length : {2:d} ", tag,
+        dma_unmap.user_addr, dma_unmap.length);
+
+    if (ioctl(fd_, DFL_CXL_CACHE_NUMA_BUFFER_UNMAP, &dma_unmap)) {
+      std::cerr << "ioctl DFL_CXL_CACHE_NUMA_BUFFER_UNMAP failed"
+                << strerror(errno) << std::endl;
+    }
+
+    for (const csr_ref &csr : csrs)
+      logger_->debug("{0}     : 0x:{1:x}", csr.name, read64(csr.offset));
+
+    buffer_release(buffer, length);
+    buffer = nullptr;
+    length = 0;
+    return true;
+  }
+
+protected:
+  std::string name_;
+  std::string afu_id_;
+  CLI::App app_;
+  std::string pci_addr_;
+  std::string log_level_;
+  uint32_t timeout_msec_;
+
+  // Device descriptor; -1 while the device is not open.
+  int fd_ = -1;
+  // Mapped MMIO region; null while not mapped.
+  uint8_t *mmio_base_ = nullptr;
+  uint64_t mmio_len_ = 0;
+
+  // Host buffers registered with the driver; null while not allocated.
+  uint8_t *dsm_buffer_ = nullptr;
+  uint64_t dsm_buf_len_ = 0;
+
+  uint8_t *rd_buffer_ = nullptr;
+  uint64_t rd_buf_len_ = 0;
+
+  uint8_t *wr_buffer_ = nullptr;
+  uint64_t wr_buf_len_ = 0;
+
+  uint8_t *rd_wr_buffer_ = nullptr;
+  uint64_t rd_wr_buf_len_ = 0;
+
+  // Region description returned by DFL_CXL_CACHE_GET_REGION_INFO.
+  struct dfl_cxl_cache_region_info rinfo_ = {};
+
+  // Device node path computed by find_dev_feature().
+  std::string dev_path_;
+
+  command::ptr_t current_command_;
+  std::map<CLI::App *, command::ptr_t> commands_;
+
+public:
+  // Created by main(); null before that.
+  std::shared_ptr<spdlog::logger> logger_;
+};
+
+} // end of namespace afu_test
+} // end of namespace opae