[benchmark] Option to use sync API for Latency measurements (#947)

intel · Mar 5, 2025 · 7b79e58
1 parent 948ee13
commit 7b79e58
Show file tree

Hide file tree

Showing 4 changed files with 38 additions and 18 deletions.
diff --git a/tools/benchmarks/include/cmd_decl.hpp b/tools/benchmarks/include/cmd_decl.hpp
@@ -20,6 +20,7 @@ BM_DECLARE_bool(full_time);
 BM_DECLARE_bool(no_hw);
 BM_DECLARE_string(in_mem);
 BM_DECLARE_string(out_mem);
+BM_DECLARE_bool(sync_api);
 
 std::int32_t get_block_size();
 mem_loc_e    get_in_mem();

diff --git a/tools/benchmarks/include/details/measure_sync.hpp b/tools/benchmarks/include/details/measure_sync.hpp
@@ -24,6 +24,10 @@ static statistics_t measure_sync(benchmark::State& state, const case_params_t& c
         res.operations = res.queue_size;
     }
 
+    if (res.queue_size != 1 && common_params.use_sync_api_) {
+        throw std::runtime_error("Using --sync_api for measurements do not support queue size > 1");
+    }
+
     res.operations_per_thread = res.operations;
     if (state.threads() > 1) throw std::runtime_error("Synchronous measurements do not support threading");
 
@@ -33,29 +37,40 @@ static statistics_t measure_sync(benchmark::State& state, const case_params_t& c
         operation.mem_control(common_params.in_mem_, mem_loc_mask_e::src);
     }
 
-    // Strategies:
-    // - File at once. Each operation works on same file independently.
-    // - Chunk at once. Measure each chunk independently one by one, gather aggregate in the end. Is this reasonable?
-    // - File by chunks. Measure for the whole file processing different chunks in parallel (map file before processing). Like normal processing
-
+    // Non-timed warm-up
     for (auto& operation : operations) {
-        operation.async_submit();
-        operation.async_wait();
+        if (common_params.use_sync_api_) {
+            operation.sync_execute();
+        } else {
+            operation.async_submit();
+            operation.async_wait();
+        }
         operation.light_reset();
     }
 
+    // Timed measurements
     for (auto _ : state) {
-        for (auto& operation : operations) {
-            operation.async_submit();
-        }
+        if (common_params.use_sync_api_) {
+            for (auto& operation : operations) {
+                operation.sync_execute();
+                operation.light_reset();
 
-        for (auto& operation : operations) {
-            operation.async_wait();
-            operation.light_reset();
+                res.completed_operations++;
+                res.data_read += operation.get_bytes_read();
+                res.data_written += operation.get_bytes_written();
+            }
+        } else {
+            for (auto& operation : operations) {
+                operation.async_submit();
+            }
+            for (auto& operation : operations) {
+                operation.async_wait();
+                operation.light_reset();
 
-            res.completed_operations++;
-            res.data_read += operation.get_bytes_read();
-            res.data_written += operation.get_bytes_written();
+                res.completed_operations++;
+                res.data_read += operation.get_bytes_read();
+                res.data_written += operation.get_bytes_written();
+            }
         }
     }
 

diff --git a/tools/benchmarks/include/utility.hpp b/tools/benchmarks/include/utility.hpp
@@ -169,6 +169,7 @@ struct case_params_t {
     bool         full_time_ {cmd::FLAGS_full_time};
     std::int32_t queue_size_ {cmd::FLAGS_queue_size};
     std::int32_t node_ {cmd::FLAGS_node};
+    bool         use_sync_api_ {cmd::FLAGS_sync_api};
 };
 
 template <typename CaseT, typename CaseParamsT, typename... ArgsT>

diff --git a/tools/benchmarks/src/main.cpp b/tools/benchmarks/src/main.cpp
@@ -98,6 +98,7 @@ BM_DEFINE_string(in_mem, "llc");
 BM_DEFINE_string(out_mem, "cс_ram");
 BM_DEFINE_bool(full_time, false);
 BM_DEFINE_bool(no_hw, false);
+BM_DEFINE_bool(sync_api, false);
 
 /**
  * Print the help message that includes information about the input parameters that can be used to configure the benchmark.
@@ -120,7 +121,8 @@ static void print_help() {
             "          [--in_mem=<location>]         - Input memory type: cache, llc or ram. Set to llc by default. \n"
             "          [--out_mem=<location>]        - Output memory type: cache_ram or ram. Set to cache_ram by default. \n"
             "          [--full_time]                 - Include initialization and destruction into measured time. Off by default.\n"
-            "          [--no_hw]                     - Skip accelerator initialization check and run only using qpl_software_path. Off by default.\n");
+            "          [--no_hw]                     - Skip accelerator initialization check and run only using qpl_software_path. Off by default.\n"
+            "          [--sync_api]                  - (Experimental) Use synchronous API for execution. Default is Off. Only applicable for single-threaded runs.\n");
 }
 
 static void parse_cmd_line(int* argc, char** argv) {
@@ -133,7 +135,8 @@ static void parse_cmd_line(int* argc, char** argv) {
             benchmark::ParseStringFlag(argv[i], "in_mem", &FLAGS_in_mem) ||
             benchmark::ParseStringFlag(argv[i], "out_mem", &FLAGS_out_mem) ||
             benchmark::ParseBoolFlag(argv[i], "full_time", &FLAGS_full_time) ||
-            benchmark::ParseBoolFlag(argv[i], "no_hw", &FLAGS_no_hw)) {
+            benchmark::ParseBoolFlag(argv[i], "no_hw", &FLAGS_no_hw) ||
+            benchmark::ParseBoolFlag(argv[i], "sync_api", &FLAGS_sync_api)) {
             for (int j = i; j != *argc - 1; ++j)
                 argv[j] = argv[j + 1];