From 31da4eeec2759c364671b82b1fb31bc1ec219c6a Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Mon, 5 Feb 2024 15:50:22 -0800
Subject: [PATCH 1/6] Add response statistics reporting and custom delays

---
 src/square.cc | 254 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 218 insertions(+), 36 deletions(-)

diff --git a/src/square.cc b/src/square.cc
index b65a823..58ea5cb 100644
--- a/src/square.cc
+++ b/src/square.cc
@@ -1,4 +1,4 @@
-// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -24,6 +24,7 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+#include <algorithm>
 #include <chrono>
 #include <thread>
 #include <vector>
@@ -89,6 +90,133 @@ namespace triton { namespace backend { namespace square {
   } \
   } while (false)
 
+//
+// ModelParameters
+//
+// Helper class for parsing and storing model config parameters, and for
+// performing required operations on the stored parameters.
+//
+class ModelParameters {
+ public:
+  enum DelayType { Infer, Output };
+  enum InferResultType { Success, Fail, Empty };
+
+  ModelParameters()
+      : custom_infer_delay_ns_(0), custom_output_delay_ns_(0),
+        custom_fail_count_(0), custom_empty_count_(0)
+  {
+  }
+  ModelParameters(common::TritonJson::Value& model_config_);
+
+  void Sleep(DelayType delay_type) const;
+  InferResultType InferResult(size_t current_index, size_t element_count) const;
+
+ private:
+  void ReadParameter(
+      common::TritonJson::Value& parameters_json, const std::string& key,
+      size_t* value) const;
+  void Sleep(size_t delay_ns) const;
+  bool IsNumber(const std::string& str) const;
+
+  size_t custom_infer_delay_ns_;
+  size_t custom_output_delay_ns_;
+  size_t custom_fail_count_;
+  size_t custom_empty_count_;
+};
+
+ModelParameters::ModelParameters(common::TritonJson::Value& model_config_)
+    : ModelParameters()
+{
+  // Parse and store model config parameters. Any non-well-formed parameter
+  // will be left at its default value.
+  common::TritonJson::Value parameters_json;
+  if (model_config_.Find("parameters", &parameters_json)) {
+    ReadParameter(
+        parameters_json, "CUSTOM_INFER_DELAY_NS", &custom_infer_delay_ns_);
+    ReadParameter(
+        parameters_json, "CUSTOM_OUTPUT_DELAY_NS", &custom_output_delay_ns_);
+    ReadParameter(parameters_json, "CUSTOM_FAIL_COUNT", &custom_fail_count_);
+    ReadParameter(parameters_json, "CUSTOM_EMPTY_COUNT", &custom_empty_count_);
+  }
+}
+
+void
+ModelParameters::Sleep(DelayType delay_type) const
+{
+  // Sleep on the requested delay type.
+  if (delay_type == DelayType::Infer) {
+    Sleep(custom_infer_delay_ns_);
+  } else if (delay_type == DelayType::Output) {
+    Sleep(custom_output_delay_ns_);
+  }
+}
+
+ModelParameters::InferResultType
+ModelParameters::InferResult(size_t current_index, size_t element_count) const
+{
+  // Let N be the element count, F the fail count and E the empty count.
+  // Return empty for indices in [N - E, N) and fail for indices in
+  // [N - E - F, N - E). With proper N, F and E values, the infer result will
+  // first return success, then fail, and then empty.
+  if (current_index + custom_fail_count_ + custom_empty_count_ <
+      element_count) {
+    // [0, N - E - F)
+    return ModelParameters::InferResultType::Success;
+  }
+  if (current_index + custom_empty_count_ < element_count) {
+    // [N - E - F, N - E)
+    return ModelParameters::InferResultType::Fail;
+  }
+  // [N - E, N)
+  return ModelParameters::InferResultType::Empty;
+}
+
+void
+ModelParameters::ReadParameter(
+    common::TritonJson::Value& parameters_json, const std::string& key,
+    size_t* value) const
+{
+  common::TritonJson::Value value_json;
+  if (parameters_json.Find(key.c_str(), &value_json)) {
+    std::string value_str;
+    if (value_json.MemberAsString("string_value", &value_str) != nullptr) {
+      LOG_MESSAGE(
+          TRITONSERVER_LOG_INFO,
+          (std::string("string_value cannot be parsed from ") + key +
+           " parameter")
+              .c_str());
+      return;
+    }
+    if (!IsNumber(value_str)) {
+      LOG_MESSAGE(
+          TRITONSERVER_LOG_INFO, (value_str + " string_value from " + key +
+                                  " parameter is not a number")
+                                     .c_str());
+      return;
+    }
+    *value = std::stoi(value_str);
+  }
+}
+
+void
+ModelParameters::Sleep(size_t delay_ns) const
+{
+  if (delay_ns > 0) {
+    LOG_MESSAGE(
+        TRITONSERVER_LOG_INFO,
+        (std::string("add delay ") + std::to_string(delay_ns) + " ns").c_str());
+    std::this_thread::sleep_for(std::chrono::nanoseconds(delay_ns));
+  }
+}
+
+bool
+ModelParameters::IsNumber(const std::string& str) const
+{
+  return std::find_if(str.begin(), str.end(), [](unsigned char c) {
+           return !std::isdigit(c);
+         }) == str.end();
+}
+
 //
 // ModelState
 //
@@ -107,6 +235,9 @@ class ModelState {
   // Validate that model configuration is supported by this backend.
   TRITONSERVER_Error* ValidateModelConfig();
 
+  // Get model parameters.
+  const ModelParameters& get_model_parameters() { return model_parameters_; }
+
  private:
   ModelState(
       TRITONBACKEND_Model* triton_model,
@@ -114,6 +245,7 @@ class ModelState {
 
   TRITONBACKEND_Model* triton_model_;
   common::TritonJson::Value model_config_;
+  ModelParameters model_parameters_;
 };
 
 TRITONSERVER_Error*
@@ -146,7 +278,8 @@ ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state)
 ModelState::ModelState(
     TRITONBACKEND_Model* triton_model, common::TritonJson::Value&& model_config)
-    : triton_model_(triton_model), model_config_(std::move(model_config))
+    : triton_model_(triton_model), model_config_(std::move(model_config)),
+      model_parameters_(model_config_)
 {
 }
 
@@ -380,47 +513,96 @@ ModelInstanceState::RequestThread(
       TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter>
       factory(factory_ptr);
 
-  // Copy IN->OUT, and send a response.
+  // Copy IN -> OUT, and send a response.
   const std::vector<int64_t> output_shape(dims_count, 1);
-  for (size_t e = 0; e < element_count; ++e) {
-    // Create the response with a single OUT output.
-    TRITONBACKEND_Response* response;
-    RESPOND_FACTORY_AND_RETURN_IF_ERROR(
-        factory.get(),
-        TRITONBACKEND_ResponseNewFromFactory(&response, factory.get()));
+  for (size_t e = 0; e < element_count; e++) {
+    // Timestamp at start of the response.
+    uint64_t response_start_ns;
+    SET_TIMESTAMP(response_start_ns);
+
+    // Simulate compute delay, if provided.
+    model_state_->get_model_parameters().Sleep(
+        ModelParameters::DelayType::Infer);
+
+    // Result type of the simulated inference.
+    ModelParameters::InferResultType result_type =
+        model_state_->get_model_parameters().InferResult(e, element_count);
+
+    // Populate 'compute_output_start_ns' and 'response' if not empty result.
+    uint64_t compute_output_start_ns = 0;
+    TRITONBACKEND_Response* response = nullptr;
+    if (result_type != ModelParameters::InferResultType::Empty) {
+      // Timestamp at start of outputting compute tensors.
+      SET_TIMESTAMP(compute_output_start_ns);
+
+      // Create the response with a single OUT output.
+      RESPOND_FACTORY_AND_RETURN_IF_ERROR(
+          factory.get(),
+          TRITONBACKEND_ResponseNewFromFactory(&response, factory.get()));
 
-    TRITONBACKEND_Output* output;
-    RESPOND_FACTORY_AND_RETURN_IF_ERROR(
-        factory.get(), TRITONBACKEND_ResponseOutput(
-                           response, &output, "OUT", TRITONSERVER_TYPE_INT32,
-                           output_shape.data(), dims_count));
-
-    // Get the output buffer. We request a buffer in CPU memory but we
-    // have to handle any returned type. If we get back a buffer in
-    // GPU memory we just fail the request.
-    void* output_buffer;
-    TRITONSERVER_MemoryType output_memory_type = TRITONSERVER_MEMORY_CPU;
-    int64_t output_memory_type_id = 0;
-    RESPOND_FACTORY_AND_RETURN_IF_ERROR(
-        factory.get(), TRITONBACKEND_OutputBuffer(
-                           output, &output_buffer, sizeof(int32_t),
-                           &output_memory_type, &output_memory_type_id));
-    if (output_memory_type == TRITONSERVER_MEMORY_GPU) {
+      // Get response output container.
+      TRITONBACKEND_Output* output;
       RESPOND_FACTORY_AND_RETURN_IF_ERROR(
-          factory.get(), TRITONSERVER_ErrorNew(
-                             TRITONSERVER_ERROR_INTERNAL,
-                             "failed to create output buffer in CPU memory"));
+          factory.get(), TRITONBACKEND_ResponseOutput(
+                             response, &output, "OUT", TRITONSERVER_TYPE_INT32,
+                             output_shape.data(), dims_count));
+
+      // Get the output buffer. We request a buffer in CPU memory but we
+      // have to handle any returned type. If we get back a buffer in
+      // GPU memory we just fail the request.
+      void* output_buffer;
+      TRITONSERVER_MemoryType output_memory_type = TRITONSERVER_MEMORY_CPU;
+      int64_t output_memory_type_id = 0;
+      RESPOND_FACTORY_AND_RETURN_IF_ERROR(
+          factory.get(), TRITONBACKEND_OutputBuffer(
+                             output, &output_buffer, sizeof(int32_t),
+                             &output_memory_type, &output_memory_type_id));
+      if (output_memory_type == TRITONSERVER_MEMORY_GPU) {
+        RESPOND_FACTORY_AND_RETURN_IF_ERROR(
+            factory.get(), TRITONSERVER_ErrorNew(
+                               TRITONSERVER_ERROR_INTERNAL,
+                               "failed to create output buffer in CPU memory"));
+      }
+
+      // Copy IN -> OUT
+      *(reinterpret_cast<int32_t*>(output_buffer)) = element_count;
+
+      // Simulate output delay, if provided.
+      model_state_->get_model_parameters().Sleep(
+          ModelParameters::DelayType::Output);
     }
 
-    // Copy IN -> OUT
-    *(reinterpret_cast<int32_t*>(output_buffer)) = element_count;
+    // Timestamp at end of the response.
+    uint64_t response_end_ns;
+    SET_TIMESTAMP(response_end_ns);
 
-    // Send the response.
-    LOG_IF_ERROR(
-        TRITONBACKEND_ResponseSend(
-            response, 0 /* flags */, nullptr /* success */),
-        "failed sending response");
+    // Set error for simulated failure.
+    TRITONSERVER_Error* error = nullptr;
+    if (result_type == ModelParameters::InferResultType::Fail) {
+      error = TRITONSERVER_ErrorNew(
+          TRITONSERVER_ERROR_UNKNOWN, "simulated failure");
+    }
+
+    // Send response if not empty.
+    if (result_type != ModelParameters::InferResultType::Empty) {
+      LOG_IF_ERROR(
+          TRITONBACKEND_ResponseSend(response, 0 /* flags */, error),
+          "failed sending response");
+    }
+
+    // Report response statistics.
+    RESPOND_FACTORY_AND_RETURN_IF_ERROR(
+        factory.get(),
+        TRITONBACKEND_ModelInstanceReportResponseStatistics(
+            TritonModelInstance(), factory.get(), response_start_ns,
+            compute_output_start_ns, response_end_ns, 0 /* flags */, error));
+
+    // Delete error, if any.
+ if (error != nullptr) { + TRITONSERVER_ErrorDelete(error); + } + // Additional logs for debugging. LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("sent response ") + std::to_string(e + 1) + " of " + From 80195d1744ef3f0a5b8aa23e40417b80e06b600a Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Thu, 8 Feb 2024 12:04:13 -0800 Subject: [PATCH 2/6] enum capital case --- src/square.cc | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/square.cc b/src/square.cc index 58ea5cb..5c498b4 100644 --- a/src/square.cc +++ b/src/square.cc @@ -98,8 +98,8 @@ namespace triton { namespace backend { namespace square { // class ModelParameters { public: - enum DelayType { Infer, Output }; - enum InferResultType { Success, Fail, Empty }; + enum DelayType { INFER, OUTPUT }; + enum InferResultType { SUCCESS, FAIL, EMPTY }; ModelParameters() : custom_infer_delay_ns_(0), custom_output_delay_ns_(0), @@ -144,9 +144,9 @@ void ModelParameters::Sleep(DelayType delay_type) const { // Sleep on the requested delay type. - if (delay_type == DelayType::Infer) { + if (delay_type == DelayType::INFER) { Sleep(custom_infer_delay_ns_); - } else if (delay_type == DelayType::Output) { + } else if (delay_type == DelayType::OUTPUT) { Sleep(custom_output_delay_ns_); } } @@ -161,14 +161,14 @@ ModelParameters::InferResult(size_t current_index, size_t element_count) const if (current_index + custom_fail_count_ + custom_empty_count_ < element_count) { // [0, N - E - F) - return ModelParameters::InferResultType::Success; + return ModelParameters::InferResultType::SUCCESS; } if (current_index + custom_empty_count_ < element_count) { // [N - E - F, N - E) - return ModelParameters::InferResultType::Fail; + return ModelParameters::InferResultType::FAIL; } // [N - E, N) - return ModelParameters::InferResultType::Empty; + return ModelParameters::InferResultType::EMPTY; } void @@ -522,7 +522,7 @@ ModelInstanceState::RequestThread( // Simulate compute delay, if provided. model_state_->get_model_parameters().Sleep( - ModelParameters::DelayType::Infer); + ModelParameters::DelayType::INFER); // Result type of the simulated inference. ModelParameters::InferResultType result_type = @@ -531,7 +531,7 @@ ModelInstanceState::RequestThread( // Populate 'compute_output_start_ns' and 'response' if not empty result. uint64_t compute_output_start_ns = 0; TRITONBACKEND_Response* response = nullptr; - if (result_type != ModelParameters::InferResultType::Empty) { + if (result_type != ModelParameters::InferResultType::EMPTY) { // Timestamp at start of outputting compute tensors. SET_TIMESTAMP(compute_output_start_ns); @@ -569,7 +569,7 @@ ModelInstanceState::RequestThread( // Simulate output delay, if provided. model_state_->get_model_parameters().Sleep( - ModelParameters::DelayType::Output); + ModelParameters::DelayType::OUTPUT); } // Timestamp at end of the response. @@ -578,13 +578,13 @@ ModelInstanceState::RequestThread( // Set error for simulated failure. TRITONSERVER_Error* error = nullptr; - if (result_type == ModelParameters::InferResultType::Fail) { + if (result_type == ModelParameters::InferResultType::FAIL) { error = TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_UNKNOWN, "simulated failure"); } // Send response if not empty. 
- if (result_type != ModelParameters::InferResultType::Empty) { + if (result_type != ModelParameters::InferResultType::EMPTY) { LOG_IF_ERROR( TRITONBACKEND_ResponseSend(response, 0 /* flags */, error), "failed sending response"); From 53048e62e8bc6ce598653feb7f8b9230d20bcab5 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Tue, 13 Feb 2024 15:49:31 -0800 Subject: [PATCH 3/6] Move API parameters into a struct --- src/square.cc | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/square.cc b/src/square.cc index 5c498b4..59d9ce6 100644 --- a/src/square.cc +++ b/src/square.cc @@ -591,11 +591,18 @@ ModelInstanceState::RequestThread( } // Report response statistics. + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics = + new TRITONBACKEND_ModelInstanceResponseStatistics(); + response_statistics->model_instance = TritonModelInstance(); + response_statistics->response_factory = factory.get(); + response_statistics->response_start = response_start_ns; + response_statistics->compute_output_start = compute_output_start_ns; + response_statistics->response_end = response_end_ns; + response_statistics->error = error; RESPOND_FACTORY_AND_RETURN_IF_ERROR( - factory.get(), - TRITONBACKEND_ModelInstanceReportResponseStatistics( - TritonModelInstance(), factory.get(), response_start_ns, - compute_output_start_ns, response_end_ns, 0 /* flags */, error)); + factory.get(), TRITONBACKEND_ModelInstanceReportResponseStatistics( + response_statistics)); + delete response_statistics; // Delete error, if any. if (error != nullptr) { From 035d1008bdff38281486d24278670e54e9e98a3c Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Wed, 14 Feb 2024 17:37:31 -0800 Subject: [PATCH 4/6] Make API parameters struct opaque --- src/square.cc | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/src/square.cc b/src/square.cc index 59d9ce6..c67c3ca 100644 --- a/src/square.cc +++ b/src/square.cc @@ -591,18 +591,39 @@ ModelInstanceState::RequestThread( } // Report response statistics. 
-  TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics =
-      new TRITONBACKEND_ModelInstanceResponseStatistics();
-  response_statistics->model_instance = TritonModelInstance();
-  response_statistics->response_factory = factory.get();
-  response_statistics->response_start = response_start_ns;
-  response_statistics->compute_output_start = compute_output_start_ns;
-  response_statistics->response_end = response_end_ns;
-  response_statistics->error = error;
+  TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics;
+  RESPOND_FACTORY_AND_RETURN_IF_ERROR(
+      factory.get(),
+      TRITONBACKEND_ModelInstanceResponseStatisticsNew(&response_statistics));
+  RESPOND_FACTORY_AND_RETURN_IF_ERROR(
+      factory.get(),
+      TRITONBACKEND_ModelInstanceResponseStatisticsSetModelInstance(
+          response_statistics, TritonModelInstance()));
+  RESPOND_FACTORY_AND_RETURN_IF_ERROR(
+      factory.get(),
+      TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseFactory(
+          response_statistics, factory.get()));
+  RESPOND_FACTORY_AND_RETURN_IF_ERROR(
+      factory.get(),
+      TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseStart(
+          response_statistics, response_start_ns));
+  RESPOND_FACTORY_AND_RETURN_IF_ERROR(
+      factory.get(),
+      TRITONBACKEND_ModelInstanceResponseStatisticsSetComputeOutputStart(
+          response_statistics, compute_output_start_ns));
+  RESPOND_FACTORY_AND_RETURN_IF_ERROR(
+      factory.get(),
+      TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseEnd(
+          response_statistics, response_end_ns));
+  RESPOND_FACTORY_AND_RETURN_IF_ERROR(
+      factory.get(), TRITONBACKEND_ModelInstanceResponseStatisticsSetError(
+                         response_statistics, error));
   RESPOND_FACTORY_AND_RETURN_IF_ERROR(
       factory.get(), TRITONBACKEND_ModelInstanceReportResponseStatistics(
                          response_statistics));
-  delete response_statistics;
+  RESPOND_FACTORY_AND_RETURN_IF_ERROR(
+      factory.get(), TRITONBACKEND_ModelInstanceResponseStatisticsDelete(
                          response_statistics));

  // Delete error, if any.
  if (error != nullptr) {

From 54779566c6582f29eef47c7d42c0eb3baf40e2fc Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Thu, 15 Feb 2024 18:14:09 -0800
Subject: [PATCH 5/6] Add comment on where fail and empty count are from

---
 src/square.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/square.cc b/src/square.cc
index c67c3ca..a867ae0 100644
--- a/src/square.cc
+++ b/src/square.cc
@@ -157,7 +157,8 @@ ModelParameters::InferResult(size_t current_index, size_t element_count) const
   // Let N be the element count, F the fail count and E the empty count.
   // Return empty for indices in [N - E, N) and fail for indices in
   // [N - E - F, N - E). With proper N, F and E values, the infer result will
-  // first return success, then fail, and then empty.
+  // first return success, then fail, and then empty. See the constructor for
+  // how the fail and empty counts are read from the model config parameters.
if (current_index + custom_fail_count_ + custom_empty_count_ < element_count) { // [0, N - E - F) From 968431ab614356383e89e33b51f8153e92d899f0 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Fri, 16 Feb 2024 10:43:23 -0800 Subject: [PATCH 6/6] Group response statistics api calls --- src/square.cc | 82 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 49 insertions(+), 33 deletions(-) diff --git a/src/square.cc b/src/square.cc index a867ae0..c0c8ab2 100644 --- a/src/square.cc +++ b/src/square.cc @@ -396,6 +396,11 @@ class ModelInstanceState { void RequestThread( TRITONBACKEND_ResponseFactory* factory_ptr, const size_t element_count, uint32_t dims_count); + void ReportResponseStatistics( + TRITONBACKEND_ModelInstance* model_instance, + TRITONBACKEND_ResponseFactory* factory_ptr, + const uint64_t response_start_ns, const uint64_t compute_output_start_ns, + const uint64_t response_end_ns, TRITONSERVER_Error* error) const; ModelState* model_state_; TRITONBACKEND_ModelInstance* triton_model_instance_; @@ -592,39 +597,9 @@ ModelInstanceState::RequestThread( } // Report response statistics. - TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics; - RESPOND_FACTORY_AND_RETURN_IF_ERROR( - factory.get(), - TRITONBACKEND_ModelInstanceResponseStatisticsNew(&response_statistics)); - RESPOND_FACTORY_AND_RETURN_IF_ERROR( - factory.get(), - TRITONBACKEND_ModelInstanceResponseStatisticsSetModelInstance( - response_statistics, TritonModelInstance())); - RESPOND_FACTORY_AND_RETURN_IF_ERROR( - factory.get(), - TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseFactory( - response_statistics, factory.get())); - RESPOND_FACTORY_AND_RETURN_IF_ERROR( - factory.get(), - TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseStart( - response_statistics, response_start_ns)); - RESPOND_FACTORY_AND_RETURN_IF_ERROR( - factory.get(), - TRITONBACKEND_ModelInstanceResponseStatisticsSetComputeOutputStart( - response_statistics, compute_output_start_ns)); - RESPOND_FACTORY_AND_RETURN_IF_ERROR( - factory.get(), - TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseEnd( - response_statistics, response_end_ns)); - RESPOND_FACTORY_AND_RETURN_IF_ERROR( - factory.get(), TRITONBACKEND_ModelInstanceResponseStatisticsSetError( - response_statistics, error)); - RESPOND_FACTORY_AND_RETURN_IF_ERROR( - factory.get(), TRITONBACKEND_ModelInstanceReportResponseStatistics( - response_statistics)); - RESPOND_FACTORY_AND_RETURN_IF_ERROR( - factory.get(), TRITONBACKEND_ModelInstanceResponseStatisticsDelete( - response_statistics)); + ReportResponseStatistics( + TritonModelInstance(), factory.get(), response_start_ns, + compute_output_start_ns, response_end_ns, error); // Delete error, if any. 
if (error != nullptr) { @@ -660,6 +635,47 @@ ModelInstanceState::RequestThread( inflight_thread_count_--; } +void +ModelInstanceState::ReportResponseStatistics( + TRITONBACKEND_ModelInstance* model_instance, + TRITONBACKEND_ResponseFactory* factory_ptr, + const uint64_t response_start_ns, const uint64_t compute_output_start_ns, + const uint64_t response_end_ns, TRITONSERVER_Error* error) const +{ + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics; + RESPOND_FACTORY_AND_RETURN_IF_ERROR( + factory_ptr, + TRITONBACKEND_ModelInstanceResponseStatisticsNew(&response_statistics)); + RESPOND_FACTORY_AND_RETURN_IF_ERROR( + factory_ptr, + TRITONBACKEND_ModelInstanceResponseStatisticsSetModelInstance( + response_statistics, model_instance)); + RESPOND_FACTORY_AND_RETURN_IF_ERROR( + factory_ptr, + TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseFactory( + response_statistics, factory_ptr)); + RESPOND_FACTORY_AND_RETURN_IF_ERROR( + factory_ptr, + TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseStart( + response_statistics, response_start_ns)); + RESPOND_FACTORY_AND_RETURN_IF_ERROR( + factory_ptr, + TRITONBACKEND_ModelInstanceResponseStatisticsSetComputeOutputStart( + response_statistics, compute_output_start_ns)); + RESPOND_FACTORY_AND_RETURN_IF_ERROR( + factory_ptr, TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseEnd( + response_statistics, response_end_ns)); + RESPOND_FACTORY_AND_RETURN_IF_ERROR( + factory_ptr, TRITONBACKEND_ModelInstanceResponseStatisticsSetError( + response_statistics, error)); + RESPOND_FACTORY_AND_RETURN_IF_ERROR( + factory_ptr, + TRITONBACKEND_ModelInstanceReportResponseStatistics(response_statistics)); + RESPOND_FACTORY_AND_RETURN_IF_ERROR( + factory_ptr, + TRITONBACKEND_ModelInstanceResponseStatisticsDelete(response_statistics)); +} + ///////////// extern "C" {
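Note on usage (not part of the patches): the four CUSTOM_* keys parsed by
ModelParameters::ReadParameter in patch 1 are plain string-valued entries in
the parameters section of the model's config.pbtxt. The fragment below is an
illustrative sketch only; the keys come from the patches, but the numeric
values are invented. ReadParameter accepts only strings of digits (IsNumber
rejects anything else, leaving the default of 0):

  parameters: { key: "CUSTOM_INFER_DELAY_NS" value: { string_value: "1000000" } }
  parameters: { key: "CUSTOM_OUTPUT_DELAY_NS" value: { string_value: "500000" } }
  parameters: { key: "CUSTOM_FAIL_COUNT" value: { string_value: "1" } }
  parameters: { key: "CUSTOM_EMPTY_COUNT" value: { string_value: "1" } }

With this configuration and an IN value of 4 (so element_count N = 4, fail
count F = 1, empty count E = 1), each of the four simulated inferences is
preceded by a ~1 ms delay and each non-empty output adds a further ~0.5 ms.
The backend then sends two successful responses and one error response
("simulated failure"), and omits the final response entirely, while response
statistics are still reported for all four.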