From 80131b37e841b7566511b23a41ad9eb54c77ad92 Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Fri, 2 Feb 2024 18:50:05 -0800
Subject: [PATCH 01/14] Add response statistics

---
 src/grpc/grpc_server.cc | 92 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 91 insertions(+), 1 deletion(-)

diff --git a/src/grpc/grpc_server.cc b/src/grpc/grpc_server.cc
index ebe53c82e0..187272217d 100644
--- a/src/grpc/grpc_server.cc
+++ b/src/grpc/grpc_server.cc
@@ -1,4 +1,4 @@
-// Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -1100,6 +1100,96 @@ CommonHandler::RegisterModelStatistics()
           ucnt);
     }
 
+    {
+      triton::common::TritonJson::Value responses_json;
+      err = model_stat.MemberAsObject("response_stats", &responses_json);
+      GOTO_IF_ERR(err, earlyexit);
+
+      std::vector<std::string> keys;
+      err = responses_json.Members(&keys);
+      GOTO_IF_ERR(err, earlyexit);
+
+      for (const auto& key : keys) {
+        triton::common::TritonJson::Value res_json;
+        err = responses_json.MemberAsObject(key.c_str(), &res_json);
+        GOTO_IF_ERR(err, earlyexit);
+
+        inference::InferResponseStatistics res;
+
+        {
+          triton::common::TritonJson::Value stat_json;
+          err = res_json.MemberAsObject("compute_infer", &stat_json);
+          GOTO_IF_ERR(err, earlyexit);
+
+          uint64_t val;
+          err = stat_json.MemberAsUInt("count", &val);
+          GOTO_IF_ERR(err, earlyexit);
+          res.mutable_compute_infer()->set_count(val);
+          err = stat_json.MemberAsUInt("ns", &val);
+          GOTO_IF_ERR(err, earlyexit);
+          res.mutable_compute_infer()->set_ns(val);
+        }
+
+        {
+          triton::common::TritonJson::Value stat_json;
+          err = res_json.MemberAsObject("compute_output", &stat_json);
+          GOTO_IF_ERR(err, earlyexit);
+
+          uint64_t val;
+          err = stat_json.MemberAsUInt("count", &val);
+          GOTO_IF_ERR(err, earlyexit);
+          res.mutable_compute_output()->set_count(val);
+          err = stat_json.MemberAsUInt("ns", &val);
+          GOTO_IF_ERR(err, earlyexit);
+          res.mutable_compute_output()->set_ns(val);
+        }
+
+        {
+          triton::common::TritonJson::Value stat_json;
+          err = res_json.MemberAsObject("success", &stat_json);
+          GOTO_IF_ERR(err, earlyexit);
+
+          uint64_t val;
+          err = stat_json.MemberAsUInt("count", &val);
+          GOTO_IF_ERR(err, earlyexit);
+          res.mutable_success()->set_count(val);
+          err = stat_json.MemberAsUInt("ns", &val);
+          GOTO_IF_ERR(err, earlyexit);
+          res.mutable_success()->set_ns(val);
+        }
+
+        {
+          triton::common::TritonJson::Value stat_json;
+          err = res_json.MemberAsObject("fail", &stat_json);
+          GOTO_IF_ERR(err, earlyexit);
+
+          uint64_t val;
+          err = stat_json.MemberAsUInt("count", &val);
+          GOTO_IF_ERR(err, earlyexit);
+          res.mutable_fail()->set_count(val);
+          err = stat_json.MemberAsUInt("ns", &val);
+          GOTO_IF_ERR(err, earlyexit);
+          res.mutable_fail()->set_ns(val);
+        }
+
+        {
+          triton::common::TritonJson::Value stat_json;
+          err = res_json.MemberAsObject("empty_response", &stat_json);
+          GOTO_IF_ERR(err, earlyexit);
+
+          uint64_t val;
+          err = stat_json.MemberAsUInt("count", &val);
+          GOTO_IF_ERR(err, earlyexit);
+          res.mutable_empty_response()->set_count(val);
+          err = stat_json.MemberAsUInt("ns", &val);
+          GOTO_IF_ERR(err, earlyexit);
+          res.mutable_empty_response()->set_ns(val);
+        }
+
+        (*statistics->mutable_response_stats())[key] = std::move(res);
+      }
+    }
+
     triton::common::TritonJson::Value batches_json;
     err = model_stat.MemberAsArray("batch_stats",
&batches_json); GOTO_IF_ERR(err, earlyexit); From 8764fa9daadb67d76f667959149a97b5ab560f2e Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Tue, 6 Feb 2024 16:24:45 -0800 Subject: [PATCH 02/14] Add L0_response_statistics --- .../response_statistics_test.py | 164 ++++++++++++++++++ qa/L0_response_statistics/test.sh | 89 ++++++++++ 2 files changed, 253 insertions(+) create mode 100755 qa/L0_response_statistics/response_statistics_test.py create mode 100755 qa/L0_response_statistics/test.sh diff --git a/qa/L0_response_statistics/response_statistics_test.py b/qa/L0_response_statistics/response_statistics_test.py new file mode 100755 index 0000000000..270f592824 --- /dev/null +++ b/qa/L0_response_statistics/response_statistics_test.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import concurrent.futures +import time +import unittest + +import numpy as np +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +from tritonclient.utils import InferenceServerException + + +class TestResponseStatistics(unittest.TestCase): + def setUp(self): + self._model_name = "square_int32" + self._min_infer_delay_ns = 400000000 + self._min_output_delay_ns = 200000000 + self._number_of_fail_responses = 2 + self._number_of_empty_responses = 1 + self._statistics_counts = [] + self._grpc_client = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + self._http_client = httpclient.InferenceServerClient("localhost:8000") + + def _generate_streaming_callback_and_response_pair(self): + response = [] # [{"result": result, "error": error}, ...] 
+ + def callback(result, error): + response.append({"result": result, "error": error}) + + return callback, response + + def _stream_infer(self, number_of_responses): + callback, responses = self._generate_streaming_callback_and_response_pair() + self._grpc_client.start_stream(callback) + input_data = np.array([number_of_responses], dtype=np.int32) + inputs = [grpcclient.InferInput("IN", input_data.shape, "INT32")] + inputs[0].set_data_from_numpy(input_data) + outputs = [grpcclient.InferRequestedOutput("OUT")] + self._grpc_client.async_stream_infer( + model_name=self._model_name, inputs=inputs, outputs=outputs + ) + while len(responses) < (number_of_responses - self._number_of_empty_responses): + time.sleep(0.1) # poll until all expected responses are received + self._grpc_client.stop_stream() + return responses + + def _update_statistics_counts(self, current_index, number_of_responses): + if current_index >= len(self._statistics_counts): + self._statistics_counts.append( + { + "compute_infer": 0, + "compute_output": 0, + "success": 0, + "fail": 0, + "empty_response": 0, + } + ) + if ( + current_index + + self._number_of_fail_responses + + self._number_of_empty_responses + < number_of_responses + ): + # success + self._statistics_counts[current_index]["compute_infer"] += 1 + self._statistics_counts[current_index]["compute_output"] += 1 + self._statistics_counts[current_index]["success"] += 1 + elif current_index + self._number_of_empty_responses < number_of_responses: + # fail + self._statistics_counts[current_index]["compute_infer"] += 1 + self._statistics_counts[current_index]["compute_output"] += 1 + self._statistics_counts[current_index]["fail"] += 1 + else: + # empty + self._statistics_counts[current_index]["compute_infer"] += 1 + self._statistics_counts[current_index]["empty_response"] += 1 + + def _check_statistics_count_and_duration( + self, response_stats, current_index, stats_name + ): + expected_count = self._statistics_counts[current_index][stats_name] + if stats_name == "compute_infer" or stats_name == "empty_response": + delay_ns = self._min_infer_delay_ns + elif stats_name == "compute_output": + delay_ns = self._min_output_delay_ns + else: # success or fail + delay_ns = self._min_infer_delay_ns + self._min_output_delay_ns + upper_bound_ns = 1.01 * delay_ns * expected_count + lower_bound_ns = 0.99 * delay_ns * expected_count + stats = response_stats[str(current_index)][stats_name] + self.assertEqual(stats["count"], expected_count) + self.assertLessEqual(stats["ns"], upper_bound_ns) + self.assertGreaterEqual(stats["ns"], lower_bound_ns) + + def _check_response_stats(self, responses, number_of_responses): + statistics_grpc = self._grpc_client.get_inference_statistics( + model_name=self._model_name, as_json=True + ) + statistics_http = self._http_client.get_inference_statistics( + model_name=self._model_name + ) + # self.assertEqual(statistics_grpc, statistics_http) + model_stats = statistics_http["model_stats"][0] + self.assertEqual(model_stats["name"], self._model_name) + response_stats = model_stats["response_stats"] + self.assertGreaterEqual(len(response_stats), number_of_responses) + for i in range(number_of_responses): + self._update_statistics_counts(i, number_of_responses) + self._check_statistics_count_and_duration( + response_stats, i, "compute_infer" + ) + self._check_statistics_count_and_duration( + response_stats, i, "compute_output" + ) + self._check_statistics_count_and_duration(response_stats, i, "success") + 
self._check_statistics_count_and_duration(response_stats, i, "fail") + self._check_statistics_count_and_duration( + response_stats, i, "empty_response" + ) + + def test_response_statistics(self): + number_of_responses = 4 + responses = self._stream_infer(number_of_responses) + self._check_response_stats(responses, number_of_responses) + + number_of_responses = 6 + responses = self._stream_infer(number_of_responses) + self._check_response_stats(responses, number_of_responses) + + number_of_responses = 3 + responses = self._stream_infer(number_of_responses) + self._check_response_stats(responses, number_of_responses) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_response_statistics/test.sh b/qa/L0_response_statistics/test.sh new file mode 100755 index 0000000000..c65f333401 --- /dev/null +++ b/qa/L0_response_statistics/test.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! 
-z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +SERVER=/opt/tritonserver/bin/tritonserver +source ../common/util.sh + +RET=0 + +rm -rf models && mkdir models +mkdir -p models/square_int32/1 && (cd models/square_int32 && \ + echo 'name: "square_int32"' >> config.pbtxt && \ + echo 'backend: "square"' >> config.pbtxt && \ + echo 'max_batch_size: 0' >> config.pbtxt && \ + echo 'model_transaction_policy { decoupled: True }' >> config.pbtxt && \ + echo -e 'input [{ name: "IN" \n data_type: TYPE_INT32 \n dims: [ 1 ] }]' >> config.pbtxt && \ + echo -e 'output [{ name: "OUT" \n data_type: TYPE_INT32 \n dims: [ 1 ] }]' >> config.pbtxt && \ + echo -e 'parameters [{ key: "CUSTOM_INFER_DELAY_NS" \n value: { string_value: "400000000" } }]' >> config.pbtxt && \ + echo -e 'parameters [{ key: "CUSTOM_OUTPUT_DELAY_NS" \n value: { string_value: "200000000" } }]' >> config.pbtxt && \ + echo -e 'parameters [{ key: "CUSTOM_FAIL_COUNT" \n value: { string_value: "2" } }]' >> config.pbtxt && \ + echo -e 'parameters [{ key: "CUSTOM_EMPTY_COUNT" \n value: { string_value: "1" } }]' >> config.pbtxt) + +TEST_LOG="response_statistics_test.log" +SERVER_LOG="./response_statistics_test.server.log" + +SERVER_ARGS="--model-repository=`pwd`/models" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python response_statistics_test.py > $TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed response statistics test\n***" + cat $TEST_LOG + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi +exit $RET From c19af7c600ee327b39d479bd7cc5ed45a00f39b1 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Tue, 6 Feb 2024 16:47:49 -0800 Subject: [PATCH 03/14] Enable http vs grpc statistics comparison --- .../response_statistics_test.py | 45 ++++++++++++++----- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/qa/L0_response_statistics/response_statistics_test.py b/qa/L0_response_statistics/response_statistics_test.py index 270f592824..7996e655e6 100755 --- a/qa/L0_response_statistics/response_statistics_test.py +++ b/qa/L0_response_statistics/response_statistics_test.py @@ -26,14 +26,12 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import concurrent.futures
 import time
 import unittest
 
 import numpy as np
 import tritonclient.grpc as grpcclient
 import tritonclient.http as httpclient
-from tritonclient.utils import InferenceServerException
 
 
 class TestResponseStatistics(unittest.TestCase):
@@ -113,24 +111,47 @@ def _check_statistics_count_and_duration(
             delay_ns = self._min_output_delay_ns
         else:  # success or fail
             delay_ns = self._min_infer_delay_ns + self._min_output_delay_ns
-        upper_bound_ns = 1.01 * delay_ns * expected_count
-        lower_bound_ns = 0.99 * delay_ns * expected_count
+        upper_bound_ns = 1.1 * delay_ns * expected_count
+        lower_bound_ns = 0.9 * delay_ns * expected_count
         stats = response_stats[str(current_index)][stats_name]
         self.assertEqual(stats["count"], expected_count)
         self.assertLessEqual(stats["ns"], upper_bound_ns)
         self.assertGreaterEqual(stats["ns"], lower_bound_ns)
 
-    def _check_response_stats(self, responses, number_of_responses):
-        statistics_grpc = self._grpc_client.get_inference_statistics(
-            model_name=self._model_name, as_json=True
-        )
+    def _get_response_statistics(self):
+        # http response statistics
         statistics_http = self._http_client.get_inference_statistics(
             model_name=self._model_name
         )
-        # self.assertEqual(statistics_grpc, statistics_http)
-        model_stats = statistics_http["model_stats"][0]
-        self.assertEqual(model_stats["name"], self._model_name)
-        response_stats = model_stats["response_stats"]
+        model_stats_http = statistics_http["model_stats"][0]
+        self.assertEqual(model_stats_http["name"], self._model_name)
+        response_stats_http = model_stats_http["response_stats"]
+        # grpc response statistics
+        statistics_grpc = self._grpc_client.get_inference_statistics(
+            model_name=self._model_name, as_json=True
+        )
+        model_stats_grpc = statistics_grpc["model_stats"][0]
+        self.assertEqual(model_stats_grpc["name"], self._model_name)
+        response_stats_grpc = model_stats_grpc["response_stats"]
+        # check equivalence between http and grpc statistics
+        self.assertEqual(len(response_stats_http), len(response_stats_grpc))
+        for idx, statistics_http in response_stats_http.items():
+            self.assertIn(idx, response_stats_grpc)
+            statistics_grpc = response_stats_grpc[idx]
+            for name, stats_http in statistics_http.items():
+                self.assertIn(name, statistics_grpc)
+                stats_grpc = statistics_grpc[name]
+                # normalize gRPC statistics to http
+                stats_grpc["count"] = (
+                    int(stats_grpc["count"]) if ("count" in stats_grpc) else 0
+                )
+                stats_grpc["ns"] = int(stats_grpc["ns"]) if ("ns" in stats_grpc) else 0
+                # check equal
+                self.assertEqual(stats_http, stats_grpc)
+        return response_stats_http
+
+    def _check_response_stats(self, responses, number_of_responses):
+        response_stats = self._get_response_statistics()
         self.assertGreaterEqual(len(response_stats), number_of_responses)
         for i in range(number_of_responses):
             self._update_statistics_counts(i, number_of_responses)

From d6527f2411cf552ba082631aa0a9136cc8290a63 Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Thu, 8 Feb 2024 10:34:20 -0800
Subject: [PATCH 04/14] Add docs for response statistics protocol

---
 docs/protocol/extension_statistics.md | 55 +++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/docs/protocol/extension_statistics.md b/docs/protocol/extension_statistics.md
index 040f165dde..335611546a 100644
--- a/docs/protocol/extension_statistics.md
+++ b/docs/protocol/extension_statistics.md
@@ -78,6 +78,7 @@ $model_stat =
   "inference_count" : $number,
   "execution_count" : $number,
   "inference_stats" : $inference_stats,
+  "response_stats" : { $string : $response_stat, ... },
   "batch_stats" : [ $batch_stat, ... ],
   "memory_usage" : [ $memory_usage, ...]
 }
@@ -114,6 +115,12 @@ $model_stat =
   model/version. So, for example, "inference_stats":"success"
   indicates the number of successful inference requests for the model.
 
+- "response_stats" : The aggregate decoupled response statistics for the
+  model/version. For example, { "key" : { "response_stats" : "success" } }
+  indicates the aggregate statistics of successful decoupled response at "key"
+  for the model, where "key" identifies between different decoupled responses
+  generated by the model.
+
 - "batch_stats" : The aggregate statistics for each different batch
   size that is executed in the model. The batch statistics indicate
   how many actual model executions were performed and show differences
@@ -180,6 +187,28 @@ $inference_stats =
   from the response object to the Response Cache.
 
 
+```
+$response_stats =
+{
+  "compute_infer" : $duration_stat,
+  "compute_output" : $duration_stat,
+  "success" : $duration_stat,
+  "fail" : $duration_stat,
+  "empty_response" : $duration_stat
+}
+```
+
+- "compute_infer" : The count and cumulative duration to compute a response.
+- "compute_output" : The count and cumulative duration to extract the output
+  tensor of a computed response.
+- "success" : The count and cumulative duration of a successful inference.
+  The duration is the sum of infer and output durations.
+- "fail" : The count and cumulative duration of a failed inference. The
+  duration is the sum of infer and output durations.
+- "empty_response" : The count and cumulative duration of an inference with an
+  empty / no response. The duration only includes the infer duration.
+
+
 ```
 $batch_stats =
 {
@@ -360,6 +389,12 @@ message ModelStatistics
   // point, the GPU memory usage for models in ONNX Runtime backend and TensorRT
   // backend is usually aligned.
   repeated MemoryUsage memory_usage = 8;
+
+  // The key and value pairs for all decoupled response statistics. The key is
+  // a string identifying a set of response statistics aggregated together (i.e.
+  // index of the response sent). The value is the aggregated response
+  // statistics.
+  map<string, InferResponseStatistics> response_stats = 9;
 }
 
 // Inference statistics.
@@ -428,6 +463,26 @@ message InferStatistics
   StatisticDuration cache_miss = 8;
 }
 
+// Statistics per decoupled response.
+message InferResponseStatistics
+{
+  // The count and cumulative duration to compute a response.
+  StatisticDuration compute_infer = 1;
+
+  // The count and cumulative duration to extract the output tensors of a
+  // response.
+  StatisticDuration compute_output = 2;
+
+  // The count and cumulative duration for successful responses.
+  StatisticDuration success = 3;
+
+  // The count and cumulative duration for failed responses.
+  StatisticDuration fail = 4;
+
+  // The count and cumulative duration for empty responses.
+  StatisticDuration empty_response = 5;
+}
+
 // Inference batch statistics.
message InferBatchStatistics { From b1fe5e776d33c64aa50b8560a28eea36f0679ad8 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Thu, 8 Feb 2024 11:06:52 -0800 Subject: [PATCH 05/14] Add more comments for response statistics test --- .../response_statistics_test.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/qa/L0_response_statistics/response_statistics_test.py b/qa/L0_response_statistics/response_statistics_test.py index 7996e655e6..b7b02a956c 100755 --- a/qa/L0_response_statistics/response_statistics_test.py +++ b/qa/L0_response_statistics/response_statistics_test.py @@ -47,14 +47,19 @@ def setUp(self): ) self._http_client = httpclient.InferenceServerClient("localhost:8000") + # Return a coupled (callback, response) pair for gRPC stream infer. def _generate_streaming_callback_and_response_pair(self): - response = [] # [{"result": result, "error": error}, ...] + # [{"result": result, "error": error}, ...] + response = [] def callback(result, error): response.append({"result": result, "error": error}) return callback, response + # Send an infer request and return its responses. 'number_of_responses' is the sum + # of success, fail and empty responses the model should return for this request. + # This function waits until all success and fail responses are received. def _stream_infer(self, number_of_responses): callback, responses = self._generate_streaming_callback_and_response_pair() self._grpc_client.start_stream(callback) @@ -70,6 +75,9 @@ def _stream_infer(self, number_of_responses): self._grpc_client.stop_stream() return responses + # Update expected statistics counts for the response at 'current_index'. + # 'number_of_responses' is the sum of success, fail and empty responses expected + # from this inference request. def _update_statistics_counts(self, current_index, number_of_responses): if current_index >= len(self._statistics_counts): self._statistics_counts.append( @@ -101,6 +109,7 @@ def _update_statistics_counts(self, current_index, number_of_responses): self._statistics_counts[current_index]["compute_infer"] += 1 self._statistics_counts[current_index]["empty_response"] += 1 + # Check the 'response_stats' at 'current_index' for 'stats_name' is valid. def _check_statistics_count_and_duration( self, response_stats, current_index, stats_name ): @@ -118,6 +127,8 @@ def _check_statistics_count_and_duration( self.assertLessEqual(stats["ns"], upper_bound_ns) self.assertGreaterEqual(stats["ns"], lower_bound_ns) + # Fetch and return the response statistics from both gRPC and HTTP endpoints, and + # check they are equivalent before returning. def _get_response_statistics(self): # http response statistics statistics_http = self._http_client.get_inference_statistics( @@ -150,6 +161,8 @@ def _get_response_statistics(self): self.assertEqual(stats_http, stats_grpc) return response_stats_http + # Check the response statistics is valid for a given infer request, providing its + # 'responses' and 'number_of_responses'. def _check_response_stats(self, responses, number_of_responses): response_stats = self._get_response_statistics() self.assertGreaterEqual(len(response_stats), number_of_responses) @@ -167,6 +180,7 @@ def _check_response_stats(self, responses, number_of_responses): response_stats, i, "empty_response" ) + # Test response statistics. The statistics must be valid over two or more infers. 
def test_response_statistics(self): number_of_responses = 4 responses = self._stream_infer(number_of_responses) From 13fba82d7a28c006e718b102302bf3ba0640ac9a Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Thu, 8 Feb 2024 12:10:50 -0800 Subject: [PATCH 06/14] Remove model name from config --- qa/L0_response_statistics/test.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/qa/L0_response_statistics/test.sh b/qa/L0_response_statistics/test.sh index c65f333401..eae900a9e9 100755 --- a/qa/L0_response_statistics/test.sh +++ b/qa/L0_response_statistics/test.sh @@ -47,7 +47,6 @@ RET=0 rm -rf models && mkdir models mkdir -p models/square_int32/1 && (cd models/square_int32 && \ - echo 'name: "square_int32"' >> config.pbtxt && \ echo 'backend: "square"' >> config.pbtxt && \ echo 'max_batch_size: 0' >> config.pbtxt && \ echo 'model_transaction_policy { decoupled: True }' >> config.pbtxt && \ From aa72d17a95ca8d5c152f04f1153df68a21ba6df6 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Thu, 8 Feb 2024 12:38:08 -0800 Subject: [PATCH 07/14] Improve docs wordings --- docs/protocol/extension_statistics.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/protocol/extension_statistics.md b/docs/protocol/extension_statistics.md index 335611546a..246b014112 100644 --- a/docs/protocol/extension_statistics.md +++ b/docs/protocol/extension_statistics.md @@ -112,14 +112,15 @@ $model_stat = DOES NOT include cache hits. - "inference_stats" : The aggregate statistics for the - model/version. So, for example, "inference_stats":"success" - indicates the number of successful inference requests for the model. + model. So, for example, "inference_stats":"success" indicates the number of + successful inference requests for the model. - "response_stats" : The aggregate decoupled response statistics for the - model/version. For example, { "key" : { "response_stats" : "success" } } - indicates the aggregate statistics of successful decoupled response at "key" - for the model, where "key" identifies between different decoupled responses - generated by the model. + model. For example, { "key" : { "response_stats" : "success" } } indicates the + aggregate statistics of successful decoupled response at "key" for the model, + where "key" identifies between different decoupled responses generated by the + model. It is advised to check with the model backend on the set of keys it can + provide and what each of them means. - "batch_stats" : The aggregate statistics for each different batch size that is executed in the model. The batch statistics indicate @@ -372,7 +373,7 @@ message ModelStatistics // The "execution_count" value DOES NOT include cache hits. uint64 execution_count = 5; - // The aggregate statistics for the model/version. + // The aggregate statistics for the model. 
InferStatistics inference_stats = 6; // The aggregate statistics for each different batch size that is From 448e54c5739c560400dd3955097047089357f1c5 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Thu, 8 Feb 2024 12:40:11 -0800 Subject: [PATCH 08/14] [Continue] Improve docs wordings --- docs/protocol/extension_statistics.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/protocol/extension_statistics.md b/docs/protocol/extension_statistics.md index 246b014112..2053da35fe 100644 --- a/docs/protocol/extension_statistics.md +++ b/docs/protocol/extension_statistics.md @@ -120,7 +120,7 @@ $model_stat = aggregate statistics of successful decoupled response at "key" for the model, where "key" identifies between different decoupled responses generated by the model. It is advised to check with the model backend on the set of keys it can - provide and what each of them means. + provide and the meaning of each key. - "batch_stats" : The aggregate statistics for each different batch size that is executed in the model. The batch statistics indicate From 153cb1635154b375ea7a3a02e3544214a06baf1c Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Tue, 13 Feb 2024 16:07:15 -0800 Subject: [PATCH 09/14] [Continue] Add more comments for response statistics test --- qa/L0_response_statistics/response_statistics_test.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/qa/L0_response_statistics/response_statistics_test.py b/qa/L0_response_statistics/response_statistics_test.py index b7b02a956c..450827c891 100755 --- a/qa/L0_response_statistics/response_statistics_test.py +++ b/qa/L0_response_statistics/response_statistics_test.py @@ -182,14 +182,17 @@ def _check_response_stats(self, responses, number_of_responses): # Test response statistics. The statistics must be valid over two or more infers. def test_response_statistics(self): + # Send a request that generates 4 responses. number_of_responses = 4 responses = self._stream_infer(number_of_responses) self._check_response_stats(responses, number_of_responses) - + # Send a request that generates 6 responses, and make sure the + # statistics are aggregrated with the previous request. number_of_responses = 6 responses = self._stream_infer(number_of_responses) self._check_response_stats(responses, number_of_responses) - + # Send a request that generates 3 responses, and make sure the + # statistics are aggregrated with the previous requests. number_of_responses = 3 responses = self._stream_infer(number_of_responses) self._check_response_stats(responses, number_of_responses) From 8ec960d8877ea63076ed0b0e7eeb4526fb1fee2b Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Tue, 13 Feb 2024 16:37:29 -0800 Subject: [PATCH 10/14] [Continue 2] Improve docs wordings --- docs/protocol/extension_statistics.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/protocol/extension_statistics.md b/docs/protocol/extension_statistics.md index 2053da35fe..5e0f550212 100644 --- a/docs/protocol/extension_statistics.md +++ b/docs/protocol/extension_statistics.md @@ -117,10 +117,11 @@ $model_stat = - "response_stats" : The aggregate decoupled response statistics for the model. 
For example, { "key" : { "response_stats" : "success" } } indicates the - aggregate statistics of successful decoupled response at "key" for the model, - where "key" identifies between different decoupled responses generated by the - model. It is advised to check with the model backend on the set of keys it can - provide and the meaning of each key. + aggregate statistics of successful decoupled responses at "key" for the model, + where "key" identifies each decoupled response generated by the model across + different requests. For example, given a model that generates three responses, + the key could be "0", "1" and "2" identifying the three responses in order. + Check with the model backend on the keys it can return and their meanings. - "batch_stats" : The aggregate statistics for each different batch size that is executed in the model. The batch statistics indicate From 0c97795c7319bdb9134637a1ee8eb17ebde80926 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Tue, 13 Feb 2024 16:41:23 -0800 Subject: [PATCH 11/14] Fix typo --- qa/L0_response_statistics/response_statistics_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qa/L0_response_statistics/response_statistics_test.py b/qa/L0_response_statistics/response_statistics_test.py index 450827c891..b04403bfb3 100755 --- a/qa/L0_response_statistics/response_statistics_test.py +++ b/qa/L0_response_statistics/response_statistics_test.py @@ -187,12 +187,12 @@ def test_response_statistics(self): responses = self._stream_infer(number_of_responses) self._check_response_stats(responses, number_of_responses) # Send a request that generates 6 responses, and make sure the - # statistics are aggregrated with the previous request. + # statistics are aggregated with the previous request. number_of_responses = 6 responses = self._stream_infer(number_of_responses) self._check_response_stats(responses, number_of_responses) # Send a request that generates 3 responses, and make sure the - # statistics are aggregrated with the previous requests. + # statistics are aggregated with the previous requests. number_of_responses = 3 responses = self._stream_infer(number_of_responses) self._check_response_stats(responses, number_of_responses) From d09511c8bbee7640928b64b31243a1ba2d938b7d Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Fri, 16 Feb 2024 11:15:36 -0800 Subject: [PATCH 12/14] Remove mentioning decoupled from docs --- docs/protocol/extension_statistics.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/protocol/extension_statistics.md b/docs/protocol/extension_statistics.md index 5e0f550212..d0beb771d8 100644 --- a/docs/protocol/extension_statistics.md +++ b/docs/protocol/extension_statistics.md @@ -115,13 +115,13 @@ $model_stat = model. So, for example, "inference_stats":"success" indicates the number of successful inference requests for the model. -- "response_stats" : The aggregate decoupled response statistics for the - model. For example, { "key" : { "response_stats" : "success" } } indicates the - aggregate statistics of successful decoupled responses at "key" for the model, - where "key" identifies each decoupled response generated by the model across - different requests. For example, given a model that generates three responses, - the key could be "0", "1" and "2" identifying the three responses in order. - Check with the model backend on the keys it can return and their meanings. 
+- "response_stats" : The aggregate response statistics for the model. For
+  example, { "key" : { "response_stats" : "success" } } indicates the aggregate
+  statistics of successful responses at "key" for the model, where "key"
+  identifies each response generated by the model across different requests. For
+  example, given a model that generates three responses, the key could be "0",
+  "1" and "2" identifying the three responses in order. Check with the model
+  backend on the keys it can return and their meanings.
 
 - "batch_stats" : The aggregate statistics for each different batch
   size that is executed in the model. The batch statistics indicate

From d8bd46815f19233ee80ad977fc59a1aa7872af5f Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Fri, 16 Feb 2024 11:28:39 -0800
Subject: [PATCH 13/14] [Continue 3] Improve docs wordings

---
 docs/protocol/extension_statistics.md | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/docs/protocol/extension_statistics.md b/docs/protocol/extension_statistics.md
index d0beb771d8..2a45a246a3 100644
--- a/docs/protocol/extension_statistics.md
+++ b/docs/protocol/extension_statistics.md
@@ -119,9 +119,8 @@ $model_stat =
   example, { "key" : { "response_stats" : "success" } } indicates the aggregate
   statistics of successful responses at "key" for the model, where "key"
   identifies each response generated by the model across different requests. For
-  example, given a model that generates three responses, the key could be "0",
-  "1" and "2" identifying the three responses in order. Check with the model
-  backend on the keys it can return and their meanings.
+  example, given a model that generates three responses, the keys can be "0",
+  "1" and "2" identifying the three responses in order.
 
 - "batch_stats" : The aggregate statistics for each different batch
   size that is executed in the model. The batch statistics indicate

From 3380db371d4a659afdec5b1928991f1231862db0 Mon Sep 17 00:00:00 2001
From: Jacky <18255193+kthui@users.noreply.github.com>
Date: Fri, 16 Feb 2024 17:09:58 -0800
Subject: [PATCH 14/14] [Continue 4] Improve docs wordings

Co-authored-by: Ryan McCormick
---
 docs/protocol/extension_statistics.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/protocol/extension_statistics.md b/docs/protocol/extension_statistics.md
index 2a45a246a3..4da4140657 100644
--- a/docs/protocol/extension_statistics.md
+++ b/docs/protocol/extension_statistics.md
@@ -78,8 +78,8 @@ $model_stat =
   "inference_count" : $number,
   "execution_count" : $number,
   "inference_stats" : $inference_stats,
-  "response_stats" : { $string : $response_stat, ... },
-  "batch_stats" : [ $batch_stat, ... ],
+  "response_stats" : { $string : $response_stats, ... },
+  "batch_stats" : [ $batch_stats, ... ],
   "memory_usage" : [ $memory_usage, ...]
 }
 ```
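For reference, a minimal sketch (not part of the patches themselves) of how a client could read the per-response statistics added by this series, using the Python HTTP client the same way the L0 test above does. It assumes a server built with these changes is listening on localhost:8000 and serving the decoupled `square_int32` model from `qa/L0_response_statistics`, and it follows the `$response_stats` layout documented in `extension_statistics.md`.

```python
import tritonclient.http as httpclient

# Assumed setup: a Triton server with this patch series applied, reachable at
# localhost:8000 and serving the decoupled "square_int32" model used by the
# L0_response_statistics test.
client = httpclient.InferenceServerClient("localhost:8000")
stats = client.get_inference_statistics(model_name="square_int32")

# "response_stats" maps a key (for the square backend, the index of the
# response within a request) to per-response duration statistics.
response_stats = stats["model_stats"][0]["response_stats"]
for key, stat in response_stats.items():
    success = stat["success"]
    fail = stat["fail"]
    empty = stat["empty_response"]
    print(
        f"response {key}: success={success['count']} ({success['ns']} ns), "
        f"fail={fail['count']} ({fail['ns']} ns), empty={empty['count']}"
    )
```

The gRPC endpoint exposes the same data through the new `InferResponseStatistics` message; when fetched with `as_json=True` its `count`/`ns` fields come back as strings, which is why the test above normalizes them with `int()` before comparing against the HTTP results.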