refactor: Update the response queue in the server to reuse response slots #7879

Open · wants to merge 8 commits into main
41 changes: 40 additions & 1 deletion qa/L0_decoupled/test.sh
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -127,6 +127,45 @@ for trial in $TRIALS; do

kill $SERVER_PID
wait $SERVER_PID

SERVER_ARGS="--model-repository=$MODELDIR --grpc-max-response-pool-size=1"
[Review comment] indrajit96 (Contributor), Feb 4, 2025:
Can you add a test plan to the description covering what we are trying to test here? Why have we set --grpc-max-response-pool-size to only 1? Also, can we add a test to confirm that the memory footprint decreases when using --grpc-max-response-pool-size versus not using it?

SERVER_LOG="grpc_max_response_pool_size_1_${trial}_server.log"
CLIENT_LOG="grpc_max_response_pool_size_1_${trial}_client.log"
run_server
if [ "$SERVER_PID" == "0" ]; then
echo -e "\n***\n*** Failed to start $SERVER\n***"
cat $SERVER_LOG
exit 1
fi

for test in \
test_one_to_none \
test_one_to_one \
test_one_to_many \
test_no_streaming \
test_response_order \
test_wrong_shape; do

echo "Test: $test" >>$CLIENT_LOG
set +e
python $DECOUPLED_TEST DecoupledTest.$test >>$CLIENT_LOG 2>&1
if [ $? -ne 0 ]; then
echo -e "\n***\n*** Test grpc-max-response-pool-size=1 ${trial} - $test Failed\n***" >>$CLIENT_LOG
echo -e "\n***\n*** Test grpc-max-response-pool-size=1 ${trial} - $test Failed\n***"
RET=1
else
check_test_results $TEST_RESULT_FILE 1
if [ $? -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Test Result Verification Failed\n***"
RET=1
fi
fi
set -e
done

kill $SERVER_PID
wait $SERVER_PID
done

# Test the server frontend can merge the responses of non-decoupled model that
16 changes: 15 additions & 1 deletion src/command_line_parser.cc
@@ -1,4 +1,4 @@
// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -306,6 +306,7 @@ enum TritonOptionId {
OPTION_GRPC_ADDRESS,
OPTION_GRPC_HEADER_FORWARD_PATTERN,
OPTION_GRPC_INFER_ALLOCATION_POOL_SIZE,
OPTION_GRPC_MAX_RESPONSE_POOL_SIZE,
OPTION_GRPC_USE_SSL,
OPTION_GRPC_USE_SSL_MUTUAL,
OPTION_GRPC_SERVER_CERT,
@@ -536,6 +537,11 @@ TritonParser::SetupOptions()
"allocated for reuse. As long as the number of in-flight requests "
"doesn't exceed this value there will be no allocation/deallocation of "
"request/response objects."});
grpc_options_.push_back(
{OPTION_GRPC_MAX_RESPONSE_POOL_SIZE, "grpc-max-response-pool-size",
[Review comment] tanmayv25 (Contributor):
Why do we need an extra argument for this? Why not reuse grpc-infer-allocation-pool-size above?

[Reply] pskiran1 (Member, Author), Feb 5, 2025:
@tanmayv25, we use the grpc-infer-allocation-pool-size parameter as the limit for the state bucket and state reuse, and its default value is only 8. In contrast, the response queue threshold needs a higher maximum value to deliver good performance by default (the current behavior). If we applied the same option to the response queue threshold, users would always need to raise grpc-infer-allocation-pool-size to get good default performance. The major drawback is that setting that value too high also increases the size of the state bucket, so states are not deleted; for example, a value of 200 keeps 200 states alive for reuse. Please correct me if my understanding is wrong. Thank you.

[Reply] pskiran1 (Member, Author), Feb 5, 2025:
Considering the above, I think it is better to maintain a separate flag that controls the response queue limit without affecting the state bucket.

Option::ArgInt,
"The maximum number of inference response objects that can remain "
"allocated in the pool at any given time."});
grpc_options_.push_back(
{OPTION_GRPC_USE_SSL, "grpc-use-ssl", Option::ArgBool,
"Use SSL authentication for GRPC requests. Default is false."});
@@ -1438,6 +1444,14 @@ TritonParser::Parse(int argc, char** argv)
case OPTION_GRPC_INFER_ALLOCATION_POOL_SIZE:
lgrpc_options.infer_allocation_pool_size_ = ParseOption<int>(optarg);
break;
case OPTION_GRPC_MAX_RESPONSE_POOL_SIZE:
lgrpc_options.max_response_pool_size_ = ParseOption<int>(optarg);
if (lgrpc_options.max_response_pool_size_ <= 0) {
throw ParseException(
[Review comment] (Contributor):
Do we also need an upper limit in this check? The code below suggests the max is {INT_MAX}.

"Error: --grpc-max-response-pool-size must be greater "
"than 0.");
}
break;
case OPTION_GRPC_USE_SSL:
lgrpc_options.ssl_.use_ssl_ = ParseOption<bool>(optarg);
break;
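The parser change above rejects non-positive values, and the reviewer asks whether an upper bound is also needed. A minimal standalone sketch of such validation (hypothetical helper `ParsePoolSize`, not Triton's actual `ParseOption` template) could combine both checks:

```cpp
#include <cassert>
#include <cerrno>
#include <climits>
#include <cstdlib>
#include <stdexcept>
#include <string>

// Parse a pool-size flag value, rejecting non-integers, non-positive
// values, and anything that does not fit in an int. This mirrors the
// "> 0" check added in this diff plus the upper-bound check the
// reviewer suggests ({INT_MAX} is the default in grpc_server.h).
int ParsePoolSize(const std::string& arg)
{
  char* end = nullptr;
  errno = 0;
  const long value = std::strtol(arg.c_str(), &end, 10);
  if (end == arg.c_str() || *end != '\0') {
    throw std::invalid_argument(
        "Error: --grpc-max-response-pool-size must be an integer.");
  }
  if (errno == ERANGE || value <= 0 || value > INT_MAX) {
    throw std::invalid_argument(
        "Error: --grpc-max-response-pool-size must be in [1, INT_MAX].");
  }
  return static_cast<int>(value);
}
```

With this shape, out-of-range input fails at parse time with a clear message instead of silently truncating.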
12 changes: 7 additions & 5 deletions src/grpc/grpc_server.cc
@@ -1,4 +1,4 @@
// Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -2395,8 +2395,8 @@ Server::Server(
"ModelInferHandler", tritonserver_, trace_manager_, shm_manager_,
&service_, model_infer_cq_.get(),
options.infer_allocation_pool_size_ /* max_state_bucket_count */,
options.infer_compression_level_, restricted_kv,
options.forward_header_pattern_));
options.max_response_pool_size_, options.infer_compression_level_,
restricted_kv, options.forward_header_pattern_));
}

// Handler for streaming inference requests. Keeps one handler for streaming
@@ -2405,8 +2405,8 @@ Server::Server(
"ModelStreamInferHandler", tritonserver_, trace_manager_, shm_manager_,
&service_, model_stream_infer_cq_.get(),
options.infer_allocation_pool_size_ /* max_state_bucket_count */,
options.infer_compression_level_, restricted_kv,
options.forward_header_pattern_));
options.max_response_pool_size_, options.infer_compression_level_,
restricted_kv, options.forward_header_pattern_));
}

Server::~Server()
@@ -2472,6 +2472,8 @@ Server::GetOptions(Options& options, UnorderedMapType& options_map)
RETURN_IF_ERR(GetValue(
options_map, "infer_allocation_pool_size",
&options.infer_allocation_pool_size_));
RETURN_IF_ERR(GetValue(
options_map, "max_response_pool_size", &options.max_response_pool_size_));
RETURN_IF_ERR(GetValue(
options_map, "forward_header_pattern", &options.forward_header_pattern_));

3 changes: 2 additions & 1 deletion src/grpc/grpc_server.h
@@ -1,4 +1,4 @@
// Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -89,6 +89,7 @@ struct Options {
// requests doesn't exceed this value there will be no
// allocation/deallocation of request/response objects.
int infer_allocation_pool_size_{8};
int max_response_pool_size_{INT_MAX};
RestrictedFeatures restricted_protocols_;
std::string forward_header_pattern_;
};
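The `max_response_pool_size_{INT_MAX}` option caps how many response objects stay allocated for reuse. A minimal sketch of that idea (hypothetical `Response`/`ResponsePool` types, not Triton's actual classes): finished responses go back to a free list and are reused instead of reallocated, but the free list is capped so idle memory stays bounded.

```cpp
#include <cstddef>
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct Response {
  std::string payload;  // stand-in for the real response message
};

class ResponsePool {
 public:
  explicit ResponsePool(size_t max_pool_size) : max_pool_size_(max_pool_size) {}

  // Hand out a pooled object if one is idle, otherwise allocate fresh.
  std::unique_ptr<Response> Acquire()
  {
    if (!free_.empty()) {
      auto r = std::move(free_.back());
      free_.pop_back();
      ++reused_;
      return r;
    }
    ++allocated_;
    return std::make_unique<Response>();
  }

  // Keep the object for reuse only while the pool is below its cap;
  // past the cap it is destroyed, so idle memory stays bounded.
  void Release(std::unique_ptr<Response> r)
  {
    if (free_.size() < max_pool_size_) {
      r->payload.clear();  // reset state before reuse
      free_.push_back(std::move(r));
    }
  }

  size_t allocated() const { return allocated_; }
  size_t reused() const { return reused_; }
  size_t idle() const { return free_.size(); }

 private:
  size_t max_pool_size_;
  size_t allocated_ = 0;
  size_t reused_ = 0;
  std::vector<std::unique_ptr<Response>> free_;
};
```

With a cap of 1 (as in the new test.sh case), releasing two responses keeps one idle for reuse and frees the other, trading some reallocation for a smaller footprint.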