pytorch · keehyuna · Nov 4, 2024 · Nov 5, 2024 · Nov 14, 2024 · Nov 18, 2024
diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp
@@ -99,6 +99,9 @@ TRTEngine::TRTEngine(
   exec_ctx = make_trt(cuda_engine->createExecutionContext());
   TORCHTRT_CHECK((exec_ctx.get() != nullptr), "Unable to create TensorRT execution context");
 
+  runtime_states.prev_cudagraphs_enabled = CUDAGRAPHS_MODE;
+  runtime_states.prev_pre_allocated_outputs_enabled = false;
+
   if (_in_binding_names.size() == 0 && _out_binding_names.size() == 0) {
     uint64_t inputs = 0;
     uint64_t outputs = 0;
@@ -307,6 +310,9 @@ bool TRTEngine::set_device_memory_budget(int64_t budget) {
   if (profile_execution) {
     enable_profiling();
   }
+
+  runtime_states.set_context_changed();
+
   return result;
 }
 

diff --git a/core/runtime/TRTEngine.h b/core/runtime/TRTEngine.h
@@ -30,6 +30,53 @@ using FlattenedState = std::tuple<
     std::tuple<std::string, std::string>, // serialized metadata
     std::tuple<std::string, std::string>>; // Platform
 
+struct RuntimeStates {
+  bool need_cudagraphs_record;
+  bool need_cudagraphs_reset;
+  bool can_use_pre_allocated_outputs;
+};
+
+struct TorchTRTRuntimeStates {
+  // Previous runtime states
+  bool prev_cudagraphs_enabled = false;
+  bool prev_pre_allocated_outputs_enabled = false;
+  // Indicates to reevaluate the runtime settings as context has changed
+  bool has_context_changed = false;
+
+  // Evaluates whether certain conditions are met to enable CUDA Graph recording/reset or to reuse pre-allocated outputs
+  // based on the current and previous states, as well as input shape has changed
+  RuntimeStates validate_states(bool cudagraphs_enabled, bool pre_allocated_outputs_enabled, bool shape_changed) {
+    bool need_cudagraphs_record = false;
+    bool can_use_pre_allocated_outputs = false;
+    bool need_cudagraphs_reset = false;
+
+    // Cudagraphs record is required if cudagraphs_enabled is switched to True regardless of shape change
+    // If context is changed by runtime setting like weight streaming, it needs cuda graphs record
+    if (cudagraphs_enabled && (!prev_cudagraphs_enabled || shape_changed || has_context_changed)) {
+      need_cudagraphs_record = true;
+    }
+    // Pre-allocated output can be used when previous and current state are true without shape change
+    if (prev_pre_allocated_outputs_enabled && pre_allocated_outputs_enabled && !shape_changed) {
+      can_use_pre_allocated_outputs = true;
+    }
+
+    if (!cudagraphs_enabled || shape_changed || has_context_changed) {
+      need_cudagraphs_reset = true;
+    }
+
+    // Reset the flag
+    has_context_changed = false;
+    prev_cudagraphs_enabled = cudagraphs_enabled;
+    prev_pre_allocated_outputs_enabled = pre_allocated_outputs_enabled;
+
+    RuntimeStates values = {need_cudagraphs_record, need_cudagraphs_reset, can_use_pre_allocated_outputs};
+    return values;
+  }
+  void set_context_changed() {
+    has_context_changed = true;
+  }
+};
+
 struct TRTEngine : torch::CustomClassHolder {
   // Each engine needs it's own runtime object
   std::shared_ptr<nvinfer1::IRuntime> rt;
@@ -88,6 +135,8 @@ struct TRTEngine : torch::CustomClassHolder {
   int64_t get_streamable_device_memory_budget();
   int64_t get_automatic_device_memory_budget();
   std::vector<at::Tensor> infer_outputs(std::vector<std::vector<int64_t>> input_shapes);
+  void set_pre_allocated_outputs(bool enable);
+  TorchTRTRuntimeStates runtime_states;
   friend std::ostream& operator<<(std::ostream& os, const TRTEngine& engine);
   static const char BINDING_DELIM = '%';
 
@@ -101,7 +150,9 @@ struct TRTEngine : torch::CustomClassHolder {
   at::cuda::CUDAStream caller_stream = c10::cuda::getDefaultCUDAStream();
   std::vector<at::Tensor> input_buffers = {};
   std::vector<at::Tensor> output_buffers = {};
-  std::string shape_key;
+  std::string shape_key = "None";
+  bool use_pre_allocated_outputs = false;
+  std::vector<at::Tensor> pre_allocated_outputs;
 
   // TODO: Implement a call method
   // c10::List<at::Tensor> Run(c10::List<at::Tensor> inputs);

diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp
@@ -60,9 +60,8 @@ RTDevice select_rt_device(const RTDevice& engine_device, const RTDevice& curr_de
   return new_target_device_opt.value();
 }
 
-bool _cudagraphs_validate_shapes(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
-  // Validate whether the current input shapes to the engine
-  // invalidate the existing cudagraphs object
+bool _validate_shapes(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
+  // Validate whether the current input shapes to the engine has changed
 
   // Populate the shape key for the inputs
   // x: (3, 4), y: (4, 5) --> Key: (3,4)(4,5)
@@ -83,15 +82,102 @@ bool _cudagraphs_validate_shapes(std::vector<at::Tensor> inputs, c10::intrusive_
 
   auto new_shape_key = new_shape_key_ss.str();
 
-  // Compare the shape key to the original key and invalidate shapes if they do not match
+  // Compare the shape key to the original key
   if (new_shape_key != compiled_engine->shape_key) {
-    LOG_DEBUG("Resetting Cudagraph on New Shape Key " << new_shape_key);
+    LOG_DEBUG("Input shape changed " << compiled_engine->shape_key << " -> " << new_shape_key);
     compiled_engine->shape_key = new_shape_key;
-    compiled_engine->cudagraph.reset();
-    return false;
+    return true;
   }
 
-  return true;
+  return false;
+}
+void setup_input_tensors(
+    std::vector<at::Tensor> inputs,
+    c10::intrusive_ptr<TRTEngine> compiled_engine,
+    bool need_cudagraphs_record) {
+  // this is a buffer to store shape tensor input addresses throughout the runtime scope
+  std::list<std::vector<int64_t>> inputShapeTensorValues;
+  std::list<at::Tensor> formatted_inputs(compiled_engine->num_io.first);
+
+  for (size_t i = 0; i < inputs.size(); i++) {
+    std::string name = compiled_engine->in_binding_names[i];
+
+    TORCHTRT_CHECK(
+        inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());
+
+    auto expected_type =
+        util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
+    TORCHTRT_CHECK(
+        inputs[i].dtype() == expected_type,
+        "Expected input tensors to have type " << expected_type << ", found type " << inputs[i].dtype());
+
+    auto dims = core::util::toDims(inputs[i].sizes());
+    auto shape = core::util::toVec(dims);
+    LOG_DEBUG("Input Name: " << name << " Shape: " << dims);
+
+    if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) {
+      // Shape tensor inputs are casted to int64 explicitly.
+      // Refer to
+      // https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435
+      auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt64);
+      std::vector<int64_t> inputs_cpu_vec(
+          input_cpu.data_ptr<int64_t>(), input_cpu.data_ptr<int64_t>() + input_cpu.numel());
+      inputShapeTensorValues.emplace_back(inputs_cpu_vec);
+      TORCHTRT_CHECK(
+          compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
+          "Error while setting the tensor address for shape inputs");
+
+      if (CUDAGRAPHS_MODE) {
+        // @peri044 I dont know if this makes sense since they are supposed to be GPU buffers
+        compiled_engine->input_buffers[i] = input_cpu;
+      }
+      TORCHTRT_CHECK(
+          compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
+          "Error while setting the tensor address for shape inputs");
+
+    } else {
+      at::Tensor contig_input = inputs[i].view(shape).contiguous();
+      formatted_inputs.emplace_back(std::move(contig_input));
+
+      if (need_cudagraphs_record) {
+        // Create a new persistent input buffer
+        compiled_engine->input_buffers[i] = std::move(formatted_inputs.back().clone());
+      }
+
+      TORCHTRT_CHECK(
+          compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");
+
+      if (CUDAGRAPHS_MODE) {
+        // If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
+        compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true);
+        TORCHTRT_CHECK(
+            compiled_engine->exec_ctx->setTensorAddress(name.c_str(), compiled_engine->input_buffers[i].data_ptr()),
+            "Error while setting the input tensor address for inputs");
+      } else {
+        // Otherwise use the formatted buffer directly
+        TORCHTRT_CHECK(
+            compiled_engine->exec_ctx->setTensorAddress(name.c_str(), formatted_inputs.back().data_ptr()),
+            "Error while setting the input tensor address for inputs");
+      }
+    }
+  }
+}
+std::vector<at::Tensor> create_output_tensors(c10::intrusive_ptr<TRTEngine> compiled_engine) {
+  std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
+  for (auto output_indices : compiled_engine->out_binding_map) {
+    // out_binding_map stores TRT_IDX: PYT_IDX
+    auto pyt_idx = output_indices.second;
+
+    std::string name = compiled_engine->out_binding_names[pyt_idx];
+    auto out_shape = compiled_engine->exec_ctx->getTensorShape(name.c_str());
+    LOG_DEBUG("Output Name: " << name << " Shape: " << out_shape);
+
+    auto dims = core::util::toVec(out_shape);
+    auto type = util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
+    outputs[pyt_idx] = std::move(at::empty(dims, {at::kCUDA}).to(type).contiguous());
+  }
+
+  return outputs;
 }
 
 std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
@@ -114,18 +200,17 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     compiled_engine->cudagraph.enable_debug_mode();
   }
 
+  bool shape_changed = _validate_shapes(inputs, compiled_engine);
+
   // Whether cudagraphs needs to record the graph on this pass
-  bool need_cudagraphs_record = (CUDAGRAPHS_MODE && (!_cudagraphs_validate_shapes(inputs, compiled_engine)));
+  RuntimeStates states = compiled_engine->runtime_states.validate_states(
+      CUDAGRAPHS_MODE, compiled_engine->use_pre_allocated_outputs, shape_changed);
 
-  if (!CUDAGRAPHS_MODE) {
+  if (states.need_cudagraphs_reset) {
     compiled_engine->cudagraph.reset();
   }
 
-  // this is a buffer to store shape tensor input addresses throughout the runtime scope
-  std::list<std::vector<int64_t>> inputShapeTensorValues;
-
   // Intialize inputs and outputs to be available throughout the succeeding scopes
-  std::list<at::Tensor> formatted_inputs(compiled_engine->num_io.first);
   std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
 
   if (MULTI_DEVICE_SAFE_MODE) {
@@ -183,68 +268,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
           std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
     }
 
-    for (size_t i = 0; i < inputs.size(); i++) {
-      std::string name = compiled_engine->in_binding_names[i];
-
-      TORCHTRT_CHECK(
-          inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());
-
-      auto expected_type =
-          util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
-      TORCHTRT_CHECK(
-          inputs[i].dtype() == expected_type,
-          "Expected input tensors to have type " << expected_type << ", found type " << inputs[i].dtype());
-
-      auto dims = core::util::toDims(inputs[i].sizes());
-      auto shape = core::util::toVec(dims);
-      LOG_DEBUG("Input Name: " << name << " Shape: " << dims);
-
-      if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) {
-        // Shape tensor inputs are casted to int64 explicitly.
-        // Refer to
-        // https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435
-        auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt64);
-        std::vector<int64_t> inputs_cpu_vec(
-            input_cpu.data_ptr<int64_t>(), input_cpu.data_ptr<int64_t>() + input_cpu.numel());
-        inputShapeTensorValues.emplace_back(inputs_cpu_vec);
-        TORCHTRT_CHECK(
-            compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
-            "Error while setting the tensor address for shape inputs");
-
-        if (CUDAGRAPHS_MODE) {
-          // @peri044 I dont know if this makes sense since they are supposed to be GPU buffers
-          compiled_engine->input_buffers[i] = input_cpu;
-        }
-        TORCHTRT_CHECK(
-            compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
-            "Error while setting the tensor address for shape inputs");
-
-      } else {
-        at::Tensor contig_input = inputs[i].view(shape).contiguous();
-        formatted_inputs.emplace_back(std::move(contig_input));
-
-        if (need_cudagraphs_record) {
-          // Create a new persistent input buffer
-          compiled_engine->input_buffers[i] = std::move(formatted_inputs.back().clone());
-        }
-
-        TORCHTRT_CHECK(
-            compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");
-
-        if (CUDAGRAPHS_MODE) {
-          // If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
-          compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true);
-          TORCHTRT_CHECK(
-              compiled_engine->exec_ctx->setTensorAddress(name.c_str(), compiled_engine->input_buffers[i].data_ptr()),
-              "Error while setting the input tensor address for inputs");
-        } else {
-          // Otherwise use the formatted buffer directly
-          TORCHTRT_CHECK(
-              compiled_engine->exec_ctx->setTensorAddress(name.c_str(), formatted_inputs.back().data_ptr()),
-              "Error while setting the input tensor address for inputs");
-        }
-      }
-    }
+    setup_input_tensors(inputs, compiled_engine, states.need_cudagraphs_record);
 
     // Check if input shapes can be inferred.
     int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()};
@@ -263,20 +287,16 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
       output_profiler_guard =
           std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
     }
+    if (states.can_use_pre_allocated_outputs) {
+      outputs = compiled_engine->pre_allocated_outputs;
+    } else {
+      outputs = create_output_tensors(compiled_engine);
+    }
 
     for (auto output_indices : compiled_engine->out_binding_map) {
-      // out_binding_map stores TRT_IDX: PYT_IDX
       auto pyt_idx = output_indices.second;
-
       std::string name = compiled_engine->out_binding_names[pyt_idx];
-      auto out_shape = compiled_engine->exec_ctx->getTensorShape(name.c_str());
-      LOG_DEBUG("Output Name: " << name << " Shape: " << out_shape);
-
-      auto dims = core::util::toVec(out_shape);
-      auto type = util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
-      outputs[pyt_idx] = std::move(at::empty(dims, {at::kCUDA}).to(type).contiguous());
-
-      if (need_cudagraphs_record) {
+      if (states.need_cudagraphs_record) {
         // If we are recording the cuda graph then we need to update the persistent output buffer
         compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
       }
@@ -328,7 +348,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
       // Direct execution uses the caller buffers directly
       compiled_engine->exec_ctx->enqueueV3(compiled_engine->engine_stream);
     } else {
-      if (need_cudagraphs_record) {
+      if (states.need_cudagraphs_record) {
         // If cudagraphs needs to record a graph, capture the enqueueV3 call in a graph
         c10::cuda::CUDAStream recording_stream = compiled_engine->engine_stream;
         compiled_engine->cudagraph.capture_begin();
@@ -345,6 +365,11 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     }
   } // End engine exeuction (resets to caller stream)
 
+  // Create output buffer for next execution of graph or trt context.
+  if (compiled_engine->use_pre_allocated_outputs) {
+    compiled_engine->pre_allocated_outputs = create_output_tensors(compiled_engine);
+  }
+
   // Block caller stream until engine execution is complete
   at::cuda::CUDAEvent trt_exec_complete;
   trt_exec_complete.record(compiled_engine->engine_stream);

diff --git a/core/runtime/register_jit_hooks.cpp b/core/runtime/register_jit_hooks.cpp
@@ -88,6 +88,7 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion =
         .def("dump_engine_layer_info", &TRTEngine::dump_engine_layer_info)
         .def("get_engine_layer_info", &TRTEngine::get_engine_layer_info)
         .def("infer_outputs", &TRTEngine::infer_outputs)
+        .def_readwrite("use_pre_allocated_outputs", &TRTEngine::use_pre_allocated_outputs)
         .def_property(
             "device_memory_budget",
             &TRTEngine::get_device_memory_budget,