From 377248e8db874439e9a83be1c41e6f8e0201bd0e Mon Sep 17 00:00:00 2001
From: kee hyun an
Date: Tue, 5 Nov 2024 09:31:38 +0900
Subject: [PATCH] chore: setting for test

---
 core/runtime/TRTEngine.cpp                                | 4 ----
 core/runtime/execute_engine.cpp                           | 4 ----
 core/runtime/register_jit_hooks.cpp                       | 2 +-
 .../dynamo/runtime/_PythonTorchTensorRTModule.py          | 5 ++++-
 py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py  | 5 ++++-
 5 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp
index ba78c30a90..adc21bd496 100644
--- a/core/runtime/TRTEngine.cpp
+++ b/core/runtime/TRTEngine.cpp
@@ -296,10 +296,6 @@ int64_t TRTEngine::get_automatic_device_memory_budget() {
   return cuda_engine->getWeightStreamingAutomaticBudget();
 }
 
-void TRTEngine::set_pre_allocated_outputs(bool enable) {
-  use_pre_allocated_outputs = enable;
-}
-
 std::string TRTEngine::to_str() const {
   // clang-format off
   std::stringstream ss;
diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp
index f7ba509494..682d56ab3d 100644
--- a/core/runtime/execute_engine.cpp
+++ b/core/runtime/execute_engine.cpp
@@ -5,7 +5,6 @@
 #include "torch/csrc/jit/runtime/custom_operator.h"
 #include "torch/torch.h"
 
-#include <ATen/record_function.h>
 #include "core/runtime/TRTEngineProfiler.h"
 #include "core/runtime/runtime.h"
 #include "core/util/prelude.h"
@@ -200,7 +199,6 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
 
   { // Input Setup
     std::unique_ptr<torch::autograd::profiler::RecordProfile> input_profiler_guard;
-    RECORD_FUNCTION("process input", std::vector<c10::IValue>());
     if (compiled_engine->profile_execution) {
       input_profiler_guard =
           std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
@@ -282,7+280,6 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
 
   { // Output Setup
     std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
-    RECORD_FUNCTION("process output", std::vector<c10::IValue>());
     if (compiled_engine->profile_execution) {
       output_profiler_guard =
           std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
@@ -331,7 +328,6 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
   std::unique_lock<std::mutex> lock(compiled_engine->mu);
 
   { // Engine Execution (execute on engine stream)
-    RECORD_FUNCTION("Trt runtime", std::vector<c10::IValue>());
     c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream);
 
     std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
diff --git a/core/runtime/register_jit_hooks.cpp b/core/runtime/register_jit_hooks.cpp
index 2918cee367..a09b99dbfc 100644
--- a/core/runtime/register_jit_hooks.cpp
+++ b/core/runtime/register_jit_hooks.cpp
@@ -86,7 +86,7 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion =
         .def("dump_engine_layer_info_to_file", &TRTEngine::dump_engine_layer_info_to_file)
         .def("dump_engine_layer_info", &TRTEngine::dump_engine_layer_info)
         .def("get_engine_layer_info", &TRTEngine::get_engine_layer_info)
-        .def("set_pre_allocated_outputs", &TRTEngine::set_pre_allocated_outputs)
+        .def_readwrite("use_pre_allocated_outputs", &TRTEngine::use_pre_allocated_outputs)
         .def_property(
             "device_memory_budget",
             &TRTEngine::get_device_memory_budget,
diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
index 17a38c716d..afb67d1165 100644
--- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
+++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
@@ -109,7 +109,7 @@ def __init__(
         self.target_platform = Platform.current_platform()
         self.cudagraphs_enabled = False
         self.pre_allocated_outputs: List[torch.Tensor] = []
-        self.use_pre_allocated_outputs = False
+        self.use_pre_allocated_outputs = True
 
         if self.serialized_engine is not None and not self.settings.lazy_engine_init:
             self.setup_engine()
@@ -248,6 +248,9 @@ def create_output_tensors(self) -> List[torch.Tensor]:
             outputs.append(output)
         return outputs
 
+    def set_output_opt(self, enable: bool) -> None:
+        self.use_pre_allocated_outputs = enable
+
     def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, ...]:
         # Ensure inputs are available in all scopes and cast symbolic integers to Tensors
         contiguous_inputs: List[torch.Tensor] = [
diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py
index 99f863f1da..b3ec3258f0 100644
--- a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py
+++ b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py
@@ -207,7 +207,7 @@ def setup_engine(self) -> None:
         if self.engine is not None:
             return
         self.engine = torch.classes.tensorrt.Engine(self._pack_engine_info())
-        self.engine.set_pre_allocated_outputs(True)
+        self.set_output_opt(True)
 
     def encode_metadata(self, metadata: Any) -> str:
         metadata = copy.deepcopy(metadata)
@@ -272,6 +272,9 @@ def set_extra_state(self, state: SerializedTorchTensorRTModuleFmt) -> None:
         self.input_binding_names = state[2]
         self.output_binding_names = state[3]
 
+    def set_output_opt(self, enable: bool) -> None:
+        self.engine.use_pre_allocated_outputs = enable
+
     def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]:
         """Implementation of the forward pass for a TensorRT engine
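For reference, a minimal usage sketch of the set_output_opt() API this patch
introduces. It assumes a model compiled through the dynamo frontend, whose
returned GraphModule holds the TorchTensorRTModule / PythonTorchTensorRTModule
children patched above; the toy model and compile arguments are illustrative,
not part of the patch:

    import torch
    import torch_tensorrt

    # Illustrative toy model; any eval-mode CUDA module works here.
    model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.ReLU()).eval().cuda()
    inputs = [torch.randn(1, 3, 224, 224, device="cuda")]

    # Compiling with the dynamo frontend returns a GraphModule whose TRT
    # submodules are (Python)TorchTensorRTModule instances.
    trt_gm = torch_tensorrt.compile(model, ir="dynamo", inputs=inputs)

    # set_output_opt() toggles the pre-allocated-outputs path. This patch
    # flips the runtime default to True, so passing False here restores
    # per-call output allocation.
    for _, submodule in trt_gm.named_children():
        if hasattr(submodule, "set_output_opt"):
            submodule.set_output_opt(False)

    out = trt_gm(*inputs)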