From fde1a1c82eef7173e6d28ad43cf1476e4059109b Mon Sep 17 00:00:00 2001
From: kee hyun an
Date: Fri, 29 Nov 2024 09:58:11 +0900
Subject: [PATCH] chore: rebase and update doc

---
 examples/dynamo/cudagraphs_wrapper_example.py | 24 ++++++++++++++-----
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/examples/dynamo/cudagraphs_wrapper_example.py b/examples/dynamo/cudagraphs_wrapper_example.py
index b9993a1f16..386eb62650 100644
--- a/examples/dynamo/cudagraphs_wrapper_example.py
+++ b/examples/dynamo/cudagraphs_wrapper_example.py
@@ -72,15 +72,27 @@ def forward(self, x):
 # Node: torch.ops.aten.mul.Tensor, with layer location: /mul
 # Note: Some of the above nodes may be supported, but were not included in a TRT graph by the partitioner
 
+# %%
+# trt module with cuda graphs
+# ----------------------------------
+#
+# When CUDA Graphs are applied to a TensorRT model that contains graph breaks, each break introduces additional
+# overhead. This occurs because graph breaks prevent the entire model from being executed as a single, continuous
+# optimized unit. As a result, some of the performance benefits typically provided by CUDA Graphs, such as reduced
+# kernel launch overhead and improved execution efficiency, may be diminished.
+with torch_tensorrt.runtime.enable_cudagraphs():
+    trt_model(input)
+
 # %%
 # Running wrapped module with cuda graphs
 # ----------------------------------
 #
-# Please note that initializing with wrapper module involve warm-up phase where the module
-# is executed several times. This ensures that memory allocations and initializations are
-# not recorded in CUDA Graphs.
-# When using the TensorRT module within a CUDA Graph context manager, a wrapped_module is returned.
-# This module captures the execution graph, allowing for efficient replay during subsequent
-# inferences by reducing kernel launch overheads and improving performance.
+# Using a wrapped runtime module with CUDA Graphs allows you to encapsulate sequences of operations into graphs
+# that can be executed efficiently, even in the presence of graph breaks. When a CUDA Graph context manager is
+# used with the TensorRT module as a positional argument, it returns a wrapped_module. This module captures the
+# execution graph, enabling efficient replay during subsequent inferences by reducing kernel launch overheads
+# and improving performance. Note that initializing with the wrapper module involves a warm-up phase where the
+# module is executed several times. This warm-up ensures that memory allocations and initializations are not
+# recorded in CUDA Graphs, which helps maintain consistent execution paths and optimize performance.
 with torch_tensorrt.runtime.enable_cudagraphs(trt_model) as wrapped_module:
     wrapped_module(input)
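
Note for reviewers: below is a minimal, self-contained sketch of how the two snippets documented by this patch fit together in the example. It is illustrative only. The SampleModel definition, input shape, and compile settings (ir, min_block_size, torch_executed_ops) are assumptions, since the patch context shows only the partitioner output and the enable_cudagraphs calls; the two enable_cudagraphs usages themselves are taken verbatim from the patch.

import torch
import torch_tensorrt


class SampleModel(torch.nn.Module):
    # Hypothetical toy model; chosen so that it contains a mul op,
    # matching the "torch.ops.aten.mul.Tensor" partitioner message
    # shown in the hunk context.
    def forward(self, x):
        return torch.relu((x + 2) * 0.5)


model = SampleModel().eval().cuda()
input = torch.randn((1, 3, 224, 224)).cuda()  # assumed input shape

# Assumed compile flags: keeping mul in PyTorch forces the graph break
# that the added documentation discusses.
trt_model = torch_tensorrt.compile(
    model,
    ir="dynamo",
    inputs=[input],
    min_block_size=1,
    torch_executed_ops={"torch.ops.aten.mul.Tensor"},
)

# From the patch: CUDA Graphs over a TRT module with graph breaks, where
# each break adds per-segment overhead.
with torch_tensorrt.runtime.enable_cudagraphs():
    trt_model(input)

# From the patch: passing the TRT module as a positional argument returns
# a wrapped_module that captures the execution graph across the breaks
# (after a warm-up phase) for efficient replay.
with torch_tensorrt.runtime.enable_cudagraphs(trt_model) as wrapped_module:
    wrapped_module(input)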