feat: Lazy engine initialization (#2997)

Signed-off-by: Naren Dasan <[email protected]> Signed-off-by: Naren Dasan <[email protected]>
pytorch · Aug 5, 2024 · 1d5dd56 · 1d5dd56
1 parent 577c5c4
commit 1d5dd56
Show file tree

Hide file tree

Showing 16 changed files with 523 additions and 107 deletions.
diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py
@@ -27,7 +27,7 @@
     from torch.export import ExportedProgram
     from torch_tensorrt.dynamo._compiler import compile as dynamo_compile
     from torch_tensorrt.dynamo._compiler import (
-        convert_module_to_trt_engine as dynamo_convert_module_to_trt_engine,
+        convert_exported_program_to_serialized_trt_engine as dynamo_convert_exported_program_to_serialized_trt_engine,
     )
     from torch_tensorrt.dynamo._tracer import trace as dynamo_trace
 
@@ -351,7 +351,7 @@ def convert_method_to_trt_engine(
         torchtrt_inputs = prepare_inputs(inputs)
         exp_program = torch_tensorrt.dynamo.trace(module, torchtrt_inputs, **kwargs)
 
-        return dynamo_convert_module_to_trt_engine(
+        return dynamo_convert_exported_program_to_serialized_trt_engine(
             exp_program,
             inputs=tuple(inputs),
             enabled_precisions=enabled_precisions_set,

diff --git a/py/torch_tensorrt/dynamo/__init__.py b/py/torch_tensorrt/dynamo/__init__.py
@@ -7,7 +7,7 @@
 logger = logging.getLogger(__name__)
 
 if version.parse(sanitized_torch_version()) >= version.parse("2.1.dev"):
-    from ._compiler import compile, convert_module_to_trt_engine
+    from ._compiler import compile, convert_exported_program_to_serialized_trt_engine
     from ._exporter import export
     from ._refit import refit_module_weights
     from ._settings import CompilationSettings

diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py
@@ -79,6 +79,7 @@ def compile(
     dryrun: bool = _defaults.DRYRUN,
     hardware_compatible: bool = _defaults.HARDWARE_COMPATIBLE,
     timing_cache_path: str = _defaults.TIMING_CACHE_PATH,
+    lazy_engine_init: bool = _defaults.LAZY_ENGINE_INIT,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
@@ -141,6 +142,7 @@ def compile(
         dryrun (bool): Toggle for "Dryrun" mode, running everything except conversion to TRT and logging outputs
         hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer)
         timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation
+        lazy_engine_init (bool): Defer setting up engines until the compilation of all engines is complete. Can allow larger models with multiple graph breaks to compile but can lead to oversubscription of GPU memory at runtime.
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -236,6 +238,7 @@ def compile(
         "dryrun": dryrun,
         "hardware_compatible": hardware_compatible,
         "timing_cache_path": timing_cache_path,
+        "lazy_engine_init": lazy_engine_init,
     }
 
     settings = CompilationSettings(**compilation_options)
@@ -454,6 +457,8 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool:
     # Replace all FX Modules with TRT Modules
     for name, trt_module in trt_modules.items():
         setattr(partitioned_module, name, trt_module)
+        if settings.lazy_engine_init:
+            getattr(partitioned_module, name).setup_engine()
 
     # Reset settings object to user specification after fallback to global partitioning mode
     if fast_partitioner_failed:
@@ -464,7 +469,7 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool:
     return partitioned_module
 
 
-def convert_module_to_trt_engine(
+def convert_exported_program_to_serialized_trt_engine(
     exported_program: ExportedProgram,
     inputs: Sequence[Any],
     *,
@@ -647,10 +652,5 @@ def convert_module_to_trt_engine(
             exc_info=True,
         )
 
-    import io
-
-    with io.BytesIO() as engine_bytes:
-        engine_bytes.write(interpreter_result.engine)
-        engine_bytearray: bytes = engine_bytes.getvalue()
-
-    return engine_bytearray
+    serialized_engine: bytes = interpreter_result.serialized_engine
+    return serialized_engine
diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py
@@ -32,6 +32,7 @@
 HARDWARE_COMPATIBLE = False
 SUPPORTED_KERNEL_PRECISIONS = {dtype.f32, dtype.f16, dtype.bf16, dtype.i8, dtype.f8}
 TIMING_CACHE_PATH = os.path.join(tempfile.gettempdir(), "timing_cache.bin")
+LAZY_ENGINE_INIT = False
 
 
 def default_device() -> Device:

diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py
@@ -16,6 +16,7 @@
     ENABLED_PRECISIONS,
     ENGINE_CAPABILITY,
     HARDWARE_COMPATIBLE,
+    LAZY_ENGINE_INIT,
     MAKE_REFITABLE,
     MAX_AUX_STREAMS,
     MIN_BLOCK_SIZE,
@@ -104,3 +105,4 @@ class CompilationSettings:
     dryrun: Union[bool, str] = DRYRUN
     hardware_compatible: bool = HARDWARE_COMPATIBLE
     timing_cache_path: str = TIMING_CACHE_PATH
+    lazy_engine_init: bool = LAZY_ENGINE_INIT
diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py
@@ -1,11 +1,11 @@
+import io
 import logging
 import os
 import warnings
 from datetime import datetime
 from typing import Any, Callable, Dict, List, NamedTuple, Optional, Sequence, Set, Tuple
 
 import numpy as np
-import tensorrt as trt
 import torch
 import torch.fx
 from torch.fx.node import _get_qualified_name
@@ -29,6 +29,7 @@
 from torch_tensorrt.fx.observer import Observer
 from torch_tensorrt.logging import TRT_LOGGER
 
+import tensorrt as trt
 from packaging import version
 
 _LOGGER: logging.Logger = logging.getLogger(__name__)
@@ -43,7 +44,7 @@ class UnsupportedOperatorException(RuntimeError):
 
 
 class TRTInterpreterResult(NamedTuple):
-    engine: Any
+    serialized_engine: bytes
     input_names: Sequence[str]
     output_names: Sequence[str]
 
@@ -358,9 +359,11 @@ def run(
             builder_config, self.compilation_settings.timing_cache_path
         )
 
-        return TRTInterpreterResult(
-            serialized_engine, self._input_names, self._output_names
-        )
+        with io.BytesIO() as engine_bytes:
+            engine_bytes.write(serialized_engine)
+            engine_str = engine_bytes.getvalue()
+
+        return TRTInterpreterResult(engine_str, self._input_names, self._output_names)
 
     def run_node(self, n: torch.fx.Node) -> torch.fx.Node:
         self._cur_node_name = get_node_name(n)

diff --git a/py/torch_tensorrt/dynamo/conversion/_conversion.py b/py/torch_tensorrt/dynamo/conversion/_conversion.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import io
 import logging
 from typing import List, Sequence
 
@@ -102,33 +101,30 @@ def convert_module(
         settings: Compilation settings
         name: TRT engine name
     Returns:
-        _PythonTorchTensorRTModule or TorchTensorRTModule
+        PythonTorchTensorRTModule or TorchTensorRTModule
     """
     interpreter_result = interpret_module_to_result(module, inputs, settings)
 
-    if settings.use_python_runtime or not ENABLED_FEATURES.torch_tensorrt_runtime:
-        if not settings.use_python_runtime:
-            logger.info(
-                "Since Torch-TensorRT runtime is not available, using Python Runtime, some features may not be available"
-            )
-        return PythonTorchTensorRTModule(
-            engine=interpreter_result.engine,
-            input_names=list(interpreter_result.input_names),
-            output_names=list(interpreter_result.output_names),
-            settings=settings,
-        )
+    rt_cls = PythonTorchTensorRTModule
+
+    if ENABLED_FEATURES.torch_tensorrt_runtime and not settings.use_python_runtime:
 
-    else:
         from torch_tensorrt.dynamo.runtime import TorchTensorRTModule
 
-        with io.BytesIO() as engine_bytes:
-            engine_bytes.write(interpreter_result.engine)
-            engine_str = engine_bytes.getvalue()
+        rt_cls = TorchTensorRTModule
+
+    elif (
+        not ENABLED_FEATURES.torch_tensorrt_runtime and not settings.use_python_runtime
+    ):
 
-        return TorchTensorRTModule(
-            serialized_engine=engine_str,
-            name=name,
-            input_binding_names=list(interpreter_result.input_names),
-            output_binding_names=list(interpreter_result.output_names),
-            settings=settings,
+        logger.info(
+            "Since Torch-TensorRT runtime is not available, using Python Runtime, some features may not be available"
         )
+
+    return rt_cls(
+        serialized_engine=interpreter_result.serialized_engine,
+        input_binding_names=list(interpreter_result.input_names),
+        output_binding_names=list(interpreter_result.output_names),
+        name=name,
+        settings=settings,
+    )
diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
@@ -4,8 +4,8 @@
 from contextlib import nullcontext
 from typing import Any, Dict, List, Optional, Sequence, Tuple
 
-import tensorrt as trt
 import torch
+import torch_tensorrt
 from torch.nn import Module
 from torch_tensorrt._Device import Device
 from torch_tensorrt._enums import dtype
@@ -18,7 +18,7 @@
 from torch_tensorrt.dynamo.utils import DYNAMIC_DIM
 from torch_tensorrt.logging import TRT_LOGGER
 
-import torch_tensorrt
+import tensorrt as trt
 
 logger = logging.getLogger(__name__)
 
@@ -32,17 +32,45 @@ class PythonTorchTensorRTModule(Module):  # type: ignore[misc]
 
     def __init__(
         self,
-        engine: bytes,
-        input_names: Optional[List[str]] = None,
-        output_names: Optional[List[str]] = None,
+        serialized_engine: Optional[bytes] = None,
+        input_binding_names: Optional[List[str]] = None,
+        output_binding_names: Optional[List[str]] = None,
+        *,
+        name: str = "",
         settings: CompilationSettings = CompilationSettings(),
     ):
+        """Takes a name, target device, serialized TensorRT engine, and binding names / order and constructs
+        a PyTorch ``torch.nn.Module`` around it. Uses TensorRT Python APIs to run the engine
+
+        Arguments:
+            serialized_engine (bytes): Serialized TensorRT engine as a ``bytes`` object
+            input_binding_names (List[str]): List of input TensorRT engine binding names in the order they would be passed to the TRT modules
+            output_binding_names (List[str]): List of output TensorRT engine binding names in the order they should be returned
+
+        Keyword Arguments:
+            name (str): Name for module
+            settings (torch_tensorrt.dynamo.CompilationSettings): Settings used to compile engine, assumes engine was built with default compilation settings if object not passed
+
+        Example:
+
+            .. code-block:: py
+
+                trt_module = PythonTorchTensorRTModule(
+                    engine_str,
+                    input_binding_names=["x"],
+                    output_binding_names=["output"],
+                    name="my_module",
+                    settings=CompilationSettings(device=Device(gpu_id=torch.cuda.current_device()))
+                )
+
+        """
         super(PythonTorchTensorRTModule, self).__init__()
         self._register_state_dict_hook(PythonTorchTensorRTModule._on_state_dict)
 
         # Run multi-gpu device check to validate engine instantiation
         multi_gpu_device_check()
 
+        self.name = name
         self.input_buffers: List[torch.Tensor] = []
         self.output_buffers: List[torch.Tensor] = []
         self.cudagraph: Optional[torch.cuda.CUDAGraph] = None
@@ -55,9 +83,13 @@ def __init__(
         # Unused currently - to be used by Dynamic Shape support implementation
         self.memory_pool = None
 
-        self.engine = engine
-        self.input_names = input_names if input_names is not None else []
-        self.output_names = output_names if output_names is not None else []
+        self.serialized_engine = serialized_engine
+        self.input_names = (
+            input_binding_names if input_binding_names is not None else []
+        )
+        self.output_names = (
+            output_binding_names if output_binding_names is not None else []
+        )
         self.initialized = False
         self.target_device_id = (
             settings.device.gpu_id
@@ -69,12 +101,15 @@ def __init__(
         )
         self.profiling_enabled = settings.debug if settings.debug is not None else False
         self.settings = settings
-        self._initialize()
+        self.engine = None
+
+        if self.serialized_engine is not None and not self.settings.lazy_engine_init:
+            self.setup_engine()
 
-    def _initialize(self) -> None:
+    def setup_engine(self) -> None:
         self.initialized = True
         runtime = trt.Runtime(TRT_LOGGER)
-        self.engine = runtime.deserialize_cuda_engine(self.engine)
+        self.engine = runtime.deserialize_cuda_engine(self.serialized_engine)
         self.context = self.engine.create_execution_context()
 
         assert self.engine.num_io_tensors == (
@@ -114,8 +149,7 @@ def _check_initialized(self) -> None:
             raise RuntimeError("PythonTorchTensorRTModule is not initialized.")
 
     def _on_state_dict(self, state_dict: Dict[str, Any], prefix: str, _: Any) -> None:
-        self._check_initialized()
-        state_dict[prefix + "engine"] = bytearray(self.engine.serialize())
+        state_dict[prefix + "engine"] = self.serialized_engine
         state_dict[prefix + "input_names"] = self.input_names
         state_dict[prefix + "output_names"] = self.output_names
 
@@ -129,17 +163,13 @@ def _load_from_state_dict(
         unexpected_keys: Any,
         error_msgs: Any,
     ) -> None:
-        engine_bytes = state_dict[prefix + "engine"]
+        self.serialized_engine = state_dict[prefix + "engine"]
+        self.input_names = state_dict[prefix + "input_names"]
+        self.output_names = state_dict[prefix + "output_names"]
 
         # Run multi-gpu device check to validate engine instantiation
         multi_gpu_device_check()
-
-        runtime = trt.Runtime(TRT_LOGGER)
-        self.engine = runtime.deserialize_cuda_engine(engine_bytes)
-
-        self.input_names = state_dict[prefix + "input_names"]
-        self.output_names = state_dict[prefix + "output_names"]
-        self._initialize()
+        self.setup_engine()
 
     def __getstate__(self) -> Dict[str, Any]:
         state = self.__dict__.copy()