diff --git a/perception/tensorrt_yolox/CMakeLists.txt b/perception/tensorrt_yolox/CMakeLists.txt
index a5498a845e62e..c38824cecd5b1 100644
--- a/perception/tensorrt_yolox/CMakeLists.txt
+++ b/perception/tensorrt_yolox/CMakeLists.txt
@@ -15,6 +15,7 @@ include(CheckLanguage)
 check_language(CUDA)
 if(CMAKE_CUDA_COMPILER)
   enable_language(CUDA)
+  find_package(autoware_type_adapters)
 else()
   message(WARNING "CUDA is not found. preprocess acceleration using CUDA will not be available.")
 endif()
@@ -32,6 +33,7 @@ ament_auto_add_library(${PROJECT_NAME} SHARED
 
 ament_target_dependencies(${PROJECT_NAME}
   OpenCV
+  autoware_type_adapters
 )
 
 if(CMAKE_CUDA_COMPILER)
diff --git a/perception/tensorrt_yolox/config/yolox_s_plus_opt.param.yaml b/perception/tensorrt_yolox/config/yolox_s_plus_opt.param.yaml
index 57c1b40c44a4d..178c3f8bba0a3 100644
--- a/perception/tensorrt_yolox/config/yolox_s_plus_opt.param.yaml
+++ b/perception/tensorrt_yolox/config/yolox_s_plus_opt.param.yaml
@@ -35,3 +35,6 @@
     clip_value: 6.0 # If positive value is specified, the value of each layer output will be clipped between [0.0, clip_value]. This option is valid only when precision==int8 and used to manually specify the dynamic range instead of using any calibration.
     preprocess_on_gpu: true # If true, pre-processing is performed on GPU.
     calibration_image_list_path: "" # Path to a file which contains path to images. Those images will be used for int8 quantization.
+    is_using_image_transport: true # if true, will use image_transport, false -> will only publish raw images
+    type_adaptation_activated: true # if true, will use type_adaptation to transfer images in GPU.
+    is_publish_debug_rois_image_: true # if true, will draw ROIs in the image.
diff --git a/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox.hpp b/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox.hpp
index d287c8a44d4cf..7aa9e6d036a87 100644
--- a/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox.hpp
+++ b/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox.hpp
@@ -15,6 +15,8 @@
 #ifndef TENSORRT_YOLOX__TENSORRT_YOLOX_HPP_
 #define TENSORRT_YOLOX__TENSORRT_YOLOX_HPP_
 
+#include "type_adapters/image_container.hpp"
+
 #include <cuda_utils/cuda_unique_ptr.hpp>
 #include <cuda_utils/stream_unique_ptr.hpp>
 #include <opencv2/opencv.hpp>
@@ -31,6 +33,7 @@ using cuda_utils::CudaUniquePtr;
 using cuda_utils::CudaUniquePtrHost;
 using cuda_utils::makeCudaStream;
 using cuda_utils::StreamUniquePtr;
+using ImageContainer = autoware::type_adaptation::type_adapters::ImageContainer;
 
 struct Object
 {
@@ -107,6 +110,10 @@ class TrtYoloX
     const std::vector<cv::Mat> & images, ObjectArrays & objects, std::vector<cv::Mat> & masks,
     std::vector<cv::Mat> & color_masks);
 
+  bool doInference(
+    const std::vector<std::shared_ptr<ImageContainer>> & images, ObjectArrays & objects,
+    std::vector<cv::Mat> & masks, std::vector<cv::Mat> & color_masks);
+
   /**
    * @brief run inference including pre-process and post-process
    * @param[out] objects results for object detection
@@ -116,6 +123,10 @@ class TrtYoloX
   bool doInferenceWithRoi(
     const std::vector<cv::Mat> & images, ObjectArrays & objects, const std::vector<cv::Rect> & roi);
 
+  bool doInferenceWithRoi(
+    const std::vector<std::shared_ptr<ImageContainer>> & images, ObjectArrays & objects,
+    const std::vector<cv::Rect> & roi);
+
   /**
    * @brief run multi-scale inference including pre-process and post-process
    * @param[out] objects results for object detection
@@ -125,6 +136,10 @@ class TrtYoloX
   bool doMultiScaleInference(
     const cv::Mat & image, ObjectArrays & objects, const std::vector<cv::Rect> & roi);
 
+  bool doMultiScaleInference(
+    const std::shared_ptr<ImageContainer> & image, ObjectArrays & objects,
+    const std::vector<cv::Rect> & roi);
+
   /**
    * @brief allocate buffer for preprocess on GPU
    * @param[in] width original image width
@@ -167,6 +182,7 @@ class TrtYoloX
    * @param[in] images batching images
    */
   void preprocessGpu(const std::vector<cv::Mat> & images);
+  void preprocessGpu(const std::vector<std::shared_ptr<ImageContainer>> & images);
 
   /**
    * @brief run preprocess including resizing, letterbox, NHWC2NCHW and toFloat on CPU
@@ -183,6 +199,9 @@ class TrtYoloX
   void preprocessWithRoiGpu(
     const std::vector<cv::Mat> & images, const std::vector<cv::Rect> & rois);
 
+  void preprocessWithRoiGpu(
+    const std::vector<std::shared_ptr<ImageContainer>> & images,
+    const std::vector<cv::Rect> & rois);
   /**
    * @brief run multi-scale preprocess including resizing, letterbox, NHWC2NCHW and toFloat on CPU
    * @param[in] images batching images
@@ -196,15 +215,29 @@ class TrtYoloX
    * @param[in] rois region of interest
    */
   void multiScalePreprocessGpu(const cv::Mat & image, const std::vector<cv::Rect> & rois);
+  void multiScalePreprocessGpu(
+    const std::shared_ptr<ImageContainer> & image, const std::vector<cv::Rect> & rois);
 
   bool multiScaleFeedforward(const cv::Mat & image, int batch_size, ObjectArrays & objects);
+  bool multiScaleFeedforward(
+    const std::shared_ptr<ImageContainer> & image, int batch_size, ObjectArrays & objects);
+
   bool multiScaleFeedforwardAndDecode(
     const cv::Mat & images, int batch_size, ObjectArrays & objects);
+  bool multiScaleFeedforwardAndDecode(
+    const std::shared_ptr<ImageContainer> & images, int batch_size, ObjectArrays & objects);
 
   bool feedforward(const std::vector<cv::Mat> & images, ObjectArrays & objects);
+  bool feedforward(
+    const std::vector<std::shared_ptr<ImageContainer>> & images, ObjectArrays & objects);
+
   bool feedforwardAndDecode(
     const std::vector<cv::Mat> & images, ObjectArrays & objects, std::vector<cv::Mat> & masks,
     std::vector<cv::Mat> & color_masks);
+  bool feedforwardAndDecode(
+    const std::vector<std::shared_ptr<ImageContainer>> & images, ObjectArrays & objects,
+    std::vector<cv::Mat> & masks, std::vector<cv::Mat> & color_masks);
+
   void decodeOutputs(float * prob, ObjectArray & objects, float scale, cv::Size & img_size) const;
   void generateGridsAndStride(
     const int target_w, const int target_h, const std::vector<int> & strides,
diff --git a/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox_node.hpp b/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox_node.hpp
index 64332dffa834e..7e16ac7d0b4bc 100644
--- a/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox_node.hpp
+++ b/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox_node.hpp
@@ -16,6 +16,7 @@
 #define TENSORRT_YOLOX__TENSORRT_YOLOX_NODE_HPP_
 
 #include "object_recognition_utils/object_recognition_utils.hpp"
+#include "type_adapters/image_container.hpp"
 
 #include <autoware/universe_utils/ros/debug_publisher.hpp>
 #include <autoware/universe_utils/system/stop_watch.hpp>
@@ -48,6 +49,7 @@ namespace tensorrt_yolox
 // cspell: ignore Semseg
 using LabelMap = std::map<int, std::string>;
 using Label = tier4_perception_msgs::msg::Semantic;
+using ImageContainer = autoware::type_adaptation::type_adapters::ImageContainer;
 class TrtYoloXNode : public rclcpp::Node
 {
   struct RoiOverlaySemsegLabel
@@ -74,12 +76,22 @@ class TrtYoloXNode : public rclcpp::Node
 
 private:
   void onConnect();
+  void setUpImageSubscriber();
+  bool checkInputBlocked();
+  uint32_t getNumOutputConnections();
   void onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg);
+  void onGpuImage(const std::shared_ptr<ImageContainer> msg);
   bool readLabelFile(const std::string & label_path);
   void replaceLabelMap();
   void overlapSegmentByRoi(
     const tensorrt_yolox::Object & object, cv::Mat & mask, const int width, const int height);
   int mapRoiLabel2SegLabel(const int32_t roi_label_index);
+  void postInference(
+    const std_msgs::msg::Header header, const tensorrt_yolox::ObjectArrays & objects,
+    const int width, const int height, std::vector<cv::Mat> & masks);
+  void drawImageDetection(
+    cv_bridge::CvImagePtr image, const tensorrt_yolox::ObjectArrays & objects, const int width,
+    const int height);
   image_transport::Publisher image_pub_;
   image_transport::Publisher mask_pub_;
 
@@ -88,6 +100,10 @@ class TrtYoloXNode : public rclcpp::Node
   rclcpp::Publisher<tier4_perception_msgs::msg::DetectedObjectsWithFeature>::SharedPtr objects_pub_;
 
   image_transport::Subscriber image_sub_;
+  rclcpp::Subscription<ImageContainer>::SharedPtr gpu_image_sub_;
+  rclcpp::Publisher<sensor_msgs::msg::Image>::SharedPtr simple_image_pub_;
+  rclcpp::Publisher<sensor_msgs::msg::Image>::SharedPtr simple_mask_pub_;
+  rclcpp::Publisher<sensor_msgs::msg::Image>::SharedPtr simple_color_mask_pub_;
 
   rclcpp::TimerBase::SharedPtr timer_;
 
@@ -95,6 +111,9 @@ class TrtYoloXNode : public rclcpp::Node
   std::unique_ptr<tensorrt_yolox::TrtYoloX> trt_yolox_;
   bool is_roi_overlap_segment_;
   bool is_publish_color_mask_;
+  bool is_using_image_transport_;
+  bool type_adaptation_activated_;
+  bool is_publish_debug_rois_image_;
   float overlap_roi_score_threshold_;
   // TODO(badai-nguyen): change to function
   std::map<std::string, int> remap_roi_to_semantic_ = {
diff --git a/perception/tensorrt_yolox/package.xml b/perception/tensorrt_yolox/package.xml
index 8a80d9d82d09f..0b5b45ecb7044 100644
--- a/perception/tensorrt_yolox/package.xml
+++ b/perception/tensorrt_yolox/package.xml
@@ -19,6 +19,7 @@
   <buildtool_export_depend>tensorrt_cmake_module</buildtool_export_depend>
 
   <depend>autoware_perception_msgs</depend>
+  <depend>autoware_type_adapters</depend>
   <depend>cuda_utils</depend>
   <depend>cv_bridge</depend>
   <depend>image_transport</depend>
diff --git a/perception/tensorrt_yolox/schema/yolox_s_plus_opt.schema.json b/perception/tensorrt_yolox/schema/yolox_s_plus_opt.schema.json
index ce1ad6c2d0caf..352512c401812 100644
--- a/perception/tensorrt_yolox/schema/yolox_s_plus_opt.schema.json
+++ b/perception/tensorrt_yolox/schema/yolox_s_plus_opt.schema.json
@@ -74,6 +74,21 @@
           "type": "string",
           "default": "",
           "description": "Path to a file which contains path to images. Those images will be used for int8 quantization."
+        },
+        "is_using_image_transport": {
+          "type": "boolean",
+          "default": false,
+          "description": "If true, will use image_transport, false -> will only publish raw images"
+        },
+        "type_adaptation_activated": {
+          "type": "boolean",
+          "default": false,
+          "description": "If true, will use type_adaptation to transfer images in GPU."
+        },
+        "is_publish_debug_rois_image_": {
+          "type": "boolean",
+          "default": true,
+          "description": "If true, will draw ROIs in the image."
         }
       },
       "required": [
diff --git a/perception/tensorrt_yolox/src/tensorrt_yolox.cpp b/perception/tensorrt_yolox/src/tensorrt_yolox.cpp
index af6cecd2eff7d..8f8e363790e75 100644
--- a/perception/tensorrt_yolox/src/tensorrt_yolox.cpp
+++ b/perception/tensorrt_yolox/src/tensorrt_yolox.cpp
@@ -499,6 +499,93 @@ void TrtYoloX::preprocessGpu(const std::vector<cv::Mat> & images)
     images[0].rows, 3, batch_size, static_cast<float>(norm_factor_), *stream_);
 }
 
+void TrtYoloX::preprocessGpu(const std::vector<std::shared_ptr<ImageContainer>> & images)
+{
+  const auto batch_size = images.size();
+  auto input_dims = trt_common_->getBindingDimensions(0);
+
+  input_dims.d[0] = batch_size;
+  for (const auto & image : images) {
+    // if size of source input has been changed...
+    int width = image->width();
+    int height = image->height();
+    if (src_width_ != -1 || src_height_ != -1) {
+      if (width != src_width_ || height != src_height_) {
+        // Free cuda memory to reallocate
+        if (image_buf_h_) {
+          image_buf_h_.reset();
+        }
+        if (image_buf_d_) {
+          image_buf_d_.reset();
+        }
+        if (argmax_buf_h_) {
+          argmax_buf_h_.reset();
+        }
+        if (argmax_buf_d_) {
+          argmax_buf_d_.reset();
+        }
+      }
+    }
+    src_width_ = width;
+    src_height_ = height;
+  }
+  if (!image_buf_h_) {
+    trt_common_->setBindingDimensions(0, input_dims);
+    scales_.clear();
+  }
+  const float input_height = static_cast<float>(input_dims.d[2]);
+  const float input_width = static_cast<float>(input_dims.d[3]);
+  int b = 0;
+  size_t argmax_out_elem_num = 0;
+  for (const auto & image : images) {
+    if (!image_buf_h_) {
+      const float scale = std::min(input_width / image->width(), input_height / image->height());
+      scales_.emplace_back(scale);
+      image_buf_h_ = cuda_utils::make_unique_host<unsigned char[]>(
+        image->width() * image->height() * 3 * batch_size, cudaHostAllocWriteCombined);
+      image_buf_d_ =
+        cuda_utils::make_unique<unsigned char[]>(image->width() * image->height() * 3 * batch_size);
+    }
+    int index = b * image->width() * image->height() * 3;
+    // Copy into pinned memory
+    // synchroize the stream
+    cudaStreamSynchronize(image->cuda_stream()->stream());
+    CHECK_CUDA_ERROR(cudaMemcpyAsync(
+      image_buf_d_.get() + index, image->cuda_mem(),
+      image->width() * image->height() * 3 * sizeof(unsigned char), cudaMemcpyDeviceToDevice,
+      *stream_));
+    b++;
+
+    if (multitask_) {
+      for (int m = 0; m < multitask_; m++) {
+        const auto output_dims =
+          trt_common_->getBindingDimensions(m + 2);  // 0: input, 1: output for detections
+        const float scale = std::min(
+          output_dims.d[3] / static_cast<float>(image->width()),
+          output_dims.d[2] / static_cast<float>(image->height()));
+        int out_w = static_cast<int>(image->width() * scale);
+        int out_h = static_cast<int>(image->height() * scale);
+        argmax_out_elem_num += out_w * out_h * batch_size;
+      }
+    }
+  }
+
+  if (multitask_) {
+    if (!argmax_buf_h_) {
+      argmax_buf_h_ =
+        cuda_utils::make_unique_host<unsigned char[]>(argmax_out_elem_num, cudaHostAllocPortable);
+    }
+    if (!argmax_buf_d_) {
+      argmax_buf_d_ = cuda_utils::make_unique<unsigned char[]>(argmax_out_elem_num);
+    }
+  }
+
+  // Preprocess on GPU
+  resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu(
+    input_d_.get(), image_buf_d_.get(), input_width, input_height, 3, images[0]->width(),
+    images[0]->height(), 3, batch_size, static_cast<float>(norm_factor_), *stream_);
+}
+
 void TrtYoloX::preprocess(const std::vector<cv::Mat> & images)
 {
   const auto batch_size = images.size();
@@ -566,6 +653,23 @@ bool TrtYoloX::doInference(
   }
 }
 
+bool TrtYoloX::doInference(
+  const std::vector<std::shared_ptr<ImageContainer>> & images, ObjectArrays & objects,
+  std::vector<cv::Mat> & masks, [[maybe_unused]] std::vector<cv::Mat> & color_masks)
+{
+  if (!trt_common_->isInitialized()) {
+    return false;
+  }
+
+  preprocessGpu(images);
+
+  if (needs_output_decode_) {
+    return feedforwardAndDecode(images, objects, masks, color_masks);
+  } else {
+    return feedforward(images, objects);
+  }
+}
+
 void TrtYoloX::preprocessWithRoiGpu(
   const std::vector<cv::Mat> & images, const std::vector<cv::Rect> & rois)
 {
@@ -890,6 +994,60 @@ bool TrtYoloX::feedforward(const std::vector<cv::Mat> & images, ObjectArrays & o
   return true;
 }
 
+bool TrtYoloX::feedforward(
+  const std::vector<std::shared_ptr<ImageContainer>> & images, ObjectArrays & objects)
+{
+  std::vector<void *> buffers = {
+    input_d_.get(), out_num_detections_d_.get(), out_boxes_d_.get(), out_scores_d_.get(),
+    out_classes_d_.get()};
+
+  trt_common_->enqueueV2(buffers.data(), *stream_, nullptr);
+
+  const auto batch_size = images.size();
+  auto out_num_detections = std::make_unique<int32_t[]>(batch_size);
+  auto out_boxes = std::make_unique<float[]>(4 * batch_size * max_detections_);
+  auto out_scores = std::make_unique<float[]>(batch_size * max_detections_);
+  auto out_classes = std::make_unique<int32_t[]>(batch_size * max_detections_);
+
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    out_num_detections.get(), out_num_detections_d_.get(), sizeof(int32_t) * batch_size,
+    cudaMemcpyDeviceToHost, *stream_));
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    out_boxes.get(), out_boxes_d_.get(), sizeof(float) * 4 * batch_size * max_detections_,
+    cudaMemcpyDeviceToHost, *stream_));
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    out_scores.get(), out_scores_d_.get(), sizeof(float) * batch_size * max_detections_,
+    cudaMemcpyDeviceToHost, *stream_));
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    out_classes.get(), out_classes_d_.get(), sizeof(int32_t) * batch_size * max_detections_,
+    cudaMemcpyDeviceToHost, *stream_));
+  cudaStreamSynchronize(*stream_);
+  objects.clear();
+  for (size_t i = 0; i < batch_size; ++i) {
+    const size_t num_detection = static_cast<size_t>(out_num_detections[i]);
+    ObjectArray object_array;
+    object_array.reserve(num_detection);
+    for (size_t j = 0; j < num_detection; ++j) {
+      Object object{};
+      const auto x1 = out_boxes[i * max_detections_ * 4 + j * 4] / scales_[i];
+      const auto y1 = out_boxes[i * max_detections_ * 4 + j * 4 + 1] / scales_[i];
+      const auto x2 = out_boxes[i * max_detections_ * 4 + j * 4 + 2] / scales_[i];
+      const auto y2 = out_boxes[i * max_detections_ * 4 + j * 4 + 3] / scales_[i];
+      object.x_offset =
+        std::clamp(0, static_cast<int32_t>(x1), static_cast<int32_t>(images[i]->width()));
+      object.y_offset =
+        std::clamp(0, static_cast<int32_t>(y1), static_cast<int32_t>(images[i]->height()));
+      object.width = static_cast<int32_t>(std::max(0.0F, x2 - x1));
+      object.height = static_cast<int32_t>(std::max(0.0F, y2 - y1));
+      object.score = out_scores[i * max_detections_ + j];
+      object.type = out_classes[i * max_detections_ + j];
+      object_array.emplace_back(object);
+    }
+    objects.emplace_back(object_array);
+  }
+  return true;
+}
+
 bool TrtYoloX::feedforwardAndDecode(
   const std::vector<cv::Mat> & images, ObjectArrays & objects, std::vector<cv::Mat> & out_masks,
   [[maybe_unused]] std::vector<cv::Mat> & color_masks)
@@ -959,6 +1117,76 @@ bool TrtYoloX::feedforwardAndDecode(
   return true;
 }
 
+bool TrtYoloX::feedforwardAndDecode(
+  const std::vector<std::shared_ptr<ImageContainer>> & images, ObjectArrays & objects,
+  std::vector<cv::Mat> & out_masks, [[maybe_unused]] std::vector<cv::Mat> & color_masks)
+{
+  std::vector<void *> buffers = {input_d_.get(), out_prob_d_.get()};
+  if (multitask_) {
+    buffers = {input_d_.get(), out_prob_d_.get(), segmentation_out_prob_d_.get()};
+  }
+  trt_common_->enqueueV2(buffers.data(), images[0]->cuda_stream()->stream(), nullptr);
+
+  const auto batch_size = images.size();
+
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    out_prob_h_.get(), out_prob_d_.get(), sizeof(float) * out_elem_num_, cudaMemcpyDeviceToHost,
+    images[0]->cuda_stream()->stream()));
+  if (multitask_ && !use_gpu_preprocess_) {
+    CHECK_CUDA_ERROR(cudaMemcpyAsync(
+      segmentation_out_prob_h_.get(), segmentation_out_prob_d_.get(),
+      sizeof(float) * segmentation_out_elem_num_, cudaMemcpyDeviceToHost,
+      images[0]->cuda_stream()->stream()));
+  }
+  cudaStreamSynchronize(images[0]->cuda_stream()->stream());
+  objects.clear();
+
+  for (size_t i = 0; i < batch_size; ++i) {
+    auto image_size = cv::Size(images[i]->width(), images[i]->height());
+    float * batch_prob = out_prob_h_.get() + (i * out_elem_num_per_batch_);
+    ObjectArray object_array;
+    decodeOutputs(batch_prob, object_array, scales_[i], image_size);
+    // add refine mask using object
+    objects.emplace_back(object_array);
+    if (multitask_) {
+      segmentation_masks_.clear();
+
+      size_t counter = 0;
+      int batch =
+        static_cast<int>(segmentation_out_elem_num_ / segmentation_out_elem_num_per_batch_);
+      for (int m = 0; m < multitask_; m++) {
+        const auto output_dims =
+          trt_common_->getBindingDimensions(m + 2);  // 0 : input, 1 : output for detections
+        size_t out_elem_num = std::accumulate(
+          output_dims.d + 1, output_dims.d + output_dims.nbDims, 1, std::multiplies<int>());
+        out_elem_num = out_elem_num * batch;
+        const float scale = std::min(
+          output_dims.d[3] / static_cast<float>(image_size.width),
+          output_dims.d[2] / static_cast<float>(image_size.height));
+        int out_w = static_cast<int>(image_size.width * scale);
+        int out_h = static_cast<int>(image_size.height * scale);
+        cv::Mat mask;
+        if (use_gpu_preprocess_) {
+          float * d_segmentation_results =
+            segmentation_out_prob_d_.get() + (i * segmentation_out_elem_num_per_batch_);
+          mask = getMaskImageGpu(&(d_segmentation_results[counter]), output_dims, out_w, out_h, i);
+        } else {
+          float * segmentation_results =
+            segmentation_out_prob_h_.get() + (i * segmentation_out_elem_num_per_batch_);
+          mask = getMaskImage(&(segmentation_results[counter]), output_dims, out_w, out_h);
+        }
+        segmentation_masks_.emplace_back(std::move(mask));
+        counter += out_elem_num;
+      }
+      // semantic segmentation was fixed as first task
+      out_masks.at(i) = segmentation_masks_.at(0);
+    } else {
+      continue;
+    }
+  }
+  return true;
+}
+
 // This method is assumed to be called when specified YOLOX model contains
 // EfficientNMS_TRT module.
 bool TrtYoloX::multiScaleFeedforward(const cv::Mat & image, int batch_size, ObjectArrays & objects)
@@ -1011,6 +1239,59 @@ bool TrtYoloX::multiScaleFeedforward(const cv::Mat & image, int batch_size, Obje
   return true;
 }
 
+bool TrtYoloX::multiScaleFeedforward(
+  const std::shared_ptr<ImageContainer> & image, int batch_size, ObjectArrays & objects)
+{
+  std::vector<void *> buffers = {
+    input_d_.get(), out_num_detections_d_.get(), out_boxes_d_.get(), out_scores_d_.get(),
+    out_classes_d_.get()};
+
+  trt_common_->enqueueV2(buffers.data(), image->cuda_stream()->stream(), nullptr);
+
+  auto out_num_detections = std::make_unique<int32_t[]>(batch_size);
+  auto out_boxes = std::make_unique<float[]>(4 * batch_size * max_detections_);
+  auto out_scores = std::make_unique<float[]>(batch_size * max_detections_);
+  auto out_classes = std::make_unique<int32_t[]>(batch_size * max_detections_);
+
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    out_num_detections.get(), out_num_detections_d_.get(), sizeof(int32_t) * batch_size,
+    cudaMemcpyDeviceToHost, image->cuda_stream()->stream()));
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    out_boxes.get(), out_boxes_d_.get(), sizeof(float) * 4 * batch_size * max_detections_,
+    cudaMemcpyDeviceToHost, image->cuda_stream()->stream()));
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    out_scores.get(), out_scores_d_.get(), sizeof(float) * batch_size * max_detections_,
+    cudaMemcpyDeviceToHost, image->cuda_stream()->stream()));
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    out_classes.get(), out_classes_d_.get(), sizeof(int32_t) * batch_size * max_detections_,
+    cudaMemcpyDeviceToHost, image->cuda_stream()->stream()));
+  cudaStreamSynchronize(image->cuda_stream()->stream());
+
+  objects.clear();
+  for (int i = 0; i < batch_size; ++i) {
+    const size_t num_detection = static_cast<size_t>(out_num_detections[i]);
+    ObjectArray object_array(num_detection);
+    for (size_t j = 0; j < num_detection; ++j) {
+      Object object{};
+      const auto x1 = out_boxes[i * max_detections_ * 4 + j * 4] / scales_[i];
+      const auto y1 = out_boxes[i * max_detections_ * 4 + j * 4 + 1] / scales_[i];
+      const auto x2 = out_boxes[i * max_detections_ * 4 + j * 4 + 2] / scales_[i];
+      const auto y2 = out_boxes[i * max_detections_ * 4 + j * 4 + 3] / scales_[i];
+      object.x_offset =
+        std::clamp(0, static_cast<int32_t>(x1), static_cast<int32_t>(image->width()));
+      object.y_offset =
+        std::clamp(0, static_cast<int32_t>(y1), static_cast<int32_t>(image->height()));
+      object.width = static_cast<int32_t>(std::max(0.0F, x2 - x1));
+      object.height = static_cast<int32_t>(std::max(0.0F, y2 - y1));
+      object.score = out_scores[i * max_detections_ + j];
+      object.type = out_classes[i * max_detections_ + j];
+      object_array.emplace_back(object);
+    }
+    objects.emplace_back(object_array);
+  }
+  return true;
+}
+
 bool TrtYoloX::multiScaleFeedforwardAndDecode(
   const cv::Mat & image, int batch_size, ObjectArrays & objects)
 {
@@ -1033,6 +1314,28 @@ bool TrtYoloX::multiScaleFeedforwardAndDecode(
   return true;
 }
 
+bool TrtYoloX::multiScaleFeedforwardAndDecode(
+  const std::shared_ptr<ImageContainer> & image, int batch_size, ObjectArrays & objects)
+{
+  std::vector<void *> buffers = {input_d_.get(), out_prob_d_.get()};
+  trt_common_->enqueueV2(buffers.data(), image->cuda_stream()->stream(), nullptr);
+
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    out_prob_h_.get(), out_prob_d_.get(), sizeof(float) * out_elem_num_, cudaMemcpyDeviceToHost,
+    image->cuda_stream()->stream()));
+  cudaStreamSynchronize(image->cuda_stream()->stream());
+  objects.clear();
+
+  for (int i = 0; i < batch_size; ++i) {
+    auto image_size = cv::Size(image->width(), image->height());
+    float * batch_prob = out_prob_h_.get() + (i * out_elem_num_per_batch_);
+    ObjectArray object_array;
+    decodeOutputs(batch_prob, object_array, scales_[i], image_size);
+    objects.emplace_back(object_array);
+  }
+  return true;
+}
+
 void TrtYoloX::decodeOutputs(
   float * prob, ObjectArray & objects, float scale, cv::Size & img_size) const
 {
diff --git a/perception/tensorrt_yolox/src/tensorrt_yolox_node.cpp b/perception/tensorrt_yolox/src/tensorrt_yolox_node.cpp
index f4e544c11ffb1..73120b69e5a88 100644
--- a/perception/tensorrt_yolox/src/tensorrt_yolox_node.cpp
+++ b/perception/tensorrt_yolox/src/tensorrt_yolox_node.cpp
@@ -107,6 +107,11 @@ TrtYoloXNode::TrtYoloXNode(const rclcpp::NodeOptions & node_options)
   is_roi_overlap_segment_ = declare_parameter<bool>("is_roi_overlap_segment");
   is_publish_color_mask_ = declare_parameter<bool>("is_publish_color_mask");
   overlap_roi_score_threshold_ = declare_parameter<float>("overlap_roi_score_threshold");
+  is_using_image_transport_ = declare_parameter<bool>("is_using_image_transport");
+  type_adaptation_activated_ = declare_parameter<bool>("type_adaptation_activated");
+  is_publish_debug_rois_image_ = declare_parameter<bool>("is_publish_debug_rois_image");
+  // We don't use type adaptation when preprocessing is performed on CPU.
+  type_adaptation_activated_ = type_adaptation_activated_ && preprocess_on_gpu;
   roi_overlay_segment_labels_.UNKNOWN =
     declare_parameter<bool>("roi_overlay_segment_label.UNKNOWN");
   roi_overlay_segment_labels_.CAR = declare_parameter<bool>("roi_overlay_segment_label.CAR");
@@ -140,9 +145,15 @@ TrtYoloXNode::TrtYoloXNode(const rclcpp::NodeOptions & node_options)
 
   objects_pub_ = this->create_publisher<tier4_perception_msgs::msg::DetectedObjectsWithFeature>(
     "~/out/objects", 1);
-  mask_pub_ = image_transport::create_publisher(this, "~/out/mask");
-  color_mask_pub_ = image_transport::create_publisher(this, "~/out/color_mask");
-  image_pub_ = image_transport::create_publisher(this, "~/out/image");
+  if (is_using_image_transport_) {
+    mask_pub_ = image_transport::create_publisher(this, "~/out/mask");
+    color_mask_pub_ = image_transport::create_publisher(this, "~/out/color_mask");
+    image_pub_ = image_transport::create_publisher(this, "~/out/image");
+  } else {
+    simple_image_pub_ = this->create_publisher<sensor_msgs::msg::Image>("~/out/image", 1);
+    simple_mask_pub_ = this->create_publisher<sensor_msgs::msg::Image>("~/out/mask", 1);
+    simple_color_mask_pub_ = this->create_publisher<sensor_msgs::msg::Image>("~/out/color_mask", 1);
+  }
 
   if (declare_parameter("build_only", false)) {
     RCLCPP_INFO(this->get_logger(), "TensorRT engine file is built and exit.");
@@ -150,48 +161,66 @@ TrtYoloXNode::TrtYoloXNode(const rclcpp::NodeOptions & node_options)
   }
 }
 
-void TrtYoloXNode::onConnect()
+bool TrtYoloXNode::checkInputBlocked()
 {
-  using std::placeholders::_1;
-  if (
-    objects_pub_->get_subscription_count() == 0 &&
-    objects_pub_->get_intra_process_subscription_count() == 0 &&
-    image_pub_.getNumSubscribers() == 0 && mask_pub_.getNumSubscribers() == 0 &&
-    color_mask_pub_.getNumSubscribers() == 0) {
-    image_sub_.shutdown();
-  } else if (!image_sub_) {
+  bool result;
+  if (type_adaptation_activated_) {
+    result = (gpu_image_sub_ == nullptr) || (gpu_image_sub_->get_publisher_count() == 0);
+  } else {
+    result = !image_sub_;
+  }
+  return result;
+}
+
+void TrtYoloXNode::setUpImageSubscriber()
+{
+  if (type_adaptation_activated_) {
+    gpu_image_sub_ = this->create_subscription<ImageContainer>(
+      "~/in/image", rclcpp::QoS(1).best_effort().durability_volatile(),
+      std::bind(&TrtYoloXNode::onGpuImage, this, std::placeholders::_1));
+  } else {
     image_sub_ = image_transport::create_subscription(
-      this, "~/in/image", std::bind(&TrtYoloXNode::onImage, this, _1), "raw",
+      this, "~/in/image", std::bind(&TrtYoloXNode::onImage, this, std::placeholders::_1), "raw",
       rmw_qos_profile_sensor_data);
   }
 }
 
-void TrtYoloXNode::onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg)
+uint32_t TrtYoloXNode::getNumOutputConnections()
 {
-  stop_watch_ptr_->toc("processing_time", true);
-  tier4_perception_msgs::msg::DetectedObjectsWithFeature out_objects;
-
-  cv_bridge::CvImagePtr in_image_ptr;
-  try {
-    in_image_ptr = cv_bridge::toCvCopy(msg, sensor_msgs::image_encodings::BGR8);
-  } catch (cv_bridge::Exception & e) {
-    RCLCPP_ERROR(this->get_logger(), "cv_bridge exception: %s", e.what());
-    return;
+  uint32_t num_subscribers = 0;
+  // Image output topic conenctions
+  if (is_using_image_transport_) {
+    num_subscribers = image_pub_.getNumSubscribers() + mask_pub_.getNumSubscribers() +
+                      color_mask_pub_.getNumSubscribers();
+  } else {
+    num_subscribers = simple_image_pub_->get_subscription_count() +
+                      simple_mask_pub_->get_subscription_count() +
+                      simple_color_mask_pub_->get_subscription_count();
   }
-  const auto width = in_image_ptr->image.cols;
-  const auto height = in_image_ptr->image.rows;
+  // Objects output topic connections
+  num_subscribers +=
+    objects_pub_->get_subscription_count() + objects_pub_->get_intra_process_subscription_count();
+  return num_subscribers;
+}
 
-  tensorrt_yolox::ObjectArrays objects;
-  std::vector<cv::Mat> masks = {cv::Mat(cv::Size(height, width), CV_8UC1, cv::Scalar(0))};
-  std::vector<cv::Mat> color_masks = {
-    cv::Mat(cv::Size(height, width), CV_8UC3, cv::Scalar(0, 0, 0))};
+void TrtYoloXNode::onConnect()
+{
+  using std::placeholders::_1;
 
-  if (!trt_yolox_->doInference({in_image_ptr->image}, objects, masks, color_masks)) {
-    RCLCPP_WARN(this->get_logger(), "Fail to inference");
-    return;
+  if (getNumOutputConnections() == 0) {
+    image_sub_.shutdown();
+    gpu_image_sub_.reset();
+  } else if (checkInputBlocked()) {
+    setUpImageSubscriber();
   }
-  auto & mask = masks.at(0);
+}
 
+void TrtYoloXNode::postInference(
+  const std_msgs::msg::Header header, const tensorrt_yolox::ObjectArrays & objects, const int width,
+  const int height, std::vector<cv::Mat> & masks)
+{
+  auto & mask = masks.at(0);
+  tier4_perception_msgs::msg::DetectedObjectsWithFeature out_objects;
   for (const auto & yolox_object : objects.at(0)) {
     tier4_perception_msgs::msg::DetectedObjectWithFeature object;
     object.feature.roi.x_offset = yolox_object.x_offset;
@@ -202,15 +231,7 @@ void TrtYoloXNode::onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg)
     object.object.classification =
       object_recognition_utils::toObjectClassifications(label_map_[yolox_object.type], 1.0f);
     out_objects.feature_objects.push_back(object);
-    const auto left = std::max(0, static_cast<int>(object.feature.roi.x_offset));
-    const auto top = std::max(0, static_cast<int>(object.feature.roi.y_offset));
-    const auto right =
-      std::min(static_cast<int>(object.feature.roi.x_offset + object.feature.roi.width), width);
-    const auto bottom =
-      std::min(static_cast<int>(object.feature.roi.y_offset + object.feature.roi.height), height);
-    cv::rectangle(
-      in_image_ptr->image, cv::Point(left, top), cv::Point(right, bottom), cv::Scalar(0, 0, 255), 3,
-      8, 0);
+
     // Refine mask: replacing segmentation mask by roi class
     // This should remove when the segmentation accuracy is high
     if (is_roi_overlap_segment_ && trt_yolox_->getMultitaskNum() > 0) {
@@ -221,7 +242,7 @@ void TrtYoloXNode::onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg)
     sensor_msgs::msg::Image::SharedPtr out_mask_msg =
       cv_bridge::CvImage(std_msgs::msg::Header(), sensor_msgs::image_encodings::MONO8, mask)
         .toImageMsg();
-    out_mask_msg->header = msg->header;
+    out_mask_msg->header = header;
 
     std::vector<std::pair<uint8_t, int>> compressed_data = perception_utils::runLengthEncoder(mask);
     int step = sizeof(uint8_t) + sizeof(int);
@@ -230,10 +251,13 @@ void TrtYoloXNode::onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg)
       std::memcpy(&out_mask_msg->data[i * step], &compressed_data.at(i).first, sizeof(uint8_t));
       std::memcpy(&out_mask_msg->data[i * step + 1], &compressed_data.at(i).second, sizeof(int));
     }
-    mask_pub_.publish(out_mask_msg);
+    if (is_using_image_transport_) {
+      mask_pub_.publish(out_mask_msg);
+    } else {
+      simple_mask_pub_->publish(*out_mask_msg);
+    }
   }
-  image_pub_.publish(in_image_ptr->toImageMsg());
-  out_objects.header = msg->header;
+  out_objects.header = header;
   objects_pub_->publish(out_objects);
 
   if (debug_publisher_) {
@@ -258,8 +282,96 @@ void TrtYoloXNode::onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg)
     sensor_msgs::msg::Image::SharedPtr output_color_mask_msg =
       cv_bridge::CvImage(std_msgs::msg::Header(), sensor_msgs::image_encodings::BGR8, color_mask)
         .toImageMsg();
-    output_color_mask_msg->header = msg->header;
-    color_mask_pub_.publish(output_color_mask_msg);
+    output_color_mask_msg->header = header;
+    if (is_using_image_transport_) {
+      color_mask_pub_.publish(output_color_mask_msg);
+    } else {
+      simple_color_mask_pub_->publish(*output_color_mask_msg);
+    }
+  }
+}
+
+void TrtYoloXNode::drawImageDetection(
+  cv_bridge::CvImagePtr image_ptr, const tensorrt_yolox::ObjectArrays & objects, const int width,
+  const int height)
+{
+  for (const auto & yolox_object : objects.at(0)) {
+    const auto left = std::max(0, static_cast<int>(yolox_object.x_offset));
+    const auto top = std::max(0, static_cast<int>(yolox_object.y_offset));
+    const auto right =
+      std::min(static_cast<int>(yolox_object.x_offset + yolox_object.width), width);
+    const auto bottom =
+      std::min(static_cast<int>(yolox_object.y_offset + yolox_object.height), height);
+    cv::rectangle(
+      image_ptr->image, cv::Point(left, top), cv::Point(right, bottom), cv::Scalar(0, 0, 255), 3, 8,
+      0);
+  }
+
+  sensor_msgs::msg::Image::SharedPtr output_image_msg_ptr = image_ptr->toImageMsg();
+  if (is_using_image_transport_) {
+    image_pub_.publish(output_image_msg_ptr);
+  } else {
+    simple_image_pub_->publish(*output_image_msg_ptr);
+  }
+}
+
+void TrtYoloXNode::onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg)
+{
+  stop_watch_ptr_->toc("processing_time", true);
+
+  cv_bridge::CvImagePtr in_image_ptr;
+  try {
+    in_image_ptr = cv_bridge::toCvCopy(msg, sensor_msgs::image_encodings::BGR8);
+  } catch (cv_bridge::Exception & e) {
+    RCLCPP_ERROR(this->get_logger(), "cv_bridge exception: %s", e.what());
+    return;
+  }
+  const auto width = in_image_ptr->image.cols;
+  const auto height = in_image_ptr->image.rows;
+
+  tensorrt_yolox::ObjectArrays objects;
+  std::vector<cv::Mat> masks = {cv::Mat(cv::Size(height, width), CV_8UC1, cv::Scalar(0))};
+  std::vector<cv::Mat> color_masks = {
+    cv::Mat(cv::Size(height, width), CV_8UC3, cv::Scalar(0, 0, 0))};
+
+  std::vector<cv::Mat> images = {in_image_ptr->image};
+  if (!trt_yolox_->doInference(images, objects, masks, color_masks)) {
+    RCLCPP_WARN(this->get_logger(), "Fail to inference");
+    return;
+  }
+  postInference(msg->header, objects, width, height, masks);
+
+  if (is_publish_debug_rois_image_) {
+    drawImageDetection(in_image_ptr, objects, width, height);
+  }
+}
+
+void TrtYoloXNode::onGpuImage(const std::shared_ptr<ImageContainer> msg)
+{
+  stop_watch_ptr_->toc("processing_time", true);
+
+  const auto width = msg->width();
+  const auto height = msg->height();
+
+  tensorrt_yolox::ObjectArrays objects;
+  std::vector<cv::Mat> masks = {cv::Mat(cv::Size(height, width), CV_8UC1, cv::Scalar(0))};
+  std::vector<cv::Mat> color_masks = {
+    cv::Mat(cv::Size(height, width), CV_8UC3, cv::Scalar(0, 0, 0))};
+
+  std::vector<std::shared_ptr<ImageContainer>> msgs = {msg};
+  if (!trt_yolox_->doInference(msgs, objects, masks, color_masks)) {
+    RCLCPP_WARN(this->get_logger(), "Fail to inference");
+    return;
+  }
+
+  postInference(msg->header(), objects, width, height, masks);
+
+  if (is_publish_debug_rois_image_) {
+    sensor_msgs::msg::Image image_msg;
+    msg->get_sensor_msgs_image(image_msg);
+    cv_bridge::CvImagePtr image_ptr =
+      cv_bridge::toCvCopy(image_msg, sensor_msgs::image_encodings::BGR8);
+    drawImageDetection(image_ptr, objects, width, height);
   }
 }
 
diff --git a/perception/tensorrt_yolox/src/yolox_single_image_inference_node.cpp b/perception/tensorrt_yolox/src/yolox_single_image_inference_node.cpp
index 360f41e470e38..e98121012ae75 100644
--- a/perception/tensorrt_yolox/src/yolox_single_image_inference_node.cpp
+++ b/perception/tensorrt_yolox/src/yolox_single_image_inference_node.cpp
@@ -45,11 +45,12 @@ class YoloXSingleImageInferenceNode : public rclcpp::Node
       declare_parameter("output_image_path", p.string() + "_detect" + ext);
 
     auto trt_yolox = std::make_unique<tensorrt_yolox::TrtYoloX>(model_path, precision);
-    auto image = cv::imread(image_path);
+    cv::Mat image = cv::imread(image_path);
     tensorrt_yolox::ObjectArrays objects;
     std::vector<cv::Mat> masks;
     std::vector<cv::Mat> color_masks;
-    trt_yolox->doInference({image}, objects, masks, color_masks);
+    std::vector<cv::Mat> input_images({image});
+    trt_yolox->doInference(input_images, objects, masks, color_masks);
     for (const auto & object : objects[0]) {
       const auto left = object.x_offset;
       const auto top = object.y_offset;