From fd8043a395cada8b5dc225f690ba9f8f471f1e76 Mon Sep 17 00:00:00 2001
From: lizhensheng <lzs_1993@qq.com>
Date: Mon, 17 Oct 2022 16:19:40 +0800
Subject: [PATCH] add gpu intra-process-typeadaption demo.

---
 CMakeLists.txt                     |  23 +--
 cmake/CudaComputeTargetFlags.cmake |  43 ++++++
 cuda/CMakeLists.txt                | 105 ++++++++++++++
 cuda/cuda_hello_world.cu           |  21 +++
 cuda/cuda_image_container.cpp      | 226 +++++++++++++++++++++++++++++
 cuda/cuda_image_container.hpp      | 161 ++++++++++++++++++++
 cuda/cuda_image_listener.hpp       |  84 +++++++++++
 cuda/cuda_image_node.cpp           |  22 +++
 cuda/cuda_image_talker.hpp         |  98 +++++++++++++
 cuda/gmat_image_container.cpp      | 171 ++++++++++++++++++++++
 cuda/gmat_image_container.hpp      | 164 +++++++++++++++++++++
 cuda/gmat_image_listener.hpp       |  87 +++++++++++
 cuda/gmat_image_node.cpp           |  22 +++
 cuda/gmat_image_talker.hpp         |  98 +++++++++++++
 cuda/intra_gmat_listener.hpp       |  80 ++++++++++
 cuda/intra_gmat_node.cpp           |  22 +++
 cuda/intra_gmat_talker.hpp         |  93 ++++++++++++
 17 files changed, 1505 insertions(+), 15 deletions(-)
 create mode 100644 cmake/CudaComputeTargetFlags.cmake
 create mode 100644 cuda/CMakeLists.txt
 create mode 100644 cuda/cuda_hello_world.cu
 create mode 100644 cuda/cuda_image_container.cpp
 create mode 100644 cuda/cuda_image_container.hpp
 create mode 100644 cuda/cuda_image_listener.hpp
 create mode 100644 cuda/cuda_image_node.cpp
 create mode 100644 cuda/cuda_image_talker.hpp
 create mode 100644 cuda/gmat_image_container.cpp
 create mode 100644 cuda/gmat_image_container.hpp
 create mode 100644 cuda/gmat_image_listener.hpp
 create mode 100644 cuda/gmat_image_node.cpp
 create mode 100644 cuda/gmat_image_talker.hpp
 create mode 100644 cuda/intra_gmat_listener.hpp
 create mode 100644 cuda/intra_gmat_node.cpp
 create mode 100644 cuda/intra_gmat_talker.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e21f30f..0fb7f64 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,8 +2,11 @@ cmake_minimum_required(VERSION 3.5)
 
 project(shm_msgs)
 
-# add open3d installation directory here
+list(INSERT CMAKE_MODULE_PATH 0 ${CMAKE_CURRENT_LIST_DIR}/cmake)
+
+# add installation directory here
 # list(INSERT CMAKE_PREFIX_PATH 0 /opt/open3d/open3d)
+list(INSERT CMAKE_PREFIX_PATH 0 /opt/opencv/opencv-4.6.0/)
 
 if(NOT CMAKE_CXX_STANDARD)
   set(CMAKE_CXX_STANDARD 17)
@@ -81,21 +84,9 @@ list(REMOVE_ITEM PCL_LIBRARIES
   vtkRenderingOpenGL2
 )
 
-find_package(OpenCV 4 QUIET
-  COMPONENTS
-    opencv_core
-    opencv_imgproc
-    opencv_imgcodecs
-  CONFIG
-)
+find_package(OpenCV 4 QUIET)
 if(NOT OpenCV_FOUND)
-  find_package(OpenCV 3 REQUIRED
-    COMPONENTS
-      opencv_core
-      opencv_imgproc
-      opencv_imgcodecs
-    CONFIG
-  )
+  find_package(OpenCV 3 REQUIRED)
 endif()
 
 find_package(iceoryx_posh CONFIG REQUIRED)
@@ -157,6 +148,8 @@ add_subdirectory(lib)
 add_subdirectory(src)
 # extra image intra-process demos
 add_subdirectory(intra)
+# extra type-adaption cuda demos
+add_subdirectory(cuda)
 
 if(BUILD_TESTING)
   add_subdirectory(test)
diff --git a/cmake/CudaComputeTargetFlags.cmake b/cmake/CudaComputeTargetFlags.cmake
new file mode 100644
index 0000000..f24731d
--- /dev/null
+++ b/cmake/CudaComputeTargetFlags.cmake
@@ -0,0 +1,43 @@
+#
+#  Compute target flags macros by Anatoly Baksheev
+#
+#  Usage in CmakeLists.txt:
+#   	include(CudaComputeTargetFlags.cmake)
+#		APPEND_TARGET_ARCH_FLAGS()
+
+#compute flags macros
+MACRO(CUDA_COMPUTE_TARGET_FLAGS arch_bin arch_ptx cuda_nvcc_target_flags)
+	string(REGEX REPLACE "\\." "" ARCH_BIN_WITHOUT_DOTS "${${arch_bin}}")
+	string(REGEX REPLACE "\\." "" ARCH_PTX_WITHOUT_DOTS "${${arch_ptx}}")
+
+	set(cuda_computer_target_flags_temp "")
+
+	# Tell NVCC to add binaries for the specified GPUs
+	string(REGEX MATCHALL "[0-9()]+" ARCH_LIST "${ARCH_BIN_WITHOUT_DOTS}")
+	foreach(ARCH IN LISTS ARCH_LIST)
+		if (ARCH MATCHES "([0-9]+)\\(([0-9]+)\\)")
+			# User explicitly specified PTX for the concrete BIN
+			set(cuda_computer_target_flags_temp ${cuda_computer_target_flags_temp} -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
+		else()
+			# User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
+			set(cuda_computer_target_flags_temp ${cuda_computer_target_flags_temp} -gencode arch=compute_${ARCH},code=sm_${ARCH})					
+		endif()
+	endforeach()
+
+	# Tell NVCC to add PTX intermediate code for the specified architectures
+	string(REGEX MATCHALL "[0-9]+" ARCH_LIST "${ARCH_PTX_WITHOUT_DOTS}")
+	foreach(ARCH IN LISTS ARCH_LIST)
+		set(cuda_computer_target_flags_temp ${cuda_computer_target_flags_temp} -gencode arch=compute_${ARCH},code=compute_${ARCH})				
+	endforeach()
+
+	set(${cuda_nvcc_target_flags} ${cuda_computer_target_flags_temp})
+ENDMACRO()
+
+MACRO(APPEND_TARGET_ARCH_FLAGS)
+	set(cuda_nvcc_target_flags "")
+	CUDA_COMPUTE_TARGET_FLAGS(CUDA_ARCH_BIN CUDA_ARCH_PTX cuda_nvcc_target_flags)
+	if (cuda_nvcc_target_flags)
+		message(STATUS "CUDA NVCC target flags: ${cuda_nvcc_target_flags}")
+		list(APPEND CUDA_NVCC_FLAGS ${cuda_nvcc_target_flags})
+	endif()
+ENDMACRO()
\ No newline at end of file
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
new file mode 100644
index 0000000..1fc5dd7
--- /dev/null
+++ b/cuda/CMakeLists.txt
@@ -0,0 +1,105 @@
+macro(GA_CHECK_CUDA)
+  if ($ENV{GPUAC_COMPILE_WITH_CUDA})
+    find_package(CUDA REQUIRED  QUIET)
+    find_package(Eigen3 REQUIRED  QUIET)
+
+    # if(NOT ${CUDA_VERSION} VERSION_LESS "10.0"
+    #         AND NOT ${CUDA_VERSION} VERSION_EQUAL "10.0" )
+    #   message(FATAL_ERROR "GPU support on Melodic requires CUDA<=10.0")
+    # endif()
+    if(${CUDA_VERSION} VERSION_GREATER "9.1"
+          AND ${CMAKE_VERSION} VERSION_LESS "3.12.3")
+      unset(CUDA_cublas_device_LIBRARY CACHE)
+      set(CUDA_cublas_device_LIBRARY ${CUDA_cublas_LIBRARY})
+      set(CUDA_CUBLAS_LIBRARIES ${CUDA_cublas_LIBRARY})
+    endif()
+    # if ("$ENV{ROS_DISTRO}" STREQUAL "melodic" AND ${EIGEN3_VERSION_STRING} VERSION_LESS "3.3.7")
+    #   message(FATAL_ERROR "GPU support on Melodic requires Eigen version>= 3.3.7")
+    # endif()
+    if(NOT DEFINED CMAKE_CUDA_STANDARD)
+        set(CMAKE_CUDA_STANDARD 14)
+        set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+    endif()
+    set(USE_CUDA ON)
+  else()
+    message(WARNING "CUDA support is disabled. Set the GPUAC_COMPILE_WITH_CUDA environment variable and recompile to enable it")
+    set(USE_CUDA OFF)
+  endif()
+endmacro()
+
+include(CudaComputeTargetFlags)
+GA_CHECK_CUDA()
+set_directory_properties(PROPERTIES COMPILE_DEFINITIONS "")
+APPEND_TARGET_ARCH_FLAGS()
+
+if(NOT USE_CUDA)
+ return()
+endif()
+
+# Enable NVTX markers for improved profiling
+add_definitions(-DUSE_NVTX)
+link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64")
+link_libraries("nvToolsExt")
+
+# nvcc -arch sm_50 ./cuda_hello_world.cu -o docker_hello
+cuda_add_executable(cuda_hello_world cuda_hello_world.cu)
+target_include_directories(cuda_hello_world PUBLIC
+    "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
+    "$<INSTALL_INTERFACE:include>"
+    ${CUDA_INCLUDE_DIRS}
+)
+target_link_libraries(cuda_hello_world  ${PROJECT_NAME}_image
+    ${CUDA_nvToolsExt_LIBRARY}
+    ${CUDA_LIBRARIES}
+)
+install(TARGETS cuda_hello_world DESTINATION lib/${PROJECT_NAME})
+
+cuda_add_executable(cuda_image_node cuda_image_node.cpp cuda_image_container.cpp)
+ament_target_dependencies(cuda_image_node "rclcpp" "cv_bridge" "sensor_msgs")
+rosidl_get_typesupport_target(cpp_typesupport_target "${PROJECT_NAME}" "rosidl_typesupport_cpp")
+target_include_directories(cuda_image_node PUBLIC
+    "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
+    "$<INSTALL_INTERFACE:include>"
+    ${CUDA_INCLUDE_DIRS}
+)
+target_link_libraries(cuda_image_node  ${PROJECT_NAME}_image
+    opencv_highgui
+    ${cpp_typesupport_target}
+    ${CUDA_nvToolsExt_LIBRARY}
+    ${CUDA_LIBRARIES}
+)
+install(TARGETS cuda_image_node DESTINATION lib/${PROJECT_NAME})
+
+add_executable(intra_gmat_node intra_gmat_node.cpp)
+ament_target_dependencies(intra_gmat_node "rclcpp" "cv_bridge" "sensor_msgs")
+rosidl_get_typesupport_target(cpp_typesupport_target "${PROJECT_NAME}" "rosidl_typesupport_cpp")
+target_include_directories(intra_gmat_node PUBLIC
+    "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
+    "$<INSTALL_INTERFACE:include>"
+    ${CUDA_INCLUDE_DIRS}
+    ${OpenCV_INCLUDE_DIRS}
+)
+target_link_libraries(intra_gmat_node  ${PROJECT_NAME}_image
+    ${cpp_typesupport_target}
+    ${CUDA_nvToolsExt_LIBRARY}
+    ${CUDA_LIBRARIES}
+    ${OpenCV_LIBS}
+)
+install(TARGETS intra_gmat_node DESTINATION lib/${PROJECT_NAME})
+
+add_executable(gmat_image_node gmat_image_node.cpp gmat_image_container.cpp)
+ament_target_dependencies(gmat_image_node "rclcpp" "cv_bridge" "sensor_msgs")
+rosidl_get_typesupport_target(cpp_typesupport_target "${PROJECT_NAME}" "rosidl_typesupport_cpp")
+target_include_directories(gmat_image_node PUBLIC
+    "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
+    "$<INSTALL_INTERFACE:include>"
+    ${CUDA_INCLUDE_DIRS}
+    ${OpenCV_INCLUDE_DIRS}
+)
+target_link_libraries(gmat_image_node  ${PROJECT_NAME}_image
+    ${cpp_typesupport_target}
+    ${CUDA_nvToolsExt_LIBRARY}
+    ${CUDA_LIBRARIES}
+    ${OpenCV_LIBS}
+)
+install(TARGETS gmat_image_node DESTINATION lib/${PROJECT_NAME})
diff --git a/cuda/cuda_hello_world.cu b/cuda/cuda_hello_world.cu
new file mode 100644
index 0000000..6b7d1bd
--- /dev/null
+++ b/cuda/cuda_hello_world.cu
@@ -0,0 +1,21 @@
+#include <iostream>
+#include <stdlib.h>
+#include <stdio.h>
+
+__global__ void cuda_hello(){
+    printf("Hello World from GPU! %d\n", threadIdx.x*gridDim.x);
+}
+
+int main() {
+    printf("Hello World from CPU!\n");
+    cudaSetDevice(0);
+    cuda_hello<<<1,10>>>();
+    uint8_t *cuda_mem_;
+    if (cudaMalloc(&cuda_mem_, 1024) != cudaSuccess)
+    {
+        throw std::runtime_error("Failed to allocate device memory");
+    }
+
+    cudaDeviceSynchronize();
+    return 0;
+}
\ No newline at end of file
diff --git a/cuda/cuda_image_container.cpp b/cuda/cuda_image_container.cpp
new file mode 100644
index 0000000..f5f4220
--- /dev/null
+++ b/cuda/cuda_image_container.cpp
@@ -0,0 +1,226 @@
+#include <memory>
+#include <stdexcept>
+#include <string>
+
+#include "sensor_msgs/msg/image.hpp"
+#include "std_msgs/msg/header.hpp"
+
+#include "cuda_image_container.hpp"
+
+#include "cuda.h"         // NOLINT
+#include "cuda_runtime.h" // NOLINT
+#include <nvToolsExt.h>   // NOLINT
+
+namespace shm_msgs
+{
+namespace
+{
+template <typename T> struct NotNull
+{
+    NotNull(const T *pointer_in, const char *msg) : pointer(pointer_in)
+    {
+        if (pointer == nullptr)
+        {
+            throw std::invalid_argument(msg);
+        }
+    }
+
+    const T *pointer;
+};
+
+} // namespace
+
+CUDAStreamWrapper::CUDAStreamWrapper()
+{
+    cudaStreamCreate(&main_stream_);
+}
+
+CUDAStreamWrapper::~CUDAStreamWrapper()
+{
+    cudaStreamDestroy(main_stream_);
+}
+
+CUDAMemoryWrapper::CUDAMemoryWrapper(size_t bytes_to_allocate) : bytes_allocated_(bytes_to_allocate)
+{
+    // non-jetson using cudaMalloc rather than cudaMallocManaged
+#ifndef __aarch64__
+    if (cudaMalloc(&cuda_mem_, bytes_to_allocate) != cudaSuccess)
+#else
+    if (cudaMallocManaged(&cuda_mem_, bytes_to_allocate) != cudaSuccess)
+#endif
+    {
+        throw std::runtime_error("Failed to allocate device memory");
+    }
+}
+
+void CUDAMemoryWrapper::copy_to_device(uint8_t *host_mem, size_t bytes_to_copy, const cudaStream_t &stream)
+{
+    nvtxRangePushA("CudaImageContainer:CopyToDevice");
+    if (bytes_to_copy > bytes_allocated_)
+    {
+        throw std::invalid_argument("Tried to copy too many bytes to device");
+    }
+    if (cudaMemcpyAsync(cuda_mem_, host_mem, bytes_to_copy, cudaMemcpyHostToDevice, stream) != cudaSuccess)
+    {
+        throw std::runtime_error("Failed to copy memory to the GPU");
+    }
+    cudaStreamSynchronize(stream);
+    nvtxRangePop();
+}
+
+void CUDAMemoryWrapper::copy_from_device(uint8_t *host_mem, size_t bytes_to_copy, const cudaStream_t &stream)
+{
+    nvtxRangePushA("CudaImageContainer:CopyFromDevice");
+    if (bytes_to_copy > bytes_allocated_)
+    {
+        throw std::invalid_argument("Tried to copy too many bytes from device");
+    }
+    if (cudaMemcpyAsync(host_mem, cuda_mem_, bytes_to_copy, cudaMemcpyDeviceToHost, stream) != cudaSuccess)
+    {
+        throw std::runtime_error("Failed to copy memory from the GPU");
+    }
+    cudaStreamSynchronize(stream);
+    nvtxRangePop();
+}
+
+uint8_t *CUDAMemoryWrapper::device_memory()
+{
+    return cuda_mem_;
+}
+
+CUDAMemoryWrapper::~CUDAMemoryWrapper()
+{
+    cudaFree(cuda_mem_);
+}
+
+CUDAEventWrapper::CUDAEventWrapper()
+{
+    cudaEventCreate(&event_);
+}
+
+void CUDAEventWrapper::record(std::shared_ptr<CUDAStreamWrapper> cuda_stream)
+{
+    cudaEventRecord(event_, cuda_stream->stream());
+}
+
+CUDAEventWrapper::~CUDAEventWrapper()
+{
+    cudaEventDestroy(event_);
+}
+
+CudaImageContainer::CudaImageContainer()
+{
+    cuda_stream_ = std::make_shared<CUDAStreamWrapper>();
+    cuda_event_ = std::make_shared<CUDAEventWrapper>();
+}
+
+CudaImageContainer::CudaImageContainer(std_msgs::msg::Header header, uint32_t height, uint32_t width,
+                                       std::string encoding, uint32_t step,
+                                       std::shared_ptr<CUDAStreamWrapper> cuda_stream)
+    : header_(header), cuda_stream_(cuda_stream), height_(height), width_(width), encoding_(encoding), step_(step)
+{
+    nvtxRangePushA("CudaImageContainer:Create");
+    cuda_mem_ = std::make_shared<CUDAMemoryWrapper>(size_in_bytes());
+    cuda_event_ = std::make_shared<CUDAEventWrapper>();
+    nvtxRangePop();
+}
+
+CudaImageContainer::CudaImageContainer(std::unique_ptr<sensor_msgs::msg::Image> unique_sensor_msgs_image)
+    : CudaImageContainer(
+          NotNull(unique_sensor_msgs_image.get(), "unique_sensor_msgs_image cannot be nullptr").pointer->header,
+          unique_sensor_msgs_image->height, unique_sensor_msgs_image->width, unique_sensor_msgs_image->encoding,
+          unique_sensor_msgs_image->step)
+{
+    nvtxRangePushA("CudaImageContainer:CreateFromMessage");
+    cuda_mem_->copy_to_device(&unique_sensor_msgs_image->data[0], size_in_bytes(), cuda_stream_->stream());
+    nvtxRangePop();
+}
+
+CudaImageContainer::CudaImageContainer(const sensor_msgs::msg::Image &sensor_msgs_image)
+    : CudaImageContainer(std::make_unique<sensor_msgs::msg::Image>(sensor_msgs_image))
+{
+}
+
+CudaImageContainer::CudaImageContainer(const CudaImageContainer &other)
+{
+    nvtxRangePushA("CudaImageContainer:Copy");
+    header_ = other.header_;
+    height_ = other.height_;
+    width_ = other.width_;
+    encoding_ = other.encoding_;
+    step_ = other.step_;
+
+    // Make a new stream and have it wait on the previous one.
+    cuda_stream_ = std::make_shared<CUDAStreamWrapper>();
+    cuda_event_ = std::make_shared<CUDAEventWrapper>();
+    cuda_event_->record(other.cuda_stream_);
+    cudaStreamWaitEvent(cuda_stream_->stream(), cuda_event_->event());
+
+    // Copy the image data over so that this is an independent copy (deep copy).
+    cuda_mem_ = std::make_shared<CUDAMemoryWrapper>(size_in_bytes());
+    if (cudaMemcpyAsync(other.cuda_mem_->device_memory(), cuda_mem_->device_memory(), size_in_bytes(),
+                        cudaMemcpyDeviceToDevice, cuda_stream_->stream()) != cudaSuccess)
+    {
+        throw std::runtime_error("Failed to copy memory from the GPU");
+    }
+    nvtxRangePop();
+}
+
+// Equals operator makes this container become another, shallow copy only.
+CudaImageContainer &CudaImageContainer::operator=(const CudaImageContainer &other)
+{
+    if (this == &other)
+    {
+        return *this;
+    }
+
+    header_ = other.header_;
+    height_ = other.height_;
+    width_ = other.width_;
+    encoding_ = other.encoding_;
+    step_ = other.step_;
+    cuda_stream_ = other.cuda_stream_;
+    cuda_event_ = other.cuda_event_;
+    cuda_mem_ = other.cuda_mem_;
+
+    return *this;
+}
+
+CudaImageContainer::~CudaImageContainer()
+{
+}
+
+const std_msgs::msg::Header &CudaImageContainer::header() const
+{
+    return header_;
+}
+
+std_msgs::msg::Header &CudaImageContainer::header()
+{
+    return header_;
+}
+
+uint8_t *CudaImageContainer::cuda_mem()
+{
+    return cuda_mem_->device_memory();
+}
+
+void CudaImageContainer::get_sensor_msgs_image(sensor_msgs::msg::Image &destination) const
+{
+    nvtxRangePushA("CudaImageContainer:GetMsg");
+    destination.header = header_;
+    destination.height = height_;
+    destination.width = width_;
+    destination.encoding = encoding_;
+    destination.step = step_;
+    destination.data.resize(size_in_bytes());
+    cuda_mem_->copy_from_device(&destination.data[0], size_in_bytes(), cuda_stream_->stream());
+    nvtxRangePop();
+}
+
+size_t CudaImageContainer::size_in_bytes() const
+{
+    return height_ * step_;
+}
+
+} //  namespace shm_msgs
\ No newline at end of file
diff --git a/cuda/cuda_image_container.hpp b/cuda/cuda_image_container.hpp
new file mode 100644
index 0000000..3114bc6
--- /dev/null
+++ b/cuda/cuda_image_container.hpp
@@ -0,0 +1,161 @@
+#ifndef SHM_MSGS__CUDA_IMAGE_CONTAINER_HPP_
+#define SHM_MSGS__CUDA_IMAGE_CONTAINER_HPP_
+
+#include <memory>
+#include <string>
+
+#include "rclcpp/type_adapter.hpp"
+#include "sensor_msgs/msg/image.hpp"
+#include "std_msgs/msg/header.hpp"
+
+#include "cuda.h"         // NOLINT
+#include "cuda_runtime.h" // NOLINT
+
+namespace shm_msgs
+{
+class CUDAStreamWrapper final
+{
+  public:
+    CUDAStreamWrapper();
+
+    ~CUDAStreamWrapper();
+
+    cudaStream_t &stream()
+    {
+        return main_stream_;
+    }
+
+  private:
+    cudaStream_t main_stream_{};
+};
+
+class CUDAMemoryWrapper final
+{
+  public:
+    explicit CUDAMemoryWrapper(size_t size_in_bytes);
+
+    void copy_to_device(uint8_t *host_mem, size_t bytes_to_copy, const cudaStream_t &stream);
+
+    void copy_from_device(uint8_t *host_mem, size_t bytes_to_copy, const cudaStream_t &stream);
+
+    uint8_t *device_memory();
+
+    ~CUDAMemoryWrapper();
+
+  private:
+    size_t bytes_allocated_{0};
+
+    uint8_t *cuda_mem_{nullptr};
+};
+
+class CUDAEventWrapper final
+{
+  public:
+    CUDAEventWrapper();
+
+    void record(std::shared_ptr<CUDAStreamWrapper> cuda_stream);
+
+    cudaEvent_t &event()
+    {
+        return event_;
+    }
+
+    ~CUDAEventWrapper();
+
+  private:
+    cudaEvent_t event_;
+};
+
+class CudaImageContainer final
+{
+  public:
+    CudaImageContainer();
+
+    /// Store an owning pointer to a sensor_msg::msg::Image, and create CUDA memory that references it
+    explicit CudaImageContainer(std::unique_ptr<sensor_msgs::msg::Image> unique_sensor_msgs_image);
+
+    /// Copy the sensor_msgs::msg::Image into this contain and create CUDA memory that references it.
+    explicit CudaImageContainer(const sensor_msgs::msg::Image &sensor_msgs_image);
+
+    explicit CudaImageContainer(const CudaImageContainer &other);
+
+    CudaImageContainer(std_msgs::msg::Header header, uint32_t height, uint32_t width, std::string encoding,
+                       uint32_t step,
+                       std::shared_ptr<CUDAStreamWrapper> cuda_stream = std::make_shared<CUDAStreamWrapper>());
+
+    CudaImageContainer &operator=(const CudaImageContainer &other);
+
+    ~CudaImageContainer();
+
+    /// Const access the ROS Header.
+    const std_msgs::msg::Header &header() const;
+
+    /// Access the ROS Header.
+    std_msgs::msg::Header &header();
+
+    void get_sensor_msgs_image(sensor_msgs::msg::Image &destination) const;
+
+    uint8_t *cuda_mem();
+
+    size_t size_in_bytes() const;
+
+    std::shared_ptr<CUDAStreamWrapper> cuda_stream() const
+    {
+        return cuda_stream_;
+    }
+
+    uint32_t height() const
+    {
+        return height_;
+    }
+
+    uint32_t width() const
+    {
+        return width_;
+    }
+
+    const std::string &encoding() const
+    {
+        return encoding_;
+    }
+
+    uint32_t step() const
+    {
+        return step_;
+    }
+
+  private:
+    std_msgs::msg::Header header_;
+
+    std::shared_ptr<CUDAStreamWrapper> cuda_stream_;
+
+    std::shared_ptr<CUDAMemoryWrapper> cuda_mem_;
+
+    std::shared_ptr<CUDAEventWrapper> cuda_event_;
+
+    uint32_t height_{0};
+    uint32_t width_{0};
+    std::string encoding_;
+    uint32_t step_{0};
+};
+
+} // namespace shm_msgs
+
+template <> struct rclcpp::TypeAdapter<shm_msgs::CudaImageContainer, sensor_msgs::msg::Image>
+{
+    using is_specialized = std::true_type;
+    using custom_type = shm_msgs::CudaImageContainer;
+    using ros_message_type = sensor_msgs::msg::Image;
+
+    static void convert_to_ros_message(const custom_type &source, ros_message_type &destination)
+    {
+        source.get_sensor_msgs_image(destination);
+    }
+
+    static void convert_to_custom(const ros_message_type &source, custom_type &destination)
+    {
+        destination = shm_msgs::CudaImageContainer(source);
+    }
+};
+
+#endif // SHM_MSGS__CUDA_IMAGE_CONTAINER_HPP_
diff --git a/cuda/cuda_image_listener.hpp b/cuda/cuda_image_listener.hpp
new file mode 100644
index 0000000..4bc23aa
--- /dev/null
+++ b/cuda/cuda_image_listener.hpp
@@ -0,0 +1,84 @@
+// Copyright 2021 Apex.AI, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstring>
+#include <memory>
+#include <cinttypes>
+
+#include "rclcpp/rclcpp.hpp"
+#include "rclcpp/type_adapter.hpp"
+
+#include "sensor_msgs/msg/image.hpp"
+#include <cv_bridge/cv_bridge.h>
+
+#include "cuda_image_container.hpp"
+
+#include <opencv2/opencv.hpp>
+
+using namespace std::chrono_literals;
+using namespace cv_bridge;
+
+class Listener : public rclcpp::Node {
+private:
+  using Topic = rclcpp::TypeAdapter<shm_msgs::CudaImageContainer, sensor_msgs::msg::Image>;
+  using DataType = shm_msgs::CudaImageContainer;
+
+public:
+  explicit Listener()
+      : Node("cuda_image_listener", rclcpp::NodeOptions().use_intra_process_comms(true)) {
+
+    // subscription callback to process arriving data
+    auto callback = [this](std::unique_ptr<DataType> msg) -> void {
+
+      RCLCPP_INFO(this->get_logger(), "received with address: 0x%" PRIXPTR "", reinterpret_cast<std::uintptr_t>(msg.get()));
+
+      // no conversion
+      // last_cvimage = cv_bridge::toCvShare(msg);
+      // last_cvimage = cv_bridge::toCvCopy(msg);
+
+      // move to the private ptr
+      m_last_container = std::move(msg);
+      RCLCPP_INFO(this->get_logger(), "container address: 0x%" PRIXPTR "", reinterpret_cast<std::uintptr_t>(m_last_container.get()));
+
+      auto time_offset_ns = (now() - m_last_container->header().stamp).nanoseconds();
+      auto timestamp_offset_ns = (rclcpp::Time(m_last_container->header().stamp) - m_last_image_ts).nanoseconds();
+      auto time_offset_ms = time_offset_ns / 1000000.0F;
+      auto timestamp_offset_ms = timestamp_offset_ns / 1000000.0F;
+      RCLCPP_INFO(get_logger(), "get-image-transport-time: %.3f", time_offset_ms);
+      if(m_last_image_ts.nanoseconds() > 0.0)
+      {
+        RCLCPP_INFO(get_logger(), "get-image-timestamp_offset-time: %.3f", timestamp_offset_ms);
+      }
+      m_last_image_ts = m_last_container->header().stamp;
+      // cv::imshow("im show", last_cvimage->image);
+      // cv::waitKey(0);
+    };
+
+    // rclcpp::QoS qos(rclcpp::KeepLast(10));
+    rclcpp::QoS custom_qos_profile = rclcpp::QoS(rclcpp::QoSInitialization::from_rmw(rmw_qos_profile_default))
+      .history(rmw_qos_history_policy_t::RMW_QOS_POLICY_HISTORY_KEEP_LAST)
+      .keep_last(5)
+      .reliability(rmw_qos_reliability_policy_t::RMW_QOS_POLICY_RELIABILITY_BEST_EFFORT)
+      .durability(rmw_qos_durability_policy_t::RMW_QOS_POLICY_DURABILITY_VOLATILE)
+      .avoid_ros_namespace_conventions(false);
+
+    m_subscription = create_subscription<Topic>("cuda_image", custom_qos_profile, callback);
+  }
+
+private:
+  rclcpp::Subscription<Topic>::SharedPtr m_subscription;
+
+  std::unique_ptr<DataType> m_last_container;
+  rclcpp::Time m_last_image_ts{0, 0, RCL_ROS_TIME};
+};
diff --git a/cuda/cuda_image_node.cpp b/cuda/cuda_image_node.cpp
new file mode 100644
index 0000000..c313b6d
--- /dev/null
+++ b/cuda/cuda_image_node.cpp
@@ -0,0 +1,22 @@
+#include <rclcpp/rclcpp.hpp>
+
+#include <memory>
+
+#include "cuda_image_listener.hpp"
+#include "cuda_image_talker.hpp"
+
+int main(int argc, char *argv[])
+{
+    setvbuf(stdout, NULL, _IONBF, BUFSIZ);
+    rclcpp::init(argc, argv);
+    rclcpp::executors::SingleThreadedExecutor executor;
+
+    auto talker = std::make_shared<Talker>();
+    auto listener = std::make_shared<Listener>();
+    executor.add_node(talker);
+    executor.add_node(listener);
+
+    executor.spin();
+    rclcpp::shutdown();
+    return 0;
+}
\ No newline at end of file
diff --git a/cuda/cuda_image_talker.hpp b/cuda/cuda_image_talker.hpp
new file mode 100644
index 0000000..f6bff44
--- /dev/null
+++ b/cuda/cuda_image_talker.hpp
@@ -0,0 +1,98 @@
+// Copyright 2021 Apex.AI, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <chrono>
+#include <cinttypes>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "rclcpp/rclcpp.hpp"
+#include "rclcpp/type_adapter.hpp"
+
+#include "sensor_msgs/msg/image.hpp"
+#include <cv_bridge/cv_bridge.h>
+#include <sensor_msgs/image_encodings.hpp>
+
+#include <opencv2/opencv.hpp>
+
+#include "cuda_image_container.hpp"
+
+using namespace std::chrono_literals;
+using namespace cv_bridge;
+
+class Talker : public rclcpp::Node
+{
+  private:
+    using Topic = rclcpp::TypeAdapter<shm_msgs::CudaImageContainer, sensor_msgs::msg::Image>;
+    using DataType = shm_msgs::CudaImageContainer;
+
+  public:
+    explicit Talker() : Node("cuda_image_talker", rclcpp::NodeOptions().use_intra_process_comms(true))
+    {
+
+        // m_input_cvimage->image = cv::imread("./res/img/205_182.png");
+        // m_input_cvimage->image = cv::imread("./res/img/1024_768.jpeg");
+        m_input_cvimage->image = cv::imread("./res/img/1920_1080.jpg");
+        m_input_cvimage->header.frame_id = "camera_link";
+        m_input_cvimage->encoding = "bgr8";
+        // cv::imshow("input image", m_input_cvimage->image);
+        // cv::waitKey(0);
+
+        auto publishMessage = [this]() -> void {
+            // conversion from mat to sensor_msg
+            m_input_rosimage = std::make_unique<sensor_msgs::msg::Image>();
+            // todo: from cvmat to container
+            m_input_cvimage->toImageMsg(*m_input_rosimage);
+            // start here
+            m_input_rosimage->header.stamp = now();
+
+            // no deep copy from capture to published
+            auto msg = std::make_unique<DataType>(std::move(m_input_rosimage));
+
+            // RCLCPP_INFO(this->get_logger(), "Publishing with ts: %u.%u", msg->header().stamp.sec,
+            // msg->header().stamp.nanosec);
+            RCLCPP_INFO(this->get_logger(), "Publishing with address: 0x%" PRIXPTR "",
+                        reinterpret_cast<std::uintptr_t>(msg.get()));
+
+            m_publisher->publish(std::move(msg));
+            // We gave up ownership
+
+            m_count++;
+        };
+
+        // rclcpp::QoS qos(rclcpp::KeepLast(10));
+        // rclcpp::QoS qos(rclcpp::SensorDataQoS());
+        rclcpp::QoS custom_qos_profile =
+            rclcpp::QoS(rclcpp::QoSInitialization::from_rmw(rmw_qos_profile_default))
+                .history(rmw_qos_history_policy_t::RMW_QOS_POLICY_HISTORY_KEEP_LAST)
+                .keep_last(5)
+                .reliability(rmw_qos_reliability_policy_t::RMW_QOS_POLICY_RELIABILITY_BEST_EFFORT)
+                .durability(rmw_qos_durability_policy_t::RMW_QOS_POLICY_DURABILITY_VOLATILE)
+                .avoid_ros_namespace_conventions(false);
+
+        m_publisher = this->create_publisher<Topic>("cuda_image", custom_qos_profile);
+
+        // Use a timer to schedule periodic message publishing.
+        m_timer = this->create_wall_timer(1s, publishMessage);
+    }
+
+  private:
+    uint64_t m_count = 1;
+    rclcpp::Publisher<Topic>::SharedPtr m_publisher;
+    rclcpp::TimerBase::SharedPtr m_timer;
+    std::shared_ptr<cv_bridge::CvImage> m_input_cvimage{std::make_shared<cv_bridge::CvImage>()};
+    std::unique_ptr<sensor_msgs::msg::Image> m_input_rosimage{std::make_unique<sensor_msgs::msg::Image>()};
+};
diff --git a/cuda/gmat_image_container.cpp b/cuda/gmat_image_container.cpp
new file mode 100644
index 0000000..e46ca5a
--- /dev/null
+++ b/cuda/gmat_image_container.cpp
@@ -0,0 +1,171 @@
+#include <cstddef>
+#include <memory>
+#include <string>
+#include <utility>
+#include <variant>
+
+#include "opencv2/core/mat.hpp"
+
+#include "sensor_msgs/msg/image.hpp"
+#include "std_msgs/msg/header.hpp"
+
+#include "gmat_image_container.hpp"
+
+namespace shm_msgs
+{
+
+namespace
+{
+int
+encoding2mat_type(const std::string & encoding)
+{
+  if (encoding == "mono8") {
+    return CV_8UC1;
+  } else if (encoding == "bgr8") {
+    return CV_8UC3;
+  } else if (encoding == "mono16") {
+    return CV_16SC1;
+  } else if (encoding == "rgba8") {
+    return CV_8UC4;
+  } else if (encoding == "bgra8") {
+    return CV_8UC4;
+  } else if (encoding == "32FC1") {
+    return CV_32FC1;
+  } else if (encoding == "rgb8") {
+    return CV_8UC3;
+  } else if (encoding == "yuv422") {
+    return CV_8UC2;
+  } else {
+    throw std::runtime_error("Unsupported encoding type");
+  }
+}
+
+template<typename T>
+struct NotNull
+{
+  NotNull(const T * pointer_in, const char * msg)
+  : pointer(pointer_in)
+  {
+    if (pointer == nullptr) {
+      throw std::invalid_argument(msg);
+    }
+  }
+
+  const T * pointer;
+};
+
+}  // namespace
+
+ROSGpuMatContainer::ROSGpuMatContainer(
+  std::unique_ptr<sensor_msgs::msg::Image> unique_sensor_msgs_image)
+: header_(NotNull(
+      unique_sensor_msgs_image.get(),
+      "unique_sensor_msgs_image cannot be nullptr"
+).pointer->header),
+  frame_(
+    unique_sensor_msgs_image->height,
+    unique_sensor_msgs_image->width,
+    encoding2mat_type(unique_sensor_msgs_image->encoding),
+    unique_sensor_msgs_image->data.data(),
+    unique_sensor_msgs_image->step),
+  storage_(std::move(unique_sensor_msgs_image))
+{}
+
+ROSGpuMatContainer::ROSGpuMatContainer(
+  const cv::Mat & mat_frame,
+  const std_msgs::msg::Header & header,
+  std::shared_ptr<cv::cuda::Stream> cuda_stream)
+: header_(header),
+  cv_cuda_stream_(cuda_stream),
+  frame_(mat_frame),
+  storage_(nullptr)
+{
+  cv_cuda_event_ = std::make_shared<cv::cuda::Event>();
+}
+
+ROSGpuMatContainer::ROSGpuMatContainer(
+  const sensor_msgs::msg::Image & sensor_msgs_image)
+: ROSGpuMatContainer(std::make_unique<sensor_msgs::msg::Image>(sensor_msgs_image))
+{}
+
+bool
+ROSGpuMatContainer::is_owning() const
+{
+  return std::holds_alternative<std::nullptr_t>(storage_);
+}
+
+const cv::cuda::GpuMat &
+ROSGpuMatContainer::cv_gpu_mat() const
+{
+  return frame_;
+}
+
+cv::cuda::GpuMat
+ROSGpuMatContainer::cv_gpu_mat()
+{
+  return frame_;
+}
+
+const std_msgs::msg::Header &
+ROSGpuMatContainer::header() const
+{
+  return header_;
+}
+
+std_msgs::msg::Header &
+ROSGpuMatContainer::header()
+{
+  return header_;
+}
+
+// std::shared_ptr<const sensor_msgs::msg::Image>
+// ROSGpuMatContainer::get_sensor_msgs_msg_image_pointer() const
+// {
+//   if (!std::holds_alternative<std::shared_ptr<sensor_msgs::msg::Image>>(storage_)) {
+//     return nullptr;
+//   }
+//   return std::get<std::shared_ptr<sensor_msgs::msg::Image>>(storage_);
+// }
+
+std::unique_ptr<sensor_msgs::msg::Image>
+ROSGpuMatContainer::get_sensor_msgs_msg_image_pointer_copy() const
+{
+  auto unique_image = std::make_unique<sensor_msgs::msg::Image>();
+  this->get_sensor_msgs_msg_image_copy(*unique_image);
+  return unique_image;
+}
+
+void
+ROSGpuMatContainer::get_sensor_msgs_msg_image_copy(
+  sensor_msgs::msg::Image & sensor_msgs_image) const
+{
+  sensor_msgs_image.height = frame_.rows;
+  sensor_msgs_image.width = frame_.cols;
+  switch (frame_.type()) {
+    case CV_8UC1:
+      sensor_msgs_image.encoding = "mono8";
+      break;
+    case CV_8UC3:
+      sensor_msgs_image.encoding = "bgr8";
+      break;
+    case CV_16SC1:
+      sensor_msgs_image.encoding = "mono16";
+      break;
+    case CV_8UC4:
+      sensor_msgs_image.encoding = "rgba8";
+      break;
+    default:
+      throw std::runtime_error("unsupported encoding type");
+  }
+  sensor_msgs_image.step = static_cast<sensor_msgs::msg::Image::_step_type>(frame_.step);
+  size_t size = frame_.step * frame_.rows;
+  sensor_msgs_image.data.resize(size);
+  // memcpy(&sensor_msgs_image.data[0], frame_.data, size);
+  cv::Mat cpu_mat;
+  frame_.download(cpu_mat, *cv_cuda_stream_);
+  sensor_msgs_image.data.assign(cpu_mat.datastart, cpu_mat.dataend);
+
+  sensor_msgs_image.header = header_;
+}
+
+}  // namespace shm_msgs
\ No newline at end of file
diff --git a/cuda/gmat_image_container.hpp b/cuda/gmat_image_container.hpp
new file mode 100644
index 0000000..9e192c4
--- /dev/null
+++ b/cuda/gmat_image_container.hpp
@@ -0,0 +1,164 @@
+#ifndef SHM_MSGS__GPU_MAT_SENSOR_MSGS_IMAGE_TYPE_ADAPTER_HPP_
+#define SHM_MSGS__GPU_MAT_SENSOR_MSGS_IMAGE_TYPE_ADAPTER_HPP_
+
+#include <cstddef>
+#include <memory>
+#include <variant>
+
+#include "opencv2/core/mat.hpp"
+#include <opencv2/cudaimgproc.hpp>
+
+#include "rclcpp/type_adapter.hpp"
+#include "sensor_msgs/msg/image.hpp"
+
+namespace shm_msgs
+{
+
+class ROSGpuMatContainer
+{
+
+public:
+  using SensorMsgsImageStorageType = std::variant<
+    std::nullptr_t,
+    std::unique_ptr<sensor_msgs::msg::Image>,
+    std::shared_ptr<sensor_msgs::msg::Image>
+  >;
+
+  ROSGpuMatContainer()
+  {
+    cv_cuda_stream_ = std::make_shared<cv::cuda::Stream>();
+    cv_cuda_event_ = std::make_shared<cv::cuda::Event>();
+  }
+
+  // deep copy from other
+  explicit ROSGpuMatContainer(const ROSGpuMatContainer & other)
+  : header_(other.header_)
+  {
+    // Make a new stream and have it wait on the previous one.
+    cv_cuda_stream_ = std::make_shared<cv::cuda::Stream>();
+    cv_cuda_event_ = std::make_shared<cv::cuda::Event>();
+    cv_cuda_event_->record(*other.cv_cuda_stream_);
+    cv_cuda_stream_->waitEvent(*cv_cuda_event_);
+
+    other.frame_.copyTo(frame_, *cv_cuda_stream_);
+    if (std::holds_alternative<std::shared_ptr<sensor_msgs::msg::Image>>(other.storage_)) {
+      storage_ = std::get<std::shared_ptr<sensor_msgs::msg::Image>>(other.storage_);
+    } else if (std::holds_alternative<std::unique_ptr<sensor_msgs::msg::Image>>(other.storage_)) {
+      storage_ = std::make_unique<sensor_msgs::msg::Image>(
+        *std::get<std::unique_ptr<sensor_msgs::msg::Image>>(other.storage_));
+    }
+  }
+
+  // shallow copy only
+  ROSGpuMatContainer & operator=(const ROSGpuMatContainer & other)
+  {
+    if (this != &other) {
+      header_ = other.header_;
+      frame_ = other.frame_;
+      cv_cuda_stream_ = other.cv_cuda_stream_;
+      cv_cuda_event_ = other.cv_cuda_event_;
+      if (std::holds_alternative<std::shared_ptr<sensor_msgs::msg::Image>>(other.storage_)) {
+        storage_ = std::get<std::shared_ptr<sensor_msgs::msg::Image>>(other.storage_);
+      } else if (std::holds_alternative<std::unique_ptr<sensor_msgs::msg::Image>>(other.storage_)) {
+        storage_ = std::make_unique<sensor_msgs::msg::Image>(
+          *std::get<std::unique_ptr<sensor_msgs::msg::Image>>(other.storage_));
+      } else if (std::holds_alternative<std::nullptr_t>(other.storage_)) {
+        storage_ = nullptr;
+      }
+    }
+    return *this;
+  }
+
+  /// Store an owning pointer to a sensor_msg::msg::Image, and create a cv::cuda::GpuMat that references it.
+  explicit ROSGpuMatContainer(std::unique_ptr<sensor_msgs::msg::Image> unique_sensor_msgs_image);
+
+  /// copy the given cv::Mat into this class.
+  ROSGpuMatContainer(
+    const cv::Mat & mat_frame,
+    const std_msgs::msg::Header & header,
+    std::shared_ptr<cv::cuda::Stream> cv_cuda_stream);
+
+  /// Copy the sensor_msgs::msg::Image into this contain and create a cv::cuda::GpuMat that references it.
+  explicit ROSGpuMatContainer(const sensor_msgs::msg::Image & sensor_msgs_image);
+
+  /// Return true if this class owns the data the cv_gpu_mat references.
+  /**
+   * Note that this does not check if the cv::cuda::GpuMat owns its own data, only if
+   * this class owns a sensor_msgs::msg::Image that the cv::cuda::GpuMat references.
+   */
+  bool
+  is_owning() const;
+
+  /// Const access the cv::cuda::GpuMat in this class.
+  const cv::cuda::GpuMat &
+  cv_gpu_mat() const;
+
+  /// Get a shallow copy of the cv::cuda::GpuMat that is in this class.
+  /**
+   * Note that if you want to let this container go out of scope you should
+   * make a deep copy with cv::cuda::GpuMat::clone() beforehand.
+   */
+  cv::cuda::GpuMat
+  cv_gpu_mat();
+
+  /// Const access the ROS Header.
+  const std_msgs::msg::Header &
+  header() const;
+
+  /// Access the ROS Header.
+  std_msgs::msg::Header &
+  header();
+
+  /// Get shared const pointer to the sensor_msgs::msg::Image if available, otherwise nullptr.
+  // std::shared_ptr<const sensor_msgs::msg::Image>
+  // get_sensor_msgs_msg_image_pointer() const;
+
+  /// Get copy as a unique pointer to the sensor_msgs::msg::Image.
+  std::unique_ptr<sensor_msgs::msg::Image>
+  get_sensor_msgs_msg_image_pointer_copy() const;
+
+  /// Get a copy of the image as a sensor_msgs::msg::Image.
+  sensor_msgs::msg::Image
+  get_sensor_msgs_msg_image_copy() const;
+
+  /// Get a copy of the image as a sensor_msgs::msg::Image.
+  void
+  get_sensor_msgs_msg_image_copy(sensor_msgs::msg::Image & sensor_msgs_image) const;
+
+private:
+  std_msgs::msg::Header header_;
+  cv::cuda::GpuMat frame_;
+  std::shared_ptr<cv::cuda::Stream> cv_cuda_stream_;
+  std::shared_ptr<cv::cuda::Event> cv_cuda_event_;
+  SensorMsgsImageStorageType storage_;
+};
+
+}  // namespace shm_msgs
+
+template<>
+struct rclcpp::TypeAdapter<shm_msgs::ROSGpuMatContainer, sensor_msgs::msg::Image>
+{
+  using is_specialized = std::true_type;
+  using custom_type = shm_msgs::ROSGpuMatContainer;
+  using ros_message_type = sensor_msgs::msg::Image;
+
+  static
+  void
+  convert_to_ros_message(
+    const custom_type & source,
+    ros_message_type & destination)
+  {
+    source.get_sensor_msgs_msg_image_copy(destination);
+  }
+
+  static
+  void
+  convert_to_custom(
+    const ros_message_type & source,
+    custom_type & destination)
+  {
+    destination = shm_msgs::ROSGpuMatContainer(source);
+  }
+};
+
+#endif  // SHM_MSGS__GPU_MAT_SENSOR_MSGS_IMAGE_TYPE_ADAPTER_HPP_
\ No newline at end of file
diff --git a/cuda/gmat_image_listener.hpp b/cuda/gmat_image_listener.hpp
new file mode 100644
index 0000000..23747e5
--- /dev/null
+++ b/cuda/gmat_image_listener.hpp
@@ -0,0 +1,87 @@
+// Copyright 2021 Apex.AI, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstring>
+#include <memory>
+#include <cinttypes>
+
+#include "rclcpp/rclcpp.hpp"
+#include "rclcpp/type_adapter.hpp"
+
+#include "sensor_msgs/msg/image.hpp"
+#include <cv_bridge/cv_bridge.h>
+
+#include "gmat_image_container.hpp"
+
+#include <opencv2/opencv.hpp>
+
+using namespace std::chrono_literals;
+using namespace cv_bridge;
+
+class Listener : public rclcpp::Node {
+private:
+  using Topic = rclcpp::TypeAdapter<shm_msgs::ROSGpuMatContainer, sensor_msgs::msg::Image>;
+  using DataType = shm_msgs::ROSGpuMatContainer;
+
+public:
+  explicit Listener()
+      : Node("gmat_image_listener", rclcpp::NodeOptions().use_intra_process_comms(true)) {
+
+    // subscription callback to process arriving data
+    auto callback = [this](std::unique_ptr<DataType> msg) -> void {
+
+      RCLCPP_INFO(this->get_logger(), "received with address: 0x%" PRIXPTR "", reinterpret_cast<std::uintptr_t>(msg.get()));
+
+      // no conversion
+      // last_cvimage = cv_bridge::toCvShare(msg);
+      // last_cvimage = cv_bridge::toCvCopy(msg);
+
+      // move to the private ptr
+      m_last_container = std::move(msg);
+      RCLCPP_INFO(this->get_logger(), "container address: 0x%" PRIXPTR "", reinterpret_cast<std::uintptr_t>(m_last_container.get()));
+
+      // directly process gpumat, no copy from gpu to cpu
+      m_last_container->cv_gpu_mat();
+
+      auto time_offset_ns = (now() - m_last_container->header().stamp).nanoseconds();
+      auto timestamp_offset_ns = (rclcpp::Time(m_last_container->header().stamp) - m_last_image_ts).nanoseconds();
+      auto time_offset_ms = time_offset_ns / 1000000.0F;
+      auto timestamp_offset_ms = timestamp_offset_ns / 1000000.0F;
+      RCLCPP_INFO(get_logger(), "get-image-transport-time: %.3f", time_offset_ms);
+      if(m_last_image_ts.nanoseconds() > 0.0)
+      {
+        RCLCPP_INFO(get_logger(), "get-image-timestamp_offset-time: %.3f", timestamp_offset_ms);
+      }
+      m_last_image_ts = m_last_container->header().stamp;
+      // cv::imshow("im show", last_cvimage->image);
+      // cv::waitKey(0);
+    };
+
+    // rclcpp::QoS qos(rclcpp::KeepLast(10));
+    rclcpp::QoS custom_qos_profile = rclcpp::QoS(rclcpp::QoSInitialization::from_rmw(rmw_qos_profile_default))
+      .history(rmw_qos_history_policy_t::RMW_QOS_POLICY_HISTORY_KEEP_LAST)
+      .keep_last(5)
+      .reliability(rmw_qos_reliability_policy_t::RMW_QOS_POLICY_RELIABILITY_BEST_EFFORT)
+      .durability(rmw_qos_durability_policy_t::RMW_QOS_POLICY_DURABILITY_VOLATILE)
+      .avoid_ros_namespace_conventions(false);
+
+    m_subscription = create_subscription<Topic>("gmat_image", custom_qos_profile, callback);
+  }
+
+private:
+  rclcpp::Subscription<Topic>::SharedPtr m_subscription;
+
+  std::unique_ptr<DataType> m_last_container;
+  rclcpp::Time m_last_image_ts{0, 0, RCL_ROS_TIME};
+};
diff --git a/cuda/gmat_image_node.cpp b/cuda/gmat_image_node.cpp
new file mode 100644
index 0000000..a814e85
--- /dev/null
+++ b/cuda/gmat_image_node.cpp
@@ -0,0 +1,22 @@
+#include <rclcpp/rclcpp.hpp>
+
+#include <memory>
+
+#include "gmat_image_talker.hpp"
+#include "gmat_image_listener.hpp"
+
+int main(int argc, char * argv[])
+{
+  setvbuf(stdout, NULL, _IONBF, BUFSIZ);
+  rclcpp::init(argc, argv);
+  rclcpp::executors::SingleThreadedExecutor executor;
+
+  auto talker = std::make_shared<Talker>();
+  auto listener = std::make_shared<Listener>();
+  executor.add_node(talker);
+  executor.add_node(listener);
+
+  executor.spin();
+  rclcpp::shutdown();
+  return 0;
+}
\ No newline at end of file
diff --git a/cuda/gmat_image_talker.hpp b/cuda/gmat_image_talker.hpp
new file mode 100644
index 0000000..4c963b3
--- /dev/null
+++ b/cuda/gmat_image_talker.hpp
@@ -0,0 +1,98 @@
+// Copyright 2021 Apex.AI, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <chrono>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <utility>
+#include <cinttypes>
+
+#include "rclcpp/rclcpp.hpp"
+#include "rclcpp/type_adapter.hpp"
+
+#include "sensor_msgs/msg/image.hpp"
+#include <sensor_msgs/image_encodings.hpp>
+#include <cv_bridge/cv_bridge.h>
+
+#include <opencv2/opencv.hpp>
+
+#include "gmat_image_container.hpp"
+
+using namespace std::chrono_literals;
+using namespace cv_bridge;
+
+// RCLCPP_USING_CUSTOM_TYPE_AS_ROS_MESSAGE_TYPE(
+//   shm_msgs::ROSGpuMatContainer,
+//   sensor_msgs::msg::Image);
+
+class Talker : public rclcpp::Node {
+private:
+  using Topic = rclcpp::TypeAdapter<shm_msgs::ROSGpuMatContainer, sensor_msgs::msg::Image>;
+  using DataType = shm_msgs::ROSGpuMatContainer;
+
+public:
+  explicit Talker()
+      : Node("gmat_image_talker", rclcpp::NodeOptions().use_intra_process_comms(true)) {
+
+    // m_input_cvimage->image = cv::imread("./res/img/205_182.png");
+    // m_input_cvimage->image = cv::imread("./res/img/1024_768.jpeg");
+    m_input_cvimage->image = cv::imread("./res/img/1920_1080.jpg");
+    m_input_cvimage->header.frame_id = "camera_link";
+    m_input_cvimage->encoding = "bgr8";
+    // cv::imshow("input image", m_input_cvimage->image);
+    // cv::waitKey(0);
+    cv_cuda_stream_ = std::make_shared<cv::cuda::Stream>();
+
+    auto publishMessage = [this]() -> void {
+      m_input_cvimage->header.stamp = now();
+
+      // no deep copy from capture to published
+      // copy from cpu to gpu
+      auto msg = std::make_unique<DataType>(m_input_cvimage->image, m_input_cvimage->header, cv_cuda_stream_);
+
+      // no conversion
+      // m_input_cvimage->toImageMsg(*msg);
+
+      // RCLCPP_INFO(this->get_logger(), "Publishing with ts: %u.%u", msg->header().stamp.sec, msg->header().stamp.nanosec);
+      RCLCPP_INFO(this->get_logger(), "Publishing with address: 0x%" PRIXPTR "", reinterpret_cast<std::uintptr_t>(msg.get()));
+
+      m_publisher->publish(std::move(msg));
+      // We gave up ownership
+
+      m_count++;
+    };
+
+    // rclcpp::QoS qos(rclcpp::KeepLast(10));
+    // rclcpp::QoS qos(rclcpp::SensorDataQoS());
+    rclcpp::QoS custom_qos_profile = rclcpp::QoS(rclcpp::QoSInitialization::from_rmw(rmw_qos_profile_default))
+      .history(rmw_qos_history_policy_t::RMW_QOS_POLICY_HISTORY_KEEP_LAST)
+      .keep_last(5)
+      .reliability(rmw_qos_reliability_policy_t::RMW_QOS_POLICY_RELIABILITY_BEST_EFFORT)
+      .durability(rmw_qos_durability_policy_t::RMW_QOS_POLICY_DURABILITY_VOLATILE)
+      .avoid_ros_namespace_conventions(false);
+
+    m_publisher = this->create_publisher<Topic>("gmat_image", custom_qos_profile);
+
+    // Use a timer to schedule periodic message publishing.
+    m_timer = this->create_wall_timer(1s, publishMessage);
+  }
+
+private:
+  uint64_t m_count = 1;
+  rclcpp::Publisher<Topic>::SharedPtr m_publisher;
+  rclcpp::TimerBase::SharedPtr m_timer;
+  std::shared_ptr<cv_bridge::CvImage> m_input_cvimage{std::make_shared<cv_bridge::CvImage>()};
+  std::shared_ptr<cv::cuda::Stream> cv_cuda_stream_;
+};
diff --git a/cuda/intra_gmat_listener.hpp b/cuda/intra_gmat_listener.hpp
new file mode 100644
index 0000000..2d859d7
--- /dev/null
+++ b/cuda/intra_gmat_listener.hpp
@@ -0,0 +1,80 @@
+// Copyright 2021 Apex.AI, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstring>
+#include <memory>
+#include <cinttypes>
+
+#include "rclcpp/rclcpp.hpp"
+
+#include "sensor_msgs/msg/image.hpp"
+#include <cv_bridge/cv_bridge.h>
+
+#include <opencv2/opencv.hpp>
+#include <opencv2/cudaimgproc.hpp>
+
+using namespace std::chrono_literals;
+using namespace cv_bridge;
+class Listener : public rclcpp::Node {
+private:
+  using Topic = sensor_msgs::msg::Image;
+
+public:
+  explicit Listener()
+      : Node("intra_gmat_listener", rclcpp::NodeOptions().use_intra_process_comms(true)) {
+
+    // subscription callback to process arriving data
+    auto callback = [this](const Topic::SharedPtr msg) -> void {
+
+      RCLCPP_INFO(this->get_logger(), "received with address: 0x%" PRIXPTR "", reinterpret_cast<std::uintptr_t>(msg.get()));
+
+      // from receive to process
+      last_cvimage = cv_bridge::toCvShare(msg);
+      // last_cvimage = cv_bridge::toCvCopy(msg);
+
+      // copy from cpu to gpu
+      m_last_gmatimage->upload(last_cvimage->image);
+
+      auto time_offset_ns = (now() - last_cvimage->header.stamp).nanoseconds();
+      auto timestamp_offset_ns = (rclcpp::Time(msg->header.stamp) - m_last_image_ts).nanoseconds();
+      auto time_offset_ms = time_offset_ns / 1000000.0F;
+      auto timestamp_offset_ms = timestamp_offset_ns / 1000000.0F;
+      RCLCPP_INFO(get_logger(), "get-image-transport-time: %.3f", time_offset_ms);
+      if(m_last_image_ts.nanoseconds() > 0.0)
+      {
+        RCLCPP_INFO(get_logger(), "get-image-timestamp_offset-time: %.3f", timestamp_offset_ms);
+      }
+      m_last_image_ts = msg->header.stamp;
+      // cv::imshow("im show", last_cvimage->image);
+      // cv::waitKey(0);
+    };
+
+    // rclcpp::QoS qos(rclcpp::KeepLast(10));
+    rclcpp::QoS custom_qos_profile = rclcpp::QoS(rclcpp::QoSInitialization::from_rmw(rmw_qos_profile_default))
+      .history(rmw_qos_history_policy_t::RMW_QOS_POLICY_HISTORY_KEEP_LAST)
+      .keep_last(5)
+      .reliability(rmw_qos_reliability_policy_t::RMW_QOS_POLICY_RELIABILITY_BEST_EFFORT)
+      .durability(rmw_qos_durability_policy_t::RMW_QOS_POLICY_DURABILITY_VOLATILE)
+      .avoid_ros_namespace_conventions(false);
+
+    m_subscription = create_subscription<Topic>("intra_gmat", custom_qos_profile, callback);
+  }
+
+private:
+  rclcpp::Subscription<Topic>::SharedPtr m_subscription;
+
+  cv_bridge::CvImageConstPtr last_cvimage;
+  std::shared_ptr<cv::cuda::GpuMat> m_last_gmatimage{std::make_shared<cv::cuda::GpuMat>()};
+  rclcpp::Time m_last_image_ts{0, 0, RCL_ROS_TIME};
+};
diff --git a/cuda/intra_gmat_node.cpp b/cuda/intra_gmat_node.cpp
new file mode 100644
index 0000000..de1c806
--- /dev/null
+++ b/cuda/intra_gmat_node.cpp
@@ -0,0 +1,22 @@
+#include <rclcpp/rclcpp.hpp>
+
+#include <memory>
+
+#include "intra_gmat_talker.hpp"
+#include "intra_gmat_listener.hpp"
+
+int main(int argc, char * argv[])
+{
+  setvbuf(stdout, NULL, _IONBF, BUFSIZ);
+  rclcpp::init(argc, argv);
+  rclcpp::executors::SingleThreadedExecutor executor;
+
+  auto talker = std::make_shared<Talker>();
+  auto listener = std::make_shared<Listener>();
+  executor.add_node(talker);
+  executor.add_node(listener);
+
+  executor.spin();
+  rclcpp::shutdown();
+  return 0;
+}
\ No newline at end of file
diff --git a/cuda/intra_gmat_talker.hpp b/cuda/intra_gmat_talker.hpp
new file mode 100644
index 0000000..cdf0b12
--- /dev/null
+++ b/cuda/intra_gmat_talker.hpp
@@ -0,0 +1,93 @@
+// Copyright 2021 Apex.AI, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <chrono>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <utility>
+#include <cinttypes>
+
+#include "rclcpp/rclcpp.hpp"
+
+#include "sensor_msgs/msg/image.hpp"
+#include <sensor_msgs/image_encodings.hpp>
+#include <cv_bridge/cv_bridge.h>
+
+#include <opencv2/opencv.hpp>
+#include <opencv2/cudaimgproc.hpp>
+
+using namespace std::chrono_literals;
+using namespace cv_bridge;
+
+class Talker : public rclcpp::Node {
+private:
+  using Topic = sensor_msgs::msg::Image;
+
+public:
+  explicit Talker()
+      : Node("intra_gmat_talker", rclcpp::NodeOptions().use_intra_process_comms(true)) {
+
+    // m_input_cvimage->image = cv::imread("./res/img/205_182.png");
+    // m_input_cvimage->image = cv::imread("./res/img/1024_768.jpeg");
+    m_input_cvimage->image = cv::imread("./res/img/1920_1080.jpg");
+    m_input_cvimage->header.frame_id = "camera_link";
+    m_input_cvimage->encoding = "bgr8";
+    // cv::imshow("input image", m_input_cvimage->image);
+    // cv::waitKey(0);
+    // copy from cpu to gpu and processing
+    m_input_gmatimage->upload(m_input_cvimage->image);
+
+    auto publishMessage = [this]() -> void {
+      m_input_cvimage->header.stamp = now();
+
+      // copy from gpu to cpu
+      m_input_gmatimage->download(m_input_cvimage->image);
+
+      auto msg = std::make_unique<Topic>();
+      // a copy from capture to published
+      m_input_cvimage->toImageMsg(*msg);
+
+      // RCLCPP_INFO(this->get_logger(), "Publishing with ts: %u.%u", m_input_cvimage->header.stamp.sec, m_input_cvimage->header.stamp.nanosec);
+      RCLCPP_INFO(this->get_logger(), "Publishing with address: 0x%" PRIXPTR "", reinterpret_cast<std::uintptr_t>(msg.get()));
+
+      m_publisher->publish(std::move(msg));
+      // We gave up ownership
+
+      m_count++;
+    };
+
+    // rclcpp::QoS qos(rclcpp::KeepLast(10));
+    // rclcpp::QoS qos(rclcpp::SensorDataQoS());
+    rclcpp::QoS custom_qos_profile = rclcpp::QoS(rclcpp::QoSInitialization::from_rmw(rmw_qos_profile_default))
+      .history(rmw_qos_history_policy_t::RMW_QOS_POLICY_HISTORY_KEEP_LAST)
+      .keep_last(5)
+      .reliability(rmw_qos_reliability_policy_t::RMW_QOS_POLICY_RELIABILITY_BEST_EFFORT)
+      .durability(rmw_qos_durability_policy_t::RMW_QOS_POLICY_DURABILITY_VOLATILE)
+      .avoid_ros_namespace_conventions(false);
+
+    m_publisher = this->create_publisher<Topic>("intra_gmat", custom_qos_profile);
+
+    // Use a timer to schedule periodic message publishing.
+    m_timer = this->create_wall_timer(1s, publishMessage);
+  }
+
+private:
+  uint64_t m_count = 1;
+  rclcpp::Publisher<Topic>::SharedPtr m_publisher;
+  rclcpp::TimerBase::SharedPtr m_timer;
+  std::shared_ptr<cv_bridge::CvImage> m_input_cvimage{std::make_shared<cv_bridge::CvImage>()};
+  std::shared_ptr<cv::cuda::GpuMat> m_input_gmatimage{std::make_shared<cv::cuda::GpuMat>()};
+  std::unique_ptr<sensor_msgs::msg::Image> m_input_rosimage{std::make_unique<sensor_msgs::msg::Image>()};
+};