diff --git a/perception/tensorrt_yolox/CMakeLists.txt b/perception/tensorrt_yolox/CMakeLists.txt index a5498a845e62e..c38824cecd5b1 100644 --- a/perception/tensorrt_yolox/CMakeLists.txt +++ b/perception/tensorrt_yolox/CMakeLists.txt @@ -15,6 +15,7 @@ include(CheckLanguage) check_language(CUDA) if(CMAKE_CUDA_COMPILER) enable_language(CUDA) + find_package(autoware_type_adapters) else() message(WARNING "CUDA is not found. preprocess acceleration using CUDA will not be available.") endif() @@ -32,6 +33,7 @@ ament_auto_add_library(${PROJECT_NAME} SHARED ament_target_dependencies(${PROJECT_NAME} OpenCV + autoware_type_adapters ) if(CMAKE_CUDA_COMPILER) diff --git a/perception/tensorrt_yolox/config/yolox_s_plus_opt.param.yaml b/perception/tensorrt_yolox/config/yolox_s_plus_opt.param.yaml index 57c1b40c44a4d..178c3f8bba0a3 100644 --- a/perception/tensorrt_yolox/config/yolox_s_plus_opt.param.yaml +++ b/perception/tensorrt_yolox/config/yolox_s_plus_opt.param.yaml @@ -35,3 +35,6 @@ clip_value: 6.0 # If positive value is specified, the value of each layer output will be clipped between [0.0, clip_value]. This option is valid only when precision==int8 and used to manually specify the dynamic range instead of using any calibration. preprocess_on_gpu: true # If true, pre-processing is performed on GPU. calibration_image_list_path: "" # Path to a file which contains path to images. Those images will be used for int8 quantization. + is_using_image_transport: true # if true, will use image_transport, false -> will only publish raw images + type_adaptation_activated: true # if true, will use type_adaptation to transfer images in GPU. + is_publish_debug_rois_image_: true # if true, will draw ROIs in the image. diff --git a/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox.hpp b/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox.hpp index d287c8a44d4cf..7aa9e6d036a87 100644 --- a/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox.hpp +++ b/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox.hpp @@ -15,6 +15,8 @@ #ifndef TENSORRT_YOLOX__TENSORRT_YOLOX_HPP_ #define TENSORRT_YOLOX__TENSORRT_YOLOX_HPP_ +#include "type_adapters/image_container.hpp" + #include #include #include @@ -31,6 +33,7 @@ using cuda_utils::CudaUniquePtr; using cuda_utils::CudaUniquePtrHost; using cuda_utils::makeCudaStream; using cuda_utils::StreamUniquePtr; +using ImageContainer = autoware::type_adaptation::type_adapters::ImageContainer; struct Object { @@ -107,6 +110,10 @@ class TrtYoloX const std::vector & images, ObjectArrays & objects, std::vector & masks, std::vector & color_masks); + bool doInference( + const std::vector> & images, ObjectArrays & objects, + std::vector & masks, std::vector & color_masks); + /** * @brief run inference including pre-process and post-process * @param[out] objects results for object detection @@ -116,6 +123,10 @@ class TrtYoloX bool doInferenceWithRoi( const std::vector & images, ObjectArrays & objects, const std::vector & roi); + bool doInferenceWithRoi( + const std::vector> & images, ObjectArrays & objects, + const std::vector & roi); + /** * @brief run multi-scale inference including pre-process and post-process * @param[out] objects results for object detection @@ -125,6 +136,10 @@ class TrtYoloX bool doMultiScaleInference( const cv::Mat & image, ObjectArrays & objects, const std::vector & roi); + bool doMultiScaleInference( + const std::shared_ptr & image, ObjectArrays & objects, + const std::vector & roi); + /** * @brief allocate buffer for preprocess on GPU * @param[in] width original image width @@ -167,6 +182,7 @@ class TrtYoloX * @param[in] images batching images */ void preprocessGpu(const std::vector & images); + void preprocessGpu(const std::vector> & images); /** * @brief run preprocess including resizing, letterbox, NHWC2NCHW and toFloat on CPU @@ -183,6 +199,9 @@ class TrtYoloX void preprocessWithRoiGpu( const std::vector & images, const std::vector & rois); + void preprocessWithRoiGpu( + const std::vector> & images, + const std::vector & rois); /** * @brief run multi-scale preprocess including resizing, letterbox, NHWC2NCHW and toFloat on CPU * @param[in] images batching images @@ -196,15 +215,29 @@ class TrtYoloX * @param[in] rois region of interest */ void multiScalePreprocessGpu(const cv::Mat & image, const std::vector & rois); + void multiScalePreprocessGpu( + const std::shared_ptr & image, const std::vector & rois); bool multiScaleFeedforward(const cv::Mat & image, int batch_size, ObjectArrays & objects); + bool multiScaleFeedforward( + const std::shared_ptr & image, int batch_size, ObjectArrays & objects); + bool multiScaleFeedforwardAndDecode( const cv::Mat & images, int batch_size, ObjectArrays & objects); + bool multiScaleFeedforwardAndDecode( + const std::shared_ptr & images, int batch_size, ObjectArrays & objects); bool feedforward(const std::vector & images, ObjectArrays & objects); + bool feedforward( + const std::vector> & images, ObjectArrays & objects); + bool feedforwardAndDecode( const std::vector & images, ObjectArrays & objects, std::vector & masks, std::vector & color_masks); + bool feedforwardAndDecode( + const std::vector> & images, ObjectArrays & objects, + std::vector & masks, std::vector & color_masks); + void decodeOutputs(float * prob, ObjectArray & objects, float scale, cv::Size & img_size) const; void generateGridsAndStride( const int target_w, const int target_h, const std::vector & strides, diff --git a/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox_node.hpp b/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox_node.hpp index 64332dffa834e..7e16ac7d0b4bc 100644 --- a/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox_node.hpp +++ b/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox_node.hpp @@ -16,6 +16,7 @@ #define TENSORRT_YOLOX__TENSORRT_YOLOX_NODE_HPP_ #include "object_recognition_utils/object_recognition_utils.hpp" +#include "type_adapters/image_container.hpp" #include #include @@ -48,6 +49,7 @@ namespace tensorrt_yolox // cspell: ignore Semseg using LabelMap = std::map; using Label = tier4_perception_msgs::msg::Semantic; +using ImageContainer = autoware::type_adaptation::type_adapters::ImageContainer; class TrtYoloXNode : public rclcpp::Node { struct RoiOverlaySemsegLabel @@ -74,12 +76,22 @@ class TrtYoloXNode : public rclcpp::Node private: void onConnect(); + void setUpImageSubscriber(); + bool checkInputBlocked(); + uint32_t getNumOutputConnections(); void onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg); + void onGpuImage(const std::shared_ptr msg); bool readLabelFile(const std::string & label_path); void replaceLabelMap(); void overlapSegmentByRoi( const tensorrt_yolox::Object & object, cv::Mat & mask, const int width, const int height); int mapRoiLabel2SegLabel(const int32_t roi_label_index); + void postInference( + const std_msgs::msg::Header header, const tensorrt_yolox::ObjectArrays & objects, + const int width, const int height, std::vector & masks); + void drawImageDetection( + cv_bridge::CvImagePtr image, const tensorrt_yolox::ObjectArrays & objects, const int width, + const int height); image_transport::Publisher image_pub_; image_transport::Publisher mask_pub_; @@ -88,6 +100,10 @@ class TrtYoloXNode : public rclcpp::Node rclcpp::Publisher::SharedPtr objects_pub_; image_transport::Subscriber image_sub_; + rclcpp::Subscription::SharedPtr gpu_image_sub_; + rclcpp::Publisher::SharedPtr simple_image_pub_; + rclcpp::Publisher::SharedPtr simple_mask_pub_; + rclcpp::Publisher::SharedPtr simple_color_mask_pub_; rclcpp::TimerBase::SharedPtr timer_; @@ -95,6 +111,9 @@ class TrtYoloXNode : public rclcpp::Node std::unique_ptr trt_yolox_; bool is_roi_overlap_segment_; bool is_publish_color_mask_; + bool is_using_image_transport_; + bool type_adaptation_activated_; + bool is_publish_debug_rois_image_; float overlap_roi_score_threshold_; // TODO(badai-nguyen): change to function std::map remap_roi_to_semantic_ = { diff --git a/perception/tensorrt_yolox/package.xml b/perception/tensorrt_yolox/package.xml index 8a80d9d82d09f..0b5b45ecb7044 100644 --- a/perception/tensorrt_yolox/package.xml +++ b/perception/tensorrt_yolox/package.xml @@ -19,6 +19,7 @@ tensorrt_cmake_module autoware_perception_msgs + autoware_type_adapters cuda_utils cv_bridge image_transport diff --git a/perception/tensorrt_yolox/schema/yolox_s_plus_opt.schema.json b/perception/tensorrt_yolox/schema/yolox_s_plus_opt.schema.json index ce1ad6c2d0caf..352512c401812 100644 --- a/perception/tensorrt_yolox/schema/yolox_s_plus_opt.schema.json +++ b/perception/tensorrt_yolox/schema/yolox_s_plus_opt.schema.json @@ -74,6 +74,21 @@ "type": "string", "default": "", "description": "Path to a file which contains path to images. Those images will be used for int8 quantization." + }, + "is_using_image_transport": { + "type": "boolean", + "default": false, + "description": "If true, will use image_transport, false -> will only publish raw images" + }, + "type_adaptation_activated": { + "type": "boolean", + "default": false, + "description": "If true, will use type_adaptation to transfer images in GPU." + }, + "is_publish_debug_rois_image_": { + "type": "boolean", + "default": true, + "description": "If true, will draw ROIs in the image." } }, "required": [ diff --git a/perception/tensorrt_yolox/src/tensorrt_yolox.cpp b/perception/tensorrt_yolox/src/tensorrt_yolox.cpp index af6cecd2eff7d..8f8e363790e75 100644 --- a/perception/tensorrt_yolox/src/tensorrt_yolox.cpp +++ b/perception/tensorrt_yolox/src/tensorrt_yolox.cpp @@ -499,6 +499,93 @@ void TrtYoloX::preprocessGpu(const std::vector & images) images[0].rows, 3, batch_size, static_cast(norm_factor_), *stream_); } +void TrtYoloX::preprocessGpu(const std::vector> & images) +{ + const auto batch_size = images.size(); + auto input_dims = trt_common_->getBindingDimensions(0); + + input_dims.d[0] = batch_size; + for (const auto & image : images) { + // if size of source input has been changed... + int width = image->width(); + int height = image->height(); + if (src_width_ != -1 || src_height_ != -1) { + if (width != src_width_ || height != src_height_) { + // Free cuda memory to reallocate + if (image_buf_h_) { + image_buf_h_.reset(); + } + if (image_buf_d_) { + image_buf_d_.reset(); + } + if (argmax_buf_h_) { + argmax_buf_h_.reset(); + } + if (argmax_buf_d_) { + argmax_buf_d_.reset(); + } + } + } + src_width_ = width; + src_height_ = height; + } + if (!image_buf_h_) { + trt_common_->setBindingDimensions(0, input_dims); + scales_.clear(); + } + const float input_height = static_cast(input_dims.d[2]); + const float input_width = static_cast(input_dims.d[3]); + int b = 0; + size_t argmax_out_elem_num = 0; + for (const auto & image : images) { + if (!image_buf_h_) { + const float scale = std::min(input_width / image->width(), input_height / image->height()); + scales_.emplace_back(scale); + image_buf_h_ = cuda_utils::make_unique_host( + image->width() * image->height() * 3 * batch_size, cudaHostAllocWriteCombined); + image_buf_d_ = + cuda_utils::make_unique(image->width() * image->height() * 3 * batch_size); + } + int index = b * image->width() * image->height() * 3; + // Copy into pinned memory + // synchroize the stream + cudaStreamSynchronize(image->cuda_stream()->stream()); + CHECK_CUDA_ERROR(cudaMemcpyAsync( + image_buf_d_.get() + index, image->cuda_mem(), + image->width() * image->height() * 3 * sizeof(unsigned char), cudaMemcpyDeviceToDevice, + *stream_)); + b++; + + if (multitask_) { + for (int m = 0; m < multitask_; m++) { + const auto output_dims = + trt_common_->getBindingDimensions(m + 2); // 0: input, 1: output for detections + const float scale = std::min( + output_dims.d[3] / static_cast(image->width()), + output_dims.d[2] / static_cast(image->height())); + int out_w = static_cast(image->width() * scale); + int out_h = static_cast(image->height() * scale); + argmax_out_elem_num += out_w * out_h * batch_size; + } + } + } + + if (multitask_) { + if (!argmax_buf_h_) { + argmax_buf_h_ = + cuda_utils::make_unique_host(argmax_out_elem_num, cudaHostAllocPortable); + } + if (!argmax_buf_d_) { + argmax_buf_d_ = cuda_utils::make_unique(argmax_out_elem_num); + } + } + + // Preprocess on GPU + resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu( + input_d_.get(), image_buf_d_.get(), input_width, input_height, 3, images[0]->width(), + images[0]->height(), 3, batch_size, static_cast(norm_factor_), *stream_); +} + void TrtYoloX::preprocess(const std::vector & images) { const auto batch_size = images.size(); @@ -566,6 +653,23 @@ bool TrtYoloX::doInference( } } +bool TrtYoloX::doInference( + const std::vector> & images, ObjectArrays & objects, + std::vector & masks, [[maybe_unused]] std::vector & color_masks) +{ + if (!trt_common_->isInitialized()) { + return false; + } + + preprocessGpu(images); + + if (needs_output_decode_) { + return feedforwardAndDecode(images, objects, masks, color_masks); + } else { + return feedforward(images, objects); + } +} + void TrtYoloX::preprocessWithRoiGpu( const std::vector & images, const std::vector & rois) { @@ -890,6 +994,60 @@ bool TrtYoloX::feedforward(const std::vector & images, ObjectArrays & o return true; } +bool TrtYoloX::feedforward( + const std::vector> & images, ObjectArrays & objects) +{ + std::vector buffers = { + input_d_.get(), out_num_detections_d_.get(), out_boxes_d_.get(), out_scores_d_.get(), + out_classes_d_.get()}; + + trt_common_->enqueueV2(buffers.data(), *stream_, nullptr); + + const auto batch_size = images.size(); + auto out_num_detections = std::make_unique(batch_size); + auto out_boxes = std::make_unique(4 * batch_size * max_detections_); + auto out_scores = std::make_unique(batch_size * max_detections_); + auto out_classes = std::make_unique(batch_size * max_detections_); + + CHECK_CUDA_ERROR(cudaMemcpyAsync( + out_num_detections.get(), out_num_detections_d_.get(), sizeof(int32_t) * batch_size, + cudaMemcpyDeviceToHost, *stream_)); + CHECK_CUDA_ERROR(cudaMemcpyAsync( + out_boxes.get(), out_boxes_d_.get(), sizeof(float) * 4 * batch_size * max_detections_, + cudaMemcpyDeviceToHost, *stream_)); + CHECK_CUDA_ERROR(cudaMemcpyAsync( + out_scores.get(), out_scores_d_.get(), sizeof(float) * batch_size * max_detections_, + cudaMemcpyDeviceToHost, *stream_)); + CHECK_CUDA_ERROR(cudaMemcpyAsync( + out_classes.get(), out_classes_d_.get(), sizeof(int32_t) * batch_size * max_detections_, + cudaMemcpyDeviceToHost, *stream_)); + cudaStreamSynchronize(*stream_); + objects.clear(); + for (size_t i = 0; i < batch_size; ++i) { + const size_t num_detection = static_cast(out_num_detections[i]); + ObjectArray object_array; + object_array.reserve(num_detection); + for (size_t j = 0; j < num_detection; ++j) { + Object object{}; + const auto x1 = out_boxes[i * max_detections_ * 4 + j * 4] / scales_[i]; + const auto y1 = out_boxes[i * max_detections_ * 4 + j * 4 + 1] / scales_[i]; + const auto x2 = out_boxes[i * max_detections_ * 4 + j * 4 + 2] / scales_[i]; + const auto y2 = out_boxes[i * max_detections_ * 4 + j * 4 + 3] / scales_[i]; + object.x_offset = + std::clamp(0, static_cast(x1), static_cast(images[i]->width())); + object.y_offset = + std::clamp(0, static_cast(y1), static_cast(images[i]->height())); + object.width = static_cast(std::max(0.0F, x2 - x1)); + object.height = static_cast(std::max(0.0F, y2 - y1)); + object.score = out_scores[i * max_detections_ + j]; + object.type = out_classes[i * max_detections_ + j]; + object_array.emplace_back(object); + } + objects.emplace_back(object_array); + } + return true; +} + bool TrtYoloX::feedforwardAndDecode( const std::vector & images, ObjectArrays & objects, std::vector & out_masks, [[maybe_unused]] std::vector & color_masks) @@ -959,6 +1117,76 @@ bool TrtYoloX::feedforwardAndDecode( return true; } +bool TrtYoloX::feedforwardAndDecode( + const std::vector> & images, ObjectArrays & objects, + std::vector & out_masks, [[maybe_unused]] std::vector & color_masks) +{ + std::vector buffers = {input_d_.get(), out_prob_d_.get()}; + if (multitask_) { + buffers = {input_d_.get(), out_prob_d_.get(), segmentation_out_prob_d_.get()}; + } + trt_common_->enqueueV2(buffers.data(), images[0]->cuda_stream()->stream(), nullptr); + + const auto batch_size = images.size(); + + CHECK_CUDA_ERROR(cudaMemcpyAsync( + out_prob_h_.get(), out_prob_d_.get(), sizeof(float) * out_elem_num_, cudaMemcpyDeviceToHost, + images[0]->cuda_stream()->stream())); + if (multitask_ && !use_gpu_preprocess_) { + CHECK_CUDA_ERROR(cudaMemcpyAsync( + segmentation_out_prob_h_.get(), segmentation_out_prob_d_.get(), + sizeof(float) * segmentation_out_elem_num_, cudaMemcpyDeviceToHost, + images[0]->cuda_stream()->stream())); + } + cudaStreamSynchronize(images[0]->cuda_stream()->stream()); + objects.clear(); + + for (size_t i = 0; i < batch_size; ++i) { + auto image_size = cv::Size(images[i]->width(), images[i]->height()); + float * batch_prob = out_prob_h_.get() + (i * out_elem_num_per_batch_); + ObjectArray object_array; + decodeOutputs(batch_prob, object_array, scales_[i], image_size); + // add refine mask using object + objects.emplace_back(object_array); + if (multitask_) { + segmentation_masks_.clear(); + + size_t counter = 0; + int batch = + static_cast(segmentation_out_elem_num_ / segmentation_out_elem_num_per_batch_); + for (int m = 0; m < multitask_; m++) { + const auto output_dims = + trt_common_->getBindingDimensions(m + 2); // 0 : input, 1 : output for detections + size_t out_elem_num = std::accumulate( + output_dims.d + 1, output_dims.d + output_dims.nbDims, 1, std::multiplies()); + out_elem_num = out_elem_num * batch; + const float scale = std::min( + output_dims.d[3] / static_cast(image_size.width), + output_dims.d[2] / static_cast(image_size.height)); + int out_w = static_cast(image_size.width * scale); + int out_h = static_cast(image_size.height * scale); + cv::Mat mask; + if (use_gpu_preprocess_) { + float * d_segmentation_results = + segmentation_out_prob_d_.get() + (i * segmentation_out_elem_num_per_batch_); + mask = getMaskImageGpu(&(d_segmentation_results[counter]), output_dims, out_w, out_h, i); + } else { + float * segmentation_results = + segmentation_out_prob_h_.get() + (i * segmentation_out_elem_num_per_batch_); + mask = getMaskImage(&(segmentation_results[counter]), output_dims, out_w, out_h); + } + segmentation_masks_.emplace_back(std::move(mask)); + counter += out_elem_num; + } + // semantic segmentation was fixed as first task + out_masks.at(i) = segmentation_masks_.at(0); + } else { + continue; + } + } + return true; +} + // This method is assumed to be called when specified YOLOX model contains // EfficientNMS_TRT module. bool TrtYoloX::multiScaleFeedforward(const cv::Mat & image, int batch_size, ObjectArrays & objects) @@ -1011,6 +1239,59 @@ bool TrtYoloX::multiScaleFeedforward(const cv::Mat & image, int batch_size, Obje return true; } +bool TrtYoloX::multiScaleFeedforward( + const std::shared_ptr & image, int batch_size, ObjectArrays & objects) +{ + std::vector buffers = { + input_d_.get(), out_num_detections_d_.get(), out_boxes_d_.get(), out_scores_d_.get(), + out_classes_d_.get()}; + + trt_common_->enqueueV2(buffers.data(), image->cuda_stream()->stream(), nullptr); + + auto out_num_detections = std::make_unique(batch_size); + auto out_boxes = std::make_unique(4 * batch_size * max_detections_); + auto out_scores = std::make_unique(batch_size * max_detections_); + auto out_classes = std::make_unique(batch_size * max_detections_); + + CHECK_CUDA_ERROR(cudaMemcpyAsync( + out_num_detections.get(), out_num_detections_d_.get(), sizeof(int32_t) * batch_size, + cudaMemcpyDeviceToHost, image->cuda_stream()->stream())); + CHECK_CUDA_ERROR(cudaMemcpyAsync( + out_boxes.get(), out_boxes_d_.get(), sizeof(float) * 4 * batch_size * max_detections_, + cudaMemcpyDeviceToHost, image->cuda_stream()->stream())); + CHECK_CUDA_ERROR(cudaMemcpyAsync( + out_scores.get(), out_scores_d_.get(), sizeof(float) * batch_size * max_detections_, + cudaMemcpyDeviceToHost, image->cuda_stream()->stream())); + CHECK_CUDA_ERROR(cudaMemcpyAsync( + out_classes.get(), out_classes_d_.get(), sizeof(int32_t) * batch_size * max_detections_, + cudaMemcpyDeviceToHost, image->cuda_stream()->stream())); + cudaStreamSynchronize(image->cuda_stream()->stream()); + + objects.clear(); + for (int i = 0; i < batch_size; ++i) { + const size_t num_detection = static_cast(out_num_detections[i]); + ObjectArray object_array(num_detection); + for (size_t j = 0; j < num_detection; ++j) { + Object object{}; + const auto x1 = out_boxes[i * max_detections_ * 4 + j * 4] / scales_[i]; + const auto y1 = out_boxes[i * max_detections_ * 4 + j * 4 + 1] / scales_[i]; + const auto x2 = out_boxes[i * max_detections_ * 4 + j * 4 + 2] / scales_[i]; + const auto y2 = out_boxes[i * max_detections_ * 4 + j * 4 + 3] / scales_[i]; + object.x_offset = + std::clamp(0, static_cast(x1), static_cast(image->width())); + object.y_offset = + std::clamp(0, static_cast(y1), static_cast(image->height())); + object.width = static_cast(std::max(0.0F, x2 - x1)); + object.height = static_cast(std::max(0.0F, y2 - y1)); + object.score = out_scores[i * max_detections_ + j]; + object.type = out_classes[i * max_detections_ + j]; + object_array.emplace_back(object); + } + objects.emplace_back(object_array); + } + return true; +} + bool TrtYoloX::multiScaleFeedforwardAndDecode( const cv::Mat & image, int batch_size, ObjectArrays & objects) { @@ -1033,6 +1314,28 @@ bool TrtYoloX::multiScaleFeedforwardAndDecode( return true; } +bool TrtYoloX::multiScaleFeedforwardAndDecode( + const std::shared_ptr & image, int batch_size, ObjectArrays & objects) +{ + std::vector buffers = {input_d_.get(), out_prob_d_.get()}; + trt_common_->enqueueV2(buffers.data(), image->cuda_stream()->stream(), nullptr); + + CHECK_CUDA_ERROR(cudaMemcpyAsync( + out_prob_h_.get(), out_prob_d_.get(), sizeof(float) * out_elem_num_, cudaMemcpyDeviceToHost, + image->cuda_stream()->stream())); + cudaStreamSynchronize(image->cuda_stream()->stream()); + objects.clear(); + + for (int i = 0; i < batch_size; ++i) { + auto image_size = cv::Size(image->width(), image->height()); + float * batch_prob = out_prob_h_.get() + (i * out_elem_num_per_batch_); + ObjectArray object_array; + decodeOutputs(batch_prob, object_array, scales_[i], image_size); + objects.emplace_back(object_array); + } + return true; +} + void TrtYoloX::decodeOutputs( float * prob, ObjectArray & objects, float scale, cv::Size & img_size) const { diff --git a/perception/tensorrt_yolox/src/tensorrt_yolox_node.cpp b/perception/tensorrt_yolox/src/tensorrt_yolox_node.cpp index f4e544c11ffb1..73120b69e5a88 100644 --- a/perception/tensorrt_yolox/src/tensorrt_yolox_node.cpp +++ b/perception/tensorrt_yolox/src/tensorrt_yolox_node.cpp @@ -107,6 +107,11 @@ TrtYoloXNode::TrtYoloXNode(const rclcpp::NodeOptions & node_options) is_roi_overlap_segment_ = declare_parameter("is_roi_overlap_segment"); is_publish_color_mask_ = declare_parameter("is_publish_color_mask"); overlap_roi_score_threshold_ = declare_parameter("overlap_roi_score_threshold"); + is_using_image_transport_ = declare_parameter("is_using_image_transport"); + type_adaptation_activated_ = declare_parameter("type_adaptation_activated"); + is_publish_debug_rois_image_ = declare_parameter("is_publish_debug_rois_image"); + // We don't use type adaptation when preprocessing is performed on CPU. + type_adaptation_activated_ = type_adaptation_activated_ && preprocess_on_gpu; roi_overlay_segment_labels_.UNKNOWN = declare_parameter("roi_overlay_segment_label.UNKNOWN"); roi_overlay_segment_labels_.CAR = declare_parameter("roi_overlay_segment_label.CAR"); @@ -140,9 +145,15 @@ TrtYoloXNode::TrtYoloXNode(const rclcpp::NodeOptions & node_options) objects_pub_ = this->create_publisher( "~/out/objects", 1); - mask_pub_ = image_transport::create_publisher(this, "~/out/mask"); - color_mask_pub_ = image_transport::create_publisher(this, "~/out/color_mask"); - image_pub_ = image_transport::create_publisher(this, "~/out/image"); + if (is_using_image_transport_) { + mask_pub_ = image_transport::create_publisher(this, "~/out/mask"); + color_mask_pub_ = image_transport::create_publisher(this, "~/out/color_mask"); + image_pub_ = image_transport::create_publisher(this, "~/out/image"); + } else { + simple_image_pub_ = this->create_publisher("~/out/image", 1); + simple_mask_pub_ = this->create_publisher("~/out/mask", 1); + simple_color_mask_pub_ = this->create_publisher("~/out/color_mask", 1); + } if (declare_parameter("build_only", false)) { RCLCPP_INFO(this->get_logger(), "TensorRT engine file is built and exit."); @@ -150,48 +161,66 @@ TrtYoloXNode::TrtYoloXNode(const rclcpp::NodeOptions & node_options) } } -void TrtYoloXNode::onConnect() +bool TrtYoloXNode::checkInputBlocked() { - using std::placeholders::_1; - if ( - objects_pub_->get_subscription_count() == 0 && - objects_pub_->get_intra_process_subscription_count() == 0 && - image_pub_.getNumSubscribers() == 0 && mask_pub_.getNumSubscribers() == 0 && - color_mask_pub_.getNumSubscribers() == 0) { - image_sub_.shutdown(); - } else if (!image_sub_) { + bool result; + if (type_adaptation_activated_) { + result = (gpu_image_sub_ == nullptr) || (gpu_image_sub_->get_publisher_count() == 0); + } else { + result = !image_sub_; + } + return result; +} + +void TrtYoloXNode::setUpImageSubscriber() +{ + if (type_adaptation_activated_) { + gpu_image_sub_ = this->create_subscription( + "~/in/image", rclcpp::QoS(1).best_effort().durability_volatile(), + std::bind(&TrtYoloXNode::onGpuImage, this, std::placeholders::_1)); + } else { image_sub_ = image_transport::create_subscription( - this, "~/in/image", std::bind(&TrtYoloXNode::onImage, this, _1), "raw", + this, "~/in/image", std::bind(&TrtYoloXNode::onImage, this, std::placeholders::_1), "raw", rmw_qos_profile_sensor_data); } } -void TrtYoloXNode::onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg) +uint32_t TrtYoloXNode::getNumOutputConnections() { - stop_watch_ptr_->toc("processing_time", true); - tier4_perception_msgs::msg::DetectedObjectsWithFeature out_objects; - - cv_bridge::CvImagePtr in_image_ptr; - try { - in_image_ptr = cv_bridge::toCvCopy(msg, sensor_msgs::image_encodings::BGR8); - } catch (cv_bridge::Exception & e) { - RCLCPP_ERROR(this->get_logger(), "cv_bridge exception: %s", e.what()); - return; + uint32_t num_subscribers = 0; + // Image output topic conenctions + if (is_using_image_transport_) { + num_subscribers = image_pub_.getNumSubscribers() + mask_pub_.getNumSubscribers() + + color_mask_pub_.getNumSubscribers(); + } else { + num_subscribers = simple_image_pub_->get_subscription_count() + + simple_mask_pub_->get_subscription_count() + + simple_color_mask_pub_->get_subscription_count(); } - const auto width = in_image_ptr->image.cols; - const auto height = in_image_ptr->image.rows; + // Objects output topic connections + num_subscribers += + objects_pub_->get_subscription_count() + objects_pub_->get_intra_process_subscription_count(); + return num_subscribers; +} - tensorrt_yolox::ObjectArrays objects; - std::vector masks = {cv::Mat(cv::Size(height, width), CV_8UC1, cv::Scalar(0))}; - std::vector color_masks = { - cv::Mat(cv::Size(height, width), CV_8UC3, cv::Scalar(0, 0, 0))}; +void TrtYoloXNode::onConnect() +{ + using std::placeholders::_1; - if (!trt_yolox_->doInference({in_image_ptr->image}, objects, masks, color_masks)) { - RCLCPP_WARN(this->get_logger(), "Fail to inference"); - return; + if (getNumOutputConnections() == 0) { + image_sub_.shutdown(); + gpu_image_sub_.reset(); + } else if (checkInputBlocked()) { + setUpImageSubscriber(); } - auto & mask = masks.at(0); +} +void TrtYoloXNode::postInference( + const std_msgs::msg::Header header, const tensorrt_yolox::ObjectArrays & objects, const int width, + const int height, std::vector & masks) +{ + auto & mask = masks.at(0); + tier4_perception_msgs::msg::DetectedObjectsWithFeature out_objects; for (const auto & yolox_object : objects.at(0)) { tier4_perception_msgs::msg::DetectedObjectWithFeature object; object.feature.roi.x_offset = yolox_object.x_offset; @@ -202,15 +231,7 @@ void TrtYoloXNode::onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg) object.object.classification = object_recognition_utils::toObjectClassifications(label_map_[yolox_object.type], 1.0f); out_objects.feature_objects.push_back(object); - const auto left = std::max(0, static_cast(object.feature.roi.x_offset)); - const auto top = std::max(0, static_cast(object.feature.roi.y_offset)); - const auto right = - std::min(static_cast(object.feature.roi.x_offset + object.feature.roi.width), width); - const auto bottom = - std::min(static_cast(object.feature.roi.y_offset + object.feature.roi.height), height); - cv::rectangle( - in_image_ptr->image, cv::Point(left, top), cv::Point(right, bottom), cv::Scalar(0, 0, 255), 3, - 8, 0); + // Refine mask: replacing segmentation mask by roi class // This should remove when the segmentation accuracy is high if (is_roi_overlap_segment_ && trt_yolox_->getMultitaskNum() > 0) { @@ -221,7 +242,7 @@ void TrtYoloXNode::onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg) sensor_msgs::msg::Image::SharedPtr out_mask_msg = cv_bridge::CvImage(std_msgs::msg::Header(), sensor_msgs::image_encodings::MONO8, mask) .toImageMsg(); - out_mask_msg->header = msg->header; + out_mask_msg->header = header; std::vector> compressed_data = perception_utils::runLengthEncoder(mask); int step = sizeof(uint8_t) + sizeof(int); @@ -230,10 +251,13 @@ void TrtYoloXNode::onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg) std::memcpy(&out_mask_msg->data[i * step], &compressed_data.at(i).first, sizeof(uint8_t)); std::memcpy(&out_mask_msg->data[i * step + 1], &compressed_data.at(i).second, sizeof(int)); } - mask_pub_.publish(out_mask_msg); + if (is_using_image_transport_) { + mask_pub_.publish(out_mask_msg); + } else { + simple_mask_pub_->publish(*out_mask_msg); + } } - image_pub_.publish(in_image_ptr->toImageMsg()); - out_objects.header = msg->header; + out_objects.header = header; objects_pub_->publish(out_objects); if (debug_publisher_) { @@ -258,8 +282,96 @@ void TrtYoloXNode::onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg) sensor_msgs::msg::Image::SharedPtr output_color_mask_msg = cv_bridge::CvImage(std_msgs::msg::Header(), sensor_msgs::image_encodings::BGR8, color_mask) .toImageMsg(); - output_color_mask_msg->header = msg->header; - color_mask_pub_.publish(output_color_mask_msg); + output_color_mask_msg->header = header; + if (is_using_image_transport_) { + color_mask_pub_.publish(output_color_mask_msg); + } else { + simple_color_mask_pub_->publish(*output_color_mask_msg); + } + } +} + +void TrtYoloXNode::drawImageDetection( + cv_bridge::CvImagePtr image_ptr, const tensorrt_yolox::ObjectArrays & objects, const int width, + const int height) +{ + for (const auto & yolox_object : objects.at(0)) { + const auto left = std::max(0, static_cast(yolox_object.x_offset)); + const auto top = std::max(0, static_cast(yolox_object.y_offset)); + const auto right = + std::min(static_cast(yolox_object.x_offset + yolox_object.width), width); + const auto bottom = + std::min(static_cast(yolox_object.y_offset + yolox_object.height), height); + cv::rectangle( + image_ptr->image, cv::Point(left, top), cv::Point(right, bottom), cv::Scalar(0, 0, 255), 3, 8, + 0); + } + + sensor_msgs::msg::Image::SharedPtr output_image_msg_ptr = image_ptr->toImageMsg(); + if (is_using_image_transport_) { + image_pub_.publish(output_image_msg_ptr); + } else { + simple_image_pub_->publish(*output_image_msg_ptr); + } +} + +void TrtYoloXNode::onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg) +{ + stop_watch_ptr_->toc("processing_time", true); + + cv_bridge::CvImagePtr in_image_ptr; + try { + in_image_ptr = cv_bridge::toCvCopy(msg, sensor_msgs::image_encodings::BGR8); + } catch (cv_bridge::Exception & e) { + RCLCPP_ERROR(this->get_logger(), "cv_bridge exception: %s", e.what()); + return; + } + const auto width = in_image_ptr->image.cols; + const auto height = in_image_ptr->image.rows; + + tensorrt_yolox::ObjectArrays objects; + std::vector masks = {cv::Mat(cv::Size(height, width), CV_8UC1, cv::Scalar(0))}; + std::vector color_masks = { + cv::Mat(cv::Size(height, width), CV_8UC3, cv::Scalar(0, 0, 0))}; + + std::vector images = {in_image_ptr->image}; + if (!trt_yolox_->doInference(images, objects, masks, color_masks)) { + RCLCPP_WARN(this->get_logger(), "Fail to inference"); + return; + } + postInference(msg->header, objects, width, height, masks); + + if (is_publish_debug_rois_image_) { + drawImageDetection(in_image_ptr, objects, width, height); + } +} + +void TrtYoloXNode::onGpuImage(const std::shared_ptr msg) +{ + stop_watch_ptr_->toc("processing_time", true); + + const auto width = msg->width(); + const auto height = msg->height(); + + tensorrt_yolox::ObjectArrays objects; + std::vector masks = {cv::Mat(cv::Size(height, width), CV_8UC1, cv::Scalar(0))}; + std::vector color_masks = { + cv::Mat(cv::Size(height, width), CV_8UC3, cv::Scalar(0, 0, 0))}; + + std::vector> msgs = {msg}; + if (!trt_yolox_->doInference(msgs, objects, masks, color_masks)) { + RCLCPP_WARN(this->get_logger(), "Fail to inference"); + return; + } + + postInference(msg->header(), objects, width, height, masks); + + if (is_publish_debug_rois_image_) { + sensor_msgs::msg::Image image_msg; + msg->get_sensor_msgs_image(image_msg); + cv_bridge::CvImagePtr image_ptr = + cv_bridge::toCvCopy(image_msg, sensor_msgs::image_encodings::BGR8); + drawImageDetection(image_ptr, objects, width, height); } } diff --git a/perception/tensorrt_yolox/src/yolox_single_image_inference_node.cpp b/perception/tensorrt_yolox/src/yolox_single_image_inference_node.cpp index 360f41e470e38..e98121012ae75 100644 --- a/perception/tensorrt_yolox/src/yolox_single_image_inference_node.cpp +++ b/perception/tensorrt_yolox/src/yolox_single_image_inference_node.cpp @@ -45,11 +45,12 @@ class YoloXSingleImageInferenceNode : public rclcpp::Node declare_parameter("output_image_path", p.string() + "_detect" + ext); auto trt_yolox = std::make_unique(model_path, precision); - auto image = cv::imread(image_path); + cv::Mat image = cv::imread(image_path); tensorrt_yolox::ObjectArrays objects; std::vector masks; std::vector color_masks; - trt_yolox->doInference({image}, objects, masks, color_masks); + std::vector input_images({image}); + trt_yolox->doInference(input_images, objects, masks, color_masks); for (const auto & object : objects[0]) { const auto left = object.x_offset; const auto top = object.y_offset;