tier4 · technolojin · Nov 27, 2024 · Nov 27, 2024 · Nov 27, 2024 · Nov 29, 2024
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp
@@ -70,6 +70,8 @@ class CenterPointTRT
 protected:
   void initPtr();
 
+  void initPriorityMap();
+
   virtual bool preprocess(
     const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer);
 
@@ -92,6 +94,8 @@ class CenterPointTRT
   std::size_t voxels_size_{0};
   std::size_t coordinates_size_{0};
   std::vector<float> points_;
+  std::vector<unsigned int> priority_map_;
+
   cuda::unique_ptr<float[]> voxels_d_{nullptr};
   cuda::unique_ptr<int[]> coordinates_d_{nullptr};
   cuda::unique_ptr<float[]> num_points_per_voxel_d_{nullptr};
@@ -107,6 +111,7 @@ class CenterPointTRT
   cuda::unique_ptr<float[]> points_d_{nullptr};
   cuda::unique_ptr<float[]> voxels_buffer_d_{nullptr};
   cuda::unique_ptr<unsigned int[]> mask_d_{nullptr};
+  cuda::unique_ptr<unsigned int[]> priority_map_d_{nullptr};
   cuda::unique_ptr<unsigned int[]> num_voxels_d_{nullptr};
 };
 

diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp
@@ -27,9 +27,9 @@ cudaError_t generateVoxels_random_launch(
   cudaStream_t stream);
 
 cudaError_t generateBaseFeatures_launch(
-  unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, int max_voxel_size,
-  unsigned int * pillar_num, float * voxel_features, float * voxel_num, int * voxel_idxs,
-  cudaStream_t stream);
+  unsigned int * mask, unsigned int * priority_map, float * voxels, int grid_y_size,
+  int grid_x_size, int max_voxel_size, unsigned int * pillar_num, float * voxel_features,
+  float * voxel_num, int * voxel_idxs, cudaStream_t stream);
 
 cudaError_t generateFeatures_launch(
   const float * voxel_features, const float * voxel_num_points, const int * coords,

diff --git a/perception/lidar_centerpoint/lib/centerpoint_trt.cpp b/perception/lidar_centerpoint/lib/centerpoint_trt.cpp
@@ -19,8 +19,10 @@
 #include <lidar_centerpoint/network/scatter_kernel.hpp>
 #include <lidar_centerpoint/preprocess/preprocess_kernel.hpp>
 
+#include <algorithm>
 #include <iostream>
 #include <memory>
+#include <random>
 #include <string>
 #include <vector>
 
@@ -56,6 +58,8 @@ CenterPointTRT::CenterPointTRT(
 
   initPtr();
 
+  initPriorityMap();
+
   cudaStreamCreate(&stream_);
 }
 
@@ -85,6 +89,7 @@ void CenterPointTRT::initPtr()
 
   // host
   points_.resize(CAPACITY_POINT * config_.point_feature_size_);
+  priority_map_.resize(mask_size_);
 
   // device
   voxels_d_ = cuda::make_unique<float[]>(voxels_size_);
@@ -102,9 +107,56 @@ void CenterPointTRT::initPtr()
   points_d_ = cuda::make_unique<float[]>(CAPACITY_POINT * config_.point_feature_size_);
   voxels_buffer_d_ = cuda::make_unique<float[]>(voxels_buffer_size_);
   mask_d_ = cuda::make_unique<unsigned int[]>(mask_size_);
+  priority_map_d_ = cuda::make_unique<unsigned int[]>(mask_size_);
   num_voxels_d_ = cuda::make_unique<unsigned int[]>(1);
 }
 
+void CenterPointTRT::initPriorityMap()
+{
+  // initialize priority score map
+  std::vector<std::pair<float, unsigned int>> priority_score_map;
+  priority_score_map.reserve(mask_size_);
+
+  // random number generator, but fixed seed for reproducibility
+  std::mt19937 generator(0);
+  std::normal_distribution<float> normal_distribution(0.0, 1.0);
+
+  // assign priority score map
+  for (unsigned int i = 0; i < mask_size_; ++i) {
+    const int x = i % config_.grid_size_x_ - config_.grid_size_x_ / 2;
+    const int y = i / config_.grid_size_x_ - config_.grid_size_y_ / 2;
+
+    const float pos_x = x * config_.voxel_size_x_;
+    const float pos_y = y * config_.voxel_size_y_;
+
+    // absolute rectangular hyperbola to focus on crosssection
+    // multiply sigmoid function to prioritize the front area
+    const float score_a = abs((pos_x - 15.0) * pos_y) * (1 / (1 + exp(pos_x * 0.3)) + 0.75);
+    // ellipse area focus on (10,0) and (150,0)
+    const float score_b = sqrt((pos_x - 150) * (pos_x - 150) + pos_y * pos_y) +
+                          sqrt((pos_x - 10) * (pos_x - 10) + pos_y * pos_y) - 140;
+    // total score with weight
+    // add noise to extend detection area in low-score area
+    const float score =
+      sqrt(score_a * 0.1 + score_b * 2.5) + normal_distribution(generator) * score_b * 0.01;
+
+    priority_score_map.push_back(std::make_pair(score, i));
+  }
+
+  // sort priority map and assign to the priority_map_
+  for (unsigned int i = 0; i < mask_size_; ++i) {
+  }
+  std::sort(
+    priority_score_map.begin(), priority_score_map.end(),
+    [](const std::pair<float, unsigned int> & a, const std::pair<float, unsigned int> & b) {
+      return a.first < b.first;
+    });
+
+  for (unsigned int i = 0; i < mask_size_; ++i) {
+    priority_map_[i] = priority_score_map[i].second;
+  }
+}
+
 bool CenterPointTRT::detect(
   const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
   std::vector<Box3D> & det_boxes3d)
@@ -138,6 +190,9 @@ bool CenterPointTRT::preprocess(
   CHECK_CUDA_ERROR(cudaMemcpyAsync(
     points_d_.get(), points_.data(), count * config_.point_feature_size_ * sizeof(float),
     cudaMemcpyHostToDevice, stream_));
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    priority_map_d_.get(), priority_map_.data(), mask_size_ * sizeof(unsigned int),
+    cudaMemcpyHostToDevice, stream_));
   CHECK_CUDA_ERROR(cudaMemsetAsync(num_voxels_d_.get(), 0, sizeof(unsigned int), stream_));
   CHECK_CUDA_ERROR(
     cudaMemsetAsync(voxels_buffer_d_.get(), 0, voxels_buffer_size_ * sizeof(float), stream_));
@@ -155,9 +210,9 @@ bool CenterPointTRT::preprocess(
     mask_d_.get(), voxels_buffer_d_.get(), stream_));
 
   CHECK_CUDA_ERROR(generateBaseFeatures_launch(
-    mask_d_.get(), voxels_buffer_d_.get(), config_.grid_size_y_, config_.grid_size_x_,
-    config_.max_voxel_size_, num_voxels_d_.get(), voxels_d_.get(), num_points_per_voxel_d_.get(),
-    coordinates_d_.get(), stream_));
+    mask_d_.get(), priority_map_d_.get(), voxels_buffer_d_.get(), config_.grid_size_y_,
+    config_.grid_size_x_, config_.max_voxel_size_, num_voxels_d_.get(), voxels_d_.get(),
+    num_points_per_voxel_d_.get(), coordinates_d_.get(), stream_));
 
   CHECK_CUDA_ERROR(generateFeatures_launch(
     voxels_d_.get(), num_points_per_voxel_d_.get(), coordinates_d_.get(), num_voxels_d_.get(),

diff --git a/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu b/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu
@@ -76,8 +76,9 @@ cudaError_t generateVoxels_random_launch(
   float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels,
   cudaStream_t stream)
 {
-  dim3 blocks((points_size + 256 - 1) / 256);
-  dim3 threads(256);
+  const size_t threads_per_block = 256;
+  dim3 threads(threads_per_block);
+  dim3 blocks(divup(points_size, threads_per_block));
 
   if (blocks.x == 0) {
     return cudaGetLastError();
@@ -92,15 +93,17 @@ cudaError_t generateVoxels_random_launch(
 }
 
 __global__ void generateBaseFeatures_kernel(
-  unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, int max_voxel_size,
-  unsigned int * pillar_num, float * voxel_features, float * voxel_num, int * voxel_idxs)
+  unsigned int * mask, const unsigned int * priority_map, float * voxels, int grid_y_size,
+  int grid_x_size, int max_voxel_size, unsigned int * pillar_num, float * voxel_features,
+  float * voxel_num, int * voxel_idxs)
 {
-  unsigned int voxel_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  unsigned int voxel_idy = blockIdx.y * blockDim.y + threadIdx.y;
+  const unsigned voxel_key = blockIdx.x * blockDim.x + threadIdx.x;
+  const unsigned voxel_index = priority_map[voxel_key];
+  unsigned int voxel_idx = voxel_index % grid_x_size;
+  unsigned int voxel_idy = voxel_index / grid_x_size;
 
   if (voxel_idx >= grid_x_size || voxel_idy >= grid_y_size) return;
 
-  unsigned int voxel_index = voxel_idy * grid_x_size + voxel_idx;
   unsigned int count = mask[voxel_index];
   if (!(count > 0)) return;
   count = count < MAX_POINT_IN_VOXEL_SIZE ? count : MAX_POINT_IN_VOXEL_SIZE;
@@ -125,18 +128,20 @@ __global__ void generateBaseFeatures_kernel(
 }
 
 // create 4 channels
+// cspell: ignore divup
 cudaError_t generateBaseFeatures_launch(
-  unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, int max_voxel_size,
-  unsigned int * pillar_num, float * voxel_features, float * voxel_num, int * voxel_idxs,
-  cudaStream_t stream)
+  unsigned int * mask, unsigned int * priority_map, float * voxels, int grid_y_size,
+  int grid_x_size, int max_voxel_size, unsigned int * pillar_num, float * voxel_features,
+  float * voxel_num, int * voxel_idxs, cudaStream_t stream)
 {
-  dim3 threads = {32, 32};
-  dim3 blocks = {
-    (grid_x_size + threads.x - 1) / threads.x, (grid_y_size + threads.y - 1) / threads.y};
+  const size_t threads_per_block = 256;
+  const int grid_size = grid_x_size * grid_y_size;
+  dim3 threads(threads_per_block);
+  dim3 blocks(divup(grid_size, threads_per_block));
 
   generateBaseFeatures_kernel<<<blocks, threads, 0, stream>>>(
-    mask, voxels, grid_y_size, grid_x_size, max_voxel_size, pillar_num, voxel_features, voxel_num,
-    voxel_idxs);
+    mask, priority_map, voxels, grid_y_size, grid_x_size, max_voxel_size, pillar_num,
+    voxel_features, voxel_num, voxel_idxs);
   cudaError_t err = cudaGetLastError();
   return err;
 }

diff --git a/perception/lidar_centerpoint/test/test_preprocess_kernel.cpp b/perception/lidar_centerpoint/test/test_preprocess_kernel.cpp
@@ -75,12 +75,14 @@ void PreprocessKernelTest::SetUp()
   points_d_ = cuda::make_unique<float[]>(capacity_ * point_feature_size_);
   voxels_buffer_d_ = cuda::make_unique<float[]>(voxels_buffer_size_);
   mask_d_ = cuda::make_unique<unsigned int[]>(mask_size_);
+  priority_map_d_ = cuda::make_unique<unsigned int[]>(mask_size_);
   num_voxels_d_ = cuda::make_unique<unsigned int[]>(1);
 
   CHECK_CUDA_ERROR(cudaMemsetAsync(num_voxels_d_.get(), 0, sizeof(unsigned int), stream_));
   CHECK_CUDA_ERROR(
     cudaMemsetAsync(voxels_buffer_d_.get(), 0, voxels_buffer_size_ * sizeof(float), stream_));
   CHECK_CUDA_ERROR(cudaMemsetAsync(mask_d_.get(), 0, mask_size_ * sizeof(int), stream_));
+  CHECK_CUDA_ERROR(cudaMemsetAsync(priority_map_d_.get(), 0, mask_size_ * sizeof(int), stream_));
   CHECK_CUDA_ERROR(cudaMemsetAsync(voxels_d_.get(), 0, voxels_size_ * sizeof(float), stream_));
   CHECK_CUDA_ERROR(
     cudaMemsetAsync(coordinates_d_.get(), 0, coordinates_size_ * sizeof(int), stream_));
@@ -167,9 +169,9 @@ TEST_F(PreprocessKernelTest, BasicTest)
   EXPECT_EQ(points[3], voxel_data[3]);
 
   code = generateBaseFeatures_launch(
-    mask_d_.get(), voxels_buffer_d_.get(), grid_size_y_, grid_size_x_, max_voxel_size_,
-    num_voxels_d_.get(), voxels_d_.get(), num_points_per_voxel_d_.get(), coordinates_d_.get(),
-    stream_);
+    mask_d_.get(), priority_map_d_.get(), voxels_buffer_d_.get(), grid_size_y_, grid_size_x_,
+    max_voxel_size_, num_voxels_d_.get(), voxels_d_.get(), num_points_per_voxel_d_.get(),
+    coordinates_d_.get(), stream_);
 
   ASSERT_EQ(cudaSuccess, code);
 
@@ -309,9 +311,9 @@ TEST_F(PreprocessKernelTest, VoxelOverflowTest)
   EXPECT_EQ(0.0, voxel_data[max_point_in_voxel_size_ * point_feature_size_ + 3]);
 
   code = generateBaseFeatures_launch(
-    mask_d_.get(), voxels_buffer_d_.get(), grid_size_y_, grid_size_x_, max_voxel_size_,
-    num_voxels_d_.get(), voxels_d_.get(), num_points_per_voxel_d_.get(), coordinates_d_.get(),
-    stream_);
+    mask_d_.get(), priority_map_d_.get(), voxels_buffer_d_.get(), grid_size_y_, grid_size_x_,
+    max_voxel_size_, num_voxels_d_.get(), voxels_d_.get(), num_points_per_voxel_d_.get(),
+    coordinates_d_.get(), stream_);
 
   ASSERT_EQ(cudaSuccess, code);
 

diff --git a/perception/lidar_centerpoint/test/test_preprocess_kernel.hpp b/perception/lidar_centerpoint/test/test_preprocess_kernel.hpp
@@ -71,6 +71,7 @@ class PreprocessKernelTest : public testing::Test
   cuda::unique_ptr<float[]> points_d_{};
   cuda::unique_ptr<float[]> voxels_buffer_d_{};
   cuda::unique_ptr<unsigned int[]> mask_d_{};
+  cuda::unique_ptr<unsigned int[]> priority_map_d_{};
   cuda::unique_ptr<unsigned int[]> num_voxels_d_{};
 };