Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(lidar_centerpoint): priotizing voxel processing by space #1672

Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ class CenterPointTRT
protected:
void initPtr();

void initPriorityMap();

virtual bool preprocess(
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer);

Expand All @@ -92,6 +94,8 @@ class CenterPointTRT
std::size_t voxels_size_{0};
std::size_t coordinates_size_{0};
std::vector<float> points_;
std::vector<unsigned int> priority_map_;

cuda::unique_ptr<float[]> voxels_d_{nullptr};
cuda::unique_ptr<int[]> coordinates_d_{nullptr};
cuda::unique_ptr<float[]> num_points_per_voxel_d_{nullptr};
Expand All @@ -107,6 +111,7 @@ class CenterPointTRT
cuda::unique_ptr<float[]> points_d_{nullptr};
cuda::unique_ptr<float[]> voxels_buffer_d_{nullptr};
cuda::unique_ptr<unsigned int[]> mask_d_{nullptr};
cuda::unique_ptr<unsigned int[]> priority_map_d_{nullptr};
cuda::unique_ptr<unsigned int[]> num_voxels_d_{nullptr};
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ cudaError_t generateVoxels_random_launch(
cudaStream_t stream);

cudaError_t generateBaseFeatures_launch(
unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, int max_voxel_size,
unsigned int * pillar_num, float * voxel_features, float * voxel_num, int * voxel_idxs,
cudaStream_t stream);
unsigned int * mask, unsigned int * priority_map, float * voxels, int grid_y_size,
int grid_x_size, int max_voxel_size, unsigned int * pillar_num, float * voxel_features,
float * voxel_num, int * voxel_idxs, cudaStream_t stream);

cudaError_t generateFeatures_launch(
const float * voxel_features, const float * voxel_num_points, const int * coords,
Expand Down
61 changes: 58 additions & 3 deletions perception/lidar_centerpoint/lib/centerpoint_trt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@
#include <lidar_centerpoint/network/scatter_kernel.hpp>
#include <lidar_centerpoint/preprocess/preprocess_kernel.hpp>

#include <algorithm>
#include <iostream>
#include <memory>
#include <random>
#include <string>
#include <vector>

Expand Down Expand Up @@ -56,6 +58,8 @@ CenterPointTRT::CenterPointTRT(

initPtr();

initPriorityMap();

cudaStreamCreate(&stream_);
}

Expand Down Expand Up @@ -85,6 +89,7 @@ void CenterPointTRT::initPtr()

// host
points_.resize(CAPACITY_POINT * config_.point_feature_size_);
priority_map_.resize(mask_size_);

// device
voxels_d_ = cuda::make_unique<float[]>(voxels_size_);
Expand All @@ -102,9 +107,56 @@ void CenterPointTRT::initPtr()
points_d_ = cuda::make_unique<float[]>(CAPACITY_POINT * config_.point_feature_size_);
voxels_buffer_d_ = cuda::make_unique<float[]>(voxels_buffer_size_);
mask_d_ = cuda::make_unique<unsigned int[]>(mask_size_);
priority_map_d_ = cuda::make_unique<unsigned int[]>(mask_size_);
num_voxels_d_ = cuda::make_unique<unsigned int[]>(1);
}

void CenterPointTRT::initPriorityMap()
{
// initialize priority score map
std::vector<std::pair<float, unsigned int>> priority_score_map;
priority_score_map.reserve(mask_size_);

// random number generator, but fixed seed for reproducibility
std::mt19937 generator(0);
std::normal_distribution<float> normal_distribution(0.0, 1.0);

// assign priority score map
for (unsigned int i = 0; i < mask_size_; ++i) {
const int x = i % config_.grid_size_x_ - config_.grid_size_x_ / 2;
const int y = i / config_.grid_size_x_ - config_.grid_size_y_ / 2;

const float pos_x = x * config_.voxel_size_x_;
const float pos_y = y * config_.voxel_size_y_;

// absolute rectangular hyperbola to focus on crosssection
// multiply sigmoid function to prioritize the front area
const float score_a = abs((pos_x - 15.0) * pos_y) * (1 / (1 + exp(pos_x * 0.3)) + 0.75);
// ellipse area focus on (10,0) and (150,0)
const float score_b = sqrt((pos_x - 150) * (pos_x - 150) + pos_y * pos_y) +
sqrt((pos_x - 10) * (pos_x - 10) + pos_y * pos_y) - 140;
// total score with weight
// add noise to extend detection area in low-score area
const float score =
sqrt(score_a * 0.1 + score_b * 2.5) + normal_distribution(generator) * score_b * 0.01;

priority_score_map.push_back(std::make_pair(score, i));
}

// sort priority map and assign to the priority_map_
for (unsigned int i = 0; i < mask_size_; ++i) {
}
std::sort(
priority_score_map.begin(), priority_score_map.end(),
[](const std::pair<float, unsigned int> & a, const std::pair<float, unsigned int> & b) {
return a.first < b.first;
});

for (unsigned int i = 0; i < mask_size_; ++i) {
priority_map_[i] = priority_score_map[i].second;
}
}

bool CenterPointTRT::detect(
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
std::vector<Box3D> & det_boxes3d)
Expand Down Expand Up @@ -138,6 +190,9 @@ bool CenterPointTRT::preprocess(
CHECK_CUDA_ERROR(cudaMemcpyAsync(
points_d_.get(), points_.data(), count * config_.point_feature_size_ * sizeof(float),
cudaMemcpyHostToDevice, stream_));
CHECK_CUDA_ERROR(cudaMemcpyAsync(
priority_map_d_.get(), priority_map_.data(), mask_size_ * sizeof(unsigned int),
cudaMemcpyHostToDevice, stream_));
CHECK_CUDA_ERROR(cudaMemsetAsync(num_voxels_d_.get(), 0, sizeof(unsigned int), stream_));
CHECK_CUDA_ERROR(
cudaMemsetAsync(voxels_buffer_d_.get(), 0, voxels_buffer_size_ * sizeof(float), stream_));
Expand All @@ -155,9 +210,9 @@ bool CenterPointTRT::preprocess(
mask_d_.get(), voxels_buffer_d_.get(), stream_));

CHECK_CUDA_ERROR(generateBaseFeatures_launch(
mask_d_.get(), voxels_buffer_d_.get(), config_.grid_size_y_, config_.grid_size_x_,
config_.max_voxel_size_, num_voxels_d_.get(), voxels_d_.get(), num_points_per_voxel_d_.get(),
coordinates_d_.get(), stream_));
mask_d_.get(), priority_map_d_.get(), voxels_buffer_d_.get(), config_.grid_size_y_,
config_.grid_size_x_, config_.max_voxel_size_, num_voxels_d_.get(), voxels_d_.get(),
num_points_per_voxel_d_.get(), coordinates_d_.get(), stream_));

CHECK_CUDA_ERROR(generateFeatures_launch(
voxels_d_.get(), num_points_per_voxel_d_.get(), coordinates_d_.get(), num_voxels_d_.get(),
Expand Down
35 changes: 20 additions & 15 deletions perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,9 @@ cudaError_t generateVoxels_random_launch(
float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels,
cudaStream_t stream)
{
dim3 blocks((points_size + 256 - 1) / 256);
dim3 threads(256);
const size_t threads_per_block = 256;
dim3 threads(threads_per_block);
dim3 blocks(divup(points_size, threads_per_block));

if (blocks.x == 0) {
return cudaGetLastError();
Expand All @@ -92,15 +93,17 @@ cudaError_t generateVoxels_random_launch(
}

__global__ void generateBaseFeatures_kernel(
unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, int max_voxel_size,
unsigned int * pillar_num, float * voxel_features, float * voxel_num, int * voxel_idxs)
unsigned int * mask, const unsigned int * priority_map, float * voxels, int grid_y_size,
int grid_x_size, int max_voxel_size, unsigned int * pillar_num, float * voxel_features,
float * voxel_num, int * voxel_idxs)
{
unsigned int voxel_idx = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int voxel_idy = blockIdx.y * blockDim.y + threadIdx.y;
const unsigned voxel_key = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned voxel_index = priority_map[voxel_key];
unsigned int voxel_idx = voxel_index % grid_x_size;
unsigned int voxel_idy = voxel_index / grid_x_size;

if (voxel_idx >= grid_x_size || voxel_idy >= grid_y_size) return;

unsigned int voxel_index = voxel_idy * grid_x_size + voxel_idx;
unsigned int count = mask[voxel_index];
if (!(count > 0)) return;
count = count < MAX_POINT_IN_VOXEL_SIZE ? count : MAX_POINT_IN_VOXEL_SIZE;
Expand All @@ -125,18 +128,20 @@ __global__ void generateBaseFeatures_kernel(
}

// create 4 channels
// cspell: ignore divup
cudaError_t generateBaseFeatures_launch(
unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, int max_voxel_size,
unsigned int * pillar_num, float * voxel_features, float * voxel_num, int * voxel_idxs,
cudaStream_t stream)
unsigned int * mask, unsigned int * priority_map, float * voxels, int grid_y_size,
int grid_x_size, int max_voxel_size, unsigned int * pillar_num, float * voxel_features,
float * voxel_num, int * voxel_idxs, cudaStream_t stream)
{
dim3 threads = {32, 32};
dim3 blocks = {
(grid_x_size + threads.x - 1) / threads.x, (grid_y_size + threads.y - 1) / threads.y};
const size_t threads_per_block = 256;
const int grid_size = grid_x_size * grid_y_size;
dim3 threads(threads_per_block);
dim3 blocks(divup(grid_size, threads_per_block));

generateBaseFeatures_kernel<<<blocks, threads, 0, stream>>>(
mask, voxels, grid_y_size, grid_x_size, max_voxel_size, pillar_num, voxel_features, voxel_num,
voxel_idxs);
mask, priority_map, voxels, grid_y_size, grid_x_size, max_voxel_size, pillar_num,
voxel_features, voxel_num, voxel_idxs);
cudaError_t err = cudaGetLastError();
return err;
}
Expand Down
14 changes: 8 additions & 6 deletions perception/lidar_centerpoint/test/test_preprocess_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,14 @@ void PreprocessKernelTest::SetUp()
points_d_ = cuda::make_unique<float[]>(capacity_ * point_feature_size_);
voxels_buffer_d_ = cuda::make_unique<float[]>(voxels_buffer_size_);
mask_d_ = cuda::make_unique<unsigned int[]>(mask_size_);
priority_map_d_ = cuda::make_unique<unsigned int[]>(mask_size_);
num_voxels_d_ = cuda::make_unique<unsigned int[]>(1);

CHECK_CUDA_ERROR(cudaMemsetAsync(num_voxels_d_.get(), 0, sizeof(unsigned int), stream_));
CHECK_CUDA_ERROR(
cudaMemsetAsync(voxels_buffer_d_.get(), 0, voxels_buffer_size_ * sizeof(float), stream_));
CHECK_CUDA_ERROR(cudaMemsetAsync(mask_d_.get(), 0, mask_size_ * sizeof(int), stream_));
CHECK_CUDA_ERROR(cudaMemsetAsync(priority_map_d_.get(), 0, mask_size_ * sizeof(int), stream_));
CHECK_CUDA_ERROR(cudaMemsetAsync(voxels_d_.get(), 0, voxels_size_ * sizeof(float), stream_));
CHECK_CUDA_ERROR(
cudaMemsetAsync(coordinates_d_.get(), 0, coordinates_size_ * sizeof(int), stream_));
Expand Down Expand Up @@ -167,9 +169,9 @@ TEST_F(PreprocessKernelTest, BasicTest)
EXPECT_EQ(points[3], voxel_data[3]);

code = generateBaseFeatures_launch(
mask_d_.get(), voxels_buffer_d_.get(), grid_size_y_, grid_size_x_, max_voxel_size_,
num_voxels_d_.get(), voxels_d_.get(), num_points_per_voxel_d_.get(), coordinates_d_.get(),
stream_);
mask_d_.get(), priority_map_d_.get(), voxels_buffer_d_.get(), grid_size_y_, grid_size_x_,
max_voxel_size_, num_voxels_d_.get(), voxels_d_.get(), num_points_per_voxel_d_.get(),
coordinates_d_.get(), stream_);

ASSERT_EQ(cudaSuccess, code);

Expand Down Expand Up @@ -309,9 +311,9 @@ TEST_F(PreprocessKernelTest, VoxelOverflowTest)
EXPECT_EQ(0.0, voxel_data[max_point_in_voxel_size_ * point_feature_size_ + 3]);

code = generateBaseFeatures_launch(
mask_d_.get(), voxels_buffer_d_.get(), grid_size_y_, grid_size_x_, max_voxel_size_,
num_voxels_d_.get(), voxels_d_.get(), num_points_per_voxel_d_.get(), coordinates_d_.get(),
stream_);
mask_d_.get(), priority_map_d_.get(), voxels_buffer_d_.get(), grid_size_y_, grid_size_x_,
max_voxel_size_, num_voxels_d_.get(), voxels_d_.get(), num_points_per_voxel_d_.get(),
coordinates_d_.get(), stream_);

ASSERT_EQ(cudaSuccess, code);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ class PreprocessKernelTest : public testing::Test
cuda::unique_ptr<float[]> points_d_{};
cuda::unique_ptr<float[]> voxels_buffer_d_{};
cuda::unique_ptr<unsigned int[]> mask_d_{};
cuda::unique_ptr<unsigned int[]> priority_map_d_{};
cuda::unique_ptr<unsigned int[]> num_voxels_d_{};
};

Expand Down
Loading