Skip to content

Commit

Permalink
perf(image_projection_based_fusion,lidar_centerpoint): add cuda prepr…
Browse files Browse the repository at this point in the history
…ocess (autowarefoundation#1041)

* add cuda preprocess

Signed-off-by: wep21 <[email protected]>

* add cuda preprocess for centerpoint

Signed-off-by: wep21 <[email protected]>

* update pp

Signed-off-by: wep21 <[email protected]>
Signed-off-by: tomoya.kimura <[email protected]>

* fix bug

Signed-off-by: wep21 <[email protected]>

* style(pre-commit): autofix

---------

Signed-off-by: wep21 <[email protected]>
Signed-off-by: tomoya.kimura <[email protected]>
Co-authored-by: wep21 <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored Nov 27, 2023
1 parent 6a60009 commit 192d72d
Show file tree
Hide file tree
Showing 14 changed files with 329 additions and 207 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

namespace image_projection_based_fusion
{
static constexpr size_t CAPACITY_POINT = 1000000;
class PointPaintingTRT : public centerpoint::CenterPointTRT
{
public:
Expand All @@ -33,8 +34,6 @@ class PointPaintingTRT : public centerpoint::CenterPointTRT
const centerpoint::DensificationParam & densification_param,
const centerpoint::CenterPointConfig & config);

~PointPaintingTRT();

protected:
bool preprocess(
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,19 @@

namespace image_projection_based_fusion
{
cudaError_t generateVoxels_random_launch(
const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels,
cudaStream_t stream);

cudaError_t generateBaseFeatures_launch(
unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, unsigned int * pillar_num,
float * voxel_features, float * voxel_num, int * voxel_idxs, cudaStream_t stream);

cudaError_t generateFeatures_launch(
const float * voxel_features, const float * voxel_num_points, const int * coords,
const std::size_t num_voxels, const std::size_t max_voxel_size, const float voxel_size_x,
const unsigned int * num_voxels, const std::size_t max_voxel_size, const float voxel_size_x,
const float voxel_size_y, const float voxel_size_z, const float range_min_x,
const float range_min_y, const float range_min_z, float * features,
const std::size_t encoder_in_feature_size, cudaStream_t stream);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,7 @@ class VoxelGenerator : public centerpoint::VoxelGenerator
public:
using centerpoint::VoxelGenerator::VoxelGenerator;

std::size_t pointsToVoxels(
std::vector<float> & voxels, std::vector<int> & coordinates,
std::vector<float> & num_points_per_voxel) override;
std::size_t generateSweepPoints(std::vector<float> & points) override;
};
} // namespace image_projection_based_fusion

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,26 +43,37 @@ bool PointPaintingTRT::preprocess(
if (!is_success) {
return false;
}
num_voxels_ = vg_ptr_pp_->pointsToVoxels(voxels_, coordinates_, num_points_per_voxel_);
if (num_voxels_ == 0) {
return false;
}
const auto voxels_size =
num_voxels_ * config_.max_point_in_voxel_size_ * config_.point_feature_size_;
const auto coordinates_size = num_voxels_ * config_.point_dim_size_;
// memcpy from host to device (not copy empty voxels)
CHECK_CUDA_ERROR(cudaMemcpyAsync(
voxels_d_.get(), voxels_.data(), voxels_size * sizeof(float), cudaMemcpyHostToDevice));
const auto count = vg_ptr_pp_->generateSweepPoints(points_);
CHECK_CUDA_ERROR(cudaMemcpyAsync(
coordinates_d_.get(), coordinates_.data(), coordinates_size * sizeof(int),
cudaMemcpyHostToDevice));
CHECK_CUDA_ERROR(cudaMemcpyAsync(
num_points_per_voxel_d_.get(), num_points_per_voxel_.data(), num_voxels_ * sizeof(float),
cudaMemcpyHostToDevice));
CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
points_d_.get(), points_.data(), count * config_.point_feature_size_ * sizeof(float),
cudaMemcpyHostToDevice, stream_));
CHECK_CUDA_ERROR(cudaMemsetAsync(num_voxels_d_.get(), 0, sizeof(unsigned int), stream_));
CHECK_CUDA_ERROR(cudaMemsetAsync(voxels_buffer_d_.get(), 0, voxels_buffer_size_, stream_));
CHECK_CUDA_ERROR(cudaMemsetAsync(mask_d_.get(), 0, mask_size_, stream_));
CHECK_CUDA_ERROR(cudaMemsetAsync(
voxels_d_.get(), 0,
config_.max_voxel_size_ * config_.max_point_in_voxel_size_ * config_.point_feature_size_ *
sizeof(float),
stream_));
CHECK_CUDA_ERROR(cudaMemsetAsync(
coordinates_d_.get(), 0, config_.max_voxel_size_ * config_.point_dim_size_ * sizeof(int),
stream_));
CHECK_CUDA_ERROR(cudaMemsetAsync(
num_points_per_voxel_d_.get(), 0, config_.max_voxel_size_ * sizeof(float), stream_));

CHECK_CUDA_ERROR(image_projection_based_fusion::generateVoxels_random_launch(
points_d_.get(), count, config_.range_min_x_, config_.range_max_x_, config_.range_min_y_,
config_.range_max_y_, config_.range_min_z_, config_.range_max_z_, config_.voxel_size_x_,
config_.voxel_size_y_, config_.voxel_size_z_, config_.grid_size_y_, config_.grid_size_x_,
mask_d_.get(), voxels_buffer_d_.get(), stream_));

CHECK_CUDA_ERROR(image_projection_based_fusion::generateBaseFeatures_launch(
mask_d_.get(), voxels_buffer_d_.get(), config_.grid_size_y_, config_.grid_size_x_,
num_voxels_d_.get(), voxels_d_.get(), num_points_per_voxel_d_.get(), coordinates_d_.get(),
stream_));

CHECK_CUDA_ERROR(image_projection_based_fusion::generateFeatures_launch(
voxels_d_.get(), num_points_per_voxel_d_.get(), coordinates_d_.get(), num_voxels_,
voxels_d_.get(), num_points_per_voxel_d_.get(), coordinates_d_.get(), num_voxels_d_.get(),
config_.max_voxel_size_, config_.voxel_size_x_, config_.voxel_size_y_, config_.voxel_size_z_,
config_.range_min_x_, config_.range_min_y_, config_.range_min_z_, encoder_in_features_d_.get(),
config_.encoder_in_feature_size_, stream_));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,105 @@ std::size_t divup(const std::size_t a, const std::size_t b)

namespace image_projection_based_fusion
{
__global__ void generateVoxels_random_kernel(
const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels)
{
int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (point_idx >= points_size) return;

float x = points[point_idx * POINT_FEATURE_SIZE];
float y = points[point_idx * POINT_FEATURE_SIZE + 1];
float z = points[point_idx * POINT_FEATURE_SIZE + 2];

if (
x < min_x_range || x >= max_x_range || y < min_y_range || y >= max_y_range || z < min_z_range ||
z >= max_z_range)
return;

int voxel_idx = floorf((x - min_x_range) / pillar_x_size);
int voxel_idy = floorf((y - min_y_range) / pillar_y_size);
unsigned int voxel_index = voxel_idy * grid_x_size + voxel_idx;

unsigned int point_id = atomicAdd(&(mask[voxel_index]), 1);

if (point_id >= MAX_POINT_IN_VOXEL_SIZE) return;
float * address =
voxels + (voxel_index * MAX_POINT_IN_VOXEL_SIZE + point_id) * POINT_FEATURE_SIZE;
for (unsigned int i = 0; i < POINT_FEATURE_SIZE; ++i) {
atomicExch(address + i, points[point_idx * POINT_FEATURE_SIZE + i]);
}
}

cudaError_t generateVoxels_random_launch(
const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels,
cudaStream_t stream)
{
dim3 blocks((points_size + 256 - 1) / 256);
dim3 threads(256);
generateVoxels_random_kernel<<<blocks, threads, 0, stream>>>(
points, points_size, min_x_range, max_x_range, min_y_range, max_y_range, min_z_range,
max_z_range, pillar_x_size, pillar_y_size, pillar_z_size, grid_y_size, grid_x_size, mask,
voxels);
cudaError_t err = cudaGetLastError();
return err;
}

__global__ void generateBaseFeatures_kernel(
unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, unsigned int * pillar_num,
float * voxel_features, float * voxel_num, int * voxel_idxs)
{
unsigned int voxel_idx = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int voxel_idy = blockIdx.y * blockDim.y + threadIdx.y;

if (voxel_idx >= grid_x_size || voxel_idy >= grid_y_size) return;

unsigned int voxel_index = voxel_idy * grid_x_size + voxel_idx;
unsigned int count = mask[voxel_index];
if (!(count > 0)) return;
count = count < MAX_POINT_IN_VOXEL_SIZE ? count : MAX_POINT_IN_VOXEL_SIZE;

unsigned int current_pillarId = 0;
current_pillarId = atomicAdd(pillar_num, 1);

voxel_num[current_pillarId] = count;

uint3 idx = {0, voxel_idy, voxel_idx};
((uint3 *)voxel_idxs)[current_pillarId] = idx;

for (int i = 0; i < count; i++) {
int inIndex = voxel_index * MAX_POINT_IN_VOXEL_SIZE + i;
int outIndex = current_pillarId * MAX_POINT_IN_VOXEL_SIZE + i;
for (unsigned int j = 0; j < POINT_FEATURE_SIZE; ++j) {
voxel_features[outIndex * POINT_FEATURE_SIZE + j] = voxels[inIndex * POINT_FEATURE_SIZE + j];
}
}

// clear buffer for next infer
atomicExch(mask + voxel_index, 0);
}

// create 4 channels
cudaError_t generateBaseFeatures_launch(
unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, unsigned int * pillar_num,
float * voxel_features, float * voxel_num, int * voxel_idxs, cudaStream_t stream)
{
dim3 threads = {32, 32};
dim3 blocks = {
(grid_x_size + threads.x - 1) / threads.x, (grid_y_size + threads.y - 1) / threads.y};

generateBaseFeatures_kernel<<<blocks, threads, 0, stream>>>(
mask, voxels, grid_y_size, grid_x_size, pillar_num, voxel_features, voxel_num, voxel_idxs);
cudaError_t err = cudaGetLastError();
return err;
}

__global__ void generateFeatures_kernel(
const float * voxel_features, const float * voxel_num_points, const int * coords,
const std::size_t num_voxels, const float voxel_x, const float voxel_y, const float voxel_z,
const unsigned int * num_voxels, const float voxel_x, const float voxel_y, const float voxel_z,
const float range_min_x, const float range_min_y, const float range_min_z, float * features,
const std::size_t encoder_in_feature_size)
{
Expand All @@ -70,7 +166,8 @@ __global__ void generateFeatures_kernel(
int point_idx = threadIdx.x % MAX_POINT_IN_VOXEL_SIZE;
int pillar_idx_inBlock = threadIdx.x / MAX_POINT_IN_VOXEL_SIZE;

if (pillar_idx >= num_voxels) return;
unsigned int num_pillars = num_voxels[0];
if (pillar_idx >= num_pillars) return;

// load src
__shared__ float pillarSM[WARPS_PER_BLOCK][MAX_POINT_IN_VOXEL_SIZE][POINT_FEATURE_SIZE];
Expand Down Expand Up @@ -155,7 +252,7 @@ __global__ void generateFeatures_kernel(

cudaError_t generateFeatures_launch(
const float * voxel_features, const float * voxel_num_points, const int * coords,
const std::size_t num_voxels, const std::size_t max_voxel_size, const float voxel_size_x,
const unsigned int * num_voxels, const std::size_t max_voxel_size, const float voxel_size_x,
const float voxel_size_y, const float voxel_size_z, const float range_min_x,
const float range_min_y, const float range_min_z, float * features,
const std::size_t encoder_in_feature_size, cudaStream_t stream)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,29 +18,10 @@

namespace image_projection_based_fusion
{
std::size_t VoxelGenerator::pointsToVoxels(
std::vector<float> & voxels, std::vector<int> & coordinates,
std::vector<float> & num_points_per_voxel)
size_t VoxelGenerator::generateSweepPoints(std::vector<float> & points)
{
// voxels (float): (max_num_voxels * max_num_points_per_voxel * point_feature_size)
// coordinates (int): (max_num_voxels * point_dim_size)
// num_points_per_voxel (float): (max_num_voxels)

const std::size_t grid_size = config_.grid_size_z_ * config_.grid_size_y_ * config_.grid_size_x_;
std::vector<std::optional<int>> coord_to_voxel_idx(grid_size, -1);

std::size_t voxel_cnt = 0; // @return
// std::array<float, config_.point_feature_size> point;
// std::array<float, config_.point_dim_size> coord_zyx;
std::vector<float> point;
point.resize(config_.point_feature_size_);
std::vector<float> coord_zyx;
coord_zyx.resize(config_.point_dim_size_);
bool out_of_range;
std::size_t point_cnt;
int c, coord_idx, voxel_idx;
Eigen::Vector3f point_current, point_past;

size_t point_counter{};
for (auto pc_cache_iter = pd_ptr_->getPointCloudCacheIter(); !pd_ptr_->isCacheEnd(pc_cache_iter);
pc_cache_iter++) {
auto pc_msg = pc_cache_iter->pointcloud_msg;
Expand All @@ -55,56 +36,25 @@ std::size_t VoxelGenerator::pointsToVoxels(
point_past << *x_iter, *y_iter, *z_iter;
point_current = affine_past2current * point_past;

point[0] = point_current.x();
point[1] = point_current.y();
point[2] = point_current.z();
point[3] = time_lag;
points.at(point_counter * config_.point_feature_size_) = point_current.x();
points.at(point_counter * config_.point_feature_size_ + 1) = point_current.y();
points.at(point_counter * config_.point_feature_size_ + 2) = point_current.z();
points.at(point_counter * config_.point_feature_size_ + 3) = time_lag;
// decode the class value back to one-hot binary and assign it to point
std::fill(point.begin() + 4, point.end(), 0);
point[4 + *class_iter] = 1;

out_of_range = false;
for (std::size_t di = 0; di < config_.point_dim_size_; di++) {
c = static_cast<int>((point[di] - range_[di]) * recip_voxel_size_[di]);
if (c < 0 || c >= grid_size_[di]) {
out_of_range = true;
break;
}
coord_zyx[config_.point_dim_size_ - di - 1] = c;
}
if (out_of_range) {
continue;
for (size_t i = 0; i < config_.class_size_; ++i) {
points.at(point_counter * config_.point_feature_size_ + 4 + i) = 0.f;
}

coord_idx = coord_zyx[0] * config_.grid_size_y_ * config_.grid_size_x_ +
coord_zyx[1] * config_.grid_size_x_ + coord_zyx[2];
voxel_idx = coord_to_voxel_idx[coord_idx].value();
if (voxel_idx == -1) {
voxel_idx = voxel_cnt;
if (voxel_cnt >= config_.max_voxel_size_) {
continue;
}

voxel_cnt++;
coord_to_voxel_idx[coord_idx] = voxel_idx;
for (std::size_t di = 0; di < config_.point_dim_size_; di++) {
coordinates[voxel_idx * config_.point_dim_size_ + di] = coord_zyx[di];
}
}

point_cnt = num_points_per_voxel[voxel_idx];
if (point_cnt < config_.max_point_in_voxel_size_) {
for (std::size_t fi = 0; fi < config_.point_feature_size_; fi++) {
voxels
[voxel_idx * config_.max_point_in_voxel_size_ * config_.point_feature_size_ +
point_cnt * config_.point_feature_size_ + fi] = point[fi];
}
num_points_per_voxel[voxel_idx]++;
auto class_value = static_cast<int>(*class_iter);
auto iter = points.begin() + point_counter * config_.point_feature_size_ + 4;
while (class_value > 0) {
*iter = class_value % 2;
class_value /= 2;
++iter;
}
++point_counter;
}
}

return voxel_cnt;
return point_counter;
}

} // namespace image_projection_based_fusion
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@

namespace centerpoint
{
static constexpr size_t CAPACITY_POINT = 1000000;

class NetworkParam
{
public:
Expand Down Expand Up @@ -59,7 +61,7 @@ class CenterPointTRT
const NetworkParam & encoder_param, const NetworkParam & head_param,
const DensificationParam & densification_param, const CenterPointConfig & config);

~CenterPointTRT();
virtual ~CenterPointTRT();

bool detect(
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
Expand All @@ -83,12 +85,13 @@ class CenterPointTRT

std::size_t class_size_{0};
CenterPointConfig config_;
std::size_t num_voxels_{0};
std::size_t encoder_in_feature_size_{0};
std::size_t spatial_features_size_{0};
std::vector<float> voxels_;
std::vector<int> coordinates_;
std::vector<float> num_points_per_voxel_;
std::size_t voxels_buffer_size_{0};
std::size_t mask_size_{0};
std::size_t voxels_size_{0};
std::size_t coordinates_size_{0};
std::vector<float> points_;
cuda::unique_ptr<float[]> voxels_d_{nullptr};
cuda::unique_ptr<int[]> coordinates_d_{nullptr};
cuda::unique_ptr<float[]> num_points_per_voxel_d_{nullptr};
Expand All @@ -101,6 +104,10 @@ class CenterPointTRT
cuda::unique_ptr<float[]> head_out_dim_d_{nullptr};
cuda::unique_ptr<float[]> head_out_rot_d_{nullptr};
cuda::unique_ptr<float[]> head_out_vel_d_{nullptr};
cuda::unique_ptr<float[]> points_d_{nullptr};
cuda::unique_ptr<float[]> voxels_buffer_d_{nullptr};
cuda::unique_ptr<unsigned int[]> mask_d_{nullptr};
cuda::unique_ptr<unsigned int[]> num_voxels_d_{nullptr};
};

} // namespace centerpoint
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
namespace centerpoint
{
cudaError_t scatterFeatures_launch(
const float * pillar_features, const int * coords, const std::size_t num_pillars,
const float * pillar_features, const int * coords, const unsigned int * num_pillars,
const std::size_t max_voxel_size, const std::size_t encoder_out_feature_size,
const std::size_t grid_size_x, const std::size_t grid_size_y, float * scattered_features,
cudaStream_t stream);
Expand Down
Loading

0 comments on commit 192d72d

Please sign in to comment.