From e7a4e6349762e1f5c8b384a4dc45751eac103ca5 Mon Sep 17 00:00:00 2001
From: jiangyuhao
Date: Thu, 25 Jul 2024 16:29:34 +0800
Subject: [PATCH] Update dcn conv

---
 .../csrc/pytorch/mlu/mlu_common_helper.cpp    | 129 ++++++++++++++++++
 mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h |  58 ++++++++
 .../pytorch/mlu/modulated_deform_conv_mlu.cpp | 128 ++++++++++-------
 3 files changed, 266 insertions(+), 49 deletions(-)

diff --git a/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.cpp b/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.cpp
index 4d63d00ff9..b7cb2a1ed0 100644
--- a/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.cpp
+++ b/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.cpp
@@ -11,6 +11,135 @@
  *************************************************************************/
 #include "mlu_common_helper.h"
 
+mluOpTensorDescriptor_t MluOpTensorDescriptor::mut_desc() {
+  if (!desc_) {
+    mluOpCreateTensorDescriptor(&desc_);
+  }
+  return desc_;
+}
+
+// for now, this function is used to decide whether output and output_contiguous are the same
+bool is_copy_necessary(const at::Tensor& output, const at::Tensor& output_contiguous) {
+  if (!output.defined() && !output_contiguous.defined()) return false;
+  TORCH_CHECK(output.defined() && output_contiguous.defined(),
+              "One of those two tensors is undefined.");
+  TORCH_CHECK(output.sizes() == output_contiguous.sizes(),
+              "sizes of two input tensors are not the same.");
+
+  // check if underlying data and strides of these tensors are the same.
+  if (output.data_ptr() != output_contiguous.data_ptr() ||
+      output.strides() != output_contiguous.strides()) {
+    return true;
+  }
+
+  // check if dtypes are the same.
+  if (output.options().dtype() != output_contiguous.options().dtype()) {
+    return true;
+  }
+
+  return false;
+}
+
+std::vector<int64_t>
+get_contiguous_strides(const at::IntArrayRef& sizes,
+                       c10::MemoryFormat memory_format) {
+  switch (memory_format) {
+    case c10::MemoryFormat::Contiguous:
+      return get_channels_first_strides(sizes);
+    case c10::MemoryFormat::ChannelsLast:
+    case c10::MemoryFormat::ChannelsLast3d:
+      return get_channels_last_strides(sizes);
+    default:
+      TORCH_CHECK(false,
+                  "get_contiguous_strides doesn't support memory_format ",
+                  memory_format);
+  }
+}
+
+std::vector<int64_t> modify_dims_based_on_layout(const at::IntArrayRef& dim,
+                                                 const c10::MemoryFormat memory_format) {
+  // dimension is 0, return.
+  // dimension == 1 and numel == 1, return.
+  if (!dim.size() || (dim.size() == 1 && dim[0] == 1)) {
+    return dim.vec();
+  }
+  std::vector<int64_t> target_dim;
+  static std::vector<int64_t> cl_dim_order{0, 2, 3, 1};
+  static std::vector<int64_t> cl3d_dim_order{0, 2, 3, 4, 1};
+  // trans tensor/stride size to cnnl desc size/stride.
+  auto modify_dims_pos = [](const std::vector<int64_t>& dim_order,
+                            const at::IntArrayRef& input,
+                            std::vector<int64_t>& out) {
+    for (const auto& item : dim_order) {
+      out.push_back(input[item]);
+    }
+  };
+  switch (memory_format) {
+    case c10::MemoryFormat::ChannelsLast:
+      TORCH_CHECK(dim.size() == 4, "dim size must be 4 when memory_format ",
+                  "is ChannelsLast.");
+      modify_dims_pos(cl_dim_order, dim, target_dim);
+      break;
+    case c10::MemoryFormat::ChannelsLast3d:
+      TORCH_CHECK(dim.size() == 5, "dim size must be 5 when memory_format is ",
+                  "ChannelsLast3d.");
+      modify_dims_pos(cl3d_dim_order, dim, target_dim);
+      break;
+    case c10::MemoryFormat::Contiguous:
+      target_dim = std::move(dim.vec());
+      break;
+    default:
+      TORCH_CHECK(false, "memory format not supported.");
+      break;
+  }
+  return target_dim;
+}
+
+std::vector<int64_t> get_channels_last_strides(const at::IntArrayRef& sizes) {
+  switch (sizes.size()) {
+    case 5:
+      return c10::get_channels_last_strides_3d(sizes);
+    case 4:
+      return c10::get_channels_last_strides_2d(sizes);
+    case 3:
+      return get_channels_last_strides_1d(sizes);
+    default:
+      TORCH_INTERNAL_ASSERT(false, "ChannelsLast doesn't support size ", sizes.size());
+  }
+}
+
+std::vector<int64_t> get_channels_first_strides(const at::IntArrayRef& sizes) {
+  auto dim = sizes.size();
+  std::vector<int64_t> strides(dim);
+  if (dim > 0) {
+    int last_idx = dim - 1;
+    strides[last_idx] = 1;
+    for (auto i = last_idx - 1; i >= 0; --i) {
+      strides[i] = strides[i + 1] * std::max<int64_t>(sizes[i + 1], 1);
+    }
+  }
+  return strides;
+}
+
+std::vector<int64_t> get_channels_last_strides_1d(const at::IntArrayRef& sizes) {
+  std::vector<int64_t> strides(sizes.size());
+  switch (sizes.size()) {
+    // NLC
+    case 3:
+      strides[1] = 1;
+      strides[2] = sizes[1];
+      strides[0] = strides[2] * sizes[2];
+      return strides;
+    // LC
+    case 2:
+      strides[0] = 1;
+      strides[1] = sizes[0];
+      return strides;
+    default:
+      TORCH_INTERNAL_ASSERT(false, "ChannelsLast1d doesn't support size ", sizes.size());
+  }
+}
+
 // Descriptors
 mluOpDataType_t getMluOpDataType(const caffe2::TypeMeta& data_type) {
   const std::map<std::string, mluOpDataType_t> mapping_type = {
diff --git a/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h b/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h
index f14871ebc5..1afa7cc61b 100644
--- a/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h
+++ b/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h
@@ -19,6 +19,7 @@
 #include "aten/cnnl/cnnlHandle.h"
 #include "aten/cnnl/cnnlTensorDescriptors.h"
 #include "framework/core/MLUStream.h"
+#include "framework/core/caching_allocator.h"
 
 using at::IntArrayRef;
 using at::Tensor;
@@ -32,6 +33,7 @@ inline void* mlu_data_ptr(c10::TensorImpl* impl) {
 }  // namespace torch_mlu
 #endif
+
 #include "mlu_op.h"
 #include "pytorch_device_registry.hpp"
 #include "pytorch_mlu_helper.hpp"
@@ -78,6 +80,20 @@
 mluOpDataType_t getMluOpDataType(const caffe2::TypeMeta& data_type);
 mluOpTensorLayout_t getMluOpSuggestLayout(const at::Tensor& input);
 mluOpReduceMode_t getMluOpReduceMode(const reduce_t reduce_type);
+std::vector<int64_t> modify_dims_based_on_layout(const at::IntArrayRef& dim,
+                                                 const c10::MemoryFormat memory_format);
+
+std::vector<int64_t> get_contiguous_strides(const at::IntArrayRef& sizes,
+                                            c10::MemoryFormat memory_format = c10::MemoryFormat::Contiguous);
+
+std::vector<int64_t> get_channels_last_strides(const at::IntArrayRef& sizes);
+
+std::vector<int64_t> get_channels_first_strides(const at::IntArrayRef& sizes);
+
+std::vector<int64_t> get_channels_last_strides_1d(const at::IntArrayRef& sizes);
+
+bool is_copy_necessary(const at::Tensor& output, const at::Tensor& output_contiguous);
+
 class MluOpTensorDescriptor {
  public:
  MluOpTensorDescriptor() {
@@ -89,6 +105,48 @@ class MluOpTensorDescriptor {
   void set(at::Tensor);
   void set_with_layout(at::Tensor, mluOpTensorLayout_t layout);
+
+  void set(at::Tensor t,
+           std::vector<int64_t> shape_info,
+           std::vector<int64_t> stride_info,
+           mluOpTensorLayout_t layout = MLUOP_LAYOUT_ARRAY) {
+    const int64_t t_dim = shape_info.size();
+    TORCH_CHECK(t_dim == stride_info.size(), "shape size needs to equal stride size.");
+
+    auto data_type = getMluOpDataType(t.dtype());
+    auto setTensorDesc = [&](const int64_t dim,
+                             const int64_t* size,
+                             const int64_t* stride) -> void {
+      TORCH_MLUOP_CHECK(mluOpSetTensorDescriptorEx_v2(mut_desc(),
+                                                      layout,
+                                                      data_type,
+                                                      dim,
+                                                      size,
+                                                      stride));
+    };
+    if (!t_dim) {
+      int64_t dim_array[1] = {1};
+      setTensorDesc(1, dim_array, dim_array);
+      return;
+    }
+    if (std::is_same<std::decay<decltype(shape_info[0])>::type, int64_t>::value == true) {
+      setTensorDesc(t_dim, shape_info.data(), stride_info.data());
+    } else {
+      std::vector<int64_t> real_shape_info;
+      std::vector<int64_t> real_stride_info;
+      real_shape_info.reserve(t_dim);
+      real_stride_info.reserve(t_dim);
+      for (int i = 0; i < t_dim; i++) {
+        real_shape_info.push_back(shape_info[i]);
+        real_stride_info.push_back(stride_info[i]);
+      }
+
+      setTensorDesc(t_dim, real_shape_info.data(), real_stride_info.data());
+    }
+  }
+
+  mluOpTensorDescriptor_t mut_desc();
+  mluOpTensorDescriptor_t desc() { return desc_; }
 
  private:
diff --git a/mmcv/ops/csrc/pytorch/mlu/modulated_deform_conv_mlu.cpp b/mmcv/ops/csrc/pytorch/mlu/modulated_deform_conv_mlu.cpp
index c17b987c45..1313fcb030 100644
--- a/mmcv/ops/csrc/pytorch/mlu/modulated_deform_conv_mlu.cpp
+++ b/mmcv/ops/csrc/pytorch/mlu/modulated_deform_conv_mlu.cpp
@@ -28,6 +28,9 @@ void modulated_deform_conv_forward_mlu(
   int stride[2] = {(int)stride_h, (int)stride_w};
   int dilation[2] = {(int)dilation_h, (int)dilation_w};
   int im2col_step = input.size(0);
+
+  // get current handle
+  auto handle = mluOpGetCurrentHandle();
 
   // set to contiguous
   auto memory_format =
@@ -39,33 +42,32 @@ void modulated_deform_conv_forward_mlu(
   auto mask_contiguous = torch_mlu::cnnl_contiguous(mask, memory_format);
   auto output_contiguous = torch_mlu::cnnl_contiguous(output, memory_format);
 
-  // get current handle
-  auto handle = mluOpGetCurrentHandle();
-
   // set tensor descriptor
   MluOpDCNDescriptor dcn_desc;
   MluOpTensorDescriptor input_desc, offset_desc, weight_desc, bias_desc,
       mask_desc, output_desc;
-  input_desc.set_with_layout(input_contiguous, MLUOP_LAYOUT_NHWC);
-  offset_desc.set_with_layout(offset_contiguous, MLUOP_LAYOUT_NHWC);
-  weight_desc.set_with_layout(weight_contiguous, MLUOP_LAYOUT_NHWC);
-  mask_desc.set_with_layout(mask_contiguous, MLUOP_LAYOUT_NHWC);
-  output_desc.set_with_layout(output_contiguous, MLUOP_LAYOUT_NHWC);
-
-  // bias Tensor size need be same with output Tensor channel size.
-  void* bias_ptr = nullptr;
-  mluOpTensorDescriptor_t bias_desc_ = NULL;
-  if (bias.defined() && bias.dim() == 1 && bias.size(0) == output.size(1)) {
-    bias_desc.set_with_layout(bias_contiguous, MLUOP_LAYOUT_ARRAY);
-    auto bias_impl = torch_mlu::getMluTensorImpl(bias_contiguous);
-    auto bias_ptr = torch_mlu::mlu_data_ptr(bias_impl);
-    bias_desc_ = bias_desc.desc();
-  }
-
+  auto desc_set = [&](const at::Tensor& t,
+                      MluOpTensorDescriptor& t_desc,
+                      mluOpTensorLayout_t layout) {
+    auto shape_vec = modify_dims_based_on_layout(t.sizes().vec(), memory_format);
+    auto stride_vec = get_contiguous_strides(shape_vec);
+    t_desc.set(t, shape_vec, stride_vec, layout);
+  };
+  // prepare desc
+  mluOpTensorLayout_t layout = MLUOP_LAYOUT_NHWC;
+  desc_set(input_contiguous, input_desc, layout);
+  desc_set(offset_contiguous, offset_desc, layout);
+  desc_set(weight_contiguous, weight_desc, layout);
+  desc_set(mask_contiguous, mask_desc, layout);
+  desc_set(output_contiguous, output_desc, layout);
+  mluOpSetTensorDescriptorOnchipDataType(output_desc.desc(), getMluOpDataType(output.dtype()));
   // set dcn descriptor
   dcn_desc.set(input.dim(), padding, stride, dilation, deformable_group, group,
                im2col_step, MLUOP_DTYPE_FLOAT);
-
+  // set onchip dtype
+  mluOpSetTensorDescriptorOnchipDataType(input_desc.desc(), getMluOpDataType(input.dtype()));
+  mluOpSetTensorDescriptorOnchipDataType(offset_desc.desc(), getMluOpDataType(offset.dtype()));
+  mluOpSetTensorDescriptorOnchipDataType(weight_desc.desc(), getMluOpDataType(weight.dtype()));
   //get ptr of tensors
   auto input_impl = torch_mlu::getMluTensorImpl(input_contiguous);
   auto input_ptr = torch_mlu::mlu_data_ptr(input_impl);
@@ -77,20 +79,31 @@ void modulated_deform_conv_forward_mlu(
   auto mask_ptr = torch_mlu::mlu_data_ptr(mask_impl);
   auto output_impl = torch_mlu::getMluTensorImpl(output_contiguous);
   auto output_ptr = torch_mlu::mlu_data_ptr(output_impl);
+  // bias Tensor size needs to be the same as the output Tensor channel size.
+  void* bias_ptr = nullptr;
+  mluOpTensorDescriptor_t bias_desc_ = NULL;
+  if (with_bias) {
+    bias_desc.set_with_layout(bias_contiguous, MLUOP_LAYOUT_ARRAY);
+    auto bias_impl = torch_mlu::getMluTensorImpl(bias_contiguous);
+    bias_ptr = torch_mlu::mlu_data_ptr(bias_impl);
+    bias_desc_ = bias_desc.desc();
+  }
 
   // allocate workspace
   size_t workspace_size = 0;
   TORCH_MLUOP_CHECK(mluOpGetDCNForwardWorkspaceSize(
       handle, dcn_desc.desc(), input_desc.desc(), offset_desc.desc(),
       mask_desc.desc(), weight_desc.desc(), bias_desc_, output_desc.desc(),
       &workspace_size));
-  auto workspace = at::empty(workspace_size, input.options().dtype(at::ScalarType::Char));
-  auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);
-  auto workspace_ptr = torch_mlu::mlu_data_ptr(workspace_impl);
+  auto workspace_ptr = torch_mlu::MLUCachingAllocator::get()->allocate(workspace_size);
   TORCH_MLUOP_CHECK(mluOpDCNForward(handle, dcn_desc.desc(), input_desc.desc(),
       input_ptr, offset_desc.desc(), offset_ptr, mask_desc.desc(), mask_ptr,
-      weight_desc.desc(), weight_ptr, bias_desc_, bias_ptr, workspace_ptr,
+      weight_desc.desc(), weight_ptr, bias_desc_, bias_ptr, workspace_ptr.get(),
       workspace_size, output_desc.desc(), output_ptr));
+
+  if (is_copy_necessary(output, output_contiguous)) {
+    output.copy_(output_contiguous);
+  }
 }
 
 void modulated_deform_conv_backward_mlu(
@@ -142,26 +155,40 @@ void modulated_deform_conv_backward_mlu(
   // bias Tensor size need be same with grad Tensor channel size.
   void* grad_bias_ptr = nullptr;
   mluOpTensorDescriptor_t grad_bias_desc_ = NULL;
-  if (bias.defined() && bias.dim() == 1 && bias.size(0) == grad_output.size(1)) {
+  if (with_bias) {
     grad_bias_desc.set_with_layout(grad_bias_contiguous, MLUOP_LAYOUT_ARRAY);
     auto grad_bias_impl = torch_mlu::getMluTensorImpl(grad_bias_contiguous);
-    auto grad_bias_ptr = torch_mlu::mlu_data_ptr(grad_bias_impl);
+    grad_bias_ptr = torch_mlu::mlu_data_ptr(grad_bias_impl);
     grad_bias_desc_ = grad_bias_desc.desc();
   }
   // get current handle
   auto handle = mluOpGetCurrentHandle();
-  grad_desc.set_with_layout(grad_contiguous, MLUOP_LAYOUT_NHWC);
-  input_desc.set_with_layout(input_contiguous, MLUOP_LAYOUT_NHWC);
-  offset_desc.set_with_layout(offset_contiguous, MLUOP_LAYOUT_NHWC);
-  weight_desc.set_with_layout(weight_contiguous, MLUOP_LAYOUT_NHWC);
-  mask_desc.set_with_layout(mask_contiguous, MLUOP_LAYOUT_NHWC);
-  grad_input_desc.set_with_layout(grad_input_contiguous, MLUOP_LAYOUT_NHWC);
-  grad_offset_desc.set_with_layout(grad_offset_contiguous, MLUOP_LAYOUT_NHWC);
-  grad_weight_desc.set_with_layout(grad_weight_contiguous, MLUOP_LAYOUT_NHWC);
-  grad_mask_desc.set_with_layout(grad_mask_contiguous, MLUOP_LAYOUT_NHWC);
+  auto desc_set = [&](const at::Tensor& t,
+                      MluOpTensorDescriptor& t_desc,
+                      mluOpTensorLayout_t layout) {
+    auto shape_vec = modify_dims_based_on_layout(t.sizes().vec(), memory_format);
+    auto stride_vec = get_contiguous_strides(shape_vec);
+    t_desc.set(t, shape_vec, stride_vec, layout);
+  };
+  // prepare desc
+  mluOpTensorLayout_t layout = MLUOP_LAYOUT_NHWC;
+  desc_set(input_contiguous, input_desc, layout);
+  desc_set(offset_contiguous, offset_desc, layout);
+  desc_set(weight_contiguous, weight_desc, layout);
+  desc_set(mask_contiguous, mask_desc, layout);
+  desc_set(grad_contiguous, grad_desc, layout);
+  desc_set(grad_input_contiguous, grad_input_desc, layout);
+  desc_set(grad_offset_contiguous, grad_offset_desc, layout);
+  desc_set(grad_weight_contiguous, grad_weight_desc, layout);
+  desc_set(grad_mask_contiguous, grad_mask_desc, layout);
   // set dcn descriptor
   dcn_desc.set(input.dim(), padding, stride, dilation, deformable_group, group,
                im2col_step, MLUOP_DTYPE_FLOAT);
+  // set onchip dtype
+  mluOpSetTensorDescriptorOnchipDataType(grad_input_desc.desc(), getMluOpDataType(grad_input.dtype()));
+  //mluOpSetTensorDescriptorOnchipDataType(offset_desc.desc(), getMluOpDataType(offset.dtype()));
+  mluOpSetTensorDescriptorOnchipDataType(input_desc.desc(), getMluOpDataType(input.dtype()));
+  mluOpSetTensorDescriptorOnchipDataType(weight_desc.desc(), getMluOpDataType(weight.dtype()));
   // set ptrs
   auto grad_ptr = torch_mlu::mlu_data_ptr(grad_impl);
   auto input_ptr = torch_mlu::mlu_data_ptr(input_impl);
@@ -187,12 +214,7 @@ void modulated_deform_conv_backward_mlu(
       /* grad_mask_desc */ grad_mask_desc.desc(),
       /* workspace_size */ &data_workspace_size));
   // mallc data workspace mlu memory
-  void* data_workspace_ptr = nullptr;
-  at::Tensor data_workspace;
-  data_workspace = at::empty(data_workspace_size,
-                             input.options().dtype(at::ScalarType::Char));
-  auto data_workspace_impl = torch_mlu::getMluTensorImpl(data_workspace);
-  data_workspace_ptr = torch_mlu::mlu_data_ptr(data_workspace_impl);
+  auto data_workspace_ptr = torch_mlu::MLUCachingAllocator::get()->allocate(data_workspace_size);
   TORCH_MLUOP_CHECK(mluOpDCNBackwardData(
       /* handle */ handle,
       /* dcn_desc */ dcn_desc.desc(),
@@ -206,7 +228,7 @@ void modulated_deform_conv_backward_mlu(
       /* weight_ptr */ weight_ptr,
       /* grad_output_desc */ grad_desc.desc(),
       /* grad_output_ptr */ grad_ptr,
-      /* workspace_ptr */ data_workspace_ptr,
+      /* workspace_ptr */ data_workspace_ptr.get(),
       /* workspace_size */ data_workspace_size,
      /* grad_input_desc */ grad_input_desc.desc(),
       /* grad_input_ptr */ grad_input_ptr,
@@ -227,12 +249,7 @@ void modulated_deform_conv_backward_mlu(
       /* grad_bias_desc */ grad_bias_desc_,
       /* workspace_size */ &weight_workspace_size));
   // malloc weight workspace mlu memory
-  void* weight_workspace_ptr = nullptr;
-  at::Tensor weight_workspace;
-  weight_workspace = at::empty(weight_workspace_size,
-                               input.options().dtype(at::ScalarType::Char));
-  auto weight_workspace_impl = torch_mlu::getMluTensorImpl(weight_workspace);
-  weight_workspace_ptr = torch_mlu::mlu_data_ptr(weight_workspace_impl);
+  auto weight_workspace_ptr = torch_mlu::MLUCachingAllocator::get()->allocate(weight_workspace_size);
   TORCH_MLUOP_CHECK(mluOpDCNBackwardWeight(
       /* handle */ handle,
       /* dcn_desc */ dcn_desc.desc(),
@@ -244,12 +261,25 @@ void modulated_deform_conv_backward_mlu(
       /* mask_ptr */ mask_ptr,
       /* grad_output_desc */ grad_desc.desc(),
       /* grad_output_ptr */ grad_ptr,
-      /* workspace */ weight_workspace_ptr,
+      /* workspace */ weight_workspace_ptr.get(),
       /* workspace_size */ weight_workspace_size,
       /* grad_weight_desc */ grad_weight_desc.desc(),
       /* grad_weigth_ptr */ grad_weight_ptr,
       /* grad_bias_desc */ grad_bias_desc_,
       /* grad_bias_ptr */ grad_bias_ptr));
+  if (is_copy_necessary(grad_input, grad_input_contiguous)) {
+    grad_input.copy_(grad_input_contiguous);
+  }
+  if (is_copy_necessary(grad_offset, grad_offset_contiguous)) {
+    grad_offset.copy_(grad_offset_contiguous);
+  }
+  if (is_copy_necessary(grad_weight, grad_weight_contiguous)) {
+    grad_weight.copy_(grad_weight_contiguous);
+  }
+  if (is_copy_necessary(grad_mask, grad_mask_contiguous)) {
+    grad_mask.copy_(grad_mask_contiguous);
+  }
+
 }
 
 void modulated_deform_conv_forward_impl(
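
For illustration only (not part of the diff): the descriptor-preparation path above composes three of the new helpers — modify_dims_based_on_layout to permute NCHW sizes into NHWC order, get_contiguous_strides to derive dense strides for that order, and the explicit shape/stride MluOpTensorDescriptor::set overload. A minimal sketch, assuming a 4-D tensor that has already been made channels-last via torch_mlu::cnnl_contiguous; the helper name describe_nhwc is hypothetical:

    // Sketch only: mirrors the desc_set lambda used in both forward and backward.
    void describe_nhwc(const at::Tensor& t, MluOpTensorDescriptor& t_desc) {
      // NCHW dims -> NHWC order expected by the mluOp DCN kernels.
      auto shape_vec = modify_dims_based_on_layout(
          t.sizes().vec(), c10::MemoryFormat::ChannelsLast);
      // Dense (contiguous) strides for the permuted shape.
      auto stride_vec = get_contiguous_strides(shape_vec);
      // Explicit shape/stride overload added to MluOpTensorDescriptor in this patch.
      t_desc.set(t, shape_vec, stride_vec, MLUOP_LAYOUT_NHWC);
    }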