From e7a4e6349762e1f5c8b384a4dc45751eac103ca5 Mon Sep 17 00:00:00 2001
From: jiangyuhao
Date: Thu, 25 Jul 2024 16:29:34 +0800
Subject: [PATCH] Update dcn conv

---
 .../csrc/pytorch/mlu/mlu_common_helper.cpp    | 129 ++++++++++++++++++
 mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h |  58 ++++++++
 .../pytorch/mlu/modulated_deform_conv_mlu.cpp | 128 ++++++++++-------
 3 files changed, 266 insertions(+), 49 deletions(-)

diff --git a/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.cpp b/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.cpp
index 4d63d00ff9..b7cb2a1ed0 100644
--- a/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.cpp
+++ b/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.cpp
@@ -11,6 +11,135 @@
  *************************************************************************/
 #include "mlu_common_helper.h"
 
+mluOpTensorDescriptor_t MluOpTensorDescriptor::mut_desc() {
+  if (!desc_) {
+    mluOpCreateTensorDescriptor(&desc_);
+  }
+  return desc_;
+}
+
+// for now, this function is used to decide whether output and output_contiguous are the same
+bool is_copy_necessary(const at::Tensor& output, const at::Tensor& output_contiguous) {
+  if (!output.defined() && !output_contiguous.defined()) return false;
+  TORCH_CHECK(output.defined() && output_contiguous.defined(),
+              "One of those two tensors is undefined.");
+  TORCH_CHECK(output.sizes() == output_contiguous.sizes(),
+              "sizes of two input tensors are not the same.");
+
+  // check if underlying data and strides of these tensors are the same.
+  if (output.data_ptr() != output_contiguous.data_ptr() ||
+      output.strides() != output_contiguous.strides()) {
+    return true;
+  }
+
+  // check if dtypes are the same.
+  if (output.options().dtype() != output_contiguous.options().dtype()) {
+    return true;
+  }
+
+  return false;
+}
+
+std::vector<int64_t>
+get_contiguous_strides(const at::IntArrayRef& sizes,
+                       c10::MemoryFormat memory_format) {
+  switch (memory_format) {
+    case c10::MemoryFormat::Contiguous:
+      return get_channels_first_strides(sizes);
+    case c10::MemoryFormat::ChannelsLast:
+    case c10::MemoryFormat::ChannelsLast3d:
+      return get_channels_last_strides(sizes);
+    default:
+      TORCH_CHECK(false,
+                  "get_contiguous_strides doesn't support memory_format ",
+                  memory_format);
+  }
+}
+
+std::vector<int64_t> modify_dims_based_on_layout(const at::IntArrayRef& dim,
+                                                 const c10::MemoryFormat memory_format) {
+  // dimension is 0, return.
+  // dimension == 1 and numel == 1, return.
+  if (!dim.size() || (dim.size() == 1 && dim[0] == 1)) {
+    return dim.vec();
+  }
+  std::vector<int64_t> target_dim;
+  static std::vector<int64_t> cl_dim_order{0, 2, 3, 1};
+  static std::vector<int64_t> cl3d_dim_order{0, 2, 3, 4, 1};
+  // trans tensor/stride size to cnnl desc size/stride.
+  auto modify_dims_pos = [](const std::vector<int64_t>& dim_order,
+                            const at::IntArrayRef& input,
+                            std::vector<int64_t>& out) {
+    for (const auto& item : dim_order) {
+      out.push_back(input[item]);
+    }
+  };
+  switch (memory_format) {
+    case c10::MemoryFormat::ChannelsLast:
+      TORCH_CHECK(dim.size() == 4, "dim size must be 4 when memory_format ",
+                  "is ChannelsLast.");
+      modify_dims_pos(cl_dim_order, dim, target_dim);
+      break;
+    case c10::MemoryFormat::ChannelsLast3d:
+      TORCH_CHECK(dim.size() == 5, "dim size must be 5 when memory_format is ",
+                  "ChannelsLast3d.");
+      modify_dims_pos(cl3d_dim_order, dim, target_dim);
+      break;
+    case c10::MemoryFormat::Contiguous:
+      target_dim = std::move(dim.vec());
+      break;
+    default:
+      TORCH_CHECK(false, "memory format not supported.");
+      break;
+  }
+  return target_dim;
+}
+
+std::vector<int64_t> get_channels_last_strides(const at::IntArrayRef& sizes) {
+  switch (sizes.size()) {
+    case 5:
+      return c10::get_channels_last_strides_3d(sizes);
+    case 4:
+      return c10::get_channels_last_strides_2d(sizes);
+    case 3:
+      return get_channels_last_strides_1d(sizes);
+    default:
+      TORCH_INTERNAL_ASSERT(false, "ChannelsLast doesn't support size ", sizes.size());
+  }
+}
+
+std::vector<int64_t> get_channels_first_strides(const at::IntArrayRef& sizes) {
+  auto dim = sizes.size();
+  std::vector<int64_t> strides(dim);
+  if (dim > 0) {
+    int last_idx = dim - 1;
+    strides[last_idx] = 1;
+    for (auto i = last_idx - 1; i >= 0; --i) {
+      strides[i] = strides[i + 1] * std::max<int64_t>(sizes[i + 1], 1);
+    }
+  }
+  return strides;
+}
+
+std::vector<int64_t> get_channels_last_strides_1d(const at::IntArrayRef& sizes) {
+  std::vector<int64_t> strides(sizes.size());
+  switch (sizes.size()) {
+    // NLC
+    case 3:
+      strides[1] = 1;
+      strides[2] = sizes[1];
+      strides[0] = strides[2] * sizes[2];
+      return strides;
+    // LC
+    case 2:
+      strides[0] = 1;
+      strides[1] = sizes[0];
+      return strides;
+    default:
+      TORCH_INTERNAL_ASSERT(false, "ChannelsLast1d doesn't support size ", sizes.size());
+  }
+}
+
 // Descriptors
 mluOpDataType_t getMluOpDataType(const caffe2::TypeMeta& data_type) {
   const std::map<std::string, mluOpDataType_t> mapping_type = {
diff --git a/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h b/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h
index f14871ebc5..1afa7cc61b 100644
--- a/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h
+++ b/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h
@@ -19,6 +19,7 @@
 #include "aten/cnnl/cnnlHandle.h"
 #include "aten/cnnl/cnnlTensorDescriptors.h"
 #include "framework/core/MLUStream.h"
+#include "framework/core/caching_allocator.h"
 
 using at::IntArrayRef;
 using at::Tensor;
@@ -32,6 +33,7 @@ inline void* mlu_data_ptr(c10::TensorImpl* impl) {
 }  // namespace torch_mlu
 #endif
+
 #include "mlu_op.h"
 #include "pytorch_device_registry.hpp"
 #include "pytorch_mlu_helper.hpp"
@@ -78,6 +80,20 @@
 mluOpDataType_t getMluOpDataType(const caffe2::TypeMeta& data_type);
 mluOpTensorLayout_t getMluOpSuggestLayout(const at::Tensor& input);
 mluOpReduceMode_t getMluOpReduceMode(const reduce_t reduce_type);
+std::vector<int64_t> modify_dims_based_on_layout(const at::IntArrayRef& dim,
+                                                 const c10::MemoryFormat memory_format);
+
+std::vector<int64_t> get_contiguous_strides(const at::IntArrayRef& sizes,
+                                            c10::MemoryFormat memory_format = c10::MemoryFormat::Contiguous);
+
+std::vector<int64_t> get_channels_last_strides(const at::IntArrayRef& sizes);
+
+std::vector<int64_t> get_channels_first_strides(const at::IntArrayRef& sizes);
+
+std::vector<int64_t> get_channels_last_strides_1d(const at::IntArrayRef& sizes);
+
+bool is_copy_necessary(const at::Tensor& output, const at::Tensor& output_contiguous);
+
 class MluOpTensorDescriptor {
  public:
  MluOpTensorDescriptor() {
@@ -89,6 +105,48 @@ class MluOpTensorDescriptor {
   void set(at::Tensor);
   void set_with_layout(at::Tensor, mluOpTensorLayout_t layout);
+
+  void set(at::Tensor t,
+           std::vector<int64_t> shape_info,
+           std::vector<int64_t> stride_info,
+           mluOpTensorLayout_t layout = MLUOP_LAYOUT_ARRAY) {
+    const int64_t t_dim = shape_info.size();
+    TORCH_CHECK(t_dim == stride_info.size(), "shape size needs to equal stride size.");
+
+    auto data_type = getMluOpDataType(t.dtype());
+    auto setTensorDesc = [&](const int64_t dim,
+                             const int64_t* size,
+                             const int64_t* stride) -> void {
+      TORCH_MLUOP_CHECK(mluOpSetTensorDescriptorEx_v2(mut_desc(),
+                                                      layout,
+                                                      data_type,
+                                                      dim,
+                                                      size,
+                                                      stride));
+    };
+    if (!t_dim) {
+      int64_t dim_array[1] = {1};
+      setTensorDesc(1, dim_array, dim_array);
+      return;
+    }
+    if (std::is_same<std::decay<decltype(shape_info[0])>::type, int64_t>::value == true) {
+      setTensorDesc(t_dim, shape_info.data(), stride_info.data());
+    } else {
+      std::vector<int64_t> real_shape_info;
+      std::vector<int64_t> real_stride_info;
+      real_shape_info.reserve(t_dim);
+      real_stride_info.reserve(t_dim);
+      for (int i = 0; i < t_dim; i++) {
+        real_shape_info.push_back(shape_info[i]);
+        real_stride_info.push_back(stride_info[i]);
+      }
+
+      setTensorDesc(t_dim, real_shape_info.data(), real_stride_info.data());
+    }
+  }
+
+  mluOpTensorDescriptor_t mut_desc();
+  mluOpTensorDescriptor_t desc() { return desc_; }
 
  private:
diff --git a/mmcv/ops/csrc/pytorch/mlu/modulated_deform_conv_mlu.cpp b/mmcv/ops/csrc/pytorch/mlu/modulated_deform_conv_mlu.cpp
index c17b987c45..1313fcb030 100644
--- a/mmcv/ops/csrc/pytorch/mlu/modulated_deform_conv_mlu.cpp
+++ b/mmcv/ops/csrc/pytorch/mlu/modulated_deform_conv_mlu.cpp
@@ -28,6 +28,9 @@ void modulated_deform_conv_forward_mlu(
   int stride[2] = {(int)stride_h, (int)stride_w};
   int dilation[2] = {(int)dilation_h, (int)dilation_w};
   int im2col_step = input.size(0);
+
+  // get current handle
+  auto handle = mluOpGetCurrentHandle();
 
   // set to contiguous
   auto memory_format =
@@ -39,33 +42,32 @@ void modulated_deform_conv_forward_mlu(
   auto mask_contiguous = torch_mlu::cnnl_contiguous(mask, memory_format);
   auto output_contiguous = torch_mlu::cnnl_contiguous(output, memory_format);
 
-  // get current handle
-  auto handle = mluOpGetCurrentHandle();
-
   // set tensor descriptor
   MluOpDCNDescriptor dcn_desc;
   MluOpTensorDescriptor input_desc, offset_desc, weight_desc, bias_desc,
       mask_desc, output_desc;
-  input_desc.set_with_layout(input_contiguous, MLUOP_LAYOUT_NHWC);
-  offset_desc.set_with_layout(offset_contiguous, MLUOP_LAYOUT_NHWC);
-  weight_desc.set_with_layout(weight_contiguous, MLUOP_LAYOUT_NHWC);
-  mask_desc.set_with_layout(mask_contiguous, MLUOP_LAYOUT_NHWC);
-  output_desc.set_with_layout(output_contiguous, MLUOP_LAYOUT_NHWC);
-
-  // bias Tensor size need be same with output Tensor channel size.
-  void* bias_ptr = nullptr;
-  mluOpTensorDescriptor_t bias_desc_ = NULL;
-  if (bias.defined() && bias.dim() == 1 && bias.size(0) == output.size(1)) {
-    bias_desc.set_with_layout(bias_contiguous, MLUOP_LAYOUT_ARRAY);
-    auto bias_impl = torch_mlu::getMluTensorImpl(bias_contiguous);
-    auto bias_ptr = torch_mlu::mlu_data_ptr(bias_impl);
-    bias_desc_ = bias_desc.desc();
-  }
-
+  auto desc_set = [&](const at::Tensor& t,
+                      MluOpTensorDescriptor& t_desc,
+                      mluOpTensorLayout_t layout) {
+    auto shape_vec = modify_dims_based_on_layout(t.sizes().vec(), memory_format);
+    auto stride_vec = get_contiguous_strides(shape_vec);
+    t_desc.set(t, shape_vec, stride_vec, layout);
+  };
+  // prepare desc
+  mluOpTensorLayout_t layout = MLUOP_LAYOUT_NHWC;
+  desc_set(input_contiguous, input_desc, layout);
+  desc_set(offset_contiguous, offset_desc, layout);
+  desc_set(weight_contiguous, weight_desc, layout);
+  desc_set(mask_contiguous, mask_desc, layout);
+  desc_set(output_contiguous, output_desc, layout);
+  mluOpSetTensorDescriptorOnchipDataType(output_desc.desc(), getMluOpDataType(output.dtype()));
   // set dcn descriptor
   dcn_desc.set(input.dim(), padding, stride, dilation, deformable_group, group,
                im2col_step, MLUOP_DTYPE_FLOAT);
-
+  // set onchip dtype
+  mluOpSetTensorDescriptorOnchipDataType(input_desc.desc(), getMluOpDataType(input.dtype()));
+  mluOpSetTensorDescriptorOnchipDataType(offset_desc.desc(), getMluOpDataType(offset.dtype()));
+  mluOpSetTensorDescriptorOnchipDataType(weight_desc.desc(), getMluOpDataType(weight.dtype()));
   //get ptr of tensors
   auto input_impl = torch_mlu::getMluTensorImpl(input_contiguous);
   auto input_ptr = torch_mlu::mlu_data_ptr(input_impl);
@@ -77,20 +79,31 @@ void modulated_deform_conv_forward_mlu(
   auto mask_ptr = torch_mlu::mlu_data_ptr(mask_impl);
   auto output_impl = torch_mlu::getMluTensorImpl(output_contiguous);
   auto output_ptr = torch_mlu::mlu_data_ptr(output_impl);
+  // bias Tensor size needs to be the same as the output Tensor channel size.
+  void* bias_ptr = nullptr;
+  mluOpTensorDescriptor_t bias_desc_ = NULL;
+  if (with_bias) {
+    bias_desc.set_with_layout(bias_contiguous, MLUOP_LAYOUT_ARRAY);
+    auto bias_impl = torch_mlu::getMluTensorImpl(bias_contiguous);
+    bias_ptr = torch_mlu::mlu_data_ptr(bias_impl);
+    bias_desc_ = bias_desc.desc();
+  }
 
   // allocate workspace
   size_t workspace_size = 0;
   TORCH_MLUOP_CHECK(mluOpGetDCNForwardWorkspaceSize(
       handle, dcn_desc.desc(), input_desc.desc(), offset_desc.desc(),
       mask_desc.desc(), weight_desc.desc(), bias_desc_, output_desc.desc(),
       &workspace_size));
-  auto workspace = at::empty(workspace_size, input.options().dtype(at::ScalarType::Char));
-  auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);
-  auto workspace_ptr = torch_mlu::mlu_data_ptr(workspace_impl);
+  auto workspace_ptr = torch_mlu::MLUCachingAllocator::get()->allocate(workspace_size);
   TORCH_MLUOP_CHECK(mluOpDCNForward(handle, dcn_desc.desc(), input_desc.desc(),
       input_ptr, offset_desc.desc(), offset_ptr, mask_desc.desc(), mask_ptr,
-      weight_desc.desc(), weight_ptr, bias_desc_, bias_ptr, workspace_ptr,
+      weight_desc.desc(), weight_ptr, bias_desc_, bias_ptr, workspace_ptr.get(),
       workspace_size, output_desc.desc(), output_ptr));
+
+  if (is_copy_necessary(output, output_contiguous)) {
+    output.copy_(output_contiguous);
+  }
 }
 
 void modulated_deform_conv_backward_mlu(
@@ -142,26 +155,40 @@ void modulated_deform_conv_backward_mlu(
   // bias Tensor size need be same with grad Tensor channel size.
   void* grad_bias_ptr = nullptr;
   mluOpTensorDescriptor_t grad_bias_desc_ = NULL;
-  if (bias.defined() && bias.dim() == 1 && bias.size(0) == grad_output.size(1)) {
+  if (with_bias) {
     grad_bias_desc.set_with_layout(grad_bias_contiguous, MLUOP_LAYOUT_ARRAY);
     auto grad_bias_impl = torch_mlu::getMluTensorImpl(grad_bias_contiguous);
-    auto grad_bias_ptr = torch_mlu::mlu_data_ptr(grad_bias_impl);
+    grad_bias_ptr = torch_mlu::mlu_data_ptr(grad_bias_impl);
     grad_bias_desc_ = grad_bias_desc.desc();
   }
   // get current handle
   auto handle = mluOpGetCurrentHandle();
-  grad_desc.set_with_layout(grad_contiguous, MLUOP_LAYOUT_NHWC);
-  input_desc.set_with_layout(input_contiguous, MLUOP_LAYOUT_NHWC);
-  offset_desc.set_with_layout(offset_contiguous, MLUOP_LAYOUT_NHWC);
-  weight_desc.set_with_layout(weight_contiguous, MLUOP_LAYOUT_NHWC);
-  mask_desc.set_with_layout(mask_contiguous, MLUOP_LAYOUT_NHWC);
-  grad_input_desc.set_with_layout(grad_input_contiguous, MLUOP_LAYOUT_NHWC);
-  grad_offset_desc.set_with_layout(grad_offset_contiguous, MLUOP_LAYOUT_NHWC);
-  grad_weight_desc.set_with_layout(grad_weight_contiguous, MLUOP_LAYOUT_NHWC);
-  grad_mask_desc.set_with_layout(grad_mask_contiguous, MLUOP_LAYOUT_NHWC);
+  auto desc_set = [&](const at::Tensor& t,
+                      MluOpTensorDescriptor& t_desc,
+                      mluOpTensorLayout_t layout) {
+    auto shape_vec = modify_dims_based_on_layout(t.sizes().vec(), memory_format);
+    auto stride_vec = get_contiguous_strides(shape_vec);
+    t_desc.set(t, shape_vec, stride_vec, layout);
+  };
+  // prepare desc
+  mluOpTensorLayout_t layout = MLUOP_LAYOUT_NHWC;
+  desc_set(input_contiguous, input_desc, layout);
+  desc_set(offset_contiguous, offset_desc, layout);
+  desc_set(weight_contiguous, weight_desc, layout);
+  desc_set(mask_contiguous, mask_desc, layout);
+  desc_set(grad_contiguous, grad_desc, layout);
+  desc_set(grad_input_contiguous, grad_input_desc, layout);
+  desc_set(grad_offset_contiguous, grad_offset_desc, layout);
+  desc_set(grad_weight_contiguous, grad_weight_desc, layout);
+  desc_set(grad_mask_contiguous, grad_mask_desc, layout);
   // set dcn descriptor
   dcn_desc.set(input.dim(), padding, stride, dilation, deformable_group, group,
                im2col_step, MLUOP_DTYPE_FLOAT);
+  // set onchip dtype
+  mluOpSetTensorDescriptorOnchipDataType(grad_input_desc.desc(), getMluOpDataType(grad_input.dtype()));
+  //mluOpSetTensorDescriptorOnchipDataType(offset_desc.desc(), getMluOpDataType(offset.dtype()));
+  mluOpSetTensorDescriptorOnchipDataType(input_desc.desc(), getMluOpDataType(input.dtype()));
+  mluOpSetTensorDescriptorOnchipDataType(weight_desc.desc(), getMluOpDataType(weight.dtype()));
   // set ptrs
   auto grad_ptr = torch_mlu::mlu_data_ptr(grad_impl);
   auto input_ptr = torch_mlu::mlu_data_ptr(input_impl);
@@ -187,12 +214,7 @@ void modulated_deform_conv_backward_mlu(
       /* grad_mask_desc */ grad_mask_desc.desc(),
       /* workspace_size */ &data_workspace_size));
   // mallc data workspace mlu memory
-  void* data_workspace_ptr = nullptr;
-  at::Tensor data_workspace;
-  data_workspace = at::empty(data_workspace_size,
-                             input.options().dtype(at::ScalarType::Char));
-  auto data_workspace_impl = torch_mlu::getMluTensorImpl(data_workspace);
-  data_workspace_ptr = torch_mlu::mlu_data_ptr(data_workspace_impl);
+  auto data_workspace_ptr = torch_mlu::MLUCachingAllocator::get()->allocate(data_workspace_size);
   TORCH_MLUOP_CHECK(mluOpDCNBackwardData(
       /* handle */ handle,
       /* dcn_desc */ dcn_desc.desc(),
@@ -206,7 +228,7 @@ void modulated_deform_conv_backward_mlu(
       /* weight_ptr */ weight_ptr,
       /* grad_output_desc */ grad_desc.desc(),
       /* grad_output_ptr */ grad_ptr,
-      /* workspace_ptr */ data_workspace_ptr,
+      /* workspace_ptr */ data_workspace_ptr.get(),
       /* workspace_size */ data_workspace_size,
      /* grad_input_desc */ grad_input_desc.desc(),
       /* grad_input_ptr */ grad_input_ptr,
@@ -227,12 +249,7 @@ void modulated_deform_conv_backward_mlu(
       /* grad_bias_desc */ grad_bias_desc_,
       /* workspace_size */ &weight_workspace_size));
   // malloc weight workspace mlu memory
-  void* weight_workspace_ptr = nullptr;
-  at::Tensor weight_workspace;
-  weight_workspace = at::empty(weight_workspace_size,
-                               input.options().dtype(at::ScalarType::Char));
-  auto weight_workspace_impl = torch_mlu::getMluTensorImpl(weight_workspace);
-  weight_workspace_ptr = torch_mlu::mlu_data_ptr(weight_workspace_impl);
+  auto weight_workspace_ptr = torch_mlu::MLUCachingAllocator::get()->allocate(weight_workspace_size);
   TORCH_MLUOP_CHECK(mluOpDCNBackwardWeight(
       /* handle */ handle,
       /* dcn_desc */ dcn_desc.desc(),
@@ -244,12 +261,25 @@ void modulated_deform_conv_backward_mlu(
       /* mask_ptr */ mask_ptr,
       /* grad_output_desc */ grad_desc.desc(),
       /* grad_output_ptr */ grad_ptr,
-      /* workspace */ weight_workspace_ptr,
+      /* workspace */ weight_workspace_ptr.get(),
       /* workspace_size */ weight_workspace_size,
       /* grad_weight_desc */ grad_weight_desc.desc(),
       /* grad_weigth_ptr */ grad_weight_ptr,
       /* grad_bias_desc */ grad_bias_desc_,
       /* grad_bias_ptr */ grad_bias_ptr));
+  if (is_copy_necessary(grad_input, grad_input_contiguous)) {
+    grad_input.copy_(grad_input_contiguous);
+  }
+  if (is_copy_necessary(grad_offset, grad_offset_contiguous)) {
+    grad_offset.copy_(grad_offset_contiguous);
+  }
+  if (is_copy_necessary(grad_weight, grad_weight_contiguous)) {
+    grad_weight.copy_(grad_weight_contiguous);
+  }
+  if (is_copy_necessary(grad_mask, grad_mask_contiguous)) {
+    grad_mask.copy_(grad_mask_contiguous);
+  }
+
 }
 
 void modulated_deform_conv_forward_impl(
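
For illustration only (not part of the diff): the descriptor-preparation path above composes three of the new helpers — modify_dims_based_on_layout to permute NCHW sizes into NHWC order, get_contiguous_strides to derive dense strides for that order, and the explicit shape/stride MluOpTensorDescriptor::set overload. A minimal sketch, assuming a 4-D tensor that has already been made channels-last via torch_mlu::cnnl_contiguous; the helper name describe_nhwc is hypothetical:

    // Sketch only: mirrors the desc_set lambda used in both forward and backward.
    void describe_nhwc(const at::Tensor& t, MluOpTensorDescriptor& t_desc) {
      // NCHW dims -> NHWC order expected by the mluOp DCN kernels.
      auto shape_vec = modify_dims_based_on_layout(
          t.sizes().vec(), c10::MemoryFormat::ChannelsLast);
      // Dense (contiguous) strides for the permuted shape.
      auto stride_vec = get_contiguous_strides(shape_vec);
      // Explicit shape/stride overload added to MluOpTensorDescriptor in this patch.
      t_desc.set(t, shape_vec, stride_vec, MLUOP_LAYOUT_NHWC);
    }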