[luci-interpreter] Support Hybrid UINT4 weights FullyConnected kernel (Samsung#12811)

This commit adds support for UINT4 quantized weights in the FullyConnected kernel.

ONE-DCO-1.0-Signed-off-by: Vyacheslav Bazhenov <[email protected]>
SlavikMIPT authored Apr 2, 2024
1 parent cef6a8e commit 27e40aa
Showing 4 changed files with 274 additions and 3 deletions.
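
For context: a hybrid kernel keeps its weights quantized in memory and dequantizes them into a float scratch tensor at execute time, after which the regular float reference kernel runs. A minimal sketch of the affine dequantization rule the new code applies to UINT4 values (standalone and illustrative; dequantize_u4 is not a function in this codebase):

#include <cstddef>
#include <cstdint>
#include <vector>

// Affine dequantization: real = scale * (quantized - zero_point).
// UINT4 values and zero points both lie in [0, 15]; here, as in the tests
// below, each 4-bit value occupies its own byte rather than being packed.
std::vector<float> dequantize_u4(const std::vector<uint8_t> &q, float scale, int32_t zero_point)
{
  std::vector<float> out(q.size());
  for (size_t i = 0; i < q.size(); ++i)
    out[i] = scale * static_cast<float>(static_cast<int32_t>(q[i]) - zero_point);
  return out;
}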
86 changes: 84 additions & 2 deletions compiler/luci-interpreter/src/kernels/FullyConnected.cpp
@@ -61,6 +61,13 @@ void FullyConnected::configure()
LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::FLOAT32);
LUCI_INTERPRETER_CHECK(!bias() || bias()->element_type() == DataType::FLOAT32)
}
else if (weights()->element_type() == DataType::U4)
{
// TODO support other combinations when needed
LUCI_INTERPRETER_CHECK(input()->element_type() == DataType::FLOAT32);
LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::FLOAT32);
LUCI_INTERPRETER_CHECK(!bias() || bias()->element_type() == DataType::FLOAT32)
}
else
{
throw std::runtime_error("luci-intp FullyConnected(1) Unsupported type.");
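
The new branch admits exactly one dtype combination for U4 weights: FLOAT32 input and output, plus an optional FLOAT32 bias. The same rule as a standalone predicate (hypothetical helper that mirrors the checks, not code from the repository):

// Illustrative subset of the supported dtypes.
enum class DType { FLOAT32, S4, U4 };

// True iff the combination can take the hybrid path: float activations,
// 4-bit (signed or unsigned) weights, and a float bias when one exists.
bool is_hybrid_combo(DType input, DType weights, DType output, const DType *bias)
{
  const bool weights_ok = weights == DType::S4 || weights == DType::U4;
  return input == DType::FLOAT32 && output == DType::FLOAT32 && weights_ok &&
         (bias == nullptr || *bias == DType::FLOAT32);
}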
@@ -97,12 +104,23 @@ void FullyConnected::configure()
 void FullyConnected::execute() const
 {
   const bool is_hybrid =
-    (input()->element_type() == DataType::FLOAT32 && weights()->element_type() == DataType::S4 &&
+    (input()->element_type() == DataType::FLOAT32 &&
+     (weights()->element_type() == DataType::S4 || weights()->element_type() == DataType::U4) &&
      output()->element_type() == DataType::FLOAT32 &&
      (!bias() || bias()->element_type() == DataType::FLOAT32));
   if (is_hybrid)
   {
-    evalHybridWI4AF32();
+    switch (weights()->element_type())
+    {
+      case DataType::S4:
+        evalHybridWI4AF32();
+        break;
+      case DataType::U4:
+        evalHybridWU4AF32();
+        break;
+      default:
+        throw std::runtime_error("luci-intp FullyConnected(3) Unsupported type.");
+    }
   }
   else
   {
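
Note that the default case in the switch above is unreachable whenever is_hybrid holds, since the predicate already restricts the weight type to S4 or U4; it presumably serves as a defensive guard so the dispatch stays total if another weight dtype is added later.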
@@ -231,5 +249,69 @@ void FullyConnected::evalHybridWI4AF32() const
getTensorShape(output()), getTensorData<float>(output()));
}

void FullyConnected::evalHybridWU4AF32() const
{
float activation_min{};
float activation_max{};
calculateActivationRange(_params.activation, &activation_min, &activation_max);

tflite::FullyConnectedParams params{};
params.float_activation_min = activation_min;
params.float_activation_max = activation_max;
params.weights_format = tflite::FullyConnectedWeightsFormat::kDefault;

const auto *weights_uint4 = getTensorData<uint8_t>(weights());
auto *weights_float = getTensorData<float>(scratch());
const Shape &weights_shape = weights()->shape();
const auto weights_scales = weights()->scales();
const auto weights_zero_points = weights()->zero_points();
const auto weights_quantized_dimension = weights()->quantized_dimension();
LUCI_INTERPRETER_CHECK(weights_quantized_dimension == 0);
if (weights_scales.size() == 1)
{
// Per tensor
const auto scale = weights()->scale();
const auto zero_point = weights()->zero_point();
LUCI_INTERPRETER_CHECK(zero_point >= 0 and zero_point <= 15);
for (int32_t i = 0; i < weights_shape.num_elements(); ++i)
{
weights_float[i] =
scale * static_cast<float>(static_cast<int32_t>(weights_uint4[i]) - zero_point);
}
}
else
{
// Per channel
const int32_t quant_dim_size = weights_shape.dim(weights_quantized_dimension);

size_t outer_dims_size = 1;
size_t inner_dims_size = 1;
for (int i = 0; i < weights_quantized_dimension; ++i)
outer_dims_size *= weights_shape.dim(i);
for (int i = weights_quantized_dimension + 1; i < weights_shape.num_dims(); ++i)
inner_dims_size *= weights_shape.dim(i);

for (size_t outer_it = 0; outer_it < outer_dims_size; ++outer_it)
for (int32_t channel = 0; channel < quant_dim_size; ++channel)
{
int32_t zero_point = weights_zero_points[channel];
LUCI_INTERPRETER_CHECK(zero_point >= 0 and zero_point <= 15);
float scale = weights_scales[channel];
size_t offset = inner_dims_size * (quant_dim_size * outer_it + channel);
for (size_t inner_it = 0; inner_it < inner_dims_size; ++inner_it)
{
weights_float[offset + inner_it] =
scale *
static_cast<float>(static_cast<int32_t>(weights_uint4[offset + inner_it]) - zero_point);
}
}
}

tflite::reference_ops::FullyConnected(
params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(scratch()),
getTensorData<float>(scratch()), getTensorShape(bias()), getTensorData<float>(bias()),
getTensorShape(output()), getTensorData<float>(output()));
}

} // namespace kernels
} // namespace luci_interpreter
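
The per-channel branch of evalHybridWU4AF32 above treats the weight tensor as outer x channel x inner around the quantized dimension, so inner_dims_size * (quant_dim_size * outer_it + channel) is just the row-major offset of the first element of a channel slice (and since the kernel checks quantized_dimension == 0, outer_dims_size is always 1 in practice). The same traversal as a standalone sketch, with illustrative names rather than the project's API:

#include <cstddef>
#include <cstdint>
#include <vector>

// Dequantize per-channel along quant_dim of a row-major tensor with shape dims.
std::vector<float> dequantize_u4_per_channel(const std::vector<uint8_t> &q,
                                             const std::vector<int32_t> &dims, int quant_dim,
                                             const std::vector<float> &scales,
                                             const std::vector<int32_t> &zero_points)
{
  size_t outer_dims_size = 1, inner_dims_size = 1;
  for (int i = 0; i < quant_dim; ++i)
    outer_dims_size *= dims[i];
  for (size_t i = quant_dim + 1; i < dims.size(); ++i)
    inner_dims_size *= dims[i];
  const int32_t quant_dim_size = dims[quant_dim];

  std::vector<float> out(q.size());
  for (size_t outer_it = 0; outer_it < outer_dims_size; ++outer_it)
    for (int32_t channel = 0; channel < quant_dim_size; ++channel)
    {
      // Row-major offset of the first element belonging to this channel.
      const size_t offset = inner_dims_size * (quant_dim_size * outer_it + channel);
      for (size_t inner_it = 0; inner_it < inner_dims_size; ++inner_it)
        out[offset + inner_it] = scales[channel] *
                                 static_cast<float>(static_cast<int32_t>(q[offset + inner_it]) -
                                                    zero_points[channel]);
    }
  return out;
}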
1 change: 1 addition & 0 deletions compiler/luci-interpreter/src/kernels/FullyConnected.h
@@ -50,6 +50,7 @@ class FullyConnected : public KernelWithParams<FullyConnectedParams>
void evalQuantized() const;
void evalQuantizedS8() const;
void evalHybridWI4AF32() const;
void evalHybridWU4AF32() const;
Tensor *_scratch = nullptr;
};

187 changes: 187 additions & 0 deletions compiler/luci-interpreter/src/kernels/FullyConnected.test.cpp
@@ -204,6 +204,193 @@ TEST(FullyConnectedTest, SimpleS4)
FloatArrayNear(output_data, quantized_tolerance));
}

TEST(FullyConnectedTest, SimpleU4PerTensor)
{
std::initializer_list<int32_t> input_shape{1, 2};
std::initializer_list<int32_t> weights_shape{4, 2};
std::initializer_list<int32_t> bias_shape{4};
std::initializer_list<int32_t> output_shape{1, 4};
std::initializer_list<float> input_data{
1, 3, // batch = 0
};
std::initializer_list<uint8_t> weights_initializer{
8, 9, // unit = 0
8, 8, // unit = 1
7, 7, // unit = 2
8, 8, // unit = 3
};
std::initializer_list<float> bias_data{0, 1, 2, 3};
std::initializer_list<float> output_data{
1.5, 1, 0, 3, // batch = 0
};
std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();

Tensor input_tensor =
makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
std::vector<uint8_t> quantized_data(weights_initializer);
Tensor weights_tensor(DataType::U4, weights_shape, {{0.5}, {8}}, "");
memory_manager->allocate_memory(weights_tensor);
weights_tensor.writeData(quantized_data.data(), quantized_data.size() * sizeof(uint8_t));
Tensor weights_scratch(DataType::FLOAT32, weights_shape, {}, "");
memory_manager->allocate_memory(weights_scratch);

Tensor bias_tensor =
makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, memory_manager.get());
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);

const float quantized_tolerance = getTolerance(0, 15, 15);

FullyConnectedParams params{};
params.activation = Activation::RELU;

FullyConnected kernel(&input_tensor, &weights_tensor, &bias_tensor, &output_tensor,
&weights_scratch, params);
kernel.configure();
memory_manager->allocate_memory(output_tensor);
kernel.execute();

EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
EXPECT_THAT(extractTensorData<float>(output_tensor),
FloatArrayNear(output_data, quantized_tolerance));
}
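
To see where the expected outputs come from: with scale 0.5 and zero point 8, the four weight rows dequantize to {0, 0.5}, {0, 0}, {-0.5, -0.5}, and {0, 0}; taking ReLU of each row's dot product with input {1, 3} plus its bias yields {1.5, 1, 0, 3}. A standalone hand check with the same numbers (hypothetical code, not part of the test suite):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main()
{
  const uint8_t w_q[4][2] = {{8, 9}, {8, 8}, {7, 7}, {8, 8}};
  const float scale = 0.5f;
  const int32_t zp = 8;
  const float input[2] = {1, 3};
  const float bias[4] = {0, 1, 2, 3};
  for (int unit = 0; unit < 4; ++unit)
  {
    float acc = bias[unit];
    for (int i = 0; i < 2; ++i)
      acc += input[i] * scale * static_cast<float>(static_cast<int32_t>(w_q[unit][i]) - zp);
    std::printf("%g ", std::max(acc, 0.0f)); // RELU; prints: 1.5 1 0 3
  }
}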

TEST(FullyConnectedTest, SimpleU4PerChannel)
{
std::initializer_list<int32_t> input_shape{1, 2};
std::initializer_list<int32_t> weights_shape{4, 2};
std::initializer_list<int32_t> bias_shape{4};
std::initializer_list<int32_t> output_shape{1, 4};
std::initializer_list<float> input_data{
1, 3, // batch = 0
};
std::initializer_list<uint8_t> weights_initializer{
8, 9, // unit = 0
8, 8, // unit = 1
7, 7, // unit = 2
8, 8, // unit = 3
};
std::initializer_list<float> bias_data{0, 1, 2, 3};
std::initializer_list<float> output_data{
1.5, 1, 0, 3, // batch = 0
};
std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();

Tensor input_tensor =
makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
std::vector<uint8_t> quantized_data(weights_initializer);
Tensor weights_tensor(DataType::U4, weights_shape, {{0.5, 0.5, 0.5, 0.5}, {8, 8, 8, 8}, 0}, "");
memory_manager->allocate_memory(weights_tensor);
weights_tensor.writeData(quantized_data.data(), quantized_data.size() * sizeof(uint8_t));
Tensor weights_scratch(DataType::FLOAT32, weights_shape, {}, "");
memory_manager->allocate_memory(weights_scratch);

Tensor bias_tensor =
makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, memory_manager.get());
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);

const float quantized_tolerance = getTolerance(0, 15, 15);

FullyConnectedParams params{};
params.activation = Activation::RELU;

FullyConnected kernel(&input_tensor, &weights_tensor, &bias_tensor, &output_tensor,
&weights_scratch, params);
kernel.configure();
memory_manager->allocate_memory(output_tensor);
kernel.execute();

EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
EXPECT_THAT(extractTensorData<float>(output_tensor),
FloatArrayNear(output_data, quantized_tolerance));
}
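
Note that every channel here uses the same scale (0.5) and zero point (8) as the per-tensor test above, so the expected outputs are identical; the test's value is that it drives the per-channel dequantization loop against a known result.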

TEST(FullyConnectedTest, SimpleU4WrongBiasType_NEG)
{
std::initializer_list<int32_t> input_shape{1, 2};
std::initializer_list<int32_t> weights_shape{4, 2};
std::initializer_list<int32_t> bias_shape{4};
std::initializer_list<int32_t> output_shape{1, 4};
std::initializer_list<float> input_data{
1, 3, // batch = 0
};
std::initializer_list<uint8_t> weights_initializer{
8, 9, // unit = 0
8, 8, // unit = 1
7, 7, // unit = 2
8, 8, // unit = 3
};
std::initializer_list<uint8_t> bias_data{0, 1, 2, 3};
std::initializer_list<float> output_data{
1.5, 1, 0, 3, // batch = 0
};
std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();

Tensor input_tensor =
makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
std::vector<uint8_t> quantized_data(weights_initializer);
Tensor weights_tensor(DataType::U4, weights_shape, {{0.5, 0.5}, {8, 8}, 1}, "");
memory_manager->allocate_memory(weights_tensor);
weights_tensor.writeData(quantized_data.data(), quantized_data.size() * sizeof(uint8_t));
Tensor weights_scratch(DataType::FLOAT32, weights_shape, {}, "");
memory_manager->allocate_memory(weights_scratch);

Tensor bias_tensor = makeInputTensor<DataType::U8>(bias_shape, bias_data, memory_manager.get());
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);

const float quantized_tolerance = getTolerance(0, 15, 15);

FullyConnectedParams params{};
params.activation = Activation::RELU;

FullyConnected kernel(&input_tensor, &weights_tensor, &bias_tensor, &output_tensor,
&weights_scratch, params);
EXPECT_ANY_THROW(kernel.configure());
}

TEST(FullyConnectedTest, SimpleU4WrongInputType_NEG)
{
std::initializer_list<int32_t> input_shape{1, 2};
std::initializer_list<int32_t> weights_shape{4, 2};
std::initializer_list<int32_t> bias_shape{4};
std::initializer_list<int32_t> output_shape{1, 4};
std::initializer_list<uint8_t> input_data{
1, 3, // batch = 0
};
std::initializer_list<uint8_t> weights_initializer{
8, 9, // unit = 0
8, 8, // unit = 1
7, 7, // unit = 2
8, 8, // unit = 3
};
std::initializer_list<float> bias_data{0, 1, 2, 3};
std::initializer_list<float> output_data{
1.5, 1, 0, 3, // batch = 0
};
std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();

Tensor input_tensor =
makeInputTensor<DataType::U8>(input_shape, input_data, memory_manager.get());
std::vector<uint8_t> quantized_data(weights_initializer);
Tensor weights_tensor(DataType::U4, weights_shape, {{0.5, 0.5}, {8, 8}, 1}, "");
memory_manager->allocate_memory(weights_tensor);
weights_tensor.writeData(quantized_data.data(), quantized_data.size() * sizeof(uint8_t));
Tensor weights_scratch(DataType::FLOAT32, weights_shape, {}, "");
memory_manager->allocate_memory(weights_scratch);

Tensor bias_tensor =
makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, memory_manager.get());
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);

const float quantized_tolerance = getTolerance(0, 15, 15);

FullyConnectedParams params{};
params.activation = Activation::RELU;

FullyConnected kernel(&input_tensor, &weights_tensor, &bias_tensor, &output_tensor,
&weights_scratch, params);
EXPECT_ANY_THROW(kernel.configure());
}

TEST(FullyConnectedTest, InvalidBiasType_NEG)
{
Shape input_shape{3, 2, 2, 1};
@@ -35,7 +35,8 @@ std::unique_ptr<Kernel> build_kernel_CircleFullyConnected(const luci::CircleNode
   FullyConnectedParams params{};
   params.activation = node->fusedActivationFunction();
   params.keep_num_dims = node->keep_num_dims();
-  if (weights->element_type() == loco::DataType::S4)
+  if (weights->element_type() == loco::DataType::S4 ||
+      weights->element_type() == loco::DataType::U4)
   {
     auto scratchpad =
       std::make_unique<Tensor>(input->element_type(), weights->shape(), AffineQuantization{}, "");
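
For both S4 and U4 weights, the builder pairs the kernel with a scratchpad that has the input's element type (float in the hybrid case) and the weights' shape; evalHybridWI4AF32 and evalHybridWU4AF32 dequantize into it so the float reference kernel can consume the weights unmodified. A back-of-envelope sizing helper (hypothetical; scratch needs one float per weight element regardless of how the 4-bit values are stored):

#include <cstddef>
#include <cstdint>
#include <vector>

// Scratch memory for a hybrid FullyConnected: one float per weight element.
size_t scratch_bytes(const std::vector<int32_t> &weight_dims)
{
  size_t elements = 1;
  for (int32_t d : weight_dims)
    elements *= static_cast<size_t>(d);
  return elements * sizeof(float); // e.g. a {4, 2} weight tensor needs 32 bytes
}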
