[luci-interpreter] Support Hybrid UINT4 weights FullyConnected kernel (Samsung#12811)

This commit adds support for UINT4 quantized weights in the FullyConnected kernel.

ONE-DCO-1.0-Signed-off-by: Vyacheslav Bazhenov <[email protected]>
SlavikMIPT authored Apr 2, 2024
1 parent cef6a8e commit 27e40aa
Showing 4 changed files with 274 additions and 3 deletions.
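
For context: a hybrid kernel keeps its weights quantized in memory and dequantizes them into a float scratch tensor at execute time, after which the regular float reference kernel runs. A minimal sketch of the affine dequantization rule the new code applies to UINT4 values (standalone and illustrative; dequantize_u4 is not a function in this codebase):

#include <cstddef>
#include <cstdint>
#include <vector>

// Affine dequantization: real = scale * (quantized - zero_point).
// UINT4 values and zero points both lie in [0, 15]; here, as in the tests
// below, each 4-bit value occupies its own byte rather than being packed.
std::vector<float> dequantize_u4(const std::vector<uint8_t> &q, float scale, int32_t zero_point)
{
  std::vector<float> out(q.size());
  for (size_t i = 0; i < q.size(); ++i)
    out[i] = scale * static_cast<float>(static_cast<int32_t>(q[i]) - zero_point);
  return out;
}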
86 changes: 84 additions & 2 deletions compiler/luci-interpreter/src/kernels/FullyConnected.cpp
@@ -61,6 +61,13 @@ void FullyConnected::configure()
LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::FLOAT32);
LUCI_INTERPRETER_CHECK(!bias() || bias()->element_type() == DataType::FLOAT32)
}
else if (weights()->element_type() == DataType::U4)
{
// TODO support other combinations when needed
LUCI_INTERPRETER_CHECK(input()->element_type() == DataType::FLOAT32);
LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::FLOAT32);
LUCI_INTERPRETER_CHECK(!bias() || bias()->element_type() == DataType::FLOAT32)
}
else
{
throw std::runtime_error("luci-intp FullyConnected(1) Unsupported type.");
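
The new branch admits exactly one dtype combination for U4 weights: FLOAT32 input and output, plus an optional FLOAT32 bias. The same rule as a standalone predicate (hypothetical helper that mirrors the checks, not code from the repository):

// Illustrative subset of the supported dtypes.
enum class DType { FLOAT32, S4, U4 };

// True iff the combination can take the hybrid path: float activations,
// 4-bit (signed or unsigned) weights, and a float bias when one exists.
bool is_hybrid_combo(DType input, DType weights, DType output, const DType *bias)
{
  const bool weights_ok = weights == DType::S4 || weights == DType::U4;
  return input == DType::FLOAT32 && output == DType::FLOAT32 && weights_ok &&
         (bias == nullptr || *bias == DType::FLOAT32);
}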
@@ -97,12 +104,23 @@ void FullyConnected::configure()
 void FullyConnected::execute() const
 {
   const bool is_hybrid =
-    (input()->element_type() == DataType::FLOAT32 && weights()->element_type() == DataType::S4 &&
+    (input()->element_type() == DataType::FLOAT32 &&
+     (weights()->element_type() == DataType::S4 || weights()->element_type() == DataType::U4) &&
      output()->element_type() == DataType::FLOAT32 &&
      (!bias() || bias()->element_type() == DataType::FLOAT32));
   if (is_hybrid)
   {
-    evalHybridWI4AF32();
+    switch (weights()->element_type())
+    {
+      case DataType::S4:
+        evalHybridWI4AF32();
+        break;
+      case DataType::U4:
+        evalHybridWU4AF32();
+        break;
+      default:
+        throw std::runtime_error("luci-intp FullyConnected(3) Unsupported type.");
+    }
   }
   else
   {
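
Note that the default case in the switch above is unreachable whenever is_hybrid holds, since the predicate already restricts the weight type to S4 or U4; it presumably serves as a defensive guard so the dispatch stays total if another weight dtype is added later.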
@@ -231,5 +249,69 @@ void FullyConnected::evalHybridWI4AF32() const
getTensorShape(output()), getTensorData<float>(output()));
}

void FullyConnected::evalHybridWU4AF32() const
{
float activation_min{};
float activation_max{};
calculateActivationRange(_params.activation, &activation_min, &activation_max);

tflite::FullyConnectedParams params{};
params.float_activation_min = activation_min;
params.float_activation_max = activation_max;
params.weights_format = tflite::FullyConnectedWeightsFormat::kDefault;

const auto *weights_uint4 = getTensorData<uint8_t>(weights());
auto *weights_float = getTensorData<float>(scratch());
const Shape &weights_shape = weights()->shape();
const auto weights_scales = weights()->scales();
const auto weights_zero_points = weights()->zero_points();
const auto weights_quantized_dimension = weights()->quantized_dimension();
LUCI_INTERPRETER_CHECK(weights_quantized_dimension == 0);
if (weights_scales.size() == 1)
{
// Per tensor
const auto scale = weights()->scale();
const auto zero_point = weights()->zero_point();
LUCI_INTERPRETER_CHECK(zero_point >= 0 and zero_point <= 15);
for (int32_t i = 0; i < weights_shape.num_elements(); ++i)
{
weights_float[i] =
scale * static_cast<float>(static_cast<int32_t>(weights_uint4[i]) - zero_point);
}
}
else
{
// Per channel
const int32_t quant_dim_size = weights_shape.dim(weights_quantized_dimension);

size_t outer_dims_size = 1;
size_t inner_dims_size = 1;
for (int i = 0; i < weights_quantized_dimension; ++i)
outer_dims_size *= weights_shape.dim(i);
for (int i = weights_quantized_dimension + 1; i < weights_shape.num_dims(); ++i)
inner_dims_size *= weights_shape.dim(i);

for (size_t outer_it = 0; outer_it < outer_dims_size; ++outer_it)
for (int32_t channel = 0; channel < quant_dim_size; ++channel)
{
int32_t zero_point = weights_zero_points[channel];
LUCI_INTERPRETER_CHECK(zero_point >= 0 and zero_point <= 15);
float scale = weights_scales[channel];
size_t offset = inner_dims_size * (quant_dim_size * outer_it + channel);
for (size_t inner_it = 0; inner_it < inner_dims_size; ++inner_it)
{
weights_float[offset + inner_it] =
scale *
static_cast<float>(static_cast<int32_t>(weights_uint4[offset + inner_it]) - zero_point);
}
}
}

tflite::reference_ops::FullyConnected(
params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(scratch()),
getTensorData<float>(scratch()), getTensorShape(bias()), getTensorData<float>(bias()),
getTensorShape(output()), getTensorData<float>(output()));
}

} // namespace kernels
} // namespace luci_interpreter
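
The per-channel branch of evalHybridWU4AF32 above treats the weight tensor as outer x channel x inner around the quantized dimension, so inner_dims_size * (quant_dim_size * outer_it + channel) is just the row-major offset of the first element of a channel slice (and since the kernel checks quantized_dimension == 0, outer_dims_size is always 1 in practice). The same traversal as a standalone sketch, with illustrative names rather than the project's API:

#include <cstddef>
#include <cstdint>
#include <vector>

// Dequantize per-channel along quant_dim of a row-major tensor with shape dims.
std::vector<float> dequantize_u4_per_channel(const std::vector<uint8_t> &q,
                                             const std::vector<int32_t> &dims, int quant_dim,
                                             const std::vector<float> &scales,
                                             const std::vector<int32_t> &zero_points)
{
  size_t outer_dims_size = 1, inner_dims_size = 1;
  for (int i = 0; i < quant_dim; ++i)
    outer_dims_size *= dims[i];
  for (size_t i = quant_dim + 1; i < dims.size(); ++i)
    inner_dims_size *= dims[i];
  const int32_t quant_dim_size = dims[quant_dim];

  std::vector<float> out(q.size());
  for (size_t outer_it = 0; outer_it < outer_dims_size; ++outer_it)
    for (int32_t channel = 0; channel < quant_dim_size; ++channel)
    {
      // Row-major offset of the first element belonging to this channel.
      const size_t offset = inner_dims_size * (quant_dim_size * outer_it + channel);
      for (size_t inner_it = 0; inner_it < inner_dims_size; ++inner_it)
        out[offset + inner_it] = scales[channel] *
                                 static_cast<float>(static_cast<int32_t>(q[offset + inner_it]) -
                                                    zero_points[channel]);
    }
  return out;
}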
1 change: 1 addition & 0 deletions compiler/luci-interpreter/src/kernels/FullyConnected.h
@@ -50,6 +50,7 @@ class FullyConnected : public KernelWithParams<FullyConnectedParams>
void evalQuantized() const;
void evalQuantizedS8() const;
void evalHybridWI4AF32() const;
void evalHybridWU4AF32() const;
Tensor *_scratch = nullptr;
};

187 changes: 187 additions & 0 deletions compiler/luci-interpreter/src/kernels/FullyConnected.test.cpp
@@ -204,6 +204,193 @@ TEST(FullyConnectedTest, SimpleS4)
FloatArrayNear(output_data, quantized_tolerance));
}

TEST(FullyConnectedTest, SimpleU4PerTensor)
{
std::initializer_list<int32_t> input_shape{1, 2};
std::initializer_list<int32_t> weights_shape{4, 2};
std::initializer_list<int32_t> bias_shape{4};
std::initializer_list<int32_t> output_shape{1, 4};
std::initializer_list<float> input_data{
1, 3, // batch = 0
};
std::initializer_list<uint8_t> weights_initializer{
8, 9, // unit = 0
8, 8, // unit = 1
7, 7, // unit = 2
8, 8, // unit = 3
};
std::initializer_list<float> bias_data{0, 1, 2, 3};
std::initializer_list<float> output_data{
1.5, 1, 0, 3, // batch = 0
};
std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();

Tensor input_tensor =
makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
std::vector<uint8_t> quantized_data(weights_initializer);
Tensor weights_tensor(DataType::U4, weights_shape, {{0.5}, {8}}, "");
memory_manager->allocate_memory(weights_tensor);
weights_tensor.writeData(quantized_data.data(), quantized_data.size() * sizeof(uint8_t));
Tensor weights_scratch(DataType::FLOAT32, weights_shape, {}, "");
memory_manager->allocate_memory(weights_scratch);

Tensor bias_tensor =
makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, memory_manager.get());
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);

const float quantized_tolerance = getTolerance(0, 15, 15);

FullyConnectedParams params{};
params.activation = Activation::RELU;

FullyConnected kernel(&input_tensor, &weights_tensor, &bias_tensor, &output_tensor,
&weights_scratch, params);
kernel.configure();
memory_manager->allocate_memory(output_tensor);
kernel.execute();

EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
EXPECT_THAT(extractTensorData<float>(output_tensor),
FloatArrayNear(output_data, quantized_tolerance));
}
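
To see where the expected outputs come from: with scale 0.5 and zero point 8, the four weight rows dequantize to {0, 0.5}, {0, 0}, {-0.5, -0.5}, and {0, 0}; taking ReLU of each row's dot product with input {1, 3} plus its bias yields {1.5, 1, 0, 3}. A standalone hand check with the same numbers (hypothetical code, not part of the test suite):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main()
{
  const uint8_t w_q[4][2] = {{8, 9}, {8, 8}, {7, 7}, {8, 8}};
  const float scale = 0.5f;
  const int32_t zp = 8;
  const float input[2] = {1, 3};
  const float bias[4] = {0, 1, 2, 3};
  for (int unit = 0; unit < 4; ++unit)
  {
    float acc = bias[unit];
    for (int i = 0; i < 2; ++i)
      acc += input[i] * scale * static_cast<float>(static_cast<int32_t>(w_q[unit][i]) - zp);
    std::printf("%g ", std::max(acc, 0.0f)); // RELU; prints: 1.5 1 0 3
  }
}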

TEST(FullyConnectedTest, SimpleU4PerChannel)
{
std::initializer_list<int32_t> input_shape{1, 2};
std::initializer_list<int32_t> weights_shape{4, 2};
std::initializer_list<int32_t> bias_shape{4};
std::initializer_list<int32_t> output_shape{1, 4};
std::initializer_list<float> input_data{
1, 3, // batch = 0
};
std::initializer_list<uint8_t> weights_initializer{
8, 9, // unit = 0
8, 8, // unit = 1
7, 7, // unit = 2
8, 8, // unit = 3
};
std::initializer_list<float> bias_data{0, 1, 2, 3};
std::initializer_list<float> output_data{
1.5, 1, 0, 3, // batch = 0
};
std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();

Tensor input_tensor =
makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
std::vector<uint8_t> quantized_data(weights_initializer);
Tensor weights_tensor(DataType::U4, weights_shape, {{0.5, 0.5, 0.5, 0.5}, {8, 8, 8, 8}, 0}, "");
memory_manager->allocate_memory(weights_tensor);
weights_tensor.writeData(quantized_data.data(), quantized_data.size() * sizeof(uint8_t));
Tensor weights_scratch(DataType::FLOAT32, weights_shape, {}, "");
memory_manager->allocate_memory(weights_scratch);

Tensor bias_tensor =
makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, memory_manager.get());
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);

const float quantized_tolerance = getTolerance(0, 15, 15);

FullyConnectedParams params{};
params.activation = Activation::RELU;

FullyConnected kernel(&input_tensor, &weights_tensor, &bias_tensor, &output_tensor,
&weights_scratch, params);
kernel.configure();
memory_manager->allocate_memory(output_tensor);
kernel.execute();

EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
EXPECT_THAT(extractTensorData<float>(output_tensor),
FloatArrayNear(output_data, quantized_tolerance));
}
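
Note that every channel here uses the same scale (0.5) and zero point (8) as the per-tensor test above, so the expected outputs are identical; the test's value is that it drives the per-channel dequantization loop against a known result.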

TEST(FullyConnectedTest, SimpleU4WrongBiasType_NEG)
{
std::initializer_list<int32_t> input_shape{1, 2};
std::initializer_list<int32_t> weights_shape{4, 2};
std::initializer_list<int32_t> bias_shape{4};
std::initializer_list<int32_t> output_shape{1, 4};
std::initializer_list<float> input_data{
1, 3, // batch = 0
};
std::initializer_list<uint8_t> weights_initializer{
8, 9, // unit = 0
8, 8, // unit = 1
7, 7, // unit = 2
8, 8, // unit = 3
};
std::initializer_list<uint8_t> bias_data{0, 1, 2, 3};
std::initializer_list<float> output_data{
1.5, 1, 0, 3, // batch = 0
};
std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();

Tensor input_tensor =
makeInputTensor<DataType::FLOAT32>(input_shape, input_data, memory_manager.get());
std::vector<uint8_t> quantized_data(weights_initializer);
Tensor weights_tensor(DataType::U4, weights_shape, {{0.5, 0.5}, {8, 8}, 1}, "");
memory_manager->allocate_memory(weights_tensor);
weights_tensor.writeData(quantized_data.data(), quantized_data.size() * sizeof(uint8_t));
Tensor weights_scratch(DataType::FLOAT32, weights_shape, {}, "");
memory_manager->allocate_memory(weights_scratch);

Tensor bias_tensor = makeInputTensor<DataType::U8>(bias_shape, bias_data, memory_manager.get());
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);

const float quantized_tolerance = getTolerance(0, 15, 15);

FullyConnectedParams params{};
params.activation = Activation::RELU;

FullyConnected kernel(&input_tensor, &weights_tensor, &bias_tensor, &output_tensor,
&weights_scratch, params);
EXPECT_ANY_THROW(kernel.configure());
}

TEST(FullyConnectedTest, SimpleU4WrongInputType_NEG)
{
std::initializer_list<int32_t> input_shape{1, 2};
std::initializer_list<int32_t> weights_shape{4, 2};
std::initializer_list<int32_t> bias_shape{4};
std::initializer_list<int32_t> output_shape{1, 4};
std::initializer_list<uint8_t> input_data{
1, 3, // batch = 0
};
std::initializer_list<uint8_t> weights_initializer{
8, 9, // unit = 0
8, 8, // unit = 1
7, 7, // unit = 2
8, 8, // unit = 3
};
std::initializer_list<float> bias_data{0, 1, 2, 3};
std::initializer_list<float> output_data{
1.5, 1, 0, 3, // batch = 0
};
std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>();

Tensor input_tensor =
makeInputTensor<DataType::U8>(input_shape, input_data, memory_manager.get());
std::vector<uint8_t> quantized_data(weights_initializer);
Tensor weights_tensor(DataType::U4, weights_shape, {{0.5, 0.5}, {8, 8}, 1}, "");
memory_manager->allocate_memory(weights_tensor);
weights_tensor.writeData(quantized_data.data(), quantized_data.size() * sizeof(uint8_t));
Tensor weights_scratch(DataType::FLOAT32, weights_shape, {}, "");
memory_manager->allocate_memory(weights_scratch);

Tensor bias_tensor =
makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, memory_manager.get());
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);

const float quantized_tolerance = getTolerance(0, 15, 15);

FullyConnectedParams params{};
params.activation = Activation::RELU;

FullyConnected kernel(&input_tensor, &weights_tensor, &bias_tensor, &output_tensor,
&weights_scratch, params);
EXPECT_ANY_THROW(kernel.configure());
}

TEST(FullyConnectedTest, InvalidBiasType_NEG)
{
Shape input_shape{3, 2, 2, 1};
@@ -35,7 +35,8 @@ std::unique_ptr<Kernel> build_kernel_CircleFullyConnected(const luci::CircleNode
   FullyConnectedParams params{};
   params.activation = node->fusedActivationFunction();
   params.keep_num_dims = node->keep_num_dims();
-  if (weights->element_type() == loco::DataType::S4)
+  if (weights->element_type() == loco::DataType::S4 ||
+      weights->element_type() == loco::DataType::U4)
   {
     auto scratchpad =
       std::make_unique<Tensor>(input->element_type(), weights->shape(), AffineQuantization{}, "");
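
For both S4 and U4 weights, the builder pairs the kernel with a scratchpad that has the input's element type (float in the hybrid case) and the weights' shape; evalHybridWI4AF32 and evalHybridWU4AF32 dequantize into it so the float reference kernel can consume the weights unmodified. A back-of-envelope sizing helper (hypothetical; scratch needs one float per weight element regardless of how the 4-bit values are stored):

#include <cstddef>
#include <cstdint>
#include <vector>

// Scratch memory for a hybrid FullyConnected: one float per weight element.
size_t scratch_bytes(const std::vector<int32_t> &weight_dims)
{
  size_t elements = 1;
  for (int32_t d : weight_dims)
    elements *= static_cast<size_t>(d);
  return elements * sizeof(float); // e.g. a {4, 2} weight tensor needs 32 bytes
}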
