[CPU] [ARM] INT8: FullyConnected
eshoguli committed Jun 3, 2024
1 parent fe33b10 · commit 46e41b5
Showing 11 changed files with 49 additions and 7 deletions.

@@ -277,6 +277,8 @@ std::tuple<std::shared_ptr<Node>, std::shared_ptr<Node>> decomposeFakeQuantize(
 } // namespace fq_decomposition
 
 bool FakeQuantizeDecompositionTransformation::transform(TransformationContext& context, ov::pass::pattern::Matcher& m) {
+    //return false;
+
     auto node = ov::as_type_ptr<opset1::FakeQuantize>(m.get_match_root());
     if (!node || !NetworkHelper::isQuantizeSupported(node)) {
         return false;

@@ -14,7 +14,12 @@ bool ACLCommonExecutor::update(const MemoryArgs &memory) {
     std::unordered_map<int, arm_compute::DataType> acl_tensors_types_list;
     std::unordered_map<int, arm_compute::DataLayout> acl_tensors_layouts_list;
     for (auto& cpu_mem_ptr : memory) {
-        acl_tensors_types_list[cpu_mem_ptr.first] = precisionToAclDataType(cpu_mem_ptr.second->getPrecision());
+        // TODO: workaround: refactor
+        auto aclPrecision = precisionToAclDataType(cpu_mem_ptr.second->getPrecision());
+        if (aclPrecision == arm_compute::DataType::S8) {
+            aclPrecision = arm_compute::DataType::QASYMM8_SIGNED;
+        }
+        acl_tensors_types_list[cpu_mem_ptr.first] = aclPrecision;
        acl_tensors_layouts_list[cpu_mem_ptr.first] = getAclDataLayoutByMemoryDesc(cpu_mem_ptr.second->getDescPtr());
     }
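
The S8-to-QASYMM8_SIGNED rewrite above reflects how Arm Compute Library models types: plain S8 and quantized QASYMM8_SIGNED are distinct DataType values, and ACL's quantized kernels (NEFullyConnectedLayer included) validate against the quantized types rather than plain S8. A minimal sketch of the mapping, with toAclQuantizedType being an illustrative helper name, not code from this commit:

    #include <arm_compute/core/Types.h>

    // Map plain 8-bit integer types to their quantized ACL counterparts so
    // that quantized kernels accept the tensors; pass every other type through.
    inline arm_compute::DataType toAclQuantizedType(arm_compute::DataType type) {
        switch (type) {
        case arm_compute::DataType::S8:
            return arm_compute::DataType::QASYMM8_SIGNED;
        case arm_compute::DataType::U8:
            return arm_compute::DataType::QASYMM8;
        default:
            return type;
        }
    }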

@@ -11,7 +11,10 @@
 namespace ov {
 namespace intel_cpu {
 
+// TODO: add namespace 'acl'
+// TODO: rename: ACLMemoryArgs => Tensors
 using ACLMemoryArgs = std::unordered_map<int, std::shared_ptr<arm_compute::Tensor>>;
+// TODO: rename: ACLMemoryInfoArgs => TensorInfos
 using ACLMemoryInfoArgs = std::unordered_map<int, std::shared_ptr<arm_compute::TensorInfo>>;
 using ACLFunction = std::unique_ptr<arm_compute::IFunction>;

@@ -69,7 +69,7 @@ void ACLFullyConnectedExecutor::prepareTensorsInfo() {
         aclMemoryInfoArgs.at(ARG_DST)->set_tensor_shape({wei_shape[1], src_shape[1]});
     }
 
-    auto expected_weight_format = arm_compute::WeightFormat::ANY;
+    auto expected_weight_format = arm_compute::WeightFormat::UNSPECIFIED;
     weightsInfo = arm_compute::WeightsInfo(false, 1, 1,
                                            aclMemoryInfoArgs.at(ARG_WEI)->tensor_shape().total_size(),
                                            false, expected_weight_format);
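
For context, the two weight-format values differ roughly as follows; this is a summary of ACL's fixed-format weights mechanism and an assumption on my part, not something stated in the commit:

    // arm_compute::WeightFormat, as consumed by WeightsInfo:
    //   ANY         - ask ACL to pick and report an optimal fixed (blocked)
    //                 weight format for the kernel it selects
    //   UNSPECIFIED - opt out of fixed-format weight kernels; the weights
    //                 are consumed in the layout they are given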

@@ -102,6 +102,22 @@ void ACLFullyConnectedExecutor::prepareTensorsInfo() {
 
 void ACLFullyConnectedExecutor::configureFunction() {
     iFunction = std::make_unique<arm_compute::NEFullyConnectedLayer>();
+
+    // TODO: workaround: refactor:
+    // * move to ACLFullyConnectedExecutor
+    // * generalize
+    const auto src_tensor = aclMemoryArgs.at(ARG_SRC).get();
+    src_tensor->info()->set_quantization_info(arm_compute::QuantizationInfo(4.0f / 255.f, 0));
+
+    const auto weights_tensor = aclMemoryArgs.at(ARG_WEI).get();
+    weights_tensor->info()->set_quantization_info(arm_compute::QuantizationInfo(8.0f / 255.f, 0));
+
+    const auto dst_tensor = aclMemoryArgs.at(ARG_DST).get();
+    // without snippets: dequantization is fused but not used
+    dst_tensor->info()->set_quantization_info(arm_compute::QuantizationInfo(32.f / 255.f, 0));
+    // incorrect behaviour: with snippets
+    //dst_tensor->info()->set_quantization_info(arm_compute::QuantizationInfo(1.f, 0));
+
     reinterpret_cast<arm_compute::NEFullyConnectedLayer*>(iFunction.get())->configure(
         aclMemoryArgs.at(ARG_SRC).get(),
         aclMemoryArgs.at(ARG_WEI).get(),
@@ -111,5 +127,10 @@ void ACLFullyConnectedExecutor::configureFunction() {
         weightsInfo);
 }
 
+// TODO: empty method: remove
+bool ACLFullyConnectedExecutor::update(const MemoryArgs &memory) {
+    return ACLCommonExecutor::update(memory);
+}
+
 } // namespace intel_cpu
 } // namespace ov
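
For context on the QuantizationInfo constants hardcoded in configureFunction() above: ACL attaches a (scale, zero_point) pair to each tensor, and in a quantized fully connected layer the int8 src and weights produce an int32 accumulator whose effective scale is src_scale * wei_scale, which ACL then requantizes to the destination scale. A self-contained sketch of that arithmetic using the constants from the patch; illustrative, not part of the commit:

    #include <cstdio>

    int main() {
        // Per-tensor scales as hardcoded in configureFunction().
        const float src_scale = 4.0f / 255.f;
        const float wei_scale = 8.0f / 255.f;
        const float dst_scale = 32.f / 255.f;

        // The int32 accumulator of an int8 GEMM carries scale src * wei;
        // requantizing to the destination multiplies by this factor.
        const float multiplier = (src_scale * wei_scale) / dst_scale;
        std::printf("requantization multiplier: %f\n", multiplier); // ~0.003922 (= 1/255)
        return 0;
    }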

@@ -26,6 +26,8 @@ class ACLFullyConnectedExecutor : public ACLCommonExecutor {
     impl_desc_type implType() const override {
         return impl_desc_type::gemm_acl;
     }
+
+    bool update(const MemoryArgs& memory) override;
 private:
     arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo;
     arm_compute::WeightsInfo weightsInfo;

@@ -78,6 +78,7 @@ static const TypeMapping dnnlFCTypeMapping {
 static const TypeMapping aclFCTypeMapping {
     // {src, wei, bia, dst}             pt<src, wei, bias, dst>
     {{_f32 | _f16, _any, _any, _any},   pt(bypass(), use<0>(), use<0>(), use<0>())},
+    {{_i8, _i8, _any, _any},            pt(just<i8>(), just<i8>(), bypass(), just<i32>())}
 };
 
 static const MappingNotation dnnlConvolutionMappingNotation {

@@ -272,15 +272,20 @@ void Transformations::UpToLpt() {
     };
 
     const bool useLpt = enableLpt &&
+        //false &&
        LowPrecision::isFunctionQuantized(model, supported_fq_levels) &&
        CPU_DEBUG_CAP_IS_TRANSFORMATION_ENABLED(config.debugCaps, Lpt);
 
     const auto defaultPrecisions = useLpt ? precision_set::get_int8_support() : std::vector<ov::element::Type>{};
 
     PreLpt(defaultPrecisions);
 
+    if (useLpt)
+        ov::pass::Serialize("report/graphs/cpu.pre_lpt.xml", "report/graphs/cpu.pre_lpt.bin").run_on_model(model);
+
     if (useLpt) {
         Lpt(defaultPrecisions);
+        ov::pass::Serialize("report/graphs/cpu.lpt.xml", "report/graphs/cpu.lpt.bin").run_on_model(model);
     }
 }
 
 void Transformations::SetSubStreamNum(int SubStreams) {
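
The two ov::pass::Serialize calls added above dump the model as OpenVINO IR (an .xml topology plus a .bin weights file) before and after LPT, which makes the effect of the low-precision pipeline inspectable on disk. A minimal standalone usage sketch, with the dump() helper name and path scheme being illustrative:

    #include <memory>
    #include <string>
    #include <openvino/core/model.hpp>
    #include <openvino/pass/serialize.hpp>

    // Write `model` to <tag>.xml / <tag>.bin for offline inspection.
    void dump(const std::shared_ptr<ov::Model>& model, const std::string& tag) {
        ov::pass::Serialize(tag + ".xml", tag + ".bin").run_on_model(model);
    }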
@@ -800,6 +805,7 @@ void Transformations::PostLpt() {
 }
 
 void Transformations::MainSnippets(void) {
+    //return;
     auto is_supported_isa = [](){
 #if defined(OPENVINO_ARCH_X86_64)
         return dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2);

@@ -396,14 +396,14 @@ std::vector<std::string> disabledTestPatterns() {
     retVector.emplace_back(R"(smoke_TestsDFT_(1|2|3|4)d/DFTLayerTest.Inference.*)");
     // Issue 88764, 91647, 108802: accuracy issue
     retVector.emplace_back(R"(MultipleLSTMCellTest/MultipleLSTMCellTest.CompareWithRefs.*)");
-    // int8 / code-generation specific
-    retVector.emplace_back(R"(smoke_LPT.*)");
     // Compressed weights are not supported
     retVector.emplace_back(R"(smoke_MatMulCompressedWeights.*)");
     retVector.emplace_back(R"(smoke_MatMulSharedCompressedWeights.*)");
     retVector.emplace_back(R"(smoke_MatmulAndGatherSharedWeightsDecompression.*)");
     // smoke_Snippets test cases are not supported on arm32 platforms
 #if !defined(OPENVINO_ARCH_ARM64)
+    // int8 / code-generation specific
+    retVector.emplace_back(R"(smoke_LPT.*)");
     retVector.emplace_back(R"(smoke_Snippets.*)");
 #endif
     // Issue: 126738

@@ -46,6 +46,8 @@ void FullyConnectedTransformation::SetUp() {
         shapes.inputB,
         shapes.transposeA,
         shapes.transposeB);
+
+    ov::pass::Serialize("report/graphs/test.original.xml", "test.original.bin").run_on_model(function);
 }
 
 TEST_P(FullyConnectedTransformation, CompareWithRefImpl) {

src/tests/ov_helpers/ov_lpt_models/src/mat_mul.cpp (2 changes: 1 addition & 1 deletion)
@@ -59,7 +59,7 @@ std::shared_ptr<ov::Model> MatMulFunction::getOriginal(
     const std::vector<size_t> constShapes(inputShape1.rank().get_length(), 1ul);
     const auto fakeQuantizeOnAcitvations = ov::test::utils::make_fake_quantize(
         paramNode, precision, 256ul, constShapes,
-        { 0.f }, { 255.f / 4.f }, { 0.f }, { 255.f / 4.f });
+        { -128.f / 4.f }, { 127.f / 4.f }, { -128.f / 4.f }, { 127.f / 4.f });
     fakeQuantizeOnAcitvations->set_friendly_name("fakeQuantizeOnAcitvations");
 
     auto weightsConst = std::make_shared<ov::op::v0::Constant>(
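
The new activation range pairs with the signed int8 path above: a 256-level FakeQuantize over [-128/4, 127/4] implies affine-quantization parameters that land exactly on the signed 8-bit grid, whereas the old [0, 255/4] range implied an unsigned one. A worked sketch of that formula; the framing is illustrative, not from the commit:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Affine quantization parameters implied by a FakeQuantize output range
    // [low, high] with `levels` quantization levels:
    //   scale      = (high - low) / (levels - 1)
    //   zero_point = round(-low / scale)   // on the unsigned [0, levels-1] grid
    int main() {
        const float low = -128.f / 4.f, high = 127.f / 4.f;
        const int levels = 256;
        const float scale = (high - low) / static_cast<float>(levels - 1);
        const int32_t zero_point = static_cast<int32_t>(std::lround(-low / scale));
        // Prints scale = 0.25, zero_point = 128, i.e. zero_point 0 after the
        // -128 shift onto the signed QASYMM8_SIGNED grid.
        std::printf("scale = %g, zero_point = %d\n", scale, zero_point);
        return 0;
    }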
