[CPU] [ARM] INT8: FullyConnected
eshoguli committed Jun 3, 2024
1 parent fe33b10 · commit 46e41b5
Showing 11 changed files with 49 additions and 7 deletions.

@@ -277,6 +277,8 @@ std::tuple<std::shared_ptr<Node>, std::shared_ptr<Node>> decomposeFakeQuantize(
 } // namespace fq_decomposition
 
 bool FakeQuantizeDecompositionTransformation::transform(TransformationContext& context, ov::pass::pattern::Matcher& m) {
+    //return false;
+
     auto node = ov::as_type_ptr<opset1::FakeQuantize>(m.get_match_root());
     if (!node || !NetworkHelper::isQuantizeSupported(node)) {
         return false;

@@ -14,7 +14,12 @@ bool ACLCommonExecutor::update(const MemoryArgs &memory) {
     std::unordered_map<int, arm_compute::DataType> acl_tensors_types_list;
     std::unordered_map<int, arm_compute::DataLayout> acl_tensors_layouts_list;
     for (auto& cpu_mem_ptr : memory) {
-        acl_tensors_types_list[cpu_mem_ptr.first] = precisionToAclDataType(cpu_mem_ptr.second->getPrecision());
+        // TODO: workaround: refactor
+        auto aclPrecision = precisionToAclDataType(cpu_mem_ptr.second->getPrecision());
+        if (aclPrecision == arm_compute::DataType::S8) {
+            aclPrecision = arm_compute::DataType::QASYMM8_SIGNED;
+        }
+        acl_tensors_types_list[cpu_mem_ptr.first] = aclPrecision;
        acl_tensors_layouts_list[cpu_mem_ptr.first] = getAclDataLayoutByMemoryDesc(cpu_mem_ptr.second->getDescPtr());
     }
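
The S8-to-QASYMM8_SIGNED rewrite above reflects how Arm Compute Library models types: plain S8 and quantized QASYMM8_SIGNED are distinct DataType values, and ACL's quantized kernels (NEFullyConnectedLayer included) validate against the quantized types rather than plain S8. A minimal sketch of the mapping, with toAclQuantizedType being an illustrative helper name, not code from this commit:

    #include <arm_compute/core/Types.h>

    // Map plain 8-bit integer types to their quantized ACL counterparts so
    // that quantized kernels accept the tensors; pass every other type through.
    inline arm_compute::DataType toAclQuantizedType(arm_compute::DataType type) {
        switch (type) {
        case arm_compute::DataType::S8:
            return arm_compute::DataType::QASYMM8_SIGNED;
        case arm_compute::DataType::U8:
            return arm_compute::DataType::QASYMM8;
        default:
            return type;
        }
    }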

@@ -11,7 +11,10 @@
 namespace ov {
 namespace intel_cpu {
 
+// TODO: add namespace 'acl'
+// TODO: rename: ACLMemoryArgs => Tensors
 using ACLMemoryArgs = std::unordered_map<int, std::shared_ptr<arm_compute::Tensor>>;
+// TODO: rename: ACLMemoryInfoArgs => TensorInfos
 using ACLMemoryInfoArgs = std::unordered_map<int, std::shared_ptr<arm_compute::TensorInfo>>;
 using ACLFunction = std::unique_ptr<arm_compute::IFunction>;

@@ -69,7 +69,7 @@ void ACLFullyConnectedExecutor::prepareTensorsInfo() {
         aclMemoryInfoArgs.at(ARG_DST)->set_tensor_shape({wei_shape[1], src_shape[1]});
     }
 
-    auto expected_weight_format = arm_compute::WeightFormat::ANY;
+    auto expected_weight_format = arm_compute::WeightFormat::UNSPECIFIED;
     weightsInfo = arm_compute::WeightsInfo(false, 1, 1,
                                            aclMemoryInfoArgs.at(ARG_WEI)->tensor_shape().total_size(),
                                            false, expected_weight_format);
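
For context, the two weight-format values differ roughly as follows; this is a summary of ACL's fixed-format weights mechanism and an assumption on my part, not something stated in the commit:

    // arm_compute::WeightFormat, as consumed by WeightsInfo:
    //   ANY         - ask ACL to pick and report an optimal fixed (blocked)
    //                 weight format for the kernel it selects
    //   UNSPECIFIED - opt out of fixed-format weight kernels; the weights
    //                 are consumed in the layout they are given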

@@ -102,6 +102,22 @@ void ACLFullyConnectedExecutor::prepareTensorsInfo() {
 
 void ACLFullyConnectedExecutor::configureFunction() {
     iFunction = std::make_unique<arm_compute::NEFullyConnectedLayer>();
+
+    // TODO: workaround: refactor:
+    // * move to ACLFullyConnectedExecutor
+    // * generalize
+    const auto src_tensor = aclMemoryArgs.at(ARG_SRC).get();
+    src_tensor->info()->set_quantization_info(arm_compute::QuantizationInfo(4.0f / 255.f, 0));
+
+    const auto weights_tensor = aclMemoryArgs.at(ARG_WEI).get();
+    weights_tensor->info()->set_quantization_info(arm_compute::QuantizationInfo(8.0f / 255.f, 0));
+
+    const auto dst_tensor = aclMemoryArgs.at(ARG_DST).get();
+    // without snippets: dequantization is fused but not used
+    dst_tensor->info()->set_quantization_info(arm_compute::QuantizationInfo(32.f / 255.f, 0));
+    // incorrect behaviour: with snippets
+    //dst_tensor->info()->set_quantization_info(arm_compute::QuantizationInfo(1.f, 0));
+
     reinterpret_cast<arm_compute::NEFullyConnectedLayer*>(iFunction.get())->configure(
         aclMemoryArgs.at(ARG_SRC).get(),
         aclMemoryArgs.at(ARG_WEI).get(),
@@ -111,5 +127,10 @@ void ACLFullyConnectedExecutor::configureFunction() {
         weightsInfo);
 }
 
+// TODO: empty method: remove
+bool ACLFullyConnectedExecutor::update(const MemoryArgs &memory) {
+    return ACLCommonExecutor::update(memory);
+}
+
 } // namespace intel_cpu
 } // namespace ov
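
For context on the QuantizationInfo constants hardcoded in configureFunction() above: ACL attaches a (scale, zero_point) pair to each tensor, and in a quantized fully connected layer the int8 src and weights produce an int32 accumulator whose effective scale is src_scale * wei_scale, which ACL then requantizes to the destination scale. A self-contained sketch of that arithmetic using the constants from the patch; illustrative, not part of the commit:

    #include <cstdio>

    int main() {
        // Per-tensor scales as hardcoded in configureFunction().
        const float src_scale = 4.0f / 255.f;
        const float wei_scale = 8.0f / 255.f;
        const float dst_scale = 32.f / 255.f;

        // The int32 accumulator of an int8 GEMM carries scale src * wei;
        // requantizing to the destination multiplies by this factor.
        const float multiplier = (src_scale * wei_scale) / dst_scale;
        std::printf("requantization multiplier: %f\n", multiplier); // ~0.003922 (= 1/255)
        return 0;
    }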

@@ -26,6 +26,8 @@ class ACLFullyConnectedExecutor : public ACLCommonExecutor {
     impl_desc_type implType() const override {
         return impl_desc_type::gemm_acl;
     }
+
+    bool update(const MemoryArgs& memory) override;
 private:
     arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo;
     arm_compute::WeightsInfo weightsInfo;

@@ -78,6 +78,7 @@ static const TypeMapping dnnlFCTypeMapping {
 static const TypeMapping aclFCTypeMapping {
     // {src, wei, bia, dst}             pt<src, wei, bias, dst>
     {{_f32 | _f16, _any, _any, _any},   pt(bypass(), use<0>(), use<0>(), use<0>())},
+    {{_i8, _i8, _any, _any},            pt(just<i8>(), just<i8>(), bypass(), just<i32>())}
 };
 
 static const MappingNotation dnnlConvolutionMappingNotation {

@@ -272,15 +272,20 @@ void Transformations::UpToLpt() {
     };
 
     const bool useLpt = enableLpt &&
+        //false &&
        LowPrecision::isFunctionQuantized(model, supported_fq_levels) &&
        CPU_DEBUG_CAP_IS_TRANSFORMATION_ENABLED(config.debugCaps, Lpt);
 
     const auto defaultPrecisions = useLpt ? precision_set::get_int8_support() : std::vector<ov::element::Type>{};
 
     PreLpt(defaultPrecisions);
 
+    if (useLpt)
+        ov::pass::Serialize("report/graphs/cpu.pre_lpt.xml", "report/graphs/cpu.pre_lpt.bin").run_on_model(model);
+
     if (useLpt) {
         Lpt(defaultPrecisions);
+        ov::pass::Serialize("report/graphs/cpu.lpt.xml", "report/graphs/cpu.lpt.bin").run_on_model(model);
     }
 }
 
 void Transformations::SetSubStreamNum(int SubStreams) {
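
The two ov::pass::Serialize calls added above dump the model as OpenVINO IR (an .xml topology plus a .bin weights file) before and after LPT, which makes the effect of the low-precision pipeline inspectable on disk. A minimal standalone usage sketch, with the dump() helper name and path scheme being illustrative:

    #include <memory>
    #include <string>
    #include <openvino/core/model.hpp>
    #include <openvino/pass/serialize.hpp>

    // Write `model` to <tag>.xml / <tag>.bin for offline inspection.
    void dump(const std::shared_ptr<ov::Model>& model, const std::string& tag) {
        ov::pass::Serialize(tag + ".xml", tag + ".bin").run_on_model(model);
    }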
@@ -800,6 +805,7 @@ void Transformations::PostLpt() {
 }
 
 void Transformations::MainSnippets(void) {
+    //return;
     auto is_supported_isa = [](){
 #if defined(OPENVINO_ARCH_X86_64)
         return dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2);

@@ -396,14 +396,14 @@ std::vector<std::string> disabledTestPatterns() {
     retVector.emplace_back(R"(smoke_TestsDFT_(1|2|3|4)d/DFTLayerTest.Inference.*)");
     // Issue 88764, 91647, 108802: accuracy issue
     retVector.emplace_back(R"(MultipleLSTMCellTest/MultipleLSTMCellTest.CompareWithRefs.*)");
-    // int8 / code-generation specific
-    retVector.emplace_back(R"(smoke_LPT.*)");
     // Compressed weights are not supported
     retVector.emplace_back(R"(smoke_MatMulCompressedWeights.*)");
     retVector.emplace_back(R"(smoke_MatMulSharedCompressedWeights.*)");
     retVector.emplace_back(R"(smoke_MatmulAndGatherSharedWeightsDecompression.*)");
     // smoke_Snippets test cases are not supported on arm32 platforms
 #if !defined(OPENVINO_ARCH_ARM64)
+    // int8 / code-generation specific
+    retVector.emplace_back(R"(smoke_LPT.*)");
     retVector.emplace_back(R"(smoke_Snippets.*)");
 #endif
     // Issue: 126738

@@ -46,6 +46,8 @@ void FullyConnectedTransformation::SetUp() {
         shapes.inputB,
         shapes.transposeA,
         shapes.transposeB);
+
+    ov::pass::Serialize("report/graphs/test.original.xml", "test.original.bin").run_on_model(function);
 }
 
 TEST_P(FullyConnectedTransformation, CompareWithRefImpl) {

src/tests/ov_helpers/ov_lpt_models/src/mat_mul.cpp (2 changes: 1 addition & 1 deletion)
@@ -59,7 +59,7 @@ std::shared_ptr<ov::Model> MatMulFunction::getOriginal(
     const std::vector<size_t> constShapes(inputShape1.rank().get_length(), 1ul);
     const auto fakeQuantizeOnAcitvations = ov::test::utils::make_fake_quantize(
         paramNode, precision, 256ul, constShapes,
-        { 0.f }, { 255.f / 4.f }, { 0.f }, { 255.f / 4.f });
+        { -128.f / 4.f }, { 127.f / 4.f }, { -128.f / 4.f }, { 127.f / 4.f });
     fakeQuantizeOnAcitvations->set_friendly_name("fakeQuantizeOnAcitvations");
 
     auto weightsConst = std::make_shared<ov::op::v0::Constant>(
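
The new activation range pairs with the signed int8 path above: a 256-level FakeQuantize over [-128/4, 127/4] implies affine-quantization parameters that land exactly on the signed 8-bit grid, whereas the old [0, 255/4] range implied an unsigned one. A worked sketch of that formula; the framing is illustrative, not from the commit:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Affine quantization parameters implied by a FakeQuantize output range
    // [low, high] with `levels` quantization levels:
    //   scale      = (high - low) / (levels - 1)
    //   zero_point = round(-low / scale)   // on the unsigned [0, levels-1] grid
    int main() {
        const float low = -128.f / 4.f, high = 127.f / 4.f;
        const int levels = 256;
        const float scale = (high - low) / static_cast<float>(levels - 1);
        const int32_t zero_point = static_cast<int32_t>(std::lround(-low / scale));
        // Prints scale = 0.25, zero_point = 128, i.e. zero_point 0 after the
        // -128 shift onto the signed QASYMM8_SIGNED grid.
        std::printf("scale = %g, zero_point = %d\n", scale, zero_point);
        return 0;
    }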
