[CPU] [ARM] [INT8] FullyConnected #25171

Merged: 35 commits, Dec 18, 2024
Changes from 1 commit
Commits (35)
8212473
[CPU] [ARM] FullyConnected: int8 support
eshoguli Jun 26, 2024
7dd226e
[CPU] [ACL] FullyConnected fp32 executor refactoring
eshoguli Aug 13, 2024
d4c30c6
cleanup and refactoring
eshoguli Aug 16, 2024
5dbd319
address comments #1
alvoron Nov 14, 2024
03315bf
Merge branch 'master' into es/aarch64/int8
alvoron Nov 14, 2024
e6d4880
fixed build
alvoron Nov 14, 2024
8189730
fix merge
alvoron Nov 15, 2024
c743709
removed generate_values
alvoron Nov 15, 2024
c40e695
removed MatMulWithDequantizationTransformation
alvoron Nov 18, 2024
ff886c4
revert network_helper changes
alvoron Nov 19, 2024
39fcf09
delete empty DQScales check and moved test skip
alvoron Nov 19, 2024
2e9a2a0
return empty dequantizationScales check
alvoron Nov 19, 2024
1c639ef
added test case with bias
alvoron Nov 20, 2024
5df26e4
test with bias is added to x64 scope
alvoron Nov 22, 2024
b9df131
removed FusedWithMatMulI8 and MM transformation logic
alvoron Nov 22, 2024
b5f3487
fixed test and apply the last comment
alvoron Nov 25, 2024
ceff99e
simplified isSuitableChildForFC logic
alvoron Nov 25, 2024
459519b
update aclLowpFCTypeMapping
alvoron Nov 26, 2024
3688201
revert StaticMemory changes
alvoron Nov 27, 2024
1e95bb6
Merge branch 'master' into es/aarch64/int8
alvoron Nov 27, 2024
5d8c67d
Merge branch 'master' into es/aarch64/int8
alvoron Dec 11, 2024
7a07337
changes required after #26239
alvoron Dec 12, 2024
aeca18e
Merge branch 'master' into es/aarch64/int8
alvoron Dec 12, 2024
44e04cf
fix code style and warnings
alvoron Dec 12, 2024
fddd3f4
rollback dq scales fusing
alvoron Dec 13, 2024
55e0d0d
removed DQ check
alvoron Dec 13, 2024
9226262
stop wrapping FQ with Convert
alvoron Dec 16, 2024
aa64460
Revert "stop wrapping FQ with Convert"
alvoron Dec 16, 2024
0079f5f
mark getDeQuantizedScales as OV_CPU_MAYBE_UNUSED_FUNCTION
alvoron Dec 16, 2024
5455c1b
added missed code to prepareWeightMemory
alvoron Dec 16, 2024
118cfa8
fix fuse condition
alvoron Dec 16, 2024
f782508
Merge branch 'master' into es/aarch64/int8
alvoron Dec 17, 2024
5721c63
clang-format fix
alvoron Dec 17, 2024
bf9f2f6
Merge branch 'master' into es/aarch64/int8
alvoron Dec 17, 2024
83bc0c3
fix x64 expected nodes
alvoron Dec 17, 2024
Merge branch 'master' into es/aarch64/int8
alvoron committed Dec 11, 2024
commit 5d8c67dc1eff0701612f879266005f3da8c4a84f
47 changes: 1 addition & 46 deletions src/plugins/intel_cpu/src/dnnl_postops_composer.cpp
@@ -14,59 +14,14 @@
#include "cpu_types.h"
#include "memory_desc/dnnl_blocked_memory_desc.h"
#include "nodes/executors/memory_arguments.hpp"
#include "nodes/executors/common/common_utils.hpp"
#include "openvino/core/type/element_type.hpp"
#include "utils/cpu_utils.hpp"
#include "utils/debug_capabilities.h"

namespace ov {
namespace intel_cpu {

static std::vector<float> getDeQuantizedScales(const MemoryArgs& memory) {
    if (!memory.count(ARG_DST_DEQ_SCALE))
        return {};

    auto scalesMemory = memory.at(ARG_DST_DEQ_SCALE);

    auto scalesData = static_cast<const float*>(scalesMemory->getData());

    if (!scalesData)
        return {};

    auto dstShape = memory.at(ARG_DST)->getShape();
    auto dqScalesShape = scalesMemory->getShape();

    auto scalesDims = getNormalizedDimsBySize(dqScalesShape.getDims(), dstShape.getDims().size());

    auto scaleSize = std::accumulate(scalesDims.begin(), scalesDims.end(), std::size_t(1), std::multiplies<size_t>());

    std::vector<float> DQScales(scaleSize, 1.0);

    OPENVINO_ASSERT(scaleSize == 1 || DQScales.size() == 1 || DQScales.size() == scaleSize,
                    "set invalid scales size , DQScales vector size: ",
                    DQScales.size(),
                    ", scale data size: ",
                    scaleSize);

    // @todo do we really need to broadcast dq scales and then resize them back?
    if (scaleSize > DQScales.size())
        DQScales.resize(scaleSize, DQScales[0]);
    if (1 == scaleSize) {
        std::transform(DQScales.begin(), DQScales.end(), DQScales.begin(), [=](float val) {
            return (scalesData[0] * val);
        });
    } else {
        for (size_t i = 0; i < DQScales.size(); i++) {
            DQScales[i] *= scalesData[i];
        }
    }
    if (std::all_of(DQScales.begin(), DQScales.end(), [&](float val) {
            return (val == DQScales[0]);
        }))
        DQScales.resize(1);

    return DQScales;
}

DnnlPostOpsComposer::DnnlPostOpsComposer(const PostOps& postOps,
                                         const dnnl::engine& engine,
                                         const VectorDims& outputDims,
233 changes: 233 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp
@@ -24,6 +24,239 @@
namespace ov {
namespace intel_cpu {

static VectorDims makeDummyInputDims(const Shape& inShape, const Shape& wShape) {
    const auto& weightDims = wShape.getStaticDims();

    auto inMinDims = inShape.getMinDims();
    auto inMaxDims = inShape.getMaxDims();
    inMinDims.back() = weightDims.back();
    inMaxDims.back() = weightDims.back();

    return MemoryDescUtils::makeDummyShape(Shape(inMinDims, inMaxDims)).getStaticDims();
}

static VectorDims makeDummyOutputDims(const VectorDims& inShape, const VectorDims& wShape, const size_t out_rank) {
    size_t activationRank = inShape.size();
    size_t channelRank = wShape.size() - 1;
    // activation   weight    output_shape
    // NCHW         CoCHW     NCo
    // TNC          CoC       TNCo
    // NC           CoC       NCo
    VectorDims outputShape(out_rank, 1);
    // set Co
    outputShape.back() = wShape[0];
    // set batch dims
    size_t batchRank = activationRank - channelRank;
    size_t startIdx = out_rank - batchRank - 1;
    for (size_t i = 0; i < batchRank; i++) {
        outputShape[i + startIdx] = inShape[i];
    }

    return outputShape;
}
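
// Illustration (added commentary, not part of the original sources): with a hypothetical
// dynamic activation of shape [?, 64] and 2D weights [Co=10, K=64], makeDummyInputDims pins
// the innermost activation dim to the weights' innermost dim (64) and materializes the
// remaining dynamic dims with dummy values; makeDummyOutputDims then computes, for
// out_rank = 2, activationRank = 2, channelRank = 1, batchRank = 1, startIdx = 0, i.e. an
// output shape of [batch, 10], matching the NC / CoC -> NCo row of the table above.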

static DnnlMemoryDescPtr makeTransposedWeightDescriptor(const DnnlMemoryDescPtr srcDesc,
                                                        const DnnlMemoryDescPtr dstDesc) {
    const auto& weiDesc = srcDesc->getDnnlDesc();
    const auto reorderedWeiDesc =
        dnnl::memory::desc{weiDesc.get_dims(), weiDesc.get_data_type(), dnnl::memory::format_tag::ba};
    const auto transposedWeiDesc = reorderedWeiDesc.reshape(dstDesc->getDnnlDesc().get_dims());

    return DnnlExtensionUtils::makeDescriptor(transposedWeiDesc);
}

static ov::optional<MemoryPtr> convertWeightPrecision(MemoryPtr input, MemoryPtr output, ov::element::Type weightPrecision) {
    MemoryArgs memoryArgs;
    memoryArgs[ARG_SRC] = input;
    memoryArgs[ARG_DST] = output;

    auto aclWeightsConverter = std::make_shared<acl_fc_executor::ACLWeightsConverter>();
    if (aclWeightsConverter->update(memoryArgs)) {
        aclWeightsConverter->execute(memoryArgs);
        return ov::optional<MemoryPtr>(memoryArgs.at(ARG_DST));
    }

    if (!node::Convert::isSupportedDesc(input->getDesc()) ||
        !node::Convert::isSupportedDesc(output->getDesc())) {
        return {};
    }

    auto data = static_cast<const uint8_t *>(input->getData());
    std::vector<uint8_t> tmpBuff;
    tmpBuff.resize(output->getSize());
    cpu_convert(data, tmpBuff.data(), DnnlExtensionUtils::DataTypeToElementType(input->getDataType()),
                weightPrecision, input->getSize() / input->getDesc().getPrecision().size());

    return ov::optional<MemoryPtr>(std::make_shared<Memory>(output->getPrimitive().get_engine(),
                                                            output->getDesc().cloneWithNewPrecision(weightPrecision),
                                                            tmpBuff.data()));
}

static ov::optional<MemoryPtr> reorderDataFallback(MemoryPtr input, MemoryPtr output, ExecutorContext::CPtr context) {
    if (output->getDataType() == input->getDataType()) {
        return {};
    }
    const auto inPrc = DnnlExtensionUtils::DataTypeToElementType(input->getDataType());
    auto convertedDstMemoryDesc = output->getDesc().cloneWithNewPrecision(inPrc);
    dnnl::reorder reorderWithoutConvert =
        getReorderPrim(context->getRuntimeCache(),
                       output->getPrimitive().get_engine(),
                       input->getPrimitive().get_desc(),
                       MemoryDescUtils::convertToDnnlMemoryDesc(convertedDstMemoryDesc)->getDnnlDesc());

    if (reorderWithoutConvert &&
        parse_impl_name(reorderWithoutConvert.get_primitive_desc()->impl()->name()) != ref_any) {
        auto convertOutput = convertWeightPrecision(input, output, inPrc);
        if (!convertOutput) {
            return {};
        }
        input = *convertOutput;

        if (reorderWithoutConvert) {
            dnnl::stream loc_stream(output->getPrimitive().get_engine(), dnnl::stream::flags::in_order);
            reorderWithoutConvert.execute(loc_stream,
                                          {{DNNL_ARG_FROM, input->getPrimitive()}, {DNNL_ARG_TO, output->getPrimitive()}});
            return ov::optional<MemoryPtr>(output);
        }
    }
    return {};
}

static MemoryPtr reorderData(DnnlMemoryDescPtr srcWeightDesc,
                             DnnlMemoryDescPtr dstWeightDesc,
                             MemoryCPtr weightsMem,
                             ExecutorContext::CPtr context) {
    MemoryPtr input = std::make_shared<Memory>(context->getEngine(), srcWeightDesc, weightsMem->getData());
    MemoryPtr output = std::make_shared<Memory>(context->getEngine(), dstWeightDesc);
    if (!input->getDesc().isDefined() || !output->getDesc().isDefined())
        OPENVINO_THROW("Can't reorder data with dynamic shapes");

    if (input->getShape().hasZeroDims() || output->getShape().hasZeroDims()) {
        return output;
    }

    if (input->getDesc().isCompatible(output->getDesc())) {
        auto srcPtr = static_cast<uint8_t*>(input->getData());
        auto dstPtr = static_cast<uint8_t*>(output->getData());
        auto copySize = output->getSize();
        cpu_memcpy(dstPtr, srcPtr, copySize);
        return output;
    }

    // try directly reorder
    auto engine = output->getPrimitive().get_engine();
    dnnl::reorder directReorder = getReorderPrim(context->getRuntimeCache(),
                                                 engine,
                                                 input->getPrimitive().get_desc(),
                                                 output->getPrimitive().get_desc());

    if (!directReorder || parse_impl_name(directReorder.get_primitive_desc()->impl()->name()) == ref_any) {
        // try precision conversion then do the reorder
        auto fallbackOutput = reorderDataFallback(input, output, context);
        if (fallbackOutput) {
            return *fallbackOutput;
        }
    }
    // if precision conversion does not work then do direct reference reorder
    if (directReorder) {
        dnnl::stream loc_stream(engine, dnnl::stream::flags::in_order);
        directReorder.execute(loc_stream, {{DNNL_ARG_FROM, input->getPrimitive()}, {DNNL_ARG_TO, output->getPrimitive()}});
    } else {
        OPENVINO_THROW("Could not make onednn reorder.");
    }
    return output;
}
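
// Note (added commentary, not part of the original sources): reorderData above tries three
// strategies in order: a plain cpu_memcpy when the source and destination descriptors are
// already compatible, a direct oneDNN reorder when a non-reference implementation exists,
// and otherwise a precision conversion via reorderDataFallback followed by the reorder.
// If the fallback does not apply, it runs the reference reorder, and it throws only when no
// oneDNN reorder primitive could be created at all.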

static MemoryPtr reorderWeights(const MemoryArgs &memory,
                                const ExecutorContext::CPtr context,
                                ACLFCAttrs& aclfcAttrs,
                                DnnlMemoryDescPtr dnnlSrcDesc,
                                DnnlMemoryDescPtr dnnlDstDesc) {
    auto create = [&]() {
        MemoryPtr weightsMemory = memory.at(ARG_WEI);
        if (aclfcAttrs.isWeightsRepacked || aclfcAttrs.isConvertedWeights) {
            weightsMemory = reorderData(dnnlSrcDesc, dnnlDstDesc, memory.at(ARG_WEI), context);
            DEBUG_LOG("ACLFullyConnectedExecutor: cache miss, perform packing");
        }
        return weightsMemory;
    };

    auto weightCache = context->getWeightsCache();
    if (weightCache != nullptr) {
        const auto& wgtDims = memory.at(ARG_WEI)->getStaticDims();
        const auto N = wgtDims[0];
        const auto K = wgtDims[1];
        std::string format = "fc_acl_" + std::to_string(N) + "_" + std::to_string(K);
        const std::string string_hash = format + "_" + std::to_string(memory.at(ARG_WEI)->getSize()) + "_" +
                                        std::to_string(reinterpret_cast<uint64_t>(memory.at(ARG_WEI)->getData()));
        DEBUG_LOG("ACLFullyConnectedExecutor: findOrCreate, string_hash: ", string_hash);
        return *weightCache->findOrCreate(string_hash, create);
    }

    DEBUG_LOG("ACLFullyConnectedExecutor: Weights cache is not available");
    return create();
}
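
// Illustration (added commentary, not part of the original sources): for hypothetical int8
// weights with static dims [N=512, K=1024] occupying 524288 bytes, the cache key built above
// is "fc_acl_512_1024_524288_" followed by the decimal value of the weight data pointer, so
// a second executor created for the same weight tensor reuses the packed copy from the
// weights cache instead of repacking it.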

static MemoryPtr prepareWeightMemory(const MemoryArgs &memory,
                                     const ExecutorContext::CPtr context,
                                     const FCAttrs &attrs,
                                     ACLFCAttrs& aclfcAttrs,
                                     const PostOps &postOps,
                                     arm_compute::WeightFormat& expectedWeightFormat,
                                     arm_compute::TensorInfo& weiTensorInfo) {
    MemoryArgs memoryArgs;
    memoryArgs[ARG_BIAS] = memory.at(ARG_BIAS);
    memoryArgs[ARG_WEI] = memory.at(ARG_WEI);

    auto originalWeightsDesc = memory.at(ARG_WEI)->getDescPtr();

    // normalize weights to 2D
    const auto& wgtDims = originalWeightsDesc->getShape().getStaticDims();
    const VectorDims wgtDims2D = reshapeDownToRank<2>(wgtDims);

    originalWeightsDesc = std::make_shared<CpuBlockedMemoryDesc>(originalWeightsDesc->getPrecision(), Shape{wgtDims2D});

    auto dnnlSrcDesc = MemoryDescUtils::convertToDnnlMemoryDesc(originalWeightsDesc);
    auto dstDesc = originalWeightsDesc->cloneWithNewPrecision(aclfcAttrs.inputPrecision);
    auto dnnlDstDesc = MemoryDescUtils::convertToDnnlMemoryDesc(dstDesc);

    if (memory.at(ARG_SRC_0)->getShape().isDynamic()) {
        const auto& inShape = memory.at(ARG_SRC_0)->getShape();
        const auto& wShape = originalWeightsDesc->getShape();
        const auto& inDymmyDims = makeDummyInputDims(inShape, wShape);
        const auto& outDymmyDims = makeDummyOutputDims(inDymmyDims, wShape.getStaticDims(), memory.at(ARG_DST)->getShape().getRank());
        memoryArgs[ARG_SRC_0] = std::make_shared<Memory>(context->getEngine(),
                                                         memory.at(ARG_SRC_0)->getDescPtr()->cloneWithNewDims(inDymmyDims));
        memoryArgs[ARG_DST] = std::make_shared<Memory>(context->getEngine(),
                                                       memory.at(ARG_DST)->getDescPtr()->cloneWithNewDims(outDymmyDims));
    } else {
        memoryArgs[ARG_SRC_0] = memory.at(ARG_SRC_0);
        memoryArgs[ARG_DST] = memory.at(ARG_DST);
    }

    // TODO: ACLWeightFormatGenerator should be replaced with Reorder executor
    // that calls ACL NEReorder + NETranspose or dnnl::reorder depending on backend availability
    auto aclWeightsRepack = std::make_shared<acl_fc_executor::ACLWeightFormatGenerator>(attrs, postOps, memoryArgs);
    bool isNeededReorder = aclWeightsRepack->update(memoryArgs);
    expectedWeightFormat = isNeededReorder ? aclWeightsRepack->getOptImplWeightFormat() : arm_compute::WeightFormat::UNSPECIFIED;
    weiTensorInfo = aclWeightsRepack->getTensorInfo(ACLArgs::ACL_WEI);

    if (isNeededReorder) {
        dnnl::impl::dim_t o_dim = 0;
        dnnl::impl::dim_t inner_dim = 1;
        std::vector<dnnl::impl::dim_t> remaining_dims = {};
        auto weights_md_ = dnnlDstDesc->getDnnlDesc().get();
        dnnl::impl::cpu::acl::acl_utils::reorder_to_weight_format(weiTensorInfo, *weights_md_, expectedWeightFormat,
                                                                  inner_dim, o_dim, remaining_dims, {});
        if (aclfcAttrs.weightsNonTransposed) {
            dnnlSrcDesc = makeTransposedWeightDescriptor(dnnlSrcDesc, dnnlDstDesc);
        }
        aclfcAttrs.isWeightsRepacked = true;
        return reorderWeights(memory, context, aclfcAttrs, dnnlSrcDesc, dnnlDstDesc);
    }
    if (!aclfcAttrs.weightsNonTransposed) {
        dnnlDstDesc = makeTransposedWeightDescriptor(dnnlDstDesc, dnnlSrcDesc);
        aclfcAttrs.isWeightsRepacked = true;
    }
    return reorderWeights(memory, context, aclfcAttrs, dnnlSrcDesc, dnnlDstDesc);
}

static bool checkPostOps(const PostOps &postOps) {
    if (postOps.empty()) {
        return true;
@@ -13,6 +13,7 @@
#include "nodes/executors/memory_arguments.hpp"
#include "nodes/executors/debug_messages.hpp"
#include "nodes/executors/implementation_utils.hpp"
#include "nodes/executors/common/common_utils.hpp"
#include "utils/debug_capabilities.h"

namespace ov {
@@ -56,7 +57,8 @@ static void initFCAttrs(const FCAttrs &attrs,
ACLLowpFullyConnectedExecutor::ACLLowpFullyConnectedExecutor(const FCAttrs &attrs,
                                                             const PostOps &postOps,
                                                             const MemoryArgs &memory,
                                                             const ExecutorContext::CPtr& context) : dequantizationScales(attrs.dequantizationScales) {
                                                             const ExecutorContext::CPtr& context) {
    dequantizationScales = getDeQuantizedScales(memory);
    initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, gemmInfo, postOps);
    packedWeights = acl_fc_executor::prepareWeightMemory(memory, context, attrs, aclfcAttrs, postOps, expectedWeightFormat, weiTensorInfo);
}
@@ -72,7 +74,7 @@ bool ACLLowpFullyConnectedExecutor::supports(const FCConfig &config) {
    VERIFY(checkPostOps(config.postOps), UNSUPPORTED_TYPE_OF_POSTOPS);
    VERIFY(one_of(srcRank(config), 2U, 3U, 4U), UNSUPPORTED_SRC_RANK);
    VERIFY(one_of(weiRank(config), 2U, 3U, 4U), UNSUPPORTED_WEI_RANK);
    VERIFY(static_cast<FCAttrs>(config.attrs).dequantizationScales.size() <= 1, UNSUPPORTED_PER_CHANNEL_QUANTIZATION);
    //VERIFY(static_cast<FCAttrs>(config.attrs).dequantizationScales.size() <= 1, UNSUPPORTED_PER_CHANNEL_QUANTIZATION);
    return true;
}

65 changes: 65 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/common/common_utils.hpp
@@ -0,0 +1,65 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
// @file common_utils.hpp
// Contains utility methods used by all executors
//

#pragma once

#include <vector>

#include "nodes/executors/memory_arguments.hpp"
#include "utils/cpu_utils.hpp"

namespace ov {
namespace intel_cpu {

static std::vector<float> getDeQuantizedScales(const MemoryArgs& memory) {
    if (!memory.count(ARG_DST_DEQ_SCALE))
        return {};

    auto scalesMemory = memory.at(ARG_DST_DEQ_SCALE);

    auto scalesData = static_cast<const float*>(scalesMemory->getData());

    if (!scalesData)
        return {};

    auto dstShape = memory.at(ARG_DST)->getShape();
    auto dqScalesShape = scalesMemory->getShape();

    auto scalesDims = getNormalizedDimsBySize(dqScalesShape.getDims(), dstShape.getDims().size());

    auto scaleSize = std::accumulate(scalesDims.begin(), scalesDims.end(), std::size_t(1), std::multiplies<size_t>());

    std::vector<float> DQScales(scaleSize, 1.0);

    OPENVINO_ASSERT(scaleSize == 1 || DQScales.size() == 1 || DQScales.size() == scaleSize,
                    "set invalid scales size , DQScales vector size: ",
                    DQScales.size(),
                    ", scale data size: ",
                    scaleSize);

    // @todo do we really need to broadcast dq scales and then resize them back?
    if (scaleSize > DQScales.size())
        DQScales.resize(scaleSize, DQScales[0]);
    if (1 == scaleSize) {
        std::transform(DQScales.begin(), DQScales.end(), DQScales.begin(), [=](float val) {
            return (scalesData[0] * val);
        });
    } else {
        for (size_t i = 0; i < DQScales.size(); i++) {
            DQScales[i] *= scalesData[i];
        }
    }
    if (std::all_of(DQScales.begin(), DQScales.end(), [&](float val) {
            return (val == DQScales[0]);
        }))
        DQScales.resize(1);

    return DQScales;
}
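
// Illustration (added commentary, not part of the original sources): with a hypothetical
// ARG_DST_DEQ_SCALE tensor of shape [1, 4] holding {0.5f, 0.5f, 0.5f, 0.5f} and an ARG_DST
// shape of [2, 4], scaleSize is 4, the element-wise multiply yields {0.5, 0.5, 0.5, 0.5},
// and the final std::all_of check collapses the result to the single-element vector {0.5};
// with distinct per-channel values such as {0.5f, 0.25f, 0.125f, 1.f} the full 4-element
// vector is returned.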

} // namespace intel_cpu
} // namespace ov
You are viewing a condensed version of this merge commit.