[CPU] [ARM] [INT8] FullyConnected #25171

Merged: 35 commits, Dec 18, 2024
Changes from 1 commit
Commits (35)
8212473
[CPU] [ARM] FullyConnected: int8 support
eshoguli Jun 26, 2024
7dd226e
[CPU] [ACL] FullyConnected fp32 executor refactoring
eshoguli Aug 13, 2024
d4c30c6
cleanup and refactoring
eshoguli Aug 16, 2024
5dbd319
address comments #1
alvoron Nov 14, 2024
03315bf
Merge branch 'master' into es/aarch64/int8
alvoron Nov 14, 2024
e6d4880
fixed build
alvoron Nov 14, 2024
8189730
fix merge
alvoron Nov 15, 2024
c743709
removed generate_values
alvoron Nov 15, 2024
c40e695
removed MatMulWithDequantizationTransformation
alvoron Nov 18, 2024
ff886c4
revert network_helper changes
alvoron Nov 19, 2024
39fcf09
delete empty DQScales check and moved test skip
alvoron Nov 19, 2024
2e9a2a0
return empty dequantizationScales check
alvoron Nov 19, 2024
1c639ef
added test case with bias
alvoron Nov 20, 2024
5df26e4
test with bias is added to x64 scope
alvoron Nov 22, 2024
b9df131
removed FusedWithMatMulI8 and MM transformation logic
alvoron Nov 22, 2024
b5f3487
fixed test and apply the last comment
alvoron Nov 25, 2024
ceff99e
simplified isSuitableChildForFC logic
alvoron Nov 25, 2024
459519b
update aclLowpFCTypeMapping
alvoron Nov 26, 2024
3688201
revert StaticMemory changes
alvoron Nov 27, 2024
1e95bb6
Merge branch 'master' into es/aarch64/int8
alvoron Nov 27, 2024
5d8c67d
Merge branch 'master' into es/aarch64/int8
alvoron Dec 11, 2024
7a07337
changes required after #26239
alvoron Dec 12, 2024
aeca18e
Merge branch 'master' into es/aarch64/int8
alvoron Dec 12, 2024
44e04cf
fix code style and warnings
alvoron Dec 12, 2024
fddd3f4
rollback dq scales fusing
alvoron Dec 13, 2024
55e0d0d
removed DQ check
alvoron Dec 13, 2024
9226262
stop wrapping FQ with Convert
alvoron Dec 16, 2024
aa64460
Revert "stop wrapping FQ with Convert"
alvoron Dec 16, 2024
0079f5f
mark getDeQuantizedScales as OV_CPU_MAYBE_UNUSED_FUNCTION
alvoron Dec 16, 2024
5455c1b
added missed code to prepareWeightMemory
alvoron Dec 16, 2024
118cfa8
fix fuse condition
alvoron Dec 16, 2024
f782508
Merge branch 'master' into es/aarch64/int8
alvoron Dec 17, 2024
5721c63
clang-format fix
alvoron Dec 17, 2024
bf9f2f6
Merge branch 'master' into es/aarch64/int8
alvoron Dec 17, 2024
83bc0c3
fix x64 expected nodes
alvoron Dec 17, 2024
Merge branch 'master' into es/aarch64/int8
alvoron committed Dec 11, 2024
commit 5d8c67dc1eff0701612f879266005f3da8c4a84f
47 changes: 1 addition & 46 deletions src/plugins/intel_cpu/src/dnnl_postops_composer.cpp
@@ -14,59 +14,14 @@
#include "cpu_types.h"
#include "memory_desc/dnnl_blocked_memory_desc.h"
#include "nodes/executors/memory_arguments.hpp"
#include "nodes/executors/common/common_utils.hpp"
#include "openvino/core/type/element_type.hpp"
#include "utils/cpu_utils.hpp"
#include "utils/debug_capabilities.h"

namespace ov {
namespace intel_cpu {

static std::vector<float> getDeQuantizedScales(const MemoryArgs& memory) {
    if (!memory.count(ARG_DST_DEQ_SCALE))
        return {};

    auto scalesMemory = memory.at(ARG_DST_DEQ_SCALE);

    auto scalesData = static_cast<const float*>(scalesMemory->getData());

    if (!scalesData)
        return {};

    auto dstShape = memory.at(ARG_DST)->getShape();
    auto dqScalesShape = scalesMemory->getShape();

    auto scalesDims = getNormalizedDimsBySize(dqScalesShape.getDims(), dstShape.getDims().size());

    auto scaleSize = std::accumulate(scalesDims.begin(), scalesDims.end(), std::size_t(1), std::multiplies<size_t>());

    std::vector<float> DQScales(scaleSize, 1.0);

    OPENVINO_ASSERT(scaleSize == 1 || DQScales.size() == 1 || DQScales.size() == scaleSize,
                    "set invalid scales size , DQScales vector size: ",
                    DQScales.size(),
                    ", scale data size: ",
                    scaleSize);

    // @todo do we really need to broadcast dq scales and then resize them back?
    if (scaleSize > DQScales.size())
        DQScales.resize(scaleSize, DQScales[0]);
    if (1 == scaleSize) {
        std::transform(DQScales.begin(), DQScales.end(), DQScales.begin(), [=](float val) {
            return (scalesData[0] * val);
        });
    } else {
        for (size_t i = 0; i < DQScales.size(); i++) {
            DQScales[i] *= scalesData[i];
        }
    }
    if (std::all_of(DQScales.begin(), DQScales.end(), [&](float val) {
            return (val == DQScales[0]);
        }))
        DQScales.resize(1);

    return DQScales;
}

DnnlPostOpsComposer::DnnlPostOpsComposer(const PostOps& postOps,
                                         const dnnl::engine& engine,
                                         const VectorDims& outputDims,
233 changes: 233 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp
@@ -24,6 +24,239 @@
namespace ov {
namespace intel_cpu {

static VectorDims makeDummyInputDims(const Shape& inShape, const Shape& wShape) {
    const auto& weightDims = wShape.getStaticDims();

    auto inMinDims = inShape.getMinDims();
    auto inMaxDims = inShape.getMaxDims();
    inMinDims.back() = weightDims.back();
    inMaxDims.back() = weightDims.back();

    return MemoryDescUtils::makeDummyShape(Shape(inMinDims, inMaxDims)).getStaticDims();
}

static VectorDims makeDummyOutputDims(const VectorDims& inShape, const VectorDims& wShape, const size_t out_rank) {
    size_t activationRank = inShape.size();
    size_t channelRank = wShape.size() - 1;
    // activation   weight    output_shape
    // NCHW         CoCHW     NCo
    // TNC          CoC       TNCo
    // NC           CoC       NCo
    VectorDims outputShape(out_rank, 1);
    // set Co
    outputShape.back() = wShape[0];
    // set batch dims
    size_t batchRank = activationRank - channelRank;
    size_t startIdx = out_rank - batchRank - 1;
    for (size_t i = 0; i < batchRank; i++) {
        outputShape[i + startIdx] = inShape[i];
    }

    return outputShape;
}
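
// Illustration (added commentary, not part of the original sources): with a hypothetical
// dynamic activation of shape [?, 64] and 2D weights [Co=10, K=64], makeDummyInputDims pins
// the innermost activation dim to the weights' innermost dim (64) and materializes the
// remaining dynamic dims with dummy values; makeDummyOutputDims then computes, for
// out_rank = 2, activationRank = 2, channelRank = 1, batchRank = 1, startIdx = 0, i.e. an
// output shape of [batch, 10], matching the NC / CoC -> NCo row of the table above.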

static DnnlMemoryDescPtr makeTransposedWeightDescriptor(const DnnlMemoryDescPtr srcDesc,
                                                        const DnnlMemoryDescPtr dstDesc) {
    const auto& weiDesc = srcDesc->getDnnlDesc();
    const auto reorderedWeiDesc =
        dnnl::memory::desc{weiDesc.get_dims(), weiDesc.get_data_type(), dnnl::memory::format_tag::ba};
    const auto transposedWeiDesc = reorderedWeiDesc.reshape(dstDesc->getDnnlDesc().get_dims());

    return DnnlExtensionUtils::makeDescriptor(transposedWeiDesc);
}

static ov::optional<MemoryPtr> convertWeightPrecision(MemoryPtr input, MemoryPtr output, ov::element::Type weightPrecision) {
    MemoryArgs memoryArgs;
    memoryArgs[ARG_SRC] = input;
    memoryArgs[ARG_DST] = output;

    auto aclWeightsConverter = std::make_shared<acl_fc_executor::ACLWeightsConverter>();
    if (aclWeightsConverter->update(memoryArgs)) {
        aclWeightsConverter->execute(memoryArgs);
        return ov::optional<MemoryPtr>(memoryArgs.at(ARG_DST));
    }

    if (!node::Convert::isSupportedDesc(input->getDesc()) ||
        !node::Convert::isSupportedDesc(output->getDesc())) {
        return {};
    }

    auto data = static_cast<const uint8_t *>(input->getData());
    std::vector<uint8_t> tmpBuff;
    tmpBuff.resize(output->getSize());
    cpu_convert(data, tmpBuff.data(), DnnlExtensionUtils::DataTypeToElementType(input->getDataType()),
                weightPrecision, input->getSize() / input->getDesc().getPrecision().size());

    return ov::optional<MemoryPtr>(std::make_shared<Memory>(output->getPrimitive().get_engine(),
                                                            output->getDesc().cloneWithNewPrecision(weightPrecision),
                                                            tmpBuff.data()));
}

static ov::optional<MemoryPtr> reorderDataFallback(MemoryPtr input, MemoryPtr output, ExecutorContext::CPtr context) {
    if (output->getDataType() == input->getDataType()) {
        return {};
    }
    const auto inPrc = DnnlExtensionUtils::DataTypeToElementType(input->getDataType());
    auto convertedDstMemoryDesc = output->getDesc().cloneWithNewPrecision(inPrc);
    dnnl::reorder reorderWithoutConvert =
        getReorderPrim(context->getRuntimeCache(),
                       output->getPrimitive().get_engine(),
                       input->getPrimitive().get_desc(),
                       MemoryDescUtils::convertToDnnlMemoryDesc(convertedDstMemoryDesc)->getDnnlDesc());

    if (reorderWithoutConvert &&
        parse_impl_name(reorderWithoutConvert.get_primitive_desc()->impl()->name()) != ref_any) {
        auto convertOutput = convertWeightPrecision(input, output, inPrc);
        if (!convertOutput) {
            return {};
        }
        input = *convertOutput;

        if (reorderWithoutConvert) {
            dnnl::stream loc_stream(output->getPrimitive().get_engine(), dnnl::stream::flags::in_order);
            reorderWithoutConvert.execute(loc_stream,
                                          {{DNNL_ARG_FROM, input->getPrimitive()}, {DNNL_ARG_TO, output->getPrimitive()}});
            return ov::optional<MemoryPtr>(output);
        }
    }
    return {};
}

static MemoryPtr reorderData(DnnlMemoryDescPtr srcWeightDesc,
                             DnnlMemoryDescPtr dstWeightDesc,
                             MemoryCPtr weightsMem,
                             ExecutorContext::CPtr context) {
    MemoryPtr input = std::make_shared<Memory>(context->getEngine(), srcWeightDesc, weightsMem->getData());
    MemoryPtr output = std::make_shared<Memory>(context->getEngine(), dstWeightDesc);
    if (!input->getDesc().isDefined() || !output->getDesc().isDefined())
        OPENVINO_THROW("Can't reorder data with dynamic shapes");

    if (input->getShape().hasZeroDims() || output->getShape().hasZeroDims()) {
        return output;
    }

    if (input->getDesc().isCompatible(output->getDesc())) {
        auto srcPtr = static_cast<uint8_t*>(input->getData());
        auto dstPtr = static_cast<uint8_t*>(output->getData());
        auto copySize = output->getSize();
        cpu_memcpy(dstPtr, srcPtr, copySize);
        return output;
    }

    // try directly reorder
    auto engine = output->getPrimitive().get_engine();
    dnnl::reorder directReorder = getReorderPrim(context->getRuntimeCache(),
                                                 engine,
                                                 input->getPrimitive().get_desc(),
                                                 output->getPrimitive().get_desc());

    if (!directReorder || parse_impl_name(directReorder.get_primitive_desc()->impl()->name()) == ref_any) {
        // try precision conversion then do the reorder
        auto fallbackOutput = reorderDataFallback(input, output, context);
        if (fallbackOutput) {
            return *fallbackOutput;
        }
    }
    // if precision conversion does not work then do direct reference reorder
    if (directReorder) {
        dnnl::stream loc_stream(engine, dnnl::stream::flags::in_order);
        directReorder.execute(loc_stream, {{DNNL_ARG_FROM, input->getPrimitive()}, {DNNL_ARG_TO, output->getPrimitive()}});
    } else {
        OPENVINO_THROW("Could not make onednn reorder.");
    }
    return output;
}
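
// Note (added commentary, not part of the original sources): reorderData above tries three
// strategies in order: a plain cpu_memcpy when the source and destination descriptors are
// already compatible, a direct oneDNN reorder when a non-reference implementation exists,
// and otherwise a precision conversion via reorderDataFallback followed by the reorder.
// If the fallback does not apply, it runs the reference reorder, and it throws only when no
// oneDNN reorder primitive could be created at all.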

static MemoryPtr reorderWeights(const MemoryArgs &memory,
                                const ExecutorContext::CPtr context,
                                ACLFCAttrs& aclfcAttrs,
                                DnnlMemoryDescPtr dnnlSrcDesc,
                                DnnlMemoryDescPtr dnnlDstDesc) {
    auto create = [&]() {
        MemoryPtr weightsMemory = memory.at(ARG_WEI);
        if (aclfcAttrs.isWeightsRepacked || aclfcAttrs.isConvertedWeights) {
            weightsMemory = reorderData(dnnlSrcDesc, dnnlDstDesc, memory.at(ARG_WEI), context);
            DEBUG_LOG("ACLFullyConnectedExecutor: cache miss, perform packing");
        }
        return weightsMemory;
    };

    auto weightCache = context->getWeightsCache();
    if (weightCache != nullptr) {
        const auto& wgtDims = memory.at(ARG_WEI)->getStaticDims();
        const auto N = wgtDims[0];
        const auto K = wgtDims[1];
        std::string format = "fc_acl_" + std::to_string(N) + "_" + std::to_string(K);
        const std::string string_hash = format + "_" + std::to_string(memory.at(ARG_WEI)->getSize()) + "_" +
                                        std::to_string(reinterpret_cast<uint64_t>(memory.at(ARG_WEI)->getData()));
        DEBUG_LOG("ACLFullyConnectedExecutor: findOrCreate, string_hash: ", string_hash);
        return *weightCache->findOrCreate(string_hash, create);
    }

    DEBUG_LOG("ACLFullyConnectedExecutor: Weights cache is not available");
    return create();
}
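
// Illustration (added commentary, not part of the original sources): for hypothetical int8
// weights with static dims [N=512, K=1024] occupying 524288 bytes, the cache key built above
// is "fc_acl_512_1024_524288_" followed by the decimal value of the weight data pointer, so
// a second executor created for the same weight tensor reuses the packed copy from the
// weights cache instead of repacking it.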

static MemoryPtr prepareWeightMemory(const MemoryArgs &memory,
                                     const ExecutorContext::CPtr context,
                                     const FCAttrs &attrs,
                                     ACLFCAttrs& aclfcAttrs,
                                     const PostOps &postOps,
                                     arm_compute::WeightFormat& expectedWeightFormat,
                                     arm_compute::TensorInfo& weiTensorInfo) {
    MemoryArgs memoryArgs;
    memoryArgs[ARG_BIAS] = memory.at(ARG_BIAS);
    memoryArgs[ARG_WEI] = memory.at(ARG_WEI);

    auto originalWeightsDesc = memory.at(ARG_WEI)->getDescPtr();

    // normalize weights to 2D
    const auto& wgtDims = originalWeightsDesc->getShape().getStaticDims();
    const VectorDims wgtDims2D = reshapeDownToRank<2>(wgtDims);

    originalWeightsDesc = std::make_shared<CpuBlockedMemoryDesc>(originalWeightsDesc->getPrecision(), Shape{wgtDims2D});

    auto dnnlSrcDesc = MemoryDescUtils::convertToDnnlMemoryDesc(originalWeightsDesc);
    auto dstDesc = originalWeightsDesc->cloneWithNewPrecision(aclfcAttrs.inputPrecision);
    auto dnnlDstDesc = MemoryDescUtils::convertToDnnlMemoryDesc(dstDesc);

    if (memory.at(ARG_SRC_0)->getShape().isDynamic()) {
        const auto& inShape = memory.at(ARG_SRC_0)->getShape();
        const auto& wShape = originalWeightsDesc->getShape();
        const auto& inDymmyDims = makeDummyInputDims(inShape, wShape);
        const auto& outDymmyDims = makeDummyOutputDims(inDymmyDims, wShape.getStaticDims(), memory.at(ARG_DST)->getShape().getRank());
        memoryArgs[ARG_SRC_0] = std::make_shared<Memory>(context->getEngine(),
                                                         memory.at(ARG_SRC_0)->getDescPtr()->cloneWithNewDims(inDymmyDims));
        memoryArgs[ARG_DST] = std::make_shared<Memory>(context->getEngine(),
                                                       memory.at(ARG_DST)->getDescPtr()->cloneWithNewDims(outDymmyDims));
    } else {
        memoryArgs[ARG_SRC_0] = memory.at(ARG_SRC_0);
        memoryArgs[ARG_DST] = memory.at(ARG_DST);
    }

    // TODO: ACLWeightFormatGenerator should be replaced with Reorder executor
    // that calls ACL NEReorder + NETranspose or dnnl::reorder depending on backend availability
    auto aclWeightsRepack = std::make_shared<acl_fc_executor::ACLWeightFormatGenerator>(attrs, postOps, memoryArgs);
    bool isNeededReorder = aclWeightsRepack->update(memoryArgs);
    expectedWeightFormat = isNeededReorder ? aclWeightsRepack->getOptImplWeightFormat() : arm_compute::WeightFormat::UNSPECIFIED;
    weiTensorInfo = aclWeightsRepack->getTensorInfo(ACLArgs::ACL_WEI);

    if (isNeededReorder) {
        dnnl::impl::dim_t o_dim = 0;
        dnnl::impl::dim_t inner_dim = 1;
        std::vector<dnnl::impl::dim_t> remaining_dims = {};
        auto weights_md_ = dnnlDstDesc->getDnnlDesc().get();
        dnnl::impl::cpu::acl::acl_utils::reorder_to_weight_format(weiTensorInfo, *weights_md_, expectedWeightFormat,
                                                                  inner_dim, o_dim, remaining_dims, {});
        if (aclfcAttrs.weightsNonTransposed) {
            dnnlSrcDesc = makeTransposedWeightDescriptor(dnnlSrcDesc, dnnlDstDesc);
        }
        aclfcAttrs.isWeightsRepacked = true;
        return reorderWeights(memory, context, aclfcAttrs, dnnlSrcDesc, dnnlDstDesc);
    }
    if (!aclfcAttrs.weightsNonTransposed) {
        dnnlDstDesc = makeTransposedWeightDescriptor(dnnlDstDesc, dnnlSrcDesc);
        aclfcAttrs.isWeightsRepacked = true;
    }
    return reorderWeights(memory, context, aclfcAttrs, dnnlSrcDesc, dnnlDstDesc);
}

static bool checkPostOps(const PostOps &postOps) {
    if (postOps.empty()) {
        return true;
@@ -13,6 +13,7 @@
#include "nodes/executors/memory_arguments.hpp"
#include "nodes/executors/debug_messages.hpp"
#include "nodes/executors/implementation_utils.hpp"
#include "nodes/executors/common/common_utils.hpp"
#include "utils/debug_capabilities.h"

namespace ov {
@@ -56,7 +57,8 @@ static void initFCAttrs(const FCAttrs &attrs,
ACLLowpFullyConnectedExecutor::ACLLowpFullyConnectedExecutor(const FCAttrs &attrs,
                                                             const PostOps &postOps,
                                                             const MemoryArgs &memory,
                                                             const ExecutorContext::CPtr& context) : dequantizationScales(attrs.dequantizationScales) {
                                                             const ExecutorContext::CPtr& context) {
    dequantizationScales = getDeQuantizedScales(memory);
    initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, gemmInfo, postOps);
    packedWeights = acl_fc_executor::prepareWeightMemory(memory, context, attrs, aclfcAttrs, postOps, expectedWeightFormat, weiTensorInfo);
}
@@ -72,7 +74,7 @@ bool ACLLowpFullyConnectedExecutor::supports(const FCConfig &config) {
    VERIFY(checkPostOps(config.postOps), UNSUPPORTED_TYPE_OF_POSTOPS);
    VERIFY(one_of(srcRank(config), 2U, 3U, 4U), UNSUPPORTED_SRC_RANK);
    VERIFY(one_of(weiRank(config), 2U, 3U, 4U), UNSUPPORTED_WEI_RANK);
    VERIFY(static_cast<FCAttrs>(config.attrs).dequantizationScales.size() <= 1, UNSUPPORTED_PER_CHANNEL_QUANTIZATION);
    //VERIFY(static_cast<FCAttrs>(config.attrs).dequantizationScales.size() <= 1, UNSUPPORTED_PER_CHANNEL_QUANTIZATION);
    return true;
}

65 changes: 65 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/common/common_utils.hpp
@@ -0,0 +1,65 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
// @file common_utils.hpp
// Contains utility methods used by all executors
//

#pragma once

#include <vector>

#include "nodes/executors/memory_arguments.hpp"
#include "utils/cpu_utils.hpp"

namespace ov {
namespace intel_cpu {

static std::vector<float> getDeQuantizedScales(const MemoryArgs& memory) {
    if (!memory.count(ARG_DST_DEQ_SCALE))
        return {};

    auto scalesMemory = memory.at(ARG_DST_DEQ_SCALE);

    auto scalesData = static_cast<const float*>(scalesMemory->getData());

    if (!scalesData)
        return {};

    auto dstShape = memory.at(ARG_DST)->getShape();
    auto dqScalesShape = scalesMemory->getShape();

    auto scalesDims = getNormalizedDimsBySize(dqScalesShape.getDims(), dstShape.getDims().size());

    auto scaleSize = std::accumulate(scalesDims.begin(), scalesDims.end(), std::size_t(1), std::multiplies<size_t>());

    std::vector<float> DQScales(scaleSize, 1.0);

    OPENVINO_ASSERT(scaleSize == 1 || DQScales.size() == 1 || DQScales.size() == scaleSize,
                    "set invalid scales size , DQScales vector size: ",
                    DQScales.size(),
                    ", scale data size: ",
                    scaleSize);

    // @todo do we really need to broadcast dq scales and then resize them back?
    if (scaleSize > DQScales.size())
        DQScales.resize(scaleSize, DQScales[0]);
    if (1 == scaleSize) {
        std::transform(DQScales.begin(), DQScales.end(), DQScales.begin(), [=](float val) {
            return (scalesData[0] * val);
        });
    } else {
        for (size_t i = 0; i < DQScales.size(); i++) {
            DQScales[i] *= scalesData[i];
        }
    }
    if (std::all_of(DQScales.begin(), DQScales.end(), [&](float val) {
            return (val == DQScales[0]);
        }))
        DQScales.resize(1);

    return DQScales;
}
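
// Illustration (added commentary, not part of the original sources): with a hypothetical
// ARG_DST_DEQ_SCALE tensor of shape [1, 4] holding {0.5f, 0.5f, 0.5f, 0.5f} and an ARG_DST
// shape of [2, 4], scaleSize is 4, the element-wise multiply yields {0.5, 0.5, 0.5, 0.5},
// and the final std::all_of check collapses the result to the single-element vector {0.5};
// with distinct per-channel values such as {0.5f, 0.25f, 0.125f, 1.f} the full 4-element
// vector is returned.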

} // namespace intel_cpu
} // namespace ov
You are viewing a condensed version of this merge commit.