diff --git a/src/plugins/intel_cpu/src/nodes/deconv.cpp b/src/plugins/intel_cpu/src/nodes/deconv.cpp index 874deddbefd..013bf993280 100644 --- a/src/plugins/intel_cpu/src/nodes/deconv.cpp +++ b/src/plugins/intel_cpu/src/nodes/deconv.cpp @@ -23,6 +23,11 @@ #include #include +#if defined(OV_CPU_WITH_ACL) +#include "executors/acl/acl_utils.hpp" +#include "utils/debug_capabilities.h" +#endif + #include #include @@ -174,15 +179,15 @@ Deconvolution::Deconvolution(const std::shared_ptr& op, withGroups = false; for (size_t i = 0; i < convBackprop->get_strides().size(); i++) { - stride.push_back(static_cast(convBackprop->get_strides()[i])); + deconvAttrs.stride.push_back(static_cast(convBackprop->get_strides()[i])); } for (size_t i = 0; i < convBackprop->get_dilations().size(); i++) { - dilation.push_back(static_cast(convBackprop->get_dilations()[i]) - 1); + deconvAttrs.dilation.push_back(static_cast(convBackprop->get_dilations()[i]) - 1); } - paddingL = convBackprop->get_pads_begin(); - paddingR = convBackprop->get_pads_end(); + deconvAttrs.paddingL = convBackprop->get_pads_begin(); + deconvAttrs.paddingR = convBackprop->get_pads_end(); - outputPadding = convBackprop->get_output_padding(); + deconvAttrs.outputPadding = convBackprop->get_output_padding(); autoPad = one_of(convBackprop->get_auto_pad(), ov::op::PadType::SAME_LOWER, ov::op::PadType::SAME_UPPER); } else if (auto groupConvBackprop = std::dynamic_pointer_cast(op)) { @@ -196,20 +201,20 @@ Deconvolution::Deconvolution(const std::shared_ptr& op, isDW = withGroups && groupNum == OC && groupNum == IC; for (size_t i = 0; i < groupConvBackprop->get_strides().size(); i++) { - stride.push_back(static_cast(groupConvBackprop->get_strides()[i])); + deconvAttrs.stride.push_back(static_cast(groupConvBackprop->get_strides()[i])); } for (size_t i = 0; i < groupConvBackprop->get_dilations().size(); i++) { - dilation.push_back(static_cast(groupConvBackprop->get_dilations()[i]) - 1); + 
deconvAttrs.dilation.push_back(static_cast(groupConvBackprop->get_dilations()[i]) - 1); } - paddingL = groupConvBackprop->get_pads_begin(); - paddingR = groupConvBackprop->get_pads_end(); + deconvAttrs.paddingL = groupConvBackprop->get_pads_begin(); + deconvAttrs.paddingR = groupConvBackprop->get_pads_end(); - outputPadding = groupConvBackprop->get_output_padding(); + deconvAttrs.outputPadding = groupConvBackprop->get_output_padding(); autoPad = one_of(groupConvBackprop->get_auto_pad(), ov::op::PadType::SAME_LOWER, ov::op::PadType::SAME_UPPER); } - for (size_t i = 0; i < dilation.size(); i++) { - kernel.push_back(weightDims[withGroups + 2 + i]); + for (size_t i = 0; i < deconvAttrs.dilation.size(); i++) { + deconvAttrs.kernel.push_back(weightDims[withGroups + 2 + i]); } externOutShape = inputShapes.size() == 3; @@ -274,7 +279,7 @@ bool Deconvolution::canBeExecutedInInt8() const { return false; } - if (!withGroups && stride.back() > 3) + if (!withGroups && deconvAttrs.stride.back() > 3) return false; if (!impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_core)) { const auto& inMaxDims = getOutputShapeAtPort(0).getMaxDims(); @@ -291,8 +296,8 @@ bool Deconvolution::canBeExecutedInInt8() const { return false; } - for (size_t i = 0; i < kernel.size(); i++) { - if (kernel[i] < stride[i]) + for (size_t i = 0; i < deconvAttrs.kernel.size(); i++) { + if (deconvAttrs.kernel[i] < deconvAttrs.stride[i]) return false; } @@ -301,7 +306,7 @@ bool Deconvolution::canBeExecutedInInt8() const { : impl::cpu::x64::mayiuse(impl::cpu::x64::avx2) ? 
8 : 4; if (withGroups && !isDW && (IC % channelBlock != 0 || OC % channelBlock != 0)) return false; - if (!impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_core) && stride.back() > 3) + if (!impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_core) && deconvAttrs.stride.back() > 3) return false; InferenceEngine::Precision inPrecision = getOriginalInputPrecisionAtPort(0); @@ -310,7 +315,7 @@ bool Deconvolution::canBeExecutedInInt8() const { InferenceEngine::Precision weiPrecision = getOriginalInputPrecisionAtPort(1); auto weightsDataType = DnnlExtensionUtils::IEPrecisionToDataType(weiPrecision); - if (isDW && (inputDataType == dnnl_s8 || dilation.size() == 3)) + if (isDW && (inputDataType == dnnl_s8 || deconvAttrs.dilation.size() == 3)) return false; return (inputDataType == dnnl_s8 || inputDataType == dnnl_u8) && weightsDataType == dnnl_s8; @@ -351,10 +356,10 @@ std::pair Deconvolution::makeDummyInOutShape() { const auto& weightDims = getWeightDims(); const size_t wghOffset = getAlgorithm() == Algorithm::DeconvolutionGrouped ? 1 : 0; - VectorDims paddings(paddingL.size()); + VectorDims paddings(deconvAttrs.paddingL.size()); if (!autoPad) { for (size_t i = 0; i < paddings.size(); ++i) { - paddings[i] = paddingL[i] + paddingR[i]; + paddings[i] = deconvAttrs.paddingL[i] + deconvAttrs.paddingR[i]; } } else { for (size_t i = 0; i < origInDims.size() - 2; i++) { @@ -363,17 +368,17 @@ std::pair Deconvolution::makeDummyInOutShape() { // if input shape is dynamic and bounded, paddings should be computed basing on the following limitations: // 1. paddings must not be negative // 2. 
the result padding must have such a value to keep the dummy dimensions inside the predefined interval - auto c1 = lastOutputSpatialDims[i] - outputPadding[i] - 1 - - (dilation[i] + 1) * static_cast(weightDims[wghOffset + 2 + i] - 1); + auto c1 = lastOutputSpatialDims[i] - deconvAttrs.outputPadding[i] - 1 - + (deconvAttrs.dilation[i] + 1) * static_cast(weightDims[wghOffset + 2 + i] - 1); if (origInMaxDims[i + 2] != Shape::UNDEFINED_DIM) { - auto upper_bound = stride[i] * static_cast(origInMaxDims[i + 2] - 1) - c1; + auto upper_bound = deconvAttrs.stride[i] * static_cast(origInMaxDims[i + 2] - 1) - c1; if (upper_bound < 0) { IE_THROW() << errorPrefix << ": paddings for dummy shapes can't be computed"; } } - auto lower_bound = stride[i] * static_cast(origInMinDims[i + 2] - 1) - c1; + auto lower_bound = deconvAttrs.stride[i] * static_cast(origInMinDims[i + 2] - 1) - c1; if (lower_bound > 0) { paddings[i] = lower_bound; } @@ -383,16 +388,16 @@ std::pair Deconvolution::makeDummyInOutShape() { for (size_t i = 0; i < inputDims.size() - 2; i++) { if (origInDims[2 + i] == Shape::UNDEFINED_DIM) { - inputDims[2 + i] = (lastOutputSpatialDims[i] - (dilation[i] + 1) * - (weightDims[wghOffset + 2 + i] - 1) - 1 + paddings[i] - outputPadding[i]) / - stride[i] + 1; + inputDims[2 + i] = (lastOutputSpatialDims[i] - (deconvAttrs.dilation[i] + 1) * + (weightDims[wghOffset + 2 + i] - 1) - 1 + paddings[i] - deconvAttrs.outputPadding[i]) / + deconvAttrs.stride[i] + 1; } } } inShape = Shape(inputDims); outShape = Shape(shapeInferInternal(inShape.getStaticDims(), lastOutputSpatialDims)); - paddingL = shapeInference->get_pads_begin(); - paddingR = shapeInference->get_pads_end(); + deconvAttrs.paddingL = shapeInference->get_pads_begin(); + deconvAttrs.paddingR = shapeInference->get_pads_end(); } return {inShape.getStaticDims(), outShape.getStaticDims()}; } @@ -420,7 +425,7 @@ void Deconvolution::getSupportedDescriptors() { if (!descs.empty()) return; isInt8 = canBeExecutedInInt8(); - withBiases 
= externOutShape ? getOriginalInputsNumber() == 4 : getOriginalInputsNumber() == 3; + deconvAttrs.withBiasesParam = withBiases = externOutShape ? getOriginalInputsNumber() == 4 : getOriginalInputsNumber() == 3; //ONEDNN deconvolution_fwd_t primitive can support bias fusing. //ONEDNN convolution_data_bwd_t can't support bias fusing. //Current only int8 precision choose deconvolution_fwd_t. @@ -463,6 +468,41 @@ void Deconvolution::getSupportedDescriptors() { Shape outShape(outDims); initPaddingR(inShape, outShape); +#if defined(OV_CPU_WITH_ACL) + NodeConfig config; + config.inConfs.resize(getParentEdges().size()); + config.outConfs.resize(getOriginalOutputsNumber()); + + auto& creatorsMap = BlockedDescCreator::getCommonCreators(); + for (size_t i = 0; i < getParentEdges().size(); ++i) { + auto checkDesc = [&](LayoutType format) -> bool { + NodeConfig config; + config.inConfs.resize(getParentEdges().size()); + config.outConfs.resize(getOriginalOutputsNumber()); + + for (size_t i = 0; i < getParentEdges().size(); ++i) { + config.inConfs[i].setMemDesc( + creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(i), getInputShapeAtPort(i))); + } + config.outConfs[0].setMemDesc( + creatorsMap.at(format)->createSharedDesc(getOriginalOutputPrecisionAtPort(0), getOutputShapeAtPort(0))); + + std::vector srcMemoryDescs; + for (size_t i = 0; i < config.inConfs.size(); i++) { + srcMemoryDescs.push_back(config.inConfs[i].getMemDesc()); + } + std::vector dstMemoryDescs; + for (size_t i = 0; i < config.outConfs.size(); i++) { + dstMemoryDescs.push_back(config.outConfs[i].getMemDesc()); + } + + return AclDeconvExecutorBuilder::customIsSupported(deconvAttrs, srcMemoryDescs, dstMemoryDescs); + }; + useACL = checkDesc(LayoutType::nspc) || checkDesc(LayoutType::ncsp); + } + if (useACL) return; +#endif + setPostOps(*attr, outShape.getStaticDims()); if (isInt8) { @@ -484,15 +524,14 @@ void Deconvolution::getSupportedDescriptors() { } void Deconvolution::initPaddingR(const 
Shape &inShape, const Shape &outShape) { - for (size_t i = 0; i < paddingR.size(); i++) { + for (size_t i = 0; i < deconvAttrs.paddingR.size(); i++) { int with_group = getAlgorithm() == Algorithm::DeconvolutionGrouped ? 1 : 0; const auto& weightDims = getWeightDims(); int krn = weightDims[with_group + 2 + i]; int src = outShape.getStaticDims()[2 + i]; int dst = inShape.getStaticDims()[2 + i]; - - krn = (krn - 1)*(dilation[i] + 1) + 1; - paddingR[i] = (dst - 1) * stride[i] - (src - krn + paddingL[i]); + krn = (krn - 1)*(deconvAttrs.dilation[i] + 1) + 1; + deconvAttrs.paddingR[i] = (dst - 1) * deconvAttrs.stride[i] - (src - krn + deconvAttrs.paddingL[i]); } } @@ -594,6 +633,20 @@ VectorDims Deconvolution::shapeInferInternal(const VectorDims &inDims, std::vect } void Deconvolution::execute(dnnl::stream strm) { + if (useACL) { + std::vector srcMemory; + for (size_t i = 0; i < getOriginalInputsNumber(); i++) { + srcMemory.push_back(getParentEdgeAt(i)->getMemoryPtr()); + } + std::vector dstMemory; + for (size_t i = 0; i < getOriginalOutputsNumber(); i++) { + dstMemory.push_back(getChildEdgeAt(i)->getMemoryPtr()); + } + //TODO: need to pass post ops data + execPtrDeconv->exec(srcMemory, dstMemory, nullptr); + return; + } + if (!execPtr) { IE_THROW() << "Can't execute Deconvolution node with name: " << getName() << ", because executor is not compiled"; } @@ -774,7 +827,7 @@ void Deconvolution::createPrimitive() { const AttrPtr pAttr = makePrimitiveAttr(outDims); auto prim_desc = createInt8MkldnnDeconvDesc(inDesc->getDnnlDesc(), wgh_candidate, dnnlBiasDesc, outDesc->getDnnlDesc(), withBiases, - stride, dilation, paddingL, paddingR, *pAttr, getEngine()); + deconvAttrs.stride, deconvAttrs.dilation, deconvAttrs.paddingL, deconvAttrs.paddingR, *pAttr, getEngine()); const bool found = DnnlExtensionUtils::find_implementation(prim_desc, selectedImpl); @@ -803,10 +856,26 @@ void Deconvolution::prepareParams() { IE_THROW() << "Input memory has not been allocated."; if (!wghMemPtr || 
!wghMemPtr->isAllocated()) IE_THROW() << "Weight memory has not been allocated."; - const NodeDesc *selected_pd = getSelectedPrimitiveDescriptor(); + auto selected_pd = getSelectedPrimitiveDescriptor(); if (selected_pd == nullptr) IE_THROW() << "Preferable primitive descriptor is not set for node " << getName() << "."; + if (useACL) { + std::vector srcMemoryDescs; + for (size_t i = 0; i < getOriginalInputsNumber(); i++) { + srcMemoryDescs.push_back(getParentEdgesAtPort(i).front()->getMemory().getDescWithType()); + } + std::vector dstMemoryDescs; + for (size_t i = 0; i < getOriginalOutputsNumber(); i++) { + dstMemoryDescs.push_back(getChildEdgesAtPort(i).front()->getMemory().getDescWithType()); + } + + execPtrDeconv = selected_pd->getExecutorFactoryAs()->makeExecutor(deconvAttrs, srcMemoryDescs, + dstMemoryDescs, *attr); + selected_pd->setImplementationType(execPtrDeconv->getImplType()); + return; + } + auto inMemoryDesc = getParentEdgesAtPort(0).front()->getMemory().getDescWithType(); auto outMemoryDesc = getChildEdgesAtPort(0).front()->getMemory().getDescWithType(); @@ -817,8 +886,8 @@ void Deconvolution::prepareParams() { } pAttrLocal = pAttr; if (autoPad || externOutShape) { - paddingL = shapeInference->get_pads_begin(); - paddingR = shapeInference->get_pads_end(); + deconvAttrs.paddingL = shapeInference->get_pads_begin(); + deconvAttrs.paddingR = shapeInference->get_pads_end(); } initPaddingR(inMemoryDesc->getShape(), outMemoryDesc->getShape()); } else { @@ -846,10 +915,10 @@ void Deconvolution::prepareParams() { wghDesc, biasDesc, outMemoryDesc, - stride, - dilation, - paddingL, - paddingR, + deconvAttrs.stride, + deconvAttrs.dilation, + deconvAttrs.paddingL, + deconvAttrs.paddingR, isInt8, *pAttrLocal, selected_pd->getImplementationType()}; @@ -1006,14 +1075,16 @@ void Deconvolution::createDescriptor(const std::vector &inputDesc } dnnl::memory::desc wgh_candidate(DnnlExtensionUtils::convertToDnnlDims(int8WeightDims), memory::data_type::s8, 
memory::format_tag::any); descs.emplace_back(createDescriptorInternalInt8(in_candidate, wgh_candidate, bias_candidate, - out_candidate, withBiases, stride, dilation, paddingL, paddingR, *attr, getEngine())); + out_candidate, withBiases, deconvAttrs.stride, deconvAttrs.dilation, + deconvAttrs.paddingL, deconvAttrs.paddingR, *attr, getEngine())); } else { dnnl::memory::desc wgh_candidate(DnnlExtensionUtils::convertToDnnlDims(getWeightDims()), dnnlInDesc.getDataType(), memory::format_tag::any); convolution_backward_data::primitive_desc deconv_desc; convolution_forward::primitive_desc fwd_conv_pd; std::tie(deconv_desc, fwd_conv_pd) = createDescriptorInternalDefault(in_candidate, wgh_candidate, out_candidate, dnnl::algorithm::convolution_direct, - stride, dilation, paddingL, paddingR, *attr, getEngine()); + deconvAttrs.stride, deconvAttrs.dilation, deconvAttrs.paddingL, + deconvAttrs.paddingR, *attr, getEngine()); IE_ASSERT(fwd_conv_pd && deconv_desc && deconv_desc.get(true) != nullptr) << "Failed to create convolution_backward_data::primitive_desc: " << "Node: ##" << getName(); fwdConvPD.push_back(fwd_conv_pd); // oneDNN requires forward pd to exists until primitive is created @@ -1120,6 +1191,44 @@ bool Deconvolution::canFuseBias() const { (externOutShape ? 
getParentEdges().size() == 3 : getParentEdges().size() == 2)); } +void Deconvolution::initSupportedPrimitiveDescriptors() { + if (!useACL) { + Node::initSupportedPrimitiveDescriptors(); + return; + } + + auto& creatorsMap = BlockedDescCreator::getCommonCreators(); + auto pushDesc = [&](LayoutType format) { + NodeConfig config; + config.inConfs.resize(getParentEdges().size()); + config.outConfs.resize(getOriginalOutputsNumber()); + + for (size_t i = 0; i < getParentEdges().size(); ++i) { + config.inConfs[i].setMemDesc( + // ACL expected equal precision + creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(i))); + } + config.outConfs[0].setMemDesc( + // ACL expected equal precision + creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getOutputShapeAtPort(0))); + + std::vector srcMemoryDescs; + for (size_t i = 0; i < config.inConfs.size(); i++) { + srcMemoryDescs.push_back(config.inConfs[i].getMemDesc()); + } + std::vector dstMemoryDescs; + for (size_t i = 0; i < config.outConfs.size(); i++) { + dstMemoryDescs.push_back(config.outConfs[i].getMemDesc()); + } + + auto factory = std::make_shared(deconvAttrs, srcMemoryDescs, dstMemoryDescs, + std::make_shared(context, getImplPriority())); + + supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::acl, factory); + }; + pushDesc(LayoutType::ncsp); +} + } // namespace node } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/nodes/deconv.h b/src/plugins/intel_cpu/src/nodes/deconv.h index 3b6b50cad0c..92de796cef4 100644 --- a/src/plugins/intel_cpu/src/nodes/deconv.h +++ b/src/plugins/intel_cpu/src/nodes/deconv.h @@ -11,6 +11,8 @@ #include #include "common/dnnl_executor.h" +#include "executors/deconv_list.hpp" + namespace ov { namespace intel_cpu { namespace node { @@ -20,6 +22,7 @@ public: Deconvolution(const std::shared_ptr& op, const GraphContext::CPtr context); void getSupportedDescriptors() override; + void 
initSupportedPrimitiveDescriptors() override; void createDescriptor(const std::vector& inputDesc, const std::vector& outputDesc) override; void createPrimitive() override; @@ -41,7 +44,7 @@ public: bool canFuse(const NodePtr& node) const override; const VectorDims& getWeightDims() const { return getInputShapeAtPort(1).getStaticDims(); } - const std::vector& getStride() const { return stride; } + const std::vector& getStride() const { return deconvAttrs.stride; } void prepareParams() override; void execute(dnnl::stream strm) override; @@ -55,6 +58,7 @@ protected: AttrPtr initPrimitiveAttr() override; AttrPtr makePrimitiveAttr(const VectorDims& dims); std::vector getAvailableFormatsForDims(const Shape& dims) const override; + std::shared_ptr execPtrDeconv = nullptr; private: using executorPtr = std::shared_ptr; @@ -89,16 +93,13 @@ private: size_t groupNum = 1; size_t IC = 0; size_t OC = 0; - std::vector kernel; - std::vector stride; - std::vector dilation; - ov::CoordinateDiff paddingL; - ov::CoordinateDiff paddingR; - ov::CoordinateDiff outputPadding; std::vector lastOutputSpatialDims; VectorDims int8WeightDims; VectorDims expectedBiasDims {}; + bool useACL = false; + DeconvAttrs deconvAttrs; + Shape inShape; AttrPtr pAttr; diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp new file mode 100644 index 00000000000..dff93aa35b9 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp @@ -0,0 +1,248 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "acl_deconv.hpp" +#include "ie_parallel.hpp" + +namespace ov { +namespace intel_cpu { + +using namespace arm_compute; + +ACLDeconvTensorInfo getACLDeconvTensorInfo(const DeconvAttrs& deconvAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs) { + auto srcDims = srcDescs[0]->getShape().getDims(); + auto weiDims = srcDescs[1]->getShape().getDims(); + // 
swap input and output channels dimensions to be align with ACL + // weights tensor shape is changed because ACL expects [O, I, H, W] tensor while OV uses [I, O, H, W] tensor + std::swap(weiDims[0], weiDims[1]); + auto dstDims = dstDescs[0]->getShape().getDims(); + + VectorDims biasDims; + TensorInfo biasTensorInfo; + + if (deconvAttrs.withBiasesParam) { + biasDims = srcDescs[2]->getShape().getStaticDims(); + biasTensorInfo = TensorInfo(shapeCast(biasDims), 1, + precisionToAclDataType(srcDescs[2]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[2])); + } + + TensorInfo srcTensorInfo = TensorInfo(shapeCast(srcDims), 1, + precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0])); + TensorInfo weiTensorInfo = TensorInfo(shapeCast(weiDims), 1, + precisionToAclDataType(srcDescs[1]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[1])); + TensorInfo dstTensorInfo = TensorInfo(shapeCast(dstDims), 1, + precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0])); + + unsigned int pad_l = + (deconvAttrs.paddingL.size() > 1) ? static_cast(deconvAttrs.paddingL.at(1)) : static_cast(deconvAttrs.paddingL.at(0)); + unsigned int pad_r = + (deconvAttrs.paddingR.size() > 1) ? static_cast(deconvAttrs.paddingR.at(1)) : static_cast(deconvAttrs.paddingR.at(0)); + unsigned int pad_t = static_cast(deconvAttrs.paddingL.at(0)); + unsigned int pad_b = static_cast(deconvAttrs.paddingR.at(0)); + unsigned int stride_x = (deconvAttrs.stride.size() > 1) ? 
deconvAttrs.stride.at(1) : deconvAttrs.stride.at(0); + unsigned int stride_y = deconvAttrs.stride.at(0); + PadStrideInfo deconv_info(stride_x, stride_y, pad_l, pad_r, pad_t, pad_b, DimensionRoundingType::FLOOR); + + return ACLDeconvTensorInfo{srcTensorInfo, weiTensorInfo, biasTensorInfo, dstTensorInfo, deconv_info}; +} + +AclDeconvExecutor::AclDeconvExecutor(const ExecutorContext::CPtr context) : DeconvExecutor(context) {} + +bool AclDeconvExecutor::init(const DeconvAttrs& deconvAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr &attr) { + this->deconvAttrs = deconvAttrs; + ACLDeconvTensorInfo aclDeconvTensorInfo = getACLDeconvTensorInfo(deconvAttrs, srcDescs, dstDescs); + TensorInfo srcTensorInfo = aclDeconvTensorInfo.srcTensorInfo; + TensorInfo weiTensorInfo = aclDeconvTensorInfo.weiTensorInfo; + TensorInfo biasTensorInfo = aclDeconvTensorInfo.biasTensorInfo; + TensorInfo dstTensorInfo = aclDeconvTensorInfo.dstTensorInfo; + PadStrideInfo deconv_info = aclDeconvTensorInfo.deconv_info; + + arm_compute::Status status = arm_compute::NEDeconvolutionLayer::validate(&srcTensorInfo, + &weiTensorInfo, + deconvAttrs.withBiasesParam ? &biasTensorInfo : nullptr, + &dstTensorInfo, + deconv_info); + if (!status) { + DEBUG_LOG("NEDeconvolutionLayer validation failed: ", status.error_description()); + return false; + } + + srcTensor.allocator()->init(srcTensorInfo); + weiTensor.allocator()->init(weiTensorInfo); + dstTensor.allocator()->init(dstTensorInfo); + if (deconvAttrs.withBiasesParam) + biasTensor.allocator()->init(biasTensorInfo); + + deconv = std::make_unique(); + deconv->configure(&srcTensor, &weiTensor, deconvAttrs.withBiasesParam ? 
&biasTensor : nullptr, &dstTensor, deconv_info); + + // weights tensor shape is changed because ACL expects [O, I, H, W] tensor while OV uses [I, O, H, W] tensor + weiBuffer = std::vector(srcDescs[1]->getShape().getStaticDims()[0] * + srcDescs[1]->getShape().getStaticDims()[1] * + srcDescs[1]->getShape().getStaticDims()[2] * + srcDescs[1]->getShape().getStaticDims()[3]); + return true; +} + +static void transpose_to_1023(const MemoryCPtr& srcMemPtr, std::vector& dst_data) { + const auto src_data = reinterpret_cast(srcMemPtr->getData()); + + const int DIM0 = srcMemPtr->getStaticDims()[0]; + const int DIM1 = srcMemPtr->getStaticDims()[1]; + const int DIM2 = srcMemPtr->getStaticDims()[2]; + const int DIM3 = srcMemPtr->getStaticDims()[3]; + + parallel_for3d(DIM0, DIM1, DIM2, [&](const int dim0, const int dim1, const int dim2) { + for (int dim3 = 0; dim3 < DIM3; ++dim3) { + const int src_off = dim0 * DIM1 * DIM2 * DIM3 + + dim1 * DIM2 * DIM3 + + dim2 * DIM3 + + dim3; + const int dst_off = dim1 * DIM0 * DIM2 * DIM3 + + dim0 * DIM2 * DIM3 + + dim2 * DIM3 + + dim3; + + dst_data[dst_off] = src_data[src_off]; + } + }); +} + +void AclDeconvExecutor::exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) { + // TODO: Remove transpose from exec + transpose_to_1023(src[1], weiBuffer); + + srcTensor.allocator()->import_memory(src[0]->getData()); + dstTensor.allocator()->import_memory(dst[0]->getData()); + weiTensor.allocator()->import_memory(weiBuffer.data()); + if (deconvAttrs.withBiasesParam) + biasTensor.allocator()->import_memory(src[2]->getData()); + deconv->run(); + + srcTensor.allocator()->free(); + dstTensor.allocator()->free(); + weiTensor.allocator()->free(); + if (deconvAttrs.withBiasesParam) + biasTensor.allocator()->free(); +} + +bool AclDeconvExecutorBuilder::customIsSupported(const DeconvAttrs &deconvAttrs, + const std::vector &srcDescs, + const std::vector &dstDescs) { + if ((srcDescs[0]->getShape().getDims().size() != 3 && 
srcDescs[0]->getShape().getDims().size() != 4) || + dstDescs[0]->getShape().getDims().size() != srcDescs[0]->getShape().getDims().size() || + srcDescs[1]->getShape().getDims().size() != 4) { + DEBUG_LOG("AclDeconvExecutor does not support dimension:", + " src[0]=", srcDescs[0]->getShape().getDims().size(), + " src[1]=", srcDescs[1]->getShape().getDims().size(), + " dst[0]=", dstDescs[0]->getShape().getDims().size()); + return false; + } + + // TODO: Ticket CVS-114087 - enable FP16 after the FP16 scope has been checked + if (!(one_of(srcDescs[0]->getPrecision(), /*InferenceEngine::Precision::FP16, */InferenceEngine::Precision::FP32) && + srcDescs[0]->getPrecision() == srcDescs[1]->getPrecision() && + srcDescs[1]->getPrecision() == dstDescs[0]->getPrecision())) { + DEBUG_LOG("AclDeconvExecutor does not support precisions:", + " src[0]=", srcDescs[0]->getPrecision(), + " src[1]=", srcDescs[1]->getPrecision(), + " dst[0]=", dstDescs[0]->getPrecision()); + return false; + } + + if (deconvAttrs.withBiasesParam && srcDescs[2]->getPrecision() != srcDescs[0]->getPrecision()) { + DEBUG_LOG("AclDeconvExecutor does not support precisions:", + " src[2]=", srcDescs[2]->getPrecision()); + return false; + } + + if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) && + srcDescs[1]->hasLayoutType(LayoutType::ncsp) && + dstDescs[0]->hasLayoutType(LayoutType::ncsp)) && + !(srcDescs[0]->hasLayoutType(LayoutType::nspc) && + srcDescs[1]->hasLayoutType(LayoutType::nspc) && + dstDescs[0]->hasLayoutType(LayoutType::nspc))) { + DEBUG_LOG("AclDeconvExecutor does not support layouts:", + " src[0]=", srcDescs[0]->serializeFormat(), + " src[1]=", srcDescs[1]->serializeFormat(), + " dst=", dstDescs[0]->serializeFormat()); + return false; + } + + if (deconvAttrs.withBiasesParam && + !(srcDescs[2]->hasLayoutType(LayoutType::ncsp)) && + !(srcDescs[2]->hasLayoutType(LayoutType::nspc))) { + DEBUG_LOG("AclDeconvExecutor does not support layouts:", + " src[0]=", srcDescs[0]->serializeFormat(), + " src[1]=",
srcDescs[1]->serializeFormat(), + " src[2]=", srcDescs[2]->serializeFormat(), + " dst=", dstDescs[0]->serializeFormat()); + return false; + } + + ACLDeconvTensorInfo aclDeconvTensorInfo = getACLDeconvTensorInfo(deconvAttrs, srcDescs, dstDescs); + TensorInfo srcTensorInfo = aclDeconvTensorInfo.srcTensorInfo; + TensorInfo weiTensorInfo = aclDeconvTensorInfo.weiTensorInfo; + TensorInfo biasTensorInfo = aclDeconvTensorInfo.biasTensorInfo; + TensorInfo dstTensorInfo = aclDeconvTensorInfo.dstTensorInfo; + PadStrideInfo deconv_info = aclDeconvTensorInfo.deconv_info; + + unsigned int kernel_x = (deconvAttrs.kernel.size() > 1) ? deconvAttrs.kernel.at(1) : deconvAttrs.kernel.at(0); + unsigned int kernel_y = deconvAttrs.kernel.at(0); + + // After stride=8 up-sampling in ACL Deconvolution layer slower than reference + if (deconv_info.stride().first >= 8 || deconv_info.stride().second >= 8) return false; + + unsigned int dilation_x = (deconvAttrs.dilation.size() > 1) ? deconvAttrs.dilation.at(1) : deconvAttrs.dilation.at(0); + unsigned int dilation_y = deconvAttrs.dilation.at(0); + if (!one_of(dilation_x, static_cast(0), static_cast(1)) || + !one_of(dilation_y, static_cast(0), static_cast(1))) return false; + + size_t in_h = srcDescs[0]->hasLayoutType(LayoutType::ncsp) ? srcDescs[0]->getShape().getDims()[2] : srcDescs[0]->getShape().getDims()[1]; + size_t in_w = srcDescs[0]->hasLayoutType(LayoutType::ncsp) ? srcDescs[0]->getShape().getDims()[3] : srcDescs[0]->getShape().getDims()[2]; + + // Validate function has bug (https://github.com/ARM-software/ComputeLibrary/issues/1061) with error exception. 
+ // We copy the deconvolution_output_dimensions function to get correct validation + // TODO: remove after fix + if (validate_deconvolution_output_dimensions(in_w, in_h, kernel_x, kernel_y, deconv_info)) { + DEBUG_LOG("NEDeconvolutionLayer arm_compute::deconvolution_output_dimensions failed"); + return false; + } + + arm_compute::Status status = arm_compute::NEDeconvolutionLayer::validate(&srcTensorInfo, + &weiTensorInfo, + deconvAttrs.withBiasesParam ? &biasTensorInfo : nullptr, + &dstTensorInfo, + deconv_info); + if (!status) { + DEBUG_LOG("NEDeconvolutionLayer validation failed: ", status.error_description()); + return false; + } + + return true; +} + +bool AclDeconvExecutorBuilder::validate_deconvolution_output_dimensions(unsigned int in_width, unsigned int in_height, + unsigned int kernel_width, + unsigned int kernel_height, + const PadStrideInfo &pad_stride_info) { + const unsigned int pad_left = pad_stride_info.pad_left(); + const unsigned int pad_top = pad_stride_info.pad_top(); + const unsigned int pad_right = pad_stride_info.pad_right(); + const unsigned int pad_bottom = pad_stride_info.pad_bottom(); + const unsigned int stride_x = pad_stride_info.stride().first; + const unsigned int stride_y = pad_stride_info.stride().second; + + if (!((in_width < 1 || in_height < 1) || + (((in_width - 1) * stride_x + kernel_width) < (pad_left + pad_right)) || + (((in_height - 1) * stride_y + kernel_height) < (pad_top + pad_bottom)))) { return false; } + return true; +} +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.hpp new file mode 100644 index 00000000000..911d3315bb2 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.hpp @@ -0,0 +1,78 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "nodes/executors/deconv.hpp" +#include
"arm_compute/runtime/NEON/NEFunctions.h" +#include "utils/debug_capabilities.h" +#include "acl_utils.hpp" + +namespace ov { +namespace intel_cpu { + +struct ACLDeconvTensorInfo { + arm_compute::TensorInfo srcTensorInfo; + arm_compute::TensorInfo weiTensorInfo; + arm_compute::TensorInfo biasTensorInfo; + arm_compute::TensorInfo dstTensorInfo; + arm_compute::PadStrideInfo deconv_info; +}; + +ACLDeconvTensorInfo getACLDeconvTensorInfo(const DeconvAttrs& deconvAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs); + +class AclDeconvExecutor : public DeconvExecutor { +public: + explicit AclDeconvExecutor(const ExecutorContext::CPtr context); + bool init(const DeconvAttrs& deconvAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr &attr) override; + void exec(const std::vector& src, + const std::vector& dst, + const void *post_ops_data_) override; + + impl_desc_type getImplType() const override { + return implType; + } + +private: + DeconvAttrs deconvAttrs; + impl_desc_type implType = impl_desc_type::acl; + + arm_compute::Tensor srcTensor; + arm_compute::Tensor weiTensor; + arm_compute::Tensor biasTensor; + arm_compute::Tensor dstTensor; + std::unique_ptr deconv = nullptr; + + std::vector weiBuffer; +}; + +class AclDeconvExecutorBuilder : public DeconvExecutorBuilder { +public: + static bool customIsSupported(const DeconvAttrs& deconvAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs); + + bool isSupported(const DeconvAttrs& deconvAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs) const override { + return customIsSupported(deconvAttrs, srcDescs, dstDescs); + } + + DeconvExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override { + return std::make_shared(context); + } + +private: + static bool validate_deconvolution_output_dimensions(unsigned int in_width, unsigned int in_height, + unsigned int kernel_width, unsigned int kernel_height, + const 
arm_compute::PadStrideInfo &pad_stride_info); +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/deconv.cpp b/src/plugins/intel_cpu/src/nodes/executors/deconv.cpp new file mode 100644 index 00000000000..b11a1e57190 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/deconv.cpp @@ -0,0 +1,13 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "deconv.hpp" + +namespace ov { +namespace intel_cpu { + +using namespace InferenceEngine; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/deconv.hpp b/src/plugins/intel_cpu/src/nodes/executors/deconv.hpp new file mode 100644 index 00000000000..2d553ff9044 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/deconv.hpp @@ -0,0 +1,60 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "cpu_memory.h" +#include "onednn/iml_type_mapper.h" +#include "executor.hpp" +#include +#include "nodes/common/dnnl_executor.h" + +namespace ov { +namespace intel_cpu { + +struct DeconvAttrs { + std::vector kernel; + std::vector stride; + std::vector dilation; + std::vector paddingL; + std::vector paddingR; + ov::CoordinateDiff outputPadding; + bool withBiasesParam = false; +}; + +class DeconvExecutor { +public: + explicit DeconvExecutor(const ExecutorContext::CPtr context) : context(context) {} + + virtual bool init(const DeconvAttrs& deconvAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr &attr) = 0; + + virtual void exec(const std::vector& src, + const std::vector& dst, + const void *post_ops_data_) = 0; + virtual ~DeconvExecutor() = default; + virtual impl_desc_type getImplType() const = 0; + +protected: + DeconvAttrs deconvAttrs; + ExecutorContext::CPtr context; +}; + +using DeconvExecutorPtr = std::shared_ptr; +using DeconvExecutorCPtr = std::shared_ptr; + 
+class DeconvExecutorBuilder { +public: + ~DeconvExecutorBuilder() = default; + virtual bool isSupported(const DeconvAttrs& convAttrs, const std::vector& srcDescs, const std::vector& dstDescs) const = 0; + virtual DeconvExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const = 0; +}; + +using DeconvExecutorBuilderPtr = std::shared_ptr; +using DeconvExecutorBuilderCPtr = std::shared_ptr; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/deconv_list.cpp b/src/plugins/intel_cpu/src/nodes/executors/deconv_list.cpp new file mode 100644 index 00000000000..f5b897c2d1b --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/deconv_list.cpp @@ -0,0 +1,19 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "deconv_list.hpp" + +namespace ov { +namespace intel_cpu { + +const std::vector& getDeconvExecutorsList() { + static std::vector descs = { + OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared()) + }; + + return descs; +} + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/deconv_list.hpp b/src/plugins/intel_cpu/src/nodes/executors/deconv_list.hpp new file mode 100644 index 00000000000..e94476178ff --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/deconv_list.hpp @@ -0,0 +1,79 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "executor.hpp" + +#include "deconv.hpp" +#if defined(OV_CPU_WITH_ACL) +#include "acl/acl_deconv.hpp" +#endif + +#include "onednn/iml_type_mapper.h" +#include "common/primitive_cache.hpp" + +namespace ov { +namespace intel_cpu { + +struct DeconvExecutorDesc { + ExecutorType executorType; + DeconvExecutorBuilderCPtr builder; +}; + +const std::vector& getDeconvExecutorsList(); + +class DeconvExecutorFactory : public ExecutorFactory { +public: + DeconvExecutorFactory(const DeconvAttrs& deconvAttrs, + 
const std::vector& srcDescs, + const std::vector& dstDescs, + const ExecutorContext::CPtr context) : ExecutorFactory(context) { + for (auto& desc : getDeconvExecutorsList()) { + if (desc.builder->isSupported(deconvAttrs, srcDescs, dstDescs)) { + supportedDescs.push_back(desc); + } + } + } + + ~DeconvExecutorFactory() = default; + virtual DeconvExecutorPtr makeExecutor(const DeconvAttrs& deconvAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr &attr) { + auto build = [&](const DeconvExecutorDesc* desc) { + auto executor = desc->builder->makeExecutor(context); + if (executor->init(deconvAttrs, srcDescs, dstDescs, attr)) { + return executor; + } + DeconvExecutorPtr ptr = nullptr; + return ptr; + }; + + if (chosenDesc) { + if (auto executor = build(chosenDesc)) { + return executor; + } + } + + for (const auto& sd : supportedDescs) { + if (auto executor = build(&sd)) { + chosenDesc = &sd; + return executor; + } + } + + IE_THROW() << "DeconvExecutorFactory: Supported executor is not found"; + } + +private: + std::vector supportedDescs; + const DeconvExecutorDesc* chosenDesc = nullptr; +}; + +using DeconvExecutorFactoryPtr = std::shared_ptr; +using DeconvExecutorFactoryCPtr = std::shared_ptr; + +} // namespace intel_cpu +} // namespace ov