Files
openvino/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp
2021-10-28 10:52:14 +03:00

899 lines
40 KiB
C++

// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "mkldnn_conv_node.h"
#include "mkldnn_reorder_node.h"
#include "mkldnn_input_node.h"
#include "mkldnn_eltwise_node.h"
#include "mkldnn_fake_quantize_node.h"
#include "mkldnn_pooling_node.h"
#include "mkldnn_concat_node.h"
#include "cpu/x64/cpu_isa_traits.hpp"
#include <string>
#include <vector>
#include <mkldnn_types.h>
#include <mkldnn_extension_utils.h>
#include <utils/general_utils.h>
#include <ngraph/ops.hpp>
#include <cpu/x64/jit_generator.hpp>
#include "common/cpu_convert.h"
#include <memory_desc/cpu_memory_desc_utils.h>
#include "memory_desc/dnnl_blocked_memory_desc.h"
#include "utils/cpu_utils.hpp"
using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
bool MKLDNNConvolutionNode::isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
if (isDynamicNgraphNode(op)) {
errorMessage = "Doesn't support op with dynamic shapes";
return false;
}
if (!ngraph::is_type<ngraph::op::v1::Convolution>(op) && !ngraph::is_type<ngraph::op::v1::GroupConvolution>(op)) {
errorMessage = "Only opset1 Convolution and GroupConvolution operations are supported";
return false;
}
size_t ndims = op->get_input_shape(0).size();
if ((ndims < 4) || (ndims > 5)) {
errorMessage = "Doesn't support 'data' input with rank: " + std::to_string(ndims);
return false;
}
} catch (...) {
return false;
}
return true;
}
MKLDNNConvolutionNode::MKLDNNConvolutionNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache)
: MKLDNNNode(op, eng, cache), withBiases(false), withSum(false), withDWConv(false),
isGrouped(false), dw_conv_oc(0), dw_conv_ih(0), dw_conv_iw(0), dw_conv_in_dt(memory::data_type::undef),
groupNum(1lu), eltwisePrecision(Precision::FP32) {
std::string errorMessage;
if (!isSupportedOperation(op, errorMessage)) {
IE_THROW(NotImplemented) << errorMessage;
}
auto convolutionOp = ngraph::as_type_ptr<ngraph::op::v1::Convolution>(op);
auto groupConvolutionOp = ngraph::as_type_ptr<ngraph::op::v1::GroupConvolution>(op);
if (convolutionOp) {
algorithm = ConvolutionCommon;
groupNum = 1;
isGrouped = false;
weightDims = convolutionOp->input_value(1).get_shape();
IC = weightDims[1];
groupIC = IC;
groupOC = weightDims[0];
biasesDims = { groupOC };
for (int i = 0; i < convolutionOp->get_strides().size(); i++) {
stride.push_back(convolutionOp->get_strides()[i]);
}
for (int i = 0; i < convolutionOp->get_dilations().size(); i++) {
dilation.push_back(static_cast<ptrdiff_t>(convolutionOp->get_dilations()[i]) - 1);
}
paddingL = convolutionOp->get_pads_begin();
paddingR = convolutionOp->get_pads_end();
} else if (groupConvolutionOp) {
algorithm = ConvolutionGrouped;
groupNum = groupConvolutionOp->input_value(1).get_shape()[0];
isGrouped = true;
weightDims = groupConvolutionOp->input_value(1).get_shape();
groupIC = weightDims[2];
IC = groupIC * groupNum;
groupOC = weightDims[1];
biasesDims = {groupOC * groupNum};
for (int i = 0; i < groupConvolutionOp->get_strides().size(); i++) {
stride.push_back(groupConvolutionOp->get_strides()[i]);
}
for (int i = 0; i < groupConvolutionOp->get_dilations().size(); i++) {
dilation.push_back(static_cast<ptrdiff_t>(groupConvolutionOp->get_dilations()[i]) - 1);
}
paddingL = groupConvolutionOp->get_pads_begin();
paddingR = groupConvolutionOp->get_pads_end();
}
}
bool MKLDNNConvolutionNode::canBeExecutedInInt8() const {
auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(getOriginalInputPrecisionAtPort(0));
if (!inputZeroPoints.empty())
inputDataType = memory::data_type::u8;
auto weightsDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(getOriginalInputPrecisionAtPort(1));
if (!weightsZeroPoints.empty())
weightsDataType = memory::data_type::s8;
return one_of(inputDataType, memory::data_type::u8, memory::data_type::s8) && weightsDataType == memory::data_type::s8;
}
InferenceEngine::Precision MKLDNNConvolutionNode::fusedEltwisePrecision(const MKLDNNNodePtr& fusingNode) const {
InferenceEngine::Precision eltwisePrecision;
int fusingPort = fusingNode->getFusingPort();
if (fusingPort == 0) {
eltwisePrecision = fusingNode->getOriginalInputPrecisionAtPort(1);
} else if (fusingPort == 1) {
eltwisePrecision = fusingNode->getOriginalInputPrecisionAtPort(0);
} else {
IE_THROW() << "Cannot determine Eltwise post op precision for Convolution node with name '" << getName() << "'";
}
return eltwisePrecision;
}
void MKLDNNConvolutionNode::getSupportedDescriptors() {
if (!descs.empty())
return;
withBiases = getOriginalInputsNumber() == 3;
if (!implPriorities.empty()) {
isPrimitivesPriorityDefined = true;
// winograd support only constant weights and bias
isWino = std::find(implPriorities.begin(), implPriorities.end(), impl_desc_type::jit_avx512_winograd) != implPriorities.end() &&
mkldnn::impl::cpu::x64::mayiuse(mkldnn::impl::cpu::x64::avx512_common) && !canBeExecutedInInt8() &&
getParentEdgeAt(1)->getParent()->isConstant() && getParentEdgeAt(1)->getParent()->getType() == Input &&
(withBiases ? (getParentEdgeAt(2)->getParent()->isConstant() && getParentEdgeAt(2)->getParent()->getType() == Input) : true);
}
withSum = false;
int expectedInputEdgesNum = static_cast<int>(getOriginalInputsNumber());
for (int i = 0; i < fusedWith.size(); i++) {
if (fusedWith[i]->getType() == Convolution) {
expectedInputEdgesNum += static_cast<int>(fusedWith[i]->getOriginalInputsNumber()) - 1;
}
if (fusedWith[i]->getAlgorithm() == EltwiseAdd) {
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
if (eltwiseNode && eltwiseNode->isSpecialConvolutionAddFusing()) {
withSum = true;
expectedInputEdgesNum++;
}
}
}
auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(getOriginalInputPrecisionAtPort(0));
if (!inputZeroPoints.empty())
inputDataType = memory::data_type::u8;
auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(getOriginalOutputPrecisionAtPort(0));
eltwisePrecision = MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType);
if (!fusedWith.empty()) {
outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(fusedWith[fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0));
eltwisePrecision = MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType);
}
// We need to make sure that convolution output and second input of fused Eltwise operation
// have equal precision sizes since they use the same physical memory. In case precisions are different we upscale to FP32.
if (outputDataType != memory::data_type::f32 && outputDataType != memory::data_type::bf16 && withSum) {
for (int i = 0; i < fusedWith.size(); i++) {
if (fusedWith[i]->getAlgorithm() == EltwiseAdd) {
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
if (eltwiseNode && eltwiseNode->isSpecialConvolutionAddFusing()) {
eltwisePrecision = fusedEltwisePrecision(fusedWith[i]);
if (MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType).size() != eltwisePrecision.size()) {
eltwisePrecision = Precision::FP32;
outputDataType = memory::data_type::f32;
}
break;
}
}
}
}
if (getParentEdges().size() != expectedInputEdgesNum)
IE_THROW() << "Incorrect number of input edges for layer " << getName() << ", expected: " << expectedInputEdgesNum
<< " actual: " << getParentEdges().size();
if (getChildEdges().empty())
IE_THROW() << "Incorrect number of output edges for layer " << getName();
int ndims = getInputShapeAtPort(0).getRank();
withDWConv = isFusedWith(Convolution);
for (int i = 0; i < fusedWith.size(); i++) {
auto *convolutionNode = dynamic_cast<MKLDNNConvolutionNode *>(fusedWith[i].get());
if (convolutionNode) {
auto& inActivationDims = convolutionNode->inputShapes[0].getStaticDims();
dw_conv_ih = inActivationDims[convolutionNode->inputShapes[0].getRank() - 2];
dw_conv_iw = inActivationDims[convolutionNode->inputShapes[0].getRank() - 1];
auto& outDims = convolutionNode->outputShapes[0].getStaticDims();
dw_conv_oc = outDims[1];
const auto &dwWeightsDims = convolutionNode->inputShapes[1].getStaticDims();
dw_conv_kernel.push_back(dwWeightsDims[dwWeightsDims.size() - 1]);
dw_conv_kernel.push_back(dwWeightsDims[dwWeightsDims.size() - 2]);
dw_conv_strides = convolutionNode->getStride();
if (canBeExecutedInInt8()) {
if (i == 0) {
dw_conv_in_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(getOriginalOutputPrecisionAtPort(0));
} else {
dw_conv_in_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(fusedWith[i - 1]->getOriginalOutputPrecisionAtPort(0));
}
} else {
dw_conv_in_dt = memory::data_type::f32;
}
for (int j = 0; j < paddingR.size(); j++) {
int with_group = isGrouped ? 1 : 0;
int krn = weightDims[with_group + 2 + j];
int src = getInputShapeAtPort(0).getStaticDims()[2 + j];
int dst = getOutputShapeAtPort(0).getStaticDims()[2 + j];
krn = (krn - 1)*(dilation[j] + 1) + 1;
int calc_dst = (src - krn + paddingL[j]) / stride[j] + 1;
paddingR[j] = (dst - calc_dst) * stride[j];
}
}
}
MemoryDescPtr in_candidate, out_candidate;
if (canBeExecutedInInt8()) {
// We have to extend convolution_x8s8s32x from oneDNN to support BF16 output data type
if (outputDataType == memory::data_type::bf16)
outputDataType = memory::data_type::f32;
if (eltwisePrecision == Precision::BF16)
eltwisePrecision = Precision::FP32;
in_candidate = std::make_shared<DnnlBlockedMemoryDesc>(getInputShapeAtPort(0),
inputDataType, ndims == 5 ? memory::format_tag::ndhwc : memory::format_tag::nhwc);
out_candidate = std::make_shared<DnnlBlockedMemoryDesc>(getOutputShapeAtPort(0),
outputDataType, ndims == 5 ? memory::format_tag::ndhwc : memory::format_tag::nhwc);
createDescriptor({ in_candidate }, { out_candidate });
} else {
inputDataType = (getOriginalInputPrecisionAtPort(0) == Precision::BF16
&& !(isDepthWise() && ndims == 5)) ? memory::data_type::bf16 : memory::data_type::f32;
outputDataType = (getOriginalOutputPrecisionAtPort(0) == Precision::BF16
&& !(isDepthWise() && ndims == 5)) ? memory::data_type::bf16 : memory::data_type::f32;
eltwisePrecision = Precision::FP32;
for (int i = 0; i < fusedWith.size(); i++) {
if (fusedWith[i]->getAlgorithm() == EltwiseAdd) {
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
if (eltwiseNode && eltwiseNode->isSpecialConvolutionAddFusing()) {
eltwisePrecision = fusedEltwisePrecision(fusedWith[i]);
// TODO(amalyshe): there might be situation when convolution can be executed in BF16,
// output is required in FP32 but eltwise inplace tensor would be in BF16
// currently we forcedly change output to the BF16 that will add reoreder after the node
// Another situation can be when we mark output as FP32 and Eltwise asPrecison (which stand
// for input of inplace tensor precision) to FP32. This will add reorder for that in-place tensor
// bofore the fused convolution. This behaviour might be more correct regarding expected markup
// of the graph but performance of first and second approaches might be different. Need to verify
outputDataType = eltwisePrecision == Precision::BF16 ? memory::data_type::bf16 : memory::data_type::f32;
eltwisePrecision = MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType);
}
}
}
// correction for cases of FP32 input - we do not have FP32 convolution supported BF16 output
if (inputDataType == memory::data_type::f32
&& (outputDataType == memory::data_type::bf16 || eltwisePrecision == Precision::BF16)) {
outputDataType = memory::data_type::f32;
eltwisePrecision = Precision::FP32;
}
if (one_of(ndims, 4, 5)) {
memory::format_tag ncsp = ndims == 4 ? memory::format_tag::nchw : memory::format_tag::ncdhw;
memory::format_tag nspc = ndims == 4 ? memory::format_tag::nhwc : memory::format_tag::ndhwc;
memory::format_tag nCsp16c = ndims == 4 ? memory::format_tag::nChw16c : memory::format_tag::nCdhw16c;
memory::format_tag nCsp8c = ndims == 4 ? memory::format_tag::nChw8c : memory::format_tag::nCdhw8c;
auto inputShape = getInputShapeAtPort(0);
auto outputShape = getOutputShapeAtPort(0);
if (IC == 1 && groupOC == 1) {
in_candidate = std::make_shared<DnnlBlockedMemoryDesc>(inputShape, inputDataType, ncsp);
out_candidate = std::make_shared<DnnlBlockedMemoryDesc>(outputShape, outputDataType, ncsp);
createDescriptor({ in_candidate }, { out_candidate });
} else if (IC < 4) {
in_candidate = std::make_shared<DnnlBlockedMemoryDesc>(inputShape, inputDataType, ncsp);
out_candidate = std::make_shared<DnnlBlockedMemoryDesc>(outputShape, outputDataType, nCsp16c);
createDescriptor({ in_candidate }, { out_candidate });
out_candidate = std::make_shared<DnnlBlockedMemoryDesc>(outputShape, outputDataType, nCsp8c);
createDescriptor({ in_candidate }, { out_candidate });
} else {
in_candidate = std::make_shared<DnnlBlockedMemoryDesc>(inputShape, inputDataType, nCsp16c);
out_candidate = std::make_shared<DnnlBlockedMemoryDesc>(outputShape, outputDataType, nCsp16c);
createDescriptor({ in_candidate }, { out_candidate });
in_candidate = std::make_shared<DnnlBlockedMemoryDesc>(inputShape, inputDataType, nCsp8c);
out_candidate = std::make_shared<DnnlBlockedMemoryDesc>(outputShape, outputDataType, nCsp8c);
createDescriptor({ in_candidate }, { out_candidate });
}
in_candidate = std::make_shared<DnnlBlockedMemoryDesc>(inputShape, inputDataType, ncsp);
out_candidate = std::make_shared<DnnlBlockedMemoryDesc>(outputShape, outputDataType, ncsp);
createDescriptor({ in_candidate }, { out_candidate });
if (inputDataType != memory::data_type::bf16 && isNspcAvailable()) {
in_candidate = std::make_shared<DnnlBlockedMemoryDesc>(inputShape, inputDataType, nspc);
out_candidate = std::make_shared<DnnlBlockedMemoryDesc>(outputShape, outputDataType, nspc);
createDescriptor({ in_candidate }, { out_candidate });
}
}
}
}
void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWeights = false, bool initAsBinary = false) {
bool initBinaryMemory = initWeights;
mkldnn::post_ops ops;
for (auto &node : fusedWith) {
if (node->getType() == Split || node->getType() == Concatenation)
continue;
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
if (eltwiseNode) {
if (eltwiseNode->isSpecialConvolutionAddFusing()) {
ops.append_sum(1.0, MKLDNNExtensionUtils::IEPrecisionToDataType(eltwisePrecision));
} else {
constexpr int align = 16;
eltwiseNode->appendPostOps(ops, getOutputShapeAtPort(0).getStaticDims(), align, initAsBinary, initBinaryMemory);
if (initBinaryMemory) {
if (eltwiseNode->scalesMemory)
binaryPostOpsArgs.push_back(eltwiseNode->scalesMemory->GetPrimitive());
if (eltwiseNode->shiftsMemory)
binaryPostOpsArgs.push_back(eltwiseNode->shiftsMemory->GetPrimitive());
}
}
continue;
}
auto* fakeQuantizeNode = dynamic_cast<MKLDNNFakeQuantizeNode *>(node.get());
if (fakeQuantizeNode) {
constexpr int align = -1;
// no need to fill post ops dims for fq, make sense only for bin fq
fakeQuantizeNode->appendPostOps(ops, VectorDims{}, align, initAsBinary, initBinaryMemory);
if (initBinaryMemory) {
if (fakeQuantizeNode->cropHighMemory)
binaryPostOpsArgs.push_back(fakeQuantizeNode->cropHighMemory->GetPrimitive());
if (fakeQuantizeNode->cropLowMemory)
binaryPostOpsArgs.push_back(fakeQuantizeNode->cropLowMemory->GetPrimitive());
if (fakeQuantizeNode->inputScaleMemory)
binaryPostOpsArgs.push_back(fakeQuantizeNode->inputScaleMemory->GetPrimitive());
if (fakeQuantizeNode->inputShiftMemory)
binaryPostOpsArgs.push_back(fakeQuantizeNode->inputShiftMemory->GetPrimitive());
if (fakeQuantizeNode->outputScaleMemory)
binaryPostOpsArgs.push_back(fakeQuantizeNode->outputScaleMemory->GetPrimitive());
if (fakeQuantizeNode->outputShiftMemory)
binaryPostOpsArgs.push_back(fakeQuantizeNode->outputShiftMemory->GetPrimitive());
}
continue;
}
auto* convolutionNode = dynamic_cast<MKLDNNConvolutionNode *>(node.get());
if (convolutionNode) {
if (initWeights) {
// todo: rewrite onto append_dw_k3s2p1
ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS],
dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS],
mkldnn::memory::convert_to_c(dw_conv_in_dt),
static_cast<const float *>(getParentEdgeAt(
getOriginalInputsNumber() + 0)->getMemory().GetData()),
static_cast<const float *>(getParentEdgeAt(
getOriginalInputsNumber() + 1)->getMemory().GetData()));
} else {
// todo: rewrite onto append_dw_k3s2p1
ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS],
dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS],
mkldnn::memory::convert_to_c(dw_conv_in_dt),
nullptr,
nullptr);
}
continue;
}
IE_THROW() << "Fusing of " << NameFromType(node->getType()) << " operation to " << NameFromType(this->getType()) << " node is not implemented";
}
attr.set_post_ops(ops);
}
void MKLDNNConvolutionNode::selectOptimalPrimitiveDescriptor() {
selectPreferPrimitiveDescriptor(getPrimitivesPriority(), true);
}
void MKLDNNConvolutionNode::initSupportedPrimitiveDescriptors() {
if (!supportedPrimitiveDescriptors.empty())
return;
// attr[0] - depthwise, quantize
// attr[1] - binary
mkldnn::primitive_attr attrs[1];
setPostOps(attrs[0]);
// setPostOps(attrs[1], false, true);
bool containJitImpl = false;
for (auto& desc : descs) {
if (containJitImpl && isPossibleToSkipInitConfig(desc))
continue;
for (auto &attr : attrs) {
addZeroPoints(attr);
auto itpd = desc.createPrimitiveDescriptorIterator(getEngine(), attr);
while (static_cast<bool>(itpd)) {
NodeConfig config;
config.dynBatchSupport = true;
for (size_t i = 0; i < descInputNumbers(desc); i++) {
PortConfig dataConfig;
dataConfig.inPlace = -1;
dataConfig.constant = false;
auto desc = getSrcMemDesc(itpd, i);
if (desc->getType() & MemoryDescType::Blocked && !isGrouped) {
dataConfig.desc = desc->as<BlockedMemoryDesc>()->cloneWithUndefStridesAndOffset();
} else {
dataConfig.desc = std::move(desc);
}
config.inConfs.push_back(dataConfig);
}
if (withDWConv) {
auto weightsPrc = MKLDNNExtensionUtils::IEPrecisionToDataType(dw_conv_in_dt == mkldnn_u8 ? Precision::I8 : Precision::FP32);
auto biasPrc = memory::data_type::f32;
std::vector<size_t> dwWeightsDims({dw_conv_oc, 1, 1, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS]});
std::vector<size_t> dwBiasesDims({dw_conv_oc});
PortConfig dataConfig;
dataConfig.inPlace = -1;
dataConfig.constant = false;
dataConfig.desc = std::make_shared<DnnlBlockedMemoryDesc>(Shape(dwWeightsDims), weightsPrc, memory::format_tag::Goihw8g);
config.inConfs.push_back(dataConfig);
dataConfig.desc = std::make_shared<DnnlBlockedMemoryDesc>(Shape(dwBiasesDims), biasPrc, memory::format_tag::x);
config.inConfs.push_back(dataConfig);
}
for (size_t i = 0; i < descOutputNumbers(desc); i++) {
PortConfig dataConfig;
if (withSum) {
dataConfig.inPlace = getParentEdges().size() - 1;
}
dataConfig.constant = false;
auto desc = getDstMemDesc(itpd, i);
if (desc->getType() & MemoryDescType::Blocked && !isGrouped) {
dataConfig.desc = desc->as<BlockedMemoryDesc>()->cloneWithUndefStridesAndOffset();
} else {
dataConfig.desc = std::move(desc);
}
config.outConfs.push_back(dataConfig);
if (withSum) {
dataConfig.inPlace = -1;
dataConfig.desc = dataConfig.desc->cloneWithNewPrecision(dataConfig.desc->getPrecision());
config.inConfs.push_back(dataConfig);
}
}
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
if (impl_type & jit)
containJitImpl = true;
supportedPrimitiveDescriptors.emplace_back(config, impl_type);
if (!itpd.next_impl())
break;
}
}
}
}
void MKLDNNConvolutionNode::createPrimitive() {
if (prim)
return;
mkldnn::primitive_attr attr;
addZeroPoints(attr);
// todo: [AV] delete "false" to use binary mechanism
if (false && getSelectedPrimitiveDescriptor()->getImplementationType() == jit_gemm) {
setPostOps(attr, true, true);
} else {
setPostOps(attr, true);
}
auto prim_desc = createPrimitiveDescriptor<convolution_forward::primitive_desc,
convolution_forward::desc>(attr);
prim.reset(new convolution_forward(prim_desc));
auto src = getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
auto dst = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
if (withBiases)
primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_WEIGHTS, getWeights()}, {DNNL_ARG_BIAS, getBias()}, {DNNL_ARG_DST, dst}};
else
primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_WEIGHTS, getWeights()}, {DNNL_ARG_DST, dst}};
// todo: [AV] uncomment to use binary mechanism
// auto post_ops = attr.get_post_ops();
// int idx = 0;
// for (int i = 0; i < post_ops.len(); i++) {
// if (post_ops.kind(i) == mkldnn::primitive::kind::binary) {
// primArgs.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1, binaryPostOpsArgs[idx++]});
// }
// }
}
bool MKLDNNConvolutionNode::created() const {
return getType() == Convolution;
}
void MKLDNNConvolutionNode::createDescriptor(const std::vector<MemoryDescPtr>& inputDesc,
const std::vector<MemoryDescPtr>& outputDesc) {
const auto inDesc = MemoryDescUtils::convertToDnnlMemoryDesc(inputDesc[0])->getDnnlDesc();
const auto outDesc = MemoryDescUtils::convertToDnnlMemoryDesc(outputDesc[0])->getDnnlDesc();
memory::data_type wdt = static_cast<memory::data_type>(inDesc.data.data_type);
memory::data_type bdt = memory::data_type::f32;
if (inDesc.data.data_type == mkldnn_s8 || inDesc.data.data_type == mkldnn_u8) {
wdt = memory::data_type::s8;
}
mkldnn::memory::desc wgh_candidate(MKLDNNExtensionUtils::convertToDnnlDims(weightDims), wdt, memory::format_tag::any);
std::vector<mkldnn::algorithm> algorithms;
if (isWinograd())
algorithms.push_back(mkldnn::algorithm::convolution_winograd);
algorithms.push_back(mkldnn::algorithm::convolution_direct);
for (auto alg : algorithms) {
try {
std::shared_ptr<mkldnn::convolution_forward::desc> conv_desc;
if (withBiases) {
mkldnn::memory::desc bias_candidate(MKLDNNExtensionUtils::convertToDnnlDims(biasesDims), bdt, memory::format_tag::any);
conv_desc.reset(new convolution_forward::desc(prop_kind::forward_scoring, alg,
inDesc, wgh_candidate, bias_candidate, outDesc,
mkldnn::memory::dims(stride.begin(), stride.end()),
mkldnn::memory::dims(dilation.begin(), dilation.end()),
mkldnn::memory::dims(paddingL.begin(), paddingL.end()),
mkldnn::memory::dims(paddingR.begin(), paddingR.end())));
} else {
conv_desc.reset(new convolution_forward::desc(prop_kind::forward_scoring, alg,
inDesc, wgh_candidate, outDesc,
mkldnn::memory::dims(stride.begin(), stride.end()),
mkldnn::memory::dims(dilation.begin(), dilation.end()),
mkldnn::memory::dims(paddingL.begin(), paddingL.end()),
mkldnn::memory::dims(paddingR.begin(), paddingR.end())));
}
descs.emplace_back(conv_desc);
} catch (...) {
IE_THROW() << "Cannot create convolution forward descriptor for layer: " << getName();
}
}
}
void MKLDNNConvolutionNode::addZeroPoints(mkldnn::primitive_attr& attr) const {
if (!inputZeroPoints.empty())
attr.set_input_zero_points(1 << 1 /*through C dim*/, inputZeroPoints);
if (!weightsZeroPoints.empty())
attr.set_weights_zero_points(1 << 1 /*through C dim*/, weightsZeroPoints);
if (!outputCompensation.empty()) {
attr.set_output_compensations(1 << 1 /*through C dim*/, outputCompensation);
}
}
void MKLDNNConvolutionNode::initDescriptor(const NodeConfig& config) {
auto *selectedPD = getSelectedPrimitiveDescriptor();
if (!selectedPD) {
return;
}
// Strided blobs feature support.
// Works only for FP32 convolutions for now.
bool isStridedBlobsSupported = true;
// TODO [NM]: refactor via using global executionPrecision.
if (canBeExecutedInInt8()) {
isStridedBlobsSupported = false;
}
if (isStridedBlobsSupported) {
createDescriptor({config.inConfs[0].desc}, {config.outConfs[0].desc});
}
// attr[0] - depthwise, quantize
// attr[1] - binary
mkldnn::primitive_attr attrs[1];
setPostOps(attrs[0]);
// setPostOps(attrs[1], false, true);
auto rightConfig = selectedPD->getConfig();
size_t selected_count = 0;
bool containJitImpl = false;
for (size_t i = 0; i < descs.size(); i++) {
auto& desc = descs[i];
if (containJitImpl && isPossibleToSkipInitConfig(desc))
continue;
for (auto &attr : attrs) {
addZeroPoints(attr);
auto itpd = desc.createPrimitiveDescriptorIterator(getEngine(), attr);
while (static_cast<bool>(itpd)) {
NodeConfig cfg;
cfg.dynBatchSupport = true;
for (size_t j = 0; j < descInputNumbers(desc); j++) {
PortConfig dataConfig;
dataConfig.inPlace = -1;
dataConfig.constant = false;
dataConfig.desc = getSrcMemDesc(itpd, j);
cfg.inConfs.push_back(dataConfig);
}
if (withDWConv) {
auto weightsPrc = MKLDNNExtensionUtils::IEPrecisionToDataType(dw_conv_in_dt == mkldnn_u8 ? Precision::I8 : Precision::FP32);
auto biasPrc = memory::data_type::f32;
std::vector <size_t> dwWeightsDims({dw_conv_oc, 1, 1, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS]});
std::vector <size_t> dwBiasesDims({dw_conv_oc});
PortConfig dataConfig;
dataConfig.inPlace = -1;
dataConfig.constant = false;
dataConfig.desc = std::make_shared<DnnlBlockedMemoryDesc>(Shape(dwWeightsDims), weightsPrc, memory::format_tag::Goihw8g);
cfg.inConfs.push_back(dataConfig);
dataConfig.desc = std::make_shared<DnnlBlockedMemoryDesc>(Shape(dwBiasesDims), biasPrc, memory::format_tag::x);
cfg.inConfs.push_back(dataConfig);
}
for (size_t j = 0; j < descOutputNumbers(desc); j++) {
PortConfig dataConfig;
dataConfig.inPlace = -1;
dataConfig.constant = false;
dataConfig.desc = getDstMemDesc(itpd, j);
if (withSum) {
auto eltwiseConfig = dataConfig;
eltwiseConfig.desc = eltwiseConfig.desc->cloneWithNewPrecision(eltwisePrecision);
cfg.inConfs.push_back(eltwiseConfig);
dataConfig.inPlace = getParentEdges().size() - 1;
}
cfg.outConfs.push_back(dataConfig);
}
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
if (impl_type & jit)
containJitImpl = true;
if (selected_count == selectedPrimitiveDescriptorIndex) {
if (impl_type != selectedPD->getImplementationType()) {
IE_THROW() << "Cannot get the original layer configuration!";
}
rightConfig = cfg;
}
if (i == descs.size() - 1 && isStridedBlobsSupported) {
if (impl_type == selectedPD->getImplementationType()) {
rightConfig = config;
}
}
selected_count++;
if (!itpd.next_impl())
break;
}
}
}
selectedPD->setConfig(rightConfig);
}
void MKLDNNConvolutionNode::filterSupportedPrimitiveDescriptors() {
MKLDNNNode::filterSupportedPrimitiveDescriptors();
// We also need to filter descs in Convolution node
filterSupportedDescriptors();
}
void MKLDNNConvolutionNode::filterSupportedDescriptors() {
if (!inputMemoryFormatsFilter.empty() || !outputMemoryFormatsFilter.empty()) {
if (inputMemoryFormatsFilter.size() > 1 || outputMemoryFormatsFilter.size() > 1) {
IE_THROW() << "Incorrect number of input or output memory formats for Convolution node";
}
auto itd = descs.begin();
while (itd != descs.end()) {
bool isSuitableDesc = true;
if (!inputMemoryFormatsFilter.empty()) {
auto src_tdesc = MKLDNNExtensionUtils::makeDescriptor(std::shared_ptr<mkldnn::convolution_forward::desc>(*itd)->data.src_desc);
isSuitableDesc &= src_tdesc->isSame(inputMemoryFormatsFilter[0]);
}
if (!outputMemoryFormatsFilter.empty()) {
auto dst_tdesc = MKLDNNExtensionUtils::makeDescriptor(std::shared_ptr<mkldnn::convolution_forward::desc>(*itd)->data.dst_desc);
isSuitableDesc &= dst_tdesc->isSame(outputMemoryFormatsFilter[0]);
}
if (!isSuitableDesc) {
itd = descs.erase(itd);
} else {
itd++;
}
}
}
}
bool MKLDNNConvolutionNode::isPossibleToSkipInitConfig(MKLDNNDescriptor &desc) const {
// WA: In some cases, we can predict in advance the type of primitive that will be called in the future.
// In particular, isPossibleToSkipInitConfig() checks whether we can skip the creation of primitives with
// gemm implementation, which significantly increase the network load time.
if (!inputMemoryFormatsFilter.empty() || !outputMemoryFormatsFilter.empty())
return false;
if (isPrimitivesPriorityDefined)
return false;
// Here we check that we will not delete jit_planar_conv primitive by mistake.
// It requires:
// 1) strides equal 1;
// 2) not grouped;
// 3) first dim of weights is not 1.
bool isPossibleJitPlanar = true;
if (isGrouped || weightDims[0] != 1)
isPossibleJitPlanar = false;
for (int i = 0; i < stride.size(); i++)
if (stride[i] != 1)
isPossibleJitPlanar = false;
std::shared_ptr<mkldnn::convolution_forward::desc> convDesc(desc);
auto srcMemDesc = MKLDNNExtensionUtils::makeDescriptor(convDesc->data.src_desc);
auto dstMemDesc = MKLDNNExtensionUtils::makeDescriptor(convDesc->data.dst_desc);
auto srcDataType = convDesc->data.src_desc.data_type;
auto dstDataType = convDesc->data.dst_desc.data_type;
bool isPlanarFloatConv = srcMemDesc->hasLayoutType(LayoutType::ncsp)
&& dstMemDesc->hasLayoutType(LayoutType::ncsp)
&& srcDataType == memory::data_type::f32
&& dstDataType == memory::data_type::f32;
return !isPossibleJitPlanar && isPlanarFloatConv;
}
std::shared_ptr<MemoryDesc> MKLDNNConvolutionNode::getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) {
auto desc = idx > 0 ? primitive_desc_it.weights_desc(idx - 1) : primitive_desc_it.src_desc(idx);
return MKLDNNExtensionUtils::makeDescriptor(desc);
}
bool MKLDNNConvolutionNode::canFuse(const MKLDNNNodePtr& node) const {
return canFuseSimpleOperation(node);
}
const mkldnn::memory& MKLDNNConvolutionNode::getWeights() const {
return getParentEdgeAt(1)->getMemory().GetPrimitive();
}
const mkldnn::memory& MKLDNNConvolutionNode::getBias() const {
return getParentEdgeAt(2)->getMemory().GetPrimitive();
}
InferenceEngine::Precision MKLDNNConvolutionNode::getRuntimePrecision() const {
std::vector<InferenceEngine::Precision> inputPrecisions;
// Don't take bias precision into account
size_t inputsNumLimit = 2;
for (size_t i = 0; i < std::min(getParentEdges().size(), inputsNumLimit); i++) {
auto parentEdge = getParentEdgeAt(i);
if (parentEdge && parentEdge->getStatus() == MKLDNNEdge::Status::Validated) {
inputPrecisions.emplace_back(MKLDNNExtensionUtils::DataTypeToIEPrecision((parentEdge->getMemoryPtr()->GetDataType())));
}
}
return getMaxPrecision(inputPrecisions);
}
bool MKLDNNConvolutionNode::isNspcAvailable() const {
using impl::cpu::x64::mayiuse;
// do not use in non-quantized networks until it is enforced externally
if (!isInQuantizedGraph) {
auto predicate = [](memory::format_tag tag) {
return one_of(tag, memory::format_tag::nwc, memory::format_tag::nhwc, memory::format_tag::ndhwc);
};
if (std::none_of(inputMemoryFormatsFilter.begin(), inputMemoryFormatsFilter.end(), predicate)) {
return false;
}
}
// A bunch of heuristics are designed to cut off not optimal nspc convolution applications
auto inpDims = getInputShapeAtPort(0).getStaticDims();
auto outDims = getOutputShapeAtPort(0).getStaticDims();
auto ndims = inpDims.size();
if (isDepthWise()) {
// 1d equivalent cases are painfully slow
if (1 == inpDims[inpDims.size() - 2]) {
return false;
}
} else {
// it was empirically observed that the nspc convolutions perform much slower than the blocked ones if the channels number more than the specific value
size_t spatialRank = ndims - 2; //two means batch dim plus channels dim
bool is1x1 = false;
if (!isGrouped) {
auto weightDimsReversItr = weightDims.crbegin();
auto inpDimsReversItr = inpDims.crbegin();
auto outDimsReversItr = outDims.crbegin();
auto paddingLreversItr = paddingL.crbegin();
auto paddingRreversItr = paddingR.crbegin();
for (size_t i = 0; i < spatialRank; ++i) {
is1x1 = true
&& *(weightDimsReversItr++) == 1
&& *(inpDimsReversItr++) == *(outDimsReversItr++)
&& *(paddingLreversItr++) == 0
&& *(paddingRreversItr++) == 0;
}
}
// if the activation field size is 1x1 the avx512 1x1 nspc convolution pollutes caches so that the layer after the convolution performs slow
if (mayiuse(impl::cpu::x64::avx512_common) && is1x1) {
auto end = inpDims.rbegin();
std::advance(end, spatialRank);
if (std::all_of(inpDims.rbegin(), end, [](size_t x) { return 1 == x; })) {
return false;
}
}
unsigned thresholdNumChannels = 128u; // for avx and below
if (is1x1) {
thresholdNumChannels = 2048u;
} else if (mayiuse(impl::cpu::x64::avx512_common)) {
thresholdNumChannels = 512u;
}
size_t OC = outDims[1];
if (std::max(IC, OC) >= thresholdNumChannels) {
return false;
}
if (!mayiuse(impl::cpu::x64::avx)) {
// SSE41 nspc convolutions do not support ic and oc tails yet and the blocked implementation will be much better than gemm
if ((IC % 8) || (OC % 8)) {
return false;
}
}
}
return true;
}
InferenceEngine::Blob::Ptr MKLDNNConvolutionNode::createInternalBlob(InferenceEngine::SizeVector dims, size_t edgeNum, bool isGrouped) {
const auto constNode = std::dynamic_pointer_cast<MKLDNNInputNode>(getParentEdgeAt(edgeNum)->getParent());
if (!constNode) {
IE_THROW() << "Cannot cast " << edgeNum << " input to Input node for " << getName() << ".";
}
auto blb = constNode->getMemoryPtr();
if (blb == nullptr)
IE_THROW() << "Cannot get const blob for node " << getName() << ".";
auto const elementsCount = blb->GetDescWithType<BlockedMemoryDesc>()->getPaddedElementsCount();
InferenceEngine::TensorDesc desc(InferenceEngine::Precision::FP32, dims, getWeightsLayoutByDims(dims, isGrouped));
Blob::Ptr internalBlob = InferenceEngine::make_shared_blob<float>(desc);
internalBlob->allocate();
if (internalBlob->size() != elementsCount) {
IE_THROW() << "Created internal blob and const blob has different size for node: " << getName() << ".";
}
cpu_convert(blb->GetPtr(),
internalBlob->buffer(),
MKLDNNExtensionUtils::DataTypeToIEPrecision(blb->GetDataType()),
internalBlob->getTensorDesc().getPrecision(),
elementsCount);
return internalBlob;
}
REG_MKLDNN_PRIM_FOR(MKLDNNConvolutionNode, Convolution);