From 2c06c363662365ab969edcfbaa89c499a4571cd0 Mon Sep 17 00:00:00 2001
From: Luo Cheng
Date: Thu, 11 Aug 2022 13:19:16 +0800
Subject: [PATCH] [CPU] Default enable brgconv AVX512 (#12406)

---
 src/plugins/intel_cpu/src/nodes/conv.cpp | 60 +++++++++++++++---------
 src/plugins/intel_cpu/src/nodes/conv.h   |  4 +-
 2 files changed, 42 insertions(+), 22 deletions(-)

diff --git a/src/plugins/intel_cpu/src/nodes/conv.cpp b/src/plugins/intel_cpu/src/nodes/conv.cpp
index 7551817a879..850160a839e 100644
--- a/src/plugins/intel_cpu/src/nodes/conv.cpp
+++ b/src/plugins/intel_cpu/src/nodes/conv.cpp
@@ -279,11 +279,6 @@ Convolution::Convolution(const std::shared_ptr<ngraph::Node>& op, const dnnl::en
         paddingR = groupConvolutionOp->get_pads_end();
         autoPadding = one_of(groupConvolutionOp->get_auto_pad(), ov::op::PadType::SAME_UPPER, ov::op::PadType::SAME_LOWER);
     }
-
-    // Due to performance issue, brgconv will only be enabled by default:
-    // 1, support amx
-    // 2, static shape(dynamic shape may change weights layout if the input shape changes and cause performance issue: 86948)
-    shouldTryBrgconv = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx) && !isDynamicNode();
 }
 
 bool Convolution::canBeExecutedInInt8() const {
@@ -375,6 +370,8 @@ void Convolution::getSupportedDescriptors() {
 
     withBiases = getOriginalInputsNumber() == 3;
 
+    initTryBrgconvFlag();
+
     if (!implPriorities.empty()) {
         isPrimitivesPriorityDefined = true;
         // winograd support only constant weights and bias
@@ -383,7 +380,7 @@ void Convolution::getSupportedDescriptors() {
                 getParentEdgeAt(1)->getParent()->isConstant() && getParentEdgeAt(1)->getParent()->getType() == Type::Input &&
                 (withBiases ? (getParentEdgeAt(2)->getParent()->isConstant() && getParentEdgeAt(2)->getParent()->getType() == Type::Input) : true);
 
-        // AVX512 brconv is disabled by default due to performance issues. User can force it via Primitives priority mechanism.
+        // AVX512 brgconv may be disabled by heuristics due to performance issues. The user can force it via the Primitives priority mechanism.
         if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) {
             std::for_each(implPriorities.begin(), implPriorities.end(), [&](const impl_desc_type& desc_type) {
                 if (desc_type & impl_desc_type::brgconv_avx512) {
@@ -715,13 +712,12 @@ void Convolution::initSupportedPrimitiveDescriptors() {
     if (!supportedPrimitiveDescriptors.empty())
         return;
 
-    // attr[0] - depthwise, quantize
-    // attr[1] - binary
-    dnnl::primitive_attr attrs[2];
+    pInitAttrs[0] = std::make_shared<dnnl::primitive_attr>();
     auto attrsNum = shouldTryBrgconv ? 2 : 1;
-    setPostOps(attrs[0], MemoryDescUtils::makeDummyShape(getOutputShapeAtPort(0)).getStaticDims(), true);
-    if (shouldTryBrgconv) {
-        setPostOps(attrs[1], MemoryDescUtils::makeDummyShape(getOutputShapeAtPort(0)).getStaticDims(), false);
+    setPostOps(*pInitAttrs[0], MemoryDescUtils::makeDummyShape(getOutputShapeAtPort(0)).getStaticDims(), true);
+    if (shouldTryBrgconv && !pInitAttrs[1]) {
+        pInitAttrs[1] = std::make_shared<dnnl::primitive_attr>();
+        setPostOps(*pInitAttrs[1], MemoryDescUtils::makeDummyShape(getOutputShapeAtPort(0)).getStaticDims(), false);
     }
 
     bool containJitImpl = false;
@@ -730,7 +726,7 @@ void Convolution::initSupportedPrimitiveDescriptors() {
         if (containJitImpl && isPossibleToSkipInitConfig(desc))
             continue;
         for (int i = 0; i < attrsNum; i++) {
-            auto &attr = attrs[i];
+            auto &attr = *pInitAttrs[i];
             addZeroPoints(attr);
             auto itpd = desc.createPrimitiveDescriptorIterator(getEngine(), attr);
             while (static_cast<bool>(itpd)) {
@@ -942,14 +938,7 @@ void Convolution::initDescriptor(const NodeConfig& config) {
     if (isStridedBlobsSupported) {
         createDescriptor({config.inConfs[0].getMemDesc()}, {config.outConfs[0].getMemDesc()});
    }
-    // attr[0] - depthwise, quantize
-    // attr[1] - binary
-    dnnl::primitive_attr attrs[2];
     auto attrsNum = shouldTryBrgconv ? 2 : 1;
-    setPostOps(attrs[0], MemoryDescUtils::makeDummyShape(getOutputShapeAtPort(0)).getStaticDims(), true);
-    if (shouldTryBrgconv) {
-        setPostOps(attrs[1], MemoryDescUtils::makeDummyShape(getOutputShapeAtPort(0)).getStaticDims(), false);
-    }
 
     auto rightConfig = selectedPD->getConfig();
     size_t selected_count = 0;
@@ -961,7 +950,7 @@ void Convolution::initDescriptor(const NodeConfig& config) {
         if (containJitImpl && isPossibleToSkipInitConfig(desc))
             continue;
         for (int n = 0; n < attrsNum; n++) {
-            auto &attr = attrs[n];
+            auto &attr = *pInitAttrs[n];
             addZeroPoints(attr);
             auto itpd = desc.createPrimitiveDescriptorIterator(getEngine(), attr);
             while (static_cast<bool>(itpd)) {
@@ -1554,6 +1543,35 @@ void Convolution::appendZeroPointsArgs() {
     }
 }
 
+void Convolution::initTryBrgconvFlag() {
+    // Due to performance issues, brgconv is only enabled by default for:
+    // 1, static shapes (a dynamic shape may change the weights layout when the input shape changes and cause a performance issue: 86948)
+    // 2, AMX support
+    // 3, int8 without binary postops on AVX512
+    if (!isDynamicNode()) {
+        if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) {
+            shouldTryBrgconv = true;
+        } else if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) {
+            // should be removed once the binary postops performance issue is resolved
+            // heuristic: if it is an int8 model with binary post ops, we will not use brgconv
+            if (canBeExecutedInInt8()) {
+                shouldTryBrgconv = true;
+                dnnl::primitive_attr attrs;
+                setPostOps(attrs, MemoryDescUtils::makeDummyShape(getOutputShapeAtPort(0)).getStaticDims(), false);
+                const auto& ops = attrs.get_post_ops();
+                for (int i = 0; i < ops.len(); i++) {
+                    if (ops.kind(i) == dnnl::primitive::kind::binary) {
+                        shouldTryBrgconv = false;
+                        break;
+                    }
+                }
+                if (shouldTryBrgconv)
+                    pInitAttrs[1] = std::make_shared<dnnl::primitive_attr>(std::move(attrs));
+            }
+        }
+    }
+}
+
 } // namespace node
 } // namespace intel_cpu
 } // namespace ov
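
The heuristic added in initTryBrgconvFlag() above boils down to one question: does the fused post-op chain contain a binary entry? Below is a minimal standalone sketch of that scan, written against the public oneDNN 2.x C++ API (dnnl::post_ops, dnnl::primitive_attr) rather than the internal dnnl::impl::cpu::x64 helpers the plugin calls; the postOpsAllowBrgconv name and the sample relu + binary-add chain are illustrative only, not part of this patch:

    #include <dnnl.hpp>
    #include <iostream>

    // Mirrors the loop in initTryBrgconvFlag(): brgconv stays a candidate
    // only while the fused post-op chain contains no binary entry.
    static bool postOpsAllowBrgconv(const dnnl::primitive_attr& attr) {
        const auto ops = attr.get_post_ops();
        for (int i = 0; i < ops.len(); i++) {
            if (ops.kind(i) == dnnl::primitive::kind::binary)
                return false;  // binary post-op found: do not try brgconv
        }
        return true;
    }

    int main() {
        // Illustrative chain: eltwise relu followed by a fused binary add.
        // append_eltwise uses the oneDNN 2.x signature (v3.x drops the scale).
        dnnl::post_ops po;
        po.append_eltwise(1.0f, dnnl::algorithm::eltwise_relu, 0.0f, 0.0f);
        dnnl::memory::desc src1({1, 16, 8, 8}, dnnl::memory::data_type::f32,
                                dnnl::memory::format_tag::nchw);
        po.append_binary(dnnl::algorithm::binary_add, src1);

        dnnl::primitive_attr attr;
        attr.set_post_ops(po);
        std::cout << postOpsAllowBrgconv(attr) << std::endl;  // prints 0
        return 0;
    }

When the scan passes, the patch keeps the already-built attrs in pInitAttrs[1], so initSupportedPrimitiveDescriptors() does not rebuild them.
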
diff --git a/src/plugins/intel_cpu/src/nodes/conv.h b/src/plugins/intel_cpu/src/nodes/conv.h
index 87b9c3e603e..8eca1b3969e 100644
--- a/src/plugins/intel_cpu/src/nodes/conv.h
+++ b/src/plugins/intel_cpu/src/nodes/conv.h
@@ -102,6 +102,7 @@ private:
     MemoryPtr getOutputMemory() const;
 
     void appendZeroPointsArgs();
+    void initTryBrgconvFlag();
 
     bool withBiases;
     bool withSum;
@@ -136,8 +137,9 @@ private:
     const size_t Y_AXIS = 1;
 
     bool isWino = false;
-    // if we have amx support and shape is static or user specified we will try brgconv
     bool shouldTryBrgconv = false;
+    // cache attr for later usage. [0] - depthwise, quantize, [1] - binary
+    AttrPtr pInitAttrs[2];
     AttrPtr pAttr;
     bool autoPadding = false;
     FusedSubgraphPtr subgraph;
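
For reference, the ISA tiers that drive shouldTryBrgconv can also be expressed through the public dnnl::get_effective_cpu_isa() query. This is a sketch under the assumption that the effective-ISA report agrees with the internal mayiuse() checks used in the patch (enum names per oneDNN 2.x); the BrgconvPolicy enum is invented here for illustration:

    #include <dnnl.hpp>
    #include <iostream>

    enum class BrgconvPolicy { Always, Int8WithoutBinary, Never };

    // Tiers from initTryBrgconvFlag(): AMX machines always try brgconv;
    // plain AVX512 cores try it only for int8 models without binary
    // post-ops; everything below AVX512 keeps the previous behavior.
    static BrgconvPolicy brgconvPolicy() {
        switch (dnnl::get_effective_cpu_isa()) {
        case dnnl::cpu_isa::avx512_core_amx:
            return BrgconvPolicy::Always;
        case dnnl::cpu_isa::avx512_core:
        case dnnl::cpu_isa::avx512_core_vnni:
        case dnnl::cpu_isa::avx512_core_bf16:
            return BrgconvPolicy::Int8WithoutBinary;
        default:
            return BrgconvPolicy::Never;
        }
    }

    int main() {
        std::cout << static_cast<int>(brgconvPolicy()) << std::endl;
        return 0;
    }

Note that the static-shape requirement still applies on top of the ISA tier: dynamic shapes skip brgconv entirely because a changing input shape may force a weights-layout change (issue 86948 referenced in the patch).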