From 2c06c363662365ab969edcfbaa89c499a4571cd0 Mon Sep 17 00:00:00 2001
From: Luo Cheng
Date: Thu, 11 Aug 2022 13:19:16 +0800
Subject: [PATCH] [CPU] Default enable brgconv AVX512 (#12406)

---
 src/plugins/intel_cpu/src/nodes/conv.cpp | 60 +++++++++++++++---------
 src/plugins/intel_cpu/src/nodes/conv.h   |  4 +-
 2 files changed, 42 insertions(+), 22 deletions(-)

diff --git a/src/plugins/intel_cpu/src/nodes/conv.cpp b/src/plugins/intel_cpu/src/nodes/conv.cpp
index 7551817a879..850160a839e 100644
--- a/src/plugins/intel_cpu/src/nodes/conv.cpp
+++ b/src/plugins/intel_cpu/src/nodes/conv.cpp
@@ -279,11 +279,6 @@ Convolution::Convolution(const std::shared_ptr<ngraph::Node>& op, const dnnl::en
         paddingR = groupConvolutionOp->get_pads_end();
         autoPadding = one_of(groupConvolutionOp->get_auto_pad(), ov::op::PadType::SAME_UPPER, ov::op::PadType::SAME_LOWER);
     }
-
-    // Due to performance issue, brgconv will only be enabled by default:
-    // 1, support amx
-    // 2, static shape(dynamic shape may change weights layout if the input shape changes and cause performance issue: 86948)
-    shouldTryBrgconv = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx) && !isDynamicNode();
 }
 
 bool Convolution::canBeExecutedInInt8() const {
@@ -375,6 +370,8 @@ void Convolution::getSupportedDescriptors() {
 
     withBiases = getOriginalInputsNumber() == 3;
 
+    initTryBrgconvFlag();
+
     if (!implPriorities.empty()) {
         isPrimitivesPriorityDefined = true;
         // winograd support only constant weights and bias
@@ -383,7 +380,7 @@ void Convolution::getSupportedDescriptors() {
                 getParentEdgeAt(1)->getParent()->isConstant() && getParentEdgeAt(1)->getParent()->getType() == Type::Input &&
                 (withBiases ? (getParentEdgeAt(2)->getParent()->isConstant() && getParentEdgeAt(2)->getParent()->getType() == Type::Input) : true);
 
-        // AVX512 brconv is disabled by default due to performance issues. User can force it via Primitives priority mechanism.
+        // AVX512 brgconv may be disabled by heuristics due to performance issues. The user can force it via the Primitives priority mechanism.
         if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) {
             std::for_each(implPriorities.begin(), implPriorities.end(), [&](const impl_desc_type& desc_type) {
                 if (desc_type & impl_desc_type::brgconv_avx512) {
@@ -715,13 +712,12 @@ void Convolution::initSupportedPrimitiveDescriptors() {
     if (!supportedPrimitiveDescriptors.empty())
         return;
 
-    // attr[0] - depthwise, quantize
-    // attr[1] - binary
-    dnnl::primitive_attr attrs[2];
+    pInitAttrs[0] = std::make_shared<dnnl::primitive_attr>();
     auto attrsNum = shouldTryBrgconv ? 2 : 1;
-    setPostOps(attrs[0], MemoryDescUtils::makeDummyShape(getOutputShapeAtPort(0)).getStaticDims(), true);
-    if (shouldTryBrgconv) {
-        setPostOps(attrs[1], MemoryDescUtils::makeDummyShape(getOutputShapeAtPort(0)).getStaticDims(), false);
+    setPostOps(*pInitAttrs[0], MemoryDescUtils::makeDummyShape(getOutputShapeAtPort(0)).getStaticDims(), true);
+    if (shouldTryBrgconv && !pInitAttrs[1]) {
+        pInitAttrs[1] = std::make_shared<dnnl::primitive_attr>();
+        setPostOps(*pInitAttrs[1], MemoryDescUtils::makeDummyShape(getOutputShapeAtPort(0)).getStaticDims(), false);
     }
 
     bool containJitImpl = false;
@@ -730,7 +726,7 @@ void Convolution::initSupportedPrimitiveDescriptors() {
         if (containJitImpl && isPossibleToSkipInitConfig(desc))
             continue;
         for (int i = 0; i < attrsNum; i++) {
-            auto &attr = attrs[i];
+            auto &attr = *pInitAttrs[i];
             addZeroPoints(attr);
             auto itpd = desc.createPrimitiveDescriptorIterator(getEngine(), attr);
             while (static_cast<bool>(itpd)) {
@@ -942,14 +938,7 @@ void Convolution::initDescriptor(const NodeConfig& config) {
     if (isStridedBlobsSupported) {
         createDescriptor({config.inConfs[0].getMemDesc()}, {config.outConfs[0].getMemDesc()});
    }
-    // attr[0] - depthwise, quantize
-    // attr[1] - binary
-    dnnl::primitive_attr attrs[2];
     auto attrsNum = shouldTryBrgconv ? 2 : 1;
-    setPostOps(attrs[0], MemoryDescUtils::makeDummyShape(getOutputShapeAtPort(0)).getStaticDims(), true);
-    if (shouldTryBrgconv) {
-        setPostOps(attrs[1], MemoryDescUtils::makeDummyShape(getOutputShapeAtPort(0)).getStaticDims(), false);
-    }
 
     auto rightConfig = selectedPD->getConfig();
     size_t selected_count = 0;
@@ -961,7 +950,7 @@ void Convolution::initDescriptor(const NodeConfig& config) {
         if (containJitImpl && isPossibleToSkipInitConfig(desc))
             continue;
         for (int n = 0; n < attrsNum; n++) {
-            auto &attr = attrs[n];
+            auto &attr = *pInitAttrs[n];
             addZeroPoints(attr);
             auto itpd = desc.createPrimitiveDescriptorIterator(getEngine(), attr);
             while (static_cast<bool>(itpd)) {
@@ -1554,6 +1543,35 @@ void Convolution::appendZeroPointsArgs() {
     }
 }
 
+void Convolution::initTryBrgconvFlag() {
+    // Due to performance issues, brgconv is only enabled by default for:
+    // 1, static shapes (a dynamic shape may change the weights layout when the input shape changes and cause a performance issue: 86948)
+    // 2, AMX support
+    // 3, int8 without binary postops on AVX512
+    if (!isDynamicNode()) {
+        if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) {
+            shouldTryBrgconv = true;
+        } else if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) {
+            // should be removed once the binary postops performance issue is resolved
+            // heuristic: if it is an int8 model with binary post ops, we will not use brgconv
+            if (canBeExecutedInInt8()) {
+                shouldTryBrgconv = true;
+                dnnl::primitive_attr attrs;
+                setPostOps(attrs, MemoryDescUtils::makeDummyShape(getOutputShapeAtPort(0)).getStaticDims(), false);
+                const auto& ops = attrs.get_post_ops();
+                for (int i = 0; i < ops.len(); i++) {
+                    if (ops.kind(i) == dnnl::primitive::kind::binary) {
+                        shouldTryBrgconv = false;
+                        break;
+                    }
+                }
+                if (shouldTryBrgconv)
+                    pInitAttrs[1] = std::make_shared<dnnl::primitive_attr>(std::move(attrs));
+            }
+        }
+    }
+}
+
 } // namespace node
 } // namespace intel_cpu
 } // namespace ov
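
The heuristic added in initTryBrgconvFlag() above boils down to one question: does the fused post-op chain contain a binary entry? Below is a minimal standalone sketch of that scan, written against the public oneDNN 2.x C++ API (dnnl::post_ops, dnnl::primitive_attr) rather than the internal dnnl::impl::cpu::x64 helpers the plugin calls; the postOpsAllowBrgconv name and the sample relu + binary-add chain are illustrative only, not part of this patch:

    #include <dnnl.hpp>
    #include <iostream>

    // Mirrors the loop in initTryBrgconvFlag(): brgconv stays a candidate
    // only while the fused post-op chain contains no binary entry.
    static bool postOpsAllowBrgconv(const dnnl::primitive_attr& attr) {
        const auto ops = attr.get_post_ops();
        for (int i = 0; i < ops.len(); i++) {
            if (ops.kind(i) == dnnl::primitive::kind::binary)
                return false;  // binary post-op found: do not try brgconv
        }
        return true;
    }

    int main() {
        // Illustrative chain: eltwise relu followed by a fused binary add.
        // append_eltwise uses the oneDNN 2.x signature (v3.x drops the scale).
        dnnl::post_ops po;
        po.append_eltwise(1.0f, dnnl::algorithm::eltwise_relu, 0.0f, 0.0f);
        dnnl::memory::desc src1({1, 16, 8, 8}, dnnl::memory::data_type::f32,
                                dnnl::memory::format_tag::nchw);
        po.append_binary(dnnl::algorithm::binary_add, src1);

        dnnl::primitive_attr attr;
        attr.set_post_ops(po);
        std::cout << postOpsAllowBrgconv(attr) << std::endl;  // prints 0
        return 0;
    }

When the scan passes, the patch keeps the already-built attrs in pInitAttrs[1], so initSupportedPrimitiveDescriptors() does not rebuild them.
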
diff --git a/src/plugins/intel_cpu/src/nodes/conv.h b/src/plugins/intel_cpu/src/nodes/conv.h
index 87b9c3e603e..8eca1b3969e 100644
--- a/src/plugins/intel_cpu/src/nodes/conv.h
+++ b/src/plugins/intel_cpu/src/nodes/conv.h
@@ -102,6 +102,7 @@ private:
     MemoryPtr getOutputMemory() const;
 
     void appendZeroPointsArgs();
+    void initTryBrgconvFlag();
 
     bool withBiases;
     bool withSum;
@@ -136,8 +137,9 @@ private:
     const size_t Y_AXIS = 1;
 
     bool isWino = false;
-    // if we have amx support and shape is static or user specified we will try brgconv
     bool shouldTryBrgconv = false;
+    // cache attr for later usage. [0] - depthwise, quantize, [1] - binary
+    AttrPtr pInitAttrs[2];
     AttrPtr pAttr;
     bool autoPadding = false;
     FusedSubgraphPtr subgraph;
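
For reference, the ISA tiers that drive shouldTryBrgconv can also be expressed through the public dnnl::get_effective_cpu_isa() query. This is a sketch under the assumption that the effective-ISA report agrees with the internal mayiuse() checks used in the patch (enum names per oneDNN 2.x); the BrgconvPolicy enum is invented here for illustration:

    #include <dnnl.hpp>
    #include <iostream>

    enum class BrgconvPolicy { Always, Int8WithoutBinary, Never };

    // Tiers from initTryBrgconvFlag(): AMX machines always try brgconv;
    // plain AVX512 cores try it only for int8 models without binary
    // post-ops; everything below AVX512 keeps the previous behavior.
    static BrgconvPolicy brgconvPolicy() {
        switch (dnnl::get_effective_cpu_isa()) {
        case dnnl::cpu_isa::avx512_core_amx:
            return BrgconvPolicy::Always;
        case dnnl::cpu_isa::avx512_core:
        case dnnl::cpu_isa::avx512_core_vnni:
        case dnnl::cpu_isa::avx512_core_bf16:
            return BrgconvPolicy::Int8WithoutBinary;
        default:
            return BrgconvPolicy::Never;
        }
    }

    int main() {
        std::cout << static_cast<int>(brgconvPolicy()) << std::endl;
        return 0;
    }

Note that the static-shape requirement still applies on top of the ISA tier: dynamic shapes skip brgconv entirely because a changing input shape may force a weights-layout change (issue 86948 referenced in the patch).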