[CPU] Use brgconv1x1 instead of inner product in some cases (#13715)

Luo Cheng 2022-11-21 19:00:49 +08:00 committed by GitHub
parent 47e80200dd
commit cf7b174bf9
15 changed files with 525 additions and 118 deletions


@@ -156,5 +156,13 @@ DnnlMemoryDescPtr DnnlExtensionUtils::query_md(const const_dnnl_primitive_desc_t
return DnnlExtensionUtils::makeDescriptor(*cdesc);
}
std::string DnnlExtensionUtils::query_impl_info_str(const const_dnnl_primitive_desc_t& pd) {
const char *res;
dnnl_status_t status = dnnl_primitive_desc_query(pd, dnnl_query_impl_info_str, 0, &res);
if (status != dnnl_success)
IE_THROW() << "query_impl_info_str failed.";
return std::string(res);
}
} // namespace intel_cpu
} // namespace ov
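A minimal usage sketch of the new helper (a hypothetical call site, not code from the patch; `executor` stands for any DnnlExecutor from this commit, and `parse_impl_name` is the existing mapper used later in fullyconnected.cpp):

// Hypothetical call site: resolve which oneDNN kernel a primitive descriptor chose.
const_dnnl_primitive_desc_t pd = executor->getPrimitiveDesc();
const std::string implName = DnnlExtensionUtils::query_impl_info_str(pd); // e.g. "brgconv_avx512_1x1"
const impl_desc_type implType = parse_impl_name(implName);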


@@ -49,6 +49,7 @@ public:
static size_t getMemSizeForDnnlDesc(const dnnl::memory::desc& desc);
static std::shared_ptr<DnnlMemoryDesc> query_md(const const_dnnl_primitive_desc_t& pd, const dnnl::query& what, int idx = 0);
static std::string query_impl_info_str(const const_dnnl_primitive_desc_t& pd);
};
} // namespace intel_cpu


@@ -50,6 +50,8 @@
#include <transformations/utils/utils.hpp>
#include <low_precision/low_precision.hpp>
#include "memory_desc/dnnl_blocked_memory_desc.h"
#include <common/primitive_desc.hpp>
#include <common/primitive_desc_iface.hpp>
using namespace dnnl;
using namespace InferenceEngine;
@@ -843,6 +845,13 @@ void Graph::CreatePrimitives() {
OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, node->profiling.createPrimitive);
DEBUG_LOG(*node);
node->createPrimitive();
#ifdef CPU_DEBUG_CAPS
if (node->prim) {
auto pd_c = (*node->prim).get_primitive_desc();
auto* pd = reinterpret_cast<const dnnl_primitive_desc*>(pd_c);
DEBUG_LOG("verbose##", node->getName(), "##", pd->info(), "\n");
}
#endif
}
}


@@ -60,6 +60,8 @@
#include "nodes/common/cpu_convert.h"
#include "memory_desc/cpu_memory_desc_utils.h"
#include "memory_desc/dnnl_blocked_memory_desc.h"
#include <common/primitive_desc.hpp>
#include <common/primitive_desc_iface.hpp>
using namespace dnnl;
using namespace openvino;
@@ -528,6 +530,13 @@ void Node::executeDynamic(dnnl::stream strm) {
DEBUG_LOG(" prepareParams() on #", getExecIndex(), " ", getTypeStr(), " ", algToString(getAlgorithm()),
" ", getName(), " ", getOriginalLayers());
prepareParams();
#ifdef CPU_DEBUG_CAPS
if (prim) {
auto pd_c = (*prim).get_primitive_desc();
auto* pd = reinterpret_cast<const dnnl_primitive_desc*>(pd_c);
DEBUG_LOG("verbose##", getName(), "##", pd->info(), "\n");
}
#endif
}
executeDynamicImpl(strm);
}


@@ -54,5 +54,35 @@ Primitive DnnlExecutor::getExecPrim() const {
return execPrim;
}
const_dnnl_primitive_desc_t DnnlExecutor::getPrimitiveDesc() const {
return (*execPrim).get_primitive_desc();
}
dnnl::memory::desc DnnlExecutor::getSrcDesc() const {
auto pd = getPrimitiveDesc();
auto md = DnnlExtensionUtils::query_md(pd, dnnl::query::src_md);
return md->getDnnlDesc();
}
dnnl::memory::desc DnnlExecutor::getWeightDesc() const {
auto pd = getPrimitiveDesc();
auto md = DnnlExtensionUtils::query_md(pd, dnnl::query::weights_md);
return md->getDnnlDesc();
}
dnnl::memory::desc DnnlExecutor::getDstDesc() const {
auto pd = getPrimitiveDesc();
auto md = DnnlExtensionUtils::query_md(pd, dnnl::query::dst_md);
return md->getDnnlDesc();
}
impl_desc_type DnnlExecutor::getImplementationType() const {
auto pd = getPrimitiveDesc();
return parse_impl_name(DnnlExtensionUtils::query_impl_info_str(pd));
}
} // namespace intel_cpu
} // namespace ov


@@ -6,6 +6,7 @@
#include <cpu_memory.h>
#include <primitive.h>
#include <onednn/iml_type_mapper.h>
namespace ov {
namespace intel_cpu {
@@ -30,6 +31,11 @@ class DnnlExecutor {
bool needReordering() const;
virtual ~DnnlExecutor() = default;
Primitive getExecPrim() const;
const_dnnl_primitive_desc_t getPrimitiveDesc() const;
dnnl::memory::desc getSrcDesc() const;
dnnl::memory::desc getWeightDesc() const;
dnnl::memory::desc getDstDesc() const;
impl_desc_type getImplementationType() const;
protected:
DnnlExecutor() = default;


@@ -24,6 +24,8 @@
#include "utils/cpu_utils.hpp"
#include <common/primitive_hashing_utils.hpp>
#include <cpu/cpu_primitive.hpp>
#include <common/primitive_desc.hpp>
#include <common/primitive_desc_iface.hpp>
using namespace dnnl;
using namespace InferenceEngine;
@@ -1445,9 +1447,14 @@ void Convolution::prepareParams() {
Node::appendPostOpArgs(*pAttrLocal, primArgs, convPostOpsArgs[preferLegacyPostOps]);
auto pd = (*(execPtr->getExecPrim())).get_primitive_desc();
auto pd = execPtr->getPrimitiveDesc();
auto scratchpadMem = getScratchPadMem(pd);
primArgs[DNNL_ARG_SCRATCHPAD] = scratchpadMem->GetPrimitive();
#ifdef CPU_DEBUG_CAPS
if (result.second == CacheEntryBase::LookUpStatus::Miss) {
DEBUG_LOG("verbose##", getName(), "##", pd->info(), "\n");
}
#endif
} else {
IE_THROW() << "Primitive descriptor was not found for node " << getName() << ".";
}


@@ -23,6 +23,8 @@
#include <ie_ngraph_utils.hpp>
#include "convolution_shape_inference.hpp"
#include <common/primitive_hashing_utils.hpp>
#include <common/primitive_desc.hpp>
#include <common/primitive_desc_iface.hpp>
using namespace dnnl;
using namespace InferenceEngine;
@@ -935,9 +937,14 @@ void Deconvolution::prepareParams() {
}
Node::appendPostOpArgs(*pAttrLocal, primArgs, postOpsArgs);
auto pd = (*(execPtr->getExecPrim())).get_primitive_desc();
auto pd = execPtr->getPrimitiveDesc();
auto scratchpadMem = getScratchPadMem(pd);
primArgs[DNNL_ARG_SCRATCHPAD] = scratchpadMem->GetPrimitive();
#ifdef CPU_DEBUG_CAPS
if (result.second == CacheEntryBase::LookUpStatus::Miss) {
DEBUG_LOG("verbose##", getName(), "##", pd->info(), "\n");
}
#endif
} else {
IE_THROW() << "Primitive descriptor was not found for node " << getName() << ".";
}


@@ -5,6 +5,8 @@
#include "fullyconnected.h"
#include "eltwise.h"
#include "fake_quantize.h"
#include "input.h"
#include "reorder.h"
#include "ngraph_transformations/op/fully_connected.hpp"
#include <ngraph/opsets/opset1.hpp>
#include <string>
@@ -12,10 +14,14 @@
#include <dnnl_extension_utils.h>
#include <onednn/dnnl.h>
#include "utils/general_utils.h"
#include "cpu/x64/cpu_isa_traits.hpp"
#include <memory_desc/cpu_memory_desc_utils.h>
#include "memory_desc/dnnl_blocked_memory_desc.h"
#include "utils/cpu_utils.hpp"
#include <common/primitive_hashing_utils.hpp>
#include <common/primitive_desc.hpp>
#include <common/primitive_desc_iface.hpp>
#include "onednn/dnnl.h"
using namespace dnnl;
using namespace InferenceEngine;
@@ -32,6 +38,7 @@ struct FCKey {
DnnlMemoryDescCPtr out;
dnnl::primitive_attr attr;
impl_desc_type implType;
bool useConv1x1;
size_t hash() const;
bool operator==(const FCKey& rhs) const;
@@ -51,6 +58,7 @@ size_t FCKey::hash() const {
seed = hash_combine(seed, get_attr_hash(*attr.get()));
seed = hash_combine(seed, implType);
seed = hash_combine(seed, useConv1x1);
return seed;
}
@@ -69,7 +77,7 @@ bool FCKey::operator==(const FCKey &rhs) const {
retVal = retVal && out && rhs.out && out->getDnnlDesc() == rhs.out->getDnnlDesc();
}
retVal = retVal && *attr.get() == *rhs.attr.get() &&
implType == rhs.implType;
implType == rhs.implType && useConv1x1 == rhs.useConv1x1;
return retVal;
}
@@ -205,14 +213,11 @@ void FullyConnected::getSupportedDescriptors() {
void FullyConnected::prepareParams() {
auto srcMemPtr = getParentEdgesAtPort(0)[0]->getMemoryPtr();
auto wghMemPtr = getParentEdgesAtPort(1)[0]->getMemoryPtr();
auto dstMemPtr = getChildEdgesAtPort(0)[0]->getMemoryPtr();
if (!dstMemPtr || !dstMemPtr->isAllocated())
IE_THROW() << "Destination memory hasn't been allocated.";
if (!srcMemPtr || !srcMemPtr->isAllocated())
IE_THROW() << "Input memory hasn't been allocated.";
if (!wghMemPtr || !wghMemPtr->isAllocated())
IE_THROW() << "Weight memory hasn't been allocated.";
MemoryPtr biasMemPtr = nullptr;
if (withBiases) {
biasMemPtr = getParentEdgesAtPort(2)[0]->getMemoryPtr();
@@ -220,7 +225,7 @@ void FullyConnected::prepareParams() {
IE_THROW() << "Input memory hasn't been allocated.";
}
const NodeDesc *selected_pd = getSelectedPrimitiveDescriptor();
NodeDesc *selected_pd = getSelectedPrimitiveDescriptor();
if (selected_pd == nullptr)
IE_THROW() << "Preferable primitive descriptor is not set for node " << getName() << ".";
@@ -228,7 +233,7 @@ void FullyConnected::prepareParams() {
setPostOps(*attr, dstMemPtr->getStaticDims());
(*attr).set_scratchpad_mode(dnnl::scratchpad_mode::user);
DnnlMemoryDescCPtr weightDesc = wghMemPtr->GetDescWithType<DnnlMemoryDesc>();
DnnlMemoryDescPtr weightDesc = MemoryDescUtils::convertToDnnlMemoryDesc(weightDescIP);
DnnlMemoryDescCPtr biasDesc = nullptr;
if (biasMemPtr) {
biasDesc = biasMemPtr->GetDescWithType<DnnlMemoryDesc>();
@@ -237,60 +242,88 @@ void FullyConnected::prepareParams() {
DnnlMemoryDescCPtr inDesc = srcMemPtr->GetDescWithType<DnnlMemoryDesc>();
DnnlMemoryDescCPtr outDesc = dstMemPtr->GetDescWithType<DnnlMemoryDesc>();
useConv1x1 = canBeExecutedInConv1x1();
FCKey key = {inDesc,
weightDesc,
biasDesc,
outDesc,
*attr,
selected_pd->getImplementationType()};
implementationTypeIP,
useConv1x1};
auto engine = getEngine();
auto builder = [&engine](const FCKey& key) -> std::shared_ptr<dnnl::primitive> {
auto inDesc = key.inp0->getDnnlDesc();
if (inDesc.dims().size() == 3) {
auto inDims = inDesc.dims();
auto normalizedInDims = {inDims[0] * inDims[1], inDims[2]};
inDesc = inDesc.reshape(normalizedInDims);
}
auto builder = [&engine](const FCKey& key) -> executorPtr {
executorPtr execPtr = nullptr;
if (key.useConv1x1) {
auto desc = createDescriptorInternalForConv(key.inp0, key.inp1, key.bias, key.out);
primitive_desc_iterator itpd = desc.createPrimitiveDescriptorIterator(engine, key.attr);
convolution_forward::primitive_desc prim_desc;
auto outDesc = key.out->getDnnlDesc();
if (outDesc.dims().size() == 3) {
auto outDims = outDesc.dims();
auto normalizedOutDims = { outDims[0] * outDims[1], outDims[2] };
outDesc = outDesc.reshape(normalizedOutDims);
}
while (static_cast<bool>(itpd)) {
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
std::shared_ptr<dnnl::inner_product_forward::desc> fcDsc;
if (key.bias) {
fcDsc = std::make_shared<dnnl::inner_product_forward::desc>(dnnl::prop_kind::forward_scoring,
inDesc,
key.inp1->getDnnlDesc(),
key.bias->getDnnlDesc(),
outDesc);
} else {
fcDsc = std::make_shared<dnnl::inner_product_forward::desc>(dnnl::prop_kind::forward_scoring,
inDesc,
key.inp1->getDnnlDesc(),
outDesc);
}
DnnlDesriptor desc(fcDsc);
primitive_desc_iterator itpd = desc.createPrimitiveDescriptorIterator(engine, key.attr);
inner_product_forward::primitive_desc prim_desc;
while (static_cast<bool>(itpd)) {
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
if (impl_type == key.implType) {
prim_desc = itpd.get();
break;
if (impl_type == brgconv_avx512_1x1) {
prim_desc = itpd.get();
break;
}
if (!itpd.next_impl()) {
break;
}
}
if (!itpd.next_impl()) {
return nullptr;
if (prim_desc) {
execPtr = std::make_shared<ExecutorConv1x1>(prim_desc);
}
}
// fallback to the inner product path
if (!execPtr) {
auto inDesc = key.inp0->getDnnlDesc();
if (inDesc.dims().size() == 3) {
auto inDims = inDesc.dims();
auto normalizedInDims = {inDims[0] * inDims[1], inDims[2]};
inDesc = inDesc.reshape(normalizedInDims);
}
return std::make_shared<inner_product_forward>(prim_desc);
auto outDesc = key.out->getDnnlDesc();
if (outDesc.dims().size() == 3) {
auto outDims = outDesc.dims();
auto normalizedOutDims = { outDims[0] * outDims[1], outDims[2] };
outDesc = outDesc.reshape(normalizedOutDims);
}
std::shared_ptr<dnnl::inner_product_forward::desc> fcDsc;
if (key.bias) {
fcDsc = std::make_shared<dnnl::inner_product_forward::desc>(dnnl::prop_kind::forward_scoring,
inDesc,
key.inp1->getDnnlDesc(),
key.bias->getDnnlDesc(),
outDesc);
} else {
fcDsc = std::make_shared<dnnl::inner_product_forward::desc>(dnnl::prop_kind::forward_scoring,
inDesc,
key.inp1->getDnnlDesc(),
outDesc);
}
DnnlDesriptor desc(fcDsc);
primitive_desc_iterator itpd = desc.createPrimitiveDescriptorIterator(engine, key.attr);
inner_product_forward::primitive_desc prim_desc;
while (static_cast<bool>(itpd)) {
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
if (impl_type == key.implType) {
prim_desc = itpd.get();
break;
}
if (!itpd.next_impl()) {
return nullptr;
}
}
execPtr = std::make_shared<ExecutorInnerProduct>(prim_desc);
}
return execPtr;
};
auto cache = getRuntimeCache();
@@ -300,41 +333,62 @@ void FullyConnected::prepareParams() {
IE_THROW() << "Primitive descriptor was not found for node " << getName() << ".";
}
prim = result.first;
auto prevExecPtr = execPtr;
execPtr = result.first;
primArgs[DNNL_ARG_SRC] = srcMemPtr->GetPrimitive();
primArgs[DNNL_ARG_WEIGHTS] = wghMemPtr->GetPrimitive();
primArgs[DNNL_ARG_DST] = dstMemPtr->GetPrimitive();
if (withBiases) {
primArgs[DNNL_ARG_BIAS] = biasMemPtr->GetPrimitive();
}
appendPostOpArgs(*attr, primArgs, postOpsArgs);
auto pd = (*prim).get_primitive_desc();
auto scratchpadMem = getScratchPadMem(pd);
primArgs[DNNL_ARG_SCRATCHPAD] = scratchpadMem->GetPrimitive();
auto reshapeMemory = [this](int argType) {
auto param = primArgs.find(argType);
if (param != primArgs.end()) {
auto oldMem = param->second;
auto dims = oldMem.get_desc().dims();
if (dims.size() == 3) {
std::vector<dnnl::memory::dim> normalizedDims({dims[0] * dims[1], dims[2]});
dnnl::memory::desc newMemDesc(oldMem.get_desc().reshape(normalizedDims));
dnnl::memory newMem(newMemDesc, oldMem.get_engine(), oldMem.get_data_handle());
primArgs.at(argType) = newMem;
if (execPtr) {
// no executor yet or shapes changed
if (!prevExecPtr || prevExecPtr->getSrcDesc() != execPtr->getSrcDesc()) {
auto oldMem = srcMemPtr->GetPrimitive();
// fast path: the wanted desc matches the parent node output; typical for static shapes with inner product
if (execPtr->getSrcDesc() == inDesc->getDnnlDesc()) {
primArgs[DNNL_ARG_SRC] = std::move(oldMem);
} else {
primArgs[DNNL_ARG_SRC] = dnnl::memory(execPtr->getSrcDesc(), oldMem.get_engine(), oldMem.get_data_handle());
}
}
};
reshapeMemory(DNNL_ARG_SRC);
reshapeMemory(DNNL_ARG_DST);
if (!prevExecPtr || prevExecPtr->getDstDesc() != execPtr->getDstDesc()) {
auto oldMem = dstMemPtr->GetPrimitive();
if (execPtr->getDstDesc() == outDesc->getDnnlDesc()) {
primArgs[DNNL_ARG_DST] = std::move(oldMem);
} else {
primArgs[DNNL_ARG_DST] = dnnl::memory(execPtr->getDstDesc(), oldMem.get_engine(), oldMem.get_data_handle());
}
}
if (!prevExecPtr || prevExecPtr->getWeightDesc() != execPtr->getWeightDesc()) {
primArgs[DNNL_ARG_WEIGHTS] = prepareWeightMemory(DnnlExtensionUtils::makeDescriptor(execPtr->getWeightDesc()))->GetPrimitive();
}
// changed shapes may also cause the kernel type to change
selected_pd->setImplementationType(execPtr->getImplementationType());
// the expected 1x1 conv may not have been created; update the flag based on the real implementation type
useConv1x1 = execPtr->getImplementationType() == brgconv_avx512_1x1;
if (withBiases) {
primArgs[DNNL_ARG_BIAS] = biasMemPtr->GetPrimitive();
}
appendPostOpArgs(*attr, primArgs, postOpsArgs);
auto pd = execPtr->getPrimitiveDesc();
auto scratchpadMem = getScratchPadMem(pd);
primArgs[DNNL_ARG_SCRATCHPAD] = scratchpadMem->GetPrimitive();
#ifdef CPU_DEBUG_CAPS
if (result.second == CacheEntryBase::LookUpStatus::Miss) {
DEBUG_LOG("verbose##", getName(), "##", pd->info(), "\n");
}
#endif
} else {
IE_THROW() << "Executor is not created for node " << getName() << ".";
}
}
void FullyConnected::setDynamicBatchLim(int lim) {
dynBatchLim = lim;
if (!execPtr) {
IE_THROW() << "Can't set dynamic batch for FullyConnected node with name: " << getName() << ", because executor is not compiled";
}
if (execPtr->needReordering()) {
IE_THROW() << "Can't execute FullyConnected node with dynamic batch via executor with reorders";
}
auto setBatchPrimArgs = [this](int argType, const dnnl::memory& oldMem) {
dnnl::memory::desc newMemDesc(oldMem.get_desc());
@@ -350,31 +404,38 @@ void FullyConnected::setDynamicBatchLim(int lim) {
primArgs.at(argType) = dnnl::memory(newMemDesc, oldMem.get_engine(), oldMem.get_data_handle());
};
setBatchPrimArgs(DNNL_ARG_SRC, getParentEdgesAtPort(0)[0]->getMemory().GetPrimitive());
setBatchPrimArgs(DNNL_ARG_DST, getChildEdgesAtPort(0)[0]->getMemory().GetPrimitive());
if (useConv1x1) {
Node::setDynamicBatchLim(lim);
} else {
dynBatchLim = lim;
setBatchPrimArgs(DNNL_ARG_SRC, getParentEdgesAtPort(0)[0]->getMemory().GetPrimitive());
setBatchPrimArgs(DNNL_ARG_DST, getChildEdgesAtPort(0)[0]->getMemory().GetPrimitive());
}
}
void FullyConnected::execute(dnnl::stream strm) {
if (prim) {
// in cases like parameter -> FullyConnected or dynamic shapes,
// we keep the old data pointer in primArgs on the second iteration with the same input shapes
auto updateMemoryPtr = [this](int argType) {
auto param = primArgs.find(argType);
if (param != primArgs.end()) {
if (argType == DNNL_ARG_SRC && getInputShapeAtPort(DATA_ID).getRank() == 3) {
primArgs.at(argType).set_data_handle(getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetData());
}
if (argType == DNNL_ARG_DST && getOutputShapeAtPort(0).getRank() == 3) {
primArgs.at(argType).set_data_handle(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetData());
}
}
};
updateMemoryPtr(DNNL_ARG_SRC);
updateMemoryPtr(DNNL_ARG_DST);
(*prim).execute(strm, primArgs);
if (!execPtr) {
IE_THROW() << "Can't execute FullyConnected node with name: " << getName() << ", because executor is not compiled";
}
// in cases like parameter -> FullyConnected or dynamic shapes,
// we keep the old data pointer in primArgs on the second iteration with the same input shapes
auto updateMemoryPtr = [this](int argType) {
auto param = primArgs.find(argType);
if (param != primArgs.end()) {
if (argType == DNNL_ARG_SRC && (getInputShapeAtPort(DATA_ID).getRank() == 3 || useConv1x1)) {
primArgs.at(argType).set_data_handle(getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetData());
}
if (argType == DNNL_ARG_DST && (getOutputShapeAtPort(0).getRank() == 3 || useConv1x1)) {
primArgs.at(argType).set_data_handle(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetData());
}
}
};
updateMemoryPtr(DNNL_ARG_SRC);
updateMemoryPtr(DNNL_ARG_DST);
execPtr->exec(primArgs, strm);
}
void FullyConnected::executeDynamicImpl(dnnl::stream strm) {
@@ -647,6 +708,166 @@ InferenceEngine::Precision FullyConnected::getRuntimePrecision() const {
return getMaxPrecision(inputPrecisions);
}
void FullyConnected::initOptimalPrimitiveDescriptor() {
Node::initOptimalPrimitiveDescriptor();
auto selectedPD = getSelectedPrimitiveDescriptor();
implementationTypeIP = selectedPD->getImplementationType();
// if convolution is selected, the reorder for inner product is useless; the reorder for inner product will be done in prepareParams
auto constParent = getParentEdgeAt(1)->getParent();
auto selectedParentPD = constParent->getSelectedPrimitiveDescriptor();
auto config = selectedPD->getConfig();
weightDescIP = config.inConfs[1].getMemDesc();
config.inConfs[1].setMemDesc(selectedParentPD->getConfig().outConfs[0].getMemDesc());
selectedPD->setConfig(config);
}
DnnlDesriptor FullyConnected::createDescriptorInternalForConv(DnnlMemoryDescCPtr inputDescPtr,
DnnlMemoryDescCPtr weightDescPtr,
DnnlMemoryDescCPtr biasDescPtr,
DnnlMemoryDescCPtr outputDescPtr) {
const dnnl::memory::desc &inputDesc = inputDescPtr->getDnnlDesc();
const dnnl::memory::desc &outputDesc = outputDescPtr->getDnnlDesc();
const dnnl::memory::desc &weightDesc = weightDescPtr->getDnnlDesc();
// make a fake shape: N, IC, W
auto inDims = inputDesc.dims();
dnnl::memory::dims normalizedInDims;
if (inDims.size() == 3) {
normalizedInDims = {inDims[0], inDims[2], inDims[1]};
} else if (inDims.size() == 2) {
normalizedInDims = {dnnl::memory::dim{1}, inDims[1], inDims[0]};
}
auto convInDesc = dnnl::memory::desc(normalizedInDims, inputDesc.data_type(), memory::format_tag::nwc);
// make a fake shape: N, OC, W
auto outDims = outputDesc.dims();
dnnl::memory::dims normalizedOutDims;
if (outDims.size() == 3) {
normalizedOutDims = { outDims[0], outDims[2], outDims[1]};
} else if (outDims.size() == 2) {
normalizedOutDims = { dnnl::memory::dim{1}, outDims[1], outDims[0]};
}
auto convOutDesc = dnnl::memory::desc(normalizedOutDims, outputDesc.data_type(), memory::format_tag::nwc);
// make a fake shape: OC, IC, 1
auto weightDims = weightDesc.dims();
dnnl::memory::dims normalizedWeightDims;
normalizedWeightDims = {static_cast<dnnl::memory::dim>(weightDims[0]),
static_cast<dnnl::memory::dim>(weightDims[1]),
dnnl::memory::dim{1}};
auto convWeightDescAny = dnnl::memory::desc(normalizedWeightDims, weightDesc.data_type(), dnnl::memory::format_tag::any);
std::shared_ptr<dnnl::convolution_forward::desc> desc;
if (biasDescPtr) {
desc = std::make_shared<dnnl::convolution_forward::desc>(prop_kind::forward_scoring, dnnl::algorithm::convolution_direct,
convInDesc, convWeightDescAny, biasDescPtr->getDnnlDesc(), convOutDesc,
dnnl::memory::dims{1}, // stride
dnnl::memory::dims{0}, // dilation
dnnl::memory::dims{0}, // paddingL
dnnl::memory::dims{0}); // paddingR
} else {
desc = std::make_shared<dnnl::convolution_forward::desc>(prop_kind::forward_scoring, dnnl::algorithm::convolution_direct,
convInDesc, convWeightDescAny, convOutDesc,
dnnl::memory::dims{1}, // stride
dnnl::memory::dims{0}, // dilation
dnnl::memory::dims{0}, // paddingL
dnnl::memory::dims{0}); // paddingR
}
return DnnlDesriptor(desc);
}
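To make the fake-shape mapping concrete, an illustrative sketch (the dims come from one of the 2D test shapes added below; this is not literal code from the patch):

// FC problem: src {256, 188} = M x K, weights {120, 188} = N x K, dst {256, 120} = M x N,
// mapped onto a 1D convolution with kernel width 1:
dnnl::memory::dims convSrcDims {1, 188, 256};  // N, IC, W  (nwc)
dnnl::memory::dims convWeiDims {120, 188, 1};  // OC, IC, KW (format_tag::any)
dnnl::memory::dims convDstDims {1, 120, 256};  // N, OC, W  (nwc)
// stride {1}, dilation {0}, paddingL/paddingR {0}, exactly as built above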
bool FullyConnected::canBeExecutedInConv1x1() const {
bool retVal = false;
const auto inRank = getInputShapeAtPort(DATA_ID).getRank();
const auto weightRank = getInputShapeAtPort(WEIGHTS_ID).getRank();
// disable rank=4:
// if the layout is nhwc:
//   A matrix: N * IC * H * W --> N * (IC*H*W); the M, N', K of the matrix multiply become
//   M = 1, K = (IC*H*W); with M = 1 it is not efficient, since it acts as a vector multiply
// if the layout is nchw/nChw16c: brg1x1 does not support it; although jit supports it,
//   it would have the same problem as above
if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core) &&
getOriginalInputPrecisionAtPort(DATA_ID) == InferenceEngine::Precision::FP32 &&
one_of(inRank, 2, 3) && weightRank == 2) {
auto dstMemPtr = getChildEdgesAtPort(0)[0]->getMemoryPtr();
DnnlMemoryDescCPtr outDesc = dstMemPtr->GetDescWithType<DnnlMemoryDesc>();
// brg convolution does not support stride
if (outDesc->getDnnlDesc().data.offset0 == 0)
retVal = true;
}
if (retVal) {
auto srcMemPtr = getParentEdgesAtPort(0)[0]->getMemoryPtr();
const auto& srcDims = srcMemPtr->getStaticDims();
auto weightMemPtr = getParentEdgesAtPort(1)[0]->getMemoryPtr();
const auto& weightDims = weightMemPtr->getStaticDims();
Dim M, N, K;
M = srcDims[inRank - 2];
K = srcDims[inRank - 1];
N = weightDims[0];
if (!(M >= 49 && M <= 3136 &&
K >= 96 && K <= 4096 &&
N >= 96 && N <= K * 4))
retVal = false;
}
return retVal;
}
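For illustration, how the M/K/N window above plays out on a few of the shapes exercised by the tests in this patch:

// eligible:  M=49,  K=96,  N=96   ({49, 96} x {96, 96}: the lower corner of the window)
// eligible:  M=256, K=188, N=120  ({256, 188} x {188, 120})
// rejected:  M=39 < 49            ({39, 120} x {120, 120}: stays on the inner product path)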
FullyConnected::ExecutorInnerProduct::ExecutorInnerProduct(const dnnl::inner_product_forward::primitive_desc& pd) {
execPrim.reset(new dnnl::inner_product_forward(pd));
}
FullyConnected::ExecutorConv1x1::ExecutorConv1x1(const dnnl::convolution_forward::primitive_desc& pd) {
execPrim.reset(new dnnl::convolution_forward(pd));
}
MemoryPtr FullyConnected::prepareWeightMemory(DnnlMemoryDescPtr weightDesc) {
if (!getParentEdgeAt(1)->getParent()->isConstant())
IE_THROW() << "Weight input is not const for node " << getName() << ".";
auto blob = getParentEdgeAt(1)->getMemoryPtr();
if (!blob)
IE_THROW() << "Cannot get const weights blob for node " << getName() << ".";
auto constDnnlMemOutDesc = blob->GetDescWithType<DnnlMemoryDesc>();
auto weightSrcDesc = constDnnlMemOutDesc->getDnnlDesc();
weightSrcDesc = weightSrcDesc.reshape(weightDesc->getDnnlDesc().dims());
auto create = [&] () {
auto newSrcDesc = DnnlExtensionUtils::makeDescriptor(weightSrcDesc);
Memory srcMemory{ getEngine() };
srcMemory.Create(newSrcDesc, blob->GetData());
MemoryPtr _ptr = std::make_shared<Memory>(getEngine());
_ptr->Create(weightDesc);
node::Reorder::reorderData(srcMemory, *_ptr, getRuntimeCache());
return _ptr;
};
MemoryPtr ptr;
const auto& format = weightDesc->serializeFormat();
auto itr = privateWeightCache.find(format);
if (privateWeightCache.end() != itr) {
ptr = itr->second;
} else {
if (weightCache != nullptr) {
const std::string string_hash = getName() + "_" + format
+ "_" + std::to_string(blob->GetSize())
+ "_" + std::to_string(reinterpret_cast<uint64_t>(blob->GetData()));
ptr = *weightCache->findOrCreate(string_hash, create);
} else {
ptr = create();
}
privateWeightCache[format] = ptr;
}
return ptr;
}
} // namespace node
} // namespace intel_cpu
} // namespace ov


@@ -9,6 +9,7 @@
#include <memory>
#include <string>
#include <vector>
#include "common/dnnl_executor.h"
namespace ov {
namespace intel_cpu {
@@ -40,6 +41,7 @@ public:
}
void initSupportedPrimitiveDescriptors() override;
void initOptimalPrimitiveDescriptor() override;
std::shared_ptr<MemoryDesc> getSrcMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
std::shared_ptr<MemoryDesc> getDstMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
@@ -75,6 +77,35 @@ private:
static const size_t WEIGHTS_ID = 1;
static const size_t BIAS_ID = 2;
dnnl::memory::data_type outputDataType;
using executorPtr = std::shared_ptr<DnnlExecutor>;
executorPtr execPtr = nullptr;
bool useConv1x1 = false;
impl_desc_type implementationTypeIP;
MemoryDescPtr weightDescIP;
// When weightCache is not enabled (e.g. stream=1), brgconv weights may change with
// different shapes, so they are cached in privateWeightCache.
// When weightCache is enabled, privateWeightCache still keeps the weight ptr alive,
// since weightCache itself does not hold a strong reference.
std::unordered_map<std::string, MemoryPtr> privateWeightCache;
class ExecutorInnerProduct : public DnnlExecutor {
public:
ExecutorInnerProduct(const dnnl::inner_product_forward::primitive_desc& pd);
};
class ExecutorConv1x1 : public DnnlExecutor {
public:
ExecutorConv1x1(const dnnl::convolution_forward::primitive_desc& pd);
};
static DnnlDesriptor createDescriptorInternalForConv(DnnlMemoryDescCPtr inputDescPtr,
DnnlMemoryDescCPtr weightDescPtr,
DnnlMemoryDescCPtr biasDescPtr,
DnnlMemoryDescCPtr outputDescPtr);
bool canBeExecutedInConv1x1() const;
MemoryPtr prepareWeightMemory(const DnnlMemoryDescPtr weightDesc);
};
} // namespace node


@@ -412,7 +412,7 @@ std::string Reorder::getReorderArgs(const MemoryDesc &parentDesc, const MemoryDe
return inArgs + "_" + outArgs;
}
void Reorder::reorderData(const Memory &input, const Memory &output) {
void Reorder::reorderData(const Memory &input, const Memory &output, MultiCachePtr cache) {
if (!input.getDesc().isDefined() || !output.getDesc().isDefined())
IE_THROW() << "Can't reorder data with dynamic shapes";
@@ -427,17 +427,44 @@ void Reorder::reorderData(const Memory &input, const Memory &output) {
auto copySize = output.GetSize();
cpu_memcpy(dstPtr, srcPtr, copySize);
} else {
std::unique_ptr<dnnl::reorder> pReorder;
dnnl::memory srcMemory;
auto getReorder = [] (MultiCachePtr& cache, const dnnl::memory& srcMemory, const dnnl::memory& dstMemory)
-> std::shared_ptr<dnnl::reorder> {
const auto& engine = dstMemory.get_engine();
auto builder = [&engine](const ReorderKey& key) -> std::shared_ptr<dnnl::reorder> {
dnnl::primitive_attr attr;
reorder::primitive_desc pd = dnnl::reorder::primitive_desc(engine, key.src, engine, key.dest, attr, true);
DEBUG_LOG(key.src, "->", key.dest);
if (!pd)
return nullptr;
return std::make_shared<dnnl::reorder>(pd);
};
std::shared_ptr<dnnl::reorder> reorder;
auto src_desc = srcMemory.get_desc();
auto dst_desc = dstMemory.get_desc();
ReorderKey key = {src_desc, dst_desc};
if (!cache) {
reorder = builder(key);
} else {
auto result = cache->getOrCreate(key, builder);
reorder = std::move(result.first);
}
return reorder;
};
std::shared_ptr<dnnl::reorder> pReorder;
std::vector<uint8_t> tmpBuff;
try {
pReorder = std::unique_ptr<dnnl::reorder>(new dnnl::reorder(input.GetPrimitive(), output.GetPrimitive()));
srcMemory = input.GetPrimitive();
}
catch (const dnnl::error& err) {
if (dnnl_unimplemented == err.status && output.GetDataType() != input.GetDataType() && Convert::isSupportedDesc(input.getDesc()) &&
Convert::isSupportedDesc(output.getDesc())) {
auto srcMemory = input.GetPrimitive();
auto dstMemory = output.GetPrimitive();
auto engine = output.getEngine();
// try a direct reorder first
pReorder = getReorder(cache, srcMemory, dstMemory);
if (!pReorder) {
// try precision conversion then do the reorder
if (output.GetDataType() != input.GetDataType() && Convert::isSupportedDesc(input.getDesc()) &&
Convert::isSupportedDesc(output.getDesc())) {
// we probably could not create the reorder because no implementation supports this precision conversion;
// let's try to convert the data first using cpu_convert
auto data = static_cast<const uint8_t *>(input.GetPtr());
@@ -447,19 +474,20 @@ void Reorder::reorderData(const Memory &input, const Memory &output) {
cpu_convert(data, tmpBuff.data(), DnnlExtensionUtils::DataTypeToIEPrecision(input.GetDataType()),
outPrc, input.GetSize() / input.getDesc().getPrecision().size());
Memory tmpMem(output.getEngine());
Memory tmpMem(engine);
auto tmpDesc = input.getDesc().cloneWithNewPrecision(outPrc);
tmpMem.Create(std::move(tmpDesc), tmpBuff.data());
pReorder = std::unique_ptr<dnnl::reorder>(new dnnl::reorder(tmpMem.GetPrimitive(), output.GetPrimitive()));
srcMemory = tmpMem.GetPrimitive();
} else {
throw;
pReorder = getReorder(cache, srcMemory, dstMemory);
}
if (!pReorder) {
IE_THROW() << "No reorder available for the following tensor descriptors: "
<< input.getDesc().serializeFormat() << " and " << output.getDesc().serializeFormat();
}
}
if (pReorder) {
dnnl::stream loc_stream(output.getEngine(), dnnl::stream::flags::in_order);
auto dstMemory = output.GetPrimitive();
dnnl::stream loc_stream(engine, dnnl::stream::flags::in_order);
pReorder->execute(loc_stream, srcMemory, dstMemory);
} else {
IE_THROW() << "Could not make onednn reorder.";


@@ -65,7 +65,7 @@ public:
static std::string getReorderArgs(const MemoryDesc &parentDesc, const MemoryDesc &childDesc);
static void reorderData(const Memory &input, const Memory &output);
static void reorderData(const Memory &input, const Memory &output, MultiCachePtr cache = nullptr);
private:
std::shared_ptr<MemoryDesc> input;


@@ -14,7 +14,7 @@ Primitive::operator bool() const {
return prim ? true : false;
}
dnnl::primitive Primitive::operator*() {
dnnl::primitive Primitive::operator*() const {
return *prim;
}


@@ -19,7 +19,7 @@ public:
Primitive();
operator bool() const;
Primitive& operator=(const std::shared_ptr<dnnl::primitive>& primitive);
dnnl::primitive operator*();
dnnl::primitive operator*() const;
void reset(dnnl::primitive* primitive);
private:


@@ -248,6 +248,16 @@ std::vector<CPUSpecificParams> filterSpecificParams_BrgemmAmx() {
return specificParams;
}
std::vector<CPUSpecificParams> filterSpecificParams_Brgconv1x1() {
std::vector<CPUSpecificParams> specificParams;
if (with_cpu_x86_avx512_core()) {
specificParams.push_back(CPUSpecificParams{{}, {}, {"brgconv_avx512_1x1"}, "brgconv_avx512_1x1"});
}
return specificParams;
}
/* ============= FullyConnected ============= */
namespace fullyConnected {
@@ -534,7 +544,7 @@ INSTANTIATE_TEST_SUITE_P(nightly_FC_3D, MatMulLayerCPUTest, testParams3D_nightly
INSTANTIATE_TEST_SUITE_P(nightly_FC_3D_BF16, MatMulLayerCPUTest, testParams3DBF16_nightly, MatMulLayerCPUTest::getTestCaseName);
const std::vector<ShapeRelatedParams> IS2D_Brgemm_smoke = {
{static_shapes_to_test_representation({{59, 120}, {120, 120}}), {true, false}},
{static_shapes_to_test_representation({{39, 120}, {120, 120}}), {true, false}},
{static_shapes_to_test_representation({{59, 16}, {16, 120}}), {true, false}},
{static_shapes_to_test_representation({{59, 16}, {16, 120}}), {true, true}},
@@ -596,6 +606,46 @@ const auto testParams2D_Brgemm_smoke = ::testing::Combine(fullyConnectedParams2D
INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_Brgemm, MatMulLayerCPUTest, testParams2D_Brgemm_smoke, MatMulLayerCPUTest::getTestCaseName);
const std::vector<ShapeRelatedParams> IS2D_Brgconv1x1_smoke = {
{static_shapes_to_test_representation({{49, 96}, {96, 96}}), {true, false}},
{static_shapes_to_test_representation({{256, 188}, {188, 120}}), {true, false}},
{static_shapes_to_test_representation({{256, 188}, {188, 120}}), {true, true}},
{static_shapes_to_test_representation({{71, 128}, {128, 200}}), {false, false}},
{static_shapes_to_test_representation({{71, 128}, {128, 200}}), {false, true}},
{
{
{{-1, -1}, {{49, 96}, {59, 96}, {69, 96}, {79, 96}}},
{{96, 96}, {{96, 96}, {96, 96}, {96, 96}, {96, 96}}}
},
{false, false}
},
{
{
{{{0, 200}, {0, 200}}, {{98, 128}, {199, 128}}},
{{128, 166}, {{128, 166}, {128, 166}}}
},
{true, true}
},
};
const auto fullyConnectedParams2D_Brgconv1x1_smoke = ::testing::Combine(::testing::ValuesIn(IS2D_Brgconv1x1_smoke),
::testing::Values(ElementType::f32),
::testing::Values(ElementType::undefined),
::testing::Values(ElementType::undefined),
::testing::Values(helpers::InputLayerType::CONSTANT),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(emptyAdditionalConfig));
const auto testParams2D_Brgconv1x1_smoke = ::testing::Combine(fullyConnectedParams2D_Brgconv1x1_smoke,
::testing::Values(MatMulNodeType::FullyConnected),
::testing::ValuesIn(fusingParamsSet2D_Brgemm_smoke),
::testing::ValuesIn(filterSpecificParams_Brgconv1x1()));
INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_Brgconv1x1, MatMulLayerCPUTest, testParams2D_Brgconv1x1_smoke, MatMulLayerCPUTest::getTestCaseName);
const auto fullyConnectedParams2D_Brgemm_Amx_smoke = ::testing::Combine(::testing::ValuesIn(IS2D_Brgemm_smoke),
::testing::Values(ElementType::f32),
::testing::Values(ElementType::undefined),