From 05ab0f32d708842b5a58f6f85af33fd4d2a5a1bf Mon Sep 17 00:00:00 2001
From: Tingqian Li
Date: Wed, 29 Mar 2023 16:27:08 +0800
Subject: [PATCH] [CPU] Simple fix of redundant const-weight reordering for
 brgconv node in dynamic model (#16305)

---
 src/plugins/intel_cpu/src/graph.cpp           | 18 ++--
 src/plugins/intel_cpu/src/node.cpp            | 45 ++++++++++
 src/plugins/intel_cpu/src/node.h              | 10 +++
 src/plugins/intel_cpu/src/nodes/conv.cpp      | 82 +++++++++++++------
 src/plugins/intel_cpu/src/nodes/conv.h        |  4 +-
 .../intel_cpu/src/nodes/fullyconnected.cpp    | 45 ----------
 .../intel_cpu/src/nodes/fullyconnected.h      |  6 --
 .../src/utils/debug_capabilities.cpp          | 47 +++++++----
 ...ntwise_branch_selection_transformation.cpp |  4 -
 .../snippets/conv_eltwise.cpp                 |  4 +-
 .../fake_quantize_decomposition_test.cpp      |  4 +-
 .../src/concat_const_inplace.cpp              |  4 +-
 12 files changed, 163 insertions(+), 110 deletions(-)

diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp
index 0c275fc183b..3c1b32b8ca0 100644
--- a/src/plugins/intel_cpu/src/graph.cpp
+++ b/src/plugins/intel_cpu/src/graph.cpp
@@ -450,11 +450,19 @@ void Graph::InitDescriptors() {
         node->filterSupportedPrimitiveDescriptors();
 
 #ifdef CPU_DEBUG_CAPS
-        DEBUG_LOG("==================");
-        for (auto & pd : node->getSupportedPrimitiveDescriptors())
-            DEBUG_LOG("#", node->getExecIndex(),
-                      " ", node->getName(),
-                      " SupportedPrimitiveDescriptor:\n", pd);
+        const auto& SPDs = node->getSupportedPrimitiveDescriptors();
+        for (int i = 0; i < SPDs.size(); i++) {
+            DEBUG_LOG("#",
+                      node->getExecIndex(),
+                      " ",
+                      node->getName(),
+                      " SupportedPrimitiveDescriptors [",
+                      i,
+                      "/",
+                      SPDs.size(),
+                      "]: \n",
+                      SPDs[i]);
+        }
 #endif
     }
 
diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp
index f80ed309383..67e289aa1cb 100644
--- a/src/plugins/intel_cpu/src/node.cpp
+++ b/src/plugins/intel_cpu/src/node.cpp
@@ -821,6 +821,51 @@ void Node::prepareMemory(dnnl::primitive_desc_iterator& itpd) {
     Node::prepareMemory(intDescs);
 }
 
+MemoryPtr Node::prepareWeightMemory(DnnlMemoryDescPtr weightDesc) {
+    if (!getParentEdgeAt(1)->getParent()->isConstant())
+        IE_THROW() << "Weight input is not const for node " << getName() << ".";
+    auto edgeMem = getParentEdgeAt(1)->getMemoryPtr();
+    if (!edgeMem)
+        IE_THROW() << "Cannot get const weights edgeMem for node " << getName() << ".";
+
+    auto constDnnlMemOutDesc = edgeMem->GetDescWithType<DnnlMemoryDesc>();
+    auto weightSrcDesc = constDnnlMemOutDesc->getDnnlDesc();
+    weightSrcDesc = weightSrcDesc.reshape(weightDesc->getDnnlDesc().get_dims());
+    auto create = [&] () {
+        auto newSrcDesc = DnnlExtensionUtils::makeDescriptor(weightSrcDesc);
+
+        Memory srcMemory{ getEngine() };
+        srcMemory.Create(newSrcDesc, edgeMem->GetData());
+
+        MemoryPtr _ptr = std::make_shared<Memory>(getEngine());
+        _ptr->Create(weightDesc);
+        node::Reorder::reorderData(srcMemory, *_ptr, context->getParamsCache());
+
+        return _ptr;
+    };
+
+    MemoryPtr ptr;
+    const auto& format = weightDesc->serializeFormat();
+    auto itr = privateWeightCache.find(format);
+    if (privateWeightCache.end() != itr) {
+        ptr = itr->second;
+    } else {
+        auto weightCache = context->getWeightsCache();
+        if (weightCache != nullptr) {
+            const std::string string_hash = getName() + "_" + format
+                                            + "_" + std::to_string(edgeMem->GetSize())
+                                            + "_" + std::to_string(reinterpret_cast<uint64_t>(edgeMem->GetData()));
+
+            ptr = *weightCache->findOrCreate(string_hash, create);
+        } else {
+            ptr = create();
+        }
+        privateWeightCache[format] = ptr;
+    }
+
+    return ptr;
+}
+
 bool Node::isInPlace() {
     if (inplace == InPlaceType::Unknown) {
         auto selected_pd = getSelectedPrimitiveDescriptor();
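The new Node::prepareWeightMemory() above consults two cache levels: privateWeightCache, a per-node map that holds strong references keyed by the serialized target weight format, and the optional shared weights cache from the context, which deduplicates identical reorders across nodes but keeps only weak references. Below is a minimal standalone sketch of that two-level pattern; it is illustrative only, and Blob, SharedWeightsCache and prepareWeight are hypothetical names, not the plugin's real API:

#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>

// Illustrative stand-in for a reordered weight blob.
struct Blob { std::string layout; };

// Shared cache holding only weak references (like the per-socket weights cache):
// an entry expires once all strong references outside the cache are gone.
class SharedWeightsCache {
public:
    std::shared_ptr<Blob> findOrCreate(const std::string& key,
                                       const std::function<std::shared_ptr<Blob>()>& create) {
        auto it = map_.find(key);
        if (it != map_.end()) {
            if (auto alive = it->second.lock())  // still alive -> reuse
                return alive;
        }
        auto fresh = create();
        map_[key] = fresh;  // store a weak reference only
        return fresh;
    }
private:
    std::unordered_map<std::string, std::weak_ptr<Blob>> map_;
};

// Per-node strong-reference cache keyed by serialized weight format,
// mirroring the role of Node::privateWeightCache in the patch.
std::unordered_map<std::string, std::shared_ptr<Blob>> privateWeightCache;

std::shared_ptr<Blob> prepareWeight(SharedWeightsCache* shared, const std::string& format) {
    auto itr = privateWeightCache.find(format);
    if (itr != privateWeightCache.end())
        return itr->second;  // already reordered into this layout
    auto create = [&] { return std::make_shared<Blob>(Blob{format}); };
    auto ptr = shared ? shared->findOrCreate("node1_" + format, create) : create();
    privateWeightCache[format] = ptr;  // keep a strong reference alive
    return ptr;
}

int main() {
    SharedWeightsCache shared;
    auto a = prepareWeight(&shared, "ABcd16b16a");
    auto b = prepareWeight(&shared, "ABcd16b16a");  // served from privateWeightCache
    std::cout << (a == b) << "\n";  // prints 1: no second reorder happened
}

Because the private map owns a strong reference, a later call for the same layout returns the cached copy instead of reordering again; this is exactly the redundant const-weight reordering the patch removes.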
diff --git a/src/plugins/intel_cpu/src/node.h b/src/plugins/intel_cpu/src/node.h
index dd78bfd0159..d9f242b353d 100644
--- a/src/plugins/intel_cpu/src/node.h
+++ b/src/plugins/intel_cpu/src/node.h
@@ -619,6 +619,8 @@ protected:
     void prepareMemory(const std::vector<DnnlMemoryDescPtr>& intDescs);
     void prepareMemory(dnnl::primitive_desc_iterator& itpd);
 
+    MemoryPtr prepareWeightMemory(DnnlMemoryDescPtr weightDesc);
+
     bool isDynamic = false;
 
     bool isInputTensorAtPortEmpty(size_t port) const;
@@ -687,6 +689,14 @@ private:
     enum LOOK { LOOK_UP = 1, LOOK_DOWN = 2 };
     ConstantType checkConstant(LOOK look, std::vector<NodePtr>& checkNodes);
 
+    // We cannot rely on the per-NUMA-node weightCache for caching weights because:
+    // 1. it may not exist (e.g. in a single-stream configuration);
+    // 2. it only holds weak references, so the life cycle of a cached item is
+    //    still controlled by the strong references outside of the cache.
+    // privateWeightCache holds strong references to constant weight copies that
+    // share the same content but have different layouts.
+    std::unordered_map<std::string, MemoryPtr> privateWeightCache;
+
 #ifdef CPU_DEBUG_CAPS
     friend class Verbose;
 #endif
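The weak-reference caveat called out in the new comment above is easy to see in isolation: a cache holding only weak_ptr entries cannot keep a reordered weight alive by itself. A tiny illustrative example (not plugin code):

#include <iostream>
#include <memory>

int main() {
    std::weak_ptr<int> cacheEntry;
    {
        auto weight = std::make_shared<int>(42);  // strong reference outside the cache
        cacheEntry = weight;                      // cache stores only a weak reference
        std::cout << "alive: " << !cacheEntry.expired() << "\n";  // alive: 1
    }   // last strong reference dropped -> cached item is destroyed
    std::cout << "alive: " << !cacheEntry.expired() << "\n";      // alive: 0
}

This is why the node itself must hold the strong reference, which is what the added privateWeightCache member does.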
diff --git a/src/plugins/intel_cpu/src/nodes/conv.cpp b/src/plugins/intel_cpu/src/nodes/conv.cpp
index a7b86667825..94b6481b8cf 100644
--- a/src/plugins/intel_cpu/src/nodes/conv.cpp
+++ b/src/plugins/intel_cpu/src/nodes/conv.cpp
@@ -57,6 +57,8 @@ struct ConvKey {
     dnnl::primitive_attr attr;
     impl_desc_type implType;
 
+    bool constWeight;
+
     size_t hash() const;
     bool operator==(const ConvKey& rhs) const;
 };
@@ -80,6 +82,7 @@ size_t ConvKey::hash() const {
 
     seed = hash_combine(seed, get_attr_hash(*attr.get()));
     seed = hash_combine(seed, implType);
+    seed = hash_combine(seed, constWeight);
 
     return seed;
 }
@@ -103,7 +106,7 @@ bool ConvKey::operator==(const ConvKey &rhs) const {
     retVal = retVal && paddingL == rhs.paddingL;
     retVal = retVal && paddingR == rhs.paddingR;
 
-    retVal = retVal && *attr.get() == *rhs.attr.get() && implType == rhs.implType;
+    retVal = retVal && *attr.get() == *rhs.attr.get() && implType == rhs.implType && constWeight == rhs.constWeight;
     return retVal;
 }
 
@@ -851,6 +854,14 @@ createDescriptorInternal(const dnnl::engine& engine,
     }
 }
 } // namespace
 
+static memory::data_type deriveWeightDataType(memory::data_type src_dt) {
+    memory::data_type wdt = src_dt;
+    if (one_of(src_dt, memory::data_type::s8, memory::data_type::u8)) {
+        wdt = memory::data_type::s8;
+    }
+    return wdt;
+}
+
 void Convolution::createDescriptor(const std::vector<MemoryDescPtr>& inputDesc,
                                    const std::vector<MemoryDescPtr>& outputDesc) {
     MemoryDescPtr inpDesc;
@@ -874,12 +885,7 @@ void Convolution::createDescriptor(const std::vector<MemoryDescPtr>& inputDesc,
     const auto& inDnnlDesc = definedInpMemDesc->getDnnlDesc();
     const auto& outDnnlDesc = definedOutMemDesc->getDnnlDesc();
 
-    memory::data_type dt = inDnnlDesc.get_data_type();
-    memory::data_type wdt = dt;
-
-    if (one_of(dt, memory::data_type::s8, memory::data_type::u8)) {
-        wdt = memory::data_type::s8;
-    }
+    memory::data_type wdt = deriveWeightDataType(inDnnlDesc.get_data_type());
 
     dnnl::memory::desc weightDnnlDesc(DnnlExtensionUtils::convertToDnnlDims(weightDims), wdt, memory::format_tag::any);
     dnnl::memory::desc biasDnnlDesc;
@@ -1143,6 +1149,11 @@ bool Convolution::isPossibleToSkipInitConfig(const dnnl::primitive_desc &desc) const {
 }
 
 std::shared_ptr<MemoryDesc> Convolution::getSrcMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) {
+    if (idx == 1) {
+        // report the original plain layout for the weight, since it needs to be reordered dynamically at runtime
+        return std::make_shared<CpuBlockedMemoryDesc>(getOriginalInputPrecisionAtPort(idx),
+                                                      Shape(getInputShapeAtPort(idx).getStaticDims()));
+    }
     auto desc = idx > 0 ? primitive_desc_it.weights_desc(idx - 1) : primitive_desc_it.src_desc(idx);
     if (getInputShapeAtPort(idx).isDynamic()) {
         return DnnlExtensionUtils::makeUndefinedDesc(desc, getInputShapeAtPort(idx));
@@ -1352,10 +1363,17 @@ void Convolution::prepareParams() {
                 paddingL,
                 paddingR,
                 *pAttrLocal,
-                selected_pd->getImplementationType()};
+                selected_pd->getImplementationType(),
+                getParentEdgeAt(1)->getParent()->isConstant()};
 
     auto engine = getEngine();
     auto builder = [&engine](const ConvKey& key) -> executorPtr {
+        // remove the requirement on the weight memory layout to let the primitive
+        // report the best layout for the weight, to be reordered dynamically at runtime
+        auto wghDescAny =
+            dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(key.inp1->getShape().getStaticDims()),
+                               deriveWeightDataType(key.inp0->getDataType()),
+                               memory::format_tag::any);
         auto createDnnlConvDesc = [](const dnnl::engine engine,
                                      const dnnl::memory::desc& srcDesc,
                                      const dnnl::memory::desc& wghDesc,
@@ -1390,7 +1408,7 @@ void Convolution::prepareParams() {
                                                     : dnnl::algorithm::convolution_direct;
         dnnl::primitive_desc desc = createDnnlConvDesc(engine,
                                                        key.inp0->getDnnlDesc(),
-                                                       key.inp1->getDnnlDesc(),
+                                                       wghDescAny,
                                                        key.out->getDnnlDesc(),
                                                        key.bias,
                                                        key.stride,
@@ -1401,7 +1419,6 @@ void Convolution::prepareParams() {
                                                        key.attr);
 
         auto itpd = desc;
-
         executorPtr execPtr = nullptr;
 
         while (static_cast<bool>(itpd)) {
             impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
@@ -1412,7 +1429,8 @@ void Convolution::prepareParams() {
                                                                  key.inp0->getDnnlDesc(),
                                                                  key.inp1->getDnnlDesc(),
                                                                  key.out->getDnnlDesc(),
-                                                                 engine);
+                                                                 engine,
+                                                                 key.constWeight);
                 break;
             }
 
@@ -1425,16 +1443,13 @@ void Convolution::prepareParams() {
             auto inDesc = dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(key.inp0->getShape().getStaticDims()),
                                              key.inp0->getDataType(),
                                              memory::format_tag::any);
-            auto wghDesc = dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(key.inp1->getShape().getStaticDims()),
-                                              key.inp1->getDataType(),
-                                              memory::format_tag::any);
             auto outDesc = dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(key.out->getShape().getStaticDims()),
                                               key.out->getDataType(),
                                               memory::format_tag::any);
 
             auto reorderConvDesc = createDnnlConvDesc(engine,
                                                       inDesc,
-                                                      wghDesc,
+                                                      wghDescAny,
                                                       outDesc,
                                                       key.bias,
                                                       key.stride,
@@ -1450,13 +1465,15 @@ void Convolution::prepareParams() {
                                                      key.inp0->getDnnlDesc(),
                                                      key.inp1->getDnnlDesc(),
                                                      key.out->getDnnlDesc(),
-                                                     engine);
+                                                     engine,
+                                                     key.constWeight);
             }
         }
 
         return execPtr;
     };
 
+    auto prevExecPtr = execPtr;
     execPtr = nullptr;
     auto cache = context->getParamsCache();
     auto result = cache->getOrCreate(key, builder);
@@ -1465,9 +1482,22 @@ void Convolution::prepareParams() {
 
     if (execPtr) {
         primArgs[DNNL_ARG_SRC] = srcMemPtr->GetPrimitive();
-        primArgs[DNNL_ARG_WEIGHTS] = wghMemPtr->GetPrimitive();
         primArgs[DNNL_ARG_DST] = dstMemPtr->GetPrimitive();
 
+        if (key.constWeight) {
+            // const weight preparation/reordering needs to be done once, at the next execution,
+            // when the input weight data is guaranteed to be ready (considering possible const-folding
+            // subgraphs inserted between the constant weight node and the conv)
+            auto it = primArgs.find(DNNL_ARG_WEIGHTS);
+            if (it == primArgs.end() || !prevExecPtr ||
+                !execPtr->getWeightDesc()->isCompatible(*(prevExecPtr->getWeightDesc()))) {
+                pendingConstWeightReorder = true;
+            }
+        } else {
+            // a non-const weight will be reordered by the executor on every exec
+            primArgs[DNNL_ARG_WEIGHTS] = wghMemPtr->GetPrimitive();
+        }
+
         if (withBiases) {
             primArgs[DNNL_ARG_BIAS] = biasMemPtr->GetPrimitive();
         }
@@ -1497,12 +1527,14 @@ Convolution::ConvolutionExecutor::ConvolutionExecutor(const dnnl::convolution_forward::primitive_desc& pd,
                                                       const dnnl::memory::desc& inMemDesc,
                                                       const dnnl::memory::desc& weightMemDesc,
                                                       const dnnl::memory::desc& outMemDesc,
-                                                      const dnnl::engine& engine) : DnnlExecutor(pd) {
+                                                      const dnnl::engine& engine,
+                                                      bool constWeight) : DnnlExecutor(pd) {
     if (inMemDesc != getDnnlSrcDesc()) {
         inputReorders.insert({DNNL_ARG_SRC, IntermReorder(inMemDesc, getDnnlSrcDesc(), engine)});
     }
 
-    if (weightMemDesc != getDnnlWeightDesc()) {
+    if (!constWeight && weightMemDesc != getDnnlWeightDesc()) {
+        // a const weight will be reordered at the first execution instead
         inputReorders.insert({DNNL_ARG_WEIGHTS, IntermReorder(weightMemDesc, getDnnlWeightDesc(), engine)});
     }
 
@@ -1516,6 +1548,11 @@ void Convolution::execute(dnnl::stream strm) {
         IE_THROW() << "Can't execute Convolution node with name: " << getName() << ", because executor is not compiled";
     }
 
+    if (pendingConstWeightReorder) {
+        primArgs[DNNL_ARG_WEIGHTS] = prepareWeightMemory(execPtr->getWeightDesc())->GetPrimitive();
+        pendingConstWeightReorder = false;
+    }
+
     execPtr->exec(primArgs, strm);
 }
 
@@ -1630,13 +1667,8 @@ void Convolution::appendZeroPointsArgs() {
     }
 }
 
-// brgconv will be enabled by default:
-// 1, static shape(dynamic shape may change weights layout if the input shape changes and cause performance issue: 86948)
-// 2, hw supports avx512+
+// brgconv will be enabled by default when HW supports avx512+
 void Convolution::initTryBrgconvFlag() {
-    if (isDynamicNode())
-        return;
-
     if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) {
         shouldTryBrgconv = true;
     }
diff --git a/src/plugins/intel_cpu/src/nodes/conv.h b/src/plugins/intel_cpu/src/nodes/conv.h
index fb9385601ca..d0e4c48c151 100644
--- a/src/plugins/intel_cpu/src/nodes/conv.h
+++ b/src/plugins/intel_cpu/src/nodes/conv.h
@@ -94,8 +94,10 @@ private:
                             const dnnl::memory::desc& inMemDesc,
                             const dnnl::memory::desc& weightMemDesc,
                             const dnnl::memory::desc& outMemDesc,
-                            const dnnl::engine& engine);
+                            const dnnl::engine& engine,
+                            bool constWeight);
     };
+    bool pendingConstWeightReorder = false;
 
     void prepareParams() override;
     void execute(dnnl::stream strm) override;
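The pendingConstWeightReorder flag declared above defers the constant-weight reorder out of prepareParams(): preparation only records that the selected primitive expects a different weight layout, and the first execute() performs the reorder once, when the weight data is guaranteed to be ready. A minimal sketch of this execute-once control flow, using hypothetical ConvNode/Executor types rather than the plugin's classes:

#include <functional>
#include <iostream>
#include <string>

// Illustrative executor: knows which weight layout the selected primitive wants.
struct Executor {
    std::string preferredWeightLayout;
};

class ConvNode {
public:
    void prepareParams(const Executor& e) {
        exec_ = e;
        // Weight data may not be ready yet (e.g. still to be produced by const
        // folding), so only mark the reorder as pending instead of doing it now.
        pendingConstWeightReorder_ = true;
    }

    void execute(const std::function<std::string()>& readWeightData) {
        if (pendingConstWeightReorder_) {
            // First execution: weight data is guaranteed ready; reorder once and cache.
            reorderedWeights_ = readWeightData() + " as " + exec_.preferredWeightLayout;
            pendingConstWeightReorder_ = false;
        }
        std::cout << "run conv with [" << reorderedWeights_ << "]\n";
    }

private:
    Executor exec_;
    bool pendingConstWeightReorder_ = false;
    std::string reorderedWeights_;
};

int main() {
    ConvNode conv;
    conv.prepareParams(Executor{"ABcd16b16a"});
    conv.execute([] { return std::string("const weights"); });  // reorders once
    conv.execute([] { return std::string("const weights"); });  // reuses the cached copy
}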
diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
index 608ed26ac45..7e2181c444c 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
@@ -913,51 +913,6 @@ bool FullyConnected::canBeExecutedInConv1x1() const {
     return retVal;
 }
 
-MemoryPtr FullyConnected::prepareWeightMemory(DnnlMemoryDescPtr weightDesc) {
-    if (!getParentEdgeAt(1)->getParent()->isConstant())
-        IE_THROW() << "Weight input is not const for node " << getName() << ".";
-    auto blob = getParentEdgeAt(1)->getMemoryPtr();
-    if (!blob)
-        IE_THROW() << "Cannot get const weights blob for node " << getName() << ".";
-
-    auto constDnnlMemOutDesc = blob->GetDescWithType<DnnlMemoryDesc>();
-    auto weightSrcDesc = constDnnlMemOutDesc->getDnnlDesc();
-    weightSrcDesc = weightSrcDesc.reshape(weightDesc->getDnnlDesc().get_dims());
-    auto create = [&] () {
-        auto newSrcDesc = DnnlExtensionUtils::makeDescriptor(weightSrcDesc);
-
-        Memory srcMemory{ getEngine() };
-        srcMemory.Create(newSrcDesc, blob->GetData());
-
-        MemoryPtr _ptr = std::make_shared<Memory>(getEngine());
-        _ptr->Create(weightDesc);
-        node::Reorder::reorderData(srcMemory, *_ptr, context->getParamsCache());
-
-        return _ptr;
-    };
-
-    MemoryPtr ptr;
-    const auto& format = weightDesc->serializeFormat();
-    auto itr = privateWeightCache.find(format);
-    if (privateWeightCache.end() != itr) {
-        ptr = itr->second;
-    } else {
-        auto weightCache = context->getWeightsCache();
-        if (weightCache != nullptr) {
-            const std::string string_hash = getName() + "_" + format
-                                            + "_" + std::to_string(blob->GetSize())
-                                            + "_" + std::to_string(reinterpret_cast<uint64_t>(blob->GetData()));
-
-            ptr = *weightCache->findOrCreate(string_hash, create);
-        } else {
-            ptr = create();
-        }
-        privateWeightCache[format] = ptr;
-    }
-
-    return ptr;
-}
-
 bool FullyConnected::useSparseWeightsDecompression() {
     // minSparseRate == 1 means that sparse feature is switched off
     if (minSparseRate == 1.f) {
diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.h b/src/plugins/intel_cpu/src/nodes/fullyconnected.h
index 3f0983f2fc2..8add77440fd 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.h
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.h
@@ -83,11 +83,6 @@ private:
     bool useConv1x1 = false;
     impl_desc_type implementationTypeIP;
     MemoryDescPtr weightDescIP;
-    // when weightCache is not enabled (such as stream=1), brgconv weights may change due to
-    // different shapes. Weights will be cached in privateWeightCache.
-    // When weightCache is enabled, it holds weight ptr reference since weightCache does not hold the
-    // reference
-    std::unordered_map<std::string, MemoryPtr> privateWeightCache;
     dnnl::primitive_attr attr;
 
     static dnnl::convolution_forward::primitive_desc
@@ -99,7 +94,6 @@ private:
                            const dnnl::engine& engine);
 
     bool canBeExecutedInConv1x1() const;
-    MemoryPtr prepareWeightMemory(const DnnlMemoryDescPtr weightDesc);
 
     // sparse weights
     bool useSparseWeights = false;
diff --git a/src/plugins/intel_cpu/src/utils/debug_capabilities.cpp b/src/plugins/intel_cpu/src/utils/debug_capabilities.cpp
index fb13000708c..b791f18cec1 100644
--- a/src/plugins/intel_cpu/src/utils/debug_capabilities.cpp
+++ b/src/plugins/intel_cpu/src/utils/debug_capabilities.cpp
@@ -21,6 +21,17 @@
 namespace ov {
 namespace intel_cpu {
 
+namespace {
+    size_t replace_all(std::string & inout, std::string what, std::string with) {
+        std::size_t count{};
+        for (std::string::size_type pos{}; inout.npos != (pos = inout.find(what.data(), pos, what.length()));
+             pos += with.length(), ++count) {
+            inout.replace(pos, what.length(), with.data(), with.length());
+        }
+        return count;
+    }
+}
+
 DebugLogEnabled::DebugLogEnabled(const char* file, const char* func, int line, const char* name) {
     // check ENV
     const char* p_filters = std::getenv("OV_CPU_DEBUG_LOG");
@@ -96,19 +107,27 @@ std::ostream & operator<<(std::ostream & os, const MemoryDesc& desc) {
 }
 
 std::ostream & operator<<(std::ostream & os, const NodeDesc& desc) {
-    os << " ImplementationType: " << impl_type_to_string(desc.getImplementationType()) << std::endl;
+    std::stringstream ss;
+    ss << " " << impl_type_to_string(desc.getImplementationType()) << "(";
+    const char * sep = "";
     for (auto & conf : desc.getConfig().inConfs) {
-        os << " inConfs: " << *conf.getMemDesc();
-        if (conf.inPlace() >= 0) os << " inPlace:" << conf.inPlace();
-        if (conf.constant()) os << " constant";
-        os << std::endl;
+        ss << sep << *conf.getMemDesc();
+        if (conf.inPlace() >= 0) ss << " inPlace:" << conf.inPlace();
+        if (conf.constant()) ss << " constant";
+        sep = ",";
     }
+    ss << ") -> (";
+    sep = "";
     for (auto & conf : desc.getConfig().outConfs) {
-        os << " outConfs: " << *conf.getMemDesc();
-        if (conf.inPlace() >= 0) os << " inPlace:" << conf.inPlace();
-        if (conf.constant()) os << " constant";
-        os << std::endl;
+        ss << sep << *conf.getMemDesc();
+        if (conf.inPlace() >= 0) ss << " inPlace:" << conf.inPlace();
+        if (conf.constant()) ss << " constant";
+        sep = ",";
     }
+    ss << ")" << std::endl;
+    auto str = ss.str();
+    replace_all(str, "0 - ?", "?");
+    os << str;
     return os;
 }
 
@@ -137,15 +156,7 @@ std::ostream & operator<<(std::ostream & os, const Node &c_node) {
         }
         return true;
     };
-    auto replace_all = [](std::string& inout, std::string what, std::string with) {
-        std::size_t count{};
-        for (std::string::size_type pos{};
-             inout.npos != (pos = inout.find(what.data(), pos, what.length()));
-             pos += with.length(), ++count) {
-            inout.replace(pos, what.length(), with.data(), with.length());
-        }
-        return count;
-    };
+
     auto nodeDesc = node.getSelectedPrimitiveDescriptor();
     std::stringstream leftside;
 
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/elementwise_branch_selection_transformation.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/elementwise_branch_selection_transformation.cpp
index f95c319d706..ff0642926ce 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/elementwise_branch_selection_transformation.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/elementwise_branch_selection_transformation.cpp
@@ -42,8 +42,6 @@ const std::vector p
         },
         { 256ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 2.55f }, { 0.f }, { 2.55f } },
         {
-            {"Constant", "convolution1"},
-            {"Constant", "convolution2"},
             {"fakeQuantizeBefore1", "convolution1"},
             {"fakeQuantizeBefore2", "convolution2"},
             {"maxPool", "result"}
@@ -75,8 +73,6 @@ const std::vector p
         },
         { 256ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 2.55f }, { 0.f }, { 2.55f } },
         {
-            {"Constant", "convolution1"},
-            {"Constant", "convolution2"},
             {"fakeQuantizeBefore1", "convolution1"},
             {"fakeQuantizeBefore2", "convolution2"},
             {"maxPool", "result"}
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/conv_eltwise.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/conv_eltwise.cpp
index ffc6ef57add..ab0aaf27ef1 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/conv_eltwise.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/conv_eltwise.cpp
@@ -16,7 +16,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvAdd, ConvEltwise,
                      ::testing::Values(convInputShape),
                      ::testing::Values(convInputShape),
                      ::testing::Values(std::shared_ptr<ngraph::Node>(std::make_shared<ngraph::opset1::Add>())), // non-tokenizable
-                     ::testing::Values(6), // num nodes = 6: Convert + Convolution + 4 Reorders on Convs in&outs
+                     ::testing::Values(5), // num nodes = 5: Convert + Convolution + 3 Reorders on Convs in&outs
                      ::testing::Values(0), // num subgraphs = 0: No subgraph since all ops eltwises fused into Convolution
                      ::testing::Values(CommonTestUtils::DEVICE_CPU)),
                  ConvEltwise::getTestCaseName);
@@ -26,7 +26,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvMul, ConvEltwise,
                      ::testing::Values(convInputShape),
                      ::testing::Values(convInputShape),
                      ::testing::Values(std::shared_ptr<ngraph::Node>(std::make_shared<ngraph::opset1::Multiply>())), // fully-tokenizable
-                     ::testing::Values(7), // num nodes = 7: Convert + Convolution + Subgraph + Reorders
+                     ::testing::Values(6), // num nodes = 6: Convert + Convolution + Subgraph + Reorders
                      ::testing::Values(1), // num subgraphs = 1: Mul (2 inputs) can't be fused into Conv => Subgraph is created
                      ::testing::Values(CommonTestUtils::DEVICE_CPU)),
                  ConvEltwise::getTestCaseName);
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/fake_quantize_decomposition_test.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/fake_quantize_decomposition_test.cpp
index a231dd9a595..6333c339a3d 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/fake_quantize_decomposition_test.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/fake_quantize_decomposition_test.cpp
@@ -121,8 +121,8 @@ INSTANTIATE_TEST_SUITE_P(
     ::testing::Combine(
         ::testing::ValuesIn(testValuesLegacyFuse),
         ::testing::ValuesIn(operations),
-        // reorder (nChw[16|8]c) + MaxPool + reorder(nhwc) + reorder(ABcd16b16a) + Convolution + reorder(nchw)
-        ::testing::Values(std::pair<size_t, size_t>{6, 0}),
+        // reorder (nChw[16|8]c) + MaxPool + reorder(nhwc) + Convolution(with internal weight reordering) + reorder(nchw)
+        ::testing::Values(std::pair<size_t, size_t>{5, 0}),
         ::testing::Values(CommonTestUtils::DEVICE_CPU)),
     FakeQuantizeDecompositionTest::getTestCaseName);
 
diff --git a/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/concat_const_inplace.cpp b/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/concat_const_inplace.cpp
index ec9eff60542..9e4ee465ee5 100644
--- a/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/concat_const_inplace.cpp
+++ b/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/concat_const_inplace.cpp
@@ -71,9 +71,9 @@ namespace {
 TEST_P(ConcatConstantInPlaceTest, smoke_ConcatConstantInPlaceTest_CPU) {
     Run();
     if (this->GetParam() == Precision::BF16)
-        CheckNumberOfNodesWithType(executableNetwork, "Reorder", 4);
-    else
         CheckNumberOfNodesWithType(executableNetwork, "Reorder", 3);
+    else
+        CheckNumberOfNodesWithType(executableNetwork, "Reorder", 2);
 }
 
 INSTANTIATE_TEST_SUITE_P(smoke_ConcatConstantInPlaceTest_CPU, ConcatConstantInPlaceTest,