[CPU] Simple fix of redundant const-weight reordering for brgconv node in dynamic model (#16305)

Tingqian Li
2023-03-29 16:27:08 +08:00
committed by GitHub
parent 556d469f6b
commit 05ab0f32d7
12 changed files with 163 additions and 110 deletions

View File

@@ -450,11 +450,19 @@ void Graph::InitDescriptors() {
node->filterSupportedPrimitiveDescriptors();
#ifdef CPU_DEBUG_CAPS
DEBUG_LOG("==================");
for (auto & pd : node->getSupportedPrimitiveDescriptors())
DEBUG_LOG("#", node->getExecIndex(),
" ", node->getName(),
" SupportedPrimitiveDescriptor:\n", pd);
const auto& SPDs = node->getSupportedPrimitiveDescriptors();
for (int i = 0; i < SPDs.size(); i++) {
DEBUG_LOG("#",
node->getExecIndex(),
" ",
node->getName(),
" SupportedPrimitiveDescriptors [",
i,
"/",
SPDs.size(),
"]: \n",
SPDs[i]);
}
#endif
}

View File

@@ -821,6 +821,51 @@ void Node::prepareMemory(dnnl::primitive_desc_iterator& itpd) {
Node::prepareMemory(intDescs);
}
MemoryPtr Node::prepareWeightMemory(DnnlMemoryDescPtr weightDesc) {
if (!getParentEdgeAt(1)->getParent()->isConstant())
IE_THROW() << "Weight input is not const for node " << getName() << ".";
auto edgeMem = getParentEdgeAt(1)->getMemoryPtr();
if (!edgeMem)
IE_THROW() << "Cannot get const weights edgeMem for node " << getName() << ".";
auto constDnnlMemOutDesc = edgeMem->GetDescWithType<DnnlMemoryDesc>();
auto weightSrcDesc = constDnnlMemOutDesc->getDnnlDesc();
weightSrcDesc = weightSrcDesc.reshape(weightDesc->getDnnlDesc().get_dims());
auto create = [&] () {
auto newSrcDesc = DnnlExtensionUtils::makeDescriptor(weightSrcDesc);
Memory srcMemory{ getEngine() };
srcMemory.Create(newSrcDesc, edgeMem->GetData());
MemoryPtr _ptr = std::make_shared<Memory>(getEngine());
_ptr->Create(weightDesc);
node::Reorder::reorderData(srcMemory, *_ptr, context->getParamsCache());
return _ptr;
};
MemoryPtr ptr;
const auto& format = weightDesc->serializeFormat();
auto itr = privateWeightCache.find(format);
if (privateWeightCache.end() != itr) {
ptr = itr->second;
} else {
auto weightCache = context->getWeightsCache();
if (weightCache != nullptr) {
const std::string string_hash = getName() + "_" + format
+ "_" + std::to_string(edgeMem->GetSize())
+ "_" + std::to_string(reinterpret_cast<uint64_t>(edgeMem->GetData()));
ptr = *weightCache->findOrCreate(string_hash, create);
} else {
ptr = create();
}
privateWeightCache[format] = ptr;
}
return ptr;
}
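A note on the reshape above: the constant blob may be stored with dims that differ from what the primitive-side descriptor expects (for example grouped vs. ungrouped weights), so the plain source descriptor is reinterpreted to the target dims before the reorder runs. A minimal oneDNN sketch of just that step, with illustrative dims:

```cpp
#include <oneapi/dnnl/dnnl.hpp>

using namespace dnnl;

int main() {
    // Plain f32 weight blob as stored by the constant node: OIHW, 64x3x7x7.
    memory::desc plain({64, 3, 7, 7}, memory::data_type::f32, memory::format_tag::abcd);

    // The primitive expects grouped 5-D weight dims (g = 1 here), so the same
    // dense buffer is reinterpreted with those dims before being reordered
    // into the blocked target descriptor.
    memory::desc grouped = plain.reshape({1, 64, 3, 7, 7});
    (void)grouped;
    return 0;
}
```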
bool Node::isInPlace() {
if (inplace == InPlaceType::Unknown) {
auto selected_pd = getSelectedPrimitiveDescriptor();

View File

@@ -619,6 +619,8 @@ protected:
void prepareMemory(const std::vector<DnnlMemoryDescPtr>& intDescs);
void prepareMemory(dnnl::primitive_desc_iterator& itpd);
MemoryPtr prepareWeightMemory(DnnlMemoryDescPtr weightDesc);
bool isDynamic = false;
bool isInputTensorAtPortEmpty(size_t port) const;
@@ -687,6 +689,14 @@ private:
enum LOOK { LOOK_UP = 1, LOOK_DOWN = 2 };
ConstantType checkConstant(LOOK look, std::vector<NodePtr>& checkNodes);
// we cannot rely on the per-NUMA weightCache for caching weights because:
// 1. it may not exist (in a single-stream configuration)
// 2. it only holds weak references; the life-cycle of a cached item
// is still under the control of strong references outside of the cache.
// privateWeightCache holds strong references to constant weight
// copies of the same content with different layouts.
std::unordered_map<std::string, MemoryPtr> privateWeightCache;
#ifdef CPU_DEBUG_CAPS
friend class Verbose;
#endif
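The weak-reference caveat in the comment is the crux: a cache of weak_ptr entries keeps nothing alive on its own, which is why each node needs privateWeightCache as a strong-reference holder. A standalone toy (not the actual WeightsSharing implementation) showing the failure mode:

```cpp
#include <cassert>
#include <memory>
#include <string>
#include <unordered_map>

// Toy cache holding only weak references, like the per-NUMA weightCache.
static std::unordered_map<std::string, std::weak_ptr<int>> cache;

static std::shared_ptr<int> findOrCreate(const std::string& key) {
    if (auto hit = cache[key].lock())
        return hit;
    auto created = std::make_shared<int>(42);  // stands in for a reordered blob
    cache[key] = created;
    return created;
}

int main() {
    findOrCreate("w");              // strong reference dropped immediately...
    assert(cache["w"].expired());   // ...so the cached entry is already gone
    auto keep = findOrCreate("w");  // recreated: a redundant reorder
    assert(!cache["w"].expired());  // alive only while `keep` exists
    return 0;
}
```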

View File

@@ -57,6 +57,8 @@ struct ConvKey {
dnnl::primitive_attr attr;
impl_desc_type implType;
bool constWeight;
size_t hash() const;
bool operator==(const ConvKey& rhs) const;
};
@@ -80,6 +82,7 @@ size_t ConvKey::hash() const {
seed = hash_combine(seed, get_attr_hash(*attr.get()));
seed = hash_combine(seed, implType);
seed = hash_combine(seed, constWeight);
return seed;
}
@@ -103,7 +106,7 @@ bool ConvKey::operator==(const ConvKey &rhs) const {
retVal = retVal && paddingL == rhs.paddingL;
retVal = retVal && paddingR == rhs.paddingR;
retVal = retVal && *attr.get() == *rhs.attr.get() && implType == rhs.implType;
retVal = retVal && *attr.get() == *rhs.attr.get() && implType == rhs.implType && constWeight == rhs.constWeight;
return retVal;
}
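Folding constWeight into both hash() and operator== keeps the executor cache sound: two convolutions that differ only in the constness of the weight input must not share a cached executor. A rough sketch of the pattern, assuming a boost-style combine (the real ov::intel_cpu::hash_combine may differ in detail):

```cpp
#include <cassert>
#include <cstddef>
#include <functional>

// Boost-style combine; assumed to approximate the helper used by ConvKey.
template <typename T>
size_t hash_combine(size_t seed, const T& v) {
    return seed ^ (std::hash<T>{}(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2));
}

struct Key {
    int implType;
    bool constWeight;

    size_t hash() const {
        size_t seed = 0;
        seed = hash_combine(seed, implType);
        seed = hash_combine(seed, constWeight);  // omit this and const/non-const
        return seed;                             // keys would collide
    }
    bool operator==(const Key& rhs) const {
        return implType == rhs.implType && constWeight == rhs.constWeight;
    }
};

int main() {
    Key a{1, true}, b{1, false};
    assert(!(a == b));  // distinct keys -> separate cache entries
    return 0;
}
```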
@@ -851,6 +854,14 @@ createDescriptorInternal(const dnnl::engine& engine,
}
} // namespace
static memory::data_type deriveWeightDataType(memory::data_type src_dt) {
memory::data_type wdt = src_dt;
if (one_of(src_dt, memory::data_type::s8, memory::data_type::u8)) {
wdt = memory::data_type::s8;
}
return wdt;
}
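The factored-out helper encodes oneDNN's int8 convention: quantized convolutions take s8 weights for both u8 and s8 activations, while other source types keep the weight type unchanged. A self-contained check of that behaviour (the helper body is mirrored here for illustration):

```cpp
#include <cassert>
#include <oneapi/dnnl/dnnl.hpp>

// Mirror of the helper above.
static dnnl::memory::data_type deriveWeightDataType(dnnl::memory::data_type src_dt) {
    using dt = dnnl::memory::data_type;
    return (src_dt == dt::s8 || src_dt == dt::u8) ? dt::s8 : src_dt;
}

int main() {
    using dt = dnnl::memory::data_type;
    assert(deriveWeightDataType(dt::u8) == dt::s8);    // int8 conv: s8 weights
    assert(deriveWeightDataType(dt::s8) == dt::s8);
    assert(deriveWeightDataType(dt::f32) == dt::f32);  // float types unchanged
    return 0;
}
```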
void Convolution::createDescriptor(const std::vector<MemoryDescPtr>& inputDesc,
const std::vector<MemoryDescPtr>& outputDesc) {
MemoryDescPtr inpDesc;
@@ -874,12 +885,7 @@ void Convolution::createDescriptor(const std::vector<MemoryDescPtr>& inputDesc,
const auto& inDnnlDesc = definedInpMemDesc->getDnnlDesc();
const auto& outDnnlDesc = definedOutMemDesc->getDnnlDesc();
memory::data_type dt = inDnnlDesc.get_data_type();
memory::data_type wdt = dt;
if (one_of(dt, memory::data_type::s8, memory::data_type::u8)) {
wdt = memory::data_type::s8;
}
memory::data_type wdt = deriveWeightDataType(inDnnlDesc.get_data_type());
dnnl::memory::desc weightDnnlDesc(DnnlExtensionUtils::convertToDnnlDims(weightDims), wdt, memory::format_tag::any);
dnnl::memory::desc biasDnnlDesc;
@@ -1143,6 +1149,11 @@ bool Convolution::isPossibleToSkipInitConfig(const dnnl::primitive_desc &desc) c
}
std::shared_ptr<MemoryDesc> Convolution::getSrcMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) {
if (idx == 1) {
// report original plain layout for weight since it needs to be reordered dynamically at runtime
return std::make_shared<CpuBlockedMemoryDesc>(getOriginalInputPrecisionAtPort(idx),
Shape(getInputShapeAtPort(idx).getStaticDims()));
}
auto desc = idx > 0 ? primitive_desc_it.weights_desc(idx - 1) : primitive_desc_it.src_desc(idx);
if (getInputShapeAtPort(idx).isDynamic()) {
return DnnlExtensionUtils::makeUndefinedDesc(desc, getInputShapeAtPort(idx));
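Reporting a plain layout here pairs with the format_tag::any weight descriptor built in prepareParams() below: the primitive is left free to pick its preferred blocked weight layout, which is then queried and used as the reorder target. A minimal standalone sketch against the oneDNN 3.x API, with illustrative shapes:

```cpp
#include <oneapi/dnnl/dnnl.hpp>

using namespace dnnl;

int main() {
    engine eng(engine::kind::cpu, 0);

    memory::desc src({1, 64, 56, 56}, memory::data_type::f32, memory::format_tag::nchw);
    memory::desc dst({1, 64, 56, 56}, memory::data_type::f32, memory::format_tag::nchw);
    // Leave the weight layout unconstrained: the implementation decides.
    memory::desc wei_any({64, 64, 3, 3}, memory::data_type::f32, memory::format_tag::any);

    auto pd = convolution_forward::primitive_desc(
            eng, prop_kind::forward_inference, algorithm::convolution_direct,
            src, wei_any, dst,
            /*strides=*/{1, 1}, /*padding_l=*/{1, 1}, /*padding_r=*/{1, 1});

    // The layout the implementation actually wants; the constant weights are
    // reordered into it once and the result is cached for reuse.
    memory::desc chosen = pd.weights_desc();
    (void)chosen;
    return 0;
}
```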
@@ -1352,10 +1363,17 @@ void Convolution::prepareParams() {
paddingL,
paddingR,
*pAttrLocal,
selected_pd->getImplementationType()};
selected_pd->getImplementationType(),
getParentEdgeAt(1)->getParent()->isConstant()};
auto engine = getEngine();
auto builder = [&engine](const ConvKey& key) -> executorPtr {
// remove the requirement on the weight memory layout so the primitive
// reports its preferred weight layout, to be reordered dynamically at runtime
auto wghDescAny =
dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(key.inp1->getShape().getStaticDims()),
deriveWeightDataType(key.inp0->getDataType()),
memory::format_tag::any);
auto createDnnlConvDesc = [](const dnnl::engine engine,
const dnnl::memory::desc& srcDesc,
const dnnl::memory::desc& wghDesc,
@@ -1390,7 +1408,7 @@ void Convolution::prepareParams() {
const auto alg = (key.implType & impl_desc_type::winograd) ? dnnl::algorithm::convolution_winograd : dnnl::algorithm::convolution_direct;
dnnl::primitive_desc desc = createDnnlConvDesc(engine,
key.inp0->getDnnlDesc(),
key.inp1->getDnnlDesc(),
wghDescAny,
key.out->getDnnlDesc(),
key.bias,
key.stride,
@@ -1401,7 +1419,6 @@ void Convolution::prepareParams() {
key.attr);
auto itpd = desc;
executorPtr execPtr = nullptr;
while (static_cast<bool>(itpd)) {
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
@@ -1412,7 +1429,8 @@ void Convolution::prepareParams() {
key.inp0->getDnnlDesc(),
key.inp1->getDnnlDesc(),
key.out->getDnnlDesc(),
engine);
engine,
key.constWeight);
break;
}
@@ -1425,16 +1443,13 @@ void Convolution::prepareParams() {
auto inDesc = dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(key.inp0->getShape().getStaticDims()),
key.inp0->getDataType(),
memory::format_tag::any);
auto wghDesc = dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(key.inp1->getShape().getStaticDims()),
key.inp1->getDataType(),
memory::format_tag::any);
auto outDesc = dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(key.out->getShape().getStaticDims()),
key.out->getDataType(),
memory::format_tag::any);
auto reorderConvDesc = createDnnlConvDesc(engine,
inDesc,
wghDesc,
wghDescAny,
outDesc,
key.bias,
key.stride,
@@ -1450,13 +1465,15 @@ void Convolution::prepareParams() {
key.inp0->getDnnlDesc(),
key.inp1->getDnnlDesc(),
key.out->getDnnlDesc(),
engine);
engine,
key.constWeight);
}
}
return execPtr;
};
auto prevExecPtr = execPtr;
execPtr = nullptr;
auto cache = context->getParamsCache();
auto result = cache->getOrCreate(key, builder);
@@ -1465,9 +1482,22 @@ void Convolution::prepareParams() {
if (execPtr) {
primArgs[DNNL_ARG_SRC] = srcMemPtr->GetPrimitive();
primArgs[DNNL_ARG_WEIGHTS] = wghMemPtr->GetPrimitive();
primArgs[DNNL_ARG_DST] = dstMemPtr->GetPrimitive();
if (key.constWeight) {
// const weight preparation/reordering needs to be done once, at the next execution,
// when the input weight data is guaranteed to be ready (considering possible const-folding
// subgraphs inserted between the constant weight node and the conv)
auto it = primArgs.find(DNNL_ARG_WEIGHTS);
if (it == primArgs.end() || !prevExecPtr ||
!execPtr->getWeightDesc()->isCompatible(*(prevExecPtr->getWeightDesc()))) {
pendingConstWeightReorder = true;
}
} else {
// non-const weight will be reordered by executor on every exec
primArgs[DNNL_ARG_WEIGHTS] = wghMemPtr->GetPrimitive();
}
if (withBiases) {
primArgs[DNNL_ARG_BIAS] = biasMemPtr->GetPrimitive();
}
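This gating is the heart of the fix: in a dynamic model prepareParams() runs for every new shape, but the freshly selected executor often wants the same blocked weight layout as before, in which case the already reordered constant weights can simply be kept. A toy model of the control flow (the names and the string-based layout stand-in are invented for illustration):

```cpp
#include <iostream>
#include <string>

struct Executor {
    std::string weightFormat;  // stands in for the dnnl weight memory desc
};

struct ConvNode {
    const Executor* prev = nullptr;
    bool pendingConstWeightReorder = false;

    void prepareParams(const Executor& cur) {
        // Reorder only when there was no executor yet, or the newly selected
        // one wants an incompatible weight layout.
        if (!prev || prev->weightFormat != cur.weightFormat)
            pendingConstWeightReorder = true;
        prev = &cur;
    }
    void execute() {
        if (pendingConstWeightReorder) {
            std::cout << "reordering const weights once\n";
            pendingConstWeightReorder = false;
        }
    }
};

int main() {
    Executor e1{"aBcd16b"}, e2{"aBcd16b"};   // two shapes, same chosen layout
    ConvNode conv;
    conv.prepareParams(e1); conv.execute();  // reorders once
    conv.prepareParams(e2); conv.execute();  // no redundant reorder
    return 0;
}
```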
@@ -1497,12 +1527,14 @@ Convolution::ConvolutionExecutor::ConvolutionExecutor(const dnnl::convolution_fo
const dnnl::memory::desc& inMemDesc,
const dnnl::memory::desc& weightMemDesc,
const dnnl::memory::desc& outMemDesc,
const dnnl::engine& engine) : DnnlExecutor(pd) {
const dnnl::engine& engine,
bool constWeight) : DnnlExecutor(pd) {
if (inMemDesc != getDnnlSrcDesc()) {
inputReorders.insert({DNNL_ARG_SRC, IntermReorder(inMemDesc, getDnnlSrcDesc(), engine)});
}
if (weightMemDesc != getDnnlWeightDesc()) {
if (!constWeight && weightMemDesc != getDnnlWeightDesc()) {
// const weight will be reordered at the first execution
inputReorders.insert({DNNL_ARG_WEIGHTS, IntermReorder(weightMemDesc, getDnnlWeightDesc(), engine)});
}
@@ -1516,6 +1548,11 @@ void Convolution::execute(dnnl::stream strm) {
IE_THROW() << "Can't execute Convolution node with name: " << getName() << ", because executor is not compiled";
}
if (pendingConstWeightReorder) {
primArgs[DNNL_ARG_WEIGHTS] = prepareWeightMemory(execPtr->getWeightDesc())->GetPrimitive();
pendingConstWeightReorder = false;
}
execPtr->exec(primArgs, strm);
}
@@ -1630,13 +1667,8 @@ void Convolution::appendZeroPointsArgs() {
}
}
// brgconv will be enabled by default:
// 1, static shape (a dynamic shape may change the weights layout when the input shape changes, causing performance issue 86948)
// 2, hw supports avx512+
// brgconv will be enabled by default when HW supports avx512+
void Convolution::initTryBrgconvFlag() {
if (isDynamicNode())
return;
if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) {
shouldTryBrgconv = true;
}

View File

@@ -94,8 +94,10 @@ private:
const dnnl::memory::desc& inMemDesc,
const dnnl::memory::desc& weightMemDesc,
const dnnl::memory::desc& outMemDesc,
const dnnl::engine& engine);
const dnnl::engine& engine,
bool constWeight);
};
bool pendingConstWeightReorder = false;
void prepareParams() override;
void execute(dnnl::stream strm) override;

View File

@@ -913,51 +913,6 @@ bool FullyConnected::canBeExecutedInConv1x1() const {
return retVal;
}
MemoryPtr FullyConnected::prepareWeightMemory(DnnlMemoryDescPtr weightDesc) {
if (!getParentEdgeAt(1)->getParent()->isConstant())
IE_THROW() << "Weight input is not const for node " << getName() << ".";
auto blob = getParentEdgeAt(1)->getMemoryPtr();
if (!blob)
IE_THROW() << "Cannot get const weights blob for node " << getName() << ".";
auto constDnnlMemOutDesc = blob->GetDescWithType<DnnlMemoryDesc>();
auto weightSrcDesc = constDnnlMemOutDesc->getDnnlDesc();
weightSrcDesc = weightSrcDesc.reshape(weightDesc->getDnnlDesc().get_dims());
auto create = [&] () {
auto newSrcDesc = DnnlExtensionUtils::makeDescriptor(weightSrcDesc);
Memory srcMemory{ getEngine() };
srcMemory.Create(newSrcDesc, blob->GetData());
MemoryPtr _ptr = std::make_shared<Memory>(getEngine());
_ptr->Create(weightDesc);
node::Reorder::reorderData(srcMemory, *_ptr, context->getParamsCache());
return _ptr;
};
MemoryPtr ptr;
const auto& format = weightDesc->serializeFormat();
auto itr = privateWeightCache.find(format);
if (privateWeightCache.end() != itr) {
ptr = itr->second;
} else {
auto weightCache = context->getWeightsCache();
if (weightCache != nullptr) {
const std::string string_hash = getName() + "_" + format
+ "_" + std::to_string(blob->GetSize())
+ "_" + std::to_string(reinterpret_cast<uint64_t>(blob->GetData()));
ptr = *weightCache->findOrCreate(string_hash, create);
} else {
ptr = create();
}
privateWeightCache[format] = ptr;
}
return ptr;
}
bool FullyConnected::useSparseWeightsDecompression() {
// minSparseRate == 1 means that sparse feature is switched off
if (minSparseRate == 1.f) {

View File

@@ -83,11 +83,6 @@ private:
bool useConv1x1 = false;
impl_desc_type implementationTypeIP;
MemoryDescPtr weightDescIP;
// when weightCache is not enabled (such as stream=1), brgconv weights may change due to
// different shapes, so they are cached in privateWeightCache.
// When weightCache is enabled, this map still holds a strong reference to the weight
// ptr, since weightCache itself holds only weak references.
std::unordered_map<std::string, MemoryPtr> privateWeightCache;
dnnl::primitive_attr attr;
static dnnl::convolution_forward::primitive_desc
@@ -99,7 +94,6 @@ private:
const dnnl::engine& engine);
bool canBeExecutedInConv1x1() const;
MemoryPtr prepareWeightMemory(const DnnlMemoryDescPtr weightDesc);
// sparse weights
bool useSparseWeights = false;

View File

@@ -21,6 +21,17 @@
namespace ov {
namespace intel_cpu {
namespace {
size_t replace_all(std::string & inout, std::string what, std::string with) {
std::size_t count{};
for (std::string::size_type pos{}; inout.npos != (pos = inout.find(what.data(), pos, what.length()));
pos += with.length(), ++count) {
inout.replace(pos, what.length(), with.data(), with.length());
}
return count;
}
}
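The helper is hoisted out of the Node printer below (where it used to be a local lambda) so the NodeDesc printer can use it too, collapsing the verbose dynamic-dimension rendering. An illustrative use (the descriptor string is made up):

```cpp
std::string s = "I32 [0 - ?, 0 - ?, 16]";
size_t n = replace_all(s, "0 - ?", "?");  // s == "I32 [?, ?, 16]", n == 2
```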
DebugLogEnabled::DebugLogEnabled(const char* file, const char* func, int line, const char* name) {
// check ENV
const char* p_filters = std::getenv("OV_CPU_DEBUG_LOG");
@@ -96,19 +107,27 @@ std::ostream & operator<<(std::ostream & os, const MemoryDesc& desc) {
}
std::ostream & operator<<(std::ostream & os, const NodeDesc& desc) {
os << " ImplementationType: " << impl_type_to_string(desc.getImplementationType()) << std::endl;
std::stringstream ss;
ss << " " << impl_type_to_string(desc.getImplementationType()) << "(";
const char * sep = "";
for (auto & conf : desc.getConfig().inConfs) {
os << " inConfs: " << *conf.getMemDesc();
if (conf.inPlace() >= 0) os << " inPlace:" << conf.inPlace();
if (conf.constant()) os << " constant";
os << std::endl;
ss << sep << *conf.getMemDesc();
if (conf.inPlace() >= 0) ss << " inPlace:" << conf.inPlace();
if (conf.constant()) ss << " constant";
sep = ",";
}
ss << ") -> (";
sep = "";
for (auto & conf : desc.getConfig().outConfs) {
os << " outConfs: " << *conf.getMemDesc();
if (conf.inPlace() >= 0) os << " inPlace:" << conf.inPlace();
if (conf.constant()) os << " constant";
os << std::endl;
ss << sep << *conf.getMemDesc();
if (conf.inPlace() >= 0) ss << " inPlace:" << conf.inPlace();
if (conf.constant()) ss << " constant";
sep = ",";
}
ss << ")" << std::endl;
auto str = ss.str();
replace_all(str, "0 - ?", "?");
os << str;
return os;
}
@@ -137,15 +156,7 @@ std::ostream & operator<<(std::ostream & os, const Node &c_node) {
}
return true;
};
auto replace_all = [](std::string& inout, std::string what, std::string with) {
std::size_t count{};
for (std::string::size_type pos{};
inout.npos != (pos = inout.find(what.data(), pos, what.length()));
pos += with.length(), ++count) {
inout.replace(pos, what.length(), with.data(), with.length());
}
return count;
};
auto nodeDesc = node.getSelectedPrimitiveDescriptor();
std::stringstream leftside;

View File

@@ -42,8 +42,6 @@ const std::vector<LayerTestsDefinitions::ElementwiseBranchSelectionTestValues> p
},
{ 256ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 2.55f }, { 0.f }, { 2.55f } },
{
{"Constant", "convolution1"},
{"Constant", "convolution2"},
{"fakeQuantizeBefore1", "convolution1"},
{"fakeQuantizeBefore2", "convolution2"},
{"maxPool", "result"}
@@ -75,8 +73,6 @@ const std::vector<LayerTestsDefinitions::ElementwiseBranchSelectionTestValues> p
},
{ 256ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 2.55f }, { 0.f }, { 2.55f } },
{
{"Constant", "convolution1"},
{"Constant", "convolution2"},
{"fakeQuantizeBefore1", "convolution1"},
{"fakeQuantizeBefore2", "convolution2"},
{"maxPool", "result"}

View File

@@ -16,7 +16,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvAdd, ConvEltwise,
::testing::Values(convInputShape),
::testing::Values(convInputShape),
::testing::Values(std::shared_ptr<ov::Node> (std::make_shared<ov::op::v1::Add>())), // non-tokenizable
::testing::Values(6), // num nodes = 6: Convert + Convolution + 4 Reorders on Convs in&outs
::testing::Values(5), // num nodes = 5: Convert + Convolution + 3 Reorders on Convs in&outs
::testing::Values(0), // num subgraphs = 0: no Subgraph since all eltwise ops are fused into Convolution
::testing::Values(CommonTestUtils::DEVICE_CPU)),
ConvEltwise::getTestCaseName);
@@ -26,7 +26,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvMul, ConvEltwise,
::testing::Values(convInputShape),
::testing::Values(convInputShape),
::testing::Values(std::shared_ptr<ov::Node> (std::make_shared<ov::op::v1::Multiply>())), // fully-tokenizable
::testing::Values(7), //num nodes = 7: Convert + Convolution + Subgraph + Reorders
::testing::Values(6), //num nodes = 6: Convert + Convolution + Subgraph + Reorders
::testing::Values(1), // num subgraphs = 1: Mul (2 inputs) can't be fused into Conv => Subgraph is created
::testing::Values(CommonTestUtils::DEVICE_CPU)),
ConvEltwise::getTestCaseName);

View File

@@ -121,8 +121,8 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Combine(
::testing::ValuesIn(testValuesLegacyFuse),
::testing::ValuesIn(operations),
// reorder (nChw[16|8]c) + MaxPool + reorder(nhwc) + reorder(ABcd16b16a) + Convolution + reorder(nchw)
::testing::Values(std::pair<size_t, size_t>{6, 0}),
// reorder (nChw[16|8]c) + MaxPool + reorder(nhwc) + Convolution(with internal weight reordering) + reorder(nchw)
::testing::Values(std::pair<size_t, size_t>{5, 0}),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
FakeQuantizeDecompositionTest::getTestCaseName);

View File

@@ -71,9 +71,9 @@ namespace {
TEST_P(ConcatConstantInPlaceTest, smoke_ConcatConstantInPlaceTest_CPU) {
Run();
if (this->GetParam() == Precision::BF16)
CheckNumberOfNodesWithType(executableNetwork, "Reorder", 4);
else
CheckNumberOfNodesWithType(executableNetwork, "Reorder", 3);
else
CheckNumberOfNodesWithType(executableNetwork, "Reorder", 2);
}
INSTANTIATE_TEST_SUITE_P(smoke_ConcatConstantInPlaceTest_CPU, ConcatConstantInPlaceTest,