[CPU] Convolution plus sum fusing in the case of dynamic shapes (#10235)

This commit is contained in:
Maksim Kutakov
2022-02-15 13:12:07 +03:00
committed by GitHub
parent ccc38d22a8
commit 788a5bb9f2
14 changed files with 610 additions and 61 deletions

View File

@@ -80,6 +80,38 @@ void MKLDNNGraph::CreateGraph(NET &net, const MKLDNNExtensionManager::Ptr& extMg
CPU_DEBUG_CAP_ENABLE(serialize(*this));
}
void MKLDNNGraph::CreateGraph(const std::vector<MKLDNNNodePtr> &graphNodes,
const std::vector<MKLDNNEdgePtr> &graphEdges,
MKLDNNWeightsSharing::Ptr &w_cache,
std::string name) {
if (IsReady())
ForgetGraphData();
// disable weights caching if graph was created only once
weightsCache = config.streamExecutorConfig._streams != 1 ? w_cache : nullptr;
rtParamsCache = std::make_shared<MultiCache>(config.rtCacheCapacity);
this->_name = std::move(name);
this->reuse_io_tensors = false;
this->graphNodes = graphNodes;
this->graphEdges = graphEdges;
for (auto node : graphNodes) {
if ("Parameter" == node->getTypeStr()) {
inputNodesMap[node->getName()] = node;
} else if ("Result" == node->getTypeStr()) {
outputNodesMap[node->getName()] = node;
}
}
InitGraph();
status = Ready;
CPU_DEBUG_CAP_ENABLE(serialize(*this));
}
template void MKLDNNGraph::CreateGraph(const std::shared_ptr<const ngraph::Function>&,
const MKLDNNExtensionManager::Ptr&, MKLDNNWeightsSharing::Ptr&);
template void MKLDNNGraph::CreateGraph(const CNNNetwork&,
@@ -1073,6 +1105,7 @@ Config MKLDNNGraph::getProperty() const {
void MKLDNNGraph::RemoveEdge(MKLDNNEdgePtr& edge) {
for (auto it = graphEdges.begin(); it != graphEdges.end(); it++) {
if ((*it) == edge) {
edge->drop();
graphEdges.erase(it);
return;
}

View File

@@ -50,6 +50,11 @@ public:
const MKLDNNExtensionManager::Ptr& extMgr,
MKLDNNWeightsSharing::Ptr &w_cache);
void CreateGraph(const std::vector<MKLDNNNodePtr> &graphNodes,
const std::vector<MKLDNNEdgePtr> &graphEdges,
MKLDNNWeightsSharing::Ptr &w_cache,
std::string name);
bool hasMeanImageFor(const std::string& name) {
return _normalizePreprocMap.find(name) != _normalizePreprocMap.end();
}

View File

@@ -239,14 +239,12 @@ void MKLDNNGraphOptimizer::FuseConvolutionMatMulAndBias(MKLDNNGraph &graph) {
int inNum = 0;
if (remEdge) {
inNum = remEdge->getInputNum();
remEdge->drop();
graph.RemoveEdge(remEdge);
}
remEdge = childs[j].lock();
int outNum = 0;
if (remEdge) {
outNum = remEdge->getOutputNum();
remEdge->drop();
graph.RemoveEdge(remEdge);
}
MKLDNNEdgePtr newEdge(new MKLDNNEdge(parent, child, inNum, outNum));
@@ -259,7 +257,6 @@ void MKLDNNGraphOptimizer::FuseConvolutionMatMulAndBias(MKLDNNGraph &graph) {
int inNum = 0;
if (remEdge) {
inNum = remEdge->getInputNum();
remEdge->drop();
graph.RemoveEdge(remEdge);
}
@@ -1074,8 +1071,7 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG
};
for (auto &graphNode : graphNodes) {
// TODO [DS]: at this moment this transformation prohibit for dynamic case
if (graphNode->getType() != Eltwise || graphNode->getAlgorithm() != EltwiseAdd || graphNode->isDynamicNode() ||
if (graphNode->getType() != Eltwise || graphNode->getAlgorithm() != EltwiseAdd ||
std::dynamic_pointer_cast<MKLDNNEltwiseNode>(graphNode)->isWithBroadcast())
continue;
@@ -1227,9 +1223,9 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG
if (mergedConv->fusedWith.size() > 0 &&
(mergedConv->fusedWith[0]->getType() == Convolution || mergedConv->fusedWith[0]->getType() == BinaryConvolution)) {
// Merged with DW_conv. Shape may change
mergedConv->inputShapes.push_back(mergedConv->fusedWith[0]->outputShapes[0]);
mergedConv->inputShapes.push_back(mergedConv->fusedWith[0]->getOutputShapeAtPort(0));
} else {
mergedConv->inputShapes.push_back(mergedConv->outputShapes[0]);
mergedConv->inputShapes.push_back(sum->getInputShapeAtPort(1));
}
size_t childIdx = 0lu;
@@ -1536,14 +1532,12 @@ void MKLDNNGraphOptimizer::FuseEltwiseAndSimple(MKLDNNGraph &graph) {
int inNum = 0;
if (remEdge) {
inNum = remEdge->getInputNum();
remEdge->drop();
graph.RemoveEdge(remEdge);
}
remEdge = children[j].lock();
int outNum = 0;
if (remEdge) {
outNum = remEdge->getOutputNum();
remEdge->drop();
graph.RemoveEdge(remEdge);
}
MKLDNNEdgePtr newEdge(new MKLDNNEdge(parent, child, inNum, outNum));
@@ -1563,7 +1557,6 @@ void MKLDNNGraphOptimizer::FuseEltwiseAndSimple(MKLDNNGraph &graph) {
if (childNode->getAlgorithm() == EltwiseMulAdd) {
outNum = initialParentInNum + remEdge->getOutputNum() - 1;
}
remEdge->drop();
graph.RemoveEdge(remEdge);
}
@@ -2068,7 +2061,6 @@ void MKLDNNGraphOptimizer::reshapeRnnSeq(MKLDNNGraph &graph) {
graphEdges.push_back(newEdge);
graphNodes.push_back(cpuConstant);
edge->drop();
graph.RemoveEdge(edge);
}
}

View File

@@ -1546,3 +1546,7 @@ bool MKLDNNNode::canFuseSimpleOperation(const MKLDNNNodePtr& node) const {
}
return false;
}
void MKLDNNNode::addFusedNode(const MKLDNNNodePtr &fusingNode) {
fusedWith.push_back(fusingNode);
}

View File

@@ -186,9 +186,7 @@ public:
bool isFusedWith(Type type) const;
void addFusedNode(const MKLDNNNodePtr &fusingNode) {
fusedWith.push_back(fusingNode);
}
virtual void addFusedNode(const MKLDNNNodePtr &fusingNode);
virtual void fuseInto(MKLDNNNodePtr& parentNode) {
// The graph supports fusing only of consecutive nodes and some graph logic requires to know through which input port a node was fused into parent one.
@@ -332,7 +330,7 @@ public:
virtual void execute(mkldnn::stream strm);
void executeDynamic(mkldnn::stream strm);
void redefineOutputMemory(const std::vector<VectorDims> &newShapes);
virtual void redefineOutputMemory(const std::vector<VectorDims> &newShapes);
virtual void initSupportedPrimitiveDescriptors();

View File

@@ -9,6 +9,7 @@
#include "mkldnn_fake_quantize_node.h"
#include "mkldnn_pooling_node.h"
#include "mkldnn_concat_node.h"
#include "mkldnn_graph.h"
#include "cpu/x64/cpu_isa_traits.hpp"
#include <string>
#include <vector>
@@ -95,6 +96,101 @@ bool ConvKey::operator==(const ConvKey &rhs) const {
} // namespace
class MKLDNNConvolutionNode::FusedSubgraph {
public:
FusedSubgraph(const std::vector<MKLDNNNodePtr> &opList, const MKLDNNConvolutionNode &conv, MKLDNNWeightsSharing::Ptr weightCache) {
_graph = std::unique_ptr<MKLDNNGraph>(new MKLDNNGraph());
std::unordered_set<MKLDNNNodePtr> nodesSet;
std::vector<MKLDNNEdgePtr> edges;
auto addEdge = [&](const MKLDNNNodePtr& parent, const MKLDNNNodePtr& child, size_t parentPort, size_t childPort) -> void {
auto edge = std::make_shared<MKLDNNEdge>(parent, child, parentPort, childPort);
child->addEdge(edge);
edges.push_back(edge);
nodesSet.insert(parent);
nodesSet.insert(child);
};
//Make inputs
const auto &inpMemDesc1 = conv.getBaseMemDescAtOutputPort(0);
auto inp0 = std::make_shared<MKLDNNInputNode>(inpMemDesc1, "inp0", "Parameter", conv.getEngine(), weightCache);
inputs.push_back(inp0);
const size_t sumPortNum = conv.getParentEdges().size() - 1;
const auto &inpMemDesc2 = conv.getBaseMemDescAtInputPort(sumPortNum);
auto inp1 = std::make_shared<MKLDNNInputNode>(inpMemDesc2, "inp1", "Parameter", conv.getEngine(), weightCache);
inputs.push_back(inp1);
auto itr = std::find_if(opList.begin(), opList.end(), [](const MKLDNNNodePtr &node) {
if (auto eltwise = std::dynamic_pointer_cast<MKLDNNEltwiseNode>(node)) {
return eltwise->isSpecialConvolutionAddFusing();
}
return false;
});
auto sumNode = *itr;
addEdge(inp0, sumNode, 0, 0);
addEdge(inp1, sumNode, 0, 1);
//Replicate the rest of the subgraph
auto parentItr = itr;
while (++itr != opList.end()) {
auto parentNode = *parentItr;
auto currentNode = *itr;
if (FakeQuantize == currentNode->getType()) {
parentNode->addFusedNode(currentNode);
} else {
addEdge(parentNode, currentNode, 0, 0);
auto constantsItr = conv.fusedConstNodes.find(currentNode);
if (constantsItr != conv.fusedConstNodes.end()) {
size_t inpPort = 1lu;
for (const auto& item : constantsItr->second) {
addEdge(item, currentNode, 0, inpPort++);
}
}
parentItr = itr;
}
}
//Make output
const auto &outMemDesc = conv.getBaseMemDescAtOutputPort(0);
auto out = std::make_shared<MKLDNNInputNode>(outMemDesc, "out", "Result", conv.getEngine(), weightCache);
addEdge(*parentItr, out, 0, 0);
outputs.push_back(out);
std::vector<MKLDNNNodePtr> nodes(nodesSet.begin(), nodesSet.end());
_graph->CreateGraph(nodes, edges, weightCache, "fused_subgraph");
}
std::shared_ptr<MKLDNNInputNode> getInput(size_t idx) const {
if (idx < inputs.size()) {
return inputs[idx];
} else {
IE_THROW(OutOfBounds) << "Unexpected input index in MKLDNNConvolutionNode::fusedSubgraph::getInput idx=" << idx
<< " inputs.size()=" << inputs.size();
}
}
std::shared_ptr<MKLDNNInputNode> getOutput(size_t idx) const {
if (idx < outputs.size()) {
return outputs[idx];
} else {
IE_THROW(OutOfBounds) << "Unexpected output index in MKLDNNConvolutionNode::fusedSubgraph::getInput idx=" << idx
<< " inputs.size()=" << outputs.size();
}
}
void infer() {
_graph->ResetInferCount();
_graph->Infer();
}
private:
std::unique_ptr<MKLDNNGraph> _graph;
std::vector<std::shared_ptr<MKLDNNInputNode>> inputs;
std::vector<std::shared_ptr<MKLDNNInputNode>> outputs;
};
bool MKLDNNConvolutionNode::isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
if (!ngraph::is_type<ngraph::op::v1::Convolution>(op) && !ngraph::is_type<ngraph::op::v1::GroupConvolution>(op)) {
@@ -220,7 +316,6 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
(withBiases ? (getParentEdgeAt(2)->getParent()->isConstant() && getParentEdgeAt(2)->getParent()->getType() == Input) : true);
}
withSum = false;
int expectedInputEdgesNum = static_cast<int>(getOriginalInputsNumber());
for (int i = 0; i < fusedWith.size(); i++) {
if (fusedWith[i]->getType() == Convolution) {
@@ -230,7 +325,6 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
if (fusedWith[i]->getAlgorithm() == EltwiseAdd) {
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
if (eltwiseNode && eltwiseNode->isSpecialConvolutionAddFusing()) {
withSum = true;
expectedInputEdgesNum++;
}
}
@@ -418,6 +512,9 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, const Vecto
if (auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get())) {
if (eltwiseNode->isSpecialConvolutionAddFusing()) {
if (withSumBroadcast) {
break;
}
ops.append_sum(1.0, MKLDNNExtensionUtils::IEPrecisionToDataType(eltwisePrecision));
} else {
if (useLegacyPostOps || eltwiseNode->getMKLDNNAlgorithm() != mkldnn::algorithm::undef) {
@@ -536,7 +633,7 @@ void MKLDNNConvolutionNode::initSupportedPrimitiveDescriptors() {
if (withSum) {
dataConfig.inPlace(-1);
dataConfig.setMemDesc(dataConfig.getMemDesc()->cloneWithNewPrecision(dataConfig.getMemDesc()->getPrecision()));
dataConfig.setMemDesc(getSumMemDesc(itpd)->cloneWithNewPrecision(dataConfig.getMemDesc()->getPrecision()));
config.inConfs.push_back(dataConfig);
}
}
@@ -993,7 +1090,7 @@ InferenceEngine::Blob::Ptr MKLDNNConvolutionNode::createInternalBlob(InferenceEn
void MKLDNNConvolutionNode::prepareParams() {
auto srcMemPtr = getParentEdgesAtPort(0)[0]->getMemoryPtr();
auto wghMemPtr = getParentEdgesAtPort(1)[0]->getMemoryPtr();
auto dstMemPtr = getChildEdgesAtPort(0)[0]->getMemoryPtr();
auto dstMemPtr = getOutputMemory();
if (!dstMemPtr || !dstMemPtr->isAllocated())
IE_THROW() << "Destination memory was not allocated.";
if (!srcMemPtr || !srcMemPtr->isAllocated())
@@ -1030,7 +1127,7 @@ void MKLDNNConvolutionNode::prepareParams() {
AttrPtr pAttrLocal;
if (isDynamicNode()) {
if (!pAttr) {
if (!pAttr || withSum) {
pAttr = initPrimitiveAttr();
}
pAttrLocal = pAttr;
@@ -1197,6 +1294,23 @@ void MKLDNNConvolutionNode::execute(mkldnn::stream strm) {
void MKLDNNConvolutionNode::executeDynamicImpl(mkldnn::stream strm) {
execute(strm);
if (withSumBroadcast) {
if (!subgraph) {
IE_THROW(Unexpected) << "Fused ops subgraph has not been created in " << getTypeStr() << " with name " << getName();
}
const size_t sumPortNum = getParentEdges().size() - 1;
const auto& sumInpMem = getParentEdgesAtPort(sumPortNum).front()->getMemory();
auto inp1 = subgraph->getInput(1);
inp1->getChildEdgesAtPort(0).front()->getMemoryPtr()->setDataHandle(sumInpMem.GetData());
subgraph->infer();
auto out = subgraph->getOutput(0);
const auto& outMem = out->getParentEdgesAtPort(0).front()->getMemory();
auto convOutMem = getChildEdgesAtPort(0).front()->getMemoryPtr();
convOutMem->redefineDesc(getBaseMemDescAtOutputPort(0)->cloneWithNewDims(outMem.getStaticDims()));
convOutMem->SetData(outMem);
}
}
void MKLDNNConvolutionNode::updatePadding() {
@@ -1207,6 +1321,69 @@ void MKLDNNConvolutionNode::updatePadding() {
}
}
void MKLDNNConvolutionNode::redefineOutputMemory(const std::vector<VectorDims> &newOutputShapes) {
if (withSum) {
const size_t sumPortNum = getParentEdges().size() - 1;
const auto& sumInpMem = getParentEdgesAtPort(sumPortNum).front()->getMemory();
if (newOutputShapes.front() != sumInpMem.getStaticDims()) {
withSumBroadcast = true;
if (!subgraph) {
subgraph = std::make_shared<FusedSubgraph>(fusedWith, *this, weightCache);
}
auto inp0 = subgraph->getInput(0);
inp0->redefineOutputMemory(newOutputShapes);
auto inp1 = subgraph->getInput(1);
inp1->redefineOutputMemory({sumInpMem.getStaticDims()});
// here we postpone output memory reallocation due to the fact that it is the same memory with the sum second input
return;
} else {
withSumBroadcast = false;
}
}
MKLDNNNode::redefineOutputMemory(newOutputShapes);
}
MemoryDescPtr MKLDNNConvolutionNode::getSumMemDesc(primitive_desc_iterator &primitive_desc_it) {
if (getOutputShapeAtPort(0).isDynamic()) {
return MKLDNNExtensionUtils::makeUndefinedDesc(primitive_desc_it.dst_desc(0), getInputShapeAtPort(getParentEdges().size() - 1));
}
return MKLDNNExtensionUtils::makeDescriptor(primitive_desc_it.dst_desc(0));
}
MKLDNNMemoryPtr MKLDNNConvolutionNode::getOutputMemory() const {
if (withSumBroadcast) {
if (!subgraph) {
IE_THROW(Unexpected) << "Fused ops subgraph has not been created in " << getTypeStr() << " with name " << getName();
}
auto inp0 = subgraph->getInput(0);
return inp0->getChildEdgesAtPort(0).front()->getMemoryPtr();
} else {
return getChildEdgesAtPort(0).front()->getMemoryPtr();
}
}
void MKLDNNPlugin::MKLDNNConvolutionNode::addFusedNode(const MKLDNNNodePtr &fusingNode) {
if (Eltwise == fusingNode->getType()) {
if (fusingNode->getAlgorithm() == EltwiseAdd) {
auto eltwiseNode = std::dynamic_pointer_cast<MKLDNNEltwiseNode>(fusingNode);
if (eltwiseNode && eltwiseNode->isSpecialConvolutionAddFusing()) {
withSum = true;
}
}
if (withSum && isDynamicNode()) {
for (size_t i = 0; i < fusingNode->getParentEdges().size(); ++i) {
auto edge = fusingNode->getParentEdgesAtPort(i).front();
auto parent = edge->getParent();
if ("Constant" == parent->getTypeStr()) {
fusedConstNodes[fusingNode].push_back(parent);
}
}
}
}
MKLDNNNode::addFusedNode(fusingNode);
}
void MKLDNNConvolutionNode::appendZeroPointsArgs() {
if (inputZeroPointsMemPtr != nullptr) {
primArgs[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC] = inputZeroPointsMemPtr->GetPrimitive();
@@ -1218,5 +1395,4 @@ void MKLDNNConvolutionNode::appendZeroPointsArgs() {
primArgs[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_DST] = outputCompensationMemPtr->GetPrimitive();
}
}
REG_MKLDNN_PRIM_FOR(MKLDNNConvolutionNode, Convolution);

View File

@@ -65,8 +65,12 @@ public:
protected:
InferenceEngine::Precision fusedEltwisePrecision(const MKLDNNNodePtr& fusingNode) const;
void redefineOutputMemory(const std::vector<VectorDims> &newOutputShapes) override;
void addFusedNode(const MKLDNNNodePtr &fusingNode) override;
private:
class FusedSubgraph;
using FusedSubgraphPtr = std::shared_ptr<FusedSubgraph>;
using executorPtr = std::shared_ptr<DnnlExecutor>;
executorPtr execPtr = nullptr;
@@ -91,6 +95,8 @@ private:
InferenceEngine::Blob::Ptr createInternalBlob(InferenceEngine::SizeVector dims, size_t edgeNum, bool isGrouped = false);
void updatePadding();
MemoryDescPtr getSumMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it);
MKLDNNMemoryPtr getOutputMemory() const;
void appendZeroPointsArgs();
@@ -99,6 +105,7 @@ private:
bool withDWConv;
bool isGrouped;
bool isPrimitivesPriorityDefined = false;
bool withSumBroadcast = false;
std::vector<size_t> stride;
std::vector<ptrdiff_t> dilation;
std::vector<ptrdiff_t> paddingL;
@@ -126,6 +133,8 @@ private:
bool isWino = false;
AttrPtr pAttr;
bool autoPadding = false;
FusedSubgraphPtr subgraph;
std::unordered_map<MKLDNNNodePtr, std::vector<MKLDNNNodePtr>> fusedConstNodes;
MKLDNNMemoryPtr inputZeroPointsMemPtr;
MKLDNNMemoryPtr weightsZeroPointsMemPtr;

View File

@@ -1590,14 +1590,13 @@ size_t MKLDNNEltwiseNode::getOpInputsNum() const {
}
}
// TODO [DS]: used only in FuseConvolutionSumAndConvolutionSumActivation
// fix when reimplement this transformation for dynamic shapes
bool MKLDNNEltwiseNode::isWithBroadcast() {
auto oDims = getOutputShapeAtPort(0).getStaticDims();
const auto& oDims = getOutputShapeAtPort(0).getDims();
for (size_t i = 0; i < inputShapes.size(); i++) {
auto iDims = getInputShapeAtPort(i).getStaticDims();
if (iDims != oDims)
const auto& iDims = getInputShapeAtPort(i).getDims();
if (!dimsEqualWeak(iDims, oDims)) {
return true;
}
}
return false;
@@ -2014,9 +2013,8 @@ bool MKLDNNEltwiseNode::canBeInPlace() const {
void MKLDNNEltwiseNode::fuseInto(MKLDNNNodePtr& parentNode) {
// Handling Convolution custom Add node fusing case which is processed via dnnl append_sum() API.
// TODO [DS]: at this moment this transformation prohibit for dynamic case
specialConvolutionAddFusing = (parentNode->getType() == Convolution || parentNode->getType() == BinaryConvolution) && getAlgorithm() == EltwiseAdd &&
getInputShapeAtPort(0) == getInputShapeAtPort(1);
dimsEqualWeak(getInputShapeAtPort(0).getDims(), getInputShapeAtPort(1).getDims());
if (!specialConvolutionAddFusing && canBePerformedAsScaleShift(parentNode.get())) {
std::tie(scales, shifts) = getScalesAndShifts(parentNode.get());
if ((parentNode->getType() == FullyConnected || parentNode->getType() == MatMul) && one_of(getAlgorithm(), EltwiseAdd, EltwiseSubtract,

View File

@@ -363,6 +363,12 @@ MKLDNNInputNode::MKLDNNInputNode(const Shape& shape, const InferenceEngine::Prec
}
}
MKLDNNInputNode::MKLDNNInputNode(MemoryDescPtr memDesc, const std::string &name, const std::string &type,
const mkldnn::engine &eng, MKLDNNWeightsSharing::Ptr &cache) :
MKLDNNInputNode(memDesc->getShape(), memDesc->getPrecision(), name, type, eng, cache) {
extMemDesc = memDesc;
}
void MKLDNNInputNode::withMeanImage() {
isMeanImage = true;
}
@@ -389,29 +395,11 @@ void MKLDNNInputNode::initSupportedPrimitiveDescriptors() {
if (!supportedPrimitiveDescriptors.empty())
return;
std::vector<PortConfigurator> inPortConfs;
std::vector<PortConfigurator> outPortConfs;
if (getType() == Input || getType() == MemoryInput) {
auto precision = getOriginalOutputPrecisionAtPort(0);
if (precision == Precision::U16 || isMeanImage) {
precision = Precision::FP32;
}
outPortConfs.push_back({LayoutType::ncsp, precision});
if (!getParentEdges().empty()) {
inPortConfs.push_back({LayoutType::ncsp, precision, true});
}
} else if (getType() == Output) {
auto precision = getOriginalInputPrecisionAtPort(0);
if (precision == Precision::U16) precision = Precision::FP32;
inPortConfs.push_back({LayoutType::ncsp, precision});
if (extMemDesc) {
initSupportedPdFromMemDesc();
} else {
initSupportedPdDefault();
}
addSupportedPrimDesc(inPortConfs,
outPortConfs,
impl_desc_type::unknown);
}
void MKLDNNInputNode::createPrimitive() {
@@ -437,5 +425,45 @@ bool MKLDNNInputNode::created() const {
return getType() == Input || getType() == Output;
}
void MKLDNNInputNode::initSupportedPdDefault() {
std::vector<PortConfigurator> inPortConfs;
std::vector<PortConfigurator> outPortConfs;
if (getType() == Input || getType() == MemoryInput) {
auto precision = getOriginalOutputPrecisionAtPort(0);
if (precision == Precision::U16 || isMeanImage) {
precision = Precision::FP32;
}
outPortConfs.push_back({LayoutType::ncsp, precision});
if (!getParentEdges().empty()) {
inPortConfs.push_back({LayoutType::ncsp, precision, true});
}
} else if (getType() == Output) {
auto precision = getOriginalInputPrecisionAtPort(0);
if (precision == Precision::U16) precision = Precision::FP32;
inPortConfs.push_back({LayoutType::ncsp, precision});
}
addSupportedPrimDesc(inPortConfs,
outPortConfs,
impl_desc_type::unknown);
}
void MKLDNNInputNode::initSupportedPdFromMemDesc() {
NodeConfig config;
PortConfig portConfig;
portConfig.inPlace(-1);
portConfig.constant(false);
portConfig.setMemDesc(extMemDesc);
if (getType() == Input || getType() == MemoryInput) {
config.outConfs.push_back(portConfig);
} else if (getType() == Output) {
config.inConfs.push_back(portConfig);
}
supportedPrimitiveDescriptors.emplace_back(std::move(config), impl_desc_type::unknown);
}
REG_MKLDNN_PRIM_FOR(MKLDNNInputNode, Input);
REG_MKLDNN_PRIM_FOR(MKLDNNInputNode, Output);

View File

@@ -16,6 +16,8 @@ public:
MKLDNNInputNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
MKLDNNInputNode(const Shape& shape, const InferenceEngine::Precision &prc, const std::string &name,
const std::string &type, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
MKLDNNInputNode(MemoryDescPtr memDesc, const std::string &name, const std::string &type, const mkldnn::engine& eng,
MKLDNNWeightsSharing::Ptr &cache);
void getSupportedDescriptors() override;
void initSupportedPrimitiveDescriptors() override;
@@ -35,10 +37,13 @@ public:
private:
void cloneBlobIfRequired();
void initSupportedPdDefault();
void initSupportedPdFromMemDesc();
private:
std::shared_ptr<ngraph::op::Constant> constOp;
MKLDNNMemoryCPtr memoryPtr;
MemoryDescPtr extMemDesc = nullptr;
bool isMeanImage = false;
};

View File

@@ -144,9 +144,6 @@ std::vector<std::string> disabledTestPatterns() {
*IS=_TS=\(\(4\.5\.6\.7\)\)_RS=\(\(1\.1\.6\.1\)\)_\(\(1\.5\.6\.1\)\)_\(\(1\.1\.1\.1\)\)_\(\(1\.1\.6\.1\)\).*)",
// Issue: 69222
R"(.*smoke_PriorBoxClustered.*PriorBoxClusteredLayerCPUTest.*_netPRC=f16_.*)",
// TODO : CVS-69533
R"(.*ConvolutionLayerCPUTest.*IS=\{.+\}.*_Fused=.*Add\(Parameters\).*)",
R"(.*GroupConvolutionLayerCPUTest.*IS=\{.+\}.*_Fused=.*Add\(Parameters\).*)",
// Issue: 74817
// Sporadic failings with NAN on Dynamic shape cases with jit implementation
R"(.*DefConvLayoutTest7.*)",

View File

@@ -116,10 +116,35 @@ protected:
ngraph::ParameterVector &params,
const std::shared_ptr<ngraph::Node> &lastNode) override {
auto retNode = CpuTestWithFusing::modifyGraph(ngPrc, params, lastNode);
for (size_t i = targetStaticShapes.front().size(); i < params.size(); ++i) {
const auto& shape = params[i]->get_output_partial_shape(0);
if (shape.is_static()) {
targetStaticShapes.front().push_back(shape.get_shape());
std::shared_ptr<ngraph::Node> opToShapeInfer = nullptr;
for (auto& targetShapes : targetStaticShapes) {
for (size_t i = targetShapes.size(); i < params.size(); ++i) {
const auto &shape = params[i]->get_output_partial_shape(0);
if (shape.is_static()) {
targetShapes.push_back(shape.get_shape());
} else {
// It is assumed that in such tests we have second parameter only if sum fusion is tested.
// Considering this fact, we need to set the appropriate static shape for the second term of the sum operation, and
// it has to match the convolution output shape. So the most suitable solution here is to perform shape inference on the
// convolution node
if (!opToShapeInfer) {
ngraph::OutputVector inputsForShapeInfer;
for (size_t j = 0; j < lastNode->get_input_size(); j++) {
if (ngraph::is_type<ngraph::opset1::Constant>(lastNode->get_input_node_ptr(j))) {
inputsForShapeInfer.push_back(lastNode->get_input_node_shared_ptr(j));
} else {
inputsForShapeInfer.push_back(std::make_shared<ngraph::opset1::Parameter>(lastNode->get_input_element_type(j),
lastNode->get_input_partial_shape(j)));
}
}
opToShapeInfer = lastNode->clone_with_new_inputs(inputsForShapeInfer);
}
std::vector<ov::Shape> secondParameterShapes;
opToShapeInfer->get_input_tensor(0).set_partial_shape(targetShapes.front());
opToShapeInfer->validate_and_infer_types();
targetShapes.push_back(opToShapeInfer->get_output_shape(0));
}
}
}
return retNode;

View File

@@ -117,10 +117,35 @@ protected:
ngraph::ParameterVector &params,
const std::shared_ptr<ngraph::Node> &lastNode) override {
auto retNode = CpuTestWithFusing::modifyGraph(ngPrc, params, lastNode);
for (size_t i = targetStaticShapes.front().size(); i < params.size(); ++i) {
const auto& shape = params[i]->get_output_partial_shape(0);
if (shape.is_static()) {
targetStaticShapes.front().push_back(shape.get_shape());
std::shared_ptr<ngraph::Node> opToShapeInfer = nullptr;
for (auto& targetShapes : targetStaticShapes) {
for (size_t i = targetShapes.size(); i < params.size(); ++i) {
const auto &shape = params[i]->get_output_partial_shape(0);
if (shape.is_static()) {
targetShapes.push_back(shape.get_shape());
} else {
// It is assumed that in such tests we have second parameter only if sum fusion is tested.
// Considering this fact, we need to set the appropriate static shape for the second term of the sum operation, and
// it has to match the convolution output shape. So the most suitable solution here is to perform shape inference on the
// convolution node
if (!opToShapeInfer) {
ngraph::OutputVector inputsForShapeInfer;
for (size_t j = 0; j < lastNode->get_input_size(); j++) {
if (ngraph::is_type<ngraph::opset1::Constant>(lastNode->get_input_node_ptr(j))) {
inputsForShapeInfer.push_back(lastNode->get_input_node_shared_ptr(j));
} else {
inputsForShapeInfer.push_back(std::make_shared<ngraph::opset1::Parameter>(lastNode->get_input_element_type(j),
lastNode->get_input_partial_shape(j)));
}
}
opToShapeInfer = lastNode->clone_with_new_inputs(inputsForShapeInfer);
}
std::vector<ov::Shape> secondParameterShapes;
opToShapeInfer->get_input_tensor(0).set_partial_shape(targetShapes.front());
opToShapeInfer->validate_and_infer_types();
targetShapes.push_back(opToShapeInfer->get_output_shape(0));
}
}
}
return retNode;

View File

@@ -0,0 +1,254 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "test_utils/cpu_test_utils.hpp"
#include "test_utils/fusing_test_utils.hpp"
#include "test_utils/convolution_params.hpp"
#include "shared_test_classes/base/ov_subgraph.hpp"
#include "ngraph_functions/utils/ngraph_helpers.hpp"
#include "ngraph_functions/builders.hpp"
using namespace CPUTestUtils;
using namespace InferenceEngine;
using namespace ov::test;
namespace SubgraphTestsDefinitions {
typedef std::tuple<
InputShape, //convShape
InputShape, //second term shape
bool, // bias flag
fusingSpecificParams,
std::map<std::string, std::string> // config
> convSumBroadcastParamSet;
class ConcatConvSumInPlaceTest : public testing::WithParamInterface<convSumBroadcastParamSet>,
virtual public SubgraphBaseTest, public CpuTestWithFusing {
public:
static std::string getTestCaseName(const testing::TestParamInfo<convSumBroadcastParamSet>& obj) {
InputShape convShape;
InputShape secondShape;
bool bias;
fusingSpecificParams fusingParams;
std::map<std::string, std::string> additionalConfig;
std::tie(convShape, secondShape, bias, fusingParams, additionalConfig) = obj.param;
std::ostringstream result;
result << "IS=";
result << CommonTestUtils::partialShape2str({convShape.first, secondShape.first}) << "_";
result << "TS=";
for (const auto& shape : {convShape, secondShape}) {
result << "(";
if (!shape.second.empty()) {
for (const auto& itr : shape.second) {
result << CommonTestUtils::vec2str(itr);
}
}
result << ")_";
}
result << "bias=" << (bias ? "True" : "False");
result << CpuTestWithFusing::getTestCaseName(fusingParams);
if (!additionalConfig.empty()) {
result << "_PluginConf";
for (auto& item : additionalConfig) {
result << "_" << item.first << "=" << item.second;
}
}
return result.str();
}
void SetUp() override {
InputShape convShape;
InputShape secondShape;
bool bias;
CPUSpecificParams cpuParams;
fusingSpecificParams fusingParams;
std::map<std::string, std::string> additionalConfig;
std::tie(convShape, secondShape, bias, fusingParams, additionalConfig) = this->GetParam();
std::tie(postOpMgrPtr, fusedOps) = fusingParams;
configuration.insert(additionalConfig.begin(), additionalConfig.end());
init_input_shapes({convShape, secondShape});
const InferenceEngine::SizeVector kernel = {3, 3};
const InferenceEngine::SizeVector stride = {1, 1};
const InferenceEngine::SizeVector dilation = {1, 1};
const std::vector<ptrdiff_t> padBegin = {0, 0};
const std::vector<ptrdiff_t> padEnd = {0, 0};
const size_t convOutChannels = 64;
auto netType = ngraph::element::f32;
auto inputParams = ngraph::builder::makeDynamicParams(netType, inputDynamicShapes);
auto conv = ngraph::builder::makeConvolution(inputParams[0], ngraph::element::f32, kernel, stride, padBegin,
padEnd, dilation, ngraph::op::PadType::EXPLICIT, convOutChannels);
if (bias) {
auto biasNode = ngraph::builder::makeConstant<float>(ngraph::element::Type_t::f32, ngraph::Shape({1, convOutChannels, 1, 1}), {}, true);
conv = std::make_shared<ngraph::opset3::Add>(conv, biasNode);
}
auto sum = std::make_shared<ngraph::opset3::Add>(conv, inputParams[1]);
fusedOps.insert(fusedOps.begin(), "Add"); // as we always fuse the sum first
auto runtimeType = netType;
if (configuration.count(PluginConfigParams::KEY_ENFORCE_BF16) &&
PluginConfigParams::YES == configuration[PluginConfigParams::KEY_ENFORCE_BF16].as<std::string>()) {
runtimeType = ngraph::element::Type_t::bf16;
}
selectedType = makeSelectedTypeStr(getPrimitiveType(), runtimeType);
function = makeNgraphFunction(netType, inputParams, sum, "ConvolutionSumBroadcast");
targetDevice = CommonTestUtils::DEVICE_CPU;
}
};
TEST_P(ConcatConvSumInPlaceTest, CompareWithRefs) {
SKIP_IF_CURRENT_TEST_IS_DISABLED()
run();
CheckPluginRelatedResults(executableNetwork, "Convolution");
}
namespace {
const auto fusingMulAddFQMullAdd = fusingSpecificParams{ std::make_shared<postNodesMgr>(std::vector<postNodeBuilder>{
{[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params) {
ngraph::Shape newShape = generatePerChannelShape(inpNode);
auto constNode = ngraph::builder::makeConstant(ngPrc, newShape, std::vector<float>{}, true);
return std::make_shared<ngraph::opset1::Multiply>(inpNode, constNode);
}, "Multiply(PerChannel)"},
{[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params) {
ngraph::Shape newShape = generatePerChannelShape(inpNode);
auto constNode = ngraph::builder::makeConstant(ngPrc, newShape, std::vector<float>{}, true);
return std::make_shared<ngraph::opset1::Add>(inpNode, constNode);
}, "Add(PerChannel)"},
{[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){
auto localPrc = inpNode->get_element_type();
ngraph::Shape newShape = generatePerChannelShape(inpNode);
return ngraph::builder::makeFakeQuantize(inpNode, localPrc, 256, newShape);
}, "FakeQuantize(PerChannel)"},
{[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params) {
ngraph::Shape newShape = generatePerChannelShape(inpNode);
auto constNode = ngraph::builder::makeConstant(ngPrc, newShape, std::vector<float>{}, true);
return std::make_shared<ngraph::opset1::Multiply>(inpNode, constNode);
}, "Multiply(PerChannel)"},
{[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params) {
ngraph::Shape newShape = generatePerChannelShape(inpNode);
auto constNode = ngraph::builder::makeConstant(ngPrc, newShape, std::vector<float>{}, true);
return std::make_shared<ngraph::opset1::Add>(inpNode, constNode);
}, "Add(PerChannel)"}}), {"Add"} };
// Post-ops chain appended after the node under test:
// per-channel Divide -> per-channel Subtract -> per-channel FakeQuantize.
// The fused-ops check list contains "FakeQuantize" only.
const auto fusingDivSubFQ = fusingSpecificParams{ std::make_shared<postNodesMgr>(std::vector<postNodeBuilder>{
        {[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){
            const auto perChannelShape = generatePerChannelShape(inpNode);
            // Empty data vector + 'true' flag: constant values are generated by the builder.
            const auto divisor = ngraph::builder::makeConstant(ngPrc, perChannelShape, std::vector<float>{}, true);
            return std::make_shared<ngraph::opset1::Divide>(inpNode, divisor);
        }, "Divide(PerChannel)"},
        {[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){
            const auto perChannelShape = generatePerChannelShape(inpNode);
            const auto subtrahend = ngraph::builder::makeConstant(ngPrc, perChannelShape, std::vector<float>{}, true);
            return std::make_shared<ngraph::opset1::Subtract>(inpNode, subtrahend);
        }, "Subtract(PerChannel)"},
        {[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){
            // FQ keeps the precision of its input node rather than the test precision.
            const auto fqPrc = inpNode->get_element_type();
            const auto fqShape = generatePerChannelShape(inpNode);
            return ngraph::builder::makeFakeQuantize(inpNode, fqPrc, 256, fqShape);
        }, "FakeQuantize(PerChannel)"}}), {"FakeQuantize"} };
// Post-ops chain: Sigmoid activation followed by two per-channel FakeQuantize nodes.
// All three nodes are expected in the fused-ops check list.
const auto fusingSigmoidFQFQ = fusingSpecificParams{ std::make_shared<postNodesMgr>(std::vector<postNodeBuilder>{
        {[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){
            return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::Sigmoid);
        }, "Sigmoid"},
        {[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){
            // FQ keeps the precision of its input node rather than the test precision.
            const auto fqPrc = inpNode->get_element_type();
            const auto fqShape = generatePerChannelShape(inpNode);
            return ngraph::builder::makeFakeQuantize(inpNode, fqPrc, 256, fqShape);
        }, "FakeQuantize(PerChannel)"},
        {[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){
            const auto fqPrc = inpNode->get_element_type();
            const auto fqShape = generatePerChannelShape(inpNode);
            return ngraph::builder::makeFakeQuantize(inpNode, fqPrc, 256, fqShape);
        }, "FakeQuantize(PerChannel)"}}), {"Sigmoid", "FakeQuantize", "FakeQuantize"} };
// Post-ops chain: Clamp to [3.0, 6.0] followed by a per-channel FakeQuantize.
// The fused-ops check list contains "FakeQuantize" only.
const auto fusingClampFQ = fusingSpecificParams{ std::make_shared<postNodesMgr>(std::vector<postNodeBuilder>{
        {[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){
            // Activation constants: {min = 3.0, max = 6.0}.
            return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::Clamp, {}, {3.0f, 6.0f});
        }, "Clamp"},
        {[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){
            // FQ keeps the precision of its input node rather than the test precision.
            const auto fqPrc = inpNode->get_element_type();
            const auto fqShape = generatePerChannelShape(inpNode);
            return ngraph::builder::makeFakeQuantize(inpNode, fqPrc, 256, fqShape);
        }, "FakeQuantize(PerChannel)"}}), {"FakeQuantize"} };
// Fusing patterns exercised with the FP32 (default) plugin configuration.
const std::vector<fusingSpecificParams> fusingParamsSet{
        emptyFusingSpec,
        fusingSigmoid,
        fusingFakeQuantizePerTensorRelu,
        fusingFakeQuantizePerChannelRelu,
        fusingFQPerChannelSigmoidFQPerChannel,
        fusingReluScaleShift,
        fusingMulAddFQMullAdd,
        fusingSigmoidFQFQ,
        // fusingClampFQ, // TODO: we need investigation, this particular pattern does not work even in static case
        fusingDivSubFQ
};
// Reduced fusing set used for the BF16 plugin configuration runs.
const std::vector<fusingSpecificParams> fusingParamsSetBF16{
        emptyFusingSpec,
        fusingSigmoid,
        fusingReluScaleShift
};
// Convolution input shape: channel count is static (32), batch and spatial
// dimensions are dynamic (-1). The target static shapes cover several spatial
// sizes to exercise shape changes between inferences.
// Declared const: only read (copied into ::testing::Values) by the
// instantiations below, so it should not be a mutable global.
const InputShape convInpShape = {
        //dynamic shapes
        {-1, 32, -1, -1},
        { //target static shapes
            {1, 32, 10, 10},
            {1, 32, 10, 10},
            {1, 32, 10, 10},
            {1, 32, 3, 3},
            {1, 32, 3, 10}
        }
};
// Second (sum) input shape: fully dynamic rank-4 shape. The target static
// shapes use 64 channels with varying unit/non-unit spatial dims — presumably
// to exercise broadcasting against the conv branch output (TODO: confirm
// against the test body, which is outside this file section).
// Declared const: only read (copied into ::testing::Values) by the
// instantiations below, so it should not be a mutable global.
const InputShape secondInp = {
        //dynamic shapes
        {-1, -1, -1, -1},
        { //target static shapes
            {1, 64, 1, 8},
            {1, 64, 1, 8},
            {1, 64, 8, 8},
            {1, 64, 8, 8},
            {1, 64, 8, 1}
        }
};
// FP32 run: full fusing set, empty plugin configuration.
INSTANTIATE_TEST_SUITE_P(smoke_Conv_Sum_Broadcast_FP32,
                         ConcatConvSumInPlaceTest,
                         ::testing::Combine(::testing::Values(convInpShape),
                                            ::testing::Values(secondInp),
                                            ::testing::Values(true, false),
                                            ::testing::ValuesIn(fusingParamsSet),
                                            ::testing::Values(cpuEmptyPluginConfig)),
                         ConcatConvSumInPlaceTest::getTestCaseName);
// BF16 run: reduced fusing set, BF16 plugin configuration.
INSTANTIATE_TEST_SUITE_P(smoke_Conv_Sum_Broadcast_BF16,
                         ConcatConvSumInPlaceTest,
                         ::testing::Combine(::testing::Values(convInpShape),
                                            ::testing::Values(secondInp),
                                            ::testing::Values(true, false),
                                            ::testing::ValuesIn(fusingParamsSetBF16),
                                            ::testing::Values(cpuBF16PluginConfig)),
                         ConcatConvSumInPlaceTest::getTestCaseName);
} // namespace
} // namespace SubgraphTestsDefinitions