From 3f6a026ae98c2f1b1ae61e88c7139db51e8328e4 Mon Sep 17 00:00:00 2001 From: Egor Duplensky Date: Tue, 14 Dec 2021 19:44:38 +0300 Subject: [PATCH] [CPU] Enable matmul deconv bin postops (#8009) --- .../mkldnn_plugin/mkldnn_graph_optimizer.cpp | 45 ++-- .../mkldnn_plugin/mkldnn_graph_optimizer.h | 2 +- .../src/mkldnn_plugin/mkldnn_node.cpp | 35 ++- .../src/mkldnn_plugin/mkldnn_node.h | 14 +- .../convert_matmul_to_fc.cpp | 6 +- .../convert_to_cpu_specific_opset.hpp | 2 - .../reshape_fully_connected.cpp | 114 ---------- .../reshape_fully_connected.hpp | 25 --- .../mkldnn_plugin/nodes/mkldnn_conv_node.cpp | 69 +++--- .../mkldnn_plugin/nodes/mkldnn_conv_node.h | 3 +- .../nodes/mkldnn_deconv_node.cpp | 23 +- .../nodes/mkldnn_eltwise_node.cpp | 209 +++++++++++------- .../mkldnn_plugin/nodes/mkldnn_eltwise_node.h | 16 +- .../nodes/mkldnn_fake_quantize_node.cpp | 148 +++++++------ .../nodes/mkldnn_fake_quantize_node.h | 18 +- .../nodes/mkldnn_fullyconnected_node.cpp | 56 ++--- .../nodes/mkldnn_fullyconnected_node.h | 9 +- .../nodes/mkldnn_matmul_node.cpp | 135 ++++++++--- .../mkldnn_plugin/nodes/mkldnn_matmul_node.h | 14 +- .../nodes/mkldnn_pooling_node.cpp | 2 +- .../mkldnn_plugin/nodes/mkldnn_pooling_node.h | 2 +- .../src/mkldnn_plugin/utils/cpu_utils.hpp | 13 +- .../mat_mul_with_constant_transformation.cpp | 4 +- .../single_layer_tests/mat_mul.cpp | 3 +- .../plugin/cpu/single_layer_tests/mat_mul.cpp | 174 ++++++++++++--- .../cpu/subgraph_tests/src/reshape_fc.cpp | 101 --------- .../cpu/test_utils/fusing_test_utils.hpp | 103 ++++----- .../src/single_layer/mat_mul.cpp | 2 +- .../convert_matmul_test.cpp | 78 +------ 29 files changed, 714 insertions(+), 711 deletions(-) delete mode 100644 inference-engine/src/mkldnn_plugin/ngraph_transformations/reshape_fully_connected.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/ngraph_transformations/reshape_fully_connected.hpp delete mode 100644 inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/reshape_fc.cpp diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp index 1261df4b559..b789d1c01ff 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp @@ -59,7 +59,7 @@ MKLDNNGraphOptimizer::MKLDNNGraphOptimizer() {} void MKLDNNGraphOptimizer::ApplyCommonGraphOptimizations(MKLDNNGraph &graph) { OV_ITT_SCOPE_CHAIN(FIRST_INFERENCE, taskChain, itt::domains::MKLDNN_LT, "ApplyCommonGraphOptimizations", "FuseConvolutionAndBias"); - FuseConvolutionAndBias(graph); + FuseConvolutionMatMulAndBias(graph); graph.RemoveDroppedNodes(); OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseMultiplyAndAdd"); @@ -166,37 +166,38 @@ void MKLDNNGraphOptimizer::ApplyImplSpecificGraphOptimizations(MKLDNNGraph &grap graph.RemoveDroppedEdges(); } -void MKLDNNGraphOptimizer::FuseConvolutionAndBias(MKLDNNGraph &graph) { +void MKLDNNGraphOptimizer::FuseConvolutionMatMulAndBias(MKLDNNGraph &graph) { auto& graphNodes = graph.GetNodes(); - auto isSuitableParentNode = [](MKLDNNNodePtr node) { - return node->getType() == Convolution && + auto isSuitableParentNode = [](const MKLDNNNodePtr& node) { + return (node->getType() == Convolution || node->getType() == MatMul) && node->getChildEdges().size() == 1 && node->getParentEdges().size() == 2 && node->getFusedWith().empty(); }; - auto isSuitableChildNode = [&](MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) { + auto isSuitableChildNode = [&](const 
MKLDNNNodePtr& parentNode, const MKLDNNNodePtr& childNode) {
         if (childNode->getAlgorithm() != EltwiseAdd || !childNode->getFusedWith().empty() || childNode->getParentEdges().size() != 2)
             return false;
 
-        auto biasNode = childNode->getParentEdgesAtPort(1)[0]->getParent();
+        const auto biasNode = childNode->getParentEdgesAtPort(1)[0]->getParent();
         if (biasNode->getType() != Input || !biasNode->isConstant() || biasNode->getChildEdges().size() != 1)
             return false;
 
-        auto convOutDims = parentNode->getOutputShapeAtPort(0).getDims();
-        auto biasDims = getNormalizedDimsBySize(biasNode->getOutputShapeAtPort(0).getDims(),
-                                                convOutDims.size());
+        const auto parentOutDims = parentNode->getOutputShapeAtPort(0).getDims();
+        const auto biasDims = getNormalizedDimsBySize(biasNode->getOutputShapeAtPort(0).getDims(),
+                                                parentOutDims.size());
         // TODO [NM]: Legacy ConvBias fusion transformation supports both per-tensor (via explicit broadcasting) and per-channel cases.
         // Most of the real models contain per-channel bias, so we need to re-evaluate the need to support the per-tensor variant.
-        if (convOutDims.size() != biasDims.size() || biasDims.size() < 2)
+        if (parentOutDims.size() != biasDims.size() || biasDims.size() < 2)
             return false;
 
-        if (biasDims[0] != 1 || !dimsEqualStrong(biasDims[1], convOutDims[1]))
+        const auto channelAxis = parentNode->getFusingAxis();
+        if (!dimsEqualStrong(biasDims[channelAxis], parentOutDims[channelAxis]))
             return false;
 
-        for (int i = 2; i < biasDims.size(); i++) {
-            if (biasDims[i] != 1)
+        for (int i = 0; i < biasDims.size(); i++) {
+            if (biasDims[i] != 1 && i != channelAxis)
                 return false;
         }
 
@@ -262,13 +263,13 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndBias(MKLDNNGraph &graph) {
                 graph.RemoveEdge(remEdge);
             }
 
-            auto parentEltwise = parentNode;
+            const auto& parentEltwise = parentNode;
             MKLDNNEdgePtr newEdge(new MKLDNNEdge(parent, parentEltwise, inNum, parentEltwise->getParentEdges().size()));
-            auto &graphEdges = graph.GetEdges();
+            auto& graphEdges = graph.GetEdges();
             graphEdges.push_back(newEdge);
             parent->addEdge(newEdge);
 
-            auto partialShape = { parentEltwise->outputShapes[0].toPartialShape()[1] };
+            auto partialShape = { parentEltwise->outputShapes[0].toPartialShape()[parentEltwise->getFusingAxis()] };
             parent->outputShapes[inNum] = Shape(partialShape);
             parentEltwise->inputShapes.push_back(parent->outputShapes[0]);
         }
@@ -627,7 +628,15 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndZeroPoints(MKLDNNGraph &graph) {
     }
 }
 
-static bool BF16QuantizeNodeFusing(MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) {
+/**
+ * @todo FQ fusing was disabled for BF16 output since oneDNN primitives lack support
+ * for bf16 depthwise postops.
+ * This is no longer the case: after the migration to oneDNN 2.3, FQ is fused as
+ * multiple binary post ops.
+ * This check can already be removed for FC fusing, but should be kept for Convolution,
+ * which still uses legacy depthwise postops for performance reasons.
+ */ +static bool BF16QuantizeNodeFusing(const MKLDNNNodePtr& parentNode, const MKLDNNNodePtr& childNode) { return childNode->getType() == FakeQuantize && one_of(Precision::BF16, parentNode->getOriginalOutputPrecisionAtPort(0), @@ -638,7 +647,7 @@ void MKLDNNGraphOptimizer::FuseFullyConnectedAndSimpleOperation(MKLDNNGraph &gra auto& graphNodes = graph.GetNodes(); auto isSuitableParentNode = [](MKLDNNNodePtr node) { - return node->getType() == FullyConnected && node->getChildEdges().size() == 1 && node->getInputShapeAtPort(0).getRank() != 3; + return node->getType() == FullyConnected && node->getChildEdges().size() == 1; }; auto parent = graphNodes.begin(); diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h index 0b896da6272..0be66e5ba08 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h @@ -19,7 +19,7 @@ public: void ApplyImplSpecificGraphOptimizations(MKLDNNGraph& graph); private: - void FuseConvolutionAndBias(MKLDNNGraph &graph); + void FuseConvolutionMatMulAndBias(MKLDNNGraph &graph); void FuseDeconvolutionAndSimpleOperation(MKLDNNGraph &graph); void FuseMultiplyAndAdd(MKLDNNGraph &graph); void FuseFullyConnectedAndSimpleOperation(MKLDNNGraph &graph); diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp index 55703b16384..44ea9e933d2 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp @@ -4,6 +4,7 @@ #include "mkldnn_node.h" #include "dnnl_debug.h" +#include "mkldnn_edge.h" #include "mkldnn_extension_mngr.h" #include "mkldnn_itt.h" @@ -1048,6 +1049,16 @@ void MKLDNNNode::setDynamicBatchLim(int lim) { } } +void MKLDNNNode::appendPostOpArgs(const mkldnn::primitive_attr& attr) { + auto post_ops = attr.get_post_ops(); + int idx = 0; + for (int i = 0; i < post_ops.len(); i++) { + if (post_ops.kind(i) == mkldnn::primitive::kind::binary) { + primArgs.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1, binaryPostOpsArgs[idx++]->GetPrimitive()}); + } + } +} + bool MKLDNNNode::isFusedWith(Type fusedNodeType) const { for (auto fusedNode : fusedWith) { if (fusedNode->type == fusedNodeType) @@ -1078,10 +1089,14 @@ Layout MKLDNNNode::getWeightsLayoutByDims(SizeVector dims, bool isGrouped) { } } -void MKLDNNNode::appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, int align, bool initAsBinary, bool initBinaryMemory) { +void MKLDNNNode::appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, int align) { IE_THROW() << "Fusing of " << this->getType() << " operation is not implemented"; } +void MKLDNNNode::appendBinPostOps(mkldnn::post_ops& ops, const std::vector& binaryShape, std::vector& binaryPostOpsMem) { + IE_THROW() << "Binary fusing of " << this->getType() << " operation is not implemented"; +} + std::vector MKLDNNNode::getInputPrecisions() const { std::vector inputPrecisions; for (size_t i = 0; i < getParentEdges().size(); i++) { @@ -1205,6 +1220,9 @@ MKLDNNNode* MKLDNNNode::NodesFactory::create(const std::shared_ptr bool MKLDNNNode::canBePerformedAsScaleShift(const MKLDNNNode *parentNode) const { size_t fusingPort = 0; + // @todo graph optimizer can provide parentNode as nullptr. Should be avoided + const size_t channelAxis = parentNode ? parentNode->getFusingAxis() : MKLDNNNode::getFusingAxis(); + for (size_t i = (parentNode == nullptr ? 
1 : 0); i < getParentEdges().size(); i++) {
         MKLDNNNode *node = getParentEdgesAtPort(i)[0]->getParent().get();
         if (node == nullptr) {
@@ -1225,7 +1243,8 @@ bool MKLDNNNode::canBePerformedAsScaleShift(const MKLDNNNode *parentNode) const
         if (i == fusingPort)
             continue;
         auto& weightShape = getInputShapeAtPort(i).getDims();
-        if (getParentEdgesAtPort(i)[0]->getParent()->getChildEdges().size() != 1 || !isPerTensorOrPerChannelBroadcastable(dataShape, weightShape, true))
+        if (getParentEdgesAtPort(i)[0]->getParent()->getChildEdges().size() != 1 ||
+            !isPerTensorOrPerChannelBroadcastable(dataShape, weightShape, channelAxis, true))
             return false;
     }
     return true;
@@ -1246,6 +1265,9 @@ bool MKLDNNNode::canBePerformedAsScaleShift(const MKLDNNNode *parentNode) const
            || isConvertablePowerStatic();
 }
 
+// @todo shifts for Subtract and scales for Divide are replaced with
+// Add (with opposite sign) and Multiply (with inverse value) for legacy depthwise post ops
+// This can be avoided after depthwise post ops are gone
 std::pair<std::vector<float>, std::vector<float>> MKLDNNNode::getScalesAndShifts(const MKLDNNNode *parentNode) const {
     std::vector<float> scales, shifts;
@@ -1408,10 +1430,11 @@ bool MKLDNNNode::canFuseSimpleOperation(const MKLDNNNodePtr& node) const {
         }
         return ret;
     } else if (node->getType() == Eltwise) {
-        return one_of(node->getAlgorithm(), EltwiseRelu, EltwiseGelu, EltwiseElu, EltwiseSigmoid, EltwiseClamp, EltwiseTanh,
-                      EltwiseSwish, EltwiseHswish, EltwiseMish, EltwiseHsigmoid, EltwiseRoundHalfToEven,
-                      EltwiseRoundHalfAwayFromZero, EltwiseAbs, EltwiseSqrt, EltwiseSoftRelu) ||
-               node->canBePerformedAsScaleShift(this);
+        return one_of(node->getAlgorithm(),
+                      EltwiseRelu, EltwiseGelu, EltwiseElu, EltwiseSigmoid, EltwiseClamp, EltwiseTanh,
+                      EltwiseSwish, EltwiseHswish, EltwiseMish, EltwiseHsigmoid, EltwiseRoundHalfToEven,
+                      EltwiseRoundHalfAwayFromZero, EltwiseAbs, EltwiseSqrt, EltwiseSoftRelu) ||
+               node->canBePerformedAsScaleShift(this);
     }
     return false;
 }
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.h b/inference-engine/src/mkldnn_plugin/mkldnn_node.h
index 3e0448f0db6..aee4f876806 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_node.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.h
@@ -204,6 +204,12 @@ public:
 
     bool isConstant();
 
+    virtual size_t getFusingAxis() const {
+        return 1;
+    }
+
+    void appendPostOpArgs(const mkldnn::primitive_attr& attr);
+
     bool isFusedWith(Type type) const;
 
     void addFusedNode(const MKLDNNNodePtr &fusingNode) {
@@ -594,8 +600,10 @@ protected:
      * Seed node should call this routine and pass its post operations list as parameter.
* @param ops List of fused post operations */ - virtual void appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, int align = -1, bool initAsBinary = false, bool initBinaryMemory = false); - virtual AttrPtr initPrimitiveAttr() const { return nullptr; } + virtual void appendPostOps(mkldnn::post_ops& ops, const VectorDims& postOpDims, int align = -1); + virtual void appendBinPostOps(mkldnn::post_ops& ops, const VectorDims& postOpDims, std::vector& binaryPostOpsMem); + + virtual std::shared_ptr initPrimitiveAttr() { return nullptr; } typedef std::function GetPrimitiveMemoryFormatFunc; @@ -636,7 +644,7 @@ protected: std::vector internalBlobMemory; std::vector supportedPrimitiveDescriptors; std::unordered_map primArgs; - std::vector binaryPostOpsArgs; + std::vector binaryPostOpsArgs; MKLDNNPrimitive prim; std::vector descs; diff --git a/inference-engine/src/mkldnn_plugin/ngraph_transformations/convert_matmul_to_fc.cpp b/inference-engine/src/mkldnn_plugin/ngraph_transformations/convert_matmul_to_fc.cpp index d43953d46c6..b3ff0ef1d9e 100644 --- a/inference-engine/src/mkldnn_plugin/ngraph_transformations/convert_matmul_to_fc.cpp +++ b/inference-engine/src/mkldnn_plugin/ngraph_transformations/convert_matmul_to_fc.cpp @@ -36,8 +36,9 @@ MKLDNNPlugin::ConvertMatMulToFC::ConvertMatMulToFC() { auto rank_a = shape_a.rank().get_length(); auto rank_b = shape_b.rank().get_length(); - // Transformation to FC is not supported for 1D second input - if (rank_b == 1) { + // Transformation to FC is not supported for 1D inputs + if (rank_a == 1 || rank_b == 1 || + rank_a > 3 || rank_b > 3) { return false; } @@ -47,7 +48,6 @@ MKLDNNPlugin::ConvertMatMulToFC::ConvertMatMulToFC() { std::count_if(shape_b.begin(), shape_b.end(), [](ngraph::Dimension x) { return x != 1; }) > 2) { return false; } - /* * get_aligned_shapes function align two input shapes to have the same size and * the same batch dimensions (last two dimensions are not comparable). 
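For context, the following standalone sketch (not part of the patch) shows the oneDNN 2.x binary post-op mechanism that the new appendBinPostOps()/appendPostOpArgs() helpers build on, written against the plain dnnl C++ API (the plugin's mkldnn namespace is an alias of it). The matmul sizes and the single fused per-channel Add are illustrative assumptions only: the post-op tensor is described with a broadcastable per-channel shape when the primitive is created, and its backing memory is supplied at execution time under DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1, which is what appendPostOpArgs() does for every binary entry in the attr.

#include <unordered_map>
#include "dnnl.hpp"

int main() {
    dnnl::engine eng(dnnl::engine::kind::cpu, 0);
    dnnl::stream strm(eng);

    const dnnl::memory::dim M = 4, K = 8, N = 16;
    dnnl::memory::desc src_md({M, K}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::ab);
    dnnl::memory::desc wei_md({K, N}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::ab);
    dnnl::memory::desc dst_md({M, N}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::ab);
    // Per-channel shape: every dim is 1 except the fusing axis (the last dim of a 2D MatMul output).
    dnnl::memory::desc shift_md({1, N}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::ab);

    // Describe the fused Add at primitive creation time, as appendBinPostOps() does.
    dnnl::post_ops ops;
    ops.append_binary(dnnl::algorithm::binary_add, shift_md);
    dnnl::primitive_attr attr;
    attr.set_post_ops(ops);

    dnnl::matmul::desc op_desc(src_md, wei_md, dst_md);
    dnnl::matmul::primitive_desc pd(op_desc, attr, eng);
    dnnl::matmul prim(pd);

    dnnl::memory src_mem(src_md, eng), wei_mem(wei_md, eng), dst_mem(dst_md, eng);
    dnnl::memory shift_mem(shift_md, eng);  // per-channel shifts (left uninitialized in this sketch)

    // Pass the post-op tensor at execution time; its index must match the
    // post-op position inside the attr (0 here), mirroring appendPostOpArgs().
    prim.execute(strm, {{DNNL_ARG_SRC, src_mem},
                        {DNNL_ARG_WEIGHTS, wei_mem},
                        {DNNL_ARG_DST, dst_mem},
                        {DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, shift_mem}});
    strm.wait();
    return 0;
}

Legacy depthwise and quantization post ops keep their data in flat float buffers instead of dedicated memory objects; the patch retains them for Convolution and Deconvolution, where they are still faster, and switches FullyConnected and MatMul to the binary form shown above.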
diff --git a/inference-engine/src/mkldnn_plugin/ngraph_transformations/convert_to_cpu_specific_opset.hpp b/inference-engine/src/mkldnn_plugin/ngraph_transformations/convert_to_cpu_specific_opset.hpp index ff901fbafc0..078fb75c14d 100644 --- a/inference-engine/src/mkldnn_plugin/ngraph_transformations/convert_to_cpu_specific_opset.hpp +++ b/inference-engine/src/mkldnn_plugin/ngraph_transformations/convert_to_cpu_specific_opset.hpp @@ -7,7 +7,6 @@ #include "ngraph/op/fake_quantize.hpp" #include "ngraph/pass/manager.hpp" #include "reshape_fc_fusion.hpp" -#include "reshape_fully_connected.hpp" #include "align_matmul_input_ranks.hpp" #include "reshape_prelu.hpp" #include "convert_broadcast_to_tiles.hpp" @@ -29,7 +28,6 @@ inline void ConvertToCPUSpecificOpset(std::shared_ptr &nGraphF manager.register_pass(); manager.register_pass(); manager.register_pass(); - manager.register_pass(); manager.register_pass(); manager.register_pass(); manager.register_pass(); diff --git a/inference-engine/src/mkldnn_plugin/ngraph_transformations/reshape_fully_connected.cpp b/inference-engine/src/mkldnn_plugin/ngraph_transformations/reshape_fully_connected.cpp deleted file mode 100644 index 2446e7694a8..00000000000 --- a/inference-engine/src/mkldnn_plugin/ngraph_transformations/reshape_fully_connected.cpp +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright (C) 2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "reshape_fully_connected.hpp" -#include "op/fully_connected.hpp" -#include -#include -#include -#include -#include -#include -#include - -NGRAPH_RTTI_DEFINITION(MKLDNNPlugin::ReshapeFullyConnected, "ReshapeFullyConnected", 0); - -MKLDNNPlugin::ReshapeFullyConnected::ReshapeFullyConnected() { - ngraph::OutputVector twoInputs = { - ngraph::pattern::any_input(ngraph::pattern::has_static_rank()), ngraph::pattern::any_input(ngraph::pattern::has_static_shape())}; - ngraph::OutputVector threeInputs = { - ngraph::pattern::any_input(ngraph::pattern::has_static_rank()), ngraph::pattern::any_input(ngraph::pattern::has_static_shape()), - ngraph::pattern::any_input()}; - auto fcTwoInputs = ngraph::pattern::wrap_type(twoInputs, ngraph::pattern::has_static_rank()); - auto fcThreeInputs = ngraph::pattern::wrap_type(threeInputs, ngraph::pattern::has_static_rank()); - const auto fcTwoOrThreeInputs = std::make_shared(ngraph::OutputVector{fcTwoInputs, fcThreeInputs}); - - ngraph::matcher_pass_callback callback = [this](ngraph::pattern::Matcher& m) { - auto fc = std::dynamic_pointer_cast(m.get_match_root()); - if (!fc || transformation_callback(fc)) { - return false; - } - - auto fc_input_shape = fc->get_input_partial_shape(0); - auto input_rank = fc_input_shape.rank().get_length(); - auto output_shape = fc->get_output_partial_shape(0); - - if (input_rank == 2 || input_rank == 0) { - return false; - } - - ngraph::NodeVector new_ops; - int64_t K = *(fc->get_input_shape(1).rbegin()); // requested 2nd input with static shape in the matcher - auto reshape = std::make_shared( - fc->input_value(0), ngraph::opset1::Constant::create(ngraph::element::i64, ngraph::Shape{2}, std::vector{-1, K}), false); - if (reshape->get_output_partial_shape(0).rank().is_dynamic()) - return false; - new_ops.push_back(reshape); - - reshape->set_friendly_name(fc->get_friendly_name() + "/Reshape"); - - // Calculate output shape for new FullyConnected layer - // [I, K] * [O, K] = [I, O] - auto I = reshape->get_output_partial_shape(0)[0]; - auto O = fc->get_input_partial_shape(1)[0]; - ngraph::PartialShape output_shape_new{I, O}; - - 
std::shared_ptr fc_new; - if (fc->get_input_size() == 2) { - fc_new = std::make_shared(reshape, - fc->input_value(1), - output_shape_new.rank(), - fc->get_output_type()); - } else if (fc->get_input_size() == 3) { - fc_new = std::make_shared(reshape, - fc->input_value(1), - fc->input_value(2), - output_shape_new.rank(), - fc->get_output_type()); - } else { - return false; - } - new_ops.push_back(fc_new); - - if (output_shape != output_shape_new) { - auto I_idxs = std::vector(input_rank - 1); - std::iota(I_idxs.begin(), I_idxs.end(), 0); - auto A_input_shape = ngraph::op::util::make_try_fold(fc->input_value(0)); - auto B_input_shape = ngraph::op::util::make_try_fold(fc->input_value(1)); - auto I_node = ngraph::op::util::node_to_get_shape_value_of_indices_from_shape_node(A_input_shape, {I_idxs}); - auto O_node = ngraph::op::util::node_to_get_shape_value_of_indices_from_shape_node(B_input_shape, {0}); - ngraph::OutputVector output_shape_dims{I_node, O_node}; - - const auto original_rank = fc->get_output_rank(); - NGRAPH_CHECK(original_rank.is_static()); - if (input_rank < original_rank.get_length()) { - const size_t const_shape_value = original_rank.get_length() - input_rank; - output_shape_dims.insert( - output_shape_dims.begin(), ngraph::opset1::Constant::create(I_node->get_element_type(), { const_shape_value }, { 1 })); - } - - auto reshape_output_shape = ngraph::op::util::make_try_fold(output_shape_dims, 0); - auto reshape_output = std::make_shared(fc_new, reshape_output_shape, false); - new_ops.push_back(A_input_shape); - new_ops.push_back(B_input_shape); - new_ops.push_back(I_node); - new_ops.push_back(O_node); - new_ops.push_back(reshape_output_shape); - new_ops.push_back(reshape_output); - reshape_output->set_friendly_name(fc->get_friendly_name()); - fc_new->set_friendly_name(fc->get_friendly_name() + "/FC"); - ngraph::copy_runtime_info(fc, new_ops); - ngraph::replace_node(fc, reshape_output); - } else { - fc_new->set_friendly_name(fc->get_friendly_name()); - ngraph::copy_runtime_info(fc, new_ops); - ngraph::replace_node(fc, fc_new); - } - - return true; - }; - - auto m = std::make_shared(fcTwoOrThreeInputs, "ReshapeFullyConnected"); - this->register_matcher(m, callback); -} diff --git a/inference-engine/src/mkldnn_plugin/ngraph_transformations/reshape_fully_connected.hpp b/inference-engine/src/mkldnn_plugin/ngraph_transformations/reshape_fully_connected.hpp deleted file mode 100644 index 162427de5de..00000000000 --- a/inference-engine/src/mkldnn_plugin/ngraph_transformations/reshape_fully_connected.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (C) 2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include - -/* - * Description: - * ReshapeFullyConnected transformation detects FullyConnected operations - * and for each operation where input shape is greater than 2 inserts Reshape - * operations before and after FullyConnected operation. This transformation is - * required because of IE restrictions. 
- */ - -namespace MKLDNNPlugin { - -class ReshapeFullyConnected: public ngraph::pass::MatcherPass { -public: - NGRAPH_RTTI_DECLARATION; - ReshapeFullyConnected(); -}; - -} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp index 6132525193e..e2f01e85cef 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp @@ -330,48 +330,42 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() { } } -void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, const VectorDims &dims, bool initWeights = false, bool initAsBinary = false) { - bool initBinaryMemory = initWeights; +void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, const VectorDims &dims, bool initWeights = false) { mkldnn::post_ops ops; + bool useLegacyPostOps = true; // @todo remove after issue with performance of binary post ops fixed + + auto getBinPostOpShape = [&](){ + const auto outShape = getOutputShapeAtPort(0).getStaticDims(); + const auto outShapeRank = getOutputShapeAtPort(0).getRank(); + const auto chIdx = getFusingAxis(); + std::vector binaryShape(outShapeRank, 1); + binaryShape[chIdx] = outShape[chIdx]; + return binaryShape; + }; for (auto &node : fusedWith) { if (node->getType() == Split || node->getType() == Concatenation) continue; - auto* eltwiseNode = dynamic_cast(node.get()); - if (eltwiseNode) { + if (auto* eltwiseNode = dynamic_cast(node.get())) { if (eltwiseNode->isSpecialConvolutionAddFusing()) { ops.append_sum(1.0, MKLDNNExtensionUtils::IEPrecisionToDataType(eltwisePrecision)); } else { - constexpr int align = 16; - eltwiseNode->appendPostOps(ops, dims, align, initAsBinary, initBinaryMemory); - if (initBinaryMemory) { - if (eltwiseNode->scalesMemory) - binaryPostOpsArgs.push_back(eltwiseNode->scalesMemory->GetPrimitive()); - if (eltwiseNode->shiftsMemory) - binaryPostOpsArgs.push_back(eltwiseNode->shiftsMemory->GetPrimitive()); + if (useLegacyPostOps || eltwiseNode->getMKLDNNAlgorithm() != mkldnn::algorithm::undef) { + constexpr int align = 16; + eltwiseNode->appendPostOps(ops, dims, align); + } else { + eltwiseNode->appendBinPostOps(ops, getBinPostOpShape(), binaryPostOpsArgs); } } continue; } - auto* fakeQuantizeNode = dynamic_cast(node.get()); - if (fakeQuantizeNode) { - constexpr int align = -1; - fakeQuantizeNode->appendPostOps(ops, dims, align, initAsBinary, initBinaryMemory); - if (initBinaryMemory) { - if (fakeQuantizeNode->cropHighMemory) - binaryPostOpsArgs.push_back(fakeQuantizeNode->cropHighMemory->GetPrimitive()); - if (fakeQuantizeNode->cropLowMemory) - binaryPostOpsArgs.push_back(fakeQuantizeNode->cropLowMemory->GetPrimitive()); - if (fakeQuantizeNode->inputScaleMemory) - binaryPostOpsArgs.push_back(fakeQuantizeNode->inputScaleMemory->GetPrimitive()); - if (fakeQuantizeNode->inputShiftMemory) - binaryPostOpsArgs.push_back(fakeQuantizeNode->inputShiftMemory->GetPrimitive()); - if (fakeQuantizeNode->outputScaleMemory) - binaryPostOpsArgs.push_back(fakeQuantizeNode->outputScaleMemory->GetPrimitive()); - if (fakeQuantizeNode->outputShiftMemory) - binaryPostOpsArgs.push_back(fakeQuantizeNode->outputShiftMemory->GetPrimitive()); + if (auto* fakeQuantizeNode = dynamic_cast(node.get())) { + if (useLegacyPostOps) { + fakeQuantizeNode->appendPostOps(ops, dims); + } else { + fakeQuantizeNode->appendBinPostOps(ops, getBinPostOpShape(), binaryPostOpsArgs); } continue; } @@ -416,7 +410,6 
@@ void MKLDNNConvolutionNode::initSupportedPrimitiveDescriptors() { // attr[1] - binary mkldnn::primitive_attr attrs[1]; setPostOps(attrs[0], MemoryDescUtils::makeDummyShape(getOutputShapeAtPort(0)).getStaticDims()); -// setPostOps(attrs[1], MemoryDescUtils::makeDummyShape(getOutputShapeAtPort(0)).getStaticDims(), false, true); bool containJitImpl = false; @@ -630,7 +623,6 @@ void MKLDNNConvolutionNode::initDescriptor(const NodeConfig& config) { // attr[1] - binary mkldnn::primitive_attr attrs[1]; setPostOps(attrs[0], MemoryDescUtils::makeDummyShape(getOutputShapeAtPort(0)).getStaticDims()); -// setPostOps(attrs[1], false, true); auto rightConfig = selectedPD->getConfig(); size_t selected_count = 0; @@ -926,13 +918,8 @@ void MKLDNNConvolutionNode::prepareParams() { auto initPrimitiveAttr = [&]() { mkldnn::primitive_attr attr; addZeroPoints(attr); + setPostOps(attr, outMemoryDesc->getShape().getStaticDims(), true); - // todo: [AV] delete "false" to use binary mechanism - if (false && getSelectedPrimitiveDescriptor()->getImplementationType() == jit_gemm) { - setPostOps(attr, outMemoryDesc->getShape().getStaticDims(), true, true); - } else { - setPostOps(attr, outMemoryDesc->getShape().getStaticDims(), true); - } return std::make_shared(std::move(attr)); }; @@ -991,14 +978,8 @@ void MKLDNNConvolutionNode::prepareParams() { if (withBiases) { primArgs[DNNL_ARG_BIAS] = getBias(); } -// todo: [AV] uncomment to use binary mechanism -// auto post_ops = attr.get_post_ops(); -// int idx = 0; -// for (int i = 0; i < post_ops.len(); i++) { -// if (post_ops.kind(i) == mkldnn::primitive::kind::binary) { -// primArgs.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1, binaryPostOpsArgs[idx++]}); -// } -// } + + appendPostOpArgs(*pAttrLocal); } void MKLDNNConvolutionNode::executeDynamicImpl(dnnl::stream strm) { diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.h index 39ef625f503..dcdd18092d5 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.h @@ -69,7 +69,7 @@ private: void executeDynamicImpl(mkldnn::stream strm) override; void addZeroPoints(mkldnn::primitive_attr& attr) const; - void setPostOps(mkldnn::primitive_attr &attr, const VectorDims &dims, bool initWeights, bool initAsBinary); + void setPostOps(mkldnn::primitive_attr &attr, const VectorDims &dims, bool initWeights); void filterSupportedDescriptors(); bool isPossibleToSkipInitConfig(MKLDNNDescriptor &desc) const; bool isNspcAvailable() const; @@ -122,4 +122,3 @@ private: }; } // namespace MKLDNNPlugin - diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.cpp index 1cb58a478ec..6a2c6332e38 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.cpp @@ -157,9 +157,6 @@ bool MKLDNNDeconvolutionNode::canBeExecutedInInt8() const { return false; } - // todo: [antonvor] added these checks to fix performance problems - if (kernel.size() == 3) - return false; if (!withGroups && stride.back() > 3) return false; if (!impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_common)) { @@ -271,17 +268,25 @@ void MKLDNNDeconvolutionNode::getSupportedDescriptors() { void MKLDNNDeconvolutionNode::setPostOps(mkldnn::primitive_attr &attr) { mkldnn::post_ops ops; + auto getBinPostOpShape = [&](){ + const auto outShape = 
getOutputShapeAtPort(0).getStaticDims(); + const auto outShapeRank = getOutputShapeAtPort(0).getRank(); + const auto chIdx = getFusingAxis(); + std::vector binaryShape(outShapeRank, 1); + binaryShape[chIdx] = outShape[chIdx]; + return binaryShape; + }; + for (auto &node : fusedWith) { - auto* eltwiseNode = dynamic_cast(node.get()); - if (eltwiseNode) { + if (auto* eltwiseNode = dynamic_cast(node.get())) { // TODO [DS]: change to shape from memory constexpr int align = 16; + // use legacy depthwise since backprop convolution does not support binary post ops eltwiseNode->appendPostOps(ops, getOutputShapeAtPort(0).getStaticDims(), align); continue; } - auto* fakeQuantizeNode = dynamic_cast(node.get()); - if (fakeQuantizeNode) { - fakeQuantizeNode->appendPostOps(ops); + if (auto* fakeQuantizeNode = dynamic_cast(node.get())) { + fakeQuantizeNode->appendBinPostOps(ops, getBinPostOpShape(), binaryPostOpsArgs); continue; } IE_THROW() << "Fusing of " << NameFromType(node->getType()) << " operation to " << NameFromType(this->getType()) << " node is not implemented"; @@ -358,6 +363,8 @@ void MKLDNNDeconvolutionNode::createPrimitive() { auto dst = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive(); primArgs = {{DNNL_ARG_DIFF_DST, src}, {DNNL_ARG_WEIGHTS, weights}, {DNNL_ARG_DIFF_SRC, dst}}; } + + appendPostOpArgs(attr); } void MKLDNNDeconvolutionNode::createDescriptor(const std::vector &inputDesc, diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp index 353a022faf9..f87e32c1476 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp @@ -7,6 +7,7 @@ #include #include +#include "cpu_types.h" #include "utils/bfloat16.hpp" #include #include @@ -31,6 +32,7 @@ #include "ngraph_transformations/op/leaky_relu.hpp" #include "ngraph_transformations/op/swish_cpu.hpp" +#include #include #include #include @@ -791,18 +793,41 @@ private: } }; +MKLDNNEltwiseNode::BroadcastingPolicy MKLDNNEltwiseNode::determineBroadcastingPolicy(const std::shared_ptr& op) { + const auto const1 = std::dynamic_pointer_cast(op->get_input_node_shared_ptr(0)); + const auto const2 = std::dynamic_pointer_cast(op->get_input_node_shared_ptr(1)); + int constPort = -1; + if (const2) { + constPort = 1; + } else if (const1) { + constPort = 0; + } else { + return Undefined; + } + + auto const_shape = op->get_input_shape(constPort); + if (ngraph::shape_size(const_shape) == 1) + return PerTensor; + else + return PerChannel; +} + const std::map MKLDNNEltwiseNode::initializers = { {ngraph::op::v1::Add::get_type_info_static(), [](const std::shared_ptr& op, MKLDNNEltwiseNode& node) { node.algorithm = EltwiseAdd; + node.broadcastingPolicy = determineBroadcastingPolicy(op); }}, {ngraph::op::v1::Subtract::get_type_info_static(), [](const std::shared_ptr& op, MKLDNNEltwiseNode& node) { node.algorithm = EltwiseSubtract; + node.broadcastingPolicy = determineBroadcastingPolicy(op); }}, {ngraph::op::v1::Multiply::get_type_info_static(), [](const std::shared_ptr& op, MKLDNNEltwiseNode& node) { node.algorithm = EltwiseMultiply; + node.broadcastingPolicy = determineBroadcastingPolicy(op); }}, {ngraph::op::v1::Divide::get_type_info_static(), [](const std::shared_ptr& op, MKLDNNEltwiseNode& node) { node.algorithm = EltwiseDivide; + node.broadcastingPolicy = determineBroadcastingPolicy(op); }}, {ngraph::op::v0::SquaredDifference::get_type_info_static(), [](const std::shared_ptr& 
op, MKLDNNEltwiseNode& node) { node.algorithm = EltwiseSquaredDifference; @@ -828,6 +853,7 @@ const std::map M node.alpha = powerStatic->get_power(); node.beta = powerStatic->get_scale(); node.gamma = powerStatic->get_shift(); + node.broadcastingPolicy = PerTensor; }}, {ngraph::op::v1::Equal::get_type_info_static(), [](const std::shared_ptr& op, MKLDNNEltwiseNode& node) { node.algorithm = EltwiseEqual; @@ -954,6 +980,7 @@ const std::map M }}, {ngraph::op::v0::PRelu::get_type_info_static(), [](const std::shared_ptr& op, MKLDNNEltwiseNode& node) { node.algorithm = EltwisePrelu; + node.broadcastingPolicy = determineBroadcastingPolicy(op); }}, {ngraph::op::v0::Erf::get_type_info_static(), [](const std::shared_ptr& op, MKLDNNEltwiseNode& node) { node.algorithm = EltwiseErf; @@ -984,7 +1011,7 @@ bool MKLDNNEltwiseNode::isSupportedOperation(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) : - MKLDNNNode(op, eng, cache) { + MKLDNNNode(op, eng, cache), broadcastingPolicy(Undefined) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { IE_THROW(NotImplemented) << errorMessage; @@ -1713,106 +1740,124 @@ void MKLDNNEltwiseNode::fuseInto(MKLDNNNodePtr& parentNode) { getInputShapeAtPort(0) == getInputShapeAtPort(1); if (!specialConvolutionAddFusing && canBePerformedAsScaleShift(parentNode.get())) { std::tie(scales, shifts) = getScalesAndShifts(parentNode.get()); + if ((parentNode->getType() == FullyConnected || parentNode->getType() == MatMul) && one_of(getAlgorithm(), EltwiseAdd, EltwiseSubtract, + EltwiseMultiply, EltwiseDivide, EltwiseMulAdd, EltwisePowerStatic, EltwisePrelu)) { + std::tie(scales, shifts) = getScalesAndShifts(parentNode.get()); + } } MKLDNNNode::fuseInto(parentNode); } -void MKLDNNEltwiseNode::appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, int align, bool initAsBinary, bool initBinaryMemory) { +void MKLDNNEltwiseNode::appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, int align) { const std::string errorPrefix = "Appending Eltwise node with name '" + getName() + "' "; if (getMKLDNNAlgorithm() != mkldnn::algorithm::undef) { switch (getMKLDNNAlgorithm()) { - case mkldnn::algorithm::eltwise_relu: - case mkldnn::algorithm::eltwise_tanh: - case mkldnn::algorithm::eltwise_elu: - case mkldnn::algorithm::eltwise_square: - case mkldnn::algorithm::eltwise_abs: - case mkldnn::algorithm::eltwise_sqrt: - case mkldnn::algorithm::eltwise_linear: - case mkldnn::algorithm::eltwise_bounded_relu: - case mkldnn::algorithm::eltwise_soft_relu: - case mkldnn::algorithm::eltwise_logistic: - case mkldnn::algorithm::eltwise_exp: - case mkldnn::algorithm::eltwise_gelu_erf: - case mkldnn::algorithm::eltwise_gelu_tanh: - case mkldnn::algorithm::eltwise_clip: - case mkldnn::algorithm::eltwise_swish: - case mkldnn::algorithm::eltwise_hardswish: - case mkldnn::algorithm::eltwise_mish: - case mkldnn::algorithm::eltwise_hsigmoid: - case mkldnn::algorithm::eltwise_round_half_to_even: - case mkldnn::algorithm::eltwise_round_half_away_from_zero: - ops.append_eltwise(1.0, getMKLDNNAlgorithm(), getAlpha(), getBeta()); - break; - default: IE_THROW() << errorPrefix << "as post operation is not supported"; + case mkldnn::algorithm::eltwise_relu: + case mkldnn::algorithm::eltwise_tanh: + case mkldnn::algorithm::eltwise_elu: + case mkldnn::algorithm::eltwise_square: + case mkldnn::algorithm::eltwise_abs: + case mkldnn::algorithm::eltwise_sqrt: + case mkldnn::algorithm::eltwise_linear: + case mkldnn::algorithm::eltwise_bounded_relu: 
+ case mkldnn::algorithm::eltwise_soft_relu: + case mkldnn::algorithm::eltwise_logistic: + case mkldnn::algorithm::eltwise_exp: + case mkldnn::algorithm::eltwise_gelu_erf: + case mkldnn::algorithm::eltwise_gelu_tanh: + case mkldnn::algorithm::eltwise_clip: + case mkldnn::algorithm::eltwise_swish: + case mkldnn::algorithm::eltwise_hardswish: + case mkldnn::algorithm::eltwise_mish: + case mkldnn::algorithm::eltwise_hsigmoid: + case mkldnn::algorithm::eltwise_round_half_to_even: + case mkldnn::algorithm::eltwise_round_half_away_from_zero: + ops.append_eltwise(1.0, getMKLDNNAlgorithm(), getAlpha(), getBeta()); + break; + default: IE_THROW() << errorPrefix << "as post operation is not supported"; } } else { - const size_t chIdx = postOpDims.size() > 1 ? 1 : 0; + const size_t chIdx = postOpDims.size() > 1 ? getFusingAxis() : 0; scalesBuffer = makeAlignedBuffer(postOpDims[chIdx], scales, align); if (getAlgorithm() != EltwisePrelu) { shiftsBuffer = makeAlignedBuffer(postOpDims[chIdx], shifts, align); } - if (initAsBinary) { - auto appendBinary = [&](const mkldnn::algorithm alg, MKLDNNMemoryPtr &memPtr, const std::vector &data) { - if (data.empty()) - IE_THROW() << errorPrefix << "cannot be performed since buffers are not allocated"; - - std::vector binaryDims(postOpDims.size(), 1); - binaryDims[chIdx] = postOpDims[chIdx]; - - DnnlBlockedMemoryDesc memoryDesc(Precision::FP32, Shape(binaryDims)); - ops.append_binary(alg, memoryDesc.getDnnlDesc()); - - if (initBinaryMemory) { - memPtr.reset(new MKLDNNMemory(getEngine())); - memPtr->Create(memoryDesc, &data[0]); - } - }; - switch (getAlgorithm()) { - case EltwiseAdd: - case EltwiseSubtract: - appendBinary(mkldnn::algorithm::binary_add, shiftsMemory, shiftsBuffer); - break; - case EltwiseMultiply: - case EltwiseDivide: - appendBinary(mkldnn::algorithm::binary_mul, scalesMemory, scalesBuffer); - break; - case EltwiseMulAdd: - case EltwisePowerStatic: - appendBinary(mkldnn::algorithm::binary_mul, scalesMemory, scalesBuffer); - appendBinary(mkldnn::algorithm::binary_add, shiftsMemory, shiftsBuffer); - break; - case EltwisePrelu: - appendBinary(mkldnn::algorithm::binary_prelu, scalesMemory, scalesBuffer); - break; - default: - IE_THROW() << errorPrefix << "as post operation is not supported"; - } - } else { - switch (getAlgorithm()) { - case EltwiseAdd: - case EltwiseSubtract: - case EltwiseMultiply: - case EltwiseDivide: - case EltwiseMulAdd: - case EltwisePowerStatic: - if (scalesBuffer.empty() || shiftsBuffer.empty()) - IE_THROW() << errorPrefix << "cannot be performed since buffers are not allocated"; - ops.append_depthwise(mkldnn::algorithm::depthwise_scale_shift, &scalesBuffer[0], &shiftsBuffer[0]); - break; - case EltwisePrelu: - if (scalesBuffer.empty()) - IE_THROW() << errorPrefix << "cannot be performed since buffers are not allocated"; - ops.append_depthwise(mkldnn::algorithm::depthwise_prelu, &scalesBuffer[0], nullptr); - break; - default: - IE_THROW() << errorPrefix << "as post operation is not supported"; - } + /* @todo legacy depthwise post ops are kept for now + * for performance reasons + */ + switch (getAlgorithm()) { + case EltwiseAdd: + case EltwiseSubtract: + case EltwiseMultiply: + case EltwiseDivide: + case EltwiseMulAdd: + case EltwisePowerStatic: + if (scales.empty() || shifts.empty()) + IE_THROW() << errorPrefix << "cannot be performed since buffers are not allocated"; + ops.append_depthwise(mkldnn::algorithm::depthwise_scale_shift, &scalesBuffer[0], &shiftsBuffer[0]); + break; + case EltwisePrelu: + if (scales.empty()) + 
IE_THROW() << errorPrefix << "cannot be performed since buffers are not allocated"; + ops.append_depthwise(mkldnn::algorithm::depthwise_prelu, &scalesBuffer[0], nullptr); + break; + default: + IE_THROW() << errorPrefix << "as post operation is not supported"; } } } +void MKLDNNEltwiseNode::appendBinPostOps(mkldnn::post_ops& ops, const VectorDims& postOpDims, std::vector& binaryPostOpsMem) { + const std::string errorPrefix = "Appending Eltwise node with name '" + getName() + "' as binary post op "; + VectorDims broadcastBinaryShape(postOpDims.size(), 1); + + auto appendBinary = [&](const mkldnn::algorithm alg, MKLDNNMemoryPtr &memPtr, const std::vector &data) { + if (data.empty()) + IE_THROW() << errorPrefix << "cannot be performed since buffers are not allocated"; + if (broadcastingPolicy == Undefined) + IE_THROW() << errorPrefix << "cannot be performed since policy is Undefined"; + + DnnlBlockedMemoryDesc memoryDesc(Precision::FP32, broadcastingPolicy == PerTensor ? Shape(broadcastBinaryShape) : Shape(postOpDims)); + + ops.append_binary(alg, memoryDesc.getDnnlDesc()); + + if (!memPtr) { + memPtr.reset(new MKLDNNMemory(getEngine())); + memPtr->Create(memoryDesc, &data[0]); + + binaryPostOpsMem.push_back(memPtr); + } + }; + + switch (getAlgorithm()) { + case EltwiseAdd: + case EltwiseSubtract: + appendBinary(mkldnn::algorithm::binary_add, shiftsMemory, shifts); + break; + case EltwiseDivide: + case EltwiseMultiply: + appendBinary(mkldnn::algorithm::binary_mul, scalesMemory, scales); + break; + case EltwiseMulAdd: + appendBinary(mkldnn::algorithm::binary_mul, scalesMemory, scales); + appendBinary(mkldnn::algorithm::binary_add, shiftsMemory, shifts); + break; + case EltwisePowerStatic: + if (beta != 1.0f) // Multiply if has scales + appendBinary(mkldnn::algorithm::binary_mul, scalesMemory, scales); + if (gamma != 0.0f) // Add only if has shifts + appendBinary(mkldnn::algorithm::binary_add, shiftsMemory, shifts); + break; + case EltwisePrelu: + appendBinary(mkldnn::algorithm::binary_prelu, scalesMemory, scales); + break; + default: + IE_THROW() << errorPrefix << "as post operation is not supported"; + } +} + bool MKLDNNEltwiseNode::canFuse(const MKLDNNNodePtr& node) const { auto isSuitableNode = [this](const MKLDNNEltwiseNode* node) { // [WA] Since execution precision change from I32 to FP32 for Divide operation may lead to incorrect results diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h index 5471a14b2c9..b5e7768b52a 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h @@ -75,7 +75,8 @@ public: bool created() const override; bool canBeInPlace() const override; bool canFuse(const MKLDNNNodePtr& node) const override; - void appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, int align = -1, bool initAsBinary = false, bool initBinaryMemory = false) override; + void appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, int align = -1) override; + void appendBinPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, std::vector& binaryPostOpsMem) override; void fuseInto(MKLDNNNodePtr& parentNode) override; InferenceEngine::Precision getRuntimePrecision() const override; @@ -97,8 +98,17 @@ public: void executeDynamicImpl(mkldnn::stream strm) override { execute(strm); } + enum BroadcastingPolicy { + PerChannel, + PerTensor, + Undefined, + }; + + BroadcastingPolicy 
getBroadcastingPolicy() const { return broadcastingPolicy; } + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + private: struct EltwiseExecutor { EltwiseExecutor(size_t batch) : batchDimIdx(batch) {} @@ -130,6 +140,8 @@ private: size_t fullWorkAmount = 0; }; + BroadcastingPolicy broadcastingPolicy; + mkldnn::algorithm mkldnnAlgorithm = mkldnn::algorithm::undef; static const int optimalTensorRank = 6; @@ -157,6 +169,8 @@ private: using Initializer = std::function&, MKLDNNEltwiseNode& node)>; static const std::map initializers; + static BroadcastingPolicy determineBroadcastingPolicy(const std::shared_ptr& op); + void executeOptimized6D(const std::unique_ptr &pKernel, const jit_eltwise_call_args_ptrs &args_ptrs, const VectorDims &dims_out) const; void executeOptimizedGeneric(const std::unique_ptr &pKernel, const jit_eltwise_call_args_ptrs &args_ptrs, diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fake_quantize_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fake_quantize_node.cpp index 319a41528e6..3597719521b 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fake_quantize_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fake_quantize_node.cpp @@ -860,7 +860,15 @@ bool MKLDNNFakeQuantizeNode::isSupportedOperation(const std::shared_ptr 1 || not_unit_axis > 1) { + + /* @todo + * Channel axis 2 is added for 3D MatMul (most common one). + * FQ for non-1 channel fallbacks to reference implementation. + * Expected to be fused for 3D MatMul + * Long term idea: restore limitation for channel axis 1 and + * support fusing of unfolded FQ (see FakeQuantizeDecomposition transformation) + */ + if (count_not_unit_axis > 1 || !one_of(not_unit_axis, 1, 2)) { errorMessage = "Supports only per-tensor and per-channel quantizations"; return false; } @@ -1057,6 +1065,13 @@ MKLDNNFakeQuantizeNode::MKLDNNFakeQuantizeNode(const std::shared_ptr 1 ? 1 : 0]; + const auto realAxisSize = dims[dims.size() > 1 ? 
1 : 0]; const auto axisPaddedSize = rnd_up(realAxisSize, bufferAlignment); if (!isPostOpDataInitialized) { binarizationThresholds.resize(axisPaddedSize, 0); @@ -1671,73 +1684,76 @@ void MKLDNNFakeQuantizeNode::appendPostOps(mkldnn::post_ops& ops, const VectorDi std::fill(binarizationThresholds.begin() + realAxisSize, binarizationThresholds.end(), 0); } } - - ops.append_binarization(mkldnn::algorithm::binarization_depthwise, (const float*)&binarizationThresholds[0], (const float*)&binarizationOutputMask[0]); - - if (!isInputLowBroadcasted && !isOutputHighBroadcasted) { - isPostOpDataInitialized = true; - } } else { - if (!isPostOpDataInitialized) { - if (cropLow.size() > 1) - cropLow.resize(rnd_up(cropLow.size(), bufferAlignment), 0); - if (cropHigh.size() > 1) - cropHigh.resize(rnd_up(cropHigh.size(), bufferAlignment), 0); - if (inputScale.size() > 1) - inputScale.resize(rnd_up(inputScale.size(), bufferAlignment), 0); - if (inputShift.size() > 1) - inputShift.resize(rnd_up(inputShift.size(), bufferAlignment), 0); - if (outputScale.size() > 1) - outputScale.resize(rnd_up(outputScale.size(), bufferAlignment), 0); - if (outputShift.size() > 1) - outputShift.resize(rnd_up(outputShift.size(), bufferAlignment), 0); + if (cropLow.size() > 1) + cropLow.resize(rnd_up(cropLow.size(), bufferAlignment), 0); + if (cropHigh.size() > 1) + cropHigh.resize(rnd_up(cropHigh.size(), bufferAlignment), 0); + if (inputScale.size() > 1) + inputScale.resize(rnd_up(inputScale.size(), bufferAlignment), 0); + if (inputShift.size() > 1) + inputShift.resize(rnd_up(inputShift.size(), bufferAlignment), 0); + if (outputScale.size() > 1) + outputScale.resize(rnd_up(outputScale.size(), bufferAlignment), 0); + if (outputShift.size() > 1) + outputShift.resize(rnd_up(outputShift.size(), bufferAlignment), 0); - cropLowData.set(cropLow.size(), 1 << 1, &cropLow[0]); - cropHighData.set(cropHigh.size(), 1 << 1, &cropHigh[0]); - inputScaleData.set(inputScale.size(), 1 << 1, &inputScale[0]); - inputShiftData.set(inputShift.size(), 1 << 1, &inputShift[0]); - outputScaleData.set(outputScale.size(), 1 << 1, &outputScale[0]); - outputShiftData.set(outputShift.size(), 1 << 1, &outputShift[0]); - } + cropLowData.set(cropLow.size(), 1 << 1, &cropLow[0]); + cropHighData.set(cropHigh.size(), 1 << 1, &cropHigh[0]); + inputScaleData.set(inputScale.size(), 1 << 1, &inputScale[0]); + inputShiftData.set(inputShift.size(), 1 << 1, &inputShift[0]); + outputScaleData.set(outputScale.size(), 1 << 1, &outputScale[0]); + outputShiftData.set(outputShift.size(), 1 << 1, &outputShift[0]); + } + isPostOpDataInitialized = true; +} + +void MKLDNNFakeQuantizeNode::appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, int align) { + initializePostOpData(postOpDims, align); + + if (getAlgorithm() == FQBinarization) { + ops.append_binarization(mkldnn::algorithm::binarization_depthwise, (const float*)&binarizationThresholds[0], (const float*)&binarizationOutputMask[0]); + } else { mkldnn::algorithm alg = getAlgorithm() == FQCommon ? mkldnn::algorithm::quantization_quantize_dequantize : mkldnn::algorithm::quantization_quantize; - - if (initAsBinary) { - auto appendBinary = [&](const mkldnn::algorithm alg, const size_t dataSize, MKLDNNMemoryPtr &memPtr, const void *data) { - const auto rank = getOutputShapeAtPort(0).getRank(); - auto chIdx = rank > 1 ? 
1 : 0; - - std::vector binaryShape(rank, 1); - binaryShape[chIdx] = dataSize; - - DnnlBlockedMemoryDesc memoryDesc(Precision::FP32, Shape(binaryShape)); - ops.append_binary(alg, memoryDesc.getDnnlDesc()); - - if (initBinaryMemory) { - memPtr.reset(new MKLDNNMemory(getEngine())); - memPtr->Create(memoryDesc, data); - } - }; - - appendBinary(mkldnn::algorithm::binary_min, cropHighSize, cropHighMemory, &cropHighData.shifts_[0]); - appendBinary(mkldnn::algorithm::binary_max, cropLowSize, cropLowMemory, &cropLowData.shifts_[0]); - appendBinary(mkldnn::algorithm::binary_mul, inputScaleSize, inputScaleMemory, &inputScaleData.scales_[0]); - appendBinary(mkldnn::algorithm::binary_add, inputShiftSize, inputShiftMemory, &inputShiftData.shifts_[0]); - if (alg == mkldnn::algorithm::quantization_quantize_dequantize) { - ops.append_eltwise(1.0f, mkldnn::algorithm::eltwise_round_half_to_even, 0, 0); - } - appendBinary(mkldnn::algorithm::binary_mul, outputScaleSize, outputScaleMemory, &outputScaleData.scales_[0]); - appendBinary(mkldnn::algorithm::binary_add, outputShiftSize, outputShiftMemory, &outputShiftData.shifts_[0]); - - } else { - ops.append_quantization(alg, &cropLowData, &cropHighData, &inputScaleData, &inputShiftData, &outputScaleData, &outputShiftData); - } - - isPostOpDataInitialized = true; + ops.append_quantization(alg, &cropLowData, &cropHighData, &inputScaleData, &inputShiftData, &outputScaleData, &outputShiftData); } } +void MKLDNNFakeQuantizeNode::appendBinPostOps(mkldnn::post_ops& ops, const VectorDims& postOpDims, std::vector& binaryPostOpsMem) { + static const size_t bufferAlignment = 1; + + initializePostOpData(postOpDims, bufferAlignment); + + VectorDims broadcastBinaryShape(postOpDims.size(), 1); + + auto appendBinary = [&](const mkldnn::algorithm alg, const size_t dataSize, MKLDNNMemoryPtr &memPtr, const void *data) { + DnnlBlockedMemoryDesc memoryDesc(Precision::FP32, dataSize == 1 ? Shape(broadcastBinaryShape) : Shape(postOpDims)); + ops.append_binary(alg, memoryDesc.getDnnlDesc()); + + if (!memPtr) { + memPtr.reset(new MKLDNNMemory(getEngine())); + memPtr->Create(memoryDesc, data); + + binaryPostOpsMem.push_back(memPtr); + } + }; + + mkldnn::algorithm alg = getAlgorithm() == FQCommon ? 
mkldnn::algorithm::quantization_quantize_dequantize : + mkldnn::algorithm::quantization_quantize; + + appendBinary(mkldnn::algorithm::binary_min, cropHighSize, cropHighMemory, &cropHighData.shifts_[0]); + appendBinary(mkldnn::algorithm::binary_max, cropLowSize, cropLowMemory, &cropLowData.shifts_[0]); + appendBinary(mkldnn::algorithm::binary_mul, inputScaleSize, inputScaleMemory, &inputScaleData.scales_[0]); + appendBinary(mkldnn::algorithm::binary_add, inputShiftSize, inputShiftMemory, &inputShiftData.shifts_[0]); + if (alg == mkldnn::algorithm::quantization_quantize_dequantize) { + ops.append_eltwise(1.0f, mkldnn::algorithm::eltwise_round_half_to_even, 0, 0); + } + appendBinary(mkldnn::algorithm::binary_mul, outputScaleSize, outputScaleMemory, &outputScaleData.scales_[0]); + appendBinary(mkldnn::algorithm::binary_add, outputShiftSize, outputShiftMemory, &outputShiftData.shifts_[0]); +} + MKLDNNFakeQuantizeNode::FakeQuantizeJitExecutor::FakeQuantizeJitExecutor(const jit_quantize_params &_jqp) { bool isBinarization = _jqp.op_type == FQBinarization; if (mayiuse(cpu::x64::avx512_common)) { diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fake_quantize_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fake_quantize_node.h index af75677ab82..a56b94fdf40 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fake_quantize_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fake_quantize_node.h @@ -121,11 +121,22 @@ public: InferenceEngine::Precision getInputPrecision() const { return inputPrecision; } InferenceEngine::Precision getOutputPrecision() const { return outputPrecision; } - void appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims = {}, int align = -1, bool initAsBinary = false, - bool initBinaryMemory = false) override; + // MKLDNN quantization_injectors assumes that quantization data memory is always aligned on 16 + // by length of AVX512 vector register which is also enough for AVX2 and SSE42 implementations. + // Otherwise it can lead to buffer over-read and performance penalties due to denormals. 
+ void appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims = {}, int align = 16) override; + void appendBinPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, std::vector& binaryPostOpsMem) override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + enum BroadcastingPolicy { + PerChannel, // all FQ operations are per channel + PerTensor, // all FQ operations are per tensor + Mixed, // some per channel, some per tensor + }; + + BroadcastingPolicy getBroadcastingPolicy() const { return broadcastingPolicy; } + MKLDNNMemoryPtr cropLowMemory; MKLDNNMemoryPtr cropHighMemory; MKLDNNMemoryPtr inputScaleMemory; @@ -149,6 +160,7 @@ private: void init() override; std::vector getDataFormats() const; + void initializePostOpData(const VectorDims &postOpDims, const size_t bufferAlignment); void executeReference(); void executeBinarization(const std::unique_ptr &pKernel) const; void executeQuantization(const std::unique_ptr &pKernel) const; @@ -195,6 +207,8 @@ private: InferenceEngine::Precision outputPrecision = InferenceEngine::Precision::FP32; std::string errorPrefix; + + BroadcastingPolicy broadcastingPolicy; }; } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.cpp index 68bad2f2f92..8eaea33af95 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.cpp @@ -147,13 +147,7 @@ void MKLDNNFullyConnectedNode::createPrimitive() { else primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_WEIGHTS, getParentEdgeAt(WEIGHTS_ID)->getMemory().GetPrimitive()}, {DNNL_ARG_DST, dst}}; - auto post_ops = attr->get_post_ops(); - int idx = 0; - for (int i = 0; i < post_ops.len(); i++) { - if (post_ops.kind(i) == mkldnn::primitive::kind::binary) { - primArgs.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1, binaryPostOpsArgs[idx++]}); - } - } + appendPostOpArgs(*attr); } void MKLDNNFullyConnectedNode::execute(mkldnn::stream strm) { @@ -183,42 +177,32 @@ bool MKLDNNFullyConnectedNode::canFuse(const MKLDNNNodePtr& node) const { return canFuseSimpleOperation(node); } -void MKLDNNFullyConnectedNode::setPostOps(mkldnn::primitive_attr &attr, bool initWeights = false, bool initAsBinary = false) { - bool initBinaryMemory = initWeights; +void MKLDNNFullyConnectedNode::setPostOps(mkldnn::primitive_attr &attr, bool initWeights = false) { mkldnn::post_ops ops; + auto getBinPostOpShape = [&](){ + const size_t binaryShapeRank = getOutputShapeAtPort(0).getRank() == 3 ? 
2 : getOutputShapeAtPort(0).getRank(); + VectorDims binaryShape(binaryShapeRank, 1); + const size_t channelAxis = getFusingAxis(); + // always use 1 as channelAxis for binary Shape, since oneDNN primitive is actually always 2D + binaryShape[1] = getOutputShapeAtPort(0).getStaticDims()[channelAxis]; + + return binaryShape; + }; + for (auto &node : fusedWith) { - auto* fakeQuantizeNode = dynamic_cast(node.get()); - if (fakeQuantizeNode) { - // no need to fill post ops dims for fq, make sense only for bin fq - fakeQuantizeNode->appendPostOps(ops, VectorDims{}, -1, initAsBinary, initBinaryMemory); - if (initBinaryMemory) { - if (fakeQuantizeNode->cropHighMemory) - binaryPostOpsArgs.push_back(fakeQuantizeNode->cropHighMemory->GetPrimitive()); - if (fakeQuantizeNode->cropLowMemory) - binaryPostOpsArgs.push_back(fakeQuantizeNode->cropLowMemory->GetPrimitive()); - if (fakeQuantizeNode->inputScaleMemory) - binaryPostOpsArgs.push_back(fakeQuantizeNode->inputScaleMemory->GetPrimitive()); - if (fakeQuantizeNode->inputShiftMemory) - binaryPostOpsArgs.push_back(fakeQuantizeNode->inputShiftMemory->GetPrimitive()); - if (fakeQuantizeNode->outputScaleMemory) - binaryPostOpsArgs.push_back(fakeQuantizeNode->outputScaleMemory->GetPrimitive()); - if (fakeQuantizeNode->outputShiftMemory) - binaryPostOpsArgs.push_back(fakeQuantizeNode->outputShiftMemory->GetPrimitive()); - } + if (auto* fakeQuantizeNode = dynamic_cast(node.get())) { + fakeQuantizeNode->appendBinPostOps(ops, getBinPostOpShape(), binaryPostOpsArgs); continue; } - auto* eltwiseNode = dynamic_cast(node.get()); - if (eltwiseNode) { + if (auto* eltwiseNode = dynamic_cast(node.get())) { // TODO [DS]: change to shape from memory constexpr int align = -1; - eltwiseNode->appendPostOps(ops, getOutputShapeAtPort(0).getStaticDims(), align, initAsBinary, initBinaryMemory); - if (initBinaryMemory) { - if (eltwiseNode->scalesMemory) - binaryPostOpsArgs.push_back(eltwiseNode->scalesMemory->GetPrimitive()); - if (eltwiseNode->shiftsMemory) - binaryPostOpsArgs.push_back(eltwiseNode->shiftsMemory->GetPrimitive()); + if (eltwiseNode->getMKLDNNAlgorithm() != mkldnn::algorithm::undef) { + eltwiseNode->appendPostOps(ops, getOutputShapeAtPort(0).getStaticDims(), align); + } else { + eltwiseNode->appendBinPostOps(ops, getBinPostOpShape(), binaryPostOpsArgs); } continue; } @@ -280,7 +264,7 @@ const std::vector& MKLDNNFullyConnectedNode::getPrimitivesPriori MKLDNNNode::AttrPtr MKLDNNFullyConnectedNode::initPrimitiveAttr() { auto attr = std::make_shared(mkldnn::primitive_attr()); - setPostOps(*attr, true, true); + setPostOps(*attr); return attr; } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.h index 6749c9451c0..c8394bb1afd 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.h @@ -26,6 +26,10 @@ public: return false; } + size_t getFusingAxis() const override { + return getOutputShapeAtPort(0).getRank() == 3 ? 
2 : 1; + } + const std::vector& getPrimitivesPriority() override; void createDescriptor(const std::vector& inputDesc, const std::vector& outputDesc) override; @@ -43,8 +47,7 @@ public: static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; -protected: - AttrPtr initPrimitiveAttr(); + std::shared_ptr initPrimitiveAttr() override; private: void createDescriptorInternal(const mkldnn::memory::desc &inputDesc, @@ -54,7 +57,7 @@ private: InferenceEngine::SizeVector biasesDims; std::vector PostOpsIntBlobMemory; - void setPostOps(mkldnn::primitive_attr &attr, bool initWeights, bool initAsBinary); + void setPostOps(mkldnn::primitive_attr &attr, bool initWeights); bool withBiases = false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_matmul_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_matmul_node.cpp index e9ebde02cff..944f65ff5f0 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_matmul_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_matmul_node.cpp @@ -17,6 +17,7 @@ #include "common/cpu_memcpy.h" #include #include "memory_desc/dnnl_blocked_memory_desc.h" +#include "nodes/mkldnn_fake_quantize_node.h" #include "utils/general_utils.h" #include "memory_desc/cpu_memory_desc_utils.h" #include "mkldnn_extension_utils.h" @@ -54,31 +55,65 @@ bool MKLDNNMatMulNode::isSupportedOperation(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) : - MKLDNNNode(op, eng, cache) { + MKLDNNNode(op, eng, cache), withBiases(false) { std::string errorMessage; + errorPrefix = "MatMul node with name '" + getName() + "'"; + if (!isSupportedOperation(op, errorMessage)) IE_THROW(NotImplemented) << errorMessage; - errorPrefix = "MatMul node with name '" + getName() + "'"; - const auto matMul = std::dynamic_pointer_cast(op); + if (!matMul) { + IE_THROW(NotImplemented) << "Operation with name " << op->get_friendly_name() << ":" << op->get_type_name() << + " is not an instance of MatMul from opset1"; + } + transposeIn[0] = matMul->get_transpose_a(); transposeIn[1] = matMul->get_transpose_b(); } bool MKLDNNMatMulNode::canFuse(const MKLDNNNodePtr& node) const { - return one_of(node->getAlgorithm(), EltwiseRelu, EltwiseGelu, EltwiseElu, EltwiseSigmoid, EltwiseClamp, EltwiseTanh, - EltwiseSwish, EltwiseHswish, EltwiseMish, EltwiseHsigmoid, EltwiseRoundHalfToEven, - EltwiseRoundHalfAwayFromZero, EltwiseAbs, EltwiseSqrt, EltwiseSoftRelu); + // per channel binary post op for rank > 2D is supported only by oneDNN reference implementation because of unusual MatMul channel axis (issue 6669) + if (getOutputShapeAtPort(0).getRank() > 2) { + if (const auto* eltwiseNode = dynamic_cast(node.get())) { + if (one_of(eltwiseNode->getAlgorithm(), + EltwiseAdd, EltwiseMultiply, EltwiseSubtract, EltwiseDivide, EltwisePrelu, EltwiseMulAdd, EltwisePowerStatic) && + eltwiseNode->getBroadcastingPolicy() != MKLDNNEltwiseNode::PerTensor) { + return false; + } + } else if (const auto* fakeQuantizeNode = dynamic_cast(node.get())) { + if (fakeQuantizeNode->getBroadcastingPolicy() != MKLDNNFakeQuantizeNode::PerTensor) { + return false; + } + } + } + + return canFuseSimpleOperation(node); } -void MKLDNNMatMulNode::setPostOps(mkldnn::primitive_attr &attr, const VectorDims& dims, bool initWeights = false) const { +void MKLDNNMatMulNode::setPostOps(mkldnn::primitive_attr &attr, const VectorDims& dims, bool initWeights = false) { mkldnn::post_ops ops; - for (auto &node : fusedWith) { + auto getBinPostOpShape = [&](){ + const auto 
outShapeRank = dims.size(); + const auto chIdx = getFusingAxis(); + std::vector binaryShape(outShapeRank, 1); + binaryShape[chIdx] = dims[chIdx]; + return binaryShape; + }; + + for (const auto &node : fusedWith) { if (auto* eltwiseNode = dynamic_cast(node.get())) { - eltwiseNode->appendPostOps(ops, dims); + // TODO [DS]: change to shape from memory + if (eltwiseNode->getMKLDNNAlgorithm() != mkldnn::algorithm::undef) { + eltwiseNode->appendPostOps(ops, dims); + } else { + eltwiseNode->appendBinPostOps(ops, getBinPostOpShape(), binaryPostOpsArgs); + } + continue; + } else if (auto* fakeQuantizeNode = dynamic_cast(node.get())) { + fakeQuantizeNode->appendBinPostOps(ops, getBinPostOpShape(), binaryPostOpsArgs); continue; } @@ -88,8 +123,7 @@ void MKLDNNMatMulNode::setPostOps(mkldnn::primitive_attr &attr, const VectorDims attr.set_post_ops(ops); } - -MKLDNNNode::AttrPtr MKLDNNMatMulNode::initPrimitiveAttr(const VectorDims &dims) const { +MKLDNNNode::AttrPtr MKLDNNMatMulNode::initPrimitiveAttr(const VectorDims &dims) { auto attr = std::make_shared(mkldnn::primitive_attr()); setPostOps(*attr, dims, true); @@ -97,7 +131,7 @@ MKLDNNNode::AttrPtr MKLDNNMatMulNode::initPrimitiveAttr(const VectorDims &dims) return attr; } -MKLDNNNode::AttrPtr MKLDNNMatMulNode::initPrimitiveAttr() const { +MKLDNNNode::AttrPtr MKLDNNMatMulNode::initPrimitiveAttr() { auto dummyShape = MemoryDescUtils::makeDummyShape(getOutputShapeAtPort(0)); return initPrimitiveAttr(dummyShape.getStaticDims()); } @@ -131,12 +165,29 @@ static VectorDims getStridesAndModifyShape(Shape& shape, const bool transpose) { return strides; } +mkldnn::memory::desc MKLDNNMatMulNode::getBiasDescFrom(const DnnlMemoryDescCPtr outMemDesc) { + // oneDNN matmul requires shape for bias desc to be the same rank + VectorDims biasDims(outMemDesc->getShape().getRank(), 1); + const auto outDims = outMemDesc->getShape().getStaticDims(); + const auto chIdx = getFusingAxis(); + biasDims[chIdx] = outDims[chIdx]; + const auto bdt = MKLDNNExtensionUtils::IEPrecisionToDataType(getOriginalInputPrecisionAtPort(2)); + + return mkldnn::memory::desc(MKLDNNExtensionUtils::convertToDnnlDims(biasDims), bdt, memory::format_tag::any); +} + void MKLDNNMatMulNode::getSupportedDescriptors() { - if (getParentEdges().size() != 2) + if (getParentEdges().size() != getOriginalInputsNumber()) IE_THROW() << errorPrefix << " has incorrect number of input edges for layer " << getName(); if (getChildEdges().empty()) IE_THROW() << errorPrefix << " has incorrect number of output edges for layer " << getName(); + withBiases = getOriginalInputsNumber() == 3; + + auto canBeExecutedInInt8 = [](const Precision firstInput, const Precision secondInput) { + return one_of(firstInput, Precision::U8, Precision::I8) && secondInput == Precision::I8; + }; + auto firstInPortPrec = getOriginalInputPrecisionAtPort(0); auto secondInPortPrec = getOriginalInputPrecisionAtPort(1); auto outPortPrec = getOriginalOutputPrecisionAtPort(0); @@ -154,6 +205,9 @@ void MKLDNNMatMulNode::getSupportedDescriptors() { outPortPrec = fusedWith[fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0); } + if (!canBeExecutedInInt8(firstInPortPrec, secondInPortPrec) && one_of(outPortPrec, Precision::U8, Precision::I8)) + outPortPrec = Precision::FP32; // INT output is not supported for non-INT inputs + const auto& inputShape0 = getInputShapeAtPort(0); const auto& inputShape1 = getInputShapeAtPort(1); const auto& outputShape = getOutputShapeAtPort(0); @@ -206,12 +260,19 @@ void MKLDNNMatMulNode::getSupportedDescriptors() { void 
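/* Reviewer illustration, not part of the patch: for MatMul the fusing axis is the last
 * output dimension, so both getBiasDescFrom() above and the binary post-op shape keep
 * the full output rank with every dimension equal to 1 except that axis. A minimal
 * sketch, assuming output dims {2, 32, 60}:
 *   VectorDims binDims(3, 1);   // {1, 1, 1}
 *   binDims[2] = 60;            // {1, 1, 60} - per-channel operand / bias dims
 * A per-channel bias of 60 elements is therefore exposed to oneDNN as a {1, 1, 60} desc.
 */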
MKLDNNMatMulNode::createDescriptor(const std::vector& inputDesc, const std::vector& outputDesc) { - MKLDNNDescriptor desc{ - std::make_shared(inDataDesc[0]->getDnnlDesc(), - inDataDesc[1]->getDnnlDesc(), - outDataDesc->getDnnlDesc())}; + std::shared_ptr matmul_desc; + if (withBiases) { + matmul_desc.reset(new matmul::desc(inDataDesc[0]->getDnnlDesc(), + inDataDesc[1]->getDnnlDesc(), + getBiasDescFrom(outDataDesc), + outDataDesc->getDnnlDesc())); + } else { + matmul_desc.reset(new matmul::desc(inDataDesc[0]->getDnnlDesc(), + inDataDesc[1]->getDnnlDesc(), + outDataDesc->getDnnlDesc())); + } - descs.push_back(desc); + descs.emplace_back(matmul_desc); } void MKLDNNMatMulNode::initSupportedPrimitiveDescriptors() { @@ -262,9 +323,13 @@ void MKLDNNMatMulNode::createPrimitive() { MemoryDescPtr MKLDNNMatMulNode::getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) { auto desc = idx > 0 ? primitive_desc_it.weights_desc(idx - 1): primitive_desc_it.src_desc(idx); - return std::make_shared( - MKLDNNExtensionUtils::DataTypeToIEPrecision(static_cast(desc.data.data_type)), - getInputShapeAtPort(idx)); /* provide initial shapes, so hide transpose effect */ + + if (idx < 2) // inputs + return std::make_shared( + MKLDNNExtensionUtils::DataTypeToIEPrecision(static_cast(desc.data.data_type)), + getInputShapeAtPort(idx)); /* provide initial shapes, so hide transpose effect */ + else // bias + return MKLDNNExtensionUtils::makeDescriptor(desc); } bool MKLDNNMatMulNode::created() const { @@ -300,10 +365,7 @@ void MKLDNNMatMulNode::prepareParams() { AttrPtr attr; if (isDynamicNode()) { - if (!pAttr) { - pAttr = initPrimitiveAttr(src0MemPtr->getStaticDims()); - } - attr = pAttr; + attr = initPrimitiveAttr(dstMemPtr->getStaticDims()); const auto& src0Desc = src0MemPtr->getDesc(); const auto& src1Desc = src1MemPtr->getDesc(); @@ -323,13 +385,22 @@ void MKLDNNMatMulNode::prepareParams() { auto dstDnnlDesc = dstMemPtr->GetDescWithType(); - MKLDNNDescriptor desc{ - std::make_shared(src0TransposedDesc->getDnnlDesc(), - src1TransposedDesc->getDnnlDesc(), - dstDnnlDesc->getDnnlDesc())}; + std::shared_ptr matmul_desc; - matmul::primitive_desc prim_desc; + if (withBiases) { + matmul_desc.reset(new mkldnn::matmul::desc{src0TransposedDesc->getDnnlDesc(), + src1TransposedDesc->getDnnlDesc(), + getBiasDescFrom(dstDnnlDesc), + dstDnnlDesc->getDnnlDesc()}); + } else { + matmul_desc.reset(new mkldnn::matmul::desc(src0TransposedDesc->getDnnlDesc(), + src1TransposedDesc->getDnnlDesc(), + dstDnnlDesc->getDnnlDesc())); + } + + MKLDNNDescriptor desc(matmul_desc); primitive_desc_iterator itpd = desc.createPrimitiveDescriptorIterator(getEngine(), *attr); + matmul::primitive_desc prim_desc; while (static_cast(itpd)) { impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str()); @@ -347,6 +418,10 @@ void MKLDNNMatMulNode::prepareParams() { primArgs[DNNL_ARG_SRC_0] = src0MemPtr->GetPrimitive(); primArgs[DNNL_ARG_WEIGHTS_0] = src1MemPtr->GetPrimitive(); primArgs[DNNL_ARG_DST] = dstMemPtr->GetPrimitive(); + if (withBiases) + primArgs[DNNL_ARG_BIAS] = getParentEdgeAt(2)->getMemoryPtr()->GetPrimitive(); + + appendPostOpArgs(*attr); } void MKLDNNMatMulNode::executeDynamicImpl(dnnl::stream strm) { diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_matmul_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_matmul_node.h index 9c9489df237..ab3abd3a35c 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_matmul_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_matmul_node.h @@ -32,6 
+32,10 @@ public: return getOriginalInputsNumber(); } + size_t getFusingAxis() const override { + return getOutputShapeAtPort(0).getRank() - 1; + } + void prepareParams() override; void executeDynamicImpl(mkldnn::stream strm) override; @@ -39,11 +43,15 @@ public: const std::vector& getPrimitivesPriority() override; protected: - AttrPtr initPrimitiveAttr() const override; - AttrPtr initPrimitiveAttr(const VectorDims& dims) const; + AttrPtr initPrimitiveAttr() override; + AttrPtr initPrimitiveAttr(const VectorDims& dims); private: - void setPostOps(mkldnn::primitive_attr &attr, const VectorDims& dims, bool initWeights) const; + mkldnn::memory::desc getBiasDescFrom(const DnnlMemoryDescCPtr outMemDesc); + + bool withBiases; + + void setPostOps(mkldnn::primitive_attr &attr, const VectorDims& dims, bool initWeights); std::string errorPrefix; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp index 379253233ee..0a95c61d86e 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp @@ -511,7 +511,7 @@ void MKLDNNPoolingNode::initDescriptor(const NodeConfig& config) { selectedPD->setConfig(rightConfig); } -MKLDNNNode::AttrPtr MKLDNNPoolingNode::initPrimitiveAttr() const { +MKLDNNNode::AttrPtr MKLDNNPoolingNode::initPrimitiveAttr() { auto attr = std::make_shared(mkldnn::primitive_attr()); setPostOps(*attr, true); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.h index f3a6fc781cc..75f726d567b 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.h @@ -34,7 +34,7 @@ public: static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; protected: - AttrPtr initPrimitiveAttr() const override; + AttrPtr initPrimitiveAttr() override; private: void setPostOps(mkldnn::primitive_attr &attr, bool initWeights = false) const; diff --git a/inference-engine/src/mkldnn_plugin/utils/cpu_utils.hpp b/inference-engine/src/mkldnn_plugin/utils/cpu_utils.hpp index e9c213f23fb..ed4e94e1516 100644 --- a/inference-engine/src/mkldnn_plugin/utils/cpu_utils.hpp +++ b/inference-engine/src/mkldnn_plugin/utils/cpu_utils.hpp @@ -4,6 +4,13 @@ #pragma once +#include +#include +#include + +#include "ie_common.h" +#include "ie_layouts.h" + namespace MKLDNNPlugin { /** @@ -36,7 +43,9 @@ inline std::vector getNormalizedDimsBySize(const InferenceEngine::SizeVe * flag which specify how we compare C dims if value is undefined (weak or strong) * @return true if broadcastable, false otherwise. */ -inline bool isPerTensorOrPerChannelBroadcastable(const InferenceEngine::SizeVector &firstInputDims, const InferenceEngine::SizeVector& secondInputDims, +inline bool isPerTensorOrPerChannelBroadcastable(const InferenceEngine::SizeVector &firstInputDims, + const InferenceEngine::SizeVector& secondInputDims, + size_t channelAxis, bool weakComparison = false) { bool (*dimsEqual)(size_t, size_t) = weakComparison ? 
static_cast(dimsEqualWeak) : static_cast(dimsEqualStrong); @@ -47,7 +56,7 @@ inline bool isPerTensorOrPerChannelBroadcastable(const InferenceEngine::SizeVect std::vector normalizedSecondInputDims = getNormalizedDimsBySize(secondInputDims, firstInputDims.size()); for (size_t i = 0; i < normalizedSecondInputDims.size(); i++) { - if ((i == 1 && !dimsEqual(normalizedSecondInputDims[i], firstInputDims[1])) || (i != 1 && normalizedSecondInputDims[i] != 1)) + if ((i == channelAxis && !dimsEqual(normalizedSecondInputDims[i], firstInputDims[i])) || (i != channelAxis && normalizedSecondInputDims[i] != 1)) return false; } return true; diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/mat_mul_with_constant_transformation.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/mat_mul_with_constant_transformation.cpp index 776be228425..b612da99596 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/mat_mul_with_constant_transformation.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/mat_mul_with_constant_transformation.cpp @@ -51,7 +51,7 @@ std::vector testValues = { { std::vector(4 * 2, 2.f), ngraph::element::f32, ngraph::Shape{ 2, 4 } }, { 256ul, {{1}, {1}, {2, 1}, {2, 1}}, {-128.f}, {127.f}, {-128.f, -12.8f}, {127.f, 12.7f} }, { {}, {}, {} }, - "FullyConnected", + "MatMul", "U8" }, // 4D with Dq on weights @@ -61,7 +61,7 @@ std::vector testValues = { { std::vector(4 * 2, 2.f), ngraph::element::i8, ngraph::Shape{ 2, 4 } }, {}, { ngraph::element::f32, {}, {{0.1f, 0.01}, ngraph::element::f32, ngraph::Shape{ 2, 1 }} }, - "FullyConnected", + "MatMul", "U8" }, // 3D with the same values diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/mat_mul.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/mat_mul.cpp index 3241ebef007..58b2bbde7bb 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/mat_mul.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/mat_mul.cpp @@ -11,7 +11,8 @@ using namespace LayerTestsDefinitions; namespace { const std::vector inputPrecisions = { - InferenceEngine::Precision::FP32 + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::I32, }; const std::vector shapeRelatedParams = { diff --git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/mat_mul.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/mat_mul.cpp index 367886f31e2..063e13e5d7d 100644 --- a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/mat_mul.cpp +++ b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/mat_mul.cpp @@ -2,9 +2,12 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "shared_test_classes/single_layer/mat_mul.hpp" #include "shared_test_classes/base/ov_subgraph.hpp" +#include "ie_precision.hpp" #include "test_utils/fusing_test_utils.hpp" #include "ngraph_functions/builders.hpp" +#include using namespace ngraph; using namespace InferenceEngine; @@ -139,11 +142,10 @@ protected: const auto& inShapeA = inputDynamicShapes[0]; const auto& inShapeB = inputDynamicShapes[1]; - /* @todo - * Currently nodes are not fused thought Reshape - * Check can be deleted after this limitation is gone - */ - if (nodeType == MatMulNodeType::MatMul && 
inShapeA.size() < 4 && inShapeB.size() < 4) + // see comment in MKLDNNMatMulNode::canFuse + if (!(nodeType == MatMulNodeType::MatMul && + std::get<0>(fusingParams) && std::get<0>(fusingParams)->getFusedOpsNames().find("(PerChannel)") != std::string::npos && + std::max(inShapeA.size(), inShapeB.size()) > 2)) std::tie(postOpMgrPtr, fusedOps) = fusingParams; configuration.insert(additionalConfig.begin(), additionalConfig.end()); @@ -179,6 +181,8 @@ TEST_P(MatMulLayerCPUTest, CompareWithRefs) { namespace { /* ============= Common params ============= */ +std::map emptyAdditionalConfig; + std::vector> additionalConfig { std::map{/* empty config */}, {{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::YES}} @@ -196,15 +200,16 @@ std::vector filterSpecificParams() { return specificParams; } +const auto fusingBias = fusingSpecificParams{std::make_shared(std::vector{ + {[](std::shared_ptr inpNode, const element::Type& ngPrc, ParameterVector& params) { + size_t last_dim = inpNode->get_output_partial_shape(0).rbegin()->get_length(); + auto bias = builder::makeConstant(ngPrc, Shape{last_dim}, std::vector{}, true); + return std::make_shared(inpNode, bias); + }, "fusingBias"}}), {"Add"}}; + /* ============= FullyConnected ============= */ namespace fullyConnected { -const auto fusingBiasFC = fusingSpecificParams{std::make_shared(std::vector{ - {[](std::shared_ptr inpNode, const element::Type& ngPrc, ParameterVector& params) { - auto bias = builder::makeConstant(ngPrc, Shape({inpNode->get_output_shape(0).back()}), std::vector{}, true); - return std::make_shared(inpNode, bias); - }, "fusingBiasFC"}}), {"Add"}}; - const std::vector IS2D = { {static_shapes_to_test_representation({{59, 1}, {1, 120}}), {false, false}}, {static_shapes_to_test_representation({{59, 1}, {1, 120}}), {true, false}}, @@ -229,26 +234,46 @@ const std::vector IS2D = { std::vector fusingParamsSet2D { emptyFusingSpec, - fusingBiasFC, + fusingBias, fusingRelu, fusingMultiplyPerChannel, - fusingPReluPerTensor + fusingScaleShift, // EltwiseMulAdd fusing + fusingPReluPerTensor, + fusingFakeQuantizePerChannelRelu, + fusingFakeQuantizePerTensorRelu, }; -const auto fullyConnectedParams2D = ::testing::Combine(::testing::ValuesIn(IS2D), - ::testing::ValuesIn(netPRCs), - ::testing::Values(ElementType::undefined), - ::testing::Values(ElementType::undefined), - ::testing::Values(helpers::InputLayerType::CONSTANT), - ::testing::Values(CommonTestUtils::DEVICE_CPU), - ::testing::ValuesIn(additionalConfig)); +std::vector fusingParamsSet2DBF16 { + emptyFusingSpec, + fusingBias, + fusingRelu, + fusingPReluPerTensor, +}; -const auto testParams2D = ::testing::Combine(fullyConnectedParams2D, +const auto testParams2D = ::testing::Combine(::testing::Combine(::testing::ValuesIn(IS2D), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::undefined), + ::testing::Values(ElementType::undefined), + ::testing::Values(helpers::InputLayerType::CONSTANT), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(emptyAdditionalConfig)), ::testing::Values(MatMulNodeType::FullyConnected), ::testing::ValuesIn(fusingParamsSet2D), ::testing::ValuesIn(filterSpecificParams())); +const auto testParams2DBF16 = ::testing::Combine(::testing::Combine(::testing::ValuesIn(IS2D), + ::testing::ValuesIn(netPRCs), + ::testing::Values(ElementType::undefined), + ::testing::Values(ElementType::undefined), + ::testing::Values(helpers::InputLayerType::CONSTANT), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::ValuesIn(additionalConfig)), 
+ ::testing::Values(MatMulNodeType::FullyConnected), + ::testing::ValuesIn(fusingParamsSet2DBF16), + ::testing::ValuesIn(filterSpecificParams())); + INSTANTIATE_TEST_SUITE_P(smoke_FC_2D, MatMulLayerCPUTest, testParams2D, MatMulLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_BF16, MatMulLayerCPUTest, testParams2DBF16, MatMulLayerCPUTest::getTestCaseName); const std::vector IS3D = { {static_shapes_to_test_representation({{1, 32, 120}, {120, 5}}), {false, false}}, @@ -266,23 +291,46 @@ const std::vector IS3D = { std::vector fusingParamsSet3D { emptyFusingSpec, - fusingBiasFC + fusingBias, + fusingMultiplyPerChannel, + fusingFakeQuantizePerChannel, + fusingFakeQuantizePerTensorRelu, +}; + +std::vector fusingParamsSet3DBF16 { + emptyFusingSpec, + fusingBias, + fusingMultiplyPerChannel, }; const auto fullyConnectedParams3D = ::testing::Combine(::testing::ValuesIn(IS3D), - ::testing::ValuesIn(netPRCs), + ::testing::Values(ElementType::f32), ::testing::Values(ElementType::undefined), ::testing::Values(ElementType::undefined), ::testing::Values(helpers::InputLayerType::CONSTANT), ::testing::Values(CommonTestUtils::DEVICE_CPU), - ::testing::ValuesIn(additionalConfig)); + ::testing::Values(emptyAdditionalConfig)); + +const auto fullyConnectedParams3DBF16 = ::testing::Combine(::testing::ValuesIn(IS3D), + ::testing::ValuesIn(netPRCs), + ::testing::Values(ElementType::undefined), + ::testing::Values(ElementType::undefined), + ::testing::Values(helpers::InputLayerType::CONSTANT), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::ValuesIn(additionalConfig)); const auto testParams3D = ::testing::Combine(fullyConnectedParams3D, ::testing::Values(MatMulNodeType::FullyConnected), ::testing::ValuesIn(fusingParamsSet3D), ::testing::ValuesIn(filterSpecificParams())); +const auto testParams3DBF16 = ::testing::Combine(fullyConnectedParams3DBF16, + ::testing::Values(MatMulNodeType::FullyConnected), + ::testing::ValuesIn(fusingParamsSet3DBF16), + ::testing::ValuesIn(filterSpecificParams())); + INSTANTIATE_TEST_SUITE_P(smoke_FC_3D, MatMulLayerCPUTest, testParams3D, MatMulLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_FC_3D_BF16, MatMulLayerCPUTest, testParams3DBF16, MatMulLayerCPUTest::getTestCaseName); std::vector> filterAdditionalConfig_Brgemm() { std::vector> additionalConfig = { @@ -357,7 +405,9 @@ const std::vector IS = { {static_shapes_to_test_representation({{55, 12}, {12, 55}}), {true, false}}, {static_shapes_to_test_representation({{55, 12}, {12, 55}}), {false, true}}, {static_shapes_to_test_representation({{55, 12}, {12, 55}}), {true, true}}, +}; +const std::vector IS_Dynamic = { { { //dynamic case description each pair per each input has {{dynamic shape}, {{static shape case1}, {static shape case2}, ...} {{-1, -1}, {{55, 12}, {33, 7}}}, // input 0 @@ -507,7 +557,16 @@ const std::vector IS = { std::vector matmulFusingParams { emptyFusingSpec, fusingElu, - fusingSqrt + fusingSqrt, + fusingPReluPerTensor, + fusingMultiplyPerChannel, + fusingAddPerTensor, + fusingBias, + fusingFakeQuantizePerChannel, + /* @todo FQ unfolds into FQ + Convert + Substract + Multiply after LPT, + * so Relu cannot be fused in this case. 
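* (likely because the dequantization Subtract/Multiply land between the FakeQuantize and
* the Relu, so the expected fused-ops pattern no longer matches; this is an assumption)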
Should be analysed */ + // fusingFakeQuantizePerChannelRelu, + fusingFakeQuantizePerTensorRelu, }; const auto matMulParams = ::testing::Combine(::testing::ValuesIn(IS), @@ -523,7 +582,70 @@ const auto testParams = ::testing::Combine(matMulParams, ::testing::ValuesIn(matmulFusingParams), ::testing::ValuesIn(filterSpecificParams())); -INSTANTIATE_TEST_SUITE_P(smoke_MM, MatMulLayerCPUTest, testParams, MatMulLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_MM_Static, MatMulLayerCPUTest, testParams, MatMulLayerCPUTest::getTestCaseName); + + +const auto matMulParamsDynamic = ::testing::Combine(::testing::ValuesIn(IS_Dynamic), + ::testing::ValuesIn(netPRCs), + ::testing::Values(ElementType::undefined), + ::testing::Values(ElementType::undefined), + ::testing::Values(helpers::InputLayerType::PARAMETER), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::ValuesIn(additionalConfig)); + +const auto testParamsDynamic = ::testing::Combine(matMulParamsDynamic, + ::testing::Values(MatMulNodeType::MatMul), + ::testing::Values(emptyFusingSpec), + ::testing::ValuesIn(filterSpecificParams())); + +INSTANTIATE_TEST_SUITE_P(smoke_MM_Dynamic, MatMulLayerCPUTest, testParamsDynamic, MatMulLayerCPUTest::getTestCaseName); + + +const std::vector IS_Dynamic_Fusing = { + { + { //dynamic case description each pair per each input has {{dynamic shape}, {{static shape case1}, {static shape case2}, ...} + {{-1, -1}, {{16, 12}, {33, 7}}}, // input 0 + {{-1, 33}, {{12, 33}, {7, 33}}} // input 1 + }, + {false, false} + }, + { + { //dynamic case description each pair per each input has {{dynamic shape}, {{static shape case1}, {static shape case2}, ...} + {{-1, -1, -1, -1}, {{1, 2, 32, 60}, {1, 2, 32, 30}}}, // input 0 + {{-1, 5}, {{60, 5}, {30, 5}}} // input 1 + }, + {false, false} + }, + { + { //dynamic case description each pair per each input has {{dynamic shape}, {{static shape case1}, {static shape case2}, ...} + {{-1, -1, -1}, {{7, 32, 60}, {7, 32, 30}}}, // input 0 + {{-1, -1, -1, 25}, {{3, 7, 60, 25}, {3, 7, 30, 25}}} // input 1 + }, + {false, false} + }, + { + { //dynamic case description each pair per each input has {{dynamic shape}, {{static shape case1}, {static shape case2}, ...} + {{-1, -1, -1}, {{10, 10, 10}, {5, 5, 5}}}, // input 0 + {{-1, -1, 5}, {{10, 10, 5}, {5, 5, 5}}} // input 1 + }, + {false, false} + }, +}; + +const auto matMulParamsDynamicFusing = ::testing::Combine(::testing::ValuesIn(IS_Dynamic_Fusing), + ::testing::ValuesIn(netPRCs), + ::testing::Values(ElementType::undefined), + ::testing::Values(ElementType::undefined), + ::testing::Values(helpers::InputLayerType::PARAMETER), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::ValuesIn(additionalConfig)); + +const auto testParamsDynamicFusing = ::testing::Combine(matMulParamsDynamicFusing, + ::testing::Values(MatMulNodeType::MatMul), + ::testing::ValuesIn(matmulFusingParams), + ::testing::ValuesIn(filterSpecificParams())); + +INSTANTIATE_TEST_SUITE_P(smoke_MM_Dynamic_Fusing, MatMulLayerCPUTest, testParamsDynamicFusing, MatMulLayerCPUTest::getTestCaseName); } // namespace matmul diff --git a/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/reshape_fc.cpp b/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/reshape_fc.cpp deleted file mode 100644 index 40b5215bd74..00000000000 --- a/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/reshape_fc.cpp +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (C) 2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include 
"test_utils/fusing_test_utils.hpp" -#include "ngraph_functions/builders.hpp" - -using namespace ngraph; -using namespace InferenceEngine; -using namespace CPUTestUtils; - -namespace SubgraphTestsDefinitions { - -using ReshapeFCTestParams = std::tuple, // IS fully connected - bool, // transpose B - fusingSpecificParams>; - -class ReshapeFCTest : public testing::WithParamInterface, public CpuTestWithFusing, - virtual public LayerTestsUtils::LayerTestsCommon { -public: - static std::string getTestCaseName(testing::TestParamInfo obj) { - std::pair isFc; - bool transpB; - fusingSpecificParams fusingParams; - std::tie(isFc, transpB, fusingParams) = obj.param; - SizeVector isA = isFc.first; SizeVector isB = isFc.second; - - std::ostringstream result; - result << "IS_reshape=" << CommonTestUtils::vec2str(isA) << "_"; - result << "IS_fc_B=" << CommonTestUtils::vec2str(isB) << "_"; - result << "Transp_B=" << transpB; - result << CpuTestWithFusing::getTestCaseName(fusingParams); - - return result.str(); - } - -protected: - void SetUp() override { - targetDevice = CommonTestUtils::DEVICE_CPU; - std::pair isFc; - bool transpB; - fusingSpecificParams fusingParams; - std::tie(isFc, transpB, fusingParams) = this->GetParam(); - std::tie(postOpMgrPtr, fusedOps) = fusingParams; - SizeVector isReshape = isFc.first; SizeVector isB = isFc.second; - SizeVector isA(2); - isA[0] = isReshape[0]; - isA[1] = std::accumulate(isReshape.begin() + 1, isReshape.end(), size_t{1}, std::multiplies()); - if (transpB) { - std::swap(*(isB.end() - 1), *(isB.end() - 2)); - } - - auto inputParams = builder::makeParams(element::f32, {isReshape}); - auto paramOuts = helpers::convert2OutputVector(helpers::castOps2Nodes(inputParams)); - - auto constNode = builder::makeConstant(element::i64, {isA.size()}, isA); - auto reshape = std::make_shared(paramOuts[0], constNode, true); - - auto matrixB = builder::makeConstant(element::f32, isB, {}, true); - auto matMul = builder::makeMatMul(reshape, matrixB, false, transpB); - - const auto netType = element::f32; - selectedType = makeSelectedTypeStr("jit_gemm", netType); - - function = makeNgraphFunction(netType, inputParams, matMul, "ReshapeFC"); - } -}; - -TEST_P(ReshapeFCTest, CompareWithRefs) { - SKIP_IF_CURRENT_TEST_IS_DISABLED() - - Run(); - CheckNodeOfTypeCount(executableNetwork, "Reshape", 0); - CheckPluginRelatedResults(executableNetwork, "FullyConnected"); -} - -namespace { - -const std::vector transpose = { - true, false -}; - -const std::vector> isFC = { - {{71, 128, 1, 1}, {128, 20}}, - {{1, 24, 2, 7}, {336, 16}} -}; - -std::vector fusingParamsSet { - emptyFusingSpec, - fusingAddPerChannel -}; - -const auto reshapeFCParams = ::testing::Combine(::testing::ValuesIn(isFC), - ::testing::ValuesIn(transpose), - ::testing::ValuesIn(fusingParamsSet)); - -INSTANTIATE_TEST_SUITE_P(smoke_Check, ReshapeFCTest, reshapeFCParams, ReshapeFCTest::getTestCaseName); - -} // namespace - -} // namespace SubgraphTestsDefinitions diff --git a/inference-engine/tests/functional/plugin/cpu/test_utils/fusing_test_utils.hpp b/inference-engine/tests/functional/plugin/cpu/test_utils/fusing_test_utils.hpp index 22fd8278b4a..709269dc4ab 100644 --- a/inference-engine/tests/functional/plugin/cpu/test_utils/fusing_test_utils.hpp +++ b/inference-engine/tests/functional/plugin/cpu/test_utils/fusing_test_utils.hpp @@ -5,6 +5,7 @@ #pragma once #include "cpu_test_utils.hpp" +#include #include namespace CPUTestUtils { @@ -75,6 +76,24 @@ protected: bool checkFusingPosition = true; }; +static size_t getFusingAxis(const 
std::shared_ptr& node) { + if (std::dynamic_pointer_cast(node)) + return node->get_output_partial_shape(0).size() - 1; // last dimension + else + return 1; // second dimension +} + +static ngraph::Shape generatePerChannelShape(const std::shared_ptr& node) { + const auto shape = node->get_output_partial_shape(0); + if (shape.size() == 1) + IE_THROW() << "If shape.size() == 1 then Granularity can be PerTensor only"; + ngraph::Shape perChannelShape(shape.size(), 1); + const auto channelAxis = getFusingAxis(node); + perChannelShape[channelAxis] = shape[channelAxis].get_length(); + + return perChannelShape; +} + /* FUSING PATTERNS */ const auto emptyFusingSpec = fusingSpecificParams{nullptr, {}}; @@ -120,11 +139,7 @@ const auto fusingSqrt = fusingSpecificParams{std::make_shared(std: const auto fusingPReluPerChannel = fusingSpecificParams{std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ - auto shape = inpNode->get_output_partial_shape(0); - if (shape.size() == 1) - IE_THROW() << "If shape.size() == 1 then Granularity can be PerTensor only"; - ngraph::Shape newShape(shape.size(), 1); - newShape[1] = shape[1].get_length(); + ngraph::Shape newShape = generatePerChannelShape(inpNode); auto data = NGraphFunctions::Utils::generateVector(ngraph::shape_size(newShape)); return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::LeakyRelu, newShape, data); }, "PRelu(PerChannel)"}}), {"PRelu"}}; @@ -166,11 +181,7 @@ const auto fusingReluAdd = fusingSpecificParams{std::make_shared(s return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::Relu); }, "Relu"}, {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ - auto shape = inpNode->get_output_partial_shape(0); - if (shape.size() == 1) - IE_THROW() << "If shape.size() == 1 then Granularity can be PerTensor only"; - ngraph::Shape newShape(shape.size(), 1); - newShape[1] = shape[1].get_length(); + ngraph::Shape newShape = generatePerChannelShape(inpNode); auto constNode = ngraph::builder::makeConstant(ngPrc, newShape, std::vector{}, true); return std::make_shared(inpNode, constNode); }, "Add(PerChannel)"}}), {"Relu", "Add"}}; @@ -180,40 +191,24 @@ const auto fusingReluScaleShift = fusingSpecificParams{std::make_shared inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ - auto shape = inpNode->get_output_partial_shape(0); - if (shape.size() == 1) - IE_THROW() << "If shape.size() == 1 then Granularity can be PerTensor only"; - ngraph::Shape newShape(shape.size(), 1); - newShape[1] = shape[1].get_length(); - auto constNode = ngraph::builder::makeConstant(ngPrc, newShape, std::vector{}, true); - return std::make_shared(inpNode, constNode); + ngraph::Shape newShape = generatePerChannelShape(inpNode); + auto constNode = ngraph::builder::makeConstant(ngPrc, newShape, std::vector{}, true); + return std::make_shared(inpNode, constNode); }, "Multiply(PerChannel)"}, {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ - auto shape = inpNode->get_output_partial_shape(0); - if (shape.size() == 1) - IE_THROW() << "If shape.size() == 1 then Granularity can be PerTensor only"; - ngraph::Shape newShape(shape.size(), 1); - newShape[1] = shape[1].get_length(); + ngraph::Shape newShape = generatePerChannelShape(inpNode); auto constNode = ngraph::builder::makeConstant(ngPrc, newShape, std::vector{}, true); return std::make_shared(inpNode, constNode); }, 
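/* Reviewer illustration, not part of the patch: generatePerChannelShape() picks the
 * channel axis via getFusingAxis() above (last dimension for MatMul, second dimension
 * otherwise) and returns a shape of ones with only that axis populated. With assumed
 * output shapes:
 *   MatMul output {2, 32, 60}   -> {1, 1, 60}
 *   Conv   output {1, 16, 7, 7} -> {1, 16, 1, 1}
 */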
"Add(PerChannel)"}}), {"Relu", "Add"}}; const auto fusingScaleShift = fusingSpecificParams{ std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params) { - auto shape = inpNode->get_output_partial_shape(0); - if (shape.size() == 1) - IE_THROW() << "If shape.size() == 1 then Granularity can be PerTensor only"; - ngraph::Shape newShape(shape.size(), 1); - newShape[1] = shape[1].get_length(); - auto constNode = ngraph::builder::makeConstant(ngPrc, newShape, std::vector{}, true); - return std::make_shared(inpNode, constNode); + ngraph::Shape newShape = generatePerChannelShape(inpNode); + auto constNode = ngraph::builder::makeConstant(ngPrc, newShape, std::vector{}, true); + return std::make_shared(inpNode, constNode); }, "Multiply(PerChannel)"}, {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params) { - auto shape = inpNode->get_output_partial_shape(0); - if (shape.size() == 1) - IE_THROW() << "If shape.size() == 1 then Granularity can be PerTensor only"; - ngraph::Shape newShape(shape.size(), 1); - newShape[1] = shape[1].get_length(); + ngraph::Shape newShape = generatePerChannelShape(inpNode); auto constNode = ngraph::builder::makeConstant(ngPrc, newShape, std::vector{}, true); return std::make_shared(inpNode, constNode); }, "Add(PerChannel)"}}), {"Add"} }; @@ -228,22 +223,14 @@ const auto fusingFakeQuantizePerTensor = fusingSpecificParams{ std::make_shared< const auto fusingFakeQuantizePerChannel = fusingSpecificParams{std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ auto localPrc = inpNode->get_element_type(); - auto shape = inpNode->get_output_partial_shape(0); - if (shape.size() == 1) - IE_THROW() << "If shape.size() == 1 then Granularity can be PerTensor only"; - ngraph::Shape newShape(shape.size(), 1); - newShape[1] = shape[1].get_length(); + ngraph::Shape newShape = generatePerChannelShape(inpNode); return ngraph::builder::makeFakeQuantize(inpNode, localPrc, 256, newShape); }, "FakeQuantize(PerChannel)"}}), {"FakeQuantize"}}; const auto fusingFakeQuantizePerChannelRelu = fusingSpecificParams{std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ auto localPrc = inpNode->get_element_type(); - auto shape = inpNode->get_output_partial_shape(0); - if (shape.size() == 1) - IE_THROW() << "If shape.size() == 1 then Granularity can be PerTensor only"; - ngraph::Shape newShape(shape.size(), 1); - newShape[1] = shape[1].get_length(); + ngraph::Shape newShape = generatePerChannelShape(inpNode); return ngraph::builder::makeFakeQuantize(inpNode, localPrc, 256, newShape); }, "FakeQuantize(PerChannel)"}, {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ @@ -291,60 +278,56 @@ const auto fusingSumEluFQ = fusingSpecificParams{std::make_shared( const auto fusingMultiplyPerTensor = fusingSpecificParams{std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ ngraph::Shape secondMultInShape(1, 1); - auto secondMultInput = ngraph::builder::makeConstant(ngPrc, ngraph::Shape(secondMultInShape), std::vector{}, true); + auto secondMultInput = ngraph::builder::makeConstant(ngPrc, secondMultInShape, std::vector{}, true); return std::make_shared(inpNode, secondMultInput); }, "Multiply(PerTensor)"}}), {"Multiply"}}; const auto 
fusingMultiplyPerChannel = fusingSpecificParams{std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ - ngraph::Shape secondMultInShape(inpNode->get_output_partial_shape(0).size(), 1); - secondMultInShape[1] = inpNode->get_output_partial_shape(0)[1].get_length(); - auto secondMultInput = ngraph::builder::makeConstant(ngPrc, ngraph::Shape(secondMultInShape), std::vector{}, true); + ngraph::Shape secondMultInShape = generatePerChannelShape(inpNode); + auto secondMultInput = ngraph::builder::makeConstant(ngPrc, secondMultInShape, std::vector{}, true); return std::make_shared(inpNode, secondMultInput); }, "Multiply(PerChannel)"}}), {"Multiply"}}; const auto fusingAddPerTensor = fusingSpecificParams{std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ ngraph::Shape secondMultInShape(1, 1); - auto secondMultInput = ngraph::builder::makeConstant(ngPrc, ngraph::Shape(secondMultInShape), std::vector{}, true); + auto secondMultInput = ngraph::builder::makeConstant(ngPrc, secondMultInShape, std::vector{}, true); return std::make_shared(inpNode, secondMultInput); }, "Add(PerTensor)"}}), {"Add"}}; const auto fusingAddPerChannel = fusingSpecificParams{std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ - ngraph::Shape secondMultInShape(inpNode->get_output_partial_shape(0).size(), 1); - secondMultInShape[1] = inpNode->get_output_partial_shape(0)[1].get_length(); - auto secondMultInput = ngraph::builder::makeConstant(ngPrc, ngraph::Shape(secondMultInShape), std::vector{}, true); + ngraph::Shape secondMultInShape = generatePerChannelShape(inpNode); + auto secondMultInput = ngraph::builder::makeConstant(ngPrc, secondMultInShape, std::vector{}, true); return std::make_shared(inpNode, secondMultInput); }, "Add(PerChannel)"}}), {"Add"}}; const auto fusingSubtractPerTensor = fusingSpecificParams{std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ ngraph::Shape secondMultInShape(1, 1); - auto secondMultInput = ngraph::builder::makeConstant(ngPrc, ngraph::Shape(secondMultInShape), std::vector{}, true); + auto secondMultInput = ngraph::builder::makeConstant(ngPrc, secondMultInShape, std::vector{}, true); return std::make_shared(inpNode, secondMultInput); }, "Subtract(PerTensor)"}}), {"Subtract"}}; const auto fusingSubtractPerChannel = fusingSpecificParams{std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ - ngraph::Shape secondMultInShape(inpNode->get_output_partial_shape(0).size(), 1); - secondMultInShape[1] = inpNode->get_output_partial_shape(0)[1].get_length(); - auto secondMultInput = ngraph::builder::makeConstant(ngPrc, ngraph::Shape(secondMultInShape), std::vector{}, true); + ngraph::Shape secondMultInShape = generatePerChannelShape(inpNode); + auto secondMultInput = ngraph::builder::makeConstant(ngPrc, secondMultInShape, std::vector{}, true); return std::make_shared(inpNode, secondMultInput); }, "Subtract(PerChannel)"}}), {"Subtract"}}; const auto fusingDividePerTensor = fusingSpecificParams{std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ ngraph::Shape secondMultInShape(1, 1); - auto secondMultInput = ngraph::builder::makeConstant(ngPrc, 
ngraph::Shape(secondMultInShape), std::vector{}, true); + auto secondMultInput = ngraph::builder::makeConstant(ngPrc, secondMultInShape, std::vector{}, true); return std::make_shared(inpNode, secondMultInput); }, "Divide(PerTensor)"}}), {"Divide"}}; const auto fusingDividePerChannel = fusingSpecificParams{std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ - ngraph::Shape secondMultInShape(inpNode->get_output_partial_shape(0).size(), 1); - secondMultInShape[1] = inpNode->get_output_partial_shape(0)[1].get_length(); - auto secondMultInput = ngraph::builder::makeConstant(ngPrc, ngraph::Shape(secondMultInShape), std::vector{}, true); + ngraph::Shape secondMultInShape = generatePerChannelShape(inpNode); + auto secondMultInput = ngraph::builder::makeConstant(ngPrc, secondMultInShape, std::vector{}, true); return std::make_shared(inpNode, secondMultInput); }, "Divide(PerChannel)"}}), {"Divide"}}; diff --git a/inference-engine/tests/functional/shared_test_classes/src/single_layer/mat_mul.cpp b/inference-engine/tests/functional/shared_test_classes/src/single_layer/mat_mul.cpp index c75c8995205..e3a54df0812 100644 --- a/inference-engine/tests/functional/shared_test_classes/src/single_layer/mat_mul.cpp +++ b/inference-engine/tests/functional/shared_test_classes/src/single_layer/mat_mul.cpp @@ -44,7 +44,7 @@ std::string MatMulTest::getTestCaseName(const testing::TestParamInfo #include #include -#include #include #include #include @@ -171,7 +170,6 @@ TEST(TransformationTests, ConvertMatMulToFCTest7) { ngraph::pass::Manager m; m.register_pass(); m.register_pass(); - m.register_pass(); m.run_passes(f); ASSERT_NO_THROW(check_rt_info(f)); } @@ -179,12 +177,9 @@ TEST(TransformationTests, ConvertMatMulToFCTest7) { { auto input1 = std::make_shared(ngraph::element::f32, ngraph::Shape{3, 2, 2}); auto input2 = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{3, 2}, {1}); - auto reshape_begin = std::make_shared( - input1, ngraph::opset1::Constant::create(ngraph::element::i64, ngraph::Shape{2}, std::vector{-1, 2}), false); - auto fc = std::make_shared(reshape_begin, input2, ngraph::Rank(2)); - auto reshape_end = ngraph::op::util::reshapeTo(fc, ngraph::Shape{3, 2, 3}); + auto fc = std::make_shared(input1, input2, ngraph::Rank(2)); - f_ref = std::make_shared(ngraph::NodeVector{reshape_end}, ngraph::ParameterVector{input1}); + f_ref = std::make_shared(ngraph::NodeVector{fc}, ngraph::ParameterVector{input1}); } auto res = compare_functions(f, f_ref, true); @@ -202,7 +197,6 @@ TEST(TransformationTests, ConvertMatMulToFCTest8) { ngraph::pass::Manager m; m.register_pass(); m.register_pass(); - m.register_pass(); m.run_passes(f); ASSERT_NO_THROW(check_rt_info(f)); } @@ -211,18 +205,14 @@ TEST(TransformationTests, ConvertMatMulToFCTest8) { auto input1 = std::make_shared(ngraph::element::f32, ngraph::PartialShape{-1, -1, 2}); auto input2 = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{3, 2}, {1}); - auto reshape_begin = std::make_shared( - input1, ngraph::opset1::Constant::create(ngraph::element::i64, {2}, {-1, 2}), false); - - auto fc = std::make_shared(reshape_begin, input2, ngraph::Rank(2)); + auto fc = std::make_shared(input1, input2, ngraph::Rank(2)); auto a_shape = std::make_shared(input1); auto I = ngraph::op::util::node_to_get_shape_value_of_indices_from_shape_node(a_shape, {0, 1}); auto O = ngraph::opset1::Constant::create(ngraph::element::i64, { 1 }, { 3 }); auto output_shape = 
std::make_shared(ngraph::OutputVector{I, O}, 0); - auto reshape_end = std::make_shared(fc, output_shape, false); - f_ref = std::make_shared(ngraph::NodeVector{reshape_end}, ngraph::ParameterVector{input1}); + f_ref = std::make_shared(ngraph::NodeVector{fc}, ngraph::ParameterVector{input1}); } auto res = compare_functions(f, f_ref, true); @@ -268,7 +258,6 @@ TEST(TransformationTests, ConvertMatMulToFCTest10) { ngraph::pass::Manager m; m.register_pass(); m.register_pass(); - m.register_pass(); ASSERT_NO_THROW(m.run_passes(f)); } @@ -439,25 +428,22 @@ TEST(TransformationTests, ConvertMatMulToFCTest_second_input_rank_adj_1) { std::shared_ptr f(nullptr), f_ref(nullptr); { auto input1 = std::make_shared(ngraph::element::f32, ngraph::Shape{5, 2, 3}); - auto input2 = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{1, 1, 2, 3}, {1}); + auto input2 = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{1, 2, 3}, {1}); auto matmul = std::make_shared(input1, input2, false, true); f = std::make_shared(ngraph::NodeVector{matmul}, ngraph::ParameterVector{input1}); ngraph::pass::Manager m; m.register_pass(); m.register_pass(); - m.register_pass(); m.run_passes(f); ASSERT_NO_THROW(check_rt_info(f)); } { auto input1 = std::make_shared(ngraph::element::f32, ngraph::Shape{5, 2, 3}); - auto reshape_1 = std::make_shared(input1, ngraph::opset1::Constant::create(ngraph::element::i64, {2}, {-1, 3}), false); auto input2 = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{2, 3}, {1}); - auto matmul = std::make_shared(reshape_1, input2, ngraph::Rank(2)); - auto reshape_out = std::make_shared(matmul, ngraph::opset1::Constant::create(ngraph::element::i64, {4}, {1, 5, 2, 2}), false); - f_ref = std::make_shared(ngraph::NodeVector{reshape_out}, ngraph::ParameterVector{input1}); + auto matmul = std::make_shared(input1, input2, ngraph::Rank(2)); + f_ref = std::make_shared(ngraph::NodeVector{matmul}, ngraph::ParameterVector{input1}); } auto res = compare_functions(f, f_ref, true); @@ -475,7 +461,6 @@ TEST(TransformationTests, ConvertMatMulToFCTest_second_input_rank_adj_2) { ngraph::pass::Manager m; m.register_pass(); m.register_pass(); - m.register_pass(); m.run_passes(f); ASSERT_NO_THROW(check_rt_info(f)); } @@ -495,9 +480,9 @@ TEST(TransformationTests, ConvertMatMulToFCTest_second_input_rank_adj_3) { std::shared_ptr f(nullptr), f_ref(nullptr); { auto input1 = std::make_shared(ngraph::element::f32, ngraph::Shape{ 5, 2, 3 }); - auto weights = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{ 1, 1, 2, 3 }, { 1 }); + auto weights = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{ 1, 2, 3 }, { 1 }); auto matmul = std::make_shared(input1, weights, false, true); - auto biases = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{ 1, 1, 1, 2 }, { 1 }); + auto biases = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{ 1, 1, 2 }, { 1 }); auto add = std::make_shared(matmul, biases); f = std::make_shared(ngraph::NodeVector{ add }, ngraph::ParameterVector{ input1 }); @@ -505,7 +490,6 @@ TEST(TransformationTests, ConvertMatMulToFCTest_second_input_rank_adj_3) { m.register_pass(); m.register_pass(); m.register_pass(); - m.register_pass(); m.run_passes(f); ASSERT_NO_THROW(check_rt_info(f)); } @@ -513,53 +497,13 @@ TEST(TransformationTests, ConvertMatMulToFCTest_second_input_rank_adj_3) { { auto input1 = std::make_shared(ngraph::element::f32, ngraph::Shape{ 5, 2, 3 }); auto reshape_before_const = 
ngraph::opset1::Constant::create(ngraph::element::i64, { 2 }, { -1, 3 }); - auto reshape_1 = std::make_shared(input1, reshape_before_const, false); auto weights = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{ 2, 3 }, { 1 }); auto biases = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{ 2 }, { 1 }); - auto matmul = std::make_shared(reshape_1, weights, biases, ngraph::Rank(2)); + auto matmul = std::make_shared(input1, weights, biases, ngraph::Rank(2)); auto reshape_after_const = ngraph::opset1::Constant::create(ngraph::element::i64, { 4 }, { 1, 5, 2, 2 }); - auto reshape_out = std::make_shared(matmul, reshape_after_const, false); - f_ref = std::make_shared(ngraph::NodeVector{ reshape_out }, ngraph::ParameterVector{ input1 }); - } - - auto res = compare_functions(f, f_ref, true); - ASSERT_TRUE(res.first) << res.second; -} - -TEST(TransformationTests, ConvertMatMulToFCTest_second_input_rank_adj_dynamic) { - std::shared_ptr f(nullptr), f_ref(nullptr); - { - auto input1 = std::make_shared(ngraph::element::f32, ngraph::PartialShape{-1, 2, 3}); - auto input2 = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{1, 1, 2, 3}, {1}); - auto matmul = std::make_shared(input1, input2, false, true); - - f = std::make_shared(ngraph::NodeVector{matmul}, ngraph::ParameterVector{input1}); - ngraph::pass::Manager m; - m.register_pass(); - m.register_pass(); - m.register_pass(); - m.run_passes(f); - ASSERT_NO_THROW(check_rt_info(f)); - } - - { - auto input1 = std::make_shared(ngraph::element::f32, ngraph::PartialShape{-1, 2, 3}); - auto reshape_1 = std::make_shared(input1, ngraph::opset1::Constant::create(ngraph::element::i64, {2}, {-1, 3}), false); - auto input2 = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{2, 3}, {1}); - auto matmul = std::make_shared(reshape_1, input2, ngraph::Rank(2)); - - auto shape_of = std::make_shared(input1); - auto gather = std::make_shared( - shape_of, ngraph::opset1::Constant::create(ngraph::element::i64, {2}, {0, 1}), ngraph::opset1::Constant::create(ngraph::element::i64, {}, {0})); - auto concat = std::make_shared(ngraph::OutputVector{ - ngraph::opset1::Constant::create(ngraph::element::i64, {1}, {1}), - gather, - ngraph::opset1::Constant::create(ngraph::element::i64, {1}, {2}), - }, 0); - auto reshape_out = std::make_shared(matmul, concat, false); - f_ref = std::make_shared(ngraph::NodeVector{reshape_out}, ngraph::ParameterVector{input1}); + f_ref = std::make_shared(ngraph::NodeVector{ matmul }, ngraph::ParameterVector{ input1 }); } auto res = compare_functions(f, f_ref, true);
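/* Reviewer illustration, not part of the patch: with the explicit channelAxis argument,
 * the per-channel branch of isPerTensorOrPerChannelBroadcastable() compares the second
 * input against the first one along the chosen channel axis and requires every other
 * dimension to be 1. A minimal usage sketch with assumed dims and the channel axis set
 * to the last dimension, as MatMul now does:
 *   isPerTensorOrPerChannelBroadcastable({2, 32, 60}, {1, 1, 60}, 2);  // true  - per channel
 *   isPerTensorOrPerChannelBroadcastable({2, 32, 60}, {1, 32, 1}, 2);  // false - broadcast over a non-channel dim
 */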