[CPU] Enable matmul deconv bin postops (#8009)

parent 2b9c4a7f42
commit 3f6a026ae9
@@ -59,7 +59,7 @@ MKLDNNGraphOptimizer::MKLDNNGraphOptimizer() {}

void MKLDNNGraphOptimizer::ApplyCommonGraphOptimizations(MKLDNNGraph &graph) {
OV_ITT_SCOPE_CHAIN(FIRST_INFERENCE, taskChain, itt::domains::MKLDNN_LT, "ApplyCommonGraphOptimizations", "FuseConvolutionAndBias");
FuseConvolutionAndBias(graph);
FuseConvolutionMatMulAndBias(graph);
graph.RemoveDroppedNodes();

OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseMultiplyAndAdd");
@@ -166,37 +166,38 @@ void MKLDNNGraphOptimizer::ApplyImplSpecificGraphOptimizations(MKLDNNGraph &grap
graph.RemoveDroppedEdges();
}

void MKLDNNGraphOptimizer::FuseConvolutionAndBias(MKLDNNGraph &graph) {
void MKLDNNGraphOptimizer::FuseConvolutionMatMulAndBias(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();

auto isSuitableParentNode = [](MKLDNNNodePtr node) {
return node->getType() == Convolution &&
auto isSuitableParentNode = [](const MKLDNNNodePtr& node) {
return (node->getType() == Convolution || node->getType() == MatMul) &&
node->getChildEdges().size() == 1 &&
node->getParentEdges().size() == 2 &&
node->getFusedWith().empty();
};

auto isSuitableChildNode = [&](MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) {
auto isSuitableChildNode = [&](const MKLDNNNodePtr& parentNode, const MKLDNNNodePtr& childNode) {
if (childNode->getAlgorithm() != EltwiseAdd || !childNode->getFusedWith().empty() || childNode->getParentEdges().size() != 2)
return false;

auto biasNode = childNode->getParentEdgesAtPort(1)[0]->getParent();
const auto biasNode = childNode->getParentEdgesAtPort(1)[0]->getParent();
if (biasNode->getType() != Input || !biasNode->isConstant() || biasNode->getChildEdges().size() != 1)
return false;

auto convOutDims = parentNode->getOutputShapeAtPort(0).getDims();
auto biasDims = getNormalizedDimsBySize(biasNode->getOutputShapeAtPort(0).getDims(),
convOutDims.size());
const auto parentOutDims = parentNode->getOutputShapeAtPort(0).getDims();
const auto biasDims = getNormalizedDimsBySize(biasNode->getOutputShapeAtPort(0).getDims(),
parentOutDims.size());
// TODO [NM]: Legacy ConvBias fusion transformation supports both per-tensor (via explicit broadcasting) and per-channel cases.
// Most of the real models contain per-channel bias, so we need to reevaluate the need to support per-tensor variant.
if (convOutDims.size() != biasDims.size() || biasDims.size() < 2)
if (parentOutDims.size() != biasDims.size() || biasDims.size() < 2)
return false;

if (biasDims[0] != 1 || !dimsEqualStrong(biasDims[1], convOutDims[1]))
const auto channelAxis = parentNode->getFusingAxis();
if (!dimsEqualStrong(biasDims[channelAxis], parentOutDims[channelAxis]))
return false;

for (int i = 2; i < biasDims.size(); i++) {
if (biasDims[i] != 1)
for (int i = 0; i < biasDims.size(); i++) {
if (biasDims[i] != 1 && i != channelAxis)
return false;
}

@@ -262,13 +263,13 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndBias(MKLDNNGraph &graph) {
graph.RemoveEdge(remEdge);
}

auto parentEltwise = parentNode;
const auto& parentEltwise = parentNode;
MKLDNNEdgePtr newEdge(new MKLDNNEdge(parent, parentEltwise, inNum, parentEltwise->getParentEdges().size()));
auto &graphEdges = graph.GetEdges();
auto& graphEdges = graph.GetEdges();
graphEdges.push_back(newEdge);
parent->addEdge(newEdge);

auto partialShape = { parentEltwise->outputShapes[0].toPartialShape()[1] };
auto partialShape = { parentEltwise->outputShapes[0].toPartialShape()[parentEltwise->getFusingAxis()] };
parent->outputShapes[inNum] = Shape(partialShape);
parentEltwise->inputShapes.push_back(parent->outputShapes[0]);
}
@@ -627,7 +628,15 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndZeroPoints(MKLDNNGraph &graph) {
}
}

static bool BF16QuantizeNodeFusing(MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) {
/**
 * @todo FQ fusing was disabled for BF16 output since oneDNN primitives lack support
 * for bf16 depthwise postops.
 * This is not the case anymore, because after migration to oneDNN 2.3 FQ will be fused as
 * multiple binary post ops.
 * This check can already be removed for FC fusing, but should be kept for Convolution,
 * which still uses legacy depthwise postops for performance reasons.
 */
static bool BF16QuantizeNodeFusing(const MKLDNNNodePtr& parentNode, const MKLDNNNodePtr& childNode) {
return childNode->getType() == FakeQuantize &&
one_of(Precision::BF16,
parentNode->getOriginalOutputPrecisionAtPort(0),
@@ -638,7 +647,7 @@ void MKLDNNGraphOptimizer::FuseFullyConnectedAndSimpleOperation(MKLDNNGraph &gra
auto& graphNodes = graph.GetNodes();

auto isSuitableParentNode = [](MKLDNNNodePtr node) {
return node->getType() == FullyConnected && node->getChildEdges().size() == 1 && node->getInputShapeAtPort(0).getRank() != 3;
return node->getType() == FullyConnected && node->getChildEdges().size() == 1;
};

auto parent = graphNodes.begin();
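Editor's note: the generalized bias check introduced above (Convolution or MatMul producer, channel taken from getFusingAxis()) reduces to a simple shape predicate. The sketch below is illustrative only, written with plain std:: types; it assumes the bias rank has already been normalized to the producer's output rank and ignores dynamic dimensions (dimsEqualStrong):

#include <cstddef>
#include <vector>

// A constant bias is fusable when every dimension is 1 except the fusing (channel) axis,
// which must match the producer's output size at that axis.
bool isFusableBias(const std::vector<std::size_t>& parentOutDims,
                   const std::vector<std::size_t>& biasDims,
                   std::size_t channelAxis) {
    if (parentOutDims.size() != biasDims.size() || biasDims.size() < 2)
        return false;
    if (biasDims[channelAxis] != parentOutDims[channelAxis])
        return false;
    for (std::size_t i = 0; i < biasDims.size(); i++) {
        if (i != channelAxis && biasDims[i] != 1)
            return false;
    }
    return true;
}

For an NCHW Convolution output the fusing axis is 1, so a fusable bias looks like {1, C, 1, 1}; the loop now runs over all axes (instead of starting at 2) precisely so that other fusing axes, such as the channel axis of a MatMul output, are handled by the same code.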
@@ -19,7 +19,7 @@ public:
void ApplyImplSpecificGraphOptimizations(MKLDNNGraph& graph);

private:
void FuseConvolutionAndBias(MKLDNNGraph &graph);
void FuseConvolutionMatMulAndBias(MKLDNNGraph &graph);
void FuseDeconvolutionAndSimpleOperation(MKLDNNGraph &graph);
void FuseMultiplyAndAdd(MKLDNNGraph &graph);
void FuseFullyConnectedAndSimpleOperation(MKLDNNGraph &graph);
@@ -4,6 +4,7 @@

#include "mkldnn_node.h"
#include "dnnl_debug.h"
#include "mkldnn_edge.h"
#include "mkldnn_extension_mngr.h"
#include "mkldnn_itt.h"

@@ -1048,6 +1049,16 @@ void MKLDNNNode::setDynamicBatchLim(int lim) {
}
}

void MKLDNNNode::appendPostOpArgs(const mkldnn::primitive_attr& attr) {
auto post_ops = attr.get_post_ops();
int idx = 0;
for (int i = 0; i < post_ops.len(); i++) {
if (post_ops.kind(i) == mkldnn::primitive::kind::binary) {
primArgs.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1, binaryPostOpsArgs[idx++]->GetPrimitive()});
}
}
}

bool MKLDNNNode::isFusedWith(Type fusedNodeType) const {
for (auto fusedNode : fusedWith) {
if (fusedNode->type == fusedNodeType)
@@ -1078,10 +1089,14 @@ Layout MKLDNNNode::getWeightsLayoutByDims(SizeVector dims, bool isGrouped) {
}
}

void MKLDNNNode::appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, int align, bool initAsBinary, bool initBinaryMemory) {
void MKLDNNNode::appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, int align) {
IE_THROW() << "Fusing of " << this->getType() << " operation is not implemented";
}

void MKLDNNNode::appendBinPostOps(mkldnn::post_ops& ops, const std::vector<size_t>& binaryShape, std::vector<MKLDNNMemoryPtr>& binaryPostOpsMem) {
IE_THROW() << "Binary fusing of " << this->getType() << " operation is not implemented";
}

std::vector<InferenceEngine::Precision> MKLDNNNode::getInputPrecisions() const {
std::vector<InferenceEngine::Precision> inputPrecisions;
for (size_t i = 0; i < getParentEdges().size(); i++) {
@@ -1205,6 +1220,9 @@ MKLDNNNode* MKLDNNNode::NodesFactory::create(const std::shared_ptr<ngraph::Node>

bool MKLDNNNode::canBePerformedAsScaleShift(const MKLDNNNode *parentNode) const {
size_t fusingPort = 0;
// @todo graph optimizer can provide parentNode as nullptr. Should be avoided
const size_t channelAxis = parentNode ? parentNode->getFusingAxis() : MKLDNNNode::getFusingAxis();

for (size_t i = (parentNode == nullptr ? 1 : 0); i < getParentEdges().size(); i++) {
MKLDNNNode *node = getParentEdgesAtPort(i)[0]->getParent().get();
if (node == nullptr) {
@@ -1225,7 +1243,8 @@ bool MKLDNNNode::canBePerformedAsScaleShift(const MKLDNNNode *parentNode) const
if (i == fusingPort)
continue;
auto& weightShape = getInputShapeAtPort(i).getDims();
if (getParentEdgesAtPort(i)[0]->getParent()->getChildEdges().size() != 1 || !isPerTensorOrPerChannelBroadcastable(dataShape, weightShape, true))
if (getParentEdgesAtPort(i)[0]->getParent()->getChildEdges().size() != 1 ||
!isPerTensorOrPerChannelBroadcastable(dataShape, weightShape, channelAxis, true))
return false;
}
return true;
@@ -1246,6 +1265,9 @@ bool MKLDNNNode::canBePerformedAsScaleShift(const MKLDNNNode *parentNode) const
|| isConvertablePowerStatic();
}

// @todo shifts for Subtract and scales for Divide are replaced with
// Add (with opposite sign) and Multiply (with inverse value) for legacy depthwise post ops
// This can be avoided after depthwise post ops are gone
std::pair<std::vector<float>, std::vector<float>> MKLDNNNode::getScalesAndShifts(const MKLDNNNode *parentNode) const {
std::vector<float> scales, shifts;

@@ -1408,10 +1430,11 @@ bool MKLDNNNode::canFuseSimpleOperation(const MKLDNNNodePtr& node) const {
}
return ret;
} else if (node->getType() == Eltwise) {
return one_of(node->getAlgorithm(), EltwiseRelu, EltwiseGelu, EltwiseElu, EltwiseSigmoid, EltwiseClamp, EltwiseTanh,
EltwiseSwish, EltwiseHswish, EltwiseMish, EltwiseHsigmoid, EltwiseRoundHalfToEven,
EltwiseRoundHalfAwayFromZero, EltwiseAbs, EltwiseSqrt, EltwiseSoftRelu) ||
node->canBePerformedAsScaleShift(this);
return one_of(node->getAlgorithm(),
EltwiseRelu, EltwiseGelu, EltwiseElu, EltwiseSigmoid, EltwiseClamp, EltwiseTanh,
EltwiseSwish, EltwiseHswish, EltwiseMish, EltwiseHsigmoid, EltwiseRoundHalfToEven,
EltwiseRoundHalfAwayFromZero, EltwiseAbs, EltwiseSqrt, EltwiseSoftRelu) ||
node->canBePerformedAsScaleShift(this);
}
return false;
}
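Editor's note: appendPostOpArgs() added above is the common helper that feeds one extra SRC_1 tensor per binary post op into the primitive's argument map. A standalone sketch of the same wiring with raw oneDNN types (the binaryMemories vector is a hypothetical stand-in for the node's binaryPostOpsArgs):

#include <oneapi/dnnl/dnnl.hpp>
#include <unordered_map>
#include <vector>

void appendBinaryArgs(const dnnl::primitive_attr& attr,
                      const std::vector<dnnl::memory>& binaryMemories,
                      std::unordered_map<int, dnnl::memory>& args) {
    const auto post_ops = attr.get_post_ops();
    int idx = 0;
    for (int i = 0; i < post_ops.len(); i++) {
        // each binary post op is keyed by its position in the post-op chain
        if (post_ops.kind(i) == dnnl::primitive::kind::binary)
            args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1, binaryMemories[idx++]});
    }
}

Convolution, Deconvolution and FullyConnected all call the member version of this helper after filling primArgs, instead of repeating the loop locally.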
@@ -204,6 +204,12 @@ public:

bool isConstant();

virtual size_t getFusingAxis() const {
return 1;
}

void appendPostOpArgs(const mkldnn::primitive_attr& attr);

bool isFusedWith(Type type) const;

void addFusedNode(const MKLDNNNodePtr &fusingNode) {
@@ -594,8 +600,10 @@ protected:
* Seed node should call this routine and pass its post operations list as parameter.
* @param ops List of fused post operations
*/
virtual void appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, int align = -1, bool initAsBinary = false, bool initBinaryMemory = false);
virtual AttrPtr initPrimitiveAttr() const { return nullptr; }
virtual void appendPostOps(mkldnn::post_ops& ops, const VectorDims& postOpDims, int align = -1);
virtual void appendBinPostOps(mkldnn::post_ops& ops, const VectorDims& postOpDims, std::vector<MKLDNNMemoryPtr>& binaryPostOpsMem);

virtual std::shared_ptr<mkldnn::primitive_attr> initPrimitiveAttr() { return nullptr; }

typedef std::function<DnnlMemoryDescPtr (mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx)>
GetPrimitiveMemoryFormatFunc;
@@ -636,7 +644,7 @@ protected:
std::vector<MKLDNNMemoryPtr> internalBlobMemory;
std::vector<NodeDesc> supportedPrimitiveDescriptors;
std::unordered_map<int, mkldnn::memory> primArgs;
std::vector<mkldnn::memory> binaryPostOpsArgs;
std::vector<MKLDNNMemoryPtr> binaryPostOpsArgs;
MKLDNNPrimitive prim;
std::vector<MKLDNNDescriptor> descs;
@@ -36,8 +36,9 @@ MKLDNNPlugin::ConvertMatMulToFC::ConvertMatMulToFC() {
auto rank_a = shape_a.rank().get_length();
auto rank_b = shape_b.rank().get_length();

// Transformation to FC is not supported for 1D second input
if (rank_b == 1) {
// Transformation to FC is not supported for 1D inputs
if (rank_a == 1 || rank_b == 1 ||
rank_a > 3 || rank_b > 3) {
return false;
}

@@ -47,7 +48,6 @@ MKLDNNPlugin::ConvertMatMulToFC::ConvertMatMulToFC() {
std::count_if(shape_b.begin(), shape_b.end(), [](ngraph::Dimension x) { return x != 1; }) > 2) {
return false;
}

/*
* get_aligned_shapes function align two input shapes to have the same size and
* the same batch dimensions (last two dimensions are not comparable).
@@ -7,7 +7,6 @@
#include "ngraph/op/fake_quantize.hpp"
#include "ngraph/pass/manager.hpp"
#include "reshape_fc_fusion.hpp"
#include "reshape_fully_connected.hpp"
#include "align_matmul_input_ranks.hpp"
#include "reshape_prelu.hpp"
#include "convert_broadcast_to_tiles.hpp"
@@ -29,7 +28,6 @@ inline void ConvertToCPUSpecificOpset(std::shared_ptr<ngraph::Function> &nGraphF
manager.register_pass<AlignMatMulInputRanks>();
manager.register_pass<ConvertTileToSeqTiles>();
manager.register_pass<FullyConnectedBiasFusion>();
manager.register_pass<ReshapeFullyConnected>();
manager.register_pass<ConvertToPowerStatic>();
manager.register_pass<ConvertToLeakyRelu>();
manager.register_pass<ReshapePRelu>();
@@ -1,114 +0,0 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "reshape_fully_connected.hpp"
#include "op/fully_connected.hpp"
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/opsets/opset7.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph/pattern/op/or.hpp>
#include <transformations/utils/utils.hpp>
#include <numeric>

NGRAPH_RTTI_DEFINITION(MKLDNNPlugin::ReshapeFullyConnected, "ReshapeFullyConnected", 0);

MKLDNNPlugin::ReshapeFullyConnected::ReshapeFullyConnected() {
ngraph::OutputVector twoInputs = {
ngraph::pattern::any_input(ngraph::pattern::has_static_rank()), ngraph::pattern::any_input(ngraph::pattern::has_static_shape())};
ngraph::OutputVector threeInputs = {
ngraph::pattern::any_input(ngraph::pattern::has_static_rank()), ngraph::pattern::any_input(ngraph::pattern::has_static_shape()),
ngraph::pattern::any_input()};
auto fcTwoInputs = ngraph::pattern::wrap_type<MKLDNNPlugin::FullyConnectedNode>(twoInputs, ngraph::pattern::has_static_rank());
auto fcThreeInputs = ngraph::pattern::wrap_type<MKLDNNPlugin::FullyConnectedNode>(threeInputs, ngraph::pattern::has_static_rank());
const auto fcTwoOrThreeInputs = std::make_shared<ngraph::pattern::op::Or>(ngraph::OutputVector{fcTwoInputs, fcThreeInputs});

ngraph::matcher_pass_callback callback = [this](ngraph::pattern::Matcher& m) {
auto fc = std::dynamic_pointer_cast<MKLDNNPlugin::FullyConnectedNode>(m.get_match_root());
if (!fc || transformation_callback(fc)) {
return false;
}

auto fc_input_shape = fc->get_input_partial_shape(0);
auto input_rank = fc_input_shape.rank().get_length();
auto output_shape = fc->get_output_partial_shape(0);

if (input_rank == 2 || input_rank == 0) {
return false;
}

ngraph::NodeVector new_ops;
int64_t K = *(fc->get_input_shape(1).rbegin()); // requested 2nd input with static shape in the matcher
auto reshape = std::make_shared<ngraph::opset1::Reshape>(
fc->input_value(0), ngraph::opset1::Constant::create(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{-1, K}), false);
if (reshape->get_output_partial_shape(0).rank().is_dynamic())
return false;
new_ops.push_back(reshape);

reshape->set_friendly_name(fc->get_friendly_name() + "/Reshape");

// Calculate output shape for new FullyConnected layer
// [I, K] * [O, K] = [I, O]
auto I = reshape->get_output_partial_shape(0)[0];
auto O = fc->get_input_partial_shape(1)[0];
ngraph::PartialShape output_shape_new{I, O};

std::shared_ptr<ngraph::Node> fc_new;
if (fc->get_input_size() == 2) {
fc_new = std::make_shared<MKLDNNPlugin::FullyConnectedNode>(reshape,
fc->input_value(1),
output_shape_new.rank(),
fc->get_output_type());
} else if (fc->get_input_size() == 3) {
fc_new = std::make_shared<MKLDNNPlugin::FullyConnectedNode>(reshape,
fc->input_value(1),
fc->input_value(2),
output_shape_new.rank(),
fc->get_output_type());
} else {
return false;
}
new_ops.push_back(fc_new);

if (output_shape != output_shape_new) {
auto I_idxs = std::vector<size_t>(input_rank - 1);
std::iota(I_idxs.begin(), I_idxs.end(), 0);
auto A_input_shape = ngraph::op::util::make_try_fold<ngraph::opset7::ShapeOf>(fc->input_value(0));
auto B_input_shape = ngraph::op::util::make_try_fold<ngraph::opset7::ShapeOf>(fc->input_value(1));
auto I_node = ngraph::op::util::node_to_get_shape_value_of_indices_from_shape_node(A_input_shape, {I_idxs});
auto O_node = ngraph::op::util::node_to_get_shape_value_of_indices_from_shape_node(B_input_shape, {0});
ngraph::OutputVector output_shape_dims{I_node, O_node};

const auto original_rank = fc->get_output_rank();
NGRAPH_CHECK(original_rank.is_static());
if (input_rank < original_rank.get_length()) {
const size_t const_shape_value = original_rank.get_length() - input_rank;
output_shape_dims.insert(
output_shape_dims.begin(), ngraph::opset1::Constant::create(I_node->get_element_type(), { const_shape_value }, { 1 }));
}

auto reshape_output_shape = ngraph::op::util::make_try_fold<ngraph::opset1::Concat>(output_shape_dims, 0);
auto reshape_output = std::make_shared<ngraph::opset1::Reshape>(fc_new, reshape_output_shape, false);
new_ops.push_back(A_input_shape);
new_ops.push_back(B_input_shape);
new_ops.push_back(I_node);
new_ops.push_back(O_node);
new_ops.push_back(reshape_output_shape);
new_ops.push_back(reshape_output);
reshape_output->set_friendly_name(fc->get_friendly_name());
fc_new->set_friendly_name(fc->get_friendly_name() + "/FC");
ngraph::copy_runtime_info(fc, new_ops);
ngraph::replace_node(fc, reshape_output);
} else {
fc_new->set_friendly_name(fc->get_friendly_name());
ngraph::copy_runtime_info(fc, new_ops);
ngraph::replace_node(fc, fc_new);
}

return true;
};

auto m = std::make_shared<ngraph::pattern::Matcher>(fcTwoOrThreeInputs, "ReshapeFullyConnected");
this->register_matcher(m, callback);
}
@@ -1,25 +0,0 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <ngraph/pass/graph_rewrite.hpp>

/*
 * Description:
 * ReshapeFullyConnected transformation detects FullyConnected operations
 * and for each operation where input shape is greater than 2 inserts Reshape
 * operations before and after FullyConnected operation. This transformation is
 * required because of IE restrictions.
 */

namespace MKLDNNPlugin {

class ReshapeFullyConnected: public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
ReshapeFullyConnected();
};

} // namespace MKLDNNPlugin
@@ -330,48 +330,42 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
}
}

void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, const VectorDims &dims, bool initWeights = false, bool initAsBinary = false) {
bool initBinaryMemory = initWeights;
void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, const VectorDims &dims, bool initWeights = false) {
mkldnn::post_ops ops;
bool useLegacyPostOps = true; // @todo remove after issue with performance of binary post ops fixed

auto getBinPostOpShape = [&](){
const auto outShape = getOutputShapeAtPort(0).getStaticDims();
const auto outShapeRank = getOutputShapeAtPort(0).getRank();
const auto chIdx = getFusingAxis();
std::vector<size_t> binaryShape(outShapeRank, 1);
binaryShape[chIdx] = outShape[chIdx];
return binaryShape;
};

for (auto &node : fusedWith) {
if (node->getType() == Split || node->getType() == Concatenation)
continue;

auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
if (eltwiseNode) {
if (auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get())) {
if (eltwiseNode->isSpecialConvolutionAddFusing()) {
ops.append_sum(1.0, MKLDNNExtensionUtils::IEPrecisionToDataType(eltwisePrecision));
} else {
constexpr int align = 16;
eltwiseNode->appendPostOps(ops, dims, align, initAsBinary, initBinaryMemory);
if (initBinaryMemory) {
if (eltwiseNode->scalesMemory)
binaryPostOpsArgs.push_back(eltwiseNode->scalesMemory->GetPrimitive());
if (eltwiseNode->shiftsMemory)
binaryPostOpsArgs.push_back(eltwiseNode->shiftsMemory->GetPrimitive());
if (useLegacyPostOps || eltwiseNode->getMKLDNNAlgorithm() != mkldnn::algorithm::undef) {
constexpr int align = 16;
eltwiseNode->appendPostOps(ops, dims, align);
} else {
eltwiseNode->appendBinPostOps(ops, getBinPostOpShape(), binaryPostOpsArgs);
}
}
continue;
}

auto* fakeQuantizeNode = dynamic_cast<MKLDNNFakeQuantizeNode *>(node.get());
if (fakeQuantizeNode) {
constexpr int align = -1;
fakeQuantizeNode->appendPostOps(ops, dims, align, initAsBinary, initBinaryMemory);
if (initBinaryMemory) {
if (fakeQuantizeNode->cropHighMemory)
binaryPostOpsArgs.push_back(fakeQuantizeNode->cropHighMemory->GetPrimitive());
if (fakeQuantizeNode->cropLowMemory)
binaryPostOpsArgs.push_back(fakeQuantizeNode->cropLowMemory->GetPrimitive());
if (fakeQuantizeNode->inputScaleMemory)
binaryPostOpsArgs.push_back(fakeQuantizeNode->inputScaleMemory->GetPrimitive());
if (fakeQuantizeNode->inputShiftMemory)
binaryPostOpsArgs.push_back(fakeQuantizeNode->inputShiftMemory->GetPrimitive());
if (fakeQuantizeNode->outputScaleMemory)
binaryPostOpsArgs.push_back(fakeQuantizeNode->outputScaleMemory->GetPrimitive());
if (fakeQuantizeNode->outputShiftMemory)
binaryPostOpsArgs.push_back(fakeQuantizeNode->outputShiftMemory->GetPrimitive());
if (auto* fakeQuantizeNode = dynamic_cast<MKLDNNFakeQuantizeNode *>(node.get())) {
if (useLegacyPostOps) {
fakeQuantizeNode->appendPostOps(ops, dims);
} else {
fakeQuantizeNode->appendBinPostOps(ops, getBinPostOpShape(), binaryPostOpsArgs);
}
continue;
}
@@ -416,7 +410,6 @@ void MKLDNNConvolutionNode::initSupportedPrimitiveDescriptors() {
// attr[1] - binary
mkldnn::primitive_attr attrs[1];
setPostOps(attrs[0], MemoryDescUtils::makeDummyShape(getOutputShapeAtPort(0)).getStaticDims());
// setPostOps(attrs[1], MemoryDescUtils::makeDummyShape(getOutputShapeAtPort(0)).getStaticDims(), false, true);

bool containJitImpl = false;

@@ -630,7 +623,6 @@ void MKLDNNConvolutionNode::initDescriptor(const NodeConfig& config) {
// attr[1] - binary
mkldnn::primitive_attr attrs[1];
setPostOps(attrs[0], MemoryDescUtils::makeDummyShape(getOutputShapeAtPort(0)).getStaticDims());
// setPostOps(attrs[1], false, true);

auto rightConfig = selectedPD->getConfig();
size_t selected_count = 0;
@@ -926,13 +918,8 @@ void MKLDNNConvolutionNode::prepareParams() {
auto initPrimitiveAttr = [&]() {
mkldnn::primitive_attr attr;
addZeroPoints(attr);
setPostOps(attr, outMemoryDesc->getShape().getStaticDims(), true);

// todo: [AV] delete "false" to use binary mechanism
if (false && getSelectedPrimitiveDescriptor()->getImplementationType() == jit_gemm) {
setPostOps(attr, outMemoryDesc->getShape().getStaticDims(), true, true);
} else {
setPostOps(attr, outMemoryDesc->getShape().getStaticDims(), true);
}
return std::make_shared<mkldnn::primitive_attr>(std::move(attr));
};

@@ -991,14 +978,8 @@ void MKLDNNConvolutionNode::prepareParams() {
if (withBiases) {
primArgs[DNNL_ARG_BIAS] = getBias();
}
// todo: [AV] uncomment to use binary mechanism
// auto post_ops = attr.get_post_ops();
// int idx = 0;
// for (int i = 0; i < post_ops.len(); i++) {
// if (post_ops.kind(i) == mkldnn::primitive::kind::binary) {
// primArgs.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1, binaryPostOpsArgs[idx++]});
// }
// }

appendPostOpArgs(*pAttrLocal);
}

void MKLDNNConvolutionNode::executeDynamicImpl(dnnl::stream strm) {
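Editor's note: getBinPostOpShape() above builds the per-channel broadcast shape that the binary post ops consume. A minimal sketch of the idea (plain std:: types, not the node API):

#include <cstddef>
#include <vector>

// All dimensions are 1 except the fusing (channel) axis, which keeps the output size,
// e.g. {1, C, 1, 1} for an NCHW convolution output.
std::vector<std::size_t> makeBinPostOpShape(const std::vector<std::size_t>& outDims,
                                            std::size_t channelAxis) {
    std::vector<std::size_t> binaryShape(outDims.size(), 1);
    binaryShape[channelAxis] = outDims[channelAxis];
    return binaryShape;
}

The same lambda is repeated in the Deconvolution change below; only FullyConnected differs, since its binary shape is built for the 2D oneDNN primitive.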
@@ -69,7 +69,7 @@ private:
void executeDynamicImpl(mkldnn::stream strm) override;

void addZeroPoints(mkldnn::primitive_attr& attr) const;
void setPostOps(mkldnn::primitive_attr &attr, const VectorDims &dims, bool initWeights, bool initAsBinary);
void setPostOps(mkldnn::primitive_attr &attr, const VectorDims &dims, bool initWeights);
void filterSupportedDescriptors();
bool isPossibleToSkipInitConfig(MKLDNNDescriptor &desc) const;
bool isNspcAvailable() const;
@@ -122,4 +122,3 @@ private:
};

} // namespace MKLDNNPlugin
@@ -157,9 +157,6 @@ bool MKLDNNDeconvolutionNode::canBeExecutedInInt8() const {
return false;
}

// todo: [antonvor] added these checks to fix performance problems
if (kernel.size() == 3)
return false;
if (!withGroups && stride.back() > 3)
return false;
if (!impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_common)) {
@@ -271,17 +268,25 @@
void MKLDNNDeconvolutionNode::setPostOps(mkldnn::primitive_attr &attr) {
mkldnn::post_ops ops;

auto getBinPostOpShape = [&](){
const auto outShape = getOutputShapeAtPort(0).getStaticDims();
const auto outShapeRank = getOutputShapeAtPort(0).getRank();
const auto chIdx = getFusingAxis();
std::vector<size_t> binaryShape(outShapeRank, 1);
binaryShape[chIdx] = outShape[chIdx];
return binaryShape;
};

for (auto &node : fusedWith) {
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
if (eltwiseNode) {
if (auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get())) {
// TODO [DS]: change to shape from memory
constexpr int align = 16;
// use legacy depthwise since backprop convolution does not support binary post ops
eltwiseNode->appendPostOps(ops, getOutputShapeAtPort(0).getStaticDims(), align);
continue;
}
auto* fakeQuantizeNode = dynamic_cast<MKLDNNFakeQuantizeNode *>(node.get());
if (fakeQuantizeNode) {
fakeQuantizeNode->appendPostOps(ops);
if (auto* fakeQuantizeNode = dynamic_cast<MKLDNNFakeQuantizeNode *>(node.get())) {
fakeQuantizeNode->appendBinPostOps(ops, getBinPostOpShape(), binaryPostOpsArgs);
continue;
}
IE_THROW() << "Fusing of " << NameFromType(node->getType()) << " operation to " << NameFromType(this->getType()) << " node is not implemented";
@@ -358,6 +363,8 @@ void MKLDNNDeconvolutionNode::createPrimitive() {
auto dst = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
primArgs = {{DNNL_ARG_DIFF_DST, src}, {DNNL_ARG_WEIGHTS, weights}, {DNNL_ARG_DIFF_SRC, dst}};
}

appendPostOpArgs(attr);
}

void MKLDNNDeconvolutionNode::createDescriptor(const std::vector<MemoryDescPtr> &inputDesc,
@ -7,6 +7,7 @@
|
||||
#include <ie_parallel.hpp>
|
||||
|
||||
#include <mkldnn_types.h>
|
||||
#include "cpu_types.h"
|
||||
#include "utils/bfloat16.hpp"
|
||||
#include <cpu/x64/injectors/jit_uni_quantization_injector.hpp>
|
||||
#include <cpu/ref_eltwise.hpp>
|
||||
@ -31,6 +32,7 @@
|
||||
#include "ngraph_transformations/op/leaky_relu.hpp"
|
||||
#include "ngraph_transformations/op/swish_cpu.hpp"
|
||||
|
||||
#include <oneapi/dnnl/dnnl.hpp>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
@ -791,18 +793,41 @@ private:
|
||||
}
|
||||
};
|
||||
|
||||
MKLDNNEltwiseNode::BroadcastingPolicy MKLDNNEltwiseNode::determineBroadcastingPolicy(const std::shared_ptr<ngraph::Node>& op) {
|
||||
const auto const1 = std::dynamic_pointer_cast<ngraph::opset1::Constant>(op->get_input_node_shared_ptr(0));
|
||||
const auto const2 = std::dynamic_pointer_cast<ngraph::opset1::Constant>(op->get_input_node_shared_ptr(1));
|
||||
int constPort = -1;
|
||||
if (const2) {
|
||||
constPort = 1;
|
||||
} else if (const1) {
|
||||
constPort = 0;
|
||||
} else {
|
||||
return Undefined;
|
||||
}
|
||||
|
||||
auto const_shape = op->get_input_shape(constPort);
|
||||
if (ngraph::shape_size(const_shape) == 1)
|
||||
return PerTensor;
|
||||
else
|
||||
return PerChannel;
|
||||
}
|
||||
|
||||
const std::map<const ngraph::DiscreteTypeInfo, MKLDNNEltwiseNode::Initializer> MKLDNNEltwiseNode::initializers = {
|
||||
{ngraph::op::v1::Add::get_type_info_static(), [](const std::shared_ptr<ngraph::Node>& op, MKLDNNEltwiseNode& node) {
|
||||
node.algorithm = EltwiseAdd;
|
||||
node.broadcastingPolicy = determineBroadcastingPolicy(op);
|
||||
}},
|
||||
{ngraph::op::v1::Subtract::get_type_info_static(), [](const std::shared_ptr<ngraph::Node>& op, MKLDNNEltwiseNode& node) {
|
||||
node.algorithm = EltwiseSubtract;
|
||||
node.broadcastingPolicy = determineBroadcastingPolicy(op);
|
||||
}},
|
||||
{ngraph::op::v1::Multiply::get_type_info_static(), [](const std::shared_ptr<ngraph::Node>& op, MKLDNNEltwiseNode& node) {
|
||||
node.algorithm = EltwiseMultiply;
|
||||
node.broadcastingPolicy = determineBroadcastingPolicy(op);
|
||||
}},
|
||||
{ngraph::op::v1::Divide::get_type_info_static(), [](const std::shared_ptr<ngraph::Node>& op, MKLDNNEltwiseNode& node) {
|
||||
node.algorithm = EltwiseDivide;
|
||||
node.broadcastingPolicy = determineBroadcastingPolicy(op);
|
||||
}},
|
||||
{ngraph::op::v0::SquaredDifference::get_type_info_static(), [](const std::shared_ptr<ngraph::Node>& op, MKLDNNEltwiseNode& node) {
|
||||
node.algorithm = EltwiseSquaredDifference;
|
||||
@ -828,6 +853,7 @@ const std::map<const ngraph::DiscreteTypeInfo, MKLDNNEltwiseNode::Initializer> M
|
||||
node.alpha = powerStatic->get_power();
|
||||
node.beta = powerStatic->get_scale();
|
||||
node.gamma = powerStatic->get_shift();
|
||||
node.broadcastingPolicy = PerTensor;
|
||||
}},
|
||||
{ngraph::op::v1::Equal::get_type_info_static(), [](const std::shared_ptr<ngraph::Node>& op, MKLDNNEltwiseNode& node) {
|
||||
node.algorithm = EltwiseEqual;
|
||||
@ -954,6 +980,7 @@ const std::map<const ngraph::DiscreteTypeInfo, MKLDNNEltwiseNode::Initializer> M
|
||||
}},
|
||||
{ngraph::op::v0::PRelu::get_type_info_static(), [](const std::shared_ptr<ngraph::Node>& op, MKLDNNEltwiseNode& node) {
|
||||
node.algorithm = EltwisePrelu;
|
||||
node.broadcastingPolicy = determineBroadcastingPolicy(op);
|
||||
}},
|
||||
{ngraph::op::v0::Erf::get_type_info_static(), [](const std::shared_ptr<ngraph::Node>& op, MKLDNNEltwiseNode& node) {
|
||||
node.algorithm = EltwiseErf;
|
||||
@ -984,7 +1011,7 @@ bool MKLDNNEltwiseNode::isSupportedOperation(const std::shared_ptr<const ngraph:
|
||||
}
|
||||
|
||||
MKLDNNEltwiseNode::MKLDNNEltwiseNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) :
|
||||
MKLDNNNode(op, eng, cache) {
|
||||
MKLDNNNode(op, eng, cache), broadcastingPolicy(Undefined) {
|
||||
std::string errorMessage;
|
||||
if (!isSupportedOperation(op, errorMessage)) {
|
||||
IE_THROW(NotImplemented) << errorMessage;
|
||||
@@ -1713,106 +1740,124 @@ void MKLDNNEltwiseNode::fuseInto(MKLDNNNodePtr& parentNode) {
getInputShapeAtPort(0) == getInputShapeAtPort(1);
if (!specialConvolutionAddFusing && canBePerformedAsScaleShift(parentNode.get())) {
std::tie(scales, shifts) = getScalesAndShifts(parentNode.get());
if ((parentNode->getType() == FullyConnected || parentNode->getType() == MatMul) && one_of(getAlgorithm(), EltwiseAdd, EltwiseSubtract,
EltwiseMultiply, EltwiseDivide, EltwiseMulAdd, EltwisePowerStatic, EltwisePrelu)) {
std::tie(scales, shifts) = getScalesAndShifts(parentNode.get());
}
}
MKLDNNNode::fuseInto(parentNode);
}

void MKLDNNEltwiseNode::appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, int align, bool initAsBinary, bool initBinaryMemory) {
void MKLDNNEltwiseNode::appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, int align) {
const std::string errorPrefix = "Appending Eltwise node with name '" + getName() + "' ";

if (getMKLDNNAlgorithm() != mkldnn::algorithm::undef) {
switch (getMKLDNNAlgorithm()) {
case mkldnn::algorithm::eltwise_relu:
case mkldnn::algorithm::eltwise_tanh:
case mkldnn::algorithm::eltwise_elu:
case mkldnn::algorithm::eltwise_square:
case mkldnn::algorithm::eltwise_abs:
case mkldnn::algorithm::eltwise_sqrt:
case mkldnn::algorithm::eltwise_linear:
case mkldnn::algorithm::eltwise_bounded_relu:
case mkldnn::algorithm::eltwise_soft_relu:
case mkldnn::algorithm::eltwise_logistic:
case mkldnn::algorithm::eltwise_exp:
case mkldnn::algorithm::eltwise_gelu_erf:
case mkldnn::algorithm::eltwise_gelu_tanh:
case mkldnn::algorithm::eltwise_clip:
case mkldnn::algorithm::eltwise_swish:
case mkldnn::algorithm::eltwise_hardswish:
case mkldnn::algorithm::eltwise_mish:
case mkldnn::algorithm::eltwise_hsigmoid:
case mkldnn::algorithm::eltwise_round_half_to_even:
case mkldnn::algorithm::eltwise_round_half_away_from_zero:
ops.append_eltwise(1.0, getMKLDNNAlgorithm(), getAlpha(), getBeta());
break;
default: IE_THROW() << errorPrefix << "as post operation is not supported";
case mkldnn::algorithm::eltwise_relu:
case mkldnn::algorithm::eltwise_tanh:
case mkldnn::algorithm::eltwise_elu:
case mkldnn::algorithm::eltwise_square:
case mkldnn::algorithm::eltwise_abs:
case mkldnn::algorithm::eltwise_sqrt:
case mkldnn::algorithm::eltwise_linear:
case mkldnn::algorithm::eltwise_bounded_relu:
case mkldnn::algorithm::eltwise_soft_relu:
case mkldnn::algorithm::eltwise_logistic:
case mkldnn::algorithm::eltwise_exp:
case mkldnn::algorithm::eltwise_gelu_erf:
case mkldnn::algorithm::eltwise_gelu_tanh:
case mkldnn::algorithm::eltwise_clip:
case mkldnn::algorithm::eltwise_swish:
case mkldnn::algorithm::eltwise_hardswish:
case mkldnn::algorithm::eltwise_mish:
case mkldnn::algorithm::eltwise_hsigmoid:
case mkldnn::algorithm::eltwise_round_half_to_even:
case mkldnn::algorithm::eltwise_round_half_away_from_zero:
ops.append_eltwise(1.0, getMKLDNNAlgorithm(), getAlpha(), getBeta());
break;
default: IE_THROW() << errorPrefix << "as post operation is not supported";
}
} else {
const size_t chIdx = postOpDims.size() > 1 ? 1 : 0;
const size_t chIdx = postOpDims.size() > 1 ? getFusingAxis() : 0;
scalesBuffer = makeAlignedBuffer(postOpDims[chIdx], scales, align);
if (getAlgorithm() != EltwisePrelu) {
shiftsBuffer = makeAlignedBuffer(postOpDims[chIdx], shifts, align);
}

if (initAsBinary) {
auto appendBinary = [&](const mkldnn::algorithm alg, MKLDNNMemoryPtr &memPtr, const std::vector<float> &data) {
if (data.empty())
IE_THROW() << errorPrefix << "cannot be performed since buffers are not allocated";

std::vector<size_t> binaryDims(postOpDims.size(), 1);
binaryDims[chIdx] = postOpDims[chIdx];

DnnlBlockedMemoryDesc memoryDesc(Precision::FP32, Shape(binaryDims));
ops.append_binary(alg, memoryDesc.getDnnlDesc());

if (initBinaryMemory) {
memPtr.reset(new MKLDNNMemory(getEngine()));
memPtr->Create(memoryDesc, &data[0]);
}
};
switch (getAlgorithm()) {
case EltwiseAdd:
case EltwiseSubtract:
appendBinary(mkldnn::algorithm::binary_add, shiftsMemory, shiftsBuffer);
break;
case EltwiseMultiply:
case EltwiseDivide:
appendBinary(mkldnn::algorithm::binary_mul, scalesMemory, scalesBuffer);
break;
case EltwiseMulAdd:
case EltwisePowerStatic:
appendBinary(mkldnn::algorithm::binary_mul, scalesMemory, scalesBuffer);
appendBinary(mkldnn::algorithm::binary_add, shiftsMemory, shiftsBuffer);
break;
case EltwisePrelu:
appendBinary(mkldnn::algorithm::binary_prelu, scalesMemory, scalesBuffer);
break;
default:
IE_THROW() << errorPrefix << "as post operation is not supported";
}
} else {
switch (getAlgorithm()) {
case EltwiseAdd:
case EltwiseSubtract:
case EltwiseMultiply:
case EltwiseDivide:
case EltwiseMulAdd:
case EltwisePowerStatic:
if (scalesBuffer.empty() || shiftsBuffer.empty())
IE_THROW() << errorPrefix << "cannot be performed since buffers are not allocated";
ops.append_depthwise(mkldnn::algorithm::depthwise_scale_shift, &scalesBuffer[0], &shiftsBuffer[0]);
break;
case EltwisePrelu:
if (scalesBuffer.empty())
IE_THROW() << errorPrefix << "cannot be performed since buffers are not allocated";
ops.append_depthwise(mkldnn::algorithm::depthwise_prelu, &scalesBuffer[0], nullptr);
break;
default:
IE_THROW() << errorPrefix << "as post operation is not supported";
}
/* @todo legacy depthwise post ops are kept for now
 * for performance reasons
 */
switch (getAlgorithm()) {
case EltwiseAdd:
case EltwiseSubtract:
case EltwiseMultiply:
case EltwiseDivide:
case EltwiseMulAdd:
case EltwisePowerStatic:
if (scales.empty() || shifts.empty())
IE_THROW() << errorPrefix << "cannot be performed since buffers are not allocated";
ops.append_depthwise(mkldnn::algorithm::depthwise_scale_shift, &scalesBuffer[0], &shiftsBuffer[0]);
break;
case EltwisePrelu:
if (scales.empty())
IE_THROW() << errorPrefix << "cannot be performed since buffers are not allocated";
ops.append_depthwise(mkldnn::algorithm::depthwise_prelu, &scalesBuffer[0], nullptr);
break;
default:
IE_THROW() << errorPrefix << "as post operation is not supported";
}
}
}

void MKLDNNEltwiseNode::appendBinPostOps(mkldnn::post_ops& ops, const VectorDims& postOpDims, std::vector<MKLDNNMemoryPtr>& binaryPostOpsMem) {
const std::string errorPrefix = "Appending Eltwise node with name '" + getName() + "' as binary post op ";
VectorDims broadcastBinaryShape(postOpDims.size(), 1);

auto appendBinary = [&](const mkldnn::algorithm alg, MKLDNNMemoryPtr &memPtr, const std::vector<float> &data) {
if (data.empty())
IE_THROW() << errorPrefix << "cannot be performed since buffers are not allocated";
if (broadcastingPolicy == Undefined)
IE_THROW() << errorPrefix << "cannot be performed since policy is Undefined";

DnnlBlockedMemoryDesc memoryDesc(Precision::FP32, broadcastingPolicy == PerTensor ? Shape(broadcastBinaryShape) : Shape(postOpDims));

ops.append_binary(alg, memoryDesc.getDnnlDesc());

if (!memPtr) {
memPtr.reset(new MKLDNNMemory(getEngine()));
memPtr->Create(memoryDesc, &data[0]);

binaryPostOpsMem.push_back(memPtr);
}
};

switch (getAlgorithm()) {
case EltwiseAdd:
case EltwiseSubtract:
appendBinary(mkldnn::algorithm::binary_add, shiftsMemory, shifts);
break;
case EltwiseDivide:
case EltwiseMultiply:
appendBinary(mkldnn::algorithm::binary_mul, scalesMemory, scales);
break;
case EltwiseMulAdd:
appendBinary(mkldnn::algorithm::binary_mul, scalesMemory, scales);
appendBinary(mkldnn::algorithm::binary_add, shiftsMemory, shifts);
break;
case EltwisePowerStatic:
if (beta != 1.0f) // Multiply if has scales
appendBinary(mkldnn::algorithm::binary_mul, scalesMemory, scales);
if (gamma != 0.0f) // Add only if has shifts
appendBinary(mkldnn::algorithm::binary_add, shiftsMemory, shifts);
break;
case EltwisePrelu:
appendBinary(mkldnn::algorithm::binary_prelu, scalesMemory, scales);
break;
default:
IE_THROW() << errorPrefix << "as post operation is not supported";
}
}

bool MKLDNNEltwiseNode::canFuse(const MKLDNNNodePtr& node) const {
auto isSuitableNode = [this](const MKLDNNEltwiseNode* node) {
// [WA] Since execution precision change from I32 to FP32 for Divide operation may lead to incorrect results
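Editor's note: appendBinPostOps() for Eltwise chooses the binary tensor shape from the broadcasting policy recorded at construction time. A reduced sketch of that choice (illustrative names only, not the plugin API):

#include <cstddef>
#include <vector>

enum class BroadcastingPolicy { PerChannel, PerTensor, Undefined };

// Per-tensor constants are appended as a fully broadcast {1, 1, ...} tensor,
// per-channel constants keep the full post-op dims.
std::vector<std::size_t> binaryTensorShape(const std::vector<std::size_t>& postOpDims,
                                           BroadcastingPolicy policy) {
    if (policy == BroadcastingPolicy::PerTensor)
        return std::vector<std::size_t>(postOpDims.size(), 1);
    return postOpDims;
}

An Undefined policy is rejected earlier in appendBinPostOps with an exception, so it never reaches this shape selection.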
@@ -75,7 +75,8 @@ public:
bool created() const override;
bool canBeInPlace() const override;
bool canFuse(const MKLDNNNodePtr& node) const override;
void appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, int align = -1, bool initAsBinary = false, bool initBinaryMemory = false) override;
void appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, int align = -1) override;
void appendBinPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, std::vector<MKLDNNMemoryPtr>& binaryPostOpsMem) override;
void fuseInto(MKLDNNNodePtr& parentNode) override;
InferenceEngine::Precision getRuntimePrecision() const override;

@@ -97,8 +98,17 @@ public:

void executeDynamicImpl(mkldnn::stream strm) override { execute(strm); }

enum BroadcastingPolicy {
PerChannel,
PerTensor,
Undefined,
};

BroadcastingPolicy getBroadcastingPolicy() const { return broadcastingPolicy; }

static bool isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept;

private:
struct EltwiseExecutor {
EltwiseExecutor(size_t batch) : batchDimIdx(batch) {}
@@ -130,6 +140,8 @@ private:
size_t fullWorkAmount = 0;
};

BroadcastingPolicy broadcastingPolicy;

mkldnn::algorithm mkldnnAlgorithm = mkldnn::algorithm::undef;

static const int optimalTensorRank = 6;
@@ -157,6 +169,8 @@ private:
using Initializer = std::function<void(const std::shared_ptr<ngraph::Node>&, MKLDNNEltwiseNode& node)>;
static const std::map<const ngraph::DiscreteTypeInfo, Initializer> initializers;

static BroadcastingPolicy determineBroadcastingPolicy(const std::shared_ptr<ngraph::Node>& op);

void executeOptimized6D(const std::unique_ptr<jit_uni_eltwise_kernel> &pKernel, const jit_eltwise_call_args_ptrs &args_ptrs,
const VectorDims &dims_out) const;
void executeOptimizedGeneric(const std::unique_ptr<jit_uni_eltwise_kernel> &pKernel, const jit_eltwise_call_args_ptrs &args_ptrs,
@@ -860,7 +860,15 @@ bool MKLDNNFakeQuantizeNode::isSupportedOperation(const std::shared_ptr<const ng
count_not_unit_axis++;
}
}
if (count_not_unit_axis > 1 || not_unit_axis > 1) {

/* @todo
 * Channel axis 2 is added for 3D MatMul (most common one).
 * FQ for non-1 channel fallbacks to reference implementation.
 * Expected to be fused for 3D MatMul
 * Long term idea: restore limitation for channel axis 1 and
 * support fusing of unfolded FQ (see FakeQuantizeDecomposition transformation)
 */
if (count_not_unit_axis > 1 || !one_of(not_unit_axis, 1, 2)) {
errorMessage = "Supports only per-tensor and per-channel quantizations";
return false;
}
@@ -1057,6 +1065,13 @@ MKLDNNFakeQuantizeNode::MKLDNNFakeQuantizeNode(const std::shared_ptr<ngraph::Nod
outputScaleSize = outputScale.size();
outputShiftSize = outputShift.size();

if (everyone_is(1, cropLowSize, cropHighSize, inputScaleSize, inputShiftSize, outputScaleSize, outputShiftSize))
broadcastingPolicy = PerTensor;
else if (one_of(1, cropLowSize, cropHighSize, inputScaleSize, inputShiftSize, outputScaleSize, outputShiftSize))
broadcastingPolicy = Mixed;
else
broadcastingPolicy = PerChannel;

bool quantizationOnly = true;

for (int i = 0; i < cropLow.size(); i++) {
@@ -1649,14 +1664,12 @@ void MKLDNNFakeQuantizeNode::execute(mkldnn::stream strm) {
}
}

void MKLDNNFakeQuantizeNode::appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, int align, bool initAsBinary, bool initBinaryMemory) {
// MKLDNN quantization_injectors assumes that quantization data memory is always aligned on 16
// by length of AVX512 vector register which is also enough for AVX2 and SSE42 implementations.
// Otherwise it can lead to buffer over-read and performance penalties due to denormals.
const size_t bufferAlignment = 16;
void MKLDNNFakeQuantizeNode::initializePostOpData(const VectorDims &dims, const size_t bufferAlignment) {
if (isPostOpDataInitialized)
return;

if (getAlgorithm() == FQBinarization) {
const auto realAxisSize = postOpDims[postOpDims.size() > 1 ? 1 : 0];
const auto realAxisSize = dims[dims.size() > 1 ? 1 : 0];
const auto axisPaddedSize = rnd_up(realAxisSize, bufferAlignment);
if (!isPostOpDataInitialized) {
binarizationThresholds.resize(axisPaddedSize, 0);
@@ -1671,73 +1684,76 @@ void MKLDNNFakeQuantizeNode::appendPostOps(mkldnn::post_ops& ops, const VectorDi
std::fill(binarizationThresholds.begin() + realAxisSize, binarizationThresholds.end(), 0);
}
}

ops.append_binarization(mkldnn::algorithm::binarization_depthwise, (const float*)&binarizationThresholds[0], (const float*)&binarizationOutputMask[0]);

if (!isInputLowBroadcasted && !isOutputHighBroadcasted) {
isPostOpDataInitialized = true;
}
} else {
if (!isPostOpDataInitialized) {
if (cropLow.size() > 1)
cropLow.resize(rnd_up(cropLow.size(), bufferAlignment), 0);
if (cropHigh.size() > 1)
cropHigh.resize(rnd_up(cropHigh.size(), bufferAlignment), 0);
if (inputScale.size() > 1)
inputScale.resize(rnd_up(inputScale.size(), bufferAlignment), 0);
if (inputShift.size() > 1)
inputShift.resize(rnd_up(inputShift.size(), bufferAlignment), 0);
if (outputScale.size() > 1)
outputScale.resize(rnd_up(outputScale.size(), bufferAlignment), 0);
if (outputShift.size() > 1)
outputShift.resize(rnd_up(outputShift.size(), bufferAlignment), 0);
if (cropLow.size() > 1)
cropLow.resize(rnd_up(cropLow.size(), bufferAlignment), 0);
if (cropHigh.size() > 1)
cropHigh.resize(rnd_up(cropHigh.size(), bufferAlignment), 0);
if (inputScale.size() > 1)
inputScale.resize(rnd_up(inputScale.size(), bufferAlignment), 0);
if (inputShift.size() > 1)
inputShift.resize(rnd_up(inputShift.size(), bufferAlignment), 0);
if (outputScale.size() > 1)
outputScale.resize(rnd_up(outputScale.size(), bufferAlignment), 0);
if (outputShift.size() > 1)
outputShift.resize(rnd_up(outputShift.size(), bufferAlignment), 0);

cropLowData.set(cropLow.size(), 1 << 1, &cropLow[0]);
cropHighData.set(cropHigh.size(), 1 << 1, &cropHigh[0]);
inputScaleData.set(inputScale.size(), 1 << 1, &inputScale[0]);
inputShiftData.set(inputShift.size(), 1 << 1, &inputShift[0]);
outputScaleData.set(outputScale.size(), 1 << 1, &outputScale[0]);
outputShiftData.set(outputShift.size(), 1 << 1, &outputShift[0]);
}
cropLowData.set(cropLow.size(), 1 << 1, &cropLow[0]);
cropHighData.set(cropHigh.size(), 1 << 1, &cropHigh[0]);
inputScaleData.set(inputScale.size(), 1 << 1, &inputScale[0]);
inputShiftData.set(inputShift.size(), 1 << 1, &inputShift[0]);
outputScaleData.set(outputScale.size(), 1 << 1, &outputScale[0]);
outputShiftData.set(outputShift.size(), 1 << 1, &outputShift[0]);
}

isPostOpDataInitialized = true;
}

void MKLDNNFakeQuantizeNode::appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, int align) {
initializePostOpData(postOpDims, align);

if (getAlgorithm() == FQBinarization) {
ops.append_binarization(mkldnn::algorithm::binarization_depthwise, (const float*)&binarizationThresholds[0], (const float*)&binarizationOutputMask[0]);
} else {
mkldnn::algorithm alg = getAlgorithm() == FQCommon ? mkldnn::algorithm::quantization_quantize_dequantize :
mkldnn::algorithm::quantization_quantize;

if (initAsBinary) {
auto appendBinary = [&](const mkldnn::algorithm alg, const size_t dataSize, MKLDNNMemoryPtr &memPtr, const void *data) {
const auto rank = getOutputShapeAtPort(0).getRank();
auto chIdx = rank > 1 ? 1 : 0;

std::vector<size_t> binaryShape(rank, 1);
binaryShape[chIdx] = dataSize;

DnnlBlockedMemoryDesc memoryDesc(Precision::FP32, Shape(binaryShape));
ops.append_binary(alg, memoryDesc.getDnnlDesc());

if (initBinaryMemory) {
memPtr.reset(new MKLDNNMemory(getEngine()));
memPtr->Create(memoryDesc, data);
}
};

appendBinary(mkldnn::algorithm::binary_min, cropHighSize, cropHighMemory, &cropHighData.shifts_[0]);
appendBinary(mkldnn::algorithm::binary_max, cropLowSize, cropLowMemory, &cropLowData.shifts_[0]);
appendBinary(mkldnn::algorithm::binary_mul, inputScaleSize, inputScaleMemory, &inputScaleData.scales_[0]);
appendBinary(mkldnn::algorithm::binary_add, inputShiftSize, inputShiftMemory, &inputShiftData.shifts_[0]);
if (alg == mkldnn::algorithm::quantization_quantize_dequantize) {
ops.append_eltwise(1.0f, mkldnn::algorithm::eltwise_round_half_to_even, 0, 0);
}
appendBinary(mkldnn::algorithm::binary_mul, outputScaleSize, outputScaleMemory, &outputScaleData.scales_[0]);
appendBinary(mkldnn::algorithm::binary_add, outputShiftSize, outputShiftMemory, &outputShiftData.shifts_[0]);

} else {
ops.append_quantization(alg, &cropLowData, &cropHighData, &inputScaleData, &inputShiftData, &outputScaleData, &outputShiftData);
}

isPostOpDataInitialized = true;
ops.append_quantization(alg, &cropLowData, &cropHighData, &inputScaleData, &inputShiftData, &outputScaleData, &outputShiftData);
}
}

void MKLDNNFakeQuantizeNode::appendBinPostOps(mkldnn::post_ops& ops, const VectorDims& postOpDims, std::vector<MKLDNNMemoryPtr>& binaryPostOpsMem) {
static const size_t bufferAlignment = 1;

initializePostOpData(postOpDims, bufferAlignment);

VectorDims broadcastBinaryShape(postOpDims.size(), 1);

auto appendBinary = [&](const mkldnn::algorithm alg, const size_t dataSize, MKLDNNMemoryPtr &memPtr, const void *data) {
DnnlBlockedMemoryDesc memoryDesc(Precision::FP32, dataSize == 1 ? Shape(broadcastBinaryShape) : Shape(postOpDims));
ops.append_binary(alg, memoryDesc.getDnnlDesc());

if (!memPtr) {
memPtr.reset(new MKLDNNMemory(getEngine()));
memPtr->Create(memoryDesc, data);

binaryPostOpsMem.push_back(memPtr);
}
};

mkldnn::algorithm alg = getAlgorithm() == FQCommon ? mkldnn::algorithm::quantization_quantize_dequantize :
mkldnn::algorithm::quantization_quantize;

appendBinary(mkldnn::algorithm::binary_min, cropHighSize, cropHighMemory, &cropHighData.shifts_[0]);
appendBinary(mkldnn::algorithm::binary_max, cropLowSize, cropLowMemory, &cropLowData.shifts_[0]);
appendBinary(mkldnn::algorithm::binary_mul, inputScaleSize, inputScaleMemory, &inputScaleData.scales_[0]);
appendBinary(mkldnn::algorithm::binary_add, inputShiftSize, inputShiftMemory, &inputShiftData.shifts_[0]);
if (alg == mkldnn::algorithm::quantization_quantize_dequantize) {
ops.append_eltwise(1.0f, mkldnn::algorithm::eltwise_round_half_to_even, 0, 0);
}
appendBinary(mkldnn::algorithm::binary_mul, outputScaleSize, outputScaleMemory, &outputScaleData.scales_[0]);
appendBinary(mkldnn::algorithm::binary_add, outputShiftSize, outputShiftMemory, &outputShiftData.shifts_[0]);
}

MKLDNNFakeQuantizeNode::FakeQuantizeJitExecutor::FakeQuantizeJitExecutor(const jit_quantize_params &_jqp) {
bool isBinarization = _jqp.op_type == FQBinarization;
if (mayiuse(cpu::x64::avx512_common)) {
@@ -121,11 +121,22 @@ public:
InferenceEngine::Precision getInputPrecision() const { return inputPrecision; }
InferenceEngine::Precision getOutputPrecision() const { return outputPrecision; }

void appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims = {}, int align = -1, bool initAsBinary = false,
bool initBinaryMemory = false) override;
// MKLDNN quantization_injectors assumes that quantization data memory is always aligned on 16
// by length of AVX512 vector register which is also enough for AVX2 and SSE42 implementations.
// Otherwise it can lead to buffer over-read and performance penalties due to denormals.
void appendPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims = {}, int align = 16) override;
void appendBinPostOps(mkldnn::post_ops& ops, const VectorDims &postOpDims, std::vector<MKLDNNMemoryPtr>& binaryPostOpsMem) override;

static bool isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept;

enum BroadcastingPolicy {
PerChannel, // all FQ operations are per channel
PerTensor, // all FQ operations are per tensor
Mixed, // some per channel, some per tensor
};

BroadcastingPolicy getBroadcastingPolicy() const { return broadcastingPolicy; }

MKLDNNMemoryPtr cropLowMemory;
MKLDNNMemoryPtr cropHighMemory;
MKLDNNMemoryPtr inputScaleMemory;
@@ -149,6 +160,7 @@ private:

void init() override;
std::vector<LayoutType> getDataFormats() const;
void initializePostOpData(const VectorDims &postOpDims, const size_t bufferAlignment);
void executeReference();
void executeBinarization(const std::unique_ptr<jit_uni_quantize_kernel> &pKernel) const;
void executeQuantization(const std::unique_ptr<jit_uni_quantize_kernel> &pKernel) const;
@@ -195,6 +207,8 @@ private:
InferenceEngine::Precision outputPrecision = InferenceEngine::Precision::FP32;

std::string errorPrefix;

BroadcastingPolicy broadcastingPolicy;
};

} // namespace MKLDNNPlugin
|
||||
else
|
||||
primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_WEIGHTS, getParentEdgeAt(WEIGHTS_ID)->getMemory().GetPrimitive()}, {DNNL_ARG_DST, dst}};
|
||||
|
||||
auto post_ops = attr->get_post_ops();
|
||||
int idx = 0;
|
||||
for (int i = 0; i < post_ops.len(); i++) {
|
||||
if (post_ops.kind(i) == mkldnn::primitive::kind::binary) {
|
||||
primArgs.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1, binaryPostOpsArgs[idx++]});
|
||||
}
|
||||
}
|
||||
appendPostOpArgs(*attr);
|
||||
}
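The removed loop above is what the shared appendPostOpArgs helper now does for every node type: binary post-op number i gets its second input bound under DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1. A hedged sketch of that mapping (assumes the oneDNN C++ API and the same primArgs/binaryPostOpsArgs members; not the actual helper):

#include <unordered_map>
#include <vector>
#include <mkldnn.hpp>

// Bind one SRC_1 memory per binary post-op, in the order the memories were collected.
void appendPostOpArgsSketch(const mkldnn::primitive_attr& attr,
                            std::unordered_map<int, mkldnn::memory>& primArgs,
                            const std::vector<mkldnn::memory>& binaryPostOpsArgs) {
    const auto post_ops = attr.get_post_ops();
    int binaryIdx = 0;
    for (int i = 0; i < post_ops.len(); i++) {
        if (post_ops.kind(i) == mkldnn::primitive::kind::binary)
            primArgs[DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1] = binaryPostOpsArgs[binaryIdx++];
    }
}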

void MKLDNNFullyConnectedNode::execute(mkldnn::stream strm) {

@ -183,42 +177,32 @@ bool MKLDNNFullyConnectedNode::canFuse(const MKLDNNNodePtr& node) const {
return canFuseSimpleOperation(node);
}

void MKLDNNFullyConnectedNode::setPostOps(mkldnn::primitive_attr &attr, bool initWeights = false, bool initAsBinary = false) {
bool initBinaryMemory = initWeights;
void MKLDNNFullyConnectedNode::setPostOps(mkldnn::primitive_attr &attr, bool initWeights = false) {
mkldnn::post_ops ops;

auto getBinPostOpShape = [&](){
const size_t binaryShapeRank = getOutputShapeAtPort(0).getRank() == 3 ? 2 : getOutputShapeAtPort(0).getRank();
VectorDims binaryShape(binaryShapeRank, 1);
const size_t channelAxis = getFusingAxis();
// always use 1 as channelAxis for binary Shape, since oneDNN primitive is actually always 2D
binaryShape[1] = getOutputShapeAtPort(0).getStaticDims()[channelAxis];

return binaryShape;
};
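As the comment in the lambda notes, the FullyConnected primitive is effectively 2D from oneDNN's point of view, so a rank-3 output collapses to a {1, C} binary shape. A small standalone sketch of the same computation (illustrative only):

#include <cstddef>
#include <vector>

// For a FullyConnected output of rank 2 or 3 the per-channel binary post-op shape is
// rank-2 with the channel count (last output dimension) in position 1:
// {N, C} -> {1, C} and {N, T, C} -> {1, C}.
std::vector<std::size_t> fcBinaryPostOpShape(const std::vector<std::size_t>& outDims) {
    const std::size_t rank = outDims.size() == 3 ? 2 : outDims.size();
    std::vector<std::size_t> binaryShape(rank, 1);
    binaryShape[1] = outDims.back();  // assumes rank >= 2, as FullyConnected outputs are
    return binaryShape;
}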

for (auto &node : fusedWith) {
auto* fakeQuantizeNode = dynamic_cast<MKLDNNFakeQuantizeNode *>(node.get());
if (fakeQuantizeNode) {
// no need to fill post ops dims for fq, make sense only for bin fq
fakeQuantizeNode->appendPostOps(ops, VectorDims{}, -1, initAsBinary, initBinaryMemory);
if (initBinaryMemory) {
if (fakeQuantizeNode->cropHighMemory)
binaryPostOpsArgs.push_back(fakeQuantizeNode->cropHighMemory->GetPrimitive());
if (fakeQuantizeNode->cropLowMemory)
binaryPostOpsArgs.push_back(fakeQuantizeNode->cropLowMemory->GetPrimitive());
if (fakeQuantizeNode->inputScaleMemory)
binaryPostOpsArgs.push_back(fakeQuantizeNode->inputScaleMemory->GetPrimitive());
if (fakeQuantizeNode->inputShiftMemory)
binaryPostOpsArgs.push_back(fakeQuantizeNode->inputShiftMemory->GetPrimitive());
if (fakeQuantizeNode->outputScaleMemory)
binaryPostOpsArgs.push_back(fakeQuantizeNode->outputScaleMemory->GetPrimitive());
if (fakeQuantizeNode->outputShiftMemory)
binaryPostOpsArgs.push_back(fakeQuantizeNode->outputShiftMemory->GetPrimitive());
}
if (auto* fakeQuantizeNode = dynamic_cast<MKLDNNFakeQuantizeNode *>(node.get())) {
fakeQuantizeNode->appendBinPostOps(ops, getBinPostOpShape(), binaryPostOpsArgs);
continue;
}

auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
if (eltwiseNode) {
if (auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get())) {
// TODO [DS]: change to shape from memory
constexpr int align = -1;
eltwiseNode->appendPostOps(ops, getOutputShapeAtPort(0).getStaticDims(), align, initAsBinary, initBinaryMemory);
if (initBinaryMemory) {
if (eltwiseNode->scalesMemory)
binaryPostOpsArgs.push_back(eltwiseNode->scalesMemory->GetPrimitive());
if (eltwiseNode->shiftsMemory)
binaryPostOpsArgs.push_back(eltwiseNode->shiftsMemory->GetPrimitive());
if (eltwiseNode->getMKLDNNAlgorithm() != mkldnn::algorithm::undef) {
eltwiseNode->appendPostOps(ops, getOutputShapeAtPort(0).getStaticDims(), align);
} else {
eltwiseNode->appendBinPostOps(ops, getBinPostOpShape(), binaryPostOpsArgs);
}
continue;
}

@ -280,7 +264,7 @@ const std::vector<impl_desc_type>& MKLDNNFullyConnectedNode::getPrimitivesPriori
MKLDNNNode::AttrPtr MKLDNNFullyConnectedNode::initPrimitiveAttr() {
auto attr = std::make_shared<mkldnn::primitive_attr>(mkldnn::primitive_attr());

setPostOps(*attr, true, true);
setPostOps(*attr);

return attr;
}

@ -26,6 +26,10 @@ public:
return false;
}

size_t getFusingAxis() const override {
return getOutputShapeAtPort(0).getRank() == 3 ? 2 : 1;
}

const std::vector<impl_desc_type>& getPrimitivesPriority() override;
void createDescriptor(const std::vector<MemoryDescPtr>& inputDesc,
const std::vector<MemoryDescPtr>& outputDesc) override;

@ -43,8 +47,7 @@ public:

static bool isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept;

protected:
AttrPtr initPrimitiveAttr();
std::shared_ptr<mkldnn::primitive_attr> initPrimitiveAttr() override;

private:
void createDescriptorInternal(const mkldnn::memory::desc &inputDesc,

@ -54,7 +57,7 @@ private:
InferenceEngine::SizeVector biasesDims;

std::vector<MKLDNNMemoryPtr> PostOpsIntBlobMemory;
void setPostOps(mkldnn::primitive_attr &attr, bool initWeights, bool initAsBinary);
void setPostOps(mkldnn::primitive_attr &attr, bool initWeights);

bool withBiases = false;

@ -17,6 +17,7 @@
#include "common/cpu_memcpy.h"
#include <ngraph/opsets/opset1.hpp>
#include "memory_desc/dnnl_blocked_memory_desc.h"
#include "nodes/mkldnn_fake_quantize_node.h"
#include "utils/general_utils.h"
#include "memory_desc/cpu_memory_desc_utils.h"
#include "mkldnn_extension_utils.h"

@ -54,31 +55,65 @@ bool MKLDNNMatMulNode::isSupportedOperation(const std::shared_ptr<const ngraph::
}

MKLDNNMatMulNode::MKLDNNMatMulNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) :
MKLDNNNode(op, eng, cache) {
MKLDNNNode(op, eng, cache), withBiases(false) {
std::string errorMessage;
errorPrefix = "MatMul node with name '" + getName() + "'";

if (!isSupportedOperation(op, errorMessage))
IE_THROW(NotImplemented) << errorMessage;

errorPrefix = "MatMul node with name '" + getName() + "'";

const auto matMul = std::dynamic_pointer_cast<const ngraph::opset1::MatMul>(op);

if (!matMul) {
IE_THROW(NotImplemented) << "Operation with name " << op->get_friendly_name() << ":" << op->get_type_name() <<
" is not an instance of MatMul from opset1";
}

transposeIn[0] = matMul->get_transpose_a();
transposeIn[1] = matMul->get_transpose_b();
}

bool MKLDNNMatMulNode::canFuse(const MKLDNNNodePtr& node) const {
return one_of(node->getAlgorithm(), EltwiseRelu, EltwiseGelu, EltwiseElu, EltwiseSigmoid, EltwiseClamp, EltwiseTanh,
EltwiseSwish, EltwiseHswish, EltwiseMish, EltwiseHsigmoid, EltwiseRoundHalfToEven,
EltwiseRoundHalfAwayFromZero, EltwiseAbs, EltwiseSqrt, EltwiseSoftRelu);
// per channel binary post op for rank > 2D is supported only by oneDNN reference implementation because of unusual MatMul channel axis (issue 6669)
if (getOutputShapeAtPort(0).getRank() > 2) {
if (const auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get())) {
if (one_of(eltwiseNode->getAlgorithm(),
EltwiseAdd, EltwiseMultiply, EltwiseSubtract, EltwiseDivide, EltwisePrelu, EltwiseMulAdd, EltwisePowerStatic) &&
eltwiseNode->getBroadcastingPolicy() != MKLDNNEltwiseNode::PerTensor) {
return false;
}
} else if (const auto* fakeQuantizeNode = dynamic_cast<MKLDNNFakeQuantizeNode *>(node.get())) {
if (fakeQuantizeNode->getBroadcastingPolicy() != MKLDNNFakeQuantizeNode::PerTensor) {
return false;
}
}
}

return canFuseSimpleOperation(node);
}
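The extra rank > 2 restriction exists because MatMul's channel axis is the last output dimension, unlike the axis-1 convention used for Convolution-style nodes, and oneDNN currently handles such per-channel binary post-ops only in its reference matmul implementation (issue 6669). A sketch of the axis convention this patch assumes (illustrative only):

#include <cstddef>
#include <vector>

// Fusing (channel) axis convention: MatMul fuses along the last axis,
// Convolution-like nodes along axis 1.
std::size_t fusingAxisFor(bool isMatMul, std::size_t outputRank) {
    return isMatMul ? outputRank - 1 : 1;
}

// Hence for a {B, M, N} MatMul output the per-channel post-op data has shape {1, 1, N},
// while a {N, C, H, W} convolution output expects {1, C, 1, 1}.
std::vector<std::size_t> perChannelShapeFor(const std::vector<std::size_t>& outDims, std::size_t channelAxis) {
    std::vector<std::size_t> shape(outDims.size(), 1);
    shape[channelAxis] = outDims[channelAxis];
    return shape;
}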

void MKLDNNMatMulNode::setPostOps(mkldnn::primitive_attr &attr, const VectorDims& dims, bool initWeights = false) const {
void MKLDNNMatMulNode::setPostOps(mkldnn::primitive_attr &attr, const VectorDims& dims, bool initWeights = false) {
mkldnn::post_ops ops;

for (auto &node : fusedWith) {
auto getBinPostOpShape = [&](){
const auto outShapeRank = dims.size();
const auto chIdx = getFusingAxis();
std::vector<size_t> binaryShape(outShapeRank, 1);
binaryShape[chIdx] = dims[chIdx];
return binaryShape;
};

for (const auto &node : fusedWith) {
if (auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get())) {
eltwiseNode->appendPostOps(ops, dims);
// TODO [DS]: change to shape from memory
if (eltwiseNode->getMKLDNNAlgorithm() != mkldnn::algorithm::undef) {
eltwiseNode->appendPostOps(ops, dims);
} else {
eltwiseNode->appendBinPostOps(ops, getBinPostOpShape(), binaryPostOpsArgs);
}
continue;
} else if (auto* fakeQuantizeNode = dynamic_cast<MKLDNNFakeQuantizeNode *>(node.get())) {
fakeQuantizeNode->appendBinPostOps(ops, getBinPostOpShape(), binaryPostOpsArgs);
continue;
}

@ -88,8 +123,7 @@ void MKLDNNMatMulNode::setPostOps(mkldnn::primitive_attr &attr, const VectorDims
attr.set_post_ops(ops);
}

MKLDNNNode::AttrPtr MKLDNNMatMulNode::initPrimitiveAttr(const VectorDims &dims) const {
MKLDNNNode::AttrPtr MKLDNNMatMulNode::initPrimitiveAttr(const VectorDims &dims) {
auto attr = std::make_shared<mkldnn::primitive_attr>(mkldnn::primitive_attr());

setPostOps(*attr, dims, true);

@ -97,7 +131,7 @@ MKLDNNNode::AttrPtr MKLDNNMatMulNode::initPrimitiveAttr(const VectorDims &dims)
return attr;
}

MKLDNNNode::AttrPtr MKLDNNMatMulNode::initPrimitiveAttr() const {
MKLDNNNode::AttrPtr MKLDNNMatMulNode::initPrimitiveAttr() {
auto dummyShape = MemoryDescUtils::makeDummyShape(getOutputShapeAtPort(0));
return initPrimitiveAttr(dummyShape.getStaticDims());
}

@ -131,12 +165,29 @@ static VectorDims getStridesAndModifyShape(Shape& shape, const bool transpose) {
return strides;
}

mkldnn::memory::desc MKLDNNMatMulNode::getBiasDescFrom(const DnnlMemoryDescCPtr outMemDesc) {
// oneDNN matmul requires shape for bias desc to be the same rank
VectorDims biasDims(outMemDesc->getShape().getRank(), 1);
const auto outDims = outMemDesc->getShape().getStaticDims();
const auto chIdx = getFusingAxis();
biasDims[chIdx] = outDims[chIdx];
const auto bdt = MKLDNNExtensionUtils::IEPrecisionToDataType(getOriginalInputPrecisionAtPort(2));

return mkldnn::memory::desc(MKLDNNExtensionUtils::convertToDnnlDims(biasDims), bdt, memory::format_tag::any);
}
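oneDNN's matmul expects the bias descriptor to have the same rank as the destination, broadcast over every non-channel dimension, which is exactly what getBiasDescFrom builds. A standalone sketch of the dims it produces (illustrative only):

#include <cstddef>
#include <vector>

// For a destination of shape {B, M, N} the bias dims become {1, 1, N}:
// same rank as the output, 1 everywhere except the channel (last) axis.
std::vector<std::size_t> biasDimsForMatMul(const std::vector<std::size_t>& dstDims) {
    std::vector<std::size_t> biasDims(dstDims.size(), 1);
    biasDims.back() = dstDims.back();
    return biasDims;
}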

void MKLDNNMatMulNode::getSupportedDescriptors() {
if (getParentEdges().size() != 2)
if (getParentEdges().size() != getOriginalInputsNumber())
IE_THROW() << errorPrefix << " has incorrect number of input edges for layer " << getName();
if (getChildEdges().empty())
IE_THROW() << errorPrefix << " has incorrect number of output edges for layer " << getName();

withBiases = getOriginalInputsNumber() == 3;

auto canBeExecutedInInt8 = [](const Precision firstInput, const Precision secondInput) {
return one_of(firstInput, Precision::U8, Precision::I8) && secondInput == Precision::I8;
};

auto firstInPortPrec = getOriginalInputPrecisionAtPort(0);
auto secondInPortPrec = getOriginalInputPrecisionAtPort(1);
auto outPortPrec = getOriginalOutputPrecisionAtPort(0);

@ -154,6 +205,9 @@ void MKLDNNMatMulNode::getSupportedDescriptors() {
outPortPrec = fusedWith[fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0);
}

if (!canBeExecutedInInt8(firstInPortPrec, secondInPortPrec) && one_of(outPortPrec, Precision::U8, Precision::I8))
outPortPrec = Precision::FP32; // INT output is not supported for non-INT inputs

const auto& inputShape0 = getInputShapeAtPort(0);
const auto& inputShape1 = getInputShapeAtPort(1);
const auto& outputShape = getOutputShapeAtPort(0);

@ -206,12 +260,19 @@ void MKLDNNMatMulNode::getSupportedDescriptors() {

void MKLDNNMatMulNode::createDescriptor(const std::vector<MemoryDescPtr>& inputDesc,
const std::vector<MemoryDescPtr>& outputDesc) {
MKLDNNDescriptor desc{
std::make_shared<matmul::desc>(inDataDesc[0]->getDnnlDesc(),
inDataDesc[1]->getDnnlDesc(),
outDataDesc->getDnnlDesc())};
std::shared_ptr<mkldnn::matmul::desc> matmul_desc;
if (withBiases) {
matmul_desc.reset(new matmul::desc(inDataDesc[0]->getDnnlDesc(),
inDataDesc[1]->getDnnlDesc(),
getBiasDescFrom(outDataDesc),
outDataDesc->getDnnlDesc()));
} else {
matmul_desc.reset(new matmul::desc(inDataDesc[0]->getDnnlDesc(),
inDataDesc[1]->getDnnlDesc(),
outDataDesc->getDnnlDesc()));
}

descs.push_back(desc);
descs.emplace_back(matmul_desc);
}

void MKLDNNMatMulNode::initSupportedPrimitiveDescriptors() {

@ -262,9 +323,13 @@ void MKLDNNMatMulNode::createPrimitive() {

MemoryDescPtr MKLDNNMatMulNode::getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) {
auto desc = idx > 0 ? primitive_desc_it.weights_desc(idx - 1): primitive_desc_it.src_desc(idx);
return std::make_shared<CpuBlockedMemoryDesc>(
MKLDNNExtensionUtils::DataTypeToIEPrecision(static_cast<mkldnn::memory::data_type>(desc.data.data_type)),
getInputShapeAtPort(idx)); /* provide initial shapes, so hide transpose effect */

if (idx < 2) // inputs
return std::make_shared<CpuBlockedMemoryDesc>(
MKLDNNExtensionUtils::DataTypeToIEPrecision(static_cast<mkldnn::memory::data_type>(desc.data.data_type)),
getInputShapeAtPort(idx)); /* provide initial shapes, so hide transpose effect */
else // bias
return MKLDNNExtensionUtils::makeDescriptor(desc);
}

bool MKLDNNMatMulNode::created() const {

@ -300,10 +365,7 @@ void MKLDNNMatMulNode::prepareParams() {
AttrPtr attr;

if (isDynamicNode()) {
if (!pAttr) {
pAttr = initPrimitiveAttr(src0MemPtr->getStaticDims());
}
attr = pAttr;
attr = initPrimitiveAttr(dstMemPtr->getStaticDims());

const auto& src0Desc = src0MemPtr->getDesc();
const auto& src1Desc = src1MemPtr->getDesc();
@ -323,13 +385,22 @@ void MKLDNNMatMulNode::prepareParams() {

auto dstDnnlDesc = dstMemPtr->GetDescWithType<DnnlMemoryDesc>();

MKLDNNDescriptor desc{
std::make_shared<matmul::desc>(src0TransposedDesc->getDnnlDesc(),
src1TransposedDesc->getDnnlDesc(),
dstDnnlDesc->getDnnlDesc())};
std::shared_ptr<mkldnn::matmul::desc> matmul_desc;

matmul::primitive_desc prim_desc;
if (withBiases) {
matmul_desc.reset(new mkldnn::matmul::desc{src0TransposedDesc->getDnnlDesc(),
src1TransposedDesc->getDnnlDesc(),
getBiasDescFrom(dstDnnlDesc),
dstDnnlDesc->getDnnlDesc()});
} else {
matmul_desc.reset(new mkldnn::matmul::desc(src0TransposedDesc->getDnnlDesc(),
src1TransposedDesc->getDnnlDesc(),
dstDnnlDesc->getDnnlDesc()));
}

MKLDNNDescriptor desc(matmul_desc);
primitive_desc_iterator itpd = desc.createPrimitiveDescriptorIterator(getEngine(), *attr);
matmul::primitive_desc prim_desc;

while (static_cast<bool>(itpd)) {
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());

@ -347,6 +418,10 @@ void MKLDNNMatMulNode::prepareParams() {
primArgs[DNNL_ARG_SRC_0] = src0MemPtr->GetPrimitive();
primArgs[DNNL_ARG_WEIGHTS_0] = src1MemPtr->GetPrimitive();
primArgs[DNNL_ARG_DST] = dstMemPtr->GetPrimitive();
if (withBiases)
primArgs[DNNL_ARG_BIAS] = getParentEdgeAt(2)->getMemoryPtr()->GetPrimitive();

appendPostOpArgs(*attr);
}

void MKLDNNMatMulNode::executeDynamicImpl(dnnl::stream strm) {

@ -32,6 +32,10 @@ public:
return getOriginalInputsNumber();
}

size_t getFusingAxis() const override {
return getOutputShapeAtPort(0).getRank() - 1;
}

void prepareParams() override;
void executeDynamicImpl(mkldnn::stream strm) override;

@ -39,11 +43,15 @@ public:
const std::vector<impl_desc_type>& getPrimitivesPriority() override;

protected:
AttrPtr initPrimitiveAttr() const override;
AttrPtr initPrimitiveAttr(const VectorDims& dims) const;
AttrPtr initPrimitiveAttr() override;
AttrPtr initPrimitiveAttr(const VectorDims& dims);

private:
void setPostOps(mkldnn::primitive_attr &attr, const VectorDims& dims, bool initWeights) const;
mkldnn::memory::desc getBiasDescFrom(const DnnlMemoryDescCPtr outMemDesc);

bool withBiases;

void setPostOps(mkldnn::primitive_attr &attr, const VectorDims& dims, bool initWeights);

std::string errorPrefix;

@ -511,7 +511,7 @@ void MKLDNNPoolingNode::initDescriptor(const NodeConfig& config) {
selectedPD->setConfig(rightConfig);
}

MKLDNNNode::AttrPtr MKLDNNPoolingNode::initPrimitiveAttr() const {
MKLDNNNode::AttrPtr MKLDNNPoolingNode::initPrimitiveAttr() {
auto attr = std::make_shared<mkldnn::primitive_attr>(mkldnn::primitive_attr());

setPostOps(*attr, true);

@ -34,7 +34,7 @@ public:
static bool isSupportedOperation(const std::shared_ptr<const ov::Node>& op, std::string& errorMessage) noexcept;

protected:
AttrPtr initPrimitiveAttr() const override;
AttrPtr initPrimitiveAttr() override;

private:
void setPostOps(mkldnn::primitive_attr &attr, bool initWeights = false) const;

@ -4,6 +4,13 @@

#pragma once

#include <cstddef>
#include <numeric>
#include <vector>

#include "ie_common.h"
#include "ie_layouts.h"

namespace MKLDNNPlugin {

/**

@ -36,7 +43,9 @@ inline std::vector<size_t> getNormalizedDimsBySize(const InferenceEngine::SizeVe
* flag which specify how we compare C dims if value is undefined (weak or strong)
* @return true if broadcastable, false otherwise.
*/
inline bool isPerTensorOrPerChannelBroadcastable(const InferenceEngine::SizeVector &firstInputDims, const InferenceEngine::SizeVector& secondInputDims,
inline bool isPerTensorOrPerChannelBroadcastable(const InferenceEngine::SizeVector &firstInputDims,
const InferenceEngine::SizeVector& secondInputDims,
size_t channelAxis,
bool weakComparison = false) {
bool (*dimsEqual)(size_t, size_t) = weakComparison ? static_cast<bool (*)(size_t, size_t)>(dimsEqualWeak) :
static_cast<bool (*)(size_t, size_t)>(dimsEqualStrong);

@ -47,7 +56,7 @@ inline bool isPerTensorOrPerChannelBroadcastable(const InferenceEngine::SizeVect

std::vector<size_t> normalizedSecondInputDims = getNormalizedDimsBySize(secondInputDims, firstInputDims.size());
for (size_t i = 0; i < normalizedSecondInputDims.size(); i++) {
if ((i == 1 && !dimsEqual(normalizedSecondInputDims[i], firstInputDims[1])) || (i != 1 && normalizedSecondInputDims[i] != 1))
if ((i == channelAxis && !dimsEqual(normalizedSecondInputDims[i], firstInputDims[i])) || (i != channelAxis && normalizedSecondInputDims[i] != 1))
return false;
}
return true;
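The change above generalizes the old hard-coded axis 1 to an arbitrary channelAxis. A hedged standalone version of the per-channel part of the rule (plain C++; assumes both dims vectors are already normalized to the same rank and omits the per-tensor early-out and weak/strong comparison modes of the original):

#include <cstddef>
#include <vector>

// The second input is broadcastable as per-channel data if every dimension is 1
// except possibly channelAxis, which must match the first input there.
bool isPerChannelBroadcastable(const std::vector<std::size_t>& firstDims,
                               const std::vector<std::size_t>& secondDims,
                               std::size_t channelAxis) {
    for (std::size_t i = 0; i < secondDims.size(); i++) {
        const bool ok = (i == channelAxis) ? secondDims[i] == firstDims[i] : secondDims[i] == 1;
        if (!ok)
            return false;
    }
    return true;
}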
@ -51,7 +51,7 @@ std::vector<MatMulWithConstantTransformationTestValues> testValues = {
{ std::vector<float>(4 * 2, 2.f), ngraph::element::f32, ngraph::Shape{ 2, 4 } },
{ 256ul, {{1}, {1}, {2, 1}, {2, 1}}, {-128.f}, {127.f}, {-128.f, -12.8f}, {127.f, 12.7f} },
{ {}, {}, {} },
"FullyConnected",
"MatMul",
"U8"
},
// 4D with Dq on weights

@ -61,7 +61,7 @@ std::vector<MatMulWithConstantTransformationTestValues> testValues = {
{ std::vector<float>(4 * 2, 2.f), ngraph::element::i8, ngraph::Shape{ 2, 4 } },
{},
{ ngraph::element::f32, {}, {{0.1f, 0.01}, ngraph::element::f32, ngraph::Shape{ 2, 1 }} },
"FullyConnected",
"MatMul",
"U8"
},
// 3D with the same values

@ -11,7 +11,8 @@ using namespace LayerTestsDefinitions;
namespace {

const std::vector<InferenceEngine::Precision> inputPrecisions = {
InferenceEngine::Precision::FP32
InferenceEngine::Precision::FP32,
InferenceEngine::Precision::I32,
};

const std::vector<ShapeRelatedParams> shapeRelatedParams = {

@ -2,9 +2,12 @@
// SPDX-License-Identifier: Apache-2.0
//

#include "shared_test_classes/single_layer/mat_mul.hpp"
#include "shared_test_classes/base/ov_subgraph.hpp"
#include "ie_precision.hpp"
#include "test_utils/fusing_test_utils.hpp"
#include "ngraph_functions/builders.hpp"
#include <string>

using namespace ngraph;
using namespace InferenceEngine;

@ -139,11 +142,10 @@ protected:
const auto& inShapeA = inputDynamicShapes[0];
const auto& inShapeB = inputDynamicShapes[1];

/* @todo
* Currently nodes are not fused thought Reshape
* Check can be deleted after this limitation is gone
*/
if (nodeType == MatMulNodeType::MatMul && inShapeA.size() < 4 && inShapeB.size() < 4)
// see comment in MKLDNNMatMulNode::canFuse
if (!(nodeType == MatMulNodeType::MatMul &&
std::get<0>(fusingParams) && std::get<0>(fusingParams)->getFusedOpsNames().find("(PerChannel)") != std::string::npos &&
std::max(inShapeA.size(), inShapeB.size()) > 2))
std::tie(postOpMgrPtr, fusedOps) = fusingParams;

configuration.insert(additionalConfig.begin(), additionalConfig.end());

@ -179,6 +181,8 @@ TEST_P(MatMulLayerCPUTest, CompareWithRefs) {
namespace {

/* ============= Common params ============= */
std::map<std::string, std::string> emptyAdditionalConfig;

std::vector<std::map<std::string, std::string>> additionalConfig {
std::map<std::string, std::string>{/* empty config */},
{{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::YES}}

@ -196,15 +200,16 @@ std::vector<CPUSpecificParams> filterSpecificParams() {
return specificParams;
}

const auto fusingBias = fusingSpecificParams{std::make_shared<postNodesMgr>(std::vector<postNodeBuilder>{
{[](std::shared_ptr<Node> inpNode, const element::Type& ngPrc, ParameterVector& params) {
size_t last_dim = inpNode->get_output_partial_shape(0).rbegin()->get_length();
auto bias = builder::makeConstant(ngPrc, Shape{last_dim}, std::vector<float>{}, true);
return std::make_shared<opset1::Add>(inpNode, bias);
}, "fusingBias"}}), {"Add"}};

/* ============= FullyConnected ============= */
namespace fullyConnected {

const auto fusingBiasFC = fusingSpecificParams{std::make_shared<postNodesMgr>(std::vector<postNodeBuilder>{
{[](std::shared_ptr<Node> inpNode, const element::Type& ngPrc, ParameterVector& params) {
auto bias = builder::makeConstant(ngPrc, Shape({inpNode->get_output_shape(0).back()}), std::vector<float>{}, true);
return std::make_shared<opset1::Add>(inpNode, bias);
}, "fusingBiasFC"}}), {"Add"}};

const std::vector<ShapeRelatedParams> IS2D = {
{static_shapes_to_test_representation({{59, 1}, {1, 120}}), {false, false}},
{static_shapes_to_test_representation({{59, 1}, {1, 120}}), {true, false}},
@ -229,26 +234,46 @@ const std::vector<ShapeRelatedParams> IS2D = {

std::vector<fusingSpecificParams> fusingParamsSet2D {
emptyFusingSpec,
fusingBiasFC,
fusingBias,
fusingRelu,
fusingMultiplyPerChannel,
fusingPReluPerTensor
fusingScaleShift, // EltwiseMulAdd fusing
fusingPReluPerTensor,
fusingFakeQuantizePerChannelRelu,
fusingFakeQuantizePerTensorRelu,
};

const auto fullyConnectedParams2D = ::testing::Combine(::testing::ValuesIn(IS2D),
::testing::ValuesIn(netPRCs),
::testing::Values(ElementType::undefined),
::testing::Values(ElementType::undefined),
::testing::Values(helpers::InputLayerType::CONSTANT),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::ValuesIn(additionalConfig));
std::vector<fusingSpecificParams> fusingParamsSet2DBF16 {
emptyFusingSpec,
fusingBias,
fusingRelu,
fusingPReluPerTensor,
};

const auto testParams2D = ::testing::Combine(fullyConnectedParams2D,
const auto testParams2D = ::testing::Combine(::testing::Combine(::testing::ValuesIn(IS2D),
::testing::Values(ElementType::f32),
::testing::Values(ElementType::undefined),
::testing::Values(ElementType::undefined),
::testing::Values(helpers::InputLayerType::CONSTANT),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(emptyAdditionalConfig)),
::testing::Values(MatMulNodeType::FullyConnected),
::testing::ValuesIn(fusingParamsSet2D),
::testing::ValuesIn(filterSpecificParams()));

const auto testParams2DBF16 = ::testing::Combine(::testing::Combine(::testing::ValuesIn(IS2D),
::testing::ValuesIn(netPRCs),
::testing::Values(ElementType::undefined),
::testing::Values(ElementType::undefined),
::testing::Values(helpers::InputLayerType::CONSTANT),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::ValuesIn(additionalConfig)),
::testing::Values(MatMulNodeType::FullyConnected),
::testing::ValuesIn(fusingParamsSet2DBF16),
::testing::ValuesIn(filterSpecificParams()));

INSTANTIATE_TEST_SUITE_P(smoke_FC_2D, MatMulLayerCPUTest, testParams2D, MatMulLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_BF16, MatMulLayerCPUTest, testParams2DBF16, MatMulLayerCPUTest::getTestCaseName);

const std::vector<ShapeRelatedParams> IS3D = {
{static_shapes_to_test_representation({{1, 32, 120}, {120, 5}}), {false, false}},

@ -266,23 +291,46 @@ const std::vector<ShapeRelatedParams> IS3D = {

std::vector<fusingSpecificParams> fusingParamsSet3D {
emptyFusingSpec,
fusingBiasFC
fusingBias,
fusingMultiplyPerChannel,
fusingFakeQuantizePerChannel,
fusingFakeQuantizePerTensorRelu,
};

std::vector<fusingSpecificParams> fusingParamsSet3DBF16 {
emptyFusingSpec,
fusingBias,
fusingMultiplyPerChannel,
};

const auto fullyConnectedParams3D = ::testing::Combine(::testing::ValuesIn(IS3D),
::testing::ValuesIn(netPRCs),
::testing::Values(ElementType::f32),
::testing::Values(ElementType::undefined),
::testing::Values(ElementType::undefined),
::testing::Values(helpers::InputLayerType::CONSTANT),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::ValuesIn(additionalConfig));
::testing::Values(emptyAdditionalConfig));

const auto fullyConnectedParams3DBF16 = ::testing::Combine(::testing::ValuesIn(IS3D),
::testing::ValuesIn(netPRCs),
::testing::Values(ElementType::undefined),
::testing::Values(ElementType::undefined),
::testing::Values(helpers::InputLayerType::CONSTANT),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::ValuesIn(additionalConfig));

const auto testParams3D = ::testing::Combine(fullyConnectedParams3D,
::testing::Values(MatMulNodeType::FullyConnected),
::testing::ValuesIn(fusingParamsSet3D),
::testing::ValuesIn(filterSpecificParams()));

const auto testParams3DBF16 = ::testing::Combine(fullyConnectedParams3DBF16,
::testing::Values(MatMulNodeType::FullyConnected),
::testing::ValuesIn(fusingParamsSet3DBF16),
::testing::ValuesIn(filterSpecificParams()));

INSTANTIATE_TEST_SUITE_P(smoke_FC_3D, MatMulLayerCPUTest, testParams3D, MatMulLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_FC_3D_BF16, MatMulLayerCPUTest, testParams3DBF16, MatMulLayerCPUTest::getTestCaseName);

std::vector<std::map<std::string, std::string>> filterAdditionalConfig_Brgemm() {
std::vector<std::map<std::string, std::string>> additionalConfig = {
@ -357,7 +405,9 @@ const std::vector<ShapeRelatedParams> IS = {
{static_shapes_to_test_representation({{55, 12}, {12, 55}}), {true, false}},
{static_shapes_to_test_representation({{55, 12}, {12, 55}}), {false, true}},
{static_shapes_to_test_representation({{55, 12}, {12, 55}}), {true, true}},
};

const std::vector<ShapeRelatedParams> IS_Dynamic = {
{
{ //dynamic case description each pair per each input has {{dynamic shape}, {{static shape case1}, {static shape case2}, ...}
{{-1, -1}, {{55, 12}, {33, 7}}}, // input 0

@ -507,7 +557,16 @@ const std::vector<ShapeRelatedParams> IS = {
std::vector<fusingSpecificParams> matmulFusingParams {
emptyFusingSpec,
fusingElu,
fusingSqrt
fusingSqrt,
fusingPReluPerTensor,
fusingMultiplyPerChannel,
fusingAddPerTensor,
fusingBias,
fusingFakeQuantizePerChannel,
/* @todo FQ unfolds into FQ + Convert + Substract + Multiply after LPT,
* so Relu cannot be fused in this case. Should be analysed */
// fusingFakeQuantizePerChannelRelu,
fusingFakeQuantizePerTensorRelu,
};

const auto matMulParams = ::testing::Combine(::testing::ValuesIn(IS),

@ -523,7 +582,70 @@ const auto testParams = ::testing::Combine(matMulParams,
::testing::ValuesIn(matmulFusingParams),
::testing::ValuesIn(filterSpecificParams()));

INSTANTIATE_TEST_SUITE_P(smoke_MM, MatMulLayerCPUTest, testParams, MatMulLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_MM_Static, MatMulLayerCPUTest, testParams, MatMulLayerCPUTest::getTestCaseName);

const auto matMulParamsDynamic = ::testing::Combine(::testing::ValuesIn(IS_Dynamic),
::testing::ValuesIn(netPRCs),
::testing::Values(ElementType::undefined),
::testing::Values(ElementType::undefined),
::testing::Values(helpers::InputLayerType::PARAMETER),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::ValuesIn(additionalConfig));

const auto testParamsDynamic = ::testing::Combine(matMulParamsDynamic,
::testing::Values(MatMulNodeType::MatMul),
::testing::Values(emptyFusingSpec),
::testing::ValuesIn(filterSpecificParams()));

INSTANTIATE_TEST_SUITE_P(smoke_MM_Dynamic, MatMulLayerCPUTest, testParamsDynamic, MatMulLayerCPUTest::getTestCaseName);

const std::vector<ShapeRelatedParams> IS_Dynamic_Fusing = {
{
{ //dynamic case description each pair per each input has {{dynamic shape}, {{static shape case1}, {static shape case2}, ...}
{{-1, -1}, {{16, 12}, {33, 7}}}, // input 0
{{-1, 33}, {{12, 33}, {7, 33}}} // input 1
},
{false, false}
},
{
{ //dynamic case description each pair per each input has {{dynamic shape}, {{static shape case1}, {static shape case2}, ...}
{{-1, -1, -1, -1}, {{1, 2, 32, 60}, {1, 2, 32, 30}}}, // input 0
{{-1, 5}, {{60, 5}, {30, 5}}} // input 1
},
{false, false}
},
{
{ //dynamic case description each pair per each input has {{dynamic shape}, {{static shape case1}, {static shape case2}, ...}
{{-1, -1, -1}, {{7, 32, 60}, {7, 32, 30}}}, // input 0
{{-1, -1, -1, 25}, {{3, 7, 60, 25}, {3, 7, 30, 25}}} // input 1
},
{false, false}
},
{
{ //dynamic case description each pair per each input has {{dynamic shape}, {{static shape case1}, {static shape case2}, ...}
{{-1, -1, -1}, {{10, 10, 10}, {5, 5, 5}}}, // input 0
{{-1, -1, 5}, {{10, 10, 5}, {5, 5, 5}}} // input 1
},
{false, false}
},
};

const auto matMulParamsDynamicFusing = ::testing::Combine(::testing::ValuesIn(IS_Dynamic_Fusing),
::testing::ValuesIn(netPRCs),
::testing::Values(ElementType::undefined),
::testing::Values(ElementType::undefined),
::testing::Values(helpers::InputLayerType::PARAMETER),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::ValuesIn(additionalConfig));

const auto testParamsDynamicFusing = ::testing::Combine(matMulParamsDynamicFusing,
::testing::Values(MatMulNodeType::MatMul),
::testing::ValuesIn(matmulFusingParams),
::testing::ValuesIn(filterSpecificParams()));

INSTANTIATE_TEST_SUITE_P(smoke_MM_Dynamic_Fusing, MatMulLayerCPUTest, testParamsDynamicFusing, MatMulLayerCPUTest::getTestCaseName);

} // namespace matmul
@ -1,101 +0,0 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "test_utils/fusing_test_utils.hpp"
#include "ngraph_functions/builders.hpp"

using namespace ngraph;
using namespace InferenceEngine;
using namespace CPUTestUtils;

namespace SubgraphTestsDefinitions {

using ReshapeFCTestParams = std::tuple<std::pair<SizeVector, SizeVector>, // IS fully connected
bool, // transpose B
fusingSpecificParams>;

class ReshapeFCTest : public testing::WithParamInterface<ReshapeFCTestParams>, public CpuTestWithFusing,
virtual public LayerTestsUtils::LayerTestsCommon {
public:
static std::string getTestCaseName(testing::TestParamInfo<ReshapeFCTestParams> obj) {
std::pair<SizeVector, SizeVector> isFc;
bool transpB;
fusingSpecificParams fusingParams;
std::tie(isFc, transpB, fusingParams) = obj.param;
SizeVector isA = isFc.first; SizeVector isB = isFc.second;

std::ostringstream result;
result << "IS_reshape=" << CommonTestUtils::vec2str(isA) << "_";
result << "IS_fc_B=" << CommonTestUtils::vec2str(isB) << "_";
result << "Transp_B=" << transpB;
result << CpuTestWithFusing::getTestCaseName(fusingParams);

return result.str();
}

protected:
void SetUp() override {
targetDevice = CommonTestUtils::DEVICE_CPU;
std::pair<SizeVector, SizeVector> isFc;
bool transpB;
fusingSpecificParams fusingParams;
std::tie(isFc, transpB, fusingParams) = this->GetParam();
std::tie(postOpMgrPtr, fusedOps) = fusingParams;
SizeVector isReshape = isFc.first; SizeVector isB = isFc.second;
SizeVector isA(2);
isA[0] = isReshape[0];
isA[1] = std::accumulate(isReshape.begin() + 1, isReshape.end(), size_t{1}, std::multiplies<size_t>());
if (transpB) {
std::swap(*(isB.end() - 1), *(isB.end() - 2));
}

auto inputParams = builder::makeParams(element::f32, {isReshape});
auto paramOuts = helpers::convert2OutputVector(helpers::castOps2Nodes<op::Parameter>(inputParams));

auto constNode = builder::makeConstant(element::i64, {isA.size()}, isA);
auto reshape = std::make_shared<opset1::Reshape>(paramOuts[0], constNode, true);

auto matrixB = builder::makeConstant<float>(element::f32, isB, {}, true);
auto matMul = builder::makeMatMul(reshape, matrixB, false, transpB);

const auto netType = element::f32;
selectedType = makeSelectedTypeStr("jit_gemm", netType);

function = makeNgraphFunction(netType, inputParams, matMul, "ReshapeFC");
}
};

TEST_P(ReshapeFCTest, CompareWithRefs) {
SKIP_IF_CURRENT_TEST_IS_DISABLED()

Run();
CheckNodeOfTypeCount(executableNetwork, "Reshape", 0);
CheckPluginRelatedResults(executableNetwork, "FullyConnected");
}

namespace {

const std::vector<bool> transpose = {
true, false
};

const std::vector<std::pair<SizeVector, SizeVector>> isFC = {
{{71, 128, 1, 1}, {128, 20}},
{{1, 24, 2, 7}, {336, 16}}
};

std::vector<fusingSpecificParams> fusingParamsSet {
emptyFusingSpec,
fusingAddPerChannel
};

const auto reshapeFCParams = ::testing::Combine(::testing::ValuesIn(isFC),
::testing::ValuesIn(transpose),
::testing::ValuesIn(fusingParamsSet));

INSTANTIATE_TEST_SUITE_P(smoke_Check, ReshapeFCTest, reshapeFCParams, ReshapeFCTest::getTestCaseName);

} // namespace

} // namespace SubgraphTestsDefinitions
@ -5,6 +5,7 @@
#pragma once

#include "cpu_test_utils.hpp"
#include <memory>
#include <shared_test_classes/single_layer/activation.hpp>

namespace CPUTestUtils {

@ -75,6 +76,24 @@ protected:
bool checkFusingPosition = true;
};

static size_t getFusingAxis(const std::shared_ptr<ngraph::Node>& node) {
if (std::dynamic_pointer_cast<const ngraph::opset1::MatMul>(node))
return node->get_output_partial_shape(0).size() - 1; // last dimension
else
return 1; // second dimension
}

static ngraph::Shape generatePerChannelShape(const std::shared_ptr<ngraph::Node>& node) {
const auto shape = node->get_output_partial_shape(0);
if (shape.size() == 1)
IE_THROW() << "If shape.size() == 1 then Granularity can be PerTensor only";
ngraph::Shape perChannelShape(shape.size(), 1);
const auto channelAxis = getFusingAxis(node);
perChannelShape[channelAxis] = shape[channelAxis].get_length();

return perChannelShape;
}
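The helper above centralizes the per-channel shape logic the fusing builders below now share; the fusing axis follows the same Convolution-vs-MatMul convention as the plugin. A usage sketch (hypothetical helper name, mirroring the builders that follow):

// For a MatMul node with output shape {2, 16, 64} this produces a {1, 1, 64} constant;
// for a convolution-like node with output shape {1, 32, 14, 14} it produces {1, 32, 1, 1}.
static std::shared_ptr<ngraph::Node> makePerChannelConstant(const std::shared_ptr<ngraph::Node>& inpNode,
                                                            const ngraph::element::Type& ngPrc) {
    const ngraph::Shape newShape = generatePerChannelShape(inpNode);
    return ngraph::builder::makeConstant(ngPrc, newShape, std::vector<float>{}, true);  // random per-channel data
}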

/* FUSING PATTERNS */
const auto emptyFusingSpec = fusingSpecificParams{nullptr, {}};

@ -120,11 +139,7 @@ const auto fusingSqrt = fusingSpecificParams{std::make_shared<postNodesMgr>(std:

const auto fusingPReluPerChannel = fusingSpecificParams{std::make_shared<postNodesMgr>(std::vector<postNodeBuilder>{
{[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){
auto shape = inpNode->get_output_partial_shape(0);
if (shape.size() == 1)
IE_THROW() << "If shape.size() == 1 then Granularity can be PerTensor only";
ngraph::Shape newShape(shape.size(), 1);
newShape[1] = shape[1].get_length();
ngraph::Shape newShape = generatePerChannelShape(inpNode);
auto data = NGraphFunctions::Utils::generateVector<ngraph::element::Type_t::f32>(ngraph::shape_size(newShape));
return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::LeakyRelu, newShape, data);
}, "PRelu(PerChannel)"}}), {"PRelu"}};

@ -166,11 +181,7 @@ const auto fusingReluAdd = fusingSpecificParams{std::make_shared<postNodesMgr>(s
return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::Relu);
}, "Relu"},
{[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){
auto shape = inpNode->get_output_partial_shape(0);
if (shape.size() == 1)
IE_THROW() << "If shape.size() == 1 then Granularity can be PerTensor only";
ngraph::Shape newShape(shape.size(), 1);
newShape[1] = shape[1].get_length();
ngraph::Shape newShape = generatePerChannelShape(inpNode);
auto constNode = ngraph::builder::makeConstant(ngPrc, newShape, std::vector<float>{}, true);
return std::make_shared<ngraph::opset1::Add>(inpNode, constNode);
}, "Add(PerChannel)"}}), {"Relu", "Add"}};

@ -180,40 +191,24 @@ const auto fusingReluScaleShift = fusingSpecificParams{std::make_shared<postNode
return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::Relu);
}, "Relu"},
{[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){
auto shape = inpNode->get_output_partial_shape(0);
if (shape.size() == 1)
IE_THROW() << "If shape.size() == 1 then Granularity can be PerTensor only";
ngraph::Shape newShape(shape.size(), 1);
newShape[1] = shape[1].get_length();
auto constNode = ngraph::builder::makeConstant(ngPrc, newShape, std::vector<float>{}, true);
return std::make_shared<ngraph::opset1::Multiply>(inpNode, constNode);
ngraph::Shape newShape = generatePerChannelShape(inpNode);
auto constNode = ngraph::builder::makeConstant(ngPrc, newShape, std::vector<float>{}, true);
return std::make_shared<ngraph::opset1::Multiply>(inpNode, constNode);
}, "Multiply(PerChannel)"},
{[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){
auto shape = inpNode->get_output_partial_shape(0);
if (shape.size() == 1)
IE_THROW() << "If shape.size() == 1 then Granularity can be PerTensor only";
ngraph::Shape newShape(shape.size(), 1);
newShape[1] = shape[1].get_length();
ngraph::Shape newShape = generatePerChannelShape(inpNode);
auto constNode = ngraph::builder::makeConstant(ngPrc, newShape, std::vector<float>{}, true);
return std::make_shared<ngraph::opset1::Add>(inpNode, constNode);
}, "Add(PerChannel)"}}), {"Relu", "Add"}};

const auto fusingScaleShift = fusingSpecificParams{ std::make_shared<postNodesMgr>(std::vector<postNodeBuilder>{
{[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params) {
auto shape = inpNode->get_output_partial_shape(0);
if (shape.size() == 1)
IE_THROW() << "If shape.size() == 1 then Granularity can be PerTensor only";
ngraph::Shape newShape(shape.size(), 1);
newShape[1] = shape[1].get_length();
auto constNode = ngraph::builder::makeConstant(ngPrc, newShape, std::vector<float>{}, true);
return std::make_shared<ngraph::opset1::Multiply>(inpNode, constNode);
ngraph::Shape newShape = generatePerChannelShape(inpNode);
auto constNode = ngraph::builder::makeConstant(ngPrc, newShape, std::vector<float>{}, true);
return std::make_shared<ngraph::opset1::Multiply>(inpNode, constNode);
}, "Multiply(PerChannel)"},
{[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params) {
auto shape = inpNode->get_output_partial_shape(0);
if (shape.size() == 1)
IE_THROW() << "If shape.size() == 1 then Granularity can be PerTensor only";
ngraph::Shape newShape(shape.size(), 1);
newShape[1] = shape[1].get_length();
ngraph::Shape newShape = generatePerChannelShape(inpNode);
auto constNode = ngraph::builder::makeConstant(ngPrc, newShape, std::vector<float>{}, true);
return std::make_shared<ngraph::opset1::Add>(inpNode, constNode);
}, "Add(PerChannel)"}}), {"Add"} };
@ -228,22 +223,14 @@ const auto fusingFakeQuantizePerTensor = fusingSpecificParams{ std::make_shared<
const auto fusingFakeQuantizePerChannel = fusingSpecificParams{std::make_shared<postNodesMgr>(std::vector<postNodeBuilder>{
{[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){
auto localPrc = inpNode->get_element_type();
auto shape = inpNode->get_output_partial_shape(0);
if (shape.size() == 1)
IE_THROW() << "If shape.size() == 1 then Granularity can be PerTensor only";
ngraph::Shape newShape(shape.size(), 1);
newShape[1] = shape[1].get_length();
ngraph::Shape newShape = generatePerChannelShape(inpNode);
return ngraph::builder::makeFakeQuantize(inpNode, localPrc, 256, newShape);
}, "FakeQuantize(PerChannel)"}}), {"FakeQuantize"}};

const auto fusingFakeQuantizePerChannelRelu = fusingSpecificParams{std::make_shared<postNodesMgr>(std::vector<postNodeBuilder>{
{[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){
auto localPrc = inpNode->get_element_type();
auto shape = inpNode->get_output_partial_shape(0);
if (shape.size() == 1)
IE_THROW() << "If shape.size() == 1 then Granularity can be PerTensor only";
ngraph::Shape newShape(shape.size(), 1);
newShape[1] = shape[1].get_length();
ngraph::Shape newShape = generatePerChannelShape(inpNode);
return ngraph::builder::makeFakeQuantize(inpNode, localPrc, 256, newShape);
}, "FakeQuantize(PerChannel)"},
{[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){

@ -291,60 +278,56 @@ const auto fusingSumEluFQ = fusingSpecificParams{std::make_shared<postNodesMgr>(
const auto fusingMultiplyPerTensor = fusingSpecificParams{std::make_shared<postNodesMgr>(std::vector<postNodeBuilder>{
{[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){
ngraph::Shape secondMultInShape(1, 1);
auto secondMultInput = ngraph::builder::makeConstant(ngPrc, ngraph::Shape(secondMultInShape), std::vector<float>{}, true);
auto secondMultInput = ngraph::builder::makeConstant(ngPrc, secondMultInShape, std::vector<float>{}, true);
return std::make_shared<ngraph::op::v1::Multiply>(inpNode, secondMultInput);
}, "Multiply(PerTensor)"}}), {"Multiply"}};

const auto fusingMultiplyPerChannel = fusingSpecificParams{std::make_shared<postNodesMgr>(std::vector<postNodeBuilder>{
{[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){
ngraph::Shape secondMultInShape(inpNode->get_output_partial_shape(0).size(), 1);
secondMultInShape[1] = inpNode->get_output_partial_shape(0)[1].get_length();
auto secondMultInput = ngraph::builder::makeConstant(ngPrc, ngraph::Shape(secondMultInShape), std::vector<float>{}, true);
ngraph::Shape secondMultInShape = generatePerChannelShape(inpNode);
auto secondMultInput = ngraph::builder::makeConstant(ngPrc, secondMultInShape, std::vector<float>{}, true);
return std::make_shared<ngraph::opset1::Multiply>(inpNode, secondMultInput);
}, "Multiply(PerChannel)"}}), {"Multiply"}};

const auto fusingAddPerTensor = fusingSpecificParams{std::make_shared<postNodesMgr>(std::vector<postNodeBuilder>{
{[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){
ngraph::Shape secondMultInShape(1, 1);
auto secondMultInput = ngraph::builder::makeConstant(ngPrc, ngraph::Shape(secondMultInShape), std::vector<float>{}, true);
auto secondMultInput = ngraph::builder::makeConstant(ngPrc, secondMultInShape, std::vector<float>{}, true);
return std::make_shared<ngraph::opset1::Add>(inpNode, secondMultInput);
}, "Add(PerTensor)"}}), {"Add"}};

const auto fusingAddPerChannel = fusingSpecificParams{std::make_shared<postNodesMgr>(std::vector<postNodeBuilder>{
{[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){
ngraph::Shape secondMultInShape(inpNode->get_output_partial_shape(0).size(), 1);
secondMultInShape[1] = inpNode->get_output_partial_shape(0)[1].get_length();
auto secondMultInput = ngraph::builder::makeConstant(ngPrc, ngraph::Shape(secondMultInShape), std::vector<float>{}, true);
ngraph::Shape secondMultInShape = generatePerChannelShape(inpNode);
auto secondMultInput = ngraph::builder::makeConstant(ngPrc, secondMultInShape, std::vector<float>{}, true);
return std::make_shared<ngraph::opset1::Add>(inpNode, secondMultInput);
}, "Add(PerChannel)"}}), {"Add"}};

const auto fusingSubtractPerTensor = fusingSpecificParams{std::make_shared<postNodesMgr>(std::vector<postNodeBuilder>{
{[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){
ngraph::Shape secondMultInShape(1, 1);
auto secondMultInput = ngraph::builder::makeConstant(ngPrc, ngraph::Shape(secondMultInShape), std::vector<float>{}, true);
auto secondMultInput = ngraph::builder::makeConstant(ngPrc, secondMultInShape, std::vector<float>{}, true);
return std::make_shared<ngraph::opset1::Subtract>(inpNode, secondMultInput);
}, "Subtract(PerTensor)"}}), {"Subtract"}};

const auto fusingSubtractPerChannel = fusingSpecificParams{std::make_shared<postNodesMgr>(std::vector<postNodeBuilder>{
{[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){
ngraph::Shape secondMultInShape(inpNode->get_output_partial_shape(0).size(), 1);
secondMultInShape[1] = inpNode->get_output_partial_shape(0)[1].get_length();
auto secondMultInput = ngraph::builder::makeConstant(ngPrc, ngraph::Shape(secondMultInShape), std::vector<float>{}, true);
ngraph::Shape secondMultInShape = generatePerChannelShape(inpNode);
auto secondMultInput = ngraph::builder::makeConstant(ngPrc, secondMultInShape, std::vector<float>{}, true);
return std::make_shared<ngraph::opset1::Subtract>(inpNode, secondMultInput);
}, "Subtract(PerChannel)"}}), {"Subtract"}};

const auto fusingDividePerTensor = fusingSpecificParams{std::make_shared<postNodesMgr>(std::vector<postNodeBuilder>{
{[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){
ngraph::Shape secondMultInShape(1, 1);
auto secondMultInput = ngraph::builder::makeConstant(ngPrc, ngraph::Shape(secondMultInShape), std::vector<float>{}, true);
auto secondMultInput = ngraph::builder::makeConstant(ngPrc, secondMultInShape, std::vector<float>{}, true);
return std::make_shared<ngraph::opset1::Divide>(inpNode, secondMultInput);
}, "Divide(PerTensor)"}}), {"Divide"}};

const auto fusingDividePerChannel = fusingSpecificParams{std::make_shared<postNodesMgr>(std::vector<postNodeBuilder>{
{[](std::shared_ptr<ngraph::Node> inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){
ngraph::Shape secondMultInShape(inpNode->get_output_partial_shape(0).size(), 1);
secondMultInShape[1] = inpNode->get_output_partial_shape(0)[1].get_length();
auto secondMultInput = ngraph::builder::makeConstant(ngPrc, ngraph::Shape(secondMultInShape), std::vector<float>{}, true);
ngraph::Shape secondMultInShape = generatePerChannelShape(inpNode);
auto secondMultInput = ngraph::builder::makeConstant(ngPrc, secondMultInShape, std::vector<float>{}, true);
return std::make_shared<ngraph::opset1::Divide>(inpNode, secondMultInput);
}, "Divide(PerChannel)"}}), {"Divide"}};
@ -44,7 +44,7 @@ std::string MatMulTest::getTestCaseName(const testing::TestParamInfo<MatMulLayer
result << "trgDev=" << targetDevice;
result << "config=(";
for (const auto configEntry : additionalConfig) {
result << configEntry.first << ", " << configEntry.second << ":";
result << configEntry.first << ", " << configEntry.second << ";";
}
result << ")";
return result.str();

@ -13,7 +13,6 @@
#include <ngraph_transformations/op/fully_connected.hpp>
#include <ngraph_transformations/convert_matmul_to_fc.hpp>
#include <ngraph_transformations/fc_bias_fusion.hpp>
#include <ngraph_transformations/reshape_fully_connected.hpp>
#include <transformations/init_node_info.hpp>
#include <transformations/utils/utils.hpp>
#include <ngraph/pass/manager.hpp>

@ -171,7 +170,6 @@ TEST(TransformationTests, ConvertMatMulToFCTest7) {
ngraph::pass::Manager m;
m.register_pass<ngraph::pass::InitNodeInfo>();
m.register_pass<ConvertMatMulToFC>();
m.register_pass<ReshapeFullyConnected>();
m.run_passes(f);
ASSERT_NO_THROW(check_rt_info(f));
}

@ -179,12 +177,9 @@ TEST(TransformationTests, ConvertMatMulToFCTest7) {
{
auto input1 = std::make_shared<ngraph::opset1::Parameter>(ngraph::element::f32, ngraph::Shape{3, 2, 2});
auto input2 = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{3, 2}, {1});
auto reshape_begin = std::make_shared<ngraph::opset1::Reshape>(
input1, ngraph::opset1::Constant::create(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{-1, 2}), false);
auto fc = std::make_shared<FullyConnectedNode>(reshape_begin, input2, ngraph::Rank(2));
auto reshape_end = ngraph::op::util::reshapeTo(fc, ngraph::Shape{3, 2, 3});
auto fc = std::make_shared<FullyConnectedNode>(input1, input2, ngraph::Rank(2));

f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{reshape_end}, ngraph::ParameterVector{input1});
f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{fc}, ngraph::ParameterVector{input1});
}

auto res = compare_functions(f, f_ref, true);

@ -202,7 +197,6 @@ TEST(TransformationTests, ConvertMatMulToFCTest8) {
ngraph::pass::Manager m;
m.register_pass<ngraph::pass::InitNodeInfo>();
m.register_pass<ConvertMatMulToFC>();
m.register_pass<ReshapeFullyConnected>();
m.run_passes(f);
ASSERT_NO_THROW(check_rt_info(f));
}

@ -211,18 +205,14 @@ TEST(TransformationTests, ConvertMatMulToFCTest8) {
auto input1 = std::make_shared<ngraph::opset1::Parameter>(ngraph::element::f32, ngraph::PartialShape{-1, -1, 2});
auto input2 = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{3, 2}, {1});

auto reshape_begin = std::make_shared<ngraph::opset1::Reshape>(
input1, ngraph::opset1::Constant::create(ngraph::element::i64, {2}, {-1, 2}), false);

auto fc = std::make_shared<FullyConnectedNode>(reshape_begin, input2, ngraph::Rank(2));
auto fc = std::make_shared<FullyConnectedNode>(input1, input2, ngraph::Rank(2));
auto a_shape = std::make_shared<ngraph::opset3::ShapeOf>(input1);

auto I = ngraph::op::util::node_to_get_shape_value_of_indices_from_shape_node(a_shape, {0, 1});
auto O = ngraph::opset1::Constant::create(ngraph::element::i64, { 1 }, { 3 });
auto output_shape = std::make_shared<ngraph::opset1::Concat>(ngraph::OutputVector{I, O}, 0);
auto reshape_end = std::make_shared<ngraph::opset1::Reshape>(fc, output_shape, false);

f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{reshape_end}, ngraph::ParameterVector{input1});
f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{fc}, ngraph::ParameterVector{input1});
}

auto res = compare_functions(f, f_ref, true);

@ -268,7 +258,6 @@ TEST(TransformationTests, ConvertMatMulToFCTest10) {
ngraph::pass::Manager m;
m.register_pass<ngraph::pass::InitNodeInfo>();
m.register_pass<ConvertMatMulToFC>();
m.register_pass<ReshapeFullyConnected>();
ASSERT_NO_THROW(m.run_passes(f));
}

@ -439,25 +428,22 @@ TEST(TransformationTests, ConvertMatMulToFCTest_second_input_rank_adj_1) {
std::shared_ptr<ngraph::Function> f(nullptr), f_ref(nullptr);
{
auto input1 = std::make_shared<ngraph::opset1::Parameter>(ngraph::element::f32, ngraph::Shape{5, 2, 3});
auto input2 = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{1, 1, 2, 3}, {1});
auto input2 = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{1, 2, 3}, {1});
auto matmul = std::make_shared<ngraph::opset1::MatMul>(input1, input2, false, true);

f = std::make_shared<ngraph::Function>(ngraph::NodeVector{matmul}, ngraph::ParameterVector{input1});
ngraph::pass::Manager m;
m.register_pass<ngraph::pass::InitNodeInfo>();
m.register_pass<ConvertMatMulToFC>();
m.register_pass<ReshapeFullyConnected>();
m.run_passes(f);
ASSERT_NO_THROW(check_rt_info(f));
}

{
auto input1 = std::make_shared<ngraph::opset1::Parameter>(ngraph::element::f32, ngraph::Shape{5, 2, 3});
auto reshape_1 = std::make_shared<ngraph::opset1::Reshape>(input1, ngraph::opset1::Constant::create(ngraph::element::i64, {2}, {-1, 3}), false);
auto input2 = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{2, 3}, {1});
auto matmul = std::make_shared<FullyConnectedNode>(reshape_1, input2, ngraph::Rank(2));
auto reshape_out = std::make_shared<ngraph::opset1::Reshape>(matmul, ngraph::opset1::Constant::create(ngraph::element::i64, {4}, {1, 5, 2, 2}), false);
f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{reshape_out}, ngraph::ParameterVector{input1});
auto matmul = std::make_shared<FullyConnectedNode>(input1, input2, ngraph::Rank(2));
f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{matmul}, ngraph::ParameterVector{input1});
}

auto res = compare_functions(f, f_ref, true);

@ -475,7 +461,6 @@ TEST(TransformationTests, ConvertMatMulToFCTest_second_input_rank_adj_2) {
ngraph::pass::Manager m;
m.register_pass<ngraph::pass::InitNodeInfo>();
m.register_pass<ConvertMatMulToFC>();
m.register_pass<ReshapeFullyConnected>();
m.run_passes(f);
ASSERT_NO_THROW(check_rt_info(f));
}

@ -495,9 +480,9 @@ TEST(TransformationTests, ConvertMatMulToFCTest_second_input_rank_adj_3) {
std::shared_ptr<ngraph::Function> f(nullptr), f_ref(nullptr);
{
auto input1 = std::make_shared<ngraph::opset1::Parameter>(ngraph::element::f32, ngraph::Shape{ 5, 2, 3 });
auto weights = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{ 1, 1, 2, 3 }, { 1 });
auto weights = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{ 1, 2, 3 }, { 1 });
auto matmul = std::make_shared<ngraph::opset1::MatMul>(input1, weights, false, true);
auto biases = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{ 1, 1, 1, 2 }, { 1 });
auto biases = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{ 1, 1, 2 }, { 1 });
auto add = std::make_shared<ngraph::opset1::Add>(matmul, biases);

f = std::make_shared<ngraph::Function>(ngraph::NodeVector{ add }, ngraph::ParameterVector{ input1 });

@ -505,7 +490,6 @@ TEST(TransformationTests, ConvertMatMulToFCTest_second_input_rank_adj_3) {
m.register_pass<ngraph::pass::InitNodeInfo>();
m.register_pass<ConvertMatMulToFC>();
m.register_pass<FullyConnectedBiasFusion>();
m.register_pass<ReshapeFullyConnected>();
m.run_passes(f);
ASSERT_NO_THROW(check_rt_info(f));
}

@ -513,53 +497,13
|
||||
{
|
||||
auto input1 = std::make_shared<ngraph::opset1::Parameter>(ngraph::element::f32, ngraph::Shape{ 5, 2, 3 });
|
||||
auto reshape_before_const = ngraph::opset1::Constant::create(ngraph::element::i64, { 2 }, { -1, 3 });
|
||||
auto reshape_1 = std::make_shared<ngraph::opset1::Reshape>(input1, reshape_before_const, false);
|
||||
|
||||
auto weights = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{ 2, 3 }, { 1 });
|
||||
auto biases = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{ 2 }, { 1 });
|
||||
auto matmul = std::make_shared<FullyConnectedNode>(reshape_1, weights, biases, ngraph::Rank(2));
|
||||
auto matmul = std::make_shared<FullyConnectedNode>(input1, weights, biases, ngraph::Rank(2));
|
||||
|
||||
auto reshape_after_const = ngraph::opset1::Constant::create(ngraph::element::i64, { 4 }, { 1, 5, 2, 2 });
|
||||
auto reshape_out = std::make_shared<ngraph::opset1::Reshape>(matmul, reshape_after_const, false);
|
||||
f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{ reshape_out }, ngraph::ParameterVector{ input1 });
|
||||
}
|
||||
|
||||
auto res = compare_functions(f, f_ref, true);
|
||||
ASSERT_TRUE(res.first) << res.second;
|
||||
}
|
||||
|
||||
TEST(TransformationTests, ConvertMatMulToFCTest_second_input_rank_adj_dynamic) {
|
||||
std::shared_ptr<ngraph::Function> f(nullptr), f_ref(nullptr);
|
||||
{
|
||||
auto input1 = std::make_shared<ngraph::opset1::Parameter>(ngraph::element::f32, ngraph::PartialShape{-1, 2, 3});
|
||||
auto input2 = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{1, 1, 2, 3}, {1});
|
||||
auto matmul = std::make_shared<ngraph::opset1::MatMul>(input1, input2, false, true);
|
||||
|
||||
f = std::make_shared<ngraph::Function>(ngraph::NodeVector{matmul}, ngraph::ParameterVector{input1});
|
||||
ngraph::pass::Manager m;
|
||||
m.register_pass<ngraph::pass::InitNodeInfo>();
|
||||
m.register_pass<ConvertMatMulToFC>();
|
||||
m.register_pass<ReshapeFullyConnected>();
|
||||
m.run_passes(f);
|
||||
ASSERT_NO_THROW(check_rt_info(f));
|
||||
}
|
||||
|
||||
{
|
||||
auto input1 = std::make_shared<ngraph::opset1::Parameter>(ngraph::element::f32, ngraph::PartialShape{-1, 2, 3});
|
||||
auto reshape_1 = std::make_shared<ngraph::opset1::Reshape>(input1, ngraph::opset1::Constant::create(ngraph::element::i64, {2}, {-1, 3}), false);
|
||||
auto input2 = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{2, 3}, {1});
|
||||
auto matmul = std::make_shared<FullyConnectedNode>(reshape_1, input2, ngraph::Rank(2));
|
||||
|
||||
auto shape_of = std::make_shared<ngraph::opset7::ShapeOf>(input1);
|
||||
auto gather = std::make_shared<ngraph::opset7::Gather>(
|
||||
shape_of, ngraph::opset1::Constant::create(ngraph::element::i64, {2}, {0, 1}), ngraph::opset1::Constant::create(ngraph::element::i64, {}, {0}));
|
||||
auto concat = std::make_shared<ngraph::opset1::Concat>(ngraph::OutputVector{
|
||||
ngraph::opset1::Constant::create(ngraph::element::i64, {1}, {1}),
|
||||
gather,
|
||||
ngraph::opset1::Constant::create(ngraph::element::i64, {1}, {2}),
|
||||
}, 0);
|
||||
auto reshape_out = std::make_shared<ngraph::opset1::Reshape>(matmul, concat, false);
|
||||
f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{reshape_out}, ngraph::ParameterVector{input1});
|
||||
f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{ matmul }, ngraph::ParameterVector{ input1 });
|
||||
}
|
||||
|
||||
auto res = compare_functions(f, f_ref, true);
|
||||