[Snippets] Added support of INT8 models (#12395)

This commit is contained in:
Alexandra Sidorova 2022-10-05 13:05:15 +04:00 committed by GitHub
parent f7e05ad402
commit f6d6f5629f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
59 changed files with 2073 additions and 577 deletions

View File

@ -24,6 +24,7 @@ namespace op {
class ConvertSaturation : public ov::op::v0::Convert {
public:
OPENVINO_OP("ConvertSaturation", "SnippetsOpset", ov::op::v0::Convert);
BWDCMP_RTTI_DECLARATION;
ConvertSaturation(const Output<Node>& x, const ov::element::Type& destination_type);
ConvertSaturation() = default;

View File

@ -23,6 +23,7 @@ namespace op {
class ConvertTruncation : public ov::op::v0::Convert {
public:
OPENVINO_OP("ConvertTruncation", "SnippetsOpset", ov::op::v0::Convert);
BWDCMP_RTTI_DECLARATION;
ConvertTruncation(const Output<Node>& x, const ov::element::Type& destination_type);
ConvertTruncation() = default;

View File

@ -88,6 +88,17 @@ public:
return m_generator;
}
size_t get_non_scalar_constants_count() const {
return m_non_scalar_constants_count;
}
bool is_quantized() const {
return config.m_is_quantized;
}
bool has_type_relaxed_ops() const {
return config.m_has_type_relaxed_ops;
}
snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, ngraph::pass::Manager& opt,
const void* compile_params = nullptr);
@ -99,6 +110,7 @@ public:
// plugin sets generator for a snippet to some specific generator.
// it's going to be replaced with Jitters table later
void set_generator(std::shared_ptr<ngraph::snippets::Generator> generator);
void set_non_scalar_constants_count(const size_t count);
void print() const;
void print_statistics(bool verbose);
@ -111,9 +123,29 @@ public:
private:
void align_element_types(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes);
void convert_to_snippet_dialect();
Shape exec_domain;
std::shared_ptr<ov::Model> m_body;
std::shared_ptr<ngraph::snippets::Generator> m_generator;
// Count of potential non-scalar Constants that will be created after some transformations
// At the moment it's relevant only for FakeQuantize decomposition
// NOTE: To avoid overheads in each calculation of this count (for example, in validate_and_type_infer()),
// we should MANUALLY calculate it where it is needed.
size_t m_non_scalar_constants_count = 0;
Shape exec_domain = {};
std::shared_ptr<ov::Model> m_body = nullptr;
std::shared_ptr<ngraph::snippets::Generator> m_generator = nullptr;
// TODO: Change logic of insert Converts. This exec element type can be different for plugins
const ov::element::Type execution_element_type = ov::element::f32;
// Config to know which transformations should be called.
// It helps to avoid overheads of extra transformation calls
struct {
// True if Subgraph contains FakeQuantize -> FQ decomposition should be called
bool m_is_quantized = false;
// True if we should align element types inside body
bool m_is_needed_to_align_precision = false;
// True if Subgraph contains TypeRelaxed nodes -> for several streams in tp mode we should copy body using mutexes
// because TypeRelaxed::copy_with_new_inputs() isn't a thread-safe method
bool m_has_type_relaxed_ops = false;
} config;
};
static inline std::ostream& operator<<(std::ostream& os, const op::Subgraph::BlockedShape& blocked_shape) {
@ -121,10 +153,6 @@ static inline std::ostream& operator<<(std::ostream& os, const op::Subgraph::Blo
return os;
}
static inline auto is_scalar_constant(const std::shared_ptr<ngraph::Node>& source_output_node) -> bool {
return ngraph::is_type<ngraph::opset1::Constant>(source_output_node) && ngraph::shape_size(source_output_node->get_shape()) == 1;
};
static inline auto create_body(std::string name, const ngraph::ResultVector& results, const ngraph::ParameterVector& parameters) ->
std::shared_ptr<ov::Model> {
auto body = std::make_shared<ov::Model>(results, parameters, name);

View File

@ -0,0 +1,46 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <ngraph/pass/graph_rewrite.hpp>
#include <ngraph/pattern/matcher.hpp>

namespace ngraph {
namespace snippets {
namespace pass {

/**
 * @interface AlignElementType
 * @brief Wrap sequences of operations which don't support execution on the original element type with ConvertSaturation
 *        and reset the element type of type relaxed nodes inside the body to align element types between nodes.
 *        Example 1:
 *          - After FQ decomposition there may be Convert[U8/I8]. If after the Convert there are other operations
 *            that don't support U8/I8, a new ConvertSaturation[exec_type] will be inserted after the FQ decomposition
 *            to execute these operations on a supported element type
 *        Example 2:
 *          - Input[I8] -> Unsupported I8 op -> Movement op -> Output[I8]. Two ConvertSaturation ops will be inserted:
 *              * ConvertSaturation[exec_type] before the op which doesn't support I8
 *              * ConvertSaturation[I8] before the Movement op to return the original low precision.
 *        Note: We cannot just remove the original Convert[I8/U8] in Example 1 because we should cover two things:
 *              * allow execution of operations on the element type they support
 *              * keep computations mathematically equivalent to the original function
 *        Thus, for these cases we should have the following pipeline: FP32 -> Convert[I8/U8] -> Convert[FP32] -> FP32
 *        Note: We shouldn't call validate_and_infer_type() after Convert insertions to avoid element type conflicts on inputs of ops
 * @ingroup snippets
 */
class AlignElementType: public ngraph::pass::FunctionPass {
public:
    OPENVINO_RTTI("AlignElementType", "0");
    AlignElementType(const ov::element::Type exec_type = ov::element::f32);
    bool run_on_model(const std::shared_ptr<ngraph::Function>& m) override;

    // Returns true if `n` is an executable op that must run on `exec_type` but currently has a different element type
    static bool opNeedsAlignElementType(const std::shared_ptr<ov::Node>& n, const ov::element::Type exec_type = ov::element::f32);
private:
    // Element type on which the body is executed (f32 by default)
    ov::element::Type exec_type;
};

}  // namespace pass
}  // namespace snippets
}  // namespace ngraph

View File

@ -0,0 +1,22 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <ngraph/pass/graph_rewrite.hpp>
#include <ngraph/pattern/matcher.hpp>

namespace ngraph {
namespace snippets {
namespace pass {

/**
 * @interface CommonOptimizations
 * @brief Matcher pass on Subgraph ops that runs common body transformations:
 *        converts original Converts to ConvertTruncation and, for quantized subgraphs,
 *        applies FakeQuantize decomposition and moves non-scalar Constants out of the body.
 * @ingroup snippets
 */
class CommonOptimizations : public ngraph::pass::MatcherPass {
public:
    NGRAPH_RTTI_DECLARATION;
    CommonOptimizations();
};

}  // namespace pass
}  // namespace snippets
}  // namespace ngraph

View File

@ -14,7 +14,7 @@ namespace pass {
/**
* @interface ConvertConstantsToScalars
* @brief Replace only constants which are should be represented as scalars during code generation.
* Only single-value (0D) constants are currently supported.
* Only single-value (0D) constants are currently supported.
* @ingroup snippets
*/
class ConvertConstantsToScalars: public ngraph::pass::MatcherPass {
@ -24,4 +24,4 @@ public:
} // namespace pass
} // namespace snippets
} // namespace ngraph
} // namespace ngraph

View File

@ -0,0 +1,91 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "ngraph/op/fake_quantize.hpp"
#include "ngraph/pass/graph_rewrite.hpp"
#include "ngraph/pass/constant_folding.hpp"
#include "snippets/pass/transform_convert.hpp"
#include "transformations_visibility.hpp"

namespace ngraph {
namespace snippets {
namespace pass {

/**
 * @interface FakeQuantizeDecomposition
 * @ingroup snippets
 * @brief FakeQuantizeDecomposition transformation decomposes a FakeQuantize layer.
 *
 * Expression from the specification:
 * if x <= min(il, ih):
 *   output = ol
 * elif x > max(il, ih):
 *   output = oh
 * else:
 *   output = round((x - il) / (ih - il) * (levels-1)) / (levels-1) * (oh - ol) + ol
 *
 * Expand brackets:
 * round(x * (levels-1) / (ih - il) - il * (levels-1) / (ih - il)) * (oh - ol) / (levels-1) + ol
 *
 * Marking:
 *   - isc := (levels-1) / (ih - il)
 *   - ish := -il * isc
 *   - osc := (oh - ol) / (levels-1)
 *   - osh := ol
 * Final expression:
 *   round(x * isc + ish) * osc + osh
 *
 * Some optimizations (example for scalars):
 * 1. If the output element type of FQ is U8 and il = 0, ish = 0, osc = 1, osh = 0, the expression x * isc is enough
 * 2. If the output element type of FQ is I8 and ish ~= 128, osc = 1, osh ~= -128, il * isc ~= -128, ih * isc ~= 127, the expression x * isc is enough
 * 3. If osc = 1, osh = 0, there is no dequantization
 * 4. If there is no dequantization and the output element type of FQ isn't FP32, there is no rounding
 *
 * This transformation doesn't support the following cases:
 * 1. At least one 'range' input is not Constant
 * 2. At least one 'il' input value is greater than or equal to the 'ih' input value
 *
 */
class FakeQuantizeDecomposition : public ngraph::pass::MatcherPass {
public:
    FakeQuantizeDecomposition();

    // True if all four range inputs (il, ih, ol, oh) of the FQ node are scalar Constants
    static bool isAllScalarConstant(const std::shared_ptr<const ngraph::Node>& node);
    // Extracts clamp bounds (cl, ch) and input/output scales and shifts (isc, ish, osc, osh) from the FQ Constants;
    // returns false if they cannot be computed (e.g. non-Constant ranges)
    static bool getScalesAndShifts(const std::shared_ptr<const ngraph::op::v0::FakeQuantize>& fq_node,
                                   std::vector<float>& cl,
                                   std::vector<float>& ch,
                                   std::vector<float>& isc,
                                   std::vector<float>& ish,
                                   std::vector<float>& osc,
                                   std::vector<float>& osh);
    // Computes the final scale values for the optimized expression (see "Some optimizations" above)
    // for the given FQ output element type
    static std::vector<float> calculateScales(const ngraph::element::Type& out_type,
                                              const std::vector<float>& cl,
                                              const std::vector<float>& ch,
                                              const std::vector<float>& isc,
                                              const std::vector<float>& ish,
                                              const std::vector<float>& osc,
                                              const std::vector<float>& osh);
};

/**
 * @interface CommonFakeQuantizeDecomposition
 * @ingroup snippets
 * @brief CommonFakeQuantizeDecomposition pass applies all transformations needed for
 * a correct FQ Decomposition:
 * 0. Disable the Validate() pass after each transformation
 * 1. FakeQuantize decomposition
 * 2. ConstantFolding
 * 3. Validate
 */
class CommonFakeQuantizeDecomposition: public ngraph::pass::FunctionPass {
public:
    bool run_on_model(const std::shared_ptr<ngraph::Function>& m) override;
};

}  // namespace pass
}  // namespace snippets
}  // namespace ngraph

View File

@ -1,31 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/pass/graph_rewrite.hpp>
#include <ngraph/pattern/matcher.hpp>
namespace ngraph {
namespace snippets {
namespace pass {
/**
* @interface InsertConvertOnInputs
* @brief Inserts ConvertSaturation op after Parameters and Scalars to convert data type of inputs
* to supported execution data type.
* Note: ConvertSaturation op isn't covered by specification of "Convert" op
* This op is used for conversion into and from FP32 after the correspoding Load
* and before Store to calculate in FP32 inside subgraph body in CPU Plugin
* @ingroup snippets
*/
class InsertConvertOnInputs: public ngraph::pass::MatcherPass {
public:
InsertConvertOnInputs(const ov::element::Type exec_type = ov::element::f32);
};
} // namespace pass
} // namespace snippets
} // namespace ngraph

View File

@ -13,7 +13,7 @@ namespace pass {
/**
* @interface InsertMoveBroadcast
* @brief Inserts explicit MoveBroadcast instruction if broadcasting by most warying dimension is needed.
* @brief Inserts explicit MoveBroadcast instruction if broadcasting by most varying dimension is needed.
* The pass is used to convert model to a canonical form for code generation
* @ingroup snippets
*/

View File

@ -1,31 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/pass/graph_rewrite.hpp>
#include <ngraph/pattern/matcher.hpp>
namespace ngraph {
namespace snippets {
namespace pass {
/**
* @interface ResetTypeRelaxedNodePrecision
* @brief Reset precision for type relaxed nodes inside body to align precision between nodes.
* Should be called after all Convert insertions
* @ingroup snippets
*/
class ResetTypeRelaxedNodePrecision: public ngraph::pass::FunctionPass {
public:
OPENVINO_RTTI("ResetTypeRelaxedNodePrecision", "0");
ResetTypeRelaxedNodePrecision(const ov::element::Type exec_type = ov::element::f32);
bool run_on_model(const std::shared_ptr<ngraph::Function>& m) override;
private:
ov::element::Type exec_type;
};
} // namespace pass
} // namespace snippets
} // namespace ngraph

View File

@ -0,0 +1,28 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

/**
 * @brief A file contains public utilities.
 * @file utils.hpp
 */
#pragma once

#include "snippets_isa.hpp"
#include "emitter.hpp"

namespace ngraph {
namespace snippets {
namespace utils {

// Get the non-scalar Constant count that will be created after FakeQuantize decomposition.
// This count is needed to know the exact count of non-scalar Constants during tokenization
// (plugins may limit the total number of subgraph inputs/outputs).
auto get_non_scalar_constant_count_for_fq(const std::shared_ptr<ngraph::opset1::FakeQuantize>& fq) -> size_t;

// True if the node is a Constant holding exactly one element (scalar or any 1-element shape)
inline auto is_scalar_constant(const std::shared_ptr<ngraph::Node>& source_output_node) -> bool {
    return ngraph::is_type<ngraph::opset1::Constant>(source_output_node) && ngraph::shape_size(source_output_node->get_shape()) == 1;
}

}  // namespace utils
}  // namespace snippets
}  // namespace ngraph

View File

@ -8,6 +8,9 @@
#include "ngraph/runtime/host_tensor.hpp"
BWDCMP_RTTI_DEFINITION(ngraph::snippets::op::ConvertSaturation);
ngraph::snippets::op::ConvertSaturation::ConvertSaturation(const Output<Node>& x, const ov::element::Type& destination_type)
: ov::op::v0::Convert({x}, destination_type) {
}

View File

@ -8,6 +8,9 @@
#include "ngraph/runtime/host_tensor.hpp"
BWDCMP_RTTI_DEFINITION(ngraph::snippets::op::ConvertTruncation);
ngraph::snippets::op::ConvertTruncation::ConvertTruncation(const Output<Node>& x, const ov::element::Type& destination_type)
: ov::op::v0::Convert({x}, destination_type) {
}

View File

@ -11,18 +11,19 @@
#include "snippets/pass/insert_movebroadcast.hpp"
#include "snippets/pass/load_movebroadcast_to_broadcastload.hpp"
#include "snippets/pass/assign_registers.hpp"
#include "snippets/pass/convert_constants_to_scalars.hpp"
#include "snippets/pass/convert_constants.hpp"
#include "snippets/pass/convert_power_to_powerstatic.hpp"
#include "snippets/pass/vector_to_scalar.hpp"
#include "snippets/pass/transform_convert_to_truncation.hpp"
#include "snippets/pass/insert_convert_on_inputs.hpp"
#include "snippets/pass/reset_type_relaxed_node_precision.hpp"
#include "snippets/pass/transform_convert.hpp"
#include "snippets/pass/align_element_type.hpp"
#include "snippets/utils.hpp"
#include "transformations/common_optimizations/nop_elimination.hpp"
#include "transformations/utils/utils.hpp"
#include <ngraph/pass/manager.hpp>
#include "ngraph/pass/constant_folding.hpp"
#include "ngraph_ops/type_relaxed.hpp"
#include <openvino/pass/serialize.hpp>
#include <algorithm>
@ -36,8 +37,20 @@ void snippets::op::Subgraph::set_generator(std::shared_ptr<ngraph::snippets::Gen
m_generator = generator;
}
void snippets::op::Subgraph::set_non_scalar_constants_count(const size_t count) {
m_non_scalar_constants_count = count;
}
snippets::op::Subgraph::Subgraph(const OutputVector& args, std::shared_ptr<ov::Model> body)
: Op(args), m_body(body), m_generator(nullptr) {
const auto ops = m_body->get_ops();
for (const auto& op : ops) {
config.m_is_quantized = config.m_is_quantized || ov::is_type<ov::op::v0::FakeQuantize>(op);
config.m_has_type_relaxed_ops = config.m_has_type_relaxed_ops || std::dynamic_pointer_cast<ngraph::op::TypeRelaxedBase>(op);
config.m_is_needed_to_align_precision = config.m_is_needed_to_align_precision || is_quantized() || has_type_relaxed_ops() ||
snippets::pass::AlignElementType::opNeedsAlignElementType(op, execution_element_type);
}
constructor_validate_and_infer_types();
}
@ -86,7 +99,8 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptr<ov::Nod
ngraph::OutputVector subgraph_inputs;
for (const auto& input : node->input_values()) {
if (is_scalar_constant(input.get_node_shared_ptr())) {
if ((utils::is_scalar_constant(input.get_node_shared_ptr())) ||
(ov::is_type<ov::op::v0::FakeQuantize>(node) && ov::is_type<ov::op::v0::Constant>(input.get_node_shared_ptr()))) {
body_inputs.push_back(input);
} else {
auto parameter = std::make_shared<ngraph::opset1::Parameter>(input.get_element_type(), input.get_partial_shape());
@ -119,6 +133,10 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptr<ov::Nod
auto body = create_body(node->get_friendly_name(), body_results, body_parameters);
auto subgraph = build_subgraph(node, subgraph_inputs, body);
if (auto fq_node = ov::as_type_ptr<ov::op::v0::FakeQuantize>(node)) {
subgraph->set_non_scalar_constants_count(utils::get_non_scalar_constant_count_for_fq(fq_node));
}
for (size_t i = 0; i < body->get_parameters().size(); i++) {
body->get_parameters()[i]->set_friendly_name(body_parameters[i]->get_friendly_name());
}
@ -251,25 +269,18 @@ Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShape
void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outputShapes,
const BlockedShapeVector& inputShapes) {
// TODO: At the moment snippets support execution in only one element type
const auto execution_element_type = ov::element::f32;
ngraph::pass::Manager p_manager;
p_manager.register_pass<snippets::pass::TransformConvertToConvertTruncation>();
p_manager.run_passes(m_body);
const auto& body_results = m_body->get_results();
for (size_t i = 0; i < outputShapes.size(); i++) {
const auto needed_out_type = std::get<2>(outputShapes[i]);
// If there is real Convert from graph (ConvertTruncation) before Result
// If there is real Convert from graph (ConvertTruncation) or after FQ decomp (ConvertSaturation) before Result
// we should check destination type and insert ConvertSaturation before that if needed.
// For example, to return original element type after Convert insertion on inputs
std::shared_ptr<ov::Node> first_convert = body_results[i];
while (ov::is_type<ngraph::snippets::op::ConvertTruncation>(first_convert->get_input_node_ptr(0))) {
while (ov::is_type<ngraph::op::v0::Convert>(first_convert->get_input_node_ptr(0))) {
first_convert = first_convert->get_input_node_shared_ptr(0);
}
if (auto existing_convert_t = ngraph::as_type_ptr<ngraph::snippets::op::ConvertTruncation>(first_convert)) {
if (auto existing_convert_t = ngraph::as_type_ptr<ngraph::op::v0::Convert>(first_convert)) {
const auto original_input_element_type = existing_convert_t->get_input_element_type(0);
if (original_input_element_type != execution_element_type) {
const auto convert = std::make_shared<ngraph::snippets::op::ConvertSaturation>(
@ -283,16 +294,16 @@ void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outpu
body_results[i]->get_input_node_shared_ptr(0), needed_out_type);
body_results[i]->set_argument(0, convert);
}
// After Convert insertion we should make the following steps:
// - insert ConvertSaturation after inputs and scalar to start aligning of exec data type inside body
// - manually set output element types of type relaxed nodes to align element type inside subgraph body
// - after Convert insertion on inputs and after scalars we should use ConstantFolding pass to convert
// element type of Scalars before inference
// - eliminate redundant Convert that could have been inserted
// We should align element type inside body using the corresponding pass:
// - Insert Convert before operations that doesn't support original element type for execution
// - Insert reverse Convert before operations that support original element type
// but have inputs that doesn't support it (because before them will be inserted Convert with exec_type - first point)
// Then we should use ConstantFolding pass to convert element type of Scalars before inference.
// At the end eliminate redundant Convert that could be inserted
ngraph::pass::Manager manager;
manager.register_pass<snippets::pass::InsertConvertOnInputs>(execution_element_type);
manager.register_pass<snippets::pass::ResetTypeRelaxedNodePrecision>(execution_element_type);
if (config.m_is_needed_to_align_precision) {
manager.register_pass<snippets::pass::AlignElementType>(execution_element_type);
}
manager.register_pass<ngraph::pass::ConstantFolding>();
manager.register_pass<ngraph::pass::EliminateConvert>();
manager.run_passes(m_body);

View File

@ -0,0 +1,97 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <snippets/itt.hpp>
#include "snippets/snippets_isa.hpp"
#include "snippets/op/convert_saturation.hpp"
#include "snippets/pass/align_element_type.hpp"
#include "snippets/utils.hpp"
#include "ngraph_ops/type_relaxed.hpp"
#include "ngraph/op/util/op_types.hpp"
#include <ngraph/rt_info.hpp>

namespace {

// True for graph boundary ops (Parameter/Constant/Result) that only carry data in and out of the body
auto is_in_out_op(const std::shared_ptr<ov::Node>& n) -> bool {
    return ov::is_type<ov::op::v0::Parameter>(n)
        || ov::is_type<ov::op::v0::Constant>(n)
        || ov::is_type<ov::op::v0::Result>(n);
}

// At the moment Subgraph supports only Eltwise, Convert and FQ (which is decomposed into Eltwises and Convert)
// and only Eltwises support execution only in "exec_type". So we can check the op type from the opposite side:
// everything except Convert is assumed to support only the execution element type.
auto op_supports_only_exec_type(const std::shared_ptr<ov::Node>& n) -> bool {
    return !ov::is_type<ov::op::v0::Convert>(n);
}

// Check if an executable operation supports only the execution element type (f32)
// NOTE: an executable op is a node that isn't Parameter/Constant/Result
auto is_executable_op_only_on_exec_type(const std::shared_ptr<ov::Node>& n) -> bool {
    return op_supports_only_exec_type(n) && !is_in_out_op(n);
}

} // namespace

ngraph::snippets::pass::AlignElementType::AlignElementType(const ov::element::Type exec_type) : exec_type(exec_type) { }

bool ngraph::snippets::pass::AlignElementType::run_on_model(const std::shared_ptr<ov::Model> &m) {
    RUN_ON_FUNCTION_SCOPE(AlignElementType);

    // Inserts ConvertSaturation to `element_type` on the idx-th input of `op`, preserving runtime info
    auto insertConvert = [](const std::shared_ptr<ov::Node>& op, const size_t idx, const ov::element::Type& element_type) -> void {
        auto convert = std::make_shared<ngraph::snippets::op::ConvertSaturation>(op->input(idx).get_source_output(), element_type);
        ngraph::copy_runtime_info(op->get_input_node_shared_ptr(idx), convert);
        op->set_argument(idx, convert);
    };

    // NOTE: We don't call validate_and_infer_types() to avoid precision conflicts on inputs
    bool rewritten = false;
    auto ops = m->get_ordered_ops();
    for (auto& op : ops) {
        // Boundary ops and Converts never need alignment themselves
        if (is_in_out_op(op) || ov::is_type<ov::op::v0::Convert>(op)) {
            continue;
        }

        if (op_supports_only_exec_type(op)) {
            // NOTE: size_t loop index to match inputs().size() and avoid a signed/unsigned comparison
            for (size_t i = 0; i < op->inputs().size(); i++) {
                auto shared_input = op->get_input_node_shared_ptr(i);
                auto existing_convert = ov::as_type_ptr<ov::op::v0::Convert>(shared_input);
                // We should insert Convert before ops which support only the exec element type, only when:
                //  - the input is a Convert with an unsupported destination type
                //  - the input is an op which supports any element type
                // We couldn't unite these conditions and just check that the element type isn't the supported exec type
                // because we don't call validate_and_infer_types() so we don't know the new precisions
                if ((existing_convert && existing_convert->get_destination_type() != exec_type) || (!is_executable_op_only_on_exec_type(shared_input))) {
                    insertConvert(op, i, exec_type);
                    rewritten |= true;
                }
            }
            // Type relaxed nodes need their output type overridden manually since we skip shape/type inference
            if (auto tr_node = std::dynamic_pointer_cast<ngraph::op::TypeRelaxedBase>(op)) {
                tr_node->set_overridden_output_type(exec_type, 0);
                rewritten |= true;
            }
        } else { // branch for the Movement ops and MatMul ops in the future
            for (size_t i = 0; i < op->inputs().size(); i++) {
                auto shared_input = op->get_input_node_shared_ptr(i);
                // it's the original element type because we don't use validate_and_infer_type() anywhere
                const auto original_eltype = op->input(i).get_element_type();
                // If before this op there is another op that doesn't support execution on the original element type, we know that
                // before that op a reverse Convert will be inserted to support execution on the supported element type (first branch of the condition).
                // So we should return the original element type for operations that can support low precision
                if (is_executable_op_only_on_exec_type(shared_input) && original_eltype != exec_type) {
                    insertConvert(op, i, original_eltype);
                    rewritten |= true;
                }
            }
        }
    }

    return rewritten;
}

bool ngraph::snippets::pass::AlignElementType::opNeedsAlignElementType(const std::shared_ptr<ov::Node>& op, const ov::element::Type exec_type) {
    // At the moment Snippets support only Eltwise/Convert/FQ which have one output, so we can just call get_element_type()
    return is_executable_op_only_on_exec_type(op) && op->get_element_type() != exec_type;
}

View File

@ -7,6 +7,7 @@
#include "snippets/pass/collapse_subgraph.hpp"
#include "snippets/op/subgraph.hpp"
#include "snippets/utils.hpp"
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/opsets/opset5.hpp>
@ -56,9 +57,19 @@ auto outputs_are_not_broadcastable(const std::shared_ptr<const Node>& node) -> b
return std::find_if_not(std::begin(outputs), std::end(outputs), check_shapes_broadcastable) != std::end(outputs);
}
auto is_layout_oblivious(const std::shared_ptr<const Node> &n) -> bool {
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::is_layout_oblivious")
auto is_layout_oblivious_binary = [](const std::shared_ptr<const Node> &n) -> bool {
auto is_supported_op(const std::shared_ptr<const Node> &n) -> bool {
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::is_supported_op")
auto is_supported_fq_op = [](const std::shared_ptr<const Node>& n) -> bool {
// TODO [92179]: Add support of FakeQuantize with non-constants inputs and with binarization algorithm.
const auto fq = ov::as_type_ptr<const opset1::FakeQuantize>(n);
return fq && fq->get_levels() != 2 &&
is_type<opset1::Constant>(n->get_input_node_shared_ptr(1)) &&
is_type<opset1::Constant>(n->get_input_node_shared_ptr(2)) &&
is_type<opset1::Constant>(n->get_input_node_shared_ptr(3)) &&
is_type<opset1::Constant>(n->get_input_node_shared_ptr(4));
};
auto is_supported_binary_eltwise_op = [](const std::shared_ptr<const Node> &n) -> bool {
return ov::is_type<opset1::Add>(n)
|| ov::is_type<opset1::Divide>(n)
|| ov::is_type<opset1::Equal>(n)
@ -79,10 +90,11 @@ auto is_layout_oblivious(const std::shared_ptr<const Node> &n) -> bool {
|| ov::is_type<opset1::Power>(n)
|| ov::is_type<opset1::SquaredDifference>(n)
|| ov::is_type<opset1::Subtract>(n)
|| ov::is_type<opset1::Xor>(n);
|| ov::is_type<opset1::Xor>(n)
|| ov::is_type<ngraph::op::v0::Convert>(n);
};
auto is_layout_oblivious_unary = [](const std::shared_ptr<const Node> &n) -> bool {
auto is_supported_unary_eltwise_op = [](const std::shared_ptr<const Node> &n) -> bool {
return ov::is_type<opset1::Abs>(n)
|| ov::is_type<opset1::Clamp>(n)
|| ov::is_type<opset1::Floor>(n)
@ -99,10 +111,10 @@ auto is_layout_oblivious(const std::shared_ptr<const Node> &n) -> bool {
|| ov::is_type<opset1::Tanh>(n)
|| ov::is_type<ngraph::op::v0::Gelu>(n)
|| ov::is_type<ngraph::op::v7::Gelu>(n)
|| ov::is_type<ngraph::op::v4::HSwish>(n)
|| ov::is_type<ngraph::op::v0::Convert>(n);
|| ov::is_type<ngraph::op::v4::Swish>(n)
|| ov::is_type<ngraph::op::v4::HSwish>(n);
};
return is_layout_oblivious_unary(n) || is_layout_oblivious_binary(n);
return is_supported_fq_op(n) || is_supported_unary_eltwise_op(n) || is_supported_binary_eltwise_op(n);
}
auto has_supported_in_out(const std::shared_ptr<const Node> &n) -> bool {
@ -162,7 +174,7 @@ auto update_out_tensor_name(std::shared_ptr<ngraph::snippets::op::Subgraph> &sub
} // namespace
bool AppropriateForSubgraph(const std::shared_ptr<const Node> &node) {
return is_layout_oblivious(node) && has_supported_in_out(node);
return is_supported_op(node) && has_supported_in_out(node);
}
void SetSnippetsNodeType(const std::shared_ptr<Node> &node, SnippetsNodeType nodeType) {
@ -435,7 +447,10 @@ TokenizeSnippets::TokenizeSnippets() {
// Result op has a single input
internal_inputs.push_back(source_result->input_value(0));
} else {
if (op::is_scalar_constant(input_node)) {
// We have to explicitly save FQ Constants to call ConstantFolding after Tokenization.
// After ConstantFolding we will move remaining non-scalar Constants from body using ConvertConstantsToParameters pass
if ((utils::is_scalar_constant(input_node)) ||
(ov::is_type<ov::op::v0::Constant>(input_node) && ov::is_type<ov::op::v0::FakeQuantize>(node))) {
internal_inputs.push_back(input_node->output(0));
} else {
external_inputs.push_back(input_value);
@ -461,10 +476,23 @@ TokenizeSnippets::TokenizeSnippets() {
throw ngraph_error("original node outputs size and extracted node outputs size doesn't much");
}
// After some transformations, a different number of Constants for some operations may be created
// than the actual number of Constants during tokenization.
// To avoid an unsupported number of non-scalar Constants in the future (plugin-specific limitation)
// we should calculate the potential number of non-scalar Constants that will be moved up from body.
size_t hidden_non_scalar_constant_count = 0;
if (const auto fq_node = ov::as_type_ptr<ov::op::v0::FakeQuantize>(node)) {
hidden_non_scalar_constant_count += ngraph::snippets::utils::get_non_scalar_constant_count_for_fq(fq_node);
}
ResultVector body_results;
std::vector<std::set<Input<Node>>> subgraph_result_inputs;
for (auto subgraph : input_subgraphs) {
// we should sum up the non-scalar Constants count from all input subgraphs
// because we will collapse them with our node and we should get the total count of non-scalar Constants
hidden_non_scalar_constant_count += ov::as_type_ptr<ngraph::snippets::op::Subgraph>(subgraph)->get_non_scalar_constants_count();
for (auto output : subgraph->outputs()) {
bool first_side_consumer = true;
@ -502,12 +530,15 @@ TokenizeSnippets::TokenizeSnippets() {
if (body_results.size() != subgraph_result_inputs.size()) {
throw ngraph_error("body results and node results size mismatch during subgraph collaps");
}
// todo: move this plugin-specific constraint to the plugin callback
if (body_parameters.size() + body_results.size() > 12) {
if (body_parameters.size() + body_results.size() + hidden_non_scalar_constant_count > 12) {
const std::string message_reset = "new subgraph is created. Impossible to schedule subgraph with " +
std::to_string(body_parameters.size()) + " inputs and " + std::to_string(body_results.size()) + " outputs.";
std::to_string(body_parameters.size()) + " inputs, " + std::to_string(body_results.size()) + " outputs and " +
std::to_string(hidden_non_scalar_constant_count) + " non-scalar constants.";
const std::string message_abort = "failed to continue subgraph. Impossible to schedule subgraph with " +
std::to_string(body_parameters.size()) + " inputs and " + std::to_string(body_results.size()) + " outputs.";
std::to_string(body_parameters.size()) + " inputs, " + std::to_string(body_results.size()) + " outputs and " +
std::to_string(hidden_non_scalar_constant_count) + " non-scalar constants.";
return abort_with_strategy(message_reset, message_abort);
}
@ -542,6 +573,7 @@ TokenizeSnippets::TokenizeSnippets() {
act_body1->get_parameters()[i]->set_friendly_name(body_parameters[i]->get_friendly_name());
}
subgraph->get_rt_info()["originalLayersNames"] = fusedNames;
subgraph->set_non_scalar_constants_count(hidden_non_scalar_constant_count);
remark(1) << "Replacement (merge) done for: "
<< subgraph->get_friendly_name()

View File

@ -0,0 +1,87 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/pass/common_optimizations.hpp"
#include <memory>
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/pass/constant_folding.hpp>
#include "transformations/utils/utils.hpp"
#include "snippets/pass/fq_decomposition.hpp"
#include "snippets/op/subgraph.hpp"
#include "snippets/itt.hpp"
NGRAPH_RTTI_DEFINITION(ngraph::snippets::pass::CommonOptimizations, "Snippets::CommonOptimizations", 0);
namespace ngraph {
namespace snippets {
namespace pass {
// Move up Constants which aren't scalars from the body to the Subgraph and replace them with Parameters inside the body.
// This keeps only scalar Constants inside the generated kernel; all other Constants become regular subgraph inputs.
void ConvertConstantsToParameters(const std::shared_ptr<ngraph::snippets::op::Subgraph>& subgraph) {
    OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::ConvertConstantsToParameters");
    auto body = subgraph->get_body();

    ParameterVector new_parameters;
    OutputVector new_external_inputs = subgraph->input_values();

    for (auto& op : body->get_ops()) {
        // Only non-scalar Constants are extracted; scalar Constants stay inside the body
        auto constant = ov::as_type_ptr<ov::op::v0::Constant>(op);
        if (!(constant && ngraph::shape_size(constant->get_shape()) != 1ul))
            continue;

        auto parameter = std::make_shared<opset1::Parameter>(constant->get_element_type(), constant->output(0).get_partial_shape());
        parameter->set_friendly_name(constant->get_friendly_name());
        ngraph::copy_runtime_info(constant, parameter);
        // Rewire body consumers to the new Parameter first, then hand the Constant to the outer graph
        constant->output(0).replace(parameter->output(0));

        new_external_inputs.push_back(constant);
        new_parameters.push_back(parameter);
    }

    if (new_parameters.size() != 0) {
        body->add_parameters(new_parameters);
        // Re-validate the body since its Parameter set changed before updating the Subgraph arguments
        body->validate_nodes_and_infer_types();
        subgraph->set_arguments(new_external_inputs);
    }
}
CommonOptimizations::CommonOptimizations() {
    // Callback executed for every matched Subgraph operation.
    ngraph::graph_rewrite_callback callback = [this](pattern::Matcher& m) {
        OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::CommonOptimizations");
        auto subgraph = ngraph::as_type_ptr<ngraph::snippets::op::Subgraph>(m.get_match_root());
        if (transformation_callback(subgraph)) {
            return false;
        }

        const auto body = subgraph->get_body();
        const auto is_quantized = subgraph->is_quantized();

        // First transform every original Convert inside the body into ConvertTruncation to
        // keep the original semantics. If the Subgraph contains FakeQuantize, additionally
        // run the decomposition dedicated to quantized subgraphs.
        ngraph::pass::Manager manager;
        manager.register_pass<ngraph::snippets::pass::TransformConvertToConvertTruncation>();
        if (is_quantized) {
            manager.register_pass<ngraph::snippets::pass::CommonFakeQuantizeDecomposition>();
        }
        manager.run_passes(body);

        // At the moment only FakeQuantize can bring non-scalar Constants into a Subgraph,
        // so hoisting them to Parameters is needed for quantized models only.
        if (is_quantized) {
            ConvertConstantsToParameters(subgraph);
        }
        return true;
    };

    auto m = std::make_shared<ngraph::pattern::Matcher>(ngraph::pattern::wrap_type<ngraph::snippets::op::Subgraph>(),
                                                        "snippets::pass::CommonOptimizations");
    this->register_matcher(m, callback);
}
} // namespace pass
} // namespace snippets
} // namespace ngraph

View File

@ -3,9 +3,12 @@
//
#include <snippets/itt.hpp>
#include "snippets/snippets_isa.hpp"
#include "snippets/pass/convert_constants_to_scalars.hpp"
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include "snippets/snippets_isa.hpp"
#include "snippets/pass/convert_constants.hpp"
#include "snippets/op/subgraph.hpp"
ngraph::snippets::pass::ConvertConstantsToScalars::ConvertConstantsToScalars() {
@ -24,5 +27,5 @@ ngraph::snippets::pass::ConvertConstantsToScalars::ConvertConstantsToScalars() {
return true;
};
register_matcher(std::make_shared<ov::pass::pattern::Matcher>(constants, matcher_name), callback);
register_matcher(std::make_shared<ov::pass::pattern::Matcher>(constants), callback);
}

View File

@ -0,0 +1,308 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/pass/fq_decomposition.hpp"
#include "snippets/op/convert_saturation.hpp"
#include "snippets/itt.hpp"
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/opsets/opset5.hpp>
#include <ngraph/partial_shape.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pass/manager.hpp>
#include <numeric>
namespace {
// Returns true only when the FakeQuantize input range is valid, i.e. every element of
// input_low is strictly less than the corresponding element of input_high.
// The check is done by constant-folding a GreaterEqual(il, ih) node; if folding fails
// (ranges are not constant) the FQ is treated as invalid for decomposition.
bool isValidRangesInputs(const std::shared_ptr<ngraph::opset1::FakeQuantize>& fq) {
    auto il = fq->input_value(1);
    auto ih = fq->input_value(2);
    auto greater_equal = std::make_shared<ngraph::opset1::GreaterEqual>(il, ih);
    ngraph::OutputVector result(1);
    if (!greater_equal->constant_fold(result, greater_equal->input_values()))
        return false;
    auto res_node = std::dynamic_pointer_cast<const ngraph::opset1::Constant>(result[0].get_node_shared_ptr());
    // Guard against a folding result that is not a Constant: the original code
    // dereferenced res_node unconditionally, which would crash on a null cast.
    if (!res_node)
        return false;
    const std::vector<bool> comp_result = res_node->cast_vector<bool>();
    // Valid iff no element satisfies il >= ih.
    return !std::any_of(comp_result.begin(), comp_result.end(), [](const bool value) {
        return value;
    });
}
// True when the node is a Constant holding exactly one element (scalar or 1-element tensor).
bool is_scalar_constant(const std::shared_ptr<ngraph::Node>& source_output_node) {
    return ngraph::is_type<ngraph::opset1::Constant>(source_output_node) &&
           ngraph::shape_size(source_output_node->get_shape()) == 1;
}
}  // namespace
// Decomposes FakeQuantize into elementwise primitives supported by Snippets:
//   round(x * isc + ish) * osc + osh   (x pre-clamped to [input_low, input_high])
// where isc = (levels-1)/(ih-il), ish = -il*isc, osc = (oh-ol)/(levels-1), osh = ol.
// When the dequantize part is trivial, the whole FQ collapses to a single Multiply
// by precomputed output scales (see calculateScales).
ngraph::snippets::pass::FakeQuantizeDecomposition::FakeQuantizeDecomposition() {
MATCHER_SCOPE(FakeQuantizeDecomposition);
// Match FakeQuantize whose 4 range inputs are Constants (data input is arbitrary).
auto fake_quantize = ngraph::pattern::wrap_type<ngraph::opset1::FakeQuantize>(
OutputVector{ngraph::pattern::any_input(),
ngraph::pattern::wrap_type<opset1::Constant>(),
ngraph::pattern::wrap_type<opset1::Constant>(),
ngraph::pattern::wrap_type<opset1::Constant>(),
ngraph::pattern::wrap_type<opset1::Constant>()});
ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher& m) {
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::FakeQuantizeDecomposition")
auto& pattern_to_output = m.get_pattern_value_map();
const auto fake_quantize_node = std::dynamic_pointer_cast<ngraph::opset1::FakeQuantize>(
pattern_to_output.at(fake_quantize).get_node_shared_ptr());
// Skip when the callback rejects the node or input ranges are invalid (il >= ih).
if (!fake_quantize_node || transformation_callback(fake_quantize_node) ||
!isValidRangesInputs(fake_quantize_node)) {
return false;
}
Output<Node> data{fake_quantize_node->input_value(0)};
const Output<Node> input_low{fake_quantize_node->input_value(1)};
const Output<Node> input_high{fake_quantize_node->input_value(2)};
const Output<Node> output_low{fake_quantize_node->input_value(3)};
const Output<Node> output_high{fake_quantize_node->input_value(4)};
auto input_type = data.get_element_type();
std::vector<float> out_scales;
std::vector<float> cl, ch, isc, ish, osc, osh;
// Try to precompute scale/shift vectors; if possible, also check whether the FQ
// reduces to a single multiplication by out_scales.
const bool status = getScalesAndShifts(fake_quantize_node, cl, ch, isc, ish, osc, osh);
if (status) {
out_scales = calculateScales(fake_quantize_node->get_output_element_type(0), cl, ch, isc, ish, osc, osh);
}
// Dequantize stage is skipped when osc == 1 and osh == 0 everywhere, or when the
// single-Multiply shortcut (non-empty out_scales) applies.
const bool do_dequantize = !(status && ((std::all_of(osc.cbegin(),
osc.cend(),
[](float val) {
return val == 1.f;
}) &&
std::all_of(osh.cbegin(),
osh.cend(),
[](float val) {
return val == 0.f;
})) ||
out_scales.size() != 0));
// Rounding is required for dequantization or when the result stays in f32.
const bool do_rounding = do_dequantize || fake_quantize_node->get_output_element_type(0) == ngraph::element::f32;
ngraph::NodeVector decomp_ops;
// Align the data element type with the range inputs' type if they differ.
if (input_type != input_low.get_element_type()) {
input_type = input_low.get_element_type();
data = std::make_shared<ngraph::snippets::op::ConvertSaturation>(data, input_type);
decomp_ops.push_back(data.get_node_shared_ptr());
}
// if we set input_low or input_high in formula we got output = output_low and output = output_high
// respectively so we just clamp x
const auto max = std::make_shared<ngraph::opset1::Maximum>(data, input_low);
const auto min = std::make_shared<ngraph::opset1::Minimum>(max, input_high);
decomp_ops.push_back(max);
decomp_ops.push_back(min);
std::shared_ptr<ngraph::Node> result = nullptr;
if (out_scales.size() != 0) {
// Shortcut: the whole FQ is a single Multiply by the precomputed scales.
PartialShape scale_shape = input_low.get_partial_shape();
ngraph::PartialShape::broadcast_merge_into(scale_shape,
input_high.get_partial_shape(),
ov::op::AutoBroadcastType::NUMPY);
const auto scales =
std::make_shared<ngraph::opset1::Constant>(ngraph::element::f32, scale_shape.get_shape(), out_scales);
decomp_ops.push_back(scales);
result = std::make_shared<ngraph::opset1::Multiply>(min, scales);
decomp_ops.push_back(result);
} else {
// General quantization stage: x * isc - ish, built from graph nodes.
// (levels-1)
const auto levels_minus_one =
std::make_shared<ngraph::opset1::Constant>(input_type, Shape{}, fake_quantize_node->get_levels() - 1);
decomp_ops.push_back(levels_minus_one);
// (input_high - input_low)
const auto subInHighLow = std::make_shared<ngraph::opset1::Subtract>(input_high, input_low);
// (levels-1) / (input_high - input_low)
const auto isc = std::make_shared<ngraph::opset1::Divide>(levels_minus_one, subInHighLow);
// input_low * (levels-1) / (input_high - input_low)
const auto ish = std::make_shared<ngraph::opset1::Multiply>(input_low, isc);
decomp_ops.push_back(subInHighLow);
decomp_ops.push_back(isc);
decomp_ops.push_back(ish);
// x * (levels-1) / (input_high - input_low)
const auto after_isc_apply = std::make_shared<ngraph::opset1::Multiply>(min, isc);
// x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low)
result = std::make_shared<ngraph::opset1::Subtract>(after_isc_apply, ish);
decomp_ops.push_back(after_isc_apply);
decomp_ops.push_back(result);
}
if (do_rounding) {
// round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low))
result = std::make_shared<ngraph::opset5::Round>(result, ngraph::opset5::Round::RoundMode::HALF_TO_EVEN);
decomp_ops.push_back(result);
}
if (do_dequantize) {
// Dequantize stage: rounded value * osc + osh, built from graph nodes.
// (levels-1)
const auto levels_minus_one =
std::make_shared<ngraph::opset1::Constant>(input_type, Shape{}, fake_quantize_node->get_levels() - 1);
// (output_high - output_low)
const auto sub_out_high_low = std::make_shared<ngraph::opset1::Subtract>(output_high, output_low);
// (output_high - output_low) / (levels-1)
const auto osc = std::make_shared<ngraph::opset1::Divide>(sub_out_high_low, levels_minus_one);
decomp_ops.push_back(sub_out_high_low);
decomp_ops.push_back(osc);
// round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low)) *
// (output_high - output_low) / (levels-1)
const auto after_osc_apply = std::make_shared<ngraph::opset1::Multiply>(result, osc);
// round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low)) *
// (output_high - output_low) / (levels-1) + output_low
result = std::make_shared<ngraph::opset1::Add>(after_osc_apply, output_low);
decomp_ops.push_back(after_osc_apply);
decomp_ops.push_back(result);
}
// Convert back to the FQ's original output type if the decomposition changed it.
if (result->get_output_element_type(0) != fake_quantize_node->get_output_element_type(0)) {
result = std::make_shared<snippets::op::ConvertSaturation>(result, fake_quantize_node->get_output_element_type(0));
decomp_ops.push_back(result);
}
result->set_friendly_name(m.get_match_root()->get_friendly_name());
ngraph::copy_runtime_info(fake_quantize_node, decomp_ops);
ngraph::replace_node(m.get_match_root(), result);
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(fake_quantize, matcher_name);
register_matcher(m, callback);
}
// True when all four FakeQuantize range inputs (input_low/high, output_low/high,
// i.e. inputs 1..4) are scalar Constants.
bool ngraph::snippets::pass::FakeQuantizeDecomposition::isAllScalarConstant(const std::shared_ptr<const ngraph::Node>& node) {
    for (size_t idx = 1; idx <= 4; ++idx) {
        if (!is_scalar_constant(node->get_input_node_shared_ptr(idx)))
            return false;
    }
    return true;
}
// Extracts the FakeQuantize range Constants and converts them into the scale/shift
// form used by the decomposition formula:
//   round(x * isc + ish) * osc + osh, with x pre-clamped to [cl, ch]
// where isc = (levels-1)/(ih-il), ish = -il*isc, osc = (oh-ol)/(levels-1), osh = ol.
// Output vectors are broadcast to max(input sizes) / max(output sizes) respectively.
// Returns false when any of the 4 range inputs is not a Constant (outputs untouched).
bool ngraph::snippets::pass::FakeQuantizeDecomposition::getScalesAndShifts(
    const std::shared_ptr<const ngraph::opset1::FakeQuantize>& fq_node,
    std::vector<float>& cl,
    std::vector<float>& ch,
    std::vector<float>& isc,
    std::vector<float>& ish,
    std::vector<float>& osc,
    std::vector<float>& osh) {
    auto input_low_constant =
        std::dynamic_pointer_cast<ngraph::opset1::Constant>(fq_node->get_input_node_shared_ptr(1));
    auto input_high_constant =
        std::dynamic_pointer_cast<ngraph::opset1::Constant>(fq_node->get_input_node_shared_ptr(2));
    auto output_low_constant =
        std::dynamic_pointer_cast<ngraph::opset1::Constant>(fq_node->get_input_node_shared_ptr(3));
    auto output_high_constant =
        std::dynamic_pointer_cast<ngraph::opset1::Constant>(fq_node->get_input_node_shared_ptr(4));
    if (!input_low_constant || !input_high_constant || !output_low_constant || !output_high_constant)
        return false;

    auto input_low = input_low_constant->cast_vector<float>();
    auto input_high = input_high_constant->cast_vector<float>();
    auto output_low = output_low_constant->cast_vector<float>();
    auto output_high = output_high_constant->cast_vector<float>();
    auto levels = fq_node->get_levels();

    const auto input_size = std::max(input_low.size(), input_high.size());
    const auto output_size = std::max(output_low.size(), output_high.size());

    // Crop bounds are the input range itself.
    cl = input_low;
    ch = input_high;
    isc.resize(input_size, 0);
    ish.resize(input_size, 0);
    osc.resize(output_size, 0);
    osh.resize(output_size, 0);

    // size_t indices: the original `int` loops triggered signed/unsigned comparisons
    // against the size_t bounds.
    for (size_t i = 0; i < input_size; i++) {
        // A 1-element vector is broadcast against the larger one.
        const float il = input_low[input_low.size() == 1 ? 0 : i];
        const float ih = input_high[input_high.size() == 1 ? 0 : i];
        isc[i] = (levels - 1) / (ih - il);
        ish[i] = -il * isc[i];
    }
    for (size_t i = 0; i < output_size; i++) {
        const float ol = output_low[output_low.size() == 1 ? 0 : i];
        const float oh = output_high[output_high.size() == 1 ? 0 : i];
        osc[i] = (oh - ol) / (levels - 1);
        osh[i] = ol;
    }
    return true;
}
// Checks whether the whole FakeQuantize collapses into a single multiplication by the
// input scales (x * isc). That holds when the FQ only quantizes (osc == 1, osh == 0 in
// effect) to u8 or i8 and the crop bounds are already aligned with the target range.
// Returns `isc` when the shortcut applies, an empty vector otherwise.
std::vector<float> ngraph::snippets::pass::FakeQuantizeDecomposition::calculateScales(const ngraph::element::Type& out_type,
                                                                                     const std::vector<float>& cl,
                                                                                     const std::vector<float>& ch,
                                                                                     const std::vector<float>& isc,
                                                                                     const std::vector<float>& ish,
                                                                                     const std::vector<float>& osc,
                                                                                     const std::vector<float>& osh) {
    std::vector<float> out_scales;
    // u8 case: zero crop-low, zero input shift and an identity dequantize stage.
    if (out_type == ngraph::element::u8 &&
        std::all_of(cl.cbegin(),
                    cl.cend(),
                    [](float val) {
                        return val == 0.0f;
                    }) &&
        std::all_of(ish.cbegin(),
                    ish.cend(),
                    [](float val) {
                        return val == 0.0f;
                    }) &&
        std::all_of(osc.cbegin(),
                    osc.cend(),
                    [](float val) {
                        return val == 1.0f;
                    }) &&
        std::all_of(osh.cbegin(), osh.cend(), [](float val) {
            return val == 0.0f;
        })) {
        out_scales = isc;
    }
    // i8 case: input shift of +128, output shift of -128 (the usual symmetric i8 layout),
    // identity output scale, and crop bounds mapping onto [-128, 127] after scaling.
    static const float thr = 0.0001f;
    if (out_type == ngraph::element::i8 &&
        std::all_of(ish.cbegin(), ish.cend(), [](float val) { return std::abs(val - 128.f) < thr; }) &&
        std::all_of(osc.cbegin(), osc.cend(), [](float val) { return val == 1.f; }) &&
        std::all_of(osh.cbegin(), osh.cend(), [](float val) { return std::abs(val + 128.f) < thr; })) {
        bool is_crop_aligned = true;
        // size_t indices: the original `int` loops compared signed vs unsigned bounds.
        for (size_t i = 0; i < std::max(cl.size(), isc.size()); i++) {
            if (std::abs(cl[cl.size() == 1 ? 0 : i] * isc[isc.size() == 1 ? 0 : i] + 128.f) > thr) {
                is_crop_aligned = false;
            }
        }
        for (size_t i = 0; i < std::max(ch.size(), isc.size()); i++) {
            if (std::abs(ch[ch.size() == 1 ? 0 : i] * isc[isc.size() == 1 ? 0 : i] - 127.f) > thr) {
                is_crop_aligned = false;
            }
        }
        if (is_crop_aligned) {
            out_scales = isc;
        }
    }
    return out_scales;
}
// Runs the FakeQuantize decomposition pipeline on the whole model: decompose every FQ,
// constant-fold the new subexpressions, then validate once at the end (per-pass
// validation is disabled for speed).
bool ngraph::snippets::pass::CommonFakeQuantizeDecomposition::run_on_model(const std::shared_ptr<ngraph::Function>& f) {
    RUN_ON_FUNCTION_SCOPE(CommonFakeQuantizeDecomposition);
    ngraph::pass::Manager pipeline;
    pipeline.set_per_pass_validation(false);
    pipeline.register_pass<ngraph::snippets::pass::FakeQuantizeDecomposition>();
    pipeline.register_pass<ngraph::pass::ConstantFolding>();
    pipeline.register_pass<ngraph::pass::Validate>();
    pipeline.run_passes(f);
    // This wrapper pass itself reports no modification.
    return false;
}

View File

@ -1,72 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <snippets/itt.hpp>
#include "snippets/remarks.hpp"
#include "snippets/pass/insert_convert_on_inputs.hpp"
#include "snippets/snippets_isa.hpp"
#include "ngraph/type.hpp"
#include "ngraph/node.hpp"
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph/pattern/op/or.hpp>
// We should recursively (after full sequences of ConvertTruncation) go through inputs and
// insert ConvertSaturation with supported element type before eltwises
// NOTE: JUST EXAMPLE:
// Parameter I8
// ConvertTruncation U8
// / | \
// ConvertTruncation F32 ConvertTruncation I32 ConvertTruncation BF16
// Eltwise ConvertSaturation FP32 ConvertTruncation I32
// <> Eltwise ConvertSaturation FP32
// <> Eltwise
// Recursively walks the consumers of `node`, descending through chains of
// ConvertTruncation, and inserts a ConvertSaturation to `element_type` in front of
// every consumer that does not already receive that type (Results are skipped, and an
// existing ConvertSaturation with the right destination type is reused).
// Returns true if the graph was modified anywhere in the traversal.
bool insertConvertSaturationAfterNode(const std::shared_ptr<ov::Node>& node, const ov::element::Type element_type) {
    bool rewritten = false;
    for (const auto& output : node->outputs()) {
        for (auto consumer : output.get_target_inputs()) {
            const auto output_shared_node = consumer.get_node()->shared_from_this();
            // Go down through the ConvertTruncation sequence.
            if (auto existing_convert_t = ov::as_type_ptr<ngraph::snippets::op::ConvertTruncation>(output_shared_node)) {
                // Bugfix: accumulate with |= instead of plain assignment — the original
                // overwrote `rewritten`, losing insertions made for earlier consumers.
                rewritten |= insertConvertSaturationAfterNode(existing_convert_t, element_type);
                continue;
            }
            // Insert ConvertSaturation if the consumer receives a different type and no
            // suitable ConvertSaturation already exists.
            auto existing_convert_s = ov::as_type_ptr<ngraph::snippets::op::ConvertSaturation>(output_shared_node);
            if ((!existing_convert_s && !ov::is_type<ov::op::v0::Result>(output_shared_node) && consumer.get_element_type() != element_type) ||
                (existing_convert_s && existing_convert_s->get_destination_type() != element_type)) {
                const auto convert = std::make_shared<ngraph::snippets::op::ConvertSaturation>(node, element_type);
                consumer.replace_source_output(convert);
                rewritten |= true;
            }
        }
    }
    return rewritten;
}
// Matches subgraph entry points (Parameters and scalar Constants) and inserts
// ConvertSaturation to the execution type in front of their consumers.
ngraph::snippets::pass::InsertConvertOnInputs::InsertConvertOnInputs(const ov::element::Type exec_type) {
    MATCHER_SCOPE(InsertConvertOnInputs);
    auto param_pattern = ngraph::pattern::wrap_type<ngraph::opset1::Parameter>();
    // Constants qualify only when they hold a single element.
    auto scalar_pattern = pattern::wrap_type<opset1::Constant>(
        [=](Output<Node> output) -> bool { return ngraph::shape_size(output.get_shape()) == 1; });
    auto input = std::make_shared<pattern::op::Or>(OutputVector{ param_pattern, scalar_pattern });

    ngraph::matcher_pass_callback callback = [this, exec_type](ngraph::pattern::Matcher& m) {
        OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertConvertOnInputs")
        // Reports modification iff at least one ConvertSaturation was inserted.
        return insertConvertSaturationAfterNode(m.get_match_root(), exec_type);
    };
    register_matcher(std::make_shared<ngraph::pattern::Matcher>(input, matcher_name), callback);
}

View File

@ -17,124 +17,43 @@ using namespace ngraph;
namespace {
std::shared_ptr<ngraph::Node> numpy_broadcast_node(const ngraph::Output<ngraph::Node>& value,
const ngraph::Shape& output_shape, const ngraph::Shape& source_shape) {
std::shared_ptr<ngraph::Node> broadcast_node_last_dim(const ngraph::Output<ngraph::Node>& value,
const ov::Shape& target_shape, const ov::Shape& normalized_shape) {
std::shared_ptr<ngraph::Node> broadcasted_node = value.get_node_shared_ptr();
if (output_shape == value.get_shape()) {
if (target_shape == value.get_shape()) {
return broadcasted_node;
}
NGRAPH_CHECK(source_shape.size() == output_shape.size(),
"Ranks of source_shape and output_shape dont match: ",
source_shape.size(),
" vs ",
output_shape.size());
bool do_broadcast = output_shape.size() > value.get_shape().size();
if (!do_broadcast) {
for (size_t index = 0; index < output_shape.size(); ++index) {
if (source_shape.at(index) == 1 && output_shape.at(index) != 1) {
do_broadcast = true;
break;
}
}
}
remark(2) << "Insert explicit broadcast " << value.get_node()->get_type_name()
<< " " << broadcasted_node->get_shape() << " -> " << output_shape << std::endl;
// it shouldn't be a problem for now since we don't consider StridedSlice and Broadcast here
if (auto constant = ngraph::as_type_ptr<ngraph::opset1::Constant>(broadcasted_node)) {
if (constant->get_shape() == ngraph::Shape() || ngraph::shape_size(constant->get_shape()) == 1) {
remark(2) << "Insert explicit broadcast " << value.get_node()->get_type_name()
<< " to scalar constant " << constant->get_shape() << " -- aborting!" << std::endl;
return broadcasted_node;
}
}
if (auto constant = ngraph::as_type_ptr<ngraph::snippets::op::Scalar>(broadcasted_node)) {
if (constant->get_shape() == ngraph::Shape() || ngraph::shape_size(constant->get_shape()) == 1) {
remark(2) << "Insert explicit broadcast " << value.get_node()->get_type_name()
<< " to scalar constant " << constant->get_shape() << " -- aborting!" << std::endl;
return broadcasted_node;
}
}
if (do_broadcast) {
// ShapeOf
broadcasted_node = std::make_shared<ngraph::snippets::op::BroadcastMove>(broadcasted_node, output_shape);
// Insert BroadcastMove only if the last dimension needs to be broadcasted. Higher-level dims broadcasting
// will be handled by pointer arithmetics in TileScheduler
if (*target_shape.rbegin() != *normalized_shape.rbegin()) {
ov::Shape broadcasted_shape = normalized_shape;
*broadcasted_shape.rbegin() = *target_shape.rbegin();
broadcasted_node = std::make_shared<ngraph::snippets::op::BroadcastMove>(broadcasted_node, broadcasted_shape);
}
return broadcasted_node;
}
ngraph::Shape calculate_broadcast_shape(ngraph::Shape lhs_shape, ngraph::Shape rhs_shape) {
ngraph::Shape result;
auto lhs_rank = lhs_shape.size();
auto rhs_rank = rhs_shape.size();
auto max_rank = std::max(lhs_rank, rhs_rank);
// left-pad the lhs_shape with ones
lhs_shape.insert(begin(lhs_shape), max_rank - lhs_rank, 1);
// left-pad the rhs_shape with ones
rhs_shape.insert(begin(rhs_shape), max_rank - rhs_rank, 1);
for (size_t index = 0; index < max_rank; ++index) {
size_t lhs_dim = lhs_shape.at(index);
size_t rhs_dim = rhs_shape.at(index);
if (lhs_dim != rhs_dim && lhs_dim != 1 && rhs_dim != 1) {
throw ngraph::ngraph_error("incompatible shapes");
}
result.push_back(std::max(lhs_dim, rhs_dim));
std::pair<ov::Shape, std::vector<ov::Shape>> get_numpy_broadcast_shapes(const std::vector<ov::Shape>& input_shapes) {
ov::PartialShape target_shape = input_shapes.front();
for (auto i = 1; i < input_shapes.size(); i++) {
if (!ov::PartialShape::broadcast_merge_into(target_shape, input_shapes[i], op::AutoBroadcastType::NUMPY))
throw ngraph::ngraph_error("InsertMoveBroadcast: Failed broadcast-merge input shapes");
}
return result;
}
std::pair<ngraph::Shape, std::vector<ngraph::Shape>> get_numpy_broadcast_shapes(const std::vector<ngraph::Shape>& input_shapes) {
ngraph::Shape target_shape = std::accumulate(begin(input_shapes), end(input_shapes), ngraph::Shape{}, calculate_broadcast_shape);
std::vector<ngraph::Shape> full_shapes;
for (const ngraph::Shape& input : input_shapes) {
ngraph::Shape padded_shape{input};
padded_shape.insert(begin(padded_shape), target_shape.size() - padded_shape.size(), 1);
full_shapes.push_back(move(padded_shape));
std::vector<ov::Shape> normalized_shapes;
for (const auto& input : input_shapes) {
ov::Shape padded_shape{input};
padded_shape.insert(padded_shape.begin(), target_shape.size() - padded_shape.size(), 1);
normalized_shapes.push_back(std::move(padded_shape));
}
return {target_shape, full_shapes};
}
auto reset_broacast_config(const std::shared_ptr<ngraph::Node>& op) -> void {
using namespace ngraph;
bool is_scalar = false;
for (auto input : op->inputs()) {
if (input.get_shape() == Shape() || ngraph::shape_size(input.get_shape()) == 1) {
is_scalar = true;
}
}
if (!is_scalar) {
if (auto binary = std::dynamic_pointer_cast<ngraph::op::util::BinaryElementwiseArithmetic>(op)) {
binary->set_autob(ngraph::op::AutoBroadcastType::NONE);
} else if (auto binary = std::dynamic_pointer_cast<ngraph::op::util::BinaryElementwiseComparison>(op)) {
binary->set_autob(ngraph::op::AutoBroadcastType::NONE);
} else if (auto binary = std::dynamic_pointer_cast<ngraph::op::util::BinaryElementwiseLogical>(op)) {
binary->set_autob(ngraph::op::AutoBroadcastType::NONE);
}
}
return {target_shape.get_shape(), normalized_shapes};
}
} // namespace
// adds explicit broadcasts if needed
// ToDO: this indeed make model not reshapable, need to come up with more clever way to insert fake broadcast,
// well on the other hand, if we replace scalar constant with Scalar op / or ShapeOf, we could have broadcasts that are reshapable
// TODO: generate FakeBroadcast if and only if broadcast is done by w dimension
ngraph::snippets::pass::InsertMoveBroadcast::InsertMoveBroadcast() {
MATCHER_SCOPE(InsertMoveBroadcast);
ngraph::graph_rewrite_callback callback = [this](ngraph::pattern::Matcher &m) {
@ -145,28 +64,39 @@ ngraph::snippets::pass::InsertMoveBroadcast::InsertMoveBroadcast() {
return false;
}
std::vector<ngraph::Shape> input_shapes;
for (const auto& input : values) {
input_shapes.push_back(input.get_shape());
auto is_scalar_constant = [](const ov::Output<ov::Node>& v){
if (auto constant = ov::as_type_ptr<ov::op::v0::Constant>(v.get_node_shared_ptr())) {
if (constant->get_shape().empty() || ngraph::shape_size(constant->get_shape()) == 1) {
return true;
}
}
return false;
};
std::vector<ov::Shape> input_shapes;
std::vector<bool> ignore_as_scalar;
for (const auto& val : values) {
input_shapes.emplace_back(val.get_shape());
ignore_as_scalar.push_back(is_scalar_constant(val));
}
// find the output tensor's shape, then broadcast all inputs so that they are compatible
// find the output tensor's shape, then broadcast all inputs so that they are compatible with respect to the last dim
auto bcast_shapes = get_numpy_broadcast_shapes(input_shapes);
ngraph::OutputVector broadcasted_inputs;
for (size_t i = 0; i < values.size(); ++i) {
auto node = numpy_broadcast_node(values[i], bcast_shapes.first, bcast_shapes.second[i]);
ngraph::copy_runtime_info(root, node);
broadcasted_inputs.push_back(node);
if (ignore_as_scalar[i]) {
broadcasted_inputs.push_back(values[i]);
} else {
auto node = broadcast_node_last_dim(values[i], bcast_shapes.first, bcast_shapes.second[i]);
ngraph::copy_runtime_info(root, node);
broadcasted_inputs.push_back(node);
}
}
auto new_args = ngraph::as_node_vector(broadcasted_inputs);
for (size_t i = 0; i < new_args.size(); i++) {
root->input(i).replace_source_output(new_args[i]->output(0));
}
reset_broacast_config(root);
return true;
};

View File

@ -27,32 +27,20 @@ ngraph::snippets::pass::LoadMoveBroadcastToBroadcastLoad::LoadMoveBroadcastToBro
const auto input = pm.at(load_pattern).get_node_shared_ptr();
const auto param = pm.at(param_pattern).get_node_shared_ptr();
// check if load has more than 1 user to avoid load+broadcast load on the same parameter
if (input->output(0).get_target_inputs().size() != 1) {
// Cannot rewrite Broadcast + Load if load has more than 1 user
// or more than one input, or if Broadcast has several inputs
if (input->output(0).get_target_inputs().size() != 1 ||
root->inputs().size() != 1 || input->inputs().size() != 1) {
return false;
}
if (root->inputs().size() != 1 || input->inputs().size() != 1) {
throw ngraph_error("cannot rewrite Broadcast load with more than one input");
}
auto inshape = root->input(0).get_shape();
auto outshape = root->output(0).get_shape();
auto broadcastload = std::make_shared<snippets::op::BroadcastLoad>(param, outshape);
Shape bct(inshape.size(), 0);
for (size_t k = 0; k < inshape.size(); k++) {
if (inshape[k] != outshape[k] && inshape[k] == 1) {
bct[k] = 1;
}
}
// Todo: consider refactoring BroadcastLoad, it seems we don't need broadcast_info at this point.
broadcastload->set_broadcast_info(bct);
if (inshape.back() == 1 && outshape.back() != 1) {
ngraph::copy_runtime_info(root, broadcastload);
ngraph::replace_node(root, broadcastload);
return true;
} else {
return false;
}
ngraph::copy_runtime_info(root, broadcastload);
ngraph::replace_node(root, broadcastload);
return true;
});
}

View File

@ -1,31 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <snippets/itt.hpp>
#include "snippets/op/convert_saturation.hpp"
#include "snippets/pass/reset_type_relaxed_node_precision.hpp"
#include "ngraph_ops/type_relaxed.hpp"
#include <ngraph/rt_info.hpp>
ngraph::snippets::pass::ResetTypeRelaxedNodePrecision::ResetTypeRelaxedNodePrecision(const ov::element::Type exec_type) : exec_type(exec_type) { }
// Overrides the output precision of every TypeRelaxed node in the model with
// exec_type; all other nodes are re-validated so the new types propagate downstream
// (the model is walked in topological order).
bool ngraph::snippets::pass::ResetTypeRelaxedNodePrecision::run_on_model(const std::shared_ptr<ov::Model> &m) {
    RUN_ON_FUNCTION_SCOPE(ResetTypeRelaxedNodePrecision);
    bool rewritten = false;
    for (auto& op : m->get_ordered_ops()) {
        if (auto node = std::dynamic_pointer_cast<ngraph::op::TypeRelaxedBase>(op)) {
            // size_t index: the original `int i` compared signed vs unsigned against
            // outputs().size().
            for (size_t i = 0; i < op->outputs().size(); i++) {
                node->set_overridden_output_type(exec_type, i);
                rewritten = true;
            }
        } else {
            op->validate_and_infer_types();
        }
    }
    return rewritten;
}

View File

@ -5,7 +5,7 @@
#include "snippets/remarks.hpp"
#include <snippets/itt.hpp>
#include "snippets/pass/transform_convert_to_truncation.hpp"
#include "snippets/pass/transform_convert.hpp"
#include "snippets/snippets_isa.hpp"
#include <ngraph/opsets/opset1.hpp>
@ -14,15 +14,19 @@
ngraph::snippets::pass::TransformConvertToConvertTruncation::TransformConvertToConvertTruncation() {
MATCHER_SCOPE(TransformConvertToConvertTruncation);
auto convert = std::make_shared<pattern::op::Label>(pattern::any_input(),
[](const std::shared_ptr<const Node> &n) {
return ov::is_type<ngraph::opset1::Convert>(n) &&
!ov::is_type<op::ConvertTruncation>(n) &&
!ov::is_type<op::ConvertSaturation>(n);
});
register_matcher(std::make_shared<ngraph::pattern::Matcher>(
ngraph::pattern::wrap_type<ngraph::opset1::Convert>()),
ngraph::pattern::wrap_type<ngraph::opset1::Convert>(), matcher_name),
[this](ngraph::pattern::Matcher &m) {
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::TransformConvertToConvertTruncation")
const auto root = m.get_match_root();
const auto convert = ngraph::as_type_ptr<ngraph::opset1::Convert>(root);
if (!convert)
return false;
auto convert_truncation = std::make_shared<op::ConvertTruncation>(convert->get_input_source_output(0),
convert->get_destination_type());
convert_truncation->set_friendly_name(convert->get_friendly_name());
@ -31,4 +35,4 @@ ngraph::snippets::pass::TransformConvertToConvertTruncation::TransformConvertToC
return true;
});
}
}

View File

@ -0,0 +1,57 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/utils.hpp"
#include "snippets/pass/fq_decomposition.hpp"
// Predicts how many non-scalar Constants the FakeQuantize decomposition will leave in
// the graph (after constant folding), so the Subgraph can reserve inputs for them.
auto ngraph::snippets::utils::get_non_scalar_constant_count_for_fq(const std::shared_ptr<ngraph::opset1::FakeQuantize>& fq) -> size_t {
std::vector<float> out_scales;
std::vector<float> cl, ch, isc, ish, osc, osh;
const bool status = ngraph::snippets::pass::FakeQuantizeDecomposition::getScalesAndShifts(fq, cl, ch, isc, ish, osc, osh);
if (status) {
// Single-Multiply shortcut: the FQ folds into one scale Constant, which is
// non-scalar exactly when it has more than one element (hence 0 or 1).
out_scales = ngraph::snippets::pass::FakeQuantizeDecomposition::calculateScales(fq->get_output_element_type(0), cl, ch, isc, ish, osc, osh);
if (out_scales.size() != 0) {
return out_scales.size() != 1;
}
}
// "Only quantized" means the dequantize stage is an identity (osc == 1, osh == 0),
// so output-range Constants produce no new non-scalar Constants.
const bool only_quantized = status &&
std::all_of(osc.cbegin(), osc.cend(),
[](float val) { return val == 1.f; }) &&
std::all_of(osh.cbegin(), osh.cend(),
[](float val) { return val == 0.f; });
const bool il = ngraph::shape_size(fq->input(1).get_shape()) != 1lu;
const bool ih = ngraph::shape_size(fq->input(2).get_shape()) != 1lu;
const bool ol = !only_quantized && ngraph::shape_size(fq->input(3).get_shape()) != 1lu;
const bool oh = !only_quantized && ngraph::shape_size(fq->input(4).get_shape()) != 1lu;
// FakeQuantize decomposition has the following formula:
// round(x * (levels-1) / (ih - il) - il * (levels-1) / (ih - il)) * (oh - ol) / (levels-1) + ol
// After the decomposition there is call of ConstantsFolding pass that generates new Constants:
// - isc := (levels-1) / (ih - il)
// - ish := -il * isc
// - osc := (oh - ol) / (levels-1)
// - osh := ol
// New formula:
// round(x * isc + ish) * osc + osh
// Thus, after FakeQuantize decomposition we have 6 Constants instead of original 4:
// ih, il (for Max/Min), isc, ish, osc, osh
// Some of them can be scalar or non-scalar. It depends on which original 4 Constants are non-scalar
// To sum it up, below conditions check all possible cases to calculate count of new generated non-scalars
if (ol && il && ih)
return 6;
else if ((ol && (il || ih)) || (il && ih && oh))
return 5;
else if ((il && oh) || (ih && oh) || (il && ih))
return 4;
else if (il || ih)
return 3;
else if (ol)
return 2;
else if (oh)
return 1;
return 0;
}

View File

@ -52,40 +52,6 @@ TEST(TransformationTests, FuseLoadWithBroadcastMoveByX) {
ASSERT_TRUE(res.first) << res.second;
}
// Broadcast along a non-last dimension ({1,2} -> {2,2}) must NOT be fused into a
// BroadcastLoad, so the transformed graph is expected to be identical to the input
// (the reference below is built the same way, with BroadcastMove left in place).
TEST(TransformationTests, NotFuseLoadWithBroadcastMoveByY) {
std::shared_ptr<Function> f(nullptr), f_ref(nullptr);
{
auto data0 = std::make_shared<opset1::Parameter>(element::f32, Shape{1, 2});
auto data1 = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 2});
auto load0 = std::make_shared<snippets::isa::Load>(data0);
auto load1 = std::make_shared<snippets::isa::Load>(data1);
auto bct = std::make_shared<snippets::isa::BroadcastMove>(load0, load1->get_shape());
auto add = std::make_shared<opset1::Add>(bct, load1);
auto store = std::make_shared<snippets::isa::Store>(add);
f = std::make_shared<Function>(NodeVector{store}, ParameterVector{data0, data1});
// Run the pass under test; it should leave the graph unchanged here.
pass::Manager m;
m.register_pass<pass::InitNodeInfo>();
m.register_pass<snippets::pass::LoadMoveBroadcastToBroadcastLoad>();
m.run_passes(f);
ASSERT_NO_THROW(check_rt_info(f));
}
{
// Reference: the same graph, untouched.
auto data0 = std::make_shared<opset1::Parameter>(element::f32, Shape{1, 2});
auto data1 = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 2});
auto load0 = std::make_shared<snippets::isa::Load>(data0);
auto load1 = std::make_shared<snippets::isa::Load>(data1);
auto bct = std::make_shared<snippets::isa::BroadcastMove>(load0, load1->get_shape());
auto add = std::make_shared<opset1::Add>(bct, load1);
auto store = std::make_shared<snippets::isa::Store>(add);
f_ref = std::make_shared<Function>(NodeVector{store}, ParameterVector{data0, data1});
}
auto res = compare_functions(f, f_ref);
ASSERT_TRUE(res.first) << res.second;
}
TEST(TransformationTests, NoFuseLoadWithBroadcastMoveMultipleUsers) {
std::shared_ptr<Function> f(nullptr), f_ref(nullptr);
{

View File

@ -22,7 +22,7 @@ using namespace ngraph;
TEST_F(TransformationTestsF, InsertBroadcastMove) {
{
auto data0 = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 3});
auto data1 = std::make_shared<opset1::Parameter>(element::f32, Shape{1, 1, 3});
auto data1 = std::make_shared<opset1::Parameter>(element::f32, Shape{1, 2, 1});
auto add = std::make_shared<opset1::Add>(data0, data1);
function = std::make_shared<Function>(NodeVector{add}, ParameterVector{data0, data1});
@ -30,10 +30,9 @@ TEST_F(TransformationTestsF, InsertBroadcastMove) {
}
{
auto data0 = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 3});
auto data1 = std::make_shared<opset1::Parameter>(element::f32, Shape{1, 1, 3});
auto move0 = std::make_shared<snippets::isa::BroadcastMove>(data0, Shape{1, 2, 3});
auto data1 = std::make_shared<opset1::Parameter>(element::f32, Shape{1, 2, 1});
auto move1 = std::make_shared<snippets::isa::BroadcastMove>(data1, Shape{1, 2, 3});
auto add = std::make_shared<opset1::Add>(move0, move1);
auto add = std::make_shared<opset1::Add>(data0, move1);
function_ref = std::make_shared<Function>(NodeVector{add}, ParameterVector{data0, data1});
}
}

View File

@ -0,0 +1,49 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <gtest/gtest.h>
#include "common_test_utils/ngraph_test_utils.hpp"
#include "snippets/pass/common_optimizations.hpp"
#include "snippets/op/subgraph.hpp"
#include "fake_quantize_function.hpp"
#include "function_helper.hpp"
namespace ov {
namespace test {
namespace snippets {
// Fixture that runs the snippets CommonOptimizations pipeline (which performs
// FakeQuantize decomposition) and compares the resulting Subgraph bodies
// instead of the outer functions, since the decomposition lives inside the
// snippets Subgraph op.
class FakeQuantizeDecompositionTest : public TransformationTestsF {
public:
    // Registers the pass pipeline under test on the fixture's pass manager.
    void register_passes() {
        manager.register_pass<ngraph::snippets::pass::CommonOptimizations>();
    }

    void TearDown() override {
        TransformationTestsF::TearDown();

        const auto subgraph = FunctionHelper::getSubgraph(function);
        const auto subgraph_op = std::dynamic_pointer_cast<ngraph::snippets::op::Subgraph>(subgraph);
        // Fix: the original dereferenced the dynamic_pointer_cast result without
        // checking it — undefined behavior if the retrieved node is not a Subgraph.
        ASSERT_TRUE(subgraph == nullptr || subgraph_op != nullptr) << "retrieved node is not a snippets Subgraph";
        const auto body = subgraph_op == nullptr ? nullptr : subgraph_op->get_body();

        const auto subgraph_ref = FunctionHelper::getSubgraph(function_ref);
        const auto subgraph_ref_op = std::dynamic_pointer_cast<ngraph::snippets::op::Subgraph>(subgraph_ref);
        ASSERT_TRUE(subgraph_ref == nullptr || subgraph_ref_op != nullptr) << "reference node is not a snippets Subgraph";
        const auto body_ref = subgraph_ref_op == nullptr ? nullptr : subgraph_ref_op->get_body();

        // Compare the decomposed body against the manually-built reference body.
        const auto res = comparator.compare(body, body_ref);
        ASSERT_TRUE(res.valid) << res.message;
    }
};
TEST_F(FakeQuantizeDecompositionTest, smoke_Snippets_PerTensorFakeQuantizeDecomposition) {
    // Per-tensor case: the four FakeQuantize constant inputs use empty shapes,
    // i.e. scalars. The trailing 1.f is forwarded to the builder
    // (presumably a FQ parameter — see FakeQuantizeFunction for its meaning).
    function = FakeQuantizeFunction::getSubgraphWithFakeQuantize(
        {1, 3, 16, 16}, element::f32, {{}, {}, {}, {}}, 1.f);
    // Reference: the same subgraph built with FakeQuantize already decomposed.
    function_ref = FakeQuantizeFunction::getSubgraphWithDecomposedFakeQuantize(
        {1, 3, 16, 16}, element::f32, {{}, {}, {}, {}}, 1.f);
    // The fixture's TearDown applies the registered passes' result comparison
    // of the Subgraph bodies of `function` and `function_ref`.
    register_passes();
}
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -41,31 +41,21 @@ TEST_P(InsertLoadStoreTests, ThreeInputsEltwise) {
namespace InsertLoadStoreTestsInstantiation {
using ov::Shape;
std::vector<Shape> inputShapes1{{1, 1, 2, 5, 1}, {1, 4, 1, 5, 1}};
std::vector<Shape> inputShapes2{{1, 1, 2, 5, 1}, {1, 4, 1, 5, 1}, {1, 4, 1, 5, 16}};
std::vector<Shape> inputShapes{{1, 4, 1, 5, 1}, {1, 4, 2, 5, 1}};
std::vector<Shape> broadcastShapes{{1, 4, 1, 5, 16}, {1, 4, 2, 5, 16}};
Shape exec_domain{1, 4, 2, 5, 16};
Shape emptyShape{};
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BroadcastLoad, InsertLoadStoreTests,
::testing::Combine(
::testing::Values(exec_domain),
::testing::ValuesIn(inputShapes1),
::testing::ValuesIn(inputShapes1),
::testing::Values(inputShapes[0]),
::testing::Values(inputShapes[1]),
::testing::Values(emptyShape),
::testing::Values(exec_domain),
::testing::Values(exec_domain)),
::testing::Values(broadcastShapes[0]),
::testing::Values(broadcastShapes[1])),
InsertLoadStoreTests::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BroadcastMove, InsertLoadStoreTests,
::testing::Combine(
::testing::Values(exec_domain),
::testing::Values(Shape {1, 4, 1, 5, 16}),
::testing::ValuesIn(inputShapes2),
::testing::Values(emptyShape),
::testing::Values(exec_domain),
::testing::Values(exec_domain)),
InsertLoadStoreTests::getTestCaseName);
} // namespace InsertLoadStoreTestsInstantiation
} // namespace snippets
} // namespace test

View File

@ -39,7 +39,7 @@ TEST_P(InsertMoveBroadcastTests, AddBroadcast) {
namespace InsertMoveBroadcastTestsInstantiation {
using ov::Shape;
std::vector<Shape> inputShapes0 {{1, 1, 1, 3}, {1, 1, 2, 3}, {1, 8, 1, 3}};
std::vector<Shape> inputShapes0 {{1, 8, 2, 1}};
std::vector<Shape> inputShapes1 {{1, 8, 2, 3}};
Shape broadcastShape {1, 8, 2, 3};
Shape emptyShape {};
@ -59,12 +59,12 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BroadcastOn1, InsertMoveBroadcastTests,
::testing::Values(broadcastShape)),
InsertMoveBroadcastTests::getTestCaseName);
std::vector<Shape> inputShapesBoth0 {{4, 1, 2, 3}, {1, 8, 1, 3}, {1, 1, 2, 3}};
std::vector<Shape> inputShapesBoth1 {{1, 8, 1, 3}, {4, 1, 2, 3}, {4, 8, 1, 3}};
Shape broadcastShapeBoth{4, 8, 2, 3};
std::vector<insertMoveBroadcastParams> params = {std::make_tuple(inputShapesBoth0[0], inputShapesBoth1[0], broadcastShapeBoth, broadcastShapeBoth),
std::make_tuple(inputShapesBoth0[1], inputShapesBoth1[1], broadcastShapeBoth, broadcastShapeBoth),
std::make_tuple(inputShapesBoth0[2], inputShapesBoth1[2], broadcastShapeBoth, broadcastShapeBoth)};
std::vector<Shape> inputShapesBoth0 {{4, 1, 2, 1}, {1, 8, 1, 1}, {1, 1, 2, 3}};
std::vector<Shape> inputShapesBoth1 {{4, 8, 2, 3}, {4, 1, 2, 3}, {4, 8, 1, 1}};
std::vector<Shape> broadcastShapeBoth{{4, 1, 2, 3}, {1, 8, 1, 3}, {4, 8, 1, 3}};
std::vector<insertMoveBroadcastParams> params = {std::make_tuple(inputShapesBoth0[0], inputShapesBoth1[0], broadcastShapeBoth[0], emptyShape),
std::make_tuple(inputShapesBoth0[1], inputShapesBoth1[1], broadcastShapeBoth[1], emptyShape),
std::make_tuple(inputShapesBoth0[2], inputShapesBoth1[2], emptyShape, broadcastShapeBoth[2])};
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BroadcastOnBoth, InsertMoveBroadcastTests,
::testing::ValuesIn(params),

View File

@ -10,6 +10,7 @@
#include <string>
#include <map>
#include <mutex>
namespace ov {
namespace intel_cpu {

View File

@ -17,6 +17,7 @@
#include "snippets_transformations/op/load_convert.hpp"
#include "snippets_transformations/op/store_convert.hpp"
#include "ngraph_transformations/op/swish_cpu.hpp"
#include <ngraph/opsets/opset5.hpp>
@ -114,6 +115,7 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_
// jitters[ngraph::opset1::Tan::get_type_info_static()] = CREATE_EMITTER(); // not supported
jitters[ngraph::opset1::Tanh::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_tanh_emitter);
jitters[ov::intel_cpu::SwishNode::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_swish_emitter);
jitters[ngraph::op::v4::HSwish::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_hswish_emitter);
// jitters[ngraph::opset1::HardSigmoid::get_type_info_static()] = CREATE_EMITTER(); // not supported
// jitters[ngraph::opset1::Selu::get_type_info_static()] = CREATE_EMITTER(); // not supported

View File

@ -5,6 +5,7 @@
#pragma once
#include "ngraph/opsets/opset5.hpp"
#include "ngraph_transformations/op/swish_cpu.hpp"
#include "jit_dnnl_emitters.hpp"
namespace ov {
@ -102,6 +103,20 @@ public:
}
};
// Emitter that configures the dnnl eltwise injector with the swish kind,
// taking `alpha` from the CPU-specific SwishNode and forcing `beta` to 0.
class jit_swish_emitter : public jit_dnnl_emitter {
public:
    jit_swish_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr<ngraph::Node>& n,
                      InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32)
        : jit_dnnl_emitter(host, host_isa, n, exec_prc) {
        kind = dnnl_eltwise_swish;
        // NOTE(review): the as_type_ptr result is dereferenced without a null
        // check — assumes `n` is always a SwishNode; confirm at the call site.
        auto op = ngraph::as_type_ptr<ov::intel_cpu::SwishNode>(n);
        alpha = op->get_alpha();
        beta = 0.f;
        set_injector();
    }
};
class jit_hswish_emitter : public jit_dnnl_emitter {
public:
jit_hswish_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr<ngraph::Node>& n,
@ -114,6 +129,7 @@ public:
set_injector();
}
};
class jit_gelu_v0_emitter : public jit_dnnl_emitter {
public:
jit_gelu_v0_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr<ngraph::Node>& n,

View File

@ -18,10 +18,12 @@ using namespace Xbyak::util;
namespace ov {
namespace intel_cpu {
namespace {
// heuristic threshold (in bytes) between using a mask load and emulating it with several simple partial loads
const int threshold_for_mask_emu_load = 14;
constexpr int threshold_for_mask_emu_load = 14;
// heuristic threshold (in bytes) between using a mask store and emulating it with several simple partial stores
const int threshold_for_mask_emu_store = 6;
constexpr int threshold_for_mask_emu_store = 6;
} // namespace
size_t load_emitter_params::hash() const {
size_t seed = 0;

View File

@ -387,13 +387,6 @@ void TileEmitter::emit_impl(const std::vector<size_t>& in,
BroadcastMoveEmitter::BroadcastMoveEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
const std::shared_ptr<ov::Node>& n) : jit_emitter(h, isa, n) {
if (n->get_input_shape(0).empty())
use_broadcast = true;
else if (*n->get_input_shape(0).rbegin() != *n->get_output_shape(0).rbegin())
use_broadcast = true;
else
use_broadcast = false;
if (n->get_input_element_type(0) != n->get_output_element_type(0))
IE_THROW() << "BroadcastMoveEmitter supports only equal input and output types but gets: "
<< n->get_input_element_type(0) << " and " << n->get_output_element_type(0);
@ -420,20 +413,14 @@ template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void BroadcastMoveEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in[0]);
Xmm xmm_src0 = Xmm(in[0]);
Vmm vmm_dst = Vmm(out[0]);
if (use_broadcast) {
switch (byte_size) {
case 4: h->uni_vbroadcastss(vmm_dst, xmm_src0); break;
case 2: h->vpbroadcastw(vmm_dst, xmm_src0); break;
case 1: h->vpbroadcastb(vmm_dst, xmm_src0); break;
default: assert(!"unsupported data type");
}
} else {
if (vmm_src0 != vmm_dst)
h->uni_vmovups(vmm_dst, vmm_src0);
switch (byte_size) {
case 4: h->uni_vbroadcastss(vmm_dst, xmm_src0); break;
case 2: h->vpbroadcastw(vmm_dst, xmm_src0); break;
case 1: h->vpbroadcastb(vmm_dst, xmm_src0); break;
default: assert(!"unsupported data type");
}
}

View File

@ -78,6 +78,7 @@ ExecNetwork::ExecNetwork(const InferenceEngine::CNNNetwork &network,
bool isFloatModel = !ngraph::op::util::has_op_with_type<ngraph::op::FakeQuantize>(function);
_cfg.isNewApi = !isLegacyAPI();
_mutex = std::make_shared<std::mutex>();
// WA for inference dynamic batch cases in new API
if (_cfg.isNewApi) {
@ -176,10 +177,10 @@ ExecNetwork::GraphGuard::Lock ExecNetwork::GetGraph() const {
auto makeGraph = [&] {
try {
{
std::lock_guard<std::mutex> lock{_cfgMutex};
std::lock_guard<std::mutex> lock{*_mutex.get()};
graphLock._graph.setConfig(_cfg);
}
graphLock._graph.CreateGraph(_network, extensionManager, _numaNodesWeights[numaNodeId]);
graphLock._graph.CreateGraph(_network, extensionManager, _numaNodesWeights[numaNodeId], _mutex);
} catch(...) {
exception = std::current_exception();
}
@ -198,7 +199,7 @@ ExecNetwork::GraphGuard::Lock ExecNetwork::GetGraph() const {
void ExecNetwork::setProperty(const std::map<std::string, std::string> &properties) {
{
std::lock_guard<std::mutex> lock{_cfgMutex};
std::lock_guard<std::mutex> lock{*_mutex.get()};
_cfg.readProperties(properties);
}
for (auto& g : _graphs) {

View File

@ -53,7 +53,9 @@ protected:
ExtensionManager::Ptr extensionManager;
std::vector<InferenceEngine::IVariableStateInternal::Ptr> memoryStates;
const InferenceEngine::CNNNetwork _network;
mutable std::mutex _cfgMutex;
// Generic synchronization primitive on ExecNetwork level.
// Usage example: helps to avoid data races during CPU Graph initialization in multi-streams scenario
mutable std::shared_ptr<std::mutex> _mutex;
Config _cfg;
std::atomic_int _numRequests = {0};
std::string _name;
@ -67,7 +69,7 @@ protected:
// WARNING: Do not use _graphs directly.
mutable std::deque<GraphGuard> _graphs;
mutable NumaNodesWeights _numaNodesWeights;
mutable NumaNodesWeights _numaNodesWeights;
/* WARNING: Use GetGraph() function to get access to graph in current stream.
* NOTE: Main thread is interpreted as master thread of external stream so use this function to get access to graphs

View File

@ -25,6 +25,7 @@
#include "nodes/input.h"
#include <nodes/reorder.h>
#include "nodes/convert.h"
#include "nodes/subgraph.h"
#include <ie_algorithm.hpp>
#include <blob_factory.hpp>
@ -68,7 +69,7 @@ Graph::~Graph() {
template<typename NET>
void Graph::CreateGraph(NET &net, const ExtensionManager::Ptr& extMgr,
WeightsSharing::Ptr &w_cache) {
WeightsSharing::Ptr &w_cache, const std::shared_ptr<std::mutex>& mutex) {
OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "CreateGraph");
if (IsReady())
@ -77,6 +78,7 @@ void Graph::CreateGraph(NET &net, const ExtensionManager::Ptr& extMgr,
weightsCache = config.streamExecutorConfig._streams != 1 ? w_cache : nullptr;
rtParamsCache = std::make_shared<MultiCache>(config.rtCacheCapacity);
sharedMutex = mutex;
Replicate(net, extMgr);
InitGraph();
@ -119,9 +121,9 @@ void Graph::CreateGraph(const std::vector<NodePtr> &graphNodes,
}
template void Graph::CreateGraph(const std::shared_ptr<const ngraph::Function>&,
const ExtensionManager::Ptr&, WeightsSharing::Ptr&);
const ExtensionManager::Ptr&, WeightsSharing::Ptr&, const std::shared_ptr<std::mutex>& mutex);
template void Graph::CreateGraph(const CNNNetwork&,
const ExtensionManager::Ptr&, WeightsSharing::Ptr&);
const ExtensionManager::Ptr&, WeightsSharing::Ptr&, const std::shared_ptr<std::mutex>& mutex);
void Graph::Replicate(const std::shared_ptr<const ov::Model> &subgraph, const ExtensionManager::Ptr& extMgr) {
this->_name = "subgraph";
@ -153,7 +155,9 @@ void Graph::Replicate(const std::shared_ptr<const ov::Model> &subgraph, const Ex
if (isQuantized()) {
node->setQuantizedGraphFlag(true);
}
node->setRuntimeCache(rtParamsCache);
node->setSharedMutex(sharedMutex);
graphNodes.push_back(node);
@ -265,7 +269,10 @@ void Graph::Replicate(const CNNNetwork &network, const ExtensionManager::Ptr& ex
if (isQuantized()) {
node->setQuantizedGraphFlag(true);
}
node->setRuntimeCache(rtParamsCache);
node->setSharedMutex(sharedMutex);
graphNodes.push_back(node);
if (op->get_type_info() == ngraph::op::v0::Parameter::get_type_info_static()) {

View File

@ -53,7 +53,8 @@ public:
template<typename NET>
void CreateGraph(NET &network,
const ExtensionManager::Ptr& extMgr,
WeightsSharing::Ptr &w_cache);
WeightsSharing::Ptr &w_cache,
const std::shared_ptr<std::mutex>& mutex);
void CreateGraph(const std::vector<NodePtr> &graphNodes,
const std::vector<EdgePtr> &graphEdges,
@ -262,6 +263,7 @@ private:
std::vector<NodePtr> executableGraphNodes;
MultiCachePtr rtParamsCache;
std::shared_ptr<std::mutex> sharedMutex = nullptr;
void EnforceBF16();
};

View File

@ -2,7 +2,9 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets_mark_skipped.hpp"
#include <snippets/pass/collapse_subgraph.hpp>
#include "snippets/pass/collapse_subgraph.hpp"
#include "snippets/op/subgraph.hpp"
#include "snippets/utils.hpp"
#include <ngraph/opsets/opset1.hpp>
#include <utils/general_utils.h>
#include <utils/cpu_utils.hpp>
@ -15,6 +17,7 @@ namespace ov {
namespace intel_cpu {
namespace {
static const int DEFAULT_AXIS = 1;
NodeFusingType GetNodeFusingType(const std::shared_ptr<const Node> &node) {
auto &rt = node->get_rt_info();
const auto rinfo = rt.find("MayBeFusedInPlugin");
@ -110,13 +113,18 @@ bool canBePerformedAsScaleShift(const std::shared_ptr<const Node> &node, const i
isBroadcastableToDataInput();
}
bool SupportsFusingWithConvolution_Simple(const std::shared_ptr<const Node> &node, const int channelAxis = 1) {
// MatMul can be executed in INT8 when the first input is i8 or u8
// and the second input is i8.
inline bool canBeMatMulExecutedInInt8(const ov::element::Type& firstType, const ov::element::Type& secondType) {
    const bool first_is_int8 = (firstType == ov::element::i8) || (firstType == ov::element::u8);
    const bool second_is_i8 = secondType == ov::element::i8;
    return first_is_int8 && second_is_i8;
}
bool SupportsFusingWithConvolution_Simple(const std::shared_ptr<const Node> &node, const int channelAxis = DEFAULT_AXIS) {
return SupportsFusingWithConvolution_SumActivation(node) ||
ov::is_type<ngraph::op::Tanh>(node) ||
ov::is_type<ngraph::op::v0::Gelu>(node) ||
ov::is_type<ngraph::op::v7::Gelu>(node) ||
ov::is_type<ngraph::op::Abs>(node) ||
ov::is_type<ngraph::op::Sqrt>(node) ||
ov::is_type<ngraph::op::FakeQuantize>(node) ||
canBePerformedAsScaleShift(node, channelAxis);
}
// Convolution is a special case, since it supports peculiar fusings
@ -136,7 +144,7 @@ bool isSuitableBinaryConvolutionParent(const std::shared_ptr<const Node> &node)
return is_suitable_node && has_only_child;
}
int getChannelAxis(const ov::AxisSet &axes, bool keep_dims) {
int channelAxis = 1;
int channelAxis = DEFAULT_AXIS;
if (!keep_dims) {
for (auto &axis : axes) {
if (axis == 1) {
@ -150,7 +158,7 @@ int getChannelAxis(const ov::AxisSet &axes, bool keep_dims) {
}
return channelAxis;
}
bool isSuitableMiscParent(const std::shared_ptr<const Node> &node, int &channelAxis) {
bool isSuitableMiscParent(const std::shared_ptr<const Node> &node) {
const bool is_suitable_node = ov::is_type<ngraph::op::v0::MVN>(node) ||
ov::is_type<ngraph::op::v6::MVN>(node) ||
ov::is_type<ngraph::op::v0::NormalizeL2>(node) ||
@ -160,13 +168,8 @@ bool isSuitableMiscParent(const std::shared_ptr<const Node> &node, int &channelA
ov::is_type<ngraph::op::v4::LSTMCell>(node) ||
ov::is_type<ngraph::opset1::ConvolutionBackpropData>(node) ||
ov::is_type<ngraph::op::util::ArithmeticReductionKeepDims>(node) ||
ov::is_type<ngraph::op::util::LogicalReductionKeepDims>(node) ||
ov::is_type<ngraph::opset1::GroupConvolutionBackpropData>(node);
if (const auto reduce = std::dynamic_pointer_cast<const ngraph::op::util::ArithmeticReductionKeepDims>(node)) {
channelAxis = getChannelAxis(reduce->get_reduction_axes(), reduce->get_keep_dims());
} else if (const auto reduce = std::dynamic_pointer_cast<const ngraph::op::util::LogicalReductionKeepDims>(node)) {
channelAxis = getChannelAxis(reduce->get_reduction_axes(), reduce->get_keep_dims());
}
ov::is_type<ngraph::opset1::GroupConvolutionBackpropData>(node) ||
ov::is_type<ngraph::opset1::AvgPool>(node);
// has a single output, connected to a single child
const auto out = node->outputs();
const bool has_only_child = (out.size() == 1) && (out[0].get_target_inputs().size() == 1);
@ -180,6 +183,13 @@ bool isSuitableMatMulParent(const std::shared_ptr<const Node> &node) {
const bool has_only_child = (out.size() == 1) && (out[0].get_target_inputs().size() == 1);
return is_suitable_node && has_only_child;
}
// From Reduce::canFuse() corner case. CanFuseSimpleOperation is covered by Misc.
inline bool isSuitableReduceParent(const std::shared_ptr<const Node> &node) {
    // Only arithmetic reductions that also pass the generic "misc parent" checks qualify.
    if (!ov::is_type<ov::op::util::ArithmeticReductionKeepDims>(node) || !isSuitableMiscParent(node))
        return false;
    // ReduceMax/ReduceMin are excluded, and the output must stay in f32.
    if (ov::is_type<ov::op::v1::ReduceMax>(node) || ov::is_type<ov::op::v1::ReduceMin>(node))
        return false;
    return node->get_output_element_type(0) == ov::element::f32;
}
// Subtract as ZeroPoints for Convolution
bool isSuitableSubtractAsZeroPointsParent(const std::shared_ptr<const Node> &node) {
const bool is_suitable_node = ov::is_type<ngraph::op::v1::Subtract>(node);
@ -197,21 +207,24 @@ bool isSuitableSubtractAsZeroPointsParent(const std::shared_ptr<const Node> &nod
const auto weight_shape = child->get_input_shape(1);
const bool is_depthwise = is_group_conv && weight_shape[1] == 1 && weight_shape[2] == 1;
const bool deptwise_is_suitable = implication(is_depthwise, child->get_input_shape(0).size() < 5);
if (!(is_conv && deptwise_is_suitable))
if (!deptwise_is_suitable)
return false;
const bool first_input_is_suitable = node->get_input_node_shared_ptr(0)->get_output_element_type(0) == ov::element::u8;
const auto zp_weights = node->get_input_node_shared_ptr(1);
const auto zp_weight_shape = zp_weights->get_output_shape(0);
bool second_input_is_suitable =
ov::is_type<ngraph::op::v0::Constant>(zp_weights) &&
zp_weights->get_output_element_type(0) == ov::element::u8 &&
zp_weight_shape.size() >= 2;
if (!(first_input_is_suitable && second_input_is_suitable))
return false;
auto correct_shape = ov::Shape(zp_weight_shape.size(), 1);
correct_shape[1] = zp_weight_shape[1];
return correct_shape == zp_weight_shape;
if (zp_weight_shape.size() > 1)
correct_shape[1] = zp_weight_shape[1];
const bool zp_weights_is_suitable = ov::is_type<ov::op::v0::Constant>(zp_weights) &&
zp_weights->get_element_type() == ov::element::u8 &&
zp_weight_shape.size() >= 2 && correct_shape == zp_weight_shape;
const bool first_conv_input_is_suitable = node->get_input_element_type(0) == ov::element::u8 &&
zp_weights_is_suitable;
const auto conv_weights = child->get_input_node_shared_ptr(1);
bool second_conv_input_is_suitable = ov::is_type<ngraph::op::v0::Constant>(conv_weights) &&
conv_weights->get_output_element_type(0) == ov::element::i8;
return first_conv_input_is_suitable && second_conv_input_is_suitable;
}
bool isSuitablePoolChild(const std::shared_ptr<const Node> &node) {
const bool is_suitable_node = ov::is_type<ngraph::op::v1::MaxPool>(node);
@ -220,11 +233,12 @@ bool isSuitablePoolChild(const std::shared_ptr<const Node> &node) {
const bool has_only_child = (out.size() == 1) && (out[0].get_target_inputs().size() == 1);
return is_suitable_node && has_only_child;
}
bool isSuitableChildForFusingSimple(const std::shared_ptr<const Node> &node, int channelAxis = 1) {
bool isSuitableChildForFusingSimple(const std::shared_ptr<const Node> &node, const int channelAxis = DEFAULT_AXIS) {
// Note: Fusing child is allowed to have several users, but that must be the end of the chain
return SupportsFusingWithConvolution_Simple(node, channelAxis) && getNumNonConstInputs(node) == 1;
}
bool isSuitableChildForFusingMatMul(const std::shared_ptr<const Node> &node, NodeFusingType &updatedChainType) {
bool isSuitableChildForFusingMatMul(const std::shared_ptr<const Node> &node, const bool canMatMulBeExecutedInI8,
NodeFusingType &updatedChainType, int& fusingAxis) {
int num_non_const_inputs = 0;
bool can_be_converted_to_FC = false;
ov::Shape bias_shape;
@ -255,52 +269,66 @@ bool isSuitableChildForFusingMatMul(const std::shared_ptr<const Node> &node, Nod
if (num_non_const_inputs != 1)
return false;
// Matmul / FC bias fusion
if (ov::is_type<ngraph::opset1::Add>(node) &&
bias_shape.back() == matmul_shape.back() &&
bias_shape.back() == shape_size(bias_shape)) {
return true;
}
// FuseMatMulAndSimpleOperation or FuseFullyConnectedAndSimpleOperation
// Invoke SupportsFusingWithConvolution_Simple directly instead of isSuitableChildForFusingSimple to
// eliminate getNumNonConstInputs() check
int fusingAxis = can_be_converted_to_FC ? (matmul_shape.size() == 3 ? 2 : 1) : matmul_shape.size() - 1;
fusingAxis = can_be_converted_to_FC ? (matmul_shape.size() == 3 ? 2 : 1) : matmul_shape.size() - 1;
if (SupportsFusingWithConvolution_Simple(node, fusingAxis)) {
updatedChainType = NodeFusingType::FusedWithMisc;
return true;
}
// canFuse() from MatMul for case with rank > 2
// Algorithm::EltwisePowerStatic is ignored
if (!can_be_converted_to_FC &&
node->get_output_shape(0).size() > 2) {
if (ov::is_type<ov::op::v1::Add>(node) ||
ov::is_type<ov::op::v1::Multiply>(node) ||
ov::is_type<ov::op::v1::Subtract>(node) ||
ov::is_type<ov::op::v1::Divide>(node) ||
ov::is_type<ov::op::v0::PRelu>(node)) {
const auto const1 = ov::is_type<ov::op::v0::Constant>(node->get_input_node_shared_ptr(0));
const auto const2 = ov::is_type<ov::op::v0::Constant>(node->get_input_node_shared_ptr(1));
int constPort = -1;
if (const2) {
constPort = 1;
} else if (const1) {
constPort = 0;
}
// MatMul specific checks from ::canFuse()
if (!can_be_converted_to_FC) {
// can with rank() > 2
// Algorithm::EltwisePowerStatic is ignored
if (node->get_output_shape(0).size() > 2) {
if (ov::is_type<ov::op::v1::Add>(node) ||
ov::is_type<ov::op::v1::Multiply>(node) ||
ov::is_type<ov::op::v1::Subtract>(node) ||
ov::is_type<ov::op::v1::Divide>(node) ||
ov::is_type<ov::op::v0::PRelu>(node)) {
const auto const1 = ov::is_type<ov::op::v0::Constant>(node->get_input_node_shared_ptr(0));
const auto const2 = ov::is_type<ov::op::v0::Constant>(node->get_input_node_shared_ptr(1));
int constPort = -1;
if (const2) {
constPort = 1;
} else if (const1) {
constPort = 0;
}
if (constPort != -1) {
auto const_shape = node->get_input_shape(constPort);
if (ov::shape_size(const_shape) != 1) {
if (constPort != -1) {
auto const_shape = node->get_input_shape(constPort);
if (ov::shape_size(const_shape) != 1) {
return false;
}
}
} else if (ov::is_type<ov::op::v0::FakeQuantize>(node)) {
const bool is_per_tensor_broadcasting = ngraph::snippets::utils::is_scalar_constant(node->get_input_node_shared_ptr(1)) &&
ngraph::snippets::utils::is_scalar_constant(node->get_input_node_shared_ptr(2)) &&
ngraph::snippets::utils::is_scalar_constant(node->get_input_node_shared_ptr(3)) &&
ngraph::snippets::utils::is_scalar_constant(node->get_input_node_shared_ptr(4));
if (!is_per_tensor_broadcasting) {
return false;
}
}
}
// specific case for FQ
if (ov::is_type<ov::op::v0::FakeQuantize>(node)) {
if (one_of(node->get_output_element_type(0), ov::element::i8, ov::element::u8) && canMatMulBeExecutedInI8) {
return false;
}
}
}
// FullyConnectedBiasFusion
if (!(can_be_converted_to_FC && ov::is_type<ngraph::opset1::Add>(node) &&
bias_shape.back() == matmul_shape.back() &&
bias_shape.back() == shape_size(bias_shape))) {
return false;
}
// Fusing chain must be interrupted after the node, since reshape will be inserted
if (bias_shape.size() >= 2)
updatedChainType = NodeFusingType::FusedTerminator;
return true;
}
bool isSuitableParentForFusingSumActivation(const std::shared_ptr<const Node> &node) {
@ -334,11 +362,21 @@ bool isSuitableParentForFusingSumActivation(const std::shared_ptr<const Node> &n
}
return true;
};
auto isFusedFQNode = [&isFusedBiasNode](std::shared_ptr<Node> n) {
if (!(ov::is_type<ngraph::op::v0::FakeQuantize>(n) &&
GetNodeFusingType(n) == NodeFusingType::FusedWithConvolution))
return false;
const auto& parent = n->get_input_node_shared_ptr(0);
const bool is_suitable_parent = isSuitableConvolutionParent(parent)
|| isFusedBiasNode(parent)
|| (GetNodeFusingType(parent) == NodeFusingType::FusedWithConvolution);
return is_suitable_parent;
};
int num_conv_parents = 0;
for (size_t i = 0; i < node->get_input_size(); i++) {
const auto n = node->get_input_node_shared_ptr(i);
//BinaryConvolution allows other ops to be fused before the Add, while Convolution doesn't
num_conv_parents += (isSuitableConvolutionParent(n) || isFusedBiasNode(n) ||
num_conv_parents += (isSuitableConvolutionParent(n) || isFusedBiasNode(n) || isFusedFQNode(n) ||
GetNodeFusingType(n) == NodeFusingType::FusedWithBinaryConvolution);
}
return getNumNonConstInputs(node) == 2 && num_conv_parents >=1;
@ -346,6 +384,9 @@ bool isSuitableParentForFusingSumActivation(const std::shared_ptr<const Node> &n
bool isSuitableChildForFusingSumActivation(const std::shared_ptr<const Node> &node) {
return SupportsFusingWithConvolution_SumActivation(node);
}
// A Reduce fusing chain may only continue with a child whose output stays in
// f32 and which passes the generic "simple fusing" child checks.
bool isSuitableReduceChild(const std::shared_ptr<const Node> &node, const int channelAxis = DEFAULT_AXIS) {
    if (node->get_output_element_type(0) != ov::element::f32)
        return false;
    return isSuitableChildForFusingSimple(node, channelAxis);
}
// Continue fusing chain of the passed type if the node has one child
// Otherwise mark node as FusedTerminator (Fused, but fusing chain is interrupted)
void PropagateIfHasOnlyChild(const std::shared_ptr<Node> &node, NodeFusingType nodeType) {
@ -378,59 +419,77 @@ void MarkSubgraphOpAsSkipped(const std::shared_ptr<Node> &node) {
bool SnippetsMarkSkipped::run_on_model(const std::shared_ptr<ov::Model> &m) {
RUN_ON_MODEL_SCOPE(SnippetsMarkSkipped);
int channelAxis = 1;
int channelAxis = DEFAULT_AXIS;
for (auto &node : m->get_ordered_ops()) {
if (ngraph::op::is_constant(node))
continue;
if (ngraph::op::is_parameter(node)) {
SetNodeFusingType(node, NodeFusingType::IgnoredAfterInputs);
continue;
} else if (isSuitableConvolutionParent(node)) {
// Initiate fusing chain
SetNodeFusingType(node, NodeFusingType::FusedWithConvolution);
continue;
channelAxis = DEFAULT_AXIS;
} else if (isSuitableBinaryConvolutionParent(node)) {
SetNodeFusingType(node, NodeFusingType::FusedWithBinaryConvolution);
continue;
} else if (isSuitableMiscParent(node, channelAxis)) {
channelAxis = DEFAULT_AXIS;
} else if (isSuitableReduceParent(node)) {
const auto reduce = std::dynamic_pointer_cast<const ngraph::op::util::ArithmeticReductionKeepDims>(node);
channelAxis = getChannelAxis(reduce->get_reduction_axes(), reduce->get_keep_dims());
SetNodeFusingType(node, NodeFusingType::FusedWithReduce);
} else if (isSuitableMiscParent(node)) {
if (const auto reduce = std::dynamic_pointer_cast<const ngraph::op::util::ArithmeticReductionKeepDims>(node)) {
channelAxis = getChannelAxis(reduce->get_reduction_axes(), reduce->get_keep_dims());
} else {
channelAxis = DEFAULT_AXIS;
}
SetNodeFusingType(node, NodeFusingType::FusedWithMisc);
continue;
} else if (isSuitableMatMulParent(node)) {
SetNodeFusingType(node, NodeFusingType::FusedWithMatMul);
continue;
if (canBeMatMulExecutedInInt8(node->get_input_element_type(0), node->get_input_element_type(1)))
SetNodeFusingType(node, NodeFusingType::FusedWithMatMulI8);
else
SetNodeFusingType(node, NodeFusingType::FusedWithMatMul);
channelAxis = DEFAULT_AXIS;
} else if (isSuitableSubtractAsZeroPointsParent(node)) {
SetSnippetsNodeType(node, snippets::pass::SnippetsNodeType::SkippedByPlugin);
continue;
}
for (const auto fusingChainType : getContinuableChains(node)) {
if (isSuitableChildForFusingSimple(node, channelAxis)) {
PropagateIfHasOnlyChild(node, fusingChainType);
} else if (fusingChainType == NodeFusingType::FusedWithConvolution ||
fusingChainType == NodeFusingType::FusedWithBinaryConvolution) {
if (isSuitableParentForFusingSumActivation(node)) {
PropagateIfHasOnlyChild(node, NodeFusingType::FusedWithConvolutionSumActivation);
// Mimic FuseConvolutionAndSimpleOperationThroughMaxPool
} else if (isSuitablePoolChild(node)) {
channelAxis = DEFAULT_AXIS;
} else {
for (const auto fusingChainType : getContinuableChains(node)) {
if (fusingChainType == NodeFusingType::FusedWithReduce) {
if (isSuitableReduceChild(node, channelAxis))
PropagateIfHasOnlyChild(node, fusingChainType);
} else if (isSuitableChildForFusingSimple(node, channelAxis)) {
PropagateIfHasOnlyChild(node, fusingChainType);
} else if (fusingChainType == NodeFusingType::FusedWithConvolution ||
fusingChainType == NodeFusingType::FusedWithBinaryConvolution) {
if (isSuitableParentForFusingSumActivation(node)) {
PropagateIfHasOnlyChild(node, NodeFusingType::FusedWithConvolutionSumActivation);
// Mimic FuseConvolutionAndSimpleOperationThroughMaxPool
} else if (isSuitablePoolChild(node)) {
PropagateIfHasOnlyChild(node, fusingChainType);
}
} else if (fusingChainType == NodeFusingType::FusedWithConvolutionSumActivation &&
isSuitableChildForFusingSumActivation(node)) {
// Todo: Chain could be converted from FusedWithBinaryConvolution to FusedWithConvolution at this point
// Set FusedWithConvolution, so the fusing chain could be propagated
PropagateIfHasOnlyChild(node, NodeFusingType::FusedWithConvolution);
} else if (fusingChainType == NodeFusingType::FusedWithMatMul ||
fusingChainType == NodeFusingType::FusedWithMatMulI8) {
const bool isExecutedInINT8 = fusingChainType == NodeFusingType::FusedWithMatMulI8;
// Handle fusings for both MatMul and FullyConnected
NodeFusingType updatedChainType = fusingChainType;
if (isSuitableChildForFusingMatMul(node, isExecutedInINT8, updatedChainType, channelAxis))
PropagateIfHasOnlyChild(node, updatedChainType);
} else if (fusingChainType == NodeFusingType::IgnoredAfterInputs && (snippets::pass::AppropriateForSubgraph(node) ||
ov::is_type<ngraph::op::v0::Convert>(node) || ov::is_type<ngraph::op::v1::Transpose>(node))) {
// In OV API 2.0, a Convert node is inserted after an Input node with I8/U8 precision; moreover, on TF models a
// Transpose layer is inserted as well. This breaks the idea of leaving an Eltwise node with I8/U8 inputs and FP32 outputs instead of a Subgraph node
// TODO Remove the additional check on Convert/Transpose here after enabling Subgraphs with I8/U8 inputs and FP32 outputs
SetNodeFusingType(node, NodeFusingType::IgnoredAfterInputs);
}
} else if (fusingChainType == NodeFusingType::FusedWithConvolutionSumActivation &&
isSuitableChildForFusingSumActivation(node)) {
// Todo: Chain could be converted from FusedWithBinaryConvolution to FusedWithConvolution at this point
// Set FusedWithConvolution, so the fusing chain could be propagated
PropagateIfHasOnlyChild(node, NodeFusingType::FusedWithConvolution);
} else if (fusingChainType == NodeFusingType::FusedWithMatMul) {
// Handle fusings for both MatMul and FullyConnected
NodeFusingType updatedChainType = fusingChainType;
if (isSuitableChildForFusingMatMul(node, updatedChainType))
PropagateIfHasOnlyChild(node, updatedChainType);
} else if (fusingChainType == NodeFusingType::IgnoredAfterInputs && (snippets::pass::AppropriateForSubgraph(node) ||
ov::is_type<ngraph::op::v0::Convert>(node) || ov::is_type<ngraph::op::v1::Transpose>(node))) {
// In OV API 2.0, a Convert node is inserted after an Input node with I8/U8 precision; moreover, on TF models a
// Transpose layer is inserted as well. This breaks the idea of leaving an Eltwise node with I8/U8 inputs and FP32 outputs instead of a Subgraph node
// TODO Remove the additional check on Convert/Transpose here after enabling Subgraphs with I8/U8 inputs and FP32 outputs
SetNodeFusingType(node, NodeFusingType::IgnoredAfterInputs);
}
}
if (GetNodeFusingType(node) != NodeFusingType::NotSet) {
SetSnippetsNodeType(node, snippets::pass::SnippetsNodeType::SkippedByPlugin);
} else {

View File

@ -37,7 +37,7 @@ enum class NodeFusingType : int64_t {
NotSet,
FusedTerminator,
FusedWithConvolution, FusedWithBinaryConvolution, FusedWithConvolutionSumActivation,
FusedWithMatMul, FusedWithMisc, IgnoredAfterInputs};
FusedWithMatMul, FusedWithMatMulI8, FusedWithReduce, FusedWithMisc, IgnoredAfterInputs};
} // namespace intel_cpu
} // namespace ov

View File

@ -573,6 +573,10 @@ public:
rtParamsCache = cache;
}
void setSharedMutex(const std::shared_ptr<std::mutex>& mutex) {
sharedMutex = mutex;
}
protected:
bool canFuseSimpleOperation(const NodePtr& node) const;
@ -747,6 +751,8 @@ protected:
std::shared_ptr<IShapeInfer> shapeInference;
std::shared_ptr<std::mutex> sharedMutex = nullptr;
private:
std::vector<EdgeWeakPtr> parentEdges;
std::vector<EdgeWeakPtr> childEdges;

View File

@ -70,8 +70,8 @@ void If::getSupportedDescriptors() {
const std::shared_ptr<const ov::Model>& thenBody = ifOp->get_then_body();
const std::shared_ptr<const ov::Model>& elseBody = ifOp->get_else_body();
subGraphThen.CreateGraph(thenBody, ext_mng, weightCache);
subGraphElse.CreateGraph(elseBody, ext_mng, weightCache);
subGraphThen.CreateGraph(thenBody, ext_mng, weightCache, sharedMutex);
subGraphElse.CreateGraph(elseBody, ext_mng, weightCache, sharedMutex);
const auto &inMapThen = subGraphThen.GetInputNodesMap();
for (const auto &param : ifOp->get_then_body()->get_parameters()) {

View File

@ -20,9 +20,12 @@
#include <ngraph/rt_info.hpp>
#include <ie_ngraph_utils.hpp>
#include <shared_mutex>
#include <snippets/op/subgraph.hpp>
#include "emitters/cpu_generator.hpp"
#include "snippets_transformations/fuse_load_store_and_convert.hpp"
#include "ngraph_transformations/convert_to_swish_cpu.hpp"
using namespace InferenceEngine;
using namespace dnnl::impl::utils;
@ -34,30 +37,42 @@ namespace ov {
namespace intel_cpu {
namespace node {
Snippet::Snippet(const std::shared_ptr<ngraph::Node>& op, const dnnl::engine& eng, WeightsSharing::Ptr &cache)
: Node(op, eng, cache) {
host_isa = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core) ?
dnnl::impl::cpu::x64::avx512_core : dnnl::impl::cpu::x64::avx2;
// Create a deep local copy of the input snippet to perform canonicalization & code generation
// Todo: Probably better to implement a proper copy constructor
if (const auto tmp_snippet = ov::as_type_ptr<ngraph::snippets::op::Subgraph>(op)) {
ngraph::OutputVector subgraph_node_inputs;
for (const auto &input : tmp_snippet->input_values()) {
auto new_input = std::make_shared<ngraph::opset1::Parameter>(input.get_element_type(), input.get_partial_shape());
subgraph_node_inputs.push_back(new_input);
}
auto new_body = ov::clone_model(*tmp_snippet->get_body().get());
snippet = std::make_shared<ngraph::snippets::op::Subgraph>(subgraph_node_inputs, new_body);
ngraph::copy_runtime_info(tmp_snippet, snippet);
snippet->set_friendly_name(tmp_snippet->get_friendly_name());
snippet->set_generator(std::make_shared<CPUGenerator>(host_isa));
} else {
original_snippet = ov::as_type_ptr<ngraph::snippets::op::Subgraph>(op);
if (!original_snippet) {
IE_THROW(NotImplemented) << "Node is not an instance of snippets::op::Subgraph";
}
}
void Snippet::copy_snippet() {
ngraph::OutputVector subgraph_node_inputs;
for (const auto &input : original_snippet->input_values()) {
auto new_input = std::make_shared<ngraph::opset1::Parameter>(input.get_element_type(), input.get_partial_shape());
subgraph_node_inputs.push_back(new_input);
}
std::shared_ptr<ov::Model> new_body = nullptr;
// Ticket[79554]: TypeRelaxed ops aren't thread safe so we use mutex to avoid collision in throughput mode
if (original_snippet->has_type_relaxed_ops()) {
if (!sharedMutex) {
IE_THROW() << "Subgraph doesn't have shared mutex";
}
std::lock_guard<std::mutex> lock(*sharedMutex.get());
new_body = ov::clone_model(*original_snippet->get_body().get());
} else {
new_body = ov::clone_model(*original_snippet->get_body().get());
}
snippet = std::make_shared<ngraph::snippets::op::Subgraph>(subgraph_node_inputs, new_body);
ngraph::copy_runtime_info(original_snippet, snippet);
snippet->set_friendly_name(original_snippet->get_friendly_name());
snippet->set_generator(std::make_shared<CPUGenerator>(host_isa));
}
void Snippet::initSupportedPrimitiveDescriptors() {
copy_snippet();
if (!supportedPrimitiveDescriptors.empty())
return;
@ -488,6 +503,7 @@ void Snippet::generate() {
ov::pass::Manager optManager;
optManager.register_pass<ov::intel_cpu::pass::FuseLoadConvert>();
optManager.register_pass<ov::intel_cpu::pass::FuseStoreConvert>();
optManager.register_pass<ConvertToSwishCPU>();
// LoadConvert uses Load emitter that support conversion from any type to only f32
optManager.get_pass_config()->set_callback<ov::intel_cpu::pass::FuseLoadConvert>(

View File

@ -32,6 +32,10 @@ public:
void selectOptimalPrimitiveDescriptor() override;
InferenceEngine::Precision getRuntimePrecision() const override;
// to avoid collisions in throughput mode with copy of TypeRelaxed nodes
// we should have common shared mutex between streams
void setSharedMutex(const std::shared_ptr<std::mutex>& mutex);
// Here we convert to canonical for & jit everything
void createPrimitive() override;
@ -46,6 +50,11 @@ private:
typedef void (*kernel)(const void *, const void *);
// Create a deep local copy of the input snippet to perform canonicalization & code generation
// TODO: Probably better to implement a proper copy constructor
// NOTE: Before call mutex should be initialized
void copy_snippet();
void define_schedule();
void generate();
@ -54,6 +63,8 @@ private:
void schedule_6d(const jit_snippets_call_args& const_args) const;
void schedule_nt(const jit_snippets_call_args& const_args) const;
// Original subgraph node
std::shared_ptr<ngraph::snippets::op::Subgraph> original_snippet;
// Local copy of subgraph node for canonization & code generation
std::shared_ptr<ngraph::snippets::op::Subgraph> snippet;

View File

@ -363,7 +363,7 @@ void TensorIterator::getSupportedDescriptors() {
THROW_ERROR << "cannot be cast to ov::op::util::SubGraphOp";
}
const std::shared_ptr<const ov::Model> body = tiOp->get_function();
sub_graph.CreateGraph(body, ext_mng, weightCache);
sub_graph.CreateGraph(body, ext_mng, weightCache, sharedMutex);
const auto &inMap = sub_graph.GetInputNodesMap();
for (const auto &param : tiOp->get_function()->get_parameters()) {

View File

@ -82,6 +82,8 @@
#include <transformations/op_conversions/fq_decomposition.hpp>
#include <transformations/utils/utils.hpp>
#include <snippets/pass/collapse_subgraph.hpp>
#include <snippets/pass/common_optimizations.hpp>
#include <snippets/pass/convert_constants.hpp>
#include "ngraph_transformations/snippets_mark_skipped.hpp"
#include <transformations/op_conversions/convert_roi_align_v9_to_v3.hpp>
#include <transformations/op_conversions/convert_roi_align_v3_to_v9.hpp>
@ -579,20 +581,12 @@ static void TransformationUpToCPUSpecificOpSet(std::shared_ptr<ngraph::Function>
}
ngraph::pass::Manager postLPTPassManager;
postLPTPassManager.register_pass<ngraph::pass::FakeQuantizeDecomposition>();
postLPTPassManager.register_pass<ngraph::pass::UnrollTensorIterator>();
postLPTPassManager.register_pass<ReshapePRelu>();
postLPTPassManager.get_pass_config()->set_callback<ngraph::pass::FakeQuantizeDecomposition>([](const_node_ptr &node) -> bool {
std::string errMsg;
return node::FakeQuantize::isSupportedOperation(node, errMsg);
});
postLPTPassManager.get_pass_config()->set_callback<ngraph::pass::UnrollTensorIterator>([](const_node_ptr &node) -> bool {
// UnrollTI transformation is disabled by default, is turned on by LowLatency transformation
return node->get_rt_info().count("UNROLL_TI") == 0;
});
postLPTPassManager.register_pass<MoveEltwiseUpThroughDataMov>();
postLPTPassManager.get_pass_config()->set_callback<MoveEltwiseUpThroughDataMov>([](const std::shared_ptr<const ngraph::Node>& node) -> bool {
if (node->get_input_size() >= 2) {
@ -625,13 +619,19 @@ static void TransformationUpToCPUSpecificOpSet(std::shared_ptr<ngraph::Function>
});
postLPTPassManager.run_passes(nGraphFunc);
if (!useLpt && _enableSnippets && dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2)) {
ngraph::pass::Manager tokenization_manager;
tokenization_manager.register_pass<SnippetsMarkSkipped>();
tokenization_manager.register_pass<ngraph::snippets::pass::EnumerateNodes>();
tokenization_manager.register_pass<ngraph::snippets::pass::TokenizeSnippets>();
tokenization_manager.get_pass_config()->set_callback<ngraph::snippets::pass::TokenizeSnippets>(
if (_enableSnippets && dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2)) {
ngraph::pass::Manager snippetsManager;
snippetsManager.register_pass<SnippetsMarkSkipped>();
snippetsManager.register_pass<ngraph::snippets::pass::EnumerateNodes>();
snippetsManager.register_pass<ngraph::snippets::pass::TokenizeSnippets>();
snippetsManager.get_pass_config()->set_callback<ngraph::snippets::pass::TokenizeSnippets>(
[](const std::shared_ptr<const ov::Node>& n) -> bool {
// CPU Plugin supports Swish in Subgraph via conversion to SwishCPU, which assumes the second input to be constant
if (ov::is_type<const ov::op::v4::Swish>(n)) {
if (n->inputs().size() > 1 && !ov::is_type<const ov::op::v0::Constant>(n->get_input_node_shared_ptr(1)))
return true;
}
const auto& inputs = n->inputs();
// todo: clarify whether we can evaluate snippets on const paths
const bool has_only_const_inputs = std::all_of(inputs.begin(), inputs.end(),
@ -650,8 +650,18 @@ static void TransformationUpToCPUSpecificOpSet(std::shared_ptr<ngraph::Function>
[&](const ov::Output<const ov::Node>& out) {return rank_is_too_large(out.get_tensor());});
return has_only_const_inputs || bad_input_rank || bad_output_rank;
});
tokenization_manager.run_passes(nGraphFunc);
snippetsManager.register_pass<ngraph::snippets::pass::CommonOptimizations>();
snippetsManager.run_passes(nGraphFunc);
}
ngraph::pass::Manager postSnippetsManager;
postSnippetsManager.register_pass<ngraph::pass::FakeQuantizeDecomposition>();
postSnippetsManager.get_pass_config()->set_callback<ngraph::pass::FakeQuantizeDecomposition>([](const_node_ptr& node) -> bool {
std::string errMsg;
return node::FakeQuantize::isSupportedOperation(node, errMsg);
});
postSnippetsManager.register_pass<ngraph::pass::ConstantFolding>();
postSnippetsManager.run_passes(nGraphFunc);
}
static void Transformation(CNNNetwork& clonedNetwork, const bool _enableLPT, const bool _enableBF16, const bool _enableSnippets, const bool isLegacyApi) {

View File

@ -0,0 +1,131 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <vector>
#include "snippets/fake_quantize_decomposition_test.hpp"
using namespace LayerTestsDefinitions;
using namespace ngraph;
namespace {
namespace decompositionInSubgraph {
// FakeQuantize with scalar (empty-shape) interval constants: expected to be
// decomposed into element-wise operations inside a single Snippets Subgraph.
const std::vector<TestValues> testValuesDecompositionScalars = {
    {
        ov::element::f32,
        ngraph::Shape{1, 3, 16, 16},
        ov::element::f32,
        1.f,
        {{}, {}, {}, {}},
    },
};
// FakeQuantize with per-channel interval constants (shape {1, 3, 1, 1}):
// decomposition in a Subgraph with non-scalar constants.
const std::vector<TestValues> testValuesDecompositionPerChannel = {
    {
        ov::element::f32,
        ngraph::Shape{1, 3, 16, 16},
        ov::element::f32,
        1.f,
        {{1, 3, 1, 1}, {1, 3, 1, 1}, {1, 3, 1, 1}, {1, 3, 1, 1}},
    },
    {
        ov::element::f32,
        ngraph::Shape{1, 3, 16, 16},
        ov::element::f32,
        1.f,
        {{1, 3, 1, 1}, {1, 3, 1, 1}, {}, {}},
    },
};

// Element-wise operation inserted between the prerequisites and the FakeQuantize.
// Pair: operation prototype -> {expected runtime node type, expected originalLayersNames}.
std::vector<std::pair<std::shared_ptr<Node>, std::pair<std::string, std::string> >> operations = {
    {std::make_shared<opset1::Abs>(), {"Subgraph", "Abs,fakeQuantize"}},
    {std::make_shared<ngraph::op::v4::Swish>(), {"Subgraph", "Swish,fakeQuantize"}},
};

INSTANTIATE_TEST_SUITE_P(
    smoke_Snippets_FQDecomposition_Scalars,
    FakeQuantizeDecompositionTest,
    ::testing::Combine(
        ::testing::ValuesIn(testValuesDecompositionScalars),
        ::testing::ValuesIn(operations),
        // reorder (nChw[16|8]c) + MaxPool + Subgraph + reorder(nchw)
        ::testing::Values(std::pair<size_t, size_t>{4, 1}),
        ::testing::Values(CommonTestUtils::DEVICE_CPU)),
    FakeQuantizeDecompositionTest::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(
    smoke_Snippets_FQDecomposition_PerChannel,
    FakeQuantizeDecompositionTest,
    ::testing::Combine(
        ::testing::Values(testValuesDecompositionPerChannel[0]),
        ::testing::ValuesIn(operations),
        // reorder (nChw[16|8]c) + MaxPool + reorder(nChw[16|8]c) x6 + Subgraph + reorder(nchw)
        ::testing::Values(std::pair<size_t, size_t>{10, 1}),
        ::testing::Values(CommonTestUtils::DEVICE_CPU)),
    FakeQuantizeDecompositionTest::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(
    smoke_Snippets_FQDecomposition_PerChannel_Input,
    FakeQuantizeDecompositionTest,
    ::testing::Combine(
        ::testing::Values(testValuesDecompositionPerChannel[1]),
        ::testing::ValuesIn(operations),
        // reorder (nChw[16|8]c) + MaxPool + reorder(nChw[16|8]c) x4 + Subgraph + reorder(nchw)
        ::testing::Values(std::pair<size_t, size_t>{8, 1}),
        ::testing::Values(CommonTestUtils::DEVICE_CPU)),
    FakeQuantizeDecompositionTest::getTestCaseName);
} // namespace decompositionInSubgraph
namespace legacyFuse {
// FakeQuantize placed after a Convolution: expected to be fused by the legacy CPU
// fusing pipeline instead of being tokenized (expected subgraph count is 0 below).
const std::vector<TestValues> testValuesLegacyFuse = {
    {
        ov::element::f32,
        ngraph::Shape{1, 3, 16, 16},
        ov::element::f32,
        1.f,
        {{1, 3, 1, 1}, {1, 3, 1, 1}, {}, {}}
    },
    {
        ov::element::f32,
        ngraph::Shape{1, 3, 16, 16},
        ov::element::f32,
        1.f,
        {{}, {}, {1, 3, 1, 1}, {1, 3, 1, 1}}
    },
    {
        ov::element::f32,
        ngraph::Shape{1, 3, 16, 16},
        ov::element::f32,
        1.f,
        {{}, {}, {}, {}}
    },
    {
        ov::element::f32,
        ngraph::Shape{1, 3, 16, 16},
        ov::element::f32,
        1.f,
        {{1, 3, 1, 1}, {1, 3, 1, 1}, {1, 3, 1, 1}, {1, 3, 1, 1}}
    },
};

// Pair: operation prototype -> {expected runtime node type, expected originalLayersNames}.
std::vector<std::pair<std::shared_ptr<Node>, std::pair<std::string, std::string>>> operations = {
    {std::make_shared<opset1::Convolution>(), {"Convolution", "Convolution,fakeQuantize"}},
};

INSTANTIATE_TEST_SUITE_P(
    smoke_Snippets,
    FakeQuantizeDecompositionTest,
    ::testing::Combine(
        ::testing::ValuesIn(testValuesLegacyFuse),
        ::testing::ValuesIn(operations),
        // reorder (nChw[16|8]c) + MaxPool + reorder(nhwc) + reorder(ABcd16b16a) + Convolution + reorder(nchw)
        ::testing::Values(std::pair<size_t, size_t>{6, 0}),
        ::testing::Values(CommonTestUtils::DEVICE_CPU)),
    FakeQuantizeDecompositionTest::getTestCaseName);
} // namespace legacyFuse
} // namespace

View File

@ -0,0 +1,107 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <gtest/gtest.h>
#include "common_test_utils/ngraph_test_utils.hpp"
#include "snippets/pass/fq_decomposition.hpp"
#include "snippets/pass/collapse_subgraph.hpp"
#include "fake_quantize_function.hpp"
#include "snippets/op/subgraph.hpp"
#include "ngraph_transformations/snippets_mark_skipped.hpp"
#include "function_helper.hpp"
namespace ov {
namespace test {
namespace snippets {
class FakeQuantizeTokenizationTest : public TransformationTestsF {
public:
    // Registers the CPU snippets tokenization pipeline on the fixture's pass manager.
    // The TokenizeSnippets callback always returns false, i.e. tokenization is never
    // suppressed by a plugin-specific callback in these tests.
    void register_passes() {
        manager.register_pass<ov::intel_cpu::SnippetsMarkSkipped>();
        manager.register_pass<ngraph::snippets::pass::EnumerateNodes>();
        manager.register_pass<ngraph::snippets::pass::TokenizeSnippets>();
        manager.get_pass_config()->set_callback<ngraph::snippets::pass::TokenizeSnippets>([](const std::shared_ptr<const ov::Node>& n) -> bool {
            return false;
        });
    }

    // After the base class has run the passes and compared the whole graphs,
    // additionally compare the bodies of the tokenized Subgraph ops (if present).
    // Either both models contain a Subgraph, or neither does.
    void TearDown() override {
        TransformationTestsF::TearDown();

        const auto subgraph = FunctionHelper::getSubgraph(function);
        const auto snippet = std::dynamic_pointer_cast<ngraph::snippets::op::Subgraph>(subgraph);
        // Fix: the original dereferenced the dynamic_pointer_cast result without a
        // null check, so a non-Subgraph node returned by getSubgraph() crashed the test.
        ASSERT_TRUE(subgraph == nullptr || snippet != nullptr) << "found node is not a snippets Subgraph";
        const auto body = snippet == nullptr ? nullptr : snippet->get_body();

        const auto subgraph_ref = FunctionHelper::getSubgraph(function_ref);
        const auto snippet_ref = std::dynamic_pointer_cast<ngraph::snippets::op::Subgraph>(subgraph_ref);
        ASSERT_TRUE(subgraph_ref == nullptr || snippet_ref != nullptr) << "found reference node is not a snippets Subgraph";
        const auto body_ref = snippet_ref == nullptr ? nullptr : snippet_ref->get_body();

        if ((body != nullptr) && (body_ref != nullptr)) {
            auto res = comparator.compare(body, body_ref);
            ASSERT_TRUE(res.valid) << res.message;
        } else {
            ASSERT_EQ(nullptr, body);
            ASSERT_EQ(nullptr, body_ref);
        }
    }
};
TEST_F(FakeQuantizeTokenizationTest, smoke_Snippets_FakeQuantize_PerTensor) {
    // Actual model: prerequisites + FakeQuantize with scalar (per-tensor) interval constants.
    // NOTE(review): the literal `true` is passed where the shared test helper declares a
    // float zeroPoint (would convert to 1.0f) -- verify against the included header's signature.
    function = FakeQuantizeFunction::getOperationAndFakeQuantize(
        { {1, 3, 16, 16} },
        element::f32,
        { {}, {}, {}, {} },
        true,
        FunctionHelper::makePrerequisitesOriginal());

    // Reference model: the same FakeQuantize already wrapped into a snippets Subgraph.
    function_ref = FakeQuantizeFunction::getSubgraphWithFakeQuantize(
        { {1, 3, 16, 16} },
        element::f32,
        { {}, {}, {}, {} },
        true,
        FunctionHelper::makePrerequisitesOriginal());

    register_passes();
}
TEST_F(FakeQuantizeTokenizationTest, smoke_Snippets_FakeQuantize_PerChannels) {
    // Actual model: prerequisites + FakeQuantize with per-channel {1, 3, 1, 1} interval constants.
    // NOTE(review): the literal `true` is passed where the shared test helper declares a
    // float zeroPoint (would convert to 1.0f) -- verify against the included header's signature.
    function = FakeQuantizeFunction::getOperationAndFakeQuantize(
        { {1, 3, 16, 16} },
        element::f32,
        { {1, 3, 1, 1}, {1, 3, 1, 1}, {1, 3, 1, 1}, {1, 3, 1, 1} },
        true,
        FunctionHelper::makePrerequisitesOriginal());

    // Reference model: the same per-channel FakeQuantize already wrapped into a snippets Subgraph.
    function_ref = FakeQuantizeFunction::getSubgraphWithFakeQuantize(
        { {1, 3, 16, 16} },
        element::f32,
        { {1, 3, 1, 1}, {1, 3, 1, 1}, {1, 3, 1, 1}, {1, 3, 1, 1} },
        true,
        FunctionHelper::makePrerequisitesOriginal());

    register_passes();
}
TEST_F(FakeQuantizeTokenizationTest, smoke_Snippets_ConvolutionWithFakeQuantize) {
    // Actual model: Convolution followed by FakeQuantize.
    function = FakeQuantizeFunction::getOperationAndFakeQuantize(
        {{1, 3, 16, 16}},
        element::f32,
        {{}, {}, {}, {}},
        true,
        FunctionHelper::makePrerequisitesOriginal(),
        std::make_shared<ngraph::opset1::Convolution>());

    // Reference model is built with the same builder: tokenization is expected to
    // leave the model unchanged (no Subgraph created for a post-Convolution FakeQuantize),
    // so TearDown's Subgraph-body comparison takes the "both bodies are null" branch.
    function_ref = FakeQuantizeFunction::getOperationAndFakeQuantize(
        {{1, 3, 16, 16}},
        element::f32,
        {{}, {}, {}, {}},
        true,
        FunctionHelper::makePrerequisitesOriginal(),
        std::make_shared<ngraph::opset1::Convolution>());

    register_passes();
}
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -0,0 +1,50 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <tuple>
#include <string>
#include "shared_test_classes/base/layer_test_utils.hpp"
#include "ngraph_functions/utils/ngraph_helpers.hpp"
#include "ngraph_functions/builders.hpp"
#include "shared_test_classes/base/snippets_test_utils.hpp"
namespace LayerTestsDefinitions {

// Input/precision/interval configuration of a FakeQuantize decomposition case.
// NOTE(review): ActualValues and TestValues are field-for-field identical;
// consider merging them if ActualValues has no external users.
class ActualValues {
public:
    ov::element::Type modelType;                    // model (network) precision
    ngraph::Shape inputShape;                       // shape of the test model's input
    ov::element::Type inputType;                    // input tensor precision
    float zeroPoint;                                // initial value of the low interval constants
    std::vector<ngraph::Shape> fakeQuantizeShapes;  // {inputLow, inputHigh, outputLow, outputHigh} constant shapes
};

class TestValues {
public:
    ov::element::Type modelType;                    // model (network) precision
    ngraph::Shape inputShape;                       // shape of the test model's input
    ov::element::Type inputType;                    // input tensor precision
    float zeroPoint;                                // initial value of the low interval constants
    std::vector<ngraph::Shape> fakeQuantizeShapes;  // {inputLow, inputHigh, outputLow, outputHigh} constant shapes
};

// `using` alias (modernized from typedef) for the gtest parameter tuple.
using testsParams = std::tuple<
    TestValues,                                                                     // test values
    std::pair<std::shared_ptr<ngraph::Node>, std::pair<std::string, std::string>>,  // operation prototype -> {node type, originalLayersNames}
    std::pair<size_t, size_t>,                                                      // expected {number of nodes, number of subgraphs}
    std::string                                                                     // target device
>;

// Parameterized test: builds operation + FakeQuantize, runs it on CPU and checks
// how the FakeQuantize was decomposed/fused (see the .cpp for SetUp and the body).
class FakeQuantizeDecompositionTest : public testing::WithParamInterface<testsParams>, virtual public ov::test::SnippetsTestsCommon {
public:
    static std::string getTestCaseName(testing::TestParamInfo<testsParams> obj);

protected:
    void SetUp() override;
};
} // namespace LayerTestsDefinitions

View File

@ -0,0 +1,78 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/fake_quantize_decomposition_test.hpp"
#include <memory>
#include <tuple>
#include <vector>
#include <string>
#include <ie_core.hpp>
#include "ngraph_ops/type_relaxed.hpp"
#include "fake_quantize_function.hpp"
#include "function_helper.hpp"
namespace LayerTestsDefinitions {
// Builds a unique, human-readable test name from the parameter tuple.
std::string FakeQuantizeDecompositionTest::getTestCaseName(testing::TestParamInfo<testsParams> obj) {
    std::ostringstream result;
    const auto values = std::get<0>(obj.param);
    const auto operation = std::get<1>(obj.param);
    const auto targetDevice = std::get<3>(obj.param);
    // Note: std::get<2> (expected node/subgraph counts) intentionally does not
    // participate in the name; the original fetched it into an unused local.

    const auto type_info = operation.first->get_type_info();
    // A Parameter prototype means "no extra operation before FakeQuantize".
    const auto operationString = ngraph::is_type<ngraph::opset1::Parameter>(operation.first) ?
        "nullptr" :
        (std::string(type_info.name) + "_" + std::string(type_info.version_id));

    result << "IS=" << CommonTestUtils::vec2str(values.inputShape) << "_";
    result << "netPRC=" << values.modelType << "_";
    result << "D=" << targetDevice << "_";
    result << "IN=" << values.inputType << "_";
    result << "OP=" << operationString << "_";
    result << "ON1=" << std::string(operation.second.first) << "_";
    // Fix: the second originalLayersNames field was also labeled "ON1=",
    // producing a duplicated key in the generated test name.
    result << "ON2=" << std::string(operation.second.second) << "_";
    result << "LP=" << values.zeroPoint;
    result << "SH1=" << values.fakeQuantizeShapes[0] << "SH2=" << values.fakeQuantizeShapes[1]
           << "SH3=" << values.fakeQuantizeShapes[2] << "SH4=" << values.fakeQuantizeShapes[3];

    return result.str();
}
// Builds the test model (prerequisites + optional operation + FakeQuantize)
// from the gtest parameter tuple and records the expected node/subgraph counts.
void FakeQuantizeDecompositionTest::SetUp() {
    auto& testsParams = this->GetParam();

    const auto values = std::get<0>(testsParams);
    const auto operation = std::get<1>(testsParams);
    const auto operations_number = std::get<2>(testsParams);
    targetDevice = std::get<3>(testsParams);

    // Expected counts checked later by validateNumSubgraphs().
    ref_num_nodes = operations_number.first;
    ref_num_subgraphs = operations_number.second;

    // Static shape == dynamic shape: these tests run with a fixed input shape.
    init_input_shapes({{values.inputShape, {values.inputShape}}});

    // A Parameter prototype encodes "no operation before FakeQuantize" (nullptr).
    std::shared_ptr<ngraph::Node> op = ngraph::is_type<ngraph::opset1::Parameter>(operation.first) ? nullptr : operation.first;
    function = ov::test::snippets::FakeQuantizeFunction::getOperationAndFakeQuantize(
        {values.inputShape},
        values.inputType,
        values.fakeQuantizeShapes,
        values.zeroPoint,
        ov::test::snippets::FunctionHelper::makePrerequisitesOriginal(),
        op);
}
TEST_P(FakeQuantizeDecompositionTest, CompareWithRefImpl) {
    run();

    // operation.second = {expected runtime node type, expected originalLayersNames}.
    const auto operation = std::get<1>(this->GetParam());
    // Despite the name, this holds the expected runtime node type (e.g. "Subgraph"), not a precision.
    auto elementType = std::string(operation.second.first);
    validateOriginalLayersNamesByType(elementType, operation.second.second);

    // Check the compiled model contains the expected number of snippets subgraphs.
    validateNumSubgraphs();
};
} // namespace LayerTestsDefinitions

View File

@ -12,6 +12,9 @@ namespace test {
class SnippetsTestsCommon : virtual public ov::test::SubgraphBaseTest {
protected:
void validateNumSubgraphs();
void validateOriginalLayersNamesByType(const std::string& layerType, const std::string& originalLayersNames);
// Expected num nodes and subgraphs in exec graphs depends on the plugin
// pipeline, tokenization callback for example. Therefore, they have to be provided manually.
size_t ref_num_nodes = 0;

View File

@ -36,5 +36,23 @@ void SnippetsTestsCommon::validateNumSubgraphs() {
ASSERT_EQ(ref_num_subgraphs, num_subgraphs) << "Compiled model contains invalid number of subgraphs.";
}
void SnippetsTestsCommon::validateOriginalLayersNamesByType(const std::string& layerType, const std::string& originalLayersNames) {
const auto& compiled_model = compiledModel.get_runtime_model();
for (const auto& op : compiled_model->get_ops()) {
const auto& rtInfo = op->get_rt_info();
const auto& typeIt = rtInfo.find("layerType");
const auto type = typeIt->second.as<std::string>();
if (type == layerType) {
const auto& nameIt = rtInfo.find("originalLayersNames");
const auto name = nameIt->second.as<std::string>();
ASSERT_EQ(originalLayersNames, name);
return;
}
}
ASSERT_TRUE(false) << "Layer type '" << layerType << "' was not found in compiled model";
}
} // namespace test
} // namespace ov

View File

@ -0,0 +1,43 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "ngraph/ngraph.hpp"
namespace ov {
namespace test {
namespace snippets {
// Builders for test models around a FakeQuantize operation.
// In all builders `fakeQuantizeShapes` holds the shapes of the four interval
// constants {inputLow, inputHigh, outputLow, outputHigh}, and `zeroPoint` is the
// initial value used to fill the low-interval constants.
class FakeQuantizeFunction {
public:
    // Parameter => prerequisites => [operation] => FakeQuantize => Result
    // `operation` is optional: nullptr means the FakeQuantize consumes the prerequisites directly.
    static std::shared_ptr<ov::Model> getOperationAndFakeQuantize(
        const ngraph::Shape& inputShape,
        const element::Type inputType,
        const std::vector<ngraph::Shape>& fakeQuantizeShapes,
        const float zeroPoint,
        const std::vector<std::shared_ptr<ngraph::Node>>& prerequisites,
        std::shared_ptr<ngraph::Node> operation = nullptr);

    // Parameter => prerequisites => Subgraph (Parameter => beforeFakeQuantizeOperations => FakeQuantize => Result) => Result
    static std::shared_ptr<ov::Model> getSubgraphWithFakeQuantize(
        const ngraph::Shape& inputShape,
        const element::Type inputType,
        const std::vector<ngraph::Shape>& fakeQuantizeShapes,
        const float zeroPoint,
        const std::vector<std::shared_ptr<ngraph::Node>>& prerequisites = {},
        const std::vector<std::shared_ptr<Node>>& beforeFakeQuantizeOperations = {});

    // Parameter => Subgraph (Parameter => element-wise ops from FakeQuantize decomposition results => Result) => Result
    static std::shared_ptr<ov::Model> getSubgraphWithDecomposedFakeQuantize(
        const ngraph::Shape& inputShape,
        const element::Type inputType,
        const std::vector<ngraph::Shape>& fakeQuantizeShapes,
        const float zeroPoint);
};
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -0,0 +1,28 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/ngraph.hpp>
namespace ov {
namespace test {
namespace snippets {
// TODO: workaround while element-wise operations after `Parameter` are not added in Subgraph
class FunctionHelper {
public:
    // Creates the standard set of prerequisite operations placed between the test
    // model's Parameter and the operation under test (see callers for expected content).
    static std::vector<std::shared_ptr<Node>> makePrerequisitesOriginal();

    // Attaches `prerequisites` after `parent` and returns the node the caller
    // should continue building from.
    static std::shared_ptr<Node> applyPrerequisites(
        const std::shared_ptr<Node>& parent,
        const std::vector<std::shared_ptr<Node>>& prerequisites);

    // Returns the `index`-th snippets Subgraph operation of model `f`.
    // index: -1 - latest `Subgraph` operation
    static std::shared_ptr<Node> getSubgraph(const std::shared_ptr<Model>& f, const int index = -1);
};
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -0,0 +1,264 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "fake_quantize_function.hpp"
#include "common_test_utils/data_utils.hpp"
#include <snippets/snippets_isa.hpp>
#include <snippets/op/subgraph.hpp>
#include "ngraph_functions/builders.hpp"
#include "function_helper.hpp"
namespace ov {
namespace test {
namespace snippets {
namespace {
// Builds a FakeQuantize with levels = 256 whose four interval constants are
// generated with the requested shapes: the low constants start at `zeroPoint`,
// the high constants start at 20.f, each filled with initialValue, initialValue + 1, ...
// NOTE(review): `inputShape` is currently unused; kept to preserve the helper's interface.
std::shared_ptr<ngraph::op::FakeQuantize> makeFakeQuantize(
    const Output<Node>& parent,
    const ngraph::Shape& inputShape,
    const element::Type inputType,
    const std::vector<ngraph::Shape>& fakeQuantizeShapes,
    const float zeroPoint) {
    // Creates a named Constant of `shape` filled with an increasing sequence.
    auto generate = [](const ov::element::Type precision,
        const ngraph::Shape& shape,
        const float initialValue,
        const std::string& name) {
        const auto size = ngraph::shape_size(shape);
        std::vector<float> values(size);
        // Fix: use size_t for the index -- shape_size() returns an unsigned type,
        // so the original `auto i = 0` caused a signed/unsigned comparison.
        for (size_t i = 0; i < size; ++i) {
            values[i] = static_cast<float>(initialValue + i);
        }
        auto constant = std::make_shared<ngraph::opset1::Constant>(precision, shape, values);
        constant->set_friendly_name(name);
        return constant;
    };

    const auto fakeQuantize = std::make_shared<ngraph::opset1::FakeQuantize>(
        parent,
        generate(inputType, fakeQuantizeShapes[0], zeroPoint, "inputLow"),
        generate(inputType, fakeQuantizeShapes[1], 20.f, "inputHigh"),
        generate(inputType, fakeQuantizeShapes[2], zeroPoint, "outputLow"),
        generate(inputType, fakeQuantizeShapes[3], 20.f, "outputHigh"),
        256ul);
    fakeQuantize->set_friendly_name("fakeQuantize");
    return fakeQuantize;
}
// Builds a Convolution with constant weights of shape {3, 3, 1, 1} (all 1.f),
// unit strides and zero padding, named "Convolution".
std::shared_ptr<ngraph::opset1::Convolution> makeConvolution(const Output<Node>& parent) {
    const auto kernel = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{ 3, 3, 1, 1 }, { 1.f });
    auto conv = std::make_shared<ngraph::opset1::Convolution>(parent,
                                                              kernel,
                                                              ngraph::Strides{ 1, 1 },
                                                              ngraph::CoordinateDiff{ 0, 0 },
                                                              ngraph::CoordinateDiff{ 0, 0 },
                                                              ngraph::Strides{ 1, 1 });
    conv->set_friendly_name("Convolution");
    return conv;
}
// Builds a GroupConvolution with constant weights of shape {1, 3, 3, 1, 1} (all 1.f),
// unit strides and zero padding, named "GroupConvolution".
std::shared_ptr<ngraph::opset1::GroupConvolution> makeGroupConvolution(const Output<Node>& parent) {
    const auto kernel = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{ 1, 3, 3, 1, 1 }, { 1.f });
    auto conv = std::make_shared<ngraph::opset1::GroupConvolution>(parent,
                                                                   kernel,
                                                                   ngraph::Strides{ 1, 1 },
                                                                   ngraph::CoordinateDiff{ 0, 0 },
                                                                   ngraph::CoordinateDiff{ 0, 0 },
                                                                   ngraph::Strides{ 1, 1 });
    conv->set_friendly_name("GroupConvolution");
    return conv;
}
// Builds a plain MatMul (no transposes) over the two parents, named "MatMul".
std::shared_ptr<ngraph::opset1::MatMul> makeMatMul(const Output<Node>& parent1, const Output<Node>& parent2) {
    auto product = std::make_shared<ngraph::opset1::MatMul>(parent1, parent2);
    product->set_friendly_name("MatMul");
    return product;
}
// Attaches `parents` to the operation prototype and returns the resulting output.
// Convolution / GroupConvolution / MatMul prototypes are replaced by nodes rebuilt
// with constant weights via the dedicated make* helpers above.
Output<Node> initOperation(std::shared_ptr<Node> operation, const std::vector<Output<Node>>& parents) {
    if (is_type<ngraph::opset1::Convolution>(operation)) {
        assert(parents.size() == 1ul);
        return makeConvolution(parents[0]);
    }

    if (is_type<ngraph::opset1::GroupConvolution>(operation)) {
        assert(parents.size() == 1ul);
        return makeGroupConvolution(parents[0]);
    }

    if (is_type<ngraph::opset1::MatMul>(operation)) {
        assert(parents.size() == 2ul);
        return makeMatMul(parents[0], parents[1]);
    }

    // Any other prototype: attach the first parent as input 0 and name the node
    // after its type. NOTE(review): additional parents are ignored on this path.
    operation->set_argument(0, parents[0]);

    auto elementType = std::string(operation->get_type_name());
    operation->set_friendly_name(elementType);
    return operation;
}
// TODO: workaround while element-wise operations after `Parameter` are not added in Subgraph
// Chains the single-input `operations` one after another starting from `parent`
// and returns the last node of the chain (or `parent`'s node when the list is empty).
std::shared_ptr<Node> getOperations(const std::vector<std::shared_ptr<Node>>& operations, const Output<Node>& parent) {
    Output<Node> currentParent = parent;
    // Fix: iterate by const reference -- the original `for (auto operation : ...)`
    // copied a shared_ptr (atomic refcount traffic) on every iteration.
    for (const auto& operation : operations) {
        operation->set_argument(0, currentParent);
        currentParent = operation;
    }
    return currentParent.get_node_shared_ptr();
}
} // namespace
// Builds: Parameter => prerequisites => [operation] => FakeQuantize => Result.
// `fakeQuantizeShapes` must hold the four interval constant shapes
// {inputLow, inputHigh, outputLow, outputHigh}; `operation` is optional (nullptr => none).
std::shared_ptr<ov::Model> FakeQuantizeFunction::getOperationAndFakeQuantize(
    const ngraph::Shape& inputShape,
    const element::Type inputType,
    const std::vector<ngraph::Shape>& fakeQuantizeShapes,
    const float zeroPoint,
    const std::vector<std::shared_ptr<ngraph::Node>>& prerequisites,
    std::shared_ptr<ngraph::Node> operation) {
    assert(fakeQuantizeShapes.size() == 4ul);

    const auto parameter = std::make_shared<ngraph::opset1::Parameter>(inputType, inputShape);
    parameter->set_friendly_name("parameter");

    // Chain the prerequisite operations right after the Parameter.
    auto parent = FunctionHelper::applyPrerequisites(parameter, prerequisites);

    // Insert `operation` (if any) between the prerequisites and the FakeQuantize.
    const auto fakeQuantize = makeFakeQuantize(
        operation == nullptr ? parent : initOperation(operation, { parent }),
        inputShape,
        inputType,
        fakeQuantizeShapes,
        zeroPoint);
    fakeQuantize->set_friendly_name("fakeQuantize");

    const auto result = std::make_shared<ngraph::opset1::Result>(fakeQuantize);
    result->set_friendly_name("result");

    auto function = std::make_shared<ngraph::Function>(ngraph::ResultVector{ result }, ParameterVector{ parameter }, "FakeQuantizeFunction");
    function->validate_nodes_and_infer_types();
    return function;
}
std::shared_ptr<ov::Model> FakeQuantizeFunction::getSubgraphWithFakeQuantize(
    const ngraph::Shape& inputShape,
    const element::Type inputType,
    const std::vector<ngraph::Shape>& fakeQuantizeShapes,
    const float zeroPoint,
    const std::vector<std::shared_ptr<ngraph::Node>>& prerequisites,
    const std::vector<std::shared_ptr<Node>>& beforeFakeQuantizeOperations) {
    // Builds: Parameter -> [prerequisites] -> Subgraph([ops] -> FakeQuantize) -> Result
    assert(fakeQuantizeShapes.size() == 4ul);

    // Inner body executed by the snippets Subgraph: optional element-wise ops followed by FQ.
    const auto makeBody = [&]() -> std::shared_ptr<ngraph::Function> {
        const auto bodyParameter = std::make_shared<ngraph::opset1::Parameter>(inputType, inputShape);
        bodyParameter->set_friendly_name("parameter");
        const auto fakeQuantize = makeFakeQuantize(
            getOperations(beforeFakeQuantizeOperations, {bodyParameter}), inputShape, inputType, fakeQuantizeShapes, zeroPoint);
        const auto bodyResult = std::make_shared<ngraph::opset1::Result>(fakeQuantize);
        bodyResult->set_friendly_name("result");
        return std::make_shared<ngraph::Function>(
            ngraph::ResultVector{bodyResult}, ngraph::ParameterVector{bodyParameter}, "SubgraphWithFakeQuantizeBody");
    };

    const auto parameter = std::make_shared<ngraph::opset1::Parameter>(inputType, inputShape);
    parameter->set_friendly_name("parameter");
    auto parent = FunctionHelper::applyPrerequisites(parameter, prerequisites);

    const auto subgraph = std::make_shared<ngraph::snippets::op::Subgraph>(ngraph::OutputVector{ parent }, makeBody());
    subgraph->set_friendly_name("subgraph");

    const auto result = std::make_shared<ngraph::opset1::Result>(subgraph);
    result->set_friendly_name("result");

    auto model = std::make_shared<ngraph::Function>(ngraph::ResultVector{ result }, ParameterVector{ parameter }, "SubgraphWithFakeQuantize");
    model->validate_nodes_and_infer_types();
    return model;
}
std::shared_ptr<ov::Model> FakeQuantizeFunction::getSubgraphWithDecomposedFakeQuantize(
    const ngraph::Shape& inputShape,
    const element::Type inputType,
    const std::vector<ngraph::Shape>& fakeQuantizeShapes,
    const float zeroPoint) {
    // Builds: Parameter -> Subgraph(manually decomposed FakeQuantize) -> Result
    assert(fakeQuantizeShapes.size() == 4ul);

    // Shorthand for the f32 scalar coefficients used by the decomposition below.
    const auto scalar = [](const float value) {
        return std::make_shared<ngraph::opset1::Constant>(element::f32, Shape{}, std::vector<float>{value});
    };

    // Subgraph body: clamp (Max/Min), scale, shift, round, rescale, shift — a FakeQuantize
    // expressed through primitive element-wise operations.
    const auto makeBody = [&]() -> std::shared_ptr<ngraph::Function> {
        const auto bodyParameter = std::make_shared<ngraph::opset1::Parameter>(inputType, inputShape);
        bodyParameter->set_friendly_name("parameter");

        const auto maximum = std::make_shared<ngraph::opset1::Maximum>(bodyParameter, scalar(1.f));
        maximum->set_friendly_name("inputLow");

        const auto minimum = std::make_shared<ngraph::opset1::Minimum>(maximum, scalar(20.f));
        minimum->set_friendly_name("inputHigh");

        const auto multiply = std::make_shared<ngraph::opset1::Multiply>(minimum, scalar(13.4211f));
        multiply->set_friendly_name("multiply");

        const auto subtract = std::make_shared<ngraph::opset1::Subtract>(multiply, scalar(13.4211f));
        subtract->set_friendly_name("subtract");

        const auto round = std::make_shared<ngraph::opset5::Round>(subtract, ngraph::opset5::Round::RoundMode::HALF_TO_EVEN);
        round->set_friendly_name("round");

        // friendly name intentionally kept as the original's "devide" — other code may match on it
        const auto divide = std::make_shared<ngraph::opset1::Multiply>(round, scalar(0.0745098f));
        divide->set_friendly_name("devide");

        const auto add = std::make_shared<ngraph::opset1::Add>(divide, scalar(1.f));
        add->set_friendly_name("add");

        const auto bodyResult = std::make_shared<ngraph::opset1::Result>(add);
        bodyResult->set_friendly_name("result");
        return std::make_shared<ngraph::Function>(
            ngraph::ResultVector{bodyResult}, ngraph::ParameterVector{bodyParameter}, "SubgraphWithDecomposedFakeQuantizeBody");
    };

    const auto parameter = std::make_shared<ngraph::opset1::Parameter>(inputType, inputShape);
    parameter->set_friendly_name("parameter");

    const auto subgraph = std::make_shared<ngraph::snippets::op::Subgraph>(ngraph::OutputVector{parameter}, makeBody());
    subgraph->set_friendly_name("subgraph");

    const auto result = std::make_shared<ngraph::opset1::Result>(subgraph);
    result->set_friendly_name("result");

    return std::make_shared<ngraph::Function>(ngraph::ResultVector{result}, ngraph::ParameterVector{parameter}, "SubgraphWithDecomposedFakeQuantize");
}
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -0,0 +1,73 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "function_helper.hpp"
#include "common_test_utils/data_utils.hpp"
#include <snippets/snippets_isa.hpp>
#include <snippets/op/subgraph.hpp>
#include "ngraph_functions/builders.hpp"
namespace ov {
namespace test {
namespace snippets {
// TODO: workaround while element-wise operations after `Parameter` are not added in Subgraph
std::vector<std::shared_ptr<Node>> FunctionHelper::makePrerequisitesOriginal() {
std::vector<std::shared_ptr<Node>> nodes;
const auto parameter = std::make_shared<ngraph::opset1::Parameter>();
parameter->set_friendly_name("parameter");
nodes.push_back(parameter);
const auto maxPool = std::make_shared<ngraph::opset1::MaxPool>(
parameter,
Strides{ 1, 1 }, // strides
Shape{ 0, 0 }, // pads_begin
Shape{ 0, 0 }, // pads_end
Shape{ 1, 1 }); // kernel
maxPool->set_friendly_name("maxPool");
nodes.push_back(maxPool);
return nodes;
}
std::shared_ptr<Node> FunctionHelper::applyPrerequisites(const std::shared_ptr<Node>& parent, const std::vector<std::shared_ptr<Node>>& prerequisites) {
std::shared_ptr<ngraph::Node> currentParent;
if (prerequisites.empty()) {
currentParent = parent;
} else {
auto begin = prerequisites[0];
if (is_type<ngraph::opset1::Parameter>(begin)) {
begin = prerequisites[1];
}
begin->set_argument(0, parent);
currentParent = *prerequisites.rbegin();
}
return currentParent;
}
std::shared_ptr<Node> FunctionHelper::getSubgraph(const std::shared_ptr<Model>& f, const int index) {
int currentIndex = 0;
std::shared_ptr<ngraph::snippets::op::Subgraph> subgraph;
for (const auto& op : f->get_ordered_ops()) {
auto tmp_subgraph = as_type_ptr<ngraph::snippets::op::Subgraph>(op);
if (tmp_subgraph != nullptr) {
if (index == currentIndex) {
return tmp_subgraph;
}
subgraph = tmp_subgraph;
currentIndex++;
}
}
if (index != -1) {
return nullptr;
}
return subgraph;
}
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -13,19 +13,19 @@ namespace snippets {
std::shared_ptr<ov::Model> AddFunctionLoweredBroadcast::initLowered() const {
auto data0 = std::make_shared<op::v0::Parameter>(precision, input_shapes[0]);
auto load0 = std::make_shared<ngraph::snippets::op::Load>(data0);
std::shared_ptr<Node> add_input0 = load0;
if (!broadcast_shapes[0].empty()) {
auto broadcast0 = std::make_shared<ngraph::snippets::op::BroadcastMove>(load0, broadcast_shapes[0]);
add_input0 = broadcast0;
std::shared_ptr<Node> add_input0 = nullptr;
if (!broadcast_shapes[0].empty() && broadcast_shapes[0].back() != input_shapes[0].back()) {
add_input0 = std::make_shared<ngraph::snippets::op::BroadcastLoad>(data0, broadcast_shapes[0]);
} else {
add_input0 = std::make_shared<ngraph::snippets::op::Load>(data0);
}
auto data1 = std::make_shared<op::v0::Parameter>(precision, input_shapes[1]);
auto load1 = std::make_shared<ngraph::snippets::op::Load>(data1);
std::shared_ptr<Node> add_input1 = load1;
if (!broadcast_shapes[1].empty()) {
auto broadcast1 = std::make_shared<ngraph::snippets::op::BroadcastMove>(load1, broadcast_shapes[1]);
add_input1 = broadcast1;
std::shared_ptr<Node> add_input1 = nullptr;
if (!broadcast_shapes[1].empty() && broadcast_shapes[1].back() != input_shapes[1].back()) {
add_input1 = std::make_shared<ngraph::snippets::op::BroadcastLoad>(data1, broadcast_shapes[1]);
} else {
add_input1 = std::make_shared<ngraph::snippets::op::Load>(data1);
}
auto add = std::make_shared<op::v1::Add>(add_input0, add_input1);
auto store = std::make_shared<ngraph::snippets::op::Store>(add);