From 6525dd47276939f87c9479f5e16ba0d9f036d1b7 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Wed, 18 Jan 2023 16:59:21 +0400 Subject: [PATCH] [Snippets][CPU] Added FP32 MHA tokenization support (#14327) --- src/common/snippets/CMakeLists.txt | 5 +- .../snippets/include/snippets/generator.hpp | 30 +- .../snippets/include/snippets/op/brgemm.hpp | 47 + .../include/snippets/op/broadcastload.hpp | 10 +- .../include/snippets/op/broadcastmove.hpp | 7 +- .../snippets/include/snippets/op/buffer.hpp | 47 + .../snippets/include/snippets/op/fill.hpp | 47 + .../include/snippets/op/horizon_max.hpp | 32 + .../include/snippets/op/horizon_sum.hpp | 32 + .../snippets/include/snippets/op/kernel.hpp | 7 +- .../snippets/include/snippets/op/load.hpp | 46 +- .../snippets/include/snippets/op/loop.hpp | 111 ++ .../include/snippets/op/memory_access.hpp | 42 + .../include/snippets/op/powerstatic.hpp | 1 - .../snippets/include/snippets/op/scalar.hpp | 2 +- .../snippets/include/snippets/op/store.hpp | 26 +- .../snippets/include/snippets/op/subgraph.hpp | 114 +- .../snippets/include/snippets/op/tile.hpp | 48 - .../include/snippets/op/tile_scheduler.hpp | 39 - .../include/snippets/op/vector_buffer.hpp | 34 + .../pass/broadcast_to_movebroadcast.hpp | 28 + .../snippets/pass/collapse_subgraph.hpp | 26 +- .../pass/explicit_transpose_matmul_inputs.hpp | 32 + .../snippets/pass/fuse_transpose_brgemm.hpp | 30 + .../include/snippets/pass/insert_buffer.hpp | 30 + .../snippets/pass/insert_load_store.hpp | 4 +- .../include/snippets/pass/insert_loops.hpp | 43 + .../snippets/pass/insert_movebroadcast.hpp | 4 + .../include/snippets/pass/loop_fusion.hpp | 29 + .../include/snippets/pass/loop_helpers.hpp | 99 ++ .../snippets/pass/matmul_to_brgemm.hpp | 28 + .../snippets/pass/mha_tokenization.hpp | 28 + .../include/snippets/pass/reset_buffer.hpp | 29 + .../snippets/pass/softmax_decomposition.hpp | 30 + .../pass/softmax_reshape_elimination.hpp | 27 + .../include/snippets/pass/tokenization.hpp | 58 + 
.../snippets/pass/transpose_decomposition.hpp | 28 + .../include/snippets/snippets_isa.hpp | 9 +- .../include/snippets/snippets_isa_tbl.hpp | 4 + .../snippets/include/snippets/utils.hpp | 10 + src/common/snippets/src/generator.cpp | 249 ++-- src/common/snippets/src/op/brgemm.cpp | 64 + src/common/snippets/src/op/broadcastload.cpp | 12 +- src/common/snippets/src/op/broadcastmove.cpp | 41 +- src/common/snippets/src/op/buffer.cpp | 53 + src/common/snippets/src/op/fill.cpp | 38 + src/common/snippets/src/op/horizon_max.cpp | 28 + src/common/snippets/src/op/horizon_sum.cpp | 28 + src/common/snippets/src/op/kernel.cpp | 12 +- src/common/snippets/src/op/load.cpp | 61 +- src/common/snippets/src/op/loop.cpp | 182 +++ src/common/snippets/src/op/memory_access.cpp | 45 + src/common/snippets/src/op/powerstatic.cpp | 15 - src/common/snippets/src/op/scalar.cpp | 12 +- src/common/snippets/src/op/store.cpp | 38 +- src/common/snippets/src/op/subgraph.cpp | 421 +++++-- src/common/snippets/src/op/tile.cpp | 15 - src/common/snippets/src/op/tile_scheduler.cpp | 10 - src/common/snippets/src/op/vector_buffer.cpp | 27 + .../snippets/src/pass/align_element_type.cpp | 15 +- .../snippets/src/pass/assign_registers.cpp | 368 ++++-- .../src/pass/broadcast_to_movebroadcast.cpp | 49 + .../snippets/src/pass/collapse_subgraph.cpp | 218 ++-- .../src/pass/common_optimizations.cpp | 11 +- .../snippets/src/pass/convert_constants.cpp | 9 +- .../pass/explicit_transpose_matmul_inputs.cpp | 83 ++ .../src/pass/fuse_transpose_brgemm.cpp | 86 ++ .../snippets/src/pass/insert_buffer.cpp | 96 ++ .../snippets/src/pass/insert_load_store.cpp | 32 +- src/common/snippets/src/pass/insert_loops.cpp | 285 +++++ .../src/pass/insert_movebroadcast.cpp | 86 +- .../load_movebroadcast_to_broadcastload.cpp | 6 +- src/common/snippets/src/pass/loop_fusion.cpp | 331 +++++ src/common/snippets/src/pass/loop_helpers.cpp | 48 + .../snippets/src/pass/matmul_to_brgemm.cpp | 45 + .../snippets/src/pass/mha_tokenization.cpp | 394 ++++++ 
src/common/snippets/src/pass/reset_buffer.cpp | 114 ++ .../src/pass/softmax_decomposition.cpp | 216 ++++ .../src/pass/softmax_reshape_elimination.cpp | 70 ++ src/common/snippets/src/pass/tokenization.cpp | 72 ++ .../src/pass/transpose_decomposition.cpp | 81 ++ src/common/snippets/src/utils.cpp | 56 +- .../snippets/tests/include/lowering_utils.hpp | 8 +- .../pass/broadcast_to_movebroadcast.hpp | 29 + .../include/pass/fuse_transpose_brgemm.hpp | 33 + .../tests/include/pass/mha_tokenization.hpp | 20 + .../include/pass/softmax_decomposition.hpp | 43 + .../snippets/tests/src/lowering_utils.cpp | 63 +- .../src/pass/broadcast_to_movebroadcast.cpp | 59 + .../tests/src/pass/canonicalization.cpp | 11 +- .../tests/src/pass/collapse_subgraph.cpp | 25 +- .../tests/src/pass/fuse_transpose_brgemm.cpp | 58 + .../tests/src/pass/insert_load_store.cpp | 8 +- .../tests/src/pass/insert_movebroadcast.cpp | 13 +- .../snippets/tests/src/pass/merge_loops.cpp | 169 +++ .../tests/src/pass/mha_tokenization.cpp | 38 + .../tests/src/pass/softmax_decomposition.cpp | 122 ++ .../src/pass/softmax_reshape_elimination.cpp | 70 ++ src/common/snippets/tests/src/registers.cpp | 36 +- .../tests/onnx_import_com_microsoft.in.cpp | 2 +- .../interface/ie_internal_plugin_config.hpp | 13 + src/plugins/intel_cpu/src/config.cpp | 10 + src/plugins/intel_cpu/src/config.h | 7 + .../intel_cpu/src/emitters/cpu_generator.cpp | 20 +- .../src/emitters/jit_eltwise_emitters.cpp | 61 + .../src/emitters/jit_eltwise_emitters.hpp | 18 + .../src/emitters/jit_snippets_emitters.cpp | 1088 +++++++++++++---- .../src/emitters/jit_snippets_emitters.hpp | 264 +++- src/plugins/intel_cpu/src/extension.cpp | 11 +- .../snippets_mark_skipped.cpp | 24 +- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 583 ++++----- src/plugins/intel_cpu/src/nodes/subgraph.h | 42 +- src/plugins/intel_cpu/src/plugin.cpp | 31 +- .../fuse_load_store_and_convert.cpp | 8 +- .../op/load_convert.cpp | 14 +- .../op/load_convert.hpp | 4 +- .../op/store_convert.cpp | 
14 +- .../op/store_convert.hpp | 4 +- .../intel_cpu/src/transformation_pipeline.cpp | 114 +- .../intel_cpu/src/transformation_pipeline.h | 6 +- .../skip_tests_config.cpp | 4 + .../shared_tests_instances/snippets/add.cpp | 42 +- .../snippets/conv_eltwise.cpp | 38 +- .../snippets/convert.cpp | 30 +- .../snippets/matmul.cpp | 70 ++ .../shared_tests_instances/snippets/mha.cpp | 67 + .../snippets/select.cpp | 42 + .../snippets/softmax.cpp | 72 ++ .../snippets/transpose.cpp | 27 + .../snippets/transpose_matmul.cpp | 63 + .../snippets/transpose_softmax.cpp | 42 + .../snippets/two_inputs_and_outputs.cpp | 3 +- .../functional/subgraph_tests/src/mha.cpp | 46 +- .../snipptes_mark_skipped.cpp | 14 +- .../fake_quantize_tokenization_test.cpp | 2 +- .../plugin/shared/include/snippets/add.hpp | 17 + .../shared/include/snippets/convert.hpp | 2 +- .../plugin/shared/include/snippets/matmul.hpp | 70 ++ .../plugin/shared/include/snippets/mha.hpp | 47 + .../plugin/shared/include/snippets/select.hpp | 59 + .../shared/include/snippets/softmax.hpp | 49 + .../include/snippets/three_inputs_eltwise.hpp | 10 +- .../shared/include/snippets/transpose.hpp | 32 + .../include/snippets/transpose_matmul.hpp | 33 + .../include/snippets/transpose_softmax.hpp | 40 + .../snippets/two_inputs_and_outputs.hpp | 2 +- .../plugin/shared/src/snippets/add.cpp | 38 + .../plugin/shared/src/snippets/convert.cpp | 38 +- .../plugin/shared/src/snippets/matmul.cpp | 168 +++ .../src/snippets/max_num_params_eltwise.cpp | 4 +- .../plugin/shared/src/snippets/mha.cpp | 125 ++ .../plugin/shared/src/snippets/select.cpp | 114 ++ .../plugin/shared/src/snippets/softmax.cpp | 91 ++ .../src/snippets/three_inputs_eltwise.cpp | 1 + .../plugin/shared/src/snippets/transpose.cpp | 52 + .../shared/src/snippets/transpose_matmul.cpp | 57 + .../shared/src/snippets/transpose_softmax.cpp | 82 ++ .../src/snippets/two_inputs_and_outputs.cpp | 8 +- .../src/base/utils/generate_inputs.cpp | 5 + .../include/snippets_helpers.hpp | 6 +- 
.../include/subgraph_converts.hpp | 16 +- .../include/subgraph_customizable.hpp | 3 +- .../include/subgraph_lowered.hpp | 45 +- .../include/subgraph_matmul.hpp | 96 ++ .../include/subgraph_mha.hpp | 131 ++ .../include/subgraph_simple.hpp | 69 +- .../include/subgraph_softmax.hpp | 57 + .../include/subgraph_transpose.hpp | 36 + .../src/snippets_helpers.cpp | 4 +- .../src/subgraph_customizable.cpp | 4 +- .../src/subgraph_lowered.cpp | 366 +++++- .../src/subgraph_matmul.cpp | 92 ++ .../src/subgraph_mha.cpp | 348 ++++++ .../src/subgraph_simple.cpp | 58 +- .../src/subgraph_softmax.cpp | 52 + .../src/subgraph_transpose.cpp | 32 + 176 files changed, 10025 insertions(+), 1664 deletions(-) create mode 100644 src/common/snippets/include/snippets/op/brgemm.hpp create mode 100644 src/common/snippets/include/snippets/op/buffer.hpp create mode 100644 src/common/snippets/include/snippets/op/fill.hpp create mode 100644 src/common/snippets/include/snippets/op/horizon_max.hpp create mode 100644 src/common/snippets/include/snippets/op/horizon_sum.hpp create mode 100644 src/common/snippets/include/snippets/op/loop.hpp create mode 100644 src/common/snippets/include/snippets/op/memory_access.hpp delete mode 100644 src/common/snippets/include/snippets/op/tile.hpp delete mode 100644 src/common/snippets/include/snippets/op/tile_scheduler.hpp create mode 100644 src/common/snippets/include/snippets/op/vector_buffer.hpp create mode 100644 src/common/snippets/include/snippets/pass/broadcast_to_movebroadcast.hpp create mode 100644 src/common/snippets/include/snippets/pass/explicit_transpose_matmul_inputs.hpp create mode 100644 src/common/snippets/include/snippets/pass/fuse_transpose_brgemm.hpp create mode 100644 src/common/snippets/include/snippets/pass/insert_buffer.hpp create mode 100644 src/common/snippets/include/snippets/pass/insert_loops.hpp create mode 100644 src/common/snippets/include/snippets/pass/loop_fusion.hpp create mode 100644 
src/common/snippets/include/snippets/pass/loop_helpers.hpp create mode 100644 src/common/snippets/include/snippets/pass/matmul_to_brgemm.hpp create mode 100644 src/common/snippets/include/snippets/pass/mha_tokenization.hpp create mode 100644 src/common/snippets/include/snippets/pass/reset_buffer.hpp create mode 100644 src/common/snippets/include/snippets/pass/softmax_decomposition.hpp create mode 100644 src/common/snippets/include/snippets/pass/softmax_reshape_elimination.hpp create mode 100644 src/common/snippets/include/snippets/pass/tokenization.hpp create mode 100644 src/common/snippets/include/snippets/pass/transpose_decomposition.hpp create mode 100644 src/common/snippets/src/op/brgemm.cpp create mode 100644 src/common/snippets/src/op/buffer.cpp create mode 100644 src/common/snippets/src/op/fill.cpp create mode 100644 src/common/snippets/src/op/horizon_max.cpp create mode 100644 src/common/snippets/src/op/horizon_sum.cpp create mode 100644 src/common/snippets/src/op/loop.cpp create mode 100644 src/common/snippets/src/op/memory_access.cpp delete mode 100644 src/common/snippets/src/op/powerstatic.cpp delete mode 100644 src/common/snippets/src/op/tile.cpp delete mode 100644 src/common/snippets/src/op/tile_scheduler.cpp create mode 100644 src/common/snippets/src/op/vector_buffer.cpp create mode 100644 src/common/snippets/src/pass/broadcast_to_movebroadcast.cpp create mode 100644 src/common/snippets/src/pass/explicit_transpose_matmul_inputs.cpp create mode 100644 src/common/snippets/src/pass/fuse_transpose_brgemm.cpp create mode 100644 src/common/snippets/src/pass/insert_buffer.cpp create mode 100644 src/common/snippets/src/pass/insert_loops.cpp create mode 100644 src/common/snippets/src/pass/loop_fusion.cpp create mode 100644 src/common/snippets/src/pass/loop_helpers.cpp create mode 100644 src/common/snippets/src/pass/matmul_to_brgemm.cpp create mode 100644 src/common/snippets/src/pass/mha_tokenization.cpp create mode 100644 
src/common/snippets/src/pass/reset_buffer.cpp create mode 100644 src/common/snippets/src/pass/softmax_decomposition.cpp create mode 100644 src/common/snippets/src/pass/softmax_reshape_elimination.cpp create mode 100644 src/common/snippets/src/pass/tokenization.cpp create mode 100644 src/common/snippets/src/pass/transpose_decomposition.cpp create mode 100644 src/common/snippets/tests/include/pass/broadcast_to_movebroadcast.hpp create mode 100644 src/common/snippets/tests/include/pass/fuse_transpose_brgemm.hpp create mode 100644 src/common/snippets/tests/include/pass/mha_tokenization.hpp create mode 100644 src/common/snippets/tests/include/pass/softmax_decomposition.hpp create mode 100644 src/common/snippets/tests/src/pass/broadcast_to_movebroadcast.cpp create mode 100644 src/common/snippets/tests/src/pass/fuse_transpose_brgemm.cpp create mode 100644 src/common/snippets/tests/src/pass/merge_loops.cpp create mode 100644 src/common/snippets/tests/src/pass/mha_tokenization.cpp create mode 100644 src/common/snippets/tests/src/pass/softmax_decomposition.cpp create mode 100644 src/common/snippets/tests/src/pass/softmax_reshape_elimination.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/select.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_matmul.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_softmax.cpp create mode 100644 src/tests/functional/plugin/shared/include/snippets/matmul.hpp create mode 100644 
src/tests/functional/plugin/shared/include/snippets/mha.hpp create mode 100644 src/tests/functional/plugin/shared/include/snippets/select.hpp create mode 100644 src/tests/functional/plugin/shared/include/snippets/softmax.hpp create mode 100644 src/tests/functional/plugin/shared/include/snippets/transpose.hpp create mode 100644 src/tests/functional/plugin/shared/include/snippets/transpose_matmul.hpp create mode 100644 src/tests/functional/plugin/shared/include/snippets/transpose_softmax.hpp create mode 100644 src/tests/functional/plugin/shared/src/snippets/matmul.cpp create mode 100644 src/tests/functional/plugin/shared/src/snippets/mha.cpp create mode 100644 src/tests/functional/plugin/shared/src/snippets/select.cpp create mode 100644 src/tests/functional/plugin/shared/src/snippets/softmax.cpp create mode 100644 src/tests/functional/plugin/shared/src/snippets/transpose.cpp create mode 100644 src/tests/functional/plugin/shared/src/snippets/transpose_matmul.cpp create mode 100644 src/tests/functional/plugin/shared/src/snippets/transpose_softmax.cpp create mode 100644 src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_matmul.hpp create mode 100644 src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_mha.hpp create mode 100644 src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_softmax.hpp create mode 100644 src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_transpose.hpp create mode 100644 src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_matmul.cpp create mode 100644 src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_mha.cpp create mode 100644 src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_softmax.cpp create mode 100644 src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_transpose.cpp diff --git a/src/common/snippets/CMakeLists.txt b/src/common/snippets/CMakeLists.txt index 702543cfcf7..d3a7e47c604 100644 --- a/src/common/snippets/CMakeLists.txt 
+++ b/src/common/snippets/CMakeLists.txt @@ -26,9 +26,10 @@ ie_faster_build(${TARGET_NAME} ) target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime - PRIVATE ngraph_reference openvino::runtime::dev) + PRIVATE ngraph_reference ov_shape_inference openvino::runtime::dev) -target_include_directories(${TARGET_NAME} PUBLIC $) +target_include_directories(${TARGET_NAME} PUBLIC $ + PRIVATE $) add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME}) diff --git a/src/common/snippets/include/snippets/generator.hpp b/src/common/snippets/include/snippets/generator.hpp index 058e45c62b0..ab3156a108e 100644 --- a/src/common/snippets/include/snippets/generator.hpp +++ b/src/common/snippets/include/snippets/generator.hpp @@ -84,7 +84,7 @@ public: * @param f can this kernel be linearided to 1D range * @param p pointer to generated code */ - Schedule(const Shape& ws, bool f, code p) : work_size(ws), is_flat(f), ptr(p) {} + Schedule(const ov::PartialShape& ws, bool f, code p) : work_size(ws), is_flat(f), ptr(p) {} /** * @brief Returns callable instanse of code pointer */ @@ -92,7 +92,7 @@ public: return reinterpret_cast(const_cast(ptr)); } - Shape work_size {}; + ov::PartialShape work_size {}; bool is_flat {false}; code ptr {nullptr}; }; @@ -112,21 +112,43 @@ public: * @brief Default destructor */ virtual ~Generator() = default; + /** + * @interface GeneratorConfig + * @brief Allows to tweak the lowering process. + */ + class GeneratorConfig { + public: + // True if the lowered Emitters need to be accessed during runtime. Normally they're destroyed after code emission. 
+ bool m_save_lowered_code = false; + // True if we can optimize tails for single evaluation during code generation + // More details with optimization examples you can see in generate() method + // For example, tails with Buffer ops doesn't support single evaluation optimizations + // because of that we should always reset memory pointer using finalization offsets + // after data storing to Buffer + bool m_optimize_single_evaluation = true; + // True if we should check runtime info for nodes to call specific needed transformations + bool m_need_fill_tail_register = false; + }; /** * @brief virtual method any specific implementation should implement * @param m model in canonical for for table-based code generation + * @param config config with transformation and optimization parameters + * @param compile_params parameters for generated code * @return pointer to generated code */ - code generate(std::shared_ptr& m, const void* compile_params = nullptr) const; + code generate(std::shared_ptr& m, const GeneratorConfig& config, const void* compile_params = nullptr); /** * @brief gets target machine * @return pointer to constant target machine */ - std::shared_ptr get_target_machine() const { return target; } + std::shared_ptr get_target_machine() const; protected: std::shared_ptr target; + // todo: we need to save lowered code to access compiled brgemm kernels on execution time (normally lowered is destructed by then). + // This is temporary solution, remove this when kernel caching is implemented. Don't forget to make generate const method. 
+ std::vector lowered_saved; }; } // namespace snippets diff --git a/src/common/snippets/include/snippets/op/brgemm.hpp b/src/common/snippets/include/snippets/op/brgemm.hpp new file mode 100644 index 00000000000..2746d974a06 --- /dev/null +++ b/src/common/snippets/include/snippets/op/brgemm.hpp @@ -0,0 +1,47 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" +#include "ngraph/op/matmul.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface Brgemm + * @brief Brgemm is a batch-reduced matrix multiplication with the support of arbitrary strides between matrices rows + * @ingroup snippets + */ +class Brgemm : public ngraph::op::v0::MatMul { +public: + OPENVINO_OP("Brgemm", "SnippetsOpset", ngraph::op::v0::MatMul); + Brgemm(const Output& A, const Output& B, const size_t offset_a = 0lu, const size_t offset_b = 0lu, const size_t offset_c = 0lu); + Brgemm() = default; + + bool visit_attributes(AttributeVisitor& visitor) override; + void validate_and_infer_types() override; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + + bool has_evaluate() const override { return false; } + + size_t get_offset_a() const { return m_offset_a; } + size_t get_offset_b() const { return m_offset_b; } + size_t get_offset_c() const { return m_offset_c; } + + void set_offset_a(const size_t offset) { m_offset_a = offset; } + void set_offset_b(const size_t offset) { m_offset_b = offset; } + void set_offset_c(const size_t offset) { m_offset_c = offset; } + +private: + size_t m_offset_a = 0lu; // offset for first input + size_t m_offset_b = 0lu; // offset for second input + size_t m_offset_c = 0lu; // offset for output +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/op/broadcastload.hpp 
b/src/common/snippets/include/snippets/op/broadcastload.hpp index 8dce9ee2ab9..43f3a329adc 100644 --- a/src/common/snippets/include/snippets/op/broadcastload.hpp +++ b/src/common/snippets/include/snippets/op/broadcastload.hpp @@ -21,12 +21,18 @@ class BroadcastLoad : public BroadcastMove { public: OPENVINO_OP("BroadcastLoad", "SnippetsOpset", ngraph::snippets::op::BroadcastMove); - BroadcastLoad(const Output& x, Shape output_shape); + BroadcastLoad(const Output& x, ov::PartialShape output_shape, size_t offset = 0lu); BroadcastLoad() = default; - std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + size_t get_offset() const { return m_offset; } + void set_offset(const size_t offset) { m_offset = offset; } + bool visit_attributes(AttributeVisitor& visitor) override; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; void validate_and_infer_types() override; + +private: + size_t m_offset = 0lu; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/broadcastmove.hpp b/src/common/snippets/include/snippets/op/broadcastmove.hpp index cdcca462cc0..0d6368970b8 100644 --- a/src/common/snippets/include/snippets/op/broadcastmove.hpp +++ b/src/common/snippets/include/snippets/op/broadcastmove.hpp @@ -19,7 +19,7 @@ class BroadcastMove : public ngraph::op::Op { public: OPENVINO_OP("BroadcastMove", "SnippetsOpset"); - BroadcastMove(const Output& x, Shape output_shape); + BroadcastMove(const Output& x, ov::PartialShape output_shape); BroadcastMove() = default; bool visit_attributes(AttributeVisitor& visitor) override; @@ -28,12 +28,9 @@ public: void validate_and_infer_types() override; - OPENVINO_SUPPRESS_DEPRECATED_START - bool evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const override; - OPENVINO_SUPPRESS_DEPRECATED_END protected: - Shape output_shape; + ov::PartialShape output_shape; }; } // namespace op diff --git 
a/src/common/snippets/include/snippets/op/buffer.hpp b/src/common/snippets/include/snippets/op/buffer.hpp new file mode 100644 index 00000000000..f75fc95e742 --- /dev/null +++ b/src/common/snippets/include/snippets/op/buffer.hpp @@ -0,0 +1,47 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface Buffer + * @brief The operation is for intermediate data storage + * - m_allocation_rank - rank of shape for memory allocation: shape[shape_rank - normalize(m_allocation_rank) : shape_rank]. + * It's needed to allocate needed memory size that depends on Tile rank, for example. + * Default value is -1 (full shape) + * Notes: + * - All buffers in a graph have the same memory pointer. So if we have a few buffers, + * each the corresponding MemoryAccess op for Buffer should have offset for common memory pointer of this Buffer + * - Buffer should be a single consumer for operation output port + * @ingroup snippets + */ +class Buffer : public ngraph::op::Op { +public: + OPENVINO_OP("Buffer", "SnippetsOpset"); + + Buffer(const Output& x, const int32_t allocation_rank = -1); + Buffer() = default; + + int32_t get_allocation_rank() const { return m_allocation_rank; } + void set_allocation_rank(int32_t rank) { m_allocation_rank = rank; } + + size_t get_byte_size() const; + + bool visit_attributes(AttributeVisitor& visitor) override; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; + +private: + int32_t m_allocation_rank = -1; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/fill.hpp b/src/common/snippets/include/snippets/op/fill.hpp new file mode 100644 index 00000000000..85b95ec3799 --- /dev/null +++ b/src/common/snippets/include/snippets/op/fill.hpp @@ -0,0 +1,47 @@ +// Copyright 
(C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface Fill + * @brief Generated in Tail Loop vector representation in code generation step for cases when we should + * refill registers by special values. + * For example, for cases with ReduceMax or ReduceSum in Softmax + * Where: + * - offset - starting element index where filling is performed while beginning of input data is untouched + * - fill_value - hexadecimal filling value + * @ingroup snippets + */ +class Fill : public ngraph::op::Op { +public: + OPENVINO_OP("Fill", "SnippetsOpset"); + + Fill(const Output& x, const size_t offset, const uint32_t fill_value = 0x0); + Fill() = default; + + size_t get_offset() const { return m_offset; } + uint32_t get_fill_value() const { return m_fill_value; } + + void set_offset(const size_t offset) { m_offset = offset; } + void set_fill_value(const uint32_t fill_value) { m_fill_value = fill_value; } + + bool visit_attributes(AttributeVisitor& visitor) override; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; + +protected: + size_t m_offset = 0lu; + uint32_t m_fill_value = 0x0; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/horizon_max.hpp b/src/common/snippets/include/snippets/op/horizon_max.hpp new file mode 100644 index 00000000000..d26c4a8c9e5 --- /dev/null +++ b/src/common/snippets/include/snippets/op/horizon_max.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface HorizonMax + * @brief The operation calculates a horizon maximum of a vector register + * @ingroup snippets + */ +class HorizonMax : 
public ngraph::op::Op { +public: + OPENVINO_OP("HorizonMax", "SnippetsOpset"); + + HorizonMax(const Output& x); + HorizonMax() = default; + + bool visit_attributes(AttributeVisitor& visitor) override { return true;} + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/horizon_sum.hpp b/src/common/snippets/include/snippets/op/horizon_sum.hpp new file mode 100644 index 00000000000..2dc25374bc0 --- /dev/null +++ b/src/common/snippets/include/snippets/op/horizon_sum.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface HorizonSum + * @brief The operation calculates a horizon sum of a vector register + * @ingroup snippets + */ +class HorizonSum : public ngraph::op::Op { +public: + OPENVINO_OP("HorizonSum", "SnippetsOpset"); + + HorizonSum(const Output& x); + HorizonSum() = default; + + bool visit_attributes(AttributeVisitor& visitor) override { return true;} + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/kernel.hpp b/src/common/snippets/include/snippets/op/kernel.hpp index f0b64de2b9a..a8d17745fde 100644 --- a/src/common/snippets/include/snippets/op/kernel.hpp +++ b/src/common/snippets/include/snippets/op/kernel.hpp @@ -20,13 +20,14 @@ class Kernel : public ngraph::op::Op { public: OPENVINO_OP("Kernel", "SnippetsOpset"); - Kernel(const std::vector, ngraph::snippets::RegInfo>>& region); + Kernel(std::vector region, std::shared_ptr m); Kernel() = default; - std::vector, 
ngraph::snippets::RegInfo>> region; + std::vector region; + const std::shared_ptr model; std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override { - return std::make_shared(region); + return std::make_shared(region, model); } const void *compile_params = nullptr; }; diff --git a/src/common/snippets/include/snippets/op/load.hpp b/src/common/snippets/include/snippets/op/load.hpp index 2d412778035..bd0a4c5463f 100644 --- a/src/common/snippets/include/snippets/op/load.hpp +++ b/src/common/snippets/include/snippets/op/load.hpp @@ -5,6 +5,7 @@ #pragma once #include +#include "snippets/op/memory_access.hpp" namespace ngraph { namespace snippets { @@ -12,36 +13,41 @@ namespace op { /** * @interface Load - * @brief Generated by Canonicalization step where explicit instructions should be emitted for data loading - * where number of elements to load is determined by "count" - * Default value is "1" - to load one element + * @brief Generated during Lowering stage (convert_to_snippets_dialect) where explicit instructions should be emitted for data loading + * where number of elements to load is determined by "count" (Default value is "1" - to load one element) + * and memory offset for loading is determined by "offset" (Default value is "0" - to load starting from the first element) * @ingroup snippets */ -class Load : public ngraph::op::Op { +class Load : public MemoryAccess { public: OPENVINO_OP("Load", "SnippetsOpset"); - Load(const Output& x, const size_t count = 1lu); + Load(const Output& x, const size_t count = 1lu, const size_t offset = 0lu); Load() = default; - size_t get_count() const { return m_count; } - - void set_count(const size_t count) { m_count = count; } - - bool visit_attributes(AttributeVisitor& visitor) override; - std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; - - void validate_and_infer_types() override; - - OPENVINO_SUPPRESS_DEPRECATED_START - bool evaluate(const HostTensorVector& 
output_values, const HostTensorVector& input_values) const override; - OPENVINO_SUPPRESS_DEPRECATED_END - -protected: - size_t m_count = 0lu; }; +/** + * @interface LoadReshape + * @brief It's just Load operation (and it's mapped on LoadEmitter during code generation) that allows to tweak + * shape propagation. We need it to keep correct shape propagation when Transpose is decomposed to + * Load and Store. This is a temporary solution until tokenization of Reshape operation is supported. + * @ingroup snippets + */ +class LoadReshape : public Load { +public: + OPENVINO_OP("LoadReshape", "SnippetsOpset", Load); + LoadReshape(const Output& x, size_t count = 1lu, const size_t offset = 0lu, std::vector order = {}); + LoadReshape() = default; + + bool visit_attributes(AttributeVisitor& visitor) override; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; + +private: + std::vector m_order; +}; } // namespace op } // namespace snippets } // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/loop.hpp b/src/common/snippets/include/snippets/op/loop.hpp new file mode 100644 index 00000000000..89cf0abd517 --- /dev/null +++ b/src/common/snippets/include/snippets/op/loop.hpp @@ -0,0 +1,111 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" +#include "snippets/emitter.hpp" +#include "ngraph/op/parameter.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface LoopBase + * @brief Base class for LoopBegin and LoopEnd + * @ingroup snippets + */ +class LoopBase : public ngraph::op::Op { +public: + OPENVINO_OP("LoopBase", "SnippetsOpset"); + LoopBase(const std::vector>& args, size_t work_amount, size_t increment); + LoopBase() = default; + bool visit_attributes(AttributeVisitor& visitor) override; + size_t get_work_amount() const; + size_t get_increment() const; + bool 
get_evaluate_once() const; + +protected: + size_t work_amount; + size_t work_amount_increment; + bool evaluate_once; // true if the Loop is executed only once, used to skip setting and testing the loop counter +}; +class LoopEnd; +/** + * @interface LoopBegin + * @brief Marks the start of the Loop region. + * Number of outputs always equals to the number of inputs (bypassed values) + 1 (edge to the corresponding LoopEnd) + * @param args - vector of input values, they are passed directly to output. + * @ingroup snippets + */ +class LoopBegin : public LoopBase { + friend LoopEnd; + +public: + OPENVINO_OP("LoopBegin", "SnippetsOpset", LoopBase); + explicit LoopBegin(const OutputVector& args); + LoopBegin() = default; + void validate_and_infer_types() override; + std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override; + std::shared_ptr get_loop_end(); + // begin_address and input_regs are needed to communicate information between LoopBegin and LoopEnd emitters + const uint8_t* begin_address; + std::vector input_regs; + +private: + void validate_and_infer_types_except_LoopEnd(); + LoopBegin(const std::vector>& args, size_t work_amount, size_t work_amount_increment); +}; + +/** + * @interface LoopEnd + * @brief Marks the end of the Loop region and defines the loop properties. + * Number of outputs always equals to the number of inputs (bypassed values) - 1 (edge to the corresponding LoopEnd) + * @param args vector of input values + LoopBegin, all values except for the LoopBegin are passed directly to output. + * @param work_amount total number of evaluations to be processed by the loop + * @param increment number of evaluations processed in one iteration of the loop. + * @param apply_increment describes which data pointers attributed to the loop should be incremented on every iteration. + * should be used when Loop is connected to Parameters and/or Results. 
If apply_increment[i] == true then i-th i/o data + * pointer will be incremented by work_amount*data_size on every iteration. + * @param ptr_increments specifies i/o pointer increment performed on every iteration. This is an alternative to + * apply_increments, which enables more flexibility. + * @param finalization_offsets pointer increments that are be applied to i/o pointers before exiting the loop + * @ingroup snippets + */ +class LoopEnd : public LoopBase { +public: + OPENVINO_OP("LoopEnd", "SnippetsOpset", LoopBase); + LoopEnd(const std::vector>& args, size_t work_amount, size_t work_amount_increment, + std::vector apply_increment, std::vector finalization_offsets); + LoopEnd(const std::vector>& args, size_t work_amount, size_t work_amount_increment, + std::vector ptr_increments, std::vector finalization_offsets); + LoopEnd() = default; + std::shared_ptr get_loop_begin(); + void validate_and_infer_types() override; + std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override; + const std::vector& get_finalization_offsets() const; + const std::vector& get_ptr_increments() const; + void set_finalization_offsets(std::vector offsets); + void set_ptr_increments(std::vector new_ptr_increments); + // update_ptr_increments resets non-zero increments to the new_increments. It's used when work_amount_increment is + // updated and we need to refresh ptr increments accordingly while respecting the broadcasting pattern + void update_ptr_increments(int64_t new_increment); + void set_work_amount(size_t new_work_amount); + void set_increment(size_t new_increment); + void set_evaluate_once(bool once); + // Used to propagate information about Loop structure, needed to simplify some optimizations. 
For example, + // to skip pointer increments when outer Loop is empty, and work_amount == vector_size (one inner vector Loop) + // true by default, the optimizations enabled if it's false; + bool has_outer_loop; + +private: + std::vector ptr_increments; + std::vector finalization_offsets; + size_t loop_io_size; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/op/memory_access.hpp b/src/common/snippets/include/snippets/op/memory_access.hpp new file mode 100644 index 00000000000..f1b2d8ebb2f --- /dev/null +++ b/src/common/snippets/include/snippets/op/memory_access.hpp @@ -0,0 +1,42 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface MemoryAccess + * @brief This is a base class for memory access operations (like Load and Store). + * It provides universal set/get interface to manipulate the number + * of elements accessed during one operation call ("count"). 
+ * Default "count" value is "1" - it means to load/store one element + * @ingroup snippets + */ + +class MemoryAccess : public ngraph::op::Op { +public: + OPENVINO_OP("MemoryAccess", "SnippetsOpset"); + + size_t get_count() const; + size_t get_offset() const; + void set_count(const size_t count); + void set_offset(const size_t offset); + bool visit_attributes(AttributeVisitor& visitor) override; + void validate_and_infer_types() override; + +protected: + explicit MemoryAccess(const Output& x, size_t count = 1lu, size_t offset = 0lu); + MemoryAccess() = default; + size_t m_count = 0lu; + size_t m_offset = 0lu; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/powerstatic.hpp b/src/common/snippets/include/snippets/op/powerstatic.hpp index f4dbe12f9ba..2f4e3fbcfa2 100644 --- a/src/common/snippets/include/snippets/op/powerstatic.hpp +++ b/src/common/snippets/include/snippets/op/powerstatic.hpp @@ -20,7 +20,6 @@ namespace op { class PowerStatic : public ov::op::util::UnaryElementwiseArithmetic { public: OPENVINO_OP("PowerStatic", "SnippetsOpset", ov::op::util::UnaryElementwiseArithmetic); - BWDCMP_RTTI_DECLARATION; PowerStatic() = default; PowerStatic(const Output &arg, float power) : UnaryElementwiseArithmetic(arg), power(power) { diff --git a/src/common/snippets/include/snippets/op/scalar.hpp b/src/common/snippets/include/snippets/op/scalar.hpp index 009f3028e92..108a34d6005 100644 --- a/src/common/snippets/include/snippets/op/scalar.hpp +++ b/src/common/snippets/include/snippets/op/scalar.hpp @@ -19,7 +19,6 @@ namespace op { class Scalar : public ov::op::v0::Constant { public: OPENVINO_OP("Scalar", "SnippetsOpset", ov::op::v0::Constant); - BWDCMP_RTTI_DECLARATION; Scalar() = default; @@ -37,6 +36,7 @@ public: std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; void validate_and_infer_types() override; + bool visit_attributes(AttributeVisitor& visitor) 
override; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/store.hpp b/src/common/snippets/include/snippets/op/store.hpp index dec50f179e9..38715cffc6c 100644 --- a/src/common/snippets/include/snippets/op/store.hpp +++ b/src/common/snippets/include/snippets/op/store.hpp @@ -5,6 +5,7 @@ #pragma once #include +#include "snippets/op/memory_access.hpp" namespace ngraph { namespace snippets { @@ -12,34 +13,19 @@ namespace op { /** * @interface Store - * @brief Generated by Canonicalization step where explicit instructions should be emitted for data storing - * where number of elements to store is determined by "count" - * Default value is "1" - to store one element + * @brief Generated during Lowering stage (convert_to_snippets_dialect) where explicit instructions should be emitted for data storing + * where number of elements to store is determined by "count" (Default value is "1" - to store one element) + * and memory offset for storing is determined by "offset" (Default value is "0" - to store starting at start memory ptr) * @ingroup snippets */ -class Store : public ngraph::op::Op { +class Store : public MemoryAccess { public: OPENVINO_OP("Store", "SnippetsOpset"); - Store(const Output& x, const size_t count = 1lu); + Store(const Output& x, const size_t count = 1lu, const size_t offset = 0lu); Store() = default; - size_t get_count() const { return m_count; } - - void set_count(const size_t count) { m_count = count; } - - bool visit_attributes(AttributeVisitor& visitor) override; - std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; - - void validate_and_infer_types() override; - - OPENVINO_SUPPRESS_DEPRECATED_START - bool evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const override; - OPENVINO_SUPPRESS_DEPRECATED_END - -protected: - size_t m_count = 0lu; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp 
b/src/common/snippets/include/snippets/op/subgraph.hpp index 569b9dae9bf..ec55f076301 100644 --- a/src/common/snippets/include/snippets/op/subgraph.hpp +++ b/src/common/snippets/include/snippets/op/subgraph.hpp @@ -26,7 +26,7 @@ namespace op { class Subgraph : public ov::op::util::SubGraphOp { public: OPENVINO_OP("Subgraph", "SnippetsOpset", ov::op::util::SubGraphOp); - BWDCMP_RTTI_DECLARATION; + enum {DYNAMIC_DIMENSION = 0xffffffffffffffff}; // < 1, 42, 17, 15, 16> < 0, 1, 2, 3, 1> // should be: @@ -69,7 +69,7 @@ public: // // D = < 1, 3, 17, 15, 32> < 0, 1, 2, 3, 4> // E = < 1, 3, 17, 1, 32> < 0, 1, 2, 3, 4> - using BlockedShape = std::tuple; + using BlockedShape = std::tuple; using BlockedShapeVector = std::vector; Subgraph() = default; @@ -86,80 +86,82 @@ public: // we introduce this method instead of using SubGraphOp::get_function() // to align naming with other methods - const std::shared_ptr & body_ptr() const { - return m_bodies[0]; - } + const std::shared_ptr& body_ptr() const { return m_bodies[0]; } + std::shared_ptr& body_ptr() { return m_bodies[0]; } - std::shared_ptr & body_ptr() { - return m_bodies[0]; - } + const ov::Model& body() const { return *m_bodies[0]; } + ov::Model& body() { return *m_bodies[0]; } - const ov::Model & body() const { - return *m_bodies[0]; - } + const std::shared_ptr& get_generator() const { return m_generator; } + std::shared_ptr & get_generator() { return m_generator; } - ov::Model & body() { - return *m_bodies[0]; - } - - const std::shared_ptr & get_generator() const { - return m_generator; - } - - std::shared_ptr & get_generator() { - return m_generator; - } - - size_t get_non_scalar_constants_count() const { - return m_non_scalar_constants_count; - } - - bool is_quantized() const { - return config.m_is_quantized; - } - - bool has_type_relaxed_ops() const { - return config.m_has_type_relaxed_ops; - } + size_t get_buffer_scratchpad_size() const { return m_buffer_scratchpad; } + size_t get_virtual_port_count() const { return 
m_virtual_port_count; } + bool is_buffer_needed() const { return m_buffer_needed; } + bool is_quantized() const { return config.m_is_quantized; } + bool has_type_relaxed_ops() const { return config.m_has_type_relaxed_ops; } + bool has_domain_sensitive_ops() const { return config.m_has_domain_sensitive_ops; } snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, ngraph::pass::Manager& opt, const void* compile_params = nullptr); snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, const void* compile_params = nullptr); snippets::Schedule generate(ngraph::pass::Manager &opt, const void* compile_params = nullptr); snippets::Schedule generate(const void* compile_params = nullptr); - Shape canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes); + ov::PartialShape canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes); + std::vector reshape_body(const std::vector& input_shapes); + std::vector reshape_body(const std::vector& input_shapes); // plugin sets generator for a snippet to some specific generator. 
// it's going to be replaced with Jitters table later void set_generator(std::shared_ptr generator); - void set_non_scalar_constants_count(const size_t count); + void set_tile_rank(size_t newRank) {tileRank = newRank;} + void set_virtual_port_count(const size_t count); + void set_buffer_needed(const bool need); void print() const; void print_statistics(bool verbose); void serialize() const; + void set_master_shape(ov::PartialShape new_shape) {master_shape = std::move(new_shape);} static auto wrap_node_as_subgraph(const std::shared_ptr& node) -> std::shared_ptr; static void fill_empty_output_names(const Output& target_output_node, const Output& replacement_output_node); + // Non-scalar Constants are tokenized as Parameters inside Subgraph body but some operations with constant inputs + // should have explicit Constants even if they're non-scalar (Reshape, Transpose, Broadcast) + // This check returns True if Constant op which is input of this op should be inside Subgraph body + static auto constant_input_should_be_inside_body(const std::shared_ptr& node) -> bool; + private: void align_element_types(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes); void convert_to_snippet_dialect(); - - // Count of potentional non-scalar Consants that will be created after some tranformations - // At the moment it's relevant only for FakeQuantize decomposition - // NOTE: To avoid overheads in each calcution of this count (for example, in validate_and_type_infer()), + void init_config(); + void initialize_buffer_scratchpad_size(); + // Count of Subgraph virtual ports: + // - Potential non-scalar Constants that will be created after some transformations (At the moment it's relevant only for FakeQuantize decomposition) + // Need Buffer op or not + // - Buffers. All Buffers are considered as one common additional virtual port. 
So we cannot summarize them as potential non-scalar Constants + // NOTE: To avoid overheads in each calculation of this count (for example, in validate_and_type_infer()), // we should MANUALLY calculate it where it needed. - size_t m_non_scalar_constants_count = 0; + size_t m_virtual_port_count = 0; + bool m_buffer_needed = false; + size_t m_buffer_scratchpad = 0lu; Shape exec_domain = {}; std::shared_ptr m_generator = nullptr; // TODO: Change logic of insert Converts. This exec element type can be different for plugins const ov::element::Type execution_element_type = ov::element::f32; - // Config to know which transformations should be called. - // It helps to avoid overheads of extra transformation calls - struct { + ov::PartialShape master_shape; + size_t tileRank = 0; // set by plugin to specify the number of dimensions processed in a single kernel call + + /** + * @interface SubgraphConfig + * @brief Config to optimize IR transformation pipeline. It indicates which transformations are necessary + * so the irrelevant ones could be skipped. + */ + class SubgraphConfig { + public: // True if Subgraph contains FakeQuantize -> FQ decomposition should be called bool m_is_quantized = false; // True if we should align element types indise body @@ -167,6 +169,12 @@ private: // True if Subgraph contains TypeRelaxed nodes -> for several streams in tp mode we should copy body using mutexes // because TypeRelaxed::copy_with_new_inputs() isn't save-thread method bool m_has_type_relaxed_ops = false; + // True if body has operations that don't support plugin-side domain optimizations + // (e.g. Transpose, Softmax, MatMul in general doesn't support dimensions collapsing) + bool m_has_domain_sensitive_ops = false; + // True if we should go through whole body to check for where loops should be explicitly inserted. 
+ // Otherwise, we insert Loops on Parameters and Results - for example, it's optimized out for subgraph with only Eltwise ops + bool m_explicit_loop_insertion = false; } config; }; @@ -190,6 +198,24 @@ static inline auto build_subgraph(const std::shared_ptr& node, con return subgraph; }; +// Need to update tensor name manually, since intel_cpu::Graph::Replicate() looks at input.get_tensor().get_name(); +// If subgraph->get_output_size() == 1, then the name will be restored correctly from the node name +auto inline update_out_tensor_name(const std::shared_ptr& subgraph) -> void { + bool not_set = true; + for (unsigned int i = 0; i < subgraph->get_output_size() && not_set; i++) { + for (const auto &in : subgraph->get_output_target_inputs(i)) { + if (ov::is_type(in.get_node())) { + const auto& body_result = subgraph->body_ptr()->get_output_op(i); + const auto& body_result_input = body_result->get_input_source_output(0); + ngraph::snippets::op::Subgraph::fill_empty_output_names( + subgraph->output(i), body_result_input); + not_set = false; + break; + } + } + } +} + } // namespace op } // namespace snippets } // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/tile.hpp b/src/common/snippets/include/snippets/op/tile.hpp deleted file mode 100644 index 5401a91c657..00000000000 --- a/src/common/snippets/include/snippets/op/tile.hpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "ngraph/op/op.hpp" -#include "snippets/emitter.hpp" - -namespace ngraph { -namespace snippets { -namespace op { - -/** - * @interface Tile - * @brief Generated by Canonicalization and represents Loop in affine notation - * @ingroup snippets - */ -class Tile : public ngraph::op::Op { -public: - OPENVINO_OP("Tile", "SnippetsOpset"); - - /// \brief Construct an Tile - /// \param region The vector of pairs: emitters and the corresponding registers - /// \param increment Tile size - 
count of elements to load and store. - /// Vector Tile should have size of vector register and Scalar Tile should have 1 - /// \param num_inputs Count of inputs - /// \param num_outputs Count of outputs - /// \param io_dims Vector of last dimensions of inputs and outputs - /// \param io_data_sizes Vector of data type sizes of inputs and outputs - Tile(const std::vector& region, size_t increment, size_t num_inputs, size_t num_outputs, - const std::vector& io_dims, const std::vector& io_data_sizes); - Tile() = default; - std::vector region; - size_t increment = 0; - size_t num_inputs = 0; - size_t num_outputs = 0; - std::vector io_dims {}; - std::vector io_data_size {}; - - std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override { - return std::make_shared(region, increment, num_inputs, num_outputs, io_dims, io_data_size); - } -}; - -} // namespace op -} // namespace snippets -} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/tile_scheduler.hpp b/src/common/snippets/include/snippets/op/tile_scheduler.hpp deleted file mode 100644 index 9d6010f7797..00000000000 --- a/src/common/snippets/include/snippets/op/tile_scheduler.hpp +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (C) 2022 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "ngraph/op/op.hpp" -#include "snippets/emitter.hpp" -#include "tile.hpp" - -namespace ngraph { -namespace snippets { -namespace op { - -/** - * @interface TileScheduler - * @brief Contains a set of Tiles (currently one vector and one scalar) and performs necessary preparations - * before the Tiles could be executed: calculates offsets, sets proper work amounts, decrement pointers if the same data - * have to be read several times (broadcasting). 
- * @ingroup snippets - */ -class TileScheduler : public ngraph::op::Op { -public: - OPENVINO_OP("TileScheduler", "SnippetsOpset"); - - TileScheduler(const AllocatedEmitter& vector_region, const AllocatedEmitter& scalar_region); - TileScheduler() = default; - AllocatedEmitter vector_region; - AllocatedEmitter scalar_region; - // todo: this clone_with_new_inputs is irrelevant - std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override { - return std::make_shared(vector_region, scalar_region); - } - const void *compile_params; -}; - -} // namespace op -} // namespace snippets -} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/vector_buffer.hpp b/src/common/snippets/include/snippets/op/vector_buffer.hpp new file mode 100644 index 00000000000..9d93e4c0157 --- /dev/null +++ b/src/common/snippets/include/snippets/op/vector_buffer.hpp @@ -0,0 +1,34 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface VectorBuffer + * @brief The operation is for intermediate data storage in vector register + * @ingroup snippets + */ +class VectorBuffer : public ngraph::op::Op { +public: + OPENVINO_OP("VectorBuffer", "SnippetsOpset"); + + VectorBuffer(const ov::element::Type element_type = ov::element::f32); + + bool visit_attributes(AttributeVisitor& visitor) override { return true;} + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; + +private: + ov::element::Type m_element_type; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/broadcast_to_movebroadcast.hpp b/src/common/snippets/include/snippets/pass/broadcast_to_movebroadcast.hpp new file mode 100644 index 00000000000..0c90c1193ea --- /dev/null +++ 
b/src/common/snippets/include/snippets/pass/broadcast_to_movebroadcast.hpp @@ -0,0 +1,28 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface BroadcastToMoveBroadcast + * @brief Inserts explicit MoveBroadcast instruction if broadcasting by most varying dimension is needed instead of Broadcast. + * Otherwise the pass removes Broadcast operation. + * @ingroup snippets + */ +class BroadcastToMoveBroadcast: public ngraph::pass::MatcherPass { +public: + BroadcastToMoveBroadcast(); +}; + + +} // namespace pass +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/pass/collapse_subgraph.hpp b/src/common/snippets/include/snippets/pass/collapse_subgraph.hpp index 2d6f7c0d963..96c272a28d5 100644 --- a/src/common/snippets/include/snippets/pass/collapse_subgraph.hpp +++ b/src/common/snippets/include/snippets/pass/collapse_subgraph.hpp @@ -12,28 +12,6 @@ namespace ngraph { namespace snippets { namespace pass { -/* - NotSet - default value returned by GetSnippetsNodeType(...) if the node wasn't marked - SkippedByPlugin - indicate that snippets can't include this node in subgraph. Can be set by Plugin via SetSnippetsNodeType(...). - */ -enum class SnippetsNodeType : int64_t {NotSet, SkippedByPlugin}; -void SetSnippetsNodeType(const std::shared_ptr&, SnippetsNodeType); -SnippetsNodeType GetSnippetsNodeType(const std::shared_ptr&); -void SetTopologicalOrder(const std::shared_ptr&, int64_t); -int64_t GetTopologicalOrder(const std::shared_ptr&); -bool AppropriateForSubgraph(const std::shared_ptr&); - -/** - * @interface EnumerateNodes - * @brief Snippets rely on topological order to avoid creating cyclic dependencies. This transformation sets the order. 
- * @ingroup snippets - */ -class EnumerateNodes : public ov::pass::ModelPass { -public: - OPENVINO_RTTI("EnumerateNodes", "0"); - EnumerateNodes() : ModelPass() {} - bool run_on_model(const std::shared_ptr&) override; -}; /** * @interface TokenizeSnippets @@ -61,6 +39,10 @@ class TokenizeSnippets: public ngraph::pass::MatcherPass { public: OPENVINO_RTTI("TokenizeSnippets", "0"); explicit TokenizeSnippets(); + + static bool AppropriateForSubgraph(const std::shared_ptr&); + + static const std::set supported_element_types; }; } // namespace pass diff --git a/src/common/snippets/include/snippets/pass/explicit_transpose_matmul_inputs.hpp b/src/common/snippets/include/snippets/pass/explicit_transpose_matmul_inputs.hpp new file mode 100644 index 00000000000..fc90067f4af --- /dev/null +++ b/src/common/snippets/include/snippets/pass/explicit_transpose_matmul_inputs.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface ExplicitTransposeMatMulInputs + * @brief At the moment Snippets supports Transpose only with order {0, 2, 3, 1}, + * so if there is pattern in graph: + * in0 Transpose{0, 2, 1, 3} + * \ / + * MatMul[false, true] + * We can set false in MatMul parameter `transposed_b` and + * change Transpose order to {0, 2, 3, 1} which is supported by Snippets + * @ingroup snippets + */ +class ExplicitTransposeMatMulInputs: public ngraph::pass::MatcherPass { +public: + ExplicitTransposeMatMulInputs(); +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/fuse_transpose_brgemm.hpp b/src/common/snippets/include/snippets/pass/fuse_transpose_brgemm.hpp new file mode 100644 index 00000000000..1c2eaa11ea0 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/fuse_transpose_brgemm.hpp @@ -0,0 +1,30 @@ +// Copyright (C) 
2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/pass/graph_rewrite.hpp" +#include "ngraph/pattern/matcher.hpp" + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface FuseTransposeBrgemm + * @brief Fuses Transpose with Brgemm node, fusing on both Brgemm inputs and output is supported. Applicable to + * Transposes that don't change the position of the last dimension (since Brgemm supports strided rows i/o), + * but only 0213 Transpose is currently supported. + * @ingroup snippets + */ +class FuseTransposeBrgemm: public ngraph::pass::MatcherPass { +public: + OPENVINO_RTTI("FuseTransposeBrgemm", "0"); + FuseTransposeBrgemm(); + static const std::set> supported_cases; +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/pass/insert_buffer.hpp b/src/common/snippets/include/snippets/pass/insert_buffer.hpp new file mode 100644 index 00000000000..a7fe4f00208 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/insert_buffer.hpp @@ -0,0 +1,30 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface InsertBuffer + * @brief The pass inserts Buffers on Inputs and Outputs of special operations [Softmax, Transpose] is it's needed + * @param allocation_rank - rank of shape for Buffer memory allocation: shape[shape_rank - normalize(m_allocation_rank) : shape_rank]. + * It's needed to allocate needed memory size that depends on Tile rank, for example. 
+ * Default value is -1 (full shape) + * @ingroup snippets + */ +class InsertBuffer: public ngraph::pass::MatcherPass { +public: + InsertBuffer(const int32_t allocation_rank = -1); +}; + + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/insert_load_store.hpp b/src/common/snippets/include/snippets/pass/insert_load_store.hpp index 09911e62d8b..aab892312bf 100644 --- a/src/common/snippets/include/snippets/pass/insert_load_store.hpp +++ b/src/common/snippets/include/snippets/pass/insert_load_store.hpp @@ -13,7 +13,7 @@ namespace pass { /** * @interface InsertLoad - * @brief Inserts explicit load instruction after each parameter. + * @brief Inserts explicit load instruction after each parameter and buffer. * The pass is used to convert model to a canonical form for code generation * @ingroup snippets */ @@ -24,7 +24,7 @@ public: /** * @interface InsertStore - * @brief Inserts explicit store instruction before each result. + * @brief Inserts explicit store instruction before each result and buffer. 
* The pass is used to convert model to a canonical form for code generation * @ingroup snippets */ diff --git a/src/common/snippets/include/snippets/pass/insert_loops.hpp b/src/common/snippets/include/snippets/pass/insert_loops.hpp new file mode 100644 index 00000000000..57046789167 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/insert_loops.hpp @@ -0,0 +1,43 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface InsertLoops + * @brief Insert explicit Loop operations into the body to process multiple data entities during one kernel execution + * @param master_shape - shape used to determine loop work amounts + * @param loop_depth - the number of last master_shape dimensions processed by loops (aka tileRank - obsolete), could be 1 or 2 + * @param vector_size - the number of entities processed on one iteration of vector loop + * @param single_loop_body - true, if we can just insert LoopBegin on inputs and LoopEnd on outputs, otherwise + * the pass goes all over the body analyzing where LoopBegin and LoopEnd should be inserted: + * synchronization nodes are MatMul, Buffer and other already existing Loops.
+ * @ingroup snippets + */ +class InsertLoops: public ngraph::pass::FunctionPass { +public: + OPENVINO_RTTI("InsertLoops", "0"); + InsertLoops(ov::PartialShape master_shape, size_t loop_depth, size_t vector_size, bool is_optimized = true); + bool run_on_model(const std::shared_ptr& m) override; + + static std::vector calculate_inner_apply_increments(const ov::PartialShape& master, const std::vector& shapes); + static std::vector calculate_outer_apply_increments(const std::vector& shapes); + static std::vector calculate_finalization_offsets(const ov::PartialShape& master, const std::vector& shapes); +private: + ov::PartialShape m_master_shape; + size_t m_loop_depth; + size_t m_vector_size; + bool m_single_loop_body; +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/insert_movebroadcast.hpp b/src/common/snippets/include/snippets/pass/insert_movebroadcast.hpp index 14fe951a12a..e0458e0b263 100644 --- a/src/common/snippets/include/snippets/pass/insert_movebroadcast.hpp +++ b/src/common/snippets/include/snippets/pass/insert_movebroadcast.hpp @@ -20,6 +20,10 @@ namespace pass { class InsertMoveBroadcast: public ngraph::pass::MatcherPass { public: InsertMoveBroadcast(); + + static Output BroadcastNodeLastDim(const ngraph::Output& value, + const ov::PartialShape& target_shape, + const ov::PartialShape& normalized_shape); }; } // namespace pass diff --git a/src/common/snippets/include/snippets/pass/loop_fusion.hpp b/src/common/snippets/include/snippets/pass/loop_fusion.hpp new file mode 100644 index 00000000000..14676a15a6e --- /dev/null +++ b/src/common/snippets/include/snippets/pass/loop_fusion.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface LoopFusion + * @brief Fuse Loops into one Loop if their semantics allow it + 
* @ingroup snippets + */ +class LoopFusion: public ngraph::pass::MatcherPass { +public: + LoopFusion(); + +private: + bool Merge(const std::shared_ptr& buffer); +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/loop_helpers.hpp b/src/common/snippets/include/snippets/pass/loop_helpers.hpp new file mode 100644 index 00000000000..12e0e9746bc --- /dev/null +++ b/src/common/snippets/include/snippets/pass/loop_helpers.hpp @@ -0,0 +1,99 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" +#include "ngraph/op/parameter.hpp" +#include "snippets/op/loop.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +/* ==== LoopBegin === */ +/** + * @interface insertLoopBeginAfterOutputs + * @brief Inserts LoopBegin operation after the group of operations described + * by the input argument (OutputVector). Use insertLoopBegin instead - it has a more universal interface. + * @ingroup snippets + */ +std::shared_ptr insertLoopBeginAfterOutputs(const OutputVector& originalOutputs); + +/** + * @interface insertLoopBegin + * @brief Inserts LoopBegin operation after the group of operations described + * by the input argument (ParameterVector, NodeVector or OutputVector). + * @ingroup snippets + */ +template +std::shared_ptr insertLoopBegin(const T& afterTheseNodes) { + static_assert(std::is_same() || std::is_same(), + "Unsupported template parameter for insertLoopBegin. 
Only ParameterVector or NodeVector is allowed"); + OutputVector originalOutputs; + std::vector>> childInputs; + for (const auto &n : afterTheseNodes) { + const auto& nodeOutputs = n->outputs(); + // Ignore the LoopBegin->LoopEnd edge to make it easier to construct enclosed Loops + std::move(nodeOutputs.begin(), nodeOutputs.end() - 1 * ov::is_type(n), std::back_inserter(originalOutputs)); + } + + return insertLoopBeginAfterOutputs(originalOutputs); +} + +template<> +inline std::shared_ptr insertLoopBegin(const OutputVector& afterTheseNodes) { + return insertLoopBeginAfterOutputs(afterTheseNodes); +} +/* ============== */ + +/* ==== LoopEnd === */ +/** + * @interface insertLoopEndBeforeInputs + * @brief Inserts LoopEnd operation before the group of operations described + * by the input argument (vector of inputs). Use insertLoopEnd instead - it has a more universal interface. + * @param originalInputs LoopEnd will be inserted before these inputs + * @param loopBegin pointer to the beginning of the Loop region + * @param work_amount total number of evaluations to be processed by the loop + * @param increment number of evaluations processed in one iteration of the loop + * @param apply_increment describes which data pointers attributed to the loop should be incremented on every iteration. + * should be used when Loop is connected to Parameters and/or Results + * @param finalization_offsets pointer shifts that should be applied to data pointers before exiting the loop + * @ingroup snippets + */ + +std::shared_ptr insertLoopEndBeforeInputs(const std::vector>& originalInputs, + const std::shared_ptr& loopBegin, + size_t work_amount, size_t increment, + std::vector apply_increment = {}, + std::vector finalization_offsets = {}); + +/** + * @interface insertLoopEnd + * @brief Inserts LoopEnd operation before the group of operations described + * by the input argument (ResultVector, NodeVector or OutputVector).
+ * @ingroup snippets + */ +template +std::shared_ptr insertLoopEnd(const T& beforeTheseNodes, Args ...args) { + static_assert(std::is_same() || std::is_same(), + "Unsupported template parameter for insertLoopBegin. Only ParameterVector or NodeVector is allowed"); + std::vector> originalInputs; + for (const auto &n : beforeTheseNodes) { + const auto& nodeInputs = n->inputs(); + // Ignore the LoopBegin->LoopEnd edge to facilitate enclosed Loops construction + std::move(nodeInputs.begin(), nodeInputs.end() - 1 * ov::is_type(n), std::back_inserter(originalInputs)); + } + return insertLoopEndBeforeInputs(originalInputs, args...); +} + +template +std::shared_ptr insertLoopEnd(const std::vector>& beforeTheseNodes, Args ...args) { + return insertLoopEndBeforeInputs(beforeTheseNodes, args...); +} +/* ============== */ + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/pass/matmul_to_brgemm.hpp b/src/common/snippets/include/snippets/pass/matmul_to_brgemm.hpp new file mode 100644 index 00000000000..1f00b944b56 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/matmul_to_brgemm.hpp @@ -0,0 +1,28 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/pass/graph_rewrite.hpp" +#include "ngraph/pattern/matcher.hpp" + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface MatMulToBrgemm + * @brief Replaces ngraph::MatMul with snippets::op::Brgemm operation (only non-transposing MatMuls are currently supported) + * @ingroup snippets + */ +class MatMulToBrgemm: public ngraph::pass::MatcherPass { +public: + OPENVINO_RTTI("MatMulToBrgemm", "0"); + MatMulToBrgemm(); +}; + + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/mha_tokenization.hpp b/src/common/snippets/include/snippets/pass/mha_tokenization.hpp
new file mode 100644 index 00000000000..7c161e8447e --- /dev/null +++ b/src/common/snippets/include/snippets/pass/mha_tokenization.hpp @@ -0,0 +1,28 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface TokenizeMHASnippets + * @brief The pass tokenizes MHA-pattern into Subgraph + * TODO: Write pattern + * @ingroup snippets + */ +class TokenizeMHASnippets: public ngraph::pass::MatcherPass { +public: + OPENVINO_RTTI("TokenizeMHASnippets", "0"); + TokenizeMHASnippets(); +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/reset_buffer.hpp b/src/common/snippets/include/snippets/pass/reset_buffer.hpp new file mode 100644 index 00000000000..599b533e3eb --- /dev/null +++ b/src/common/snippets/include/snippets/pass/reset_buffer.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface ResetBufferState + * @brief If there is Buffer between loops we should reset Buffer pointer after first loop execution (data storing) using finalization offsets + * to have correct buffer data pointer for data loading in the next loop where data was stored in previous loop + * @ingroup snippets + */ +class ResetBufferState: public ngraph::pass::MatcherPass { +public: + ResetBufferState(); + + static int64_t calculate_required_finalization_offsets(const size_t inner_master_work_amount, const size_t inner_target_work_amount); +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/softmax_decomposition.hpp b/src/common/snippets/include/snippets/pass/softmax_decomposition.hpp new file mode 100644 index 
00000000000..b640ab35b0b --- /dev/null +++ b/src/common/snippets/include/snippets/pass/softmax_decomposition.hpp @@ -0,0 +1,30 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface SoftmaxDecomposition + * @brief The pass decomposes Softmax into explicit Snippets dialects + * Note: + * - At the moment Snippets supports Softmax only in MHA pattern where there are Buffer ops before and after Softmax. + * Also Snippets support Loops with Buffer ops on inputs and outputs if Buffer have the same buffer byte size + * because of work with ptr increment. So we have to set Tile rank as buffer allocation rank even if rank 1 is enough + * @ingroup snippets + */ +class SoftmaxDecomposition: public ngraph::pass::MatcherPass { +public: + SoftmaxDecomposition(const size_t vector_size, const int32_t buffer_allocation_rank = -1); +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/softmax_reshape_elimination.hpp b/src/common/snippets/include/snippets/pass/softmax_reshape_elimination.hpp new file mode 100644 index 00000000000..7522f411669 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/softmax_reshape_elimination.hpp @@ -0,0 +1,27 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface SoftmaxReshapeElimination + * @brief The pass removes Reshape operations around Softmax if possible + * @ingroup snippets + */ +class SoftmaxReshapeElimination: public ngraph::pass::MatcherPass { +public: + SoftmaxReshapeElimination(); +}; + + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/tokenization.hpp
b/src/common/snippets/include/snippets/pass/tokenization.hpp new file mode 100644 index 00000000000..19b776ec257 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/tokenization.hpp @@ -0,0 +1,58 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "snippets/pass/mha_tokenization.hpp" +#include "snippets/pass/collapse_subgraph.hpp" + +namespace ngraph { +namespace snippets { +namespace pass { + +/* + NotSet - default value returned by GetSnippetsNodeType(...) if the node wasn't marked + SkippedByPlugin - indicate that snippets can't include this node in subgraph. Can be set by Plugin via SetSnippetsNodeType(...). + */ +enum class SnippetsNodeType : int64_t {NotSet, SkippedByPlugin}; +void SetSnippetsNodeType(const std::shared_ptr&, SnippetsNodeType); +SnippetsNodeType GetSnippetsNodeType(const std::shared_ptr&); +void SetTopologicalOrder(const std::shared_ptr&, int64_t); +int64_t GetTopologicalOrder(const std::shared_ptr&); + +/** + * @interface EnumerateNodes + * @brief Snippets rely on topological order to avoid creating cyclic dependencies. This transformation sets the order. + * @ingroup snippets + */ +class EnumerateNodes : public ov::pass::ModelPass { +public: + OPENVINO_RTTI("EnumerateNodes", "0"); + EnumerateNodes() : ModelPass() {} + bool run_on_model(const std::shared_ptr&) override; +}; + + +/** + * @interface SnippetsTokenization + * @brief Splits model to supported subgraphs + * 1. Enumerate nodes by topological order + * 2. MHA tokenization + * 3. Common tokenization + * 4. Some common transformations for Subgraphs. 
For example, FakeQuantize decomposition + * @ingroup snippets + */ +class SnippetsTokenization : public ngraph::pass::FunctionPass { +public: + OPENVINO_RTTI("SnippetsTokenization", "0"); + bool run_on_model(const std::shared_ptr& m) override; +}; + + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/transpose_decomposition.hpp b/src/common/snippets/include/snippets/pass/transpose_decomposition.hpp new file mode 100644 index 00000000000..9f939eea4b7 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/transpose_decomposition.hpp @@ -0,0 +1,28 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface TransposeDecomposition + * @brief Decompose Transpose to Load + Store wrapped in several loops. + * @ingroup snippets + */ +class TransposeDecomposition: public ngraph::pass::MatcherPass { +public: + OPENVINO_RTTI("TransposeDecomposition", "0"); + TransposeDecomposition(); + static const std::set> supported_cases; +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/snippets_isa.hpp b/src/common/snippets/include/snippets/snippets_isa.hpp index d3f8957e2fc..af489925c51 100644 --- a/src/common/snippets/include/snippets/snippets_isa.hpp +++ b/src/common/snippets/include/snippets/snippets_isa.hpp @@ -9,16 +9,21 @@ #include "op/broadcastload.hpp" #include "op/broadcastmove.hpp" +#include "op/buffer.hpp" #include "op/convert_saturation.hpp" #include "op/convert_truncation.hpp" +#include "op/horizon_max.hpp" +#include "op/horizon_sum.hpp" +#include "op/fill.hpp" #include "op/kernel.hpp" #include "op/load.hpp" #include "op/nop.hpp" #include "op/scalar.hpp" #include "op/powerstatic.hpp" #include "op/store.hpp" -#include "op/tile.hpp" -#include "op/tile_scheduler.hpp" +#include 
"op/loop.hpp" +#include "op/brgemm.hpp" +#include "op/vector_buffer.hpp" namespace ngraph { namespace snippets { diff --git a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp index 09380faf4e2..1816322bb36 100644 --- a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp +++ b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp @@ -11,6 +11,10 @@ // SnippetS dialect NGRAPH_OP(Load, ngraph::snippets::op) +NGRAPH_OP(LoadReshape, ngraph::snippets::op) +NGRAPH_OP(LoopBegin, ngraph::snippets::op) +NGRAPH_OP(LoopEnd, ngraph::snippets::op) +NGRAPH_OP(Brgemm, ngraph::snippets::op) NGRAPH_OP(BroadcastLoad, ngraph::snippets::op) NGRAPH_OP(Store, ngraph::snippets::op) diff --git a/src/common/snippets/include/snippets/utils.hpp b/src/common/snippets/include/snippets/utils.hpp index 97447ddd648..253785b516d 100644 --- a/src/common/snippets/include/snippets/utils.hpp +++ b/src/common/snippets/include/snippets/utils.hpp @@ -11,6 +11,7 @@ #include "snippets_isa.hpp" #include "emitter.hpp" + namespace ngraph { namespace snippets { namespace utils { @@ -23,6 +24,15 @@ inline auto is_scalar_constant(const std::shared_ptr& source_outpu return ngraph::is_type(source_output_node) && ngraph::shape_size(source_output_node->get_shape()) == 1; } + +ov::PartialShape get_port_planar_shape(const Output& out); +ov::PartialShape get_reordered_planar_shape(const ov::PartialShape& shape, const std::vector& layout); +std::vector get_node_output_layout(const std::shared_ptr& node); +std::vector get_node_output_layout(const Node* node); + +inline ov::Dimension get_inner_dim(const ov::PartialShape &shape) { return *(shape.rbegin()); } +inline ov::Dimension get_outer_dim(const ov::PartialShape &shape) { return *(shape.rbegin() + 1); } + } // namespace utils } // namespace snippets } // namespace ngraph diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index 6ff07d977ae..c305db67f01 
100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -6,106 +6,219 @@ #include "snippets/pass/assign_registers.hpp" #include "snippets/pass/vector_to_scalar.hpp" #include "snippets/pass/insert_load_store.hpp" -#include "snippets/op/tile.hpp" +#include "snippets/op/loop.hpp" +#include "snippets/op/subgraph.hpp" #include "snippets/op/kernel.hpp" #include #include +#include -auto ngraph::snippets::getRegisters(std::shared_ptr& n) -> ngraph::snippets::RegInfo { +namespace ngraph { +namespace snippets { + +auto getRegisters(const std::shared_ptr &n) -> RegInfo { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::getRegisters") - auto rt = n->get_rt_info(); // ToDo: change to reg_t std::vector rin, rout; - auto it_rt = rt.find("reginfo"); - if (it_rt != rt.end()) { - for (auto reg : it_rt->second.as>()) { - rout.push_back(reg); - } + for (const auto& output : n->outputs()) { + const auto& rt = output.get_tensor_ptr()->get_rt_info(); + auto it_rt = rt.find("reginfo"); + if (it_rt != rt.end()) + rout.push_back(it_rt->second.as()); } for (const auto& input : n->inputs()) { - auto rt = input.get_source_output().get_node_shared_ptr()->get_rt_info(); + auto rt = input.get_source_output().get_tensor_ptr()->get_rt_info(); auto it_rt = rt.find("reginfo"); - if (it_rt != rt.end()) { - for (auto& reg : it_rt->second.as>()) { - rin.push_back(reg); - } - } + if (it_rt != rt.end()) + rin.push_back(it_rt->second.as()); } + return std::make_pair(rin, rout); } +auto tail_transformations(NodeVector& tail, const size_t tail_size, const ngraph::snippets::Generator::GeneratorConfig& config) -> void { + NodeVector updated_tile; + auto insertFill = [tail_size](const ov::Input& input) -> std::shared_ptr { + auto copyRegInfo = [](const ov::descriptor::Tensor& from, ov::descriptor::Tensor& to) -> void { + auto rt = from.get_rt_info(); + auto reginfo = rt.find("reginfo"); + if (reginfo != rt.end()) { + 
to.get_rt_info()["reginfo"] = reginfo->second; + } + }; + std::shared_ptr fill = nullptr; + auto& rt = input.get_rt_info(); + auto fill_rt = rt.find("set_fill"); + if (fill_rt != rt.end()) { + const auto fill_value = fill_rt->second.as(); + fill = std::make_shared(input.get_source_output(), tail_size, fill_value); + input.get_node()->set_argument(input.get_index(), fill); + // we should explicitly copy reg info because we insert Fill after assign register + copyRegInfo(fill->get_input_tensor(0), fill->get_output_tensor(0)); + } + return fill; + }; + + for (auto& op : tail) { + // We should fill vector regs by float_min and zero to have + // correct math calculations for ReduceMax and ReduceSum in scalar case. + // Note: We find Maximum and Add ops because HorizonMax and HorizonSum are outside Loop, + // so they are missed in + if (config.m_need_fill_tail_register && + (ov::is_type(op) || + ov::is_type(op))) { + for (auto i = 0; i < op->inputs().size(); ++i) { + if (auto fill = insertFill(op->input(i))) { + updated_tile.push_back(fill); + } + } + } else if (const auto memory_access = std::dynamic_pointer_cast(op)) { + if (memory_access->get_count() != 1) { + memory_access->set_count(tail_size); + } + } + updated_tile.push_back(op); + } + + tail = std::move(updated_tile); +} + ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptr& m, - const void* compile_params) const { + const GeneratorConfig& config, + const void* compile_params) { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::Generator::generate") if (!target->is_supported()) - throw ngraph_error("unsupported architecture for code genration"); - - auto params = m->get_parameters(); - auto results = m->get_results(); - auto in = params.size(); - auto out = results.size(); - std::vector io_last_dims(in + out); - std::vector io_data_sizes(in + out); - std::transform(params.begin(), params.end(), io_last_dims.begin(), - [](const std::shared_ptr& n){return 
n->get_output_shape(0).back();}); - std::transform(results.begin(), results.end(), io_last_dims.begin() + in, - [](const std::shared_ptr& n){return n->get_input_shape(0).back();}); - std::transform(params.begin(), params.end(), io_data_sizes.begin(), - [](const std::shared_ptr& n){return n->get_element_type().size();}); - std::transform(results.begin(), results.end(), io_data_sizes.begin() + in, - [](const std::shared_ptr& n){return n->get_element_type().size();}); + throw ngraph_error("unsupported architecture for code generation"); OV_ITT_TASK_CHAIN(GENERATE, ngraph::pass::itt::domains::SnippetsTransform, "Snippets::Generator", "::VectorTile") - // vector tile + // vector loop std::vector lowered; - for (auto n : m->get_ordered_ops()) { - lowered.emplace_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n))); - } - OV_ITT_TASK_NEXT(GENERATE, "::ScalarTile") + auto lower_ops = [&lowered, this](const NodeVector& ops){ + std::transform(ops.begin(), ops.end(), std::back_inserter(lowered), + [this](const std::shared_ptr& n){ + return std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n)); + }); + }; + // *1* solo vector/tail loop + empty outer loop + // => skip increments (both counter & ptr) : set evaluate_once flag + // *2* solo vector/tail loop + non-empty outer loop + // => skip counter increments but perform ptr increments : set evaluate_once, + // and perform pointer increments through finalization offsets + // *3* vector loop(s) + one tail loop + // => vector as usual, tail depends on outer loop, see *1* and *2* + auto optimize_single_evaluation = [](const std::shared_ptr& loop, bool force_ptr_increment = false) { + if (loop->get_work_amount() < 2 * loop->get_increment()) { + loop->set_evaluate_once(true); + if (force_ptr_increment || loop->has_outer_loop) { + std::vector new_finalization_offsets(loop->get_finalization_offsets()); + const auto& ptr_increments = loop->get_ptr_increments(); + for 
(auto i = 0; i < new_finalization_offsets.size(); i++) { + new_finalization_offsets[i] += ptr_increments[i]; + } + loop->set_finalization_offsets(new_finalization_offsets); + } + return true; + } else { + return false; + } + }; + const auto& ops = m->get_ordered_ops(); + for (auto op = ops.begin(); op < ops.end(); op++) { + const auto& loop_begin = ov::as_type_ptr(*op); - // scalar tile - auto m_scalar = ov::clone_model(*m.get()); - ngraph::pass::Manager mng; - mng.register_pass(); - mng.register_pass(); - mng.run_passes(m_scalar); - OV_ITT_TASK_NEXT(GENERATE, "::ScalarTile_get") - std::vector scalar_lowered; - for (auto n : m_scalar->get_ordered_ops()) { - scalar_lowered.emplace_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n))); - } - OV_ITT_TASK_NEXT(GENERATE, "::Tiles1D"); - // wrapping into tiles1D - //todo: in, out, and io_last_dims should derive naturally from the graph representation - const auto& vector_tile = std::make_shared(lowered, target->get_lanes(), in, out, io_last_dims, io_data_sizes); - const auto& vector_region = std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(vector_tile), - std::make_pair(std::vector{}, std::vector{})); - const auto& scalar_tile = std::make_shared(scalar_lowered, 1, in, out, io_last_dims, io_data_sizes); - const auto& scalar_region = std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(scalar_tile), - std::make_pair(std::vector{}, std::vector{})); + // ignore outer loops and possible manual scalar loops + if (loop_begin && loop_begin->get_increment() != 1) { + OV_ITT_TASK_NEXT(GENERATE, "::VectorLoop") + NodeVector vector_loop, tail_loop; + std::shared_ptr vector_loop_end, tail_loop_end; + vector_loop_end = loop_begin->get_loop_end(); + tail_loop_end = nullptr; + while (*op != vector_loop_end) + vector_loop.push_back(*op++); + vector_loop.push_back(*op); + const auto work_amount = vector_loop_end->get_work_amount(); + const auto 
increment = vector_loop_end->get_increment(); + const auto tail_size = work_amount % increment; + const auto need_tail = tail_size != 0; + const auto need_vector_loop = work_amount >= increment; + // Note, that finalization_offsets could be modified inside optimize_single_evaluation, + // so need to save them here to cover (evaluate_once vector with non-zero finalization_offsets + tail) + std::vector tail_finalization_offsets = need_tail ? vector_loop_end->get_finalization_offsets() : std::vector {}; + // vector loops are required => Just copy the body, original loop is already a vector one + if (need_vector_loop) { + // Note that finalization offsets should be applied after the last iteration. + // So if there is a tail, then we should apply offsets after it, but not now. + if (need_tail) + vector_loop_end->set_finalization_offsets(std::vector(tail_finalization_offsets.size(), 0)); - OV_ITT_TASK_NEXT(GENERATE, "::Tiles2D") - // wrapping into tiles2D - auto tile_scheduler = std::make_shared(vector_region, scalar_region); - tile_scheduler->compile_params = compile_params; - const auto& tile_scheduler_region = std::make_pair(target->get(ngraph::snippets::op::TileScheduler::get_type_info_static())(tile_scheduler), - std::make_pair(std::vector({in, out, target->get_lanes()}), std::vector{})); + if (config.m_optimize_single_evaluation) { + // force ptr increments if there is tail + optimize_single_evaluation(vector_loop_end, need_tail); + } + + lower_ops(vector_loop); + } + OV_ITT_TASK_NEXT(GENERATE, "::TailLoop") + // tail is required => transform the body into a tail representation + // tail loop is fake loop because for tail we should calculate only + // finalization offsets which are supported by LoopEnd. 
+ if (need_tail) { + NodeMap vector_to_tail_node_map; + tail_loop = ngraph::clone_nodes(vector_loop, vector_to_tail_node_map); + tail_transformations(tail_loop, tail_size, config); + tail_loop_end = ov::as_type_ptr(*tail_loop.rbegin()); + tail_loop_end->set_finalization_offsets(tail_finalization_offsets); + tail_loop_end->set_increment(tail_size); + // ptr increments were set to the old increment, need to update them in accordance with the new one + tail_loop_end->update_ptr_increments(static_cast(tail_size)); + tail_loop_end->set_work_amount(tail_size); + tail_loop_end->has_outer_loop = vector_loop_end->has_outer_loop; + + if (config.m_optimize_single_evaluation) { + // tail loop is always executed once + optimize_single_evaluation(tail_loop_end); + } + + lower_ops(tail_loop); + } + } else { + lower_ops({*op}); + } + } OV_ITT_TASK_NEXT(GENERATE, "::EmitCode") - // emission - auto tiles2DKernel = std::make_shared(std::vector {tile_scheduler_region}); - tiles2DKernel->compile_params = compile_params; - std::shared_ptr kernel = target->get(ngraph::snippets::op::Kernel::get_type_info_static())(tiles2DKernel); - kernel->emit_code({in, out}, {}); + //todo: Kernel need info on i/o data access pattern and data shapes to calculate data offsets + // pass Params and Results + // todo: it's probably better to move AllocaledEmitter creation inside Kernel constructor + // So Kernel accepts only model ptr and target, and creates AllocatedEmitter inside + //emission + auto loops2DKernel = std::make_shared(lowered, m); + loops2DKernel->compile_params = compile_params; + std::shared_ptr kernel = target->get(op::Kernel::get_type_info_static())(loops2DKernel); + + kernel->emit_code({}, {}); + OV_ITT_TASK_NEXT(GENERATE, "::EmitData") - lowered.insert(lowered.end(), scalar_lowered.begin(), scalar_lowered.end()); for (auto& op : lowered) { op.first->emit_data(); } OV_ITT_TASK_NEXT(GENERATE, "::GetSnippet") + + // todo: we save lowered to access compiled brgemm kernels on execution time 
(normally lowered is destructed by then) + // remove this when kernel caching is implemented. Don't forget to make generate const method. + if (config.m_save_lowered_code) + lowered_saved = lowered; + return target->get_snippet(); } + +std::shared_ptr Generator::get_target_machine() const { + return target; +} + +}// namespace snippets +}// namespace ngraph diff --git a/src/common/snippets/src/op/brgemm.cpp b/src/common/snippets/src/op/brgemm.cpp new file mode 100644 index 00000000000..7bf999cb15e --- /dev/null +++ b/src/common/snippets/src/op/brgemm.cpp @@ -0,0 +1,64 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/itt.hpp" +#include "snippets/op/brgemm.hpp" +#include "ngraph/runtime/host_tensor.hpp" +#include "openvino/core/rt_info.hpp" +#include "snippets/utils.hpp" +#include "matmul_shape_inference.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +Brgemm::Brgemm(const Output& A, const Output& B, const size_t offset_a, const size_t offset_b, const size_t offset_c) + : MatMul(), m_offset_a(offset_a), m_offset_b(offset_b), m_offset_c(offset_c) { + set_arguments({A, B}); + set_output_size(1); + constructor_validate_and_infer_types(); +} + +bool Brgemm::visit_attributes(AttributeVisitor& visitor) { + MatMul::visit_attributes(visitor); + visitor.on_attribute("offset_a", m_offset_a); + visitor.on_attribute("offset_b", m_offset_b); + visitor.on_attribute("offset_c", m_offset_c); + return true; +} + +void Brgemm::validate_and_infer_types() { + INTERNAL_OP_SCOPE(Brgemm_validate_and_infer_types); + element::Type result_et; + NODE_VALIDATION_CHECK(this, + element::Type::merge(result_et, get_input_element_type(0), get_input_element_type(1)), + "Arguments do not have the same element type (arg0 element type: ", + get_input_element_type(0), + ", arg1 element type: ", + get_input_element_type(1), + ")."); + // If no leading dimensions are provided, assume dense row-major inputs-outputs + 
NODE_VALIDATION_CHECK(this, get_input_partial_shape(0).is_static() && get_input_partial_shape(1).is_static(), + "Brgemm currently supports only static shapes."); + + std::vector planar_input_shapes; + for (const auto& in : input_values()) + planar_input_shapes.emplace_back(utils::get_port_planar_shape(in)); + + std::vector output_shapes = {ov::PartialShape{}}; + ov::op::v0::shape_infer(this, planar_input_shapes, output_shapes); + const auto& output_layout = utils::get_node_output_layout(this); + output_shapes[0] = utils::get_reordered_planar_shape(output_shapes[0], output_layout); + set_output_type(0, result_et, output_shapes[0]); +} + +std::shared_ptr Brgemm::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(Brgemm_clone_with_new_inputs); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0), new_args.at(1), m_offset_a, m_offset_b, m_offset_c); +} + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/src/op/broadcastload.cpp b/src/common/snippets/src/op/broadcastload.cpp index 7d8dd32cafb..0f4e6c7667e 100644 --- a/src/common/snippets/src/op/broadcastload.cpp +++ b/src/common/snippets/src/op/broadcastload.cpp @@ -11,15 +11,21 @@ using namespace std; using namespace ngraph; -snippets::op::BroadcastLoad::BroadcastLoad(const Output& x, Shape shape) -: BroadcastMove(x, shape) { +snippets::op::BroadcastLoad::BroadcastLoad(const Output& x, ov::PartialShape shape, size_t offset) + : BroadcastMove(x, std::move(shape)), m_offset(offset) { constructor_validate_and_infer_types(); } +bool snippets::op::BroadcastLoad::visit_attributes(AttributeVisitor& visitor) { + BroadcastMove::visit_attributes(visitor); + visitor.on_attribute("offset", m_offset); + return true; +} + std::shared_ptr snippets::op::BroadcastLoad::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(BroadcastLoad); check_new_args_count(this, new_args); - return 
std::make_shared(new_args.at(0), output_shape); + return std::make_shared(new_args.at(0), output_shape, m_offset); } void snippets::op::BroadcastLoad::validate_and_infer_types() { diff --git a/src/common/snippets/src/op/broadcastmove.cpp b/src/common/snippets/src/op/broadcastmove.cpp index 1a0d300ff5c..17910d3c642 100644 --- a/src/common/snippets/src/op/broadcastmove.cpp +++ b/src/common/snippets/src/op/broadcastmove.cpp @@ -12,7 +12,7 @@ using namespace std; using namespace ngraph; -snippets::op::BroadcastMove::BroadcastMove(const Output& x, Shape shape) : Op({x}), output_shape(shape) { +snippets::op::BroadcastMove::BroadcastMove(const Output& x, ov::PartialShape shape) : Op({x}), output_shape(std::move(shape)) { constructor_validate_and_infer_types(); } @@ -24,44 +24,9 @@ bool snippets::op::BroadcastMove::visit_attributes(AttributeVisitor& visitor) { std::shared_ptr snippets::op::BroadcastMove::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(BroadcastMove); check_new_args_count(this, new_args); - auto other = std::make_shared(new_args.at(0), this->output_shape); - return other; + return std::make_shared(new_args.at(0), output_shape); } void snippets::op::BroadcastMove::validate_and_infer_types() { set_output_type(0, get_input_element_type(0), this->output_shape); -} - -bool snippets::op::BroadcastMove::evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const { - INTERNAL_OP_SCOPE(BroadcastMove); - NGRAPH_CHECK(input_values.size() == this->inputs().size(), "wrong input config"); - NGRAPH_CHECK(output_values.size() == this->outputs().size(), "wrong output config"); - NGRAPH_CHECK(input_values.size() == output_values.size() && input_values.size() == 1, "must be 1->1 operation"); - NGRAPH_CHECK(this->output(0).get_shape() == output_values[0]->get_shape(), "output vector must have the same shape as output port"); - NGRAPH_CHECK(this->input(0).get_shape() == input_values[0]->get_shape(), "input and output 
must have same shape"); - - auto ishape = input_values[0]->get_shape(); - auto oshape = output_values[0]->get_shape(); - - NGRAPH_CHECK(ishape.size() == oshape.size(), "input and output should have the same rank"); - - AxisSet broadcast_axes; - for (size_t k = 0; k < ishape.size(); k++) { - if (!((ishape[k] == oshape[k]) - || (ishape[k] != oshape[k] && ((ishape[k] == 1) != (oshape[k] == 1) ) ))) { - throw ngraph_error("FakeBroadcast::evaluate incompatible shapes"); - } - - if (ishape[k] != oshape[k]) { - broadcast_axes.insert(k); - } - } - - runtime::reference::broadcast(input_values[0]->get_data_ptr(), - output_values[0]->get_data_ptr(), - input_values[0]->get_shape(), - output_values[0]->get_shape(), - broadcast_axes, - sizeof(float)); - return true; -} +} \ No newline at end of file diff --git a/src/common/snippets/src/op/buffer.cpp b/src/common/snippets/src/op/buffer.cpp new file mode 100644 index 00000000000..ad05ae2e046 --- /dev/null +++ b/src/common/snippets/src/op/buffer.cpp @@ -0,0 +1,53 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/op/buffer.hpp" +#include "snippets/snippets_isa.hpp" + +#include + +using namespace std; +using namespace ngraph; + +auto normalize_rank(int32_t allocation_rank, const size_t shape_rank) -> int32_t { + return allocation_rank < 0 ? 
allocation_rank + static_cast(shape_rank) : allocation_rank; +} + +snippets::op::Buffer::Buffer(const Output& x, const int32_t allocation_rank) : Op({x}), m_allocation_rank(allocation_rank) { + constructor_validate_and_infer_types(); +} + +bool snippets::op::Buffer::visit_attributes(AttributeVisitor& visitor) { + INTERNAL_OP_SCOPE(Buffer_visit_attributes); + visitor.on_attribute("allocation_rank", m_allocation_rank); + return true; +} + +std::shared_ptr snippets::op::Buffer::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(Buffer_clone_with_new_inputs); + check_new_args_count(this, new_args); + auto new_buffer = std::make_shared(new_args.at(0), m_allocation_rank); + return new_buffer; +} + +void snippets::op::Buffer::validate_and_infer_types() { + INTERNAL_OP_SCOPE(Buffer_validate_and_infer_types); + const auto shape_rank = get_input_partial_shape(0).rank(); + if (shape_rank.is_static()) { + const auto normalized_rank = normalize_rank(m_allocation_rank, shape_rank.get_length()); + NGRAPH_CHECK(normalized_rank >= 0 && normalized_rank <= shape_rank.get_length(), + "Buffer has incorrect allocation rank: " + std::to_string(m_allocation_rank)); + } + set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); +} + +size_t ngraph::snippets::op::Buffer::get_byte_size() const { + const auto pshape = get_input_partial_shape(0); + NGRAPH_CHECK(pshape.is_static(), "Buffer should have static shapes for memory allocation"); + const auto shape = pshape.get_shape(); + const auto normalized_rank = normalize_rank(m_allocation_rank, shape.size()); + return ngraph::shape_size(shape.rbegin(), shape.rbegin() + normalized_rank) * get_element_type().size(); +} diff --git a/src/common/snippets/src/op/fill.cpp b/src/common/snippets/src/op/fill.cpp new file mode 100644 index 00000000000..ac93a501aad --- /dev/null +++ b/src/common/snippets/src/op/fill.cpp @@ -0,0 +1,38 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: 
Apache-2.0 +// + +#include + +#include "snippets/op/fill.hpp" + +#include + +using namespace std; +using namespace ngraph; + +snippets::op::Fill::Fill(const Output& x, const size_t offset, const uint32_t fill_value) + : Op({x}), m_offset(offset), m_fill_value(fill_value) { + constructor_validate_and_infer_types(); +} + +bool snippets::op::Fill::visit_attributes(AttributeVisitor& visitor) { + INTERNAL_OP_SCOPE(Fill_visit_attributes); + visitor.on_attribute("offset", m_offset); + visitor.on_attribute("fill_value", m_fill_value); + return true; +} + +std::shared_ptr snippets::op::Fill::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(Fill_clone_with_new_inputs); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0), m_offset, m_fill_value); +} + +void snippets::op::Fill::validate_and_infer_types() { + INTERNAL_OP_SCOPE(Fill_validate_and_infer_types); + const auto in_type = get_input_element_type(0); + NGRAPH_CHECK(in_type.size() == 4, "Fill operation supports only element types with 4 byte size but got:" + std::to_string(in_type.size())); + set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); +} + diff --git a/src/common/snippets/src/op/horizon_max.cpp b/src/common/snippets/src/op/horizon_max.cpp new file mode 100644 index 00000000000..37e6e3f3c55 --- /dev/null +++ b/src/common/snippets/src/op/horizon_max.cpp @@ -0,0 +1,28 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "snippets/op/horizon_max.hpp" + +using namespace std; +using namespace ngraph; + +snippets::op::HorizonMax::HorizonMax(const Output& x) : Op({x}) { + constructor_validate_and_infer_types(); +} + +std::shared_ptr snippets::op::HorizonMax::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(HorizonMax_clone_with_new_inputs); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0)); +} + +void 
snippets::op::HorizonMax::validate_and_infer_types() { + INTERNAL_OP_SCOPE(HorizonMax_validate_and_infer_types); + auto new_shape = get_input_partial_shape(0); + if (!ov::is_scalar(new_shape)) { + new_shape[new_shape.size() - 1] = 1lu; + } + set_output_type(0, get_input_element_type(0), new_shape); +} diff --git a/src/common/snippets/src/op/horizon_sum.cpp b/src/common/snippets/src/op/horizon_sum.cpp new file mode 100644 index 00000000000..fa791dec234 --- /dev/null +++ b/src/common/snippets/src/op/horizon_sum.cpp @@ -0,0 +1,28 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "snippets/op/horizon_sum.hpp" + +using namespace std; +using namespace ngraph; + +snippets::op::HorizonSum::HorizonSum(const Output& x) : Op({x}) { + constructor_validate_and_infer_types(); +} + +std::shared_ptr snippets::op::HorizonSum::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(HorizonSum_clone_with_new_inputs); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0)); +} + +void snippets::op::HorizonSum::validate_and_infer_types() { + INTERNAL_OP_SCOPE(HorizonSum_validate_and_infer_types); + auto new_shape = get_input_partial_shape(0); + if (!ov::is_scalar(new_shape)) { + new_shape[new_shape.size() - 1] = 1lu; + } + set_output_type(0, get_input_element_type(0), new_shape); +} diff --git a/src/common/snippets/src/op/kernel.cpp b/src/common/snippets/src/op/kernel.cpp index aebca7edd3a..7003d3ba28c 100644 --- a/src/common/snippets/src/op/kernel.cpp +++ b/src/common/snippets/src/op/kernel.cpp @@ -5,8 +5,14 @@ #include "snippets/op/kernel.hpp" #include "snippets/generator.hpp" -using namespace std; -using namespace ngraph; +namespace ngraph { +namespace snippets { +namespace op { -snippets::op::Kernel::Kernel(const std::vector, snippets::RegInfo>>& nested) : Op(), region(nested) { +Kernel::Kernel(std::vector nested, std::shared_ptr m) +: Op(), region(std::move(nested)), 
model(std::move(m)) { } + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/src/op/load.cpp b/src/common/snippets/src/op/load.cpp index b49d7696fb8..8ee227c7afb 100644 --- a/src/common/snippets/src/op/load.cpp +++ b/src/common/snippets/src/op/load.cpp @@ -8,39 +8,54 @@ #include -using namespace std; -using namespace ngraph; +namespace ngraph { +namespace snippets { +namespace op { -snippets::op::Load::Load(const Output& x, const size_t count) : Op({x}), m_count(count) { +Load::Load(const Output& x, const size_t count, const size_t offset) : MemoryAccess({x}, count, offset) { constructor_validate_and_infer_types(); } -bool snippets::op::Load::visit_attributes(AttributeVisitor& visitor) { - return true; -} - -std::shared_ptr snippets::op::Load::clone_with_new_inputs(const OutputVector& new_args) const { +std::shared_ptr Load::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(Load); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_count); + return std::make_shared(new_args.at(0), m_count, m_offset); } -void snippets::op::Load::validate_and_infer_types() { - set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); + +LoadReshape::LoadReshape(const Output& x, const size_t count, const size_t offset, std::vector order) + : Load(x, count, offset), m_order(std::move(order)) { + const auto& in_shape = x.get_partial_shape(); + NGRAPH_CHECK(in_shape.is_static(), "LoadReshape supports only static input shapes"); + const auto in_shape_size = in_shape.size(); + NGRAPH_CHECK(m_order.size() == in_shape_size, "LoadReshape got new_order of invalid size"); + NGRAPH_CHECK(*std::max_element(m_order.begin(), m_order.end()) == in_shape_size - 1 && + *std::min_element(m_order.begin(), m_order.end()) == 0, "LoadReshape detected invalid values in new_order"); + const std::set unique_dims(order.begin(), order.end()); + 
NGRAPH_CHECK(unique_dims.size() == order.size(), "LoadReshape order must not contain repeated elements"); + constructor_validate_and_infer_types(); } -bool snippets::op::Load::evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const { - INTERNAL_OP_SCOPE(Load); - NGRAPH_CHECK(input_values.size() == this->inputs().size(), "wrong input config"); - NGRAPH_CHECK(output_values.size() == this->outputs().size(), "wrong output config"); - NGRAPH_CHECK(input_values.size() == output_values.size() && input_values.size() == 1, "must be 1->1 operation"); - NGRAPH_CHECK(this->output(0).get_shape() == output_values[0]->get_shape(), "output vector must have the same shape as output port"); - NGRAPH_CHECK(this->input(0).get_shape() == input_values[0]->get_shape(), "input and output must have same shape"); - NGRAPH_CHECK(this->input(0).get_shape() == input_values[0]->get_shape(), "input and output must have same shape"); - - std::copy(input_values[0]->get_data_ptr(), - input_values[0]->get_data_ptr() + shape_size(get_output_shape(0))*output_values[0]->get_element_type().size(), - output_values[0]->get_data_ptr()); +void snippets::op::LoadReshape::validate_and_infer_types() { + const auto& old_shape = get_input_partial_shape(0); + ov::PartialShape new_shape; + for (const auto idx : m_order) + new_shape.push_back(old_shape[idx]); + set_output_type(0, get_input_element_type(0), new_shape); +} +bool snippets::op::LoadReshape::visit_attributes(AttributeVisitor& visitor) { + Load::visit_attributes(visitor); + visitor.on_attribute("order", m_order); return true; } + +std::shared_ptr snippets::op::LoadReshape::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(LoadReshape); + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0), m_count, m_offset, m_order); +} + +}// namespace op +}// namespace snippets +}// namespace ngraph diff --git a/src/common/snippets/src/op/loop.cpp 
b/src/common/snippets/src/op/loop.cpp new file mode 100644 index 00000000000..e1a4de9fef8 --- /dev/null +++ b/src/common/snippets/src/op/loop.cpp @@ -0,0 +1,182 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/op/loop.hpp" +#include "snippets/generator.hpp" + +using namespace std; +namespace ngraph { +namespace snippets { +namespace op { + +LoopBase::LoopBase(const std::vector> &args, size_t work_amount, size_t increment) + : Op(args), work_amount(work_amount), work_amount_increment(increment), evaluate_once(false) { +} + +bool LoopBase::visit_attributes(AttributeVisitor &visitor) { + visitor.on_attribute("work_amount", work_amount); + visitor.on_attribute("increment", work_amount_increment); + return true; +} + +size_t LoopBase::get_work_amount() const { + return work_amount; +} + +bool LoopBase::get_evaluate_once() const { + return evaluate_once; +} + +size_t LoopBase::get_increment() const { + return work_amount_increment; +} + +LoopBegin::LoopBegin(const std::vector> &args, size_t work_amount, size_t work_amount_increment) + : LoopBase(args, work_amount, work_amount_increment), + begin_address(nullptr), input_regs({}) { + // We can only call a reduced validate_and_infer types from the constructor, since LoopEnd might not be attached + // to the LoopBegin at this point (which is usually the case: create LoopBegin first => then attach LoopEnd to it) + validate_and_infer_types_except_LoopEnd(); +} + +LoopBegin::LoopBegin(const std::vector> &args) + : LoopBase(args, 0, 0), begin_address(nullptr), input_regs({}) { + validate_and_infer_types_except_LoopEnd(); +} + +std::shared_ptr LoopBegin::clone_with_new_inputs(const OutputVector& inputs) const { + return std::shared_ptr(new LoopBegin(inputs, work_amount, work_amount_increment)); +} + + +void LoopBegin::validate_and_infer_types_except_LoopEnd() { + const size_t num_inputs = get_input_size(); + set_output_size(num_inputs + 1); + // All outputs are 
by-passed from inputs, except for the last one - it connects LoopBegin and LoopEnd + for (int i = 0; i < num_inputs; i++) + get_output_descriptor(i).set_tensor_ptr(get_input_descriptor(i).get_output().get_tensor_ptr()); + set_output_type(num_inputs, element::f32, ov::PartialShape{ov::Shape{}}); +} + +void LoopBegin::validate_and_infer_types() { + validate_and_infer_types_except_LoopEnd(); + const auto& last_output_inputs = output(get_output_size() - 1).get_target_inputs(); + NODE_VALIDATION_CHECK(this, last_output_inputs.size() == 1, "LoopBegin must have exactly one input attached to the last output"); + const auto& loop_end = ov::as_type_ptr(last_output_inputs.begin()->get_node()->shared_from_this()); + NODE_VALIDATION_CHECK(this, loop_end != nullptr, "LoopBegin must have LoopEnd connected to its last output"); + work_amount = loop_end->get_work_amount(); + work_amount_increment = loop_end->get_increment(); +} + +std::shared_ptr LoopBegin::get_loop_end() { + const auto& last_output_inputs = output(get_output_size() - 1).get_target_inputs(); + if (last_output_inputs.size() != 1) + throw std::invalid_argument("LoopBegin has more than one inputs attached to the last output"); + const auto& loop_end = ov::as_type_ptr(last_output_inputs.begin()->get_node()->shared_from_this()); + if (!loop_end) + throw std::invalid_argument("LoopBegin last output is not connected to LoopEnd"); + return loop_end; +} + +LoopEnd::LoopEnd(const std::vector> &args, size_t work_amount, size_t work_amount_increment, + std::vector apply_increments, std::vector finalization_offsets) + : LoopBase(args, work_amount, work_amount_increment), finalization_offsets(std::move(finalization_offsets)), + has_outer_loop(true), loop_io_size(0) { + ptr_increments.resize(apply_increments.size()); + std::transform(apply_increments.begin(), apply_increments.end(), ptr_increments.begin(), + [work_amount_increment](bool apply) { + return apply ? 
work_amount_increment : 0; + }); + constructor_validate_and_infer_types(); +} + +LoopEnd::LoopEnd(const std::vector> &args, size_t work_amount, size_t work_amount_increment, + std::vector ptr_increments, std::vector finalization_offsets) + : LoopBase(args, work_amount, work_amount_increment), ptr_increments(std::move(ptr_increments)), + finalization_offsets(std::move(finalization_offsets)), has_outer_loop(true), loop_io_size(0) { + constructor_validate_and_infer_types(); +} + +std::shared_ptr LoopEnd::clone_with_new_inputs(const OutputVector& inputs) const { + return std::make_shared(inputs, work_amount, work_amount_increment, ptr_increments, finalization_offsets); +} + +std::shared_ptr LoopEnd::get_loop_begin() { + const auto& loop_begin = ov::as_type_ptr(get_input_source_output(get_input_size() - 1).get_node_shared_ptr()); + if (!loop_begin) + throw std::invalid_argument("LoopEnd last input is not connected to LoopBegin"); + return loop_begin; +} + +const std::vector& LoopEnd::get_finalization_offsets() const { + return finalization_offsets; +} + +const std::vector& LoopEnd::get_ptr_increments()const { + return ptr_increments; +} + +void LoopEnd::set_finalization_offsets(std::vector offsets) { + if (offsets.size() != loop_io_size) + throw std::invalid_argument("LoopEnd set_finalization_offsets is called with inconsistent offsets.size()"); + finalization_offsets = std::move(offsets); +} + +void LoopEnd::set_ptr_increments(std::vector new_ptr_increments) { + if (new_ptr_increments.size() != loop_io_size) + throw std::invalid_argument("LoopEnd set_ptr_increments is called with inconsistent new_ptr_increments.size()"); + ptr_increments = std::move(new_ptr_increments); +} + +void LoopEnd::update_ptr_increments(int64_t new_increment) { + std::transform(ptr_increments.begin(), ptr_increments.end(), ptr_increments.begin(), + [new_increment](int64_t old_increment){ + return old_increment != 0 ? 
new_increment : 0; + }); +} + +void LoopEnd::set_work_amount(size_t new_work_amount) { + work_amount = new_work_amount; + // Update LoopBegin to maintain consistency between the Loops + get_loop_begin()->work_amount = new_work_amount; +} + +void LoopEnd::set_increment(size_t new_increment) { + work_amount_increment = new_increment; + // Update LoopBegin to maintain consistency between the Loops + get_loop_begin()->work_amount_increment = new_increment; +} + +void LoopEnd::set_evaluate_once(bool once) { + evaluate_once = once; + // Update LoopBegin to maintain consistency between the Loops + get_loop_begin()->evaluate_once = once; +} + +void LoopEnd::validate_and_infer_types() { + const size_t num_inputs = get_input_size(); + const auto loop_begin = ov::as_type_ptr(input(get_input_size() - 1).get_source_output().get_node_shared_ptr()); + NODE_VALIDATION_CHECK(this, loop_begin != nullptr, "LoopEnd must have LoopBegin as the last argument"); + // Note: have to -2 because the LoopBegin->LoopEnd edge is counted twice + loop_io_size = get_input_size() + loop_begin->get_output_size() - 2; + NODE_VALIDATION_CHECK(this, ptr_increments.empty() || ptr_increments.size() == loop_io_size, + "ptr_increments must be either empty or defined per every input & output of joined Loop. Expected size: ", + loop_io_size, " got ", ptr_increments.size()); + NODE_VALIDATION_CHECK(this, finalization_offsets.empty() || finalization_offsets.size() == loop_io_size, + "finalization_offsets must be either empty or defined per every input & output of joined Loop. 
Expected size: ", + loop_io_size, " got ", finalization_offsets.size()); + if (ptr_increments.empty()) + ptr_increments.resize(loop_io_size, static_cast(work_amount_increment)); + if (finalization_offsets.empty()) + finalization_offsets.resize(loop_io_size, 0); + set_output_size(num_inputs - 1); + const auto& ins = inputs(); + // All outputs are by-passed from inputs, except for the last one - it connects LoopBegin and LoopEnd + for (int i = 0; i < num_inputs - 1; i++) + get_output_descriptor(i).set_tensor_ptr(get_input_descriptor(i).get_output().get_tensor_ptr()); +} + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/src/op/memory_access.cpp b/src/common/snippets/src/op/memory_access.cpp new file mode 100644 index 00000000000..2530ea77b63 --- /dev/null +++ b/src/common/snippets/src/op/memory_access.cpp @@ -0,0 +1,45 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/op/memory_access.hpp" + +#include + +namespace ngraph { +namespace snippets { +namespace op { + +MemoryAccess::MemoryAccess(const Output& x, const size_t count, const size_t offset) : Op({x}), m_count(count), m_offset(offset) {} + +bool MemoryAccess::visit_attributes(AttributeVisitor& visitor) { + visitor.on_attribute("count", m_count); + visitor.on_attribute("offset", m_offset); + return true; +} + +size_t MemoryAccess::get_count() const { + return m_count; +} + +size_t MemoryAccess::get_offset() const { + return m_offset; +} + +void MemoryAccess::set_count(const size_t count) { + m_count = count; +} + +void MemoryAccess::set_offset(const size_t offset) { + m_offset = offset; +} + +void MemoryAccess::validate_and_infer_types() { + set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); +} + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git 
a/src/common/snippets/src/op/powerstatic.cpp b/src/common/snippets/src/op/powerstatic.cpp deleted file mode 100644 index cc23b40ac01..00000000000 --- a/src/common/snippets/src/op/powerstatic.cpp +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "snippets/op/powerstatic.hpp" - -namespace ngraph { -namespace snippets { -namespace op { - -BWDCMP_RTTI_DEFINITION(PowerStatic); - -} // namespace op -} // namespace snippets -} // namespace ngraph diff --git a/src/common/snippets/src/op/scalar.cpp b/src/common/snippets/src/op/scalar.cpp index 16fa33f2f3a..d89ed94b235 100644 --- a/src/common/snippets/src/op/scalar.cpp +++ b/src/common/snippets/src/op/scalar.cpp @@ -6,8 +6,6 @@ using namespace ngraph; -BWDCMP_RTTI_DEFINITION(snippets::op::Scalar); - std::shared_ptr snippets::op::Scalar::clone_with_new_inputs(const OutputVector& new_args) const { check_new_args_count(this, new_args); return std::make_shared(*this); @@ -22,3 +20,13 @@ void snippets::op::Scalar::validate_and_infer_types() { "Scalar supports only one-element constants, got ", out_pshape.get_shape(), " shape"); } + +bool snippets::op::Scalar::visit_attributes(AttributeVisitor& visitor) { + auto shape = get_output_shape(0); + auto type = get_output_element_type(0); + auto value = cast_vector(); + visitor.on_attribute("element_type", type); + visitor.on_attribute("shape", shape); + visitor.on_attribute("value", value); + return true; +} diff --git a/src/common/snippets/src/op/store.cpp b/src/common/snippets/src/op/store.cpp index d75101be0c8..2cee1b20751 100644 --- a/src/common/snippets/src/op/store.cpp +++ b/src/common/snippets/src/op/store.cpp @@ -8,39 +8,19 @@ #include -using namespace std; -using namespace ngraph; +namespace ngraph { +namespace snippets { +namespace op { -snippets::op::Store::Store(const Output& x, const size_t count) : Op({x}), m_count(count) { +snippets::op::Store::Store(const Output& x, const size_t count, 
const size_t offset) : MemoryAccess({x}, count, offset) { constructor_validate_and_infer_types(); } - -bool snippets::op::Store::visit_attributes(AttributeVisitor& visitor) { - return true; -} - std::shared_ptr snippets::op::Store::clone_with_new_inputs(const OutputVector& new_args) const { - INTERNAL_OP_SCOPE(Store); + INTERNAL_OP_SCOPE(Store_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_count); + return std::make_shared(new_args.at(0), m_count, m_offset); } -void snippets::op::Store::validate_and_infer_types() { - set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); -} - -bool snippets::op::Store::evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const { - INTERNAL_OP_SCOPE(Store); - NGRAPH_CHECK(input_values.size() == this->inputs().size(), "wrong input config"); - NGRAPH_CHECK(output_values.size() == this->outputs().size(), "wrong output config"); - NGRAPH_CHECK(input_values.size() == output_values.size() && input_values.size() == 1, "must be 1->1 operation"); - NGRAPH_CHECK(this->output(0).get_shape() == output_values[0]->get_shape(), "output vector must have the same shape as output port"); - NGRAPH_CHECK(this->input(0).get_shape() == input_values[0]->get_shape(), "input and output must have same shape"); - NGRAPH_CHECK(this->input(0).get_shape() == input_values[0]->get_shape(), "input and output must have same shape"); - - std::copy(input_values[0]->get_data_ptr(), - input_values[0]->get_data_ptr() + shape_size(get_output_shape(0))*output_values[0]->get_element_type().size(), - output_values[0]->get_data_ptr()); - - return true; -} +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index 4d8fc5ad100..7cfd6a46605 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -9,13 +9,22 @@ 
#include "snippets/op/convert_saturation.hpp" #include "snippets/pass/insert_load_store.hpp" #include "snippets/pass/insert_movebroadcast.hpp" +#include "snippets/pass/broadcast_to_movebroadcast.hpp" #include "snippets/pass/load_movebroadcast_to_broadcastload.hpp" #include "snippets/pass/assign_registers.hpp" #include "snippets/pass/convert_constants.hpp" #include "snippets/pass/convert_power_to_powerstatic.hpp" #include "snippets/pass/vector_to_scalar.hpp" +#include "snippets/pass/insert_loops.hpp" +#include "snippets/pass/transpose_decomposition.hpp" #include "snippets/pass/transform_convert.hpp" #include "snippets/pass/align_element_type.hpp" +#include "snippets/pass/matmul_to_brgemm.hpp" +#include "snippets/pass/fuse_transpose_brgemm.hpp" +#include "snippets/pass/softmax_decomposition.hpp" +#include "snippets/pass/reset_buffer.hpp" +#include "snippets/pass/insert_buffer.hpp" +#include "snippets/pass/loop_fusion.hpp" #include "snippets/utils.hpp" #include "transformations/common_optimizations/nop_elimination.hpp" @@ -34,27 +43,43 @@ using namespace std; using namespace ngraph; using namespace ov::op::util; -BWDCMP_RTTI_DEFINITION(snippets::op::Subgraph); - void snippets::op::Subgraph::set_generator(std::shared_ptr generator) { m_generator = generator; } -void snippets::op::Subgraph::set_non_scalar_constants_count(const size_t count) { - m_non_scalar_constants_count = count; +void snippets::op::Subgraph::set_virtual_port_count(const size_t count) { + m_virtual_port_count = count; +} + +void snippets::op::Subgraph::set_buffer_needed(const bool need) { + m_buffer_needed = need; +} + +void snippets::op::Subgraph::init_config() { + const auto ops = body_ptr()->get_ops(); + for (const auto& op : ops) { + config.m_is_quantized = config.m_is_quantized || + ov::is_type(op); + config.m_has_type_relaxed_ops = config.m_has_type_relaxed_ops || + std::dynamic_pointer_cast(op); + config.m_is_needed_to_align_precision = config.m_is_needed_to_align_precision || + is_quantized() 
|| + has_type_relaxed_ops() || + snippets::pass::AlignElementType::opNeedsAlignElementType(op, execution_element_type); + config.m_has_domain_sensitive_ops = config.m_has_domain_sensitive_ops || + ov::is_type(op) || + ov::is_type(op) || + ov::is_type(op) || + ov::is_type(op); + } + // Domain sensitive ops are decomposed with explicit Loops. So, we should explicitly insert Loops in Subgraph if it contains these ops + config.m_explicit_loop_insertion = config.m_has_domain_sensitive_ops; } snippets::op::Subgraph::Subgraph(const OutputVector& args, std::shared_ptr body) - : SubGraphOp(args) { + : SubGraphOp(args), m_generator(nullptr) { set_function(body); - const auto ops = body_ptr()->get_ops(); - for (const auto& op : ops) { - config.m_is_quantized = config.m_is_quantized || ov::is_type(op); - config.m_has_type_relaxed_ops = config.m_has_type_relaxed_ops || std::dynamic_pointer_cast(op); - config.m_is_needed_to_align_precision = config.m_is_needed_to_align_precision || is_quantized() || has_type_relaxed_ops() || - snippets::pass::AlignElementType::opNeedsAlignElementType(op, execution_element_type); - } - + init_config(); constructor_validate_and_infer_types(); for (size_t i = 0; i < body->get_parameters().size(); ++i) m_input_descriptions[0].push_back(std::make_shared(i, i)); @@ -64,13 +89,43 @@ snippets::op::Subgraph::Subgraph(const OutputVector& args, std::shared_ptr body) - : Subgraph(as_output_vector(args), body) {} + : Subgraph(as_output_vector(args), std::move(body)) {} std::shared_ptr snippets::op::Subgraph::clone_with_new_inputs(const OutputVector& inputs) const { INTERNAL_OP_SCOPE(Subgraph); return make_shared(inputs, ov::clone_model(body())); } +std::vector snippets::op::Subgraph::reshape_body(const std::vector& input_shapes) { + auto& params = body_ptr()->get_parameters(); + OPENVINO_ASSERT(params.size() == input_shapes.size(), "Got invalid number of input shapes to reshape subgraph body"); + for (size_t i = 0; i < params.size(); ++i) { + 
params[i]->set_partial_shape(input_shapes[i]); + } + body_ptr()->validate_nodes_and_infer_types(); + std::vector output_shapes; + for (const auto& res : body_ptr()->get_results()) { + output_shapes.emplace_back(res->get_input_partial_shape(0)); + } + return output_shapes; +} + +std::vector snippets::op::Subgraph::reshape_body(const std::vector& input_shapes) { + auto& params = body_ptr()->get_parameters(); + OPENVINO_ASSERT(params.size() == input_shapes.size(), "Got invalid number of input shapes to reshape subgraph body"); + for (size_t i = 0; i < params.size(); ++i) { + params[i]->set_partial_shape(input_shapes[i]); + } + body_ptr()->validate_nodes_and_infer_types(); + std::vector output_shapes; + for (const auto& res : body_ptr()->get_results()) { + auto pshape = res->get_input_partial_shape(0); + OPENVINO_ASSERT(pshape.is_static(), "Subgraph inferred dynamic output shape during reshape with static inputs"); + output_shapes.emplace_back(res->get_input_partial_shape(0).get_shape()); + } + return output_shapes; +} + void snippets::op::Subgraph::validate_and_infer_types() { INTERNAL_OP_SCOPE(Subgraph); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::validate_and_infer_types") @@ -111,8 +166,10 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptrinput_values()) { - if ((utils::is_scalar_constant(input.get_node_shared_ptr())) || - (ov::is_type(node) && ov::is_type(input.get_node_shared_ptr()))) { + if (ov::is_type(input.get_node_shared_ptr()) && + (ngraph::shape_size(input.get_shape()) == 1 || + ov::is_type(node) || + constant_input_should_be_inside_body(node))) { body_inputs.push_back(input); } else { auto parameter = std::make_shared(input.get_element_type(), input.get_partial_shape()); @@ -142,9 +199,17 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptrget_friendly_name(), body_results, body_parameters); auto subgraph = build_subgraph(node, subgraph_inputs, body); + bool need_buffer = 
false; + size_t hidden_data_count = 0lu; if (auto fq_node = ov::as_type_ptr(node)) { - subgraph->set_non_scalar_constants_count(utils::get_non_scalar_constant_count_for_fq(fq_node)); + hidden_data_count += utils::get_non_scalar_constant_count_for_fq(fq_node); + // Ops that requires Buffer + } else if (ov::is_type(node) || + ov::is_type(node)) { + need_buffer |= true; } + subgraph->set_virtual_port_count(hidden_data_count); + subgraph->set_buffer_needed(need_buffer); for (size_t i = 0; i < body->get_parameters().size(); i++) { body->get_parameters()[i]->set_friendly_name(body_parameters[i]->get_friendly_name()); @@ -170,6 +235,13 @@ void snippets::op::Subgraph::fill_empty_output_names(const Output& target_ NGRAPH_SUPPRESS_DEPRECATED_END } +auto snippets::op::Subgraph::constant_input_should_be_inside_body(const std::shared_ptr& node) -> bool { + return ov::is_type(node) || + ov::is_type(node) || + ov::is_type(node) || + ov::is_type(node); +} + /// /// \brief Canonization transforms original subgraph and to canonical form suitable for code generation. In particular, /// it handles supported layout conversions, broadcasts inputs and outputs to a single rank and layout. Canonicalization @@ -178,7 +250,8 @@ void snippets::op::Subgraph::fill_empty_output_names(const Output& target_ /// * None: all inputs have the same layout /// * Planar + blocked: some inputs have blocked, and some have planar layouts, e.g. 
+ /// Also there is precision aligning inside body of subgraph during canonicalization -Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes) { +ov::PartialShape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShapes, + const BlockedShapeVector& inputShapes) { INTERNAL_OP_SCOPE(Subgraph); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::canonicalize") NODE_VALIDATION_CHECK(this, inputShapes.size() == body_ptr()->get_parameters().size(), @@ -193,30 +266,29 @@ Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShape return std::get<0>(lhs).size() < std::get<0>(rhs).size(); }); }; - Shape baseShape; + PartialShape baseShape; AxisVector baseOrder; std::tie(baseShape, baseOrder, std::ignore) = getMaxRankBlockedShape(inputShapes); const auto baseRank = baseShape.size(); const bool baseIsBlocked = baseOrder.size() != std::set(baseOrder.begin(), baseOrder.end()).size(); for (size_t i = 0; i < inputShapes.size(); i++) { const auto &blockedShape = inputShapes[i]; - Shape inShape; + PartialShape inShape; AxisVector inOrder; element::Type inType; std::tie(inShape, inOrder, inType) = blockedShape; const auto inRank = inShape.size(); NODE_VALIDATION_CHECK(this, inRank <= baseRank, "Input rank can't be larger than output rank in snippets."); if (inRank < baseRank) { - Shape newShape(baseRank, 1); + PartialShape newShape(ov::Shape(baseRank, 1)); // todo: more complicated logics is needed if we want to merge smth else than blocked and planar - // could be done by PartialShape::broadcast_merge_into, but this way is faster - size_t startOffset = baseRank - inRank; if (baseIsBlocked) { const bool inIsNotBlocked = inOrder.size() == std::set(inOrder.begin(), inOrder.end()).size(); NODE_VALIDATION_CHECK(this, inIsNotBlocked, "Snippets don't support conversion between blocked layouts of different ranks"); - startOffset--; + 
inShape.insert(inShape.end(), ov::Dimension(1)); } - std::copy(inShape.begin(), inShape.end(), &newShape[startOffset]); + NODE_VALIDATION_CHECK(this, PartialShape::broadcast_merge_into(newShape, inShape, ov::op::AutoBroadcastType::NUMPY), + "Failed to broadcast_merge inputs in snippets canonicalization"); inShape = std::move(newShape); } else { // todo: 4d blocked + 5d planar layouts are not supported: + @@ -225,55 +297,66 @@ Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShape "Snippets canonicalization got input shapes of equal ranks but different layouts, which is not supported"); } ov::PartialShape tmpPShape(baseShape); - NODE_VALIDATION_CHECK(this, - PartialShape::broadcast_merge_into(tmpPShape, inShape, ::ngraph::op::AutoBroadcastType::NUMPY), - "Failed to create broadcastable shapes in snippets canonicalization"); - const auto paramShape = body_ptr()->get_parameters()[i]->get_shape(); + // todo: we need to generalize canonicalization for domain-sensitive ops. E.g. 
MatMul inputs can't be broadcasted one to another + if (!config.m_has_domain_sensitive_ops) + NODE_VALIDATION_CHECK(this, + PartialShape::broadcast_merge_into(tmpPShape, inShape, ::ngraph::op::AutoBroadcastType::NUMPY), + "Failed to create broadcastable shapes in snippets canonicalization"); + const auto paramShape = body_ptr()->get_parameters()[i]->get_partial_shape(); const auto paramType = body_ptr()->get_parameters()[i]->get_element_type(); if (paramShape.size() != inShape.size() || !equal(paramShape.begin(), paramShape.end(), inShape.begin())) body_ptr()->replace_parameter(i, std::make_shared(paramType, inShape)); } - body_ptr()->validate_nodes_and_infer_types(); - auto skipStartEndOnes = [](const Shape& shape) { + auto skipStartEndOnes = [](const PartialShape& shape) { auto begin = shape.begin(); auto end = shape.end(); while (begin != end && *begin == 1) begin++; while (begin != end && *(end-1) == 1) end--; - Shape trimmedShape(end - begin, 1); + + PartialShape trimmedShape(std::vector (end - begin, 1)); std::copy(begin, end, trimmedShape.begin()); return trimmedShape; }; // Check that output shapes are broadcastable => can be scheduled const auto& body_results = body_ptr()->get_results(); - PartialShape outPShape = body_results[0]->get_shape(); - for (size_t i = 0; i < body_results.size(); i++) { - auto shape_i = body_results[i]->get_shape(); - auto outputShape_i = std::get<0>(outputShapes[i]); - // Check that the produced output shape corresponds to the passed shape - // Some produced shapes may have been changed to be broadcastable (e.g. 
blocked + planar outputs), - // so we need to remove leading and trailing "1" before the comparison - PartialShape pShape_i(skipStartEndOnes(shape_i)); - bool compatibleWithPassedShape = PartialShape::broadcast_merge_into(pShape_i, skipStartEndOnes(outputShape_i), - ::ngraph::op::AutoBroadcastType::NUMPY); - NODE_VALIDATION_CHECK(this, ov::shape_size(shape_i) == ov::shape_size(outputShape_i) && - compatibleWithPassedShape, "Inferred and passed results shapes are incompatible for snippet ", - get_friendly_name(), " : ", shape_i, " vs ", outputShape_i, "."); - // Check that output shapes are broadcastable to each other => can be scheduled - bool compatibleWithOtherOutputs = PartialShape::broadcast_merge_into(outPShape, shape_i, - ::ngraph::op::AutoBroadcastType::NUMPY); - NODE_VALIDATION_CHECK(this, compatibleWithOtherOutputs, "Snippets output shapes must be numpy broadcastable"); + PartialShape outPShape = body_results[0]->get_input_partial_shape(0); + // todo: we need a slightly more general approach for backward ROI propagation + const auto& result_parent = body_results[0]->get_input_node_shared_ptr(0); + if (body_results.size() == 1 && + ov::is_type(result_parent) && + ov::is_type(result_parent->get_input_node_shared_ptr(0))) { + outPShape = result_parent->get_input_partial_shape(0); + } else { + for (size_t i = 0; i < body_results.size(); i++) { + auto shape_i = body_results[i]->get_input_partial_shape(0); + auto outputShape_i = std::get<0>(outputShapes[i]); + // Check that the produced output shape corresponds to the passed shape + // Some produced shapes may have been changed to be broadcastable (e.g. 
blocked + planar outputs), + // so we need to remove leading and trailing "1" before the comparison + PartialShape pShape_i(skipStartEndOnes(shape_i)); + bool compatibleWithPassedShape = PartialShape::broadcast_merge_into(pShape_i, + skipStartEndOnes(outputShape_i), + ::ngraph::op::AutoBroadcastType::NUMPY); + NODE_VALIDATION_CHECK(this, compatibleWithPassedShape, + "Inferred and passed results shapes are incompatible for snippet "); + // Check that output shapes are broadcastable to each other => can be scheduled + bool compatibleWithOtherOutputs = PartialShape::broadcast_merge_into(outPShape, shape_i, + ::ngraph::op::AutoBroadcastType::NUMPY); + NODE_VALIDATION_CHECK(this, compatibleWithOtherOutputs, + "Snippets output shapes must be numpy broadcastable"); + } } // We should insert Converts after Parameters and Constant and before Results // to align precision inside Subgraph body that is supported by Plugin align_element_types(outputShapes, inputShapes); - exec_domain = outPShape.get_shape(); - return exec_domain; + master_shape = outPShape; + return master_shape; } void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outputShapes, @@ -303,55 +386,209 @@ void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outpu // - Insert Convert before operations that doesn't support original element type for execution // - Insert reverse Convert before operations that support original element type // but have inputs that doesn't support it (because before them will be inserted Convert with exec_type - first point) - // Then we should use ConstantFolding pass to convert element type of Scalars before inference. + // - Then we should use ConstantFolding pass to convert element type of Scalars before inference. 
+ // - Eliminate redundant Converts which can be inserted in AlignElementType() pass ngraph::pass::Manager manager; if (config.m_is_needed_to_align_precision) { manager.register_pass(execution_element_type); manager.register_pass(); + // TODO [100041] : In some cases AlignElementType pass can insert extra Convert because + // the pass doesn't know real precisions in real time. + // We call EliminateConverts pass to remove them + manager.register_pass(); } manager.run_passes(body_ptr()); } +void snippets::op::Subgraph::initialize_buffer_scratchpad_size() { + auto is_transpose_loop = [](const ov::Output& source_output) -> bool { + const auto parent = source_output.get_node_shared_ptr(); + // Transpose op is decomposed into LoopBegin->LoadReshape->Store->LoopEnd subgraph. LoadReshape op can be only + // in Transpose decomposition. So it's enough to verify that this Loop is Transpose pattern. + // We cannot check for non-equality of input and output shape of Transpose Loop because Transpose may have the same + // shapes on input and output. 
+ auto loop_end = ov::as_type_ptr(parent); + if (!loop_end) + return false; + size_t idx = source_output.get_index(); + while (ov::is_type(loop_end->get_input_node_shared_ptr(idx))) { + auto consumer = loop_end->input_value(idx); + idx = consumer.get_index(); + loop_end = ov::as_type_ptr(consumer.get_node_shared_ptr()); + } + + const auto loop_begin = loop_end->get_loop_begin(); + // At the moment Transpose Loops cannot be fused with other Loops, so check for one input and one output is enough + if (loop_begin->get_input_size() != 1 || loop_end->get_output_size() != 1 || loop_begin->get_output_target_inputs(0).size() != 1) + return false; + const auto consumer = loop_begin->get_output_target_inputs(0).begin()->get_node(); + return ov::is_type(consumer); + }; + auto propagate_offset = [](const std::shared_ptr& buffer, const size_t offset) { + // If Buffer has offset We set this offset in the next Load and Store ops + // to correctly read and write data because all buffers have the one register + // Also if user sets offset to a Buffer It means that the Buffer has the corresponding Load and Store ops + + // Propagate to up: in Store. 
Buffer can have only one Store + { + auto parent = buffer->get_input_node_shared_ptr(0); + auto idx = buffer->input(0).get_source_output().get_index(); + // There may be graph with several LoopBegin and LoopEnd between Store/Brgemm and Buffer, + // so we should iterate through LoopBase + while (ov::is_type(parent)) { + const auto source_output = parent->input_value(idx); + parent = source_output.get_node_shared_ptr(); + idx = source_output.get_index(); + } + if (auto store = ov::as_type_ptr(parent)) { + store->set_offset(offset); + } else if (const auto brgemm = ov::as_type_ptr(parent)) { + // Brgemm encapsulates work with loading and storing of data + brgemm->set_offset_c(offset); + } else { + throw ngraph_error("Buffer::set_offset() was called when Buffer didn't have the corresponding Store op for offset propagation"); + } + } + + // Propagate to down: in Load. Buffer can have several Load and Loops after himself. We should go through all target inputs + { + std::function&)> propagate_down; + propagate_down = [&](const Input& target_input) { + const auto child = target_input.get_node()->shared_from_this(); + // There may be graph with several LoopBegin and LoopEnd between Load/Brgemm and Buffer, + // so we should iterate through LoopBase + // Example: Softmax decomposition with ReduceMax + if (ov::is_type(child)) { + const auto index = target_input.get_index(); + for (const auto loop_target_output : child->output(index).get_target_inputs()) { + propagate_down(loop_target_output); + } + } else if (const auto load = ov::as_type_ptr(child)) { + load->set_offset(offset); + } else if (const auto brgemm = ov::as_type_ptr(child)) { + // Brgemm encapsulates work with loading and storing of data + if (target_input.get_index() == 0) { + brgemm->set_offset_a(offset); + } else if (target_input.get_index() == 1) { + brgemm->set_offset_b(offset); + } + } else { + throw ngraph_error("Buffer::set_offset() was called when Buffer didn't have the corresponding Load op for offset 
propagation"); + } + }; + + for (const auto target_output : buffer->output(0).get_target_inputs()) { + propagate_down(target_output); + } + } + }; + m_buffer_scratchpad = 0; + size_t offset = 0; + const auto ops = body_ptr()->get_ordered_ops(); + for (const auto& op : ops) { + if (const auto buffer = ov::as_type_ptr(op)) { + const auto buffer_size = buffer->get_byte_size(); + // We need to allocate memory for first buffer at least + if (m_buffer_scratchpad == 0) { + m_buffer_scratchpad += buffer_size; + continue; + } + + // Transpose and MatMul ops should have different memories on inputs and outputs to avoid data corruption, + // so after them, we should allocate new memory. Other operations (Eltwises, Convert) can be executed inplace. + const auto parent = buffer->get_input_node_shared_ptr(0); + if (ov::is_type(parent) || is_transpose_loop(parent)) { + offset = m_buffer_scratchpad; + propagate_offset(buffer, offset); + m_buffer_scratchpad += buffer_size; + continue; + } + + // If Buffer op requires memory size more that has been already allocated, + // we increase current memory size to the needed size + // For example, it's possible when we have a sequence of Eltwise ops with broadcasting + const auto current_allocated_memory_size = m_buffer_scratchpad - offset; + if (buffer_size > current_allocated_memory_size) { + m_buffer_scratchpad += (buffer_size - current_allocated_memory_size); + // Note: we don't update offset because we just add memory to needed size + } + + propagate_offset(buffer, offset); + } + } +} + void snippets::op::Subgraph::convert_to_snippet_dialect() { INTERNAL_OP_SCOPE(Subgraph); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::convert_to_snippet_dialect") auto skip_matching_domain = [](const std::shared_ptr& n) -> bool { - return n->get_input_shape(0).back() != 1; + const auto& pshape = n->get_input_partial_shape(0); + const auto& last_dim = pshape[pshape.size() - 1]; + return last_dim.is_dynamic() || 
last_dim.get_length() != 1; }; // At the moment we support only full vector Load/Store and scalar Load/Store so that count is equal to lanes. // Then we are going to support variadic Load/Store with different element count const size_t count = m_generator->get_target_machine()->get_lanes(); + const auto & params = body_ptr()->get_parameters(); + bool inputs_has_dynamic_last_dims = std::any_of(params.begin(), params.end(), + [](const shared_ptr& p){ + return p->get_partial_shape().rbegin()->is_dynamic(); + }); + const auto allocationRank = static_cast(tileRank); ngraph::pass::Manager manager; + if (config.m_has_domain_sensitive_ops) { + manager.register_pass(); + manager.register_pass(); + manager.register_pass(allocationRank); + manager.register_pass(count, allocationRank); + manager.register_pass(); + } + manager.register_pass(); manager.register_pass(); manager.register_pass(); manager.register_pass(count); manager.register_pass(count); - manager.register_pass(); - manager.register_pass(); - // Note that, BrodacastMove is typically inserted right after the Load. Such cases are typical for - // simple subgraphs where one of the ngraph::op's inputs is broadcasted to match the larger one. However, BroadcastMove - // could also be inserted after the ngraph::op, if the op input don't need broadcasting, but the the output does - // (for example, to match the larger output of a child node). In such cases, Loads (and Stores) should be replaced - // with ScalarLoads (ScalarStores) to avoid invalid read in vector Tile. Graph example: - // Parameter_0 Parameter_1 Parameter_2 - // [1,2,5,16] [1,2,5,1] [1,2,5,1] - // Load BroadcastLoad Load* Scalar - // Add Subtract - // \___________ ___________BroadcastMove - // \ / - // Multiply - // Store - // Result - // Note: Load* should be replaced with ScalarLoad in this example to avoid invalid read in vector Tile. 
- if (!exec_domain.empty() && exec_domain.back() != 1) { - manager.register_pass(); - manager.register_pass(); - manager.get_pass_config()-> - set_callback(skip_matching_domain); - manager.get_pass_config()-> - set_callback(skip_matching_domain); + // todo: presently dynamic pipeline is activated even if the last two dimension are static + // In general, we can use static kernels in this case, but several parameters (src and dst memory pointers for example) + // should be passed as run-time args, so it's a mixed mode: kernel is shape-aware, but some additional runtime args are required + // Presently Broadcasting is organized in the following way: + // * ALL last dims are static => broadcasting is handled via MoveBroadcast and pointer arithmetics (even for dynamic upper dims) + if (!inputs_has_dynamic_last_dims) { + manager.register_pass(); + manager.register_pass(); + // Note that, BrodacastMove is typically inserted right after the Load. Such cases are typical for + // simple subgraphs where one of the ngraph::op's inputs is broadcasted to match the larger one. However, BroadcastMove + // could also be inserted after the ngraph::op, if the op input don't need broadcasting, but the output does + // (for example, to match the larger output of a child node). In such cases, Loads (and Stores) should be replaced + // with ScalarLoads (ScalarStores) to avoid invalid read in vector Loop. Graph example: + // Parameter_0 Parameter_1 Parameter_2 + // [1,2,5,16] [1,2,5,1] [1,2,5,1] + // Load BroadcastLoad Load* Scalar + // Add Subtract + // \___________ ___________BroadcastMove + // \ / + // Multiply + // Store + // Result + // Note: Load* should be replaced with ScalarLoad in this example to avoid invalid read in vector Loop. 
+ if (master_shape.size() != 0 && master_shape[master_shape.size() - 1] != 1) { + manager.register_pass(); + manager.register_pass(); + manager.get_pass_config()-> + set_callback(skip_matching_domain); + manager.get_pass_config()-> + set_callback(skip_matching_domain); + } + // Note that InsertLoops requires validate_and_infer_types afterwards, so add it manually if + // automatic validation will be disabled in the pass manager + manager.register_pass(master_shape, tileRank, + m_generator->get_target_machine()->get_lanes(), !config.m_explicit_loop_insertion); + if (config.m_has_domain_sensitive_ops) { + manager.register_pass(); + manager.register_pass(); + } } manager.run_passes(body_ptr()); } @@ -380,29 +617,29 @@ snippets::Schedule snippets::op::Subgraph::generate(ngraph::pass::Manager& opt, INTERNAL_OP_SCOPE(Subgraph); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::generate") NGRAPH_CHECK(m_generator != nullptr, "generate is called while generator is not set"); + convert_to_snippet_dialect(); opt.run_passes(body_ptr()); - // generation flow + // After all passes, when all optimizations are completed and all MemoryAccess ops are inserted, + // we can calculate common buffer scratchpad size and propagate offset from Buffer to the corresponding MemoryAccess ops + if (config.m_has_domain_sensitive_ops) + initialize_buffer_scratchpad_size(); + snippets::pass::AssignRegisters().run_on_model(body_ptr()); - // schedule generation should go here and be target agnostic + const auto ops = body_ptr()->get_ops(); + ngraph::snippets::Generator::GeneratorConfig generatorConfig; + generatorConfig.m_save_lowered_code = config.m_has_domain_sensitive_ops; + generatorConfig.m_need_fill_tail_register = config.m_has_domain_sensitive_ops; + generatorConfig.m_optimize_single_evaluation = std::none_of(ops.begin(), ops.end(), [](const std::shared_ptr& op) { + return ov::is_type(op); + }); // actual code emission - ngraph::snippets::code ptr = 
m_generator->generate(body_ptr(), compile_params); + ngraph::snippets::code ptr = m_generator->generate(body_ptr(), generatorConfig, compile_params); - // check that body doesn't have constants for scheduling - std::vector> constants; - for (auto op : body_ptr()->get_ordered_ops()) { - if (auto constant = ov::as_type_ptr(op)) { - if (ngraph::shape_size(constant->get_shape()) != 1 && constant->get_shape() != Shape()) { - constants.push_back(constant); - } - } - } - NGRAPH_CHECK(!constants.size(), "External constants detected. Snippet is illigal for scheduling"); - - return {exec_domain, false /*canBeLinearized*/, ptr}; + return {master_shape, false /*canBeLinearized*/, ptr}; } void snippets::op::Subgraph::print() const { diff --git a/src/common/snippets/src/op/tile.cpp b/src/common/snippets/src/op/tile.cpp deleted file mode 100644 index 779df920600..00000000000 --- a/src/common/snippets/src/op/tile.cpp +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "snippets/op/tile.hpp" -#include "snippets/generator.hpp" - -using namespace std; -using namespace ngraph; - -snippets::op::Tile::Tile(const std::vector& region, size_t increment, - size_t num_inputs, size_t num_outputs, - const std::vector& io_dims, const std::vector& io_data_sizes) : - Op(), region(region), increment(increment), num_inputs(num_inputs), num_outputs(num_outputs), io_dims(io_dims), io_data_size(io_data_sizes) { -} diff --git a/src/common/snippets/src/op/tile_scheduler.cpp b/src/common/snippets/src/op/tile_scheduler.cpp deleted file mode 100644 index a613184dc62..00000000000 --- a/src/common/snippets/src/op/tile_scheduler.cpp +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "snippets/op/tile_scheduler.hpp" -#include "snippets/generator.hpp" - -ngraph::snippets::op::TileScheduler::TileScheduler(const AllocatedEmitter& vector_region, const 
AllocatedEmitter& scalar_region) - : Op(), vector_region{vector_region}, scalar_region{scalar_region} { -} diff --git a/src/common/snippets/src/op/vector_buffer.cpp b/src/common/snippets/src/op/vector_buffer.cpp new file mode 100644 index 00000000000..1be69a6d9ad --- /dev/null +++ b/src/common/snippets/src/op/vector_buffer.cpp @@ -0,0 +1,27 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/op/vector_buffer.hpp" + +#include + +using namespace std; +using namespace ngraph; + +snippets::op::VectorBuffer::VectorBuffer(const ov::element::Type element_type) : Op(), m_element_type(std::move(element_type)) { + constructor_validate_and_infer_types(); +} + +std::shared_ptr snippets::op::VectorBuffer::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(VectorBuffer_clone_with_new_inputs); + check_new_args_count(this, new_args); + return std::make_shared(m_element_type); +} + +void snippets::op::VectorBuffer::validate_and_infer_types() { + INTERNAL_OP_SCOPE(VectorBuffer_validate_and_infer_types); + set_output_type(0, m_element_type, Shape{1lu}); +} diff --git a/src/common/snippets/src/pass/align_element_type.cpp b/src/common/snippets/src/pass/align_element_type.cpp index 2ce3b031aca..469c82ffe22 100644 --- a/src/common/snippets/src/pass/align_element_type.cpp +++ b/src/common/snippets/src/pass/align_element_type.cpp @@ -20,13 +20,17 @@ inline auto is_in_op(const std::shared_ptr& n) -> bool { || ov::is_type(n); } -// At the moment Subgraph supports only Eltwise, Convert and FQ (which is decomposed into Eltwises and Convert) -// And only Eltwises supports execution only in "exec_type". 
So we can check op type from the opposite +// At the moment Subgraph supports only Eltwise, Select, Convert, Broadcast and FQ (which is decomposed into Eltwises and Convert) with +// Softmax (which is decomposed into Eltwises as well) +// And only Eltwise and Select ops supports execution only in "exec_type". So we can check op type from the opposite // NOTE: This check is only for executable which isn't Parameter/Constant/Result inline auto op_supports_only_exec_type(const std::shared_ptr& n) -> bool { return !is_in_op(n) && !ov::is_type(n) && - !ov::is_type(n); + !ov::is_type(n) && + !ov::is_type(n) && + !ov::is_type(n) && + !ov::is_type(n); } } // namespace @@ -58,7 +62,8 @@ bool ngraph::snippets::pass::AlignElementType::run_on_model(const std::shared_pt // - Input is Convert with unsupported destination type // - Input is Op which support any element type // We couldn't unite these conditions and just check that element type isn't supported exec type - // because we don't call validate_and_infer_types() so we don't know new precisions + // because we don't call validate_and_infer_types() so we don't know new precisions after setting of original + // input and output element types if ((existing_convert && existing_convert->get_destination_type() != exec_type) || (!op_supports_only_exec_type(shared_input))) { insertConvert(op, i, exec_type); @@ -89,6 +94,6 @@ bool ngraph::snippets::pass::AlignElementType::run_on_model(const std::shared_pt } bool ngraph::snippets::pass::AlignElementType::opNeedsAlignElementType(const std::shared_ptr& op, const ov::element::Type exec_type) { - // At the moment Snippets support only Eltwise/Convert/FQ which one output so we can just call get_element_type() + // At the moment Snippets support only Eltwise/Convert/FQ/Select/Softmax/Broadcast which one output so we can just call get_element_type() return op_supports_only_exec_type(op) && op->get_element_type() != exec_type; } diff --git 
a/src/common/snippets/src/pass/assign_registers.cpp b/src/common/snippets/src/pass/assign_registers.cpp index 836523ed727..bd864d65f22 100644 --- a/src/common/snippets/src/pass/assign_registers.cpp +++ b/src/common/snippets/src/pass/assign_registers.cpp @@ -2,81 +2,208 @@ // SPDX-License-Identifier: Apache-2.0 // -// #include #include -#include "snippets/remarks.hpp" - #include "snippets/pass/assign_registers.hpp" #include "snippets/snippets_isa.hpp" - -#include - #include +namespace { +static constexpr size_t reg_count = 16lu; +} // namespace + bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr& f) { RUN_ON_MODEL_SCOPE(AssignRegisters); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::AssignRegisters") using Reg = size_t; + using tensor = std::shared_ptr; auto ops = f->get_ordered_ops(); - decltype(ops) stmts; - std::copy_if(ops.begin(), ops.end(), std::back_inserter(stmts), [](decltype(ops[0]) op) { - return !(std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op)); - }); + // Note that currently there are 3 types of ops: + // * gpr->gpr: (Parameter, Result, LoopBegin, LoopEnd) will also be Buffer? + // * gpr->vec: or vec->gpr Load/LoadConvert, Store/StoreConvert, BroadcastLoad etc. + // * vec->vec: all other "normal" operations that perform calculations on vector registers: Add, BroadcastMove, Power, etc. 
+ enum op_reg_type {gpr2gpr, gpr2vec, vec2gpr, vec2vec}; - size_t rdx = 0; - std::map, Reg> regs; - for (const auto& op : stmts) { - for (const auto& output : op->outputs()) { - regs[output.get_tensor_ptr()] = rdx++; + auto get_op_reg_type = [](const std::shared_ptr& op) { + if (std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op)) + return gpr2gpr; + else if (std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op)) + return gpr2vec; + else if (std::dynamic_pointer_cast(op)) + return vec2gpr; + else + return vec2vec; + }; + std::vector>> typed_ops; + for (const auto& op : ops) + typed_ops.emplace_back(std::make_pair(get_op_reg_type(op), op)); + size_t counter_vec = 0; + size_t counter_gpr = 0; + std::map regs_vec, regs_gpr; + // Define a set of immune tensors that will be ignored by auto reg allocation => their reg allocation is done manually + std::map manually_assigned_gprs, manually_assigned_vecs; + const auto IS_MANUALLY_ALLOCATED_REG = SIZE_MAX; + const auto num_parameters = f->get_parameters().size(); + const auto num_results = f->get_results().size(); + auto accumulator_reg = 0lu; + for (const auto& op : ops) { + if (const auto& param = ov::as_type_ptr(op)) { + manually_assigned_gprs[op->output(0).get_tensor_ptr()] = + static_cast(f->get_parameter_index(param)); + } else if (const auto& result = ov::as_type_ptr(op)) { + // here we use the fact that Result input & output tensors are identical by construction + manually_assigned_gprs[op->output(0).get_tensor_ptr()] = + static_cast(f->get_result_index(result) + num_parameters); + } else if (const auto& buffer = ov::as_type_ptr(op)) { + // All buffers have one common data pointer + manually_assigned_gprs[op->input(0).get_tensor_ptr()] = + static_cast(num_results + num_parameters); + manually_assigned_gprs[op->output(0).get_tensor_ptr()] = + 
static_cast(num_results + num_parameters); + } else if (ov::is_type(op) || ov::is_type(op)) { + // Only in SoftmaxDecomposition ReduceMax and ReduceSum use HorizonMax/HorizonSum and VectorBuffer. + // We should manually set the one vector register for VectorBuffer and Max/Sum output to simulate a accumulator + // TODO [96351]: We should rewrite accumulator pattern using another way + const auto input = op->get_input_node_shared_ptr(0); // input - it's accumulator math op: Add or Max + for (size_t i = 0; i < input->get_input_size(); ++i) { + if (ov::is_type(input->get_input_node_shared_ptr(i))) { + manually_assigned_vecs[input->input(i).get_tensor_ptr()] = + static_cast(accumulator_reg); + } + } + + manually_assigned_vecs[input->output(0).get_tensor_ptr()] = + static_cast(accumulator_reg); + manually_assigned_vecs[op->output(0).get_tensor_ptr()] = + static_cast(accumulator_reg); + + // If there is Broadcast, it should have the same register as Horizon op + // because it's a result of the accumulator as well + for (auto& out : op->output(0).get_target_inputs()) { + const auto child = out.get_node()->shared_from_this(); + if (ov::is_type(child)) { + manually_assigned_vecs[child->output(0).get_tensor_ptr()] = + static_cast(accumulator_reg); + } + } + accumulator_reg++; } } - - std::vector> used; - std::vector> def; - - for (const auto& op : stmts) { - std::set u; - for (const auto& input : op->inputs()) { - if (regs.count(input.get_tensor_ptr())) { - u.insert(regs[input.get_tensor_ptr()]); + auto enumerate_out_tensors = [IS_MANUALLY_ALLOCATED_REG] (const std::shared_ptr& op, + decltype(regs_vec)& reg_map, + const std::map& manually_assigned_regs, + size_t& counter) { + for (const auto& output : op->outputs()) { + const auto& t = output.get_tensor_ptr(); + // Note that some ops might have identical input&output tensors (Result and Tile* for ex.) 
+ // so we have to check that the tensor has not been enumerated already + if (reg_map.count(t) == 0) { + reg_map[t] = manually_assigned_regs.count(t) == 0 ? counter++ : IS_MANUALLY_ALLOCATED_REG; } } - used.push_back(u); - - std::set d; - if (!std::dynamic_pointer_cast(op)) { - for (const auto& output : op->outputs()) { - d.insert(regs[output.get_tensor_ptr()]); - } + }; + for (const auto& t_op : typed_ops) { + switch (t_op.first) { + case vec2vec: + case gpr2vec: + enumerate_out_tensors(t_op.second, regs_vec, manually_assigned_vecs, counter_vec); + break; + case gpr2gpr: + case vec2gpr: + enumerate_out_tensors(t_op.second, regs_gpr, manually_assigned_gprs, counter_gpr); + break; + } + } + // todo: make one for gpr and one for vector + std::vector> used_gpr(ops.size(), std::set()); // used = used as an input + std::vector> defined_gpr(ops.size(), std::set()); // defined = used as output + std::vector> used_vec(ops.size(), std::set()); + std::vector> defined_vec(ops.size(), std::set()); + + auto tensor2reg = [IS_MANUALLY_ALLOCATED_REG] (const std::vector& tensors, const std::map& reg_map) { + std::set result; + for (const auto& t : tensors) { + if (reg_map.count(t) == 0) + throw ngraph::ngraph_error("Assign registers: attempt to access not enumerated tensor"); + Reg reg_id = reg_map.at(t); + if (reg_id != IS_MANUALLY_ALLOCATED_REG) + result.insert(reg_id); + } + return result; + }; + for (size_t i = 0; i < typed_ops.size(); i++) { + const auto& t_op = typed_ops[i]; + std::vector used_tensors, defined_tensors; + for (const auto& in : t_op.second->inputs()) + used_tensors.push_back(in.get_tensor_ptr()); + for (const auto& out : t_op.second->outputs()) + defined_tensors.push_back(out.get_tensor_ptr()); + switch (t_op.first) { + case vec2vec: + used_vec[i] = tensor2reg(used_tensors, regs_vec); + defined_vec[i] = tensor2reg(defined_tensors, regs_vec); + break; + case gpr2gpr: + used_gpr[i] = tensor2reg(used_tensors, regs_gpr); + defined_gpr[i] = 
tensor2reg(defined_tensors, regs_gpr); + break; + case gpr2vec: + used_gpr[i] = tensor2reg(used_tensors, regs_gpr); + defined_vec[i] = tensor2reg(defined_tensors, regs_vec); + break; + case vec2gpr: + used_vec[i] = tensor2reg(used_tensors, regs_vec); + defined_gpr[i] = tensor2reg(defined_tensors, regs_gpr); + break; } - def.push_back(d); } // define life intervals - std::vector> lifeIn(stmts.size(), std::set()); - std::vector> lifeOut(stmts.size(), std::set()); + // liveOut[i] - regs that are live on exit from i-th (topologically ordered) operation + // liveIn[i] - regs that are live on entering the i-th (topologically ordered) operation + std::vector> life_in_vec(std::move(used_vec)); + std::vector> life_out_vec(typed_ops.size(), std::set()); + std::vector> life_in_gpr(std::move(used_gpr)); + std::vector> life_out_gpr(typed_ops.size(), std::set()); - for (size_t i = 0; i < stmts.size(); i++) { - for (size_t n = 0; n < stmts.size(); n++) { - std::set_difference(lifeOut[n].begin(), lifeOut[n].end(), def[n].begin(), def[n].end(), std::inserter(lifeIn[n], lifeIn[n].begin())); - lifeIn[n].insert(used[n].begin(), used[n].end()); + // todo: this part if O(N*N), so it's slow for large subgraphs. Can we simplify it? 
At least add an early stopping criteria + for (size_t i = 0; i < typed_ops.size(); i++) { + for (size_t n = 0; n < typed_ops.size(); n++) { + // Regs that are live on entering the operation = regs used by the op + (all other regs alive - regs defined by the op) + // copy regs from lifeOut to lifeIn while ignoring regs in def + std::set_difference(life_out_gpr[n].begin(), life_out_gpr[n].end(), + defined_gpr[n].begin(), defined_gpr[n].end(), + std::inserter(life_in_gpr[n], life_in_gpr[n].begin())); + std::set_difference(life_out_vec[n].begin(), life_out_vec[n].end(), + defined_vec[n].begin(), defined_vec[n].end(), + std::inserter(life_in_vec[n], life_in_vec[n].begin())); } - for (size_t n = 0; n < stmts.size(); n++) { - auto node = stmts[n]; - if (!std::dynamic_pointer_cast(node)) { - for (const auto& out : node->outputs()) { - for (const auto& port : out.get_target_inputs()) { - auto pos = std::find(stmts.begin(), stmts.end(), port.get_node()->shared_from_this()); - if (pos != stmts.end()) { - auto k = pos-stmts.begin(); - lifeOut[n].insert(lifeIn[k].begin(), lifeIn[k].end()); - } + for (size_t n = 0; n < typed_ops.size(); n++) { + auto op = typed_ops[n].second; + for (const auto& out : op->outputs()) { + for (const auto& port : out.get_target_inputs()) { + auto k = std::find(ops.begin(), ops.end(), port.get_node()->shared_from_this()) - ops.begin(); + if (k == ops.size()) + throw ngraph_error("assign registers can't find target op in the body"); + switch (typed_ops[k].first) { + case vec2vec: + case vec2gpr: + life_out_vec[n].insert(life_in_vec[k].begin(), life_in_vec[k].end()); + break; + case gpr2gpr: + case gpr2vec: + life_out_gpr[n].insert(life_in_gpr[k].begin(), life_in_gpr[k].end()); + break; } } } } } - struct by_starting { auto operator()(const std::pair& lhs, const std::pair& rhs) const -> bool { return lhs.first < rhs.first|| (lhs.first == rhs.first && lhs.second < rhs.second); @@ -88,13 +215,15 @@ bool 
ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr return lhs.second < rhs.second || (lhs.second == rhs.second && lhs.first < rhs.first); } }; + // A variable live interval - is a range (start, stop) of op indexes, such that + // the variable is alive within this range (defined but not used by the last user) + std::map, Reg, by_starting> live_intervals_vec, live_intervals_gpr; - std::set, by_starting> live_intervals; - - std::reverse(lifeIn.begin(), lifeIn.end()); - auto find_last_use = [lifeIn](int i) -> int { - int ln = static_cast(lifeIn.size()) - 1; - for (auto& x : lifeIn) { + std::reverse(life_in_vec.begin(), life_in_vec.end()); + std::reverse(life_in_gpr.begin(), life_in_gpr.end()); + auto find_last_use = [](decltype(life_in_gpr) life_in, int i) -> int { + int ln = static_cast(life_in.size()) - 1; + for (auto& x : life_in) { if (x.find(i) != x.end()) { return ln; } @@ -102,67 +231,86 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr } return i; }; - - for (size_t i = 0; i < stmts.size(); i++) { - live_intervals.insert(std::make_pair(static_cast(i), find_last_use(static_cast(i)))); + for (int i = 0; i < static_cast(typed_ops.size()); i++) { + for (const auto& def : defined_vec[i]) + live_intervals_vec[std::make_pair(i, find_last_use(life_in_vec, static_cast(def)))] = def; + for (const auto& def : defined_gpr[i]) + live_intervals_gpr[std::make_pair(i, find_last_use(life_in_gpr, static_cast(def)))] = def; } - // http://web.cs.ucla.edu/~palsberg/course/cs132/linearscan.pdf - std::multiset, by_ending> active; - std::map register_map; - std::stack bank; - for (int i = 0; i < 16; i++) bank.push(16-1-i); + auto linescan_assign_registers = [](const decltype(live_intervals_vec)& live_intervals, + const std::set& reg_pool) { + // http://web.cs.ucla.edu/~palsberg/course/cs132/linearscan.pdf + // todo: do we need multimap? <=> can an op have two inputs from the same op? 
+ std::map, Reg, by_ending> active; + // uniquely defined register => reused reg (reduced subset enabled by reg by reusage) + std::map register_map; + std::stack bank; + // regs are stored in ascending order in reg_pool, so walk in reverse to assign them the same way + for (auto rit = reg_pool.crbegin(); rit != reg_pool.crend(); rit++) + bank.push(*rit); - for (auto interval : live_intervals) { - // check expired - while (!active.empty()) { - auto x = *active.begin(); - if (x.second >= interval.first) { - break; + std::pair interval, active_interval; + Reg unique_reg, active_unique_reg; + for (const auto& interval_reg : live_intervals) { + std::tie(interval, unique_reg) = interval_reg; + // check expired + while (!active.empty()) { + std::tie(active_interval, active_unique_reg) = *active.begin(); + // if end of active interval has not passed yet => stop removing actives since they are sorted by end + if (active_interval.second >= interval.first) { + break; + } + active.erase(active_interval); + bank.push(register_map[active_unique_reg]); } - active.erase(x); - bank.push(register_map[x.first]); - } - // allocate - if (active.size() == 16) { - throw ngraph_error("caanot allocate registers for a snippet "); - } else { - register_map[interval.first] = bank.top(); - bank.pop(); - active.insert(interval); - } - } - - std::map, Reg> physical_regs; - - for (const auto& reg : regs) { - physical_regs[reg.first] = register_map[reg.second]; - } - const auto num_parameters = f->get_parameters().size(); - for (const auto& n : f->get_ordered_ops()) { - auto& rt = n->get_rt_info(); - std::vector regs; - regs.reserve(n->outputs().size()); - /* The main idea here is that each operation stores its output regs in rt["reginfo"]. Input and output regs are - * then derived by parsing node's and parent's rt["reginfo"], look into ngraph::snippets::getRegisters for details. 
- * Note also that Parameter and Result store general-purpose register index, because they work with memory - * (memory pointer is stored in gpr). All other "regular" ops store vector regs indexes, since calculations are - * performed on registers. - */ - if (is_type(n)) { - continue; - } else if (const auto& param = ov::as_type_ptr(n)) { - regs.push_back(f->get_parameter_index(param)); - } else if (const auto& store = ov::as_type_ptr(n)) { - regs.push_back(f->get_result_index(store) + num_parameters); - } else { - for (const auto& output : n->outputs()) { - auto allocated = physical_regs[output.get_tensor_ptr()]; - regs.push_back(allocated); + // allocate + if (active.size() == reg_pool.size()) { + // todo: if it is LoopBegin or LoopEnd that requires gpr, and we don't have any in the pool, + // then assign SIZE_MAX-1 as a flag to spill a reg inside emitter + throw ngraph::ngraph_error("can't allocate registers for a snippet "); + } else { + register_map[unique_reg] = bank.top(); + bank.pop(); + active.insert(interval_reg); } } - rt["reginfo"] = regs; - } + return register_map; + }; + // todo: vec_/gpr_pool are hardware-specific and should be provided by a backend, e.g. 
overloaded generator + std::set vec_pool; + for (Reg i = 0; i < reg_count; i++) + vec_pool.insert(i); + std::set gpr_pool(vec_pool); + for (const auto& t_reg : manually_assigned_vecs) + vec_pool.erase(t_reg.second); + for (const auto& t_reg : manually_assigned_gprs) + gpr_pool.erase(t_reg.second); + auto unique2reused_map_vec = linescan_assign_registers(live_intervals_vec, vec_pool); + auto unique2reused_map_gpr = linescan_assign_registers(live_intervals_gpr, gpr_pool); + std::map assigned_regs(std::move(manually_assigned_gprs)); + assigned_regs.insert(manually_assigned_vecs.begin(), manually_assigned_vecs.end()); + auto register_assigned_regs = [IS_MANUALLY_ALLOCATED_REG, &assigned_regs](const std::map& unique_regs, + const std::map& unique2reused) { + for (const auto& reg : unique_regs) { + if (reg.second == IS_MANUALLY_ALLOCATED_REG) + continue; + if (unique2reused.count(reg.second) == 0) + throw ngraph::ngraph_error("Assign registers failed to allocate register for a tensor"); + assigned_regs[reg.first] = unique2reused.at(reg.second); + } + }; + register_assigned_regs(regs_vec, unique2reused_map_vec); + register_assigned_regs(regs_gpr, unique2reused_map_gpr); + + for (const auto& t_op : typed_ops) { + for (const auto& out : t_op.second->outputs()) { + const auto& t = out.get_tensor_ptr(); + auto& rt = t->get_rt_info(); + rt["reginfo"] = static_cast(assigned_regs[t]); + } + } return false; } + diff --git a/src/common/snippets/src/pass/broadcast_to_movebroadcast.cpp b/src/common/snippets/src/pass/broadcast_to_movebroadcast.cpp new file mode 100644 index 00000000000..d6e16633ba8 --- /dev/null +++ b/src/common/snippets/src/pass/broadcast_to_movebroadcast.cpp @@ -0,0 +1,49 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/pass/broadcast_to_movebroadcast.hpp" +#include "snippets/pass/insert_movebroadcast.hpp" +#include + +#include +#include + +using namespace ngraph; + 
+ngraph::snippets::pass::BroadcastToMoveBroadcast::BroadcastToMoveBroadcast() { + MATCHER_SCOPE(BroadcastToMoveBroadcast); + + auto m_broadcast = ngraph::pattern::wrap_type(); + + auto callback = [this](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::BroadcastToMoveBroadcast") + auto root = m.get_match_root(); + if (auto broadcast_v1 = ov::as_type_ptr(root)) { + if (broadcast_v1->get_broadcast_spec().m_type != ov::op::AutoBroadcastType::NUMPY) + return false; + } else if (auto broadcast_v3 = ov::as_type_ptr(root)) { + if (broadcast_v3->get_broadcast_spec().m_type != ov::op::BroadcastType::NUMPY) + return false; + } + + const auto target_shape = root->get_output_partial_shape(0); + const auto value_shape = root->get_input_partial_shape(0); + if (target_shape.is_dynamic() || value_shape.is_dynamic()) { + return false; + } + + const auto broadcast_node = ngraph::snippets::pass::InsertMoveBroadcast::BroadcastNodeLastDim(root->input_value(0), + target_shape.get_shape(), + value_shape.get_shape()); + replace_output_update_name(root->output(0), broadcast_node); + ngraph::copy_runtime_info(root, broadcast_node.get_node_shared_ptr()); + + return true; + }; + + auto m = std::make_shared(m_broadcast, matcher_name); + register_matcher(m, callback); +} diff --git a/src/common/snippets/src/pass/collapse_subgraph.cpp b/src/common/snippets/src/pass/collapse_subgraph.cpp index 0e7f7e1a402..b348ccb85e9 100644 --- a/src/common/snippets/src/pass/collapse_subgraph.cpp +++ b/src/common/snippets/src/pass/collapse_subgraph.cpp @@ -6,6 +6,9 @@ #include #include "snippets/pass/collapse_subgraph.hpp" +#include "snippets/pass/tokenization.hpp" +#include "snippets/pass/transpose_decomposition.hpp" +#include "snippets/pass/fuse_transpose_brgemm.hpp" #include "snippets/op/subgraph.hpp" #include "snippets/utils.hpp" @@ -14,11 +17,11 @@ #include #include #include "transformations/utils/utils.hpp" +#include 
"ngraph/op/util/attr_types.hpp" #include #include #include -#include #include #include #include @@ -32,33 +35,38 @@ namespace pass { namespace { auto outputs_are_not_broadcastable(const std::shared_ptr& node) -> bool { - auto outputs = node->outputs(); - auto find_smallest_output_shape = [](const std::vector>& outputs) -> Shape { - return std::accumulate(std::begin(outputs), std::end(outputs), ngraph::Shape(outputs.begin()->get_shape()), - [](Shape& other_shape, const Output& output){ - return shape_size(output.get_shape()) < shape_size(other_shape) ? output.get_shape() : other_shape; - }); - }; - auto ref_shape = find_smallest_output_shape(outputs); - - auto check_shapes_broadcastable = [ref_shape](const Output& output) -> bool { - auto other_shape = output.get_shape(); - - if (other_shape.size() != ref_shape.size()) { - return false; - } - - return std::inner_product(std::begin(other_shape), std::end(other_shape), std::begin(ref_shape), true, - std::logical_and(), [](Shape::value_type lsh, Shape::value_type rsh){ - return rsh == 1 || lsh == rsh; - }); - }; - - return std::find_if_not(std::begin(outputs), std::end(outputs), check_shapes_broadcastable) != std::end(outputs); + const auto& outputs = node->outputs(); + if (outputs.size() <= 1) + return false; + ov::PartialShape ref_shape = outputs.front().get_partial_shape(); + bool success = true; + for (int i = 1; i < outputs.size() && success; i++) { + success &= ov::PartialShape::broadcast_merge_into(ref_shape, outputs[i].get_partial_shape(), ov::op::AutoBroadcastType::NUMPY); + } + return !success; } auto is_supported_op(const std::shared_ptr &n) -> bool { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::is_supported_op") + auto is_supported_matmul = [](const std::shared_ptr& n) -> bool { + const auto& matmul = is_type(n); + const auto& out_shape = n->get_output_partial_shape(0); + return matmul && out_shape.is_static() && out_shape.size() == 4; + }; + auto is_supported_transpose = 
[](const std::shared_ptr& n) -> bool { + const auto& transpose = as_type_ptr(n); + const auto& out_shape = n->get_output_partial_shape(0); + if (transpose && out_shape.is_static()) { + const auto& order = as_type_ptr(n->get_input_node_shared_ptr(1)); + if (order) { + const auto order_value = order->cast_vector(); + return TransposeDecomposition::supported_cases.count(order_value) != 0 || + FuseTransposeBrgemm::supported_cases.count(order_value) != 0; + } + } + return false; + }; + auto is_supported_fq_op = [](const std::shared_ptr& n) -> bool { // TODO [92179]: Add support of FakeQuantize with non-constants inputs and with binarization algorithm. const auto fq = ov::as_type_ptr(n); @@ -69,6 +77,10 @@ auto is_supported_op(const std::shared_ptr &n) -> bool { is_type(n->get_input_node_shared_ptr(4)); }; + auto is_supported_ternary_eltwise_op = [](const std::shared_ptr &n) -> bool { + return ov::is_type(n); + }; + auto is_supported_binary_eltwise_op = [](const std::shared_ptr &n) -> bool { return ov::is_type(n) || ov::is_type(n) @@ -114,14 +126,51 @@ auto is_supported_op(const std::shared_ptr &n) -> bool { || ov::is_type(n) || ov::is_type(n); }; - return is_supported_fq_op(n) || is_supported_unary_eltwise_op(n) || is_supported_binary_eltwise_op(n); + + auto is_supported_softmax = [](const std::shared_ptr &n) -> bool { + if (n->get_input_size() != 1 || n->get_input_partial_shape(0).rank().is_dynamic()) + return false; + int64_t axis = -1; + const auto rank = n->get_input_partial_shape(0).rank(); + if (const auto softmax_v8 = ngraph::as_type_ptr(n)) { + axis = ngraph::normalize_axis(n->get_friendly_name(), softmax_v8->get_axis(), rank); + } else if (const auto softmax_v1 = ngraph::as_type_ptr(n)) { + axis = softmax_v1->get_axis(); + } else { + return false; + } + return axis >= 0 && axis == (rank.get_length() - 1); + }; + + auto is_supported_broadcast_op = [](const std::shared_ptr &n) -> bool { + // Broadcast is supported only for MHA tokenization where there are needed 
and special checks + if (auto broadcast_v1 = ov::as_type_ptr(n)) { + return broadcast_v1->get_broadcast_spec().m_type == ov::op::AutoBroadcastType::NUMPY; + } else if (auto broadcast_v3 = ov::as_type_ptr(n)) { + return broadcast_v3->get_broadcast_spec().m_type == ov::op::BroadcastType::NUMPY; + } + return false; + }; + + return is_supported_fq_op(n) || + is_supported_unary_eltwise_op(n) || + is_supported_binary_eltwise_op(n) || + is_supported_ternary_eltwise_op(n) || + is_supported_transpose(n) || + is_supported_softmax(n) || + is_supported_matmul(n) || + is_supported_broadcast_op(n); } auto has_supported_in_out(const std::shared_ptr &n) -> bool { - auto supported = [](descriptor::Tensor& t) -> bool { - static const std::set supported_data_types = - { ngraph::element::f32, ngraph::element::bf16, ngraph::element::i8, ngraph::element::u8 }; - return t.get_partial_shape().is_static() && supported_data_types.count(t.get_element_type()) != 0; + auto supported = [&n](descriptor::Tensor& t) -> bool { + // Todo: int32 isn't supported in general because i32 emitters are required for bit-exact i32 calculations in some cases + // So i32 is supported exclusively for transposes and broadcast + return t.get_partial_shape().is_static() && + (TokenizeSnippets::supported_element_types.count(t.get_element_type()) != 0 || + (t.get_element_type() == ngraph::element::i32 && + (ov::is_type(n) || + ov::is_type(n)))); }; const auto & inputs = n->inputs(); const auto & outputs = n->outputs(); @@ -155,65 +204,15 @@ auto get_num_result_children(const std::shared_ptr &node) -> size_t } return result; } -// Need to update tensor name manually, since intel_cpu::Graph::Replicate() looks at input.get_tensor().get_name(); -// If subgraph->get_output_size() == 1, then the name will be restored correctly from the node name -auto update_out_tensor_name(std::shared_ptr &subgraph) -> void { - bool not_set = true; - for (unsigned int i = 0; i < subgraph->get_output_size() && not_set; i++) { - for (const 
auto &in : subgraph->get_output_target_inputs(i)) { - if (ov::is_type(in.get_node())) { - const auto& body_result = subgraph->body_ptr()->get_output_op(i); - const auto& body_result_input = body_result->get_input_source_output(0); - op::Subgraph::fill_empty_output_names(subgraph->output(i), body_result_input); - not_set = false; - break; - } - } - } -} } // namespace -bool AppropriateForSubgraph(const std::shared_ptr &node) { +const std::set ngraph::snippets::pass::TokenizeSnippets::supported_element_types = + { ngraph::element::f32, ngraph::element::bf16, ngraph::element::i8, ngraph::element::u8 }; + +bool TokenizeSnippets::AppropriateForSubgraph(const std::shared_ptr &node) { return is_supported_op(node) && has_supported_in_out(node) && node->get_control_dependencies().empty(); } -void SetSnippetsNodeType(const std::shared_ptr &node, SnippetsNodeType nodeType) { - auto &rt = node->get_rt_info(); - rt["SnippetsNodeType"] = nodeType; -} - -SnippetsNodeType GetSnippetsNodeType(const std::shared_ptr &node) { - OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::GetSnippetsNodeType") - auto &rt = node->get_rt_info(); - const auto rinfo = rt.find("SnippetsNodeType"); - if (rinfo == rt.end()) - return SnippetsNodeType::NotSet; - return rinfo->second.as(); -} - -void SetTopologicalOrder(const std::shared_ptr &node, int64_t order) { - OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::SetTopologicalOrder") - auto &rt = node->get_rt_info(); - rt["TopologicalOrder"] = order; -} - -int64_t GetTopologicalOrder(const std::shared_ptr &node) { - auto &rt = node->get_rt_info(); - const auto rinfo = rt.find("TopologicalOrder"); - if (rinfo == rt.end()) - throw ngraph_error("Topological order is required, but not set."); - return rinfo->second.as(); -} - -bool EnumerateNodes::run_on_model(const std::shared_ptr &m) { - OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::EnumerateNodes") - int64_t order = 0; 
- // Todo: We don't really have to set order for every node, just for subgraph parents and children would be enough - for (auto &node : m->get_ordered_ops()) { - SetTopologicalOrder(node, order++); - } - return true; -} TokenizeSnippets::TokenizeSnippets() { MATCHER_SCOPE(TokenizeSnippets); enum continuation_strategy { @@ -224,7 +223,12 @@ TokenizeSnippets::TokenizeSnippets() { continuation_strategy strategy = continuation_strategy::reset; auto label = std::make_shared(pattern::any_input(), [](const std::shared_ptr &n) { - return GetSnippetsNodeType(n) != SnippetsNodeType::SkippedByPlugin && AppropriateForSubgraph(n); + // todo: MatMul and Transpose ops are always skipped by the SnippetsMarkSkipped pass. + // This is a temporary solution. Either modify SnippetsMarkSkipped + // or align this with the custom MHA tokenization pass. + return (GetSnippetsNodeType(n) != SnippetsNodeType::SkippedByPlugin || + ov::is_type(n) || ov::is_type(n)) + && AppropriateForSubgraph(n); }); ngraph::graph_rewrite_callback callback = [&, strategy](ngraph::pattern::Matcher &m) -> bool { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::CreateSubgraph_callback") @@ -248,7 +252,7 @@ TokenizeSnippets::TokenizeSnippets() { auto subgraph = op::Subgraph::wrap_node_as_subgraph(node); subgraph->get_rt_info()["originalLayersNames"] = getFusedNames(node) + node->get_friendly_name(); ngraph::replace_node(node, subgraph); - update_out_tensor_name(subgraph); + op::update_out_tensor_name(subgraph); }; auto abort_with_strategy = [&](const std::string& message_reset, @@ -456,10 +460,15 @@ TokenizeSnippets::TokenizeSnippets() { // Result op has a single input internal_inputs.push_back(source_result->input_value(0)); } else { - // We have to save explicitly FQ Constants to call ConstantFolding after Tokenization. 
- // After ConstantFolding we will move remaining non-scalar Constants from body using ConvertConstantsToParameters pass - if ((utils::is_scalar_constant(input_node)) || - (ov::is_type(input_node) && ov::is_type(node))) { + // We need some non-scalar constants inside Subgraph in the following cases: + // [*] We have to save explicitly FQ Constants to call ConstantFolding after Tokenization. + // After ConstantFolding we will move remaining non-scalar Constants from body using ConvertConstantsToParameters pass + // [*] We support Transpose with second Constant input (represents order). This Constant will not be scheduled + // and will only be used to decompose Transpose into a proper Load, Store and Loop combination. + if (ov::is_type(input_node) && + (ngraph::shape_size(input_value.get_shape()) == 1 || + ov::is_type(node) || + op::Subgraph::constant_input_should_be_inside_body(node))) { internal_inputs.push_back(input_node->output(0)); } else { external_inputs.push_back(input_value); @@ -489,18 +498,24 @@ TokenizeSnippets::TokenizeSnippets() { // than the actual number of Constants during tokenization. // To avoid unsupported number of non-scalar Constants in the future (plugin specific limitation) // we should calculate potentional number of non-scalar Constants that will be moved up from body. 
- size_t hidden_non_scalar_constant_count = 0; + size_t hidden_data_count = 0; + bool need_buffer = false; if (const auto fq_node = ov::as_type_ptr(node)) { - hidden_non_scalar_constant_count += ngraph::snippets::utils::get_non_scalar_constant_count_for_fq(fq_node); + hidden_data_count += ngraph::snippets::utils::get_non_scalar_constant_count_for_fq(fq_node); + // Ops require a Buffer + } else if (ov::is_type(node) || + ov::is_type(node)) { + need_buffer |= true; } ResultVector body_results; std::vector>> subgraph_result_inputs; for (auto subgraph : input_subgraphs) { - // we should summurize non-scalar Constants count from all input subgraphs - // because we will collapse them with our node and we should get total count of non-scalar Constants - hidden_non_scalar_constant_count += ov::as_type_ptr(subgraph)->get_non_scalar_constants_count(); + // we should summurize additional needed data count (non-scalar Constants and Buffers) from all input subgraphs + // because we will collapse them with our node and we should get total count + hidden_data_count += ov::as_type_ptr(subgraph)->get_virtual_port_count(); + need_buffer |= ov::as_type_ptr(subgraph)->is_buffer_needed(); for (auto output : subgraph->outputs()) { bool first_side_consumer = true; @@ -541,13 +556,13 @@ TokenizeSnippets::TokenizeSnippets() { } // todo: move this plugin-specific constraint to the plugin callback - if (body_parameters.size() + body_results.size() + hidden_non_scalar_constant_count > 12) { + if (body_parameters.size() + body_results.size() + hidden_data_count + static_cast(need_buffer) > 12) { const std::string message_reset = "new subgraph is created. 
Impossible to schedule subgraph with " + std::to_string(body_parameters.size()) + " inputs, " + std::to_string(body_results.size()) + " outputs and " + - std::to_string(hidden_non_scalar_constant_count) + " non-scalar constants."; + std::to_string(hidden_data_count) + " non-scalar constants and " + std::to_string(need_buffer) + " buffers."; const std::string message_abort = "failed to continue subgraph. Impossible to schedule subgraph with " + std::to_string(body_parameters.size()) + " inputs, " + std::to_string(body_results.size()) + " outputs and " + - std::to_string(hidden_non_scalar_constant_count) + " non-scalar constants."; + std::to_string(hidden_data_count) + " non-scalar constants and " + std::to_string(need_buffer) + " buffers."; return abort_with_strategy(message_reset, message_abort); } @@ -557,7 +572,7 @@ TokenizeSnippets::TokenizeSnippets() { } auto subgraph = op::build_subgraph(node, external_inputs, body, subgraph_name); copy_runtime_info(replaced_nodes, subgraph); - const auto & act_body = subgraph->body(); + const auto& act_body = subgraph->body(); for (size_t i = 0; i < act_body.get_parameters().size(); i++) { act_body.get_parameters()[i]->set_friendly_name(body_parameters[i]->get_friendly_name()); } @@ -574,16 +589,17 @@ TokenizeSnippets::TokenizeSnippets() { target_input.replace_source_output(subgraph->output(i)); } } - update_out_tensor_name(subgraph); + op::update_out_tensor_name(subgraph); subgraph->validate_and_infer_types(); - const auto & act_body1 = subgraph->body(); + const auto& act_body1 = subgraph->body(); for (size_t i = 0; i < act_body1.get_parameters().size(); i++) { act_body1.get_parameters()[i]->set_friendly_name(body_parameters[i]->get_friendly_name()); } subgraph->get_rt_info()["originalLayersNames"] = fusedNames; - subgraph->set_non_scalar_constants_count(hidden_non_scalar_constant_count); + subgraph->set_virtual_port_count(hidden_data_count); + subgraph->set_buffer_needed(need_buffer); remark(1) << "Replacement (merge) done 
for: " << subgraph->get_friendly_name() diff --git a/src/common/snippets/src/pass/common_optimizations.cpp b/src/common/snippets/src/pass/common_optimizations.cpp index b94f32af075..787fb8f650d 100644 --- a/src/common/snippets/src/pass/common_optimizations.cpp +++ b/src/common/snippets/src/pass/common_optimizations.cpp @@ -11,7 +11,10 @@ #include "transformations/utils/utils.hpp" #include "snippets/pass/fq_decomposition.hpp" +#include "snippets/pass/softmax_reshape_elimination.hpp" +#include "snippets/pass/explicit_transpose_matmul_inputs.hpp" #include "snippets/op/subgraph.hpp" +#include "snippets/utils.hpp" #include "snippets/itt.hpp" NGRAPH_RTTI_DEFINITION(ngraph::snippets::pass::CommonOptimizations, "Snippets::CommonOptimizations", 0); @@ -31,7 +34,11 @@ void ConvertConstantsToParameters(const std::shared_ptrget_ops()) { auto constant = ov::as_type_ptr(op); - if (!(constant && ngraph::shape_size(constant->get_shape()) != 1ul)) + if (!constant || ngraph::shape_size(constant->get_shape()) == 1ul) + continue; + + const auto child = constant->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); + if (op::Subgraph::constant_input_should_be_inside_body(child)) continue; auto parameter = std::make_shared(constant->get_element_type(), constant->output(0).get_partial_shape()); @@ -67,9 +74,11 @@ CommonOptimizations::CommonOptimizations() { // Then if Subgraph contains FakeQuantize we enable specific transformation for quantized subgraphs. 
ngraph::pass::Manager manager; manager.register_pass(); + manager.register_pass(); if (is_quantized) { manager.register_pass(); } + manager.register_pass(); manager.run_passes(body); // At the moment only non-scalar Constants of FakeQuantize can be inside Subgraph diff --git a/src/common/snippets/src/pass/convert_constants.cpp b/src/common/snippets/src/pass/convert_constants.cpp index 3cb791d0130..37cf0f85266 100644 --- a/src/common/snippets/src/pass/convert_constants.cpp +++ b/src/common/snippets/src/pass/convert_constants.cpp @@ -20,11 +20,16 @@ ngraph::snippets::pass::ConvertConstantsToScalars::ConvertConstantsToScalars() { ngraph::graph_rewrite_callback callback = [this](ngraph::pattern::Matcher &m) { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ConvertConstantsToScalars") auto constant = as_type_ptr(m.get_match_root()); - auto scalar = std::make_shared(*constant); + if (ov::shape_size(constant->get_output_shape(0)) != 1) + return false; + // Note that all Constants {1,1,1,1} are converted to Scalar {1} here + // This is needed to simplify shape inference, otherwise {1,1,1,1} Constants can increase output rank + // Also some operations support only scalar shapes, so we need separate scalars and shape [1] + const auto shape = constant->get_output_shape(0).size() == 0 ? 
ov::Shape{} : ov::Shape{1}; + auto scalar = std::make_shared(ov::op::v0::Constant(*constant, shape)); scalar->set_friendly_name(constant->get_friendly_name()); ngraph::copy_runtime_info(constant, scalar); ngraph::replace_node(constant, scalar); - return true; }; register_matcher(std::make_shared(constants), callback); diff --git a/src/common/snippets/src/pass/explicit_transpose_matmul_inputs.cpp b/src/common/snippets/src/pass/explicit_transpose_matmul_inputs.cpp new file mode 100644 index 00000000000..07e0045d880 --- /dev/null +++ b/src/common/snippets/src/pass/explicit_transpose_matmul_inputs.cpp @@ -0,0 +1,83 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/pass/explicit_transpose_matmul_inputs.hpp" +#include "snippets/pass/transpose_decomposition.hpp" +#include "snippets/op/subgraph.hpp" + +#include +#include +#include + + + +ngraph::snippets::pass::ExplicitTransposeMatMulInputs::ExplicitTransposeMatMulInputs() { + MATCHER_SCOPE(ExplicitTransposeMatMulInputs); + + auto m_matmul0 = std::make_shared( + ngraph::pattern::any_input(ngraph::pattern::has_static_shape()), + ngraph::pattern::any_input(ngraph::pattern::has_static_shape())); + + register_matcher(std::make_shared(m_matmul0, matcher_name), + [=](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ExplicitTransposeMatMulInputs") + auto root = m.get_match_root(); + bool rewritten = false; + + auto matmul0 = ngraph::as_type_ptr(root); + if (!matmul0) + return false; + + for (size_t i = 0; i < matmul0->get_input_size(); i++) { + if (i == 0 && !matmul0->get_transpose_a()) + continue; + if (i == 1 && !matmul0->get_transpose_b()) + continue; + + auto parent1 = matmul0->get_input_node_shared_ptr(i); + auto transpose1 = ngraph::as_type_ptr(parent1); + while (!transpose1 && !ov::is_type(parent1)) { + // We can set supported order and transposed_b(false) only if ops have scalar 
shapes to avoid shape mismatching + const auto parent_count = parent1->inputs().size(); + bool are_weights_scalar = true; + for (size_t j = 1; j < parent_count; ++j) { + are_weights_scalar = are_weights_scalar && ngraph::shape_size(parent1->get_input_shape(j)) == 1; + } + if (!are_weights_scalar) + break; + + parent1 = parent1->get_input_node_shared_ptr(0); + transpose1 = ngraph::as_type_ptr(parent1); + } + if (!transpose1) + continue; + + const auto transpose_pattern = ngraph::as_type_ptr(transpose1->get_input_node_shared_ptr(1)); + if (!transpose_pattern) + continue; + + auto transposed_order = transpose_pattern->cast_vector(); + std::swap(*transposed_order.rbegin(), *(transposed_order.rbegin() + 1)); + if (pass::TransposeDecomposition::supported_cases.count(transposed_order) == 0) + continue; + + auto new_transpose_order = std::make_shared(transpose_pattern->get_element_type(), + ngraph::Shape{4}, + transposed_order); + new_transpose_order->set_friendly_name(transpose_pattern->get_friendly_name()); + ngraph::copy_runtime_info(transpose_pattern, new_transpose_order); + transpose1->set_argument(1, new_transpose_order); + if (i == 0) { + matmul0->set_transpose_a(false); + } else { + matmul0->set_transpose_b(false); + } + rewritten |= true; + } + + return rewritten; + }); +} diff --git a/src/common/snippets/src/pass/fuse_transpose_brgemm.cpp b/src/common/snippets/src/pass/fuse_transpose_brgemm.cpp new file mode 100644 index 00000000000..73347c6475b --- /dev/null +++ b/src/common/snippets/src/pass/fuse_transpose_brgemm.cpp @@ -0,0 +1,86 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/itt.hpp" + +#include "snippets/pass/fuse_transpose_brgemm.hpp" +#include "snippets/snippets_isa.hpp" + +#include "snippets/utils.hpp" + +#include "ngraph/opsets/opset1.hpp" +#include "ngraph/rt_info.hpp" +#include "ngraph/pattern/op/wrap_type.hpp" +#include "openvino/pass/pattern/op/or.hpp" + +namespace ngraph { 
+namespace snippets { +namespace pass { +const std::set> FuseTransposeBrgemm::supported_cases = {{0, 2, 1, 3}}; +FuseTransposeBrgemm::FuseTransposeBrgemm() { + MATCHER_SCOPE(FuseTransposeBrgemm); + auto transpose_is_supported = [](const Output& transpose_port) { + const auto transpose_node = transpose_port.get_node_shared_ptr(); + // it's safe to do so because of the patterns we used. alternatively we can do it through pattern_values_map + const auto& constant = as_type_ptr(transpose_node->get_input_node_shared_ptr(1)); + // if Transpose in and out layout is not empty => something was already fused on this port + if (!utils::get_node_output_layout(transpose_node).empty() || + !utils::get_node_output_layout(transpose_node->get_input_node_shared_ptr(0)).empty()) + return false; + const auto& transpose_order = constant->cast_vector(); + // todo: this limitation is due to the fact that offsets are calculated in Kernel, and the only way + // to calc them non-default way is to set Parameter rt_info field. 
This limitation can be removed if + // the rt_info is properly propagated to the corresponding parameter + if (!is_type(transpose_node->get_input_node_shared_ptr(0)) || + supported_cases.count(transpose_order) == 0) + return false; + return true; + }; + auto constant = pattern::wrap_type(); + auto transpose = pattern::wrap_type({pattern::any_input(), constant}, transpose_is_supported); + auto transpose_matcher = std::make_shared(transpose); + auto brgemm_any = pattern::wrap_type({pattern::any_input(), pattern::any_input()}); + + auto brgemm_in0 = pattern::wrap_type({transpose, pattern::any_input()}); + auto brgemm_in1 = pattern::wrap_type({pattern::any_input(), transpose}); + auto brgemm_out0 = pattern::wrap_type({brgemm_any, constant}); + auto brgemm_or_transpose = std::make_shared(OutputVector{brgemm_in0, brgemm_in1, brgemm_out0}); + + auto callback = [=](pattern::Matcher& m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "FuseTransposeBrgemm") + auto set_layout_from_order = [](const std::shared_ptr& node, const ov::Output& port) { + const auto& const_order = as_type_ptr(node->get_input_node_shared_ptr(1)); + std::vector layout = const_order->cast_vector(); + auto& rt_info = port.get_node_shared_ptr()->get_rt_info(); + rt_info["Layout"] = layout; + }; + auto brgemm = as_type_ptr(m.get_match_root()); + // Transpose on the Brgemm's output + if (!brgemm) { + brgemm = as_type_ptr(m.get_match_root()->get_input_node_shared_ptr(0)); + const auto& brgemm_out = brgemm->output(0); + const auto& transpose_out = m.get_match_value(); + for (const auto& in : transpose_out.get_target_inputs()) + in.replace_source_output(brgemm->output(0)); + set_layout_from_order(as_type_ptr(transpose_out.get_node_shared_ptr()), brgemm_out); + } + for (int i = 0; i < brgemm->get_input_size(); i++) { + const auto& in_value = brgemm->input_value(i); + if (transpose_matcher->match(in_value)) { + const auto& transpose = as_type_ptr(in_value.get_node_shared_ptr()); + 
set_layout_from_order(transpose, transpose->input_value(0)); + brgemm->set_argument(i, transpose->input_value(0)); + } + } + // need to run validate_and_infer_types manually: either input shapes were updated or + // output Layout was updated (out shape will be updated in validate_and_infer_types()) + brgemm->validate_and_infer_types(); + return true; + }; + register_matcher(std::make_shared(brgemm_or_transpose, matcher_name), callback); +} + +} // namespace pass +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/src/pass/insert_buffer.cpp b/src/common/snippets/src/pass/insert_buffer.cpp new file mode 100644 index 00000000000..e3fdb0173ef --- /dev/null +++ b/src/common/snippets/src/pass/insert_buffer.cpp @@ -0,0 +1,96 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "snippets/remarks.hpp" + +#include "snippets/pass/insert_buffer.hpp" +#include "snippets/snippets_isa.hpp" + +#include +#include + +ngraph::snippets::pass::InsertBuffer::InsertBuffer(const int32_t allocation_rank) { + MATCHER_SCOPE(InsertBuffer); + // The list of operations that require Buffers on their Inputs and Outputs + const auto pattern = ngraph::pattern::wrap_type(); + + register_matcher(std::make_shared(pattern, matcher_name), + [this, allocation_rank](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertBuffer") + auto root = m.get_match_root(); + bool rewritten = false; + + // check if already has Buffer, Parameter or Constant as an input + for (const auto& input : root->inputs()) { + const auto input_node = input.get_source_output().get_node()->shared_from_this(); + if (!ov::is_type(input_node) && + !ov::is_type(input_node) && + !ov::is_type(input_node)) { + const auto buffer = std::make_shared(input_node, allocation_rank); + root->set_argument(input.get_index(), buffer); + rewritten |= true; + } + if 
(ov::is_type(input.get_source_output().get_node_shared_ptr()) && + input.get_source_output().get_target_inputs().size() != 1) { + throw ngraph::ngraph_error( + "If Buffer is an input for operation output, this Buffer should be a single consumer for this port"); + } + } + + // check if already has Buffer or outputs is Result + for (const auto& output : root->outputs()) { + const auto target_inputs = output.get_target_inputs(); + if (target_inputs.size() > 1) { + for (const auto& consumer : target_inputs) { + const auto output_node = consumer.get_node()->shared_from_this(); + if (ov::is_type(output_node)) { + // If some of children from one common port are different Buffers, + // we should remove them to insert one common Buffer on one common port + replace_output_update_name(output_node->output(0), output_node->input_value(0)); + } else if (ov::is_type(output_node)) { + // TODO: At this moment operation which should be wrapped by Buffers doesn't support several children where one of them is Result + // because Result and Buffer from one root port should have the same register. 
It's not supported at the moment + // For example, + // Buffer + // | + // Softmax + // / \ + // Buffer Result + throw ngraph::ngraph_error( + "Operation which is should be wrapped by Buffers has few children from one output port where one of them is Result"); + } + } + } + + const auto buffer = std::make_shared(output, allocation_rank); + for (const auto& consumer : output.get_target_inputs()) { + const auto output_node = consumer.get_node()->shared_from_this(); + if (output_node != buffer && + !ov::is_type(output_node) && + !ov::is_type(output_node)) { + consumer.replace_source_output(buffer); + rewritten |= true; + } + } + + const auto new_target_inputs = output.get_target_inputs(); + const auto has_buffer_on_output = std::any_of(new_target_inputs.begin(), new_target_inputs.end(), [](const ov::Input& consumer) { + const auto child = consumer.get_node()->shared_from_this(); + // We check for count of target inputs of Buffer output because + // we created Buffer op with root input previously for the next possible insertions + // Thus, if Buffer wasn't inserted, this op doesn't have target inputs on output + return ov::is_type(child) && child->output(0).get_target_inputs().size() > 0; + }); + if (has_buffer_on_output && new_target_inputs.size() != 1) { + throw ngraph::ngraph_error( + "If Buffer is a input for operation output, this Buffer should be a single consumer for this port"); + } + } + return rewritten; + }); +} diff --git a/src/common/snippets/src/pass/insert_load_store.cpp b/src/common/snippets/src/pass/insert_load_store.cpp index 8d60c4b7cff..707dd71375e 100644 --- a/src/common/snippets/src/pass/insert_load_store.cpp +++ b/src/common/snippets/src/pass/insert_load_store.cpp @@ -15,15 +15,23 @@ ngraph::snippets::pass::InsertLoad::InsertLoad(const size_t count) { MATCHER_SCOPE(InsertLoad); register_matcher(std::make_shared( - ngraph::pattern::wrap_type(), matcher_name), + ngraph::pattern::wrap_type(), matcher_name), [this, count](ngraph::pattern::Matcher &m) 
{ OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertLoad") auto root = m.get_match_root(); // check if already has Load as an output - for (auto output : root->outputs()) { - for (auto consumer : output.get_target_inputs()) { - if (ov::is_type(consumer.get_node())) { + for (const auto& output : root->outputs()) { + for (const auto& consumer : output.get_target_inputs()) { + // if a parameter is connected to a Load => we don't need another one + // if a parameter is connected to LoopBegin => there must be Load inside the Loop + // if a parameter is connected to MatMul => we don't need Load (read/write is encapsulated into the brgemm emitter) + // (it's the responsibility of transformation that inserted the Loops) + const auto& consumer_node = consumer.get_node(); + if (ov::is_type(consumer_node) || + ov::is_type(consumer_node) || + ov::is_type(consumer_node) || + ov::is_type(consumer_node)) { return false; } } @@ -33,8 +41,8 @@ ngraph::snippets::pass::InsertLoad::InsertLoad(const size_t count) { ngraph::copy_runtime_info(root, load); bool rewritten = false; - for (auto output : root->outputs()) { - for (auto consumer : output.get_target_inputs()) { + for (const auto& output : root->outputs()) { + for (const auto& consumer : output.get_target_inputs()) { if (consumer.get_node()->shared_from_this() != load) { consumer.replace_source_output(load); rewritten |= true; @@ -49,19 +57,23 @@ ngraph::snippets::pass::InsertLoad::InsertLoad(const size_t count) { ngraph::snippets::pass::InsertStore::InsertStore(const size_t count) { MATCHER_SCOPE(InsertStore); register_matcher(std::make_shared( - ngraph::pattern::wrap_type(), matcher_name), + ngraph::pattern::wrap_type(), matcher_name), [this, count](ngraph::pattern::Matcher &m) { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertStore") auto root = m.get_match_root(); // check if already has Store as an input - for (auto input : root->inputs()) { - if 
(ov::is_type(input.get_source_output().get_node())) { + for (const auto& input : root->inputs()) { + const auto& parent_node = input.get_source_output().get_node(); + if (ov::is_type(parent_node) || + ov::is_type(parent_node) || + ov::is_type(parent_node) || + ov::is_type(parent_node)) { return false; } } - auto store = std::make_shared (root->input_value(0), count); + auto store = std::make_shared(root->input_value(0), count); ngraph::copy_runtime_info(root, store); root->set_argument(0, store); return true; diff --git a/src/common/snippets/src/pass/insert_loops.cpp b/src/common/snippets/src/pass/insert_loops.cpp new file mode 100644 index 00000000000..f6d83bf6da7 --- /dev/null +++ b/src/common/snippets/src/pass/insert_loops.cpp @@ -0,0 +1,285 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "snippets/pass/insert_loops.hpp" +#include "snippets/pass/loop_helpers.hpp" +#include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" + +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +InsertLoops::InsertLoops(ov::PartialShape master_shape, size_t loop_depth, size_t vector_size, bool single_loop_body) + : m_master_shape(std::move(master_shape)), m_loop_depth(loop_depth), m_vector_size(vector_size), m_single_loop_body(single_loop_body) { + if (m_master_shape.size() < m_loop_depth) + throw ngraph_error("InsertLoops can't insert loops: master shape rank is too small"); +} + +std::vector InsertLoops::calculate_inner_apply_increments(const ov::PartialShape& master, + const std::vector& shapes) { + // Inner Loop applies increments if a dimension is not broadcasted + std::vector apply_increments; + apply_increments.reserve(shapes.size()); + std::transform(shapes.begin(), shapes.end(), std::back_inserter(apply_increments), + [=](const ov::PartialShape& ps) { return utils::get_inner_dim(ps) != 1 && utils::get_inner_dim(master) != 1; }); + return apply_increments; +} +std::vector 
InsertLoops::calculate_outer_apply_increments(const std::vector& shapes) { + // Outer Loop applies increments only if a corresponding lower dim was broadcasted (or all lower dims == 1) + std::vector apply_increments; + apply_increments.reserve(shapes.size()); + std::transform(shapes.begin(), shapes.end(), std::back_inserter(apply_increments), + [=](const ov::PartialShape& ps) { return utils::get_outer_dim(ps) != 1 && utils::get_inner_dim(ps) == 1; }); + return apply_increments; +} +std::vector InsertLoops::calculate_finalization_offsets(const ov::PartialShape& master, + const std::vector& shapes) { + const auto inner_work_amount = utils::get_inner_dim(master).get_length(); + std::vector inner_finalization_offsets(shapes.size(), 0); + std::transform(shapes.begin(), shapes.end(), inner_finalization_offsets.begin(), + [=](const ov::PartialShape& ps) { + return utils::get_outer_dim(ps) == 1 && utils::get_inner_dim(ps) != 1 ? -inner_work_amount : 0; + }); + return inner_finalization_offsets; +} + +void insert_loops_explicitly(const ov::NodeVector& ops, const size_t vector_size) { + ov::NodeVector body; + ov::NodeVector body_remainder; + ov::OutputVector body_parameters; + std::vector> body_results; + + // check for potential parameters for new Loop + auto add_body_parameters = [](const std::shared_ptr& op, ov::OutputVector& body_parameters) { + for (const auto& input : op->inputs()) { + auto parent = input.get_source_output().get_node_shared_ptr(); + if (ov::is_type(parent) || + ov::is_type(parent) || + ov::is_type(parent) || + ov::is_type(parent)) { + body_parameters.push_back(input.get_source_output()); + } + } + }; + + // check for potential results for new Loop + auto add_body_results = [](const std::shared_ptr& op, std::vector>& body_results) { + for (const auto& output : op->outputs()) { + for (const auto& target_input : output.get_target_inputs()) { + auto child = target_input.get_node(); + if (ov::is_type(child) || + ov::is_type(child) || + ov::is_type(child) || 
+ ov::is_type(child)) { + body_results.push_back(target_input); + } + } + } + }; + + // check for potential missing body ops for new loop + std::function& op, ov::NodeVector& body)> add_missing_body_ops; + add_missing_body_ops = [&](const std::shared_ptr& op, ov::NodeVector& body) { + if (body_remainder.size()) { + for (const auto& input : op->inputs()) { + auto parent = input.get_source_output().get_node_shared_ptr(); + auto iter = std::find(body_remainder.begin(), body_remainder.end(), parent); + if (iter != body_remainder.end()) { + *std::back_inserter(body) = std::move(*iter); + add_missing_body_ops(parent, body); + add_body_parameters(parent, body_parameters); + add_body_results(op, body_results); + } + } + } + }; + + auto wrap_body_by_loop = [&](const ov::NodeVector& body, const ov::OutputVector& body_parameters, const std::vector>& body_results) { + NGRAPH_CHECK(!body_parameters.empty(), "The count of parameters for loop should be more than zero to create loop"); + NGRAPH_CHECK(!body_results.empty(), "The count of results for loop should be more than zero to create loop"); + std::vector body_shapes; + const auto count_io = body_parameters.size() + body_results.size(); + body_shapes.reserve(count_io); + std::transform(body_parameters.begin(), body_parameters.end(), std::back_inserter(body_shapes), + [](const ov::Output& out) { return out.get_partial_shape(); }); + std::transform(body_results.begin(), body_results.end(), std::back_inserter(body_shapes), + [](const ov::Input& in) { return in.get_partial_shape(); }); + + auto body_master_shape = body_shapes.front(); + for (const auto& shape : body_shapes) { + NGRAPH_CHECK(PartialShape::broadcast_merge_into(body_master_shape, shape, ::ngraph::op::AutoBroadcastType::NUMPY), + "Loop input and output must be numpy broadcastable"); + } + const auto inner_work_amount = utils::get_inner_dim(body_master_shape).get_length(); + const auto outer_work_amount = utils::get_outer_dim(body_master_shape).get_length(); + + auto 
apply_increments = InsertLoops::calculate_inner_apply_increments(body_master_shape, body_shapes); + std::vector inner_finalization_offsets(body_shapes.size(), 0); + if (outer_work_amount > 1) { + inner_finalization_offsets = InsertLoops::calculate_finalization_offsets(body_master_shape, body_shapes); + } + + const auto& inner_loop_begin = op::insertLoopBeginAfterOutputs(body_parameters); + const auto& inner_loop_end = op::insertLoopEndBeforeInputs( + body_results, inner_loop_begin, inner_work_amount, vector_size, + apply_increments, inner_finalization_offsets); + // set internal flag to enable scalar vs vector loop optimizations + inner_loop_end->has_outer_loop = outer_work_amount > 1; + // Due to features of topological sort, some Constants (Scalars) may appear right after Parameters in + // sorted ops (so it's between Parameters and LoopBegin). Consequently, ScalarEmitters would be called + // outside the Loop, and only the first Loop iteration would yield correct data (assuming the vector reg + // assigned to scalar will get corrupted inside the loop body). To avoid such cases, we add control dependency + // on LoopBegin to guarantee that the constants are executed inside the Loop. 
+ for (const auto& n : body) { + if (auto c = std::dynamic_pointer_cast(n)) { + c->add_control_dependency(inner_loop_begin); + } + } + + if (outer_work_amount > 1) { + std::vector apply_increments = InsertLoops::calculate_outer_apply_increments(body_shapes); + std::vector outer_finalization_offsets(body_shapes.size(), 0); + const auto& outer_loop_begin = op::insertLoopBegin(body_parameters); + op::insertLoopEnd(body_results, outer_loop_begin, outer_work_amount, 1lu, + apply_increments, outer_finalization_offsets); + } + }; + + auto op_is_outside_loop = [](const std::shared_ptr& op) -> bool { + if (ov::is_type(op) || + ov::is_type(op) || + ov::is_type(op)) + return true; + auto& rt = op->get_rt_info(); + auto outside_rt = rt.find("outside_loop"); + bool is_outside = false; + // If rt info isn't set it means that op should be inside loop by default + if (outside_rt != rt.end()) { + is_outside = outside_rt->second.as(); + } + return is_outside; + }; + + for (auto iter = ops.begin(); iter < ops.end(); iter++) { + const auto op = *iter; + // Need to check whether the op should be inside or outside the loop + if (op_is_outside_loop(op)) { + continue; + } + + // If we meet loopBegin or Brgemm, it means that all previous nodes from ordered body + // should be in one body. It's like a stop signal + const auto& loop_begin = ov::as_type_ptr(op); + const auto& brgemm = ov::as_type_ptr(op); + if (loop_begin || brgemm) { + if (!body.empty()) { + if (!body_results.empty()) { + wrap_body_by_loop(body, body_parameters, body_results); + } else { + // If there aren't body results, it means that the current body ops are inputs of some subsequent operations in ordered_ops + // So this set of the current body ops is part of the future body loop. 
+ // We should save them to add in body ops in the future + std::move(body.begin(), body.end(), std::back_inserter(body_remainder)); + } + } + + // we should skip the next existing Loop body + if (loop_begin) { + const auto &loop_end = loop_begin->get_loop_end(); + iter = std::find(iter, ops.end(), loop_end); + } + + // clear loop body to create the next + body.clear(); + body_parameters.clear(); + body_results.clear(); + } else { + add_missing_body_ops(op, body); + add_body_parameters(op, body_parameters); + add_body_results(op, body_results); + + body.push_back(op); + } + } + + if (!body.empty()) { + wrap_body_by_loop(body, body_parameters, body_results); + } +} + +bool InsertLoops::run_on_model(const std::shared_ptr &model) { + RUN_ON_FUNCTION_SCOPE(InsertLoops); + if (m_master_shape.is_dynamic()) + throw ngraph_error("InsertLoops doesn't support dynamic shapes yet"); + + const auto inner_work_amount = utils::get_inner_dim(m_master_shape).get_length(); + const auto outer_work_amount = m_loop_depth == 2 ? 
utils::get_outer_dim(m_master_shape).get_length() : 1; + + auto ops = model->get_ordered_ops(); + ParameterVector commonParams = model->get_parameters(); + // Note that topological sort parses node arguments in reversed order, but results are added - in direct order + // So we need to pass the reversed results to LoopEnd to keep the original traversal order in topological sorter + const auto& orig_results = model->get_results(); + ResultVector commonResults(orig_results.rbegin(), orig_results.rend()); + std::vector ioShapes; + + const auto& body_rt_info = model->get_rt_info(); + const auto& plugin_shapes = body_rt_info.find("PluginShapesOverride"); + if (plugin_shapes == body_rt_info.end()) { + throw ngraph_error("InsertLoops requires PluginShapesOverride rt_info field"); + } else { + const auto& new_shapes = plugin_shapes->second.as>>(); + if (new_shapes.size() != commonResults.size() + commonParams.size()) + throw ngraph_error("InsertLoops got invalid number of plugin-overridden shapes"); + for (int i = 0; i < commonParams.size(); i++) + ioShapes.emplace_back(new_shapes[i]); + // reverse overridden shapes for results since commonResults are reversed with respect to model->get_parameters() + for (int i = 0; i < commonResults.size(); i++) + ioShapes.emplace_back(new_shapes[new_shapes.size() - 1 - i]); + } + + if (inner_work_amount > 0) { + if (m_single_loop_body) { + const auto apply_increments = InsertLoops::calculate_inner_apply_increments(m_master_shape, ioShapes); + std::vector inner_finalization_offsets(ioShapes.size(), 0); + if (outer_work_amount > 1) { + inner_finalization_offsets = InsertLoops::calculate_finalization_offsets(m_master_shape, ioShapes); + } + const auto& inner_loop_begin = op::insertLoopBegin(commonParams); + const auto& inner_loop_end = insertLoopEnd(commonResults, inner_loop_begin, inner_work_amount, + m_vector_size, apply_increments, inner_finalization_offsets); + // set internal flag to enable scalar vs vector loop optimizations + 
inner_loop_end->has_outer_loop = outer_work_amount > 1; + // Due to features of topological sort, some Constants (Scalars) may appear right after Parameters in + // sorted ops (so it's between Parameters and LoopBegin). Consequently, ScalarEmitters would be called + // outside the Loop, and only the first Loop iteration would yield correct data (assuming the vector reg + // assigned to scalar will get corrupted inside the loop body). To avoid such cases, we add control dependency + // on LoopBegin to guarantee that the constants are executed inside the Loop. + for (const auto& n : model->get_ordered_ops()) { + if (auto c = std::dynamic_pointer_cast(n)) + c->add_control_dependency(inner_loop_begin); + else if (n == inner_loop_begin) + break; + } + + if (outer_work_amount > 1) { + std::vector apply_increments = InsertLoops::calculate_outer_apply_increments(ioShapes); + const auto& outer_loop_begin = op::insertLoopBegin(commonParams); + op::insertLoopEnd(commonResults, outer_loop_begin, outer_work_amount, 1lu, apply_increments); + } + } else { + insert_loops_explicitly(ops, m_vector_size); + } + } + + return true; +} + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/src/pass/insert_movebroadcast.cpp b/src/common/snippets/src/pass/insert_movebroadcast.cpp index 0e237ed3219..397345cc456 100644 --- a/src/common/snippets/src/pass/insert_movebroadcast.cpp +++ b/src/common/snippets/src/pass/insert_movebroadcast.cpp @@ -7,6 +7,8 @@ #include "snippets/pass/insert_movebroadcast.hpp" #include "snippets/snippets_isa.hpp" +#include "snippets/utils.hpp" +#include #include #include @@ -17,43 +19,47 @@ using namespace ngraph; namespace { -std::shared_ptr broadcast_node_last_dim(const ngraph::Output& value, - const ov::Shape& target_shape, const ov::Shape& normalized_shape) { - std::shared_ptr broadcasted_node = value.get_node_shared_ptr(); - - if (target_shape == value.get_shape()) { - return broadcasted_node; - } - // Insert 
BroadcastMove only if the last dimension needs to be broadcasted. Higher-level dims broadcasting - // will be handled by pointer arithmetics in TileScheduler - if (*target_shape.rbegin() != *normalized_shape.rbegin()) { - ov::Shape broadcasted_shape = normalized_shape; - *broadcasted_shape.rbegin() = *target_shape.rbegin(); - broadcasted_node = std::make_shared(broadcasted_node, broadcasted_shape); - } - - return broadcasted_node; -} - - -std::pair> get_numpy_broadcast_shapes(const std::vector& input_shapes) { +std::pair> get_numpy_broadcast_partial_shapes(const std::vector& input_shapes) { ov::PartialShape target_shape = input_shapes.front(); for (auto i = 1; i < input_shapes.size(); i++) { if (!ov::PartialShape::broadcast_merge_into(target_shape, input_shapes[i], op::AutoBroadcastType::NUMPY)) throw ngraph::ngraph_error("InsertMoveBroadcast: Failed broadcast-merge input shapes"); } - std::vector normalized_shapes; + std::vector normalized_shapes; for (const auto& input : input_shapes) { - ov::Shape padded_shape{input}; + ov::PartialShape padded_shape{input}; padded_shape.insert(padded_shape.begin(), target_shape.size() - padded_shape.size(), 1); normalized_shapes.push_back(std::move(padded_shape)); } - return {target_shape.get_shape(), normalized_shapes}; + return {target_shape, normalized_shapes}; } } // namespace +ngraph::Output ngraph::snippets::pass::InsertMoveBroadcast::BroadcastNodeLastDim( + const ngraph::Output& value, const ov::PartialShape& target_shape, const ov::PartialShape& normalized_shape) { + if (target_shape == value.get_partial_shape()) { + return value; + } + + // Insert BroadcastMove only if the last dimension needs to be broadcasted. 
Higher-level dims broadcasting + // will be handled by pointer arithmetics inside outer LoopEmitter + if (*target_shape.rbegin() != *normalized_shape.rbegin()) { + ov::PartialShape broadcasted_shape = normalized_shape; + *broadcasted_shape.rbegin() = *target_shape.rbegin(); + const auto broadcast_node = std::make_shared(value, broadcasted_shape); + // BroadcastMove should be immediately executed after its input op (input op is node with output which should be broadcasted). + // For example, to execute Broadcast outside of a Loop We transfer control dependents and copy rt info + broadcast_node->add_node_control_dependents(value.get_node_shared_ptr()); + ov::copy_runtime_info(value.get_node_shared_ptr(), broadcast_node); + + return broadcast_node->output(0); + } + + return value; +} + ngraph::snippets::pass::InsertMoveBroadcast::InsertMoveBroadcast() { MATCHER_SCOPE(InsertMoveBroadcast); ngraph::graph_rewrite_callback callback = [this](ngraph::pattern::Matcher &m) { @@ -64,31 +70,35 @@ ngraph::snippets::pass::InsertMoveBroadcast::InsertMoveBroadcast() { return false; } - auto is_scalar_constant = [](const ov::Output& v){ - if (auto constant = ov::as_type_ptr(v.get_node_shared_ptr())) { - if (constant->get_shape().empty() || ngraph::shape_size(constant->get_shape()) == 1) { - return true; - } - } - return false; + auto is_ignored_node = [](const ov::Output& v){ + // We don't need to insert BroadcastMove after the following operations: + // - Scalar has emitter with explicit broadcasting + // - VectorBuffer has scalar output shape to avoid broadcast conflicts and manually shape insertion. 
+ return utils::is_scalar_constant(v.get_node_shared_ptr()) || + ov::is_type(v.get_node_shared_ptr()); }; - std::vector input_shapes; - std::vector ignore_as_scalar; + std::vector input_shapes; + std::vector is_ignored; for (const auto& val : values) { - input_shapes.emplace_back(val.get_shape()); - ignore_as_scalar.push_back(is_scalar_constant(val)); + input_shapes.emplace_back(val.get_partial_shape()); + is_ignored.push_back(is_ignored_node(val)); + // Do not insert MoveBroadcast if any of the last dims is dynamic, + // since we don't know if we really need it. In these cases, broadcasting will be performed + // by outer Loop based on runtime shapes. + if (!is_ignored.back() && !input_shapes.back().rbegin()->is_static()) + return false; } // find the output tensor's shape, then broadcast all inputs so that they are compatible with respect to the last dim - auto bcast_shapes = get_numpy_broadcast_shapes(input_shapes); + auto bcast_shapes = get_numpy_broadcast_partial_shapes(input_shapes); ngraph::OutputVector broadcasted_inputs; for (size_t i = 0; i < values.size(); ++i) { - if (ignore_as_scalar[i]) { + if (is_ignored[i]) { broadcasted_inputs.push_back(values[i]); } else { - auto node = broadcast_node_last_dim(values[i], bcast_shapes.first, bcast_shapes.second[i]); - ngraph::copy_runtime_info(root, node); + auto node = BroadcastNodeLastDim(values[i], bcast_shapes.first, bcast_shapes.second[i]); + ngraph::copy_runtime_info(root, node.get_node_shared_ptr()); broadcasted_inputs.push_back(node); } } diff --git a/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp b/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp index 5a30f2c2d5a..b4fdb2506dc 100644 --- a/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp +++ b/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp @@ -34,10 +34,10 @@ ngraph::snippets::pass::LoadMoveBroadcastToBroadcastLoad::LoadMoveBroadcastToBro return false; } - auto inshape = 
root->input(0).get_shape(); - auto outshape = root->output(0).get_shape(); + auto inshape = root->input(0).get_partial_shape(); + auto outshape = root->output(0).get_partial_shape(); - auto broadcastload = std::make_shared(param, outshape); + auto broadcastload = std::make_shared(param, outshape, ov::as_type_ptr(input)->get_offset()); ngraph::copy_runtime_info(root, broadcastload); ngraph::replace_node(root, broadcastload); diff --git a/src/common/snippets/src/pass/loop_fusion.cpp b/src/common/snippets/src/pass/loop_fusion.cpp new file mode 100644 index 00000000000..587daa79121 --- /dev/null +++ b/src/common/snippets/src/pass/loop_fusion.cpp @@ -0,0 +1,331 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include + +#include "snippets/snippets_isa.hpp" +#include "snippets/pass/loop_fusion.hpp" +#include "snippets/utils.hpp" + +namespace { +using InputSet = std::set>; +using Edge = std::pair, InputSet>; + +auto can_be_merged(const std::shared_ptr& loop_end_up, + const std::shared_ptr& loop_begin_down) -> bool { + if (!loop_end_up || !loop_begin_down) + return false; + + const auto loop_end_down = loop_begin_down->get_loop_end(); + const auto loop_begin_up = loop_end_up->get_loop_begin(); + if (loop_end_down->get_work_amount() != loop_end_up->get_work_amount() || + loop_end_down->get_increment() != loop_end_up->get_increment()) + return false; + + // If between Loops there are common dependencies (for example, reducing operations), we cannot merge these Loops + // Example, when there is HorizonMax op between Loops: + // Data + // VectorBuffer LoopBegin + // \ Load | \ + // Maximum | / + // / LoopEnd + // HorizonMax | + // \ LoopBegin + // \ Load \ + // Subtract | + // Store / + // LoopEnd + auto up_dependent_ptrs = loop_end_up->get_control_dependents(); + ov::NodeVector up_dependents(up_dependent_ptrs.size(), nullptr); + std::transform(up_dependent_ptrs.begin(), up_dependent_ptrs.end(), 
up_dependents.begin(), [](ngraph::Node* node) { return node->shared_from_this(); }); + auto down_dependencies = loop_begin_down->get_control_dependencies(); + std::sort(up_dependents.begin(), up_dependents.end()); + std::sort(down_dependencies.begin(), down_dependencies.end()); + std::vector> common_nodes; + std::set_intersection(up_dependents.begin(), up_dependents.end(), down_dependencies.begin(), down_dependencies.end(), + std::back_inserter(common_nodes)); + // TODO: Add check for sequence/subgraph of depending nodes between Loops. + // At this moment we should have full list of dependencies and dependents of Loops to find intersection, + // not just first dependent of LoopEnd and first dependency of LoopBegin + return common_nodes.size() == 0; +} + +auto get_buffer_and_loop_end(const std::shared_ptr& loop_begin_down, + std::shared_ptr& loop_end_up, + std::shared_ptr& buffer) -> bool { + size_t fusion_input_num = 0; + for (const auto& parent : loop_begin_down->input_values()) { + const auto parent_shared = parent.get_node_shared_ptr(); + if (ov::is_type(parent_shared) || + ov::is_type(parent_shared) || + ov::is_type(parent_shared)) + continue; + + // We can fuse Loops even if LoopBegin has several of the same inputs (the common Buffer/LoopEnd) + if (buffer && buffer == parent_shared || !buffer && loop_end_up && loop_end_up == parent_shared) + continue; + + loop_end_up = ngraph::as_type_ptr(parent_shared); + buffer = ov::as_type_ptr(parent_shared); + if (buffer) { + if (buffer->output(0).get_target_inputs().size() == 0 || + buffer->get_input_size() != 1 || + buffer->get_input_source_output(0).get_target_inputs().size() != 1) + return false; + + loop_end_up = ngraph::as_type_ptr(buffer->get_input_node_shared_ptr(0)); + } + if (loop_end_up) + fusion_input_num++; + } + + return fusion_input_num == 1; +} + +auto collect_loop_inputs(const std::shared_ptr& loop_begin, + const std::shared_ptr& buffer, + std::vector& new_loop_inputs, + std::vector& new_ptr_increments, + 
std::vector& new_finalization_offsets) -> void { + const auto loop_end = loop_begin->get_loop_end(); + const auto ptr_increments = loop_end->get_ptr_increments(); + const auto finalization_offsets = loop_end->get_finalization_offsets(); + for (size_t i = 0; i < loop_begin->get_input_size(); i++) { + const auto input = loop_begin->input(i); + // Skip target Buffer + if (input.get_source_output().get_node_shared_ptr() != buffer) { + const auto edge = Edge{ input.get_source_output(), + loop_begin->output(input.get_index()).get_target_inputs() }; + new_loop_inputs.push_back(edge); + new_ptr_increments.push_back(ptr_increments[i]); + new_finalization_offsets.push_back(finalization_offsets[i]); + // Remove LoopBegin from Parent as target input + input.get_source_output().remove_target_input(input); + } + } +} + +auto collect_loop_outputs(const std::shared_ptr& loop_end, + const std::shared_ptr& buffer, + std::vector& new_loop_outputs, + std::vector& new_ptr_increments, + std::vector& new_finalization_offsets, + const bool reduce_max_case) -> bool { + const auto loop_begin = loop_end->get_loop_begin(); + const auto ptr_increments = loop_end->get_ptr_increments(); + const auto finalization_offsets = loop_end->get_finalization_offsets(); + bool is_current_reduce_max_case = false; + for (size_t i = 0; i < loop_end->get_output_size(); i++) { + // ReduceMax case. When Loop cannot have empty output as ngraph op, + // we should have fake edge through all Loops (LoopBegin->LoopEnd) which connect src and dst data. 
+ + // If we merge this Loop and the Loop before, we should remove this fake edge + // because now we have real data for storing + auto new_input_node = loop_end->get_input_node_shared_ptr(i); + if (ov::is_type(new_input_node)) { + // We set temporary boolean variable because this value is for the next LoopEnd (upper), not for the current LoopEnd + is_current_reduce_max_case = true; + // Remove LoopEnd from Parent as target input + loop_end->input_value(i).remove_target_input(loop_end->input(i)); + } else { + const auto output = loop_end->output(i); + // Skip target Buffer + InputSet target_inputs; + for (const auto& input : output.get_target_inputs()) { + if (input.get_node()->shared_from_this() != buffer || reduce_max_case) { + target_inputs.insert(input); + } + } + + if (target_inputs.size()) { + const auto edge = Edge{loop_end->input_value(output.get_index()), target_inputs}; + new_loop_outputs.push_back(edge); + new_ptr_increments.push_back(ptr_increments[loop_begin->get_input_size() + i]); + new_finalization_offsets.push_back(finalization_offsets[loop_begin->get_input_size() + i]); + // Remove LoopEnd from Parent as target input + loop_end->input_value(i).remove_target_input(loop_end->input(i)); + } + } + } + + return is_current_reduce_max_case; +} + +} // namespace + + +bool ngraph::snippets::pass::LoopFusion::Merge(const std::shared_ptr& loop_begin_down) { + if (!loop_begin_down) { + return false; + } + + std::shared_ptr loop_end_up = nullptr; + std::shared_ptr buffer = nullptr; + // Initialize the corresponding upper LoopEnd and Buffer + if (!get_buffer_and_loop_end(loop_begin_down, loop_end_up, buffer)) { + return false; + } + // Check for conditions of fusion + if (!can_be_merged(loop_end_up, loop_begin_down)) { + return false; + } + + const auto loop_end_down = loop_begin_down->get_loop_end(); + const auto loop_begin_up = loop_end_up->get_loop_begin(); + const auto new_input_count = loop_begin_up->get_input_size() + loop_begin_down->get_input_size(); + 
const auto new_output_count = loop_end_up->get_output_size() + loop_end_down->get_output_size(); + const auto new_io_count = new_input_count + new_output_count; + const auto ptr_increments_up = loop_end_up->get_ptr_increments(); + const auto ptr_increments_down = loop_end_down->get_ptr_increments(); + const auto finalization_offsets_up = loop_end_up->get_finalization_offsets(); + const auto finalization_offsets_down = loop_end_down->get_finalization_offsets(); + std::vector new_ptr_increments, new_finalization_offsets; + new_ptr_increments.reserve(new_io_count); + new_finalization_offsets.reserve(new_io_count); + + // Collect new loop inputs + std::vector loop_inputs; + loop_inputs.reserve(new_input_count); + new_ptr_increments.reserve(new_io_count); + new_finalization_offsets.reserve(new_io_count); + collect_loop_inputs(loop_begin_up, buffer, loop_inputs, new_ptr_increments, new_finalization_offsets); + collect_loop_inputs(loop_begin_down, buffer, loop_inputs, new_ptr_increments, new_finalization_offsets); + + // Collect new Loop outputs + std::vector loop_outputs; + loop_outputs.reserve(new_output_count); + // We can fuse Loop with maximum accumulator pattern only with Smth input + // So firstly, we analyze LoopEnd down (it's possible maximum accumulator pattern), set `reduce_max_case` variable + // if it's really maximum accumulator pattern, and then analyze LoopEnd up using `reduce_max_case` variable + const bool reduce_max_case = collect_loop_outputs(loop_end_down, buffer, loop_outputs, new_ptr_increments, new_finalization_offsets, false); + collect_loop_outputs(loop_end_up, buffer, loop_outputs, new_ptr_increments, new_finalization_offsets, reduce_max_case); + if (reduce_max_case) { + const auto target_inputs = loop_begin_down->output(0).get_target_inputs(); + NGRAPH_CHECK(target_inputs.size() == 1, "LoopBegin in ReduceMax should have only one consumer (Load) for out port 0"); + const auto load = 
ov::as_type_ptr(target_inputs.begin()->get_node()->shared_from_this()); + NGRAPH_CHECK(load != nullptr, "LoopBegin in ReduceMax should have only one consumer for out port 0 - Load"); + + const auto store = ov::as_type_ptr(loop_end_up->get_input_node_shared_ptr(0)); + NGRAPH_CHECK(store != nullptr, "Before LoopEnd should be Store emitter"); + + // Connect vector emitters before Store and after Load + load->output(0).replace(store->get_input_source_output(0)); + } + + const auto new_increment = loop_end_up->get_increment(); + const auto new_work_amount = loop_end_up->get_work_amount(); + + // Create new LoopBegin + OutputVector new_loop_begin_inputs; + new_loop_begin_inputs.reserve(loop_inputs.size()); + for (const auto& loop_input : loop_inputs) { + const auto data_output = loop_input.first; + new_loop_begin_inputs.push_back(data_output); + } + const auto new_loop_begin = std::make_shared(new_loop_begin_inputs); + NGRAPH_CHECK(new_loop_begin->get_input_size() == loop_inputs.size(), "New LoopBegin has incorrect count of inputs."); + + // Connect new LoopBegin to input edges + for (size_t i = 0; i < loop_inputs.size(); i++) { + const auto edge = loop_inputs[i]; + for (auto& target_input : edge.second) { + target_input.replace_source_output(new_loop_begin->output(i)); + } + } + + // Create new LoopEnd + OutputVector new_loop_end_inputs; + new_loop_end_inputs.reserve(loop_outputs.size() + 1); // + 1 - for loop_begin + for (const auto& loop_output : loop_outputs) { + const auto data_output = loop_output.first; + new_loop_end_inputs.push_back(data_output); + } + new_loop_end_inputs.push_back(new_loop_begin->output(new_loop_begin->get_input_size())); + const auto new_loop_end = std::make_shared(new_loop_end_inputs, new_work_amount, new_increment, + new_ptr_increments, new_finalization_offsets); + NGRAPH_CHECK(new_loop_end->get_output_size() == loop_outputs.size(), "New LoopEnd has incorrect count of outputs."); + // Connect new LoopEnd to output edges + for (size_t i = 0; 
i < loop_outputs.size(); i++) { + const auto edge = loop_outputs[i]; + auto new_output = new_loop_end->output(i); + for (auto& target_input : edge.second) { + target_input.replace_source_output(new_output); + } + } + + if (reduce_max_case) { + loop_end_down->output(0).replace(buffer->output(0)); + } else { + // Remove old Loops and Load/Store if there are around Buffer + for (size_t i = 0; i < loop_end_up->get_input_size() - 1; i++) { + auto new_output = loop_end_up->input_value(i); + loop_end_up->output(i).replace(new_output); + new_output.remove_target_input(loop_end_up->input(i)); + } + for (size_t i = 0; i < loop_begin_down->get_input_size(); i++) { + const auto output_target_inputs = loop_begin_down->output(i).get_target_inputs(); + const auto new_output = loop_begin_down->input_value(i); + for (const auto &target_input : output_target_inputs) { + target_input.replace_source_output(new_output); + } + + // Clear old Buffer children + new_output.remove_target_input(loop_begin_down->input(i)); + } + } + + new_loop_end->has_outer_loop = loop_end_down->has_outer_loop || loop_end_up->has_outer_loop; + + loop_begin_up->transfer_control_dependents(new_loop_begin); + loop_begin_down->transfer_control_dependents(new_loop_begin); + loop_end_up->transfer_control_dependents(new_loop_end); + loop_end_down->transfer_control_dependents(new_loop_end); + new_loop_begin->add_node_control_dependencies(loop_begin_up); + new_loop_begin->add_node_control_dependencies(loop_begin_down); + new_loop_end->add_node_control_dependencies(loop_end_up); + new_loop_end->add_node_control_dependencies(loop_end_down); + + // If there was Buffer between Loops, after Loop fusion + // we should remove the Buffer node and MemoryAccess nodes if it's needed + if (buffer) { + const auto buffer_input = buffer->get_input_node_shared_ptr(0); + const auto buffer_output = buffer->output(0).get_target_inputs().begin()->get_node()->shared_from_this(); + + // If after merging there are Load and Store, we should 
remove them + if (const auto store = ov::as_type_ptr(buffer_input)) { + store->output(0).replace(store->input_value(0)); + } + if (const auto load = ov::as_type_ptr(buffer_output)) { + load->output(0).replace(load->input_value(0)); + } + + // Remove Buffer if there are no Loops and MatMul after Loop fusion + // because only these operations can have Buffer node on inputs and outputs. + // So if there aren't, it means that Buffer is extra, and we can remove it + if (!ov::is_type(buffer_output) && !ov::is_type(buffer_input) && + !ov::is_type(buffer_output) && !ov::is_type(buffer_input)) { + buffer->output(0).replace(buffer->input_value(0)); + } + } + + return true; +} + +ngraph::snippets::pass::LoopFusion::LoopFusion() { + MATCHER_SCOPE(LoopFusion); + + auto m_loop_begin = ngraph::pattern::wrap_type(); + + auto callback = [=](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::LoopFusion") + auto& pattern_to_output = m.get_pattern_value_map(); + const auto loop_begin = ngraph::as_type_ptr(pattern_to_output.at(m_loop_begin).get_node_shared_ptr()); + const auto status = Merge(loop_begin); + return status; + }; + + auto matcher = std::make_shared(m_loop_begin, matcher_name); + register_matcher(matcher, callback); +} diff --git a/src/common/snippets/src/pass/loop_helpers.cpp b/src/common/snippets/src/pass/loop_helpers.cpp new file mode 100644 index 00000000000..696f7816a27 --- /dev/null +++ b/src/common/snippets/src/pass/loop_helpers.cpp @@ -0,0 +1,48 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ngraph/op/op.hpp" +#include "snippets/pass/loop_helpers.hpp" + +namespace ngraph { +namespace snippets { +namespace op { +std::shared_ptr insertLoopBeginAfterOutputs(const OutputVector& originalOutputs) { + std::vector>> originalChildInputs; + for (const auto& out : originalOutputs) { + originalChildInputs.push_back(out.get_target_inputs()); + } + + auto 
loop_begin = std::make_shared(originalOutputs); + + for (int i = 0; i < originalChildInputs.size(); i++) { + for (auto& input : originalChildInputs[i]) { + input.replace_source_output(loop_begin->output(i)); + } + } + return loop_begin; +} + +std::shared_ptr insertLoopEndBeforeInputs(const std::vector>& originalInputs, + const std::shared_ptr& loopBegin, + size_t work_amount, size_t increment, + std::vector apply_increment, + std::vector finalization_offsets) { + OutputVector originalParentOutputs; + for (const auto& in : originalInputs) { + originalParentOutputs.push_back(in.get_source_output()); + } + originalParentOutputs.push_back(loopBegin->output(loopBegin->get_output_size() - 1)); + auto loop_end = std::make_shared(originalParentOutputs, work_amount, increment, + std::move(apply_increment), std::move(finalization_offsets)); + + for (int i = 0; i < originalInputs.size(); i++) { + originalInputs[i].replace_source_output(loop_end->output(i)); + } + return loop_end; +} + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/src/pass/matmul_to_brgemm.cpp b/src/common/snippets/src/pass/matmul_to_brgemm.cpp new file mode 100644 index 00000000000..b74fb3e68cc --- /dev/null +++ b/src/common/snippets/src/pass/matmul_to_brgemm.cpp @@ -0,0 +1,45 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/itt.hpp" + +#include "snippets/pass/matmul_to_brgemm.hpp" + +#include "snippets/op/brgemm.hpp" + +#include "ngraph/opsets/opset1.hpp" +#include "ngraph/rt_info.hpp" +#include "ngraph/pattern/op/wrap_type.hpp" + +namespace ngraph { +namespace snippets { +namespace pass { + +MatMulToBrgemm::MatMulToBrgemm() { + MATCHER_SCOPE(MatMulToBrgemm); + auto matmul_pattern = ngraph::pattern::wrap_type({ngraph::pattern::any_input(), + ngraph::pattern::any_input()}); + + auto callback = [=](ngraph::pattern::Matcher& m) { + 
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "ov::intel_cpu::pass::MatMulToBrgemm") + auto& pm = m.get_pattern_value_map(); + const auto matmul = as_type_ptr(pm.at(matmul_pattern).get_node_shared_ptr()); + // Brgemm doesn't support transposed inputs currently, so we don't convert such matmuls + if (matmul->get_transpose_a() || matmul->get_transpose_b()) + return false; + + auto brgemm = std::make_shared(matmul->get_input_source_output(0), matmul->get_input_source_output(1)); + brgemm->set_friendly_name(matmul->get_friendly_name()); + ngraph::copy_runtime_info(matmul, brgemm); + ngraph::replace_node(matmul, brgemm); + return true; + }; + + auto m = std::make_shared(matmul_pattern, matcher_name); + register_matcher(m, callback); +} + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/src/pass/mha_tokenization.cpp b/src/common/snippets/src/pass/mha_tokenization.cpp new file mode 100644 index 00000000000..69a166140b4 --- /dev/null +++ b/src/common/snippets/src/pass/mha_tokenization.cpp @@ -0,0 +1,394 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/pass/mha_tokenization.hpp" +#include "snippets/pass/tokenization.hpp" +#include "snippets/op/subgraph.hpp" + +#include +#include +#include +#include + + +namespace { +auto is_supported_tensor(const ngraph::descriptor::Tensor& t) -> bool { + // TODO: Add support of all supported by common tokenization element types + // return ngraph::snippets::pass::TokenizeSnippets::supported_element_types.count(input.get_element_type()) != 0; + // Also only 4D is supported at the moment + return t.get_element_type() == ngraph::element::f32 && t.get_partial_shape().is_static() && t.get_shape().size() == 4; +} + +// TODO: Add support of FQ, Reshape? 
+auto is_supported_op(const std::shared_ptr& node) -> bool { + return ngraph::snippets::pass::TokenizeSnippets::AppropriateForSubgraph(node) && + (ngraph::is_type(node) || + ngraph::is_type(node) || + ngraph::is_type(node)); +} + +auto is_valid_transpose(const std::shared_ptr& node, std::vector expected_order) -> bool { + auto valid_transpose_order = [expected_order](const std::shared_ptr& node) -> bool { + const auto transpose_pattern = ngraph::as_type_ptr(node); + if (!transpose_pattern) + return false; + return transpose_pattern->cast_vector() == expected_order; + }; + + return node && node->get_output_target_inputs(0).size() == 1 && node->get_shape().size() == 4 && + valid_transpose_order(node->get_input_node_shared_ptr(1)) && is_supported_tensor(node->get_input_tensor(0)); +} + +auto tokenize_broadcast(const std::shared_ptr& interm_op, ov::NodeVector& ordered_ops) -> void { + // We can tokenize Broadcast op only when output shape of child doesn't depend on Broadcast shape without last dimension. + // Snippets remove Broadcast op and insert BroadcastMove if last dimensions before and after Broadcast are different. + // Otherwise, we can lose original shape. + // Example: + // in0 [1, 1, 1] in0 [1, 1, 1] in0 [1, 1, 1] in0 [1, 1, 1] + // Broadcast [1, 10, 1] / \ / + // \ / --->>> Add + // Add | + // Result [1, 10, 1] Result [1, 1, 1] + + ov::PartialShape new_output_shape(std::vector{1}); + ov::NodeVector broadcast_nodes; + + auto skip_last_dim = [](const ov::PartialShape& shape) { + return ov::PartialShape(std::vector{shape.begin(), shape.end() - 1}); + }; + + for (auto input : interm_op->inputs()) { + auto broadcast = ov::as_type_ptr(input.get_source_output().get_node_shared_ptr()); + // TODO: Can we reuse AppropriateForSubgraph here? 
Seems like it's huge check for Broadcast + if (broadcast && broadcast->get_broadcast_spec().m_type == ov::op::AutoBroadcastType::NUMPY && + broadcast->get_output_target_inputs(0).size() == 1) { + broadcast_nodes.push_back(broadcast); + + const auto pshape = broadcast->get_input_partial_shape(0); + if (pshape.rank().is_static() && pshape.size() > 2) { + ov::PartialShape::broadcast_merge_into(new_output_shape, + skip_last_dim(pshape), + ::ngraph::op::AutoBroadcastType::NUMPY); + } + } else { + const auto pshape = input.get_partial_shape(); + if (pshape.rank().is_static() && pshape.size() > 2) { + ov::PartialShape::broadcast_merge_into(new_output_shape, + skip_last_dim(pshape), + ::ngraph::op::AutoBroadcastType::NUMPY); + } + } + } + + if (!broadcast_nodes.empty()) { + if (new_output_shape == skip_last_dim(interm_op->get_output_partial_shape(0))) { + std::copy(broadcast_nodes.begin(), broadcast_nodes.end(), std::back_inserter(ordered_ops)); + } + } +} + +auto tokenize_reshape_around_softmax(std::shared_ptr& interm_op, + std::shared_ptr& reshape, + ngraph::NodeVector& ordered_ops) -> bool { + reshape = ngraph::as_type_ptr(interm_op); + if (reshape) { + const auto shape = reshape->get_input_shape(0); + if (shape.back() != reshape->get_output_shape(0).back() || reshape->get_output_target_inputs(0).size() != 1) + return false; + ordered_ops.push_back(reshape); + interm_op = reshape->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); + } + return true; +}; + +auto update_intermediate_supported_ops(std::shared_ptr& interm_op, ngraph::NodeVector& ordered_ops) -> bool { + // TODO: Add Reshape, FQ support + while (is_supported_op(interm_op)) { + // All supported intermediate ops have only one output port + // To verify output element type is enough because all supported intermediate ops have the same output element type as input type + if (interm_op->get_output_target_inputs(0).size() != 1 || !is_supported_tensor(interm_op->get_output_tensor(0))) + return 
false; + + // Check for supported Broadcast op + if (interm_op->get_input_size() > 1) { + tokenize_broadcast(interm_op, ordered_ops); + } + + ordered_ops.push_back(interm_op); + interm_op = interm_op->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); + } + return true; +}; +} // namespace + +ngraph::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets() { + MATCHER_SCOPE(TokenizeMHASnippets); + + auto m_matmul0 = std::make_shared(ngraph::pattern::any_input(ngraph::pattern::has_static_shape()), + ngraph::pattern::any_input(ngraph::pattern::has_static_shape())); + + register_matcher(std::make_shared(m_matmul0, matcher_name), + [=](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::TokenizeMHASnippets") + auto& pattern_to_output = m.get_pattern_value_map(); + + // After some transformations, a different number of Constants for some operations may be created + // than the actual number of Constants during tokenization. + // To avoid unsupported number of non-scalar Constants in the future (plugin specific limitation) + // we should calculate potential number of non-scalar Constants that will be moved up from body. 
+ // TODO: Need update this variable when FQ will be supported + size_t hidden_virtual_ports_count = 0; + // Default value is True because MHA pattern always requires Buffer op + bool need_buffer = true; + std::string fused_names; + ngraph::NodeVector ordered_ops; + + /* ======== Matcher Pass ========== */ + + /****** Skeleton ******/ + /* Skeleton on MHA-pattern is: + * \ / + * MatMul0 + * | + * Eltwise/Select/Reshape/FakeQuantize + * | + * Softmax + * | + * Eltwise/Select/Reshape/FakeQuantize + * \ / + * MatMul1 + */ + const auto matmul0 = ngraph::as_type_ptr(pattern_to_output.at(m_matmul0).get_node_shared_ptr()); + if (!matmul0 || matmul0->get_output_target_inputs(0).size() != 1 || matmul0->get_transpose_a() || + !is_supported_tensor(matmul0->get_input_tensor(0)) || !is_supported_tensor(matmul0->get_input_tensor(1))) + return false; + + if (transformation_callback(matmul0)) { + return false; + } + + ordered_ops.push_back(matmul0); + + auto interm_op = matmul0->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); + // Add supported operations which are between MatMul0 and Softmax to ordered_ops + if (!update_intermediate_supported_ops(interm_op, ordered_ops)) + return false; + + std::shared_ptr reshape0 = nullptr; + if (!tokenize_reshape_around_softmax(interm_op, reshape0, ordered_ops)) + return false; + + int64_t axis = 0; + const auto rank = interm_op->get_input_partial_shape(0).rank(); + if (const auto softmax_v8 = ngraph::as_type_ptr(interm_op)) { + axis = ngraph::normalize_axis(interm_op->get_friendly_name(), softmax_v8->get_axis(), rank); + } else if (const auto softmax_v1 = ngraph::as_type_ptr(interm_op)) { + axis = softmax_v1->get_axis(); + } else { + return false; + } + + if (axis != rank.get_length() - 1 || interm_op->get_output_target_inputs(0).size() != 1) + return false; + ordered_ops.push_back(interm_op); + + interm_op = interm_op->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); + std::shared_ptr reshape1 = 
nullptr; + if (!tokenize_reshape_around_softmax(interm_op, reshape1, ordered_ops)) + return false; + + if (((reshape0 == nullptr) != (reshape1 == nullptr)) || + (reshape0 && reshape1 && (reshape0->get_input_shape(0) != reshape1->get_output_shape(0)))) + return false; + + // Add supported operations which are between Softmax and MatMul1 to ordered_ops + if (!update_intermediate_supported_ops(interm_op, ordered_ops)) + return false; + + const auto matmul1 = ngraph::as_type_ptr(interm_op); + if (!matmul1 || matmul1->get_output_target_inputs(0).size() != 1 || matmul1->get_transpose_a() || matmul1->get_transpose_b() || + !is_supported_tensor(matmul1->get_input_tensor(0)) || !is_supported_tensor(matmul1->get_input_tensor(1))) + return false; + + /***********************/ + + /***** Transposes *****/ + /* There may be Transpose and Reshape ops on inputs and outputs of MHA-pattern skeleton + * We can add them into Subgraph body + */ + + // First input branch of MatMul0 should be executed before second input branch of MatMul0, + // so firstly we insert Transpose1 on the beginning of ordered_ops and then Transpose0 + bool are_weights_scalar = true; + auto parent = matmul0->get_input_node_shared_ptr(1); + while (is_supported_op(parent)) { + // All supported ops have only one output port + // To verify output element type is enough because all supported ops have the same output element type as input type + if (parent->get_output_target_inputs(0).size() != 1 || !is_supported_tensor(parent->get_output_tensor(0))) + break; + + const auto parent_count = parent->inputs().size(); + for (size_t i = 1; i < parent_count; ++i) { + are_weights_scalar = are_weights_scalar && ngraph::shape_size(parent->get_input_shape(i)) == 1; + } + ordered_ops.insert(ordered_ops.begin(), parent); + // We think that sequence of ops goes through input port 0 + // But can be Select here? If it can be, parent shouldn't be on input port 0. Need another way? 
+ parent = parent->get_input_node_shared_ptr(0); + } + + auto transpose1 = ngraph::as_type_ptr(parent); + if (matmul0->get_transpose_b()) { + if (is_valid_transpose(transpose1, {0, 2, 1, 3})) { + // We can support several ops between MatMul0 with transposed_b and Transpose1 with 0213 order + // only if these ops have scalar shapes on other inputs. + // There is transformation ExplicitTransposeMatMulInputs that set supported order and transposed_b(false). + // We can allow to call this pass only if ops have scalar shapes to avoid shape mismatching + if (are_weights_scalar) { + ordered_ops.insert(ordered_ops.begin(), transpose1); + } else { + return false; + } + } else { + return false; + } + } else { + if (is_valid_transpose(transpose1, {0, 2, 3, 1})) { + ordered_ops.insert(ordered_ops.begin(), transpose1); + } + } + + // TODO: Add Reshape Support for all Transposes + // Add 3D support for all Transposes + const auto transpose0 = ngraph::as_type_ptr(matmul0->get_input_node_shared_ptr(0)); + if (is_valid_transpose(transpose0, {0, 2, 1, 3})) { + ordered_ops.insert(ordered_ops.begin(), transpose0); + } else if (matmul0->get_transpose_b()) { + return false; + } + + const auto transpose2 = ngraph::as_type_ptr(matmul1->get_input_node_shared_ptr(1)); + if (is_valid_transpose(transpose2, {0, 2, 1, 3})) { + ordered_ops.push_back(transpose2); + } + ordered_ops.push_back(matmul1); + + auto child = matmul1->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); + // TODO: Add support Eltwises between MatMul1 and Transpose + // status = update_intermediate_supported_ops(child, ordered_ops); + // if (!status) { + // ordered_ops.push_back(child); + // } + + auto transpose3 = ngraph::as_type_ptr(child); + if (is_valid_transpose(transpose3, {0, 2, 1, 3})) { + ordered_ops.push_back(transpose3); + } + + /**********************/ + + /* ================================ */ + + /* ====== Subgraph creation ======= */ + + ngraph::OutputVector body_inputs, subgraph_inputs; + 
ngraph::ParameterVector body_parameters; + ngraph::ResultVector body_results; + std::vector>> subgraph_result_inputs; + + auto create_body_inputs = [&](const std::shared_ptr& node) -> void { + for (size_t i = 0; i < node->get_input_size(); ++i) { + const auto input = node->input(i); + const auto parent = input.get_source_output().get_node_shared_ptr(); + const auto constant = ov::as_type_ptr(parent); + if (constant && (ngraph::shape_size(input.get_shape()) == 1 || op::Subgraph::constant_input_should_be_inside_body(node))) { + // If Constant has one consumer - target node, we add Constant to body_inputs + // If Constant has several consumers, we should check that all these consumers are inside Subgraph body + // and if all of them are inside body, we can explicitly add Constant to the body_inputs, otherwise we should + // make a copy and add copy of Constant to body_inputs + // For example, this case is especially valid for Transposes nodes + // (several Transposes have the same order so there can be the common Constant with this order) + if (constant->get_output_target_inputs(0).size() == 1) { + body_inputs.push_back(input.get_source_output()); + } else { + const auto constant_consumers = constant->get_output_target_inputs(0); + bool all_consumers_are_inside = std::all_of(constant_consumers.begin(), constant_consumers.end(), + [&ordered_ops](const ngraph::Input& input) { + return std::find(ordered_ops.begin(), ordered_ops.end(), + input.get_node()->shared_from_this()) != ordered_ops.end(); + }); + if (all_consumers_are_inside) { + body_inputs.push_back(input.get_source_output()); + } else { + const auto constant_copy = constant->clone_with_new_inputs({}); + node->set_argument(input.get_index(), constant_copy); + body_inputs.push_back(constant_copy); + } + } + } else if (std::find(ordered_ops.begin(), ordered_ops.end(), parent) == ordered_ops.end()) { + auto parameter = std::make_shared(input.get_element_type(), input.get_partial_shape()); + 
body_parameters.push_back(parameter); + body_parameters.back()->set_friendly_name(input.get_node()->get_friendly_name()); + body_inputs.push_back(parameter->output(0)); + + subgraph_inputs.push_back(input.get_source_output()); + + node->input(i).replace_source_output(parameter); + } + } + }; + + for (const auto& op : ordered_ops) { + create_body_inputs(op); + op->clear_control_dependencies(); + fused_names += op->get_friendly_name() + ","; + } + + const auto last_node = ordered_ops.back(); + for (const auto& output : last_node->outputs()) { + subgraph_result_inputs.push_back(output.get_target_inputs()); + } + for (const auto& output : last_node->outputs()) { + body_results.push_back(std::make_shared(last_node->output(output.get_index()))); + } + + if (body_results.size() != subgraph_result_inputs.size()) { + throw ngraph_error("body results and node results size mismatch during subgraph collapse"); + } + + // todo: move this plugin-specific constraint to the plugin callback + if (body_parameters.size() + body_results.size() + hidden_virtual_ports_count > 12) { + return false; + } + + auto body = op::create_body(last_node->get_friendly_name(), body_results, body_parameters); + auto subgraph = std::make_shared(subgraph_inputs, body); + // Copy runtime info from last node to subgraph - to copy topological order + copy_runtime_info(last_node, subgraph); + subgraph->set_friendly_name(last_node->get_friendly_name()); + + for (size_t i = 0; i < subgraph->get_output_size(); ++i) { + for (const auto& target_input : subgraph_result_inputs[i]) { + target_input.replace_source_output(subgraph->output(i)); + } + } + op::update_out_tensor_name(subgraph); + + subgraph->validate_and_infer_types(); + + auto act_body = subgraph->body_ptr(); + for (size_t i = 0; i < act_body->get_parameters().size(); i++) { + act_body->get_parameters()[i]->set_friendly_name(body_parameters[i]->get_friendly_name()); + } + subgraph->get_rt_info()["originalLayersNames"] = fused_names; + 
subgraph->set_virtual_port_count(hidden_virtual_ports_count); + subgraph->set_buffer_needed(need_buffer); + + return true; + + /* ================================ */ + }); +} diff --git a/src/common/snippets/src/pass/reset_buffer.cpp b/src/common/snippets/src/pass/reset_buffer.cpp new file mode 100644 index 00000000000..bae2ac58ccd --- /dev/null +++ b/src/common/snippets/src/pass/reset_buffer.cpp @@ -0,0 +1,114 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include + +#include "snippets/snippets_isa.hpp" +#include "snippets/pass/reset_buffer.hpp" +#include "snippets/op/subgraph.hpp" + + +namespace { +void normalize_ptr_and_offsets(const ov::NodeVector &io, std::vector &ptr_increments, std::vector &finalization_offsets) { + bool there_is_buffer = false; + // Iterations are from end because before we correct finalization offsets for Loop outputs (io = inputs + outputs) + for (int i = static_cast(io.size()) - 1; i >= 0; --i) { + if (ov::is_type(io[i])) { + if (there_is_buffer) { + ptr_increments[i] = 0; + finalization_offsets[i] = 0; + } else { + there_is_buffer = true; + } + } + } +} +} // namespace + +int64_t ngraph::snippets::pass::ResetBufferState::calculate_required_finalization_offsets(const size_t back_step, const size_t target_work_amount) { + return target_work_amount != 1 ? 
-static_cast(back_step) : 0; +} + +ngraph::snippets::pass::ResetBufferState::ResetBufferState() { + MATCHER_SCOPE(ResetBufferState); + + // Match on LoopEnd is enough at the moment because Buffer op may be only after MatMul and LoopEnd, but + // MatMul doesn't change Buffer memory pointer after execution + auto m_loop_end = ngraph::pattern::wrap_type(); + + auto callback = [=](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ResetBufferState") + auto& pattern_to_output = m.get_pattern_value_map(); + + const auto loop_end = ngraph::as_type_ptr(pattern_to_output.at(m_loop_end).get_node_shared_ptr()); + const auto loop_begin = loop_end->get_loop_begin(); + + const auto i_size = loop_begin->get_input_size(); + const auto o_size = loop_end->get_output_size(); + const auto count_io = i_size + o_size; + std::vector body_shapes(count_io); + ov::NodeVector io(count_io); + for (size_t i = 0; i < i_size; ++i) { + body_shapes[i] = loop_begin->input_value(i).get_partial_shape(); + io[i] = loop_begin->input_value(i).get_node_shared_ptr(); + auto port_idx = loop_begin->input_value(i).get_index(); + while (std::dynamic_pointer_cast(io[i])) { + const auto source_output = io[i]->input_value(port_idx); + io[i] = source_output.get_node_shared_ptr(); + port_idx = source_output.get_index(); + } + } + for (size_t i = 0; i < o_size; ++i) { + body_shapes[i_size + i] = loop_end->output(i).get_partial_shape(); + // check for first target input is enough for Buffer searching because operations can have only single Buffer per each output port as op + auto consumer = *loop_end->output(i).get_target_inputs().begin(); + auto port_idx = consumer.get_index(); + io[i_size + i] = consumer.get_node()->shared_from_this(); + while (std::dynamic_pointer_cast(io[i_size + i])) { + auto consumer = *io[i_size + i]->output(port_idx).get_target_inputs().begin(); + port_idx = consumer.get_index(); + io[i_size + i] = 
consumer.get_node()->shared_from_this(); + } + } + + auto ptr_increments = loop_end->get_ptr_increments(); + auto finalization_offsets = loop_end->get_finalization_offsets(); + + // If after Loop there is immediately Buffer, we should reset the Buffer ptr for the next calculations + for (size_t i = 0; i < o_size; ++i) { + const auto result_shape = body_shapes[i_size + i].get_shape(); + // check for first target input is enough for Buffer searching because operations can have only single Buffer per each output port as op + const auto consumer = loop_end->output(i).get_target_inputs().begin()->get_node(); + if (ov::is_type(consumer)) { + // To calculate finalization offset we should know index of nesting Loop + auto loop_index = 0lu; + auto loop = loop_end->input_value(i).get_node_shared_ptr(); + auto port_idx = loop_end->input_value(i).get_index(); + while (std::dynamic_pointer_cast(loop)) { + const auto source_output = loop->input_value(port_idx); + loop = source_output.get_node_shared_ptr(); + port_idx = source_output.get_index(); + loop_index++; + } + + const auto work_amount = std::accumulate(result_shape.rbegin(), result_shape.rbegin() + loop_index + 1, size_t(1), std::multiplies()); + finalization_offsets[i_size + i] = + calculate_required_finalization_offsets(work_amount, *(result_shape.rbegin() + loop_index)); + } + } + + // If there are several Buffers on I/O we should remember that all Buffer have the register, + // so we should update ptr for only one Buffer + normalize_ptr_and_offsets(io, ptr_increments, finalization_offsets); + loop_end->set_finalization_offsets(finalization_offsets); + loop_end->set_ptr_increments(ptr_increments); + + return true; + }; + + auto m = std::make_shared(m_loop_end, matcher_name); + register_matcher(m, callback); +} diff --git a/src/common/snippets/src/pass/softmax_decomposition.cpp b/src/common/snippets/src/pass/softmax_decomposition.cpp new file mode 100644 index 00000000000..1a7330fb537 --- /dev/null +++ 
b/src/common/snippets/src/pass/softmax_decomposition.cpp @@ -0,0 +1,216 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/remarks.hpp" +#include + +#include "snippets/pass/softmax_decomposition.hpp" +#include "snippets/pass/reset_buffer.hpp" +#include "snippets/pass/insert_loops.hpp" +#include "snippets/pass/loop_helpers.hpp" +#include "snippets/snippets_isa.hpp" + +#include +#include +#include +#include +#include + + +ngraph::snippets::pass::SoftmaxDecomposition::SoftmaxDecomposition(const size_t vector_size, const int32_t buffer_allocation_rank) { + MATCHER_SCOPE(SoftmaxDecomposition); + + auto m_softmax = ngraph::pattern::wrap_type(); + + auto callback = [=](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::SoftmaxDecomposition") + auto root = m.get_match_root(); + const auto master_pshape = root->get_input_partial_shape(0); + const auto rank = master_pshape.rank(); + if (rank.is_dynamic() || master_pshape.is_dynamic()) + return false; + + int64_t axis = 0; + if (const auto softmax_v8 = ngraph::as_type_ptr(root)) { + axis = ngraph::normalize_axis(root->get_friendly_name(), softmax_v8->get_axis(), rank); + } else if (const auto softmax_v1 = ngraph::as_type_ptr(root)) { + axis = softmax_v1->get_axis(); + } else { + return false; + } + + const auto shape_rank = rank.get_length(); + if (axis != shape_rank - 1) + return false; + + const auto data = root->get_input_node_shared_ptr(0); + + const auto master_shape = master_pshape.get_shape(); + const auto dimension = shape_rank - 1; + const auto work_amount = master_shape[dimension]; + const auto increment = vector_size; + const auto inner_dim = shape_rank - 1; + const auto inner_master_work_amount = static_cast(master_shape[inner_dim]); + const int outer_dim = shape_rank > 1 ? 
static_cast(shape_rank - 2) : -1; + const auto has_outer_loop = outer_dim >= 0 && master_shape[outer_dim] > 1; + + /* ====== ReduceMax decomposition ====== */ + + // We have to have fake edge Data -> Loop[ReduceMax] -> Loop[Sub + Exp + ReduceSum] because ReduceMax is + // accumulator which finds maximum of elements and save it to vector register. Loop works only with GPR (data) but ReduceMax Loop + // doesn't save maximum to data. Seems like, LoopEnd shouldn't have outputs: + // Data + // VectorBuffer LoopBegin \ + // \ Load \ | + // Maximum / | + // / LoopEnd | + // HorizonMax / + // \ LoopBegin[Sub + Exp + ReduceSum] + // But nGraph doesn't allow to have 0 outputs for Node (at least 1 output). + // Thus, we propagate data through Loop[ReduceMax] using fake edge because of that Loop[ReduceMax] has two inputs "Data" + // Data + // VectorBuffer LoopBegin + // \ Load | \ + // Maximum | / + // / LoopEnd + // HorizonMax | + // \ LoopBegin[Sub + Exp + ReduceSum] + const auto vector_buffer_max = std::make_shared(); + const auto loop_max_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{data, data}); + + const auto load_max = std::make_shared(loop_max_begin->output(0), increment); + const auto max = std::make_shared(load_max, vector_buffer_max); + + auto apply_increments_max = + InsertLoops::calculate_inner_apply_increments(master_shape, {data->get_shape(), data->get_shape(), data->get_shape()}); + // Input of softmax is Input and Output of this loop, which isn't used inside (it's just to have one output in Loop at least) + // So we shouldn't increment pointer after each loop iteration + apply_increments_max[0] = false; + apply_increments_max[1] = false; + // we should always reset data ptr after this loop because in the next Loop this ptr is used + // Although output isn't a Buffer op, we set finalization offset and ptr increment for output, because ResetBufferState pass + // normalizes offsets and increments starting from outputs + const auto 
finalization_offsets_max = + std::vector{ 0, 0, ResetBufferState::calculate_required_finalization_offsets(inner_master_work_amount, data->get_shape()[inner_dim]) }; + const auto loop_max_end = std::make_shared(ngraph::OutputVector{loop_max_begin->output(1), loop_max_begin->output(2)}, + work_amount, increment, apply_increments_max, finalization_offsets_max); + + const auto horizon_max = std::make_shared(max); + + /* =========================================== */ + + /* === Sub + Exp + ReduceSum decomposition === */ + + const auto vector_buffer_sum = std::make_shared(); + const auto loop_sum_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{loop_max_end->output(0)}); + + const auto load_sub = std::make_shared(loop_sum_begin->output(0), increment); + const auto sub = std::make_shared(load_sub, horizon_max); + const auto exp = std::make_shared(sub); + const auto sum = std::make_shared(exp, vector_buffer_sum); + const auto store_exp = std::make_shared(exp, increment); + + auto apply_increments_sum = + InsertLoops::calculate_inner_apply_increments(master_shape, {load_sub->get_shape(), store_exp->get_shape()}); + std::vector finalization_offsets_sum(2, 0); + if (has_outer_loop) { + finalization_offsets_sum = + InsertLoops::calculate_finalization_offsets(master_shape, {load_sub->get_shape(), store_exp->get_shape()}); + } + // we should always reset buffer ptr after loop because in the next Loop this buffer ptr is used + finalization_offsets_sum[1] = ResetBufferState::calculate_required_finalization_offsets(inner_master_work_amount, store_exp->get_shape()[inner_dim]); + const auto loop_sum_end = std::make_shared( + ngraph::OutputVector{store_exp, loop_sum_begin->output(1)}, work_amount, increment, + apply_increments_sum, finalization_offsets_sum); + + const auto horizon_sum = std::make_shared(sum); + const auto buffer_exp = std::make_shared(loop_sum_end->output(0), buffer_allocation_rank); + + /* =========================================== */ + + /* 
================== Div ==================== */ + + // Divide is expensive operation, so we decompose it into 1 / x * y, where 1 / x is executed outside loop + const auto pow = std::make_shared(horizon_sum, + ngraph::op::Constant::create(ov::element::f32, ngraph::Shape{}, {-1})); + + const auto loop_div_begin = op::insertLoopBegin(ngraph::OutputVector{buffer_exp}); + + const auto load_div = std::make_shared(loop_div_begin->output(0), increment); + const auto mul = std::make_shared(load_div, pow); + const auto store_div = std::make_shared(mul, increment); + + auto apply_increments_div = + InsertLoops::calculate_inner_apply_increments(master_shape, {load_div->get_shape(), store_div->get_shape()}); + std::vector finalization_offsets_div(2, 0); + if (has_outer_loop) { + finalization_offsets_div = + InsertLoops::calculate_finalization_offsets(master_shape, {load_div->get_shape(), store_div->get_shape()}); + } + const auto loop_div_end = std::make_shared( + ngraph::OutputVector{store_div, loop_div_begin->output(1)}, work_amount, increment, + apply_increments_div, finalization_offsets_div); + + /* =========================================== */ + + /* ========== Control dependency ============= */ + + loop_max_begin->add_control_dependency(vector_buffer_max); + loop_max_end->add_control_dependency(max); + horizon_max->add_control_dependency(loop_max_end); + loop_sum_begin->add_control_dependency(vector_buffer_sum); + loop_sum_begin->add_control_dependency(horizon_max); + loop_sum_end->add_control_dependency(sum); + horizon_sum->add_control_dependency(loop_sum_end); + loop_div_begin->add_control_dependency(horizon_sum); + loop_div_begin->add_control_dependency(pow); + + /* =========================================== */ + + /* ============= Runtime Info ================ */ + + // For tail loop we should fill input of Max by float min and + // input of Sum by zero to avoid math incorrect calculations + max->input(0).get_rt_info()["set_fill"] = uint32_t(0xff7fffff); + 
sum->input(0).get_rt_info()["set_fill"] = uint32_t(0x00000000); + + // These nodes should be executed outside loops + ov::NodeVector ops_outside_loop = { vector_buffer_max, horizon_max, vector_buffer_sum, horizon_sum, pow, buffer_exp }; + for (const auto& op : ops_outside_loop) { + op->get_rt_info()["outside_loop"] = true; + } + + ngraph::copy_runtime_info(root, + {vector_buffer_max, loop_max_begin, load_max, max, horizon_max, loop_max_end, + vector_buffer_sum, loop_sum_begin, load_sub, sub, exp, sum, store_exp, horizon_sum, loop_sum_end, buffer_exp, pow, + loop_div_begin, load_div, mul, store_div, loop_div_end}); + + /* =========================================== */ + + ngraph::replace_node(root, loop_div_end); + + /* ============== Outer loop ================= */ + if (has_outer_loop) { + std::vector apply_increments = + InsertLoops::calculate_outer_apply_increments({root->get_input_shape(0), root->get_output_shape(0)}); + const auto softmax_parameters = + std::vector>{loop_max_begin->input(0).get_source_output()}; + const auto output_set = loop_div_end->output(0).get_target_inputs(); + const auto softmax_results = std::vector>{output_set.begin(), output_set.end()}; + const auto& outer_loop_begin = ngraph::snippets::op::insertLoopBegin(softmax_parameters); + const auto outer_loop_end = ngraph::snippets::op::insertLoopEndBeforeInputs( + softmax_results, outer_loop_begin, master_shape[outer_dim], 1, apply_increments); + + vector_buffer_max->add_control_dependency(outer_loop_begin); + + ngraph::copy_runtime_info(root, {outer_loop_begin, outer_loop_end}); + } + /* =========================================== */ + + return true; + }; + + auto m = std::make_shared(m_softmax, matcher_name); + register_matcher(m, callback); +} diff --git a/src/common/snippets/src/pass/softmax_reshape_elimination.cpp b/src/common/snippets/src/pass/softmax_reshape_elimination.cpp new file mode 100644 index 00000000000..f770f4e8066 --- /dev/null +++ 
b/src/common/snippets/src/pass/softmax_reshape_elimination.cpp @@ -0,0 +1,70 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "snippets/remarks.hpp" + +#include "snippets/pass/softmax_reshape_elimination.hpp" +#include "snippets/snippets_isa.hpp" + +#include +#include +#include + +ngraph::snippets::pass::SoftmaxReshapeElimination::SoftmaxReshapeElimination() { + MATCHER_SCOPE(SoftmaxReshapeElimination); + const auto m_reshape0 = pattern::wrap_type(pattern::has_static_shape()); + const auto m_softmax = pattern::wrap_type({m_reshape0}); + const auto m_reshape1 = pattern::wrap_type({m_softmax, pattern::wrap_type()}); + + register_matcher(std::make_shared(m_reshape1, matcher_name), + [=](ngraph::pattern::Matcher &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::SoftmaxReshapeElimination") + auto& pattern_to_output = m.get_pattern_value_map(); + auto reshape0 = pattern_to_output[m_reshape0].get_node_shared_ptr(); + auto softmax = pattern_to_output[m_softmax].get_node_shared_ptr(); + auto reshape1 = pattern_to_output[m_reshape1].get_node_shared_ptr(); + + const auto input_shape = reshape0->get_input_partial_shape(0); + const auto output_shape = reshape1->get_output_partial_shape(0); + if (input_shape.is_dynamic() || output_shape.is_dynamic() || input_shape.get_shape() != output_shape.get_shape()) + return false; + + const auto softmax_rank = softmax->get_input_partial_shape(0).rank(); + int64_t axis = 0; + if (const auto softmax_v8 = ngraph::as_type_ptr(softmax)) { + axis = ngraph::normalize_axis(softmax->get_friendly_name(), softmax_v8->get_axis(), softmax_rank); + } else if (const auto softmax_v1 = ngraph::as_type_ptr(softmax)) { + axis = softmax_v1->get_axis(); + } else { + return false; + } + + // Supports only last axis + if (axis != softmax_rank.get_length() - 1) + return false; + + // Dimensions by reduction axis should be equal + if 
(input_shape.get_shape().back() != softmax->get_input_shape(0).back()) + return false; + + // Eliminate Reshape before Softmax + reshape0->output(0).replace(reshape0->input_value(0)); + copy_runtime_info({reshape0->input_value(0).get_node_shared_ptr(), reshape0->output(0).get_node_shared_ptr()}, + reshape0->input_value(0).get_node_shared_ptr()); + + // Eliminate Reshape after Softmax with name saving + replace_output_update_name(reshape1->output(0), reshape1->input_value(0)); + + // update axis + const auto new_axis = input_shape.rank().get_length() - 1; + if (auto softmax_v8 = ngraph::as_type_ptr(softmax)) { + softmax_v8->set_axis(new_axis); + } else if (auto softmax_v1 = ngraph::as_type_ptr(softmax)) { + softmax_v1->set_axis(new_axis); + } + + return true; + }); +} diff --git a/src/common/snippets/src/pass/tokenization.cpp b/src/common/snippets/src/pass/tokenization.cpp new file mode 100644 index 00000000000..4744b73b882 --- /dev/null +++ b/src/common/snippets/src/pass/tokenization.cpp @@ -0,0 +1,72 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/pass/tokenization.hpp" +#include "snippets/pass/common_optimizations.hpp" + + +namespace ngraph { +namespace snippets { +namespace pass { + +void SetSnippetsNodeType(const std::shared_ptr &node, SnippetsNodeType nodeType) { + auto &rt = node->get_rt_info(); + rt["SnippetsNodeType"] = nodeType; +} + +SnippetsNodeType GetSnippetsNodeType(const std::shared_ptr &node) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::GetSnippetsNodeType") + auto &rt = node->get_rt_info(); + const auto rinfo = rt.find("SnippetsNodeType"); + if (rinfo == rt.end()) + return SnippetsNodeType::NotSet; + return rinfo->second.as(); +} + +void SetTopologicalOrder(const std::shared_ptr &node, int64_t order) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::SetTopologicalOrder") + auto &rt = node->get_rt_info(); 
+ rt["TopologicalOrder"] = order; +} + +int64_t GetTopologicalOrder(const std::shared_ptr &node) { + auto &rt = node->get_rt_info(); + const auto rinfo = rt.find("TopologicalOrder"); + if (rinfo == rt.end()) + throw ngraph_error("Topological order is required, but not set."); + return rinfo->second.as(); +} + +bool EnumerateNodes::run_on_model(const std::shared_ptr &m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::EnumerateNodes") + int64_t order = 0; + // Todo: We don't really have to set order for every node, just for subgraph parents and children would be enough + for (auto &node : m->get_ordered_ops()) { + SetTopologicalOrder(node, order++); + } + return true; +} + + +bool SnippetsTokenization::run_on_model(const std::shared_ptr& m) { + RUN_ON_FUNCTION_SCOPE(SnippetsTokenization); + ngraph::pass::Manager manager(get_pass_config()); + manager.set_per_pass_validation(false); + + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); + manager.run_passes(m); + + // Returning value is false because pass::Manager always apply Validation pass if function was changed. 
+ // But we don't need to validate the model + return false; +} + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/src/pass/transpose_decomposition.cpp b/src/common/snippets/src/pass/transpose_decomposition.cpp new file mode 100644 index 00000000000..5dc6960b2fd --- /dev/null +++ b/src/common/snippets/src/pass/transpose_decomposition.cpp @@ -0,0 +1,81 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +const std::set> ngraph::snippets::pass::TransposeDecomposition::supported_cases = {{0, 2, 3, 1}}; +ngraph::snippets::pass::TransposeDecomposition::TransposeDecomposition() { + MATCHER_SCOPE(TransposeDecomposition); + // todo: we need a special transformation that detects and propagates data access pattern to Parameters and Results + // this is needed to communicate access pattern to the plugin node and op::Kernel + // This is the reason we match only to Parameter, this limitation could be relaxed if we propagate access pattern + // to the appropriate parameter + auto match_data = ngraph::pattern::wrap_type(); + auto match_order = ngraph::pattern::wrap_type(); + auto match_transpose = ngraph::pattern::wrap_type({match_data, match_order}); + + ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher& m) { + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::TransposeDecomposition") + auto& pattern_to_output = m.get_pattern_value_map(); + const auto transpose = ov::as_type_ptr( + pattern_to_output.at(match_transpose).get_node_shared_ptr()); + + const auto order = ov::as_type_ptr(pattern_to_output.at(match_order).get_node_shared_ptr()); + if (transformation_callback(transpose) || transpose->is_dynamic()) + return false; + + auto order_value = order->cast_vector(); + if (supported_cases.count(order_value) == 0) + return 
false; + + auto data_input = pattern_to_output.at(match_data); + const auto& data_node = pattern_to_output.at(match_data).get_node_shared_ptr(); + auto ¶m_rt = data_node->get_rt_info(); + // Note: store and usage inside emitters as size_t is more convenient, so static_cast here + const auto& access_pattern = order->cast_vector(); + param_rt["Layout"] = access_pattern; + + // The line below is Ok, since we ensured that transpose is static above + auto data_shape = data_input.get_shape(); + // dim indexes with respect to SRC + const auto dim_C_idx = data_shape.size() - 3; + const auto dim_H_idx = data_shape.size() - 2; + const auto dim_W_idx = data_shape.size() - 1; + const auto size_C = static_cast(data_shape[dim_C_idx]); + const auto size_W = static_cast(data_shape[dim_W_idx]); + const auto size_H = static_cast(data_shape[dim_H_idx]); + + auto loop_W_begin = std::make_shared(OutputVector{data_input}); + auto loop_C_begin = std::make_shared(OutputVector{loop_W_begin->output(0)}); + // todo: LoadReshape used here is essentially Load + an easy way to maintain correct shape propagation + // fix this in future and develop a more consistent shape propagation approach. 
+ auto load = std::make_shared(loop_C_begin->output(0), 1, 0, access_pattern); + auto store = std::make_shared(load, 1); + const std::vector ptr_increments_C {size_H * size_W, 1}; + const std::vector finalization_offsets_C {1 - size_H * size_W * size_C, 0}; + auto loop_C_end = std::make_shared(OutputVector{store->output(0), loop_C_begin->output(1)}, + size_C, 1, ptr_increments_C, finalization_offsets_C); + auto loop_W_end = std::make_shared(OutputVector{loop_C_end->output(0), loop_W_begin->output(1)}, + size_W, 1, std::vector{0, 0}, std::vector{0, 0}); + + for (auto& input : transpose->output(0).get_target_inputs()) { + input.replace_source_output(loop_W_end->output(0)); + } + + return true; + }; + + auto m = std::make_shared(match_transpose, matcher_name); + register_matcher(m, callback); +} diff --git a/src/common/snippets/src/utils.cpp b/src/common/snippets/src/utils.cpp index 9c38a17b2d5..11adf0fe954 100644 --- a/src/common/snippets/src/utils.cpp +++ b/src/common/snippets/src/utils.cpp @@ -6,8 +6,11 @@ #include "snippets/pass/fq_decomposition.hpp" +namespace ngraph { +namespace snippets { +namespace utils { -auto ngraph::snippets::utils::get_non_scalar_constant_count_for_fq(const std::shared_ptr& fq) -> size_t { +auto get_non_scalar_constant_count_for_fq(const std::shared_ptr& fq) -> size_t { std::vector out_scales; std::vector cl, ch, isc, ish, osc, osh; const bool status = ngraph::snippets::pass::FakeQuantizeDecomposition::getScalesAndShifts(fq, cl, ch, isc, ish, osc, osh); @@ -55,3 +58,54 @@ auto ngraph::snippets::utils::get_non_scalar_constant_count_for_fq(const std::sh return 1; return 0; } +std::vector get_node_output_layout(const std::shared_ptr& node) { + return get_node_output_layout(node.get()); +} +std::vector get_node_output_layout(const Node* node) { + if (!node) + return {}; + if (node->is_dynamic()) + throw ngraph_error("It's illegal to call get_node_output_layout for dynamic nodes"); + auto &rt = node->get_rt_info(); + const auto rinfo = 
rt.find("Layout"); + if (rinfo != rt.end()) { + std::vector layout(rinfo->second.as>()); + // This might be a little costy, but still useful sanity check. Remove if proved to be unacceptably heavy. + std::set unique_elements(layout.begin(), layout.end()); + if (unique_elements.size() < layout.size()) + throw ngraph_error("Layout must contain only unique dimension indexes"); + return layout; + } else { + return {}; + } +} + +ov::PartialShape get_reordered_planar_shape(const ov::PartialShape& shape, const std::vector& layout) { + if (layout.empty()) + return shape; + std::vector reordered_shape(layout.size()); + if (shape.rank().is_dynamic()) + throw ngraph_error("get_reordered_planar_shape can't be called for outputs with dynamic rank"); + const size_t rank = shape.rank().get_length(); + if (layout.size() > rank) + throw ngraph_error("Layout rank can't be larger than tensor rank"); + // Note that it can be smaller though, for example tensor shape can be prepended with 1 for scheduling purposes + if (std::any_of(layout.begin(), layout.end(), [=](size_t x) {return x >= rank;})) + throw ngraph_error("Invalid layout detected: all layout indexes must be smaller than the tensor rank"); + for (int i = 0; i < layout.size(); i++) + reordered_shape[i] = shape[layout[i]]; + return reordered_shape; +} + +ov::PartialShape get_port_planar_shape(const Output& out) { + std::vector layout = get_node_output_layout(out.get_node_shared_ptr()); + const auto& tensor = out.get_tensor_ptr(); + if (!tensor) + throw ngraph_error("get_port_planar_shape can't be called for an uninitialized output tensor"); + auto tensor_shape = tensor->get_partial_shape(); + return get_reordered_planar_shape(tensor_shape, layout); +} + +} // namespace utils +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/tests/include/lowering_utils.hpp b/src/common/snippets/tests/include/lowering_utils.hpp index 8e86d321e7e..c629b1c13f5 100644 --- 
a/src/common/snippets/tests/include/lowering_utils.hpp +++ b/src/common/snippets/tests/include/lowering_utils.hpp @@ -40,10 +40,16 @@ public: class LoweringTests : public TransformationTestsF { public: LoweringTests(); + + void SetUp() override; + void TearDown() override; + protected: static std::shared_ptr getSubgraph(const std::shared_ptr& f); - static std::shared_ptr getLoweredSubgraph(const std::shared_ptr& f); + static std::shared_ptr getLoweredSubgraph(const std::shared_ptr& f, + const ov::PartialShape& master_shape); static std::shared_ptr getTokenizedSubgraph(const std::shared_ptr& f); + ov::PartialShape master_shape{}; }; } // namespace snippets diff --git a/src/common/snippets/tests/include/pass/broadcast_to_movebroadcast.hpp b/src/common/snippets/tests/include/pass/broadcast_to_movebroadcast.hpp new file mode 100644 index 00000000000..15a1f5a9846 --- /dev/null +++ b/src/common/snippets/tests/include/pass/broadcast_to_movebroadcast.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "lowering_utils.hpp" +#include "snippets_helpers.hpp" + +namespace ov { +namespace test { +namespace snippets { +typedef std::tuple< + Shape, // Input shape 0 + Shape, // Input shape 1 + Shape // Broadcast shape +> BroadcastParams; + +class BroadcastToMoveBroadcastTests : public LoweringTests, public testing::WithParamInterface { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); +protected: + void SetUp() override; + std::shared_ptr snippets_function; +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/common/snippets/tests/include/pass/fuse_transpose_brgemm.hpp b/src/common/snippets/tests/include/pass/fuse_transpose_brgemm.hpp new file mode 100644 index 00000000000..8b886ef9876 --- /dev/null +++ b/src/common/snippets/tests/include/pass/fuse_transpose_brgemm.hpp @@ -0,0 +1,33 @@ +// Copyright (C) 2022 Intel Corporation +// 
SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "lowering_utils.hpp" +#include "snippets_helpers.hpp" + +/* The main purpose is to test that FuseTransposeBrgemm properly fuses 0213 Transposes on both inputs, as well as on output + */ + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + std::vector, // Input shapes + PartialShape, // Master shape + size_t // Transpose position +> fuseTransposeBrgemmParams; + +class FuseTransposeBrgemmTests : public LoweringTests, public testing::WithParamInterface { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); +protected: + void SetUp() override; + std::shared_ptr snippets_function; +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/common/snippets/tests/include/pass/mha_tokenization.hpp b/src/common/snippets/tests/include/pass/mha_tokenization.hpp new file mode 100644 index 00000000000..60e06d591ca --- /dev/null +++ b/src/common/snippets/tests/include/pass/mha_tokenization.hpp @@ -0,0 +1,20 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ov { +namespace test { +namespace snippets { + +class TokenizeMHASnippetsTests : public TransformationTestsF { +public: + virtual void run(); +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/common/snippets/tests/include/pass/softmax_decomposition.hpp b/src/common/snippets/tests/include/pass/softmax_decomposition.hpp new file mode 100644 index 00000000000..3943bd641bf --- /dev/null +++ b/src/common/snippets/tests/include/pass/softmax_decomposition.hpp @@ -0,0 +1,43 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "lowering_utils.hpp" +#include "snippets_helpers.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + Shape, // Input shape 0 + int // Axis +> 
SoftmaxParams; + +typedef std::tuple< + Shape, // Input shape 0 + Shape, // Input shape 1 + int // Axis +> AddSoftmaxParams; + +class SoftmaxTests : public LoweringTests, public testing::WithParamInterface { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); +protected: + void SetUp() override; + std::shared_ptr snippets_function; +}; + +class AddSoftmaxTests : public LoweringTests, public testing::WithParamInterface { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); +protected: + void SetUp() override; + std::shared_ptr snippets_function; +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/common/snippets/tests/src/lowering_utils.cpp b/src/common/snippets/tests/src/lowering_utils.cpp index de46de861ca..a07fb4c0884 100644 --- a/src/common/snippets/tests/src/lowering_utils.cpp +++ b/src/common/snippets/tests/src/lowering_utils.cpp @@ -4,7 +4,7 @@ #include #include "lowering_utils.hpp" -#include "snippets/pass/collapse_subgraph.hpp" +#include "snippets/pass/tokenization.hpp" namespace ov { @@ -21,7 +21,12 @@ DummyTargetMachine::DummyTargetMachine() { jitters[op::v1::Add::get_type_info_static()] = dummy_functor; jitters[op::v1::Subtract::get_type_info_static()] = dummy_functor; jitters[op::v1::Multiply::get_type_info_static()] = dummy_functor; - jitters[op::v1::Multiply::get_type_info_static()] = dummy_functor; + jitters[op::v1::Divide::get_type_info_static()] = dummy_functor; + jitters[op::v1::Maximum::get_type_info_static()] = dummy_functor; + jitters[op::v0::Exp::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::PowerStatic::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::HorizonMax::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::HorizonSum::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::Load::get_type_info_static()] = dummy_functor; 
jitters[ngraph::snippets::op::BroadcastLoad::get_type_info_static()] = dummy_functor; @@ -30,8 +35,12 @@ DummyTargetMachine::DummyTargetMachine() { jitters[ngraph::snippets::op::Scalar::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::BroadcastMove::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = dummy_functor; - jitters[ngraph::snippets::op::Tile::get_type_info_static()] = dummy_functor; - jitters[ngraph::snippets::op::TileScheduler::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::LoopBegin::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::LoopEnd::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::Brgemm::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::Buffer::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::VectorBuffer::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::Fill::get_type_info_static()] = dummy_functor; } LoweringTests::LoweringTests() : TransformationTestsF() { @@ -41,6 +50,29 @@ LoweringTests::LoweringTests() : TransformationTestsF() { comparator.disable(FunctionsComparator::CmpValues::SUBGRAPH_DESCRIPTORS); } +void LoweringTests::SetUp() { + manager.register_pass(); +} + +void LoweringTests::TearDown() { + auto cloned_function = ngraph::clone_function(*function); + if (!function_ref) { + function_ref = cloned_function; + } + manager.run_passes(function); + ASSERT_NO_THROW(check_rt_info(function)); + + if (comparator.should_compare(FunctionsComparator::ACCURACY)) { + auto acc_comparator = FunctionsComparator::no_default(); + acc_comparator.enable(FunctionsComparator::CmpValues::ACCURACY); + auto res = acc_comparator.compare(function, cloned_function); + ASSERT_TRUE(res.valid) << res.message; + comparator.disable(FunctionsComparator::CmpValues::ACCURACY); + } + auto res = comparator.compare(function, function_ref); + 
ASSERT_TRUE(res.valid) << res.message; +} + std::shared_ptr LoweringTests::getSubgraph(const std::shared_ptr& f) { std::shared_ptr subgraph; for (const auto &op : f->get_ops()) { @@ -59,9 +91,30 @@ std::shared_ptr LoweringTests::getSubgraph(const return subgraph; } -std::shared_ptr LoweringTests::getLoweredSubgraph(const std::shared_ptr &f) { +std::shared_ptr LoweringTests::getLoweredSubgraph(const std::shared_ptr &f, + const ov::PartialShape& master_shape) { auto subgraph = getTokenizedSubgraph(f); subgraph->set_generator(std::make_shared()); + subgraph->set_master_shape(master_shape); + const auto& body = subgraph->body_ptr(); + auto& body_rt_info = body->get_rt_info(); + // todo: insertLoops pass requires body_rt_info["PluginShapesOverride"] and subgraph->set_tile_rank to work normally + // consider revising snippets-plugin shape and scheduling communication + std::vector> new_shapes; + for (const auto& p : body->get_parameters()) { + const auto pshape = p->get_output_partial_shape(0); + if (pshape.is_dynamic()) + IE_THROW() << "getLoweredSubgraph supports only static shapes"; + new_shapes.push_back(pshape.get_shape()); + } + for (const auto& r : body->get_results()) { + const auto pshape = r->get_input_partial_shape(0); + if (pshape.is_dynamic()) + IE_THROW() << "getLoweredSubgraph supports only static shapes"; + new_shapes.push_back(pshape.get_shape()); + } + body_rt_info["PluginShapesOverride"] = new_shapes; + subgraph->set_tile_rank(2); subgraph->generate(); return subgraph; } diff --git a/src/common/snippets/tests/src/pass/broadcast_to_movebroadcast.cpp b/src/common/snippets/tests/src/pass/broadcast_to_movebroadcast.cpp new file mode 100644 index 00000000000..eec9fddf0f4 --- /dev/null +++ b/src/common/snippets/tests/src/pass/broadcast_to_movebroadcast.cpp @@ -0,0 +1,59 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "pass/broadcast_to_movebroadcast.hpp" +#include 
"common_test_utils/common_utils.hpp" +#include + +namespace ov { +namespace test { +namespace snippets { + + +std::string BroadcastToMoveBroadcastTests::getTestCaseName(testing::TestParamInfo obj) { + std::vector inputShapes(2); + Shape broadcast_shape; + std::tie(inputShapes[0], inputShapes[1], broadcast_shape) = obj.param; + std::ostringstream result; + for (size_t i = 0; i < inputShapes.size(); i++) + result << "IS[" << i << "]=" << CommonTestUtils::vec2str(inputShapes[i]) << "_"; + result << "BS=" << CommonTestUtils::vec2str(broadcast_shape) << "_"; + return result.str(); +} + +void BroadcastToMoveBroadcastTests::SetUp() { + TransformationTestsF::SetUp(); + std::vector inputShapes(2); + PartialShape broadcast_shape; + std::tie(inputShapes[0], inputShapes[1], broadcast_shape) = this->GetParam(); + snippets_function = std::make_shared(inputShapes, broadcast_shape); + master_shape = {}; + for (int i = 0; i < inputShapes[0].size(); i++) + master_shape.push_back(static_cast(std::max(inputShapes[0].get_shape()[i], inputShapes[1].get_shape()[i]))); +} + +TEST_P(BroadcastToMoveBroadcastTests, BroadcastSelect) { + PartialShape scheduler_shape({master_shape[master_shape.size() - 2], + master_shape[master_shape.size() - 1]}); + auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), scheduler_shape); + function = subgraph->body_ptr(); + function_ref = snippets_function->getLowered(); +} + +namespace BroadcastToMoveBroadcastTestsInstantiation { +using ov::Shape; +std::vector inputShapes0 {{1, 8, 2, 10}, {1, 8, 2, 1}, {1, 1, 1, 1}}; +std::vector inputShapes1 {{1, 8, 2, 10}, {1, 8, 2, 1}, {1, 1, 1, 1}}; +Shape broadcastShape {1, 8, 2, 10}; +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Broadcast, BroadcastToMoveBroadcastTests, + ::testing::Combine( + ::testing::ValuesIn(inputShapes0), + ::testing::ValuesIn(inputShapes1), + ::testing::Values(broadcastShape)), + BroadcastToMoveBroadcastTests::getTestCaseName); +} // namespace BroadcastToMoveBroadcastTestsInstantiation 
+} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/common/snippets/tests/src/pass/canonicalization.cpp b/src/common/snippets/tests/src/pass/canonicalization.cpp index 15c33e6df96..7b687bad226 100644 --- a/src/common/snippets/tests/src/pass/canonicalization.cpp +++ b/src/common/snippets/tests/src/pass/canonicalization.cpp @@ -23,12 +23,12 @@ std::string CanonicalizationTests::getTestCaseName(testing::TestParamInfoGetParam(); input_blocked_shapes = {std::get<1>(inputs[0]), std::get<1>(inputs[1])}; - snippets_function = std::make_shared(std::vector{std::get<0>(inputs[0]), std::get<0>(inputs[1])}); + snippets_function = std::make_shared(std::vector{std::get<0>(inputs[0]), std::get<0>(inputs[1])}); } TEST_P(CanonicalizationTests, Add) { @@ -50,8 +50,9 @@ TEST_P(CanonicalizationTests, Add) { function_ref = snippets_function->getReference(); auto subgraph = getTokenizedSubgraph(function); subgraph->set_generator(std::make_shared()); - Shape canonical_output_shape = subgraph->canonicalize(output_blocked_shapes, input_blocked_shapes); - ASSERT_DIMS_EQ(canonical_output_shape, expected_output_shape); + auto canonical_output_shape = subgraph->canonicalize(output_blocked_shapes, input_blocked_shapes); + ASSERT_TRUE(canonical_output_shape.is_static()); + ASSERT_DIMS_EQ(canonical_output_shape.get_shape(), expected_output_shape); } namespace CanonicalizationTestsInstantiation { diff --git a/src/common/snippets/tests/src/pass/collapse_subgraph.cpp b/src/common/snippets/tests/src/pass/collapse_subgraph.cpp index f5be10838d6..cc4394c5ad3 100644 --- a/src/common/snippets/tests/src/pass/collapse_subgraph.cpp +++ b/src/common/snippets/tests/src/pass/collapse_subgraph.cpp @@ -6,7 +6,7 @@ #include #include #include -#include "snippets/pass/collapse_subgraph.hpp" +#include "snippets/pass/tokenization.hpp" namespace ov { namespace test { @@ -17,59 +17,64 @@ void CollapseSubgraphTests::run() { std::string name; 
manager.register_pass(); manager.register_pass(); + // todo: This is a temporary work-around. remove when MatMul tokenization is supported through general pipeline + manager.get_pass_config()->set_callback( + [](const std::shared_ptr& n) -> bool { + return ov::is_type(n); + }); } TEST_F(CollapseSubgraphTests, smoke_Snippets_Eltwise) { - const auto &f = EltwiseFunction(std::vector {{2, 3}, {1, 3}}); + const auto &f = EltwiseFunction(std::vector {{2, 3}, {1, 3}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_MatMulWithEltwise) { - const auto &f = MatMulEltwiseBranchesFunction(std::vector {{1, 3, 4, 4}, {1, 3, 4, 4}}); + const auto &f = MatMulEltwiseBranchesFunction(std::vector {{1, 3, 4, 4}, {1, 3, 4, 4}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_AvoidLoopEltwise) { - const auto &f = EltwiseLogLoopFunction(std::vector {{2, 5}, {2, 1}}); + const auto &f = EltwiseLogLoopFunction(std::vector {{2, 5}, {2, 1}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_OneConvert) { - const auto &f = ConvertFunction(std::vector{{2, 5}}); + const auto &f = ConvertFunction(std::vector{{2, 5}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertInput) { - const auto &f = ConvertInputFunction(std::vector{{2, 5}, {1, 5}}); + const auto &f = ConvertInputFunction(std::vector{{2, 5}, {1, 5}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertOutput) { - const auto &f = ConvertOutputFunction(std::vector{{2, 5}, {1, 5}}); + const auto &f = ConvertOutputFunction(std::vector{{2, 5}, {1, 5}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertStub) { - const auto &f = 
ConvertStubFunction(std::vector{{2, 5, 2}, {1, 5, 1}}); + const auto &f = ConvertStubFunction(std::vector{{2, 5, 2}, {1, 5, 1}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertPartialInputsAndResults) { - const auto &f = ConvertPartialInputsAndResultsFunction(std::vector{{2, 5, 1}, {1, 5, 1}, {2, 1, 10}}, + const auto &f = ConvertPartialInputsAndResultsFunction(std::vector{{2, 5, 1}, {1, 5, 1}, {2, 1, 10}}, std::vector{ov::element::i8, ov::element::bf16, ov::element::f32}, std::vector{ov::element::f32, ov::element::i8}); function = f.getOriginal(); @@ -78,7 +83,7 @@ TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertPartialInputsAndResults) { } TEST_F(CollapseSubgraphTests, smoke_Snippets_EltwiseTwoResultsFunction) { - const auto &f = EltwiseTwoResultsFunction(std::vector{{2, 5}, {2, 1}}); + const auto &f = EltwiseTwoResultsFunction(std::vector{{2, 5}, {2, 1}}); function = f.getOriginal(); function_ref = f.getReference(); comparator.enable(FunctionsComparator::CmpValues::NAMES); diff --git a/src/common/snippets/tests/src/pass/fuse_transpose_brgemm.cpp b/src/common/snippets/tests/src/pass/fuse_transpose_brgemm.cpp new file mode 100644 index 00000000000..22936ca0c62 --- /dev/null +++ b/src/common/snippets/tests/src/pass/fuse_transpose_brgemm.cpp @@ -0,0 +1,58 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "pass/fuse_transpose_brgemm.hpp" +#include "common_test_utils/common_utils.hpp" +#include "subgraph_matmul.hpp" +#include "subgraph_lowered.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::string FuseTransposeBrgemmTests::getTestCaseName(testing::TestParamInfo obj) { + std::vector input_shapes(2); + PartialShape master_shape; + size_t transpose_position; + std::tie(input_shapes, master_shape, transpose_position) = obj.param; + std::ostringstream result; + result << "IS[0]=" << 
CommonTestUtils::partialShape2str({input_shapes[0]}) << "_"; + result << "IS[1]=" << CommonTestUtils::partialShape2str({input_shapes[1]}) << "_"; + result << "MS=" << CommonTestUtils::partialShape2str({master_shape}) << "_"; + result << "Pos=" << transpose_position << "_"; + return result.str(); +} + +void FuseTransposeBrgemmTests::SetUp() { + LoweringTests::SetUp(); + std::vector input_shapes(2); + size_t transpose_position; + std::tie(input_shapes, master_shape, transpose_position) = this->GetParam(); + + snippets_function = std::make_shared(input_shapes, transpose_position); +} + +TEST_P(FuseTransposeBrgemmTests, FuseTransposeMatmul) { + auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), master_shape); + function = subgraph->body_ptr(); + function_ref = snippets_function->getLowered(); +} + +namespace FuseTransposeBrgemmTestsInstantiation { +using ov::Shape; +std::vector test_params{ + {{{1, 49, 2, 23}, {2, 2, 23, 39}}, {2, 2, 49, 23}, 0}, + {{{1, 2, 49, 23}, {2, 23, 1, 39}}, {2, 2, 49, 39}, 1}, + {{{1, 2, 49, 23}, {2, 2, 23, 39}}, {2, 2, 49, 39}, 2}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_FuseTransposeMatMul, FuseTransposeBrgemmTests, + ::testing::ValuesIn(test_params), + FuseTransposeBrgemmTests::getTestCaseName); + +} // namespace FuseTransposeBrgemmTestsInstantiation +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/common/snippets/tests/src/pass/insert_load_store.cpp b/src/common/snippets/tests/src/pass/insert_load_store.cpp index 1a2fa5a75fc..929697852cb 100644 --- a/src/common/snippets/tests/src/pass/insert_load_store.cpp +++ b/src/common/snippets/tests/src/pass/insert_load_store.cpp @@ -25,16 +25,18 @@ std::string InsertLoadStoreTests::getTestCaseName(testing::TestParamInfo inputShapes(3); std::vector broadcastShapes(3); std::tie(inputShapes[0], inputShapes[1], inputShapes[2], broadcastShapes[0], broadcastShapes[1], broadcastShapes[2]) = this->GetParam(); - 
snippets_function = std::make_shared(inputShapes, broadcastShapes); + snippets_function = std::make_shared( + std::vector {inputShapes[0], inputShapes[1], inputShapes[2]}, broadcastShapes); + master_shape = inputShapes[0]; } TEST_P(InsertLoadStoreTests, ThreeInputsEltwise) { - auto subgraph = getLoweredSubgraph(snippets_function->getOriginal()); + auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), master_shape); function = subgraph->body_ptr(); function_ref = snippets_function->getLowered(); } diff --git a/src/common/snippets/tests/src/pass/insert_movebroadcast.cpp b/src/common/snippets/tests/src/pass/insert_movebroadcast.cpp index f97b8019239..669cb34bc30 100644 --- a/src/common/snippets/tests/src/pass/insert_movebroadcast.cpp +++ b/src/common/snippets/tests/src/pass/insert_movebroadcast.cpp @@ -24,15 +24,22 @@ std::string InsertMoveBroadcastTests::getTestCaseName(testing::TestParamInfo inputShapes(2); std::vector broadcastShapes(2); std::tie(inputShapes[0], inputShapes[1], broadcastShapes[0], broadcastShapes[1]) = this->GetParam(); - snippets_function = std::make_shared(inputShapes, broadcastShapes); + snippets_function = std::make_shared(std::vector {inputShapes[0], inputShapes[1]}, broadcastShapes); + if (inputShapes[0].size() != inputShapes[1].size()) + IE_THROW() << "Expected input shapes of the same size"; + master_shape = {}; + for (int i = 0; i < inputShapes[0].size(); i++) + master_shape.push_back(static_cast(std::max(inputShapes[0][i], inputShapes[1][i]))); } TEST_P(InsertMoveBroadcastTests, AddBroadcast) { - auto subgraph = getLoweredSubgraph(snippets_function->getOriginal()); + PartialShape scheduler_shape({master_shape[master_shape.size() - 2], + master_shape[master_shape.size() - 1]}); + auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), scheduler_shape); function = subgraph->body_ptr(); function_ref = snippets_function->getLowered(); } diff --git a/src/common/snippets/tests/src/pass/merge_loops.cpp 
b/src/common/snippets/tests/src/pass/merge_loops.cpp new file mode 100644 index 00000000000..be398f2107f --- /dev/null +++ b/src/common/snippets/tests/src/pass/merge_loops.cpp @@ -0,0 +1,169 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include + +#include +#include + +#include + +#include "common_test_utils/ngraph_test_utils.hpp" + +using namespace testing; +using namespace ngraph; + +TEST(TransformationTests, UnaryEltwisesLoops) { + std::shared_ptr f(nullptr), f_ref(nullptr); + auto shape = Shape{2, 3, 240}; + const size_t vector_size = 16; + const std::vector inner_ptr_increments(2, vector_size); + const std::vector inner_finalization_offsets(2, 0); + { + auto data = std::make_shared(element::f32, shape); + + auto outer_loop_begin_up = std::make_shared(OutputVector{data}); + auto inner_loop_begin_up = std::make_shared(OutputVector{outer_loop_begin_up}); + auto load_up = std::make_shared(inner_loop_begin_up->output(0)); + auto relu = std::make_shared(load_up); + auto store_up = std::make_shared(relu); + auto inner_loop_end_up = std::make_shared( + OutputVector{store_up, inner_loop_begin_up->output(1)}, shape[shape.size() - 1], vector_size, + inner_ptr_increments, inner_finalization_offsets); + auto outer_loop_end_up = std::make_shared( + OutputVector{inner_loop_end_up->output(0), outer_loop_begin_up->output(1)}, shape[shape.size() - 2], 1, + std::vector{0, 0}, std::vector{0, 0}); + + auto buffer = std::make_shared(outer_loop_end_up); + + auto outer_loop_begin_down = std::make_shared(OutputVector{buffer}); + auto inner_loop_begin_down = std::make_shared(OutputVector{outer_loop_begin_down}); + auto load_down = std::make_shared(inner_loop_begin_down->output(0)); + auto hswish = std::make_shared(load_down); + auto store_down = std::make_shared(hswish); + auto inner_loop_end_down = std::make_shared( + OutputVector{store_down, inner_loop_begin_down->output(1)}, shape[shape.size() - 1], 
vector_size, + inner_ptr_increments, inner_finalization_offsets); + auto outer_loop_end_down = std::make_shared( + OutputVector{inner_loop_end_down->output(0), outer_loop_begin_down->output(1)}, shape[shape.size() - 2], 1, + std::vector{0, 0}, std::vector{0, 0}); + + f = std::make_shared(OutputVector{outer_loop_end_down->output(0)}, ParameterVector{data}); + + pass::Manager m; + m.register_pass(); + m.register_pass(); + m.run_passes(f); + } + { + auto data = std::make_shared(element::f32, shape); + + auto outer_loop_begin = std::make_shared(OutputVector{data}); + auto inner_loop_begin = std::make_shared(OutputVector{outer_loop_begin}); + auto load = std::make_shared(inner_loop_begin->output(0)); + auto relu = std::make_shared(load); + auto hswish = std::make_shared(relu); + auto store = std::make_shared(hswish); + auto inner_loop_end = std::make_shared( + OutputVector{store, inner_loop_begin->output(1)}, shape[shape.size() - 1], vector_size, + inner_ptr_increments, inner_finalization_offsets); + auto outer_loop_end = std::make_shared( + OutputVector{inner_loop_end->output(0), outer_loop_begin->output(1)}, shape[shape.size() - 2], 1, + std::vector{0, 0}, std::vector{0, 0}); + + f_ref = std::make_shared(OutputVector{outer_loop_end->output(0)}, ParameterVector{data}); + } + + auto res = compare_functions(f, f_ref); + ASSERT_TRUE(res.first) << res.second; +} + +TEST(TransformationTests, BinaryEltwisesLoops) { + std::shared_ptr f(nullptr), f_ref(nullptr); + auto shape = Shape{2, 3, 240}; + const size_t vector_size = 16; + { + const std::vector inner_ptr_increments(3, vector_size); + const std::vector inner_finalization_offsets(3, 0); + + auto data0 = std::make_shared(element::f32, shape); + auto data1 = std::make_shared(element::f32, shape); + + auto outer_loop_begin_up = std::make_shared(OutputVector{data0, data1}); + auto inner_loop_begin_up = std::make_shared(OutputVector{outer_loop_begin_up->output(0), + outer_loop_begin_up->output(1)}); + auto load0_up = 
std::make_shared(inner_loop_begin_up->output(0)); + auto load1_up = std::make_shared(inner_loop_begin_up->output(1)); + auto add = std::make_shared(load0_up, load1_up); + auto relu = std::make_shared(add); + auto store_up = std::make_shared(relu); + auto inner_loop_end_up = std::make_shared( + OutputVector{store_up, inner_loop_begin_up->output(2)}, shape[shape.size() - 1], vector_size, + inner_ptr_increments, inner_finalization_offsets); + auto outer_loop_end_up = std::make_shared( + OutputVector{inner_loop_end_up->output(0), outer_loop_begin_up->output(2)}, shape[shape.size() - 2], 1, + std::vector{0, 0, 0}, std::vector{0, 0, 0}); + + auto buffer = std::make_shared(outer_loop_end_up); + + auto data2 = std::make_shared(element::f32, shape); + + auto outer_loop_begin_down = std::make_shared(OutputVector{buffer, data2}); + auto inner_loop_begin_down = std::make_shared(OutputVector{outer_loop_begin_down->output(0), + outer_loop_begin_down->output(1)}); + auto load0_down = std::make_shared(inner_loop_begin_down->output(0)); + auto load1_down = std::make_shared(inner_loop_begin_down->output(1)); + auto mul = std::make_shared(load0_down, load1_down); + auto hswish = std::make_shared(mul); + auto store_down = std::make_shared(hswish); + auto inner_loop_end_down = std::make_shared( + OutputVector{store_down, inner_loop_begin_down->output(2)}, shape[shape.size() - 1], vector_size, + inner_ptr_increments, inner_finalization_offsets); + auto outer_loop_end_down = std::make_shared( + OutputVector{inner_loop_end_down->output(0), outer_loop_begin_down->output(2)}, shape[shape.size() - 2], 1, + std::vector{0, 0, 0}, std::vector{0, 0, 0}); + + f = std::make_shared(OutputVector{outer_loop_end_down->output(0)}, ParameterVector{data0, data1, data2}); + + pass::Manager m; + m.register_pass(); + m.register_pass(); + m.run_passes(f); + } + { + const std::vector inner_ptr_increments(4, vector_size); + const std::vector inner_finalization_offsets(4, 0); + + auto data0 = 
std::make_shared(element::f32, shape); + auto data1 = std::make_shared(element::f32, shape); + auto data2 = std::make_shared(element::f32, shape); + + auto outer_loop_begin = std::make_shared(OutputVector{data0, data1, data2}); + auto inner_loop_begin = std::make_shared(OutputVector{outer_loop_begin->output(0), + outer_loop_begin->output(1), + outer_loop_begin->output(2)}); + auto load0 = std::make_shared(inner_loop_begin->output(0)); + auto load1 = std::make_shared(inner_loop_begin->output(1)); + auto load2 = std::make_shared(inner_loop_begin->output(2)); + auto add = std::make_shared(load0, load1); + auto relu = std::make_shared(add); + auto mul = std::make_shared(relu, load2); + auto hswish = std::make_shared(mul); + auto store = std::make_shared(hswish); + auto inner_loop_end = std::make_shared( + OutputVector{store, inner_loop_begin->output(3)}, shape[shape.size() - 1], vector_size, + inner_ptr_increments, inner_finalization_offsets); + auto outer_loop_end = std::make_shared( + OutputVector{inner_loop_end->output(0), outer_loop_begin->output(3)}, shape[shape.size() - 2], 1, + std::vector{0, 0, 0, 0}, std::vector{0, 0, 0, 0}); + + f_ref = std::make_shared(OutputVector{outer_loop_end->output(0)}, ParameterVector{data0, data1, data2}); + } + + auto res = compare_functions(f, f_ref); + ASSERT_TRUE(res.first) << res.second; +} diff --git a/src/common/snippets/tests/src/pass/mha_tokenization.cpp b/src/common/snippets/tests/src/pass/mha_tokenization.cpp new file mode 100644 index 00000000000..4c3d967be5f --- /dev/null +++ b/src/common/snippets/tests/src/pass/mha_tokenization.cpp @@ -0,0 +1,38 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include "snippets/pass/tokenization.hpp" +#include "snippets/pass/explicit_transpose_matmul_inputs.hpp" + +namespace ov { +namespace test { +namespace snippets { + +void TokenizeMHASnippetsTests::run() { + ASSERT_TRUE(function); + std::string name; + 
manager.register_pass(); + manager.register_pass(); +} + +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA) { + const auto &f = MHAFunction(std::vector{{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}}); + function = f.getOriginal(); + function_ref = f.getReference(); + run(); +} + +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_with_MatMul0_Transpose) { + const auto &f = MHAMatMul0TransposeFunction(std::vector{{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}}); + function = f.getOriginal(); + function_ref = f.getReference(); + run(); +} + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/common/snippets/tests/src/pass/softmax_decomposition.cpp b/src/common/snippets/tests/src/pass/softmax_decomposition.cpp new file mode 100644 index 00000000000..e3330bd69de --- /dev/null +++ b/src/common/snippets/tests/src/pass/softmax_decomposition.cpp @@ -0,0 +1,122 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "pass/softmax_decomposition.hpp" +#include "common_test_utils/common_utils.hpp" +#include "subgraph_softmax.hpp" +#include "subgraph_lowered.hpp" + +#include "snippets/pass/softmax_decomposition.hpp" +#include "snippets/pass/insert_load_store.hpp" +#include "snippets/pass/insert_movebroadcast.hpp" +#include "snippets/pass/insert_buffer.hpp" +#include "snippets/pass/convert_power_to_powerstatic.hpp" + + +namespace ov { +namespace test { +namespace snippets { + +std::string SoftmaxTests::getTestCaseName(testing::TestParamInfo obj) { + Shape inputShape; + int axis; + std::tie(inputShape, axis) = obj.param; + std::ostringstream result; + result << "IS=" << CommonTestUtils::vec2str(inputShape) << "_"; + result << "Axis=" << axis << "_"; + return result.str(); +} + +void SoftmaxTests::SetUp() { + LoweringTests::SetUp(); + + const size_t count = 10; + manager.register_pass(count); + 
manager.register_pass(); + manager.register_pass(count); + manager.register_pass(count); + manager.register_pass(); + Shape inputShape; + int axis; + std::tie(inputShape, axis) = this->GetParam(); + snippets_function = std::make_shared(std::vector{inputShape}, axis); + master_shape = inputShape; +} + +std::string AddSoftmaxTests::getTestCaseName(testing::TestParamInfo obj) { + Shape inputShape0, inputShape1; + int axis; + std::tie(inputShape0, inputShape1, axis) = obj.param; + std::ostringstream result; + result << "IS[0]=" << CommonTestUtils::vec2str(inputShape0) << "_"; + result << "IS[1]=" << CommonTestUtils::vec2str(inputShape1) << "_"; + result << "Axis=" << axis << "_"; + return result.str(); +} + +void AddSoftmaxTests::SetUp() { + LoweringTests::SetUp(); + + const size_t count = 10; + manager.register_pass(); + manager.register_pass(count); + manager.register_pass(); + manager.register_pass(count); + manager.register_pass(count); + manager.register_pass(); + Shape inputShape0, inputShape1; + int axis; + std::tie(inputShape0, inputShape1, axis) = this->GetParam(); + snippets_function = std::make_shared(std::vector{inputShape0, inputShape1}, axis); + + ov::PartialShape master_pshape(inputShape0); + ov::PartialShape::broadcast_merge_into(master_pshape, inputShape1, op::AutoBroadcastType::NUMPY); + master_shape = master_pshape.get_shape(); +} + +TEST_P(SoftmaxTests, SoftmaxDecomposition) { + PartialShape scheduler_shape({master_shape[master_shape.size() - 2], + master_shape[master_shape.size() - 1]}); + auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), scheduler_shape); + function = subgraph->body_ptr(); + function_ref = snippets_function->getLowered(); +} + +TEST_P(AddSoftmaxTests, AddSoftmaxDecomposition) { + PartialShape scheduler_shape({master_shape[master_shape.size() - 2], + master_shape[master_shape.size() - 1]}); + auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), scheduler_shape); + function = subgraph->body_ptr(); 
+ function_ref = snippets_function->getLowered(); +} + +namespace SoftmaxTestsInstantiation { +std::vector inputShape{{12, 4, 12, 12, 127}, {12, 4, 12, 12, 1}}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_SoftmaxDecomposition, SoftmaxTests, + ::testing::Combine( + ::testing::ValuesIn(inputShape), + ::testing::Values(-1)), + SoftmaxTests::getTestCaseName); + +} // namespace SoftmaxTestsInstantiation + +namespace AddSoftmaxTestsInstantiation { +std::vector inputShape0{{12, 4, 12, 12, 17}, {12, 4, 12, 12, 1}}; +std::vector inputShape1{{12, 4, 12, 12, 17}, {12, 4, 12, 12, 1}}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_AddSoftmaxDecomposition, AddSoftmaxTests, + ::testing::Combine( + ::testing::ValuesIn(inputShape0), + ::testing::ValuesIn(inputShape1), + ::testing::Values(-1)), + AddSoftmaxTests::getTestCaseName); + +} // namespace AddSoftmaxTestsInstantiation + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/common/snippets/tests/src/pass/softmax_reshape_elimination.cpp b/src/common/snippets/tests/src/pass/softmax_reshape_elimination.cpp new file mode 100644 index 00000000000..f8b51924a02 --- /dev/null +++ b/src/common/snippets/tests/src/pass/softmax_reshape_elimination.cpp @@ -0,0 +1,70 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include + +#include +#include + +#include + +#include "common_test_utils/ngraph_test_utils.hpp" + +using namespace testing; +using namespace ngraph; + +TEST_F(TransformationTestsF, SoftmaxV1ReshapeElimination) { + { + auto data = std::make_shared(element::f32, Shape{2, 3, 240}); + auto shape0 = std::make_shared(ov::element::i32, ov::Shape{2}, std::vector{6, 240}); + auto reshape0 = std::make_shared(data, shape0, false); + auto softmax_v1 = std::make_shared(reshape0, 1); + auto shape1 = std::make_shared(ov::element::i32, ov::Shape{3}, std::vector{2, 3, 240}); + auto reshape1 = std::make_shared(softmax_v1, shape1, false); + function = 
std::make_shared(NodeVector{reshape1}, ParameterVector{data}); + + manager.register_pass(); + } + { + auto data = std::make_shared(element::f32, Shape{2, 3, 240}); + auto softmax_v1 = std::make_shared(data, 2); + function_ref = std::make_shared(NodeVector{softmax_v1}, ParameterVector{data}); + } +} + +TEST_F(TransformationTestsF, SoftmaxV8ReshapeElimination) { + { + auto data = std::make_shared(element::f32, Shape{1, 2, 340, 240}); + auto shape0 = std::make_shared(ov::element::i32, ov::Shape{2}, std::vector{680, 240}); + auto reshape0 = std::make_shared(data, shape0, false); + auto softmax_v1 = std::make_shared(reshape0, -1); + auto shape1 = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{1, 2, 340, 240}); + auto reshape1 = std::make_shared(softmax_v1, shape1, false); + function = std::make_shared(NodeVector{reshape1}, ParameterVector{data}); + + manager.register_pass(); + } + { + auto data = std::make_shared(element::f32, Shape{1, 2, 340, 240}); + auto softmax_v1 = std::make_shared(data, 3); + function_ref = std::make_shared(NodeVector{softmax_v1}, ParameterVector{data}); + } +} + +TEST_F(TransformationTestsF, SoftmaxReshapeElimination_IncorrectReshape) { + { + auto data = std::make_shared(element::f32, Shape{1, 2, 340, 240}); + auto shape0 = std::make_shared(ov::element::i32, ov::Shape{2}, std::vector{2, 81600}); + auto reshape0 = std::make_shared(data, shape0, false); + auto softmax_v1 = std::make_shared(reshape0, -1); + auto shape1 = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{1, 2, 340, 240}); + auto reshape1 = std::make_shared(softmax_v1, shape1, false); + function = std::make_shared(NodeVector{reshape1}, ParameterVector{data}); + + manager.register_pass(); + } +} diff --git a/src/common/snippets/tests/src/registers.cpp b/src/common/snippets/tests/src/registers.cpp index 677a318dfab..41b94db5e3f 100644 --- a/src/common/snippets/tests/src/registers.cpp +++ b/src/common/snippets/tests/src/registers.cpp @@ -33,6 +33,8 @@ 
TEST(TransformationTests, AssignRegisters) { auto s00 = std::make_shared(y02); s00->set_friendly_name("y03"); s00->set_friendly_name("s00"); f = std::make_shared(NodeVector{s00}, ParameterVector{p0, p1}); + // Note that testing the result is not strictly necessary, since the Result doesn't emit any code + f->get_result()->set_friendly_name("r00"); pass::Manager m; m.register_pass(); @@ -52,18 +54,19 @@ TEST(TransformationTests, AssignRegisters) { {"y01", 1}, {"y02", 2}, {"s00", 2}, // gpr + {"r00", 2} // gpr }; auto total_ops = 0; for (auto& op : f->get_ordered_ops()) { - auto& rt = op->get_rt_info(); - - auto it_rinfo = rt.find("reginfo"); - if (it_rinfo != rt.end()) { - auto reginfo = it_rinfo->second.as>(); - auto reg = reginfo[0]; - ASSERT_TRUE(ref_registers[op->get_friendly_name()] == reg); - total_ops++; + for (const auto& output : op->outputs()) { + const auto& rt = output.get_tensor_ptr()->get_rt_info(); + auto it_rt = rt.find("reginfo"); + if (it_rt != rt.end()) { + auto reg = it_rt->second.as(); + ASSERT_TRUE(ref_registers[op->get_friendly_name()] == reg); + total_ops++; + } } } ASSERT_EQ(total_ops, ref_registers.size()); @@ -120,6 +123,7 @@ TEST(TransformationTests, AssignRegisters2) { s00->set_friendly_name("s00"); f = std::make_shared(NodeVector{s00}, ParameterVector{p0, p1, p2, p3, p4, p5, p6, p7}); + f->get_result()->set_friendly_name("res00"); pass::Manager m; m.register_pass(); @@ -140,17 +144,19 @@ TEST(TransformationTests, AssignRegisters2) { {"r18", 0}, {"r19", 2}, {"r20", 4}, {"r21", 1}, {"r22", 0}, {"r23", 6}, {"r24", 1}, {"s00", 8}, + {"res00", 8} }; auto total_ops = 0; for (auto& op : f->get_ordered_ops()) { - auto& rt = op->get_rt_info(); - auto it_rinfo = rt.find("reginfo"); - if (it_rinfo != rt.end()) { - auto reginfo = it_rinfo->second.as>(); - auto reg = reginfo[0]; - ASSERT_TRUE(ref_registers[op->get_friendly_name()] == reg); - total_ops++; + for (const auto& output : op->outputs()) { + const auto& rt = 
output.get_tensor_ptr()->get_rt_info(); + auto it_rt = rt.find("reginfo"); + if (it_rt != rt.end()) { + auto reg = it_rt->second.as(); + ASSERT_TRUE(ref_registers[op->get_friendly_name()] == reg); + total_ops++; + } } } ASSERT_EQ(total_ops, ref_registers.size()); diff --git a/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp b/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp index 67a557298ed..45e4876998a 100644 --- a/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp +++ b/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp @@ -653,7 +653,7 @@ NGRAPH_TEST(${BACKEND_NAME}, onnx_model_attention_qkv_hidden_sizes) { test_case.add_input(input); test_case.add_expected_output(output); - test_case.run_with_tolerance_as_fp(1e-6); + test_case.run_with_tolerance_as_fp(1e-4); } NGRAPH_TEST(${BACKEND_NAME}, onnx_model_attention_unidirectional) { diff --git a/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp b/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp index e112fe687ad..294ab6a18e0 100644 --- a/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp +++ b/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp @@ -108,6 +108,19 @@ DECLARE_CONFIG_KEY(CONFIG_DEVICE_ID); * @brief enable hyper thread */ DECLARE_CONFIG_KEY(ENABLE_HYPER_THREAD); + +/** + * @brief Defines Snippets tokenization mode + * @param ENABLE - default pipeline + * @param IGNORE_CALLBACK - disable the Snippets markup transformation and tokenization callback + * @param DISABLE - turn off the Snippets + * @ingroup ie_dev_api_plugin_api + */ +DECLARE_CONFIG_KEY(SNIPPETS_MODE); +DECLARE_CONFIG_VALUE(ENABLE); +DECLARE_CONFIG_VALUE(IGNORE_CALLBACK); +DECLARE_CONFIG_VALUE(DISABLE); + } // namespace PluginConfigInternalParams } // namespace InferenceEngine diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp index 92f498876e3..b7a52b2b21c 100644 --- 
a/src/plugins/intel_cpu/src/config.cpp +++ b/src/plugins/intel_cpu/src/config.cpp @@ -188,6 +188,16 @@ void Config::readProperties(const std::map &prop) { IE_THROW() << "Wrong value for property key " << CPUConfigParams::KEY_CPU_DENORMALS_OPTIMIZATION << ". Expected only YES/NO"; } + } else if (key == PluginConfigInternalParams::KEY_SNIPPETS_MODE) { + if (val == PluginConfigInternalParams::ENABLE) + snippetsMode = SnippetsMode::Enable; + else if (val == PluginConfigInternalParams::IGNORE_CALLBACK) + snippetsMode = SnippetsMode::IgnoreCallback; + else if (val == PluginConfigInternalParams::DISABLE) + snippetsMode = SnippetsMode::Disable; + else + IE_THROW() << "Wrong value for property key " << PluginConfigInternalParams::KEY_SNIPPETS_MODE + << ". Expected values: ENABLE/DISABLE/IGNORE_CALLBACK"; } else { IE_THROW(NotFound) << "Unsupported property " << key << " by CPU plugin"; } diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h index 8a1dcc59b83..012fd0fd9f0 100644 --- a/src/plugins/intel_cpu/src/config.h +++ b/src/plugins/intel_cpu/src/config.h @@ -32,9 +32,16 @@ struct Config { DO_On, }; + enum SnippetsMode { + Enable, + IgnoreCallback, + Disable, + }; + bool collectPerfCounters = false; bool exclusiveAsyncRequests = false; bool enableDynamicBatch = false; + SnippetsMode snippetsMode = SnippetsMode::Enable; std::string dumpToDot = ""; int batchLimit = 0; float fcSparseWeiDecompressionRate = 1.0f; diff --git a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp index bba788545f2..fb3f12a9761 100644 --- a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp @@ -17,6 +17,7 @@ #include "snippets_transformations/op/load_convert.hpp" #include "snippets_transformations/op/store_convert.hpp" +#include "snippets/op/brgemm.hpp" #include "ngraph_transformations/op/swish_cpu.hpp" #include @@ -45,9 +46,12 @@ 
ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_ // data movement jitters[ngraph::opset1::Parameter::get_type_info_static()] = CREATE_EMITTER(NopEmitter); jitters[ngraph::opset1::Result::get_type_info_static()] = CREATE_EMITTER(NopEmitter); + jitters[ngraph::snippets::op::Buffer::get_type_info_static()] = CREATE_EMITTER(NopEmitter); + jitters[ngraph::snippets::op::VectorBuffer::get_type_info_static()] = CREATE_EMITTER(VectorBufferEmitter); // jitters[ngraph::opset1::Constant::get_type_info_static()] = CREATE_EMITTER(); // Not supported jitters[ngraph::snippets::op::Load::get_type_info_static()] = CREATE_EMITTER(LoadEmitter); + jitters[ngraph::snippets::op::LoadReshape::get_type_info_static()] = CREATE_EMITTER(LoadEmitter); jitters[ngraph::snippets::op::BroadcastLoad::get_type_info_static()] = CREATE_EMITTER(BroadcastLoadEmitter); jitters[ov::intel_cpu::LoadConvertSaturation::get_type_info_static()] = CREATE_EMITTER(LoadConvertEmitter); jitters[ov::intel_cpu::LoadConvertTruncation::get_type_info_static()] = CREATE_EMITTER(LoadConvertEmitter); @@ -65,6 +69,9 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_ jitters[ngraph::snippets::op::ConvertSaturation::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_convert_saturation_emitter); // jitters[ngraph::opset1::FakeQuantize::get_type_info_static()] = CREATE_EMITTER(); // not supported + // ternary + jitters[ngraph::opset1::Select::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_select_emitter); + // binary jitters[ngraph::opset1::Add::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_add_emitter); jitters[ngraph::opset1::Divide::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_divide_emitter); @@ -121,10 +128,15 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_ // jitters[ngraph::opset1::Selu::get_type_info_static()] = CREATE_EMITTER(); // not supported 
jitters[ngraph::op::v0::Gelu::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_gelu_v0_emitter); jitters[ngraph::op::v7::Gelu::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_gelu_v7_emitter); + jitters[ngraph::snippets::op::Fill::get_type_info_static()] = CREATE_EMITTER(FillEmitter); + + jitters[ngraph::snippets::op::HorizonMax::get_type_info_static()] = CREATE_EMITTER(HorizonMaxEmitter); + jitters[ngraph::snippets::op::HorizonSum::get_type_info_static()] = CREATE_EMITTER(HorizonSumEmitter); jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = CREATE_EMITTER(KernelEmitter); - jitters[ngraph::snippets::op::Tile::get_type_info_static()] = CREATE_EMITTER(TileEmitter); - jitters[ngraph::snippets::op::TileScheduler::get_type_info_static()] = CREATE_EMITTER(TileSchedulerEmitter); + jitters[ngraph::snippets::op::LoopBegin::get_type_info_static()] = CREATE_EMITTER(LoopBeginEmitter); + jitters[ngraph::snippets::op::LoopEnd::get_type_info_static()] = CREATE_EMITTER(LoopEndEmitter); + jitters[ngraph::snippets::op::Brgemm::get_type_info_static()] = CREATE_EMITTER(BrgemmEmitter); } size_t ov::intel_cpu::CPUTargetMachine::get_lanes() const { @@ -141,7 +153,9 @@ bool ov::intel_cpu::CPUTargetMachine::is_supported() const { } code ov::intel_cpu::CPUTargetMachine::get_snippet() const { - h->create_kernel(); + if (h->create_kernel() != status::success) { + IE_THROW() << "Failed to create jit_kernel in get_snippet()"; + } return h->jit_ker(); } diff --git a/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.cpp index 506b77603e4..143a78ef173 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.cpp @@ -2143,5 +2143,66 @@ void jit_is_nan_emitter::register_table_entries() { } } +/// SELECT /// +jit_select_emitter::jit_select_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& 
node, Precision exec_prc) + : jit_emitter(host, host_isa, node, exec_prc) {} +jit_select_emitter::jit_select_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} + +size_t jit_select_emitter::get_inputs_num() const { return 3; } + +size_t jit_select_emitter::aux_vecs_count() const { + if (host_isa_ == x64::avx512_core) + return 0; + else if (host_isa_ == x64::avx2) // tmp vec for mask + return 1; + else // mask should be xmm0 on sse41 + tmp vec for mask + return 2; +} + +void jit_select_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, + const emitter_context *emit_context) const { + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + assert(!"unsupported isa"); + } +} + +template +void jit_select_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_cond = Vmm(in_vec_idxs[0]); + Vmm vmm_src0 = Vmm(in_vec_idxs[1]); + Vmm vmm_src1 = Vmm(in_vec_idxs[2]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + + if (isa == x64::sse41) { + Vmm vmm_mask = Vmm(aux_vec_idxs[0]); + Vmm vmm_zero = Vmm(aux_vec_idxs[1]); + h->uni_vpxor(vmm_zero, vmm_zero, vmm_zero); + h->uni_vcmpps(vmm_cond, vmm_cond, vmm_zero, 0x4); + if (vmm_mask.getIdx() != vmm_cond.getIdx()) { + h->uni_vmovups(vmm_mask, vmm_cond); + } + if (vmm_src1.getIdx() != vmm_dst.getIdx()) { + h->uni_vmovups(vmm_dst, vmm_src1); + } + h->uni_vblendvps(vmm_dst, vmm_dst, vmm_src0, vmm_mask); + } else if (isa == x64::avx2) { + Vmm vmm_zero = Vmm(aux_vec_idxs[0]); + h->uni_vpxor(vmm_zero, vmm_zero, vmm_zero); + h->uni_vcmpps(vmm_cond, vmm_cond, vmm_zero, 0x4); + 
h->uni_vblendvps(vmm_dst, vmm_src1, vmm_src0, vmm_cond); + } else { + h->vptestmd(k_mask, vmm_cond, vmm_cond); + h->vblendmps(vmm_dst | k_mask, vmm_src1, vmm_src0); + } +} } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.hpp index b8059793859..83a042633df 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.hpp @@ -692,5 +692,23 @@ private: void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; }; +class jit_select_emitter : public jit_emitter { +public: + jit_select_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + jit_select_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + + size_t get_inputs_num() const override; + size_t aux_vecs_count() const override; + +private: + void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, + const emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; +}; } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp index 2130457847f..728c5de139b 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp @@ -7,8 +7,12 @@ #include #include "jit_snippets_emitters.hpp" +#include "snippets/op/brgemm.hpp" +#include "snippets/op/subgraph.hpp" +#include 
"snippets/utils.hpp" using namespace Xbyak; +using ngraph::snippets::op::Subgraph; namespace ov { namespace intel_cpu { @@ -23,57 +27,71 @@ jit_container_emitter::jit_container_emitter(dnnl::impl::cpu::x64::jit_generator in_out_type_ = emitter_in_out_map::gpr_to_gpr; } -void jit_container_emitter::map_abstract_registers(const std::vector &vec_pool, const std::vector &gpr_pool, - std::set& vecs_used, std::set& gprs_used) { - if (body.empty()) - IE_THROW() << "Cannot map registers for jit_container_emitter when its body is empty"; - auto abstract_to_physical = [](const std::vector& abstract_regs, const std::vector& regs_pool) { +void jit_container_emitter::map_abstract_registers(mapping_info& gpr_map_pool, mapping_info& vec_map_pool, + std::vector& allocated_emitters) const { + if (allocated_emitters.empty()) + IE_THROW() << "Cannot map registers when there is no allocated_emitters provided"; + auto map_regs = [](const std::vector& abstract_regs, mapping_info& mapping) { + auto& abstract_to_physical = mapping.first; + auto& regs_pool = mapping.second; std::vector physical_regs(abstract_regs.size()); - for (size_t i = 0; i < abstract_regs.size(); i++) - physical_regs[i] = regs_pool.at(abstract_regs[i]); + for (size_t i = 0; i < abstract_regs.size(); i++) { + const auto abstract = abstract_regs[i]; + auto& physical = physical_regs[i]; + if (abstract_to_physical.count(abstract) == 0) { + if (regs_pool.empty()) + IE_THROW() << "Cannot map registers for jit_container_emitter: not enough regs in the pool"; + physical = regs_pool.back(); + regs_pool.pop_back(); + abstract_to_physical[abstract] = physical; + } else { + physical = abstract_to_physical[abstract]; + } + } return physical_regs; }; - for (auto& code : body) { + + for (auto& code : allocated_emitters) { const auto& emitter = code.first; std::vector in_abstract_regs, out_abstract_regs; std::tie(in_abstract_regs, out_abstract_regs) = code.second; std::vector in_physical_regs, out_physical_regs; switch 
(std::dynamic_pointer_cast(emitter)->get_in_out_type()) { case gpr_to_gpr: - // Note that gpr_to_gpr is used for high-level utility operations like Kernel/TileScheduler/Tile. + // Note that gpr_to_gpr is used for high-level utility operations like Kernel/Loop. // Input registers are not mapped in this case, since they contain utility info - // (num_params, tile increment, etc.), but not reg indexes. - in_physical_regs = std::move(in_abstract_regs); - out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, gpr_pool)); - gprs_used.insert(out_physical_regs.begin(), out_physical_regs.end()); + // (num_params, loop increment, etc.), but not reg indexes. + // todo: Note that LoopBeginEmitter and LoopEndEmitter demonstrate new paradigm, + // where all utility emitters align with conventional Op emitters + if (std::dynamic_pointer_cast(emitter) || + std::dynamic_pointer_cast(emitter) || + std::dynamic_pointer_cast(emitter)) + in_physical_regs = std::move(map_regs(in_abstract_regs, gpr_map_pool)); + else + in_physical_regs = std::move(in_abstract_regs); + out_physical_regs = std::move(map_regs(out_abstract_regs, gpr_map_pool)); break; case gpr_to_vec: // Load Emitters - in_physical_regs = std::move(abstract_to_physical(in_abstract_regs, gpr_pool)); - out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, vec_pool)); - gprs_used.insert(in_physical_regs.begin(), in_physical_regs.end()); - vecs_used.insert(out_physical_regs.begin(), out_physical_regs.end()); + in_physical_regs = std::move(map_regs(in_abstract_regs, gpr_map_pool)); + out_physical_regs = std::move(map_regs(out_abstract_regs, vec_map_pool)); break; case vec_to_gpr: // Store Emitters - in_physical_regs = std::move(abstract_to_physical(in_abstract_regs, vec_pool)); - out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, gpr_pool)); - vecs_used.insert(in_physical_regs.begin(), in_physical_regs.end()); - gprs_used.insert(out_physical_regs.begin(), out_physical_regs.end()); 
+ in_physical_regs = std::move(map_regs(in_abstract_regs, vec_map_pool)); + out_physical_regs = std::move(map_regs(out_abstract_regs, gpr_map_pool)); break; case vec_to_vec: // Regular operations - in_physical_regs = std::move(abstract_to_physical(in_abstract_regs, vec_pool)); - out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, vec_pool)); - vecs_used.insert(in_physical_regs.begin(), in_physical_regs.end()); - vecs_used.insert(out_physical_regs.begin(), out_physical_regs.end()); + in_physical_regs = std::move(map_regs(in_abstract_regs, vec_map_pool)); + out_physical_regs = std::move(map_regs(out_abstract_regs, vec_map_pool)); break; default: IE_THROW() << "Unhandled in_out type"; } code.second = std::make_pair(in_physical_regs, out_physical_regs); if (auto container = std::dynamic_pointer_cast(code.first)) - container->map_abstract_registers(vec_pool, gpr_pool, vecs_used, gprs_used); + container->map_abstract_registers(gpr_map_pool, vec_map_pool, allocated_emitters); } } @@ -84,15 +102,66 @@ KernelEmitter::KernelEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl: IE_THROW() << "KernelEmitter invoked with invalid op argument"; if (kernel->region.empty()) IE_THROW() << "KernelEmitter invoked with empty body"; + if (kernel->compile_params == nullptr) + IE_THROW() << "KernelEmitter invoked with op::Kernel that contains no compile_params"; body = kernel->region; - if (!kernel->compile_params) - IE_THROW() << "KernelEmitter invoked without compile_params"; jcp = *reinterpret_cast(kernel->compile_params); + // calc data access pattern. 
we'll need it for offsets calculation + const auto& model = kernel->model; + const auto get_static_shape = [](const std::shared_ptr& node) { + const auto& pshape = node->get_output_partial_shape(0); + if (pshape.is_dynamic()) + IE_THROW() << "KernelEmitter can't calc offsets for dynamic shapes"; + return pshape.get_shape(); + }; + const auto get_data_layout = [](const Output& out, std::vector& shape) { + const auto& layout = ngraph::snippets::utils::get_node_output_layout(out.get_node_shared_ptr()); + // default access pattern + if (!layout.empty()) { + const auto layout_shape_diff = static_cast(shape.size()) - static_cast(layout.size()); + // Plugin can (and usually does) prepend shapes with 1's to facilitate scheduling, here we can safely remove leading 1's + if (layout_shape_diff > 0) { + if (std::any_of(shape.begin(), shape.begin() + layout_shape_diff, [](size_t x){return x != 1;})) + IE_THROW() << "KernelEmitter detected shape vs access pattern conflict: only leading 1's can be removed from the shape"; + shape.erase(shape.begin(), shape.begin() + layout_shape_diff); + } + } + return layout; + }; + const auto& ops = model->get_ordered_ops(); + auto params = model->get_parameters(); + auto results = model->get_results(); + num_inputs = params.size(); + num_outputs = results.size(); + is_buffer_needed = std::any_of(ops.begin(), ops.end(), + [](const std::shared_ptr& node) { return ov::is_type(node); } ); + NodeVector io_nodes; + std::copy(params.begin(), params.end(), std::back_inserter(io_nodes)); + std::copy(results.begin(), results.end(), std::back_inserter(io_nodes)); + + const auto& model_rt_info = model->get_rt_info(); + const auto& plugin_shapes = model_rt_info.find("PluginShapesOverride"); + if (plugin_shapes == model_rt_info.end()) { + IE_THROW() << "JIT KernelEmitter requires plugin-overriden shapes in model rt_info"; + } else { + const auto& new_shapes = plugin_shapes->second.as>>(); + if (new_shapes.size() != num_inputs + num_outputs) + IE_THROW() << 
"JIT KernelEmitter detected invalid plugin-overriden shapes"; + io_shapes = new_shapes; + } + for (int i = 0; i < io_nodes.size(); i++) { + const auto& out = i < num_inputs ? io_nodes[i]->output(0) : io_nodes[i]->input_value(0); + data_layout.push_back(get_data_layout(out, io_shapes[i])); + io_data_size.push_back(out.get_element_type().size()); + } // Initialize pools of gp and vec registers gp_regs_pool.resize(16); vec_regs_pool.resize(16); - std::iota(gp_regs_pool.begin(), gp_regs_pool.end(), 0); - std::iota(vec_regs_pool.begin(), vec_regs_pool.end(), 0); + // It's easier to remove the last item during mapping, so fill descending to map ascending + for (size_t i = 0; i < 16; i++) + gp_regs_pool[i] = vec_regs_pool[i] = 15 - i; + // todo: it's more convenient to use std::set as a pool container (unique and always sorted), + // but pools are vectors to align with emit_code signature. Change signature? auto remove_regs_from_pool = [](std::vector& pool, const std::set& to_remove) { // It's important to keep the order of other elements pool.erase(std::remove_if(pool.begin(), pool.end(), @@ -101,14 +170,31 @@ KernelEmitter::KernelEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl: // Reserve stack base and pointer for push(...) and pop(...) 
operations // Reserve abi_param1 and abi_param2, since they'll be used to pass runtime call args to kernel remove_regs_from_pool(gp_regs_pool, {Xbyak::Operand::RSP, Xbyak::Operand::RBP, - static_cast(abi_param1.getIdx()), - static_cast(abi_param2.getIdx())}); - std::set vecs_used, gprs_used; - map_abstract_registers(vec_regs_pool, gp_regs_pool, vecs_used, gprs_used); - remove_regs_from_pool(gp_regs_pool, gprs_used); - remove_regs_from_pool(vec_regs_pool, vecs_used); - // Remember used gprs to pass it to the TileSchedulerEmitter, so it can init them with appropriate data ptrs - gp_regs_used = std::vector(gprs_used.begin(), gprs_used.end()); + reg_indexes_idx, reg_const_params_idx}); + + mapping_info gpr_map_pool({}, gp_regs_pool); + mapping_info vec_map_pool({}, vec_regs_pool); + std::vector data_io_emitters; + std::copy_if(body.begin(), body.end(), std::back_inserter(data_io_emitters), + [](const AllocatedEmitter& code){ + const auto& emitter = code.first; + const auto emitter_type = std::dynamic_pointer_cast(emitter)->get_in_out_type(); + // todo: how this will be handled if Brgemm in & out are op::Buffer + // Brgemm is a special case since it incorporates input and output (we use onednn kernel) + // Just like Load & Store it requires offsets calculation + const auto is_brgemm = std::dynamic_pointer_cast(emitter) != nullptr; + return emitter_type == gpr_to_vec || emitter_type == vec_to_gpr || is_brgemm; + }); + // Note that we can't use reg_indexes_idx or reg_const_params_idx to store data pointers because these two + // regs are used to calculate offsets for the data pointers + map_abstract_registers(gpr_map_pool, vec_map_pool, data_io_emitters); + for (const auto& abstract_to_physical : gpr_map_pool.first) + data_ptr_regs_idx.push_back(abstract_to_physical.second); + // However we can use reg_indexes_idx and reg_const_params_idx for other operations since we won't need them + // after offsets calculation + gpr_map_pool.second.push_back(reg_indexes_idx); + 
gpr_map_pool.second.push_back(reg_const_params_idx); + map_abstract_registers(gpr_map_pool, vec_map_pool, body); } void KernelEmitter::emit_code(const std::vector &in, @@ -123,268 +209,259 @@ void KernelEmitter::validate_arguments(const std::vector &in, const std::vector &out, const std::vector &pool, const std::vector &gpr) const { - if (in.size() != 2) - IE_THROW() << "KernelEmitter got invalid number of inputs. Expected 2, got " << in.size(); + if (!in.empty()) + IE_THROW() << "KernelEmitter got invalid number of inputs. Expected 0, got " << in.size(); if (!out.empty()) - IE_THROW() << "KKernelEmitter got invalid number of outputs. Expected 0, got " << out.size(); + IE_THROW() << "KernelEmitter got invalid number of outputs. Expected 0, got " << out.size(); + const auto num_params = num_inputs + num_outputs + static_cast(is_buffer_needed); + // The number of used gpr may be >= num_params since LoopBegin+LoopEnd could also use gpr to store work_amount + if (data_ptr_regs_idx.size() != num_params) + IE_THROW() << "KernelEmitter: number of inputs and outputs is inconsisnent with the number of allocated registers" + << num_params << " data_ptr_regs_idx.size() = " << data_ptr_regs_idx.size(); } -void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params, - const Reg64& reg_indexes, const Reg64& reg_const_params, const std::vector& data_ptr_regs) const { - const int64_t harness_num_dims = jcp.output_dims.size() - 1; - auto init_ptrs_with_offsets = [&](Reg64 pointer, const int64_t *offsets, Reg64 reg_tmp) { - for (int j = 0; j < harness_num_dims; j++) { - if (jcp.output_dims[j] != 1 && offsets[j] != 0) { +void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params, bool is_buffer_needed, + const Reg64& reg_indexes, const Reg64& reg_const_params, const std::vector& data_ptr_regs) const { + // Note that we don't need offset for the last dim, since it's handled directly by Tile emitter + const size_t offset_rank = jcp.master_shape.size() - 
1; + //const size_t tile_rank = jcp.tile_rank; + std::vector> data_offsets(num_params, std::vector{}); + auto offset_calculation = [=](const std::vector& shape, + const std::vector& layout, const size_t data_size) { + // Strides represent distance between consecutive elements of corresponding dimension. + // If a dim size == 1, then the next dim starts immediately and the stride is 0 + // case 1: + // shape: s0, s1, s2, s3 + // strides: s1*s2*s3, s2*s3, s3, 1 + // case 2: + // shape: s0, s1, s2 == 1, s3 + // strides: s1*s3, s3, 0, 1 + std::vector strides(shape.size()); + size_t dim_step = 1; + strides[shape.size() - 1] = 1; + for (int k = static_cast(shape.size()) - 2; k >= 0; k--) { + dim_step *= shape[k+1]; + strides[k] = shape[k] != 1 ? dim_step * data_size : 0; + } + // Note: this is an extra copy, but let's keep it for clarity + if (!layout.empty()) { + std::vector reordered_strides(strides.size()); + for (auto i = 0; i < layout.size(); i++) + reordered_strides[i] = strides[layout[i]]; + strides = std::move(reordered_strides); + } + // the last stride is ignored, since the entire last dim is processed by kernel + // and no parallel_for data_ptr offsets can be applied in this case (cover tile_rank == 1) + strides.pop_back(); + // if tile_rank > 1, then zero corresponding strides since no external offset can be applied + // for (auto j = 0; j < tile_rank - 1; j++) + // strides[strides.size() - 1 - j] = 0; + // actual offset size might be larger that the shape size due to 6D scheduling + strides.insert(strides.begin(), offset_rank - strides.size(), 0); + + return strides; + }; + for (size_t i = 0; i < num_params; i++) { + data_offsets[i] = offset_calculation(io_shapes[i], data_layout[i], io_data_size[i]); + } + // master_shape size must be valid in both static and dynamic cases + std::function&, Reg64)> init_ptr_with_offset; + init_ptr_with_offset = [&](Reg64 pointer, const std::vector& offsets, Reg64 reg_tmp) { + for (int j = 0; j < offset_rank; j++) { + if 
(jcp.master_shape[j] != 1 && offsets[j] != 0) { h->mov(reg_tmp, offsets[j]); h->imul(reg_tmp, h->ptr[reg_indexes + j * sizeof(size_t)]); h->add(pointer, reg_tmp); } } }; - for (auto i = 0; i < num_params; i++) { + const auto spare_corruptable_gpr = std::find_if(gp_regs_pool.begin(), gp_regs_pool.end(), + [this](size_t reg) { + return reg != reg_indexes_idx && reg != reg_const_params_idx; + }); + const bool last_iter_explicitly = spare_corruptable_gpr == gp_regs_pool.end(); + Reg64 reg_tmp = last_iter_explicitly ? data_ptr_regs[num_params - 1] : Reg64(static_cast(*spare_corruptable_gpr)); + // Vector "data_ptr_regs" is sorted by abstract regs. + // It means that the vector contains the physical registers in order [src, .., src, dst, .., dst, buffer] + // So we can initialize buffer register firstly as last value of vector "data_ptr_regs" + if (is_buffer_needed) { + h->mov(data_ptr_regs[num_params], h->ptr[reg_const_params + GET_OFF(buffer_scratchpad_ptr)]); + } + size_t i = 0; + for (; i < num_params - last_iter_explicitly; i++) { if (i < num_inputs) h->mov(data_ptr_regs[i], h->ptr[reg_const_params + GET_OFF(src_ptrs) + i * sizeof(void*)]); else h->mov(data_ptr_regs[i], h->ptr[reg_const_params + GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)]); - // we can use the last data_ptr_reg as tmp_reg until the last iteration, and reg_const_params then - Reg64 reg_tmp = i < num_params-1 ? 
data_ptr_regs.back() : reg_const_params; - init_ptrs_with_offsets(data_ptr_regs[i], &jcp.data_offsets[i * harness_num_dims], reg_tmp); + init_ptr_with_offset(data_ptr_regs[i], data_offsets[i], reg_tmp); + } + // a rare case when num_params is maximal, so we have no spare gprs + // * Static case: we can use reg_const_params as the last reg_tmp for the last iteration (and corrupt it), since + // it won't be used anymore + // * Dynamic case: we will need reg_const_params to pass runtime args to LoopScheduler, so we have to + // push a reg on the stack, and restore it value afterwards + if (last_iter_explicitly) { + h->mov(data_ptr_regs[i], h->ptr[reg_const_params + GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)]); + reg_tmp = reg_const_params; + // can corrupt reg_const_params, since we won't use it anymore + init_ptr_with_offset(data_ptr_regs[i], data_offsets[i], reg_tmp); } } void KernelEmitter::emit_impl(const std::vector& in, const std::vector& out, - const std::vector& allocated_vec_regs, - const std::vector& allocated_gp_regs, + const std::vector& vec_pool, + const std::vector& gpr_pool, const ov::intel_cpu::emitter_context *emit_context) const { h->preamble(); - const size_t num_inputs = in[0]; - const size_t num_outputs = in[1]; - - Reg64 reg_indexes = Reg64(abi_param1.getIdx()); - Reg64 reg_const_params = Reg64(abi_param2.getIdx()); + Reg64 reg_indexes = Reg64(static_cast(reg_indexes_idx)); + Reg64 reg_const_params = Reg64(static_cast(reg_const_params_idx)); std::vector data_ptr_regs; - transform_idxs_to_regs(gp_regs_used, data_ptr_regs); + transform_idxs_to_regs(data_ptr_regs_idx, data_ptr_regs); - init_data_pointers(num_inputs, num_inputs + num_outputs, reg_indexes, reg_const_params, data_ptr_regs); - // todo: emit_impl is a const method, so we can't just push_back unused regs to the gp_regs_pool. 
- // we need a more elegant approach to avoid a full copy here - auto local_gpr_pool = gp_regs_pool; - local_gpr_pool.push_back(static_cast(reg_indexes.getIdx())); - local_gpr_pool.push_back(static_cast(reg_const_params.getIdx())); + init_data_pointers(num_inputs, num_inputs + num_outputs, is_buffer_needed, reg_indexes, reg_const_params, data_ptr_regs); for (const auto& c : body) { const auto& emitter = c.first; std::vector in_regs, out_regs; std::tie(in_regs, out_regs) = c.second; - if (auto tile_scheduler = std::dynamic_pointer_cast(emitter)) - out_regs = gp_regs_used; - emitter->emit_code(in_regs, out_regs, vec_regs_pool, local_gpr_pool); + emitter->emit_code(in_regs, out_regs, vec_regs_pool, gp_regs_pool); } h->postamble(); } -TileSchedulerEmitter::TileSchedulerEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const std::shared_ptr& n) : jit_container_emitter(h, isa, n) { - const auto tile_scheduler = ov::as_type_ptr(n); - if (!tile_scheduler) - IE_THROW() << "TileSchedulerEmitter invoked with invalid op argument"; - if (!tile_scheduler->compile_params) - IE_THROW() << "TileEmitter invoked without compile_params"; - body = {tile_scheduler->vector_region, tile_scheduler->scalar_region}; - jcp = *reinterpret_cast(tile_scheduler->compile_params); + +LoopBeginEmitter::LoopBeginEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : jit_emitter(h, isa, n) { + loop_begin = ov::as_type_ptr(n); + if (!loop_begin) + IE_THROW() << "LoopBeginEmitter invoked with invalid op argument"; + const auto& target_inputs = loop_begin->output(loop_begin->get_output_size() - 1).get_target_inputs(); + // todo: this check could be excessive, since we check for it in validate_and_infer_types() + if (target_inputs.size() != 1) + IE_THROW() << "LoopBeginEmitter invoked with invalid configuration: the last output must have exactly one input attached"; + const auto loop_end = 
ov::as_type_ptr(target_inputs.begin()->get_node()->shared_from_this()); + if (!loop_end) + IE_THROW() << "LoopBeginEmitter invoked with invalid configuration: the last output must be LoopEnd"; + work_amount = loop_begin->get_work_amount(); + evaluate_once = loop_begin->get_evaluate_once(); + num_inputs = loop_begin->get_input_size(); + in_out_type_ = emitter_in_out_map::gpr_to_gpr; } -void TileSchedulerEmitter::emit_code(const std::vector &in, - const std::vector &out, - const std::vector &pool, - const std::vector &gpr) const { + +void LoopBeginEmitter::emit_code(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { validate_arguments(in, out, pool, gpr); emit_impl(in, out, pool, gpr, nullptr); } -void TileSchedulerEmitter::validate_arguments(const std::vector &in, - const std::vector &out, - const std::vector &pool, - const std::vector &gpr) const { - if (in.size() != 3) - IE_THROW() << "TileSchedulerEmitter got invalid number of inputs. Expected 3, got " << in.size(); - if (out.size() != in[0] + in[1]) - IE_THROW() << "TileSchedulerEmitter got invalid number of outputs. 
Expected " << in[0] + in[1] << " , got " << out.size(); - if (body.size() != 2) - IE_THROW() << "TileSchedulerEmitter got invalid body size, expected 2 (vector & scalar TileEmitter), got " << body.size(); - if (!(std::dynamic_pointer_cast(body[0].first) && std::dynamic_pointer_cast(body[1].first))) - IE_THROW() << "TileSchedulerEmitter can contain only TileEmitters inside its body"; + +void LoopBeginEmitter::validate_arguments(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { + if (in.size() != num_inputs) + IE_THROW() << "Invalid inputs size: expected " << num_inputs << " got " << in.size(); + if (out.size() != num_inputs + 1) + IE_THROW() << "Invalid outputs size: expected " << num_inputs + 1 << " got " << out.size(); } -void TileSchedulerEmitter::emit_tiles(const Reg64& reg_inner_amount, const std::vector& data_ptr_regs, size_t vector_size, - const std::vector& vec_pool, const std::vector& gpr_pool) const { - // TileAllocatedEmitter is just an alias to perform dynamic_pointer_cast only once and reuse it below several times - using TileAllocatedEmitter = std::pair, const ngraph::snippets::RegInfo&>; - TileAllocatedEmitter vector_tile {std::dynamic_pointer_cast(body[0].first), body[0].second}; - TileAllocatedEmitter scalar_tile {std::dynamic_pointer_cast(body[1].first), body[1].second}; - const size_t inner_work_amount = jcp.scheduler_dims[1]; - auto process_tile = - [&](const bool evaluate_once, const TileAllocatedEmitter& tile) { - // If Tile is evaluated only once, then we can emit its body directly and skip work_amount decrements and checks - if (evaluate_once) { - tile.first->emit_body(vec_pool, gpr_pool); - } else { - std::vector in_regs, out_regs; - std::tie(in_regs, out_regs) = tile.second; - // pass work_amount reg to Tile - in_regs.push_back(static_cast(reg_inner_amount.getIdx())); - for (const auto& reg : data_ptr_regs) - out_regs.emplace_back(reg.getIdx()); - tile.first->emit_code(in_regs, 
out_regs, vec_pool, gpr_pool); - } - }; - // todo: these optimizations should be performed on using Tile graph representation in the future - bool vector_evaluate_once = false; - if (inner_work_amount >= vector_size) { - vector_evaluate_once = inner_work_amount < 2 * vector_size; - // Need to set proper work amount for inner tiles if evaluated multiple times - if (!vector_evaluate_once) - h->mov(reg_inner_amount, inner_work_amount); - process_tile(vector_evaluate_once, vector_tile); - } - if (inner_work_amount % vector_size >= 1) { - bool scalar_evaluate_once = inner_work_amount % vector_size < 2; - if (!scalar_evaluate_once) { - // vector_tile is not executed, work_amount is not set - if (inner_work_amount < vector_size) { - h->mov(reg_inner_amount, inner_work_amount); - // vector_tile is executed, but work_amount is neither set nor decremented appropriately. - } else if (vector_evaluate_once) { - vector_tile.first->emit_ptr_increments(data_ptr_regs); - h->mov(reg_inner_amount, inner_work_amount - vector_size); - } - // else: vector_tile is executed multiple times, so work_amount is already set - } else { - if (vector_evaluate_once) { - vector_tile.first->emit_ptr_increments(data_ptr_regs); - } - } - process_tile(scalar_evaluate_once, scalar_tile); +void LoopBeginEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + // todo: In dynamic case we will also need to set broadcasting info here + Reg64 reg_work_amount = Reg64(out.back()); + Label for_body; + // save previous register state (if there is an outer loop that uses this reg for example) + if (!evaluate_once) { + h->mov(reg_work_amount, work_amount); } + // Note: loop address is not calculated at this point, so need to call calcJmpAddress() which is protected + // or ready(), but they both set internal flags and that's not a desired way to use them. 
+ // So the most obvious WA is just to use current address manually + loop_begin->begin_address = h->getCurr(); + loop_begin->input_regs = in; } -void TileSchedulerEmitter::emit_impl(const std::vector& in, - const std::vector& out, - const std::vector& vec_pool, - const std::vector& gpr_pool, - const ov::intel_cpu::emitter_context *emit_context) const { - const size_t num_inputs = in[0]; - const size_t num_outputs = in[1]; - const size_t vector_size = in[2]; - const size_t num_params = num_inputs + num_outputs; - const auto& data_ptr_reg_idxs(out); +LoopEndEmitter::LoopEndEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : jit_emitter(h, isa, n) { + loop_end = ov::as_type_ptr(n); + if (!loop_end) + IE_THROW() << "LoopEndEmitter invoked with invalid op argument"; + loop_begin = loop_end->get_loop_begin(); + // todo: this check could be excessive, since we check for it in validate_and_infer_types() + if (!loop_begin) + IE_THROW() << "LoopEndEmitter invoked with invalid configuration: the last arg must be LoopBegin"; + // Note that 1 edge connects LoopBegin and LoopEnd + num_inputs = loop_begin->get_input_size(); + num_outputs = loop_end->get_output_size(); + wa_increment = loop_end->get_increment(); + work_amount = loop_end->get_work_amount(); + ptr_increments = loop_end->get_ptr_increments(); + finalization_offsets = loop_end->get_finalization_offsets(); + evaluate_once = loop_end->get_evaluate_once(); + for (int i = 0; i < num_inputs; i++) + io_data_size.push_back(static_cast(loop_begin->get_input_element_type(i).size())); + for (int i = 0; i < num_outputs; i++) + io_data_size.push_back(static_cast(loop_end->get_output_element_type(i).size())); + in_out_type_ = emitter_in_out_map::gpr_to_gpr; +} + +void LoopEndEmitter::emit_code(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { + validate_arguments(in, out, pool, gpr); + emit_impl(in, out, 
pool, gpr, nullptr); +} + + +void LoopEndEmitter::validate_arguments(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { + if (loop_begin->input_regs.size() != num_inputs) + IE_THROW() << "Invalid loop_begin->input_regs size: expected " << num_inputs << " got " << loop_begin->input_regs.size(); + if (out.size() != num_outputs) + IE_THROW() << "Invalid number of out arguments: expected " << num_outputs << " got " << out.size(); + if (in.size() != num_outputs + 1) + IE_THROW() << "Invalid number of in arguments: expected " << num_inputs + 1 << " got " << in.size(); + const auto io_size = num_inputs + num_outputs; + if (ptr_increments.size() != io_size) + IE_THROW() << "Invalid apply_increments size: expected " << io_size << " got " << ptr_increments.size(); + if (finalization_offsets.size() != io_size) + IE_THROW() << "Invalid finalization_offsets size: expected: " << io_size << " got " << finalization_offsets.size(); +} + +void LoopEndEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + std::vector data_ptr_reg_idxs(loop_begin->input_regs); + data_ptr_reg_idxs.reserve(num_inputs + num_outputs); + std::copy(out.begin(), out.end(), std::back_inserter(data_ptr_reg_idxs)); std::vector data_ptr_regs; transform_idxs_to_regs(data_ptr_reg_idxs, data_ptr_regs); - // todo: emit_impl has const input args, so we can't just pop_back necessary regs from gpr_pool. - // we need a more elegant approach to avoid a full copy here. 
Similar problem is demonstrated in KernelEmitter - auto local_gpr_pool = gpr_pool; - Reg64 reg_outer_amount = Reg64(static_cast(local_gpr_pool.back())); - local_gpr_pool.pop_back(); - Reg64 reg_inner_amount = Reg64(static_cast(local_gpr_pool.back())); - local_gpr_pool.pop_back(); - Label for_body; - const size_t outer_work_amount = jcp.scheduler_dims[0]; - if (outer_work_amount == 1) { - // emit code directly without looping over external dim - emit_tiles(reg_inner_amount, data_ptr_regs, vector_size, vec_pool, local_gpr_pool); - } else if (outer_work_amount > 1) { - // We need to create a Loop in this case - h->mov(reg_outer_amount, outer_work_amount); - h->L(for_body); - { - emit_tiles(reg_inner_amount, data_ptr_regs, vector_size, vec_pool, local_gpr_pool); - - // Todo: Load and Store emitters are currently implemented so they ALWAYS increment appropriate pointers - // after reading/writing. This might be a problem if we need to read the same data multiple times (broadcasting shapes). - // To overcome this limitation, we add appropriate negative offsets if necessary. 
- for (auto i = 0; i < num_params; i++) { - if (jcp.scheduler_offsets[i] != 0) { - h->add(data_ptr_regs[i], jcp.scheduler_offsets[i]); - } - } - // Note that outer dimensions are always incremented by 1 (outer tiles are always scalar) - h->sub(reg_outer_amount, 1); - h->cmp(reg_outer_amount, 1); - h->jge(for_body, CodeGenerator::T_NEAR); + Reg64 reg_work_amount = Reg64(in.back()); + if (!evaluate_once) { + for (int idx = 0; idx < data_ptr_regs.size(); idx++) { + if (ptr_increments[idx] != 0) + h->add(data_ptr_regs[idx], ptr_increments[idx] * io_data_size[idx]); } + h->sub(reg_work_amount, wa_increment); + h->cmp(reg_work_amount, wa_increment); + h->jge(loop_begin->begin_address); } -} -std::vector& TileEmitter::get_nested_code() { - return body; -} - -TileEmitter::TileEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const std::shared_ptr& n) : jit_container_emitter(h, isa, n) { - const auto tile = ov::as_type_ptr(n); - if (!tile) - IE_THROW() << "TileEmitter invoked with invalid op argument"; - body = tile->region; - if (body.empty()) - IE_THROW() << "TileEmitter is invoked with empty body"; - num_inputs = tile->num_inputs; - num_outputs = tile->num_outputs; - io_dims = tile->io_dims; - io_data_size = tile->io_data_size; - increment = tile->increment; - if (io_dims.size() != num_inputs + num_outputs) - IE_THROW() << "TileEmitter constructor got inconsistent arguments. Check num_inputs + num_outputs == io_dims.size()"; -} - -void TileEmitter::emit_code(const std::vector &in, - const std::vector &out, - const std::vector &pool, - const std::vector &gpr) const { - validate_arguments(in, out, pool, gpr); - emit_impl(in, out, pool, gpr, nullptr); -} - -void TileEmitter::validate_arguments(const std::vector &in, - const std::vector &out, - const std::vector &pool, - const std::vector &gpr) const { - if (in.size() != 1) - IE_THROW() << "TileEmitter got invalid number of inputs. 
Expected 1, got " << in.size(); - if (out.size() != io_dims.size()) - IE_THROW() << "TileEmitter got invalid number of outputs. Expected " << io_dims.size() << " , got " << out.size(); -} - -void TileEmitter::emit_body(const std::vector& vec_pool, const std::vector& gpr_pool) const { - for (auto& code : body) - code.first->emit_code(code.second.first, code.second.second, vec_pool, gpr_pool); -} - -void TileEmitter::emit_ptr_increments(const std::vector& data_ptr_regs) const { - for (size_t i = 0; i < num_inputs + num_outputs; i++) { - // those with dims == 1 will be broadcasted, hence don't require increment - if (io_dims[i] != 1) - h->add(data_ptr_regs[i], increment * io_data_size[i]); + for (int idx = 0; idx < data_ptr_regs.size(); idx++) { + if (finalization_offsets[idx] != 0) + h->add(data_ptr_regs[idx], finalization_offsets[idx] * io_data_size[idx]); } } -void TileEmitter::emit_impl(const std::vector& in, - const std::vector& out, - const std::vector& vec_pool, - const std::vector& gpr_pool, - const ov::intel_cpu::emitter_context *emit_context) const { - Reg64 work_amount = Reg64(static_cast(in[0])); - std::vector data_ptr_regs; - transform_idxs_to_regs(out, data_ptr_regs); - Label for_body; - // Note that: - // * Work amount must be set by TileScheduler that executes Tiles - // * TileScheduler executes Tile only if it has to perform >= 1 iterations - h->L(for_body); - emit_body(vec_pool, gpr_pool); - emit_ptr_increments(data_ptr_regs); - h->sub(work_amount, increment); - h->cmp(work_amount, increment); - h->jge(for_body, CodeGenerator::T_NEAR); -} - BroadcastMoveEmitter::BroadcastMoveEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : jit_emitter(h, isa, n) { if (n->get_input_element_type(0) != n->get_output_element_type(0)) @@ -467,7 +544,9 @@ StoreEmitter::StoreEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::c if (src_prc != dst_prc) IE_THROW() << "StoreEmitter supports only equal 
input and output types but gets: " << src_prc.name() << " and " << dst_prc.name(); - count = ov::as_type_ptr(n)->get_count(); + const auto store = ov::as_type_ptr(n); + count = store->get_count(); + byte_offset = store->get_offset(); in_out_type_ = emitter_in_out_map::vec_to_gpr; store_emitter.reset(new jit_store_emitter(h, isa, src_prc, dst_prc, count)); } @@ -494,7 +573,7 @@ void StoreEmitter::emit_isa(const std::vector &in, const std::vector::type; if (!store_emitter) IE_THROW() << "Store CPU emitter isn't initialized for StoreEmitter!"; - store_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); + store_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); } void StoreEmitter::emit_data() const { @@ -506,7 +585,12 @@ LoadEmitter::LoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu if (src_prc != dst_prc) IE_THROW() << "LoadEmitter supports only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name(); - count = ov::as_type_ptr(n)->get_count(); + const auto load = std::dynamic_pointer_cast(n); + if (!load) + IE_THROW() << "LoadEmitter expects Load snippets op"; + + count = load->get_count(); + byte_offset = load->get_offset(); in_out_type_ = emitter_in_out_map::gpr_to_vec; load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count)); } @@ -533,7 +617,7 @@ void LoadEmitter::emit_isa(const std::vector &in, const std::vector::type; if (!load_emitter) IE_THROW() << "Load CPU emitter isn't initialized for LoadEmitter!"; - load_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); + load_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); } void LoadEmitter::emit_data() const { @@ -543,8 +627,13 @@ void LoadEmitter::emit_data() const { BroadcastLoadEmitter::BroadcastLoadEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : MemoryEmitter(h, isa, n) { if (src_prc != 
dst_prc) - IE_THROW() << "BroadcastEmitters support only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name(); + IE_THROW() << "BroadcastEmitters support only equal input and output types but gets: " << src_prc.name() << " and " << dst_prc.name(); + const auto broadcast_load = std::dynamic_pointer_cast(n); + if (!broadcast_load) + IE_THROW() << "BroadcastLoadEmitter expects BroadcastLoad snippets op"; + + byte_offset = broadcast_load->get_offset(); in_out_type_ = emitter_in_out_map::gpr_to_vec; } @@ -574,16 +663,18 @@ void BroadcastLoadEmitter::emit_isa(const std::vector &in, const std::ve // In doesn't really matter if we broadcast or `movss` for vector tails so keep only one version for `BroadcastLoad`, // key point here is not to add post-increment, it might be fixed by some other approach in future switch (src_prc.size()) { - case 4: h->uni_vbroadcastss(vmm_dst, h->ptr[in_reg]); break; - case 2: h->vpbroadcastw(vmm_dst, h->ptr[in_reg]); break; - case 1: h->vpbroadcastb(vmm_dst, h->ptr[in_reg]); break; + case 4: h->uni_vbroadcastss(vmm_dst, h->ptr[in_reg + byte_offset]); break; + case 2: h->vpbroadcastw(vmm_dst, h->ptr[in_reg + byte_offset]); break; + case 1: h->vpbroadcastb(vmm_dst, h->ptr[in_reg + byte_offset]); break; default: assert(!"unsupported data type"); } } LoadConvertEmitter::LoadConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : MemoryEmitter(h, isa, n) { - count = ov::as_type_ptr(n)->get_count(); + const auto load = ov::as_type_ptr(n); + count = load->get_count(); + byte_offset = load->get_offset(); in_out_type_ = emitter_in_out_map::gpr_to_vec; load_emitter.reset(new jit_load_emitter(h, isa, src_prc, dst_prc, count)); } @@ -608,7 +699,7 @@ template void LoadConvertEmitter::emit_isa(const std::vector &in, const std::vector &out) const { if (!load_emitter) IE_THROW() << "Load CPU emitter isn't initialized for LoadEmitter!"; - 
load_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); + load_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); } void LoadConvertEmitter::emit_data() const { @@ -617,7 +708,9 @@ void LoadConvertEmitter::emit_data() const { StoreConvertEmitter::StoreConvertEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : MemoryEmitter(h, isa, n) { - count = ov::as_type_ptr(n)->get_count(); + const auto store = ov::as_type_ptr(n); + count = store->get_count(); + byte_offset = store->get_offset(); in_out_type_ = emitter_in_out_map::vec_to_gpr; if (ov::is_type(n)) { @@ -647,12 +740,477 @@ template void StoreConvertEmitter::emit_isa(const std::vector &in, const std::vector &out) const { if (!store_emitter) IE_THROW() << "Store CPU emitter isn't initialized for StoreEmitter!"; - store_emitter->emit_code({in[0]}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); + store_emitter->emit_code({in[0], byte_offset}, {out[0]}, aux_vec_idxs, aux_gpr_idxs); } void StoreConvertEmitter::emit_data() const { store_emitter->emit_data(); } +size_t BrgemmEmitter::getBrgIdx(size_t mIdx, size_t kIdx, size_t nIdx) const { + return mIdx * 4 + kIdx * 2 + nIdx; +} +BrgemmEmitter::BrgemmEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& node) : jit_emitter(h, isa, node) { + in_out_type_ = emitter_in_out_map::gpr_to_gpr; + const auto& brgemm_node = as_type_ptr(node); + if (brgemm_node->is_dynamic()) + IE_THROW() << "Snippets don't support code generation for dynamic Brgemm"; + const OutputVector io_values {brgemm_node->input_value(0), brgemm_node->input_value(1), brgemm_node->output(0)}; + std::vector leading_dimensions; + std::vector> io_layouts; + for (const auto& val : io_values) { + const auto& layout = ngraph::snippets::utils::get_node_output_layout(val.get_node_shared_ptr()); + const auto& io_shape = val.get_shape(); + if (layout.empty()) { + // 
empty value indicates a planar layout + leading_dimensions.push_back(io_shape.back()); + std::vector default_layout(io_shape.size()); + std::iota(default_layout.begin(), default_layout.end(), 0); + io_layouts.push_back(default_layout); + } else { + // The idea here is to find "2" (for 4D shapes) in the layout and multiply dimensions that are to the right + // This implies that "3" is the last layout value, otherwise this layout is not supported. + // counting from the end since shape could be prepended with ones + const int64_t num_last_dims = layout.end() - std::find(layout.begin(), layout.end(), layout.size() - 2) - 1; + if (layout.back() != layout.size() - 1 || num_last_dims < 1) + IE_THROW() << "BrgemmEmitter detected invalid layout values: " << + "check that this shape + layout combination is schedulable"; + leading_dimensions.emplace_back( + std::accumulate(io_shape.end() - num_last_dims, io_shape.end(), 1, std::multiplies())); + io_layouts.push_back(layout); + } + } + // todo: leave AMX and VNNI related code for now, it'll help to enable int8 and bf16 support + bool isAMXSupported = mayiuse(avx512_core_bf16_amx_int8) || mayiuse(avx512_core_bf16_amx_bf16); + + const auto& A_shape = io_values[0].get_shape(); + const auto& A_layout = io_layouts[0]; + const auto& C_shape = io_values[2].get_shape(); + const auto& C_layout = io_layouts[2]; + + M = C_shape[C_layout[2]]; + K = A_shape[A_layout[3]]; + M_blk = matmulOptimalM; + M_tail = M % M_blk; + // B_shape[B_layout[3]] + N = C_shape[C_layout[3]]; + + auto brg0Prc = InferenceEngine::details::convertPrecision(brgemm_node->get_input_element_type(0)); + auto brg1Prc = InferenceEngine::details::convertPrecision(brgemm_node->get_input_element_type(1)); + io_data_size = {brg0Prc.size(), brg1Prc.size(), brgemm_node->get_output_element_type(0).size()}; + brg0VnniFactor = 4 / brg0Prc.size(); + bool brg0WithAMX = isAMXSupported && brg0Prc != Precision::FP32 && (K % brg0VnniFactor == 0) && (N % brg0VnniFactor == 0); + + N_blk 
= brg0Prc == Precision::FP32 ? N : + brg0Prc == Precision::BF16 ? 32 : 64; + N_tail = N % N_blk; + K_blk = brg0WithAMX ? brg0Prc == Precision::BF16 ? 32 : 64 + : K; + K_tail = K % K_blk; + + size_t brg0BaseIdx = -1; + for (size_t m = 0; m < 2; m++) { + for (size_t k = 0; k < 2; k++) { + for (size_t n = 0; n < 2; n++) { + auto& brgemmCtx = brgCtxs0[getBrgIdx(m, k, n)]; + + auto M_ = m ? M_tail + : M < M_blk ? 0 : M_blk; + auto N_ = n ? N_tail : N - N_tail; + auto K_ = k ? K_tail : K - K_tail; + auto beta = k && brgCtxs0[getBrgIdx(m, 0, n)].K != 0 ? 1.0f : 0.0f; + + brgemmCtx.M = M_; + brgemmCtx.N = N_; + brgemmCtx.K = K_; + brgemmCtx.LDA = leading_dimensions[0]; + brgemmCtx.LDB = leading_dimensions[1]; + brgemmCtx.LDC = leading_dimensions[2]; + brgemmCtx.dt_in0 = static_cast(DnnlExtensionUtils::IEPrecisionToDataType(brg0Prc)); + brgemmCtx.dt_in1 = static_cast(DnnlExtensionUtils::IEPrecisionToDataType(brg1Prc)); + brgemmCtx.beta = beta; + + // don't create brgemm kernels for empty tiles + if (M_ != 0 && K_ != 0 && N_ != 0) { + if (brg0BaseIdx == -1) + brg0BaseIdx = getBrgIdx(m, k, n); + initBrgemm(brgemmCtx, brgKernels0[getBrgIdx(m, k, n)], brg0WithAMX); + } + } + } + } + + load_offset_a = brgemm_node->get_offset_a(); + load_offset_b = brgemm_node->get_offset_b(); + store_offset_c = brgemm_node->get_offset_c(); +} + +void BrgemmEmitter::initBrgemm(brgemmCtx& ctx, std::unique_ptr& brgKernel, bool use_amx) const { + brgemm_t brgDesc; + brgemm_strides_t strides {static_cast(ctx.M * ctx.K), static_cast(ctx.K * ctx.N)}; + // When implementing int8 support, note that isa logics is more complicated in the MHA node + auto status = brgemm_desc_init(&brgDesc, host_isa_, brgemm_strd, ctx.dt_in0, ctx.dt_in1, + false, false, brgemm_row_major, 1.f, ctx.beta, ctx.LDA, ctx.LDB, ctx.LDC, ctx.M, ctx.N, ctx.K, &strides); + if (status != dnnl_success) + IE_THROW() << "BrgemmEmitter cannot initialize brgemm descriptor due to invalid params"; + + ctx.is_with_amx = use_amx; + status = 
brgemm_init_tiles(brgDesc, ctx.palette); + if (use_amx) + amx_tile_configure(ctx.palette); + + ctx.is_with_comp = ctx.dt_in0 == dnnl_data_type_t::dnnl_s8 && !ctx.is_with_amx; + + brgemm_kernel_t* brgKernel_ = nullptr; + status = brgemm_kernel_create(&brgKernel_, brgDesc); + if (status != dnnl_success) + IE_THROW() << "BrgemmEmitter cannot create brgemm kernel due to invalid params"; + brgKernel.reset(brgKernel_); +} + +void BrgemmEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == cpu::x64::sse41 || host_isa_ == cpu::x64::avx2) { + IE_THROW() << "BrgemmEmitter requires at least avx512_core instruction set"; + } else if (host_isa_ == cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + assert(!"unsupported isa"); + } +} +template +void BrgemmEmitter::emit_brgemm_kernel_call(const brgemm_kernel_t *brgKernel, int bs, + Reg64 addr_A, Reg64 addr_B, + const brgemm_batch_element_t *batch, Reg64 addr_C, void *scratch, + const size_t in0_kernel_offset, const size_t in1_kernel_offset, const size_t out0_kernel_offset) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + size_t gpr_size = 8; + Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->rax, + h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx}; + size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]); + + h->sub(h->rsp, n_gprs_to_save * gpr_size); + for (size_t i = 0; i < n_gprs_to_save; ++i) + h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]); + + // caller obligation to save k-regs as callee may use them + size_t n_k_regs_to_save = 8; + if (isa == cpu::x64::avx512_core) { + h->sub(h->rsp, n_k_regs_to_save * k_mask_size); + for (size_t i = 0; i < n_k_regs_to_save; ++i) { + if (mayiuse(avx512_core)) + h->kmovq(h->ptr[h->rsp + i * k_mask_size], Opmask(static_cast(i))); + else + h->kmovw(h->ptr[h->rsp + i * 
k_mask_size], Opmask(static_cast(i))); + } + } + + // 1. Caller obligation to save vector registers as callee may use them. + // 2. There is an implicit assumption that the host code uses the same + // `isa` as the injector. Once the assumption is wrong, `vecs_count` and + // `vlen` should be replaced with `host_isa::vlen` and + // `host_isa::vecs_count`. + h->sub(h->rsp, get_max_vecs_count() * get_vec_length()); + for (size_t i = 0; i < get_max_vecs_count(); ++i) + h->uni_vmovups(h->ptr[h->rsp + i * get_vec_length()], Vmm(i)); + + // save function address in gpr to pass in call instruction + const auto& brgemm_kernel_overload = static_cast(kernel_execute); + h->mov(h->rbp, reinterpret_cast(brgemm_kernel_overload)); + // todo: several of addr_{A, B, C} could be also abi_paramX, so one of them could be corrupted + // if moving directly h->uni_vmovq(abi_paramX, adr_X). Save them to vector regs to avoid corruption. + // It's likely that a more efficient solution exists. + h->uni_vmovq(Xmm(0), addr_A); + h->uni_vmovq(Xmm(1), addr_B); + h->uni_vmovq(Xmm(2), addr_C); + + const auto data_ptr_reg = [&](Xmm xmm, Xbyak::Reg64 reg, size_t bytes_offset) { + h->uni_vmovq(reg, xmm); + if (bytes_offset) h->add(reg, bytes_offset); + }; + h->mov(abi_param1, reinterpret_cast(brgKernel)); + data_ptr_reg(Xmm(0), abi_param2, in0_kernel_offset); + data_ptr_reg(Xmm(1), abi_param3, in1_kernel_offset); + data_ptr_reg(Xmm(2), abi_param4, out0_kernel_offset); + + // align stack on 16-byte as ABI requires + // note that RBX must not be changed by the callee + h->mov(h->rbx, h->rsp); + h->and_(h->rbx, 0xf); + h->sub(h->rsp, h->rbx); + + h->call(h->rbp); + + h->add(h->rsp, h->rbx); + // restore vector registers + for (int i = static_cast(get_max_vecs_count()) - 1; i >= 0; --i) { + h->uni_vmovups(Vmm(i), h->ptr[h->rsp + i * get_vec_length()]); + } + h->add(h->rsp, (get_max_vecs_count()) * get_vec_length()); + + // restore k registers + if (isa == cpu::x64::avx512_core) { + for (int i = 
n_k_regs_to_save - 1; i >= 0; --i) { + if (mayiuse(avx512_core)) + h->kmovq(Opmask(i), h->ptr[h->rsp + i * k_mask_size]); + else + h->kmovw(Opmask(i), h->ptr[h->rsp + i * k_mask_size]); + } + h->add(h->rsp, n_k_regs_to_save * k_mask_size); + } + + // restore gpr registers + for (int i = n_gprs_to_save - 1; i >= 0; --i) + h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]); + h->add(h->rsp, n_gprs_to_save * gpr_size); +} + +void BrgemmEmitter::kernel_execute(const brgemm_kernel_t *brg_kernel, const void *A, const void *B, void *C) { + // TODO: There are 4 available abi_params on Windows so we have the copy of brgemm_kernel_execute() function + // with 4 runtime parameters (kernel and I/O) and 4 default parameter values (batch, bs and scratch) + brgemm_kernel_params_t brgemm_p; + + brgemm_p.batch = nullptr; // default value + brgemm_p.ptr_A = A; + brgemm_p.ptr_B = B; + brgemm_p.ptr_C = C; + brgemm_p.ptr_D = C; + brgemm_p.ptr_buf = nullptr; // default value + brgemm_p.ptr_bias = nullptr; + brgemm_p.do_post_ops = 0; + brgemm_p.do_apply_comp = 0; + brgemm_p.skip_accm = 0; + brgemm_p.BS = 1; // default value + assert(brg_kernel); + (*brg_kernel)(&brgemm_p); +} + +template +void BrgemmEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + Reg64 input_0(static_cast(in[0])); + Reg64 input_1(static_cast(in[1])); + Reg64 output_0(static_cast(out[0])); + + for (size_t mb = 0; mb < div_up(M, M_blk); mb++) { + const bool is_M_tail = (M - mb * M_blk < M_blk); + + size_t brgIdx0 = getBrgIdx(0, 0, 0); + size_t K0_step0 = brgCtxs0[brgIdx0].K; + size_t K0_step1 = brgCtxs0[brgIdx0].K * brgCtxs0[brgIdx0].LDB; + size_t N0_step0 = brgCtxs0[brgIdx0].N * brg0VnniFactor; + size_t N0_step1 = brgCtxs0[brgIdx0].N; + for (size_t n = 0; n < 2; n++) { + for (size_t k = 0; k < 2; k++) { + size_t mIdx = is_M_tail ? 
1 : 0; + auto& brgemmCtx = brgCtxs0[getBrgIdx(mIdx, k, n)]; + + if (brgemmCtx.K != 0 && brgemmCtx.N != 0) { + const size_t in0_offset = load_offset_a + (k * K0_step0 + mb * M_blk * brgemmCtx.LDA) * io_data_size[0]; + const size_t in1_offset = load_offset_b + (k * K0_step1 + n * N0_step0) * io_data_size[1]; + const size_t out0_offset = store_offset_c + (n * N0_step1 + mb * M_blk * brgemmCtx.LDC) * io_data_size[2]; + + emit_brgemm_kernel_call(brgKernels0[getBrgIdx(mIdx, k, n)].get(), + 1, + input_0, + input_1, + nullptr, + output_0, + nullptr, + in0_offset, + in1_offset, + out0_offset); + } + } + } + } +} + +HorizonMaxEmitter::HorizonMaxEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : + jit_emitter(h, isa, n, Precision::FP32, emitter_in_out_map::vec_to_vec) {} + +void HorizonMaxEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "HorizonMax emitter doesn't support " << host_isa_; + } +} + +template +void HorizonMaxEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + + Vmm src_vmm = Vmm(in[0]); + Xmm dst_xmm = Xmm(out[0]); + Xmm aux_xmm = Xmm(aux_vec_idxs[0]); + + Reg64 aux_reg = Reg64(aux_gpr_idxs[0]); + Reg32 aux_reg_32 = Reg32(aux_reg.getIdx()); + + const size_t vlen = dnnl::impl::cpu::x64::cpu_isa_traits::vlen; + const size_t vec_size = vlen / sizeof(float); + h->sub(h->rsp, vlen); + h->uni_vmovups(h->ptr[h->rsp], src_vmm); + // Let the first value be the max + h->mov(aux_reg, h->ptr[h->rsp]); + h->vmovq(dst_xmm, aux_reg); + for (size_t 
i = 1; i < vec_size; i++) { + h->mov(aux_reg, h->ptr[h->rsp + i * sizeof(float)]); + h->vmovq(aux_xmm, aux_reg); + h->uni_vmaxps(dst_xmm, dst_xmm, aux_xmm); + } + h->add(h->rsp, vlen); +} + +HorizonSumEmitter::HorizonSumEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : + jit_emitter(h, isa, n, Precision::FP32, emitter_in_out_map::vec_to_vec) {} + +void HorizonSumEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "HorizonSum emitter doesn't support " << host_isa_; + } +} + +template +void HorizonSumEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + + Vmm src_vmm = Vmm(in[0]); + Xmm dst_xmm = Xmm(out[0]); + Xmm aux_xmm = Xmm(aux_vec_idxs[0]); + + Reg64 aux_reg = Reg64(aux_gpr_idxs[0]); + Reg32 aux_reg_32 = Reg32(aux_reg.getIdx()); + + const size_t vlen = dnnl::impl::cpu::x64::cpu_isa_traits::vlen; + const size_t vec_size = vlen / sizeof(float); + h->sub(h->rsp, vlen); + h->uni_vmovups(h->ptr[h->rsp], src_vmm); + h->uni_vpxor(dst_xmm, dst_xmm, dst_xmm); + for (size_t i = 0; i < vec_size; i++) { + h->mov(aux_reg, h->ptr[h->rsp + i * sizeof(float)]); + h->vmovq(aux_xmm, aux_reg); + h->uni_vaddps(dst_xmm, dst_xmm, aux_xmm); + } + h->add(h->rsp, vlen); +} + +VectorBufferEmitter::VectorBufferEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : + jit_emitter(h, isa, n, Precision::FP32, emitter_in_out_map::vec_to_vec) {} + +void VectorBufferEmitter::emit_impl(const std::vector& in, + 
const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "Zero emitter doesn't support " << host_isa_; + } +} + +template +void VectorBufferEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + + Vmm vmm = Vmm(out[0]); + h->uni_vpxor(vmm, vmm, vmm); +} + +FillEmitter::FillEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) : + jit_emitter(h, isa, n, Precision::FP32, emitter_in_out_map::vec_to_vec) { + const auto fill = ov::as_type_ptr(n); + if (fill->get_element_type().size() != 4) { + IE_THROW() << "Fill emitter supports only 4 Byte element types but gets: " << fill->get_element_type(); + } + + offset = fill->get_offset(); + fill_value = fill->get_fill_value(); + prepare_table(); +} + +size_t FillEmitter::aux_gprs_count() const { + // + 1 reg for temp reg for mask in avx512 + return one_of(host_isa_, dnnl::impl::cpu::x64::avx512_core) ? 
2 : 1; +} + +void FillEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + if (host_isa_ == dnnl::impl::cpu::x64::sse41) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { + emit_isa(in, out); + } else if (host_isa_ == dnnl::impl::cpu::x64::avx512_core) { + emit_isa(in, out); + } else { + IE_THROW() << "Fill emitter doesn't support " << host_isa_; + } +} + +template +void FillEmitter::emit_isa(const std::vector &in, const std::vector &out) const { + using Vmm = typename dnnl::impl::utils::conditional3::type; + + Vmm src_vmm = Vmm(in[0]); + Vmm dst_vmm = Vmm(out[0]); + + if (one_of(host_isa_, dnnl::impl::cpu::x64::avx512_core)) { + uint64_t tail_mask = 1; + tail_mask = ~((tail_mask << offset) - tail_mask); + h->mov(Reg64(aux_gpr_idxs[0]), tail_mask); + h->kmovq(k_mask, Reg64(aux_gpr_idxs[0])); + h->vblendmps(dst_vmm | k_mask, src_vmm, table_val("value")); + } else if (one_of(host_isa_, dnnl::impl::cpu::x64::avx2, dnnl::impl::cpu::x64::sse41)) { + uint8 imm = 1; + imm = ~((imm << offset) - imm); // shift load_num bit + if (host_isa_ == dnnl::impl::cpu::x64::sse41 && src_vmm.getIdx() != dst_vmm.getIdx()) { + h->uni_vmovups(dst_vmm, src_vmm); + src_vmm = Vmm(dst_vmm.getIdx()); + } + h->uni_vblendps(dst_vmm, src_vmm, table_val("value"), imm); + } +} + +void FillEmitter::register_table_entries() { + push_arg_entry_of("value", fill_value, true); +} } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp index adfd88dfedd..51b2d2d7840 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp @@ -12,6 +12,11 @@ #include "jit_load_store_emitters.hpp" #include "snippets_transformations/op/store_convert.hpp" +// 
Matmul support: +#include +#include +#include +#include using namespace Xbyak; using ngraph::snippets::AllocatedEmitter; @@ -23,47 +28,49 @@ namespace intel_cpu { #define SNIPPETS_MAX_SNIPPETS_DIMS 12 #define SNIPPETS_MAX_HARNESS_DIMS 5 #define SNIPPETS_MAX_TILE_RANK 2 +#define SNIPPETS_DYNAMIC_MASTER_SHAPE_RANK 6 #define GET_OFF(field) offsetof(jit_snippets_call_args, field) struct jit_snippets_call_args { const void *src_ptrs[SNIPPETS_MAX_SNIPPETS_DIMS] = {}; void *dst_ptrs[SNIPPETS_MAX_SNIPPETS_DIMS] = {}; + void *buffer_scratchpad_ptr = nullptr; }; struct jit_snippets_compile_args { - int64_t scheduler_dims[SNIPPETS_MAX_TILE_RANK] = {}; - int64_t scheduler_offsets[SNIPPETS_MAX_SNIPPETS_DIMS] = {}; - int64_t data_offsets[SNIPPETS_MAX_SNIPPETS_DIMS * SNIPPETS_MAX_HARNESS_DIMS] = {}; - std::vector output_dims = {}; + std::vector master_shape{}; + size_t tile_rank = 0; }; /// -/// \brief jit_container_emitter designed to wrap Emitters that contain other Emitters (presently KernelEmitter, -/// TileSchedulerEmitter and TileEmitter). This is needed to provide common interface for register mapping +/// \brief jit_container_emitter designed to wrap Emitters that contain other Emitters (for example, KernelEmitter) +/// This is needed to provide common interface for register mapping /// (abstract to physical) and nested code access. /// class jit_container_emitter: public jit_emitter { public: jit_container_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + // mapping info contains abstract_to_physical map + regs_pool + using mapping_info = std::pair, std::vector&>; protected: // maps gpr and vec abstract registers to physical ones. Physical reg indexes are taken from the provided pools // (the first 2 args). All the used gpr and vec registers are also stored in the provided sets (the second 2 args). 
- void map_abstract_registers(const std::vector&, const std::vector&, - std::set&, std::set&); + void map_abstract_registers(mapping_info& gpr_map_pool, mapping_info& vec_map_pool, + std::vector& allocated_emitters) const; std::vector body; }; /// /// \brief Kernel is the only entry point to Codogen Jit compilation. Kernel perform abstract-to-physical register -/// mapping and creates pools of available gpr and vec registers. Kernel is expected to contain (at least one) -/// TileSchedulerEmitter. In general the enclosed emitters should be organized in the following way: -/// KernelEmitter { /* entry point, maps registers, creates pools of available registers */ -/// TileSchedulerEmitter { /* executes required inner, avoids emitting code that won't be executed */ -/// TileEmitter { /* inner vector tile */ -/// ... /* All the necessary Load/Strore/elementwise emitters */ -/// } -/// TileEmitter { /* inner scalar tile for tail processing */ -/// ... /* All the necessary Load/Strore/elementwise emitters */ -/// } -/// } +/// mapping and creates a pools of available gpr and vec registers. Kernel usually contains (at least one) +/// LoopBeginEmitter and LoopEndEmitter pair. In general the enclosed emitters should be organized in the following way: +/// KernelEmitter { /* entry point, maps registers, creates pools of available registers */ +/// 1.S LoopBeginEmitter /* Scalar Loop over the outer dimension [START] */ +/// 2.S LoopBeginEmitter /* inner vector loop [START] */ +/// ... /* All the necessary Load/Strore/elementwise emitters */ +/// 2.E LoopEndEmitter /* inner vector loop [END] */ +/// 3.S LoopBeginEmitter /* inner scalar loop for tail processing [START]*/ +/// ... /* All the necessary Load/Strore/elementwise emitters */ +/// 3.E LoopEndEmitter /* inner scalar loop for tail processing [END]*/ +/// 1.E LoopEndEmitter /* Scalar Loop over the outer dimension [END] */ /// } /// Note that Kernel doesn't accept any input arguments. 
/// @@ -88,33 +95,36 @@ private: const std::vector& pool, const std::vector& gpr, const ov::intel_cpu::emitter_context *emit_context) const override; - void init_data_pointers(size_t, size_t, const Reg64&, const Reg64&, const std::vector&) const; + void init_data_pointers(size_t, size_t, bool, const Reg64&, const Reg64&, const std::vector&) const; jit_snippets_compile_args jcp; std::vector gp_regs_pool; - std::vector gp_regs_used; + size_t num_inputs; + size_t num_outputs; + bool is_buffer_needed; + // Vector of indices (lenght = input tensor rank) per every input and output that describes in which order + // corresponding tensor dimensions are accessed (default: consecutive dense, e.g. 0,1,2,3 for 4D tensor). + // Needed to calc i/o offsets. + std::vector> data_layout; + std::vector> io_shapes = {}; + std::vector io_data_size {}; + + // gpr's used to store data pointers, track them to apply offsets in Kernel + std::vector data_ptr_regs_idx; std::vector vec_regs_pool; + const size_t reg_indexes_idx = abi_param1.getIdx(); + const size_t reg_const_params_idx = abi_param2.getIdx(); }; -/// -/// \brief TileSchedulerEmitter contains Tiles to be executed (presently vector and scalar). It calculates data offsets -/// and work amounts, performs data pointer decrements if necessary. It also performs some Tile optimizations: scalar/vector -/// tiles are emitted only if necessary; Tile body could be emitted directly, if only one Tile evaluation is required. 
-/// -/// \param in[0] The number of the node inputs -/// \param in[1] The number of the node outputs -/// \param in[2] The number of elements that fits into vector register -/// -class TileSchedulerEmitter : public jit_container_emitter { +class LoopBeginEmitter : public jit_emitter { public: - TileSchedulerEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const std::shared_ptr& n); - - size_t get_inputs_num() const override {return 0;} + LoopBeginEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); void emit_code(const std::vector &in, const std::vector &out, const std::vector &pool, const std::vector &gpr) const override; + // todo: it is purely virtual in the base class, but do we need it? + size_t get_inputs_num() const override {return 0;} private: void validate_arguments(const std::vector &in, @@ -127,50 +137,49 @@ private: const std::vector& gpr, const ov::intel_cpu::emitter_context *emit_context) const override; - void emit_tiles(const Reg64&, const std::vector&, size_t, const std::vector& , const std::vector&) const; - - jit_snippets_compile_args jcp; + std::shared_ptr loop_begin; + size_t num_inputs = 0; + bool evaluate_once = false; + size_t work_amount = 0; // need to store work_amount explicitly, since two loops can work on the same dim (e.g. vector + scalar) }; -/// -/// \brief Tile is designed to organize loop over the input and output data. It is essentially a for(...) loop: -/// it performs operations specified by enclosed emitters, advances iteration counters -/// and breaks when necessary. -/// -/// \param in[0] The number of input entities (or scheduler counts) processed during one iteration of the tile. -/// It is expected to be 1 for outer or scalar tiles and vlen for vector tiles. 
-class TileEmitter : public jit_container_emitter { +class LoopEndEmitter : public jit_emitter { public: - TileEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); - - size_t get_inputs_num() const override {return 0;} - std::vector& get_nested_code(); + LoopEndEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); void emit_code(const std::vector &in, const std::vector &out, const std::vector &pool, const std::vector &gpr) const override; - - void emit_body(const std::vector& vec_pool, const std::vector& gpr_pool) const; - void emit_ptr_increments(const std::vector& data_ptr_regs) const; + // todo: it is purely virtual in the base class, but do we need it? + size_t get_inputs_num() const override {return 0;} private: void validate_arguments(const std::vector &in, const std::vector &out, const std::vector &pool, const std::vector &gpr) const override; + void emit_impl(const std::vector& in, const std::vector& out, const std::vector& pool, const std::vector& gpr, const ov::intel_cpu::emitter_context *emit_context) const override; + std::shared_ptr loop_begin; + std::shared_ptr loop_end; + size_t num_inputs = 0; size_t num_outputs = 0; - std::vector io_dims {}; - std::vector io_data_size {}; - size_t increment = 0; + // keep data_size int64_t to avoid conversion to size_t (and overflow) when multiplied by negative increments or offsets + std::vector io_data_size {}; + size_t wa_increment = 0; + size_t work_amount = 0; + bool evaluate_once = false; + std::vector ptr_increments; + std::vector finalization_offsets; }; + class NopEmitter : public jit_emitter { public: NopEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) @@ -205,7 +214,6 @@ private: void emit_isa(const std::vector &in, const std::vector &out) const; private: - bool use_broadcast; size_t byte_size = 0lu; }; @@ -239,7 +247,7 @@ 
private: /// it's illigal to load/store to the same address multiple times /// Typical application can be if Load and BroadcastLoad are performed from the same pointer. /// If Load goes before BroadcastLoad topologicaly the resilt will be incorrect -/// For scalar loads we can use different tiles. Tiling indeed can be arbitrary and post increment should be somehow coded into ISA. +/// For scalar loads we can use different loops. Tiling indeed can be arbitrary and post increment should be somehow coded into ISA. /// Blocked parameter to tell if input is actually blocked. Broadcast means broadcast by W in other cases no need to substitute load. class MemoryEmitter : public jit_emitter { public: @@ -248,6 +256,9 @@ public: protected: Precision src_prc; Precision dst_prc; + + size_t count = 0; + size_t byte_offset = 0; }; class StoreEmitter : public MemoryEmitter { @@ -268,7 +279,6 @@ private: void emit_data() const override; private: - size_t count; std::unique_ptr store_emitter = nullptr; }; @@ -290,7 +300,6 @@ private: void emit_data() const override; private: - size_t count; std::unique_ptr load_emitter = nullptr; }; @@ -329,7 +338,6 @@ private: void emit_data() const override; private: - size_t count; std::unique_ptr load_emitter = nullptr; }; @@ -351,9 +359,141 @@ private: void emit_data() const override; private: - size_t count; std::unique_ptr store_emitter = nullptr; }; +class BrgemmEmitter : public jit_emitter { +public: + BrgemmEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 2;} + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in, const std::vector &out) const; + std::vector io_data_size {}; + struct brgemmCtx { + size_t M, N, K, LDA, LDB, LDC; + 
dnnl_data_type_t dt_in0, dt_in1; + char palette[64]; + bool is_with_amx; + bool is_with_comp; + float beta; + }; + void initBrgemm(brgemmCtx& ctx, std::unique_ptr& brgKernel, bool use_amx) const; + template + void callBrgemm(brgemmCtx& ctx, std::unique_ptr& brgKernel, const void* pin0, const void* pin1, void* pout, void* wsp) const; + size_t getBrgIdx(size_t mIdx, size_t kIdx, size_t nIdx) const; + template + void emit_brgemm_kernel_call(const brgemm_kernel_t *brg_kernel, int bs, + Reg64 addr_A, Reg64 addr_B, + const brgemm_batch_element_t *batch, Reg64 addr_C, void *scratch, + const size_t in0_kernel_offset, const size_t in1_kernel_offset, const size_t out0_kernel_offset) const; + static void kernel_execute(const brgemm_kernel_t *brg_kernel, const void *A, const void *B, void *C); + static constexpr size_t BRGEMM_KERNELS_NUM = 8; + static constexpr size_t matmulOptimalM = 32; + brgemmCtx brgCtxs0[BRGEMM_KERNELS_NUM]; + std::unique_ptr brgKernels0[BRGEMM_KERNELS_NUM]; + + size_t M, M_blk, M_tail; + size_t K, K_blk, K_tail; + size_t N, N_blk, N_tail; + size_t brg0VnniFactor; + + size_t load_offset_a = 0lu; + size_t load_offset_b = 0lu; + size_t store_offset_c = 0lu; +}; + +class HorizonMaxEmitter : public jit_emitter { +public: + HorizonMaxEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 1;} + +protected: + size_t aux_gprs_count() const override {return 1;} + size_t aux_vecs_count() const override {return 1;} + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in, const std::vector &out) const; +}; + +class HorizonSumEmitter : public jit_emitter { +public: + HorizonSumEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const 
std::shared_ptr& n); + + size_t get_inputs_num() const override {return 1;} + +protected: + size_t aux_gprs_count() const override {return 1;} + size_t aux_vecs_count() const override {return 1;} + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in, const std::vector &out) const; +}; + +class VectorBufferEmitter : public jit_emitter { +public: + VectorBufferEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 0;} + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in, const std::vector &out) const; +}; + +class FillEmitter : public jit_emitter { +public: + FillEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + + size_t get_inputs_num() const override {return 1;} + +protected: + size_t aux_gprs_count() const override; + +private: + void emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in, const std::vector &out) const; + + void register_table_entries() override; + + size_t offset = 0; + uint32_t fill_value = 0x0; +}; + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/extension.cpp b/src/plugins/intel_cpu/src/extension.cpp index eafce8f3aeb..2e27effd8b5 100644 --- a/src/plugins/intel_cpu/src/extension.cpp +++ b/src/plugins/intel_cpu/src/extension.cpp @@ -131,19 +131,26 @@ std::map 
Extension::getOpSets() { ngraph::OpSet opset; #define NGRAPH_OP(NAME, NAMESPACE) opset.insert(); + NGRAPH_OP(Brgemm, ngraph::snippets::op) NGRAPH_OP(BroadcastLoad, ngraph::snippets::op) NGRAPH_OP(BroadcastMove, ngraph::snippets::op) + NGRAPH_OP(Buffer, ngraph::snippets::op) NGRAPH_OP(ConvertSaturation, ngraph::snippets::op) NGRAPH_OP(ConvertTruncation, ngraph::snippets::op) + NGRAPH_OP(Fill, ngraph::snippets::op) + NGRAPH_OP(HorizonMax, ngraph::snippets::op) + NGRAPH_OP(HorizonSum, ngraph::snippets::op) NGRAPH_OP(Kernel, ngraph::snippets::op) NGRAPH_OP(Load, ngraph::snippets::op) + NGRAPH_OP(LoadReshape, ngraph::snippets::op) + NGRAPH_OP(LoopBegin, ngraph::snippets::op) + NGRAPH_OP(LoopEnd, ngraph::snippets::op) NGRAPH_OP(Nop, ngraph::snippets::op) NGRAPH_OP(PowerStatic, ngraph::snippets::op) NGRAPH_OP(Scalar, ngraph::snippets::op) NGRAPH_OP(Store, ngraph::snippets::op) NGRAPH_OP(Subgraph, ngraph::snippets::op) - NGRAPH_OP(Tile, ngraph::snippets::op) - NGRAPH_OP(TileScheduler, ngraph::snippets::op) + NGRAPH_OP(VectorBuffer, ngraph::snippets::op) #undef NGRAPH_OP return opset; diff --git a/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp b/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp index 54f046c984a..71fdc039a4b 100644 --- a/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp +++ b/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // #include "snippets_mark_skipped.hpp" -#include "snippets/pass/collapse_subgraph.hpp" +#include "snippets/pass/tokenization.hpp" #include "snippets/op/subgraph.hpp" #include "snippets/utils.hpp" #include @@ -81,7 +81,7 @@ bool canBePerformedAsScaleShift(const std::shared_ptr &node, const i fusingPort = i; dataShape = node->get_input_partial_shape(i); // only one non-const parent is allowed - if (dataShape.is_dynamic() || ++numNonConstInputs != 1) + if (++numNonConstInputs != 1) 
return false; } else { // every const parent must have exactly one child @@ -97,8 +97,7 @@ bool canBePerformedAsScaleShift(const std::shared_ptr &node, const i if (i == fusingPort) continue; const ov::PartialShape weightShape = node->get_input_partial_shape(i); - if (weightShape.is_dynamic() || - !isPerTensorOrPerChannelBroadcastable(dataShape.get_shape(), weightShape.get_shape(), channelAxis, true)) + if (!isPerTensorOrPerChannelBroadcastable(dataShape.get_max_shape(), weightShape.get_max_shape(), channelAxis, true)) return false; } return true; @@ -250,22 +249,20 @@ bool isSuitableChildForFusingMatMul(const std::shared_ptr &node, con NodeFusingType &updatedChainType, int& fusingAxis) { int num_non_const_inputs = 0; bool can_be_converted_to_FC = false; - ov::Shape bias_shape; - ov::Shape matmul_shape; + ov::PartialShape bias_shape; + ov::PartialShape matmul_shape; for (const auto &parent_out : node->input_values()) { const auto parent = parent_out.get_node_shared_ptr(); if (ngraph::op::is_constant(parent)) { bias_shape = parent_out.get_shape(); num_non_const_inputs++; } else { - const auto pshape = parent_out.get_partial_shape(); - if (pshape.is_dynamic() || pshape.get_shape().empty()) + matmul_shape = parent_out.get_partial_shape(); + if (matmul_shape.size() == 0) return false; - matmul_shape = pshape.get_shape(); const auto& grandparents = parent->input_values(); // first check that weights are constant and both activations and weights have static shape if (grandparents.size() == 2 && - grandparents[0].get_partial_shape().is_static() && grandparents[1].get_partial_shape().is_static() && ov::is_type(grandparents[1].get_node_shared_ptr())) { auto rank_a = grandparents[0].get_partial_shape().rank().get_length(); @@ -280,8 +277,9 @@ bool isSuitableChildForFusingMatMul(const std::shared_ptr &node, con // Matmul / FC bias fusion if (ov::is_type(node) && - bias_shape.back() == matmul_shape.back() && - bias_shape.back() == shape_size(bias_shape)) { + 
bias_shape.rbegin()->get_length() == matmul_shape.rbegin()->get_length() && + bias_shape.is_static() && + bias_shape.rbegin()->get_length() == shape_size(bias_shape.get_shape())) { return true; } @@ -431,7 +429,7 @@ bool SnippetsMarkSkipped::run_on_model(const std::shared_ptr &m) { RUN_ON_MODEL_SCOPE(SnippetsMarkSkipped); int channelAxis = DEFAULT_AXIS; for (auto &node : m->get_ordered_ops()) { - if (ngraph::op::is_constant(node)) + if (ngraph::op::is_constant(node) || ov::is_type(node)) continue; if (isSuitableConvolutionParent(node)) { // Initiate fusing chain diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 0443416f356..71e13fe7f07 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -22,6 +22,7 @@ #include #include "emitters/cpu_generator.hpp" +#include "utils/cpu_utils.hpp" #include "snippets_transformations/fuse_load_store_and_convert.hpp" #include "ngraph_transformations/convert_to_swish_cpu.hpp" @@ -64,6 +65,7 @@ void Snippet::copy_snippet() { ngraph::copy_runtime_info(original_snippet, snippet); snippet->set_friendly_name(original_snippet->get_friendly_name()); snippet->set_generator(std::make_shared(host_isa)); + isa_num_lanes = snippet->get_generator()->get_target_machine()->get_lanes(); } void Snippet::initSupportedPrimitiveDescriptors() { @@ -82,11 +84,19 @@ void Snippet::initSupportedPrimitiveDescriptors() { } const size_t ndims = outputShapes[0].getRank(); - const bool isChannelsFirstApplicable = dnnl::impl::utils::one_of(ndims, 1, 2, 3, 4, 5) && dimRanksAreEqual; + // Domain sensitive operations support only Planar layout + const bool isOnlyPlanarApplicable = snippet->has_domain_sensitive_ops(); + const bool isChannelsFirstApplicable = dnnl::impl::utils::one_of(ndims, 1, 2, 3, 4, 5) && dimRanksAreEqual && !isOnlyPlanarApplicable; // Todo: Snippets currently don't support per-channel broadcasting of Blocked descriptors because // 
canonicalization can't distinguish between and cases. // See snippets::op::Subgraph::canonicalize for details. - const bool isBlockedApplicable = dnnl::impl::utils::one_of(ndims, 4, 5) && dimRanksAreEqual; + bool isBlockedApplicable = dnnl::impl::utils::one_of(ndims, 4, 5) && dimRanksAreEqual && !isOnlyPlanarApplicable; + + for (const auto& inShape : inputShapes) { + if (isDynamic && inShape.getRank() != 1) + isBlockedApplicable = isBlockedApplicable && inShape.getMinDims()[1] != Shape::UNDEFINED_DIM && inShape.getMinDims()[1] > 1; + } + enum LayoutType { Planar, ChannelsFirst, @@ -189,42 +199,6 @@ void Snippet::initSupportedPrimitiveDescriptors() { void Snippet::selectOptimalPrimitiveDescriptor() { selectPreferPrimitiveDescriptor(getPrimitivesPriority(), true); } - -void Snippet::createPrimitive() { - // schedule definition part - // it defines offsets, strides and sizes for snippet kernel scheduling - define_schedule(); - - // code generation part - // it might be worth to generate explicitly for scheduler work amount for now, - // but in future some interface should be defined in order to communicate schedule for a kernel - // or generate schedule for a kernel. - // Here kernel is generated for most warying dimension by default. 
- generate(); -} - -void Snippet::execute(dnnl::stream strm) { - if (schedule.ptr == nullptr || !canUseOptimizedImpl) { - IE_THROW() << "Snippet can't use Optimized implementation and can't fallback to reference"; - } - jit_snippets_call_args call_args; - for (size_t i = 0; i < srcMemPtrs.size(); i++) - call_args.src_ptrs[i] = reinterpret_cast(srcMemPtrs[i]->GetData()) + start_offset_in[i]; - - for (size_t i = 0; i < dstMemPtrs.size(); i++) - call_args.dst_ptrs[i] = reinterpret_cast(dstMemPtrs[i]->GetData()) + start_offset_out[i]; - - if (tensorRank == rank6D) { - schedule_6d(call_args); - } else { - schedule_nt(call_args); - } -} - -bool Snippet::created() const { - return getType() == Type::Subgraph; -} - InferenceEngine::Precision Snippet::getRuntimePrecision() const { std::vector inputPrecisions; for (size_t i = 0; i < getParentEdges().size(); i++) { @@ -237,11 +211,268 @@ InferenceEngine::Precision Snippet::getRuntimePrecision() const { return getMaxPrecision(inputPrecisions); } -bool Snippet::canBeInPlace() const { - if (getParentEdgesAtPort(0)[0]->getParent()->getType() == Type::Input) { +bool Snippet::optimizeExecDomain(std::vector& inputShapes, std::vector& outputShapes, + VectorDims &domain, size_t& TileRank) const { + const size_t minimalConcurrency = parallel_get_max_threads(); + const size_t minimalJitWorkAmount = 256; + const size_t ds = domain.size(); + if ( ds <= 2 || // not enough dimensions to collapse + domain[ds-1] >= minimalJitWorkAmount || // There is enough work for 1D Tiles, no need to collapse + domain[ds-1] * domain[ds-2] >= fullWorkAmount / minimalConcurrency) // There won't be enough work for every thread (even one iter) if we collapse return false; + auto findDimsToCollapse = [&]() { + auto collapseLastDims = [](VectorDims& dims, size_t dimsToCollapse) { + if (dimsToCollapse >= dims.size() - 1) + IE_THROW() << "Got invalid number of dims to collapse. 
Expected < " << dims.size() - 1 << " got " << dimsToCollapse; + for (int i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) { + dims[dims.size() - 1] *= dims[i]; + } + + for (int i = dims.size() - 2; i >= dimsToCollapse; i--) { + dims[i] = dims[i - dimsToCollapse]; + } + + for (int i = dimsToCollapse - 1; i >= 0; i--) { + dims[i] = 1; + } + }; + int collapsedDims = 0; + size_t currentJitWorkAmount = domain[domain.size() - 1]; + while (currentJitWorkAmount < minimalJitWorkAmount && currentJitWorkAmount < fullWorkAmount) { + if (static_cast(domain.size()) - collapsedDims - 2 < 0) + break; + + bool canCollapse = true; + for (size_t i = 0; i < inputShapes.size(); i++) { + const size_t last = inputShapes[i].size() - 1; + if ((inputShapes[i][last - 1] != 1 && inputShapes[i][last] == 1) || + (inputShapes[i][last - 1] == 1 && inputShapes[i][last] != 1)) { + canCollapse = false; + break; + } + } + + size_t nextJitWorkAmount = currentJitWorkAmount * domain[domain.size() - 2]; + if (fullWorkAmount / nextJitWorkAmount >= minimalConcurrency) { + currentJitWorkAmount = nextJitWorkAmount; + // if we cannot use dim collapsing we should use tile2D + if (!canCollapse) { + if (TileRank < maxTileRank) { + TileRank++; + continue; + } + + break; + } + collapsedDims++; + for (auto &d : inputShapes) + collapseLastDims(d, 1); + for (auto &d : outputShapes) + collapseLastDims(d, 1); + collapseLastDims(domain, 1); + } else { + break; + } + } + return collapsedDims > 0; + }; + return findDimsToCollapse(); +} +ov::PartialShape Snippet::canonicalizeBody() { + auto edgeToBlockedShape = [](const EdgePtr& edge) { + const auto blockedDesc = edge->getMemory().GetDescWithType(); + std::vector dims; + // if blockDim == Shape::UNDEFINED_DIM, then it's a dynamic dimension, and we need to recreate a proper dynamic Dim + for (const auto& d : blockedDesc->getBlockDims()) + dims.emplace_back(d == Shape::UNDEFINED_DIM ? 
-1 : d); + ngraph::PartialShape shape(dims); + ngraph::AxisVector blocking(blockedDesc->getOrder()); + ngraph::element::Type precision = InferenceEngine::details::convertPrecision(blockedDesc->getPrecision()); + return ngraph::snippets::op::Subgraph::BlockedShape{shape, blocking, precision}; + }; + inputShapeIsBlocked.resize(inputShapes.size(), false); + masterShapeIsBlocked = false; + ngraph::snippets::op::Subgraph::BlockedShapeVector input_blocked_shapes; + for (size_t i = 0; i < inputShapes.size(); i++) { + auto blockedShape = edgeToBlockedShape(getParentEdgesAtPort(i)[0]); + inputShapeIsBlocked[i] = std::get<0>(blockedShape).size() != std::get<1>(blockedShape).size(); + masterShapeIsBlocked = masterShapeIsBlocked || inputShapeIsBlocked[i]; + input_blocked_shapes.push_back(blockedShape); } + outputShapeIsBlocked.resize(outputShapes.size(), false); + ngraph::snippets::op::Subgraph::BlockedShapeVector output_blocked_shapes; + for (size_t i = 0; i < outputShapes.size(); i++) { + auto blockedShape = edgeToBlockedShape(getChildEdgesAtPort(i)[0]); + outputShapeIsBlocked[i] = std::get<0>(blockedShape).size() != std::get<1>(blockedShape).size(); + output_blocked_shapes.push_back(blockedShape); + } + + const auto& canonicalShape = snippet->canonicalize(output_blocked_shapes, input_blocked_shapes); + return canonicalShape; +} +void Snippet::createPrimitive() { + // determine canonicalize, determine master_shape and prepend up to 6D + // NB! normInputShapes are updated, so body reshape might be needed + const auto& canonicalShape = canonicalizeBody(); + // initialize by maximum output dimension. 
Dimensions of outputs should be broadcastable + tensorRank = std::max(static_cast(rank6D), canonicalShape.size()); + + const auto config = getSelectedPrimitiveDescriptor()->getConfig(); + auto initDataSizes = [this, config]() { + const size_t numInputs = inputShapes.size(); + const size_t numOutputs = outputShapes.size(); + dataSize.resize(numInputs + numOutputs); + for (size_t i = 0; i < numInputs; i++) + dataSize[i] = config.inConfs[i].getMemDesc()->getPrecision().size(); + for (size_t i = 0; i < numOutputs; i++) + dataSize[i + numInputs] = config.outConfs[i].getMemDesc()->getPrecision().size(); + }; + initDataSizes(); + + jit_snippets_compile_args jcp; + if (canonicalShape.is_dynamic()) + IE_THROW() << "Snippets: Canonicalization returned dynamic shape in static pipeline"; + masterShape = canonicalShape.get_shape(); + const auto &body = snippet->body_ptr(); + for (const auto& p : body->get_parameters()) + normInputShapes.emplace_back(p->get_output_shape(0)); + for (const auto& r : body->get_results()) + normOutputShapes.emplace_back(r->get_input_shape(0)); + + prepareParams(); + jcp.master_shape = masterShape; + jcp.tile_rank = tileRank; + generate(&jcp); + buffer_scratchpad_size = snippet->get_buffer_scratchpad_size(); + buffer_scratchpad.resize(buffer_scratchpad_size * parallel_get_max_threads(), 0); +} + +std::vector Snippet::shapeInfer() const { + // todo: it's very strange that we don't have broadcast_merge_into for cpu shapes + auto broadcast_merge = [](VectorDims& dst, const VectorDims& src){ + // Ranks are both static. + auto dst_rank = dst.size(); + auto src_rank = src.size(); + const auto new_rank = std::max(dst_rank, src_rank); + dst.insert(dst.begin(), new_rank - dst_rank, 1); + std::vector dims(new_rank); + bool success = true; + for (int64_t i = 0; i < new_rank; i++) { + auto dsti = i < (new_rank - dst_rank) ? 1 : dst[i - (new_rank - dst_rank)]; + auto srci = i < (new_rank - src_rank) ? 
1 : src[i - (new_rank - src_rank)]; + if (dsti != srci && srci != Shape::UNDEFINED_DIM) { + if (dsti == 1 || dsti == Shape::UNDEFINED_DIM) { + dsti = srci; + } else { + success = false; + } + } + } + return success; + }; + for (size_t i = 0; i < getParentEdges().size(); i++) { + VectorDims inDims {getParentEdgesAtPort(i)[0]->getMemory().GetShape().getDims()}; + if (masterShapeIsBlocked && !inputShapeIsBlocked[i]) + inDims.insert(inDims.end(), 1); + // todo: this is a simple master_shape inference for shape-agnostic operations, + // we'll need to account for body operations semantics in the future + if (i == 0) + masterShape = inDims; + else + broadcast_merge(masterShape, inDims); + normInputShapes[i] = std::move(inDims); + } + if (std::any_of(masterShape.begin(), masterShape.end(), [](const Dim& d){ return d == Shape::UNDEFINED_DIM;})) { + std::ostringstream errorMessage; + errorMessage << "Can't compute static master shape for Snippet node with name: " << getName(); + errorMessage << ". Input shapes = ( "; + for (size_t i = 0; i < getParentEdges().size(); i++) { + errorMessage << i << " port = " << getParentEdgesAtPort(i)[0]->getMemory().GetShape().toString() << ", "; + } + errorMessage << "). 
Master shape = ( " << Shape(masterShape).toString() << " )"; + IE_THROW() << errorMessage.str(); + } + + if (normOutputShapes.size() == 1) { + normOutputShapes[0] = masterShape; + return {masterShape}; + } + std::vector outputDims; + std::vector new_shapes; + for (const auto& s : normInputShapes) + new_shapes.emplace_back(s); + const auto& outputShapes = snippet->reshape_body(new_shapes); + for (size_t i = 0; i < outputShapes.size(); i++) + normOutputShapes[i] = outputShapes[i]; + return normOutputShapes; +} + +void Snippet::prepareParams() { + masterShape = getNormalizedDimsBySize(masterShape, tensorRank); + for (auto& pshape : normInputShapes) + pshape = getNormalizedDimsBySize(pshape, tensorRank); + for (auto& pshape : normOutputShapes) + pshape = getNormalizedDimsBySize(pshape, tensorRank); + + tileRank = 1; + fullWorkAmount = std::accumulate(masterShape.begin(), masterShape.end(), 1, std::multiplies()); + if (snippet->has_domain_sensitive_ops()) { + tileRank = 2; + } else { + optimizeExecDomain(normInputShapes, normOutputShapes, masterShape, tileRank); + } + exec_domain = masterShape; + + auto initStartMemoryOffsets = [this]() { + const auto config = getSelectedPrimitiveDescriptor()->getConfig(); + const size_t numInputs = inputShapes.size(); + start_offset_in.resize(numInputs); + srcMemPtrs.resize(numInputs); + for (size_t i = 0; i < numInputs; i++) { + const auto memPtr = getParentEdgeAt(i)->getMemoryPtr(); + srcMemPtrs[i] = memPtr; + start_offset_in[i] = memPtr->GetDescWithType()->getOffsetPadding() * dataSize[i]; + } + const size_t numOutputs = outputShapes.size(); + start_offset_out.resize(numOutputs); + dstMemPtrs.resize(numOutputs); + for (size_t i = 0; i < numOutputs; i++) { + const auto memPtr = getChildEdgeAt(i)->getMemoryPtr(); + dstMemPtrs[i] = memPtr; + start_offset_out[i] = memPtr->GetDescWithType()->getOffsetPadding() * dataSize[i + numInputs]; + } + }; + // initialize start offsets to src and dst memory + // Needs to be done for every set of 
input shapes sce memory ptrs could've updated + initStartMemoryOffsets(); + std::vector scheduler_work_amounts; + // rename schedulerWorkAmount to harnessWorkAmount? + harnessWorkAmount = fullWorkAmount; + const auto rank = exec_domain.size(); + for (auto i = rank - tileRank; i < rank; i++) { + auto& dim = exec_domain[i]; + harnessWorkAmount /= dim; + scheduler_work_amounts.push_back(dim); + dim = 1; + } + + auto& body_rt_info = snippet->body_ptr()->get_rt_info(); + std::vector> new_shapes(normInputShapes); + std::copy(normOutputShapes.begin(), normOutputShapes.end(), std::back_inserter(new_shapes)); + body_rt_info["PluginShapesOverride"] = new_shapes; + snippet->set_master_shape(ov::PartialShape(masterShape)); + snippet->set_tile_rank(tileRank); +} + +bool Snippet::needPrepareParams() const { + return inputShapesModified() || !schedule.ptr; +} + +bool Snippet::canBeInPlace() const { + if (isDynamic || getParentEdgesAtPort(0)[0]->getParent()->getType() == Type::Input) { + return false; + } if (getChildEdges().size() != 1) { return false; } @@ -263,238 +494,11 @@ bool Snippet::canBeInPlace() const { return getInputShapeAtPort(0) == getOutputShapeAtPort(0); } -static void offset_calculation(std::vector& offset, const std::vector& dims_in, const std::vector& dims_out) { - size_t k = 1; - for (int i = offset.size() - 1; i >= 0; i--) { - offset[i] = (dims_in[i] == dims_out[i]) ? k : 0; - k *= dims_in[i]; - } +bool Snippet::created() const { + return getType() == Type::Subgraph; } -static auto collapseLastDims(std::vector& dims, size_t dimsToCollapse) -> void { - if (dimsToCollapse >= dims.size() - 1) - IE_THROW() << "Got invalid number of dims to collapse. 
Expected < " << dims.size() - 1 << " got " << dimsToCollapse; - for (int i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) { - dims[dims.size() - 1] *= dims[i]; - } - - for (int i = dims.size() - 2; i >= dimsToCollapse; i--) { - dims[i] = dims[i - dimsToCollapse]; - } - - for (int i = dimsToCollapse - 1; i >= 0; i--) { - dims[i] = 1; - } -} - -void Snippet::define_schedule() { - auto edgeToBlockedShape = [](const EdgePtr& edge) { - const auto blockedDesc = edge->getMemory().GetDescWithType(); - ngraph::Shape shape(blockedDesc->getBlockDims()); - ngraph::AxisVector blocking(blockedDesc->getOrder()); - ngraph::element::Type precision = InferenceEngine::details::convertPrecision(blockedDesc->getPrecision()); - return ngraph::snippets::op::Subgraph::BlockedShape{shape, blocking, precision}; - }; - auto prependWithOnes = [this](const std::vector& dims) { - if (tensorRank <= dims.size()) - return dims; - VectorDims result(tensorRank, 1); - std::copy(dims.begin(), dims.end(), &result[tensorRank - dims.size()]); - return result; - }; - ngraph::snippets::op::Subgraph::BlockedShapeVector input_blocked_shapes; - for (size_t i = 0; i < inputShapes.size(); i++) - input_blocked_shapes.push_back(edgeToBlockedShape(getParentEdgesAtPort(i)[0])); - - ngraph::snippets::op::Subgraph::BlockedShapeVector output_blocked_shapes; - for (size_t i = 0; i < outputShapes.size(); i++) - output_blocked_shapes.push_back(edgeToBlockedShape(getChildEdgesAtPort(i)[0])); - - exec_domain = snippet->canonicalize(output_blocked_shapes, input_blocked_shapes); - - // initialize by maximum output dimension. 
Dimensions of outputs should be broadcastable - tensorRank = std::max(static_cast(rank6D), exec_domain.size()); - // Canonicalization broadcasts inputs and outputs to max input rank, which can be smaller than tensorRank - // prepend to enable 6D scheduler - exec_domain = prependWithOnes(exec_domain); - const auto &body = snippet->body(); - for (const auto& p : body.get_parameters()) { - dims_in.emplace_back(prependWithOnes(p->get_shape())); - } - - for (size_t i = 0; i < body.get_output_size(); i++) { - dims_out.push_back(prependWithOnes(body.get_output_shape(i))); - } - - const auto config = getSelectedPrimitiveDescriptor()->getConfig(); - auto initOffsets = [this, config]() { - // find max rank input among all outputs - const size_t inputNum = getParentEdges().size(); - offsets_in.resize(inputNum); - for (size_t i = 0; i < inputNum; i++) { - offsets_in[i].resize(tensorRank, 1); - offset_calculation(offsets_in[i], dims_in[i], exec_domain); - for (size_t j = 0; j < tensorRank; j++) { - offsets_in[i][j] *= config.inConfs[i].getMemDesc()->getPrecision().size(); - } - } - - start_offset_in.resize(inputNum); - srcMemPtrs.resize(inputNum); - for (size_t i = 0; i < inputNum; i++) { - const auto memPtr = getParentEdgeAt(i)->getMemoryPtr(); - srcMemPtrs[i] = memPtr; - start_offset_in[i] = memPtr->GetDescWithType()->getOffsetPadding() * - config.inConfs[i].getMemDesc()->getPrecision().size(); - } - - const size_t outputNum = config.outConfs.size(); - offsets_out.resize(outputNum); - for (size_t i = 0; i < outputNum; i++) { - offsets_out[i].resize(tensorRank, 1); - offset_calculation(offsets_out[i], dims_out[i], exec_domain); - for (size_t j = 0; j < tensorRank; j++) { - offsets_out[i][j] *= config.outConfs[i].getMemDesc()->getPrecision().size(); - } - } - - start_offset_out.resize(outputNum); - dstMemPtrs.resize(outputNum); - for (size_t i = 0; i < outputNum; i++) { - const auto memPtr = getChildEdgeAt(i)->getMemoryPtr(); - dstMemPtrs[i] = memPtr; - start_offset_out[i] = 
memPtr->GetDescWithType()->getOffsetPadding() * - config.outConfs[i].getMemDesc()->getPrecision().size(); - } - }; - - auto find_dims_to_collapse = [this, config]() -> int { - int collapsedDims = 0; - size_t minimalConcurrency = parallel_get_max_threads(); - size_t minimalJitWorkAmount = 256; - size_t currentJitWorkAmount = exec_domain.back(); - while (currentJitWorkAmount < minimalJitWorkAmount && currentJitWorkAmount < fullWorkAmount) { - if (static_cast(exec_domain.size()) - collapsedDims - 2 < 0) - break; - - bool canCollapse = true; - for (size_t i = 0; i < dims_in.size(); i++) { - if ((dims_in[i][dims_in[i].size() - 2] != 1 && dims_in[i][dims_in[i].size() - 1] == 1) || - (dims_in[i][dims_in[i].size() - 2] == 1 && dims_in[i][dims_in[i].size() - 1] != 1)) { - canCollapse = false; - break; - } - } - - size_t nextJitWorkAmount = currentJitWorkAmount * exec_domain[exec_domain.size() - 2]; - if (fullWorkAmount / nextJitWorkAmount >= minimalConcurrency) { - currentJitWorkAmount = nextJitWorkAmount; - // if we cannot use dim collapsing we should use tile2D - if (!canCollapse) { - if (tileRank < maxTileRank) { - tileRank++; - continue; - } - - break; - } - - collapsedDims++; - for (auto &d : dims_in) - collapseLastDims(d, 1); - - for (auto &d : dims_out) - collapseLastDims(d, 1); - - collapseLastDims(exec_domain, 1); - } else { - break; - } - } - return collapsedDims; - }; - - auto initSchedulingInfo = [this, config]() -> void { - // initialize scheduling information - sch_offsets_in.resize(offsets_in.size(), 0); - sch_offsets_out.resize(offsets_out.size(), 0); - sch_dims.resize(maxTileRank, 1); - sch_dims[maxTileRank-1] = exec_domain.back(); - schedulerWorkAmount = fullWorkAmount / exec_domain.back(); - if (tileRank > 1) { - sch_dims[maxTileRank - tileRank] = exec_domain[tensorRank - 2]; - schedulerWorkAmount /= exec_domain[tensorRank - 2]; - exec_domain[tensorRank - 2] = 1; - - // update offsets for tile 2D because loaders and stores have ptr shifts in some cases - 
const int64_t vector_size = snippet->get_generator()->get_target_machine()->get_lanes(); - for (size_t i = 0; i < offsets_in.size(); i++) { - const int64_t offset = offsets_in[i][tensorRank - 2]; - const int64_t data_size = config.inConfs[i].getMemDesc()->getPrecision().size(); - if (offset == data_size || offset == vector_size * data_size) { - sch_offsets_in[i] = offset; - } else if ((offset > data_size) || (offset == 0 && dims_in[i].back() != 1 && dims_in[i].back() != vector_size)) { - sch_offsets_in[i] = offset - exec_domain.back() * data_size; - - // If scalar tile executes one time, ptr doesn't move on 1 value - // so we should absolutelly decrease offset - if (exec_domain.back() % vector_size == 1) { - sch_offsets_in[i] += data_size; - } - } - } - - for (size_t i = 0; i < offsets_out.size(); i++) { - const int64_t offset = offsets_out[i][tensorRank - 2]; - const size_t data_size = config.outConfs[i].getMemDesc()->getPrecision().size(); - if (offset == data_size || offset == vector_size * data_size) { - sch_offsets_out[i] = offset; - } else if ((offset > data_size) || (offset == 0 && dims_out[i].back() != 1 && dims_out[i].back() != vector_size)) { - sch_offsets_out[i] = offset - exec_domain.back() * data_size; - - // If scalar tile executes one time, ptr doesn't move on 1 value - // so we should absolutelly decrease offset - if (exec_domain.back() % vector_size == 1) { - sch_offsets_out[i] += data_size; - } - } - } - } - }; - - fullWorkAmount = 1; - for (const auto &d : exec_domain) { - fullWorkAmount *= d; - } - - batchDimIdx = tensorRank - exec_domain.size(); - // Note that exec_domain can be modified inside find_dims_to_collapse() and/or initSchedulingInfo() - find_dims_to_collapse(); - - initOffsets(); - initSchedulingInfo(); -} - -void Snippet::generate() { - jit_snippets_compile_args jcp; - jcp.output_dims = exec_domain; - std::copy(sch_dims.begin(), sch_dims.end(), jcp.scheduler_dims); - std::copy(sch_offsets_in.begin(), sch_offsets_in.end(), 
jcp.scheduler_offsets); - std::copy(sch_offsets_out.begin(), sch_offsets_out.end(), &jcp.scheduler_offsets[sch_offsets_in.size()]); - size_t harness_num_dims = jcp.output_dims.size() - 1; - if (harness_num_dims > SNIPPETS_MAX_HARNESS_DIMS) { - canUseOptimizedImpl = false; - harness_num_dims = SNIPPETS_MAX_HARNESS_DIMS; - } - for (size_t i = 0; i < inputShapes.size(); i++) { - auto b = offsets_in[i].begin(); - std::copy(b, b + harness_num_dims, &jcp.data_offsets[i * harness_num_dims]); - } - for (size_t i = 0; i < outputShapes.size(); i++) { - auto b = offsets_out[i].begin(); - std::copy(b, b + harness_num_dims, &jcp.data_offsets[(inputShapes.size() + i) * harness_num_dims]); - } - +void Snippet::generate(const jit_snippets_compile_args* jcp) { ov::pass::Manager optManager; optManager.register_pass(); optManager.register_pass(); @@ -515,25 +519,54 @@ void Snippet::generate() { return convert->get_input_element_type(0) != ov::element::f32; return true; }); - - schedule = snippet->generate(optManager, reinterpret_cast(&jcp)); + schedule = snippet->generate(optManager, reinterpret_cast(jcp)); } -void Snippet::schedule_6d(const jit_snippets_call_args& call_args) const { +void Snippet::update_ptrs(jit_snippets_call_args& call_args) { + for (size_t i = 0; i < srcMemPtrs.size(); i++) + call_args.src_ptrs[i] = reinterpret_cast(srcMemPtrs[i]->GetData()) + start_offset_in[i]; + + for (size_t i = 0; i < dstMemPtrs.size(); i++) + call_args.dst_ptrs[i] = reinterpret_cast(dstMemPtrs[i]->GetData()) + start_offset_out[i]; + + if (buffer_scratchpad_size > 0) { + call_args.buffer_scratchpad_ptr = + reinterpret_cast(buffer_scratchpad.data()) + parallel_get_thread_num() * buffer_scratchpad_size; + } +} + +void Snippet::execute(dnnl::stream strm) { + if (schedule.ptr == nullptr) { + IE_THROW() << "Snippet can't use Optimized implementation and can't fallback to reference"; + } + if (tensorRank == rank6D) { + schedule_6d(); + } else { + schedule_nt(); + } +} + +void 
Snippet::schedule_6d() { const auto& dom = exec_domain; // < N, C, H, W > < 1, 1, N, C*H*W> parallel_for5d(dom[0], dom[1], dom[2], dom[3], dom[4], [&](int64_t d0, int64_t d1, int64_t d2, int64_t d3, int64_t d4) { int64_t indexes[] = {d0, d1, d2, d3, d4}; + jit_snippets_call_args call_args; + update_ptrs(call_args); + schedule.get_callable()(indexes, &call_args); }); } -void Snippet::schedule_nt(const jit_snippets_call_args& call_args) const { +void Snippet::schedule_nt() { const auto& work_size = exec_domain; parallel_nt(0, [&](const int ithr, const int nthr) { + jit_snippets_call_args call_args; + update_ptrs(call_args); + size_t start = 0, end = 0; - splitter(schedulerWorkAmount, nthr, ithr, start, end); + splitter(harnessWorkAmount, nthr, ithr, start, end); std::vector indexes(work_size.size() - 1, 0); for (size_t iwork = start; iwork < end; ++iwork) { diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index 9b9b06036fb..ce9a01f951b 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -38,6 +38,9 @@ public: // Here we convert to canonical for & jit everything void createPrimitive() override; + void prepareParams() override; + std::vector shapeInfer() const override; + bool needPrepareParams() const override; bool canBeInPlace() const override; bool created() const override; @@ -55,13 +58,15 @@ private: // NOTE: Before call mutex should be initialized void copy_snippet(); - void define_schedule(); - - void generate(); + ov::PartialShape canonicalizeBody(); + // returns true if exec domain was modified + bool optimizeExecDomain(std::vector&, std::vector&, VectorDims&, size_t&) const; + void generate(const jit_snippets_compile_args*); + inline void update_ptrs(jit_snippets_call_args&); // Evaluates generated snippet using parallel backend - void schedule_6d(const jit_snippets_call_args& const_args) const; - void schedule_nt(const jit_snippets_call_args& 
const_args) const; + void schedule_6d(); + void schedule_nt(); // Original subgraph node std::shared_ptr original_snippet; @@ -73,34 +78,39 @@ private: // Holds ISA version used is codeGeneration target dnnl::impl::cpu::x64::cpu_isa_t host_isa; + size_t isa_num_lanes; // number of elements that fit in vector size // Holds index of output used as in execution domain // it should be compatible with a schedule's work size std::vector exec_domain = {}; /// scheduling info - size_t batchDimIdx = 0; size_t tensorRank = 0; size_t tileRank = 1; size_t fullWorkAmount = 0; - size_t schedulerWorkAmount = 0; + size_t harnessWorkAmount = 0; const size_t maxTileRank = 2; std::vector srcMemPtrs = {}; std::vector dstMemPtrs = {}; + std::vector dataSize = {}; + + // this is needed for fast shape inference of blocking-invariant prepended shapes + std::vector inputShapeIsBlocked = {}; // we need this info to shape-infer mixed layouts + std::vector outputShapeIsBlocked = {}; // we need this info to shape-infer mixed layouts + bool masterShapeIsBlocked = false; + + // master shape is mutable since we need to modify it inside const shapeInfer method + mutable VectorDims masterShape = {}; + mutable std::vector normInputShapes = {}; + mutable std::vector normOutputShapes = {}; - std::vector> dims_in = {}; - std::vector> offsets_in = {}; std::vector start_offset_in = {}; std::vector start_offset_out = {}; - std::vector> dims_out = {}; - std::vector> offsets_out = {}; - - std::vector sch_dims = {}; - std::vector sch_offsets_in = {}; - std::vector sch_offsets_out = {}; - bool canUseOptimizedImpl = true; + // Buffer scratchpad + std::vector buffer_scratchpad = {}; + size_t buffer_scratchpad_size = 0; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index 04e6e8c23e8..871cc3a5381 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -396,12 +396,24 @@ Engine::LoadExeNetworkImpl(const 
InferenceEngine::CNNNetwork &network, const std const auto& dynamicBatchProp = config.find(InferenceEngine::PluginConfigParams::KEY_DYN_BATCH_ENABLED); const bool enableDynamicBatch = (dynamicBatchProp != config.end() && dynamicBatchProp->second == PluginConfigParams::YES) || engConfig.enableDynamicBatch; - const bool enableSnippets = !enableDynamicBatch; + + auto snippetsMode = enableDynamicBatch ? Config::SnippetsMode::Disable : Config::SnippetsMode::Enable; + const auto& snippetsModeProp = config.find(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE); + if (snippetsMode == Config::SnippetsMode::Enable && snippetsModeProp != config.end()) { + const auto& val = snippetsModeProp->second; + if (val == PluginConfigInternalParams::IGNORE_CALLBACK) + snippetsMode = Config::SnippetsMode::IgnoreCallback; + else if (val == PluginConfigInternalParams::DISABLE) + snippetsMode = Config::SnippetsMode::Disable; + else + IE_THROW() << "Wrong value for property key SNIPPETS_MODE. Expected values: ENABLE/DISABLE/IGNORE_CALLBACK"; + } + auto nGraphFunc = clonedNetwork.getFunction(); DEBUG_LOG(PrintableModel(*nGraphFunc, "org_")); - Transformations transformations(nGraphFunc, enableLPT, enableSnippets, enableBF16, isLegacyAPI(), engConfig); + Transformations transformations(nGraphFunc, enableLPT, enableBF16, isLegacyAPI(), snippetsMode, engConfig); transformations.UpToCpuSpecificOpSet(); // need to check that all outputs have static shapes @@ -645,7 +657,18 @@ QueryNetworkResult Engine::QueryNetwork(const CNNNetwork& network, const std::ma const auto& lptProp = config.find(InferenceEngine::PluginConfigInternalParams::KEY_LP_TRANSFORMS_MODE); const bool enableLPT = (lptProp != config.end() && lptProp->second == PluginConfigParams::YES) /* enabled in the orig_config*/ || Config::LPTransformsMode::On == engConfig.lpTransformsMode /* or already enabled */; - const bool enableSnippets = !conf.enableDynamicBatch; + + auto snippetsMode = conf.enableDynamicBatch ? 
Config::SnippetsMode::Disable : Config::SnippetsMode::Enable; + const auto& snippetsModeProp = config.find(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE); + if (snippetsMode == Config::SnippetsMode::Enable && snippetsModeProp != config.end()) { + const auto& val = snippetsModeProp->second; + if (val == PluginConfigInternalParams::IGNORE_CALLBACK) + snippetsMode = Config::SnippetsMode::IgnoreCallback; + else if (val == PluginConfigInternalParams::DISABLE) + snippetsMode = Config::SnippetsMode::Disable; + else + IE_THROW() << "Wrong value for property key SNIPPETS_MODE. Expected values: ENABLE/DISABLE/IGNORE_CALLBACK"; + } auto model = network.getFunction(); if (model == nullptr) { @@ -657,7 +680,7 @@ QueryNetworkResult Engine::QueryNetwork(const CNNNetwork& network, const std::ma auto supported = GetSupportedNodes(model, [&](std::shared_ptr& model) { - Transformations transformation(model, enableLPT, enableSnippets, conf.enforceBF16, isLegacyAPI(), engConfig); + Transformations transformation(model, enableLPT, conf.enforceBF16, isLegacyAPI(), snippetsMode, engConfig); transformation.UpToCpuSpecificOpSet(); transformation.CpuSpecificOpSet(); }, diff --git a/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp b/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp index af29d870b0b..b47fcfe73da 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/fuse_load_store_and_convert.cpp @@ -42,12 +42,12 @@ ov::intel_cpu::pass::FuseLoadConvert::FuseLoadConvert() { std::dynamic_pointer_cast(convert)) { load_convert = std::make_shared(param, convert_saturation->get_destination_type(), - load->get_count()); + load->get_count(), load->get_offset()); } else if (const auto convert_truncation = std::dynamic_pointer_cast(convert)) { load_convert = std::make_shared(param, 
convert_truncation->get_destination_type(), - load->get_count()); + load->get_count(), load->get_offset()); } else { throw ngraph::ngraph_error( "Type of Convert op is undefined. Supports only fusing Load and ConvertTruncation or ConvertSaturation ops"); @@ -91,12 +91,12 @@ ov::intel_cpu::pass::FuseStoreConvert::FuseStoreConvert() { std::dynamic_pointer_cast(convert)) { store_convert = std::make_shared(input, convert_saturation->get_destination_type(), - store->get_count()); + store->get_count(), store->get_offset()); } else if (const auto convert_truncation = std::dynamic_pointer_cast(convert)) { store_convert = std::make_shared(input, convert_truncation->get_destination_type(), - store->get_count()); + store->get_count(), store->get_offset()); } else { throw ngraph::ngraph_error( "Type of Convert op is undefined. Supports only fusing Store and ConvertTruncation or ConvertSaturation ops"); diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.cpp b/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.cpp index fcf59b169ef..dbb8046f636 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.cpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.cpp @@ -11,8 +11,9 @@ using namespace std; using namespace ov; -intel_cpu::LoadConvertSaturation::LoadConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count) : - Load(x, count), m_destination_type(destination_type) { +intel_cpu::LoadConvertSaturation::LoadConvertSaturation(const Output& x, const ov::element::Type& destination_type, + const size_t count, const size_t offset) : + Load(x, count, offset), m_destination_type(destination_type) { constructor_validate_and_infer_types(); } @@ -30,11 +31,12 @@ void intel_cpu::LoadConvertSaturation::validate_and_infer_types() { std::shared_ptr intel_cpu::LoadConvertSaturation::clone_with_new_inputs(const OutputVector& new_args) const { 
INTERNAL_OP_SCOPE(LoadConvert_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_destination_type, m_count); + return std::make_shared(new_args.at(0), m_destination_type, m_count, m_offset); } -intel_cpu::LoadConvertTruncation::LoadConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count) : - Load(x, count), m_destination_type(destination_type) { +intel_cpu::LoadConvertTruncation::LoadConvertTruncation(const Output& x, const ov::element::Type& destination_type, + const size_t count, const size_t offset) : + Load(x, count, offset), m_destination_type(destination_type) { constructor_validate_and_infer_types(); } @@ -52,5 +54,5 @@ void intel_cpu::LoadConvertTruncation::validate_and_infer_types() { std::shared_ptr intel_cpu::LoadConvertTruncation::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(LoadConvert_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_destination_type, m_count); + return std::make_shared(new_args.at(0), m_destination_type, m_count, m_offset); } diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.hpp b/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.hpp index 50d7fe44c2e..9931a6f057d 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.hpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/load_convert.hpp @@ -20,7 +20,7 @@ class LoadConvertSaturation : public ngraph::snippets::op::Load { public: OPENVINO_OP("LoadConvertSaturation", "SnippetsOpset", ngraph::snippets::op::Load); - LoadConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu); + LoadConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu, const size_t offset = 0lu); LoadConvertSaturation() = default; ov::element::Type 
get_destination_type() const { return m_destination_type; } @@ -47,7 +47,7 @@ class LoadConvertTruncation : public ngraph::snippets::op::Load { public: OPENVINO_OP("LoadConvertTruncation", "SnippetsOpset", ngraph::snippets::op::Load); - LoadConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu); + LoadConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu, const size_t offset = 0lu); LoadConvertTruncation() = default; ov::element::Type get_destination_type() const { return m_destination_type; } diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.cpp b/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.cpp index 5362ff9c9b6..52921e681e9 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.cpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.cpp @@ -11,8 +11,9 @@ using namespace std; using namespace ov; -intel_cpu::StoreConvertSaturation::StoreConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count) : - Store(x, count), m_destination_type(destination_type) { +intel_cpu::StoreConvertSaturation::StoreConvertSaturation(const Output& x, const ov::element::Type& destination_type, + const size_t count, const size_t offset) : + Store(x, count, offset), m_destination_type(destination_type) { constructor_validate_and_infer_types(); } @@ -30,11 +31,12 @@ void intel_cpu::StoreConvertSaturation::validate_and_infer_types() { std::shared_ptr intel_cpu::StoreConvertSaturation::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(StoreConvert_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_destination_type, m_count); + return std::make_shared(new_args.at(0), m_destination_type, m_count, m_offset); } -intel_cpu::StoreConvertTruncation::StoreConvertTruncation(const 
Output& x, const ov::element::Type& destination_type, const size_t count) : - Store(x, count), m_destination_type(destination_type) { +intel_cpu::StoreConvertTruncation::StoreConvertTruncation(const Output& x, const ov::element::Type& destination_type, + const size_t count, const size_t offset) : + Store(x, count, offset), m_destination_type(destination_type) { constructor_validate_and_infer_types(); } @@ -52,5 +54,5 @@ void intel_cpu::StoreConvertTruncation::validate_and_infer_types() { std::shared_ptr intel_cpu::StoreConvertTruncation::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(StoreConvert_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_destination_type, m_count); + return std::make_shared(new_args.at(0), m_destination_type, m_count, m_offset); } diff --git a/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.hpp b/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.hpp index d44ccacd4f4..ee6410682b8 100644 --- a/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.hpp +++ b/src/plugins/intel_cpu/src/snippets_transformations/op/store_convert.hpp @@ -20,7 +20,7 @@ class StoreConvertSaturation : public ngraph::snippets::op::Store { public: OPENVINO_OP("StoreConvertSaturation", "SnippetsOpset", ngraph::snippets::op::Store); - StoreConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu); + StoreConvertSaturation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu, const size_t offset = 0lu); StoreConvertSaturation() = default; ov::element::Type get_destination_type() const { return m_destination_type; } @@ -47,7 +47,7 @@ class StoreConvertTruncation : public ngraph::snippets::op::Store { public: OPENVINO_OP("StoreConvertTruncation", "SnippetsOpset", ngraph::snippets::op::Store); - StoreConvertTruncation(const Output& x, const ov::element::Type& 
destination_type, const size_t count = 1lu); + StoreConvertTruncation(const Output& x, const ov::element::Type& destination_type, const size_t count = 1lu, const size_t offset = 0lu); StoreConvertTruncation() = default; ov::element::Type get_destination_type() const { return m_destination_type; } diff --git a/src/plugins/intel_cpu/src/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformation_pipeline.cpp index b4e0de4ff3e..293e6fc500f 100644 --- a/src/plugins/intel_cpu/src/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformation_pipeline.cpp @@ -95,7 +95,7 @@ #include "ngraph_transformations/swap_convert_transpose.hpp" // Snippets -#include "snippets/pass/collapse_subgraph.hpp" +#include "snippets/pass/tokenization.hpp" #include "snippets/pass/common_optimizations.hpp" // Misc @@ -140,7 +140,7 @@ void Transformations::UpToCpuSpecificOpSet() { ngraph::pass::low_precision::LowPrecision::isFunctionQuantized(model) && CPU_DEBUG_CAP_IS_TRANSFORMATION_ENABLED(config.debugCaps, Lpt); - const bool useSnippets = enableSnippets && + const bool useSnippets = snippetsMode != Config::SnippetsMode::Disable && CPU_DEBUG_CAP_IS_TRANSFORMATION_ENABLED(config.debugCaps, Snippets); auto defaultPrecisions = useLpt ? ngraph::pass::low_precision::precision_set::int8_support : std::vector{}; @@ -543,47 +543,95 @@ void Transformations::PostLpt() { return false; }); + // Float MHA is supported by snippets now + if (!enableBF16) { + postLPTPassManager.get_pass_config()->disable(); + postLPTPassManager.get_pass_config()->disable(); + } + // Execute before snippets. 
Otherwise FQ will be converted to Subgraph postLPTPassManager.register_pass(); postLPTPassManager.run_passes(model); } void Transformations::MainSnippets(void) { - if (!enableSnippets || + if (snippetsMode == Config::SnippetsMode::Disable || !dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2)) // snippets are implemeted only for relevant platforms (avx2+ extentions) return; - ov::pass::Manager snippetsManager; - snippetsManager.register_pass(); - snippetsManager.register_pass(); - snippetsManager.register_pass(); - snippetsManager.get_pass_config()->set_callback( - [](const std::shared_ptr& n) -> bool { - // CPU Plugin support Swish in Subgraph via conversion to SwichCPU which assumes second input to be constant - if (ov::is_type(n)) { - if (n->inputs().size() > 1 && !ov::is_type(n->get_input_node_shared_ptr(1))) - return true; - } + ngraph::pass::Manager snippetsManager; + if (snippetsMode != Config::SnippetsMode::IgnoreCallback) + snippetsManager.register_pass(); + snippetsManager.register_pass(); - const auto& inputs = n->inputs(); - // todo: clarify whether we can evaluate snippets on const paths - const bool has_only_const_inputs = std::all_of(inputs.begin(), inputs.end(), - [](const ov::Input &in) { - return ov::is_type(in.get_source_output().get_node_shared_ptr()); - }); - // todo: clarify whether we can evaluate snippets on inputs with larger ranks - auto rank_is_too_large = [](const ov::descriptor::Tensor& t ) { - // callback is called has_supported_in_out(), so it's safe to assume that the shapes are static - return t.get_partial_shape().rank().get_length() > 6; - }; - const bool bad_input_rank = std::any_of(inputs.begin(), inputs.end(), - [&](const ov::Input& in) {return rank_is_too_large(in.get_tensor());}); - const auto& outputs = n->outputs(); - const bool bad_output_rank = std::any_of(outputs.begin(), outputs.end(), - [&](const ov::Output& out) {return rank_is_too_large(out.get_tensor());}); - return has_only_const_inputs || bad_input_rank || 
bad_output_rank; - }); - snippetsManager.register_pass(); + const bool isMHASupported = + !enableBF16 && // TODO: Need to add BF16 support for MHA in Snippets + dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core); // MHA has BRGEMM that is supported only on AVX512 platforms + if (!isMHASupported) { + snippetsManager.get_pass_config()->disable(); + } + if (snippetsMode != Config::SnippetsMode::IgnoreCallback) { + snippetsManager.get_pass_config()->set_callback( + [](const std::shared_ptr& n) -> bool { + const auto pshape = n->get_output_partial_shape(0); + const auto shape = pshape.get_shape(); + const auto parallel_work_amount = + std::accumulate(shape.rbegin() + 2, shape.rend(), 1, std::multiplies()); + const auto kernel_buffer_size = + std::accumulate(shape.rbegin(), shape.rbegin() + 2, 1, std::multiplies()) * + n->get_output_element_type(0).size(); + // Heuristic values: + // parallelism work amount - not enough work amount for parallelism + // kernel work amount - large shape for kernel execution, not cache-local + // TODO: The heuristics will be removed after + // - loop blocking support on code generation level + // - parallelism support on JIT level + const auto needed_num_of_threads = 12lu; + const auto l2_cache_size = dnnl::utils::get_cache_size(2, true); + const auto is_unsupported_parallel_work_amount = parallel_get_num_threads() / 2 > parallel_work_amount && + parallel_work_amount < needed_num_of_threads; + const auto is_unsupported_kernel_work_amount = kernel_buffer_size > l2_cache_size; + return is_unsupported_parallel_work_amount || is_unsupported_kernel_work_amount; + }); + snippetsManager.get_pass_config()->set_callback( + [](const std::shared_ptr& n) -> bool { + // CPU Plugin support Swish in Subgraph via conversion to SwichCPU which assumes second input to be constant + const bool is_unsupported_swish = + ov::is_type(n) && n->inputs().size() > 1 && + !ov::is_type(n->get_input_node_shared_ptr(1)); + // todo: general tokenization flow 
is not currently supported for these operations. + // they can be tokenized only as a part of complex patterns + const bool is_disabled_tokenization = (ov::is_type(n) || + ov::is_type(n) || + ov::is_type(n) || + ov::is_type(n) || + ov::is_type(n) || + ov::is_type(n)); + const auto& inputs = n->inputs(); + // todo: clarify whether we can evaluate snippets on const paths + const bool has_only_const_inputs = std::all_of(inputs.begin(), inputs.end(), + [](const ov::Input& in) { + return ov::is_type( + in.get_source_output().get_node_shared_ptr()); + }); + // todo: clarify whether we can evaluate snippets on inputs with larger ranks + auto rank_is_too_large = [](const ov::descriptor::Tensor& t) { + // callback is called has_supported_in_out(), so it's safe to assume that the shapes are static + return t.get_partial_shape().rank().get_length() > 6; + }; + const bool bad_input_rank = std::any_of(inputs.begin(), inputs.end(), + [&](const ov::Input& in) { + return rank_is_too_large(in.get_tensor()); + }); + const auto& outputs = n->outputs(); + const bool bad_output_rank = std::any_of(outputs.begin(), outputs.end(), + [&](const ov::Output& out) { + return rank_is_too_large(out.get_tensor()); + }); + return has_only_const_inputs || bad_input_rank || bad_output_rank || is_unsupported_swish || + is_disabled_tokenization; + }); + } snippetsManager.run_passes(model); } diff --git a/src/plugins/intel_cpu/src/transformation_pipeline.h b/src/plugins/intel_cpu/src/transformation_pipeline.h index f56427afa8d..e01ad93f0fd 100644 --- a/src/plugins/intel_cpu/src/transformation_pipeline.h +++ b/src/plugins/intel_cpu/src/transformation_pipeline.h @@ -24,15 +24,15 @@ class Transformations { public: Transformations(const std::shared_ptr& initialModel, const bool enableLpt, - const bool enableSnippets, const bool enableBF16, const bool isLegacyApi, + Config::SnippetsMode& snippetsMode, const Config& config) : model(initialModel), enableLpt(enableLpt), - enableSnippets(enableSnippets), 
enableBF16(enableBF16), isLegacyApi(isLegacyApi), + snippetsMode(snippetsMode), config(config) {} void UpToCpuSpecificOpSet(); @@ -41,10 +41,10 @@ public: private: std::shared_ptr model; const bool enableLpt; - const bool enableSnippets; const bool enableBF16; const bool isLegacyApi; const Config& config; + const Config::SnippetsMode snippetsMode; void PreLpt(const std::vector& defaultPrecisions, const bool isLegacyApi); diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index cf4ff70fbe9..fd697eedd77 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -215,6 +215,10 @@ std::vector disabledTestPatterns() { // tests are useless on such platforms retVector.emplace_back(R"(.*(BF|bf)16.*)"); retVector.emplace_back(R"(.*bfloat16.*)"); + // MatMul in Snippets uses BRGEMM that is supported only on AVX512 platforms + // Disabled Snippets MHA tests as well because MHA pattern contains MatMul + retVector.emplace_back(R"(.*Snippets.*MHA.*)"); + retVector.emplace_back(R"(.*Snippets.*(MatMul|Matmul).*)"); } if (!InferenceEngine::with_cpu_x86_avx512_core_amx_int8()) //TODO: Issue 92895 diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp index acebcb77d6e..86b7d6b3b11 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp @@ -12,15 +12,49 @@ namespace snippets { namespace { +namespace snippets_static_1 { +// These inputs are needed to test static Loop optimizations (emit the whole tile, body with increments, set WA etc) +std::vector inShapesStatic1{{1, 16, 29, 1}, {1, 16, 29, 
7}, {1, 16, 29, 8}, {1, 16, 29, 15}, {1, 16, 29, 16}, {1, 16, 29, 31}}; +std::vector inShapesStatic2{{1, 16, 29, 1}, {1, 16, 1, 1}, {1, 1, 1, 1}}; + INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, Add, + ::testing::Combine( + ::testing::ValuesIn(inShapesStatic1), + ::testing::ValuesIn(inShapesStatic2), + ::testing::Values(ov::element::f32), + ::testing::Values(1), // Add + ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + Add::getTestCaseName); +// test cross-tile (vector vs scalar) optimizations in the absence of vector tile +std::vector> inShapesStatic{ + {{1, 128, 1, 1}, {1, 128, 1, 1}}, + {{1, 128, 1, 9}, {1, 128, 1, 9}}, + {{1, 128, 1, 17}, {1, 128, 1, 17}}, + {{1, 128, 1, 29}, {1, 128, 1, 29}}, + {{1, 128, 1, 33}, {1, 128, 1, 33}}, + {{1, 128, 9, 30}, {1, 128, 1, 30}}, + {{1, 128, 9, 1}, {1, 128, 1, 30}}, +}; +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddPair, + ::testing::Combine( + ::testing::ValuesIn(inShapesStatic), + ::testing::Values(ov::element::f32), + ::testing::Values(1), + ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + AddPair::getTestCaseName); + +} // namespace snippets_static_1 + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddConst, ::testing::Combine( ::testing::Values(ov::Shape {1, 42, 16, 64}), - ::testing::Values(ov::Shape {1, 42, 16, 1}), ::testing::Values(ov::element::f32), - ::testing::Values(1), - ::testing::Values(1), // one node - Add + ::testing::Values(1), // Add + ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts ::testing::Values(CommonTestUtils::DEVICE_CPU)), - Add::getTestCaseName); + AddConst::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddRollConst, ::testing::Combine( diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/conv_eltwise.cpp 
b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/conv_eltwise.cpp index bdf0fd38a50..ffc6ef57add 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/conv_eltwise.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/conv_eltwise.cpp @@ -10,26 +10,26 @@ namespace test { namespace snippets { namespace { - ov::Shape convInputShape {1, 10, 16, 16}; - INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvAdd, ConvEltwise, - ::testing::Combine( - ::testing::Values(convInputShape), - ::testing::Values(convInputShape), - ::testing::Values(std::shared_ptr (std::make_shared())), // non-tokenizable - ::testing::Values(6), // num nodes = 6: Convert + Convolution + 4 Reorders on Convs in&outs - ::testing::Values(0), // num subgraphs = 0: No subgraph since all ops eltwises fused into Convolution - ::testing::Values(CommonTestUtils::DEVICE_CPU)), - ConvEltwise::getTestCaseName); +ov::Shape convInputShape {1, 10, 16, 16}; +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvAdd, ConvEltwise, + ::testing::Combine( + ::testing::Values(convInputShape), + ::testing::Values(convInputShape), + ::testing::Values(std::shared_ptr (std::make_shared())), // non-tokenizable + ::testing::Values(6), // num nodes = 6: Convert + Convolution + 4 Reorders on Convs in&outs + ::testing::Values(0), // num subgraphs = 0: No subgraph since all ops eltwises fused into Convolution + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ConvEltwise::getTestCaseName); - INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvMul, ConvEltwise, - ::testing::Combine( - ::testing::Values(convInputShape), - ::testing::Values(convInputShape), - ::testing::Values(std::shared_ptr (std::make_shared())), // fully-tokenizable - ::testing::Values(7), //num nodes = 7: Convert + Convolution + Subgraph + Reorders - ::testing::Values(1), // num subgraphs = 0: Mul (2 inputs) can't be fused into Conv => Subgraph is created - ::testing::Values(CommonTestUtils::DEVICE_CPU)), - 
ConvEltwise::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvMul, ConvEltwise, + ::testing::Combine( + ::testing::Values(convInputShape), + ::testing::Values(convInputShape), + ::testing::Values(std::shared_ptr (std::make_shared())), // fully-tokenizable + ::testing::Values(7), //num nodes = 7: Convert + Convolution + Subgraph + Reorders + ::testing::Values(1), // num subgraphs = 1: Mul (2 inputs) can't be fused into Conv => Subgraph is created + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ConvEltwise::getTestCaseName); } // namespace } // namespace snippets } // namespace test diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/convert.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/convert.cpp index e2890469356..b3d2907e7ee 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/convert.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/convert.cpp @@ -30,10 +30,10 @@ const std::vector, std::vector> inputShapes_Convert = { - { ov::Shape{2, 16} }, - { ov::Shape{5, 5} }, - { ov::Shape{2, 12, 1} } +const std::vector> inputShapes_Convert = { + { ov::PartialShape{2, 16} }, + { ov::PartialShape{5, 5} }, + { ov::PartialShape{2, 12, 1} } }; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Convert, Convert, @@ -57,10 +57,10 @@ const std::vector, std::vector> inputShapes_ConvertInput = { - { ov::Shape{2, 16}, ov::Shape{1, 16} }, - { ov::Shape{5, 18}, ov::Shape{5, 1} }, - { ov::Shape{3, 1}, ov::Shape{3, 21} } +const std::vector> inputShapes_ConvertInput = { + { ov::PartialShape{2, 16}, ov::PartialShape{1, 16} }, + { ov::PartialShape{5, 18}, ov::PartialShape{5, 1} }, + { ov::PartialShape{3, 1}, ov::PartialShape{3, 21} } }; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertInput, ConvertInput, @@ -94,10 +94,10 @@ const std::vector, std::vector> inputShapes_ConvertPartialInputsAndResults = { - { ov::Shape{2, 16}, ov::Shape{1, 16}, ov::Shape{1, 1} }, 
- { ov::Shape{5, 18}, ov::Shape{5, 1}, ov::Shape{1, 18} }, - { ov::Shape{3, 1}, ov::Shape{3, 21}, ov::Shape{3, 1} } +const std::vector> inputShapes_ConvertPartialInputsAndResults = { + { ov::PartialShape{2, 16}, ov::PartialShape{1, 16}, ov::PartialShape{1, 1} }, + { ov::PartialShape{5, 18}, ov::PartialShape{5, 1}, ov::PartialShape{1, 18} }, + { ov::PartialShape{3, 1}, ov::PartialShape{3, 21}, ov::PartialShape{3, 1} } }; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertPartialInputsAndResults, ConvertPartialInputsAndResults, @@ -117,7 +117,7 @@ const std::vector, std::vector{ov::Shape{5, 5, 5, 5}}), + ::testing::Values(std::vector{{5, 5, 5, 5}}), ::testing::ValuesIn(types_ConvertMany), ::testing::Values(1), ::testing::Values(1), @@ -126,7 +126,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertManyOnInputs, ConvertManyOnInputs INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertManyOnOutputs, ConvertManyOnOutputs, ::testing::Combine( - ::testing::Values(std::vector{ov::Shape{5, 5, 5, 5}}), + ::testing::Values(std::vector{{5, 5, 5, 5}}), ::testing::ValuesIn(types_ConvertMany), ::testing::Values(1), ::testing::Values(1), @@ -140,7 +140,7 @@ const std::vector, std::vector{ov::Shape{5, 5, 5, 5}}), + ::testing::Values(std::vector{{5, 5, 5, 5}}), ::testing::ValuesIn(types_ConvertManyIO), ::testing::Values(1), ::testing::Values(1), diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp new file mode 100644 index 00000000000..9ab22c79d2e --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp @@ -0,0 +1,70 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/matmul.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { + + +namespace { +std::vector> input_shapes{ + {{2, 1, 3, 5}, {1, 3, 5, 3}}, + {{3, 
1, 32, 14}, {1, 2, 14, 32}}, + {{1, 2, 37, 23}, {2, 1, 23, 37}}, + {{1, 1, 37, 23}, {1, 2, 23, 33}}, + {{2, 1, 69, 43}, {1, 1, 43, 49}} +}; +std::vector precisions{element::f32}; +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMult, MatMul, + ::testing::Combine( + ::testing::ValuesIn(input_shapes), + ::testing::ValuesIn(precisions), + ::testing::Values(1), // MatMul; + ::testing::Values(1), // Tokenized MatMul + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + MatMul::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMulBias, MatMulBias, + ::testing::Combine( + ::testing::Values(std::vector{{1, 2, 69, 43}, {2, 1, 43, 49}, {1, 1, 69, 49}}), + ::testing::ValuesIn(precisions), + ::testing::Values(1), // Subgraph; + ::testing::Values(1), // Tokenized MatMul+Bias + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + MatMul::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ExplicitTransposeMatMul, ExplicitTransposeMatMul, + ::testing::Combine( + ::testing::Values(std::vector{{1, 2, 69, 43}, {2, 49, 2, 43}}), + ::testing::ValuesIn(precisions), + ::testing::Values(1), // Subgraph; + ::testing::Values(1), // Tokenized MatMul+Bias + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ExplicitTransposeMatMul::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeMatMulBias, ExplicitTransposeMatMulBias, + ::testing::Combine( + ::testing::Values(std::vector{{1, 2, 69, 43}, {2, 49, 2, 43}, {1, 1, 69, 49}}), + ::testing::ValuesIn(precisions), + ::testing::Values(1), // Subgraph; + ::testing::Values(1), // Tokenized MatMul+Bias + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + MatMul::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeMulMatMulBias, ExplicitTransposeMulMatMulBias, + ::testing::Combine( + ::testing::Values(std::vector{{1, 2, 69, 43}, {2, 49, 2, 43}, {1, 2, 1, 1}, {1, 1, 69, 49}}), + ::testing::ValuesIn(precisions), + ::testing::Values(1), // Subgraph; + ::testing::Values(1), // Tokenized MatMul+Bias + 
::testing::Values(CommonTestUtils::DEVICE_CPU)), + MatMul::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp new file mode 100644 index 00000000000..11aeaebdcc2 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp @@ -0,0 +1,67 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/mha.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { + + +namespace { + +const std::vector> inputShapes = { + {{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}}, + {{1, 128, 16, 64}, {1, 128, 16, 64}, {1, 1, 1, 128}, {1, 128, 16, 64}}, + {{1, 128, 16, 64}, {1, 128, 16, 64}, {1, 16, 1, 1}, {1, 128, 16, 64}}, + {{2, 68, 6, 92}, {2, 68, 6, 92}, {1, 1, 68, 68}, {2, 68, 6, 92}}, + {{1, 58, 16, 34}, {1, 58, 16, 34}, {1, 1, 1, 58}, {1, 58, 16, 34}}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA, MHA, + ::testing::Combine( + ::testing::ValuesIn(inputShapes), + ::testing::ValuesIn({false, true}), + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + MHA::getTestCaseName); + +const std::vector> inputShapeSelect = { + // without broadcast + {{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 12, 128, 128}, {1, 12, 128, 128}, {1, 128, 12, 64}}, + {{1, 94, 12, 54}, {1, 94, 12, 54}, {1, 12, 94, 94}, {1, 12, 94, 94}, {1, 12, 94, 94}, {1, 94, 12, 54}}, + // with broadcast + {{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 12, 1, 1}, {1, 12, 1, 1}, {1, 128, 12, 64}}, + {{2, 52, 6, 102}, {2, 52, 6, 102}, {1, 6, 52, 52}, {1, 6, 1, 1}, {1, 6, 1, 1}, {2, 52, 6, 102}} +}; + 
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA, MHASelect, + ::testing::Combine( + ::testing::ValuesIn(inputShapeSelect), + ::testing::Values(false), // Need to support True for graph builder in tests + ::testing::Values(2), // Less + MHA + ::testing::Values(2), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + MHA::getTestCaseName); + +const std::vector> inputShapesWOTranspose = { + {{1, 12, 197, 64}, {1, 12, 64, 197}, {1, 12, 197, 64}} +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAWOTransposeOnInputs, MHAWOTransposeOnInputs, + ::testing::Combine( + ::testing::ValuesIn(inputShapesWOTranspose), + ::testing::ValuesIn({true}), // Need to support False for graph builder in tests + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + MHA::getTestCaseName); + + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/select.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/select.cpp new file mode 100644 index 00000000000..677d7678af0 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/select.cpp @@ -0,0 +1,42 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/select.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { + + +namespace { + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Select, Select, + ::testing::Combine( + ::testing::ValuesIn({ov::Shape{1, 5, 5, 35}, ov::Shape{1}}), + ::testing::ValuesIn({ov::Shape{1, 5, 5, 35}, ov::Shape{1}}), + ::testing::ValuesIn({ov::Shape{1, 5, 5, 35}, ov::Shape{1}}), + ::testing::ValuesIn({ov::element::f32, ov::element::i8}), + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + Select::getTestCaseName); + 
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BroadcastSelect, BroadcastSelect, + ::testing::Combine( + ::testing::ValuesIn({Shape{1, 8, 2, 1}, Shape{1, 1, 1, 1}}), + ::testing::ValuesIn({Shape{1, 8, 2, 10}, Shape{1, 8, 2, 1}}), + ::testing::ValuesIn({Shape{1, 8, 2, 10}, Shape{1, 1, 1, 1}}), + ::testing::ValuesIn({Shape{1, 8, 2, 1}, Shape{1, 8, 2, 10}}), + ::testing::ValuesIn({ov::element::f32, ov::element::i8}), + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + BroadcastSelect::getTestCaseName); + + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp new file mode 100644 index 00000000000..8869ffdb6ee --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/softmax.cpp @@ -0,0 +1,72 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/softmax.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { + + +namespace { + +const std::vector inputShape = { + ov::Shape{1, 16}, + ov::Shape{1, 32}, + ov::Shape{1, 1}, + ov::Shape{1, 9}, + ov::Shape{1, 17}, + ov::Shape{1, 19}, + ov::Shape{1, 49}, + ov::Shape{1, 50}, + ov::Shape{5, 16}, + ov::Shape{5, 32}, + ov::Shape{5, 1}, + ov::Shape{5, 9}, + ov::Shape{5, 17}, + ov::Shape{5, 19}, + ov::Shape{5, 49}, + ov::Shape{5, 50}, + ov::Shape{1, 3, 128, 128}, + ov::Shape{1, 3, 128, 129}, + ov::Shape{1, 3, 128, 130}, + ov::Shape{1, 3, 128, 1}, + ov::Shape{1, 3, 128, 9}, + ov::Shape{1, 3, 128, 16}, + ov::Shape{1, 3, 128, 17}, + ov::Shape{1, 3, 128, 20}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Softmax, Softmax, + ::testing::Combine( + ::testing::ValuesIn(inputShape), + ::testing::Values(-1), + 
::testing::Values(1), + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + Softmax::getTestCaseName); + +const std::vector> inputShapesPair = { + std::pair{ov::Shape{1, 5, 16, 35}, ov::Shape{1, 5, 16, 35}}, + std::pair{ov::Shape{1, 5, 16, 1}, ov::Shape{1, 5, 16, 35}}, + std::pair{ov::Shape{1, 5, 16, 35}, ov::Shape{1, 5, 1, 1}}, + std::pair{ov::Shape{1, 5, 16, 1}, ov::Shape{1, 5, 16, 1}}, + std::pair{ov::Shape{1, 5, 16, 35}, ov::Shape{1, 5, 1, 35}}, + std::pair{ov::Shape{1, 5, 1, 35}, ov::Shape{1, 5, 1, 35}}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_AddSoftmax, AddSoftmax, + ::testing::Combine( + ::testing::ValuesIn(inputShapesPair), + ::testing::Values(-1), + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + AddSoftmax::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose.cpp new file mode 100644 index 00000000000..0179adb0a7a --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose.cpp @@ -0,0 +1,27 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/transpose.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { + + +namespace { +std::vector input_shapes{{2, 3, 5, 13}, {2, 3, 2, 4}, {1, 7, 1, 4}}; +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Transpose, Transpose, + ::testing::Combine( + ::testing::ValuesIn(input_shapes), + ::testing::Values(std::vector {0, 2, 3, 1}), + ::testing::Values(1), // Transpose + ::testing::Values(1), // Tokenized Transpose + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + Transpose::getTestCaseName); + +} // namespace +} // namespace snippets +} // 
namespace test +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_matmul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_matmul.cpp new file mode 100644 index 00000000000..8e3af45fd52 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_matmul.cpp @@ -0,0 +1,63 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/transpose_matmul.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { + + +namespace { +std::vector precisions{element::f32}; +namespace transpose_zero_input { +std::vector> transpose_input_shapes{ + {{1, 49, 2, 23}, {2, 2, 23, 39}} +}; +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMult, TransposeMatMul, + ::testing::Combine( + ::testing::ValuesIn(transpose_input_shapes), + ::testing::Values(0), // Transpose on 0th Matmul input + ::testing::ValuesIn(precisions), + ::testing::Values(1), // MatMul + ::testing::Values(1), // Tokenized MatMul + FusedTranspose + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + TransposeMatMul::getTestCaseName); +} // namespace transpose_zero_input + +namespace transpose_first_input { +std::vector> transpose_input_shapes{ + {{2, 1, 49, 13}, {1, 13, 3, 39}} +}; +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMult, TransposeMatMul, + ::testing::Combine( + ::testing::ValuesIn(transpose_input_shapes), + ::testing::Values(1), // Transpose on 1st Matmul input + ::testing::ValuesIn(precisions), + ::testing::Values(1), // MatMul + ::testing::Values(1), // Tokenized MatMul + FusedTranspose + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + TransposeMatMul::getTestCaseName); +} // namespace transpose_first_input + +namespace transpose_output { +std::vector> transpose_input_shapes{ + {{2, 1, 49, 13}, {1, 2, 13, 39}} +}; 
+INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMult, TransposeMatMul, + ::testing::Combine( + ::testing::ValuesIn(transpose_input_shapes), + ::testing::Values(2), // Transpose on Matmul output + ::testing::ValuesIn(precisions), + ::testing::Values(1), // MatMul + ::testing::Values(1), // Tokenized MatMul + FusedTranspose + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + TransposeMatMul::getTestCaseName); +} // namespace transpose_output + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_softmax.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_softmax.cpp new file mode 100644 index 00000000000..1158dff31c3 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_softmax.cpp @@ -0,0 +1,42 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/transpose_softmax.hpp" +#include "common_test_utils/test_constants.hpp" + +namespace ov { +namespace test { +namespace snippets { + + +namespace { + +const std::vector inputShape = { + ov::Shape{1, 128, 3, 16}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeSoftmax, TransposeSoftmax, + ::testing::Combine( + ::testing::Values(inputShape), + ::testing::Values(std::vector{0, 2, 3, 1}), + ::testing::Values(-1), + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + TransposeSoftmax::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeSoftmaxEltwise, TransposeSoftmaxEltwise, + ::testing::Combine( + ::testing::Values(inputShape), + ::testing::Values(std::vector{0, 2, 3, 1}), + ::testing::Values(-1), + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + TransposeSoftmax::getTestCaseName); + +} // namespace +} // namespace snippets 
+} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/two_inputs_and_outputs.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/two_inputs_and_outputs.cpp index 64042a3b01a..3b4db9cc4d3 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/two_inputs_and_outputs.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/two_inputs_and_outputs.cpp @@ -10,7 +10,7 @@ namespace test { namespace snippets { namespace { -const std::vector> input_shapes = { +const std::vector> input_shapes = { { {5, 5, 256, 1}, {5, 5, 256, 1} }, { {5, 5, 16, 35}, {5, 5, 16, 35} }, { {5, 5, 256, 1}, {5, 5, 256, 35} }, @@ -26,7 +26,6 @@ const std::vector> input_shapes = { { {5, 5, 35, 17}, {5, 5, 35, 17} }, { {5, 5, 35, 17}, {5, 5, 1, 17} }, - { {5, 5, 35, 18}, {5, 5, 35, 18} }, { {5, 5, 35, 18}, {5, 5, 1, 18} }, }; diff --git a/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/mha.cpp b/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/mha.cpp index 4f3578b9106..4222cb9b975 100644 --- a/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/mha.cpp +++ b/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/mha.cpp @@ -25,6 +25,7 @@ typedef std::tuple< std::vector, // Input precisions std::vector, // MatMul input #0 precisions size_t, // pattern type # + std::string, // Expected node std::string // Device name > MHATuple; @@ -155,8 +156,9 @@ public: std::vector inputPrecisions; std::vector matMulIn0Precisions; size_t patternType; + std::string expectedNode; std::string targetName; - std::tie(inputShapes, inputPrecisions, matMulIn0Precisions, patternType, targetName) = obj.param; + std::tie(inputShapes, inputPrecisions, matMulIn0Precisions, patternType, expectedNode, targetName) = obj.param; std::ostringstream results; results << "IS=("; @@ -173,6 +175,7 @@ public: results << "InPRC" << 
std::to_string(i) << "=" << inputPrecisions[i] << "_"; } results << "patternType=" << patternType; + results << "expect=" << expectedNode; results << "targetDevice=" << targetName; return results.str(); @@ -195,7 +198,8 @@ protected: std::vector inputPrecisions; std::vector matMulIn0Precisions; size_t patternType; - std::tie(inputShapes, inputPrecisions, matMulIn0Precisions, patternType, targetDevice) = this->GetParam(); + std::string expectedNode; + std::tie(inputShapes, inputPrecisions, matMulIn0Precisions, patternType, expectedNode, targetDevice) = this->GetParam(); init_input_shapes(inputShapes); @@ -223,7 +227,8 @@ TEST_P(MHATest, CompareWithRefs) { std::vector inputPrecisions; std::vector matMulIn0Precisions; size_t patternType; - std::tie(inputShapes, inputPrecisions, matMulIn0Precisions, patternType, targetDevice) = this->GetParam(); + std::string expectedNode; + std::tie(inputShapes, inputPrecisions, matMulIn0Precisions, patternType, expectedNode, targetDevice) = this->GetParam(); if (inputPrecisions[0] == ElementType::bf16 && !InferenceEngine::with_cpu_x86_bfloat16()) GTEST_SKIP(); @@ -232,7 +237,7 @@ TEST_P(MHATest, CompareWithRefs) { GTEST_SKIP(); run(); - CheckNumberOfNodesWithType(compiledModel, "MHA", 1); + CheckNumberOfNodesWithType(compiledModel, expectedNode, 1); } namespace { @@ -247,11 +252,6 @@ std::vector> inputShapes = { {{1, 204, 13, 212}, {1, 204, 13, 212}, {1, 1, 1, 204}, {1, 204, 13, 212}}, }; -std::vector> inputPrecisions = { - { ElementType::f32, ElementType::f32, ElementType::f32, ElementType::f32 }, - { ElementType::bf16, ElementType::bf16, ElementType::bf16, ElementType::bf16 }, -}; - std::vector> matMulIn0Precisions = { {}, }; @@ -260,15 +260,26 @@ std::vector patternTypes = { 0, 1 }; -INSTANTIATE_TEST_SUITE_P(smoke_MHA, MHATest, +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA, MHATest, ::testing::Combine( ::testing::ValuesIn(static_shapes_to_test_representation(inputShapes)), - ::testing::ValuesIn(inputPrecisions), + 
::testing::Values(std::vector{ ElementType::f32, ElementType::f32, ElementType::f32, ElementType::f32 }), ::testing::ValuesIn(matMulIn0Precisions), ::testing::ValuesIn(patternTypes), + ::testing::Values("Subgraph"), ::testing::Values(CommonTestUtils::DEVICE_CPU)), MHATest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_MHA, MHATest, + ::testing::Combine( + ::testing::ValuesIn(static_shapes_to_test_representation(inputShapes)), + ::testing::Values(std::vector{ ElementType::bf16, ElementType::bf16, ElementType::bf16, ElementType::bf16 }), + ::testing::ValuesIn(matMulIn0Precisions), + ::testing::ValuesIn(patternTypes), + ::testing::Values("MHA"), // Snippets don't support BF16 MHA pattern yet + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + MHATest::getTestCaseName); + } // namespace static std::shared_ptr initMHAQuantSubgraph0(std::vector& inputDynamicShapes, std::vector& inputPrecisions, @@ -425,7 +436,8 @@ public: std::vector matMulIn0Precisions; size_t patternType; std::string targetName; - std::tie(inputShapes, inputPrecisions, matMulIn0Precisions, patternType, targetName) = obj.param; + std::string expectedNode; + std::tie(inputShapes, inputPrecisions, matMulIn0Precisions, patternType, expectedNode, targetName) = obj.param; std::ostringstream results; results << "IS=("; @@ -445,6 +457,7 @@ public: results << "MatMulIn0PRC" << std::to_string(i) << "=" << matMulIn0Precisions[i] << "_"; } results << "patternType=" << patternType; + results << "expect=" << expectedNode; results << "targetDevice=" << targetName; return results.str(); @@ -474,7 +487,8 @@ protected: std::vector inputPrecisions; std::vector matMulIn0Precisions; size_t patternType; - std::tie(inputShapes, inputPrecisions, matMulIn0Precisions, patternType, targetDevice) = this->GetParam(); + std::string expectedNode; + std::tie(inputShapes, inputPrecisions, matMulIn0Precisions, patternType, expectedNode, targetDevice) = this->GetParam(); init_input_shapes(inputShapes); @@ -493,7 +507,8 @@ 
TEST_P(MHAQuantTest, CompareWithRefs) { std::vector inputPrecisions; std::vector matMulIn0Precisions; size_t patternType; - std::tie(inputShapes, inputPrecisions, matMulIn0Precisions, patternType, targetDevice) = this->GetParam(); + std::string expectedNode; + std::tie(inputShapes, inputPrecisions, matMulIn0Precisions, patternType, expectedNode, targetDevice) = this->GetParam(); if (inputPrecisions[0] == ElementType::bf16 && !InferenceEngine::with_cpu_x86_bfloat16()) GTEST_SKIP(); @@ -502,7 +517,7 @@ TEST_P(MHAQuantTest, CompareWithRefs) { GTEST_SKIP(); run(); - CheckNumberOfNodesWithType(compiledModel, "MHA", 1); + CheckNumberOfNodesWithType(compiledModel, expectedNode, 1); } namespace { @@ -538,6 +553,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_MHAQuant, MHAQuantTest, ::testing::ValuesIn(inputPrecisionsQuant), ::testing::ValuesIn(matMulIn0PrecisionsQuant), ::testing::ValuesIn(patternTypesQuant), + ::testing::Values("MHA"), // Snippets don't support Quantized MHA pattern yet ::testing::Values(CommonTestUtils::DEVICE_CPU)), MHAQuantTest::getTestCaseName); diff --git a/src/plugins/intel_cpu/tests/unit/ngraph_transformations/snipptes_mark_skipped.cpp b/src/plugins/intel_cpu/tests/unit/ngraph_transformations/snipptes_mark_skipped.cpp index aee3aff68bf..f3eaa9a38d6 100644 --- a/src/plugins/intel_cpu/tests/unit/ngraph_transformations/snipptes_mark_skipped.cpp +++ b/src/plugins/intel_cpu/tests/unit/ngraph_transformations/snipptes_mark_skipped.cpp @@ -6,7 +6,7 @@ #include #include #include -#include "snippets/pass/collapse_subgraph.hpp" +#include "snippets/pass/tokenization.hpp" namespace ov { namespace test { @@ -19,11 +19,17 @@ public: manager.register_pass(); manager.register_pass(); manager.register_pass(); + // + // todo: This is a temporary work-around. 
remove when MatMul tokenization is supported through general pipeline + manager.get_pass_config()->set_callback( + [](const std::shared_ptr& n) -> bool { + return ov::is_type(n); + }); } }; TEST_F(SnippetsMarkSkippedTests, smoke_Snippets_SkipAfterInputsMatMulEltwise) { - const auto &f = MatMulEltwiseBranchesFunction(std::vector {{1, 3, 4, 4}, {1, 3, 4, 4}}); + const auto &f = MatMulEltwiseBranchesFunction(std::vector {{1, 3, 4, 4}, {1, 3, 4, 4}}); function = f.getOriginal(); // Fully tokenizable, since inputs are followed by MatMul function_ref = f.getReference(); @@ -34,7 +40,7 @@ TEST_F(SnippetsMarkSkippedTests, smoke_Snippets_SkipConvFused_ConvMulActivation) std::vector> eltwiseOps {std::make_shared(), std::make_shared(), std::make_shared()}; - std::vector inputShapes {{1, 2, 16, 16}, {1, 2, 1, 16}}; + std::vector inputShapes {{1, 2, 16, 16}, {1, 2, 1, 16}}; const auto &f = ConvMulActivationFunction(inputShapes, eltwiseOps); function = f.getOriginal(); // Fully tokenizable, since Mul with 2 inputs isn't fused into Convolution @@ -46,7 +52,7 @@ TEST_F(SnippetsMarkSkippedTests, smoke_SkipConvFused_ConvSumActivation) { std::vector> eltwiseOps {std::make_shared(), std::make_shared(), std::make_shared()}; - std::vector inputShapes {{1, 2, 16, 16}, {1, 2, 1, 16}}; + std::vector inputShapes {{1, 2, 16, 16}, {1, 2, 1, 16}}; const auto &f = ConvMulActivationFunction(inputShapes, eltwiseOps); function = f.getOriginal(); // Not tokenizable, since Add + Eltwises can be fused into Convolution diff --git a/src/plugins/intel_cpu/tests/unit/snippets/fake_quantize_tokenization_test.cpp b/src/plugins/intel_cpu/tests/unit/snippets/fake_quantize_tokenization_test.cpp index e6f83cad753..518d2dfb1cc 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets/fake_quantize_tokenization_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets/fake_quantize_tokenization_test.cpp @@ -6,7 +6,7 @@ #include "common_test_utils/ngraph_test_utils.hpp" #include "snippets/pass/fq_decomposition.hpp" 
-#include "snippets/pass/collapse_subgraph.hpp" +#include "snippets/pass/tokenization.hpp" #include "fake_quantize_function.hpp" #include "snippets/op/subgraph.hpp" #include "ngraph_transformations/snippets_mark_skipped.hpp" diff --git a/src/tests/functional/plugin/shared/include/snippets/add.hpp b/src/tests/functional/plugin/shared/include/snippets/add.hpp index 7499d8ade45..1895d204df6 100644 --- a/src/tests/functional/plugin/shared/include/snippets/add.hpp +++ b/src/tests/functional/plugin/shared/include/snippets/add.hpp @@ -19,6 +19,14 @@ typedef std::tuple< std::string // Target Device > AddParams; +typedef std::tuple< + std::vector, // Input 0, Input 1 Shape + ov::element::Type, // Element type + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> AddParamsPair; + typedef std::tuple< ov::Shape, // Input 0 Shape ov::element::Type, // Element type @@ -49,6 +57,15 @@ protected: void SetUp() override; }; +// repack AddPair input shapes into shape vector to cover some cases easier +class AddPair : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); +protected: + void SetUp() override; +}; + } // namespace snippets } // namespace test } // namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/include/snippets/convert.hpp b/src/tests/functional/plugin/shared/include/snippets/convert.hpp index bd4d7641711..fe534480fc4 100644 --- a/src/tests/functional/plugin/shared/include/snippets/convert.hpp +++ b/src/tests/functional/plugin/shared/include/snippets/convert.hpp @@ -11,7 +11,7 @@ namespace test { namespace snippets { typedef std::tuple< - std::vector, // InputShapes + std::vector, // InputShapes std::pair, std::vector>, // Input and Output data types for Converts size_t, // Expected num nodes size_t, // Expected num subgraphs diff --git 
a/src/tests/functional/plugin/shared/include/snippets/matmul.hpp b/src/tests/functional/plugin/shared/include/snippets/matmul.hpp new file mode 100644 index 00000000000..bfa2a82921f --- /dev/null +++ b/src/tests/functional/plugin/shared/include/snippets/matmul.hpp @@ -0,0 +1,70 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/base/snippets_test_utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + std::vector, // Input Shapes + ov::element::Type, // Element type + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> MatMulParams; + +typedef std::tuple< + std::vector, // Input Shapes + size_t , // Transpose position + ov::element::Type, // Element type + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> TransposeMatMulParams; + +class MatMul : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + +class MatMulBias : public MatMul { +protected: + void SetUp() override; +}; + +class ExplicitTransposeMatMul : public MatMul { +protected: + void SetUp() override; +}; + +class ExplicitTransposeMatMulBias : public MatMul { +protected: + void SetUp() override; +}; + +class ExplicitTransposeMulMatMulBias : public MatMul { +protected: + void SetUp() override; +}; + +class TransposeMatMul : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/include/snippets/mha.hpp 
b/src/tests/functional/plugin/shared/include/snippets/mha.hpp new file mode 100644 index 00000000000..9f95dcc30ac --- /dev/null +++ b/src/tests/functional/plugin/shared/include/snippets/mha.hpp @@ -0,0 +1,47 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/base/snippets_test_utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + std::vector, // Input shapes + bool, // With Multiply + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> MHAParams; + + +class MHA : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; + + void generate_inputs(const std::vector& targetInputStaticShapes) override; +}; + +class MHASelect : public MHA { +protected: + void SetUp() override; + + void generate_inputs(const std::vector& targetInputStaticShapes) override; +}; + +class MHAWOTransposeOnInputs : public MHA { +protected: + void SetUp() override; +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/include/snippets/select.hpp b/src/tests/functional/plugin/shared/include/snippets/select.hpp new file mode 100644 index 00000000000..e8e15ab97e4 --- /dev/null +++ b/src/tests/functional/plugin/shared/include/snippets/select.hpp @@ -0,0 +1,59 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/base/snippets_test_utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + ov::Shape, // Input 0 Shape + ov::Shape, // Input 1 Shape + ov::Shape, // Input 2 Shape + ov::element::Type, // Element type + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> 
SelectParams; + +typedef std::tuple< + ov::Shape, // Input 0 Shape + ov::Shape, // Input 1 Shape + ov::Shape, // Input 2 Shape + ov::Shape, // Input 3 Shape + ov::element::Type, // Element type + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> BroadcastSelectParams; + +class Select : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; + + void generate_inputs(const std::vector& targetInputStaticShapes) override; +}; + +class BroadcastSelect : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; + + void generate_inputs(const std::vector& targetInputStaticShapes) override; +}; + + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/include/snippets/softmax.hpp b/src/tests/functional/plugin/shared/include/snippets/softmax.hpp new file mode 100644 index 00000000000..ca3f77e4319 --- /dev/null +++ b/src/tests/functional/plugin/shared/include/snippets/softmax.hpp @@ -0,0 +1,49 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/base/snippets_test_utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + ov::Shape, // Input 0 Shape + int, // Axis + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> SoftmaxParams; + +typedef std::tuple< + std::pair, // Input Shapes + int, // Axis + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> AddSoftmaxParams; + +class Softmax : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: 
+ static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + +class AddSoftmax : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/include/snippets/three_inputs_eltwise.hpp b/src/tests/functional/plugin/shared/include/snippets/three_inputs_eltwise.hpp index 2bb61b3b2b7..ce1fd7a8b5b 100644 --- a/src/tests/functional/plugin/shared/include/snippets/three_inputs_eltwise.hpp +++ b/src/tests/functional/plugin/shared/include/snippets/three_inputs_eltwise.hpp @@ -19,6 +19,15 @@ typedef std::tuple< std::string // Target Device > ThreeInputsEltwiseParams; +typedef std::tuple< + InputShape, // Input 0 Shape + InputShape, // Input 1 Shape + InputShape, // Input 2 Shape + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device + > ThreeInputsEltwiseDynamicParams; + class ThreeInputsEltwise : public testing::WithParamInterface, virtual public ov::test::SnippetsTestsCommon { public: @@ -28,7 +37,6 @@ protected: void SetUp() override; }; - } // namespace snippets } // namespace test } // namespace ov diff --git a/src/tests/functional/plugin/shared/include/snippets/transpose.hpp b/src/tests/functional/plugin/shared/include/snippets/transpose.hpp new file mode 100644 index 00000000000..e1491ebe8b1 --- /dev/null +++ b/src/tests/functional/plugin/shared/include/snippets/transpose.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/base/snippets_test_utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + ov::PartialShape, // Input 0 Shape + std::vector, // Transpose 
order + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> TransposeParams; + +class Transpose : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/include/snippets/transpose_matmul.hpp b/src/tests/functional/plugin/shared/include/snippets/transpose_matmul.hpp new file mode 100644 index 00000000000..f949e9df9d5 --- /dev/null +++ b/src/tests/functional/plugin/shared/include/snippets/transpose_matmul.hpp @@ -0,0 +1,33 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/base/snippets_test_utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + std::vector, // Input Shapes + size_t , // Transpose position + ov::element::Type, // Element type + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> TransposeMatMulParams; + +class TransposeMatMul : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/include/snippets/transpose_softmax.hpp b/src/tests/functional/plugin/shared/include/snippets/transpose_softmax.hpp new file mode 100644 index 00000000000..952b7528a00 --- /dev/null +++ b/src/tests/functional/plugin/shared/include/snippets/transpose_softmax.hpp @@ -0,0 +1,40 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + 
+#include "shared_test_classes/base/snippets_test_utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +typedef std::tuple< + std::vector, // Input shapes + std::vector, // Transpose Order + int64_t, // Softmax Axis + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> TransposeSoftmaxParams; + + +class TransposeSoftmax : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + void SetUp() override; +}; + +class TransposeSoftmaxEltwise : public TransposeSoftmax { +protected: + void SetUp() override; +}; + + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/include/snippets/two_inputs_and_outputs.hpp b/src/tests/functional/plugin/shared/include/snippets/two_inputs_and_outputs.hpp index 0a209de2fe9..4284ceacfa4 100644 --- a/src/tests/functional/plugin/shared/include/snippets/two_inputs_and_outputs.hpp +++ b/src/tests/functional/plugin/shared/include/snippets/two_inputs_and_outputs.hpp @@ -11,7 +11,7 @@ namespace test { namespace snippets { typedef std::tuple< - std::vector, // Input Shape All shapes + std::vector, // Input Shape All shapes size_t, // Expected num nodes size_t, // Expected num subgraphs std::string // Target Device diff --git a/src/tests/functional/plugin/shared/src/snippets/add.cpp b/src/tests/functional/plugin/shared/src/snippets/add.cpp index beb85401f52..bb6a5fbee60 100644 --- a/src/tests/functional/plugin/shared/src/snippets/add.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/add.cpp @@ -5,6 +5,8 @@ #include "common_test_utils/common_utils.hpp" #include "snippets/add.hpp" #include "subgraph_simple.hpp" +#include "ngraph_functions/builders.hpp" +#include "functional_test_utils/skip_tests_config.hpp" namespace ov { namespace test { @@ -76,6 +78,38 @@ void 
AddRollConst::SetUp() { setInferenceType(type); } +std::string AddPair::getTestCaseName(testing::TestParamInfo obj) { + std::vector input_shapes; + ov::element::Type type; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(input_shapes, type, num_nodes, num_subgraphs, targetDevice) = obj.param; + if (input_shapes.size() != 2) + IE_THROW() << "Invalid input shapes vector size"; + std::ostringstream result; + result << "IS[0]=" << CommonTestUtils::vec2str(input_shapes[0]) << "_"; + result << "IS[1]=" << CommonTestUtils::vec2str(input_shapes[1]) << "_"; + result << "T=" << type << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void AddPair::SetUp() { + std::vector input_shapes; + ov::element::Type type; + std::tie(input_shapes, type, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + std::vector is; + for (const auto& s : input_shapes) { + is.emplace_back(InputShape {{}, {s, }}); + } + init_input_shapes(is); + auto f = ov::test::snippets::AddFunction({input_shapes[0], input_shapes[1]}); + function = f.getOriginal(); + setInferenceType(type); +} + TEST_P(Add, CompareWithRefImpl) { run(); validateNumSubgraphs(); @@ -91,6 +125,10 @@ TEST_P(AddRollConst, CompareWithRefImpl) { validateNumSubgraphs(); } +TEST_P(AddPair, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} } // namespace snippets } // namespace test diff --git a/src/tests/functional/plugin/shared/src/snippets/convert.cpp b/src/tests/functional/plugin/shared/src/snippets/convert.cpp index b4c5c840cb6..60419d28b2f 100644 --- a/src/tests/functional/plugin/shared/src/snippets/convert.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/convert.cpp @@ -12,7 +12,7 @@ namespace test { namespace snippets { std::string Convert::getTestCaseName(testing::TestParamInfo obj) { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> 
types; std::string targetDevice; size_t num_nodes, num_subgraphs; @@ -21,7 +21,7 @@ std::string Convert::getTestCaseName(testing::TestParamInfo inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); - + init_input_shapes(static_partial_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertFunction(inputShape, types.first[0], types.second[0]); function = f.getOriginal(); output_type = types.second.front(); @@ -85,11 +84,10 @@ void Convert::generate_inputs(const std::vector& targetInputStaticSha } void ConvertInput::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); - + init_input_shapes(static_partial_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertInputFunction(inputShape, types.first[0], types.second[0]); function = f.getOriginal(); } @@ -125,10 +123,10 @@ parameters ConvertInput::generate_params_random() const { } void ConvertOutput::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(static_partial_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertOutputFunction(inputShape, types.first[0], types.second[0]); function = f.getOriginal(); @@ -136,10 +134,10 @@ void ConvertOutput::SetUp() { } void ConvertStub::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, 
targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(static_partial_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertStubFunction(inputShape, types.first[0], types.second[0]); function = f.getOriginal(); @@ -147,40 +145,40 @@ void ConvertStub::SetUp() { } void ConvertPartialInputsAndResults::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(static_partial_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertPartialInputsAndResultsFunction(inputShape, types.first, types.second); function = f.getOriginal(); } void ConvertManyOnInputs::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(static_partial_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertManyOnInputsFunction(inputShape, types.first); function = f.getOriginal(); } void ConvertManyOnOutputs::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(static_partial_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertManyOnOutputsFunction(inputShape, types.first); function = f.getOriginal(); } void ConvertManyOnInputOutput::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, 
ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(static_partial_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertManyOnInputOutputFunction(inputShape, types.first, types.second); function = f.getOriginal(); diff --git a/src/tests/functional/plugin/shared/src/snippets/matmul.cpp b/src/tests/functional/plugin/shared/src/snippets/matmul.cpp new file mode 100644 index 00000000000..36782e59ad7 --- /dev/null +++ b/src/tests/functional/plugin/shared/src/snippets/matmul.cpp @@ -0,0 +1,168 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/common_utils.hpp" +#include "snippets/matmul.hpp" +#include "subgraph_matmul.hpp" +#include "functional_test_utils/skip_tests_config.hpp" +#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::string MatMul::getTestCaseName(testing::TestParamInfo obj) { + std::vector input_shapes; + ov::element::Type elem_type; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(input_shapes, elem_type, num_nodes, num_subgraphs, targetDevice) = obj.param; + std::ostringstream result; + for (size_t i = 0; i < input_shapes.size(); i++) + result << "IS[" << i <<"]=" << CommonTestUtils::partialShape2str({input_shapes[i]}) << "_"; + result << "T=" << elem_type << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void MatMul::SetUp() { + std::vector input_shapes; + ov::element::Type elem_type; + std::tie(input_shapes, elem_type, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_partial_shapes_to_test_representation(input_shapes)); + + auto f = ov::test::snippets::MatMulFunction(input_shapes); + function = 
f.getOriginal(); + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +void MatMulBias::SetUp() { + std::vector input_shapes; + ov::element::Type elem_type; + std::tie(input_shapes, elem_type, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_partial_shapes_to_test_representation(input_shapes)); + + auto f = ov::test::snippets::MatMulBiasFunction(input_shapes); + function = f.getOriginal(); + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +void ExplicitTransposeMatMul::SetUp() { + std::vector input_shapes; + ov::element::Type elem_type; + std::tie(input_shapes, elem_type, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_partial_shapes_to_test_representation(input_shapes)); + + auto f = ov::test::snippets::TransposeMatMulFunction(input_shapes); + function = f.getOriginal(); + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +void ExplicitTransposeMatMulBias::SetUp() { + std::vector input_shapes; + ov::element::Type elem_type; + std::tie(input_shapes, elem_type, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_partial_shapes_to_test_representation(input_shapes)); + + auto f = ov::test::snippets::TransposeMatMulBiasFunction(input_shapes); + function = f.getOriginal(); + if 
(!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +void ExplicitTransposeMulMatMulBias::SetUp() { + std::vector input_shapes; + ov::element::Type elem_type; + std::tie(input_shapes, elem_type, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_partial_shapes_to_test_representation(input_shapes)); + + auto f = ov::test::snippets::TransposeMulMatMulBiasFunction(input_shapes); + function = f.getOriginal(); + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +std::string TransposeMatMul::getTestCaseName(testing::TestParamInfo obj) { + std::vector input_shapes; + size_t transpose_position; + ov::element::Type elem_type; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(input_shapes, transpose_position, elem_type, num_nodes, num_subgraphs, targetDevice) = obj.param; + if (input_shapes.size() != 2) + IE_THROW() << "Invalid input shapes vector size"; + std::ostringstream result; + result << "IS[0]=" << CommonTestUtils::partialShape2str({input_shapes[0]}) << "_"; + result << "IS[1]=" << CommonTestUtils::partialShape2str({input_shapes[1]}) << "_"; + result << "Pos=" << transpose_position << "_"; + result << "T=" << elem_type << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void TransposeMatMul::SetUp() { + std::vector input_shapes; + size_t transpose_position; + ov::element::Type elem_type; + std::tie(input_shapes, transpose_position, elem_type, ref_num_nodes, ref_num_subgraphs, targetDevice) = 
this->GetParam(); + init_input_shapes(static_partial_shapes_to_test_representation(input_shapes)); + + auto f = ov::test::snippets::Transpose0213MatMulFunction(input_shapes, transpose_position); + function = f.getOriginal(); + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +TEST_P(MatMul, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(MatMulBias, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(ExplicitTransposeMatMul, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(ExplicitTransposeMatMulBias, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(ExplicitTransposeMulMatMulBias, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(TransposeMatMul, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp b/src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp index 1061a2a4f1b..e0b4490ecbd 100644 --- a/src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp @@ -27,10 +27,10 @@ std::string MaxNumParamsEltwise::getTestCaseName(testing::TestParamInfoGetParam(); - std::vector expandedShapes(10, inputShape); + std::vector expandedShapes(10, inputShape); std::vector input_shapes; for (const auto& s : expandedShapes) { - input_shapes.emplace_back(InputShape {{}, {s, }}); + input_shapes.emplace_back(InputShape {{}, {s.get_shape(), }}); } init_input_shapes(input_shapes); diff --git a/src/tests/functional/plugin/shared/src/snippets/mha.cpp b/src/tests/functional/plugin/shared/src/snippets/mha.cpp new 
file mode 100644 index 00000000000..bb8d7e585a2 --- /dev/null +++ b/src/tests/functional/plugin/shared/src/snippets/mha.cpp @@ -0,0 +1,125 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/common_utils.hpp" +#include "snippets/mha.hpp" +#include "subgraph_mha.hpp" +#include "functional_test_utils/skip_tests_config.hpp" +#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" +#include + +namespace ov { +namespace test { +namespace snippets { + +std::string MHA::getTestCaseName(testing::TestParamInfo obj) { + std::vector inputShapes; + bool withMul; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes, withMul, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + for (size_t i = 0; i < inputShapes.size(); ++i) + result << "IS[" << i << "]=" << CommonTestUtils::partialShape2str({inputShapes[i]}) << "_"; + result << "Mul=" << withMul << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void MHA::SetUp() { + std::vector inputShapes; + bool withMul; + std::tie(inputShapes, withMul, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_partial_shapes_to_test_representation(inputShapes)); + + auto f = ov::test::snippets::MHAFunction(inputDynamicShapes, withMul); + function = f.getOriginal(); + + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +void MHA::generate_inputs(const std::vector& targetInputStaticShapes) { + inputs.clear(); + const auto& model_inputs = function->inputs(); + for (int i = 0; i < model_inputs.size(); ++i) { + const auto& model_input = model_inputs[i]; + 
ov::Tensor tensor; + tensor = ov::test::utils::create_and_fill_tensor_normal_distribution(model_input.get_element_type(), targetInputStaticShapes[i], 1.0f, 0.5f); + inputs.insert({model_input.get_node_shared_ptr(), tensor}); + } +} + +void MHASelect::SetUp() { + std::vector inputShapes; + bool withMul; + std::tie(inputShapes, withMul, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_partial_shapes_to_test_representation(inputShapes)); + + auto f = ov::test::snippets::MHASelectFunction(inputDynamicShapes); + function = f.getOriginal(); + + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +void MHASelect::generate_inputs(const std::vector& targetInputStaticShapes) { + inputs.clear(); + auto model_inputs = function->inputs(); + for (auto& model_input : model_inputs) { + const auto node_input = model_input.get_node_shared_ptr(); + const auto name = node_input->get_friendly_name(); + ov::Tensor tensor; + int seed = 0; + if (name.find("less") != std::string::npos) { + tensor = ov::test::utils::create_and_fill_tensor(model_input.get_element_type(), model_input.get_shape(), 5 + seed, -2, 10, seed++); + } else { + tensor = ov::test::utils::create_and_fill_tensor_normal_distribution(model_input.get_element_type(), model_input.get_shape(), 1.0f, 0.5f); + } + inputs.insert({node_input, tensor}); + } +} + +void MHAWOTransposeOnInputs::SetUp() { + std::vector inputShapes; + bool withMul; + std::tie(inputShapes, withMul, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_partial_shapes_to_test_representation(inputShapes)); + + auto f = ov::test::snippets::MHAWOTransposeOnInputsFunction(inputDynamicShapes); + function = f.getOriginal(); + + if 
(!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + + +TEST_P(MHA, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(MHASelect, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(MHAWOTransposeOnInputs, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/src/snippets/select.cpp b/src/tests/functional/plugin/shared/src/snippets/select.cpp new file mode 100644 index 00000000000..a2814a57890 --- /dev/null +++ b/src/tests/functional/plugin/shared/src/snippets/select.cpp @@ -0,0 +1,114 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/common_utils.hpp" +#include +#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" +#include "snippets/select.hpp" +#include "subgraph_simple.hpp" + +namespace ov { +namespace test { +namespace snippets { + +namespace { +void generate_data(std::map, ov::Tensor>& data_inputs, const std::vector>& model_inputs) { + data_inputs.clear(); + auto tensor_bool = ov::test::utils::create_and_fill_tensor(model_inputs[0].get_element_type(), model_inputs[0].get_shape(), 3, -1, 2); + auto tensor0 = ov::test::utils::create_and_fill_tensor(model_inputs[1].get_element_type(), model_inputs[1].get_shape(), 10, -10, 2); + auto tensor1 = ov::test::utils::create_and_fill_tensor(model_inputs[2].get_element_type(), model_inputs[2].get_shape(), 10, 0, 2); + data_inputs.insert({model_inputs[0].get_node_shared_ptr(), tensor_bool}); + data_inputs.insert({model_inputs[1].get_node_shared_ptr(), tensor0}); + data_inputs.insert({model_inputs[2].get_node_shared_ptr(), tensor1}); +} +} // namespace + +std::string 
Select::getTestCaseName(testing::TestParamInfo obj) { + ov::Shape inputShapes0, inputShapes1, inputShapes2; + ov::element::Type type; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes0, inputShapes1, inputShapes2, type, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes0) << "_"; + result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes1) << "_"; + result << "IS[2]=" << CommonTestUtils::vec2str(inputShapes2) << "_"; + result << "T=" << type << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void Select::SetUp() { + ov::Shape inputShape0, inputShape1, inputShape2; + ov::element::Type type; + std::tie(inputShape0, inputShape1, inputShape2, type, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_shapes_to_test_representation({inputShape0, inputShape1, inputShape2})); + + auto f = ov::test::snippets::SelectFunction({inputShape0, inputShape1, inputShape2}); + function = f.getOriginal(); + + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +void Select::generate_inputs(const std::vector& targetInputStaticShapes) { + generate_data(inputs, function->inputs()); +} + +std::string BroadcastSelect::getTestCaseName(testing::TestParamInfo obj) { + ov::Shape inputShapes0, inputShapes1, inputShapes2, broadcastShape; + ov::element::Type type; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes0, inputShapes1, inputShapes2, broadcastShape, type, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS[0]=" << 
CommonTestUtils::vec2str(inputShapes0) << "_"; + result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes1) << "_"; + result << "IS[2]=" << CommonTestUtils::vec2str(inputShapes2) << "_"; + result << "BS=" << CommonTestUtils::vec2str(broadcastShape) << "_"; + result << "T=" << type << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void BroadcastSelect::SetUp() { + ov::Shape inputShape0, inputShape1, inputShape2, broadcastShape; + ov::element::Type type; + std::tie(inputShape0, inputShape1, inputShape2, broadcastShape, type, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_shapes_to_test_representation({inputShape0, inputShape1, inputShape2})); + + auto f = ov::test::snippets::BroadcastSelectFunction({inputShape0, inputShape1, inputShape2}, broadcastShape); + function = f.getOriginal(); + + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +void BroadcastSelect::generate_inputs(const std::vector& targetInputStaticShapes) { + generate_data(inputs, function->inputs()); +} + +TEST_P(Select, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(BroadcastSelect, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/src/snippets/softmax.cpp b/src/tests/functional/plugin/shared/src/snippets/softmax.cpp new file mode 100644 index 00000000000..13b45283278 --- /dev/null +++ b/src/tests/functional/plugin/shared/src/snippets/softmax.cpp @@ -0,0 +1,91 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/common_utils.hpp" 
+#include "snippets/softmax.hpp" +#include "subgraph_softmax.hpp" +#include "ngraph_functions/builders.hpp" +#include "functional_test_utils/skip_tests_config.hpp" +#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::string Softmax::getTestCaseName(testing::TestParamInfo obj) { + ov::Shape inputShapes; + int axis; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes, axis, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_"; + result << "Axis=" << axis << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void Softmax::SetUp() { + ov::Shape inputShape; + int axis; + std::tie(inputShape, axis, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes({{{}, {inputShape, }}}); + + auto f = ov::test::snippets::SoftmaxFunction({inputShape}, axis); + function = f.getOriginal(); + + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +std::string AddSoftmax::getTestCaseName(testing::TestParamInfo obj) { + std::pair inputShapes; + int axis; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes, axis, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS[0]=" << CommonTestUtils::vec2str(inputShapes.first) << "_"; + result << "IS[1]=" << CommonTestUtils::vec2str(inputShapes.second) << "_"; + result << "Axis=" << axis << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + 
return result.str(); +} + +void AddSoftmax::SetUp() { + std::pair inputShapes; + int axis; + std::tie(inputShapes, axis, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes({{{}, {inputShapes.first, }}, {{}, {inputShapes.second, }}}); + + auto f = ov::test::snippets::AddSoftmaxFunction({inputShapes.first, inputShapes.second}, axis); + function = f.getOriginal(); + + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +TEST_P(Softmax, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(AddSoftmax, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp b/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp index 0c601cc8ebe..ad8db673983 100644 --- a/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp @@ -5,6 +5,7 @@ #include "common_test_utils/common_utils.hpp" #include "snippets/three_inputs_eltwise.hpp" #include "subgraph_simple.hpp" +#include "functional_test_utils/skip_tests_config.hpp" namespace ov { namespace test { diff --git a/src/tests/functional/plugin/shared/src/snippets/transpose.cpp b/src/tests/functional/plugin/shared/src/snippets/transpose.cpp new file mode 100644 index 00000000000..c5886fe74a8 --- /dev/null +++ b/src/tests/functional/plugin/shared/src/snippets/transpose.cpp @@ -0,0 +1,52 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/common_utils.hpp" +#include "snippets/transpose.hpp" +#include "subgraph_transpose.hpp" +#include 
"functional_test_utils/skip_tests_config.hpp" +#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::string Transpose::getTestCaseName(testing::TestParamInfo obj) { + ov::PartialShape inputShape; + std::vector order; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShape, order, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + result << "IS=" << CommonTestUtils::partialShape2str({inputShape}) << "_"; + result << "Order=" << CommonTestUtils::vec2str(order) << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void Transpose::SetUp() { + ov::PartialShape inputShape; + std::vector order; + std::tie(inputShape, order, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes({{{inputShape}, {inputShape.get_shape(), }}}); + + auto f = ov::test::snippets::TransposeFunction({inputShape}, order); + function = f.getOriginal(); + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +TEST_P(Transpose, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/src/snippets/transpose_matmul.cpp b/src/tests/functional/plugin/shared/src/snippets/transpose_matmul.cpp new file mode 100644 index 00000000000..68a2140339f --- /dev/null +++ b/src/tests/functional/plugin/shared/src/snippets/transpose_matmul.cpp @@ -0,0 +1,57 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/common_utils.hpp" +#include 
"snippets/transpose_matmul.hpp" +#include "subgraph_matmul.hpp" +#include "functional_test_utils/skip_tests_config.hpp" +#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::string TransposeMatMul::getTestCaseName(testing::TestParamInfo obj) { + std::vector input_shapes; + size_t transpose_position; + ov::element::Type elem_type; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(input_shapes, transpose_position, elem_type, num_nodes, num_subgraphs, targetDevice) = obj.param; + if (input_shapes.size() != 2) + IE_THROW() << "Invalid input shapes vector size"; + std::ostringstream result; + result << "IS[0]=" << CommonTestUtils::partialShape2str({input_shapes[0]}) << "_"; + result << "IS[1]=" << CommonTestUtils::partialShape2str({input_shapes[1]}) << "_"; + result << "Pos=" << transpose_position << "_"; + result << "T=" << elem_type << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void TransposeMatMul::SetUp() { + std::vector input_shapes; + size_t transpose_position; + ov::element::Type elem_type; + std::tie(input_shapes, transpose_position, elem_type, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_partial_shapes_to_test_representation(input_shapes)); + + auto f = ov::test::snippets::Transpose0213MatMulFunction(input_shapes, transpose_position); + function = f.getOriginal(); + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +TEST_P(TransposeMatMul, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +} // namespace snippets +} // namespace test +} // namespace ov diff --git 
a/src/tests/functional/plugin/shared/src/snippets/transpose_softmax.cpp b/src/tests/functional/plugin/shared/src/snippets/transpose_softmax.cpp new file mode 100644 index 00000000000..aecdd418f05 --- /dev/null +++ b/src/tests/functional/plugin/shared/src/snippets/transpose_softmax.cpp @@ -0,0 +1,82 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/common_utils.hpp" +#include "snippets/transpose_softmax.hpp" +#include "subgraph_softmax.hpp" +#include "ngraph_functions/builders.hpp" +#include "functional_test_utils/skip_tests_config.hpp" +#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::string TransposeSoftmax::getTestCaseName(testing::TestParamInfo obj) { + std::vector inputShapes; + std::vector order; + int axis; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(inputShapes, order, axis, num_nodes, num_subgraphs, targetDevice) = obj.param; + + std::ostringstream result; + for (size_t i = 0; i < inputShapes.size(); ++i) + result << "IS[" << i << "]=" << CommonTestUtils::vec2str(inputShapes[i]) << "_"; + result << "TO=" << CommonTestUtils::vec2str(order) << "_"; + result << "Axis=" << axis << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void TransposeSoftmax::SetUp() { + std::vector inputShapes; + std::vector order; + int64_t axis; + std::tie(inputShapes, order, axis, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_shapes_to_test_representation(inputShapes)); + + auto f = ov::test::snippets::TransposeSoftmaxFunction(inputDynamicShapes, order, axis); + function = f.getOriginal(); + + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + 
configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } +} + +void TransposeSoftmaxEltwise::SetUp() { + std::vector inputShapes; + std::vector order; + int64_t axis; + std::tie(inputShapes, order, axis, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + init_input_shapes(static_shapes_to_test_representation(inputShapes)); + + auto f = ov::test::snippets::TransposeSoftmaxEltwiseFunction(inputDynamicShapes, order, axis); + function = f.getOriginal(); + + if (!configuration.count(InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE)) { + configuration.insert({InferenceEngine::PluginConfigInternalParams::KEY_SNIPPETS_MODE, + InferenceEngine::PluginConfigInternalParams::IGNORE_CALLBACK}); + } + + abs_threshold = 1e-6; +} + +TEST_P(TransposeSoftmax, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + +TEST_P(TransposeSoftmaxEltwise, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} + + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/src/snippets/two_inputs_and_outputs.cpp b/src/tests/functional/plugin/shared/src/snippets/two_inputs_and_outputs.cpp index 205587e1a30..81b3c93079c 100644 --- a/src/tests/functional/plugin/shared/src/snippets/two_inputs_and_outputs.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/two_inputs_and_outputs.cpp @@ -11,14 +11,14 @@ namespace test { namespace snippets { std::string TwoInputsAndOutputs::getTestCaseName(testing::TestParamInfo obj) { - std::vector inputShapes; + std::vector inputShapes; std::string targetDevice; size_t num_nodes, num_subgraphs; std::tie(inputShapes, num_nodes, num_subgraphs, targetDevice) = obj.param; std::ostringstream result; for (auto i = 0; i < inputShapes.size(); i++) - result << "IS[" << i << "]=" << CommonTestUtils::vec2str(inputShapes[i]) << "_"; + result << "IS[" << i << "]=" << 
CommonTestUtils::vec2str(inputShapes[i].get_shape()) << "_"; result << "#N=" << num_nodes << "_"; result << "#S=" << num_subgraphs << "_"; result << "targetDevice=" << targetDevice; @@ -26,9 +26,9 @@ std::string TwoInputsAndOutputs::getTestCaseName(testing::TestParamInfo inputShape; + std::vector inputShape; std::tie(inputShape, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(static_partial_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::TwoInputsAndOutputsFunction(inputShape); function = f.getOriginal(); } diff --git a/src/tests/functional/shared_test_classes/src/base/utils/generate_inputs.cpp b/src/tests/functional/shared_test_classes/src/base/utils/generate_inputs.cpp index f4ccb9480e3..88a5f861aff 100644 --- a/src/tests/functional/shared_test_classes/src/base/utils/generate_inputs.cpp +++ b/src/tests/functional/shared_test_classes/src/base/utils/generate_inputs.cpp @@ -43,6 +43,11 @@ ov::runtime::Tensor generate(const std::shared_ptr& node, } namespace Activation { +// todo: this is a bug fixed! Merge it separately. +// Default parameters InputGenerateData(10, 20, 32768, 1) lead to input generation according to 10 + x/32768, +// where x {0, 20}, so all generated values are in the range [10, 10 + 6.1e-4]. 
+// Thus all the interval more-or-less fall within the uncertainty validation interval +// Fix let the range be at least 20x of resolution ov::runtime::Tensor generate(const ov::element::Type& elemType, const ov::Shape& targetShape, InputGenerateData inGenData = InputGenerateData(-1, 2*32768, 32768, 1)) { diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/snippets_helpers.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/snippets_helpers.hpp index 68986aea9ca..8bf96f6c99a 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/snippets_helpers.hpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/snippets_helpers.hpp @@ -18,7 +18,7 @@ class SnippetsFunctionBase { public: SnippetsFunctionBase() = delete; - explicit SnippetsFunctionBase(const std::vector& inputShapes, ov::element::Type_t precision = element::f32) + explicit SnippetsFunctionBase(const std::vector& inputShapes, ov::element::Type_t precision = element::f32) : input_shapes{inputShapes}, precision{precision} {}; std::shared_ptr getReference() const { @@ -53,7 +53,7 @@ protected: } const ov::element::Type_t precision; - const std::vector input_shapes; + const std::vector input_shapes; virtual void validate_function(const std::shared_ptr &f) const; }; @@ -67,7 +67,7 @@ protected: class SnippetsFunctionCustomizable : public SnippetsFunctionBase { public: SnippetsFunctionCustomizable() = delete; - SnippetsFunctionCustomizable(const std::vector& inputShapes, + SnippetsFunctionCustomizable(const std::vector& inputShapes, const std::vector>& customOps, const std::vector&& customOpsNumInputs); diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_converts.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_converts.hpp index a7c6bd34e0f..526234409b3 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_converts.hpp +++ 
b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_converts.hpp @@ -22,7 +22,7 @@ namespace snippets { // Result class ConvertFunction : public SnippetsFunctionBase { public: - explicit ConvertFunction(const std::vector& inputShapes, + explicit ConvertFunction(const std::vector& inputShapes, const ov::element::Type inType = ov::element::f32, const ov::element::Type outType = ov::element::u8) : SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) { @@ -45,7 +45,7 @@ protected: // Result class ConvertInputFunction : public SnippetsFunctionBase { public: - explicit ConvertInputFunction(const std::vector& inputShapes, + explicit ConvertInputFunction(const std::vector& inputShapes, const ov::element::Type inType = ov::element::f32, const ov::element::Type outType = ov::element::u8) : SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) { @@ -67,7 +67,7 @@ protected: // Result class ConvertOutputFunction : public SnippetsFunctionBase { public: - explicit ConvertOutputFunction(const std::vector& inputShapes, + explicit ConvertOutputFunction(const std::vector& inputShapes, const ov::element::Type inType = ov::element::f32, const ov::element::Type outType = ov::element::i8) : SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) { @@ -91,7 +91,7 @@ protected: // Result Result class ConvertStubFunction : public SnippetsFunctionBase { public: - explicit ConvertStubFunction(const std::vector& inputShapes, + explicit ConvertStubFunction(const std::vector& inputShapes, const ov::element::Type inType = ov::element::f32, const ov::element::Type outType = ov::element::i8) : SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) { @@ -117,7 +117,7 @@ protected: // Result2 class ConvertPartialInputsAndResultsFunction : public SnippetsFunctionBase { public: - explicit ConvertPartialInputsAndResultsFunction(const std::vector& inputShapes, + explicit ConvertPartialInputsAndResultsFunction(const std::vector& 
inputShapes, const std::vector& inTypes = {ov::element::f32}, const std::vector& outTypes = {ov::element::f32}) : SnippetsFunctionBase(inputShapes), inTypes(inTypes), outTypes(outTypes) { @@ -142,7 +142,7 @@ protected: // Result class ConvertManyOnInputsFunction : public SnippetsFunctionBase { public: - explicit ConvertManyOnInputsFunction(const std::vector& inputShapes, const std::vector& types) + explicit ConvertManyOnInputsFunction(const std::vector& inputShapes, const std::vector& types) : SnippetsFunctionBase(inputShapes), types(types) { NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); NGRAPH_CHECK(types.size() > 1, "Got invalid number of element types"); @@ -165,7 +165,7 @@ protected: // Result Result class ConvertManyOnOutputsFunction : public SnippetsFunctionBase { public: - explicit ConvertManyOnOutputsFunction(const std::vector& inputShapes, const std::vector& types) + explicit ConvertManyOnOutputsFunction(const std::vector& inputShapes, const std::vector& types) : SnippetsFunctionBase(inputShapes), types(types) { NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); NGRAPH_CHECK(types.size() > 1, "Got invalid number of element types"); @@ -191,7 +191,7 @@ protected: // Result Result class ConvertManyOnInputOutputFunction : public SnippetsFunctionBase { public: - explicit ConvertManyOnInputOutputFunction(const std::vector& inputShapes, + explicit ConvertManyOnInputOutputFunction(const std::vector& inputShapes, const std::vector& inTypes, const std::vector& outTypes) : SnippetsFunctionBase(inputShapes), inTypes(inTypes), outTypes(outTypes) { diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_customizable.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_customizable.hpp index b663c22671f..3cbcfdac4a5 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_customizable.hpp +++ 
b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_customizable.hpp @@ -28,7 +28,7 @@ namespace snippets { // Result class ConvMulActivationFunction : public SnippetsFunctionCustomizable { public: - explicit ConvMulActivationFunction(const std::vector& inputShapes, const std::vector>& customOps) + explicit ConvMulActivationFunction(const std::vector& inputShapes, const std::vector>& customOps) : SnippetsFunctionCustomizable(inputShapes, customOps, {2, 1, 1}) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); NGRAPH_CHECK(input_shapes[0].size() == 4, "Only 4D input shapes are currently supported"); @@ -36,6 +36,7 @@ public: ov::op::util::is_unary_elementwise_arithmetic(customOps[1]) && ov::op::util::is_unary_elementwise_arithmetic(customOps[2]), "Got invalid custom ops: expected binary and two unary operations"); + NGRAPH_CHECK(input_shapes[0].is_static() && input_shapes[1].is_static(), "This test supports only static shapes"); } private: std::shared_ptr initOriginal() const override; diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp index fad086acf03..c583b5882ab 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp @@ -8,8 +8,10 @@ #include "snippets_helpers.hpp" #include "subgraph_simple.hpp" #include "subgraph_converts.hpp" +#include "subgraph_matmul.hpp" +#include "subgraph_softmax.hpp" -/* This file provides lowered representations (after the generate() was calles) for some simple functions. +/* This file provides lowered representations (after the generate() was called) for some simple functions. * This is required to test snippets lowering and optimization passes. 
All the functions are expected to be direct * descendants of SnippetsFunctionCustomizable (defined here) and one of the SnippetsFunctionBase derived classes * (declared in subgraph_simple.hpp). Note that the corresponding SnippetsFunctionBase child should use virtual inheritance @@ -22,7 +24,7 @@ namespace snippets { class AddFunctionLoweredBroadcast : public AddFunction { public: - explicit AddFunctionLoweredBroadcast(const std::vector& inputShapes, const std::vector& broadcastShapes) : + explicit AddFunctionLoweredBroadcast(const std::vector& inputShapes, const std::vector& broadcastShapes) : AddFunction(inputShapes), broadcast_shapes{broadcastShapes} { NGRAPH_CHECK(input_shapes.size() == broadcast_shapes.size(), "Broadcast shapes should have the same size as input_shapes"); @@ -37,10 +39,12 @@ private: class EltwiseThreeInputsLoweredFunction : public EltwiseThreeInputsFunction { public: - explicit EltwiseThreeInputsLoweredFunction(const std::vector& inputShapes, const std::vector& broadcastShapes) : + explicit EltwiseThreeInputsLoweredFunction(const std::vector& inputShapes, const std::vector& broadcastShapes) : EltwiseThreeInputsFunction(inputShapes), broadcast_shapes{broadcastShapes} { NGRAPH_CHECK(input_shapes.size() == broadcast_shapes.size(), "Broadcast shapes should have the same size as input_shapes"); + NGRAPH_CHECK(input_shapes[0].is_static() && input_shapes[1].is_static() && input_shapes[2].is_static(), + "Broadcast shapes should have the same size as input_shapes"); } protected: @@ -49,6 +53,41 @@ private: std::vector broadcast_shapes; }; +class Transpose0213MatMulLoweredFunction : public Transpose0213MatMulFunction { +public: + explicit Transpose0213MatMulLoweredFunction(const std::vector& inputShapes, size_t position = 0) : + Transpose0213MatMulFunction(inputShapes, position) { + } +protected: + std::shared_ptr initLowered() const override; +}; + +class SoftmaxLoweredFunction : public SoftmaxFunction { +public: + explicit 
SoftmaxLoweredFunction(const std::vector& inputShapes, int axis) : SoftmaxFunction(inputShapes, axis) {} + +protected: + std::shared_ptr initLowered() const override; +}; + +// With LoopFusion pass +class AddSoftmaxLoweredFunction : public AddSoftmaxFunction { +public: + explicit AddSoftmaxLoweredFunction(const std::vector& inputShapes, int axis) : AddSoftmaxFunction(inputShapes, axis) {} + +protected: + std::shared_ptr initLowered() const override; +}; + +class BroadcastAddLoweredFunction : public BroadcastAddFunction { +public: + explicit BroadcastAddLoweredFunction(const std::vector& inputShapes, const PartialShape& targetShape) : + BroadcastAddFunction(inputShapes, targetShape) {} + +protected: + std::shared_ptr initLowered() const override; +}; + } // namespace snippets } // namespace test } // namespace ov diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_matmul.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_matmul.hpp new file mode 100644 index 00000000000..ea533334e80 --- /dev/null +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_matmul.hpp @@ -0,0 +1,96 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/ngraph.hpp" +#include "./snippets_helpers.hpp" + +/* This file contains definitions of relatively simple functions (models) that will be used + * to test snippets-specific behavior. All the functions are expected to be direct descendants of + * SnippetsFunctionBase, so their constructors take only one (inputShapes) argument. 
+ */ + +namespace ov { +namespace test { +namespace snippets { +/// Minimal graph to test MatMul support +/// Tokenized simply by starting subgraph, +// in1 in2 +// Matmul +// Result +// todo: remove once "no subgraph after input" limitation is relaxed +class MatMulFunction : public SnippetsFunctionBase { +public: + explicit MatMulFunction(const std::vector& inputShapes) + : SnippetsFunctionBase(inputShapes) { + NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + std::shared_ptr initReference() const override; +}; + +// As same as MatMulFunction but with biases +class MatMulBiasFunction : public SnippetsFunctionBase { +public: + explicit MatMulBiasFunction(const std::vector& inputShapes) + : SnippetsFunctionBase(inputShapes) { + NGRAPH_CHECK(input_shapes.size() == 3, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; +}; + +/// Minimal graph to test MatMul+Transpose combinations. Transpose location is specified via the position argument: +/// 0 - before the first MatMul input; 1 - before the second MatMul input; 2 - after the MatMul output. 
+/// Tokenized simply by starting subgraph, +// in1 in2 +// Transpose / +// Matmul +// Result +class Transpose0213MatMulFunction : public SnippetsFunctionBase { +public: + explicit Transpose0213MatMulFunction(const std::vector& inputShapes, size_t position = 0) + : SnippetsFunctionBase(inputShapes), transpose_position(position) { + NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); + NGRAPH_CHECK(input_shapes[0].rank().get_length() == 4 && input_shapes[1].rank().get_length() == 4, + "Only rank 4 input shapes are supported by this test"); + NGRAPH_CHECK(transpose_position >=0 && transpose_position <= 2, "Got invalid transpose position"); + } +protected: + std::shared_ptr initOriginal() const override; + size_t transpose_position; +}; + +class TransposeMatMulFunction : public SnippetsFunctionBase { +public: + explicit TransposeMatMulFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; +}; + +class TransposeMatMulBiasFunction : public SnippetsFunctionBase { +public: + explicit TransposeMatMulBiasFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + NGRAPH_CHECK(input_shapes.size() == 3, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; +}; + +class TransposeMulMatMulBiasFunction : public SnippetsFunctionBase { +public: + explicit TransposeMulMatMulBiasFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + NGRAPH_CHECK(input_shapes.size() == 4, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_mha.hpp 
b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_mha.hpp new file mode 100644 index 00000000000..309a32e9145 --- /dev/null +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_mha.hpp @@ -0,0 +1,131 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/ngraph.hpp" +#include "snippets_helpers.hpp" + + +/* The file contains graphs with different MHA-patterns: + * Skeleton on MHA-pattern is: + * \ / + * MatMul0 + * | + * Eltwise/Select/Reshape/FakeQuantize + * | + * Softmax + * | + * Eltwise/Select/Reshape/FakeQuantize + * \ / + * MatMul1 + */ + +namespace ov { +namespace test { +namespace snippets { + +/* Graph: + * Transpose1[0,2,3,1] Constant + * \ / + * Transpose0[0,2,1,3] Multiply [with_mul = true] + * \ / + * MatMul0 + * \ / + * Add + * Reshape0 + * Softmax + * Reshape1 Transpose2[0,2,1,3] + * \ / + * MatMul1 + * Transpose3[0,2,1,3] + */ +class MHAFunction : public SnippetsFunctionBase { +public: + explicit MHAFunction(const std::vector& inputShapes, bool with_mul = true) + : SnippetsFunctionBase(inputShapes), with_mul(with_mul) { + NGRAPH_CHECK(input_shapes.size() == 4, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + std::shared_ptr initReference() const override; + + bool with_mul = true; +}; + +/* Graph: + * Transpose1[0,2,1,3] Constant + * \ / + * Transpose0[0,2,1,3] Multiply + * \ / + * MatMul0 [transposed_b = true] + * \ / + * Add + * Reshape0 + * Softmax + * Reshape1 Transpose2[0,2,1,3] + * \ / + * MatMul1 + * Transpose3[0,2,1,3] + */ +class MHAMatMul0TransposeFunction : public SnippetsFunctionBase { +public: + explicit MHAMatMul0TransposeFunction(const std::vector& inputShapes) + : SnippetsFunctionBase(inputShapes) { + NGRAPH_CHECK(input_shapes.size() == 4, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + std::shared_ptr 
initReference() const override; +}; + +/* Graph: + * Transpose1[0,2,3,1] Constant + * \ / + * Transpose0[0,2,1,3] Multiply + * \ \ / + * Broadcast Scalar MatMul0 + * \ | / + * Select + * Reshape0 + * Softmax + * Reshape1 Transpose2[0,2,1,3] + * \ / + * MatMul1 + * Transpose3[0,2,1,3] + */ +class MHASelectFunction : public SnippetsFunctionBase { +public: + explicit MHASelectFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + NGRAPH_CHECK(input_shapes.size() == 6, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; +}; + +/* Graph: + * Constant + * \ / + * Multiply + * \ / + * MatMul0 + * | + * Softmax + * \ / + * MatMul1 + * | + * Transpose3[0,2,1,3] + */ +class MHAWOTransposeOnInputsFunction : public SnippetsFunctionBase { +public: + explicit MHAWOTransposeOnInputsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + NGRAPH_CHECK(input_shapes.size() == 3, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp index 6ebc6acd7d7..b62719917ae 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp @@ -22,7 +22,7 @@ namespace snippets { // Result class AddFunction : public SnippetsFunctionBase { public: - explicit AddFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit AddFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); } protected: @@ -36,8 +36,9 @@ protected: // todo: remove Sinh once "no subgraph after input" 
limitation is relaxed class AddConstFunction : public SnippetsFunctionBase { public: - explicit AddConstFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit AddConstFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); + NGRAPH_CHECK(input_shapes[0].is_static(), "This test supports only static shapes"); } protected: std::shared_ptr initOriginal() const override; @@ -53,8 +54,9 @@ protected: // The function is needed to check different input element types (model precision change) class AddRollConstFunction : public SnippetsFunctionBase { public: - explicit AddRollConstFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit AddRollConstFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); + NGRAPH_CHECK(input_shapes[0].is_static(), "Only static shapes are supported"); } protected: std::shared_ptr initOriginal() const override; @@ -69,7 +71,7 @@ protected: // Result class EltwiseFunction : public SnippetsFunctionBase { public: - explicit EltwiseFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit EltwiseFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); } protected: @@ -84,7 +86,7 @@ protected: // Result class EltwiseThreeInputsFunction : public SnippetsFunctionBase { public: - explicit EltwiseThreeInputsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit EltwiseThreeInputsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 3, "Got invalid number of input shapes"); } protected: @@ -99,7 +101,7 @@ protected: // Result class EltwiseMaxNumParamsFunction : 
public SnippetsFunctionBase { public: - explicit EltwiseMaxNumParamsFunction(const std::vector& inputShapes) : + explicit EltwiseMaxNumParamsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 10, "Got invalid number of input shapes"); } @@ -115,7 +117,7 @@ protected: // Result class MatMulEltwiseBranchesFunction : public SnippetsFunctionBase { public: - explicit MatMulEltwiseBranchesFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit MatMulEltwiseBranchesFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); NGRAPH_CHECK(input_shapes[0].size() == 4 && input_shapes[1].size() == 4, "Only 4D input shapes are currently supported by this test"); @@ -123,6 +125,7 @@ public: // Note that single-element constant are not supported by the test, since they'll be converted // to snippets::op::Scalar. So a more comlex logics is required to produce reference function. 
NGRAPH_CHECK(input_shapes[0][1] == input_shapes[1][1], "Channel dimensions must be equal and != 1"); + NGRAPH_CHECK(input_shapes[0].is_static() && input_shapes[1].is_static(), "This test supports only static shapes"); } protected: @@ -138,7 +141,7 @@ protected: // Result class EltwiseLogLoopFunction : public SnippetsFunctionBase { public: - explicit EltwiseLogLoopFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit EltwiseLogLoopFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); } protected: @@ -155,7 +158,7 @@ protected: // Result class EltwiseTwoResultsFunction : public SnippetsFunctionBase { public: - explicit EltwiseTwoResultsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit EltwiseTwoResultsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); } protected: @@ -172,12 +175,58 @@ protected: // Result class TwoInputsAndOutputsFunction : public SnippetsFunctionBase { public: - explicit TwoInputsAndOutputsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit TwoInputsAndOutputsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); } protected: std::shared_ptr initOriginal() const override; }; +/// Verify Select +// in0 in1 in2 +// \ | / +// Select +// Result +class SelectFunction : public SnippetsFunctionBase { +public: + explicit SelectFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + NGRAPH_CHECK(input_shapes.size() == 3, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; +}; +/// Verify Broadcast in passes +// in0 in1 +// Broadcast | +// \ / +// Add +// Result 
+class BroadcastAddFunction : public SnippetsFunctionBase { +public: + explicit BroadcastAddFunction(const std::vector& inputShapes, const PartialShape& targetShape) + : SnippetsFunctionBase(inputShapes), m_target_shape(targetShape) { + NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + + PartialShape m_target_shape; +}; + +/// Verify Select + Broadcast +// in0 in1 in2 +// Broadcast | | +// \ | / +// Select +// Result +class BroadcastSelectFunction : public SelectFunction { +public: + explicit BroadcastSelectFunction(const std::vector& inputShapes, const PartialShape& targetShape) + : SelectFunction(inputShapes), m_target_shape(targetShape) {} +protected: + std::shared_ptr initOriginal() const override; + + PartialShape m_target_shape; +}; } // namespace snippets } // namespace test } // namespace ov diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_softmax.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_softmax.hpp new file mode 100644 index 00000000000..90cec1a1a3c --- /dev/null +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_softmax.hpp @@ -0,0 +1,57 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/ngraph.hpp" +#include "./snippets_helpers.hpp" + +namespace ov { +namespace test { +namespace snippets { + +class SoftmaxFunction : public SnippetsFunctionBase { +public: + explicit SoftmaxFunction(const std::vector& inputShapes, int axis) : SnippetsFunctionBase(inputShapes), axis(axis) { + NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + int axis; +}; + +class AddSoftmaxFunction : public SnippetsFunctionBase { +public: + explicit AddSoftmaxFunction(const std::vector& inputShapes, int axis) : SnippetsFunctionBase(inputShapes), 
axis(axis) { + NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + int axis; +}; + +class TransposeSoftmaxFunction : public SnippetsFunctionBase { +public: + explicit TransposeSoftmaxFunction(const std::vector& inputShapes, const std::vector& order, const int64_t axis) + : SnippetsFunctionBase(inputShapes), m_order(order), m_axis(axis) { + NGRAPH_CHECK(input_shapes.size() > 0, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + + std::vector m_order; + int64_t m_axis; +}; + +class TransposeSoftmaxEltwiseFunction : public TransposeSoftmaxFunction { +public: + explicit TransposeSoftmaxEltwiseFunction(const std::vector& inputShapes, const std::vector& order, const int64_t axis) + : TransposeSoftmaxFunction(inputShapes, order, axis) {} +protected: + std::shared_ptr initOriginal() const override; +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_transpose.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_transpose.hpp new file mode 100644 index 00000000000..b77ea54e257 --- /dev/null +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_transpose.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/ngraph.hpp" +#include "snippets_helpers.hpp" + +/* This file contains definitions of relatively simple functions (models) that will be used + * to test snippets-specific behavior. All the functions are expected to be direct descendants of + * SnippetsFunctionBase, so their constructors take only one (inputShapes) argument. 
+ */ + +namespace ov { +namespace test { +namespace snippets { +/// Minimal graph to test Transpose support: Parameter->Sinh->Transpose->Result +/// Tokenized simply by starting subgraph, supported through TransposeDecomposition +// in1 Const(order) +// Transpose +// Result +class TransposeFunction : public SnippetsFunctionBase { +public: + explicit TransposeFunction(const std::vector& inputShapes, std::vector order) + : SnippetsFunctionBase(inputShapes), order(std::move(order)) { + NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); + } +protected: + std::shared_ptr initOriginal() const override; + std::shared_ptr initReference() const override; + std::vector order; +}; +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/snippets_helpers.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/snippets_helpers.cpp index ff7cdc986a5..8cec4a4aca9 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/snippets_helpers.cpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/snippets_helpers.cpp @@ -15,11 +15,11 @@ void SnippetsFunctionBase::validate_function(const std::shared_ptr &f) co NGRAPH_CHECK(params.size() == input_shapes.size(), "Passed input shapes and produced function are inconsistent."); for (size_t i = 0; i < input_shapes.size(); i++) - NGRAPH_CHECK(std::equal(input_shapes[i].begin(), input_shapes[i].end(), params[i]->get_shape().begin()), + NGRAPH_CHECK(std::equal(input_shapes[i].begin(), input_shapes[i].end(), params[i]->get_partial_shape().begin()), "Passed input shapes and produced function are inconsistent."); } -SnippetsFunctionCustomizable::SnippetsFunctionCustomizable(const std::vector& inputShapes, +SnippetsFunctionCustomizable::SnippetsFunctionCustomizable(const std::vector& inputShapes, const std::vector>& customOps, const std::vector&& customOpsNumInputs) : SnippetsFunctionBase(inputShapes), 
custom_ops{customOps}, custom_ops_num_inputs{customOpsNumInputs} { diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_customizable.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_customizable.cpp index ccf1ce4081e..9975f5185c1 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_customizable.cpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_customizable.cpp @@ -13,7 +13,7 @@ namespace snippets { std::shared_ptr ConvMulActivationFunction::initOriginal() const { auto conv_param = std::make_shared(precision, input_shapes[0]); - const auto channels = input_shapes[0][1]; + const auto channels = static_cast(input_shapes[0][1].get_length()); ngraph::Shape strides(2, 1); std::vector pad_begin(2, 1), pad_end(2, 1); const Shape const_shape {channels, channels, 3, 3}; @@ -37,7 +37,7 @@ std::shared_ptr ConvMulActivationFunction::initReference() const { auto conv_param = std::make_shared(precision, input_shapes[0]); ngraph::Shape strides(2, 1); std::vector pad_begin(2, 1), pad_end(2, 1); - const auto channels = input_shapes[0][1]; + const auto channels = static_cast(input_shapes[0][1].get_length()); const Shape const_shape {channels, channels, 3, 3}; const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(const_shape), -10., 10.); auto weights = std::make_shared(precision, const_shape, const_values); diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp index 8fd664b1921..22b86982e9e 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp @@ -6,6 +6,7 @@ #include "common_test_utils/data_utils.hpp" #include #include "ngraph_functions/builders.hpp" +#include "snippets/pass/loop_helpers.hpp" namespace ov { namespace test { @@ -14,7 
+15,7 @@ namespace snippets { std::shared_ptr AddFunctionLoweredBroadcast::initLowered() const { auto data0 = std::make_shared(precision, input_shapes[0]); std::shared_ptr add_input0 = nullptr; - if (!broadcast_shapes[0].empty() && broadcast_shapes[0].back() != input_shapes[0].back()) { + if (!broadcast_shapes[0].empty() && broadcast_shapes[0].back() != input_shapes[0].rbegin()->get_length()) { add_input0 = std::make_shared(data0, broadcast_shapes[0]); } else { add_input0 = std::make_shared(data0); @@ -22,18 +23,38 @@ std::shared_ptr AddFunctionLoweredBroadcast::initLowered() const { auto data1 = std::make_shared(precision, input_shapes[1]); std::shared_ptr add_input1 = nullptr; - if (!broadcast_shapes[1].empty() && broadcast_shapes[1].back() != input_shapes[1].back()) { + if (!broadcast_shapes[1].empty() && broadcast_shapes[1].back() != input_shapes[1].rbegin()->get_length()) { add_input1 = std::make_shared(data1, broadcast_shapes[1]); } else { add_input1 = std::make_shared(data1); } auto add = std::make_shared(add_input0, add_input1); auto store = std::make_shared(add); - return std::make_shared(NodeVector{store}, ParameterVector{data0, data1}); + ParameterVector input_params {data0, data1}; + auto model = std::make_shared(NodeVector{store}, input_params); + + // Create dummy scheduler to pass graph comparison tests + // Note that if there is more than one results, they should be reverted + ResultVector results({model->get_results()[0]}); + const auto& inner_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); + std::vector apply_increments(input_params.size() + results.size(), true); + insertLoopEnd(results, inner_loop_begin, 1, 1, apply_increments); + auto outer_WA = std::accumulate(input_shapes.begin(), input_shapes.end(), 0, + [](int64_t max_val, const PartialShape& ps) { + return std::max(ps[ps.size() - 2].get_length(), max_val); + }); + if (outer_WA > 1) { + const auto& outer_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); + 
insertLoopEnd(results, outer_loop_begin, 1, 1, apply_increments); + } + return model; } std::shared_ptr EltwiseThreeInputsLoweredFunction::initLowered() const { // todo: implement conversion between std::vector and std::vector - auto input_params = ngraph::builder::makeParams(precision, {input_shapes[0], input_shapes[1], input_shapes[2]}); + auto input_params = ngraph::builder::makeParams(precision, + {input_shapes[0].get_shape(), + input_shapes[1].get_shape(), + input_shapes[2].get_shape()}); auto load_or_broadcastload = [&](size_t i) -> std::shared_ptr { // user specified that no broadcasting is required if (broadcast_shapes[i].empty()) { @@ -41,7 +62,7 @@ std::shared_ptr EltwiseThreeInputsLoweredFunction::initLowered() cons // broadcasting is required: could be Load + BroadcastMove or BroiadcastLoad } else { // The last dim is processed by vector Tile, so BroadcastLoad is required if the last dim being broadcasted - if (input_shapes[i].back() == 1 && broadcast_shapes[i].back() != 1) { + if (input_shapes[i].rbegin()->get_length() == 1 && broadcast_shapes[i].back() != 1) { return std::make_shared(input_params[i], broadcast_shapes[i]); // Todo: Cover this logics with functional tests, Review FakeBroadcast Emitter // Broadcasting of other dims is handled by BroadcastMove. 
Strictly speaking, broadcasting is achieved via @@ -57,12 +78,6 @@ std::shared_ptr EltwiseThreeInputsLoweredFunction::initLowered() cons const std::vector const_values = CommonTestUtils::generate_float_numbers(1, -10., 10.); auto sub_scalar = std::make_shared(precision, Shape{1}, const_values[0]); std::shared_ptr sub_load; -// Todo: Uncomment when invalid read in vector tile will be fixed -// if (input_shapes[2].back() == 1) -// sub_load = std::make_shared(input_params[2]); -// else -// sub_load = std::make_shared(input_params[2]); -// remove when the code above is enabled: sub_load = std::make_shared(input_params[2]); auto sub = std::make_shared(sub_load, sub_scalar); std::shared_ptr sub_out; @@ -72,7 +87,334 @@ std::shared_ptr EltwiseThreeInputsLoweredFunction::initLowered() cons sub_out = std::make_shared(sub, broadcast_shapes[2]); auto mul = std::make_shared(add, sub_out); auto store = std::make_shared(mul); - return std::make_shared(NodeVector{store}, input_params); + auto model = std::make_shared(NodeVector{store}, input_params); + + // Create dummy scheduler to pass graph comparison tests + // Note that if there is more than one results, they should be reverted + ResultVector results({model->get_results()[0]}); + const auto& inner_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); + std::vector apply_increments(input_params.size() + results.size(), true); + const auto& inner_loop_end = insertLoopEnd(results, inner_loop_begin, 1, 1, apply_increments); + auto outer_WA = std::accumulate(input_shapes.begin(), input_shapes.end(), 0, + [](int64_t max_val, const PartialShape& ps) { + return std::max(ps[ps.size() - 2].get_length(), max_val); + }); + if (outer_WA > 1) { + const auto& outer_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); + insertLoopEnd(results, outer_loop_begin, 1, 1, apply_increments); + } + return model; +} + +std::shared_ptr Transpose0213MatMulLoweredFunction::initLowered() const { + ParameterVector 
data{std::make_shared(precision, input_shapes[0]), + std::make_shared(precision, input_shapes[1])}; + std::vector layout{0, 2, 1, 3}; + // Note: validity of transpose_position values is checked in Transpose0213MatMulSinhFunction constructor + if (transpose_position <= 1) { + auto &rt_info = data[transpose_position]->get_rt_info(); + rt_info["Layout"] = layout; + } + auto matmul = std::make_shared(data[0], data[1]); + if (transpose_position == 2) { + auto &rt_info = matmul->get_rt_info(); + rt_info["Layout"] = layout; + matmul->validate_and_infer_types(); + } + return std::make_shared(NodeVector{matmul}, data); +} + +std::shared_ptr SoftmaxLoweredFunction::initLowered() const { + auto input_params = ngraph::builder::makeParams(precision, {input_shapes[0].get_shape()}); + + const auto data = input_params.front(); + + const auto master_shape = input_shapes[0].get_shape(); + const auto shape_rank = master_shape.size(); + const auto dimension = shape_rank - 1; + const auto work_amount = master_shape[dimension]; + const auto increment = 10; + const auto inner_dim = shape_rank - 1; + const auto inner_master_wa = static_cast(master_shape[inner_dim]); + const int outer_dim = shape_rank > 1 ? 
shape_rank - 2 : -1; + const auto has_outer_loop = outer_dim >= 0 && master_shape[outer_dim] > 1; + const bool is_scalar = work_amount == 1; + + /* ====== ReduceMax decomposition ====== */ + + const auto vector_buffer_max = std::make_shared(); + const auto loop_max_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{data, data}); + + // we don't insert Fill here after load_max to verify because in generate() call Fill op is inserted only on vector representation + const auto load_max = std::make_shared(loop_max_begin->output(0), increment); + const auto max = std::make_shared(load_max, vector_buffer_max); + + std::vector apply_increments_max(3, false); + std::vector finalization_offsets_max(3, 0); + apply_increments_max[0] = data->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + finalization_offsets_max[0] = data->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + const auto loop_max_end = std::make_shared(ngraph::OutputVector{loop_max_begin->output(1), loop_max_begin->output(2)}, + work_amount, increment, apply_increments_max, finalization_offsets_max); + + std::shared_ptr horizon_max = std::make_shared(max); + horizon_max->add_control_dependency(loop_max_end); + const auto prev_horizon_max = horizon_max; + if (!is_scalar) { + horizon_max = std::make_shared(horizon_max, horizon_max->get_input_partial_shape(0)); + } + + loop_max_begin->add_control_dependency(vector_buffer_max); + loop_max_end->add_control_dependency(max); + + /* =========================================== */ + + /* === Sub + Exp + ReduceSum decomposition === */ + + const auto vector_buffer_sum = std::make_shared(); + const auto loop_sum_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{loop_max_end->output(0)}); + + const auto load_sub = std::make_shared(loop_sum_begin->output(0), increment); + const auto sub = std::make_shared(load_sub, horizon_max); + // we don't insert Fill here after Exp to verify because in generate() call Fill op is inserted only on 
vector representation + const auto exp = std::make_shared(sub); + const auto sum = std::make_shared(exp, vector_buffer_sum); + const auto store_exp = std::make_shared(exp, increment); + + std::vector apply_increments_sum(2, false); + std::vector finalization_offsets_sum(2, 0); + apply_increments_sum[0] = load_sub->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + apply_increments_sum[1] = store_exp->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + finalization_offsets_sum[0] = has_outer_loop && load_sub->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + finalization_offsets_sum[1] = store_exp->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + const auto loop_sum_end = std::make_shared( + ngraph::OutputVector{store_exp, loop_sum_begin->output(1)}, work_amount, increment, + apply_increments_sum, finalization_offsets_sum); + loop_sum_end->add_control_dependency(sum); + + const auto horizon_sum = std::make_shared(sum); + horizon_sum->add_control_dependency(loop_sum_end); + + const auto buffer_exp = std::make_shared(loop_sum_end->output(0)); + + loop_sum_begin->add_control_dependency(vector_buffer_sum); + loop_sum_begin->add_control_dependency(horizon_max); + loop_sum_begin->add_control_dependency(prev_horizon_max); + + /* =========================================== */ + + /* ================== Div ==================== */ + + std::shared_ptr pow = std::make_shared(horizon_sum, -1); + const auto prev_pow = pow; + if (!is_scalar) { + pow = std::make_shared(pow, horizon_sum->get_input_partial_shape(0)); + } + + const auto loop_div_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{buffer_exp}); + + const auto load_div = std::make_shared(loop_div_begin->output(0), increment); + const auto mul = std::make_shared(load_div, pow); + const auto store_div = std::make_shared(mul, increment); + + std::vector apply_increments_div(2, false); + std::vector finalization_offsets_div(2, 0); + apply_increments_div[0] = load_div->get_shape()[inner_dim] != 
1 && inner_master_wa != 1; + apply_increments_div[1] = store_div->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + finalization_offsets_div[0] = has_outer_loop && load_div->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + finalization_offsets_div[1] = has_outer_loop && store_div->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + const auto loop_div_end = std::make_shared( + ngraph::OutputVector{store_div, loop_div_begin->output(1)}, work_amount, increment, + apply_increments_div, finalization_offsets_div); + loop_div_begin->add_control_dependency(horizon_sum); + loop_div_begin->add_control_dependency(pow); + loop_div_begin->add_control_dependency(prev_pow); + + /* =========================================== */ + + const auto result = std::make_shared(loop_div_end); + if (has_outer_loop) { + const auto need_increment = input_shapes[0].get_shape()[outer_dim] != 1 && input_shapes[0].get_shape()[inner_dim] == 1; + const auto& outer_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); + const auto outer_loop_end = insertLoopEnd(NodeVector{result}, outer_loop_begin, 1, 1, std::vector{need_increment, need_increment}); + vector_buffer_max->add_control_dependency(outer_loop_begin); + } + + return std::make_shared(ResultVector{result}, input_params); +} +std::shared_ptr AddSoftmaxLoweredFunction::initLowered() const { + auto input_params = ngraph::builder::makeParams(precision, {input_shapes[0].get_shape(), input_shapes[1].get_shape()}); + + auto master_pshape = input_shapes[0]; + ov::PartialShape::broadcast_merge_into(master_pshape, input_shapes[1], op::AutoBroadcastType::NUMPY); + const auto master_shape = master_pshape.get_shape(); + const auto shape_rank = master_shape.size(); + const auto dimension = shape_rank - 1; + const auto work_amount = master_shape[dimension]; + const auto increment = 10; + const auto inner_dim = shape_rank - 1; + const auto inner_master_wa = static_cast(master_shape[inner_dim]); + const int outer_dim = shape_rank > 1 
? shape_rank - 2 : -1; + const auto has_outer_loop = outer_dim >= 0 && master_shape[outer_dim] > 1; + const bool is_scalar = work_amount == 1; + + /* ================== Add + ReduceMax ==================== */ + + const auto vector_buffer_max = std::make_shared(); + const auto loop_max_begin = ngraph::snippets::op::insertLoopBegin(input_params); + + std::shared_ptr load0 = std::make_shared(loop_max_begin->output(0), increment); + if (!is_scalar && input_shapes[0].get_shape().back() == 1) { + auto new_shape = input_shapes[0].get_shape(); + new_shape[new_shape.size() - 1] = static_cast(inner_master_wa); + load0 = std::make_shared(loop_max_begin->output(0), new_shape); + } + std::shared_ptr load1 = std::make_shared(loop_max_begin->output(1), increment); + if (!is_scalar && input_shapes[1].get_shape().back() == 1) { + auto new_shape = input_shapes[1].get_shape(); + new_shape[new_shape.size() - 1] = static_cast(inner_master_wa); + load1 = std::make_shared(loop_max_begin->output(1), new_shape); + } + const auto add = std::make_shared(load0, load1); + const auto store = std::make_shared(add, increment); + + // we don't insert Fill here after load_max to verify because in generate() call Fill op is inserted only on vector representation + const auto max = std::make_shared(add, vector_buffer_max); + + std::vector apply_increments_max(3, false); + std::vector finalization_offsets_max(3, 0); + apply_increments_max[0] = input_shapes[0].get_shape()[inner_dim] != 1 && inner_master_wa != 1; + apply_increments_max[1] = input_shapes[1].get_shape()[inner_dim] != 1 && inner_master_wa != 1; + apply_increments_max[2] = master_shape[inner_dim] != 1 && inner_master_wa != 1; + finalization_offsets_max[0] = input_shapes[0].get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + finalization_offsets_max[1] = input_shapes[1].get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + finalization_offsets_max[2] = master_shape[inner_dim] != 1 ? 
-inner_master_wa : 0; + const auto loop_max_end = std::make_shared(ngraph::OutputVector{store, loop_max_begin->output(2)}, + work_amount, increment, apply_increments_max, finalization_offsets_max); + + std::shared_ptr horizon_max = std::make_shared(max); + horizon_max->add_control_dependency(loop_max_end); + const auto prev_horizon_max = horizon_max; + if (!is_scalar) { + horizon_max = std::make_shared(horizon_max, horizon_max->get_input_partial_shape(0)); + } + + loop_max_begin->add_control_dependency(vector_buffer_max); + loop_max_end->add_control_dependency(max); + + /* =========================================== */ + + const auto buffer_add = std::make_shared(loop_max_end->output(0)); + + /* === Sub + Exp + ReduceSum decomposition === */ + + const auto vector_buffer_sum = std::make_shared(); + const auto loop_sum_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{buffer_add->output(0)}); + + const auto load_sub = std::make_shared(loop_sum_begin->output(0), increment); + const auto sub = std::make_shared(load_sub, horizon_max); + // we don't insert Fill here after exp to verify because in generate() call Fill op is inserted only on vector representation + const auto exp = std::make_shared(sub); + const auto sum = std::make_shared(exp, vector_buffer_sum); + const auto store_exp = std::make_shared(exp, increment); + + std::vector apply_increments_sum(2, false); + std::vector finalization_offsets_sum(2, 0); + apply_increments_sum[0] = load_sub->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + apply_increments_sum[1] = store_exp->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + finalization_offsets_sum[0] = has_outer_loop && load_sub->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + finalization_offsets_sum[1] = store_exp->get_shape()[inner_dim] != 1 ? 
-inner_master_wa : 0; + const auto loop_sum_end = std::make_shared( + ngraph::OutputVector{store_exp, loop_sum_begin->output(1)}, work_amount, increment, + apply_increments_sum, finalization_offsets_sum); + loop_sum_end->add_control_dependency(sum); + + const auto horizon_sum = std::make_shared(sum); + horizon_sum->add_control_dependency(loop_sum_end); + + const auto buffer_exp = std::make_shared(loop_sum_end->output(0)); + + loop_sum_begin->add_control_dependency(vector_buffer_sum); + loop_sum_begin->add_control_dependency(horizon_max); + loop_sum_begin->add_control_dependency(prev_horizon_max); + + /* =========================================== */ + + /* ================== Div ==================== */ + + std::shared_ptr pow = std::make_shared(horizon_sum, -1); + const auto prev_pow = pow; + if (!is_scalar) { + pow = std::make_shared(pow, horizon_sum->get_input_partial_shape(0)); + } + + const auto loop_div_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{buffer_exp}); + + const auto load_div = std::make_shared(loop_div_begin->output(0), increment); + const auto mul = std::make_shared(load_div, pow); + const auto store_div = std::make_shared(mul, increment); + + std::vector apply_increments_div(2, false); + std::vector finalization_offsets_div(2, 0); + apply_increments_div[0] = load_div->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + apply_increments_div[1] = store_div->get_shape()[inner_dim] != 1 && inner_master_wa != 1; + finalization_offsets_div[0] = has_outer_loop && load_div->get_shape()[inner_dim] != 1 ? -inner_master_wa : 0; + finalization_offsets_div[1] = has_outer_loop && store_div->get_shape()[inner_dim] != 1 ? 
-inner_master_wa : 0; + const auto loop_div_end = std::make_shared( + ngraph::OutputVector{store_div, loop_div_begin->output(1)}, work_amount, increment, + apply_increments_div, finalization_offsets_div); + loop_div_begin->add_control_dependency(horizon_sum); + loop_div_begin->add_control_dependency(pow); + loop_div_begin->add_control_dependency(prev_pow); + + /* =========================================== */ + + const auto result = std::make_shared(loop_div_end); + if (has_outer_loop) { + const auto need_increment0 = input_shapes[0].get_shape()[outer_dim] != 1 && input_shapes[0].get_shape()[inner_dim] == 1; + const auto need_increment1 = input_shapes[1].get_shape()[outer_dim] != 1 && input_shapes[1].get_shape()[inner_dim] == 1; + const auto need_increment2 = master_shape[outer_dim] != 1 && master_shape[inner_dim] == 1; + const auto outer_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); + const auto outer_loop_end = insertLoopEnd( + NodeVector{result}, outer_loop_begin, 1, 1, std::vector{need_increment0, need_increment1, need_increment2}); + vector_buffer_max->add_control_dependency(outer_loop_begin); + } + + return std::make_shared(ResultVector{result}, input_params); +} +std::shared_ptr BroadcastAddLoweredFunction::initLowered() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + ov::NodeVector datas = {data0, data1}; + auto last_dim = std::max(input_shapes[0].get_shape().back(), std::max(input_shapes[1].get_shape().back(), m_target_shape.get_shape().back())); + ov::NodeVector loads(datas.size(), nullptr); + for (auto i = 0; i < datas.size(); i++) { + if (input_shapes[i].get_shape().back() != last_dim) { + auto new_shape = input_shapes[i]; + new_shape[new_shape.size() - 1] = last_dim; + loads[i] = std::make_shared(datas[i], new_shape); + } else { + loads[i] = std::make_shared(datas[i]); + } + } + auto add = std::make_shared(loads[0], loads[1]); + auto store = 
std::make_shared(add); + auto model = std::make_shared(NodeVector{store}, ParameterVector{data0, data1}); + + // Create dummy scheduler to pass graph comparison tests + // Note that if there is more than one results, they should be reverted + ResultVector results({model->get_results()[0]}); + const auto& inner_loop_begin = ngraph::snippets::op::insertLoopBegin(datas); + std::vector apply_increments(datas.size() + results.size(), true); + insertLoopEnd(results, inner_loop_begin, 1, 1, apply_increments); + auto outer_WA = std::accumulate(input_shapes.begin(), input_shapes.end(), 0, + [](int64_t max_val, const PartialShape& ps) { + return std::max(ps[ps.size() - 2].get_length(), max_val); + }); + if (outer_WA > 1) { + const auto& outer_loop_begin = ngraph::snippets::op::insertLoopBegin(datas); + insertLoopEnd(results, outer_loop_begin, 1, 1, apply_increments); + } + return model; } } // namespace snippets } // namespace test diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_matmul.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_matmul.cpp new file mode 100644 index 00000000000..af312a2ee2d --- /dev/null +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_matmul.cpp @@ -0,0 +1,92 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "subgraph_matmul.hpp" +#include "common_test_utils/data_utils.hpp" +#include + +namespace ov { +namespace test { +namespace snippets { +std::shared_ptr MatMulFunction::initOriginal() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto matmul = std::make_shared(data0, data1); + return std::make_shared(NodeVector{matmul}, ParameterVector{data0, data1}); +} +std::shared_ptr MatMulFunction::initReference() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto indata0 = 
std::make_shared(precision, data0->get_output_partial_shape(0)); + auto indata1 = std::make_shared(precision, data1->get_output_partial_shape(0)); + auto matmul = std::make_shared(NodeVector{data0, data1}, + std::make_shared(NodeVector{std::make_shared(indata0, indata1)}, + ParameterVector{indata0, indata1})); + return std::make_shared(NodeVector{matmul}, ParameterVector{data0, data1}); +} +std::shared_ptr MatMulBiasFunction::initOriginal() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto matmul = std::make_shared(data0, data1); + auto data2 = std::make_shared(precision, input_shapes[2]); + auto bias = std::make_shared(matmul, data2); + return std::make_shared(NodeVector{bias}, ParameterVector{data0, data1, data2}); +} +std::shared_ptr Transpose0213MatMulFunction::initOriginal() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto const_order = std::make_shared(ov::element::i32, Shape {4}, std::vector{0, 2, 1, 3}); + std::shared_ptr result; + switch (transpose_position) { + case 0: { + auto transpose = std::make_shared(data0, const_order); + result = std::make_shared(transpose, data1); + break; + } case 1: { + auto transpose = std::make_shared(data1, const_order); + result = std::make_shared(data0, transpose); + break; + } case 2: { + auto matmul = std::make_shared(data0, data1); + result = std::make_shared(matmul, const_order); + break; + } + } + return std::make_shared(NodeVector{result}, ParameterVector{data0, data1}); +} + +std::shared_ptr TransposeMatMulFunction::initOriginal() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto const_order = std::make_shared(ov::element::i32, Shape {4}, std::vector{0, 2, 3, 1}); + auto transpose = std::make_shared(data1, const_order); + auto matmul = std::make_shared(data0, 
transpose); + return std::make_shared(NodeVector{matmul}, ParameterVector{data0, data1}); +} +std::shared_ptr TransposeMatMulBiasFunction::initOriginal() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto data2 = std::make_shared(precision, input_shapes[2]); + auto const_order = std::make_shared(ov::element::i32, Shape {4}, std::vector{0, 2, 3, 1}); + auto transpose = std::make_shared(data1, const_order); + auto matmul = std::make_shared(data0, transpose); + auto bias = std::make_shared(matmul, data2); + return std::make_shared(NodeVector{bias}, ParameterVector{data0, data1, data2}); +} +std::shared_ptr TransposeMulMatMulBiasFunction::initOriginal() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto data2 = std::make_shared(precision, input_shapes[2]); + auto data3 = std::make_shared(precision, input_shapes[3]); + auto const_order = std::make_shared(ov::element::i32, Shape {4}, std::vector{0, 2, 3, 1}); + auto transpose = std::make_shared(data1, const_order); + auto mul = std::make_shared(transpose, data2); + auto matmul = std::make_shared(data0, mul); + auto bias = std::make_shared(matmul, data3); + return std::make_shared(NodeVector{bias}, ParameterVector{data0, data1, data2, data3}); +} + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_mha.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_mha.cpp new file mode 100644 index 00000000000..ac38ea47624 --- /dev/null +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_mha.cpp @@ -0,0 +1,348 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "subgraph_mha.hpp" + +#include "common_test_utils/data_utils.hpp" +#include +#include 
"ngraph_functions/builders.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::shared_ptr MHAFunction::initOriginal() const { + auto transpose0Param = std::make_shared(precision, input_shapes[0]); + auto transpose1Param = std::make_shared(precision, input_shapes[1]); + auto addParam = std::make_shared(precision, input_shapes[2]); + auto transpose2Param = std::make_shared(precision, input_shapes[3]); + ngraph::ParameterVector ngraphParam = {transpose0Param, transpose1Param, addParam, transpose2Param}; + + std::vector constantShapes; + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({1, input_shapes[1].get_shape()[2], 1, 1})); + constantShapes.push_back(ov::Shape({2})); + constantShapes.push_back(ov::Shape({4})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + + auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[0], std::vector{0, 2, 1, 3}); + auto transpose1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[1], std::vector{0, 2, 3, 1}); + auto transpose2Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[5], std::vector{0, 2, 1, 3}); + auto transpose3Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[6], std::vector{0, 2, 1, 3}); + + std::vector reshape0ConstData = {static_cast(input_shapes[0].get_shape()[0] * + input_shapes[0].get_shape()[1] * input_shapes[0].get_shape()[2]), + -1}; + auto reshape0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[3], reshape0ConstData); + + std::vector reshape1ConstData = {static_cast(input_shapes[0].get_shape()[0]), + static_cast(input_shapes[0].get_shape()[2]), + static_cast(input_shapes[0].get_shape()[1]), + 
static_cast(input_shapes[0].get_shape()[1])}; + auto reshape1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[4], reshape1ConstData); + + float transA = false; + float transB = false; + const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); + const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); + std::shared_ptr matmul_parent1 = transpose1; + if (with_mul) { + std::vector mulConstData(ngraph::shape_size(constantShapes[2])); + auto mulConst = ngraph::builder::makeConstant(precision, constantShapes[2], mulConstData, true); + matmul_parent1 = std::make_shared(transpose1, mulConst); + } + const auto matMul0 = std::make_shared(transpose0, matmul_parent1, transA, transB); + const auto add = std::make_shared(matMul0, addParam); + const auto reshape0 = std::make_shared(add, reshape0Const, true); + const auto softMax = std::make_shared(reshape0, 1); + const auto reshape1 = std::make_shared(softMax, reshape1Const, true); + const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); + const auto matMul1 = std::make_shared(reshape1, transpose2, transA, transB); + const auto transpose3 = std::make_shared(matMul1, transpose3Const); + + ngraph::ResultVector results{std::make_shared(transpose3)}; + return std::make_shared(results, ngraphParam, "mha"); +} +std::shared_ptr MHAFunction::initReference() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto data2 = std::make_shared(precision, input_shapes[2]); + auto data3 = std::make_shared(precision, input_shapes[3]); + ngraph::ParameterVector ngraphParams = {data0, data1, data2, data3}; + NodeVector subgraph_inputs = {data0, data1, data2, data3}; + + auto transpose0Param = std::make_shared(precision, input_shapes[0]); + auto transpose1Param = std::make_shared(precision, input_shapes[1]); + auto addParam = std::make_shared(precision, input_shapes[2]); + auto 
transpose2Param = std::make_shared(precision, input_shapes[3]); + + std::vector constantShapes; + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({1, input_shapes[1].get_shape()[2], 1, 1})); + constantShapes.push_back(ov::Shape({2})); + constantShapes.push_back(ov::Shape({4})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + + auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[0], std::vector{0, 2, 1, 3}); + auto transpose1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[1], std::vector{0, 2, 3, 1}); + auto transpose2Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[5], std::vector{0, 2, 1, 3}); + auto transpose3Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[6], std::vector{0, 2, 1, 3}); + + ngraph::ParameterVector subgraph_params = {transpose0Param, transpose1Param, addParam, transpose2Param}; + + std::vector reshape0ConstData = {static_cast(input_shapes[0].get_shape()[0] * + input_shapes[0].get_shape()[1] * input_shapes[0].get_shape()[2]), + -1}; + auto reshape0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[3], reshape0ConstData); + + std::vector reshape1ConstData = {static_cast(input_shapes[0].get_shape()[0]), + static_cast(input_shapes[0].get_shape()[2]), + static_cast(input_shapes[0].get_shape()[1]), + static_cast(input_shapes[0].get_shape()[1])}; + auto reshape1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[4], reshape1ConstData); + + float transA = false; + float transB = false; + const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); + const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); + 
std::shared_ptr matmul_parent1 = transpose1; + if (with_mul) { + std::vector mulConstData(ngraph::shape_size(constantShapes[2])); + auto mulConst = ngraph::builder::makeConstant(precision, constantShapes[2], mulConstData, true); + auto mulParam = std::make_shared(precision, mulConst->get_shape()); + matmul_parent1 = std::make_shared(transpose1, mulParam); + subgraph_params = {transpose0Param, transpose1Param, mulParam, addParam, transpose2Param}; + subgraph_inputs = {data0, data1, mulConst, data2, data3}; + } + const auto matMul0 = std::make_shared(transpose0, matmul_parent1, transA, transB); + const auto add = std::make_shared(matMul0, addParam); + const auto reshape0 = std::make_shared(add, reshape0Const, true); + const auto softMax = std::make_shared(reshape0, 1); + const auto reshape1 = std::make_shared(softMax, reshape1Const, true); + const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); + const auto matMul1 = std::make_shared(reshape1, transpose2, transA, transB); + const auto transpose3 = std::make_shared(matMul1, transpose3Const); + + auto subgraph = std::make_shared(subgraph_inputs, + std::make_shared(NodeVector{transpose3}, subgraph_params)); + + return std::make_shared(NodeVector{subgraph}, ngraphParams); +} + +std::shared_ptr MHAMatMul0TransposeFunction::initOriginal() const { + auto transpose0Param = std::make_shared(precision, input_shapes[0]); + auto transpose1Param = std::make_shared(precision, input_shapes[1]); + auto addParam = std::make_shared(precision, input_shapes[2]); + auto transpose2Param = std::make_shared(precision, input_shapes[3]); + ngraph::ParameterVector ngraphParam = {transpose0Param, transpose1Param, addParam, transpose2Param}; + + std::vector constantShapes; + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({1, input_shapes[1].get_shape()[2], 1, 1})); + 
constantShapes.push_back(ov::Shape({2})); + constantShapes.push_back(ov::Shape({4})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + + const auto order = std::vector{0, 2, 1, 3}; + auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[0], order); + auto transpose1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[1], order); + auto transpose2Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[5], order); + auto transpose3Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[6], order); + + std::vector mulConstData(1); + auto mulConst = ngraph::builder::makeConstant(precision, ov::Shape{1}, mulConstData, true); + + std::vector reshape0ConstData = {static_cast(input_shapes[0].get_shape()[0] * + input_shapes[0].get_shape()[1] * input_shapes[0].get_shape()[2]), + -1}; + auto reshape0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[3], reshape0ConstData); + + std::vector reshape1ConstData = {static_cast(input_shapes[0].get_shape()[0]), + static_cast(input_shapes[0].get_shape()[2]), + static_cast(input_shapes[0].get_shape()[1]), + static_cast(input_shapes[0].get_shape()[1])}; + auto reshape1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[4], reshape1ConstData); + + float transA = false; + float transB = false; + const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); + const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); + const auto mul = std::make_shared(transpose1, mulConst); + const auto matMul0 = std::make_shared(transpose0, mul, transA, true); + const auto add = std::make_shared(matMul0, addParam); + const auto reshape0 = std::make_shared(add, reshape0Const, true); + const auto softMax = std::make_shared(reshape0, 1); + const auto reshape1 = 
std::make_shared(softMax, reshape1Const, true); + const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); + const auto matMul1 = std::make_shared(reshape1, transpose2, transA, transB); + const auto transpose3 = std::make_shared(matMul1, transpose3Const); + + ngraph::ResultVector results{std::make_shared(transpose3)}; + return std::make_shared(results, ngraphParam, "mha"); +} +std::shared_ptr MHAMatMul0TransposeFunction::initReference() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto data2 = std::make_shared(precision, input_shapes[2]); + auto data3 = std::make_shared(precision, input_shapes[3]); + ngraph::ParameterVector ngraphParams = {data0, data1, data2, data3}; + + auto transpose0Param = std::make_shared(precision, input_shapes[0]); + auto transpose1Param = std::make_shared(precision, input_shapes[1]); + auto addParam = std::make_shared(precision, input_shapes[2]); + auto transpose2Param = std::make_shared(precision, input_shapes[3]); + + std::vector constantShapes; + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({1, input_shapes[1].get_shape()[2], 1, 1})); + constantShapes.push_back(ov::Shape({2})); + constantShapes.push_back(ov::Shape({4})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + + auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[0], std::vector{0, 2, 1, 3}); + auto transpose1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[1], std::vector{0, 2, 3, 1}); + auto transpose2Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[5], std::vector{0, 2, 1, 3}); + auto transpose3Const = 
ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[6], std::vector{0, 2, 1, 3}); + + std::vector mulConstData(1); + auto mulConst = ngraph::builder::makeConstant(precision, ov::Shape{1}, mulConstData, true); + ngraph::ParameterVector subgraphParams = {transpose0Param, transpose1Param, addParam, transpose2Param}; + + std::vector reshape0ConstData = {static_cast(input_shapes[0].get_shape()[0] * + input_shapes[0].get_shape()[1] * input_shapes[0].get_shape()[2]), + -1}; + auto reshape0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[3], reshape0ConstData); + + std::vector reshape1ConstData = {static_cast(input_shapes[0].get_shape()[0]), + static_cast(input_shapes[0].get_shape()[2]), + static_cast(input_shapes[0].get_shape()[1]), + static_cast(input_shapes[0].get_shape()[1])}; + auto reshape1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[4], reshape1ConstData); + + float transA = false; + float transB = false; + const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); + const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); + const auto mul = std::make_shared(transpose1, mulConst); + const auto matMul0 = std::make_shared(transpose0, mul, transA, transB); + const auto add = std::make_shared(matMul0, addParam); + const auto reshape0 = std::make_shared(add, reshape0Const, true); + const auto softMax = std::make_shared(reshape0, 1); + const auto reshape1 = std::make_shared(softMax, reshape1Const, true); + const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); + const auto matMul1 = std::make_shared(reshape1, transpose2, transA, transB); + const auto transpose3 = std::make_shared(matMul1, transpose3Const); + + auto subgraph = std::make_shared( + NodeVector{data0, data1, data2, data3}, + std::make_shared(NodeVector{transpose3}, subgraphParams)); + + return std::make_shared(NodeVector{subgraph}, ngraphParams); +} + +std::shared_ptr 
MHASelectFunction::initOriginal() const { + auto transpose0Param = std::make_shared(precision, input_shapes[0]); + auto transpose1Param = std::make_shared(precision, input_shapes[1]); + auto addParam = std::make_shared(precision, input_shapes[2]); + auto less0Param = std::make_shared(precision, input_shapes[3]); + auto less1Param = std::make_shared(precision, input_shapes[4]); + auto transpose2Param = std::make_shared(precision, input_shapes[5]); + ngraph::ParameterVector ngraphParam = {transpose0Param, transpose1Param, addParam, less0Param, less1Param, transpose2Param}; + + std::vector constantShapes; + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({1, input_shapes[1].get_shape()[2], 1, 1})); + constantShapes.push_back(ov::Shape({2})); + constantShapes.push_back(ov::Shape({4})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + constantShapes.push_back(ov::Shape({input_shapes[0].get_shape().size()})); + + auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[0], + std::vector{0, 2, 1, 3}); + auto transpose1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[1], + std::vector{0, 2, 3, 1}); + auto transpose2Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[5], + std::vector{0, 2, 1, 3}); + auto transpose3Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[6], + std::vector{0, 2, 1, 3}); + + std::vector reshape0ConstData = {static_cast(input_shapes[0].get_shape()[0] * + input_shapes[0].get_shape()[1] * + input_shapes[0].get_shape()[2]), + -1}; + auto reshape0Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[3], reshape0ConstData); + + std::vector reshape1ConstData = {static_cast(input_shapes[0].get_shape()[0]), + static_cast(input_shapes[0].get_shape()[2]), + 
static_cast(input_shapes[0].get_shape()[1]), + static_cast(input_shapes[0].get_shape()[1])}; + auto reshape1Const = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[4], reshape1ConstData); + // Value is equal to '1' - to avoid situation e^(-1000) / (sum(e^(-1000)) = 0/0 = NAN + auto selectConst = ngraph::builder::makeConstant(precision, ov::Shape{1}, std::vector{1}); + + float transA = false; + float transB = false; + const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); + const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); + const auto matMul0 = std::make_shared(transpose0, transpose1, transA, transB); + const auto add = std::make_shared(matMul0, addParam); + const auto less = std::make_shared(less0Param, less1Param); + std::shared_ptr selectCond = less; + if (add->get_output_partial_shape(0) != input_shapes[3]) { + const auto broadcast_shape = ngraph::builder::makeConstant(ngraph::element::i64, constantShapes[5], + add->get_output_shape(0)); + const auto broadcast = ngraph::builder::makeBroadcast(selectCond, broadcast_shape, + ngraph::op::BroadcastType::NUMPY); + selectCond = broadcast; + } + const auto select = std::make_shared(selectCond, selectConst, add, + ngraph::op::AutoBroadcastType::NUMPY); + const auto reshape0 = std::make_shared(select, reshape0Const, true); + const auto softMax = std::make_shared(reshape0, 1); + const auto reshape1 = std::make_shared(softMax, reshape1Const, true); + const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); + const auto matMul1 = std::make_shared(reshape1, transpose2, transA, transB); + const auto transpose3 = std::make_shared(matMul1, transpose3Const); + + // to generate valid values + less0Param->set_friendly_name("less0"); + less0Param->set_friendly_name("less1"); + + ngraph::ResultVector results{std::make_shared(transpose3)}; + return std::make_shared(results, ngraphParam, "mha"); +} + +std::shared_ptr 
MHAWOTransposeOnInputsFunction::initOriginal() const { + auto param0 = std::make_shared(precision, input_shapes[0]); + auto param1 = std::make_shared(precision, input_shapes[1]); + auto param2 = std::make_shared(precision, input_shapes[2]); + ngraph::ParameterVector ngraphParam = {param0, param1, param2}; + + auto transpose3Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape({4}), std::vector{0, 2, 1, 3}); + + float transA = false; + float transB = false; + const auto mulConst = ngraph::builder::makeConstant(precision, ov::Shape({1}), std::vector{1}, true); + const auto mul = std::make_shared(param1, mulConst); + const auto matMul0 = std::make_shared(param0, mul, transA, transB); + const auto softmax = std::make_shared(matMul0, 3); + const auto matMul1 = std::make_shared(softmax, param2, transA, transB); + const auto transpose3 = std::make_shared(matMul1, transpose3Const); + + ngraph::ResultVector results{std::make_shared(transpose3)}; + return std::make_shared(results, ngraphParam, "mha"); +} + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp index a8e7aa6aa76..6ad6a087e2e 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp @@ -28,15 +28,16 @@ std::shared_ptr AddFunction::initReference() const { } std::shared_ptr AddConstFunction::initOriginal() const { auto data0 = std::make_shared(precision, input_shapes[0]); - const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shapes[0]), -10., 10.); - auto const_data1 = std::make_shared(precision, input_shapes[0], const_values); + const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shapes[0].get_shape()), -10., 10.); + auto const_data1 = 
std::make_shared(precision, input_shapes[0].get_shape(), const_values); auto add = std::make_shared(data0, const_data1); return std::make_shared(NodeVector{add}, ParameterVector{data0}); } std::shared_ptr AddRollConstFunction::initOriginal() const { - auto data0 = std::make_shared(precision, input_shapes[0]); - const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shapes[0]), -10., 10.); - auto const_data1 = std::make_shared(precision, input_shapes[0], const_values); + const auto input_shape = input_shapes[0].get_shape(); + auto data0 = std::make_shared(precision, input_shape); + const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shape), -10., 10.); + auto const_data1 = std::make_shared(precision, input_shape, const_values); auto shift = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{1}); auto axes = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{0}); auto roll0 = std::make_shared(data0, shift, axes); @@ -49,7 +50,7 @@ std::shared_ptr AddRollConstFunction::initOriginal() const { std::shared_ptr EltwiseFunction::initOriginal() const { auto data0 = std::make_shared(precision, input_shapes[0]); auto data1 = std::make_shared(precision, input_shapes[1]); - const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shapes[1]), -10., 10.); + const std::vector const_values = CommonTestUtils::generate_float_numbers(1, -10., 10.); auto const_data = std::make_shared(precision, data1->get_shape(), const_values); auto add = std::make_shared(data0, data1); auto sub = std::make_shared(add, const_data); @@ -59,7 +60,7 @@ std::shared_ptr EltwiseFunction::initOriginal() const { std::shared_ptr EltwiseFunction::initReference() const { auto data0 = std::make_shared(precision, input_shapes[0]); auto data1 = std::make_shared(precision, input_shapes[1]); - const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shapes[1]), 
-10., 10.); + const std::vector const_values = CommonTestUtils::generate_float_numbers(1, -10., 10.); auto const_data = std::make_shared(precision, data1->get_shape(), const_values); auto indata0 = std::make_shared(precision, data0->get_shape()); auto indata1 = std::make_shared(precision, data1->get_shape()); @@ -108,7 +109,9 @@ std::shared_ptr EltwiseMaxNumParamsFunction::initOriginal() const { std::shared_ptr MatMulEltwiseBranchesFunction::initOriginal() const { auto data_1 = std::make_shared(precision, input_shapes[0]); auto data_2 = std::make_shared(precision, input_shapes[1]); - auto non_snippet_op = std::make_shared(data_1, data_2); + auto sinh_1 = std::make_shared(data_1); + auto sinh_2 = std::make_shared(data_2); + auto non_snippet_op = std::make_shared(sinh_1, sinh_2); const std::vector const_values = CommonTestUtils::generate_float_numbers(4, -10., 10.); auto mul_const_1 = op::v0::Constant::create(precision, {1}, {const_values[0]}); auto mul_1 = std::make_shared(non_snippet_op, mul_const_1); @@ -131,17 +134,19 @@ std::shared_ptr MatMulEltwiseBranchesFunction::initOriginal() const { std::shared_ptr MatMulEltwiseBranchesFunction::initReference() const { auto data_1 = std::make_shared(precision, input_shapes[0]); auto data_2 = std::make_shared(precision, input_shapes[1]); + auto sinh_1 = std::make_shared(data_1); + auto sinh_2 = std::make_shared(data_2); const std::vector const_values = CommonTestUtils::generate_float_numbers(4, -10., 10.); // snippet inputs - auto non_snippet_op = std::make_shared(data_1, data_2); + auto non_snippet_op = std::make_shared(sinh_1, sinh_2); auto mul_const_1 = std::make_shared(precision, Shape{1}, const_values[0]); auto add_const_1 = std::make_shared(precision, Shape{1}, const_values[1]); auto mul_const_2 = std::make_shared(precision, Shape{1}, const_values[2]); auto sub_const_2 = std::make_shared(precision, Shape{1}, const_values[3]); // snippet function - Shape matMulOutShape = input_shapes[0]; - matMulOutShape.back() = 
input_shapes[1].back(); + Shape matMulOutShape = input_shapes[0].get_shape(); + matMulOutShape.back() = input_shapes[1].get_shape().back(); auto snippet_input = std::make_shared(precision, matMulOutShape); auto mul_1 = std::make_shared(snippet_input, mul_const_1); @@ -272,6 +277,37 @@ std::shared_ptr TwoInputsAndOutputsFunction::initOriginal() const { return std::make_shared(NodeVector{hswish, sin3}, ParameterVector{data0, data1}); } +std::shared_ptr SelectFunction::initOriginal() const { + auto data0 = std::make_shared(ov::element::boolean, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto data2 = std::make_shared(precision, input_shapes[2]); + auto select = std::make_shared(data0, data1, data2); + + return std::make_shared(NodeVector{select}, ParameterVector{data0, data1, data2}); +} + +std::shared_ptr BroadcastAddFunction::initOriginal() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto target_shape = std::make_shared(ov::element::i32, ov::Shape{m_target_shape.size()}, m_target_shape.get_shape()); + auto broadcast = std::make_shared(data0, target_shape); + auto add = std::make_shared(broadcast, data1); + + return std::make_shared(NodeVector{add}, ParameterVector{data0, data1}); +} + + +std::shared_ptr BroadcastSelectFunction::initOriginal() const { + auto data0 = std::make_shared(ov::element::boolean, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto data2 = std::make_shared(precision, input_shapes[2]); + auto target_shape = std::make_shared(ov::element::i32, ov::Shape{m_target_shape.size()}, m_target_shape.get_shape()); + auto broadcast = std::make_shared(data0, target_shape); + auto select = std::make_shared(broadcast, data1, data2); + + return std::make_shared(NodeVector{select}, ParameterVector{data0, data1, data2}); +} + } // namespace snippets } // namespace test } // namespace ov diff --git 
a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_softmax.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_softmax.cpp new file mode 100644 index 00000000000..fb692734ebd --- /dev/null +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_softmax.cpp @@ -0,0 +1,52 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "subgraph_softmax.hpp" +#include "common_test_utils/data_utils.hpp" +#include +#include "ngraph_functions/builders.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::shared_ptr SoftmaxFunction::initOriginal() const { + auto data = std::make_shared(precision, input_shapes[0]); + auto softmax = std::make_shared(data, axis); + return std::make_shared(NodeVector{softmax}, ParameterVector{data}); +} + +std::shared_ptr AddSoftmaxFunction::initOriginal() const { + auto data0 = std::make_shared(precision, input_shapes[0]); + auto data1 = std::make_shared(precision, input_shapes[1]); + auto add = std::make_shared(data0, data1); + auto softmax = std::make_shared(add, axis); + return std::make_shared(NodeVector{softmax}, ParameterVector{data0, data1}); +} + +std::shared_ptr TransposeSoftmaxFunction::initOriginal() const { + const auto transpose0Param = std::make_shared(precision, input_shapes[0]); + const auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{m_order.size()}, m_order); + const auto transpose2 = std::make_shared(transpose0Param, transpose0Const); + const auto softMax = std::make_shared(transpose2, m_axis); + return std::make_shared(ov::NodeVector{softMax}, ov::ParameterVector {transpose0Param}, "softmax_transpose"); +} + +std::shared_ptr TransposeSoftmaxEltwiseFunction::initOriginal() const { + const auto transpose0Param = std::make_shared(precision, input_shapes[0]); + const auto transpose0Const = ngraph::builder::makeConstant(ngraph::element::i64, ov::Shape{m_order.size()}, + m_order); + const auto 
transpose2 = std::make_shared(transpose0Param, transpose0Const); + const auto mulConst = ngraph::builder::makeConstant(ngraph::element::f32, transpose2->get_shape(), + std::vector{}, true); + const auto mul = std::make_shared(transpose2, mulConst); + const auto softMax = std::make_shared(mul, m_axis); + const auto hswish = std::make_shared(softMax); + return std::make_shared(ov::NodeVector{hswish}, ov::ParameterVector{transpose0Param}, + "softmax_transpose"); +} + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_transpose.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_transpose.cpp new file mode 100644 index 00000000000..dcfb04a74d9 --- /dev/null +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_transpose.cpp @@ -0,0 +1,32 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "subgraph_transpose.hpp" +#include "common_test_utils/data_utils.hpp" +#include + +namespace ov { +namespace test { +namespace snippets { +std::shared_ptr TransposeFunction::initOriginal() const { + auto data = std::make_shared(precision, input_shapes[0]); + auto const_order = std::make_shared(ov::element::i32, Shape {order.size()}, order); + auto transpose = std::make_shared(data, const_order); + return std::make_shared(NodeVector{transpose}, ParameterVector{data}); +} +std::shared_ptr TransposeFunction::initReference() const { + auto data = std::make_shared(precision, input_shapes[0]); + auto const_order = std::make_shared(ov::element::i32, Shape {order.size()}, order); + auto indata0 = std::make_shared(precision, data->get_output_partial_shape(0)); + auto indata1 = std::make_shared(const_order->get_output_element_type(0), + const_order->get_output_partial_shape(0)); + auto transpose = std::make_shared(NodeVector{data, const_order}, + 
std::make_shared(NodeVector{std::make_shared(indata0, indata1)}, + ParameterVector{indata0, indata1})); + return std::make_shared(NodeVector{transpose}, ParameterVector{data}); +} + +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file