[Snippets] Refactored work with Buffers (#19644)

[Snippets] BufferManager is not derived from PassPipeline now

[Snippets] Added MemorySolver support

[Snippets] Made it a static class

[Snippets] Added one-level inplace support

[Snippets] Added optimization bits

[Snippets] Small cosmetic fixes

[Snippets] Renamed to BufferSolver

[Snippets] Refactored

[Snippets] Fixed IdentifyBuffers

[Snippets] Add inplace multi + identify buffers

[Snippets] Made common pass

[Snippets] Added PassPipeline::get_pass<>()

[Snippets] Added comments and briefs, minor refactoring

[Snippets] Fixed win build

[Snippets] Disallowed the same Buffer ID for multi-level Buffers

[Snippets] Moved CleanupRepeatedPtrShifts to common pipeline

[Snippets] Made IdentifyBuffers::ShiftPtrParams

[Snippets] Fixed window sliding mode

[Snippets] Refactored nested clusters

[Snippets] Added normalized buffer regs

[Snippets] Disallowed the same ID for nested Buffers in IdentifyBuffers

[Snippets] Fixed DefineBufferClusters::are_buffer_neighbours::find

[Snippets] Removed useless method from InitLoops

[Snippets] Fixed CC build

[Snippets] Applied Ivan comments

[Snippets] Applied Ivan comment: refactored pass classes

[Snippets] Applied Vladislav comments

[Snippets] Applied Ivan comments 2

[Runtime] Moved MemorySolver to API2.0

[Snippets] Created common buffer allocation pass AllocateBuffers

[Snippets][Tests] Added InplaceEltwise unit test

[Snippets] Fixed NormalizeBufferIDs

[Snippets][CPU] Fixed BrgemmBlocking lowered pass: move wsp for AMX to brgemm

[Snippets][CPU][Tests] Covered AMX MHA buffer allocation by unit tests
Alexandra Sidorova 2023-11-30 17:46:35 +04:00 committed by GitHub
parent 6ab5ef72d5
commit df03b0437a
29 changed files with 1688 additions and 266 deletions

View File

@@ -29,6 +29,9 @@ public:
// Minimal advised work amount that should be processed during one call of the executable produced by Subgraph::generate
// Set by a backend, should be large enough to compensate for the kernel call overheads
size_t m_min_kernel_work_amount = 256;
// True if the Buffer scratchpad size of LinearIR will be optimized (all possible optimizations will be activated)
// False if all Buffers will have unique IDs and offsets in the Linear IR
bool m_are_buffers_optimized = true;
};
/* The control flow of Snippets is built on Linear Intermediate Representation (Linear IR).

View File

@@ -5,7 +5,6 @@
#pragma once
#include "pass.hpp"
#include "snippets/snippets_isa.hpp"
namespace ov {
namespace snippets {
@@ -14,26 +13,40 @@ namespace pass {
/**
* @interface AllocateBuffers
* @brief The pass calculates common size of buffer scratchpad and propagates Buffer offsets to connected MemoryAccess operations.
* Notes:
* - The pass implicitly regulates InPlace processing for some Buffers when it's possible.
* The pass doesn't allocate new memory for InPlace Buffers; the same offsets are propagated for them.
* - The pass should be split into two passes: ProcessInplace (markup of Buffers which can use the same memory)
* and AllocateBuffer (allocate memory for Buffers using MemorySolver which can optimally reuse memory).
* @brief The pass allocates common memory for all Buffers.
* There are two modes: default and optimized allocation. Default allocation (non-optimized) mode sets unique offsets and IDs to Buffers.
* Optimized mode allocates memory for Buffer ops using the following optimizations:
* - MemorySolver: helps to solve issue of optimal memory allocation;
* - InPlace: Loop or MemoryAccess ops read from the memory and store data to the same memory if possible
* - Reusing Buffer IDs: Buffers have the same IDs (gpr) in cases when Buffers aren't connected or have the same data ptr shifts
* Note: All buffers are related to each other and represent common buffer scratchpad of Subgraph.
* The buffer scratchpad has one general data pointer. Each buffer has offset relative to the data pointer of buffer scratchpad.
* @ingroup snippets
*/
class AllocateBuffers : public Pass {
class AllocateBuffers: public Pass {
public:
OPENVINO_RTTI("AllocateBuffers", "Pass")
bool run(lowered::LinearIR& linear_ir) override;
AllocateBuffers(size_t& buffer_scratchpad_size, bool is_optimized = true);
size_t get_scratchpad_size() const { return m_buffer_scratchpad_size; }
/**
* @brief Apply the pass to the Linear IR
* @param linear_ir the target Linear IR
* @return status of the pass
*/
bool run(LinearIR& linear_ir) override;
/**
* @brief Sets the offset to the Buffer op and propagates it to the connected memory access ops
* @param buffer_expr expression with Buffer op
* @param offset offset in common buffer scratchpad
*/
static void set_buffer_offset(const ExpressionPtr& buffer_expr, const size_t offset);
using BufferCluster = std::set<ExpressionPtr>;
using BufferClusters = std::vector<BufferCluster>;
private:
static void propagate_offset(const LinearIR& linear_ir, const ExpressionPtr& buffer_expr, size_t offset);
size_t m_buffer_scratchpad_size = 0;
size_t& m_buffer_scratchpad_size;
bool m_is_optimized_mode = true;
};
} // namespace pass
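
For illustration, a minimal usage sketch of the new interface (hedged: `linear_ir` and the surrounding glue code are hypothetical, not part of this diff):

    // Run buffer allocation in optimized mode and read back the scratchpad size
    size_t buffer_scratchpad_size = 0;
    ov::snippets::lowered::pass::AllocateBuffers allocate_buffers(buffer_scratchpad_size, /*is_optimized=*/true);
    allocate_buffers.run(linear_ir);
    // buffer_scratchpad_size now holds the common scratchpad size in bytes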

View File

@@ -0,0 +1,138 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "pass.hpp"
#include "allocate_buffers.hpp"
namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
/**
* @interface DefineBufferClusters
* @brief The pass defines buffer clusters. The buffers from one cluster share the
* same memory (i.e. have the same offset relative to the data pointer of the buffer scratchpad).
* - If a MemoryAccess op or Loop can read and write to the same memory (inplace behavior), the Buffers should be in one cluster.
* - If a Buffer is in a Loop which reads or writes from/to other Buffers, this Buffer can emulate `window` sliding.
* It means that the Buffer inside can reuse memory of the Buffers outside within the bounds of the full Loop work.
* Demonstration:
* |-----------------------------------------------------|
* | |------------| |------------| | InnerLoops have work amount 128
* Buffer0 [3x128]-> | | InnerLoop0 | -> Buffer1 [3x128] -> | InnerLoop1 | | -> Buffer2 [3x128] OuterLoop has work amount 3
* | |------------| OuterLoop |------------| |
* |-----------------------------------------------------|
* Buffer1 can reuse memory [128] of Buffer0 or Buffer2 in each iteration of OuterLoop
* Note: The pass requires expression enumeration and buffer identification (for nested Buffers inplace).
* These passes should be executed separately before this pass!
* @ingroup snippets
*/
class DefineBufferClusters : public Pass {
public:
OPENVINO_RTTI("DefineBufferClusters", "Pass")
DefineBufferClusters(AllocateBuffers::BufferClusters& clusters) : m_clusters(clusters) {}
/**
* @brief Apply the pass to the Linear IR
* @param linear_ir the target Linear IR
* @return status of the pass
*/
bool run(lowered::LinearIR& linear_ir) override;
private:
using BufferPorts = std::unordered_map<ExpressionPtr, std::set<size_t>>;
/**
* @brief Finds Buffer cluster in set of clusters which contains the target expression with Buffer
* @param target target expression with Buffer op
* @return vector iterator which refers to the found cluster
*/
AllocateBuffers::BufferClusters::iterator find_cluster_by_expr(const ExpressionPtr& target);
/**
* @brief Returns True if the Buffer is a direct source for the target expr (there are no other Loops between the Buffer and the target expr)
* @param buffer_expr expression with assumed Buffer op
* @param target_expr expression with target op - LoopEnd or MemoryAccess op
* @return boolean value
*/
bool is_direct_buffer(const ExpressionPtr& buffer_expr, const ExpressionPtr& target_expr) const;
/**
* @brief Creates a new buffer cluster if buffer_expr is missing in clusters. If buffer_expr is already in clusters, does nothing
* @param buffer_expr expression with Buffer op
*/
void create_new_cluster(const ExpressionPtr& buffer_expr);
/**
* @brief Returns the common ID of the cluster if all buffers inside have the same Buffer ID. Otherwise returns the default value SIZE_MAX,
* which means that the Buffers in the cluster have different IDs.
* @param cluster set of Buffer expressions - cluster
* @return common buffer ID or SIZE_MAX - size value
*/
size_t get_cluster_buffer_id(const AllocateBuffers::BufferCluster& cluster) const;
/**
* @brief Analyzes Loop: if Loop has Buffer ops on inputs and outputs, Loop can read and write from/to the same memory.
* @param expr_it iterator of Linear IR which refers to the expression with LoopEnd
*/
void parse_loop(const LinearIR::constExprIt& expr_it);
/**
* @brief Analyzes full MemoryAccess op: if the op has Buffer ops on I/O, the op can read and write from/to the same memory.
* @param expr expression with full MemoryAccess op
*/
void parse_memory_access_op(const ExpressionPtr& expr);
/**
* @brief Gets input buffers of the Loop
* @param loop_expr expression with LoopEnd op
* @return unordered map [Expression -> set of input ports] which represents input Buffers of Loop
*/
BufferPorts get_input_buffers(const ExpressionPtr& loop_expr) const;
/**
* @brief Gets output buffers of Loop
* @param loop_expr expression with LoopEnd op
* @return unordered map [Expression -> set of input ports] which represents output Buffers of Loop
*/
BufferPorts get_output_buffers(const ExpressionPtr& loop_expr) const;
/**
* @brief Analyzes nested Loops: unite nested buffer clusters if they can reproduce `window` sliding
* @param input_buffers unordered map [Expression -> set of input ports] which represents input Buffers of Loop
* @param output_buffers unordered map [Expression -> set of output ports (one)] which represents output Buffers of Loop
* @param outer_loop_end_expr_it iterator of Linear IR which refers to the expression with outer LoopEnd
*/
void parse_nested_loops(const BufferPorts& input_buffers, const BufferPorts& output_buffers, const LinearIR::constExprIt& outer_loop_end_expr_it);
/**
* @brief Finds the last connected Loop to the target Buffer and returns the corresponding finalization offset
* @param buffer_expr expression with Buffer op
* @return finalization offset - int64_t value
*/
int64_t get_buffer_finalization_offset(const ExpressionPtr& buffer_expr) const;
/**
* @brief Check if two Buffer expressions are connected to the same Loop. Set common LoopEnd as `loop` parameter and
* indexes of Loop ports `up_idx` and `down_idx` if Buffers are really neighbours
* @param up expression with upper Buffer op
* @param down expression with lower Buffer op
* @param loop expression with common LoopEnd op
* @param up_idx the reference to port index of upper Buffer op to the Loop
* @param down_idx the reference to port index of lower Buffer op to the Loop
* @return Return True if the Buffers are connected to the same Loop
*/
static bool are_buffer_neighbours(const ExpressionPtr& up, const ExpressionPtr& down, ExpressionPtr& loop, size_t& up_idx, size_t& down_idx);
/**
* @brief Unite clusters
* @param inner_cluster_it iterator to the inner cluster - the buffer cluster inside the Loop
* @param outer_cluster buffer cluster with buffers outside the Loop
* @param outer_buffer target Buffer from outer_cluster
* @param is_outer_up true if outer buffer is upper in Linear IR than inner Buffers
* @return Return True if clusters have been united
*/
bool unite_nested_clusters(const AllocateBuffers::BufferClusters::iterator& inner_cluster_it, AllocateBuffers::BufferCluster& outer_cluster,
const ExpressionPtr& outer_buffer, bool is_outer_up);
AllocateBuffers::BufferClusters& m_clusters;
};
} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
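
Since the note above requires EnumerateExpressions and IdentifyBuffers to run first, here is a hedged ordering sketch (assuming a LinearIR `linear_ir` and the ov::snippets::lowered::pass namespace; it mirrors the pipeline registered in AllocateBuffers::run):

    AllocateBuffers::BufferClusters clusters;
    PassPipeline pipeline;
    pipeline.register_pass<EnumerateExpressions>();          // assigns execution order to expressions
    pipeline.register_pass<IdentifyBuffers>();               // assigns Buffer IDs (needed for nested-Buffer inplace)
    pipeline.register_pass<DefineBufferClusters>(clusters);  // groups Buffers that may share one offset
    pipeline.run(linear_ir);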

View File

@@ -0,0 +1,28 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "pass.hpp"
namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
/**
* @interface EnumerateExpressions
* @brief The pass enumerates expressions by execution order
* @ingroup snippets
*/
class EnumerateExpressions : public Pass {
public:
OPENVINO_RTTI("EnumerateExpressions", "Pass")
bool run(LinearIR& linear_ir) override;
};
} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov

View File

@@ -6,8 +6,6 @@
#include "pass.hpp"
#include "snippets/op/buffer.hpp"
namespace ov {
namespace snippets {
namespace lowered {
@@ -22,7 +20,8 @@ namespace pass {
- Loops, Brgemm (and other similar ops) - are "edges" between Buffers (hubs of edges).
The buffers connected to the same Loop are adjacent in the graph sense.
* - The vertices (buffers) are adjacent if they are connected to the same Loop and
* their data pointers cannot be proportionally incremented in Loops: different ptr increments or data sizes;
their data pointers cannot be proportionally incremented in Loops: different ptr increments or data sizes -
or one of the Buffers is inside a Loop but the other Buffer is not;
* - Firstly, create adjacency matrix using the definition above;
* - Secondly, assign the same color to non-adjacent vertices of graph (buffers), and use different colors otherwise.
* Note: should be called before ResetBuffer() pass to have correct offsets
@@ -33,13 +32,79 @@ public:
OPENVINO_RTTI("IdentifyBuffers", "Pass")
IdentifyBuffers() = default;
/**
* @brief Apply the pass to the Linear IR
* @param linear_ir the target Linear IR
* @return status of the pass
*/
bool run(LinearIR& linear_ir) override;
private:
using BufferSet = std::vector<std::shared_ptr<op::Buffer>>;
struct ShiftPtrParams {
ShiftPtrParams() = default;
ShiftPtrParams(int64_t ds, int64_t pi, int64_t fo) : data_size(ds), ptr_increment(pi), finalization_offset(fo) {}
int64_t data_size = 0;
int64_t ptr_increment = 0;
int64_t finalization_offset = 0;
std::vector<bool> create_adjacency_matrix(const LinearIR& linear_ir, const BufferSet& buffers) const;
std::map<size_t, BufferSet> coloring(BufferSet& buffers, std::vector<bool>& adj);
friend bool operator==(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs);
friend bool operator!=(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs);
};
/**
* @brief Check if two Buffers can reuse ID by ShiftPtrParams < data_size, ptr_increment, finalization_offset >
* @param lhs Data pointer shift params for first Buffer
* @param rhs Data pointer shift params for second Buffer
* @return Returns True if params are valid for reusing. Otherwise returns False
*/
static bool can_reuse_id(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs);
private:
using BufferPool = std::vector<ExpressionPtr>;
/**
* @brief Get Buffer Index in Buffer set
* @param target the target Buffer expression
* @param pool set of Buffers from the Linear IR
* @return index of target Buffer expression in set
*/
static size_t get_buffer_idx(const ExpressionPtr& target, const BufferPool& pool);
/**
* @brief Create adjacency matrix for Buffer system. See comment in the method for more details.
* @param linear_ir the target Linear IR
* @param pool set of Buffers from the Linear IR
* @return adjacency matrix where True value means that Buffers are adjacent and cannot have the same ID
*/
static std::vector<bool> create_adjacency_matrix(const LinearIR& linear_ir, const BufferPool& pool);
/**
* @brief Algorithm of Graph coloring where vertices are Buffers
* @param buffers set of Buffers from the Linear IR
* @param adj adjacency matrix
* @return map [color id -> Buffer set]
*/
static std::map<size_t, BufferPool> coloring(BufferPool& buffers, std::vector<bool>& adj);
/**
* @brief Update the adjacency matrix:
* - If Buffers are from the same Loops and connected to the same Loop and
* their ptr shift params for this Loop are not proportional, the Buffers are adjacent - set True in the matrix;
* - If one Buffer is inside a Loop while another Buffer connected to this Loop has nonzero data shift params,
* the Buffers are adjacent - set True in the matrix;
* @param lhs Pair where the first value is the Expression with the first Buffer and the second value is its data pointer shift params
* @param rhs Pair where the first value is the Expression with the second Buffer and the second value is its data pointer shift params
* @param buffers set of Buffers from the Linear IR
* @param adj Target adjacency matrix
*/
static void update_adj_matrix(const std::pair<ExpressionPtr, ShiftPtrParams>& lhs,
const std::pair<ExpressionPtr, ShiftPtrParams>& rhs,
const BufferPool& buffers,
std::vector<bool>& adj);
/**
* @brief Check if two Buffers are adjacent and cannot have the same ID
* @param lhs Pair where first value is Expression with first Buffer and second value is data pointer shift params for it
* @param rhs Pair where first value is Expression with second Buffer and second value is data pointer shift params for it
* @return Returns True if they are adjacent, otherwise returns False
*/
static bool are_adjacent(const std::pair<ExpressionPtr, ShiftPtrParams>& lhs,
const std::pair<ExpressionPtr, ShiftPtrParams>& rhs);
};
} // namespace pass
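
To make the reuse rule concrete, a short sketch of can_reuse_id semantics (values are illustrative only):

    using SPP = IdentifyBuffers::ShiftPtrParams;  // { data_size, ptr_increment, finalization_offset }
    IdentifyBuffers::can_reuse_id(SPP(4, 16, -64), SPP(4, 16, -64));  // true: identical shifts and data sizes
    IdentifyBuffers::can_reuse_id(SPP(4, 0, 0), SPP(2, 0, 0));        // true: zero shifts, data size is irrelevant
    IdentifyBuffers::can_reuse_id(SPP(4, 16, -64), SPP(2, 16, -64));  // false: nonzero shifts with different data sizes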

View File

@@ -0,0 +1,41 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "pass.hpp"
namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
/**
* @interface InitBuffersDefault
* @brief The pass initializes Buffer expressions in the LinearIR in the default (non-optimized) way: it sets unique offsets and IDs to Buffers.
* @ingroup snippets
*/
class InitBuffersDefault : public Pass {
public:
OPENVINO_RTTI("InitBuffersDefault", "Pass")
InitBuffersDefault(size_t& buffer_scratchpad_size) : m_buffer_scratchpad_size(buffer_scratchpad_size) {
m_buffer_scratchpad_size = 0;
}
/**
* @brief Apply the pass to the Linear IR
* @param linear_ir the target Linear IR
* @return status of the pass
*/
bool run(lowered::LinearIR& linear_ir) override;
private:
size_t& m_buffer_scratchpad_size;
};
} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov

View File

@@ -15,7 +15,7 @@ namespace pass {
/**
* @interface InitLoops
* @brief The pass initialize scheduling information in LoopInfo
* @brief The pass initializes scheduling information in LoopInfo
* @ingroup snippets
*/
class InitLoops : public Pass {

View File

@@ -0,0 +1,40 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "pass.hpp"
namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
/**
* @interface NormalizeBufferIDs
* @brief After optimizations some Buffer IDs might be set unevenly: some numbers are skipped.
* For example,
* [Buffer -> ID]
* Buffer0 -> 0 Two Buffers have ID = 0, one has ID = 2.
* Buffer1 -> 2 Obviously, we can normalize these IDs and set ID = 1 to Buffer1.
* Buffer2 -> 0 It helps to assign GPR registers in `AssignRegister` more efficiently.
* Thus, the pass normalizes the IDs of Buffers in the Linear IR.
* @ingroup snippets
*/
class NormalizeBufferIDs : public Pass {
public:
OPENVINO_RTTI("NormalizeBufferIDs", "Pass")
/**
* @brief Apply the pass to the Linear IR
* @param linear_ir the target Linear IR
* @return status of the pass
*/
bool run(lowered::LinearIR& linear_ir) override;
};
} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
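
The renumbering above can be sketched with a plain map where first-seen order defines the new dense IDs (illustrative only):

    std::map<size_t, size_t> remap;       // original Buffer ID -> normalized ID
    for (size_t original_id : {0, 2, 0})  // Buffer0 -> 0, Buffer1 -> 2, Buffer2 -> 0
        remap.emplace(original_id, remap.size());
    // remap is {0 -> 0, 2 -> 1}: Buffer1 gets ID 1, Buffer0 and Buffer2 keep ID 0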

View File

@@ -39,6 +39,11 @@ public:
return get_type_info().name;
}
/**
* @brief Apply the pass to the Linear IR
* @param linear_ir the target Linear IR
* @return status of the pass
*/
virtual bool run(lowered::LinearIR& linear_ir) = 0;
};

View File

@@ -0,0 +1,54 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "pass.hpp"
#include "allocate_buffers.hpp"
#include "openvino/runtime/memory_solver.hpp"
namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
/**
* @interface SolveBufferMemory
* @brief The pass optimally calculates the common buffer scratchpad size and
* sets the offsets relative to the common data pointer for all Buffers. The pass uses the MemorySolver API.
* Note: The pass requires expression enumeration. It should be executed separately before this pass!
* @ingroup snippets
*/
class SolveBufferMemory : public Pass {
public:
OPENVINO_RTTI("SolveBufferMemory", "Pass")
SolveBufferMemory(size_t& buffer_scratchpad_size, AllocateBuffers::BufferClusters& clusters)
: m_buffer_scratchpad_size(buffer_scratchpad_size), m_clusters(clusters) {}
/**
* @brief Apply the pass to the Linear IR
* @param linear_ir the target Linear IR
* @return status of the pass
*/
bool run(lowered::LinearIR& linear_ir) override;
private:
/**
* @brief Initializes boxes for MemorySolver
* @param buffer_clusters buffer clusters. These clusters could be obtained using DefineBufferClusters pass
* @return vector of boxes for MemorySolver
*/
std::vector<ov::MemorySolver::Box> init_boxes(const AllocateBuffers::BufferClusters& buffer_clusters);
size_t& m_buffer_scratchpad_size;
AllocateBuffers::BufferClusters& m_clusters;
constexpr static size_t m_alignment = 32; // 32 bytes for data alignment in allocated memory
};
} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
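
A hedged sketch of the MemorySolver usage pattern this pass relies on (box values are illustrative; start/finish are execution orders and size is in m_alignment units, as in init_boxes):

    std::vector<ov::MemorySolver::Box> boxes = {
        {/*start=*/0, /*finish=*/2, /*size=*/2, /*id=*/0},
        {/*start=*/1, /*finish=*/3, /*size=*/1, /*id=*/1},  // lifetime overlaps box 0 -> distinct offset
        {/*start=*/4, /*finish=*/5, /*size=*/2, /*id=*/2},  // disjoint lifetime -> may reuse box 0 memory
    };
    ov::MemorySolver solver(boxes);
    const auto total_size = solver.solve();     // total scratchpad size in the same units
    const auto offset0 = solver.get_offset(0);  // per-box offset in the same units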

View File

@@ -51,6 +51,7 @@ constexpr inline bool implication(bool cause, bool cond) {
template <typename T, typename U>
inline T div_up(const T a, const U b) {
OPENVINO_ASSERT(b != 0, "Divider must not be zero");
return static_cast<T>((a + b - 1) / b);
}
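
For example, SolveBufferMemory relies on this helper for its alignment math: div_up(100, 32) == 4, i.e. a 100-byte buffer occupies four 32-byte alignment units.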

View File

@@ -2,9 +2,16 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/lowered/pass/allocate_buffers.hpp"
#include "snippets/lowered/linear_ir.hpp"
#include "snippets/lowered/pass/enumerate_expressions.hpp"
#include "snippets/lowered/pass/solve_buffer_memory.hpp"
#include "snippets/lowered/pass/init_buffers_default.hpp"
#include "snippets/lowered/pass/identify_buffers.hpp"
#include "snippets/lowered/pass/define_buffer_clusters.hpp"
#include "snippets/lowered/pass/normalize_buffer_ids.hpp"
#include "snippets/pass/tokenization.hpp"
#include "snippets/itt.hpp"
namespace ov {
@@ -12,11 +19,15 @@ namespace snippets {
namespace lowered {
namespace pass {
void AllocateBuffers::propagate_offset(const LinearIR& linear_ir, const ExpressionPtr& buffer_expr, const size_t offset) {
AllocateBuffers::AllocateBuffers(size_t& buffer_scratchpad_size, bool is_optimized)
: m_buffer_scratchpad_size(buffer_scratchpad_size), m_is_optimized_mode(is_optimized) {}
void AllocateBuffers::set_buffer_offset(const ExpressionPtr& buffer_expr, const size_t offset) {
// If Buffer has offset, we set this offset in the connected MemoryAccess ops
// to correctly read and write data because all Buffers has the common data pointer on buffer scratchpad
// to correctly read and write data because all Buffers have the common data pointer on buffer scratchpad
const auto buffer = ov::as_type_ptr<op::Buffer>(buffer_expr->get_node());
OPENVINO_ASSERT(buffer, "Failed to set Buffer offset: AllocateBuffers expects Buffer op");
buffer->set_offset(static_cast<int64_t>(offset));
// Propagate upwards: to the Store. Buffer can have only one Store
@@ -55,106 +66,23 @@ void AllocateBuffers::propagate_offset(const LinearIR& linear_ir, const Expressi
}
}
bool AllocateBuffers::run(LinearIR& linear_ir) {
bool AllocateBuffers::run(lowered::LinearIR& linear_ir) {
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::AllocateBuffers");
// [113664] The pass contains two main logic parts: it defines which of the buffers can be inplace (use the same memory) and
// allocates memory of the needed size. It should be split into several passes and updated within the scope of ticket 113664.
// [113664] At the moment the New Memory Buffer is used only in BrgemmCPU for the AMX case. This memory can be reused for each Brgemm.
// This plugin-specific condition will be removed in the near future after task 113664 is implemented
size_t offset = 0, new_memory_buffer_offset = 0;
size_t prev_data_size = 0, current_data_size = 0;
std::set<ExpressionPtr> allocated_buffers;
bool new_memory_buffer_allocated = false;
auto allocate = [&](const std::shared_ptr<op::Buffer>& buffer, const ExpressionPtr& expr, size_t buffer_size) {
offset = m_buffer_scratchpad_size;
propagate_offset(linear_ir, expr, offset);
m_buffer_scratchpad_size += buffer_size;
allocated_buffers.insert(expr);
prev_data_size = current_data_size;
};
for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) {
const auto& expr = *expr_it;
if (auto buffer = as_type_ptr<op::Buffer>(expr->get_node())) {
const auto buffer_size = buffer->get_byte_size();
current_data_size = buffer->get_element_type().size();
// If it's the first buffer, offsets are zero => nothing to propagate, can continue
if (m_buffer_scratchpad_size == 0) {
m_buffer_scratchpad_size += buffer_size;
allocated_buffers.insert(expr);
prev_data_size = current_data_size;
continue;
}
if (buffer->is_intermediate_memory()) {
const auto& parent_expr = expr->get_input_port_connector(0)->get_source().get_expr();
const auto& parent_node = parent_expr->get_node();
// Full MemoryAccess ops need new memory. The previous logic was to check that the parent isn't a Loop
// [113664] It should be unified in MemoryManager with memory reuse in the near future
const auto ma = ov::as_type_ptr<op::MemoryAccess>(parent_node);
if (ma && ma->is_full_memory_access_op()) {
allocate(buffer, *expr_it, buffer_size);
continue;
}
// Loop Full_MA
// | |
// Buffer_1 Buffer_0
// \ /
// Full_MA
// At the moment the pass supports only sequential implicit InPlace.
// If Buffer_0 is allocated firstly as Buffer after full memory access op,
// we cannot reuse this allocated memory for Buffer_1 - we must allocate new memory for it.
// [113664] It should be unified in MemoryManager with memory reuse in the near future
bool need_allocate = false;
const auto consumers = expr->get_output_port_connector(0)->get_consumers();
for (const auto& consumer : consumers) {
const auto& consumer_expr = consumer.get_expr();
const auto& child_node = consumer_expr->get_node();
const auto ma = ov::as_type_ptr<op::MemoryAccess>(child_node);
if (ma && ma->is_full_memory_access_op()) {
for (size_t i = 0; i < consumer_expr->get_input_count() && !need_allocate; ++i) {
if (i == consumer.get_index())
continue;
const auto buffer_sibling = consumer_expr->get_input_port_connector(i)->get_source().get_expr();
need_allocate = ov::is_type<op::Buffer>(buffer_sibling->get_node()) && allocated_buffers.count(buffer_sibling) != 0;
}
}
if (need_allocate)
break;
}
if (need_allocate) {
allocate(buffer, *expr_it, buffer_size);
continue;
}
// [113664] For more details and the reasoning behind the current solution, please see the ticket description
const auto current_allocated_memory_size = m_buffer_scratchpad_size - offset;
if (((current_data_size == prev_data_size) && buffer_size > current_allocated_memory_size) ||
((current_data_size != prev_data_size) && buffer_size != current_allocated_memory_size)) {
allocate(buffer, expr, buffer_size);
continue;
}
propagate_offset(linear_ir, *expr_it, offset);
allocated_buffers.insert(expr);
prev_data_size = current_data_size;
} else {
if (!new_memory_buffer_allocated) {
allocate(buffer, *expr_it, buffer_size);
new_memory_buffer_allocated = true;
new_memory_buffer_offset = offset;
} else {
propagate_offset(linear_ir, *expr_it, new_memory_buffer_offset);
allocated_buffers.insert(expr);
prev_data_size = current_data_size;
}
}
}
m_buffer_scratchpad_size = 0;
PassPipeline pipeline;
if (m_is_optimized_mode) {
BufferClusters buffer_clusters;
pipeline.register_pass<EnumerateExpressions>();
pipeline.register_pass<IdentifyBuffers>();
pipeline.register_pass<DefineBufferClusters>(buffer_clusters);
pipeline.register_pass<SolveBufferMemory>(m_buffer_scratchpad_size, buffer_clusters);
pipeline.register_pass<NormalizeBufferIDs>();
} else {
pipeline.register_pass<InitBuffersDefault>(m_buffer_scratchpad_size);
}
return !allocated_buffers.empty();
pipeline.run(linear_ir);
return m_buffer_scratchpad_size > 0;
}
} // namespace pass

View File

@@ -0,0 +1,346 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/lowered/pass/define_buffer_clusters.hpp"
#include "snippets/lowered/pass/identify_buffers.hpp"
#include "snippets/pass/tokenization.hpp"
#include "snippets/itt.hpp"
namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
using ShiftPtrParams = IdentifyBuffers::ShiftPtrParams;
AllocateBuffers::BufferClusters::iterator DefineBufferClusters::find_cluster_by_expr(const ExpressionPtr& target) {
return std::find_if(m_clusters.begin(), m_clusters.end(),
[&target](const AllocateBuffers::BufferCluster& cluster) { return cluster.count(target) > 0; });
}
bool DefineBufferClusters::is_direct_buffer(const ExpressionPtr& buffer_expr, const ExpressionPtr& target_expr) const {
const auto buffer = ov::as_type_ptr<op::Buffer>(buffer_expr->get_node());
return buffer && buffer_expr->get_loop_ids() == target_expr->get_loop_ids();
}
void DefineBufferClusters::create_new_cluster(const ExpressionPtr& buffer_expr) {
const auto cluster_it = find_cluster_by_expr(buffer_expr);
// If the Buffer is missing in clusters, create a new cluster with the single Buffer node inside
if (cluster_it == m_clusters.cend()) {
m_clusters.push_back(AllocateBuffers::BufferCluster{buffer_expr});
}
}
size_t DefineBufferClusters::get_cluster_buffer_id(const AllocateBuffers::BufferCluster& cluster) const {
OPENVINO_ASSERT(!cluster.empty(), "Buffer cluster is empty!");
const auto id = (ov::as_type_ptr<op::Buffer>(cluster.cbegin()->get()->get_node()))->get_id();
if (std::all_of(cluster.cbegin(), cluster.cend(),
[&id](const ExpressionPtr& expr) { return (ov::as_type_ptr<op::Buffer>(expr->get_node()))->get_id() == id; })) {
return id;
}
return SIZE_MAX;
}
DefineBufferClusters::BufferPorts DefineBufferClusters::get_input_buffers(const ExpressionPtr& loop_expr) const {
BufferPorts input_buffers;
const auto loop_end = ov::as_type_ptr<op::LoopEnd>(loop_expr->get_node());
const auto in_count = loop_end->get_input_num();
const auto connectors = loop_expr->get_input_port_connectors();
// Input Buffers
for (size_t i = 0; i < in_count; ++i) {
const auto source_expr = connectors[i]->get_source().get_expr();
if (!is_direct_buffer(source_expr, loop_expr))
continue;
// Save as input Buffer
const auto ret = input_buffers.insert(std::make_pair(source_expr, std::set<size_t>{ i })).second;
if (!ret)
input_buffers[source_expr].insert(i);
}
return input_buffers;
}
DefineBufferClusters::BufferPorts DefineBufferClusters::get_output_buffers(const ExpressionPtr& loop_expr) const {
BufferPorts output_buffers;
const auto loop_end = ov::as_type_ptr<op::LoopEnd>(loop_expr->get_node());
const auto in_count = loop_end->get_input_num();
const auto out_count = loop_end->get_output_num();
const auto connectors = loop_expr->get_input_port_connectors();
for (size_t i = in_count; i < in_count + out_count; ++i) {
for (const auto& consumer : connectors[i]->get_consumers()) {
auto consumer_expr = consumer.get_expr();
if (!is_direct_buffer(consumer_expr, loop_expr))
continue;
// Save as output Buffer
output_buffers[consumer_expr] = { i };
}
}
return output_buffers;
}
void DefineBufferClusters::parse_loop(const LinearIR::constExprIt& expr_it) {
const auto& expr = *expr_it;
const auto loop_end = ov::as_type_ptr<op::LoopEnd>(expr->get_node());
const auto& ptr_increments = loop_end->get_ptr_increments();
const auto& final_offsets = loop_end->get_finalization_offsets();
const auto& data_sizes = loop_end->get_element_type_sizes();
// [ Expression -> Port indexes ]
const auto input_buffers = get_input_buffers(expr);
const auto output_buffers = get_output_buffers(expr);
for (const auto& in : input_buffers)
create_new_cluster(in.first);
std::set<ExpressionPtr> visited_buffers;
for (const auto& out : output_buffers) {
const auto output_buffer_expr = out.first;
const auto output_buffer_port_idx = *(out.second.cbegin()); // There is always a single output port
const auto output_buffer = ov::as_type_ptr<op::Buffer>(output_buffer_expr->get_node());
bool has_been_added = false;
for (const auto& in : input_buffers) {
const auto& input_buffer_expr = in.first;
if (visited_buffers.count(input_buffer_expr) > 0)
continue;
const auto input_buffer = ov::as_type_ptr<op::Buffer>(input_buffer_expr->get_node());
const auto& input_buffer_ports = in.second;
for (const auto& input_buffer_port_idx : input_buffer_ports) {
// Memory can be reused if reading and writing are executed proportionally:
// - the same ShiftPtrParams (data size, final offsets, ptr increments)
// - the same reading/writing order
// - the same buffer memory sizes
const auto input_params =
ShiftPtrParams(data_sizes[input_buffer_port_idx], ptr_increments[input_buffer_port_idx], final_offsets[input_buffer_port_idx]);
const auto output_params =
ShiftPtrParams(data_sizes[output_buffer_port_idx], ptr_increments[output_buffer_port_idx], final_offsets[output_buffer_port_idx]);
if (input_buffer->get_byte_size() == output_buffer->get_byte_size() &&
input_buffer_expr->get_output_port_descriptor(0)->get_layout() == output_buffer_expr->get_input_port_descriptor(0)->get_layout() &&
input_params == output_params) {
const auto cluster_it = find_cluster_by_expr(input_buffer_expr);
OPENVINO_ASSERT(cluster_it != m_clusters.end(), "Buffer on inputs of Loop must be already saved in clusters");
// Add to the existing cluster
has_been_added = cluster_it->insert(output_buffer_expr).second;
OPENVINO_ASSERT(has_been_added, "Buffer has not been saved in cluster");
// Exclude the input buffer from further candidates because we have already used its memory
visited_buffers.insert(input_buffer_expr);
break;
}
}
if (has_been_added) break;
}
if (!has_been_added) {
m_clusters.push_back(AllocateBuffers::BufferCluster{output_buffer_expr});
}
}
// Check the Buffers inside for possible memory reuse using `window` sliding
parse_nested_loops(input_buffers, output_buffers, expr_it);
}
void DefineBufferClusters::parse_nested_loops(const BufferPorts& input_buffers, const BufferPorts& output_buffers,
const LinearIR::constExprIt& outer_loop_end_expr_it) {
if (input_buffers.empty() && output_buffers.empty())
return;
// The inner Buffer can reuse memory of the outer Buffer using `window` sliding only if:
// - The finalization offset of the latest Loop connected to the inner Buffer is equal to the pointer increment of the outer Buffer, to emulate `window` sliding
// - The outer Buffer should have the same Buffer ID as the inner one to move the data ptr of the inner Buffer after each outer Loop iteration.
// It's needed because all Loops reset the data pointers of connected Buffers after the full work.
// To avoid rewriting of the outer Buffer data, we have to have the same Buffer ID (GPR) to proportionally shift the pointers of both Buffers.
auto can_be_data_ptr_proportionally_shifted = [](int64_t outer_buffer_ptr_increment, int64_t outer_buffer_data_size,
int64_t inner_buffer_final_offsets, int64_t inner_buffer_data_size) {
return (outer_buffer_ptr_increment != 0) &&
((inner_buffer_data_size * inner_buffer_final_offsets * -1) == outer_buffer_ptr_increment * outer_buffer_data_size);
};
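// Worked example with illustrative values: an inner Buffer with data size 4 whose last Loop has
// finalization offset -128 can reuse an outer Buffer with data size 4 and ptr increment 128,
// since (4 * -128 * -1) == (128 * 4), i.e. 512 == 512, and the increment is nonzero.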
const auto outer_loop_end = ov::as_type_ptr<op::LoopEnd>(outer_loop_end_expr_it->get()->get_node());
const auto outer_loop_begin = outer_loop_end->get_loop_begin();
const auto& outer_ptr_increments = outer_loop_end->get_ptr_increments();
const auto& outer_data_sizes = outer_loop_end->get_element_type_sizes();
for (auto it = std::reverse_iterator<LinearIR::constExprIt>(outer_loop_end_expr_it); (*it)->get_node() != outer_loop_begin; ++it) {
const auto& inner_expr = *it;
if (const auto inner_buffer = ov::as_type_ptr<op::Buffer>(inner_expr->get_node())) {
const auto inner_cluster_it = find_cluster_by_expr(inner_expr);
OPENVINO_ASSERT(inner_cluster_it != m_clusters.cend(), "Buffer cluster has not been found");
const auto inner_cluster_id = get_cluster_buffer_id(*inner_cluster_it);
if (inner_cluster_id == SIZE_MAX) continue;
const auto final_offset = get_buffer_finalization_offset(inner_expr);
auto unite = [&](const BufferPorts& ports, const bool is_input) {
bool applied = false;
for (const auto& port : ports) {
const auto cluster_it = find_cluster_by_expr(port.first);
OPENVINO_ASSERT(cluster_it != m_clusters.cend(), "Buffer cluster has not been found");
// If the buffers are already in the same cluster or have different Buffer ID - skip
if (cluster_it == inner_cluster_it) continue;
bool can_be_reused = true;
for (const auto idx : port.second) {
can_be_reused = can_be_reused &&
can_be_data_ptr_proportionally_shifted(outer_ptr_increments[idx], outer_data_sizes[idx],
final_offset, inner_buffer->get_element_type().size());
}
if (!can_be_reused)
continue;
applied = unite_nested_clusters(inner_cluster_it, *cluster_it, port.first, is_input);
if (applied) break;
}
return applied;
};
if (unite(input_buffers, true)) continue;
if (unite(output_buffers, false)) continue;
}
}
}
int64_t DefineBufferClusters::get_buffer_finalization_offset(const ExpressionPtr& buffer_expr) const {
auto index = [](const std::vector<PortConnectorPtr>& loop_inputs, const PortConnectorPtr& buffer_out) {
const auto it = std::find(loop_inputs.cbegin(), loop_inputs.cend(), buffer_out);
OPENVINO_ASSERT(it != loop_inputs.cend(), "Buffer output PortConnector has not been found in target LoopEnd inputs");
return std::distance(loop_inputs.cbegin(), it);
};
int64_t final_offset = 0;
int64_t last_loop_exec_order = 0;
const auto buffer_outs = buffer_expr->get_output_port_connectors();
for (const auto& buffer_out : buffer_outs) {
const auto consumers = buffer_out->get_consumers();
for (const auto& consumer : consumers) {
const auto consumer_expr = consumer.get_expr();
const auto loop_end = ov::as_type_ptr<ov::snippets::op::LoopEnd>(consumer_expr->get_node());
if (loop_end && consumer_expr->get_loop_ids() == buffer_expr->get_loop_ids()) {
const auto loop_order = ov::snippets::pass::GetTopologicalOrder(loop_end);
if (loop_order > last_loop_exec_order) {
const auto loop_inputs = consumer_expr->get_input_port_connectors();
final_offset = loop_end->get_finalization_offsets()[index(loop_inputs, buffer_out)];
last_loop_exec_order = loop_order;
}
}
}
}
return final_offset;
}
bool DefineBufferClusters::unite_nested_clusters(const AllocateBuffers::BufferClusters::iterator& inner_cluster_it,
AllocateBuffers::BufferCluster& outer_cluster,
const ExpressionPtr& outer_buffer, bool is_outer_up) {
for (const auto& inner_buffer : *inner_cluster_it) {
ExpressionPtr common_loop_end_expr = nullptr;
size_t outer_idx = SIZE_MAX, inner_idx = SIZE_MAX;
const auto& up_buffer = is_outer_up ? outer_buffer : inner_buffer;
const auto& down_buffer = is_outer_up ? inner_buffer : outer_buffer;
auto& up_idx = is_outer_up ? outer_idx : inner_idx;
auto& down_idx = is_outer_up ? inner_idx : outer_idx;
if (are_buffer_neighbours(up_buffer, down_buffer, common_loop_end_expr, up_idx, down_idx)) {
const auto common_loop_end = ov::as_type_ptr<op::LoopEnd>(common_loop_end_expr->get_node());
const auto& inner_ptr_increments = common_loop_end->get_ptr_increments();
const auto& inner_final_offsets = common_loop_end->get_finalization_offsets();
const auto& inner_data_sizes = common_loop_end->get_element_type_sizes();
if (IdentifyBuffers::can_reuse_id({ inner_data_sizes[up_idx], inner_ptr_increments[up_idx], inner_final_offsets[up_idx] },
{ inner_data_sizes[down_idx], inner_ptr_increments[down_idx], inner_final_offsets[down_idx] })) {
const auto buffer_id = ov::as_type_ptr<op::Buffer>(outer_buffer->get_node())->get_id();
for (const auto& inner_buffer : *inner_cluster_it)
ov::as_type_ptr<op::Buffer>(inner_buffer->get_node())->set_id(buffer_id);
outer_cluster.insert(inner_cluster_it->cbegin(), inner_cluster_it->cend());
m_clusters.erase(inner_cluster_it);
return true;
}
}
}
return false;
}
bool DefineBufferClusters::are_buffer_neighbours(const ExpressionPtr& up, const ExpressionPtr& down, ExpressionPtr& loop, size_t& up_idx, size_t& down_idx) {
auto find_input = [&down](const PortConnectorPtr& in) {
return in->get_source().get_expr() == down;
};
auto find_output = [&down](const PortConnectorPtr& in) {
const auto consumers = in->get_consumers();
return std::any_of(consumers.cbegin(), consumers.cend(),
[&down](const ExpressionPort& port) { return port.get_expr() == down; });
};
auto find = [&](const std::vector<PortConnectorPtr>::const_iterator& begin,
const std::vector<PortConnectorPtr>::const_iterator& end,
const std::vector<PortConnectorPtr>::const_iterator& orig_begin,
const ExpressionPort& loop_port,
bool is_input) -> bool {
const auto in_buffer_it = is_input ? std::find_if(begin, end, find_input)
: std::find_if(begin, end, find_output);
if (in_buffer_it != end) {
up_idx = loop_port.get_index();
down_idx = std::distance(orig_begin, in_buffer_it);
loop = loop_port.get_expr();
return true;
}
return false;
};
for (const auto& out : up->get_output_port_connectors()) {
for (const auto& buffer_consumer : out->get_consumers()) {
const auto buffer_consumer_expr = buffer_consumer.get_expr();
const auto loop_end = ov::as_type_ptr<op::LoopEnd>(buffer_consumer_expr->get_node());
if (!loop_end)
continue;
const auto& loop_inputs = buffer_consumer_expr->get_input_port_connectors();
if (find(loop_inputs.cbegin(), loop_inputs.cbegin() + loop_end->get_input_num(), loop_inputs.cbegin(), buffer_consumer, true)) return true;
if (find(loop_inputs.cbegin() + loop_end->get_input_num(), loop_inputs.cend(), loop_inputs.cbegin(), buffer_consumer, false)) return true;
}
}
return false;
}
void DefineBufferClusters::parse_memory_access_op(const ExpressionPtr& expr) {
const auto ma = ov::as_type_ptr<op::MemoryAccess>(expr->get_node());
if (!ma->is_full_memory_access_op())
return;
// TODO: Some full MemoryAccess ops can have inplace inputs and outputs in general.
// Need to add mechanism of inplace ports using MemoryAccess::PortDescriptor::inplace
for (const auto& input : expr->get_input_port_connectors()) {
if (is_direct_buffer(input->get_source().get_expr(), expr)) {
create_new_cluster(input->get_source().get_expr());
}
}
for (const auto& output : expr->get_output_port_connectors()) {
for (const auto& consumer : output->get_consumers()) {
if (is_direct_buffer(consumer.get_expr(), expr)) {
create_new_cluster(consumer.get_expr());
}
}
}
}
bool DefineBufferClusters::run(LinearIR& linear_ir) {
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::DefineBufferClusters");
for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); ++expr_it) {
const auto& expr = *expr_it;
const auto op = expr->get_node();
if (ov::is_type<op::LoopEnd>(op)) {
parse_loop(expr_it);
continue;
}
if (ov::is_type<op::MemoryAccess>(op)) {
parse_memory_access_op(expr);
continue;
}
}
return true;
}
} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov

View File

@@ -0,0 +1,29 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/lowered/pass/enumerate_expressions.hpp"
#include "snippets/pass/tokenization.hpp"
#include "snippets/itt.hpp"
namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
bool EnumerateExpressions::run(LinearIR& linear_ir) {
// [113536]: The temporary solution reuses the topological order from tokenization.
// Support for Expression execution order needs to be added
int64_t order = 0;
for (const auto& expr : linear_ir) {
ov::snippets::pass::SetTopologicalOrder(expr->get_node(), order++);
}
return order > 0;
}
} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov

View File

@@ -20,37 +20,69 @@ inline size_t index(size_t col_num, size_t row, size_t col) {
}
} // namespace
std::vector<bool> IdentifyBuffers::create_adjacency_matrix(const LinearIR& linear_ir, const BufferSet& buffers) const {
bool operator==(const IdentifyBuffers::ShiftPtrParams& lhs, const IdentifyBuffers::ShiftPtrParams& rhs) {
if (&lhs == &rhs)
return true;
return lhs.ptr_increment == rhs.ptr_increment && lhs.finalization_offset == rhs.finalization_offset && lhs.data_size == rhs.data_size;
}
bool operator!=(const IdentifyBuffers::ShiftPtrParams& lhs, const IdentifyBuffers::ShiftPtrParams& rhs) {
return !(rhs == lhs);
}
size_t IdentifyBuffers::get_buffer_idx(const ExpressionPtr& target, const BufferPool& pool) {
const auto iter = std::find(pool.cbegin(), pool.cend(), target);
OPENVINO_ASSERT(iter != pool.cend(), "Buffer wasn't found in the Buffer system of Subgraph");
return std::distance(pool.cbegin(), iter);
}
bool IdentifyBuffers::can_reuse_id(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs) {
const auto equal_ptr_params_shifting = lhs.ptr_increment == rhs.ptr_increment && lhs.finalization_offset == rhs.finalization_offset;
const auto equal_element_type_sizes = lhs.data_size == rhs.data_size;
return equal_ptr_params_shifting && (equal_element_type_sizes || (lhs.ptr_increment == 0 && lhs.finalization_offset == 0));
}
bool IdentifyBuffers::are_adjacent(const std::pair<ExpressionPtr, ShiftPtrParams>& lhs,
const std::pair<ExpressionPtr, ShiftPtrParams>& rhs) {
const auto lhs_ids = lhs.first->get_loop_ids();
const auto rhs_ids = rhs.first->get_loop_ids();
const auto equal_loop_ids = lhs_ids == rhs_ids;
if (equal_loop_ids) { // Buffers are connected to the same Loop and have the same outer Loops
return !can_reuse_id(lhs.second, rhs.second);
} else { // Buffers are connected to the same Loop, but one of the Buffers is inside this Loop while the other is outside
// Buffers are adjacent if the outer Buffer has nonzero data shift params
if (lhs_ids.size() == rhs_ids.size()) // If the counts of outer Loops are equal, the outer Loops are already different
return true;
const auto& outer_buffer = lhs_ids.size() < rhs_ids.size() ? lhs : rhs;
const auto count_outer_loops = std::min(lhs_ids.size(), rhs_ids.size());
const auto are_outer_loops_the_same = lhs_ids.size() != rhs_ids.size() &&
std::equal(rhs_ids.cbegin(), rhs_ids.cbegin() + count_outer_loops, lhs_ids.cbegin());
const auto outer_buffer_has_zero_shifts = outer_buffer.second.ptr_increment == 0 && outer_buffer.second.finalization_offset == 0;
return !are_outer_loops_the_same || !outer_buffer_has_zero_shifts;
}
}
void IdentifyBuffers::update_adj_matrix(const std::pair<ExpressionPtr, ShiftPtrParams>& lhs,
const std::pair<ExpressionPtr, ShiftPtrParams>& rhs,
const BufferPool& buffers,
std::vector<bool>& adj) {
if (are_adjacent(lhs, rhs)) {
const auto size = buffers.size();
const auto lhs_idx = get_buffer_idx(lhs.first, buffers);
const auto rhs_idx = get_buffer_idx(rhs.first, buffers);
adj[index(size, rhs_idx, lhs_idx)] = adj[index(size, lhs_idx, rhs_idx)] = true;
}
}
std::vector<bool> IdentifyBuffers::create_adjacency_matrix(const LinearIR& linear_ir, const BufferPool& pool) {
// There are several sync points for adjacency check:
// 1. Loop, because only in Loops do we increment pointers. So if some Buffers in one Loop have a conflict
// (cannot be inplace: different ptr increments or data sizes), they are called adjacent
// 2. Brgemm because its blocking implementation requires Buffers with unique memory on all inputs and outputs
const auto size = buffers.size();
// TODO: Can we use a triangular matrix? Need to verify using tests
const auto size = pool.size();
std::vector<bool> adj(size * size, false);
for (size_t i = 0; i < size; ++i)
adj[index(size, i, i)] = true;
// < ptr_increment, finalization_offset >
using ShiftPtrParams = std::pair<int64_t, int64_t>;
auto get_buffer_idx = [&](const std::shared_ptr<op::Buffer>& buffer) {
const auto iter = std::find(buffers.cbegin(), buffers.cend(), buffer);
OPENVINO_ASSERT(iter != buffers.cend(), "Buffer wasn't found in the Buffer system of Subgraph");
return std::distance(buffers.cbegin(), iter);
};
auto update_adj_matrix = [&](const std::pair<std::shared_ptr<op::Buffer>, ShiftPtrParams>& buffer,
const std::pair<std::shared_ptr<op::Buffer>, ShiftPtrParams>& neighbour_buffer) {
const bool equal_ptr_params_shifting = buffer.second == neighbour_buffer.second;
const bool equal_element_type_sizes = buffer.first->get_element_type().size() == neighbour_buffer.first->get_element_type().size();
if (!equal_ptr_params_shifting || ((buffer.second.first != 0 || buffer.second.second != 0) && !equal_element_type_sizes)) {
const auto buffer_idx = get_buffer_idx(buffer.first);
const auto neighbour_idx = get_buffer_idx(neighbour_buffer.first);
adj[index(size, neighbour_idx, buffer_idx)] = adj[index(size, buffer_idx, neighbour_idx)] = true;
}
};
auto is_buffer = [](const ExpressionPort& port) {
return ov::is_type<op::Buffer>(port.get_expr()->get_node());
};
@@ -65,19 +97,19 @@ std::vector<bool> IdentifyBuffers::create_adjacency_matrix(const LinearIR& linea
continue;
OPENVINO_ASSERT(std::count_if(consumers.begin(), consumers.end(), is_buffer) == 1, "Brgemm mustn't have more than 1 consumer buffer");
std::vector<std::shared_ptr<op::Buffer>> adjacency_buffers;
adjacency_buffers.push_back(ov::as_type_ptr<op::Buffer>(buffer_it->get_expr()->get_node()));
BufferPool adjacency_buffers;
adjacency_buffers.push_back(buffer_it->get_expr());
for (const auto& input_connector : expr->get_input_port_connectors()) {
const auto parent_node = input_connector->get_source().get_expr()->get_node();
if (const auto neighbour_buffer = ov::as_type_ptr<op::Buffer>(parent_node)) {
adjacency_buffers.push_back(neighbour_buffer);
const auto parent_expr = input_connector->get_source().get_expr();
if (ov::is_type<op::Buffer>(parent_expr->get_node())) {
adjacency_buffers.push_back(parent_expr);
}
}
for (auto buffer_it = adjacency_buffers.begin(); buffer_it != adjacency_buffers.end(); ++buffer_it) {
for (auto neighbour_it = std::next(buffer_it); neighbour_it != adjacency_buffers.end(); ++neighbour_it) {
const auto buffer_idx = get_buffer_idx(*buffer_it);
const auto neighbour_idx = get_buffer_idx(*neighbour_it);
const auto buffer_idx = get_buffer_idx(*buffer_it, pool);
const auto neighbour_idx = get_buffer_idx(*neighbour_it, pool);
adj[index(size, neighbour_idx, buffer_idx)] = adj[index(size, buffer_idx, neighbour_idx)] = true;
}
}
@@ -91,29 +123,36 @@ std::vector<bool> IdentifyBuffers::create_adjacency_matrix(const LinearIR& linea
const auto input_count = loop_end->get_input_num();
const auto output_count = loop_end->get_output_num();
const auto ptr_increments = loop_end->get_ptr_increments();
const auto finalization_offsets = loop_end->get_finalization_offsets();
const auto& ptr_increments = loop_end->get_ptr_increments();
const auto& finalization_offsets = loop_end->get_finalization_offsets();
const auto& data_sizes = loop_end->get_element_type_sizes();
// Buffer -> <ptr increment, finalization_offsets>
std::map<std::shared_ptr<op::Buffer>, ShiftPtrParams> buffer_neighbours;
std::map<ExpressionPtr, ShiftPtrParams> buffer_neighbours;
for (size_t i = 0; i < input_count; ++i) {
const auto& parent_output = expr->get_input_port_connector(i)->get_source().get_expr();
if (const auto buffer = ov::as_type_ptr<op::Buffer>(parent_output->get_node())) {
buffer_neighbours[buffer] = { ptr_increments[i], finalization_offsets[i] };
if (ov::is_type<op::Buffer>(parent_output->get_node())) {
if (buffer_neighbours.count(parent_output) > 0) {
OPENVINO_ASSERT(buffer_neighbours[parent_output].ptr_increment == ptr_increments[i] &&
buffer_neighbours[parent_output].finalization_offset == finalization_offsets[i],
"Invalid data pointer shifts: If Buffer has several consumers, this consumers must have the same shifts or zero");
continue;
}
buffer_neighbours[parent_output] = { data_sizes[i], ptr_increments[i], finalization_offsets[i] };
}
}
for (size_t i = 0; i < output_count; ++i) {
for (size_t i = input_count; i < input_count + output_count; ++i) {
// The consumers of the corresponding Store ops
const auto index = input_count + i;
const auto consumer_inputs = expr->get_input_port_connector(index)->get_consumers();
const auto consumer_inputs = expr->get_input_port_connector(i)->get_consumers();
size_t buffer_count = 0;
size_t loop_count = 0;
for (const auto& consumer_input : consumer_inputs) {
const auto& child_node = consumer_input.get_expr()->get_node();
if (const auto buffer = ov::as_type_ptr<op::Buffer>(child_node)) {
buffer_neighbours[buffer] = { ptr_increments[index], finalization_offsets[index] };
} else if (ov::is_type<op::LoopEnd>(child_node)) {
const auto& child_expr = consumer_input.get_expr();
if (ov::is_type<op::Buffer>(child_expr->get_node())) {
buffer_neighbours[child_expr] = { data_sizes[i], ptr_increments[i], finalization_offsets[i] };
buffer_count++;
} else if (ov::is_type<op::LoopEnd>(child_expr->get_node())) {
loop_count++;
}
}
@@ -123,9 +162,24 @@ std::vector<bool> IdentifyBuffers::create_adjacency_matrix(const LinearIR& linea
}
}
// Buffers which are connected to the current Loop without ptr shifts and Buffers which are inside this Loop must be adjacent, because
// after each Loop iteration the GPR will be shifted using the ptr increment of the outer Buffer. But the Buffers inside have the same GPR, which means that
// the Buffers inside would work with shifted memory.
const auto loop_begin = loop_end->get_loop_begin();
for (auto it = std::reverse_iterator<LinearIR::constExprIt>(expr_it); (*it)->get_node() != loop_begin; ++it) {
const auto& inner_expr = *it;
if (ov::is_type<op::Buffer>(inner_expr->get_node())) {
// To make the Buffers adjacent, we set the value "INT64_MAX" for the data ptr shift params of inner Buffers,
// since outer Buffers (and any other Buffers) cannot have this value in shifting because of the semantics of the Loop op.
// Thus, inner and outer Buffers always have different data ptr shift params -> they're adjacent
if (buffer_neighbours.count(inner_expr) == 0)
buffer_neighbours[inner_expr] = { INT64_MAX, INT64_MAX, INT64_MAX };
}
}
for (auto buffer_it = buffer_neighbours.begin(); buffer_it != buffer_neighbours.end(); ++buffer_it) {
for (auto neighbour_it = std::next(buffer_it); neighbour_it != buffer_neighbours.end(); ++neighbour_it) {
update_adj_matrix(*buffer_it, *neighbour_it);
update_adj_matrix(*buffer_it, *neighbour_it, pool, adj);
}
}
}
@@ -133,9 +187,9 @@ std::vector<bool> IdentifyBuffers::create_adjacency_matrix(const LinearIR& linea
return adj;
}
auto IdentifyBuffers::coloring(BufferSet& buffers, std::vector<bool>& adj) -> std::map<size_t, BufferSet> {
auto IdentifyBuffers::coloring(BufferPool& buffers, std::vector<bool>& adj) -> std::map<size_t, BufferPool> {
size_t color = 0;
std::map<size_t, BufferSet> color_groups;
std::map<size_t, BufferPool> color_groups;
const auto size = buffers.size();
for (size_t i = 0; i < size; i++) {
// The Buffer is already colored (visited) - skip
@@ -183,25 +237,25 @@ bool IdentifyBuffers::run(LinearIR& linear_ir) {
// Unite Buffers using Graph coloring algorithm.
// Notes: We identify only Buffers with intermediate memory because Buffers with new memory are used only in the Brgemm case,
// so these Buffers are always nonadjacent to intermediate Buffers
BufferSet buffer_exprs;
BufferPool buffer_pool;
for (const auto& expr : linear_ir) {
if (const auto buffer = ov::as_type_ptr<op::Buffer>(expr->get_node())) {
buffer_exprs.push_back(buffer);
if (ov::is_type<op::Buffer>(expr->get_node())) {
buffer_pool.push_back(expr);
}
}
// Creation of Adj matrix
auto adj = create_adjacency_matrix(linear_ir, buffer_exprs);
auto adj = create_adjacency_matrix(linear_ir, buffer_pool);
// Graph coloring algorithm
const auto color_groups = coloring(buffer_exprs, adj);
const auto color_groups = coloring(buffer_pool, adj);
for (const auto& pair : color_groups) {
const auto color = pair.first;
const auto& united_buffers = pair.second;
for (const auto& buffer : united_buffers) {
buffer->set_id(color);
for (const auto& buffer_expr : united_buffers) {
ov::as_type_ptr<op::Buffer>(buffer_expr->get_node())->set_id(color);
}
}

View File

@@ -0,0 +1,40 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/lowered/pass/init_buffers_default.hpp"
#include "snippets/lowered/pass/allocate_buffers.hpp"
#include "snippets/op/buffer.hpp"
#include "snippets/itt.hpp"
namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
bool InitBuffersDefault::run(LinearIR& linear_ir) {
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InitBuffersDefault");
size_t id = 0;
size_t offset = 0;
for (const auto& expr : linear_ir) {
const auto op = expr->get_node();
if (const auto buffer = ov::as_type_ptr<op::Buffer>(op)) {
AllocateBuffers::set_buffer_offset(expr, offset);
buffer->set_id(id);
offset += buffer->get_byte_size();
id++;
}
}
m_buffer_scratchpad_size = offset;
return m_buffer_scratchpad_size > 0;
}
} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
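
For intuition, a worked example of the default layout (Buffer sizes assumed): three Buffers of 128, 512, and 64 bytes receive IDs 0, 1, 2 and offsets 0, 128, and 640, so m_buffer_scratchpad_size ends up as 704 bytes - no memory is reused in this mode.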

View File

@@ -0,0 +1,38 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/lowered/pass/normalize_buffer_ids.hpp"
#include "snippets/op/buffer.hpp"
#include "snippets/itt.hpp"
namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
bool NormalizeBufferIDs::run(LinearIR& linear_ir) {
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::NormalizeBufferIDs");
// [ original Buffer ID -> normalized ]
std::map<size_t, size_t> buffer_ids;
for (const auto& expr : linear_ir) {
const auto op = expr->get_node();
if (const auto buffer = ov::as_type_ptr<op::Buffer>(op)) {
const auto buffer_id = buffer->get_id();
if (buffer_ids.count(buffer_id) == 0) {
const auto new_id = buffer_ids.size();
buffer_ids[buffer_id] = new_id;
}
buffer->set_id(buffer_ids[buffer_id]);
}
}
return !buffer_ids.empty();
}
} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
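NormalizeBufferIDs only compacts the ID space after coloring; the grouping itself is preserved. A minimal standalone sketch of the remap on plain values (the {5, 2, 5, 7} input is invented):

#include <cstddef>
#include <map>
#include <vector>

// Remap arbitrary IDs to a dense 0..N-1 range in order of first appearance,
// mirroring the pass above: {5, 2, 5, 7} -> {0, 1, 0, 2}.
std::vector<size_t> normalize_ids(const std::vector<size_t>& ids) {
    std::map<size_t, size_t> remap;  // [ original ID -> normalized ]
    std::vector<size_t> result;
    for (size_t id : ids) {
        if (remap.count(id) == 0) {
            const size_t next_id = remap.size();
            remap[id] = next_id;
        }
        result.push_back(remap[id]);
    }
    return result;
}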

View File

@ -0,0 +1,89 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/lowered/pass/solve_buffer_memory.hpp"
#include "snippets/pass/tokenization.hpp"
#include "snippets/utils.hpp"
#include "snippets/itt.hpp"
namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
std::vector<ov::MemorySolver::Box> SolveBufferMemory::init_boxes(const AllocateBuffers::BufferClusters& buffer_clusters) {
std::vector<ov::MemorySolver::Box> boxes;
const auto count = static_cast<int>(buffer_clusters.size());
for (int i = 0; i < count; i++) {
ov::MemorySolver::Box box = { std::numeric_limits<int>::max(), 0, 0, i };
int64_t box_size = 0;
for (const auto& buffer_expr : buffer_clusters[i]) {
int e_start = 0, e_finish = 0;
const auto buffer = ov::as_type_ptr<ov::snippets::op::Buffer>(buffer_expr->get_node());
OPENVINO_ASSERT(buffer != nullptr, "SolveBufferMemory expects Buffer ops in clusters");
// lifetime finish: the topological order of the last consumer (LoopEnd / MemoryAccess ops)
const auto buffer_outs = buffer_expr->get_output_port_connectors();
for (const auto& buffer_out : buffer_outs) {
const auto consumers = buffer_out->get_consumers();
for (const auto& consumer : consumers) {
const auto consumer_order = static_cast<int>(ov::snippets::pass::GetTopologicalOrder(consumer.get_expr()->get_node()));
e_finish = std::max(e_finish, consumer_order); // the last consumer
}
}
e_start = e_finish;
const auto buffer_ins = buffer_expr->get_input_port_connectors();
for (const auto& buffer_in : buffer_ins) {
const auto& source = buffer_in->get_source();
e_start = static_cast<int>(ov::snippets::pass::GetTopologicalOrder(source.get_expr()->get_node()));
const auto buffer_siblings = buffer_in->get_consumers();
for (const auto& sibling : buffer_siblings) {
if (const auto loop_end = ov::as_type_ptr<ov::snippets::op::LoopEnd>(sibling.get_expr()->get_node())) {
e_start = std::min(e_start, static_cast<int>(ov::snippets::pass::GetTopologicalOrder(loop_end->get_loop_begin())));
}
}
}
OPENVINO_ASSERT(e_start <= e_finish, "Incorrect lifetime of the buffer!");
auto buffer_size = static_cast<int64_t>(buffer->get_byte_size());
box_size = std::max(buffer_size, box_size);
box.start = std::min(e_start, box.start);
box.finish = std::max(e_finish, box.finish);
}
// We use data alignment to place data on cache-line boundaries
box.size = utils::div_up(box_size, m_alignment);
boxes.push_back(box);
}
return boxes;
}
bool SolveBufferMemory::run(LinearIR& linear_ir) {
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::SolveBufferMemory");
const auto boxes = init_boxes(m_clusters);
ov::MemorySolver memSolver(boxes);
m_buffer_scratchpad_size = static_cast<size_t>(memSolver.solve()) * m_alignment; // alignment in bytes
// Set offsets for Buffers
for (const auto& box : boxes) {
for (const auto& buffer : m_clusters[box.id]) {
const auto offset = static_cast<size_t>(memSolver.get_offset(static_cast<int>(box.id)));
AllocateBuffers::set_buffer_offset(buffer, offset * m_alignment); // alignment in bytes
}
}
return m_buffer_scratchpad_size > 0;
}
} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
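Each cluster above becomes one MemorySolver box: start is the earliest producer order (extended to the enclosing LoopBegin when the buffer lives across a loop), finish is the order of the last consumer, and size is the cluster's largest byte size converted to alignment units. A worked example with invented numbers:

// Hypothetical cluster of two buffers (orders and sizes are invented):
//   buffer X: producer order 3, last consumer order 7, 40000 bytes
//   buffer Y: producer order 5, last consumer order 9, 32000 bytes
// With m_alignment = 32:
//   box.start  = min(3, 5) = 3
//   box.finish = max(7, 9) = 9
//   box.size   = div_up(max(40000, 32000), 32) = 1250   // in alignment units
// After solve(), the scratchpad size and all offsets are multiplied back by
// m_alignment, so the results are expressed in bytes again.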

View File

@ -39,7 +39,6 @@
#include "snippets/lowered/pass/move_scalar_to_consumer.hpp"
#include "snippets/lowered/pass/move_result_out_of_loop.hpp"
#include "snippets/lowered/pass/clean_repeated_ptr_shifts.hpp"
#include "snippets/lowered/pass/identify_buffers.hpp"
#include "snippets/lowered/pass/validate_loops.hpp"
#include "snippets/lowered/pass/insert_loops.hpp"
#include "snippets/lowered/pass/optimize_domain.hpp"
@ -453,19 +452,12 @@ void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir,
backend_passes_post_common.run(linear_ir);
const auto buffer_allocation_pass = std::make_shared<lowered::pass::AllocateBuffers>();
lowered::pass::PassPipeline buffer_pipeline;
buffer_pipeline.register_pass<lowered::pass::IdentifyBuffers>();
buffer_pipeline.register_pass<lowered::pass::CleanRepeatedDataPointerShifts>();
buffer_pipeline.register_pass(buffer_allocation_pass);
buffer_pipeline.run(linear_ir);
lowered::pass::PassPipeline final_pipeline;
final_pipeline.register_pass<lowered::pass::AllocateBuffers>(lowering_result.buffer_scratchpad_size, linear_ir.get_config().m_are_buffers_optimized);
final_pipeline.register_pass<lowered::pass::CleanRepeatedDataPointerShifts>();
final_pipeline.register_pass<lowered::pass::PropagateLayout>();
final_pipeline.register_pass<lowered::pass::CleanupLoopOffsets>();
final_pipeline.run(linear_ir);
lowering_result.buffer_scratchpad_size = buffer_allocation_pass->get_scratchpad_size();
}
snippets::Schedule Subgraph::generate(const BlockedShapeVector& blocked_input_shapes,

View File

@ -0,0 +1,61 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <common_test_utils/ov_test_utils.hpp>
#include "snippets/op/brgemm.hpp"
#include "snippets/lowered/pass/pass.hpp"
namespace ov {
namespace test {
namespace snippets {
typedef std::tuple<
bool, // Optimized pipeline
bool, // With SplitLoops opt
size_t, // Expected Buffer size in bytes
size_t // Expected unique Buffer IDs count
> BufferAllocationParams;
class BufferAllocationTest : public testing::TestWithParam<BufferAllocationParams> {
public:
using VectorDims = ov::snippets::VectorDims;
static std::string getTestCaseName(testing::TestParamInfo<BufferAllocationParams> obj);
protected:
void SetUp() override;
void ApplyTransformations(bool is_optimized, bool with_split_loops);
void Validate();
virtual std::shared_ptr<ov::Model> GetModel() const = 0;
static void MarkOp(const std::shared_ptr<ov::Node>& node, const std::vector<size_t>& subtensor);
size_t m_buffer_scratchpad = 0;
ov::snippets::lowered::LinearIR m_linear_ir;
size_t m_expected_size = 0;
size_t m_expected_count = 0;
size_t m_loop_depth = 2;
size_t m_vector_size = 16;
};
class EltwiseBufferAllocationTest : public BufferAllocationTest {
protected:
std::shared_ptr<ov::Model> GetModel() const override;
};
class MHABufferAllocationTest : public BufferAllocationTest {
protected:
std::shared_ptr<ov::Model> GetModel() const override;
static void MarkBrgemm(const std::shared_ptr<ov::snippets::op::Brgemm>& node, const std::vector<size_t>& subtensor);
};
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -0,0 +1,213 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "lowered/pass/buffer_allocation.hpp"
#include "openvino/opsets/opset.hpp"
#include "snippets/snippets_isa.hpp"
#include "snippets/lowered/linear_ir.hpp"
#include "snippets/lowered/pass/mark_loops.hpp"
#include "snippets/lowered/pass/init_loops.hpp"
#include "snippets/lowered/pass/insert_load_store.hpp"
#include "snippets/lowered/pass/validate_loops.hpp"
#include "snippets/lowered/pass/insert_loops.hpp"
#include "snippets/lowered/pass/allocate_buffers.hpp"
#include "snippets/lowered/pass/fuse_loops.hpp"
#include "snippets/lowered/pass/split_loops.hpp"
#include "snippets/lowered/pass/insert_buffers.hpp"
#include "snippets/lowered/pass/softmax_decomposition.hpp"
#include "common_test_utils/common_utils.hpp"
namespace ov {
namespace test {
namespace snippets {
std::string BufferAllocationTest::getTestCaseName(testing::TestParamInfo<ov::test::snippets::BufferAllocationParams> obj) {
bool is_optimized, with_split_loops;
size_t expected_size, expected_count;
std::tie(is_optimized, with_split_loops, expected_size, expected_count) = obj.param;
std::ostringstream result;
result << "Opt=" << ov::test::utils::bool2str(is_optimized) << "_";
result << "Split=" << ov::test::utils::bool2str(with_split_loops) << "_";
result << "ExpBufferSize=" << expected_size << "_";
result << "ExpBufferNum=" << expected_count;
return result.str();
}
void BufferAllocationTest::SetUp() {
bool is_optimized, with_split_loops;
std::tie(is_optimized, with_split_loops, m_expected_size, m_expected_count) = this->GetParam();
const auto body = GetModel();
m_linear_ir = ov::snippets::lowered::LinearIR(body, std::make_shared<ov::snippets::IShapeInferSnippetsFactory>());
m_linear_ir.set_loop_depth(m_loop_depth);
ApplyTransformations(is_optimized, with_split_loops);
}
void BufferAllocationTest::MarkOp(const std::shared_ptr<ov::Node>& node, const std::vector<size_t>& subtensor) {
for (const auto& input : node->inputs())
ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
input, std::make_shared<ov::snippets::lowered::PortDescriptor>(input, subtensor));
for (const auto& output : node->outputs())
ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
output, std::make_shared<ov::snippets::lowered::PortDescriptor>(output, subtensor));
}
void BufferAllocationTest::ApplyTransformations(bool is_optimized, bool with_split) {
ov::snippets::lowered::pass::PassPipeline pipeline;
pipeline.register_pass<ov::snippets::lowered::pass::MarkLoops>(m_vector_size);
pipeline.register_pass<ov::snippets::lowered::pass::SoftmaxDecomposition>(m_vector_size);
pipeline.register_pass<ov::snippets::lowered::pass::FuseLoops>();
if (with_split)
pipeline.register_pass<ov::snippets::lowered::pass::SplitLoops>();
pipeline.register_pass<ov::snippets::lowered::pass::InsertBuffers>(2);
pipeline.register_pass<ov::snippets::lowered::pass::InsertLoadStore>(m_vector_size);
pipeline.register_pass<ov::snippets::lowered::pass::InitLoops>();
pipeline.register_pass<ov::snippets::lowered::pass::InsertLoops>();
pipeline.register_pass<ov::snippets::lowered::pass::AllocateBuffers>(m_buffer_scratchpad, is_optimized);
pipeline.run(m_linear_ir);
}
void BufferAllocationTest::Validate() {
std::set<size_t> gprs;
for (const auto& expr : m_linear_ir) {
if (const auto buffer = ov::as_type_ptr<ov::snippets::op::Buffer>(expr->get_node())) {
gprs.insert(buffer->get_id());
}
}
EXPECT_EQ(gprs.size(), m_expected_count);
EXPECT_EQ(m_buffer_scratchpad, m_expected_size);
}
std::shared_ptr<ov::Model> EltwiseBufferAllocationTest::GetModel() const {
const auto subtensor_eltwise = std::vector<size_t>{1, m_vector_size};
const auto subtensor_buffer = std::vector<size_t>{ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM,
ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM};
const auto parameter0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape({1, 3, 100, 100}));
const auto parameter1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape({1, 3, 100, 100}));
const auto add = std::make_shared<ov::op::v1::Add>(parameter0, parameter1);
const auto buffer0 = std::make_shared<ov::snippets::op::Buffer>(add, static_cast<int32_t>(subtensor_buffer.size()));
const auto relu = std::make_shared<ov::op::v0::Relu>(buffer0);
const auto buffer1 = std::make_shared<ov::snippets::op::Buffer>(relu, static_cast<int32_t>(subtensor_buffer.size()));
const auto exp = std::make_shared<ov::op::v0::Exp>(buffer1);
const auto body = std::make_shared<ov::Model>(std::make_shared<ov::op::v0::Result>(exp), ov::ParameterVector{parameter0, parameter1});
MarkOp(add, subtensor_eltwise);
MarkOp(relu, subtensor_eltwise);
MarkOp(exp, subtensor_eltwise);
MarkOp(buffer0, subtensor_buffer);
MarkOp(buffer1, subtensor_buffer);
return body;
}
void MHABufferAllocationTest::MarkBrgemm(const std::shared_ptr<ov::snippets::op::Brgemm>& node, const std::vector<size_t>& subtensor) {
const auto subtensor_full = std::vector<size_t>{ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM,
ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM};
ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
node->input(0), std::make_shared<ov::snippets::lowered::PortDescriptor>(node->input(0), subtensor));
ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
node->input(1), std::make_shared<ov::snippets::lowered::PortDescriptor>(node->input(1), subtensor_full));
ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
node->output(0), std::make_shared<ov::snippets::lowered::PortDescriptor>(node->output(0), subtensor));
}
std::shared_ptr<ov::Model> MHABufferAllocationTest::GetModel() const {
const auto subtensor_scalar = std::vector<size_t>{1, 1};
const auto subtensor_eltwise = std::vector<size_t>{1, m_vector_size};
const auto subtensor_brgemm = std::vector<size_t>{32, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM};
const auto subtensor_softmax = std::vector<size_t>{1, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM};
const auto parameter0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape({1, 12, 128, 64}));
const auto parameter1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape({1, 128, 12, 64}));
const auto parameter2 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape({1, 12, 128, 64}));
const auto load_reshape = std::make_shared<ov::snippets::op::LoadReshape>(parameter1, 1, 0, std::vector<size_t>{0, 2, 3, 1});
const auto store = std::make_shared<ov::snippets::op::Store>(load_reshape);
const auto relu0 = std::make_shared<ov::op::v0::Relu>(store);
const auto matmul0 = std::make_shared<ov::snippets::op::Brgemm>(parameter0, relu0);
const auto relu1 = std::make_shared<ov::op::v0::Relu>(matmul0);
const auto softmax = std::make_shared<ov::op::v1::Softmax>(relu1, 3);
const auto matmul1 = std::make_shared<ov::snippets::op::Brgemm>(softmax, parameter2);
const auto relu2 = std::make_shared<ov::op::v0::Relu>(matmul1);
const auto body = std::make_shared<ov::Model>(std::make_shared<ov::op::v0::Result>(relu2), ov::ParameterVector{parameter0, parameter1, parameter2});
MarkOp(load_reshape, subtensor_scalar);
MarkOp(store, subtensor_scalar);
MarkOp(softmax, subtensor_softmax);
MarkBrgemm(matmul0, subtensor_brgemm);
MarkBrgemm(matmul1, subtensor_brgemm);
return body;
}
TEST_P(EltwiseBufferAllocationTest, BufferAllocation) {
Validate();
}
TEST_P(MHABufferAllocationTest, BufferAllocation) {
Validate();
}
namespace BufferAllocationTest_Instances {
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_EltwiseNotOptimized, EltwiseBufferAllocationTest,
::testing::Combine(
::testing::Values(false),
::testing::Values(false), // SplitLoops is irrelevant for this test
::testing::Values(80000), // Each Buffer has its own allocated memory
::testing::Values(2)), // Each Buffer has a unique ID
BufferAllocationTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_EltwiseOptimized, EltwiseBufferAllocationTest,
::testing::Combine(
::testing::Values(true),
::testing::Values(false), // SplitLoops is irrelevant for this test
::testing::Values(40000), // Two Buffers reuse the same memory
::testing::Values(1)), // Two Buffers share the same ID
BufferAllocationTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWSplit, MHABufferAllocationTest,
::testing::Combine(
::testing::Values(false),
::testing::Values(true),
::testing::Values(139264), // Each Buffer has its own allocated memory
::testing::Values(7)), // Each Buffer has a unique ID
BufferAllocationTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWSplit, MHABufferAllocationTest,
::testing::Combine(
::testing::Values(true),
::testing::Values(true),
::testing::Values(57344), // (Buffer before brgemm) + (between brgemms) + (after brgemm)
::testing::Values(3)), // (Buffer before brgemm) + (between brgemms) + (after brgemm)
BufferAllocationTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWOSplit, MHABufferAllocationTest,
::testing::Combine(
::testing::Values(false),
::testing::Values(false),
::testing::Values(360448), // Each Buffer has its own allocated memory
::testing::Values(7)), // Each Buffer has a unique ID
BufferAllocationTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWOSplit, MHABufferAllocationTest,
::testing::Combine(
::testing::Values(true),
::testing::Values(false),
::testing::Values(98304), // (between brgemms) + (Buffer before brgemm0 and after brgemm1)
::testing::Values(2)), // (Buffer before brgemm0 and after brgemm1) + (between brgemms)
BufferAllocationTest::getTestCaseName);
} // namespace BufferAllocationTest_Instances
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -8,13 +8,16 @@
*/
#pragma once
#include <ie_common.h>
#include <stdint.h>
#include <algorithm>
#include <map>
#include <vector>
#include "openvino/core/except.hpp"
namespace ov {
/**
* @brief Helps to solve the problem of optimal memory allocation for a particular
* execution order.
@ -42,7 +45,6 @@
* Exec order is predefined.
*/
IE_SUPPRESS_DEPRECATED_START
class MemorySolver {
public:
/** @brief Representation of edge (size and live time)*/
@ -67,7 +69,7 @@ public:
/** @brief Performs in-place normalization of the input boxes
@return lifespan of all boxes
*/
static int normalizeBoxes(std::vector<Box>& boxes) {
static int normalize_boxes(std::vector<Box>& boxes) {
int max_ts = 0;
for (const Box& box : boxes)
max_ts = std::max(std::max(max_ts, box.start), box.finish);
@ -113,10 +115,10 @@ public:
// 2. Box.finish >= Box.start (except Box.finish == -1)
// 3. Box.size > 0 (or == 0 ?)
// 4. Box.id == any unique value
_time_duration = normalizeBoxes(_boxes);
_time_duration = normalize_boxes(_boxes);
}
inline bool popupTogetherWith(MemorySolver::Box& box_new, const MemorySolver::Box& box_old) {
inline bool popup_together_with(MemorySolver::Box& box_new, const MemorySolver::Box& box_old) {
if (box_new.id + box_new.size > box_old.id && box_old.id + box_old.size > box_new.id) {
// Move the new one up. There is an intersection
box_new.id = box_old.id + box_old.size;
@ -131,7 +133,7 @@ public:
* @return Size of common memory blob required for storing all
*/
int64_t solve() {
maxTopDepth(); // at first make sure that we no need more for boxes sorted by box.start
max_top_depth(); // first, make sure we don't need more than _top_depth slots for boxes sorted by box.start
std::vector<std::vector<const Box*>> time_slots(_time_duration);
for (auto& slot : time_slots)
slot.reserve(_top_depth); // 2D array [_time_duration][_top_depth]
@ -155,8 +157,8 @@ public:
for (auto* box_in_slot : time_slots[i_slot]) {
// intersect with already stored boxes for all covered time slots
// and move up the new one if needed
// Execution of 'popupTogetherWith' is important even if 'popped_up' is already 'true'
popped_up = popupTogetherWith(box, *box_in_slot) || popped_up;
// Execution of 'popup_together_with' is important even if 'popped_up' is already 'true'
popped_up = popup_together_with(box, *box_in_slot) || popped_up;
}
}
} while (popped_up);
@ -174,23 +176,23 @@ public:
}
/** Provides calculated offset for specified box id */
int64_t getOffset(int id) const {
int64_t get_offset(int id) const {
auto res = _offsets.find(id);
if (res == _offsets.end())
IE_THROW() << "There are no box for provided ID";
OPENVINO_THROW("There is no box for the provided ID");
return res->second;
}
/** Additional info. Max sum of box sizes required for any time stamp. */
int64_t maxDepth() {
int64_t max_depth() {
if (_depth == -1)
calcDepth();
calc_depth();
return _depth;
}
/** Additional info. Max num of boxes required for any time stamp. */
int64_t maxTopDepth() {
int64_t max_top_depth() {
if (_top_depth == -1)
calcDepth();
calc_depth();
return _top_depth;
}
@ -201,7 +203,7 @@ private:
int64_t _depth = -1;
int _time_duration = -1;
void calcDepth() {
void calc_depth() {
int64_t top_depth = 0;
int64_t depth = 0;
std::map<int64_t, std::vector<const Box*>> release_at;
@ -218,11 +220,12 @@ private:
top_depth--;
}
release_at.erase(time);
IE_ASSERT(top_depth > 0);
OPENVINO_ASSERT(top_depth > 0);
_top_depth = std::max(_top_depth, top_depth);
_depth = std::max(_depth, depth);
}
}
};
IE_SUPPRESS_DEPRECATED_END
} // namespace ov
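With the move to ov:: and snake_case, typical usage looks as follows; a minimal sketch against the renamed API, with invented box values (see the unit tests below for real cases):

#include <cstdint>
#include <vector>

#include "openvino/runtime/memory_solver.hpp"

int main() {
    // Box = {start, finish, size, id}: a live interval in execution order
    // plus the requested amount of memory (in abstract units).
    std::vector<ov::MemorySolver::Box> boxes{
        {0, 1, 2, 0},  // live over exec indexes [0, 1], needs 2 units
        {1, 2, 2, 1},
        {2, 3, 2, 2},
    };
    ov::MemorySolver solver(boxes);
    const int64_t total = solver.solve();          // size of the common memory blob
    const int64_t offset0 = solver.get_offset(0);  // per-box offset inside the blob
    (void)total;
    (void)offset0;
    return 0;
}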

View File

@ -2,33 +2,33 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "memory_solver.hpp"
#include "openvino/runtime/memory_solver.hpp"
#include <gtest/gtest.h>
#include <vector>
using Box = MemorySolver::Box;
using Box = ov::MemorySolver::Box;
TEST(MemSolverTest, CanConstruct) {
{ // Empty vector<Box>
MemorySolver ms(std::vector<Box>{});
ov::MemorySolver ms(std::vector<Box>{});
}
{ // vector with default Box
MemorySolver ms(std::vector<Box>{{}});
ov::MemorySolver ms(std::vector<Box>{{}});
}
{ // vector with Box with non-default Box
MemorySolver ms(std::vector<Box>{{1, 3, 3}});
ov::MemorySolver ms(std::vector<Box>{{1, 3, 3}});
}
{ // vector with Box with size == 0
MemorySolver ms(std::vector<Box>{{0, 0, 0}});
ov::MemorySolver ms(std::vector<Box>{{0, 0, 0}});
}
{ // vector with Box with finish == -1
MemorySolver ms(std::vector<Box>{{3, -1, 6}});
ov::MemorySolver ms(std::vector<Box>{{3, -1, 6}});
}
// TODO: enable after implementing the TODO from memory_solver.hpp#L66
@ -42,7 +42,7 @@ TEST(MemSolverTest, CanConstruct) {
// | __|____||____|
// |__|____||____|_____
// 0 1 2 3 4
TEST(MemSolverTest, GetOffset) {
TEST(MemSolverTest, get_offset) {
int n = 0;
std::vector<Box> boxes{
{n, ++n, 2, 0},
@ -51,13 +51,13 @@ TEST(MemSolverTest, GetOffset) {
{n, ++n, 2, 3},
};
MemorySolver ms(boxes);
ov::MemorySolver ms(boxes);
ms.solve();
// The correct answer is [0, 2, 0, 2] or [2, 0, 2, 0].
EXPECT_EQ(ms.getOffset(0) + ms.getOffset(1), 2);
EXPECT_EQ(ms.getOffset(1) + ms.getOffset(2), 2);
EXPECT_EQ(ms.getOffset(2) + ms.getOffset(3), 2);
EXPECT_EQ(ms.get_offset(0) + ms.get_offset(1), 2);
EXPECT_EQ(ms.get_offset(1) + ms.get_offset(2), 2);
EXPECT_EQ(ms.get_offset(2) + ms.get_offset(3), 2);
}
// |
@ -65,7 +65,7 @@ TEST(MemSolverTest, GetOffset) {
// | __|____||____|
// |__|____||____|_____
// 0 1 2 3 4
TEST(MemSolverTest, GetOffsetThrowException) {
TEST(MemSolverTest, get_offsetThrowException) {
int n = 0, id = 0;
std::vector<Box> boxes{
{n, ++n, 2, id++},
@ -74,10 +74,10 @@ TEST(MemSolverTest, GetOffsetThrowException) {
{n, ++n, 2, id++},
};
MemorySolver ms(boxes);
ov::MemorySolver ms(boxes);
ms.solve();
EXPECT_THROW(ms.getOffset(100), std::runtime_error);
EXPECT_THROW(ms.get_offset(100), std::runtime_error);
}
// |
@ -93,10 +93,10 @@ TEST(MemSolverTest, LinearAndEven) {
{n, ++n, 2},
};
MemorySolver ms(boxes);
ov::MemorySolver ms(boxes);
EXPECT_EQ(ms.solve(), 4);
EXPECT_EQ(ms.maxDepth(), 4);
EXPECT_EQ(ms.maxTopDepth(), 2);
EXPECT_EQ(ms.max_depth(), 4);
EXPECT_EQ(ms.max_top_depth(), 2);
}
// | ____
@ -112,10 +112,10 @@ TEST(MemSolverTest, LinearAndNotEven) {
{n, ++n, 3},
};
MemorySolver ms(boxes);
ov::MemorySolver ms(boxes);
EXPECT_EQ(ms.solve(), 5);
EXPECT_EQ(ms.maxDepth(), 5);
EXPECT_EQ(ms.maxTopDepth(), 2);
EXPECT_EQ(ms.max_depth(), 5);
EXPECT_EQ(ms.max_top_depth(), 2);
}
// | _______
@ -131,10 +131,10 @@ TEST(MemSolverTest, LinearWithEmptyExecIndexes) {
{n, n += 2, 3},
};
MemorySolver ms(boxes);
ov::MemorySolver ms(boxes);
EXPECT_EQ(ms.solve(), 5);
EXPECT_EQ(ms.maxDepth(), 5);
EXPECT_EQ(ms.maxTopDepth(), 2);
EXPECT_EQ(ms.max_depth(), 5);
EXPECT_EQ(ms.max_top_depth(), 2);
}
// | __________
@ -150,10 +150,10 @@ TEST(MemSolverTest, DISABLED_Unefficiency) {
{2, 3, 2},
};
MemorySolver ms(boxes);
ov::MemorySolver ms(boxes);
EXPECT_EQ(ms.solve(), 5); // currently the answer is 6
EXPECT_EQ(ms.maxDepth(), 5);
EXPECT_EQ(ms.maxTopDepth(), 2);
EXPECT_EQ(ms.max_depth(), 5);
EXPECT_EQ(ms.max_top_depth(), 2);
}
// | __________
@ -169,10 +169,10 @@ TEST(MemSolverTest, OverlappingBoxes) {
{2, 3, 2},
};
MemorySolver ms(boxes);
ov::MemorySolver ms(boxes);
EXPECT_EQ(ms.solve(), 6);
EXPECT_EQ(ms.maxDepth(), 6);
EXPECT_EQ(ms.maxTopDepth(), 2);
EXPECT_EQ(ms.max_depth(), 6);
EXPECT_EQ(ms.max_top_depth(), 2);
}
// | ____
@ -190,10 +190,10 @@ TEST(MemSolverTest, EndOnSeveralBegins) {
{3, 4, 2},
};
MemorySolver ms(boxes);
ov::MemorySolver ms(boxes);
EXPECT_EQ(ms.solve(), 6);
EXPECT_EQ(ms.maxDepth(), 6);
EXPECT_EQ(ms.maxTopDepth(), 3);
EXPECT_EQ(ms.max_depth(), 6);
EXPECT_EQ(ms.max_top_depth(), 3);
}
// | _____________
@ -211,10 +211,10 @@ TEST(MemSolverTest, ToEndBoxes) {
{3, 4, 2},
};
MemorySolver ms(boxes);
ov::MemorySolver ms(boxes);
EXPECT_EQ(ms.solve(), 8);
EXPECT_EQ(ms.maxDepth(), 8);
EXPECT_EQ(ms.maxTopDepth(), 4);
EXPECT_EQ(ms.max_depth(), 8);
EXPECT_EQ(ms.max_top_depth(), 4);
}
// | _
@ -232,10 +232,10 @@ TEST(MemSolverTest, LastAndToEndBox) {
{3, 4, 2},
};
MemorySolver ms(boxes);
ov::MemorySolver ms(boxes);
EXPECT_EQ(ms.solve(), 6);
EXPECT_EQ(ms.maxDepth(), 6);
EXPECT_EQ(ms.maxTopDepth(), 3);
EXPECT_EQ(ms.max_depth(), 6);
EXPECT_EQ(ms.max_top_depth(), 3);
}
TEST(MemSolverTest, OptimalAlexnet) {
@ -269,10 +269,10 @@ TEST(MemSolverTest, OptimalAlexnet) {
for (const auto& sh : shapes)
boxes.push_back({n, ++n, sh[0] * sh[1] * sh[2]});
// For linear topology bottom score is reachable minRequired == maxDepth
MemorySolver ms(boxes);
EXPECT_EQ(ms.solve(), ms.maxDepth());
EXPECT_EQ(ms.maxTopDepth(), 2);
// For a linear topology the lower bound is reachable: minRequired == max_depth
ov::MemorySolver ms(boxes);
EXPECT_EQ(ms.solve(), ms.max_depth());
EXPECT_EQ(ms.max_top_depth(), 2);
}
// | _____________
@ -290,14 +290,14 @@ TEST(MemSolverTest, NoOverlapping) {
{2, 4, 2, n++},
};
MemorySolver ms(boxes);
ov::MemorySolver ms(boxes);
ms.solve();
// TODO: The current algorithm doesn't solve this case. Uncomment the check to see the inefficiency
// EXPECT_EQ(ms.solve(), 5);
auto no_overlap = [&](Box box1, Box box2) -> bool {
int64_t off1 = ms.getOffset(static_cast<int>(box1.id));
int64_t off2 = ms.getOffset(static_cast<int>(box2.id));
int64_t off1 = ms.get_offset(static_cast<int>(box1.id));
int64_t off2 = ms.get_offset(static_cast<int>(box2.id));
return box1.finish < box2.start || box1.start > box2.finish || off1 + box1.size <= off2 ||
off1 >= off2 + box2.size;
};
@ -322,12 +322,12 @@ TEST(MemSolverTest, BestSolution1) {
{6, 7, 3, n++},
};
MemorySolver ms(boxes);
ov::MemorySolver ms(boxes);
EXPECT_EQ(ms.solve(), 5);
auto no_overlap = [&](Box box1, Box box2) -> bool {
int64_t off1 = ms.getOffset(static_cast<int>(box1.id));
int64_t off2 = ms.getOffset(static_cast<int>(box2.id));
int64_t off1 = ms.get_offset(static_cast<int>(box1.id));
int64_t off2 = ms.get_offset(static_cast<int>(box2.id));
return box1.finish < box2.start || box1.start > box2.finish || off1 + box1.size <= off2 ||
off1 >= off2 + box2.size;
};

View File

@ -28,7 +28,6 @@
#include "low_precision/low_precision.hpp"
#include "memory_desc/cpu_memory_desc_utils.h"
#include "memory_desc/dnnl_blocked_memory_desc.h"
#include "memory_solver.hpp"
#include "nodes/common/cpu_convert.h"
#include "nodes/common/cpu_memcpy.h"
#include "nodes/convert.h"
@ -50,6 +49,8 @@
#include "utils/verbose.h"
#include "memory_desc/cpu_memory_desc_utils.h"
#include "openvino/runtime/memory_solver.hpp"
#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
# include <tbb/task.h>
#endif
@ -629,10 +630,10 @@ void Graph::AllocateWithReuse() {
const int64_t alignment = 32; // 32 bytes
// Markup the boxes
std::vector<MemorySolver::Box> definedBoxes;
std::vector<MemorySolver::Box> undefinedBoxes;
std::vector<ov::MemorySolver::Box> definedBoxes;
std::vector<ov::MemorySolver::Box> undefinedBoxes;
for (size_t i = 0; i < remaining_edge_clusters_count; i++) {
MemorySolver::Box box = { std::numeric_limits<int>::max(), 0, 0, static_cast<int64_t>(i) };
ov::MemorySolver::Box box = { std::numeric_limits<int>::max(), 0, 0, static_cast<int64_t>(i) };
int64_t boxSize = 0;
for (auto &edge : edge_clusters[i]) {
int e_start = edge->getParent()->execIndex;
@ -679,7 +680,7 @@ void Graph::AllocateWithReuse() {
}
// Process defined boxes (static shapes)
MemorySolver staticMemSolver(definedBoxes);
ov::MemorySolver staticMemSolver(definedBoxes);
size_t total_size = static_cast<size_t>(staticMemSolver.solve()) * alignment;
memWorkspace = std::make_shared<Memory>(getEngine(), DnnlBlockedMemoryDesc(ov::element::i8, Shape(VectorDims{total_size})));
@ -693,7 +694,7 @@ void Graph::AllocateWithReuse() {
int count = 0;
for (auto& edge : edge_clusters[box.id]) {
if (edge->getStatus() == Edge::Status::NeedAllocation) {
int64_t offset = staticMemSolver.getOffset(box.id);
int64_t offset = staticMemSolver.get_offset(box.id);
// !! Fallback to individual memory allocation !!
// If you would like to check inference without reuse, just call this function without arguments.
edge->allocate(workspace_ptr + offset * alignment); // alignment in byte
@ -762,9 +763,9 @@ void Graph::AllocateWithReuse() {
}
}
MemorySolver::normalizeBoxes(undefinedBoxes);
ov::MemorySolver::normalize_boxes(undefinedBoxes);
std::vector<std::vector<MemorySolver::Box>> groups; //groups of nonoverlapping boxes
std::vector<std::vector<ov::MemorySolver::Box>> groups; // groups of non-overlapping boxes
constexpr bool enableMemReuse = true; // set false to disable mem reuse for debug purposes
if (enableMemReuse) {
groups.push_back({undefinedBoxes.front()});

View File

@ -22,6 +22,20 @@ using LoopPort = LoopManager::LoopPort;
BrgemmBlocking::BrgemmBlocking() : Pass() {}
void BrgemmBlocking::move_new_memory_buffer(snippets::lowered::LinearIR& linear_ir, const snippets::lowered::LinearIR::constExprIt& brgemm_it) {
const auto& brgemm_expr = brgemm_it->get();
const auto wsp_expr = brgemm_expr->get_input_port_connector(2)->get_source().get_expr();
const auto wsp_buffer = ov::as_type_ptr<ov::snippets::op::Buffer>(wsp_expr->get_node());
OPENVINO_ASSERT(wsp_buffer && wsp_buffer->is_new_memory(), "Incorrect Scratchpad buffer for Brgemm AMX");
// [115164] Should be fully supported by explicit blocking loops over K and N
OPENVINO_ASSERT(brgemm_expr->get_loop_ids().empty() && wsp_expr->get_loop_ids().empty(), "Incorrect blocking loop marking for Brgemm AMX");
// If the scratchpad with temporary memory is not directly before Brgemm, it needs to be moved there.
if (wsp_expr != *std::prev(brgemm_it)) {
const auto wsp_it = linear_ir.find(wsp_expr);
linear_ir.move(wsp_it, brgemm_it);
}
}
bool BrgemmBlocking::run(snippets::lowered::LinearIR& linear_ir) {
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::BrgemmBlocking")
if (linear_ir.empty())
@ -64,11 +78,18 @@ bool BrgemmBlocking::run(snippets::lowered::LinearIR& linear_ir) {
const auto work_amount = m;
const auto increment = block_size;
auto loop_begin_it = expr_it, loop_end_it = std::next(expr_it);
std::vector<LoopPort> entries{LoopPort(expr->get_input_port(0), true), LoopPort(expr->get_input_port(1), false)};
if (brgemm->is_with_scratchpad())
// In the AMX scenario the scratchpad is needed only as a temporary buffer for each M block, which means the Buffer should be inside this loop.
// Other scratchpads (those after BrgemmCopyB) should stay outside the loop.
if (brgemm->is_with_compensations()) {
entries.emplace_back(expr->get_input_port(2), false);
} else if (brgemm->is_amx()) {
move_new_memory_buffer(linear_ir, expr_it);
loop_begin_it = std::prev(expr_it);
}
std::vector<LoopPort> exits{LoopPort(expr->get_output_port(0), true)};
loop_manager->mark_loop(expr_it, std::next(expr_it), work_amount, increment, dim_idx, entries, exits);
loop_manager->mark_loop(loop_begin_it, loop_end_it, work_amount, increment, dim_idx, entries, exits);
}
return modified;
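Schematically, the effect of the AMX branch is that the new-memory scratchpad is pulled inside the M-blocking loop, while compensation scratchpads remain plain loop entry ports (a hedged illustration, not an actual IR dump):

// Before:  ... -> NewMemoryBuffer(wsp) -> ... -> BrgemmCPU -> ...
// After:   ... -> [ LoopBegin(M) -> NewMemoryBuffer(wsp) -> BrgemmCPU -> LoopEnd(M) ] -> ...
// The wsp Buffer is moved directly before Brgemm (move_new_memory_buffer) and
// loop_begin_it is shifted to std::prev(expr_it), so the temporary scratchpad
// is covered by the M-blocking loop instead of living outside it.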

View File

@ -21,6 +21,9 @@ public:
OPENVINO_RTTI("BrgemmBlocking", "Pass")
BrgemmBlocking();
bool run(snippets::lowered::LinearIR& linear_ir) override;
private:
static void move_new_memory_buffer(snippets::lowered::LinearIR& linear_ir, const snippets::lowered::LinearIR::constExprIt& brgemm_it);
};
} // namespace pass

View File

@ -0,0 +1,216 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "openvino/opsets/opset.hpp"
#include "snippets/snippets_isa.hpp"
#include "snippets/lowered/linear_ir.hpp"
#include "snippets/lowered/pass/mark_loops.hpp"
#include "snippets/lowered/pass/init_loops.hpp"
#include "snippets/lowered/pass/insert_load_store.hpp"
#include "snippets/lowered/pass/validate_loops.hpp"
#include "snippets/lowered/pass/insert_loops.hpp"
#include "snippets/lowered/pass/allocate_buffers.hpp"
#include "snippets/lowered/pass/fuse_loops.hpp"
#include "snippets/lowered/pass/split_loops.hpp"
#include "snippets/lowered/pass/insert_buffers.hpp"
#include "snippets/lowered/pass/softmax_decomposition.hpp"
#include "transformations/snippets/x64/shape_inference.hpp"
#include "transformations/snippets/x64/pass/lowered/brgemm_blocking.hpp"
#include "transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.hpp"
#include "transformations/snippets/x64/op/brgemm_cpu.hpp"
#include "transformations/snippets/x64/op/brgemm_copy_b.hpp"
#include "common_test_utils/ov_test_utils.hpp"
#include "common_test_utils/common_utils.hpp"
namespace ov {
namespace test {
namespace snippets {
/* Note[74841]:
* This test is an almost full copy of the BufferAllocationTest class from openvino/src/common/snippets/tests/include/lowered/pass/buffer_allocation.hpp.
* The BufferAllocationTest class should become a shared test class so this structure can be reused in backend-specific tests during the test infrastructure refactoring.
*/
typedef std::tuple<
bool, // Optimized pipeline
bool, // With SplitLoops opt
size_t, // Expected Buffer size in bytes
size_t // Expected unique Buffer IDs count
> BufferAllocationCPUParams;
class BufferAllocationCPUTest : public testing::TestWithParam<BufferAllocationCPUParams> {
public:
using VectorDims = ov::snippets::VectorDims;
static std::string getTestCaseName(testing::TestParamInfo<BufferAllocationCPUParams> obj) {
bool is_optimized, with_split_loops;
size_t expected_size, expected_count;
std::tie(is_optimized, with_split_loops, expected_size, expected_count) = obj.param;
std::ostringstream result;
result << "Opt=" << ov::test::utils::bool2str(is_optimized) << "_";
result << "Split=" << ov::test::utils::bool2str(with_split_loops) << "_";
result << "ExpBufferSize=" << expected_size << "_";
result << "ExpBufferNum=" << expected_count;
return result.str();
}
protected:
void SetUp() override {
bool is_optimized, with_split_loops;
std::tie(is_optimized, with_split_loops, m_expected_size, m_expected_count) = this->GetParam();
const auto body = GetModel();
m_linear_ir = ov::snippets::lowered::LinearIR(body, std::make_shared<ov::snippets::CPUShapeInferSnippetsFactory>());
m_linear_ir.set_loop_depth(m_loop_depth);
ApplyTransformations(is_optimized, with_split_loops);
}
void ApplyTransformations(bool is_optimized, bool with_split_loops) {
ov::snippets::lowered::pass::PassPipeline pipeline;
pipeline.register_pass<ov::intel_cpu::pass::BrgemmBlocking>();
pipeline.register_pass<ov::snippets::lowered::pass::MarkLoops>(m_vector_size);
pipeline.register_pass<ov::snippets::lowered::pass::SoftmaxDecomposition>(m_vector_size);
pipeline.register_pass<ov::snippets::lowered::pass::FuseLoops>();
if (with_split_loops)
pipeline.register_pass<ov::snippets::lowered::pass::SplitLoops>();
pipeline.register_pass<ov::snippets::lowered::pass::InsertBuffers>(2);
pipeline.register_pass<ov::snippets::lowered::pass::InsertLoadStore>(m_vector_size);
pipeline.register_pass<ov::snippets::lowered::pass::InitLoops>();
pipeline.register_pass<ov::snippets::lowered::pass::InsertLoops>();
pipeline.register_pass<ov::intel_cpu::pass::SetBrgemmCopyBBuffersShape>();
pipeline.register_pass<ov::snippets::lowered::pass::AllocateBuffers>(m_buffer_scratchpad, is_optimized);
pipeline.run(m_linear_ir);
}
void Validate() {
std::set<size_t> gprs;
for (const auto& expr : m_linear_ir) {
if (const auto buffer = ov::as_type_ptr<ov::snippets::op::Buffer>(expr->get_node())) {
gprs.insert(buffer->get_id());
}
}
EXPECT_EQ(gprs.size(), m_expected_count);
EXPECT_EQ(m_buffer_scratchpad, m_expected_size);
}
virtual std::shared_ptr<ov::Model> GetModel() const = 0;
void MarkOp(const std::shared_ptr<ov::Node>& node, const std::vector<size_t>& subtensor) const {
for (const auto& input : node->inputs())
ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
input, std::make_shared<ov::snippets::lowered::PortDescriptor>(input, subtensor));
for (const auto& output : node->outputs())
ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
output, std::make_shared<ov::snippets::lowered::PortDescriptor>(output, subtensor));
}
size_t m_buffer_scratchpad = 0;
ov::snippets::lowered::LinearIR m_linear_ir;
size_t m_expected_size = 0;
size_t m_expected_count = 0;
size_t m_loop_depth = 2;
size_t m_vector_size = 16;
};
class MHABF16AMXBufferAllocationTest : public BufferAllocationCPUTest {
protected:
std::shared_ptr<ov::Model> GetModel() const override {
const auto subtensor_scalar = std::vector<size_t>{1, 1};
const auto subtensor_softmax = std::vector<size_t>{1, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM};
const auto subtensor_full = std::vector<size_t>(2, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM);
const auto parameter0 = std::make_shared<ov::op::v0::Parameter>(ov::element::bf16, ov::PartialShape({1, 12, 128, 64}));
const auto parameter1 = std::make_shared<ov::op::v0::Parameter>(ov::element::bf16, ov::PartialShape({1, 128, 12, 64}));
const auto parameter2 = std::make_shared<ov::op::v0::Parameter>(ov::element::bf16, ov::PartialShape({1, 12, 128, 64}));
const auto load_reshape = std::make_shared<ov::snippets::op::LoadReshape>(parameter1, 1, 0, std::vector<size_t>{0, 2, 3, 1});
const auto store = std::make_shared<ov::snippets::op::Store>(load_reshape);
const auto convert0 = std::make_shared<ov::snippets::op::ConvertSaturation>(store, ov::element::f32);
const auto relu0 = std::make_shared<ov::op::v0::Relu>(convert0);
const auto convert1 = std::make_shared<ov::snippets::op::ConvertSaturation>(relu0, ov::element::bf16);
const auto brgemm_copyb0 = std::make_shared<ov::intel_cpu::BrgemmCopyB>(
convert1, ov::element::bf16, ov::intel_cpu::BrgemmCopyB::OnlyRepacking, 0, 0, 0);
const auto scratch0 = std::make_shared<ov::snippets::op::Buffer>(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE});
const auto brgemm_cpu0 = std::make_shared<ov::intel_cpu::BrgemmCPU>(
parameter0, brgemm_copyb0->output(0), scratch0, ov::intel_cpu::BrgemmCPU::Type::AMX);
brgemm_cpu0->set_m_block_size(32);
const auto relu1 = std::make_shared<ov::op::v0::Relu>(brgemm_cpu0);
const auto softmax = std::make_shared<ov::op::v1::Softmax>(relu1, 3);
const auto convert2 = std::make_shared<ov::snippets::op::ConvertSaturation>(softmax, ov::element::bf16);
const auto brgemm_copyb1 = std::make_shared<ov::intel_cpu::BrgemmCopyB>(
parameter2, ov::element::bf16, ov::intel_cpu::BrgemmCopyB::OnlyRepacking, 0, 0, 0);
const auto scratch1 = std::make_shared<ov::snippets::op::Buffer>(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE});
const auto brgemm_cpu1 = std::make_shared<ov::intel_cpu::BrgemmCPU>(
convert2, brgemm_copyb1->output(0), scratch1, ov::intel_cpu::BrgemmCPU::Type::AMX);
brgemm_cpu1->set_m_block_size(32);
const auto relu2 = std::make_shared<ov::op::v0::Relu>(brgemm_cpu1);
const auto body = std::make_shared<ov::Model>(std::make_shared<ov::op::v0::Result>(relu2), ov::ParameterVector{parameter0, parameter1, parameter2});
MarkOp(load_reshape, subtensor_scalar);
MarkOp(store, subtensor_scalar);
MarkOp(softmax, subtensor_softmax);
MarkOp(brgemm_cpu0, subtensor_full);
MarkOp(brgemm_cpu1, subtensor_full);
MarkOp(brgemm_copyb0, subtensor_full);
MarkOp(brgemm_copyb1, subtensor_full);
MarkOp(scratch0, subtensor_full);
MarkOp(scratch1, subtensor_full);
return body;
}
};
TEST_P(MHABF16AMXBufferAllocationTest, BufferAllocationCPU) {
Validate();
}
namespace BufferAllocationCPUTest_Instances {
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHABF16AMXNotOptimizedWSplit, MHABF16AMXBufferAllocationTest,
::testing::Combine(
::testing::Values(false),
::testing::Values(true),
::testing::Values(196608),
::testing::Values(11)),
BufferAllocationCPUTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWSplit, MHABF16AMXBufferAllocationTest,
::testing::Combine(
::testing::Values(true),
::testing::Values(true),
::testing::Values(90112),
::testing::Values(4)),
BufferAllocationCPUTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWOSplit, MHABF16AMXBufferAllocationTest,
::testing::Combine(
::testing::Values(false),
::testing::Values(false),
::testing::Values(393216),
::testing::Values(11)),
BufferAllocationCPUTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWOSplit, MHABF16AMXBufferAllocationTest,
::testing::Combine(
::testing::Values(true),
::testing::Values(false),
::testing::Values(114688),
::testing::Values(4)),
BufferAllocationCPUTest::getTestCaseName);
} // namespace BufferAllocationCPUTest_Instances
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -16,7 +16,7 @@
#include "gna_lib_ver_selector.hpp"
#include "gna_mem_requests.hpp"
#include "log/log.hpp"
#include "memory_solver.hpp"
#include "openvino/runtime/memory_solver.hpp"
using namespace ov::intel_gna;
@ -239,7 +239,7 @@ public:
size_t calcSize(bool isCompact = false) override {
if (isCompact) {
_size = 0;
std::vector<MemorySolver::Box> boxes;
std::vector<ov::MemorySolver::Box> boxes;
for (size_t i = 0; i < _mem_requests.size(); ++i) {
// skipping BIND, cross-region and empty requests
if (_mem_requests[i]._type & REQUEST_BIND || _mem_requests[i]._ptr_out == nullptr) {
@ -255,12 +255,12 @@ public:
boxes.push_back({start, stop, static_cast<int64_t>(original_with_pad), static_cast<int64_t>(i)});
}
MemorySolver memSolver(boxes);
ov::MemorySolver memSolver(boxes);
_size = memSolver.solve();
// setting offsets
for (auto const& box : boxes) {
_mem_requests[box.id]._offset = memSolver.getOffset(static_cast<int>(box.id));
_mem_requests[box.id]._offset = memSolver.get_offset(static_cast<int>(box.id));
}
return _size;
} else {

View File

@ -22,7 +22,7 @@
#include "gna_mem_requests_queue.hpp"
#include "log/log.hpp"
#include "memory/gna_allocator.hpp"
#include "memory_solver.hpp"
#include "openvino/runtime/memory_solver.hpp"
#ifdef GNA_MEMORY_DUMP
# include <iomanip>