[Snippets] Refactored work with Buffers (#19644)

[Snippets] BufferManager is not derived from PassPipeline now

[Snippets] Added MemorySolver support

[Snippets] Made it a static class

[Snippets] Added one-level inplace support

[Snippets] Added optimization bits

[Snippets] Small cosmetic fixes

[Snippets] Renamed to BufferSolver

[Snippets] Refactored

[Snippets] Fixed IdentifyBuffers

[Snippets] Add inplace multi + identify buffers

[Snippets] Made common pass

[Snippets] Added PassPipeline::get_pass<>()

[Snippets] Added comments and briefs, minor refactoring

[Snippets] Fixed win build

[Snippets] Disallowed the same Buffer ID for multi-level Buffers

[Snippets] Moved CleanupRepeatedPtrShifts to common pipeline

[Snippets] Made IdentifyBuffers::ShiftPtrParams

[Snippets] Fixed window sliding mode

[Snippets] Refactored nested clusters

[Snippets] Added normalized buffer regs

[Snippets] Disallowed the same ID for nested Buffers in IdentifyBuffers

[Snippets] Fixed DefineBufferClusters::are_buffer_neighbours::find

[Snippets] Removed useless method from InitLoops

[Snippets] Fixed CC build

[Snippets] Applied Ivan comments

[Snippets] Applied Ivan comment: refactored pass classes

[Snippets] Applied Vladislav comments

[Snippets] Applied Ivan comments 2

[Runtime] Moved MemorySolver to API2.0

[Snippets] Created common buffer allocation pass AllocateBuffers

[Snippets][Tests] Added InplaceEltwise unit test

[Snippets] Fixed NormalizeBufferIDs

[Snippets][CPU] Fixed BrgemmBlocking lowered pass: move wsp for AMX to brgemm

[Snippets][CPU][Tests] Covered AMX MHA buffer allocation by unit tests
Alexandra Sidorova 2023-11-30 17:46:35 +04:00 committed by GitHub
parent 6ab5ef72d5
commit df03b0437a
29 changed files with 1688 additions and 266 deletions

View File

@@ -29,6 +29,9 @@ public:
// Minimal advised work amount that should be processed during one call of the executable produced by Subgraph::generate
// Set by a backend, should be large enough to compensate for the kernel call overheads
size_t m_min_kernel_work_amount = 256;
// True if the Buffer scratchpad size of LinearIR will be optimized (all possible optimizations will be activated)
// False if all Buffers will have unique IDs and offsets in the Linear IR
bool m_are_buffers_optimized = true;
};
/* The control flow of Snippets is built on Linear Intermediate Representation (Linear IR).

View File

@@ -5,7 +5,6 @@
#pragma once
#include "pass.hpp"
#include "snippets/snippets_isa.hpp"
namespace ov {
namespace snippets {
@@ -14,26 +13,40 @@ namespace pass {
/**
* @interface AllocateBuffers
* @brief The pass calculates common size of buffer scratchpad and propagates Buffer offsets to connected MemoryAccess operations.
* Notes:
* - The pass implicitly regulates InPlace processing for some Buffers when it's possible.
* The pass doesn't allocate new memory for InPlace Buffers; the same offsets are propagated for them.
* - The pass should be split into two passes: ProcessInplace (markup of Buffers which can use the same memory)
* and AllocateBuffer (allocate memory for Buffers using MemorySolver which can optimally reuse memory).
* @brief The pass allocates common memory for all Buffers.
* There are two modes: default and optimized allocation. Default allocation (non-optimized) mode sets unique offsets and IDs to Buffers.
* Optimized mode allocates memory for Buffer ops using the following optimizations:
* - MemorySolver: helps to solve issue of optimal memory allocation;
* - InPlace: Loop or MemoryAccess ops read from the memory and store data to the same memory if possible
* - Reusing Buffer IDs: Buffers have the same IDs (gpr) in cases when Buffers aren't connected or have the same data ptr shifts
* Note: All buffers are related to each other and represent common buffer scratchpad of Subgraph.
* The buffer scratchpad has one general data pointer. Each buffer has offset relative to the data pointer of buffer scratchpad.
* @ingroup snippets
*/
class AllocateBuffers : public Pass {
class AllocateBuffers: public Pass {
public:
OPENVINO_RTTI("AllocateBuffers", "Pass")
bool run(lowered::LinearIR& linear_ir) override;
AllocateBuffers(size_t& buffer_scratchpad_size, bool is_optimized = true);
size_t get_scratchpad_size() const { return m_buffer_scratchpad_size; }
/**
* @brief Apply the pass to the Linear IR
* @param linear_ir the target Linear IR
* @return status of the pass
*/
bool run(LinearIR& linear_ir) override;
/**
* @brief Sets the offset to the Buffer op and propagates it to the connected memory access ops
* @param buffer_expr expression with Buffer op
* @param offset offset in common buffer scratchpad
*/
static void set_buffer_offset(const ExpressionPtr& buffer_expr, const size_t offset);
using BufferCluster = std::set<ExpressionPtr>;
using BufferClusters = std::vector<BufferCluster>;
private:
static void propagate_offset(const LinearIR& linear_ir, const ExpressionPtr& buffer_expr, size_t offset);
size_t m_buffer_scratchpad_size = 0;
size_t& m_buffer_scratchpad_size;
bool m_is_optimized_mode = true;
};
} // namespace pass
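
For illustration, a minimal usage sketch of the new interface (hedged: `linear_ir` and the surrounding glue code are hypothetical, not part of this diff):

    // Run buffer allocation in optimized mode and read back the scratchpad size
    size_t buffer_scratchpad_size = 0;
    ov::snippets::lowered::pass::AllocateBuffers allocate_buffers(buffer_scratchpad_size, /*is_optimized=*/true);
    allocate_buffers.run(linear_ir);
    // buffer_scratchpad_size now holds the common scratchpad size in bytes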

View File

@@ -0,0 +1,138 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "pass.hpp"
#include "allocate_buffers.hpp"
namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
/**
* @interface DefineBufferClusters
* @brief The pass defines buffer clusters. The buffers from one cluster share the
* same memory (i.e. have the same offset relative to the data pointer of the buffer scratchpad).
* - If a MemoryAccess op or Loop can read and write to the same memory (inplace behavior), the Buffers should be in one cluster.
* - If a Buffer is in a Loop which reads or writes from/to other Buffers, this Buffer can emulate `window` sliding.
* It means that the Buffer inside can reuse memory of the Buffers outside within the bounds of the full Loop work.
* Demonstration:
* |-----------------------------------------------------|
* | |------------| |------------| | InnerLoops have work amount 128
* Buffer0 [3x128]-> | | InnerLoop0 | -> Buffer1 [3x128] -> | InnerLoop1 | | -> Buffer2 [3x128] OuterLoop has work amount 3
* | |------------| OuterLoop |------------| |
* |-----------------------------------------------------|
* Buffer1 can reuse memory [128] of Buffer0 or Buffer2 in each iteration of OuterLoop
* Note: The pass requires expression enumeration and buffer identification (for nested Buffers inplace).
* These passes should be executed separately before this pass!
* @ingroup snippets
*/
class DefineBufferClusters : public Pass {
public:
OPENVINO_RTTI("DefineBufferClusters", "Pass")
DefineBufferClusters(AllocateBuffers::BufferClusters& clusters) : m_clusters(clusters) {}
/**
* @brief Apply the pass to the Linear IR
* @param linear_ir the target Linear IR
* @return status of the pass
*/
bool run(lowered::LinearIR& linear_ir) override;
private:
using BufferPorts = std::unordered_map<ExpressionPtr, std::set<size_t>>;
/**
* @brief Finds Buffer cluster in set of clusters which contains the target expression with Buffer
* @param target target expression with Buffer op
* @return vector iterator which refers to the found cluster
*/
AllocateBuffers::BufferClusters::iterator find_cluster_by_expr(const ExpressionPtr& target);
/**
* @brief Returns True if the Buffer is a direct source for the target expr (there are no other Loops between the Buffer and the target expr)
* @param buffer_expr expression with assumed Buffer op
* @param target_expr expression with target op - LoopEnd or MemoryAccess op
* @return boolean value
*/
bool is_direct_buffer(const ExpressionPtr& buffer_expr, const ExpressionPtr& target_expr) const;
/**
* @brief Creates a new buffer cluster if buffer_expr is missing in clusters. If buffer_expr is already in clusters, does nothing
* @param buffer_expr expression with Buffer op
*/
void create_new_cluster(const ExpressionPtr& buffer_expr);
/**
* @brief Returns the common ID of the cluster if all buffers inside have the same Buffer ID. Otherwise returns the default value SIZE_MAX,
* which means that the Buffers in the cluster have different IDs.
* @param cluster set of Buffer expressions - cluster
* @return common buffer ID or SIZE_MAX - size value
*/
size_t get_cluster_buffer_id(const AllocateBuffers::BufferCluster& cluster) const;
/**
* @brief Analyzes Loop: if Loop has Buffer ops on inputs and outputs, Loop can read and write from/to the same memory.
* @param expr_it iterator of Linear IR which refers to the expression with LoopEnd
*/
void parse_loop(const LinearIR::constExprIt& expr_it);
/**
* @brief Analyzes full MemoryAccess op: if the op has Buffer ops on I/O, the op can read and write from/to the same memory.
* @param expr expression with full MemoryAccess op
*/
void parse_memory_access_op(const ExpressionPtr& expr);
/**
* @brief Gets input buffers of the Loop
* @param loop_expr expression with LoopEnd op
* @return unordered map [Expression -> set of input ports] which represents input Buffers of Loop
*/
BufferPorts get_input_buffers(const ExpressionPtr& loop_expr) const;
/**
* @brief Gets output buffers of Loop
* @param loop_expr expression with LoopEnd op
* @return unordered map [Expression -> set of input ports] which represents output Buffers of Loop
*/
BufferPorts get_output_buffers(const ExpressionPtr& loop_expr) const;
/**
* @brief Analyzes nested Loops: unite nested buffer clusters if they can reproduce `window` sliding
* @param input_buffers unordered map [Expression -> set of input ports] which represents input Buffers of Loop
* @param output_buffers unordered map [Expression -> set of output ports (one)] which represents output Buffers of Loop
* @param outer_loop_end_expr_it iterator of Linear IR which refers to the expression with outer LoopEnd
*/
void parse_nested_loops(const BufferPorts& input_buffers, const BufferPorts& output_buffers, const LinearIR::constExprIt& outer_loop_end_expr_it);
/**
* @brief Finds the last connected Loop to the target Buffer and returns the corresponding finalization offset
* @param buffer_expr expression with Buffer op
* @return finalization offset - int64_t value
*/
int64_t get_buffer_finalization_offset(const ExpressionPtr& buffer_expr) const;
/**
* @brief Check if two Buffer expressions are connected to the same Loop. Set common LoopEnd as `loop` parameter and
* indexes of Loop ports `up_idx` and `down_idx` if Buffers are really neighbours
* @param up expression with upper Buffer op
* @param down expression with lower Buffer op
* @param loop expression with common LoopEnd op
* @param up_idx the reference to port index of upper Buffer op to the Loop
* @param down_idx the reference to port index of lower Buffer op to the Loop
* @return Return True if the Buffers are connected to the same Loop
*/
static bool are_buffer_neighbours(const ExpressionPtr& up, const ExpressionPtr& down, ExpressionPtr& loop, size_t& up_idx, size_t& down_idx);
/**
* @brief Unite clusters
* @param inner_cluster_it iterator to the inner cluster - the buffer cluster inside the Loop
* @param outer_cluster buffer cluster with buffers outside the Loop
* @param outer_buffer target Buffer from outer_cluster
* @param is_outer_up true if outer buffer is upper in Linear IR than inner Buffers
* @return Return True if clusters have been united
*/
bool unite_nested_clusters(const AllocateBuffers::BufferClusters::iterator& inner_cluster_it, AllocateBuffers::BufferCluster& outer_cluster,
const ExpressionPtr& outer_buffer, bool is_outer_up);
AllocateBuffers::BufferClusters& m_clusters;
};
} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
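
Since the note above requires EnumerateExpressions and IdentifyBuffers to run first, here is a hedged ordering sketch (assuming a LinearIR `linear_ir` and the ov::snippets::lowered::pass namespace; it mirrors the pipeline registered in AllocateBuffers::run):

    AllocateBuffers::BufferClusters clusters;
    PassPipeline pipeline;
    pipeline.register_pass<EnumerateExpressions>();          // assigns execution order to expressions
    pipeline.register_pass<IdentifyBuffers>();               // assigns Buffer IDs (needed for nested-Buffer inplace)
    pipeline.register_pass<DefineBufferClusters>(clusters);  // groups Buffers that may share one offset
    pipeline.run(linear_ir);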

View File

@@ -0,0 +1,28 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "pass.hpp"
namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
/**
* @interface EnumerateExpressions
* @brief The pass enumerates expressions by execution order
* @ingroup snippets
*/
class EnumerateExpressions : public Pass {
public:
OPENVINO_RTTI("EnumerateExpressions", "Pass")
bool run(LinearIR& linear_ir) override;
};
} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov

View File

@@ -6,8 +6,6 @@
#include "pass.hpp"
#include "snippets/op/buffer.hpp"
namespace ov {
namespace snippets {
namespace lowered {
@@ -22,7 +20,8 @@ namespace pass {
- Loops, Brgemm (and other similar ops) - are "edges" between Buffers (hubs of edges).
The buffers connected to the same Loop are adjacent in the graph sense.
* - The vertices (buffers) are adjacent if they are connected to the same Loop and
* their data pointers cannot be proportionally incremented in Loops: different ptr increments or data sizes;
their data pointers cannot be proportionally incremented in Loops: different ptr increments or data sizes -
or one of the Buffers is inside a Loop but the other Buffer is not;
* - Firstly, create adjacency matrix using the definition above;
* - Secondly, assign the same color to non-adjacent vertices of graph (buffers), and use different colors otherwise.
* Note: should be called before ResetBuffer() pass to have correct offsets
@@ -33,13 +32,79 @@ public:
OPENVINO_RTTI("IdentifyBuffers", "Pass")
IdentifyBuffers() = default;
/**
* @brief Apply the pass to the Linear IR
* @param linear_ir the target Linear IR
* @return status of the pass
*/
bool run(LinearIR& linear_ir) override;
private:
using BufferSet = std::vector<std::shared_ptr<op::Buffer>>;
struct ShiftPtrParams {
ShiftPtrParams() = default;
ShiftPtrParams(int64_t ds, int64_t pi, int64_t fo) : data_size(ds), ptr_increment(pi), finalization_offset(fo) {}
int64_t data_size = 0;
int64_t ptr_increment = 0;
int64_t finalization_offset = 0;
std::vector<bool> create_adjacency_matrix(const LinearIR& linear_ir, const BufferSet& buffers) const;
std::map<size_t, BufferSet> coloring(BufferSet& buffers, std::vector<bool>& adj);
friend bool operator==(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs);
friend bool operator!=(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs);
};
/**
* @brief Check if two Buffers can reuse ID by ShiftPtrParams < data_size, ptr_increment, finalization_offset >
* @param lhs Data pointer shift params for first Buffer
* @param rhs Data pointer shift params for second Buffer
* @return Returns True if params are valid for reusing. Otherwise returns False
*/
static bool can_reuse_id(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs);
private:
using BufferPool = std::vector<ExpressionPtr>;
/**
* @brief Get Buffer Index in Buffer set
* @param target the target Buffer expression
* @param pool set of Buffers from the Linear IR
* @return index of target Buffer expression in set
*/
static size_t get_buffer_idx(const ExpressionPtr& target, const BufferPool& pool);
/**
* @brief Create adjacency matrix for Buffer system. See comment in the method for more details.
* @param linear_ir the target Linear IR
* @param pool set of Buffers from the Linear IR
* @return adjacency matrix where True value means that Buffers are adjacent and cannot have the same ID
*/
static std::vector<bool> create_adjacency_matrix(const LinearIR& linear_ir, const BufferPool& pool);
/**
* @brief Algorithm of Graph coloring where vertices are Buffers
* @param buffers set of Buffers from the Linear IR
* @param adj adjacency matrix
* @return map [color id -> Buffer set]
*/
static std::map<size_t, BufferPool> coloring(BufferPool& buffers, std::vector<bool>& adj);
/**
* @brief Update the adjacency matrix:
* - If Buffers are from the same Loops and connected to the same Loop and
* their ptr shift params for this Loop are not proportional, the Buffers are adjacent - set True in the matrix;
* - If one Buffer is inside a Loop while another Buffer connected to this Loop has nonzero data shift params,
* the Buffers are adjacent - set True in the matrix;
* @param lhs Pair where the first value is the Expression with the first Buffer and the second value is its data pointer shift params
* @param rhs Pair where the first value is the Expression with the second Buffer and the second value is its data pointer shift params
* @param buffers set of Buffers from the Linear IR
* @param adj Target adjacency matrix
*/
static void update_adj_matrix(const std::pair<ExpressionPtr, ShiftPtrParams>& lhs,
const std::pair<ExpressionPtr, ShiftPtrParams>& rhs,
const BufferPool& buffers,
std::vector<bool>& adj);
/**
* @brief Check if two Buffers are adjacent and cannot have the same ID
* @param lhs Pair where first value is Expression with first Buffer and second value is data pointer shift params for it
* @param rhs Pair where first value is Expression with second Buffer and second value is data pointer shift params for it
* @return Returns True if they are adjacent, otherwise returns False
*/
static bool are_adjacent(const std::pair<ExpressionPtr, ShiftPtrParams>& lhs,
const std::pair<ExpressionPtr, ShiftPtrParams>& rhs);
};
} // namespace pass
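
To make the reuse rule concrete, a short sketch of can_reuse_id semantics (values are illustrative only):

    using SPP = IdentifyBuffers::ShiftPtrParams;  // { data_size, ptr_increment, finalization_offset }
    IdentifyBuffers::can_reuse_id(SPP(4, 16, -64), SPP(4, 16, -64));  // true: identical shifts and data sizes
    IdentifyBuffers::can_reuse_id(SPP(4, 0, 0), SPP(2, 0, 0));        // true: zero shifts, data size is irrelevant
    IdentifyBuffers::can_reuse_id(SPP(4, 16, -64), SPP(2, 16, -64));  // false: nonzero shifts with different data sizes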

View File

@@ -0,0 +1,41 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "pass.hpp"
namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
/**
* @interface InitBuffersDefault
* @brief The pass initializes Buffer expressions in the LinearIR in the default (non-optimized) way: it sets unique offsets and IDs to Buffers.
* @ingroup snippets
*/
class InitBuffersDefault : public Pass {
public:
OPENVINO_RTTI("InitBuffersDefault", "Pass")
InitBuffersDefault(size_t& buffer_scratchpad_size) : m_buffer_scratchpad_size(buffer_scratchpad_size) {
m_buffer_scratchpad_size = 0;
}
/**
* @brief Apply the pass to the Linear IR
* @param linear_ir the target Linear IR
* @return status of the pass
*/
bool run(lowered::LinearIR& linear_ir) override;
private:
size_t& m_buffer_scratchpad_size;
};
} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov

View File

@@ -15,7 +15,7 @@ namespace pass {
/**
* @interface InitLoops
* @brief The pass initialize scheduling information in LoopInfo
* @brief The pass initializes scheduling information in LoopInfo
* @ingroup snippets
*/
class InitLoops : public Pass {

View File

@@ -0,0 +1,40 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "pass.hpp"
namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
/**
* @interface NormalizeBufferIDs
* @brief After optimizations some Buffer IDs might be set unevenly: some numbers are skipped.
* For example,
* [Buffer -> ID]
* Buffer0 -> 0 Two Buffers have ID = 0, one has ID = 2.
* Buffer1 -> 2 Obviously, we can normalize these IDs and set ID = 1 to Buffer1.
* Buffer2 -> 0 It helps to assign GPR registers in `AssignRegister` more efficiently.
* Thus, the pass normalizes the IDs of Buffers in the Linear IR.
* @ingroup snippets
*/
class NormalizeBufferIDs : public Pass {
public:
OPENVINO_RTTI("NormalizeBufferIDs", "Pass")
/**
* @brief Apply the pass to the Linear IR
* @param linear_ir the target Linear IR
* @return status of the pass
*/
bool run(lowered::LinearIR& linear_ir) override;
};
} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
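
The renumbering above can be sketched with a plain map where first-seen order defines the new dense IDs (illustrative only):

    std::map<size_t, size_t> remap;       // original Buffer ID -> normalized ID
    for (size_t original_id : {0, 2, 0})  // Buffer0 -> 0, Buffer1 -> 2, Buffer2 -> 0
        remap.emplace(original_id, remap.size());
    // remap is {0 -> 0, 2 -> 1}: Buffer1 gets ID 1, Buffer0 and Buffer2 keep ID 0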

View File

@@ -39,6 +39,11 @@ public:
return get_type_info().name;
}
/**
* @brief Apply the pass to the Linear IR
* @param linear_ir the target Linear IR
* @return status of the pass
*/
virtual bool run(lowered::LinearIR& linear_ir) = 0;
};

View File

@@ -0,0 +1,54 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "pass.hpp"
#include "allocate_buffers.hpp"
#include "openvino/runtime/memory_solver.hpp"
namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
/**
* @interface SolveBufferMemory
* @brief The pass optimally calculates the common buffer scratchpad size and
* sets the offsets relative to the common data pointer for all Buffers. The pass uses the MemorySolver API.
* Note: The pass requires expression enumeration. It should be executed separately before this pass!
* @ingroup snippets
*/
class SolveBufferMemory : public Pass {
public:
OPENVINO_RTTI("SolveBufferMemory", "Pass")
SolveBufferMemory(size_t& buffer_scratchpad_size, AllocateBuffers::BufferClusters& clusters)
: m_buffer_scratchpad_size(buffer_scratchpad_size), m_clusters(clusters) {}
/**
* @brief Apply the pass to the Linear IR
* @param linear_ir the target Linear IR
* @return status of the pass
*/
bool run(lowered::LinearIR& linear_ir) override;
private:
/**
* @brief Initializes boxes for MemorySolver
* @param buffer_clusters buffer clusters. These clusters could be obtained using DefineBufferClusters pass
* @return vector of boxes for MemorySolver
*/
std::vector<ov::MemorySolver::Box> init_boxes(const AllocateBuffers::BufferClusters& buffer_clusters);
size_t& m_buffer_scratchpad_size;
AllocateBuffers::BufferClusters& m_clusters;
constexpr static size_t m_alignment = 32; // 32 bytes for data alignment in allocated memory
};
} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
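
A hedged sketch of the MemorySolver usage pattern this pass relies on (box values are illustrative; start/finish are execution orders and size is in m_alignment units, as in init_boxes):

    std::vector<ov::MemorySolver::Box> boxes = {
        {/*start=*/0, /*finish=*/2, /*size=*/2, /*id=*/0},
        {/*start=*/1, /*finish=*/3, /*size=*/1, /*id=*/1},  // lifetime overlaps box 0 -> distinct offset
        {/*start=*/4, /*finish=*/5, /*size=*/2, /*id=*/2},  // disjoint lifetime -> may reuse box 0 memory
    };
    ov::MemorySolver solver(boxes);
    const auto total_size = solver.solve();     // total scratchpad size in the same units
    const auto offset0 = solver.get_offset(0);  // per-box offset in the same units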

View File

@@ -51,6 +51,7 @@ constexpr inline bool implication(bool cause, bool cond) {
template <typename T, typename U>
inline T div_up(const T a, const U b) {
OPENVINO_ASSERT(b != 0, "Divider must not be zero");
return static_cast<T>((a + b - 1) / b);
}
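
For example, SolveBufferMemory relies on this helper for its alignment math: div_up(100, 32) == 4, i.e. a 100-byte buffer occupies four 32-byte alignment units.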

View File

@@ -2,9 +2,16 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/lowered/pass/allocate_buffers.hpp"
#include "snippets/lowered/linear_ir.hpp"
#include "snippets/lowered/pass/enumerate_expressions.hpp"
#include "snippets/lowered/pass/solve_buffer_memory.hpp"
#include "snippets/lowered/pass/init_buffers_default.hpp"
#include "snippets/lowered/pass/identify_buffers.hpp"
#include "snippets/lowered/pass/define_buffer_clusters.hpp"
#include "snippets/lowered/pass/normalize_buffer_ids.hpp"
#include "snippets/pass/tokenization.hpp"
#include "snippets/itt.hpp"
namespace ov {
@@ -12,11 +19,15 @@ namespace snippets {
namespace lowered {
namespace pass {
void AllocateBuffers::propagate_offset(const LinearIR& linear_ir, const ExpressionPtr& buffer_expr, const size_t offset) {
AllocateBuffers::AllocateBuffers(size_t& buffer_scratchpad_size, bool is_optimized)
: m_buffer_scratchpad_size(buffer_scratchpad_size), m_is_optimized_mode(is_optimized) {}
void AllocateBuffers::set_buffer_offset(const ExpressionPtr& buffer_expr, const size_t offset) {
// If Buffer has offset, we set this offset in the connected MemoryAccess ops
// to correctly read and write data because all Buffers has the common data pointer on buffer scratchpad
// to correctly read and write data because all Buffers have the common data pointer on buffer scratchpad
const auto buffer = ov::as_type_ptr<op::Buffer>(buffer_expr->get_node());
OPENVINO_ASSERT(buffer, "Failed to set Buffer offset: AllocateBuffers expects Buffer op");
buffer->set_offset(static_cast<int64_t>(offset));
// Propagate upwards: to the Store. Buffer can have only one Store
@@ -55,106 +66,23 @@ void AllocateBuffers::propagate_offset(const LinearIR& linear_ir, const Expressi
}
}
bool AllocateBuffers::run(LinearIR& linear_ir) {
bool AllocateBuffers::run(lowered::LinearIR& linear_ir) {
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::AllocateBuffers");
// [113664] The pass contains two main logic parts: it defines which of the buffers can be inplace (use the same memory) and
// allocates memory of the needed size. It should be split into several passes and updated within the scope of ticket 113664.
// [113664] At the moment the New Memory Buffer is used only in BrgemmCPU for the AMX case. This memory can be reused for each Brgemm.
// This plugin-specific condition will be removed in the near future after task 113664 is implemented
size_t offset = 0, new_memory_buffer_offset = 0;
size_t prev_data_size = 0, current_data_size = 0;
std::set<ExpressionPtr> allocated_buffers;
bool new_memory_buffer_allocated = false;
auto allocate = [&](const std::shared_ptr<op::Buffer>& buffer, const ExpressionPtr& expr, size_t buffer_size) {
offset = m_buffer_scratchpad_size;
propagate_offset(linear_ir, expr, offset);
m_buffer_scratchpad_size += buffer_size;
allocated_buffers.insert(expr);
prev_data_size = current_data_size;
};
for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) {
const auto& expr = *expr_it;
if (auto buffer = as_type_ptr<op::Buffer>(expr->get_node())) {
const auto buffer_size = buffer->get_byte_size();
current_data_size = buffer->get_element_type().size();
// If it's the first buffer, offsets are zero => nothing to propagate, can continue
if (m_buffer_scratchpad_size == 0) {
m_buffer_scratchpad_size += buffer_size;
allocated_buffers.insert(expr);
prev_data_size = current_data_size;
continue;
}
if (buffer->is_intermediate_memory()) {
const auto& parent_expr = expr->get_input_port_connector(0)->get_source().get_expr();
const auto& parent_node = parent_expr->get_node();
// Full MemoryAccess ops need new memory. The previous logic was to check that the parent isn't a Loop
// [113664] It should be unified in MemoryManager with memory reuse in the near future
const auto ma = ov::as_type_ptr<op::MemoryAccess>(parent_node);
if (ma && ma->is_full_memory_access_op()) {
allocate(buffer, *expr_it, buffer_size);
continue;
}
// Loop Full_MA
// | |
// Buffer_1 Buffer_0
// \ /
// Full_MA
// At the moment the pass supports only sequential implicit InPlace.
// If Buffer_0 is allocated firstly as Buffer after full memory access op,
// we cannot reuse this allocated memory for Buffer_1 - we must allocate new memory for it.
// [113664] It should be unified in MemoryManager with memory reuse in the near future
bool need_allocate = false;
const auto consumers = expr->get_output_port_connector(0)->get_consumers();
for (const auto& consumer : consumers) {
const auto& consumer_expr = consumer.get_expr();
const auto& child_node = consumer_expr->get_node();
const auto ma = ov::as_type_ptr<op::MemoryAccess>(child_node);
if (ma && ma->is_full_memory_access_op()) {
for (size_t i = 0; i < consumer_expr->get_input_count() && !need_allocate; ++i) {
if (i == consumer.get_index())
continue;
const auto buffer_sibling = consumer_expr->get_input_port_connector(i)->get_source().get_expr();
need_allocate = ov::is_type<op::Buffer>(buffer_sibling->get_node()) && allocated_buffers.count(buffer_sibling) != 0;
}
}
if (need_allocate)
break;
}
if (need_allocate) {
allocate(buffer, *expr_it, buffer_size);
continue;
}
// [113664] For more details and the reasoning behind the current solution, please see the ticket description
const auto current_allocated_memory_size = m_buffer_scratchpad_size - offset;
if (((current_data_size == prev_data_size) && buffer_size > current_allocated_memory_size) ||
((current_data_size != prev_data_size) && buffer_size != current_allocated_memory_size)) {
allocate(buffer, expr, buffer_size);
continue;
}
propagate_offset(linear_ir, *expr_it, offset);
allocated_buffers.insert(expr);
prev_data_size = current_data_size;
} else {
if (!new_memory_buffer_allocated) {
allocate(buffer, *expr_it, buffer_size);
new_memory_buffer_allocated = true;
new_memory_buffer_offset = offset;
} else {
propagate_offset(linear_ir, *expr_it, new_memory_buffer_offset);
allocated_buffers.insert(expr);
prev_data_size = current_data_size;
}
}
}
m_buffer_scratchpad_size = 0;
PassPipeline pipeline;
if (m_is_optimized_mode) {
BufferClusters buffer_clusters;
pipeline.register_pass<EnumerateExpressions>();
pipeline.register_pass<IdentifyBuffers>();
pipeline.register_pass<DefineBufferClusters>(buffer_clusters);
pipeline.register_pass<SolveBufferMemory>(m_buffer_scratchpad_size, buffer_clusters);
pipeline.register_pass<NormalizeBufferIDs>();
} else {
pipeline.register_pass<InitBuffersDefault>(m_buffer_scratchpad_size);
}
return !allocated_buffers.empty();
pipeline.run(linear_ir);
return m_buffer_scratchpad_size > 0;
}
} // namespace pass

View File

@@ -0,0 +1,346 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/lowered/pass/define_buffer_clusters.hpp"
#include "snippets/lowered/pass/identify_buffers.hpp"
#include "snippets/pass/tokenization.hpp"
#include "snippets/itt.hpp"
namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
using ShiftPtrParams = IdentifyBuffers::ShiftPtrParams;
AllocateBuffers::BufferClusters::iterator DefineBufferClusters::find_cluster_by_expr(const ExpressionPtr& target) {
return std::find_if(m_clusters.begin(), m_clusters.end(),
[&target](const AllocateBuffers::BufferCluster& cluster) { return cluster.count(target) > 0; });
}
bool DefineBufferClusters::is_direct_buffer(const ExpressionPtr& buffer_expr, const ExpressionPtr& target_expr) const {
const auto buffer = ov::as_type_ptr<op::Buffer>(buffer_expr->get_node());
return buffer && buffer_expr->get_loop_ids() == target_expr->get_loop_ids();
}
void DefineBufferClusters::create_new_cluster(const ExpressionPtr& buffer_expr) {
const auto cluster_it = find_cluster_by_expr(buffer_expr);
// If the Buffer is missing in clusters, create a new cluster with the single Buffer node inside
if (cluster_it == m_clusters.cend()) {
m_clusters.push_back(AllocateBuffers::BufferCluster{buffer_expr});
}
}
size_t DefineBufferClusters::get_cluster_buffer_id(const AllocateBuffers::BufferCluster& cluster) const {
OPENVINO_ASSERT(!cluster.empty(), "Buffer cluster is empty!");
const auto id = (ov::as_type_ptr<op::Buffer>(cluster.cbegin()->get()->get_node()))->get_id();
if (std::all_of(cluster.cbegin(), cluster.cend(),
[&id](const ExpressionPtr& expr) { return (ov::as_type_ptr<op::Buffer>(expr->get_node()))->get_id() == id; })) {
return id;
}
return SIZE_MAX;
}
DefineBufferClusters::BufferPorts DefineBufferClusters::get_input_buffers(const ExpressionPtr& loop_expr) const {
BufferPorts input_buffers;
const auto loop_end = ov::as_type_ptr<op::LoopEnd>(loop_expr->get_node());
const auto in_count = loop_end->get_input_num();
const auto connectors = loop_expr->get_input_port_connectors();
// Input Buffers
for (size_t i = 0; i < in_count; ++i) {
const auto source_expr = connectors[i]->get_source().get_expr();
if (!is_direct_buffer(source_expr, loop_expr))
continue;
// Save as input Buffer
const auto ret = input_buffers.insert(std::make_pair(source_expr, std::set<size_t>{ i })).second;
if (!ret)
input_buffers[source_expr].insert(i);
}
return input_buffers;
}
DefineBufferClusters::BufferPorts DefineBufferClusters::get_output_buffers(const ExpressionPtr& loop_expr) const {
BufferPorts output_buffers;
const auto loop_end = ov::as_type_ptr<op::LoopEnd>(loop_expr->get_node());
const auto in_count = loop_end->get_input_num();
const auto out_count = loop_end->get_output_num();
const auto connectors = loop_expr->get_input_port_connectors();
for (size_t i = in_count; i < in_count + out_count; ++i) {
for (const auto& consumer : connectors[i]->get_consumers()) {
auto consumer_expr = consumer.get_expr();
if (!is_direct_buffer(consumer_expr, loop_expr))
continue;
// Save as output Buffer
output_buffers[consumer_expr] = { i };
}
}
return output_buffers;
}
void DefineBufferClusters::parse_loop(const LinearIR::constExprIt& expr_it) {
const auto& expr = *expr_it;
const auto loop_end = ov::as_type_ptr<op::LoopEnd>(expr->get_node());
const auto& ptr_increments = loop_end->get_ptr_increments();
const auto& final_offsets = loop_end->get_finalization_offsets();
const auto& data_sizes = loop_end->get_element_type_sizes();
// [ Expression -> Port indexes ]
const auto input_buffers = get_input_buffers(expr);
const auto output_buffers = get_output_buffers(expr);
for (const auto& in : input_buffers)
create_new_cluster(in.first);
std::set<ExpressionPtr> visited_buffers;
for (const auto& out : output_buffers) {
const auto output_buffer_expr = out.first;
const auto output_buffer_port_idx = *(out.second.cbegin()); // There is always a single output port
const auto output_buffer = ov::as_type_ptr<op::Buffer>(output_buffer_expr->get_node());
bool has_been_added = false;
for (const auto& in : input_buffers) {
const auto& input_buffer_expr = in.first;
if (visited_buffers.count(input_buffer_expr) > 0)
continue;
const auto input_buffer = ov::as_type_ptr<op::Buffer>(input_buffer_expr->get_node());
const auto& input_buffer_ports = in.second;
for (const auto& input_buffer_port_idx : input_buffer_ports) {
// Memory can be reused if reading and writing are executed proportionally:
// - the same ShiftPtrParams (data size, final offsets, ptr increments)
// - the same reading/writing order
// - the same buffer memory sizes
const auto input_params =
ShiftPtrParams(data_sizes[input_buffer_port_idx], ptr_increments[input_buffer_port_idx], final_offsets[input_buffer_port_idx]);
const auto output_params =
ShiftPtrParams(data_sizes[output_buffer_port_idx], ptr_increments[output_buffer_port_idx], final_offsets[output_buffer_port_idx]);
if (input_buffer->get_byte_size() == output_buffer->get_byte_size() &&
input_buffer_expr->get_output_port_descriptor(0)->get_layout() == output_buffer_expr->get_input_port_descriptor(0)->get_layout() &&
input_params == output_params) {
const auto cluster_it = find_cluster_by_expr(input_buffer_expr);
OPENVINO_ASSERT(cluster_it != m_clusters.end(), "Buffer on inputs of Loop must be already saved in clusters");
// Add to the existing cluster
has_been_added = cluster_it->insert(output_buffer_expr).second;
OPENVINO_ASSERT(has_been_added, "Buffer has not been saved in cluster");
// Exclude the input buffer from further candidates because we have already used its memory
visited_buffers.insert(input_buffer_expr);
break;
}
}
if (has_been_added) break;
}
if (!has_been_added) {
m_clusters.push_back(AllocateBuffers::BufferCluster{output_buffer_expr});
}
}
// Check the Buffers inside for possible memory reuse using `window` sliding
parse_nested_loops(input_buffers, output_buffers, expr_it);
}
void DefineBufferClusters::parse_nested_loops(const BufferPorts& input_buffers, const BufferPorts& output_buffers,
const LinearIR::constExprIt& outer_loop_end_expr_it) {
if (input_buffers.empty() && output_buffers.empty())
return;
// The inner Buffer can reuse memory of the outer Buffer using `window` sliding only if:
// - The finalization offset of the latest Loop connected to the inner Buffer is equal to the pointer increment of the outer Buffer, to emulate `window` sliding
// - The outer Buffer should have the same Buffer ID as the inner one to move the data ptr of the inner Buffer after each outer Loop iteration.
// It's needed because all Loops reset the data pointers of connected Buffers after the full work.
// To avoid rewriting of the outer Buffer data, we have to have the same Buffer ID (GPR) to proportionally shift the pointers of both Buffers.
auto can_be_data_ptr_proportionally_shifted = [](int64_t outer_buffer_ptr_increment, int64_t outer_buffer_data_size,
int64_t inner_buffer_final_offsets, int64_t inner_buffer_data_size) {
return (outer_buffer_ptr_increment != 0) &&
((inner_buffer_data_size * inner_buffer_final_offsets * -1) == outer_buffer_ptr_increment * outer_buffer_data_size);
};
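// Worked example with illustrative values: an inner Buffer with data size 4 whose last Loop has
// finalization offset -128 can reuse an outer Buffer with data size 4 and ptr increment 128,
// since (4 * -128 * -1) == (128 * 4), i.e. 512 == 512, and the increment is nonzero.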
const auto outer_loop_end = ov::as_type_ptr<op::LoopEnd>(outer_loop_end_expr_it->get()->get_node());
const auto outer_loop_begin = outer_loop_end->get_loop_begin();
const auto& outer_ptr_increments = outer_loop_end->get_ptr_increments();
const auto& outer_data_sizes = outer_loop_end->get_element_type_sizes();
for (auto it = std::reverse_iterator<LinearIR::constExprIt>(outer_loop_end_expr_it); (*it)->get_node() != outer_loop_begin; ++it) {
const auto& inner_expr = *it;
if (const auto inner_buffer = ov::as_type_ptr<op::Buffer>(inner_expr->get_node())) {
const auto inner_cluster_it = find_cluster_by_expr(inner_expr);
OPENVINO_ASSERT(inner_cluster_it != m_clusters.cend(), "Buffer cluster has not been found");
const auto inner_cluster_id = get_cluster_buffer_id(*inner_cluster_it);
if (inner_cluster_id == SIZE_MAX) continue;
const auto final_offset = get_buffer_finalization_offset(inner_expr);
auto unite = [&](const BufferPorts& ports, const bool is_input) {
bool applied = false;
for (const auto& port : ports) {
const auto cluster_it = find_cluster_by_expr(port.first);
OPENVINO_ASSERT(cluster_it != m_clusters.cend(), "Buffer cluster has not been found");
// If the buffers are already in the same cluster or have different Buffer ID - skip
if (cluster_it == inner_cluster_it) continue;
bool can_be_reused = true;
for (const auto idx : port.second) {
can_be_reused = can_be_reused &&
can_be_data_ptr_proportionally_shifted(outer_ptr_increments[idx], outer_data_sizes[idx],
final_offset, inner_buffer->get_element_type().size());
}
if (!can_be_reused)
continue;
applied = unite_nested_clusters(inner_cluster_it, *cluster_it, port.first, is_input);
if (applied) break;
}
return applied;
};
if (unite(input_buffers, true)) continue;
if (unite(output_buffers, false)) continue;
}
}
}
int64_t DefineBufferClusters::get_buffer_finalization_offset(const ExpressionPtr& buffer_expr) const {
auto index = [](const std::vector<PortConnectorPtr>& loop_inputs, const PortConnectorPtr& buffer_out) {
const auto it = std::find(loop_inputs.cbegin(), loop_inputs.cend(), buffer_out);
OPENVINO_ASSERT(it != loop_inputs.cend(), "Buffer output PortConnector has not been found in target LoopEnd inputs");
return std::distance(loop_inputs.cbegin(), it);
};
int64_t final_offset = 0;
int64_t last_loop_exec_order = 0;
const auto buffer_outs = buffer_expr->get_output_port_connectors();
for (const auto& buffer_out : buffer_outs) {
const auto consumers = buffer_out->get_consumers();
for (const auto& consumer : consumers) {
const auto consumer_expr = consumer.get_expr();
const auto loop_end = ov::as_type_ptr<ov::snippets::op::LoopEnd>(consumer_expr->get_node());
if (loop_end && consumer_expr->get_loop_ids() == buffer_expr->get_loop_ids()) {
const auto loop_order = ov::snippets::pass::GetTopologicalOrder(loop_end);
if (loop_order > last_loop_exec_order) {
const auto loop_inputs = consumer_expr->get_input_port_connectors();
final_offset = loop_end->get_finalization_offsets()[index(loop_inputs, buffer_out)];
last_loop_exec_order = loop_order;
}
}
}
}
return final_offset;
}
bool DefineBufferClusters::unite_nested_clusters(const AllocateBuffers::BufferClusters::iterator& inner_cluster_it,
AllocateBuffers::BufferCluster& outer_cluster,
const ExpressionPtr& outer_buffer, bool is_outer_up) {
for (const auto& inner_buffer : *inner_cluster_it) {
ExpressionPtr common_loop_end_expr = nullptr;
size_t outer_idx = SIZE_MAX, inner_idx = SIZE_MAX;
const auto& up_buffer = is_outer_up ? outer_buffer : inner_buffer;
const auto& down_buffer = is_outer_up ? inner_buffer : outer_buffer;
auto& up_idx = is_outer_up ? outer_idx : inner_idx;
auto& down_idx = is_outer_up ? inner_idx : outer_idx;
if (are_buffer_neighbours(up_buffer, down_buffer, common_loop_end_expr, up_idx, down_idx)) {
const auto common_loop_end = ov::as_type_ptr<op::LoopEnd>(common_loop_end_expr->get_node());
const auto& inner_ptr_increments = common_loop_end->get_ptr_increments();
const auto& inner_final_offsets = common_loop_end->get_finalization_offsets();
const auto& inner_data_sizes = common_loop_end->get_element_type_sizes();
if (IdentifyBuffers::can_reuse_id({ inner_data_sizes[up_idx], inner_ptr_increments[up_idx], inner_final_offsets[up_idx] },
{ inner_data_sizes[down_idx], inner_ptr_increments[down_idx], inner_final_offsets[down_idx] })) {
const auto buffer_id = ov::as_type_ptr<op::Buffer>(outer_buffer->get_node())->get_id();
for (const auto& inner_buffer : *inner_cluster_it)
ov::as_type_ptr<op::Buffer>(inner_buffer->get_node())->set_id(buffer_id);
outer_cluster.insert(inner_cluster_it->cbegin(), inner_cluster_it->cend());
m_clusters.erase(inner_cluster_it);
return true;
}
}
}
return false;
}
bool DefineBufferClusters::are_buffer_neighbours(const ExpressionPtr& up, const ExpressionPtr& down, ExpressionPtr& loop, size_t& up_idx, size_t& down_idx) {
auto find_input = [&down](const PortConnectorPtr& in) {
return in->get_source().get_expr() == down;
};
auto find_output = [&down](const PortConnectorPtr& in) {
const auto consumers = in->get_consumers();
return std::any_of(consumers.cbegin(), consumers.cend(),
[&down](const ExpressionPort& port) { return port.get_expr() == down; });
};
auto find = [&](const std::vector<PortConnectorPtr>::const_iterator& begin,
const std::vector<PortConnectorPtr>::const_iterator& end,
const std::vector<PortConnectorPtr>::const_iterator& orig_begin,
const ExpressionPort& loop_port,
bool is_input) -> bool {
const auto in_buffer_it = is_input ? std::find_if(begin, end, find_input)
: std::find_if(begin, end, find_output);
if (in_buffer_it != end) {
up_idx = loop_port.get_index();
down_idx = std::distance(orig_begin, in_buffer_it);
loop = loop_port.get_expr();
return true;
}
return false;
};
for (const auto& out : up->get_output_port_connectors()) {
for (const auto& buffer_consumer : out->get_consumers()) {
const auto buffer_consumer_expr = buffer_consumer.get_expr();
const auto loop_end = ov::as_type_ptr<op::LoopEnd>(buffer_consumer_expr->get_node());
if (!loop_end)
continue;
const auto& loop_inputs = buffer_consumer_expr->get_input_port_connectors();
if (find(loop_inputs.cbegin(), loop_inputs.cbegin() + loop_end->get_input_num(), loop_inputs.cbegin(), buffer_consumer, true)) return true;
if (find(loop_inputs.cbegin() + loop_end->get_input_num(), loop_inputs.cend(), loop_inputs.cbegin(), buffer_consumer, false)) return true;
}
}
return false;
}
void DefineBufferClusters::parse_memory_access_op(const ExpressionPtr& expr) {
const auto ma = ov::as_type_ptr<op::MemoryAccess>(expr->get_node());
if (!ma->is_full_memory_access_op())
return;
// TODO: Some full MemoryAccess ops can have inplace inputs and outputs in general.
// Need to add mechanism of inplace ports using MemoryAccess::PortDescriptor::inplace
for (const auto& input : expr->get_input_port_connectors()) {
if (is_direct_buffer(input->get_source().get_expr(), expr)) {
create_new_cluster(input->get_source().get_expr());
}
}
for (const auto& output : expr->get_output_port_connectors()) {
for (const auto& consumer : output->get_consumers()) {
if (is_direct_buffer(consumer.get_expr(), expr)) {
create_new_cluster(consumer.get_expr());
}
}
}
}
bool DefineBufferClusters::run(LinearIR& linear_ir) {
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::DefineBufferClusters");
for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); ++expr_it) {
const auto& expr = *expr_it;
const auto op = expr->get_node();
if (ov::is_type<op::LoopEnd>(op)) {
parse_loop(expr_it);
continue;
}
if (ov::is_type<op::MemoryAccess>(op)) {
parse_memory_access_op(expr);
continue;
}
}
return true;
}
} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov

View File

@@ -0,0 +1,29 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/lowered/pass/enumerate_expressions.hpp"
#include "snippets/pass/tokenization.hpp"
#include "snippets/itt.hpp"
namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
bool EnumerateExpressions::run(LinearIR& linear_ir) {
// [113536]: The temporary solution reuses the topological order from tokenization.
// Support for Expression execution order needs to be added
int64_t order = 0;
for (const auto& expr : linear_ir) {
ov::snippets::pass::SetTopologicalOrder(expr->get_node(), order++);
}
return order > 0;
}
} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov

View File

@@ -20,37 +20,69 @@ inline size_t index(size_t col_num, size_t row, size_t col) {
}
} // namespace
std::vector<bool> IdentifyBuffers::create_adjacency_matrix(const LinearIR& linear_ir, const BufferSet& buffers) const {
bool operator==(const IdentifyBuffers::ShiftPtrParams& lhs, const IdentifyBuffers::ShiftPtrParams& rhs) {
if (&lhs == &rhs)
return true;
return lhs.ptr_increment == rhs.ptr_increment && lhs.finalization_offset == rhs.finalization_offset && lhs.data_size == rhs.data_size;
}
bool operator!=(const IdentifyBuffers::ShiftPtrParams& lhs, const IdentifyBuffers::ShiftPtrParams& rhs) {
return !(rhs == lhs);
}
size_t IdentifyBuffers::get_buffer_idx(const ExpressionPtr& target, const BufferPool& pool) {
const auto iter = std::find(pool.cbegin(), pool.cend(), target);
OPENVINO_ASSERT(iter != pool.cend(), "Buffer wasn't found in the Buffer system of Subgraph");
return std::distance(pool.cbegin(), iter);
}
bool IdentifyBuffers::can_reuse_id(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs) {
const auto equal_ptr_params_shifting = lhs.ptr_increment == rhs.ptr_increment && lhs.finalization_offset == rhs.finalization_offset;
const auto equal_element_type_sizes = lhs.data_size == rhs.data_size;
return equal_ptr_params_shifting && (equal_element_type_sizes || (lhs.ptr_increment == 0 && lhs.finalization_offset == 0));
}
bool IdentifyBuffers::are_adjacent(const std::pair<ExpressionPtr, ShiftPtrParams>& lhs,
const std::pair<ExpressionPtr, ShiftPtrParams>& rhs) {
const auto lhs_ids = lhs.first->get_loop_ids();
const auto rhs_ids = rhs.first->get_loop_ids();
const auto equal_loop_ids = lhs_ids == rhs_ids;
if (equal_loop_ids) { // Buffers are connected to the same Loop and have the same outer Loops
return !can_reuse_id(lhs.second, rhs.second);
} else { // Buffers are connected to the same Loop, but one of the Buffers is inside this Loop while the other is outside
// Buffers are adjacent if the outer Buffer has nonzero data shift params
if (lhs_ids.size() == rhs_ids.size()) // If the counts of outer Loops are equal, the outer Loops are already different
return true;
const auto& outer_buffer = lhs_ids.size() < rhs_ids.size() ? lhs : rhs;
const auto count_outer_loops = std::min(lhs_ids.size(), rhs_ids.size());
const auto are_outer_loops_the_same = lhs_ids.size() != rhs_ids.size() &&
std::equal(rhs_ids.cbegin(), rhs_ids.cbegin() + count_outer_loops, lhs_ids.cbegin());
const auto outer_buffer_has_zero_shifts = outer_buffer.second.ptr_increment == 0 && outer_buffer.second.finalization_offset == 0;
return !are_outer_loops_the_same || !outer_buffer_has_zero_shifts;
}
}
void IdentifyBuffers::update_adj_matrix(const std::pair<ExpressionPtr, ShiftPtrParams>& lhs,
const std::pair<ExpressionPtr, ShiftPtrParams>& rhs,
const BufferPool& buffers,
std::vector<bool>& adj) {
if (are_adjacent(lhs, rhs)) {
const auto size = buffers.size();
const auto lhs_idx = get_buffer_idx(lhs.first, buffers);
const auto rhs_idx = get_buffer_idx(rhs.first, buffers);
adj[index(size, rhs_idx, lhs_idx)] = adj[index(size, lhs_idx, rhs_idx)] = true;
}
}
std::vector<bool> IdentifyBuffers::create_adjacency_matrix(const LinearIR& linear_ir, const BufferPool& pool) {
// There are several sync points for adjacency check:
// 1. Loop, because only in Loops do we increment pointers. So if some Buffers in one Loop have a conflict
// (cannot be inplace: different ptr increments or data sizes), they are called adjacent
// 2. Brgemm because its blocking implementation requires Buffers with unique memory on all inputs and outputs
const auto size = buffers.size();
// TODO: Can we use a triangular matrix? Need to verify using tests
const auto size = pool.size();
std::vector<bool> adj(size * size, false);
for (size_t i = 0; i < size; ++i)
adj[index(size, i, i)] = true;
// < ptr_increment, finalization_offset >
using ShiftPtrParams = std::pair<int64_t, int64_t>;
auto get_buffer_idx = [&](const std::shared_ptr<op::Buffer>& buffer) {
const auto iter = std::find(buffers.cbegin(), buffers.cend(), buffer);
OPENVINO_ASSERT(iter != buffers.cend(), "Buffer wasn't found in the Buffer system of Subgraph");
return std::distance(buffers.cbegin(), iter);
};
auto update_adj_matrix = [&](const std::pair<std::shared_ptr<op::Buffer>, ShiftPtrParams>& buffer,
const std::pair<std::shared_ptr<op::Buffer>, ShiftPtrParams>& neighbour_buffer) {
const bool equal_ptr_params_shifting = buffer.second == neighbour_buffer.second;
const bool equal_element_type_sizes = buffer.first->get_element_type().size() == neighbour_buffer.first->get_element_type().size();
if (!equal_ptr_params_shifting || ((buffer.second.first != 0 || buffer.second.second != 0) && !equal_element_type_sizes)) {
const auto buffer_idx = get_buffer_idx(buffer.first);
const auto neighbour_idx = get_buffer_idx(neighbour_buffer.first);
adj[index(size, neighbour_idx, buffer_idx)] = adj[index(size, buffer_idx, neighbour_idx)] = true;
}
};
auto is_buffer = [](const ExpressionPort& port) {
return ov::is_type<op::Buffer>(port.get_expr()->get_node());
};
@@ -65,19 +97,19 @@ std::vector<bool> IdentifyBuffers::create_adjacency_matrix(const LinearIR& linea
continue;
OPENVINO_ASSERT(std::count_if(consumers.begin(), consumers.end(), is_buffer) == 1, "Brgemm mustn't have more than 1 consumer buffer");
std::vector<std::shared_ptr<op::Buffer>> adjacency_buffers;
adjacency_buffers.push_back(ov::as_type_ptr<op::Buffer>(buffer_it->get_expr()->get_node()));
BufferPool adjacency_buffers;
adjacency_buffers.push_back(buffer_it->get_expr());
for (const auto& input_connector : expr->get_input_port_connectors()) {
const auto parent_node = input_connector->get_source().get_expr()->get_node();
if (const auto neighbour_buffer = ov::as_type_ptr<op::Buffer>(parent_node)) {
adjacency_buffers.push_back(neighbour_buffer);
const auto parent_expr = input_connector->get_source().get_expr();
if (ov::is_type<op::Buffer>(parent_expr->get_node())) {
adjacency_buffers.push_back(parent_expr);
}
}
for (auto buffer_it = adjacency_buffers.begin(); buffer_it != adjacency_buffers.end(); ++buffer_it) {
for (auto neighbour_it = std::next(buffer_it); neighbour_it != adjacency_buffers.end(); ++neighbour_it) {
const auto buffer_idx = get_buffer_idx(*buffer_it);
const auto neighbour_idx = get_buffer_idx(*neighbour_it);
const auto buffer_idx = get_buffer_idx(*buffer_it, pool);
const auto neighbour_idx = get_buffer_idx(*neighbour_it, pool);
adj[index(size, neighbour_idx, buffer_idx)] = adj[index(size, buffer_idx, neighbour_idx)] = true;
}
}
@@ -91,29 +123,36 @@ std::vector<bool> IdentifyBuffers::create_adjacency_matrix(const LinearIR& linea
const auto input_count = loop_end->get_input_num();
const auto output_count = loop_end->get_output_num();
const auto ptr_increments = loop_end->get_ptr_increments();
const auto finalization_offsets = loop_end->get_finalization_offsets();
const auto& ptr_increments = loop_end->get_ptr_increments();
const auto& finalization_offsets = loop_end->get_finalization_offsets();
const auto& data_sizes = loop_end->get_element_type_sizes();
// Buffer -> <ptr increment, finalization_offsets>
std::map<std::shared_ptr<op::Buffer>, ShiftPtrParams> buffer_neighbours;
std::map<ExpressionPtr, ShiftPtrParams> buffer_neighbours;
for (size_t i = 0; i < input_count; ++i) {
const auto& parent_output = expr->get_input_port_connector(i)->get_source().get_expr();
if (const auto buffer = ov::as_type_ptr<op::Buffer>(parent_output->get_node())) {
buffer_neighbours[buffer] = { ptr_increments[i], finalization_offsets[i] };
if (ov::is_type<op::Buffer>(parent_output->get_node())) {
if (buffer_neighbours.count(parent_output) > 0) {
OPENVINO_ASSERT(buffer_neighbours[parent_output].ptr_increment == ptr_increments[i] &&
buffer_neighbours[parent_output].finalization_offset == finalization_offsets[i],
"Invalid data pointer shifts: If Buffer has several consumers, this consumers must have the same shifts or zero");
continue;
}
buffer_neighbours[parent_output] = { data_sizes[i], ptr_increments[i], finalization_offsets[i] };
}
}
for (size_t i = 0; i < output_count; ++i) {
for (size_t i = input_count; i < input_count + output_count; ++i) {
// The consumers of the corresponding Store ops
const auto index = input_count + i;
const auto consumer_inputs = expr->get_input_port_connector(index)->get_consumers();
const auto consumer_inputs = expr->get_input_port_connector(i)->get_consumers();
size_t buffer_count = 0;
size_t loop_count = 0;
for (const auto& consumer_input : consumer_inputs) {
const auto& child_node = consumer_input.get_expr()->get_node();
if (const auto buffer = ov::as_type_ptr<op::Buffer>(child_node)) {
buffer_neighbours[buffer] = { ptr_increments[index], finalization_offsets[index] };
} else if (ov::is_type<op::LoopEnd>(child_node)) {
const auto& child_expr = consumer_input.get_expr();
if (ov::is_type<op::Buffer>(child_expr->get_node())) {
buffer_neighbours[child_expr] = { data_sizes[i], ptr_increments[i], finalization_offsets[i] };
buffer_count++;
} else if (ov::is_type<op::LoopEnd>(child_expr->get_node())) {
loop_count++;
}
}
@@ -123,9 +162,24 @@ std::vector<bool> IdentifyBuffers::create_adjacency_matrix(const LinearIR& linea
}
}
// Buffers which are connected to the current Loop without ptr shifts and Buffers which are inside this Loop must be adjacent, because
// after each Loop iteration the GPR will be shifted using the ptr increment of the outer Buffer. But the Buffers inside have the same GPR, which means that
// the Buffers inside would work with shifted memory.
const auto loop_begin = loop_end->get_loop_begin();
for (auto it = std::reverse_iterator<LinearIR::constExprIt>(expr_it); (*it)->get_node() != loop_begin; ++it) {
const auto& inner_expr = *it;
if (ov::is_type<op::Buffer>(inner_expr->get_node())) {
// To make the Buffers adjacent, we set the value "INT64_MAX" for the data ptr shift params of inner Buffers,
// since outer Buffers (and any other Buffers) cannot have this value in shifting because of the semantics of the Loop op.
// Thus, inner and outer Buffers always have different data ptr shift params -> they're adjacent
if (buffer_neighbours.count(inner_expr) == 0)
buffer_neighbours[inner_expr] = { INT64_MAX, INT64_MAX, INT64_MAX };
}
}
for (auto buffer_it = buffer_neighbours.begin(); buffer_it != buffer_neighbours.end(); ++buffer_it) {
for (auto neighbour_it = std::next(buffer_it); neighbour_it != buffer_neighbours.end(); ++neighbour_it) {
update_adj_matrix(*buffer_it, *neighbour_it);
update_adj_matrix(*buffer_it, *neighbour_it, pool, adj);
}
}
}
@@ -133,9 +187,9 @@ std::vector<bool> IdentifyBuffers::create_adjacency_matrix(const LinearIR& linea
return adj;
}
auto IdentifyBuffers::coloring(BufferSet& buffers, std::vector<bool>& adj) -> std::map<size_t, BufferSet> {
auto IdentifyBuffers::coloring(BufferPool& buffers, std::vector<bool>& adj) -> std::map<size_t, BufferPool> {
size_t color = 0;
std::map<size_t, BufferSet> color_groups;
std::map<size_t, BufferPool> color_groups;
const auto size = buffers.size();
for (size_t i = 0; i < size; i++) {
// The Buffer is already colored (visited) - skip
@@ -183,25 +237,25 @@ bool IdentifyBuffers::run(LinearIR& linear_ir) {
// Unite Buffers using Graph coloring algorithm.
// Notes: We identify only Buffers with intermediate memory because Buffers with new memory are used only in the Brgemm case,
// so these Buffers are always nonadjacent to intermediate Buffers
BufferSet buffer_exprs;
BufferPool buffer_pool;
for (const auto& expr : linear_ir) {
if (const auto buffer = ov::as_type_ptr<op::Buffer>(expr->get_node())) {
buffer_exprs.push_back(buffer);
if (ov::is_type<op::Buffer>(expr->get_node())) {
buffer_pool.push_back(expr);
}
}
// Creation of Adj matrix
auto adj = create_adjacency_matrix(linear_ir, buffer_exprs);
auto adj = create_adjacency_matrix(linear_ir, buffer_pool);
// Graph coloring algorithm
const auto color_groups = coloring(buffer_exprs, adj);
const auto color_groups = coloring(buffer_pool, adj);
for (const auto& pair : color_groups) {
const auto color = pair.first;
const auto& united_buffers = pair.second;
for (const auto& buffer : united_buffers) {
buffer->set_id(color);
for (const auto& buffer_expr : united_buffers) {
ov::as_type_ptr<op::Buffer>(buffer_expr->get_node())->set_id(color);
}
}

View File

@@ -0,0 +1,40 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/lowered/pass/init_buffers_default.hpp"
#include "snippets/lowered/pass/allocate_buffers.hpp"
#include "snippets/op/buffer.hpp"
#include "snippets/itt.hpp"
namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
bool InitBuffersDefault::run(LinearIR& linear_ir) {
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InitBuffersDefault");
size_t id = 0;
size_t offset = 0;
for (const auto& expr : linear_ir) {
const auto op = expr->get_node();
if (const auto buffer = ov::as_type_ptr<op::Buffer>(op)) {
AllocateBuffers::set_buffer_offset(expr, offset);
buffer->set_id(id);
offset += buffer->get_byte_size();
id++;
}
}
m_buffer_scratchpad_size = offset;
return m_buffer_scratchpad_size > 0;
}
} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
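
For intuition, a worked example of the default layout (Buffer sizes assumed): three Buffers of 128, 512, and 64 bytes receive IDs 0, 1, 2 and offsets 0, 128, and 640, so m_buffer_scratchpad_size ends up as 704 bytes - no memory is reused in this mode.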

View File

@@ -0,0 +1,38 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/lowered/pass/normalize_buffer_ids.hpp"
#include "snippets/op/buffer.hpp"
#include "snippets/itt.hpp"
namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
bool NormalizeBufferIDs::run(LinearIR& linear_ir) {
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::NormalizeBufferIDs");
// [ original Buffer ID -> normalized ]
std::map<size_t, size_t> buffer_ids;
for (const auto& expr : linear_ir) {
const auto op = expr->get_node();
if (const auto buffer = ov::as_type_ptr<op::Buffer>(op)) {
const auto buffer_id = buffer->get_id();
if (buffer_ids.count(buffer_id) == 0) {
const auto new_id = buffer_ids.size();
buffer_ids[buffer_id] = new_id;
}
buffer->set_id(buffer_ids[buffer_id]);
}
}
return !buffer_ids.empty();
}
} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
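NormalizeBufferIDs only compacts the ID space after coloring; the grouping itself is preserved. A minimal standalone sketch of the remap on plain values (the {5, 2, 5, 7} input is invented):

#include <cstddef>
#include <map>
#include <vector>

// Remap arbitrary IDs to a dense 0..N-1 range in order of first appearance,
// mirroring the pass above: {5, 2, 5, 7} -> {0, 1, 0, 2}.
std::vector<size_t> normalize_ids(const std::vector<size_t>& ids) {
    std::map<size_t, size_t> remap;  // [ original ID -> normalized ]
    std::vector<size_t> result;
    for (size_t id : ids) {
        if (remap.count(id) == 0) {
            const size_t next_id = remap.size();
            remap[id] = next_id;
        }
        result.push_back(remap[id]);
    }
    return result;
}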

View File

@ -0,0 +1,89 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/lowered/pass/solve_buffer_memory.hpp"
#include "snippets/pass/tokenization.hpp"
#include "snippets/utils.hpp"
#include "snippets/itt.hpp"
namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
std::vector<ov::MemorySolver::Box> SolveBufferMemory::init_boxes(const AllocateBuffers::BufferClusters& buffer_clusters) {
std::vector<ov::MemorySolver::Box> boxes;
const auto count = static_cast<int>(buffer_clusters.size());
for (int i = 0; i < count; i++) {
ov::MemorySolver::Box box = { std::numeric_limits<int>::max(), 0, 0, i };
int64_t box_size = 0;
for (const auto& buffer_expr : buffer_clusters[i]) {
int e_start = 0, e_finish = 0;
const auto buffer = ov::as_type_ptr<ov::snippets::op::Buffer>(buffer_expr->get_node());
OPENVINO_ASSERT(buffer != nullptr, "SolveBufferMemory expects Buffer ops in clusters");
// lifetime finish: the topological order of the last consumer (LoopEnd / MemoryAccess ops)
const auto buffer_outs = buffer_expr->get_output_port_connectors();
for (const auto& buffer_out : buffer_outs) {
const auto consumers = buffer_out->get_consumers();
for (const auto& consumer : consumers) {
const auto consumer_order = static_cast<int>(ov::snippets::pass::GetTopologicalOrder(consumer.get_expr()->get_node()));
e_finish = std::max(e_finish, consumer_order); // the last consumer
}
}
e_start = e_finish;
const auto buffer_ins = buffer_expr->get_input_port_connectors();
for (const auto& buffer_in : buffer_ins) {
const auto& source = buffer_in->get_source();
e_start = static_cast<int>(ov::snippets::pass::GetTopologicalOrder(source.get_expr()->get_node()));
const auto buffer_siblings = buffer_in->get_consumers();
for (const auto& sibling : buffer_siblings) {
if (const auto loop_end = ov::as_type_ptr<ov::snippets::op::LoopEnd>(sibling.get_expr()->get_node())) {
e_start = std::min(e_start, static_cast<int>(ov::snippets::pass::GetTopologicalOrder(loop_end->get_loop_begin())));
}
}
}
OPENVINO_ASSERT(e_start <= e_finish, "Incorrect lifetime of the buffer!");
auto buffer_size = static_cast<int64_t>(buffer->get_byte_size());
box_size = std::max(buffer_size, box_size);
box.start = std::min(e_start, box.start);
box.finish = std::max(e_finish, box.finish);
}
// We use data alignment to place data on cache-line boundaries
box.size = utils::div_up(box_size, m_alignment);
boxes.push_back(box);
}
return boxes;
}
bool SolveBufferMemory::run(LinearIR& linear_ir) {
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::SolveBufferMemory");
const auto boxes = init_boxes(m_clusters);
ov::MemorySolver memSolver(boxes);
m_buffer_scratchpad_size = static_cast<size_t>(memSolver.solve()) * m_alignment; // alignment in bytes
// Set offsets for Buffers
for (const auto& box : boxes) {
for (const auto& buffer : m_clusters[box.id]) {
const auto offset = static_cast<size_t>(memSolver.get_offset(static_cast<int>(box.id)));
AllocateBuffers::set_buffer_offset(buffer, offset * m_alignment); // alignment in bytes
}
}
return m_buffer_scratchpad_size > 0;
}
} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
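Each cluster above becomes one MemorySolver box: start is the earliest producer order (extended to the enclosing LoopBegin when the buffer lives across a loop), finish is the order of the last consumer, and size is the cluster's largest byte size converted to alignment units. A worked example with invented numbers:

// Hypothetical cluster of two buffers (orders and sizes are invented):
//   buffer X: producer order 3, last consumer order 7, 40000 bytes
//   buffer Y: producer order 5, last consumer order 9, 32000 bytes
// With m_alignment = 32:
//   box.start  = min(3, 5) = 3
//   box.finish = max(7, 9) = 9
//   box.size   = div_up(max(40000, 32000), 32) = 1250   // in alignment units
// After solve(), the scratchpad size and all offsets are multiplied back by
// m_alignment, so the results are expressed in bytes again.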

View File

@ -39,7 +39,6 @@
#include "snippets/lowered/pass/move_scalar_to_consumer.hpp"
#include "snippets/lowered/pass/move_result_out_of_loop.hpp"
#include "snippets/lowered/pass/clean_repeated_ptr_shifts.hpp"
#include "snippets/lowered/pass/identify_buffers.hpp"
#include "snippets/lowered/pass/validate_loops.hpp"
#include "snippets/lowered/pass/insert_loops.hpp"
#include "snippets/lowered/pass/optimize_domain.hpp"
@ -453,19 +452,12 @@ void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir,
backend_passes_post_common.run(linear_ir);
const auto buffer_allocation_pass = std::make_shared<lowered::pass::AllocateBuffers>();
lowered::pass::PassPipeline buffer_pipeline;
buffer_pipeline.register_pass<lowered::pass::IdentifyBuffers>();
buffer_pipeline.register_pass<lowered::pass::CleanRepeatedDataPointerShifts>();
buffer_pipeline.register_pass(buffer_allocation_pass);
buffer_pipeline.run(linear_ir);
lowered::pass::PassPipeline final_pipeline;
final_pipeline.register_pass<lowered::pass::AllocateBuffers>(lowering_result.buffer_scratchpad_size, linear_ir.get_config().m_are_buffers_optimized);
final_pipeline.register_pass<lowered::pass::CleanRepeatedDataPointerShifts>();
final_pipeline.register_pass<lowered::pass::PropagateLayout>();
final_pipeline.register_pass<lowered::pass::CleanupLoopOffsets>();
final_pipeline.run(linear_ir);
lowering_result.buffer_scratchpad_size = buffer_allocation_pass->get_scratchpad_size();
}
snippets::Schedule Subgraph::generate(const BlockedShapeVector& blocked_input_shapes,

View File

@ -0,0 +1,61 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <common_test_utils/ov_test_utils.hpp>
#include "snippets/op/brgemm.hpp"
#include "snippets/lowered/pass/pass.hpp"
namespace ov {
namespace test {
namespace snippets {
typedef std::tuple<
bool, // Optimized pipeline
bool, // With SplitLoops opt
size_t, // Expected Buffer size in bytes
size_t // Expected unique Buffer IDs count
> BufferAllocationParams;
class BufferAllocationTest : public testing::TestWithParam<BufferAllocationParams> {
public:
using VectorDims = ov::snippets::VectorDims;
static std::string getTestCaseName(testing::TestParamInfo<BufferAllocationParams> obj);
protected:
void SetUp() override;
void ApplyTransformations(bool is_optimized, bool with_split_loops);
void Validate();
virtual std::shared_ptr<ov::Model> GetModel() const = 0;
static void MarkOp(const std::shared_ptr<ov::Node>& node, const std::vector<size_t>& subtensor);
size_t m_buffer_scratchpad = 0;
ov::snippets::lowered::LinearIR m_linear_ir;
size_t m_expected_size = 0;
size_t m_expected_count = 0;
size_t m_loop_depth = 2;
size_t m_vector_size = 16;
};
class EltwiseBufferAllocationTest : public BufferAllocationTest {
protected:
std::shared_ptr<ov::Model> GetModel() const override;
};
class MHABufferAllocationTest : public BufferAllocationTest {
protected:
std::shared_ptr<ov::Model> GetModel() const override;
static void MarkBrgemm(const std::shared_ptr<ov::snippets::op::Brgemm>& node, const std::vector<size_t>& subtensor);
};
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -0,0 +1,213 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "lowered/pass/buffer_allocation.hpp"
#include "openvino/opsets/opset.hpp"
#include "snippets/snippets_isa.hpp"
#include "snippets/lowered/linear_ir.hpp"
#include "snippets/lowered/pass/mark_loops.hpp"
#include "snippets/lowered/pass/init_loops.hpp"
#include "snippets/lowered/pass/insert_load_store.hpp"
#include "snippets/lowered/pass/validate_loops.hpp"
#include "snippets/lowered/pass/insert_loops.hpp"
#include "snippets/lowered/pass/allocate_buffers.hpp"
#include "snippets/lowered/pass/fuse_loops.hpp"
#include "snippets/lowered/pass/split_loops.hpp"
#include "snippets/lowered/pass/insert_buffers.hpp"
#include "snippets/lowered/pass/softmax_decomposition.hpp"
#include "common_test_utils/common_utils.hpp"
namespace ov {
namespace test {
namespace snippets {
std::string BufferAllocationTest::getTestCaseName(testing::TestParamInfo<ov::test::snippets::BufferAllocationParams> obj) {
bool is_optimized, with_split_loops;
size_t expected_size, expected_count;
std::tie(is_optimized, with_split_loops, expected_size, expected_count) = obj.param;
std::ostringstream result;
result << "Opt=" << ov::test::utils::bool2str(is_optimized) << "_";
result << "Split=" << ov::test::utils::bool2str(with_split_loops) << "_";
result << "ExpBufferSize=" << expected_size << "_";
result << "ExpBufferNum=" << expected_count;
return result.str();
}
void BufferAllocationTest::SetUp() {
bool is_optimized, with_split_loops;
std::tie(is_optimized, with_split_loops, m_expected_size, m_expected_count) = this->GetParam();
const auto body = GetModel();
m_linear_ir = ov::snippets::lowered::LinearIR(body, std::make_shared<ov::snippets::IShapeInferSnippetsFactory>());
m_linear_ir.set_loop_depth(m_loop_depth);
ApplyTransformations(is_optimized, with_split_loops);
}
void BufferAllocationTest::MarkOp(const std::shared_ptr<ov::Node>& node, const std::vector<size_t>& subtensor) {
for (const auto& input : node->inputs())
ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
input, std::make_shared<ov::snippets::lowered::PortDescriptor>(input, subtensor));
for (const auto& output : node->outputs())
ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
output, std::make_shared<ov::snippets::lowered::PortDescriptor>(output, subtensor));
}
void BufferAllocationTest::ApplyTransformations(bool is_optimized, bool with_split) {
ov::snippets::lowered::pass::PassPipeline pipeline;
pipeline.register_pass<ov::snippets::lowered::pass::MarkLoops>(m_vector_size);
pipeline.register_pass<ov::snippets::lowered::pass::SoftmaxDecomposition>(m_vector_size);
pipeline.register_pass<ov::snippets::lowered::pass::FuseLoops>();
if (with_split)
pipeline.register_pass<ov::snippets::lowered::pass::SplitLoops>();
pipeline.register_pass<ov::snippets::lowered::pass::InsertBuffers>(2);
pipeline.register_pass<ov::snippets::lowered::pass::InsertLoadStore>(m_vector_size);
pipeline.register_pass<ov::snippets::lowered::pass::InitLoops>();
pipeline.register_pass<ov::snippets::lowered::pass::InsertLoops>();
pipeline.register_pass<ov::snippets::lowered::pass::AllocateBuffers>(m_buffer_scratchpad, is_optimized);
pipeline.run(m_linear_ir);
}
void BufferAllocationTest::Validate() {
std::set<size_t> gprs;
for (const auto& expr : m_linear_ir) {
if (const auto buffer = ov::as_type_ptr<ov::snippets::op::Buffer>(expr->get_node())) {
gprs.insert(buffer->get_id());
}
}
EXPECT_EQ(gprs.size(), m_expected_count);
EXPECT_EQ(m_buffer_scratchpad, m_expected_size);
}
std::shared_ptr<ov::Model> EltwiseBufferAllocationTest::GetModel() const {
const auto subtensor_eltwise = std::vector<size_t>{1, m_vector_size};
const auto subtensor_buffer = std::vector<size_t>{ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM,
ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM};
const auto parameter0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape({1, 3, 100, 100}));
const auto parameter1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape({1, 3, 100, 100}));
const auto add = std::make_shared<ov::op::v1::Add>(parameter0, parameter1);
const auto buffer0 = std::make_shared<ov::snippets::op::Buffer>(add, static_cast<int32_t>(subtensor_buffer.size()));
const auto relu = std::make_shared<ov::op::v0::Relu>(buffer0);
const auto buffer1 = std::make_shared<ov::snippets::op::Buffer>(relu, static_cast<int32_t>(subtensor_buffer.size()));
const auto exp = std::make_shared<ov::op::v0::Exp>(buffer1);
const auto body = std::make_shared<ov::Model>(std::make_shared<ov::op::v0::Result>(exp), ov::ParameterVector{parameter0, parameter1});
MarkOp(add, subtensor_eltwise);
MarkOp(relu, subtensor_eltwise);
MarkOp(exp, subtensor_eltwise);
MarkOp(buffer0, subtensor_buffer);
MarkOp(buffer1, subtensor_buffer);
return body;
}
void MHABufferAllocationTest::MarkBrgemm(const std::shared_ptr<ov::snippets::op::Brgemm>& node, const std::vector<size_t>& subtensor) {
const auto subtensor_full = std::vector<size_t>{ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM,
ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM};
ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
node->input(0), std::make_shared<ov::snippets::lowered::PortDescriptor>(node->input(0), subtensor));
ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
node->input(1), std::make_shared<ov::snippets::lowered::PortDescriptor>(node->input(1), subtensor_full));
ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
node->output(0), std::make_shared<ov::snippets::lowered::PortDescriptor>(node->output(0), subtensor));
}
std::shared_ptr<ov::Model> MHABufferAllocationTest::GetModel() const {
const auto subtensor_scalar = std::vector<size_t>{1, 1};
const auto subtensor_eltwise = std::vector<size_t>{1, m_vector_size};
const auto subtensor_brgemm = std::vector<size_t>{32, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM};
const auto subtensor_softmax = std::vector<size_t>{1, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM};
const auto parameter0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape({1, 12, 128, 64}));
const auto parameter1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape({1, 128, 12, 64}));
const auto parameter2 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape({1, 12, 128, 64}));
const auto load_reshape = std::make_shared<ov::snippets::op::LoadReshape>(parameter1, 1, 0, std::vector<size_t>{0, 2, 3, 1});
const auto store = std::make_shared<ov::snippets::op::Store>(load_reshape);
const auto relu0 = std::make_shared<ov::op::v0::Relu>(store);
const auto matmul0 = std::make_shared<ov::snippets::op::Brgemm>(parameter0, relu0);
const auto relu1 = std::make_shared<ov::op::v0::Relu>(matmul0);
const auto softmax = std::make_shared<ov::op::v1::Softmax>(relu1, 3);
const auto matmul1 = std::make_shared<ov::snippets::op::Brgemm>(softmax, parameter2);
const auto relu2 = std::make_shared<ov::op::v0::Relu>(matmul1);
const auto body = std::make_shared<ov::Model>(std::make_shared<ov::op::v0::Result>(relu2), ov::ParameterVector{parameter0, parameter1, parameter2});
MarkOp(load_reshape, subtensor_scalar);
MarkOp(store, subtensor_scalar);
MarkOp(softmax, subtensor_softmax);
MarkBrgemm(matmul0, subtensor_brgemm);
MarkBrgemm(matmul1, subtensor_brgemm);
return body;
}
TEST_P(EltwiseBufferAllocationTest, BufferAllocation) {
Validate();
}
TEST_P(MHABufferAllocationTest, BufferAllocation) {
Validate();
}
namespace BufferAllocationTest_Instances {
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_EltwiseNotOptimized, EltwiseBufferAllocationTest,
::testing::Combine(
::testing::Values(false),
::testing::Values(false), // SplitLoops is irrelevant for this test
::testing::Values(80000), // Each Buffer has its own allocated memory
::testing::Values(2)), // Each Buffer has a unique ID
BufferAllocationTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_EltwiseOptimized, EltwiseBufferAllocationTest,
::testing::Combine(
::testing::Values(true),
::testing::Values(false), // SplitLoops is irrelevant for this test
::testing::Values(40000), // Two Buffers reuse the same memory
::testing::Values(1)), // Two Buffers share the same ID
BufferAllocationTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWSplit, MHABufferAllocationTest,
::testing::Combine(
::testing::Values(false),
::testing::Values(true),
::testing::Values(139264), // Each Buffer has its own allocated memory
::testing::Values(7)), // Each Buffer has a unique ID
BufferAllocationTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWSplit, MHABufferAllocationTest,
::testing::Combine(
::testing::Values(true),
::testing::Values(true),
::testing::Values(57344), // (Buffer before brgemm) + (between brgemms) + (after brgemm)
::testing::Values(3)), // (Buffer before brgemm) + (between brgemms) + (after brgemm)
BufferAllocationTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWOSplit, MHABufferAllocationTest,
::testing::Combine(
::testing::Values(false),
::testing::Values(false),
::testing::Values(360448), // Each Buffer has its own allocated memory
::testing::Values(7)), // Each Buffer has a unique ID
BufferAllocationTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWOSplit, MHABufferAllocationTest,
::testing::Combine(
::testing::Values(true),
::testing::Values(false),
::testing::Values(98304), // (between brgemms) + (Buffer before brgemm0 and after brgemm1)
::testing::Values(2)), // (Buffer before brgemm0 and after brgemm1) + (between brgemms)
BufferAllocationTest::getTestCaseName);
} // namespace BufferAllocationTest_Instances
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -8,13 +8,16 @@
*/
#pragma once
#include <ie_common.h>
#include <stdint.h>
#include <algorithm>
#include <map>
#include <vector>
#include "openvino/core/except.hpp"
namespace ov {
/**
* @brief Helps to solve the problem of optimal memory allocation for a particular
* execution order.
@ -42,7 +45,6 @@
* Exec order is predefined.
*/
IE_SUPPRESS_DEPRECATED_START
class MemorySolver {
public:
/** @brief Representation of edge (size and live time)*/
@ -67,7 +69,7 @@ public:
/** @brief Performs in-place normalization of the input boxes
@return lifespan of all boxes
*/
static int normalizeBoxes(std::vector<Box>& boxes) {
static int normalize_boxes(std::vector<Box>& boxes) {
int max_ts = 0;
for (const Box& box : boxes)
max_ts = std::max(std::max(max_ts, box.start), box.finish);
@ -113,10 +115,10 @@ public:
// 2. Box.finish >= Box.start (except Box.finish == -1)
// 3. Box.size > 0 (or == 0 ?)
// 4. Box.id == any unique value
_time_duration = normalizeBoxes(_boxes);
_time_duration = normalize_boxes(_boxes);
}
inline bool popupTogetherWith(MemorySolver::Box& box_new, const MemorySolver::Box& box_old) {
inline bool popup_together_with(MemorySolver::Box& box_new, const MemorySolver::Box& box_old) {
if (box_new.id + box_new.size > box_old.id && box_old.id + box_old.size > box_new.id) {
// Move the new one up. There is an intersection
box_new.id = box_old.id + box_old.size;
@ -131,7 +133,7 @@ public:
* @return Size of common memory blob required for storing all
*/
int64_t solve() {
maxTopDepth(); // at first make sure that we no need more for boxes sorted by box.start
max_top_depth(); // first, make sure we don't need more than _top_depth slots for boxes sorted by box.start
std::vector<std::vector<const Box*>> time_slots(_time_duration);
for (auto& slot : time_slots)
slot.reserve(_top_depth); // 2D array [_time_duration][_top_depth]
@ -155,8 +157,8 @@ public:
for (auto* box_in_slot : time_slots[i_slot]) {
// intersect with already stored boxes for all covered time slots
// and move up the new one if needed
// Execution of 'popupTogetherWith' is important even if 'popped_up' is already 'true'
popped_up = popupTogetherWith(box, *box_in_slot) || popped_up;
// Execution of 'popup_together_with' is important even if 'popped_up' is already 'true'
popped_up = popup_together_with(box, *box_in_slot) || popped_up;
}
}
} while (popped_up);
@ -174,23 +176,23 @@ public:
}
/** Provides calculated offset for specified box id */
int64_t getOffset(int id) const {
int64_t get_offset(int id) const {
auto res = _offsets.find(id);
if (res == _offsets.end())
IE_THROW() << "There are no box for provided ID";
OPENVINO_THROW("There is no box for the provided ID");
return res->second;
}
/** Additional info. Max sum of box sizes required for any time stamp. */
int64_t maxDepth() {
int64_t max_depth() {
if (_depth == -1)
calcDepth();
calc_depth();
return _depth;
}
/** Additional info. Max num of boxes required for any time stamp. */
int64_t maxTopDepth() {
int64_t max_top_depth() {
if (_top_depth == -1)
calcDepth();
calc_depth();
return _top_depth;
}
@ -201,7 +203,7 @@ private:
int64_t _depth = -1;
int _time_duration = -1;
void calcDepth() {
void calc_depth() {
int64_t top_depth = 0;
int64_t depth = 0;
std::map<int64_t, std::vector<const Box*>> release_at;
@ -218,11 +220,12 @@ private:
top_depth--;
}
release_at.erase(time);
IE_ASSERT(top_depth > 0);
OPENVINO_ASSERT(top_depth > 0);
_top_depth = std::max(_top_depth, top_depth);
_depth = std::max(_depth, depth);
}
}
};
IE_SUPPRESS_DEPRECATED_END
} // namespace ov
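With the move to ov:: and snake_case, typical usage looks as follows; a minimal sketch against the renamed API, with invented box values (see the unit tests below for real cases):

#include <cstdint>
#include <vector>

#include "openvino/runtime/memory_solver.hpp"

int main() {
    // Box = {start, finish, size, id}: a live interval in execution order
    // plus the requested amount of memory (in abstract units).
    std::vector<ov::MemorySolver::Box> boxes{
        {0, 1, 2, 0},  // live over exec indexes [0, 1], needs 2 units
        {1, 2, 2, 1},
        {2, 3, 2, 2},
    };
    ov::MemorySolver solver(boxes);
    const int64_t total = solver.solve();          // size of the common memory blob
    const int64_t offset0 = solver.get_offset(0);  // per-box offset inside the blob
    (void)total;
    (void)offset0;
    return 0;
}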

View File

@ -2,33 +2,33 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "memory_solver.hpp"
#include "openvino/runtime/memory_solver.hpp"
#include <gtest/gtest.h>
#include <vector>
using Box = MemorySolver::Box;
using Box = ov::MemorySolver::Box;
TEST(MemSolverTest, CanConstruct) {
{ // Empty vector<Box>
MemorySolver ms(std::vector<Box>{});
ov::MemorySolver ms(std::vector<Box>{});
}
{ // vector with default Box
MemorySolver ms(std::vector<Box>{{}});
ov::MemorySolver ms(std::vector<Box>{{}});
}
{ // vector with Box with non-default Box
MemorySolver ms(std::vector<Box>{{1, 3, 3}});
ov::MemorySolver ms(std::vector<Box>{{1, 3, 3}});
}
{ // vector with Box with size == 0
MemorySolver ms(std::vector<Box>{{0, 0, 0}});
ov::MemorySolver ms(std::vector<Box>{{0, 0, 0}});
}
{ // vector with Box with finish == -1
MemorySolver ms(std::vector<Box>{{3, -1, 6}});
ov::MemorySolver ms(std::vector<Box>{{3, -1, 6}});
}
// TODO: enable after implementing the TODO from memory_solver.hpp#L66
@ -42,7 +42,7 @@ TEST(MemSolverTest, CanConstruct) {
// | __|____||____|
// |__|____||____|_____
// 0 1 2 3 4
TEST(MemSolverTest, GetOffset) {
TEST(MemSolverTest, get_offset) {
int n = 0;
std::vector<Box> boxes{
{n, ++n, 2, 0},
@ -51,13 +51,13 @@ TEST(MemSolverTest, GetOffset) {
{n, ++n, 2, 3},
};
MemorySolver ms(boxes);
ov::MemorySolver ms(boxes);
ms.solve();
// The correct answer is [0, 2, 0, 2] or [2, 0, 2, 0].
EXPECT_EQ(ms.getOffset(0) + ms.getOffset(1), 2);
EXPECT_EQ(ms.getOffset(1) + ms.getOffset(2), 2);
EXPECT_EQ(ms.getOffset(2) + ms.getOffset(3), 2);
EXPECT_EQ(ms.get_offset(0) + ms.get_offset(1), 2);
EXPECT_EQ(ms.get_offset(1) + ms.get_offset(2), 2);
EXPECT_EQ(ms.get_offset(2) + ms.get_offset(3), 2);
}
// |
@ -65,7 +65,7 @@ TEST(MemSolverTest, GetOffset) {
// | __|____||____|
// |__|____||____|_____
// 0 1 2 3 4
TEST(MemSolverTest, GetOffsetThrowException) {
TEST(MemSolverTest, get_offsetThrowException) {
int n = 0, id = 0;
std::vector<Box> boxes{
{n, ++n, 2, id++},
@ -74,10 +74,10 @@ TEST(MemSolverTest, GetOffsetThrowException) {
{n, ++n, 2, id++},
};
MemorySolver ms(boxes);
ov::MemorySolver ms(boxes);
ms.solve();
EXPECT_THROW(ms.getOffset(100), std::runtime_error);
EXPECT_THROW(ms.get_offset(100), std::runtime_error);
}
// |
@ -93,10 +93,10 @@ TEST(MemSolverTest, LinearAndEven) {
{n, ++n, 2},
};
MemorySolver ms(boxes);
ov::MemorySolver ms(boxes);
EXPECT_EQ(ms.solve(), 4);
EXPECT_EQ(ms.maxDepth(), 4);
EXPECT_EQ(ms.maxTopDepth(), 2);
EXPECT_EQ(ms.max_depth(), 4);
EXPECT_EQ(ms.max_top_depth(), 2);
}
// | ____
@ -112,10 +112,10 @@ TEST(MemSolverTest, LinearAndNotEven) {
{n, ++n, 3},
};
MemorySolver ms(boxes);
ov::MemorySolver ms(boxes);
EXPECT_EQ(ms.solve(), 5);
EXPECT_EQ(ms.maxDepth(), 5);
EXPECT_EQ(ms.maxTopDepth(), 2);
EXPECT_EQ(ms.max_depth(), 5);
EXPECT_EQ(ms.max_top_depth(), 2);
}
// | _______
@ -131,10 +131,10 @@ TEST(MemSolverTest, LinearWithEmptyExecIndexes) {
{n, n += 2, 3},
};
MemorySolver ms(boxes);
ov::MemorySolver ms(boxes);
EXPECT_EQ(ms.solve(), 5);
EXPECT_EQ(ms.maxDepth(), 5);
EXPECT_EQ(ms.maxTopDepth(), 2);
EXPECT_EQ(ms.max_depth(), 5);
EXPECT_EQ(ms.max_top_depth(), 2);
}
// | __________
@ -150,10 +150,10 @@ TEST(MemSolverTest, DISABLED_Unefficiency) {
{2, 3, 2},
};
MemorySolver ms(boxes);
ov::MemorySolver ms(boxes);
EXPECT_EQ(ms.solve(), 5); // currently the answer is 6
EXPECT_EQ(ms.maxDepth(), 5);
EXPECT_EQ(ms.maxTopDepth(), 2);
EXPECT_EQ(ms.max_depth(), 5);
EXPECT_EQ(ms.max_top_depth(), 2);
}
// | __________
@ -169,10 +169,10 @@ TEST(MemSolverTest, OverlappingBoxes) {
{2, 3, 2},
};
MemorySolver ms(boxes);
ov::MemorySolver ms(boxes);
EXPECT_EQ(ms.solve(), 6);
EXPECT_EQ(ms.maxDepth(), 6);
EXPECT_EQ(ms.maxTopDepth(), 2);
EXPECT_EQ(ms.max_depth(), 6);
EXPECT_EQ(ms.max_top_depth(), 2);
}
// | ____
@ -190,10 +190,10 @@ TEST(MemSolverTest, EndOnSeveralBegins) {
{3, 4, 2},
};
MemorySolver ms(boxes);
ov::MemorySolver ms(boxes);
EXPECT_EQ(ms.solve(), 6);
EXPECT_EQ(ms.maxDepth(), 6);
EXPECT_EQ(ms.maxTopDepth(), 3);
EXPECT_EQ(ms.max_depth(), 6);
EXPECT_EQ(ms.max_top_depth(), 3);
}
// | _____________
@ -211,10 +211,10 @@ TEST(MemSolverTest, ToEndBoxes) {
{3, 4, 2},
};
MemorySolver ms(boxes);
ov::MemorySolver ms(boxes);
EXPECT_EQ(ms.solve(), 8);
EXPECT_EQ(ms.maxDepth(), 8);
EXPECT_EQ(ms.maxTopDepth(), 4);
EXPECT_EQ(ms.max_depth(), 8);
EXPECT_EQ(ms.max_top_depth(), 4);
}
// | _
@ -232,10 +232,10 @@ TEST(MemSolverTest, LastAndToEndBox) {
{3, 4, 2},
};
MemorySolver ms(boxes);
ov::MemorySolver ms(boxes);
EXPECT_EQ(ms.solve(), 6);
EXPECT_EQ(ms.maxDepth(), 6);
EXPECT_EQ(ms.maxTopDepth(), 3);
EXPECT_EQ(ms.max_depth(), 6);
EXPECT_EQ(ms.max_top_depth(), 3);
}
TEST(MemSolverTest, OptimalAlexnet) {
@ -269,10 +269,10 @@ TEST(MemSolverTest, OptimalAlexnet) {
for (const auto& sh : shapes)
boxes.push_back({n, ++n, sh[0] * sh[1] * sh[2]});
// For linear topology bottom score is reachable minRequired == maxDepth
MemorySolver ms(boxes);
EXPECT_EQ(ms.solve(), ms.maxDepth());
EXPECT_EQ(ms.maxTopDepth(), 2);
// For a linear topology the lower bound is reachable: minRequired == max_depth
ov::MemorySolver ms(boxes);
EXPECT_EQ(ms.solve(), ms.max_depth());
EXPECT_EQ(ms.max_top_depth(), 2);
}
// | _____________
@ -290,14 +290,14 @@ TEST(MemSolverTest, NoOverlapping) {
{2, 4, 2, n++},
};
MemorySolver ms(boxes);
ov::MemorySolver ms(boxes);
ms.solve();
// TODO: The current algorithm doesn't solve this case. Uncomment the check to see the inefficiency
// EXPECT_EQ(ms.solve(), 5);
auto no_overlap = [&](Box box1, Box box2) -> bool {
int64_t off1 = ms.getOffset(static_cast<int>(box1.id));
int64_t off2 = ms.getOffset(static_cast<int>(box2.id));
int64_t off1 = ms.get_offset(static_cast<int>(box1.id));
int64_t off2 = ms.get_offset(static_cast<int>(box2.id));
return box1.finish < box2.start || box1.start > box2.finish || off1 + box1.size <= off2 ||
off1 >= off2 + box2.size;
};
@ -322,12 +322,12 @@ TEST(MemSolverTest, BestSolution1) {
{6, 7, 3, n++},
};
MemorySolver ms(boxes);
ov::MemorySolver ms(boxes);
EXPECT_EQ(ms.solve(), 5);
auto no_overlap = [&](Box box1, Box box2) -> bool {
int64_t off1 = ms.getOffset(static_cast<int>(box1.id));
int64_t off2 = ms.getOffset(static_cast<int>(box2.id));
int64_t off1 = ms.get_offset(static_cast<int>(box1.id));
int64_t off2 = ms.get_offset(static_cast<int>(box2.id));
return box1.finish < box2.start || box1.start > box2.finish || off1 + box1.size <= off2 ||
off1 >= off2 + box2.size;
};

View File

@ -28,7 +28,6 @@
#include "low_precision/low_precision.hpp"
#include "memory_desc/cpu_memory_desc_utils.h"
#include "memory_desc/dnnl_blocked_memory_desc.h"
#include "memory_solver.hpp"
#include "nodes/common/cpu_convert.h"
#include "nodes/common/cpu_memcpy.h"
#include "nodes/convert.h"
@ -50,6 +49,8 @@
#include "utils/verbose.h"
#include "memory_desc/cpu_memory_desc_utils.h"
#include "openvino/runtime/memory_solver.hpp"
#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
# include <tbb/task.h>
#endif
@ -629,10 +630,10 @@ void Graph::AllocateWithReuse() {
const int64_t alignment = 32; // 32 bytes
// Markup the boxes
std::vector<MemorySolver::Box> definedBoxes;
std::vector<MemorySolver::Box> undefinedBoxes;
std::vector<ov::MemorySolver::Box> definedBoxes;
std::vector<ov::MemorySolver::Box> undefinedBoxes;
for (size_t i = 0; i < remaining_edge_clusters_count; i++) {
MemorySolver::Box box = { std::numeric_limits<int>::max(), 0, 0, static_cast<int64_t>(i) };
ov::MemorySolver::Box box = { std::numeric_limits<int>::max(), 0, 0, static_cast<int64_t>(i) };
int64_t boxSize = 0;
for (auto &edge : edge_clusters[i]) {
int e_start = edge->getParent()->execIndex;
@ -679,7 +680,7 @@ void Graph::AllocateWithReuse() {
}
// Process defined boxes (static shapes)
MemorySolver staticMemSolver(definedBoxes);
ov::MemorySolver staticMemSolver(definedBoxes);
size_t total_size = static_cast<size_t>(staticMemSolver.solve()) * alignment;
memWorkspace = std::make_shared<Memory>(getEngine(), DnnlBlockedMemoryDesc(ov::element::i8, Shape(VectorDims{total_size})));
@ -693,7 +694,7 @@ void Graph::AllocateWithReuse() {
int count = 0;
for (auto& edge : edge_clusters[box.id]) {
if (edge->getStatus() == Edge::Status::NeedAllocation) {
int64_t offset = staticMemSolver.getOffset(box.id);
int64_t offset = staticMemSolver.get_offset(box.id);
// !! Fallback to individual memory allocation !!
// If you would like to check inference without reuse, just call this function without arguments.
edge->allocate(workspace_ptr + offset * alignment); // alignment in byte
@ -762,9 +763,9 @@ void Graph::AllocateWithReuse() {
}
}
MemorySolver::normalizeBoxes(undefinedBoxes);
ov::MemorySolver::normalize_boxes(undefinedBoxes);
std::vector<std::vector<MemorySolver::Box>> groups; //groups of nonoverlapping boxes
std::vector<std::vector<ov::MemorySolver::Box>> groups; // groups of non-overlapping boxes
constexpr bool enableMemReuse = true; // set false to disable mem reuse for debug purposes
if (enableMemReuse) {
groups.push_back({undefinedBoxes.front()});

View File

@ -22,6 +22,20 @@ using LoopPort = LoopManager::LoopPort;
BrgemmBlocking::BrgemmBlocking() : Pass() {}
void BrgemmBlocking::move_new_memory_buffer(snippets::lowered::LinearIR& linear_ir, const snippets::lowered::LinearIR::constExprIt& brgemm_it) {
const auto& brgemm_expr = brgemm_it->get();
const auto wsp_expr = brgemm_expr->get_input_port_connector(2)->get_source().get_expr();
const auto wsp_buffer = ov::as_type_ptr<ov::snippets::op::Buffer>(wsp_expr->get_node());
OPENVINO_ASSERT(wsp_buffer && wsp_buffer->is_new_memory(), "Incorrect Scratchpad buffer for Brgemm AMX");
// [115164] Should be fully supported by explicit blocking loops over K and N
OPENVINO_ASSERT(brgemm_expr->get_loop_ids().empty() && wsp_expr->get_loop_ids().empty(), "Incorrect blocking loop marking for Brgemm AMX");
// If the scratchpad with temporary memory is not directly before Brgemm, it needs to be moved there.
if (wsp_expr != *std::prev(brgemm_it)) {
const auto wsp_it = linear_ir.find(wsp_expr);
linear_ir.move(wsp_it, brgemm_it);
}
}
bool BrgemmBlocking::run(snippets::lowered::LinearIR& linear_ir) {
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::BrgemmBlocking")
if (linear_ir.empty())
@ -64,11 +78,18 @@ bool BrgemmBlocking::run(snippets::lowered::LinearIR& linear_ir) {
const auto work_amount = m;
const auto increment = block_size;
auto loop_begin_it = expr_it, loop_end_it = std::next(expr_it);
std::vector<LoopPort> entries{LoopPort(expr->get_input_port(0), true), LoopPort(expr->get_input_port(1), false)};
if (brgemm->is_with_scratchpad())
// In the AMX scenario the scratchpad is needed only as a temporary buffer for each M block, which means the Buffer should be inside this loop.
// Other scratchpads (those after BrgemmCopyB) should stay outside the loop.
if (brgemm->is_with_compensations()) {
entries.emplace_back(expr->get_input_port(2), false);
} else if (brgemm->is_amx()) {
move_new_memory_buffer(linear_ir, expr_it);
loop_begin_it = std::prev(expr_it);
}
std::vector<LoopPort> exits{LoopPort(expr->get_output_port(0), true)};
loop_manager->mark_loop(expr_it, std::next(expr_it), work_amount, increment, dim_idx, entries, exits);
loop_manager->mark_loop(loop_begin_it, loop_end_it, work_amount, increment, dim_idx, entries, exits);
}
return modified;
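Schematically, the effect of the AMX branch is that the new-memory scratchpad is pulled inside the M-blocking loop, while compensation scratchpads remain plain loop entry ports (a hedged illustration, not an actual IR dump):

// Before:  ... -> NewMemoryBuffer(wsp) -> ... -> BrgemmCPU -> ...
// After:   ... -> [ LoopBegin(M) -> NewMemoryBuffer(wsp) -> BrgemmCPU -> LoopEnd(M) ] -> ...
// The wsp Buffer is moved directly before Brgemm (move_new_memory_buffer) and
// loop_begin_it is shifted to std::prev(expr_it), so the temporary scratchpad
// is covered by the M-blocking loop instead of living outside it.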

View File

@ -21,6 +21,9 @@ public:
OPENVINO_RTTI("BrgemmBlocking", "Pass")
BrgemmBlocking();
bool run(snippets::lowered::LinearIR& linear_ir) override;
private:
static void move_new_memory_buffer(snippets::lowered::LinearIR& linear_ir, const snippets::lowered::LinearIR::constExprIt& brgemm_it);
};
} // namespace pass

View File

@ -0,0 +1,216 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "openvino/opsets/opset.hpp"
#include "snippets/snippets_isa.hpp"
#include "snippets/lowered/linear_ir.hpp"
#include "snippets/lowered/pass/mark_loops.hpp"
#include "snippets/lowered/pass/init_loops.hpp"
#include "snippets/lowered/pass/insert_load_store.hpp"
#include "snippets/lowered/pass/validate_loops.hpp"
#include "snippets/lowered/pass/insert_loops.hpp"
#include "snippets/lowered/pass/allocate_buffers.hpp"
#include "snippets/lowered/pass/fuse_loops.hpp"
#include "snippets/lowered/pass/split_loops.hpp"
#include "snippets/lowered/pass/insert_buffers.hpp"
#include "snippets/lowered/pass/softmax_decomposition.hpp"
#include "transformations/snippets/x64/shape_inference.hpp"
#include "transformations/snippets/x64/pass/lowered/brgemm_blocking.hpp"
#include "transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.hpp"
#include "transformations/snippets/x64/op/brgemm_cpu.hpp"
#include "transformations/snippets/x64/op/brgemm_copy_b.hpp"
#include "common_test_utils/ov_test_utils.hpp"
#include "common_test_utils/common_utils.hpp"
namespace ov {
namespace test {
namespace snippets {
/* Note[74841]:
* This test is an almost full copy of the BufferAllocationTest class from openvino/src/common/snippets/tests/include/lowered/pass/buffer_allocation.hpp.
* The BufferAllocationTest class should become a shared test class so this structure can be reused in backend-specific tests during the test infrastructure refactoring.
*/
typedef std::tuple<
bool, // Optimized pipeline
bool, // With SplitLoops opt
size_t, // Expected Buffer size in bytes
size_t // Expected unique Buffer IDs count
> BufferAllocationCPUParams;
class BufferAllocationCPUTest : public testing::TestWithParam<BufferAllocationCPUParams> {
public:
using VectorDims = ov::snippets::VectorDims;
static std::string getTestCaseName(testing::TestParamInfo<BufferAllocationCPUParams> obj) {
bool is_optimized, with_split_loops;
size_t expected_size, expected_count;
std::tie(is_optimized, with_split_loops, expected_size, expected_count) = obj.param;
std::ostringstream result;
result << "Opt=" << ov::test::utils::bool2str(is_optimized) << "_";
result << "Split=" << ov::test::utils::bool2str(with_split_loops) << "_";
result << "ExpBufferSize=" << expected_size << "_";
result << "ExpBufferNum=" << expected_count;
return result.str();
}
protected:
void SetUp() override {
bool is_optimized, with_split_loops;
std::tie(is_optimized, with_split_loops, m_expected_size, m_expected_count) = this->GetParam();
const auto body = GetModel();
m_linear_ir = ov::snippets::lowered::LinearIR(body, std::make_shared<ov::snippets::CPUShapeInferSnippetsFactory>());
m_linear_ir.set_loop_depth(m_loop_depth);
ApplyTransformations(is_optimized, with_split_loops);
}
void ApplyTransformations(bool is_optimized, bool with_split_loops) {
ov::snippets::lowered::pass::PassPipeline pipeline;
pipeline.register_pass<ov::intel_cpu::pass::BrgemmBlocking>();
pipeline.register_pass<ov::snippets::lowered::pass::MarkLoops>(m_vector_size);
pipeline.register_pass<ov::snippets::lowered::pass::SoftmaxDecomposition>(m_vector_size);
pipeline.register_pass<ov::snippets::lowered::pass::FuseLoops>();
if (with_split_loops)
pipeline.register_pass<ov::snippets::lowered::pass::SplitLoops>();
pipeline.register_pass<ov::snippets::lowered::pass::InsertBuffers>(2);
pipeline.register_pass<ov::snippets::lowered::pass::InsertLoadStore>(m_vector_size);
pipeline.register_pass<ov::snippets::lowered::pass::InitLoops>();
pipeline.register_pass<ov::snippets::lowered::pass::InsertLoops>();
pipeline.register_pass<ov::intel_cpu::pass::SetBrgemmCopyBBuffersShape>();
pipeline.register_pass<ov::snippets::lowered::pass::AllocateBuffers>(m_buffer_scratchpad, is_optimized);
pipeline.run(m_linear_ir);
}
void Validate() {
std::set<size_t> gprs;
for (const auto& expr : m_linear_ir) {
if (const auto buffer = ov::as_type_ptr<ov::snippets::op::Buffer>(expr->get_node())) {
gprs.insert(buffer->get_id());
}
}
EXPECT_EQ(gprs.size(), m_expected_count);
EXPECT_EQ(m_buffer_scratchpad, m_expected_size);
}
virtual std::shared_ptr<ov::Model> GetModel() const = 0;
void MarkOp(const std::shared_ptr<ov::Node>& node, const std::vector<size_t>& subtensor) const {
for (const auto& input : node->inputs())
ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
input, std::make_shared<ov::snippets::lowered::PortDescriptor>(input, subtensor));
for (const auto& output : node->outputs())
ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
output, std::make_shared<ov::snippets::lowered::PortDescriptor>(output, subtensor));
}
size_t m_buffer_scratchpad = 0;
ov::snippets::lowered::LinearIR m_linear_ir;
size_t m_expected_size = 0;
size_t m_expected_count = 0;
size_t m_loop_depth = 2;
size_t m_vector_size = 16;
};
class MHABF16AMXBufferAllocationTest : public BufferAllocationCPUTest {
protected:
std::shared_ptr<ov::Model> GetModel() const override {
const auto subtensor_scalar = std::vector<size_t>{1, 1};
const auto subtensor_softmax = std::vector<size_t>{1, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM};
const auto subtensor_full = std::vector<size_t>(2, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM);
const auto parameter0 = std::make_shared<ov::op::v0::Parameter>(ov::element::bf16, ov::PartialShape({1, 12, 128, 64}));
const auto parameter1 = std::make_shared<ov::op::v0::Parameter>(ov::element::bf16, ov::PartialShape({1, 128, 12, 64}));
const auto parameter2 = std::make_shared<ov::op::v0::Parameter>(ov::element::bf16, ov::PartialShape({1, 12, 128, 64}));
const auto load_reshape = std::make_shared<ov::snippets::op::LoadReshape>(parameter1, 1, 0, std::vector<size_t>{0, 2, 3, 1});
const auto store = std::make_shared<ov::snippets::op::Store>(load_reshape);
const auto convert0 = std::make_shared<ov::snippets::op::ConvertSaturation>(store, ov::element::f32);
const auto relu0 = std::make_shared<ov::op::v0::Relu>(convert0);
const auto convert1 = std::make_shared<ov::snippets::op::ConvertSaturation>(relu0, ov::element::bf16);
const auto brgemm_copyb0 = std::make_shared<ov::intel_cpu::BrgemmCopyB>(
convert1, ov::element::bf16, ov::intel_cpu::BrgemmCopyB::OnlyRepacking, 0, 0, 0);
const auto scratch0 = std::make_shared<ov::snippets::op::Buffer>(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE});
const auto brgemm_cpu0 = std::make_shared<ov::intel_cpu::BrgemmCPU>(
parameter0, brgemm_copyb0->output(0), scratch0, ov::intel_cpu::BrgemmCPU::Type::AMX);
brgemm_cpu0->set_m_block_size(32);
const auto relu1 = std::make_shared<ov::op::v0::Relu>(brgemm_cpu0);
const auto softmax = std::make_shared<ov::op::v1::Softmax>(relu1, 3);
const auto convert2 = std::make_shared<ov::snippets::op::ConvertSaturation>(softmax, ov::element::bf16);
const auto brgemm_copyb1 = std::make_shared<ov::intel_cpu::BrgemmCopyB>(
parameter2, ov::element::bf16, ov::intel_cpu::BrgemmCopyB::OnlyRepacking, 0, 0, 0);
const auto scratch1 = std::make_shared<ov::snippets::op::Buffer>(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE});
const auto brgemm_cpu1 = std::make_shared<ov::intel_cpu::BrgemmCPU>(
convert2, brgemm_copyb1->output(0), scratch1, ov::intel_cpu::BrgemmCPU::Type::AMX);
brgemm_cpu1->set_m_block_size(32);
const auto relu2 = std::make_shared<ov::op::v0::Relu>(brgemm_cpu1);
const auto body = std::make_shared<ov::Model>(std::make_shared<ov::op::v0::Result>(relu2), ov::ParameterVector{parameter0, parameter1, parameter2});
MarkOp(load_reshape, subtensor_scalar);
MarkOp(store, subtensor_scalar);
MarkOp(softmax, subtensor_softmax);
MarkOp(brgemm_cpu0, subtensor_full);
MarkOp(brgemm_cpu1, subtensor_full);
MarkOp(brgemm_copyb0, subtensor_full);
MarkOp(brgemm_copyb1, subtensor_full);
MarkOp(scratch0, subtensor_full);
MarkOp(scratch1, subtensor_full);
return body;
}
};
TEST_P(MHABF16AMXBufferAllocationTest, BufferAllocationCPU) {
Validate();
}
namespace BufferAllocationCPUTest_Instances {
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHABF16AMXNotOptimizedWSplit, MHABF16AMXBufferAllocationTest,
::testing::Combine(
::testing::Values(false),
::testing::Values(true),
::testing::Values(196608),
::testing::Values(11)),
BufferAllocationCPUTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWSplit, MHABF16AMXBufferAllocationTest,
::testing::Combine(
::testing::Values(true),
::testing::Values(true),
::testing::Values(90112),
::testing::Values(4)),
BufferAllocationCPUTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWOSplit, MHABF16AMXBufferAllocationTest,
::testing::Combine(
::testing::Values(false),
::testing::Values(false),
::testing::Values(393216),
::testing::Values(11)),
BufferAllocationCPUTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWOSplit, MHABF16AMXBufferAllocationTest,
::testing::Combine(
::testing::Values(true),
::testing::Values(false),
::testing::Values(114688),
::testing::Values(4)),
BufferAllocationCPUTest::getTestCaseName);
} // namespace BufferAllocationCPUTest_Instances
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -16,7 +16,7 @@
#include "gna_lib_ver_selector.hpp"
#include "gna_mem_requests.hpp"
#include "log/log.hpp"
#include "memory_solver.hpp"
#include "openvino/runtime/memory_solver.hpp"
using namespace ov::intel_gna;
@ -239,7 +239,7 @@ public:
size_t calcSize(bool isCompact = false) override {
if (isCompact) {
_size = 0;
std::vector<MemorySolver::Box> boxes;
std::vector<ov::MemorySolver::Box> boxes;
for (size_t i = 0; i < _mem_requests.size(); ++i) {
// skipping BIND, cross-region and empty requests
if (_mem_requests[i]._type & REQUEST_BIND || _mem_requests[i]._ptr_out == nullptr) {
@ -255,12 +255,12 @@ public:
boxes.push_back({start, stop, static_cast<int64_t>(original_with_pad), static_cast<int64_t>(i)});
}
MemorySolver memSolver(boxes);
ov::MemorySolver memSolver(boxes);
_size = memSolver.solve();
// setting offsets
for (auto const& box : boxes) {
_mem_requests[box.id]._offset = memSolver.getOffset(static_cast<int>(box.id));
_mem_requests[box.id]._offset = memSolver.get_offset(static_cast<int>(box.id));
}
return _size;
} else {

View File

@ -22,7 +22,7 @@
#include "gna_mem_requests_queue.hpp"
#include "log/log.hpp"
#include "memory/gna_allocator.hpp"
#include "memory_solver.hpp"
#include "openvino/runtime/memory_solver.hpp"
#ifdef GNA_MEMORY_DUMP
# include <iomanip>