[Snippets] Refactored work with Buffers (#19644)
Squashed commit history:

* [Snippets] BufferManager is no longer derived from PassPipeline
* [Snippets] Added MemorySolver support
* [Snippets] Made it a static class
* [Snippets] Added one-level inplace support
* [Snippets] Added optimization bits
* [Snippets] Small cosmetic fixes
* [Snippets] Renamed to BufferSolver
* [Snippets] Refactored
* [Snippets] Fixed IdentifyBuffers
* [Snippets] Added inplace multi + identify buffers
* [Snippets] Made it a common pass
* [Snippets] Added PassPipeline::get_pass<>()
* [Snippets] Added comments and briefs, refactored some code
* [Snippets] Fixed Windows build
* [Snippets] Disallowed the same Buffer ID for multi-level Buffers
* [Snippets] Moved CleanupRepeatedPtrShifts to the common pipeline
* [Snippets] Made IdentifyBuffers::ShiftPtrParams
* [Snippets] Fixed window sliding mode
* [Snippets] Refactored nested clusters
* [Snippets] Added normalized buffer regs
* [Snippets] Disallowed the same ID for nested Buffers in IdentifyBuffers
* [Snippets] Fixed DefineBufferClusters::are_buffer_neighbours::find
* [Snippets] Removed a useless method from InitLoops
* [Snippets] Fixed CC build
* [Snippets] Applied Ivan's comments
* [Snippets] Applied Ivan's comment: refactored pass classes
* [Snippets] Applied Vladislav's comments
* [Snippets] Applied Ivan's comments 2
* [Runtime] Moved MemorySolver to API 2.0
* [Snippets] Created common buffer allocation pass AllocateBuffers
* [Snippets][Tests] Added InplaceEltwise unit test
* [Snippets] Fixed NormalizeBufferIDs
* [Snippets][CPU] Fixed BrgemmBlocking lowered pass: moved the wsp for AMX to brgemm
* [Snippets][CPU][Tests] Covered AMX MHA buffer allocation with unit tests
Parent: 6ab5ef72d5
Commit: df03b0437a
src/common/snippets/include/snippets/lowered/linear_ir.hpp
@@ -29,6 +29,9 @@ public:
     // Minimal advised work amount that should be processed during one call of the executable produced by Subgraph::generate
     // Set by a backend, should be large enough to compensate for the kernel call overheads
     size_t m_min_kernel_work_amount = 256;
+    // True if the Buffer scratchpad size of LinearIR will be optimized (all possible optimizations will be activated)
+    // False if all Buffers will have unique IDs and offsets in the Linear IR
+    bool m_are_buffers_optimized = true;
 };

 /* The control flow of Snippets is built on Linear Intermediate Representation (Linear IR).
src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp
@@ -5,7 +5,6 @@
 #pragma once

 #include "pass.hpp"
-#include "snippets/snippets_isa.hpp"

 namespace ov {
 namespace snippets {
@@ -14,26 +13,40 @@ namespace pass {

 /**
  * @interface AllocateBuffers
- * @brief The pass calculates common size of buffer scratchpad and propagates Buffer offsets to connected MemoryAccess operations.
- *        Notes:
- *          - The pass implicitly regulates InPlace processing for some Buffers when it's possible.
- *            The pass don't allocate new memory for InPlace Buffers, we propagate the same offsets for them.
- *          - The pass should be splitted into two passes: ProcessInplace (markup of Buffers which can use the same memory)
- *            and AllocateBuffer (allocate memory for Buffers using MemorySolver which can optimally reuse memory).
+ * @brief The pass allocates common memory for all Buffers.
+ *        There are two modes: default and optimized allocation. The default (non-optimized) mode sets unique offsets and IDs to Buffers.
+ *        The optimized mode allocates memory for Buffer ops using the following optimizations:
+ *         - MemorySolver: solves the problem of optimal memory allocation;
+ *         - InPlace: Loop or MemoryAccess ops read from and store data to the same memory if possible;
+ *         - Reusing Buffer IDs: Buffers share the same ID (GPR) when they aren't connected or have the same data ptr shifts.
+ *        Note: All Buffers are related to each other and represent the common buffer scratchpad of a Subgraph.
+ *              The buffer scratchpad has one general data pointer. Each Buffer has an offset relative to this data pointer.
  * @ingroup snippets
  */
-class AllocateBuffers : public Pass {
+class AllocateBuffers: public Pass {
 public:
     OPENVINO_RTTI("AllocateBuffers", "Pass")
-    bool run(lowered::LinearIR& linear_ir) override;
+    AllocateBuffers(size_t& buffer_scratchpad_size, bool is_optimized = true);

-    size_t get_scratchpad_size() const { return m_buffer_scratchpad_size; }
+    /**
+     * @brief Apply the pass to the Linear IR
+     * @param linear_ir the target Linear IR
+     * @return status of the pass
+     */
+    bool run(LinearIR& linear_ir) override;
+
+    /**
+     * @brief Sets the offset to the Buffer op and propagates it to the connected memory access ops
+     * @param buffer_expr expression with Buffer op
+     * @param offset offset in the common buffer scratchpad
+     */
+    static void set_buffer_offset(const ExpressionPtr& buffer_expr, const size_t offset);
+
+    using BufferCluster = std::set<ExpressionPtr>;
+    using BufferClusters = std::vector<BufferCluster>;
 private:
-    static void propagate_offset(const LinearIR& linear_ir, const ExpressionPtr& buffer_expr, size_t offset);
-
-    size_t m_buffer_scratchpad_size = 0;
+    size_t& m_buffer_scratchpad_size;
+    bool m_is_optimized_mode = true;
 };

 } // namespace pass
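For orientation, here is a minimal sketch of how a backend might drive the reworked pass (the declarations follow allocate_buffers.hpp above; the surrounding wiring, including the `linear_ir` variable, is assumed for illustration):

```cpp
#include "snippets/lowered/pass/allocate_buffers.hpp"

// Sketch only: `linear_ir` is assumed to be a lowered::LinearIR produced by a Subgraph.
size_t scratchpad_size = 0;  // filled by the pass through the reference
ov::snippets::lowered::pass::AllocateBuffers allocate_buffers(scratchpad_size, /*is_optimized=*/true);
allocate_buffers.run(linear_ir);
// scratchpad_size now holds the common buffer scratchpad size the backend must allocate
```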
src/common/snippets/include/snippets/lowered/pass/define_buffer_clusters.hpp (new file)
@@ -0,0 +1,138 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "pass.hpp"

#include "allocate_buffers.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

/**
 * @interface DefineBufferClusters
 * @brief The pass defines buffer clusters. The Buffers from one cluster share the same memory
 *        (have the same offset relative to the data pointer of the buffer scratchpad).
 *         - If a MemoryAccess op or a Loop can read and write to the same memory (inplace behavior), the Buffers should be in one cluster.
 *         - If a Buffer is inside a Loop which reads or writes from/to other Buffers, this Buffer can emulate `window` sliding.
 *           It means that the Buffer inside can reuse memory of the Buffers outside within the bounds of the full Loop work.
 *           Demonstration:
 *                                |-----------------------------------------------------|
 *                                |  |------------|                      |------------| |    InnerLoops have work amount 128
 *           Buffer0 [3x128] ->   |  | InnerLoop0 | -> Buffer1 [3x128] -> | InnerLoop1 | | -> Buffer2 [3x128]
 *                                |  |------------|       OuterLoop      |------------| |    OuterLoop has work amount 3
 *                                |-----------------------------------------------------|
 *           Buffer1 can reuse the memory [128] of Buffer0 or Buffer2 in each iteration of the OuterLoop
 *        Note: The pass requires expression enumeration and buffer identification (for inplace nested Buffers).
 *              These passes should be executed separately before this pass!
 * @ingroup snippets
 */
class DefineBufferClusters : public Pass {
public:
    OPENVINO_RTTI("DefineBufferClusters", "Pass")

    DefineBufferClusters(AllocateBuffers::BufferClusters& clusters) : m_clusters(clusters) {}

    /**
     * @brief Apply the pass to the Linear IR
     * @param linear_ir the target Linear IR
     * @return status of the pass
     */
    bool run(lowered::LinearIR& linear_ir) override;

private:
    using BufferPorts = std::unordered_map<ExpressionPtr, std::set<size_t>>;
    /**
     * @brief Finds the buffer cluster in the set of clusters which contains the target expression with Buffer
     * @param target target expression with Buffer op
     * @return vector iterator which refers to the found cluster
     */
    AllocateBuffers::BufferClusters::iterator find_cluster_by_expr(const ExpressionPtr& target);
    /**
     * @brief Returns True if the Buffer is a direct source for the target expr (there are no other Loops between the Buffer and the target expr)
     * @param buffer_expr expression with assumed Buffer op
     * @param target_expr expression with target op - LoopEnd or MemoryAccess op
     * @return boolean value
     */
    bool is_direct_buffer(const ExpressionPtr& buffer_expr, const ExpressionPtr& target_expr) const;
    /**
     * @brief Creates a new buffer cluster if buffer_expr is missing in the clusters. If buffer_expr is already in the clusters, does nothing
     * @param buffer_expr expression with Buffer op
     */
    void create_new_cluster(const ExpressionPtr& buffer_expr);
    /**
     * @brief Returns the common ID of the cluster if all Buffers inside have the same Buffer ID. Otherwise returns the default value SIZE_MAX,
     *        meaning that the Buffers in the cluster have different IDs.
     * @param cluster set of Buffer expressions - cluster
     * @return common buffer ID or SIZE_MAX
     */
    size_t get_cluster_buffer_id(const AllocateBuffers::BufferCluster& cluster) const;

    /**
     * @brief Analyzes a Loop: if the Loop has Buffer ops on its inputs and outputs, the Loop can read and write from/to the same memory.
     * @param expr_it iterator of the Linear IR which refers to the expression with LoopEnd
     */
    void parse_loop(const LinearIR::constExprIt& expr_it);
    /**
     * @brief Analyzes a full MemoryAccess op: if the op has Buffer ops on I/O, the op can read and write from/to the same memory.
     * @param expr expression with full MemoryAccess op
     */
    void parse_memory_access_op(const ExpressionPtr& expr);
    /**
     * @brief Gets the input buffers of a Loop
     * @param loop_expr expression with LoopEnd op
     * @return unordered map [Expression -> set of input ports] which represents the input Buffers of the Loop
     */
    BufferPorts get_input_buffers(const ExpressionPtr& loop_expr) const;
    /**
     * @brief Gets the output buffers of a Loop
     * @param loop_expr expression with LoopEnd op
     * @return unordered map [Expression -> set of output ports] which represents the output Buffers of the Loop
     */
    BufferPorts get_output_buffers(const ExpressionPtr& loop_expr) const;
    /**
     * @brief Analyzes nested Loops: unites nested buffer clusters if they can reproduce `window` sliding
     * @param input_buffers unordered map [Expression -> set of input ports] which represents the input Buffers of the Loop
     * @param output_buffers unordered map [Expression -> set of output ports (one)] which represents the output Buffers of the Loop
     * @param outer_loop_end_expr_it iterator of the Linear IR which refers to the expression with the outer LoopEnd
     */
    void parse_nested_loops(const BufferPorts& input_buffers, const BufferPorts& output_buffers, const LinearIR::constExprIt& outer_loop_end_expr_it);
    /**
     * @brief Finds the last Loop connected to the target Buffer and returns the corresponding finalization offset
     * @param buffer_expr expression with Buffer op
     * @return finalization offset - int64_t value
     */
    int64_t get_buffer_finalization_offset(const ExpressionPtr& buffer_expr) const;
    /**
     * @brief Checks if two Buffer expressions are connected to the same Loop. Sets the common LoopEnd as the `loop` parameter and
     *        the Loop port indexes `up_idx` and `down_idx` if the Buffers really are neighbours
     * @param up expression with upper Buffer op
     * @param down expression with lower Buffer op
     * @param loop expression with common LoopEnd op
     * @param up_idx the reference to the port index of the upper Buffer op in the Loop
     * @param down_idx the reference to the port index of the lower Buffer op in the Loop
     * @return True if the Buffers are connected to the same Loop
     */
    static bool are_buffer_neighbours(const ExpressionPtr& up, const ExpressionPtr& down, ExpressionPtr& loop, size_t& up_idx, size_t& down_idx);
    /**
     * @brief Unites clusters
     * @param inner_cluster_it iterator to the inner cluster - the buffer cluster inside the Loop
     * @param outer_cluster buffer cluster with buffers outside the Loop
     * @param outer_buffer target Buffer from outer_cluster
     * @param is_outer_up true if the outer buffer is higher in the Linear IR than the inner Buffers
     * @return True if the clusters have been united
     */
    bool unite_nested_clusters(const AllocateBuffers::BufferClusters::iterator& inner_cluster_it, AllocateBuffers::BufferCluster& outer_cluster,
                               const ExpressionPtr& outer_buffer, bool is_outer_up);

    AllocateBuffers::BufferClusters& m_clusters;
};

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
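Plugging the numbers from the demonstration above into the `window` sliding condition gives a worked example (assuming fp32 data; the condition itself appears later in this diff, in DefineBufferClusters::parse_nested_loops):

```cpp
// Hypothetical values for Buffer1 [3x128] reusing Buffer0's memory row by row:
const int64_t inner_buffer_data_size = 4;        // fp32 element size of the inner Buffer
const int64_t inner_buffer_final_offset = -128;  // the last inner Loop rewinds by 128 elements
const int64_t outer_buffer_ptr_increment = 128;  // the outer Loop advances Buffer0 by one row
const int64_t outer_buffer_data_size = 4;        // fp32 element size of the outer Buffer

const bool window_sliding =
    (outer_buffer_ptr_increment != 0) &&
    (inner_buffer_data_size * inner_buffer_final_offset * -1 ==
     outer_buffer_ptr_increment * outer_buffer_data_size);
// 4 * (-128) * -1 == 128 * 4  =>  512 == 512: Buffer1 may share Buffer0's memory
```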
src/common/snippets/include/snippets/lowered/pass/enumerate_expressions.hpp (new file)
@@ -0,0 +1,28 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "pass.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

/**
 * @interface EnumerateExpressions
 * @brief The pass enumerates expressions by execution order
 * @ingroup snippets
 */
class EnumerateExpressions : public Pass {
public:
    OPENVINO_RTTI("EnumerateExpressions", "Pass")
    bool run(LinearIR& linear_ir) override;
};

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
src/common/snippets/include/snippets/lowered/pass/identify_buffers.hpp
@@ -6,8 +6,6 @@

 #include "pass.hpp"

-#include "snippets/op/buffer.hpp"
-
 namespace ov {
 namespace snippets {
 namespace lowered {
@@ -22,7 +20,8 @@ namespace pass {
  *        - Loops, Brgemm (and other such ops) are "edges" between Buffers (hubs of edges).
  *          The buffers that are connected to the same Loop are adjacent in the graph sense.
  *        - The vertices (buffers) are adjacent if they are connected to the same Loop and
- *          their data pointers cannot be proportionally incremented in Loops: different ptr increments or data sizes;
+ *          their data pointers cannot be proportionally incremented in Loops: different ptr increments or data sizes -
+ *          or one of the Buffers is inside a Loop while the other Buffer is not;
  *        - Firstly, create the adjacency matrix using the definition above;
  *        - Secondly, assign the same color to non-adjacent vertices of the graph (buffers), and use different colors otherwise.
  *        Note: should be called before the ResetBuffer() pass to have correct offsets
@@ -33,13 +32,79 @@ public:
     OPENVINO_RTTI("IdentifyBuffers", "Pass")
     IdentifyBuffers() = default;

+    /**
+     * @brief Apply the pass to the Linear IR
+     * @param linear_ir the target Linear IR
+     * @return status of the pass
+     */
     bool run(LinearIR& linear_ir) override;

-private:
-    using BufferSet = std::vector<std::shared_ptr<op::Buffer>>;
+    struct ShiftPtrParams {
+        ShiftPtrParams() = default;
+        ShiftPtrParams(int64_t ds, int64_t pi, int64_t fo) : data_size(ds), ptr_increment(pi), finalization_offset(fo) {}
+        int64_t data_size = 0;
+        int64_t ptr_increment = 0;
+        int64_t finalization_offset = 0;

-    std::vector<bool> create_adjacency_matrix(const LinearIR& linear_ir, const BufferSet& buffers) const;
-    std::map<size_t, BufferSet> coloring(BufferSet& buffers, std::vector<bool>& adj);
+        friend bool operator==(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs);
+        friend bool operator!=(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs);
+    };
+
+    /**
+     * @brief Checks if two Buffers can reuse an ID by ShiftPtrParams < data_size, ptr_increment, finalization_offset >
+     * @param lhs data pointer shift params for the first Buffer
+     * @param rhs data pointer shift params for the second Buffer
+     * @return True if the params are valid for reuse, otherwise False
+     */
+    static bool can_reuse_id(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs);
+
+private:
+    using BufferPool = std::vector<ExpressionPtr>;
+
+    /**
+     * @brief Gets the Buffer index in the Buffer pool
+     * @param target the target Buffer expression
+     * @param pool set of Buffers from the Linear IR
+     * @return index of the target Buffer expression in the pool
+     */
+    static size_t get_buffer_idx(const ExpressionPtr& target, const BufferPool& pool);
+    /**
+     * @brief Creates the adjacency matrix for the Buffer system. See the comment in the method for more details.
+     * @param linear_ir the target Linear IR
+     * @param pool set of Buffers from the Linear IR
+     * @return adjacency matrix where a True value means that the Buffers are adjacent and cannot have the same ID
+     */
+    static std::vector<bool> create_adjacency_matrix(const LinearIR& linear_ir, const BufferPool& pool);
+    /**
+     * @brief Graph coloring algorithm where the vertices are Buffers
+     * @param buffers set of Buffers from the Linear IR
+     * @param adj adjacency matrix
+     * @return map [color id -> Buffer set]
+     */
+    static std::map<size_t, BufferPool> coloring(BufferPool& buffers, std::vector<bool>& adj);
+    /**
+     * @brief Updates the adjacency matrix:
+     *         - If the Buffers are from the same Loops, are connected to the same Loop and have non-proportional ptr shift params
+     *           for this Loop, the Buffers are adjacent - the matrix value is set to True;
+     *         - If one Buffer is inside a Loop while the other Buffer is connected to this Loop and has non-zero data shift params,
+     *           the Buffers are adjacent - the matrix value is set to True;
+     * @param lhs pair where the first value is the Expression with the first Buffer and the second value is its data pointer shift params
+     * @param rhs pair where the first value is the Expression with the second Buffer and the second value is its data pointer shift params
+     * @param buffers set of Buffers from the Linear IR
+     * @param adj target adjacency matrix
+     */
+    static void update_adj_matrix(const std::pair<ExpressionPtr, ShiftPtrParams>& lhs,
+                                  const std::pair<ExpressionPtr, ShiftPtrParams>& rhs,
+                                  const BufferPool& buffers,
+                                  std::vector<bool>& adj);
+    /**
+     * @brief Checks if two Buffers are adjacent and cannot have the same ID
+     * @param lhs pair where the first value is the Expression with the first Buffer and the second value is its data pointer shift params
+     * @param rhs pair where the first value is the Expression with the second Buffer and the second value is its data pointer shift params
+     * @return True if they are adjacent, otherwise False
+     */
+    static bool are_adjacent(const std::pair<ExpressionPtr, ShiftPtrParams>& lhs,
+                             const std::pair<ExpressionPtr, ShiftPtrParams>& rhs);
 };

 } // namespace pass
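The coloring itself is only declared above; a minimal greedy variant over the boolean adjacency matrix could look like the sketch below (an illustration of the declared interface, not the implementation from this PR):

```cpp
#include <cstddef>
#include <map>
#include <vector>

// Greedy graph coloring: buffers that are pairwise non-adjacent share one color,
// i.e. one Buffer ID (and therefore one GPR at code generation time).
std::map<size_t, std::vector<size_t>> color_buffers(size_t n, const std::vector<bool>& adj) {
    std::map<size_t, std::vector<size_t>> colors;  // color id -> buffer indexes
    std::vector<bool> assigned(n, false);
    size_t color = 0;
    for (size_t i = 0; i < n; ++i) {
        if (assigned[i])
            continue;
        auto& group = colors[color++];
        group.push_back(i);
        assigned[i] = true;
        for (size_t j = i + 1; j < n; ++j) {
            if (assigned[j])
                continue;
            // j joins the color only if it is non-adjacent to every member already in it
            bool non_adjacent_to_group = true;
            for (const auto member : group)
                non_adjacent_to_group = non_adjacent_to_group && !adj[member * n + j];
            if (non_adjacent_to_group) {
                group.push_back(j);
                assigned[j] = true;
            }
        }
    }
    return colors;
}
```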
src/common/snippets/include/snippets/lowered/pass/init_buffers_default.hpp (new file)
@@ -0,0 +1,41 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "pass.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

/**
 * @interface InitBuffersDefault
 * @brief The pass initializes Buffer expressions in the LinearIR in the default (non-optimized) way: it sets unique offsets and IDs to Buffers.
 * @ingroup snippets
 */
class InitBuffersDefault : public Pass {
public:
    OPENVINO_RTTI("InitBuffersDefault", "Pass")

    InitBuffersDefault(size_t& buffer_scratchpad_size) : m_buffer_scratchpad_size(buffer_scratchpad_size) {
        m_buffer_scratchpad_size = 0;
    }
    /**
     * @brief Apply the pass to the Linear IR
     * @param linear_ir the target Linear IR
     * @return status of the pass
     */
    bool run(lowered::LinearIR& linear_ir) override;

private:
    size_t& m_buffer_scratchpad_size;
};

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
src/common/snippets/include/snippets/lowered/pass/init_loops.hpp
@@ -15,7 +15,7 @@ namespace pass {

 /**
  * @interface InitLoops
- * @brief The pass initialize scheduling information in LoopInfo
+ * @brief The pass initializes scheduling information in LoopInfo
  * @ingroup snippets
  */
 class InitLoops : public Pass {
src/common/snippets/include/snippets/lowered/pass/normalize_buffer_ids.hpp (new file)
@@ -0,0 +1,40 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "pass.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

/**
 * @interface NormalizeBufferIDs
 * @brief After optimizations, some Buffer IDs might be set unevenly: some numbers are missing.
 *        For example,
 *             [Buffer -> ID]
 *              Buffer0 -> 0    Two Buffers have ID = 0, one has ID = 2.
 *              Buffer1 -> 2    Obviously, we can normalize these IDs by setting ID = 1 for Buffer1.
 *              Buffer2 -> 0    It helps to assign GPR registers in `AssignRegisters` more effectively.
 *        Thus, the pass normalizes the IDs of Buffers in the Linear IR.
 * @ingroup snippets
 */
class NormalizeBufferIDs : public Pass {
public:
    OPENVINO_RTTI("NormalizeBufferIDs", "Pass")
    /**
     * @brief Apply the pass to the Linear IR
     * @param linear_ir the target Linear IR
     * @return status of the pass
     */
    bool run(lowered::LinearIR& linear_ir) override;
};

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
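The renumbering in the example above can be illustrated in a few lines (a sketch: the real pass walks Buffer ops in the Linear IR rather than a plain vector of IDs):

```cpp
#include <cstddef>
#include <map>
#include <vector>

// Remaps sparse IDs to dense ones, preserving the order of first appearance:
// {0, 2, 0} -> {0, 1, 0}
std::vector<size_t> normalize_ids(const std::vector<size_t>& ids) {
    std::map<size_t, size_t> remap;  // old ID -> new dense ID
    std::vector<size_t> result;
    for (const auto id : ids) {
        auto it = remap.find(id);
        if (it == remap.end())
            it = remap.emplace(id, remap.size()).first;
        result.push_back(it->second);
    }
    return result;
}
```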
src/common/snippets/include/snippets/lowered/pass/pass.hpp
@@ -39,6 +39,11 @@ public:
         return get_type_info().name;
     }

+    /**
+     * @brief Apply the pass to the Linear IR
+     * @param linear_ir the target Linear IR
+     * @return status of the pass
+     */
     virtual bool run(lowered::LinearIR& linear_ir) = 0;
 };

src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp (new file)
@@ -0,0 +1,54 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "pass.hpp"

#include "allocate_buffers.hpp"
#include "openvino/runtime/memory_solver.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

/**
 * @interface SolveBufferMemory
 * @brief The pass optimally calculates the common buffer scratchpad size and
 *        sets the offsets relative to the common data pointer for all Buffers. The pass uses the MemorySolver API.
 *        Note: The pass requires expression enumeration. It should be executed separately before this pass!
 * @ingroup snippets
 */
class SolveBufferMemory : public Pass {
public:
    OPENVINO_RTTI("SolveBufferMemory", "Pass")

    SolveBufferMemory(size_t& buffer_scratchpad_size, AllocateBuffers::BufferClusters& clusters)
        : m_buffer_scratchpad_size(buffer_scratchpad_size), m_clusters(clusters) {}
    /**
     * @brief Apply the pass to the Linear IR
     * @param linear_ir the target Linear IR
     * @return status of the pass
     */
    bool run(lowered::LinearIR& linear_ir) override;

private:
    /**
     * @brief Initializes boxes for MemorySolver
     * @param buffer_clusters buffer clusters. These clusters could be obtained using the DefineBufferClusters pass
     * @return vector of boxes for MemorySolver
     */
    std::vector<ov::MemorySolver::Box> init_boxes(const AllocateBuffers::BufferClusters& buffer_clusters);

    size_t& m_buffer_scratchpad_size;
    AllocateBuffers::BufferClusters& m_clusters;

    constexpr static size_t m_alignment = 32; // 32 bytes for data alignment in allocated memory
};

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
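For intuition about `init_boxes`, here is a sketch of feeding two clusters to the solver. The field order follows `ov::MemorySolver::Box`; the lifetime and size numbers are invented for the example, and the exact method names should be treated as assumptions based on the API 2.0 move in this PR:

```cpp
#include <vector>
#include "openvino/runtime/memory_solver.hpp"

// Two clusters with disjoint lifetimes (expression execution orders [0..3] and [4..7])
// can be placed at overlapping offsets inside the common scratchpad.
std::vector<ov::MemorySolver::Box> boxes = {
    {/*start=*/0, /*finish=*/3, /*size=*/1024, /*id=*/0},
    {/*start=*/4, /*finish=*/7, /*size=*/2048, /*id=*/1},
};
ov::MemorySolver solver(boxes);
const int64_t scratchpad_size = solver.solve();  // total size; disjoint boxes share memory
const int64_t offset0 = solver.get_offset(0);    // byte offset of cluster 0 in the scratchpad
```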
src/common/snippets/include/snippets/utils.hpp
@@ -51,6 +51,7 @@ constexpr inline bool implication(bool cause, bool cond) {

 template <typename T, typename U>
 inline T div_up(const T a, const U b) {
+    OPENVINO_ASSERT(b != 0, "Divider must not be zero");
     return static_cast<T>((a + b - 1) / b);
 }

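The helper computes a ceiling division, so the new assertion turns a silent division by zero into an explicit error (usage sketch; the `utils` namespace is assumed from the surrounding header):

```cpp
const size_t blocks = ov::snippets::utils::div_up(10, 3);  // (10 + 3 - 1) / 3 == 4
// ov::snippets::utils::div_up(10, 0) now fails with "Divider must not be zero"
```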
src/common/snippets/src/lowered/pass/allocate_buffers.cpp
@@ -2,9 +2,16 @@
 // SPDX-License-Identifier: Apache-2.0
 //

 #include "snippets/lowered/pass/allocate_buffers.hpp"

 #include "snippets/lowered/linear_ir.hpp"
+#include "snippets/lowered/pass/enumerate_expressions.hpp"
+#include "snippets/lowered/pass/solve_buffer_memory.hpp"
+#include "snippets/lowered/pass/init_buffers_default.hpp"
+#include "snippets/lowered/pass/identify_buffers.hpp"
+#include "snippets/lowered/pass/define_buffer_clusters.hpp"
+#include "snippets/lowered/pass/normalize_buffer_ids.hpp"
+#include "snippets/pass/tokenization.hpp"
 #include "snippets/itt.hpp"

 namespace ov {
@@ -12,11 +19,15 @@ namespace snippets {
 namespace lowered {
 namespace pass {

-void AllocateBuffers::propagate_offset(const LinearIR& linear_ir, const ExpressionPtr& buffer_expr, const size_t offset) {
+AllocateBuffers::AllocateBuffers(size_t& buffer_scratchpad_size, bool is_optimized)
+    : m_buffer_scratchpad_size(buffer_scratchpad_size), m_is_optimized_mode(is_optimized) {}
+
+void AllocateBuffers::set_buffer_offset(const ExpressionPtr& buffer_expr, const size_t offset) {
     // If the Buffer has an offset, we set this offset in the connected MemoryAccess ops
-    // to correctly read and write data because all Buffers has the common data pointer on buffer scratchpad
+    // to correctly read and write data because all Buffers have the common data pointer on buffer scratchpad

     const auto buffer = ov::as_type_ptr<op::Buffer>(buffer_expr->get_node());
     OPENVINO_ASSERT(buffer, "Failed to set Buffer offset: AllocateBuffers expects Buffer op");
     buffer->set_offset(static_cast<int64_t>(offset));

     // Propagate up: to Store. A Buffer can have only one Store
@@ -55,106 +66,23 @@ void AllocateBuffers::propagate_offset(const LinearIR& linear_ir, const Expressi
     }
 }

-bool AllocateBuffers::run(LinearIR& linear_ir) {
+bool AllocateBuffers::run(lowered::LinearIR& linear_ir) {
     OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::AllocateBuffers");
-    // [113664] The pass contains two main logics: it defines which of buffers can be inplace (use the same memory) and
-    // allocate memory of needed size. It should be splitted into several passes and updated in bounds of the ticket 113664.
-
-    // [113664] At the moment New Memory Buffer is used only in BrgemmCPU for AMX case. This memory can be reused for each Brgemm.
-    // This plugin-specific condition will be removed in the near future after the task 113664 will be implemented
-    size_t offset = 0, new_memory_buffer_offset = 0;
-    size_t prev_data_size = 0, current_data_size = 0;
-    std::set<ExpressionPtr> allocated_buffers;
-    bool new_memory_buffer_allocated = false;
-
-    auto allocate = [&](const std::shared_ptr<op::Buffer>& buffer, const ExpressionPtr& expr, size_t buffer_size) {
-        offset = m_buffer_scratchpad_size;
-        propagate_offset(linear_ir, expr, offset);
-        m_buffer_scratchpad_size += buffer_size;
-        allocated_buffers.insert(expr);
-        prev_data_size = current_data_size;
-    };
-
-    for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) {
-        const auto& expr = *expr_it;
-        if (auto buffer = as_type_ptr<op::Buffer>(expr->get_node())) {
-            const auto buffer_size = buffer->get_byte_size();
-            current_data_size = buffer->get_element_type().size();
-            // If it's the first buffer, offsets are zero => nothing to propagate, can continue
-            if (m_buffer_scratchpad_size == 0) {
-                m_buffer_scratchpad_size += buffer_size;
-                allocated_buffers.insert(expr);
-                prev_data_size = current_data_size;
-                continue;
-            }
-
-            if (buffer->is_intermediate_memory()) {
-                const auto& parent_expr = expr->get_input_port_connector(0)->get_source().get_expr();
-                const auto& parent_node = parent_expr->get_node();
-                // Full MemoryAccess ops need new memory. Previous logic is to check for parent isn't Loop
-                // [113664] It should be unified in MemoryManager with memory reuse in the near future
-                const auto ma = ov::as_type_ptr<op::MemoryAccess>(parent_node);
-                if (ma && ma->is_full_memory_access_op()) {
-                    allocate(buffer, *expr_it, buffer_size);
-                    continue;
-                }
-
-                //    Loop      Full_MA
-                //      |          |
-                //  Buffer_1   Buffer_0
-                //        \     /
-                //        Full_MA
-                // At the moment the pass support only sequentially implicit InPlace.
-                // If Buffer_0 is allocated firstly as Buffer after full memory access op,
-                // we cannot reuse this allocated memory for Buffer_1 - we must allocate new memory for it.
-                // [113664] It should be unified in MemoryManager with memory reuse in the near future
-                bool need_allocate = false;
-                const auto consumers = expr->get_output_port_connector(0)->get_consumers();
-                for (const auto& consumer : consumers) {
-                    const auto& consumer_expr = consumer.get_expr();
-                    const auto& child_node = consumer_expr->get_node();
-                    const auto ma = ov::as_type_ptr<op::MemoryAccess>(child_node);
-                    if (ma && ma->is_full_memory_access_op()) {
-                        for (size_t i = 0; i < consumer_expr->get_input_count() && !need_allocate; ++i) {
-                            if (i == consumer.get_index())
-                                continue;
-                            const auto buffer_sibling = consumer_expr->get_input_port_connector(i)->get_source().get_expr();
-                            need_allocate = ov::is_type<op::Buffer>(buffer_sibling->get_node()) && allocated_buffers.count(buffer_sibling) != 0;
-                        }
-                    }
-                    if (need_allocate)
-                        break;
-                }
-                if (need_allocate) {
-                    allocate(buffer, *expr_it, buffer_size);
-                    continue;
-                }
-
-                // [113664] For more details and reason of the current solution, please, go to the ticket description
-                const auto current_allocated_memory_size = m_buffer_scratchpad_size - offset;
-                if (((current_data_size == prev_data_size) && buffer_size > current_allocated_memory_size) ||
-                    ((current_data_size != prev_data_size) && buffer_size != current_allocated_memory_size)) {
-                    allocate(buffer, expr, buffer_size);
-                    continue;
-                }
-                propagate_offset(linear_ir, *expr_it, offset);
-                allocated_buffers.insert(expr);
-                prev_data_size = current_data_size;
-            } else {
-                if (!new_memory_buffer_allocated) {
-                    allocate(buffer, *expr_it, buffer_size);
-                    new_memory_buffer_allocated = true;
-                    new_memory_buffer_offset = offset;
-                } else {
-                    propagate_offset(linear_ir, *expr_it, new_memory_buffer_offset);
-                    allocated_buffers.insert(expr);
-                    prev_data_size = current_data_size;
-                }
-            }
-        }
-    }
-    return !allocated_buffers.empty();
+    m_buffer_scratchpad_size = 0;
+
+    PassPipeline pipeline;
+    if (m_is_optimized_mode) {
+        BufferClusters buffer_clusters;
+        pipeline.register_pass<EnumerateExpressions>();
+        pipeline.register_pass<IdentifyBuffers>();
+        pipeline.register_pass<DefineBufferClusters>(buffer_clusters);
+        pipeline.register_pass<SolveBufferMemory>(m_buffer_scratchpad_size, buffer_clusters);
+        pipeline.register_pass<NormalizeBufferIDs>();
+    } else {
+        pipeline.register_pass<InitBuffersDefault>(m_buffer_scratchpad_size);
+    }
+    pipeline.run(linear_ir);
+
+    return m_buffer_scratchpad_size > 0;
 }

 } // namespace pass
src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp (new file, 346 lines)
@@ -0,0 +1,346 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "snippets/lowered/pass/define_buffer_clusters.hpp"

#include "snippets/lowered/pass/identify_buffers.hpp"
#include "snippets/pass/tokenization.hpp"
#include "snippets/itt.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

using ShiftPtrParams = IdentifyBuffers::ShiftPtrParams;

AllocateBuffers::BufferClusters::iterator DefineBufferClusters::find_cluster_by_expr(const ExpressionPtr& target) {
    return std::find_if(m_clusters.begin(), m_clusters.end(),
                        [&target](const AllocateBuffers::BufferCluster& cluster) { return cluster.count(target) > 0; });
}

bool DefineBufferClusters::is_direct_buffer(const ExpressionPtr& buffer_expr, const ExpressionPtr& target_expr) const {
    const auto buffer = ov::as_type_ptr<op::Buffer>(buffer_expr->get_node());
    return buffer && buffer_expr->get_loop_ids() == target_expr->get_loop_ids();
}

void DefineBufferClusters::create_new_cluster(const ExpressionPtr& buffer_expr) {
    const auto cluster_it = find_cluster_by_expr(buffer_expr);
    // If the Buffer is missing in the clusters, create a new cluster with the single Buffer node inside
    if (cluster_it == m_clusters.cend()) {
        m_clusters.push_back(AllocateBuffers::BufferCluster{buffer_expr});
    }
}

size_t DefineBufferClusters::get_cluster_buffer_id(const AllocateBuffers::BufferCluster& cluster) const {
    OPENVINO_ASSERT(!cluster.empty(), "Buffer cluster is empty!");
    const auto id = (ov::as_type_ptr<op::Buffer>(cluster.cbegin()->get()->get_node()))->get_id();
    if (std::all_of(cluster.cbegin(), cluster.cend(),
                    [&id](const ExpressionPtr& expr) { return (ov::as_type_ptr<op::Buffer>(expr->get_node()))->get_id() == id; })) {
        return id;
    }
    return SIZE_MAX;
}

DefineBufferClusters::BufferPorts DefineBufferClusters::get_input_buffers(const ExpressionPtr& loop_expr) const {
    BufferPorts input_buffers;

    const auto loop_end = ov::as_type_ptr<op::LoopEnd>(loop_expr->get_node());
    const auto in_count = loop_end->get_input_num();
    const auto connectors = loop_expr->get_input_port_connectors();

    // Input Buffers
    for (size_t i = 0; i < in_count; ++i) {
        const auto source_expr = connectors[i]->get_source().get_expr();
        if (!is_direct_buffer(source_expr, loop_expr))
            continue;
        // Save as input Buffer
        const auto ret = input_buffers.insert(std::make_pair(source_expr, std::set<size_t>{ i })).second;
        if (!ret)
            input_buffers[source_expr].insert(i);
    }
    return input_buffers;
}

DefineBufferClusters::BufferPorts DefineBufferClusters::get_output_buffers(const ExpressionPtr& loop_expr) const {
    BufferPorts output_buffers;

    const auto loop_end = ov::as_type_ptr<op::LoopEnd>(loop_expr->get_node());
    const auto in_count = loop_end->get_input_num();
    const auto out_count = loop_end->get_output_num();
    const auto connectors = loop_expr->get_input_port_connectors();

    for (size_t i = in_count; i < in_count + out_count; ++i) {
        for (const auto& consumer : connectors[i]->get_consumers()) {
            auto consumer_expr = consumer.get_expr();
            if (!is_direct_buffer(consumer_expr, loop_expr))
                continue;
            // Save as output Buffer
            output_buffers[consumer_expr] = { i };
        }
    }
    return output_buffers;
}

void DefineBufferClusters::parse_loop(const LinearIR::constExprIt& expr_it) {
    const auto& expr = *expr_it;
    const auto loop_end = ov::as_type_ptr<op::LoopEnd>(expr->get_node());
    const auto& ptr_increments = loop_end->get_ptr_increments();
    const auto& final_offsets = loop_end->get_finalization_offsets();
    const auto& data_sizes = loop_end->get_element_type_sizes();

    // [ Expression -> Port indexes ]
    const auto input_buffers = get_input_buffers(expr);
    const auto output_buffers = get_output_buffers(expr);

    for (const auto& in : input_buffers)
        create_new_cluster(in.first);

    std::set<ExpressionPtr> visited_buffers;
    for (const auto& out : output_buffers) {
        const auto output_buffer_expr = out.first;
        const auto output_buffer_port_idx = *(out.second.cbegin()); // There is always only one output port
        const auto output_buffer = ov::as_type_ptr<op::Buffer>(output_buffer_expr->get_node());
        bool has_been_added = false;

        for (const auto& in : input_buffers) {
            const auto& input_buffer_expr = in.first;
            if (visited_buffers.count(input_buffer_expr) > 0)
                continue;

            const auto input_buffer = ov::as_type_ptr<op::Buffer>(input_buffer_expr->get_node());
            const auto& input_buffer_ports = in.second;
            for (const auto& input_buffer_port_idx : input_buffer_ports) {
                // Memory can be reused if reading and writing are executed proportionally:
                //  - the same ShiftPtrParams (data size, final offsets, ptr increments)
                //  - the same reading/writing order
                //  - the same buffer memory sizes
                const auto input_params =
                    ShiftPtrParams(data_sizes[input_buffer_port_idx], ptr_increments[input_buffer_port_idx], final_offsets[input_buffer_port_idx]);
                const auto output_params =
                    ShiftPtrParams(data_sizes[output_buffer_port_idx], ptr_increments[output_buffer_port_idx], final_offsets[output_buffer_port_idx]);
                if (input_buffer->get_byte_size() == output_buffer->get_byte_size() &&
                    input_buffer_expr->get_output_port_descriptor(0)->get_layout() == output_buffer_expr->get_input_port_descriptor(0)->get_layout() &&
                    input_params == output_params) {
                    const auto cluster_it = find_cluster_by_expr(input_buffer_expr);
                    OPENVINO_ASSERT(cluster_it != m_clusters.end(), "Buffer on inputs of Loop must already be saved in clusters");
                    // Add to the existing cluster
                    has_been_added = cluster_it->insert(output_buffer_expr).second;
                    OPENVINO_ASSERT(has_been_added, "Buffer has not been saved in cluster");
                    // Mark the input buffer as visited because we have already used its memory
                    visited_buffers.insert(input_buffer_expr);
                    break;
                }
            }
            if (has_been_added) break;
        }
        if (!has_been_added) {
            m_clusters.push_back(AllocateBuffers::BufferCluster{output_buffer_expr});
        }
    }

    // Check Buffers inside for possible memory reuse using `window` sliding
    parse_nested_loops(input_buffers, output_buffers, expr_it);
}

void DefineBufferClusters::parse_nested_loops(const BufferPorts& input_buffers, const BufferPorts& output_buffers,
                                              const LinearIR::constExprIt& outer_loop_end_expr_it) {
    if (input_buffers.empty() && output_buffers.empty())
        return;

    // The inner Buffer can reuse memory of the outer Buffer using `window` sliding only if:
    //  - The finalization offset of the last Loop connected to the inner Buffer is equal to the pointer increment
    //    of the outer Buffer, to emulate `window` sliding
    //  - This outer Buffer should have the same Buffer ID as the inner one to move the data ptr of the inner Buffer
    //    after each outer Loop iteration. It's needed because all Loops reset the data pointers of connected Buffers
    //    after the full work. To avoid rewriting the outer Buffer data, we have to use the same Buffer ID (GPR)
    //    to proportionally shift the pointers of both Buffers.

    auto can_be_data_ptr_proportionally_shifted = [](int64_t outer_buffer_ptr_increment, int64_t outer_buffer_data_size,
                                                     int64_t inner_buffer_final_offsets, int64_t inner_buffer_data_size) {
        return (outer_buffer_ptr_increment != 0) &&
               ((inner_buffer_data_size * inner_buffer_final_offsets * -1) == outer_buffer_ptr_increment * outer_buffer_data_size);
    };

    const auto outer_loop_end = ov::as_type_ptr<op::LoopEnd>(outer_loop_end_expr_it->get()->get_node());
    const auto outer_loop_begin = outer_loop_end->get_loop_begin();
    const auto& outer_ptr_increments = outer_loop_end->get_ptr_increments();
    const auto& outer_data_sizes = outer_loop_end->get_element_type_sizes();

    for (auto it = std::reverse_iterator<LinearIR::constExprIt>(outer_loop_end_expr_it); (*it)->get_node() != outer_loop_begin; ++it) {
        const auto& inner_expr = *it;
        if (const auto inner_buffer = ov::as_type_ptr<op::Buffer>(inner_expr->get_node())) {
            const auto inner_cluster_it = find_cluster_by_expr(inner_expr);
            OPENVINO_ASSERT(inner_cluster_it != m_clusters.cend(), "Buffer cluster has not been found");
            const auto inner_cluster_id = get_cluster_buffer_id(*inner_cluster_it);
            if (inner_cluster_id == SIZE_MAX) continue;

            const auto final_offset = get_buffer_finalization_offset(inner_expr);

            auto unite = [&](const BufferPorts& ports, const bool is_input) {
                bool applied = false;
                for (const auto& port : ports) {
                    const auto cluster_it = find_cluster_by_expr(port.first);
                    OPENVINO_ASSERT(cluster_it != m_clusters.cend(), "Buffer cluster has not been found");
                    // If the buffers are already in the same cluster or have different Buffer IDs - skip
                    if (cluster_it == inner_cluster_it) continue;

                    bool can_be_reused = true;
                    for (const auto idx : port.second) {
                        can_be_reused = can_be_reused &&
                            can_be_data_ptr_proportionally_shifted(outer_ptr_increments[idx], outer_data_sizes[idx],
                                                                   final_offset, inner_buffer->get_element_type().size());
                    }
                    if (!can_be_reused)
                        continue;

                    applied = unite_nested_clusters(inner_cluster_it, *cluster_it, port.first, is_input);
                    if (applied) break;
                }
                return applied;
            };

            if (unite(input_buffers, true)) continue;
            if (unite(output_buffers, false)) continue;
        }
    }
}

int64_t DefineBufferClusters::get_buffer_finalization_offset(const ExpressionPtr& buffer_expr) const {
    auto index = [](const std::vector<PortConnectorPtr>& loop_inputs, const PortConnectorPtr& buffer_out) {
        const auto it = std::find(loop_inputs.cbegin(), loop_inputs.cend(), buffer_out);
        OPENVINO_ASSERT(it != loop_inputs.cend(), "Buffer output PortConnector has not been found in target LoopEnd inputs");
        return std::distance(loop_inputs.cbegin(), it);
    };
    int64_t final_offset = 0;
    int64_t last_loop_exec_order = 0;
    const auto buffer_outs = buffer_expr->get_output_port_connectors();
    for (const auto& buffer_out : buffer_outs) {
        const auto consumers = buffer_out->get_consumers();
        for (const auto& consumer : consumers) {
            const auto consumer_expr = consumer.get_expr();
            const auto loop_end = ov::as_type_ptr<ov::snippets::op::LoopEnd>(consumer_expr->get_node());
            if (loop_end && consumer_expr->get_loop_ids() == buffer_expr->get_loop_ids()) {
                const auto loop_order = ov::snippets::pass::GetTopologicalOrder(loop_end);
                if (loop_order > last_loop_exec_order) {
                    const auto loop_inputs = consumer_expr->get_input_port_connectors();
                    final_offset = loop_end->get_finalization_offsets()[index(loop_inputs, buffer_out)];
                    last_loop_exec_order = loop_order;
                }
            }
        }
    }
    return final_offset;
}

bool DefineBufferClusters::unite_nested_clusters(const AllocateBuffers::BufferClusters::iterator& inner_cluster_it,
                                                 AllocateBuffers::BufferCluster& outer_cluster,
                                                 const ExpressionPtr& outer_buffer, bool is_outer_up) {
    for (const auto& inner_buffer : *inner_cluster_it) {
        ExpressionPtr common_loop_end_expr = nullptr;
        size_t outer_idx = SIZE_MAX, inner_idx = SIZE_MAX;
        const auto& up_buffer = is_outer_up ? outer_buffer : inner_buffer;
        const auto& down_buffer = is_outer_up ? inner_buffer : outer_buffer;
        auto& up_idx = is_outer_up ? outer_idx : inner_idx;
        auto& down_idx = is_outer_up ? inner_idx : outer_idx;
        if (are_buffer_neighbours(up_buffer, down_buffer, common_loop_end_expr, up_idx, down_idx)) {
            const auto common_loop_end = ov::as_type_ptr<op::LoopEnd>(common_loop_end_expr->get_node());
            const auto& inner_ptr_increments = common_loop_end->get_ptr_increments();
            const auto& inner_final_offsets = common_loop_end->get_finalization_offsets();
            const auto& inner_data_sizes = common_loop_end->get_element_type_sizes();
            if (IdentifyBuffers::can_reuse_id({ inner_data_sizes[up_idx], inner_ptr_increments[up_idx], inner_final_offsets[up_idx] },
                                              { inner_data_sizes[down_idx], inner_ptr_increments[down_idx], inner_final_offsets[down_idx] })) {
                const auto buffer_id = ov::as_type_ptr<op::Buffer>(outer_buffer->get_node())->get_id();
                for (const auto& inner_buffer : *inner_cluster_it)
                    ov::as_type_ptr<op::Buffer>(inner_buffer->get_node())->set_id(buffer_id);

                outer_cluster.insert(inner_cluster_it->cbegin(), inner_cluster_it->cend());
                m_clusters.erase(inner_cluster_it);
                return true;
            }
        }
    }
    return false;
}

bool DefineBufferClusters::are_buffer_neighbours(const ExpressionPtr& up, const ExpressionPtr& down, ExpressionPtr& loop, size_t& up_idx, size_t& down_idx) {
    auto find_input = [&down](const PortConnectorPtr& in) {
        return in->get_source().get_expr() == down;
    };
    auto find_output = [&down](const PortConnectorPtr& in) {
        const auto consumers = in->get_consumers();
        return std::any_of(consumers.cbegin(), consumers.cend(),
                           [&down](const ExpressionPort& port) { return port.get_expr() == down; });
    };
    auto find = [&](const std::vector<PortConnectorPtr>::const_iterator& begin,
                    const std::vector<PortConnectorPtr>::const_iterator& end,
                    const std::vector<PortConnectorPtr>::const_iterator& orig_begin,
                    const ExpressionPort& loop_port,
                    bool is_input) -> bool {
        const auto in_buffer_it = is_input ? std::find_if(begin, end, find_input)
                                           : std::find_if(begin, end, find_output);
        if (in_buffer_it != end) {
            up_idx = loop_port.get_index();
            down_idx = std::distance(orig_begin, in_buffer_it);
            loop = loop_port.get_expr();
            return true;
        }
        return false;
    };
    for (const auto& out : up->get_output_port_connectors()) {
        for (const auto& buffer_consumer : out->get_consumers()) {
            const auto buffer_consumer_expr = buffer_consumer.get_expr();
            const auto loop_end = ov::as_type_ptr<op::LoopEnd>(buffer_consumer_expr->get_node());
            if (!loop_end)
                continue;
            const auto& loop_inputs = buffer_consumer_expr->get_input_port_connectors();
            if (find(loop_inputs.cbegin(), loop_inputs.cbegin() + loop_end->get_input_num(), loop_inputs.cbegin(), buffer_consumer, true)) return true;
            if (find(loop_inputs.cbegin() + loop_end->get_input_num(), loop_inputs.cend(), loop_inputs.cbegin(), buffer_consumer, false)) return true;
        }
    }
    return false;
}

void DefineBufferClusters::parse_memory_access_op(const ExpressionPtr& expr) {
    const auto ma = ov::as_type_ptr<op::MemoryAccess>(expr->get_node());
    if (!ma->is_full_memory_access_op())
        return;
    // TODO: Some full MemoryAccess ops can have inplace inputs and outputs in general.
    //       Need to add a mechanism of inplace ports using MemoryAccess::PortDescriptor::inplace
    for (const auto& input : expr->get_input_port_connectors()) {
        if (is_direct_buffer(input->get_source().get_expr(), expr)) {
            create_new_cluster(input->get_source().get_expr());
        }
    }
    for (const auto& output : expr->get_output_port_connectors()) {
        for (const auto& consumer : output->get_consumers()) {
            if (is_direct_buffer(consumer.get_expr(), expr)) {
                create_new_cluster(consumer.get_expr());
            }
        }
    }
}

bool DefineBufferClusters::run(LinearIR& linear_ir) {
    OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::DefineBufferClusters");

    for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); ++expr_it) {
        const auto& expr = *expr_it;
        const auto op = expr->get_node();
        if (ov::is_type<op::LoopEnd>(op)) {
            parse_loop(expr_it);
            continue;
        }

        if (ov::is_type<op::MemoryAccess>(op)) {
            parse_memory_access_op(expr);
            continue;
        }
    }

    return true;
}

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
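To make the inplace check in `parse_loop` concrete, this is the comparison it performs for one input/output Buffer pair (the values are hypothetical):

```cpp
using ShiftPtrParams = ov::snippets::lowered::pass::IdentifyBuffers::ShiftPtrParams;

// Hypothetical port parameters of a Loop: equal data sizes, ptr increments and finalization offsets.
const ShiftPtrParams input_params(/*ds=*/4, /*pi=*/1, /*fo=*/-128);
const ShiftPtrParams output_params(4, 1, -128);

// parse_loop additionally requires equal byte sizes and equal layouts of the two Buffers;
// when input_params == output_params, the output Buffer joins the input Buffer's cluster (inplace).
const bool inplace_candidate = (input_params == output_params);
```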
src/common/snippets/src/lowered/pass/enumerate_expressions.cpp (new file)
@@ -0,0 +1,29 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "snippets/lowered/pass/enumerate_expressions.hpp"

#include "snippets/pass/tokenization.hpp"
#include "snippets/itt.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

bool EnumerateExpressions::run(LinearIR& linear_ir) {
    // [113536]: The temporary solution is to reuse the topological order from tokenization.
    //           Support for the execution order of Expressions needs to be added
    int64_t order = 0;
    for (const auto& expr : linear_ir) {
        ov::snippets::pass::SetTopologicalOrder(expr->get_node(), order++);
    }
    return order > 0;
}

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
@ -20,37 +20,69 @@ inline size_t index(size_t col_num, size_t row, size_t col) {
|
||||
}
|
||||
} // namespace
|
||||
|
||||
std::vector<bool> IdentifyBuffers::create_adjacency_matrix(const LinearIR& linear_ir, const BufferSet& buffers) const {
|
||||
bool operator==(const IdentifyBuffers::ShiftPtrParams& lhs, const IdentifyBuffers::ShiftPtrParams& rhs) {
|
||||
if (&lhs == &rhs)
|
||||
return true;
|
||||
return lhs.ptr_increment == rhs.ptr_increment && lhs.finalization_offset == rhs.finalization_offset && lhs.data_size == rhs.data_size;
|
||||
}
|
||||
bool operator!=(const IdentifyBuffers::ShiftPtrParams& lhs, const IdentifyBuffers::ShiftPtrParams& rhs) {
|
||||
return !(rhs == lhs);
|
||||
}
|
||||
|
||||
size_t IdentifyBuffers::get_buffer_idx(const ExpressionPtr& target, const BufferPool& pool) {
|
||||
const auto iter = std::find(pool.cbegin(), pool.cend(), target);
|
||||
OPENVINO_ASSERT(iter != pool.cend(), "Buffer wasn't find in Buffer system of Subgraph");
|
||||
return std::distance(pool.cbegin(), iter);
|
||||
}
|
||||
|
||||
bool IdentifyBuffers::can_reuse_id(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs) {
|
||||
const auto equal_ptr_params_shifting = lhs.ptr_increment == rhs.ptr_increment && lhs.finalization_offset == rhs.finalization_offset;
|
||||
const auto equal_element_type_sizes = lhs.data_size == rhs.data_size;
|
||||
return equal_ptr_params_shifting && (equal_element_type_sizes || (lhs.ptr_increment == 0 && lhs.finalization_offset == 0));
|
||||
}
|
||||
|
||||
bool IdentifyBuffers::are_adjacent(const std::pair<ExpressionPtr, ShiftPtrParams>& lhs,
|
||||
const std::pair<ExpressionPtr, ShiftPtrParams>& rhs) {
|
||||
const auto lhs_ids = lhs.first->get_loop_ids();
|
||||
const auto rhs_ids = rhs.first->get_loop_ids();
|
||||
const auto equal_loop_ids = lhs_ids == rhs_ids;
|
||||
if (equal_loop_ids) { // Buffers are connected to the same Loop and have the same outer Loops
|
||||
return !can_reuse_id(lhs.second, rhs.second);
|
||||
} else { // Buffers are connected to the same Loop, but one of Buffers - inside this Loop, another - outside
|
||||
// Buffers are adjacent if outer Buffer has not zero data shift params
|
||||
if (lhs_ids.size() == rhs_ids.size()) // If the count of outer Loops are equal, it means that outer loops are already different
|
||||
return true;
|
||||
const auto& outer_buffer = lhs_ids.size() < rhs_ids.size() ? lhs : rhs;
|
||||
const auto count_outer_loops = std::min(lhs_ids.size(), rhs_ids.size());
|
||||
const auto are_outer_loops_the_same = lhs_ids.size() != rhs_ids.size() &&
|
||||
std::equal(rhs_ids.cbegin(), rhs_ids.cbegin() + count_outer_loops, lhs_ids.cbegin());
|
||||
const auto outer_buffer_has_zero_shifts = outer_buffer.second.ptr_increment == 0 && outer_buffer.second.finalization_offset == 0;
|
||||
return !are_outer_loops_the_same || !outer_buffer_has_zero_shifts;
|
||||
}
|
||||
}
|
||||
|
||||
void IdentifyBuffers::update_adj_matrix(const std::pair<ExpressionPtr, ShiftPtrParams>& lhs,
|
||||
const std::pair<ExpressionPtr, ShiftPtrParams>& rhs,
|
||||
const BufferPool& buffers,
|
||||
std::vector<bool>& adj) {
|
||||
if (are_adjacent(lhs, rhs)) {
|
||||
const auto size = buffers.size();
|
||||
const auto lhs_idx = get_buffer_idx(lhs.first, buffers);
|
||||
const auto rhs_idx = get_buffer_idx(rhs.first, buffers);
|
||||
adj[index(size, rhs_idx, lhs_idx)] = adj[index(size, lhs_idx, rhs_idx)] = true;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<bool> IdentifyBuffers::create_adjacency_matrix(const LinearIR& linear_ir, const BufferPool& pool) {
    // There are several sync points for the adjacency check:
    // 1. Loop: only in a Loop do we increment pointers. So if some Buffers in the same Loop have a conflict
    //    (cannot be inplace: different ptr increments or data sizes), they are considered adjacent.
    // 2. Brgemm: its blocking implementation requires Buffers with unique memory on all inputs and outputs.
    const auto size = buffers.size();
    // TODO: Can we use a triangular matrix? Need to verify using tests
    const auto size = pool.size();
    std::vector<bool> adj(size * size, false);
    for (size_t i = 0; i < size; ++i)
        adj[index(size, i, i)] = true;

    // < ptr_increment, finalization_offset >
    using ShiftPtrParams = std::pair<int64_t, int64_t>;

    auto get_buffer_idx = [&](const std::shared_ptr<op::Buffer>& buffer) {
        const auto iter = std::find(buffers.cbegin(), buffers.cend(), buffer);
        OPENVINO_ASSERT(iter != buffers.cend(), "Buffer wasn't find in Buffer system of Subgraph");
        return std::distance(buffers.cbegin(), iter);
    };

    auto update_adj_matrix = [&](const std::pair<std::shared_ptr<op::Buffer>, ShiftPtrParams>& buffer,
                                 const std::pair<std::shared_ptr<op::Buffer>, ShiftPtrParams>& neighbour_buffer) {
        const bool equal_ptr_params_shifting = buffer.second == neighbour_buffer.second;
        const bool equal_element_type_sizes = buffer.first->get_element_type().size() == neighbour_buffer.first->get_element_type().size();
        if (!equal_ptr_params_shifting || ((buffer.second.first != 0 || buffer.second.second != 0) && !equal_element_type_sizes)) {
            const auto buffer_idx = get_buffer_idx(buffer.first);
            const auto neighbour_idx = get_buffer_idx(neighbour_buffer.first);
            adj[index(size, neighbour_idx, buffer_idx)] = adj[index(size, buffer_idx, neighbour_idx)] = true;
        }
    };

    auto is_buffer = [](const ExpressionPort& port) {
        return ov::is_type<op::Buffer>(port.get_expr()->get_node());
    };
@ -65,19 +97,19 @@ std::vector<bool> IdentifyBuffers::create_adjacency_matrix(const LinearIR& linea
                continue;
            OPENVINO_ASSERT(std::count_if(consumers.begin(), consumers.end(), is_buffer) == 1, "Brgemm mustn't have more than 1 consumer buffer");

            std::vector<std::shared_ptr<op::Buffer>> adjacency_buffers;
            adjacency_buffers.push_back(ov::as_type_ptr<op::Buffer>(buffer_it->get_expr()->get_node()));
            BufferPool adjacency_buffers;
            adjacency_buffers.push_back(buffer_it->get_expr());

            for (const auto& input_connector : expr->get_input_port_connectors()) {
                const auto parent_node = input_connector->get_source().get_expr()->get_node();
                if (const auto neighbour_buffer = ov::as_type_ptr<op::Buffer>(parent_node)) {
                    adjacency_buffers.push_back(neighbour_buffer);
                const auto parent_expr = input_connector->get_source().get_expr();
                if (ov::is_type<op::Buffer>(parent_expr->get_node())) {
                    adjacency_buffers.push_back(parent_expr);
                }
            }
            for (auto buffer_it = adjacency_buffers.begin(); buffer_it != adjacency_buffers.end(); ++buffer_it) {
                for (auto neighbour_it = std::next(buffer_it); neighbour_it != adjacency_buffers.end(); ++neighbour_it) {
                    const auto buffer_idx = get_buffer_idx(*buffer_it);
                    const auto neighbour_idx = get_buffer_idx(*neighbour_it);
                    const auto buffer_idx = get_buffer_idx(*buffer_it, pool);
                    const auto neighbour_idx = get_buffer_idx(*neighbour_it, pool);
                    adj[index(size, neighbour_idx, buffer_idx)] = adj[index(size, buffer_idx, neighbour_idx)] = true;
                }
            }
@ -91,29 +123,36 @@ std::vector<bool> IdentifyBuffers::create_adjacency_matrix(const LinearIR& linea
        const auto input_count = loop_end->get_input_num();
        const auto output_count = loop_end->get_output_num();

        const auto ptr_increments = loop_end->get_ptr_increments();
        const auto finalization_offsets = loop_end->get_finalization_offsets();
        const auto& ptr_increments = loop_end->get_ptr_increments();
        const auto& finalization_offsets = loop_end->get_finalization_offsets();
        const auto& data_sizes = loop_end->get_element_type_sizes();

        // Buffer -> <ptr increment, finalization_offsets>
        std::map<std::shared_ptr<op::Buffer>, ShiftPtrParams> buffer_neighbours;
        std::map<ExpressionPtr, ShiftPtrParams> buffer_neighbours;

        for (size_t i = 0; i < input_count; ++i) {
            const auto& parent_output = expr->get_input_port_connector(i)->get_source().get_expr();
            if (const auto buffer = ov::as_type_ptr<op::Buffer>(parent_output->get_node())) {
                buffer_neighbours[buffer] = { ptr_increments[i], finalization_offsets[i] };
            if (ov::is_type<op::Buffer>(parent_output->get_node())) {
                if (buffer_neighbours.count(parent_output) > 0) {
                    OPENVINO_ASSERT(buffer_neighbours[parent_output].ptr_increment == ptr_increments[i] &&
                                    buffer_neighbours[parent_output].finalization_offset == finalization_offsets[i],
                                    "Invalid data pointer shifts: If Buffer has several consumers, this consumers must have the same shifts or zero");
                    continue;
                }
                buffer_neighbours[parent_output] = { data_sizes[i], ptr_increments[i], finalization_offsets[i] };
            }
        }
        for (size_t i = 0; i < output_count; ++i) {
        for (size_t i = input_count; i < input_count + output_count; ++i) {
            // The consumers of the corresponding Store ops
            const auto index = input_count + i;
            const auto consumer_inputs = expr->get_input_port_connector(index)->get_consumers();
            const auto consumer_inputs = expr->get_input_port_connector(i)->get_consumers();
            size_t buffer_count = 0;
            size_t loop_count = 0;
            for (const auto& consumer_input : consumer_inputs) {
                const auto& child_node = consumer_input.get_expr()->get_node();
                if (const auto buffer = ov::as_type_ptr<op::Buffer>(child_node)) {
                    buffer_neighbours[buffer] = { ptr_increments[index], finalization_offsets[index] };
                } else if (ov::is_type<op::LoopEnd>(child_node)) {
                const auto& child_expr = consumer_input.get_expr();
                if (ov::is_type<op::Buffer>(child_expr->get_node())) {
                    buffer_neighbours[child_expr] = { data_sizes[i], ptr_increments[i], finalization_offsets[i] };
                    buffer_count++;
                } else if (ov::is_type<op::LoopEnd>(child_expr->get_node())) {
                    loop_count++;
                }
            }
@ -123,9 +162,24 @@ std::vector<bool> IdentifyBuffers::create_adjacency_matrix(const LinearIR& linea
            }
        }

        // Buffers which are connected to the current Loop but without ptr shifts, and Buffers which are inside this Loop, must be adjacent:
        // after each Loop iteration the GPR will be shifted using the ptr increment of the outer Buffer. But the Buffers inside share the same GPR,
        // which means the inner Buffers would work with shifted memory.
        const auto loop_begin = loop_end->get_loop_begin();
        for (auto it = std::reverse_iterator<LinearIR::constExprIt>(expr_it); (*it)->get_node() != loop_begin; ++it) {
            const auto& inner_expr = *it;
            if (ov::is_type<op::Buffer>(inner_expr->get_node())) {
                // To make Buffers adjacent, we set the value "INT64_MAX" for the data ptr shift params of inner Buffers,
                // since outer Buffers (and any other Buffers) cannot have this value in shifting because of the semantics of the Loop op.
                // Thus, inner and outer Buffers always have different data ptr shift params -> they're adjacent.
                if (buffer_neighbours.count(inner_expr) == 0)
                    buffer_neighbours[inner_expr] = { INT64_MAX, INT64_MAX, INT64_MAX };
            }
        }

        for (auto buffer_it = buffer_neighbours.begin(); buffer_it != buffer_neighbours.end(); ++buffer_it) {
            for (auto neighbour_it = std::next(buffer_it); neighbour_it != buffer_neighbours.end(); ++neighbour_it) {
                update_adj_matrix(*buffer_it, *neighbour_it);
                update_adj_matrix(*buffer_it, *neighbour_it, pool, adj);
            }
        }
    }
@ -133,9 +187,9 @@ std::vector<bool> IdentifyBuffers::create_adjacency_matrix(const LinearIR& linea
    return adj;
}

auto IdentifyBuffers::coloring(BufferSet& buffers, std::vector<bool>& adj) -> std::map<size_t, BufferSet> {
auto IdentifyBuffers::coloring(BufferPool& buffers, std::vector<bool>& adj) -> std::map<size_t, BufferPool> {
    size_t color = 0;
    std::map<size_t, BufferSet> color_groups;
    std::map<size_t, BufferPool> color_groups;
    const auto size = buffers.size();
    for (size_t i = 0; i < size; i++) {
        // The Buffer is already colored (visited) - skip
@ -183,25 +237,25 @@ bool IdentifyBuffers::run(LinearIR& linear_ir) {
    // Unite Buffers using the graph coloring algorithm.
    // Note: we identify only Buffers with intermediate memory, because Buffers with new memory are used only in the Brgemm case,
    //       so these Buffers are always nonadjacent.
    BufferSet buffer_exprs;
    BufferPool buffer_pool;

    for (const auto& expr : linear_ir) {
        if (const auto buffer = ov::as_type_ptr<op::Buffer>(expr->get_node())) {
            buffer_exprs.push_back(buffer);
        if (ov::is_type<op::Buffer>(expr->get_node())) {
            buffer_pool.push_back(expr);
        }
    }

    // Creation of the adjacency matrix
    auto adj = create_adjacency_matrix(linear_ir, buffer_exprs);
    auto adj = create_adjacency_matrix(linear_ir, buffer_pool);

    // Graph coloring algorithm
    const auto color_groups = coloring(buffer_exprs, adj);
    const auto color_groups = coloring(buffer_pool, adj);

    for (const auto& pair : color_groups) {
        const auto color = pair.first;
        const auto& united_buffers = pair.second;
        for (const auto& buffer : united_buffers) {
            buffer->set_id(color);
        for (const auto& buffer_expr : united_buffers) {
            ov::as_type_ptr<op::Buffer>(buffer_expr->get_node())->set_id(color);
        }
    }

@ -0,0 +1,40 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "snippets/lowered/pass/init_buffers_default.hpp"

#include "snippets/lowered/pass/allocate_buffers.hpp"
#include "snippets/op/buffer.hpp"
#include "snippets/itt.hpp"


namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

bool InitBuffersDefault::run(LinearIR& linear_ir) {
    OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InitBuffersDefault");

    size_t id = 0;
    size_t offset = 0;
    for (const auto& expr : linear_ir) {
        const auto op = expr->get_node();
        if (const auto buffer = ov::as_type_ptr<op::Buffer>(op)) {
            AllocateBuffers::set_buffer_offset(expr, offset);
            buffer->set_id(id);

            offset += buffer->get_byte_size();
            id++;
        }
    }

    m_buffer_scratchpad_size = offset;
    return m_buffer_scratchpad_size > 0;
}

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov

src/common/snippets/src/lowered/pass/normalize_bufer_ids.cpp (new file, 38 lines)
@ -0,0 +1,38 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "snippets/lowered/pass/normalize_buffer_ids.hpp"

#include "snippets/op/buffer.hpp"
#include "snippets/itt.hpp"


namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

bool NormalizeBufferIDs::run(LinearIR& linear_ir) {
    OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::NormalizeBufferIDs");

    // [ original Buffer ID -> normalized ]
    std::map<size_t, size_t> buffer_ids;
    for (const auto& expr : linear_ir) {
        const auto op = expr->get_node();
        if (const auto buffer = ov::as_type_ptr<op::Buffer>(op)) {
            const auto buffer_id = buffer->get_id();
            if (buffer_ids.count(buffer_id) == 0) {
                const auto new_id = buffer_ids.size();
                buffer_ids[buffer_id] = new_id;
            }
            buffer->set_id(buffer_ids[buffer_id]);
        }
    }
    return buffer_ids.size();
}

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
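The remapping above is easy to see on a toy input. A standalone sketch, assuming coloring left the sparse IDs {5, 9, 5, 12}:

    #include <cstddef>
    #include <iostream>
    #include <map>
    #include <vector>

    int main() {
        std::vector<size_t> ids = {5, 9, 5, 12};  // sparse IDs after coloring
        std::map<size_t, size_t> remap;           // original ID -> normalized
        for (auto& id : ids) {
            if (remap.count(id) == 0) {
                const size_t new_id = remap.size();  // next dense ID
                remap[id] = new_id;
            }
            id = remap[id];
        }
        for (const size_t id : ids)
            std::cout << id << " ";  // prints: 0 1 0 2
        std::cout << "\n";
    }
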
src/common/snippets/src/lowered/pass/solve_buffer_memory.cpp (new file, 89 lines)
@ -0,0 +1,89 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "snippets/lowered/pass/solve_buffer_memory.hpp"

#include "snippets/pass/tokenization.hpp"
#include "snippets/utils.hpp"
#include "snippets/itt.hpp"


namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

std::vector<ov::MemorySolver::Box> SolveBufferMemory::init_boxes(const AllocateBuffers::BufferClusters& buffer_clusters) {
    std::vector<ov::MemorySolver::Box> boxes;
    const auto count = static_cast<int>(buffer_clusters.size());
    for (int i = 0; i < count; i++) {
        ov::MemorySolver::Box box = { std::numeric_limits<int>::max(), 0, 0, i };
        int64_t box_size = 0;
        for (const auto& buffer_expr : buffer_clusters[i]) {
            int e_start = 0, e_finish = 0;
            const auto buffer = ov::as_type_ptr<ov::snippets::op::Buffer>(buffer_expr->get_node());
            OPENVINO_ASSERT(buffer != nullptr, "BufferSolver expects Buffer ops in clusters");

            // life finish time - order of LoopEnd / MemoryAccess ops
            const auto buffer_outs = buffer_expr->get_output_port_connectors();
            for (const auto& buffer_out : buffer_outs) {
                const auto consumers = buffer_out->get_consumers();
                for (const auto& consumer : consumers) {
                    const auto consumer_order = static_cast<int>(ov::snippets::pass::GetTopologicalOrder(consumer.get_expr()->get_node()));
                    e_finish = std::max(e_finish, consumer_order);  // the last consumer
                }
            }
            e_start = e_finish;

            const auto buffer_ins = buffer_expr->get_input_port_connectors();
            for (const auto& buffer_in : buffer_ins) {
                const auto& source = buffer_in->get_source();
                e_start = static_cast<int>(ov::snippets::pass::GetTopologicalOrder(source.get_expr()->get_node()));

                const auto buffer_siblings = buffer_in->get_consumers();
                for (const auto& sibling : buffer_siblings) {
                    if (const auto loop_end = ov::as_type_ptr<ov::snippets::op::LoopEnd>(sibling.get_expr()->get_node())) {
                        e_start = std::min(e_start, static_cast<int>(ov::snippets::pass::GetTopologicalOrder(loop_end->get_loop_begin())));
                    }
                }
            }
            OPENVINO_ASSERT(e_start <= e_finish, "Incorrect life time of buffer!");

            auto buffer_size = static_cast<int64_t>(buffer->get_byte_size());
            box_size = std::max(buffer_size, box_size);

            box.start = std::min(e_start, box.start);
            box.finish = std::max(e_finish, box.finish);
        }

        // We use data alignment to put data in the line cache
        box.size = utils::div_up(box_size, m_alignment);
        boxes.push_back(box);
    }
    return boxes;
}


bool SolveBufferMemory::run(LinearIR& linear_ir) {
    OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::SolveBufferMemory");

    const auto boxes = init_boxes(m_clusters);

    ov::MemorySolver memSolver(boxes);
    m_buffer_scratchpad_size = static_cast<size_t>(memSolver.solve()) * m_alignment;  // alignment in byte

    // Set offsets for Buffers
    for (const auto& box : boxes) {
        for (const auto& buffer : m_clusters[box.id]) {
            const auto offset = static_cast<size_t>(memSolver.get_offset(static_cast<int>(box.id)));
            AllocateBuffers::set_buffer_offset(buffer, offset * m_alignment);  // alignment in byte
        }
    }
    return m_buffer_scratchpad_size > 0;
}

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
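The Box-based interface used above can be exercised on its own. A small standalone sketch using only the ov::MemorySolver API shown in this patch (Box = {start, finish, size, id}); the exact total depends on the solver, hence the hedged expectation:

    #include <iostream>
    #include <vector>

    #include "openvino/runtime/memory_solver.hpp"

    int main() {
        // Boxes 0 and 1 have disjoint lifetimes, so the solver may place them
        // at the same offset; box 2 overlaps both and needs its own region.
        std::vector<ov::MemorySolver::Box> boxes = {
            {0, 1, 64, 0},  // start, finish, size, id
            {2, 3, 64, 1},
            {0, 3, 32, 2},
        };
        ov::MemorySolver solver(boxes);
        std::cout << "total size: " << solver.solve() << "\n";  // at most 96
        for (int id = 0; id < 3; ++id)
            std::cout << "box " << id << " offset: " << solver.get_offset(id) << "\n";
    }
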
@ -39,7 +39,6 @@
#include "snippets/lowered/pass/move_scalar_to_consumer.hpp"
#include "snippets/lowered/pass/move_result_out_of_loop.hpp"
#include "snippets/lowered/pass/clean_repeated_ptr_shifts.hpp"
#include "snippets/lowered/pass/identify_buffers.hpp"
#include "snippets/lowered/pass/validate_loops.hpp"
#include "snippets/lowered/pass/insert_loops.hpp"
#include "snippets/lowered/pass/optimize_domain.hpp"
@ -453,19 +452,12 @@ void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir,

    backend_passes_post_common.run(linear_ir);

    const auto buffer_allocation_pass = std::make_shared<lowered::pass::AllocateBuffers>();
    lowered::pass::PassPipeline buffer_pipeline;
    buffer_pipeline.register_pass<lowered::pass::IdentifyBuffers>();
    buffer_pipeline.register_pass<lowered::pass::CleanRepeatedDataPointerShifts>();
    buffer_pipeline.register_pass(buffer_allocation_pass);
    buffer_pipeline.run(linear_ir);

    lowered::pass::PassPipeline final_pipeline;
    final_pipeline.register_pass<lowered::pass::AllocateBuffers>(lowering_result.buffer_scratchpad_size, linear_ir.get_config().m_are_buffers_optimized);
    final_pipeline.register_pass<lowered::pass::CleanRepeatedDataPointerShifts>();
    final_pipeline.register_pass<lowered::pass::PropagateLayout>();
    final_pipeline.register_pass<lowered::pass::CleanupLoopOffsets>();
    final_pipeline.run(linear_ir);

    lowering_result.buffer_scratchpad_size = buffer_allocation_pass->get_scratchpad_size();
}

snippets::Schedule Subgraph::generate(const BlockedShapeVector& blocked_input_shapes,
@ -0,0 +1,61 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <common_test_utils/ov_test_utils.hpp>

#include "snippets/op/brgemm.hpp"
#include "snippets/lowered/pass/pass.hpp"

namespace ov {
namespace test {
namespace snippets {

typedef std::tuple<
    bool,    // Optimized pipeline
    bool,    // With SplitLoops opt
    size_t,  // Expected Buffer size in bytes
    size_t   // Expected unique Buffer IDs count
> BufferAllocationParams;

class BufferAllocationTest : public testing::TestWithParam<BufferAllocationParams> {
public:
    using VectorDims = ov::snippets::VectorDims;
    static std::string getTestCaseName(testing::TestParamInfo<BufferAllocationParams> obj);

protected:
    void SetUp() override;
    void ApplyTransformations(bool is_optimized, bool with_split_loops);
    void Validate();

    virtual std::shared_ptr<ov::Model> GetModel() const = 0;

    static void MarkOp(const std::shared_ptr<ov::Node>& node, const std::vector<size_t>& subtensor);

    size_t m_buffer_scratchpad = 0;
    ov::snippets::lowered::LinearIR m_linear_ir;

    size_t m_expected_size = 0;
    size_t m_expected_count = 0;

    size_t m_loop_depth = 2;
    size_t m_vector_size = 16;
};

class EltwiseBufferAllocationTest : public BufferAllocationTest {
protected:
    std::shared_ptr<ov::Model> GetModel() const override;
};

class MHABufferAllocationTest : public BufferAllocationTest {
protected:
    std::shared_ptr<ov::Model> GetModel() const override;

    static void MarkBrgemm(const std::shared_ptr<ov::snippets::op::Brgemm>& node, const std::vector<size_t>& subtensor);
};

} // namespace snippets
} // namespace test
} // namespace ov

src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp (new file, 213 lines)
@ -0,0 +1,213 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "lowered/pass/buffer_allocation.hpp"

#include "openvino/opsets/opset.hpp"
#include "snippets/snippets_isa.hpp"
#include "snippets/lowered/linear_ir.hpp"
#include "snippets/lowered/pass/mark_loops.hpp"
#include "snippets/lowered/pass/init_loops.hpp"
#include "snippets/lowered/pass/insert_load_store.hpp"
#include "snippets/lowered/pass/validate_loops.hpp"
#include "snippets/lowered/pass/insert_loops.hpp"
#include "snippets/lowered/pass/allocate_buffers.hpp"
#include "snippets/lowered/pass/fuse_loops.hpp"
#include "snippets/lowered/pass/split_loops.hpp"
#include "snippets/lowered/pass/insert_buffers.hpp"
#include "snippets/lowered/pass/softmax_decomposition.hpp"

#include "common_test_utils/common_utils.hpp"


namespace ov {
namespace test {
namespace snippets {

std::string BufferAllocationTest::getTestCaseName(testing::TestParamInfo<ov::test::snippets::BufferAllocationParams> obj) {
    bool is_optimized, with_split_loops;
    size_t expected_size, expected_count;

    std::tie(is_optimized, with_split_loops, expected_size, expected_count) = obj.param;

    std::ostringstream result;
    result << "Opt=" << ov::test::utils::bool2str(is_optimized) << "_";
    result << "Split=" << ov::test::utils::bool2str(with_split_loops) << "_";
    result << "ExpBufferSize=" << expected_size << "_";
    result << "ExpBufferNum=" << expected_count;
    return result.str();
}

void BufferAllocationTest::SetUp() {
    bool is_optimized, with_split_loops;
    std::tie(is_optimized, with_split_loops, m_expected_size, m_expected_count) = this->GetParam();

    const auto body = GetModel();
    m_linear_ir = ov::snippets::lowered::LinearIR(body, std::make_shared<ov::snippets::IShapeInferSnippetsFactory>());
    m_linear_ir.set_loop_depth(m_loop_depth);
    ApplyTransformations(is_optimized, with_split_loops);
}

void BufferAllocationTest::MarkOp(const std::shared_ptr<ov::Node>& node, const std::vector<size_t>& subtensor) {
    for (const auto& input : node->inputs())
        ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
            input, std::make_shared<ov::snippets::lowered::PortDescriptor>(input, subtensor));
    for (const auto& output : node->outputs())
        ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
            output, std::make_shared<ov::snippets::lowered::PortDescriptor>(output, subtensor));
}

void BufferAllocationTest::ApplyTransformations(bool is_optimized, bool with_split) {
    ov::snippets::lowered::pass::PassPipeline pipeline;
    pipeline.register_pass<ov::snippets::lowered::pass::MarkLoops>(m_vector_size);
    pipeline.register_pass<ov::snippets::lowered::pass::SoftmaxDecomposition>(m_vector_size);
    pipeline.register_pass<ov::snippets::lowered::pass::FuseLoops>();
    if (with_split)
        pipeline.register_pass<ov::snippets::lowered::pass::SplitLoops>();
    pipeline.register_pass<ov::snippets::lowered::pass::InsertBuffers>(2);
    pipeline.register_pass<ov::snippets::lowered::pass::InsertLoadStore>(m_vector_size);
    pipeline.register_pass<ov::snippets::lowered::pass::InitLoops>();
    pipeline.register_pass<ov::snippets::lowered::pass::InsertLoops>();
    pipeline.register_pass<ov::snippets::lowered::pass::AllocateBuffers>(m_buffer_scratchpad, is_optimized);
    pipeline.run(m_linear_ir);
}

void BufferAllocationTest::Validate() {
    std::set<size_t> gprs;
    for (const auto& expr : m_linear_ir) {
        if (const auto buffer = ov::as_type_ptr<ov::snippets::op::Buffer>(expr->get_node())) {
            gprs.insert(buffer->get_id());
        }
    }
    EXPECT_EQ(gprs.size(), m_expected_count);
    EXPECT_EQ(m_buffer_scratchpad, m_expected_size);
}

std::shared_ptr<ov::Model> EltwiseBufferAllocationTest::GetModel() const {
    const auto subtensor_eltwise = std::vector<size_t>{1, m_vector_size};
    const auto subtensor_buffer = std::vector<size_t>{ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM,
                                                      ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM};

    const auto parameter0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape({1, 3, 100, 100}));
    const auto parameter1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape({1, 3, 100, 100}));
    const auto add = std::make_shared<ov::op::v1::Add>(parameter0, parameter1);
    const auto buffer0 = std::make_shared<ov::snippets::op::Buffer>(add, static_cast<int32_t>(subtensor_buffer.size()));
    const auto relu = std::make_shared<ov::op::v0::Relu>(buffer0);
    const auto buffer1 = std::make_shared<ov::snippets::op::Buffer>(relu, static_cast<int32_t>(subtensor_buffer.size()));
    const auto exp = std::make_shared<ov::op::v0::Exp>(buffer1);
    const auto body = std::make_shared<ov::Model>(std::make_shared<ov::op::v0::Result>(exp), ov::ParameterVector{parameter0, parameter1});

    MarkOp(add, subtensor_eltwise);
    MarkOp(relu, subtensor_eltwise);
    MarkOp(exp, subtensor_eltwise);
    MarkOp(buffer0, subtensor_buffer);
    MarkOp(buffer1, subtensor_buffer);

    return body;
}

void MHABufferAllocationTest::MarkBrgemm(const std::shared_ptr<ov::snippets::op::Brgemm>& node, const std::vector<size_t>& subtensor) {
    const auto subtensor_full = std::vector<size_t>{ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM,
                                                    ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM};
    ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
        node->input(0), std::make_shared<ov::snippets::lowered::PortDescriptor>(node->input(0), subtensor));
    ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
        node->input(1), std::make_shared<ov::snippets::lowered::PortDescriptor>(node->input(1), subtensor_full));
    ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
        node->output(0), std::make_shared<ov::snippets::lowered::PortDescriptor>(node->output(0), subtensor));
}

std::shared_ptr<ov::Model> MHABufferAllocationTest::GetModel() const {
    const auto subtensor_scalar = std::vector<size_t>{1, 1};
    const auto subtensor_eltwise = std::vector<size_t>{1, m_vector_size};
    const auto subtensor_brgemm = std::vector<size_t>{32, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM};
    const auto subtensor_softmax = std::vector<size_t>{1, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM};

    const auto parameter0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape({1, 12, 128, 64}));
    const auto parameter1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape({1, 128, 12, 64}));
    const auto parameter2 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape({1, 12, 128, 64}));

    const auto load_reshape = std::make_shared<ov::snippets::op::LoadReshape>(parameter1, 1, 0, std::vector<size_t>{0, 2, 3, 1});
    const auto store = std::make_shared<ov::snippets::op::Store>(load_reshape);
    const auto relu0 = std::make_shared<ov::op::v0::Relu>(store);
    const auto matmul0 = std::make_shared<ov::snippets::op::Brgemm>(parameter0, relu0);
    const auto relu1 = std::make_shared<ov::op::v0::Relu>(matmul0);
    const auto softmax = std::make_shared<ov::op::v1::Softmax>(relu1, 3);
    const auto matmul1 = std::make_shared<ov::snippets::op::Brgemm>(softmax, parameter2);
    const auto relu2 = std::make_shared<ov::op::v0::Relu>(matmul1);

    const auto body = std::make_shared<ov::Model>(std::make_shared<ov::op::v0::Result>(relu2), ov::ParameterVector{parameter0, parameter1, parameter2});

    MarkOp(load_reshape, subtensor_scalar);
    MarkOp(store, subtensor_scalar);
    MarkOp(softmax, subtensor_softmax);

    MarkBrgemm(matmul0, subtensor_brgemm);
    MarkBrgemm(matmul1, subtensor_brgemm);

    return body;
}

TEST_P(EltwiseBufferAllocationTest, BufferAllocation) {
    Validate();
}
TEST_P(MHABufferAllocationTest, BufferAllocation) {
    Validate();
}

namespace BufferAllocationTest_Instances {

INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_EltwiseNotOptimized, EltwiseBufferAllocationTest,
                         ::testing::Combine(
                             ::testing::Values(false),
                             ::testing::Values(false),  // in this test it doesn't make sense
                             ::testing::Values(80000),  // Each Buffer has its own allocated memory
                             ::testing::Values(2)),     // Each Buffer has a unique ID
                         BufferAllocationTest::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_EltwiseOptimized, EltwiseBufferAllocationTest,
                         ::testing::Combine(
                             ::testing::Values(true),
                             ::testing::Values(false),  // in this test it doesn't make sense
                             ::testing::Values(40000),  // Two Buffers reuse memory
                             ::testing::Values(1)),     // Two Buffers reuse IDs
                         BufferAllocationTest::getTestCaseName);

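For reference on the Eltwise numbers: each Buffer is marked with a FULL_DIM x FULL_DIM subtensor over the two innermost dimensions of a {1, 3, 100, 100} f32 tensor, i.e. 100 * 100 * 4 = 40000 bytes per Buffer. The non-optimized run therefore reserves 80000 bytes for two Buffers with distinct IDs, while the optimized run lets both Buffers share one 40000-byte region and a single ID.
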
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWSplit, MHABufferAllocationTest,
                         ::testing::Combine(
                             ::testing::Values(false),
                             ::testing::Values(true),
                             ::testing::Values(139264),  // Each Buffer has its own allocated memory
                             ::testing::Values(7)),      // Each Buffer has a unique ID
                         BufferAllocationTest::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWSplit, MHABufferAllocationTest,
                         ::testing::Combine(
                             ::testing::Values(true),
                             ::testing::Values(true),
                             ::testing::Values(57344),  // (Buffer before brgemm) + (between brgemms) + (after brgemm)
                             ::testing::Values(3)),     // (Buffer before brgemm) + (between brgemms) + (after brgemm)
                         BufferAllocationTest::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWOSplit, MHABufferAllocationTest,
                         ::testing::Combine(
                             ::testing::Values(false),
                             ::testing::Values(false),
                             ::testing::Values(360448),  // Each Buffer has its own allocated memory
                             ::testing::Values(7)),      // Each Buffer has a unique ID
                         BufferAllocationTest::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWOSplit, MHABufferAllocationTest,
                         ::testing::Combine(
                             ::testing::Values(true),
                             ::testing::Values(false),
                             ::testing::Values(98304),  // (between brgemms) + (Buffer before brgemm0 and after brgemm1)
                             ::testing::Values(2)),     // (Buffer before brgemm0 and after brgemm1) + (between brgemms)
                         BufferAllocationTest::getTestCaseName);

} // namespace BufferAllocationTest_Instances
} // namespace snippets
} // namespace test
} // namespace ov
@ -8,13 +8,16 @@
 */
#pragma once

#include <ie_common.h>
#include <stdint.h>

#include <algorithm>
#include <map>
#include <vector>

#include "openvino/core/except.hpp"

namespace ov {

/**
 * @brief Helps to solve issue of optimal memory allocation only for particular
 * execution order.
@ -42,7 +45,6 @@
 * Exec order is predefined.
 */

IE_SUPPRESS_DEPRECATED_START
class MemorySolver {
public:
    /** @brief Representation of edge (size and live time)*/
@ -67,7 +69,7 @@ public:
    /** @brief Performs inplace normalization of the input boxes
     *  @return lifespan of all boxes
     */
    static int normalizeBoxes(std::vector<Box>& boxes) {
    static int normalize_boxes(std::vector<Box>& boxes) {
        int max_ts = 0;
        for (const Box& box : boxes)
            max_ts = std::max(std::max(max_ts, box.start), box.finish);
@ -113,10 +115,10 @@ public:
        // 2. Box.finish >= Box.start (except Box.finish == -1)
        // 3. Box.size > 0 (or == 0 ?)
        // 4. Box.id == any unique value
        _time_duration = normalizeBoxes(_boxes);
        _time_duration = normalize_boxes(_boxes);
    }

    inline bool popupTogetherWith(MemorySolver::Box& box_new, const MemorySolver::Box& box_old) {
    inline bool popup_together_with(MemorySolver::Box& box_new, const MemorySolver::Box& box_old) {
        if (box_new.id + box_new.size > box_old.id && box_old.id + box_old.size > box_new.id) {
            // Move the new one up. There is an intersection
            box_new.id = box_old.id + box_old.size;
@ -131,7 +133,7 @@ public:
     * @return Size of common memory blob required for storing all
     */
    int64_t solve() {
        maxTopDepth();  // at first make sure that we no need more for boxes sorted by box.start
        max_top_depth();  // at first make sure that we no need more for boxes sorted by box.start
        std::vector<std::vector<const Box*>> time_slots(_time_duration);
        for (auto& slot : time_slots)
            slot.reserve(_top_depth);  // 2D array [_time_duration][_top_depth]
@ -155,8 +157,8 @@ public:
                for (auto* box_in_slot : time_slots[i_slot]) {
                    // intersect with already stored boxes for all covered time slots
                    // and move up the new one if needed
                    // Execution of 'popupTogetherWith' is important even if 'popped_up' is already 'true'
                    popped_up = popupTogetherWith(box, *box_in_slot) || popped_up;
                    // Execution of 'popup_together_with' is important even if 'popped_up' is already 'true'
                    popped_up = popup_together_with(box, *box_in_slot) || popped_up;
                }
            }
        } while (popped_up);
@ -174,23 +176,23 @@ public:
    }

    /** Provides calculated offset for specified box id */
    int64_t getOffset(int id) const {
    int64_t get_offset(int id) const {
        auto res = _offsets.find(id);
        if (res == _offsets.end())
            IE_THROW() << "There are no box for provided ID";
            OPENVINO_THROW("There are no box for provided ID");
        return res->second;
    }

    /** Additional info. Max sum of box sizes required for any time stamp. */
    int64_t maxDepth() {
    int64_t max_depth() {
        if (_depth == -1)
            calcDepth();
            calc_depth();
        return _depth;
    }
    /** Additional info. Max num of boxes required for any time stamp. */
    int64_t maxTopDepth() {
    int64_t max_top_depth() {
        if (_top_depth == -1)
            calcDepth();
            calc_depth();
        return _top_depth;
    }

@ -201,7 +203,7 @@ private:
    int64_t _depth = -1;
    int _time_duration = -1;

    void calcDepth() {
    void calc_depth() {
        int64_t top_depth = 0;
        int64_t depth = 0;
        std::map<int64_t, std::vector<const Box*>> release_at;
@ -218,11 +220,12 @@ private:
                top_depth--;
            }
            release_at.erase(time);
            IE_ASSERT(top_depth > 0);
            OPENVINO_ASSERT(top_depth > 0);

            _top_depth = std::max(_top_depth, top_depth);
            _depth = std::max(_depth, depth);
        }
    }
};
IE_SUPPRESS_DEPRECATED_END

} // namespace ov
@ -2,33 +2,33 @@
// SPDX-License-Identifier: Apache-2.0
//

#include "memory_solver.hpp"
#include "openvino/runtime/memory_solver.hpp"

#include <gtest/gtest.h>

#include <vector>

using Box = MemorySolver::Box;
using Box = ov::MemorySolver::Box;

TEST(MemSolverTest, CanConstruct) {
    {  // Empty vector<Box>
        MemorySolver ms(std::vector<Box>{});
        ov::MemorySolver ms(std::vector<Box>{});
    }

    {  // vector with default Box
        MemorySolver ms(std::vector<Box>{{}});
        ov::MemorySolver ms(std::vector<Box>{{}});
    }

    {  // vector with Box with non-default Box
        MemorySolver ms(std::vector<Box>{{1, 3, 3}});
        ov::MemorySolver ms(std::vector<Box>{{1, 3, 3}});
    }

    {  // vector with Box with size == 0
        MemorySolver ms(std::vector<Box>{{0, 0, 0}});
        ov::MemorySolver ms(std::vector<Box>{{0, 0, 0}});
    }

    {  // vector with Box with finish == -1
        MemorySolver ms(std::vector<Box>{{3, -1, 6}});
        ov::MemorySolver ms(std::vector<Box>{{3, -1, 6}});
    }

    // TODO: enable after implement TODO from memory_solver.hpp#L66
@ -42,7 +42,7 @@ TEST(MemSolverTest, CanConstruct) {
//  |      __|____||____|
//  |__|____||____|_____
//      0  1  2  3  4
TEST(MemSolverTest, GetOffset) {
TEST(MemSolverTest, get_offset) {
    int n = 0;
    std::vector<Box> boxes{
        {n, ++n, 2, 0},
@ -51,13 +51,13 @@ TEST(MemSolverTest, GetOffset) {
        {n, ++n, 2, 3},
    };

    MemorySolver ms(boxes);
    ov::MemorySolver ms(boxes);
    ms.solve();

    // The correct answer is [0, 2, 0, 2] or [2, 0, 2, 0].
    EXPECT_EQ(ms.getOffset(0) + ms.getOffset(1), 2);
    EXPECT_EQ(ms.getOffset(1) + ms.getOffset(2), 2);
    EXPECT_EQ(ms.getOffset(2) + ms.getOffset(3), 2);
    EXPECT_EQ(ms.get_offset(0) + ms.get_offset(1), 2);
    EXPECT_EQ(ms.get_offset(1) + ms.get_offset(2), 2);
    EXPECT_EQ(ms.get_offset(2) + ms.get_offset(3), 2);
}

//  |
@ -65,7 +65,7 @@ TEST(MemSolverTest, GetOffset) {
//  |      __|____||____|
//  |__|____||____|_____
//      0  1  2  3  4
TEST(MemSolverTest, GetOffsetThrowException) {
TEST(MemSolverTest, get_offsetThrowException) {
    int n = 0, id = 0;
    std::vector<Box> boxes{
        {n, ++n, 2, id++},
@ -74,10 +74,10 @@ TEST(MemSolverTest, GetOffsetThrowException) {
        {n, ++n, 2, id++},
    };

    MemorySolver ms(boxes);
    ov::MemorySolver ms(boxes);
    ms.solve();

    EXPECT_THROW(ms.getOffset(100), std::runtime_error);
    EXPECT_THROW(ms.get_offset(100), std::runtime_error);
}

//  |
@ -93,10 +93,10 @@ TEST(MemSolverTest, LinearAndEven) {
        {n, ++n, 2},
    };

    MemorySolver ms(boxes);
    ov::MemorySolver ms(boxes);
    EXPECT_EQ(ms.solve(), 4);
    EXPECT_EQ(ms.maxDepth(), 4);
    EXPECT_EQ(ms.maxTopDepth(), 2);
    EXPECT_EQ(ms.max_depth(), 4);
    EXPECT_EQ(ms.max_top_depth(), 2);
}

//  |      ____
@ -112,10 +112,10 @@ TEST(MemSolverTest, LinearAndNotEven) {
        {n, ++n, 3},
    };

    MemorySolver ms(boxes);
    ov::MemorySolver ms(boxes);
    EXPECT_EQ(ms.solve(), 5);
    EXPECT_EQ(ms.maxDepth(), 5);
    EXPECT_EQ(ms.maxTopDepth(), 2);
    EXPECT_EQ(ms.max_depth(), 5);
    EXPECT_EQ(ms.max_top_depth(), 2);
}

//  |         _______
@ -131,10 +131,10 @@ TEST(MemSolverTest, LinearWithEmptyExecIndexes) {
        {n, n += 2, 3},
    };

    MemorySolver ms(boxes);
    ov::MemorySolver ms(boxes);
    EXPECT_EQ(ms.solve(), 5);
    EXPECT_EQ(ms.maxDepth(), 5);
    EXPECT_EQ(ms.maxTopDepth(), 2);
    EXPECT_EQ(ms.max_depth(), 5);
    EXPECT_EQ(ms.max_top_depth(), 2);
}

//  |      __________
@ -150,10 +150,10 @@ TEST(MemSolverTest, DISABLED_Unefficiency) {
        {2, 3, 2},
    };

    MemorySolver ms(boxes);
    ov::MemorySolver ms(boxes);
    EXPECT_EQ(ms.solve(), 5);  // currently we have answer 6
    EXPECT_EQ(ms.maxDepth(), 5);
    EXPECT_EQ(ms.maxTopDepth(), 2);
    EXPECT_EQ(ms.max_depth(), 5);
    EXPECT_EQ(ms.max_top_depth(), 2);
}

//  |      __________
@ -169,10 +169,10 @@ TEST(MemSolverTest, OverlappingBoxes) {
        {2, 3, 2},
    };

    MemorySolver ms(boxes);
    ov::MemorySolver ms(boxes);
    EXPECT_EQ(ms.solve(), 6);
    EXPECT_EQ(ms.maxDepth(), 6);
    EXPECT_EQ(ms.maxTopDepth(), 2);
    EXPECT_EQ(ms.max_depth(), 6);
    EXPECT_EQ(ms.max_top_depth(), 2);
}

//  |      ____
@ -190,10 +190,10 @@ TEST(MemSolverTest, EndOnSeveralBegins) {
        {3, 4, 2},
    };

    MemorySolver ms(boxes);
    ov::MemorySolver ms(boxes);
    EXPECT_EQ(ms.solve(), 6);
    EXPECT_EQ(ms.maxDepth(), 6);
    EXPECT_EQ(ms.maxTopDepth(), 3);
    EXPECT_EQ(ms.max_depth(), 6);
    EXPECT_EQ(ms.max_top_depth(), 3);
}

//  |         _____________
@ -211,10 +211,10 @@ TEST(MemSolverTest, ToEndBoxes) {
        {3, 4, 2},
    };

    MemorySolver ms(boxes);
    ov::MemorySolver ms(boxes);
    EXPECT_EQ(ms.solve(), 8);
    EXPECT_EQ(ms.maxDepth(), 8);
    EXPECT_EQ(ms.maxTopDepth(), 4);
    EXPECT_EQ(ms.max_depth(), 8);
    EXPECT_EQ(ms.max_top_depth(), 4);
}

//  |            _
@ -232,10 +232,10 @@ TEST(MemSolverTest, LastAndToEndBox) {
        {3, 4, 2},
    };

    MemorySolver ms(boxes);
    ov::MemorySolver ms(boxes);
    EXPECT_EQ(ms.solve(), 6);
    EXPECT_EQ(ms.maxDepth(), 6);
    EXPECT_EQ(ms.maxTopDepth(), 3);
    EXPECT_EQ(ms.max_depth(), 6);
    EXPECT_EQ(ms.max_top_depth(), 3);
}

TEST(MemSolverTest, OptimalAlexnet) {
@ -269,10 +269,10 @@ TEST(MemSolverTest, OptimalAlexnet) {
    for (const auto& sh : shapes)
        boxes.push_back({n, ++n, sh[0] * sh[1] * sh[2]});

    // For linear topology bottom score is reachable minRequired == maxDepth
    MemorySolver ms(boxes);
    EXPECT_EQ(ms.solve(), ms.maxDepth());
    EXPECT_EQ(ms.maxTopDepth(), 2);
    // For linear topology bottom score is reachable minRequired == max_depth
    ov::MemorySolver ms(boxes);
    EXPECT_EQ(ms.solve(), ms.max_depth());
    EXPECT_EQ(ms.max_top_depth(), 2);
}

//  |         _____________
@ -290,14 +290,14 @@ TEST(MemSolverTest, NoOverlapping) {
        {2, 4, 2, n++},
    };

    MemorySolver ms(boxes);
    ov::MemorySolver ms(boxes);
    ms.solve();
    // TODO: Current algorithm doesn't solve that case. Uncomment check to see inefficiency
    // EXPECT_EQ(ms.solve(), 5);

    auto no_overlap = [&](Box box1, Box box2) -> bool {
        int64_t off1 = ms.getOffset(static_cast<int>(box1.id));
        int64_t off2 = ms.getOffset(static_cast<int>(box2.id));
        int64_t off1 = ms.get_offset(static_cast<int>(box1.id));
        int64_t off2 = ms.get_offset(static_cast<int>(box2.id));
        return box1.finish < box2.start || box1.start > box2.finish || off1 + box1.size <= off2 ||
               off1 >= off2 + box2.size;
    };
@ -322,12 +322,12 @@ TEST(MemSolverTest, BestSolution1) {
        {6, 7, 3, n++},
    };

    MemorySolver ms(boxes);
    ov::MemorySolver ms(boxes);
    EXPECT_EQ(ms.solve(), 5);

    auto no_overlap = [&](Box box1, Box box2) -> bool {
        int64_t off1 = ms.getOffset(static_cast<int>(box1.id));
        int64_t off2 = ms.getOffset(static_cast<int>(box2.id));
        int64_t off1 = ms.get_offset(static_cast<int>(box1.id));
        int64_t off2 = ms.get_offset(static_cast<int>(box2.id));
        return box1.finish < box2.start || box1.start > box2.finish || off1 + box1.size <= off2 ||
               off1 >= off2 + box2.size;
    };
@ -28,7 +28,6 @@
#include "low_precision/low_precision.hpp"
#include "memory_desc/cpu_memory_desc_utils.h"
#include "memory_desc/dnnl_blocked_memory_desc.h"
#include "memory_solver.hpp"
#include "nodes/common/cpu_convert.h"
#include "nodes/common/cpu_memcpy.h"
#include "nodes/convert.h"
@ -50,6 +49,8 @@
#include "utils/verbose.h"
#include "memory_desc/cpu_memory_desc_utils.h"

#include "openvino/runtime/memory_solver.hpp"

#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
# include <tbb/task.h>
#endif
@ -629,10 +630,10 @@ void Graph::AllocateWithReuse() {
    const int64_t alignment = 32;  // 32 bytes

    // Markup the boxes
    std::vector<MemorySolver::Box> definedBoxes;
    std::vector<MemorySolver::Box> undefinedBoxes;
    std::vector<ov::MemorySolver::Box> definedBoxes;
    std::vector<ov::MemorySolver::Box> undefinedBoxes;
    for (size_t i = 0; i < remaining_edge_clusters_count; i++) {
        MemorySolver::Box box = { std::numeric_limits<int>::max(), 0, 0, static_cast<int64_t>(i) };
        ov::MemorySolver::Box box = { std::numeric_limits<int>::max(), 0, 0, static_cast<int64_t>(i) };
        int64_t boxSize = 0;
        for (auto& edge : edge_clusters[i]) {
            int e_start = edge->getParent()->execIndex;
@ -679,7 +680,7 @@ void Graph::AllocateWithReuse() {
    }

    // Process defined boxes (static shapes)
    MemorySolver staticMemSolver(definedBoxes);
    ov::MemorySolver staticMemSolver(definedBoxes);
    size_t total_size = static_cast<size_t>(staticMemSolver.solve()) * alignment;

    memWorkspace = std::make_shared<Memory>(getEngine(), DnnlBlockedMemoryDesc(ov::element::i8, Shape(VectorDims{total_size})));
@ -693,7 +694,7 @@ void Graph::AllocateWithReuse() {
    int count = 0;
    for (auto& edge : edge_clusters[box.id]) {
        if (edge->getStatus() == Edge::Status::NeedAllocation) {
            int64_t offset = staticMemSolver.getOffset(box.id);
            int64_t offset = staticMemSolver.get_offset(box.id);
            // !! Fallback to individual memory allocation !!
            // if you like to check infer without reuse just call this function without arguments.
            edge->allocate(workspace_ptr + offset * alignment);  // alignment in byte
@ -762,9 +763,9 @@ void Graph::AllocateWithReuse() {
        }
    }

    MemorySolver::normalizeBoxes(undefinedBoxes);
    ov::MemorySolver::normalize_boxes(undefinedBoxes);

    std::vector<std::vector<MemorySolver::Box>> groups;  // groups of nonoverlapping boxes
    std::vector<std::vector<ov::MemorySolver::Box>> groups;  // groups of nonoverlapping boxes
    constexpr bool enableMemReuse = true;  // set false to disable mem reuse for debug purposes
    if (enableMemReuse) {
        groups.push_back({undefinedBoxes.front()});
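The alignment convention above (and in SolveBufferMemory) is worth a worked example: box sizes are expressed in units of `alignment` bytes, so the solver's result must be scaled back to bytes. A standalone sketch with a hypothetical div_up helper:

    #include <cstdint>
    #include <iostream>

    // Round x up to a whole number of `unit`-byte blocks.
    static int64_t div_up(int64_t x, int64_t unit) {
        return (x + unit - 1) / unit;
    }

    int main() {
        const int64_t alignment = 32;   // bytes, as in AllocateWithReuse
        const int64_t byte_size = 100;  // raw buffer size in bytes
        const int64_t box_size = div_up(byte_size, alignment);  // 4 units
        std::cout << "box size in units: " << box_size
                  << ", bytes reserved: " << box_size * alignment << "\n";  // 128
    }
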
@ -22,6 +22,20 @@ using LoopPort = LoopManager::LoopPort;

BrgemmBlocking::BrgemmBlocking() : Pass() {}

void BrgemmBlocking::move_new_memory_buffer(snippets::lowered::LinearIR& linear_ir, const snippets::lowered::LinearIR::constExprIt& brgemm_it) {
    const auto& brgemm_expr = brgemm_it->get();
    const auto wsp_expr = brgemm_expr->get_input_port_connector(2)->get_source().get_expr();
    const auto wsp_buffer = ov::as_type_ptr<ov::snippets::op::Buffer>(wsp_expr->get_node());
    OPENVINO_ASSERT(wsp_buffer && wsp_buffer->is_new_memory(), "Incorrect Scratchpad buffer for Brgemm AMX");
    // [115164] Should be fully supported by explicit loops of blocking by K, N
    OPENVINO_ASSERT(brgemm_expr->get_loop_ids().empty() && wsp_expr->get_loop_ids().empty(), "Incorrect blocking loop marking for Brgemm AMX");
    // If the scratchpad with temp memory is not immediately before Brgemm, it needs to be moved there.
    if (wsp_expr != *std::prev(brgemm_it)) {
        const auto wsp_it = linear_ir.find(wsp_expr);
        linear_ir.move(wsp_it, brgemm_it);
    }
}

bool BrgemmBlocking::run(snippets::lowered::LinearIR& linear_ir) {
    OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::BrgemmBlocking")
    if (linear_ir.empty())
@ -64,11 +78,18 @@ bool BrgemmBlocking::run(snippets::lowered::LinearIR& linear_ir) {
        const auto work_amount = m;
        const auto increment = block_size;

        auto loop_begin_it = expr_it, loop_end_it = std::next(expr_it);
        std::vector<LoopPort> entries{LoopPort(expr->get_input_port(0), true), LoopPort(expr->get_input_port(1), false)};
        if (brgemm->is_with_scratchpad())
        // The scratchpad for the AMX scenario is needed only as a temporary buffer for each M block - it means that the Buffer should be inside this loop.
        // Other scratchpads (after BrgemmCopyB) should be outside the loop.
        if (brgemm->is_with_compensations()) {
            entries.emplace_back(expr->get_input_port(2), false);
        } else if (brgemm->is_amx()) {
            move_new_memory_buffer(linear_ir, expr_it);
            loop_begin_it = std::prev(expr_it);
        }
        std::vector<LoopPort> exits{LoopPort(expr->get_output_port(0), true)};
        loop_manager->mark_loop(expr_it, std::next(expr_it), work_amount, increment, dim_idx, entries, exits);
        loop_manager->mark_loop(loop_begin_it, loop_end_it, work_amount, increment, dim_idx, entries, exits);
    }

    return modified;
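How the move works can be illustrated with a plain std::list; this assumes LinearIR::move behaves like a list splice (an assumption about its semantics, which is what the call in move_new_memory_buffer above relies on): the scratchpad expression is relocated directly before its Brgemm consumer.

    #include <algorithm>
    #include <iostream>
    #include <list>

    int main() {
        // W = scratchpad Buffer, G = Brgemm; other letters are unrelated ops.
        std::list<char> ir = {'A', 'W', 'B', 'G'};
        auto wsp_it = std::find(ir.begin(), ir.end(), 'W');
        auto brgemm_it = std::find(ir.begin(), ir.end(), 'G');
        ir.splice(brgemm_it, ir, wsp_it);  // move W directly before G
        for (const char c : ir)
            std::cout << c;
        std::cout << "\n";  // prints: ABWG
    }
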
@ -21,6 +21,9 @@ public:
    OPENVINO_RTTI("BrgemmBlocking", "Pass")
    BrgemmBlocking();
    bool run(snippets::lowered::LinearIR& linear_ir) override;

private:
    static void move_new_memory_buffer(snippets::lowered::LinearIR& linear_ir, const snippets::lowered::LinearIR::constExprIt& brgemm_it);
};

} // namespace pass
@ -0,0 +1,216 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "openvino/opsets/opset.hpp"
#include "snippets/snippets_isa.hpp"
#include "snippets/lowered/linear_ir.hpp"
#include "snippets/lowered/pass/mark_loops.hpp"
#include "snippets/lowered/pass/init_loops.hpp"
#include "snippets/lowered/pass/insert_load_store.hpp"
#include "snippets/lowered/pass/validate_loops.hpp"
#include "snippets/lowered/pass/insert_loops.hpp"
#include "snippets/lowered/pass/allocate_buffers.hpp"
#include "snippets/lowered/pass/fuse_loops.hpp"
#include "snippets/lowered/pass/split_loops.hpp"
#include "snippets/lowered/pass/insert_buffers.hpp"
#include "snippets/lowered/pass/softmax_decomposition.hpp"

#include "transformations/snippets/x64/shape_inference.hpp"
#include "transformations/snippets/x64/pass/lowered/brgemm_blocking.hpp"
#include "transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.hpp"
#include "transformations/snippets/x64/op/brgemm_cpu.hpp"
#include "transformations/snippets/x64/op/brgemm_copy_b.hpp"

#include "common_test_utils/ov_test_utils.hpp"
#include "common_test_utils/common_utils.hpp"


namespace ov {
namespace test {
namespace snippets {

/* Note[74841]:
 * This test is almost a full copy of the BufferAllocationTest class from openvino/src/common/snippets/tests/include/lowered/pass/buffer_allocation.hpp.
 * The BufferAllocationTest class should become a shared test class, so this structure can be reused in backend-specific tests during test infrastructure refactoring.
 */

typedef std::tuple<
    bool,    // Optimized pipeline
    bool,    // With SplitLoops opt
    size_t,  // Expected Buffer size in bytes
    size_t   // Expected unique Buffer IDs count
> BufferAllocationCPUParams;

class BufferAllocationCPUTest : public testing::TestWithParam<BufferAllocationCPUParams> {
public:
    using VectorDims = ov::snippets::VectorDims;
    static std::string getTestCaseName(testing::TestParamInfo<BufferAllocationCPUParams> obj) {
        bool is_optimized, with_split_loops;
        size_t expected_size, expected_count;
        std::tie(is_optimized, with_split_loops, expected_size, expected_count) = obj.param;
        std::ostringstream result;
        result << "Opt=" << ov::test::utils::bool2str(is_optimized) << "_";
        result << "Split=" << ov::test::utils::bool2str(with_split_loops) << "_";
        result << "ExpBufferSize=" << expected_size << "_";
        result << "ExpBufferNum=" << expected_count;
        return result.str();
    }

protected:
    void SetUp() override {
        bool is_optimized, with_split_loops;
        std::tie(is_optimized, with_split_loops, m_expected_size, m_expected_count) = this->GetParam();

        const auto body = GetModel();
        m_linear_ir = ov::snippets::lowered::LinearIR(body, std::make_shared<ov::snippets::CPUShapeInferSnippetsFactory>());
        m_linear_ir.set_loop_depth(m_loop_depth);
        ApplyTransformations(is_optimized, with_split_loops);
    }

    void ApplyTransformations(bool is_optimized, bool with_split_loops) {
        ov::snippets::lowered::pass::PassPipeline pipeline;
        pipeline.register_pass<ov::intel_cpu::pass::BrgemmBlocking>();
        pipeline.register_pass<ov::snippets::lowered::pass::MarkLoops>(m_vector_size);
        pipeline.register_pass<ov::snippets::lowered::pass::SoftmaxDecomposition>(m_vector_size);
        pipeline.register_pass<ov::snippets::lowered::pass::FuseLoops>();
        if (with_split_loops)
            pipeline.register_pass<ov::snippets::lowered::pass::SplitLoops>();
        pipeline.register_pass<ov::snippets::lowered::pass::InsertBuffers>(2);
        pipeline.register_pass<ov::snippets::lowered::pass::InsertLoadStore>(m_vector_size);
        pipeline.register_pass<ov::snippets::lowered::pass::InitLoops>();
        pipeline.register_pass<ov::snippets::lowered::pass::InsertLoops>();
        pipeline.register_pass<ov::intel_cpu::pass::SetBrgemmCopyBBuffersShape>();
        pipeline.register_pass<ov::snippets::lowered::pass::AllocateBuffers>(m_buffer_scratchpad, is_optimized);
        pipeline.run(m_linear_ir);
    }

    void Validate() {
        std::set<size_t> gprs;
        for (const auto& expr : m_linear_ir) {
            if (const auto buffer = ov::as_type_ptr<ov::snippets::op::Buffer>(expr->get_node())) {
                gprs.insert(buffer->get_id());
            }
        }
        EXPECT_EQ(gprs.size(), m_expected_count);
        EXPECT_EQ(m_buffer_scratchpad, m_expected_size);
    }

    virtual std::shared_ptr<ov::Model> GetModel() const = 0;

    void MarkOp(const std::shared_ptr<ov::Node>& node, const std::vector<size_t>& subtensor) const {
        for (const auto& input : node->inputs())
            ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
                input, std::make_shared<ov::snippets::lowered::PortDescriptor>(input, subtensor));
        for (const auto& output : node->outputs())
            ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor_ptr(
                output, std::make_shared<ov::snippets::lowered::PortDescriptor>(output, subtensor));
    }

    size_t m_buffer_scratchpad = 0;
    ov::snippets::lowered::LinearIR m_linear_ir;

    size_t m_expected_size = 0;
    size_t m_expected_count = 0;

    size_t m_loop_depth = 2;
    size_t m_vector_size = 16;
};

class MHABF16AMXBufferAllocationTest : public BufferAllocationCPUTest {
protected:
    std::shared_ptr<ov::Model> GetModel() const override {
        const auto subtensor_scalar = std::vector<size_t>{1, 1};
        const auto subtensor_softmax = std::vector<size_t>{1, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM};
        const auto subtensor_full = std::vector<size_t>(2, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM);

        const auto parameter0 = std::make_shared<ov::op::v0::Parameter>(ov::element::bf16, ov::PartialShape({1, 12, 128, 64}));
        const auto parameter1 = std::make_shared<ov::op::v0::Parameter>(ov::element::bf16, ov::PartialShape({1, 128, 12, 64}));
        const auto parameter2 = std::make_shared<ov::op::v0::Parameter>(ov::element::bf16, ov::PartialShape({1, 12, 128, 64}));

        const auto load_reshape = std::make_shared<ov::snippets::op::LoadReshape>(parameter1, 1, 0, std::vector<size_t>{0, 2, 3, 1});
        const auto store = std::make_shared<ov::snippets::op::Store>(load_reshape);
        const auto convert0 = std::make_shared<ov::snippets::op::ConvertSaturation>(store, ov::element::f32);
        const auto relu0 = std::make_shared<ov::op::v0::Relu>(convert0);
        const auto convert1 = std::make_shared<ov::snippets::op::ConvertSaturation>(relu0, ov::element::bf16);

        const auto brgemm_copyb0 = std::make_shared<ov::intel_cpu::BrgemmCopyB>(
            convert1, ov::element::bf16, ov::intel_cpu::BrgemmCopyB::OnlyRepacking, 0, 0, 0);
        const auto scratch0 = std::make_shared<ov::snippets::op::Buffer>(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE});
        const auto brgemm_cpu0 = std::make_shared<ov::intel_cpu::BrgemmCPU>(
            parameter0, brgemm_copyb0->output(0), scratch0, ov::intel_cpu::BrgemmCPU::Type::AMX);
        brgemm_cpu0->set_m_block_size(32);

        const auto relu1 = std::make_shared<ov::op::v0::Relu>(brgemm_cpu0);
        const auto softmax = std::make_shared<ov::op::v1::Softmax>(relu1, 3);
        const auto convert2 = std::make_shared<ov::snippets::op::ConvertSaturation>(softmax, ov::element::bf16);

        const auto brgemm_copyb1 = std::make_shared<ov::intel_cpu::BrgemmCopyB>(
            parameter2, ov::element::bf16, ov::intel_cpu::BrgemmCopyB::OnlyRepacking, 0, 0, 0);
        const auto scratch1 = std::make_shared<ov::snippets::op::Buffer>(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE});
        const auto brgemm_cpu1 = std::make_shared<ov::intel_cpu::BrgemmCPU>(
            convert2, brgemm_copyb1->output(0), scratch1, ov::intel_cpu::BrgemmCPU::Type::AMX);
        brgemm_cpu1->set_m_block_size(32);

        const auto relu2 = std::make_shared<ov::op::v0::Relu>(brgemm_cpu1);

        const auto body = std::make_shared<ov::Model>(std::make_shared<ov::op::v0::Result>(relu2), ov::ParameterVector{parameter0, parameter1, parameter2});

        MarkOp(load_reshape, subtensor_scalar);
        MarkOp(store, subtensor_scalar);
        MarkOp(softmax, subtensor_softmax);

        MarkOp(brgemm_cpu0, subtensor_full);
        MarkOp(brgemm_cpu1, subtensor_full);
        MarkOp(brgemm_copyb0, subtensor_full);
        MarkOp(brgemm_copyb1, subtensor_full);
        MarkOp(scratch0, subtensor_full);
        MarkOp(scratch1, subtensor_full);

        return body;
    }
};

TEST_P(MHABF16AMXBufferAllocationTest, BufferAllocationCPU) {
    Validate();
}


namespace BufferAllocationCPUTest_Instances {

INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHABF16AMXNotOptimizedWSplit, MHABF16AMXBufferAllocationTest,
                         ::testing::Combine(
                             ::testing::Values(false),
                             ::testing::Values(true),
                             ::testing::Values(196608),
                             ::testing::Values(11)),
                         BufferAllocationCPUTest::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWSplit, MHABF16AMXBufferAllocationTest,
                         ::testing::Combine(
                             ::testing::Values(true),
                             ::testing::Values(true),
                             ::testing::Values(90112),
                             ::testing::Values(4)),
                         BufferAllocationCPUTest::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWOSplit, MHABF16AMXBufferAllocationTest,
                         ::testing::Combine(
                             ::testing::Values(false),
                             ::testing::Values(false),
                             ::testing::Values(393216),
                             ::testing::Values(11)),
                         BufferAllocationCPUTest::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWOSplit, MHABF16AMXBufferAllocationTest,
                         ::testing::Combine(
                             ::testing::Values(true),
                             ::testing::Values(false),
                             ::testing::Values(114688),
                             ::testing::Values(4)),
                         BufferAllocationCPUTest::getTestCaseName);

} // namespace BufferAllocationCPUTest_Instances
} // namespace snippets
} // namespace test
} // namespace ov
@ -16,7 +16,7 @@
#include "gna_lib_ver_selector.hpp"
#include "gna_mem_requests.hpp"
#include "log/log.hpp"
#include "memory_solver.hpp"
#include "openvino/runtime/memory_solver.hpp"

using namespace ov::intel_gna;

@ -239,7 +239,7 @@ public:
    size_t calcSize(bool isCompact = false) override {
        if (isCompact) {
            _size = 0;
            std::vector<MemorySolver::Box> boxes;
            std::vector<ov::MemorySolver::Box> boxes;
            for (size_t i = 0; i < _mem_requests.size(); ++i) {
                // skipping BIND, cross-region and empty requests
                if (_mem_requests[i]._type & REQUEST_BIND || _mem_requests[i]._ptr_out == nullptr) {
@ -255,12 +255,12 @@ public:
                boxes.push_back({start, stop, static_cast<int64_t>(original_with_pad), static_cast<int64_t>(i)});
            }

            MemorySolver memSolver(boxes);
            ov::MemorySolver memSolver(boxes);
            _size = memSolver.solve();

            // setting offsets
            for (auto const& box : boxes) {
                _mem_requests[box.id]._offset = memSolver.getOffset(static_cast<int>(box.id));
                _mem_requests[box.id]._offset = memSolver.get_offset(static_cast<int>(box.id));
            }
            return _size;
        } else {

@ -22,7 +22,7 @@
#include "gna_mem_requests_queue.hpp"
#include "log/log.hpp"
#include "memory/gna_allocator.hpp"
#include "memory_solver.hpp"
#include "openvino/runtime/memory_solver.hpp"

#ifdef GNA_MEMORY_DUMP
# include <iomanip>