[Snippets][CPU] Added FP32 MHA tokenization support (#14327)
This commit is contained in:
parent
6ec71c376a
commit
6525dd4727
@ -26,9 +26,10 @@ ie_faster_build(${TARGET_NAME}
|
||||
)
|
||||
|
||||
target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime
|
||||
PRIVATE ngraph_reference openvino::runtime::dev)
|
||||
PRIVATE ngraph_reference ov_shape_inference openvino::runtime::dev)
|
||||
|
||||
target_include_directories(${TARGET_NAME} PUBLIC $<BUILD_INTERFACE:${PUBLIC_HEADERS_DIR}>)
|
||||
target_include_directories(${TARGET_NAME} PUBLIC $<BUILD_INTERFACE:${PUBLIC_HEADERS_DIR}>
|
||||
PRIVATE $<BUILD_INTERFACE:${SHAPE_INFER_INCLUDE_DIR}>)
|
||||
|
||||
add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME})
|
||||
|
||||
|
@ -84,7 +84,7 @@ public:
|
||||
* @param f can this kernel be linearided to 1D range
|
||||
* @param p pointer to generated code
|
||||
*/
|
||||
Schedule(const Shape& ws, bool f, code p) : work_size(ws), is_flat(f), ptr(p) {}
|
||||
Schedule(const ov::PartialShape& ws, bool f, code p) : work_size(ws), is_flat(f), ptr(p) {}
|
||||
/**
|
||||
* @brief Returns callable instanse of code pointer
|
||||
*/
|
||||
@ -92,7 +92,7 @@ public:
|
||||
return reinterpret_cast<K>(const_cast<unsigned char*>(ptr));
|
||||
}
|
||||
|
||||
Shape work_size {};
|
||||
ov::PartialShape work_size {};
|
||||
bool is_flat {false};
|
||||
code ptr {nullptr};
|
||||
};
|
||||
@ -112,21 +112,43 @@ public:
|
||||
* @brief Default destructor
|
||||
*/
|
||||
virtual ~Generator() = default;
|
||||
/**
|
||||
* @interface GeneratorConfig
|
||||
* @brief Allows to tweak the lowering process.
|
||||
*/
|
||||
class GeneratorConfig {
|
||||
public:
|
||||
// True if the lowered Emitters need to be accessed during runtime. Normally they're destroyed after code emission.
|
||||
bool m_save_lowered_code = false;
|
||||
// True if we can optimize tails for single evaluation during code generation
|
||||
// More details with optimization examples you can see in generate() method
|
||||
// For example, tails with Buffer ops doesn't support single evaluation optimizations
|
||||
// because of that we should always reset memory pointer using finalization offsets
|
||||
// after data storing to Buffer
|
||||
bool m_optimize_single_evaluation = true;
|
||||
// True if we should check runtime info for nodes to call specific needed transformations
|
||||
bool m_need_fill_tail_register = false;
|
||||
};
|
||||
/**
|
||||
* @brief virtual method any specific implementation should implement
|
||||
* @param m model in canonical for for table-based code generation
|
||||
* @param config config with transformation and optimization parameters
|
||||
* @param compile_params parameters for generated code
|
||||
* @return pointer to generated code
|
||||
*/
|
||||
code generate(std::shared_ptr<ov::Model>& m, const void* compile_params = nullptr) const;
|
||||
code generate(std::shared_ptr<ov::Model>& m, const GeneratorConfig& config, const void* compile_params = nullptr);
|
||||
|
||||
/**
|
||||
* @brief gets target machine
|
||||
* @return pointer to constant target machine
|
||||
*/
|
||||
std::shared_ptr<const TargetMachine> get_target_machine() const { return target; }
|
||||
std::shared_ptr<const TargetMachine> get_target_machine() const;
|
||||
|
||||
protected:
|
||||
std::shared_ptr<TargetMachine> target;
|
||||
// todo: we need to save lowered code to access compiled brgemm kernels on execution time (normally lowered is destructed by then).
|
||||
// This is temporary solution, remove this when kernel caching is implemented. Don't forget to make generate const method.
|
||||
std::vector<AllocatedEmitter> lowered_saved;
|
||||
};
|
||||
|
||||
} // namespace snippets
|
||||
|
47
src/common/snippets/include/snippets/op/brgemm.hpp
Normal file
47
src/common/snippets/include/snippets/op/brgemm.hpp
Normal file
@ -0,0 +1,47 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ngraph/op/op.hpp"
|
||||
#include "ngraph/op/matmul.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
/**
|
||||
* @interface Brgemm
|
||||
* @brief Brgemm is a batch-reduced matrix multiplication with the support of arbitrary strides between matrices rows
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class Brgemm : public ngraph::op::v0::MatMul {
|
||||
public:
|
||||
OPENVINO_OP("Brgemm", "SnippetsOpset", ngraph::op::v0::MatMul);
|
||||
Brgemm(const Output<Node>& A, const Output<Node>& B, const size_t offset_a = 0lu, const size_t offset_b = 0lu, const size_t offset_c = 0lu);
|
||||
Brgemm() = default;
|
||||
|
||||
bool visit_attributes(AttributeVisitor& visitor) override;
|
||||
void validate_and_infer_types() override;
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
|
||||
|
||||
bool has_evaluate() const override { return false; }
|
||||
|
||||
size_t get_offset_a() const { return m_offset_a; }
|
||||
size_t get_offset_b() const { return m_offset_b; }
|
||||
size_t get_offset_c() const { return m_offset_c; }
|
||||
|
||||
void set_offset_a(const size_t offset) { m_offset_a = offset; }
|
||||
void set_offset_b(const size_t offset) { m_offset_b = offset; }
|
||||
void set_offset_c(const size_t offset) { m_offset_c = offset; }
|
||||
|
||||
private:
|
||||
size_t m_offset_a = 0lu; // offset for first input
|
||||
size_t m_offset_b = 0lu; // offset for second input
|
||||
size_t m_offset_c = 0lu; // offset for output
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -21,12 +21,18 @@ class BroadcastLoad : public BroadcastMove {
|
||||
public:
|
||||
OPENVINO_OP("BroadcastLoad", "SnippetsOpset", ngraph::snippets::op::BroadcastMove);
|
||||
|
||||
BroadcastLoad(const Output<Node>& x, Shape output_shape);
|
||||
BroadcastLoad(const Output<Node>& x, ov::PartialShape output_shape, size_t offset = 0lu);
|
||||
BroadcastLoad() = default;
|
||||
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
|
||||
size_t get_offset() const { return m_offset; }
|
||||
void set_offset(const size_t offset) { m_offset = offset; }
|
||||
|
||||
bool visit_attributes(AttributeVisitor& visitor) override;
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
|
||||
void validate_and_infer_types() override;
|
||||
|
||||
private:
|
||||
size_t m_offset = 0lu;
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
|
@ -19,7 +19,7 @@ class BroadcastMove : public ngraph::op::Op {
|
||||
public:
|
||||
OPENVINO_OP("BroadcastMove", "SnippetsOpset");
|
||||
|
||||
BroadcastMove(const Output<Node>& x, Shape output_shape);
|
||||
BroadcastMove(const Output<Node>& x, ov::PartialShape output_shape);
|
||||
BroadcastMove() = default;
|
||||
|
||||
bool visit_attributes(AttributeVisitor& visitor) override;
|
||||
@ -28,12 +28,9 @@ public:
|
||||
|
||||
void validate_and_infer_types() override;
|
||||
|
||||
OPENVINO_SUPPRESS_DEPRECATED_START
|
||||
bool evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const override;
|
||||
OPENVINO_SUPPRESS_DEPRECATED_END
|
||||
|
||||
protected:
|
||||
Shape output_shape;
|
||||
ov::PartialShape output_shape;
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
|
47
src/common/snippets/include/snippets/op/buffer.hpp
Normal file
47
src/common/snippets/include/snippets/op/buffer.hpp
Normal file
@ -0,0 +1,47 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/op/op.hpp>
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
/**
|
||||
* @interface Buffer
|
||||
* @brief The operation is for intermediate data storage
|
||||
* - m_allocation_rank - rank of shape for memory allocation: shape[shape_rank - normalize(m_allocation_rank) : shape_rank].
|
||||
* It's needed to allocate needed memory size that depends on Tile rank, for example.
|
||||
* Default value is -1 (full shape)
|
||||
* Notes:
|
||||
* - All buffers in a graph have the same memory pointer. So if we have a few buffers,
|
||||
* each the corresponding MemoryAccess op for Buffer should have offset for common memory pointer of this Buffer
|
||||
* - Buffer should be a single consumer for operation output port
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class Buffer : public ngraph::op::Op {
|
||||
public:
|
||||
OPENVINO_OP("Buffer", "SnippetsOpset");
|
||||
|
||||
Buffer(const Output<Node>& x, const int32_t allocation_rank = -1);
|
||||
Buffer() = default;
|
||||
|
||||
int32_t get_allocation_rank() const { return m_allocation_rank; }
|
||||
void set_allocation_rank(int32_t rank) { m_allocation_rank = rank; }
|
||||
|
||||
size_t get_byte_size() const;
|
||||
|
||||
bool visit_attributes(AttributeVisitor& visitor) override;
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
|
||||
void validate_and_infer_types() override;
|
||||
|
||||
private:
|
||||
int32_t m_allocation_rank = -1;
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
47
src/common/snippets/include/snippets/op/fill.hpp
Normal file
47
src/common/snippets/include/snippets/op/fill.hpp
Normal file
@ -0,0 +1,47 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/op/op.hpp>
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
/**
|
||||
* @interface Fill
|
||||
* @brief Generated in Tail Loop vector representation in code generation step for cases when we should
|
||||
* refill registers by special values.
|
||||
* For example, for cases with ReduceMax or ReduceSum in Softmax
|
||||
* Where:
|
||||
* - offset - starting element index where filling is performed while beginning of input data is untouched
|
||||
* - fill_value - hexadecimal filling value
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class Fill : public ngraph::op::Op {
|
||||
public:
|
||||
OPENVINO_OP("Fill", "SnippetsOpset");
|
||||
|
||||
Fill(const Output<Node>& x, const size_t offset, const uint32_t fill_value = 0x0);
|
||||
Fill() = default;
|
||||
|
||||
size_t get_offset() const { return m_offset; }
|
||||
uint32_t get_fill_value() const { return m_fill_value; }
|
||||
|
||||
void set_offset(const size_t offset) { m_offset = offset; }
|
||||
void set_fill_value(const uint32_t fill_value) { m_fill_value = fill_value; }
|
||||
|
||||
bool visit_attributes(AttributeVisitor& visitor) override;
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
|
||||
void validate_and_infer_types() override;
|
||||
|
||||
protected:
|
||||
size_t m_offset = 0lu;
|
||||
uint32_t m_fill_value = 0x0;
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
32
src/common/snippets/include/snippets/op/horizon_max.hpp
Normal file
32
src/common/snippets/include/snippets/op/horizon_max.hpp
Normal file
@ -0,0 +1,32 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ngraph/op/op.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
/**
|
||||
* @interface HorizonMax
|
||||
* @brief The operation calculates a horizon maximum of a vector register
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class HorizonMax : public ngraph::op::Op {
|
||||
public:
|
||||
OPENVINO_OP("HorizonMax", "SnippetsOpset");
|
||||
|
||||
HorizonMax(const Output<Node>& x);
|
||||
HorizonMax() = default;
|
||||
|
||||
bool visit_attributes(AttributeVisitor& visitor) override { return true;}
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
|
||||
void validate_and_infer_types() override;
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
32
src/common/snippets/include/snippets/op/horizon_sum.hpp
Normal file
32
src/common/snippets/include/snippets/op/horizon_sum.hpp
Normal file
@ -0,0 +1,32 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ngraph/op/op.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
/**
|
||||
* @interface HorizonSum
|
||||
* @brief The operation calculates a horizon sum of a vector register
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class HorizonSum : public ngraph::op::Op {
|
||||
public:
|
||||
OPENVINO_OP("HorizonSum", "SnippetsOpset");
|
||||
|
||||
HorizonSum(const Output<Node>& x);
|
||||
HorizonSum() = default;
|
||||
|
||||
bool visit_attributes(AttributeVisitor& visitor) override { return true;}
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
|
||||
void validate_and_infer_types() override;
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -20,13 +20,14 @@ class Kernel : public ngraph::op::Op {
|
||||
public:
|
||||
OPENVINO_OP("Kernel", "SnippetsOpset");
|
||||
|
||||
Kernel(const std::vector<std::pair<std::shared_ptr<ngraph::snippets::Emitter>, ngraph::snippets::RegInfo>>& region);
|
||||
Kernel(std::vector<AllocatedEmitter> region, std::shared_ptr<const ov::Model> m);
|
||||
Kernel() = default;
|
||||
|
||||
std::vector<std::pair<std::shared_ptr<ngraph::snippets::Emitter>, ngraph::snippets::RegInfo>> region;
|
||||
std::vector<AllocatedEmitter> region;
|
||||
const std::shared_ptr<const ov::Model> model;
|
||||
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override {
|
||||
return std::make_shared<Kernel>(region);
|
||||
return std::make_shared<Kernel>(region, model);
|
||||
}
|
||||
const void *compile_params = nullptr;
|
||||
};
|
||||
|
@ -5,6 +5,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/op/op.hpp>
|
||||
#include "snippets/op/memory_access.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
@ -12,36 +13,41 @@ namespace op {
|
||||
|
||||
/**
|
||||
* @interface Load
|
||||
* @brief Generated by Canonicalization step where explicit instructions should be emitted for data loading
|
||||
* where number of elements to load is determined by "count"
|
||||
* Default value is "1" - to load one element
|
||||
* @brief Generated during Lowering stage (convert_to_snippets_dialect) where explicit instructions should be emitted for data loading
|
||||
* where number of elements to load is determined by "count" (Default value is "1" - to load one element)
|
||||
* and memory offset for loading is determined by "offset" (Default value is "0" - to load starting from the first element)
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class Load : public ngraph::op::Op {
|
||||
class Load : public MemoryAccess {
|
||||
public:
|
||||
OPENVINO_OP("Load", "SnippetsOpset");
|
||||
|
||||
Load(const Output<Node>& x, const size_t count = 1lu);
|
||||
Load(const Output<Node>& x, const size_t count = 1lu, const size_t offset = 0lu);
|
||||
Load() = default;
|
||||
|
||||
size_t get_count() const { return m_count; }
|
||||
|
||||
void set_count(const size_t count) { m_count = count; }
|
||||
|
||||
bool visit_attributes(AttributeVisitor& visitor) override;
|
||||
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
|
||||
|
||||
void validate_and_infer_types() override;
|
||||
|
||||
OPENVINO_SUPPRESS_DEPRECATED_START
|
||||
bool evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const override;
|
||||
OPENVINO_SUPPRESS_DEPRECATED_END
|
||||
|
||||
protected:
|
||||
size_t m_count = 0lu;
|
||||
};
|
||||
|
||||
/**
|
||||
* @interface LoadReshape
|
||||
* @brief It's just Load operation (and it's mapped on LoadEmitter during code generation) that allows to tweak
|
||||
* shape propagation. We need it to keep correct shape propagation when Transpose is decomposed to
|
||||
* Load and Store. This is a temporary solution until tokenization of Reshape operation is supported.
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class LoadReshape : public Load {
|
||||
public:
|
||||
OPENVINO_OP("LoadReshape", "SnippetsOpset", Load);
|
||||
LoadReshape(const Output<Node>& x, size_t count = 1lu, const size_t offset = 0lu, std::vector<size_t> order = {});
|
||||
LoadReshape() = default;
|
||||
|
||||
bool visit_attributes(AttributeVisitor& visitor) override;
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
|
||||
void validate_and_infer_types() override;
|
||||
|
||||
private:
|
||||
std::vector<size_t> m_order;
|
||||
};
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
||||
|
111
src/common/snippets/include/snippets/op/loop.hpp
Normal file
111
src/common/snippets/include/snippets/op/loop.hpp
Normal file
@ -0,0 +1,111 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ngraph/op/op.hpp"
|
||||
#include "snippets/emitter.hpp"
|
||||
#include "ngraph/op/parameter.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
/**
|
||||
* @interface LoopBase
|
||||
* @brief Base class for LoopBegin and LoopEnd
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class LoopBase : public ngraph::op::Op {
|
||||
public:
|
||||
OPENVINO_OP("LoopBase", "SnippetsOpset");
|
||||
LoopBase(const std::vector<Output<Node>>& args, size_t work_amount, size_t increment);
|
||||
LoopBase() = default;
|
||||
bool visit_attributes(AttributeVisitor& visitor) override;
|
||||
size_t get_work_amount() const;
|
||||
size_t get_increment() const;
|
||||
bool get_evaluate_once() const;
|
||||
|
||||
protected:
|
||||
size_t work_amount;
|
||||
size_t work_amount_increment;
|
||||
bool evaluate_once; // true if the Loop is executed only once, used to skip setting and testing the loop counter
|
||||
};
|
||||
class LoopEnd;
|
||||
/**
|
||||
* @interface LoopBegin
|
||||
* @brief Marks the start of the Loop region.
|
||||
* Number of outputs always equals to the number of inputs (bypassed values) + 1 (edge to the corresponding LoopEnd)
|
||||
* @param args - vector of input values, they are passed directly to output.
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class LoopBegin : public LoopBase {
|
||||
friend LoopEnd;
|
||||
|
||||
public:
|
||||
OPENVINO_OP("LoopBegin", "SnippetsOpset", LoopBase);
|
||||
explicit LoopBegin(const OutputVector& args);
|
||||
LoopBegin() = default;
|
||||
void validate_and_infer_types() override;
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;
|
||||
std::shared_ptr<LoopEnd> get_loop_end();
|
||||
// begin_address and input_regs are needed to communicate information between LoopBegin and LoopEnd emitters
|
||||
const uint8_t* begin_address;
|
||||
std::vector<size_t> input_regs;
|
||||
|
||||
private:
|
||||
void validate_and_infer_types_except_LoopEnd();
|
||||
LoopBegin(const std::vector<Output<Node>>& args, size_t work_amount, size_t work_amount_increment);
|
||||
};
|
||||
|
||||
/**
|
||||
* @interface LoopEnd
|
||||
* @brief Marks the end of the Loop region and defines the loop properties.
|
||||
* Number of outputs always equals to the number of inputs (bypassed values) - 1 (edge to the corresponding LoopEnd)
|
||||
* @param args vector of input values + LoopBegin, all values except for the LoopBegin are passed directly to output.
|
||||
* @param work_amount total number of evaluations to be processed by the loop
|
||||
* @param increment number of evaluations processed in one iteration of the loop.
|
||||
* @param apply_increment describes which data pointers attributed to the loop should be incremented on every iteration.
|
||||
* should be used when Loop is connected to Parameters and/or Results. If apply_increment[i] == true then i-th i/o data
|
||||
* pointer will be incremented by work_amount*data_size on every iteration.
|
||||
* @param ptr_increments specifies i/o pointer increment performed on every iteration. This is an alternative to
|
||||
* apply_increments, which enables more flexibility.
|
||||
* @param finalization_offsets pointer increments that are be applied to i/o pointers before exiting the loop
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class LoopEnd : public LoopBase {
|
||||
public:
|
||||
OPENVINO_OP("LoopEnd", "SnippetsOpset", LoopBase);
|
||||
LoopEnd(const std::vector<Output<Node>>& args, size_t work_amount, size_t work_amount_increment,
|
||||
std::vector<bool> apply_increment, std::vector<int64_t> finalization_offsets);
|
||||
LoopEnd(const std::vector<Output<Node>>& args, size_t work_amount, size_t work_amount_increment,
|
||||
std::vector<int64_t> ptr_increments, std::vector<int64_t> finalization_offsets);
|
||||
LoopEnd() = default;
|
||||
std::shared_ptr<LoopBegin> get_loop_begin();
|
||||
void validate_and_infer_types() override;
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override;
|
||||
const std::vector<int64_t>& get_finalization_offsets() const;
|
||||
const std::vector<int64_t>& get_ptr_increments() const;
|
||||
void set_finalization_offsets(std::vector<int64_t> offsets);
|
||||
void set_ptr_increments(std::vector<int64_t> new_ptr_increments);
|
||||
// update_ptr_increments resets non-zero increments to the new_increments. It's used when work_amount_increment is
|
||||
// updated and we need to refresh ptr increments accordingly while respecting the broadcasting pattern
|
||||
void update_ptr_increments(int64_t new_increment);
|
||||
void set_work_amount(size_t new_work_amount);
|
||||
void set_increment(size_t new_increment);
|
||||
void set_evaluate_once(bool once);
|
||||
// Used to propagate information about Loop structure, needed to simplify some optimizations. For example,
|
||||
// to skip pointer increments when outer Loop is empty, and work_amount == vector_size (one inner vector Loop)
|
||||
// true by default, the optimizations enabled if it's false;
|
||||
bool has_outer_loop;
|
||||
|
||||
private:
|
||||
std::vector<int64_t> ptr_increments;
|
||||
std::vector<int64_t> finalization_offsets;
|
||||
size_t loop_io_size;
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
42
src/common/snippets/include/snippets/op/memory_access.hpp
Normal file
42
src/common/snippets/include/snippets/op/memory_access.hpp
Normal file
@ -0,0 +1,42 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/op/op.hpp>
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
/**
|
||||
* @interface MemoryAccess
|
||||
* @brief This is a base class for memory access operations (like Load and Store).
|
||||
* It provides universal set/get interface to manipulate the number
|
||||
* of elements accessed during one operation call ("count").
|
||||
* Default "count" value is "1" - it means to load/store one element
|
||||
* @ingroup snippets
|
||||
*/
|
||||
|
||||
class MemoryAccess : public ngraph::op::Op {
|
||||
public:
|
||||
OPENVINO_OP("MemoryAccess", "SnippetsOpset");
|
||||
|
||||
size_t get_count() const;
|
||||
size_t get_offset() const;
|
||||
void set_count(const size_t count);
|
||||
void set_offset(const size_t offset);
|
||||
bool visit_attributes(AttributeVisitor& visitor) override;
|
||||
void validate_and_infer_types() override;
|
||||
|
||||
protected:
|
||||
explicit MemoryAccess(const Output<Node>& x, size_t count = 1lu, size_t offset = 0lu);
|
||||
MemoryAccess() = default;
|
||||
size_t m_count = 0lu;
|
||||
size_t m_offset = 0lu;
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -20,7 +20,6 @@ namespace op {
|
||||
class PowerStatic : public ov::op::util::UnaryElementwiseArithmetic {
|
||||
public:
|
||||
OPENVINO_OP("PowerStatic", "SnippetsOpset", ov::op::util::UnaryElementwiseArithmetic);
|
||||
BWDCMP_RTTI_DECLARATION;
|
||||
|
||||
PowerStatic() = default;
|
||||
PowerStatic(const Output <Node> &arg, float power) : UnaryElementwiseArithmetic(arg), power(power) {
|
||||
|
@ -19,7 +19,6 @@ namespace op {
|
||||
class Scalar : public ov::op::v0::Constant {
|
||||
public:
|
||||
OPENVINO_OP("Scalar", "SnippetsOpset", ov::op::v0::Constant);
|
||||
BWDCMP_RTTI_DECLARATION;
|
||||
|
||||
Scalar() = default;
|
||||
|
||||
@ -37,6 +36,7 @@ public:
|
||||
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
|
||||
void validate_and_infer_types() override;
|
||||
bool visit_attributes(AttributeVisitor& visitor) override;
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
|
@ -5,6 +5,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/op/op.hpp>
|
||||
#include "snippets/op/memory_access.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
@ -12,34 +13,19 @@ namespace op {
|
||||
|
||||
/**
|
||||
* @interface Store
|
||||
* @brief Generated by Canonicalization step where explicit instructions should be emitted for data storing
|
||||
* where number of elements to store is determined by "count"
|
||||
* Default value is "1" - to store one element
|
||||
* @brief Generated during Lowering stage (convert_to_snippets_dialect) where explicit instructions should be emitted for data storing
|
||||
* where number of elements to store is determined by "count" (Default value is "1" - to store one element)
|
||||
* and memory offset for storing is determined by "offset" (Default value is "0" - to store starting at start memory ptr)
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class Store : public ngraph::op::Op {
|
||||
class Store : public MemoryAccess {
|
||||
public:
|
||||
OPENVINO_OP("Store", "SnippetsOpset");
|
||||
|
||||
Store(const Output<Node>& x, const size_t count = 1lu);
|
||||
Store(const Output<Node>& x, const size_t count = 1lu, const size_t offset = 0lu);
|
||||
Store() = default;
|
||||
|
||||
size_t get_count() const { return m_count; }
|
||||
|
||||
void set_count(const size_t count) { m_count = count; }
|
||||
|
||||
bool visit_attributes(AttributeVisitor& visitor) override;
|
||||
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
|
||||
|
||||
void validate_and_infer_types() override;
|
||||
|
||||
OPENVINO_SUPPRESS_DEPRECATED_START
|
||||
bool evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const override;
|
||||
OPENVINO_SUPPRESS_DEPRECATED_END
|
||||
|
||||
protected:
|
||||
size_t m_count = 0lu;
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
|
@ -26,7 +26,7 @@ namespace op {
|
||||
class Subgraph : public ov::op::util::SubGraphOp {
|
||||
public:
|
||||
OPENVINO_OP("Subgraph", "SnippetsOpset", ov::op::util::SubGraphOp);
|
||||
BWDCMP_RTTI_DECLARATION;
|
||||
enum {DYNAMIC_DIMENSION = 0xffffffffffffffff};
|
||||
|
||||
// < 1, 42, 17, 15, 16> < 0, 1, 2, 3, 1>
|
||||
// should be:
|
||||
@ -69,7 +69,7 @@ public:
|
||||
//
|
||||
// D = < 1, 3, 17, 15, 32> < 0, 1, 2, 3, 4>
|
||||
// E = < 1, 3, 17, 1, 32> < 0, 1, 2, 3, 4>
|
||||
using BlockedShape = std::tuple<ngraph::Shape, ngraph::AxisVector, ngraph::element::Type>;
|
||||
using BlockedShape = std::tuple<ngraph::PartialShape, ngraph::AxisVector, ngraph::element::Type>;
|
||||
using BlockedShapeVector = std::vector<BlockedShape>;
|
||||
|
||||
Subgraph() = default;
|
||||
@ -86,80 +86,82 @@ public:
|
||||
|
||||
// we introduce this method instead of using SubGraphOp::get_function()
|
||||
// to align naming with other methods
|
||||
const std::shared_ptr<ov::Model> & body_ptr() const {
|
||||
return m_bodies[0];
|
||||
}
|
||||
const std::shared_ptr<ov::Model>& body_ptr() const { return m_bodies[0]; }
|
||||
std::shared_ptr<ov::Model>& body_ptr() { return m_bodies[0]; }
|
||||
|
||||
std::shared_ptr<ov::Model> & body_ptr() {
|
||||
return m_bodies[0];
|
||||
}
|
||||
const ov::Model& body() const { return *m_bodies[0]; }
|
||||
ov::Model& body() { return *m_bodies[0]; }
|
||||
|
||||
const ov::Model & body() const {
|
||||
return *m_bodies[0];
|
||||
}
|
||||
const std::shared_ptr<ngraph::snippets::Generator>& get_generator() const { return m_generator; }
|
||||
std::shared_ptr<ngraph::snippets::Generator> & get_generator() { return m_generator; }
|
||||
|
||||
ov::Model & body() {
|
||||
return *m_bodies[0];
|
||||
}
|
||||
|
||||
const std::shared_ptr<ngraph::snippets::Generator> & get_generator() const {
|
||||
return m_generator;
|
||||
}
|
||||
|
||||
std::shared_ptr<ngraph::snippets::Generator> & get_generator() {
|
||||
return m_generator;
|
||||
}
|
||||
|
||||
size_t get_non_scalar_constants_count() const {
|
||||
return m_non_scalar_constants_count;
|
||||
}
|
||||
|
||||
bool is_quantized() const {
|
||||
return config.m_is_quantized;
|
||||
}
|
||||
|
||||
bool has_type_relaxed_ops() const {
|
||||
return config.m_has_type_relaxed_ops;
|
||||
}
|
||||
size_t get_buffer_scratchpad_size() const { return m_buffer_scratchpad; }
|
||||
size_t get_virtual_port_count() const { return m_virtual_port_count; }
|
||||
bool is_buffer_needed() const { return m_buffer_needed; }
|
||||
bool is_quantized() const { return config.m_is_quantized; }
|
||||
bool has_type_relaxed_ops() const { return config.m_has_type_relaxed_ops; }
|
||||
bool has_domain_sensitive_ops() const { return config.m_has_domain_sensitive_ops; }
|
||||
|
||||
snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, ngraph::pass::Manager& opt,
|
||||
const void* compile_params = nullptr);
|
||||
snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, const void* compile_params = nullptr);
|
||||
snippets::Schedule generate(ngraph::pass::Manager &opt, const void* compile_params = nullptr);
|
||||
snippets::Schedule generate(const void* compile_params = nullptr);
|
||||
Shape canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes);
|
||||
ov::PartialShape canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes);
|
||||
std::vector<PartialShape> reshape_body(const std::vector<PartialShape>& input_shapes);
|
||||
std::vector<Shape> reshape_body(const std::vector<Shape>& input_shapes);
|
||||
|
||||
// plugin sets generator for a snippet to some specific generator.
|
||||
// it's going to be replaced with Jitters table later
|
||||
void set_generator(std::shared_ptr<ngraph::snippets::Generator> generator);
|
||||
void set_non_scalar_constants_count(const size_t count);
|
||||
void set_tile_rank(size_t newRank) {tileRank = newRank;}
|
||||
void set_virtual_port_count(const size_t count);
|
||||
void set_buffer_needed(const bool need);
|
||||
|
||||
void print() const;
|
||||
void print_statistics(bool verbose);
|
||||
|
||||
void serialize() const;
|
||||
void set_master_shape(ov::PartialShape new_shape) {master_shape = std::move(new_shape);}
|
||||
|
||||
static auto wrap_node_as_subgraph(const std::shared_ptr<ngraph::Node>& node) -> std::shared_ptr<Subgraph>;
|
||||
static void fill_empty_output_names(const Output<Node>& target_output_node, const Output<Node>& replacement_output_node);
|
||||
|
||||
// Non-scalar Constants are tokenized as Parameters inside Subgraph body but some operations with constant inputs
|
||||
// should have explicit Constants even if they're non-scalar (Reshape, Transpose, Broadcast)
|
||||
// This check returns True if Constant op which is input of this op should be inside Subgraph body
|
||||
static auto constant_input_should_be_inside_body(const std::shared_ptr<ov::Node>& node) -> bool;
|
||||
|
||||
private:
|
||||
void align_element_types(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes);
|
||||
void convert_to_snippet_dialect();
|
||||
|
||||
// Count of potentional non-scalar Consants that will be created after some tranformations
|
||||
// At the moment it's relevant only for FakeQuantize decomposition
|
||||
// NOTE: To avoid overheads in each calcution of this count (for example, in validate_and_type_infer()),
|
||||
void init_config();
|
||||
void initialize_buffer_scratchpad_size();
|
||||
// Count of Subgraph virtual ports:
|
||||
// - Potential non-scalar Constants that will be created after some transformations (At the moment it's relevant only for FakeQuantize decomposition)
|
||||
// Need Buffer op or not
|
||||
// - Buffers. All Buffers are considered as one common additional virtual port. So we cannot summarize them as potential non-scalar Constants
|
||||
// NOTE: To avoid overheads in each calculation of this count (for example, in validate_and_type_infer()),
|
||||
// we should MANUALLY calculate it where it needed.
|
||||
size_t m_non_scalar_constants_count = 0;
|
||||
size_t m_virtual_port_count = 0;
|
||||
bool m_buffer_needed = false;
|
||||
size_t m_buffer_scratchpad = 0lu;
|
||||
Shape exec_domain = {};
|
||||
std::shared_ptr<ngraph::snippets::Generator> m_generator = nullptr;
|
||||
|
||||
// TODO: Change logic of insert Converts. This exec element type can be different for plugins
|
||||
const ov::element::Type execution_element_type = ov::element::f32;
|
||||
|
||||
// Config to know which transformations should be called.
|
||||
// It helps to avoid overheads of extra transformation calls
|
||||
struct {
|
||||
ov::PartialShape master_shape;
|
||||
size_t tileRank = 0; // set by plugin to specify the number of dimensions processed in a single kernel call
|
||||
|
||||
/**
|
||||
* @interface SubgraphConfig
|
||||
* @brief Config to optimize IR transformation pipeline. It indicates which transformations are necessary
|
||||
* so the irrelevant ones could be skipped.
|
||||
*/
|
||||
class SubgraphConfig {
|
||||
public:
|
||||
// True if Subgraph contains FakeQuantize -> FQ decomposition should be called
|
||||
bool m_is_quantized = false;
|
||||
// True if we should align element types indise body
|
||||
@ -167,6 +169,12 @@ private:
|
||||
// True if Subgraph contains TypeRelaxed nodes -> for several streams in tp mode we should copy body using mutexes
|
||||
// because TypeRelaxed::copy_with_new_inputs() isn't save-thread method
|
||||
bool m_has_type_relaxed_ops = false;
|
||||
// True if body has operations that don't support plugin-side domain optimizations
|
||||
// (e.g. Transpose, Softmax, MatMul in general doesn't support dimensions collapsing)
|
||||
bool m_has_domain_sensitive_ops = false;
|
||||
// True if we should go through whole body to check for where loops should be explicitly inserted.
|
||||
// Otherwise, we insert Loops on Parameters and Results - for example, it's optimized out for subgraph with only Eltwise ops
|
||||
bool m_explicit_loop_insertion = false;
|
||||
} config;
|
||||
};
|
||||
|
||||
@ -190,6 +198,24 @@ static inline auto build_subgraph(const std::shared_ptr<ngraph::Node>& node, con
|
||||
return subgraph;
|
||||
};
|
||||
|
||||
// Need to update tensor name manually, since intel_cpu::Graph::Replicate() looks at input.get_tensor().get_name();
|
||||
// If subgraph->get_output_size() == 1, then the name will be restored correctly from the node name
|
||||
auto inline update_out_tensor_name(const std::shared_ptr<ngraph::snippets::op::Subgraph>& subgraph) -> void {
|
||||
bool not_set = true;
|
||||
for (unsigned int i = 0; i < subgraph->get_output_size() && not_set; i++) {
|
||||
for (const auto &in : subgraph->get_output_target_inputs(i)) {
|
||||
if (ov::is_type<ov::op::v0::Result>(in.get_node())) {
|
||||
const auto& body_result = subgraph->body_ptr()->get_output_op(i);
|
||||
const auto& body_result_input = body_result->get_input_source_output(0);
|
||||
ngraph::snippets::op::Subgraph::fill_empty_output_names(
|
||||
subgraph->output(i), body_result_input);
|
||||
not_set = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
||||
|
@ -1,48 +0,0 @@
|
||||
// Copyright (C) 2018-2023 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ngraph/op/op.hpp"
|
||||
#include "snippets/emitter.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
/**
|
||||
* @interface Tile
|
||||
* @brief Generated by Canonicalization and represents Loop in affine notation
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class Tile : public ngraph::op::Op {
|
||||
public:
|
||||
OPENVINO_OP("Tile", "SnippetsOpset");
|
||||
|
||||
/// \brief Construct an Tile
|
||||
/// \param region The vector of pairs: emitters and the corresponding registers
|
||||
/// \param increment Tile size - count of elements to load and store.
|
||||
/// Vector Tile should have size of vector register and Scalar Tile should have 1
|
||||
/// \param num_inputs Count of inputs
|
||||
/// \param num_outputs Count of outputs
|
||||
/// \param io_dims Vector of last dimensions of inputs and outputs
|
||||
/// \param io_data_sizes Vector of data type sizes of inputs and outputs
|
||||
Tile(const std::vector<AllocatedEmitter>& region, size_t increment, size_t num_inputs, size_t num_outputs,
|
||||
const std::vector<size_t>& io_dims, const std::vector<size_t>& io_data_sizes);
|
||||
Tile() = default;
|
||||
std::vector<AllocatedEmitter> region;
|
||||
size_t increment = 0;
|
||||
size_t num_inputs = 0;
|
||||
size_t num_outputs = 0;
|
||||
std::vector<size_t> io_dims {};
|
||||
std::vector<size_t> io_data_size {};
|
||||
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override {
|
||||
return std::make_shared<Tile>(region, increment, num_inputs, num_outputs, io_dims, io_data_size);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -1,39 +0,0 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ngraph/op/op.hpp"
|
||||
#include "snippets/emitter.hpp"
|
||||
#include "tile.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
/**
|
||||
* @interface TileScheduler
|
||||
* @brief Contains a set of Tiles (currently one vector and one scalar) and performs necessary preparations
|
||||
* before the Tiles could be executed: calculates offsets, sets proper work amounts, decrement pointers if the same data
|
||||
* have to be read several times (broadcasting).
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class TileScheduler : public ngraph::op::Op {
|
||||
public:
|
||||
OPENVINO_OP("TileScheduler", "SnippetsOpset");
|
||||
|
||||
TileScheduler(const AllocatedEmitter& vector_region, const AllocatedEmitter& scalar_region);
|
||||
TileScheduler() = default;
|
||||
AllocatedEmitter vector_region;
|
||||
AllocatedEmitter scalar_region;
|
||||
// todo: this clone_with_new_inputs is irrelevant
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& inputs) const override {
|
||||
return std::make_shared<TileScheduler>(vector_region, scalar_region);
|
||||
}
|
||||
const void *compile_params;
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
34
src/common/snippets/include/snippets/op/vector_buffer.hpp
Normal file
34
src/common/snippets/include/snippets/op/vector_buffer.hpp
Normal file
@ -0,0 +1,34 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/op/op.hpp>
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
/**
|
||||
* @interface VectorBuffer
|
||||
* @brief The operation is for intermediate data storage in vector register
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class VectorBuffer : public ngraph::op::Op {
|
||||
public:
|
||||
OPENVINO_OP("VectorBuffer", "SnippetsOpset");
|
||||
|
||||
VectorBuffer(const ov::element::Type element_type = ov::element::f32);
|
||||
|
||||
bool visit_attributes(AttributeVisitor& visitor) override { return true;}
|
||||
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
|
||||
void validate_and_infer_types() override;
|
||||
|
||||
private:
|
||||
ov::element::Type m_element_type;
|
||||
};
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -0,0 +1,28 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/pass/graph_rewrite.hpp>
|
||||
#include <ngraph/pattern/matcher.hpp>
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace pass {
|
||||
|
||||
/**
|
||||
* @interface BroadcastToMoveBroadcast
|
||||
* @brief Inserts explicit MoveBroadcast instruction if broadcasting by most varying dimension is needed instead of Broadcast.
|
||||
* Otherwise the pass removes Broadcast operation.
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class BroadcastToMoveBroadcast: public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
BroadcastToMoveBroadcast();
|
||||
};
|
||||
|
||||
|
||||
} // namespace pass
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -12,28 +12,6 @@
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace pass {
|
||||
/*
|
||||
NotSet - default value returned by GetSnippetsNodeType(...) if the node wasn't marked
|
||||
SkippedByPlugin - indicate that snippets can't include this node in subgraph. Can be set by Plugin via SetSnippetsNodeType(...).
|
||||
*/
|
||||
enum class SnippetsNodeType : int64_t {NotSet, SkippedByPlugin};
|
||||
void SetSnippetsNodeType(const std::shared_ptr<Node>&, SnippetsNodeType);
|
||||
SnippetsNodeType GetSnippetsNodeType(const std::shared_ptr<const Node>&);
|
||||
void SetTopologicalOrder(const std::shared_ptr<Node>&, int64_t);
|
||||
int64_t GetTopologicalOrder(const std::shared_ptr<const Node>&);
|
||||
bool AppropriateForSubgraph(const std::shared_ptr<const Node>&);
|
||||
|
||||
/**
|
||||
* @interface EnumerateNodes
|
||||
* @brief Snippets rely on topological order to avoid creating cyclic dependencies. This transformation sets the order.
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class EnumerateNodes : public ov::pass::ModelPass {
|
||||
public:
|
||||
OPENVINO_RTTI("EnumerateNodes", "0");
|
||||
EnumerateNodes() : ModelPass() {}
|
||||
bool run_on_model(const std::shared_ptr<ov::Model>&) override;
|
||||
};
|
||||
|
||||
/**
|
||||
* @interface TokenizeSnippets
|
||||
@ -61,6 +39,10 @@ class TokenizeSnippets: public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
OPENVINO_RTTI("TokenizeSnippets", "0");
|
||||
explicit TokenizeSnippets();
|
||||
|
||||
static bool AppropriateForSubgraph(const std::shared_ptr<const Node>&);
|
||||
|
||||
static const std::set<ngraph::element::Type> supported_element_types;
|
||||
};
|
||||
|
||||
} // namespace pass
|
||||
|
@ -0,0 +1,32 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/pass/graph_rewrite.hpp>
|
||||
#include <ngraph/pattern/matcher.hpp>
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace pass {
|
||||
|
||||
/**
|
||||
* @interface ExplicitTransposeMatMulInputs
|
||||
* @brief At the moment Snippets supports Transpose only with order {0, 2, 3, 1},
|
||||
* so if there is pattern in graph:
|
||||
* in0 Transpose{0, 2, 1, 3}
|
||||
* \ /
|
||||
* MatMul[false, true]
|
||||
* We can set false in MatMul parameter `transposed_b` and
|
||||
* change Transpose order to {0, 2, 3, 1} which is supported by Snippets
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class ExplicitTransposeMatMulInputs: public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
ExplicitTransposeMatMulInputs();
|
||||
};
|
||||
|
||||
} // namespace pass
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -0,0 +1,30 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ngraph/pass/graph_rewrite.hpp"
|
||||
#include "ngraph/pattern/matcher.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace pass {
|
||||
|
||||
/**
|
||||
* @interface FuseTransposeBrgemm
|
||||
* @brief Fuses Transpose with Brgemm node, fusing on both Brgemm inputs and output is supported. Applicable to
|
||||
* Transposes that don't change the position of the last dimension (since Brgemm supports strided rows i/o),
|
||||
* but only 0213 Transpose is currently supported.
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class FuseTransposeBrgemm: public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
OPENVINO_RTTI("FuseTransposeBrgemm", "0");
|
||||
FuseTransposeBrgemm();
|
||||
static const std::set<std::vector<int>> supported_cases;
|
||||
};
|
||||
|
||||
} // namespace pass
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
30
src/common/snippets/include/snippets/pass/insert_buffer.hpp
Normal file
30
src/common/snippets/include/snippets/pass/insert_buffer.hpp
Normal file
@ -0,0 +1,30 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/pass/graph_rewrite.hpp>
|
||||
#include <ngraph/pattern/matcher.hpp>
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace pass {
|
||||
|
||||
/**
|
||||
* @interface InsertBuffer
|
||||
* @brief The pass inserts Buffers on Inputs and Outputs of special operations [Softmax, Transpose] is it's needed
|
||||
* @param allocation_rank - rank of shape for Buffer memory allocation: shape[shape_rank - normalize(m_allocation_rank) : shape_rank].
|
||||
* It's needed to allocate needed memory size that depends on Tile rank, for example.
|
||||
* Default value is -1 (full shape)
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class InsertBuffer: public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
InsertBuffer(const int32_t allocation_rank = -1);
|
||||
};
|
||||
|
||||
|
||||
} // namespace pass
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -13,7 +13,7 @@ namespace pass {
|
||||
|
||||
/**
|
||||
* @interface InsertLoad
|
||||
* @brief Inserts explicit load instruction after each parameter.
|
||||
* @brief Inserts explicit load instruction after each parameter and buffer.
|
||||
* The pass is used to convert model to a canonical form for code generation
|
||||
* @ingroup snippets
|
||||
*/
|
||||
@ -24,7 +24,7 @@ public:
|
||||
|
||||
/**
|
||||
* @interface InsertStore
|
||||
* @brief Inserts explicit store instruction before each result.
|
||||
* @brief Inserts explicit store instruction before each result and buffer.
|
||||
* The pass is used to convert model to a canonical form for code generation
|
||||
* @ingroup snippets
|
||||
*/
|
||||
|
43
src/common/snippets/include/snippets/pass/insert_loops.hpp
Normal file
43
src/common/snippets/include/snippets/pass/insert_loops.hpp
Normal file
@ -0,0 +1,43 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/pass/graph_rewrite.hpp>
|
||||
#include <ngraph/pattern/matcher.hpp>
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace pass {
|
||||
|
||||
/**
|
||||
* @interface InsertLoops
|
||||
* @brief Insert explicit Loop operations into the body to process multiple data entities during one kernel execution
|
||||
* @param master_shape - shape used to determine loop work amounts
|
||||
* @param loop_depth - the number of last master_shape dimensions processed by loops (aka tileRank - obsolete), could be 1 or 2
|
||||
* @param vector_size - the number of entities processed on one iteration of vector loop
|
||||
* @param single_loop_body - true, if we can just insert LoopBegin on inputs and LoopEnd on outputs, othwerwise
|
||||
* the pass goes all over the body analyzing where LoopBegin and LoopEnd should be inserted:
|
||||
* synchronization nodes are MatMul, Buffer and other already existing Loops.
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class InsertLoops: public ngraph::pass::FunctionPass {
|
||||
public:
|
||||
OPENVINO_RTTI("InsertLoops", "0");
|
||||
InsertLoops(ov::PartialShape master_shape, size_t loop_depth, size_t vector_size, bool is_optimized = true);
|
||||
bool run_on_model(const std::shared_ptr<ngraph::Function>& m) override;
|
||||
|
||||
static std::vector<bool> calculate_inner_apply_increments(const ov::PartialShape& master, const std::vector<ov::PartialShape>& shapes);
|
||||
static std::vector<bool> calculate_outer_apply_increments(const std::vector<ov::PartialShape>& shapes);
|
||||
static std::vector<int64_t> calculate_finalization_offsets(const ov::PartialShape& master, const std::vector<ov::PartialShape>& shapes);
|
||||
private:
|
||||
ov::PartialShape m_master_shape;
|
||||
size_t m_loop_depth;
|
||||
size_t m_vector_size;
|
||||
bool m_single_loop_body;
|
||||
};
|
||||
|
||||
} // namespace pass
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -20,6 +20,10 @@ namespace pass {
|
||||
class InsertMoveBroadcast: public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
InsertMoveBroadcast();
|
||||
|
||||
static Output<ngraph::Node> BroadcastNodeLastDim(const ngraph::Output<ngraph::Node>& value,
|
||||
const ov::PartialShape& target_shape,
|
||||
const ov::PartialShape& normalized_shape);
|
||||
};
|
||||
|
||||
} // namespace pass
|
||||
|
29
src/common/snippets/include/snippets/pass/loop_fusion.hpp
Normal file
29
src/common/snippets/include/snippets/pass/loop_fusion.hpp
Normal file
@ -0,0 +1,29 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/pass/graph_rewrite.hpp>
|
||||
#include <ngraph/pattern/matcher.hpp>
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace pass {
|
||||
|
||||
/**
|
||||
* @interface LoopFusion
|
||||
* @brief Fuse Loops into one Loop if their semantics allow it
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class LoopFusion: public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
LoopFusion();
|
||||
|
||||
private:
|
||||
bool Merge(const std::shared_ptr<op::LoopBegin>& buffer);
|
||||
};
|
||||
|
||||
} // namespace pass
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
99
src/common/snippets/include/snippets/pass/loop_helpers.hpp
Normal file
99
src/common/snippets/include/snippets/pass/loop_helpers.hpp
Normal file
@ -0,0 +1,99 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ngraph/op/op.hpp"
|
||||
#include "ngraph/op/parameter.hpp"
|
||||
#include "snippets/op/loop.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
/* ==== LoopBegin === */
|
||||
/**
|
||||
* @interface insertLoopBeginAfterOutputs
|
||||
* @brief Inserts LoopBegin operation after the group of operations described
|
||||
* by the input argument (OutputVector). Use insertLoopBegin instead - it has a more universal interface.
|
||||
* @ingroup snippets
|
||||
*/
|
||||
std::shared_ptr<LoopBegin> insertLoopBeginAfterOutputs(const OutputVector& originalOutputs);
|
||||
|
||||
/**
|
||||
* @interface insertLoopBegin
|
||||
* @brief Inserts LoopBegin operation after the group of operations described
|
||||
* by the input argument (ParameterVector, NodeVector or OutputVector).
|
||||
* @ingroup snippets
|
||||
*/
|
||||
template<typename T>
|
||||
std::shared_ptr<LoopBegin> insertLoopBegin(const T& afterTheseNodes) {
|
||||
static_assert(std::is_same<T, ParameterVector>() || std::is_same<T, NodeVector>(),
|
||||
"Unsupported template parameter for insertLoopBegin. Only ParameterVector or NodeVector is allowed");
|
||||
OutputVector originalOutputs;
|
||||
std::vector<std::set<Input<Node>>> childInputs;
|
||||
for (const auto &n : afterTheseNodes) {
|
||||
const auto& nodeOutputs = n->outputs();
|
||||
// Ignore the LoopBegin->LoopEnd edge to make it easier to construct enclosed Loops
|
||||
std::move(nodeOutputs.begin(), nodeOutputs.end() - 1 * ov::is_type<LoopBegin>(n), std::back_inserter(originalOutputs));
|
||||
}
|
||||
|
||||
return insertLoopBeginAfterOutputs(originalOutputs);
|
||||
}
|
||||
|
||||
template<>
|
||||
inline std::shared_ptr<LoopBegin> insertLoopBegin(const OutputVector& afterTheseNodes) {
|
||||
return insertLoopBeginAfterOutputs(afterTheseNodes);
|
||||
}
|
||||
/* ============== */
|
||||
|
||||
/* ==== LoopEnd === */
|
||||
/**
|
||||
* @interface insertLoopBeginAfterOutputs
|
||||
* @brief Inserts LoopBegin operation after the group of operations described
|
||||
* by the input argument (vector of inputs). Use insertLoopEnd instead - it has a more universal interface.
|
||||
* @param originalInputs LoopEnd will be inserted before these inputs
|
||||
* @param loopBegin pointer to the beginning of the Loop region
|
||||
* @param work_amount total number of evaluations to be processed by the loop
|
||||
* @param increment number of evaluations processed in one iteration of the loop
|
||||
* @param apply_increment describes which data pointers attributed to the loop should be incremented on every iteration.
|
||||
* should be used when Loop is connected to Parameters and/or Results
|
||||
* @param finalization_offsets pointer shifts that should be applied to data pointers before exiting the loop
|
||||
* @ingroup snippets
|
||||
*/
|
||||
|
||||
std::shared_ptr<LoopEnd> insertLoopEndBeforeInputs(const std::vector<Input<Node>>& originalInputs,
|
||||
const std::shared_ptr<LoopBegin>& loopBegin,
|
||||
size_t work_amount, size_t increment,
|
||||
std::vector<bool> apply_increment = {},
|
||||
std::vector<int64_t> finalization_offsets = {});
|
||||
|
||||
/**
|
||||
* @interface insertLoopEnd
|
||||
* @brief Inserts LoopEnd operation before the group of operations described
|
||||
* by the input argument (ResultVector, NodeVector or OutputVector).
|
||||
* @ingroup snippets
|
||||
*/
|
||||
template<typename T, typename ...Args>
|
||||
std::shared_ptr<LoopEnd> insertLoopEnd(const T& beforeTheseNodes, Args ...args) {
|
||||
static_assert(std::is_same<T, ResultVector>() || std::is_same<T, NodeVector>(),
|
||||
"Unsupported template parameter for insertLoopBegin. Only ParameterVector or NodeVector is allowed");
|
||||
std::vector<Input<Node>> originalInputs;
|
||||
for (const auto &n : beforeTheseNodes) {
|
||||
const auto& nodeInputs = n->inputs();
|
||||
// Ignore the LoopBegin->LoopEnd edge to facilitate enclosed Loops construction
|
||||
std::move(nodeInputs.begin(), nodeInputs.end() - 1 * ov::is_type<LoopEnd>(n), std::back_inserter(originalInputs));
|
||||
}
|
||||
return insertLoopEndBeforeInputs(originalInputs, args...);
|
||||
}
|
||||
|
||||
template<typename ...Args>
|
||||
std::shared_ptr<LoopEnd> insertLoopEnd(const std::vector<Input<Node>>& beforeTheseNodes, Args ...args) {
|
||||
return insertLoopEndBeforeInputs(beforeTheseNodes, args...);
|
||||
}
|
||||
/* ============== */
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -0,0 +1,28 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ngraph/pass/graph_rewrite.hpp"
|
||||
#include "ngraph/pattern/matcher.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace pass {
|
||||
|
||||
/**
|
||||
* @interface MatMulToBrgemm
|
||||
* @brief Replaces ngraph::MatMul with snippets::op::Brgemm operation (only non-trasposing MatMuls are currently supported)
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class MatMulToBrgemm: public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
OPENVINO_RTTI("MatMulToBrgemm", "0");
|
||||
MatMulToBrgemm();
|
||||
};
|
||||
|
||||
|
||||
} // namespace pass
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -0,0 +1,28 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/pass/graph_rewrite.hpp>
|
||||
#include <ngraph/pattern/matcher.hpp>
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace pass {
|
||||
|
||||
/**
|
||||
* @interface TokenizeMHASnippets
|
||||
* @brief The pass tokenizes MHA-pattern into Subgraph
|
||||
* TODO: Write pattern
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class TokenizeMHASnippets: public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
OPENVINO_RTTI("TokenizeMHASnippets", "0");
|
||||
TokenizeMHASnippets();
|
||||
};
|
||||
|
||||
} // namespace pass
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
29
src/common/snippets/include/snippets/pass/reset_buffer.hpp
Normal file
29
src/common/snippets/include/snippets/pass/reset_buffer.hpp
Normal file
@ -0,0 +1,29 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/pass/graph_rewrite.hpp>
|
||||
#include <ngraph/pattern/matcher.hpp>
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace pass {
|
||||
|
||||
/**
|
||||
* @interface ResetBufferState
|
||||
* @brief If there is Buffer between loops we should reset Buffer pointer after first loop execution (data storing) using finalization offsets
|
||||
* to have correct buffer data pointer for data loading in the next loop where data was stored in previous loop
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class ResetBufferState: public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
ResetBufferState();
|
||||
|
||||
static int64_t calculate_required_finalization_offsets(const size_t inner_master_work_amount, const size_t inner_target_work_amount);
|
||||
};
|
||||
|
||||
} // namespace pass
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -0,0 +1,30 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/pass/graph_rewrite.hpp>
|
||||
#include <ngraph/pattern/matcher.hpp>
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace pass {
|
||||
|
||||
/**
|
||||
* @interface SoftmaxDecomposition
|
||||
* @brief The pass decomposise Softmax into explicit Snippets dialects
|
||||
* Note:
|
||||
* - At the moment Snippets supports Softmax only in MHA pattern where there are Buffer ops before and after Softmax.
|
||||
* Also Snippets support Loops with Buffer ops on inputs and outputs if Buffer have the same buffer byte size
|
||||
* because of work with ptr increment. So we have to set Tile rank as buffer allocation rank even if rank 1 is enough
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class SoftmaxDecomposition: public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
SoftmaxDecomposition(const size_t vector_size, const int32_t buffer_allocation_rank = -1);
|
||||
};
|
||||
|
||||
} // namespace pass
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -0,0 +1,27 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/pass/graph_rewrite.hpp>
|
||||
#include <ngraph/pattern/matcher.hpp>
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace pass {
|
||||
|
||||
/**
|
||||
* @interface SoftmaxReshapeElimination
|
||||
* @brief The pass removes Reshape operations around Softmax if possible
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class SoftmaxReshapeElimination: public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
SoftmaxReshapeElimination();
|
||||
};
|
||||
|
||||
|
||||
} // namespace pass
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
58
src/common/snippets/include/snippets/pass/tokenization.hpp
Normal file
58
src/common/snippets/include/snippets/pass/tokenization.hpp
Normal file
@ -0,0 +1,58 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/pass/graph_rewrite.hpp>
|
||||
#include <ngraph/pattern/matcher.hpp>
|
||||
|
||||
#include "snippets/pass/mha_tokenization.hpp"
|
||||
#include "snippets/pass/collapse_subgraph.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace pass {
|
||||
|
||||
/*
|
||||
NotSet - default value returned by GetSnippetsNodeType(...) if the node wasn't marked
|
||||
SkippedByPlugin - indicate that snippets can't include this node in subgraph. Can be set by Plugin via SetSnippetsNodeType(...).
|
||||
*/
|
||||
enum class SnippetsNodeType : int64_t {NotSet, SkippedByPlugin};
|
||||
void SetSnippetsNodeType(const std::shared_ptr<Node>&, SnippetsNodeType);
|
||||
SnippetsNodeType GetSnippetsNodeType(const std::shared_ptr<const Node>&);
|
||||
void SetTopologicalOrder(const std::shared_ptr<Node>&, int64_t);
|
||||
int64_t GetTopologicalOrder(const std::shared_ptr<const Node>&);
|
||||
|
||||
/**
|
||||
* @interface EnumerateNodes
|
||||
* @brief Snippets rely on topological order to avoid creating cyclic dependencies. This transformation sets the order.
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class EnumerateNodes : public ov::pass::ModelPass {
|
||||
public:
|
||||
OPENVINO_RTTI("EnumerateNodes", "0");
|
||||
EnumerateNodes() : ModelPass() {}
|
||||
bool run_on_model(const std::shared_ptr<ov::Model>&) override;
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* @interface SnippetsTokenization
|
||||
* @brief Splits model to supported subgraphs
|
||||
* 1. Enumerate nodes by topological order
|
||||
* 2. MHA tokenization
|
||||
* 3. Common tokenization
|
||||
* 4. Some common transformations for Subgraphs. For example, FakeQuantize decomposition
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class SnippetsTokenization : public ngraph::pass::FunctionPass {
|
||||
public:
|
||||
OPENVINO_RTTI("SnippetsTokenization", "0");
|
||||
bool run_on_model(const std::shared_ptr<ov::Model>& m) override;
|
||||
};
|
||||
|
||||
|
||||
} // namespace pass
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -0,0 +1,28 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/pass/graph_rewrite.hpp>
|
||||
#include <ngraph/pattern/matcher.hpp>
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace pass {
|
||||
|
||||
/**
|
||||
* @interface TransposeDecomposition
|
||||
* @brief Decompose Transpose to Load + Store wrapped in several loops.
|
||||
* @ingroup snippets
|
||||
*/
|
||||
class TransposeDecomposition: public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
OPENVINO_RTTI("TransposeDecomposition", "0");
|
||||
TransposeDecomposition();
|
||||
static const std::set<std::vector<int>> supported_cases;
|
||||
};
|
||||
|
||||
} // namespace pass
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -9,16 +9,21 @@
|
||||
|
||||
#include "op/broadcastload.hpp"
|
||||
#include "op/broadcastmove.hpp"
|
||||
#include "op/buffer.hpp"
|
||||
#include "op/convert_saturation.hpp"
|
||||
#include "op/convert_truncation.hpp"
|
||||
#include "op/horizon_max.hpp"
|
||||
#include "op/horizon_sum.hpp"
|
||||
#include "op/fill.hpp"
|
||||
#include "op/kernel.hpp"
|
||||
#include "op/load.hpp"
|
||||
#include "op/nop.hpp"
|
||||
#include "op/scalar.hpp"
|
||||
#include "op/powerstatic.hpp"
|
||||
#include "op/store.hpp"
|
||||
#include "op/tile.hpp"
|
||||
#include "op/tile_scheduler.hpp"
|
||||
#include "op/loop.hpp"
|
||||
#include "op/brgemm.hpp"
|
||||
#include "op/vector_buffer.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
|
@ -11,6 +11,10 @@
|
||||
|
||||
// SnippetS dialect
|
||||
NGRAPH_OP(Load, ngraph::snippets::op)
|
||||
NGRAPH_OP(LoadReshape, ngraph::snippets::op)
|
||||
NGRAPH_OP(LoopBegin, ngraph::snippets::op)
|
||||
NGRAPH_OP(LoopEnd, ngraph::snippets::op)
|
||||
NGRAPH_OP(Brgemm, ngraph::snippets::op)
|
||||
NGRAPH_OP(BroadcastLoad, ngraph::snippets::op)
|
||||
|
||||
NGRAPH_OP(Store, ngraph::snippets::op)
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include "snippets_isa.hpp"
|
||||
#include "emitter.hpp"
|
||||
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace utils {
|
||||
@ -23,6 +24,15 @@ inline auto is_scalar_constant(const std::shared_ptr<ngraph::Node>& source_outpu
|
||||
return ngraph::is_type<ngraph::opset1::Constant>(source_output_node) && ngraph::shape_size(source_output_node->get_shape()) == 1;
|
||||
}
|
||||
|
||||
|
||||
ov::PartialShape get_port_planar_shape(const Output<Node>& out);
|
||||
ov::PartialShape get_reordered_planar_shape(const ov::PartialShape& shape, const std::vector<size_t>& layout);
|
||||
std::vector<size_t> get_node_output_layout(const std::shared_ptr<Node>& node);
|
||||
std::vector<size_t> get_node_output_layout(const Node* node);
|
||||
|
||||
inline ov::Dimension get_inner_dim(const ov::PartialShape &shape) { return *(shape.rbegin()); }
|
||||
inline ov::Dimension get_outer_dim(const ov::PartialShape &shape) { return *(shape.rbegin() + 1); }
|
||||
|
||||
} // namespace utils
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
||||
|
@ -6,106 +6,219 @@
|
||||
#include "snippets/pass/assign_registers.hpp"
|
||||
#include "snippets/pass/vector_to_scalar.hpp"
|
||||
#include "snippets/pass/insert_load_store.hpp"
|
||||
#include "snippets/op/tile.hpp"
|
||||
#include "snippets/op/loop.hpp"
|
||||
#include "snippets/op/subgraph.hpp"
|
||||
#include "snippets/op/kernel.hpp"
|
||||
#include <snippets/itt.hpp>
|
||||
|
||||
#include <ngraph/pass/manager.hpp>
|
||||
#include <openvino/core/type.hpp>
|
||||
|
||||
auto ngraph::snippets::getRegisters(std::shared_ptr<ngraph::Node>& n) -> ngraph::snippets::RegInfo {
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
|
||||
auto getRegisters(const std::shared_ptr<ngraph::Node> &n) -> RegInfo {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::getRegisters")
|
||||
auto rt = n->get_rt_info();
|
||||
|
||||
// ToDo: change to reg_t
|
||||
std::vector<size_t> rin, rout;
|
||||
|
||||
auto it_rt = rt.find("reginfo");
|
||||
if (it_rt != rt.end()) {
|
||||
for (auto reg : it_rt->second.as<std::vector<size_t>>()) {
|
||||
rout.push_back(reg);
|
||||
}
|
||||
for (const auto& output : n->outputs()) {
|
||||
const auto& rt = output.get_tensor_ptr()->get_rt_info();
|
||||
auto it_rt = rt.find("reginfo");
|
||||
if (it_rt != rt.end())
|
||||
rout.push_back(it_rt->second.as<size_t>());
|
||||
}
|
||||
|
||||
for (const auto& input : n->inputs()) {
|
||||
auto rt = input.get_source_output().get_node_shared_ptr()->get_rt_info();
|
||||
auto rt = input.get_source_output().get_tensor_ptr()->get_rt_info();
|
||||
auto it_rt = rt.find("reginfo");
|
||||
if (it_rt != rt.end()) {
|
||||
for (auto& reg : it_rt->second.as<std::vector<size_t>>()) {
|
||||
rin.push_back(reg);
|
||||
}
|
||||
}
|
||||
if (it_rt != rt.end())
|
||||
rin.push_back(it_rt->second.as<size_t>());
|
||||
}
|
||||
|
||||
return std::make_pair(rin, rout);
|
||||
}
|
||||
|
||||
auto tail_transformations(NodeVector& tail, const size_t tail_size, const ngraph::snippets::Generator::GeneratorConfig& config) -> void {
|
||||
NodeVector updated_tile;
|
||||
auto insertFill = [tail_size](const ov::Input<ov::Node>& input) -> std::shared_ptr<ov::Node> {
|
||||
auto copyRegInfo = [](const ov::descriptor::Tensor& from, ov::descriptor::Tensor& to) -> void {
|
||||
auto rt = from.get_rt_info();
|
||||
auto reginfo = rt.find("reginfo");
|
||||
if (reginfo != rt.end()) {
|
||||
to.get_rt_info()["reginfo"] = reginfo->second;
|
||||
}
|
||||
};
|
||||
std::shared_ptr<ov::Node> fill = nullptr;
|
||||
auto& rt = input.get_rt_info();
|
||||
auto fill_rt = rt.find("set_fill");
|
||||
if (fill_rt != rt.end()) {
|
||||
const auto fill_value = fill_rt->second.as<uint32_t>();
|
||||
fill = std::make_shared<ngraph::snippets::op::Fill>(input.get_source_output(), tail_size, fill_value);
|
||||
input.get_node()->set_argument(input.get_index(), fill);
|
||||
// we should explicitly copy reg info because we insert Fill after assign register
|
||||
copyRegInfo(fill->get_input_tensor(0), fill->get_output_tensor(0));
|
||||
}
|
||||
return fill;
|
||||
};
|
||||
|
||||
for (auto& op : tail) {
|
||||
// We should fill vector regs by float_min and zero to have
|
||||
// correct math calculations for ReduceMax and ReduceSum in scalar case.
|
||||
// Note: We find Maximum and Add ops because HorizonMax and HorizonSum are outside Loop,
|
||||
// so they are missed in <tail>
|
||||
if (config.m_need_fill_tail_register &&
|
||||
(ov::is_type<ov::op::v1::Maximum>(op) ||
|
||||
ov::is_type<ov::op::v1::Add>(op))) {
|
||||
for (auto i = 0; i < op->inputs().size(); ++i) {
|
||||
if (auto fill = insertFill(op->input(i))) {
|
||||
updated_tile.push_back(fill);
|
||||
}
|
||||
}
|
||||
} else if (const auto memory_access = std::dynamic_pointer_cast<ngraph::snippets::op::MemoryAccess>(op)) {
|
||||
if (memory_access->get_count() != 1) {
|
||||
memory_access->set_count(tail_size);
|
||||
}
|
||||
}
|
||||
updated_tile.push_back(op);
|
||||
}
|
||||
|
||||
tail = std::move(updated_tile);
|
||||
}
|
||||
|
||||
ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptr<ov::Model>& m,
|
||||
const void* compile_params) const {
|
||||
const GeneratorConfig& config,
|
||||
const void* compile_params) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::Generator::generate")
|
||||
if (!target->is_supported())
|
||||
throw ngraph_error("unsupported architecture for code genration");
|
||||
|
||||
auto params = m->get_parameters();
|
||||
auto results = m->get_results();
|
||||
auto in = params.size();
|
||||
auto out = results.size();
|
||||
std::vector<size_t> io_last_dims(in + out);
|
||||
std::vector<size_t> io_data_sizes(in + out);
|
||||
std::transform(params.begin(), params.end(), io_last_dims.begin(),
|
||||
[](const std::shared_ptr<Node>& n){return n->get_output_shape(0).back();});
|
||||
std::transform(results.begin(), results.end(), io_last_dims.begin() + in,
|
||||
[](const std::shared_ptr<Node>& n){return n->get_input_shape(0).back();});
|
||||
std::transform(params.begin(), params.end(), io_data_sizes.begin(),
|
||||
[](const std::shared_ptr<Node>& n){return n->get_element_type().size();});
|
||||
std::transform(results.begin(), results.end(), io_data_sizes.begin() + in,
|
||||
[](const std::shared_ptr<Node>& n){return n->get_element_type().size();});
|
||||
throw ngraph_error("unsupported architecture for code generation");
|
||||
|
||||
OV_ITT_TASK_CHAIN(GENERATE, ngraph::pass::itt::domains::SnippetsTransform, "Snippets::Generator", "::VectorTile")
|
||||
// vector tile
|
||||
// vector loop
|
||||
std::vector<AllocatedEmitter> lowered;
|
||||
for (auto n : m->get_ordered_ops()) {
|
||||
lowered.emplace_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n)));
|
||||
}
|
||||
OV_ITT_TASK_NEXT(GENERATE, "::ScalarTile")
|
||||
auto lower_ops = [&lowered, this](const NodeVector& ops){
|
||||
std::transform(ops.begin(), ops.end(), std::back_inserter(lowered),
|
||||
[this](const std::shared_ptr<Node>& n){
|
||||
return std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n));
|
||||
});
|
||||
};
|
||||
// *1* solo vector/tail loop + empty outer loop
|
||||
// => skip increments (both counter & ptr) : set evaluate_once flag
|
||||
// *2* solo vector/tail loop + non-empty outer loop
|
||||
// => skip counter increments but perform ptr increments : set evaluate_once,
|
||||
// and perform pointer increments through finalization offsets
|
||||
// *3* vector loop(s) + one tail loop
|
||||
// => vector as usual, tail depends on outer loop, see *1* and *2*
|
||||
auto optimize_single_evaluation = [](const std::shared_ptr<op::LoopEnd>& loop, bool force_ptr_increment = false) {
|
||||
if (loop->get_work_amount() < 2 * loop->get_increment()) {
|
||||
loop->set_evaluate_once(true);
|
||||
if (force_ptr_increment || loop->has_outer_loop) {
|
||||
std::vector<int64_t> new_finalization_offsets(loop->get_finalization_offsets());
|
||||
const auto& ptr_increments = loop->get_ptr_increments();
|
||||
for (auto i = 0; i < new_finalization_offsets.size(); i++) {
|
||||
new_finalization_offsets[i] += ptr_increments[i];
|
||||
}
|
||||
loop->set_finalization_offsets(new_finalization_offsets);
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
const auto& ops = m->get_ordered_ops();
|
||||
for (auto op = ops.begin(); op < ops.end(); op++) {
|
||||
const auto& loop_begin = ov::as_type_ptr<ngraph::snippets::op::LoopBegin>(*op);
|
||||
|
||||
// scalar tile
|
||||
auto m_scalar = ov::clone_model(*m.get());
|
||||
ngraph::pass::Manager mng;
|
||||
mng.register_pass<ngraph::snippets::pass::SetScalarCountForLoad>();
|
||||
mng.register_pass<ngraph::snippets::pass::SetScalarCountForStore>();
|
||||
mng.run_passes(m_scalar);
|
||||
OV_ITT_TASK_NEXT(GENERATE, "::ScalarTile_get")
|
||||
std::vector<AllocatedEmitter> scalar_lowered;
|
||||
for (auto n : m_scalar->get_ordered_ops()) {
|
||||
scalar_lowered.emplace_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n)));
|
||||
}
|
||||
OV_ITT_TASK_NEXT(GENERATE, "::Tiles1D");
|
||||
// wrapping into tiles1D
|
||||
//todo: in, out, and io_last_dims should derive naturally from the graph representation
|
||||
const auto& vector_tile = std::make_shared<ngraph::snippets::op::Tile>(lowered, target->get_lanes(), in, out, io_last_dims, io_data_sizes);
|
||||
const auto& vector_region = std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(vector_tile),
|
||||
std::make_pair(std::vector<size_t>{}, std::vector<size_t>{}));
|
||||
const auto& scalar_tile = std::make_shared<ngraph::snippets::op::Tile>(scalar_lowered, 1, in, out, io_last_dims, io_data_sizes);
|
||||
const auto& scalar_region = std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(scalar_tile),
|
||||
std::make_pair(std::vector<size_t>{}, std::vector<size_t>{}));
|
||||
// ignore outer loops and possible manual scalar loops
|
||||
if (loop_begin && loop_begin->get_increment() != 1) {
|
||||
OV_ITT_TASK_NEXT(GENERATE, "::VectorLoop")
|
||||
NodeVector vector_loop, tail_loop;
|
||||
std::shared_ptr<op::LoopEnd> vector_loop_end, tail_loop_end;
|
||||
vector_loop_end = loop_begin->get_loop_end();
|
||||
tail_loop_end = nullptr;
|
||||
while (*op != vector_loop_end)
|
||||
vector_loop.push_back(*op++);
|
||||
vector_loop.push_back(*op);
|
||||
const auto work_amount = vector_loop_end->get_work_amount();
|
||||
const auto increment = vector_loop_end->get_increment();
|
||||
const auto tail_size = work_amount % increment;
|
||||
const auto need_tail = tail_size != 0;
|
||||
const auto need_vector_loop = work_amount >= increment;
|
||||
// Note, that finalization_offsets could be modified inside optimize_single_evaluation,
|
||||
// so need to save them here to cover (evaluate_once vector with non-zero finalization_offsets + tail)
|
||||
std::vector<int64_t> tail_finalization_offsets = need_tail ? vector_loop_end->get_finalization_offsets() : std::vector<int64_t> {};
|
||||
// vector loops are required => Just copy the body, original loop is already a vector one
|
||||
if (need_vector_loop) {
|
||||
// Note that finalization offsets should be applied after the last iteration.
|
||||
// So if there is a tail, then we should apply offsets after it, but not now.
|
||||
if (need_tail)
|
||||
vector_loop_end->set_finalization_offsets(std::vector<int64_t>(tail_finalization_offsets.size(), 0));
|
||||
|
||||
OV_ITT_TASK_NEXT(GENERATE, "::Tiles2D")
|
||||
// wrapping into tiles2D
|
||||
auto tile_scheduler = std::make_shared<ngraph::snippets::op::TileScheduler>(vector_region, scalar_region);
|
||||
tile_scheduler->compile_params = compile_params;
|
||||
const auto& tile_scheduler_region = std::make_pair(target->get(ngraph::snippets::op::TileScheduler::get_type_info_static())(tile_scheduler),
|
||||
std::make_pair(std::vector<size_t>({in, out, target->get_lanes()}), std::vector<size_t>{}));
|
||||
if (config.m_optimize_single_evaluation) {
|
||||
// force ptr increments if there is tail
|
||||
optimize_single_evaluation(vector_loop_end, need_tail);
|
||||
}
|
||||
|
||||
lower_ops(vector_loop);
|
||||
}
|
||||
OV_ITT_TASK_NEXT(GENERATE, "::TailLoop")
|
||||
// tail is required => transform the body into a tail representation
|
||||
// tail loop is fake loop because for tail we should calculate only
|
||||
// finalization offsets which are supported by LoopEnd.
|
||||
if (need_tail) {
|
||||
NodeMap vector_to_tail_node_map;
|
||||
tail_loop = ngraph::clone_nodes(vector_loop, vector_to_tail_node_map);
|
||||
tail_transformations(tail_loop, tail_size, config);
|
||||
tail_loop_end = ov::as_type_ptr<op::LoopEnd>(*tail_loop.rbegin());
|
||||
tail_loop_end->set_finalization_offsets(tail_finalization_offsets);
|
||||
tail_loop_end->set_increment(tail_size);
|
||||
// ptr increments were set to the old increment, need to update them in accordance with the new one
|
||||
tail_loop_end->update_ptr_increments(static_cast<int64_t>(tail_size));
|
||||
tail_loop_end->set_work_amount(tail_size);
|
||||
tail_loop_end->has_outer_loop = vector_loop_end->has_outer_loop;
|
||||
|
||||
if (config.m_optimize_single_evaluation) {
|
||||
// tail loop is always executed once
|
||||
optimize_single_evaluation(tail_loop_end);
|
||||
}
|
||||
|
||||
lower_ops(tail_loop);
|
||||
}
|
||||
} else {
|
||||
lower_ops({*op});
|
||||
}
|
||||
}
|
||||
|
||||
OV_ITT_TASK_NEXT(GENERATE, "::EmitCode")
|
||||
// emission
|
||||
auto tiles2DKernel = std::make_shared<ngraph::snippets::op::Kernel>(std::vector<AllocatedEmitter> {tile_scheduler_region});
|
||||
tiles2DKernel->compile_params = compile_params;
|
||||
std::shared_ptr<Emitter> kernel = target->get(ngraph::snippets::op::Kernel::get_type_info_static())(tiles2DKernel);
|
||||
kernel->emit_code({in, out}, {});
|
||||
//todo: Kernel need info on i/o data access pattern and data shapes to calculate data offsets
|
||||
// pass Params and Results
|
||||
// todo: it's probably better to move AllocaledEmitter creation inside Kernel constructor
|
||||
// So Kernel accepts only model ptr and target, and creates AllocatedEmitter inside
|
||||
//emission
|
||||
auto loops2DKernel = std::make_shared<op::Kernel>(lowered, m);
|
||||
loops2DKernel->compile_params = compile_params;
|
||||
std::shared_ptr<Emitter> kernel = target->get(op::Kernel::get_type_info_static())(loops2DKernel);
|
||||
|
||||
kernel->emit_code({}, {});
|
||||
|
||||
OV_ITT_TASK_NEXT(GENERATE, "::EmitData")
|
||||
lowered.insert(lowered.end(), scalar_lowered.begin(), scalar_lowered.end());
|
||||
for (auto& op : lowered) {
|
||||
op.first->emit_data();
|
||||
}
|
||||
OV_ITT_TASK_NEXT(GENERATE, "::GetSnippet")
|
||||
|
||||
// todo: we save lowered to access compiled brgemm kernels on execution time (normally lowered is destructed by then)
|
||||
// remove this when kernel caching is implemented. Don't forget to make generate const method.
|
||||
if (config.m_save_lowered_code)
|
||||
lowered_saved = lowered;
|
||||
|
||||
return target->get_snippet();
|
||||
}
|
||||
|
||||
std::shared_ptr<const TargetMachine> Generator::get_target_machine() const {
|
||||
return target;
|
||||
}
|
||||
|
||||
}// namespace snippets
|
||||
}// namespace ngraph
|
||||
|
64
src/common/snippets/src/op/brgemm.cpp
Normal file
64
src/common/snippets/src/op/brgemm.cpp
Normal file
@ -0,0 +1,64 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "snippets/itt.hpp"
|
||||
#include "snippets/op/brgemm.hpp"
|
||||
#include "ngraph/runtime/host_tensor.hpp"
|
||||
#include "openvino/core/rt_info.hpp"
|
||||
#include "snippets/utils.hpp"
|
||||
#include "matmul_shape_inference.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
Brgemm::Brgemm(const Output<Node>& A, const Output<Node>& B, const size_t offset_a, const size_t offset_b, const size_t offset_c)
|
||||
: MatMul(), m_offset_a(offset_a), m_offset_b(offset_b), m_offset_c(offset_c) {
|
||||
set_arguments({A, B});
|
||||
set_output_size(1);
|
||||
constructor_validate_and_infer_types();
|
||||
}
|
||||
|
||||
bool Brgemm::visit_attributes(AttributeVisitor& visitor) {
|
||||
MatMul::visit_attributes(visitor);
|
||||
visitor.on_attribute("offset_a", m_offset_a);
|
||||
visitor.on_attribute("offset_b", m_offset_b);
|
||||
visitor.on_attribute("offset_c", m_offset_c);
|
||||
return true;
|
||||
}
|
||||
|
||||
void Brgemm::validate_and_infer_types() {
|
||||
INTERNAL_OP_SCOPE(Brgemm_validate_and_infer_types);
|
||||
element::Type result_et;
|
||||
NODE_VALIDATION_CHECK(this,
|
||||
element::Type::merge(result_et, get_input_element_type(0), get_input_element_type(1)),
|
||||
"Arguments do not have the same element type (arg0 element type: ",
|
||||
get_input_element_type(0),
|
||||
", arg1 element type: ",
|
||||
get_input_element_type(1),
|
||||
").");
|
||||
// If no leading dimensions are provided, assume dense row-major inputs-outputs
|
||||
NODE_VALIDATION_CHECK(this, get_input_partial_shape(0).is_static() && get_input_partial_shape(1).is_static(),
|
||||
"Brgemm currently supports only static shapes.");
|
||||
|
||||
std::vector<ov::PartialShape> planar_input_shapes;
|
||||
for (const auto& in : input_values())
|
||||
planar_input_shapes.emplace_back(utils::get_port_planar_shape(in));
|
||||
|
||||
std::vector<ov::PartialShape> output_shapes = {ov::PartialShape{}};
|
||||
ov::op::v0::shape_infer(this, planar_input_shapes, output_shapes);
|
||||
const auto& output_layout = utils::get_node_output_layout(this);
|
||||
output_shapes[0] = utils::get_reordered_planar_shape(output_shapes[0], output_layout);
|
||||
set_output_type(0, result_et, output_shapes[0]);
|
||||
}
|
||||
|
||||
std::shared_ptr<Node> Brgemm::clone_with_new_inputs(const OutputVector& new_args) const {
|
||||
INTERNAL_OP_SCOPE(Brgemm_clone_with_new_inputs);
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<Brgemm>(new_args.at(0), new_args.at(1), m_offset_a, m_offset_b, m_offset_c);
|
||||
}
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -11,15 +11,21 @@
|
||||
using namespace std;
|
||||
using namespace ngraph;
|
||||
|
||||
snippets::op::BroadcastLoad::BroadcastLoad(const Output<Node>& x, Shape shape)
|
||||
: BroadcastMove(x, shape) {
|
||||
snippets::op::BroadcastLoad::BroadcastLoad(const Output<Node>& x, ov::PartialShape shape, size_t offset)
|
||||
: BroadcastMove(x, std::move(shape)), m_offset(offset) {
|
||||
constructor_validate_and_infer_types();
|
||||
}
|
||||
|
||||
bool snippets::op::BroadcastLoad::visit_attributes(AttributeVisitor& visitor) {
|
||||
BroadcastMove::visit_attributes(visitor);
|
||||
visitor.on_attribute("offset", m_offset);
|
||||
return true;
|
||||
}
|
||||
|
||||
std::shared_ptr<Node> snippets::op::BroadcastLoad::clone_with_new_inputs(const OutputVector& new_args) const {
|
||||
INTERNAL_OP_SCOPE(BroadcastLoad);
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<BroadcastLoad>(new_args.at(0), output_shape);
|
||||
return std::make_shared<BroadcastLoad>(new_args.at(0), output_shape, m_offset);
|
||||
}
|
||||
|
||||
void snippets::op::BroadcastLoad::validate_and_infer_types() {
|
||||
|
@ -12,7 +12,7 @@
|
||||
using namespace std;
|
||||
using namespace ngraph;
|
||||
|
||||
snippets::op::BroadcastMove::BroadcastMove(const Output<Node>& x, Shape shape) : Op({x}), output_shape(shape) {
|
||||
snippets::op::BroadcastMove::BroadcastMove(const Output<Node>& x, ov::PartialShape shape) : Op({x}), output_shape(std::move(shape)) {
|
||||
constructor_validate_and_infer_types();
|
||||
}
|
||||
|
||||
@ -24,44 +24,9 @@ bool snippets::op::BroadcastMove::visit_attributes(AttributeVisitor& visitor) {
|
||||
std::shared_ptr<Node> snippets::op::BroadcastMove::clone_with_new_inputs(const OutputVector& new_args) const {
|
||||
INTERNAL_OP_SCOPE(BroadcastMove);
|
||||
check_new_args_count(this, new_args);
|
||||
auto other = std::make_shared<BroadcastMove>(new_args.at(0), this->output_shape);
|
||||
return other;
|
||||
return std::make_shared<BroadcastMove>(new_args.at(0), output_shape);
|
||||
}
|
||||
|
||||
void snippets::op::BroadcastMove::validate_and_infer_types() {
|
||||
set_output_type(0, get_input_element_type(0), this->output_shape);
|
||||
}
|
||||
|
||||
bool snippets::op::BroadcastMove::evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const {
|
||||
INTERNAL_OP_SCOPE(BroadcastMove);
|
||||
NGRAPH_CHECK(input_values.size() == this->inputs().size(), "wrong input config");
|
||||
NGRAPH_CHECK(output_values.size() == this->outputs().size(), "wrong output config");
|
||||
NGRAPH_CHECK(input_values.size() == output_values.size() && input_values.size() == 1, "must be 1->1 operation");
|
||||
NGRAPH_CHECK(this->output(0).get_shape() == output_values[0]->get_shape(), "output vector must have the same shape as output port");
|
||||
NGRAPH_CHECK(this->input(0).get_shape() == input_values[0]->get_shape(), "input and output must have same shape");
|
||||
|
||||
auto ishape = input_values[0]->get_shape();
|
||||
auto oshape = output_values[0]->get_shape();
|
||||
|
||||
NGRAPH_CHECK(ishape.size() == oshape.size(), "input and output should have the same rank");
|
||||
|
||||
AxisSet broadcast_axes;
|
||||
for (size_t k = 0; k < ishape.size(); k++) {
|
||||
if (!((ishape[k] == oshape[k])
|
||||
|| (ishape[k] != oshape[k] && ((ishape[k] == 1) != (oshape[k] == 1) ) ))) {
|
||||
throw ngraph_error("FakeBroadcast::evaluate incompatible shapes");
|
||||
}
|
||||
|
||||
if (ishape[k] != oshape[k]) {
|
||||
broadcast_axes.insert(k);
|
||||
}
|
||||
}
|
||||
|
||||
runtime::reference::broadcast(input_values[0]->get_data_ptr<char>(),
|
||||
output_values[0]->get_data_ptr<char>(),
|
||||
input_values[0]->get_shape(),
|
||||
output_values[0]->get_shape(),
|
||||
broadcast_axes,
|
||||
sizeof(float));
|
||||
return true;
|
||||
}
|
||||
}
|
53
src/common/snippets/src/op/buffer.cpp
Normal file
53
src/common/snippets/src/op/buffer.cpp
Normal file
@ -0,0 +1,53 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <snippets/itt.hpp>
|
||||
|
||||
#include "snippets/op/buffer.hpp"
|
||||
#include "snippets/snippets_isa.hpp"
|
||||
|
||||
#include <ngraph/runtime/host_tensor.hpp>
|
||||
|
||||
using namespace std;
|
||||
using namespace ngraph;
|
||||
|
||||
auto normalize_rank(int32_t allocation_rank, const size_t shape_rank) -> int32_t {
|
||||
return allocation_rank < 0 ? allocation_rank + static_cast<int32_t>(shape_rank) : allocation_rank;
|
||||
}
|
||||
|
||||
snippets::op::Buffer::Buffer(const Output<Node>& x, const int32_t allocation_rank) : Op({x}), m_allocation_rank(allocation_rank) {
|
||||
constructor_validate_and_infer_types();
|
||||
}
|
||||
|
||||
bool snippets::op::Buffer::visit_attributes(AttributeVisitor& visitor) {
|
||||
INTERNAL_OP_SCOPE(Buffer_visit_attributes);
|
||||
visitor.on_attribute("allocation_rank", m_allocation_rank);
|
||||
return true;
|
||||
}
|
||||
|
||||
std::shared_ptr<Node> snippets::op::Buffer::clone_with_new_inputs(const OutputVector& new_args) const {
|
||||
INTERNAL_OP_SCOPE(Buffer_clone_with_new_inputs);
|
||||
check_new_args_count(this, new_args);
|
||||
auto new_buffer = std::make_shared<Buffer>(new_args.at(0), m_allocation_rank);
|
||||
return new_buffer;
|
||||
}
|
||||
|
||||
void snippets::op::Buffer::validate_and_infer_types() {
|
||||
INTERNAL_OP_SCOPE(Buffer_validate_and_infer_types);
|
||||
const auto shape_rank = get_input_partial_shape(0).rank();
|
||||
if (shape_rank.is_static()) {
|
||||
const auto normalized_rank = normalize_rank(m_allocation_rank, shape_rank.get_length());
|
||||
NGRAPH_CHECK(normalized_rank >= 0 && normalized_rank <= shape_rank.get_length(),
|
||||
"Buffer has incorrect allocation rank: " + std::to_string(m_allocation_rank));
|
||||
}
|
||||
set_output_type(0, get_input_element_type(0), get_input_partial_shape(0));
|
||||
}
|
||||
|
||||
size_t ngraph::snippets::op::Buffer::get_byte_size() const {
|
||||
const auto pshape = get_input_partial_shape(0);
|
||||
NGRAPH_CHECK(pshape.is_static(), "Buffer should have static shapes for memory allocation");
|
||||
const auto shape = pshape.get_shape();
|
||||
const auto normalized_rank = normalize_rank(m_allocation_rank, shape.size());
|
||||
return ngraph::shape_size(shape.rbegin(), shape.rbegin() + normalized_rank) * get_element_type().size();
|
||||
}
|
38
src/common/snippets/src/op/fill.cpp
Normal file
38
src/common/snippets/src/op/fill.cpp
Normal file
@ -0,0 +1,38 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <snippets/itt.hpp>
|
||||
|
||||
#include "snippets/op/fill.hpp"
|
||||
|
||||
#include <ngraph/runtime/host_tensor.hpp>
|
||||
|
||||
using namespace std;
|
||||
using namespace ngraph;
|
||||
|
||||
snippets::op::Fill::Fill(const Output<Node>& x, const size_t offset, const uint32_t fill_value)
|
||||
: Op({x}), m_offset(offset), m_fill_value(fill_value) {
|
||||
constructor_validate_and_infer_types();
|
||||
}
|
||||
|
||||
bool snippets::op::Fill::visit_attributes(AttributeVisitor& visitor) {
|
||||
INTERNAL_OP_SCOPE(Fill_visit_attributes);
|
||||
visitor.on_attribute("offset", m_offset);
|
||||
visitor.on_attribute("fill_value", m_fill_value);
|
||||
return true;
|
||||
}
|
||||
|
||||
std::shared_ptr<Node> snippets::op::Fill::clone_with_new_inputs(const OutputVector& new_args) const {
|
||||
INTERNAL_OP_SCOPE(Fill_clone_with_new_inputs);
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<Fill>(new_args.at(0), m_offset, m_fill_value);
|
||||
}
|
||||
|
||||
void snippets::op::Fill::validate_and_infer_types() {
|
||||
INTERNAL_OP_SCOPE(Fill_validate_and_infer_types);
|
||||
const auto in_type = get_input_element_type(0);
|
||||
NGRAPH_CHECK(in_type.size() == 4, "Fill operation supports only element types with 4 byte size but got:" + std::to_string(in_type.size()));
|
||||
set_output_type(0, get_input_element_type(0), get_input_partial_shape(0));
|
||||
}
|
||||
|
28
src/common/snippets/src/op/horizon_max.cpp
Normal file
28
src/common/snippets/src/op/horizon_max.cpp
Normal file
@ -0,0 +1,28 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <snippets/itt.hpp>
|
||||
#include "snippets/op/horizon_max.hpp"
|
||||
|
||||
using namespace std;
|
||||
using namespace ngraph;
|
||||
|
||||
snippets::op::HorizonMax::HorizonMax(const Output<Node>& x) : Op({x}) {
|
||||
constructor_validate_and_infer_types();
|
||||
}
|
||||
|
||||
std::shared_ptr<Node> snippets::op::HorizonMax::clone_with_new_inputs(const OutputVector& new_args) const {
|
||||
INTERNAL_OP_SCOPE(HorizonMax_clone_with_new_inputs);
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<HorizonMax>(new_args.at(0));
|
||||
}
|
||||
|
||||
void snippets::op::HorizonMax::validate_and_infer_types() {
|
||||
INTERNAL_OP_SCOPE(HorizonMax_validate_and_infer_types);
|
||||
auto new_shape = get_input_partial_shape(0);
|
||||
if (!ov::is_scalar(new_shape)) {
|
||||
new_shape[new_shape.size() - 1] = 1lu;
|
||||
}
|
||||
set_output_type(0, get_input_element_type(0), new_shape);
|
||||
}
|
28
src/common/snippets/src/op/horizon_sum.cpp
Normal file
28
src/common/snippets/src/op/horizon_sum.cpp
Normal file
@ -0,0 +1,28 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <snippets/itt.hpp>
|
||||
#include "snippets/op/horizon_sum.hpp"
|
||||
|
||||
using namespace std;
|
||||
using namespace ngraph;
|
||||
|
||||
snippets::op::HorizonSum::HorizonSum(const Output<Node>& x) : Op({x}) {
|
||||
constructor_validate_and_infer_types();
|
||||
}
|
||||
|
||||
std::shared_ptr<Node> snippets::op::HorizonSum::clone_with_new_inputs(const OutputVector& new_args) const {
|
||||
INTERNAL_OP_SCOPE(HorizonSum_clone_with_new_inputs);
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<HorizonSum>(new_args.at(0));
|
||||
}
|
||||
|
||||
void snippets::op::HorizonSum::validate_and_infer_types() {
|
||||
INTERNAL_OP_SCOPE(HorizonSum_validate_and_infer_types);
|
||||
auto new_shape = get_input_partial_shape(0);
|
||||
if (!ov::is_scalar(new_shape)) {
|
||||
new_shape[new_shape.size() - 1] = 1lu;
|
||||
}
|
||||
set_output_type(0, get_input_element_type(0), new_shape);
|
||||
}
|
@ -5,8 +5,14 @@
|
||||
#include "snippets/op/kernel.hpp"
|
||||
#include "snippets/generator.hpp"
|
||||
|
||||
using namespace std;
|
||||
using namespace ngraph;
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
snippets::op::Kernel::Kernel(const std::vector<std::pair<std::shared_ptr<snippets::Emitter>, snippets::RegInfo>>& nested) : Op(), region(nested) {
|
||||
Kernel::Kernel(std::vector<AllocatedEmitter> nested, std::shared_ptr<const ov::Model> m)
|
||||
: Op(), region(std::move(nested)), model(std::move(m)) {
|
||||
}
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -8,39 +8,54 @@
|
||||
|
||||
#include <ngraph/runtime/host_tensor.hpp>
|
||||
|
||||
using namespace std;
|
||||
using namespace ngraph;
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
snippets::op::Load::Load(const Output<Node>& x, const size_t count) : Op({x}), m_count(count) {
|
||||
Load::Load(const Output<Node>& x, const size_t count, const size_t offset) : MemoryAccess({x}, count, offset) {
|
||||
constructor_validate_and_infer_types();
|
||||
}
|
||||
|
||||
bool snippets::op::Load::visit_attributes(AttributeVisitor& visitor) {
|
||||
return true;
|
||||
}
|
||||
|
||||
std::shared_ptr<Node> snippets::op::Load::clone_with_new_inputs(const OutputVector& new_args) const {
|
||||
std::shared_ptr<Node> Load::clone_with_new_inputs(const OutputVector& new_args) const {
|
||||
INTERNAL_OP_SCOPE(Load);
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<Load>(new_args.at(0), m_count);
|
||||
return std::make_shared<Load>(new_args.at(0), m_count, m_offset);
|
||||
}
|
||||
|
||||
void snippets::op::Load::validate_and_infer_types() {
|
||||
set_output_type(0, get_input_element_type(0), get_input_partial_shape(0));
|
||||
|
||||
LoadReshape::LoadReshape(const Output<ov::Node>& x, const size_t count, const size_t offset, std::vector<size_t> order)
|
||||
: Load(x, count, offset), m_order(std::move(order)) {
|
||||
const auto& in_shape = x.get_partial_shape();
|
||||
NGRAPH_CHECK(in_shape.is_static(), "LoadReshape supports only static input shapes");
|
||||
const auto in_shape_size = in_shape.size();
|
||||
NGRAPH_CHECK(m_order.size() == in_shape_size, "LoadReshape got new_order of invalid size");
|
||||
NGRAPH_CHECK(*std::max_element(m_order.begin(), m_order.end()) == in_shape_size - 1 &&
|
||||
*std::min_element(m_order.begin(), m_order.end()) == 0, "LoadReshape detected invalid values in new_order");
|
||||
const std::set<size_t> unique_dims(order.begin(), order.end());
|
||||
NGRAPH_CHECK(unique_dims.size() == order.size(), "LoadReshape order must not contain repeated elements");
|
||||
constructor_validate_and_infer_types();
|
||||
}
|
||||
|
||||
bool snippets::op::Load::evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const {
|
||||
INTERNAL_OP_SCOPE(Load);
|
||||
NGRAPH_CHECK(input_values.size() == this->inputs().size(), "wrong input config");
|
||||
NGRAPH_CHECK(output_values.size() == this->outputs().size(), "wrong output config");
|
||||
NGRAPH_CHECK(input_values.size() == output_values.size() && input_values.size() == 1, "must be 1->1 operation");
|
||||
NGRAPH_CHECK(this->output(0).get_shape() == output_values[0]->get_shape(), "output vector must have the same shape as output port");
|
||||
NGRAPH_CHECK(this->input(0).get_shape() == input_values[0]->get_shape(), "input and output must have same shape");
|
||||
NGRAPH_CHECK(this->input(0).get_shape() == input_values[0]->get_shape(), "input and output must have same shape");
|
||||
|
||||
std::copy(input_values[0]->get_data_ptr<uint8_t>(),
|
||||
input_values[0]->get_data_ptr<uint8_t>() + shape_size(get_output_shape(0))*output_values[0]->get_element_type().size(),
|
||||
output_values[0]->get_data_ptr<uint8_t>());
|
||||
void snippets::op::LoadReshape::validate_and_infer_types() {
|
||||
const auto& old_shape = get_input_partial_shape(0);
|
||||
ov::PartialShape new_shape;
|
||||
for (const auto idx : m_order)
|
||||
new_shape.push_back(old_shape[idx]);
|
||||
set_output_type(0, get_input_element_type(0), new_shape);
|
||||
}
|
||||
|
||||
bool snippets::op::LoadReshape::visit_attributes(AttributeVisitor& visitor) {
|
||||
Load::visit_attributes(visitor);
|
||||
visitor.on_attribute("order", m_order);
|
||||
return true;
|
||||
}
|
||||
|
||||
std::shared_ptr<Node> snippets::op::LoadReshape::clone_with_new_inputs(const OutputVector& new_args) const {
|
||||
INTERNAL_OP_SCOPE(LoadReshape);
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<LoadReshape>(new_args.at(0), m_count, m_offset, m_order);
|
||||
}
|
||||
|
||||
}// namespace op
|
||||
}// namespace snippets
|
||||
}// namespace ngraph
|
||||
|
182
src/common/snippets/src/op/loop.cpp
Normal file
182
src/common/snippets/src/op/loop.cpp
Normal file
@ -0,0 +1,182 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "snippets/op/loop.hpp"
|
||||
#include "snippets/generator.hpp"
|
||||
|
||||
using namespace std;
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
LoopBase::LoopBase(const std::vector<Output<Node>> &args, size_t work_amount, size_t increment)
|
||||
: Op(args), work_amount(work_amount), work_amount_increment(increment), evaluate_once(false) {
|
||||
}
|
||||
|
||||
bool LoopBase::visit_attributes(AttributeVisitor &visitor) {
|
||||
visitor.on_attribute("work_amount", work_amount);
|
||||
visitor.on_attribute("increment", work_amount_increment);
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t LoopBase::get_work_amount() const {
|
||||
return work_amount;
|
||||
}
|
||||
|
||||
bool LoopBase::get_evaluate_once() const {
|
||||
return evaluate_once;
|
||||
}
|
||||
|
||||
size_t LoopBase::get_increment() const {
|
||||
return work_amount_increment;
|
||||
}
|
||||
|
||||
LoopBegin::LoopBegin(const std::vector<Output<Node>> &args, size_t work_amount, size_t work_amount_increment)
|
||||
: LoopBase(args, work_amount, work_amount_increment),
|
||||
begin_address(nullptr), input_regs({}) {
|
||||
// We can only call a reduced validate_and_infer types from the constructor, since LoopEnd might not be attached
|
||||
// to the LoopBegin at this point (which is usually the case: create LoopBegin first => then attach LoopEnd to it)
|
||||
validate_and_infer_types_except_LoopEnd();
|
||||
}
|
||||
|
||||
LoopBegin::LoopBegin(const std::vector<Output<Node>> &args)
|
||||
: LoopBase(args, 0, 0), begin_address(nullptr), input_regs({}) {
|
||||
validate_and_infer_types_except_LoopEnd();
|
||||
}
|
||||
|
||||
std::shared_ptr<Node> LoopBegin::clone_with_new_inputs(const OutputVector& inputs) const {
|
||||
return std::shared_ptr<LoopBegin>(new LoopBegin(inputs, work_amount, work_amount_increment));
|
||||
}
|
||||
|
||||
|
||||
void LoopBegin::validate_and_infer_types_except_LoopEnd() {
|
||||
const size_t num_inputs = get_input_size();
|
||||
set_output_size(num_inputs + 1);
|
||||
// All outputs are by-passed from inputs, except for the last one - it connects LoopBegin and LoopEnd
|
||||
for (int i = 0; i < num_inputs; i++)
|
||||
get_output_descriptor(i).set_tensor_ptr(get_input_descriptor(i).get_output().get_tensor_ptr());
|
||||
set_output_type(num_inputs, element::f32, ov::PartialShape{ov::Shape{}});
|
||||
}
|
||||
|
||||
void LoopBegin::validate_and_infer_types() {
|
||||
validate_and_infer_types_except_LoopEnd();
|
||||
const auto& last_output_inputs = output(get_output_size() - 1).get_target_inputs();
|
||||
NODE_VALIDATION_CHECK(this, last_output_inputs.size() == 1, "LoopBegin must have exactly one input attached to the last output");
|
||||
const auto& loop_end = ov::as_type_ptr<LoopEnd>(last_output_inputs.begin()->get_node()->shared_from_this());
|
||||
NODE_VALIDATION_CHECK(this, loop_end != nullptr, "LoopBegin must have LoopEnd connected to its last output");
|
||||
work_amount = loop_end->get_work_amount();
|
||||
work_amount_increment = loop_end->get_increment();
|
||||
}
|
||||
|
||||
std::shared_ptr<LoopEnd> LoopBegin::get_loop_end() {
|
||||
const auto& last_output_inputs = output(get_output_size() - 1).get_target_inputs();
|
||||
if (last_output_inputs.size() != 1)
|
||||
throw std::invalid_argument("LoopBegin has more than one inputs attached to the last output");
|
||||
const auto& loop_end = ov::as_type_ptr<LoopEnd>(last_output_inputs.begin()->get_node()->shared_from_this());
|
||||
if (!loop_end)
|
||||
throw std::invalid_argument("LoopBegin last output is not connected to LoopEnd");
|
||||
return loop_end;
|
||||
}
|
||||
|
||||
LoopEnd::LoopEnd(const std::vector<Output<Node>> &args, size_t work_amount, size_t work_amount_increment,
|
||||
std::vector<bool> apply_increments, std::vector<int64_t> finalization_offsets)
|
||||
: LoopBase(args, work_amount, work_amount_increment), finalization_offsets(std::move(finalization_offsets)),
|
||||
has_outer_loop(true), loop_io_size(0) {
|
||||
ptr_increments.resize(apply_increments.size());
|
||||
std::transform(apply_increments.begin(), apply_increments.end(), ptr_increments.begin(),
|
||||
[work_amount_increment](bool apply) {
|
||||
return apply ? work_amount_increment : 0;
|
||||
});
|
||||
constructor_validate_and_infer_types();
|
||||
}
|
||||
|
||||
LoopEnd::LoopEnd(const std::vector<Output<Node>> &args, size_t work_amount, size_t work_amount_increment,
|
||||
std::vector<int64_t> ptr_increments, std::vector<int64_t> finalization_offsets)
|
||||
: LoopBase(args, work_amount, work_amount_increment), ptr_increments(std::move(ptr_increments)),
|
||||
finalization_offsets(std::move(finalization_offsets)), has_outer_loop(true), loop_io_size(0) {
|
||||
constructor_validate_and_infer_types();
|
||||
}
|
||||
|
||||
std::shared_ptr<Node> LoopEnd::clone_with_new_inputs(const OutputVector& inputs) const {
|
||||
return std::make_shared<LoopEnd>(inputs, work_amount, work_amount_increment, ptr_increments, finalization_offsets);
|
||||
}
|
||||
|
||||
std::shared_ptr<LoopBegin> LoopEnd::get_loop_begin() {
|
||||
const auto& loop_begin = ov::as_type_ptr<LoopBegin>(get_input_source_output(get_input_size() - 1).get_node_shared_ptr());
|
||||
if (!loop_begin)
|
||||
throw std::invalid_argument("LoopEnd last input is not connected to LoopBegin");
|
||||
return loop_begin;
|
||||
}
|
||||
|
||||
const std::vector<int64_t>& LoopEnd::get_finalization_offsets() const {
|
||||
return finalization_offsets;
|
||||
}
|
||||
|
||||
const std::vector<int64_t>& LoopEnd::get_ptr_increments()const {
|
||||
return ptr_increments;
|
||||
}
|
||||
|
||||
void LoopEnd::set_finalization_offsets(std::vector<int64_t> offsets) {
|
||||
if (offsets.size() != loop_io_size)
|
||||
throw std::invalid_argument("LoopEnd set_finalization_offsets is called with inconsistent offsets.size()");
|
||||
finalization_offsets = std::move(offsets);
|
||||
}
|
||||
|
||||
void LoopEnd::set_ptr_increments(std::vector<int64_t> new_ptr_increments) {
|
||||
if (new_ptr_increments.size() != loop_io_size)
|
||||
throw std::invalid_argument("LoopEnd set_ptr_increments is called with inconsistent new_ptr_increments.size()");
|
||||
ptr_increments = std::move(new_ptr_increments);
|
||||
}
|
||||
|
||||
void LoopEnd::update_ptr_increments(int64_t new_increment) {
|
||||
std::transform(ptr_increments.begin(), ptr_increments.end(), ptr_increments.begin(),
|
||||
[new_increment](int64_t old_increment){
|
||||
return old_increment != 0 ? new_increment : 0;
|
||||
});
|
||||
}
|
||||
|
||||
void LoopEnd::set_work_amount(size_t new_work_amount) {
|
||||
work_amount = new_work_amount;
|
||||
// Update LoopBegin to maintain consistency between the Loops
|
||||
get_loop_begin()->work_amount = new_work_amount;
|
||||
}
|
||||
|
||||
void LoopEnd::set_increment(size_t new_increment) {
|
||||
work_amount_increment = new_increment;
|
||||
// Update LoopBegin to maintain consistency between the Loops
|
||||
get_loop_begin()->work_amount_increment = new_increment;
|
||||
}
|
||||
|
||||
void LoopEnd::set_evaluate_once(bool once) {
|
||||
evaluate_once = once;
|
||||
// Update LoopBegin to maintain consistency between the Loops
|
||||
get_loop_begin()->evaluate_once = once;
|
||||
}
|
||||
|
||||
void LoopEnd::validate_and_infer_types() {
|
||||
const size_t num_inputs = get_input_size();
|
||||
const auto loop_begin = ov::as_type_ptr<LoopBegin>(input(get_input_size() - 1).get_source_output().get_node_shared_ptr());
|
||||
NODE_VALIDATION_CHECK(this, loop_begin != nullptr, "LoopEnd must have LoopBegin as the last argument");
|
||||
// Note: have to -2 because the LoopBegin->LoopEnd edge is counted twice
|
||||
loop_io_size = get_input_size() + loop_begin->get_output_size() - 2;
|
||||
NODE_VALIDATION_CHECK(this, ptr_increments.empty() || ptr_increments.size() == loop_io_size,
|
||||
"ptr_increments must be either empty or defined per every input & output of joined Loop. Expected size: ",
|
||||
loop_io_size, " got ", ptr_increments.size());
|
||||
NODE_VALIDATION_CHECK(this, finalization_offsets.empty() || finalization_offsets.size() == loop_io_size,
|
||||
"finalization_offsets must be either empty or defined per every input & output of joined Loop. Expected size: ",
|
||||
loop_io_size, " got ", finalization_offsets.size());
|
||||
if (ptr_increments.empty())
|
||||
ptr_increments.resize(loop_io_size, static_cast<int64_t>(work_amount_increment));
|
||||
if (finalization_offsets.empty())
|
||||
finalization_offsets.resize(loop_io_size, 0);
|
||||
set_output_size(num_inputs - 1);
|
||||
const auto& ins = inputs();
|
||||
// All outputs are by-passed from inputs, except for the last one - it connects LoopBegin and LoopEnd
|
||||
for (int i = 0; i < num_inputs - 1; i++)
|
||||
get_output_descriptor(i).set_tensor_ptr(get_input_descriptor(i).get_output().get_tensor_ptr());
|
||||
}
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
45
src/common/snippets/src/op/memory_access.cpp
Normal file
45
src/common/snippets/src/op/memory_access.cpp
Normal file
@ -0,0 +1,45 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <snippets/itt.hpp>
|
||||
|
||||
#include "snippets/op/memory_access.hpp"
|
||||
|
||||
#include <ngraph/runtime/host_tensor.hpp>
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
MemoryAccess::MemoryAccess(const Output<Node>& x, const size_t count, const size_t offset) : Op({x}), m_count(count), m_offset(offset) {}
|
||||
|
||||
bool MemoryAccess::visit_attributes(AttributeVisitor& visitor) {
|
||||
visitor.on_attribute("count", m_count);
|
||||
visitor.on_attribute("offset", m_offset);
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t MemoryAccess::get_count() const {
|
||||
return m_count;
|
||||
}
|
||||
|
||||
size_t MemoryAccess::get_offset() const {
|
||||
return m_offset;
|
||||
}
|
||||
|
||||
void MemoryAccess::set_count(const size_t count) {
|
||||
m_count = count;
|
||||
}
|
||||
|
||||
void MemoryAccess::set_offset(const size_t offset) {
|
||||
m_offset = offset;
|
||||
}
|
||||
|
||||
void MemoryAccess::validate_and_infer_types() {
|
||||
set_output_type(0, get_input_element_type(0), get_input_partial_shape(0));
|
||||
}
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -1,15 +0,0 @@
|
||||
// Copyright (C) 2018-2023 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "snippets/op/powerstatic.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
BWDCMP_RTTI_DEFINITION(PowerStatic);
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -6,8 +6,6 @@
|
||||
|
||||
using namespace ngraph;
|
||||
|
||||
BWDCMP_RTTI_DEFINITION(snippets::op::Scalar);
|
||||
|
||||
std::shared_ptr<Node> snippets::op::Scalar::clone_with_new_inputs(const OutputVector& new_args) const {
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<Scalar>(*this);
|
||||
@ -22,3 +20,13 @@ void snippets::op::Scalar::validate_and_infer_types() {
|
||||
"Scalar supports only one-element constants, got ", out_pshape.get_shape(),
|
||||
" shape");
|
||||
}
|
||||
|
||||
bool snippets::op::Scalar::visit_attributes(AttributeVisitor& visitor) {
|
||||
auto shape = get_output_shape(0);
|
||||
auto type = get_output_element_type(0);
|
||||
auto value = cast_vector<float>();
|
||||
visitor.on_attribute("element_type", type);
|
||||
visitor.on_attribute("shape", shape);
|
||||
visitor.on_attribute("value", value);
|
||||
return true;
|
||||
}
|
||||
|
@ -8,39 +8,19 @@
|
||||
|
||||
#include <ngraph/runtime/host_tensor.hpp>
|
||||
|
||||
using namespace std;
|
||||
using namespace ngraph;
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
|
||||
snippets::op::Store::Store(const Output<Node>& x, const size_t count) : Op({x}), m_count(count) {
|
||||
snippets::op::Store::Store(const Output<Node>& x, const size_t count, const size_t offset) : MemoryAccess({x}, count, offset) {
|
||||
constructor_validate_and_infer_types();
|
||||
}
|
||||
|
||||
bool snippets::op::Store::visit_attributes(AttributeVisitor& visitor) {
|
||||
return true;
|
||||
}
|
||||
|
||||
std::shared_ptr<Node> snippets::op::Store::clone_with_new_inputs(const OutputVector& new_args) const {
|
||||
INTERNAL_OP_SCOPE(Store);
|
||||
INTERNAL_OP_SCOPE(Store_clone_with_new_inputs);
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<Store>(new_args.at(0), m_count);
|
||||
return std::make_shared<Store>(new_args.at(0), m_count, m_offset);
|
||||
}
|
||||
|
||||
void snippets::op::Store::validate_and_infer_types() {
|
||||
set_output_type(0, get_input_element_type(0), get_input_partial_shape(0));
|
||||
}
|
||||
|
||||
bool snippets::op::Store::evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const {
|
||||
INTERNAL_OP_SCOPE(Store);
|
||||
NGRAPH_CHECK(input_values.size() == this->inputs().size(), "wrong input config");
|
||||
NGRAPH_CHECK(output_values.size() == this->outputs().size(), "wrong output config");
|
||||
NGRAPH_CHECK(input_values.size() == output_values.size() && input_values.size() == 1, "must be 1->1 operation");
|
||||
NGRAPH_CHECK(this->output(0).get_shape() == output_values[0]->get_shape(), "output vector must have the same shape as output port");
|
||||
NGRAPH_CHECK(this->input(0).get_shape() == input_values[0]->get_shape(), "input and output must have same shape");
|
||||
NGRAPH_CHECK(this->input(0).get_shape() == input_values[0]->get_shape(), "input and output must have same shape");
|
||||
|
||||
std::copy(input_values[0]->get_data_ptr<uint8_t>(),
|
||||
input_values[0]->get_data_ptr<uint8_t>() + shape_size(get_output_shape(0))*output_values[0]->get_element_type().size(),
|
||||
output_values[0]->get_data_ptr<uint8_t>());
|
||||
|
||||
return true;
|
||||
}
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
||||
|
@ -9,13 +9,22 @@
|
||||
#include "snippets/op/convert_saturation.hpp"
|
||||
#include "snippets/pass/insert_load_store.hpp"
|
||||
#include "snippets/pass/insert_movebroadcast.hpp"
|
||||
#include "snippets/pass/broadcast_to_movebroadcast.hpp"
|
||||
#include "snippets/pass/load_movebroadcast_to_broadcastload.hpp"
|
||||
#include "snippets/pass/assign_registers.hpp"
|
||||
#include "snippets/pass/convert_constants.hpp"
|
||||
#include "snippets/pass/convert_power_to_powerstatic.hpp"
|
||||
#include "snippets/pass/vector_to_scalar.hpp"
|
||||
#include "snippets/pass/insert_loops.hpp"
|
||||
#include "snippets/pass/transpose_decomposition.hpp"
|
||||
#include "snippets/pass/transform_convert.hpp"
|
||||
#include "snippets/pass/align_element_type.hpp"
|
||||
#include "snippets/pass/matmul_to_brgemm.hpp"
|
||||
#include "snippets/pass/fuse_transpose_brgemm.hpp"
|
||||
#include "snippets/pass/softmax_decomposition.hpp"
|
||||
#include "snippets/pass/reset_buffer.hpp"
|
||||
#include "snippets/pass/insert_buffer.hpp"
|
||||
#include "snippets/pass/loop_fusion.hpp"
|
||||
#include "snippets/utils.hpp"
|
||||
|
||||
#include "transformations/common_optimizations/nop_elimination.hpp"
|
||||
@ -34,27 +43,43 @@ using namespace std;
|
||||
using namespace ngraph;
|
||||
using namespace ov::op::util;
|
||||
|
||||
BWDCMP_RTTI_DEFINITION(snippets::op::Subgraph);
|
||||
|
||||
void snippets::op::Subgraph::set_generator(std::shared_ptr<ngraph::snippets::Generator> generator) {
|
||||
m_generator = generator;
|
||||
}
|
||||
|
||||
void snippets::op::Subgraph::set_non_scalar_constants_count(const size_t count) {
|
||||
m_non_scalar_constants_count = count;
|
||||
void snippets::op::Subgraph::set_virtual_port_count(const size_t count) {
|
||||
m_virtual_port_count = count;
|
||||
}
|
||||
|
||||
void snippets::op::Subgraph::set_buffer_needed(const bool need) {
|
||||
m_buffer_needed = need;
|
||||
}
|
||||
|
||||
void snippets::op::Subgraph::init_config() {
|
||||
const auto ops = body_ptr()->get_ops();
|
||||
for (const auto& op : ops) {
|
||||
config.m_is_quantized = config.m_is_quantized ||
|
||||
ov::is_type<ov::op::v0::FakeQuantize>(op);
|
||||
config.m_has_type_relaxed_ops = config.m_has_type_relaxed_ops ||
|
||||
std::dynamic_pointer_cast<ngraph::op::TypeRelaxedBase>(op);
|
||||
config.m_is_needed_to_align_precision = config.m_is_needed_to_align_precision ||
|
||||
is_quantized() ||
|
||||
has_type_relaxed_ops() ||
|
||||
snippets::pass::AlignElementType::opNeedsAlignElementType(op, execution_element_type);
|
||||
config.m_has_domain_sensitive_ops = config.m_has_domain_sensitive_ops ||
|
||||
ov::is_type<ov::op::v1::Transpose>(op) ||
|
||||
ov::is_type<ov::op::v1::Softmax>(op) ||
|
||||
ov::is_type<ov::op::v8::Softmax>(op) ||
|
||||
ov::is_type<ov::op::v0::MatMul>(op);
|
||||
}
|
||||
// Domain sensitive ops are decomposed with explicit Loops. So, we should explicitly insert Loops in Subgraph if it contains these ops
|
||||
config.m_explicit_loop_insertion = config.m_has_domain_sensitive_ops;
|
||||
}
|
||||
|
||||
snippets::op::Subgraph::Subgraph(const OutputVector& args, std::shared_ptr<ov::Model> body)
|
||||
: SubGraphOp(args) {
|
||||
: SubGraphOp(args), m_generator(nullptr) {
|
||||
set_function(body);
|
||||
const auto ops = body_ptr()->get_ops();
|
||||
for (const auto& op : ops) {
|
||||
config.m_is_quantized = config.m_is_quantized || ov::is_type<ov::op::v0::FakeQuantize>(op);
|
||||
config.m_has_type_relaxed_ops = config.m_has_type_relaxed_ops || std::dynamic_pointer_cast<ngraph::op::TypeRelaxedBase>(op);
|
||||
config.m_is_needed_to_align_precision = config.m_is_needed_to_align_precision || is_quantized() || has_type_relaxed_ops() ||
|
||||
snippets::pass::AlignElementType::opNeedsAlignElementType(op, execution_element_type);
|
||||
}
|
||||
|
||||
init_config();
|
||||
constructor_validate_and_infer_types();
|
||||
for (size_t i = 0; i < body->get_parameters().size(); ++i)
|
||||
m_input_descriptions[0].push_back(std::make_shared<InvariantInputDescription>(i, i));
|
||||
@ -64,13 +89,43 @@ snippets::op::Subgraph::Subgraph(const OutputVector& args, std::shared_ptr<ov::M
|
||||
}
|
||||
|
||||
snippets::op::Subgraph::Subgraph(const NodeVector& args, std::shared_ptr<ov::Model> body)
|
||||
: Subgraph(as_output_vector(args), body) {}
|
||||
: Subgraph(as_output_vector(args), std::move(body)) {}
|
||||
|
||||
std::shared_ptr<Node> snippets::op::Subgraph::clone_with_new_inputs(const OutputVector& inputs) const {
|
||||
INTERNAL_OP_SCOPE(Subgraph);
|
||||
return make_shared<Subgraph>(inputs, ov::clone_model(body()));
|
||||
}
|
||||
|
||||
std::vector<PartialShape> snippets::op::Subgraph::reshape_body(const std::vector<PartialShape>& input_shapes) {
|
||||
auto& params = body_ptr()->get_parameters();
|
||||
OPENVINO_ASSERT(params.size() == input_shapes.size(), "Got invalid number of input shapes to reshape subgraph body");
|
||||
for (size_t i = 0; i < params.size(); ++i) {
|
||||
params[i]->set_partial_shape(input_shapes[i]);
|
||||
}
|
||||
body_ptr()->validate_nodes_and_infer_types();
|
||||
std::vector<PartialShape> output_shapes;
|
||||
for (const auto& res : body_ptr()->get_results()) {
|
||||
output_shapes.emplace_back(res->get_input_partial_shape(0));
|
||||
}
|
||||
return output_shapes;
|
||||
}
|
||||
|
||||
std::vector<Shape> snippets::op::Subgraph::reshape_body(const std::vector<Shape>& input_shapes) {
|
||||
auto& params = body_ptr()->get_parameters();
|
||||
OPENVINO_ASSERT(params.size() == input_shapes.size(), "Got invalid number of input shapes to reshape subgraph body");
|
||||
for (size_t i = 0; i < params.size(); ++i) {
|
||||
params[i]->set_partial_shape(input_shapes[i]);
|
||||
}
|
||||
body_ptr()->validate_nodes_and_infer_types();
|
||||
std::vector<Shape> output_shapes;
|
||||
for (const auto& res : body_ptr()->get_results()) {
|
||||
auto pshape = res->get_input_partial_shape(0);
|
||||
OPENVINO_ASSERT(pshape.is_static(), "Subgraph inferred dynamic output shape during reshape with static inputs");
|
||||
output_shapes.emplace_back(res->get_input_partial_shape(0).get_shape());
|
||||
}
|
||||
return output_shapes;
|
||||
}
|
||||
|
||||
void snippets::op::Subgraph::validate_and_infer_types() {
|
||||
INTERNAL_OP_SCOPE(Subgraph);
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::validate_and_infer_types")
|
||||
@ -111,8 +166,10 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptr<ov::Nod
|
||||
ngraph::OutputVector subgraph_inputs;
|
||||
|
||||
for (const auto& input : node->input_values()) {
|
||||
if ((utils::is_scalar_constant(input.get_node_shared_ptr())) ||
|
||||
(ov::is_type<ov::op::v0::FakeQuantize>(node) && ov::is_type<ov::op::v0::Constant>(input.get_node_shared_ptr()))) {
|
||||
if (ov::is_type<ngraph::opset1::Constant>(input.get_node_shared_ptr()) &&
|
||||
(ngraph::shape_size(input.get_shape()) == 1 ||
|
||||
ov::is_type<ov::op::v0::FakeQuantize>(node) ||
|
||||
constant_input_should_be_inside_body(node))) {
|
||||
body_inputs.push_back(input);
|
||||
} else {
|
||||
auto parameter = std::make_shared<ngraph::opset1::Parameter>(input.get_element_type(), input.get_partial_shape());
|
||||
@ -142,9 +199,17 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptr<ov::Nod
|
||||
auto body = create_body(node->get_friendly_name(), body_results, body_parameters);
|
||||
auto subgraph = build_subgraph(node, subgraph_inputs, body);
|
||||
|
||||
bool need_buffer = false;
|
||||
size_t hidden_data_count = 0lu;
|
||||
if (auto fq_node = ov::as_type_ptr<ov::op::v0::FakeQuantize>(node)) {
|
||||
subgraph->set_non_scalar_constants_count(utils::get_non_scalar_constant_count_for_fq(fq_node));
|
||||
hidden_data_count += utils::get_non_scalar_constant_count_for_fq(fq_node);
|
||||
// Ops that requires Buffer
|
||||
} else if (ov::is_type<ov::op::v1::Softmax>(node) ||
|
||||
ov::is_type<ov::op::v8::Softmax>(node)) {
|
||||
need_buffer |= true;
|
||||
}
|
||||
subgraph->set_virtual_port_count(hidden_data_count);
|
||||
subgraph->set_buffer_needed(need_buffer);
|
||||
|
||||
for (size_t i = 0; i < body->get_parameters().size(); i++) {
|
||||
body->get_parameters()[i]->set_friendly_name(body_parameters[i]->get_friendly_name());
|
||||
@ -170,6 +235,13 @@ void snippets::op::Subgraph::fill_empty_output_names(const Output<Node>& target_
|
||||
NGRAPH_SUPPRESS_DEPRECATED_END
|
||||
}
|
||||
|
||||
auto snippets::op::Subgraph::constant_input_should_be_inside_body(const std::shared_ptr<ov::Node>& node) -> bool {
|
||||
return ov::is_type<ov::op::v1::Transpose>(node) ||
|
||||
ov::is_type<ov::op::v1::Broadcast>(node) ||
|
||||
ov::is_type<ov::op::v3::Broadcast>(node) ||
|
||||
ov::is_type<ov::op::v1::Reshape>(node);
|
||||
}
|
||||
|
||||
///
|
||||
/// \brief Canonization transforms original subgraph and to canonical form suitable for code generation. In particular,
|
||||
/// it handles supported layout conversions, broadcasts inputs and outputs to a single rank and layout. Canonicalization
|
||||
@ -178,7 +250,8 @@ void snippets::op::Subgraph::fill_empty_output_names(const Output<Node>& target_
|
||||
/// * None: all inputs have the same layout
|
||||
/// * Planar + blocked: some inputs have blocked, and some have planar layouts, e.g. <N, C, H, W, c> + <N, C, H, W>
|
||||
/// Also there is precision aligning inside body of subgraph during canonicalization
|
||||
Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes) {
|
||||
ov::PartialShape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShapes,
|
||||
const BlockedShapeVector& inputShapes) {
|
||||
INTERNAL_OP_SCOPE(Subgraph);
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::canonicalize")
|
||||
NODE_VALIDATION_CHECK(this, inputShapes.size() == body_ptr()->get_parameters().size(),
|
||||
@ -193,30 +266,29 @@ Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShape
|
||||
return std::get<0>(lhs).size() < std::get<0>(rhs).size();
|
||||
});
|
||||
};
|
||||
Shape baseShape;
|
||||
PartialShape baseShape;
|
||||
AxisVector baseOrder;
|
||||
std::tie(baseShape, baseOrder, std::ignore) = getMaxRankBlockedShape(inputShapes);
|
||||
const auto baseRank = baseShape.size();
|
||||
const bool baseIsBlocked = baseOrder.size() != std::set<size_t>(baseOrder.begin(), baseOrder.end()).size();
|
||||
for (size_t i = 0; i < inputShapes.size(); i++) {
|
||||
const auto &blockedShape = inputShapes[i];
|
||||
Shape inShape;
|
||||
PartialShape inShape;
|
||||
AxisVector inOrder;
|
||||
element::Type inType;
|
||||
std::tie(inShape, inOrder, inType) = blockedShape;
|
||||
const auto inRank = inShape.size();
|
||||
NODE_VALIDATION_CHECK(this, inRank <= baseRank, "Input rank can't be larger than output rank in snippets.");
|
||||
if (inRank < baseRank) {
|
||||
Shape newShape(baseRank, 1);
|
||||
PartialShape newShape(ov::Shape(baseRank, 1));
|
||||
// todo: more complicated logics is needed if we want to merge smth else than blocked and planar
|
||||
// could be done by PartialShape::broadcast_merge_into, but this way is faster
|
||||
size_t startOffset = baseRank - inRank;
|
||||
if (baseIsBlocked) {
|
||||
const bool inIsNotBlocked = inOrder.size() == std::set<size_t>(inOrder.begin(), inOrder.end()).size();
|
||||
NODE_VALIDATION_CHECK(this, inIsNotBlocked, "Snippets don't support conversion between blocked layouts of different ranks");
|
||||
startOffset--;
|
||||
inShape.insert(inShape.end(), ov::Dimension(1));
|
||||
}
|
||||
std::copy(inShape.begin(), inShape.end(), &newShape[startOffset]);
|
||||
NODE_VALIDATION_CHECK(this, PartialShape::broadcast_merge_into(newShape, inShape, ov::op::AutoBroadcastType::NUMPY),
|
||||
"Failed to broadcast_merge inputs in snippets canonicalization");
|
||||
inShape = std::move(newShape);
|
||||
} else {
|
||||
// todo: 4d blocked + 5d planar layouts are not supported: <N, C, H, W, c> + <N, C, D, H, W>
|
||||
@ -225,55 +297,66 @@ Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShape
|
||||
"Snippets canonicalization got input shapes of equal ranks but different layouts, which is not supported");
|
||||
}
|
||||
ov::PartialShape tmpPShape(baseShape);
|
||||
NODE_VALIDATION_CHECK(this,
|
||||
PartialShape::broadcast_merge_into(tmpPShape, inShape, ::ngraph::op::AutoBroadcastType::NUMPY),
|
||||
"Failed to create broadcastable shapes in snippets canonicalization");
|
||||
const auto paramShape = body_ptr()->get_parameters()[i]->get_shape();
|
||||
// todo: we need to generalize canonicalization for domain-sensitive ops. E.g. MatMul inputs can't be broadcasted one to another
|
||||
if (!config.m_has_domain_sensitive_ops)
|
||||
NODE_VALIDATION_CHECK(this,
|
||||
PartialShape::broadcast_merge_into(tmpPShape, inShape, ::ngraph::op::AutoBroadcastType::NUMPY),
|
||||
"Failed to create broadcastable shapes in snippets canonicalization");
|
||||
const auto paramShape = body_ptr()->get_parameters()[i]->get_partial_shape();
|
||||
const auto paramType = body_ptr()->get_parameters()[i]->get_element_type();
|
||||
if (paramShape.size() != inShape.size() || !equal(paramShape.begin(), paramShape.end(), inShape.begin()))
|
||||
body_ptr()->replace_parameter(i, std::make_shared<opset1::Parameter>(paramType, inShape));
|
||||
}
|
||||
|
||||
body_ptr()->validate_nodes_and_infer_types();
|
||||
auto skipStartEndOnes = [](const Shape& shape) {
|
||||
auto skipStartEndOnes = [](const PartialShape& shape) {
|
||||
auto begin = shape.begin();
|
||||
auto end = shape.end();
|
||||
while (begin != end && *begin == 1)
|
||||
begin++;
|
||||
while (begin != end && *(end-1) == 1)
|
||||
end--;
|
||||
Shape trimmedShape(end - begin, 1);
|
||||
|
||||
PartialShape trimmedShape(std::vector<ov::Dimension> (end - begin, 1));
|
||||
std::copy(begin, end, trimmedShape.begin());
|
||||
return trimmedShape;
|
||||
};
|
||||
|
||||
// Check that output shapes are broadcastable => can be scheduled
|
||||
const auto& body_results = body_ptr()->get_results();
|
||||
PartialShape outPShape = body_results[0]->get_shape();
|
||||
for (size_t i = 0; i < body_results.size(); i++) {
|
||||
auto shape_i = body_results[i]->get_shape();
|
||||
auto outputShape_i = std::get<0>(outputShapes[i]);
|
||||
// Check that the produced output shape corresponds to the passed shape
|
||||
// Some produced shapes may have been changed to be broadcastable (e.g. blocked + planar outputs),
|
||||
// so we need to remove leading and trailing "1" before the comparison
|
||||
PartialShape pShape_i(skipStartEndOnes(shape_i));
|
||||
bool compatibleWithPassedShape = PartialShape::broadcast_merge_into(pShape_i, skipStartEndOnes(outputShape_i),
|
||||
::ngraph::op::AutoBroadcastType::NUMPY);
|
||||
NODE_VALIDATION_CHECK(this, ov::shape_size(shape_i) == ov::shape_size(outputShape_i) &&
|
||||
compatibleWithPassedShape, "Inferred and passed results shapes are incompatible for snippet ",
|
||||
get_friendly_name(), " : ", shape_i, " vs ", outputShape_i, ".");
|
||||
// Check that output shapes are broadcastable to each other => can be scheduled
|
||||
bool compatibleWithOtherOutputs = PartialShape::broadcast_merge_into(outPShape, shape_i,
|
||||
::ngraph::op::AutoBroadcastType::NUMPY);
|
||||
NODE_VALIDATION_CHECK(this, compatibleWithOtherOutputs, "Snippets output shapes must be numpy broadcastable");
|
||||
PartialShape outPShape = body_results[0]->get_input_partial_shape(0);
|
||||
// todo: we need a slightly more general approach for backward ROI propagation
|
||||
const auto& result_parent = body_results[0]->get_input_node_shared_ptr(0);
|
||||
if (body_results.size() == 1 &&
|
||||
ov::is_type<opset1::Transpose>(result_parent) &&
|
||||
ov::is_type<opset1::MatMul>(result_parent->get_input_node_shared_ptr(0))) {
|
||||
outPShape = result_parent->get_input_partial_shape(0);
|
||||
} else {
|
||||
for (size_t i = 0; i < body_results.size(); i++) {
|
||||
auto shape_i = body_results[i]->get_input_partial_shape(0);
|
||||
auto outputShape_i = std::get<0>(outputShapes[i]);
|
||||
// Check that the produced output shape corresponds to the passed shape
|
||||
// Some produced shapes may have been changed to be broadcastable (e.g. blocked + planar outputs),
|
||||
// so we need to remove leading and trailing "1" before the comparison
|
||||
PartialShape pShape_i(skipStartEndOnes(shape_i));
|
||||
bool compatibleWithPassedShape = PartialShape::broadcast_merge_into(pShape_i,
|
||||
skipStartEndOnes(outputShape_i),
|
||||
::ngraph::op::AutoBroadcastType::NUMPY);
|
||||
NODE_VALIDATION_CHECK(this, compatibleWithPassedShape,
|
||||
"Inferred and passed results shapes are incompatible for snippet ");
|
||||
// Check that output shapes are broadcastable to each other => can be scheduled
|
||||
bool compatibleWithOtherOutputs = PartialShape::broadcast_merge_into(outPShape, shape_i,
|
||||
::ngraph::op::AutoBroadcastType::NUMPY);
|
||||
NODE_VALIDATION_CHECK(this, compatibleWithOtherOutputs,
|
||||
"Snippets output shapes must be numpy broadcastable");
|
||||
}
|
||||
}
|
||||
|
||||
// We should insert Converts after Parameters and Constant and before Results
|
||||
// to align precision inside Subgraph body that is supported by Plugin
|
||||
align_element_types(outputShapes, inputShapes);
|
||||
|
||||
exec_domain = outPShape.get_shape();
|
||||
return exec_domain;
|
||||
master_shape = outPShape;
|
||||
return master_shape;
|
||||
}
|
||||
|
||||
void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outputShapes,
|
||||
@ -303,55 +386,209 @@ void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outpu
|
||||
// - Insert Convert before operations that doesn't support original element type for execution
|
||||
// - Insert reverse Convert before operations that support original element type
|
||||
// but have inputs that doesn't support it (because before them will be inserted Convert with exec_type - first point)
|
||||
// Then we should use ConstantFolding pass to convert element type of Scalars before inference.
|
||||
// - Then we should use ConstantFolding pass to convert element type of Scalars before inference.
|
||||
// - Eliminate redundant Converts which can be inserted in AlignElementType() pass
|
||||
ngraph::pass::Manager manager;
|
||||
if (config.m_is_needed_to_align_precision) {
|
||||
manager.register_pass<snippets::pass::AlignElementType>(execution_element_type);
|
||||
manager.register_pass<ngraph::pass::ConstantFolding>();
|
||||
// TODO [100041] : In some cases AlignElementType pass can insert extra Convert because
|
||||
// the pass doesn't know real precisions in real time.
|
||||
// We call EliminateConverts pass to remove them
|
||||
manager.register_pass<ngraph::pass::EliminateConvert>();
|
||||
}
|
||||
manager.run_passes(body_ptr());
|
||||
}
|
||||
|
||||
void snippets::op::Subgraph::initialize_buffer_scratchpad_size() {
|
||||
auto is_transpose_loop = [](const ov::Output<ov::Node>& source_output) -> bool {
|
||||
const auto parent = source_output.get_node_shared_ptr();
|
||||
// Transpose op is decomposed into LoopBegin->LoadReshape->Store->LoopEnd subgraph. LoadReshape op can be only
|
||||
// in Transpose decomposition. So it's enough to verify that this Loop is Transpose pattern.
|
||||
// We cannot check for non-equality of input and output shape of Transpose Loop because Transpose may have the same
|
||||
// shapes on input and output.
|
||||
auto loop_end = ov::as_type_ptr<op::LoopEnd>(parent);
|
||||
if (!loop_end)
|
||||
return false;
|
||||
size_t idx = source_output.get_index();
|
||||
while (ov::is_type<op::LoopEnd>(loop_end->get_input_node_shared_ptr(idx))) {
|
||||
auto consumer = loop_end->input_value(idx);
|
||||
idx = consumer.get_index();
|
||||
loop_end = ov::as_type_ptr<op::LoopEnd>(consumer.get_node_shared_ptr());
|
||||
}
|
||||
|
||||
const auto loop_begin = loop_end->get_loop_begin();
|
||||
// At the moment Transpose Loops cannot be fused with other Loops, so check for one input and one output is enough
|
||||
if (loop_begin->get_input_size() != 1 || loop_end->get_output_size() != 1 || loop_begin->get_output_target_inputs(0).size() != 1)
|
||||
return false;
|
||||
const auto consumer = loop_begin->get_output_target_inputs(0).begin()->get_node();
|
||||
return ov::is_type<op::LoadReshape>(consumer);
|
||||
};
|
||||
auto propagate_offset = [](const std::shared_ptr<ngraph::snippets::op::Buffer>& buffer, const size_t offset) {
|
||||
// If Buffer has offset We set this offset in the next Load and Store ops
|
||||
// to correctly read and write data because all buffers have the one register
|
||||
// Also if user sets offset to a Buffer It means that the Buffer has the corresponding Load and Store ops
|
||||
|
||||
// Propagate to up: in Store. Buffer can have only one Store
|
||||
{
|
||||
auto parent = buffer->get_input_node_shared_ptr(0);
|
||||
auto idx = buffer->input(0).get_source_output().get_index();
|
||||
// There may be graph with several LoopBegin and LoopEnd between Store/Brgemm and Buffer,
|
||||
// so we should iterate through LoopBase
|
||||
while (ov::is_type<snippets::op::LoopBase>(parent)) {
|
||||
const auto source_output = parent->input_value(idx);
|
||||
parent = source_output.get_node_shared_ptr();
|
||||
idx = source_output.get_index();
|
||||
}
|
||||
if (auto store = ov::as_type_ptr<snippets::op::Store>(parent)) {
|
||||
store->set_offset(offset);
|
||||
} else if (const auto brgemm = ov::as_type_ptr<snippets::op::Brgemm>(parent)) {
|
||||
// Brgemm encapsulates work with loading and storing of data
|
||||
brgemm->set_offset_c(offset);
|
||||
} else {
|
||||
throw ngraph_error("Buffer::set_offset() was called when Buffer didn't have the corresponding Store op for offset propagation");
|
||||
}
|
||||
}
|
||||
|
||||
// Propagate to down: in Load. Buffer can have several Load and Loops after himself. We should go through all target inputs
|
||||
{
|
||||
std::function<void(const Input<Node>&)> propagate_down;
|
||||
propagate_down = [&](const Input<Node>& target_input) {
|
||||
const auto child = target_input.get_node()->shared_from_this();
|
||||
// There may be graph with several LoopBegin and LoopEnd between Load/Brgemm and Buffer,
|
||||
// so we should iterate through LoopBase
|
||||
// Example: Softmax decomposition with ReduceMax
|
||||
if (ov::is_type<snippets::op::LoopBase>(child)) {
|
||||
const auto index = target_input.get_index();
|
||||
for (const auto loop_target_output : child->output(index).get_target_inputs()) {
|
||||
propagate_down(loop_target_output);
|
||||
}
|
||||
} else if (const auto load = ov::as_type_ptr<snippets::op::Load>(child)) {
|
||||
load->set_offset(offset);
|
||||
} else if (const auto brgemm = ov::as_type_ptr<snippets::op::Brgemm>(child)) {
|
||||
// Brgemm encapsulates work with loading and storing of data
|
||||
if (target_input.get_index() == 0) {
|
||||
brgemm->set_offset_a(offset);
|
||||
} else if (target_input.get_index() == 1) {
|
||||
brgemm->set_offset_b(offset);
|
||||
}
|
||||
} else {
|
||||
throw ngraph_error("Buffer::set_offset() was called when Buffer didn't have the corresponding Load op for offset propagation");
|
||||
}
|
||||
};
|
||||
|
||||
for (const auto target_output : buffer->output(0).get_target_inputs()) {
|
||||
propagate_down(target_output);
|
||||
}
|
||||
}
|
||||
};
|
||||
m_buffer_scratchpad = 0;
|
||||
size_t offset = 0;
|
||||
const auto ops = body_ptr()->get_ordered_ops();
|
||||
for (const auto& op : ops) {
|
||||
if (const auto buffer = ov::as_type_ptr<ngraph::snippets::op::Buffer>(op)) {
|
||||
const auto buffer_size = buffer->get_byte_size();
|
||||
// We need to allocate memory for first buffer at least
|
||||
if (m_buffer_scratchpad == 0) {
|
||||
m_buffer_scratchpad += buffer_size;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Transpose and MatMul ops should have different memories on inputs and outputs to avoid data corruption,
|
||||
// so after them, we should allocate new memory. Other operations (Eltwises, Convert) can be executed inplace.
|
||||
const auto parent = buffer->get_input_node_shared_ptr(0);
|
||||
if (ov::is_type<op::Brgemm>(parent) || is_transpose_loop(parent)) {
|
||||
offset = m_buffer_scratchpad;
|
||||
propagate_offset(buffer, offset);
|
||||
m_buffer_scratchpad += buffer_size;
|
||||
continue;
|
||||
}
|
||||
|
||||
// If Buffer op requires memory size more that has been already allocated,
|
||||
// we increase current memory size to the needed size
|
||||
// For example, it's possible when we have a sequence of Eltwise ops with broadcasting
|
||||
const auto current_allocated_memory_size = m_buffer_scratchpad - offset;
|
||||
if (buffer_size > current_allocated_memory_size) {
|
||||
m_buffer_scratchpad += (buffer_size - current_allocated_memory_size);
|
||||
// Note: we don't update offset because we just add memory to needed size
|
||||
}
|
||||
|
||||
propagate_offset(buffer, offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void snippets::op::Subgraph::convert_to_snippet_dialect() {
|
||||
INTERNAL_OP_SCOPE(Subgraph);
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::convert_to_snippet_dialect")
|
||||
auto skip_matching_domain = [](const std::shared_ptr<const ov::Node>& n) -> bool {
|
||||
return n->get_input_shape(0).back() != 1;
|
||||
const auto& pshape = n->get_input_partial_shape(0);
|
||||
const auto& last_dim = pshape[pshape.size() - 1];
|
||||
return last_dim.is_dynamic() || last_dim.get_length() != 1;
|
||||
};
|
||||
|
||||
// At the moment we support only full vector Load/Store and scalar Load/Store so that count is equal to lanes.
|
||||
// Then we are going to support variadic Load/Store with different element count
|
||||
const size_t count = m_generator->get_target_machine()->get_lanes();
|
||||
const auto & params = body_ptr()->get_parameters();
|
||||
|
||||
bool inputs_has_dynamic_last_dims = std::any_of(params.begin(), params.end(),
|
||||
[](const shared_ptr<ngraph::op::Parameter>& p){
|
||||
return p->get_partial_shape().rbegin()->is_dynamic();
|
||||
});
|
||||
const auto allocationRank = static_cast<int32_t>(tileRank);
|
||||
ngraph::pass::Manager manager;
|
||||
if (config.m_has_domain_sensitive_ops) {
|
||||
manager.register_pass<snippets::pass::MatMulToBrgemm>();
|
||||
manager.register_pass<snippets::pass::FuseTransposeBrgemm>();
|
||||
manager.register_pass<snippets::pass::InsertBuffer>(allocationRank);
|
||||
manager.register_pass<snippets::pass::SoftmaxDecomposition>(count, allocationRank);
|
||||
manager.register_pass<snippets::pass::TransposeDecomposition>();
|
||||
}
|
||||
manager.register_pass<snippets::pass::BroadcastToMoveBroadcast>();
|
||||
manager.register_pass<snippets::pass::ConvertConstantsToScalars>();
|
||||
manager.register_pass<snippets::pass::ConvertPowerToPowerStatic>();
|
||||
manager.register_pass<snippets::pass::InsertLoad>(count);
|
||||
manager.register_pass<snippets::pass::InsertStore>(count);
|
||||
manager.register_pass<snippets::pass::InsertMoveBroadcast>();
|
||||
manager.register_pass<snippets::pass::LoadMoveBroadcastToBroadcastLoad>();
|
||||
// Note that, BrodacastMove is typically inserted right after the Load. Such cases are typical for
|
||||
// simple subgraphs where one of the ngraph::op's inputs is broadcasted to match the larger one. However, BroadcastMove
|
||||
// could also be inserted after the ngraph::op, if the op input don't need broadcasting, but the the output does
|
||||
// (for example, to match the larger output of a child node). In such cases, Loads (and Stores) should be replaced
|
||||
// with ScalarLoads (ScalarStores) to avoid invalid read in vector Tile. Graph example:
|
||||
// Parameter_0 Parameter_1 Parameter_2
|
||||
// [1,2,5,16] [1,2,5,1] [1,2,5,1]
|
||||
// Load BroadcastLoad Load* Scalar
|
||||
// Add Subtract
|
||||
// \___________ ___________BroadcastMove
|
||||
// \ /
|
||||
// Multiply
|
||||
// Store
|
||||
// Result
|
||||
// Note: Load* should be replaced with ScalarLoad in this example to avoid invalid read in vector Tile.
|
||||
if (!exec_domain.empty() && exec_domain.back() != 1) {
|
||||
manager.register_pass<snippets::pass::SetScalarCountForLoad>();
|
||||
manager.register_pass<snippets::pass::SetScalarCountForStore>();
|
||||
manager.get_pass_config()->
|
||||
set_callback<ngraph::snippets::pass::SetScalarCountForLoad>(skip_matching_domain);
|
||||
manager.get_pass_config()->
|
||||
set_callback<ngraph::snippets::pass::SetScalarCountForStore>(skip_matching_domain);
|
||||
// todo: presently dynamic pipeline is activated even if the last two dimension are static
|
||||
// In general, we can use static kernels in this case, but several parameters (src and dst memory pointers for example)
|
||||
// should be passed as run-time args, so it's a mixed mode: kernel is shape-aware, but some additional runtime args are required
|
||||
// Presently Broadcasting is organized in the following way:
|
||||
// * ALL last dims are static => broadcasting is handled via MoveBroadcast and pointer arithmetics (even for dynamic upper dims)
|
||||
if (!inputs_has_dynamic_last_dims) {
|
||||
manager.register_pass<snippets::pass::InsertMoveBroadcast>();
|
||||
manager.register_pass<snippets::pass::LoadMoveBroadcastToBroadcastLoad>();
|
||||
// Note that, BrodacastMove is typically inserted right after the Load. Such cases are typical for
|
||||
// simple subgraphs where one of the ngraph::op's inputs is broadcasted to match the larger one. However, BroadcastMove
|
||||
// could also be inserted after the ngraph::op, if the op input don't need broadcasting, but the output does
|
||||
// (for example, to match the larger output of a child node). In such cases, Loads (and Stores) should be replaced
|
||||
// with ScalarLoads (ScalarStores) to avoid invalid read in vector Loop. Graph example:
|
||||
// Parameter_0 Parameter_1 Parameter_2
|
||||
// [1,2,5,16] [1,2,5,1] [1,2,5,1]
|
||||
// Load BroadcastLoad Load* Scalar
|
||||
// Add Subtract
|
||||
// \___________ ___________BroadcastMove
|
||||
// \ /
|
||||
// Multiply
|
||||
// Store
|
||||
// Result
|
||||
// Note: Load* should be replaced with ScalarLoad in this example to avoid invalid read in vector Loop.
|
||||
if (master_shape.size() != 0 && master_shape[master_shape.size() - 1] != 1) {
|
||||
manager.register_pass<snippets::pass::SetScalarCountForLoad>();
|
||||
manager.register_pass<snippets::pass::SetScalarCountForStore>();
|
||||
manager.get_pass_config()->
|
||||
set_callback<ngraph::snippets::pass::SetScalarCountForLoad>(skip_matching_domain);
|
||||
manager.get_pass_config()->
|
||||
set_callback<ngraph::snippets::pass::SetScalarCountForStore>(skip_matching_domain);
|
||||
}
|
||||
// Note that InsertLoops requires validate_and_infer_types afterwards, so add it manually if
|
||||
// automatic validation will be disabled in the pass manager
|
||||
manager.register_pass<snippets::pass::InsertLoops>(master_shape, tileRank,
|
||||
m_generator->get_target_machine()->get_lanes(), !config.m_explicit_loop_insertion);
|
||||
if (config.m_has_domain_sensitive_ops) {
|
||||
manager.register_pass<snippets::pass::LoopFusion>();
|
||||
manager.register_pass<snippets::pass::ResetBufferState>();
|
||||
}
|
||||
}
|
||||
manager.run_passes(body_ptr());
|
||||
}
|
||||
@ -380,29 +617,29 @@ snippets::Schedule snippets::op::Subgraph::generate(ngraph::pass::Manager& opt,
|
||||
INTERNAL_OP_SCOPE(Subgraph);
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::generate")
|
||||
NGRAPH_CHECK(m_generator != nullptr, "generate is called while generator is not set");
|
||||
|
||||
convert_to_snippet_dialect();
|
||||
opt.run_passes(body_ptr());
|
||||
|
||||
// generation flow
|
||||
// After all passes, when all optimizations are completed and all MemoryAccess ops are inserted,
|
||||
// we can calculate common buffer scratchpad size and propagate offset from Buffer to the corresponding MemoryAccess ops
|
||||
if (config.m_has_domain_sensitive_ops)
|
||||
initialize_buffer_scratchpad_size();
|
||||
|
||||
snippets::pass::AssignRegisters().run_on_model(body_ptr());
|
||||
|
||||
// schedule generation should go here and be target agnostic
|
||||
const auto ops = body_ptr()->get_ops();
|
||||
ngraph::snippets::Generator::GeneratorConfig generatorConfig;
|
||||
generatorConfig.m_save_lowered_code = config.m_has_domain_sensitive_ops;
|
||||
generatorConfig.m_need_fill_tail_register = config.m_has_domain_sensitive_ops;
|
||||
generatorConfig.m_optimize_single_evaluation = std::none_of(ops.begin(), ops.end(), [](const std::shared_ptr<ov::Node>& op) {
|
||||
return ov::is_type<ngraph::snippets::op::Buffer>(op);
|
||||
});
|
||||
|
||||
// actual code emission
|
||||
ngraph::snippets::code ptr = m_generator->generate(body_ptr(), compile_params);
|
||||
ngraph::snippets::code ptr = m_generator->generate(body_ptr(), generatorConfig, compile_params);
|
||||
|
||||
// check that body doesn't have constants for scheduling
|
||||
std::vector<std::shared_ptr<opset1::Constant>> constants;
|
||||
for (auto op : body_ptr()->get_ordered_ops()) {
|
||||
if (auto constant = ov::as_type_ptr<opset1::Constant>(op)) {
|
||||
if (ngraph::shape_size(constant->get_shape()) != 1 && constant->get_shape() != Shape()) {
|
||||
constants.push_back(constant);
|
||||
}
|
||||
}
|
||||
}
|
||||
NGRAPH_CHECK(!constants.size(), "External constants detected. Snippet is illigal for scheduling");
|
||||
|
||||
return {exec_domain, false /*canBeLinearized*/, ptr};
|
||||
return {master_shape, false /*canBeLinearized*/, ptr};
|
||||
}
|
||||
|
||||
void snippets::op::Subgraph::print() const {
|
||||
|
@ -1,15 +0,0 @@
|
||||
// Copyright (C) 2018-2023 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "snippets/op/tile.hpp"
|
||||
#include "snippets/generator.hpp"
|
||||
|
||||
using namespace std;
|
||||
using namespace ngraph;
|
||||
|
||||
snippets::op::Tile::Tile(const std::vector<AllocatedEmitter>& region, size_t increment,
|
||||
size_t num_inputs, size_t num_outputs,
|
||||
const std::vector<size_t>& io_dims, const std::vector<size_t>& io_data_sizes) :
|
||||
Op(), region(region), increment(increment), num_inputs(num_inputs), num_outputs(num_outputs), io_dims(io_dims), io_data_size(io_data_sizes) {
|
||||
}
|
@ -1,10 +0,0 @@
|
||||
// Copyright (C) 2018-2023 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "snippets/op/tile_scheduler.hpp"
|
||||
#include "snippets/generator.hpp"
|
||||
|
||||
ngraph::snippets::op::TileScheduler::TileScheduler(const AllocatedEmitter& vector_region, const AllocatedEmitter& scalar_region)
|
||||
: Op(), vector_region{vector_region}, scalar_region{scalar_region} {
|
||||
}
|
27
src/common/snippets/src/op/vector_buffer.cpp
Normal file
27
src/common/snippets/src/op/vector_buffer.cpp
Normal file
@ -0,0 +1,27 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <snippets/itt.hpp>
|
||||
|
||||
#include "snippets/op/vector_buffer.hpp"
|
||||
|
||||
#include <ngraph/runtime/host_tensor.hpp>
|
||||
|
||||
using namespace std;
|
||||
using namespace ngraph;
|
||||
|
||||
snippets::op::VectorBuffer::VectorBuffer(const ov::element::Type element_type) : Op(), m_element_type(std::move(element_type)) {
|
||||
constructor_validate_and_infer_types();
|
||||
}
|
||||
|
||||
std::shared_ptr<Node> snippets::op::VectorBuffer::clone_with_new_inputs(const OutputVector& new_args) const {
|
||||
INTERNAL_OP_SCOPE(VectorBuffer_clone_with_new_inputs);
|
||||
check_new_args_count(this, new_args);
|
||||
return std::make_shared<VectorBuffer>(m_element_type);
|
||||
}
|
||||
|
||||
void snippets::op::VectorBuffer::validate_and_infer_types() {
|
||||
INTERNAL_OP_SCOPE(VectorBuffer_validate_and_infer_types);
|
||||
set_output_type(0, m_element_type, Shape{1lu});
|
||||
}
|
@ -20,13 +20,17 @@ inline auto is_in_op(const std::shared_ptr<ov::Node>& n) -> bool {
|
||||
|| ov::is_type<ov::op::v0::Constant>(n);
|
||||
}
|
||||
|
||||
// At the moment Subgraph supports only Eltwise, Convert and FQ (which is decomposed into Eltwises and Convert)
|
||||
// And only Eltwises supports execution only in "exec_type". So we can check op type from the opposite
|
||||
// At the moment Subgraph supports only Eltwise, Select, Convert, Broadcast and FQ (which is decomposed into Eltwises and Convert) with
|
||||
// Softmax (which is decomposed into Eltwises as well)
|
||||
// And only Eltwise and Select ops supports execution only in "exec_type". So we can check op type from the opposite
|
||||
// NOTE: This check is only for executable which isn't Parameter/Constant/Result
|
||||
inline auto op_supports_only_exec_type(const std::shared_ptr<ov::Node>& n) -> bool {
|
||||
return !is_in_op(n) &&
|
||||
!ov::is_type<ov::op::v0::Result>(n) &&
|
||||
!ov::is_type<ov::op::v0::Convert>(n);
|
||||
!ov::is_type<ov::op::v1::Transpose>(n) &&
|
||||
!ov::is_type<ov::op::v0::Convert>(n) &&
|
||||
!ov::is_type<ov::op::v1::Broadcast>(n) &&
|
||||
!ov::is_type<ov::op::v3::Broadcast>(n);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
@ -58,7 +62,8 @@ bool ngraph::snippets::pass::AlignElementType::run_on_model(const std::shared_pt
|
||||
// - Input is Convert with unsupported destination type
|
||||
// - Input is Op which support any element type
|
||||
// We couldn't unite these conditions and just check that element type isn't supported exec type
|
||||
// because we don't call validate_and_infer_types() so we don't know new precisions
|
||||
// because we don't call validate_and_infer_types() so we don't know new precisions after setting of original
|
||||
// input and output element types
|
||||
if ((existing_convert && existing_convert->get_destination_type() != exec_type) ||
|
||||
(!op_supports_only_exec_type(shared_input))) {
|
||||
insertConvert(op, i, exec_type);
|
||||
@ -89,6 +94,6 @@ bool ngraph::snippets::pass::AlignElementType::run_on_model(const std::shared_pt
|
||||
}
|
||||
|
||||
bool ngraph::snippets::pass::AlignElementType::opNeedsAlignElementType(const std::shared_ptr<ov::Node>& op, const ov::element::Type exec_type) {
|
||||
// At the moment Snippets support only Eltwise/Convert/FQ which one output so we can just call get_element_type()
|
||||
// At the moment Snippets support only Eltwise/Convert/FQ/Select/Softmax/Broadcast which one output so we can just call get_element_type()
|
||||
return op_supports_only_exec_type(op) && op->get_element_type() != exec_type;
|
||||
}
|
||||
|
@ -2,81 +2,208 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
// #include <openvino/cc/selective_build.h>
|
||||
#include <snippets/itt.hpp>
|
||||
#include "snippets/remarks.hpp"
|
||||
|
||||
#include "snippets/pass/assign_registers.hpp"
|
||||
#include "snippets/snippets_isa.hpp"
|
||||
|
||||
#include <ngraph/opsets/opset1.hpp>
|
||||
|
||||
#include <iterator>
|
||||
|
||||
namespace {
|
||||
static constexpr size_t reg_count = 16lu;
|
||||
} // namespace
|
||||
|
||||
bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr<ov::Model>& f) {
|
||||
RUN_ON_MODEL_SCOPE(AssignRegisters);
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::AssignRegisters")
|
||||
using Reg = size_t;
|
||||
using tensor = std::shared_ptr<descriptor::Tensor>;
|
||||
auto ops = f->get_ordered_ops();
|
||||
decltype(ops) stmts;
|
||||
std::copy_if(ops.begin(), ops.end(), std::back_inserter(stmts), [](decltype(ops[0]) op) {
|
||||
return !(std::dynamic_pointer_cast<opset1::Parameter>(op) || std::dynamic_pointer_cast<opset1::Result>(op));
|
||||
});
|
||||
// Note that currently there are 3 types of ops:
|
||||
// * gpr->gpr: (Parameter, Result, LoopBegin, LoopEnd) will also be Buffer?
|
||||
// * gpr->vec: or vec->gpr Load/LoadConvert, Store/StoreConvert, BroadcastLoad etc.
|
||||
// * vec->vec: all other "normal" operations that perform calculations on vector registers: Add, BroadcastMove, Power, etc.
|
||||
enum op_reg_type {gpr2gpr, gpr2vec, vec2gpr, vec2vec};
|
||||
|
||||
size_t rdx = 0;
|
||||
std::map<std::shared_ptr<descriptor::Tensor>, Reg> regs;
|
||||
for (const auto& op : stmts) {
|
||||
for (const auto& output : op->outputs()) {
|
||||
regs[output.get_tensor_ptr()] = rdx++;
|
||||
auto get_op_reg_type = [](const std::shared_ptr<Node>& op) {
|
||||
if (std::dynamic_pointer_cast<opset1::Parameter>(op) ||
|
||||
std::dynamic_pointer_cast<opset1::Result>(op) ||
|
||||
std::dynamic_pointer_cast<op::LoopBegin>(op) ||
|
||||
std::dynamic_pointer_cast<op::LoopEnd>(op) ||
|
||||
std::dynamic_pointer_cast<op::Brgemm>(op) ||
|
||||
std::dynamic_pointer_cast<op::Buffer>(op))
|
||||
return gpr2gpr;
|
||||
else if (std::dynamic_pointer_cast<snippets::op::Load>(op) ||
|
||||
std::dynamic_pointer_cast<snippets::op::BroadcastLoad>(op))
|
||||
return gpr2vec;
|
||||
else if (std::dynamic_pointer_cast<snippets::op::Store>(op))
|
||||
return vec2gpr;
|
||||
else
|
||||
return vec2vec;
|
||||
};
|
||||
std::vector<std::pair<op_reg_type, std::shared_ptr<Node>>> typed_ops;
|
||||
for (const auto& op : ops)
|
||||
typed_ops.emplace_back(std::make_pair(get_op_reg_type(op), op));
|
||||
size_t counter_vec = 0;
|
||||
size_t counter_gpr = 0;
|
||||
std::map<tensor, Reg> regs_vec, regs_gpr;
|
||||
// Define a set of immune tensors that will be ignored by auto reg allocation => their reg allocation is done manually
|
||||
std::map<tensor, Reg> manually_assigned_gprs, manually_assigned_vecs;
|
||||
const auto IS_MANUALLY_ALLOCATED_REG = SIZE_MAX;
|
||||
const auto num_parameters = f->get_parameters().size();
|
||||
const auto num_results = f->get_results().size();
|
||||
auto accumulator_reg = 0lu;
|
||||
for (const auto& op : ops) {
|
||||
if (const auto& param = ov::as_type_ptr<ov::op::v0::Parameter>(op)) {
|
||||
manually_assigned_gprs[op->output(0).get_tensor_ptr()] =
|
||||
static_cast<Reg>(f->get_parameter_index(param));
|
||||
} else if (const auto& result = ov::as_type_ptr<opset1::Result>(op)) {
|
||||
// here we use the fact that Result input & output tensors are identical by construction
|
||||
manually_assigned_gprs[op->output(0).get_tensor_ptr()] =
|
||||
static_cast<Reg>(f->get_result_index(result) + num_parameters);
|
||||
} else if (const auto& buffer = ov::as_type_ptr<op::Buffer>(op)) {
|
||||
// All buffers have one common data pointer
|
||||
manually_assigned_gprs[op->input(0).get_tensor_ptr()] =
|
||||
static_cast<Reg>(num_results + num_parameters);
|
||||
manually_assigned_gprs[op->output(0).get_tensor_ptr()] =
|
||||
static_cast<Reg>(num_results + num_parameters);
|
||||
} else if (ov::is_type<op::HorizonMax>(op) || ov::is_type<op::HorizonSum>(op)) {
|
||||
// Only in SoftmaxDecomposition ReduceMax and ReduceSum use HorizonMax/HorizonSum and VectorBuffer.
|
||||
// We should manually set the one vector register for VectorBuffer and Max/Sum output to simulate a accumulator
|
||||
// TODO [96351]: We should rewrite accumulator pattern using another way
|
||||
const auto input = op->get_input_node_shared_ptr(0); // input - it's accumulator math op: Add or Max
|
||||
for (size_t i = 0; i < input->get_input_size(); ++i) {
|
||||
if (ov::is_type<op::VectorBuffer>(input->get_input_node_shared_ptr(i))) {
|
||||
manually_assigned_vecs[input->input(i).get_tensor_ptr()] =
|
||||
static_cast<Reg>(accumulator_reg);
|
||||
}
|
||||
}
|
||||
|
||||
manually_assigned_vecs[input->output(0).get_tensor_ptr()] =
|
||||
static_cast<Reg>(accumulator_reg);
|
||||
manually_assigned_vecs[op->output(0).get_tensor_ptr()] =
|
||||
static_cast<Reg>(accumulator_reg);
|
||||
|
||||
// If there is Broadcast, it should have the same register as Horizon op
|
||||
// because it's a result of the accumulator as well
|
||||
for (auto& out : op->output(0).get_target_inputs()) {
|
||||
const auto child = out.get_node()->shared_from_this();
|
||||
if (ov::is_type<op::BroadcastMove>(child)) {
|
||||
manually_assigned_vecs[child->output(0).get_tensor_ptr()] =
|
||||
static_cast<Reg>(accumulator_reg);
|
||||
}
|
||||
}
|
||||
accumulator_reg++;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::set<Reg>> used;
|
||||
std::vector<std::set<Reg>> def;
|
||||
|
||||
for (const auto& op : stmts) {
|
||||
std::set<Reg> u;
|
||||
for (const auto& input : op->inputs()) {
|
||||
if (regs.count(input.get_tensor_ptr())) {
|
||||
u.insert(regs[input.get_tensor_ptr()]);
|
||||
auto enumerate_out_tensors = [IS_MANUALLY_ALLOCATED_REG] (const std::shared_ptr<ov::Node>& op,
|
||||
decltype(regs_vec)& reg_map,
|
||||
const std::map<tensor, Reg>& manually_assigned_regs,
|
||||
size_t& counter) {
|
||||
for (const auto& output : op->outputs()) {
|
||||
const auto& t = output.get_tensor_ptr();
|
||||
// Note that some ops might have identical input&output tensors (Result and Tile* for ex.)
|
||||
// so we have to check that the tensor has not been enumerated already
|
||||
if (reg_map.count(t) == 0) {
|
||||
reg_map[t] = manually_assigned_regs.count(t) == 0 ? counter++ : IS_MANUALLY_ALLOCATED_REG;
|
||||
}
|
||||
}
|
||||
used.push_back(u);
|
||||
|
||||
std::set<Reg> d;
|
||||
if (!std::dynamic_pointer_cast<snippets::op::Store>(op)) {
|
||||
for (const auto& output : op->outputs()) {
|
||||
d.insert(regs[output.get_tensor_ptr()]);
|
||||
}
|
||||
};
|
||||
for (const auto& t_op : typed_ops) {
|
||||
switch (t_op.first) {
|
||||
case vec2vec:
|
||||
case gpr2vec:
|
||||
enumerate_out_tensors(t_op.second, regs_vec, manually_assigned_vecs, counter_vec);
|
||||
break;
|
||||
case gpr2gpr:
|
||||
case vec2gpr:
|
||||
enumerate_out_tensors(t_op.second, regs_gpr, manually_assigned_gprs, counter_gpr);
|
||||
break;
|
||||
}
|
||||
}
|
||||
// todo: make one for gpr and one for vector
|
||||
std::vector<std::set<Reg>> used_gpr(ops.size(), std::set<Reg>()); // used = used as an input
|
||||
std::vector<std::set<Reg>> defined_gpr(ops.size(), std::set<Reg>()); // defined = used as output
|
||||
std::vector<std::set<Reg>> used_vec(ops.size(), std::set<Reg>());
|
||||
std::vector<std::set<Reg>> defined_vec(ops.size(), std::set<Reg>());
|
||||
|
||||
auto tensor2reg = [IS_MANUALLY_ALLOCATED_REG] (const std::vector<tensor>& tensors, const std::map<tensor, Reg>& reg_map) {
|
||||
std::set<Reg> result;
|
||||
for (const auto& t : tensors) {
|
||||
if (reg_map.count(t) == 0)
|
||||
throw ngraph::ngraph_error("Assign registers: attempt to access not enumerated tensor");
|
||||
Reg reg_id = reg_map.at(t);
|
||||
if (reg_id != IS_MANUALLY_ALLOCATED_REG)
|
||||
result.insert(reg_id);
|
||||
}
|
||||
return result;
|
||||
};
|
||||
for (size_t i = 0; i < typed_ops.size(); i++) {
|
||||
const auto& t_op = typed_ops[i];
|
||||
std::vector<tensor> used_tensors, defined_tensors;
|
||||
for (const auto& in : t_op.second->inputs())
|
||||
used_tensors.push_back(in.get_tensor_ptr());
|
||||
for (const auto& out : t_op.second->outputs())
|
||||
defined_tensors.push_back(out.get_tensor_ptr());
|
||||
switch (t_op.first) {
|
||||
case vec2vec:
|
||||
used_vec[i] = tensor2reg(used_tensors, regs_vec);
|
||||
defined_vec[i] = tensor2reg(defined_tensors, regs_vec);
|
||||
break;
|
||||
case gpr2gpr:
|
||||
used_gpr[i] = tensor2reg(used_tensors, regs_gpr);
|
||||
defined_gpr[i] = tensor2reg(defined_tensors, regs_gpr);
|
||||
break;
|
||||
case gpr2vec:
|
||||
used_gpr[i] = tensor2reg(used_tensors, regs_gpr);
|
||||
defined_vec[i] = tensor2reg(defined_tensors, regs_vec);
|
||||
break;
|
||||
case vec2gpr:
|
||||
used_vec[i] = tensor2reg(used_tensors, regs_vec);
|
||||
defined_gpr[i] = tensor2reg(defined_tensors, regs_gpr);
|
||||
break;
|
||||
}
|
||||
def.push_back(d);
|
||||
}
|
||||
|
||||
// define life intervals
|
||||
std::vector<std::set<Reg>> lifeIn(stmts.size(), std::set<Reg>());
|
||||
std::vector<std::set<Reg>> lifeOut(stmts.size(), std::set<Reg>());
|
||||
// liveOut[i] - regs that are live on exit from i-th (topologically ordered) operation
|
||||
// liveIn[i] - regs that are live on entering the i-th (topologically ordered) operation
|
||||
std::vector<std::set<Reg>> life_in_vec(std::move(used_vec));
|
||||
std::vector<std::set<Reg>> life_out_vec(typed_ops.size(), std::set<Reg>());
|
||||
std::vector<std::set<Reg>> life_in_gpr(std::move(used_gpr));
|
||||
std::vector<std::set<Reg>> life_out_gpr(typed_ops.size(), std::set<Reg>());
|
||||
|
||||
for (size_t i = 0; i < stmts.size(); i++) {
|
||||
for (size_t n = 0; n < stmts.size(); n++) {
|
||||
std::set_difference(lifeOut[n].begin(), lifeOut[n].end(), def[n].begin(), def[n].end(), std::inserter(lifeIn[n], lifeIn[n].begin()));
|
||||
lifeIn[n].insert(used[n].begin(), used[n].end());
|
||||
// todo: this part if O(N*N), so it's slow for large subgraphs. Can we simplify it? At least add an early stopping criteria
|
||||
for (size_t i = 0; i < typed_ops.size(); i++) {
|
||||
for (size_t n = 0; n < typed_ops.size(); n++) {
|
||||
// Regs that are live on entering the operation = regs used by the op + (all other regs alive - regs defined by the op)
|
||||
// copy regs from lifeOut to lifeIn while ignoring regs in def
|
||||
std::set_difference(life_out_gpr[n].begin(), life_out_gpr[n].end(),
|
||||
defined_gpr[n].begin(), defined_gpr[n].end(),
|
||||
std::inserter(life_in_gpr[n], life_in_gpr[n].begin()));
|
||||
std::set_difference(life_out_vec[n].begin(), life_out_vec[n].end(),
|
||||
defined_vec[n].begin(), defined_vec[n].end(),
|
||||
std::inserter(life_in_vec[n], life_in_vec[n].begin()));
|
||||
}
|
||||
for (size_t n = 0; n < stmts.size(); n++) {
|
||||
auto node = stmts[n];
|
||||
if (!std::dynamic_pointer_cast<snippets::op::Store>(node)) {
|
||||
for (const auto& out : node->outputs()) {
|
||||
for (const auto& port : out.get_target_inputs()) {
|
||||
auto pos = std::find(stmts.begin(), stmts.end(), port.get_node()->shared_from_this());
|
||||
if (pos != stmts.end()) {
|
||||
auto k = pos-stmts.begin();
|
||||
lifeOut[n].insert(lifeIn[k].begin(), lifeIn[k].end());
|
||||
}
|
||||
for (size_t n = 0; n < typed_ops.size(); n++) {
|
||||
auto op = typed_ops[n].second;
|
||||
for (const auto& out : op->outputs()) {
|
||||
for (const auto& port : out.get_target_inputs()) {
|
||||
auto k = std::find(ops.begin(), ops.end(), port.get_node()->shared_from_this()) - ops.begin();
|
||||
if (k == ops.size())
|
||||
throw ngraph_error("assign registers can't find target op in the body");
|
||||
switch (typed_ops[k].first) {
|
||||
case vec2vec:
|
||||
case vec2gpr:
|
||||
life_out_vec[n].insert(life_in_vec[k].begin(), life_in_vec[k].end());
|
||||
break;
|
||||
case gpr2gpr:
|
||||
case gpr2vec:
|
||||
life_out_gpr[n].insert(life_in_gpr[k].begin(), life_in_gpr[k].end());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct by_starting {
|
||||
auto operator()(const std::pair<int, int>& lhs, const std::pair<int, int>& rhs) const -> bool {
|
||||
return lhs.first < rhs.first|| (lhs.first == rhs.first && lhs.second < rhs.second);
|
||||
@ -88,13 +215,15 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
|
||||
return lhs.second < rhs.second || (lhs.second == rhs.second && lhs.first < rhs.first);
|
||||
}
|
||||
};
|
||||
// A variable live interval - is a range (start, stop) of op indexes, such that
|
||||
// the variable is alive within this range (defined but not used by the last user)
|
||||
std::map<std::pair<int, int>, Reg, by_starting> live_intervals_vec, live_intervals_gpr;
|
||||
|
||||
std::set<std::pair<int, int>, by_starting> live_intervals;
|
||||
|
||||
std::reverse(lifeIn.begin(), lifeIn.end());
|
||||
auto find_last_use = [lifeIn](int i) -> int {
|
||||
int ln = static_cast<int>(lifeIn.size()) - 1;
|
||||
for (auto& x : lifeIn) {
|
||||
std::reverse(life_in_vec.begin(), life_in_vec.end());
|
||||
std::reverse(life_in_gpr.begin(), life_in_gpr.end());
|
||||
auto find_last_use = [](decltype(life_in_gpr) life_in, int i) -> int {
|
||||
int ln = static_cast<int>(life_in.size()) - 1;
|
||||
for (auto& x : life_in) {
|
||||
if (x.find(i) != x.end()) {
|
||||
return ln;
|
||||
}
|
||||
@ -102,67 +231,86 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr
|
||||
}
|
||||
return i;
|
||||
};
|
||||
|
||||
for (size_t i = 0; i < stmts.size(); i++) {
|
||||
live_intervals.insert(std::make_pair(static_cast<int>(i), find_last_use(static_cast<int>(i))));
|
||||
for (int i = 0; i < static_cast<int>(typed_ops.size()); i++) {
|
||||
for (const auto& def : defined_vec[i])
|
||||
live_intervals_vec[std::make_pair(i, find_last_use(life_in_vec, static_cast<int>(def)))] = def;
|
||||
for (const auto& def : defined_gpr[i])
|
||||
live_intervals_gpr[std::make_pair(i, find_last_use(life_in_gpr, static_cast<int>(def)))] = def;
|
||||
}
|
||||
|
||||
// http://web.cs.ucla.edu/~palsberg/course/cs132/linearscan.pdf
|
||||
std::multiset<std::pair<int, int>, by_ending> active;
|
||||
std::map<Reg, Reg> register_map;
|
||||
std::stack<Reg> bank;
|
||||
for (int i = 0; i < 16; i++) bank.push(16-1-i);
|
||||
auto linescan_assign_registers = [](const decltype(live_intervals_vec)& live_intervals,
|
||||
const std::set<Reg>& reg_pool) {
|
||||
// http://web.cs.ucla.edu/~palsberg/course/cs132/linearscan.pdf
|
||||
// todo: do we need multimap? <=> can an op have two inputs from the same op?
|
||||
std::map<std::pair<int, int>, Reg, by_ending> active;
|
||||
// uniquely defined register => reused reg (reduced subset enabled by reg by reusage)
|
||||
std::map<Reg, Reg> register_map;
|
||||
std::stack<Reg> bank;
|
||||
// regs are stored in ascending order in reg_pool, so walk in reverse to assign them the same way
|
||||
for (auto rit = reg_pool.crbegin(); rit != reg_pool.crend(); rit++)
|
||||
bank.push(*rit);
|
||||
|
||||
for (auto interval : live_intervals) {
|
||||
// check expired
|
||||
while (!active.empty()) {
|
||||
auto x = *active.begin();
|
||||
if (x.second >= interval.first) {
|
||||
break;
|
||||
std::pair<int, int> interval, active_interval;
|
||||
Reg unique_reg, active_unique_reg;
|
||||
for (const auto& interval_reg : live_intervals) {
|
||||
std::tie(interval, unique_reg) = interval_reg;
|
||||
// check expired
|
||||
while (!active.empty()) {
|
||||
std::tie(active_interval, active_unique_reg) = *active.begin();
|
||||
// if end of active interval has not passed yet => stop removing actives since they are sorted by end
|
||||
if (active_interval.second >= interval.first) {
|
||||
break;
|
||||
}
|
||||
active.erase(active_interval);
|
||||
bank.push(register_map[active_unique_reg]);
|
||||
}
|
||||
active.erase(x);
|
||||
bank.push(register_map[x.first]);
|
||||
}
|
||||
// allocate
|
||||
if (active.size() == 16) {
|
||||
throw ngraph_error("caanot allocate registers for a snippet ");
|
||||
} else {
|
||||
register_map[interval.first] = bank.top();
|
||||
bank.pop();
|
||||
active.insert(interval);
|
||||
}
|
||||
}
|
||||
|
||||
std::map<std::shared_ptr<descriptor::Tensor>, Reg> physical_regs;
|
||||
|
||||
for (const auto& reg : regs) {
|
||||
physical_regs[reg.first] = register_map[reg.second];
|
||||
}
|
||||
const auto num_parameters = f->get_parameters().size();
|
||||
for (const auto& n : f->get_ordered_ops()) {
|
||||
auto& rt = n->get_rt_info();
|
||||
std::vector<size_t> regs;
|
||||
regs.reserve(n->outputs().size());
|
||||
/* The main idea here is that each operation stores its output regs in rt["reginfo"]. Input and output regs are
|
||||
* then derived by parsing node's and parent's rt["reginfo"], look into ngraph::snippets::getRegisters for details.
|
||||
* Note also that Parameter and Result store general-purpose register index, because they work with memory
|
||||
* (memory pointer is stored in gpr). All other "regular" ops store vector regs indexes, since calculations are
|
||||
* performed on registers.
|
||||
*/
|
||||
if (is_type<ov::op::v0::Result>(n)) {
|
||||
continue;
|
||||
} else if (const auto& param = ov::as_type_ptr<ov::op::v0::Parameter>(n)) {
|
||||
regs.push_back(f->get_parameter_index(param));
|
||||
} else if (const auto& store = ov::as_type_ptr<ngraph::snippets::op::Store>(n)) {
|
||||
regs.push_back(f->get_result_index(store) + num_parameters);
|
||||
} else {
|
||||
for (const auto& output : n->outputs()) {
|
||||
auto allocated = physical_regs[output.get_tensor_ptr()];
|
||||
regs.push_back(allocated);
|
||||
// allocate
|
||||
if (active.size() == reg_pool.size()) {
|
||||
// todo: if it is LoopBegin or LoopEnd that requires gpr, and we don't have any in the pool,
|
||||
// then assign SIZE_MAX-1 as a flag to spill a reg inside emitter
|
||||
throw ngraph::ngraph_error("can't allocate registers for a snippet ");
|
||||
} else {
|
||||
register_map[unique_reg] = bank.top();
|
||||
bank.pop();
|
||||
active.insert(interval_reg);
|
||||
}
|
||||
}
|
||||
rt["reginfo"] = regs;
|
||||
}
|
||||
return register_map;
|
||||
};
|
||||
// todo: vec_/gpr_pool are hardware-specific and should be provided by a backend, e.g. overloaded generator
|
||||
std::set<Reg> vec_pool;
|
||||
for (Reg i = 0; i < reg_count; i++)
|
||||
vec_pool.insert(i);
|
||||
std::set<Reg> gpr_pool(vec_pool);
|
||||
for (const auto& t_reg : manually_assigned_vecs)
|
||||
vec_pool.erase(t_reg.second);
|
||||
for (const auto& t_reg : manually_assigned_gprs)
|
||||
gpr_pool.erase(t_reg.second);
|
||||
auto unique2reused_map_vec = linescan_assign_registers(live_intervals_vec, vec_pool);
|
||||
auto unique2reused_map_gpr = linescan_assign_registers(live_intervals_gpr, gpr_pool);
|
||||
|
||||
std::map<tensor, Reg> assigned_regs(std::move(manually_assigned_gprs));
|
||||
assigned_regs.insert(manually_assigned_vecs.begin(), manually_assigned_vecs.end());
|
||||
auto register_assigned_regs = [IS_MANUALLY_ALLOCATED_REG, &assigned_regs](const std::map<tensor, Reg>& unique_regs,
|
||||
const std::map<Reg, Reg>& unique2reused) {
|
||||
for (const auto& reg : unique_regs) {
|
||||
if (reg.second == IS_MANUALLY_ALLOCATED_REG)
|
||||
continue;
|
||||
if (unique2reused.count(reg.second) == 0)
|
||||
throw ngraph::ngraph_error("Assign registers failed to allocate register for a tensor");
|
||||
assigned_regs[reg.first] = unique2reused.at(reg.second);
|
||||
}
|
||||
};
|
||||
register_assigned_regs(regs_vec, unique2reused_map_vec);
|
||||
register_assigned_regs(regs_gpr, unique2reused_map_gpr);
|
||||
|
||||
for (const auto& t_op : typed_ops) {
|
||||
for (const auto& out : t_op.second->outputs()) {
|
||||
const auto& t = out.get_tensor_ptr();
|
||||
auto& rt = t->get_rt_info();
|
||||
rt["reginfo"] = static_cast<size_t>(assigned_regs[t]);
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
49
src/common/snippets/src/pass/broadcast_to_movebroadcast.cpp
Normal file
49
src/common/snippets/src/pass/broadcast_to_movebroadcast.cpp
Normal file
@ -0,0 +1,49 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <snippets/itt.hpp>
|
||||
|
||||
#include "snippets/pass/broadcast_to_movebroadcast.hpp"
|
||||
#include "snippets/pass/insert_movebroadcast.hpp"
|
||||
#include <ngraph/pattern/op/wrap_type.hpp>
|
||||
|
||||
#include <ngraph/opsets/opset1.hpp>
|
||||
#include <ngraph/rt_info.hpp>
|
||||
|
||||
using namespace ngraph;
|
||||
|
||||
ngraph::snippets::pass::BroadcastToMoveBroadcast::BroadcastToMoveBroadcast() {
|
||||
MATCHER_SCOPE(BroadcastToMoveBroadcast);
|
||||
|
||||
auto m_broadcast = ngraph::pattern::wrap_type<ngraph::op::v1::Broadcast, ngraph::op::v3::Broadcast>();
|
||||
|
||||
auto callback = [this](ngraph::pattern::Matcher &m) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::BroadcastToMoveBroadcast")
|
||||
auto root = m.get_match_root();
|
||||
if (auto broadcast_v1 = ov::as_type_ptr<const ov::op::v1::Broadcast>(root)) {
|
||||
if (broadcast_v1->get_broadcast_spec().m_type != ov::op::AutoBroadcastType::NUMPY)
|
||||
return false;
|
||||
} else if (auto broadcast_v3 = ov::as_type_ptr<const ov::op::v3::Broadcast>(root)) {
|
||||
if (broadcast_v3->get_broadcast_spec().m_type != ov::op::BroadcastType::NUMPY)
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto target_shape = root->get_output_partial_shape(0);
|
||||
const auto value_shape = root->get_input_partial_shape(0);
|
||||
if (target_shape.is_dynamic() || value_shape.is_dynamic()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto broadcast_node = ngraph::snippets::pass::InsertMoveBroadcast::BroadcastNodeLastDim(root->input_value(0),
|
||||
target_shape.get_shape(),
|
||||
value_shape.get_shape());
|
||||
replace_output_update_name(root->output(0), broadcast_node);
|
||||
ngraph::copy_runtime_info(root, broadcast_node.get_node_shared_ptr());
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
auto m = std::make_shared<ngraph::pattern::Matcher>(m_broadcast, matcher_name);
|
||||
register_matcher(m, callback);
|
||||
}
|
@ -6,6 +6,9 @@
|
||||
#include <snippets/itt.hpp>
|
||||
|
||||
#include "snippets/pass/collapse_subgraph.hpp"
|
||||
#include "snippets/pass/tokenization.hpp"
|
||||
#include "snippets/pass/transpose_decomposition.hpp"
|
||||
#include "snippets/pass/fuse_transpose_brgemm.hpp"
|
||||
#include "snippets/op/subgraph.hpp"
|
||||
#include "snippets/utils.hpp"
|
||||
|
||||
@ -14,11 +17,11 @@
|
||||
#include <ngraph/rt_info.hpp>
|
||||
#include <ngraph/op/loop.hpp>
|
||||
#include "transformations/utils/utils.hpp"
|
||||
#include "ngraph/op/util/attr_types.hpp"
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <cassert>
|
||||
#include <queue>
|
||||
#include <string>
|
||||
#include <numeric>
|
||||
#include <climits>
|
||||
@ -32,33 +35,38 @@ namespace pass {
|
||||
namespace {
|
||||
|
||||
auto outputs_are_not_broadcastable(const std::shared_ptr<const Node>& node) -> bool {
|
||||
auto outputs = node->outputs();
|
||||
auto find_smallest_output_shape = [](const std::vector<Output<const Node>>& outputs) -> Shape {
|
||||
return std::accumulate(std::begin(outputs), std::end(outputs), ngraph::Shape(outputs.begin()->get_shape()),
|
||||
[](Shape& other_shape, const Output<const Node>& output){
|
||||
return shape_size(output.get_shape()) < shape_size(other_shape) ? output.get_shape() : other_shape;
|
||||
});
|
||||
};
|
||||
auto ref_shape = find_smallest_output_shape(outputs);
|
||||
|
||||
auto check_shapes_broadcastable = [ref_shape](const Output<const Node>& output) -> bool {
|
||||
auto other_shape = output.get_shape();
|
||||
|
||||
if (other_shape.size() != ref_shape.size()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return std::inner_product(std::begin(other_shape), std::end(other_shape), std::begin(ref_shape), true,
|
||||
std::logical_and<bool>(), [](Shape::value_type lsh, Shape::value_type rsh){
|
||||
return rsh == 1 || lsh == rsh;
|
||||
});
|
||||
};
|
||||
|
||||
return std::find_if_not(std::begin(outputs), std::end(outputs), check_shapes_broadcastable) != std::end(outputs);
|
||||
const auto& outputs = node->outputs();
|
||||
if (outputs.size() <= 1)
|
||||
return false;
|
||||
ov::PartialShape ref_shape = outputs.front().get_partial_shape();
|
||||
bool success = true;
|
||||
for (int i = 1; i < outputs.size() && success; i++) {
|
||||
success &= ov::PartialShape::broadcast_merge_into(ref_shape, outputs[i].get_partial_shape(), ov::op::AutoBroadcastType::NUMPY);
|
||||
}
|
||||
return !success;
|
||||
}
|
||||
|
||||
auto is_supported_op(const std::shared_ptr<const Node> &n) -> bool {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::is_supported_op")
|
||||
auto is_supported_matmul = [](const std::shared_ptr<const Node>& n) -> bool {
|
||||
const auto& matmul = is_type<const opset1::MatMul>(n);
|
||||
const auto& out_shape = n->get_output_partial_shape(0);
|
||||
return matmul && out_shape.is_static() && out_shape.size() == 4;
|
||||
};
|
||||
auto is_supported_transpose = [](const std::shared_ptr<const Node>& n) -> bool {
|
||||
const auto& transpose = as_type_ptr<const opset1::Transpose>(n);
|
||||
const auto& out_shape = n->get_output_partial_shape(0);
|
||||
if (transpose && out_shape.is_static()) {
|
||||
const auto& order = as_type_ptr<const opset1::Constant>(n->get_input_node_shared_ptr(1));
|
||||
if (order) {
|
||||
const auto order_value = order->cast_vector<int>();
|
||||
return TransposeDecomposition::supported_cases.count(order_value) != 0 ||
|
||||
FuseTransposeBrgemm::supported_cases.count(order_value) != 0;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
auto is_supported_fq_op = [](const std::shared_ptr<const Node>& n) -> bool {
|
||||
// TODO [92179]: Add support of FakeQuantize with non-constants inputs and with binarization algorithm.
|
||||
const auto fq = ov::as_type_ptr<const opset1::FakeQuantize>(n);
|
||||
@ -69,6 +77,10 @@ auto is_supported_op(const std::shared_ptr<const Node> &n) -> bool {
|
||||
is_type<opset1::Constant>(n->get_input_node_shared_ptr(4));
|
||||
};
|
||||
|
||||
auto is_supported_ternary_eltwise_op = [](const std::shared_ptr<const Node> &n) -> bool {
|
||||
return ov::is_type<opset1::Select>(n);
|
||||
};
|
||||
|
||||
auto is_supported_binary_eltwise_op = [](const std::shared_ptr<const Node> &n) -> bool {
|
||||
return ov::is_type<opset1::Add>(n)
|
||||
|| ov::is_type<opset1::Divide>(n)
|
||||
@ -114,14 +126,51 @@ auto is_supported_op(const std::shared_ptr<const Node> &n) -> bool {
|
||||
|| ov::is_type<ngraph::op::v4::Swish>(n)
|
||||
|| ov::is_type<ngraph::op::v4::HSwish>(n);
|
||||
};
|
||||
return is_supported_fq_op(n) || is_supported_unary_eltwise_op(n) || is_supported_binary_eltwise_op(n);
|
||||
|
||||
auto is_supported_softmax = [](const std::shared_ptr<const Node> &n) -> bool {
|
||||
if (n->get_input_size() != 1 || n->get_input_partial_shape(0).rank().is_dynamic())
|
||||
return false;
|
||||
int64_t axis = -1;
|
||||
const auto rank = n->get_input_partial_shape(0).rank();
|
||||
if (const auto softmax_v8 = ngraph::as_type_ptr<const ov::op::v8::Softmax>(n)) {
|
||||
axis = ngraph::normalize_axis(n->get_friendly_name(), softmax_v8->get_axis(), rank);
|
||||
} else if (const auto softmax_v1 = ngraph::as_type_ptr<const ov::op::v1::Softmax>(n)) {
|
||||
axis = softmax_v1->get_axis();
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
return axis >= 0 && axis == (rank.get_length() - 1);
|
||||
};
|
||||
|
||||
auto is_supported_broadcast_op = [](const std::shared_ptr<const Node> &n) -> bool {
|
||||
// Broadcast is supported only for MHA tokenization where there are needed and special checks
|
||||
if (auto broadcast_v1 = ov::as_type_ptr<const ov::op::v1::Broadcast>(n)) {
|
||||
return broadcast_v1->get_broadcast_spec().m_type == ov::op::AutoBroadcastType::NUMPY;
|
||||
} else if (auto broadcast_v3 = ov::as_type_ptr<const ov::op::v3::Broadcast>(n)) {
|
||||
return broadcast_v3->get_broadcast_spec().m_type == ov::op::BroadcastType::NUMPY;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
return is_supported_fq_op(n) ||
|
||||
is_supported_unary_eltwise_op(n) ||
|
||||
is_supported_binary_eltwise_op(n) ||
|
||||
is_supported_ternary_eltwise_op(n) ||
|
||||
is_supported_transpose(n) ||
|
||||
is_supported_softmax(n) ||
|
||||
is_supported_matmul(n) ||
|
||||
is_supported_broadcast_op(n);
|
||||
}
|
||||
|
||||
auto has_supported_in_out(const std::shared_ptr<const Node> &n) -> bool {
|
||||
auto supported = [](descriptor::Tensor& t) -> bool {
|
||||
static const std::set<ngraph::element::Type> supported_data_types =
|
||||
{ ngraph::element::f32, ngraph::element::bf16, ngraph::element::i8, ngraph::element::u8 };
|
||||
return t.get_partial_shape().is_static() && supported_data_types.count(t.get_element_type()) != 0;
|
||||
auto supported = [&n](descriptor::Tensor& t) -> bool {
|
||||
// Todo: int32 isn't supported in general because i32 emitters are required for bit-exact i32 calculations in some cases
|
||||
// So i32 is supported exclusively for transposes and broadcast
|
||||
return t.get_partial_shape().is_static() &&
|
||||
(TokenizeSnippets::supported_element_types.count(t.get_element_type()) != 0 ||
|
||||
(t.get_element_type() == ngraph::element::i32 &&
|
||||
(ov::is_type<const opset1::Transpose>(n) ||
|
||||
ov::is_type<const opset1::Broadcast>(n))));
|
||||
};
|
||||
const auto & inputs = n->inputs();
|
||||
const auto & outputs = n->outputs();
|
||||
@ -155,65 +204,15 @@ auto get_num_result_children(const std::shared_ptr<const Node> &node) -> size_t
|
||||
}
|
||||
return result;
|
||||
}
|
||||
// Need to update tensor name manually, since intel_cpu::Graph::Replicate() looks at input.get_tensor().get_name();
|
||||
// If subgraph->get_output_size() == 1, then the name will be restored correctly from the node name
|
||||
auto update_out_tensor_name(std::shared_ptr<ngraph::snippets::op::Subgraph> &subgraph) -> void {
|
||||
bool not_set = true;
|
||||
for (unsigned int i = 0; i < subgraph->get_output_size() && not_set; i++) {
|
||||
for (const auto &in : subgraph->get_output_target_inputs(i)) {
|
||||
if (ov::is_type<opset1::Result>(in.get_node())) {
|
||||
const auto& body_result = subgraph->body_ptr()->get_output_op(i);
|
||||
const auto& body_result_input = body_result->get_input_source_output(0);
|
||||
op::Subgraph::fill_empty_output_names(subgraph->output(i), body_result_input);
|
||||
not_set = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
bool AppropriateForSubgraph(const std::shared_ptr<const Node> &node) {
|
||||
const std::set<ngraph::element::Type> ngraph::snippets::pass::TokenizeSnippets::supported_element_types =
|
||||
{ ngraph::element::f32, ngraph::element::bf16, ngraph::element::i8, ngraph::element::u8 };
|
||||
|
||||
bool TokenizeSnippets::AppropriateForSubgraph(const std::shared_ptr<const Node> &node) {
|
||||
return is_supported_op(node) && has_supported_in_out(node) && node->get_control_dependencies().empty();
|
||||
}
|
||||
|
||||
void SetSnippetsNodeType(const std::shared_ptr<Node> &node, SnippetsNodeType nodeType) {
|
||||
auto &rt = node->get_rt_info();
|
||||
rt["SnippetsNodeType"] = nodeType;
|
||||
}
|
||||
|
||||
SnippetsNodeType GetSnippetsNodeType(const std::shared_ptr<const Node> &node) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::GetSnippetsNodeType")
|
||||
auto &rt = node->get_rt_info();
|
||||
const auto rinfo = rt.find("SnippetsNodeType");
|
||||
if (rinfo == rt.end())
|
||||
return SnippetsNodeType::NotSet;
|
||||
return rinfo->second.as<SnippetsNodeType>();
|
||||
}
|
||||
|
||||
void SetTopologicalOrder(const std::shared_ptr<Node> &node, int64_t order) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::SetTopologicalOrder")
|
||||
auto &rt = node->get_rt_info();
|
||||
rt["TopologicalOrder"] = order;
|
||||
}
|
||||
|
||||
int64_t GetTopologicalOrder(const std::shared_ptr<const Node> &node) {
|
||||
auto &rt = node->get_rt_info();
|
||||
const auto rinfo = rt.find("TopologicalOrder");
|
||||
if (rinfo == rt.end())
|
||||
throw ngraph_error("Topological order is required, but not set.");
|
||||
return rinfo->second.as<int64_t>();
|
||||
}
|
||||
|
||||
bool EnumerateNodes::run_on_model(const std::shared_ptr<ov::Model> &m) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::EnumerateNodes")
|
||||
int64_t order = 0;
|
||||
// Todo: We don't really have to set order for every node, just for subgraph parents and children would be enough
|
||||
for (auto &node : m->get_ordered_ops()) {
|
||||
SetTopologicalOrder(node, order++);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
TokenizeSnippets::TokenizeSnippets() {
|
||||
MATCHER_SCOPE(TokenizeSnippets);
|
||||
enum continuation_strategy {
|
||||
@ -224,7 +223,12 @@ TokenizeSnippets::TokenizeSnippets() {
|
||||
continuation_strategy strategy = continuation_strategy::reset;
|
||||
auto label = std::make_shared<pattern::op::Label>(pattern::any_input(),
|
||||
[](const std::shared_ptr<const Node> &n) {
|
||||
return GetSnippetsNodeType(n) != SnippetsNodeType::SkippedByPlugin && AppropriateForSubgraph(n);
|
||||
// todo: MatMul and Transpose ops are always skipped by the SnippetsMarkSkipped pass.
|
||||
// This is a temporary solution. Either modify SnippetsMarkSkipped
|
||||
// or align this with the custom MHA tokenization pass.
|
||||
return (GetSnippetsNodeType(n) != SnippetsNodeType::SkippedByPlugin ||
|
||||
ov::is_type<opset1::MatMul>(n) || ov::is_type<opset1::Transpose>(n))
|
||||
&& AppropriateForSubgraph(n);
|
||||
});
|
||||
ngraph::graph_rewrite_callback callback = [&, strategy](ngraph::pattern::Matcher &m) -> bool {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::CreateSubgraph_callback")
|
||||
@ -248,7 +252,7 @@ TokenizeSnippets::TokenizeSnippets() {
|
||||
auto subgraph = op::Subgraph::wrap_node_as_subgraph(node);
|
||||
subgraph->get_rt_info()["originalLayersNames"] = getFusedNames(node) + node->get_friendly_name();
|
||||
ngraph::replace_node(node, subgraph);
|
||||
update_out_tensor_name(subgraph);
|
||||
op::update_out_tensor_name(subgraph);
|
||||
};
|
||||
|
||||
auto abort_with_strategy = [&](const std::string& message_reset,
|
||||
@ -456,10 +460,15 @@ TokenizeSnippets::TokenizeSnippets() {
|
||||
// Result op has a single input
|
||||
internal_inputs.push_back(source_result->input_value(0));
|
||||
} else {
|
||||
// We have to save explicitly FQ Constants to call ConstantFolding after Tokenization.
|
||||
// After ConstantFolding we will move remaining non-scalar Constants from body using ConvertConstantsToParameters pass
|
||||
if ((utils::is_scalar_constant(input_node)) ||
|
||||
(ov::is_type<ov::op::v0::Constant>(input_node) && ov::is_type<ov::op::v0::FakeQuantize>(node))) {
|
||||
// We need some non-scalar constants inside Subgraph in the following cases:
|
||||
// [*] We have to save explicitly FQ Constants to call ConstantFolding after Tokenization.
|
||||
// After ConstantFolding we will move remaining non-scalar Constants from body using ConvertConstantsToParameters pass
|
||||
// [*] We support Transpose with second Constant input (represents order). This Constant will not be scheduled
|
||||
// and will only be used to decompose Transpose into a proper Load, Store and Loop combination.
|
||||
if (ov::is_type<ngraph::opset1::Constant>(input_node) &&
|
||||
(ngraph::shape_size(input_value.get_shape()) == 1 ||
|
||||
ov::is_type<ov::op::v0::FakeQuantize>(node) ||
|
||||
op::Subgraph::constant_input_should_be_inside_body(node))) {
|
||||
internal_inputs.push_back(input_node->output(0));
|
||||
} else {
|
||||
external_inputs.push_back(input_value);
|
||||
@ -489,18 +498,24 @@ TokenizeSnippets::TokenizeSnippets() {
|
||||
// than the actual number of Constants during tokenization.
|
||||
// To avoid unsupported number of non-scalar Constants in the future (plugin specific limitation)
|
||||
// we should calculate potentional number of non-scalar Constants that will be moved up from body.
|
||||
size_t hidden_non_scalar_constant_count = 0;
|
||||
size_t hidden_data_count = 0;
|
||||
bool need_buffer = false;
|
||||
if (const auto fq_node = ov::as_type_ptr<ov::op::v0::FakeQuantize>(node)) {
|
||||
hidden_non_scalar_constant_count += ngraph::snippets::utils::get_non_scalar_constant_count_for_fq(fq_node);
|
||||
hidden_data_count += ngraph::snippets::utils::get_non_scalar_constant_count_for_fq(fq_node);
|
||||
// Ops require a Buffer
|
||||
} else if (ov::is_type<ov::op::v1::Softmax>(node) ||
|
||||
ov::is_type<ov::op::v8::Softmax>(node)) {
|
||||
need_buffer |= true;
|
||||
}
|
||||
|
||||
ResultVector body_results;
|
||||
std::vector<std::set<Input<Node>>> subgraph_result_inputs;
|
||||
|
||||
for (auto subgraph : input_subgraphs) {
|
||||
// we should summurize non-scalar Constants count from all input subgraphs
|
||||
// because we will collapse them with our node and we should get total count of non-scalar Constants
|
||||
hidden_non_scalar_constant_count += ov::as_type_ptr<ngraph::snippets::op::Subgraph>(subgraph)->get_non_scalar_constants_count();
|
||||
// we should summurize additional needed data count (non-scalar Constants and Buffers) from all input subgraphs
|
||||
// because we will collapse them with our node and we should get total count
|
||||
hidden_data_count += ov::as_type_ptr<ngraph::snippets::op::Subgraph>(subgraph)->get_virtual_port_count();
|
||||
need_buffer |= ov::as_type_ptr<ngraph::snippets::op::Subgraph>(subgraph)->is_buffer_needed();
|
||||
|
||||
for (auto output : subgraph->outputs()) {
|
||||
bool first_side_consumer = true;
|
||||
@ -541,13 +556,13 @@ TokenizeSnippets::TokenizeSnippets() {
|
||||
}
|
||||
|
||||
// todo: move this plugin-specific constraint to the plugin callback
|
||||
if (body_parameters.size() + body_results.size() + hidden_non_scalar_constant_count > 12) {
|
||||
if (body_parameters.size() + body_results.size() + hidden_data_count + static_cast<size_t>(need_buffer) > 12) {
|
||||
const std::string message_reset = "new subgraph is created. Impossible to schedule subgraph with " +
|
||||
std::to_string(body_parameters.size()) + " inputs, " + std::to_string(body_results.size()) + " outputs and " +
|
||||
std::to_string(hidden_non_scalar_constant_count) + " non-scalar constants.";
|
||||
std::to_string(hidden_data_count) + " non-scalar constants and " + std::to_string(need_buffer) + "buffers.";
|
||||
const std::string message_abort = "failed to continue subgraph. Impossible to schedule subgraph with " +
|
||||
std::to_string(body_parameters.size()) + " inputs, " + std::to_string(body_results.size()) + " outputs and " +
|
||||
std::to_string(hidden_non_scalar_constant_count) + " non-scalar constants.";
|
||||
std::to_string(hidden_data_count) + " non-scalar constants and " + std::to_string(need_buffer) + "buffers.";
|
||||
return abort_with_strategy(message_reset, message_abort);
|
||||
}
|
||||
|
||||
@ -557,7 +572,7 @@ TokenizeSnippets::TokenizeSnippets() {
|
||||
}
|
||||
auto subgraph = op::build_subgraph(node, external_inputs, body, subgraph_name);
|
||||
copy_runtime_info(replaced_nodes, subgraph);
|
||||
const auto & act_body = subgraph->body();
|
||||
const auto& act_body = subgraph->body();
|
||||
for (size_t i = 0; i < act_body.get_parameters().size(); i++) {
|
||||
act_body.get_parameters()[i]->set_friendly_name(body_parameters[i]->get_friendly_name());
|
||||
}
|
||||
@ -574,16 +589,17 @@ TokenizeSnippets::TokenizeSnippets() {
|
||||
target_input.replace_source_output(subgraph->output(i));
|
||||
}
|
||||
}
|
||||
update_out_tensor_name(subgraph);
|
||||
op::update_out_tensor_name(subgraph);
|
||||
|
||||
subgraph->validate_and_infer_types();
|
||||
|
||||
const auto & act_body1 = subgraph->body();
|
||||
const auto& act_body1 = subgraph->body();
|
||||
for (size_t i = 0; i < act_body1.get_parameters().size(); i++) {
|
||||
act_body1.get_parameters()[i]->set_friendly_name(body_parameters[i]->get_friendly_name());
|
||||
}
|
||||
subgraph->get_rt_info()["originalLayersNames"] = fusedNames;
|
||||
subgraph->set_non_scalar_constants_count(hidden_non_scalar_constant_count);
|
||||
subgraph->set_virtual_port_count(hidden_data_count);
|
||||
subgraph->set_buffer_needed(need_buffer);
|
||||
|
||||
remark(1) << "Replacement (merge) done for: "
|
||||
<< subgraph->get_friendly_name()
|
||||
|
@ -11,7 +11,10 @@
|
||||
|
||||
#include "transformations/utils/utils.hpp"
|
||||
#include "snippets/pass/fq_decomposition.hpp"
|
||||
#include "snippets/pass/softmax_reshape_elimination.hpp"
|
||||
#include "snippets/pass/explicit_transpose_matmul_inputs.hpp"
|
||||
#include "snippets/op/subgraph.hpp"
|
||||
#include "snippets/utils.hpp"
|
||||
#include "snippets/itt.hpp"
|
||||
|
||||
NGRAPH_RTTI_DEFINITION(ngraph::snippets::pass::CommonOptimizations, "Snippets::CommonOptimizations", 0);
|
||||
@ -31,7 +34,11 @@ void ConvertConstantsToParameters(const std::shared_ptr<ngraph::snippets::op::Su
|
||||
|
||||
for (auto& op : body->get_ops()) {
|
||||
auto constant = ov::as_type_ptr<ov::op::v0::Constant>(op);
|
||||
if (!(constant && ngraph::shape_size(constant->get_shape()) != 1ul))
|
||||
if (!constant || ngraph::shape_size(constant->get_shape()) == 1ul)
|
||||
continue;
|
||||
|
||||
const auto child = constant->get_output_target_inputs(0).begin()->get_node()->shared_from_this();
|
||||
if (op::Subgraph::constant_input_should_be_inside_body(child))
|
||||
continue;
|
||||
|
||||
auto parameter = std::make_shared<opset1::Parameter>(constant->get_element_type(), constant->output(0).get_partial_shape());
|
||||
@ -67,9 +74,11 @@ CommonOptimizations::CommonOptimizations() {
|
||||
// Then if Subgraph contains FakeQuantize we enable specific transformation for quantized subgraphs.
|
||||
ngraph::pass::Manager manager;
|
||||
manager.register_pass<ngraph::snippets::pass::TransformConvertToConvertTruncation>();
|
||||
manager.register_pass<ngraph::snippets::pass::ExplicitTransposeMatMulInputs>();
|
||||
if (is_quantized) {
|
||||
manager.register_pass<ngraph::snippets::pass::CommonFakeQuantizeDecomposition>();
|
||||
}
|
||||
manager.register_pass<snippets::pass::SoftmaxReshapeElimination>();
|
||||
manager.run_passes(body);
|
||||
|
||||
// At the moment only non-scalar Constants of FakeQuantize can be inside Subgraph
|
||||
|
@ -20,11 +20,16 @@ ngraph::snippets::pass::ConvertConstantsToScalars::ConvertConstantsToScalars() {
|
||||
ngraph::graph_rewrite_callback callback = [this](ngraph::pattern::Matcher &m) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ConvertConstantsToScalars")
|
||||
auto constant = as_type_ptr<ov::op::v0::Constant>(m.get_match_root());
|
||||
auto scalar = std::make_shared<snippets::op::Scalar>(*constant);
|
||||
if (ov::shape_size(constant->get_output_shape(0)) != 1)
|
||||
return false;
|
||||
// Note that all Constants {1,1,1,1} are converted to Scalar {1} here
|
||||
// This is needed to simplify shape inference, otherwise {1,1,1,1} Constants can increase output rank
|
||||
// Also some operations support only scalar shapes, so we need separate scalars and shape [1]
|
||||
const auto shape = constant->get_output_shape(0).size() == 0 ? ov::Shape{} : ov::Shape{1};
|
||||
auto scalar = std::make_shared<snippets::op::Scalar>(ov::op::v0::Constant(*constant, shape));
|
||||
scalar->set_friendly_name(constant->get_friendly_name());
|
||||
ngraph::copy_runtime_info(constant, scalar);
|
||||
ngraph::replace_node(constant, scalar);
|
||||
|
||||
return true;
|
||||
};
|
||||
register_matcher(std::make_shared<ov::pass::pattern::Matcher>(constants), callback);
|
||||
|
@ -0,0 +1,83 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <snippets/itt.hpp>
|
||||
|
||||
#include "snippets/pass/explicit_transpose_matmul_inputs.hpp"
|
||||
#include "snippets/pass/transpose_decomposition.hpp"
|
||||
#include "snippets/op/subgraph.hpp"
|
||||
|
||||
#include <ngraph/opsets/opset8.hpp>
|
||||
#include <ngraph/rt_info.hpp>
|
||||
#include <ngraph/pattern/op/wrap_type.hpp>
|
||||
|
||||
|
||||
|
||||
ngraph::snippets::pass::ExplicitTransposeMatMulInputs::ExplicitTransposeMatMulInputs() {
|
||||
MATCHER_SCOPE(ExplicitTransposeMatMulInputs);
|
||||
|
||||
auto m_matmul0 = std::make_shared<ngraph::opset1::MatMul>(
|
||||
ngraph::pattern::any_input(ngraph::pattern::has_static_shape()),
|
||||
ngraph::pattern::any_input(ngraph::pattern::has_static_shape()));
|
||||
|
||||
register_matcher(std::make_shared<ngraph::pattern::Matcher>(m_matmul0, matcher_name),
|
||||
[=](ngraph::pattern::Matcher &m) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ExplicitTransposeMatMulInputs")
|
||||
auto root = m.get_match_root();
|
||||
bool rewritten = false;
|
||||
|
||||
auto matmul0 = ngraph::as_type_ptr<ngraph::opset1::MatMul>(root);
|
||||
if (!matmul0)
|
||||
return false;
|
||||
|
||||
for (size_t i = 0; i < matmul0->get_input_size(); i++) {
|
||||
if (i == 0 && !matmul0->get_transpose_a())
|
||||
continue;
|
||||
if (i == 1 && !matmul0->get_transpose_b())
|
||||
continue;
|
||||
|
||||
auto parent1 = matmul0->get_input_node_shared_ptr(i);
|
||||
auto transpose1 = ngraph::as_type_ptr<ngraph::opset1::Transpose>(parent1);
|
||||
while (!transpose1 && !ov::is_type<ngraph::opset1::Parameter>(parent1)) {
|
||||
// We can set supported order and transposed_b(false) only if ops have scalar shapes to avoid shape mismatching
|
||||
const auto parent_count = parent1->inputs().size();
|
||||
bool are_weights_scalar = true;
|
||||
for (size_t j = 1; j < parent_count; ++j) {
|
||||
are_weights_scalar = are_weights_scalar && ngraph::shape_size(parent1->get_input_shape(j)) == 1;
|
||||
}
|
||||
if (!are_weights_scalar)
|
||||
break;
|
||||
|
||||
parent1 = parent1->get_input_node_shared_ptr(0);
|
||||
transpose1 = ngraph::as_type_ptr<ngraph::opset1::Transpose>(parent1);
|
||||
}
|
||||
if (!transpose1)
|
||||
continue;
|
||||
|
||||
const auto transpose_pattern = ngraph::as_type_ptr<ngraph::opset1::Constant>(transpose1->get_input_node_shared_ptr(1));
|
||||
if (!transpose_pattern)
|
||||
continue;
|
||||
|
||||
auto transposed_order = transpose_pattern->cast_vector<int32_t>();
|
||||
std::swap(*transposed_order.rbegin(), *(transposed_order.rbegin() + 1));
|
||||
if (pass::TransposeDecomposition::supported_cases.count(transposed_order) == 0)
|
||||
continue;
|
||||
|
||||
auto new_transpose_order = std::make_shared<ngraph::opset1::Constant>(transpose_pattern->get_element_type(),
|
||||
ngraph::Shape{4},
|
||||
transposed_order);
|
||||
new_transpose_order->set_friendly_name(transpose_pattern->get_friendly_name());
|
||||
ngraph::copy_runtime_info(transpose_pattern, new_transpose_order);
|
||||
transpose1->set_argument(1, new_transpose_order);
|
||||
if (i == 0) {
|
||||
matmul0->set_transpose_a(false);
|
||||
} else {
|
||||
matmul0->set_transpose_b(false);
|
||||
}
|
||||
rewritten |= true;
|
||||
}
|
||||
|
||||
return rewritten;
|
||||
});
|
||||
}
|
86
src/common/snippets/src/pass/fuse_transpose_brgemm.cpp
Normal file
86
src/common/snippets/src/pass/fuse_transpose_brgemm.cpp
Normal file
@ -0,0 +1,86 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "snippets/itt.hpp"
|
||||
|
||||
#include "snippets/pass/fuse_transpose_brgemm.hpp"
|
||||
#include "snippets/snippets_isa.hpp"
|
||||
|
||||
#include "snippets/utils.hpp"
|
||||
|
||||
#include "ngraph/opsets/opset1.hpp"
|
||||
#include "ngraph/rt_info.hpp"
|
||||
#include "ngraph/pattern/op/wrap_type.hpp"
|
||||
#include "openvino/pass/pattern/op/or.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace pass {
|
||||
const std::set<std::vector<int>> FuseTransposeBrgemm::supported_cases = {{0, 2, 1, 3}};
|
||||
FuseTransposeBrgemm::FuseTransposeBrgemm() {
|
||||
MATCHER_SCOPE(FuseTransposeBrgemm);
|
||||
auto transpose_is_supported = [](const Output<Node>& transpose_port) {
|
||||
const auto transpose_node = transpose_port.get_node_shared_ptr();
|
||||
// it's safe to do so because of the patterns we used. alternatively we can do it through pattern_values_map
|
||||
const auto& constant = as_type_ptr<ngraph::opset1::Constant>(transpose_node->get_input_node_shared_ptr(1));
|
||||
// if Transpose in and out layout is not empty => something was already fused on this port
|
||||
if (!utils::get_node_output_layout(transpose_node).empty() ||
|
||||
!utils::get_node_output_layout(transpose_node->get_input_node_shared_ptr(0)).empty())
|
||||
return false;
|
||||
const auto& transpose_order = constant->cast_vector<int>();
|
||||
// todo: this limitation is due to the fact that offsets are calculated in Kernel, and the only way
|
||||
// to calc them non-default way is to set Parameter rt_info field. This limitation can be removed if
|
||||
// the rt_info is properly propagated to the corresponding parameter
|
||||
if (!is_type<ngraph::opset1::Parameter>(transpose_node->get_input_node_shared_ptr(0)) ||
|
||||
supported_cases.count(transpose_order) == 0)
|
||||
return false;
|
||||
return true;
|
||||
};
|
||||
auto constant = pattern::wrap_type<opset1::Constant>();
|
||||
auto transpose = pattern::wrap_type<opset1::Transpose>({pattern::any_input(), constant}, transpose_is_supported);
|
||||
auto transpose_matcher = std::make_shared<pattern::Matcher>(transpose);
|
||||
auto brgemm_any = pattern::wrap_type<op::Brgemm>({pattern::any_input(), pattern::any_input()});
|
||||
|
||||
auto brgemm_in0 = pattern::wrap_type<op::Brgemm>({transpose, pattern::any_input()});
|
||||
auto brgemm_in1 = pattern::wrap_type<op::Brgemm>({pattern::any_input(), transpose});
|
||||
auto brgemm_out0 = pattern::wrap_type<opset1::Transpose>({brgemm_any, constant});
|
||||
auto brgemm_or_transpose = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{brgemm_in0, brgemm_in1, brgemm_out0});
|
||||
|
||||
auto callback = [=](pattern::Matcher& m) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "FuseTransposeBrgemm")
|
||||
auto set_layout_from_order = [](const std::shared_ptr<opset1::Transpose>& node, const ov::Output<Node>& port) {
|
||||
const auto& const_order = as_type_ptr<opset1::Constant>(node->get_input_node_shared_ptr(1));
|
||||
std::vector<size_t> layout = const_order->cast_vector<size_t>();
|
||||
auto& rt_info = port.get_node_shared_ptr()->get_rt_info();
|
||||
rt_info["Layout"] = layout;
|
||||
};
|
||||
auto brgemm = as_type_ptr<op::Brgemm>(m.get_match_root());
|
||||
// Transpose on the Brgemm's output
|
||||
if (!brgemm) {
|
||||
brgemm = as_type_ptr<op::Brgemm>(m.get_match_root()->get_input_node_shared_ptr(0));
|
||||
const auto& brgemm_out = brgemm->output(0);
|
||||
const auto& transpose_out = m.get_match_value();
|
||||
for (const auto& in : transpose_out.get_target_inputs())
|
||||
in.replace_source_output(brgemm->output(0));
|
||||
set_layout_from_order(as_type_ptr<opset1::Transpose>(transpose_out.get_node_shared_ptr()), brgemm_out);
|
||||
}
|
||||
for (int i = 0; i < brgemm->get_input_size(); i++) {
|
||||
const auto& in_value = brgemm->input_value(i);
|
||||
if (transpose_matcher->match(in_value)) {
|
||||
const auto& transpose = as_type_ptr<opset1::Transpose>(in_value.get_node_shared_ptr());
|
||||
set_layout_from_order(transpose, transpose->input_value(0));
|
||||
brgemm->set_argument(i, transpose->input_value(0));
|
||||
}
|
||||
}
|
||||
// need to run validate_and_infer_types manually: either input shapes were updated or
|
||||
// output Layout was updated (out shape will be updated in validate_and_infer_types())
|
||||
brgemm->validate_and_infer_types();
|
||||
return true;
|
||||
};
|
||||
register_matcher(std::make_shared<pattern::Matcher>(brgemm_or_transpose, matcher_name), callback);
|
||||
}
|
||||
|
||||
} // namespace pass
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
96
src/common/snippets/src/pass/insert_buffer.cpp
Normal file
96
src/common/snippets/src/pass/insert_buffer.cpp
Normal file
@ -0,0 +1,96 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <snippets/itt.hpp>
|
||||
#include "snippets/remarks.hpp"
|
||||
|
||||
#include "snippets/pass/insert_buffer.hpp"
|
||||
#include "snippets/snippets_isa.hpp"
|
||||
|
||||
#include <ngraph/rt_info.hpp>
|
||||
#include <ngraph/pattern/op/wrap_type.hpp>
|
||||
|
||||
ngraph::snippets::pass::InsertBuffer::InsertBuffer(const int32_t allocation_rank) {
|
||||
MATCHER_SCOPE(InsertBuffer);
|
||||
// The list of operations that require Buffers on their Inputs and Outputs
|
||||
const auto pattern = ngraph::pattern::wrap_type<ngraph::op::v1::Softmax,
|
||||
ngraph::op::v8::Softmax,
|
||||
ngraph::op::v1::Transpose,
|
||||
op::Brgemm>();
|
||||
|
||||
register_matcher(std::make_shared<ngraph::pattern::Matcher>(pattern, matcher_name),
|
||||
[this, allocation_rank](ngraph::pattern::Matcher &m) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertBuffer")
|
||||
auto root = m.get_match_root();
|
||||
bool rewritten = false;
|
||||
|
||||
// check if already has Buffer, Parameter or Constant as an input
|
||||
for (const auto& input : root->inputs()) {
|
||||
const auto input_node = input.get_source_output().get_node()->shared_from_this();
|
||||
if (!ov::is_type<ngraph::snippets::op::Buffer>(input_node) &&
|
||||
!ov::is_type<ngraph::op::v0::Parameter>(input_node) &&
|
||||
!ov::is_type<ngraph::op::v0::Constant>(input_node)) {
|
||||
const auto buffer = std::make_shared<ngraph::snippets::op::Buffer>(input_node, allocation_rank);
|
||||
root->set_argument(input.get_index(), buffer);
|
||||
rewritten |= true;
|
||||
}
|
||||
if (ov::is_type<op::Buffer>(input.get_source_output().get_node_shared_ptr()) &&
|
||||
input.get_source_output().get_target_inputs().size() != 1) {
|
||||
throw ngraph::ngraph_error(
|
||||
"If Buffer is a input for operation output, this Buffer should be a single consumer for this port");
|
||||
}
|
||||
}
|
||||
|
||||
// check if already has Buffer or outputs is Result
|
||||
for (const auto& output : root->outputs()) {
|
||||
const auto target_inputs = output.get_target_inputs();
|
||||
if (target_inputs.size() > 1) {
|
||||
for (const auto& consumer : target_inputs) {
|
||||
const auto output_node = consumer.get_node()->shared_from_this();
|
||||
if (ov::is_type<ngraph::snippets::op::Buffer>(output_node)) {
|
||||
// If some of children from one common port are different Buffers,
|
||||
// we should remove them to insert one common Buffer on one common port
|
||||
replace_output_update_name(output_node->output(0), output_node->input_value(0));
|
||||
} else if (ov::is_type<ngraph::op::v0::Result>(output_node)) {
|
||||
// TODO: At this moment operation which is should be wrapped by Buffers doesn't support several childs where one of them is Result
|
||||
// because Result and Buffer from one root port should have the same register. It's not supported at the moment
|
||||
// For example,
|
||||
// Buffer
|
||||
// |
|
||||
// Softmax
|
||||
// / \
|
||||
// Buffer Result
|
||||
throw ngraph::ngraph_error(
|
||||
"Operation which is should be wrapped by Buffers has few children from one output port where one of them is Result");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const auto buffer = std::make_shared<ngraph::snippets::op::Buffer>(output, allocation_rank);
|
||||
for (const auto& consumer : output.get_target_inputs()) {
|
||||
const auto output_node = consumer.get_node()->shared_from_this();
|
||||
if (output_node != buffer &&
|
||||
!ov::is_type<ngraph::snippets::op::Buffer>(output_node) &&
|
||||
!ov::is_type<ngraph::op::v0::Result>(output_node)) {
|
||||
consumer.replace_source_output(buffer);
|
||||
rewritten |= true;
|
||||
}
|
||||
}
|
||||
|
||||
const auto new_target_inputs = output.get_target_inputs();
|
||||
const auto has_buffer_on_output = std::any_of(new_target_inputs.begin(), new_target_inputs.end(), [](const ov::Input<ov::Node>& consumer) {
|
||||
const auto child = consumer.get_node()->shared_from_this();
|
||||
// We check for count of target inputs of Buffer output because
|
||||
// we created Buffer op with root input previously for the next possible insertions
|
||||
// Thus, if Buffer wasn't inserted, this op doesn't have target inputs on output
|
||||
return ov::is_type<ngraph::snippets::op::Buffer>(child) && child->output(0).get_target_inputs().size() > 0;
|
||||
});
|
||||
if (has_buffer_on_output && new_target_inputs.size() != 1) {
|
||||
throw ngraph::ngraph_error(
|
||||
"If Buffer is a input for operation output, this Buffer should be a single consumer for this port");
|
||||
}
|
||||
}
|
||||
return rewritten;
|
||||
});
|
||||
}
|
@ -15,15 +15,23 @@
|
||||
ngraph::snippets::pass::InsertLoad::InsertLoad(const size_t count) {
|
||||
MATCHER_SCOPE(InsertLoad);
|
||||
register_matcher(std::make_shared<ngraph::pattern::Matcher>(
|
||||
ngraph::pattern::wrap_type<ngraph::opset1::Parameter>(), matcher_name),
|
||||
ngraph::pattern::wrap_type<ngraph::opset1::Parameter, ngraph::snippets::op::Buffer>(), matcher_name),
|
||||
[this, count](ngraph::pattern::Matcher &m) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertLoad")
|
||||
auto root = m.get_match_root();
|
||||
|
||||
// check if already has Load as an output
|
||||
for (auto output : root->outputs()) {
|
||||
for (auto consumer : output.get_target_inputs()) {
|
||||
if (ov::is_type<ngraph::snippets::op::Load>(consumer.get_node())) {
|
||||
for (const auto& output : root->outputs()) {
|
||||
for (const auto& consumer : output.get_target_inputs()) {
|
||||
// if a parameter is connected to a Load => we don't need another one
|
||||
// if a parameter is connected to LoopBegin => there must be Load inside the Loop
|
||||
// if a parameter is connected to MatMul => we don't need Load (read/write is encapsulated into the brgemm emitter)
|
||||
// (it's the responsibility of transformation that inserted the Loops)
|
||||
const auto& consumer_node = consumer.get_node();
|
||||
if (ov::is_type<ngraph::snippets::op::Load>(consumer_node) ||
|
||||
ov::is_type<ngraph::snippets::op::LoopBegin>(consumer_node) ||
|
||||
ov::is_type<ngraph::op::v0::MatMul>(consumer_node) ||
|
||||
ov::is_type<ngraph::op::v1::Transpose>(consumer_node)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@ -33,8 +41,8 @@ ngraph::snippets::pass::InsertLoad::InsertLoad(const size_t count) {
|
||||
ngraph::copy_runtime_info(root, load);
|
||||
|
||||
bool rewritten = false;
|
||||
for (auto output : root->outputs()) {
|
||||
for (auto consumer : output.get_target_inputs()) {
|
||||
for (const auto& output : root->outputs()) {
|
||||
for (const auto& consumer : output.get_target_inputs()) {
|
||||
if (consumer.get_node()->shared_from_this() != load) {
|
||||
consumer.replace_source_output(load);
|
||||
rewritten |= true;
|
||||
@ -49,19 +57,23 @@ ngraph::snippets::pass::InsertLoad::InsertLoad(const size_t count) {
|
||||
ngraph::snippets::pass::InsertStore::InsertStore(const size_t count) {
|
||||
MATCHER_SCOPE(InsertStore);
|
||||
register_matcher(std::make_shared<ngraph::pattern::Matcher>(
|
||||
ngraph::pattern::wrap_type<ngraph::opset1::Result>(), matcher_name),
|
||||
ngraph::pattern::wrap_type<ngraph::opset1::Result, ngraph::snippets::op::Buffer>(), matcher_name),
|
||||
[this, count](ngraph::pattern::Matcher &m) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertStore")
|
||||
auto root = m.get_match_root();
|
||||
|
||||
// check if already has Store as an input
|
||||
for (auto input : root->inputs()) {
|
||||
if (ov::is_type<ngraph::snippets::op::Store>(input.get_source_output().get_node())) {
|
||||
for (const auto& input : root->inputs()) {
|
||||
const auto& parent_node = input.get_source_output().get_node();
|
||||
if (ov::is_type<ngraph::snippets::op::Store>(parent_node) ||
|
||||
ov::is_type<ngraph::snippets::op::LoopEnd>(parent_node) ||
|
||||
ov::is_type<ngraph::op::v0::MatMul>(parent_node) ||
|
||||
ov::is_type<ngraph::op::v1::Transpose>(parent_node)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
auto store = std::make_shared<ngraph::snippets::op::Store> (root->input_value(0), count);
|
||||
auto store = std::make_shared<ngraph::snippets::op::Store>(root->input_value(0), count);
|
||||
ngraph::copy_runtime_info(root, store);
|
||||
root->set_argument(0, store);
|
||||
return true;
|
||||
|
285
src/common/snippets/src/pass/insert_loops.cpp
Normal file
285
src/common/snippets/src/pass/insert_loops.cpp
Normal file
@ -0,0 +1,285 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <snippets/itt.hpp>
|
||||
#include "snippets/pass/insert_loops.hpp"
|
||||
#include "snippets/pass/loop_helpers.hpp"
|
||||
#include "snippets/snippets_isa.hpp"
|
||||
#include "snippets/utils.hpp"
|
||||
|
||||
#include <ngraph/rt_info.hpp>
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace pass {
|
||||
|
||||
InsertLoops::InsertLoops(ov::PartialShape master_shape, size_t loop_depth, size_t vector_size, bool single_loop_body)
|
||||
: m_master_shape(std::move(master_shape)), m_loop_depth(loop_depth), m_vector_size(vector_size), m_single_loop_body(single_loop_body) {
|
||||
if (m_master_shape.size() < m_loop_depth)
|
||||
throw ngraph_error("InsertLoops can't insert loops: master shape rank is too small");
|
||||
}
|
||||
|
||||
std::vector<bool> InsertLoops::calculate_inner_apply_increments(const ov::PartialShape& master,
|
||||
const std::vector<ov::PartialShape>& shapes) {
|
||||
// Inner Loop applies increments if a dimension is not broadcasted
|
||||
std::vector<bool> apply_increments;
|
||||
apply_increments.reserve(shapes.size());
|
||||
std::transform(shapes.begin(), shapes.end(), std::back_inserter(apply_increments),
|
||||
[=](const ov::PartialShape& ps) { return utils::get_inner_dim(ps) != 1 && utils::get_inner_dim(master) != 1; });
|
||||
return apply_increments;
|
||||
}
|
||||
std::vector<bool> InsertLoops::calculate_outer_apply_increments(const std::vector<ov::PartialShape>& shapes) {
|
||||
// Outer Loop applies increments only if a corresponding lower dim was broadcasted (or all lower dims == 1)
|
||||
std::vector<bool> apply_increments;
|
||||
apply_increments.reserve(shapes.size());
|
||||
std::transform(shapes.begin(), shapes.end(), std::back_inserter(apply_increments),
|
||||
[=](const ov::PartialShape& ps) { return utils::get_outer_dim(ps) != 1 && utils::get_inner_dim(ps) == 1; });
|
||||
return apply_increments;
|
||||
}
|
||||
std::vector<int64_t> InsertLoops::calculate_finalization_offsets(const ov::PartialShape& master,
|
||||
const std::vector<ov::PartialShape>& shapes) {
|
||||
const auto inner_work_amount = utils::get_inner_dim(master).get_length();
|
||||
std::vector<int64_t> inner_finalization_offsets(shapes.size(), 0);
|
||||
std::transform(shapes.begin(), shapes.end(), inner_finalization_offsets.begin(),
|
||||
[=](const ov::PartialShape& ps) {
|
||||
return utils::get_outer_dim(ps) == 1 && utils::get_inner_dim(ps) != 1 ? -inner_work_amount : 0;
|
||||
});
|
||||
return inner_finalization_offsets;
|
||||
}
|
||||
|
||||
void insert_loops_explicitly(const ov::NodeVector& ops, const size_t vector_size) {
|
||||
ov::NodeVector body;
|
||||
ov::NodeVector body_remainder;
|
||||
ov::OutputVector body_parameters;
|
||||
std::vector<ov::Input<ov::Node>> body_results;
|
||||
|
||||
// check for potential parameters for new Loop
|
||||
auto add_body_parameters = [](const std::shared_ptr<ov::Node>& op, ov::OutputVector& body_parameters) {
|
||||
for (const auto& input : op->inputs()) {
|
||||
auto parent = input.get_source_output().get_node_shared_ptr();
|
||||
if (ov::is_type<op::LoopEnd>(parent) ||
|
||||
ov::is_type<op::Buffer>(parent) ||
|
||||
ov::is_type<ov::op::v0::Parameter>(parent) ||
|
||||
ov::is_type<op::Brgemm>(parent)) {
|
||||
body_parameters.push_back(input.get_source_output());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// check for potential results for new Loop
|
||||
auto add_body_results = [](const std::shared_ptr<ov::Node>& op, std::vector<ov::Input<ov::Node>>& body_results) {
|
||||
for (const auto& output : op->outputs()) {
|
||||
for (const auto& target_input : output.get_target_inputs()) {
|
||||
auto child = target_input.get_node();
|
||||
if (ov::is_type<op::LoopBegin>(child) ||
|
||||
ov::is_type<op::Buffer>(child) ||
|
||||
ov::is_type<ov::op::v0::Result>(child) ||
|
||||
ov::is_type<op::Brgemm>(child)) {
|
||||
body_results.push_back(target_input);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// check for potential missing body ops for new loop
|
||||
std::function<void(const std::shared_ptr<ov::Node>& op, ov::NodeVector& body)> add_missing_body_ops;
|
||||
add_missing_body_ops = [&](const std::shared_ptr<ov::Node>& op, ov::NodeVector& body) {
|
||||
if (body_remainder.size()) {
|
||||
for (const auto& input : op->inputs()) {
|
||||
auto parent = input.get_source_output().get_node_shared_ptr();
|
||||
auto iter = std::find(body_remainder.begin(), body_remainder.end(), parent);
|
||||
if (iter != body_remainder.end()) {
|
||||
*std::back_inserter(body) = std::move(*iter);
|
||||
add_missing_body_ops(parent, body);
|
||||
add_body_parameters(parent, body_parameters);
|
||||
add_body_results(op, body_results);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
auto wrap_body_by_loop = [&](const ov::NodeVector& body, const ov::OutputVector& body_parameters, const std::vector<ov::Input<ov::Node>>& body_results) {
|
||||
NGRAPH_CHECK(!body_parameters.empty(), "The count of parameters for loop should be more than zero to create loop");
|
||||
NGRAPH_CHECK(!body_results.empty(), "The count of results for loop should be more than zero to create loop");
|
||||
std::vector<ov::PartialShape> body_shapes;
|
||||
const auto count_io = body_parameters.size() + body_results.size();
|
||||
body_shapes.reserve(count_io);
|
||||
std::transform(body_parameters.begin(), body_parameters.end(), std::back_inserter(body_shapes),
|
||||
[](const ov::Output<ov::Node>& out) { return out.get_partial_shape(); });
|
||||
std::transform(body_results.begin(), body_results.end(), std::back_inserter(body_shapes),
|
||||
[](const ov::Input<ov::Node>& in) { return in.get_partial_shape(); });
|
||||
|
||||
auto body_master_shape = body_shapes.front();
|
||||
for (const auto& shape : body_shapes) {
|
||||
NGRAPH_CHECK(PartialShape::broadcast_merge_into(body_master_shape, shape, ::ngraph::op::AutoBroadcastType::NUMPY),
|
||||
"Loop input and output must be numpy broadcastable");
|
||||
}
|
||||
const auto inner_work_amount = utils::get_inner_dim(body_master_shape).get_length();
|
||||
const auto outer_work_amount = utils::get_outer_dim(body_master_shape).get_length();
|
||||
|
||||
auto apply_increments = InsertLoops::calculate_inner_apply_increments(body_master_shape, body_shapes);
|
||||
std::vector<int64_t> inner_finalization_offsets(body_shapes.size(), 0);
|
||||
if (outer_work_amount > 1) {
|
||||
inner_finalization_offsets = InsertLoops::calculate_finalization_offsets(body_master_shape, body_shapes);
|
||||
}
|
||||
|
||||
const auto& inner_loop_begin = op::insertLoopBeginAfterOutputs(body_parameters);
|
||||
const auto& inner_loop_end = op::insertLoopEndBeforeInputs(
|
||||
body_results, inner_loop_begin, inner_work_amount, vector_size,
|
||||
apply_increments, inner_finalization_offsets);
|
||||
// set internal flag to enable scalar vs vector loop optimizations
|
||||
inner_loop_end->has_outer_loop = outer_work_amount > 1;
|
||||
// Due to features of topological sort, some Constants (Scalars) may appear right after Parameters in
|
||||
// sorted ops (so it's between Parameters and LoopBegin). Consequently, ScalarEmitters would be called
|
||||
// outside the Loop, and only the first Loop iteration would yield correct data (assuming the vector reg
|
||||
// assigned to scalar will get corrupted inside the loop body). To avoid such cases, we add control dependency
|
||||
// on LoopBegin to guarantee that the constants are executed inside the Loop.
|
||||
for (const auto& n : body) {
|
||||
if (auto c = std::dynamic_pointer_cast<ov::op::v0::Constant>(n)) {
|
||||
c->add_control_dependency(inner_loop_begin);
|
||||
}
|
||||
}
|
||||
|
||||
if (outer_work_amount > 1) {
|
||||
std::vector<bool> apply_increments = InsertLoops::calculate_outer_apply_increments(body_shapes);
|
||||
std::vector<int64_t> outer_finalization_offsets(body_shapes.size(), 0);
|
||||
const auto& outer_loop_begin = op::insertLoopBegin(body_parameters);
|
||||
op::insertLoopEnd(body_results, outer_loop_begin, outer_work_amount, 1lu,
|
||||
apply_increments, outer_finalization_offsets);
|
||||
}
|
||||
};
|
||||
|
||||
auto op_is_outside_loop = [](const std::shared_ptr<ov::Node>& op) -> bool {
|
||||
if (ov::is_type<ov::op::v0::Parameter>(op) ||
|
||||
ov::is_type<ov::op::v0::Result>(op) ||
|
||||
ov::is_type<op::Buffer>(op))
|
||||
return true;
|
||||
auto& rt = op->get_rt_info();
|
||||
auto outside_rt = rt.find("outside_loop");
|
||||
bool is_outside = false;
|
||||
// If rt info isn't setted it means that op should be inside loop by default
|
||||
if (outside_rt != rt.end()) {
|
||||
is_outside = outside_rt->second.as<bool>();
|
||||
}
|
||||
return is_outside;
|
||||
};
|
||||
|
||||
for (auto iter = ops.begin(); iter < ops.end(); iter++) {
|
||||
const auto op = *iter;
|
||||
// Need to check for that op should be inside or outside loop
|
||||
if (op_is_outside_loop(op)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// If we meet loopBegin or Brgemm, it means that all previous nodes from ordered body
|
||||
// should be in one body. It's like stop signal
|
||||
const auto& loop_begin = ov::as_type_ptr<op::LoopBegin>(op);
|
||||
const auto& brgemm = ov::as_type_ptr<op::Brgemm>(op);
|
||||
if (loop_begin || brgemm) {
|
||||
if (!body.empty()) {
|
||||
if (!body_results.empty()) {
|
||||
wrap_body_by_loop(body, body_parameters, body_results);
|
||||
} else {
|
||||
// If there aren't body results, it means that the current body ops are inputs of the next some operations in ordered_ops
|
||||
// So this set of the current body ops is part of the future body loop.
|
||||
// We should save them to add in body ops in the future
|
||||
std::move(body.begin(), body.end(), std::back_inserter(body_remainder));
|
||||
}
|
||||
}
|
||||
|
||||
// we should skip the next existing Loop body
|
||||
if (loop_begin) {
|
||||
const auto &loop_end = loop_begin->get_loop_end();
|
||||
iter = std::find(iter, ops.end(), loop_end);
|
||||
}
|
||||
|
||||
// clear loop body to create the next
|
||||
body.clear();
|
||||
body_parameters.clear();
|
||||
body_results.clear();
|
||||
} else {
|
||||
add_missing_body_ops(op, body);
|
||||
add_body_parameters(op, body_parameters);
|
||||
add_body_results(op, body_results);
|
||||
|
||||
body.push_back(op);
|
||||
}
|
||||
}
|
||||
|
||||
if (!body.empty()) {
|
||||
wrap_body_by_loop(body, body_parameters, body_results);
|
||||
}
|
||||
}
|
||||
|
||||
bool InsertLoops::run_on_model(const std::shared_ptr<ov::Model> &model) {
|
||||
RUN_ON_FUNCTION_SCOPE(InsertLoops);
|
||||
if (m_master_shape.is_dynamic())
|
||||
throw ngraph_error("InsertLoops doesn't support dynamic shapes yet");
|
||||
|
||||
const auto inner_work_amount = utils::get_inner_dim(m_master_shape).get_length();
|
||||
const auto outer_work_amount = m_loop_depth == 2 ? utils::get_outer_dim(m_master_shape).get_length() : 1;
|
||||
|
||||
auto ops = model->get_ordered_ops();
|
||||
ParameterVector commonParams = model->get_parameters();
|
||||
// Note that topological sort parses node arguments in reversed order, but results are added - in direct order
|
||||
// So ve need to pass the reversed results to LoopEnd to keep the original traversal order in topological sorter
|
||||
const auto& orig_results = model->get_results();
|
||||
ResultVector commonResults(orig_results.rbegin(), orig_results.rend());
|
||||
std::vector<PartialShape> ioShapes;
|
||||
|
||||
const auto& body_rt_info = model->get_rt_info();
|
||||
const auto& plugin_shapes = body_rt_info.find("PluginShapesOverride");
|
||||
if (plugin_shapes == body_rt_info.end()) {
|
||||
throw ngraph_error("InsertLoops requires PluginShapesOverride rt_info field");
|
||||
} else {
|
||||
const auto& new_shapes = plugin_shapes->second.as<std::vector<std::vector<size_t>>>();
|
||||
if (new_shapes.size() != commonResults.size() + commonParams.size())
|
||||
throw ngraph_error("InsertLoops got invalid number of plugin-overriden shapes");
|
||||
for (int i = 0; i < commonParams.size(); i++)
|
||||
ioShapes.emplace_back(new_shapes[i]);
|
||||
// reverse overriden_shapes for results since commonResults are reversed with respect to model->get_parameters()
|
||||
for (int i = 0; i < commonResults.size(); i++)
|
||||
ioShapes.emplace_back(new_shapes[new_shapes.size() - 1 - i]);
|
||||
}
|
||||
|
||||
if (inner_work_amount > 0) {
|
||||
if (m_single_loop_body) {
|
||||
const auto apply_increments = InsertLoops::calculate_inner_apply_increments(m_master_shape, ioShapes);
|
||||
std::vector<int64_t> inner_finalization_offsets(ioShapes.size(), 0);
|
||||
if (outer_work_amount > 1) {
|
||||
inner_finalization_offsets = InsertLoops::calculate_finalization_offsets(m_master_shape, ioShapes);
|
||||
}
|
||||
const auto& inner_loop_begin = op::insertLoopBegin(commonParams);
|
||||
const auto& inner_loop_end = insertLoopEnd(commonResults, inner_loop_begin, inner_work_amount,
|
||||
m_vector_size, apply_increments, inner_finalization_offsets);
|
||||
// set internal flag to enable scalar vs vector loop optimizations
|
||||
inner_loop_end->has_outer_loop = outer_work_amount > 1;
|
||||
// Due to features of topological sort, some Constants (Scalars) may appear right after Parameters in
|
||||
// sorted ops (so it's between Parameters and LoopBegin). Consequently, ScalarEmitters would be called
|
||||
// outside the Loop, and only the first Loop iteration would yield correct data (assuming the vector reg
|
||||
// assigned to scalar will get corrupted inside the loop body). To avoid such cases, we add control dependency
|
||||
// on LoopBegin to guarantee that the constants are executed inside the Loop.
|
||||
for (const auto& n : model->get_ordered_ops()) {
|
||||
if (auto c = std::dynamic_pointer_cast<ov::op::v0::Constant>(n))
|
||||
c->add_control_dependency(inner_loop_begin);
|
||||
else if (n == inner_loop_begin)
|
||||
break;
|
||||
}
|
||||
|
||||
if (outer_work_amount > 1) {
|
||||
std::vector<bool> apply_increments = InsertLoops::calculate_outer_apply_increments(ioShapes);
|
||||
const auto& outer_loop_begin = op::insertLoopBegin(commonParams);
|
||||
op::insertLoopEnd(commonResults, outer_loop_begin, outer_work_amount, 1lu, apply_increments);
|
||||
}
|
||||
} else {
|
||||
insert_loops_explicitly(ops, m_vector_size);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace pass
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
@ -7,6 +7,8 @@
|
||||
|
||||
#include "snippets/pass/insert_movebroadcast.hpp"
|
||||
#include "snippets/snippets_isa.hpp"
|
||||
#include "snippets/utils.hpp"
|
||||
#include <ngraph/pattern/op/wrap_type.hpp>
|
||||
|
||||
#include <ngraph/opsets/opset1.hpp>
|
||||
#include <ngraph/rt_info.hpp>
|
||||
@ -17,43 +19,47 @@ using namespace ngraph;
|
||||
|
||||
namespace {
|
||||
|
||||
std::shared_ptr<ngraph::Node> broadcast_node_last_dim(const ngraph::Output<ngraph::Node>& value,
|
||||
const ov::Shape& target_shape, const ov::Shape& normalized_shape) {
|
||||
std::shared_ptr<ngraph::Node> broadcasted_node = value.get_node_shared_ptr();
|
||||
|
||||
if (target_shape == value.get_shape()) {
|
||||
return broadcasted_node;
|
||||
}
|
||||
// Insert BroadcastMove only if the last dimension needs to be broadcasted. Higher-level dims broadcasting
|
||||
// will be handled by pointer arithmetics in TileScheduler
|
||||
if (*target_shape.rbegin() != *normalized_shape.rbegin()) {
|
||||
ov::Shape broadcasted_shape = normalized_shape;
|
||||
*broadcasted_shape.rbegin() = *target_shape.rbegin();
|
||||
broadcasted_node = std::make_shared<ngraph::snippets::op::BroadcastMove>(broadcasted_node, broadcasted_shape);
|
||||
}
|
||||
|
||||
return broadcasted_node;
|
||||
}
|
||||
|
||||
|
||||
std::pair<ov::Shape, std::vector<ov::Shape>> get_numpy_broadcast_shapes(const std::vector<ov::Shape>& input_shapes) {
|
||||
std::pair<ov::PartialShape, std::vector<ov::PartialShape>> get_numpy_broadcast_partial_shapes(const std::vector<ov::PartialShape>& input_shapes) {
|
||||
ov::PartialShape target_shape = input_shapes.front();
|
||||
for (auto i = 1; i < input_shapes.size(); i++) {
|
||||
if (!ov::PartialShape::broadcast_merge_into(target_shape, input_shapes[i], op::AutoBroadcastType::NUMPY))
|
||||
throw ngraph::ngraph_error("InsertMoveBroadcast: Failed broadcast-merge input shapes");
|
||||
}
|
||||
std::vector<ov::Shape> normalized_shapes;
|
||||
std::vector<ov::PartialShape> normalized_shapes;
|
||||
for (const auto& input : input_shapes) {
|
||||
ov::Shape padded_shape{input};
|
||||
ov::PartialShape padded_shape{input};
|
||||
padded_shape.insert(padded_shape.begin(), target_shape.size() - padded_shape.size(), 1);
|
||||
normalized_shapes.push_back(std::move(padded_shape));
|
||||
}
|
||||
|
||||
return {target_shape.get_shape(), normalized_shapes};
|
||||
return {target_shape, normalized_shapes};
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
ngraph::Output<ngraph::Node> ngraph::snippets::pass::InsertMoveBroadcast::BroadcastNodeLastDim(
|
||||
const ngraph::Output<ngraph::Node>& value, const ov::PartialShape& target_shape, const ov::PartialShape& normalized_shape) {
|
||||
if (target_shape == value.get_partial_shape()) {
|
||||
return value;
|
||||
}
|
||||
|
||||
// Insert BroadcastMove only if the last dimension needs to be broadcasted. Higher-level dims broadcasting
|
||||
// will be handled by pointer arithmetics inside outer LoopEmitter
|
||||
if (*target_shape.rbegin() != *normalized_shape.rbegin()) {
|
||||
ov::PartialShape broadcasted_shape = normalized_shape;
|
||||
*broadcasted_shape.rbegin() = *target_shape.rbegin();
|
||||
const auto broadcast_node = std::make_shared<ngraph::snippets::op::BroadcastMove>(value, broadcasted_shape);
|
||||
// BroadcastMove should be immediately executed after its input op (input op is node with output which should be broadcasted).
|
||||
// For example, to execute Broadcast outside of a Loop We transfer control dependents and copy rt info
|
||||
broadcast_node->add_node_control_dependents(value.get_node_shared_ptr());
|
||||
ov::copy_runtime_info(value.get_node_shared_ptr(), broadcast_node);
|
||||
|
||||
return broadcast_node->output(0);
|
||||
}
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
ngraph::snippets::pass::InsertMoveBroadcast::InsertMoveBroadcast() {
|
||||
MATCHER_SCOPE(InsertMoveBroadcast);
|
||||
ngraph::graph_rewrite_callback callback = [this](ngraph::pattern::Matcher &m) {
|
||||
@ -64,31 +70,35 @@ ngraph::snippets::pass::InsertMoveBroadcast::InsertMoveBroadcast() {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto is_scalar_constant = [](const ov::Output<ov::Node>& v){
|
||||
if (auto constant = ov::as_type_ptr<ov::op::v0::Constant>(v.get_node_shared_ptr())) {
|
||||
if (constant->get_shape().empty() || ngraph::shape_size(constant->get_shape()) == 1) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
auto is_ignored_node = [](const ov::Output<ov::Node>& v){
|
||||
// We don't need to insert BroadcastMove after the following operations:
|
||||
// - Scalar has emitter with explicit broadcasting
|
||||
// - VectorBuffer has scalar output shape to avoid broadcast conflicts and manually shape insertion.
|
||||
return utils::is_scalar_constant(v.get_node_shared_ptr()) ||
|
||||
ov::is_type<ngraph::snippets::op::VectorBuffer>(v.get_node_shared_ptr());
|
||||
};
|
||||
std::vector<ov::Shape> input_shapes;
|
||||
std::vector<bool> ignore_as_scalar;
|
||||
std::vector<ov::PartialShape> input_shapes;
|
||||
std::vector<bool> is_ignored;
|
||||
for (const auto& val : values) {
|
||||
input_shapes.emplace_back(val.get_shape());
|
||||
ignore_as_scalar.push_back(is_scalar_constant(val));
|
||||
input_shapes.emplace_back(val.get_partial_shape());
|
||||
is_ignored.push_back(is_ignored_node(val));
|
||||
// Do not insert MoveBroadcast if any of the last dims is dynamic,
|
||||
// since we don't know if we really need it. In these cases, broadcasting will be performed
|
||||
// by outer Loop based on runtime shapes.
|
||||
if (!is_ignored.back() && !input_shapes.back().rbegin()->is_static())
|
||||
return false;
|
||||
}
|
||||
|
||||
// find the output tensor's shape, then broadcast all inputs so that they are compatible with respect to the last dim
|
||||
auto bcast_shapes = get_numpy_broadcast_shapes(input_shapes);
|
||||
auto bcast_shapes = get_numpy_broadcast_partial_shapes(input_shapes);
|
||||
|
||||
ngraph::OutputVector broadcasted_inputs;
|
||||
for (size_t i = 0; i < values.size(); ++i) {
|
||||
if (ignore_as_scalar[i]) {
|
||||
if (is_ignored[i]) {
|
||||
broadcasted_inputs.push_back(values[i]);
|
||||
} else {
|
||||
auto node = broadcast_node_last_dim(values[i], bcast_shapes.first, bcast_shapes.second[i]);
|
||||
ngraph::copy_runtime_info(root, node);
|
||||
auto node = BroadcastNodeLastDim(values[i], bcast_shapes.first, bcast_shapes.second[i]);
|
||||
ngraph::copy_runtime_info(root, node.get_node_shared_ptr());
|
||||
broadcasted_inputs.push_back(node);
|
||||
}
|
||||
}
|
||||
|
@ -34,10 +34,10 @@ ngraph::snippets::pass::LoadMoveBroadcastToBroadcastLoad::LoadMoveBroadcastToBro
|
||||
return false;
|
||||
}
|
||||
|
||||
auto inshape = root->input(0).get_shape();
|
||||
auto outshape = root->output(0).get_shape();
|
||||
auto inshape = root->input(0).get_partial_shape();
|
||||
auto outshape = root->output(0).get_partial_shape();
|
||||
|
||||
auto broadcastload = std::make_shared<snippets::op::BroadcastLoad>(param, outshape);
|
||||
auto broadcastload = std::make_shared<snippets::op::BroadcastLoad>(param, outshape, ov::as_type_ptr<snippets::op::Load>(input)->get_offset());
|
||||
ngraph::copy_runtime_info(root, broadcastload);
|
||||
ngraph::replace_node(root, broadcastload);
|
||||
|
||||
|
331
src/common/snippets/src/pass/loop_fusion.cpp
Normal file
331
src/common/snippets/src/pass/loop_fusion.cpp
Normal file
@ -0,0 +1,331 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <snippets/itt.hpp>
|
||||
#include <ngraph/rt_info.hpp>
|
||||
#include <ngraph/pattern/op/wrap_type.hpp>
|
||||
|
||||
#include "snippets/snippets_isa.hpp"
|
||||
#include "snippets/pass/loop_fusion.hpp"
|
||||
#include "snippets/utils.hpp"
|
||||
|
||||
namespace {
|
||||
using InputSet = std::set<ov::Input<ov::Node>>;
|
||||
using Edge = std::pair<ov::Output<ov::Node>, InputSet>;
|
||||
|
||||
auto can_be_merged(const std::shared_ptr<ngraph::snippets::op::LoopEnd>& loop_end_up,
|
||||
const std::shared_ptr<ngraph::snippets::op::LoopBegin>& loop_begin_down) -> bool {
|
||||
if (!loop_end_up || !loop_begin_down)
|
||||
return false;
|
||||
|
||||
const auto loop_end_down = loop_begin_down->get_loop_end();
|
||||
const auto loop_begin_up = loop_end_up->get_loop_begin();
|
||||
if (loop_end_down->get_work_amount() != loop_end_up->get_work_amount() ||
|
||||
loop_end_down->get_increment() != loop_end_up->get_increment())
|
||||
return false;
|
||||
|
||||
// If between Loops there are common dependencies (for example, reducing operations), we cannot merge these Loops
|
||||
// Example, when there is HorizonMax op between Loops:
|
||||
// Data
|
||||
// VectorBuffer LoopBegin
|
||||
// \ Load | \
|
||||
// Maximum | /
|
||||
// / LoopEnd
|
||||
// HorizonMax |
|
||||
// \ LoopBegin
|
||||
// \ Load \
|
||||
// Subtract |
|
||||
// Store /
|
||||
// LoopEnd
|
||||
auto up_dependent_ptrs = loop_end_up->get_control_dependents();
|
||||
ov::NodeVector up_dependents(up_dependent_ptrs.size(), nullptr);
|
||||
std::transform(up_dependent_ptrs.begin(), up_dependent_ptrs.end(), up_dependents.begin(), [](ngraph::Node* node) { return node->shared_from_this(); });
|
||||
auto down_dependencies = loop_begin_down->get_control_dependencies();
|
||||
std::sort(up_dependents.begin(), up_dependents.end());
|
||||
std::sort(down_dependencies.begin(), down_dependencies.end());
|
||||
std::vector<std::shared_ptr<ov::Node>> common_nodes;
|
||||
std::set_intersection(up_dependents.begin(), up_dependents.end(), down_dependencies.begin(), down_dependencies.end(),
|
||||
std::back_inserter(common_nodes));
|
||||
// TODO: Add check for sequence/subgraph of depending nodes between Loops.
|
||||
// At these moment we should have full list of dependencies and dependents of Loops to find intersection,
|
||||
// not just first dependent of LoopEnd and first dependency of LoopBegin
|
||||
return common_nodes.size() == 0;
|
||||
}
|
||||
|
||||
auto get_buffer_and_loop_end(const std::shared_ptr<ngraph::snippets::op::LoopBegin>& loop_begin_down,
|
||||
std::shared_ptr<ngraph::snippets::op::LoopEnd>& loop_end_up,
|
||||
std::shared_ptr<ngraph::snippets::op::Buffer>& buffer) -> bool {
|
||||
size_t fusion_input_num = 0;
|
||||
for (const auto& parent : loop_begin_down->input_values()) {
|
||||
const auto parent_shared = parent.get_node_shared_ptr();
|
||||
if (ov::is_type<ov::op::v0::Constant>(parent_shared) ||
|
||||
ov::is_type<ov::op::v0::Parameter>(parent_shared) ||
|
||||
ov::is_type<ngraph::snippets::op::LoopBegin>(parent_shared))
|
||||
continue;
|
||||
|
||||
// We can fuse Loops even LoopBegin has several the same inputs (the common Buffer/LoopEnd)
|
||||
if (buffer && buffer == parent_shared || !buffer && loop_end_up && loop_end_up == parent_shared)
|
||||
continue;
|
||||
|
||||
loop_end_up = ngraph::as_type_ptr<ngraph::snippets::op::LoopEnd>(parent_shared);
|
||||
buffer = ov::as_type_ptr<ngraph::snippets::op::Buffer>(parent_shared);
|
||||
if (buffer) {
|
||||
if (buffer->output(0).get_target_inputs().size() == 0 ||
|
||||
buffer->get_input_size() != 1 ||
|
||||
buffer->get_input_source_output(0).get_target_inputs().size() != 1)
|
||||
return false;
|
||||
|
||||
loop_end_up = ngraph::as_type_ptr<ngraph::snippets::op::LoopEnd>(buffer->get_input_node_shared_ptr(0));
|
||||
}
|
||||
if (loop_end_up)
|
||||
fusion_input_num++;
|
||||
}
|
||||
|
||||
return fusion_input_num == 1;
|
||||
}
|
||||
|
||||
auto collect_loop_inputs(const std::shared_ptr<ngraph::snippets::op::LoopBegin>& loop_begin,
|
||||
const std::shared_ptr<ngraph::snippets::op::Buffer>& buffer,
|
||||
std::vector<Edge>& new_loop_inputs,
|
||||
std::vector<int64_t>& new_ptr_increments,
|
||||
std::vector<int64_t>& new_finalization_offsets) -> void {
|
||||
const auto loop_end = loop_begin->get_loop_end();
|
||||
const auto ptr_increments = loop_end->get_ptr_increments();
|
||||
const auto finalization_offsets = loop_end->get_finalization_offsets();
|
||||
for (size_t i = 0; i < loop_begin->get_input_size(); i++) {
|
||||
const auto input = loop_begin->input(i);
|
||||
// Skip target Buffer
|
||||
if (input.get_source_output().get_node_shared_ptr() != buffer) {
|
||||
const auto edge = Edge{ input.get_source_output(),
|
||||
loop_begin->output(input.get_index()).get_target_inputs() };
|
||||
new_loop_inputs.push_back(edge);
|
||||
new_ptr_increments.push_back(ptr_increments[i]);
|
||||
new_finalization_offsets.push_back(finalization_offsets[i]);
|
||||
// Remove LoopBegin from Parent as target input
|
||||
input.get_source_output().remove_target_input(input);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto collect_loop_outputs(const std::shared_ptr<ngraph::snippets::op::LoopEnd>& loop_end,
|
||||
const std::shared_ptr<ngraph::snippets::op::Buffer>& buffer,
|
||||
std::vector<Edge>& new_loop_outputs,
|
||||
std::vector<int64_t>& new_ptr_increments,
|
||||
std::vector<int64_t>& new_finalization_offsets,
|
||||
const bool reduce_max_case) -> bool {
|
||||
const auto loop_begin = loop_end->get_loop_begin();
|
||||
const auto ptr_increments = loop_end->get_ptr_increments();
|
||||
const auto finalization_offsets = loop_end->get_finalization_offsets();
|
||||
bool is_current_reduce_max_case = false;
|
||||
for (size_t i = 0; i < loop_end->get_output_size(); i++) {
|
||||
// ReduceMax case. When Loop cannot have empty output as ngraph op,
|
||||
// we should have fake edge through all Loops (LoopBegin->LoopEnd) which connect src and dst data.
|
||||
// If we merge these this Loop and Loop Before, we should remove this fake edge
|
||||
// because now we have real data for storing
|
||||
auto new_input_node = loop_end->get_input_node_shared_ptr(i);
|
||||
if (ov::is_type<ngraph::snippets::op::LoopBegin>(new_input_node)) {
|
||||
// We set temporary boolean variable because this value is for the next LoopEnd (upper), not for the current LoopEnd
|
||||
is_current_reduce_max_case = true;
|
||||
// Remove LoopEnd from Parent as target input
|
||||
loop_end->input_value(i).remove_target_input(loop_end->input(i));
|
||||
} else {
|
||||
const auto output = loop_end->output(i);
|
||||
// Skip target Buffer
|
||||
InputSet target_inputs;
|
||||
for (const auto& input : output.get_target_inputs()) {
|
||||
if (input.get_node()->shared_from_this() != buffer || reduce_max_case) {
|
||||
target_inputs.insert(input);
|
||||
}
|
||||
}
|
||||
|
||||
if (target_inputs.size()) {
|
||||
const auto edge = Edge{loop_end->input_value(output.get_index()), target_inputs};
|
||||
new_loop_outputs.push_back(edge);
|
||||
new_ptr_increments.push_back(ptr_increments[loop_begin->get_input_size() + i]);
|
||||
new_finalization_offsets.push_back(finalization_offsets[loop_begin->get_input_size() + i]);
|
||||
// Remove LoopEnd from Parent as target input
|
||||
loop_end->input_value(i).remove_target_input(loop_end->input(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return is_current_reduce_max_case;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
|
||||
bool ngraph::snippets::pass::LoopFusion::Merge(const std::shared_ptr<op::LoopBegin>& loop_begin_down) {
|
||||
if (!loop_begin_down) {
|
||||
return false;
|
||||
}
|
||||
|
||||
std::shared_ptr<ngraph::snippets::op::LoopEnd> loop_end_up = nullptr;
|
||||
std::shared_ptr<ngraph::snippets::op::Buffer> buffer = nullptr;
|
||||
// Initialize the corresponding upper LoopEnd and Buffer
|
||||
if (!get_buffer_and_loop_end(loop_begin_down, loop_end_up, buffer)) {
|
||||
return false;
|
||||
}
|
||||
// Check for conditions of fusion
|
||||
if (!can_be_merged(loop_end_up, loop_begin_down)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto loop_end_down = loop_begin_down->get_loop_end();
|
||||
const auto loop_begin_up = loop_end_up->get_loop_begin();
|
||||
const auto new_input_count = loop_begin_up->get_input_size() + loop_begin_down->get_input_size();
|
||||
const auto new_output_count = loop_end_up->get_output_size() + loop_end_down->get_output_size();
|
||||
const auto new_io_count = new_input_count + new_output_count;
|
||||
const auto ptr_increments_up = loop_end_up->get_ptr_increments();
|
||||
const auto ptr_increments_down = loop_end_down->get_ptr_increments();
|
||||
const auto finalization_offsets_up = loop_end_up->get_finalization_offsets();
|
||||
const auto finalization_offsets_down = loop_end_down->get_finalization_offsets();
|
||||
std::vector<int64_t> new_ptr_increments, new_finalization_offsets;
|
||||
new_ptr_increments.reserve(new_io_count);
|
||||
new_finalization_offsets.reserve(new_io_count);
|
||||
|
||||
// Collect new loop inputs
|
||||
std::vector<Edge> loop_inputs;
|
||||
loop_inputs.reserve(new_input_count);
|
||||
new_ptr_increments.reserve(new_io_count);
|
||||
new_finalization_offsets.reserve(new_io_count);
|
||||
collect_loop_inputs(loop_begin_up, buffer, loop_inputs, new_ptr_increments, new_finalization_offsets);
|
||||
collect_loop_inputs(loop_begin_down, buffer, loop_inputs, new_ptr_increments, new_finalization_offsets);
|
||||
|
||||
// Collect new Loop outputs
|
||||
std::vector<Edge> loop_outputs;
|
||||
loop_outputs.reserve(new_output_count);
|
||||
// We can fuse Loop with maximum accumulator pattern only with Smth input
|
||||
// So firstly, we analyze LoopEnd down (it's possible maximum accumulator pattern), set `reduce_max_case` variable
|
||||
// if it's really maximum accumulator pattern, and then analyze LoopEnd up using `reduce_max_case` variable
|
||||
const bool reduce_max_case = collect_loop_outputs(loop_end_down, buffer, loop_outputs, new_ptr_increments, new_finalization_offsets, false);
|
||||
collect_loop_outputs(loop_end_up, buffer, loop_outputs, new_ptr_increments, new_finalization_offsets, reduce_max_case);
|
||||
if (reduce_max_case) {
|
||||
const auto target_inputs = loop_begin_down->output(0).get_target_inputs();
|
||||
NGRAPH_CHECK(target_inputs.size() == 1, "LoopBegin in ReduceMax should have only one consumer (Load) for out port 0");
|
||||
const auto load = ov::as_type_ptr<op::Load>(target_inputs.begin()->get_node()->shared_from_this());
|
||||
NGRAPH_CHECK(load != nullptr, "LoopBegin in ReduceMax should have only one consumer for out port 0 - Load");
|
||||
|
||||
const auto store = ov::as_type_ptr<op::Store>(loop_end_up->get_input_node_shared_ptr(0));
|
||||
NGRAPH_CHECK(store != nullptr, "Before LoopEnd should be Store emitter");
|
||||
|
||||
// Connect vector emitters before Store and after Load
|
||||
load->output(0).replace(store->get_input_source_output(0));
|
||||
}
|
||||
|
||||
const auto new_increment = loop_end_up->get_increment();
|
||||
const auto new_work_amount = loop_end_up->get_work_amount();
|
||||
|
||||
// Create new LoopBegin
|
||||
OutputVector new_loop_begin_inputs;
|
||||
new_loop_begin_inputs.reserve(loop_inputs.size());
|
||||
for (const auto& loop_input : loop_inputs) {
|
||||
const auto data_output = loop_input.first;
|
||||
new_loop_begin_inputs.push_back(data_output);
|
||||
}
|
||||
const auto new_loop_begin = std::make_shared<op::LoopBegin>(new_loop_begin_inputs);
|
||||
NGRAPH_CHECK(new_loop_begin->get_input_size() == loop_inputs.size(), "New LoopBegin has incorrect count of inputs.");
|
||||
|
||||
// Connect new LoopBegin to input edges
|
||||
for (size_t i = 0; i < loop_inputs.size(); i++) {
|
||||
const auto edge = loop_inputs[i];
|
||||
for (auto& target_input : edge.second) {
|
||||
target_input.replace_source_output(new_loop_begin->output(i));
|
||||
}
|
||||
}
|
||||
|
||||
// Create new LoopEnd
|
||||
OutputVector new_loop_end_inputs;
|
||||
new_loop_end_inputs.reserve(loop_outputs.size() + 1); // + 1 - for loop_begin
|
||||
for (const auto& loop_output : loop_outputs) {
|
||||
const auto data_output = loop_output.first;
|
||||
new_loop_end_inputs.push_back(data_output);
|
||||
}
|
||||
new_loop_end_inputs.push_back(new_loop_begin->output(new_loop_begin->get_input_size()));
|
||||
const auto new_loop_end = std::make_shared<op::LoopEnd>(new_loop_end_inputs, new_work_amount, new_increment,
|
||||
new_ptr_increments, new_finalization_offsets);
|
||||
NGRAPH_CHECK(new_loop_end->get_output_size() == loop_outputs.size(), "New LoopEnd has incorrect count of outputs.");
|
||||
// Connect new LoopEnd to output edges
|
||||
for (size_t i = 0; i < loop_outputs.size(); i++) {
|
||||
const auto edge = loop_outputs[i];
|
||||
auto new_output = new_loop_end->output(i);
|
||||
for (auto& target_input : edge.second) {
|
||||
target_input.replace_source_output(new_output);
|
||||
}
|
||||
}
|
||||
|
||||
if (reduce_max_case) {
|
||||
loop_end_down->output(0).replace(buffer->output(0));
|
||||
} else {
|
||||
// Remove old Loops and Load/Store if there are around Buffer
|
||||
for (size_t i = 0; i < loop_end_up->get_input_size() - 1; i++) {
|
||||
auto new_output = loop_end_up->input_value(i);
|
||||
loop_end_up->output(i).replace(new_output);
|
||||
new_output.remove_target_input(loop_end_up->input(i));
|
||||
}
|
||||
for (size_t i = 0; i < loop_begin_down->get_input_size(); i++) {
|
||||
const auto output_target_inputs = loop_begin_down->output(i).get_target_inputs();
|
||||
const auto new_output = loop_begin_down->input_value(i);
|
||||
for (const auto &target_input : output_target_inputs) {
|
||||
target_input.replace_source_output(new_output);
|
||||
}
|
||||
|
||||
// Clear old Buffer children
|
||||
new_output.remove_target_input(loop_begin_down->input(i));
|
||||
}
|
||||
}
|
||||
|
||||
new_loop_end->has_outer_loop = loop_end_down->has_outer_loop || loop_end_up->has_outer_loop;
|
||||
|
||||
loop_begin_up->transfer_control_dependents(new_loop_begin);
|
||||
loop_begin_down->transfer_control_dependents(new_loop_begin);
|
||||
loop_end_up->transfer_control_dependents(new_loop_end);
|
||||
loop_end_down->transfer_control_dependents(new_loop_end);
|
||||
new_loop_begin->add_node_control_dependencies(loop_begin_up);
|
||||
new_loop_begin->add_node_control_dependencies(loop_begin_down);
|
||||
new_loop_end->add_node_control_dependencies(loop_end_up);
|
||||
new_loop_end->add_node_control_dependencies(loop_end_down);
|
||||
|
||||
// If there was Buffer between Loops, after Loop fusion
|
||||
// we should remove the Buffer node and MemoryAccess nodes if it's needed
|
||||
if (buffer) {
|
||||
const auto buffer_input = buffer->get_input_node_shared_ptr(0);
|
||||
const auto buffer_output = buffer->output(0).get_target_inputs().begin()->get_node()->shared_from_this();
|
||||
|
||||
// If after merging there are Load and Store, we should remove them
|
||||
if (const auto store = ov::as_type_ptr<op::Store>(buffer_input)) {
|
||||
store->output(0).replace(store->input_value(0));
|
||||
}
|
||||
if (const auto load = ov::as_type_ptr<op::Load>(buffer_output)) {
|
||||
load->output(0).replace(load->input_value(0));
|
||||
}
|
||||
|
||||
// Remove Buffer if there are no Loops and MatMul after Loop fusion
|
||||
// because only these operations can have Buffer node on inputs and outputs.
|
||||
// So if there aren't, it means that Buffer is extra, and we can remove it
|
||||
if (!ov::is_type<op::LoopBegin>(buffer_output) && !ov::is_type<op::LoopEnd>(buffer_input) &&
|
||||
!ov::is_type<op::Brgemm>(buffer_output) && !ov::is_type<op::Brgemm>(buffer_input)) {
|
||||
buffer->output(0).replace(buffer->input_value(0));
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
ngraph::snippets::pass::LoopFusion::LoopFusion() {
|
||||
MATCHER_SCOPE(LoopFusion);
|
||||
|
||||
auto m_loop_begin = ngraph::pattern::wrap_type<op::LoopBegin>();
|
||||
|
||||
auto callback = [=](ngraph::pattern::Matcher &m) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::LoopFusion")
|
||||
auto& pattern_to_output = m.get_pattern_value_map();
|
||||
const auto loop_begin = ngraph::as_type_ptr<op::LoopBegin>(pattern_to_output.at(m_loop_begin).get_node_shared_ptr());
|
||||
const auto status = Merge(loop_begin);
|
||||
return status;
|
||||
};
|
||||
|
||||
auto matcher = std::make_shared<ngraph::pattern::Matcher>(m_loop_begin, matcher_name);
|
||||
register_matcher(matcher, callback);
|
||||
}
|
48
src/common/snippets/src/pass/loop_helpers.cpp
Normal file
48
src/common/snippets/src/pass/loop_helpers.cpp
Normal file
@ -0,0 +1,48 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "ngraph/op/op.hpp"
|
||||
#include "snippets/pass/loop_helpers.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace op {
|
||||
std::shared_ptr<LoopBegin> insertLoopBeginAfterOutputs(const OutputVector& originalOutputs) {
|
||||
std::vector<std::set<Input<Node>>> originalChildInputs;
|
||||
for (const auto& out : originalOutputs) {
|
||||
originalChildInputs.push_back(out.get_target_inputs());
|
||||
}
|
||||
|
||||
auto loop_begin = std::make_shared<LoopBegin>(originalOutputs);
|
||||
|
||||
for (int i = 0; i < originalChildInputs.size(); i++) {
|
||||
for (auto& input : originalChildInputs[i]) {
|
||||
input.replace_source_output(loop_begin->output(i));
|
||||
}
|
||||
}
|
||||
return loop_begin;
|
||||
}
|
||||
|
||||
std::shared_ptr<LoopEnd> insertLoopEndBeforeInputs(const std::vector<Input<Node>>& originalInputs,
|
||||
const std::shared_ptr<LoopBegin>& loopBegin,
|
||||
size_t work_amount, size_t increment,
|
||||
std::vector<bool> apply_increment,
|
||||
std::vector<int64_t> finalization_offsets) {
|
||||
OutputVector originalParentOutputs;
|
||||
for (const auto& in : originalInputs) {
|
||||
originalParentOutputs.push_back(in.get_source_output());
|
||||
}
|
||||
originalParentOutputs.push_back(loopBegin->output(loopBegin->get_output_size() - 1));
|
||||
auto loop_end = std::make_shared<LoopEnd>(originalParentOutputs, work_amount, increment,
|
||||
std::move(apply_increment), std::move(finalization_offsets));
|
||||
|
||||
for (int i = 0; i < originalInputs.size(); i++) {
|
||||
originalInputs[i].replace_source_output(loop_end->output(i));
|
||||
}
|
||||
return loop_end;
|
||||
}
|
||||
|
||||
} // namespace op
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
45
src/common/snippets/src/pass/matmul_to_brgemm.cpp
Normal file
45
src/common/snippets/src/pass/matmul_to_brgemm.cpp
Normal file
@ -0,0 +1,45 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "snippets/itt.hpp"
|
||||
|
||||
#include "snippets/pass/matmul_to_brgemm.hpp"
|
||||
|
||||
#include "snippets/op/brgemm.hpp"
|
||||
|
||||
#include "ngraph/opsets/opset1.hpp"
|
||||
#include "ngraph/rt_info.hpp"
|
||||
#include "ngraph/pattern/op/wrap_type.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace pass {
|
||||
|
||||
MatMulToBrgemm::MatMulToBrgemm() {
|
||||
MATCHER_SCOPE(MatMulToBrgemm);
|
||||
auto matmul_pattern = ngraph::pattern::wrap_type<ngraph::opset1::MatMul>({ngraph::pattern::any_input(),
|
||||
ngraph::pattern::any_input()});
|
||||
|
||||
auto callback = [=](ngraph::pattern::Matcher& m) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "ov::intel_cpu::pass::MatMulToBrgemm")
|
||||
auto& pm = m.get_pattern_value_map();
|
||||
const auto matmul = as_type_ptr<ngraph::opset1::MatMul>(pm.at(matmul_pattern).get_node_shared_ptr());
|
||||
// Brgemm doesn't support transposed inputs currently, so we don't convert such matmuls
|
||||
if (matmul->get_transpose_a() || matmul->get_transpose_b())
|
||||
return false;
|
||||
|
||||
auto brgemm = std::make_shared<op::Brgemm>(matmul->get_input_source_output(0), matmul->get_input_source_output(1));
|
||||
brgemm->set_friendly_name(matmul->get_friendly_name());
|
||||
ngraph::copy_runtime_info(matmul, brgemm);
|
||||
ngraph::replace_node(matmul, brgemm);
|
||||
return true;
|
||||
};
|
||||
|
||||
auto m = std::make_shared<ngraph::pattern::Matcher>(matmul_pattern, matcher_name);
|
||||
register_matcher(m, callback);
|
||||
}
|
||||
|
||||
} // namespace pass
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
394
src/common/snippets/src/pass/mha_tokenization.cpp
Normal file
394
src/common/snippets/src/pass/mha_tokenization.cpp
Normal file
@ -0,0 +1,394 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <snippets/itt.hpp>
|
||||
|
||||
#include "snippets/pass/mha_tokenization.hpp"
|
||||
#include "snippets/pass/tokenization.hpp"
|
||||
#include "snippets/op/subgraph.hpp"
|
||||
|
||||
#include <ngraph/opsets/opset8.hpp>
|
||||
#include <ngraph/rt_info.hpp>
|
||||
#include <ngraph/pattern/op/wrap_type.hpp>
|
||||
#include <ngraph/validation_util.hpp>
|
||||
|
||||
|
||||
namespace {
|
||||
// MHA tokenization currently handles only static 4D FP32 tensors.
// TODO: Add support of all supported by common tokenization element types
// return ngraph::snippets::pass::TokenizeSnippets::supported_element_types.count(input.get_element_type()) != 0;
auto is_supported_tensor(const ngraph::descriptor::Tensor& t) -> bool {
    if (t.get_element_type() != ngraph::element::f32)
        return false;
    const auto& pshape = t.get_partial_shape();
    return pshape.is_static() && t.get_shape().size() == 4;
}
|
||||
|
||||
// TODO: Add support of FQ, Reshape?
|
||||
// An op can live between the MHA skeleton anchors only if common tokenization
// accepts it AND it is an elementwise-like op (unary/binary arithmetic or Select).
// TODO: Add support of FQ, Reshape?
auto is_supported_op(const std::shared_ptr<ngraph::Node>& node) -> bool {
    if (!ngraph::snippets::pass::TokenizeSnippets::AppropriateForSubgraph(node))
        return false;
    return ngraph::is_type<ngraph::op::util::UnaryElementwiseArithmetic>(node) ||
           ngraph::is_type<ngraph::op::util::BinaryElementwiseArithmetic>(node) ||
           ngraph::is_type<ngraph::op::v1::Select>(node);
}
|
||||
|
||||
// Checks that `node` is a single-consumer 4D Transpose over a supported tensor whose
// order Constant equals `expected_order`.
// Fix: `expected_order` was copied twice (by-value parameter plus by-value lambda
// capture) although it is only compared against — take and capture it by const ref.
auto is_valid_transpose(const std::shared_ptr<ngraph::opset1::Transpose>& node, const std::vector<int64_t>& expected_order) -> bool {
    auto valid_transpose_order = [&expected_order](const std::shared_ptr<ngraph::Node>& node) -> bool {
        const auto transpose_pattern = ngraph::as_type_ptr<ngraph::opset1::Constant>(node);
        if (!transpose_pattern)
            return false;
        return transpose_pattern->cast_vector<int64_t>() == expected_order;
    };

    return node && node->get_output_target_inputs(0).size() == 1 && node->get_shape().size() == 4 &&
           valid_transpose_order(node->get_input_node_shared_ptr(1)) && is_supported_tensor(node->get_input_tensor(0));
}
|
||||
|
||||
auto tokenize_broadcast(const std::shared_ptr<ov::Node>& interm_op, ov::NodeVector& ordered_ops) -> void {
    // We can tokenize Broadcast op only when output shape of child doesn't depend on Broadcast shape without last dimension.
    // Snippets remove Broadcast op and insert BroadcastMove if last dimensions before and after Broadcast are different.
    // Otherwise, we can lose original shape.
    // Example:
    //     in0 [1, 1, 1]    in0 [1, 1, 1]        in0 [1, 1, 1]   in0 [1, 1, 1]
    //   Broadcast [1, 10, 1]    /                       \            /
    //           \          /    --->>>                     Add
    //              Add                                      |
    //          Result [1, 10, 1]                    Result [1, 1, 1]

    ov::PartialShape new_output_shape(std::vector<ov::Dimension>{1});
    ov::NodeVector broadcast_nodes;

    auto skip_last_dim = [](const ov::PartialShape& shape) {
        return ov::PartialShape(std::vector<ov::Dimension>{shape.begin(), shape.end() - 1});
    };

    // Fix: the NUMPY broadcast-merge of a shape (without its last dim) was duplicated
    // verbatim in both branches below — factored into one helper.
    auto merge_skipping_last_dim = [&](const ov::PartialShape& pshape) {
        if (pshape.rank().is_static() && pshape.size() > 2) {
            ov::PartialShape::broadcast_merge_into(new_output_shape,
                                                   skip_last_dim(pshape),
                                                   ::ngraph::op::AutoBroadcastType::NUMPY);
        }
    };

    for (auto input : interm_op->inputs()) {
        auto broadcast = ov::as_type_ptr<ngraph::opset1::Broadcast>(input.get_source_output().get_node_shared_ptr());
        // TODO: Can we reuse AppropriateForSubgraph here? Seems like it's huge check for Broadcast
        if (broadcast && broadcast->get_broadcast_spec().m_type == ov::op::AutoBroadcastType::NUMPY &&
            broadcast->get_output_target_inputs(0).size() == 1) {
            broadcast_nodes.push_back(broadcast);
            merge_skipping_last_dim(broadcast->get_input_partial_shape(0));
        } else {
            merge_skipping_last_dim(input.get_partial_shape());
        }
    }

    // Tokenize the Broadcasts only if the child's output shape (minus last dim)
    // is fully determined by the other inputs, so removing them is safe.
    if (!broadcast_nodes.empty()) {
        if (new_output_shape == skip_last_dim(interm_op->get_output_partial_shape(0))) {
            std::copy(broadcast_nodes.begin(), broadcast_nodes.end(), std::back_inserter(ordered_ops));
        }
    }
}
|
||||
|
||||
// If `interm_op` is a Reshape that keeps the last dimension and has a single consumer,
// tokenizes it (pushes to ordered_ops, advances interm_op past it) and reports success.
// A non-Reshape op is also success (nothing to do); an unsupported Reshape is failure.
auto tokenize_reshape_around_softmax(std::shared_ptr<ov::Node>& interm_op,
                                     std::shared_ptr<ngraph::opset1::Reshape>& reshape,
                                     ngraph::NodeVector& ordered_ops) -> bool {
    reshape = ngraph::as_type_ptr<ngraph::opset1::Reshape>(interm_op);
    if (!reshape)
        return true;

    const bool keeps_last_dim = reshape->get_input_shape(0).back() == reshape->get_output_shape(0).back();
    const auto consumers = reshape->get_output_target_inputs(0);
    if (!keeps_last_dim || consumers.size() != 1)
        return false;

    ordered_ops.push_back(reshape);
    interm_op = consumers.begin()->get_node()->shared_from_this();
    return true;
}
|
||||
|
||||
// Walks down the single-consumer chain of supported elementwise-like ops starting at
// `interm_op`, collecting each into ordered_ops and advancing `interm_op` to the first
// unsupported node. Returns false if a supported op has several consumers or an
// unsupported output tensor.
// TODO: Add Reshape, FQ support
auto update_intermediate_supported_ops(std::shared_ptr<ov::Node>& interm_op, ngraph::NodeVector& ordered_ops) -> bool {
    while (is_supported_op(interm_op)) {
        // All supported intermediate ops have only one output port.
        // Checking the output element type is sufficient: these ops preserve the input type.
        const auto consumers = interm_op->get_output_target_inputs(0);
        if (consumers.size() != 1 || !is_supported_tensor(interm_op->get_output_tensor(0)))
            return false;

        // Multi-input op: try to pull supported Broadcast producers into the subgraph too.
        if (interm_op->get_input_size() > 1)
            tokenize_broadcast(interm_op, ordered_ops);

        ordered_ops.push_back(interm_op);
        interm_op = consumers.begin()->get_node()->shared_from_this();
    }
    return true;
}
|
||||
} // namespace
|
||||
|
||||
// Matches the FP32 MHA pattern (MatMul0 -> [eltwise/Reshape] -> Softmax -> [eltwise/Reshape] -> MatMul1,
// optionally wrapped in Transposes) and collapses the matched ops into a single snippets op::Subgraph.
ngraph::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets() {
    MATCHER_SCOPE(TokenizeMHASnippets);

    // Anchor of the pattern: any MatMul with statically-shaped inputs (validated further in the callback).
    auto m_matmul0 = std::make_shared<ngraph::opset1::MatMul>(ngraph::pattern::any_input(ngraph::pattern::has_static_shape()),
                                                              ngraph::pattern::any_input(ngraph::pattern::has_static_shape()));

    register_matcher(std::make_shared<ngraph::pattern::Matcher>(m_matmul0, matcher_name),
            [=](ngraph::pattern::Matcher &m) {
        OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::TokenizeMHASnippets")
        auto& pattern_to_output = m.get_pattern_value_map();

        // After some transformations, a different number of Constants for some operations may be created
        // than the actual number of Constants during tokenization.
        // To avoid unsupported number of non-scalar Constants in the future (plugin specific limitation)
        // we should calculate potential number of non-scalar Constants that will be moved up from body.
        // TODO: Need update this variable when FQ will be supported
        size_t hidden_virtual_ports_count = 0;
        // Default value is True because MHA pattern always requires Buffer op
        bool need_buffer = true;
        std::string fused_names;
        // Ops collected here, in execution order, become the Subgraph body.
        ngraph::NodeVector ordered_ops;

        /* ======== Matcher Pass ========== */

        /****** Skeleton ******/
        /* Skeleton on MHA-pattern is:
         *              \     /
         *              MatMul0
         *                 |
         * Eltwise/Select/Reshape/FakeQuantize
         *                 |
         *              Softmax
         *                 |
         * Eltwise/Select/Reshape/FakeQuantize
         *                 \   /
         *                MatMul1
         */
        const auto matmul0 = ngraph::as_type_ptr<ngraph::opset1::MatMul>(pattern_to_output.at(m_matmul0).get_node_shared_ptr());
        // MatMul0 must be single-consumer, not transpose_a, and both inputs must be static 4D FP32.
        if (!matmul0 || matmul0->get_output_target_inputs(0).size() != 1 || matmul0->get_transpose_a() ||
            !is_supported_tensor(matmul0->get_input_tensor(0)) || !is_supported_tensor(matmul0->get_input_tensor(1)))
            return false;

        if (transformation_callback(matmul0)) {
            return false;
        }

        ordered_ops.push_back(matmul0);

        auto interm_op = matmul0->get_output_target_inputs(0).begin()->get_node()->shared_from_this();
        // Add supported operations which are between MatMul0 and Softmax to ordered_ops
        if (!update_intermediate_supported_ops(interm_op, ordered_ops))
            return false;

        // Optional Reshape right before Softmax (must be mirrored by one after, see below).
        std::shared_ptr<ngraph::opset1::Reshape> reshape0 = nullptr;
        if (!tokenize_reshape_around_softmax(interm_op, reshape0, ordered_ops))
            return false;

        // The op at this point must be a Softmax (v1 or v8) over the innermost axis.
        int64_t axis = 0;
        const auto rank = interm_op->get_input_partial_shape(0).rank();
        if (const auto softmax_v8 = ngraph::as_type_ptr<ngraph::opset8::Softmax>(interm_op)) {
            // v8 allows negative axes — normalize against the rank.
            axis = ngraph::normalize_axis(interm_op->get_friendly_name(), softmax_v8->get_axis(), rank);
        } else if (const auto softmax_v1 = ngraph::as_type_ptr<ngraph::opset1::Softmax>(interm_op)) {
            axis = softmax_v1->get_axis();
        } else {
            return false;
        }

        if (axis != rank.get_length() - 1 || interm_op->get_output_target_inputs(0).size() != 1)
            return false;
        ordered_ops.push_back(interm_op);

        interm_op = interm_op->get_output_target_inputs(0).begin()->get_node()->shared_from_this();
        // Optional Reshape right after Softmax.
        std::shared_ptr<ngraph::opset1::Reshape> reshape1 = nullptr;
        if (!tokenize_reshape_around_softmax(interm_op, reshape1, ordered_ops))
            return false;

        // The Reshapes around Softmax must come as a matched pair that restores the original shape.
        if (((reshape0 == nullptr) != (reshape1 == nullptr)) ||
            (reshape0 && reshape1 && (reshape0->get_input_shape(0) != reshape1->get_output_shape(0))))
            return false;

        // Add supported operations which are between Softmax and MatMul1 to ordered_ops
        if (!update_intermediate_supported_ops(interm_op, ordered_ops))
            return false;

        // The skeleton ends with a second single-consumer, non-transposing MatMul on supported tensors.
        const auto matmul1 = ngraph::as_type_ptr<ngraph::opset1::MatMul>(interm_op);
        if (!matmul1 || matmul1->get_output_target_inputs(0).size() != 1 || matmul1->get_transpose_a() || matmul1->get_transpose_b() ||
            !is_supported_tensor(matmul1->get_input_tensor(0)) || !is_supported_tensor(matmul1->get_input_tensor(1)))
            return false;

        /***********************/

        /***** Transposes *****/
        /* There may be Transpose and Reshape ops on inputs and outputs of MHA-pattern skeleton
         * We can add them into Subgraph body
         */

        // First input branch of MatMul0 should be executed before second input branch of MatMul0,
        // so firstly we insert Transpose1 on the beginning of ordered_ops and then Transpose0
        bool are_weights_scalar = true;
        auto parent = matmul0->get_input_node_shared_ptr(1);
        // Walk up the second-input branch of MatMul0, prepending supported ops.
        while (is_supported_op(parent)) {
            // All supported ops have only one output port
            // To verify output element type is enough because all supported ops have the same output element type as input type
            if (parent->get_output_target_inputs(0).size() != 1 || !is_supported_tensor(parent->get_output_tensor(0)))
                break;

            // Track whether every extra (non-port-0) input of the collected ops is scalar.
            const auto parent_count = parent->inputs().size();
            for (size_t i = 1; i < parent_count; ++i) {
                are_weights_scalar = are_weights_scalar && ngraph::shape_size(parent->get_input_shape(i)) == 1;
            }
            ordered_ops.insert(ordered_ops.begin(), parent);
            // We think that sequence of ops goes through input port 0
            // But can be Select here? If it can be, parent shouldn't be on input port 0. Need another way?
            parent = parent->get_input_node_shared_ptr(0);
        }

        auto transpose1 = ngraph::as_type_ptr<ngraph::opset1::Transpose>(parent);
        if (matmul0->get_transpose_b()) {
            if (is_valid_transpose(transpose1, {0, 2, 1, 3})) {
                // We can support several ops between MatMul0 with transposed_b and Transpose1 with 0213 order
                // only if these ops have scalar shapes on other inputs.
                // There is transformation ExplicitTransposeMatMulInputs that set supported order and transposed_b(false).
                // We can allow to call this pass only if ops have scalar shapes to avoid shape mismatching
                if (are_weights_scalar) {
                    ordered_ops.insert(ordered_ops.begin(), transpose1);
                } else {
                    return false;
                }
            } else {
                return false;
            }
        } else {
            // transposed_b == false: a 0231-order Transpose on input 1 is optional.
            if (is_valid_transpose(transpose1, {0, 2, 3, 1})) {
                ordered_ops.insert(ordered_ops.begin(), transpose1);
            }
        }

        // TODO: Add Reshape Support for all Transposes
        //       Add 3D support for all Transposes
        const auto transpose0 = ngraph::as_type_ptr<ngraph::opset1::Transpose>(matmul0->get_input_node_shared_ptr(0));
        if (is_valid_transpose(transpose0, {0, 2, 1, 3})) {
            ordered_ops.insert(ordered_ops.begin(), transpose0);
        } else if (matmul0->get_transpose_b()) {
            return false;
        }

        // Optional 0213 Transpose on the second input of MatMul1.
        const auto transpose2 = ngraph::as_type_ptr<ngraph::opset1::Transpose>(matmul1->get_input_node_shared_ptr(1));
        if (is_valid_transpose(transpose2, {0, 2, 1, 3})) {
            ordered_ops.push_back(transpose2);
        }
        ordered_ops.push_back(matmul1);

        auto child = matmul1->get_output_target_inputs(0).begin()->get_node()->shared_from_this();
        // TODO: Add support Eltwises between MatMul1 and Transpose
        // status = update_intermediate_supported_ops(child, ordered_ops);
        // if (!status) {
        //     ordered_ops.push_back(child);
        // }

        // Optional 0213 Transpose on the output of MatMul1.
        auto transpose3 = ngraph::as_type_ptr<ngraph::opset1::Transpose>(child);
        if (is_valid_transpose(transpose3, {0, 2, 1, 3})) {
            ordered_ops.push_back(transpose3);
        }

        /**********************/

        /* ================================ */

        /* ====== Subgraph creation ======= */

        ngraph::OutputVector body_inputs, subgraph_inputs;
        ngraph::ParameterVector body_parameters;
        ngraph::ResultVector body_results;
        std::vector<std::set<Input<Node>>> subgraph_result_inputs;

        // For each input of `node`: route Constants into the body (cloning shared ones whose
        // other consumers stay outside), and replace external producers with new Parameters.
        auto create_body_inputs = [&](const std::shared_ptr<ngraph::Node>& node) -> void {
            for (size_t i = 0; i < node->get_input_size(); ++i) {
                const auto input = node->input(i);
                const auto parent = input.get_source_output().get_node_shared_ptr();
                const auto constant = ov::as_type_ptr<ov::op::v0::Constant>(parent);
                if (constant && (ngraph::shape_size(input.get_shape()) == 1 || op::Subgraph::constant_input_should_be_inside_body(node))) {
                    // If Constant has one consumer - target node, we add Constant to body_inputs
                    // If Constant has several consumers, we should check that all these consumers are inside Subgraph body
                    // and if all of them are inside body, we can explicitly add Constant to the body_inputs, otherwise we should
                    // make a copy and add copy of Constant to body_inputs
                    // For example, this case is especially valid for Transposes nodes
                    // (several Transposes have the same order so there can be the common Constant with this order)
                    if (constant->get_output_target_inputs(0).size() == 1) {
                        body_inputs.push_back(input.get_source_output());
                    } else {
                        const auto constant_consumers = constant->get_output_target_inputs(0);
                        bool all_consumers_are_inside = std::all_of(constant_consumers.begin(), constant_consumers.end(),
                                                                    [&ordered_ops](const ngraph::Input<ngraph::Node>& input) {
                                                                        return std::find(ordered_ops.begin(), ordered_ops.end(),
                                                                                         input.get_node()->shared_from_this()) != ordered_ops.end();
                                                                    });
                        if (all_consumers_are_inside) {
                            body_inputs.push_back(input.get_source_output());
                        } else {
                            const auto constant_copy = constant->clone_with_new_inputs({});
                            node->set_argument(input.get_index(), constant_copy);
                            body_inputs.push_back(constant_copy);
                        }
                    }
                } else if (std::find(ordered_ops.begin(), ordered_ops.end(), parent) == ordered_ops.end()) {
                    // Producer is outside the body: insert a Parameter boundary.
                    auto parameter = std::make_shared<ngraph::opset1::Parameter>(input.get_element_type(), input.get_partial_shape());
                    body_parameters.push_back(parameter);
                    body_parameters.back()->set_friendly_name(input.get_node()->get_friendly_name());
                    body_inputs.push_back(parameter->output(0));

                    subgraph_inputs.push_back(input.get_source_output());

                    node->input(i).replace_source_output(parameter);
                }
            }
        };

        for (const auto& op : ordered_ops) {
            create_body_inputs(op);
            op->clear_control_dependencies();
            fused_names += op->get_friendly_name() + ",";
        }

        // Record external consumers of the last node before rewiring, then mirror its
        // outputs as body Results.
        const auto last_node = ordered_ops.back();
        for (const auto& output : last_node->outputs()) {
            subgraph_result_inputs.push_back(output.get_target_inputs());
        }
        for (const auto& output : last_node->outputs()) {
            body_results.push_back(std::make_shared<ngraph::opset1::Result>(last_node->output(output.get_index())));
        }

        if (body_results.size() != subgraph_result_inputs.size()) {
            throw ngraph_error("body results and node results size mismatch during subgraph collapse");
        }

        // todo: move this plugin-specific constraint to the plugin callback
        if (body_parameters.size() + body_results.size() + hidden_virtual_ports_count > 12) {
            return false;
        }

        auto body = op::create_body(last_node->get_friendly_name(), body_results, body_parameters);
        auto subgraph = std::make_shared<op::Subgraph>(subgraph_inputs, body);
        // Copy runtime info from last node to subgraph - to copy topological order
        copy_runtime_info(last_node, subgraph);
        subgraph->set_friendly_name(last_node->get_friendly_name());

        // Redirect the recorded external consumers to the new Subgraph outputs.
        for (size_t i = 0; i < subgraph->get_output_size(); ++i) {
            for (const auto& target_input : subgraph_result_inputs[i]) {
                target_input.replace_source_output(subgraph->output(i));
            }
        }
        op::update_out_tensor_name(subgraph);

        subgraph->validate_and_infer_types();

        // Propagate the original friendly names onto the body Parameters.
        auto act_body = subgraph->body_ptr();
        for (size_t i = 0; i < act_body->get_parameters().size(); i++) {
            act_body->get_parameters()[i]->set_friendly_name(body_parameters[i]->get_friendly_name());
        }
        subgraph->get_rt_info()["originalLayersNames"] = fused_names;
        subgraph->set_virtual_port_count(hidden_virtual_ports_count);
        subgraph->set_buffer_needed(need_buffer);

        return true;

        /* ================================ */
    });
}
|
114
src/common/snippets/src/pass/reset_buffer.cpp
Normal file
114
src/common/snippets/src/pass/reset_buffer.cpp
Normal file
@ -0,0 +1,114 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <snippets/itt.hpp>
|
||||
#include <ngraph/rt_info.hpp>
|
||||
#include <ngraph/pattern/op/wrap_type.hpp>
|
||||
|
||||
#include "snippets/snippets_isa.hpp"
|
||||
#include "snippets/pass/reset_buffer.hpp"
|
||||
#include "snippets/op/subgraph.hpp"
|
||||
|
||||
|
||||
namespace {
|
||||
// All Buffers share a single data pointer register, so only one Buffer among the
// loop's I/O may keep its pointer arithmetic; every other Buffer gets zeroed
// increments and finalization offsets.
void normalize_ptr_and_offsets(const ov::NodeVector &io, std::vector<int64_t> &ptr_increments, std::vector<int64_t> &finalization_offsets) {
    bool buffer_already_kept = false;
    // Iterations are from end because before we correct finalization offsets for Loop outputs (io = inputs + outputs)
    for (auto i = static_cast<int>(io.size()) - 1; i >= 0; --i) {
        if (!ov::is_type<ngraph::snippets::op::Buffer>(io[i]))
            continue;
        if (buffer_already_kept) {
            ptr_increments[i] = 0;
            finalization_offsets[i] = 0;
        } else {
            buffer_already_kept = true;
        }
    }
}
|
||||
} // namespace
|
||||
|
||||
// Pointer rollback needed after a loop: step back by `back_step` unless the target
// dimension is trivial (work amount == 1), in which case nothing moved.
int64_t ngraph::snippets::pass::ResetBufferState::calculate_required_finalization_offsets(const size_t back_step, const size_t target_work_amount) {
    if (target_work_amount == 1)
        return 0;
    return -static_cast<int64_t>(back_step);
}
|
||||
|
||||
// For every LoopEnd whose outputs feed a Buffer, computes the finalization offsets that
// roll the Buffer data pointer back after the loop, and zeroes pointer arithmetic for
// all but one Buffer (they share a single register).
ngraph::snippets::pass::ResetBufferState::ResetBufferState() {
    MATCHER_SCOPE(ResetBufferState);

    // Match on LoopEnd is enough at the moment because Buffer op may be only after MatMul and LoopEnd, but
    // MatMul doesn't change Buffer memory pointer after execution
    auto m_loop_end = ngraph::pattern::wrap_type<op::LoopEnd>();

    auto callback = [=](ngraph::pattern::Matcher &m) {
        OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ResetBufferState")
        auto& pattern_to_output = m.get_pattern_value_map();

        const auto loop_end = ngraph::as_type_ptr<op::LoopEnd>(pattern_to_output.at(m_loop_end).get_node_shared_ptr());
        const auto loop_begin = loop_end->get_loop_begin();

        // Collect the real (non-Loop) producers/consumers of the loop I/O and their shapes.
        const auto i_size = loop_begin->get_input_size();
        const auto o_size = loop_end->get_output_size();
        const auto count_io = i_size + o_size;
        std::vector<ov::PartialShape> body_shapes(count_io);
        ov::NodeVector io(count_io);
        for (size_t i = 0; i < i_size; ++i) {
            body_shapes[i] = loop_begin->input_value(i).get_partial_shape();
            io[i] = loop_begin->input_value(i).get_node_shared_ptr();
            auto port_idx = loop_begin->input_value(i).get_index();
            // Skip through nested Loop ops to reach the actual producer, tracking the port index.
            while (std::dynamic_pointer_cast<op::LoopBase>(io[i])) {
                const auto source_output = io[i]->input_value(port_idx);
                io[i] = source_output.get_node_shared_ptr();
                port_idx = source_output.get_index();
            }
        }
        for (size_t i = 0; i < o_size; ++i) {
            body_shapes[i_size + i] = loop_end->output(i).get_partial_shape();
            // check for first target input is enough for Buffer searching because operations can have only single Buffer per each output port as op
            auto consumer = *loop_end->output(i).get_target_inputs().begin();
            auto port_idx = consumer.get_index();
            io[i_size + i] = consumer.get_node()->shared_from_this();
            // Skip through nested Loop ops to reach the actual consumer, tracking the port index.
            while (std::dynamic_pointer_cast<op::LoopBase>(io[i_size + i])) {
                auto consumer = *io[i_size + i]->output(port_idx).get_target_inputs().begin();
                port_idx = consumer.get_index();
                io[i_size + i] = consumer.get_node()->shared_from_this();
            }
        }

        auto ptr_increments = loop_end->get_ptr_increments();
        auto finalization_offsets = loop_end->get_finalization_offsets();

        // If after Loop there is immediately Buffer, we should reset the Buffer ptr for the next calculations
        for (size_t i = 0; i < o_size; ++i) {
            const auto result_shape = body_shapes[i_size + i].get_shape();
            // check for first target input is enough for Buffer searching because operations can have only single Buffer per each output port as op
            const auto consumer = loop_end->output(i).get_target_inputs().begin()->get_node();
            if (ov::is_type<ngraph::snippets::op::Buffer>(consumer)) {
                // To calculate finalization offset we should know index of nesting Loop
                auto loop_index = 0lu;
                auto loop = loop_end->input_value(i).get_node_shared_ptr();
                auto port_idx = loop_end->input_value(i).get_index();
                // Count how many LoopEnd ops are chained upstream on this port.
                while (std::dynamic_pointer_cast<op::LoopEnd>(loop)) {
                    const auto source_output = loop->input_value(port_idx);
                    loop = source_output.get_node_shared_ptr();
                    port_idx = source_output.get_index();
                    loop_index++;
                }

                // Work amount covered by this loop nest: product of the innermost
                // (loop_index + 1) dimensions of the result shape.
                const auto work_amount = std::accumulate(result_shape.rbegin(), result_shape.rbegin() + loop_index + 1, size_t(1), std::multiplies<size_t>());
                finalization_offsets[i_size + i] =
                        calculate_required_finalization_offsets(work_amount, *(result_shape.rbegin() + loop_index));
            }
        }

        // If there are several Buffers on I/O we should remember that all Buffer have the register,
        // so we should update ptr for only one Buffer
        normalize_ptr_and_offsets(io, ptr_increments, finalization_offsets);
        loop_end->set_finalization_offsets(finalization_offsets);
        loop_end->set_ptr_increments(ptr_increments);

        return true;
    };

    auto m = std::make_shared<ngraph::pattern::Matcher>(m_loop_end, matcher_name);
    register_matcher(m, callback);
}
|
216
src/common/snippets/src/pass/softmax_decomposition.cpp
Normal file
216
src/common/snippets/src/pass/softmax_decomposition.cpp
Normal file
@ -0,0 +1,216 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "snippets/remarks.hpp"
|
||||
#include <snippets/itt.hpp>
|
||||
|
||||
#include "snippets/pass/softmax_decomposition.hpp"
|
||||
#include "snippets/pass/reset_buffer.hpp"
|
||||
#include "snippets/pass/insert_loops.hpp"
|
||||
#include "snippets/pass/loop_helpers.hpp"
|
||||
#include "snippets/snippets_isa.hpp"
|
||||
|
||||
#include <ngraph/opsets/opset1.hpp>
|
||||
#include <ngraph/rt_info.hpp>
|
||||
#include <ngraph/pattern/op/wrap_type.hpp>
|
||||
#include <ngraph/pattern/op/or.hpp>
|
||||
#include <ngraph/validation_util.hpp>
|
||||
|
||||
|
||||
ngraph::snippets::pass::SoftmaxDecomposition::SoftmaxDecomposition(const size_t vector_size, const int32_t buffer_allocation_rank) {
|
||||
MATCHER_SCOPE(SoftmaxDecomposition);
|
||||
|
||||
auto m_softmax = ngraph::pattern::wrap_type<ngraph::op::v1::Softmax, ngraph::op::v8::Softmax>();
|
||||
|
||||
auto callback = [=](ngraph::pattern::Matcher &m) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::SoftmaxDecomposition")
|
||||
auto root = m.get_match_root();
|
||||
const auto master_pshape = root->get_input_partial_shape(0);
|
||||
const auto rank = master_pshape.rank();
|
||||
if (rank.is_dynamic() || master_pshape.is_dynamic())
|
||||
return false;
|
||||
|
||||
int64_t axis = 0;
|
||||
if (const auto softmax_v8 = ngraph::as_type_ptr<const ov::op::v8::Softmax>(root)) {
|
||||
axis = ngraph::normalize_axis(root->get_friendly_name(), softmax_v8->get_axis(), rank);
|
||||
} else if (const auto softmax_v1 = ngraph::as_type_ptr<const ov::op::v1::Softmax>(root)) {
|
||||
axis = softmax_v1->get_axis();
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto shape_rank = rank.get_length();
|
||||
if (axis != shape_rank - 1)
|
||||
return false;
|
||||
|
||||
const auto data = root->get_input_node_shared_ptr(0);
|
||||
|
||||
const auto master_shape = master_pshape.get_shape();
|
||||
const auto dimension = shape_rank - 1;
|
||||
const auto work_amount = master_shape[dimension];
|
||||
const auto increment = vector_size;
|
||||
const auto inner_dim = shape_rank - 1;
|
||||
const auto inner_master_work_amount = static_cast<size_t>(master_shape[inner_dim]);
|
||||
const int outer_dim = shape_rank > 1 ? static_cast<int>(shape_rank - 2) : -1;
|
||||
const auto has_outer_loop = outer_dim >= 0 && master_shape[outer_dim] > 1;
|
||||
|
||||
/* ====== ReduceMax decomposition ====== */
|
||||
|
||||
// We have to have fake edge Data -> Loop[ReduceMax] -> Loop[Sub + Exp + ReduceSum] because ReduceMax is
|
||||
// accumulator which finds maximum of elements and save it to vector register. Loop works only with GPR (data) but ReduceMax Loop
|
||||
// doesn't save maximum to data. Seems like, LoopEnd shouldn't have outputs:
|
||||
// Data
|
||||
// VectorBuffer LoopBegin \
|
||||
// \ Load \ |
|
||||
// Maximum / |
|
||||
// / LoopEnd |
|
||||
// HorizonMax /
|
||||
// \ LoopBegin[Sub + Exp + ReduceSum]
|
||||
// But nGraph doesn't allow to have 0 outputs for Node (at least 1 output).
|
||||
// Thus, we propagate data through Loop[ReduceMax] using fake edge because of that Loop[ReduceMax] has two inputs "Data"
|
||||
// Data
|
||||
// VectorBuffer LoopBegin
|
||||
// \ Load | \
|
||||
// Maximum | /
|
||||
// / LoopEnd
|
||||
// HorizonMax |
|
||||
// \ LoopBegin[Sub + Exp + ReduceSum]
|
||||
const auto vector_buffer_max = std::make_shared<ngraph::snippets::op::VectorBuffer>();
|
||||
const auto loop_max_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{data, data});
|
||||
|
||||
const auto load_max = std::make_shared<ngraph::snippets::op::Load>(loop_max_begin->output(0), increment);
|
||||
const auto max = std::make_shared<ov::op::v1::Maximum>(load_max, vector_buffer_max);
|
||||
|
||||
auto apply_increments_max =
|
||||
InsertLoops::calculate_inner_apply_increments(master_shape, {data->get_shape(), data->get_shape(), data->get_shape()});
|
||||
// Input of softmax is Input and Output of this loop, which isn't used inside (it's just to have one output in Loop at least)
|
||||
// So we shouldn't increment pointer after each loop iteration
|
||||
apply_increments_max[0] = false;
|
||||
apply_increments_max[1] = false;
|
||||
// we should always reset data ptr after this loop because in the next Loop this ptr is used
|
||||
// Although output isn't a Buffer op, we set finalization offset and ptr increment for output, because ResetBufferState pass
|
||||
// normalizes offsets and increments starting from outputs
|
||||
const auto finalization_offsets_max =
|
||||
std::vector<int64_t>{ 0, 0, ResetBufferState::calculate_required_finalization_offsets(inner_master_work_amount, data->get_shape()[inner_dim]) };
|
||||
const auto loop_max_end = std::make_shared<ngraph::snippets::op::LoopEnd>(ngraph::OutputVector{loop_max_begin->output(1), loop_max_begin->output(2)},
|
||||
work_amount, increment, apply_increments_max, finalization_offsets_max);
|
||||
|
||||
const auto horizon_max = std::make_shared<ngraph::snippets::op::HorizonMax>(max);
|
||||
|
||||
/* =========================================== */
|
||||
|
||||
/* === Sub + Exp + ReduceSum decomposition === */
|
||||
|
||||
const auto vector_buffer_sum = std::make_shared<ngraph::snippets::op::VectorBuffer>();
|
||||
const auto loop_sum_begin = ngraph::snippets::op::insertLoopBegin(ngraph::OutputVector{loop_max_end->output(0)});
|
||||
|
||||
const auto load_sub = std::make_shared<ngraph::snippets::op::Load>(loop_sum_begin->output(0), increment);
|
||||
const auto sub = std::make_shared<ov::op::v1::Subtract>(load_sub, horizon_max);
|
||||
const auto exp = std::make_shared<ov::op::v0::Exp>(sub);
|
||||
const auto sum = std::make_shared<ov::op::v1::Add>(exp, vector_buffer_sum);
|
||||
const auto store_exp = std::make_shared<ngraph::snippets::op::Store>(exp, increment);
|
||||
|
||||
auto apply_increments_sum =
|
||||
InsertLoops::calculate_inner_apply_increments(master_shape, {load_sub->get_shape(), store_exp->get_shape()});
|
||||
std::vector<int64_t> finalization_offsets_sum(2, 0);
|
||||
if (has_outer_loop) {
|
||||
finalization_offsets_sum =
|
||||
InsertLoops::calculate_finalization_offsets(master_shape, {load_sub->get_shape(), store_exp->get_shape()});
|
||||
}
|
||||
// we should always reset buffer ptr after loop because in the next Loop this buffer ptr is used
|
||||
finalization_offsets_sum[1] = ResetBufferState::calculate_required_finalization_offsets(inner_master_work_amount, store_exp->get_shape()[inner_dim]);
|
||||
const auto loop_sum_end = std::make_shared<ngraph::snippets::op::LoopEnd>(
|
||||
ngraph::OutputVector{store_exp, loop_sum_begin->output(1)}, work_amount, increment,
|
||||
apply_increments_sum, finalization_offsets_sum);
|
||||
|
||||
const auto horizon_sum = std::make_shared<ngraph::snippets::op::HorizonSum>(sum);
|
||||
const auto buffer_exp = std::make_shared<ngraph::snippets::op::Buffer>(loop_sum_end->output(0), buffer_allocation_rank);
|
||||
|
||||
/* =========================================== */
|
||||
|
||||
/* ================== Div ==================== */
|
||||
|
||||
// Divide is expensive operation, so we decompose it into 1 / x * y, where 1 / x is executed outside loop
|
||||
const auto pow = std::make_shared<ngraph::opset1::Power>(horizon_sum,
|
||||
ngraph::op::Constant::create(ov::element::f32, ngraph::Shape{}, {-1}));
|
||||
|
||||
const auto loop_div_begin = op::insertLoopBegin(ngraph::OutputVector{buffer_exp});
|
||||
|
||||
const auto load_div = std::make_shared<ngraph::snippets::op::Load>(loop_div_begin->output(0), increment);
|
||||
const auto mul = std::make_shared<ov::op::v1::Multiply>(load_div, pow);
|
||||
const auto store_div = std::make_shared<ngraph::snippets::op::Store>(mul, increment);
|
||||
|
||||
auto apply_increments_div =
|
||||
InsertLoops::calculate_inner_apply_increments(master_shape, {load_div->get_shape(), store_div->get_shape()});
|
||||
std::vector<int64_t> finalization_offsets_div(2, 0);
|
||||
if (has_outer_loop) {
|
||||
finalization_offsets_div =
|
||||
InsertLoops::calculate_finalization_offsets(master_shape, {load_div->get_shape(), store_div->get_shape()});
|
||||
}
|
||||
const auto loop_div_end = std::make_shared<ngraph::snippets::op::LoopEnd>(
|
||||
ngraph::OutputVector{store_div, loop_div_begin->output(1)}, work_amount, increment,
|
||||
apply_increments_div, finalization_offsets_div);
|
||||
|
||||
/* =========================================== */
|
||||
|
||||
/* ========== Control dependency ============= */
|
||||
|
||||
loop_max_begin->add_control_dependency(vector_buffer_max);
|
||||
loop_max_end->add_control_dependency(max);
|
||||
horizon_max->add_control_dependency(loop_max_end);
|
||||
loop_sum_begin->add_control_dependency(vector_buffer_sum);
|
||||
loop_sum_begin->add_control_dependency(horizon_max);
|
||||
loop_sum_end->add_control_dependency(sum);
|
||||
horizon_sum->add_control_dependency(loop_sum_end);
|
||||
loop_div_begin->add_control_dependency(horizon_sum);
|
||||
loop_div_begin->add_control_dependency(pow);
|
||||
|
||||
/* =========================================== */
|
||||
|
||||
/* ============= Runtime Info ================ */
|
||||
|
||||
// For tail loop we should fill input of Max by float min and
|
||||
// input of Sum by zero to avoid math incorrect calculations
|
||||
max->input(0).get_rt_info()["set_fill"] = uint32_t(0xff7fffff);
|
||||
sum->input(0).get_rt_info()["set_fill"] = uint32_t(0x00000000);
|
||||
|
||||
// These nodes should be executed outside loops
|
||||
ov::NodeVector ops_outside_loop = { vector_buffer_max, horizon_max, vector_buffer_sum, horizon_sum, pow, buffer_exp };
|
||||
for (const auto& op : ops_outside_loop) {
|
||||
op->get_rt_info()["outside_loop"] = true;
|
||||
}
|
||||
|
||||
ngraph::copy_runtime_info(root,
|
||||
{vector_buffer_max, loop_max_begin, load_max, max, horizon_max, loop_max_end,
|
||||
vector_buffer_sum, loop_sum_begin, load_sub, sub, exp, sum, store_exp, horizon_sum, loop_sum_end, buffer_exp, pow,
|
||||
loop_div_begin, load_div, mul, store_div, loop_div_end});
|
||||
|
||||
/* =========================================== */
|
||||
|
||||
ngraph::replace_node(root, loop_div_end);
|
||||
|
||||
/* ============== Outer loop ================= */
|
||||
if (has_outer_loop) {
|
||||
std::vector<bool> apply_increments =
|
||||
InsertLoops::calculate_outer_apply_increments({root->get_input_shape(0), root->get_output_shape(0)});
|
||||
const auto softmax_parameters =
|
||||
std::vector<ov::Output<ov::Node>>{loop_max_begin->input(0).get_source_output()};
|
||||
const auto output_set = loop_div_end->output(0).get_target_inputs();
|
||||
const auto softmax_results = std::vector<ov::Input<ov::Node>>{output_set.begin(), output_set.end()};
|
||||
const auto& outer_loop_begin = ngraph::snippets::op::insertLoopBegin(softmax_parameters);
|
||||
const auto outer_loop_end = ngraph::snippets::op::insertLoopEndBeforeInputs(
|
||||
softmax_results, outer_loop_begin, master_shape[outer_dim], 1, apply_increments);
|
||||
|
||||
vector_buffer_max->add_control_dependency(outer_loop_begin);
|
||||
|
||||
ngraph::copy_runtime_info(root, {outer_loop_begin, outer_loop_end});
|
||||
}
|
||||
/* =========================================== */
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
auto m = std::make_shared<ngraph::pattern::Matcher>(m_softmax, matcher_name);
|
||||
register_matcher(m, callback);
|
||||
}
|
70
src/common/snippets/src/pass/softmax_reshape_elimination.cpp
Normal file
70
src/common/snippets/src/pass/softmax_reshape_elimination.cpp
Normal file
@ -0,0 +1,70 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <snippets/itt.hpp>
|
||||
#include "snippets/remarks.hpp"
|
||||
|
||||
#include "snippets/pass/softmax_reshape_elimination.hpp"
|
||||
#include "snippets/snippets_isa.hpp"
|
||||
|
||||
#include <ngraph/rt_info.hpp>
|
||||
#include <ngraph/pattern/op/wrap_type.hpp>
|
||||
#include <ngraph/validation_util.hpp>
|
||||
|
||||
ngraph::snippets::pass::SoftmaxReshapeElimination::SoftmaxReshapeElimination() {
|
||||
MATCHER_SCOPE(SoftmaxReshapeElimination);
|
||||
const auto m_reshape0 = pattern::wrap_type<opset1::Reshape>(pattern::has_static_shape());
|
||||
const auto m_softmax = pattern::wrap_type<ngraph::op::v1::Softmax, ngraph::op::v8::Softmax>({m_reshape0});
|
||||
const auto m_reshape1 = pattern::wrap_type<opset1::Reshape>({m_softmax, pattern::wrap_type<opset1::Constant>()});
|
||||
|
||||
register_matcher(std::make_shared<ngraph::pattern::Matcher>(m_reshape1, matcher_name),
|
||||
[=](ngraph::pattern::Matcher &m) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::SoftmaxReshapeElimination")
|
||||
auto& pattern_to_output = m.get_pattern_value_map();
|
||||
auto reshape0 = pattern_to_output[m_reshape0].get_node_shared_ptr();
|
||||
auto softmax = pattern_to_output[m_softmax].get_node_shared_ptr();
|
||||
auto reshape1 = pattern_to_output[m_reshape1].get_node_shared_ptr();
|
||||
|
||||
const auto input_shape = reshape0->get_input_partial_shape(0);
|
||||
const auto output_shape = reshape1->get_output_partial_shape(0);
|
||||
if (input_shape.is_dynamic() || output_shape.is_dynamic() || input_shape.get_shape() != output_shape.get_shape())
|
||||
return false;
|
||||
|
||||
const auto softmax_rank = softmax->get_input_partial_shape(0).rank();
|
||||
int64_t axis = 0;
|
||||
if (const auto softmax_v8 = ngraph::as_type_ptr<const ov::op::v8::Softmax>(softmax)) {
|
||||
axis = ngraph::normalize_axis(softmax->get_friendly_name(), softmax_v8->get_axis(), softmax_rank);
|
||||
} else if (const auto softmax_v1 = ngraph::as_type_ptr<const ov::op::v1::Softmax>(softmax)) {
|
||||
axis = softmax_v1->get_axis();
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Supports only last axis
|
||||
if (axis != softmax_rank.get_length() - 1)
|
||||
return false;
|
||||
|
||||
// Dimensions by reduction axis should be equal
|
||||
if (input_shape.get_shape().back() != softmax->get_input_shape(0).back())
|
||||
return false;
|
||||
|
||||
// Eliminate Reshape before Softmax
|
||||
reshape0->output(0).replace(reshape0->input_value(0));
|
||||
copy_runtime_info({reshape0->input_value(0).get_node_shared_ptr(), reshape0->output(0).get_node_shared_ptr()},
|
||||
reshape0->input_value(0).get_node_shared_ptr());
|
||||
|
||||
// Eliminate Reshape after Softmax with name saving
|
||||
replace_output_update_name(reshape1->output(0), reshape1->input_value(0));
|
||||
|
||||
// update axis
|
||||
const auto new_axis = input_shape.rank().get_length() - 1;
|
||||
if (auto softmax_v8 = ngraph::as_type_ptr<ov::op::v8::Softmax>(softmax)) {
|
||||
softmax_v8->set_axis(new_axis);
|
||||
} else if (auto softmax_v1 = ngraph::as_type_ptr<ov::op::v1::Softmax>(softmax)) {
|
||||
softmax_v1->set_axis(new_axis);
|
||||
}
|
||||
|
||||
return true;
|
||||
});
|
||||
}
|
72
src/common/snippets/src/pass/tokenization.cpp
Normal file
72
src/common/snippets/src/pass/tokenization.cpp
Normal file
@ -0,0 +1,72 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <snippets/itt.hpp>
|
||||
|
||||
#include "snippets/pass/tokenization.hpp"
|
||||
#include "snippets/pass/common_optimizations.hpp"
|
||||
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace pass {
|
||||
|
||||
void SetSnippetsNodeType(const std::shared_ptr<Node> &node, SnippetsNodeType nodeType) {
|
||||
auto &rt = node->get_rt_info();
|
||||
rt["SnippetsNodeType"] = nodeType;
|
||||
}
|
||||
|
||||
SnippetsNodeType GetSnippetsNodeType(const std::shared_ptr<const Node> &node) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::GetSnippetsNodeType")
|
||||
auto &rt = node->get_rt_info();
|
||||
const auto rinfo = rt.find("SnippetsNodeType");
|
||||
if (rinfo == rt.end())
|
||||
return SnippetsNodeType::NotSet;
|
||||
return rinfo->second.as<SnippetsNodeType>();
|
||||
}
|
||||
|
||||
void SetTopologicalOrder(const std::shared_ptr<Node> &node, int64_t order) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::SetTopologicalOrder")
|
||||
auto &rt = node->get_rt_info();
|
||||
rt["TopologicalOrder"] = order;
|
||||
}
|
||||
|
||||
int64_t GetTopologicalOrder(const std::shared_ptr<const Node> &node) {
|
||||
auto &rt = node->get_rt_info();
|
||||
const auto rinfo = rt.find("TopologicalOrder");
|
||||
if (rinfo == rt.end())
|
||||
throw ngraph_error("Topological order is required, but not set.");
|
||||
return rinfo->second.as<int64_t>();
|
||||
}
|
||||
|
||||
bool EnumerateNodes::run_on_model(const std::shared_ptr<ov::Model> &m) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::EnumerateNodes")
|
||||
int64_t order = 0;
|
||||
// Todo: We don't really have to set order for every node, just for subgraph parents and children would be enough
|
||||
for (auto &node : m->get_ordered_ops()) {
|
||||
SetTopologicalOrder(node, order++);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool SnippetsTokenization::run_on_model(const std::shared_ptr<ov::Model>& m) {
|
||||
RUN_ON_FUNCTION_SCOPE(SnippetsTokenization);
|
||||
ngraph::pass::Manager manager(get_pass_config());
|
||||
manager.set_per_pass_validation(false);
|
||||
|
||||
manager.register_pass<EnumerateNodes>();
|
||||
manager.register_pass<TokenizeMHASnippets>();
|
||||
manager.register_pass<TokenizeSnippets>();
|
||||
manager.register_pass<CommonOptimizations>();
|
||||
manager.run_passes(m);
|
||||
|
||||
// Returning value is false because pass::Manager always apply Validation pass if function was changed.
|
||||
// But we don't need to validate the model
|
||||
return false;
|
||||
}
|
||||
|
||||
} // namespace pass
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
81
src/common/snippets/src/pass/transpose_decomposition.cpp
Normal file
81
src/common/snippets/src/pass/transpose_decomposition.cpp
Normal file
@ -0,0 +1,81 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <snippets/pass/transpose_decomposition.hpp>
|
||||
#include <snippets/itt.hpp>
|
||||
#include <snippets/snippets_isa.hpp>
|
||||
#include <snippets/pass/loop_helpers.hpp>
|
||||
|
||||
#include <ngraph/opsets/opset1.hpp>
|
||||
#include <ngraph/opsets/opset5.hpp>
|
||||
#include <ngraph/partial_shape.hpp>
|
||||
#include <ngraph/pattern/op/wrap_type.hpp>
|
||||
#include <ngraph/rt_info.hpp>
|
||||
#include <ngraph/pass/manager.hpp>
|
||||
#include <numeric>
|
||||
const std::set<std::vector<int>> ngraph::snippets::pass::TransposeDecomposition::supported_cases = {{0, 2, 3, 1}};
|
||||
ngraph::snippets::pass::TransposeDecomposition::TransposeDecomposition() {
|
||||
MATCHER_SCOPE(TransposeDecomposition);
|
||||
// todo: we need a special transformation that detects and propagates data access pattern to Parameters and Results
|
||||
// this is needed to communicate access pattern to the plugin node and op::Kernel
|
||||
// This is the reason we match only to Parameter, this limitation could be relaxed if we propagate access pattern
|
||||
// to the appropriate parameter
|
||||
auto match_data = ngraph::pattern::wrap_type<opset1::Parameter>();
|
||||
auto match_order = ngraph::pattern::wrap_type<opset1::Constant>();
|
||||
auto match_transpose = ngraph::pattern::wrap_type<ngraph::opset1::Transpose>({match_data, match_order});
|
||||
|
||||
ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher& m) {
|
||||
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::TransposeDecomposition")
|
||||
auto& pattern_to_output = m.get_pattern_value_map();
|
||||
const auto transpose = ov::as_type_ptr<ngraph::opset1::Transpose>(
|
||||
pattern_to_output.at(match_transpose).get_node_shared_ptr());
|
||||
|
||||
const auto order = ov::as_type_ptr<ov::op::v0::Constant>(pattern_to_output.at(match_order).get_node_shared_ptr());
|
||||
if (transformation_callback(transpose) || transpose->is_dynamic())
|
||||
return false;
|
||||
|
||||
auto order_value = order->cast_vector<int>();
|
||||
if (supported_cases.count(order_value) == 0)
|
||||
return false;
|
||||
|
||||
auto data_input = pattern_to_output.at(match_data);
|
||||
const auto& data_node = pattern_to_output.at(match_data).get_node_shared_ptr();
|
||||
auto ¶m_rt = data_node->get_rt_info();
|
||||
// Note: store and usage inside emitters as size_t is more convenient, so static_cast here
|
||||
const auto& access_pattern = order->cast_vector<size_t>();
|
||||
param_rt["Layout"] = access_pattern;
|
||||
|
||||
// The line below is Ok, since we ensured that transpose is static above
|
||||
auto data_shape = data_input.get_shape();
|
||||
// dim indexes with respect to SRC
|
||||
const auto dim_C_idx = data_shape.size() - 3;
|
||||
const auto dim_H_idx = data_shape.size() - 2;
|
||||
const auto dim_W_idx = data_shape.size() - 1;
|
||||
const auto size_C = static_cast<int64_t>(data_shape[dim_C_idx]);
|
||||
const auto size_W = static_cast<int64_t>(data_shape[dim_W_idx]);
|
||||
const auto size_H = static_cast<int64_t>(data_shape[dim_H_idx]);
|
||||
|
||||
auto loop_W_begin = std::make_shared<op::LoopBegin>(OutputVector{data_input});
|
||||
auto loop_C_begin = std::make_shared<op::LoopBegin>(OutputVector{loop_W_begin->output(0)});
|
||||
// todo: LoadReshape used here is essentially Load + an easy way to maintain correct shape propagation
|
||||
// fix this in future and develop a more consistent shape propagation approach.
|
||||
auto load = std::make_shared<snippets::op::LoadReshape>(loop_C_begin->output(0), 1, 0, access_pattern);
|
||||
auto store = std::make_shared<snippets::op::Store>(load, 1);
|
||||
const std::vector<int64_t> ptr_increments_C {size_H * size_W, 1};
|
||||
const std::vector<int64_t> finalization_offsets_C {1 - size_H * size_W * size_C, 0};
|
||||
auto loop_C_end = std::make_shared<op::LoopEnd>(OutputVector{store->output(0), loop_C_begin->output(1)},
|
||||
size_C, 1, ptr_increments_C, finalization_offsets_C);
|
||||
auto loop_W_end = std::make_shared<op::LoopEnd>(OutputVector{loop_C_end->output(0), loop_W_begin->output(1)},
|
||||
size_W, 1, std::vector<int64_t>{0, 0}, std::vector<int64_t>{0, 0});
|
||||
|
||||
for (auto& input : transpose->output(0).get_target_inputs()) {
|
||||
input.replace_source_output(loop_W_end->output(0));
|
||||
}
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
auto m = std::make_shared<ngraph::pattern::Matcher>(match_transpose, matcher_name);
|
||||
register_matcher(m, callback);
|
||||
}
|
@ -6,8 +6,11 @@
|
||||
|
||||
#include "snippets/pass/fq_decomposition.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace snippets {
|
||||
namespace utils {
|
||||
|
||||
auto ngraph::snippets::utils::get_non_scalar_constant_count_for_fq(const std::shared_ptr<ngraph::opset1::FakeQuantize>& fq) -> size_t {
|
||||
auto get_non_scalar_constant_count_for_fq(const std::shared_ptr<opset1::FakeQuantize>& fq) -> size_t {
|
||||
std::vector<float> out_scales;
|
||||
std::vector<float> cl, ch, isc, ish, osc, osh;
|
||||
const bool status = ngraph::snippets::pass::FakeQuantizeDecomposition::getScalesAndShifts(fq, cl, ch, isc, ish, osc, osh);
|
||||
@ -55,3 +58,54 @@ auto ngraph::snippets::utils::get_non_scalar_constant_count_for_fq(const std::sh
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
std::vector<size_t> get_node_output_layout(const std::shared_ptr<Node>& node) {
|
||||
return get_node_output_layout(node.get());
|
||||
}
|
||||
std::vector<size_t> get_node_output_layout(const Node* node) {
|
||||
if (!node)
|
||||
return {};
|
||||
if (node->is_dynamic())
|
||||
throw ngraph_error("It's illegal to call get_node_output_layout for dynamic nodes");
|
||||
auto &rt = node->get_rt_info();
|
||||
const auto rinfo = rt.find("Layout");
|
||||
if (rinfo != rt.end()) {
|
||||
std::vector<size_t> layout(rinfo->second.as<std::vector<size_t>>());
|
||||
// This might be a little costy, but still useful sanity check. Remove if proved to be unacceptably heavy.
|
||||
std::set<size_t> unique_elements(layout.begin(), layout.end());
|
||||
if (unique_elements.size() < layout.size())
|
||||
throw ngraph_error("Layout must contain only unique dimension indexes");
|
||||
return layout;
|
||||
} else {
|
||||
return {};
|
||||
}
|
||||
}
|
||||
|
||||
ov::PartialShape get_reordered_planar_shape(const ov::PartialShape& shape, const std::vector<size_t>& layout) {
|
||||
if (layout.empty())
|
||||
return shape;
|
||||
std::vector<Dimension> reordered_shape(layout.size());
|
||||
if (shape.rank().is_dynamic())
|
||||
throw ngraph_error("get_reordered_planar_shape can't be called for outputs with dynamic rank");
|
||||
const size_t rank = shape.rank().get_length();
|
||||
if (layout.size() > rank)
|
||||
throw ngraph_error("Layout rank can't be larger than tensor rank");
|
||||
// Note that it can be smaller though, for example tensor shape can be prepended with 1 for scheduling purposes
|
||||
if (std::any_of(layout.begin(), layout.end(), [=](size_t x) {return x >= rank;}))
|
||||
throw ngraph_error("Invalid layout detected: all layout indexes must be smaller than the tensor rank");
|
||||
for (int i = 0; i < layout.size(); i++)
|
||||
reordered_shape[i] = shape[layout[i]];
|
||||
return reordered_shape;
|
||||
}
|
||||
|
||||
ov::PartialShape get_port_planar_shape(const Output<Node>& out) {
|
||||
std::vector<size_t> layout = get_node_output_layout(out.get_node_shared_ptr());
|
||||
const auto& tensor = out.get_tensor_ptr();
|
||||
if (!tensor)
|
||||
throw ngraph_error("get_port_planar_shape can't be called for an uninitialized output tensor");
|
||||
auto tensor_shape = tensor->get_partial_shape();
|
||||
return get_reordered_planar_shape(tensor_shape, layout);
|
||||
}
|
||||
|
||||
} // namespace utils
|
||||
} // namespace snippets
|
||||
} // namespace ngraph
|
||||
|
@ -40,10 +40,16 @@ public:
|
||||
class LoweringTests : public TransformationTestsF {
|
||||
public:
|
||||
LoweringTests();
|
||||
|
||||
void SetUp() override;
|
||||
void TearDown() override;
|
||||
|
||||
protected:
|
||||
static std::shared_ptr<ngraph::snippets::op::Subgraph> getSubgraph(const std::shared_ptr<Model>& f);
|
||||
static std::shared_ptr<ngraph::snippets::op::Subgraph> getLoweredSubgraph(const std::shared_ptr<Model>& f);
|
||||
static std::shared_ptr<ngraph::snippets::op::Subgraph> getLoweredSubgraph(const std::shared_ptr<Model>& f,
|
||||
const ov::PartialShape& master_shape);
|
||||
static std::shared_ptr<ngraph::snippets::op::Subgraph> getTokenizedSubgraph(const std::shared_ptr<Model>& f);
|
||||
ov::PartialShape master_shape{};
|
||||
};
|
||||
|
||||
} // namespace snippets
|
||||
|
@ -0,0 +1,29 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "lowering_utils.hpp"
|
||||
#include "snippets_helpers.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
namespace snippets {
|
||||
typedef std::tuple<
|
||||
Shape, // Input shape 0
|
||||
Shape, // Input shape 1
|
||||
Shape // Broadcast shape
|
||||
> BroadcastParams;
|
||||
|
||||
class BroadcastToMoveBroadcastTests : public LoweringTests, public testing::WithParamInterface<BroadcastParams> {
|
||||
public:
|
||||
static std::string getTestCaseName(testing::TestParamInfo<BroadcastParams> obj);
|
||||
protected:
|
||||
void SetUp() override;
|
||||
std::shared_ptr<SnippetsFunctionBase> snippets_function;
|
||||
};
|
||||
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
@ -0,0 +1,33 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "lowering_utils.hpp"
|
||||
#include "snippets_helpers.hpp"
|
||||
|
||||
/* The main purpose is to test that FuseTransposeBrgemm properly fuses 0213 Transposes on both inputs, as well as on output
|
||||
*/
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
namespace snippets {
|
||||
|
||||
typedef std::tuple<
|
||||
std::vector<PartialShape>, // Input shapes
|
||||
PartialShape, // Master shape
|
||||
size_t // Transpose position
|
||||
> fuseTransposeBrgemmParams;
|
||||
|
||||
class FuseTransposeBrgemmTests : public LoweringTests, public testing::WithParamInterface<fuseTransposeBrgemmParams> {
|
||||
public:
|
||||
static std::string getTestCaseName(testing::TestParamInfo<fuseTransposeBrgemmParams> obj);
|
||||
protected:
|
||||
void SetUp() override;
|
||||
std::shared_ptr<SnippetsFunctionBase> snippets_function;
|
||||
};
|
||||
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
20
src/common/snippets/tests/include/pass/mha_tokenization.hpp
Normal file
20
src/common/snippets/tests/include/pass/mha_tokenization.hpp
Normal file
@ -0,0 +1,20 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <common_test_utils/ngraph_test_utils.hpp>
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
namespace snippets {
|
||||
|
||||
class TokenizeMHASnippetsTests : public TransformationTestsF {
|
||||
public:
|
||||
virtual void run();
|
||||
};
|
||||
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
@ -0,0 +1,43 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "lowering_utils.hpp"
|
||||
#include "snippets_helpers.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
namespace snippets {
|
||||
|
||||
typedef std::tuple<
|
||||
Shape, // Input shape 0
|
||||
int // Axis
|
||||
> SoftmaxParams;
|
||||
|
||||
typedef std::tuple<
|
||||
Shape, // Input shape 0
|
||||
Shape, // Input shape 1
|
||||
int // Axis
|
||||
> AddSoftmaxParams;
|
||||
|
||||
class SoftmaxTests : public LoweringTests, public testing::WithParamInterface<SoftmaxParams> {
|
||||
public:
|
||||
static std::string getTestCaseName(testing::TestParamInfo<SoftmaxParams> obj);
|
||||
protected:
|
||||
void SetUp() override;
|
||||
std::shared_ptr<SnippetsFunctionBase> snippets_function;
|
||||
};
|
||||
|
||||
class AddSoftmaxTests : public LoweringTests, public testing::WithParamInterface<AddSoftmaxParams> {
|
||||
public:
|
||||
static std::string getTestCaseName(testing::TestParamInfo<AddSoftmaxParams> obj);
|
||||
protected:
|
||||
void SetUp() override;
|
||||
std::shared_ptr<SnippetsFunctionBase> snippets_function;
|
||||
};
|
||||
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
@ -4,7 +4,7 @@
|
||||
|
||||
#include <common_test_utils/ngraph_test_utils.hpp>
|
||||
#include "lowering_utils.hpp"
|
||||
#include "snippets/pass/collapse_subgraph.hpp"
|
||||
#include "snippets/pass/tokenization.hpp"
|
||||
|
||||
|
||||
namespace ov {
|
||||
@ -21,7 +21,12 @@ DummyTargetMachine::DummyTargetMachine() {
|
||||
jitters[op::v1::Add::get_type_info_static()] = dummy_functor;
|
||||
jitters[op::v1::Subtract::get_type_info_static()] = dummy_functor;
|
||||
jitters[op::v1::Multiply::get_type_info_static()] = dummy_functor;
|
||||
jitters[op::v1::Multiply::get_type_info_static()] = dummy_functor;
|
||||
jitters[op::v1::Divide::get_type_info_static()] = dummy_functor;
|
||||
jitters[op::v1::Maximum::get_type_info_static()] = dummy_functor;
|
||||
jitters[op::v0::Exp::get_type_info_static()] = dummy_functor;
|
||||
jitters[ngraph::snippets::op::PowerStatic::get_type_info_static()] = dummy_functor;
|
||||
jitters[ngraph::snippets::op::HorizonMax::get_type_info_static()] = dummy_functor;
|
||||
jitters[ngraph::snippets::op::HorizonSum::get_type_info_static()] = dummy_functor;
|
||||
jitters[ngraph::snippets::op::Load::get_type_info_static()] = dummy_functor;
|
||||
jitters[ngraph::snippets::op::BroadcastLoad::get_type_info_static()] = dummy_functor;
|
||||
|
||||
@ -30,8 +35,12 @@ DummyTargetMachine::DummyTargetMachine() {
|
||||
jitters[ngraph::snippets::op::Scalar::get_type_info_static()] = dummy_functor;
|
||||
jitters[ngraph::snippets::op::BroadcastMove::get_type_info_static()] = dummy_functor;
|
||||
jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = dummy_functor;
|
||||
jitters[ngraph::snippets::op::Tile::get_type_info_static()] = dummy_functor;
|
||||
jitters[ngraph::snippets::op::TileScheduler::get_type_info_static()] = dummy_functor;
|
||||
jitters[ngraph::snippets::op::LoopBegin::get_type_info_static()] = dummy_functor;
|
||||
jitters[ngraph::snippets::op::LoopEnd::get_type_info_static()] = dummy_functor;
|
||||
jitters[ngraph::snippets::op::Brgemm::get_type_info_static()] = dummy_functor;
|
||||
jitters[ngraph::snippets::op::Buffer::get_type_info_static()] = dummy_functor;
|
||||
jitters[ngraph::snippets::op::VectorBuffer::get_type_info_static()] = dummy_functor;
|
||||
jitters[ngraph::snippets::op::Fill::get_type_info_static()] = dummy_functor;
|
||||
}
|
||||
|
||||
LoweringTests::LoweringTests() : TransformationTestsF() {
|
||||
@ -41,6 +50,29 @@ LoweringTests::LoweringTests() : TransformationTestsF() {
|
||||
comparator.disable(FunctionsComparator::CmpValues::SUBGRAPH_DESCRIPTORS);
|
||||
}
|
||||
|
||||
void LoweringTests::SetUp() {
|
||||
manager.register_pass<ngraph::pass::InitNodeInfo>();
|
||||
}
|
||||
|
||||
void LoweringTests::TearDown() {
|
||||
auto cloned_function = ngraph::clone_function(*function);
|
||||
if (!function_ref) {
|
||||
function_ref = cloned_function;
|
||||
}
|
||||
manager.run_passes(function);
|
||||
ASSERT_NO_THROW(check_rt_info(function));
|
||||
|
||||
if (comparator.should_compare(FunctionsComparator::ACCURACY)) {
|
||||
auto acc_comparator = FunctionsComparator::no_default();
|
||||
acc_comparator.enable(FunctionsComparator::CmpValues::ACCURACY);
|
||||
auto res = acc_comparator.compare(function, cloned_function);
|
||||
ASSERT_TRUE(res.valid) << res.message;
|
||||
comparator.disable(FunctionsComparator::CmpValues::ACCURACY);
|
||||
}
|
||||
auto res = comparator.compare(function, function_ref);
|
||||
ASSERT_TRUE(res.valid) << res.message;
|
||||
}
|
||||
|
||||
std::shared_ptr<ngraph::snippets::op::Subgraph> LoweringTests::getSubgraph(const std::shared_ptr<Model>& f) {
|
||||
std::shared_ptr<ngraph::snippets::op::Subgraph> subgraph;
|
||||
for (const auto &op : f->get_ops()) {
|
||||
@ -59,9 +91,30 @@ std::shared_ptr<ngraph::snippets::op::Subgraph> LoweringTests::getSubgraph(const
|
||||
return subgraph;
|
||||
}
|
||||
|
||||
std::shared_ptr<ngraph::snippets::op::Subgraph> LoweringTests::getLoweredSubgraph(const std::shared_ptr<Model> &f) {
|
||||
std::shared_ptr<ngraph::snippets::op::Subgraph> LoweringTests::getLoweredSubgraph(const std::shared_ptr<Model> &f,
|
||||
const ov::PartialShape& master_shape) {
|
||||
auto subgraph = getTokenizedSubgraph(f);
|
||||
subgraph->set_generator(std::make_shared<DummyGenerator>());
|
||||
subgraph->set_master_shape(master_shape);
|
||||
const auto& body = subgraph->body_ptr();
|
||||
auto& body_rt_info = body->get_rt_info();
|
||||
// todo: insertLoops pass requires body_rt_info["PluginShapesOverride"] and subgraph->set_tile_rank to work normally
|
||||
// consider revising snippets-plugin shape and scheduling communication
|
||||
std::vector<std::vector<size_t>> new_shapes;
|
||||
for (const auto& p : body->get_parameters()) {
|
||||
const auto pshape = p->get_output_partial_shape(0);
|
||||
if (pshape.is_dynamic())
|
||||
IE_THROW() << "getLoweredSubgraph supports only static shapes";
|
||||
new_shapes.push_back(pshape.get_shape());
|
||||
}
|
||||
for (const auto& r : body->get_results()) {
|
||||
const auto pshape = r->get_input_partial_shape(0);
|
||||
if (pshape.is_dynamic())
|
||||
IE_THROW() << "getLoweredSubgraph supports only static shapes";
|
||||
new_shapes.push_back(pshape.get_shape());
|
||||
}
|
||||
body_rt_info["PluginShapesOverride"] = new_shapes;
|
||||
subgraph->set_tile_rank(2);
|
||||
subgraph->generate();
|
||||
return subgraph;
|
||||
}
|
||||
|
@ -0,0 +1,59 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "pass/broadcast_to_movebroadcast.hpp"
|
||||
#include "common_test_utils/common_utils.hpp"
|
||||
#include <subgraph_lowered.hpp>
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
namespace snippets {
|
||||
|
||||
|
||||
std::string BroadcastToMoveBroadcastTests::getTestCaseName(testing::TestParamInfo<BroadcastParams> obj) {
|
||||
std::vector<Shape> inputShapes(2);
|
||||
Shape broadcast_shape;
|
||||
std::tie(inputShapes[0], inputShapes[1], broadcast_shape) = obj.param;
|
||||
std::ostringstream result;
|
||||
for (size_t i = 0; i < inputShapes.size(); i++)
|
||||
result << "IS[" << i << "]=" << CommonTestUtils::vec2str(inputShapes[i]) << "_";
|
||||
result << "BS=" << CommonTestUtils::vec2str(broadcast_shape) << "_";
|
||||
return result.str();
|
||||
}
|
||||
|
||||
void BroadcastToMoveBroadcastTests::SetUp() {
|
||||
TransformationTestsF::SetUp();
|
||||
std::vector<PartialShape> inputShapes(2);
|
||||
PartialShape broadcast_shape;
|
||||
std::tie(inputShapes[0], inputShapes[1], broadcast_shape) = this->GetParam();
|
||||
snippets_function = std::make_shared<BroadcastAddLoweredFunction>(inputShapes, broadcast_shape);
|
||||
master_shape = {};
|
||||
for (int i = 0; i < inputShapes[0].size(); i++)
|
||||
master_shape.push_back(static_cast<int64_t>(std::max(inputShapes[0].get_shape()[i], inputShapes[1].get_shape()[i])));
|
||||
}
|
||||
|
||||
TEST_P(BroadcastToMoveBroadcastTests, BroadcastSelect) {
|
||||
PartialShape scheduler_shape({master_shape[master_shape.size() - 2],
|
||||
master_shape[master_shape.size() - 1]});
|
||||
auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), scheduler_shape);
|
||||
function = subgraph->body_ptr();
|
||||
function_ref = snippets_function->getLowered();
|
||||
}
|
||||
|
||||
namespace BroadcastToMoveBroadcastTestsInstantiation {
|
||||
using ov::Shape;
|
||||
std::vector<Shape> inputShapes0 {{1, 8, 2, 10}, {1, 8, 2, 1}, {1, 1, 1, 1}};
|
||||
std::vector<Shape> inputShapes1 {{1, 8, 2, 10}, {1, 8, 2, 1}, {1, 1, 1, 1}};
|
||||
Shape broadcastShape {1, 8, 2, 10};
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Broadcast, BroadcastToMoveBroadcastTests,
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(inputShapes0),
|
||||
::testing::ValuesIn(inputShapes1),
|
||||
::testing::Values(broadcastShape)),
|
||||
BroadcastToMoveBroadcastTests::getTestCaseName);
|
||||
} // namespace BroadcastToMoveBroadcastTestsInstantiation
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
@ -23,12 +23,12 @@ std::string CanonicalizationTests::getTestCaseName(testing::TestParamInfo<canoni
|
||||
// input shape
|
||||
result << "IS[" << i << "]=" << CommonTestUtils::vec2str(std::get<0>(inputs[i])) << "_";
|
||||
// input blocked shape
|
||||
result << "IBS[" << i << "]=" << CommonTestUtils::vec2str(std::get<0>(blockedshape)) << "_";
|
||||
result << "IBS[" << i << "]=" << CommonTestUtils::partialShape2str({std::get<0>(blockedshape)}) << "_";
|
||||
// input blocked order
|
||||
result << "IBO[" << i << "]=" << CommonTestUtils::vec2str(std::get<1>(blockedshape)) << "_";
|
||||
}
|
||||
// output blocked shape
|
||||
result << "OBS[0]=" << CommonTestUtils::vec2str(std::get<0>(output)) << "_";
|
||||
result << "OBS[0]=" << CommonTestUtils::partialShape2str({std::get<0>(output)}) << "_";
|
||||
// output blocked order
|
||||
result << "OBO[0]=" << CommonTestUtils::vec2str(std::get<1>(output)) << "_";
|
||||
result << "ExpOS[0]=" << CommonTestUtils::vec2str(expectedOutput) << "_";
|
||||
@ -42,7 +42,7 @@ void CanonicalizationTests::SetUp() {
|
||||
std::tie(inputs[0], inputs[1], output_blocked_shapes[0], expected_output_shape) = this->GetParam();
|
||||
|
||||
input_blocked_shapes = {std::get<1>(inputs[0]), std::get<1>(inputs[1])};
|
||||
snippets_function = std::make_shared<AddFunction>(std::vector<Shape>{std::get<0>(inputs[0]), std::get<0>(inputs[1])});
|
||||
snippets_function = std::make_shared<AddFunction>(std::vector<PartialShape>{std::get<0>(inputs[0]), std::get<0>(inputs[1])});
|
||||
}
|
||||
|
||||
TEST_P(CanonicalizationTests, Add) {
|
||||
@ -50,8 +50,9 @@ TEST_P(CanonicalizationTests, Add) {
|
||||
function_ref = snippets_function->getReference();
|
||||
auto subgraph = getTokenizedSubgraph(function);
|
||||
subgraph->set_generator(std::make_shared<DummyGenerator>());
|
||||
Shape canonical_output_shape = subgraph->canonicalize(output_blocked_shapes, input_blocked_shapes);
|
||||
ASSERT_DIMS_EQ(canonical_output_shape, expected_output_shape);
|
||||
auto canonical_output_shape = subgraph->canonicalize(output_blocked_shapes, input_blocked_shapes);
|
||||
ASSERT_TRUE(canonical_output_shape.is_static());
|
||||
ASSERT_DIMS_EQ(canonical_output_shape.get_shape(), expected_output_shape);
|
||||
}
|
||||
|
||||
namespace CanonicalizationTestsInstantiation {
|
||||
|
@ -6,7 +6,7 @@
|
||||
#include <pass/collapse_subgraph.hpp>
|
||||
#include <subgraph_simple.hpp>
|
||||
#include <subgraph_converts.hpp>
|
||||
#include "snippets/pass/collapse_subgraph.hpp"
|
||||
#include "snippets/pass/tokenization.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
@ -17,59 +17,64 @@ void CollapseSubgraphTests::run() {
|
||||
std::string name;
|
||||
manager.register_pass<ngraph::snippets::pass::EnumerateNodes>();
|
||||
manager.register_pass<ngraph::snippets::pass::TokenizeSnippets>();
|
||||
// todo: This is a temporary work-around. remove when MatMul tokenization is supported through general pipeline
|
||||
manager.get_pass_config()->set_callback<ngraph::snippets::pass::TokenizeSnippets>(
|
||||
[](const std::shared_ptr<const ov::Node>& n) -> bool {
|
||||
return ov::is_type<const ov::op::v0::MatMul>(n);
|
||||
});
|
||||
}
|
||||
|
||||
TEST_F(CollapseSubgraphTests, smoke_Snippets_Eltwise) {
|
||||
const auto &f = EltwiseFunction(std::vector<Shape> {{2, 3}, {1, 3}});
|
||||
const auto &f = EltwiseFunction(std::vector<PartialShape> {{2, 3}, {1, 3}});
|
||||
function = f.getOriginal();
|
||||
function_ref = f.getReference();
|
||||
run();
|
||||
}
|
||||
|
||||
TEST_F(CollapseSubgraphTests, smoke_Snippets_MatMulWithEltwise) {
|
||||
const auto &f = MatMulEltwiseBranchesFunction(std::vector<Shape> {{1, 3, 4, 4}, {1, 3, 4, 4}});
|
||||
const auto &f = MatMulEltwiseBranchesFunction(std::vector<PartialShape> {{1, 3, 4, 4}, {1, 3, 4, 4}});
|
||||
function = f.getOriginal();
|
||||
function_ref = f.getReference();
|
||||
run();
|
||||
}
|
||||
|
||||
TEST_F(CollapseSubgraphTests, smoke_Snippets_AvoidLoopEltwise) {
|
||||
const auto &f = EltwiseLogLoopFunction(std::vector<Shape> {{2, 5}, {2, 1}});
|
||||
const auto &f = EltwiseLogLoopFunction(std::vector<PartialShape> {{2, 5}, {2, 1}});
|
||||
function = f.getOriginal();
|
||||
function_ref = f.getReference();
|
||||
run();
|
||||
}
|
||||
|
||||
TEST_F(CollapseSubgraphTests, smoke_Snippets_OneConvert) {
|
||||
const auto &f = ConvertFunction(std::vector<Shape>{{2, 5}});
|
||||
const auto &f = ConvertFunction(std::vector<PartialShape>{{2, 5}});
|
||||
function = f.getOriginal();
|
||||
function_ref = f.getReference();
|
||||
run();
|
||||
}
|
||||
|
||||
TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertInput) {
|
||||
const auto &f = ConvertInputFunction(std::vector<Shape>{{2, 5}, {1, 5}});
|
||||
const auto &f = ConvertInputFunction(std::vector<PartialShape>{{2, 5}, {1, 5}});
|
||||
function = f.getOriginal();
|
||||
function_ref = f.getReference();
|
||||
run();
|
||||
}
|
||||
|
||||
TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertOutput) {
|
||||
const auto &f = ConvertOutputFunction(std::vector<Shape>{{2, 5}, {1, 5}});
|
||||
const auto &f = ConvertOutputFunction(std::vector<PartialShape>{{2, 5}, {1, 5}});
|
||||
function = f.getOriginal();
|
||||
function_ref = f.getReference();
|
||||
run();
|
||||
}
|
||||
|
||||
TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertStub) {
|
||||
const auto &f = ConvertStubFunction(std::vector<Shape>{{2, 5, 2}, {1, 5, 1}});
|
||||
const auto &f = ConvertStubFunction(std::vector<PartialShape>{{2, 5, 2}, {1, 5, 1}});
|
||||
function = f.getOriginal();
|
||||
function_ref = f.getReference();
|
||||
run();
|
||||
}
|
||||
|
||||
TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertPartialInputsAndResults) {
|
||||
const auto &f = ConvertPartialInputsAndResultsFunction(std::vector<Shape>{{2, 5, 1}, {1, 5, 1}, {2, 1, 10}},
|
||||
const auto &f = ConvertPartialInputsAndResultsFunction(std::vector<PartialShape>{{2, 5, 1}, {1, 5, 1}, {2, 1, 10}},
|
||||
std::vector<ov::element::Type>{ov::element::i8, ov::element::bf16, ov::element::f32},
|
||||
std::vector<ov::element::Type>{ov::element::f32, ov::element::i8});
|
||||
function = f.getOriginal();
|
||||
@ -78,7 +83,7 @@ TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertPartialInputsAndResults) {
|
||||
}
|
||||
|
||||
TEST_F(CollapseSubgraphTests, smoke_Snippets_EltwiseTwoResultsFunction) {
|
||||
const auto &f = EltwiseTwoResultsFunction(std::vector<Shape>{{2, 5}, {2, 1}});
|
||||
const auto &f = EltwiseTwoResultsFunction(std::vector<PartialShape>{{2, 5}, {2, 1}});
|
||||
function = f.getOriginal();
|
||||
function_ref = f.getReference();
|
||||
comparator.enable(FunctionsComparator::CmpValues::NAMES);
|
||||
|
58
src/common/snippets/tests/src/pass/fuse_transpose_brgemm.cpp
Normal file
58
src/common/snippets/tests/src/pass/fuse_transpose_brgemm.cpp
Normal file
@ -0,0 +1,58 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "pass/fuse_transpose_brgemm.hpp"
|
||||
#include "common_test_utils/common_utils.hpp"
|
||||
#include "subgraph_matmul.hpp"
|
||||
#include "subgraph_lowered.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
namespace snippets {
|
||||
|
||||
std::string FuseTransposeBrgemmTests::getTestCaseName(testing::TestParamInfo<fuseTransposeBrgemmParams> obj) {
|
||||
std::vector<PartialShape> input_shapes(2);
|
||||
PartialShape master_shape;
|
||||
size_t transpose_position;
|
||||
std::tie(input_shapes, master_shape, transpose_position) = obj.param;
|
||||
std::ostringstream result;
|
||||
result << "IS[0]=" << CommonTestUtils::partialShape2str({input_shapes[0]}) << "_";
|
||||
result << "IS[1]=" << CommonTestUtils::partialShape2str({input_shapes[1]}) << "_";
|
||||
result << "MS=" << CommonTestUtils::partialShape2str({master_shape}) << "_";
|
||||
result << "Pos=" << transpose_position << "_";
|
||||
return result.str();
|
||||
}
|
||||
|
||||
void FuseTransposeBrgemmTests::SetUp() {
|
||||
LoweringTests::SetUp();
|
||||
std::vector<PartialShape> input_shapes(2);
|
||||
size_t transpose_position;
|
||||
std::tie(input_shapes, master_shape, transpose_position) = this->GetParam();
|
||||
|
||||
snippets_function = std::make_shared<Transpose0213MatMulLoweredFunction>(input_shapes, transpose_position);
|
||||
}
|
||||
|
||||
TEST_P(FuseTransposeBrgemmTests, FuseTransposeMatmul) {
|
||||
auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), master_shape);
|
||||
function = subgraph->body_ptr();
|
||||
function_ref = snippets_function->getLowered();
|
||||
}
|
||||
|
||||
namespace FuseTransposeBrgemmTestsInstantiation {
|
||||
using ov::Shape;
|
||||
std::vector<fuseTransposeBrgemmParams> test_params{
|
||||
{{{1, 49, 2, 23}, {2, 2, 23, 39}}, {2, 2, 49, 23}, 0},
|
||||
{{{1, 2, 49, 23}, {2, 23, 1, 39}}, {2, 2, 49, 39}, 1},
|
||||
{{{1, 2, 49, 23}, {2, 2, 23, 39}}, {2, 2, 49, 39}, 2},
|
||||
};
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_FuseTransposeMatMul, FuseTransposeBrgemmTests,
|
||||
::testing::ValuesIn(test_params),
|
||||
FuseTransposeBrgemmTests::getTestCaseName);
|
||||
|
||||
} // namespace FuseTransposeBrgemmTestsInstantiation
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
@ -25,16 +25,18 @@ std::string InsertLoadStoreTests::getTestCaseName(testing::TestParamInfo<insertL
|
||||
}
|
||||
|
||||
void InsertLoadStoreTests::SetUp() {
|
||||
TransformationTestsF::SetUp();
|
||||
LoweringTests::SetUp();
|
||||
std::vector<Shape> inputShapes(3);
|
||||
std::vector<Shape> broadcastShapes(3);
|
||||
std::tie(inputShapes[0], inputShapes[1], inputShapes[2],
|
||||
broadcastShapes[0], broadcastShapes[1], broadcastShapes[2]) = this->GetParam();
|
||||
snippets_function = std::make_shared<EltwiseThreeInputsLoweredFunction>(inputShapes, broadcastShapes);
|
||||
snippets_function = std::make_shared<EltwiseThreeInputsLoweredFunction>(
|
||||
std::vector<PartialShape> {inputShapes[0], inputShapes[1], inputShapes[2]}, broadcastShapes);
|
||||
master_shape = inputShapes[0];
|
||||
}
|
||||
|
||||
TEST_P(InsertLoadStoreTests, ThreeInputsEltwise) {
|
||||
auto subgraph = getLoweredSubgraph(snippets_function->getOriginal());
|
||||
auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), master_shape);
|
||||
function = subgraph->body_ptr();
|
||||
function_ref = snippets_function->getLowered();
|
||||
}
|
||||
|
@ -24,15 +24,22 @@ std::string InsertMoveBroadcastTests::getTestCaseName(testing::TestParamInfo<ins
|
||||
}
|
||||
|
||||
void InsertMoveBroadcastTests::SetUp() {
|
||||
TransformationTestsF::SetUp();
|
||||
LoweringTests::SetUp();
|
||||
std::vector<Shape> inputShapes(2);
|
||||
std::vector<Shape> broadcastShapes(2);
|
||||
std::tie(inputShapes[0], inputShapes[1], broadcastShapes[0], broadcastShapes[1]) = this->GetParam();
|
||||
snippets_function = std::make_shared<AddFunctionLoweredBroadcast>(inputShapes, broadcastShapes);
|
||||
snippets_function = std::make_shared<AddFunctionLoweredBroadcast>(std::vector<PartialShape> {inputShapes[0], inputShapes[1]}, broadcastShapes);
|
||||
if (inputShapes[0].size() != inputShapes[1].size())
|
||||
IE_THROW() << "Expected input shapes of the same size";
|
||||
master_shape = {};
|
||||
for (int i = 0; i < inputShapes[0].size(); i++)
|
||||
master_shape.push_back(static_cast<int64_t>(std::max(inputShapes[0][i], inputShapes[1][i])));
|
||||
}
|
||||
|
||||
TEST_P(InsertMoveBroadcastTests, AddBroadcast) {
|
||||
auto subgraph = getLoweredSubgraph(snippets_function->getOriginal());
|
||||
PartialShape scheduler_shape({master_shape[master_shape.size() - 2],
|
||||
master_shape[master_shape.size() - 1]});
|
||||
auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), scheduler_shape);
|
||||
function = subgraph->body_ptr();
|
||||
function_ref = snippets_function->getLowered();
|
||||
}
|
||||
|
169
src/common/snippets/tests/src/pass/merge_loops.cpp
Normal file
169
src/common/snippets/tests/src/pass/merge_loops.cpp
Normal file
@ -0,0 +1,169 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <ngraph/function.hpp>
|
||||
#include <ngraph/pass/manager.hpp>
|
||||
|
||||
#include <snippets/snippets_isa.hpp>
|
||||
#include <snippets/pass/loop_fusion.hpp>
|
||||
|
||||
#include <transformations/init_node_info.hpp>
|
||||
|
||||
#include "common_test_utils/ngraph_test_utils.hpp"
|
||||
|
||||
using namespace testing;
|
||||
using namespace ngraph;
|
||||
|
||||
TEST(TransformationTests, UnaryEltwisesLoops) {
|
||||
std::shared_ptr<Function> f(nullptr), f_ref(nullptr);
|
||||
auto shape = Shape{2, 3, 240};
|
||||
const size_t vector_size = 16;
|
||||
const std::vector<int64_t> inner_ptr_increments(2, vector_size);
|
||||
const std::vector<int64_t> inner_finalization_offsets(2, 0);
|
||||
{
|
||||
auto data = std::make_shared<opset1::Parameter>(element::f32, shape);
|
||||
|
||||
auto outer_loop_begin_up = std::make_shared<snippets::op::LoopBegin>(OutputVector{data});
|
||||
auto inner_loop_begin_up = std::make_shared<snippets::op::LoopBegin>(OutputVector{outer_loop_begin_up});
|
||||
auto load_up = std::make_shared<snippets::op::Load>(inner_loop_begin_up->output(0));
|
||||
auto relu = std::make_shared<op::v0::Relu>(load_up);
|
||||
auto store_up = std::make_shared<snippets::op::Store>(relu);
|
||||
auto inner_loop_end_up = std::make_shared<snippets::op::LoopEnd>(
|
||||
OutputVector{store_up, inner_loop_begin_up->output(1)}, shape[shape.size() - 1], vector_size,
|
||||
inner_ptr_increments, inner_finalization_offsets);
|
||||
auto outer_loop_end_up = std::make_shared<snippets::op::LoopEnd>(
|
||||
OutputVector{inner_loop_end_up->output(0), outer_loop_begin_up->output(1)}, shape[shape.size() - 2], 1,
|
||||
std::vector<int64_t>{0, 0}, std::vector<int64_t>{0, 0});
|
||||
|
||||
auto buffer = std::make_shared<snippets::op::Buffer>(outer_loop_end_up);
|
||||
|
||||
auto outer_loop_begin_down = std::make_shared<snippets::op::LoopBegin>(OutputVector{buffer});
|
||||
auto inner_loop_begin_down = std::make_shared<snippets::op::LoopBegin>(OutputVector{outer_loop_begin_down});
|
||||
auto load_down = std::make_shared<snippets::op::Load>(inner_loop_begin_down->output(0));
|
||||
auto hswish = std::make_shared<op::v4::HSwish>(load_down);
|
||||
auto store_down = std::make_shared<snippets::op::Store>(hswish);
|
||||
auto inner_loop_end_down = std::make_shared<snippets::op::LoopEnd>(
|
||||
OutputVector{store_down, inner_loop_begin_down->output(1)}, shape[shape.size() - 1], vector_size,
|
||||
inner_ptr_increments, inner_finalization_offsets);
|
||||
auto outer_loop_end_down = std::make_shared<snippets::op::LoopEnd>(
|
||||
OutputVector{inner_loop_end_down->output(0), outer_loop_begin_down->output(1)}, shape[shape.size() - 2], 1,
|
||||
std::vector<int64_t>{0, 0}, std::vector<int64_t>{0, 0});
|
||||
|
||||
f = std::make_shared<Function>(OutputVector{outer_loop_end_down->output(0)}, ParameterVector{data});
|
||||
|
||||
pass::Manager m;
|
||||
m.register_pass<pass::InitNodeInfo>();
|
||||
m.register_pass<snippets::pass::LoopFusion>();
|
||||
m.run_passes(f);
|
||||
}
|
||||
{
|
||||
auto data = std::make_shared<opset1::Parameter>(element::f32, shape);
|
||||
|
||||
auto outer_loop_begin = std::make_shared<snippets::op::LoopBegin>(OutputVector{data});
|
||||
auto inner_loop_begin = std::make_shared<snippets::op::LoopBegin>(OutputVector{outer_loop_begin});
|
||||
auto load = std::make_shared<snippets::op::Load>(inner_loop_begin->output(0));
|
||||
auto relu = std::make_shared<op::v0::Relu>(load);
|
||||
auto hswish = std::make_shared<op::v4::HSwish>(relu);
|
||||
auto store = std::make_shared<snippets::op::Store>(hswish);
|
||||
auto inner_loop_end = std::make_shared<snippets::op::LoopEnd>(
|
||||
OutputVector{store, inner_loop_begin->output(1)}, shape[shape.size() - 1], vector_size,
|
||||
inner_ptr_increments, inner_finalization_offsets);
|
||||
auto outer_loop_end = std::make_shared<snippets::op::LoopEnd>(
|
||||
OutputVector{inner_loop_end->output(0), outer_loop_begin->output(1)}, shape[shape.size() - 2], 1,
|
||||
std::vector<int64_t>{0, 0}, std::vector<int64_t>{0, 0});
|
||||
|
||||
f_ref = std::make_shared<Function>(OutputVector{outer_loop_end->output(0)}, ParameterVector{data});
|
||||
}
|
||||
|
||||
auto res = compare_functions(f, f_ref);
|
||||
ASSERT_TRUE(res.first) << res.second;
|
||||
}
|
||||
|
||||
TEST(TransformationTests, BinaryEltwisesLoops) {
|
||||
std::shared_ptr<Function> f(nullptr), f_ref(nullptr);
|
||||
auto shape = Shape{2, 3, 240};
|
||||
const size_t vector_size = 16;
|
||||
{
|
||||
const std::vector<int64_t> inner_ptr_increments(3, vector_size);
|
||||
const std::vector<int64_t> inner_finalization_offsets(3, 0);
|
||||
|
||||
auto data0 = std::make_shared<opset1::Parameter>(element::f32, shape);
|
||||
auto data1 = std::make_shared<opset1::Parameter>(element::f32, shape);
|
||||
|
||||
auto outer_loop_begin_up = std::make_shared<snippets::op::LoopBegin>(OutputVector{data0, data1});
|
||||
auto inner_loop_begin_up = std::make_shared<snippets::op::LoopBegin>(OutputVector{outer_loop_begin_up->output(0),
|
||||
outer_loop_begin_up->output(1)});
|
||||
auto load0_up = std::make_shared<snippets::op::Load>(inner_loop_begin_up->output(0));
|
||||
auto load1_up = std::make_shared<snippets::op::Load>(inner_loop_begin_up->output(1));
|
||||
auto add = std::make_shared<op::v1::Add>(load0_up, load1_up);
|
||||
auto relu = std::make_shared<op::v0::Relu>(add);
|
||||
auto store_up = std::make_shared<snippets::op::Store>(relu);
|
||||
auto inner_loop_end_up = std::make_shared<snippets::op::LoopEnd>(
|
||||
OutputVector{store_up, inner_loop_begin_up->output(2)}, shape[shape.size() - 1], vector_size,
|
||||
inner_ptr_increments, inner_finalization_offsets);
|
||||
auto outer_loop_end_up = std::make_shared<snippets::op::LoopEnd>(
|
||||
OutputVector{inner_loop_end_up->output(0), outer_loop_begin_up->output(2)}, shape[shape.size() - 2], 1,
|
||||
std::vector<int64_t>{0, 0, 0}, std::vector<int64_t>{0, 0, 0});
|
||||
|
||||
auto buffer = std::make_shared<snippets::op::Buffer>(outer_loop_end_up);
|
||||
|
||||
auto data2 = std::make_shared<opset1::Parameter>(element::f32, shape);
|
||||
|
||||
auto outer_loop_begin_down = std::make_shared<snippets::op::LoopBegin>(OutputVector{buffer, data2});
|
||||
auto inner_loop_begin_down = std::make_shared<snippets::op::LoopBegin>(OutputVector{outer_loop_begin_down->output(0),
|
||||
outer_loop_begin_down->output(1)});
|
||||
auto load0_down = std::make_shared<snippets::op::Load>(inner_loop_begin_down->output(0));
|
||||
auto load1_down = std::make_shared<snippets::op::Load>(inner_loop_begin_down->output(1));
|
||||
auto mul = std::make_shared<op::v1::Multiply>(load0_down, load1_down);
|
||||
auto hswish = std::make_shared<op::v4::HSwish>(mul);
|
||||
auto store_down = std::make_shared<snippets::op::Store>(hswish);
|
||||
auto inner_loop_end_down = std::make_shared<snippets::op::LoopEnd>(
|
||||
OutputVector{store_down, inner_loop_begin_down->output(2)}, shape[shape.size() - 1], vector_size,
|
||||
inner_ptr_increments, inner_finalization_offsets);
|
||||
auto outer_loop_end_down = std::make_shared<snippets::op::LoopEnd>(
|
||||
OutputVector{inner_loop_end_down->output(0), outer_loop_begin_down->output(2)}, shape[shape.size() - 2], 1,
|
||||
std::vector<int64_t>{0, 0, 0}, std::vector<int64_t>{0, 0, 0});
|
||||
|
||||
f = std::make_shared<Function>(OutputVector{outer_loop_end_down->output(0)}, ParameterVector{data0, data1, data2});
|
||||
|
||||
pass::Manager m;
|
||||
m.register_pass<pass::InitNodeInfo>();
|
||||
m.register_pass<snippets::pass::LoopFusion>();
|
||||
m.run_passes(f);
|
||||
}
|
||||
{
|
||||
const std::vector<int64_t> inner_ptr_increments(4, vector_size);
|
||||
const std::vector<int64_t> inner_finalization_offsets(4, 0);
|
||||
|
||||
auto data0 = std::make_shared<opset1::Parameter>(element::f32, shape);
|
||||
auto data1 = std::make_shared<opset1::Parameter>(element::f32, shape);
|
||||
auto data2 = std::make_shared<opset1::Parameter>(element::f32, shape);
|
||||
|
||||
auto outer_loop_begin = std::make_shared<snippets::op::LoopBegin>(OutputVector{data0, data1, data2});
|
||||
auto inner_loop_begin = std::make_shared<snippets::op::LoopBegin>(OutputVector{outer_loop_begin->output(0),
|
||||
outer_loop_begin->output(1),
|
||||
outer_loop_begin->output(2)});
|
||||
auto load0 = std::make_shared<snippets::op::Load>(inner_loop_begin->output(0));
|
||||
auto load1 = std::make_shared<snippets::op::Load>(inner_loop_begin->output(1));
|
||||
auto load2 = std::make_shared<snippets::op::Load>(inner_loop_begin->output(2));
|
||||
auto add = std::make_shared<op::v1::Add>(load0, load1);
|
||||
auto relu = std::make_shared<op::v0::Relu>(add);
|
||||
auto mul = std::make_shared<op::v1::Multiply>(relu, load2);
|
||||
auto hswish = std::make_shared<op::v4::HSwish>(mul);
|
||||
auto store = std::make_shared<snippets::op::Store>(hswish);
|
||||
auto inner_loop_end = std::make_shared<snippets::op::LoopEnd>(
|
||||
OutputVector{store, inner_loop_begin->output(3)}, shape[shape.size() - 1], vector_size,
|
||||
inner_ptr_increments, inner_finalization_offsets);
|
||||
auto outer_loop_end = std::make_shared<snippets::op::LoopEnd>(
|
||||
OutputVector{inner_loop_end->output(0), outer_loop_begin->output(3)}, shape[shape.size() - 2], 1,
|
||||
std::vector<int64_t>{0, 0, 0, 0}, std::vector<int64_t>{0, 0, 0, 0});
|
||||
|
||||
f_ref = std::make_shared<Function>(OutputVector{outer_loop_end->output(0)}, ParameterVector{data0, data1, data2});
|
||||
}
|
||||
|
||||
auto res = compare_functions(f, f_ref);
|
||||
ASSERT_TRUE(res.first) << res.second;
|
||||
}
|
38
src/common/snippets/tests/src/pass/mha_tokenization.cpp
Normal file
38
src/common/snippets/tests/src/pass/mha_tokenization.cpp
Normal file
@ -0,0 +1,38 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <pass/mha_tokenization.hpp>
|
||||
#include <subgraph_mha.hpp>
|
||||
#include "snippets/pass/tokenization.hpp"
|
||||
#include "snippets/pass/explicit_transpose_matmul_inputs.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
namespace snippets {
|
||||
|
||||
void TokenizeMHASnippetsTests::run() {
|
||||
ASSERT_TRUE(function);
|
||||
std::string name;
|
||||
manager.register_pass<ngraph::snippets::pass::EnumerateNodes>();
|
||||
manager.register_pass<ngraph::snippets::pass::TokenizeMHASnippets>();
|
||||
}
|
||||
|
||||
TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA) {
|
||||
const auto &f = MHAFunction(std::vector<PartialShape>{{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}});
|
||||
function = f.getOriginal();
|
||||
function_ref = f.getReference();
|
||||
run();
|
||||
}
|
||||
|
||||
TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_with_MatMul0_Transpose) {
|
||||
const auto &f = MHAMatMul0TransposeFunction(std::vector<PartialShape>{{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}});
|
||||
function = f.getOriginal();
|
||||
function_ref = f.getReference();
|
||||
run();
|
||||
}
|
||||
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
122
src/common/snippets/tests/src/pass/softmax_decomposition.cpp
Normal file
122
src/common/snippets/tests/src/pass/softmax_decomposition.cpp
Normal file
@ -0,0 +1,122 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "pass/softmax_decomposition.hpp"
|
||||
#include "common_test_utils/common_utils.hpp"
|
||||
#include "subgraph_softmax.hpp"
|
||||
#include "subgraph_lowered.hpp"
|
||||
|
||||
#include "snippets/pass/softmax_decomposition.hpp"
|
||||
#include "snippets/pass/insert_load_store.hpp"
|
||||
#include "snippets/pass/insert_movebroadcast.hpp"
|
||||
#include "snippets/pass/insert_buffer.hpp"
|
||||
#include "snippets/pass/convert_power_to_powerstatic.hpp"
|
||||
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
namespace snippets {
|
||||
|
||||
std::string SoftmaxTests::getTestCaseName(testing::TestParamInfo<SoftmaxParams> obj) {
|
||||
Shape inputShape;
|
||||
int axis;
|
||||
std::tie(inputShape, axis) = obj.param;
|
||||
std::ostringstream result;
|
||||
result << "IS=" << CommonTestUtils::vec2str(inputShape) << "_";
|
||||
result << "Axis=" << axis << "_";
|
||||
return result.str();
|
||||
}
|
||||
|
||||
void SoftmaxTests::SetUp() {
|
||||
LoweringTests::SetUp();
|
||||
|
||||
const size_t count = 10;
|
||||
manager.register_pass<ngraph::snippets::pass::SoftmaxDecomposition>(count);
|
||||
manager.register_pass<ngraph::snippets::pass::ConvertPowerToPowerStatic>();
|
||||
manager.register_pass<ngraph::snippets::pass::InsertLoad>(count);
|
||||
manager.register_pass<ngraph::snippets::pass::InsertStore>(count);
|
||||
manager.register_pass<ngraph::snippets::pass::InsertMoveBroadcast>();
|
||||
Shape inputShape;
|
||||
int axis;
|
||||
std::tie(inputShape, axis) = this->GetParam();
|
||||
snippets_function = std::make_shared<SoftmaxLoweredFunction>(std::vector<PartialShape>{inputShape}, axis);
|
||||
master_shape = inputShape;
|
||||
}
|
||||
|
||||
std::string AddSoftmaxTests::getTestCaseName(testing::TestParamInfo<AddSoftmaxParams> obj) {
|
||||
Shape inputShape0, inputShape1;
|
||||
int axis;
|
||||
std::tie(inputShape0, inputShape1, axis) = obj.param;
|
||||
std::ostringstream result;
|
||||
result << "IS[0]=" << CommonTestUtils::vec2str(inputShape0) << "_";
|
||||
result << "IS[1]=" << CommonTestUtils::vec2str(inputShape1) << "_";
|
||||
result << "Axis=" << axis << "_";
|
||||
return result.str();
|
||||
}
|
||||
|
||||
void AddSoftmaxTests::SetUp() {
|
||||
LoweringTests::SetUp();
|
||||
|
||||
const size_t count = 10;
|
||||
manager.register_pass<ngraph::snippets::pass::InsertBuffer>();
|
||||
manager.register_pass<ngraph::snippets::pass::SoftmaxDecomposition>(count);
|
||||
manager.register_pass<ngraph::snippets::pass::ConvertPowerToPowerStatic>();
|
||||
manager.register_pass<ngraph::snippets::pass::InsertLoad>(count);
|
||||
manager.register_pass<ngraph::snippets::pass::InsertStore>(count);
|
||||
manager.register_pass<ngraph::snippets::pass::InsertMoveBroadcast>();
|
||||
Shape inputShape0, inputShape1;
|
||||
int axis;
|
||||
std::tie(inputShape0, inputShape1, axis) = this->GetParam();
|
||||
snippets_function = std::make_shared<AddSoftmaxLoweredFunction>(std::vector<PartialShape>{inputShape0, inputShape1}, axis);
|
||||
|
||||
ov::PartialShape master_pshape(inputShape0);
|
||||
ov::PartialShape::broadcast_merge_into(master_pshape, inputShape1, op::AutoBroadcastType::NUMPY);
|
||||
master_shape = master_pshape.get_shape();
|
||||
}
|
||||
|
||||
TEST_P(SoftmaxTests, SoftmaxDecomposition) {
|
||||
PartialShape scheduler_shape({master_shape[master_shape.size() - 2],
|
||||
master_shape[master_shape.size() - 1]});
|
||||
auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), scheduler_shape);
|
||||
function = subgraph->body_ptr();
|
||||
function_ref = snippets_function->getLowered();
|
||||
}
|
||||
|
||||
TEST_P(AddSoftmaxTests, AddSoftmaxDecomposition) {
|
||||
PartialShape scheduler_shape({master_shape[master_shape.size() - 2],
|
||||
master_shape[master_shape.size() - 1]});
|
||||
auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), scheduler_shape);
|
||||
function = subgraph->body_ptr();
|
||||
function_ref = snippets_function->getLowered();
|
||||
}
|
||||
|
||||
namespace SoftmaxTestsInstantiation {
|
||||
std::vector<ov::Shape> inputShape{{12, 4, 12, 12, 127}, {12, 4, 12, 12, 1}};
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_SoftmaxDecomposition, SoftmaxTests,
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(inputShape),
|
||||
::testing::Values(-1)),
|
||||
SoftmaxTests::getTestCaseName);
|
||||
|
||||
} // namespace SoftmaxTestsInstantiation
|
||||
|
||||
namespace AddSoftmaxTestsInstantiation {
|
||||
std::vector<ov::Shape> inputShape0{{12, 4, 12, 12, 17}, {12, 4, 12, 12, 1}};
|
||||
std::vector<ov::Shape> inputShape1{{12, 4, 12, 12, 17}, {12, 4, 12, 12, 1}};
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_AddSoftmaxDecomposition, AddSoftmaxTests,
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(inputShape0),
|
||||
::testing::ValuesIn(inputShape1),
|
||||
::testing::Values(-1)),
|
||||
AddSoftmaxTests::getTestCaseName);
|
||||
|
||||
} // namespace AddSoftmaxTestsInstantiation
|
||||
|
||||
} // namespace snippets
|
||||
} // namespace test
|
||||
} // namespace ov
|
@ -0,0 +1,70 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <ngraph/function.hpp>
|
||||
#include <ngraph/pass/manager.hpp>
|
||||
|
||||
#include <snippets/snippets_isa.hpp>
|
||||
#include <snippets/pass/softmax_reshape_elimination.hpp>
|
||||
|
||||
#include <transformations/init_node_info.hpp>
|
||||
|
||||
#include "common_test_utils/ngraph_test_utils.hpp"
|
||||
|
||||
using namespace testing;
|
||||
using namespace ngraph;
|
||||
|
||||
TEST_F(TransformationTestsF, SoftmaxV1ReshapeElimination) {
|
||||
{
|
||||
auto data = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 3, 240});
|
||||
auto shape0 = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{2}, std::vector<int32_t>{6, 240});
|
||||
auto reshape0 = std::make_shared<ov::op::v1::Reshape>(data, shape0, false);
|
||||
auto softmax_v1 = std::make_shared<ov::op::v1::Softmax>(reshape0, 1);
|
||||
auto shape1 = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, std::vector<int32_t>{2, 3, 240});
|
||||
auto reshape1 = std::make_shared<ov::op::v1::Reshape>(softmax_v1, shape1, false);
|
||||
function = std::make_shared<Function>(NodeVector{reshape1}, ParameterVector{data});
|
||||
|
||||
manager.register_pass<snippets::pass::SoftmaxReshapeElimination>();
|
||||
}
|
||||
{
|
||||
auto data = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 3, 240});
|
||||
auto softmax_v1 = std::make_shared<ov::op::v1::Softmax>(data, 2);
|
||||
function_ref = std::make_shared<Function>(NodeVector{softmax_v1}, ParameterVector{data});
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(TransformationTestsF, SoftmaxV8ReshapeElimination) {
|
||||
{
|
||||
auto data = std::make_shared<opset1::Parameter>(element::f32, Shape{1, 2, 340, 240});
|
||||
auto shape0 = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{2}, std::vector<int32_t>{680, 240});
|
||||
auto reshape0 = std::make_shared<ov::op::v1::Reshape>(data, shape0, false);
|
||||
auto softmax_v1 = std::make_shared<ov::op::v8::Softmax>(reshape0, -1);
|
||||
auto shape1 = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{4}, std::vector<int32_t>{1, 2, 340, 240});
|
||||
auto reshape1 = std::make_shared<ov::op::v1::Reshape>(softmax_v1, shape1, false);
|
||||
function = std::make_shared<Function>(NodeVector{reshape1}, ParameterVector{data});
|
||||
|
||||
manager.register_pass<snippets::pass::SoftmaxReshapeElimination>();
|
||||
}
|
||||
{
|
||||
auto data = std::make_shared<opset1::Parameter>(element::f32, Shape{1, 2, 340, 240});
|
||||
auto softmax_v1 = std::make_shared<ov::op::v8::Softmax>(data, 3);
|
||||
function_ref = std::make_shared<Function>(NodeVector{softmax_v1}, ParameterVector{data});
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(TransformationTestsF, SoftmaxReshapeElimination_IncorrectReshape) {
|
||||
{
|
||||
auto data = std::make_shared<opset1::Parameter>(element::f32, Shape{1, 2, 340, 240});
|
||||
auto shape0 = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{2}, std::vector<int32_t>{2, 81600});
|
||||
auto reshape0 = std::make_shared<ov::op::v1::Reshape>(data, shape0, false);
|
||||
auto softmax_v1 = std::make_shared<ov::op::v8::Softmax>(reshape0, -1);
|
||||
auto shape1 = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{4}, std::vector<int32_t>{1, 2, 340, 240});
|
||||
auto reshape1 = std::make_shared<ov::op::v1::Reshape>(softmax_v1, shape1, false);
|
||||
function = std::make_shared<Function>(NodeVector{reshape1}, ParameterVector{data});
|
||||
|
||||
manager.register_pass<snippets::pass::SoftmaxReshapeElimination>();
|
||||
}
|
||||
}
|
@ -33,6 +33,8 @@ TEST(TransformationTests, AssignRegisters) {
|
||||
auto s00 = std::make_shared<snippets::isa::Store>(y02); s00->set_friendly_name("y03");
|
||||
s00->set_friendly_name("s00");
|
||||
f = std::make_shared<Function>(NodeVector{s00}, ParameterVector{p0, p1});
|
||||
// Note that testing the result is not strictly necessary, since the Result doesn't emit any code
|
||||
f->get_result()->set_friendly_name("r00");
|
||||
|
||||
pass::Manager m;
|
||||
m.register_pass<pass::InitNodeInfo>();
|
||||
@ -52,18 +54,19 @@ TEST(TransformationTests, AssignRegisters) {
|
||||
{"y01", 1},
|
||||
{"y02", 2},
|
||||
{"s00", 2}, // gpr
|
||||
{"r00", 2} // gpr
|
||||
};
|
||||
|
||||
auto total_ops = 0;
|
||||
for (auto& op : f->get_ordered_ops()) {
|
||||
auto& rt = op->get_rt_info();
|
||||
|
||||
auto it_rinfo = rt.find("reginfo");
|
||||
if (it_rinfo != rt.end()) {
|
||||
auto reginfo = it_rinfo->second.as<std::vector<size_t>>();
|
||||
auto reg = reginfo[0];
|
||||
ASSERT_TRUE(ref_registers[op->get_friendly_name()] == reg);
|
||||
total_ops++;
|
||||
for (const auto& output : op->outputs()) {
|
||||
const auto& rt = output.get_tensor_ptr()->get_rt_info();
|
||||
auto it_rt = rt.find("reginfo");
|
||||
if (it_rt != rt.end()) {
|
||||
auto reg = it_rt->second.as<size_t>();
|
||||
ASSERT_TRUE(ref_registers[op->get_friendly_name()] == reg);
|
||||
total_ops++;
|
||||
}
|
||||
}
|
||||
}
|
||||
ASSERT_EQ(total_ops, ref_registers.size());
|
||||
@ -120,6 +123,7 @@ TEST(TransformationTests, AssignRegisters2) {
|
||||
s00->set_friendly_name("s00");
|
||||
|
||||
f = std::make_shared<Function>(NodeVector{s00}, ParameterVector{p0, p1, p2, p3, p4, p5, p6, p7});
|
||||
f->get_result()->set_friendly_name("res00");
|
||||
|
||||
pass::Manager m;
|
||||
m.register_pass<pass::InitNodeInfo>();
|
||||
@ -140,17 +144,19 @@ TEST(TransformationTests, AssignRegisters2) {
|
||||
{"r18", 0}, {"r19", 2}, {"r20", 4}, {"r21", 1}, {"r22", 0}, {"r23", 6},
|
||||
{"r24", 1},
|
||||
{"s00", 8},
|
||||
{"res00", 8}
|
||||
};
|
||||
|
||||
auto total_ops = 0;
|
||||
for (auto& op : f->get_ordered_ops()) {
|
||||
auto& rt = op->get_rt_info();
|
||||
auto it_rinfo = rt.find("reginfo");
|
||||
if (it_rinfo != rt.end()) {
|
||||
auto reginfo = it_rinfo->second.as<std::vector<size_t>>();
|
||||
auto reg = reginfo[0];
|
||||
ASSERT_TRUE(ref_registers[op->get_friendly_name()] == reg);
|
||||
total_ops++;
|
||||
for (const auto& output : op->outputs()) {
|
||||
const auto& rt = output.get_tensor_ptr()->get_rt_info();
|
||||
auto it_rt = rt.find("reginfo");
|
||||
if (it_rt != rt.end()) {
|
||||
auto reg = it_rt->second.as<size_t>();
|
||||
ASSERT_TRUE(ref_registers[op->get_friendly_name()] == reg);
|
||||
total_ops++;
|
||||
}
|
||||
}
|
||||
}
|
||||
ASSERT_EQ(total_ops, ref_registers.size());
|
||||
|
@ -653,7 +653,7 @@ NGRAPH_TEST(${BACKEND_NAME}, onnx_model_attention_qkv_hidden_sizes) {
|
||||
|
||||
test_case.add_input<float>(input);
|
||||
test_case.add_expected_output<float>(output);
|
||||
test_case.run_with_tolerance_as_fp(1e-6);
|
||||
test_case.run_with_tolerance_as_fp(1e-4);
|
||||
}
|
||||
|
||||
NGRAPH_TEST(${BACKEND_NAME}, onnx_model_attention_unidirectional) {
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user